Created
October 4, 2018 08:14
-
-
Save gunessenturk/ec45f95a3e62fe29df2d1ef0c080f345 to your computer and use it in GitHub Desktop.
TDI_project_week4
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import dill\n", | |
"from bs4 import BeautifulSoup\n", | |
"from datetime import datetime\n", | |
"import re\n", | |
"import pandas as pd\n", | |
"import datetime as dt\n", | |
"import unicodedata" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 125, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn import base\n", | |
"from sklearn.feature_extraction import DictVectorizer\n", | |
"from sklearn.pipeline import Pipeline, FeatureUnion\n", | |
"from sklearn.neighbors import NearestNeighbors" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import nltk\n", | |
"from nltk import word_tokenize\n", | |
"from collections import Counter\n", | |
"from nltk.stem import PorterStemmer # one of the several available stemmers\n", | |
"from nltk.corpus import stopwords\n", | |
"from nltk import ngrams\n", | |
"from nltk import pos_tag" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ENGLISH_STOPWORDS = stopwords.words('english')\n", | |
"#KENNEDY_URL = 'http://www.kennedy-center.org'\n", | |
"#WASHINGTON_CONSERVATORY_URL = 'http://www.washingtonconservatory.org/html/'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"composer_nationality = {'hindemith': 'german', 'debussy': 'french', 'mozart': 'austrian', 'liszt': 'hungarian',\n", | |
" 'schumann': 'german', 'faure':'french', 'ravel': 'french', 'saint-saens': 'french', \n", | |
" 'saint-saans': 'french',\n", | |
" 'tchaikovsky': 'russian', 'brahms': 'german', 'boccherini': 'italian', 'messiaen': 'french', \n", | |
" 'barber': 'american', 'rachmaninoff': 'russian', 'schubert': 'austrian', \n", | |
" 'schoenberg': 'austrian', 'mendelssohn': 'german', 'beethoven': 'german', 'prokofiev':'russian', \n", | |
" 'shostakovich':'russian', 'khachaturian': 'russian', 'britten': 'english', \n", | |
" 'mahler': 'austrian', 'wagner': 'german', 'berlioz': 'french', 'roussel': 'french', \n", | |
" 'haydn': 'austrian', 'bruckner': 'austrian', 'franck': 'french', 'rossini': 'italian', \n", | |
" 'copland': 'american', 'stravinsky': 'russian', 'bates': 'american', 'muhly': 'american', \n", | |
" 'nesbett': 'english', 'byrd': 'english', 'praetorius': 'german', 'bartók': 'hungarian', \n", | |
" 'bartok': 'hungarian', 'rzewski': 'american', 'loeffler': 'german', 'poulenc': 'french', \n", | |
" 'bach': 'german', 'scarlatti': 'italian', 'vivaldi': 'italian'\n", | |
" \n", | |
" }\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 71, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"composer_period = {'hindemith': 'modern', 'debussy': 'modern', 'mozart': 'classical', 'liszt': 'romantic',\n", | |
" 'schumann': 'romantic', 'faure':'romantic', 'ravel': 'modern', 'saint-saens': 'romantic', \n", | |
" 'saint-saans': 'romantic',\n", | |
" 'chopin': 'romantic', 'tchaikovsky': 'romantic', 'brahms': 'romantic', 'boccherini': 'classical', \n", | |
" 'messiaen': 'modern', 'barber': 'modern', 'rachmaninoff': 'romantic', 'schubert': 'romantic', \n", | |
" 'schoenberg': 'modern', 'silvestrov': 'modern', 'sylvestrov': 'modern', 'mendelssohn': 'romantic', \n", | |
" 'beethoven': 'classical', 'prokofiev': 'modern', 'shostakovich': 'modern', 'khachaturian': 'modern', \n", | |
" 'britten': 'modern', 'mahler': 'romantic', 'wagner': 'romantic', 'berlioz': 'romantic', \n", | |
" 'roussel': 'modern', 'haydn': 'classical', 'bruckner': 'romantic', 'franck': 'romantic', \n", | |
" 'sibelius': 'romantic', 'rossini': 'classical', 'copland': 'modern', 'berio': 'modern', \n", | |
" 'dvorak': 'romantic', 'stravinsky': 'modern', 'bates': 'modern', 'muhly': 'modern', \n", | |
" 'nesbett': 'renaissance', 'byrd': 'renaissance', 'praetorius': 'renaissance', 'schifrin': 'modern', \n", | |
" 'lutoslawski': 'modern', 'bartók': 'modern', 'bartok': 'modern', 'rzewski': 'modern', \n", | |
" 'martin': 'modern', 'baran': 'modern', 'loeffler': 'modern', 'poulenc': 'modern', \n", | |
" 'penderecki': 'modern', 'bach': 'baroque', 'scarlatti': 'baroque', 'vivaldi': 'baroque'\n", | |
" }" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def get_rel_links_KC(genre): # genre is CLA for classical, CHA for chamber\n", | |
" # Use requests.get to download the page.\n", | |
" page = requests.get('http://www.kennedy-center.org/calendar/genre/' + genre) \n", | |
" soup = BeautifulSoup(page.text, \"lxml\")\n", | |
"\n", | |
" # Get all relative links to individual concerts\n", | |
" events = soup.select('h4') \n", | |
" links = [get_link_KC(event) for event in events]\n", | |
" return links" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Extract relative urls for each event\n", | |
"def get_link_KC(event):\n", | |
" rel_url = re.search( '<a\\shref=\"(.+)\">' , str(event)).group(1)\n", | |
" return rel_url" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Returns a list of tuples (<Performer name>, <role>)\n", | |
"def get_performers_KC(blurb):\n", | |
" if not re.search(r'blurbpadding\">[\\s]*(\\n?(\\w+\\s\\w+),\\s(\\w+)<br/>\\r*)+', str(blurb)):\n", | |
" return None\n", | |
" performers = re.search(r'blurbpadding\">[\\s]*(\\n?(\\w+\\s\\w+),\\s(\\w+)<br/>\\r*)+', str(blurb)).group(0)\n", | |
" performers = list(filter(lambda x: (len(x)>4),performers.split('<br/>')))\n", | |
" performers = list(map(lambda x: re.search('([A-Z].+),\\s(.+)', x),performers))\n", | |
" performers = [[performer.group(1), performer.group(2)] for performer in performers]\n", | |
" return performers" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Returns a list of lists [<Composer name>, <piece name>]\n", | |
"def get_program_KC(soup):\n", | |
"\n", | |
" composer_re = re.compile(r'<a href=\"/artist/index/[\\w]+\">(.+)</a>')\n", | |
" composition_re = re.compile(r'<a href=\"/artist/composition/[\\w]+\">(.+)</a>')\n", | |
" TAG_RE = re.compile(r'<[^>]+>')\n", | |
" #composers = re.findall(composer_re, str(soup))\n", | |
" #if composers:\n", | |
" # compositions = re.findall(composition_re, str(soup))\n", | |
" # compositions = [TAG_RE.sub('', composition) for composition in compositions]\n", | |
" # pieces = list(zip(composers, compositions))\n", | |
"\n", | |
" #else:\n", | |
" blurb = soup.select('div.blurbpadding') \n", | |
" pieces = re.findall(r'<strong>(.+</strong>[:|,].*)', str(blurb))\n", | |
" TAG_RE = re.compile(r'<[^>]+>')\n", | |
" pieces = [TAG_RE.sub('', piece) for piece in pieces]\n", | |
" pieces = [piece for piece in pieces if len(piece)<60]\n", | |
" pieces = [re.compile(r'\\xa0').sub('', piece) for piece in pieces]\n", | |
" if (pieces) and (':' in pieces[0]):\n", | |
" pieces = [piece.split(':') for piece in pieces]\n", | |
" else:\n", | |
" pieces = [piece.split(',') for piece in pieces]\n", | |
" \n", | |
" if pieces == []:\n", | |
" pieces = ['Program: TBD']\n", | |
" else:\n", | |
" for piece in pieces:\n", | |
" piece[0] = piece[0].title()\n", | |
" \n", | |
" return pieces" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Returns the range of ticket proces\n", | |
"def get_price_KC(soup):\n", | |
" price = soup.find('div', {'class': re.compile(r'price*')}).text\n", | |
" price = re.search(r'(\\$\\d+\\.\\d\\d(\\s-\\s\\$\\d+\\.\\d\\d)?)', price).group(0) \n", | |
" price = re.compile(r'\\$').sub('', price)\n", | |
" return price" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Washington Conservatory\n", | |
"def get_rel_links_WC(): \n", | |
" # Download the page.\n", | |
" page = requests.get('http://www.washingtonconservatory.org/html/concerts.htm#professional') \n", | |
" soup = BeautifulSoup(page.text, \"lxml\")\n", | |
" \n", | |
" # Get all relative links to individual concerts\n", | |
" links = [link['href'] for link in soup.find_all(\"a\", {'href': re.compile(r'concerts1819_.*')})]\n", | |
" return links" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Get program for Washington Conservatory\n", | |
"def get_program_WC(soup):\n", | |
" program = soup.find_all('li')\n", | |
" program = [item for item in program if item.text != '']\n", | |
" YEAR_RE = re.compile(r'\\([\\d-]+\\)')\n", | |
" if not program:\n", | |
" program = ['Program: TBD']\n", | |
" else: \n", | |
" program = [item.text for item in program]\n", | |
" program = [YEAR_RE.sub('', item) for item in program]\n", | |
" program = [item.split(':') for item in program]\n", | |
" for index, item in enumerate(program):\n", | |
" program[index] = [x.strip() for x in program[index]]\n", | |
" \n", | |
" return program" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Get event date and time\n", | |
"def get_date_WC(soup):\n", | |
" date = soup.find('p', {'class': 'center', 'style': re.compile(r'position: relative.*')}).text.strip().split('\\n')[0]\n", | |
" date = pd.Timestamp(date)\n", | |
" return date" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Return a list of performers\n", | |
"def get_performers_WC(soup):\n", | |
" performers = soup.find('h3', {'style': re.compile(r'margin-left: 20px.*')})\n", | |
" performers = performers.findChildren()[0]\n", | |
" #performers = [item.strip() for item in performers if ',' in item]\n", | |
" performers = re.findall('([A-Z][A-Z]+\\.?-?\\s?[A-Z]*\\.?-?\\s?[A-Z]*\\.?,?\\s?[a-z]*\\s?[a-z]*\\s?[a-z]*)', performers.text)\n", | |
" performers = [performer.strip() for performer in performers]\n", | |
" performers = [performer.split(', ') for performer in performers]\n", | |
" if not performers:\n", | |
" return ['Performers: TBD']\n", | |
" for performer in performers:\n", | |
" if len(performer) == 1:\n", | |
" performer[0] = performer[0].title()\n", | |
" else:\n", | |
" performer[0] = performer[0].title()\n", | |
" performer[1] = performer[1].lower() \n", | |
" \n", | |
" return performers" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 159, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Get event links from Friday Morning Music Club, Inc.\n", | |
"def get_links_FMMC():\n", | |
" # Download the page.\n", | |
" links = []\n", | |
" for i in range(4):\n", | |
" page = requests.get('http://www.fmmc.org/events/list/?tribe_paged='+str(i+1)+'&tribe_event_display=list') \n", | |
" soup = BeautifulSoup(page.text, \"lxml\")\n", | |
"\n", | |
" # Get all relative links to individual concerts\n", | |
" links.extend([link['href'] for link in soup.find_all(\"a\", {'class': 'tribe-event-url'})])\n", | |
" links = [link for link in links if ('concert' in link) | ('orchestra' in link) | ('chorale' in link)]\n", | |
" return links " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 197, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Get event date and time for Friday Morning Music Club, Inc.\n", | |
"def get_date_FMMC(soup):\n", | |
" date = soup.find_all('abbr', {'class': re.compile(r'.*tribe-events-start.*')})\n", | |
" date = date[0]['title']\n", | |
" time = soup.find_all('div', {'class': re.compile(r'.*tribe-events-start.*')})\n", | |
" time = time[0].text.strip().split(' - ')[0]\n", | |
" return pd.Timestamp(date + ' ' + time)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 389, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Get program for Friday Morning Music Club, Inc.\n", | |
"def get_program_FMMC(soup):\n", | |
" program = soup.find_all('div', {'class': re.compile(r'tribe-events-single-event-description.*')})\n", | |
" pieces = re.findall('[A-Z]+[:|,].*', str(program[0]))\n", | |
" performer_re = re.compile(r'<strong>[\\w|\\s]*</strong>')\n", | |
" tag_re = re.compile(r'</?\\w+/?>')\n", | |
" pieces = [re.sub(performer_re, '', item) for item in pieces]\n", | |
" pieces = [re.sub('\\.?\\s,.*', '', item) for item in pieces]\n", | |
" pieces = [item.split(':') for item in pieces]\n", | |
" pieces = [item for item in pieces if len(item)>1]\n", | |
" pieces = [[item[0].title().strip(), re.sub(tag_re, '', item[1]).strip()] for item in pieces]\n", | |
" return pieces" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"links = get_links_FMMC()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 390, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[['Beethoven', 'Sonata in F-sharp Major, op. 78'], ['Bruch', 'Eight Pieces for clarinet, viola and piano (selections)']]\n", | |
"[]\n", | |
"[['Brahms, Joel Friedman, Chopin, Wieniawski, J.S. Bach, Sarasate', 'Selected works']]\n", | |
"[['Rheinberger', 'Nonet in E-flat Major, op. 139']]\n", | |
"[['Beethoven', 'Sonata in F-sharp Major, op. 78'], ['Korngold', 'Selected songs'], ['Mozart', 'Duo in G Major, K.423']]\n", | |
"[['Haydn', 'Andante with variations in F Minor, Hob.XVII'], ['Fiske', 'Sonata for piano and clarinet']]\n", | |
"[['Balakirev', 'Mazurkas'], ['Rorem', 'Nantucket Songs'], ['Martinu', 'Sonata for flute, violin, and piano, H. 254']]\n", | |
"[['Stamitz', 'Quartet, op. 10, no. 5'], ['Martinu', 'Sonata'], ['Haydn', 'Sonata, Hob. XVI']]\n", | |
"[]\n", | |
"[['Mozart', 'Selected arias'], ['Beethoven', 'Sonata in F-sharp Major, op. 78. DEBUSSY'], ['Albert', 'Doppler Effect. GARY SCHOCKER']]\n", | |
"[['Brahms', 'Klavierstücke, op. 118'], ['Ginastera', 'Cinco Canciones Populares Argentinas'], ['Shostakovich', 'String Quartet No. 1 in C Major, op. 49. and']]\n", | |
"[['Bach, Scarlatti, Lovelady And York', 'Selected works'], ['Brahms', 'Klavierstücke, op. 118. Stephanie Ng, piano.'], ['Shostakovich', 'String Quartet No. 1 in C Major, op. 49']]\n", | |
"[['Bach', 'Movements from Partita in E Major, BWV 1006'], ['Schubert', 'Salve Regina in A Major, D. 676; Songs on Goethe’s “Mignon,” compiled and transcribed by Aribert Reimann'], ['Castello', 'Sonata Prima Concertate in stil moderno'], ['Telemann', 'Solo Sonata No 7']]\n", | |
"[['Poulenc', 'Suite Française'], ['Clarke, Gordon Jacob', 'Duos for clarinet and viola'], ['Mozart, Puccini, Delibes And Gershwin', 'Selected songs. Liana Diaz-Rivera, soprano; piano TBA.']]\n", | |
"[['Jolivet', 'Sonatine for flute and clarinet'], ['Strauss', 'Sonata in E-flat Major, op. 18.']]\n", | |
"[['Zyman', 'Fantasia Mexicana. and'], ['Haydn', 'Sonata in F Major, Hob. XVI'], ['Tchaikovsky', 'Five Romances (in Russian)']]\n", | |
"[['Rachmaninoff', 'All-Night Vigil (selections).'], ['Chesnokov', 'Salvation is Created.'], ['Rheinberger', 'Abendlied.']]\n", | |
"[['Rachmaninoff', 'All-Night Vigil (selections).'], ['Chesnokov', 'Salvation is Created.'], ['Rheinberger', 'Abendlied.']]\n", | |
"[['Davis', 'Merrie English Love Songs; and additional music for voice and trombone'], ['Jolivet', 'Sonatine for flute and clarinet'], ['Krstic', 'Croatian Songs, Part I; Get Along Home (selections)']]\n", | |
"[]\n", | |
"[['Mozart', 'Serenata notturna K. 239.'], ['Beethoven', 'Symphony No. 1.'], ['Beethoven', 'Violin Concerto.']]\n", | |
"[]\n", | |
"[]\n", | |
"[['Tchaikovsky', 'Selected Romances'], ['Haydn', 'F-Minor Variations. Guity Adjoodani, piano.'], ['Shikele', 'Dances for Three']]\n", | |
"[]\n", | |
"[]\n" | |
] | |
} | |
], | |
"source": [ | |
"for link in links:\n", | |
" concertPage = requests.get(link) \n", | |
" soup = BeautifulSoup(concertPage.text, \"lxml\")\n", | |
" print(get_program_FMMC(soup))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 387, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"concertPage = requests.get(links[20]) \n", | |
"soup = BeautifulSoup(concertPage.text, \"lxml\")\n", | |
"program = soup.find_all('div', {'class': re.compile(r'tribe-events-single-event-description.*')})\n", | |
"pieces = re.findall('[A-Z]+[:|,].*', str(program[0]))\n", | |
"performer_re = re.compile(r'<strong>[\\w|\\s]*</strong>')\n", | |
"tag_re = re.compile(r'</?\\w+/?>')\n", | |
"pieces = [re.sub(performer_re, '', item) for item in pieces]\n", | |
"pieces = [re.sub('\\.?\\s,.*', '', item) for item in pieces]\n", | |
"pieces = [item.split(':') for item in pieces]\n", | |
"pieces = [item for item in pieces if len(item)>1]\n", | |
"pieces = [[item[0].title().strip(), re.sub(tag_re, '', item[1]).strip()] for item in pieces]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 385, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pieces = re.findall('[A-Z]+[:|,].*', str(program[0]))\n", | |
"performer_re = re.compile(r'<strong>[\\w|\\s]*</strong>')\n", | |
"tag_re = re.compile(r'</?\\w+/?>')\n", | |
"pieces = [re.sub(performer_re, '', item) for item in pieces]\n", | |
"pieces = [re.sub('\\.?\\s,.*', '', item) for item in pieces]\n", | |
"pieces = [item.split(':') for item in pieces]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 388, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[['Mozart', 'Serenata notturna K. 239.'],\n", | |
" ['Beethoven', 'Symphony No. 1.'],\n", | |
" ['Beethoven', 'Violin Concerto.']]" | |
] | |
}, | |
"execution_count": 388, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pieces" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 384, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'<div class=\"tribe-events-single-event-description tribe-events-content\">\\n<input class=\"fooboxshare_post_id\" type=\"hidden\" value=\"5091\"/><p>FMMC Avanti Orchestra, <strong>Pablo Saelzer</strong>, Conductor.<br/>\\nSoloist: <strong>Youjin Lee</strong>, violin, 2018 Washington International Competition first-prize winner.</p>\\n<p><strong>Program:</strong><br/>\\n• MOZART: Serenata notturna K. 239.<br/>\\n• BEETHOVEN: Symphony No. 1.<br/>\\n• BEETHOVEN: Violin Concerto.</p>\\n<p>Read the <a href=\"http://www.fmmc.org/performance-opportunities/avantiorchestra/\">Avanti page</a> for more details about the orchestra and other concerts this season.</p>\\n<p>The <a href=\"https://fmmcfoundation.org/current-competitions/2018-washington-international-competition-strings/\">Washington International Competition for String Players</a> Finals took place at the Terrace Theater of the Kennedy Center for the Performing Arts in Washington, DC, on June 24, 2018. </p>\\n</div>'" | |
] | |
}, | |
"execution_count": 384, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"str(program[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 373, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['MOZART: Selected arias. <strong>Debora Madsen</strong>, soprano; <strong>Gillian Cookson</strong>, piano.<br/>',\n", | |
" 'BEETHOVEN: Sonata in F-sharp Major, op. 78. DEBUSSY: <em>Gardens in the Rain</em>. <strong>Joan Berman Mizrahi</strong>, piano.<br/>',\n", | |
" 'ALBERT: <em>Doppler Effect</em>. GARY SCHOCKER: <em>Danger-High Voltage</em>. SAMUEL ZYMAN: <em>Fantasia Mexicana</em>. <strong>Laura Benning</strong> and <strong>Gwyn Jones</strong>, flutes; piano TBD.</p>']" | |
] | |
}, | |
"execution_count": 373, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"re.findall('[A-Z]+[:|,].*', str(program[0]))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 72, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class Event():\n", | |
" def __init__(self, program=[], performers=[], date=None, price=None, place=None, tags=[], tags_simple=[]):\n", | |
" self.program = program\n", | |
" self.performers = performers\n", | |
" self.date = date\n", | |
" self.price = price\n", | |
" self.place = place\n", | |
" self.tags = tags\n", | |
" self.tags_simple = tags_simple\n", | |
" \n", | |
" def addTag(self, tag):\n", | |
" self.tags.append(tag)\n", | |
" \n", | |
" def printEvent(self):\n", | |
" date_str = self.date.strftime('%A, %B {}, %Y {}:%M %p'.format(self.date.day, self.date.hour%12))\n", | |
" print(date_str)\n", | |
" print(self.performers)\n", | |
" print(self.program)\n", | |
" print(self.place)\n", | |
" print(self.price)\n", | |
" print('\\n')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Returns a list of event objects from Washington Conservatory\n", | |
"def getEvents_WC():\n", | |
" WASHINGTON_CONSERVATORY_URL = 'http://www.washingtonconservatory.org/html/'\n", | |
" events = []\n", | |
" for link in get_rel_links_WC():\n", | |
" concertPage = requests.get(WASHINGTON_CONSERVATORY_URL + link) \n", | |
" soup = BeautifulSoup(concertPage.text, \"lxml\")\n", | |
" program = get_program_WC(soup)\n", | |
" date = get_date_WC(soup)\n", | |
" performers = get_performers_WC(soup)\n", | |
" events.append(Event(program=program, performers=performers, date=date, \n", | |
" place='Washington Conservatory', price='Donation'))\n", | |
" return events" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Returns a list of event objects from Kennedy Center\n", | |
"def getEvents_KC(genre):\n", | |
" KENNEDY_URL = 'http://www.kennedy-center.org'\n", | |
" events = []\n", | |
" for link in get_rel_links_KC(genre):\n", | |
" concertPage = requests.get(KENNEDY_URL + link) \n", | |
" soup = BeautifulSoup(concertPage.text, \"lxml\")\n", | |
" date = pd.Timestamp(soup.find('meta', {'name': 'StartDate'})['content']).to_pydatetime()\n", | |
" #price = soup.find('div', {'class': re.compile(r'price*')}).text\n", | |
" #price = re.search(r'(\\$\\d+\\.\\d\\d(\\s-\\s\\$\\d+\\.\\d\\d)?)', price).group(0) \n", | |
" price = get_price_KC(soup)\n", | |
" blurb = soup.select('div.blurbpadding') \n", | |
" if not get_performers_KC(blurb):\n", | |
" continue\n", | |
" performers = get_performers_KC(blurb)\n", | |
" program = get_program_KC(soup)\n", | |
" events.append(Event(program=program, performers=performers, date=date, \n", | |
" place='Kennedy Center', price=price))\n", | |
" return events" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 119, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def dfFromEvents(event_list):\n", | |
" columns = ['Date', 'Time', 'Venue', 'Price', 'Performers', 'Program', 'Tags', 'Tags_Simple']\n", | |
" df = pd.DataFrame(columns=columns)\n", | |
" for event in event_list:\n", | |
" tags = [] \n", | |
" tags_simple = []\n", | |
" composers = []\n", | |
" tags.extend([performer[1] for performer in event.performers if len(performer)>1])\n", | |
" \n", | |
" \n", | |
" tags = [ re.compile(r'conductor').sub('orchestra', tag) for tag in tags ]\n", | |
" tags_simple = [ re.compile(r'conductor').sub('orchestra', tag) for tag in tags_simple ]\n", | |
" \n", | |
" if 'orchestra' not in tags:\n", | |
" tags.append('chamber')\n", | |
" tags_simple.append('chamber')\n", | |
" else: \n", | |
" tags_simple.append('orchestra')\n", | |
" \n", | |
" tags = [ re.compile(r'\\s?solo\\s?').sub('', tag) for tag in tags ]\n", | |
" tags = [ re.compile(r' and').sub(',', tag) for tag in tags ]\n", | |
" \n", | |
" \n", | |
" #tags.extend( [p[0] for p in event.program if len(p[0])>1] )\n", | |
" composers.extend( [p[0] for p in event.program if len(p[0])>1] )\n", | |
" \n", | |
" #for index in range(len(tags)):\n", | |
" # tags[index] = tags[index].lower()\n", | |
" # if len(tags[index].split(' ')) > 1:\n", | |
" # tags[index] = tags[index].split(' ')[-1]\n", | |
" \n", | |
" for index in range(len(composers)):\n", | |
" composers[index] = composers[index].lower()\n", | |
" if len(composers[index].split(' ')) > 1:\n", | |
" composers[index] = composers[index].split(' ')[-1]\n", | |
" \n", | |
" \n", | |
" #tags = [tag for tag in tags if (not tag.isdigit())] \n", | |
" #tags = [unicodedata.normalize('NFKD', tag).encode('ascii','ignore').decode(\"ascii\") for tag in tags]\n", | |
" composers = [tag for tag in composers if (not tag.isdigit())] \n", | |
" composers = [unicodedata.normalize('NFKD', tag).encode('ascii','ignore').decode(\"ascii\") for tag in composers]\n", | |
" \n", | |
" composers = [composer for composer in composers if (len(composer)>1) & (composer != 'tbd')]\n", | |
" \n", | |
" tags.extend(set(composers))\n", | |
" \n", | |
" tags_simple.extend(list(set([composer_period[composer] for composer in composers if composer in composer_period.keys()])))\n", | |
" tags_simple.extend(list(set([composer_nationality[composer] for composer in composers if composer in composer_nationality.keys()])))\n", | |
" \n", | |
" tags = [tag for tag in tags if (len(tag)>1) & (tag != 'tbd')]\n", | |
" \n", | |
" \n", | |
" d = {'Date': event.date.date(), 'Time': event.date.time(), 'Venue': event.place, \n", | |
" 'Price': event.price, 'Performers': event.performers, 'Program': event.program, \n", | |
" 'Tags': tags, 'Tags_Simple': tags_simple}\n", | |
" df = df.append(pd.Series(d), ignore_index=True)\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 120, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_WC = dfFromEvents(getEvents_WC())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 107, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"composers = ['prokofiev', 'shostakovich', 'schubert', 'schubert']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 112, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['romantic', 'modern']" | |
] | |
}, | |
"execution_count": 112, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"list(set([composer_period[composer] for composer in composers]))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 121, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_KC = dfFromEvents(getEvents_KC('CLA')+getEvents_KC('CHA'))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 122, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_all = pd.concat([df_WC, df_KC]).reset_index().drop(columns='index')\n", | |
"del df_WC\n", | |
"del df_KC" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 161, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#df_all.sort_values(by='Date')[:10]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 166, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Date</th>\n", | |
" <th>Time</th>\n", | |
" <th>Venue</th>\n", | |
" <th>Price</th>\n", | |
" <th>Performers</th>\n", | |
" <th>Program</th>\n", | |
" <th>Tags</th>\n", | |
" <th>Tags_Simple</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>2018-10-06</td>\n", | |
" <td>20:00:00</td>\n", | |
" <td>Washington Conservatory</td>\n", | |
" <td>Donation</td>\n", | |
" <td>[[Pressenda Chamber Players]]</td>\n", | |
" <td>[[Maurice Ravel, Trio in A Minor], [Peter Tcha...</td>\n", | |
" <td>[chamber, ravel, tchaikovsky]</td>\n", | |
" <td>[chamber, romantic, modern, russian, french]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>37</th>\n", | |
" <td>2018-10-10</td>\n", | |
" <td>19:30:00</td>\n", | |
" <td>Kennedy Center</td>\n", | |
" <td>50.00</td>\n", | |
" <td>[[Joel Link, violin], [Bryan Lee, violin]]</td>\n", | |
" <td>[[Anton Webern, Langsamer Satz], [Mason Bates...</td>\n", | |
" <td>[violin, violin, chamber, schubert, webern, ba...</td>\n", | |
" <td>[chamber, romantic, modern, austrian, american]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>2018-10-11</td>\n", | |
" <td>19:00:00</td>\n", | |
" <td>Kennedy Center</td>\n", | |
" <td>15.00 - 89.00</td>\n", | |
" <td>[[Christoph Eschenbach, conductor], [Ray Chen,...</td>\n", | |
" <td>[[Mendelssohn, Calm Sea and Prosperous Voyage...</td>\n", | |
" <td>[orchestra, violin, beethoven, mendelssohn]</td>\n", | |
" <td>[orchestra, romantic, classical, german]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>38</th>\n", | |
" <td>2018-10-23</td>\n", | |
" <td>19:30:00</td>\n", | |
" <td>Kennedy Center</td>\n", | |
" <td>55.00</td>\n", | |
" <td>[[Joseph Kalichstein, piano], [Jaime Laredo, v...</td>\n", | |
" <td>[[Robert Schumann, Selections from Canonic Et...</td>\n", | |
" <td>[piano, violin, cello, chamber, ravel, mendels...</td>\n", | |
" <td>[chamber, romantic, modern, german, french]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>2018-11-01</td>\n", | |
" <td>19:00:00</td>\n", | |
" <td>Kennedy Center</td>\n", | |
" <td>15.00 - 89.00</td>\n", | |
" <td>[[James Gaffigan, conductor], [Simon Trpceski,...</td>\n", | |
" <td>[[Prokofiev, Symphony No. 3], [Shostakovich, P...</td>\n", | |
" <td>[orchestra, piano, prokofiev, shostakovich, kh...</td>\n", | |
" <td>[orchestra, modern, russian]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>2018-11-29</td>\n", | |
" <td>19:00:00</td>\n", | |
" <td>Kennedy Center</td>\n", | |
" <td>15.00 - 89.00</td>\n", | |
" <td>[[Gianandrea Noseda, conductor], [Karina Flore...</td>\n", | |
" <td>[[Britten, War Requiem]]</td>\n", | |
" <td>[orchestra, soprano, tenor, baritone, britten]</td>\n", | |
" <td>[orchestra, modern, english]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>2018-12-01</td>\n", | |
" <td>20:00:00</td>\n", | |
" <td>Washington Conservatory</td>\n", | |
" <td>Donation</td>\n", | |
" <td>[[Pressenda Chamber Players]]</td>\n", | |
" <td>[[Johannes Brahms, Clarinet Trio in A Minor, O...</td>\n", | |
" <td>[chamber, messiaen, brahms]</td>\n", | |
" <td>[chamber, romantic, modern, german, french]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>2018-12-06</td>\n", | |
" <td>19:00:00</td>\n", | |
" <td>Kennedy Center</td>\n", | |
" <td>15.00 - 89.00</td>\n", | |
" <td>[[Gianandrea Noseda, conductor]]</td>\n", | |
" <td>[[Mahler, Symphony No. 1, “Titan”]]</td>\n", | |
" <td>[orchestra, mahler]</td>\n", | |
" <td>[orchestra, romantic, austrian]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>2019-01-05</td>\n", | |
" <td>20:00:00</td>\n", | |
" <td>Washington Conservatory</td>\n", | |
" <td>Donation</td>\n", | |
" <td>[[Alexander Paley, piano]]</td>\n", | |
" <td>[[Chopin, Etudes], [Tchaikovsky, Romeo and Jul...</td>\n", | |
" <td>[piano, chamber, tchaikovsky, chopin]</td>\n", | |
" <td>[chamber, romantic, russian]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>2019-01-18</td>\n", | |
" <td>20:00:00</td>\n", | |
" <td>Kennedy Center</td>\n", | |
" <td>15.00 - 89.00</td>\n", | |
" <td>[[Gianandrea Noseda, conductor], [Renée Flemin...</td>\n", | |
" <td>[[Schubert, Rosamunde—Overture and incidental...</td>\n", | |
" <td>[orchestra, soprano, schubert, schubert/berio]</td>\n", | |
" <td>[orchestra, romantic, austrian]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>40</th>\n", | |
" <td>2019-01-30</td>\n", | |
" <td>19:30:00</td>\n", | |
" <td>Kennedy Center</td>\n", | |
" <td>45.00</td>\n", | |
" <td>[[Valerie Coleman, flute]]</td>\n", | |
" <td>[[Lalo Schifrin, La Nouvelle Orleans ]]</td>\n", | |
" <td>[flute, chamber, schifrin]</td>\n", | |
" <td>[chamber, modern]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23</th>\n", | |
" <td>2019-01-31</td>\n", | |
" <td>19:00:00</td>\n", | |
" <td>Kennedy Center</td>\n", | |
" <td>15.00 - 89.00</td>\n", | |
" <td>[[Gianandrea Noseda, conductor], [Daniil Trifo...</td>\n", | |
" <td>[[Beethoven, Piano Concerto No. 5 “Emperor”],...</td>\n", | |
" <td>[orchestra, piano, beethoven, shostakovich]</td>\n", | |
" <td>[orchestra, modern, classical, german, russian]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Date Time Venue Price \\\n", | |
"6 2018-10-06 20:00:00 Washington Conservatory Donation \n", | |
"37 2018-10-10 19:30:00 Kennedy Center 50.00 \n", | |
"17 2018-10-11 19:00:00 Kennedy Center 15.00 - 89.00 \n", | |
"38 2018-10-23 19:30:00 Kennedy Center 55.00 \n", | |
"18 2018-11-01 19:00:00 Kennedy Center 15.00 - 89.00 \n", | |
"19 2018-11-29 19:00:00 Kennedy Center 15.00 - 89.00 \n", | |
"8 2018-12-01 20:00:00 Washington Conservatory Donation \n", | |
"20 2018-12-06 19:00:00 Kennedy Center 15.00 - 89.00 \n", | |
"9 2019-01-05 20:00:00 Washington Conservatory Donation \n", | |
"22 2019-01-18 20:00:00 Kennedy Center 15.00 - 89.00 \n", | |
"40 2019-01-30 19:30:00 Kennedy Center 45.00 \n", | |
"23 2019-01-31 19:00:00 Kennedy Center 15.00 - 89.00 \n", | |
"\n", | |
" Performers \\\n", | |
"6 [[Pressenda Chamber Players]] \n", | |
"37 [[Joel Link, violin], [Bryan Lee, violin]] \n", | |
"17 [[Christoph Eschenbach, conductor], [Ray Chen,... \n", | |
"38 [[Joseph Kalichstein, piano], [Jaime Laredo, v... \n", | |
"18 [[James Gaffigan, conductor], [Simon Trpceski,... \n", | |
"19 [[Gianandrea Noseda, conductor], [Karina Flore... \n", | |
"8 [[Pressenda Chamber Players]] \n", | |
"20 [[Gianandrea Noseda, conductor]] \n", | |
"9 [[Alexander Paley, piano]] \n", | |
"22 [[Gianandrea Noseda, conductor], [Renée Flemin... \n", | |
"40 [[Valerie Coleman, flute]] \n", | |
"23 [[Gianandrea Noseda, conductor], [Daniil Trifo... \n", | |
"\n", | |
" Program \\\n", | |
"6 [[Maurice Ravel, Trio in A Minor], [Peter Tcha... \n", | |
"37 [[Anton Webern, Langsamer Satz], [Mason Bates... \n", | |
"17 [[Mendelssohn, Calm Sea and Prosperous Voyage... \n", | |
"38 [[Robert Schumann, Selections from Canonic Et... \n", | |
"18 [[Prokofiev, Symphony No. 3], [Shostakovich, P... \n", | |
"19 [[Britten, War Requiem]] \n", | |
"8 [[Johannes Brahms, Clarinet Trio in A Minor, O... \n", | |
"20 [[Mahler, Symphony No. 1, “Titan”]] \n", | |
"9 [[Chopin, Etudes], [Tchaikovsky, Romeo and Jul... \n", | |
"22 [[Schubert, Rosamunde—Overture and incidental... \n", | |
"40 [[Lalo Schifrin, La Nouvelle Orleans ]] \n", | |
"23 [[Beethoven, Piano Concerto No. 5 “Emperor”],... \n", | |
"\n", | |
" Tags \\\n", | |
"6 [chamber, ravel, tchaikovsky] \n", | |
"37 [violin, violin, chamber, schubert, webern, ba... \n", | |
"17 [orchestra, violin, beethoven, mendelssohn] \n", | |
"38 [piano, violin, cello, chamber, ravel, mendels... \n", | |
"18 [orchestra, piano, prokofiev, shostakovich, kh... \n", | |
"19 [orchestra, soprano, tenor, baritone, britten] \n", | |
"8 [chamber, messiaen, brahms] \n", | |
"20 [orchestra, mahler] \n", | |
"9 [piano, chamber, tchaikovsky, chopin] \n", | |
"22 [orchestra, soprano, schubert, schubert/berio] \n", | |
"40 [flute, chamber, schifrin] \n", | |
"23 [orchestra, piano, beethoven, shostakovich] \n", | |
"\n", | |
" Tags_Simple \n", | |
"6 [chamber, romantic, modern, russian, french] \n", | |
"37 [chamber, romantic, modern, austrian, american] \n", | |
"17 [orchestra, romantic, classical, german] \n", | |
"38 [chamber, romantic, modern, german, french] \n", | |
"18 [orchestra, modern, russian] \n", | |
"19 [orchestra, modern, english] \n", | |
"8 [chamber, romantic, modern, german, french] \n", | |
"20 [orchestra, romantic, austrian] \n", | |
"9 [chamber, romantic, russian] \n", | |
"22 [orchestra, romantic, austrian] \n", | |
"40 [chamber, modern] \n", | |
"23 [orchestra, modern, classical, german, russian] " | |
] | |
}, | |
"execution_count": 166, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Upcoming concerts: strictly after today's date, soonest first, capped at the first 15 rows.\n", | |
"upcoming = df_all[df_all['Date'] > dt.datetime.today().date()].sort_values(by='Date')[:15]\n", | |
"# Rows labelled 7, 39 and 21 are removed by hand; the reason is not recorded here -- TODO document.\n", | |
"# errors='ignore' keeps the cell runnable once those labels are no longer inside the 15-row window\n", | |
"# (the window moves with today's date, and drop() raises KeyError for missing labels by default).\n", | |
"upcoming.drop(index=[7, 39, 21], errors='ignore')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#today = dt.datetime.today().date()\n", | |
"#df_all[ (df_all['Date'] > today) & (df_all['Date'] <= (today + dt.timedelta(weeks=5))) ].sort_values(by='Date')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 158, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#df_all" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 126, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class DictEncoder(base.BaseEstimator, base.TransformerMixin):\n", | |
"    \"\"\"Map one column of tag iterables to per-row ``{tag: 1}`` dictionaries.\n", | |
"\n", | |
"    ``transform`` returns the result of ``X[col].apply(...)``, i.e. one dict per row.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    def __init__(self, col):\n", | |
"        # Key of the column in X whose cells hold the tags.\n", | |
"        self.col = col\n", | |
"\n", | |
"    def fit(self, X, y=None):\n", | |
"        # Nothing is learned from the data; fit only returns the transformer itself.\n", | |
"        return self\n", | |
"\n", | |
"    def transform(self, X):\n", | |
"        return X[self.col].apply(self._as_indicator_dict)\n", | |
"\n", | |
"    @staticmethod\n", | |
"    def _as_indicator_dict(tags):\n", | |
"        # A cell that raises TypeError (not iterable, or holding unhashable items) becomes an empty dict.\n", | |
"        try:\n", | |
"            return dict.fromkeys(tags, 1)\n", | |
"        except TypeError:\n", | |
"            return {}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 127, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<49x14 sparse matrix of type '<class 'numpy.float64'>'\n", | |
"\twith 169 stored elements in Compressed Sparse Row format>" | |
] | |
}, | |
"execution_count": 127, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Encode the 'Tags_Simple' column of df_all as a sparse matrix: dict-encode each row, then vectorize.\n", | |
"tag_pipe = Pipeline(steps=[\n", | |
"    ('encoder', DictEncoder('Tags_Simple')),\n", | |
"    ('vectorizer', DictVectorizer()),\n", | |
"])\n", | |
"features = tag_pipe.fit_transform(df_all)\n", | |
"features" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 142, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"nn = NearestNeighbors(n_neighbors=2).fit(features)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 138, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Date</th>\n", | |
" <th>Time</th>\n", | |
" <th>Venue</th>\n", | |
" <th>Price</th>\n", | |
" <th>Performers</th>\n", | |
" <th>Program</th>\n", | |
" <th>Tags</th>\n", | |
" <th>Tags_Simple</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>31</th>\n", | |
" <td>2019-05-16</td>\n", | |
" <td>19:00:00</td>\n", | |
" <td>Kennedy Center</td>\n", | |
" <td>15.00 - 89.00</td>\n", | |
" <td>[[Gianandrea Noseda, conductor], [Erika Grimal...</td>\n", | |
" <td>[[Liszt, Dante Symphony], [Rossini, Stabat M...</td>\n", | |
" <td>[orchestra, soprano, liszt, rossini]</td>\n", | |
" <td>[orchestra, romantic, classical, hungarian, it...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>2018-12-01</td>\n", | |
" <td>20:00:00</td>\n", | |
" <td>Washington Conservatory</td>\n", | |
" <td>Donation</td>\n", | |
" <td>[[Pressenda Chamber Players]]</td>\n", | |
" <td>[[Johannes Brahms, Clarinet Trio in A Minor, O...</td>\n", | |
" <td>[chamber, messiaen, brahms]</td>\n", | |
" <td>[chamber, romantic, modern, german, french]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Date Time Venue Price \\\n", | |
"31 2019-05-16 19:00:00 Kennedy Center 15.00 - 89.00 \n", | |
"8 2018-12-01 20:00:00 Washington Conservatory Donation \n", | |
"\n", | |
" Performers \\\n", | |
"31 [[Gianandrea Noseda, conductor], [Erika Grimal... \n", | |
"8 [[Pressenda Chamber Players]] \n", | |
"\n", | |
" Program \\\n", | |
"31 [[Liszt, Dante Symphony], [Rossini, Stabat M... \n", | |
"8 [[Johannes Brahms, Clarinet Trio in A Minor, O... \n", | |
"\n", | |
" Tags \\\n", | |
"31 [orchestra, soprano, liszt, rossini] \n", | |
"8 [chamber, messiaen, brahms] \n", | |
"\n", | |
" Tags_Simple \n", | |
"31 [orchestra, romantic, classical, hungarian, it... \n", | |
"8 [chamber, romantic, modern, german, french] " | |
] | |
}, | |
"execution_count": 138, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_all.iloc[[31, 8]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 150, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Date</th>\n", | |
" <th>Time</th>\n", | |
" <th>Venue</th>\n", | |
" <th>Price</th>\n", | |
" <th>Performers</th>\n", | |
" <th>Program</th>\n", | |
" <th>Tags</th>\n", | |
" <th>Tags_Simple</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>31</th>\n", | |
" <td>2019-05-16</td>\n", | |
" <td>19:00:00</td>\n", | |
" <td>Kennedy Center</td>\n", | |
" <td>15.00 - 89.00</td>\n", | |
" <td>[[Gianandrea Noseda, conductor], [Erika Grimal...</td>\n", | |
" <td>[[Liszt, Dante Symphony], [Rossini, Stabat M...</td>\n", | |
" <td>[orchestra, soprano, liszt, rossini]</td>\n", | |
" <td>[orchestra, romantic, classical, hungarian, it...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>2018-10-11</td>\n", | |
" <td>19:00:00</td>\n", | |
" <td>Kennedy Center</td>\n", | |
" <td>15.00 - 89.00</td>\n", | |
" <td>[[Christoph Eschenbach, conductor], [Ray Chen,...</td>\n", | |
" <td>[[Mendelssohn, Calm Sea and Prosperous Voyage...</td>\n", | |
" <td>[orchestra, violin, beethoven, mendelssohn]</td>\n", | |
" <td>[orchestra, romantic, classical, german]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Date Time Venue Price \\\n", | |
"31 2019-05-16 19:00:00 Kennedy Center 15.00 - 89.00 \n", | |
"17 2018-10-11 19:00:00 Kennedy Center 15.00 - 89.00 \n", | |
"\n", | |
" Performers \\\n", | |
"31 [[Gianandrea Noseda, conductor], [Erika Grimal... \n", | |
"17 [[Christoph Eschenbach, conductor], [Ray Chen,... \n", | |
"\n", | |
" Program \\\n", | |
"31 [[Liszt, Dante Symphony], [Rossini, Stabat M... \n", | |
"17 [[Mendelssohn, Calm Sea and Prosperous Voyage... \n", | |
"\n", | |
" Tags \\\n", | |
"31 [orchestra, soprano, liszt, rossini] \n", | |
"17 [orchestra, violin, beethoven, mendelssohn] \n", | |
"\n", | |
" Tags_Simple \n", | |
"31 [orchestra, romantic, classical, hungarian, it... \n", | |
"17 [orchestra, romantic, classical, german] " | |
] | |
}, | |
"execution_count": 150, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dists, indices = nn.kneighbors(features[31])\n", | |
"df_all.iloc[indices[0]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 157, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Date</th>\n", | |
" <th>Time</th>\n", | |
" <th>Venue</th>\n", | |
" <th>Price</th>\n", | |
" <th>Performers</th>\n", | |
" <th>Program</th>\n", | |
" <th>Tags</th>\n", | |
" <th>Tags_Simple</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>2018-12-01</td>\n", | |
" <td>20:00:00</td>\n", | |
" <td>Washington Conservatory</td>\n", | |
" <td>Donation</td>\n", | |
" <td>[[Pressenda Chamber Players]]</td>\n", | |
" <td>[[Johannes Brahms, Clarinet Trio in A Minor, O...</td>\n", | |
" <td>[chamber, messiaen, brahms]</td>\n", | |
" <td>[chamber, romantic, modern, german, french]</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>38</th>\n", | |
" <td>2018-10-23</td>\n", | |
" <td>19:30:00</td>\n", | |
" <td>Kennedy Center</td>\n", | |
" <td>55.00</td>\n", | |
" <td>[[Joseph Kalichstein, piano], [Jaime Laredo, v...</td>\n", | |
" <td>[[Robert Schumann, Selections from Canonic Et...</td>\n", | |
" <td>[piano, violin, cello, chamber, ravel, mendels...</td>\n", | |
" <td>[chamber, romantic, modern, german, french]</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Date Time Venue Price \\\n", | |
"8 2018-12-01 20:00:00 Washington Conservatory Donation \n", | |
"38 2018-10-23 19:30:00 Kennedy Center 55.00 \n", | |
"\n", | |
" Performers \\\n", | |
"8 [[Pressenda Chamber Players]] \n", | |
"38 [[Joseph Kalichstein, piano], [Jaime Laredo, v... \n", | |
"\n", | |
" Program \\\n", | |
"8 [[Johannes Brahms, Clarinet Trio in A Minor, O... \n", | |
"38 [[Robert Schumann, Selections from Canonic Et... \n", | |
"\n", | |
" Tags \\\n", | |
"8 [chamber, messiaen, brahms] \n", | |
"38 [piano, violin, cello, chamber, ravel, mendels... \n", | |
"\n", | |
" Tags_Simple \n", | |
"8 [chamber, romantic, modern, german, french] \n", | |
"38 [chamber, romantic, modern, german, french] " | |
] | |
}, | |
"execution_count": 157, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dists, indices = nn.kneighbors(features[38])\n", | |
"df_all.iloc[indices[0]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 148, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['chamber', 'messiaen', 'brahms']" | |
] | |
}, | |
"execution_count": 148, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_all.iloc[8]['Tags']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"nltk.download('punkt')\n", | |
"nltk.download('stopwords')\n", | |
"nltk.download('averaged_perceptron_tagger')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# NOTE(review): KENNEDY_URL and links must come from earlier cells; neither definition is visible\n", | |
"# near this cell -- confirm they exist before a Restart & Run All.\n", | |
"# The timeout stops the cell from hanging on an unresponsive server, and raise_for_status()\n", | |
"# fails loudly on an HTTP error instead of parsing an error page.\n", | |
"concertPage = requests.get(KENNEDY_URL + links[26], timeout=30)\n", | |
"concertPage.raise_for_status()\n", | |
"soup = BeautifulSoup(concertPage.text, \"lxml\")\n", | |
"# Read the 'content' attribute of the <meta name=\"StartDate\"> element.\n", | |
"date = soup.find('meta', {'name': 'StartDate'})['content']\n", | |
"# All <div class=\"blurbpadding\"> elements; later cells read blurb[0].text.\n", | |
"blurb = soup.select('div.blurbpadding')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#tokens = word_tokenize(blurb[0].text.lower())\n", | |
"tokens = word_tokenize(blurb[0].text)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"stemmer = PorterStemmer()\n", | |
"tokens_stemmed = [stemmer.stem(token) for token in tokens]\n", | |
"\n", | |
"# Of the 25 most frequent stems, keep and print those that are not stopwords and are longer than 2 characters.\n", | |
"# NOTE(review): the stopword test runs on stems and only after the top-25 cut, so fewer than 25 rows\n", | |
"# may print and a stopword whose stem differs from the word itself is not filtered -- confirm intended.\n", | |
"frequencies = Counter(tokens_stemmed)\n", | |
"clean_tokens = []\n", | |
"for token, count in frequencies.most_common(25):\n", | |
"    if token not in ENGLISH_STOPWORDS and len(token) > 2:\n", | |
"        clean_tokens.append(token)\n", | |
"        print(token, count)\n", | |
"\n", | |
"#trigrams = ngrams([token for token in tokens_stemmed if (token not in ENGLISH_STOPWORDS) & (len(token)>2)], 3)\n", | |
"# ngrams() yields its results only once; store them in a list so they can be counted AND printed below\n", | |
"# (previously Counter consumed the iterator and the final print always showed []).\n", | |
"trigrams = list(ngrams(tokens, 3))\n", | |
"\n", | |
"frequencies_2 = Counter(trigrams)\n", | |
"for token, count in frequencies_2.most_common(10):\n", | |
"    print(token, count)\n", | |
"\n", | |
"\n", | |
"print(trigrams)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Tag every token, then print the blurb text followed by the (word, tag) pairs tagged 'NNP'.\n", | |
"tags = pos_tag(tokens)\n", | |
"proper_nouns = [(word, pos) for word, pos in tags if pos == 'NNP']\n", | |
"print(blurb[0].text)\n", | |
"print(proper_nouns)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"[w[0] for w in tags if w[1]=='NNP']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"blurb[0].text" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"BeautifulSoup\t Event\t composer_nationality\t composer_period\t datetime\t dfFromEvents\t df_all\t dill\t dt\t \n", | |
"getEvents_KC\t getEvents_WC\t get_date_WC\t get_link_KC\t get_performers_KC\t get_performers_WC\t get_price_KC\t get_program_KC\t get_program_WC\t \n", | |
"get_rel_links_KC\t get_rel_links_WC\t pd\t re\t requests\t unicodedata\t \n" | |
] | |
} | |
], | |
"source": [ | |
"%who" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment