Skip to content

Instantly share code, notes, and snippets.

@gunessenturk
Created October 4, 2018 08:14
Show Gist options
  • Save gunessenturk/ec45f95a3e62fe29df2d1ef0c080f345 to your computer and use it in GitHub Desktop.
Save gunessenturk/ec45f95a3e62fe29df2d1ef0c080f345 to your computer and use it in GitHub Desktop.
TDI_project_week4
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import dill\n",
"from bs4 import BeautifulSoup\n",
"from datetime import datetime\n",
"import re\n",
"import pandas as pd\n",
"import datetime as dt\n",
"import unicodedata"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import base\n",
"from sklearn.feature_extraction import DictVectorizer\n",
"from sklearn.pipeline import Pipeline, FeatureUnion\n",
"from sklearn.neighbors import NearestNeighbors"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"from nltk import word_tokenize\n",
"from collections import Counter\n",
"from nltk.stem import PorterStemmer # one of the several available stemmers\n",
"from nltk.corpus import stopwords\n",
"from nltk import ngrams\n",
"from nltk import pos_tag"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"ENGLISH_STOPWORDS = stopwords.words('english')\n",
"#KENNEDY_URL = 'http://www.kennedy-center.org'\n",
"#WASHINGTON_CONSERVATORY_URL = 'http://www.washingtonconservatory.org/html/'"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"composer_nationality = {'hindemith': 'german', 'debussy': 'french', 'mozart': 'austrian', 'liszt': 'hungarian',\n",
" 'schumann': 'german', 'faure':'french', 'ravel': 'french', 'saint-saens': 'french', \n",
" 'saint-saans': 'french',\n",
" 'tchaikovsky': 'russian', 'brahms': 'german', 'boccherini': 'italian', 'messiaen': 'french', \n",
" 'barber': 'american', 'rachmaninoff': 'russian', 'schubert': 'austrian', \n",
" 'schoenberg': 'austrian', 'mendelssohn': 'german', 'beethoven': 'german', 'prokofiev':'russian', \n",
" 'shostakovich':'russian', 'khachaturian': 'russian', 'britten': 'english', \n",
" 'mahler': 'austrian', 'wagner': 'german', 'berlioz': 'french', 'roussel': 'french', \n",
" 'haydn': 'austrian', 'bruckner': 'austrian', 'franck': 'french', 'rossini': 'italian', \n",
" 'copland': 'american', 'stravinsky': 'russian', 'bates': 'american', 'muhly': 'american', \n",
" 'nesbett': 'english', 'byrd': 'english', 'praetorius': 'german', 'bartók': 'hungarian', \n",
" 'bartok': 'hungarian', 'rzewski': 'american', 'loeffler': 'german', 'poulenc': 'french', \n",
" 'bach': 'german', 'scarlatti': 'italian', 'vivaldi': 'italian'\n",
" \n",
" }\n"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"composer_period = {'hindemith': 'modern', 'debussy': 'modern', 'mozart': 'classical', 'liszt': 'romantic',\n",
" 'schumann': 'romantic', 'faure':'romantic', 'ravel': 'modern', 'saint-saens': 'romantic', \n",
" 'saint-saans': 'romantic',\n",
" 'chopin': 'romantic', 'tchaikovsky': 'romantic', 'brahms': 'romantic', 'boccherini': 'classical', \n",
" 'messiaen': 'modern', 'barber': 'modern', 'rachmaninoff': 'romantic', 'schubert': 'romantic', \n",
" 'schoenberg': 'modern', 'silvestrov': 'modern', 'sylvestrov': 'modern', 'mendelssohn': 'romantic', \n",
" 'beethoven': 'classical', 'prokofiev': 'modern', 'shostakovich': 'modern', 'khachaturian': 'modern', \n",
" 'britten': 'modern', 'mahler': 'romantic', 'wagner': 'romantic', 'berlioz': 'romantic', \n",
" 'roussel': 'modern', 'haydn': 'classical', 'bruckner': 'romantic', 'franck': 'romantic', \n",
" 'sibelius': 'romantic', 'rossini': 'classical', 'copland': 'modern', 'berio': 'modern', \n",
" 'dvorak': 'romantic', 'stravinsky': 'modern', 'bates': 'modern', 'muhly': 'modern', \n",
" 'nesbett': 'renaissance', 'byrd': 'renaissance', 'praetorius': 'renaissance', 'schifrin': 'modern', \n",
" 'lutoslawski': 'modern', 'bartók': 'modern', 'bartok': 'modern', 'rzewski': 'modern', \n",
" 'martin': 'modern', 'baran': 'modern', 'loeffler': 'modern', 'poulenc': 'modern', \n",
" 'penderecki': 'modern', 'bach': 'baroque', 'scarlatti': 'baroque', 'vivaldi': 'baroque'\n",
" }"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"def get_rel_links_KC(genre): # genre is CLA for classical, CHA for chamber\n",
" # Use requests.get to download the page.\n",
" page = requests.get('http://www.kennedy-center.org/calendar/genre/' + genre) \n",
" soup = BeautifulSoup(page.text, \"lxml\")\n",
"\n",
" # Get all relative links to individual concerts\n",
" events = soup.select('h4') \n",
" links = [get_link_KC(event) for event in events]\n",
" return links"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"# Extract relative urls for each event\n",
"def get_link_KC(event):\n",
" rel_url = re.search( '<a\\shref=\"(.+)\">' , str(event)).group(1)\n",
" return rel_url"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"# Returns a list of tuples (<Performer name>, <role>)\n",
"def get_performers_KC(blurb):\n",
" if not re.search(r'blurbpadding\">[\\s]*(\\n?(\\w+\\s\\w+),\\s(\\w+)<br/>\\r*)+', str(blurb)):\n",
" return None\n",
" performers = re.search(r'blurbpadding\">[\\s]*(\\n?(\\w+\\s\\w+),\\s(\\w+)<br/>\\r*)+', str(blurb)).group(0)\n",
" performers = list(filter(lambda x: (len(x)>4),performers.split('<br/>')))\n",
" performers = list(map(lambda x: re.search('([A-Z].+),\\s(.+)', x),performers))\n",
" performers = [[performer.group(1), performer.group(2)] for performer in performers]\n",
" return performers"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"# Returns a list of lists [<Composer name>, <piece name>]\n",
"def get_program_KC(soup):\n",
"\n",
" composer_re = re.compile(r'<a href=\"/artist/index/[\\w]+\">(.+)</a>')\n",
" composition_re = re.compile(r'<a href=\"/artist/composition/[\\w]+\">(.+)</a>')\n",
" TAG_RE = re.compile(r'<[^>]+>')\n",
" #composers = re.findall(composer_re, str(soup))\n",
" #if composers:\n",
" # compositions = re.findall(composition_re, str(soup))\n",
" # compositions = [TAG_RE.sub('', composition) for composition in compositions]\n",
" # pieces = list(zip(composers, compositions))\n",
"\n",
" #else:\n",
" blurb = soup.select('div.blurbpadding') \n",
" pieces = re.findall(r'<strong>(.+</strong>[:|,].*)', str(blurb))\n",
" TAG_RE = re.compile(r'<[^>]+>')\n",
" pieces = [TAG_RE.sub('', piece) for piece in pieces]\n",
" pieces = [piece for piece in pieces if len(piece)<60]\n",
" pieces = [re.compile(r'\\xa0').sub('', piece) for piece in pieces]\n",
" if (pieces) and (':' in pieces[0]):\n",
" pieces = [piece.split(':') for piece in pieces]\n",
" else:\n",
" pieces = [piece.split(',') for piece in pieces]\n",
" \n",
" if pieces == []:\n",
" pieces = ['Program: TBD']\n",
" else:\n",
" for piece in pieces:\n",
" piece[0] = piece[0].title()\n",
" \n",
" return pieces"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# Returns the range of ticket proces\n",
"def get_price_KC(soup):\n",
" price = soup.find('div', {'class': re.compile(r'price*')}).text\n",
" price = re.search(r'(\\$\\d+\\.\\d\\d(\\s-\\s\\$\\d+\\.\\d\\d)?)', price).group(0) \n",
" price = re.compile(r'\\$').sub('', price)\n",
" return price"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Washington Conservatory\n",
"def get_rel_links_WC(): \n",
" # Download the page.\n",
" page = requests.get('http://www.washingtonconservatory.org/html/concerts.htm#professional') \n",
" soup = BeautifulSoup(page.text, \"lxml\")\n",
" \n",
" # Get all relative links to individual concerts\n",
" links = [link['href'] for link in soup.find_all(\"a\", {'href': re.compile(r'concerts1819_.*')})]\n",
" return links"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Get program for Washington Conservatory\n",
"def get_program_WC(soup):\n",
" program = soup.find_all('li')\n",
" program = [item for item in program if item.text != '']\n",
" YEAR_RE = re.compile(r'\\([\\d-]+\\)')\n",
" if not program:\n",
" program = ['Program: TBD']\n",
" else: \n",
" program = [item.text for item in program]\n",
" program = [YEAR_RE.sub('', item) for item in program]\n",
" program = [item.split(':') for item in program]\n",
" for index, item in enumerate(program):\n",
" program[index] = [x.strip() for x in program[index]]\n",
" \n",
" return program"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Get event date and time\n",
"def get_date_WC(soup):\n",
" date = soup.find('p', {'class': 'center', 'style': re.compile(r'position: relative.*')}).text.strip().split('\\n')[0]\n",
" date = pd.Timestamp(date)\n",
" return date"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Return a list of performers\n",
"def get_performers_WC(soup):\n",
" performers = soup.find('h3', {'style': re.compile(r'margin-left: 20px.*')})\n",
" performers = performers.findChildren()[0]\n",
" #performers = [item.strip() for item in performers if ',' in item]\n",
" performers = re.findall('([A-Z][A-Z]+\\.?-?\\s?[A-Z]*\\.?-?\\s?[A-Z]*\\.?,?\\s?[a-z]*\\s?[a-z]*\\s?[a-z]*)', performers.text)\n",
" performers = [performer.strip() for performer in performers]\n",
" performers = [performer.split(', ') for performer in performers]\n",
" if not performers:\n",
" return ['Performers: TBD']\n",
" for performer in performers:\n",
" if len(performer) == 1:\n",
" performer[0] = performer[0].title()\n",
" else:\n",
" performer[0] = performer[0].title()\n",
" performer[1] = performer[1].lower() \n",
" \n",
" return performers"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [],
"source": [
"# Get event links from Friday Morning Music Club, Inc.\n",
"def get_links_FMMC():\n",
" # Download the page.\n",
" links = []\n",
" for i in range(4):\n",
" page = requests.get('http://www.fmmc.org/events/list/?tribe_paged='+str(i+1)+'&tribe_event_display=list') \n",
" soup = BeautifulSoup(page.text, \"lxml\")\n",
"\n",
" # Get all relative links to individual concerts\n",
" links.extend([link['href'] for link in soup.find_all(\"a\", {'class': 'tribe-event-url'})])\n",
" links = [link for link in links if ('concert' in link) | ('orchestra' in link) | ('chorale' in link)]\n",
" return links "
]
},
{
"cell_type": "code",
"execution_count": 197,
"metadata": {},
"outputs": [],
"source": [
"# Get event date and time for Friday Morning Music Club, Inc.\n",
"def get_date_FMMC(soup):\n",
" date = soup.find_all('abbr', {'class': re.compile(r'.*tribe-events-start.*')})\n",
" date = date[0]['title']\n",
" time = soup.find_all('div', {'class': re.compile(r'.*tribe-events-start.*')})\n",
" time = time[0].text.strip().split(' - ')[0]\n",
" return pd.Timestamp(date + ' ' + time)"
]
},
{
"cell_type": "code",
"execution_count": 389,
"metadata": {},
"outputs": [],
"source": [
"# Get program for Friday Morning Music Club, Inc.\n",
"def get_program_FMMC(soup):\n",
" program = soup.find_all('div', {'class': re.compile(r'tribe-events-single-event-description.*')})\n",
" pieces = re.findall('[A-Z]+[:|,].*', str(program[0]))\n",
" performer_re = re.compile(r'<strong>[\\w|\\s]*</strong>')\n",
" tag_re = re.compile(r'</?\\w+/?>')\n",
" pieces = [re.sub(performer_re, '', item) for item in pieces]\n",
" pieces = [re.sub('\\.?\\s,.*', '', item) for item in pieces]\n",
" pieces = [item.split(':') for item in pieces]\n",
" pieces = [item for item in pieces if len(item)>1]\n",
" pieces = [[item[0].title().strip(), re.sub(tag_re, '', item[1]).strip()] for item in pieces]\n",
" return pieces"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"links = get_links_FMMC()"
]
},
{
"cell_type": "code",
"execution_count": 390,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['Beethoven', 'Sonata in F-sharp Major, op. 78'], ['Bruch', 'Eight Pieces for clarinet, viola and piano (selections)']]\n",
"[]\n",
"[['Brahms, Joel Friedman, Chopin, Wieniawski, J.S. Bach, Sarasate', 'Selected works']]\n",
"[['Rheinberger', 'Nonet in E-flat Major, op. 139']]\n",
"[['Beethoven', 'Sonata in F-sharp Major, op. 78'], ['Korngold', 'Selected songs'], ['Mozart', 'Duo in G Major, K.423']]\n",
"[['Haydn', 'Andante with variations in F Minor, Hob.XVII'], ['Fiske', 'Sonata for piano and clarinet']]\n",
"[['Balakirev', 'Mazurkas'], ['Rorem', 'Nantucket Songs'], ['Martinu', 'Sonata for flute, violin, and piano, H. 254']]\n",
"[['Stamitz', 'Quartet, op. 10, no. 5'], ['Martinu', 'Sonata'], ['Haydn', 'Sonata, Hob. XVI']]\n",
"[]\n",
"[['Mozart', 'Selected arias'], ['Beethoven', 'Sonata in F-sharp Major, op. 78. DEBUSSY'], ['Albert', 'Doppler Effect. GARY SCHOCKER']]\n",
"[['Brahms', 'Klavierstücke, op. 118'], ['Ginastera', 'Cinco Canciones Populares Argentinas'], ['Shostakovich', 'String Quartet No. 1 in C Major, op. 49. and']]\n",
"[['Bach, Scarlatti, Lovelady And York', 'Selected works'], ['Brahms', 'Klavierstücke, op. 118. Stephanie Ng, piano.'], ['Shostakovich', 'String Quartet No. 1 in C Major, op. 49']]\n",
"[['Bach', 'Movements from Partita in E Major, BWV 1006'], ['Schubert', 'Salve Regina in A Major, D. 676; Songs on Goethe’s “Mignon,” compiled and transcribed by Aribert Reimann'], ['Castello', 'Sonata Prima Concertate in stil moderno'], ['Telemann', 'Solo Sonata No 7']]\n",
"[['Poulenc', 'Suite Française'], ['Clarke, Gordon Jacob', 'Duos for clarinet and viola'], ['Mozart, Puccini, Delibes And Gershwin', 'Selected songs. Liana Diaz-Rivera, soprano; piano TBA.']]\n",
"[['Jolivet', 'Sonatine for flute and clarinet'], ['Strauss', 'Sonata in E-flat Major, op. 18.']]\n",
"[['Zyman', 'Fantasia Mexicana. and'], ['Haydn', 'Sonata in F Major, Hob. XVI'], ['Tchaikovsky', 'Five Romances (in Russian)']]\n",
"[['Rachmaninoff', 'All-Night Vigil (selections).'], ['Chesnokov', 'Salvation is Created.'], ['Rheinberger', 'Abendlied.']]\n",
"[['Rachmaninoff', 'All-Night Vigil (selections).'], ['Chesnokov', 'Salvation is Created.'], ['Rheinberger', 'Abendlied.']]\n",
"[['Davis', 'Merrie English Love Songs; and additional music for voice and trombone'], ['Jolivet', 'Sonatine for flute and clarinet'], ['Krstic', 'Croatian Songs, Part I; Get Along Home (selections)']]\n",
"[]\n",
"[['Mozart', 'Serenata notturna K. 239.'], ['Beethoven', 'Symphony No. 1.'], ['Beethoven', 'Violin Concerto.']]\n",
"[]\n",
"[]\n",
"[['Tchaikovsky', 'Selected Romances'], ['Haydn', 'F-Minor Variations. Guity Adjoodani, piano.'], ['Shikele', 'Dances for Three']]\n",
"[]\n",
"[]\n"
]
}
],
"source": [
"for link in links:\n",
" concertPage = requests.get(link) \n",
" soup = BeautifulSoup(concertPage.text, \"lxml\")\n",
" print(get_program_FMMC(soup))"
]
},
{
"cell_type": "code",
"execution_count": 387,
"metadata": {},
"outputs": [],
"source": [
"concertPage = requests.get(links[20]) \n",
"soup = BeautifulSoup(concertPage.text, \"lxml\")\n",
"program = soup.find_all('div', {'class': re.compile(r'tribe-events-single-event-description.*')})\n",
"pieces = re.findall('[A-Z]+[:|,].*', str(program[0]))\n",
"performer_re = re.compile(r'<strong>[\\w|\\s]*</strong>')\n",
"tag_re = re.compile(r'</?\\w+/?>')\n",
"pieces = [re.sub(performer_re, '', item) for item in pieces]\n",
"pieces = [re.sub('\\.?\\s,.*', '', item) for item in pieces]\n",
"pieces = [item.split(':') for item in pieces]\n",
"pieces = [item for item in pieces if len(item)>1]\n",
"pieces = [[item[0].title().strip(), re.sub(tag_re, '', item[1]).strip()] for item in pieces]"
]
},
{
"cell_type": "code",
"execution_count": 385,
"metadata": {},
"outputs": [],
"source": [
"pieces = re.findall('[A-Z]+[:|,].*', str(program[0]))\n",
"performer_re = re.compile(r'<strong>[\\w|\\s]*</strong>')\n",
"tag_re = re.compile(r'</?\\w+/?>')\n",
"pieces = [re.sub(performer_re, '', item) for item in pieces]\n",
"pieces = [re.sub('\\.?\\s,.*', '', item) for item in pieces]\n",
"pieces = [item.split(':') for item in pieces]"
]
},
{
"cell_type": "code",
"execution_count": 388,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['Mozart', 'Serenata notturna K. 239.'],\n",
" ['Beethoven', 'Symphony No. 1.'],\n",
" ['Beethoven', 'Violin Concerto.']]"
]
},
"execution_count": 388,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pieces"
]
},
{
"cell_type": "code",
"execution_count": 384,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'<div class=\"tribe-events-single-event-description tribe-events-content\">\\n<input class=\"fooboxshare_post_id\" type=\"hidden\" value=\"5091\"/><p>FMMC Avanti Orchestra, <strong>Pablo Saelzer</strong>, Conductor.<br/>\\nSoloist: <strong>Youjin Lee</strong>, violin, 2018 Washington International Competition first-prize winner.</p>\\n<p><strong>Program:</strong><br/>\\n• MOZART: Serenata notturna K. 239.<br/>\\n• BEETHOVEN: Symphony No. 1.<br/>\\n• BEETHOVEN: Violin Concerto.</p>\\n<p>Read the <a href=\"http://www.fmmc.org/performance-opportunities/avantiorchestra/\">Avanti page</a> for more details about the orchestra and other concerts this season.</p>\\n<p>The <a href=\"https://fmmcfoundation.org/current-competitions/2018-washington-international-competition-strings/\">Washington International Competition for String Players</a> Finals took place at the Terrace Theater of the Kennedy Center for the Performing Arts in Washington, DC, on June 24, 2018. </p>\\n</div>'"
]
},
"execution_count": 384,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"str(program[0])"
]
},
{
"cell_type": "code",
"execution_count": 373,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['MOZART: Selected arias. <strong>Debora Madsen</strong>, soprano; <strong>Gillian Cookson</strong>, piano.<br/>',\n",
" 'BEETHOVEN: Sonata in F-sharp Major, op. 78. DEBUSSY: <em>Gardens in the Rain</em>. <strong>Joan Berman Mizrahi</strong>, piano.<br/>',\n",
" 'ALBERT: <em>Doppler Effect</em>. GARY SCHOCKER: <em>Danger-High Voltage</em>. SAMUEL ZYMAN: <em>Fantasia Mexicana</em>. <strong>Laura Benning</strong> and <strong>Gwyn Jones</strong>, flutes; piano TBD.</p>']"
]
},
"execution_count": 373,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"re.findall('[A-Z]+[:|,].*', str(program[0]))"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"class Event():\n",
" def __init__(self, program=[], performers=[], date=None, price=None, place=None, tags=[], tags_simple=[]):\n",
" self.program = program\n",
" self.performers = performers\n",
" self.date = date\n",
" self.price = price\n",
" self.place = place\n",
" self.tags = tags\n",
" self.tags_simple = tags_simple\n",
" \n",
" def addTag(self, tag):\n",
" self.tags.append(tag)\n",
" \n",
" def printEvent(self):\n",
" date_str = self.date.strftime('%A, %B {}, %Y {}:%M %p'.format(self.date.day, self.date.hour%12))\n",
" print(date_str)\n",
" print(self.performers)\n",
" print(self.program)\n",
" print(self.place)\n",
" print(self.price)\n",
" print('\\n')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Returns a list of event objects from Washington Conservatory\n",
"def getEvents_WC():\n",
" WASHINGTON_CONSERVATORY_URL = 'http://www.washingtonconservatory.org/html/'\n",
" events = []\n",
" for link in get_rel_links_WC():\n",
" concertPage = requests.get(WASHINGTON_CONSERVATORY_URL + link) \n",
" soup = BeautifulSoup(concertPage.text, \"lxml\")\n",
" program = get_program_WC(soup)\n",
" date = get_date_WC(soup)\n",
" performers = get_performers_WC(soup)\n",
" events.append(Event(program=program, performers=performers, date=date, \n",
" place='Washington Conservatory', price='Donation'))\n",
" return events"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"# Returns a list of event objects from Kennedy Center\n",
"def getEvents_KC(genre):\n",
" KENNEDY_URL = 'http://www.kennedy-center.org'\n",
" events = []\n",
" for link in get_rel_links_KC(genre):\n",
" concertPage = requests.get(KENNEDY_URL + link) \n",
" soup = BeautifulSoup(concertPage.text, \"lxml\")\n",
" date = pd.Timestamp(soup.find('meta', {'name': 'StartDate'})['content']).to_pydatetime()\n",
" #price = soup.find('div', {'class': re.compile(r'price*')}).text\n",
" #price = re.search(r'(\\$\\d+\\.\\d\\d(\\s-\\s\\$\\d+\\.\\d\\d)?)', price).group(0) \n",
" price = get_price_KC(soup)\n",
" blurb = soup.select('div.blurbpadding') \n",
" if not get_performers_KC(blurb):\n",
" continue\n",
" performers = get_performers_KC(blurb)\n",
" program = get_program_KC(soup)\n",
" events.append(Event(program=program, performers=performers, date=date, \n",
" place='Kennedy Center', price=price))\n",
" return events"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"def dfFromEvents(event_list):\n",
" columns = ['Date', 'Time', 'Venue', 'Price', 'Performers', 'Program', 'Tags', 'Tags_Simple']\n",
" df = pd.DataFrame(columns=columns)\n",
" for event in event_list:\n",
" tags = [] \n",
" tags_simple = []\n",
" composers = []\n",
" tags.extend([performer[1] for performer in event.performers if len(performer)>1])\n",
" \n",
" \n",
" tags = [ re.compile(r'conductor').sub('orchestra', tag) for tag in tags ]\n",
" tags_simple = [ re.compile(r'conductor').sub('orchestra', tag) for tag in tags_simple ]\n",
" \n",
" if 'orchestra' not in tags:\n",
" tags.append('chamber')\n",
" tags_simple.append('chamber')\n",
" else: \n",
" tags_simple.append('orchestra')\n",
" \n",
" tags = [ re.compile(r'\\s?solo\\s?').sub('', tag) for tag in tags ]\n",
" tags = [ re.compile(r' and').sub(',', tag) for tag in tags ]\n",
" \n",
" \n",
" #tags.extend( [p[0] for p in event.program if len(p[0])>1] )\n",
" composers.extend( [p[0] for p in event.program if len(p[0])>1] )\n",
" \n",
" #for index in range(len(tags)):\n",
" # tags[index] = tags[index].lower()\n",
" # if len(tags[index].split(' ')) > 1:\n",
" # tags[index] = tags[index].split(' ')[-1]\n",
" \n",
" for index in range(len(composers)):\n",
" composers[index] = composers[index].lower()\n",
" if len(composers[index].split(' ')) > 1:\n",
" composers[index] = composers[index].split(' ')[-1]\n",
" \n",
" \n",
" #tags = [tag for tag in tags if (not tag.isdigit())] \n",
" #tags = [unicodedata.normalize('NFKD', tag).encode('ascii','ignore').decode(\"ascii\") for tag in tags]\n",
" composers = [tag for tag in composers if (not tag.isdigit())] \n",
" composers = [unicodedata.normalize('NFKD', tag).encode('ascii','ignore').decode(\"ascii\") for tag in composers]\n",
" \n",
" composers = [composer for composer in composers if (len(composer)>1) & (composer != 'tbd')]\n",
" \n",
" tags.extend(set(composers))\n",
" \n",
" tags_simple.extend(list(set([composer_period[composer] for composer in composers if composer in composer_period.keys()])))\n",
" tags_simple.extend(list(set([composer_nationality[composer] for composer in composers if composer in composer_nationality.keys()])))\n",
" \n",
" tags = [tag for tag in tags if (len(tag)>1) & (tag != 'tbd')]\n",
" \n",
" \n",
" d = {'Date': event.date.date(), 'Time': event.date.time(), 'Venue': event.place, \n",
" 'Price': event.price, 'Performers': event.performers, 'Program': event.program, \n",
" 'Tags': tags, 'Tags_Simple': tags_simple}\n",
" df = df.append(pd.Series(d), ignore_index=True)\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"df_WC = dfFromEvents(getEvents_WC())"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"composers = ['prokofiev', 'shostakovich', 'schubert', 'schubert']"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['romantic', 'modern']"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(set([composer_period[composer] for composer in composers]))"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
"df_KC = dfFromEvents(getEvents_KC('CLA')+getEvents_KC('CHA'))"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
"df_all = pd.concat([df_WC, df_KC]).reset_index().drop(columns='index')\n",
"del df_WC\n",
"del df_KC"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [],
"source": [
"#df_all.sort_values(by='Date')[:10]"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Date</th>\n",
" <th>Time</th>\n",
" <th>Venue</th>\n",
" <th>Price</th>\n",
" <th>Performers</th>\n",
" <th>Program</th>\n",
" <th>Tags</th>\n",
" <th>Tags_Simple</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2018-10-06</td>\n",
" <td>20:00:00</td>\n",
" <td>Washington Conservatory</td>\n",
" <td>Donation</td>\n",
" <td>[[Pressenda Chamber Players]]</td>\n",
" <td>[[Maurice Ravel, Trio in A Minor], [Peter Tcha...</td>\n",
" <td>[chamber, ravel, tchaikovsky]</td>\n",
" <td>[chamber, romantic, modern, russian, french]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>2018-10-10</td>\n",
" <td>19:30:00</td>\n",
" <td>Kennedy Center</td>\n",
" <td>50.00</td>\n",
" <td>[[Joel Link, violin], [Bryan Lee, violin]]</td>\n",
" <td>[[Anton Webern, Langsamer Satz], [Mason Bates...</td>\n",
" <td>[violin, violin, chamber, schubert, webern, ba...</td>\n",
" <td>[chamber, romantic, modern, austrian, american]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>2018-10-11</td>\n",
" <td>19:00:00</td>\n",
" <td>Kennedy Center</td>\n",
" <td>15.00 - 89.00</td>\n",
" <td>[[Christoph Eschenbach, conductor], [Ray Chen,...</td>\n",
" <td>[[Mendelssohn, Calm Sea and Prosperous Voyage...</td>\n",
" <td>[orchestra, violin, beethoven, mendelssohn]</td>\n",
" <td>[orchestra, romantic, classical, german]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>2018-10-23</td>\n",
" <td>19:30:00</td>\n",
" <td>Kennedy Center</td>\n",
" <td>55.00</td>\n",
" <td>[[Joseph Kalichstein, piano], [Jaime Laredo, v...</td>\n",
" <td>[[Robert Schumann, Selections from Canonic Et...</td>\n",
" <td>[piano, violin, cello, chamber, ravel, mendels...</td>\n",
" <td>[chamber, romantic, modern, german, french]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>2018-11-01</td>\n",
" <td>19:00:00</td>\n",
" <td>Kennedy Center</td>\n",
" <td>15.00 - 89.00</td>\n",
" <td>[[James Gaffigan, conductor], [Simon Trpceski,...</td>\n",
" <td>[[Prokofiev, Symphony No. 3], [Shostakovich, P...</td>\n",
" <td>[orchestra, piano, prokofiev, shostakovich, kh...</td>\n",
" <td>[orchestra, modern, russian]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>2018-11-29</td>\n",
" <td>19:00:00</td>\n",
" <td>Kennedy Center</td>\n",
" <td>15.00 - 89.00</td>\n",
" <td>[[Gianandrea Noseda, conductor], [Karina Flore...</td>\n",
" <td>[[Britten, War Requiem]]</td>\n",
" <td>[orchestra, soprano, tenor, baritone, britten]</td>\n",
" <td>[orchestra, modern, english]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2018-12-01</td>\n",
" <td>20:00:00</td>\n",
" <td>Washington Conservatory</td>\n",
" <td>Donation</td>\n",
" <td>[[Pressenda Chamber Players]]</td>\n",
" <td>[[Johannes Brahms, Clarinet Trio in A Minor, O...</td>\n",
" <td>[chamber, messiaen, brahms]</td>\n",
" <td>[chamber, romantic, modern, german, french]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>2018-12-06</td>\n",
" <td>19:00:00</td>\n",
" <td>Kennedy Center</td>\n",
" <td>15.00 - 89.00</td>\n",
" <td>[[Gianandrea Noseda, conductor]]</td>\n",
" <td>[[Mahler, Symphony No. 1, “Titan”]]</td>\n",
" <td>[orchestra, mahler]</td>\n",
" <td>[orchestra, romantic, austrian]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>2019-01-05</td>\n",
" <td>20:00:00</td>\n",
" <td>Washington Conservatory</td>\n",
" <td>Donation</td>\n",
" <td>[[Alexander Paley, piano]]</td>\n",
" <td>[[Chopin, Etudes], [Tchaikovsky, Romeo and Jul...</td>\n",
" <td>[piano, chamber, tchaikovsky, chopin]</td>\n",
" <td>[chamber, romantic, russian]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>2019-01-18</td>\n",
" <td>20:00:00</td>\n",
" <td>Kennedy Center</td>\n",
" <td>15.00 - 89.00</td>\n",
" <td>[[Gianandrea Noseda, conductor], [Renée Flemin...</td>\n",
" <td>[[Schubert, Rosamunde—Overture and incidental...</td>\n",
" <td>[orchestra, soprano, schubert, schubert/berio]</td>\n",
" <td>[orchestra, romantic, austrian]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>2019-01-30</td>\n",
" <td>19:30:00</td>\n",
" <td>Kennedy Center</td>\n",
" <td>45.00</td>\n",
" <td>[[Valerie Coleman, flute]]</td>\n",
" <td>[[Lalo Schifrin, La Nouvelle Orleans ]]</td>\n",
" <td>[flute, chamber, schifrin]</td>\n",
" <td>[chamber, modern]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>2019-01-31</td>\n",
" <td>19:00:00</td>\n",
" <td>Kennedy Center</td>\n",
" <td>15.00 - 89.00</td>\n",
" <td>[[Gianandrea Noseda, conductor], [Daniil Trifo...</td>\n",
" <td>[[Beethoven, Piano Concerto No. 5 “Emperor”],...</td>\n",
" <td>[orchestra, piano, beethoven, shostakovich]</td>\n",
" <td>[orchestra, modern, classical, german, russian]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Date Time Venue Price \\\n",
"6 2018-10-06 20:00:00 Washington Conservatory Donation \n",
"37 2018-10-10 19:30:00 Kennedy Center 50.00 \n",
"17 2018-10-11 19:00:00 Kennedy Center 15.00 - 89.00 \n",
"38 2018-10-23 19:30:00 Kennedy Center 55.00 \n",
"18 2018-11-01 19:00:00 Kennedy Center 15.00 - 89.00 \n",
"19 2018-11-29 19:00:00 Kennedy Center 15.00 - 89.00 \n",
"8 2018-12-01 20:00:00 Washington Conservatory Donation \n",
"20 2018-12-06 19:00:00 Kennedy Center 15.00 - 89.00 \n",
"9 2019-01-05 20:00:00 Washington Conservatory Donation \n",
"22 2019-01-18 20:00:00 Kennedy Center 15.00 - 89.00 \n",
"40 2019-01-30 19:30:00 Kennedy Center 45.00 \n",
"23 2019-01-31 19:00:00 Kennedy Center 15.00 - 89.00 \n",
"\n",
" Performers \\\n",
"6 [[Pressenda Chamber Players]] \n",
"37 [[Joel Link, violin], [Bryan Lee, violin]] \n",
"17 [[Christoph Eschenbach, conductor], [Ray Chen,... \n",
"38 [[Joseph Kalichstein, piano], [Jaime Laredo, v... \n",
"18 [[James Gaffigan, conductor], [Simon Trpceski,... \n",
"19 [[Gianandrea Noseda, conductor], [Karina Flore... \n",
"8 [[Pressenda Chamber Players]] \n",
"20 [[Gianandrea Noseda, conductor]] \n",
"9 [[Alexander Paley, piano]] \n",
"22 [[Gianandrea Noseda, conductor], [Renée Flemin... \n",
"40 [[Valerie Coleman, flute]] \n",
"23 [[Gianandrea Noseda, conductor], [Daniil Trifo... \n",
"\n",
" Program \\\n",
"6 [[Maurice Ravel, Trio in A Minor], [Peter Tcha... \n",
"37 [[Anton Webern, Langsamer Satz], [Mason Bates... \n",
"17 [[Mendelssohn, Calm Sea and Prosperous Voyage... \n",
"38 [[Robert Schumann, Selections from Canonic Et... \n",
"18 [[Prokofiev, Symphony No. 3], [Shostakovich, P... \n",
"19 [[Britten, War Requiem]] \n",
"8 [[Johannes Brahms, Clarinet Trio in A Minor, O... \n",
"20 [[Mahler, Symphony No. 1, “Titan”]] \n",
"9 [[Chopin, Etudes], [Tchaikovsky, Romeo and Jul... \n",
"22 [[Schubert, Rosamunde—Overture and incidental... \n",
"40 [[Lalo Schifrin, La Nouvelle Orleans ]] \n",
"23 [[Beethoven, Piano Concerto No. 5 “Emperor”],... \n",
"\n",
" Tags \\\n",
"6 [chamber, ravel, tchaikovsky] \n",
"37 [violin, violin, chamber, schubert, webern, ba... \n",
"17 [orchestra, violin, beethoven, mendelssohn] \n",
"38 [piano, violin, cello, chamber, ravel, mendels... \n",
"18 [orchestra, piano, prokofiev, shostakovich, kh... \n",
"19 [orchestra, soprano, tenor, baritone, britten] \n",
"8 [chamber, messiaen, brahms] \n",
"20 [orchestra, mahler] \n",
"9 [piano, chamber, tchaikovsky, chopin] \n",
"22 [orchestra, soprano, schubert, schubert/berio] \n",
"40 [flute, chamber, schifrin] \n",
"23 [orchestra, piano, beethoven, shostakovich] \n",
"\n",
" Tags_Simple \n",
"6 [chamber, romantic, modern, russian, french] \n",
"37 [chamber, romantic, modern, austrian, american] \n",
"17 [orchestra, romantic, classical, german] \n",
"38 [chamber, romantic, modern, german, french] \n",
"18 [orchestra, modern, russian] \n",
"19 [orchestra, modern, english] \n",
"8 [chamber, romantic, modern, german, french] \n",
"20 [orchestra, romantic, austrian] \n",
"9 [chamber, romantic, russian] \n",
"22 [orchestra, romantic, austrian] \n",
"40 [chamber, modern] \n",
"23 [orchestra, modern, classical, german, russian] "
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_all[df_all['Date'] > dt.datetime.today().date()].sort_values(by='Date')[:15].drop(index=[7, 39, 21])"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"#today = dt.datetime.today().date()\n",
"#df_all[ (df_all['Date'] > today) & (df_all['Date'] <= (today + dt.timedelta(weeks=5))) ].sort_values(by='Date')"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
"#df_all"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [],
"source": [
"class DictEncoder(base.BaseEstimator, base.TransformerMixin):\n",
" \n",
" def __init__(self, col):\n",
" self.col = col\n",
" \n",
" def fit(self, X, y=None):\n",
" return self\n",
" \n",
" def transform(self, X):\n",
" \n",
" def to_dict(l):\n",
" try:\n",
" return {x: 1 for x in l}\n",
" except TypeError:\n",
" return {}\n",
" \n",
" return X[self.col].apply(to_dict)"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<49x14 sparse matrix of type '<class 'numpy.float64'>'\n",
"\twith 169 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tag_pipe = Pipeline([('encoder', DictEncoder('Tags_Simple')),\n",
" ('vectorizer', DictVectorizer())])\n",
"features = tag_pipe.fit_transform(df_all)\n",
"features"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {},
"outputs": [],
"source": [
"nn = NearestNeighbors(n_neighbors=2).fit(features)"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Date</th>\n",
" <th>Time</th>\n",
" <th>Venue</th>\n",
" <th>Price</th>\n",
" <th>Performers</th>\n",
" <th>Program</th>\n",
" <th>Tags</th>\n",
" <th>Tags_Simple</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>2019-05-16</td>\n",
" <td>19:00:00</td>\n",
" <td>Kennedy Center</td>\n",
" <td>15.00 - 89.00</td>\n",
" <td>[[Gianandrea Noseda, conductor], [Erika Grimal...</td>\n",
" <td>[[Liszt, Dante Symphony], [Rossini, Stabat M...</td>\n",
" <td>[orchestra, soprano, liszt, rossini]</td>\n",
" <td>[orchestra, romantic, classical, hungarian, it...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2018-12-01</td>\n",
" <td>20:00:00</td>\n",
" <td>Washington Conservatory</td>\n",
" <td>Donation</td>\n",
" <td>[[Pressenda Chamber Players]]</td>\n",
" <td>[[Johannes Brahms, Clarinet Trio in A Minor, O...</td>\n",
" <td>[chamber, messiaen, brahms]</td>\n",
" <td>[chamber, romantic, modern, german, french]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Date Time Venue Price \\\n",
"31 2019-05-16 19:00:00 Kennedy Center 15.00 - 89.00 \n",
"8 2018-12-01 20:00:00 Washington Conservatory Donation \n",
"\n",
" Performers \\\n",
"31 [[Gianandrea Noseda, conductor], [Erika Grimal... \n",
"8 [[Pressenda Chamber Players]] \n",
"\n",
" Program \\\n",
"31 [[Liszt, Dante Symphony], [Rossini, Stabat M... \n",
"8 [[Johannes Brahms, Clarinet Trio in A Minor, O... \n",
"\n",
" Tags \\\n",
"31 [orchestra, soprano, liszt, rossini] \n",
"8 [chamber, messiaen, brahms] \n",
"\n",
" Tags_Simple \n",
"31 [orchestra, romantic, classical, hungarian, it... \n",
"8 [chamber, romantic, modern, german, french] "
]
},
"execution_count": 138,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_all.iloc[[31, 8]]"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Date</th>\n",
" <th>Time</th>\n",
" <th>Venue</th>\n",
" <th>Price</th>\n",
" <th>Performers</th>\n",
" <th>Program</th>\n",
" <th>Tags</th>\n",
" <th>Tags_Simple</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>2019-05-16</td>\n",
" <td>19:00:00</td>\n",
" <td>Kennedy Center</td>\n",
" <td>15.00 - 89.00</td>\n",
" <td>[[Gianandrea Noseda, conductor], [Erika Grimal...</td>\n",
" <td>[[Liszt, Dante Symphony], [Rossini, Stabat M...</td>\n",
" <td>[orchestra, soprano, liszt, rossini]</td>\n",
" <td>[orchestra, romantic, classical, hungarian, it...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>2018-10-11</td>\n",
" <td>19:00:00</td>\n",
" <td>Kennedy Center</td>\n",
" <td>15.00 - 89.00</td>\n",
" <td>[[Christoph Eschenbach, conductor], [Ray Chen,...</td>\n",
" <td>[[Mendelssohn, Calm Sea and Prosperous Voyage...</td>\n",
" <td>[orchestra, violin, beethoven, mendelssohn]</td>\n",
" <td>[orchestra, romantic, classical, german]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Date Time Venue Price \\\n",
"31 2019-05-16 19:00:00 Kennedy Center 15.00 - 89.00 \n",
"17 2018-10-11 19:00:00 Kennedy Center 15.00 - 89.00 \n",
"\n",
" Performers \\\n",
"31 [[Gianandrea Noseda, conductor], [Erika Grimal... \n",
"17 [[Christoph Eschenbach, conductor], [Ray Chen,... \n",
"\n",
" Program \\\n",
"31 [[Liszt, Dante Symphony], [Rossini, Stabat M... \n",
"17 [[Mendelssohn, Calm Sea and Prosperous Voyage... \n",
"\n",
" Tags \\\n",
"31 [orchestra, soprano, liszt, rossini] \n",
"17 [orchestra, violin, beethoven, mendelssohn] \n",
"\n",
" Tags_Simple \n",
"31 [orchestra, romantic, classical, hungarian, it... \n",
"17 [orchestra, romantic, classical, german] "
]
},
"execution_count": 150,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dists, indices = nn.kneighbors(features[31])\n",
"df_all.iloc[indices[0]]"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Date</th>\n",
" <th>Time</th>\n",
" <th>Venue</th>\n",
" <th>Price</th>\n",
" <th>Performers</th>\n",
" <th>Program</th>\n",
" <th>Tags</th>\n",
" <th>Tags_Simple</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2018-12-01</td>\n",
" <td>20:00:00</td>\n",
" <td>Washington Conservatory</td>\n",
" <td>Donation</td>\n",
" <td>[[Pressenda Chamber Players]]</td>\n",
" <td>[[Johannes Brahms, Clarinet Trio in A Minor, O...</td>\n",
" <td>[chamber, messiaen, brahms]</td>\n",
" <td>[chamber, romantic, modern, german, french]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>2018-10-23</td>\n",
" <td>19:30:00</td>\n",
" <td>Kennedy Center</td>\n",
" <td>55.00</td>\n",
" <td>[[Joseph Kalichstein, piano], [Jaime Laredo, v...</td>\n",
" <td>[[Robert Schumann, Selections from Canonic Et...</td>\n",
" <td>[piano, violin, cello, chamber, ravel, mendels...</td>\n",
" <td>[chamber, romantic, modern, german, french]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Date Time Venue Price \\\n",
"8 2018-12-01 20:00:00 Washington Conservatory Donation \n",
"38 2018-10-23 19:30:00 Kennedy Center 55.00 \n",
"\n",
" Performers \\\n",
"8 [[Pressenda Chamber Players]] \n",
"38 [[Joseph Kalichstein, piano], [Jaime Laredo, v... \n",
"\n",
" Program \\\n",
"8 [[Johannes Brahms, Clarinet Trio in A Minor, O... \n",
"38 [[Robert Schumann, Selections from Canonic Et... \n",
"\n",
" Tags \\\n",
"8 [chamber, messiaen, brahms] \n",
"38 [piano, violin, cello, chamber, ravel, mendels... \n",
"\n",
" Tags_Simple \n",
"8 [chamber, romantic, modern, german, french] \n",
"38 [chamber, romantic, modern, german, french] "
]
},
"execution_count": 157,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dists, indices = nn.kneighbors(features[38])\n",
"df_all.iloc[indices[0]]"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['chamber', 'messiaen', 'brahms']"
]
},
"execution_count": 148,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_all.iloc[8]['Tags']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('averaged_perceptron_tagger')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"concertPage = requests.get(KENNEDY_URL + links[26]) \n",
"soup = BeautifulSoup(concertPage.text, \"lxml\")\n",
"date = soup.find('meta', {'name': 'StartDate'})['content']\n",
"blurb = soup.select('div.blurbpadding') "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#tokens = word_tokenize(blurb[0].text.lower())\n",
"tokens = word_tokenize(blurb[0].text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clean_tokens = []\n",
"stemmer = PorterStemmer()\n",
"tokens_stemmed = []\n",
"for token in tokens:\n",
" stemmed_token = stemmer.stem(token)\n",
" tokens_stemmed.append(stemmed_token)\n",
"\n",
"frequencies = Counter(tokens_stemmed)\n",
"for token, count in frequencies.most_common(25):\n",
" if (token not in ENGLISH_STOPWORDS) & (len(token)>2):\n",
" clean_tokens.append(token)\n",
" print(token, count)\n",
" \n",
"#trigrams = ngrams([token for token in tokens_stemmed if (token not in ENGLISH_STOPWORDS) & (len(token)>2)], 3)\n",
"trigrams = ngrams(tokens, 3)\n",
"\n",
"frequencies_2 = Counter(trigrams)\n",
"for token, count in frequencies_2.most_common(10):\n",
" print(token, count)\n",
"\n",
"\n",
"print(list(trigrams))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tags = pos_tag(tokens)\n",
"print(blurb[0].text)\n",
"print([tag for tag in tags if tag[1] == 'NNP'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"[w[0] for w in tags if w[1]=='NNP']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"blurb[0].text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"BeautifulSoup\t Event\t composer_nationality\t composer_period\t datetime\t dfFromEvents\t df_all\t dill\t dt\t \n",
"getEvents_KC\t getEvents_WC\t get_date_WC\t get_link_KC\t get_performers_KC\t get_performers_WC\t get_price_KC\t get_program_KC\t get_program_WC\t \n",
"get_rel_links_KC\t get_rel_links_WC\t pd\t re\t requests\t unicodedata\t \n"
]
}
],
"source": [
"%who"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment