Created
September 26, 2018 06:46
-
-
Save gunessenturk/688c8472d2541d4d65f938547c9eb44a to your computer and use it in GitHub Desktop.
TDI_project_week3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import dill\n", | |
"from bs4 import BeautifulSoup\n", | |
"from datetime import datetime\n", | |
"import re\n", | |
"import pandas as pd\n", | |
"import datetime as dt\n", | |
"\n", | |
"import nltk\n", | |
"from nltk import word_tokenize\n", | |
"from collections import Counter\n", | |
"from nltk.stem import PorterStemmer # one of the several available stemmers\n", | |
"from nltk.corpus import stopwords\n", | |
"from nltk import ngrams\n", | |
"from nltk import pos_tag" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# English stop-word list from NLTK, used to filter tokens in the word-frequency cell.\n", | |
"# NOTE(review): this presumably needs the NLTK 'stopwords' corpus; the nltk.download(...) calls sit in a later cell -- confirm run order.\n", | |
"ENGLISH_STOPWORDS = stopwords.words('english')\n", | |
"# Site root that relative Kennedy Center event links are appended to.\n", | |
"KENNEDY_URL = 'http://www.kennedy-center.org'\n", | |
"# Base directory that relative Washington Conservatory concert links are appended to.\n", | |
"WASHINGTON_CONSERVATORY_URL = 'http://www.washingtonconservatory.org/html/'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def get_rel_links_KC(genre):\n", | |
"    \"\"\"Collect the links of all Kennedy Center events listed for one genre.\n", | |
"\n", | |
"    Args:\n", | |
"        genre: genre code appended to the calendar URL (CLA for classical, CHA for chamber).\n", | |
"\n", | |
"    Returns:\n", | |
"        list with one link per <h4> element of the calendar page.\n", | |
"    \"\"\"\n", | |
"    # Build the address from the shared KENNEDY_URL constant instead of repeating the host.\n", | |
"    page = requests.get(KENNEDY_URL + '/calendar/genre/' + genre)\n", | |
"    soup = BeautifulSoup(page.text, \"lxml\")\n", | |
"\n", | |
"    # The <h4> elements are treated as event headings; get_link_KC extracts each href.\n", | |
"    return [get_link_KC(heading) for heading in soup.select('h4')]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Extract relative urls for each event\n", | |
"def get_link_KC(event):\n", | |
"    \"\"\"Return the href value of the first anchor tag inside an event heading.\n", | |
"\n", | |
"    Args:\n", | |
"        event: element (or string) whose str() form contains an <a href=...> tag.\n", | |
"\n", | |
"    Returns:\n", | |
"        The href value as a string.\n", | |
"\n", | |
"    Raises:\n", | |
"        AttributeError: if str(event) contains no such anchor.\n", | |
"    \"\"\"\n", | |
"    # [^\"]+ stops at the closing quote, so a second tag on the same line can no\n", | |
"    # longer be swallowed into the captured URL (a greedy .+ would do that).\n", | |
"    match = re.search(r'<a\\shref=\"([^\"]+)\"', str(event))\n", | |
"    return match.group(1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Returns a list of tuples (<Performer name>, <role>)\n", | |
"def get_performers_KC(blurb):\n", | |
"    \"\"\"Parse the leading 'First Last, role<br/>' lines of a Kennedy Center blurb.\n", | |
"\n", | |
"    Args:\n", | |
"        blurb: object whose str() form is the HTML of the 'blurbpadding' block.\n", | |
"\n", | |
"    Returns:\n", | |
"        list of (performer name, role) tuples, or None when the blurb does not\n", | |
"        start with a performer listing.\n", | |
"    \"\"\"\n", | |
"    # Search once and reuse the match instead of evaluating the pattern twice.\n", | |
"    listing = re.search(r'blurbpadding\">[\\s]*(\\n?(\\w+\\s\\w+),\\s(\\w+)<br/>\\r*)+', str(blurb))\n", | |
"    if not listing:\n", | |
"        return None\n", | |
"\n", | |
"    performers = []\n", | |
"    for chunk in listing.group(0).split('<br/>'):\n", | |
"        # Chunks of four characters or fewer are leftovers of the split, not entries.\n", | |
"        if len(chunk) <= 4:\n", | |
"            continue\n", | |
"        entry = re.search(r'([A-Z].+),\\s(.+)', chunk)\n", | |
"        # Skip a chunk without a capitalised 'name, role' pair instead of failing on None.\n", | |
"        if entry:\n", | |
"            performers.append((entry.group(1), entry.group(2)))\n", | |
"    return performers" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Returns a list of lists [<Composer name>, <piece name>]\n", | |
"def get_program_KC(soup):\n", | |
"    \"\"\"Extract [composer, piece] entries from the 'blurbpadding' block of an event page.\n", | |
"\n", | |
"    Args:\n", | |
"        soup: parsed event page; only soup.select('div.blurbpadding') is used.\n", | |
"\n", | |
"    Returns:\n", | |
"        list of lists obtained by splitting each '<strong>...</strong>' program line\n", | |
"        on ':' (or on ',' when the first entry contains no colon), or\n", | |
"        ['Program: TBD'] when no line matches.\n", | |
"    \"\"\"\n", | |
"    tag_re = re.compile(r'<[^>]+>')\n", | |
"\n", | |
"    blurb = soup.select('div.blurbpadding')\n", | |
"    # A bold name followed by ':', '|' or ',' marks a program line (the '|' inside\n", | |
"    # the character class is a literal pipe).\n", | |
"    pieces = re.findall(r'<strong>(.+</strong>[:|,].*)', str(blurb))\n", | |
"    pieces = [tag_re.sub('', piece) for piece in pieces]\n", | |
"    # Matches of 60 characters or more are dropped; non-breaking spaces are removed.\n", | |
"    pieces = [piece.replace('\\xa0', '') for piece in pieces if len(piece) < 60]\n", | |
"\n", | |
"    if not pieces:\n", | |
"        return ['Program: TBD']\n", | |
"\n", | |
"    # The first entry decides the separator used for the whole program.\n", | |
"    separator = ':' if ':' in pieces[0] else ','\n", | |
"    return [piece.split(separator) for piece in pieces]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Returns the range of ticket prices\n", | |
"def get_price_KC(soup):\n", | |
"    \"\"\"Return the ticket price or price range of an event page without '$' signs.\n", | |
"\n", | |
"    Args:\n", | |
"        soup: parsed event page.\n", | |
"\n", | |
"    Returns:\n", | |
"        A string such as '25.00' or '25.00 - 99.00', or None when the page has no\n", | |
"        price element or that element contains no dollar amount.\n", | |
"    \"\"\"\n", | |
"    # NOTE: 'price*' is a regex, not a glob: it matches any class containing 'pric'.\n", | |
"    price_div = soup.find('div', {'class': re.compile(r'price*')})\n", | |
"    if price_div is None:\n", | |
"        return None\n", | |
"    amount = re.search(r'(\\$\\d+\\.\\d\\d(\\s-\\s\\$\\d+\\.\\d\\d)?)', price_div.text)\n", | |
"    if amount is None:\n", | |
"        return None\n", | |
"    return amount.group(0).replace('$', '')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Washington Conservatory\n", | |
"def get_rel_links_WC():\n", | |
"    \"\"\"Collect the links to the individual Washington Conservatory concert pages.\n", | |
"\n", | |
"    Returns:\n", | |
"        list of href values of all anchors whose href matches 'concerts1819_.*'.\n", | |
"    \"\"\"\n", | |
"    # Reuse the shared base URL so the listing page and the concert pages stay in sync.\n", | |
"    page = requests.get(WASHINGTON_CONSERVATORY_URL + 'concerts.htm#professional')\n", | |
"    soup = BeautifulSoup(page.text, \"lxml\")\n", | |
"\n", | |
"    concert_href = re.compile(r'concerts1819_.*')\n", | |
"    return [anchor['href'] for anchor in soup.find_all(\"a\", {'href': concert_href})]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Get program\n", | |
"def get_program_WC(soup):\n", | |
"    \"\"\"Return the text of every non-empty <li> element of a concert page.\n", | |
"\n", | |
"    Falls back to ['Program: TBD'] when there is no non-empty list item.\n", | |
"    \"\"\"\n", | |
"    entries = [li.text for li in soup.find_all('li') if li.text != '']\n", | |
"    return entries or ['Program: TBD']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Get event date and time\n", | |
"def get_date_WC(soup):\n", | |
"    \"\"\"Parse the concert date from the centred, relatively positioned <p> element.\n", | |
"\n", | |
"    Only the first line of the paragraph's stripped text is handed to pd.Timestamp.\n", | |
"    \"\"\"\n", | |
"    selector = {'class': 'center', 'style': re.compile(r'position: relative.*')}\n", | |
"    paragraph = soup.find('p', selector)\n", | |
"    first_line = paragraph.text.strip().split('\\n')[0]\n", | |
"    return pd.Timestamp(first_line)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Return a list of performers\n", | |
"def get_performers_WC(soup):\n", | |
"    \"\"\"Return [NAME, role] lists parsed from the first child of the performers <h3>.\n", | |
"\n", | |
"    A performer is recognised by a name starting with at least two upper-case\n", | |
"    letters; ['Performers: TBD'] is returned when nothing matches.\n", | |
"    \"\"\"\n", | |
"    heading = soup.find('h3', {'style': re.compile(r'margin-left: 20px.*')})\n", | |
"    first_child = heading.findChildren()[0]\n", | |
"    name_role_pattern = '([A-Z][A-Z]+\\.?-?\\s?[A-Z]*\\.?-?\\s?[A-Z]*\\.?,?\\s?[a-z]*\\s?[a-z]*\\s?[a-z]*)'\n", | |
"    matches = re.findall(name_role_pattern, first_child.text)\n", | |
"    if not matches:\n", | |
"        return ['Performers: TBD']\n", | |
"    # Each match becomes [name, role] by splitting on the comma that follows the name.\n", | |
"    return [match.strip().split(', ') for match in matches]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class Event():\n", | |
"    \"\"\"A single concert: program, performers, date, price, venue and free-form tags.\"\"\"\n", | |
"\n", | |
"    def __init__(self, program=None, performers=None, date=None, price=None, place=None, tags=None):\n", | |
"        \"\"\"Store the event fields.\n", | |
"\n", | |
"        The list arguments default to a fresh empty list per instance: a shared\n", | |
"        mutable default would let addTag() on one event change every other event.\n", | |
"        \"\"\"\n", | |
"        self.program = [] if program is None else program\n", | |
"        self.performers = [] if performers is None else performers\n", | |
"        self.date = date\n", | |
"        self.price = price\n", | |
"        self.place = place\n", | |
"        self.tags = [] if tags is None else tags\n", | |
"\n", | |
"    def addTag(self, tag):\n", | |
"        \"\"\"Append one tag to this event's tag list.\"\"\"\n", | |
"        self.tags.append(tag)\n", | |
"\n", | |
"    def printEvent(self):\n", | |
"        \"\"\"Print date, performers, program, venue and price, then a newline separator.\"\"\"\n", | |
"        # Day and hour are inserted by hand so they are not zero padded;\n", | |
"        # 'hour % 12 or 12' shows noon and midnight as 12 instead of 0.\n", | |
"        hour_12 = self.date.hour % 12 or 12\n", | |
"        date_str = self.date.strftime('%A, %B {}, %Y {}:%M %p'.format(self.date.day, hour_12))\n", | |
"        print(date_str)\n", | |
"        print(self.performers)\n", | |
"        print(self.program)\n", | |
"        print(self.place)\n", | |
"        print(self.price)\n", | |
"        print('\\n')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Returns a list of event objects from Washington Conservatory\n", | |
"def getEvents_WC():\n", | |
"    \"\"\"Download every Washington Conservatory concert page and turn it into an Event.\n", | |
"\n", | |
"    Venue and price are fixed to 'Washington Conservatory' and 'Donation'.\n", | |
"    \"\"\"\n", | |
"    events = []\n", | |
"    for rel_link in get_rel_links_WC():\n", | |
"        response = requests.get(WASHINGTON_CONSERVATORY_URL + rel_link)\n", | |
"        page = BeautifulSoup(response.text, \"lxml\")\n", | |
"        # Parsed in the order program, date, performers.\n", | |
"        details = {\n", | |
"            'program': get_program_WC(page),\n", | |
"            'date': get_date_WC(page),\n", | |
"            'performers': get_performers_WC(page),\n", | |
"        }\n", | |
"        events.append(Event(place='Washington Conservatory', price='Donation', **details))\n", | |
"    return events" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Returns a list of event objects from Kennedy Center\n", | |
"def getEvents_KC(genre):\n", | |
"    \"\"\"Download every Kennedy Center event page of one genre and turn it into an Event.\n", | |
"\n", | |
"    Args:\n", | |
"        genre: genre code passed on to get_rel_links_KC.\n", | |
"\n", | |
"    Returns:\n", | |
"        list of Event objects; pages for which get_performers_KC returns nothing\n", | |
"        are skipped.\n", | |
"    \"\"\"\n", | |
"    events = []\n", | |
"    for link in get_rel_links_KC(genre):\n", | |
"        concertPage = requests.get(KENNEDY_URL + link)\n", | |
"        soup = BeautifulSoup(concertPage.text, \"lxml\")\n", | |
"        date = pd.Timestamp(soup.find('meta', {'name': 'StartDate'})['content']).to_pydatetime()\n", | |
"        price = get_price_KC(soup)\n", | |
"        blurb = soup.select('div.blurbpadding')\n", | |
"        # Parse the performers once and reuse the result for the skip test and the Event.\n", | |
"        performers = get_performers_KC(blurb)\n", | |
"        if not performers:\n", | |
"            continue\n", | |
"        program = get_program_KC(soup)\n", | |
"        events.append(Event(program=program, performers=performers, date=date,\n", | |
"                            place='Kennedy Center', price=price))\n", | |
"    return events" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Scrape both venues: Washington Conservatory first, then the Kennedy Center\n", | |
"# genres CLA and CHA, in that order.\n", | |
"all_events = [\n", | |
"    *getEvents_WC(),\n", | |
"    *getEvents_KC('CLA'),\n", | |
"    *getEvents_KC('CHA'),\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Plain loop: printEvent() is called for its output only, so a list comprehension\n", | |
"# would just build (and display) a list of None values.\n", | |
"for event in all_events:\n", | |
"    event.printEvent()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"columns = ['Date', 'Time', 'Venue', 'Price', 'Performers', 'Program', 'Tags']\n", | |
"\n", | |
"# Collect one dict per event and build the frame once: DataFrame.append was removed\n", | |
"# in pandas 2.0 and copied the whole frame on every call.\n", | |
"rows = []\n", | |
"for event in all_events:\n", | |
"    # Second field (role) of each performer entry. A plain-string placeholder such as\n", | |
"    # 'Performers: TBD' is skipped so its second character does not become a tag.\n", | |
"    tags = [performer[1] for performer in event.performers\n", | |
"            if not isinstance(performer, str) and len(performer) > 1]\n", | |
"    # First field of each program entry, kept only when longer than one character.\n", | |
"    tags.extend(p[0] for p in event.program if len(p[0]) > 1)\n", | |
"    tags = [tag.replace('conductor', 'orchestra') for tag in tags]\n", | |
"    tags = [re.sub(r'\\s?solo\\s?', '', tag) for tag in tags]\n", | |
"    tags = [tag.replace(' and', ',') for tag in tags]\n", | |
"    rows.append({'Date': event.date.date(), 'Time': event.date.time(), 'Venue': event.place,\n", | |
"                 'Price': event.price, 'Performers': event.performers, 'Program': event.program,\n", | |
"                 'Tags': tags})\n", | |
"\n", | |
"df = pd.DataFrame(rows, columns=columns)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# First 20 events dated strictly after today, ordered by date.\n", | |
"today = dt.datetime.today().date()\n", | |
"upcoming = df[df['Date'] > today].sort_values(by='Date')\n", | |
"upcoming.head(20)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Scratch check of str.split with maxsplit=2: the string is cut at the first two commas only.\n", | |
"string = 'ever, erf3e, 3ef3, 3vwre'\n", | |
"string.split(',', 2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# `dt` is the datetime module (import datetime as dt) and has no strftime();\n", | |
"# format a concrete datetime value instead. 12-hour clock with AM/PM.\n", | |
"dt.datetime.today().strftime('%I:%M %p')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Scratch check: current local date and time.\n", | |
"dt.datetime.today()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Fetch the NLTK resources used in this notebook.\n", | |
"# NOTE(review): presumably 'punkt' backs word_tokenize, 'stopwords' backs stopwords.words\n", | |
"# and 'averaged_perceptron_tagger' backs pos_tag -- confirm. stopwords.words() is already\n", | |
"# called in the constants cell near the top, i.e. before this cell runs.\n", | |
"nltk.download('punkt')\n", | |
"nltk.download('stopwords')\n", | |
"nltk.download('averaged_perceptron_tagger')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# `links` was not defined by any cell, so this cell failed on a fresh kernel.\n", | |
"# NOTE(review): the 'CLA' genre is an assumption -- confirm which listing was meant.\n", | |
"links = get_rel_links_KC('CLA')\n", | |
"\n", | |
"# Download one sample event page (index 26 of the listing) for the text analysis cells.\n", | |
"concertPage = requests.get(KENNEDY_URL + links[26])\n", | |
"soup = BeautifulSoup(concertPage.text, \"lxml\")\n", | |
"date = soup.find('meta', {'name': 'StartDate'})['content']\n", | |
"blurb = soup.select('div.blurbpadding')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Tokenize the text of the first blurb element with its original casing\n", | |
"# (the lower-cased variant below is disabled).\n", | |
"#tokens = word_tokenize(blurb[0].text.lower())\n", | |
"tokens = word_tokenize(blurb[0].text)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"stemmer = PorterStemmer()\n", | |
"tokens_stemmed = [stemmer.stem(token) for token in tokens]\n", | |
"\n", | |
"# Among the 25 most common stems, keep and print those that are not stop words\n", | |
"# and are longer than two characters.\n", | |
"clean_tokens = []\n", | |
"frequencies = Counter(tokens_stemmed)\n", | |
"for token, count in frequencies.most_common(25):\n", | |
"    if token not in ENGLISH_STOPWORDS and len(token) > 2:\n", | |
"        clean_tokens.append(token)\n", | |
"        print(token, count)\n", | |
"\n", | |
"# ngrams() yields a one-shot iterator: materialise it so it can be counted AND\n", | |
"# printed (otherwise Counter consumes it and the final print shows an empty list).\n", | |
"trigrams = list(ngrams(tokens, 3))\n", | |
"\n", | |
"frequencies_2 = Counter(trigrams)\n", | |
"for token, count in frequencies_2.most_common(10):\n", | |
"    print(token, count)\n", | |
"\n", | |
"\n", | |
"print(trigrams)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Part-of-speech tag the tokens, then show the blurb text and the pairs tagged 'NNP'.\n", | |
"tags = pos_tag(tokens)\n", | |
"nnp_pairs = [pair for pair in tags if pair[1] == 'NNP']\n", | |
"print(blurb[0].text)\n", | |
"print(nnp_pairs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Words only (without their tag) for every token tagged 'NNP'.\n", | |
"[word for word, pos in tags if pos == 'NNP']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Display the raw text of the first blurb element for visual inspection.\n", | |
"blurb[0].text" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment