Skip to content

Instantly share code, notes, and snippets.

@gunessenturk
Created September 26, 2018 06:46
Show Gist options
  • Save gunessenturk/688c8472d2541d4d65f938547c9eb44a to your computer and use it in GitHub Desktop.
Save gunessenturk/688c8472d2541d4d65f938547c9eb44a to your computer and use it in GitHub Desktop.
TDI_project_week3
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import dill\n",
"from bs4 import BeautifulSoup\n",
"from datetime import datetime\n",
"import re\n",
"import pandas as pd\n",
"import datetime as dt\n",
"\n",
"import nltk\n",
"from nltk import word_tokenize\n",
"from collections import Counter\n",
"from nltk.stem import PorterStemmer # one of the several available stemmers\n",
"from nltk.corpus import stopwords\n",
"from nltk import ngrams\n",
"from nltk import pos_tag"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ENGLISH_STOPWORDS = stopwords.words('english')\n",
"KENNEDY_URL = 'http://www.kennedy-center.org'\n",
"WASHINGTON_CONSERVATORY_URL = 'http://www.washingtonconservatory.org/html/'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_rel_links_KC(genre): # genre is CLA for classical, CHA for chamber\n",
" # Use requests.get to download the page.\n",
" page = requests.get('http://www.kennedy-center.org/calendar/genre/' + genre) \n",
" soup = BeautifulSoup(page.text, \"lxml\")\n",
"\n",
" # Get all relative links to individual concerts\n",
" events = soup.select('h4') \n",
" links = [get_link_KC(event) for event in events]\n",
" return links"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract the relative URL from a single event's <a href=...> tag\n",
"def get_link_KC(event):\n",
" rel_url = re.search( '<a\\shref=\"(.+)\">' , str(event)).group(1)\n",
" return rel_url"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Returns a list of tuples (<Performer name>, <role>), or None if the blurb has no performer list\n",
"def get_performers_KC(blurb):\n",
" if not re.search(r'blurbpadding\">[\\s]*(\\n?(\\w+\\s\\w+),\\s(\\w+)<br/>\\r*)+', str(blurb)):\n",
" return None\n",
" performers = re.search(r'blurbpadding\">[\\s]*(\\n?(\\w+\\s\\w+),\\s(\\w+)<br/>\\r*)+', str(blurb)).group(0)\n",
" performers = list(filter(lambda x: (len(x)>4),performers.split('<br/>')))\n",
" performers = list(map(lambda x: re.search('([A-Z].+),\\s(.+)', x),performers))\n",
" performers = [(performer.group(1), performer.group(2)) for performer in performers]\n",
" return performers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Returns a list of lists [<Composer name>, <piece name>], or ['Program: TBD'] if no pieces are found\n",
"def get_program_KC(soup):\n",
"\n",
" composer_re = re.compile(r'<a href=\"/artist/index/[\\w]+\">(.+)</a>')\n",
" composition_re = re.compile(r'<a href=\"/artist/composition/[\\w]+\">(.+)</a>')\n",
" TAG_RE = re.compile(r'<[^>]+>')\n",
" #composers = re.findall(composer_re, str(soup))\n",
" #if composers:\n",
" # compositions = re.findall(composition_re, str(soup))\n",
" # compositions = [TAG_RE.sub('', composition) for composition in compositions]\n",
" # pieces = list(zip(composers, compositions))\n",
"\n",
" #else:\n",
" blurb = soup.select('div.blurbpadding') \n",
" pieces = re.findall(r'<strong>(.+</strong>[:|,].*)', str(blurb))\n",
" TAG_RE = re.compile(r'<[^>]+>')\n",
" pieces = [TAG_RE.sub('', piece) for piece in pieces]\n",
" pieces = [piece for piece in pieces if len(piece)<60]\n",
" pieces = [re.compile(r'\\xa0').sub('', piece) for piece in pieces]\n",
" if (pieces) and (':' in pieces[0]):\n",
" pieces = [piece.split(':') for piece in pieces]\n",
" else:\n",
" pieces = [piece.split(',') for piece in pieces]\n",
" \n",
" if pieces == []:\n",
" pieces = ['Program: TBD']\n",
" \n",
" return pieces"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Returns the ticket price (or price range) as a string with the dollar signs removed\n",
"def get_price_KC(soup):\n",
" price = soup.find('div', {'class': re.compile(r'price*')}).text\n",
" price = re.search(r'(\\$\\d+\\.\\d\\d(\\s-\\s\\$\\d+\\.\\d\\d)?)', price).group(0) \n",
" price = re.compile(r'\\$').sub('', price)\n",
" return price"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Washington Conservatory\n",
"def get_rel_links_WC(): \n",
" # Download the page.\n",
" page = requests.get('http://www.washingtonconservatory.org/html/concerts.htm#professional') \n",
" soup = BeautifulSoup(page.text, \"lxml\")\n",
" \n",
" # Get all relative links to individual concerts\n",
" links = [link['href'] for link in soup.find_all(\"a\", {'href': re.compile(r'concerts1819_.*')})]\n",
" return links"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get program\n",
"def get_program_WC(soup):\n",
" program = soup.find_all('li')\n",
" program = [item for item in program if item.text != '']\n",
" if not program:\n",
" program = ['Program: TBD']\n",
" else: \n",
" program = [item.text for item in program]\n",
" return program"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get event date and time\n",
"def get_date_WC(soup):\n",
" date = soup.find('p', {'class': 'center', 'style': re.compile(r'position: relative.*')}).text.strip().split('\\n')[0]\n",
" date = pd.Timestamp(date)\n",
" return date"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Returns a list of performers, each split on ', ' into a list, or ['Performers: TBD'] if none are found\n",
"def get_performers_WC(soup):\n",
" performers = soup.find('h3', {'style': re.compile(r'margin-left: 20px.*')})\n",
" performers = performers.findChildren()[0]\n",
" #performers = [item.strip() for item in performers if ',' in item]\n",
" performers = re.findall('([A-Z][A-Z]+\\.?-?\\s?[A-Z]*\\.?-?\\s?[A-Z]*\\.?,?\\s?[a-z]*\\s?[a-z]*\\s?[a-z]*)', performers.text)\n",
" performers = [performer.strip() for performer in performers]\n",
" performers = [performer.split(', ') for performer in performers]\n",
" if not performers:\n",
" return ['Performers: TBD']\n",
" return performers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class Event():\n",
" def __init__(self, program=[], performers=[], date=None, price=None, place=None, tags=[]):\n",
" self.program = program\n",
" self.performers = performers\n",
" self.date = date\n",
" self.price = price\n",
" self.place = place\n",
" self.tags = tags\n",
" \n",
" def addTag(self, tag):\n",
" self.tags.append(tag)\n",
" \n",
" def printEvent(self):\n",
" date_str = self.date.strftime('%A, %B {}, %Y {}:%M %p'.format(self.date.day, self.date.hour%12))\n",
" print(date_str)\n",
" print(self.performers)\n",
" print(self.program)\n",
" print(self.place)\n",
" print(self.price)\n",
" print('\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Returns a list of event objects from Washington Conservatory\n",
"def getEvents_WC():\n",
" events = []\n",
" for link in get_rel_links_WC():\n",
" concertPage = requests.get(WASHINGTON_CONSERVATORY_URL + link) \n",
" soup = BeautifulSoup(concertPage.text, \"lxml\")\n",
" program = get_program_WC(soup)\n",
" date = get_date_WC(soup)\n",
" performers = get_performers_WC(soup)\n",
" events.append(Event(program=program, performers=performers, date=date, \n",
" place='Washington Conservatory', price='Donation'))\n",
" return events"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Returns a list of event objects from Kennedy Center\n",
"def getEvents_KC(genre):\n",
" events = []\n",
" for link in get_rel_links_KC(genre):\n",
" concertPage = requests.get(KENNEDY_URL + link) \n",
" soup = BeautifulSoup(concertPage.text, \"lxml\")\n",
" date = pd.Timestamp(soup.find('meta', {'name': 'StartDate'})['content']).to_pydatetime()\n",
" #price = soup.find('div', {'class': re.compile(r'price*')}).text\n",
" #price = re.search(r'(\\$\\d+\\.\\d\\d(\\s-\\s\\$\\d+\\.\\d\\d)?)', price).group(0) \n",
" price = get_price_KC(soup)\n",
" blurb = soup.select('div.blurbpadding') \n",
" if not get_performers_KC(blurb):\n",
" continue\n",
" performers = get_performers_KC(blurb)\n",
" program = get_program_KC(soup)\n",
" events.append(Event(program=program, performers=performers, date=date, \n",
" place='Kennedy Center', price=price))\n",
" return events"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"all_events = []\n",
"all_events.extend(getEvents_WC())\n",
"all_events.extend(getEvents_KC('CLA'))\n",
"all_events.extend(getEvents_KC('CHA'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"[event.printEvent() for event in all_events]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"columns = ['Date', 'Time', 'Venue', 'Price', 'Performers', 'Program', 'Tags']\n",
"df = pd.DataFrame(columns=columns)\n",
"for event in all_events:\n",
" tags = [] \n",
" tags.extend([performer[1] for performer in event.performers if len(performer)>1])\n",
" tags.extend( [p[0] for p in event.program if len(p[0])>1] )\n",
" tags = [ re.compile(r'conductor').sub('orchestra', tag) for tag in tags ]\n",
" tags = [ re.compile(r'\\s?solo\\s?').sub('', tag) for tag in tags ]\n",
" tags = [ re.compile(r' and').sub(',', tag) for tag in tags ]\n",
" d = {'Date': event.date.date(), 'Time': event.date.time(), 'Venue': event.place, \n",
" 'Price': event.price, 'Performers': event.performers, 'Program': event.program, 'Tags': tags}\n",
" df = df.append(pd.Series(d), ignore_index=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[df['Date'] > dt.datetime.today().date()].sort_values(by='Date')[:20]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"string = 'ever, erf3e, 3ef3, 3vwre'\n",
"string.split(',', 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dt.strftime('%I:%M %p')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dt.datetime.today()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('averaged_perceptron_tagger')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"concertPage = requests.get(KENNEDY_URL + links[26]) \n",
"soup = BeautifulSoup(concertPage.text, \"lxml\")\n",
"date = soup.find('meta', {'name': 'StartDate'})['content']\n",
"blurb = soup.select('div.blurbpadding') "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#tokens = word_tokenize(blurb[0].text.lower())\n",
"tokens = word_tokenize(blurb[0].text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clean_tokens = []\n",
"stemmer = PorterStemmer()\n",
"tokens_stemmed = []\n",
"for token in tokens:\n",
" stemmed_token = stemmer.stem(token)\n",
" tokens_stemmed.append(stemmed_token)\n",
"\n",
"frequencies = Counter(tokens_stemmed)\n",
"for token, count in frequencies.most_common(25):\n",
" if (token not in ENGLISH_STOPWORDS) & (len(token)>2):\n",
" clean_tokens.append(token)\n",
" print(token, count)\n",
" \n",
"#trigrams = ngrams([token for token in tokens_stemmed if (token not in ENGLISH_STOPWORDS) & (len(token)>2)], 3)\n",
"trigrams = ngrams(tokens, 3)\n",
"\n",
"frequencies_2 = Counter(trigrams)\n",
"for token, count in frequencies_2.most_common(10):\n",
" print(token, count)\n",
"\n",
"\n",
"print(list(trigrams))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tags = pos_tag(tokens)\n",
"print(blurb[0].text)\n",
"print([tag for tag in tags if tag[1] == 'NNP'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"[w[0] for w in tags if w[1]=='NNP']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"blurb[0].text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment