Created
February 12, 2019 13:42
-
-
Save dstanek/2e3448da010eb8950f765c12c9e8c0af to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import sys
import urllib
import urllib.parse

import lxml.html
import requests
import requests_cache
# Install requests_cache's global cache under the name 'cache' before any
# request is made, so the requests.get calls in this script go through it.
# NOTE(review): where and how long responses are stored follows the
# requests_cache defaults -- confirm before relying on fresh data.
requests_cache.install_cache('cache')

# Landing page of the show. It is fetched to discover the per-state links and
# is also used as the base URL when resolving hrefs found in the pages.
BASE = 'https://www.tvfoodmaps.com/show/Diners-Drive-Ins-Dives'
def get(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. It is
            forwarded to ``requests.get``; without a timeout a stalled
            connection would block the whole scrape indefinitely.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within *timeout*.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as results.
    resp.raise_for_status()
    return resp
def xpath(data, path):
    """Parse *data* as HTML and evaluate the XPath expression *path* on it.

    Args:
        data: HTML markup accepted by ``lxml.html.fromstring``.
        path: XPath expression evaluated from the parsed document root.

    Returns:
        Whatever ``xpath`` on the parsed root returns for *path*.
    """
    return lxml.html.fromstring(data).xpath(path)
def iter_state_urls(data):
    """Yield the ``href`` of every link whose title contains 'Dives in '.

    Args:
        data: HTML markup of the page to scan for links.

    Yields:
        The raw ``href`` attribute of each matching ``<a>`` element, exactly
        as written in the page (callers resolve it against a base URL).

    Raises:
        KeyError: If a matching link has a title but no ``href`` attribute.
    """
    # The document is parsed once, inside xpath(); no separate parse is needed.
    for a in xpath(data, '//a'):
        # Links without a title attribute are treated as non-matching.
        if 'Dives in ' in a.attrib.get('title', ''):
            yield a.attrib['href']
def iter_dives_in_a_state(state_url):
    """Yield one parsed record per result block listed for a state.

    The state page is fetched first. If it contains a link under the
    ``paging-info`` block, that link is followed and the results are read
    from the page it points to; otherwise the results are read from the
    state page itself instead of failing with ``IndexError``.

    Args:
        state_url: Absolute URL of the state's listing page.

    Yields:
        The dict produced by ``parse_dive`` for every
        ``<div class="inner-results">`` element on the page being read.
    """
    resp = get(state_url)
    viewall_links = xpath(resp.content, '//div[@id="paging-info"]/h4/a')
    if viewall_links:
        # NOTE(review): presumably this link shows all results on one page
        # (the original code called it "view all") -- confirm against the site.
        viewall_url = urllib.parse.urljoin(BASE, viewall_links[0].attrib['href'])
        resp = get(viewall_url)
    for div in xpath(resp.content, '//div[@class="inner-results"]'):
        yield parse_dive(div)
def _first_or_none(nodes):
    """Return the first item of *nodes*, or None when the list is empty."""
    return nodes[0] if nodes else None


def _node_text(node):
    """Return the node's ``text``, or '' when the node or its text is missing."""
    if node is None or node.text is None:
        return ''
    return node.text


def parse_dive(div):
    """Extract name, address, image URL and description from one result block.

    Every field is optional: when the element is absent, or has no leading
    text, the field is returned as an empty string rather than raising.

    Args:
        div: Element for a single result; only its ``xpath`` method is used.

    Returns:
        A dict with the keys ``name``, ``address``, ``image`` and ``desc``.
        ``image`` has any query string removed, ``address`` is cut at the
        first ' (' and ``desc`` is stripped with non-breaking spaces replaced
        by regular spaces.
    """
    name = _node_text(_first_or_none(div.xpath('h3/a')))

    img = _first_or_none(div.xpath('div[@class="pull-left"]/img'))
    # Compare with None explicitly: element truthiness is not a presence test.
    image = img.attrib.get('src', '') if img is not None else ''
    # Keep only the part before the query string, if there is one.
    image = image.partition('?')[0]

    address = _node_text(_first_or_none(div.xpath('p[@class="searchResAddress"]')))
    # Drop a trailing parenthesised suffix such as "1 Main St (...)".
    address = address.partition(' (')[0]

    desc = _node_text(_first_or_none(div.xpath('p/i')))
    desc = desc.strip().replace('\xa0', ' ')

    return {'name': name, 'address': address, 'image': image, 'desc': desc}
# Column order shared by the header row and every data row.
COLUMNS = ['name', 'address', 'image', 'desc']

# Emit the results as CSV on standard output, header first.
writer = csv.writer(sys.stdout)
writer.writerow(COLUMNS)

# Walk the landing page's matching links, then every record behind each one.
index_page = get(BASE)
for href in iter_state_urls(index_page.content):
    state_page_url = urllib.parse.urljoin(BASE, href)
    for dive in iter_dives_in_a_state(state_page_url):
        writer.writerow([dive[column] for column in COLUMNS])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment