Fetch updated GTFS feeds for Massachusetts: download the MBTA and MassDOT regional transit feeds, validate each with feedvalidator.py from Google's transitfeed package, and log Last-Modified times so unchanged feeds are skipped on later runs.
#!/usr/bin/python
import csv
import datetime
import os
import pickle
import subprocess
import zipfile

import requests
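# The fetcher keeps a pickled dict, time_checks.p, in the download directory.
# It maps each feed's file name to the Last-Modified header it was fetched
# with, plus a 'last_check' datetime for the previous run; the values below
# are illustrative only:
#   {'mbta.zip': 'Sat, 29 Aug 2015 14:10:00 GMT',
#    'last_check': datetime.datetime(2015, 8, 29, 14, 10)}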
class FeedFetcher(object):
    def __init__(self, ddir=os.getcwd()):
        self.ddir = ddir
        self.tc = {}  # time checks for GTFS fetches
        self.new_use = []  # list of new feeds successfully downloaded and validated
    def verify(self, file_name):
        # file_name is local to the download directory
        f = os.path.join(self.ddir, file_name)
        if not os.path.isfile(f):
            print("File " + f + " not found; cannot verify it.")
            return False
        print("Validating feed in " + file_name + "...")
        try:
            # feedvalidator.py ships with Google's transitfeed package
            p = subprocess.Popen(['feedvalidator.py', '--output=CONSOLE',
                                  '-m', '-n', f], stdout=subprocess.PIPE)
            out = p.communicate()
            res = out[0].split('\n')
            errct = res[-2]  # output line with count of errors/warnings
            if errct.find('error') > -1:
                print("Feed validator found errors in " + file_name + ": " + errct + ".")
                return False
            elif out[0].find('this feed is in the future,') > -1:
                print("Feed validator found GTFS not in service until future for " +
                      file_name + ".")
                return False
            else:
                if errct.find('successfully') > -1:
                    print("Feed " + file_name + " looks great: " + errct + ".")
                else:
                    # have warnings
                    print("Feed " + file_name + " looks ok: " + errct[7:] + ".")
                return True
        except Exception:
            print("Failed to run feed validator on GTFS " + file_name + ".")
            return False
        print("How did we get here? In GTFS validation for " + file_name + ".")
        return False  # shouldn't get here
    def verify_all(self):
        # just run verify on all feeds in the log file
        time_check = os.path.join(self.ddir, 'time_checks.p')
        if os.path.isfile(time_check):
            with open(time_check, 'rb') as tcf:
                self.tc = pickle.load(tcf)
            print("Loaded time check file.")
            if 'last_check' in self.tc:
                last_check = self.tc['last_check']
                print("Last check: ")
                print(last_check)
                timedelt = datetime.datetime.now() - last_check
                print("Time since last check: ")
                print(timedelt)
            else:
                print("Couldn't find last check time in log file; that's odd.")
            gtfs = dict(self.tc)  # copy, so self.tc keeps its 'last_check' entry
            gtfs.pop('last_check', None)  # tolerate a missing entry
            for g in gtfs:
                if g.endswith('.zip'):
                    if g.startswith('septa'):
                        # SEPTA splits its feed into separate bus and rail zips
                        self.verify('google_bus.zip')
                        self.verify('google_rail.zip')
                    else:
                        self.verify(g)
                else:
                    print("What is " + g + "? That doesn't look like a GTFS file name.")
            print("All done verifying!")
        else:
            print("No log file found! Can't verify GTFS.")
    def check_header_newer(self, url, file_name):
        # return 1 if a newer file is available to download;
        # return 0 if info is missing;
        # return -1 if the current file is the most recent.
        if file_name in self.tc:
            last_info = self.tc.get(file_name)
            hdr = requests.head(url).headers
            if hdr.get('last-modified'):
                last_mod = hdr.get('last-modified')
                if last_mod == last_info:
                    print("No new download available for " + file_name + ".")
                    return -1
                else:
                    print("New download available for " + file_name + ".")
                    print("Last downloaded: " + last_info + ".")
                    print("New download posted: " + last_mod + ".")
                    return 1
            else:
                print("No last-modified header set for " + file_name + " download link.")
                return 0
        else:
            print("Time check entry for " + file_name + " not found.")
            return 0
        # shouldn't happen
        print("How did we get here? Failed checking header info.")
        return 0
    def get_stream(self, url, file_name, do_stream=True, session=None, do_verify=True):
        if self.check_header_newer(url, file_name) == -1:
            return False
        # file_name is local to the download directory
        f = os.path.join(self.ddir, file_name)
        print("Getting file " + f + "...")
        if not session:
            stream = requests.get(url, stream=do_stream)
        else:
            stream = session.get(url, stream=do_stream)
        if stream.ok:
            with open(f, 'wb') as stream_file:
                if do_stream:
                    # stream to disk in 8 KB chunks (iter_content's default chunk
                    # size is a single byte)
                    for chunk in stream.iter_content(chunk_size=8192):
                        stream_file.write(chunk)
                else:
                    stream_file.write(stream.content)
            info = os.stat(f)
            if info.st_size < 10000:
                # a file smaller than 10K may not be a GTFS; just warn
                print('Warning:')
                print("Download for " + f + " is only " + str(info.st_size) + " bytes.")
                print("It may not be a valid GTFS.")
            if not zipfile.is_zipfile(f):
                print("BAD DOWNLOAD FOR " + f + ".")
                print("Download for " + f + " is not a zip file.")
                return False
            if stream.headers.get('last-modified'):
                self.tc[file_name] = stream.headers.get('last-modified')
            else:
                # format like the last-modified header
                self.tc[file_name] = datetime.datetime.utcnow(
                    ).strftime("%a, %d %b %Y %H:%M:%S GMT")
            print("Download completed successfully.")
            # verify the download
            if do_verify:
                if self.verify(file_name):
                    print("GTFS verification succeeded.")
                    self.new_use.append(file_name)
                    return True
                else:
                    print("GTFS verification failed.")
                    return False
            else:
                print("Skipping GTFS verification in get_stream.")
                # not adding to new_use here; do that elsewhere
                return True
        else:
            print("DOWNLOAD FAILED FOR " + f + ".")
            return False
    def fetch(self):
        # pickled log of the last times each feed was downloaded
        time_check = os.path.join(self.ddir, 'time_checks.p')
        if os.path.isfile(time_check):
            with open(time_check, 'rb') as tcf:
                self.tc = pickle.load(tcf)
            print("Loaded time check file.")
            if 'last_check' in self.tc:
                last_check = self.tc['last_check']
                print("Last check: ")
                print(last_check)
                timedelt = datetime.datetime.now() - last_check
                print("Time since last check: ")
                print(timedelt)
        else:
            print("Will create new time check file.")
        self.tc['last_check'] = datetime.datetime.now()
        ####### MBTA AND MA REGIONAL #######
        # MBTA: http://www.mbta.com/rider_tools/developers/
        # MA REGIONAL: http://www.massdot.state.ma.us/developersdata.aspx
        ####################################
        ma_feeds = {'mbta': 'http://www.mbta.com/uploadedfiles/MBTA_GTFS.zip',
                    'berkshire': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/brta_google_transit.zip',
                    'brockton': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/bat_google_transit.zip',
                    'cape_ann': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/cata_google_transit.zip',
                    'cape_cod': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/ccrta_google_transit.zip',
                    'franklin': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/frta_google_transit.zip',
                    'attleboro': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/gatra_google_transit.zip',
                    'lowell': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/lrta_google_transit.zip',
                    'merrimack': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/mvrta_google_transit.zip',
                    'metrowest': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/mwrta_google_transit.zip',
                    'montachusett': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/mart_google_transit.zip',
                    'nantucket': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/nrta_google_transit.zip',
                    'pioneer': 'http://www.pvta.com/g_trans/google_transit.zip',
                    'southeastern': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/srta_google_transit.zip',
                    'vineyard': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/vta_google_transit.zip',
                    'worcester': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/wrta_google_transit.zip',
                    'ferries': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/ferries_google_transit.zip'
                    }
        # The feeds for the private bus companies are full of errors; omitting these for now.
        # 'bloom_tours': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/Bloom_google_transit.zip',
        # 'boston_express': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/boston_express_google_transit.zip',
        # 'coach_bus': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/coach_google_transit.zip',
        # 'dattco': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/dattco_google_transit.zip',
        # 'peter_pan': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/peter_pan_google_transit.zip',
        # 'plymouth_brockton': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/PB_google_transit.zip',
        # 'yankee': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/yankee_google_transit.zip'
        for source in ma_feeds:
            print('Going to check for %s feed...' % source)
            filename = '%s.zip' % source
            self.get_stream(ma_feeds[source], filename, do_stream=True)
        ####################################
        print("Downloading finished. Writing time check file...")
        with open(time_check, 'wb') as tcf:
            pickle.dump(self.tc, tcf)
        print("Time check file written.")
        print("Writing 'new_use.csv', file of validated new downloads...")
        with open('new_use.csv', 'wb') as nu:
            nuw = csv.writer(nu)
            for n in self.new_use:
                print("Got new GTFS " + n)
                nuw.writerow([n])
        print("Done writing 'new_use.csv'.")
        print("All done!")
###############################
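
# Minimal entry point, a sketch only: the gist defines FeedFetcher but does
# not show how it is invoked. This assumes downloads should land in the
# current working directory (FeedFetcher's default ddir).
if __name__ == '__main__':
    fetcher = FeedFetcher()
    fetcher.fetch()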