Created
August 9, 2014 21:55
-
-
Save elebow/002ec17cd7647fc1b251 to your computer and use it in GitHub Desktop.
Prepare JSON for space calendar
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
"""Prepare JSON for the space calendar.

Downloads the JPL Space Calendar page, extracts the dated entries from each
monthly list and prints them to stdout as a JSON array of objects with the
keys "date1", "date2" and "text".
"""

import json
import re
import urllib.request

#The source is HTML 3.2. Every modern parser I tried choked on it. So, we'll use regexes to extract what we want. The
#source HTML appears to be generated by a simple algorithm and has a very regular structure. I am aware of the
#implications of using regexes on HTML, but I think it's safe given the known constraints on our input.

CALENDAR_URL = "http://www2.jpl.nasa.gov/calendar/index.html"

#Month numbers are zero-based and are written to the output unchanged.
#NOTE(review): presumably zero-based to suit a JavaScript Date consumer -- confirm before changing.
months = {"Jan":0,"Feb":1,"Mar":2,"Apr":3,"May":4,"Jun":5,"Jul":6,"Aug":7,"Sep":8,"Oct":9,"Nov":10,"Dec":11}

#One "<h2>Month YYYY</h2> <ul>...</ul>" section per match.
ul_regex = r"<h2>\w+ \d{4}<\/h2>\s*<ul>.*?<\/ul>"

_YEAR_REGEX = re.compile(r"<h2>\w+ (\d{4})</h2>")
#the date could be "mmm dd", "mmm dd-dd", or "mmm dd-mmm dd"
_DATE_REGEX = re.compile(
    r"(?P<month1>[A-Za-z]{3}) (?P<day1>\d\d)"
    r"(?:-(?:(?P<month2>[A-Za-z]{3}) )?(?P<day2>\d\d))?"
)
_IMAGE_REGEX = re.compile(r"<img .*?]")
_WHITESPACE_REGEX = re.compile(r"\s\s+")


def fetch_calendar(url=CALENDAR_URL, timeout=30):
    """Download the calendar page and return it as text.

    The charset announced in the response headers is used when present,
    falling back to UTF-8; undecodable bytes are replaced rather than
    aborting the whole run.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        return response.read().decode(charset, errors="replace")


def parse_item(li, year):
    """Parse one "<li>" fragment into an item dict.

    Args:
        li: The text of a single list item, without the leading "<li> ".
        year: The four-digit year (as a string) of the enclosing section.

    Returns:
        A dict with "date1", "date2" (None for single-day items) and "text",
        or None when the item has no recognisable date and should be left
        out of the result.
    """
    #collapse extra whitespace left over from the indentation in the HTML source
    li = _WHITESPACE_REGEX.sub(" ", li).strip()

    #split the date from the item text; items without " -" have no date
    date, dash, text = li.partition(" -")
    if not dash:
        return None

    match = _DATE_REGEX.fullmatch(date)
    if match is None:
        return None #Some of the items have no date. We'll just leave them out of the result.

    month1 = months.get(match.group("month1"))
    if month1 is None:
        return None #three letters that are not a month abbreviation
    date1 = "%s %s %s" % (year, month1, match.group("day1"))

    date2 = None
    if match.group("day2") is not None:
        month2 = month1
        if match.group("month2") is not None:
            month2 = months.get(match.group("month2"))
            if month2 is None:
                return None
        #A range such as "Dec 30-Jan 02" ends in the following year.
        year2 = int(year) + 1 if month2 < month1 else year
        date2 = "%s %s %s" % (year2, month2, match.group("day2"))

    #Drop the space that follows the dash so the prefix test below can match.
    text = text.strip()
    #Some of the items have a little image at the front. Remove it and the following [...] text
    if text.startswith("<img src"):
        text = _IMAGE_REGEX.sub("", text).strip()

    return {"date1":date1, "date2":date2, "text":text}


def parse_calendar(html):
    """Extract every dated item from the calendar page.

    Args:
        html: The full page source as text.

    Returns:
        A list of item dicts (see parse_item) in page order.
    """
    #The section regex has no DOTALL flag, so work on a single line.
    html = html.replace("\n", "")
    items = []
    for ul in re.findall(ul_regex, html)[1:]: #the first <ul> is a table of contents, so start from index 1
        #the first piece is the "<h2>August 2014</h2> <ul>" header; the rest are the items
        header, *entries = ul.split("<li> ")
        year = _YEAR_REGEX.search(header).group(1) #we only need the year
        if entries:
            entries[-1] = entries[-1].replace("</ul>", "") #remove the closing tag from the last one
        for entry in entries:
            item = parse_item(entry, year)
            if item is not None:
                items.append(item)
    return items


def main():
    """Fetch the calendar and print the extracted items as JSON."""
    print(json.dumps(parse_calendar(fetch_calendar())))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment