elebow · August 9, 2014 21:55
diff --git a/gistfile1.py b/gistfile1.py
 #!/usr/bin/python3

 import urllib.request
 import re
 import json

 #The source is HTML 3.2. Every modern parser I tried choked on it. So, we'll use regexes to extract what we want. The
 #source HTML appears to be generated by a simple algorithm and has a very regular structure. I am aware of the
 #implications of using regexes on HTML, but I think it's safe given the known constraints on our input.

 #Month abbreviation -> zero-based month index, written verbatim into the output dates.
 #NOTE(review): zero-based months presumably target a JavaScript Date consumer -- confirm before changing.
 months = {"Jan":0,"Feb":1,"Mar":2,"Apr":3,"May":4,"Jun":5,"Jul":6,"Aug":7,"Sep":8,"Oct":9,"Nov":10,"Dec":11}
 #One "<h2>Month YYYY</h2> <ul>...</ul>" section. Newlines must be removed from the page first so "." can span it.
 ul_regex = r"<h2>\w+ \d{4}<\/h2>\s*<ul>.*?<\/ul>"
 #"mmm dd", "mmm dd-dd", or "mmm dd-mmm dd". Groups: start month, start day, end month (optional), end day (optional).
 date_regex = re.compile(r"([A-Za-z]{3}) (\d\d)(?:-(?:([A-Za-z]{3}) )?(\d\d))?$")

 def _parse_date(year, date):
 	"""Turn an item's date prefix into a (date1, date2) pair of "year month day" strings.

 	date2 is None for a single-day item. Returns None when the prefix is not one of the
 	three known formats or names a month that is not in the months table.
 	"""
 	m = date_regex.match(date)
 	if m is None:
 		return None

 	(month1, day1, month2, day2) = m.groups()
 	if month2 is None:
 		month2 = month1		#"mmm dd-dd": the range ends in the month it starts in
 	if month1 not in months or month2 not in months:
 		return None		#three letters that are not a month; skip instead of raising KeyError

 	date1 = "%s %s %s" % (year, months[month1], day1)
 	date2 = None
 	if day2 is not None:
 		#NOTE(review): a range that crosses New Year ("Dec 30-Jan 02") still gets the section's year for both ends
 		date2 = "%s %s %s" % (year, months[month2], day2)
 	return (date1, date2)

 def parse_calendar(html):
 	"""Extract the dated items from the calendar page source.

 	html is the decoded page text. Returns a list of dicts with the keys "date1", "date2"
 	and "text". date1/date2 are "year month day" strings using the zero-based month index
 	from the months table; date2 is None for single-day items. Items without a " -"
 	separator or without a recognisable date are left out of the result.
 	"""
 	html = html.replace("\n", "")
 	items = []

 	for ul in re.findall(ul_regex, html)[1:]:	#the first <ul> is a table of contents, so start from index 1
 		#ul is known to match ul_regex, so the heading is always present; we only need the year
 		year = re.match(r"<h2>\w+ (\d{4})<\/h2>", ul).group(1)

 		lis = ul.split("<li> ")
 		lis[-1] = lis[-1].replace("</ul>", "")		#remove the closing tag from the last one

 		for li in lis[1:]:	#the first chunk has the heading and opening tag, so start from index 1
 			li = re.sub(r"\s\s+", " ", li).strip()		#collapse whitespace left over from the indentation in the HTML source

 			(date, sep, text) = li.partition(" -")	#split the date from the item text
 			if not sep:
 				continue	#no separator means no date; leave the item out instead of crashing

 			#Some of the items have a little image at the front. Remove it and the following [...] text.
 			#The text may still carry the space that followed " -", so ignore leading whitespace in the check.
 			if text.lstrip().startswith("<img src"):
 				text = re.sub(r"<img .*?]", "", text)

 			dates = _parse_date(year, date)
 			if dates is None:
 				continue	#Some of the items have no date. We'll just leave them out of the result.

 			items.append({"date1":dates[0], "date2":dates[1], "text":text})

 	return items

 if __name__ == "__main__":
 	with urllib.request.urlopen("http://www2.jpl.nasa.gov/calendar/index.html") as f:
 		lines = f.read().decode("utf-8")

 	items = parse_calendar(lines)
 	print(json.dumps(items))
	#!/usr/bin/python3

	import urllib.request
	import re
	import json

	#The source is HTML 3.2. Every modern parser I tried choked on it. So, we'll use regexes to extract what we want. The
	#source HTML appears to be generated by a simple algorithm and has a very regular structure. I am aware of the
	#implications of using regexes on HTML, but I think it's safe given the known constraints on our input.

	#Month abbreviation -> zero-based month index, written verbatim into the output dates.
	#NOTE(review): zero-based months presumably target a JavaScript Date consumer -- confirm before changing.
	months = {"Jan":0,"Feb":1,"Mar":2,"Apr":3,"May":4,"Jun":5,"Jul":6,"Aug":7,"Sep":8,"Oct":9,"Nov":10,"Dec":11}
	#One "<h2>Month YYYY</h2> <ul>...</ul>" section. Newlines must be removed from the page first so "." can span it.
	#The "*" quantifiers are required: without them the pattern can only match an empty list.
	ul_regex = r"<h2>\w+ \d{4}<\/h2>\s*<ul>.*?<\/ul>"
	#"mmm dd", "mmm dd-dd", or "mmm dd-mmm dd". Groups: start month, start day, end month (optional), end day (optional).
	date_regex = re.compile(r"([A-Za-z]{3}) (\d\d)(?:-(?:([A-Za-z]{3}) )?(\d\d))?$")

	def _parse_date(year, date):
		"""Turn an item's date prefix into a (date1, date2) pair of "year month day" strings.

		date2 is None for a single-day item. Returns None when the prefix is not one of the
		three known formats or names a month that is not in the months table.
		"""
		m = date_regex.match(date)
		if m is None:
			return None

		(month1, day1, month2, day2) = m.groups()
		if month2 is None:
			month2 = month1		#"mmm dd-dd": the range ends in the month it starts in
		if month1 not in months or month2 not in months:
			return None		#three letters that are not a month; skip instead of raising KeyError

		date1 = "%s %s %s" % (year, months[month1], day1)
		date2 = None
		if day2 is not None:
			#NOTE(review): a range that crosses New Year ("Dec 30-Jan 02") still gets the section's year for both ends
			date2 = "%s %s %s" % (year, months[month2], day2)
		return (date1, date2)

	def parse_calendar(html):
		"""Extract the dated items from the calendar page source.

		html is the decoded page text. Returns a list of dicts with the keys "date1", "date2"
		and "text". date1/date2 are "year month day" strings using the zero-based month index
		from the months table; date2 is None for single-day items. Items without a " -"
		separator or without a recognisable date are left out of the result.
		"""
		html = html.replace("\n", "")
		items = []

		for ul in re.findall(ul_regex, html)[1:]:	#the first <ul> is a table of contents, so start from index 1
			#ul is known to match ul_regex, so the heading is always present; we only need the year
			year = re.match(r"<h2>\w+ (\d{4})<\/h2>", ul).group(1)

			lis = ul.split("<li> ")
			lis[-1] = lis[-1].replace("</ul>", "")		#remove the closing tag from the last one

			for li in lis[1:]:	#the first chunk has the heading and opening tag, so start from index 1
				li = re.sub(r"\s\s+", " ", li).strip()		#collapse whitespace left over from the indentation in the HTML source

				(date, sep, text) = li.partition(" -")	#split the date from the item text
				if not sep:
					continue	#no separator means no date; leave the item out instead of crashing

				#Some of the items have a little image at the front. Remove it and the following [...] text.
				#The text may still carry the space that followed " -", so ignore leading whitespace in the check.
				if text.lstrip().startswith("<img src"):
					text = re.sub(r"<img .*?]", "", text)

				dates = _parse_date(year, date)
				if dates is None:
					continue	#Some of the items have no date. We'll just leave them out of the result.

				items.append({"date1":dates[0], "date2":dates[1], "text":text})

		return items

	if __name__ == "__main__":
		with urllib.request.urlopen("http://www2.jpl.nasa.gov/calendar/index.html") as f:
			lines = f.read().decode("utf-8")

		items = parse_calendar(lines)
		print(json.dumps(items))