Created
July 29, 2016 22:30
-
-
Save mynameisfiber/4523c58e0a63479eef646405c4b6d63f to your computer and use it in GitHub Desktop.
Extract Ingress codes from the blog
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import functools
import pickle
import re
import sys

import requests
from lxml import html
def get_codes(dom):
    """Return the set of all code candidates found in ``dom``.

    Merges the output of the three extractors in this file (link
    fragments, ``alt``/``id`` attribute values, and ``<span>`` text);
    duplicates collapse because the result is a set.

    :param dom: parsed document exposing an ``xpath`` method (the script
        entry point passes the result of ``lxml.html.fromstring``).
    :returns: set of the strings yielded by the extractors.
    """
    return {
        *_get_codes_urls(dom),
        *_get_codes_alt_id(dom),
        *_get_codes_span(dom),
    }
def filter_codes(func):
    """Decorate a candidate producer so that it only yields plausible codes.

    A candidate is kept when both conditions hold:

    * it is a single whitespace-free token containing at least one
      lowercase ASCII letter and at least one digit, and
    * it contains none of the blacklisted substrings (presumably these
      mark page-structure identifiers rather than real codes).

    :param func: callable returning an iterable of candidate strings.
    :returns: generator function accepting the same arguments as ``func``
        and yielding only the candidates that pass the filter.
    """
    blacklist = ('menu', 'post', 'text', 'poll', 'calendar', 'blog', 'search',
                 'image', 'social')
    # Raw string: "\S" inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about. The pattern itself is unchanged.
    is_code = re.compile(r"^(?=\S*[a-z])(?=\S*[0-9])\S+$")

    # wraps() keeps the decorated extractor's name and docstring intact.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        for code_candidate in func(*args, **kwargs):
            if is_code.match(code_candidate) is None:
                continue
            if any(word in code_candidate for word in blacklist):
                continue
            yield code_candidate

    return wrapper
@filter_codes
def _get_codes_urls(dom):
    """Yield the fragment (text after the last ``#``) of each link in ``dom``.

    Links whose ``href`` contains no ``#`` are skipped. The decorator then
    drops fragments that do not look like codes.
    """
    for href in dom.xpath(".//a/@href"):
        # rpartition leaves the separator empty when "#" is absent.
        _, marker, fragment = href.rpartition("#")
        if marker:
            yield fragment
@filter_codes
def _get_codes_alt_id(dom):
    """Yield every ``alt`` and ``id`` attribute value found under ``dom``.

    The decorator then drops values that do not look like codes.
    """
    for attribute_value in dom.xpath(".//@alt|.//@id"):
        yield attribute_value
@filter_codes
def _get_codes_span(dom):
    """Yield each text node that is a direct child of a ``<span>`` in ``dom``.

    The decorator then drops text that does not look like a code.
    """
    for span_text in dom.xpath(".//span/text()"):
        yield span_text
def main():
    """Fetch the blog, print codes not seen before, and update the history.

    The set of previously seen codes is kept in ``codes_history.pkl`` in
    the current working directory. After each run the file holds the
    union of the old history and the codes found on this run. New codes
    are printed one per line in sorted order, or ``No New Codes`` when
    there are none.

    :raises requests.RequestException: if the page cannot be fetched, the
        request times out, or the server answers with an HTTP error status.
    """
    history_path = "codes_history.pkl"

    # Without a timeout requests may block indefinitely on a stalled server.
    response = requests.get("http://investigate.ingress.com/", timeout=30)
    # Do not scrape (and persist "codes" from) an HTTP error page.
    response.raise_for_status()
    dom = html.fromstring(response.content)
    codes = get_codes(dom)

    # NOTE(security): pickle.load can execute arbitrary code. This is only
    # acceptable because the history file is produced by this script itself;
    # never point it at a file from an untrusted source.
    try:
        with open(history_path, 'rb') as fd:
            previous_codes = pickle.load(fd)
    except FileNotFoundError:
        # First run: there is no history yet.
        previous_codes = set()
    except Exception as err:
        # History is best-effort: a corrupt or unreadable file may raise many
        # different exception types while unpickling. Start over, but say so.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        print("Ignoring unreadable {}: {!r}".format(history_path, err),
              file=sys.stderr)
        previous_codes = set()

    with open(history_path, 'wb') as fd:
        pickle.dump(codes | previous_codes, fd)

    new_codes = codes - previous_codes
    # Sorted so the output is deterministic; set order is arbitrary.
    print("\n".join(sorted(new_codes)) or "No New Codes")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment