Last active
June 9, 2016 16:33
-
-
Save mpenkov/401f62731709cd05ca3de860d8ee7d17 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Check whether each line contains tab-separated decodable JSON.""" | |
import json
import logging
import sys


def check_lines(lines):
    """Validate that every line is ``<json>TAB<json>`` and report bad ones.

    Each line is split on the first tab character and both halves are
    decoded with ``json.loads``.  A line with no tab, or with a half that is
    not decodable JSON, is reported through ``logging.error`` together with
    its 1-based line number, its ``repr`` and the traceback.

    :param lines: any iterable of text lines (a file object, a list, ...).
    :returns: the number of lines that failed validation.
    """
    num_bad = 0
    for i, line in enumerate(lines, 1):
        try:
            # A missing tab makes the unpacking raise ValueError, which is
            # handled exactly like undecodable JSON.
            key, value = line.split("\t", 1)
            json.loads(key)
            json.loads(value)
        except ValueError as e:
            num_bad += 1
            logging.error("badness on line %d", i)
            logging.error("<line>")
            logging.error("%r", line)
            logging.error("</line>")
            logging.exception(e)
    return num_bad


def main():
    """Check every line arriving on standard input."""
    logging.basicConfig(level=logging.ERROR)
    check_lines(sys.stdin)


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urlparse | |
import urllib | |
import lxml.html | |
import lxml.html.clean | |
import lxml.etree | |
import itertools | |
import re | |
import logging | |
import gzip | |
import boto | |
import warc | |
import ssl | |
import time | |
import socket | |
import sys | |
import traceback | |
from boto.s3.key import Key | |
from gzipstream import GzipStreamFile | |
from mrjob.job import MRJob | |
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
#
# Echo log records to stdout.  During local runs this makes every message
# show up twice; on EMR runs each message shows up only once.
#
logger.addHandler(logging.StreamHandler(sys.stdout))

# The max number of contact links to keep per domain.
MAX_NUM_KEEP = 5
# Base delay, in seconds, between download retries; CCJob.mapper multiplies
# it by the attempt number.
SLEEP_TIME = 5
# Default number of download attempts made by CCJob.mapper.
NUM_RETRIES = 5
# Threshold that MrStageA.process_record compares a page's score against.
WORST_SCORE = 100
def read_from_s3(line):
    """Open the WARC file stored under key *line* in the public S3 bucket.

    Used when running on EC2 or on a Hadoop cluster.  The connection is
    anonymous and the gzipped object is wrapped in a ``GzipStreamFile``
    before being handed to the WARC reader.

    :param line: key of the WARC file inside the ``aws-publicdatasets``
        bucket.
    :returns: a ``warc.WARCFile`` reading from that key.
    """
    logger.info('Reading from Amazon S3')
    # Anonymous credentials are enough for the public data set bucket.
    bucket = boto.connect_s3(anon=True).get_bucket('aws-publicdatasets')
    compressed = GzipStreamFile(Key(bucket, line))
    return warc.WARCFile(fileobj=compressed)
def read_from_local(line):
    """Open a gzipped WARC file from the local file system.

    :param line: path of the gzipped WARC file.
    :returns: a ``warc.WARCFile`` reading from that path.
    """
    logger.info('Loading local file {}'.format(line))
    local_stream = gzip.open(line)
    return warc.WARCFile(fileobj=local_stream)
class CCJob(MRJob):
    """Base job that feeds the records of a WARC file to ``process_record``.

    Subclasses implement ``process_record``; ``mapper`` takes care of
    opening the file, retrying on network errors and giving up on malformed
    files.
    """

    def process_record(self, record):
        """Yield ``(key, value)`` pairs for one WARC record.

        Override process_record with your mapper.  The base implementation
        always raises ``NotImplementedError``.
        """
        raise NotImplementedError('Process record needs to be customized')

    def mapper(self, _, fpath, num_retries=NUM_RETRIES):
        """Yield everything ``process_record`` produces for one WARC file.

        :param _: ignored input key.
        :param fpath: location of the gzipped WARC file.  A value starting
            with ``"local "`` is read from the local file system (the prefix
            is stripped); anything else is read from S3.
        :param num_retries: maximum number of attempts to read the file.
        """
        #
        # We're loading a gzipped WARC file from S3, and several things could
        # go wrong.
        #
        # We could trip over a network error.  In that case, we sleep for a
        # while and retry.  The sleep interval increases after each failure to
        # reduce the stress on the network.  Since we retry the entire file,
        # we will need to handle duplicate problems later, in the reducer.
        #
        # We could also trip over a malformed WARC file.  We can't do anything
        # here, so give up on the remainder of the file completely.
        #
        if fpath.startswith("local "):
            fpath = fpath.split(" ", 1)[1]
            read = read_from_local
        else:
            read = read_from_s3

        try:
            for attempt in xrange(1, num_retries + 1):
                try:
                    for record in read(fpath):
                        for key, value in self.process_record(record):
                            yield key, value
                        self.increment_counter(
                            'commoncrawl', 'processed_records', 1
                        )
                    break
                except (ssl.SSLError, socket.error):
                    #
                    # ssl.SSLError: The read operation timed out
                    # socket.error: [Errno 104] Connection reset by peer
                    #
                    if attempt == num_retries:
                        #
                        # Out of attempts: report the failure right away
                        # instead of sleeping before giving up.
                        #
                        self.increment_counter(
                            'commoncrawl', 'failed_downloads', 1
                        )
                        logger.error(
                            "failed to download %s after %d attempts",
                            fpath, num_retries
                        )
                    else:
                        logger.error("encountered network error, retrying")
                        time.sleep(attempt * SLEEP_TIME)
        except Exception:
            #
            # The WARC parser raises IOError if it encounters a problem.
            # If our IOError comes from the WARC parser, then give up on the
            # file.  If it comes from somewhere else, we have a different
            # problem to deal with.
            #
            # http://stackoverflow.com/questions/1095601/find-module-name-of-the-originating-exception-in-python
            #
            exc_type, exc_value, exc_tb = sys.exc_info()
            # File name of the innermost frame, i.e. where the error was raised.
            filename = traceback.extract_tb(exc_tb)[-1][0]
            if "/warc/" in filename:
                logger.error("Malformed WARC file: %s, giving up", fpath)
                self.increment_counter('commoncrawl', 'bad_warc_files', 1)
            else:
                #
                # This isn't the exception you're looking for.
                #
                raise
def squash(text):
    """Collapse every run of whitespace in *text* into a single space.

    mrjob splits data on newlines, and then decodes each line according to
    the specified protocol.  If the data contains newlines, it will be
    over-segmented and cause problems for the protocol parsers.  Therefore,
    we must make sure our data doesn't contain newlines.

    :param text: the string to normalize.
    :returns: *text* with each whitespace run (including a lone newline or
        tab) replaced by one space.
    """
    # ``\s+`` rather than ``\s\s+``: a single newline must be replaced too,
    # otherwise the output can still contain line breaks.
    return re.sub(r"\s+", " ", text)
class MrStageA(CCJob):
    """Extract title, headings and text from every HTML response record."""

    def process_record(self, record):
        """Yield ``(domain, (score, document))`` for an HTML HTTP response.

        Records that are not HTTP responses, that have no header/body
        separator, or whose HTTP headers do not contain
        ``Content-Type: text/html`` produce nothing.

        :param record: a WARC record; its ``Content-Type`` and
            ``WARC-Target-URI`` headers and its payload are read.
        """
        if record['Content-Type'] != 'application/http; msgtype=response':
            return

        uri = record["WARC-Target-URI"]
        parsed = urlparse.urlparse(uri)
        domain = parsed.netloc
        score = 0
        is_homepage = parsed.path in ["", "/"]
        if not (is_homepage or score < WORST_SCORE):
            return

        payload = record.payload.read()
        #
        # The HTTP response is defined by a specification: first
        # part is headers (metadata) and then following two CRLFs
        # (newlines) has the data for the response
        #
        # partition() never raises, unlike unpacking the result of split(),
        # so a truncated response without the separator is skipped instead of
        # aborting the whole task.
        #
        headers, separator, body = payload.partition('\r\n\r\n')
        if not separator:
            logger.error("Malformed HTTP response, skipping: %s", uri)
            return
        if 'Content-Type: text/html' not in headers:
            return

        try:
            uri = to_utf8(uri)
            output_dict = split_html(uri, body)
            yield domain, (score, output_dict)
        except (UnicodeEncodeError, UnicodeDecodeError):
            logger.error("Unicode encode/decode error: %s", uri)
        # NOTE(review): CCJob.mapper increments this same counter once per
        # record, so HTML responses appear to be counted twice -- confirm
        # whether that is intended.
        self.increment_counter('commoncrawl', 'processed_records', 1)
def clean_html(html):
    """Strip the parts of an HTML document that are not needed downstream.

    Scripts, comments, styles, links, meta tags, processing instructions,
    embedded content and a handful of tags (see ``kill_tags`` below) are
    removed by an ``lxml.html.clean.Cleaner``; page structure, frames and
    forms are left alone.

    :param html: the document, as a unicode or byte string.
    :returns: whatever ``Cleaner.clean_html`` returns for the unicode form
        of *html*.
    """
    #
    # If clean_html is given a unicode string, it will always return a unicode
    # string.  If it is given a byte string, then the output will be a
    # utf8-encoded byte string or an ascii string, depending on what was in
    # the input.  Make sure the input is unicode to keep things simple.
    #
    try:
        as_unicode = unicode(html)
    except UnicodeDecodeError:
        as_unicode = html.decode("utf-8", "replace")

    cleaner = lxml.html.clean.Cleaner(
        scripts=True,
        javascript=True,
        comments=True,
        style=True,
        links=True,
        meta=True,
        page_structure=False,
        processing_instructions=True,
        embedded=True,
        frames=False,
        forms=False,
        annoying_tags=True,
        kill_tags=["map", "base", "iframe", "select", "noscript"]
    )
    return cleaner.clean_html(as_unicode)
def to_utf8(s):
    """Return *s* encoded as UTF-8, replacing anything that cannot be encoded.

    Make sure all dictionary values are safely encoded to UTF-8 to prevent
    crashes later on down the line.

    TODO: This shouldn't be necessary since we originally decode from UTF-8.
    """
    try:
        encoded = s.encode("utf-8", "replace")
    except UnicodeDecodeError:
        #
        # The encoding above can fail if s is a byte string (as opposed to a
        # unicode string).  In that case, we can't know its encoding for
        # sure, so we just assume it's utf-8.  In the worst case, the
        # non-ASCII characters will end up being replaced.
        #
        encoded = s.decode("utf-8", "replace").encode("utf-8", "replace")
    return encoded
def _unparseable_html(url):
    """Log that *url* could not be parsed and build its error record."""
    logger.error("Bad html string from url: %s" % url)
    return {"url": to_utf8(url), "error": "unable to parse html"}


def split_html(url, html):
    """Break an HTML page into title, headings and text for ElasticSearch.

    :param url: address of the page; copied into the result.
    :param html: the page markup.
    :returns: a dict with ``url``, ``title``, ``headings`` and ``text`` keys,
        or a dict with ``url`` and ``error`` keys when the markup triggers a
        ``UnicodeDecodeError`` or an error raised from inside lxml.
    :raises Exception: any other error is propagated unchanged.
    """
    try:
        root = lxml.html.document_fromstring(clean_html(html))

        # Stays an empty list when the page has no <title> text.
        html_title = root.xpath("//title/text()")
        if html_title:
            first_title = urllib.unquote(html_title[0].strip())
            html_title = squash(to_utf8(first_title))

        # All h1 headings first, then h2, h3 and h4; only an element's own
        # leading text is used.
        html_headings = []
        for heading_path in (".//h1", ".//h2", ".//h3", ".//h4"):
            for elt in root.iterfind(heading_path):
                if not elt.text:
                    continue
                heading = to_utf8(elt.text.strip())
                if heading:
                    html_headings.append(squash(heading))

        parts = root.xpath(".//text()")
        if root.tail:
            parts.append(root.tail)
        joined = squash(" ".join(parts)).strip()
        html_text = to_utf8(urllib.unquote(joined))

        return {
            "url": url,
            "title": html_title,
            "headings": html_headings,
            "text": html_text
        }
    except UnicodeDecodeError:
        return _unparseable_html(url)
    except Exception:
        exc_type, exc_value, exc_tb = sys.exc_info()
        # Only errors raised from inside lxml are treated as bad markup.
        filename = traceback.extract_tb(exc_tb)[-1][0]
        if "/lxml/" not in filename:
            #
            # This isn't the exception you're looking for.
            #
            raise
        return _unparseable_html(url)
# Script entry point: run the MrStageA job when this file is executed directly.
if __name__ == '__main__': | |
MrStageA.run() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Runner configuration for EMR (presumably an mrjob.conf file -- confirm).
runners: | |
emr: | |
aws_region: us-west-2 | |
# Either set the environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY | |
# or set the two variables below | |
# aws_access_key_id: | |
# aws_secret_access_key: | |
# For more control, it's highly recommended to add your key pair | |
ec2_key_pair: your-key-pair | |
ec2_key_pair_file: your-key-file | |
#ssh_tunnel_to_job_tracker: true | |
ec2_instance_type: m1.large | |
ec2_master_instance_type: m1.large | |
# ec2_master_instance_bid_price: '0.1' | |
# ec2_core_instance_bid_price: '0.1' | |
# EMR allows a max of 20 EC2 instances per AWS account, including master | |
num_ec2_instances: 19 | |
# EMR comes with Python 2.6 by default -- installing Python 2.7 takes a while but might be necessary | |
# We also install packages needed for streaming compressed files from S3 or reading WARC files | |
# There's a newer AMI version but it has issues with the released stable mrjob | |
ami_version: 3.0.4 | |
interpreter: python2.7 | |
bootstrap: | |
# NOTE(review): the first yum command below passes -y twice; harmless, but one is redundant.
- sudo yum -y --releasever=2014.09 install -y python27 python27-devel gcc-c++ | |
- sudo yum -y --releasever=2014.09 install libxml2 libxml2-devel libxslt libxslt-devel | |
# NOTE(review): get-pip.py is fetched over plain HTTP with certificate checking
# disabled and then executed with sudo -- consider an HTTPS download; confirm
# that the AMI's CA bundle allows it.
- wget --no-check-certificate http://bootstrap.pypa.io/get-pip.py | |
- sudo python2.7 get-pip.py | |
- sudo pip2.7 install boto mrjob simplejson warc lxml certifi nose | |
- sudo pip2.7 install https://github.com/commoncrawl/gzipstream/archive/master.zip | |
s3_tmp_dir: s3://your-bucket/tmp |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00001-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00002-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00003-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00004-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00005-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00006-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00007-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00008-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00009-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00010-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00011-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00012-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00013-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00014-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00015-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00016-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00017-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00018-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00019-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00020-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00021-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00022-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00023-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00024-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00025-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00026-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00027-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00028-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00029-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00030-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00031-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00032-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00033-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00034-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00035-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00036-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00037-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00038-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00039-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00040-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00041-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00042-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00043-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00044-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00045-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00046-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00047-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00048-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00049-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00050-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00051-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00052-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00053-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00054-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00055-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00056-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00057-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00058-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00059-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00060-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00061-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00062-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00063-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00064-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00065-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00066-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00067-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00068-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00069-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00070-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00071-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00072-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00073-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00074-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00075-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00076-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00077-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00078-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00079-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00080-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00081-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00082-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00083-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00084-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00085-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00086-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00087-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00088-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00089-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00090-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00091-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00092-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00093-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00094-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00095-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00096-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00097-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00098-ip-10-180-136-8.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00099-ip-10-180-136-8.ec2.internal.warc.gz |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment