ericharley · November 9, 2018 20:00
diff --git a/doit.py b/doit.py
 import csv
 import gzip
 import requests
 from StringIO import StringIO

 # Parameters
 # Base URL that each row's warc_filename is appended to when fetching
 prefix = 'https://commoncrawl.s3.amazonaws.com/'
 # Extension given to every payload file written to disk
 fileout_extension = "pdf"

 def get_file(warc_filename, warc_record_offset, warc_record_length, content_digest):
 	"""Fetch one record from a WARC file under ``prefix`` and return its body.

 	The record is requested with an HTTP Range header, gunzipped, and split
 	into WARC headers, HTTP headers and body; only the body is returned.

 	Args:
 		warc_filename: path of the WARC file, appended to ``prefix``.
 		warc_record_offset: byte offset of the record (anything int() accepts).
 		warc_record_length: length in bytes of the compressed record.
 		content_digest: unused; kept so existing callers keep working.

 	Returns:
 		The bytes that follow the WARC header and HTTP header sections,
 		without the record's trailing CRLF CRLF terminator.

 	Raises:
 		requests.HTTPError: if the server answers with a 4xx/5xx status.
 		ValueError: if the record does not contain both header separators.
 	"""
 	# io.BytesIO exists on Python 2.6+ and 3, unlike the StringIO module
 	from io import BytesIO

 	# compute request parameters
 	url = prefix + warc_filename

 	# Each WARC file is composed of many gzip files concatenated together,
 	# so a single record can be fetched by asking for just its byte range.
 	# HTTP byte ranges are inclusive at both ends, hence the - 1.
 	offset = int(warc_record_offset)
 	length = int(warc_record_length)
 	offset_end = offset + length - 1

 	# Use the Range HTTP header to specify the set of bytes
 	resp = requests.get(url, headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
 	# Stop on an error status instead of trying to gunzip an error page
 	resp.raise_for_status()

 	# decompress the returned WARC response
 	gz = gzip.GzipFile(fileobj=BytesIO(resp.content))
 	try:
 		data = gz.read()
 	finally:
 		gz.close()

 	# Bytes separator: on Python 3 the gunzipped data is bytes, not str
 	separator = b'\r\n\r\n'
 	warc, header, response = data.split(separator, 2)

 	# Drop only the record terminator; a blanket strip() would also remove
 	# whitespace bytes that belong to the (binary) body itself.
 	if response.endswith(separator):
 		response = response[:-len(separator)]

 	return response

 # Download the payload of every record listed in the CSV and save each one
 # to the current directory as <content_digest>.<fileout_extension>.
 with open('files_cc_1k.csv', 'r') as csvfile:
 	reader = csv.DictReader(csvfile)

 	for row in reader:
 		response = get_file(row['warc_filename'], row['warc_record_offset'], row['warc_record_length'], row['content_digest'])

 		# Write out file to disk; the with-block closes the handle even
 		# if the write fails part-way through.
 		output_filename = row['content_digest'] + "." + fileout_extension
 		with open(output_filename, "wb") as output_file:
 			output_file.write(response)
	import csv
	import gzip
	import requests
	from StringIO import StringIO

	# Parameters
	# Base URL that each row's warc_filename is appended to when fetching
	prefix = 'https://commoncrawl.s3.amazonaws.com/'
	# Extension given to every payload file written to disk
	fileout_extension = "pdf"

	def get_file(warc_filename, warc_record_offset, warc_record_length, content_digest):
		"""Fetch one record from a WARC file under ``prefix`` and return its body.

		The record is requested with an HTTP Range header, gunzipped, and split
		into WARC headers, HTTP headers and body; only the body is returned.

		Args:
			warc_filename: path of the WARC file, appended to ``prefix``.
			warc_record_offset: byte offset of the record (anything int() accepts).
			warc_record_length: length in bytes of the compressed record.
			content_digest: unused; kept so existing callers keep working.

		Returns:
			The bytes that follow the WARC header and HTTP header sections,
			without the record's trailing CRLF CRLF terminator.

		Raises:
			requests.HTTPError: if the server answers with a 4xx/5xx status.
			ValueError: if the record does not contain both header separators.
		"""
		# io.BytesIO exists on Python 2.6+ and 3, unlike the StringIO module
		from io import BytesIO

		# compute request parameters
		url = prefix + warc_filename

		# Each WARC file is composed of many gzip files concatenated together,
		# so a single record can be fetched by asking for just its byte range.
		# HTTP byte ranges are inclusive at both ends, hence the - 1.
		offset = int(warc_record_offset)
		length = int(warc_record_length)
		offset_end = offset + length - 1

		# Use the Range HTTP header to specify the set of bytes
		resp = requests.get(url, headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
		# Stop on an error status instead of trying to gunzip an error page
		resp.raise_for_status()

		# decompress the returned WARC response
		gz = gzip.GzipFile(fileobj=BytesIO(resp.content))
		try:
			data = gz.read()
		finally:
			gz.close()

		# Bytes separator: on Python 3 the gunzipped data is bytes, not str
		separator = b'\r\n\r\n'
		warc, header, response = data.split(separator, 2)

		# Drop only the record terminator; a blanket strip() would also remove
		# whitespace bytes that belong to the (binary) body itself.
		if response.endswith(separator):
			response = response[:-len(separator)]

		return response

	# Download the payload of every record listed in the CSV and save each one
	# to the current directory as <content_digest>.<fileout_extension>.
	with open('files_cc_1k.csv', 'r') as csvfile:
		reader = csv.DictReader(csvfile)

		for row in reader:
			response = get_file(row['warc_filename'], row['warc_record_offset'], row['warc_record_length'], row['content_digest'])

			# Write out file to disk; the with-block closes the handle even
			# if the write fails part-way through.
			output_filename = row['content_digest'] + "." + fileout_extension
			with open(output_filename, "wb") as output_file:
				output_file.write(response)