Last active
January 29, 2017 17:23
-
-
Save greglinch/8adcf412448775e01b69 to your computer and use it in GitHub Desktop.
Upload PDFs from URLs in csv to DocumentCloud.org using Ben Welsh's python-documentcloud API wrapper https://python-documentcloud.readthedocs.io/en/latest/gettingstarted.html#uploading-a-pdf-from-a-url
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from documentcloud import DocumentCloud | |
import urllib, cStringIO, csv | |
## Build the DocumentCloud.org API client; swap in real credentials
## for the two placeholder strings before running
client = DocumentCloud(
    "USERNAME",
    "PASSWORD",
)
## Extra data stored with each document: every csv column name (first item)
## is renamed to the key it will carry on Document Cloud (second item).
## This could be abstracted by keeping these pairs in a separate csv, then
## supplying the data csv and the field mapping csv as command line args.
field_mapping = dict([
    ("wpid", "wpid"),
    ("name", "org_name"),
    ("city", "org_city"),
    ("state", "org_state"),
    ("year", "org_year"),
    ("revenue", "org_revenue"),
    ("org_type", "org_type"),
    ("desc", "org_description"),
    ("docurl", "source_url"),
    ("ein", "org_ein"),
])
def upload_doc(data_dict):
    """
    Map fields from csv to Document Cloud fields and upload

    data_dict -- one csv row as a dict keyed by csv column name. After
    renaming through field_mapping the row must provide "source_url"
    (the file to download) plus "org_name" and "org_year" (used to
    build the document title); a missing one raises KeyError.

    Columns that have no entry in field_mapping are left out of the
    uploaded data instead of aborting the row with a KeyError.

    Returns nothing; the title of each uploaded document is printed.
    """
    ## rename each csv column to its Document Cloud key; columns without a
    ## mapping (including the None key csv.DictReader uses for surplus
    ## cells) are skipped
    clean_data_kwargs = {}
    for key, value in data_dict.items():
        new_key = field_mapping.get(key)
        if new_key is None:
            continue
        clean_data_kwargs[new_key] = value

    ## Download the URL with urllib
    url = clean_data_kwargs["source_url"]
    response = urllib.urlopen(url)
    try:
        file_contents = response.read()
    finally:
        ## release the connection even when the read fails
        response.close()

    ## Stuff it in a file object with cStringIO
    file_obj = cStringIO.StringIO(file_contents)

    ## Set kwargs for documentcloud.org
    kwargs = {
        "title": clean_data_kwargs["org_name"] + " - " + clean_data_kwargs["org_year"],  # update as needed
        "source": "SOURCE",
        "description": "DESC",
        "access": "ACCESS",
        "project": "PROJ",
        "data": clean_data_kwargs,  # optional
        "secure": False,  # or True if you don't want to send docs to OpenCalais
    }

    ## Upload that to DocumentCloud
    client.documents.upload(file_obj, **kwargs)

    ## single-argument parenthesised form prints the same text as the
    ## bare print statement
    print("Uploaded: %s" % (kwargs["title"]))
    print("\n")
## path of the csv that lists every document url along with its metadata
filename = "FILENAME.csv"

## open the csv in binary mode and hand each row to the uploader
with open(filename, 'rb') as csv_file:
    ## DictReader yields one dict per row, keyed by the header line
    for record in csv.DictReader(csv_file):
        upload_doc(record)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment