"""Bulk-upload documents listed in a csv file to DocumentCloud.

Each csv row supplies a document URL plus metadata. The document is
downloaded, the metadata columns are renamed via ``field_mapping``, and
the file is uploaded through the DocumentCloud client.

NOTE(review): written for Python 2 (``cStringIO``, ``urllib.urlopen``,
csv opened in ``'rb'`` mode).
"""
from contextlib import closing
import cStringIO
import csv
import urllib

from documentcloud import DocumentCloud

## Create the DocumentCloud.org client
client = DocumentCloud("USERNAME", "PASSWORD")

## Set additional data to store with each document by mapping csv field
## keys to the names that will be the keys on Document Cloud.
## You could abstract this by providing these key-value pairs in a
## separate csv, then supplying the data csv and field mapping csv as
## args on the command line.
field_mapping = {
    "wpid": "wpid",
    "name": "org_name",
    "city": "org_city",
    "state": "org_state",
    "year": "org_year",
    "revenue": "org_revenue",
    "org_type": "org_type",
    "desc": "org_description",
    "docurl": "source_url",
    "ein": "org_ein",
}


def upload_doc(data_dict):
    """
    Map fields from csv to Document Cloud fields and upload.

    data_dict holds one csv row as {column name: value}. The row must
    yield "source_url", "org_name" and "org_year" after mapping (csv
    columns "docurl", "name" and "year").

    Raises KeyError if a column name has no entry in field_mapping or
    if one of the required mapped keys is missing.
    """
    ## rename every csv column to its Document Cloud key, keeping the value
    clean_data_kwargs = {}
    for key, value in data_dict.items():
        clean_data_kwargs[field_mapping[key]] = value

    ## Download the URL with urllib; closing() releases the connection
    ## even if read() fails, so a long csv does not leak sockets
    url = clean_data_kwargs["source_url"]
    with closing(urllib.urlopen(url)) as response:
        file_contents = response.read()

    ## Stuff it in a file object with cStringIO
    file_obj = cStringIO.StringIO(file_contents)

    ## Set kwargs for documentcloud.org
    kwargs = {
        "title": clean_data_kwargs["org_name"] + " - " + clean_data_kwargs["org_year"],  # update as needed
        "source": "SOURCE",
        "description": "DESC",
        "access": "ACCESS",
        "project": "PROJ",
        "data": clean_data_kwargs,  # optional
        "secure": False,  # or True if you don't want to send docs to OpenCalais
    }

    ## Upload that to DocumentCloud
    client.documents.upload(file_obj, **kwargs)

    ## single-argument parenthesised form prints the same text as the
    ## print statement
    print("Uploaded: %s" % (kwargs["title"]))
    print("\n")


## set the file name of the csv with all your urls and doc metadata
filename = "FILENAME.csv"

## open the csv
with open(filename, 'rb') as handle:
    ## read the csv
    reader = csv.DictReader(handle)
    ## loop thru the rows
    for row in reader:
        ## pass each row to the function
        upload_doc(row)