Created
July 22, 2020 19:30
-
-
Save foobarbecue/4738be626392855ef92541e479c7d0c8 to your computer and use it in GitHub Desktop.
Example of uploading multiple papers to dataverse, using doi.org metadata
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyDataverse.api import Api | |
from pyDataverse.exceptions import DatasetNotFoundError | |
from typing import List, Dict | |
import requests | |
import json | |
import pdfx | |
import re | |
from glob2 import glob | |
# Connection to the target Dataverse installation.
# NOTE(review): placeholder URL and API token — replace before running.
api = Api('https://example.com/dataverse', api_token='xxxx')
# Alias of the dataverse (collection) that will receive the datasets.
dv_name = 'mooncaves'
def doi2citeinfo(doi: str) -> Dict:
    """
    Given a document object identifier doi, retrieves citation information and returns it in json format.

    :param doi: document object identifier
    :return: citation information in CSL json
    :raises requests.HTTPError: if doi.org does not resolve the DOI successfully
    """
    # Content negotiation: ask doi.org for CSL-JSON instead of an HTML landing page.
    citeformat = {'Accept': 'application/vnd.citationstyles.csl+json;q=1.0'}
    # Explicit timeout: requests has no default and would otherwise hang forever.
    ref = requests.get(f'https://doi.org/{doi}', headers=citeformat, timeout=30)
    # Fail loudly on HTTP errors rather than choking on .json() of an error page.
    ref.raise_for_status()
    return ref.json()
def citeinfo2datasetJson(citeinfo: Dict) -> str:
    """
    Given citation information in the format output by doi2citeinfo, returns citation info in Dataverse native json.

    Only the first author is used; contact email, description and subject are
    filled with placeholders because CSL-JSON does not carry them.

    :param citeinfo: citation information in CSL json
    :return: citation information in Dataverse native json (a JSON string)
    """
    first_author = citeinfo['author'][0]
    try:
        author = f"{first_author['family']}, {first_author['given']}"
    except KeyError:
        # Some records (e.g. organisations) only carry a single 'name' field.
        author = f"{first_author['name']}"
    # BUG FIX: the original did str(affiliation_list)[0], which is the first
    # *character* of the stringified list (always '['). Take the first
    # affiliation entry instead; .get guards against a missing key entirely.
    affiliations = first_author.get('affiliation', [])
    if len(affiliations) > 0:
        first_affiliation = affiliations[0]
        if isinstance(first_affiliation, dict):
            # CSL-JSON affiliations are typically {'name': ...} objects —
            # presumably; fall back to the raw repr if 'name' is absent.
            affiliation = str(first_affiliation.get('name', first_affiliation))
        else:
            affiliation = str(first_affiliation)
    else:
        affiliation = 'unknown'
    # Placeholders: CSL-JSON has no contact email / abstract / subject fields.
    email = "[email protected]"
    description = "unknown"
    subject = "Other"
    # Munging into the format specified at http://guides.dataverse.org/en/latest/api/native-api.html#id36
    ds_citeinfo = {'datasetVersion': {'metadataBlocks': {'citation': {'fields': [
        # TITLE
        {
            "value": citeinfo['title'],
            "typeClass": "primitive",
            "multiple": False,
            "typeName": "title"
        },
        # AUTHOR
        {
            "value": [
                {
                    "authorName": {
                        "value": author,
                        "typeClass": "primitive",
                        "multiple": False,
                        "typeName": "authorName"
                    },
                    "authorAffiliation": {
                        "value": affiliation,
                        "typeClass": "primitive",
                        "multiple": False,
                        "typeName": "authorAffiliation"
                    }
                }
            ],
            "typeClass": "compound",
            "multiple": True,
            "typeName": "author"
        },
        # CONTACT
        {
            "value": [
                {
                    "datasetContactEmail": {
                        "typeClass": "primitive",
                        "multiple": False,
                        "typeName": "datasetContactEmail",
                        "value": email
                    },
                    "datasetContactName": {
                        "typeClass": "primitive",
                        "multiple": False,
                        "typeName": "datasetContactName",
                        "value": author
                    }
                }
            ],
            "typeClass": "compound",
            "multiple": True,
            "typeName": "datasetContact"
        },
        # DESCRIPTION
        {
            "value": [
                {
                    "dsDescriptionValue": {
                        "value": description,
                        "multiple": False,
                        "typeClass": "primitive",
                        "typeName": "dsDescriptionValue"
                    }
                }
            ],
            "typeClass": "compound",
            "multiple": True,
            "typeName": "dsDescription"
        },
        # SUBJECT
        {
            "value": [
                subject
            ],
            "typeClass": "controlledVocabulary",
            "multiple": True,
            "typeName": "subject"
        }
    ],
        "displayName": "Citation Metadata"
    }}}}
    return json.dumps(ds_citeinfo)
def create_dataset(doi: str):
    """
    Create a dataset in the target dataverse from a DOI.

    Looks the DOI up on doi.org, converts the citation info to Dataverse
    native json, and posts it with the DOI itself as the dataset identifier.

    :param doi: document object identifier
    :return: response from the dataverse API
    """
    citeinfo = doi2citeinfo(doi)
    metadata_json = citeinfo2datasetJson(citeinfo)
    return api.create_dataset(dataverse=dv_name, metadata=metadata_json,
                              identifier='doi:' + doi, auth=True)
def pdfdir2datasets(dir_path: str, clobber: bool = False):
    """
    Given a directory full of pdfs: for each pdf, extract the DOI, use the DOI to look up citation info, publish it as a
    dataset to a dataverse, and then upload the pdf as a file associated with that dataset.

    PDFs whose DOI cannot be determined are reported and skipped.

    :param dir_path: Directory of pdfs
    :param clobber: if True, delete any existing dataset with the same DOI first
    """
    pdfs = glob(f'{dir_path}/*.pdf')
    for pdf_path in pdfs:
        print(f'adding {pdf_path}')
        pdf_xtract = pdfx.PDFx(pdf_path)
        pdf_metadata = pdf_xtract.get_metadata()
        # Different PDF producers stash the DOI under different metadata keys.
        if 'doi' in pdf_metadata:
            doi = pdf_metadata['doi']
        elif 'WPS-ARTICLEDOI' in pdf_metadata:
            doi = pdf_metadata['WPS-ARTICLEDOI']
        elif 'crossmark' in pdf_metadata:
            doi = pdf_metadata['crossmark']['DOI']
        else:
            # Last resort: scrape the document text. Explicit None check
            # replaces the original bare except around .groups()[0].
            match = re.search(r'(?:doi.org/|doi:)(.*)', pdf_xtract.get_text(), re.I)
            if match is None:
                print(f'could not find DOI for {pdf_path}')
                continue
            doi = match.group(1)
        identifier = 'doi:' + doi
        if clobber:
            try:
                api.delete_dataset(identifier=identifier, is_pid=True)
            except DatasetNotFoundError:
                # Nothing to clobber — the dataset did not exist yet.
                # (Narrowed from a bare except, using the exception this
                # module already imports; other API errors now surface.)
                pass
        create_dataset(doi=doi)
        api.upload_file(identifier=identifier, filename=pdf_path, is_pid=True)
        api.publish_dataset(identifier, type='major')
pdfdir2datasets(r'/path/to/dir', clobber=True) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment