Skip to content

Instantly share code, notes, and snippets.

@foobarbecue
Created July 22, 2020 19:30
Show Gist options
  • Save foobarbecue/4738be626392855ef92541e479c7d0c8 to your computer and use it in GitHub Desktop.
Save foobarbecue/4738be626392855ef92541e479c7d0c8 to your computer and use it in GitHub Desktop.
Example of uploading multiple papers to a Dataverse installation, using citation metadata from doi.org
from pyDataverse.api import Api
from pyDataverse.exceptions import DatasetNotFoundError
from typing import List, Dict
import requests
import json
import pdfx
import re
from glob2 import glob
# Shared pyDataverse client used by create_dataset() and pdfdir2datasets().
# NOTE(review): the URL and token look like placeholders — replace with real
# values before running.
api = Api('https://example.com/dataverse', api_token='xxxx')
# Passed as the `dataverse=` argument when new datasets are created.
dv_name ='mooncaves'
def doi2citeinfo(doi: str, timeout: float = 30.0):
    """
    Given a document object identifier doi, retrieves citation information and returns it in json format.

    The lookup is done by requesting https://doi.org/<doi> with an Accept header asking for CSL json.

    :param doi: document object identifier
    :param timeout: seconds to wait for the server before giving up
    :return: citation information in CSL json
    :raises requests.HTTPError: if the server answers with an error status (e.g. unknown DOI)
    """
    citeformat = {'Accept': 'application/vnd.citationstyles.csl+json;q=1.0'}
    # Without a timeout requests waits indefinitely on an unresponsive server.
    ref = requests.get(f'https://doi.org/{doi}', headers=citeformat, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON decode error
    # when the response is an error page.
    ref.raise_for_status()
    return ref.json()
def _first_author_name(author_entry: Dict) -> str:
    """Format one CSL author entry as a display name, or 'unknown' if it has no usable name."""
    if 'family' in author_entry and 'given' in author_entry:
        return f"{author_entry['family']}, {author_entry['given']}"
    # Entries without both name parts: fall back to whichever single
    # name field is present.
    for key in ('name', 'literal', 'family', 'given'):
        if author_entry.get(key):
            return f"{author_entry[key]}"
    return 'unknown'


def _first_affiliation(author_entry: Dict) -> str:
    """Return the first affiliation of a CSL author entry as text, or 'unknown' if there is none."""
    affiliations = author_entry.get('affiliation')
    if not affiliations:
        return 'unknown'
    if isinstance(affiliations, str):
        return affiliations
    first = affiliations[0]
    if isinstance(first, dict):
        # Affiliation given as an object: use its 'name' field.
        return str(first.get('name') or 'unknown')
    return str(first)


def citeinfo2datasetJson(citeinfo: Dict, ):
    """
    Given citation information in the format output by doi2citeinfo, returns citation info in Dataverse native json.

    Only the first author is used. Missing author or affiliation data is replaced by 'unknown'.

    :param citeinfo: citation information in CSL json
    :return: citation information in Dataverse native json (as a json string)
    :raises KeyError: if citeinfo has no 'title'
    """
    authors = citeinfo.get('author') or [{}]
    first_author = authors[0]
    author = _first_author_name(first_author)
    affiliation = _first_affiliation(first_author)
    # NOTE(review): looks like a placeholder contact address — confirm and
    # replace with a real one before use.
    email = "[email protected]"
    description = "unknown"
    subject = "Other"
    # Munging into the format specified at http://guides.dataverse.org/en/latest/api/native-api.html#id36
    ds_citeinfo = {'datasetVersion': {'metadataBlocks': {'citation': {'fields': [
        # TITLE
        {
            "value": citeinfo['title'],
            "typeClass": "primitive",
            "multiple": False,
            "typeName": "title"
        },
        # AUTHOR
        {
            "value": [
                {
                    "authorName": {
                        "value": author,
                        "typeClass": "primitive",
                        "multiple": False,
                        "typeName": "authorName"
                    },
                    "authorAffiliation": {
                        "value": affiliation,
                        "typeClass": "primitive",
                        "multiple": False,
                        "typeName": "authorAffiliation"
                    }
                }
            ],
            "typeClass": "compound",
            "multiple": True,
            "typeName": "author"
        },
        # CONTACT
        {
            "value": [
                {
                    "datasetContactEmail": {
                        "typeClass": "primitive",
                        "multiple": False,
                        "typeName": "datasetContactEmail",
                        "value": email
                    },
                    "datasetContactName": {
                        "typeClass": "primitive",
                        "multiple": False,
                        "typeName": "datasetContactName",
                        "value": author
                    }
                }
            ],
            "typeClass": "compound",
            "multiple": True,
            "typeName": "datasetContact"
        },
        # DESCRIPTION
        {
            "value": [
                {
                    "dsDescriptionValue": {
                        "value": description,
                        "multiple": False,
                        "typeClass": "primitive",
                        "typeName": "dsDescriptionValue"
                    }
                }
            ],
            "typeClass": "compound",
            "multiple": True,
            "typeName": "dsDescription"
        },
        # SUBJECT
        {
            "value": [
                subject
            ],
            "typeClass": "controlledVocabulary",
            "multiple": True,
            "typeName": "subject"
        }
    ],
        "displayName": "Citation Metadata"
    }}}}
    return json.dumps(ds_citeinfo)
def create_dataset(doi: str):
    """
    Look up the citation for a DOI and create a matching dataset in the dataverse named by dv_name.

    :param doi: document object identifier (without the 'doi:' prefix)
    :return: whatever api.create_dataset returns
    """
    citation = doi2citeinfo(doi)
    metadata_json = citeinfo2datasetJson(citation)
    persistent_id = 'doi:' + doi
    return api.create_dataset(
        dataverse=dv_name,
        metadata=metadata_json,
        identifier=persistent_id,
        auth=True,
    )
# Matches "doi.org/<doi>" or "doi:<doi>" and captures only the DOI itself
# ("10.<registrant>/<suffix>" up to the next whitespace), not the rest of the line.
_DOI_IN_TEXT = re.compile(r'(?:doi\.org/|doi:\s*)(10\.\d{4,9}/\S+)', re.I)


def _doi_from_pdf(pdf_xtract):
    """Return the DOI found in a parsed pdf's metadata or text, or None if there is none."""
    pdf_metadata = pdf_xtract.get_metadata()
    # Metadata keys are tried in order of preference; empty values are skipped.
    for key in ('doi', 'WPS-ARTICLEDOI'):
        if pdf_metadata.get(key):
            return pdf_metadata[key]
    crossmark = pdf_metadata.get('crossmark')
    if isinstance(crossmark, dict) and crossmark.get('DOI'):
        return crossmark['DOI']
    # Last resort: scan the document text.
    try:
        text = pdf_xtract.get_text()
    except Exception as exc:  # best effort: an unreadable pdf is just skipped
        print(f'could not read text: {exc}')
        return None
    match = _DOI_IN_TEXT.search(text)
    if match is None:
        return None
    # Drop sentence punctuation that directly follows the DOI in running text.
    return match.group(1).rstrip('.,;')


def pdfdir2datasets(dir_path: str, clobber=False):
    """
    Given a directory full of pdfs: for each pdf, extract the DOI, use the DOI to look up citation info, publish it as a
    dataset to a dataverse, and then upload the pdf as a file associated with that dataset.

    Pdfs for which no DOI can be found are reported and skipped.

    :param dir_path: Directory of pdfs
    :param clobber: if True, try to delete any existing dataset with the same DOI before creating the new one
    """
    for pdf_path in glob(f'{dir_path}/*.pdf'):
        print(f'adding {pdf_path}')
        doi = _doi_from_pdf(pdfx.PDFx(pdf_path))
        if doi is None:
            print(f'could not find DOI for {pdf_path}')
            continue
        identifier = 'doi:' + doi
        if clobber:
            # Deleting is best effort (the dataset may simply not exist yet),
            # but Ctrl-C must not be swallowed and the failure is reported.
            try:
                api.delete_dataset(identifier=identifier, is_pid=True)
            except Exception as exc:
                print(f'no existing dataset deleted for {identifier}: {exc}')
        create_dataset(doi=doi)
        api.upload_file(identifier=identifier, filename=pdf_path, is_pid=True)
        api.publish_dataset(identifier, type='major')
# Script entry point. There is no __main__ guard, so this call also runs when
# the module is imported. clobber=True asks pdfdir2datasets to delete any
# existing dataset with the same DOI first.
# NOTE(review): the directory looks like a placeholder — set it to the real
# pdf folder.
pdfdir2datasets(r'/path/to/dir', clobber=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment