Skip to content

Instantly share code, notes, and snippets.

@LukasWallrich
Created August 1, 2025 15:32
Show Gist options
  • Save LukasWallrich/9413261130a882a0d9995981fee50743 to your computer and use it in GitHub Desktop.
Save LukasWallrich/9413261130a882a0d9995981fee50743 to your computer and use it in GitHub Desktop.
Get metadata for many crossref DOIs in one R call
library(httr)
library(jsonlite)
library(dplyr)
library(purrr)
library(stringr)
`%||%` <- function(a, b) if (!is.null(a)) a else b
extract_date <- function(x) {
if (is.null(x)) return(NA_character_)
parts <- x$`date-parts`[[1]]
parts <- as.integer(parts)
if (length(parts) >= 3) sprintf("%04d-%02d-%02d", parts[1], parts[2], parts[3])
else if (length(parts) == 2) sprintf("%04d-%02d", parts[1], parts[2])
else sprintf("%04d", parts[1])
}
format_authors <- function(authors) {
if (is.null(authors) || length(authors) == 0) return(NA_character_)
names <- map_chr(authors, function(a) {
if (!is.null(a$family) && !is.null(a$given)) {
paste0(a$family, ", ", a$given)
} else if (!is.null(a$name)) {
a$name
} else if (!is.null(a$literal)) {
a$literal
} else {
NA_character_
}
})
str_c(na.omit(names), collapse = "; ")
}
item_to_row <- function(item) {
tibble(
doi = item$DOI %||% NA_character_,
title = if (length(item$title)) item$title[[1]] else NA_character_,
type = item$type %||% NA_character_,
publisher = item$publisher %||% NA_character_,
container_title = if (length(item$`container-title`)) item$`container-title`[[1]] else NA_character_,
author = format_authors(item$author),
published = {
if (!is.null(item$`published-print`)) extract_date(item$`published-print`)
else if (!is.null(item$`published-online`)) extract_date(item$`published-online`)
else if (!is.null(item$issued)) extract_date(item$issued)
else NA_character_
},
url = item$URL %||% NA_character_
)
}
filter_valid_dois <- function(dois) {
# trim and keep original for warning message
dois_trim <- trimws(dois)
# valid pattern: starts with "10.", then ≥4 digits, slash, then non-empty suffix
pat <- "^10\\.[0-9]{4,}/\\S+$"
ok <- grepl(pat, dois_trim, perl = TRUE)
invalid <- unique(dois[!ok])
if (length(invalid)) {
warning(
sprintf(
"Dropped %d invalid DOI(s): %s",
length(invalid),
paste(invalid, collapse = ", ")
)
)
}
dois_trim[ok]
}
get_crossref_metadata <- function(dois, email, batch_size = 500, pause = 1) {
stopifnot(length(email) == 1, nzchar(email))
dois <- filter_valid_dois(dois)
chunks <- split(dois, ceiling(seq_along(dois) / batch_size))
out <- list()
for (chunk in chunks) {
filter_val <- paste0("doi:", paste0(chunk, collapse = ",doi:"))
params <- list(filter = filter_val, rows = length(chunk))
ua <- user_agent(sprintf("R (https://example.com) mailto:%s", email))
resp <- NULL
for (attempt in 1:3) {
resp <- GET("https://api.crossref.org/works", ua, query = params)
if (status_code(resp) == 200) break
Sys.sleep(2 ^ (attempt - 1))
}
if (is.null(resp) || status_code(resp) != 200) {
warning("Failed to fetch batch: ", paste(chunk, collapse = ", "))
next
}
content_txt <- content(resp, as = "text", encoding = "UTF-8")
parsed <- fromJSON(content_txt, simplifyVector = FALSE)
items <- parsed$message$items
if (length(items) == 0) next
df <- map_dfr(items, item_to_row)
out[[length(out) + 1]] <- df
Sys.sleep(pause)
}
result <- bind_rows(out)
# annotate missing DOIs if any
missing <- setdiff(tolower(dois), tolower(result$doi))
if (length(missing)) {
warning("No metadata returned for these DOIs: ", paste(missing, collapse = ", "))
}
result
}
@LukasWallrich
Copy link
Author

rcrossref::cr_works makes one API call per DOI - this requests many at once and is much faster (though not as robustly tested yet)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment