Get metadata for many crossref DOIs in one R call
library(httr)
library(jsonlite)
library(dplyr)
library(purrr)
library(stringr)

# Null-coalescing helper: return a unless it is NULL, otherwise b
`%||%` <- function(a, b) if (!is.null(a)) a else b
# Turn a Crossref date field (a list with a `date-parts` element) into an
# ISO-style string, using as much precision as the record provides
extract_date <- function(x) {
  if (is.null(x) || is.null(x$`date-parts`)) return(NA_character_)
  parts <- suppressWarnings(as.integer(unlist(x$`date-parts`[[1]])))
  if (length(parts) == 0 || is.na(parts[1])) return(NA_character_)
  if (length(parts) >= 3) sprintf("%04d-%02d-%02d", parts[1], parts[2], parts[3])
  else if (length(parts) == 2) sprintf("%04d-%02d", parts[1], parts[2])
  else sprintf("%04d", parts[1])
}
# Collapse Crossref author entries into a single "Family, Given; ..." string
format_authors <- function(authors) {
  if (is.null(authors) || length(authors) == 0) return(NA_character_)
  names <- map_chr(authors, function(a) {
    if (!is.null(a$family) && !is.null(a$given)) {
      paste0(a$family, ", ", a$given)
    } else if (!is.null(a$name)) {
      a$name       # organisational authors only have a name
    } else if (!is.null(a$literal)) {
      a$literal
    } else {
      NA_character_
    }
  })
  str_c(na.omit(names), collapse = "; ")
}
# Flatten one Crossref work record into a single tibble row
item_to_row <- function(item) {
  tibble(
    doi = item$DOI %||% NA_character_,
    title = if (length(item$title)) item$title[[1]] else NA_character_,
    type = item$type %||% NA_character_,
    publisher = item$publisher %||% NA_character_,
    container_title = if (length(item$`container-title`)) item$`container-title`[[1]] else NA_character_,
    author = format_authors(item$author),
    # prefer the print date, then the online date, then the general "issued" date
    published = {
      if (!is.null(item$`published-print`)) extract_date(item$`published-print`)
      else if (!is.null(item$`published-online`)) extract_date(item$`published-online`)
      else if (!is.null(item$issued)) extract_date(item$issued)
      else NA_character_
    },
    url = item$URL %||% NA_character_
  )
}
filter_valid_dois <- function(dois) {
  # trim and keep original for warning message
  dois_trim <- trimws(dois)
  # valid pattern: starts with "10.", then ≥4 digits, slash, then non-empty suffix
  pat <- "^10\\.[0-9]{4,}/\\S+$"
  ok <- grepl(pat, dois_trim, perl = TRUE)
  invalid <- unique(dois[!ok])
  if (length(invalid)) {
    warning(
      sprintf(
        "Dropped %d invalid DOI(s): %s",
        length(invalid),
        paste(invalid, collapse = ", ")
      )
    )
  }
  dois_trim[ok]
}
# Fetch metadata for a vector of DOIs in batches via the Crossref /works filter,
# so each batch needs only one API call
get_crossref_metadata <- function(dois, email, batch_size = 500, pause = 1) {
  stopifnot(length(email) == 1, nzchar(email))
  dois <- filter_valid_dois(dois)
  chunks <- split(dois, ceiling(seq_along(dois) / batch_size))
  # identify the client to Crossref (mailto routes requests to the polite pool)
  ua <- user_agent(sprintf("R (https://example.com) mailto:%s", email))
  out <- list()
  for (chunk in chunks) {
    # one request per batch: filter=doi:...,doi:... returns all matches at once
    filter_val <- paste0("doi:", paste0(chunk, collapse = ",doi:"))
    params <- list(filter = filter_val, rows = length(chunk))
    resp <- NULL
    # up to 3 attempts with exponential backoff (1s, 2s) between retries
    for (attempt in 1:3) {
      resp <- GET("https://api.crossref.org/works", ua, query = params)
      if (status_code(resp) == 200) break
      if (attempt < 3) Sys.sleep(2 ^ (attempt - 1))
    }
    if (is.null(resp) || status_code(resp) != 200) {
      warning("Failed to fetch batch: ", paste(chunk, collapse = ", "))
      next
    }
    content_txt <- content(resp, as = "text", encoding = "UTF-8")
    parsed <- fromJSON(content_txt, simplifyVector = FALSE)
    items <- parsed$message$items
    if (length(items) == 0) next
    out[[length(out) + 1]] <- map_dfr(items, item_to_row)
    Sys.sleep(pause)  # be polite between batches
  }
  result <- bind_rows(out)
  # warn about DOIs for which Crossref returned nothing (match case-insensitively)
  missing <- setdiff(tolower(dois), tolower(result$doi))
  if (length(missing)) {
    warning("No metadata returned for these DOIs: ", paste(missing, collapse = ", "))
  }
  result
}
rcrossref::cr_works() makes one API call per DOI; this script requests many DOIs per call, so it is much faster for large sets (though not as robustly tested yet).
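A minimal usage sketch. The DOIs below are just illustrative examples and the email address is a placeholder; substitute your own so Crossref can attribute the traffic to you:

# Example call (illustrative DOIs; replace the email with your own)
dois <- c(
  "10.1038/s41586-020-2649-2",   # example DOI
  "10.1038/s41592-019-0686-2",   # example DOI
  "not-a-doi"                    # dropped with a warning by filter_valid_dois()
)
meta <- get_crossref_metadata(dois, email = "you@example.org")
dplyr::glimpse(meta)
# Expect one row per resolved DOI with columns doi, title, type, publisher,
# container_title, author, published, and url; invalid or unmatched DOIs
# produce warnings rather than errors.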