Get metadata for many crossref DOIs in one R call
library(httr)
library(jsonlite)
library(dplyr)
library(purrr)
library(stringr)

# Null-coalescing helper: return a unless it is NULL, otherwise b
`%||%` <- function(a, b) if (!is.null(a)) a else b
# Turn a Crossref date field (a list with a `date-parts` element) into an
# ISO-style string, using as much precision as the record provides
extract_date <- function(x) {
  if (is.null(x) || is.null(x$`date-parts`)) return(NA_character_)
  parts <- suppressWarnings(as.integer(unlist(x$`date-parts`[[1]])))
  if (length(parts) == 0 || is.na(parts[1])) return(NA_character_)
  if (length(parts) >= 3) sprintf("%04d-%02d-%02d", parts[1], parts[2], parts[3])
  else if (length(parts) == 2) sprintf("%04d-%02d", parts[1], parts[2])
  else sprintf("%04d", parts[1])
}
# Collapse Crossref author entries into a single "Family, Given; ..." string
format_authors <- function(authors) {
  if (is.null(authors) || length(authors) == 0) return(NA_character_)
  names <- map_chr(authors, function(a) {
    if (!is.null(a$family) && !is.null(a$given)) {
      paste0(a$family, ", ", a$given)
    } else if (!is.null(a$name)) {
      a$name       # organisational authors only have a name
    } else if (!is.null(a$literal)) {
      a$literal
    } else {
      NA_character_
    }
  })
  str_c(na.omit(names), collapse = "; ")
}
# Flatten one Crossref work record into a single tibble row
item_to_row <- function(item) {
  tibble(
    doi = item$DOI %||% NA_character_,
    title = if (length(item$title)) item$title[[1]] else NA_character_,
    type = item$type %||% NA_character_,
    publisher = item$publisher %||% NA_character_,
    container_title = if (length(item$`container-title`)) item$`container-title`[[1]] else NA_character_,
    author = format_authors(item$author),
    # prefer the print date, then the online date, then the general "issued" date
    published = {
      if (!is.null(item$`published-print`)) extract_date(item$`published-print`)
      else if (!is.null(item$`published-online`)) extract_date(item$`published-online`)
      else if (!is.null(item$issued)) extract_date(item$issued)
      else NA_character_
    },
    url = item$URL %||% NA_character_
  )
}
filter_valid_dois <- function(dois) {
  # trim and keep original for warning message
  dois_trim <- trimws(dois)
  # valid pattern: starts with "10.", then ≥4 digits, slash, then non-empty suffix
  pat <- "^10\\.[0-9]{4,}/\\S+$"
  ok <- grepl(pat, dois_trim, perl = TRUE)
  invalid <- unique(dois[!ok])
  if (length(invalid)) {
    warning(
      sprintf(
        "Dropped %d invalid DOI(s): %s",
        length(invalid),
        paste(invalid, collapse = ", ")
      )
    )
  }
  dois_trim[ok]
}
# Fetch metadata for a vector of DOIs in batches via the Crossref /works filter,
# so each batch needs only one API call
get_crossref_metadata <- function(dois, email, batch_size = 500, pause = 1) {
  stopifnot(length(email) == 1, nzchar(email))
  dois <- filter_valid_dois(dois)
  chunks <- split(dois, ceiling(seq_along(dois) / batch_size))
  # identify the client to Crossref (mailto routes requests to the polite pool)
  ua <- user_agent(sprintf("R (https://example.com) mailto:%s", email))
  out <- list()
  for (chunk in chunks) {
    # one request per batch: filter=doi:...,doi:... returns all matches at once
    filter_val <- paste0("doi:", paste0(chunk, collapse = ",doi:"))
    params <- list(filter = filter_val, rows = length(chunk))
    resp <- NULL
    # up to 3 attempts with exponential backoff (1s, 2s) between retries
    for (attempt in 1:3) {
      resp <- GET("https://api.crossref.org/works", ua, query = params)
      if (status_code(resp) == 200) break
      if (attempt < 3) Sys.sleep(2 ^ (attempt - 1))
    }
    if (is.null(resp) || status_code(resp) != 200) {
      warning("Failed to fetch batch: ", paste(chunk, collapse = ", "))
      next
    }
    content_txt <- content(resp, as = "text", encoding = "UTF-8")
    parsed <- fromJSON(content_txt, simplifyVector = FALSE)
    items <- parsed$message$items
    if (length(items) == 0) next
    out[[length(out) + 1]] <- map_dfr(items, item_to_row)
    Sys.sleep(pause)  # be polite between batches
  }
  result <- bind_rows(out)
  # warn about DOIs for which Crossref returned nothing (match case-insensitively)
  missing <- setdiff(tolower(dois), tolower(result$doi))
  if (length(missing)) {
    warning("No metadata returned for these DOIs: ", paste(missing, collapse = ", "))
  }
  result
}
rcrossref::cr_works() makes one API call per DOI; this script requests many DOIs per call, so it is much faster for large sets (though not as robustly tested yet).
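A minimal usage sketch. The DOIs below are just illustrative examples and the email address is a placeholder; substitute your own so Crossref can attribute the traffic to you:

# Example call (illustrative DOIs; replace the email with your own)
dois <- c(
  "10.1038/s41586-020-2649-2",   # example DOI
  "10.1038/s41592-019-0686-2",   # example DOI
  "not-a-doi"                    # dropped with a warning by filter_valid_dois()
)
meta <- get_crossref_metadata(dois, email = "you@example.org")
dplyr::glimpse(meta)
# Expect one row per resolved DOI with columns doi, title, type, publisher,
# container_title, author, published, and url; invalid or unmatched DOIs
# produce warnings rather than errors.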