Programmatically search PubMed
#' This function requires a PubMed API key. Keys are free and easily obtained.
#' Instructions for getting an API key:
#' https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
#' Register here: https://account.ncbi.nlm.nih.gov/
#' The function runs a PubMed query and returns a data frame of metadata for the results.
#' Where a query returns more than 10,000 records (which would normally trigger an error from the API),
#' the function automatically breaks the search into date-based chunks and returns a combined, de-duplicated data set.
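#' Example query string (illustrative only; any valid PubMed query works), e.g.:
#'   covid-19[Title/Abstract] AND ("2020/01/01"[Date - Publication] : "2022/12/31"[Date - Publication])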
# Load libraries
library(rentrez)
library(pbapply)
fetch_pubmed_data <- function(query, batch_size = 1000, api_key = NULL, max_retries = 3, delay = 0.5) {
# Helper function to fetch data for a single query
fetch_single_query <- function(q, batch_size, api_key, max_retries, delay, query_label = "") {
# Perform an initial search to get the total count and web history
search_results <- entrez_search(db = "pubmed", term = q, use_history = TRUE,
retmax = 0, api_key = api_key)
total_records <- search_results$count
# Print concise message without full query
if (query_label != "") {
cat(query_label, ": ", total_records, " records found.\n", sep = "")
} else {
cat("Total records found: ", total_records, "\n", sep = "")
}
if (total_records == 0) {
return(NULL)
}
# Calculate start positions for each batch
starts <- seq(0, total_records - 1, by = batch_size)
# Function to fetch a batch of summaries with error handling
fetch_batch <- function(start) {
retries <- 0
current_delay <- delay
repeat {
tryCatch({
summaries <- entrez_summary(db = "pubmed", web_history = search_results$web_history,
retstart = start, retmax = batch_size, api_key = api_key,
always_return_list = TRUE) # always return a list, even when a batch contains a single record
return(summaries)
}, error = function(e) {
retries <<- retries + 1
if (retries > max_retries) {
stop("Maximum retries exceeded for batch starting at ", start, ". Last error: ", e$message)
}
message("Error fetching batch starting at ", start, ": ", e$message)
message("Retrying in ", current_delay, " seconds...")
Sys.sleep(current_delay)
current_delay <<- current_delay * 2 # Exponential backoff
})
}
}
# Fetch all summaries with a progress bar and error handling
summaries_list <- pblapply(starts, fetch_batch)
# Combine all summaries into a single list
all_summaries <- do.call(c, summaries_list)
# Convert the list of summaries to a data frame
df <- do.call(rbind, lapply(all_summaries, function(x) {
# Extract desired fields, handling missing values and data types
authors_list <- NA
# Enhanced error handling for authors extraction
tryCatch({
if (!is.null(x$authors)) {
if (is.data.frame(x$authors) && nrow(x$authors) > 0 && "name" %in% names(x$authors)) {
authors_list <- paste(x$authors$name, collapse = "; ")
} else if (is.list(x$authors) && length(x$authors) > 0) {
author_names <- sapply(x$authors, function(a) {
if (is.list(a) && "name" %in% names(a)) {
return(a$name)
} else if (is.character(a)) {
return(a)
} else {
return(NA)
}
})
authors_list <- paste(na.omit(author_names), collapse = "; ")
} else if (is.atomic(x$authors) && length(x$authors) > 0 && !is.na(x$authors)) {
authors_list <- paste(x$authors, collapse = "; ")
}
}
}, error = function(e) {
authors_list <- NA
})
data.frame(
uid = if (!is.null(x$uid)) x$uid else NA,
pubdate = if (!is.null(x$pubdate)) x$pubdate else NA,
epubdate = if (!is.null(x$epubdate)) x$epubdate else NA,
source = if (!is.null(x$source)) x$source else NA,
authors = authors_list,
lastauthor = if (!is.null(x$lastauthor)) x$lastauthor else NA,
title = if (!is.null(x$title)) x$title else NA,
volume = if (!is.null(x$volume)) x$volume else NA,
issue = if (!is.null(x$issue)) x$issue else NA,
pages = if (!is.null(x$pages)) x$pages else NA,
lang = if (!is.null(x$lang)) x$lang else NA,
issn = if (!is.null(x$issn)) x$issn else NA,
essn = if (!is.null(x$essn)) x$essn else NA,
pubtype = if (!is.null(x$pubtype)) {
paste(x$pubtype, collapse = "; ")
} else {
NA
},
pmcrefcount = if (!is.null(x$pmcrefcount)) x$pmcrefcount else NA,
fulljournalname = if (!is.null(x$fulljournalname)) x$fulljournalname else NA,
elocationid = if (!is.null(x$elocationid)) x$elocationid else NA,
doi = if (!is.null(x$articleids)) {
doi_index <- which(x$articleids$idtype == "doi")
if (length(doi_index) > 0) {
x$articleids$value[doi_index[1]]
} else {
NA
}
} else {
NA
},
stringsAsFactors = FALSE
)
}))
return(df)
}
# Helper function to get the last day of the month
last_day_of_month <- function(date_obj) {
date_obj <- as.Date(format(date_obj, "%Y-%m-01")) # Ensure first day of month
next_month <- as.Date(format(date_obj, "%Y-%m-01")) + 31
next_month <- as.Date(format(next_month, "%Y-%m-01"))
last_day <- next_month - 1
return(last_day)
}
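# e.g. last_day_of_month(as.Date("2024-02-10")) returns as.Date("2024-02-29")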
# Helper function to recursively split date ranges
split_date_range <- function(query_base, start_date_obj, end_date_obj, batch_size, api_key, max_retries, delay) {
# Format dates to strings
start_date_str <- format(start_date_obj, "%Y/%m/%d")
end_date_str <- format(end_date_obj, "%Y/%m/%d")
# Create the sub-query for the current interval
interval_query <- paste0(query_base,
" AND (\"", start_date_str, "\"[Date - Publication] : \"", end_date_str, "\"[Date - Publication])")
# Get the total records for the sub-query
sub_search <- entrez_search(db = "pubmed", term = interval_query, retmax = 0, api_key = api_key)
sub_total <- sub_search$count
# Print concise message
interval_label <- paste("Date range", start_date_str, "to", end_date_str)
# cat(interval_label, ": ", sub_total, " records found.\n", sep = "")
if (sub_total == 0) {
return(NULL) # No records found for this interval
}
if (sub_total <= 10000) {
# Fetch data for this interval
df_interval <- fetch_single_query(interval_query, batch_size, api_key, max_retries, delay, query_label = interval_label)
return(df_interval)
} else {
# Split the date range into two halves
mid_date_obj <- start_date_obj + floor(as.numeric(end_date_obj - start_date_obj) / 2)
# Ensure that mid_date_obj is not equal to start_date_obj or end_date_obj to prevent infinite recursion
if (mid_date_obj <= start_date_obj || mid_date_obj >= end_date_obj) {
# Try splitting by months
months_seq <- seq.Date(start_date_obj, end_date_obj, by = "1 month")
if (length(months_seq) <= 1) {
stop("Cannot split date range further to reduce records below 10,000.")
}
combined_df <- NULL
for (i in seq_along(months_seq)) {
sub_start <- months_seq[i]
# The last interval runs to the true end date so no records at the tail of the range are missed
sub_end <- if (i < length(months_seq)) months_seq[i + 1] - 1 else end_date_obj
df_sub <- split_date_range(query_base, sub_start, sub_end, batch_size, api_key, max_retries, delay)
combined_df <- rbind(combined_df, df_sub)
}
return(combined_df)
}
# Recursively split the first half
df_first_half <- split_date_range(query_base, start_date_obj, mid_date_obj, batch_size, api_key, max_retries, delay)
# Recursively split the second half
df_second_half <- split_date_range(query_base, mid_date_obj + 1, end_date_obj, batch_size, api_key, max_retries, delay)
# Combine results
combined_df <- rbind(df_first_half, df_second_half)
return(combined_df)
}
}
# Main function logic
# Perform an initial search to get the total count
initial_search <- entrez_search(db = "pubmed", term = query, retmax = 0, api_key = api_key)
total_records <- initial_search$count
cat("Total records found for initial query: ", total_records, "\n", sep = "")
# Check if total records <= 10,000
if (total_records <= 10000) {
return(fetch_single_query(query, batch_size, api_key, max_retries, delay))
} else {
cat("N records > 10,000. Chunking by date to reduce query size")
# Attempt to detect date range in the query
# Regular expressions to find date ranges in various formats
date_range_patterns <- list(
# Pattern for ("YYYY-MM-DD"[Date - Publication] : "YYYY-MM-DD"[Date - Publication]); dates may use "-" or "/"
"\\(\\s*\"?(\\d{4}[-/]\\d{2}[-/]\\d{2})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*[:\\-]\\s*\"?(\\d{4}[-/]\\d{2}[-/]\\d{2})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*\\)",
# Pattern for ("YYYY-MM"[Date - Publication] : "YYYY-MM"[Date - Publication])
"\\(\\s*\"?(\\d{4}[-/]\\d{2})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*[:\\-]\\s*\"?(\\d{4}[-/]\\d{2})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*\\)",
# Pattern for ("YYYY"[Date - Publication] : "YYYY"[Date - Publication])
"\\(\\s*\"?(\\d{4})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*[:\\-]\\s*\"?(\\d{4})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*\\)",
# Pattern for YYYY:YYYY[DP]
"(\\d{4})\\s*[:\\-]\\s*(\\d{4})\\s*\\[DP\\]"
)
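# Examples of date clauses the patterns above will match (illustrative):
#   ("2020-01-01"[Date - Publication] : "2022-12-31"[Date - Publication])
#   ("2020-01"[Date - Publication] : "2022-12"[Date - Publication])
#   2010:2020[DP]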
date_range_found <- FALSE
for (pattern in date_range_patterns) {
date_ranges <- gregexpr(pattern, query, perl = TRUE)
date_range_matches <- regmatches(query, date_ranges)
if (length(date_range_matches[[1]]) > 0) {
# Extract the date range(s) from the query
date_range_string <- date_range_matches[[1]][1]
date_range_found <- TRUE
# Remove the date range from the query to avoid duplication
query_without_date <- trimws(gsub(pattern, "", query, perl = TRUE))
# Extract the start and end dates using regmatches
matches <- regexec(pattern, date_range_string, perl = TRUE)
date_parts <- regmatches(date_range_string, matches)
start_date <- date_parts[[1]][2]
end_date <- date_parts[[1]][3]
if (is.na(end_date) || end_date == "") {
# Single date provided
end_date <- start_date
}
cat("Detected date range in query: ", start_date, " to ", end_date, "\n", sep = "")
break
}
}
if (!date_range_found) {
# No date range detected, set default range
start_date <- "1900-01-01"
end_date <- format(Sys.Date(), "%Y-%m-%d")
query_without_date <- query
cat("No date range detected in query. Using default date range: ", start_date, " to ", end_date, "\n", sep = "")
}
# Parse dates into Date objects using base::as.Date()
parse_date <- function(date_str, is_end_date = FALSE) {
# Remove surrounding quotes and whitespace; normalise "/" separators to "-"
date_str <- gsub('^"|"$', '', date_str)
date_str <- trimws(date_str)
date_str <- gsub("/", "-", date_str)
# Try parsing with different formats
if (grepl("^\\d{4}-\\d{2}-\\d{2}$", date_str)) {
date_obj <- base::as.Date(date_str, format = "%Y-%m-%d")
} else if (grepl("^\\d{4}-\\d{2}$", date_str)) {
# Handle YYYY-MM format
date_obj <- base::as.Date(paste0(date_str, "-01"), format = "%Y-%m-%d")
if (is_end_date) {
# Get last day of the month
date_obj <- last_day_of_month(date_obj)
}
} else if (grepl("^\\d{4}$", date_str)) {
# Handle YYYY format
if (is_end_date) {
date_obj <- base::as.Date(paste0(date_str, "-12-31"), format = "%Y-%m-%d")
} else {
date_obj <- base::as.Date(paste0(date_str, "-01-01"), format = "%Y-%m-%d")
}
} else {
date_obj <- NA
}
return(date_obj)
}
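# e.g. parse_date("2020", is_end_date = TRUE) returns as.Date("2020-12-31");
#      parse_date("2021-06") returns as.Date("2021-06-01")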
start_date_obj <- parse_date(start_date)
if (is.na(start_date_obj)) {
stop("Failed to parse start date: ", start_date)
}
end_date_obj <- parse_date(end_date, is_end_date = TRUE)
if (is.na(end_date_obj)) {
stop("Failed to parse end date: ", end_date)
}
# Ensure end_date_obj is not before start_date_obj
if (end_date_obj < start_date_obj) {
stop("End date is before start date in the date range.")
}
# Remove any residual colons, empty brackets, or dangling boolean operators
# left behind after stripping the date clause from the query
query_without_date <- gsub("\\s*[:;]\\s*", " ", query_without_date)
query_without_date <- gsub("\\s*\\[\\s*\\]", " ", query_without_date)
query_without_date <- gsub("^\\s*(AND|OR|NOT)\\s+|\\s+(AND|OR|NOT)\\s*$", "", query_without_date, ignore.case = TRUE)
# Trim any extra whitespace
query_without_date <- trimws(query_without_date)
# Recursively split the date range and fetch data
combined_df <- split_date_range(query_without_date, start_date_obj, end_date_obj, batch_size, api_key, max_retries, delay)
# Remove duplicate records based on PMID (uid)
if (!is.null(combined_df)) {
combined_df <- combined_df[!duplicated(combined_df$uid), ]
# Reset row names
row.names(combined_df) <- NULL
}
return(combined_df)
}
}
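# Example usage (a minimal sketch; the query string and API key below are placeholders,
# not values from the original gist):
# my_api_key <- "YOUR_NCBI_API_KEY"
# results <- fetch_pubmed_data(
#   query = "covid-19[Title/Abstract] AND (\"2020-01-01\"[Date - Publication] : \"2022-12-31\"[Date - Publication])",
#   batch_size = 1000,
#   api_key = my_api_key
# )
# head(results$title)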