Last active
November 19, 2024 11:21
-
-
Save mathzero/70f33fa243ba0bb4f46b72b8a13646e0 to your computer and use it in GitHub Desktop.
Programatically search PubMed
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Function requires a PubMed API key. They are free and easily obtained | |
#' Instructions get an API key: | |
#' https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/ | |
#' Register here: https://account.ncbi.nlm.nih.gov/ | |
#' This function runs a PubMed query and returns a data frame of metadata for the results | |
#' Where the query returns >10,000 (which normally triggers an error from the API), | |
#' the function automatically breaks down the search into chunks by date and returns a combined and de-duped data set | |
# Load libraries | |
library(rentrez) | |
library(pbapply) | |
fetch_pubmed_data <- function(query, batch_size = 1000, api_key = NULL, max_retries = 3, delay = 0.5) { | |
# Helper function to fetch data for a single query | |
fetch_single_query <- function(q, batch_size, api_key, max_retries, delay, query_label = "") { | |
# Perform an initial search to get the total count and web history | |
search_results <- entrez_search(db = "pubmed", term = q, use_history = TRUE, | |
retmax = 0, api_key = api_key) | |
total_records <- search_results$count | |
# Print concise message without full query | |
if (query_label != "") { | |
cat(query_label, ": ", total_records, " records found.\n", sep = "") | |
} else { | |
cat("Total records found: ", total_records, "\n", sep = "") | |
} | |
if (total_records == 0) { | |
return(NULL) | |
} | |
# Calculate start positions for each batch | |
starts <- seq(0, total_records - 1, by = batch_size) | |
# Function to fetch a batch of summaries with error handling | |
fetch_batch <- function(start) { | |
retries <- 0 | |
current_delay <- delay | |
repeat { | |
tryCatch({ | |
summaries <- entrez_summary(db = "pubmed", web_history = search_results$web_history, | |
retstart = start, retmax = batch_size, api_key = api_key) | |
return(summaries) | |
}, error = function(e) { | |
retries <<- retries + 1 | |
if (retries > max_retries) { | |
stop("Maximum retries exceeded for batch starting at ", start, ". Last error: ", e$message) | |
} | |
message("Error fetching batch starting at ", start, ": ", e$message) | |
message("Retrying in ", current_delay, " seconds...") | |
Sys.sleep(current_delay) | |
current_delay <<- current_delay * 2 # Exponential backoff | |
}) | |
} | |
} | |
# Fetch all summaries with a progress bar and error handling | |
summaries_list <- pblapply(starts, fetch_batch) | |
# Combine all summaries into a single list | |
all_summaries <- do.call(c, summaries_list) | |
# Convert the list of summaries to a data frame | |
df <- do.call(rbind, lapply(all_summaries, function(x) { | |
# Extract desired fields, handling missing values and data types | |
authors_list <- NA | |
# Enhanced error handling for authors extraction | |
tryCatch({ | |
if (!is.null(x$authors)) { | |
if (is.data.frame(x$authors) && nrow(x$authors) > 0 && "name" %in% names(x$authors)) { | |
authors_list <- paste(x$authors$name, collapse = "; ") | |
} else if (is.list(x$authors) && length(x$authors) > 0) { | |
author_names <- sapply(x$authors, function(a) { | |
if (is.list(a) && "name" %in% names(a)) { | |
return(a$name) | |
} else if (is.character(a)) { | |
return(a) | |
} else { | |
return(NA) | |
} | |
}) | |
authors_list <- paste(na.omit(author_names), collapse = "; ") | |
} else if (is.atomic(x$authors) && length(x$authors) > 0 && !is.na(x$authors)) { | |
authors_list <- paste(x$authors, collapse = "; ") | |
} | |
} | |
}, error = function(e) { | |
authors_list <- NA | |
}) | |
data.frame( | |
uid = if (!is.null(x$uid)) x$uid else NA, | |
pubdate = if (!is.null(x$pubdate)) x$pubdate else NA, | |
epubdate = if (!is.null(x$epubdate)) x$epubdate else NA, | |
source = if (!is.null(x$source)) x$source else NA, | |
authors = authors_list, | |
lastauthor = if (!is.null(x$lastauthor)) x$lastauthor else NA, | |
title = if (!is.null(x$title)) x$title else NA, | |
volume = if (!is.null(x$volume)) x$volume else NA, | |
issue = if (!is.null(x$issue)) x$issue else NA, | |
pages = if (!is.null(x$pages)) x$pages else NA, | |
lang = if (!is.null(x$lang)) x$lang else NA, | |
issn = if (!is.null(x$issn)) x$issn else NA, | |
essn = if (!is.null(x$essn)) x$essn else NA, | |
pubtype = if (!is.null(x$pubtype)) { | |
paste(x$pubtype, collapse = "; ") | |
} else { | |
NA | |
}, | |
pmcrefcount = if (!is.null(x$pmcrefcount)) x$pmcrefcount else NA, | |
fulljournalname = if (!is.null(x$fulljournalname)) x$fulljournalname else NA, | |
elocationid = if (!is.null(x$elocationid)) x$elocationid else NA, | |
doi = if (!is.null(x$articleids)) { | |
doi_index <- which(x$articleids$idtype == "doi") | |
if (length(doi_index) > 0) { | |
x$articleids$value[doi_index[1]] | |
} else { | |
NA | |
} | |
} else { | |
NA | |
}, | |
stringsAsFactors = FALSE | |
) | |
})) | |
return(df) | |
} | |
# Helper function to get the last day of the month | |
last_day_of_month <- function(date_obj) { | |
date_obj <- as.Date(format(date_obj, "%Y-%m-01")) # Ensure first day of month | |
next_month <- as.Date(format(date_obj, "%Y-%m-01")) + 31 | |
next_month <- as.Date(format(next_month, "%Y-%m-01")) | |
last_day <- next_month - 1 | |
return(last_day) | |
} | |
# Helper function to recursively split date ranges | |
split_date_range <- function(query_base, start_date_obj, end_date_obj, batch_size, api_key, max_retries, delay) { | |
# Format dates to strings | |
start_date_str <- format(start_date_obj, "%Y/%m/%d") | |
end_date_str <- format(end_date_obj, "%Y/%m/%d") | |
# Create the sub-query for the current interval | |
interval_query <- paste0(query_base, | |
" AND (\"", start_date_str, "\"[Date - Publication] : \"", end_date_str, "\"[Date - Publication])") | |
# Get the total records for the sub-query | |
sub_search <- entrez_search(db = "pubmed", term = interval_query, retmax = 0, api_key = api_key) | |
sub_total <- sub_search$count | |
# Print concise message | |
interval_label <- paste("Date range", start_date_str, "to", end_date_str) | |
# cat(interval_label, ": ", sub_total, " records found.\n", sep = "") | |
if (sub_total == 0) { | |
return(NULL) # No records found for this interval | |
} | |
if (sub_total <= 10000) { | |
# Fetch data for this interval | |
df_interval <- fetch_single_query(interval_query, batch_size, api_key, max_retries, delay, query_label = interval_label) | |
return(df_interval) | |
} else { | |
# Split the date range into two halves | |
mid_date_obj <- start_date_obj + floor(as.numeric(end_date_obj - start_date_obj) / 2) | |
# Ensure that mid_date_obj is not equal to start_date_obj or end_date_obj to prevent infinite recursion | |
if (mid_date_obj <= start_date_obj || mid_date_obj >= end_date_obj) { | |
# Try splitting by months | |
months_seq <- seq.Date(start_date_obj, end_date_obj, by = "1 month") | |
if (length(months_seq) <= 1) { | |
stop("Cannot split date range further to reduce records below 10,000.") | |
} | |
combined_df <- NULL | |
for (i in seq_along(months_seq)[-length(months_seq)]) { | |
sub_start <- months_seq[i] | |
sub_end <- months_seq[i + 1] - 1 # Subtract one day | |
df_sub <- split_date_range(query_base, sub_start, sub_end, batch_size, api_key, max_retries, delay) | |
combined_df <- rbind(combined_df, df_sub) | |
} | |
return(combined_df) | |
} | |
# Recursively split the first half | |
df_first_half <- split_date_range(query_base, start_date_obj, mid_date_obj, batch_size, api_key, max_retries, delay) | |
# Recursively split the second half | |
df_second_half <- split_date_range(query_base, mid_date_obj + 1, end_date_obj, batch_size, api_key, max_retries, delay) | |
# Combine results | |
combined_df <- rbind(df_first_half, df_second_half) | |
return(combined_df) | |
} | |
} | |
# Main function logic | |
# Perform an initial search to get the total count | |
initial_search <- entrez_search(db = "pubmed", term = query, retmax = 0, api_key = api_key) | |
total_records <- initial_search$count | |
cat("Total records found for initial query: ", total_records, "\n", sep = "") | |
# Check if total records <= 10,000 | |
if (total_records <= 10000) { | |
return(fetch_single_query(query, batch_size, api_key, max_retries, delay)) | |
} else { | |
cat("N records > 10,000. Chunking by date to reduce query size") | |
# Attempt to detect date range in the query | |
# Regular expressions to find date ranges in various formats | |
date_range_patterns <- list( | |
# Pattern for ("YYYY-MM-DD"[Date - Publication] : "YYYY-MM-DD"[Date - Publication]) | |
"\\(\\s*\"?(\\d{4}-\\d{2}-\\d{2})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*[:\\-]\\s*\"?(\\d{4}-\\d{2}-\\d{2})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*\\)", | |
# Pattern for ("YYYY-MM"[Date - Publication] : "YYYY-MM"[Date - Publication]) | |
"\\(\\s*\"?(\\d{4}-\\d{2})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*[:\\-]\\s*\"?(\\d{4}-\\d{2})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*\\)", | |
# Pattern for ("YYYY"[Date - Publication] : "YYYY"[Date - Publication]) | |
"\\(\\s*\"?(\\d{4})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*[:\\-]\\s*\"?(\\d{4})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*\\)", | |
# Pattern for YYYY[:\-]YYYY[DP] | |
"(\\d{4})\\s*[:\\-]\\s*(\\d{4})\\s*\\[DP\\]" | |
) | |
date_range_found <- FALSE | |
for (pattern in date_range_patterns) { | |
date_ranges <- gregexpr(pattern, query, perl = TRUE) | |
date_range_matches <- regmatches(query, date_ranges) | |
if (length(date_range_matches[[1]]) > 0) { | |
# Extract the date range(s) from the query | |
date_range_string <- date_range_matches[[1]][1] | |
date_range_found <- TRUE | |
# Remove the date range from the query to avoid duplication | |
query_without_date <- trimws(gsub(pattern, "", query, perl = TRUE)) | |
# Extract the start and end dates using regmatches | |
matches <- regexec(pattern, date_range_string, perl = TRUE) | |
date_parts <- regmatches(date_range_string, matches) | |
start_date <- date_parts[[1]][2] | |
end_date <- date_parts[[1]][3] | |
if (is.na(end_date) || end_date == "") { | |
# Single date provided | |
end_date <- start_date | |
} | |
cat("Detected date range in query: ", start_date, " to ", end_date, "\n", sep = "") | |
break | |
} | |
} | |
if (!date_range_found) { | |
# No date range detected, set default range | |
start_date <- "1900-01-01" | |
end_date <- format(Sys.Date(), "%Y-%m-%d") | |
query_without_date <- query | |
cat("No date range detected in query. Using default date range: ", start_date, " to ", end_date, "\n", sep = "") | |
} | |
# Parse dates into Date objects using base::as.Date() | |
parse_date <- function(date_str, is_end_date = FALSE) { | |
# Remove surrounding quotes and whitespace | |
date_str <- gsub('^"|"$', '', date_str) | |
date_str <- trimws(date_str) | |
# Try parsing with different formats | |
if (grepl("^\\d{4}-\\d{2}-\\d{2}$", date_str)) { | |
date_obj <- base::as.Date(date_str, format = "%Y-%m-%d") | |
} else if (grepl("^\\d{4}-\\d{2}$", date_str)) { | |
# Handle YYYY-MM format | |
date_obj <- base::as.Date(paste0(date_str, "-01"), format = "%Y-%m-%d") | |
if (is_end_date) { | |
# Get last day of the month | |
date_obj <- last_day_of_month(date_obj) | |
} | |
} else if (grepl("^\\d{4}$", date_str)) { | |
# Handle YYYY format | |
if (is_end_date) { | |
date_obj <- base::as.Date(paste0(date_str, "-12-31"), format = "%Y-%m-%d") | |
} else { | |
date_obj <- base::as.Date(paste0(date_str, "-01-01"), format = "%Y-%m-%d") | |
} | |
} else { | |
date_obj <- NA | |
} | |
return(date_obj) | |
} | |
start_date_obj <- parse_date(start_date) | |
if (is.na(start_date_obj)) { | |
stop("Failed to parse start date: ", start_date) | |
} | |
end_date_obj <- parse_date(end_date, is_end_date = TRUE) | |
if (is.na(end_date_obj)) { | |
stop("Failed to parse end date: ", end_date) | |
} | |
# Ensure end_date_obj is not before start_date_obj | |
if (end_date_obj < start_date_obj) { | |
stop("End date is before start date in the date range.") | |
} | |
# Remove any residual colons or brackets from the query | |
query_without_date <- gsub("\\s*[:;]\\s*", " ", query_without_date) | |
query_without_date <- gsub("\\s*\\[\\s*\\]", " ", query_without_date) | |
# Trim any extra whitespace | |
query_without_date <- trimws(query_without_date) | |
# Recursively split the date range and fetch data | |
combined_df <- split_date_range(query_without_date, start_date_obj, end_date_obj, batch_size, api_key, max_retries, delay) | |
# Remove duplicate records based on PMID (uid) | |
if (!is.null(combined_df)) { | |
combined_df <- combined_df[!duplicated(combined_df$uid), ] | |
# Reset row names | |
row.names(combined_df) <- NULL | |
} | |
return(combined_df) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment