Programmatically search PubMed
#' This function requires a PubMed API key. Keys are free and easily obtained.
#' Instructions for getting an API key:
#' https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
#' Register here: https://account.ncbi.nlm.nih.gov/
#' The function runs a PubMed query and returns a data frame of metadata for the results.
#' Where a query returns more than 10,000 records (which would normally trigger an error from the API),
#' the function automatically breaks the search into date-based chunks and returns a combined, de-duplicated data set.
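#' Example query string (illustrative only; any valid PubMed query works), e.g.:
#'   covid-19[Title/Abstract] AND ("2020/01/01"[Date - Publication] : "2022/12/31"[Date - Publication])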
# Load libraries
library(rentrez)
library(pbapply)
fetch_pubmed_data <- function(query, batch_size = 1000, api_key = NULL, max_retries = 3, delay = 0.5) {
# Helper function to fetch data for a single query
fetch_single_query <- function(q, batch_size, api_key, max_retries, delay, query_label = "") {
# Perform an initial search to get the total count and web history
search_results <- entrez_search(db = "pubmed", term = q, use_history = TRUE,
retmax = 0, api_key = api_key)
total_records <- search_results$count
# Print concise message without full query
if (query_label != "") {
cat(query_label, ": ", total_records, " records found.\n", sep = "")
} else {
cat("Total records found: ", total_records, "\n", sep = "")
}
if (total_records == 0) {
return(NULL)
}
# Calculate start positions for each batch
starts <- seq(0, total_records - 1, by = batch_size)
# Function to fetch a batch of summaries with error handling
fetch_batch <- function(start) {
retries <- 0
current_delay <- delay
repeat {
tryCatch({
summaries <- entrez_summary(db = "pubmed", web_history = search_results$web_history,
retstart = start, retmax = batch_size, api_key = api_key,
always_return_list = TRUE) # always return a list, even when a batch contains a single record
return(summaries)
}, error = function(e) {
retries <<- retries + 1
if (retries > max_retries) {
stop("Maximum retries exceeded for batch starting at ", start, ". Last error: ", e$message)
}
message("Error fetching batch starting at ", start, ": ", e$message)
message("Retrying in ", current_delay, " seconds...")
Sys.sleep(current_delay)
current_delay <<- current_delay * 2 # Exponential backoff
})
}
}
# Fetch all summaries with a progress bar and error handling
summaries_list <- pblapply(starts, fetch_batch)
# Combine all summaries into a single list
all_summaries <- do.call(c, summaries_list)
# Convert the list of summaries to a data frame
df <- do.call(rbind, lapply(all_summaries, function(x) {
# Extract desired fields, handling missing values and data types
authors_list <- NA
# Enhanced error handling for authors extraction
tryCatch({
if (!is.null(x$authors)) {
if (is.data.frame(x$authors) && nrow(x$authors) > 0 && "name" %in% names(x$authors)) {
authors_list <- paste(x$authors$name, collapse = "; ")
} else if (is.list(x$authors) && length(x$authors) > 0) {
author_names <- sapply(x$authors, function(a) {
if (is.list(a) && "name" %in% names(a)) {
return(a$name)
} else if (is.character(a)) {
return(a)
} else {
return(NA)
}
})
authors_list <- paste(na.omit(author_names), collapse = "; ")
} else if (is.atomic(x$authors) && length(x$authors) > 0 && !is.na(x$authors)) {
authors_list <- paste(x$authors, collapse = "; ")
}
}
}, error = function(e) {
authors_list <- NA
})
data.frame(
uid = if (!is.null(x$uid)) x$uid else NA,
pubdate = if (!is.null(x$pubdate)) x$pubdate else NA,
epubdate = if (!is.null(x$epubdate)) x$epubdate else NA,
source = if (!is.null(x$source)) x$source else NA,
authors = authors_list,
lastauthor = if (!is.null(x$lastauthor)) x$lastauthor else NA,
title = if (!is.null(x$title)) x$title else NA,
volume = if (!is.null(x$volume)) x$volume else NA,
issue = if (!is.null(x$issue)) x$issue else NA,
pages = if (!is.null(x$pages)) x$pages else NA,
lang = if (!is.null(x$lang)) x$lang else NA,
issn = if (!is.null(x$issn)) x$issn else NA,
essn = if (!is.null(x$essn)) x$essn else NA,
pubtype = if (!is.null(x$pubtype)) {
paste(x$pubtype, collapse = "; ")
} else {
NA
},
pmcrefcount = if (!is.null(x$pmcrefcount)) x$pmcrefcount else NA,
fulljournalname = if (!is.null(x$fulljournalname)) x$fulljournalname else NA,
elocationid = if (!is.null(x$elocationid)) x$elocationid else NA,
doi = if (!is.null(x$articleids)) {
doi_index <- which(x$articleids$idtype == "doi")
if (length(doi_index) > 0) {
x$articleids$value[doi_index[1]]
} else {
NA
}
} else {
NA
},
stringsAsFactors = FALSE
)
}))
return(df)
}
# Helper function to get the last day of the month
last_day_of_month <- function(date_obj) {
date_obj <- as.Date(format(date_obj, "%Y-%m-01")) # Ensure first day of month
next_month <- as.Date(format(date_obj, "%Y-%m-01")) + 31
next_month <- as.Date(format(next_month, "%Y-%m-01"))
last_day <- next_month - 1
return(last_day)
}
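# e.g. last_day_of_month(as.Date("2024-02-10")) returns as.Date("2024-02-29")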
# Helper function to recursively split date ranges
split_date_range <- function(query_base, start_date_obj, end_date_obj, batch_size, api_key, max_retries, delay) {
# Format dates to strings
start_date_str <- format(start_date_obj, "%Y/%m/%d")
end_date_str <- format(end_date_obj, "%Y/%m/%d")
# Create the sub-query for the current interval
interval_query <- paste0(query_base,
" AND (\"", start_date_str, "\"[Date - Publication] : \"", end_date_str, "\"[Date - Publication])")
# Get the total records for the sub-query
sub_search <- entrez_search(db = "pubmed", term = interval_query, retmax = 0, api_key = api_key)
sub_total <- sub_search$count
# Print concise message
interval_label <- paste("Date range", start_date_str, "to", end_date_str)
# cat(interval_label, ": ", sub_total, " records found.\n", sep = "")
if (sub_total == 0) {
return(NULL) # No records found for this interval
}
if (sub_total <= 10000) {
# Fetch data for this interval
df_interval <- fetch_single_query(interval_query, batch_size, api_key, max_retries, delay, query_label = interval_label)
return(df_interval)
} else {
# Split the date range into two halves
mid_date_obj <- start_date_obj + floor(as.numeric(end_date_obj - start_date_obj) / 2)
# Ensure that mid_date_obj is not equal to start_date_obj or end_date_obj to prevent infinite recursion
if (mid_date_obj <= start_date_obj || mid_date_obj >= end_date_obj) {
# Try splitting by months
months_seq <- seq.Date(start_date_obj, end_date_obj, by = "1 month")
if (length(months_seq) <= 1) {
stop("Cannot split date range further to reduce records below 10,000.")
}
combined_df <- NULL
for (i in seq_along(months_seq)) {
sub_start <- months_seq[i]
# The last interval runs to the true end date so no records at the tail of the range are missed
sub_end <- if (i < length(months_seq)) months_seq[i + 1] - 1 else end_date_obj
df_sub <- split_date_range(query_base, sub_start, sub_end, batch_size, api_key, max_retries, delay)
combined_df <- rbind(combined_df, df_sub)
}
return(combined_df)
}
# Recursively split the first half
df_first_half <- split_date_range(query_base, start_date_obj, mid_date_obj, batch_size, api_key, max_retries, delay)
# Recursively split the second half
df_second_half <- split_date_range(query_base, mid_date_obj + 1, end_date_obj, batch_size, api_key, max_retries, delay)
# Combine results
combined_df <- rbind(df_first_half, df_second_half)
return(combined_df)
}
}
# Main function logic
# Perform an initial search to get the total count
initial_search <- entrez_search(db = "pubmed", term = query, retmax = 0, api_key = api_key)
total_records <- initial_search$count
cat("Total records found for initial query: ", total_records, "\n", sep = "")
# Check if total records <= 10,000
if (total_records <= 10000) {
return(fetch_single_query(query, batch_size, api_key, max_retries, delay))
} else {
cat("N records > 10,000. Chunking by date to reduce query size")
# Attempt to detect date range in the query
# Regular expressions to find date ranges in various formats
date_range_patterns <- list(
# Pattern for ("YYYY-MM-DD"[Date - Publication] : "YYYY-MM-DD"[Date - Publication]); dates may use "-" or "/"
"\\(\\s*\"?(\\d{4}[-/]\\d{2}[-/]\\d{2})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*[:\\-]\\s*\"?(\\d{4}[-/]\\d{2}[-/]\\d{2})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*\\)",
# Pattern for ("YYYY-MM"[Date - Publication] : "YYYY-MM"[Date - Publication])
"\\(\\s*\"?(\\d{4}[-/]\\d{2})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*[:\\-]\\s*\"?(\\d{4}[-/]\\d{2})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*\\)",
# Pattern for ("YYYY"[Date - Publication] : "YYYY"[Date - Publication])
"\\(\\s*\"?(\\d{4})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*[:\\-]\\s*\"?(\\d{4})\"?\\s*\\[Date\\s*-\\s*Publication\\]\\s*\\)",
# Pattern for YYYY:YYYY[DP]
"(\\d{4})\\s*[:\\-]\\s*(\\d{4})\\s*\\[DP\\]"
)
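# Examples of date clauses the patterns above will match (illustrative):
#   ("2020-01-01"[Date - Publication] : "2022-12-31"[Date - Publication])
#   ("2020-01"[Date - Publication] : "2022-12"[Date - Publication])
#   2010:2020[DP]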
date_range_found <- FALSE
for (pattern in date_range_patterns) {
date_ranges <- gregexpr(pattern, query, perl = TRUE)
date_range_matches <- regmatches(query, date_ranges)
if (length(date_range_matches[[1]]) > 0) {
# Extract the date range(s) from the query
date_range_string <- date_range_matches[[1]][1]
date_range_found <- TRUE
# Remove the date range from the query to avoid duplication
query_without_date <- trimws(gsub(pattern, "", query, perl = TRUE))
# Extract the start and end dates using regmatches
matches <- regexec(pattern, date_range_string, perl = TRUE)
date_parts <- regmatches(date_range_string, matches)
start_date <- date_parts[[1]][2]
end_date <- date_parts[[1]][3]
if (is.na(end_date) || end_date == "") {
# Single date provided
end_date <- start_date
}
cat("Detected date range in query: ", start_date, " to ", end_date, "\n", sep = "")
break
}
}
if (!date_range_found) {
# No date range detected, set default range
start_date <- "1900-01-01"
end_date <- format(Sys.Date(), "%Y-%m-%d")
query_without_date <- query
cat("No date range detected in query. Using default date range: ", start_date, " to ", end_date, "\n", sep = "")
}
# Parse dates into Date objects using base::as.Date()
parse_date <- function(date_str, is_end_date = FALSE) {
# Remove surrounding quotes and whitespace; normalise "/" separators to "-"
date_str <- gsub('^"|"$', '', date_str)
date_str <- trimws(date_str)
date_str <- gsub("/", "-", date_str)
# Try parsing with different formats
if (grepl("^\\d{4}-\\d{2}-\\d{2}$", date_str)) {
date_obj <- base::as.Date(date_str, format = "%Y-%m-%d")
} else if (grepl("^\\d{4}-\\d{2}$", date_str)) {
# Handle YYYY-MM format
date_obj <- base::as.Date(paste0(date_str, "-01"), format = "%Y-%m-%d")
if (is_end_date) {
# Get last day of the month
date_obj <- last_day_of_month(date_obj)
}
} else if (grepl("^\\d{4}$", date_str)) {
# Handle YYYY format
if (is_end_date) {
date_obj <- base::as.Date(paste0(date_str, "-12-31"), format = "%Y-%m-%d")
} else {
date_obj <- base::as.Date(paste0(date_str, "-01-01"), format = "%Y-%m-%d")
}
} else {
date_obj <- NA
}
return(date_obj)
}
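# e.g. parse_date("2020", is_end_date = TRUE) returns as.Date("2020-12-31");
#      parse_date("2021-06") returns as.Date("2021-06-01")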
start_date_obj <- parse_date(start_date)
if (is.na(start_date_obj)) {
stop("Failed to parse start date: ", start_date)
}
end_date_obj <- parse_date(end_date, is_end_date = TRUE)
if (is.na(end_date_obj)) {
stop("Failed to parse end date: ", end_date)
}
# Ensure end_date_obj is not before start_date_obj
if (end_date_obj < start_date_obj) {
stop("End date is before start date in the date range.")
}
# Remove any residual colons, empty brackets, or dangling boolean operators
# left behind after stripping the date clause from the query
query_without_date <- gsub("\\s*[:;]\\s*", " ", query_without_date)
query_without_date <- gsub("\\s*\\[\\s*\\]", " ", query_without_date)
query_without_date <- gsub("^\\s*(AND|OR|NOT)\\s+|\\s+(AND|OR|NOT)\\s*$", "", query_without_date, ignore.case = TRUE)
# Trim any extra whitespace
query_without_date <- trimws(query_without_date)
# Recursively split the date range and fetch data
combined_df <- split_date_range(query_without_date, start_date_obj, end_date_obj, batch_size, api_key, max_retries, delay)
# Remove duplicate records based on PMID (uid)
if (!is.null(combined_df)) {
combined_df <- combined_df[!duplicated(combined_df$uid), ]
# Reset row names
row.names(combined_df) <- NULL
}
return(combined_df)
}
}
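# Example usage (a minimal sketch; the query string and API key below are placeholders,
# not values from the original gist):
# my_api_key <- "YOUR_NCBI_API_KEY"
# results <- fetch_pubmed_data(
#   query = "covid-19[Title/Abstract] AND (\"2020-01-01\"[Date - Publication] : \"2022-12-31\"[Date - Publication])",
#   batch_size = 1000,
#   api_key = my_api_key
# )
# head(results$title)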