Last active
October 25, 2022 11:22
-
-
Save joelnitta/fcd248ce3ba87aa264eac5fcdac9287b to your computer and use it in GitHub Desktop.
R code for extracting NCBI taxonomy data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(assertr) | |
# Define custom parsing function | |
#' Parse a single record from the NCBI taxonomy database | |
#' | |
#' The NCBI taxonomy database contains names in the `names.dmp` file. | |
#' A single record (corresponding to one `taxid`) looks like this: | |
#' | |
#' # A tibble: 4 × 3 | |
#' taxid name class | |
#' <chr> <chr> <chr> | |
#' 1 857989 Alansmia glandulifera (A.Rojas) Moguel & M.Kessler authority | |
#' 2 857989 Alansmia glandulifera scientific name | |
#' 3 857989 Terpsichore glandulifera A.Rojas authority | |
#' 4 857989 Terpsichore glandulifera synonym | |
#' | |
#' @param record Tibble (dataframe); a single record of NCBI taxonomy data | |
#' | |
#' @return Tibble; parsed data with columns "species", "accepted", and | |
#' "scientific_name" | |
#' @examples | |
#' record <- tribble( | |
#' ~taxid, ~name, ~class, | |
#' "857989", "Alansmia glandulifera (A.Rojas) Moguel & M.Kessler", "authority", #nolint | |
#' "857989", "Alansmia glandulifera", "scientific name", | |
#' "857989", "Terpsichore glandulifera A.Rojas", "authority", | |
#' "857989", "Terpsichore glandulifera", "synonym" | |
#' ) | |
#' parse_ncbi_tax_record(record) | |
parse_ncbi_tax_record <- function(record) { | |
# Each record should have exactly 1 accepted species name | |
accepted_species <- record %>% | |
filter(class == "scientific name") %>% | |
pull(name) | |
assertthat::assert_that( | |
length(accepted_species) == 1, | |
msg = "Not exactly 1 accepted scientific name detected") | |
# Confusingly, "synonyms" may sometimes contain the accepted name :/ | |
synonyms <- record %>% | |
filter(class == "synonym") %>% | |
pull(name) | |
# Scientific names (with author) have the class "authority" | |
sci_names <- record %>% | |
filter(class == "authority") %>% | |
pull(name) | |
# The results should always have at least taxon ID and species | |
species_dat <- tibble(species = accepted_species, accepted = TRUE) | |
# Create empty tibbles to hold other name data | |
acc_sci_names_dat <- tibble() | |
syn_sci_names_dat <- tibble() | |
# If other sci names are given, one of them should be the species name | |
if (length(sci_names) > 0) | |
acc_sci_names_dat <- tibble(scientific_name = sci_names) %>% | |
mutate(species = str_extract( | |
scientific_name, stringr::fixed(accepted_species))) %>% | |
filter(!is.na(species)) %>% | |
mutate(accepted = TRUE) | |
# If "synonym" and other sci names are given, one (or more) of them are | |
# the synonym | |
if (length(synonyms) > 0 && length(sci_names) > 0) | |
syn_sci_names_dat <- tibble(scientific_name = sci_names) %>% | |
# Each sci name should only correspond to max. one synonym | |
mutate(species = str_extract_first_target(scientific_name, synonyms)) %>% | |
filter(!is.na(species)) %>% | |
mutate(accepted = FALSE) | |
# If "synonym" is present but no other sci names are given, "synonym" is | |
# actually the scientific name of the species | |
if (length(synonyms) > 0 && length(sci_names) == 0) | |
acc_sci_names_dat <- tibble(scientific_name = synonyms) %>% | |
mutate(species = str_extract( | |
scientific_name, stringr::fixed(accepted_species))) %>% | |
filter(!is.na(species)) %>% | |
mutate(accepted = TRUE) | |
# Combine scientific names of synonyms and accepted names | |
combined_sci_names_dat <- bind_rows(syn_sci_names_dat, acc_sci_names_dat) | |
# Join to accepted species with taxon ID | |
if (nrow(combined_sci_names_dat) > 0) | |
species_dat <- full_join( | |
species_dat, combined_sci_names_dat, by = c("species", "accepted")) | |
species_dat | |
} | |
# To use parse_ncbi_tax_record(), first need to do some pre-processing | |
# First download taxdump file (old format) from NCBI FTP site | |
# https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/ | |
# e.g., | |
# https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz | |
taxdump_zip_file <- "path_to_taxdump.tar.gz" | |
# Unzip names.dmp to a temporary directory | |
temp_dir <- tempdir(check = TRUE) | |
utils::unzip( | |
taxdump_zip_file, files = "names.dmp", | |
overwrite = TRUE, junkpaths = TRUE, exdir = temp_dir) | |
# Load raw NCBI data | |
ncbi_raw <- | |
fs::path(temp_dir, "names.dmp") %>% | |
readr::read_delim( | |
delim = "\t|\t", col_names = FALSE, | |
col_types = cols(.default = col_character()) | |
) | |
# Delete temporary unzipped file | |
fs::file_delete(fs::path(temp_dir, "names.dmp")) | |
ncbi_names <- | |
ncbi_raw %>% | |
# Select only needed columns | |
transmute( | |
taxid = as.character(X1), | |
name = X2, | |
class = X4) %>% | |
# Make sure there are no hidden fields in `class` | |
verify(all(str_count(class, "\\|") == 1)) %>% | |
# Drop field separators in `class` | |
mutate(class = str_remove_all(class, "\\\t\\|")) %>% | |
# Only keep useful names: exclude common names, | |
# alternative spellings (`equivalent name`), type material, | |
# temporary names with 'sp.' (`includes`) | |
filter(class %in% c("authority", "scientific name", "synonym")) | |
# parse names | |
ncbi_names_parsed <- | |
ncbi_names %>% | |
group_by(taxid) %>% | |
nest() %>% | |
ungroup() %>% | |
mutate( | |
data = map(data, parse_ncbi_tax_record) | |
) %>% | |
unnest(data) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment