Last active
October 26, 2023 08:46
-
-
Save crazyhottommy/042aa268bbc8c6a4067545600da10d24 to your computer and use it in GitHub Desktop.
TCGA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
```{r} | |
TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM", "MUC16", "MUC1", "CD276", | |
"FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2") | |
``` | |
```{r} | |
TCGAExprsAndPheno2 <- function(gene_vector, cancer_types = "all", expression_type = "TPM"){ | |
library(recount3) | |
## Get all possible projects | |
all_proj <- available_projects() | |
tcga_proj <- all_proj %>% filter(file_source == "tcga") | |
if (cancer_types != "all") { | |
tcga_proj <- tcga_proj[tcga_proj$project %in% cancer_types,] | |
} | |
final_tcga <- NULL | |
for (tcga_ind in seq(nrow(tcga_proj))) { | |
proj_info <- tcga_proj[tcga_ind, c("project", "organism", "project_home")] | |
rse_gene_temp<- create_rse(proj_info) | |
count_matrix <- rse_gene_temp@assays@data$raw_counts | |
## Select gene ind | |
if (!identical("all", gene_vector)) { | |
gene_ind <- rse_gene_temp@rowRanges$gene_name %in% gene_vector | |
} else { | |
gene_ind <- seq(rse_gene_temp@rowRanges$gene_name) | |
} | |
if (expression_type == "TPM") { | |
#### Creating TPM | |
kilobase_length <- rse_gene_temp@rowRanges$bp_length | |
reads_per_rpk <- count_matrix/kilobase_length | |
per_mil_scale <- colSums(reads_per_rpk)/1000000 | |
tpm_matrix <- t(t(reads_per_rpk)/per_mil_scale) | |
## Make sure they match the ENSG and gene order | |
exprs_vector <- tpm_matrix[gene_ind,] | |
} else{ | |
#log2 normalize the count matrix | |
total_reads<- colSums(count_matrix) | |
normalized_counts<- log2(t(t(count_matrix)*10000000/total_reads) + 1) | |
exprs_vector <- normalized_counts[gene_ind,] | |
} | |
tcga_data_frame <- rse_gene_temp@colData@listData |> | |
as.data.frame() | |
if (nrow(exprs_vector) > length(gene_vector)) { | |
ind_vector <- rownames(exprs_vector) | |
} else { | |
ind_vector <- rse_gene_temp@rowRanges[gene_ind, ]$gene_name | |
} | |
tcga_data_frame[,ind_vector] <- exprs_vector | |
tcga_data_frame <- select(tcga_data_frame, !!ind_vector, everything()) | |
final_tcga <- rbind(final_tcga, tcga_data_frame) | |
} | |
## Group cancer data | |
final_tcga_filt <- final_tcga %>% | |
mutate(sample_type = case_when( | |
tcga.cgc_sample_sample_type == "Additional - New Primary" ~ "cancer", | |
tcga.cgc_sample_sample_type == "Additional Metastatic" ~ "metastatic", | |
tcga.cgc_sample_sample_type == "Metastatic" ~ "metastatic", | |
tcga.cgc_sample_sample_type == "Primary Blood Derived Cancer - Peripheral Blood " ~ "cancer", | |
tcga.cgc_sample_sample_type == "Primary Tumor" ~ "cancer", | |
tcga.cgc_sample_sample_type == "Recurrent Tumor" ~ "cancer", | |
tcga.cgc_sample_sample_type == "Solid Tissue Normal" ~ "normal" | |
)) | |
if (startsWith(colnames(final_tcga_filt)[1], "ENSG")) { | |
message("Multiple columns share gene names, returning ENSG") | |
} | |
return(final_tcga_filt) | |
} | |
``` | |
```{r} | |
tcga_TAA <- TCGAExprsAndPheno2(gene_vector = c(TAAs, "CD24"),cancer_type="all") | |
tcga_df<- tcga_TAA %>% | |
select(any_of(TAAs), CD24, tcga.tcga_barcode, sample_type, study) %>% | |
group_by(sample_type, study) %>% | |
summarise(across(1:20, ~log2(.x+1))) %>% | |
summarise(across(1:20, median)) %>% | |
arrange(study) %>% | |
filter(!is.na(sample_type)) | |
tcga_df<- tcga_df %>% | |
filter(sample_type == "cancer") | |
tcga_mat<- tcga_df[, -c(1,2)] %>% as.matrix() | |
rownames(tcga_mat)<- tcga_df %>% pull(study) | |
cell_fun = function(j, i, x, y, w, h, fill) { | |
grid.rect(x = x, y = y, width = w, height = h, | |
gp = gpar(col = "black", fill = NA)) | |
} | |
Heatmap(tcga_mat, cluster_columns = TRUE, cell_fun = cell_fun, | |
name = "log2TPM") | |
``` |
I rewrote the code, and now the results make sense:
#BiocManager::install("recount3")
library(recount3)
library(purrr)
human_projects <- available_projects()
tcga_info = subset(
human_projects,
file_source == "tcga" & project_type == "data_sources"
)
seq(nrow(tcga_info))
proj_info <- map(seq(nrow(tcga_info)), ~tcga_info[.x, ])
rse_tcga <- map(proj_info, ~create_rse(.x))
/Users/tommytang/Library/Caches/org.R-project.R/R/recount3
does not exist, create directory? (yes/no): yes
TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM", "MUC16", "MUC1", "CD276", "FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2", "CD24")
#### Creating TPM
count2tpm<- function(rse){
count_matrix <- rse@assays@data$raw_counts
gene_length <- rse@rowRanges$bp_length
reads_per_rpk <- count_matrix/gene_length
per_mil_scale <- colSums(reads_per_rpk)/1000000
tpm_matrix <- t(t(reads_per_rpk)/per_mil_scale)
## Make sure they match the ENSG and gene order
gene_ind<- rse@rowRanges$gene_name %in% TAAs
tpm_submatrix <- tpm_matrix[gene_ind,]
rownames(tpm_submatrix)<- rse@rowRanges[gene_ind, ]$gene_name
return(tpm_submatrix)
}
tpm_data<- map(rse_tcga, count2tpm)
metadata<- map(rse_tcga, ~.x@colData@listData %>% as.data.frame())
tpm_data2<- reduce(tpm_data, cbind)
metadata2<- reduce(metadata, rbind)
dim(tpm_data2)
dim(metadata2)
metadata2<- metadata2 %>%
select(tcga.tcga_barcode, tcga.cgc_sample_sample_type, study) %>%
mutate(sample_type = case_when(
tcga.cgc_sample_sample_type == "Additional - New Primary" ~ "cancer",
tcga.cgc_sample_sample_type == "Additional Metastatic" ~ "metastatic",
tcga.cgc_sample_sample_type == "Metastatic" ~ "metastatic",
tcga.cgc_sample_sample_type == "Primary Blood Derived Cancer - Peripheral Blood " ~ "cancer",
tcga.cgc_sample_sample_type == "Primary Tumor" ~ "cancer",
tcga.cgc_sample_sample_type == "Recurrent Tumor" ~ "cancer",
tcga.cgc_sample_sample_type == "Solid Tissue Normal" ~ "normal"
))
final_df<- cbind(t(tpm_data2), metadata2)
library(ComplexHeatmap)
tcga_df<- final_df %>%
filter(sample_type == "cancer") %>%
group_by(sample_type, study) %>%
summarise(across(1:20, ~log2(.x+1))) %>%
summarise(across(1:20, median)) %>%
arrange(study) %>%
filter(!is.na(sample_type))
tcga_mat<- tcga_df[, -c(1,2)] %>% as.matrix()
rownames(tcga_mat)<- tcga_df %>% pull(study)
cell_fun = function(j, i, x, y, w, h, fill) {
grid.rect(x = x, y = y, width = w, height = h,
gp = gpar(col = "black", fill = NA))
}
Heatmap(tcga_mat, cluster_columns = TRUE, cell_fun = cell_fun,
name = "log2TPM")
You can also scale the expression for each gene across the cancer types
Heatmap(scale(tcga_mat), cluster_columns = TRUE, cell_fun = cell_fun,
name = "scaled\nlog2TPM")
That's great, Tommy!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It seems the recount3 raw count is not right... I tried using a different method and the result (raw counts to TPM) is similar to the RSEM
a different data resource
get the gene length
convert counts to TPM