Last active
October 26, 2023 08:46
-
-
Save crazyhottommy/042aa268bbc8c6a4067545600da10d24 to your computer and use it in GitHub Desktop.
TCGA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
```{r} | |
TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM", "MUC16", "MUC1", "CD276", | |
"FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2") | |
``` | |
```{r} | |
TCGAExprsAndPheno2 <- function(gene_vector, cancer_types = "all", expression_type = "TPM"){ | |
library(recount3) | |
## Get all possible projects | |
all_proj <- available_projects() | |
tcga_proj <- all_proj %>% filter(file_source == "tcga") | |
if (cancer_types != "all") { | |
tcga_proj <- tcga_proj[tcga_proj$project %in% cancer_types,] | |
} | |
final_tcga <- NULL | |
for (tcga_ind in seq(nrow(tcga_proj))) { | |
proj_info <- tcga_proj[tcga_ind, c("project", "organism", "project_home")] | |
rse_gene_temp<- create_rse(proj_info) | |
count_matrix <- rse_gene_temp@assays@data$raw_counts | |
## Select gene ind | |
if (!identical("all", gene_vector)) { | |
gene_ind <- rse_gene_temp@rowRanges$gene_name %in% gene_vector | |
} else { | |
gene_ind <- seq(rse_gene_temp@rowRanges$gene_name) | |
} | |
if (expression_type == "TPM") { | |
#### Creating TPM | |
kilobase_length <- rse_gene_temp@rowRanges$bp_length | |
reads_per_rpk <- count_matrix/kilobase_length | |
per_mil_scale <- colSums(reads_per_rpk)/1000000 | |
tpm_matrix <- t(t(reads_per_rpk)/per_mil_scale) | |
## Make sure they match the ENSG and gene order | |
exprs_vector <- tpm_matrix[gene_ind,] | |
} else{ | |
#log2 normalize the count matrix | |
total_reads<- colSums(count_matrix) | |
normalized_counts<- log2(t(t(count_matrix)*10000000/total_reads) + 1) | |
exprs_vector <- normalized_counts[gene_ind,] | |
} | |
tcga_data_frame <- rse_gene_temp@colData@listData |> | |
as.data.frame() | |
if (nrow(exprs_vector) > length(gene_vector)) { | |
ind_vector <- rownames(exprs_vector) | |
} else { | |
ind_vector <- rse_gene_temp@rowRanges[gene_ind, ]$gene_name | |
} | |
tcga_data_frame[,ind_vector] <- exprs_vector | |
tcga_data_frame <- select(tcga_data_frame, !!ind_vector, everything()) | |
final_tcga <- rbind(final_tcga, tcga_data_frame) | |
} | |
## Group cancer data | |
final_tcga_filt <- final_tcga %>% | |
mutate(sample_type = case_when( | |
tcga.cgc_sample_sample_type == "Additional - New Primary" ~ "cancer", | |
tcga.cgc_sample_sample_type == "Additional Metastatic" ~ "metastatic", | |
tcga.cgc_sample_sample_type == "Metastatic" ~ "metastatic", | |
tcga.cgc_sample_sample_type == "Primary Blood Derived Cancer - Peripheral Blood " ~ "cancer", | |
tcga.cgc_sample_sample_type == "Primary Tumor" ~ "cancer", | |
tcga.cgc_sample_sample_type == "Recurrent Tumor" ~ "cancer", | |
tcga.cgc_sample_sample_type == "Solid Tissue Normal" ~ "normal" | |
)) | |
if (startsWith(colnames(final_tcga_filt)[1], "ENSG")) { | |
message("Multiple columns share gene names, returning ENSG") | |
} | |
return(final_tcga_filt) | |
} | |
``` | |
```{r} | |
tcga_TAA <- TCGAExprsAndPheno2(gene_vector = c(TAAs, "CD24"),cancer_type="all") | |
tcga_df<- tcga_TAA %>% | |
select(any_of(TAAs), CD24, tcga.tcga_barcode, sample_type, study) %>% | |
group_by(sample_type, study) %>% | |
summarise(across(1:20, ~log2(.x+1))) %>% | |
summarise(across(1:20, median)) %>% | |
arrange(study) %>% | |
filter(!is.na(sample_type)) | |
tcga_df<- tcga_df %>% | |
filter(sample_type == "cancer") | |
tcga_mat<- tcga_df[, -c(1,2)] %>% as.matrix() | |
rownames(tcga_mat)<- tcga_df %>% pull(study) | |
cell_fun = function(j, i, x, y, w, h, fill) { | |
grid.rect(x = x, y = y, width = w, height = h, | |
gp = gpar(col = "black", fill = NA)) | |
} | |
Heatmap(tcga_mat, cluster_columns = TRUE, cell_fun = cell_fun, | |
name = "log2TPM") | |
``` |
It seems the recount3 raw count is not right... I tried using a different method and the result (raw counts to TPM) is similar to the RSEM
a different data resource
library(ExperimentHub)
library(ComplexHeatmap)
eh = ExperimentHub()
query(eh , "GSE62944")
tcga_data<- eh[["EH1043"]]
colData(tcga_data)[1:5, 1:5]
count_mat<- assay(tcga_data)
count_mat[1:5, 1:5]
pheno_data<- phenoData(tcga_data)
get the gene length
#BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
#BiocManager::install("org.Hs.eg.db")
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
library(readr)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
hg19_genes<- genes(txdb)
hg19_genes
# map the Entrez ID to gene symbol
gene_symbol<- AnnotationDbi::select(org.Hs.eg.db, keys=hg19_genes$gene_id,
columns="SYMBOL", keytype="ENTREZID")
all.equal(hg19_genes$gene_id, gene_symbol$ENTREZID)
hg19_genes$symbol<- gene_symbol$SYMBOL
hg19_genes
gene_df<- data.frame(EntrezID = hg19_genes$gene_id,
Symbol = hg19_genes$symbol,
Gene_length = width(hg19_genes))
# almost 3000 genes not in the dataframe..
table(rownames(count_mat) %in% gene_df$Symbol)
indx<- intersect(rownames(count_mat), gene_df$Symbol)
setdiff(rownames(count_mat), gene_df$Symbol)
setdiff(gene_df$Symbol, rownames(count_mat))
count_mat<- count_mat[indx, ]
gene_df<- gene_df %>%
slice(match(indx, Symbol))
dim(gene_df)
convert counts to TPM
count_to_tpm<- function(mat, gene_length){
mat<- mat/gene_length
total_per_sample<- colSums(mat)
mat<- t(t(mat)/total_per_sample)
return(mat*1000000)
}
tpm_mat<- count_to_tpm(count_mat, gene_df$Gene_length)
TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "EPCAM", "MUC16", "MUC1", "CD276",
"FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2")
# can not find "NECTIN4", it might be a different name
TAAs[!TAAs %in% rownames(tpm_mat)]
tpm_sub<- tpm_mat[TAAs, ]
tpm_median<- cbind(t(tpm_sub), CancerType = as.character(colData(tcga_data)$CancerType)) %>%
as.data.frame() %>%
mutate(across(1:18, as.numeric)) %>%
group_by(CancerType) %>%
summarise(across(1:18, median))
tpm_sub_mat<- as.matrix(tpm_median[,-1])
rownames(tpm_sub_mat)<- tpm_median$CancerType
col_fun<- circlize::colorRamp2(c(-2,0,2), colors = c("blue", "white", "red"))
Heatmap(t(scale(log2(tpm_sub_mat +1))), cluster_columns = TRUE, cell_fun = cell_fun,
name = "log2TPM", col = col_fun)
I rewrote the code, and now the results make sense:
#BiocManager::install("recount3")
library(recount3)
library(purrr)
human_projects <- available_projects()
tcga_info = subset(
human_projects,
file_source == "tcga" & project_type == "data_sources"
)
seq(nrow(tcga_info))
proj_info <- map(seq(nrow(tcga_info)), ~tcga_info[.x, ])
rse_tcga <- map(proj_info, ~create_rse(.x))
/Users/tommytang/Library/Caches/org.R-project.R/R/recount3
does not exist, create directory? (yes/no): yes
TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM", "MUC16", "MUC1", "CD276", "FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2", "CD24")
#### Creating TPM
count2tpm<- function(rse){
count_matrix <- rse@assays@data$raw_counts
gene_length <- rse@rowRanges$bp_length
reads_per_rpk <- count_matrix/gene_length
per_mil_scale <- colSums(reads_per_rpk)/1000000
tpm_matrix <- t(t(reads_per_rpk)/per_mil_scale)
## Make sure they match the ENSG and gene order
gene_ind<- rse@rowRanges$gene_name %in% TAAs
tpm_submatrix <- tpm_matrix[gene_ind,]
rownames(tpm_submatrix)<- rse@rowRanges[gene_ind, ]$gene_name
return(tpm_submatrix)
}
tpm_data<- map(rse_tcga, count2tpm)
metadata<- map(rse_tcga, ~.x@colData@listData %>% as.data.frame())
tpm_data2<- reduce(tpm_data, cbind)
metadata2<- reduce(metadata, rbind)
dim(tpm_data2)
dim(metadata2)
metadata2<- metadata2 %>%
select(tcga.tcga_barcode, tcga.cgc_sample_sample_type, study) %>%
mutate(sample_type = case_when(
tcga.cgc_sample_sample_type == "Additional - New Primary" ~ "cancer",
tcga.cgc_sample_sample_type == "Additional Metastatic" ~ "metastatic",
tcga.cgc_sample_sample_type == "Metastatic" ~ "metastatic",
tcga.cgc_sample_sample_type == "Primary Blood Derived Cancer - Peripheral Blood " ~ "cancer",
tcga.cgc_sample_sample_type == "Primary Tumor" ~ "cancer",
tcga.cgc_sample_sample_type == "Recurrent Tumor" ~ "cancer",
tcga.cgc_sample_sample_type == "Solid Tissue Normal" ~ "normal"
))
final_df<- cbind(t(tpm_data2), metadata2)
library(ComplexHeatmap)
tcga_df<- final_df %>%
filter(sample_type == "cancer") %>%
group_by(sample_type, study) %>%
summarise(across(1:20, ~log2(.x+1))) %>%
summarise(across(1:20, median)) %>%
arrange(study) %>%
filter(!is.na(sample_type))
tcga_mat<- tcga_df[, -c(1,2)] %>% as.matrix()
rownames(tcga_mat)<- tcga_df %>% pull(study)
cell_fun = function(j, i, x, y, w, h, fill) {
grid.rect(x = x, y = y, width = w, height = h,
gp = gpar(col = "black", fill = NA))
}
Heatmap(tcga_mat, cluster_columns = TRUE, cell_fun = cell_fun,
name = "log2TPM")
You can also scale the expression for each gene across the cancer types
Heatmap(scale(tcga_mat), cluster_columns = TRUE, cell_fun = cell_fun,
name = "scaled\nlog2TPM")
That's great, Tommy!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks, Peter! This result looks right to me, with MSLN high in MESO and FOLH1 high in PRAD.
I need to figure out what's wrong with my code working with recount3 data... likely the way I calculate TPM is wrong.
Thanks again!
Tommy