Last active
October 26, 2023 08:46
-
-
Save crazyhottommy/042aa268bbc8c6a4067545600da10d24 to your computer and use it in GitHub Desktop.
TCGA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
```{r} | |
TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM", "MUC16", "MUC1", "CD276", | |
"FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2") | |
``` | |
```{r} | |
TCGAExprsAndPheno2 <- function(gene_vector, cancer_types = "all", expression_type = "TPM"){ | |
library(recount3) | |
## Get all possible projects | |
all_proj <- available_projects() | |
tcga_proj <- all_proj %>% filter(file_source == "tcga") | |
if (cancer_types != "all") { | |
tcga_proj <- tcga_proj[tcga_proj$project %in% cancer_types,] | |
} | |
final_tcga <- NULL | |
for (tcga_ind in seq(nrow(tcga_proj))) { | |
proj_info <- tcga_proj[tcga_ind, c("project", "organism", "project_home")] | |
rse_gene_temp<- create_rse(proj_info) | |
count_matrix <- rse_gene_temp@assays@data$raw_counts | |
## Select gene ind | |
if (!identical("all", gene_vector)) { | |
gene_ind <- rse_gene_temp@rowRanges$gene_name %in% gene_vector | |
} else { | |
gene_ind <- seq(rse_gene_temp@rowRanges$gene_name) | |
} | |
if (expression_type == "TPM") { | |
#### Creating TPM | |
kilobase_length <- rse_gene_temp@rowRanges$bp_length | |
reads_per_rpk <- count_matrix/kilobase_length | |
per_mil_scale <- colSums(reads_per_rpk)/1000000 | |
tpm_matrix <- t(t(reads_per_rpk)/per_mil_scale) | |
## Make sure they match the ENSG and gene order | |
exprs_vector <- tpm_matrix[gene_ind,] | |
} else{ | |
#log2 normalize the count matrix | |
total_reads<- colSums(count_matrix) | |
normalized_counts<- log2(t(t(count_matrix)*10000000/total_reads) + 1) | |
exprs_vector <- normalized_counts[gene_ind,] | |
} | |
tcga_data_frame <- rse_gene_temp@colData@listData |> | |
as.data.frame() | |
if (nrow(exprs_vector) > length(gene_vector)) { | |
ind_vector <- rownames(exprs_vector) | |
} else { | |
ind_vector <- rse_gene_temp@rowRanges[gene_ind, ]$gene_name | |
} | |
tcga_data_frame[,ind_vector] <- exprs_vector | |
tcga_data_frame <- select(tcga_data_frame, !!ind_vector, everything()) | |
final_tcga <- rbind(final_tcga, tcga_data_frame) | |
} | |
## Group cancer data | |
final_tcga_filt <- final_tcga %>% | |
mutate(sample_type = case_when( | |
tcga.cgc_sample_sample_type == "Additional - New Primary" ~ "cancer", | |
tcga.cgc_sample_sample_type == "Additional Metastatic" ~ "metastatic", | |
tcga.cgc_sample_sample_type == "Metastatic" ~ "metastatic", | |
tcga.cgc_sample_sample_type == "Primary Blood Derived Cancer - Peripheral Blood " ~ "cancer", | |
tcga.cgc_sample_sample_type == "Primary Tumor" ~ "cancer", | |
tcga.cgc_sample_sample_type == "Recurrent Tumor" ~ "cancer", | |
tcga.cgc_sample_sample_type == "Solid Tissue Normal" ~ "normal" | |
)) | |
if (startsWith(colnames(final_tcga_filt)[1], "ENSG")) { | |
message("Multiple columns share gene names, returning ENSG") | |
} | |
return(final_tcga_filt) | |
} | |
``` | |
```{r} | |
tcga_TAA <- TCGAExprsAndPheno2(gene_vector = c(TAAs, "CD24"),cancer_type="all") | |
tcga_df<- tcga_TAA %>% | |
select(any_of(TAAs), CD24, tcga.tcga_barcode, sample_type, study) %>% | |
group_by(sample_type, study) %>% | |
summarise(across(1:20, ~log2(.x+1))) %>% | |
summarise(across(1:20, median)) %>% | |
arrange(study) %>% | |
filter(!is.na(sample_type)) | |
tcga_df<- tcga_df %>% | |
filter(sample_type == "cancer") | |
tcga_mat<- tcga_df[, -c(1,2)] %>% as.matrix() | |
rownames(tcga_mat)<- tcga_df %>% pull(study) | |
cell_fun = function(j, i, x, y, w, h, fill) { | |
grid.rect(x = x, y = y, width = w, height = h, | |
gp = gpar(col = "black", fill = NA)) | |
} | |
Heatmap(tcga_mat, cluster_columns = TRUE, cell_fun = cell_fun, | |
name = "log2TPM") | |
``` |
That's great, Tommy!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I rewrote the code, and now the results make sense:
/Users/tommytang/Library/Caches/org.R-project.R/R/recount3
does not exist, create directory? (yes/no): yes
You can also scale the expression for each gene across the cancer types