Forked from stephenturner/deseq2-analysis-template.R
Created
October 1, 2018 13:02
Revisions
-
stephenturner created this gist
Jul 30, 2014 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,151 @@ ## RNA-seq analysis with DESeq2 ## Stephen Turner, @genetics_blog # RNA-seq data from GSE52202 # http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=gse52202. All patients with # ALS, 4 with C9 expansion ("exp"), 4 controls without expansion ("ctl") # Import & pre-process ---------------------------------------------------- # Import data from featureCounts ## Previously ran at command line something like this: ## featureCounts -a genes.gtf -o counts.txt -T 12 -t exon -g gene_id GSM*.sam countdata <- read.table("counts.txt", header=TRUE, row.names=1) # Remove first five columns (chr, start, end, strand, length) countdata <- countdata[ ,6:ncol(countdata)] # Remove .bam or .sam from filenames colnames(countdata) <- gsub("\\.[sb]am$", "", colnames(countdata)) # Convert to matrix countdata <- as.matrix(countdata) head(countdata) # Assign condition (first four are controls, second four contain the expansion) (condition <- factor(c(rep("ctl", 4), rep("exp", 4)))) # Analysis with DESeq2 ---------------------------------------------------- library(DESeq2) # Create a coldata frame and instantiate the DESeqDataSet. See ?DESeqDataSetFromMatrix (coldata <- data.frame(row.names=colnames(countdata), condition)) dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition) dds # Run the DESeq pipeline dds <- DESeq(dds) # Plot dispersions png("qc-dispersions.png", 1000, 1000, pointsize=20) plotDispEsts(dds, main="Dispersion plot") dev.off() # Regularized log transformation for clustering/heatmaps, etc rld <- rlogTransformation(dds) head(assay(rld)) hist(assay(rld)) # Colors for plots below ## Ugly: ## (mycols <- 1:length(unique(condition))) ## Use RColorBrewer, better library(RColorBrewer) (mycols <- brewer.pal(8, "Dark2")[1:length(unique(condition))]) # Sample distance heatmap sampleDists <- as.matrix(dist(t(assay(rld)))) library(gplots) png("qc-heatmap-samples.png", w=1000, h=1000, pointsize=20) heatmap.2(as.matrix(sampleDists), key=F, trace="none", col=colorpanel(100, "black", "white"), ColSideColors=mycols[condition], RowSideColors=mycols[condition], margin=c(10, 10), main="Sample Distance Matrix") dev.off() # Principal components analysis ## Could do with built-in DESeq2 function: ## DESeq2::plotPCA(rld, intgroup="condition") ## I like mine better: rld_pca <- function (rld, intgroup = "condition", ntop = 500, colors=NULL, legendpos="bottomleft", main="PCA Biplot", textcx=1, ...) { require(genefilter) require(calibrate) require(RColorBrewer) rv = rowVars(assay(rld)) select = order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))] pca = prcomp(t(assay(rld)[select, ])) fac = factor(apply(as.data.frame(colData(rld)[, intgroup, drop = FALSE]), 1, paste, collapse = " : ")) if (is.null(colors)) { if (nlevels(fac) >= 3) { colors = brewer.pal(nlevels(fac), "Paired") } else { colors = c("black", "red") } } pc1var <- round(summary(pca)$importance[2,1]*100, digits=1) pc2var <- round(summary(pca)$importance[2,2]*100, digits=1) pc1lab <- paste0("PC1 (",as.character(pc1var),"%)") pc2lab <- paste0("PC1 (",as.character(pc2var),"%)") plot(PC2~PC1, data=as.data.frame(pca$x), bg=colors[fac], pch=21, xlab=pc1lab, ylab=pc2lab, main=main, ...) with(as.data.frame(pca$x), textxy(PC1, PC2, labs=rownames(as.data.frame(pca$x)), cex=textcx)) legend(legendpos, legend=levels(fac), col=colors, pch=20) # rldyplot(PC2 ~ PC1, groups = fac, data = as.data.frame(pca$rld), # pch = 16, cerld = 2, aspect = "iso", col = colours, main = draw.key(key = list(rect = list(col = colours), # terldt = list(levels(fac)), rep = FALSE))) } png("qc-pca.png", 1000, 1000, pointsize=20) rld_pca(rld, colors=mycols, intgroup="condition", xlim=c(-75, 35)) dev.off() # Get differential expression results res <- results(dds) table(res$padj<0.05) ## Order by adjusted p-value res <- res[order(res$padj), ] ## Merge with normalized count data resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE) names(resdata)[1] <- "Gene" head(resdata) ## Write results write.csv(resdata, file="diffexpr-results.csv") ## Examine plot of p-values hist(res$pvalue, breaks=50, col="grey") ## Examine independent filtering attr(res, "filterThreshold") plot(attr(res,"filterNumRej"), type="b", xlab="quantiles of baseMean", ylab="number of rejections") ## MA plot ## Could do with built-in DESeq2 function: ## DESeq2::plotMA(dds, ylim=c(-1,1), cex=1) ## I like mine better: maplot <- function (res, thresh=0.05, labelsig=TRUE, textcx=1, ...) { with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x", ...)) with(subset(res, padj<thresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5)) if (labelsig) { require(calibrate) with(subset(res, padj<thresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2)) } } png("diffexpr-maplot.png", 1500, 1000, pointsize=20) maplot(resdata, main="MA Plot") dev.off() ## Volcano plot with "significant" genes labeled volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) { with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...)) with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...)) with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...)) with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...)) if (labelsig) { require(calibrate) with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...)) } legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "both"), pch=20, col=c("red","orange","green")) } png("diffexpr-volcanoplot.png", 1200, 1000, pointsize=20) volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-2.3, 2)) dev.off()