Skip to content

Instantly share code, notes, and snippets.

Revisions

  1. @stephenturner stephenturner created this gist Jul 30, 2014.
    151 changes: 151 additions & 0 deletions deseq2-analysis-template.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,151 @@
    ## RNA-seq analysis with DESeq2
    ## Stephen Turner, @genetics_blog

    # RNA-seq data from GSE52202
    # http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=gse52202. All patients with
    # ALS, 4 with C9 expansion ("exp"), 4 controls without expansion ("ctl")

    # Import & pre-process ----------------------------------------------------

    # Import data from featureCounts
    ## Previously ran at command line something like this:
    ## featureCounts -a genes.gtf -o counts.txt -T 12 -t exon -g gene_id GSM*.sam
    countdata <- read.table("counts.txt", header=TRUE, row.names=1)

    # Remove first five columns (chr, start, end, strand, length)
    countdata <- countdata[ ,6:ncol(countdata)]

    # Remove .bam or .sam from filenames
    colnames(countdata) <- gsub("\\.[sb]am$", "", colnames(countdata))

    # Convert to matrix
    countdata <- as.matrix(countdata)
    head(countdata)

    # Assign condition (first four are controls, second four contain the expansion)
    (condition <- factor(c(rep("ctl", 4), rep("exp", 4))))

    # Analysis with DESeq2 ----------------------------------------------------

    library(DESeq2)

    # Create a coldata frame and instantiate the DESeqDataSet. See ?DESeqDataSetFromMatrix
    (coldata <- data.frame(row.names=colnames(countdata), condition))
    dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
    dds

    # Run the DESeq pipeline
    dds <- DESeq(dds)

    # Plot dispersions
    png("qc-dispersions.png", 1000, 1000, pointsize=20)
    plotDispEsts(dds, main="Dispersion plot")
    dev.off()

    # Regularized log transformation for clustering/heatmaps, etc
    rld <- rlogTransformation(dds)
    head(assay(rld))
    hist(assay(rld))

    # Colors for plots below
    ## Ugly:
    ## (mycols <- 1:length(unique(condition)))
    ## Use RColorBrewer, better
    library(RColorBrewer)
    (mycols <- brewer.pal(8, "Dark2")[1:length(unique(condition))])

    # Sample distance heatmap
    sampleDists <- as.matrix(dist(t(assay(rld))))
    library(gplots)
    png("qc-heatmap-samples.png", w=1000, h=1000, pointsize=20)
    heatmap.2(as.matrix(sampleDists), key=F, trace="none",
    col=colorpanel(100, "black", "white"),
    ColSideColors=mycols[condition], RowSideColors=mycols[condition],
    margin=c(10, 10), main="Sample Distance Matrix")
    dev.off()

    # Principal components analysis
    ## Could do with built-in DESeq2 function:
    ## DESeq2::plotPCA(rld, intgroup="condition")
    ## I like mine better:
    rld_pca <- function (rld, intgroup = "condition", ntop = 500, colors=NULL, legendpos="bottomleft", main="PCA Biplot", textcx=1, ...) {
    require(genefilter)
    require(calibrate)
    require(RColorBrewer)
    rv = rowVars(assay(rld))
    select = order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
    pca = prcomp(t(assay(rld)[select, ]))
    fac = factor(apply(as.data.frame(colData(rld)[, intgroup, drop = FALSE]), 1, paste, collapse = " : "))
    if (is.null(colors)) {
    if (nlevels(fac) >= 3) {
    colors = brewer.pal(nlevels(fac), "Paired")
    } else {
    colors = c("black", "red")
    }
    }
    pc1var <- round(summary(pca)$importance[2,1]*100, digits=1)
    pc2var <- round(summary(pca)$importance[2,2]*100, digits=1)
    pc1lab <- paste0("PC1 (",as.character(pc1var),"%)")
    pc2lab <- paste0("PC1 (",as.character(pc2var),"%)")
    plot(PC2~PC1, data=as.data.frame(pca$x), bg=colors[fac], pch=21, xlab=pc1lab, ylab=pc2lab, main=main, ...)
    with(as.data.frame(pca$x), textxy(PC1, PC2, labs=rownames(as.data.frame(pca$x)), cex=textcx))
    legend(legendpos, legend=levels(fac), col=colors, pch=20)
    # rldyplot(PC2 ~ PC1, groups = fac, data = as.data.frame(pca$rld),
    # pch = 16, cerld = 2, aspect = "iso", col = colours, main = draw.key(key = list(rect = list(col = colours),
    # terldt = list(levels(fac)), rep = FALSE)))
    }
    png("qc-pca.png", 1000, 1000, pointsize=20)
    rld_pca(rld, colors=mycols, intgroup="condition", xlim=c(-75, 35))
    dev.off()


    # Get differential expression results
    res <- results(dds)
    table(res$padj<0.05)
    ## Order by adjusted p-value
    res <- res[order(res$padj), ]
    ## Merge with normalized count data
    resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
    names(resdata)[1] <- "Gene"
    head(resdata)
    ## Write results
    write.csv(resdata, file="diffexpr-results.csv")

    ## Examine plot of p-values
    hist(res$pvalue, breaks=50, col="grey")

    ## Examine independent filtering
    attr(res, "filterThreshold")
    plot(attr(res,"filterNumRej"), type="b", xlab="quantiles of baseMean", ylab="number of rejections")

    ## MA plot
    ## Could do with built-in DESeq2 function:
    ## DESeq2::plotMA(dds, ylim=c(-1,1), cex=1)
    ## I like mine better:
    maplot <- function (res, thresh=0.05, labelsig=TRUE, textcx=1, ...) {
    with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x", ...))
    with(subset(res, padj<thresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5))
    if (labelsig) {
    require(calibrate)
    with(subset(res, padj<thresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2))
    }
    }
    png("diffexpr-maplot.png", 1500, 1000, pointsize=20)
    maplot(resdata, main="MA Plot")
    dev.off()

    ## Volcano plot with "significant" genes labeled
    volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) {
    with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
    with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
    with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...))
    with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...))
    if (labelsig) {
    require(calibrate)
    with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
    }
    legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "both"), pch=20, col=c("red","orange","green"))
    }
    png("diffexpr-volcanoplot.png", 1200, 1000, pointsize=20)
    volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-2.3, 2))
    dev.off()