Last active
February 21, 2018 04:02
-
-
Save tgirke/57b32e2c86b4912a13c6bec60255333b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################### | |
## Alignment Stats with Support for FASTQ Files with >500M Reads ## | |
################################################################### | |
alignStats <- function(args) { | |
fqpaths <- infile1(args) | |
bampaths <- outpaths(args) | |
bamexists <- file.exists(bampaths) | |
fqpaths <- fqpaths[bamexists] | |
bampaths <- bampaths[bamexists] | |
## Obtain total read number from FASTQ files | |
Nreads <- countLines(fqpaths)/4 | |
names(Nreads) <- names(fqpaths) | |
## Solve integer overflow problem of countLines when fastq files have more than .Machine$integer.max lines | |
Nreads[sign(Nreads)==-1] <- NA | |
if(any(is.na(Nreads))) { | |
badNreads <- which(is.na(Nreads)) | |
for(i in seq_along(badNreads)) { | |
srNreads <- 0 | |
f <- ShortRead::FastqStreamer(fqpaths[badNreads[i]], 10^7) | |
while(length(fq <- yield(f))) { | |
srNreads <- srNreads + length(fq) | |
} | |
close(f) | |
Nreads[badNreads[i]] <- srNreads | |
} | |
} | |
## If reads are PE multiply by 2 as a rough approximation | |
if(nchar(infile2(args))[1] > 0) Nreads <- Nreads * 2 | |
## Obtain total number of alignments from BAM files | |
bfl <- BamFileList(bampaths, yieldSize=50000, index=character()) | |
param <- ScanBamParam(flag=scanBamFlag(isUnmappedQuery=FALSE)) | |
Nalign <- countBam(bfl, param=param) | |
## Obtain number of primary alignments from BAM files | |
param <- ScanBamParam(flag=scanBamFlag(isSecondaryAlignment=FALSE, isUnmappedQuery=FALSE)) | |
Nalignprim <- countBam(bfl, param=param) | |
statsDF <- data.frame(FileName=names(Nreads), | |
Nreads=Nreads, | |
Nalign=Nalign$records, | |
Perc_Aligned=Nalign$records/Nreads*100, | |
Nalign_Primary=Nalignprim$records, | |
Perc_Aligned_Primary=Nalignprim$records/Nreads*100 | |
) | |
if(nchar(infile2(args))[1] > 0) colnames(statsDF)[which(colnames(statsDF)=="Nreads")] <- "Nreads2x" | |
return(statsDF) | |
} | |
## Usage: | |
# source("https://gist.githubusercontent.com/tgirke/57b32e2c86b4912a13c6bec60255333b/raw/eda035cc519e29d982a575b1aed65f3179013ea5/alignStats.R") | |
# read_statsDF <- alignStats(args=args) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment