Created
June 28, 2019 21:08
Revisions
-
jamestwebber created this gist
Jun 28, 2019 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,109 @@ #!/usr/bin/env python3 import argparse import gzip import itertools import os import multiprocessing from collections import Counter, defaultdict if __name__ == '__main__': parser = argparse.ArgumentParser( prog='filter_fastq', description=('Script to filter a fastq.gz file for 20bp guides' ' and output a count file') ) parser.add_argument('-b', '--guides', required=True, help='fasta file of guide sequences') parser.add_argument('-f', '--fastq', required=True, nargs='+', help='fastq[.gz] file(s) of results to quantify') parser.add_argument('-o', '--output', required=True, help='file to write guide counts') args = parser.parse_args() print('Reading guides from {}...'.format(args.guides)) # read in the guide sequences, assuming a structure of # >[gene1].[sg_label] # [sequence] # >[gene2].[sg_label] # etc with open(args.guides) as f: lines = [line.strip() for line in f] labels = [line[1:] for line in lines[::2]] sequences = lines[1::2] sequence_set = set(sequences) print('{} sequences for {} labels\n'.format(len(sequence_set), len(labels))) # some sequences map to multiple genes seq2lbl = defaultdict(set) for seq,lbl in zip(sequences, labels): seq2lbl[seq].add(lbl) # function to read and count a fastq file def read_fastq(fq_fn): print('Counting reads from {}...'.format(fq_fn)) if fq_fn.endswith('.gz'): with gzip.open(fq_fn, 'rb') as f: seqs = [line[:20].decode() for line in itertools.islice(f, 1, None, 4)] else: with open(fq_fn) as f: seqs = [line[:20] for line in itertools.islice(f, 1, None, 4)] # count up the sequences seq_counts = Counter(seq for seq in seqs if seq in sequence_set) # map to guide labels lbl_c = {lbl:seq_counts[seq] for seq in seq_counts for lbl in seq2lbl[seq]} total_c = sum(lbl_c.values()) # summary statistics res = [('Total reads', len(seqs)), ('Guide reads', total_c), ('% Guide reads', '{:.1f}%'.format(100 * total_c / len(seqs)))] return lbl_c, total_c, res # use multiprocessing to read in multiple fastq files in parallel pool = multiprocessing.Pool(processes=min(multiprocessing.cpu_count(), len(args.fastq))) try: lbl_counts, total_counts, results = map( lambda s: dict(zip(args.fastq, s)), zip(*pool.map(read_fastq, args.fastq)) ) finally: pool.close() pool.join() # print summary stats to the console for fastq_fn in args.fastq: print('\nResults for {}:\n\t{}'.format( fastq_fn, '\n\t'.join('{:16}:{:>12}'.format(n, v) for n,v in results[fastq_fn]) )) header = ['sgRNA', 'gene'] header.extend(os.path.basename(fn) for fn in args.fastq) header.extend('normalized {}'.format(os.path.basename(fn)) for fn in args.fastq) # write total results to file with open(args.output, 'w') as OUT: print(','.join(header), file=OUT) for lbl in sorted(labels): c = ','.join('{:d}'.format(lbl_counts[fastq_fn].get(lbl, 0)) for fastq_fn in args.fastq) n = ','.join('{:f}'.format(lbl_counts[fastq_fn].get(lbl, 0) / total_counts[fastq_fn]) for fastq_fn in args.fastq) print(','.join((lbl, lbl.split('.')[0], c, n)), file=OUT)