jamestwebber · June 28, 2019 21:08 · Jun 28, 2019
diff --git a/filter_fastq.py b/filter_fastq.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+
+import argparse
+import gzip
+import itertools
+import os
+
+import multiprocessing
+
+from collections import Counter, defaultdict
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        prog='filter_fastq',
+        description=('Script to filter a fastq.gz file for 20bp guides'
+                     ' and output a count file')
+    )
+
+    parser.add_argument('-b', '--guides', required=True,
+                        help='fasta file of guide sequences')
+    parser.add_argument('-f', '--fastq', required=True, nargs='+',
+                        help='fastq[.gz] file(s) of results to quantify')
+    parser.add_argument('-o', '--output', required=True,
+                        help='file to write guide counts')
+
+    args = parser.parse_args()
+
+    print('Reading guides from {}...'.format(args.guides))
+
+    # read in the guide sequences, assuming a structure of
+    # >[gene1].[sg_label]
+    # [sequence]
+    # >[gene2].[sg_label]
+    # etc
+    with open(args.guides) as f:
+        lines = [line.strip() for line in f]
+        labels = [line[1:] for line in lines[::2]]
+        sequences = lines[1::2]
+
+    sequence_set = set(sequences)
+
+    print('{} sequences for {} labels\n'.format(len(sequence_set), len(labels)))
+
+    # some sequences map to multiple genes
+    seq2lbl = defaultdict(set)
+    for seq,lbl in zip(sequences, labels):
+        seq2lbl[seq].add(lbl)
+
+    # function to read and count a fastq file
+    def read_fastq(fq_fn):
+        print('Counting reads from {}...'.format(fq_fn))
+
+        if fq_fn.endswith('.gz'):
+            with gzip.open(fq_fn, 'rb') as f:
+                seqs = [line[:20].decode() for line in itertools.islice(f, 1, None, 4)]
+        else:
+            with open(fq_fn) as f:
+                seqs = [line[:20] for line in itertools.islice(f, 1, None, 4)]
+
+        # count up the sequences
+        seq_counts = Counter(seq for seq in seqs if seq in sequence_set)
+        # map to guide labels
+        lbl_c = {lbl:seq_counts[seq] for seq in seq_counts for lbl in seq2lbl[seq]}
+
+        total_c = sum(lbl_c.values())
+
+        # summary statistics
+        res = [('Total reads', len(seqs)),
+               ('Guide reads', total_c),
+               ('% Guide reads', '{:.1f}%'.format(100 * total_c / len(seqs)))]
+
+        return lbl_c, total_c, res
+
+    # use multiprocessing to read in multiple fastq files in parallel
+    pool = multiprocessing.Pool(processes=min(multiprocessing.cpu_count(),
+                                              len(args.fastq)))
+
+    try:
+        lbl_counts, total_counts, results = map(
+                lambda s: dict(zip(args.fastq, s)),
+                zip(*pool.map(read_fastq, args.fastq))
+        )
+    finally:
+        pool.close()
+        pool.join()
+
+    # print summary stats to the console
+    for fastq_fn in args.fastq:
+        print('\nResults for {}:\n\t{}'.format(
+                fastq_fn, '\n\t'.join('{:16}:{:>12}'.format(n, v)
+                                      for n,v in results[fastq_fn])
+        ))
+
+    header = ['sgRNA', 'gene']
+    header.extend(os.path.basename(fn) for fn in args.fastq)
+    header.extend('normalized {}'.format(os.path.basename(fn)) for fn in args.fastq)
+
+    # write total results to file
+    with open(args.output, 'w') as OUT:
+        print(','.join(header), file=OUT)
+        for lbl in sorted(labels):
+            c = ','.join('{:d}'.format(lbl_counts[fastq_fn].get(lbl, 0))
+                         for fastq_fn in args.fastq)
+            n = ','.join('{:f}'.format(lbl_counts[fastq_fn].get(lbl, 0)
+                                       / total_counts[fastq_fn])
+                         for fastq_fn in args.fastq)
+
+            print(','.join((lbl, lbl.split('.')[0], c, n)), file=OUT)