Skip to content

Instantly share code, notes, and snippets.

@jamestwebber
Created June 28, 2019 21:08

Revisions

  1. jamestwebber created this gist Jun 28, 2019.
    109 changes: 109 additions & 0 deletions filter_fastq.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,109 @@
    #!/usr/bin/env python3

    import argparse
    import gzip
    import itertools
    import os

    import multiprocessing

    from collections import Counter, defaultdict


    if __name__ == '__main__':
    parser = argparse.ArgumentParser(
    prog='filter_fastq',
    description=('Script to filter a fastq.gz file for 20bp guides'
    ' and output a count file')
    )

    parser.add_argument('-b', '--guides', required=True,
    help='fasta file of guide sequences')
    parser.add_argument('-f', '--fastq', required=True, nargs='+',
    help='fastq[.gz] file(s) of results to quantify')
    parser.add_argument('-o', '--output', required=True,
    help='file to write guide counts')

    args = parser.parse_args()

    print('Reading guides from {}...'.format(args.guides))

    # read in the guide sequences, assuming a structure of
    # >[gene1].[sg_label]
    # [sequence]
    # >[gene2].[sg_label]
    # etc
    with open(args.guides) as f:
    lines = [line.strip() for line in f]
    labels = [line[1:] for line in lines[::2]]
    sequences = lines[1::2]

    sequence_set = set(sequences)

    print('{} sequences for {} labels\n'.format(len(sequence_set), len(labels)))

    # some sequences map to multiple genes
    seq2lbl = defaultdict(set)
    for seq,lbl in zip(sequences, labels):
    seq2lbl[seq].add(lbl)

    # function to read and count a fastq file
    def read_fastq(fq_fn):
    print('Counting reads from {}...'.format(fq_fn))

    if fq_fn.endswith('.gz'):
    with gzip.open(fq_fn, 'rb') as f:
    seqs = [line[:20].decode() for line in itertools.islice(f, 1, None, 4)]
    else:
    with open(fq_fn) as f:
    seqs = [line[:20] for line in itertools.islice(f, 1, None, 4)]

    # count up the sequences
    seq_counts = Counter(seq for seq in seqs if seq in sequence_set)
    # map to guide labels
    lbl_c = {lbl:seq_counts[seq] for seq in seq_counts for lbl in seq2lbl[seq]}

    total_c = sum(lbl_c.values())

    # summary statistics
    res = [('Total reads', len(seqs)),
    ('Guide reads', total_c),
    ('% Guide reads', '{:.1f}%'.format(100 * total_c / len(seqs)))]

    return lbl_c, total_c, res

    # use multiprocessing to read in multiple fastq files in parallel
    pool = multiprocessing.Pool(processes=min(multiprocessing.cpu_count(),
    len(args.fastq)))

    try:
    lbl_counts, total_counts, results = map(
    lambda s: dict(zip(args.fastq, s)),
    zip(*pool.map(read_fastq, args.fastq))
    )
    finally:
    pool.close()
    pool.join()

    # print summary stats to the console
    for fastq_fn in args.fastq:
    print('\nResults for {}:\n\t{}'.format(
    fastq_fn, '\n\t'.join('{:16}:{:>12}'.format(n, v)
    for n,v in results[fastq_fn])
    ))

    header = ['sgRNA', 'gene']
    header.extend(os.path.basename(fn) for fn in args.fastq)
    header.extend('normalized {}'.format(os.path.basename(fn)) for fn in args.fastq)

    # write total results to file
    with open(args.output, 'w') as OUT:
    print(','.join(header), file=OUT)
    for lbl in sorted(labels):
    c = ','.join('{:d}'.format(lbl_counts[fastq_fn].get(lbl, 0))
    for fastq_fn in args.fastq)
    n = ','.join('{:f}'.format(lbl_counts[fastq_fn].get(lbl, 0)
    / total_counts[fastq_fn])
    for fastq_fn in args.fastq)

    print(','.join((lbl, lbl.split('.')[0], c, n)), file=OUT)