Created
June 28, 2019 21:08
-
-
Save jamestwebber/7f8dbba12a68b0f367223a5ed10f5d1c to your computer and use it in GitHub Desktop.
Filters fastq files for a given list of 20bp guide sequences and outputs a count file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
import gzip | |
import itertools | |
import os | |
import multiprocessing | |
from collections import Counter, defaultdict | |
if __name__ == '__main__': | |
parser = argparse.ArgumentParser( | |
prog='filter_fastq', | |
description=('Script to filter a fastq.gz file for 20bp guides' | |
' and output a count file') | |
) | |
parser.add_argument('-b', '--guides', required=True, | |
help='fasta file of guide sequences') | |
parser.add_argument('-f', '--fastq', required=True, nargs='+', | |
help='fastq[.gz] file(s) of results to quantify') | |
parser.add_argument('-o', '--output', required=True, | |
help='file to write guide counts') | |
args = parser.parse_args() | |
print('Reading guides from {}...'.format(args.guides)) | |
# read in the guide sequences, assuming a structure of | |
# >[gene1].[sg_label] | |
# [sequence] | |
# >[gene2].[sg_label] | |
# etc | |
with open(args.guides) as f: | |
lines = [line.strip() for line in f] | |
labels = [line[1:] for line in lines[::2]] | |
sequences = lines[1::2] | |
sequence_set = set(sequences) | |
print('{} sequences for {} labels\n'.format(len(sequence_set), len(labels))) | |
# some sequences map to multiple genes | |
seq2lbl = defaultdict(set) | |
for seq,lbl in zip(sequences, labels): | |
seq2lbl[seq].add(lbl) | |
# function to read and count a fastq file | |
def read_fastq(fq_fn): | |
print('Counting reads from {}...'.format(fq_fn)) | |
if fq_fn.endswith('.gz'): | |
with gzip.open(fq_fn, 'rb') as f: | |
seqs = [line[:20].decode() for line in itertools.islice(f, 1, None, 4)] | |
else: | |
with open(fq_fn) as f: | |
seqs = [line[:20] for line in itertools.islice(f, 1, None, 4)] | |
# count up the sequences | |
seq_counts = Counter(seq for seq in seqs if seq in sequence_set) | |
# map to guide labels | |
lbl_c = {lbl:seq_counts[seq] for seq in seq_counts for lbl in seq2lbl[seq]} | |
total_c = sum(lbl_c.values()) | |
# summary statistics | |
res = [('Total reads', len(seqs)), | |
('Guide reads', total_c), | |
('% Guide reads', '{:.1f}%'.format(100 * total_c / len(seqs)))] | |
return lbl_c, total_c, res | |
# use multiprocessing to read in multiple fastq files in parallel | |
pool = multiprocessing.Pool(processes=min(multiprocessing.cpu_count(), | |
len(args.fastq))) | |
try: | |
lbl_counts, total_counts, results = map( | |
lambda s: dict(zip(args.fastq, s)), | |
zip(*pool.map(read_fastq, args.fastq)) | |
) | |
finally: | |
pool.close() | |
pool.join() | |
# print summary stats to the console | |
for fastq_fn in args.fastq: | |
print('\nResults for {}:\n\t{}'.format( | |
fastq_fn, '\n\t'.join('{:16}:{:>12}'.format(n, v) | |
for n,v in results[fastq_fn]) | |
)) | |
header = ['sgRNA', 'gene'] | |
header.extend(os.path.basename(fn) for fn in args.fastq) | |
header.extend('normalized {}'.format(os.path.basename(fn)) for fn in args.fastq) | |
# write total results to file | |
with open(args.output, 'w') as OUT: | |
print(','.join(header), file=OUT) | |
for lbl in sorted(labels): | |
c = ','.join('{:d}'.format(lbl_counts[fastq_fn].get(lbl, 0)) | |
for fastq_fn in args.fastq) | |
n = ','.join('{:f}'.format(lbl_counts[fastq_fn].get(lbl, 0) | |
/ total_counts[fastq_fn]) | |
for fastq_fn in args.fastq) | |
print(','.join((lbl, lbl.split('.')[0], c, n)), file=OUT) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment