Skip to content

Instantly share code, notes, and snippets.

@hussius
Created November 17, 2016 09:39

Revisions

  1. hussius created this gist Nov 17, 2016.
    42 changes: 42 additions & 0 deletions sum_by_gene.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,42 @@
    import sys
    import gzip

    if len(sys.argv)<3:
    sys.exit("python sum_per_gene.py <cDNA FASTA file> <TPM table>")

    ensg = {}

    mapf = gzip.open(sys.argv[1])
    ctr = 0
    for line in mapf:
    if not line.startswith('>'): continue
    ctr += 1
    if ctr % 1000 == 0: sys.stderr.write(ctr)
    c = line.split(' ')
    tx = c[0][1:]
    gene = c[3].split(':')[1].split('.')[0]
    ensg[tx] = gene

    gene_tpm = {}

    tpm = open(sys.argv[2])
    print(tpm.readline().strip()) # Header

    for line in tpm:
    c = line.split()
    tx = c[0]
    vals = []
    for val in c[1:len(c)]:
    vals.append(float(val))
    gene = ensg[tx]
    if gene in gene_tpm:
    prev = gene_tpm[gene]
    curr = vals
    new = [prev[i]+curr[i] for i in range(0,len(prev))]
    gene_tpm[gene] = new
    else:
    gene_tpm[gene] = vals

    for g in gene_tpm:
    string_rep = [x for x in map(str, gene_tpm[g])]
    print(g + '\t' + '\t'.join(string_rep))