Created
August 18, 2011 23:44
-
-
Save davidliwei/1155568 to your computer and use it in GitHub Desktop.
Converting Cufflinks predictions (.GTF) into .BED annotations
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
''' | |
gtf2bed.py converts GTF file to BED file. | |
Usage: gtf2bed.py {OPTIONS} [.GTF file] | |
History | |
Nov.5th 2012: | |
1. Allow conversion from general GTF files (instead of only Cufflinks supports). | |
2. If multiple identical transcript_id exist, transcript_id will be appended a string like "_DUP#" to separate. | |
''' | |
import sys; | |
import re; | |
if len(sys.argv)<2: | |
print('This script converts .GTF into .BED annotations.\n'); | |
print('Usage: gtf2bed {OPTIONS} [.GTF file]\n'); | |
print('Options:'); | |
print('-c color\tSpecify the color of the track. This is a RGB value represented as "r,g,b". Default 255,0,0 (red)'); | |
print('\nNote:'); | |
print('1\tOnly "exon" and "transcript" are recognized in the feature field (3rd field).'); | |
print('2\tIn the attribute list of .GTF file, the script tries to find "gene_id", "transcript_id" and "FPKM" attribute, and convert them as name and score field in .BED file.'); | |
print('Author: Wei Li (li.david.wei AT gmail.com)'); | |
sys.exit(); | |
color='255,0,0' | |
for i in range(len(sys.argv)): | |
if sys.argv[i]=='-c': | |
color=sys.argv[i+1]; | |
allids={}; | |
def printbedline(estart,eend,field,nline): | |
try: | |
estp=estart[0]-1; | |
eedp=eend[-1]; | |
# use regular expression to get transcript_id, gene_id and expression level | |
geneid=re.findall(r'gene_id \"([\w\.]+)\"',field[8]) | |
transid=re.findall(r'transcript_id \"([\w\.]+)\"',field[8]) | |
fpkmval=re.findall(r'FPKM \"([\d\.]+)\"',field[8]) | |
if len(geneid)==0: | |
print('Warning: no gene_id field ',file=sys.stderr); | |
else: | |
geneid=geneid[0]; | |
if len(transid)==0: | |
print('Warning: no transcript_id field',file=sys.stderr); | |
transid='Trans_'+str(nline); | |
else: | |
transid=transid[0]; | |
if transid in allids.keys(): | |
transid2=transid+'_DUP'+str(allids[transid]); | |
allids[transid]=allids[transid]+1; | |
transid=transid2; | |
else: | |
allids[transid]=1; | |
if len(fpkmval)==0: | |
#print('Warning: no FPKM field',file=sys.stderr); | |
fpkmval='100'; | |
else: | |
fpkmval=fpkmval[0]; | |
fpkmint=round(float(fpkmval)); | |
print(field[0]+'\t'+str(estp)+'\t'+str(eedp)+'\t'+transid+'\t'+str(fpkmint)+'\t'+field[6]+'\t'+str(estp)+'\t'+str(eedp)+'\t'+color+'\t'+str(len(estart))+'\t',end=''); | |
seglen=[eend[i]-estart[i]+1 for i in range(len(estart))]; | |
segstart=[estart[i]-estart[0] for i in range(len(estart))]; | |
strl=str(seglen[0]); | |
for i in range(1,len(seglen)): | |
strl+=','+str(seglen[i]); | |
strs=str(segstart[0]); | |
for i in range(1,len(segstart)): | |
strs+=','+str(segstart[i]); | |
print(strl+'\t'+strs); | |
except ValueError: | |
print('Error: non-number fields at line '+str(nline),file=sys.stderr); | |
estart=[]; | |
eend=[]; | |
# read lines one to one | |
nline=0; | |
prevfield=[]; | |
prevtransid=''; | |
for lines in open(sys.argv[-1]): | |
field=lines.strip().split('\t'); | |
nline=nline+1; | |
if len(field)<9: | |
print('Error: the GTF should has at least 9 fields at line '+str(nline),file=sys.stderr); | |
continue; | |
if field[1]!='Cufflinks': | |
pass; | |
#print('Warning: the second field is expected to be \'Cufflinks\' at line '+str(nline),file=sys.stderr); | |
if field[2]!='exon' and field[2] !='transcript': | |
#print('Error: the third filed is expected to be \'exon\' or \'transcript\' at line '+str(nline),file=sys.stderr); | |
continue; | |
transid=re.findall(r'transcript_id \"([\w\.]+)\"',field[8]); | |
if len(transid)>0: | |
transid=transid[0]; | |
else: | |
transid=''; | |
if field[2]=='transcript' or (prevtransid != '' and transid!='' and transid != prevtransid): | |
#print('prev:'+prevtransid+', current:'+transid); | |
# A new transcript record, write | |
if len(estart)!=0: | |
printbedline(estart,eend,prevfield,nline); | |
estart=[]; | |
eend=[]; | |
prevfield=field; | |
prevtransid=transid; | |
if field[2]=='exon': | |
try: | |
est=int(field[3]); | |
eed=int(field[4]); | |
estart+=[est]; | |
eend+=[eed]; | |
except ValueError: | |
print('Error: non-number fields at line '+str(nline),file=sys.stderr); | |
# the last record | |
if len(estart)!=0: | |
printbedline(estart,eend,field,nline); |
That's true. I'm not offering a general gtf to bed tool, but one specifically for Cufflinks predictions.
this function doesn't seem work properly for genes on the reverse strand, see the following output for the input:
Input:
chr1 HAVANA gene 3205901 3671498 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUSG00000051951.5"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "Xkr4"; level 2; havana_gene "OTTMUSG00000026353.2";
chr1 HAVANA transcript 3205901 3216344 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000162897.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "Xkr4-003"; level 2; tag "basic"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086625.1";
chr1 HAVANA exon 3213609 3216344 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000162897.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "Xkr4-003"; exon_number 1; exon_id "ENSMUSE00000858910.1"; level 2; tag "basic"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086625.1";
chr1 HAVANA exon 3205901 3207317 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000162897.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "Xkr4-003"; exon_number 2; exon_id "ENSMUSE00000866652.1"; level 2; tag "basic"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086625.1";
chr1 HAVANA transcript 3206523 3215632 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000159265.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "Xkr4-002"; level 2; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086624.1";
chr1 HAVANA exon 3213439 3215632 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000159265.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "Xkr4-002"; exon_number 1; exon_id "ENSMUSE00000863980.1"; level 2; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086624.1";
chr1 HAVANA exon 3206523 3207317 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000159265.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "Xkr4-002"; exon_number 2; exon_id "ENSMUSE00000867897.1"; level 2; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086624.1";
chr1 HAVANA transcript 3214482 3671498 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "Xkr4-001"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
chr1 HAVANA exon 3670552 3671498 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "Xkr4-001"; exon_number 1; exon_id "ENSMUSE00000485541.3"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
output
chr1 3213608 3207317 ENSMUST00000162897.1 100 - 3213608 3207317 255,0,0 2 2736,1417 0,-7708
chr1 3213438 3207317 ENSMUST00000159265.1 100 - 3213438 3207317 255,0,0 2 2194,795 0,-6916
chr1 3670551 3671498 ENSMUST00000070533.4 100 - 3670551 3671498 255,0,0 1 947 0
while the gtf2bed.pl function will output:
chr1 3206522 3215632 ENSMUST00000159265.1 0 - 3206522 3215632 0 2 795,2194, 0,6916,
chr1 3205900 3216344 ENSMUST00000162897.1 0 - 3205900 3216344 0 2 1417,2736, 0,7708,
chr1 3466586 3513553 ENSMUST00000161581.1 0 + 3466586 3513553 0 2 101,149, 0,46818,
The script doesn't work at all.
Do you now another script to convert gtf to bed format?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Not cufflinks-specific, handles start/stop sites & does not require FPKM values:
http://ea-utils.googlecode.com/svn/trunk/clipper/gtf2bed