Last active
December 31, 2015 00:38
-
-
Save heliy/7908332 to your computer and use it in GitHub Desktop.
查找长序列里剪切识别位点
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding:UTF-8 | |
#!/usr/bin/python | |
#使用方法: | |
#python seqcutloc.py fasta文件 序列 输出文件 | |
#./seqcutloc.py fasta文件 序列 输出文件 | |
#算法是通过将长序列按碱基切开 在拼接回原序列 | |
#eg seq:ATAATCGGCGATTAATCGAAT cut:AAT | |
#切完后得到子序列 AT CGGCGATT CG 长度为2 8 2 | |
#拼接得到 第一个位点:2+1=3 第二个位点:2+3+8+1=14 第三个位点:2+3+8+3+2+1=19 | |
import string | |
import sys | |
# fasta文件类 | |
class fasta(object): | |
def __init__(self,head,seq): | |
self.head=head | |
self.seq=seq | |
#处理单个fasta文件数据 | |
def sinfasta(sfas,target): | |
targloc=[] | |
name=sfas.head.split('|')[3] | |
sub=sfas.seq.split(target) | |
if len(sub)<1: | |
return #没有╮(╯_╰)╭ | |
i=1 #找到的序号 | |
before=1 #之前的位置标记 | |
tarlen=len(target) | |
for s in sub[:-1]: #最末尾的不用考虑 | |
before+=len(s) #该切点开始位置 | |
targloc.append([name,str(i),str(before),str(before+tarlen)]) | |
before+=tarlen #该切点结束位置 | |
i+=1 | |
return targloc | |
#将fna文件中的单个fasta分开 | |
def file2singles(filename): | |
fl=[] | |
f=open(filename,'r') | |
fastalist=f.read().split('>') | |
f.close() | |
for fas in fastalist: | |
if len(fas)<10: | |
continue | |
head=fas.split("\n")[0] | |
seq=fas.replace(head,'').replace('\n','') | |
fa=fasta(head,seq) | |
fl.append(fa) | |
return fl | |
#主处理过程 | |
def process(filename,target,tofile): | |
w=open(tofile,'w+') | |
falist=file2singles(filename) | |
for fa in falist: | |
result=sinfasta(fa,target) | |
lines=[string.join(item,'\t')+'\n' for item in result] | |
w.writelines(lines) | |
w.write('\n') | |
w.close() | |
return | |
if __name__=="__main__": | |
[filename,target,tofile]=sys.argv[1:] | |
process(filename,target,tofile) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment