蛋白全序列氨基酸可变性分析
#!/usr/bin/python# -*- coding: utf-8 -*-
fasta_sequence = open('C:/Python27/exercise/acetoin.fasta','r') #读取多序列比对的fasta文件
seq = fasta_sequence.readlines() #把fasta格式按行读取,存入列表seq中
seq_num = len(seq) #fasta文件行数
#新建一个fasta文件,由于前者fasta格式有注释行>gi,所以通过这段代码把注释行去掉
seq_align = file('C:/Python27/exercise/fasta.txt','w')
for i in range(1,seq_num,2):
seq_align.write(seq)
num = seq_num/2 #num表示同源序列序列数量
seq_align.close()
#打开纯文本序列,把其存入一个名为m的列表中,该列表中元素存储序列的字符串
seq_align = open('C:/Python27/exercise/fasta.txt','r')
m = []
for i in range(0,num):
m.append(seq_align.readline())
#找到多序列比对文件中的gap,并整列删除,之后将其存入名为seq matrix的文本文件中
space = []
for j in range(0,len(m)):
if m == '-':
space.append(j)
#print space
seq_matrix = open('C:/Python27/exercise/seq matrix.txt','w')
for i in range(0,num):
t = 0
for j in range(0,len(space)):
k = space
m = m[:k-t]+m
t = t+1
seq_matrix.write(m+'\n')
seq_matrix.close()
#建立氨基酸索引字典,key为氨基酸,value为氨基酸频数
amino_acid = {'G':0,'A':0,'L':0,'I':0,'V':0,'P':0,'F':0,'M':0,'W':0,'S':0,'Q':0,'T':0,'C':0,'N':0,'Y':0,'D':0,'E':0,'K':0,'R':0,'H':0}
#建立一个序列长度个元素,每个元素为空的列表
n = []
for i in range(0,len(m)):
n.append('')
#将序列矩阵转置,进行氨基酸频数统计,并将其写入文件trans_aa.txt文件中
for i in range(0,len(m)):
for j in range(1,num):
n = n+m
trans_aa =file('C:/Python27/exercise/trans_aa.txt','w')
for i in range(0,256):
trans_aa.write(n+'\n')
trans_aa.close()
trans_aa =open('C:/Python27/exercise/trans_aa.txt','r')
statistics = trans_aa.readlines()
length = len(statistics) #目标氨基酸的序列长度
#把氨基酸频数统计数据写入文本文件fre.txt
num_aa = []
fre = file('C:/Python27/exercise/fre.txt','w')
for i in range(0,length):
s = ''
k_aa = ''
for key in amino_acid:
amino_acid = statistics.count(key)
s = s+' '+str(amino_acid)
k_aa = k_aa+' '+key
num_aa.append(s)
fre.write(' '+k_aa+'\n')
for i in range(0,length):
fre.write(str(i)+' '+m+num_aa+'\n')
fre.close()
小蒲如果在帖子的最开始给大家普及一下为什么要研究氨基酸的可变性就好了~哈哈
页:
[1]