包含不同长度范围内的蛋白序列
from Bio import SeqIO


def split_protein_sequences(input_file, output_file_0_400, output_file_401_1000, output_file_1001):
    """Split a protein FASTA file into three FASTA files by sequence length.

    Each record parsed from ``input_file`` is written in FASTA format to
    exactly one output file, chosen by ``len(record.seq)``:

    * ``output_file_0_400``    -- length <= 400
    * ``output_file_401_1000`` -- 401 <= length <= 1000
    * ``output_file_1001``     -- length > 1000

    All three output files are opened in write mode up front, so each one is
    created (or truncated) even if no record falls into its range.

    Args:
        input_file: Path of the FASTA file to read.
        output_file_0_400: Path receiving records of length <= 400.
        output_file_401_1000: Path receiving records of length 401-1000.
        output_file_1001: Path receiving records of length > 1000.
    """
    # Keep all three handles open for the whole pass so the input is read once.
    with open(output_file_0_400, "w") as f_0_400, \
            open(output_file_401_1000, "w") as f_401_1000, \
            open(output_file_1001, "w") as f_1001:
        for record in SeqIO.parse(input_file, "fasta"):
            sequence_length = len(record.seq)
            # Lengths are integers, so each branch only needs its upper bound;
            # the lower bound is implied by the previous branch having failed.
            if sequence_length <= 400:
                SeqIO.write(record, f_0_400, "fasta")
            elif sequence_length <= 1000:
                SeqIO.write(record, f_401_1000, "fasta")
            else:
                SeqIO.write(record, f_1001, "fasta")
if __name__ == "__main__":
    # Protein FASTA file to be split.
    input_file = "genome_protein_sequences.fasta"

    # One destination file per length range.
    output_file_0_400 = "protein_0_400.fasta"
    output_file_401_1000 = "protein_401_1000.fasta"
    output_file_1001 = "protein_1001_plus.fasta"

    # Distribute the records across the three output files.
    split_protein_sequences(
        input_file=input_file,
        output_file_0_400=output_file_0_400,
        output_file_401_1000=output_file_401_1000,
        output_file_1001=output_file_1001,
    )

    print("蛋白序列文件已成功分割并导出。")
protein_0_400.fasta: 保存长度在0-400个氨基酸的蛋白序列。
protein_401_1000.fasta: 保存长度在401-1000个氨基酸的蛋白序列。
protein_1001_plus.fasta: 保存长度在1001个氨基酸及以上的蛋白序列。
页:
[1]