# 读取结构域位置信息文件
with open(domain_file, "r") as file:
reader = csv.reader(file, delimiter="\t")
for row in reader:
gene_id = row[0]
start = int(row[1])
end = int(row[2])
if gene_id not in domain_positions:
domain_positions[gene_id] = []
domain_positions[gene_id].append((start, end))
return domain_positions
def extract_domain_sequences(input_fasta, domain_positions, output_fasta):
"""
提取蛋白结构域序列
:param input_fasta: 输入的基因家族氨基酸序列FASTA文件
:param domain_positions: 结构域的位置列表,格式为 {序列ID: [(起始, 结束), ...]}
:param output_fasta: 输出的结构域序列FASTA文件
"""
with open(output_fasta, "w") as output_handle:
for record in SeqIO.parse(input_fasta, "fasta"):
if record.id in domain_positions:
positions = domain_positions[record.id]
for i, (start, end) in enumerate(positions):
# 提取结构域序列
domain_seq = record.seq[start-1:end] # 因为序列索引从0开始,FASTA的氨基酸编号从1开始
domain_id = f"{record.id}_domain_{i+1}" # 给结构域序列命名
new_record = record[:0] # 创建一个新的空记录
new_record.id = domain_id
new_record.description = f"Domain from {start} to {end}"
new_record.seq = domain_seq