annotate cpt_fasta_translate/fasta_translate.py @ 0:cb42bee49abb draft

Uploaded
author cpt
date Fri, 10 Jun 2022 08:47:31 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cb42bee49abb Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
cb42bee49abb Uploaded
cpt
parents:
diff changeset
2 import sys
cb42bee49abb Uploaded
cpt
parents:
diff changeset
3 import logging
cb42bee49abb Uploaded
cpt
parents:
diff changeset
4 import argparse
cb42bee49abb Uploaded
cpt
parents:
diff changeset
5 from Bio import SeqIO
cb42bee49abb Uploaded
cpt
parents:
diff changeset
6 from Bio.Data import CodonTable
cb42bee49abb Uploaded
cpt
parents:
diff changeset
7
cb42bee49abb Uploaded
cpt
parents:
diff changeset
8 logging.basicConfig(level=logging.INFO)
cb42bee49abb Uploaded
cpt
parents:
diff changeset
9 log = logging.getLogger()
cb42bee49abb Uploaded
cpt
parents:
diff changeset
10
cb42bee49abb Uploaded
cpt
parents:
diff changeset
11
cb42bee49abb Uploaded
cpt
parents:
diff changeset
12 def translate(fasta_file, target="protein", table=11, strip_stops=False, met=False):
cb42bee49abb Uploaded
cpt
parents:
diff changeset
13 records = list(SeqIO.parse(fasta_file, "fasta"))
cb42bee49abb Uploaded
cpt
parents:
diff changeset
14
cb42bee49abb Uploaded
cpt
parents:
diff changeset
15 for record in records:
cb42bee49abb Uploaded
cpt
parents:
diff changeset
16 if target == "protein":
cb42bee49abb Uploaded
cpt
parents:
diff changeset
17 mod = len(record.seq) % 3
cb42bee49abb Uploaded
cpt
parents:
diff changeset
18 if mod != 0:
cb42bee49abb Uploaded
cpt
parents:
diff changeset
19 record.seq = record.seq[0:-mod]
cb42bee49abb Uploaded
cpt
parents:
diff changeset
20
cb42bee49abb Uploaded
cpt
parents:
diff changeset
21 # Read http://biopython.org/DIST/docs/api/Bio.Seq.Seq-class.html#transcribe
cb42bee49abb Uploaded
cpt
parents:
diff changeset
22 # for valid CDS conditions.
cb42bee49abb Uploaded
cpt
parents:
diff changeset
23
cb42bee49abb Uploaded
cpt
parents:
diff changeset
24 # Will first try to translate sequence as a CDS,
cb42bee49abb Uploaded
cpt
parents:
diff changeset
25 # then just as a sequence if this fails.
cb42bee49abb Uploaded
cpt
parents:
diff changeset
26
cb42bee49abb Uploaded
cpt
parents:
diff changeset
27 try:
cb42bee49abb Uploaded
cpt
parents:
diff changeset
28 tmpseq = record.seq.translate(table=table, cds=True)
cb42bee49abb Uploaded
cpt
parents:
diff changeset
29 except CodonTable.TranslationError as cte:
cb42bee49abb Uploaded
cpt
parents:
diff changeset
30 log.info("Translation issue at %s: %s", record.id, cte)
cb42bee49abb Uploaded
cpt
parents:
diff changeset
31 tmpseq = record.seq.translate(table=table, cds=False)
cb42bee49abb Uploaded
cpt
parents:
diff changeset
32
cb42bee49abb Uploaded
cpt
parents:
diff changeset
33 # check if stop in middle of protein
cb42bee49abb Uploaded
cpt
parents:
diff changeset
34 if "*" in tmpseq:
cb42bee49abb Uploaded
cpt
parents:
diff changeset
35 log.info(
cb42bee49abb Uploaded
cpt
parents:
diff changeset
36 "Trimming %s from %s to %s due to stop codons",
cb42bee49abb Uploaded
cpt
parents:
diff changeset
37 record.id,
cb42bee49abb Uploaded
cpt
parents:
diff changeset
38 len(record.seq),
cb42bee49abb Uploaded
cpt
parents:
diff changeset
39 3 * len(tmpseq) - 3,
cb42bee49abb Uploaded
cpt
parents:
diff changeset
40 )
cb42bee49abb Uploaded
cpt
parents:
diff changeset
41 tmpseq = tmpseq[0 : str(tmpseq).index("*")]
cb42bee49abb Uploaded
cpt
parents:
diff changeset
42
cb42bee49abb Uploaded
cpt
parents:
diff changeset
43 # add stop to end if strip_stops=False
cb42bee49abb Uploaded
cpt
parents:
diff changeset
44 if not strip_stops:
cb42bee49abb Uploaded
cpt
parents:
diff changeset
45 tmpseq = tmpseq + "*"
cb42bee49abb Uploaded
cpt
parents:
diff changeset
46
cb42bee49abb Uploaded
cpt
parents:
diff changeset
47 if met:
cb42bee49abb Uploaded
cpt
parents:
diff changeset
48 tmpseq = "M" + tmpseq[1:]
cb42bee49abb Uploaded
cpt
parents:
diff changeset
49
cb42bee49abb Uploaded
cpt
parents:
diff changeset
50 record.seq = tmpseq
cb42bee49abb Uploaded
cpt
parents:
diff changeset
51 if len(record.seq) > 0:
cb42bee49abb Uploaded
cpt
parents:
diff changeset
52 SeqIO.write(record, sys.stdout, "fasta")
cb42bee49abb Uploaded
cpt
parents:
diff changeset
53 else:
cb42bee49abb Uploaded
cpt
parents:
diff changeset
54 record.seq = record.seq.transcribe()
cb42bee49abb Uploaded
cpt
parents:
diff changeset
55 SeqIO.write(record, sys.stdout, "fasta")
cb42bee49abb Uploaded
cpt
parents:
diff changeset
56
cb42bee49abb Uploaded
cpt
parents:
diff changeset
57
cb42bee49abb Uploaded
cpt
parents:
diff changeset
58 if __name__ == "__main__":
cb42bee49abb Uploaded
cpt
parents:
diff changeset
59 parser = argparse.ArgumentParser(description="Translate fasta file")
cb42bee49abb Uploaded
cpt
parents:
diff changeset
60 parser.add_argument("fasta_file", type=argparse.FileType("r"), help="Fasta file")
cb42bee49abb Uploaded
cpt
parents:
diff changeset
61 parser.add_argument("--target", choices=["protein", "rna"])
cb42bee49abb Uploaded
cpt
parents:
diff changeset
62 parser.add_argument(
cb42bee49abb Uploaded
cpt
parents:
diff changeset
63 "--table",
cb42bee49abb Uploaded
cpt
parents:
diff changeset
64 type=int,
cb42bee49abb Uploaded
cpt
parents:
diff changeset
65 default=11,
cb42bee49abb Uploaded
cpt
parents:
diff changeset
66 help="Translation table to use",
cb42bee49abb Uploaded
cpt
parents:
diff changeset
67 choices=range(1, 23),
cb42bee49abb Uploaded
cpt
parents:
diff changeset
68 )
cb42bee49abb Uploaded
cpt
parents:
diff changeset
69 parser.add_argument(
cb42bee49abb Uploaded
cpt
parents:
diff changeset
70 "--strip_stops", action="store_true", help="Remove stop characters"
cb42bee49abb Uploaded
cpt
parents:
diff changeset
71 )
cb42bee49abb Uploaded
cpt
parents:
diff changeset
72 parser.add_argument(
cb42bee49abb Uploaded
cpt
parents:
diff changeset
73 "--met", action="store_true", help="Convert first residue to Met"
cb42bee49abb Uploaded
cpt
parents:
diff changeset
74 )
cb42bee49abb Uploaded
cpt
parents:
diff changeset
75
cb42bee49abb Uploaded
cpt
parents:
diff changeset
76 args = parser.parse_args()
cb42bee49abb Uploaded
cpt
parents:
diff changeset
77 translate(**vars(args))