Mercurial > repos > peterjc > align_back_trans
annotate tools/align_back_trans/align_back_trans.py @ 7:883842b81796 draft default tip
"Update all the pico_galaxy tools on main Tool Shed"
author | peterjc |
---|---|
date | Fri, 16 Apr 2021 22:26:52 +0000 |
parents | b27388e5a0bb |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
7 | 2 """Back-translate a protein alignment to nucleotides. |
0 | 3 |
4 This tool is a short Python script (using Biopython library functions) to | |
5 load a protein alignment, and matching nucleotide FASTA file of unaligned | |
6 sequences, in order to produce a codon aware nucleotide alignment - which | |
7 can be viewed as a back translation. | |
8 | |
9 The development repository for this tool is here: | |
10 | |
3 | 11 * https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans |
0 | 12 |
13 This tool is available with a Galaxy wrapper from the Galaxy Tool Shed at: | |
14 | |
15 * http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans | |
16 | |
17 See accompanying text file for licence details (MIT licence). | |
18 """ | |
19 | |
5 | 20 from __future__ import print_function |
21 | |
0 | 22 import sys |
4
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
23 |
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
24 from Bio import AlignIO |
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
25 from Bio import SeqIO |
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
26 |
0 | 27 from Bio.Align import MultipleSeqAlignment |
4
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
28 from Bio.Alphabet import generic_protein |
0 | 29 from Bio.Data.CodonTable import ambiguous_generic_by_id |
4
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
30 from Bio.Seq import Seq |
0 | 31 |
32 if "-v" in sys.argv or "--version" in sys.argv: | |
5 | 33 print("v0.0.9") |
0 | 34 sys.exit(0) |
35 | |
36 | |
37 def check_trans(identifier, nuc, prot, table): | |
7 | 38 """Return nucleotide sequence, if works (can remove trailing stop).""" |
0 | 39 if len(nuc) % 3: |
7 | 40 sys.exit( |
41 "Nucleotide sequence for %s is length %i (not a multiple of three)" | |
42 % (identifier, len(nuc)) | |
43 ) | |
0 | 44 |
45 p = str(prot).upper().replace("*", "X") | |
46 t = str(nuc.translate(table)).upper().replace("*", "X") | |
47 if len(t) == len(p) + 1: | |
48 if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons: | |
3 | 49 # Allow this... |
0 | 50 t = t[:-1] |
3 | 51 nuc = nuc[:-3] # edit return value |
0 | 52 if len(t) != len(p): |
7 | 53 err = ( |
54 "Inconsistent lengths for %s, ungapped protein %i, " | |
55 "tripled %i vs ungapped nucleotide %i." | |
56 % (identifier, len(p), len(p) * 3, len(nuc)) | |
57 ) | |
2
9fbf29a8c12b
v0.0.6 use format_source; v0.0.5 more explicit error msg, citation info
peterjc
parents:
1
diff
changeset
|
58 if t.endswith(p): |
9fbf29a8c12b
v0.0.6 use format_source; v0.0.5 more explicit error msg, citation info
peterjc
parents:
1
diff
changeset
|
59 err += "\nThere are %i extra nucleotides at the start." % (len(t) - len(p)) |
9fbf29a8c12b
v0.0.6 use format_source; v0.0.5 more explicit error msg, citation info
peterjc
parents:
1
diff
changeset
|
60 elif t.startswith(p): |
9fbf29a8c12b
v0.0.6 use format_source; v0.0.5 more explicit error msg, citation info
peterjc
parents:
1
diff
changeset
|
61 err += "\nThere are %i extra nucleotides at the end." % (len(t) - len(p)) |
9fbf29a8c12b
v0.0.6 use format_source; v0.0.5 more explicit error msg, citation info
peterjc
parents:
1
diff
changeset
|
62 elif p in t: |
3 | 63 # TODO - Calculate and report the number to trim at start and end? |
2
9fbf29a8c12b
v0.0.6 use format_source; v0.0.5 more explicit error msg, citation info
peterjc
parents:
1
diff
changeset
|
64 err += "\nHowever, protein sequence found within translated nucleotides." |
9fbf29a8c12b
v0.0.6 use format_source; v0.0.5 more explicit error msg, citation info
peterjc
parents:
1
diff
changeset
|
65 elif p[1:] in t: |
7 | 66 err += "\nHowever, ignoring first amino acid, protein sequence found " |
67 "within translated nucleotides." | |
3 | 68 sys.exit(err) |
0 | 69 |
70 if t == p: | |
71 return nuc | |
72 elif p.startswith("M") and "M" + t[1:] == p: | |
3 | 73 # Close, was there a start codon? |
0 | 74 if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons: |
75 return nuc | |
76 else: | |
7 | 77 sys.exit( |
78 "Translation check failed for %s\n" | |
79 "Would match if %s was a start codon (check correct table used)\n" | |
80 % (identifier, nuc[0:3].upper()) | |
81 ) | |
0 | 82 else: |
3 | 83 # Allow * vs X here? e.g. internal stop codons |
84 m = "".join("." if x == y else "!" for (x, y) in zip(p, t)) | |
0 | 85 if len(prot) < 70: |
86 sys.stderr.write("Protein: %s\n" % p) | |
87 sys.stderr.write(" %s\n" % m) | |
88 sys.stderr.write("Translation: %s\n" % t) | |
89 else: | |
90 for offset in range(0, len(p), 60): | |
7 | 91 sys.stderr.write("Protein: %s\n" % p[offset : offset + 60]) |
92 sys.stderr.write(" %s\n" % m[offset : offset + 60]) | |
93 sys.stderr.write("Translation: %s\n\n" % t[offset : offset + 60]) | |
3 | 94 sys.exit("Translation check failed for %s\n" % identifier) |
95 | |
0 | 96 |
7 | 97 def sequence_back_translate( |
98 aligned_protein_record, unaligned_nucleotide_record, gap, table=0 | |
99 ): | |
6 | 100 """Back-translate a sequence.""" |
3 | 101 # TODO - Separate arguments for protein gap and nucleotide gap? |
0 | 102 if not gap or len(gap) != 1: |
103 raise ValueError("Please supply a single gap character") | |
104 | |
105 alpha = unaligned_nucleotide_record.seq.alphabet | |
106 if hasattr(alpha, "gap_char"): | |
107 gap_codon = alpha.gap_char * 3 | |
108 assert len(gap_codon) == 3 | |
109 else: | |
110 from Bio.Alphabet import Gapped | |
7 | 111 |
0 | 112 alpha = Gapped(alpha, gap) |
3 | 113 gap_codon = gap * 3 |
0 | 114 |
115 ungapped_protein = aligned_protein_record.seq.ungap(gap) | |
116 ungapped_nucleotide = unaligned_nucleotide_record.seq | |
117 if table: | |
7 | 118 ungapped_nucleotide = check_trans( |
119 aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table | |
120 ) | |
0 | 121 elif len(ungapped_protein) * 3 != len(ungapped_nucleotide): |
7 | 122 sys.exit( |
123 "Inconsistent lengths for %s, ungapped protein %i, " | |
124 "tripled %i vs ungapped nucleotide %i" | |
125 % ( | |
126 aligned_protein_record.id, | |
127 len(ungapped_protein), | |
128 len(ungapped_protein) * 3, | |
129 len(ungapped_nucleotide), | |
130 ) | |
131 ) | |
0 | 132 |
133 seq = [] | |
134 nuc = str(ungapped_nucleotide) | |
135 for amino_acid in aligned_protein_record.seq: | |
136 if amino_acid == gap: | |
137 seq.append(gap_codon) | |
138 else: | |
139 seq.append(nuc[:3]) | |
140 nuc = nuc[3:] | |
7 | 141 assert not nuc, "Nucleotide sequence for %r longer than protein %r" % ( |
142 unaligned_nucleotide_record.id, | |
143 aligned_protein_record.id, | |
144 ) | |
0 | 145 |
3 | 146 aligned_nuc = unaligned_nucleotide_record[:] # copy for most annotation |
147 aligned_nuc.letter_annotation = {} # clear this | |
148 aligned_nuc.seq = Seq("".join(seq), alpha) # replace this | |
0 | 149 assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc) |
150 return aligned_nuc | |
151 | |
3 | 152 |
7 | 153 def alignment_back_translate( |
154 protein_alignment, nucleotide_records, key_function=None, gap=None, table=0 | |
155 ): | |
0 | 156 """Thread nucleotide sequences onto a protein alignment.""" |
3 | 157 # TODO - Separate arguments for protein and nucleotide gap characters? |
0 | 158 if gap is None: |
159 gap = "-" | |
160 | |
161 aligned = [] | |
4
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
162 try: |
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
163 if key_function is None: |
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
164 for protein in protein_alignment: |
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
165 nucleotide = nucleotide_records[protein.id] |
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
166 aligned.append(sequence_back_translate(protein, nucleotide, gap, table)) |
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
167 else: |
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
168 for protein in protein_alignment: |
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
169 nucleotide = nucleotide_records[key_function(protein.id)] |
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
170 aligned.append(sequence_back_translate(protein, nucleotide, gap, table)) |
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
171 except KeyError: |
7 | 172 raise ValueError( |
173 "Could not find nucleotide sequence for protein %r" % protein.id | |
174 ) | |
4
c8469274d136
v0.0.8 Using Biopython 1.67 from Tool Shed or Conda package
peterjc
parents:
3
diff
changeset
|
175 |
0 | 176 return MultipleSeqAlignment(aligned) |
177 | |
178 | |
179 if len(sys.argv) == 4: | |
180 align_format, prot_align_file, nuc_fasta_file = sys.argv[1:] | |
181 nuc_align_file = sys.stdout | |
182 table = 0 | |
183 elif len(sys.argv) == 5: | |
184 align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:] | |
185 table = 0 | |
186 elif len(sys.argv) == 6: | |
187 align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:] | |
188 else: | |
7 | 189 sys.exit( |
190 """This is a Python script for 'back-translating' a protein alignment, | |
0 | 191 |
192 It requires three, four or five arguments: | |
193 - alignment format (e.g. fasta, clustal), | |
194 - aligned protein file (in specified format), | |
195 - unaligned nucleotide file (in fasta format). | |
196 - aligned nucleotiode output file (in same format), optional. | |
197 - NCBI translation table (0 for none), optional | |
198 | |
199 The nucleotide alignment is printed to stdout if no output filename is given. | |
200 | |
201 Example usage: | |
202 | |
203 $ python align_back_trans.py fasta demo_prot_align.fasta demo_nucs.fasta demo_nuc_align.fasta | |
204 | |
205 Warning: If the output file already exists, it will be overwritten. | |
206 | |
207 This script is available with sample data and a Galaxy wrapper here: | |
208 https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans | |
209 http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans | |
7 | 210 """ # noqa: E501 |
211 ) | |
0 | 212 |
213 try: | |
214 table = int(table) | |
3 | 215 except ValueError: |
216 sys.exit("Bad table argument %r" % table) | |
0 | 217 |
218 prot_align = AlignIO.read(prot_align_file, align_format, alphabet=generic_protein) | |
219 nuc_dict = SeqIO.index(nuc_fasta_file, "fasta") | |
220 nuc_align = alignment_back_translate(prot_align, nuc_dict, gap="-", table=table) | |
221 AlignIO.write(nuc_align, nuc_align_file, align_format) |