Mercurial > repos > jasper > align_back_trans
comparison tools/align_back_trans/align_back_trans.py @ 0:6c6b16cab42a draft
Uploaded
author | jasper |
---|---|
date | Tue, 09 May 2017 13:22:02 -0400 |
parents | |
children | 647866afd876 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6c6b16cab42a |
---|---|
1 #!/usr/bin/env python | |
2 """Back-translate a protein alignment to nucleotides | |
3 | |
4 This tool is a short Python script (using Biopython library functions) to | |
5 load a protein alignment, and matching nucleotide FASTA file of unaligned | |
6 sequences, in order to produce a codon aware nucleotide alignment - which | |
7 can be viewed as a back translation. | |
8 | |
9 The development repository for this tool is here: | |
10 | |
11 * https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans | |
12 | |
13 This tool is available with a Galaxy wrapper from the Galaxy Tool Shed at: | |
14 | |
15 * http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans | |
16 | |
17 See accompanying text file for licence details (MIT licence). | |
18 """ | |
19 | |
20 import sys | |
21 from Bio.Seq import Seq | |
22 from Bio.Alphabet import generic_protein | |
23 from Bio.Align import MultipleSeqAlignment | |
24 from Bio import SeqIO | |
25 from Bio import AlignIO | |
26 from Bio.Data.CodonTable import ambiguous_generic_by_id | |
27 | |
28 if "-v" in sys.argv or "--version" in sys.argv: | |
29 print "v0.0.7" | |
30 sys.exit(0) | |
31 | |
32 | |
33 def check_trans(identifier, nuc, prot, table): | |
34 """Returns nucleotide sequence if works (can remove trailing stop)""" | |
35 if len(nuc) % 3: | |
36 sys.exit("Nucleotide sequence for %s is length %i (not a multiple of three)" | |
37 % (identifier, len(nuc))) | |
38 | |
39 p = str(prot).upper().replace("*", "X") | |
40 t = str(nuc.translate(table)).upper().replace("*", "X") | |
41 if len(t) == len(p) + 1: | |
42 if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons: | |
43 # Allow this... | |
44 t = t[:-1] | |
45 nuc = nuc[:-3] # edit return value | |
46 if len(t) != len(p): | |
47 err = ("Inconsistent lengths for %s, ungapped protein %i, " | |
48 "tripled %i vs ungapped nucleotide %i." % | |
49 (identifier, len(p), len(p) * 3, len(nuc))) | |
50 if t.endswith(p): | |
51 err += "\nThere are %i extra nucleotides at the start." % (len(t) - len(p)) | |
52 elif t.startswith(p): | |
53 err += "\nThere are %i extra nucleotides at the end." % (len(t) - len(p)) | |
54 elif p in t: | |
55 # TODO - Calculate and report the number to trim at start and end? | |
56 err += "\nHowever, protein sequence found within translated nucleotides." | |
57 elif p[1:] in t: | |
58 err += "\nHowever, ignoring first amino acid, protein sequence found within translated nucleotides." | |
59 sys.exit(err) | |
60 | |
61 if t == p: | |
62 return nuc | |
63 elif p.startswith("M") and "M" + t[1:] == p: | |
64 # Close, was there a start codon? | |
65 if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons: | |
66 return nuc | |
67 else: | |
68 sys.exit("Translation check failed for %s\n" | |
69 "Would match if %s was a start codon (check correct table used)\n" | |
70 % (identifier, nuc[0:3].upper())) | |
71 else: | |
72 # Allow * vs X here? e.g. internal stop codons | |
73 m = "".join("." if x == y else "!" for (x, y) in zip(p, t)) | |
74 if len(prot) < 70: | |
75 sys.stderr.write("Protein: %s\n" % p) | |
76 sys.stderr.write(" %s\n" % m) | |
77 sys.stderr.write("Translation: %s\n" % t) | |
78 else: | |
79 for offset in range(0, len(p), 60): | |
80 sys.stderr.write("Protein: %s\n" % p[offset:offset + 60]) | |
81 sys.stderr.write(" %s\n" % m[offset:offset + 60]) | |
82 sys.stderr.write("Translation: %s\n\n" % t[offset:offset + 60]) | |
83 sys.exit("Translation check failed for %s\n" % identifier) | |
84 | |
85 | |
86 def sequence_back_translate(aligned_protein_record, unaligned_nucleotide_record, gap, table=0): | |
87 # TODO - Separate arguments for protein gap and nucleotide gap? | |
88 if not gap or len(gap) != 1: | |
89 raise ValueError("Please supply a single gap character") | |
90 | |
91 alpha = unaligned_nucleotide_record.seq.alphabet | |
92 if hasattr(alpha, "gap_char"): | |
93 gap_codon = alpha.gap_char * 3 | |
94 assert len(gap_codon) == 3 | |
95 else: | |
96 from Bio.Alphabet import Gapped | |
97 alpha = Gapped(alpha, gap) | |
98 gap_codon = gap * 3 | |
99 | |
100 ungapped_protein = aligned_protein_record.seq.ungap(gap) | |
101 ungapped_nucleotide = unaligned_nucleotide_record.seq | |
102 if table: | |
103 ungapped_nucleotide = check_trans(aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table) | |
104 elif len(ungapped_protein) * 3 != len(ungapped_nucleotide): | |
105 sys.exit("Inconsistent lengths for %s, ungapped protein %i, " | |
106 "tripled %i vs ungapped nucleotide %i" % | |
107 (aligned_protein_record.id, | |
108 len(ungapped_protein), | |
109 len(ungapped_protein) * 3, | |
110 len(ungapped_nucleotide))) | |
111 | |
112 seq = [] | |
113 nuc = str(ungapped_nucleotide) | |
114 for amino_acid in aligned_protein_record.seq: | |
115 if amino_acid == gap: | |
116 seq.append(gap_codon) | |
117 else: | |
118 seq.append(nuc[:3]) | |
119 nuc = nuc[3:] | |
120 assert not nuc, "Nucleotide sequence for %r longer than protein %r" \ | |
121 % (unaligned_nucleotide_record.id, aligned_protein_record.id) | |
122 | |
123 aligned_nuc = unaligned_nucleotide_record[:] # copy for most annotation | |
124 aligned_nuc.letter_annotation = {} # clear this | |
125 aligned_nuc.seq = Seq("".join(seq), alpha) # replace this | |
126 assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc) | |
127 return aligned_nuc | |
128 | |
129 | |
130 def alignment_back_translate(protein_alignment, nucleotide_records, key_function=None, gap=None, table=0): | |
131 """Thread nucleotide sequences onto a protein alignment.""" | |
132 # TODO - Separate arguments for protein and nucleotide gap characters? | |
133 if key_function is None: | |
134 key_function = lambda x: x | |
135 if gap is None: | |
136 gap = "-" | |
137 | |
138 aligned = [] | |
139 for protein in protein_alignment: | |
140 try: | |
141 nucleotide = nucleotide_records[key_function(protein.id)] | |
142 except KeyError: | |
143 raise ValueError("Could not find nucleotide sequence for protein %r" | |
144 % protein.id) | |
145 aligned.append(sequence_back_translate(protein, nucleotide, gap, table)) | |
146 return MultipleSeqAlignment(aligned) | |
147 | |
148 | |
149 if len(sys.argv) == 4: | |
150 align_format, prot_align_file, nuc_fasta_file = sys.argv[1:] | |
151 nuc_align_file = sys.stdout | |
152 table = 0 | |
153 elif len(sys.argv) == 5: | |
154 align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:] | |
155 table = 0 | |
156 elif len(sys.argv) == 6: | |
157 align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:] | |
158 else: | |
159 sys.exit("""This is a Python script for 'back-translating' a protein alignment, | |
160 | |
161 It requires three, four or five arguments: | |
162 - alignment format (e.g. fasta, clustal), | |
163 - aligned protein file (in specified format), | |
164 - unaligned nucleotide file (in fasta format). | |
165 - aligned nucleotiode output file (in same format), optional. | |
166 - NCBI translation table (0 for none), optional | |
167 | |
168 The nucleotide alignment is printed to stdout if no output filename is given. | |
169 | |
170 Example usage: | |
171 | |
172 $ python align_back_trans.py fasta demo_prot_align.fasta demo_nucs.fasta demo_nuc_align.fasta | |
173 | |
174 Warning: If the output file already exists, it will be overwritten. | |
175 | |
176 This script is available with sample data and a Galaxy wrapper here: | |
177 https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans | |
178 http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans | |
179 """) | |
180 | |
181 try: | |
182 table = int(table) | |
183 except ValueError: | |
184 sys.exit("Bad table argument %r" % table) | |
185 | |
186 prot_align = AlignIO.read(prot_align_file, align_format, alphabet=generic_protein) | |
187 nuc_dict = SeqIO.index(nuc_fasta_file, "fasta") | |
188 nuc_align = alignment_back_translate(prot_align, nuc_dict, gap="-", table=table) | |
189 AlignIO.write(nuc_align, nuc_align_file, align_format) |