Mercurial > repos > peterjc > align_back_trans
view tools/align_back_trans/align_back_trans.py @ 7:883842b81796 draft default tip
"Update all the pico_galaxy tools on main Tool Shed"
author | peterjc |
---|---|
date | Fri, 16 Apr 2021 22:26:52 +0000 |
parents | b27388e5a0bb |
children |
line wrap: on
line source
#!/usr/bin/env python """Back-translate a protein alignment to nucleotides. This tool is a short Python script (using Biopython library functions) to load a protein alignment, and matching nucleotide FASTA file of unaligned sequences, in order to produce a codon aware nucleotide alignment - which can be viewed as a back translation. The development repository for this tool is here: * https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans This tool is available with a Galaxy wrapper from the Galaxy Tool Shed at: * http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans See accompanying text file for licence details (MIT licence). """ from __future__ import print_function import sys from Bio import AlignIO from Bio import SeqIO from Bio.Align import MultipleSeqAlignment from Bio.Alphabet import generic_protein from Bio.Data.CodonTable import ambiguous_generic_by_id from Bio.Seq import Seq if "-v" in sys.argv or "--version" in sys.argv: print("v0.0.9") sys.exit(0) def check_trans(identifier, nuc, prot, table): """Return nucleotide sequence, if works (can remove trailing stop).""" if len(nuc) % 3: sys.exit( "Nucleotide sequence for %s is length %i (not a multiple of three)" % (identifier, len(nuc)) ) p = str(prot).upper().replace("*", "X") t = str(nuc.translate(table)).upper().replace("*", "X") if len(t) == len(p) + 1: if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons: # Allow this... t = t[:-1] nuc = nuc[:-3] # edit return value if len(t) != len(p): err = ( "Inconsistent lengths for %s, ungapped protein %i, " "tripled %i vs ungapped nucleotide %i." % (identifier, len(p), len(p) * 3, len(nuc)) ) if t.endswith(p): err += "\nThere are %i extra nucleotides at the start." % (len(t) - len(p)) elif t.startswith(p): err += "\nThere are %i extra nucleotides at the end." % (len(t) - len(p)) elif p in t: # TODO - Calculate and report the number to trim at start and end? err += "\nHowever, protein sequence found within translated nucleotides." elif p[1:] in t: err += "\nHowever, ignoring first amino acid, protein sequence found " "within translated nucleotides." sys.exit(err) if t == p: return nuc elif p.startswith("M") and "M" + t[1:] == p: # Close, was there a start codon? if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons: return nuc else: sys.exit( "Translation check failed for %s\n" "Would match if %s was a start codon (check correct table used)\n" % (identifier, nuc[0:3].upper()) ) else: # Allow * vs X here? e.g. internal stop codons m = "".join("." if x == y else "!" for (x, y) in zip(p, t)) if len(prot) < 70: sys.stderr.write("Protein: %s\n" % p) sys.stderr.write(" %s\n" % m) sys.stderr.write("Translation: %s\n" % t) else: for offset in range(0, len(p), 60): sys.stderr.write("Protein: %s\n" % p[offset : offset + 60]) sys.stderr.write(" %s\n" % m[offset : offset + 60]) sys.stderr.write("Translation: %s\n\n" % t[offset : offset + 60]) sys.exit("Translation check failed for %s\n" % identifier) def sequence_back_translate( aligned_protein_record, unaligned_nucleotide_record, gap, table=0 ): """Back-translate a sequence.""" # TODO - Separate arguments for protein gap and nucleotide gap? if not gap or len(gap) != 1: raise ValueError("Please supply a single gap character") alpha = unaligned_nucleotide_record.seq.alphabet if hasattr(alpha, "gap_char"): gap_codon = alpha.gap_char * 3 assert len(gap_codon) == 3 else: from Bio.Alphabet import Gapped alpha = Gapped(alpha, gap) gap_codon = gap * 3 ungapped_protein = aligned_protein_record.seq.ungap(gap) ungapped_nucleotide = unaligned_nucleotide_record.seq if table: ungapped_nucleotide = check_trans( aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table ) elif len(ungapped_protein) * 3 != len(ungapped_nucleotide): sys.exit( "Inconsistent lengths for %s, ungapped protein %i, " "tripled %i vs ungapped nucleotide %i" % ( aligned_protein_record.id, len(ungapped_protein), len(ungapped_protein) * 3, len(ungapped_nucleotide), ) ) seq = [] nuc = str(ungapped_nucleotide) for amino_acid in aligned_protein_record.seq: if amino_acid == gap: seq.append(gap_codon) else: seq.append(nuc[:3]) nuc = nuc[3:] assert not nuc, "Nucleotide sequence for %r longer than protein %r" % ( unaligned_nucleotide_record.id, aligned_protein_record.id, ) aligned_nuc = unaligned_nucleotide_record[:] # copy for most annotation aligned_nuc.letter_annotation = {} # clear this aligned_nuc.seq = Seq("".join(seq), alpha) # replace this assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc) return aligned_nuc def alignment_back_translate( protein_alignment, nucleotide_records, key_function=None, gap=None, table=0 ): """Thread nucleotide sequences onto a protein alignment.""" # TODO - Separate arguments for protein and nucleotide gap characters? if gap is None: gap = "-" aligned = [] try: if key_function is None: for protein in protein_alignment: nucleotide = nucleotide_records[protein.id] aligned.append(sequence_back_translate(protein, nucleotide, gap, table)) else: for protein in protein_alignment: nucleotide = nucleotide_records[key_function(protein.id)] aligned.append(sequence_back_translate(protein, nucleotide, gap, table)) except KeyError: raise ValueError( "Could not find nucleotide sequence for protein %r" % protein.id ) return MultipleSeqAlignment(aligned) if len(sys.argv) == 4: align_format, prot_align_file, nuc_fasta_file = sys.argv[1:] nuc_align_file = sys.stdout table = 0 elif len(sys.argv) == 5: align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:] table = 0 elif len(sys.argv) == 6: align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:] else: sys.exit( """This is a Python script for 'back-translating' a protein alignment, It requires three, four or five arguments: - alignment format (e.g. fasta, clustal), - aligned protein file (in specified format), - unaligned nucleotide file (in fasta format). - aligned nucleotiode output file (in same format), optional. - NCBI translation table (0 for none), optional The nucleotide alignment is printed to stdout if no output filename is given. Example usage: $ python align_back_trans.py fasta demo_prot_align.fasta demo_nucs.fasta demo_nuc_align.fasta Warning: If the output file already exists, it will be overwritten. This script is available with sample data and a Galaxy wrapper here: https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans """ # noqa: E501 ) try: table = int(table) except ValueError: sys.exit("Bad table argument %r" % table) prot_align = AlignIO.read(prot_align_file, align_format, alphabet=generic_protein) nuc_dict = SeqIO.index(nuc_fasta_file, "fasta") nuc_align = alignment_back_translate(prot_align, nuc_dict, gap="-", table=table) AlignIO.write(nuc_align, nuc_align_file, align_format)