view tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 11:d51db443aaa4 draft

v0.2.3 Python 3 compatible print function.
author peterjc
date Wed, 30 May 2018 08:33:20 -0400
parents a06ad07431ba
children 71905a6d52a7
line wrap: on
line source

#!/usr/bin/env python
"""Find ORFs in a nucleotide sequence file.

For more details, see the help text and argument descriptions in the
accompanying get_orfs_or_cdss.xml file which defines a Galaxy interface.

This tool is a short Python script which requires Biopython. If you use
this tool in scientific work leading to a publication, please cite the
Biopython application note:

Cock et al 2009. Biopython: freely available Python tools for computational
molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
(formerly SCRI), Dundee, UK. All rights reserved.

See accompanying text file for licence details (MIT licence).
"""

from __future__ import print_function

import re
import sys

from optparse import OptionParser

usage = r"""Use as follows:

$ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 \
-t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa \
--ob cds.bed --og cds.gff3
"""

try:
    from Bio.Seq import Seq, reverse_complement, translate
    from Bio.SeqRecord import SeqRecord
    from Bio import SeqIO
    from Bio.Data import CodonTable
except ImportError:
    sys.exit("Missing Biopython library")


parser = OptionParser(usage=usage)
parser.add_option('-i', '--input', dest='input_file',
                  default=None, help='Input fasta file',
                  metavar='FILE')
parser.add_option('-f', '--format', dest='seq_format',
                  default='fasta', help='Sequence format (e.g. fasta, fastq, sff)')
parser.add_option('--table', dest='table',
                  default=1, help='NCBI Translation table', type='int')
parser.add_option('-t', '--ftype', dest='ftype', type='choice',
                  choices=['CDS', 'ORF'], default='ORF',
                  help='Find ORF or CDSs')
parser.add_option('-e', '--ends', dest='ends', type='choice',
                  choices=['open', 'closed'], default='closed',
                  help='Open or closed. Closed ensures start/stop codons are present')
parser.add_option('-m', '--mode', dest='mode', type='choice',
                  choices=['all', 'top', 'one'], default='all',
                  help='Output all ORFs/CDSs from sequence, all ORFs/CDSs '
                  'with max length, or first with maximum length')
parser.add_option('--min_len', dest='min_len',
                  default=10, help='Minimum ORF/CDS length', type='int')
parser.add_option('-s', '--strand', dest='strand', type='choice',
                  choices=['forward', 'reverse', 'both'], default='both',
                  help='Strand to search for features on')
parser.add_option('--on', dest='out_nuc_file',
                  default=None, help='Output nucleotide sequences, or - for STDOUT',
                  metavar='FILE')
parser.add_option('--op', dest='out_prot_file',
                  default=None, help='Output protein sequences, or - for STDOUT',
                  metavar='FILE')
parser.add_option('--ob', dest='out_bed_file',
                  default=None, help='Output BED file, or - for STDOUT',
                  metavar='FILE')
parser.add_option('--og', dest='out_gff3_file',
                  default=None, help='Output GFF3 file, or - for STDOUT',
                  metavar='FILE')
parser.add_option('-v', '--version', dest='version',
                  default=False, action='store_true',
                  help='Show version and quit')

options, args = parser.parse_args()

if options.version:
    print("v0.2.3")
    sys.exit(0)

if not options.input_file:
    sys.exit("Input file is required")

if not any((options.out_nuc_file, options.out_prot_file, options.out_bed_file, options.out_gff3_file)):
    sys.exit("At least one output file is required")

try:
    table_obj = CodonTable.ambiguous_generic_by_id[options.table]
except KeyError:
    sys.exit("Unknown codon table %i" % options.table)

if options.seq_format.lower() == "sff":
    seq_format = "sff-trim"
elif options.seq_format.lower() == "fasta":
    seq_format = "fasta"
elif options.seq_format.lower().startswith("fastq"):
    seq_format = "fastq"
else:
    sys.exit("Unsupported file type %r" % options.seq_format)

print("Genetic code table %i" % options.table)
print("Minimum length %i aa" % options.min_len)
# print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)

starts = sorted(table_obj.start_codons)
assert "NNN" not in starts
re_starts = re.compile("|".join(starts))

stops = sorted(table_obj.stop_codons)
assert "NNN" not in stops
re_stops = re.compile("|".join(stops))


def start_chop_and_trans(s, strict=True):
    """Returns offset, trimmed nuc, protein."""
    if strict:
        assert s[-3:] in stops, s
    assert len(s) % 3 == 0
    for match in re_starts.finditer(s):
        # Must check the start is in frame
        start = match.start()
        if start % 3 == 0:
            n = s[start:]
            assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
            if strict:
                t = translate(n, options.table, cds=True)
            else:
                # Use when missing stop codon,
                t = "M" + translate(n[3:], options.table, to_stop=True)
            return start, n, t
    return None, None, None


def break_up_frame(s):
    """Returns offset, nuc, protein."""
    start = 0
    for match in re_stops.finditer(s):
        index = match.start() + 3
        if index % 3 != 0:
            continue
        n = s[start:index]
        if options.ftype == "CDS":
            offset, n, t = start_chop_and_trans(n)
        else:
            offset = 0
            t = translate(n, options.table, to_stop=True)
        if n and len(t) >= options.min_len:
            yield start + offset, n, t
        start = index
    if options.ends == "open":
        # No stop codon, Biopython's strict CDS translate will fail
        n = s[start:]
        # Ensure we have whole codons
        # TODO - Try appending N instead?
        # TODO - Do the next four lines more elegantly
        if len(n) % 3:
            n = n[:-1]
        if len(n) % 3:
            n = n[:-1]
        if options.ftype == "CDS":
            offset, n, t = start_chop_and_trans(n, strict=False)
        else:
            offset = 0
            t = translate(n, options.table, to_stop=True)
        if n and len(t) >= options.min_len:
            yield start + offset, n, t


def get_all_peptides(nuc_seq):
    """Returns start, end, strand, nucleotides, protein.

    Co-ordinates are Python style zero-based.
    """
    # TODO - Refactor to use a generator function (in start order)
    # rather than making a list and sorting?
    answer = []
    full_len = len(nuc_seq)
    if options.strand != "reverse":
        for frame in range(0, 3):
            for offset, n, t in break_up_frame(nuc_seq[frame:]):
                start = frame + offset  # zero based
                answer.append((start, start + len(n), +1, n, t))
    if options.strand != "forward":
        rc = reverse_complement(nuc_seq)
        for frame in range(0, 3):
            for offset, n, t in break_up_frame(rc[frame:]):
                start = full_len - frame - offset  # zero based
                answer.append((start - len(n), start, -1, n, t))
    answer.sort()
    return answer


def get_top_peptides(nuc_seq):
    """Returns all peptides of max length."""
    values = list(get_all_peptides(nuc_seq))
    if not values:
        raise StopIteration
    max_len = max(len(x[-1]) for x in values)
    for x in values:
        if len(x[-1]) == max_len:
            yield x


def get_one_peptide(nuc_seq):
    """Returns first (left most) peptide with max length."""
    values = list(get_top_peptides(nuc_seq))
    if not values:
        raise StopIteration
    yield values[0]


if options.mode == "all":
    get_peptides = get_all_peptides
elif options.mode == "top":
    get_peptides = get_top_peptides
elif options.mode == "one":
    get_peptides = get_one_peptide

in_count = 0
out_count = 0
if options.out_nuc_file == "-":
    out_nuc = sys.stdout
elif options.out_nuc_file:
    out_nuc = open(options.out_nuc_file, "w")
else:
    out_nuc = None

if options.out_prot_file == "-":
    out_prot = sys.stdout
elif options.out_prot_file:
    out_prot = open(options.out_prot_file, "w")
else:
    out_prot = None

if options.out_bed_file == "-":
    out_bed = sys.stdout
elif options.out_bed_file:
    out_bed = open(options.out_bed_file, "w")
else:
    out_bed = None

if options.out_gff3_file == "-":
    out_gff3 = sys.stdout
elif options.out_gff3_file:
    out_gff3 = open(options.out_gff3_file, "w")
else:
    out_gff3 = None

if out_gff3:
    out_gff3.write('##gff-version 3\n')

for record in SeqIO.parse(options.input_file, seq_format):
    for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())):
        out_count += 1
        if f_strand == +1:
            loc = "%i..%i" % (f_start + 1, f_end)
        else:
            loc = "complement(%i..%i)" % (f_start + 1, f_end)
        descr = "length %i aa, %i bp, from %s of %s" \
                % (len(t), len(n), loc, record.description)
        fid = record.id + "|%s%i" % (options.ftype, i + 1)
        r = SeqRecord(Seq(n), id=fid, name="", description=descr)
        t = SeqRecord(Seq(t), id=fid, name="", description=descr)
        if out_nuc:
            SeqIO.write(r, out_nuc, "fasta")
        if out_prot:
            SeqIO.write(t, out_prot, "fasta")
        nice_strand = '+' if f_strand == +1 else '-'
        if out_bed:
            out_bed.write('\t'.join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + '\n')
        if out_gff3:
            out_gff3.write('\t'.join(map(str, [record.id, 'getOrfsOrCds', 'CDS', f_start + 1, f_end, '.',
                                               nice_strand, 0, 'ID=%s%s' % (options.ftype, i + 1)])) + '\n')
    in_count += 1
if out_nuc and out_nuc is not sys.stdout:
    out_nuc.close()
if out_prot and out_prot is not sys.stdout:
    out_prot.close()
if out_bed and out_bed is not sys.stdout:
    out_bed.close()

print("Found %i %ss in %i sequences" % (out_count, options.ftype, in_count))