Mercurial > repos > peterjc > get_orfs_or_cdss
changeset 0:9cff9a1176ea
Uploaded v0.0.1
author | peterjc |
---|---|
date | Thu, 19 Jan 2012 10:17:10 -0500 (2012-01-19) |
parents | |
children | 922d69bd5258 |
files | tools/filters/get_orfs_or_cdss.py tools/filters/get_orfs_or_cdss.txt tools/filters/get_orfs_or_cdss.xml |
diffstat | 3 files changed, 441 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/filters/get_orfs_or_cdss.py Thu Jan 19 10:17:10 2012 -0500 @@ -0,0 +1,219 @@ +#!/usr/bin/env python +"""Find ORFs in a nucleotide sequence file. + +get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file + +Takes ten command line options, input sequence filename, format, genetic +code, CDS vs ORF, end type (open, closed), selection mode (all, top, one), +minimum length (in amino acids), strand (both, forward, reverse), output +nucleotide filename, and output protein filename. + +This tool is a short Python script which requires Biopython. If you use +this tool in scientific work leading to a publication, please cite the +Biopython application note: + +Cock et al 2009. Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + +This script is copyright 2011 by Peter Cock, The James Hutton Institute +(formerly SCRI), Dundee, UK. All rights reserved. + +See accompanying text file for licence details (MIT/BSD style). + +This is version 0.0.1 of the script. +""" +import sys +import re + +def stop_err(msg, err=1): + sys.stderr.write(msg.rstrip() + "\n") + sys.exit(err) + +try: + from Bio.Seq import Seq, reverse_complement, translate + from Bio.SeqRecord import SeqRecord + from Bio import SeqIO + from Bio.Data import CodonTable +except ImportError: + stop_err("Missing Biopython library") + +#Parse Command Line +try: + input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:] +except ValueError: + stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) + +try: + table = int(table) +except ValueError: + stop_err("Expected integer for genetic code table, got %s" % table) + +try: + table_obj = CodonTable.ambiguous_generic_by_id[table] +except KeyError: + stop_err("Unknown codon table %i" % table) + +if ftype not in ["CDS", "ORF"]: + stop_err("Expected CDS or ORF, got %s" % ftype) + +if ends not in ["open", "closed"]: + stop_err("Expected open or closed for end treatment, got %s" % ends) + +try: + min_len = int(min_len) +except ValueError: + stop_err("Expected integer for min_len, got %s" % min_len) + +if seq_format.lower()=="sff": + seq_format = "sff-trim" +elif seq_format.lower()=="fasta": + seq_format = "fasta" +elif seq_format.lower().startswith("fastq"): + seq_format = "fastq" +else: + stop_err("Unsupported file type %r" % seq_format) + +print "Genetic code table %i" % table +print "Minimum length %i aa" % min_len +#print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) + +starts = sorted(table_obj.start_codons) +assert "NNN" not in starts +re_starts = re.compile("|".join(starts)) + +stops = sorted(table_obj.stop_codons) +assert "NNN" not in stops +re_stops = re.compile("|".join(stops)) + +def start_chop_and_trans(s, strict=True): + """Returns offset, trimmed nuc, protein.""" + if strict: + assert s[-3:] in stops, s + assert len(s) % 3 == 0 + for match in re_starts.finditer(s): + #Must check the start is in frame + start = match.start() + if start % 3 == 0: + n = s[start:] + assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) + if strict: + t = translate(n, table, cds=True) + else: + #Use when missing stop codon, + t = "M" + translate(n[3:], table, to_stop=True) + return start, n, t + return None, None, None + +def break_up_frame(s): + """Returns offset, nuc, protein.""" + start = 0 + for match in re_stops.finditer(s): + index = match.start() + 3 + if index % 3 != 0: + continue + n = s[start:index] + if ftype=="CDS": + offset, n, t = start_chop_and_trans(n) + else: + offset = 0 + t = translate(n, table, to_stop=True) + if n and len(t) >= min_len: + yield start + offset, n, t + start = index + if ends == "open": + #No stop codon, Biopython's strict CDS translate will fail + n = s[start:] + #Ensure we have whole codons + #TODO - Try appending N instead? + #TODO - Do the next four lines more elegantly + if len(n) % 3: + n = n[:-1] + if len(n) % 3: + n = n[:-1] + if ftype=="CDS": + offset, n, t = start_chop_and_trans(n, strict=False) + else: + offset = 0 + t = translate(n, table, to_stop=True) + if n and len(t) >= min_len: + yield start + offset, n, t + + +def get_all_peptides(nuc_seq): + """Returns start, end, strand, nucleotides, protein. + + Co-ordinates are Python style zero-based. + """ + #TODO - Refactor to use a generator function (in start order) + #rather than making a list and sorting? + answer = [] + full_len = len(nuc_seq) + if strand != "reverse": + for frame in range(0,3): + for offset, n, t in break_up_frame(nuc_seq[frame:]): + start = frame + offset #zero based + answer.append((start, start + len(n), +1, n, t)) + if strand != "forward": + rc = reverse_complement(nuc_seq) + for frame in range(0,3) : + for offset, n, t in break_up_frame(rc[frame:]): + start = full_len - frame - offset #zero based + answer.append((start, start + len(n), -1, n ,t)) + answer.sort() + return answer + +def get_top_peptides(nuc_seq): + """Returns all peptides of max length.""" + values = list(get_all_peptides(nuc_seq)) + if not values: + raise StopIteration + max_len = max(len(x[-1]) for x in values) + for x in values: + if len(x[-1]) == max_len: + yield x + +def get_one_peptide(nuc_seq): + """Returns first (left most) peptide with max length.""" + values = list(get_top_peptides(nuc_seq)) + if not values: + raise StopIteration + yield values[0] + +if mode == "all": + get_peptides = get_all_peptides +elif mode == "top": + get_peptides = get_top_peptides +elif mode == "one": + get_peptides = get_one_peptide + +in_count = 0 +out_count = 0 +if out_nuc_file == "-": + out_nuc = sys.stdout +else: + out_nuc = open(out_nuc_file, "w") +if out_prot_file == "-": + out_prot = sys.stdout +else: + out_prot = open(out_prot_file, "w") +for record in SeqIO.parse(input_file, seq_format): + for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): + out_count += 1 + if f_strand == +1: + loc = "%i..%i" % (f_start+1, f_end) + else: + loc = "complement(%i..%i)" % (f_start+1, f_end) + descr = "length %i aa, %i bp, from %s of %s" \ + % (len(t), len(n), loc, record.description) + r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) + t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) + SeqIO.write(r, out_nuc, "fasta") + SeqIO.write(t, out_prot, "fasta") + in_count += 1 +if out_nuc is not sys.stdout: + out_nuc.close() +if out_prot is not sys.stdout: + out_prot.close() + +print "Found %i %ss in %i sequences" % (out_count, ftype, in_count)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/filters/get_orfs_or_cdss.txt Thu Jan 19 10:17:10 2012 -0500 @@ -0,0 +1,75 @@ +Galaxy tool to find ORFs or simple CDSs +======================================= + +This tool is copyright 2011 by Peter Cock, The James Hutton Institute +(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +See the licence text below. + +This tool is a short Python script (using Biopython library functions) +to search nucleotide sequences for open reading frames (ORFs) or coding +sequences (CDSs) where the first potential start codon is used. See the +help text in the XML file for more information. + +There are just two files to install: + +* get_orfs_or_cdss.py (the Python script) +* get_orfs_or_cdss.xml (the Galaxy tool definition) + +The suggested location is in the Galaxy folder tools/filters next to the tool +for calling sff_extract.py for converting SFF to FASTQ or FASTA + QUAL. + +You will also need to modify the tools_conf.xml file to tell Galaxy to offer the +tool. One suggested location is in the filters section. Simply add the line: + +<tool file="filters/get_orfs_or_cdss.xml" /> + +You will also need to install Biopython 1.54 or later. If you want to run +the unit tests, include this line in tools_conf.xml.sample and the sample +FASTA files under the test-data directory. That's it. + + +History +======= + +v0.0.1 - Initial version. + + +Developers +========== + +This script and related tools are being developed on the following hg branch: +http://bitbucket.org/peterjc/galaxy-central/src/tools + +For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use +the following command from the Galaxy root folder: + +tar -czf get_orfs_or_cdss.tar.gz tools/filters/get_orfs_or_cdss.* + +Check this worked: + +$ tar -tzf get_orfs_or_cdss.tar.gz +filter/get_orfs_or_cdss.py +filter/get_orfs_or_cdss.txt +filter/get_orfs_or_cdss.xml + + +Licence (MIT/BSD style) +======================= + +Permission to use, copy, modify, and distribute this software and its +documentation with or without modifications and for any purpose and +without fee is hereby granted, provided that any copyright notices +appear in all copies and that both those copyright notices and this +permission notice appear in supporting documentation, and that the +names of the contributors or copyright holders not be used in +advertising or publicity pertaining to distribution of the software +without specific prior permission. + +THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT +OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +OR PERFORMANCE OF THIS SOFTWARE.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/filters/get_orfs_or_cdss.xml Thu Jan 19 10:17:10 2012 -0500 @@ -0,0 +1,147 @@ +<tool id="get_orfs_or_cdss" name="Get open reading frames (ORFs) or coding sequences (CDSs)" version="0.0.1"> + <description>e.g. to get peptides from ESTs</description> + <command interpreter="python"> +get_orfs_or_cdss.py $input_file $input_file.ext $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file + </command> + <inputs> + <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file (nucleotides)" help="FASTA, FASTQ, or SFF format." /> + <param name="table" type="select" label="Genetic code" help="Tables from the NCBI, these determine the start and stop codons"> + <option value="1">1. Standard</option> + <option value="2">2. Vertebrate Mitochondrial</option> + <option value="3">3. Yeast Mitochondrial</option> + <option value="4">4. Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma</option> + <option value="5">5. Invertebrate Mitochondrial</option> + <option value="6">6. Ciliate Macronuclear and Dasycladacean</option> + <option value="9">9. Echinoderm Mitochondrial</option> + <option value="10">10. Euplotid Nuclear</option> + <option value="11">11. Bacterial</option> + <option value="12">12. Alternative Yeast Nuclear</option> + <option value="13">13. Ascidian Mitochondrial</option> + <option value="14">14. Flatworm Mitochondrial</option> + <option value="15">15. Blepharisma Macronuclear</option> + <option value="16">16. Chlorophycean Mitochondrial</option> + <option value="21">21. Trematode Mitochondrial</option> + <option value="22">22. Scenedesmus obliquus</option> + <option value="23">23. Thraustochytrium Mitochondrial</option> + </param> + <param name="ftype" type="select" value="True" label="Look for ORFs or CDSs"> + <option value="ORF">Look for ORFs (check for stop codons only, ignore start codons)</option> + <option value="CDS">Look for CDSs (with start and stop codons)</option> + </param> + <param name="ends" type="select" value="open" label="Sequence end treatment"> + <option value="open">Open ended (will allow missing start/stop codons at the ends)</option> + <option value="closed">Complete (will check for start/stop codons at the ends)</option> + <!-- TODO? Circular, for using this on finished bacteria etc --> + </param> + + <param name="mode" type="select" label="Selection criteria" help="Suppose a sequence has ORFs/CDSs of lengths 100, 102 and 102 -- which should be taken? These options would return 3, 2 or 1 ORF."> + <option value="all">All ORFs/CDSs from each sequence</option> + <option value="top">All ORFs/CDSs from each sequence with the maximum length</option> + <option value="one">First ORF/CDS from each sequence with the maximum length</option> + </param> + <param name="min_len" type="integer" size="5" value="30" label="Minimum length ORF/CDS (in amino acids, e.g. 30 aa = 90 bp plus any stop codon)"> + </param> + <param name="strand" type="select" label="Strand to search" help="Use the forward only option if your sequence directionality is known (e.g. from poly-A tails, or strand specific RNA sequencing."> + <option value="both">Search both the forward and reverse strand</option> + <option value="forward">Only search the forward strand</option> + <option value="reverse">Only search the reverse strand</option> + </param> + </inputs> + <outputs> + <data name="out_nuc_file" format="fasta" label="${ftype.value}s (nucleotides)" /> + <data name="out_prot_file" format="fasta" label="${ftype.value}s (amino acids)" /> + </outputs> + <tests> + <test> + <param name="input_file" value="get_orf_input.fasta" /> + <param name="table" value="1" /> + <param name="ftype" value="CDS" /> + <param name="ends" value="open" /> + <param name="mode" value="all" /> + <param name="min_len" value="10" /> + <param name="strand" value="forward" /> + <output name="out_nuc_file" file="get_orf_input.t1_nuc_out.fasta" /> + <output name="out_prot_file" file="get_orf_input.t1_prot_out.fasta" /> + </test> + <test> + <param name="input_file" value="get_orf_input.fasta" /> + <param name="table" value="11" /> + <param name="ftype" value="CDS" /> + <param name="ends" value="closed" /> + <param name="mode" value="all" /> + <param name="min_len" value="10" /> + <param name="strand" value="forward" /> + <output name="out_nuc_file" file="get_orf_input.t11_nuc_out.fasta" /> + <output name="out_prot_file" file="get_orf_input.t11_prot_out.fasta" /> + </test> + <test> + <param name="input_file" value="get_orf_input.fasta" /> + <param name="table" value="11" /> + <param name="ftype" value="CDS" /> + <param name="ends" value="open" /> + <param name="mode" value="all" /> + <param name="min_len" value="10" /> + <param name="strand" value="forward" /> + <output name="out_nuc_file" file="get_orf_input.t11_open_nuc_out.fasta" /> + <output name="out_prot_file" file="get_orf_input.t11_open_prot_out.fasta" /> + </test> + </tests> + <requirements> + <requirement type="python-module">Bio</requirement> + </requirements> + <help> + +**What it does** + +Takes an input file of nucleotide sequences (typically FASTA, but also FASTQ +and Standard Flowgram Format (SFF) are supported), and searches each sequence +for open reading frames (ORFs) or potential coding sequences (CDSs) of the +given minimum length. These are returned as FASTA files of nucleotides and +protein sequences. + +You can choose to have all the ORFs/CDSs above the minimum length for each +sequence (similar to the EMBOSS getorf tool), those with the longest length +equal, or the first ORF/CDS with the longest length (in the special case +where a sequence encodes two or more long ORFs/CDSs of the same length). The +last option is a reasonable choice when the input sequences represent EST or +mRNA sequences, where only one ORF/CDS is expected. + +Note that if no ORFs/CDSs in a sequence match the criteria, there will be no +output for that sequence. + +Also note that the ORFs/CDSs are assigned modified identifiers to distinguish +them from the original full length sequences, by appending a suffix. + +The start and stop codons are taken from the `NCBI Genetic Codes +<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>`_. +When searching for ORFs, the sequences will run from stop codon to stop +codon, and any start codons are ignored. When searching for CDSs, the first +potential start codon will be used, giving the longest possible CDS within +each ORF, and thus the longest possible protein sequence. This is useful +for things like BLAST or domain searching, but since this may not be the +correct start codon may not be appropriate for signal peptide detection +etc. + +**Example Usage** + +Given some EST sequences (Sanger capillary reads) assembled into unigenes, +or a transcriptome assembly from some RNA-Seq, each of your nucleotide +sequences should (barring sequencing, assembly errors, frame-shifts etc) +encode one protein as a single ORF/CDS, which you wish to extract (and +perhaps translate into amino acids). + +If your RNS-Seq data was strand specific, and assembled taking this into +account, you should only search for ORFs/CDSs on the forward strand. + +**Citation** + +This tool uses Biopython. If you use this tool in scientific work leading +to a publication, please cite the Biopython application note (and Galaxy +too of course): + +Cock et al 2009. Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + + </help> +</tool>