Mercurial > repos > peterjc > get_orfs_or_cdss
changeset 5:5208c15805ec draft
Uploaded v0.0.5 dependant on Biopython 1.62
author | peterjc |
---|---|
date | Mon, 28 Oct 2013 05:19:38 -0400 |
parents | d51819d2d7e2 |
children | 64e67f172188 |
files | tools/filters/get_orfs_or_cdss.py tools/filters/get_orfs_or_cdss.rst tools/filters/get_orfs_or_cdss.xml tools/filters/repository_dependencies.xml tools/get_orfs_or_cdss/README.rst tools/get_orfs_or_cdss/get_orfs_or_cdss.py tools/get_orfs_or_cdss/get_orfs_or_cdss.xml tools/get_orfs_or_cdss/repository_dependencies.xml |
diffstat | 8 files changed, 525 insertions(+), 516 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/filters/get_orfs_or_cdss.py Mon Jul 29 09:30:44 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,223 +0,0 @@ -#!/usr/bin/env python -"""Find ORFs in a nucleotide sequence file. - -get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file - -Takes ten command line options, input sequence filename, format, genetic -code, CDS vs ORF, end type (open, closed), selection mode (all, top, one), -minimum length (in amino acids), strand (both, forward, reverse), output -nucleotide filename, and output protein filename. - -This tool is a short Python script which requires Biopython. If you use -this tool in scientific work leading to a publication, please cite the -Biopython application note: - -Cock et al 2009. Biopython: freely available Python tools for computational -molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. - -This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute -(formerly SCRI), Dundee, UK. All rights reserved. - -See accompanying text file for licence details (MIT/BSD style). - -This is version 0.0.3 of the script. -""" -import sys -import re - -if "-v" in sys.argv or "--version" in sys.argv: - print "v0.0.3" - sys.exit(0) - -def stop_err(msg, err=1): - sys.stderr.write(msg.rstrip() + "\n") - sys.exit(err) - -try: - from Bio.Seq import Seq, reverse_complement, translate - from Bio.SeqRecord import SeqRecord - from Bio import SeqIO - from Bio.Data import CodonTable -except ImportError: - stop_err("Missing Biopython library") - -#Parse Command Line -try: - input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:] -except ValueError: - stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) - -try: - table = int(table) -except ValueError: - stop_err("Expected integer for genetic code table, got %s" % table) - -try: - table_obj = CodonTable.ambiguous_generic_by_id[table] -except KeyError: - stop_err("Unknown codon table %i" % table) - -if ftype not in ["CDS", "ORF"]: - stop_err("Expected CDS or ORF, got %s" % ftype) - -if ends not in ["open", "closed"]: - stop_err("Expected open or closed for end treatment, got %s" % ends) - -try: - min_len = int(min_len) -except ValueError: - stop_err("Expected integer for min_len, got %s" % min_len) - -if seq_format.lower()=="sff": - seq_format = "sff-trim" -elif seq_format.lower()=="fasta": - seq_format = "fasta" -elif seq_format.lower().startswith("fastq"): - seq_format = "fastq" -else: - stop_err("Unsupported file type %r" % seq_format) - -print "Genetic code table %i" % table -print "Minimum length %i aa" % min_len -#print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) - -starts = sorted(table_obj.start_codons) -assert "NNN" not in starts -re_starts = re.compile("|".join(starts)) - -stops = sorted(table_obj.stop_codons) -assert "NNN" not in stops -re_stops = re.compile("|".join(stops)) - -def start_chop_and_trans(s, strict=True): - """Returns offset, trimmed nuc, protein.""" - if strict: - assert s[-3:] in stops, s - assert len(s) % 3 == 0 - for match in re_starts.finditer(s): - #Must check the start is in frame - start = match.start() - if start % 3 == 0: - n = s[start:] - assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) - if strict: - t = translate(n, table, cds=True) - else: - #Use when missing stop codon, - t = "M" + translate(n[3:], table, to_stop=True) - return start, n, t - return None, None, None - -def break_up_frame(s): - """Returns offset, nuc, protein.""" - start = 0 - for match in re_stops.finditer(s): - index = match.start() + 3 - if index % 3 != 0: - continue - n = s[start:index] - if ftype=="CDS": - offset, n, t = start_chop_and_trans(n) - else: - offset = 0 - t = translate(n, table, to_stop=True) - if n and len(t) >= min_len: - yield start + offset, n, t - start = index - if ends == "open": - #No stop codon, Biopython's strict CDS translate will fail - n = s[start:] - #Ensure we have whole codons - #TODO - Try appending N instead? - #TODO - Do the next four lines more elegantly - if len(n) % 3: - n = n[:-1] - if len(n) % 3: - n = n[:-1] - if ftype=="CDS": - offset, n, t = start_chop_and_trans(n, strict=False) - else: - offset = 0 - t = translate(n, table, to_stop=True) - if n and len(t) >= min_len: - yield start + offset, n, t - - -def get_all_peptides(nuc_seq): - """Returns start, end, strand, nucleotides, protein. - - Co-ordinates are Python style zero-based. - """ - #TODO - Refactor to use a generator function (in start order) - #rather than making a list and sorting? - answer = [] - full_len = len(nuc_seq) - if strand != "reverse": - for frame in range(0,3): - for offset, n, t in break_up_frame(nuc_seq[frame:]): - start = frame + offset #zero based - answer.append((start, start + len(n), +1, n, t)) - if strand != "forward": - rc = reverse_complement(nuc_seq) - for frame in range(0,3) : - for offset, n, t in break_up_frame(rc[frame:]): - start = full_len - frame - offset #zero based - answer.append((start - len(n), start, -1, n ,t)) - answer.sort() - return answer - -def get_top_peptides(nuc_seq): - """Returns all peptides of max length.""" - values = list(get_all_peptides(nuc_seq)) - if not values: - raise StopIteration - max_len = max(len(x[-1]) for x in values) - for x in values: - if len(x[-1]) == max_len: - yield x - -def get_one_peptide(nuc_seq): - """Returns first (left most) peptide with max length.""" - values = list(get_top_peptides(nuc_seq)) - if not values: - raise StopIteration - yield values[0] - -if mode == "all": - get_peptides = get_all_peptides -elif mode == "top": - get_peptides = get_top_peptides -elif mode == "one": - get_peptides = get_one_peptide - -in_count = 0 -out_count = 0 -if out_nuc_file == "-": - out_nuc = sys.stdout -else: - out_nuc = open(out_nuc_file, "w") -if out_prot_file == "-": - out_prot = sys.stdout -else: - out_prot = open(out_prot_file, "w") -for record in SeqIO.parse(input_file, seq_format): - for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): - out_count += 1 - if f_strand == +1: - loc = "%i..%i" % (f_start+1, f_end) - else: - loc = "complement(%i..%i)" % (f_start+1, f_end) - descr = "length %i aa, %i bp, from %s of %s" \ - % (len(t), len(n), loc, record.description) - r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) - t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) - SeqIO.write(r, out_nuc, "fasta") - SeqIO.write(t, out_prot, "fasta") - in_count += 1 -if out_nuc is not sys.stdout: - out_nuc.close() -if out_prot is not sys.stdout: - out_prot.close() - -print "Found %i %ss in %i sequences" % (out_count, ftype, in_count)
--- a/tools/filters/get_orfs_or_cdss.rst Mon Jul 29 09:30:44 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,121 +0,0 @@ -Galaxy tool to find ORFs or simple CDSs -======================================= - -This tool is copyright 2011-2013 by Peter Cock, The James Hutton Institute -(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. -See the licence text below. - -This tool is a short Python script (using Biopython library functions) -to search nucleotide sequences for open reading frames (ORFs) or coding -sequences (CDSs) where the first potential start codon is used. See the -help text in the XML file for more information. - -This tool is available from the Galaxy Tool Shed at: - -* http://toolshed.g2.bx.psu.edu/view/peterjc/get_orfs_or_cdss - -See also the EMBOSS tool ``getorf`` which offers similar functionality and -has also been wrapped for use within Galaxy. - - -Automated Installation -====================== - -This should be straightforward using the Galaxy Tool Shed, which should be -able to automatically install the dependency on Biopython, and then install -this tool and run its unit tests. - - -Manual Installation -=================== - -There are just two files to install to use this tool from within Galaxy: - -* get_orfs_or_cdss.py (the Python script) -* get_orfs_or_cdss.xml (the Galaxy tool definition) - -If you are installing this manually (rather than via the Tool Shed), the -suggested location is in the Galaxy folder tools/filters next to the tool -for calling sff_extract.py for converting SFF to FASTQ or FASTA + QUAL. -You will also need to modify the tools_conf.xml file to tell Galaxy to offer the -tool. One suggested location is in the filters section. Simply add the line:: - - <tool file="filters/get_orfs_or_cdss.xml" /> - -You will also need to install Biopython 1.54 or later. If you want to run -the unit tests, include this line in tools_conf.xml.sample and the sample -FASTA files under the test-data directory. Then:: - - ./run_functional_tests.sh -id get_orfs_or_cdss - -That's it. - - -History -======= - -======= ====================================================================== -Version Changes -------- ---------------------------------------------------------------------- -v0.0.1 - Initial version. -v0.0.2 - Correct labelling issue on reverse strand. - - Use the new <stdio> settings in the XML wrappers to catch errors -v0.0.3 - Include unit tests. - - Record Python script version when run from Galaxy. -v0.0.4 - Link to Tool Shed added to help text and this documentation. -v0.0.5 - Automated intallation of the Biopython dependency. - - Use reStructuredText for this README file. - - Adopt standard MIT License. -======= ====================================================================== - - -Developers -========== - -This script and related tools are being developed on the following hg branch: -http://bitbucket.org/peterjc/galaxy-central/src/tools - -For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use -the following command from the Galaxy root folder:: - - $ tar -czf get_orfs_or_cdss.tar.gz tools/filters/get_orfs_or_cdss.* tools/filters/repository_dependencies.xml test-data/get_orf_input*.fasta test-data/Ssuis.fasta - -Check this worked:: - - $ tar -tzf get_orfs_or_cdss.tar.gz - filter/get_orfs_or_cdss.py - filter/get_orfs_or_cdss.rst - filter/get_orfs_or_cdss.xml - tools/filters/repository_dependencies.xml - test-data/get_orf_input.fasta - test-data/get_orf_input.Suis_ORF.nuc.fasta - test-data/get_orf_input.Suis_ORF.prot.fasta - test-data/get_orf_input.t11_nuc_out.fasta - test-data/get_orf_input.t11_open_nuc_out.fasta - test-data/get_orf_input.t11_open_prot_out.fasta - test-data/get_orf_input.t11_prot_out.fasta - test-data/get_orf_input.t1_nuc_out.fasta - test-data/get_orf_input.t1_prot_out.fasta - test-data/Ssuis.fasta - - -Licence (MIT) -============= - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE.
--- a/tools/filters/get_orfs_or_cdss.xml Mon Jul 29 09:30:44 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,166 +0,0 @@ -<tool id="get_orfs_or_cdss" name="Get open reading frames (ORFs) or coding sequences (CDSs)" version="0.0.5"> - <description>e.g. to get peptides from ESTs</description> - <version_command interpreter="python">get_orfs_or_cdss.py --version</version_command> - <command interpreter="python"> -get_orfs_or_cdss.py $input_file $input_file.ext $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file - </command> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range="1:" /> - <exit_code range=":-1" /> - </stdio> - <inputs> - <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file (nucleotides)" help="FASTA, FASTQ, or SFF format." /> - <param name="table" type="select" label="Genetic code" help="Tables from the NCBI, these determine the start and stop codons"> - <option value="1">1. Standard</option> - <option value="2">2. Vertebrate Mitochondrial</option> - <option value="3">3. Yeast Mitochondrial</option> - <option value="4">4. Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma</option> - <option value="5">5. Invertebrate Mitochondrial</option> - <option value="6">6. Ciliate Macronuclear and Dasycladacean</option> - <option value="9">9. Echinoderm Mitochondrial</option> - <option value="10">10. Euplotid Nuclear</option> - <option value="11">11. Bacterial</option> - <option value="12">12. Alternative Yeast Nuclear</option> - <option value="13">13. Ascidian Mitochondrial</option> - <option value="14">14. Flatworm Mitochondrial</option> - <option value="15">15. Blepharisma Macronuclear</option> - <option value="16">16. Chlorophycean Mitochondrial</option> - <option value="21">21. Trematode Mitochondrial</option> - <option value="22">22. Scenedesmus obliquus</option> - <option value="23">23. Thraustochytrium Mitochondrial</option> - </param> - <param name="ftype" type="select" value="True" label="Look for ORFs or CDSs"> - <option value="ORF">Look for ORFs (check for stop codons only, ignore start codons)</option> - <option value="CDS">Look for CDSs (with start and stop codons)</option> - </param> - <param name="ends" type="select" value="open" label="Sequence end treatment"> - <option value="open">Open ended (will allow missing start/stop codons at the ends)</option> - <option value="closed">Complete (will check for start/stop codons at the ends)</option> - <!-- TODO? Circular, for using this on finished bacteria etc --> - </param> - - <param name="mode" type="select" label="Selection criteria" help="Suppose a sequence has ORFs/CDSs of lengths 100, 102 and 102 -- which should be taken? These options would return 3, 2 or 1 ORF."> - <option value="all">All ORFs/CDSs from each sequence</option> - <option value="top">All ORFs/CDSs from each sequence with the maximum length</option> - <option value="one">First ORF/CDS from each sequence with the maximum length</option> - </param> - <param name="min_len" type="integer" size="5" value="30" label="Minimum length ORF/CDS (in amino acids, e.g. 30 aa = 90 bp plus any stop codon)"> - </param> - <param name="strand" type="select" label="Strand to search" help="Use the forward only option if your sequence directionality is known (e.g. from poly-A tails, or strand specific RNA sequencing."> - <option value="both">Search both the forward and reverse strand</option> - <option value="forward">Only search the forward strand</option> - <option value="reverse">Only search the reverse strand</option> - </param> - </inputs> - <outputs> - <data name="out_nuc_file" format="fasta" label="${ftype.value}s (nucleotides)" /> - <data name="out_prot_file" format="fasta" label="${ftype.value}s (amino acids)" /> - </outputs> - <tests> - <test> - <param name="input_file" value="get_orf_input.fasta" /> - <param name="table" value="1" /> - <param name="ftype" value="CDS" /> - <param name="ends" value="open" /> - <param name="mode" value="all" /> - <param name="min_len" value="10" /> - <param name="strand" value="forward" /> - <output name="out_nuc_file" file="get_orf_input.t1_nuc_out.fasta" /> - <output name="out_prot_file" file="get_orf_input.t1_prot_out.fasta" /> - </test> - <test> - <param name="input_file" value="get_orf_input.fasta" /> - <param name="table" value="11" /> - <param name="ftype" value="CDS" /> - <param name="ends" value="closed" /> - <param name="mode" value="all" /> - <param name="min_len" value="10" /> - <param name="strand" value="forward" /> - <output name="out_nuc_file" file="get_orf_input.t11_nuc_out.fasta" /> - <output name="out_prot_file" file="get_orf_input.t11_prot_out.fasta" /> - </test> - <test> - <param name="input_file" value="get_orf_input.fasta" /> - <param name="table" value="11" /> - <param name="ftype" value="CDS" /> - <param name="ends" value="open" /> - <param name="mode" value="all" /> - <param name="min_len" value="10" /> - <param name="strand" value="forward" /> - <output name="out_nuc_file" file="get_orf_input.t11_open_nuc_out.fasta" /> - <output name="out_prot_file" file="get_orf_input.t11_open_prot_out.fasta" /> - </test> - <test> - <param name="input_file" value="Ssuis.fasta" /> - <param name="table" value="11" /> - <param name="ftype" value="ORF" /> - <param name="ends" value="open" /> - <param name="mode" value="all" /> - <param name="min_len" value="100" /> - <param name="strand" value="both" /> - <output name="out_nuc_file" file="get_orf_input.Suis_ORF.nuc.fasta" /> - <output name="out_prot_file" file="get_orf_input.Suis_ORF.prot.fasta" /> - </test> - </tests> - <requirements> - <requirement type="python-module">Bio</requirement> - </requirements> - <help> - -**What it does** - -Takes an input file of nucleotide sequences (typically FASTA, but also FASTQ -and Standard Flowgram Format (SFF) are supported), and searches each sequence -for open reading frames (ORFs) or potential coding sequences (CDSs) of the -given minimum length. These are returned as FASTA files of nucleotides and -protein sequences. - -You can choose to have all the ORFs/CDSs above the minimum length for each -sequence (similar to the EMBOSS getorf tool), those with the longest length -equal, or the first ORF/CDS with the longest length (in the special case -where a sequence encodes two or more long ORFs/CDSs of the same length). The -last option is a reasonable choice when the input sequences represent EST or -mRNA sequences, where only one ORF/CDS is expected. - -Note that if no ORFs/CDSs in a sequence match the criteria, there will be no -output for that sequence. - -Also note that the ORFs/CDSs are assigned modified identifiers to distinguish -them from the original full length sequences, by appending a suffix. - -The start and stop codons are taken from the `NCBI Genetic Codes -<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>`_. -When searching for ORFs, the sequences will run from stop codon to stop -codon, and any start codons are ignored. When searching for CDSs, the first -potential start codon will be used, giving the longest possible CDS within -each ORF, and thus the longest possible protein sequence. This is useful -for things like BLAST or domain searching, but since this may not be the -correct start codon may not be appropriate for signal peptide detection -etc. - -**Example Usage** - -Given some EST sequences (Sanger capillary reads) assembled into unigenes, -or a transcriptome assembly from some RNA-Seq, each of your nucleotide -sequences should (barring sequencing, assembly errors, frame-shifts etc) -encode one protein as a single ORF/CDS, which you wish to extract (and -perhaps translate into amino acids). - -If your RNS-Seq data was strand specific, and assembled taking this into -account, you should only search for ORFs/CDSs on the forward strand. - -**Citation** - -This tool uses Biopython. If you use this tool in scientific work leading -to a publication, please cite the Biopython application note (and Galaxy -too of course): - -Cock et al 2009. Biopython: freely available Python tools for computational -molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. - -This tool is available to install into other Galaxy Instances via the Galaxy -Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/get_orfs_or_cdss - </help> -</tool>
--- a/tools/filters/repository_dependencies.xml Mon Jul 29 09:30:44 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -<?xml version="1.0"?> -<repositories description="This requires Biopython as a dependency."> -<!-- Leave out the tool shed and revision to get the current - tool shed and latest revision at the time of upload --> -<repository changeset_revision="5d0c54f7fea2" name="package_biopython_1_61" owner="biopython" toolshed="http://toolshed.g2.bx.psu.edu" /> -</repositories>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/get_orfs_or_cdss/README.rst Mon Oct 28 05:19:38 2013 -0400 @@ -0,0 +1,125 @@ +Galaxy tool to find ORFs or simple CDSs +======================================= + +This tool is copyright 2011-2013 by Peter Cock, The James Hutton Institute +(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +See the licence text below (MIT licence). + +This tool is a short Python script (using Biopython library functions) +to search nucleotide sequences for open reading frames (ORFs) or coding +sequences (CDSs) where the first potential start codon is used. See the +help text in the XML file for more information. + +This tool is available from the Galaxy Tool Shed at: + +* http://toolshed.g2.bx.psu.edu/view/peterjc/get_orfs_or_cdss + +See also the EMBOSS tool ``getorf`` which offers similar functionality and +has also been wrapped for use within Galaxy. + + +Automated Installation +====================== + +This should be straightforward using the Galaxy Tool Shed, which should be +able to automatically install the dependency on Biopython, and then install +this tool and run its unit tests. + + +Manual Installation +=================== + +There are just two files to install to use this tool from within Galaxy: + +* get_orfs_or_cdss.py (the Python script) +* get_orfs_or_cdss.xml (the Galaxy tool definition) + +The suggested location is in a dedicated tools/get_orfs_or_cdss folder. + +You will also need to modify the tools_conf.xml file to tell Galaxy to offer the +tool. One suggested location is in the filters section. Simply add the line:: + + <tool file="get_orfs_or_cdss/get_orfs_or_cdss.xml" /> + +You will also need to install Biopython 1.54 or later. If you want to run +the unit tests, include this line in tools_conf.xml.sample and the sample +FASTA files under the test-data directory. Then:: + + ./run_functional_tests.sh -id get_orfs_or_cdss + +That's it. + + +History +======= + +======= ====================================================================== +Version Changes +------- ---------------------------------------------------------------------- +v0.0.1 - Initial version. +v0.0.2 - Correct labelling issue on reverse strand. + - Use the new <stdio> settings in the XML wrappers to catch errors +v0.0.3 - Include unit tests. + - Record Python script version when run from Galaxy. +v0.0.4 - Link to Tool Shed added to help text and this documentation. +v0.0.5 - Automated intallation of the Biopython dependency. + - Use reStructuredText for this README file. + - Adopt standard MIT License. + - Updated citation information (Cock et al. 2013). + - Renamed folder and adopted README.rst naming. +======= ====================================================================== + + +Developers +========== + +This script and related tools were initially developed on the following hg branch: +http://bitbucket.org/peterjc/galaxy-central/src/tools + +Development has now moved to a dedicated GitHub repository: +https://github.com/peterjc/pico_galaxy/tree/master/tools + +For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use +the following command from the Galaxy root folder:: + + $ tar -czf get_orfs_or_cdss.tar.gz tools/get_orfs_or_cdss/README.rst tools/get_orfs_or_cdss/get_orfs_or_cdss.* tools/get_orfs_or_cdss/repository_dependencies.xml test-data/get_orf_input*.fasta test-data/Ssuis.fasta + +Check this worked:: + + $ tar -tzf get_orfs_or_cdss.tar.gz + tools/get_orfs_or_cdss/README.rst + tools/get_orfs_or_cdss/get_orfs_or_cdss.py + tools/get_orfs_or_cdss/get_orfs_or_cdss.xml + tools/get_orfs_or_cdss/repository_dependencies.xml + test-data/get_orf_input.fasta + test-data/get_orf_input.Suis_ORF.nuc.fasta + test-data/get_orf_input.Suis_ORF.prot.fasta + test-data/get_orf_input.t11_nuc_out.fasta + test-data/get_orf_input.t11_open_nuc_out.fasta + test-data/get_orf_input.t11_open_prot_out.fasta + test-data/get_orf_input.t11_prot_out.fasta + test-data/get_orf_input.t1_nuc_out.fasta + test-data/get_orf_input.t1_prot_out.fasta + test-data/Ssuis.fasta + + +Licence (MIT) +============= + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.py Mon Oct 28 05:19:38 2013 -0400 @@ -0,0 +1,223 @@ +#!/usr/bin/env python +"""Find ORFs in a nucleotide sequence file. + +get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file + +Takes ten command line options, input sequence filename, format, genetic +code, CDS vs ORF, end type (open, closed), selection mode (all, top, one), +minimum length (in amino acids), strand (both, forward, reverse), output +nucleotide filename, and output protein filename. + +This tool is a short Python script which requires Biopython. If you use +this tool in scientific work leading to a publication, please cite the +Biopython application note: + +Cock et al 2009. Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + +This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute +(formerly SCRI), Dundee, UK. All rights reserved. + +See accompanying text file for licence details (MIT/BSD style). + +This is version 0.0.3 of the script. +""" +import sys +import re + +if "-v" in sys.argv or "--version" in sys.argv: + print "v0.0.3" + sys.exit(0) + +def stop_err(msg, err=1): + sys.stderr.write(msg.rstrip() + "\n") + sys.exit(err) + +try: + from Bio.Seq import Seq, reverse_complement, translate + from Bio.SeqRecord import SeqRecord + from Bio import SeqIO + from Bio.Data import CodonTable +except ImportError: + stop_err("Missing Biopython library") + +#Parse Command Line +try: + input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:] +except ValueError: + stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) + +try: + table = int(table) +except ValueError: + stop_err("Expected integer for genetic code table, got %s" % table) + +try: + table_obj = CodonTable.ambiguous_generic_by_id[table] +except KeyError: + stop_err("Unknown codon table %i" % table) + +if ftype not in ["CDS", "ORF"]: + stop_err("Expected CDS or ORF, got %s" % ftype) + +if ends not in ["open", "closed"]: + stop_err("Expected open or closed for end treatment, got %s" % ends) + +try: + min_len = int(min_len) +except ValueError: + stop_err("Expected integer for min_len, got %s" % min_len) + +if seq_format.lower()=="sff": + seq_format = "sff-trim" +elif seq_format.lower()=="fasta": + seq_format = "fasta" +elif seq_format.lower().startswith("fastq"): + seq_format = "fastq" +else: + stop_err("Unsupported file type %r" % seq_format) + +print "Genetic code table %i" % table +print "Minimum length %i aa" % min_len +#print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) + +starts = sorted(table_obj.start_codons) +assert "NNN" not in starts +re_starts = re.compile("|".join(starts)) + +stops = sorted(table_obj.stop_codons) +assert "NNN" not in stops +re_stops = re.compile("|".join(stops)) + +def start_chop_and_trans(s, strict=True): + """Returns offset, trimmed nuc, protein.""" + if strict: + assert s[-3:] in stops, s + assert len(s) % 3 == 0 + for match in re_starts.finditer(s): + #Must check the start is in frame + start = match.start() + if start % 3 == 0: + n = s[start:] + assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) + if strict: + t = translate(n, table, cds=True) + else: + #Use when missing stop codon, + t = "M" + translate(n[3:], table, to_stop=True) + return start, n, t + return None, None, None + +def break_up_frame(s): + """Returns offset, nuc, protein.""" + start = 0 + for match in re_stops.finditer(s): + index = match.start() + 3 + if index % 3 != 0: + continue + n = s[start:index] + if ftype=="CDS": + offset, n, t = start_chop_and_trans(n) + else: + offset = 0 + t = translate(n, table, to_stop=True) + if n and len(t) >= min_len: + yield start + offset, n, t + start = index + if ends == "open": + #No stop codon, Biopython's strict CDS translate will fail + n = s[start:] + #Ensure we have whole codons + #TODO - Try appending N instead? + #TODO - Do the next four lines more elegantly + if len(n) % 3: + n = n[:-1] + if len(n) % 3: + n = n[:-1] + if ftype=="CDS": + offset, n, t = start_chop_and_trans(n, strict=False) + else: + offset = 0 + t = translate(n, table, to_stop=True) + if n and len(t) >= min_len: + yield start + offset, n, t + + +def get_all_peptides(nuc_seq): + """Returns start, end, strand, nucleotides, protein. + + Co-ordinates are Python style zero-based. + """ + #TODO - Refactor to use a generator function (in start order) + #rather than making a list and sorting? + answer = [] + full_len = len(nuc_seq) + if strand != "reverse": + for frame in range(0,3): + for offset, n, t in break_up_frame(nuc_seq[frame:]): + start = frame + offset #zero based + answer.append((start, start + len(n), +1, n, t)) + if strand != "forward": + rc = reverse_complement(nuc_seq) + for frame in range(0,3) : + for offset, n, t in break_up_frame(rc[frame:]): + start = full_len - frame - offset #zero based + answer.append((start - len(n), start, -1, n ,t)) + answer.sort() + return answer + +def get_top_peptides(nuc_seq): + """Returns all peptides of max length.""" + values = list(get_all_peptides(nuc_seq)) + if not values: + raise StopIteration + max_len = max(len(x[-1]) for x in values) + for x in values: + if len(x[-1]) == max_len: + yield x + +def get_one_peptide(nuc_seq): + """Returns first (left most) peptide with max length.""" + values = list(get_top_peptides(nuc_seq)) + if not values: + raise StopIteration + yield values[0] + +if mode == "all": + get_peptides = get_all_peptides +elif mode == "top": + get_peptides = get_top_peptides +elif mode == "one": + get_peptides = get_one_peptide + +in_count = 0 +out_count = 0 +if out_nuc_file == "-": + out_nuc = sys.stdout +else: + out_nuc = open(out_nuc_file, "w") +if out_prot_file == "-": + out_prot = sys.stdout +else: + out_prot = open(out_prot_file, "w") +for record in SeqIO.parse(input_file, seq_format): + for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): + out_count += 1 + if f_strand == +1: + loc = "%i..%i" % (f_start+1, f_end) + else: + loc = "complement(%i..%i)" % (f_start+1, f_end) + descr = "length %i aa, %i bp, from %s of %s" \ + % (len(t), len(n), loc, record.description) + r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) + t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) + SeqIO.write(r, out_nuc, "fasta") + SeqIO.write(t, out_prot, "fasta") + in_count += 1 +if out_nuc is not sys.stdout: + out_nuc.close() +if out_prot is not sys.stdout: + out_prot.close() + +print "Found %i %ss in %i sequences" % (out_count, ftype, in_count)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.xml Mon Oct 28 05:19:38 2013 -0400 @@ -0,0 +1,171 @@ +<tool id="get_orfs_or_cdss" name="Get open reading frames (ORFs) or coding sequences (CDSs)" version="0.0.5"> + <description>e.g. to get peptides from ESTs</description> + <requirements> + <requirement type="package" version="1.62">biopython</requirement> + <requirement type="python-module">Bio</requirement> + </requirements> + <version_command interpreter="python">get_orfs_or_cdss.py --version</version_command> + <command interpreter="python"> +get_orfs_or_cdss.py $input_file $input_file.ext $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file + </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> + <inputs> + <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file (nucleotides)" help="FASTA, FASTQ, or SFF format." /> + <param name="table" type="select" label="Genetic code" help="Tables from the NCBI, these determine the start and stop codons"> + <option value="1">1. Standard</option> + <option value="2">2. Vertebrate Mitochondrial</option> + <option value="3">3. Yeast Mitochondrial</option> + <option value="4">4. Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma</option> + <option value="5">5. Invertebrate Mitochondrial</option> + <option value="6">6. Ciliate Macronuclear and Dasycladacean</option> + <option value="9">9. Echinoderm Mitochondrial</option> + <option value="10">10. Euplotid Nuclear</option> + <option value="11">11. Bacterial</option> + <option value="12">12. Alternative Yeast Nuclear</option> + <option value="13">13. Ascidian Mitochondrial</option> + <option value="14">14. Flatworm Mitochondrial</option> + <option value="15">15. Blepharisma Macronuclear</option> + <option value="16">16. Chlorophycean Mitochondrial</option> + <option value="21">21. Trematode Mitochondrial</option> + <option value="22">22. Scenedesmus obliquus</option> + <option value="23">23. Thraustochytrium Mitochondrial</option> + </param> + <param name="ftype" type="select" value="True" label="Look for ORFs or CDSs"> + <option value="ORF">Look for ORFs (check for stop codons only, ignore start codons)</option> + <option value="CDS">Look for CDSs (with start and stop codons)</option> + </param> + <param name="ends" type="select" value="open" label="Sequence end treatment"> + <option value="open">Open ended (will allow missing start/stop codons at the ends)</option> + <option value="closed">Complete (will check for start/stop codons at the ends)</option> + <!-- TODO? Circular, for using this on finished bacteria etc --> + </param> + <param name="mode" type="select" label="Selection criteria" help="Suppose a sequence has ORFs/CDSs of lengths 100, 102 and 102 -- which should be taken? These options would return 3, 2 or 1 ORF."> + <option value="all">All ORFs/CDSs from each sequence</option> + <option value="top">All ORFs/CDSs from each sequence with the maximum length</option> + <option value="one">First ORF/CDS from each sequence with the maximum length</option> + </param> + <param name="min_len" type="integer" size="5" value="30" label="Minimum length ORF/CDS (in amino acids, e.g. 30 aa = 90 bp plus any stop codon)" /> + <param name="strand" type="select" label="Strand to search" help="Use the forward only option if your sequence directionality is known (e.g. from poly-A tails, or strand specific RNA sequencing."> + <option value="both">Search both the forward and reverse strand</option> + <option value="forward">Only search the forward strand</option> + <option value="reverse">Only search the reverse strand</option> + </param> + </inputs> + <outputs> + <data name="out_nuc_file" format="fasta" label="${ftype.value}s (nucleotides)" /> + <data name="out_prot_file" format="fasta" label="${ftype.value}s (amino acids)" /> + </outputs> + <tests> + <test> + <param name="input_file" value="get_orf_input.fasta" /> + <param name="table" value="1" /> + <param name="ftype" value="CDS" /> + <param name="ends" value="open" /> + <param name="mode" value="all" /> + <param name="min_len" value="10" /> + <param name="strand" value="forward" /> + <output name="out_nuc_file" file="get_orf_input.t1_nuc_out.fasta" /> + <output name="out_prot_file" file="get_orf_input.t1_prot_out.fasta" /> + </test> + <test> + <param name="input_file" value="get_orf_input.fasta" /> + <param name="table" value="11" /> + <param name="ftype" value="CDS" /> + <param name="ends" value="closed" /> + <param name="mode" value="all" /> + <param name="min_len" value="10" /> + <param name="strand" value="forward" /> + <output name="out_nuc_file" file="get_orf_input.t11_nuc_out.fasta" /> + <output name="out_prot_file" file="get_orf_input.t11_prot_out.fasta" /> + </test> + <test> + <param name="input_file" value="get_orf_input.fasta" /> + <param name="table" value="11" /> + <param name="ftype" value="CDS" /> + <param name="ends" value="open" /> + <param name="mode" value="all" /> + <param name="min_len" value="10" /> + <param name="strand" value="forward" /> + <output name="out_nuc_file" file="get_orf_input.t11_open_nuc_out.fasta" /> + <output name="out_prot_file" file="get_orf_input.t11_open_prot_out.fasta" /> + </test> + <test> + <param name="input_file" value="Ssuis.fasta" /> + <param name="table" value="11" /> + <param name="ftype" value="ORF" /> + <param name="ends" value="open" /> + <param name="mode" value="all" /> + <param name="min_len" value="100" /> + <param name="strand" value="both" /> + <output name="out_nuc_file" file="get_orf_input.Suis_ORF.nuc.fasta" /> + <output name="out_prot_file" file="get_orf_input.Suis_ORF.prot.fasta" /> + </test> + </tests> + <help> +**What it does** + +Takes an input file of nucleotide sequences (typically FASTA, but also FASTQ +and Standard Flowgram Format (SFF) are supported), and searches each sequence +for open reading frames (ORFs) or potential coding sequences (CDSs) of the +given minimum length. These are returned as FASTA files of nucleotides and +protein sequences. + +You can choose to have all the ORFs/CDSs above the minimum length for each +sequence (similar to the EMBOSS getorf tool), those with the longest length +equal, or the first ORF/CDS with the longest length (in the special case +where a sequence encodes two or more long ORFs/CDSs of the same length). The +last option is a reasonable choice when the input sequences represent EST or +mRNA sequences, where only one ORF/CDS is expected. + +Note that if no ORFs/CDSs in a sequence match the criteria, there will be no +output for that sequence. + +Also note that the ORFs/CDSs are assigned modified identifiers to distinguish +them from the original full length sequences, by appending a suffix. + +The start and stop codons are taken from the `NCBI Genetic Codes +<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>`_. +When searching for ORFs, the sequences will run from stop codon to stop +codon, and any start codons are ignored. When searching for CDSs, the first +potential start codon will be used, giving the longest possible CDS within +each ORF, and thus the longest possible protein sequence. This is useful +for things like BLAST or domain searching, but since this may not be the +correct start codon may not be appropriate for signal peptide detection +etc. + +**Example Usage** + +Given some EST sequences (Sanger capillary reads) assembled into unigenes, +or a transcriptome assembly from some RNA-Seq, each of your nucleotide +sequences should (barring sequencing, assembly errors, frame-shifts etc) +encode one protein as a single ORF/CDS, which you wish to extract (and +perhaps translate into amino acids). + +If your RNS-Seq data was strand specific, and assembled taking this into +account, you should only search for ORFs/CDSs on the forward strand. + +**Citation** + +If you use this Galaxy tool in work leading to a scientific publication please +cite the following paper: + +Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). +Galaxy tools and workflows for sequence analysis with applications +in molecular plant pathology. PeerJ 1:e167 +http://dx.doi.org/10.7717/peerj.167 + +This tool uses Biopython, so you may also wish to cite the Biopython +application note (and Galaxy too of course): + +Cock et al (2009). Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + +This tool is available to install into other Galaxy Instances via the Galaxy +Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/get_orfs_or_cdss + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/get_orfs_or_cdss/repository_dependencies.xml Mon Oct 28 05:19:38 2013 -0400 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<repositories description="This requires Biopython as a dependency."> +<!-- Leave out the tool shed and revision to get the current + tool shed and latest revision at the time of upload --> +<repository changeset_revision="3e82cbc44886" name="package_biopython_1_62" owner="biopython" toolshed="http://toolshed.g2.bx.psu.edu" /> +</repositories>