Mercurial > repos > peterjc > get_orfs_or_cdss
changeset 12:71905a6d52a7 draft default tip
"Update all the pico_galaxy tools on main Tool Shed"
author | peterjc |
---|---|
date | Fri, 16 Apr 2021 22:37:04 +0000 (2021-04-16) |
parents | d51db443aaa4 |
children | |
files | tools/get_orfs_or_cdss/get_orfs_or_cdss.py tools/get_orfs_or_cdss/get_orfs_or_cdss.xml tools/get_orfs_or_cdss/tool_dependencies.xml |
diffstat | 3 files changed, 147 insertions(+), 57 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/get_orfs_or_cdss/get_orfs_or_cdss.py Wed May 30 08:33:20 2018 -0400 +++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.py Fri Apr 16 22:37:04 2021 +0000 @@ -10,7 +10,7 @@ Cock et al 2009. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. +https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878. This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute (formerly SCRI), Dundee, UK. All rights reserved. @@ -42,43 +42,100 @@ parser = OptionParser(usage=usage) -parser.add_option('-i', '--input', dest='input_file', - default=None, help='Input fasta file', - metavar='FILE') -parser.add_option('-f', '--format', dest='seq_format', - default='fasta', help='Sequence format (e.g. fasta, fastq, sff)') -parser.add_option('--table', dest='table', - default=1, help='NCBI Translation table', type='int') -parser.add_option('-t', '--ftype', dest='ftype', type='choice', - choices=['CDS', 'ORF'], default='ORF', - help='Find ORF or CDSs') -parser.add_option('-e', '--ends', dest='ends', type='choice', - choices=['open', 'closed'], default='closed', - help='Open or closed. Closed ensures start/stop codons are present') -parser.add_option('-m', '--mode', dest='mode', type='choice', - choices=['all', 'top', 'one'], default='all', - help='Output all ORFs/CDSs from sequence, all ORFs/CDSs ' - 'with max length, or first with maximum length') -parser.add_option('--min_len', dest='min_len', - default=10, help='Minimum ORF/CDS length', type='int') -parser.add_option('-s', '--strand', dest='strand', type='choice', - choices=['forward', 'reverse', 'both'], default='both', - help='Strand to search for features on') -parser.add_option('--on', dest='out_nuc_file', - default=None, help='Output nucleotide sequences, or - for STDOUT', - metavar='FILE') -parser.add_option('--op', dest='out_prot_file', - default=None, help='Output protein sequences, or - for STDOUT', - metavar='FILE') -parser.add_option('--ob', dest='out_bed_file', - default=None, help='Output BED file, or - for STDOUT', - metavar='FILE') -parser.add_option('--og', dest='out_gff3_file', - default=None, help='Output GFF3 file, or - for STDOUT', - metavar='FILE') -parser.add_option('-v', '--version', dest='version', - default=False, action='store_true', - help='Show version and quit') +parser.add_option( + "-i", + "--input", + dest="input_file", + default=None, + help="Input fasta file", + metavar="FILE", +) +parser.add_option( + "-f", + "--format", + dest="seq_format", + default="fasta", + help="Sequence format (e.g. fasta, fastq, sff)", +) +parser.add_option( + "--table", dest="table", default=1, help="NCBI Translation table", type="int" +) +parser.add_option( + "-t", + "--ftype", + dest="ftype", + type="choice", + choices=["CDS", "ORF"], + default="ORF", + help="Find ORF or CDSs", +) +parser.add_option( + "-e", + "--ends", + dest="ends", + type="choice", + choices=["open", "closed"], + default="closed", + help="Open or closed. Closed ensures start/stop codons are present", +) +parser.add_option( + "-m", + "--mode", + dest="mode", + type="choice", + choices=["all", "top", "one"], + default="all", + help="Output all ORFs/CDSs from sequence, all ORFs/CDSs " + "with max length, or first with maximum length", +) +parser.add_option( + "--min_len", dest="min_len", default=10, help="Minimum ORF/CDS length", type="int" +) +parser.add_option( + "-s", + "--strand", + dest="strand", + type="choice", + choices=["forward", "reverse", "both"], + default="both", + help="Strand to search for features on", +) +parser.add_option( + "--on", + dest="out_nuc_file", + default=None, + help="Output nucleotide sequences, or - for STDOUT", + metavar="FILE", +) +parser.add_option( + "--op", + dest="out_prot_file", + default=None, + help="Output protein sequences, or - for STDOUT", + metavar="FILE", +) +parser.add_option( + "--ob", + dest="out_bed_file", + default=None, + help="Output BED file, or - for STDOUT", + metavar="FILE", +) +parser.add_option( + "--og", + dest="out_gff3_file", + default=None, + help="Output GFF3 file, or - for STDOUT", + metavar="FILE", +) +parser.add_option( + "-v", + "--version", + dest="version", + default=False, + action="store_true", + help="Show version and quit", +) options, args = parser.parse_args() @@ -89,7 +146,14 @@ if not options.input_file: sys.exit("Input file is required") -if not any((options.out_nuc_file, options.out_prot_file, options.out_bed_file, options.out_gff3_file)): +if not any( + ( + options.out_nuc_file, + options.out_prot_file, + options.out_bed_file, + options.out_gff3_file, + ) +): sys.exit("At least one output file is required") try: @@ -120,7 +184,7 @@ def start_chop_and_trans(s, strict=True): - """Returns offset, trimmed nuc, protein.""" + """Return offset, trimmed nuc, protein.""" if strict: assert s[-3:] in stops, s assert len(s) % 3 == 0 @@ -140,7 +204,7 @@ def break_up_frame(s): - """Returns offset, nuc, protein.""" + """Return offset, nuc, protein.""" start = 0 for match in re_stops.finditer(s): index = match.start() + 3 @@ -175,7 +239,7 @@ def get_all_peptides(nuc_seq): - """Returns start, end, strand, nucleotides, protein. + """Return start, end, strand, nucleotides, protein. Co-ordinates are Python style zero-based. """ @@ -199,7 +263,7 @@ def get_top_peptides(nuc_seq): - """Returns all peptides of max length.""" + """Return all peptides of max length.""" values = list(get_all_peptides(nuc_seq)) if not values: raise StopIteration @@ -210,7 +274,7 @@ def get_one_peptide(nuc_seq): - """Returns first (left most) peptide with max length.""" + """Return first (left most) peptide with max length.""" values = list(get_top_peptides(nuc_seq)) if not values: raise StopIteration @@ -255,17 +319,23 @@ out_gff3 = None if out_gff3: - out_gff3.write('##gff-version 3\n') + out_gff3.write("##gff-version 3\n") for record in SeqIO.parse(options.input_file, seq_format): - for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): + for i, (f_start, f_end, f_strand, n, t) in enumerate( + get_peptides(str(record.seq).upper()) + ): out_count += 1 if f_strand == +1: loc = "%i..%i" % (f_start + 1, f_end) else: loc = "complement(%i..%i)" % (f_start + 1, f_end) - descr = "length %i aa, %i bp, from %s of %s" \ - % (len(t), len(n), loc, record.description) + descr = "length %i aa, %i bp, from %s of %s" % ( + len(t), + len(n), + loc, + record.description, + ) fid = record.id + "|%s%i" % (options.ftype, i + 1) r = SeqRecord(Seq(n), id=fid, name="", description=descr) t = SeqRecord(Seq(t), id=fid, name="", description=descr) @@ -273,12 +343,32 @@ SeqIO.write(r, out_nuc, "fasta") if out_prot: SeqIO.write(t, out_prot, "fasta") - nice_strand = '+' if f_strand == +1 else '-' + nice_strand = "+" if f_strand == +1 else "-" if out_bed: - out_bed.write('\t'.join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + '\n') + out_bed.write( + "\t".join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + + "\n" + ) if out_gff3: - out_gff3.write('\t'.join(map(str, [record.id, 'getOrfsOrCds', 'CDS', f_start + 1, f_end, '.', - nice_strand, 0, 'ID=%s%s' % (options.ftype, i + 1)])) + '\n') + out_gff3.write( + "\t".join( + map( + str, + [ + record.id, + "getOrfsOrCds", + "CDS", + f_start + 1, + f_end, + ".", + nice_strand, + 0, + "ID=%s%s" % (options.ftype, i + 1), + ], + ) + ) + + "\n" + ) in_count += 1 if out_nuc and out_nuc is not sys.stdout: out_nuc.close()
--- a/tools/get_orfs_or_cdss/get_orfs_or_cdss.xml Wed May 30 08:33:20 2018 -0400 +++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.xml Fri Apr 16 22:37:04 2021 +0000 @@ -184,14 +184,14 @@ Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). Galaxy tools and workflows for sequence analysis with applications in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 +https://doi.org/10.7717/peerj.167 This tool uses Biopython, so you may also wish to cite the Biopython application note (and Galaxy too of course): Cock et al (2009). Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. +https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878. This tool is available to install into other Galaxy Instances via the Galaxy Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/get_orfs_or_cdss
--- a/tools/get_orfs_or_cdss/tool_dependencies.xml Wed May 30 08:33:20 2018 -0400 +++ b/tools/get_orfs_or_cdss/tool_dependencies.xml Fri Apr 16 22:37:04 2021 +0000 @@ -1,6 +1,6 @@ -<?xml version="1.0"?> +<?xml version="1.0" ?> <tool_dependency> <package name="biopython" version="1.67"> - <repository changeset_revision="a12f73c3b116" name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="a12f73c3b116"/> </package> -</tool_dependency> +</tool_dependency> \ No newline at end of file