Mercurial > repos > peterjc > get_orfs_or_cdss
changeset 8:09a8be9247ca draft
v0.2.0 with GFF3 output
author | peterjc |
---|---|
date | Sat, 09 Jan 2016 23:42:32 -0500 |
parents | 705a2e2df7fb |
children | a06ad07431ba |
files | tools/get_orfs_or_cdss/README.rst tools/get_orfs_or_cdss/get_orfs_or_cdss.py tools/get_orfs_or_cdss/get_orfs_or_cdss.xml tools/get_orfs_or_cdss/tool_dependencies.xml |
diffstat | 4 files changed, 85 insertions(+), 48 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/get_orfs_or_cdss/README.rst Thu Jul 30 12:35:31 2015 -0400 +++ b/tools/get_orfs_or_cdss/README.rst Sat Jan 09 23:42:32 2016 -0500 @@ -3,6 +3,7 @@ This tool is copyright 2011-2015 by Peter Cock, The James Hutton Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +Additions copyright 2015-2016 by Eric Rasche. See the licence text below (MIT licence). This tool is a short Python script (using Biopython library functions) @@ -75,6 +76,7 @@ - Using ``optparse`` for the Python command line API (Eric Rasche). - Added NCBI genetic code table 24, Pterobranchia Mitochondrial. v0.1.1 - Reorder XML elements (internal change only). +v0.2.0 - Tool now also outputs GFF3 formatted calls (Eric Rasche). ======= ====================================================================== @@ -91,12 +93,12 @@ Planemo commands (which requires you have set your Tool Shed access details in ``~/.planemo.yml`` and that you have access rights on the Tool Shed):: - $ planemo shed_update --shed_target testtoolshed --check_diff ~/repositories/pico_galaxy/tools/get_orfs_or_cdss/ + $ planemo shed_update -t testtoolshed --check_diff ~/repositories/pico_galaxy/tools/get_orfs_or_cdss/ ... or:: - $ planemo shed_update --shed_target toolshed --check_diff ~/repositories/pico_galaxy/tools/get_orfs_or_cdss/ + $ planemo shed_update -t toolshed --check_diff ~/repositories/pico_galaxy/tools/get_orfs_or_cdss/ ... To just build and check the tar ball, use::
--- a/tools/get_orfs_or_cdss/get_orfs_or_cdss.py Thu Jul 30 12:35:31 2015 -0400 +++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.py Sat Jan 09 23:42:32 2016 -0500 @@ -16,20 +16,14 @@ (formerly SCRI), Dundee, UK. All rights reserved. See accompanying text file for licence details (MIT licence). - -This is version 0.1.0 of the script. """ import sys import re from optparse import OptionParser -def sys_exit(msg, err=1): - sys.stderr.write(msg.rstrip() + "\n") - sys.exit(err) - usage = """Use as follows: -$ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed +$ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed --og cds.gff3 """ try: @@ -38,7 +32,7 @@ from Bio import SeqIO from Bio.Data import CodonTable except ImportError: - sys_exit("Missing Biopython library") + sys.exit("Missing Biopython library") parser = OptionParser(usage=usage) @@ -73,6 +67,9 @@ parser.add_option('--ob', dest='out_bed_file', default=None, help='Output BED file, or - for STDOUT', metavar='FILE') +parser.add_option('--og', dest='out_gff3_file', + default=None, help='Output GFF3 file, or - for STDOUT', + metavar='FILE') parser.add_option('-v', '--version', dest='version', default=False, action='store_true', help='Show version and quit') @@ -80,26 +77,32 @@ options, args = parser.parse_args() if options.version: - print "v0.1.0" + print("v0.2.0") sys.exit(0) +if not options.input_file: + sys.exit("Input file is required") + +if not any((options.out_nuc_file, options.out_prot_file, options.out_bed_file, options.out_gff3_file)): + sys.exit("At least one output file is required") + try: table_obj = CodonTable.ambiguous_generic_by_id[options.table] except KeyError: - sys_exit("Unknown codon table %i" % options.table) + sys.exit("Unknown codon table %i" % options.table) -if options.seq_format.lower()=="sff": +if options.seq_format.lower() == "sff": seq_format = "sff-trim" -elif options.seq_format.lower()=="fasta": +elif options.seq_format.lower() == "fasta": seq_format = "fasta" elif options.seq_format.lower().startswith("fastq"): seq_format = "fastq" else: - sys_exit("Unsupported file type %r" % options.seq_format) + sys.exit("Unsupported file type %r" % options.seq_format) print "Genetic code table %i" % options.table print "Minimum length %i aa" % options.min_len -#print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) +# print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) starts = sorted(table_obj.start_codons) assert "NNN" not in starts @@ -109,13 +112,14 @@ assert "NNN" not in stops re_stops = re.compile("|".join(stops)) + def start_chop_and_trans(s, strict=True): """Returns offset, trimmed nuc, protein.""" if strict: assert s[-3:] in stops, s assert len(s) % 3 == 0 for match in re_starts.finditer(s): - #Must check the start is in frame + # Must check the start is in frame start = match.start() if start % 3 == 0: n = s[start:] @@ -123,11 +127,12 @@ if strict: t = translate(n, options.table, cds=True) else: - #Use when missing stop codon, + # Use when missing stop codon, t = "M" + translate(n[3:], options.table, to_stop=True) return start, n, t return None, None, None + def break_up_frame(s): """Returns offset, nuc, protein.""" start = 0 @@ -136,7 +141,7 @@ if index % 3 != 0: continue n = s[start:index] - if options.ftype=="CDS": + if options.ftype == "CDS": offset, n, t = start_chop_and_trans(n) else: offset = 0 @@ -145,16 +150,16 @@ yield start + offset, n, t start = index if options.ends == "open": - #No stop codon, Biopython's strict CDS translate will fail + # No stop codon, Biopython's strict CDS translate will fail n = s[start:] - #Ensure we have whole codons - #TODO - Try appending N instead? - #TODO - Do the next four lines more elegantly + # Ensure we have whole codons + # TODO - Try appending N instead? + # TODO - Do the next four lines more elegantly if len(n) % 3: n = n[:-1] if len(n) % 3: n = n[:-1] - if options.ftype=="CDS": + if options.ftype == "CDS": offset, n, t = start_chop_and_trans(n, strict=False) else: offset = 0 @@ -168,24 +173,25 @@ Co-ordinates are Python style zero-based. """ - #TODO - Refactor to use a generator function (in start order) - #rather than making a list and sorting? + # TODO - Refactor to use a generator function (in start order) + # rather than making a list and sorting? answer = [] full_len = len(nuc_seq) if options.strand != "reverse": - for frame in range(0,3): + for frame in range(0, 3): for offset, n, t in break_up_frame(nuc_seq[frame:]): - start = frame + offset #zero based + start = frame + offset # zero based answer.append((start, start + len(n), +1, n, t)) if options.strand != "forward": rc = reverse_complement(nuc_seq) - for frame in range(0,3) : + for frame in range(0, 3): for offset, n, t in break_up_frame(rc[frame:]): - start = full_len - frame - offset #zero based - answer.append((start - len(n), start, -1, n ,t)) + start = full_len - frame - offset # zero based + answer.append((start - len(n), start, -1, n, t)) answer.sort() return answer + def get_top_peptides(nuc_seq): """Returns all peptides of max length.""" values = list(get_all_peptides(nuc_seq)) @@ -196,6 +202,7 @@ if len(x[-1]) == max_len: yield x + def get_one_peptide(nuc_seq): """Returns first (left most) peptide with max length.""" values = list(get_top_peptides(nuc_seq)) @@ -214,40 +221,63 @@ out_count = 0 if options.out_nuc_file == "-": out_nuc = sys.stdout +elif options.out_nuc_file: + out_nuc = open(options.out_nuc_file, "w") else: - out_nuc = open(options.out_nuc_file, "w") + out_nuc = None if options.out_prot_file == "-": out_prot = sys.stdout +elif options.out_prot_file: + out_prot = open(options.out_prot_file, "w") else: - out_prot = open(options.out_prot_file, "w") + out_prot = None if options.out_bed_file == "-": out_bed = sys.stdout +elif options.out_bed_file: + out_bed = open(options.out_bed_file, "w") else: - out_bed = open(options.out_bed_file, "w") + out_bed = None + +if options.out_gff3_file == "-": + out_gff3 = sys.stdout +elif options.out_gff3_file: + out_gff3 = open(options.out_gff3_file, "w") +else: + out_gff3 = None + +if out_gff3: + out_gff3.write('##gff-version 3\n') for record in SeqIO.parse(options.input_file, seq_format): for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): out_count += 1 if f_strand == +1: - loc = "%i..%i" % (f_start+1, f_end) + loc = "%i..%i" % (f_start + 1, f_end) else: - loc = "complement(%i..%i)" % (f_start+1, f_end) + loc = "complement(%i..%i)" % (f_start + 1, f_end) descr = "length %i aa, %i bp, from %s of %s" \ % (len(t), len(n), loc, record.description) - fid = record.id + "|%s%i" % (options.ftype, i+1) - r = SeqRecord(Seq(n), id = fid, name = "", description= descr) - t = SeqRecord(Seq(t), id = fid, name = "", description= descr) - SeqIO.write(r, out_nuc, "fasta") - SeqIO.write(t, out_prot, "fasta") - out_bed.write('\t'.join(map(str,[record.id, f_start, f_end, fid, 0, '+' if f_strand == +1 else '-'])) + '\n') + fid = record.id + "|%s%i" % (options.ftype, i + 1) + r = SeqRecord(Seq(n), id=fid, name="", description=descr) + t = SeqRecord(Seq(t), id=fid, name="", description=descr) + if out_nuc: + SeqIO.write(r, out_nuc, "fasta") + if out_prot: + SeqIO.write(t, out_prot, "fasta") + nice_strand = '+' if f_strand == +1 else '-' + if out_bed: + out_bed.write('\t'.join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + '\n') + if out_gff3: + out_gff3.write('\t'.join(map(str, [record.id, 'getOrfsOrCds', 'CDS', f_start + 1, f_end, '.', + nice_strand, 0, 'ID=%s%s' % (options.ftype, i + 1)])) + '\n') in_count += 1 -if out_nuc is not sys.stdout: +if out_nuc and out_nuc is not sys.stdout: out_nuc.close() -if out_prot is not sys.stdout: +if out_prot and out_prot is not sys.stdout: out_prot.close() -if out_bed is not sys.stdout: +if out_bed and out_bed is not sys.stdout: out_bed.close() print "Found %i %ss in %i sequences" % (out_count, options.ftype, in_count)
--- a/tools/get_orfs_or_cdss/get_orfs_or_cdss.xml Thu Jul 30 12:35:31 2015 -0400 +++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.xml Sat Jan 09 23:42:32 2016 -0500 @@ -1,4 +1,4 @@ -<tool id="get_orfs_or_cdss" name="Get open reading frames (ORFs) or coding sequences (CDSs)" version="0.1.1"> +<tool id="get_orfs_or_cdss" name="Get open reading frames (ORFs) or coding sequences (CDSs)" version="0.2.0"> <description>e.g. to get peptides from ESTs</description> <requirements> <requirement type="package" version="1.65">biopython</requirement> @@ -11,7 +11,7 @@ </stdio> <version_command interpreter="python">get_orfs_or_cdss.py --version</version_command> <command interpreter="python"> -get_orfs_or_cdss.py -i $input_file -f $input_file.ext --table $table -t $ftype -e $ends -m $mode --min_len $min_len -s $strand --on $out_nuc_file --op $out_prot_file --ob $out_bed_file +get_orfs_or_cdss.py -i $input_file -f $input_file.ext --table $table -t $ftype -e $ends -m $mode --min_len $min_len -s $strand --on $out_nuc_file --op $out_prot_file --ob $out_bed_file --og $out_gff3_file </command> <inputs> <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file (nucleotides)" help="FASTA, FASTQ, or SFF format." /> @@ -60,6 +60,7 @@ <data name="out_nuc_file" format="fasta" label="${ftype.value}s (nucleotides)" /> <data name="out_prot_file" format="fasta" label="${ftype.value}s (amino acids)" /> <data name="out_bed_file" format="bed6" label="${ftype.value}s (bed)" /> + <data name="out_gff3_file" format="gff3" label="${ftype.value}s (gff3)" /> </outputs> <tests> <test> @@ -73,6 +74,7 @@ <output name="out_nuc_file" file="get_orf_input.t1_nuc_out.fasta" /> <output name="out_prot_file" file="get_orf_input.t1_prot_out.fasta" /> <output name="out_bed_file" file="get_orf_input.t1_bed_out.bed" /> + <output name="out_gff3_file" file="get_orf_input.t1_gff3_out.gff3" /> </test> <test> <param name="input_file" value="get_orf_input.fasta" /> @@ -85,6 +87,7 @@ <output name="out_nuc_file" file="get_orf_input.t11_nuc_out.fasta" /> <output name="out_prot_file" file="get_orf_input.t11_prot_out.fasta" /> <output name="out_bed_file" file="get_orf_input.t11_bed_out.bed" /> + <output name="out_gff3_file" file="get_orf_input.t11_gff3_out.gff3" /> </test> <test> <param name="input_file" value="get_orf_input.fasta" /> @@ -97,6 +100,7 @@ <output name="out_nuc_file" file="get_orf_input.t11_open_nuc_out.fasta" /> <output name="out_prot_file" file="get_orf_input.t11_open_prot_out.fasta" /> <output name="out_bed_file" file="get_orf_input.t11_open_bed_out.bed" /> + <output name="out_gff3_file" file="get_orf_input.t11_open_gff3_out.gff3" /> </test> <test> <param name="input_file" value="Ssuis.fasta" /> @@ -109,6 +113,7 @@ <output name="out_nuc_file" file="get_orf_input.Suis_ORF.nuc.fasta" /> <output name="out_prot_file" file="get_orf_input.Suis_ORF.prot.fasta" /> <output name="out_bed_file" file="get_orf_input.Suis_ORF.bed" /> + <output name="out_gff3_file" file="get_orf_input.Suis_ORF.gff3" /> </test> </tests> <help>
--- a/tools/get_orfs_or_cdss/tool_dependencies.xml Thu Jul 30 12:35:31 2015 -0400 +++ b/tools/get_orfs_or_cdss/tool_dependencies.xml Sat Jan 09 23:42:32 2016 -0500 @@ -1,6 +1,6 @@ <?xml version="1.0"?> <tool_dependency> <package name="biopython" version="1.65"> - <repository changeset_revision="dc595937617c" name="package_biopython_1_65" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="030f1a505d40" name="package_biopython_1_65" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> </tool_dependency>