Mercurial > repos > cpt > cpt_gbk_adjacent
changeset 1:e29c36ee61e0 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:42:35 +0000 |
parents | 1311f97dccfa |
children | e2531f4dcdde |
files | adjacent_features.py adjacent_features.xml cpt-macros.xml cpt_gbk_adjacent/adjacent_features.py cpt_gbk_adjacent/adjacent_features.xml cpt_gbk_adjacent/cpt-macros.xml cpt_gbk_adjacent/macros.xml macros.xml |
diffstat | 8 files changed, 743 insertions(+), 775 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# adjacent_features.py -- new file in changeset 1:e29c36ee61e0
# (diff header: --- /dev/null  +++ b/adjacent_features.py  @@ -0,0 +1,444 @@)
"""Report the features adjacent to a FASTA query found in GenBank records.

Each FASTA record is compared against the (optionally translated) sequence of
every GenBank feature that passes the gene-only / CDS-only filter.  For every
hit, the neighbouring features are written as FASTA entries to an "upstream"
(5') handle and a "downstream" (3') handle.
"""
import argparse
import logging

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Data import CodonTable
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation

try:
    # Bio.Alphabet was removed in Biopython 1.78.  Nothing below uses these
    # names, so a missing module must not stop the tool from starting.
    from Bio.Alphabet import generic_dna, generic_protein
except ImportError:
    generic_dna = generic_protein = None

logging.basicConfig(level=logging.INFO)
log = logging.getLogger()

# Stand-in ID for features without a /protein_id qualifier.  It is only used
# for the ID comparison and is never stored on the feature, so it cannot end
# up in an output header.
_NO_PROTEIN_ID = "++++++++"


def _wanted(feature, genesOnly, cdsOnly):
    """Return True when *feature* passes the gene-only / CDS-only filter."""
    if genesOnly and feature.type != "gene":
        return False
    if cdsOnly and feature.type != "CDS":
        return False
    return True


def _coding_sequence(record, feature):
    """Return the feature's bases in reading direction, trimmed to its frame.

    /codon_start=N means the first complete codon begins at base N of the
    feature, so N - 1 bases are dropped from the feature's 5' end (the low
    coordinate on the forward strand, the high coordinate on the reverse
    strand).  The span is a plain start:end slice of the record, exactly as
    in the original code (compound locations are not spliced).
    """
    skip = 0
    if "codon_start" in feature.qualifiers:
        skip = int(feature.qualifiers["codon_start"][0]) - 1
    start = feature.location.start
    end = feature.location.end
    if feature.location.strand == -1:
        return record.seq[start : end - skip].reverse_complement()
    return record.seq[start + skip : end]


def _is_hit(record, feature, fSeq, protID, forceSeqID, tTable):
    """Return True when *feature* matches the query sequence (and ID if forced)."""
    candidate = _coding_sequence(record, feature)
    if tTable != 0:
        try:
            candidate = candidate.translate(table=tTable, cds=True)
        except CodonTable.TranslationError:
            # Not a complete CDS (bad start/stop/length): translate leniently.
            candidate = candidate.translate(table=tTable, cds=False)
    if not (candidate == fSeq):
        return False
    if not forceSeqID:
        return True
    return protID == feature.qualifiers.get("protein_id", [_NO_PROTEIN_ID])[0]


def _neighbours(features, index, step, count, genesOnly, cdsOnly):
    """Walk from *index* in direction *step* and collect filtered features.

    Collection stops once *count* features were gathered or the feature list
    is exhausted.  Features are returned in the order they were encountered.
    """
    found = []
    pos = index + step
    while count != 0 and 0 <= pos < len(features):
        candidate = features[pos]
        if _wanted(candidate, genesOnly, cdsOnly):
            found.append(candidate)
            count -= 1
        pos += step
    return found


def _feature_label(item, prefer_locus_tag=False):
    """Return the ID used in a FASTA header for *item*.

    Normally /protein_id wins over /locus_tag; the "[INCOMPLETE]" header uses
    /locus_tag first.  A fixed placeholder is returned when neither qualifier
    exists instead of raising KeyError.
    """
    if prefer_locus_tag:
        keys = ("locus_tag", "protein_id")
    else:
        keys = ("protein_id", "locus_tag")
    for key in keys:
        if key in item.qualifiers:
            return item.qualifiers[key][0]
    return "unknown_feature"


def _header(item, side, longOut, sourceOut, incomplete=False):
    """Build the FASTA header line (including the trailing newline)."""
    product = ""
    if "product" in item.qualifiers:
        product = " -" + str(item.qualifiers["product"][0]) + "-"
    flag = " [INCOMPLETE]" if incomplete else ""
    return (
        ">"
        + _feature_label(item, prefer_locus_tag=incomplete)
        + product
        + flag
        + " ("
        + side
        + " of "
        + longOut
        + " found within "
        + sourceOut
        + ")\n"
    )


def _write_neighbour(
    handle, record, item, side, longOut, sourceOut, outProt, cdsOnly, tTable
):
    """Write one neighbouring feature to *handle* as a FASTA entry.

    With outProt unset the raw record span is written (not strand-corrected,
    as before).  Otherwise an existing /translation qualifier is preferred,
    and a translation is computed only when it is absent.
    """
    header = _header(item, side, longOut, sourceOut)
    if not outProt:
        body = str(record.seq[item.location.start : item.location.end])
    elif "translation" in item.qualifiers:
        body = str(item.qualifiers["translation"][0])
    else:
        seqHold = _coding_sequence(record, item)
        if tTable == 0:
            body = str(seqHold)
        elif cdsOnly:
            try:
                body = str(seqHold.translate(table=tTable, cds=True))
            except CodonTable.TranslationError as err:
                log.warning(
                    "ERROR %s %s", _feature_label(item, prefer_locus_tag=True), err
                )
                # Fall back to a lenient translation and flag it in the header.
                body = str(seqHold.translate(table=tTable, cds=False))
                header = _header(item, side, longOut, sourceOut, incomplete=True)
        else:
            body = str(seqHold.translate(table=tTable, cds=False))
    handle.write(header)
    handle.write(body + "\n\n")


def extract_features(
    genbankFiles=None,
    fastaFiles=None,
    upOut=None,
    downOut=None,
    genesOnly=False,
    cdsOnly=True,
    forceSeqID=False,
    forward=1,
    behind=1,
    outProt=True,
    tTable=11,
    fTable=11,
):
    """Write the neighbours of every GenBank feature matching a FASTA record.

    Args:
        genbankFiles: iterable of handles/paths parsed as GenBank.
        fastaFiles: iterable of handles/paths parsed as FASTA (multi-FASTA ok).
        upOut: writable handle receiving the 5' neighbours.
        downOut: writable handle receiving the 3' neighbours.
        genesOnly: consider only "gene" features (search and output).
        cdsOnly: consider only "CDS" features (search and output).
        forceSeqID: additionally require the FASTA ID to equal /protein_id.
        forward: number of features after the hit in feature-list order.
        behind: number of features before the hit in feature-list order.
        outProt: write protein sequence instead of the raw nucleotide span.
        tTable: translation table for GenBank features (0 = do not translate).
        fTable: translation table for the FASTA input (0 = do not translate).

    For a hit on the reverse strand the two collected lists are exchanged, so
    the features after the hit are reported as 5' and those before it as 3'.
    """
    genList = [
        rec for fileX in (genbankFiles or []) for rec in SeqIO.parse(fileX, "genbank")
    ]
    fastaList = [
        rec for fileX in (fastaFiles or []) for rec in SeqIO.parse(fileX, "fasta")
    ]

    for seqMatch in fastaList:
        longOut = seqMatch.description
        protID = seqMatch.id
        if fTable != 0:
            fSeq = seqMatch.seq.translate(table=fTable, cds=False)
        else:
            fSeq = seqMatch.seq

        for gbk in genList:
            sourceOut = gbk.id
            for num, feat in enumerate(gbk.features):
                if not _wanted(feat, genesOnly, cdsOnly):
                    continue
                if not _is_hit(gbk, feat, fSeq, protID, forceSeqID, tTable):
                    continue

                backList = _neighbours(
                    gbk.features, num, -1, behind, genesOnly, cdsOnly
                )
                backList.reverse()  # restore feature-list order
                aheadList = _neighbours(
                    gbk.features, num, 1, forward, genesOnly, cdsOnly
                )
                if feat.location.strand == -1:
                    backList, aheadList = aheadList, backList

                for item in backList:
                    _write_neighbour(
                        upOut, gbk, item, "5'", longOut, sourceOut,
                        outProt, cdsOnly, tTable,
                    )
                for item in aheadList:
                    _write_neighbour(
                        downOut, gbk, item, "3'", longOut, sourceOut,
                        outProt, cdsOnly, tTable,
                    )

    return


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Export a subset of features from a Genbank file", epilog=""
    )
    parser.add_argument(
        "-genbankFiles", nargs="+", type=argparse.FileType("r"), help="Genbank file"
    )
    parser.add_argument(
        "-fastaFiles",
        nargs="+",
        type=argparse.FileType("r"),
        help="Fasta file to match against",
    )
    # The tool XML offers tables up to 25, so the accepted range must cover them.
    parser.add_argument(
        "-tTable",
        type=int,
        default=11,
        help="Translation table to use on Genbank features (0 = do not translate)",
        choices=range(0, 26),
    )
    parser.add_argument(
        "-fTable",
        type=int,
        default=11,
        help="Translation table to use on the Fasta input (0 = do not translate)",
        choices=range(0, 26),
    )
    parser.add_argument(
        "-upOut",
        type=argparse.FileType("w"),
        help="Upstream Fasta output",
        default="test-data/upOut.fa",
    )
    parser.add_argument(
        "-downOut",
        type=argparse.FileType("w"),
        help="Downstream Fasta output",
        default="test-data/downOut.fa",
    )
    parser.add_argument(
        "--genesOnly",
        action="store_true",
        help="Search and return only Gene type features",
    )
    parser.add_argument(
        "--cdsOnly",
        action="store_true",
        help="Search and return only CDS type features",
    )
    parser.add_argument(
        "--forceSeqID",
        action="store_true",
        help="Require the Genbank protein_id to match the Fasta sequence ID",
    )
    parser.add_argument(
        "--outProt", action="store_true", help="Output the translated sequence"
    )
    parser.add_argument(
        "--forward",
        type=int,
        default=1,
        help="Number of features after the hit (in feature order) to return",
    )
    parser.add_argument(
        "--behind",
        type=int,
        default=1,
        help="Number of features before the hit (in feature order) to return",
    )
    args = parser.parse_args()
    try:
        extract_features(**vars(args))
    finally:
        # Flush and release every handle argparse opened for us.
        handles = [args.upOut, args.downOut]
        handles += args.genbankFiles or []
        handles += args.fastaFiles or []
        for handle in handles:
            handle.close()
<!-- adjacent_features.xml, new file in changeset 1:e29c36ee61e0 (110 lines) -->
<tool id="edu.tamu.cpt2.gbk.adjacent_features" name="Find adjacent Genbank features" version="RC4">
    <description>Searches a Genbank file for a given FASTA sequence, then outputs a file with adjacent upstream features, and another with adjacent downstream features.</description>
    <macros>
        <import>macros.xml</import>
        <import>cpt-macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <!--
        Command notes:
        * Placeholders inside the #set expressions must stay unquoted: they are
          evaluated as an expression, and a quoted name is not a valid loop target.
        * The optional flag parameters (translate, forceID, mode) must stay
          unquoted as well: when their value is empty, a quoted placeholder
          would hand the script an empty positional argument, which its
          argument parser rejects.
    -->
    <command detect_errors="aggressive"><![CDATA[
python '$__tool_directory__/adjacent_features.py'
#set repeat_var_1 = '" "'.join([ str($var) for $var in $gbkIn ])
#set repeat_var_2 = '" "'.join([ str($var) for $var in $fastaIn ])
-genbankFiles "$repeat_var_1"
-fastaFiles "$repeat_var_2"
-tTable '$tTable'
-fTable '$fTable'
-upOut '$upstreamOut'
-downOut '$downstreamOut'
$translate
$forceID
$mode
--forward '$ahead'
--behind '$back'
]]></command>
    <inputs>
        <param label="Genbank file" name="gbkIn" type="data" format="genbank" multiple="True"/>
        <param label="Fasta file" name="fastaIn" type="data" format="fasta" multiple="True"/>
        <param label="Translation table to use on Fasta input:" name="fTable" type="select">
            <option value="0" selected="true">[0] Do not translate/ Fasta already translated</option>
            <option value="1">[1] The Standard Code</option>
            <option value="2">[2] The Vertebrate Mitochondrial Code</option>
            <option value="3">[3] The Yeast Mitochondrial Code</option>
            <option value="4">[4] The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
            <option value="5">[5] The Invertebrate Mitochondrial Code </option>
            <option value="6">[6] The Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
            <option value="9">[9] The Echinoderm and Flatworm Mitochondrial Code</option>
            <option value="10">[10] The Euplotid Nuclear Code</option>
            <option value="11">[11] The Bacterial, Archaeal and Plant Plastid Code</option>
            <option value="12">[12] The Alternative Yeast Nuclear Code</option>
            <option value="13">[13] The Ascidian Mitochondrial Code</option>
            <option value="14">[14] The Alternative Flatworm Mitochondrial Code</option>
            <option value="15">[15] Blepharisma Nuclear Code</option>
            <option value="16">[16] Chlorophycean Mitochondrial Code</option>
            <option value="21">[21] Trematode Mitochondrial Code</option>
            <option value="22">[22] Scenedesmus Obliquus Mitochondrial Code</option>
            <option value="23">[23] Thraustochytrium Mitochondrial Code</option>
            <option value="24">[24] Pterobranchia Mitochondrial Code</option>
            <option value="25">[25] Candidate Division SR1 and Gracilibacteria Code</option>
        </param>
        <param label="Translation table to use on Genbank features:" name="tTable" type="select">
            <option value="0">[0] Do not translate (Use nucleotide sequence of features)</option>
            <option value="1">[1] The Standard Code</option>
            <option value="2">[2] The Vertebrate Mitochondrial Code</option>
            <option value="3">[3] The Yeast Mitochondrial Code</option>
            <option value="4">[4] The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
            <option value="5">[5] The Invertebrate Mitochondrial Code </option>
            <option value="6">[6] The Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
            <option value="9">[9] The Echinoderm and Flatworm Mitochondrial Code</option>
            <option value="10">[10] The Euplotid Nuclear Code</option>
            <option value="11" selected="true">[11] The Bacterial, Archaeal and Plant Plastid Code</option>
            <option value="12">[12] The Alternative Yeast Nuclear Code</option>
            <option value="13">[13] The Ascidian Mitochondrial Code</option>
            <option value="14">[14] The Alternative Flatworm Mitochondrial Code</option>
            <option value="15">[15] Blepharisma Nuclear Code</option>
            <option value="16">[16] Chlorophycean Mitochondrial Code</option>
            <option value="21">[21] Trematode Mitochondrial Code</option>
            <option value="22">[22] Scenedesmus Obliquus Mitochondrial Code</option>
            <option value="23">[23] Thraustochytrium Mitochondrial Code</option>
            <option value="24">[24] Pterobranchia Mitochondrial Code</option>
            <option value="25">[25] Candidate Division SR1 and Gracilibacteria Code</option>
        </param>
        <param label="Number of features upstream to return" name="ahead" type="integer" value="1"/>
        <param label="Number of features downstream to return" name="back" type="integer" value="1"/>
        <param label="Translate output to protein sequence" name="translate" type="boolean" truevalue="--outProt" falsevalue=""/>
        <param label="Genbank Protein's ID must also match Fasta Sequence's ID " name="forceID" type="boolean" truevalue="--forceSeqID" falsevalue="" checked="true"/>
        <param name="mode" type="select" label="Mode">
            <option value="--genesOnly">Search only Gene-type features</option>
            <option value="--cdsOnly">Search only CDS-type features</option>
            <option value="">Search all features (Will most likely cause duplicate results, as this will include sub-features)</option>
        </param>
    </inputs>
    <outputs>
        <data name="upstreamOut" format="fasta" label="upOut"/>
        <data name="downstreamOut" format="fasta" label="downOut"/>
    </outputs>
    <help><![CDATA[
Currently Experimental: Uploaded for review purposes

**What it does**

For a given Fasta file, this tool searches through the features of a Genbank file for one that matches the sequence in the Fasta. If found, it will then output a specified number of features upstream from the hit and a specified number of features downstream as a multifasta file.

The drop down menus provide a selection of translation tables for the Fasta and Genbank inputs. If "translate output to protein sequence"
is selected, the output will be translated using the table selected for the Genbank translation.

It is currently recommended to select either the Gene only or CDS only options for mode, as searching all features will include sub-features of neighbors (ie, selecting 2 for upstream will give you the neighboring gene and then its CDS sub feature, rather than 2 proper neighbors) as well as the sub-features of the search hit itself.


]]></help>
    <citations>
        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
        <citation type="bibtex">
            @unpublished{galaxyTools,
                author = {A. Criscione},
                title = {CPT Galaxy Tools},
                year = {2019-2021},
                note = {https://github.com/tamu-cpt/galaxy-tools/}
            }
        </citation>
    </citations>
</tool>
<!-- cpt-macros.xml, new file in changeset 1:e29c36ee61e0 (115 lines) -->
<!-- Shared requirement and citation macros for the CPT Galaxy tools.
     BibTeX separates multiple authors with the word "and"; a comma inside the
     author field is read as "Last, First" of a single person. -->
<macros>
    <!-- Package requirements; the yield lets a tool append its own. -->
    <xml name="gff_requirements">
        <requirements>
            <requirement type="package" version="2.7">python</requirement>
            <requirement type="package" version="1.65">biopython</requirement>
            <requirement type="package" version="2.12.1">requests</requirement>
            <requirement type="package" version="1.2.2">cpt_gffparser</requirement>
            <yield/>
        </requirements>
        <version_command>
            <![CDATA[
            cd '$__tool_directory__' && git rev-parse HEAD
            ]]>
        </version_command>
    </xml>
    <!-- Bare citation elements, for use inside an existing citations section. -->
    <xml name="citation/mijalisrasche">
        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
        <citation type="bibtex">@unpublished{galaxyTools,
            author = {E. Mijalis and H. Rasche},
            title = {CPT Galaxy Tools},
            year = {2013-2017},
            note = {https://github.com/tamu-cpt/galaxy-tools/}
            }
        </citation>
    </xml>
    <!-- Complete citations sections, one per author set. -->
    <xml name="citations">
        <citations>
            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
            <citation type="bibtex">
            @unpublished{galaxyTools,
                author = {E. Mijalis and H. Rasche},
                title = {CPT Galaxy Tools},
                year = {2013-2017},
                note = {https://github.com/tamu-cpt/galaxy-tools/}
            }
            </citation>
            <yield/>
        </citations>
    </xml>
    <xml name="citations-crr">
        <citations>
            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
            <citation type="bibtex">
            @unpublished{galaxyTools,
                author = {C. Ross},
                title = {CPT Galaxy Tools},
                year = {2020-},
                note = {https://github.com/tamu-cpt/galaxy-tools/}
            }
            </citation>
            <yield/>
        </citations>
    </xml>
    <xml name="citations-2020">
        <citations>
            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
            <citation type="bibtex">
            @unpublished{galaxyTools,
                author = {E. Mijalis and H. Rasche},
                title = {CPT Galaxy Tools},
                year = {2013-2017},
                note = {https://github.com/tamu-cpt/galaxy-tools/}
            }
            </citation>
            <citation type="bibtex">
            @unpublished{galaxyTools,
                author = {A. Criscione},
                title = {CPT Galaxy Tools},
                year = {2019-2021},
                note = {https://github.com/tamu-cpt/galaxy-tools/}
            }
            </citation>
            <yield/>
        </citations>
    </xml>
    <xml name="citations-2020-AJC-solo">
        <citations>
            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
            <citation type="bibtex">
            @unpublished{galaxyTools,
                author = {A. Criscione},
                title = {CPT Galaxy Tools},
                year = {2019-2021},
                note = {https://github.com/tamu-cpt/galaxy-tools/}
            }
            </citation>
            <yield/>
        </citations>
    </xml>
    <xml name="citations-clm">
        <citations>
            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
            <citation type="bibtex">
            @unpublished{galaxyTools,
                author = {C. Maughmer},
                title = {CPT Galaxy Tools},
                year = {2017-2020},
                note = {https://github.com/tamu-cpt/galaxy-tools/}
            }
            </citation>
            <yield/>
        </citations>
    </xml>
    <!-- Single citation without the surrounding citations element. -->
    <xml name="sl-citations-clm">
        <citation type="bibtex">
            @unpublished{galaxyTools,
                author = {C. Maughmer},
                title = {CPT Galaxy Tools},
                year = {2017-2020},
                note = {https://github.com/tamu-cpt/galaxy-tools/}
            }
        </citation>
        <yield/>
    </xml>
</macros>
--- a/cpt_gbk_adjacent/adjacent_features.py Fri Jun 17 12:43:45 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,444 +0,0 @@ -#!/usr/bin/env python -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.Data import CodonTable -from Bio.SeqRecord import SeqRecord -from Bio.SeqFeature import SeqFeature, FeatureLocation -from Bio.Alphabet import generic_dna, generic_protein -import argparse -import logging - -logging.basicConfig(level=logging.INFO) -log = logging.getLogger() - - -def extract_features( - genbankFiles=None, - fastaFiles=None, - upOut=None, - downOut=None, - genesOnly=False, - cdsOnly=True, - forceSeqID=False, - forward=1, - behind=1, - outProt=True, - tTable=11, - fTable=11, -): - - genList = [] - fastaList = [] - - for fileX in genbankFiles: - opener = SeqIO.parse(fileX, "genbank") - for ( - openRec - ) in ( - opener - ): # To turn the generator into objects (or else we end up with a list of generators) - genList.append(openRec) - - for fileX in fastaFiles: - opener = SeqIO.parse(fileX, "fasta") - for openRec in opener: # Technically flattens multifastas too - fastaList.append(openRec) - - for seqMatch in fastaList: - longOut = seqMatch.description - protID = seqMatch.id - if fTable != 0: - fSeq = seqMatch.seq.translate(table=fTable, cds=False) - else: - fSeq = seqMatch.seq - - for gbk in genList: - sourceOut = gbk.id - num = -1 - for feat in gbk.features: - num += 1 - - if (genesOnly and feat.type != "gene") or ( - cdsOnly and feat.type != "CDS" - ): - continue - - if "codon_start" in feat.qualifiers: - offset = 1 - int(feat.qualifiers["codon_start"][0]) - else: - offset = 0 - - - temp = gbk.seq[feat.location.start : feat.location.end] - if feat.location.strand == -1: - temp = gbk.seq[feat.location.start : feat.location.end - offset] - temp = temp.reverse_complement() - else: - temp = gbk.seq[feat.location.start + offset : feat.location.end] - - if tTable != 0: - try: - gSeq = temp.translate(table=tTable, cds=True) - except 
CodonTable.TranslationError as cte: - # log.info("Translation issue at %s", cte) - gSeq = temp.translate(table=tTable, cds=False) - else: - gSeq = temp - - if not ("protein_id" in feat.qualifiers): - feat.qualifiers["protein_id"] = [ - "++++++++" - ] # Junk value for genesOnly flag - - if (gSeq == fSeq) and (protID == feat.qualifiers["protein_id"][0] or forceSeqID == False): - goBack = num - 1 - goAhead = num + 1 - numBack = behind - numAhead = forward - backList = [] - aheadList = [] - - while numBack != 0 and goBack >= 0: - if (genesOnly and gbk.features[goBack].type != "gene") or ( - cdsOnly and gbk.features[goBack].type != "CDS" - ): - goBack -= 1 - continue - backList.append(gbk.features[goBack]) - numBack -= 1 - goBack -= 1 - - while numAhead != 0 and goAhead < len(gbk.features): - if (genesOnly and gbk.features[goAhead].type != "gene") or ( - cdsOnly and gbk.features[goAhead].type != "CDS" - ): - goAhead += 1 - continue - aheadList.append(gbk.features[goAhead]) - numAhead -= 1 - goAhead += 1 - - backList.reverse() - if feat.location.strand == -1: - tmpList = aheadList - aheadList = backList - backList = tmpList - - - for item in backList: - addition = "" - header = "" - if "product" in item.qualifiers: - addition = " -" + str(item.qualifiers["product"][0]) + "-" - if "protein_id" in item.qualifiers: - header = ( - ">" - + (item.qualifiers["protein_id"][0]) - + addition - + " (5' of " - + longOut - + " found within " - + sourceOut - + ")\n" - ) - else: - header = ( - ">" - + (item.qualifiers["locus_tag"][0]) - + addition - + " (5' of " - + longOut - + " found within " - + sourceOut - + ")\n" - ) - if outProt == True: - if "translation" in item.qualifiers: - upOut.write(header) - upOut.write( - str(item.qualifiers["translation"][0]) + "\n\n" - ) - else: - modS = 0 - modE = 0 - if "codon_start" in item.qualifiers: - if item.location.strand > 0: - modS = ( - int(item.qualifiers["codon_start"][0]) - 1 - ) - else: - modE = ( - 
int(item.qualifiers["codon_start"][0]) - 1 - ) - - seqHold = gbk.seq[ - item.location.start - + modS : item.location.end - - modE - ] - if item.location.strand == -1: - seqHold = seqHold.reverse_complement() - if cdsOnly: - try: - finalSeq = "" - if tTable != 0: - finalSeq = ( - str( - seqHold.translate( - table=tTable, cds=True - ) - ) - + "\n\n" - ) - else: - finalSeq = str(seqHold) + "\n\n" - # upOut.write(header) - # upOut.write(finalSeq) - except Exception as bdct: - log.warn( - "ERROR %s %s", - item.qualifiers["locus_tag"][0], - bdct, - ) - finalSeq = "" - if tTable != 0: - finalSeq = ( - str( - seqHold.translate( - table=tTable, cds=False - ) - ) - + "\n\n" - ) - else: - finalSeq = str(seqHold) + "\n\n" - header = ( - ">" - + (item.qualifiers["locus_tag"][0]) - + addition - + " [INCOMPLETE] (5' of " - + longOut - + " found within " - + sourceOut - + ")\n" - ) - upOut.write(header) - upOut.write(finalSeq) - else: - - if tTable != 0: - upOut.write(header) - upOut.write( - str( - seqHold.translate( - table=tTable, cds=False - ) - ) - + "\n\n" - ) - else: - upOut.write(header) - upOut.write(str(seqHold) + "\n\n") - else: - upOut.write(header) - upOut.write( - str(gbk.seq[item.location.start : item.location.end]) - + "\n\n" - ) - - for item in aheadList: - addition = "" - header = "" - if "product" in item.qualifiers: - addition = " -" + str(item.qualifiers["product"][0]) + "-" - if "protein_id" in item.qualifiers: - header = ( - ">" - + (item.qualifiers["protein_id"][0]) - + addition - + " (3' of " - + longOut - + " found within " - + sourceOut - + ")\n" - ) - else: - header = ( - ">" - + (item.qualifiers["locus_tag"][0]) - + addition - + " (3' of " - + longOut - + " found within " - + sourceOut - + ")\n" - ) - if outProt == True: - if "translation" in item.qualifiers: - downOut.write(header) - downOut.write( - str(item.qualifiers["translation"][0]) + "\n\n" - ) - else: - modS = 0 - modE = 0 - if "codon_start" in item.qualifiers: - if item.location.strand > 0: - 
modS = ( - int(item.qualifiers["codon_start"][0]) - 1 - ) - else: - modE = ( - int(item.qualifiers["codon_start"][0]) - 1 - ) - - seqHold = gbk.seq[ - item.location.start - + modS : item.location.end - - modE - ] - if item.location.strand == -1: - seqHold = seqHold.reverse_complement() - if cdsOnly: - try: - finalSeq = "" - if tTable != 0: - finalSeq = ( - str( - seqHold.translate( - table=tTable, cds=True - ) - ) - + "\n\n" - ) - else: - finalSeq = str(seqHold) + "\n\n" - # downOut.write(header) - # downOut.write(finalSeq) - except Exception as bdct: - log.warn( - "ERROR %s %s", - item.qualifiers["locus_tag"][0], - bdct, - ) - finalSeq = "" - if tTable != 0: - finalSeq = ( - str( - seqHold.translate( - table=tTable, cds=False - ) - ) - + "\n\n" - ) - else: - finalSeq = str(seqHold) + "\n\n" - header = ( - ">" - + (item.qualifiers["locus_tag"][0]) - + addition - + " [INCOMPLETE] (3' of " - + longOut - + " found within " - + sourceOut - + ")\n" - ) - downOut.write(header) - downOut.write(finalSeq) - else: - - if tTable != 0: - downOut.write(header) - downOut.write( - str( - seqHold.translate( - table=tTable, cds=False - ) - ) - + "\n\n" - ) - else: - downOut.write(header) - downOut.write(str(seqHold) + "\n\n") - else: - downOut.write(header) - downOut.write( - str(gbk.seq[item.location.start : item.location.end]) - + "\n\n" - ) - # print(longOut) - - return - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Export a subset of features from a Genbank file", epilog="" - ) - parser.add_argument( - "-genbankFiles", nargs="+", type=argparse.FileType("r"), help="Genbank file" - ) - parser.add_argument( - "-fastaFiles", - nargs="+", - type=argparse.FileType("r"), - help="Fasta file to match against", - ) - parser.add_argument( - "-tTable", - type=int, - default=11, - help="Translation table to use", - choices=range(0, 23), - ) - parser.add_argument( - "-fTable", - type=int, - default=11, - help="Translation table to use", - choices=range(0, 
23), - ) - parser.add_argument( - "-upOut", - type=argparse.FileType("w"), - help="Upstream Fasta output", - default="test-data/upOut.fa", - ) - parser.add_argument( - "-downOut", - type=argparse.FileType("w"), - help="Downstream Fasta output", - default="test-data/downOut.fa", - ) - parser.add_argument( - "--genesOnly", - action="store_true", - help="Search and return only Gene type features", - ) - parser.add_argument( - "--cdsOnly", - action="store_true", - help="Search and return only CDS type features", - ) - parser.add_argument( - "--forceSeqID", - action="store_true", - help="Search and return only CDS type features", - ) - parser.add_argument( - "--outProt", action="store_true", help="Output the translated sequence" - ) - parser.add_argument( - "--forward", - type=int, - default=1, - help="Number of features upstream from the hit to return", - ) - parser.add_argument( - "--behind", - type=int, - default=1, - help="Number of features downstream from the hit to return", - ) - args = parser.parse_args() - extract_features(**vars(args))
--- a/cpt_gbk_adjacent/adjacent_features.xml Fri Jun 17 12:43:45 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,111 +0,0 @@ -<?xml version="1.0"?> -<tool id="edu.tamu.cpt2.gbk.adjacent_features" name="Find adjacent Genbank features" version="RC4"> - <description>Searches a Genbank file for a given FASTA sequence, then outputs a file with adjacent upstream features, and another with adjacent downstream features.</description> - <macros> - <import>macros.xml</import> - <import>cpt-macros.xml</import> - </macros> - <expand macro="requirements"/> - <command detect_errors="aggressive"><![CDATA[ -python $__tool_directory__/adjacent_features.py -#set repeat_var_1 = '" "'.join([ str($var) for $var in $gbkIn ]) -#set repeat_var_2 = '" "'.join([ str($var) for $var in $fastaIn ]) --genbankFiles "$repeat_var_1" --fastaFiles "$repeat_var_2" --tTable $tTable --fTable $fTable --upOut $upstreamOut --downOut $downstreamOut -$translate -$forceID -$mode ---forward $ahead ---behind $back -]]></command> - <inputs> - <param label="Genbank file" name="gbkIn" type="data" format="genbank" multiple="True"/> - <param label="Fasta file" name="fastaIn" type="data" format="fasta" multiple="True"/> - <param label="Translation table to use on Fasta input:" name="fTable" type="select"> - <option value="0" selected="true">[0] Do not translate/ Fasta already translated</option> - <option value="1">[1] The Standard Code</option> - <option value="2">[2] The Vertebrate Mitochondrial Code</option> - <option value="3">[3] The Yeast Mitochondrial Code</option> - <option value="4">[4] The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option> - <option value="5">[5] The Invertebrate Mitochondrial Code </option> - <option value="6">[6] The Ciliate, Dasycladacean and Hexamita Nuclear Code</option> - <option value="9">[9] The Echinoderm and Flatworm Mitochondrial Code</option> - <option value="10">[10] The Euplotid Nuclear Code</option> - <option 
value="11">[11] The Bacterial, Archaeal and Plant Plastid Code</option> - <option value="12">[12] The Alternative Yeast Nuclear Code</option> - <option value="13">[13] The Ascidian Mitochondrial Code</option> - <option value="14">[14] The Alternative Flatworm Mitochondrial Code</option> - <option value="15">[15] Blepharisma Nuclear Code</option> - <option value="16">[16] Chlorophycean Mitochondrial Code</option> - <option value="21">[21] Trematode Mitochondrial Code</option> - <option value="22">[22] Scenedesmus Obliquus Mitochondrial Code</option> - <option value="23">[23] Thraustochytrium Mitochondrial Code</option> - <option value="24">[24] Pterobranchia Mitochondrial Code</option> - <option value="25">[25] Candidate Division SR1 and Gracilibacteria Code</option> - </param> - <param label="Translation table to use on Genbank features:" name="tTable" type="select"> - <option value="0">[0] Do not translate (Use nucleotide sequence of features)</option> - <option value="1">[1] The Standard Code</option> - <option value="2">[2] The Vertebrate Mitochondrial Code</option> - <option value="3">[3] The Yeast Mitochondrial Code</option> - <option value="4">[4] The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option> - <option value="5">[5] The Invertebrate Mitochondrial Code </option> - <option value="6">[6] The Ciliate, Dasycladacean and Hexamita Nuclear Code</option> - <option value="9">[9] The Echinoderm and Flatworm Mitochondrial Code</option> - <option value="10">[10] The Euplotid Nuclear Code</option> - <option value="11" selected="true">[11] The Bacterial, Archaeal and Plant Plastid Code</option> - <option value="12">[12] The Alternative Yeast Nuclear Code</option> - <option value="13">[13] The Ascidian Mitochondrial Code</option> - <option value="14">[14] The Alternative Flatworm Mitochondrial Code</option> - <option value="15">[15] Blepharisma Nuclear Code</option> - <option value="16">[16] Chlorophycean Mitochondrial 
Code</option> - <option value="21">[21] Trematode Mitochondrial Code</option> - <option value="22">[22] Scenedesmus Obliquus Mitochondrial Code</option> - <option value="23">[23] Thraustochytrium Mitochondrial Code</option> - <option value="24">[24] Pterobranchia Mitochondrial Code</option> - <option value="25">[25] Candidate Division SR1 and Gracilibacteria Code</option> - </param> - <param label="Number of features upstream to return" name="ahead" type="integer" value="1"/> - <param label="Number of features downstream to return" name="back" type="integer" value="1"/> - <param label="Translate output to protein sequence" name="translate" type="boolean" truevalue="--outProt" falsevalue=""/> - <param label="Genbank Protein's ID must also match Fasta Sequence's ID " name="forceID" type="boolean" truevalue="--forceSeqID" falsevalue="" checked="true"/> - <param name="mode" type="select" label="Mode"> - <option value="--genesOnly">Search only Gene-type features</option> - <option value="--cdsOnly">Search only CDS-type features</option> - <option value="">Search all features (Will most likely cause duplicate results, as this will include sub-features)</option> - </param> - </inputs> - <outputs> - <data name="upstreamOut" format="fasta" label="upOut"/> - <data name="downstreamOut" format="fasta" label="downOut"/> - </outputs> - <help><![CDATA[ -Currently Experimental: Uploaded for review purposes - -**What it does** - -For a given Fasta file, this tool searches through the features of a Genbank file for one that matches the sequence in the Fasta. If found, it will then output a specified number of features upstream from the hit and a specified number of features downstream as a multifasta file. - -The drop down menus provide a selection of translation tables for the Fasta and Genbank inputs. If "translate output to protein sequence" -is selected, the output will be translated using the table selected for the Genbank translation. 
- -It is currently recommended to select either the Gene only or CDS only options for mode, as searching all features will include sub-features of neighbors (i.e., selecting 2 for upstream will give you the neighboring gene and then its CDS sub-feature, rather than 2 proper neighbors) as well as the sub-features of the search hit itself. - - -]]></help> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - </citations> -</tool>
--- a/cpt_gbk_adjacent/cpt-macros.xml Fri Jun 17 12:43:45 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="gff_requirements"> - <requirements> - <requirement type="package" version="2.7">python</requirement> - <requirement type="package" version="1.65">biopython</requirement> - <requirement type="package" version="2.12.1">requests</requirement> - <yield/> - </requirements> - <version_command> - <![CDATA[ - cd $__tool_directory__ && git rev-parse HEAD - ]]> - </version_command> - </xml> - <xml name="citation/mijalisrasche"> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex">@unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - </xml> - <xml name="citations"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-crr"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Ross}, - title = {CPT Galaxy Tools}, - year = {2020-}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. 
Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020-AJC-solo"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-clm"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="sl-citations-clm"> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </xml> -</macros>
--- a/cpt_gbk_adjacent/macros.xml Fri Jun 17 12:43:45 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,105 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="requirements"> - <requirements> - <requirement type="package" version="3.8.13">python</requirement> - <requirement type="package" version="1.79">biopython</requirement> - <requirement type="package" version="1.2.2">cpt_gffparser</requirement> - <yield/> - </requirements> - </xml> - <xml name="ldap_ref" - token_name="dn_ref" - token_label="Pick a DN" - token_fromfile="ldap_people.loc"> - <repeat name="repeat_@NAME@" title="@LABEL@"> - <param name="@NAME@" label="Select a @LABEL@" type="select"> - <options from_file="@FROMFILE@"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </repeat> - </xml> - <xml name="ldap_ref_single" - token_name="dn_ref" - token_label="Pick a DN" - token_fromfile="ldap_people.loc"> - <param name="@NAME@" label="Select a @LABEL@" type="select"> - <options from_file="@FROMFILE@"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </xml> - <xml name="gbk_feature_type" - token_label="Feature type to remove" - token_multiple="True" - token_optional="False" - token_name="positional_2"> - <param label="@LABEL@" optional="@TOKEN_OPTIONAL" multiple="@MULTIPLE@" name="feature_type" type="select"> - <option value="-10_signal">-10_signal</option> - <option value="-35_signal">-35_signal</option> - <option value="3'UTR">3'UTR</option> - <option value="5'UTR">5'UTR</option> - <option value="CAAT_signal">CAAT_signal</option> - <option selected="true" value="CDS">CDS</option> - <option value="C_region">C_region</option> - <option value="D-loop">D-loop</option> - <option value="D_segment">D_segment</option> - <option value="GC_signal">GC_signal</option> - <option value="J_segment">J_segment</option> - <option value="LTR">LTR</option> - <option value="N_region">N_region</option> - <option 
value="RBS">RBS</option> - <option value="STS">STS</option> - <option value="S_region">S_region</option> - <option value="TATA_signal">TATA_signal</option> - <option value="V_region">V_region</option> - <option value="V_segment">V_segment</option> - <option value="all">all</option> - <option value="assembly_gap">assembly_gap</option> - <option value="attenuator">attenuator</option> - <option value="enhancer">enhancer</option> - <option value="exon">exon</option> - <option value="gap">gap</option> - <option value="gene">gene</option> - <option value="iDNA">iDNA</option> - <option value="intron">intron</option> - <option value="mRNA">mRNA</option> - <option value="mat_peptide">mat_peptide</option> - <option value="misc_RNA">misc_RNA</option> - <option value="misc_binding">misc_binding</option> - <option value="misc_difference">misc_difference</option> - <option value="misc_feature">misc_feature</option> - <option value="misc_recomb">misc_recomb</option> - <option value="misc_signal">misc_signal</option> - <option value="misc_structure">misc_structure</option> - <option value="mobile_element">mobile_element</option> - <option value="modified_base">modified_base</option> - <option value="ncRNA">ncRNA</option> - <option value="old_sequence">old_sequence</option> - <option value="operon">operon</option> - <option value="oriT">oriT</option> - <option value="polyA_signal">polyA_signal</option> - <option value="polyA_site">polyA_site</option> - <option value="precursor_RNA">precursor_RNA</option> - <option value="prim_transcript">prim_transcript</option> - <option value="primer_bind">primer_bind</option> - <option value="promoter">promoter</option> - <option value="protein_bind">protein_bind</option> - <option value="rRNA">rRNA</option> - <option value="rep_origin">rep_origin</option> - <option value="repeat_region">repeat_region</option> - <option value="sig_peptide">sig_peptide</option> - <option value="source">source</option> - <option 
value="stem_loop">stem_loop</option> - <option value="tRNA">tRNA</option> - <option value="terminator">terminator</option> - <option value="tmRNA">tmRNA</option> - <option value="transit_peptide">transit_peptide</option> - <option value="unsure">unsure</option> - <option value="variation">variation</option> - </param> - </xml> -</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Mon Jun 05 02:42:35 2023 +0000 @@ -0,0 +1,74 @@ +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package">progressivemauve</requirement> + <!--<requirement type="package" version="2.7">python</requirement>--> + <requirement type="package" version="0.6.4">bcbiogff</requirement> + <yield/> + </requirements> + </xml> + <token name="@WRAPPER_VERSION@">2.4.0</token> + <xml name="citation/progressive_mauve"> + <citation type="doi">10.1371/journal.pone.0011147</citation> + </xml> + <xml name="citation/gepard"> + <citation type="doi">10.1093/bioinformatics/btm039</citation> + </xml> + <token name="@XMFA_INPUT@"> + '$xmfa' + </token> + <xml name="xmfa_input" token_formats="xmfa"> + <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA"/> + </xml> + <token name="@XMFA_FA_INPUT@"> + '$sequences' + </token> + <xml name="xmfa_fa_input"> + <param type="data" format="fasta" name="sequences" label="Sequences in alignment" help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. 
Failing that, they should be provided in the same order as in original progressiveMauve run"/> + </xml> + <xml name="genome_selector"> + <conditional name="reference_genome"> + <param name="reference_genome_source" type="select" label="Reference Genome"> + <option value="history" selected="True">From History</option> + <option value="cached">Locally Cached</option> + </param> + <when value="cached"> + <param name="fasta_indexes" type="select" label="Source FASTA Sequence"> + <options from_data_table="all_fasta"/> + </param> + </when> + <when value="history"> + <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> + </when> + </conditional> + </xml> + <xml name="gff3_input"> + <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> + </xml> + <xml name="input/gff3+fasta"> + <expand macro="gff3_input"/> + <expand macro="genome_selector"/> + </xml> + <token name="@INPUT_GFF@"> + '$gff3_data' + </token> + <token name="@INPUT_FASTA@"> + #if str($reference_genome.reference_genome_source) == 'cached': + '${reference_genome.fasta_indexes.fields.path}' + #else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa + #end if + </token> + <token name="@GENOME_SELECTOR_PRE@"> + #if $reference_genome.reference_genome_source == 'history': + ln -s '$reference_genome.genome_fasta' genomeref.fa; + #end if + </token> + <token name="@GENOME_SELECTOR@"> + #if str($reference_genome.reference_genome_source) == 'cached': + '${reference_genome.fasta_indexes.fields.path}' + #else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa + #end if + </token> +</macros>