Mercurial > repos > yating-l > snap

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Group.py	Wed Apr 12 17:37:47 2017 -0400
@@ -0,0 +1,59 @@
+from operator import itemgetter
+
+# Input: A group: a list that contains lines belonging to the same gene
+class Group:
+    # Modify "type" column and "attributes" colunm, initialize id, gene, source, stream
+    def __init__(self, group):
+        self.group = group
+        self.id = str(group[0][0])
+        self.source = str(group[0][1])
+        self.stream = str(group[0][6])
+        self.gene = str(group[0][8])
+        for x in range(0, len(group)):
+            self.group[x][2] = "CDS"
+            self.group[x][8] = "Parent=mRNA_" + self.gene
+            self.group[x][3] = int(self.group[x][3])
+            self.group[x][4] = int(self.group[x][4])
+
+    # Order the group elements accoriding to Stream, +: ascanding order, -: descanding order
+    def order(self):
+        self.num = len(self.group)
+        if self.stream == "+":
+            self.group = sorted(self.group, key=itemgetter(3))
+            self.min_item = self.group[0][3]
+            self.max_item = self.group[self.num-1][4]
+        elif self.stream == "-":
+            self.group = sorted(self.group, key=itemgetter(3), reverse=True)
+            self.min_item = self.group[self.num-1][3]
+            self.max_item = self.group[0][4]
+        else:
+            print("Stream in invalid!\n")
+
+    def phaseCalculator(self, i, donor = 0):
+        if i >= self.num:
+            pass
+        else:
+            self.type = self.group[i][2]
+            self.size = self.group[i][4] - self.group[i][3] + 1
+        if self.num == 1:
+            if self.type == "Eterm":
+                self.group[i][7] = str(self.size % 3)
+            else:
+                self.group[i][7] = "0"
+        elif self.num > 1 and i < self.num:
+            accept = (3 - donor) % 3
+            self.group[i][7] = str(accept)
+            donor = (self.size - accept) % 3
+            i = i + 1
+            self.phaseCalculator(i, donor)
+
+
+    def writer(self, gff3):
+        self.order()
+        self.phaseCalculator(0)
+        gff3.write(self.id + "\t" + self.source + "\tgene\t" + str(self.min_item) + "\t" + str(self.max_item) + "\t.\t" + self.stream + "\t.\t" + "ID=" + self.gene + "\n")
+        gff3.write(self.id + "\t" + self.source + "\tmRNA\t" + str(self.min_item) + "\t" + str(self.max_item) + "\t.\t" + self.stream + "\t.\t" + "ID=mRNA_" + self.gene + ";Parent=" + self.gene + "\n")
+        for x in range(0, len(self.group)):
+            self.group[x][3] = str(self.group[x][3])
+            self.group[x][4] = str(self.group[x][4])
+            gff3.write("\t".join(self.group[x]) + "\n")
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md	Wed Apr 12 17:37:47 2017 -0400
@@ -0,0 +1,2 @@
+# SNAP
+Galaxy wrapper for SNAP gene prediction tool
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff2Togff3.py	Wed Apr 12 17:37:47 2017 -0400
@@ -0,0 +1,59 @@
+import argparse
+import sys
+import fileinput
+from Group import Group
+
+def main():
+        parser = argparse.ArgumentParser(description='Get a gff file and the output gff3 file')
+        parser.add_argument('--input', help='input gff file')
+        parser.add_argument('--output', help='output gff3 file', required=True)
+        args = parser.parse_args()
+        input = args.input
+        output = args.output
+        if not sys.stdin.isatty():
+            c = Convertor(sys.stdin, output)
+        else:
+            c = Convertor(input, output)
+        c.convert()
+
+class Convertor:
+    def __init__(self, input, output):
+        if type(input) is str:
+            with open(input) as self.f:
+                self.li = [line.rstrip().split("\t") for line in self.f]
+        else:
+            self.li = [line.rstrip().split("\t") for line in input]
+        self.gff3 = open(output, "w")
+        self.gff3.write("##gff-version 3\n")
+
+    def convert(self):
+        index = 0
+        while index in range(0, len(self.li)):
+            index = self.groupAsgene(index)
+        self.gff3.close()
+
+
+    def groupAsgene(self, start = 0):
+        gene = self.li[start][8]
+        index = len(self.li)
+        for i in range(start+1, len(self.li)):
+            line = self.li[i]
+            if gene != line[8]:
+                index = i
+                break
+        if index >= len(self.li):
+            group = self.li[start:len(self.li)]
+        else:
+            group = self.li[start:index]
+        g = Group(group)
+        g.writer(self.gff3)
+        return index
+
+
+
+
+# Run the converter only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
+
+
+
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/readme.rst	Wed Apr 12 17:37:47 2017 -0400
@@ -0,0 +1,46 @@
+Galaxy wrapper for SNAP
+========================
+
+This wrapper is copyright 2016-2017 by Yating Liu
+
+This is a wrapper for the gene prediction tool SNAP. SNAP is a general-purpose gene-finding program suitable for both eukaryotic and prokaryotic genomes. SNAP is an acronym for Semi-HMM-based Nucleic Acid
+Parser.
+
+Reference
+----------------------
+
+    Korf I. Gene finding in novel Genomes. BMC Bioinformatics 2004, 5:59
+
+Installation
+-----------------------
+
+To install SNAP, please download SNAP from
+
+http://korflab.ucdavis.edu/Software/snap-2013-11-29.tar.gz
+
+and follow the installation instructions. The software is routinely compiled and tested on Mac OS X. It should compile
+fine on any Linux/Unix-type operating system.
+The default compiler is gcc. If you have gcc installed, the easiest is to just compile as:
+```
+  make
+```
+
+The ZOE environment variable is used by SNAP to find the HMM files. Set it
+to the SNAP installation directory (the one that contains the HMM folder). For example, if you unpacked the tarball in /usr/local/snap, set the ZOE environment variable to /usr/local/snap
+
+```
+    setenv ZOE /usr/local/snap # csh, tcsh, etc
+```
+  or
+```
+    export ZOE=/usr/local/snap # sh, bash, etc
+```
+To install the wrapper, copy the snap folder into the Galaxy tools directory and modify the $GALAXY_ROOT/config/tool_conf.xml file to make the tool available to Galaxy. For example:
+```
+<tool file="galaxy/tools/myTools/snap/snap.xml" />
+```
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snap.xml	Wed Apr 12 17:37:47 2017 -0400
@@ -0,0 +1,170 @@
+<tool id="snap" name="Semi-HMM-based Nucleic Acid Parser (SNAP)" version="1.0">
+    <requirements>
+        <requirement type="package" version="1.0">snap</requirement>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" />
+    </stdio>
+    <command><![CDATA[
+        ## Run SNAP with GFF output and pipe it into the bundled GFF -> GFF3 converter.
+        snap
+        #if $settings.advanced == "advanced":
+            $settings.lcmask
+            ## "strand" is a conditional; its boolean test parameter is $settings.strand.strand.
+            #if str($settings.strand.strand) == "true":
+                $settings.strand.onestrand
+            #end if
+        #end if
+        #if $proteins == "-aa":
+            $proteins '$output2'
+        #end if
+        #if $transcripts == "-tx":
+            $transcripts '$output3'
+        #end if
+        -gff
+        -quiet
+        '$organism'
+        '$input1'
+        | python '$__tool_directory__/gff2Togff3.py' --output '$output1'
+    ]]></command>
+    <inputs>
+        <param type="data" name="input1" format="fasta" />
+        <param name="organism" label="Model Organism" type="select" multiple="false" format="text" help="Choose a specialised trainingset.">
+            <option value="A.gambiae.hmm">A.gambiae</option>
+            <option value="A.mellifera.hmm">A.mellifera</option>
+            <option value="A.thaliana.hmm">Arabidopsis thaliana</option>
+            <option value="Acanium.hmm">Acanium</option>
+            <option value="At.hmm">Thale</option>
+            <option value="B.malayi.hmm">Brugia</option>
+            <option value="B.mori.hmm">B.mori</option>
+            <option value="C.elegans.hmm">C.elegans</option>
+            <option value="C.intestinalis.hmm">Ciona</option>
+            <option value="Ce.hmm">Ce</option>
+            <option value="D.melanogaster.hmm">Drosophila melanogaster</option>
+            <option value="Dm.hmm">Dm</option>
+            <option value="ixodesA.hmm">ixodesA</option>
+            <option value="ixodesB.hmm">ixodesB</option>
+            <option value="mam39-ro.hmm">mam39-ro</option>
+            <option value="mam39.hmm">mam39</option>
+            <option value="mam46-ro.hmm">mam46-ro</option>
+            <option value="mam46.hmm">mam46</option>
+            <option value="mam54-ro.hmm">mam54-ro</option>
+            <option value="mam54.hmm">mam54</option>
+            <option value="mamiso.hmm">mamiso</option>
+            <option value="minimal.hmm">minimal</option>
+            <option value="Nasonia.hmm">Nasonia</option>
+            <option value="nGASP.hmm">nGASP</option>
+            <option value="nGASPr.hmm">nGASPr</option>
+            <option value="O.sativa.hmm">O.sativa</option>
+            <option value="Os.hmm">Os</option>
+        </param>
+        <param name="proteins" type="select" label="Create FASTA file of proteins">
+            <option value="">No</option>
+            <option value="-aa">Yes</option>
+        </param>
+        <param name="transcripts" type="select" label="Create FASTA file of transcripts">
+            <option value="">No</option>
+            <option value="-tx">Yes</option>
+        </param>
+        <conditional name="settings">
+            <param name="advanced" type="select" label="Specify advanced parameters">
+                <option value="simple" selected="true">No, use program defaults.</option>
+                <option value="advanced">Yes, see full parameter list.</option>
+            </param>
+            <when value="simple">
+            </when>
+            <when value="advanced">
+                <param name="lcmask" type="boolean" label="treat lowercase as N" truevalue="-lcmask" falsevalue="" />
+                <conditional name="strand">
+                    <param name="strand" type="boolean" label="predict on one strand only" />
+                    <when value="true">
+                    <param name="onestrand" type="select" label="Specify which strand to predict">
+                        <option value="-plus">predict on plus strand only.</option>
+                        <option value="-minus">predict on minus strand only.</option>
+                    </param>
+                    </when>
+                    <when value="false">
+                    </when>
+                </conditional>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="output1" format="gff3" label="${tool.name} on ${on_string}: GTF/GFF3">
+        </data>
+        <data name="output2" format="fasta" label="${tool.name} on ${on_string}: Protein sequence">
+            <filter>proteins == "-aa"</filter>
+        </data>
+        <data name="output3" format="fasta" label="${tool.name} on ${on_string}: Coding sequence">
+            <filter>transcripts == "-tx"</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input1" value="thale.dna.gz"/>
+            <param name="organism" value="At.hmm" />
+            <output name="output1" file="thale.gff3"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+        The general form of the snap command line is:
+
+    snap <HMM file> <FASTA file> [options]
+
+HMM file:
+
+    The most convenient way to specify the HMM file is by name. This requires
+    that the ZOE environment variable is set. In this case, snap will look
+    for the HMM file in $ZOE/HMM. You may also specify the HMM file by an
+    explicit path. The following are equivalent if $ZOE is in /usr/local:
+
+        snap C.elegans.hmm ...
+        snap /usr/local/Zoe/HMM/C.elegans.hmm ...
+        snap worm ...  # there are a few convenient aliases in $ZOE/HMM
+
+FASTA file:
+
+    If you have several sequences to analyze, it is more efficient to run
+    snap on a concatenated FASTA file rather than separate runs on single
+    sequence files. The sequence may be in a compressed format.
+
+    If sequences have been masked with lowercase letters, use -lcmask to
+    prevent exons from appearing in masked DNA.
+
+Output:
+
+    Annotation is reported to stdout in a non-standard format (ZFF). You can
+    change to GFF or ACEDB with the -gff or -ace options. Proteins and
+    transcripts are reported to FASTA files with the -aa and -tx options.
+
+External definitions:
+
+    SNAP allows you to adjust the score of any sequence model at any point
+    in a sequence. This behavior is invoked by giving a ZFF file to SNAP:
+
+        snap <hmm> <sequence> -xdef <ZFF file>
+
+    Each feature description uses the 'group' field to issue a command:
+
+        SET     set the score
+        ADJ     adjust the score up or down
+        OK      set non-canonical scores
+
+     >FOO
+     Acceptor 120 120 + +50 . . . SET  (sets an Acceptor to 50)
+     Donor    212 212 + -20 . . . ADJ  (lowers a Donor by -20)
+     Inter    338 579 +  -2 . . . ADJ  (lowers Inter by -2 in a range)
+     Coding   440 512 -  +3 . . . ADJ  (raises Coding by +3 in a range)
+     Donor    625 638 +  -5 . . . OK   (sets range of odd Donors to -5)
+
+If the output has scrolled off your screen, try 'snap -help | more'
+
+    ]]></help>
+    <citations>
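+        <citation type="bibtex">
+@article{korf2004snap,
+  author = {Korf, Ian},
+  year = {2004},
+  title = {Gene finding in novel genomes},
+  journal = {BMC Bioinformatics},
+  volume = {5},
+  pages = {59},
+  doi = {10.1186/1471-2105-5-59},
+  url = {http://korflab.ucdavis.edu/Software/snap-2013-11-29.tar.gz},
+}</citation>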
+    </citations>
+</tool>
Binary file test-data/thale.dna.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Wed Apr 12 17:37:47 2017 -0400
@@ -0,0 +1,25 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="snap" version="1.0">
+        <install version="1.0">
+            <actions_group>
+                <actions>
+                    <action type="download_by_url">http://korflab.ucdavis.edu/Software/snap-2013-11-29.tar.gz</action>
+                    <action type="shell_command">make</action>
+                    <action type="move_directory_files">
+                        <source_directory>.</source_directory>
+                        <destination_directory>$INSTALL_DIR</destination_directory>
+                    </action>
+                    <action type="set_environment">
+                        <environment_variable name="ZOE" action="set_to">$INSTALL_DIR</environment_variable>
+                        <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR</environment_variable>
+                    </action>
+                </actions>
+            </actions_group>
+        </install>
+        <readme>SNAP is a general-purpose gene-finding program suitable for both eukaryotic
+            and prokaryotic genomes. SNAP is an acronym for Semi-HMM-based Nucleic Acid
+            Parser.
+        </readme>
+        </readme>
+    </package>
+</tool_dependency>