changeset 0:7af33315bc5e draft

Uploaded
author crs4
date Mon, 09 Sep 2013 06:11:47 -0400
parents
children f77ce4f92b46
files COPYING datatypes_conf.xml edge_pro.py edge_pro.xml get_edge_data.py get_edge_data.xml tool_dependencies.xml
diffstat 7 files changed, 431 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/COPYING	Mon Sep 09 06:11:47 2013 -0400
@@ -0,0 +1,23 @@
+Copyright © 2012-2013 CRS4 Srl. http://www.crs4.it/
+Created by:
+Paolo Uva <paolo.uva@crs4.it>
+Nicola Soranzo <nicola.soranzo@crs4.it>
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml	Mon Sep 09 06:11:47 2013 -0400
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+<datatypes>
+  <registration>
+    <datatype extension="ptt" type="galaxy.datatypes.data:Text" mimetype="text/plain" subclass="True" display_in_upload="true" />
+    <datatype extension="rnt" type="galaxy.datatypes.data:Text" mimetype="text/plain" subclass="True" display_in_upload="true" />
+  </registration>
+</datatypes>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/edge_pro.py	Mon Sep 09 06:11:47 2013 -0400
@@ -0,0 +1,107 @@
+# -*- coding: utf-8 -*-
+"""
+Wrapper for EDGE - Gene expression in Prokaryotes
+Author: Paolo Uva paolo dot uva at crs4 dot it
+Date: March 18, 2013
+"""
+
+import glob
+import optparse
+import os
+import shutil
+import subprocess
+import tempfile
+import sys
+
+def __main__():
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    # Input
+    parser.add_option('-g', '--genome', dest="genome")
+    parser.add_option('-p', '--ptt', dest="ptt")
+    parser.add_option('-r', '--rnt', dest="rnt")
+    parser.add_option('-u', '--input1', dest="input1")
+    parser.add_option('-v', '--input2', dest="input2")
+    parser.add_option('-t', '--num-threads', dest="num_threads", type='int')
+    parser.add_option('-m', '--minInsertSize', dest="minInsertSize", type='int')
+    parser.add_option('-M', '--maxInsertSize', dest="maxInsertSize", type='int')
+    parser.add_option('-w', '--window', dest="window", type='int')
+    parser.add_option('-i', '--utrSize', dest="utrSize", type='int')
+    parser.add_option('-x', '--similarity', dest="similarity", type='float')
+    parser.add_option('-c', '--minCoverage', dest="minCoverage", type='int')
+    parser.add_option('-l', '--readLength', dest="readLength", type='int')
+    # Output
+    parser.add_option('--out-aln', dest="out_aln")
+    parser.add_option('--out-rpkm', dest="out_rpkm")
+    parser.add_option('--out-log', dest="out_log")
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        parser.error('Wrong number of arguments')
+    
+    # Build command
+    if options.input2:
+        input2_flag = '-v %s' % (options.input2)
+    else:
+        input2_flag = ''
+    if options.num_threads is not None:
+        num_threads_flag = '-t %d' % options.num_threads
+    else:
+        num_threads_flag = ''
+    if options.minInsertSize is not None:
+        minInsertSize_flag = '-m %d' % options.minInsertSize
+    else:
+        minInsertSize_flag = ''
+    if options.maxInsertSize is not None:
+        maxInsertSize_flag = '-M %d' % options.maxInsertSize
+    else:
+        maxInsertSize_flag = ''
+    if options.window is not None:
+        window_flag = '-w %d' % options.window
+    else:
+        window_flag = ''
+    if options.utrSize is not None:
+        utrSize_flag = '-i %d' % options.utrSize
+    else:
+        utrSize_flag = ''
+    if options.similarity is not None:
+        similarity_flag = '-x %s' % options.similarity
+    else:
+        similarity_flag = ''
+    if options.readLength is not None:
+        readLength_flag = '-l %d' % (options.readLength)
+    else:
+        readLength_flag = ''
+    if options.minCoverage is not None:
+        minCoverage_flag = '-c %d' % options.minCoverage
+    else:
+        minCoverage_flag = ''
+    
+    wd = tempfile.mkdtemp()
+    try:
+        prefix = os.path.join(wd, 'out')
+        cl = 'edge.pl -g %s -p %s -r %s -u %s -o %s %s %s %s %s %s %s %s %s %s' % (options.genome, options.ptt, options.rnt, options.input1, prefix, input2_flag, num_threads_flag, minInsertSize_flag, maxInsertSize_flag, window_flag, utrSize_flag, similarity_flag, readLength_flag, minCoverage_flag)
+        print cl
+        
+        if options.out_log:
+            sout = open(options.out_log, 'w')
+        else:
+            sout = sys.stdout
+        try:
+            subprocess.check_call(cl, stdout=sout, stderr=subprocess.STDOUT, shell=True) # need to redirect stderr because edge.pl calls bowtie2 and count which write some logging info there
+        finally:
+            if sout != sys.stdout:
+                sout.close()
+
+        # Move alignment file
+        shutil.move(prefix + '.alignments', options.out_aln)
+        # Concatenate multiple RPKM files together
+        with open(options.out_rpkm, 'wb') as destination:
+            for filename in glob.iglob(prefix + '.rpkm_*'):
+                with open(filename, 'rb') as source:
+                    shutil.copyfileobj(source, destination)
+    finally:
+        shutil.rmtree(wd)
+
+
+if __name__ == "__main__":
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/edge_pro.xml	Mon Sep 09 06:11:47 2013 -0400
@@ -0,0 +1,133 @@
+<tool id="edge_pro" name="EDGE-pro" version="1.0.0">
+  <description>Gene expression in Prokaryotes</description>
+  <requirements>
+    <requirement type="package" version="1.3.1">edge-pro</requirement>
+  </requirements>
+  <command interpreter="python">
+    edge_pro.py
+    \${EDGE_PRO_SITE_OPTIONS:---num-threads=4}
+    
+    ## Mandatory input parameters
+    --genome $genome
+    --ptt $ptt
+    --rnt $rnt
+    ## First input file always required
+    --input1 $singlePaired.input1
+    ## Second input only if input is paired-end
+    #if $singlePaired.sPaired == "paired"
+      --input2=$singlePaired.input2
+    #end if
+    
+    ## Optional input parameters
+    #if $params.settingsType == "full"
+      #if str($params.minInsertSize)
+        --minInsertSize=$params.minInsertSize
+      #end if
+      #if str($params.maxInsertSize)
+        --maxInsertSize=$params.maxInsertSize
+      #end if
+      #if str($params.window)
+        --window=$params.window
+      #end if
+      #if str($params.utrSize)
+        --utrSize=$params.utrSize
+      #end if
+      #if str($params.similarity)
+        --similarity=$params.similarity
+      #end if
+      #if str($params.readLength)
+        --readLength=$params.readLength
+      #end if
+      #if str($params.minCoverage)
+        --minCoverage=$params.minCoverage
+      #end if
+    #end if
+    
+    ## Outputs
+    --out-aln $out_aln
+    --out-rpkm $out_rpkm
+    --out-log $out_log
+  </command>
+  <inputs>
+    <conditional name="singlePaired">
+      <param name="sPaired" type="select" label="Is this library mate-paired?">
+        <option value="single">Single-end</option>
+        <option value="paired">Paired-end</option>
+      </param>
+      <when value="single">
+        <param format="fastqsanger" name="input1" type="data" label="FASTQ file" help="FASTQ format with Sanger-scaled quality values (Galaxy fastqsanger datatype)"/>
+      </when>
+      <when value="paired">
+        <param format="fastqsanger" name="input1" type="data" label="Forward FASTQ file" help="FASTQ format with Sanger-scaled quality values (Galaxy fastqsanger datatype)" />
+        <param format="fastqsanger" name="input2" type="data" label="Reverse FASTQ file" help="FASTQ format with Sanger-scaled quality values (Galaxy fastqsanger datatype)" />
+      </when>
+    </conditional>
+    
+    <param format="fasta" name="genome" type="data" label="Select the reference genome from your history (-g)" help="FASTA format" />
+    <param format="ptt" name="ptt" type="data" label="Coordinates of coding genes (PTT file)" help="PTT file with coordinates of coding genes (-p)" />
+    <param format="rnt" name="rnt" type="data" label="Coordinates of structural RNAs (RNT file)" help="RNT file with coordinates of structural RNA (-r)" />
+    
+    <conditional name="params">
+      <param name="settingsType" type="select" label="Parameter settings" help="For most needs, use default settings. If you want full control use Full Parameter List">
+        <option value="preSet">Use Defaults</option>
+        <option value="full">Full parameter list</option>
+      </param>
+      <when value="preSet" />
+      <!-- Full/advanced params. -->
+      <when value="full">
+        <param name="minInsertSize" type="integer" optional="true" value="0" label="Minimum insert size for a read pair (-m)" help="For paired-end reads only" />
+        <param name="maxInsertSize" type="integer" optional="true" value="500" label="Maximum insert size for a read pair (-M)" help="For paired-end reads only" />
+        <param name="window" type="integer" optional="true" value="100" label="Window length for coverage distribution (-w)" help="Used to distribute the coverage between two overlapping genes. See help below for details" />
+        <param name="utrSize" type="integer" optional="true" value="40" label="Size of the untranslated region (-i)" help="Enter the size of the untranslated region between the initial transcription site and the start codon" />
+        <param name="similarity" type="float" optional="true" value="0.15" label="Percentage for similar coverage (-x)" help="Enter the percentage used to determine when two coverage values are considered similar. See help below for details" />
+        <param name="readLength" type="integer" optional="true" value="" label="Read length (-l)" help="If not specified, the first 1000 reads are used to approximate the read length" />
+        <param name="minCoverage" type="integer" optional="true" value="3" label="Minimum average coverage for expressed genes (-c)" help="Coverage less than specified is assumed to be noise and gene is considered to not be expressed" />
+      </when> <!-- full -->
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data format="sam" name="out_aln" label="${tool.name} on ${on_string}: alignment"/>
+    <data format="tabular" name="out_rpkm" label="${tool.name} on ${on_string}: rpkm"/>
+    <data format="txt" name="out_log" label="${tool.name} on ${on_string}: log"/>
+  </outputs>
+
+  <help>
+
+**What it does**
+
+`EDGE-pro`_ (Estimated Degree of Gene Expression in PROkaryotes) is an efficient software system to estimate gene expression levels in prokaryotic genomes from RNA-seq data. EDGE-pro uses Bowtie2 for alignment and then estimates expression directly from the alignment results.
+EDGE-pro includes routines to assign reads aligning to overlapping gene regions accurately. 15% or more of bacterial genes overlap other genes, making this a significant problem for bacterial RNA-seq, one that is generally ignored by programs designed for eukaryotic RNA-seq experiments.
+
+**Input files:**
+
+.. class:: infomark
+
+Input files with gene coordinates in PTT and RNT format can be retrieved with the Get EDGE-pro Files tool available in Galaxy, or downloaded from the `NCBI ftp repository`_.
+This tool accepts files in Sanger FASTQ format (Galaxy *fastqsanger* datatype). Use the FASTQ Groomer tool to prepare your files.
+
+.. _NCBI ftp repository: ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/
+
+.. class:: warningmark
+
+All 3 types of files (FASTA reference genome, PTT and RNT) must have the same order of chromosomes/plasmids (e.g. if chr1 is before chr2 in genome.fasta file, then chr1 must be before chr2 in ptt and rnt files as well). If there is no PTT or RNT file for one of chromosomes/plasmids, place this chromosome/plasmid at the end of the file.
+
+**License and citation**
+
+This Galaxy tool is Copyright © 2012-2013 `CRS4 Srl.`_ and is released under the `MIT license`_.
+
+.. _CRS4 Srl.: http://www.crs4.it/
+.. _MIT license: http://opensource.org/licenses/MIT
+
+If you use this tool in Galaxy, please cite |Cuccuru2013|_.
+
+.. |Cuccuru2013| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2013) Orione, a web-based framework for NGS analysis in microbiology. *Submitted*
+.. _Cuccuru2013: http://orione.crs4.it/
+
+This tool uses `EDGE-pro`_, which is licensed separately. Please cite |Magoc2013|_.
+
+.. _EDGE-pro: http://ccb.jhu.edu/software/EDGE-pro/
+.. |Magoc2013| replace:: Magoc, T., Wood, D., Salzberg, S. L. (2013) EDGE-pro: Estimated Degree of Gene Expression in Prokaryotic Genomes. *Evol. Bioinform.* 2013:9, 127-136
+.. _Magoc2013: http://www.la-press.com/edge-pro-estimated-degree-of-gene-expression-in-prokaryotic-genomes-article-a3586
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_edge_data.py	Mon Sep 09 06:11:47 2013 -0400
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+
+from ftplib import FTP
+import optparse
+import sys
+
+class GetData:
+
+    def __init__(self, gbkid, fnafile, pttfile, rntfile, gfffile):
+        self.gbkid = gbkid
+        self.fnafile = fnafile
+        self.pttfile = pttfile
+        self.rntfile = rntfile
+        self.gfffile = gfffile
+        self.ftpurl = 'ftp.ncbi.nlm.nih.gov'
+        self.folder = '/genomes/Bacteria/'
+
+    def getData(self):
+        """ """
+        strainName = self._getStrainName()
+        print strainName
+        if not strainName:
+            sys.exit("Unrecognized RefSeq Genomic Accession ID")
+        ftp = FTP(self.ftpurl)
+        ftp.login()
+        newDir = self.folder + strainName
+        ftp.cwd(newDir)
+        
+        directoryFiles = []
+        ftp.retrlines('NLST',  directoryFiles.append)
+        for fileName in directoryFiles:
+            try:
+                if '.fna' in fileName and self.gbkid in fileName:
+                    #print "downloading", fileName
+                    with open(self.fnafile, 'w') as outFile:
+                        ftp.retrbinary("RETR " + fileName, outFile.write)
+                elif '.ptt' in fileName and self.gbkid in fileName:
+                    #print "downloading", fileName
+                    with open(self.pttfile, 'w') as outFile:
+                        ftp.retrbinary("RETR " + fileName, outFile.write)
+                elif '.rnt' in fileName and self.gbkid in fileName:
+                    #print "downloading", fileName
+                    with open(self.rntfile, 'w') as outFile:
+                        ftp.retrbinary("RETR " + fileName, outFile.write)
+                elif '.gff' in fileName and self.gbkid in fileName:
+                    #print "downloading", fileName
+                    with open(self.gfffile, 'w') as outFile:
+                        ftp.retrbinary("RETR " + fileName, outFile.write)
+                #elif '.gbk' in fileName and self.gbkid in fileName:
+                #    print "downloading", fileName
+                #    with open(fileName, 'w') as outFile:
+                #       ftp.retrbinary("RETR " + fileName, outFile.write)
+            except:
+                pass
+
+    def _getStrainName(self):
+        """ """
+        ftp = FTP(self.ftpurl)
+        ftp.login()
+        ftp.cwd(self.folder)
+        
+        straindirectories = []
+        ftp.retrlines("NLST " , straindirectories.append)
+        #print "scanning directories..."
+        for strainName in straindirectories:
+            try:
+                newDir = self.folder + strainName
+                ftp.cwd(newDir)
+                strainFiles = []
+                ftp.retrlines('NLST',  strainFiles.append)
+                for element in strainFiles:
+                    if self.gbkid in element:
+                        return strainName
+            except:
+                pass
+        return None
+
+
+def __main__():
+    """ main function """
+    parser = optparse.OptionParser()
+    parser.add_option('-i', dest='gbkid', help='RefSeq Genomic Accession ID')
+    parser.add_option('--fna', dest='fnafile', help='Output FASTA file name')
+    parser.add_option('--ptt', dest='pttfile', help='Output PTT file name')
+    parser.add_option('--rnt', dest='rntfile', help='Output RNT file name')
+    parser.add_option('--gff', dest='gfffile', help='Output GFF file name')
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        parser.error('Wrong number of arguments')
+    
+    S = GetData(options.gbkid, options.fnafile, options.pttfile, options.rntfile, options.gfffile)
+    S.getData()
+
+
+if __name__ == "__main__":
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_edge_data.xml	Mon Sep 09 06:11:47 2013 -0400
@@ -0,0 +1,38 @@
+<tool id="get_edge_data" name="Get EDGE-pro files" version="1.0.1">
+  <description></description>
+  <command interpreter="python">get_edge_data.py -i "$gbkid" --fna $fnafile --ptt $pttfile --rnt $rntfile --gff $gfffile</command>
+
+  <inputs>
+    <param name="gbkid" type="text" value="NC_" label="RefSeq Genomic Accession ID">
+      <validator type="empty_field" />
+    </param>
+  </inputs>
+  <outputs>
+    <data name="fnafile" format="fasta" label="${tool.name} on ${on_string}: FASTA" />
+    <data name="pttfile" format="ptt" label="${tool.name} on ${on_string}: PTT" />
+    <data name="rntfile" format="rnt" label="${tool.name} on ${on_string}: RNT" />
+    <data name="gfffile" format="gff" label="${tool.name} on ${on_string}: GFF" />
+  </outputs>
+  <tests>
+
+  </tests>
+  <help>
+**What it does**
+
+This tool retrieves from NCBI the files required by `EDGE-pro`_.
+
+.. _EDGE-pro: http://ccb.jhu.edu/software/EDGE-pro/
+
+**License and citation**
+
+This Galaxy tool is Copyright © 2012-2013 `CRS4 Srl.`_ and is released under the `MIT license`_.
+
+.. _CRS4 Srl.: http://www.crs4.it/
+.. _MIT license: http://opensource.org/licenses/MIT
+
+If you use this tool in Galaxy, please cite |Cuccuru2013|_.
+
+.. |Cuccuru2013| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2013) Orione, a web-based framework for NGS analysis in microbiology. *Submitted*
+.. _Cuccuru2013: http://orione.crs4.it/
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Sep 09 06:11:47 2013 -0400
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="edge-pro" version="1.3.1">
+    <install version="1.0">
+      <actions>
+        <action type="download_by_url">http://ccb.jhu.edu/software/EDGE-pro/EDGE_pro_v1.3.1.tar.gz</action>
+        <action type="shell_command">make</action>
+        <action type="move_file">
+          <source>count</source>
+          <destination>$INSTALL_DIR/bin</destination>
+        </action>
+        <action type="move_file">
+          <source>edge.pl</source>
+          <destination>$INSTALL_DIR/bin</destination>
+        </action>
+        <action type="set_environment">
+          <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+        </action>
+      </actions>
+    </install>
+    <readme>
+Dependencies of EDGE-pro which need to be installed separately:
+- bowtie2 ( http://bowtie-bio.sourceforge.net/bowtie2/ ), in particular bowtie2, bowtie2-align and bowtie2-build need to be in the $PATH . These are usually already installed for Galaxy;
+- Switch Perl core module ( http://search.cpan.org/~rgarcia/Switch/Switch.pm ).
+    </readme>
+  </package>
+</tool_dependency>