# HG changeset patch # User crs4 # Date 1378721507 14400 # Node ID 7af33315bc5efbdfe4f50c54ebabcdabaec95d12 Uploaded diff -r 000000000000 -r 7af33315bc5e COPYING --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/COPYING Mon Sep 09 06:11:47 2013 -0400 @@ -0,0 +1,23 @@ +Copyright © 2012-2013 CRS4 Srl. http://www.crs4.it/ +Created by: +Paolo Uva +Nicola Soranzo + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# -*- coding: utf-8 -*-
"""
Wrapper for EDGE - Gene expression in Prokaryotes
Author: Paolo Uva paolo dot uva at crs4 dot it
Date: March 18, 2013
"""

import glob
import optparse
import os
import shutil
import subprocess
import tempfile
import sys

# (attribute on the parsed options, flag shown in the error message) for the
# options the wrapper cannot work without.
_REQUIRED_OPTIONS = (
    ('genome', '--genome'),
    ('ptt', '--ptt'),
    ('rnt', '--rnt'),
    ('input1', '--input1'),
    ('out_aln', '--out-aln'),
    ('out_rpkm', '--out-rpkm'),
)

# (flag passed to edge.pl, attribute on the parsed options) for the optional
# numeric settings, in the order they are appended to the command.
_OPTIONAL_FLAGS = (
    ('-t', 'num_threads'),
    ('-m', 'minInsertSize'),
    ('-M', 'maxInsertSize'),
    ('-w', 'window'),
    ('-i', 'utrSize'),
    ('-x', 'similarity'),
    ('-l', 'readLength'),
    ('-c', 'minCoverage'),
)


def _build_command(options, prefix):
    """Return the edge.pl invocation as an argument list (no shell quoting needed)."""
    cmd = [
        'edge.pl',
        '-g', options.genome,
        '-p', options.ptt,
        '-r', options.rnt,
        '-u', options.input1,
        '-o', prefix,
    ]
    if options.input2:
        cmd.extend(['-v', options.input2])
    for flag, attribute in _OPTIONAL_FLAGS:
        value = getattr(options, attribute)
        # 0 is a legitimate value, so test against None rather than truthiness.
        if value is not None:
            cmd.extend([flag, str(value)])
    return cmd


def _rpkm_sort_key(path):
    """Order '<prefix>.rpkm_<suffix>' files numerically by suffix, others by name."""
    suffix = path.rsplit('.rpkm_', 1)[-1]
    if suffix.isdigit():
        return (0, int(suffix), '')
    return (1, 0, suffix)


def __main__():
    """Run edge.pl in a scratch directory and collect its outputs.

    Command-line options (parsed from ``sys.argv``):
      -g/--genome, -p/--ptt, -r/--rnt, -u/--input1   mandatory inputs
      -v/--input2                                     optional second reads file
      -t, -m, -M, -w, -i, -x, -c, -l                  optional numeric settings,
                                                      forwarded only when given
      --out-aln, --out-rpkm                           mandatory output paths
      --out-log                                       optional log path; without
                                                      it edge.pl writes to stdout

    The '<prefix>.alignments' file produced by edge.pl is moved to --out-aln and
    all '<prefix>.rpkm_*' files are concatenated into --out-rpkm. The scratch
    directory is always removed.

    Exits through ``parser.error`` (status 2) on positional arguments or on a
    missing mandatory option. Raises ``subprocess.CalledProcessError`` when
    edge.pl returns a non-zero status.
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    # Input
    parser.add_option('-g', '--genome', dest="genome")
    parser.add_option('-p', '--ptt', dest="ptt")
    parser.add_option('-r', '--rnt', dest="rnt")
    parser.add_option('-u', '--input1', dest="input1")
    parser.add_option('-v', '--input2', dest="input2")
    parser.add_option('-t', '--num-threads', dest="num_threads", type='int')
    parser.add_option('-m', '--minInsertSize', dest="minInsertSize", type='int')
    parser.add_option('-M', '--maxInsertSize', dest="maxInsertSize", type='int')
    parser.add_option('-w', '--window', dest="window", type='int')
    parser.add_option('-i', '--utrSize', dest="utrSize", type='int')
    parser.add_option('-x', '--similarity', dest="similarity", type='float')
    parser.add_option('-c', '--minCoverage', dest="minCoverage", type='int')
    parser.add_option('-l', '--readLength', dest="readLength", type='int')
    # Output
    parser.add_option('--out-aln', dest="out_aln")
    parser.add_option('--out-rpkm', dest="out_rpkm")
    parser.add_option('--out-log', dest="out_log")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error('Wrong number of arguments')
    # Fail early instead of handing the literal text "None" to edge.pl.
    missing = [flag for attribute, flag in _REQUIRED_OPTIONS
               if not getattr(options, attribute)]
    if missing:
        parser.error('Missing required option(s): %s' % ', '.join(missing))

    wd = tempfile.mkdtemp()
    try:
        prefix = os.path.join(wd, 'out')
        cmd = _build_command(options, prefix)
        print(' '.join(cmd))
        # Flush so the echoed command precedes edge.pl's own output when both
        # share this process's stdout.
        sys.stdout.flush()

        # stderr is merged into stdout because edge.pl calls bowtie2 and count,
        # which write some logging info there.
        if options.out_log:
            with open(options.out_log, 'w') as sout:
                subprocess.check_call(cmd, stdout=sout, stderr=subprocess.STDOUT)
        else:
            subprocess.check_call(cmd, stdout=sys.stdout, stderr=subprocess.STDOUT)

        # Move alignment file
        shutil.move(prefix + '.alignments', options.out_aln)
        # Concatenate multiple RPKM files together, in a reproducible order
        rpkm_parts = sorted(glob.glob(prefix + '.rpkm_*'), key=_rpkm_sort_key)
        with open(options.out_rpkm, 'wb') as destination:
            for filename in rpkm_parts:
                with open(filename, 'rb') as source:
                    shutil.copyfileobj(source, destination)
    finally:
        shutil.rmtree(wd)


if __name__ == "__main__":
    __main__()
edge_pro.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/edge_pro.xml Mon Sep 09 06:11:47 2013 -0400 @@ -0,0 +1,133 @@ + + Gene expression in Prokaryotes + + edge-pro + + + edge_pro.py + \${EDGE_PRO_SITE_OPTIONS:---num-threads=4} + + ## Mandatory input parameters + --genome $genome + --ptt $ptt + --rnt $rnt + ## First input file always required + --input1 $singlePaired.input1 + ## Second input only if input is paired-end + #if $singlePaired.sPaired == "paired" + --input2=$singlePaired.input2 + #end if + + ## Optional input parameters + #if $params.settingsType == "full" + #if str($params.minInsertSize) + --minInsertSize=$params.minInsertSize + #end if + #if str($params.maxInsertSize) + --maxInsertSize=$params.maxInsertSize + #end if + #if str($params.window) + --window=$params.window + #end if + #if str($params.utrSize) + --utrSize=$params.utrSize + #end if + #if str($params.similarity) + --similarity=$params.similarity + #end if + #if str($params.readLength) + --readLength=$params.readLength + #end if + #if str($params.minCoverage) + --minCoverage=$params.minCoverage + #end if + #end if + + ## Outputs + --out-aln $out_aln + --out-rpkm $out_rpkm + --out-log $out_log + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +`EDGE-pro`_, Estimated Degree of Gene Expression in PROkaryots is an efficient software system to estimate gene expression levels in prokaryotic genomes from RNA-seq data. EDGE-pro uses Bowtie2 for alignment and then estimates expression directly from the alignment results. +EDGE-pro includes routines to assign reads aligning to overlapping gene regions accurately. 15% or more of bacterial genes overlap other genes, making this a significant problem for bacterial RNA-seq, one that is generally ignored by programs designed for eukaryotic RNA-seq experiments. + +**Input files:** + +.. 
class:: infomark + +Input files with gene coordinates in PTT and RNT format can be retrieved with the Get EDGE-pro Files tool available in Galaxy, or downloaded from the `NCBI ftp repository`_. +This tool accepts files in Sanger FASTQ format (Galaxy *fastqsanger* datatype). Use the FASTQ Groomer tool to prepare your files. + +.. _NCBI ftp repository: ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/ + +.. class:: warningmark + +All 3 types of files (FASTA reference genome, PTT and RNT) must have the same order of chromosomes/plasmids (e.g. if chr1 is before chr2 in the genome.fasta file, then chr1 must be before chr2 in the ptt and rnt files as well). If there is no PTT or RNT file for one of the chromosomes/plasmids, place this chromosome/plasmid at the end of the file. + +**License and citation** + +This Galaxy tool is Copyright © 2012-2013 `CRS4 Srl.`_ and is released under the `MIT license`_. + +.. _CRS4 Srl.: http://www.crs4.it/ +.. _MIT license: http://opensource.org/licenses/MIT + +If you use this tool in Galaxy, please cite |Cuccuru2013|_. + +.. |Cuccuru2013| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2013) Orione, a web-based framework for NGS analysis in microbiology. *Submitted* +.. _Cuccuru2013: http://orione.crs4.it/ + +This tool uses `EDGE-pro`_, which is licensed separately. Please cite |Magoc2013|_. + +.. _EDGE-pro: http://ccb.jhu.edu/software/EDGE-pro/ +.. |Magoc2013| replace:: Magoc, T., Wood, D., Salzberg, S. L. (2013) EDGE-pro: Estimated Degree of Gene Expression in Prokaryotic Genomes. *Evol. Bioinform.* 2013:9, 127-136 +.. 
# -*- coding: utf-8 -*-

import ftplib
from ftplib import FTP
import optparse
import sys


def _close_ftp(ftp):
    """Close an FTP session politely, falling back to a hard close."""
    try:
        ftp.quit()
    except ftplib.all_errors:
        ftp.close()


def _download(ftp, fileName, destination):
    """Fetch one remote file into `destination`; report failures on stderr.

    Downloads stay best-effort: a failure is reported and the caller moves on
    to the next file instead of aborting.
    """
    try:
        # retrbinary delivers raw bytes, so the file must be opened in binary mode.
        with open(destination, 'wb') as outFile:
            ftp.retrbinary("RETR " + fileName, outFile.write)
    except ftplib.all_errors as err:
        sys.stderr.write("Failed to download %s: %s\n" % (fileName, err))


class GetData:
    """Download the FASTA, PTT, RNT and GFF files of one accession from the NCBI FTP site.

    gbkid   -- RefSeq genomic accession ID; matched as a substring of the
               remote file names
    fnafile -- local path for the '.fna' file
    pttfile -- local path for the '.ptt' file
    rntfile -- local path for the '.rnt' file
    gfffile -- local path for the '.gff' file
    """

    def __init__(self, gbkid, fnafile, pttfile, rntfile, gfffile):
        self.gbkid = gbkid
        self.fnafile = fnafile
        self.pttfile = pttfile
        self.rntfile = rntfile
        self.gfffile = gfffile
        self.ftpurl = 'ftp.ncbi.nlm.nih.gov'
        self.folder = '/genomes/Bacteria/'

    def getData(self):
        """Locate the strain directory of `gbkid` and download its files.

        Exits via ``sys.exit`` with a message when the accession ID is empty
        or when no remote directory contains a file named after it. Prints
        the name of the matching strain directory on stdout.
        """
        # An empty ID is a substring of every file name, so reject it up front.
        if not self.gbkid:
            sys.exit("Missing RefSeq Genomic Accession ID")
        strainName = self._getStrainName()
        if not strainName:
            sys.exit("Unrecognized RefSeq Genomic Accession ID")
        print(strainName)

        # Checked in this order; the first extension found in a file name wins.
        targets = (
            ('.fna', self.fnafile),
            ('.ptt', self.pttfile),
            ('.rnt', self.rntfile),
            ('.gff', self.gfffile),
        )
        ftp = FTP(self.ftpurl)
        try:
            ftp.login()
            ftp.cwd(self.folder + strainName)
            directoryFiles = []
            ftp.retrlines('NLST', directoryFiles.append)
            for fileName in directoryFiles:
                if self.gbkid not in fileName:
                    continue
                for extension, destination in targets:
                    if extension in fileName:
                        # Skip outputs the caller did not ask for.
                        if destination:
                            _download(ftp, fileName, destination)
                        break
        finally:
            _close_ftp(ftp)

    def _getStrainName(self):
        """Return the first strain directory holding a file named after `gbkid`, or None."""
        ftp = FTP(self.ftpurl)
        try:
            ftp.login()
            ftp.cwd(self.folder)

            straindirectories = []
            ftp.retrlines("NLST ", straindirectories.append)
            for strainName in straindirectories:
                strainFiles = []
                try:
                    ftp.cwd(self.folder + strainName)
                    ftp.retrlines('NLST', strainFiles.append)
                except ftplib.Error:
                    # The server refused this entry (e.g. it is not a
                    # directory): skip it. Connection-level failures are not
                    # ftplib.Error and propagate to the caller.
                    continue
                for element in strainFiles:
                    if self.gbkid in element:
                        return strainName
            return None
        finally:
            _close_ftp(ftp)


def __main__():
    """Parse the command line and download the files for the requested accession."""
    parser = optparse.OptionParser()
    parser.add_option('-i', dest='gbkid', help='RefSeq Genomic Accession ID')
    parser.add_option('--fna', dest='fnafile', help='Output FASTA file name')
    parser.add_option('--ptt', dest='pttfile', help='Output PTT file name')
    parser.add_option('--rnt', dest='rntfile', help='Output RNT file name')
    parser.add_option('--gff', dest='gfffile', help='Output GFF file name')
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error('Wrong number of arguments')
    if not options.gbkid:
        parser.error('RefSeq Genomic Accession ID (-i) is required')

    S = GetData(options.gbkid, options.fnafile, options.pttfile, options.rntfile, options.gfffile)
    S.getData()


if __name__ == "__main__":
    __main__()
|Cuccuru2013| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2013) Orione, a web-based framework for NGS analysis in microbiology. *Submitted* +.. _Cuccuru2013: http://orione.crs4.it/ + + diff -r 000000000000 -r 7af33315bc5e tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Sep 09 06:11:47 2013 -0400 @@ -0,0 +1,27 @@ + + + + + + http://ccb.jhu.edu/software/EDGE-pro/EDGE_pro_v1.3.1.tar.gz + make + + count + $INSTALL_DIR/bin + + + edge.pl + $INSTALL_DIR/bin + + + $INSTALL_DIR/bin + + + + +Dependencies of EDGE-pro which need to be installed separately: +- bowtie2 ( http://bowtie-bio.sourceforge.net/bowtie2/ ), in particular bowtie2, bowtie2-align and bowtie2-build need to be in the $PATH . These are usually already installed for Galaxy; +- Switch Perl core module ( http://search.cpan.org/~rgarcia/Switch/Switch.pm ). + + +