Mercurial > repos > fabio > btman
changeset 18:be864d79c9c7 draft
Uploaded 20190304
author | fabio |
---|---|
date | Mon, 04 Mar 2019 08:30:03 -0500 |
parents | f02c2c58a6f9 |
children | 7f712cc0d3d5 |
files | .shed.yml btman-1.0.0/.shed.yml btman-1.0.0/build.sh btman-1.0.0/cluster.sh btman-1.0.0/create.py btman-1.0.0/create.xml btman-1.0.0/dataset.tsv btman-1.0.0/macros.xml btman-1.0.0/makebf.sh btman-1.0.0/query.py btman-1.0.0/query.tsv btman-1.0.0/query.xml create.py create.xml macros.xml query.py query.xml |
diffstat | 17 files changed, 738 insertions(+), 586 deletions(-) [+] |
line wrap: on
line diff
--- a/.shed.yml Fri Jan 18 10:12:40 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -name: btman -owner: iuc -categories: - - Data Source - - Web Services -description: BloomTree Manager -long_description: | - A fast querying tool to identify all publicly available sequenced - samples which express a transcript of interest -remote_repository_url: https://github.com/fabio-cumbo/bloomtree-manager -homepage_url: https://github.com/fabio-cumbo/bloomtree-manager -type: unrestricted -auto_tool_repositories: - name_template: "{{ tool_id }}" - descriptor_template: "Wrapper for BloomTree Manager: {{ tool_name }}." -suite: - name: "btman_suite" - description: "A suite of Galaxy tools designed to work with the BloomTree Manager." - long_description: | - A fast querying tool to identify all publicly available sequenced - samples which express a transcript of interest
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/btman-1.0.0/.shed.yml Mon Mar 04 08:30:03 2019 -0500 @@ -0,0 +1,21 @@ +name: btman +owner: iuc +categories: + - Data Source + - Web Services +description: BloomTree Manager +long_description: | + A suite of tools to quickly create and query Sequence Bloom Trees + supporting determined/how split filters +remote_repository_url: https://github.com/fabio-cumbo/bloomtree-manager +homepage_url: https://github.com/fabio-cumbo/bloomtree-manager +type: unrestricted +auto_tool_repositories: + name_template: "{{ tool_id }}" + descriptor_template: "Wrapper for BloomTree Manager: {{ tool_name }}." +suite: + name: "btman_suite" + description: "A suite of Galaxy tools designed to work with the BloomTree Manager." + long_description: | + A suite of tools to quickly create and query Sequence Bloom Trees + supporting determined/how split filters
#!/bin/bash
# build.sh (btman-1.0.0/build.sh)
#
# Build the HowDeSBT tree nodes from the topology file produced by cluster.sh.
#
# Usage: build.sh <outExpDir>
#   outExpDir  directory that contains union.txt; howde.txt is written there
#
# The exit status is the one returned by "howdesbt build" (or 1 when the
# directory cannot be entered).

outExpDir=$1

# Never run the build in the wrong directory if cd fails.
cd "${outExpDir}" || exit 1

howdesbt build --HowDe --tree=union.txt --outtree=howde.txt
#!/bin/bash
# cluster.sh (btman-1.0.0/cluster.sh)
#
# Create the tree topology (union.txt) for all the Bloom filters (*.bf)
# found in the experiment directory.
#
# Usage: cluster.sh <outExpDir> <bfsize>
#   outExpDir  directory that contains the *.bf leaves
#   bfsize     value passed to "howdesbt cluster" as --bits
#
# The exit status is the one returned by "howdesbt cluster" (or 1 when the
# directory cannot be entered).

outExpDir=$1
bfsize=$2

# Never list/cluster files of the wrong directory if cd fails.
cd "${outExpDir}" || exit 1

# leafnames.txt is also used by create.py to verify that this step ran.
ls *.bf > leafnames.txt
# 'node{number}' is quoted so that it always reaches howdesbt literally.
howdesbt cluster --list=leafnames.txt --bits="${bfsize}" --tree=union.txt --nodename='node{number}' --keepallnodes
#rm leafnames_txt_?
#!/usr/bin/env python
"""Create a Sequence Bloom Tree with HowDeSBT.

Source file: btman-1.0.0/create.py (BloomTree Manager).

Workflow implemented by createSBT():

1. download the experiments listed as SRA accessions with ``fastq-dump``;
2. when no Bloom filter size is given, estimate it with ``ntcard``;
3. build one Bloom filter per experiment through ``makebf.sh``;
4. create the tree topology (``cluster.sh``) and build the tree nodes
   (``build.sh``).

Every list-like command line option is a '|' separated string with one entry
per dataset; entries of FASTA/FASTQ datasets are themselves ',' separated
lists of files.
"""

import sys
import os
import optparse
import shutil
import glob
import subprocess

__version__ = "1.0.0"
# in the case of collections, exitcodes equal to 0 and 1 are not considered errors
ERR_EXIT_CODE = 2
OK_EXIT_CODE = 0
VALID_CHARS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'


def printLog(logfilepath, message, exitcode=OK_EXIT_CODE, exit=False):
    """Print *message*, append it to *logfilepath* and optionally exit.

    When *exit* is true the process terminates through sys.exit(exitcode)
    right after the message has been written.
    """
    print(message)
    with open(logfilepath, 'a+') as out:
        out.write(message + '\n')
    if exit:
        sys.exit(exitcode)


def _sanitizeName(name):
    """Return *name* restricted to the characters listed in VALID_CHARS."""
    return ''.join(c for c in name if c in VALID_CHARS)


def _splitOption(value):
    """Split a '|' separated option value, dropping the blank entries."""
    return [item for item in str(value).split('|') if item.strip()]


def _runCommand(arguments):
    """Run an external command without a shell and return its exit code.

    The arguments are passed as a list, so paths and accession numbers are
    never interpreted by a shell. A command that cannot be started is
    reported with the conventional "command not found" code (127) instead
    of raising, so that callers only have to check for a non-zero result.
    """
    try:
        return subprocess.call(arguments)
    except OSError:
        return 127


def downloadAccessions(formats, filepaths, outlogfile, outdirpath):
    """Download as FASTA every accession listed in the 'accessions' datasets.

    formats and filepaths are parallel lists; for each entry whose format is
    'accessions' the corresponding file is read and the first tab separated
    column of each line is downloaded (upper-cased) with ``fastq-dump``
    into *outdirpath*.

    Returns a dict that maps each successfully downloaded accession to a
    dict with the keys 'format', 'filepath' and 'filename'. Failed downloads
    are logged and excluded.
    """
    downloaded_files = {}
    for dataset_format, dataset_path in zip(formats, filepaths):
        if dataset_format != 'accessions':
            continue
        print(dataset_path)
        with open(dataset_path) as accessions:
            for line in accessions:
                print(line)
                accession = line.split('\t')[0].strip().upper()
                if not accession:
                    continue
                printLog(outlogfile, 'Downloading "' + accession + '" with the fastq-dump tool (part of the sra-tools utility)')
                fastq_dump_exitcode = _runCommand(['fastq-dump', '--outdir', outdirpath, '--fasta', accession])
                if fastq_dump_exitcode != 0:
                    printLog(outlogfile, '> FASTA file: FAILED ( "' + accession + '" will be excluded )')
                    continue
                printLog(outlogfile, '> FASTA file: "' + accession + '.fasta"')
                downloaded_files[accession] = {
                    'format': '.fasta',
                    'filepath': os.path.join(outdirpath, accession + '.fasta'),
                    'filename': _sanitizeName(accession),
                }
    return downloaded_files


def _parseNtcardHistogram(lines):
    """Return the (F0, f1) values found in the lines of a ntcard histogram.

    Only lines made of exactly two tab separated fields are considered;
    a value that is not found is returned as None.
    """
    var_F0 = None
    var_f1 = None
    for line in lines:
        line_split = line.strip().split('\t')
        if len(line_split) != 2:
            continue
        if line_split[0] == 'F0':
            var_F0 = int(line_split[1])
        elif line_split[0] == 'f1':
            var_f1 = int(line_split[1])
        if var_F0 is not None and var_f1 is not None:
            break
    return var_F0, var_f1


def _estimateBloomFilterSize(data_paths, klen, outlogfile, outdirpath):
    """Estimate the Bloom filter size (F0 - f1) by running ntcard.

    ntcard is run in the current working directory, which createSBT() sets
    to *outdirpath*; its 'freq_k<klen>.hist' result is renamed to
    'ntcard<klen>.txt'. Every failure is logged and terminates the process
    with ERR_EXIT_CODE.
    """
    printLog(outlogfile, 'Estimating the Bloom Filter size with ntcard')
    if not data_paths:
        printLog(outlogfile, '> unable to estimate the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True)
    command = ['ntcard', '--kmer=' + str(klen)] + list(data_paths)
    print(' '.join(command))
    ntcard_exitcode = _runCommand(command)
    if ntcard_exitcode != 0:
        printLog(outlogfile, '> [exitcode: ' + str(ntcard_exitcode) + '] an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True)
    histogram_filepath = os.path.join(outdirpath, 'freq_k' + str(klen) + '.hist')
    if not os.path.exists(histogram_filepath):
        printLog(outlogfile, '> an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True)
    ntcard_res_filepath = os.path.join(outdirpath, 'ntcard' + str(klen) + '.txt')
    os.rename(histogram_filepath, ntcard_res_filepath)
    with open(ntcard_res_filepath) as ntcard_res:
        var_F0, var_f1 = _parseNtcardHistogram(ntcard_res)
    if var_F0 is None or var_f1 is None:
        printLog(outlogfile, '> an error has occurred while estimating the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True)
    bfsize = var_F0 - var_f1
    printLog(outlogfile, '> estimated Bloom Filter size: ' + str(bfsize))
    return bfsize


def _makeBloomFilter(tooldirpath, filepath, filename, fileformat, compression,
                     outdirpath, klen, minabundance, bfsize, rm_compressed, rm_source):
    """Run makebf.sh for one experiment and return its exit code.

    The positional arguments follow the order expected by makebf.sh;
    rm_compressed and rm_source are the 0/1 clean-up flags of the script.
    """
    command = ['sh', os.path.join(tooldirpath, 'makebf.sh'), filepath, filename,
               fileformat, compression, outdirpath, str(klen), str(minabundance),
               str(bfsize), str(rm_compressed), str(rm_source)]
    print(' '.join(command))
    return _runCommand(command)


# format = { fasta, fastq, accession }
# this version skip the quality control procedure
def createSBT(options, args):
    """Build a Sequence Bloom Tree from the parsed command line *options*.

    The function always terminates the process: with OK_EXIT_CODE when the
    tree ('howde.txt') has been built in the output directory, with
    ERR_EXIT_CODE otherwise. Progress and errors are reported through
    printLog() on options.outfile. *args* is not used.
    """
    outlogfile = str(options.outfile)
    # The tool directory is resolved BEFORE changing the working directory,
    # otherwise a relative --tooldir (the default is './') would be looked
    # up inside the output directory and the helper scripts never found.
    tooldirpath = os.path.abspath(str(options.tooldir))
    outdirpath = os.path.abspath(str(options.outdir))
    if not os.path.exists(outdirpath):
        os.mkdir(outdirpath)
    os.chdir(outdirpath)

    formats = _splitOption(options.formats)
    filepaths = _splitOption(options.filepaths)
    filenames = _splitOption(options.filenames)
    compressed = [int(flag) == 1 for flag in _splitOption(options.compressed)]
    minabundances = [int(minab) for minab in _splitOption(options.minabundances)]
    # parsed for validation only: the quality control procedure is skipped
    qualitythresholds = [float(qthres) for qthres in _splitOption(options.qualitythresholds)]

    klen = int(options.klen)
    bfsize = int(options.bfsize)

    per_dataset_options = (filepaths, filenames, compressed, minabundances, qualitythresholds)
    if any(len(values) != len(formats) for values in per_dataset_options):
        printLog(outlogfile, 'Something went wrong with the input parameters', exitcode=ERR_EXIT_CODE, exit=True)

    printLog(outlogfile, 'Retrieving experiments')
    accessions = downloadAccessions(formats, filepaths, outlogfile, outdirpath)
    printLog(outlogfile, '> ' + str(len(accessions)) + ' experiments retrieved from the Sequence Read Archive')
    print(str(list(accessions)))

    if bfsize < 0:  # estimate bloom filter size
        data_paths = [accessions[accession]['filepath'] for accession in accessions
                      if 'filepath' in accessions[accession]]
        print(' '.join(data_paths))
        # One list entry per file: paths of consecutive datasets can no
        # longer be glued together into a single bogus path.
        for dataset_idx, dataset_format in enumerate(formats):
            if dataset_format != 'accessions':
                data_paths.extend(path for path in filepaths[dataset_idx].split(',') if path.strip())
        bfsize = _estimateBloomFilterSize(data_paths, klen, outlogfile, outdirpath)

    if bfsize <= 0:
        printLog(outlogfile, '> ERROR: the Bloom Filter size is ' + str(bfsize), exitcode=ERR_EXIT_CODE, exit=True)

    for dataset_idx, dataset_format in enumerate(formats):
        if dataset_format == 'accessions':
            with open(filepaths[dataset_idx]) as accessions_file:
                for line in accessions_file:
                    accession = line.split('\t')[0].strip().upper()
                    if accession not in accessions:
                        continue
                    curr_format = accessions[accession]['format']
                    curr_filepath = accessions[accession]['filepath']
                    curr_filename = accessions[accession]['filename']
                    printLog(outlogfile, 'Processing "' + accession + '" ( format="' + curr_format
                             + '", compressed="' + str(False) + '", fixed_name="' + curr_filename + '" )')
                    # downloaded FASTA files are removed by makebf.sh once used
                    makebf_exitcode = _makeBloomFilter(tooldirpath, curr_filepath, curr_filename, curr_format,
                                                       'uncompress', outdirpath, klen,
                                                       minabundances[dataset_idx], bfsize, 1, 1)
                    if makebf_exitcode != 0:
                        printLog(outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( "' + accession + '" will be excluded )')
                    else:
                        printLog(outlogfile, '> Bloom Filter file: "' + curr_filename + '.bf"')
        else:
            curr_format = '.' + dataset_format.lower()
            is_compressed = compressed[dataset_idx]
            curr_compressed = '.gz' if is_compressed else 'uncompress'
            curr_filepaths = filepaths[dataset_idx].split(',')
            curr_filenames = filenames[dataset_idx].split(',')
            # iterate over the files of THIS dataset, each with its own path
            for curr_filepath, curr_filename in zip(curr_filepaths, curr_filenames):
                curr_filename_fixed = _sanitizeName(curr_filename)
                printLog(outlogfile, 'Processing "' + curr_filename + '" ( format="' + curr_format
                         + '", compressed="' + str(is_compressed) + '", fixed_name="' + curr_filename_fixed + '" )')
                # user datasets are never removed; only the temporary
                # decompressed copy of a compressed input is
                rm_source = 1 if is_compressed else 0
                makebf_exitcode = _makeBloomFilter(tooldirpath, curr_filepath, curr_filename_fixed, curr_format,
                                                   curr_compressed, outdirpath, klen,
                                                   minabundances[dataset_idx], bfsize, 0, rm_source)
                if makebf_exitcode != 0:
                    printLog(outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( "' + curr_filename + '" will be excluded )')
                else:
                    printLog(outlogfile, '> Bloom Filter file: "' + curr_filename_fixed + '.bf"')

    # Create a tree topology
    printLog(outlogfile, 'Creating a tree topology file')
    bf_counter = len(glob.glob(os.path.join(outdirpath, '*.bf')))
    if bf_counter == 0:
        printLog(outlogfile, '> no Bloom Filter files found', exitcode=ERR_EXIT_CODE, exit=True)
    cluster_exitcode = _runCommand(['sh', os.path.join(tooldirpath, 'cluster.sh'), outdirpath, str(bfsize)])
    if cluster_exitcode != 0:
        printLog(outlogfile, '> [exitcode: ' + str(cluster_exitcode) + '] an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True)
    if not os.path.exists(os.path.join(outdirpath, 'leafnames.txt')):
        printLog(outlogfile, '> an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True)

    # Build the HowDeSBT nodes
    printLog(outlogfile, 'Building the Bloom Filter files for the tree')
    build_exitcode = _runCommand(['sh', os.path.join(tooldirpath, 'build.sh'), outdirpath])
    if build_exitcode != 0:
        printLog(outlogfile, '> [exitcode: ' + str(build_exitcode) + '] an error has occurred during the creation of the Bloom Filter files for the tree', exitcode=ERR_EXIT_CODE, exit=True)
    printLog(outlogfile, '> the tree has been successfully built: "howde.txt"', exitcode=OK_EXIT_CODE, exit=True)


def __main__():
    """Parse the command line and either print the version or build the tree."""
    # Parse the command line options
    usage = ("Usage: create.py --formats file_formats --filepaths file_paths --filenames file_names "
             "--compressed file_compressed --minabundances min_abundances --qualitythresholds quality_thresholds "
             "--klen kmer_len --bfsize bloom_filter_size --outfile out_log_file_path --outdir out_dir_path "
             "--tooldir tool_dir_path")
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-v", "--version", action="store_true", dest="version",
                      default=False, help="display version and exit")
    parser.add_option("-f", "--formats", type="string",
                      action="store", dest="formats", help="list of file formats separated by a pipe character (|)")
    parser.add_option("-p", "--filepaths", type="string",
                      action="store", dest="filepaths", help="list of input file paths separated by a pipe character (|)")
    parser.add_option("-n", "--filenames", type="string",
                      action="store", dest="filenames", help="list of input file names separated by a pipe character (|)")
    parser.add_option("-c", "--compressed", type="string",
                      action="store", dest="compressed", help="list of compressed flags related to the input files separated by a pipe character (|)")
    parser.add_option("-m", "--minabundances", type="string",
                      action="store", dest="minabundances", help="list of bloom filter minimum abundances related to the input files separated by a pipe character (|)")
    parser.add_option("-q", "--qualitythresholds", type="string",
                      action="store", dest="qualitythresholds", help="list of quality thresholds related to the input files separated by a pipe character (|)")
    parser.add_option("-k", "--klen", type="int", default=21,
                      action="store", dest="klen", help="k-mer length")
    parser.add_option("-b", "--bfsize", type="int", default=-1,
                      action="store", dest="bfsize", help="bloom filter size")
    parser.add_option("-o", "--outfile", type="string", default="sbtres.txt",
                      action="store", dest="outfile", help="output log file path")
    # NOTE(review): this default equals the default log file name; it is kept
    # unchanged for backward compatibility -- confirm whether it is intended.
    parser.add_option("-d", "--outdir", type="string", default="sbtres.txt",
                      action="store", dest="outdir", help="output directory path")
    parser.add_option("-t", "--tooldir", type="string", default="./",
                      action="store", dest="tooldir", help="tool directory path")

    (options, args) = parser.parse_args()
    if options.version:
        print(__version__)
    else:
        createSBT(options, args)


if __name__ == "__main__":
    __main__()
<?xml version="1.0"?>
<!--
    create.xml (btman-1.0.0/create.xml)
    Galaxy wrapper of create.py: every "experiments" repeat block is flattened
    into '|' separated lists (one entry per block, ',' separated files inside
    an entry), which are passed to create.py together with the k-mer length
    and the Bloom filter size (-1 = let create.py estimate it).
-->
<tool name="BloomTree Manager - Create" id="btman_create" version="1.0.0">
    <description>a Sequence Bloom Tree</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code">
<![CDATA[
    python '$__tool_directory__/create.py'

    #set formats = ''
    #set filepaths = ''
    #set filenames = ''
    #set compressed = ''
    #set minab = ''
    #set qthres = ''
    #for $i, $exp in enumerate( $experiments ):
        #set formats += str( $exp.conditional_format.format ) + '|'
        #if $exp.conditional_format.format == 'accessions':
            #set filepaths += str( $exp.conditional_format.accession_numbers ) + '|'
            #set filenames += str( $exp.conditional_format.accession_numbers.name ) + '|'
            #set compressed += '0|'
        #else:
            #if $exp.conditional_format.format == 'fasta':
                #set compressed += str( $exp.conditional_format.conditional_fasta_compressed.fasta_compressed ) + '|'
                #if $exp.conditional_format.conditional_fasta_compressed.fasta_compressed == 0:
                    #set filepaths += ','.join( [ str( $f ) for $f in $exp.conditional_format.conditional_fasta_compressed.fastafiles ] ) + '|'
                    #set filenames += ','.join( [ str( $f.name ) for $f in $exp.conditional_format.conditional_fasta_compressed.fastafiles ] ) + '|'
                #else:
                    #set filepaths += ','.join( [ str( $f ) for $f in $exp.conditional_format.conditional_fasta_compressed.fastagzfiles ] ) + '|'
                    #set filenames += ','.join( [ str( $f.name ) for $f in $exp.conditional_format.conditional_fasta_compressed.fastagzfiles ] ) + '|'
                #end if
            #elif $exp.conditional_format.format == 'fastq':
                #set compressed += str( $exp.conditional_format.conditional_fastq_compressed.fastq_compressed ) + '|'
                #if $exp.conditional_format.conditional_fastq_compressed.fastq_compressed == 0:
                    #set filepaths += ','.join( [ str( $f ) for $f in $exp.conditional_format.conditional_fastq_compressed.fastqfiles ] ) + '|'
                    #set filenames += ','.join( [ str( $f.name ) for $f in $exp.conditional_format.conditional_fastq_compressed.fastqfiles ] ) + '|'
                #else:
                    #set filepaths += ','.join( [ str( $f ) for $f in $exp.conditional_format.conditional_fastq_compressed.fastqgzfiles ] ) + '|'
                    #set filenames += ','.join( [ str( $f.name ) for $f in $exp.conditional_format.conditional_fastq_compressed.fastqgzfiles ] ) + '|'
                #end if
            #end if
        #end if
        #set minab += str( $exp.min_abundance ) + '|'
        #if $exp.conditional_quality.quality_control == '1':
            #set qthres += str( $exp.conditional_quality.quality_threshold ) + '|'
        #else:
            #set qthres += '-1.0|'
        #end if
    #end for
    #set klen = $kmer_len
    #set bfsize = -1
    #if $bloomsize_condition.bloomsize_control == '0':
        #set bfsize = $bloomsize_condition.bloom_filter_size
    #end if

    --formats '${formats}'
    --filepaths '${filepaths}'
    --filenames '${filenames}'
    --compressed '${compressed}'
    --minabundances '${minab}'
    --qualitythresholds '${qthres}'

    --klen ${klen}
    --bfsize ${bfsize}

    --outfile '${resulttxt}'
    --outdir 'sbt'
    --tooldir '$__tool_directory__'
]]>
    </command>
    <inputs>
        <repeat name="experiments" title="Select a list of experiments" help="Select a set of experiments on which the Sequence Bloom Tree will be built." min="1">
            <conditional name="conditional_format">
                <param name="format" type="select" label="Select the experiment format" help="FASTA and FASTQ are the supported formats">
                    <option value="fasta">FASTA Experiments</option>
                    <option value="fastq">FASTQ Experiments</option>
                    <option value="accessions">SRA Accession Numbers</option>
                </param>
                <when value="fasta">
                    <conditional name="conditional_fasta_compressed">
                        <param name="fasta_compressed" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Are your experiments compressed?" />
                        <when value="0">
                            <param format="fasta" name="fastafiles" multiple="true" type="data" label="Select one or more FASTA experiments" />
                        </when>
                        <when value="1">
                            <!-- NOTE(review): confirm that "fastagz" is a datatype known to the target Galaxy instance -->
                            <param format="fastagz" name="fastagzfiles" multiple="true" type="data" label="Select one or more FASTA .gz experiments" />
                        </when>
                    </conditional>
                </when>
                <when value="fastq">
                    <conditional name="conditional_fastq_compressed">
                        <param name="fastq_compressed" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Are your experiments compressed?" />
                        <when value="0">
                            <param format="fastq" name="fastqfiles" multiple="true" type="data" label="Select one or more FASTQ experiments" />
                        </when>
                        <when value="1">
                            <!-- NOTE(review): confirm that "fastqgz" is a datatype known to the target Galaxy instance -->
                            <param format="fastqgz" name="fastqgzfiles" multiple="true" type="data" label="Select one or more FASTQ .gz experiments" />
                        </when>
                    </conditional>
                </when>
                <when value="accessions">
                    <param name="accession_numbers" type="data" format="tabular" label="Select a list of SRA Accession Numbers" help="Select a tabular file with a list of accession numbers in the first column." />
                </when>
            </conditional>

            <param name="min_abundance" type="integer" value="2" min="0" label="Insert a Bloom filter minimum abundance" help="This value is the minimum abundance cutoff for the creation of the Bloom filter." />

            <!-- the threshold is forwarded to create.py, which currently skips the quality control procedure -->
            <conditional name="conditional_quality">
                <param name="quality_control" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Apply a quality control procedure" />
                <when value="1">
                    <param name="quality_threshold" size="1" type="float" value="0.8" min="0.0" max="1.0" label="Quality threshold" help="If the number of sequences flagged as poor quality on the total number of sequences in a file is less than this threshold, the whole experiment will be excluded." />
                </when>
            </conditional>
        </repeat>

        <!-- a k-mer length of 0 is meaningless, so the minimum is 1 -->
        <param name="kmer_len" type="integer" value="21" min="1" label="K-mer length" />

        <conditional name="bloomsize_condition">
            <param name="bloomsize_control" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Automatically estimate the Bloom filter size" />
            <when value="0">
                <param name="bloom_filter_size" size="1" type="integer" value="1" min="1" label="Bloom Filter size" help="Disable this field to let the tool estimate an appropriate Bloom filter size." />
            </when>
        </conditional>
    </inputs>
    <outputs>
        <collection name="list_output" type="list" label="${tool.name} SBT Collection">
            <!-- "<" and ">" must be escaped inside an XML attribute value -->
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*(?=\.)).(?P&lt;ext&gt;[^\.]*$)" ext="auto" directory="sbt" />
        </collection>
        <data format="txt" name="resulttxt" label="${tool.name} SBT: Result" from_work_dir="sbtres.txt" />
    </outputs>

    <help><![CDATA[
This tool allows creating Sequence Bloom Trees starting from a set of FASTA or FASTQ files.
It also allows controlling the quality of the input dataset and excluding the files that do not reach a specified quality level.

-----

**Input file**

The input of this tool is a set of FASTA or FASTQ experiments, additionally to a set of SRA accession numbers.
For each of the selected experiments, the minimum abundance for the corresponding Bloom filter is required.
Additionally, a quality control procedure could be applied to guarantee that the quality of every experiment always exceeds a
specified threshold. Otherwise, experiments with low quality level will be discarded.

The k-mer length must also be specified, additionally to the Bloom filter size. This last field is optional and it will be
automatically estimated if not provided.

-----

**Output**

This tool returns a collection containing the Sequence Bloom Tree nodes and a file representing the organization of the tree.

Take a look at the Query tool documentation for a detailed description about how
to query a Sequence Bloom Tree.

-----

.. class:: infomark

**Notes**

This Galaxy tool has been developed by Fabio Cumbo.

Please visit this GitHub_repository_ for more information about the BloomTree Manager

.. _GitHub_repository: https://github.com/fabio-cumbo/bloomtree-manager
    ]]></help>

    <expand macro="citations" />
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/btman-1.0.0/dataset.tsv Mon Mar 04 08:30:03 2019 -0500 @@ -0,0 +1,11 @@ +SRR833714 +SRR833713 +SRR833715 +SRR567161 +SRR567146 +SRR191393 +SRR191449 +SRR191448 +SRR191447 +SRR191446 +SRR191445
<!--
    macros.xml (btman-1.0.0/macros.xml)
    Macros imported by the tool wrappers through <import>macros.xml</import>
    and used with <expand macro="requirements" /> / <expand macro="citations" />.
-->
<macros>
    <!-- packages required by the tools: the Python interpreter plus the
         sra-tools, ntcard and howdesbt packages -->
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="2.7.10">python</requirement>
            <requirement type="package" version="2.9.1">sra-tools</requirement>
            <requirement type="package" version="1.0.0">ntcard</requirement>
            <requirement type="package" version="1.00.00">howdesbt</requirement>
        </requirements>
    </xml>

    <!-- citation shared by the tools, identified by its DOI -->
    <xml name="citations">
        <citations>
            <citation type="doi">10.1101/090464</citation>
        </citations>
    </xml>
</macros>
#!/bin/bash
# makebf.sh (btman-1.0.0/makebf.sh)
#
# Build the HowDeSBT Bloom filter "<expName>.bf" of a single experiment.
#
# Usage: makebf.sh expPath expName expFormat expCompress outExpDir \
#                  klen minab bfsize rmCompressed rmSource
#   expPath       path of the input experiment
#   expName       name (without extension) of the Bloom filter to create
#   expFormat     extension (e.g. ".fasta") given to the decompressed copy
#   expCompress   ".gz" when the input must be decompressed first
#   outExpDir     directory in which the Bloom filter is created
#   klen          k-mer length
#   minab         minimum abundance
#   bfsize        Bloom filter size (bits)
#   rmCompressed  1 to remove the compressed input afterwards
#   rmSource      1 to remove the (decompressed) source afterwards
#
# The script is started with "sh" by create.py, so only POSIX test syntax is
# used. The exit status is the one of the failing step (gzip or howdesbt),
# or 0 on success, so that the caller can detect a failed Bloom filter.

expPath=$1
expName=$2
expFormat=$3
expCompress=$4

outExpDir=$5

klen=$6
minab=$7
bfsize=$8

rmCompressed=$9
rmSource=${10}

cd "${outExpDir}" || exit 1

# "=" is the portable string comparison; "==" is rejected by POSIX shells
# such as dash.
if [ "${expCompress}" = ".gz" ]; then
    gzip -dc "${expPath}" > "${expName}${expFormat}"
    status=$?
    if [ "${status}" -eq 0 ]; then
        howdesbt makebf K="${klen}" --min="${minab}" --bits="${bfsize}" "${expName}${expFormat}" --out="${expName}.bf"
        status=$?
    fi
    if [ "${rmCompressed}" -eq "1" ]; then
        rm "${expPath}"
    fi
    if [ "${rmSource}" -eq "1" ]; then
        rm "${expName}${expFormat}"
    fi
else
    howdesbt makebf K="${klen}" --min="${minab}" --bits="${bfsize}" "${expPath}" --out="${expName}.bf"
    status=$?
    if [ "${rmSource}" -eq "1" ]; then
        rm "${expPath}"
    fi
fi

# Report the result of the Bloom filter creation, not of the clean-up.
exit "${status}"
#!/usr/bin/env python
"""Query a Sequence Bloom Tree with HowDeSBT.

Source file: btman-1.0.0/query.py (BloomTree Manager).

The tree is received as two parallel, comma separated lists (paths and names
of the files of a collection). The query files are tab separated
(ID, sequence) tables that are merged into a single FASTA batch
('queries.fa'); the answer of ``howdesbt query`` is then split into one
result file per query inside the output directory.
"""

import sys
import os
import optparse
import shutil
import subprocess

__version__ = "1.0.0"
VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
# in the case of collections, exitcodes equal to 0 and 1 are not considered errors
ERR_EXIT_CODE = 2
OK_EXIT_CODE = 0


def printLog(logfilepath, message, exitcode=OK_EXIT_CODE, exit=False):
    """Print *message*, append it to *logfilepath* and optionally exit.

    When *exit* is true the process terminates through sys.exit(exitcode)
    right after the message has been written.
    """
    print(message)
    with open(logfilepath, 'a+') as out:
        out.write(message + '\n')
    if exit:
        sys.exit(exitcode)


def _sanitizeName(name):
    """Return *name* restricted to the characters listed in VALID_CHARS."""
    return ''.join(c for c in name if c in VALID_CHARS)


def _runCommand(arguments):
    """Run an external command without a shell and return its exit code.

    A command that cannot be started is reported with the conventional
    "command not found" code (127) instead of raising.
    """
    try:
        return subprocess.call(arguments)
    except OSError:
        return 127


def _writeQueryBatch(options, batch_file_name, outlogfile):
    """Merge the query files of *options* into the FASTA file *batch_file_name*.

    Only the lines made of exactly two tab separated fields (ID, sequence)
    are used; file names and sequence IDs are restricted to VALID_CHARS.
    """
    with open(batch_file_name, 'w') as batch_file:
        comma_sep_file_paths = options.files
        # check if options.files contains at least one file path
        if comma_sep_file_paths is None:
            return
        file_paths = comma_sep_file_paths.split(",")
        file_names = options.names.split(",")
        for idx, file_path in enumerate(file_paths):
            fixed_file_name = _sanitizeName(file_names[idx])
            printLog(outlogfile, '> processing file ' + file_names[idx] + ' ( fixed_name="' + fixed_file_name + '" ) ')
            with open(file_path, 'r') as content_file:
                for line in content_file:
                    line_split = line.strip().split("\t")  # split on tab
                    if len(line_split) != 2:  # 0:id , 1:seq , otherwise skip line
                        continue
                    original_seq_id = line_split[0]
                    # fix seq_id using valid chars only
                    seq_id = _sanitizeName(original_seq_id)
                    printLog(outlogfile, '> sequence ' + original_seq_id + ' ( fixed_name="' + seq_id + '" )')
                    seq_text = line_split[1]
                    # write on batch file
                    batch_file.write('> ' + fixed_file_name + '_' + seq_id + '\n' + seq_text + '\n')


def _splitQueryResults(query_res_file_path, output_dir_path, leafnames_counter):
    """Split the answer of ``howdesbt query`` into one file per query.

    A line starting with '*' opens the result file '<query name>_txt' of a
    query and carries its number of matches; every following line is
    appended to that file together with the 'matches/leaves' fraction and
    its value rounded to six decimals.
    """
    file_path = ''
    theta_matches = 0
    with open(query_res_file_path) as query_res_file:
        for line in query_res_file:
            line = line.strip()
            if not line:
                continue
            if line.startswith('*'):
                line_split = line.split(' ')
                theta_matches = int(line_split[1])
                file_name = line_split[0].replace('*', '')
                file_path = os.path.join(output_dir_path, file_name + '_txt')
                # a query without matches still gets its (empty) result file
                open(file_path, 'a').close()
            elif file_path:
                # lines found before the first '*' header belong to no query
                fraction = str(theta_matches) + '/' + str(leafnames_counter)
                score = format(round(float(theta_matches) / float(leafnames_counter), 6), '.6f')
                with open(file_path, 'a+') as res_file:
                    res_file.write(line + '\t' + fraction + '\t' + score + '\n')


def querySBT(options, args):
    """Query the tree described by *options* with the selected query files.

    'howde.txt', the node files, 'queries.fa' and 'answer.txt' are created
    in the current working directory, the per-query results in
    options.outputdir (which must already exist). Errors are logged on
    options.outfile and terminate the process with ERR_EXIT_CODE.
    *args* is not used.
    """
    output_dir_path = options.outputdir
    outlogfile = options.outfile

    # a missing option simply yields a collection without a valid tree
    tree_file_paths = (options.treep or '').split(',')
    tree_file_names = (options.treen or '').split(',')
    tree_def_filepath = None
    leafnames_filepath = None
    for tree_file_name, tree_file_path in zip(tree_file_names, tree_file_paths):
        if tree_file_name == 'howde':
            tree_def_filepath = tree_file_path
        elif tree_file_name == 'leafnames':
            leafnames_filepath = tree_file_path
        if tree_def_filepath is not None and leafnames_filepath is not None:
            break

    if tree_def_filepath is None or leafnames_filepath is None:
        printLog(outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True)

    leafnames_counter = 0
    with open(leafnames_filepath) as leafnames_file:
        for line in leafnames_file:
            if line.strip():
                leafnames_counter += 1
    if leafnames_counter == 0:
        printLog(outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True)

    printLog(outlogfile, 'The selected collection contains a valid tree')
    shutil.copyfile(tree_def_filepath, 'howde.txt')
    tree_def_filepath = 'howde.txt'
    # the node files are copied next to the tree definition with the '.bf'
    # extension appended to their collection name
    for tree_file_name, tree_file_path in zip(tree_file_names, tree_file_paths):
        if tree_file_name.endswith('detbrief.rrr'):
            shutil.copyfile(tree_file_path, tree_file_name + '.bf')

    printLog(outlogfile, 'Creating batch of queries')
    # create tmp batch file
    batch_file_name = 'queries.fa'
    _writeQueryBatch(options, batch_file_name, outlogfile)

    # query the tree
    printLog(outlogfile, 'Querying the tree')
    query_res_file_path = os.path.abspath('answer.txt')
    command = ['howdesbt', 'query',
               '--tree=' + os.path.abspath(tree_def_filepath),
               os.path.abspath(batch_file_name) + '=' + str(options.threshold),
               '--out=' + query_res_file_path]
    if options.sort != 0:
        command.append('--sort')
    query_exitcode = _runCommand(command)
    if query_exitcode != 0:
        printLog(outlogfile, '> ERROR: [exitcode: ' + str(query_exitcode) + '] an error has occurred while querying the tree', exitcode=ERR_EXIT_CODE, exit=True)
    if not os.path.exists(query_res_file_path):
        printLog(outlogfile, 'An error has occurred while querying the tree', exitcode=ERR_EXIT_CODE, exit=True)
    _splitQueryResults(query_res_file_path, output_dir_path, leafnames_counter)


def __main__():
    """Parse the command line and either print the version or run the query."""
    # Parse the command line options
    usage = ("Usage: query.py --files comma_sep_file_paths --names comma_sep_file_names "
             "--treep comma_sep_tree_file_paths --treen comma_sep_tree_file_names "
             "--threshold threshold --sort sort_flag --outputdir output_dir_path --outfile out_log_file_path")
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-v", "--version", action="store_true", dest="version",
                      default=False, help="display version and exit")
    parser.add_option("-f", "--files", type="string",
                      action="store", dest="files", help="comma separated files path")
    parser.add_option("-n", "--names", type="string",
                      action="store", dest="names", help="comma separated names associated to the files specified in --files")
    parser.add_option("-k", "--treep", type="string",
                      action="store", dest="treep", help="paths of files in collection")
    parser.add_option("-m", "--treen", type="string",
                      action="store", dest="treen", help="names of files in collection")
    parser.add_option("-t", "--threshold", type="float", default=0.7,
                      action="store", dest="threshold", help="search threshold")
    parser.add_option("-s", "--sort", type="int", default=1,
                      action="store", dest="sort", help="sort results")
    parser.add_option("-o", "--outputdir", type="string", default="output",
                      action="store", dest="outputdir", help="output directory (collection) path")
    parser.add_option("-r", "--outfile", type="string", default="query.txt",
                      action="store", dest="outfile", help="output log file path")

    (options, args) = parser.parse_args()
    if options.version:
        print(__version__)
    else:
        # create output dir (collection)
        output_dir_path = options.outputdir
        if not os.path.exists(output_dir_path):
            os.makedirs(output_dir_path)

        querySBT(options, args)


if __name__ == "__main__":
    __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/btman-1.0.0/query.tsv Mon Mar 04 08:30:03 2019 -0500 @@ -0,0 +1,3 @@ +0 CCAACCAAAGGGAAAACTTTTTTCCGACTTTGGCCTAAAGGGTTTAACGGCCAAGTCAGAAGGGAAAAAGTTGCGCCAAAAATGGCGTTAAAATGTGTAATCAGAGAAGCGACACGAAAAGGGGATCAGCTCTTGGCTGGCAATTGGTAGGTCAGAGGTGGATTGGGAAAAGGCAAGTCAGCAACTGTCGATGACGGCGACTGACTGTTAATGAAAATTGTTTTGGCTGTGTGGAAAAAAATACGCGGGAATCCGTGAATTTTCCGAGGAGCTGGTGGAGCGAAGAAAACGGGGTGCTGCTGTTGTAAATGATTGGTGAAAGTCACACGCCCGCAGCCTTGCCAAACTAATTAACGCCAAATGGAGCTAAGGCCTTTGAATGATGGCTGCAGGCTAGCTTATGAAAAGGGGTTGAAGAGAAGTGGAAAAATTGGTAGAAAGGGATTTGCTCAAGATGCC +1 TTAATGACAGGGCCACATGATGTGAAAAAAAATCAGAAACCGAGTCAACGTGAGAAGATAGTACGTACTACCGCAAATGAATGGCCATTTCATTTGCATGTTGGGAGCAACAGAAATGAGAGAGCATCCGAAGCTAACCACAAAAATGGACTTTGCTTCATTATGCACAAACACGCCAATAAATGTAACGAGAAAGATAGTAGGAGCGAAAGACGAGACGAGACAAACAGGAAGAAGACGAGTGGACGAGTGTTTTTTGTAACGAAACTCTTAATCGCTCCTTTGCAGGCTTAAGCTGATAGTTGCTACGTTTATGCCATGAATTTCAAGATCTCTCAAATGCGTGAAAATCCAGTTTATGCGACAGACAAATTCATGTATTTGAAAAATCTTAGCTGATAGAAATCAAAGGTGATT +2 CAATTAATGATAAATATTTTATAAGGTGCGGAAATAAAGTGAGGAATATCTTTTAAATTCAAGTTCAATTCTGAAAGC \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/btman-1.0.0/query.xml Mon Mar 04 08:30:03 2019 -0500 @@ -0,0 +1,83 @@ +<?xml version="1.0"?> +<tool name="BloomTree Manager - Query" id="btman_query" version="1.0.0"> + <description>a Sequence Bloom Tree</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"> +<![CDATA[ + python '$__tool_directory__/query.py' + + #set coll_paths = ','.join( [ str( $f ) for $f in $treecollection ] ) + #if $coll_paths != 'None': + --treep '${coll_paths}' + #set coll_names = ','.join( [ str( $f.name ) for $f in $treecollection ] ) + --treen '${coll_names}' + #end if + + #set file_paths = ','.join( [ str( $f ) for $f in $txtfiles ] ) + #if $file_paths != 'None': + --files '${file_paths}' + #set file_names = ','.join( [ str( $f.name ) for $f in $txtfiles ] ) + --names '${file_names}' + #end if + + --threshold ${threshold} + --sort ${sort} + + --outputdir 'answer_collection' + --outfile '${resulttxt}' +]]> + </command> + <inputs> + <param name="treecollection" type="data_collection" collection_type="list" label="Select a Sequence Bloom Tree" help="Select a collection generated by the Create tool of the BloomTree Manager suite." /> + <param format="tabular" name="txtfiles" type="data" label="Select query files" multiple="true" optional="false" help="Select one or more tabular files containing (ID, TRANSCRIPT) couples for each line. The content of these files will be merged and the result will represent a query to the Sequence Bloom Tree that will return a collection containing a file for each ID. The content of these files as result of the tool will be a list of accession numbers." /> + <param name="threshold" size="1" type="float" value="0.7" min="0.0" max="1.0" label="Search threshold" help="Fraction of query kmers that must be present in a leaf to be considered a match." 
/> + <param name="sort" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Sort matched leaves by the number of query kmers present, and report the number of kmers present (if not selected we just report the matched leaves without regard to which matches are better)." /> + </inputs> + <outputs> + <collection name="output_collect" type="list:list" label="BloomTree Manager - Query result collection"> + <discover_datasets pattern="(?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)_(?P<ext>[^_]+)" directory="answer_collection" ext="auto" /> + </collection> + <data format="txt" name="resulttxt" label="${tool.name} SBT: Result" from_work_dir="query.txt" /> + </outputs> + + <help><![CDATA[ +This tool is part of the BloomTree Manager Framework that allows users to rapidly identify all +sequenced samples which express a transcript of interest. + +---- + +The input for this tool is a list of (ID, TRANSCRIPT) couples, one for each line, +in a tab delimited format:: + + id0 CCAACCAAAGGGAAAACTTTTTTCCGACTTTGGCCTAAAGGGTTTAACGGCCAAGTCAGAAGGGAAAAAGTTGCGCCA + id1 TTAATGACAGGGCCACATGATGTGAAAAAAAATCAGAAACCGAGTCAACGTGAGAAGATAGTACGTACTACCGCAAAT + ... + idn CAATTAATGATAAATATTTTATAAGGTGCGGAAATAAAGTGAGGAATATCTTTTAAATTCAAGTTCAATTCTGAAAGC + +The ID can contain alphanumeric characters in addition to spaces, dots, dashes, and round and square brackets. +Any additional character will be trimmed out. + +A Sequence Bloom Tree must also be selected. It is a collection generated by the Create tool of the +BloomTree Manager Suite. + +The output of the tool is a list of collections in which every collection contains a file for each ID with a list of +accession numbers representing the samples that express one particular transcript. + +---- + +.. class:: infomark + +**Notes** + +This Galaxy tool has been developed by Fabio Cumbo. + +Please visit this GitHub_repository_ for more information about the BloomTree Manager + +.. 
_GitHub_repository: https://github.com/fabio-cumbo/bloomtree-manager + ]]></help> + + <expand macro="citations" /> +</tool>
--- a/create.py Fri Jan 18 10:12:40 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,132 +0,0 @@ -#!/usr/bin/env python - -# https://github.com/ross/requests-futures -# http://docs.python-requests.org/en/master/user/quickstart/#more-complicated-post-requests - -import sys, os, uuid, optparse, requests, json, time -#from requests_futures.sessions import FuturesSession - -#### NN14 #### -SERVICE_URL = "http://nn14.galaxyproject.org:8080/"; -#service_url = "http://127.0.0.1:8082/"; -CREATE_URL = SERVICE_URL+"tree/create"; -STATUS_URL = SERVICE_URL+"status/<query_id>"; -############## -# query delay in seconds -QUERY_DELAY = 30; -############## - -__version__ = "1.0.0"; -ERR_EXIT_CODE = 1; -OK_EXIT_CODE = 0; - -def raiseException( exitcode, message, errorfilepath ): - with open(errorfilepath, 'w') as out: - out.write(message); - sys.exit(exitcode); - -def create_request( options, args, data ): - outfilepath = options.outfile; - cluster_id_2_query_id = { }; - - for cluster_id in data: - payload = { }; - payload["accessions"] = data[cluster_id]; - # add additional parameters to the payload - payload["qualitycontrol"] = int(options.qualitycontrol); - payload["qualitythreshold"] = float(options.qualitythreshold); - payload["klen"] = int(options.klen); - payload["minabundance"] = int(options.minabundance); - # set the content type to application/json - headers = {'Content-type': 'application/json'}; - # create a session - session = requests.Session(); - # make a synchronous post request to the create route - req = session.post(CREATE_URL, headers=headers, json=payload); - resp_code = req.status_code; - #print(str(req.content)+"\n\n"); - if resp_code == requests.codes.ok: - resp_content = str(req.content); - # convert out to json - json_content = json.loads(resp_content); - # retrieve query id - query_id = json_content['query_id']; - cluster_id_2_query_id[cluster_id] = query_id; - else: - with open(outfilepath, 'a+') as outfile: - outfile.write( "An error has 
occurred while submitting data to the /tree/create endpoint for the cluster " + cluster_id + "\n\n" ); - - build_flags = [ ] - while len(build_flags) < len(cluster_id_2_query_id): - for idx, cluster_id in enumerate( cluster_id_2_query_id ): - if cluster_id not in build_flags: - query_id = cluster_id_2_query_id[ cluster_id ]; - # create a new session - session = requests.Session(); - # make a synchronous get request to the status route - status_query_url = STATUS_URL.replace("<query_id>", query_id); - status_req = session.get(status_query_url); - status_resp_content = str(status_req.content); - #print(status_resp_content+"\n\n"); - # convert out to json - json_status_content = json.loads(status_resp_content); - # take a look at the state - # state attribute is always available - if json_status_content['state'] == 'SUCCESS': - build_flags.append( cluster_id ); - built_tree_id = json_status_content['results']['tree_id']; - with open(outfilepath, 'a+') as outfile: - outfile.write( "Query ID: " + str(query_id) + "\n" + "Query status: " + str(json_status_content['state']) + "\n" + "Cluster ID: " + cluster_id + "\n" + "Sequence Bloom Tree ID: " + built_tree_id + "\n\n" ); - elif json_status_content['state'] in ['FAILURE', 'REVOKED']: - build_flags.append( cluster_id ); - with open(outfilepath, 'a+') as outfile: - outfile.write( "Query ID: " + str(query_id) + "\n" + "Query status: " + str(json_status_content['state']) + "\n" + "Cluster ID: " + cluster_id + "\n\n" ); - if len(build_flags) < len(cluster_id_2_query_id): - time.sleep(QUERY_DELAY); # in seconds - return sys.exit(OK_EXIT_CODE); - -def create( options, args ): - multiple_data = {}; - experiment_list_file_path = options.explist; - with open(experiment_list_file_path) as explist: - for line in explist: - if line.strip() != "": - line_split = line.strip().split("\t"); # split on tab - if len(line_split) == 2: # 0:accession , 1:cluster_id , otherwise skip line - accession = line_split[0]; - cluster_id = 
line_split[1]; - if cluster_id in multiple_data: - multiple_data[cluster_id].append( accession ); - else: - multiple_data[cluster_id] = [ accession ]; - if len(multiple_data) > 0: - return create_request( options, args, multiple_data ); - else: - return raiseException( ERR_EXIT_CODE, "An error has occurred. Please be sure that your input file is valid.", options.outfile ); - -def __main__(): - # Parse the command line options - usage = "Usage: create.py --explist experiment_list --qualitycontrol quality_control --qualitythreshold quality_threshold --klen kmer_len --minabundance min_abundance --outfile output_file_path"; - parser = optparse.OptionParser(usage = usage); - parser.add_option("-v", "--version", action="store_true", dest="version", - default=False, help="display version and exit") - parser.add_option("-l", "--explist", type="string", - action="store", dest="explist", help="tabular file with a list of SRA accessions and their cluster label"); - parser.add_option("-q", "--qualitycontrol", type="int", default=0 - action="store", dest="qualitycontrol", help="flag to enable or disable the experiment quality control"); - parser.add_option("-t", "--qualitythreshold", type="float", default=0.0 - action="store", dest="qualitythreshold", help="quality threshold, if quality control is enabled only"); - parser.add_option("-k", "--klen", type="int", default=21, - action="store", dest="klen", help="k-mer length"); - parser.add_option("-m", "--minabundance", type="int", default=2, - action="store", dest="minabundance", help="minimum abundance"); - parser.add_option("-o", "--outfile", type="string", default="outfile_txt", - action="store", dest="outfile", help="output file path"); - - (options, args) = parser.parse_args(); - if options.version: - print __version__; - else: - return create( options, args ); - -if __name__ == "__main__": __main__()
--- a/create.xml Fri Jan 18 10:12:40 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ -<?xml version="1.0"?> -<tool name="BloomTree Manager - Create" id="btman_create" version="1.0.0"> - <description>a Sequence Bloom Tree</description> - <macros> - <import>macros.xml</import> - </macros> - <expand macro="requirements" /> - <command detect_errors="exit_code"> -<![CDATA[ - python '$__tool_directory__/create.py' - - --explist '${explist}' - - --qualitycontrol ${conditional_quality.quality_control} - #if $conditional_quality.quality_control == '0': - --qualitythreshold 0.0 - #elif $conditional_quality.quality_control == '1': - --qualitythreshold ${conditional_quality.quality_threshold} - #end if - - --klen ${kmer_len} - --minabundance ${min_abundance} - - --outfile '${outfile}' -]]> - </command> - <inputs> - <param format="tabular" name="explist" type="data" label="Select a file with the list of experiments" help="This should be a tabular file with two columns. Take a look at the tool documentation for a detailed explanation about the its content." /> - - <conditional name="conditional_quality"> - <param name="quality_control" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Apply a quality control procedure" /> - <when value="1"> - <param name="quality_threshold" size="1" type="float" value="0.8" min="0.0" max="1.0" label="Quality threshold" help="If the number of sequences flagged as poor quality on the total number of sequences in a file is less than this threshold, the sequence file will be excluded." /> - </when> - </conditional> - - <param name="kmer_len" type="integer" value="21" min="0" label="K-mer length" /> - <param name="min_abundance" type="integer" value="2" min="0" label="Bloom filter minimum abundance" help="This value is the minimum abundance cutoff for the creation of the Bloom filters. It is worth noting that the same minimum abundance is used for each Bloom filter." 
/> - </inputs> - <outputs> - <data format="txt" name="outfile" label="${tool.name} SBT: Result" from_work_dir="btman.create.txt" /> - </outputs> - - <help><![CDATA[ -This tool is part of the BloomTree Manager Framework that allow to create a Sequence Bloom Tree starting -with a set of FASTA or FASTQ files. It allows also to control the quality of the input dataset and -exclude the files that do not reach a specified quality level. - ------ - -**Input file** - -The input file for this tool must contain two columns with their values delimited by a tab. -The first column contains a list of SRA accessions, and the second column contains a unique identifier -for each set of SRA accessions. - -The input file is structured like the example below:: - - SRR805782 blood - SRR837459 blood - SRR837458 blood - SRR837453 blood - SRR837456 blood - ... - SRR791048 breast - SRR553483 breast - SRR553482 breast - SRR791045 breast - ... - SRR950876 brain - SRR786621 brain - -It is worth noting that for each cluster of accessions, every accession should be unique. -It is indeed possible to repeat an accession in multiple clusters. - -The tool will create a Sequence Bloom Tree for each cluster of accessions. - ------ - -**Output** - -The tool returns a single text file only. It contains the a tree identifier, one for -each cluster of accessions specified in the input file, that can be used with the -Query tool of the BloomTree Manager Suite to search for the presence of a set of -specific transcripts. - -Take a look at the Query tool documentation for a detailed description about how -to query a Sequence Bloom Tree. - ------ - -.. class:: infomark - -**Notes** - -This Galaxy tool has been developed by Fabio Cumbo. - -Please visit this GithHub_repository_ for more information about the BloomTree Manager - -.. _GithHub_repository: https://github.com/fabio-cumbo/bloomtree-manager - ]]></help> - - <expand macro="citations" /> -</tool>
--- a/macros.xml Fri Jan 18 10:12:40 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ -<macros> - <xml name="requirements"> - <requirements> - <requirement type="package" version="2.7.10">python</requirement> - <requirement type="package" version="2.18.4">requests</requirement> - </requirements> - </xml> - - <xml name="citations"> - <citations> - <citation type="doi">10.1101/090464</citation> - </citations> - </xml> -</macros> \ No newline at end of file
--- a/query.py Fri Jan 18 10:12:40 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,221 +0,0 @@ -#!/usr/bin/env python - -# https://github.com/ross/requests-futures -# http://docs.python-requests.org/en/master/user/quickstart/#more-complicated-post-requests - -import sys, os, uuid, optparse, requests, json, time -#from requests_futures.sessions import FuturesSession - -#### NN14 #### -SERVICE_URL = "http://nn14.galaxyproject.org:8080/"; -#service_url = "http://127.0.0.1:8082/"; -QUERY_URL = SERVICE_URL+"tree/<tree_id>/query"; -STATUS_URL = SERVICE_URL+"status/<query_id>"; -############## -# query delay in seconds -QUERY_DELAY = 30; -############## - -__version__ = "1.0.0"; -VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ' -# in the case of collections, exitcodes equal to 0 and 1 are not considered errors -ERR_EXIT_CODE = 2; -OK_EXIT_CODE = 0; - -def raiseException( exitcode, message, output_dir_path, errorfilename ): - errorfilepath = os.path.join(output_dir_path, errorfilename+"_txt"); - with open(errorfilepath, 'w') as out: - out.write(message); - sys.exit(exitcode); - -def query_request( options, args, payload ): - output_dir_path = options.outputdir; - # add additional parameters to the payload - #payload["tree_id"] = str(options.treeid); - payload["search_mode"] = str(options.search); - payload["exact_algorithm"] = int(options.exact); - payload["search_threshold"] = float(options.sthreshold); - payload["sort"] = int(options.sortcontrol); - # set the content type to application/json - headers = {'Content-type': 'application/json'}; - - # create a session - session = requests.Session(); - # make a synchronous post request to the query route - req = session.post(QUERY_URL.replace("<tree_id>", str(options.treeid)), headers=headers, json=payload); - resp_code = req.status_code; - #print(str(req.content)+"\n\n"); - if resp_code == requests.codes.ok: - resp_content = str(req.content); - # convert out to json - 
json_content = json.loads(resp_content); - # retrieve query id - query_id = json_content['query_id']; - query_processed = False; - # results json content - json_status_content = None; - while query_processed is False: - # create a new session - session = requests.Session(); - # make a synchronous get request to the status route - status_query_url = STATUS_URL.replace("<query_id>", query_id); - status_req = session.get(status_query_url); - status_resp_content = str(status_req.content); - #print(status_resp_content+"\n\n"); - # convert out to json - json_status_content = json.loads(status_resp_content); - # take a look at the state - # state attribute is always available - if json_status_content['state'] == 'SUCCESS': - query_processed = True; - break; - elif json_status_content['state'] in ['FAILURE', 'REVOKED']: - return raiseException( ERR_EXIT_CODE, "Query ID: "+str(query_id)+"\nQuery status: "+str(json_status_content['state']), output_dir_path, str(options.errorfile) ); - else: - time.sleep(QUERY_DELAY); # in seconds - - out_file_format = "tabular"; - for block in json_status_content['results']: - seq_id = block['sequence_id']; - # put response block in the output collection - output_file_path = os.path.join(output_dir_path, seq_id + "_" + out_file_format); - accessions_list = ""; - hits_block = block['hits']; - accessions_dict = { }; - is_sabutan = False; - for hit in hits_block: - if type(hit) is dict: # sabutan - #accessions_list = accessions_list + str(hit['accession_number']) + "\t" + str(hit['score']) + "\n"; - accession_number = hit['accession_number']; - #------------ - #score = hit['score']; - #score_split = score.split("/"); - #accessions_dict[accession_number] = "{0:.6f}".format(float(score_split[0])/float(score_split[1])); - #------------ - fraction = hit['fraction']; - score = hit['score']; - accession_scores = { - "fraction": str(fraction), - "score": float(score) - } - accessions_dict[accession_number] = accession_scores; - is_sabutan = True; - 
else: # all-some - accessions_list = accessions_list + str(hit) + "\n"; - if is_sabutan: - sorted_accessions = sorted(accessions_dict, key=lambda i: float(accessions_dict[i]["score"]), reverse=True); - for acc in sorted_accessions: - accessions_list = accessions_list + str(acc) + "\t" + str(accessions_dict[acc]["fraction"]) + "\t" + str(accessions_dict[acc]["score"]) + "\n"; - with open(output_file_path, 'w') as out: - out.write(accessions_list.strip()); - return sys.exit(OK_EXIT_CODE); - else: - return raiseException( ERR_EXIT_CODE, "Unable to query the remote server. Please try again in a while.", output_dir_path, str(options.errorfile) ); - -def query( options, args ): - output_dir_path = options.outputdir; - multiple_data = {}; - comma_sep_file_paths = options.files; - #print("files: "+str(comma_sep_file_paths)+" - "+str(type(comma_sep_file_paths))); - # check if options.files contains at least one file path - if comma_sep_file_paths is not None: - # split file paths - file_paths = comma_sep_file_paths.split(","); - # split file names - comma_sep_file_names = str(options.names); - #print("names: "+str(comma_sep_file_names)); - file_names = comma_sep_file_names.split(","); - for idx, file_path in enumerate(file_paths): - #file_name = file_names[idx]; - with open(file_path, 'r') as content_file: - for line in content_file: - if line.strip() != "": - line_split = line.strip().split("\t"); # split on tab - if len(line_split) == 2: # 0:id , 1:seq , otherwise skip line - seq_id = line_split[0]; - # fix seq_id using valid chars only - seq_id = ''.join(e for e in seq_id if e in VALID_CHARS) - seq_text = line_split[1]; - if seq_id in multiple_data: - return raiseException( ERR_EXIT_CODE, "Error: the id '"+seq_id+"' is duplicated", output_dir_path, str(options.errorfile) ); - multiple_data[seq_id] = seq_text; - if len(multiple_data) > 0: - return query_request( options, args, multiple_data ); - #return echo( options, args ); - else: - return raiseException( 
ERR_EXIT_CODE, "An error has occurred. Please be sure that your input files are valid.", output_dir_path, str(options.errorfile) ); - else: - # try with the sequence in --sequence - text_content = options.sequences; - #print("sequences: "+text_content); - # check if options.sequences contains a list of sequences (one for each row) - if text_content is not None: - text_content = str(text_content); - if text_content.strip(): - # populate a dictionary with the files containing the sequences to query - text_content = text_content.strip().split("__cn__"); # split on new line - for line in text_content: - if line.strip() != "": - line_split = line.strip().split("__tc__"); # split on tab - if len(line_split) == 2: # 0:id , 1:seq , otherwise skip line - seq_id = line_split[0]; - # fix seq_id using valid chars only - seq_id = ''.join(e for e in seq_id if e in VALID_CHARS) - seq_text = line_split[1]; - if seq_id in multiple_data: - return raiseException( ERR_EXIT_CODE, "Error: the id '"+seq_id+"' is duplicated", output_dir_path, str(options.errorfile) ); - multiple_data[seq_id] = seq_text; - if len(multiple_data) > 0: - return query_request( options, args, multiple_data ); - #return echo( options, args ); - else: - return raiseException( ERR_EXIT_CODE, "An error has occurred. 
Please be sure that your input files are valid.", output_dir_path, str(options.errorfile) ); - else: - return raiseException( ERR_EXIT_CODE, "You have to insert at least one row formatted as a tab delimited (ID, SEQUENCE) couple", output_dir_path, str(options.errorfile) ); - return ERR_EXIT_CODE; - -def __main__(): - # Parse the command line options - usage = "Usage: query.py --files comma_sep_file_paths --names comma_seq_file_names --sequences sequences_text --search search_mode --exact exact_alg --sthreshold threshold --outputdir output_dir_path"; - parser = optparse.OptionParser(usage = usage); - parser.add_option("-v", "--version", action="store_true", dest="version", - default=False, help="display version and exit") - parser.add_option("-f", "--files", type="string", - action="store", dest="files", help="comma separated files path"); - parser.add_option("-n", "--names", type="string", - action="store", dest="names", help="comma separated names associated to the files specified in --files"); - parser.add_option("-s", "--sequences", type="string", - action="store", dest="sequences", help="contains a list of sequences (one for each row)"); - parser.add_option("-a", "--fasta", type="string", - action="store", dest="fasta", help="contains the content of a fasta file"); - parser.add_option("-x", "--search", type="string", default="rrr", - action="store", dest="search", help="search mode"); - parser.add_option("-e", "--exact", type="int", default=0, - action="store", dest="exact", help="exact algorithm (required if search is 1 only)"); - parser.add_option("-k", "--tree", type="string", default=0, - action="store", dest="treeid", help="the id of the tree that will be queried"); - parser.add_option("-t", "--sthreshold", type="float", - action="store", dest="sthreshold", help="threshold applied to the search algrithm"); - parser.add_option("-z", "--sort", type="int", default=1, - action="store", dest="sortcontrol", help="boolean required to sort the result"); - 
parser.add_option("-o", "--outputdir", type="string", default="output", - action="store", dest="outputdir", help="output directory (collection) path"); - parser.add_option("-r", "--errorfile", type="string", default="error_txt", - action="store", dest="errorfile", help="error file name containing error messages"); - - # TEST - #sequences = 'NM_001169378.2__tc__atttcggatgctttggagggaggaactctagtgctgcattgattggggcgtgtgttaatgatattcccagttcgcatggcgagcatcgattcctggtacgtatgtgggccccttgactcccacttatcgcacttgtcgttcgcaatttgcatgaattccgcttcgtctgaaacgcacttgcgccagacttctccggctggtctgatctggtctgtgatccggtctggtggggcgccagttgcgtttcgagctcatcaccagtcactccgcagtcgcattctgccagaggtctccgatcaagagcgcttctccattcgagattcaaacgcagcgcggtctgacgccgccacatcgagtgaaatccatatcgatggccacattcacacaggacgagatcgacttcctgcgcagccatggcaacgagctgtgtgccaagacctggctgggattgtgggatccgaagcgggctgtgcaccagcaggagcagcgcgaactgatgatggacaagtatgagcggaagcgatactacctggagccggccagtcctcttaagtcgctggccaatgcggtcaacctgaagtcgtctgctccggcgacgaaccacactcagaatggccaccaaaatgggtatgccagcatccatttgacgcctcctgctgcccagcggacctcggccaatggattgcagaaggtggccaactcgtcgagtaactcttctggaaagacctcatcctcgatcagtaggccacactataatcaccagaacaacagccaaaacaacaatcacgatgcctttggcctgggtggcggattgagcagcctgaacagcgccggttccacatccactggagctctttccgacaccagcagttgtgctagcaatggcttcggtgcggactgcgactttgtggctgactttggctcggccaacattttcgacgccacatcggcgcgttccacaggatcgccggcggtgtcgtccgtgtcctcagtgggttccagcaatggctacgccaaggtgcagcccatccgggcagctcatctccagcagcaacagcagttgcagcagcagctgcatcagcagcagctcctcaatggcaatggtcatcagggcactgagaactttgccgacttcgatcacgctcccatctacaatgcagtggctccaccgacttttaacgattggatcagcgactggagcaggcggggcttccacgatcccttcgacgattgcgatgactcgccaccaggtgcccgccctccagcacctgcgccagctcctgctcaagttcccgcagtatcatcaccattgccaaccgtccgagaagaaccagagcttgcgtggaatttttgggaggacgagatgcgaatagaggcgcaggaaaaggagtcccaaactaaacagccggagttgggctactccttttcgattagtactactacgcccctttccccttcgaatcccttcctgccctaccttgtcagtgaggagcagcatcgaaatcatccagagaagccctccttttcgtattcgttgttcagctccatatcaaatagttcgcaagaagatcaggcggatgatcatgagatgaatgttttaaatgccaatttccatgatttctttacgtggagtgctc
ccttgcagaacggccatacgaccagtccgcccaagggcggaaatgcagcgatggcgcccagtgaggatcgatatgccgctcttaaggatctcgacgagcagctgcgagaactgaaggccagcgaaagcgccacagagacgcccacgcccaccagtggcaatgttcaggccacagatgcctttggtggagccctcaacaacaatccaaatcccttcaagggccagcaacagcagcagctcagcagccatgtggtgaatccattccagcagcagcaacagcagcagcaccagcagaatctctatggccagttgacgctcataccaaatgcctacggcagcagttcccagcagcagatggggcaccatctcctccagcagcagcagcagcaacagcagagcttcttcaacttcaacaacaacgggttcgccatctcgcagggtctgcccaacggctgcggcttcggcagcatgcaacccgctcctgtgatggccaacaatccctttgcagccagcggcgccatgaacaccaacaatccattcttatgagactcaacccgggagaatccgcctcgcgccacctggcagaggcgctgagccagcgaacaaagagcagacgcggaggaaccgaaccgaaattagtccattttactaacaatagcgttaatctatgtatacataatgcacgccggagagcactctttgtgtacatagcccaaatatgtacacccgaaaggctccacgctgacgctagtcctcgcggatggcggaggcggactggggcgttgatatattcttttacatggtaactctactctaacgtttacggatacggatatttgtatttgccgtttgccctagaactctatacttgtactaagcgcccatgaacacttcatccactaacatagctactaatcctcatcctagtggaggatgcagttggtccagacactctgttatttgttttatccatcctcgtacttgtctttgtcccatttagcactttcgttgcggataagaactttgtcagttattgattgtgtggccttaataagattataaaactaaatattataacgtacgactatacatatacggatacagatacagattcagacacagttagtacagatacagatatacatatacgcttttgtacctaatgaattgcttcttgtttccattgctaatcatctgcttttcgtgtgctaattttatacactagtacgtgcgatatcggccgtgcagatagattgctcagctcgcgagtcaagcctcttttggttgcacccacggcagacatttgtacatatactgtctgattgtaagcctcgtgtaatacctccattaacaccactcccccaccacccatccatcgaaccccgaatccatgactcaattcactgctcacatgtccatgcccatgccttaacgtgtcaaacattatcgaagccttaaagttatttaaaactacgaaatttcaataaaaacaaataagaacgctatc'; - #(options, args) = parser.parse_args(['-x', 'rrr', '-t', 0.5, '-s', sequences, '-o', 'collection_content']); - - (options, args) = parser.parse_args(); - if options.version: - print __version__; - else: - # create output dir (collection) - output_dir_path = options.outputdir; - if not os.path.exists(output_dir_path): - os.makedirs(output_dir_path); - - return query( options, args ); - -if __name__ == "__main__": __main__()
--- a/query.xml Fri Jan 18 10:12:40 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,94 +0,0 @@ -<?xml version="1.0"?> -<tool name="BloomTree Manager - Query" id="btman_query" version="1.0.0"> - <description>a Sequence Bloom Tree</description> - <macros> - <import>macros.xml</import> - </macros> - <expand macro="requirements" /> - <command detect_errors="exit_code"> -<![CDATA[ - python '$__tool_directory__/query.py' - - --tree '${treeid}' - --search 'rrr' - --sthreshold ${sthreshold} - --sort ${sortcontrol} - --exact 0 - - #if $conditional_input.inputtype == '0': - #set file_paths = ','.join( [ str( $f ) for $f in $conditional_input.txtfiles ] ) - #if $file_paths is not 'None': - --files '${file_paths}' - #set file_names = ','.join( [ str( $f.name ) for $f in $conditional_input.txtfiles ] ) - --names '${file_names}' - #end if - #elif $conditional_input.inputtype == '1': - --sequences '${conditional_input.sequences}' - #end if - - --outputdir 'collection_content' -]]> - </command> - <inputs> - <conditional name="conditional_input"> - <param name="inputtype" type="select" label="Input mode" help="Select a mode based on how do you want to specify the input"> - <option value="0" selected="true">By file</option> - <option value="1">By manually inserted text</option> - </param> - <when value="0"> - <param format="tabular" name="txtfiles" type="data" label="Select files" multiple="true" optional="false" help="Select one or more tabular files containing (ID, TRANSCRIPT) couples for each line. The content of these files will be merged and the result will represent a query to the Sequence Bloom Tree that will return a collection containing a file for each ID. The content of these files as result of the tool will be a list of accession numbers." 
/> - </when> - <when value="1"> - <param name="sequences" type="text" area="True" size="5x25" label="Manually insert sequences" optional="false" help="Insert a list of (ID, TRANSCRIPT) couples in a tab delimited format, one for each line. The content of this text box will represent a query to the Sequence Bloom Tree that will return a collection containing a file for each ID. The content of these files as result of the tool will be a list of accession numbers." /> - </when> - </conditional> - - <param name="sthreshold" size="3" type="float" value="0.7" min="0.0" max="1.0" label="Search threshold" help="This threshold controls the specificity. Lower values will produce more hits to the query. Higher values are more stringent and will produce fewer hits." /> - <param name="sortcontrol" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Sort the result by the number of hits per transcript." /> - - <param name="treeid" size="30" type="text" value="" label="Sequence Bloom Tree identifier" help="Set this field according to the result of the Create tool of the BloomTree Manager Suite." /> - </inputs> - <outputs> - <collection name="output_collect" type="list" label="BloomTree Manager - Query result collection"> - <discover_datasets pattern="(?P<identifier_0>[^_]+)_(?P<ext>[^_]+)" directory="collection_content" ext="auto" /> - </collection> - </outputs> - - <help><![CDATA[ -This tool is part of the BloomTree Manager Framework that allow to rapidly identify all -sequenced samples which express a transcript of interest. - ----- - -The input for this tool is a list of (ID, TRANSCRIPT) couples, one for each line, -in a tab delimited format:: - - id0 CCAACCAAAGGGAAAACTTTTTTCCGACTTTGGCCTAAAGGGTTTAACGGCCAAGTCAGAAGGGAAAAAGTTGCGCCA - id1 TTAATGACAGGGCCACATGATGTGAAAAAAAATCAGAAACCGAGTCAACGTGAGAAGATAGTACGTACTACCGCAAAT - ... 
- idn CAATTAATGATAAATATTTTATAAGGTGCGGAAATAAAGTGAGGAATATCTTTTAAATTCAAGTTCAATTCTGAAAGC - -The ID can contain alphanumeric characters in addition to spaces, dots, dashes, and round and square brackets. -Any additional character will be trimmed out. - -The Sequence Bloom Tree identifier must be also specified. It is a string that identify an existing Sequence -Bloom Tree, which should be built with the Create tool of the BloomTree Manager Suite. - -The output of the tool is a collection that contains a file for each ID with a list of -accession numbers representing the samples that express one particular transcript. - ----- - -.. class:: infomark - -**Notes** - -This Galaxy tool has been developed by Fabio Cumbo. - -Please visit this GithHub_repository_ for more information about the BloomTree Manager - -.. _GithHub_repository: https://github.com/fabio-cumbo/bloomtree-manager - ]]></help> - - <expand macro="citations" /> -</tool>