Mercurial > repos > fabio > btman
changeset 19:7f712cc0d3d5 draft
Uploaded 20190304.2
author | fabio |
---|---|
date | Mon, 04 Mar 2019 08:31:28 -0500 |
parents | be864d79c9c7 |
children | 1dc3f0c61817 |
files | .shed.yml btman-1.0.0/.shed.yml btman-1.0.0/build.sh btman-1.0.0/cluster.sh btman-1.0.0/create.py btman-1.0.0/create.xml btman-1.0.0/dataset.tsv btman-1.0.0/macros.xml btman-1.0.0/makebf.sh btman-1.0.0/query.py btman-1.0.0/query.tsv btman-1.0.0/query.xml build.sh cluster.sh create.py create.xml dataset.tsv macros.xml makebf.sh query.py query.tsv query.xml |
diffstat | 22 files changed, 738 insertions(+), 738 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.shed.yml Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,21 @@ +name: btman +owner: iuc +categories: + - Data Source + - Web Services +description: BloomTree Manager +long_description: | + A suite of tools to fast create and query Sequence Bloom Trees + supporting determined/how split filters +remote_repository_url: https://github.com/fabio-cumbo/bloomtree-manager +homepage_url: https://github.com/fabio-cumbo/bloomtree-manager +type: unrestricted +auto_tool_repositories: + name_template: "{{ tool_id }}" + descriptor_template: "Wrapper for BloomTree Manager: {{ tool_name }}." +suite: + name: "btman_suite" + description: "A suite of Galaxy tools designed to work with the BloomTree Manager." + long_description: | + A suite of tools to fast create and query Sequence Bloom Trees + supporting determined/how split filters
--- a/btman-1.0.0/.shed.yml Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -name: btman -owner: iuc -categories: - - Data Source - - Web Services -description: BloomTree Manager -long_description: | - A suite of tools to fast create and query Sequence Bloom Trees - supporting determined/how split filters -remote_repository_url: https://github.com/fabio-cumbo/bloomtree-manager -homepage_url: https://github.com/fabio-cumbo/bloomtree-manager -type: unrestricted -auto_tool_repositories: - name_template: "{{ tool_id }}" - descriptor_template: "Wrapper for BloomTree Manager: {{ tool_name }}." -suite: - name: "btman_suite" - description: "A suite of Galaxy tools designed to work with the BloomTree Manager." - long_description: | - A suite of tools to fast create and query Sequence Bloom Trees - supporting determined/how split filters
--- a/btman-1.0.0/build.sh Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -#!/bin/bash - -outExpDir=$1 - -cd ${outExpDir} - -howdesbt build --HowDe --tree=union.txt --outtree=howde.txt
--- a/btman-1.0.0/cluster.sh Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -#!/bin/bash - -outExpDir=$1 -bfsize=$2 - -cd ${outExpDir} - -ls *.bf > leafnames.txt -howdesbt cluster --list=leafnames.txt --bits=${bfsize} --tree=union.txt --nodename=node{number} --keepallnodes -#rm leafnames_txt_?
--- a/btman-1.0.0/create.py Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,233 +0,0 @@ -#!/usr/bin/env python - -import sys, os, optparse, shutil, glob - -__version__ = "1.0.0" -# in the case of collections, exitcodes equal to 0 and 1 are not considered errors -ERR_EXIT_CODE = 2 -OK_EXIT_CODE = 0 -VALID_CHARS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' - -def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ): - print message - with open( logfilepath, 'a+' ) as out: - out.write( message + '\n' ) - if exit: - sys.exit( exitcode ) - -def downloadAccessions(formats, filepaths, outlogfile, outdirpath): - downloaded_files = { } - for dataset_idx in range(0, len(formats)): - if formats[ dataset_idx ] == 'accessions': - print filepaths[ dataset_idx ] - with open( filepaths[ dataset_idx ] ) as accessions: - for line in accessions: - print line - accession = line.split( '\t' )[0].strip() - if accession: - printLog( outlogfile, 'Downloading \"' + accession.upper() + '\" with the fastq-dump tool (part of the sra-tools utility)' ) - fastq_dump_exitcode = os.system( 'fastq-dump --outdir ' + outdirpath + ' --fasta ' + accession.upper() ) - if fastq_dump_exitcode > 0: - printLog( outlogfile, '> FASTA file: FAILED ( \"' + accession.upper() + '\" will be excluded )' ) - else: - #os.rename( os.path.join( outdirpath, accession.upper() + '.fasta' ), os.path.join( outdirpath, accession.upper() + '_fasta' ) ) - printLog( outlogfile, '> FASTA file: \"' + accession.upper() + '.fasta\"' ) - accession_data = { - 'format': '.fasta', - 'filepath': os.path.join( outdirpath, accession.upper() + '.fasta' ), - 'filename': ''.join( c for c in accession.upper() if c in VALID_CHARS ) - } - downloaded_files[ accession.upper() ] = accession_data - return downloaded_files - -# format = { fasta, fastq, accession } -# this version skip the quality control procedure -def createSBT( options, args ): - outlogfile = str( options.outfile ) - outdirpath = str( options.outdir ) - if not outdirpath.endswith('/'): outdirpath += '/' - if not os.path.exists( outdirpath ): - os.mkdir( outdirpath ) - outdirpath = os.path.abspath( outdirpath ) - os.chdir( outdirpath ) - tooldirpath = os.path.abspath( str( options.tooldir ) ) - if not tooldirpath.endswith('/'): tooldirpath += '/' - - formats = [ fo for fo in str( options.formats ).split( '|' ) if fo.strip() ] - filepaths = [ filepath for filepath in str( options.filepaths ).split( '|' ) if filepath.strip() ] - filenames = [ filename for filename in str( options.filenames ).split( '|' ) if filename.strip() ] - compressed = [ True == int(c) for c in str( options.compressed ).split( '|' ) if c.strip() ] - minabundances = [ int(minab) for minab in str( options.minabundances ).split( '|' ) if minab.strip() ] - qualitythresholds = [ float(qthres) for qthres in str( options.qualitythresholds ).split( '|' ) if qthres.strip() ] - - klen = int( options.klen ) - bfsize = int( options.bfsize ) - - if len(formats) == len(filepaths) == len(filenames) == len(compressed) == len(minabundances) == len(qualitythresholds): - printLog( outlogfile, 'Retrieving experiments' ) - accessions = downloadAccessions( formats, filepaths, outlogfile, outdirpath ) - printLog( outlogfile, '> ' + str( len( accessions ) ) + ' experiments retrieved from the Sequence Read Archive' ) - acc_arr = [ a for a in accessions ] - print str( acc_arr ) - if bfsize < 0: # estimate bloom filter size - data_paths = ' '.join( accessions[ accession ][ 'filepath' ] for accession in accessions if 'filepath' in accessions[ accession ] ) - print data_paths - if len( data_paths ) > 0: - data_paths += ' ' - for dataset_idx in range(0, len(formats)): - if formats[ dataset_idx ] != 'accessions': - data_paths += ' '.join( path for path in filepaths[ dataset_idx ].split( ',' ) ) - # ntcard - printLog( outlogfile, 'Estimating the Bloom Filter size with ntcard' ) - if len( data_paths ) > 0: - ntcard_res_filepath = os.path.join( outdirpath, 'freq_k' + str( klen ) + '.hist' ) - ntcard_exitcode = os.system( 'ntcard --kmer=' + str( klen ) + ' ' + data_paths ) - print 'ntcard --kmer=' + str( klen ) + ' ' + data_paths - if ntcard_exitcode > 0: - printLog( outlogfile, '> [exitcode: ' + str(ntcard_exitcode) + '] an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True ) - else: - if os.path.exists( ntcard_res_filepath ): - os.rename( ntcard_res_filepath, os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' ) ) - ntcard_res_filepath = os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' ) - var_F0 = None - var_f1 = None - with open( ntcard_res_filepath ) as ntcard_res: - for line in ntcard_res: - line = line.strip() - if line: - line_split = line.split( '\t' ) - if len(line_split) == 2: - if line_split[0] == 'F0': - var_F0 = int( line_split[1] ) - elif line_split[0] == 'f1': - var_f1 = int( line_split[1] ) - if var_F0 is not None and var_f1 is not None: - break - if var_F0 is not None and var_f1 is not None: - bfsize = var_F0 - var_f1 - printLog( outlogfile, '> estimated Bloom Filter size: ' + str(bfsize) ) - else: - printLog( outlogfile, '> an error has occurred while estimating the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, '> an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, '> unable to estimate the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True ) - - if bfsize > 0: - for dataset_idx in range(0, len(formats)): - if formats[ dataset_idx ] == 'accessions': - with open( filepaths[ dataset_idx ] ) as accessions_file: - for line in accessions_file: - accession = line.split( '\t' )[0].strip().upper() - if accession in accessions: - curr_format = accessions[ accession ][ 'format' ] - curr_compressed = 'uncompress' - curr_filepath = accessions[ accession ][ 'filepath' ] - curr_filename = accessions[ accession ][ 'filename' ] - printLog( outlogfile, 'Processing \"' + accession + '\" ( format=\"' + curr_format + - '\", compressed=\"' + str(False) + '\", fixed_name=\"' + curr_filename + '\" )' ) - print 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1' - makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' + - curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + - str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1' ) - if makebf_exitcode > 0: - printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + accession + '\" will be excluded )' ) - else: - printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename + '.bf\"' ) - else: - curr_format = '.' + formats[ dataset_idx ].lower() - curr_compressed = '.gz' if compressed[ dataset_idx ] else 'uncompress' - curr_filepaths = filepaths[ dataset_idx ].split( ',' ) - curr_filenames = filenames[ dataset_idx ].split( ',' ) - for curr_idx in range(0, len(curr_formats)): - curr_filename_fixed = ''.join( c for c in curr_filenames[ curr_idx ] if c in VALID_CHARS ) - printLog( outlogfile, 'Processing \"' + curr_filenames[ curr_idx ] + '\" ( format=\"' + curr_format + - '\", compressed=\"' + str(compressed[ dataset_idx ]) + '\", fixed_name=\"' + curr_filename_fixed + '\" )' ) - if compressed[ dataset_idx ]: - makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ dataset_idx ] + ' ' + curr_filename_fixed + ' ' + - curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + - str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 1' ) - else: - makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ dataset_idx ] + ' ' + curr_filename_fixed + ' ' + - curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + - str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 0' ) - if makebf_exitcode > 0: - printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + curr_filenames[ curr_idx ] + '\" will be excluded )' ) - else: - printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename_fixed + '.bf\"' ) - # Create a tree topology - printLog( outlogfile, 'Creating a tree topology file' ) - bf_counter = len( glob.glob1( outdirpath, '*.bf' ) ) - if bf_counter > 0: - cluster_exitcode = os.system( 'sh ' + tooldirpath + 'cluster.sh ' + outdirpath + ' ' + str( bfsize ) ) - if cluster_exitcode > 0: - printLog( outlogfile, '> [exitcode: ' + str(cluster_exitcode) + '] an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True ) - else: - # Build the HowDeSBT nodes - if os.path.exists( os.path.join( outdirpath, 'leafnames.txt' ) ): - printLog( outlogfile, 'Building the Bloom Filter files for the tree' ) - build_exitcode = os.system( 'sh ' + tooldirpath + 'build.sh ' + outdirpath ) - if build_exitcode > 0: - printLog( outlogfile, '> [exitcode: ' + str(build_exitcode) + '] an error has occurred during the creation of the Bloom Filter files for the tree', exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, '> the tree has been successfully built: \"howde.txt\"', exitcode=OK_EXIT_CODE, exit=True ) - ''' - howde_filepath = os.path.join( outdirpath, 'howde.txt' ) - howde_galaxy_filepath = os.path.join( outdirpath, 'howde_galaxy.txt' ) - howde_galaxy = open( howde_galaxy_filepath, 'w' ) - with open( howde_filepath ) as howde_file: - for line in howde_file: - line = line.strip() - if line: - # trim stars * and get node name - # find galaxy file path to the node name - # rewrite path with stars - howde_galaxy.close() - ''' - else: - printLog( outlogfile, '> an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, '> no Bloom Filter files found', exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, '> ERROR: the Bloom Filter size is ' + str( bfsize ), exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, 'Something went wrong with the input parameters', exitcode=ERR_EXIT_CODE, exit=True ) - -def __main__(): - # Parse the command line options - usage = ("Usage: create.py --formats file_formats --filepaths file_paths --filenames file_names " - "--compressed file_compressed --minabundance min_abundance --qualitythresholds quality_thresholds " - "--klen kmer_len --bfsize bloom_filter_size --outfile out_log_file_path --outdir out_dir_path") - parser = optparse.OptionParser(usage = usage) - parser.add_option("-v", "--version", action="store_true", dest="version", - default=False, help="display version and exit") - parser.add_option("-f", "--formats", type="string", - action="store", dest="formats", help="list of file formats separated by a tab character") - parser.add_option("-p", "--filepaths", type="string", - action="store", dest="filepaths", help="list of input file paths separated by a tab character") - parser.add_option("-n", "--filenames", type="string", - action="store", dest="filenames", help="list of input file names separated by a tab character") - parser.add_option("-c", "--compressed", type="string", - action="store", dest="compressed", help="list of compressed flags related to the imput files separated by a tab character") - parser.add_option("-m", "--minabundances", type="string", - action="store", dest="minabundances", help="list of blooom filter minimum abundances related to the imput files separated by a tab character") - parser.add_option("-q", "--qualitythresholds", type="string", - action="store", dest="qualitythresholds", help="list of quality thresholds related to the imput files separated by a tab character") - parser.add_option("-k", "--klen", type="int", default=21, - action="store", dest="klen", help="k-mer length") - parser.add_option("-b", "--bfsize", type="int", default=-1, - action="store", dest="bfsize", help="bloom filter size") - parser.add_option("-o", "--outfile", type="string", default="sbtres.txt", - action="store", dest="outfile", help="output log file path") - parser.add_option("-d", "--outdir", type="string", default="sbtres.txt", - action="store", dest="outdir", help="output directory path") - parser.add_option("-t", "--tooldir", type="string", default="./", - action="store", dest="tooldir", help="tool directory path") - - (options, args) = parser.parse_args() - if options.version: - print __version__ - else: - createSBT( options, args ) - -if __name__ == "__main__": __main__()
--- a/btman-1.0.0/create.xml Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,173 +0,0 @@ -<?xml version="1.0"?> -<tool name="BloomTree Manager - Create" id="btman_create" version="1.0.0"> - <description>a Sequence Bloom Tree</description> - <macros> - <import>macros.xml</import> - </macros> - <expand macro="requirements" /> - <command detect_errors="exit_code"> -<![CDATA[ - python '$__tool_directory__/create.py' - - #set formats = '' - #set filepaths = '' - #set filenames = '' - #set compressed = '' - #set minab = '' - #set qthres = '' - #for $i, $exp in enumerate( $experiments ): - #set formats += str( $exp.conditional_format.format ) + '|' - #if $exp.conditional_format.format == 'accessions': - #set filepaths += str( $exp.conditional_format.accession_numbers ) + '|' - #set filenames += str( $exp.conditional_format.accession_numbers.name ) + '|' - #set compressed += '0|' - #else: - #if $exp.conditional_format.format == 'fasta': - #set compressed += str( $exp.conditional_format.conditional_fasta_compressed.fasta_compressed ) + '|' - #if $exp.conditional_format.conditional_fasta_compressed.fasta_compressed == 0: - #set filepaths += ','.join( [ str( $f ) for $f in $exp.conditional_format.conditional_fasta_compressed.fastafiles ] ) + '|' - #set filenames += ','.join( [ str( $f.name ) for $f in $exp.conditional_format.conditional_fasta_compressed.fastafiles ] ) + '|' - #else: - #set filepaths += ','.join( [ str( $f ) for $f in $exp.conditional_format.conditional_fasta_compressed.fastagzfiles ] ) + '|' - #set filenames += ','.join( [ str( $f.name ) for $f in $exp.conditional_format.conditional_fasta_compressed.fastagzfiles ] ) + '|' - #end if - #elif $exp.conditional_format.format == 'fastq': - #set compressed += str( $exp.conditional_format.conditional_fastq_compressed.fastq_compressed ) + '|' - #if $exp.conditional_format.conditional_fastq_compressed.fastq_compressed == 0: - #set filepaths += ','.join( [ str( $f ) for $f in $exp.conditional_format.conditional_fastq_compressed.fastqfiles ] ) + '|' - #set filenames += ','.join( [ str( $f.name ) for $f in $exp.conditional_format.conditional_fastq_compressed.fastqfiles ] ) + '|' - #else: - #set filepaths += ','.join( [ str( $f ) for $f in $exp.conditional_format.conditional_fastq_compressed.fastqgzfiles ] ) + '|' - #set filenames += ','.join( [ str( $f.name ) for $f in $exp.conditional_format.conditional_fastq_compressed.fastqgzfiles ] ) + '|' - #end if - #end if - #end if - #set minab += str( $exp.min_abundance ) + '|' - #if $exp.conditional_quality.quality_control == '1': - #set qthres += str( $exp.conditional_quality.quality_threshold ) + '|' - #else: - #set qthres += '-1.0|' - #end if - #end for - #set klen = $kmer_len - #set bfsize = -1 - #if $bloomsize_condition.bloomsize_control == '0': - #set bfsize = $bloomsize_condition.bloom_filter_size - #end if - - --formats '${formats}' - --filepaths '${filepaths}' - --filenames '${filenames}' - --compressed '${compressed}' - --minabundances '${minab}' - --qualitythresholds '${qthres}' - - --klen ${klen} - --bfsize ${bfsize} - - --outfile '${resulttxt}' - --outdir 'sbt' - --tooldir '$__tool_directory__' -]]> - </command> - <inputs> - <repeat name="experiments" title="Select a list of experiments" help="Select a set of experiments on which the Sequence Bloom Tree will be built." min="1"> - <conditional name="conditional_format"> - <param name="format" type="select" label="Select the experiment format" help="FASTA and FASTQ are the supported formats"> - <option value="fasta">FASTA Experiments</option> - <option value="fastq">FASTQ Experiments</option> - <option value="accessions">SRA Accession Numbers</option> - </param> - <when value="fasta"> - <conditional name="conditional_fasta_compressed"> - <param name="fasta_compressed" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Are your experiments compressed?" /> - <when value="0"> - <param format="fasta" name="fastafiles" multiple="true" type="data" label="Select one or more FASTA experiments" /> - </when> - <when value="1"> - <param format="fastagz" name="fastagzfiles" multiple="true" type="data" label="Select one or more FASTA .gz experiments" /> - </when> - </conditional> - </when> - <when value="fastq"> - <conditional name="conditional_fastq_compressed"> - <param name="fastq_compressed" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Are youe experiments compressed?" /> - <when value="0"> - <param format="fastq" name="fastqfiles" multiple="true" type="data" label="Select one or more FASTQ experiments" /> - </when> - <when value="1"> - <param format="fastqgz" name="fastqgzfiles" multiple="true" type="data" label="Select one or more FASTQ .gz experiments" /> - </when> - </conditional> - </when> - <when value="accessions"> - <param name="accession_numbers" type="data" format="tabular" label="Select a list of SRA Accession Numbers" help="Select a tabular file with a list of accession numbers in the first column." /> - </when> - </conditional> - - <param name="min_abundance" type="integer" value="2" min="0" label="Insert a Bloom filter minimum abundance" help="This value is the minimum abundance cutoff for the creation of the Bloom filter." /> - - <conditional name="conditional_quality"> - <param name="quality_control" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Apply a quality control procedure" /> - <when value="1"> - <param name="quality_threshold" size="1" type="float" value="0.8" min="0.0" max="1.0" label="Quality threshold" help="If the number of sequences flagged as poor quality on the total number of sequences in a file is less than this threshold, the whole experiment will be excluded." /> - </when> - </conditional> - </repeat> - - <param name="kmer_len" type="integer" value="21" min="0" label="K-mer length" /> - - <conditional name="bloomsize_condition"> - <param name="bloomsize_control" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Automatically estimate the Bloom filter size" /> - <when value="0"> - <param name="bloom_filter_size" size="1" type="integer" value="1" min="1" label="Bloom Filter size" help="Disable this field to let the tool estimate an appropriate Bloom filter size." /> - </when> - </conditional> - </inputs> - <outputs> - <collection name="list_output" type="list" label="${tool.name} SBT Collection"> - <discover_datasets pattern="(?P<identifier_0>.*(?=\.)).(?P<ext>[^\.]*$)" ext="auto" directory="sbt" /> - </collection> - <data format="txt" name="resulttxt" label="${tool.name} SBT: Result" from_work_dir="sbtres.txt" /> - </outputs> - - <help><![CDATA[ -This tool allows to create Sequence Bloom Trees starting from a set of FASTA or FASTQ files. -It also allows to control the quality of the input dataset and exclude the files that do not reach a specified quality level. - ------ - -**Input file** - -The input of this tool is a set of FASTA or FASTQ experiments, additionally to a set of SRA accession numbers. -For each of the selected experiments, the minimum abundance for the corresponding Bloom filter is required. -Additionally, a quality control procedure could be applied to guarantee that the quality of every experiment always exceed a -specified treshold. Otherwise, experiments with low quality level will be discarded. - -The k-mer length must also be specified, additionally to the Bloom filter size. This last field is optional and it will be -automatically estimated if not provided. - ------ - -**Output** - -This tool returns a collection containing the Sequence Bloom Tree nodes and a file representing the organization of the tree. - -Take a look at the Query tool documentation for a detailed description about how -to query a Sequence Bloom Tree. - ------ - -.. class:: infomark - -**Notes** - -This Galaxy tool has been developed by Fabio Cumbo. - -Please visit this GithHub_repository_ for more information about the BloomTree Manager - -.. _GithHub_repository: https://github.com/fabio-cumbo/bloomtree-manager - ]]></help> - - <expand macro="citations" /> -</tool>
--- a/btman-1.0.0/dataset.tsv Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -SRR833714 -SRR833713 -SRR833715 -SRR567161 -SRR567146 -SRR191393 -SRR191449 -SRR191448 -SRR191447 -SRR191446 -SRR191445
--- a/btman-1.0.0/macros.xml Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,16 +0,0 @@ -<macros> - <xml name="requirements"> - <requirements> - <requirement type="package" version="2.7.10">python</requirement> - <requirement type="package" version="2.9.1">sra-tools</requirement> - <requirement type="package" version="1.0.0">ntcard</requirement> - <requirement type="package" version="1.00.00">howdesbt</requirement> - </requirements> - </xml> - - <xml name="citations"> - <citations> - <citation type="doi">10.1101/090464</citation> - </citations> - </xml> -</macros> \ No newline at end of file
--- a/btman-1.0.0/makebf.sh Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -#!/bin/bash - -expPath=$1 -expName=$2 -expFormat=$3 -expCompress=$4 - -outExpDir=$5 - -klen=$6 -minab=$7 -bfsize=$8 - -rmCompressed=$9 -rmSource=${10} - -cd ${outExpDir} - -if [ "${expCompress}" == ".gz" ]; then - gzip -dc ${expPath} > ${expName}${expFormat} - howdesbt makebf K=${klen} --min=${minab} --bits=${bfsize} ${expName}${expFormat} --out=${expName}.bf - if [ "${rmCompressed}" -eq "1" ]; then - rm ${expPath} - fi - if [ "${rmSource}" -eq "1" ]; then - rm ${expName}${expFormat} - fi -else - howdesbt makebf K=${klen} --min=${minab} --bits=${bfsize} ${expPath} --out=${expName}.bf - if [ "${rmSource}" -eq "1" ]; then - rm ${expPath} - fi -fi
--- a/btman-1.0.0/query.py Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,148 +0,0 @@ -#!/usr/bin/env python - -import sys, os, optparse, shutil - -__version__ = "1.0.0" -VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' -# in the case of collections, exitcodes equal to 0 and 1 are not considered errors -ERR_EXIT_CODE = 2 -OK_EXIT_CODE = 0 - -def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ): - print message - with open( logfilepath, 'a+' ) as out: - out.write( message + '\n' ) - if exit: - sys.exit( exitcode ) - -def querySBT( options, args ): - output_dir_path = options.outputdir - outlogfile = options.outfile - - tree_file_paths = options.treep.split( ',' ) - tree_file_names = options.treen.split( ',' ) - tree_def_filepath = None - leafnames_filepath = None - for idx, tree_file_name in enumerate( tree_file_names ): - if tree_file_name == 'howde': - tree_def_filepath = tree_file_paths[ idx ] - elif tree_file_name == 'leafnames': - leafnames_filepath = tree_file_paths[ idx ] - if tree_def_filepath is not None and leafnames_filepath is not None: - break - - if tree_def_filepath is not None and leafnames_filepath is not None: - leafnames_counter = 0 - with open( leafnames_filepath ) as leafnames_file: - for line in leafnames_file: - if line.strip(): - leafnames_counter += 1 - if leafnames_counter > 0: - printLog( outlogfile, 'The selected collection contains a valid tree' ) - shutil.copyfile( tree_def_filepath, 'howde.txt' ) - tree_def_filepath = 'howde.txt' - for idx, tree_file_name in enumerate( tree_file_names ): - if tree_file_name.endswith( 'detbrief.rrr' ): - shutil.copyfile( tree_file_paths[ idx ], tree_file_name + '.bf' ) - - printLog( outlogfile, 'Creating batch of queries' ) - # create tmp batch file - batch_file_name = 'queries.fa' - batch_file = open( batch_file_name, 'w' ) - - comma_sep_file_paths = options.files - # check if options.files contains at least one file path - if comma_sep_file_paths is not None: - # split file paths - file_paths = comma_sep_file_paths.split(",") - # split file names - file_names = options.names.split(",") - for idx, file_path in enumerate(file_paths): - fixed_file_name = ''.join( c for c in file_names[ idx ] if c in VALID_CHARS ) - printLog( outlogfile, '> processing file ' + file_names[ idx ] + ' ( fixed_name=\"' + fixed_file_name + '\" ) ' ) - with open(file_path, 'r') as content_file: - for line in content_file: - line = line.strip() - if line: - line_split = line.strip().split("\t") # split on tab - if len(line_split) == 2: # 0:id , 1:seq , otherwise skip line - original_seq_id = line_split[0] - # fix seq_id using valid chars only - seq_id = ''.join( c for c in original_seq_id if c in VALID_CHARS ) - printLog( outlogfile, '> sequence ' + original_seq_id + ' ( fixed_name=\"' + seq_id + '\" )' ) - seq_text = line_split[1] - - # write on batch file - batch_file.write( '> ' + fixed_file_name + '_' + seq_id + '\n' + seq_text + '\n' ) - batch_file.close() - # query the tree - printLog( outlogfile, 'Querying the tree' ) - query_res_file_path = os.path.abspath( 'answer.txt' ) - sort_param = '--sort' - if options.sort == 0: - sort_param = '' - query_exitcode = os.system( 'howdesbt query --tree=' + os.path.abspath( tree_def_filepath ) + ' ' + os.path.abspath( batch_file_name ) + '=' + str(options.threshold) + ' --out=' + query_res_file_path ) + ' ' + sort_param - if query_exitcode > 0: - printLog( outlogfile, '> ERROR: an error has occurred while querying the tree with the sequence [id: ' + seq_id + '] in input file ' + file_names[ idx ] ) - else: - if os.path.exists( query_res_file_path ): - with open( query_res_file_path ) as query_res_file: - file_path = '' - theta_matches = 0 - for line in query_res_file: - line = line.strip() - if line: - if line.startswith( '*' ): - line_split = line.split( ' ' ) - theta_matches = int( line_split[ 1 ] ) - file_name = line_split[ 0 ].replace( '*', '' ) - file_path = os.path.join( output_dir_path, file_name + '_txt' ) - open( file_path, 'a' ).close() - else: - res_file = open( file_path, 'a+' ) - fraction = str( theta_matches ) + '/' + str( leafnames_counter ) - score = format( round( float( theta_matches ) / float( leafnames_counter ) , 6 ), '6f' ) - res_file.write( line + '\t' + fraction + '\t' + score + '\n' ) - res_file.close() - else: - printLog( outlogfile, 'An error has occurred while querying the tree', exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True ) - -def __main__(): - # Parse the command line options - usage = "Usage: query.py --files comma_sep_file_paths --names comma_seq_file_names --sequences sequences_text --search search_mode --exact exact_alg --sthreshold threshold --outputdir output_dir_path" - parser = optparse.OptionParser(usage = usage) - parser.add_option("-v", "--version", action="store_true", dest="version", - default=False, help="display version and exit") - parser.add_option("-f", "--files", type="string", - action="store", dest="files", help="comma separated files path") - parser.add_option("-n", "--names", type="string", - action="store", dest="names", help="comma separated names associated to the files specified in --files") - parser.add_option("-k", "--treep", type="string", - action="store", dest="treep", help="paths of files in collection") - parser.add_option("-m", "--treen", type="string", - action="store", dest="treen", help="names of files in collection") - parser.add_option("-t", "--threshold", type="float", default=0.7, - action="store", dest="threshold", help="search threshold") - parser.add_option("-s", "--sort", type="int", default=1, - action="store", dest="sort", help="sort results") - parser.add_option("-o", "--outputdir", type="string", default="output", - action="store", dest="outputdir", help="output directory (collection) path") - parser.add_option("-r", "--outfile", type="string", default="query.txt", - action="store", dest="outfile", help="output log file path") - - (options, args) = parser.parse_args() - if options.version: - print __version__ - else: - # create output dir (collection) - output_dir_path = options.outputdir - if not os.path.exists(output_dir_path): - os.makedirs(output_dir_path) - - querySBT( options, args ) - -if __name__ == "__main__": __main__()
--- a/btman-1.0.0/query.tsv Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -0 CCAACCAAAGGGAAAACTTTTTTCCGACTTTGGCCTAAAGGGTTTAACGGCCAAGTCAGAAGGGAAAAAGTTGCGCCAAAAATGGCGTTAAAATGTGTAATCAGAGAAGCGACACGAAAAGGGGATCAGCTCTTGGCTGGCAATTGGTAGGTCAGAGGTGGATTGGGAAAAGGCAAGTCAGCAACTGTCGATGACGGCGACTGACTGTTAATGAAAATTGTTTTGGCTGTGTGGAAAAAAATACGCGGGAATCCGTGAATTTTCCGAGGAGCTGGTGGAGCGAAGAAAACGGGGTGCTGCTGTTGTAAATGATTGGTGAAAGTCACACGCCCGCAGCCTTGCCAAACTAATTAACGCCAAATGGAGCTAAGGCCTTTGAATGATGGCTGCAGGCTAGCTTATGAAAAGGGGTTGAAGAGAAGTGGAAAAATTGGTAGAAAGGGATTTGCTCAAGATGCC -1 TTAATGACAGGGCCACATGATGTGAAAAAAAATCAGAAACCGAGTCAACGTGAGAAGATAGTACGTACTACCGCAAATGAATGGCCATTTCATTTGCATGTTGGGAGCAACAGAAATGAGAGAGCATCCGAAGCTAACCACAAAAATGGACTTTGCTTCATTATGCACAAACACGCCAATAAATGTAACGAGAAAGATAGTAGGAGCGAAAGACGAGACGAGACAAACAGGAAGAAGACGAGTGGACGAGTGTTTTTTGTAACGAAACTCTTAATCGCTCCTTTGCAGGCTTAAGCTGATAGTTGCTACGTTTATGCCATGAATTTCAAGATCTCTCAAATGCGTGAAAATCCAGTTTATGCGACAGACAAATTCATGTATTTGAAAAATCTTAGCTGATAGAAATCAAAGGTGATT -2 CAATTAATGATAAATATTTTATAAGGTGCGGAAATAAAGTGAGGAATATCTTTTAAATTCAAGTTCAATTCTGAAAGC \ No newline at end of file
--- a/btman-1.0.0/query.xml Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,83 +0,0 @@ -<?xml version="1.0"?> -<tool name="BloomTree Manager - Query" id="btman_query" version="1.0.0"> - <description>a Sequence Bloom Tree</description> - <macros> - <import>macros.xml</import> - </macros> - <expand macro="requirements" /> - <command detect_errors="exit_code"> -<![CDATA[ - python '$__tool_directory__/query.py' - - #set coll_paths = ','.join( [ str( $f ) for $f in $treecollection ] ) - #if $coll_paths is not 'None': - --treep '${coll_paths}' - #set coll_names = ','.join( [ str( $f.name ) for $f in $treecollection ] ) - --treen '${coll_names}' - #end if - - #set file_paths = ','.join( [ str( $f ) for $f in $txtfiles ] ) - #if $file_paths is not 'None': - --files '${file_paths}' - #set file_names = ','.join( [ str( $f.name ) for $f in $txtfiles ] ) - --names '${file_names}' - #end if - - --threshold ${threshold} - --sort ${sort} - - --outputdir 'answer_collection' - --outfile '${resulttxt}' -]]> - </command> - <inputs> - <param name="treecollection" type="data_collection" collection_type="list" label="Select a Sequence Bloom Tree" help="Select a collection generated by the Create tool of the BloomTree Manager suite." /> - <param format="tabular" name="txtfiles" type="data" label="Select query files" multiple="true" optional="false" help="Select one or more tabular files containing (ID, TRANSCRIPT) couples for each line. The content of these files will be merged and the result will represent a query to the Sequence Bloom Tree that will return a collection containing a file for each ID. The content of these files as result of the tool will be a list of accession numbers." /> - <param name="threshold" size="1" type="float" value="0.7" min="0.0" max="1.0" label="Search threshold" help="Fraction of query kmers that must be present in a leaf to be considered a match." /> - <param name="sort" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Sort matched leaves by the number of query kmers present, and report the number of kmers present (if not selected we just report the matched leaves without regard to which matches are better)." /> - </inputs> - <outputs> - <collection name="output_collect" type="list:list" label="BloomTree Manager - Query result collection"> - <discover_datasets pattern="(?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)_(?P<ext>[^_]+)" directory="answer_collection" ext="auto" /> - </collection> - <data format="txt" name="resulttxt" label="${tool.name} SBT: Result" from_work_dir="query.txt" /> - </outputs> - - <help><![CDATA[ -This tool is part of the BloomTree Manager Framework that allow to rapidly identify all -sequenced samples which express a transcript of interest. - ----- - -The input for this tool is a list of (ID, TRANSCRIPT) couples, one for each line, -in a tab delimited format:: - - id0 CCAACCAAAGGGAAAACTTTTTTCCGACTTTGGCCTAAAGGGTTTAACGGCCAAGTCAGAAGGGAAAAAGTTGCGCCA - id1 TTAATGACAGGGCCACATGATGTGAAAAAAAATCAGAAACCGAGTCAACGTGAGAAGATAGTACGTACTACCGCAAAT - ... - idn CAATTAATGATAAATATTTTATAAGGTGCGGAAATAAAGTGAGGAATATCTTTTAAATTCAAGTTCAATTCTGAAAGC - -The ID can contain alphanumeric characters in addition to spaces, dots, dashes, and round and square brackets. -Any additional character will be trimmed out. - -The Sequence Bloom Tree identifier must be also specified. It is a string that identify an existing Sequence -Bloom Tree, which should be built with the Create tool of the BloomTree Manager Suite. - -The output of the tool is a list of collections in which every collection contains a file for each ID with a list of -accession numbers representing the samples that express one particular transcript. - ----- - -.. class:: infomark - -**Notes** - -This Galaxy tool has been developed by Fabio Cumbo. - -Please visit this GithHub_repository_ for more information about the BloomTree Manager - -.. _GithHub_repository: https://github.com/fabio-cumbo/bloomtree-manager - ]]></help> - - <expand macro="citations" /> -</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/build.sh Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,7 @@ +#!/bin/bash + +outExpDir=$1 + +cd ${outExpDir} + +howdesbt build --HowDe --tree=union.txt --outtree=howde.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.sh Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,10 @@ +#!/bin/bash + +outExpDir=$1 +bfsize=$2 + +cd ${outExpDir} + +ls *.bf > leafnames.txt +howdesbt cluster --list=leafnames.txt --bits=${bfsize} --tree=union.txt --nodename=node{number} --keepallnodes +#rm leafnames_txt_?
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/create.py Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,233 @@ +#!/usr/bin/env python + +import sys, os, optparse, shutil, glob + +__version__ = "1.0.0" +# in the case of collections, exitcodes equal to 0 and 1 are not considered errors +ERR_EXIT_CODE = 2 +OK_EXIT_CODE = 0 +VALID_CHARS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + +def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ): + print message + with open( logfilepath, 'a+' ) as out: + out.write( message + '\n' ) + if exit: + sys.exit( exitcode ) + +def downloadAccessions(formats, filepaths, outlogfile, outdirpath): + downloaded_files = { } + for dataset_idx in range(0, len(formats)): + if formats[ dataset_idx ] == 'accessions': + print filepaths[ dataset_idx ] + with open( filepaths[ dataset_idx ] ) as accessions: + for line in accessions: + print line + accession = line.split( '\t' )[0].strip() + if accession: + printLog( outlogfile, 'Downloading \"' + accession.upper() + '\" with the fastq-dump tool (part of the sra-tools utility)' ) + fastq_dump_exitcode = os.system( 'fastq-dump --outdir ' + outdirpath + ' --fasta ' + accession.upper() ) + if fastq_dump_exitcode > 0: + printLog( outlogfile, '> FASTA file: FAILED ( \"' + accession.upper() + '\" will be excluded )' ) + else: + #os.rename( os.path.join( outdirpath, accession.upper() + '.fasta' ), os.path.join( outdirpath, accession.upper() + '_fasta' ) ) + printLog( outlogfile, '> FASTA file: \"' + accession.upper() + '.fasta\"' ) + accession_data = { + 'format': '.fasta', + 'filepath': os.path.join( outdirpath, accession.upper() + '.fasta' ), + 'filename': ''.join( c for c in accession.upper() if c in VALID_CHARS ) + } + downloaded_files[ accession.upper() ] = accession_data + return downloaded_files + +# format = { fasta, fastq, accession } +# this version skip the quality control procedure +def createSBT( options, args ): + outlogfile = str( options.outfile ) + outdirpath = str( options.outdir ) + if not outdirpath.endswith('/'): outdirpath += '/' + if not os.path.exists( outdirpath ): + os.mkdir( outdirpath ) + outdirpath = os.path.abspath( outdirpath ) + os.chdir( outdirpath ) + tooldirpath = os.path.abspath( str( options.tooldir ) ) + if not tooldirpath.endswith('/'): tooldirpath += '/' + + formats = [ fo for fo in str( options.formats ).split( '|' ) if fo.strip() ] + filepaths = [ filepath for filepath in str( options.filepaths ).split( '|' ) if filepath.strip() ] + filenames = [ filename for filename in str( options.filenames ).split( '|' ) if filename.strip() ] + compressed = [ True == int(c) for c in str( options.compressed ).split( '|' ) if c.strip() ] + minabundances = [ int(minab) for minab in str( options.minabundances ).split( '|' ) if minab.strip() ] + qualitythresholds = [ float(qthres) for qthres in str( options.qualitythresholds ).split( '|' ) if qthres.strip() ] + + klen = int( options.klen ) + bfsize = int( options.bfsize ) + + if len(formats) == len(filepaths) == len(filenames) == len(compressed) == len(minabundances) == len(qualitythresholds): + printLog( outlogfile, 'Retrieving experiments' ) + accessions = downloadAccessions( formats, filepaths, outlogfile, outdirpath ) + printLog( outlogfile, '> ' + str( len( accessions ) ) + ' experiments retrieved from the Sequence Read Archive' ) + acc_arr = [ a for a in accessions ] + print str( acc_arr ) + if bfsize < 0: # estimate bloom filter size + data_paths = ' '.join( accessions[ accession ][ 'filepath' ] for accession in accessions if 'filepath' in accessions[ accession ] ) + print data_paths + if len( data_paths ) > 0: + data_paths += ' ' + for dataset_idx in range(0, len(formats)): + if formats[ dataset_idx ] != 'accessions': + data_paths += ' '.join( path for path in filepaths[ dataset_idx ].split( ',' ) ) + # ntcard + printLog( outlogfile, 'Estimating the Bloom Filter size with ntcard' ) + if len( data_paths ) > 0: + ntcard_res_filepath = os.path.join( outdirpath, 'freq_k' + str( klen ) + '.hist' ) + ntcard_exitcode = os.system( 'ntcard --kmer=' + str( klen ) + ' ' + data_paths ) + print 'ntcard --kmer=' + str( klen ) + ' ' + data_paths + if ntcard_exitcode > 0: + printLog( outlogfile, '> [exitcode: ' + str(ntcard_exitcode) + '] an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True ) + else: + if os.path.exists( ntcard_res_filepath ): + os.rename( ntcard_res_filepath, os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' ) ) + ntcard_res_filepath = os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' ) + var_F0 = None + var_f1 = None + with open( ntcard_res_filepath ) as ntcard_res: + for line in ntcard_res: + line = line.strip() + if line: + line_split = line.split( '\t' ) + if len(line_split) == 2: + if line_split[0] == 'F0': + var_F0 = int( line_split[1] ) + elif line_split[0] == 'f1': + var_f1 = int( line_split[1] ) + if var_F0 is not None and var_f1 is not None: + break + if var_F0 is not None and var_f1 is not None: + bfsize = var_F0 - var_f1 + printLog( outlogfile, '> estimated Bloom Filter size: ' + str(bfsize) ) + else: + printLog( outlogfile, '> an error has occurred while estimating the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, '> an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, '> unable to estimate the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True ) + + if bfsize > 0: + for dataset_idx in range(0, len(formats)): + if formats[ dataset_idx ] == 'accessions': + with open( filepaths[ dataset_idx ] ) as accessions_file: + for line in accessions_file: + accession = line.split( '\t' )[0].strip().upper() + if accession in accessions: + curr_format = accessions[ accession ][ 'format' ] + curr_compressed = 'uncompress' + curr_filepath = accessions[ accession ][ 'filepath' ] + curr_filename = accessions[ accession ][ 'filename' ] + printLog( outlogfile, 'Processing \"' + accession + '\" ( format=\"' + curr_format + + '\", compressed=\"' + str(False) + '\", fixed_name=\"' + curr_filename + '\" )' ) + print 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1' + makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' + + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1' ) + if makebf_exitcode > 0: + printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + accession + '\" will be excluded )' ) + else: + printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename + '.bf\"' ) + else: + curr_format = '.' + formats[ dataset_idx ].lower() + curr_compressed = '.gz' if compressed[ dataset_idx ] else 'uncompress' + curr_filepaths = filepaths[ dataset_idx ].split( ',' ) + curr_filenames = filenames[ dataset_idx ].split( ',' ) + for curr_idx in range(0, len(curr_formats)): + curr_filename_fixed = ''.join( c for c in curr_filenames[ curr_idx ] if c in VALID_CHARS ) + printLog( outlogfile, 'Processing \"' + curr_filenames[ curr_idx ] + '\" ( format=\"' + curr_format + + '\", compressed=\"' + str(compressed[ dataset_idx ]) + '\", fixed_name=\"' + curr_filename_fixed + '\" )' ) + if compressed[ dataset_idx ]: + makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ dataset_idx ] + ' ' + curr_filename_fixed + ' ' + + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 1' ) + else: + makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ dataset_idx ] + ' ' + curr_filename_fixed + ' ' + + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 0' ) + if makebf_exitcode > 0: + printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + curr_filenames[ curr_idx ] + '\" will be excluded )' ) + else: + printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename_fixed + '.bf\"' ) + # Create a tree topology + printLog( outlogfile, 'Creating a tree topology file' ) + bf_counter = len( glob.glob1( outdirpath, '*.bf' ) ) + if bf_counter > 0: + cluster_exitcode = os.system( 'sh ' + tooldirpath + 'cluster.sh ' + outdirpath + ' ' + str( bfsize ) ) + if cluster_exitcode > 0: + printLog( outlogfile, '> [exitcode: ' + str(cluster_exitcode) + '] an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True ) + else: + # Build the HowDeSBT nodes + if os.path.exists( os.path.join( outdirpath, 'leafnames.txt' ) ): + printLog( outlogfile, 'Building the Bloom Filter files for the tree' ) + build_exitcode = os.system( 'sh ' + tooldirpath + 'build.sh ' + outdirpath ) + if build_exitcode > 0: + printLog( outlogfile, '> [exitcode: ' + str(build_exitcode) + '] an error has occurred during the creation of the Bloom Filter files for the tree', exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, '> the tree has been successfully built: \"howde.txt\"', exitcode=OK_EXIT_CODE, exit=True ) + ''' + howde_filepath = os.path.join( outdirpath, 'howde.txt' ) + howde_galaxy_filepath = os.path.join( outdirpath, 'howde_galaxy.txt' ) + howde_galaxy = open( howde_galaxy_filepath, 'w' ) + with open( howde_filepath ) as howde_file: + for line in howde_file: + line = line.strip() + if line: + # trim stars * and get node name + # find galaxy file path to the node name + # rewrite path with stars + howde_galaxy.close() + ''' + else: + printLog( outlogfile, '> an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, '> no Bloom Filter files found', exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, '> ERROR: the Bloom Filter size is ' + str( bfsize ), exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, 'Something went wrong with the input parameters', exitcode=ERR_EXIT_CODE, exit=True ) + +def __main__(): + # Parse the command line options + usage = ("Usage: create.py --formats file_formats --filepaths file_paths --filenames file_names " + "--compressed file_compressed --minabundance min_abundance --qualitythresholds quality_thresholds " + "--klen kmer_len --bfsize bloom_filter_size --outfile out_log_file_path --outdir out_dir_path") + parser = optparse.OptionParser(usage = usage) + parser.add_option("-v", "--version", action="store_true", dest="version", + default=False, help="display version and exit") + parser.add_option("-f", "--formats", type="string", + action="store", dest="formats", help="list of file formats separated by a tab character") + parser.add_option("-p", "--filepaths", type="string", + action="store", dest="filepaths", help="list of input file paths separated by a tab character") + parser.add_option("-n", "--filenames", type="string", + action="store", dest="filenames", help="list of input file names separated by a tab character") + parser.add_option("-c", "--compressed", type="string", + action="store", dest="compressed", help="list of compressed flags related to the imput files separated by a tab character") + parser.add_option("-m", "--minabundances", type="string", + action="store", dest="minabundances", help="list of blooom filter minimum abundances related to the imput files separated by a tab character") + parser.add_option("-q", "--qualitythresholds", type="string", + action="store", dest="qualitythresholds", help="list of quality thresholds related to the imput files separated by a tab character") + parser.add_option("-k", "--klen", type="int", default=21, + action="store", dest="klen", help="k-mer length") + parser.add_option("-b", "--bfsize", type="int", default=-1, + action="store", dest="bfsize", help="bloom filter size") + parser.add_option("-o", "--outfile", type="string", default="sbtres.txt", + action="store", dest="outfile", help="output log file path") + parser.add_option("-d", "--outdir", type="string", default="sbtres.txt", + action="store", dest="outdir", help="output directory path") + parser.add_option("-t", "--tooldir", type="string", default="./", + action="store", dest="tooldir", help="tool directory path") + + (options, args) = parser.parse_args() + if options.version: + print __version__ + else: + createSBT( options, args ) + +if __name__ == "__main__": __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/create.xml Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,173 @@ +<?xml version="1.0"?> +<tool name="BloomTree Manager - Create" id="btman_create" version="1.0.0"> + <description>a Sequence Bloom Tree</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"> +<![CDATA[ + python '$__tool_directory__/create.py' + + #set formats = '' + #set filepaths = '' + #set filenames = '' + #set compressed = '' + #set minab = '' + #set qthres = '' + #for $i, $exp in enumerate( $experiments ): + #set formats += str( $exp.conditional_format.format ) + '|' + #if $exp.conditional_format.format == 'accessions': + #set filepaths += str( $exp.conditional_format.accession_numbers ) + '|' + #set filenames += str( $exp.conditional_format.accession_numbers.name ) + '|' + #set compressed += '0|' + #else: + #if $exp.conditional_format.format == 'fasta': + #set compressed += str( $exp.conditional_format.conditional_fasta_compressed.fasta_compressed ) + '|' + #if $exp.conditional_format.conditional_fasta_compressed.fasta_compressed == 0: + #set filepaths += ','.join( [ str( $f ) for $f in $exp.conditional_format.conditional_fasta_compressed.fastafiles ] ) + '|' + #set filenames += ','.join( [ str( $f.name ) for $f in $exp.conditional_format.conditional_fasta_compressed.fastafiles ] ) + '|' + #else: + #set filepaths += ','.join( [ str( $f ) for $f in $exp.conditional_format.conditional_fasta_compressed.fastagzfiles ] ) + '|' + #set filenames += ','.join( [ str( $f.name ) for $f in $exp.conditional_format.conditional_fasta_compressed.fastagzfiles ] ) + '|' + #end if + #elif $exp.conditional_format.format == 'fastq': + #set compressed += str( $exp.conditional_format.conditional_fastq_compressed.fastq_compressed ) + '|' + #if $exp.conditional_format.conditional_fastq_compressed.fastq_compressed == 0: + #set filepaths += ','.join( [ str( $f ) for $f in $exp.conditional_format.conditional_fastq_compressed.fastqfiles ] ) + '|' + #set filenames += ','.join( [ str( $f.name ) for $f in $exp.conditional_format.conditional_fastq_compressed.fastqfiles ] ) + '|' + #else: + #set filepaths += ','.join( [ str( $f ) for $f in $exp.conditional_format.conditional_fastq_compressed.fastqgzfiles ] ) + '|' + #set filenames += ','.join( [ str( $f.name ) for $f in $exp.conditional_format.conditional_fastq_compressed.fastqgzfiles ] ) + '|' + #end if + #end if + #end if + #set minab += str( $exp.min_abundance ) + '|' + #if $exp.conditional_quality.quality_control == '1': + #set qthres += str( $exp.conditional_quality.quality_threshold ) + '|' + #else: + #set qthres += '-1.0|' + #end if + #end for + #set klen = $kmer_len + #set bfsize = -1 + #if $bloomsize_condition.bloomsize_control == '0': + #set bfsize = $bloomsize_condition.bloom_filter_size + #end if + + --formats '${formats}' + --filepaths '${filepaths}' + --filenames '${filenames}' + --compressed '${compressed}' + --minabundances '${minab}' + --qualitythresholds '${qthres}' + + --klen ${klen} + --bfsize ${bfsize} + + --outfile '${resulttxt}' + --outdir 'sbt' + --tooldir '$__tool_directory__' +]]> + </command> + <inputs> + <repeat name="experiments" title="Select a list of experiments" help="Select a set of experiments on which the Sequence Bloom Tree will be built." min="1"> + <conditional name="conditional_format"> + <param name="format" type="select" label="Select the experiment format" help="FASTA and FASTQ are the supported formats"> + <option value="fasta">FASTA Experiments</option> + <option value="fastq">FASTQ Experiments</option> + <option value="accessions">SRA Accession Numbers</option> + </param> + <when value="fasta"> + <conditional name="conditional_fasta_compressed"> + <param name="fasta_compressed" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Are your experiments compressed?" /> + <when value="0"> + <param format="fasta" name="fastafiles" multiple="true" type="data" label="Select one or more FASTA experiments" /> + </when> + <when value="1"> + <param format="fastagz" name="fastagzfiles" multiple="true" type="data" label="Select one or more FASTA .gz experiments" /> + </when> + </conditional> + </when> + <when value="fastq"> + <conditional name="conditional_fastq_compressed"> + <param name="fastq_compressed" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Are youe experiments compressed?" /> + <when value="0"> + <param format="fastq" name="fastqfiles" multiple="true" type="data" label="Select one or more FASTQ experiments" /> + </when> + <when value="1"> + <param format="fastqgz" name="fastqgzfiles" multiple="true" type="data" label="Select one or more FASTQ .gz experiments" /> + </when> + </conditional> + </when> + <when value="accessions"> + <param name="accession_numbers" type="data" format="tabular" label="Select a list of SRA Accession Numbers" help="Select a tabular file with a list of accession numbers in the first column." /> + </when> + </conditional> + + <param name="min_abundance" type="integer" value="2" min="0" label="Insert a Bloom filter minimum abundance" help="This value is the minimum abundance cutoff for the creation of the Bloom filter." /> + + <conditional name="conditional_quality"> + <param name="quality_control" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Apply a quality control procedure" /> + <when value="1"> + <param name="quality_threshold" size="1" type="float" value="0.8" min="0.0" max="1.0" label="Quality threshold" help="If the number of sequences flagged as poor quality on the total number of sequences in a file is less than this threshold, the whole experiment will be excluded." /> + </when> + </conditional> + </repeat> + + <param name="kmer_len" type="integer" value="21" min="0" label="K-mer length" /> + + <conditional name="bloomsize_condition"> + <param name="bloomsize_control" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Automatically estimate the Bloom filter size" /> + <when value="0"> + <param name="bloom_filter_size" size="1" type="integer" value="1" min="1" label="Bloom Filter size" help="Disable this field to let the tool estimate an appropriate Bloom filter size." /> + </when> + </conditional> + </inputs> + <outputs> + <collection name="list_output" type="list" label="${tool.name} SBT Collection"> + <discover_datasets pattern="(?P<identifier_0>.*(?=\.)).(?P<ext>[^\.]*$)" ext="auto" directory="sbt" /> + </collection> + <data format="txt" name="resulttxt" label="${tool.name} SBT: Result" from_work_dir="sbtres.txt" /> + </outputs> + + <help><![CDATA[ +This tool allows to create Sequence Bloom Trees starting from a set of FASTA or FASTQ files. +It also allows to control the quality of the input dataset and exclude the files that do not reach a specified quality level. + +----- + +**Input file** + +The input of this tool is a set of FASTA or FASTQ experiments, additionally to a set of SRA accession numbers. +For each of the selected experiments, the minimum abundance for the corresponding Bloom filter is required. +Additionally, a quality control procedure could be applied to guarantee that the quality of every experiment always exceed a +specified treshold. Otherwise, experiments with low quality level will be discarded. + +The k-mer length must also be specified, additionally to the Bloom filter size. This last field is optional and it will be +automatically estimated if not provided. + +----- + +**Output** + +This tool returns a collection containing the Sequence Bloom Tree nodes and a file representing the organization of the tree. + +Take a look at the Query tool documentation for a detailed description about how +to query a Sequence Bloom Tree. + +----- + +.. class:: infomark + +**Notes** + +This Galaxy tool has been developed by Fabio Cumbo. + +Please visit this GithHub_repository_ for more information about the BloomTree Manager + +.. _GithHub_repository: https://github.com/fabio-cumbo/bloomtree-manager + ]]></help> + + <expand macro="citations" /> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dataset.tsv Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,11 @@ +SRR833714 +SRR833713 +SRR833715 +SRR567161 +SRR567146 +SRR191393 +SRR191449 +SRR191448 +SRR191447 +SRR191446 +SRR191445
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,16 @@ +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="2.7.10">python</requirement> + <requirement type="package" version="2.9.1">sra-tools</requirement> + <requirement type="package" version="1.0.0">ntcard</requirement> + <requirement type="package" version="1.00.00">howdesbt</requirement> + </requirements> + </xml> + + <xml name="citations"> + <citations> + <citation type="doi">10.1101/090464</citation> + </citations> + </xml> +</macros> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/makebf.sh Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,33 @@ +#!/bin/bash + +expPath=$1 +expName=$2 +expFormat=$3 +expCompress=$4 + +outExpDir=$5 + +klen=$6 +minab=$7 +bfsize=$8 + +rmCompressed=$9 +rmSource=${10} + +cd ${outExpDir} + +if [ "${expCompress}" == ".gz" ]; then + gzip -dc ${expPath} > ${expName}${expFormat} + howdesbt makebf K=${klen} --min=${minab} --bits=${bfsize} ${expName}${expFormat} --out=${expName}.bf + if [ "${rmCompressed}" -eq "1" ]; then + rm ${expPath} + fi + if [ "${rmSource}" -eq "1" ]; then + rm ${expName}${expFormat} + fi +else + howdesbt makebf K=${klen} --min=${minab} --bits=${bfsize} ${expPath} --out=${expName}.bf + if [ "${rmSource}" -eq "1" ]; then + rm ${expPath} + fi +fi
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query.py Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,148 @@ +#!/usr/bin/env python + +import sys, os, optparse, shutil + +__version__ = "1.0.0" +VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' +# in the case of collections, exitcodes equal to 0 and 1 are not considered errors +ERR_EXIT_CODE = 2 +OK_EXIT_CODE = 0 + +def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ): + print message + with open( logfilepath, 'a+' ) as out: + out.write( message + '\n' ) + if exit: + sys.exit( exitcode ) + +def querySBT( options, args ): + output_dir_path = options.outputdir + outlogfile = options.outfile + + tree_file_paths = options.treep.split( ',' ) + tree_file_names = options.treen.split( ',' ) + tree_def_filepath = None + leafnames_filepath = None + for idx, tree_file_name in enumerate( tree_file_names ): + if tree_file_name == 'howde': + tree_def_filepath = tree_file_paths[ idx ] + elif tree_file_name == 'leafnames': + leafnames_filepath = tree_file_paths[ idx ] + if tree_def_filepath is not None and leafnames_filepath is not None: + break + + if tree_def_filepath is not None and leafnames_filepath is not None: + leafnames_counter = 0 + with open( leafnames_filepath ) as leafnames_file: + for line in leafnames_file: + if line.strip(): + leafnames_counter += 1 + if leafnames_counter > 0: + printLog( outlogfile, 'The selected collection contains a valid tree' ) + shutil.copyfile( tree_def_filepath, 'howde.txt' ) + tree_def_filepath = 'howde.txt' + for idx, tree_file_name in enumerate( tree_file_names ): + if tree_file_name.endswith( 'detbrief.rrr' ): + shutil.copyfile( tree_file_paths[ idx ], tree_file_name + '.bf' ) + + printLog( outlogfile, 'Creating batch of queries' ) + # create tmp batch file + batch_file_name = 'queries.fa' + batch_file = open( batch_file_name, 'w' ) + + comma_sep_file_paths = options.files + # check if options.files contains at least one file path + if comma_sep_file_paths is not None: + # split file paths + file_paths = comma_sep_file_paths.split(",") + # split file names + file_names = options.names.split(",") + for idx, file_path in enumerate(file_paths): + fixed_file_name = ''.join( c for c in file_names[ idx ] if c in VALID_CHARS ) + printLog( outlogfile, '> processing file ' + file_names[ idx ] + ' ( fixed_name=\"' + fixed_file_name + '\" ) ' ) + with open(file_path, 'r') as content_file: + for line in content_file: + line = line.strip() + if line: + line_split = line.strip().split("\t") # split on tab + if len(line_split) == 2: # 0:id , 1:seq , otherwise skip line + original_seq_id = line_split[0] + # fix seq_id using valid chars only + seq_id = ''.join( c for c in original_seq_id if c in VALID_CHARS ) + printLog( outlogfile, '> sequence ' + original_seq_id + ' ( fixed_name=\"' + seq_id + '\" )' ) + seq_text = line_split[1] + + # write on batch file + batch_file.write( '> ' + fixed_file_name + '_' + seq_id + '\n' + seq_text + '\n' ) + batch_file.close() + # query the tree + printLog( outlogfile, 'Querying the tree' ) + query_res_file_path = os.path.abspath( 'answer.txt' ) + sort_param = '--sort' + if options.sort == 0: + sort_param = '' + query_exitcode = os.system( 'howdesbt query --tree=' + os.path.abspath( tree_def_filepath ) + ' ' + os.path.abspath( batch_file_name ) + '=' + str(options.threshold) + ' --out=' + query_res_file_path ) + ' ' + sort_param + if query_exitcode > 0: + printLog( outlogfile, '> ERROR: an error has occurred while querying the tree with the sequence [id: ' + seq_id + '] in input file ' + file_names[ idx ] ) + else: + if os.path.exists( query_res_file_path ): + with open( query_res_file_path ) as query_res_file: + file_path = '' + theta_matches = 0 + for line in query_res_file: + line = line.strip() + if line: + if line.startswith( '*' ): + line_split = line.split( ' ' ) + theta_matches = int( line_split[ 1 ] ) + file_name = line_split[ 0 ].replace( '*', '' ) + file_path = os.path.join( output_dir_path, file_name + '_txt' ) + open( file_path, 'a' ).close() + else: + res_file = open( file_path, 'a+' ) + fraction = str( theta_matches ) + '/' + str( leafnames_counter ) + score = format( round( float( theta_matches ) / float( leafnames_counter ) , 6 ), '6f' ) + res_file.write( line + '\t' + fraction + '\t' + score + '\n' ) + res_file.close() + else: + printLog( outlogfile, 'An error has occurred while querying the tree', exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True ) + +def __main__(): + # Parse the command line options + usage = "Usage: query.py --files comma_sep_file_paths --names comma_seq_file_names --sequences sequences_text --search search_mode --exact exact_alg --sthreshold threshold --outputdir output_dir_path" + parser = optparse.OptionParser(usage = usage) + parser.add_option("-v", "--version", action="store_true", dest="version", + default=False, help="display version and exit") + parser.add_option("-f", "--files", type="string", + action="store", dest="files", help="comma separated files path") + parser.add_option("-n", "--names", type="string", + action="store", dest="names", help="comma separated names associated to the files specified in --files") + parser.add_option("-k", "--treep", type="string", + action="store", dest="treep", help="paths of files in collection") + parser.add_option("-m", "--treen", type="string", + action="store", dest="treen", help="names of files in collection") + parser.add_option("-t", "--threshold", type="float", default=0.7, + action="store", dest="threshold", help="search threshold") + parser.add_option("-s", "--sort", type="int", default=1, + action="store", dest="sort", help="sort results") + parser.add_option("-o", "--outputdir", type="string", default="output", + action="store", dest="outputdir", help="output directory (collection) path") + parser.add_option("-r", "--outfile", type="string", default="query.txt", + action="store", dest="outfile", help="output log file path") + + (options, args) = parser.parse_args() + if options.version: + print __version__ + else: + # create output dir (collection) + output_dir_path = options.outputdir + if not os.path.exists(output_dir_path): + os.makedirs(output_dir_path) + + querySBT( options, args ) + +if __name__ == "__main__": __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query.tsv Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,3 @@ +0 CCAACCAAAGGGAAAACTTTTTTCCGACTTTGGCCTAAAGGGTTTAACGGCCAAGTCAGAAGGGAAAAAGTTGCGCCAAAAATGGCGTTAAAATGTGTAATCAGAGAAGCGACACGAAAAGGGGATCAGCTCTTGGCTGGCAATTGGTAGGTCAGAGGTGGATTGGGAAAAGGCAAGTCAGCAACTGTCGATGACGGCGACTGACTGTTAATGAAAATTGTTTTGGCTGTGTGGAAAAAAATACGCGGGAATCCGTGAATTTTCCGAGGAGCTGGTGGAGCGAAGAAAACGGGGTGCTGCTGTTGTAAATGATTGGTGAAAGTCACACGCCCGCAGCCTTGCCAAACTAATTAACGCCAAATGGAGCTAAGGCCTTTGAATGATGGCTGCAGGCTAGCTTATGAAAAGGGGTTGAAGAGAAGTGGAAAAATTGGTAGAAAGGGATTTGCTCAAGATGCC +1 TTAATGACAGGGCCACATGATGTGAAAAAAAATCAGAAACCGAGTCAACGTGAGAAGATAGTACGTACTACCGCAAATGAATGGCCATTTCATTTGCATGTTGGGAGCAACAGAAATGAGAGAGCATCCGAAGCTAACCACAAAAATGGACTTTGCTTCATTATGCACAAACACGCCAATAAATGTAACGAGAAAGATAGTAGGAGCGAAAGACGAGACGAGACAAACAGGAAGAAGACGAGTGGACGAGTGTTTTTTGTAACGAAACTCTTAATCGCTCCTTTGCAGGCTTAAGCTGATAGTTGCTACGTTTATGCCATGAATTTCAAGATCTCTCAAATGCGTGAAAATCCAGTTTATGCGACAGACAAATTCATGTATTTGAAAAATCTTAGCTGATAGAAATCAAAGGTGATT +2 CAATTAATGATAAATATTTTATAAGGTGCGGAAATAAAGTGAGGAATATCTTTTAAATTCAAGTTCAATTCTGAAAGC \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query.xml Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,83 @@ +<?xml version="1.0"?> +<tool name="BloomTree Manager - Query" id="btman_query" version="1.0.0"> + <description>a Sequence Bloom Tree</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"> +<![CDATA[ + python '$__tool_directory__/query.py' + + #set coll_paths = ','.join( [ str( $f ) for $f in $treecollection ] ) + #if $coll_paths is not 'None': + --treep '${coll_paths}' + #set coll_names = ','.join( [ str( $f.name ) for $f in $treecollection ] ) + --treen '${coll_names}' + #end if + + #set file_paths = ','.join( [ str( $f ) for $f in $txtfiles ] ) + #if $file_paths is not 'None': + --files '${file_paths}' + #set file_names = ','.join( [ str( $f.name ) for $f in $txtfiles ] ) + --names '${file_names}' + #end if + + --threshold ${threshold} + --sort ${sort} + + --outputdir 'answer_collection' + --outfile '${resulttxt}' +]]> + </command> + <inputs> + <param name="treecollection" type="data_collection" collection_type="list" label="Select a Sequence Bloom Tree" help="Select a collection generated by the Create tool of the BloomTree Manager suite." /> + <param format="tabular" name="txtfiles" type="data" label="Select query files" multiple="true" optional="false" help="Select one or more tabular files containing (ID, TRANSCRIPT) couples for each line. The content of these files will be merged and the result will represent a query to the Sequence Bloom Tree that will return a collection containing a file for each ID. The content of these files as result of the tool will be a list of accession numbers." /> + <param name="threshold" size="1" type="float" value="0.7" min="0.0" max="1.0" label="Search threshold" help="Fraction of query kmers that must be present in a leaf to be considered a match." /> + <param name="sort" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Sort matched leaves by the number of query kmers present, and report the number of kmers present (if not selected we just report the matched leaves without regard to which matches are better)." /> + </inputs> + <outputs> + <collection name="output_collect" type="list:list" label="BloomTree Manager - Query result collection"> + <discover_datasets pattern="(?P<identifier_0>[^_]+)_(?P<identifier_1>[^_]+)_(?P<ext>[^_]+)" directory="answer_collection" ext="auto" /> + </collection> + <data format="txt" name="resulttxt" label="${tool.name} SBT: Result" from_work_dir="query.txt" /> + </outputs> + + <help><![CDATA[ +This tool is part of the BloomTree Manager Framework that allow to rapidly identify all +sequenced samples which express a transcript of interest. + +---- + +The input for this tool is a list of (ID, TRANSCRIPT) couples, one for each line, +in a tab delimited format:: + + id0 CCAACCAAAGGGAAAACTTTTTTCCGACTTTGGCCTAAAGGGTTTAACGGCCAAGTCAGAAGGGAAAAAGTTGCGCCA + id1 TTAATGACAGGGCCACATGATGTGAAAAAAAATCAGAAACCGAGTCAACGTGAGAAGATAGTACGTACTACCGCAAAT + ... + idn CAATTAATGATAAATATTTTATAAGGTGCGGAAATAAAGTGAGGAATATCTTTTAAATTCAAGTTCAATTCTGAAAGC + +The ID can contain alphanumeric characters in addition to spaces, dots, dashes, and round and square brackets. +Any additional character will be trimmed out. + +The Sequence Bloom Tree identifier must be also specified. It is a string that identify an existing Sequence +Bloom Tree, which should be built with the Create tool of the BloomTree Manager Suite. + +The output of the tool is a list of collections in which every collection contains a file for each ID with a list of +accession numbers representing the samples that express one particular transcript. + +---- + +.. class:: infomark + +**Notes** + +This Galaxy tool has been developed by Fabio Cumbo. + +Please visit this GithHub_repository_ for more information about the BloomTree Manager + +.. _GithHub_repository: https://github.com/fabio-cumbo/bloomtree-manager + ]]></help> + + <expand macro="citations" /> +</tool>