# HG changeset patch
# User fabio
# Date 1551706288 18000
# Node ID 7f712cc0d3d5dcb246a3c2b98f4423ca5c922627
# Parent be864d79c9c731ce3277eaa7935cf89762bbfe32
Uploaded 20190304.2
diff -r be864d79c9c7 -r 7f712cc0d3d5 .shed.yml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml Mon Mar 04 08:31:28 2019 -0500
@@ -0,0 +1,21 @@
+name: btman
+owner: iuc
+categories:
+ - Data Source
+ - Web Services
+description: BloomTree Manager
+long_description: |
+ A suite of tools to fast create and query Sequence Bloom Trees
+ supporting determined/how split filters
+remote_repository_url: https://github.com/fabio-cumbo/bloomtree-manager
+homepage_url: https://github.com/fabio-cumbo/bloomtree-manager
+type: unrestricted
+auto_tool_repositories:
+ name_template: "{{ tool_id }}"
+ descriptor_template: "Wrapper for BloomTree Manager: {{ tool_name }}."
+suite:
+ name: "btman_suite"
+ description: "A suite of Galaxy tools designed to work with the BloomTree Manager."
+ long_description: |
+ A suite of tools to fast create and query Sequence Bloom Trees
+ supporting determined/how split filters
diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/.shed.yml
--- a/btman-1.0.0/.shed.yml Mon Mar 04 08:30:03 2019 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,21 +0,0 @@
-name: btman
-owner: iuc
-categories:
- - Data Source
- - Web Services
-description: BloomTree Manager
-long_description: |
- A suite of tools to fast create and query Sequence Bloom Trees
- supporting determined/how split filters
-remote_repository_url: https://github.com/fabio-cumbo/bloomtree-manager
-homepage_url: https://github.com/fabio-cumbo/bloomtree-manager
-type: unrestricted
-auto_tool_repositories:
- name_template: "{{ tool_id }}"
- descriptor_template: "Wrapper for BloomTree Manager: {{ tool_name }}."
-suite:
- name: "btman_suite"
- description: "A suite of Galaxy tools designed to work with the BloomTree Manager."
- long_description: |
- A suite of tools to fast create and query Sequence Bloom Trees
- supporting determined/how split filters
diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/build.sh
--- a/btman-1.0.0/build.sh Mon Mar 04 08:30:03 2019 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-outExpDir=$1
-
-cd ${outExpDir}
-
-howdesbt build --HowDe --tree=union.txt --outtree=howde.txt
diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/cluster.sh
--- a/btman-1.0.0/cluster.sh Mon Mar 04 08:30:03 2019 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-outExpDir=$1
-bfsize=$2
-
-cd ${outExpDir}
-
-ls *.bf > leafnames.txt
-howdesbt cluster --list=leafnames.txt --bits=${bfsize} --tree=union.txt --nodename=node{number} --keepallnodes
-#rm leafnames_txt_?
diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/create.py
--- a/btman-1.0.0/create.py Mon Mar 04 08:30:03 2019 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,233 +0,0 @@
-#!/usr/bin/env python
-
-import sys, os, optparse, shutil, glob
-
-__version__ = "1.0.0"
-# in the case of collections, exitcodes equal to 0 and 1 are not considered errors
-ERR_EXIT_CODE = 2
-OK_EXIT_CODE = 0
-VALID_CHARS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
-
-def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ):
- print message
- with open( logfilepath, 'a+' ) as out:
- out.write( message + '\n' )
- if exit:
- sys.exit( exitcode )
-
-def downloadAccessions(formats, filepaths, outlogfile, outdirpath):
- downloaded_files = { }
- for dataset_idx in range(0, len(formats)):
- if formats[ dataset_idx ] == 'accessions':
- print filepaths[ dataset_idx ]
- with open( filepaths[ dataset_idx ] ) as accessions:
- for line in accessions:
- print line
- accession = line.split( '\t' )[0].strip()
- if accession:
- printLog( outlogfile, 'Downloading \"' + accession.upper() + '\" with the fastq-dump tool (part of the sra-tools utility)' )
- fastq_dump_exitcode = os.system( 'fastq-dump --outdir ' + outdirpath + ' --fasta ' + accession.upper() )
- if fastq_dump_exitcode > 0:
- printLog( outlogfile, '> FASTA file: FAILED ( \"' + accession.upper() + '\" will be excluded )' )
- else:
- #os.rename( os.path.join( outdirpath, accession.upper() + '.fasta' ), os.path.join( outdirpath, accession.upper() + '_fasta' ) )
- printLog( outlogfile, '> FASTA file: \"' + accession.upper() + '.fasta\"' )
- accession_data = {
- 'format': '.fasta',
- 'filepath': os.path.join( outdirpath, accession.upper() + '.fasta' ),
- 'filename': ''.join( c for c in accession.upper() if c in VALID_CHARS )
- }
- downloaded_files[ accession.upper() ] = accession_data
- return downloaded_files
-
-# format = { fasta, fastq, accession }
-# this version skip the quality control procedure
-def createSBT( options, args ):
- outlogfile = str( options.outfile )
- outdirpath = str( options.outdir )
- if not outdirpath.endswith('/'): outdirpath += '/'
- if not os.path.exists( outdirpath ):
- os.mkdir( outdirpath )
- outdirpath = os.path.abspath( outdirpath )
- os.chdir( outdirpath )
- tooldirpath = os.path.abspath( str( options.tooldir ) )
- if not tooldirpath.endswith('/'): tooldirpath += '/'
-
- formats = [ fo for fo in str( options.formats ).split( '|' ) if fo.strip() ]
- filepaths = [ filepath for filepath in str( options.filepaths ).split( '|' ) if filepath.strip() ]
- filenames = [ filename for filename in str( options.filenames ).split( '|' ) if filename.strip() ]
- compressed = [ True == int(c) for c in str( options.compressed ).split( '|' ) if c.strip() ]
- minabundances = [ int(minab) for minab in str( options.minabundances ).split( '|' ) if minab.strip() ]
- qualitythresholds = [ float(qthres) for qthres in str( options.qualitythresholds ).split( '|' ) if qthres.strip() ]
-
- klen = int( options.klen )
- bfsize = int( options.bfsize )
-
- if len(formats) == len(filepaths) == len(filenames) == len(compressed) == len(minabundances) == len(qualitythresholds):
- printLog( outlogfile, 'Retrieving experiments' )
- accessions = downloadAccessions( formats, filepaths, outlogfile, outdirpath )
- printLog( outlogfile, '> ' + str( len( accessions ) ) + ' experiments retrieved from the Sequence Read Archive' )
- acc_arr = [ a for a in accessions ]
- print str( acc_arr )
- if bfsize < 0: # estimate bloom filter size
- data_paths = ' '.join( accessions[ accession ][ 'filepath' ] for accession in accessions if 'filepath' in accessions[ accession ] )
- print data_paths
- if len( data_paths ) > 0:
- data_paths += ' '
- for dataset_idx in range(0, len(formats)):
- if formats[ dataset_idx ] != 'accessions':
- data_paths += ' '.join( path for path in filepaths[ dataset_idx ].split( ',' ) )
- # ntcard
- printLog( outlogfile, 'Estimating the Bloom Filter size with ntcard' )
- if len( data_paths ) > 0:
- ntcard_res_filepath = os.path.join( outdirpath, 'freq_k' + str( klen ) + '.hist' )
- ntcard_exitcode = os.system( 'ntcard --kmer=' + str( klen ) + ' ' + data_paths )
- print 'ntcard --kmer=' + str( klen ) + ' ' + data_paths
- if ntcard_exitcode > 0:
- printLog( outlogfile, '> [exitcode: ' + str(ntcard_exitcode) + '] an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True )
- else:
- if os.path.exists( ntcard_res_filepath ):
- os.rename( ntcard_res_filepath, os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' ) )
- ntcard_res_filepath = os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' )
- var_F0 = None
- var_f1 = None
- with open( ntcard_res_filepath ) as ntcard_res:
- for line in ntcard_res:
- line = line.strip()
- if line:
- line_split = line.split( '\t' )
- if len(line_split) == 2:
- if line_split[0] == 'F0':
- var_F0 = int( line_split[1] )
- elif line_split[0] == 'f1':
- var_f1 = int( line_split[1] )
- if var_F0 is not None and var_f1 is not None:
- break
- if var_F0 is not None and var_f1 is not None:
- bfsize = var_F0 - var_f1
- printLog( outlogfile, '> estimated Bloom Filter size: ' + str(bfsize) )
- else:
- printLog( outlogfile, '> an error has occurred while estimating the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True )
- else:
- printLog( outlogfile, '> an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True )
- else:
- printLog( outlogfile, '> unable to estimate the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True )
-
- if bfsize > 0:
- for dataset_idx in range(0, len(formats)):
- if formats[ dataset_idx ] == 'accessions':
- with open( filepaths[ dataset_idx ] ) as accessions_file:
- for line in accessions_file:
- accession = line.split( '\t' )[0].strip().upper()
- if accession in accessions:
- curr_format = accessions[ accession ][ 'format' ]
- curr_compressed = 'uncompress'
- curr_filepath = accessions[ accession ][ 'filepath' ]
- curr_filename = accessions[ accession ][ 'filename' ]
- printLog( outlogfile, 'Processing \"' + accession + '\" ( format=\"' + curr_format +
- '\", compressed=\"' + str(False) + '\", fixed_name=\"' + curr_filename + '\" )' )
- print 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1'
- makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' +
- curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' +
- str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1' )
- if makebf_exitcode > 0:
- printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + accession + '\" will be excluded )' )
- else:
- printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename + '.bf\"' )
- else:
- curr_format = '.' + formats[ dataset_idx ].lower()
- curr_compressed = '.gz' if compressed[ dataset_idx ] else 'uncompress'
- curr_filepaths = filepaths[ dataset_idx ].split( ',' )
- curr_filenames = filenames[ dataset_idx ].split( ',' )
- for curr_idx in range(0, len(curr_formats)):
- curr_filename_fixed = ''.join( c for c in curr_filenames[ curr_idx ] if c in VALID_CHARS )
- printLog( outlogfile, 'Processing \"' + curr_filenames[ curr_idx ] + '\" ( format=\"' + curr_format +
- '\", compressed=\"' + str(compressed[ dataset_idx ]) + '\", fixed_name=\"' + curr_filename_fixed + '\" )' )
- if compressed[ dataset_idx ]:
- makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ dataset_idx ] + ' ' + curr_filename_fixed + ' ' +
- curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' +
- str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 1' )
- else:
- makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ dataset_idx ] + ' ' + curr_filename_fixed + ' ' +
- curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' +
- str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 0' )
- if makebf_exitcode > 0:
- printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + curr_filenames[ curr_idx ] + '\" will be excluded )' )
- else:
- printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename_fixed + '.bf\"' )
- # Create a tree topology
- printLog( outlogfile, 'Creating a tree topology file' )
- bf_counter = len( glob.glob1( outdirpath, '*.bf' ) )
- if bf_counter > 0:
- cluster_exitcode = os.system( 'sh ' + tooldirpath + 'cluster.sh ' + outdirpath + ' ' + str( bfsize ) )
- if cluster_exitcode > 0:
- printLog( outlogfile, '> [exitcode: ' + str(cluster_exitcode) + '] an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True )
- else:
- # Build the HowDeSBT nodes
- if os.path.exists( os.path.join( outdirpath, 'leafnames.txt' ) ):
- printLog( outlogfile, 'Building the Bloom Filter files for the tree' )
- build_exitcode = os.system( 'sh ' + tooldirpath + 'build.sh ' + outdirpath )
- if build_exitcode > 0:
- printLog( outlogfile, '> [exitcode: ' + str(build_exitcode) + '] an error has occurred during the creation of the Bloom Filter files for the tree', exitcode=ERR_EXIT_CODE, exit=True )
- else:
- printLog( outlogfile, '> the tree has been successfully built: \"howde.txt\"', exitcode=OK_EXIT_CODE, exit=True )
- '''
- howde_filepath = os.path.join( outdirpath, 'howde.txt' )
- howde_galaxy_filepath = os.path.join( outdirpath, 'howde_galaxy.txt' )
- howde_galaxy = open( howde_galaxy_filepath, 'w' )
- with open( howde_filepath ) as howde_file:
- for line in howde_file:
- line = line.strip()
- if line:
- # trim stars * and get node name
- # find galaxy file path to the node name
- # rewrite path with stars
- howde_galaxy.close()
- '''
- else:
- printLog( outlogfile, '> an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True )
- else:
- printLog( outlogfile, '> no Bloom Filter files found', exitcode=ERR_EXIT_CODE, exit=True )
- else:
- printLog( outlogfile, '> ERROR: the Bloom Filter size is ' + str( bfsize ), exitcode=ERR_EXIT_CODE, exit=True )
- else:
- printLog( outlogfile, 'Something went wrong with the input parameters', exitcode=ERR_EXIT_CODE, exit=True )
-
-def __main__():
- # Parse the command line options
- usage = ("Usage: create.py --formats file_formats --filepaths file_paths --filenames file_names "
- "--compressed file_compressed --minabundance min_abundance --qualitythresholds quality_thresholds "
- "--klen kmer_len --bfsize bloom_filter_size --outfile out_log_file_path --outdir out_dir_path")
- parser = optparse.OptionParser(usage = usage)
- parser.add_option("-v", "--version", action="store_true", dest="version",
- default=False, help="display version and exit")
- parser.add_option("-f", "--formats", type="string",
- action="store", dest="formats", help="list of file formats separated by a tab character")
- parser.add_option("-p", "--filepaths", type="string",
- action="store", dest="filepaths", help="list of input file paths separated by a tab character")
- parser.add_option("-n", "--filenames", type="string",
- action="store", dest="filenames", help="list of input file names separated by a tab character")
- parser.add_option("-c", "--compressed", type="string",
- action="store", dest="compressed", help="list of compressed flags related to the imput files separated by a tab character")
- parser.add_option("-m", "--minabundances", type="string",
- action="store", dest="minabundances", help="list of blooom filter minimum abundances related to the imput files separated by a tab character")
- parser.add_option("-q", "--qualitythresholds", type="string",
- action="store", dest="qualitythresholds", help="list of quality thresholds related to the imput files separated by a tab character")
- parser.add_option("-k", "--klen", type="int", default=21,
- action="store", dest="klen", help="k-mer length")
- parser.add_option("-b", "--bfsize", type="int", default=-1,
- action="store", dest="bfsize", help="bloom filter size")
- parser.add_option("-o", "--outfile", type="string", default="sbtres.txt",
- action="store", dest="outfile", help="output log file path")
- parser.add_option("-d", "--outdir", type="string", default="sbtres.txt",
- action="store", dest="outdir", help="output directory path")
- parser.add_option("-t", "--tooldir", type="string", default="./",
- action="store", dest="tooldir", help="tool directory path")
-
- (options, args) = parser.parse_args()
- if options.version:
- print __version__
- else:
- createSBT( options, args )
-
-if __name__ == "__main__": __main__()
diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/create.xml
--- a/btman-1.0.0/create.xml Mon Mar 04 08:30:03 2019 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,173 +0,0 @@
-
-
- a Sequence Bloom Tree
-
- macros.xml
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/dataset.tsv
--- a/btman-1.0.0/dataset.tsv Mon Mar 04 08:30:03 2019 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,11 +0,0 @@
-SRR833714
-SRR833713
-SRR833715
-SRR567161
-SRR567146
-SRR191393
-SRR191449
-SRR191448
-SRR191447
-SRR191446
-SRR191445
diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/macros.xml
--- a/btman-1.0.0/macros.xml Mon Mar 04 08:30:03 2019 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,16 +0,0 @@
-
-
-
- python
- sra-tools
- ntcard
- howdesbt
-
-
-
-
-
- 10.1101/090464
-
-
-
\ No newline at end of file
diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/makebf.sh
--- a/btman-1.0.0/makebf.sh Mon Mar 04 08:30:03 2019 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-expPath=$1
-expName=$2
-expFormat=$3
-expCompress=$4
-
-outExpDir=$5
-
-klen=$6
-minab=$7
-bfsize=$8
-
-rmCompressed=$9
-rmSource=${10}
-
-cd ${outExpDir}
-
-if [ "${expCompress}" == ".gz" ]; then
- gzip -dc ${expPath} > ${expName}${expFormat}
- howdesbt makebf K=${klen} --min=${minab} --bits=${bfsize} ${expName}${expFormat} --out=${expName}.bf
- if [ "${rmCompressed}" -eq "1" ]; then
- rm ${expPath}
- fi
- if [ "${rmSource}" -eq "1" ]; then
- rm ${expName}${expFormat}
- fi
-else
- howdesbt makebf K=${klen} --min=${minab} --bits=${bfsize} ${expPath} --out=${expName}.bf
- if [ "${rmSource}" -eq "1" ]; then
- rm ${expPath}
- fi
-fi
diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/query.py
--- a/btman-1.0.0/query.py Mon Mar 04 08:30:03 2019 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,148 +0,0 @@
-#!/usr/bin/env python
-
-import sys, os, optparse, shutil
-
-__version__ = "1.0.0"
-VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
-# in the case of collections, exitcodes equal to 0 and 1 are not considered errors
-ERR_EXIT_CODE = 2
-OK_EXIT_CODE = 0
-
-def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ):
- print message
- with open( logfilepath, 'a+' ) as out:
- out.write( message + '\n' )
- if exit:
- sys.exit( exitcode )
-
-def querySBT( options, args ):
- output_dir_path = options.outputdir
- outlogfile = options.outfile
-
- tree_file_paths = options.treep.split( ',' )
- tree_file_names = options.treen.split( ',' )
- tree_def_filepath = None
- leafnames_filepath = None
- for idx, tree_file_name in enumerate( tree_file_names ):
- if tree_file_name == 'howde':
- tree_def_filepath = tree_file_paths[ idx ]
- elif tree_file_name == 'leafnames':
- leafnames_filepath = tree_file_paths[ idx ]
- if tree_def_filepath is not None and leafnames_filepath is not None:
- break
-
- if tree_def_filepath is not None and leafnames_filepath is not None:
- leafnames_counter = 0
- with open( leafnames_filepath ) as leafnames_file:
- for line in leafnames_file:
- if line.strip():
- leafnames_counter += 1
- if leafnames_counter > 0:
- printLog( outlogfile, 'The selected collection contains a valid tree' )
- shutil.copyfile( tree_def_filepath, 'howde.txt' )
- tree_def_filepath = 'howde.txt'
- for idx, tree_file_name in enumerate( tree_file_names ):
- if tree_file_name.endswith( 'detbrief.rrr' ):
- shutil.copyfile( tree_file_paths[ idx ], tree_file_name + '.bf' )
-
- printLog( outlogfile, 'Creating batch of queries' )
- # create tmp batch file
- batch_file_name = 'queries.fa'
- batch_file = open( batch_file_name, 'w' )
-
- comma_sep_file_paths = options.files
- # check if options.files contains at least one file path
- if comma_sep_file_paths is not None:
- # split file paths
- file_paths = comma_sep_file_paths.split(",")
- # split file names
- file_names = options.names.split(",")
- for idx, file_path in enumerate(file_paths):
- fixed_file_name = ''.join( c for c in file_names[ idx ] if c in VALID_CHARS )
- printLog( outlogfile, '> processing file ' + file_names[ idx ] + ' ( fixed_name=\"' + fixed_file_name + '\" ) ' )
- with open(file_path, 'r') as content_file:
- for line in content_file:
- line = line.strip()
- if line:
- line_split = line.strip().split("\t") # split on tab
- if len(line_split) == 2: # 0:id , 1:seq , otherwise skip line
- original_seq_id = line_split[0]
- # fix seq_id using valid chars only
- seq_id = ''.join( c for c in original_seq_id if c in VALID_CHARS )
- printLog( outlogfile, '> sequence ' + original_seq_id + ' ( fixed_name=\"' + seq_id + '\" )' )
- seq_text = line_split[1]
-
- # write on batch file
- batch_file.write( '> ' + fixed_file_name + '_' + seq_id + '\n' + seq_text + '\n' )
- batch_file.close()
- # query the tree
- printLog( outlogfile, 'Querying the tree' )
- query_res_file_path = os.path.abspath( 'answer.txt' )
- sort_param = '--sort'
- if options.sort == 0:
- sort_param = ''
- query_exitcode = os.system( 'howdesbt query --tree=' + os.path.abspath( tree_def_filepath ) + ' ' + os.path.abspath( batch_file_name ) + '=' + str(options.threshold) + ' --out=' + query_res_file_path ) + ' ' + sort_param
- if query_exitcode > 0:
- printLog( outlogfile, '> ERROR: an error has occurred while querying the tree with the sequence [id: ' + seq_id + '] in input file ' + file_names[ idx ] )
- else:
- if os.path.exists( query_res_file_path ):
- with open( query_res_file_path ) as query_res_file:
- file_path = ''
- theta_matches = 0
- for line in query_res_file:
- line = line.strip()
- if line:
- if line.startswith( '*' ):
- line_split = line.split( ' ' )
- theta_matches = int( line_split[ 1 ] )
- file_name = line_split[ 0 ].replace( '*', '' )
- file_path = os.path.join( output_dir_path, file_name + '_txt' )
- open( file_path, 'a' ).close()
- else:
- res_file = open( file_path, 'a+' )
- fraction = str( theta_matches ) + '/' + str( leafnames_counter )
- score = format( round( float( theta_matches ) / float( leafnames_counter ) , 6 ), '6f' )
- res_file.write( line + '\t' + fraction + '\t' + score + '\n' )
- res_file.close()
- else:
- printLog( outlogfile, 'An error has occurred while querying the tree', exitcode=ERR_EXIT_CODE, exit=True )
- else:
- printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True )
- else:
- printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True )
-
-def __main__():
- # Parse the command line options
- usage = "Usage: query.py --files comma_sep_file_paths --names comma_seq_file_names --sequences sequences_text --search search_mode --exact exact_alg --sthreshold threshold --outputdir output_dir_path"
- parser = optparse.OptionParser(usage = usage)
- parser.add_option("-v", "--version", action="store_true", dest="version",
- default=False, help="display version and exit")
- parser.add_option("-f", "--files", type="string",
- action="store", dest="files", help="comma separated files path")
- parser.add_option("-n", "--names", type="string",
- action="store", dest="names", help="comma separated names associated to the files specified in --files")
- parser.add_option("-k", "--treep", type="string",
- action="store", dest="treep", help="paths of files in collection")
- parser.add_option("-m", "--treen", type="string",
- action="store", dest="treen", help="names of files in collection")
- parser.add_option("-t", "--threshold", type="float", default=0.7,
- action="store", dest="threshold", help="search threshold")
- parser.add_option("-s", "--sort", type="int", default=1,
- action="store", dest="sort", help="sort results")
- parser.add_option("-o", "--outputdir", type="string", default="output",
- action="store", dest="outputdir", help="output directory (collection) path")
- parser.add_option("-r", "--outfile", type="string", default="query.txt",
- action="store", dest="outfile", help="output log file path")
-
- (options, args) = parser.parse_args()
- if options.version:
- print __version__
- else:
- # create output dir (collection)
- output_dir_path = options.outputdir
- if not os.path.exists(output_dir_path):
- os.makedirs(output_dir_path)
-
- querySBT( options, args )
-
-if __name__ == "__main__": __main__()
diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/query.tsv
--- a/btman-1.0.0/query.tsv Mon Mar 04 08:30:03 2019 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-0 CCAACCAAAGGGAAAACTTTTTTCCGACTTTGGCCTAAAGGGTTTAACGGCCAAGTCAGAAGGGAAAAAGTTGCGCCAAAAATGGCGTTAAAATGTGTAATCAGAGAAGCGACACGAAAAGGGGATCAGCTCTTGGCTGGCAATTGGTAGGTCAGAGGTGGATTGGGAAAAGGCAAGTCAGCAACTGTCGATGACGGCGACTGACTGTTAATGAAAATTGTTTTGGCTGTGTGGAAAAAAATACGCGGGAATCCGTGAATTTTCCGAGGAGCTGGTGGAGCGAAGAAAACGGGGTGCTGCTGTTGTAAATGATTGGTGAAAGTCACACGCCCGCAGCCTTGCCAAACTAATTAACGCCAAATGGAGCTAAGGCCTTTGAATGATGGCTGCAGGCTAGCTTATGAAAAGGGGTTGAAGAGAAGTGGAAAAATTGGTAGAAAGGGATTTGCTCAAGATGCC
-1 TTAATGACAGGGCCACATGATGTGAAAAAAAATCAGAAACCGAGTCAACGTGAGAAGATAGTACGTACTACCGCAAATGAATGGCCATTTCATTTGCATGTTGGGAGCAACAGAAATGAGAGAGCATCCGAAGCTAACCACAAAAATGGACTTTGCTTCATTATGCACAAACACGCCAATAAATGTAACGAGAAAGATAGTAGGAGCGAAAGACGAGACGAGACAAACAGGAAGAAGACGAGTGGACGAGTGTTTTTTGTAACGAAACTCTTAATCGCTCCTTTGCAGGCTTAAGCTGATAGTTGCTACGTTTATGCCATGAATTTCAAGATCTCTCAAATGCGTGAAAATCCAGTTTATGCGACAGACAAATTCATGTATTTGAAAAATCTTAGCTGATAGAAATCAAAGGTGATT
-2 CAATTAATGATAAATATTTTATAAGGTGCGGAAATAAAGTGAGGAATATCTTTTAAATTCAAGTTCAATTCTGAAAGC
\ No newline at end of file
diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/query.xml
--- a/btman-1.0.0/query.xml Mon Mar 04 08:30:03 2019 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,83 +0,0 @@
-
-
- a Sequence Bloom Tree
-
- macros.xml
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r be864d79c9c7 -r 7f712cc0d3d5 build.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/build.sh Mon Mar 04 08:31:28 2019 -0500
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+outExpDir=$1
+
+cd ${outExpDir}
+
+howdesbt build --HowDe --tree=union.txt --outtree=howde.txt
diff -r be864d79c9c7 -r 7f712cc0d3d5 cluster.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.sh Mon Mar 04 08:31:28 2019 -0500
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+outExpDir=$1
+bfsize=$2
+
+cd ${outExpDir}
+
+ls *.bf > leafnames.txt
+howdesbt cluster --list=leafnames.txt --bits=${bfsize} --tree=union.txt --nodename=node{number} --keepallnodes
+#rm leafnames_txt_?
diff -r be864d79c9c7 -r 7f712cc0d3d5 create.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/create.py Mon Mar 04 08:31:28 2019 -0500
@@ -0,0 +1,233 @@
+#!/usr/bin/env python
+
+import sys, os, optparse, shutil, glob
+
+__version__ = "1.0.0"
+# in the case of collections, exitcodes equal to 0 and 1 are not considered errors
+ERR_EXIT_CODE = 2
+OK_EXIT_CODE = 0
+VALID_CHARS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
+
+def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ):
+ print message
+ with open( logfilepath, 'a+' ) as out:
+ out.write( message + '\n' )
+ if exit:
+ sys.exit( exitcode )
+
+def downloadAccessions(formats, filepaths, outlogfile, outdirpath):
+ downloaded_files = { }
+ for dataset_idx in range(0, len(formats)):
+ if formats[ dataset_idx ] == 'accessions':
+ print filepaths[ dataset_idx ]
+ with open( filepaths[ dataset_idx ] ) as accessions:
+ for line in accessions:
+ print line
+ accession = line.split( '\t' )[0].strip()
+ if accession:
+ printLog( outlogfile, 'Downloading \"' + accession.upper() + '\" with the fastq-dump tool (part of the sra-tools utility)' )
+ fastq_dump_exitcode = os.system( 'fastq-dump --outdir ' + outdirpath + ' --fasta ' + accession.upper() )
+ if fastq_dump_exitcode > 0:
+ printLog( outlogfile, '> FASTA file: FAILED ( \"' + accession.upper() + '\" will be excluded )' )
+ else:
+ #os.rename( os.path.join( outdirpath, accession.upper() + '.fasta' ), os.path.join( outdirpath, accession.upper() + '_fasta' ) )
+ printLog( outlogfile, '> FASTA file: \"' + accession.upper() + '.fasta\"' )
+ accession_data = {
+ 'format': '.fasta',
+ 'filepath': os.path.join( outdirpath, accession.upper() + '.fasta' ),
+ 'filename': ''.join( c for c in accession.upper() if c in VALID_CHARS )
+ }
+ downloaded_files[ accession.upper() ] = accession_data
+ return downloaded_files
+
+# format = { fasta, fastq, accession }
+# this version skip the quality control procedure
+def createSBT( options, args ):
+ outlogfile = str( options.outfile )
+ outdirpath = str( options.outdir )
+ if not outdirpath.endswith('/'): outdirpath += '/'
+ if not os.path.exists( outdirpath ):
+ os.mkdir( outdirpath )
+ outdirpath = os.path.abspath( outdirpath )
+ os.chdir( outdirpath )
+ tooldirpath = os.path.abspath( str( options.tooldir ) )
+ if not tooldirpath.endswith('/'): tooldirpath += '/'
+
+ formats = [ fo for fo in str( options.formats ).split( '|' ) if fo.strip() ]
+ filepaths = [ filepath for filepath in str( options.filepaths ).split( '|' ) if filepath.strip() ]
+ filenames = [ filename for filename in str( options.filenames ).split( '|' ) if filename.strip() ]
+ compressed = [ True == int(c) for c in str( options.compressed ).split( '|' ) if c.strip() ]
+ minabundances = [ int(minab) for minab in str( options.minabundances ).split( '|' ) if minab.strip() ]
+ qualitythresholds = [ float(qthres) for qthres in str( options.qualitythresholds ).split( '|' ) if qthres.strip() ]
+
+ klen = int( options.klen )
+ bfsize = int( options.bfsize )
+
+ if len(formats) == len(filepaths) == len(filenames) == len(compressed) == len(minabundances) == len(qualitythresholds):
+ printLog( outlogfile, 'Retrieving experiments' )
+ accessions = downloadAccessions( formats, filepaths, outlogfile, outdirpath )
+ printLog( outlogfile, '> ' + str( len( accessions ) ) + ' experiments retrieved from the Sequence Read Archive' )
+ acc_arr = [ a for a in accessions ]
+ print str( acc_arr )
+ if bfsize < 0: # estimate bloom filter size
+ data_paths = ' '.join( accessions[ accession ][ 'filepath' ] for accession in accessions if 'filepath' in accessions[ accession ] )
+ print data_paths
+ if len( data_paths ) > 0:
+ data_paths += ' '
+ for dataset_idx in range(0, len(formats)):
+ if formats[ dataset_idx ] != 'accessions':
+ data_paths += ' '.join( path for path in filepaths[ dataset_idx ].split( ',' ) )
+ # ntcard
+ printLog( outlogfile, 'Estimating the Bloom Filter size with ntcard' )
+ if len( data_paths ) > 0:
+ ntcard_res_filepath = os.path.join( outdirpath, 'freq_k' + str( klen ) + '.hist' )
+ ntcard_exitcode = os.system( 'ntcard --kmer=' + str( klen ) + ' ' + data_paths )
+ print 'ntcard --kmer=' + str( klen ) + ' ' + data_paths
+ if ntcard_exitcode > 0:
+ printLog( outlogfile, '> [exitcode: ' + str(ntcard_exitcode) + '] an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True )
+ else:
+ if os.path.exists( ntcard_res_filepath ):
+ os.rename( ntcard_res_filepath, os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' ) )
+ ntcard_res_filepath = os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' )
+ var_F0 = None
+ var_f1 = None
+ with open( ntcard_res_filepath ) as ntcard_res:
+ for line in ntcard_res:
+ line = line.strip()
+ if line:
+ line_split = line.split( '\t' )
+ if len(line_split) == 2:
+ if line_split[0] == 'F0':
+ var_F0 = int( line_split[1] )
+ elif line_split[0] == 'f1':
+ var_f1 = int( line_split[1] )
+ if var_F0 is not None and var_f1 is not None:
+ break
+ if var_F0 is not None and var_f1 is not None:
+ bfsize = var_F0 - var_f1
+ printLog( outlogfile, '> estimated Bloom Filter size: ' + str(bfsize) )
+ else:
+ printLog( outlogfile, '> an error has occurred while estimating the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True )
+ else:
+ printLog( outlogfile, '> an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True )
+ else:
+ printLog( outlogfile, '> unable to estimate the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True )
+
+ if bfsize > 0:
+ for dataset_idx in range(0, len(formats)):
+ if formats[ dataset_idx ] == 'accessions':
+ with open( filepaths[ dataset_idx ] ) as accessions_file:
+ for line in accessions_file:
+ accession = line.split( '\t' )[0].strip().upper()
+ if accession in accessions:
+ curr_format = accessions[ accession ][ 'format' ]
+ curr_compressed = 'uncompress'
+ curr_filepath = accessions[ accession ][ 'filepath' ]
+ curr_filename = accessions[ accession ][ 'filename' ]
+ printLog( outlogfile, 'Processing \"' + accession + '\" ( format=\"' + curr_format +
+ '\", compressed=\"' + str(False) + '\", fixed_name=\"' + curr_filename + '\" )' )
+ print 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1'
+ makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' +
+ curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' +
+ str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1' )
+ if makebf_exitcode > 0:
+ printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + accession + '\" will be excluded )' )
+ else:
+ printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename + '.bf\"' )
+ else:
+ curr_format = '.' + formats[ dataset_idx ].lower()
+ curr_compressed = '.gz' if compressed[ dataset_idx ] else 'uncompress'
+ curr_filepaths = filepaths[ dataset_idx ].split( ',' )
+ curr_filenames = filenames[ dataset_idx ].split( ',' )
+ for curr_idx in range(0, len(curr_formats)):
+ curr_filename_fixed = ''.join( c for c in curr_filenames[ curr_idx ] if c in VALID_CHARS )
+ printLog( outlogfile, 'Processing \"' + curr_filenames[ curr_idx ] + '\" ( format=\"' + curr_format +
+ '\", compressed=\"' + str(compressed[ dataset_idx ]) + '\", fixed_name=\"' + curr_filename_fixed + '\" )' )
+ if compressed[ dataset_idx ]:
+ makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ dataset_idx ] + ' ' + curr_filename_fixed + ' ' +
+ curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' +
+ str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 1' )
+ else:
+ makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ dataset_idx ] + ' ' + curr_filename_fixed + ' ' +
+ curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' +
+ str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 0' )
+ if makebf_exitcode > 0:
+ printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + curr_filenames[ curr_idx ] + '\" will be excluded )' )
+ else:
+ printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename_fixed + '.bf\"' )
+ # Create a tree topology
+ printLog( outlogfile, 'Creating a tree topology file' )
+ bf_counter = len( glob.glob1( outdirpath, '*.bf' ) )
+ if bf_counter > 0:
+ cluster_exitcode = os.system( 'sh ' + tooldirpath + 'cluster.sh ' + outdirpath + ' ' + str( bfsize ) )
+ if cluster_exitcode > 0:
+ printLog( outlogfile, '> [exitcode: ' + str(cluster_exitcode) + '] an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True )
+ else:
+ # Build the HowDeSBT nodes
+ if os.path.exists( os.path.join( outdirpath, 'leafnames.txt' ) ):
+ printLog( outlogfile, 'Building the Bloom Filter files for the tree' )
+ build_exitcode = os.system( 'sh ' + tooldirpath + 'build.sh ' + outdirpath )
+ if build_exitcode > 0:
+ printLog( outlogfile, '> [exitcode: ' + str(build_exitcode) + '] an error has occurred during the creation of the Bloom Filter files for the tree', exitcode=ERR_EXIT_CODE, exit=True )
+ else:
+ printLog( outlogfile, '> the tree has been successfully built: \"howde.txt\"', exitcode=OK_EXIT_CODE, exit=True )
+ '''
+ howde_filepath = os.path.join( outdirpath, 'howde.txt' )
+ howde_galaxy_filepath = os.path.join( outdirpath, 'howde_galaxy.txt' )
+ howde_galaxy = open( howde_galaxy_filepath, 'w' )
+ with open( howde_filepath ) as howde_file:
+ for line in howde_file:
+ line = line.strip()
+ if line:
+ # trim stars * and get node name
+ # find galaxy file path to the node name
+ # rewrite path with stars
+ howde_galaxy.close()
+ '''
+ else:
+ printLog( outlogfile, '> an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True )
+ else:
+ printLog( outlogfile, '> no Bloom Filter files found', exitcode=ERR_EXIT_CODE, exit=True )
+ else:
+ printLog( outlogfile, '> ERROR: the Bloom Filter size is ' + str( bfsize ), exitcode=ERR_EXIT_CODE, exit=True )
+ else:
+ printLog( outlogfile, 'Something went wrong with the input parameters', exitcode=ERR_EXIT_CODE, exit=True )
+
+def __main__():
+ # Parse the command line options
+ usage = ("Usage: create.py --formats file_formats --filepaths file_paths --filenames file_names "
+ "--compressed file_compressed --minabundance min_abundance --qualitythresholds quality_thresholds "
+ "--klen kmer_len --bfsize bloom_filter_size --outfile out_log_file_path --outdir out_dir_path")
+ parser = optparse.OptionParser(usage = usage)
+ parser.add_option("-v", "--version", action="store_true", dest="version",
+ default=False, help="display version and exit")
+ parser.add_option("-f", "--formats", type="string",
+ action="store", dest="formats", help="list of file formats separated by a tab character")
+ parser.add_option("-p", "--filepaths", type="string",
+ action="store", dest="filepaths", help="list of input file paths separated by a tab character")
+ parser.add_option("-n", "--filenames", type="string",
+ action="store", dest="filenames", help="list of input file names separated by a tab character")
+ parser.add_option("-c", "--compressed", type="string",
+ action="store", dest="compressed", help="list of compressed flags related to the imput files separated by a tab character")
+ parser.add_option("-m", "--minabundances", type="string",
+ action="store", dest="minabundances", help="list of blooom filter minimum abundances related to the imput files separated by a tab character")
+ parser.add_option("-q", "--qualitythresholds", type="string",
+ action="store", dest="qualitythresholds", help="list of quality thresholds related to the imput files separated by a tab character")
+ parser.add_option("-k", "--klen", type="int", default=21,
+ action="store", dest="klen", help="k-mer length")
+ parser.add_option("-b", "--bfsize", type="int", default=-1,
+ action="store", dest="bfsize", help="bloom filter size")
+ parser.add_option("-o", "--outfile", type="string", default="sbtres.txt",
+ action="store", dest="outfile", help="output log file path")
+ parser.add_option("-d", "--outdir", type="string", default="sbtres.txt",
+ action="store", dest="outdir", help="output directory path")
+ parser.add_option("-t", "--tooldir", type="string", default="./",
+ action="store", dest="tooldir", help="tool directory path")
+
+ (options, args) = parser.parse_args()
+ if options.version:
+ print __version__
+ else:
+ createSBT( options, args )
+
+if __name__ == "__main__": __main__()
diff -r be864d79c9c7 -r 7f712cc0d3d5 create.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/create.xml Mon Mar 04 08:31:28 2019 -0500
@@ -0,0 +1,173 @@
+
+
+ a Sequence Bloom Tree
+
+ macros.xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r be864d79c9c7 -r 7f712cc0d3d5 dataset.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/dataset.tsv Mon Mar 04 08:31:28 2019 -0500
@@ -0,0 +1,11 @@
+SRR833714
+SRR833713
+SRR833715
+SRR567161
+SRR567146
+SRR191393
+SRR191449
+SRR191448
+SRR191447
+SRR191446
+SRR191445
diff -r be864d79c9c7 -r 7f712cc0d3d5 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Mon Mar 04 08:31:28 2019 -0500
@@ -0,0 +1,16 @@
+
+
+
+ python
+ sra-tools
+ ntcard
+ howdesbt
+
+
+
+
+
+ 10.1101/090464
+
+
+
\ No newline at end of file
diff -r be864d79c9c7 -r 7f712cc0d3d5 makebf.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/makebf.sh Mon Mar 04 08:31:28 2019 -0500
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+expPath=$1
+expName=$2
+expFormat=$3
+expCompress=$4
+
+outExpDir=$5
+
+klen=$6
+minab=$7
+bfsize=$8
+
+rmCompressed=$9
+rmSource=${10}
+
+cd ${outExpDir}
+
+if [ "${expCompress}" == ".gz" ]; then
+ gzip -dc ${expPath} > ${expName}${expFormat}
+ howdesbt makebf K=${klen} --min=${minab} --bits=${bfsize} ${expName}${expFormat} --out=${expName}.bf
+ if [ "${rmCompressed}" -eq "1" ]; then
+ rm ${expPath}
+ fi
+ if [ "${rmSource}" -eq "1" ]; then
+ rm ${expName}${expFormat}
+ fi
+else
+ howdesbt makebf K=${klen} --min=${minab} --bits=${bfsize} ${expPath} --out=${expName}.bf
+ if [ "${rmSource}" -eq "1" ]; then
+ rm ${expPath}
+ fi
+fi
diff -r be864d79c9c7 -r 7f712cc0d3d5 query.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query.py Mon Mar 04 08:31:28 2019 -0500
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+
+import sys, os, optparse, shutil
+
+__version__ = "1.0.0"
+VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
+# in the case of collections, exitcodes equal to 0 and 1 are not considered errors
+ERR_EXIT_CODE = 2
+OK_EXIT_CODE = 0
+
+def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ):
+ print message
+ with open( logfilepath, 'a+' ) as out:
+ out.write( message + '\n' )
+ if exit:
+ sys.exit( exitcode )
+
+def querySBT( options, args ):
+ output_dir_path = options.outputdir
+ outlogfile = options.outfile
+
+ tree_file_paths = options.treep.split( ',' )
+ tree_file_names = options.treen.split( ',' )
+ tree_def_filepath = None
+ leafnames_filepath = None
+ for idx, tree_file_name in enumerate( tree_file_names ):
+ if tree_file_name == 'howde':
+ tree_def_filepath = tree_file_paths[ idx ]
+ elif tree_file_name == 'leafnames':
+ leafnames_filepath = tree_file_paths[ idx ]
+ if tree_def_filepath is not None and leafnames_filepath is not None:
+ break
+
+ if tree_def_filepath is not None and leafnames_filepath is not None:
+ leafnames_counter = 0
+ with open( leafnames_filepath ) as leafnames_file:
+ for line in leafnames_file:
+ if line.strip():
+ leafnames_counter += 1
+ if leafnames_counter > 0:
+ printLog( outlogfile, 'The selected collection contains a valid tree' )
+ shutil.copyfile( tree_def_filepath, 'howde.txt' )
+ tree_def_filepath = 'howde.txt'
+ for idx, tree_file_name in enumerate( tree_file_names ):
+ if tree_file_name.endswith( 'detbrief.rrr' ):
+ shutil.copyfile( tree_file_paths[ idx ], tree_file_name + '.bf' )
+
+ printLog( outlogfile, 'Creating batch of queries' )
+ # create tmp batch file
+ batch_file_name = 'queries.fa'
+ batch_file = open( batch_file_name, 'w' )
+
+ comma_sep_file_paths = options.files
+ # check if options.files contains at least one file path
+ if comma_sep_file_paths is not None:
+ # split file paths
+ file_paths = comma_sep_file_paths.split(",")
+ # split file names
+ file_names = options.names.split(",")
+ for idx, file_path in enumerate(file_paths):
+ fixed_file_name = ''.join( c for c in file_names[ idx ] if c in VALID_CHARS )
+ printLog( outlogfile, '> processing file ' + file_names[ idx ] + ' ( fixed_name=\"' + fixed_file_name + '\" ) ' )
+ with open(file_path, 'r') as content_file:
+ for line in content_file:
+ line = line.strip()
+ if line:
+ line_split = line.strip().split("\t") # split on tab
+ if len(line_split) == 2: # 0:id , 1:seq , otherwise skip line
+ original_seq_id = line_split[0]
+ # fix seq_id using valid chars only
+ seq_id = ''.join( c for c in original_seq_id if c in VALID_CHARS )
+ printLog( outlogfile, '> sequence ' + original_seq_id + ' ( fixed_name=\"' + seq_id + '\" )' )
+ seq_text = line_split[1]
+
+ # write on batch file
+ batch_file.write( '> ' + fixed_file_name + '_' + seq_id + '\n' + seq_text + '\n' )
+ batch_file.close()
+ # query the tree
+ printLog( outlogfile, 'Querying the tree' )
+ query_res_file_path = os.path.abspath( 'answer.txt' )
+ sort_param = '--sort'
+ if options.sort == 0:
+ sort_param = ''
+ query_exitcode = os.system( 'howdesbt query --tree=' + os.path.abspath( tree_def_filepath ) + ' ' + os.path.abspath( batch_file_name ) + '=' + str(options.threshold) + ' --out=' + query_res_file_path ) + ' ' + sort_param
+ if query_exitcode > 0:
+ printLog( outlogfile, '> ERROR: an error has occurred while querying the tree with the sequence [id: ' + seq_id + '] in input file ' + file_names[ idx ] )
+ else:
+ if os.path.exists( query_res_file_path ):
+ with open( query_res_file_path ) as query_res_file:
+ file_path = ''
+ theta_matches = 0
+ for line in query_res_file:
+ line = line.strip()
+ if line:
+ if line.startswith( '*' ):
+ line_split = line.split( ' ' )
+ theta_matches = int( line_split[ 1 ] )
+ file_name = line_split[ 0 ].replace( '*', '' )
+ file_path = os.path.join( output_dir_path, file_name + '_txt' )
+ open( file_path, 'a' ).close()
+ else:
+ res_file = open( file_path, 'a+' )
+ fraction = str( theta_matches ) + '/' + str( leafnames_counter )
+ score = format( round( float( theta_matches ) / float( leafnames_counter ) , 6 ), '6f' )
+ res_file.write( line + '\t' + fraction + '\t' + score + '\n' )
+ res_file.close()
+ else:
+ printLog( outlogfile, 'An error has occurred while querying the tree', exitcode=ERR_EXIT_CODE, exit=True )
+ else:
+ printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True )
+ else:
+ printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True )
+
+def __main__():
+ # Parse the command line options
+ usage = "Usage: query.py --files comma_sep_file_paths --names comma_seq_file_names --sequences sequences_text --search search_mode --exact exact_alg --sthreshold threshold --outputdir output_dir_path"
+ parser = optparse.OptionParser(usage = usage)
+ parser.add_option("-v", "--version", action="store_true", dest="version",
+ default=False, help="display version and exit")
+ parser.add_option("-f", "--files", type="string",
+ action="store", dest="files", help="comma separated files path")
+ parser.add_option("-n", "--names", type="string",
+ action="store", dest="names", help="comma separated names associated to the files specified in --files")
+ parser.add_option("-k", "--treep", type="string",
+ action="store", dest="treep", help="paths of files in collection")
+ parser.add_option("-m", "--treen", type="string",
+ action="store", dest="treen", help="names of files in collection")
+ parser.add_option("-t", "--threshold", type="float", default=0.7,
+ action="store", dest="threshold", help="search threshold")
+ parser.add_option("-s", "--sort", type="int", default=1,
+ action="store", dest="sort", help="sort results")
+ parser.add_option("-o", "--outputdir", type="string", default="output",
+ action="store", dest="outputdir", help="output directory (collection) path")
+ parser.add_option("-r", "--outfile", type="string", default="query.txt",
+ action="store", dest="outfile", help="output log file path")
+
+ (options, args) = parser.parse_args()
+ if options.version:
+ print __version__
+ else:
+ # create output dir (collection)
+ output_dir_path = options.outputdir
+ if not os.path.exists(output_dir_path):
+ os.makedirs(output_dir_path)
+
+ querySBT( options, args )
+
+if __name__ == "__main__": __main__()
diff -r be864d79c9c7 -r 7f712cc0d3d5 query.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query.tsv Mon Mar 04 08:31:28 2019 -0500
@@ -0,0 +1,3 @@
+0 CCAACCAAAGGGAAAACTTTTTTCCGACTTTGGCCTAAAGGGTTTAACGGCCAAGTCAGAAGGGAAAAAGTTGCGCCAAAAATGGCGTTAAAATGTGTAATCAGAGAAGCGACACGAAAAGGGGATCAGCTCTTGGCTGGCAATTGGTAGGTCAGAGGTGGATTGGGAAAAGGCAAGTCAGCAACTGTCGATGACGGCGACTGACTGTTAATGAAAATTGTTTTGGCTGTGTGGAAAAAAATACGCGGGAATCCGTGAATTTTCCGAGGAGCTGGTGGAGCGAAGAAAACGGGGTGCTGCTGTTGTAAATGATTGGTGAAAGTCACACGCCCGCAGCCTTGCCAAACTAATTAACGCCAAATGGAGCTAAGGCCTTTGAATGATGGCTGCAGGCTAGCTTATGAAAAGGGGTTGAAGAGAAGTGGAAAAATTGGTAGAAAGGGATTTGCTCAAGATGCC
+1 TTAATGACAGGGCCACATGATGTGAAAAAAAATCAGAAACCGAGTCAACGTGAGAAGATAGTACGTACTACCGCAAATGAATGGCCATTTCATTTGCATGTTGGGAGCAACAGAAATGAGAGAGCATCCGAAGCTAACCACAAAAATGGACTTTGCTTCATTATGCACAAACACGCCAATAAATGTAACGAGAAAGATAGTAGGAGCGAAAGACGAGACGAGACAAACAGGAAGAAGACGAGTGGACGAGTGTTTTTTGTAACGAAACTCTTAATCGCTCCTTTGCAGGCTTAAGCTGATAGTTGCTACGTTTATGCCATGAATTTCAAGATCTCTCAAATGCGTGAAAATCCAGTTTATGCGACAGACAAATTCATGTATTTGAAAAATCTTAGCTGATAGAAATCAAAGGTGATT
+2 CAATTAATGATAAATATTTTATAAGGTGCGGAAATAAAGTGAGGAATATCTTTTAAATTCAAGTTCAATTCTGAAAGC
\ No newline at end of file
diff -r be864d79c9c7 -r 7f712cc0d3d5 query.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query.xml Mon Mar 04 08:31:28 2019 -0500
@@ -0,0 +1,83 @@
+
+
+ a Sequence Bloom Tree
+
+ macros.xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+