# HG changeset patch # User fabio # Date 1551706288 18000 # Node ID 7f712cc0d3d5dcb246a3c2b98f4423ca5c922627 # Parent be864d79c9c731ce3277eaa7935cf89762bbfe32 Uploaded 20190304.2 diff -r be864d79c9c7 -r 7f712cc0d3d5 .shed.yml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.shed.yml Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,21 @@ +name: btman +owner: iuc +categories: + - Data Source + - Web Services +description: BloomTree Manager +long_description: | + A suite of tools to fast create and query Sequence Bloom Trees + supporting determined/how split filters +remote_repository_url: https://github.com/fabio-cumbo/bloomtree-manager +homepage_url: https://github.com/fabio-cumbo/bloomtree-manager +type: unrestricted +auto_tool_repositories: + name_template: "{{ tool_id }}" + descriptor_template: "Wrapper for BloomTree Manager: {{ tool_name }}." +suite: + name: "btman_suite" + description: "A suite of Galaxy tools designed to work with the BloomTree Manager." + long_description: | + A suite of tools to fast create and query Sequence Bloom Trees + supporting determined/how split filters diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/.shed.yml --- a/btman-1.0.0/.shed.yml Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -name: btman -owner: iuc -categories: - - Data Source - - Web Services -description: BloomTree Manager -long_description: | - A suite of tools to fast create and query Sequence Bloom Trees - supporting determined/how split filters -remote_repository_url: https://github.com/fabio-cumbo/bloomtree-manager -homepage_url: https://github.com/fabio-cumbo/bloomtree-manager -type: unrestricted -auto_tool_repositories: - name_template: "{{ tool_id }}" - descriptor_template: "Wrapper for BloomTree Manager: {{ tool_name }}." -suite: - name: "btman_suite" - description: "A suite of Galaxy tools designed to work with the BloomTree Manager." - long_description: | - A suite of tools to fast create and query Sequence Bloom Trees - supporting determined/how split filters diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/build.sh --- a/btman-1.0.0/build.sh Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -#!/bin/bash - -outExpDir=$1 - -cd ${outExpDir} - -howdesbt build --HowDe --tree=union.txt --outtree=howde.txt diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/cluster.sh --- a/btman-1.0.0/cluster.sh Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -#!/bin/bash - -outExpDir=$1 -bfsize=$2 - -cd ${outExpDir} - -ls *.bf > leafnames.txt -howdesbt cluster --list=leafnames.txt --bits=${bfsize} --tree=union.txt --nodename=node{number} --keepallnodes -#rm leafnames_txt_? diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/create.py --- a/btman-1.0.0/create.py Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,233 +0,0 @@ -#!/usr/bin/env python - -import sys, os, optparse, shutil, glob - -__version__ = "1.0.0" -# in the case of collections, exitcodes equal to 0 and 1 are not considered errors -ERR_EXIT_CODE = 2 -OK_EXIT_CODE = 0 -VALID_CHARS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' - -def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ): - print message - with open( logfilepath, 'a+' ) as out: - out.write( message + '\n' ) - if exit: - sys.exit( exitcode ) - -def downloadAccessions(formats, filepaths, outlogfile, outdirpath): - downloaded_files = { } - for dataset_idx in range(0, len(formats)): - if formats[ dataset_idx ] == 'accessions': - print filepaths[ dataset_idx ] - with open( filepaths[ dataset_idx ] ) as accessions: - for line in accessions: - print line - accession = line.split( '\t' )[0].strip() - if accession: - printLog( outlogfile, 'Downloading \"' + accession.upper() + '\" with the fastq-dump tool (part of the sra-tools utility)' ) - fastq_dump_exitcode = os.system( 'fastq-dump --outdir ' + outdirpath + ' --fasta ' + accession.upper() ) - if fastq_dump_exitcode > 0: - printLog( outlogfile, '> FASTA file: FAILED ( \"' + accession.upper() + '\" will be excluded )' ) - else: - #os.rename( os.path.join( outdirpath, accession.upper() + '.fasta' ), os.path.join( outdirpath, accession.upper() + '_fasta' ) ) - printLog( outlogfile, '> FASTA file: \"' + accession.upper() + '.fasta\"' ) - accession_data = { - 'format': '.fasta', - 'filepath': os.path.join( outdirpath, accession.upper() + '.fasta' ), - 'filename': ''.join( c for c in accession.upper() if c in VALID_CHARS ) - } - downloaded_files[ accession.upper() ] = accession_data - return downloaded_files - -# format = { fasta, fastq, accession } -# this version skip the quality control procedure -def createSBT( options, args ): - outlogfile = str( options.outfile ) - outdirpath = str( options.outdir ) - if not outdirpath.endswith('/'): outdirpath += '/' - if not os.path.exists( outdirpath ): - os.mkdir( outdirpath ) - outdirpath = os.path.abspath( outdirpath ) - os.chdir( outdirpath ) - tooldirpath = os.path.abspath( str( options.tooldir ) ) - if not tooldirpath.endswith('/'): tooldirpath += '/' - - formats = [ fo for fo in str( options.formats ).split( '|' ) if fo.strip() ] - filepaths = [ filepath for filepath in str( options.filepaths ).split( '|' ) if filepath.strip() ] - filenames = [ filename for filename in str( options.filenames ).split( '|' ) if filename.strip() ] - compressed = [ True == int(c) for c in str( options.compressed ).split( '|' ) if c.strip() ] - minabundances = [ int(minab) for minab in str( options.minabundances ).split( '|' ) if minab.strip() ] - qualitythresholds = [ float(qthres) for qthres in str( options.qualitythresholds ).split( '|' ) if qthres.strip() ] - - klen = int( options.klen ) - bfsize = int( options.bfsize ) - - if len(formats) == len(filepaths) == len(filenames) == len(compressed) == len(minabundances) == len(qualitythresholds): - printLog( outlogfile, 'Retrieving experiments' ) - accessions = downloadAccessions( formats, filepaths, outlogfile, outdirpath ) - printLog( outlogfile, '> ' + str( len( accessions ) ) + ' experiments retrieved from the Sequence Read Archive' ) - acc_arr = [ a for a in accessions ] - print str( acc_arr ) - if bfsize < 0: # estimate bloom filter size - data_paths = ' '.join( accessions[ accession ][ 'filepath' ] for accession in accessions if 'filepath' in accessions[ accession ] ) - print data_paths - if len( data_paths ) > 0: - data_paths += ' ' - for dataset_idx in range(0, len(formats)): - if formats[ dataset_idx ] != 'accessions': - data_paths += ' '.join( path for path in filepaths[ dataset_idx ].split( ',' ) ) - # ntcard - printLog( outlogfile, 'Estimating the Bloom Filter size with ntcard' ) - if len( data_paths ) > 0: - ntcard_res_filepath = os.path.join( outdirpath, 'freq_k' + str( klen ) + '.hist' ) - ntcard_exitcode = os.system( 'ntcard --kmer=' + str( klen ) + ' ' + data_paths ) - print 'ntcard --kmer=' + str( klen ) + ' ' + data_paths - if ntcard_exitcode > 0: - printLog( outlogfile, '> [exitcode: ' + str(ntcard_exitcode) + '] an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True ) - else: - if os.path.exists( ntcard_res_filepath ): - os.rename( ntcard_res_filepath, os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' ) ) - ntcard_res_filepath = os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' ) - var_F0 = None - var_f1 = None - with open( ntcard_res_filepath ) as ntcard_res: - for line in ntcard_res: - line = line.strip() - if line: - line_split = line.split( '\t' ) - if len(line_split) == 2: - if line_split[0] == 'F0': - var_F0 = int( line_split[1] ) - elif line_split[0] == 'f1': - var_f1 = int( line_split[1] ) - if var_F0 is not None and var_f1 is not None: - break - if var_F0 is not None and var_f1 is not None: - bfsize = var_F0 - var_f1 - printLog( outlogfile, '> estimated Bloom Filter size: ' + str(bfsize) ) - else: - printLog( outlogfile, '> an error has occurred while estimating the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, '> an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, '> unable to estimate the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True ) - - if bfsize > 0: - for dataset_idx in range(0, len(formats)): - if formats[ dataset_idx ] == 'accessions': - with open( filepaths[ dataset_idx ] ) as accessions_file: - for line in accessions_file: - accession = line.split( '\t' )[0].strip().upper() - if accession in accessions: - curr_format = accessions[ accession ][ 'format' ] - curr_compressed = 'uncompress' - curr_filepath = accessions[ accession ][ 'filepath' ] - curr_filename = accessions[ accession ][ 'filename' ] - printLog( outlogfile, 'Processing \"' + accession + '\" ( format=\"' + curr_format + - '\", compressed=\"' + str(False) + '\", fixed_name=\"' + curr_filename + '\" )' ) - print 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1' - makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' + - curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + - str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1' ) - if makebf_exitcode > 0: - printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + accession + '\" will be excluded )' ) - else: - printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename + '.bf\"' ) - else: - curr_format = '.' + formats[ dataset_idx ].lower() - curr_compressed = '.gz' if compressed[ dataset_idx ] else 'uncompress' - curr_filepaths = filepaths[ dataset_idx ].split( ',' ) - curr_filenames = filenames[ dataset_idx ].split( ',' ) - for curr_idx in range(0, len(curr_formats)): - curr_filename_fixed = ''.join( c for c in curr_filenames[ curr_idx ] if c in VALID_CHARS ) - printLog( outlogfile, 'Processing \"' + curr_filenames[ curr_idx ] + '\" ( format=\"' + curr_format + - '\", compressed=\"' + str(compressed[ dataset_idx ]) + '\", fixed_name=\"' + curr_filename_fixed + '\" )' ) - if compressed[ dataset_idx ]: - makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ dataset_idx ] + ' ' + curr_filename_fixed + ' ' + - curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + - str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 1' ) - else: - makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ dataset_idx ] + ' ' + curr_filename_fixed + ' ' + - curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + - str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 0' ) - if makebf_exitcode > 0: - printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + curr_filenames[ curr_idx ] + '\" will be excluded )' ) - else: - printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename_fixed + '.bf\"' ) - # Create a tree topology - printLog( outlogfile, 'Creating a tree topology file' ) - bf_counter = len( glob.glob1( outdirpath, '*.bf' ) ) - if bf_counter > 0: - cluster_exitcode = os.system( 'sh ' + tooldirpath + 'cluster.sh ' + outdirpath + ' ' + str( bfsize ) ) - if cluster_exitcode > 0: - printLog( outlogfile, '> [exitcode: ' + str(cluster_exitcode) + '] an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True ) - else: - # Build the HowDeSBT nodes - if os.path.exists( os.path.join( outdirpath, 'leafnames.txt' ) ): - printLog( outlogfile, 'Building the Bloom Filter files for the tree' ) - build_exitcode = os.system( 'sh ' + tooldirpath + 'build.sh ' + outdirpath ) - if build_exitcode > 0: - printLog( outlogfile, '> [exitcode: ' + str(build_exitcode) + '] an error has occurred during the creation of the Bloom Filter files for the tree', exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, '> the tree has been successfully built: \"howde.txt\"', exitcode=OK_EXIT_CODE, exit=True ) - ''' - howde_filepath = os.path.join( outdirpath, 'howde.txt' ) - howde_galaxy_filepath = os.path.join( outdirpath, 'howde_galaxy.txt' ) - howde_galaxy = open( howde_galaxy_filepath, 'w' ) - with open( howde_filepath ) as howde_file: - for line in howde_file: - line = line.strip() - if line: - # trim stars * and get node name - # find galaxy file path to the node name - # rewrite path with stars - howde_galaxy.close() - ''' - else: - printLog( outlogfile, '> an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, '> no Bloom Filter files found', exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, '> ERROR: the Bloom Filter size is ' + str( bfsize ), exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, 'Something went wrong with the input parameters', exitcode=ERR_EXIT_CODE, exit=True ) - -def __main__(): - # Parse the command line options - usage = ("Usage: create.py --formats file_formats --filepaths file_paths --filenames file_names " - "--compressed file_compressed --minabundance min_abundance --qualitythresholds quality_thresholds " - "--klen kmer_len --bfsize bloom_filter_size --outfile out_log_file_path --outdir out_dir_path") - parser = optparse.OptionParser(usage = usage) - parser.add_option("-v", "--version", action="store_true", dest="version", - default=False, help="display version and exit") - parser.add_option("-f", "--formats", type="string", - action="store", dest="formats", help="list of file formats separated by a tab character") - parser.add_option("-p", "--filepaths", type="string", - action="store", dest="filepaths", help="list of input file paths separated by a tab character") - parser.add_option("-n", "--filenames", type="string", - action="store", dest="filenames", help="list of input file names separated by a tab character") - parser.add_option("-c", "--compressed", type="string", - action="store", dest="compressed", help="list of compressed flags related to the imput files separated by a tab character") - parser.add_option("-m", "--minabundances", type="string", - action="store", dest="minabundances", help="list of blooom filter minimum abundances related to the imput files separated by a tab character") - parser.add_option("-q", "--qualitythresholds", type="string", - action="store", dest="qualitythresholds", help="list of quality thresholds related to the imput files separated by a tab character") - parser.add_option("-k", "--klen", type="int", default=21, - action="store", dest="klen", help="k-mer length") - parser.add_option("-b", "--bfsize", type="int", default=-1, - action="store", dest="bfsize", help="bloom filter size") - parser.add_option("-o", "--outfile", type="string", default="sbtres.txt", - action="store", dest="outfile", help="output log file path") - parser.add_option("-d", "--outdir", type="string", default="sbtres.txt", - action="store", dest="outdir", help="output directory path") - parser.add_option("-t", "--tooldir", type="string", default="./", - action="store", dest="tooldir", help="tool directory path") - - (options, args) = parser.parse_args() - if options.version: - print __version__ - else: - createSBT( options, args ) - -if __name__ == "__main__": __main__() diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/create.xml --- a/btman-1.0.0/create.xml Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,173 +0,0 @@ - - - a Sequence Bloom Tree - - macros.xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/dataset.tsv --- a/btman-1.0.0/dataset.tsv Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -SRR833714 -SRR833713 -SRR833715 -SRR567161 -SRR567146 -SRR191393 -SRR191449 -SRR191448 -SRR191447 -SRR191446 -SRR191445 diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/macros.xml --- a/btman-1.0.0/macros.xml Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,16 +0,0 @@ - - - - python - sra-tools - ntcard - howdesbt - - - - - - 10.1101/090464 - - - \ No newline at end of file diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/makebf.sh --- a/btman-1.0.0/makebf.sh Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -#!/bin/bash - -expPath=$1 -expName=$2 -expFormat=$3 -expCompress=$4 - -outExpDir=$5 - -klen=$6 -minab=$7 -bfsize=$8 - -rmCompressed=$9 -rmSource=${10} - -cd ${outExpDir} - -if [ "${expCompress}" == ".gz" ]; then - gzip -dc ${expPath} > ${expName}${expFormat} - howdesbt makebf K=${klen} --min=${minab} --bits=${bfsize} ${expName}${expFormat} --out=${expName}.bf - if [ "${rmCompressed}" -eq "1" ]; then - rm ${expPath} - fi - if [ "${rmSource}" -eq "1" ]; then - rm ${expName}${expFormat} - fi -else - howdesbt makebf K=${klen} --min=${minab} --bits=${bfsize} ${expPath} --out=${expName}.bf - if [ "${rmSource}" -eq "1" ]; then - rm ${expPath} - fi -fi diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/query.py --- a/btman-1.0.0/query.py Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,148 +0,0 @@ -#!/usr/bin/env python - -import sys, os, optparse, shutil - -__version__ = "1.0.0" -VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' -# in the case of collections, exitcodes equal to 0 and 1 are not considered errors -ERR_EXIT_CODE = 2 -OK_EXIT_CODE = 0 - -def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ): - print message - with open( logfilepath, 'a+' ) as out: - out.write( message + '\n' ) - if exit: - sys.exit( exitcode ) - -def querySBT( options, args ): - output_dir_path = options.outputdir - outlogfile = options.outfile - - tree_file_paths = options.treep.split( ',' ) - tree_file_names = options.treen.split( ',' ) - tree_def_filepath = None - leafnames_filepath = None - for idx, tree_file_name in enumerate( tree_file_names ): - if tree_file_name == 'howde': - tree_def_filepath = tree_file_paths[ idx ] - elif tree_file_name == 'leafnames': - leafnames_filepath = tree_file_paths[ idx ] - if tree_def_filepath is not None and leafnames_filepath is not None: - break - - if tree_def_filepath is not None and leafnames_filepath is not None: - leafnames_counter = 0 - with open( leafnames_filepath ) as leafnames_file: - for line in leafnames_file: - if line.strip(): - leafnames_counter += 1 - if leafnames_counter > 0: - printLog( outlogfile, 'The selected collection contains a valid tree' ) - shutil.copyfile( tree_def_filepath, 'howde.txt' ) - tree_def_filepath = 'howde.txt' - for idx, tree_file_name in enumerate( tree_file_names ): - if tree_file_name.endswith( 'detbrief.rrr' ): - shutil.copyfile( tree_file_paths[ idx ], tree_file_name + '.bf' ) - - printLog( outlogfile, 'Creating batch of queries' ) - # create tmp batch file - batch_file_name = 'queries.fa' - batch_file = open( batch_file_name, 'w' ) - - comma_sep_file_paths = options.files - # check if options.files contains at least one file path - if comma_sep_file_paths is not None: - # split file paths - file_paths = comma_sep_file_paths.split(",") - # split file names - file_names = options.names.split(",") - for idx, file_path in enumerate(file_paths): - fixed_file_name = ''.join( c for c in file_names[ idx ] if c in VALID_CHARS ) - printLog( outlogfile, '> processing file ' + file_names[ idx ] + ' ( fixed_name=\"' + fixed_file_name + '\" ) ' ) - with open(file_path, 'r') as content_file: - for line in content_file: - line = line.strip() - if line: - line_split = line.strip().split("\t") # split on tab - if len(line_split) == 2: # 0:id , 1:seq , otherwise skip line - original_seq_id = line_split[0] - # fix seq_id using valid chars only - seq_id = ''.join( c for c in original_seq_id if c in VALID_CHARS ) - printLog( outlogfile, '> sequence ' + original_seq_id + ' ( fixed_name=\"' + seq_id + '\" )' ) - seq_text = line_split[1] - - # write on batch file - batch_file.write( '> ' + fixed_file_name + '_' + seq_id + '\n' + seq_text + '\n' ) - batch_file.close() - # query the tree - printLog( outlogfile, 'Querying the tree' ) - query_res_file_path = os.path.abspath( 'answer.txt' ) - sort_param = '--sort' - if options.sort == 0: - sort_param = '' - query_exitcode = os.system( 'howdesbt query --tree=' + os.path.abspath( tree_def_filepath ) + ' ' + os.path.abspath( batch_file_name ) + '=' + str(options.threshold) + ' --out=' + query_res_file_path ) + ' ' + sort_param - if query_exitcode > 0: - printLog( outlogfile, '> ERROR: an error has occurred while querying the tree with the sequence [id: ' + seq_id + '] in input file ' + file_names[ idx ] ) - else: - if os.path.exists( query_res_file_path ): - with open( query_res_file_path ) as query_res_file: - file_path = '' - theta_matches = 0 - for line in query_res_file: - line = line.strip() - if line: - if line.startswith( '*' ): - line_split = line.split( ' ' ) - theta_matches = int( line_split[ 1 ] ) - file_name = line_split[ 0 ].replace( '*', '' ) - file_path = os.path.join( output_dir_path, file_name + '_txt' ) - open( file_path, 'a' ).close() - else: - res_file = open( file_path, 'a+' ) - fraction = str( theta_matches ) + '/' + str( leafnames_counter ) - score = format( round( float( theta_matches ) / float( leafnames_counter ) , 6 ), '6f' ) - res_file.write( line + '\t' + fraction + '\t' + score + '\n' ) - res_file.close() - else: - printLog( outlogfile, 'An error has occurred while querying the tree', exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True ) - else: - printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True ) - -def __main__(): - # Parse the command line options - usage = "Usage: query.py --files comma_sep_file_paths --names comma_seq_file_names --sequences sequences_text --search search_mode --exact exact_alg --sthreshold threshold --outputdir output_dir_path" - parser = optparse.OptionParser(usage = usage) - parser.add_option("-v", "--version", action="store_true", dest="version", - default=False, help="display version and exit") - parser.add_option("-f", "--files", type="string", - action="store", dest="files", help="comma separated files path") - parser.add_option("-n", "--names", type="string", - action="store", dest="names", help="comma separated names associated to the files specified in --files") - parser.add_option("-k", "--treep", type="string", - action="store", dest="treep", help="paths of files in collection") - parser.add_option("-m", "--treen", type="string", - action="store", dest="treen", help="names of files in collection") - parser.add_option("-t", "--threshold", type="float", default=0.7, - action="store", dest="threshold", help="search threshold") - parser.add_option("-s", "--sort", type="int", default=1, - action="store", dest="sort", help="sort results") - parser.add_option("-o", "--outputdir", type="string", default="output", - action="store", dest="outputdir", help="output directory (collection) path") - parser.add_option("-r", "--outfile", type="string", default="query.txt", - action="store", dest="outfile", help="output log file path") - - (options, args) = parser.parse_args() - if options.version: - print __version__ - else: - # create output dir (collection) - output_dir_path = options.outputdir - if not os.path.exists(output_dir_path): - os.makedirs(output_dir_path) - - querySBT( options, args ) - -if __name__ == "__main__": __main__() diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/query.tsv --- a/btman-1.0.0/query.tsv Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -0 CCAACCAAAGGGAAAACTTTTTTCCGACTTTGGCCTAAAGGGTTTAACGGCCAAGTCAGAAGGGAAAAAGTTGCGCCAAAAATGGCGTTAAAATGTGTAATCAGAGAAGCGACACGAAAAGGGGATCAGCTCTTGGCTGGCAATTGGTAGGTCAGAGGTGGATTGGGAAAAGGCAAGTCAGCAACTGTCGATGACGGCGACTGACTGTTAATGAAAATTGTTTTGGCTGTGTGGAAAAAAATACGCGGGAATCCGTGAATTTTCCGAGGAGCTGGTGGAGCGAAGAAAACGGGGTGCTGCTGTTGTAAATGATTGGTGAAAGTCACACGCCCGCAGCCTTGCCAAACTAATTAACGCCAAATGGAGCTAAGGCCTTTGAATGATGGCTGCAGGCTAGCTTATGAAAAGGGGTTGAAGAGAAGTGGAAAAATTGGTAGAAAGGGATTTGCTCAAGATGCC -1 TTAATGACAGGGCCACATGATGTGAAAAAAAATCAGAAACCGAGTCAACGTGAGAAGATAGTACGTACTACCGCAAATGAATGGCCATTTCATTTGCATGTTGGGAGCAACAGAAATGAGAGAGCATCCGAAGCTAACCACAAAAATGGACTTTGCTTCATTATGCACAAACACGCCAATAAATGTAACGAGAAAGATAGTAGGAGCGAAAGACGAGACGAGACAAACAGGAAGAAGACGAGTGGACGAGTGTTTTTTGTAACGAAACTCTTAATCGCTCCTTTGCAGGCTTAAGCTGATAGTTGCTACGTTTATGCCATGAATTTCAAGATCTCTCAAATGCGTGAAAATCCAGTTTATGCGACAGACAAATTCATGTATTTGAAAAATCTTAGCTGATAGAAATCAAAGGTGATT -2 CAATTAATGATAAATATTTTATAAGGTGCGGAAATAAAGTGAGGAATATCTTTTAAATTCAAGTTCAATTCTGAAAGC \ No newline at end of file diff -r be864d79c9c7 -r 7f712cc0d3d5 btman-1.0.0/query.xml --- a/btman-1.0.0/query.xml Mon Mar 04 08:30:03 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,83 +0,0 @@ - - - a Sequence Bloom Tree - - macros.xml - - - - - - - - - - - - - - - - - - - - - - diff -r be864d79c9c7 -r 7f712cc0d3d5 build.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/build.sh Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,7 @@ +#!/bin/bash + +outExpDir=$1 + +cd ${outExpDir} + +howdesbt build --HowDe --tree=union.txt --outtree=howde.txt diff -r be864d79c9c7 -r 7f712cc0d3d5 cluster.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.sh Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,10 @@ +#!/bin/bash + +outExpDir=$1 +bfsize=$2 + +cd ${outExpDir} + +ls *.bf > leafnames.txt +howdesbt cluster --list=leafnames.txt --bits=${bfsize} --tree=union.txt --nodename=node{number} --keepallnodes +#rm leafnames_txt_? diff -r be864d79c9c7 -r 7f712cc0d3d5 create.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/create.py Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,233 @@ +#!/usr/bin/env python + +import sys, os, optparse, shutil, glob + +__version__ = "1.0.0" +# in the case of collections, exitcodes equal to 0 and 1 are not considered errors +ERR_EXIT_CODE = 2 +OK_EXIT_CODE = 0 +VALID_CHARS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + +def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ): + print message + with open( logfilepath, 'a+' ) as out: + out.write( message + '\n' ) + if exit: + sys.exit( exitcode ) + +def downloadAccessions(formats, filepaths, outlogfile, outdirpath): + downloaded_files = { } + for dataset_idx in range(0, len(formats)): + if formats[ dataset_idx ] == 'accessions': + print filepaths[ dataset_idx ] + with open( filepaths[ dataset_idx ] ) as accessions: + for line in accessions: + print line + accession = line.split( '\t' )[0].strip() + if accession: + printLog( outlogfile, 'Downloading \"' + accession.upper() + '\" with the fastq-dump tool (part of the sra-tools utility)' ) + fastq_dump_exitcode = os.system( 'fastq-dump --outdir ' + outdirpath + ' --fasta ' + accession.upper() ) + if fastq_dump_exitcode > 0: + printLog( outlogfile, '> FASTA file: FAILED ( \"' + accession.upper() + '\" will be excluded )' ) + else: + #os.rename( os.path.join( outdirpath, accession.upper() + '.fasta' ), os.path.join( outdirpath, accession.upper() + '_fasta' ) ) + printLog( outlogfile, '> FASTA file: \"' + accession.upper() + '.fasta\"' ) + accession_data = { + 'format': '.fasta', + 'filepath': os.path.join( outdirpath, accession.upper() + '.fasta' ), + 'filename': ''.join( c for c in accession.upper() if c in VALID_CHARS ) + } + downloaded_files[ accession.upper() ] = accession_data + return downloaded_files + +# format = { fasta, fastq, accession } +# this version skip the quality control procedure +def createSBT( options, args ): + outlogfile = str( options.outfile ) + outdirpath = str( options.outdir ) + if not outdirpath.endswith('/'): outdirpath += '/' + if not os.path.exists( outdirpath ): + os.mkdir( outdirpath ) + outdirpath = os.path.abspath( outdirpath ) + os.chdir( outdirpath ) + tooldirpath = os.path.abspath( str( options.tooldir ) ) + if not tooldirpath.endswith('/'): tooldirpath += '/' + + formats = [ fo for fo in str( options.formats ).split( '|' ) if fo.strip() ] + filepaths = [ filepath for filepath in str( options.filepaths ).split( '|' ) if filepath.strip() ] + filenames = [ filename for filename in str( options.filenames ).split( '|' ) if filename.strip() ] + compressed = [ True == int(c) for c in str( options.compressed ).split( '|' ) if c.strip() ] + minabundances = [ int(minab) for minab in str( options.minabundances ).split( '|' ) if minab.strip() ] + qualitythresholds = [ float(qthres) for qthres in str( options.qualitythresholds ).split( '|' ) if qthres.strip() ] + + klen = int( options.klen ) + bfsize = int( options.bfsize ) + + if len(formats) == len(filepaths) == len(filenames) == len(compressed) == len(minabundances) == len(qualitythresholds): + printLog( outlogfile, 'Retrieving experiments' ) + accessions = downloadAccessions( formats, filepaths, outlogfile, outdirpath ) + printLog( outlogfile, '> ' + str( len( accessions ) ) + ' experiments retrieved from the Sequence Read Archive' ) + acc_arr = [ a for a in accessions ] + print str( acc_arr ) + if bfsize < 0: # estimate bloom filter size + data_paths = ' '.join( accessions[ accession ][ 'filepath' ] for accession in accessions if 'filepath' in accessions[ accession ] ) + print data_paths + if len( data_paths ) > 0: + data_paths += ' ' + for dataset_idx in range(0, len(formats)): + if formats[ dataset_idx ] != 'accessions': + data_paths += ' '.join( path for path in filepaths[ dataset_idx ].split( ',' ) ) + # ntcard + printLog( outlogfile, 'Estimating the Bloom Filter size with ntcard' ) + if len( data_paths ) > 0: + ntcard_res_filepath = os.path.join( outdirpath, 'freq_k' + str( klen ) + '.hist' ) + ntcard_exitcode = os.system( 'ntcard --kmer=' + str( klen ) + ' ' + data_paths ) + print 'ntcard --kmer=' + str( klen ) + ' ' + data_paths + if ntcard_exitcode > 0: + printLog( outlogfile, '> [exitcode: ' + str(ntcard_exitcode) + '] an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True ) + else: + if os.path.exists( ntcard_res_filepath ): + os.rename( ntcard_res_filepath, os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' ) ) + ntcard_res_filepath = os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' ) + var_F0 = None + var_f1 = None + with open( ntcard_res_filepath ) as ntcard_res: + for line in ntcard_res: + line = line.strip() + if line: + line_split = line.split( '\t' ) + if len(line_split) == 2: + if line_split[0] == 'F0': + var_F0 = int( line_split[1] ) + elif line_split[0] == 'f1': + var_f1 = int( line_split[1] ) + if var_F0 is not None and var_f1 is not None: + break + if var_F0 is not None and var_f1 is not None: + bfsize = var_F0 - var_f1 + printLog( outlogfile, '> estimated Bloom Filter size: ' + str(bfsize) ) + else: + printLog( outlogfile, '> an error has occurred while estimating the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, '> an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, '> unable to estimate the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True ) + + if bfsize > 0: + for dataset_idx in range(0, len(formats)): + if formats[ dataset_idx ] == 'accessions': + with open( filepaths[ dataset_idx ] ) as accessions_file: + for line in accessions_file: + accession = line.split( '\t' )[0].strip().upper() + if accession in accessions: + curr_format = accessions[ accession ][ 'format' ] + curr_compressed = 'uncompress' + curr_filepath = accessions[ accession ][ 'filepath' ] + curr_filename = accessions[ accession ][ 'filename' ] + printLog( outlogfile, 'Processing \"' + accession + '\" ( format=\"' + curr_format + + '\", compressed=\"' + str(False) + '\", fixed_name=\"' + curr_filename + '\" )' ) + print 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1' + makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' + + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1' ) + if makebf_exitcode > 0: + printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + accession + '\" will be excluded )' ) + else: + printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename + '.bf\"' ) + else: + curr_format = '.' + formats[ dataset_idx ].lower() + curr_compressed = '.gz' if compressed[ dataset_idx ] else 'uncompress' + curr_filepaths = filepaths[ dataset_idx ].split( ',' ) + curr_filenames = filenames[ dataset_idx ].split( ',' ) + for curr_idx in range(0, len(curr_formats)): + curr_filename_fixed = ''.join( c for c in curr_filenames[ curr_idx ] if c in VALID_CHARS ) + printLog( outlogfile, 'Processing \"' + curr_filenames[ curr_idx ] + '\" ( format=\"' + curr_format + + '\", compressed=\"' + str(compressed[ dataset_idx ]) + '\", fixed_name=\"' + curr_filename_fixed + '\" )' ) + if compressed[ dataset_idx ]: + makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ dataset_idx ] + ' ' + curr_filename_fixed + ' ' + + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 1' ) + else: + makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ dataset_idx ] + ' ' + curr_filename_fixed + ' ' + + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 0' ) + if makebf_exitcode > 0: + printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + curr_filenames[ curr_idx ] + '\" will be excluded )' ) + else: + printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename_fixed + '.bf\"' ) + # Create a tree topology + printLog( outlogfile, 'Creating a tree topology file' ) + bf_counter = len( glob.glob1( outdirpath, '*.bf' ) ) + if bf_counter > 0: + cluster_exitcode = os.system( 'sh ' + tooldirpath + 'cluster.sh ' + outdirpath + ' ' + str( bfsize ) ) + if cluster_exitcode > 0: + printLog( outlogfile, '> [exitcode: ' + str(cluster_exitcode) + '] an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True ) + else: + # Build the HowDeSBT nodes + if os.path.exists( os.path.join( outdirpath, 'leafnames.txt' ) ): + printLog( outlogfile, 'Building the Bloom Filter files for the tree' ) + build_exitcode = os.system( 'sh ' + tooldirpath + 'build.sh ' + outdirpath ) + if build_exitcode > 0: + printLog( outlogfile, '> [exitcode: ' + str(build_exitcode) + '] an error has occurred during the creation of the Bloom Filter files for the tree', exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, '> the tree has been successfully built: \"howde.txt\"', exitcode=OK_EXIT_CODE, exit=True ) + ''' + howde_filepath = os.path.join( outdirpath, 'howde.txt' ) + howde_galaxy_filepath = os.path.join( outdirpath, 'howde_galaxy.txt' ) + howde_galaxy = open( howde_galaxy_filepath, 'w' ) + with open( howde_filepath ) as howde_file: + for line in howde_file: + line = line.strip() + if line: + # trim stars * and get node name + # find galaxy file path to the node name + # rewrite path with stars + howde_galaxy.close() + ''' + else: + printLog( outlogfile, '> an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, '> no Bloom Filter files found', exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, '> ERROR: the Bloom Filter size is ' + str( bfsize ), exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, 'Something went wrong with the input parameters', exitcode=ERR_EXIT_CODE, exit=True ) + +def __main__(): + # Parse the command line options + usage = ("Usage: create.py --formats file_formats --filepaths file_paths --filenames file_names " + "--compressed file_compressed --minabundance min_abundance --qualitythresholds quality_thresholds " + "--klen kmer_len --bfsize bloom_filter_size --outfile out_log_file_path --outdir out_dir_path") + parser = optparse.OptionParser(usage = usage) + parser.add_option("-v", "--version", action="store_true", dest="version", + default=False, help="display version and exit") + parser.add_option("-f", "--formats", type="string", + action="store", dest="formats", help="list of file formats separated by a tab character") + parser.add_option("-p", "--filepaths", type="string", + action="store", dest="filepaths", help="list of input file paths separated by a tab character") + parser.add_option("-n", "--filenames", type="string", + action="store", dest="filenames", help="list of input file names separated by a tab character") + parser.add_option("-c", "--compressed", type="string", + action="store", dest="compressed", help="list of compressed flags related to the imput files separated by a tab character") + parser.add_option("-m", "--minabundances", type="string", + action="store", dest="minabundances", help="list of blooom filter minimum abundances related to the imput files separated by a tab character") + parser.add_option("-q", "--qualitythresholds", type="string", + action="store", dest="qualitythresholds", help="list of quality thresholds related to the imput files separated by a tab character") + parser.add_option("-k", "--klen", type="int", default=21, + action="store", dest="klen", help="k-mer length") + parser.add_option("-b", "--bfsize", type="int", default=-1, + action="store", dest="bfsize", help="bloom filter size") + parser.add_option("-o", "--outfile", type="string", default="sbtres.txt", + action="store", dest="outfile", help="output log file path") + parser.add_option("-d", "--outdir", type="string", default="sbtres.txt", + action="store", dest="outdir", help="output directory path") + parser.add_option("-t", "--tooldir", type="string", default="./", + action="store", dest="tooldir", help="tool directory path") + + (options, args) = parser.parse_args() + if options.version: + print __version__ + else: + createSBT( options, args ) + +if __name__ == "__main__": __main__() diff -r be864d79c9c7 -r 7f712cc0d3d5 create.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/create.xml Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,173 @@ + + + a Sequence Bloom Tree + + macros.xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r be864d79c9c7 -r 7f712cc0d3d5 dataset.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dataset.tsv Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,11 @@ +SRR833714 +SRR833713 +SRR833715 +SRR567161 +SRR567146 +SRR191393 +SRR191449 +SRR191448 +SRR191447 +SRR191446 +SRR191445 diff -r be864d79c9c7 -r 7f712cc0d3d5 macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,16 @@ + + + + python + sra-tools + ntcard + howdesbt + + + + + + 10.1101/090464 + + + \ No newline at end of file diff -r be864d79c9c7 -r 7f712cc0d3d5 makebf.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/makebf.sh Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,33 @@ +#!/bin/bash + +expPath=$1 +expName=$2 +expFormat=$3 +expCompress=$4 + +outExpDir=$5 + +klen=$6 +minab=$7 +bfsize=$8 + +rmCompressed=$9 +rmSource=${10} + +cd ${outExpDir} + +if [ "${expCompress}" == ".gz" ]; then + gzip -dc ${expPath} > ${expName}${expFormat} + howdesbt makebf K=${klen} --min=${minab} --bits=${bfsize} ${expName}${expFormat} --out=${expName}.bf + if [ "${rmCompressed}" -eq "1" ]; then + rm ${expPath} + fi + if [ "${rmSource}" -eq "1" ]; then + rm ${expName}${expFormat} + fi +else + howdesbt makebf K=${klen} --min=${minab} --bits=${bfsize} ${expPath} --out=${expName}.bf + if [ "${rmSource}" -eq "1" ]; then + rm ${expPath} + fi +fi diff -r be864d79c9c7 -r 7f712cc0d3d5 query.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query.py Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,148 @@ +#!/usr/bin/env python + +import sys, os, optparse, shutil + +__version__ = "1.0.0" +VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' +# in the case of collections, exitcodes equal to 0 and 1 are not considered errors +ERR_EXIT_CODE = 2 +OK_EXIT_CODE = 0 + +def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ): + print message + with open( logfilepath, 'a+' ) as out: + out.write( message + '\n' ) + if exit: + sys.exit( exitcode ) + +def querySBT( options, args ): + output_dir_path = options.outputdir + outlogfile = options.outfile + + tree_file_paths = options.treep.split( ',' ) + tree_file_names = options.treen.split( ',' ) + tree_def_filepath = None + leafnames_filepath = None + for idx, tree_file_name in enumerate( tree_file_names ): + if tree_file_name == 'howde': + tree_def_filepath = tree_file_paths[ idx ] + elif tree_file_name == 'leafnames': + leafnames_filepath = tree_file_paths[ idx ] + if tree_def_filepath is not None and leafnames_filepath is not None: + break + + if tree_def_filepath is not None and leafnames_filepath is not None: + leafnames_counter = 0 + with open( leafnames_filepath ) as leafnames_file: + for line in leafnames_file: + if line.strip(): + leafnames_counter += 1 + if leafnames_counter > 0: + printLog( outlogfile, 'The selected collection contains a valid tree' ) + shutil.copyfile( tree_def_filepath, 'howde.txt' ) + tree_def_filepath = 'howde.txt' + for idx, tree_file_name in enumerate( tree_file_names ): + if tree_file_name.endswith( 'detbrief.rrr' ): + shutil.copyfile( tree_file_paths[ idx ], tree_file_name + '.bf' ) + + printLog( outlogfile, 'Creating batch of queries' ) + # create tmp batch file + batch_file_name = 'queries.fa' + batch_file = open( batch_file_name, 'w' ) + + comma_sep_file_paths = options.files + # check if options.files contains at least one file path + if comma_sep_file_paths is not None: + # split file paths + file_paths = comma_sep_file_paths.split(",") + # split file names + file_names = options.names.split(",") + for idx, file_path in enumerate(file_paths): + fixed_file_name = ''.join( c for c in file_names[ idx ] if c in VALID_CHARS ) + printLog( outlogfile, '> processing file ' + file_names[ idx ] + ' ( fixed_name=\"' + fixed_file_name + '\" ) ' ) + with open(file_path, 'r') as content_file: + for line in content_file: + line = line.strip() + if line: + line_split = line.strip().split("\t") # split on tab + if len(line_split) == 2: # 0:id , 1:seq , otherwise skip line + original_seq_id = line_split[0] + # fix seq_id using valid chars only + seq_id = ''.join( c for c in original_seq_id if c in VALID_CHARS ) + printLog( outlogfile, '> sequence ' + original_seq_id + ' ( fixed_name=\"' + seq_id + '\" )' ) + seq_text = line_split[1] + + # write on batch file + batch_file.write( '> ' + fixed_file_name + '_' + seq_id + '\n' + seq_text + '\n' ) + batch_file.close() + # query the tree + printLog( outlogfile, 'Querying the tree' ) + query_res_file_path = os.path.abspath( 'answer.txt' ) + sort_param = '--sort' + if options.sort == 0: + sort_param = '' + query_exitcode = os.system( 'howdesbt query --tree=' + os.path.abspath( tree_def_filepath ) + ' ' + os.path.abspath( batch_file_name ) + '=' + str(options.threshold) + ' --out=' + query_res_file_path ) + ' ' + sort_param + if query_exitcode > 0: + printLog( outlogfile, '> ERROR: an error has occurred while querying the tree with the sequence [id: ' + seq_id + '] in input file ' + file_names[ idx ] ) + else: + if os.path.exists( query_res_file_path ): + with open( query_res_file_path ) as query_res_file: + file_path = '' + theta_matches = 0 + for line in query_res_file: + line = line.strip() + if line: + if line.startswith( '*' ): + line_split = line.split( ' ' ) + theta_matches = int( line_split[ 1 ] ) + file_name = line_split[ 0 ].replace( '*', '' ) + file_path = os.path.join( output_dir_path, file_name + '_txt' ) + open( file_path, 'a' ).close() + else: + res_file = open( file_path, 'a+' ) + fraction = str( theta_matches ) + '/' + str( leafnames_counter ) + score = format( round( float( theta_matches ) / float( leafnames_counter ) , 6 ), '6f' ) + res_file.write( line + '\t' + fraction + '\t' + score + '\n' ) + res_file.close() + else: + printLog( outlogfile, 'An error has occurred while querying the tree', exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True ) + else: + printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True ) + +def __main__(): + # Parse the command line options + usage = "Usage: query.py --files comma_sep_file_paths --names comma_seq_file_names --sequences sequences_text --search search_mode --exact exact_alg --sthreshold threshold --outputdir output_dir_path" + parser = optparse.OptionParser(usage = usage) + parser.add_option("-v", "--version", action="store_true", dest="version", + default=False, help="display version and exit") + parser.add_option("-f", "--files", type="string", + action="store", dest="files", help="comma separated files path") + parser.add_option("-n", "--names", type="string", + action="store", dest="names", help="comma separated names associated to the files specified in --files") + parser.add_option("-k", "--treep", type="string", + action="store", dest="treep", help="paths of files in collection") + parser.add_option("-m", "--treen", type="string", + action="store", dest="treen", help="names of files in collection") + parser.add_option("-t", "--threshold", type="float", default=0.7, + action="store", dest="threshold", help="search threshold") + parser.add_option("-s", "--sort", type="int", default=1, + action="store", dest="sort", help="sort results") + parser.add_option("-o", "--outputdir", type="string", default="output", + action="store", dest="outputdir", help="output directory (collection) path") + parser.add_option("-r", "--outfile", type="string", default="query.txt", + action="store", dest="outfile", help="output log file path") + + (options, args) = parser.parse_args() + if options.version: + print __version__ + else: + # create output dir (collection) + output_dir_path = options.outputdir + if not os.path.exists(output_dir_path): + os.makedirs(output_dir_path) + + querySBT( options, args ) + +if __name__ == "__main__": __main__() diff -r be864d79c9c7 -r 7f712cc0d3d5 query.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query.tsv Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,3 @@ +0 CCAACCAAAGGGAAAACTTTTTTCCGACTTTGGCCTAAAGGGTTTAACGGCCAAGTCAGAAGGGAAAAAGTTGCGCCAAAAATGGCGTTAAAATGTGTAATCAGAGAAGCGACACGAAAAGGGGATCAGCTCTTGGCTGGCAATTGGTAGGTCAGAGGTGGATTGGGAAAAGGCAAGTCAGCAACTGTCGATGACGGCGACTGACTGTTAATGAAAATTGTTTTGGCTGTGTGGAAAAAAATACGCGGGAATCCGTGAATTTTCCGAGGAGCTGGTGGAGCGAAGAAAACGGGGTGCTGCTGTTGTAAATGATTGGTGAAAGTCACACGCCCGCAGCCTTGCCAAACTAATTAACGCCAAATGGAGCTAAGGCCTTTGAATGATGGCTGCAGGCTAGCTTATGAAAAGGGGTTGAAGAGAAGTGGAAAAATTGGTAGAAAGGGATTTGCTCAAGATGCC +1 TTAATGACAGGGCCACATGATGTGAAAAAAAATCAGAAACCGAGTCAACGTGAGAAGATAGTACGTACTACCGCAAATGAATGGCCATTTCATTTGCATGTTGGGAGCAACAGAAATGAGAGAGCATCCGAAGCTAACCACAAAAATGGACTTTGCTTCATTATGCACAAACACGCCAATAAATGTAACGAGAAAGATAGTAGGAGCGAAAGACGAGACGAGACAAACAGGAAGAAGACGAGTGGACGAGTGTTTTTTGTAACGAAACTCTTAATCGCTCCTTTGCAGGCTTAAGCTGATAGTTGCTACGTTTATGCCATGAATTTCAAGATCTCTCAAATGCGTGAAAATCCAGTTTATGCGACAGACAAATTCATGTATTTGAAAAATCTTAGCTGATAGAAATCAAAGGTGATT +2 CAATTAATGATAAATATTTTATAAGGTGCGGAAATAAAGTGAGGAATATCTTTTAAATTCAAGTTCAATTCTGAAAGC \ No newline at end of file diff -r be864d79c9c7 -r 7f712cc0d3d5 query.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query.xml Mon Mar 04 08:31:28 2019 -0500 @@ -0,0 +1,83 @@ + + + a Sequence Bloom Tree + + macros.xml + + + + + + + + + + + + + + + + + + + + + +