Mercurial > repos > fabio > btman
diff btman-1.0.0/create.py @ 18:be864d79c9c7 draft
Uploaded 20190304
author | fabio |
---|---|
date | Mon, 04 Mar 2019 08:30:03 -0500 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
"""create.py (btman) -- build a Sequence Bloom Tree with the HowDeSBT tool chain.

Pipeline: optionally download SRA accessions with fastq-dump, estimate the
Bloom Filter size with ntcard (when --bfsize < 0), build one Bloom Filter per
experiment via makebf.sh, then create the tree topology (cluster.sh) and the
tree nodes (build.sh).  Progress and errors are appended to the log file given
with --outfile.
"""

import sys, os, optparse, shutil, glob

__version__ = "1.0.0"
# in the case of collections, exitcodes equal to 0 and 1 are not considered errors
ERR_EXIT_CODE = 2
OK_EXIT_CODE = 0
# Characters kept when sanitising experiment names into file names.
VALID_CHARS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ):
    """Print *message*, append it to the log file, and optionally terminate.

    :param logfilepath: path of the log file (opened in append mode)
    :param message:     text to print and log
    :param exitcode:    process exit status used when *exit* is True
    :param exit:        when True, call sys.exit(exitcode) after logging
    """
    print( message )
    with open( logfilepath, 'a+' ) as out:
        out.write( message + '\n' )
    if exit:
        sys.exit( exitcode )

def downloadAccessions(formats, filepaths, outlogfile, outdirpath):
    """Download every SRA accession listed in the 'accessions' datasets.

    Each accession file is tab-separated with the accession id in the first
    column.  fastq-dump (sra-tools) writes one FASTA file per accession into
    *outdirpath*; failed downloads are logged and excluded.

    :returns: dict mapping ACCESSION -> {'format', 'filepath', 'filename'}
              for every successfully downloaded experiment
    """
    downloaded_files = { }
    for dataset_idx in range(0, len(formats)):
        if formats[ dataset_idx ] == 'accessions':
            print( filepaths[ dataset_idx ] )
            with open( filepaths[ dataset_idx ] ) as accessions:
                for line in accessions:
                    print( line )
                    accession = line.split( '\t' )[0].strip()
                    if accession:
                        printLog( outlogfile, 'Downloading \"' + accession.upper() + '\" with the fastq-dump tool (part of the sra-tools utility)' )
                        # NOTE(review): os.system with a string-built command is
                        # vulnerable to shell injection if the accession file is
                        # untrusted; consider subprocess.run([...], shell=False).
                        fastq_dump_exitcode = os.system( 'fastq-dump --outdir ' + outdirpath + ' --fasta ' + accession.upper() )
                        if fastq_dump_exitcode > 0:
                            printLog( outlogfile, '> FASTA file: FAILED ( \"' + accession.upper() + '\" will be excluded )' )
                        else:
                            printLog( outlogfile, '> FASTA file: \"' + accession.upper() + '.fasta\"' )
                            downloaded_files[ accession.upper() ] = {
                                'format': '.fasta',
                                'filepath': os.path.join( outdirpath, accession.upper() + '.fasta' ),
                                # strip characters that are unsafe in file names
                                'filename': ''.join( c for c in accession.upper() if c in VALID_CHARS )
                            }
    return downloaded_files

# format = { fasta, fastq, accession }
# this version skip the quality control procedure
def createSBT( options, args ):
    """Run the whole SBT construction pipeline driven by the parsed *options*.

    Exits the process via printLog(..., exit=True) on both success and error.
    """
    outlogfile = str( options.outfile )
    outdirpath = str( options.outdir )
    if not outdirpath.endswith('/'):
        outdirpath += '/'
    if not os.path.exists( outdirpath ):
        os.mkdir( outdirpath )
    outdirpath = os.path.abspath( outdirpath )
    os.chdir( outdirpath )
    tooldirpath = os.path.abspath( str( options.tooldir ) )
    if not tooldirpath.endswith('/'):
        tooldirpath += '/'

    # Every multi-value option arrives as a single '|'-separated string.
    formats = [ fo for fo in str( options.formats ).split( '|' ) if fo.strip() ]
    filepaths = [ filepath for filepath in str( options.filepaths ).split( '|' ) if filepath.strip() ]
    filenames = [ filename for filename in str( options.filenames ).split( '|' ) if filename.strip() ]
    compressed = [ True == int(c) for c in str( options.compressed ).split( '|' ) if c.strip() ]
    minabundances = [ int(minab) for minab in str( options.minabundances ).split( '|' ) if minab.strip() ]
    qualitythresholds = [ float(qthres) for qthres in str( options.qualitythresholds ).split( '|' ) if qthres.strip() ]

    klen = int( options.klen )
    bfsize = int( options.bfsize )

    # All the per-dataset lists must be aligned, otherwise the input is broken.
    if len(formats) == len(filepaths) == len(filenames) == len(compressed) == len(minabundances) == len(qualitythresholds):
        printLog( outlogfile, 'Retrieving experiments' )
        accessions = downloadAccessions( formats, filepaths, outlogfile, outdirpath )
        printLog( outlogfile, '> ' + str( len( accessions ) ) + ' experiments retrieved from the Sequence Read Archive' )
        acc_arr = [ a for a in accessions ]
        print( str( acc_arr ) )
        if bfsize < 0:  # estimate bloom filter size
            data_paths = ' '.join( accessions[ accession ][ 'filepath' ] for accession in accessions if 'filepath' in accessions[ accession ] )
            print( data_paths )
            if len( data_paths ) > 0:
                data_paths += ' '
            for dataset_idx in range(0, len(formats)):
                if formats[ dataset_idx ] != 'accessions':
                    data_paths += ' '.join( path for path in filepaths[ dataset_idx ].split( ',' ) )
            # ntcard
            printLog( outlogfile, 'Estimating the Bloom Filter size with ntcard' )
            if len( data_paths ) > 0:
                # ntcard writes its histogram as freq_k<K>.hist in the cwd
                ntcard_res_filepath = os.path.join( outdirpath, 'freq_k' + str( klen ) + '.hist' )
                ntcard_exitcode = os.system( 'ntcard --kmer=' + str( klen ) + ' ' + data_paths )
                print( 'ntcard --kmer=' + str( klen ) + ' ' + data_paths )
                if ntcard_exitcode > 0:
                    printLog( outlogfile, '> [exitcode: ' + str(ntcard_exitcode) + '] an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True )
                else:
                    if os.path.exists( ntcard_res_filepath ):
                        os.rename( ntcard_res_filepath, os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' ) )
                        ntcard_res_filepath = os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' )
                        # F0 = distinct k-mers, f1 = k-mers occurring exactly once
                        var_F0 = None
                        var_f1 = None
                        with open( ntcard_res_filepath ) as ntcard_res:
                            for line in ntcard_res:
                                line = line.strip()
                                if line:
                                    line_split = line.split( '\t' )
                                    if len(line_split) == 2:
                                        if line_split[0] == 'F0':
                                            var_F0 = int( line_split[1] )
                                        elif line_split[0] == 'f1':
                                            var_f1 = int( line_split[1] )
                                        if var_F0 is not None and var_f1 is not None:
                                            break
                        if var_F0 is not None and var_f1 is not None:
                            bfsize = var_F0 - var_f1
                            printLog( outlogfile, '> estimated Bloom Filter size: ' + str(bfsize) )
                        else:
                            printLog( outlogfile, '> an error has occurred while estimating the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True )
                    else:
                        printLog( outlogfile, '> an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True )
            else:
                printLog( outlogfile, '> unable to estimate the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True )

        if bfsize > 0:
            # Build one Bloom Filter per experiment with makebf.sh
            for dataset_idx in range(0, len(formats)):
                if formats[ dataset_idx ] == 'accessions':
                    with open( filepaths[ dataset_idx ] ) as accessions_file:
                        for line in accessions_file:
                            accession = line.split( '\t' )[0].strip().upper()
                            if accession in accessions:
                                curr_format = accessions[ accession ][ 'format' ]
                                curr_compressed = 'uncompress'
                                curr_filepath = accessions[ accession ][ 'filepath' ]
                                curr_filename = accessions[ accession ][ 'filename' ]
                                printLog( outlogfile, 'Processing \"' + accession + '\" ( format=\"' + curr_format +
                                          '\", compressed=\"' + str(False) + '\", fixed_name=\"' + curr_filename + '\" )' )
                                print( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1' )
                                makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' +
                                                             curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' +
                                                             str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1' )
                                if makebf_exitcode > 0:
                                    printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + accession + '\" will be excluded )' )
                                else:
                                    printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename + '.bf\"' )
                else:
                    curr_format = '.' + formats[ dataset_idx ].lower()
                    curr_compressed = '.gz' if compressed[ dataset_idx ] else 'uncompress'
                    curr_filepaths = filepaths[ dataset_idx ].split( ',' )
                    curr_filenames = filenames[ dataset_idx ].split( ',' )
                    # BUG FIX: the original iterated over the undefined name
                    # `curr_formats` (NameError) and indexed curr_filepaths with
                    # dataset_idx instead of curr_idx, processing the wrong file.
                    for curr_idx in range(0, len(curr_filepaths)):
                        curr_filename_fixed = ''.join( c for c in curr_filenames[ curr_idx ] if c in VALID_CHARS )
                        printLog( outlogfile, 'Processing \"' + curr_filenames[ curr_idx ] + '\" ( format=\"' + curr_format +
                                  '\", compressed=\"' + str(compressed[ dataset_idx ]) + '\", fixed_name=\"' + curr_filename_fixed + '\" )' )
                        if compressed[ dataset_idx ]:
                            makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ curr_idx ] + ' ' + curr_filename_fixed + ' ' +
                                                         curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' +
                                                         str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 1' )
                        else:
                            makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ curr_idx ] + ' ' + curr_filename_fixed + ' ' +
                                                         curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' +
                                                         str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 0' )
                        if makebf_exitcode > 0:
                            printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + curr_filenames[ curr_idx ] + '\" will be excluded )' )
                        else:
                            printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename_fixed + '.bf\"' )
            # Create a tree topology
            printLog( outlogfile, 'Creating a tree topology file' )
            bf_counter = len( glob.glob1( outdirpath, '*.bf' ) )
            if bf_counter > 0:
                cluster_exitcode = os.system( 'sh ' + tooldirpath + 'cluster.sh ' + outdirpath + ' ' + str( bfsize ) )
                if cluster_exitcode > 0:
                    printLog( outlogfile, '> [exitcode: ' + str(cluster_exitcode) + '] an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True )
                else:
                    # Build the HowDeSBT nodes
                    if os.path.exists( os.path.join( outdirpath, 'leafnames.txt' ) ):
                        printLog( outlogfile, 'Building the Bloom Filter files for the tree' )
                        build_exitcode = os.system( 'sh ' + tooldirpath + 'build.sh ' + outdirpath )
                        if build_exitcode > 0:
                            printLog( outlogfile, '> [exitcode: ' + str(build_exitcode) + '] an error has occurred during the creation of the Bloom Filter files for the tree', exitcode=ERR_EXIT_CODE, exit=True )
                        else:
                            printLog( outlogfile, '> the tree has been successfully built: \"howde.txt\"', exitcode=OK_EXIT_CODE, exit=True )
                    else:
                        printLog( outlogfile, '> an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True )
            else:
                printLog( outlogfile, '> no Bloom Filter files found', exitcode=ERR_EXIT_CODE, exit=True )
        else:
            printLog( outlogfile, '> ERROR: the Bloom Filter size is ' + str( bfsize ), exitcode=ERR_EXIT_CODE, exit=True )
    else:
        printLog( outlogfile, 'Something went wrong with the input parameters', exitcode=ERR_EXIT_CODE, exit=True )

def __main__():
    """Parse the command line and dispatch to createSBT (or print the version)."""
    usage = ("Usage: create.py --formats file_formats --filepaths file_paths --filenames file_names "
             "--compressed file_compressed --minabundance min_abundance --qualitythresholds quality_thresholds "
             "--klen kmer_len --bfsize bloom_filter_size --outfile out_log_file_path --outdir out_dir_path")
    parser = optparse.OptionParser(usage = usage)
    parser.add_option("-v", "--version", action="store_true", dest="version",
                      default=False, help="display version and exit")
    parser.add_option("-f", "--formats", type="string",
                      action="store", dest="formats", help="list of file formats separated by a tab character")
    parser.add_option("-p", "--filepaths", type="string",
                      action="store", dest="filepaths", help="list of input file paths separated by a tab character")
    parser.add_option("-n", "--filenames", type="string",
                      action="store", dest="filenames", help="list of input file names separated by a tab character")
    parser.add_option("-c", "--compressed", type="string",
                      action="store", dest="compressed", help="list of compressed flags related to the input files separated by a tab character")
    parser.add_option("-m", "--minabundances", type="string",
                      action="store", dest="minabundances", help="list of bloom filter minimum abundances related to the input files separated by a tab character")
    parser.add_option("-q", "--qualitythresholds", type="string",
                      action="store", dest="qualitythresholds", help="list of quality thresholds related to the input files separated by a tab character")
    parser.add_option("-k", "--klen", type="int", default=21,
                      action="store", dest="klen", help="k-mer length")
    parser.add_option("-b", "--bfsize", type="int", default=-1,
                      action="store", dest="bfsize", help="bloom filter size")
    parser.add_option("-o", "--outfile", type="string", default="sbtres.txt",
                      action="store", dest="outfile", help="output log file path")
    parser.add_option("-d", "--outdir", type="string", default="sbtres.txt",
                      action="store", dest="outdir", help="output directory path")
    parser.add_option("-t", "--tooldir", type="string", default="./",
                      action="store", dest="tooldir", help="tool directory path")

    (options, args) = parser.parse_args()
    if options.version:
        print( __version__ )
    else:
        createSBT( options, args )

if __name__ == "__main__": __main__()