comparison btman-1.0.0/create.py @ 18:be864d79c9c7 draft
Uploaded 20190304
| author | fabio |
|---|---|
| date | Mon, 04 Mar 2019 08:30:03 -0500 |
| parents | |
| children | |
comparison of 17:f02c2c58a6f9 with 18:be864d79c9c7:
```python
#!/usr/bin/env python

import sys, os, optparse, shutil, glob

__version__ = "1.0.0"
# in the case of collections, exitcodes equal to 0 and 1 are not considered errors
ERR_EXIT_CODE = 2
OK_EXIT_CODE = 0
VALID_CHARS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ):
    print message
    with open( logfilepath, 'a+' ) as out:
        out.write( message + '\n' )
    if exit:
        sys.exit( exitcode )

def downloadAccessions(formats, filepaths, outlogfile, outdirpath):
    downloaded_files = { }
    for dataset_idx in range(0, len(formats)):
        if formats[ dataset_idx ] == 'accessions':
            print filepaths[ dataset_idx ]
            with open( filepaths[ dataset_idx ] ) as accessions:
                for line in accessions:
                    print line
                    accession = line.split( '\t' )[0].strip()
                    if accession:
                        printLog( outlogfile, 'Downloading \"' + accession.upper() + '\" with the fastq-dump tool (part of the sra-tools utility)' )
                        fastq_dump_exitcode = os.system( 'fastq-dump --outdir ' + outdirpath + ' --fasta ' + accession.upper() )
                        if fastq_dump_exitcode > 0:
                            printLog( outlogfile, '> FASTA file: FAILED ( \"' + accession.upper() + '\" will be excluded )' )
                        else:
                            #os.rename( os.path.join( outdirpath, accession.upper() + '.fasta' ), os.path.join( outdirpath, accession.upper() + '_fasta' ) )
                            printLog( outlogfile, '> FASTA file: \"' + accession.upper() + '.fasta\"' )
                            accession_data = {
                                'format': '.fasta',
                                'filepath': os.path.join( outdirpath, accession.upper() + '.fasta' ),
                                'filename': ''.join( c for c in accession.upper() if c in VALID_CHARS )
                            }
                            downloaded_files[ accession.upper() ] = accession_data
    return downloaded_files

# format = { fasta, fastq, accessions }
# this version skips the quality control procedure
def createSBT( options, args ):
    outlogfile = str( options.outfile )
    outdirpath = str( options.outdir )
    if not outdirpath.endswith('/'): outdirpath += '/'
    if not os.path.exists( outdirpath ):
        os.mkdir( outdirpath )
    outdirpath = os.path.abspath( outdirpath )
    os.chdir( outdirpath )
    tooldirpath = os.path.abspath( str( options.tooldir ) )
    if not tooldirpath.endswith('/'): tooldirpath += '/'

    # split the '|'-separated parameters into parallel per-dataset lists
    formats = [ fo for fo in str( options.formats ).split( '|' ) if fo.strip() ]
    filepaths = [ filepath for filepath in str( options.filepaths ).split( '|' ) if filepath.strip() ]
    filenames = [ filename for filename in str( options.filenames ).split( '|' ) if filename.strip() ]
    compressed = [ True == int(c) for c in str( options.compressed ).split( '|' ) if c.strip() ]
    minabundances = [ int(minab) for minab in str( options.minabundances ).split( '|' ) if minab.strip() ]
    qualitythresholds = [ float(qthres) for qthres in str( options.qualitythresholds ).split( '|' ) if qthres.strip() ]

    klen = int( options.klen )
    bfsize = int( options.bfsize )

    if len(formats) == len(filepaths) == len(filenames) == len(compressed) == len(minabundances) == len(qualitythresholds):
        printLog( outlogfile, 'Retrieving experiments' )
        accessions = downloadAccessions( formats, filepaths, outlogfile, outdirpath )
        printLog( outlogfile, '> ' + str( len( accessions ) ) + ' experiments retrieved from the Sequence Read Archive' )
        acc_arr = [ a for a in accessions ]
        print str( acc_arr )
        if bfsize < 0: # estimate the Bloom Filter size
            data_paths = ' '.join( accessions[ accession ][ 'filepath' ] for accession in accessions if 'filepath' in accessions[ accession ] )
            print data_paths
            if len( data_paths ) > 0:
                data_paths += ' '
            for dataset_idx in range(0, len(formats)):
                if formats[ dataset_idx ] != 'accessions':
                    # fixed: keep a trailing space so paths of successive datasets do not run together
                    data_paths += ' '.join( path for path in filepaths[ dataset_idx ].split( ',' ) ) + ' '
            # ntcard: estimate the number of distinct k-mers (F0) and of k-mers occurring once (f1)
            printLog( outlogfile, 'Estimating the Bloom Filter size with ntcard' )
            if len( data_paths ) > 0:
                ntcard_res_filepath = os.path.join( outdirpath, 'freq_k' + str( klen ) + '.hist' )
                ntcard_exitcode = os.system( 'ntcard --kmer=' + str( klen ) + ' ' + data_paths )
                print 'ntcard --kmer=' + str( klen ) + ' ' + data_paths
                if ntcard_exitcode > 0:
                    printLog( outlogfile, '> [exitcode: ' + str(ntcard_exitcode) + '] an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True )
                else:
                    if os.path.exists( ntcard_res_filepath ):
                        os.rename( ntcard_res_filepath, os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' ) )
                        ntcard_res_filepath = os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' )
                        var_F0 = None
                        var_f1 = None
                        with open( ntcard_res_filepath ) as ntcard_res:
                            for line in ntcard_res:
                                line = line.strip()
                                if line:
                                    line_split = line.split( '\t' )
                                    if len(line_split) == 2:
                                        if line_split[0] == 'F0':
                                            var_F0 = int( line_split[1] )
                                        elif line_split[0] == 'f1':
                                            var_f1 = int( line_split[1] )
                                    if var_F0 is not None and var_f1 is not None:
                                        break
                        if var_F0 is not None and var_f1 is not None:
                            # estimated size: distinct k-mers minus those occurring only once
                            bfsize = var_F0 - var_f1
                            printLog( outlogfile, '> estimated Bloom Filter size: ' + str(bfsize) )
                        else:
                            printLog( outlogfile, '> an error has occurred while estimating the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True )
                    else:
                        printLog( outlogfile, '> an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True )
            else:
                printLog( outlogfile, '> unable to estimate the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True )

        if bfsize > 0:
            # build one Bloom Filter per input file with makebf.sh (HowDeSBT)
            for dataset_idx in range(0, len(formats)):
                if formats[ dataset_idx ] == 'accessions':
                    with open( filepaths[ dataset_idx ] ) as accessions_file:
                        for line in accessions_file:
                            accession = line.split( '\t' )[0].strip().upper()
                            if accession in accessions:
                                curr_format = accessions[ accession ][ 'format' ]
                                curr_compressed = 'uncompress'
                                curr_filepath = accessions[ accession ][ 'filepath' ]
                                curr_filename = accessions[ accession ][ 'filename' ]
                                printLog( outlogfile, 'Processing \"' + accession + '\" ( format=\"' + curr_format +
                                          '\", compressed=\"' + str(False) + '\", fixed_name=\"' + curr_filename + '\" )' )
                                print 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1'
                                makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' +
                                                             curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' +
                                                             str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1' )
                                if makebf_exitcode > 0:
                                    printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + accession + '\" will be excluded )' )
                                else:
                                    printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename + '.bf\"' )
                else:
                    curr_format = '.' + formats[ dataset_idx ].lower()
                    curr_compressed = '.gz' if compressed[ dataset_idx ] else 'uncompress'
                    curr_filepaths = filepaths[ dataset_idx ].split( ',' )
                    curr_filenames = filenames[ dataset_idx ].split( ',' )
                    # fixed: iterate over the files of the current dataset (was "curr_formats", which is undefined)
                    for curr_idx in range(0, len(curr_filenames)):
                        curr_filename_fixed = ''.join( c for c in curr_filenames[ curr_idx ] if c in VALID_CHARS )
                        printLog( outlogfile, 'Processing \"' + curr_filenames[ curr_idx ] + '\" ( format=\"' + curr_format +
                                  '\", compressed=\"' + str(compressed[ dataset_idx ]) + '\", fixed_name=\"' + curr_filename_fixed + '\" )' )
                        # fixed: index the file path with curr_idx (was dataset_idx) so paths and names stay aligned
                        if compressed[ dataset_idx ]:
                            makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ curr_idx ] + ' ' + curr_filename_fixed + ' ' +
                                                         curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' +
                                                         str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 1' )
                        else:
                            makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ curr_idx ] + ' ' + curr_filename_fixed + ' ' +
                                                         curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' +
                                                         str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 0' )
                        if makebf_exitcode > 0:
                            printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + curr_filenames[ curr_idx ] + '\" will be excluded )' )
                        else:
                            printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename_fixed + '.bf\"' )
            # Create a tree topology
            printLog( outlogfile, 'Creating a tree topology file' )
            bf_counter = len( glob.glob1( outdirpath, '*.bf' ) )
            if bf_counter > 0:
                cluster_exitcode = os.system( 'sh ' + tooldirpath + 'cluster.sh ' + outdirpath + ' ' + str( bfsize ) )
                if cluster_exitcode > 0:
                    printLog( outlogfile, '> [exitcode: ' + str(cluster_exitcode) + '] an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True )
                else:
                    # Build the HowDeSBT nodes
                    if os.path.exists( os.path.join( outdirpath, 'leafnames.txt' ) ):
                        printLog( outlogfile, 'Building the Bloom Filter files for the tree' )
                        build_exitcode = os.system( 'sh ' + tooldirpath + 'build.sh ' + outdirpath )
                        if build_exitcode > 0:
                            printLog( outlogfile, '> [exitcode: ' + str(build_exitcode) + '] an error has occurred during the creation of the Bloom Filter files for the tree', exitcode=ERR_EXIT_CODE, exit=True )
                        else:
                            printLog( outlogfile, '> the tree has been successfully built: \"howde.txt\"', exitcode=OK_EXIT_CODE, exit=True )
                            '''
                            howde_filepath = os.path.join( outdirpath, 'howde.txt' )
                            howde_galaxy_filepath = os.path.join( outdirpath, 'howde_galaxy.txt' )
                            howde_galaxy = open( howde_galaxy_filepath, 'w' )
                            with open( howde_filepath ) as howde_file:
                                for line in howde_file:
                                    line = line.strip()
                                    if line:
                                        # trim stars * and get node name
                                        # find galaxy file path to the node name
                                        # rewrite path with stars
                            howde_galaxy.close()
                            '''
                    else:
                        printLog( outlogfile, '> an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True )
            else:
                printLog( outlogfile, '> no Bloom Filter files found', exitcode=ERR_EXIT_CODE, exit=True )
        else:
            printLog( outlogfile, '> ERROR: the Bloom Filter size is ' + str( bfsize ), exitcode=ERR_EXIT_CODE, exit=True )
    else:
        printLog( outlogfile, 'Something went wrong with the input parameters', exitcode=ERR_EXIT_CODE, exit=True )

def __main__():
    # Parse the command line options
    usage = ("Usage: create.py --formats file_formats --filepaths file_paths --filenames file_names "
             "--compressed file_compressed --minabundances min_abundances --qualitythresholds quality_thresholds "
             "--klen kmer_len --bfsize bloom_filter_size --outfile out_log_file_path --outdir out_dir_path")
    parser = optparse.OptionParser(usage = usage)
    parser.add_option("-v", "--version", action="store_true", dest="version",
                      default=False, help="display version and exit")
    parser.add_option("-f", "--formats", type="string",
                      action="store", dest="formats", help="list of file formats separated by a '|' character")
    parser.add_option("-p", "--filepaths", type="string",
                      action="store", dest="filepaths", help="list of input file paths separated by a '|' character")
    parser.add_option("-n", "--filenames", type="string",
                      action="store", dest="filenames", help="list of input file names separated by a '|' character")
    parser.add_option("-c", "--compressed", type="string",
                      action="store", dest="compressed", help="list of compressed flags related to the input files, separated by a '|' character")
    parser.add_option("-m", "--minabundances", type="string",
                      action="store", dest="minabundances", help="list of Bloom Filter minimum abundances related to the input files, separated by a '|' character")
    parser.add_option("-q", "--qualitythresholds", type="string",
                      action="store", dest="qualitythresholds", help="list of quality thresholds related to the input files, separated by a '|' character")
    parser.add_option("-k", "--klen", type="int", default=21,
                      action="store", dest="klen", help="k-mer length")
    parser.add_option("-b", "--bfsize", type="int", default=-1,
                      action="store", dest="bfsize", help="Bloom Filter size (a negative value triggers estimation with ntcard)")
    parser.add_option("-o", "--outfile", type="string", default="sbtres.txt",
                      action="store", dest="outfile", help="output log file path")
    parser.add_option("-d", "--outdir", type="string", default="sbtres.txt",
                      action="store", dest="outdir", help="output directory path")
    parser.add_option("-t", "--tooldir", type="string", default="./",
                      action="store", dest="tooldir", help="tool directory path")

    (options, args) = parser.parse_args()
    if options.version:
        print __version__
    else:
        createSBT( options, args )

if __name__ == "__main__": __main__()
```
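For context, createSBT expects every per-dataset option as a single '|'-separated string, with all six lists the same length, and it shells out to fastq-dump, ntcard, and the makebf.sh/cluster.sh/build.sh helper scripts. The snippet below is only an illustrative sketch of that calling convention, not part of the repository: the input files, output locations, and abundance values are invented for the example, and a real run requires those external tools to be installed.

```python
# Illustrative invocation sketch only -- all file names and paths here are made up.
# It shows how the '|'-separated, per-dataset parameters of create.py line up.
import subprocess

formats           = ['fasta', 'accessions']            # 'accessions' triggers fastq-dump downloads
filepaths         = ['/data/reads1.fasta', '/data/sra_ids.tabular']
filenames         = ['reads1', 'sra_ids']
compressed        = ['0', '0']                         # parsed with int(): 1 means gzip-compressed
minabundances     = ['2', '2']
qualitythresholds = ['0.0', '0.0']                     # accepted but unused in this version

cmd = [
    'python', 'create.py',                             # the script targets Python 2 (print statements, optparse)
    '--formats',           '|'.join(formats),
    '--filepaths',         '|'.join(filepaths),
    '--filenames',         '|'.join(filenames),
    '--compressed',        '|'.join(compressed),
    '--minabundances',     '|'.join(minabundances),
    '--qualitythresholds', '|'.join(qualitythresholds),
    '--klen', '21',
    '--bfsize', '-1',                                  # negative size: estimate it with ntcard
    '--outfile', '/data/out/sbt.log',
    '--outdir', '/data/out/sbt',
    '--tooldir', '/path/to/btman/',                    # directory containing makebf.sh, cluster.sh, build.sh
]
subprocess.check_call(cmd)
```

With a negative --bfsize (the default), the filter size is derived from the ntcard histogram as F0 - f1; passing a positive value skips the estimation step.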
