view create.py @ 20:1dc3f0c61817 draft

Uploaded 20190304
author fabio
date Mon, 04 Mar 2019 09:14:04 -0500
parents 7f712cc0d3d5
children
line wrap: on
line source

#!/usr/bin/env python

import sys, os, optparse, shutil, glob

__version__ = "1.0.0"
# in the case of collections, exitcodes equal to 0 and 1 are not considered errors
ERR_EXIT_CODE = 2
OK_EXIT_CODE = 0
VALID_CHARS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ):
    print message
    with open( logfilepath, 'a+' ) as out:
        out.write( message + '\n' )
    if exit:
        sys.exit( exitcode )

def downloadAccessions(formats, filepaths, outlogfile, outdirpath):
    downloaded_files = { }
    for dataset_idx in range(0, len(formats)):
        if formats[ dataset_idx ] == 'accessions':
            print filepaths[ dataset_idx ]
            with open( filepaths[ dataset_idx ] ) as accessions:
                for line in accessions:
                    print line
                    accession = line.split( '\t' )[0].strip()
                    if accession:
                        printLog( outlogfile, 'Downloading \"' + accession.upper() + '\" with the fastq-dump tool (part of the sra-tools utility)' )
                        fastq_dump_exitcode = os.system( 'fastq-dump --outdir ' + outdirpath + ' --fasta ' + accession.upper() )
                        if fastq_dump_exitcode > 0:
                            printLog( outlogfile, '> FASTA file: FAILED ( \"' + accession.upper() + '\" will be excluded )' )
                        else:
                            #os.rename( os.path.join( outdirpath, accession.upper() + '.fasta' ), os.path.join( outdirpath, accession.upper() + '_fasta' ) )
                            printLog( outlogfile, '> FASTA file: \"' + accession.upper() + '.fasta\"' )
                            accession_data = {
                                'format': '.fasta',
                                'filepath': os.path.join( outdirpath, accession.upper() + '.fasta' ),
                                'filename': ''.join( c for c in accession.upper() if c in VALID_CHARS )
                            }
                            downloaded_files[ accession.upper() ] = accession_data
    return downloaded_files

# format = { fasta, fastq, accession }
# this version skip the quality control procedure
def createSBT( options, args ):
    outlogfile = str( options.outfile )
    outdirpath = str( options.outdir )
    if not outdirpath.endswith('/'): outdirpath += '/'
    if not os.path.exists( outdirpath ):
        os.mkdir( outdirpath )
    outdirpath = os.path.abspath( outdirpath )
    os.chdir( outdirpath )
    tooldirpath = os.path.abspath( str( options.tooldir ) )
    if not tooldirpath.endswith('/'): tooldirpath += '/'
    
    formats = [ fo for fo in str( options.formats ).split( '|' ) if fo.strip() ]
    filepaths = [ filepath for filepath in str( options.filepaths ).split( '|' ) if filepath.strip() ]
    filenames = [ filename for filename in str( options.filenames ).split( '|' ) if filename.strip() ]
    compressed = [ True == int(c) for c in str( options.compressed ).split( '|' ) if c.strip() ]
    minabundances = [ int(minab) for minab in str( options.minabundances ).split( '|' ) if minab.strip() ]
    qualitythresholds = [ float(qthres) for qthres in str( options.qualitythresholds ).split( '|' ) if qthres.strip() ]

    klen = int( options.klen )
    bfsize = int( options.bfsize )

    if len(formats) == len(filepaths) == len(filenames) == len(compressed) == len(minabundances) == len(qualitythresholds):
        printLog( outlogfile, 'Retrieving experiments' )
        accessions = downloadAccessions( formats, filepaths, outlogfile, outdirpath )
        printLog( outlogfile, '> ' + str( len( accessions ) ) + ' experiments retrieved from the Sequence Read Archive' )
        acc_arr = [ a for a in accessions ]
        print str( acc_arr )
        if bfsize < 0: # estimate bloom filter size
            data_paths = ' '.join( accessions[ accession ][ 'filepath' ] for accession in accessions if 'filepath' in accessions[ accession ] )
            print data_paths
            if len( data_paths ) > 0:
                data_paths += ' '
            for dataset_idx in range(0, len(formats)):
                if formats[ dataset_idx ] != 'accessions':
                    data_paths += ' '.join( path for path in filepaths[ dataset_idx ].split( ',' ) )
            # ntcard
            printLog( outlogfile, 'Estimating the Bloom Filter size with ntcard' )
            if len( data_paths ) > 0:
                ntcard_res_filepath = os.path.join( outdirpath, 'freq_k' + str( klen ) + '.hist' )
                ntcard_exitcode = os.system( 'ntcard --kmer=' + str( klen ) + ' ' + data_paths )
                print 'ntcard --kmer=' + str( klen ) + ' ' + data_paths
                if ntcard_exitcode > 0:
                    printLog( outlogfile, '> [exitcode: ' + str(ntcard_exitcode) + '] an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True )
                else:
                    if os.path.exists( ntcard_res_filepath ):
                        os.rename( ntcard_res_filepath, os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' ) )
                        ntcard_res_filepath = os.path.join( outdirpath, 'ntcard' + str( klen ) + '.txt' )
                        var_F0 = None
                        var_f1 = None
                        with open( ntcard_res_filepath ) as ntcard_res:
                            for line in ntcard_res:
                                line = line.strip()
                                if line:
                                    line_split = line.split( '\t' )
                                    if len(line_split) == 2:
                                        if line_split[0] == 'F0':
                                            var_F0 = int( line_split[1] )
                                        elif line_split[0] == 'f1':
                                            var_f1 = int( line_split[1] )
                                if var_F0 is not None and var_f1 is not None:
                                    break
                        if var_F0 is not None and var_f1 is not None:
                            bfsize = var_F0 - var_f1
                            printLog( outlogfile, '> estimated Bloom Filter size: ' + str(bfsize) )
                        else:
                            printLog( outlogfile, '> an error has occurred while estimating the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True )
                    else:
                        printLog( outlogfile, '> an error with ntcard has occurred', exitcode=ERR_EXIT_CODE, exit=True )
            else:
                printLog( outlogfile, '> unable to estimate the Bloom Filter size', exitcode=ERR_EXIT_CODE, exit=True )

        if bfsize > 0:
            for dataset_idx in range(0, len(formats)):
                if formats[ dataset_idx ] == 'accessions':
                    with open( filepaths[ dataset_idx ] ) as accessions_file:
                        for line in accessions_file:
                            accession = line.split( '\t' )[0].strip().upper()
                            if accession in accessions:
                                curr_format = accessions[ accession ][ 'format' ]
                                curr_compressed = 'uncompress'
                                curr_filepath = accessions[ accession ][ 'filepath' ]
                                curr_filename = accessions[ accession ][ 'filename' ]
                                printLog( outlogfile, 'Processing \"' + accession + '\" ( format=\"' + curr_format + 
                                                        '\", compressed=\"' + str(False) + '\", fixed_name=\"' + curr_filename + '\" )' )
                                print 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' + curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1'
                                makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepath + ' ' + curr_filename + ' ' +
                                                                curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + 
                                                                str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 1 1' )
                                if makebf_exitcode > 0:
                                    printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + accession + '\" will be excluded )' )
                                else:
                                    printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename + '.bf\"' )
                else:
                    curr_format = '.' + formats[ dataset_idx ].lower()
                    curr_compressed = '.gz' if compressed[ dataset_idx ] else 'uncompress'
                    curr_filepaths = filepaths[ dataset_idx ].split( ',' )
                    curr_filenames = filenames[ dataset_idx ].split( ',' )
                    for curr_idx in range(0, len(curr_formats)):
                        curr_filename_fixed = ''.join( c for c in curr_filenames[ curr_idx ] if c in VALID_CHARS )
                        printLog( outlogfile, 'Processing \"' + curr_filenames[ curr_idx ] + '\" ( format=\"' + curr_format + 
                                            '\", compressed=\"' + str(compressed[ dataset_idx ]) + '\", fixed_name=\"' + curr_filename_fixed + '\" )' )
                        if compressed[ dataset_idx ]:
                            makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ dataset_idx ] + ' ' + curr_filename_fixed + ' ' +
                                                        curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + 
                                                        str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 1' )
                        else:
                            makebf_exitcode = os.system( 'sh ' + tooldirpath + 'makebf.sh ' + curr_filepaths[ dataset_idx ] + ' ' + curr_filename_fixed + ' ' +
                                                        curr_format + ' ' + str(curr_compressed) + ' ' + outdirpath + ' ' + str( klen ) + ' ' + 
                                                        str( minabundances[ dataset_idx ] ) + ' ' + str( bfsize ) + ' 0 0' )
                        if makebf_exitcode > 0:
                            printLog( outlogfile, '> [exitcode: ' + str(makebf_exitcode) + '] Bloom Filter file: FAILED ( \"' + curr_filenames[ curr_idx ] + '\" will be excluded )' )
                        else:
                            printLog( outlogfile, '> Bloom Filter file: \"' + curr_filename_fixed + '.bf\"' )
            # Create a tree topology
            printLog( outlogfile, 'Creating a tree topology file' )
            bf_counter = len( glob.glob1( outdirpath, '*.bf' ) )
            if bf_counter > 0:
                cluster_exitcode = os.system( 'sh ' + tooldirpath + 'cluster.sh ' + outdirpath + ' ' + str( bfsize ) )
                if cluster_exitcode > 0:
                    printLog( outlogfile, '> [exitcode: ' + str(cluster_exitcode) + '] an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True )
                else:
                    # Build the HowDeSBT nodes
                    if os.path.exists( os.path.join( outdirpath, 'leafnames.txt' ) ):
                        printLog( outlogfile, 'Building the Bloom Filter files for the tree' )
                        build_exitcode = os.system( 'sh ' + tooldirpath + 'build.sh ' + outdirpath )
                        if build_exitcode > 0:
                            printLog( outlogfile, '> [exitcode: ' + str(build_exitcode) + '] an error has occurred during the creation of the Bloom Filter files for the tree', exitcode=ERR_EXIT_CODE, exit=True )
                        else:
                            printLog( outlogfile, '> the tree has been successfully built: \"howde.txt\"', exitcode=OK_EXIT_CODE, exit=True )
                            '''
                            howde_filepath = os.path.join( outdirpath, 'howde.txt' )
                            howde_galaxy_filepath = os.path.join( outdirpath, 'howde_galaxy.txt' )
                            howde_galaxy = open( howde_galaxy_filepath, 'w' )
                            with open( howde_filepath ) as howde_file:
                                for line in howde_file:
                                    line = line.strip()
                                    if line:
                                        # trim stars * and get node name
                                        # find galaxy file path to the node name
                                        # rewrite path with stars
                            howde_galaxy.close()
                            '''
                    else:
                        printLog( outlogfile, '> an error has occurred during the creation of the topology file', exitcode=ERR_EXIT_CODE, exit=True )
            else:
                printLog( outlogfile, '> no Bloom Filter files found', exitcode=ERR_EXIT_CODE, exit=True )
        else:
            printLog( outlogfile, '> ERROR: the Bloom Filter size is ' + str( bfsize ), exitcode=ERR_EXIT_CODE, exit=True )
    else:
        printLog( outlogfile, 'Something went wrong with the input parameters', exitcode=ERR_EXIT_CODE, exit=True )

def __main__():
    # Parse the command line options
    usage = ("Usage: create.py --formats file_formats --filepaths file_paths --filenames file_names "
            "--compressed file_compressed --minabundance min_abundance --qualitythresholds quality_thresholds "
            "--klen kmer_len --bfsize bloom_filter_size --outfile out_log_file_path --outdir out_dir_path")
    parser = optparse.OptionParser(usage = usage)
    parser.add_option("-v", "--version", action="store_true", dest="version",
                    default=False, help="display version and exit")
    parser.add_option("-f", "--formats", type="string",
                    action="store", dest="formats", help="list of file formats separated by a tab character")
    parser.add_option("-p", "--filepaths", type="string",
                    action="store", dest="filepaths", help="list of input file paths separated by a tab character")
    parser.add_option("-n", "--filenames", type="string",
                    action="store", dest="filenames", help="list of input file names separated by a tab character")
    parser.add_option("-c", "--compressed", type="string",
                    action="store", dest="compressed", help="list of compressed flags related to the imput files separated by a tab character")
    parser.add_option("-m", "--minabundances", type="string",
                    action="store", dest="minabundances", help="list of blooom filter minimum abundances related to the imput files separated by a tab character")
    parser.add_option("-q", "--qualitythresholds", type="string",
                    action="store", dest="qualitythresholds", help="list of quality thresholds related to the imput files separated by a tab character")
    parser.add_option("-k", "--klen", type="int", default=21,
                    action="store", dest="klen", help="k-mer length")
    parser.add_option("-b", "--bfsize", type="int", default=-1,
                    action="store", dest="bfsize", help="bloom filter size")
    parser.add_option("-o", "--outfile", type="string", default="sbtres.txt",
                    action="store", dest="outfile", help="output log file path")
    parser.add_option("-d", "--outdir", type="string", default="sbtres.txt",
                    action="store", dest="outdir", help="output directory path")
    parser.add_option("-t", "--tooldir", type="string", default="./",
                    action="store", dest="tooldir", help="tool directory path")

    (options, args) = parser.parse_args()
    if options.version:
        print __version__
    else:
        createSBT( options, args )

if __name__ == "__main__": __main__()