Mercurial > repos > rmarenco > hubarchivecreator
diff hubArchiveCreator.py @ 29:7e8a8b732db3 draft
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 1a81ebd0ddea950b84af3fc830e9267a4814b29f
| field    | value                           |
|----------|---------------------------------|
| author   | yating-l                        |
| date     | Wed, 16 May 2018 18:04:20 -0400 |
| parents  | fcc1021bd496                    |
| children |                                 |
line wrap: on
line diff
--- a/hubArchiveCreator.py Mon Jul 10 17:08:38 2017 -0400 +++ b/hubArchiveCreator.py Wed May 16 18:04:20 2018 -0400 @@ -5,7 +5,7 @@ This Galaxy tool permits to prepare your files to be ready for Assembly Hub visualization. Program test arguments: -hubArchiveCreator.py -g test-data/augustusDbia3.gff3 -f test-data/dbia3.fa -d . -u ./tools -o output.html +hubArchiveCreator.py -g test-data/augustusDbia3.gff3 -f '{"false_path": "./test-data/common/dbia3.fa", "name":"dbia3"}' -d . -u ./tools -o output.html """ import argparse @@ -16,324 +16,75 @@ import sys # Internal dependencies -from Bam import Bam -from BedSimpleRepeats import BedSimpleRepeats -from BedSpliceJunctions import BedSpliceJunctions -from Bed import Bed -from cytoBand import cytoBand -from BigWig import BigWig -from util.Fasta import Fasta -from util.Filters import TraceBackFormatter -from Gff3 import Gff3 -from Gtf import Gtf -from Psl import Psl +from util.Reader import Reader +from util.Logger import Logger from TrackHub import TrackHub -from bigPsl import bigPsl -from BedBlastAlignments import BedBlastAlignments -from BigBed import BigBed + + # TODO: Verify each subprocessed dependency is accessible [gff3ToGenePred, genePredToBed, twoBitInfo, faToTwoBit, bedToBigBed, sort def main(argv): + # Command Line parsing init parser = argparse.ArgumentParser(description='Create a foo.txt inside the given folder.') - - # Reference genome mandatory - parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome') - - # GFF3 Management - parser.add_argument('--gff3', action='append', help='GFF3 format') - - # GTF Management - parser.add_argument('--gtf', action='append', help='GTF format') - - # Bed4+12 (TrfBig) - parser.add_argument('--bedSimpleRepeats', action='append', help='Bed4+12 format, using simpleRepeats.as') - - # Bed12+1 (regtools) - parser.add_argument('--bedSpliceJunctions', action='append', help='Bed12+1 format, using spliceJunctions.as') - - # Generic Bed (Blastx transformed to bed) 
- parser.add_argument('--bed', action='append', help='Bed generic format') - - #cytoBandIdeo - parser.add_argument('--cytoBand', action='append', help='Cytoband Track, using cytoBandIdeo.as') - - # BigPsl (blat alignment) - parser.add_argument('--bigpsl', action='append', help='bigPsl format, using bigPsl.as') - - # Bed12+12 (tblastn alignment) - parser.add_argument('--bedBlastAlignments', action='append', help='Bed12+12 format, using bigPsl.as') + parser.add_argument('-j', '--data_json', help='JSON file containing the metadata of the inputs') + parser.add_argument('-o', '--output', help='Name of the HTML summarizing the content of the Track Hub Archive') + + # Get the args passed in parameter + args = parser.parse_args() + json_inputs_data = args.data_json + outputFile = args.output - # BigWig Management - parser.add_argument('--bigwig', action='append', help='BigWig format') - - # Bam Management - parser.add_argument('--bam', action='append', help='Bam format') - - # Psl Management - parser.add_argument('--psl', action='append', help='Psl format') - - # BigBed Management - parser.add_argument('--bigbed', action='append', help='BigBed format') - - # TODO: Check if the running directory can have issues if we run the tool outside - parser.add_argument('-d', '--directory', - help='Running tool directory, where to find the templates. Default is running directory') - parser.add_argument('-u', '--ucsc_tools_path', - help='Directory where to find the executables needed to run this tool') - parser.add_argument('-e', '--extra_files_path', - help='Name, in galaxy, of the output folder. 
Where you would want to build the Track Hub Archive') - parser.add_argument('-o', '--output', help='Name of the HTML summarizing the content of the Track Hub Archive') - - parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs') - - parser.add_argument('--user_email', help='Email of the user who launched the Hub Archive Creation') - - parser.add_argument('--genome_name', help='UCSC Genome Browser assembly ID') - - parser.add_argument('--debug_mode', action='store_true', help='Allow more details about the errors') + ##Parse JSON file with Reader + reader = Reader(json_inputs_data) # Begin init variables - - toolDirectory = '.' - extra_files_path = '.' - - # Get the args passed in parameter - args = parser.parse_args() - - extra_files_path = args.extra_files_path - toolDirectory = args.directory + extra_files_path = reader.getExtFilesPath() + toolDirectory = reader.getToolDir() + #outputFile = reader.getOutputDir() + user_email = reader.getUserEmail() + reference_genome = reader.getRefGenome() + debug_mode = reader.getDebugMode() #### Logging management #### # If we are in Debug mode, also print in stdout the debug dump - - configure_logger(extra_files_path=extra_files_path, debug=args.debug_mode) - + log = Logger(tool_directory=toolDirectory, debug=debug_mode, extra_files_path=extra_files_path) + log.setup_logging() + logging.info('#### HubArchiveCreator: Start ####\n') + logging.debug('---- Welcome in HubArchiveCreator Debug Mode ----\n') + logging.debug('JSON parameters: %s\n\n', json.dumps(reader.args)) #### END Logging management #### - array_inputs_reference_genome = json.loads(args.fasta) - - # TODO: Replace these with the object Fasta - input_fasta_file = array_inputs_reference_genome["false_path"] - input_fasta_file_name = sanitize_name_input(array_inputs_reference_genome["name"]) - genome_name = sanitize_name_input(args.genome_name) - - reference_genome = Fasta(input_fasta_file, - input_fasta_file_name, genome_name) - - 
user_email = args.user_email - - - # TODO: Use a class to have a better management of the structure of these inputs - # These inputs are populated in the Galaxy Wrapper xml and are in this format: - # ARRAY[DICT{FILE_PATH: DICT{NAME: NAME_VALUE, EXTRA_DATA: EXTRA_DATA_VALUE}}] - # EXTRA_DATA could be anything, for example the index of a BAM => {"index", FILE_PATH} - array_inputs_bam = args.bam - array_inputs_bed_generic = args.bed - array_inputs_bed_cytoBand = args.cytoBand - array_inputs_bed_simple_repeats = args.bedSimpleRepeats - array_inputs_bed_splice_junctions = args.bedSpliceJunctions - array_inputs_bigwig = args.bigwig - array_inputs_gff3 = args.gff3 - array_inputs_gtf = args.gtf - array_inputs_psl = args.psl - array_inputs_bigpsl = args.bigpsl - array_inputs_bed_blast_alignments = args.bedBlastAlignments - array_inputs_bigbed = args.bigbed - - outputFile = args.output - - json_inputs_data = args.data_json - - # TODO: Instead use a class to properly store the objects, with object_hook - inputs_data = json.loads(json_inputs_data) - # We remove the spaces in ["name"] of inputs_data - sanitize_name_inputs(inputs_data) - - # TODO: Check here all the binaries / tools we need. 
Exception if missing - # Create the Track Hub folder + logging.info('#### HubArchiveCreator: Creating the Track Hub folder ####\n') trackHub = TrackHub(reference_genome, user_email, outputFile, extra_files_path, toolDirectory) - all_datatype_dictionary = {} - - for (inputs, datatype_class) in [ - (array_inputs_bam, Bam), - (array_inputs_bed_generic, Bed), - (array_inputs_bed_cytoBand, cytoBand), - (array_inputs_bigwig, BigWig), - (array_inputs_bed_simple_repeats, BedSimpleRepeats), - (array_inputs_bed_splice_junctions, BedSpliceJunctions), - (array_inputs_gff3, Gff3), - (array_inputs_gtf, Gtf), - (array_inputs_psl, Psl), - (array_inputs_bigpsl, bigPsl), - (array_inputs_bed_blast_alignments, BedBlastAlignments), - (array_inputs_bigbed, BigBed)]: - if inputs: - all_datatype_dictionary.update(create_ordered_datatype_objects(datatype_class, inputs, inputs_data)) - # Create Ordered Dictionary to add the tracks in the tool form order + logging.info('#### HubArchiveCreator: Preparing track data ####\n') + all_datatype_dictionary = reader.getTracksData() all_datatype_ordered_dictionary = collections.OrderedDict(all_datatype_dictionary) logging.debug("----- End of all_datatype_dictionary processing -----") - logging.debug("all_datatype_ordered_dictionary keys are: {0}".format(all_datatype_ordered_dictionary.values())) + #logging.debug("all_datatype_ordered_dictionary are: %s", json.dumps(all_datatype_ordered_dictionary)) + logging.info('#### HubArchiveCreator: Adding tracks to Track Hub ####\n') logging.debug("----- Beginning of Track adding processing -----") + for index, datatypeObject in all_datatype_ordered_dictionary.iteritems(): - trackHub.addTrack(datatypeObject.track.trackDb) + trackHub.addTrack(datatypeObject.track.track_db) + logging.debug("----- End of Track adding processing -----") - # We process all the modifications to create the zip file - #trackHub.createZip() - - # We terminate le process and so create a HTML file summarizing all the files + # We terminate 
the process and so create a HTML file summarizing all the files + logging.info('#### HubArchiveCreator: Creating the HTML file ####\n') trackHub.terminate() - logging.debug('#### End of HubArchiveCreator Debug Mode: Bye! ####') + logging.debug('---- End of HubArchiveCreator Debug Mode: Bye! ----\n') + logging.info('#### HubArchiveCreator: Congratulation! Assembly Hub is created! ####\n') sys.exit(0) - -def sanitize_name_input(string_to_sanitize): - """ - Sanitize the string passed in parameter by replacing '/' and ' ' by '_' - - :param string_to_sanitize: - :return : - - :Example: - - >>> sanitize_name_input('this/is an//example') - this_is_an__example - """ - return string_to_sanitize \ - .replace("/", "_") \ - .replace(" ", "_") - - -def sanitize_name_inputs(inputs_data): - """ - Sanitize value of the keys "name" of the dictionary passed in parameter. - - Because sometimes output from Galaxy, or even just file name, from user inputs, have spaces. - Also, it can contain '/' character and could break the use of os.path function. 
- - :param inputs_data: dict[string, dict[string, string]] - """ - for key in inputs_data: - inputs_data[key]["name"] = sanitize_name_input(inputs_data[key]["name"]) - - -def create_ordered_datatype_objects(ExtensionClass, array_inputs, inputs_data): - """ - Function which executes the creation all the necessary files / folders for a special Datatype, for TrackHub - and update the dictionary of datatype - - :param ExtensionClass: - :param array_inputs: - :param inputs_data: - :type ExtensionClass: Datatype - :type array_inputs: list[string] - :type inputs_data: dict - :rtype: dict - """ - - datatype_dictionary = {} - - # TODO: Optimize this double loop - for input_false_path in array_inputs: - for key, data_value in inputs_data.items(): - if key == input_false_path: - logging.debug("input_false_path: " + input_false_path) - logging.debug("data_value: " + str(data_value)) - extensionObject = ExtensionClass(input_false_path, data_value) - datatype_dictionary.update({data_value["order_index"]: extensionObject}) - return datatype_dictionary - -def configure_logger(extra_files_path=None, debug=False): - if not extra_files_path: - raise Exception("Extra files path is not set. 
Stopping the application") - - - # All case log: log everything in a .log file - logger_file_name = ''.join([__name__, '.log']) - logging_file_path = os.path.join(extra_files_path, logger_file_name) - - logging.basicConfig(filename=logging_file_path, level=logging.DEBUG) - - log_stdout = logging.StreamHandler(sys.stdout) - if not debug: - configure_logger_user(log_stdout) - else: - configure_logger_dev(log_stdout) - - # stderr configuration - configure_logger_stderr() - - logging.debug('#### Welcome in HubArchiveCreator Debug Mode ####\n') - -def configure_logger_user(log_stdout=None): - """ - User Logger is defined as following: - - User needs to have WARN, ERROR and CRITICAL but well formatted / without traceback - in STDOUT - - Still access to full, brute and traceback for errors - in STDERR - - And further access to debug if needed - in .log - - """ - - if not log_stdout: - raise Exception("No log_stdout given. Stopping the application") - - # stdout for INFO / WARN / ERROR / CRITICAL - log_stdout.setLevel(logging.INFO) - - formatter = TraceBackFormatter('%(message)s') - - log_stdout.setFormatter(formatter) - - logging.getLogger().addHandler(log_stdout) - -def configure_logger_dev(log_stdout=None): - """ - Dev Logger is defined as following: - - Dev needs to have WARN, ERROR and CRITICAL but well formatted / without traceback, in stdout - - Still access to full, brute and traceback in stderr for errors - - And further access to debug if needed - - """ - if not log_stdout: - raise Exception("No log_stdout given. 
Stopping the application") - log_format = '%(message)s' - - # stdout and stderr and both identical for INFO / WARN / ERROR / CRITICAL - log_stdout.setLevel(logging.DEBUG) - - formatter = logging.Formatter(log_format) - - log_stdout.setFormatter(formatter) - - logging.getLogger().addHandler(log_stdout) - -def configure_logger_stderr(): - """ - Configure what should be logged in stderr - """ - log_error = logging.StreamHandler(sys.stderr) - log_error.setLevel(logging.ERROR) - log_error_format = '%(message)s' - - formatter_error = logging.Formatter(log_error_format) - - log_error.setFormatter(formatter_error) - - logging.getLogger().addHandler(log_error) - if __name__ == "__main__": - logging.getLogger(__name__) main(sys.argv)