diff hubArchiveCreator.py @ 29:7e8a8b732db3 draft

planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 1a81ebd0ddea950b84af3fc830e9267a4814b29f
author yating-l
date Wed, 16 May 2018 18:04:20 -0400
parents fcc1021bd496
children
line wrap: on
line diff
--- a/hubArchiveCreator.py	Mon Jul 10 17:08:38 2017 -0400
+++ b/hubArchiveCreator.py	Wed May 16 18:04:20 2018 -0400
@@ -5,7 +5,7 @@
 This Galaxy tool permits to prepare your files to be ready for
 Assembly Hub visualization.
 Program test arguments:
-hubArchiveCreator.py -g test-data/augustusDbia3.gff3 -f test-data/dbia3.fa -d . -u ./tools -o output.html
+hubArchiveCreator.py -j <inputs.json> -o output.html
 """
 
 import argparse
@@ -16,324 +16,75 @@
 import sys
 
 # Internal dependencies
-from Bam import Bam
-from BedSimpleRepeats import BedSimpleRepeats
-from BedSpliceJunctions import BedSpliceJunctions
-from Bed import Bed
-from cytoBand import cytoBand
-from BigWig import BigWig
-from util.Fasta import Fasta
-from util.Filters import TraceBackFormatter
-from Gff3 import Gff3
-from Gtf import Gtf
-from Psl import Psl
+from util.Reader import Reader
+from util.Logger import Logger
 from TrackHub import TrackHub
-from bigPsl import bigPsl
-from BedBlastAlignments import BedBlastAlignments
-from BigBed import BigBed
+
+
 
 # TODO: Verify each subprocessed dependency is accessible [gff3ToGenePred, genePredToBed, twoBitInfo, faToTwoBit, bedToBigBed, sort
 
 
 def main(argv):
+    
     # Command Line parsing init
     parser = argparse.ArgumentParser(description='Create a foo.txt inside the given folder.')
-
-    # Reference genome mandatory
-    parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome')
-
-    # GFF3 Management
-    parser.add_argument('--gff3', action='append', help='GFF3 format')
-
-    # GTF Management
-    parser.add_argument('--gtf', action='append', help='GTF format')
-
-    # Bed4+12 (TrfBig)
-    parser.add_argument('--bedSimpleRepeats', action='append', help='Bed4+12 format, using simpleRepeats.as')
-
-    # Bed12+1 (regtools)
-    parser.add_argument('--bedSpliceJunctions', action='append', help='Bed12+1 format, using spliceJunctions.as')
-
-    # Generic Bed (Blastx transformed to bed)
-    parser.add_argument('--bed', action='append', help='Bed generic format')
-
-    #cytoBandIdeo
-    parser.add_argument('--cytoBand', action='append', help='Cytoband Track, using cytoBandIdeo.as')
-
-    # BigPsl (blat alignment)
-    parser.add_argument('--bigpsl', action='append', help='bigPsl format, using bigPsl.as')
-
-    # Bed12+12 (tblastn alignment)
-    parser.add_argument('--bedBlastAlignments', action='append', help='Bed12+12 format, using bigPsl.as')
+    parser.add_argument('-j', '--data_json', help='JSON file containing the metadata of the inputs')
+    parser.add_argument('-o', '--output', help='Name of the HTML summarizing the content of the Track Hub Archive')
+    
+    # Get the args passed in parameter
+    args = parser.parse_args()
+    json_inputs_data = args.data_json
+    outputFile = args.output
 
-    # BigWig Management
-    parser.add_argument('--bigwig', action='append', help='BigWig format')
-
-    # Bam Management
-    parser.add_argument('--bam', action='append', help='Bam format')
-
-    # Psl Management
-    parser.add_argument('--psl', action='append', help='Psl format')
-
-    # BigBed Management
-    parser.add_argument('--bigbed', action='append', help='BigBed format')
-
-    # TODO: Check if the running directory can have issues if we run the tool outside
-    parser.add_argument('-d', '--directory',
-                        help='Running tool directory, where to find the templates. Default is running directory')
-    parser.add_argument('-u', '--ucsc_tools_path',
-                        help='Directory where to find the executables needed to run this tool')
-    parser.add_argument('-e', '--extra_files_path',
-                        help='Name, in galaxy, of the output folder. Where you would want to build the Track Hub Archive')
-    parser.add_argument('-o', '--output', help='Name of the HTML summarizing the content of the Track Hub Archive')
-
-    parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs')
-
-    parser.add_argument('--user_email', help='Email of the user who launched the Hub Archive Creation')
-
-    parser.add_argument('--genome_name', help='UCSC Genome Browser assembly ID')
-
-    parser.add_argument('--debug_mode', action='store_true', help='Allow more details about the errors')
+    # Parse the JSON file with Reader
+    reader = Reader(json_inputs_data)
 
     # Begin init variables
-
-    toolDirectory = '.'
-    extra_files_path = '.'
-
-    # Get the args passed in parameter
-    args = parser.parse_args()
-
-    extra_files_path = args.extra_files_path
-    toolDirectory = args.directory
+    extra_files_path = reader.getExtFilesPath()
+    toolDirectory = reader.getToolDir()
+    # NOTE: outputFile comes from the -o/--output argument above, not from reader.getOutputDir()
+    user_email = reader.getUserEmail()
+    reference_genome = reader.getRefGenome()
+    debug_mode = reader.getDebugMode()
 
     #### Logging management ####
     # If we are in Debug mode, also print in stdout the debug dump
-
-    configure_logger(extra_files_path=extra_files_path, debug=args.debug_mode)
-
+    log = Logger(tool_directory=toolDirectory, debug=debug_mode, extra_files_path=extra_files_path)
+    log.setup_logging()
+    logging.info('#### HubArchiveCreator: Start ####\n')
+    logging.debug('---- Welcome in HubArchiveCreator Debug Mode ----\n')
+    logging.debug('JSON parameters: %s\n\n', json.dumps(reader.args))
     #### END Logging management ####
 
-    array_inputs_reference_genome = json.loads(args.fasta)
-
-    # TODO: Replace these with the object Fasta
-    input_fasta_file = array_inputs_reference_genome["false_path"]
-    input_fasta_file_name = sanitize_name_input(array_inputs_reference_genome["name"])
-    genome_name = sanitize_name_input(args.genome_name)
-
-    reference_genome = Fasta(input_fasta_file,
-                             input_fasta_file_name, genome_name)
-
-    user_email = args.user_email
-
-
-    # TODO: Use a class to have a better management of the structure of these inputs
-    # These inputs are populated in the Galaxy Wrapper xml and are in this format:
-    # ARRAY[DICT{FILE_PATH: DICT{NAME: NAME_VALUE, EXTRA_DATA: EXTRA_DATA_VALUE}}]
-    # EXTRA_DATA could be anything, for example the index of a BAM => {"index", FILE_PATH}
-    array_inputs_bam = args.bam
-    array_inputs_bed_generic = args.bed
-    array_inputs_bed_cytoBand = args.cytoBand
-    array_inputs_bed_simple_repeats = args.bedSimpleRepeats
-    array_inputs_bed_splice_junctions = args.bedSpliceJunctions
-    array_inputs_bigwig = args.bigwig
-    array_inputs_gff3 = args.gff3
-    array_inputs_gtf = args.gtf
-    array_inputs_psl = args.psl
-    array_inputs_bigpsl = args.bigpsl
-    array_inputs_bed_blast_alignments = args.bedBlastAlignments
-    array_inputs_bigbed = args.bigbed
-
-    outputFile = args.output
-
-    json_inputs_data = args.data_json
-
-    # TODO: Instead use a class to properly store the objects, with object_hook
-    inputs_data = json.loads(json_inputs_data)
-    # We remove the spaces in ["name"] of inputs_data
-    sanitize_name_inputs(inputs_data)
-
-    # TODO: Check here all the binaries / tools we need. Exception if missing
-
     # Create the Track Hub folder
+    logging.info('#### HubArchiveCreator: Creating the Track Hub folder ####\n')
     trackHub = TrackHub(reference_genome, user_email, outputFile, extra_files_path, toolDirectory)
 
-    all_datatype_dictionary = {}
-
-    for (inputs, datatype_class) in [
-                        (array_inputs_bam, Bam),
-                        (array_inputs_bed_generic, Bed),
-                        (array_inputs_bed_cytoBand, cytoBand),
-                        (array_inputs_bigwig, BigWig),
-                        (array_inputs_bed_simple_repeats, BedSimpleRepeats),
-                        (array_inputs_bed_splice_junctions, BedSpliceJunctions),
-                        (array_inputs_gff3, Gff3),
-                        (array_inputs_gtf, Gtf),
-                        (array_inputs_psl, Psl),
-                        (array_inputs_bigpsl, bigPsl),
-                        (array_inputs_bed_blast_alignments, BedBlastAlignments),
-                        (array_inputs_bigbed, BigBed)]:
-        if inputs:
-            all_datatype_dictionary.update(create_ordered_datatype_objects(datatype_class, inputs, inputs_data))
-
     # Create Ordered Dictionary to add the tracks in the tool form order
+    logging.info('#### HubArchiveCreator: Preparing track data ####\n')
+    all_datatype_dictionary = reader.getTracksData()
     all_datatype_ordered_dictionary = collections.OrderedDict(all_datatype_dictionary)
 
     logging.debug("----- End of all_datatype_dictionary processing -----")
-    logging.debug("all_datatype_ordered_dictionary keys are: {0}".format(all_datatype_ordered_dictionary.values()))
+    #logging.debug("all_datatype_ordered_dictionary are: %s", json.dumps(all_datatype_ordered_dictionary))
 
+    logging.info('#### HubArchiveCreator: Adding tracks to Track Hub ####\n')
     logging.debug("----- Beginning of Track adding processing -----")
+
     for index, datatypeObject in all_datatype_ordered_dictionary.iteritems():
-        trackHub.addTrack(datatypeObject.track.trackDb)
+       trackHub.addTrack(datatypeObject.track.track_db)
+
     logging.debug("----- End of Track adding processing -----")
 
-    # We process all the modifications to create the zip file
-    #trackHub.createZip()
-
-    # We terminate le process and so create a HTML file summarizing all the files
+    # Terminate the process, thereby creating an HTML file summarizing all the files
+    logging.info('#### HubArchiveCreator: Creating the HTML file ####\n')
     trackHub.terminate()
 
-    logging.debug('#### End of HubArchiveCreator Debug Mode: Bye! ####')
+    logging.debug('---- End of HubArchiveCreator Debug Mode: Bye! ----\n')
+    logging.info('#### HubArchiveCreator: Congratulation! Assembly Hub is created! ####\n')
 
     sys.exit(0)
 
-
-def sanitize_name_input(string_to_sanitize):
-    """
-    Sanitize the string passed in parameter by replacing '/' and ' ' by '_'
-
-    :param string_to_sanitize:
-    :return :
-
-    :Example:
-
-    >>> sanitize_name_input('this/is an//example')
-    this_is_an__example
-    """
-    return string_to_sanitize \
-            .replace("/", "_") \
-            .replace(" ", "_")
-
-
-def sanitize_name_inputs(inputs_data):
-    """
-    Sanitize value of the keys "name" of the dictionary passed in parameter.
-
-    Because sometimes output from Galaxy, or even just file name, from user inputs, have spaces.
-    Also, it can contain '/' character and could break the use of os.path function.
-
-    :param inputs_data: dict[string, dict[string, string]]
-    """
-    for key in inputs_data:
-        inputs_data[key]["name"] = sanitize_name_input(inputs_data[key]["name"])
-
-
-def create_ordered_datatype_objects(ExtensionClass, array_inputs, inputs_data):
-    """
-    Function which executes the creation all the necessary files / folders for a special Datatype, for TrackHub
-    and update the dictionary of datatype
-
-    :param ExtensionClass:
-    :param array_inputs:
-    :param inputs_data:
-    :type ExtensionClass: Datatype
-    :type array_inputs: list[string]
-    :type inputs_data: dict
-    :rtype: dict
-    """
-
-    datatype_dictionary = {}
-
-    # TODO: Optimize this double loop
-    for input_false_path in array_inputs:
-        for key, data_value in inputs_data.items():
-            if key == input_false_path:
-                logging.debug("input_false_path: " + input_false_path)
-                logging.debug("data_value: " + str(data_value))
-                extensionObject = ExtensionClass(input_false_path, data_value)
-                datatype_dictionary.update({data_value["order_index"]: extensionObject})
-    return datatype_dictionary
-
-def configure_logger(extra_files_path=None, debug=False):
-    if not extra_files_path:
-        raise Exception("Extra files path is not set. Stopping the application")
-
-
-    # All case log: log everything in a .log file
-    logger_file_name = ''.join([__name__, '.log'])
-    logging_file_path = os.path.join(extra_files_path, logger_file_name)
-
-    logging.basicConfig(filename=logging_file_path, level=logging.DEBUG)
-
-    log_stdout = logging.StreamHandler(sys.stdout)
-    if not debug:
-        configure_logger_user(log_stdout)
-    else:
-        configure_logger_dev(log_stdout)
-
-    # stderr configuration
-    configure_logger_stderr()
-
-    logging.debug('#### Welcome in HubArchiveCreator Debug Mode ####\n')
-
-def configure_logger_user(log_stdout=None):
-    """
-    User Logger is defined as following:
-        - User needs to have WARN, ERROR and CRITICAL but well formatted / without traceback
-            in STDOUT
-        - Still access to full, brute and traceback for errors
-            in STDERR
-        - And further access to debug if needed
-            in .log
-
-    """
-
-    if not log_stdout:
-        raise Exception("No log_stdout given. Stopping the application")
-
-    # stdout for INFO / WARN / ERROR / CRITICAL
-    log_stdout.setLevel(logging.INFO)
-
-    formatter = TraceBackFormatter('%(message)s')
-
-    log_stdout.setFormatter(formatter)
-
-    logging.getLogger().addHandler(log_stdout)
-
-def configure_logger_dev(log_stdout=None):
-    """
-    Dev Logger is defined as following:
-        - Dev needs to have WARN, ERROR and CRITICAL but well formatted / without traceback, in stdout
-        - Still access to full, brute and traceback in stderr for errors
-        - And further access to debug if needed
-
-    """
-    if not log_stdout:
-        raise Exception("No log_stdout given. Stopping the application")
-    log_format = '%(message)s'
-
-    # stdout and stderr and both identical for INFO / WARN / ERROR / CRITICAL
-    log_stdout.setLevel(logging.DEBUG)
-
-    formatter = logging.Formatter(log_format)
-
-    log_stdout.setFormatter(formatter)
-
-    logging.getLogger().addHandler(log_stdout)
-
-def configure_logger_stderr():
-    """
-    Configure what should be logged in stderr
-    """
-    log_error = logging.StreamHandler(sys.stderr)
-    log_error.setLevel(logging.ERROR)
-    log_error_format = '%(message)s'
-
-    formatter_error = logging.Formatter(log_error_format)
-
-    log_error.setFormatter(formatter_error)
-
-    logging.getLogger().addHandler(log_error)
-
 if __name__ == "__main__":
-    logging.getLogger(__name__)
     main(sys.argv)