Mercurial > repos > rmarenco > hubarchivecreator
changeset 13:25809f699cb3 draft
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 65ab931ef2b05a5acf06cbde3a746c94a0a0a4cb
author | rmarenco |
---|---|
date | Thu, 11 Aug 2016 19:02:29 -0400 |
parents | 747475757cb0 |
children | cbd8982c9f51 |
files | Bam.pyc Bed.py Bed.pyc BedSimpleRepeats.py BedSimpleRepeats.pyc BigWig.py BigWig.pyc Datatype.py Datatype.pyc Gff3.py Gff3.pyc Gtf.py Gtf.pyc TrackDb.pyc TrackHub.pyc __init__.py bigGenePred.as hubArchiveCreator.py hubArchiveCreator.xml todo.md util/Fasta.pyc util/Filters.py util/Filters.pyc util/subtools.py util/subtools.pyc |
diffstat | 25 files changed, 347 insertions(+), 80 deletions(-) [+] |
line wrap: on
line diff
--- a/Bed.py Wed Jul 27 10:43:58 2016 -0400 +++ b/Bed.py Thu Aug 11 19:02:29 2016 -0400 @@ -33,7 +33,9 @@ myBigBedFilePath = os.path.join(self.myTrackFolderPath, trackName) with open(myBigBedFilePath, 'w') as self.bigBedFile: - subtools.bedToBigBed(self.sortedBedFile.name, self.chromSizesFile.name, self.bigBedFile.name) + subtools.bedToBigBed(self.sortedBedFile.name, + self.chromSizesFile.name, + self.bigBedFile.name) # Create the Track Object self.createTrack(file_path=trackName,
--- a/BedSimpleRepeats.py Wed Jul 27 10:43:58 2016 -0400 +++ b/BedSimpleRepeats.py Thu Aug 11 19:02:29 2016 -0400 @@ -27,10 +27,14 @@ # TODO: Change the name of the bb, to tool + genome + .bb trackName = "".join( ( self.name_bed_simple_repeats, '.bb' ) ) myBigBedFilePath = os.path.join(self.myTrackFolderPath, trackName) - auto_sql_option = "%s%s" % ('-as=', os.path.join(self.tool_directory, 'trf_simpleRepeat.as')) + + auto_sql_option = os.path.join(self.tool_directory, 'trf_simpleRepeat.as') + with open(myBigBedFilePath, 'w') as bigBedFile: - subtools.bedToBigBed(sortedBedFile.name, self.chromSizesFile.name, bigBedFile.name, - typeOption='-type=bed4+12', + subtools.bedToBigBed(sortedBedFile.name, + self.chromSizesFile.name, + bigBedFile.name, + typeOption='bed4+12', autoSql=auto_sql_option) # Create the Track Object
--- a/BigWig.py Wed Jul 27 10:43:58 2016 -0400 +++ b/BigWig.py Thu Aug 11 19:02:29 2016 -0400 @@ -29,7 +29,8 @@ # Create the Track Object self.createTrack(file_path=trackName, track_name=trackName, - long_label=self.name_bigwig, track_type='bigWig', visibility='full', + long_label=self.name_bigwig, + track_type='bigWig', visibility='full', priority=self.priority, track_file=myBigWigFilePath)
--- a/Datatype.py Wed Jul 27 10:43:58 2016 -0400 +++ b/Datatype.py Thu Aug 11 19:02:29 2016 -0400 @@ -40,8 +40,6 @@ if Datatype.tool_directory is None: raise TypeError(not_init_message.format('tool directory')) - - @staticmethod def pre_init(reference_genome, two_bit_path, chrom_sizes_file, extra_files_path, tool_directory, specie_folder, tracks_folder):
--- a/Gff3.py Wed Jul 27 10:43:58 2016 -0400 +++ b/Gff3.py Thu Aug 11 19:02:29 2016 -0400 @@ -21,53 +21,42 @@ self.priority = data_gff3["order_index"] # TODO: See if we need these temporary files as part of the generated files - genePredFile = tempfile.NamedTemporaryFile(bufsize=0, suffix=".genePred") - unsortedBedFile = tempfile.NamedTemporaryFile(bufsize=0, suffix=".unsortedBed") - sortedBedFile = tempfile.NamedTemporaryFile(suffix=".sortedBed") - - # TODO: Refactor into another Class to manage the twoBitInfo and ChromSizes (same process as in Gtf.py) + unsorted_genePred_file = tempfile.NamedTemporaryFile(bufsize=0, suffix=".genePred") + unsorted_bigGenePred_file = tempfile.NamedTemporaryFile(bufsize=0, suffix=".unsorted.bigGenePred") + sorted_biGenePred_file = tempfile.NamedTemporaryFile(suffix=".sorted.bigGenePred") # gff3ToGenePred processing - subtools.gff3ToGenePred(self.input_Gff3_false_path, genePredFile.name) + subtools.gff3ToGenePred(self.input_Gff3_false_path, unsorted_genePred_file.name) - # TODO: From there, refactor because common use with Gtf.py - # genePredToBed processing - subtools.genePredToBed(genePredFile.name, unsortedBedFile.name) + # genePredToBigGenePred + subtools.genePredToBigGenePred(unsorted_genePred_file.name, unsorted_bigGenePred_file.name) # Sort processing - subtools.sort(unsortedBedFile.name, sortedBedFile.name) + subtools.sort(unsorted_bigGenePred_file.name, sorted_biGenePred_file.name) # TODO: Check if no errors # bedToBigBed processing - # TODO: Change the name of the bb, to tool + genome + possible adding if multiple + .bb trackName = "".join( (self.name_gff3, ".bb" ) ) + + auto_sql_option = os.path.join(self.tool_directory, 'bigGenePred.as') + myBigBedFilePath = os.path.join(self.myTrackFolderPath, trackName) + with open(myBigBedFilePath, 'w') as bigBedFile: - subtools.bedToBigBed(sortedBedFile.name, self.chromSizesFile.name, bigBedFile.name) + subtools.bedToBigBed(sorted_biGenePred_file.name, + self.chromSizesFile.name, + 
bigBedFile.name, + autoSql=auto_sql_option, + typeOption='bed12+8', + tab=True) # Create the Track Object self.createTrack(file_path=trackName, track_name=trackName, - long_label=self.name_gff3, track_type='bigBed 12 +', visibility='dense', priority=self.priority, + long_label=self.name_gff3, + track_type='bigGenePred', visibility='dense', + priority=self.priority, track_file=myBigBedFilePath) - # dataURL = "tracks/%s" % trackName - # - # trackDb = TrackDb( - # trackName=trackName, - # longLabel=self.name_gff3, - # shortLabel=self.getShortName( self.name_gff3 ), - # trackDataURL=dataURL, - # trackType='bigBed 12 +', - # visibility='dense', - # priority=self.priority, - # ) - # - # self.track = Track( - # trackFile=myBigBedFilePath, - # trackDb=trackDb, - # ) - print("- Gff3 %s created" % self.name_gff3) - #print("- %s created in %s" % (trackName, myBigBedFilePath))
--- a/Gtf.py Wed Jul 27 10:43:58 2016 -0400 +++ b/Gtf.py Thu Aug 11 19:02:29 2016 -0400 @@ -25,47 +25,40 @@ # TODO: See if we need these temporary files as part of the generated files genePredFile = tempfile.NamedTemporaryFile(bufsize=0, suffix=".genePred") - unsortedBedFile = tempfile.NamedTemporaryFile(bufsize=0, suffix=".unsortedBed") - sortedBedFile = tempfile.NamedTemporaryFile(suffix=".sortedBed") + unsorted_bigGenePred_file = tempfile.NamedTemporaryFile(bufsize=0, suffix=".unsorted.bigGenePred") + sorted_bigGenePred_file = tempfile.NamedTemporaryFile(suffix=".sortedBed.bigGenePred") # GtfToGenePred subtools.gtfToGenePred(self.input_gtf_false_path, genePredFile.name) # TODO: From there, refactor because common use with Gff3.py - # genePredToBed processing - subtools.genePredToBed(genePredFile.name, unsortedBedFile.name) + # genePredToBigGenePred processing + subtools.genePredToBigGenePred(genePredFile.name, unsorted_bigGenePred_file.name) # Sort processing - subtools.sort(unsortedBedFile.name, sortedBedFile.name) + subtools.sort(unsorted_bigGenePred_file.name, sorted_bigGenePred_file.name) # bedToBigBed processing - # TODO: Change the name of the bb, to tool + genome + possible adding if multiple + .bb trackName = "".join( ( self.name_gtf, ".bb") ) + + auto_sql_option = os.path.join(self.tool_directory, 'bigGenePred.as') + myBigBedFilePath = os.path.join(self.myTrackFolderPath, trackName) + with open(myBigBedFilePath, 'w') as bigBedFile: - subtools.bedToBigBed(sortedBedFile.name, self.chromSizesFile.name, bigBedFile.name) + subtools.bedToBigBed(sorted_bigGenePred_file.name, + self.chromSizesFile.name, + bigBedFile.name, + autoSql=auto_sql_option, + typeOption='bed12+8', + tab=True) + # Create the Track Object self.createTrack(file_path=trackName, track_name=trackName, - long_label=self.name_gtf, track_type='bigBed 12 +', visibility='dense', priority=self.priority, + long_label=self.name_gtf, track_type='bigGenePred', + visibility='dense', 
priority=self.priority, track_file=myBigBedFilePath) - # - # dataURL = "tracks/%s" % trackName - # - # trackDb = TrackDb( - # trackName=trackName, - # longLabel=self.name_gtf, - # shortLabel=self.getShortName( self.name_gtf ), - # trackDataURL=dataURL, - # trackType='bigBed 12 +', - # visibility='dense', - # priority=self.priority, - # ) - # self.track = Track( - # trackFile=myBigBedFilePath, - # trackDb=trackDb, - # ) print("- Gtf %s created" % self.name_gtf) - #print("- %s created in %s" % (trackName, myBigBedFilePath))
--- a/__init__.py Wed Jul 27 10:43:58 2016 -0400 +++ b/__init__.py Thu Aug 11 19:02:29 2016 -0400 @@ -0,0 +1,19 @@ +# Set default logging handler to avoid "No handler found" warnings. +import logging +import sys + +try: # Python 2.7+ + from logging import NullHandler +except ImportError: + class NullHandler(logging.Handler): + def emit(self, record): + pass + +logging.getLogger(__name__).addHandler(NullHandler()) + +log_stdout = None +log_stderr = None + + +# TODO: Handle the Exception by dispatching the error depending on the (debug) mode +#class Exception(Exception): \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bigGenePred.as Thu Aug 11 19:02:29 2016 -0400 @@ -0,0 +1,25 @@ +table bigGenePred +"bigGenePred gene models" + ( + string chrom; "Reference sequence chromosome or scaffold" + uint chromStart; "Start position in chromosome" + uint chromEnd; "End position in chromosome" + string name; "Name or ID of item, ideally both human readable and unique" + uint score; "Score (0-1000)" + char[1] strand; "+ or - for strand" + uint thickStart; "Start of where display should be thick (start codon)" + uint thickEnd; "End of where display should be thick (stop codon)" + uint reserved; "RGB value (use R,G,B string in input file)" + int blockCount; "Number of blocks" + int[blockCount] blockSizes; "Comma separated list of block sizes" + int[blockCount] chromStarts; "Start positions relative to chromStart" + string name2; "Alternative/human readable name" + string cdsStartStat; "Status of CDS start annotation (none, unknown, incomplete, or complete)" + string cdsEndStat; "Status of CDS end annotation (none, unknown, incomplete, or complete)" + int[blockCount] exonFrames; "Exon frame {0,1,2}, or -1 if no frame for exon" + string type; "Transcript type" + string geneName; "Primary identifier for gene" + string geneName2; "Alternative/human readable gene name" + string geneType; "Gene type" + ) +
--- a/hubArchiveCreator.py Wed Jul 27 10:43:58 2016 -0400 +++ b/hubArchiveCreator.py Thu Aug 11 19:02:29 2016 -0400 @@ -11,6 +11,8 @@ import argparse import collections import json +import logging +import os import sys # Internal dependencies @@ -19,11 +21,11 @@ from Bed import Bed from BigWig import BigWig from util.Fasta import Fasta +from util.Filters import TraceBackFormatter from Gff3 import Gff3 from Gtf import Gtf from TrackHub import TrackHub - # TODO: Verify each subprocessed dependency is accessible [gff3ToGenePred, genePredToBed, twoBitInfo, faToTwoBit, bedToBigBed, sort @@ -67,7 +69,9 @@ parser.add_argument('--genome_name', help='UCSC Genome Browser assembly ID') - ucsc_tools_path = '' + parser.add_argument('--debug_mode', action='store_true', help='Allow more details about the errors') + + # Begin init variables toolDirectory = '.' extra_files_path = '.' @@ -75,6 +79,16 @@ # Get the args passed in parameter args = parser.parse_args() + extra_files_path = args.extra_files_path + toolDirectory = args.directory + + #### Logging management #### + # If we are in Debug mode, also print in stdout the debug dump + + configure_logger(extra_files_path=extra_files_path, debug=args.debug_mode) + + #### END Logging management #### + array_inputs_reference_genome = json.loads(args.fasta) # TODO: Replace these with the object Fasta @@ -100,7 +114,6 @@ array_inputs_bigwig = args.bigwig outputFile = args.output - json_inputs_data = args.data_json json_inputs_data = args.data_json @@ -108,10 +121,6 @@ # We remove the spaces in ["name"] of inputs_data sanitize_name_inputs(inputs_data) - if args.directory: - toolDirectory = args.directory - if args.extra_files_path: - extra_files_path = args.extra_files_path # TODO: Check here all the binaries / tools we need. Exception if missing @@ -141,9 +150,7 @@ # We terminate le process and so create a HTML file summarizing all the files trackHub.terminate() - print "\t" - print "--------------" - print "Well done guys! 
Your data are ready to be displayed in UCSC Track Hub." + logging.debug('#### End of HubArchiveCreator Debug Mode: Bye! ####') sys.exit(0) @@ -184,5 +191,87 @@ datatype_dictionary.update({data_value["order_index"]: extensionObject}) return datatype_dictionary +def configure_logger(extra_files_path=None, debug=False): + if not extra_files_path: + raise Exception("Extra files path is not set. Stopping the application") + + + # All case log: log everything in a .log file + logger_file_name = ''.join([__name__, '.log']) + logging_file_path = os.path.join(extra_files_path, logger_file_name) + + logging.basicConfig(filename=logging_file_path, level=logging.DEBUG) + + log_stdout = logging.StreamHandler(sys.stdout) + if not debug: + configure_logger_user(log_stdout) + else: + configure_logger_dev(log_stdout) + + # stderr configuration + configure_logger_stderr() + + logging.debug('#### Welcome in HubArchiveCreator Debug Mode ####\n') + +def configure_logger_user(log_stdout=None): + """ + User Logger is defined as following: + - User needs to have WARN, ERROR and CRITICAL but well formatted / without traceback + in STDOUT + - Still access to full, brute and traceback for errors + in STDERR + - And further access to debug if needed + in .log + :return: + """ + if not log_stdout: + raise Exception("No log_stdout given. Stopping the application") + + # stdout for INFO / WARN / ERROR / CRITICAL + log_stdout.setLevel(logging.INFO) + + formatter = TraceBackFormatter('%(message)s') + + log_stdout.setFormatter(formatter) + + logging.getLogger().addHandler(log_stdout) + +def configure_logger_dev(log_stdout=None): + """ + Dev Logger is defined as following: + - Dev needs to have WARN, ERROR and CRITICAL but well formatted / without traceback, in stdout + - Still access to full, brute and traceback in stderr for errors + - And further access to debug if needed + :return: + """ + if not log_stdout: + raise Exception("No log_stdout given. 
Stopping the application") + log_format = '%(message)s' + + # stdout and stderr and both identical for INFO / WARN / ERROR / CRITICAL + log_stdout.setLevel(logging.DEBUG) + + formatter = logging.Formatter(log_format) + + log_stdout.setFormatter(formatter) + + logging.getLogger().addHandler(log_stdout) + +def configure_logger_stderr(): + """ + Configure what should be logged in stderr + :return: + """ + log_error = logging.StreamHandler(sys.stderr) + log_error.setLevel(logging.ERROR) + log_error_format = '%(message)s' + + formatter_error = logging.Formatter(log_error_format) + + log_error.setFormatter(formatter_error) + + logging.getLogger().addHandler(log_error) + if __name__ == "__main__": + logging.getLogger(__name__) main(sys.argv)
--- a/hubArchiveCreator.xml Wed Jul 27 10:43:58 2016 -0400 +++ b/hubArchiveCreator.xml Thu Aug 11 19:02:29 2016 -0400 @@ -14,6 +14,7 @@ <requirement type="package" version="332">ucsc-bedtobigbed</requirement> <requirement type="package" version="332">ucsc-fatotwobit</requirement> <requirement type="package" version="324">ucsc-genepredtobed</requirement> + <requirement type="package" version="332">ucsc-genepredtobiggenepred</requirement> <requirement type="package" version="324">ucsc-gff3togenepred</requirement> <requirement type="package" version="324">ucsc-gtftogenepred</requirement> <requirement type="package" version="324">ucsc-twobitinfo</requirement> @@ -99,7 +100,13 @@ ## Retrieve the user email --user_email $__user_email__ - -d $__tool_directory__ -e $output.files_path -o $output; + -d $__tool_directory__ + + -e $output.files_path + + $advanced_options.debug_mode + + -o $output; ]]></command> <inputs> @@ -184,7 +191,28 @@ </when> </conditional> </repeat> - </inputs> + <conditional name="advanced_options"> + <param name="advanced_options_selector" type="select" label="Advanced options"> + <option value="off" selected="true">Hide advanced options</option> + <option value="on">Display advanced options</option> + </param> + <!-- TODO: Avoid redundancy here --> + <when value="on"> + <param name="debug_mode" type="boolean" + label="Activate debug mode" checked="false" + truevalue="--debug_mode" falsevalue=""> + <help> + Use this option if you are a G-OnRamp developer + </help> + </param> + </when> + <when value="off"> + <param name="debug_mode" type="hidden" + value=""> + </param> + </when> + </conditional> +</inputs> <outputs> <data format="trackhub" name="output"/>
--- a/todo.md Wed Jul 27 10:43:58 2016 -0400 +++ b/todo.md Thu Aug 11 19:02:29 2016 -0400 @@ -1,10 +1,44 @@ +### TEMP St Louis ### +- How to manage messages to user and debugging: + - User should receive INFO / WARN / ERROR / CRITICAL: + - User summary information in stdout + - Full error in stderr + - Developer should receive all Logging stack: + - Not the user summary in stdout + - Full stack in stdout and stderr directly + + - HOWTO: + - Manage (at least) two type of Logging types: + - The user one => When Debug mode is not set or disabled + - The dev one => When Debug mode is enabled + - User: + - Two Handlers: stdout and stderr + - STDOUT: + - Filter stdout: + - NO ERROR and CRITICAL here + - (Warn) + - Formatter: + - Only show %(message) for clarity + - STDERR: + - Filter stderr => WARN / ERROR / CRITICAL + - Formatter: + - Show message + - Show traceback + - Dev: + - One Handler: + - To both stdout and stderr + - Filter: + - Nope? + - Formatter: + - Show traceback in both + # HubArchiveCreator's TODO *TODO file inspired from: http://lifehacker.com/why-a-github-gist-is-my-favorite-to-do-list-1493063613* ### TO COMPLETE - + - [ ] Don't let the Tool Classes manage the archive (add or remove files / folders) => Everything should pass through TrackHub or another class dedicated to that - [ ] Move the class and others program related files, into separated folders - [ ] Take into account the name of the reference genome / the change:
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/util/Filters.py Thu Aug 11 19:02:29 2016 -0400 @@ -0,0 +1,9 @@ +import logging + +class TraceBackFormatter(logging.Formatter): + def format(self, record): + # If the log has some Exception text, we don't display them + s = super(TraceBackFormatter, self).format(record) + if record.exc_text or record.exc_info: + s = record.message + return s
--- a/util/subtools.py Wed Jul 27 10:43:58 2016 -0400 +++ b/util/subtools.py Thu Aug 11 19:02:29 2016 -0400 @@ -6,9 +6,21 @@ in HubArchiveCreator """ +import logging import os import subprocess +import sys +class PopenError(Exception): + def __init__(self, cmd, error, return_code): + self.cmd = cmd + self.error = error + self.return_code = return_code + + def __str__(self): + message = "The subprocess {0} has returned the error: {1}.".format(self.cmd, self.return_code) + message = ','.join((message, "Its error message is: {0}".format(self.error))) + return repr(message) def _handleExceptionAndCheckCall(array_call, **kwargs): """ @@ -20,10 +32,47 @@ stdout = kwargs.get('stdout') stderr = kwargs.get('stderr') shell = kwargs.get('shell') + + cmd = array_call[0] + + output = None + error = None + + # TODO: Check the value of array_call and <=[0] + logging.debug("Calling {0}:".format(cmd)) + + logging.debug("---------") + + # TODO: Use universal_newlines option from Popen? try: - p = subprocess.check_call(array_call, stdin=stdin, stdout=stdout, stderr=stderr, shell=shell) - except subprocess.CalledProcessError: - raise + p = subprocess.Popen(array_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell) + output, error = p.communicate() + + logging.debug("\t{0}".format(output)) + # If we detect an error from the subprocess, then we raise an exception + # TODO: Manage if we raise an exception for everything, or use CRITICAL etc... but not stop process + # TODO: The responsibility of returning a sys.exit() should not be there, but up in the app. 
+ if p.returncode: + raise PopenError(cmd, error, p.returncode) + + except OSError as e: + message = "The subprocess {0} has encountered an OSError: {1}".format(cmd, e.strerror) + if e.filename: + message = '\n'.join((message, ", against this file: {0}".format(e.filename))) + logging.error(message) + sys.exit(-1) + except PopenError as p: + message = "The subprocess {0} has returned the error: {1}.".format(p.cmd, p.return_code) + message = '\n'.join((message, "Its error message is: {0}".format(p.error))) + + logging.exception(message) + + sys.exit(p.return_code) + except Exception as e: + message = "The subprocess {0} has encountered an unknown error: {1}".format(cmd, e) + logging.exception(message) + + sys.exit(-1) return p @@ -76,6 +125,18 @@ p = _handleExceptionAndCheckCall(array_call) return p +def genePredToBigGenePred(gene_pred_file_name, unsorted_bigGenePred_file_name): + """ + Call genePredToBigGenePred and write the result into unsorted_bigGenePred_file_name + :param gene_pred_file_name: + :param unsorted_bigGenePred_file_name: + :return: + """ + array_call = ['genePredToBigGenePred', + gene_pred_file_name, + unsorted_bigGenePred_file_name] + p = _handleExceptionAndCheckCall(array_call) + return p def genePredToBed(gene_pred_file_name, unsorted_bed_file_name): """ @@ -113,7 +174,8 @@ return p -def bedToBigBed(sorted_bed_file_name, chrom_sizes_file_name, big_bed_file_name, typeOption=None, autoSql=None): +def bedToBigBed(sorted_bed_file_name, chrom_sizes_file_name, big_bed_file_name, + typeOption=None, autoSql=None, tab=False): """ Call bedToBigBed on sorted_bed_file_name, using chrom_sizes_file_name and write the result into big_bed_file_name :param sorted_bed_file_name: @@ -121,11 +183,25 @@ :param big_bed_file_name: :return: """ + + # TODO: Move this into the _handleExceptionAndCheckCall function + # Parse the array + logging.debug("sorted_bed_file_name: {0}".format(sorted_bed_file_name)) + logging.debug("chrom_sizes_file_name: 
{0}".format(chrom_sizes_file_name)) + logging.debug("big_bed_file_name: {0}".format(big_bed_file_name)) + logging.debug("typeOption: {0}".format(typeOption)) + logging.debug("autoSql: {0}".format(autoSql)) + logging.debug("tab option: {0}".format(tab)) + array_call = ['bedToBigBed', sorted_bed_file_name, chrom_sizes_file_name, big_bed_file_name] if typeOption: + typeOption = ''.join(['-type=', typeOption]) array_call.append(typeOption) if autoSql: + autoSql = ''.join(['-as=', autoSql]) array_call.append(autoSql) + if tab: + array_call.append('-tab') p = _handleExceptionAndCheckCall(array_call) return p