comparison hubArchiveCreator.py @ 29:7e8a8b732db3 draft

planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 1a81ebd0ddea950b84af3fc830e9267a4814b29f
author yating-l
date Wed, 16 May 2018 18:04:20 -0400
parents fcc1021bd496
children

--- hubArchiveCreator.py (28:6aa28a85cc38)
+++ hubArchiveCreator.py (29:7e8a8b732db3)
@@ -3,337 +3,88 @@
 
 """
 This Galaxy tool permits to prepare your files to be ready for
 Assembly Hub visualization.
 Program test arguments:
-hubArchiveCreator.py -g test-data/augustusDbia3.gff3 -f test-data/dbia3.fa -d . -u ./tools -o output.html
+hubArchiveCreator.py -g test-data/augustusDbia3.gff3 -f '{"false_path": "./test-data/common/dbia3.fa", "name":"dbia3"}' -d . -u ./tools -o output.html
 """
 
 import argparse
 import collections
 import json
 import logging
 import os
 import sys
 
 # Internal dependencies
-from Bam import Bam
-from BedSimpleRepeats import BedSimpleRepeats
-from BedSpliceJunctions import BedSpliceJunctions
-from Bed import Bed
-from cytoBand import cytoBand
-from BigWig import BigWig
-from util.Fasta import Fasta
-from util.Filters import TraceBackFormatter
-from Gff3 import Gff3
-from Gtf import Gtf
-from Psl import Psl
+from util.Reader import Reader
+from util.Logger import Logger
 from TrackHub import TrackHub
-from bigPsl import bigPsl
-from BedBlastAlignments import BedBlastAlignments
-from BigBed import BigBed
+
+
 
 # TODO: Verify each subprocessed dependency is accessible [gff3ToGenePred, genePredToBed, twoBitInfo, faToTwoBit, bedToBigBed, sort
 
 
 def main(argv):
+
     # Command Line parsing init
     parser = argparse.ArgumentParser(description='Create a foo.txt inside the given folder.')
+    parser.add_argument('-j', '--data_json', help='JSON file containing the metadata of the inputs')
+    parser.add_argument('-o', '--output', help='Name of the HTML summarizing the content of the Track Hub Archive')
+
+    # Get the args passed in parameter
+    args = parser.parse_args()
+    json_inputs_data = args.data_json
+    outputFile = args.output
 
-    # Reference genome mandatory
-    parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome')
-
-    # GFF3 Management
-    parser.add_argument('--gff3', action='append', help='GFF3 format')
-
-    # GTF Management
-    parser.add_argument('--gtf', action='append', help='GTF format')
-
-    # Bed4+12 (TrfBig)
-    parser.add_argument('--bedSimpleRepeats', action='append', help='Bed4+12 format, using simpleRepeats.as')
-
-    # Bed12+1 (regtools)
-    parser.add_argument('--bedSpliceJunctions', action='append', help='Bed12+1 format, using spliceJunctions.as')
-
-    # Generic Bed (Blastx transformed to bed)
-    parser.add_argument('--bed', action='append', help='Bed generic format')
-
-    #cytoBandIdeo
-    parser.add_argument('--cytoBand', action='append', help='Cytoband Track, using cytoBandIdeo.as')
-
-    # BigPsl (blat alignment)
-    parser.add_argument('--bigpsl', action='append', help='bigPsl format, using bigPsl.as')
-
-    # Bed12+12 (tblastn alignment)
-    parser.add_argument('--bedBlastAlignments', action='append', help='Bed12+12 format, using bigPsl.as')
-
-    # BigWig Management
-    parser.add_argument('--bigwig', action='append', help='BigWig format')
-
-    # Bam Management
-    parser.add_argument('--bam', action='append', help='Bam format')
-
-    # Psl Management
-    parser.add_argument('--psl', action='append', help='Psl format')
-
-    # BigBed Management
-    parser.add_argument('--bigbed', action='append', help='BigBed format')
-
-    # TODO: Check if the running directory can have issues if we run the tool outside
-    parser.add_argument('-d', '--directory',
-                        help='Running tool directory, where to find the templates. Default is running directory')
-    parser.add_argument('-u', '--ucsc_tools_path',
-                        help='Directory where to find the executables needed to run this tool')
-    parser.add_argument('-e', '--extra_files_path',
-                        help='Name, in galaxy, of the output folder. Where you would want to build the Track Hub Archive')
-    parser.add_argument('-o', '--output', help='Name of the HTML summarizing the content of the Track Hub Archive')
-
-    parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs')
-
-    parser.add_argument('--user_email', help='Email of the user who launched the Hub Archive Creation')
-
-    parser.add_argument('--genome_name', help='UCSC Genome Browser assembly ID')
-
-    parser.add_argument('--debug_mode', action='store_true', help='Allow more details about the errors')
+    ##Parse JSON file with Reader
+    reader = Reader(json_inputs_data)
 
     # Begin init variables
-
-    toolDirectory = '.'
-    extra_files_path = '.'
-
-    # Get the args passed in parameter
-    args = parser.parse_args()
-
-    extra_files_path = args.extra_files_path
-    toolDirectory = args.directory
+    extra_files_path = reader.getExtFilesPath()
+    toolDirectory = reader.getToolDir()
+    #outputFile = reader.getOutputDir()
+    user_email = reader.getUserEmail()
+    reference_genome = reader.getRefGenome()
+    debug_mode = reader.getDebugMode()
 
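
Note: from this revision on, the only command-line options left are -j/--data_json and -o/--output; everything that used to be a dedicated argparse flag is expected to arrive through the JSON file parsed by util.Reader. util/Reader.py is not part of this comparison, so its exact schema is unknown; the sketch below is only a guess, with key names invented from the getter calls visible in main (getExtFilesPath, getToolDir, getUserEmail, getRefGenome, getDebugMode, getTracksData) and from the {"false_path": ..., "name": ...} convention shown in the module docstring. Funneling the whole tool form through one JSON keeps the command line stable when new track types are added.

import json
import subprocess

# Hypothetical metadata file for -j/--data_json; the real key names are defined by
# util/Reader.py (not shown in this comparison) and may well differ.
metadata = {
    "extra_files_path": "./myHub",        # cf. reader.getExtFilesPath()
    "tool_directory": ".",                # cf. reader.getToolDir()
    "user_email": "user@example.org",     # cf. reader.getUserEmail()
    "debug_mode": False,                  # cf. reader.getDebugMode()
    "fasta": {"false_path": "./test-data/common/dbia3.fa", "name": "dbia3"},  # cf. reader.getRefGenome()
    "tracks": {},                         # cf. reader.getTracksData()
}

with open("metadata.json", "w") as metadata_file:
    json.dump(metadata, metadata_file)

# Only -j/--data_json and -o/--output remain on the command line.
subprocess.check_call(["python", "hubArchiveCreator.py",
                       "-j", "metadata.json",
                       "-o", "output.html"])
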
     #### Logging management ####
     # If we are in Debug mode, also print in stdout the debug dump
-
-    configure_logger(extra_files_path=extra_files_path, debug=args.debug_mode)
-
+    log = Logger(tool_directory=toolDirectory, debug=debug_mode, extra_files_path=extra_files_path)
+    log.setup_logging()
+    logging.info('#### HubArchiveCreator: Start ####\n')
+    logging.debug('---- Welcome in HubArchiveCreator Debug Mode ----\n')
+    logging.debug('JSON parameters: %s\n\n', json.dumps(reader.args))
     #### END Logging management ####
 
-    array_inputs_reference_genome = json.loads(args.fasta)
-
-    # TODO: Replace these with the object Fasta
-    input_fasta_file = array_inputs_reference_genome["false_path"]
-    input_fasta_file_name = sanitize_name_input(array_inputs_reference_genome["name"])
-    genome_name = sanitize_name_input(args.genome_name)
-
-    reference_genome = Fasta(input_fasta_file,
-                             input_fasta_file_name, genome_name)
-
-    user_email = args.user_email
-
-
-    # TODO: Use a class to have a better management of the structure of these inputs
-    # These inputs are populated in the Galaxy Wrapper xml and are in this format:
-    # ARRAY[DICT{FILE_PATH: DICT{NAME: NAME_VALUE, EXTRA_DATA: EXTRA_DATA_VALUE}}]
-    # EXTRA_DATA could be anything, for example the index of a BAM => {"index", FILE_PATH}
-    array_inputs_bam = args.bam
-    array_inputs_bed_generic = args.bed
-    array_inputs_bed_cytoBand = args.cytoBand
-    array_inputs_bed_simple_repeats = args.bedSimpleRepeats
-    array_inputs_bed_splice_junctions = args.bedSpliceJunctions
-    array_inputs_bigwig = args.bigwig
-    array_inputs_gff3 = args.gff3
-    array_inputs_gtf = args.gtf
-    array_inputs_psl = args.psl
-    array_inputs_bigpsl = args.bigpsl
-    array_inputs_bed_blast_alignments = args.bedBlastAlignments
-    array_inputs_bigbed = args.bigbed
-
-    outputFile = args.output
-
-    json_inputs_data = args.data_json
-
-    # TODO: Instead use a class to properly store the objects, with object_hook
-    inputs_data = json.loads(json_inputs_data)
-    # We remove the spaces in ["name"] of inputs_data
-    sanitize_name_inputs(inputs_data)
-
-    # TODO: Check here all the binaries / tools we need. Exception if missing
-
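
Note: the block deleted above used to decode --data_json itself; as described by the removed comment and consumed by create_ordered_datatype_objects further down, that payload was a dictionary keyed by each dataset's Galaxy "false path", whose values carried at least a display name and the tool-form order index, plus optional extra data such as the index of a Bam. A minimal sketch of that old structure (the paths and values are made up for illustration):

# Old --data_json payload: ARRAY[DICT{FILE_PATH: DICT{NAME: ..., EXTRA_DATA: ...}}],
# i.e. after json.loads() a dict keyed by the Galaxy false path of each dataset.
inputs_data = {
    "/galaxy/files/000/dataset_42.dat": {
        "name": "augustus_predictions",   # sanitize_name_inputs() replaces '/' and ' ' with '_'
        "order_index": 1,                 # used as the key of the OrderedDict of tracks
    },
    "/galaxy/files/000/dataset_43.dat": {
        "name": "rnaseq_alignments",
        "order_index": 2,
        "index": "/galaxy/files/000/dataset_43.bai",  # EXTRA_DATA example for a Bam input
    },
}
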
     # Create the Track Hub folder
+    logging.info('#### HubArchiveCreator: Creating the Track Hub folder ####\n')
     trackHub = TrackHub(reference_genome, user_email, outputFile, extra_files_path, toolDirectory)
 
-    all_datatype_dictionary = {}
-
-    for (inputs, datatype_class) in [
-            (array_inputs_bam, Bam),
-            (array_inputs_bed_generic, Bed),
-            (array_inputs_bed_cytoBand, cytoBand),
-            (array_inputs_bigwig, BigWig),
-            (array_inputs_bed_simple_repeats, BedSimpleRepeats),
-            (array_inputs_bed_splice_junctions, BedSpliceJunctions),
-            (array_inputs_gff3, Gff3),
-            (array_inputs_gtf, Gtf),
-            (array_inputs_psl, Psl),
-            (array_inputs_bigpsl, bigPsl),
-            (array_inputs_bed_blast_alignments, BedBlastAlignments),
-            (array_inputs_bigbed, BigBed)]:
-        if inputs:
-            all_datatype_dictionary.update(create_ordered_datatype_objects(datatype_class, inputs, inputs_data))
-
     # Create Ordered Dictionary to add the tracks in the tool form order
+    logging.info('#### HubArchiveCreator: Preparing track data ####\n')
+    all_datatype_dictionary = reader.getTracksData()
     all_datatype_ordered_dictionary = collections.OrderedDict(all_datatype_dictionary)
 
     logging.debug("----- End of all_datatype_dictionary processing -----")
-    logging.debug("all_datatype_ordered_dictionary keys are: {0}".format(all_datatype_ordered_dictionary.values()))
+    #logging.debug("all_datatype_ordered_dictionary are: %s", json.dumps(all_datatype_ordered_dictionary))
 
+    logging.info('#### HubArchiveCreator: Adding tracks to Track Hub ####\n')
     logging.debug("----- Beginning of Track adding processing -----")
+
     for index, datatypeObject in all_datatype_ordered_dictionary.iteritems():
-        trackHub.addTrack(datatypeObject.track.trackDb)
+        trackHub.addTrack(datatypeObject.track.track_db)
+
     logging.debug("----- End of Track adding processing -----")
 
-    # We process all the modifications to create the zip file
-    #trackHub.createZip()
-
-    # We terminate le process and so create a HTML file summarizing all the files
+    # We terminate the process and so create a HTML file summarizing all the files
+    logging.info('#### HubArchiveCreator: Creating the HTML file ####\n')
     trackHub.terminate()
 
-    logging.debug('#### End of HubArchiveCreator Debug Mode: Bye! ####')
+    logging.debug('---- End of HubArchiveCreator Debug Mode: Bye! ----\n')
+    logging.info('#### HubArchiveCreator: Congratulation! Assembly Hub is created! ####\n')
 
     sys.exit(0)
 
-
-def sanitize_name_input(string_to_sanitize):
-    """
-    Sanitize the string passed in parameter by replacing '/' and ' ' by '_'
-
-    :param string_to_sanitize:
-    :return :
-
-    :Example:
-
-    >>> sanitize_name_input('this/is an//example')
-    this_is_an__example
-    """
-    return string_to_sanitize \
-        .replace("/", "_") \
-        .replace(" ", "_")
-
-
-def sanitize_name_inputs(inputs_data):
-    """
-    Sanitize value of the keys "name" of the dictionary passed in parameter.
-
-    Because sometimes output from Galaxy, or even just file name, from user inputs, have spaces.
-    Also, it can contain '/' character and could break the use of os.path function.
-
-    :param inputs_data: dict[string, dict[string, string]]
-    """
-    for key in inputs_data:
-        inputs_data[key]["name"] = sanitize_name_input(inputs_data[key]["name"])
-
-
-def create_ordered_datatype_objects(ExtensionClass, array_inputs, inputs_data):
-    """
-    Function which executes the creation all the necessary files / folders for a special Datatype, for TrackHub
-    and update the dictionary of datatype
-
-    :param ExtensionClass:
-    :param array_inputs:
-    :param inputs_data:
-    :type ExtensionClass: Datatype
-    :type array_inputs: list[string]
-    :type inputs_data: dict
-    :rtype: dict
-    """
-
-    datatype_dictionary = {}
-
-    # TODO: Optimize this double loop
-    for input_false_path in array_inputs:
-        for key, data_value in inputs_data.items():
-            if key == input_false_path:
-                logging.debug("input_false_path: " + input_false_path)
-                logging.debug("data_value: " + str(data_value))
-                extensionObject = ExtensionClass(input_false_path, data_value)
-                datatype_dictionary.update({data_value["order_index"]: extensionObject})
-    return datatype_dictionary
-
-def configure_logger(extra_files_path=None, debug=False):
-    if not extra_files_path:
-        raise Exception("Extra files path is not set. Stopping the application")
-
-
-    # All case log: log everything in a .log file
-    logger_file_name = ''.join([__name__, '.log'])
-    logging_file_path = os.path.join(extra_files_path, logger_file_name)
-
-    logging.basicConfig(filename=logging_file_path, level=logging.DEBUG)
-
-    log_stdout = logging.StreamHandler(sys.stdout)
-    if not debug:
-        configure_logger_user(log_stdout)
-    else:
-        configure_logger_dev(log_stdout)
-
-    # stderr configuration
-    configure_logger_stderr()
-
-    logging.debug('#### Welcome in HubArchiveCreator Debug Mode ####\n')
-
-def configure_logger_user(log_stdout=None):
-    """
-    User Logger is defined as following:
-        - User needs to have WARN, ERROR and CRITICAL but well formatted / without traceback
-        in STDOUT
-        - Still access to full, brute and traceback for errors
-        in STDERR
-        - And further access to debug if needed
-        in .log
-
-    """
-
-    if not log_stdout:
-        raise Exception("No log_stdout given. Stopping the application")
-
-    # stdout for INFO / WARN / ERROR / CRITICAL
-    log_stdout.setLevel(logging.INFO)
-
-    formatter = TraceBackFormatter('%(message)s')
-
-    log_stdout.setFormatter(formatter)
-
-    logging.getLogger().addHandler(log_stdout)
-
-def configure_logger_dev(log_stdout=None):
-    """
-    Dev Logger is defined as following:
-        - Dev needs to have WARN, ERROR and CRITICAL but well formatted / without traceback, in stdout
-        - Still access to full, brute and traceback in stderr for errors
-        - And further access to debug if needed
-
-    """
-    if not log_stdout:
-        raise Exception("No log_stdout given. Stopping the application")
-    log_format = '%(message)s'
-
-    # stdout and stderr and both identical for INFO / WARN / ERROR / CRITICAL
-    log_stdout.setLevel(logging.DEBUG)
-
-    formatter = logging.Formatter(log_format)
-
-    log_stdout.setFormatter(formatter)
-
-    logging.getLogger().addHandler(log_stdout)
-
-def configure_logger_stderr():
-    """
-    Configure what should be logged in stderr
-    """
-    log_error = logging.StreamHandler(sys.stderr)
-    log_error.setLevel(logging.ERROR)
-    log_error_format = '%(message)s'
-
-    formatter_error = logging.Formatter(log_error_format)
-
-    log_error.setFormatter(formatter_error)
-
-    logging.getLogger().addHandler(log_error)
-
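
Note: the configure_logger* helpers deleted above are superseded by the util.Logger class imported at the top of the new revision; util/Logger.py itself is not included in this comparison. Assuming it simply repackages the behaviour removed here (full DEBUG dump to a .log file under extra_files_path, INFO or DEBUG to stdout depending on debug mode, errors to stderr), a minimal sketch could look like the following; the TraceBackFormatter from util.Filters is left out for brevity.

import logging
import os
import sys


class Logger(object):
    """Hypothetical stand-in for util/Logger.py, mirroring the removed helpers."""

    def __init__(self, tool_directory=None, debug=False, extra_files_path=None):
        self.tool_directory = tool_directory
        self.debug = debug
        self.extra_files_path = extra_files_path

    def setup_logging(self):
        # Everything, including DEBUG, goes to a .log file kept with the other outputs.
        log_file_path = os.path.join(self.extra_files_path, 'HubArchiveCreator.log')
        logging.basicConfig(filename=log_file_path, level=logging.DEBUG)

        # stdout: full dump in debug mode, otherwise INFO and above only.
        log_stdout = logging.StreamHandler(sys.stdout)
        log_stdout.setLevel(logging.DEBUG if self.debug else logging.INFO)
        log_stdout.setFormatter(logging.Formatter('%(message)s'))
        logging.getLogger().addHandler(log_stdout)

        # stderr: errors and above, plain message format.
        log_stderr = logging.StreamHandler(sys.stderr)
        log_stderr.setLevel(logging.ERROR)
        log_stderr.setFormatter(logging.Formatter('%(message)s'))
        logging.getLogger().addHandler(log_stderr)
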
 if __name__ == "__main__":
-    logging.getLogger(__name__)
     main(sys.argv)