comparison hubArchiveCreator.py @ 0:f493979f1408 draft default tip

planemo upload for repository https://github.com/Yating-L/hubarchivecreator-test commit 48b59e91e2dcc2e97735ee35d587960cbfbce932-dirty
author yating-l
date Wed, 21 Dec 2016 12:13:04 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:f493979f1408
1 #!/usr/bin/python
2 # -*- coding: utf8 -*-
3
4 """
5 This Galaxy tool prepares your files to be ready for
6 Assembly Hub visualization.
7 Program test arguments:
8 hubArchiveCreator.py -g test-data/augustusDbia3.gff3 -f test-data/dbia3.fa -d . -u ./tools -o output.html
9 """
10
11 import argparse
12 import collections
13 import json
14 import logging
15 import os
16 import sys
17
18 # Internal dependencies
19 from Bam import Bam
20 from BedSimpleRepeats import BedSimpleRepeats
21 from BedSpliceJunctions import BedSpliceJunctions
22 from Bed import Bed
23 from BigWig import BigWig
24 from util.Fasta import Fasta
25 from util.Filters import TraceBackFormatter
26 from Gff3 import Gff3
27 from Gtf import Gtf
28 from Psl import Psl
29 from TrackHub import TrackHub
30
31 # TODO: Verify each subprocessed dependency is accessible [gff3ToGenePred, genePredToBed, twoBitInfo, faToTwoBit, bedToBigBed, sort]
32
33
def main(argv):
    """
    Entry point of the Hub Archive Creator.

    Parses the command line, configures logging, builds one datatype object
    per input file, registers their tracks into a TrackHub and terminates it,
    producing the HTML summary of the Track Hub Archive.

    :param argv: full command line; argv[0] is the program name
    """
    # Command Line parsing init
    parser = argparse.ArgumentParser(description='Create a foo.txt inside the given folder.')

    # Reference genome mandatory
    parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome')

    # GFF3 Management
    parser.add_argument('--gff3', action='append', help='GFF3 format')

    # GTF Management
    parser.add_argument('--gtf', action='append', help='GTF format')

    # Bed4+12 (TrfBig)
    parser.add_argument('--bedSimpleRepeats', action='append', help='Bed4+12 format, using simpleRepeats.as')

    # Bed12+1 (regtools)
    parser.add_argument('--bedSpliceJunctions', action='append', help='Bed12+1 format, using spliceJunctions.as')

    # Generic Bed (Blastx transformed to bed)
    parser.add_argument('--bed', action='append', help='Bed generic format')

    # BigWig Management
    parser.add_argument('--bigwig', action='append', help='BigWig format')

    # Bam Management
    parser.add_argument('--bam', action='append', help='Bam format')

    # Psl Management
    parser.add_argument('--psl', action='append', help='Psl format')

    # TODO: Check if the running directory can have issues if we run the tool outside
    parser.add_argument('-d', '--directory',
                        help='Running tool directory, where to find the templates. Default is running directory')
    parser.add_argument('-u', '--ucsc_tools_path',
                        help='Directory where to find the executables needed to run this tool')
    parser.add_argument('-e', '--extra_files_path',
                        help='Name, in galaxy, of the output folder. Where you would want to build the Track Hub Archive')
    parser.add_argument('-o', '--output', help='Name of the HTML summarizing the content of the Track Hub Archive')

    parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs')

    parser.add_argument('--user_email', help='Email of the user who launched the Hub Archive Creation')

    parser.add_argument('--genome_name', help='UCSC Genome Browser assembly ID')

    parser.add_argument('--debug_mode', action='store_true', help='Allow more details about the errors')

    # FIX: honour the argv parameter instead of silently re-reading
    # sys.argv; argv[0] is the program name, so only the rest is parsed.
    # For the canonical call main(sys.argv) the behaviour is unchanged.
    args = parser.parse_args(argv[1:])

    extra_files_path = args.extra_files_path
    toolDirectory = args.directory

    #### Logging management ####
    # If we are in Debug mode, also print in stdout the debug dump
    configure_logger(extra_files_path=extra_files_path, debug=args.debug_mode)
    #### END Logging management ####

    # The fasta argument is itself a small JSON document carrying the Galaxy
    # "false path" and the display name of the reference genome
    array_inputs_reference_genome = json.loads(args.fasta)

    # TODO: Replace these with the object Fasta
    input_fasta_file = array_inputs_reference_genome["false_path"]
    input_fasta_file_name = sanitize_name_input(array_inputs_reference_genome["name"])
    genome_name = sanitize_name_input(args.genome_name)

    reference_genome = Fasta(input_fasta_file,
                             input_fasta_file_name, genome_name)

    user_email = args.user_email

    # TODO: Use a class to have a better management of the structure of these inputs
    # These inputs are populated in the Galaxy Wrapper xml and are in this format:
    # ARRAY[DICT{FILE_PATH: DICT{NAME: NAME_VALUE, EXTRA_DATA: EXTRA_DATA_VALUE}}]
    # EXTRA_DATA could be anything, for example the index of a BAM => {"index", FILE_PATH}
    array_inputs_bam = args.bam
    array_inputs_bed_generic = args.bed
    array_inputs_bed_simple_repeats = args.bedSimpleRepeats
    array_inputs_bed_splice_junctions = args.bedSpliceJunctions
    array_inputs_bigwig = args.bigwig
    array_inputs_gff3 = args.gff3
    array_inputs_gtf = args.gtf
    array_inputs_psl = args.psl

    outputFile = args.output

    json_inputs_data = args.data_json

    # TODO: Instead use a class to properly store the objects, with object_hook
    inputs_data = json.loads(json_inputs_data)
    # We remove the spaces in ["name"] of inputs_data
    sanitize_name_inputs(inputs_data)

    # TODO: Check here all the binaries / tools we need. Exception if missing

    # Create the Track Hub folder
    trackHub = TrackHub(reference_genome, user_email, outputFile, extra_files_path, toolDirectory)

    all_datatype_dictionary = {}

    for (inputs, datatype_class) in [
            (array_inputs_bam, Bam),
            (array_inputs_bed_generic, Bed),
            (array_inputs_bigwig, BigWig),
            (array_inputs_bed_simple_repeats, BedSimpleRepeats),
            (array_inputs_bed_splice_junctions, BedSpliceJunctions),
            (array_inputs_gff3, Gff3),
            (array_inputs_gtf, Gtf),
            (array_inputs_psl, Psl)]:
        if inputs:
            all_datatype_dictionary.update(create_ordered_datatype_objects(datatype_class, inputs, inputs_data))

    # Create Ordered Dictionary to add the tracks in the tool form order.
    # FIX: a plain dict's iteration order is arbitrary (pre-3.7), so the
    # OrderedDict is built from the items sorted by their order_index key —
    # otherwise the "tool form order" promise above was not actually kept.
    all_datatype_ordered_dictionary = collections.OrderedDict(sorted(all_datatype_dictionary.items()))

    logging.debug("----- End of all_datatype_dictionary processing -----")
    logging.debug("all_datatype_ordered_dictionary keys are: {0}".format(all_datatype_ordered_dictionary.values()))

    logging.debug("----- Beginning of Track adding processing -----")
    # FIX: items() instead of the Python-2-only iteritems(), so the module
    # also runs under Python 3.
    for index, datatypeObject in all_datatype_ordered_dictionary.items():
        trackHub.addTrack(datatypeObject.track.trackDb)
    logging.debug("----- End of Track adding processing -----")

    # We process all the modifications to create the zip file
    #trackHub.createZip()

    # We terminate the process and so create a HTML file summarizing all the files
    trackHub.terminate()

    logging.debug('#### End of HubArchiveCreator Debug Mode: Bye! ####')

    sys.exit(0)
174
175
def sanitize_name_input(string_to_sanitize):
    """
    Return *string_to_sanitize* with every '/' and ' ' replaced by '_'.

    '/' would break os.path manipulations and spaces are unsafe in file
    names coming from Galaxy or from the user.

    :param string_to_sanitize: raw name string
    :return: the sanitized string

    :Example:

    >>> sanitize_name_input('this/is an//example')
    'this_is_an__example'
    """
    # FIX: the doctest expected output now shows the repr (with quotes),
    # so the example actually passes under the doctest runner.
    return string_to_sanitize \
        .replace("/", "_") \
        .replace(" ", "_")
191
192
def sanitize_name_inputs(inputs_data):
    """
    Sanitize, in place, the "name" entry of every value of *inputs_data*.

    Galaxy outputs (or plain user file names) sometimes contain spaces, and
    may even contain '/' characters that would break the use of os.path
    functions, so both are replaced by '_'.

    :param inputs_data: dict[string, dict[string, string]]
    """
    for entry in inputs_data.values():
        entry["name"] = entry["name"].replace("/", "_").replace(" ", "_")
204
205
206 def create_ordered_datatype_objects(ExtensionClass, array_inputs, inputs_data):
207 """
208 Function which executes the creation all the necessary files / folders for a special Datatype, for TrackHub
209 and update the dictionary of datatype
210
211 :param ExtensionClass:
212 :param array_inputs:
213 :param inputs_data:
214 :type ExtensionClass: Datatype
215 :type array_inputs: list[string]
216 :type inputs_data: dict
217 :rtype: dict
218 """
219
220 datatype_dictionary = {}
221
222 # TODO: Optimize this double loop
223 for input_false_path in array_inputs:
224 for key, data_value in inputs_data.items():
225 if key == input_false_path:
226 logging.debug("input_false_path: " + input_false_path)
227 logging.debug("data_value: " + str(data_value))
228 extensionObject = ExtensionClass(input_false_path, data_value)
229 datatype_dictionary.update({data_value["order_index"]: extensionObject})
230 return datatype_dictionary
231
def configure_logger(extra_files_path=None, debug=False):
    """
    Set up application-wide logging.

    A DEBUG-level .log file is always written inside *extra_files_path*;
    stdout gets either the user-oriented or the dev-oriented handler, and
    stderr receives errors.

    :param extra_files_path: directory receiving the .log file (mandatory)
    :param debug: when True, mirror the full debug stream to stdout
    :raise Exception: if extra_files_path is not provided
    """
    if not extra_files_path:
        raise Exception("Extra files path is not set. Stopping the application")

    # All case log: everything (DEBUG and above) goes to a <module>.log file
    logging_file_path = os.path.join(extra_files_path, __name__ + '.log')
    logging.basicConfig(filename=logging_file_path, level=logging.DEBUG)

    stdout_handler = logging.StreamHandler(sys.stdout)
    if debug:
        configure_logger_dev(stdout_handler)
    else:
        configure_logger_user(stdout_handler)

    # stderr configuration
    configure_logger_stderr()

    logging.debug('#### Welcome in HubArchiveCreator Debug Mode ####\n')
253
def configure_logger_user(log_stdout=None):
    """
    Wire the stdout handler for a regular user.

    The user sees INFO, WARN, ERROR and CRITICAL on stdout, formatted
    without tracebacks (TraceBackFormatter); full tracebacks stay available
    on stderr, and the .log file keeps the complete debug stream.

    :param log_stdout: StreamHandler bound to sys.stdout
    :raise Exception: if no handler is given
    """
    if not log_stdout:
        raise Exception("No log_stdout given. Stopping the application")

    # stdout only gets INFO and above, message-only, traceback-free
    log_stdout.setLevel(logging.INFO)
    log_stdout.setFormatter(TraceBackFormatter('%(message)s'))

    logging.getLogger().addHandler(log_stdout)
277
def configure_logger_dev(log_stdout=None):
    """
    Wire the stdout handler for a developer.

    Everything from DEBUG upward is mirrored to stdout with a bare
    message-only format; stderr and the .log file are configured elsewhere.

    :param log_stdout: StreamHandler bound to sys.stdout
    :raise Exception: if no handler is given
    """
    if not log_stdout:
        raise Exception("No log_stdout given. Stopping the application")

    # Devs get the full DEBUG stream on stdout, message-only format
    log_stdout.setLevel(logging.DEBUG)
    log_stdout.setFormatter(logging.Formatter('%(message)s'))

    logging.getLogger().addHandler(log_stdout)
298
def configure_logger_stderr():
    """
    Attach a stderr handler to the root logger.

    Only ERROR and CRITICAL records reach stderr, with a message-only format.
    """
    stderr_handler = logging.StreamHandler(sys.stderr)
    stderr_handler.setLevel(logging.ERROR)
    stderr_handler.setFormatter(logging.Formatter('%(message)s'))

    logging.getLogger().addHandler(stderr_handler)
312
if __name__ == "__main__":
    # NOTE(review): the logger returned by getLogger() is discarded, so this
    # call has no lasting effect here — configure_logger(), invoked from
    # main(), performs the actual logging setup. Confirm before removing.
    logging.getLogger(__name__)
    main(sys.argv)