Mercurial > repos > rmarenco > hubarchivecreator
comparison hubArchiveCreator.py @ 29:7e8a8b732db3 draft
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 1a81ebd0ddea950b84af3fc830e9267a4814b29f
author | yating-l |
---|---|
date | Wed, 16 May 2018 18:04:20 -0400 |
parents | fcc1021bd496 |
children |
comparison
equal
deleted
inserted
replaced
28:6aa28a85cc38 | 29:7e8a8b732db3 |
---|---|
3 | 3 |
4 """ | 4 """ |
5 This Galaxy tool permits to prepare your files to be ready for | 5 This Galaxy tool permits to prepare your files to be ready for |
6 Assembly Hub visualization. | 6 Assembly Hub visualization. |
7 Program test arguments: | 7 Program test arguments: |
8 hubArchiveCreator.py -g test-data/augustusDbia3.gff3 -f test-data/dbia3.fa -d . -u ./tools -o output.html | 8 hubArchiveCreator.py -g test-data/augustusDbia3.gff3 -f '{"false_path": "./test-data/common/dbia3.fa", "name":"dbia3"}' -d . -u ./tools -o output.html |
9 """ | 9 """ |
10 | 10 |
11 import argparse | 11 import argparse |
12 import collections | 12 import collections |
13 import json | 13 import json |
14 import logging | 14 import logging |
15 import os | 15 import os |
16 import sys | 16 import sys |
17 | 17 |
18 # Internal dependencies | 18 # Internal dependencies |
19 from Bam import Bam | 19 from util.Reader import Reader |
20 from BedSimpleRepeats import BedSimpleRepeats | 20 from util.Logger import Logger |
21 from BedSpliceJunctions import BedSpliceJunctions | |
22 from Bed import Bed | |
23 from cytoBand import cytoBand | |
24 from BigWig import BigWig | |
25 from util.Fasta import Fasta | |
26 from util.Filters import TraceBackFormatter | |
27 from Gff3 import Gff3 | |
28 from Gtf import Gtf | |
29 from Psl import Psl | |
30 from TrackHub import TrackHub | 21 from TrackHub import TrackHub |
31 from bigPsl import bigPsl | 22 |
32 from BedBlastAlignments import BedBlastAlignments | 23 |
33 from BigBed import BigBed | |
34 | 24 |
35 # TODO: Verify each subprocessed dependency is accessible [gff3ToGenePred, genePredToBed, twoBitInfo, faToTwoBit, bedToBigBed, sort] | 25 # TODO: Verify each subprocessed dependency is accessible [gff3ToGenePred, genePredToBed, twoBitInfo, faToTwoBit, bedToBigBed, sort] |
36 | 26 |
37 | 27 |
38 def main(argv): | 28 def main(argv): |
29 | |
39 # Command Line parsing init | 30 # Command Line parsing init |
40 parser = argparse.ArgumentParser(description='Create a foo.txt inside the given folder.') | 31 parser = argparse.ArgumentParser(description='Create a foo.txt inside the given folder.') |
32 parser.add_argument('-j', '--data_json', help='JSON file containing the metadata of the inputs') | |
33 parser.add_argument('-o', '--output', help='Name of the HTML summarizing the content of the Track Hub Archive') | |
34 | |
35 # Get the args passed in parameter | |
36 args = parser.parse_args() | |
37 json_inputs_data = args.data_json | |
38 outputFile = args.output | |
41 | 39 |
42 # Reference genome mandatory | 40 ##Parse JSON file with Reader |
43 parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome') | 41 reader = Reader(json_inputs_data) |
44 | |
45 # GFF3 Management | |
46 parser.add_argument('--gff3', action='append', help='GFF3 format') | |
47 | |
48 # GTF Management | |
49 parser.add_argument('--gtf', action='append', help='GTF format') | |
50 | |
51 # Bed4+12 (TrfBig) | |
52 parser.add_argument('--bedSimpleRepeats', action='append', help='Bed4+12 format, using simpleRepeats.as') | |
53 | |
54 # Bed12+1 (regtools) | |
55 parser.add_argument('--bedSpliceJunctions', action='append', help='Bed12+1 format, using spliceJunctions.as') | |
56 | |
57 # Generic Bed (Blastx transformed to bed) | |
58 parser.add_argument('--bed', action='append', help='Bed generic format') | |
59 | |
60 #cytoBandIdeo | |
61 parser.add_argument('--cytoBand', action='append', help='Cytoband Track, using cytoBandIdeo.as') | |
62 | |
63 # BigPsl (blat alignment) | |
64 parser.add_argument('--bigpsl', action='append', help='bigPsl format, using bigPsl.as') | |
65 | |
66 # Bed12+12 (tblastn alignment) | |
67 parser.add_argument('--bedBlastAlignments', action='append', help='Bed12+12 format, using bigPsl.as') | |
68 | |
69 # BigWig Management | |
70 parser.add_argument('--bigwig', action='append', help='BigWig format') | |
71 | |
72 # Bam Management | |
73 parser.add_argument('--bam', action='append', help='Bam format') | |
74 | |
75 # Psl Management | |
76 parser.add_argument('--psl', action='append', help='Psl format') | |
77 | |
78 # BigBed Management | |
79 parser.add_argument('--bigbed', action='append', help='BigBed format') | |
80 | |
81 # TODO: Check if the running directory can have issues if we run the tool outside | |
82 parser.add_argument('-d', '--directory', | |
83 help='Running tool directory, where to find the templates. Default is running directory') | |
84 parser.add_argument('-u', '--ucsc_tools_path', | |
85 help='Directory where to find the executables needed to run this tool') | |
86 parser.add_argument('-e', '--extra_files_path', | |
87 help='Name, in galaxy, of the output folder. Where you would want to build the Track Hub Archive') | |
88 parser.add_argument('-o', '--output', help='Name of the HTML summarizing the content of the Track Hub Archive') | |
89 | |
90 parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs') | |
91 | |
92 parser.add_argument('--user_email', help='Email of the user who launched the Hub Archive Creation') | |
93 | |
94 parser.add_argument('--genome_name', help='UCSC Genome Browser assembly ID') | |
95 | |
96 parser.add_argument('--debug_mode', action='store_true', help='Allow more details about the errors') | |
97 | 42 |
98 # Begin init variables | 43 # Begin init variables |
99 | 44 extra_files_path = reader.getExtFilesPath() |
100 toolDirectory = '.' | 45 toolDirectory = reader.getToolDir() |
101 extra_files_path = '.' | 46 #outputFile = reader.getOutputDir() |
102 | 47 user_email = reader.getUserEmail() |
103 # Get the args passed in parameter | 48 reference_genome = reader.getRefGenome() |
104 args = parser.parse_args() | 49 debug_mode = reader.getDebugMode() |
105 | |
106 extra_files_path = args.extra_files_path | |
107 toolDirectory = args.directory | |
108 | 50 |
109 #### Logging management #### | 51 #### Logging management #### |
110 # If we are in Debug mode, also print in stdout the debug dump | 52 # If we are in Debug mode, also print in stdout the debug dump |
111 | 53 log = Logger(tool_directory=toolDirectory, debug=debug_mode, extra_files_path=extra_files_path) |
112 configure_logger(extra_files_path=extra_files_path, debug=args.debug_mode) | 54 log.setup_logging() |
113 | 55 logging.info('#### HubArchiveCreator: Start ####\n') |
56 logging.debug('---- Welcome in HubArchiveCreator Debug Mode ----\n') | |
57 logging.debug('JSON parameters: %s\n\n', json.dumps(reader.args)) | |
114 #### END Logging management #### | 58 #### END Logging management #### |
115 | 59 |
116 array_inputs_reference_genome = json.loads(args.fasta) | |
117 | |
118 # TODO: Replace these with the object Fasta | |
119 input_fasta_file = array_inputs_reference_genome["false_path"] | |
120 input_fasta_file_name = sanitize_name_input(array_inputs_reference_genome["name"]) | |
121 genome_name = sanitize_name_input(args.genome_name) | |
122 | |
123 reference_genome = Fasta(input_fasta_file, | |
124 input_fasta_file_name, genome_name) | |
125 | |
126 user_email = args.user_email | |
127 | |
128 | |
129 # TODO: Use a class to have a better management of the structure of these inputs | |
130 # These inputs are populated in the Galaxy Wrapper xml and are in this format: | |
131 # ARRAY[DICT{FILE_PATH: DICT{NAME: NAME_VALUE, EXTRA_DATA: EXTRA_DATA_VALUE}}] | |
132 # EXTRA_DATA could be anything, for example the index of a BAM => {"index", FILE_PATH} | |
133 array_inputs_bam = args.bam | |
134 array_inputs_bed_generic = args.bed | |
135 array_inputs_bed_cytoBand = args.cytoBand | |
136 array_inputs_bed_simple_repeats = args.bedSimpleRepeats | |
137 array_inputs_bed_splice_junctions = args.bedSpliceJunctions | |
138 array_inputs_bigwig = args.bigwig | |
139 array_inputs_gff3 = args.gff3 | |
140 array_inputs_gtf = args.gtf | |
141 array_inputs_psl = args.psl | |
142 array_inputs_bigpsl = args.bigpsl | |
143 array_inputs_bed_blast_alignments = args.bedBlastAlignments | |
144 array_inputs_bigbed = args.bigbed | |
145 | |
146 outputFile = args.output | |
147 | |
148 json_inputs_data = args.data_json | |
149 | |
150 # TODO: Instead use a class to properly store the objects, with object_hook | |
151 inputs_data = json.loads(json_inputs_data) | |
152 # We remove the spaces in ["name"] of inputs_data | |
153 sanitize_name_inputs(inputs_data) | |
154 | |
155 # TODO: Check here all the binaries / tools we need. Exception if missing | |
156 | |
157 # Create the Track Hub folder | 60 # Create the Track Hub folder |
61 logging.info('#### HubArchiveCreator: Creating the Track Hub folder ####\n') | |
158 trackHub = TrackHub(reference_genome, user_email, outputFile, extra_files_path, toolDirectory) | 62 trackHub = TrackHub(reference_genome, user_email, outputFile, extra_files_path, toolDirectory) |
159 | 63 |
160 all_datatype_dictionary = {} | |
161 | |
162 for (inputs, datatype_class) in [ | |
163 (array_inputs_bam, Bam), | |
164 (array_inputs_bed_generic, Bed), | |
165 (array_inputs_bed_cytoBand, cytoBand), | |
166 (array_inputs_bigwig, BigWig), | |
167 (array_inputs_bed_simple_repeats, BedSimpleRepeats), | |
168 (array_inputs_bed_splice_junctions, BedSpliceJunctions), | |
169 (array_inputs_gff3, Gff3), | |
170 (array_inputs_gtf, Gtf), | |
171 (array_inputs_psl, Psl), | |
172 (array_inputs_bigpsl, bigPsl), | |
173 (array_inputs_bed_blast_alignments, BedBlastAlignments), | |
174 (array_inputs_bigbed, BigBed)]: | |
175 if inputs: | |
176 all_datatype_dictionary.update(create_ordered_datatype_objects(datatype_class, inputs, inputs_data)) | |
177 | |
178 # Create Ordered Dictionary to add the tracks in the tool form order | 64 # Create Ordered Dictionary to add the tracks in the tool form order |
65 logging.info('#### HubArchiveCreator: Preparing track data ####\n') | |
66 all_datatype_dictionary = reader.getTracksData() | |
179 all_datatype_ordered_dictionary = collections.OrderedDict(all_datatype_dictionary) | 67 all_datatype_ordered_dictionary = collections.OrderedDict(all_datatype_dictionary) |
180 | 68 |
181 logging.debug("----- End of all_datatype_dictionary processing -----") | 69 logging.debug("----- End of all_datatype_dictionary processing -----") |
182 logging.debug("all_datatype_ordered_dictionary keys are: {0}".format(all_datatype_ordered_dictionary.values())) | 70 #logging.debug("all_datatype_ordered_dictionary are: %s", json.dumps(all_datatype_ordered_dictionary)) |
183 | 71 |
72 logging.info('#### HubArchiveCreator: Adding tracks to Track Hub ####\n') | |
184 logging.debug("----- Beginning of Track adding processing -----") | 73 logging.debug("----- Beginning of Track adding processing -----") |
74 | |
185 for index, datatypeObject in all_datatype_ordered_dictionary.iteritems(): | 75 for index, datatypeObject in all_datatype_ordered_dictionary.iteritems(): |
186 trackHub.addTrack(datatypeObject.track.trackDb) | 76 trackHub.addTrack(datatypeObject.track.track_db) |
77 | |
187 logging.debug("----- End of Track adding processing -----") | 78 logging.debug("----- End of Track adding processing -----") |
188 | 79 |
189 # We process all the modifications to create the zip file | 80 # We terminate the process and so create a HTML file summarizing all the files |
190 #trackHub.createZip() | 81 logging.info('#### HubArchiveCreator: Creating the HTML file ####\n') |
191 | |
192 # We terminate the process and so create a HTML file summarizing all the files |
193 trackHub.terminate() | 82 trackHub.terminate() |
194 | 83 |
195 logging.debug('#### End of HubArchiveCreator Debug Mode: Bye! ####') | 84 logging.debug('---- End of HubArchiveCreator Debug Mode: Bye! ----\n') |
85 logging.info('#### HubArchiveCreator: Congratulation! Assembly Hub is created! ####\n') | |
196 | 86 |
197 sys.exit(0) | 87 sys.exit(0) |
198 | 88 |
199 | |
200 def sanitize_name_input(string_to_sanitize): | |
201 """ | |
202 Sanitize the string passed in parameter by replacing '/' and ' ' by '_' | |
203 | |
204 :param string_to_sanitize: | |
205 :return : | |
206 | |
207 :Example: | |
208 | |
209 >>> sanitize_name_input('this/is an//example') | |
210 this_is_an__example | |
211 """ | |
212 return string_to_sanitize \ | |
213 .replace("/", "_") \ | |
214 .replace(" ", "_") | |
215 | |
216 | |
217 def sanitize_name_inputs(inputs_data): | |
218 """ | |
219 Sanitize value of the keys "name" of the dictionary passed in parameter. | |
220 | |
221 Because sometimes output from Galaxy, or even just file name, from user inputs, have spaces. | |
222 Also, it can contain '/' character and could break the use of os.path function. | |
223 | |
224 :param inputs_data: dict[string, dict[string, string]] | |
225 """ | |
226 for key in inputs_data: | |
227 inputs_data[key]["name"] = sanitize_name_input(inputs_data[key]["name"]) | |
228 | |
229 | |
230 def create_ordered_datatype_objects(ExtensionClass, array_inputs, inputs_data): | |
231 """ | |
232 Function which executes the creation all the necessary files / folders for a special Datatype, for TrackHub | |
233 and update the dictionary of datatype | |
234 | |
235 :param ExtensionClass: | |
236 :param array_inputs: | |
237 :param inputs_data: | |
238 :type ExtensionClass: Datatype | |
239 :type array_inputs: list[string] | |
240 :type inputs_data: dict | |
241 :rtype: dict | |
242 """ | |
243 | |
244 datatype_dictionary = {} | |
245 | |
246 # TODO: Optimize this double loop | |
247 for input_false_path in array_inputs: | |
248 for key, data_value in inputs_data.items(): | |
249 if key == input_false_path: | |
250 logging.debug("input_false_path: " + input_false_path) | |
251 logging.debug("data_value: " + str(data_value)) | |
252 extensionObject = ExtensionClass(input_false_path, data_value) | |
253 datatype_dictionary.update({data_value["order_index"]: extensionObject}) | |
254 return datatype_dictionary | |
255 | |
256 def configure_logger(extra_files_path=None, debug=False): | |
257 if not extra_files_path: | |
258 raise Exception("Extra files path is not set. Stopping the application") | |
259 | |
260 | |
261 # All case log: log everything in a .log file | |
262 logger_file_name = ''.join([__name__, '.log']) | |
263 logging_file_path = os.path.join(extra_files_path, logger_file_name) | |
264 | |
265 logging.basicConfig(filename=logging_file_path, level=logging.DEBUG) | |
266 | |
267 log_stdout = logging.StreamHandler(sys.stdout) | |
268 if not debug: | |
269 configure_logger_user(log_stdout) | |
270 else: | |
271 configure_logger_dev(log_stdout) | |
272 | |
273 # stderr configuration | |
274 configure_logger_stderr() | |
275 | |
276 logging.debug('#### Welcome in HubArchiveCreator Debug Mode ####\n') | |
277 | |
278 def configure_logger_user(log_stdout=None): | |
279 """ | |
280 User Logger is defined as following: | |
281 - User needs to have WARN, ERROR and CRITICAL but well formatted / without traceback | |
282 in STDOUT | |
283 - Still access to full, brute and traceback for errors | |
284 in STDERR | |
285 - And further access to debug if needed | |
286 in .log | |
287 | |
288 """ | |
289 | |
290 if not log_stdout: | |
291 raise Exception("No log_stdout given. Stopping the application") | |
292 | |
293 # stdout for INFO / WARN / ERROR / CRITICAL | |
294 log_stdout.setLevel(logging.INFO) | |
295 | |
296 formatter = TraceBackFormatter('%(message)s') | |
297 | |
298 log_stdout.setFormatter(formatter) | |
299 | |
300 logging.getLogger().addHandler(log_stdout) | |
301 | |
302 def configure_logger_dev(log_stdout=None): | |
303 """ | |
304 Dev Logger is defined as following: | |
305 - Dev needs to have WARN, ERROR and CRITICAL but well formatted / without traceback, in stdout | |
306 - Still access to full, brute and traceback in stderr for errors | |
307 - And further access to debug if needed | |
308 | |
309 """ | |
310 if not log_stdout: | |
311 raise Exception("No log_stdout given. Stopping the application") | |
312 log_format = '%(message)s' | |
313 | |
314 # stdout and stderr and both identical for INFO / WARN / ERROR / CRITICAL | |
315 log_stdout.setLevel(logging.DEBUG) | |
316 | |
317 formatter = logging.Formatter(log_format) | |
318 | |
319 log_stdout.setFormatter(formatter) | |
320 | |
321 logging.getLogger().addHandler(log_stdout) | |
322 | |
323 def configure_logger_stderr(): | |
324 """ | |
325 Configure what should be logged in stderr | |
326 """ | |
327 log_error = logging.StreamHandler(sys.stderr) | |
328 log_error.setLevel(logging.ERROR) | |
329 log_error_format = '%(message)s' | |
330 | |
331 formatter_error = logging.Formatter(log_error_format) | |
332 | |
333 log_error.setFormatter(formatter_error) | |
334 | |
335 logging.getLogger().addHandler(log_error) | |
336 | |
337 if __name__ == "__main__": | 89 if __name__ == "__main__": |
338 logging.getLogger(__name__) | |
339 main(sys.argv) | 90 main(sys.argv) |