comparison customizemetadata.py @ 0:ab86614989fd draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/humann commit 6b06711cfba45855d5a992ed1c73c472eaef644f
author thanhlv
date Mon, 13 Feb 2023 16:16:49 +0000
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import bz2
import json
import pickle
import re
import sys
from importlib.metadata import version
from pathlib import Path

from packaging.version import Version


def load_from_json(json_fp):
    '''
    Read JSON file with marker metadata

    :param json_fp: Path to JSON file
    '''
    with open(json_fp, 'r') as json_f:
        data = json.load(json_f)

    for m in data['markers']:
        data['markers'][m]['ext'] = set(data['markers'][m]['ext'])

    for t in data['taxonomy']:
        if isinstance(data['taxonomy'][t], list):
            data['taxonomy'][t] = tuple(data['taxonomy'][t])
    return data


def dump_to_json(data, json_fp):
    '''
    Dump marker metadata to JSON file

    :param json_fp: Path to JSON file
    '''
    for m in data['markers']:
        data['markers'][m]['ext'] = list(data['markers'][m]['ext'])

    with open(json_fp, 'w') as json_f:
        json.dump(data, json_f)

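# Rough sketch of the metadata layout these two helpers round-trip (inferred
# from the accessors in this script; keys and values are illustrative, not
# taken from a real MetaPhlAn database):
#
#   {
#       "markers": {
#           "<marker name>": {"clade": "...", "ext": ["GCA_..."],
#                             "len": 1234, "taxon": "k__...|...|s__..."}
#       },
#       "taxonomy": {
#           "k__...|p__...|...|t__...": ["<taxid>|<taxid>|...", 4404432]
#       },
#       "merged_taxon": {"<taxon A> , <taxon B>": ...}
#   }
#
# On load, marker "ext" lists become sets and taxonomy values become tuples;
# dump_to_json reverses the "ext" conversion before writing.
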
def transform_pkl_to_json(pkl_fp, json_fp):
    '''
    Read a Pickle file and dump it to a JSON file

    :param pkl_fp: Path to input Pickle file
    :param json_fp: Path to output JSON file
    '''
    # load metadata from Pickle file
    with bz2.BZ2File(pkl_fp, 'r') as pkl_f:
        in_metadata = pickle.load(pkl_f)

    out_metadata = {
        'markers': in_metadata['markers'],
        'taxonomy': in_metadata['taxonomy'],
        'merged_taxon': {}
    }

    # transform merged_taxon tuple keys to strings
    for k in in_metadata['merged_taxon']:
        n = ' , '.join(k)
        out_metadata['merged_taxon'][n] = in_metadata['merged_taxon'][k]

    # dump metadata to JSON file
    dump_to_json(out_metadata, json_fp)

def validate_map_version(infile, file_type):
    '''
    Check conformity of a user-provided pkl/JSON file with Metaphlan SGB (>= v4.0).

    :param infile: Path to input Pickle/JSON file
    :param file_type: String defining file type, pkl or JSON. Case-insensitive
    '''
    file_type = file_type.lower()
    if file_type == 'pkl' or file_type == 'pickle':
        # load metadata from Pickle file
        with bz2.BZ2File(infile, 'r') as pkl_f:
            in_metadata = pickle.load(pkl_f)
    elif file_type == 'json':
        in_metadata = load_from_json(infile)
    else:
        raise ValueError("Unsupported file type to validate.")

    # Get the Metaphlan version available in $PATH
    metaphlan_version = Version(version('metaphlan'))

    # Ensure that the taxid string has the expected number of "|"-separated fields.
    # v3 DB releases encode the taxids as: ('2|1224|1236|91347|543|547|354276', 4404432)
    # v4 DB releases encode the taxids as: ('2|1224|1236|91347|543|547|354276|', 4404432)
    for k in in_metadata['taxonomy']:
        if (in_metadata['taxonomy'][k][0].count('|') != 7 and metaphlan_version >= Version('4')) or (in_metadata['taxonomy'][k][0].count('|') != 6 and metaphlan_version < Version('4')):
            print("The input taxonomy mapping file %s is incompatible with Metaphlan v.%s in $PATH." % (infile, metaphlan_version))
            sys.exit(42)

    print("%s is compatible with Metaphlan v.%s." % (infile, metaphlan_version))

def transform_json_to_pkl(json_fp, pkl_fp):
    '''
    Read a JSON file and dump it to a Pickle file

    :param json_fp: Path to input JSON file
    :param pkl_fp: Path to output Pickle file
    '''
    # load metadata from JSON file
    in_metadata = load_from_json(json_fp)

    out_metadata = {
        'markers': in_metadata['markers'],
        'taxonomy': in_metadata['taxonomy'],
        'merged_taxon': {}
    }

    # transform merged_taxon string keys back to tuples
    for k in in_metadata['merged_taxon']:
        n = tuple(k.split(' , '))
        out_metadata['merged_taxon'][n] = in_metadata['merged_taxon'][k]

    # dump metadata to Pickle file
    with bz2.BZ2File(pkl_fp, 'w') as pkl_f:
        pickle.dump(out_metadata, pkl_f)

def add_marker(in_json_fp, out_json_fp, name, m_length, g_length, gca, k_name, k_id, p_name, p_id, c_name, c_id, o_name, o_id, f_name, f_id, g_name, g_id, s_name, s_id, t_name):
    '''
    Add marker to JSON file

    :param in_json_fp: Path to input JSON file
    :param out_json_fp: Path to output JSON file
    :param name: Name of new marker
    :param m_length: Length of new marker
    :param g_length: List with lengths of genomes from which the new marker has been extracted
    :param gca: List with GCA of genomes from which the new marker has been extracted
    :param k_name: List with Name of Kingdom for genomes from which the new marker has been extracted
    :param k_id: List with NCBI id of Kingdom for genomes from which the new marker has been extracted
    :param p_name: List with Name of Phylum for genomes from which the new marker has been extracted
    :param p_id: List with NCBI id of Phylum for genomes from which the new marker has been extracted
    :param c_name: List with Name of Class for genomes from which the new marker has been extracted
    :param c_id: List with NCBI id of Class for genomes from which the new marker has been extracted
    :param o_name: List with Name of Order for genomes from which the new marker has been extracted
    :param o_id: List with NCBI id of Order for genomes from which the new marker has been extracted
    :param f_name: List with Name of Family for genomes from which the new marker has been extracted
    :param f_id: List with NCBI id of Family for genomes from which the new marker has been extracted
    :param g_name: List with Name of Genus for genomes from which the new marker has been extracted
    :param g_id: List with NCBI id of Genus for genomes from which the new marker has been extracted
    :param s_name: List with Name of Species for genomes from which the new marker has been extracted
    :param s_id: List with NCBI id of Species for genomes from which the new marker has been extracted
    :param t_name: List with Name of Strain for genomes from which the new marker has been extracted
    '''
    metadata = load_from_json(in_json_fp)

    # check that all lists have the same size
    genome_n = len(g_length)
    if len(gca) != genome_n:
        raise ValueError("Missing/Extra values in GCA list")
    if len(k_name) != genome_n:
        raise ValueError("Missing/Extra values in Kingdom name list")
    if len(k_id) != genome_n:
        raise ValueError("Missing/Extra values in Kingdom ID list")
    if len(p_name) != genome_n:
        raise ValueError("Missing/Extra values in Phylum name list")
    if len(p_id) != genome_n:
        raise ValueError("Missing/Extra values in Phylum ID list")
    if len(c_name) != genome_n:
        raise ValueError("Missing/Extra values in Class name list")
    if len(c_id) != genome_n:
        raise ValueError("Missing/Extra values in Class ID list")
    if len(o_name) != genome_n:
        raise ValueError("Missing/Extra values in Order name list")
    if len(o_id) != genome_n:
        raise ValueError("Missing/Extra values in Order ID list")
    if len(f_name) != genome_n:
        raise ValueError("Missing/Extra values in Family name list")
    if len(f_id) != genome_n:
        raise ValueError("Missing/Extra values in Family ID list")
    if len(g_name) != genome_n:
        raise ValueError("Missing/Extra values in Genus name list")
    if len(g_id) != genome_n:
        raise ValueError("Missing/Extra values in Genus ID list")
    if len(s_name) != genome_n:
        raise ValueError("Missing/Extra values in Species name list")
    if len(s_id) != genome_n:
        raise ValueError("Missing/Extra values in Species ID list")
    if len(t_name) != genome_n:
        raise ValueError("Missing/Extra values in Strain name list")

    # create dictionary to aggregate genome taxonomies and identify marker taxonomy
    taxonomy = {
        'k': set(),
        'p': set(),
        'c': set(),
        'o': set(),
        'f': set(),
        'g': set(),
        's': set(),
        't': set(),
    }

    # parse genomes
    for i in range(genome_n):
        # add taxonomy of new genome
        g_taxo_names = "k__%s|p__%s|c__%s|o__%s|f__%s|g__%s|s__%s|t__%s" % (
            k_name[i],
            p_name[i],
            c_name[i],
            o_name[i],
            f_name[i],
            g_name[i],
            s_name[i],
            t_name[i]
        )
        g_taxo_ids = "%s|%s|%s|%s|%s|%s|%s" % (
            k_id[i],
            p_id[i],
            c_id[i],
            o_id[i],
            f_id[i],
            g_id[i],
            s_id[i]
        )
        metadata['taxonomy'][g_taxo_names] = (g_taxo_ids, g_length[i])
        # aggregate taxon levels using sets
        taxonomy['k'].add(k_name[i])
        taxonomy['p'].add(p_name[i])
        taxonomy['c'].add(c_name[i])
        taxonomy['o'].add(o_name[i])
        taxonomy['f'].add(f_name[i])
        taxonomy['g'].add(g_name[i])
        taxonomy['s'].add(s_name[i])
        taxonomy['t'].add(t_name[i])

    # extract clade and taxon of marker
    clade = ''  # last level before the taxonomies of the genomes diverge
    taxon = ''  # combination of levels before divergence
    for level in ['k', 'p', 'c', 'o', 'f', 'g', 's', 't']:
        taxo = list(taxonomy[level])
        if len(taxo) == 1:
            clade = taxo[0]
            taxon = "%s|%s__%s" % (taxon, level, taxo[0])

    # add information about the new marker
    metadata['markers'][name] = {
        'clade': clade,
        'ext': set(gca),
        'len': m_length,
        'taxon': taxon
    }

    dump_to_json(metadata, out_json_fp)

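# Illustrative walk-through of the clade/taxon aggregation above (hypothetical
# genomes, not real MetaPhlAn entries): with two source genomes that share all
# levels from kingdom down to s__Escherichia_coli but differ at the strain
# level, each per-level set from 'k' to 's' holds a single value while 't'
# holds two, so the marker ends up with
#   clade = 'Escherichia_coli'
#   taxon = '|k__Bacteria|p__Proteobacteria|...|s__Escherichia_coli'
# i.e. the taxon string stops at the last level where the genomes still agree.
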
def format_markers(marker_l):
    '''
    Format markers

    :param marker_l: list of markers
    '''
    markers = []
    for m in marker_l:
        m = m.rstrip()
        if ' ' in m:
            markers.append(m.split(' ')[0])
        else:
            markers.append(m)
    return markers

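# For example (hypothetical marker names): a line such as
# 'marker_A some extra annotation\n' is reduced to 'marker_A', while a line
# containing only 'marker_B\n' is kept as 'marker_B'.
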
def get_markers(marker_fp):
    '''
    Get markers from a file

    :param marker_fp: Path to file with markers (1 per line)
    '''
    # load markers
    with open(marker_fp, 'r') as marker_f:
        markers = marker_f.readlines()

    # format markers
    markers = format_markers(markers)

    return markers

def check_not_found_markers(found_markers, original_markers):
    '''
    Print the original markers that were not found

    :param found_markers: list of found markers
    :param original_markers: list of original markers
    '''
    if len(found_markers) != len(original_markers):
        print('markers not found:')
        for m in original_markers:
            if m not in found_markers:
                print('- "%s"' % m)

def prune_taxonomy(in_taxonomy, taxon_s, gca_s):
    '''
    Prune the taxonomy dictionary to keep only entries matching the listed taxons or GCA ids

    :param in_taxonomy: taxonomy dictionary
    :param taxon_s: set of taxons to keep
    :param gca_s: set of GCA ids to keep
    '''
    out_taxonomy = {}
    kept_taxonomy = set()
    kept_taxons = set()
    kept_gca = set()
    for t, v in in_taxonomy.items():
        # check if t matches an element of taxon_s
        kept_taxon = False
        for t_k in taxon_s:
            if t_k in t:
                kept_taxon = True
                out_taxonomy[t] = v
                kept_taxonomy.add(t)
                kept_taxons.add(t_k)
                break
        # check if there is a GCA in the taxon id
        s = re.search(r'GCA_\d+$', t)
        if s:
            gca = s[0]
            # check if the GCA in the taxon id is in the list of GCA to keep
            if gca in gca_s:
                kept_gca.add(gca)
                if not kept_taxon:
                    out_taxonomy[t] = v
                    kept_taxonomy.add(t)

    print('%s kept taxonomy' % len(kept_taxonomy))
    print('%s / %s taxons not found' % (len(taxon_s) - len(kept_taxons), len(taxon_s)))
    print('%s / %s GCA taxons not found' % (len(gca_s) - len(kept_gca), len(gca_s)))
    return out_taxonomy

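# For instance (hypothetical entry): a taxonomy key ending in
# '...|t__GCA_000123456' matches the GCA_\d+$ pattern above, so the entry is
# also kept when 'GCA_000123456' is in the set of GCA ids to keep, even if no
# taxon string from taxon_s matched the key.
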
def remove_markers(in_json_fp, marker_fp, out_json_fp, kept_marker_fp):
    '''
    Remove markers from JSON file

    :param in_json_fp: Path to input JSON file
    :param marker_fp: Path to file with markers to remove (1 per line)
    :param out_json_fp: Path to output JSON file
    :param kept_marker_fp: Path to file with kept markers
    '''
    in_metadata = load_from_json(in_json_fp)

    # load markers
    markers_to_remove = set(get_markers(marker_fp))
    print('%s markers to remove' % len(markers_to_remove))

    # keep merged_taxon
    out_metadata = {
        'markers': {},
        'taxonomy': {},
        'merged_taxon': in_metadata['merged_taxon']
    }

    # parse markers and keep the ones not in the removal list
    removed_markers = []
    kept_markers = []
    taxons_to_keep = set()
    gca_to_keep = set()
    for m, v in in_metadata['markers'].items():
        if m not in markers_to_remove:
            out_metadata['markers'][m] = v
            kept_markers.append(m)
            taxons_to_keep.add(v['taxon'])
            gca_to_keep.update(v['ext'])
        else:
            removed_markers.append(m)
    print('%s removed markers' % len(removed_markers))

    # report markers that were not found
    check_not_found_markers(removed_markers, markers_to_remove)

    # keep only taxonomy entries in taxons_to_keep or with a GCA in gca_to_keep
    out_metadata['taxonomy'] = prune_taxonomy(in_metadata['taxonomy'], taxons_to_keep, gca_to_keep)

    # save to JSON
    dump_to_json(out_metadata, out_json_fp)

    # write list of kept markers
    with open(kept_marker_fp, 'w') as kept_marker_f:
        for m in kept_markers:
            kept_marker_f.write("%s\n" % m)

def keep_markers(in_json_fp, marker_fp, out_json_fp):
    '''
    Keep markers from JSON file, others will be removed

    :param in_json_fp: Path to input JSON file
    :param marker_fp: Path to file with markers to keep (1 per line)
    :param out_json_fp: Path to output JSON file
    '''
    in_metadata = load_from_json(in_json_fp)

    # load markers
    markers_to_keep = set(get_markers(marker_fp))
    print('%s markers to keep' % len(markers_to_keep))

    # keep merged_taxon
    out_metadata = {
        'markers': {},
        'taxonomy': {},
        'merged_taxon': in_metadata['merged_taxon']
    }

    # parse markers to keep
    kept_markers = []
    taxons_to_keep = set()
    gca_to_keep = set()
    for m, v in in_metadata['markers'].items():
        if m in markers_to_keep:
            out_metadata['markers'][m] = v
            kept_markers.append(m)
            taxons_to_keep.add(v['taxon'])
            gca_to_keep.update(v['ext'])
    print('%s kept markers' % len(kept_markers))

    # check markers that are not found
    check_not_found_markers(kept_markers, markers_to_keep)

    # keep only taxonomy in taxons_to_keep or with GCA in gca_to_keep
    out_metadata['taxonomy'] = prune_taxonomy(in_metadata['taxonomy'], taxons_to_keep, gca_to_keep)

    # save to JSON
    dump_to_json(out_metadata, out_json_fp)

if __name__ == '__main__':
    # Read command line
    parser = argparse.ArgumentParser(description='Customize MetaPhlan database')
    subparsers = parser.add_subparsers(dest='function')
    # transform_pkl_to_json subcommand
    pkl_to_json_parser = subparsers.add_parser('transform_pkl_to_json', help='Transform Pickle to JSON to get marker metadata')
    pkl_to_json_parser.add_argument('--pkl', help="Path to input Pickle file")
    pkl_to_json_parser.add_argument('--json', help="Path to output JSON file")
    # transform_json_to_pkl subcommand
    json_to_pkl_parser = subparsers.add_parser('transform_json_to_pkl', help='Transform JSON to Pickle to push marker metadata')
    json_to_pkl_parser.add_argument('--json', help="Path to input JSON file")
    json_to_pkl_parser.add_argument('--pkl', help="Path to output Pickle file")
    # add_marker subcommand
    add_marker_parser = subparsers.add_parser('add_marker', help='Add new marker to JSON file')
    add_marker_parser.add_argument('--in_json', help="Path to input JSON file")
    add_marker_parser.add_argument('--out_json', help="Path to output JSON file")
    add_marker_parser.add_argument('--name', help="Name of new marker")
    add_marker_parser.add_argument('--m_length', help="Length of new marker")
    add_marker_parser.add_argument('--g_length', help="Length of genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--gca', help="GCA of genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--k_name', help="Name of Kingdom for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--k_id', help="NCBI id of Kingdom for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--p_name', help="Name of Phylum for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--p_id', help="NCBI id of Phylum for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--c_name', help="Name of Class for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--c_id', help="NCBI id of Class for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--o_name', help="Name of Order for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--o_id', help="NCBI id of Order for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--f_name', help="Name of Family for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--f_id', help="NCBI id of Family for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--g_name', help="Name of Genus for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--g_id', help="NCBI id of Genus for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--s_name', help="Name of Species for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--s_id', help="NCBI id of Species for genome from which the new marker has been extracted", action="append")
    add_marker_parser.add_argument('--t_name', help="Name of Strain for genome from which the new marker has been extracted", action="append")
    # remove_markers subcommand
    remove_markers_parser = subparsers.add_parser('remove_markers', help='Remove markers from JSON file')
    remove_markers_parser.add_argument('--in_json', help="Path to input JSON file")
    remove_markers_parser.add_argument('--markers', help="Path to file with markers to remove (1 per line)")
    remove_markers_parser.add_argument('--out_json', help="Path to output JSON file")
    remove_markers_parser.add_argument('--kept_markers', help="Path to file with kept markers")
    # keep_markers subcommand
    keep_markers_parser = subparsers.add_parser('keep_markers', help='Keep markers from JSON file, others will be removed')
    keep_markers_parser.add_argument('--in_json', help="Path to input JSON file")
    keep_markers_parser.add_argument('--markers', help="Path to file with markers to keep (1 per line)")
    keep_markers_parser.add_argument('--out_json', help="Path to output JSON file")

    args = parser.parse_args()

    if args.function == 'transform_pkl_to_json':
        validate_map_version(Path(args.pkl), 'pkl')
        transform_pkl_to_json(Path(args.pkl), Path(args.json))
    elif args.function == 'transform_json_to_pkl':
        validate_map_version(Path(args.json), 'json')
        transform_json_to_pkl(Path(args.json), Path(args.pkl))
    elif args.function == 'add_marker':
        add_marker(
            args.in_json,
            args.out_json,
            args.name,
            args.m_length,
            args.g_length,
            args.gca,
            args.k_name,
            args.k_id,
            args.p_name,
            args.p_id,
            args.c_name,
            args.c_id,
            args.o_name,
            args.o_id,
            args.f_name,
            args.f_id,
            args.g_name,
            args.g_id,
            args.s_name,
            args.s_id,
            args.t_name)
    elif args.function == 'remove_markers':
        remove_markers(args.in_json, args.markers, args.out_json, args.kept_markers)
    elif args.function == 'keep_markers':
        keep_markers(args.in_json, args.markers, args.out_json)
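
# Example invocations of the subcommands defined above (file names are
# placeholders, shown for illustration only):
#
#   python customizemetadata.py transform_pkl_to_json --pkl mpa_db.pkl --json mpa_db.json
#   python customizemetadata.py remove_markers --in_json mpa_db.json --markers markers_to_remove.txt \
#       --out_json pruned_db.json --kept_markers kept_markers.txt
#   python customizemetadata.py transform_json_to_pkl --json pruned_db.json --pkl custom_db.pkl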