data_manager/data_manager_semibin.py @ 0:1e4dd26db773 (draft, default, tip)

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_semibin commit aa9bfb2fb62547ee8bac34f0de5b3beaa0bfd1a4"

author:    bgruening
date:      Fri, 14 Oct 2022 21:29:47 +0000
parents:   (none)
children:  (none)
#!/usr/bin/env python
#
# Data manager for reference data for the SemiBin Galaxy tools
import argparse
import json
import subprocess
from datetime import date
from pathlib import Path


# Utility functions for interacting with Galaxy JSON
def read_input_json(json_fp):
    """Read the JSON supplied from the data manager tool

    Returns a tuple (param_dict, extra_files_path)

    'param_dict' is an arbitrary dictionary of parameters
    input into the tool; 'extra_files_path' is the path
    to a directory where output files must be put for the
    receiving data manager to pick them up.

    NB the directory pointed to by 'extra_files_path'
    doesn't exist initially; it is the job of the script
    to create it if necessary.

    """
    with open(json_fp) as fh:
        params = json.load(fh)
    return (params['param_dict'],
            Path(params['output_data'][0]['extra_files_path']))

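# A minimal sketch (illustrative values only; real files contain more keys) of
# the parts of the Galaxy data manager JSON that read_input_json() uses:
#
#   {
#       "param_dict": {...},
#       "output_data": [
#           {"extra_files_path": "/path/to/dataset_files"}
#       ]
#   }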

# Utility functions for creating data table dictionaries
#
# Example usage:
# >>> d = create_data_tables_dict()
# >>> add_data_table(d, 'my_data')
# >>> add_data_table_entry(d, 'my_data', dict(dbkey='hg19', value='human'))
# >>> add_data_table_entry(d, 'my_data', dict(dbkey='mm9', value='mouse'))
# >>> print(json.dumps(d))
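#
# The calls above would print JSON along these lines (an illustrative sketch,
# wrapped here for readability, not captured output):
# {"data_tables": {"my_data": [{"dbkey": "hg19", "value": "human"},
#                              {"dbkey": "mm9", "value": "mouse"}]}}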
def create_data_tables_dict():
    """Return a dictionary for storing data table information

    Returns a dictionary that can be used with 'add_data_table'
    and 'add_data_table_entry' to store information about a
    data table. It can be converted to JSON to be sent back to
    the data manager.

    """
    d = {
        'data_tables': {}
    }
    return d


def add_data_table(d, table):
    """Add a data table to the data tables dictionary

    Creates a placeholder for a data table called 'table'.

    """
    d['data_tables'][table] = []


def add_data_table_entry(d, table, entry):
    """Add an entry to a data table

    Appends an entry to the data table 'table'. 'entry'
    should be a dictionary where the keys are the names of
    columns in the data table.

    Raises an exception if the named data table doesn't
    exist.

    """
    try:
        d['data_tables'][table].append(entry)
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)


def download_gtdb(data_tables, table_name, target_dp, test=False):
    """Download GTDB

    Downloads the GTDB reference database used by SemiBin into
    'target_dp' and adds a reference to it in the named data table.

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      table_name: name of the table
      target_dp: directory to put copy or link to the data file
      test: if True, skip the download and register a dummy entry

    """
    db_dp = target_dp
    if not test:
        cmd = "SemiBin download_GTDB --reference-db-data-dir %s" % (db_dp)
        subprocess.check_call(cmd, shell=True)
        dbkey = 'gtdb'
        name = "GTDB reference genome generated by MMseqs2 used in SemiBin"
    else:
        dbkey = 'test'
        name = "Test"
        empty_fp = db_dp / Path("empty")
        empty_fp.touch()
    add_data_table_entry(
        data_tables,
        table_name,
        dict(
            dbkey=dbkey,
            value='%s' % (date.today().strftime("%d%m%Y")),
            name=name,
            path=str(db_dp)))

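# A non-test run registers an entry along these lines in the 'gtdb' data table
# (values are illustrative; 'value' is the download date in DDMMYYYY form):
#   {"dbkey": "gtdb", "value": "14102022",
#    "name": "GTDB reference genome generated by MMseqs2 used in SemiBin",
#    "path": "/path/to/extra_files_path"}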

if __name__ == "__main__":
    print("Starting...")

    # Read command line
    parser = argparse.ArgumentParser(description='Download reference genomes (GTDB)')
    parser.add_argument('--json', help="Path to JSON file")
    parser.add_argument('--test', action='store_true', help="Test")
    args = parser.parse_args()
    print("args: %s" % args)

    # Read the input JSON
    json_fp = Path(args.json)
    params, target_dp = read_input_json(json_fp)

    # Make the target directory
    print("Making %s" % target_dp)
    target_dp.mkdir(parents=True, exist_ok=True)

    # Set up data tables dictionary
    data_tables = create_data_tables_dict()
    add_data_table(data_tables, "gtdb")

    # Fetch data from specified data sources
    print("Download and build database")
    download_gtdb(
        data_tables,
        "gtdb",
        target_dp,
        args.test)

    # Write output JSON
    print("Outputting JSON")
    with open(json_fp, 'w') as fh:
        json.dump(data_tables, fh, sort_keys=True)
    print("Done.")
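
# Example manual invocation (a sketch for local testing; the JSON path is
# hypothetical and must point to a Galaxy data manager JSON like the one
# sketched above next to read_input_json):
#   python data_manager_semibin.py --json /tmp/data_manager_input.json --test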