Mercurial > repos > iuc > data_manager_gtdbtk_database_installer
comparison data_manager/gtdbtk_database_installer.py @ 2:6ab422fba1a3 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit ad14947c3e13babe90a6878b45608fe56a16150d
| author | iuc |
|---|---|
| date | Tue, 13 Aug 2024 21:13:43 +0000 |
| parents | 629464b96c2e |
| children | c4830a9870fa |
comparison legend: equal | deleted | inserted | replaced
| 1:2814c058a087 | 2:6ab422fba1a3 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 |
| 3 import argparse | 3 import argparse |
| 4 import gzip | |
| 4 import json | 5 import json |
| 5 import os | 6 import os |
| 6 import shutil | 7 import shutil |
| 7 import sys | 8 import sys |
| 8 import tarfile | 9 import tarfile |
| 10 from datetime import datetime | |
| 9 from urllib.parse import urlparse | 11 from urllib.parse import urlparse |
| 10 from urllib.request import Request | 12 from urllib.request import Request, urlopen |
| 11 from urllib.request import urlopen | 13 |
| 14 # rather provide the urls based on the release, less error potential for the admins ! | |
| 15 urls = { | |
| 16 "202": { | |
| 17 "full": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz", | |
| 18 "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/ar122_taxonomy_r202.tsv.gz", | |
| 19 "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/bac120_taxonomy_r202.tsv.gz", | |
| 20 }, | |
| 21 "207": { | |
| 22 "full": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/auxillary_files/gtdbtk_r207_data.tar.gz", | |
| 23 "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz", | |
| 24 "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz", | |
| 25 }, | |
| 26 "214": { | |
| 27 "full": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/auxillary_files/gtdbtk_r214_data.tar.gz", | |
| 28 "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/ar53_taxonomy_r214.tsv.gz", | |
| 29 "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/bac120_taxonomy_r214.tsv.gz", | |
| 30 }, | |
| 31 "220": { | |
| 32 "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz", | |
| 33 "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_taxonomy_r220.tsv.gz", | |
| 34 "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_taxonomy_r220.tsv.gz", | |
| 35 }, | |
| 36 "test": { # using VERSION to check if files are there | |
| 37 "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt", | |
| 38 "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_taxonomy_r220.tsv.gz", | |
| 39 "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_taxonomy_r220.tsv.gz", | |
| 40 }, | |
| 41 } | |
| 12 | 42 |
| 13 | 43 |
| 14 def url_download(url, target_directory): | 44 def url_download(url, target_directory): |
| 15 url_parts = urlparse(url) | 45 url_parts = urlparse(url) |
| 16 tarball = os.path.abspath(os.path.join(target_directory, os.path.basename(url_parts.path))) | 46 tarball = os.path.abspath( |
| 47 os.path.join(target_directory, os.path.basename(url_parts.path)) | |
| 48 ) | |
| 17 src = None | 49 src = None |
| 18 dst = None | 50 dst = None |
| 19 try: | 51 try: |
| 20 req = Request(url) | 52 req = Request(url) |
| 21 src = urlopen(req) | 53 src = urlopen(req) |
| 22 with open(tarball, 'wb') as dst: | 54 with open(tarball, "wb") as dst: |
| 23 while True: | 55 while True: |
| 24 chunk = src.read(2**10) | 56 chunk = src.read(2**10) |
| 25 if chunk: | 57 if chunk: |
| 26 dst.write(chunk) | 58 dst.write(chunk) |
| 27 else: | 59 else: |
| 30 sys.exit(str(e)) | 62 sys.exit(str(e)) |
| 31 finally: | 63 finally: |
| 32 if src is not None: | 64 if src is not None: |
| 33 src.close() | 65 src.close() |
| 34 if tarfile.is_tarfile(tarball): | 66 if tarfile.is_tarfile(tarball): |
| 35 fh = tarfile.open(tarball, 'r:*') | 67 fh = tarfile.open(tarball, "r:*") |
| 36 else: | 68 else: |
| 37 return tarball | 69 # unzip metadata file |
| 70 if ".gz" in tarball: | |
| 71 with gzip.open(tarball, "rb") as f_in: | |
| 72 unzipped_file = tarball.strip(".gz") | |
| 73 with open(unzipped_file, "wb") as f_out: | |
| 74 shutil.copyfileobj(f_in, f_out) | |
| 75 os.remove(tarball) | |
| 76 folder_of_unzipped_file = os.path.dirname(unzipped_file) | |
| 77 return folder_of_unzipped_file | |
| 78 else: | |
| 79 # this is basically only the return for the test not using a tarfile | |
| 80 return tarball | |
| 38 fh.extractall(target_directory) | 81 fh.extractall(target_directory) |
| 39 fh.close() | 82 fh.close() |
| 40 os.remove(tarball) | 83 os.remove(tarball) |
| 41 # The tarball extraction will create a directory named | 84 # The tarball extraction will create a directory named |
| 42 # something like release202 in the target_directory, so | 85 # something like release202 in the target_directory, so |
| 50 shutil.move(item_path, target_directory) | 93 shutil.move(item_path, target_directory) |
| 51 os.rmdir(subdir_path) | 94 os.rmdir(subdir_path) |
| 52 return target_directory | 95 return target_directory |
| 53 | 96 |
| 54 | 97 |
| 55 def download(database_id, database_name, url, out_file): | 98 def download(database_name, release, meta, test, out_file): |
| 56 | 99 |
| 57 with open(out_file) as fh: | 100 with open(out_file) as fh: |
| 58 params = json.load(fh) | 101 params = json.load(fh) |
| 59 | 102 |
| 60 target_directory = params['output_data'][0]['extra_files_path'] | 103 target_directory = params["output_data"][0]["extra_files_path"] |
| 61 os.makedirs(target_directory) | 104 os.makedirs(target_directory) |
| 62 file_path = url_download(url, target_directory) | 105 |
| 106 if test: | |
| 107 release = "test" | |
| 108 | |
| 109 # download both taxonomy metadata tables | |
| 110 if meta: | |
| 111 url = urls[release]["meta_ar"] | |
| 112 file_path = url_download(url, target_directory) | |
| 113 url = urls[release]["meta_bac"] | |
| 114 file_path = url_download(url, target_directory) | |
| 115 # download the full DB | |
| 116 else: | |
| 117 url = urls[release]["full"] | |
| 118 file_path = url_download(url, target_directory) | |
| 119 | |
| 120 time = datetime.utcnow().strftime("%Y-%m-%d") | |
| 63 | 121 |
| 64 data_manager_json = {"data_tables": {}} | 122 data_manager_json = {"data_tables": {}} |
| 65 data_manager_entry = {} | 123 data_manager_entry = {} |
| 66 data_manager_entry['value'] = database_id | 124 data_manager_entry["value"] = f"{database_name}_release_{release}_downloaded_{time}" |
| 67 data_manager_entry['name'] = database_name | 125 data_manager_entry["name"] = database_name |
| 68 data_manager_entry['path'] = file_path | 126 data_manager_entry["path"] = file_path |
| 69 data_manager_json["data_tables"]["gtdbtk_database"] = data_manager_entry | 127 data_manager_entry["version"] = release |
| 70 | 128 |
| 71 with open(out_file, 'w') as fh: | 129 # store in dedicated metadata table |
| 130 if meta: | |
| 131 data_manager_json["data_tables"][ | |
| 132 "gtdbtk_database_metadata_versioned" | |
| 133 ] = data_manager_entry | |
| 134 else: | |
| 135 data_manager_json["data_tables"][ | |
| 136 "gtdbtk_database_versioned" | |
| 137 ] = data_manager_entry | |
| 138 | |
| 139 with open(out_file, "w") as fh: | |
| 72 json.dump(data_manager_json, fh, sort_keys=True) | 140 json.dump(data_manager_json, fh, sort_keys=True) |
| 73 | 141 |
| 74 | 142 |
| 75 parser = argparse.ArgumentParser() | 143 parser = argparse.ArgumentParser() |
| 76 | 144 |
| 77 parser.add_argument('--database_name', dest='database_name', help='GTDB-Tk database display name') | 145 parser.add_argument( |
| 78 parser.add_argument('--database_id', dest='database_id', help='Unique GTDB-Tk database id') | 146 "--database_name", dest="database_name", help="GTDB-Tk database display name" |
| 79 parser.add_argument('--url', dest='url', help='URL to download GTDB-Tk databse version') | 147 ) |
| 80 parser.add_argument('--out_file', dest='out_file', help='JSON output file') | 148 |
| 149 parser.add_argument("--version", dest="version", help="DB version") | |
| 150 | |
| 151 parser.add_argument( | |
| 152 "--release", dest="release", help="Release of the GTDB-Tk database version" | |
| 153 ) | |
| 154 parser.add_argument("--out_file", dest="out_file", help="JSON output file") | |
| 155 parser.add_argument( | |
| 156 "--meta", | |
| 157 dest="meta", | |
| 158 action="store_true", | |
| 159 help="Store meta data flag", | |
| 160 ) | |
| 161 | |
| 162 parser.add_argument( | |
| 163 "--test", | |
| 164 dest="test", | |
| 165 action="store_true", | |
| 166 help="Run test", | |
| 167 ) | |
| 81 | 168 |
| 82 args = parser.parse_args() | 169 args = parser.parse_args() |
| 83 | 170 |
| 84 download(args.database_id, args.database_name, args.url, args.out_file) | 171 download( |
| 172 args.database_name, | |
| 173 args.release, | |
| 174 args.meta, | |
| 175 args.test, | |
| 176 args.out_file, | |
| 177 ) |
