Mercurial > repos > iuc > data_manager_gtdbtk_database_installer
comparison data_manager/gtdbtk_database_installer.py @ 4:10232d2b5062 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit 0bb620025a61de6caa8a93537fec8ea65eda43be
| author | iuc |
|---|---|
| date | Fri, 16 Aug 2024 08:44:14 +0000 |
| parents | c4830a9870fa |
| children | df84aaed4769 |
comparison legend:
equal
deleted
inserted
replaced
| 3:c4830a9870fa | 4:10232d2b5062 |
|---|---|
| 7 import shutil | 7 import shutil |
| 8 import sys | 8 import sys |
| 9 import tarfile | 9 import tarfile |
| 10 from datetime import datetime | 10 from datetime import datetime |
| 11 from urllib.parse import urlparse | 11 from urllib.parse import urlparse |
| 12 from urllib.request import Request, urlopen | 12 from urllib.request import HTTPError, Request, urlopen |
| 13 | 13 |
| 14 # rather provide the urls based on the release, less error potential for the admins ! | 14 # rather provide the urls based on the release, less error potential for the admins ! |
| 15 urls = { | 15 urls = { |
| 16 "202": { | 16 "202": { |
| 17 "full": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz", | 17 "full": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz", |
| 31 "220": { | 31 "220": { |
| 32 "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz", | 32 "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz", |
| 33 "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz", | 33 "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz", |
| 34 "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz", | 34 "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz", |
| 35 }, | 35 }, |
| 36 "test": { # using VERSION to check if files are there | |
| 37 "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt", | |
| 38 "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz", | |
| 39 "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz", | |
| 40 }, | |
| 41 } | 36 } |
| 42 | 37 |
| 43 | 38 |
| 44 def url_download(url, target_directory): | 39 def is_urlfile(url): |
| 40 # Check if online file exists | |
| 41 try: | |
| 42 r = urlopen(url) # response | |
| 43 return r.getcode() < 400 | |
| 44 except HTTPError: | |
| 45 return False | |
| 46 | |
| 47 | |
| 48 def url_download(url, target_directory, meta): | |
| 49 | |
| 50 # download the url | |
| 45 url_parts = urlparse(url) | 51 url_parts = urlparse(url) |
| 46 tarball = os.path.abspath( | 52 tarball = os.path.abspath( |
| 47 os.path.join(target_directory, os.path.basename(url_parts.path)) | 53 os.path.join(target_directory, os.path.basename(url_parts.path)) |
| 48 ) | 54 ) |
| 49 src = None | 55 src = None |
| 61 except Exception as e: | 67 except Exception as e: |
| 62 sys.exit(str(e)) | 68 sys.exit(str(e)) |
| 63 finally: | 69 finally: |
| 64 if src is not None: | 70 if src is not None: |
| 65 src.close() | 71 src.close() |
| 66 if tarfile.is_tarfile(tarball): | 72 |
| 67 fh = tarfile.open(tarball, "r:*") | 73 # extract the metadata |
| 68 else: | 74 if meta: |
| 69 # unzip metadata file | 75 # extract the content of *.tar.gz into the target dir |
| 70 if ".gz" in tarball: | 76 if tarfile.is_tarfile(tarball): |
| 77 fh = tarfile.open(tarball, "r:*") | |
| 78 fh.extractall(target_directory) | |
| 79 fh.close() | |
| 80 os.remove(tarball) | |
| 81 return target_directory # return path to output folder | |
| 82 # extract the content of *.gz into the target dir | |
| 83 elif ".gz" in tarball: | |
| 71 with gzip.open(tarball, "rb") as f_in: | 84 with gzip.open(tarball, "rb") as f_in: |
| 72 unzipped_file = tarball.strip(".gz") | 85 unzipped_file = tarball.strip(".gz") |
| 73 with open(unzipped_file, "wb") as f_out: | 86 with open(unzipped_file, "wb") as f_out: |
| 74 shutil.copyfileobj(f_in, f_out) | 87 shutil.copyfileobj(f_in, f_out) |
| 75 os.remove(tarball) | 88 os.remove(tarball) |
| 76 folder_of_unzipped_file = os.path.dirname(unzipped_file) | 89 folder_of_unzipped_file = os.path.dirname(unzipped_file) |
| 77 return folder_of_unzipped_file | 90 return folder_of_unzipped_file |
| 78 else: | 91 else: |
| 79 # this is basically only the return for the test not using a tarfile | 92 sys.exit( |
| 93 "No correct input format for metadata file, must be .tar.gz or .gz" | |
| 94 ) | |
| 95 else: | |
| 96 # handle the DB | |
| 97 # extract the content of the folder in the tar.gz into the target dir | |
| 98 if tarfile.is_tarfile(tarball): | |
| 99 fh = tarfile.open(tarball, "r:*") | |
| 100 fh.extractall(target_directory) | |
| 101 fh.close() | |
| 102 os.remove(tarball) | |
| 103 else: | |
| 104 # handle the test case for the DB | |
| 80 return tarball | 105 return tarball |
| 81 fh.extractall(target_directory) | 106 |
| 82 fh.close() | 107 fh.extractall(target_directory) |
| 83 os.remove(tarball) | 108 fh.close() |
| 84 # The tarball extraction will create a directory named | 109 os.remove(tarball) |
| 85 # something like release202 in the target_directory, so | 110 # The tarball extraction will create a directory named |
| 86 # we need to move the items in that directory to the | 111 # something like release202 in the target_directory, so |
| 87 # target directory. | 112 # we need to move the items in that directory to the |
| 88 subdir = next(os.walk(target_directory))[1][0] | 113 # target directory. |
| 89 subdir_path = os.path.join(target_directory, subdir) | 114 subdir = next(os.walk(target_directory))[1][0] |
| 90 items = os.listdir(subdir_path) | 115 subdir_path = os.path.join(target_directory, subdir) |
| 91 for item in items: | 116 items = os.listdir(subdir_path) |
| 92 item_path = os.path.join(subdir_path, item) | 117 for item in items: |
| 93 shutil.move(item_path, target_directory) | 118 item_path = os.path.join(subdir_path, item) |
| 94 os.rmdir(subdir_path) | 119 shutil.move(item_path, target_directory) |
| 95 return target_directory | 120 os.rmdir(subdir_path) |
| 121 return target_directory | |
| 96 | 122 |
| 97 | 123 |
| 98 def download(database_name, release, meta, test, out_file): | 124 def download(database_name, release, meta, test, out_file): |
| 99 | 125 |
| 100 with open(out_file) as fh: | 126 with open(out_file) as fh: |
| 102 | 128 |
| 103 target_directory = params["output_data"][0]["extra_files_path"] | 129 target_directory = params["output_data"][0]["extra_files_path"] |
| 104 os.makedirs(target_directory) | 130 os.makedirs(target_directory) |
| 105 | 131 |
| 106 if test: | 132 if test: |
| 107 release = "test" | 133 # switch the DB to use the test case |
| 134 urls[release][ | |
| 135 "full" | |
| 136 ] = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt" | |
| 137 | |
| 138 # make use of the test to check if all urls exists | |
| 139 for _version, items in urls.items(): | |
| 140 for url in items.values(): | |
| 141 assert is_urlfile(url) | |
| 108 | 142 |
| 109 # download both taxonomy metadata tables | 143 # download both taxonomy metadata tables |
| 110 if meta: | 144 if meta: |
| 111 url = urls[release]["meta_ar"] | 145 url = urls[release]["meta_ar"] |
| 112 file_path = url_download(url, target_directory) | 146 file_path = url_download(url, target_directory, meta) |
| 113 url = urls[release]["meta_bac"] | 147 url = urls[release]["meta_bac"] |
| 114 file_path = url_download(url, target_directory) | 148 file_path = url_download(url, target_directory, meta) |
| 115 # download the full DB | 149 # download the full DB |
| 116 else: | 150 else: |
| 117 url = urls[release]["full"] | 151 url = urls[release]["full"] |
| 118 file_path = url_download(url, target_directory) | 152 file_path = url_download(url, target_directory, meta) |
| 119 | 153 |
| 120 time = datetime.utcnow().strftime("%Y-%m-%d") | 154 time = datetime.utcnow().strftime("%Y-%m-%d") |
| 121 | 155 |
| 122 data_manager_json = {"data_tables": {}} | 156 data_manager_json = {"data_tables": {}} |
| 123 data_manager_entry = {} | 157 data_manager_entry = {} |
