Mercurial > repos > iuc > data_manager_gtdbtk_database_installer
comparison data_manager/gtdbtk_database_installer.py @ 8:750d902de22c draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit 1019bf0fda897582e2bbdc773aebb3e08e285aae
| author | iuc |
|---|---|
| date | Mon, 21 Oct 2024 15:49:53 +0000 |
| parents | 3b1d503c6260 |
| children | 3248e43703e7 |
comparison
equal
deleted
inserted
replaced
| 7:3b1d503c6260 | 8:750d902de22c |
|---|---|
| 43 return r.getcode() < 400 | 43 return r.getcode() < 400 |
| 44 except HTTPError: | 44 except HTTPError: |
| 45 return False | 45 return False |
| 46 | 46 |
| 47 | 47 |
def _is_within_directory(root, path):
    """Return True if *path*, with symlinks resolved, is *root* or lies below it.

    *root* must already be an absolute, symlink-resolved path.
    """
    try:
        return os.path.commonpath([root, os.path.realpath(path)]) == root
    except ValueError:
        # commonpath() refuses to compare e.g. paths on different Windows
        # drives; such a path is by definition not inside *root*.
        return False


def extract_tar_iteratively(tarball, target_directory):
    """
    Extract a .tar, .tar.gz, or .tar.bz2 archive member-by-member.

    The archive is walked one member at a time and regular files are streamed
    to disk in 1 MB chunks, so no file is ever held in memory as a whole.

    Args:
        tarball (str): Path to the tar archive (e.g., .tar, .tar.gz, .tar.bz2).
        target_directory (str): Destination directory for the archive content.
            It is created on demand if it does not exist yet.

    Raises:
        OSError: If a file, directory or link cannot be created or written.
        tarfile.TarError: If the archive cannot be opened or read, or (as
            tarfile.ExtractError) if a member would be written outside
            ``target_directory``.

    Example Usage:
        extract_tar_iteratively("archive.tar.gz", "/path/to/extract")

    Notes:
        - Directories, regular files, symbolic links and hard links are
          extracted; other member types (devices, FIFOs, ...) are skipped.
        - Parent directories are created on demand, so archives that lack
          explicit directory entries (or list them late) still extract.
        - Hard-link targets are resolved relative to ``target_directory``,
          because tar stores them relative to the archive root.
        - File modes, ownership and timestamps are not restored.
        - The function returns nothing; it only populates ``target_directory``.
    """
    # Resolve the destination once; every member is validated against it.
    destination_root = os.path.realpath(target_directory)

    with tarfile.open(tarball, "r:*") as fh:
        for member in fh:
            if not (
                member.isdir()
                or member.isfile()
                or member.issym()
                or member.islnk()
            ):
                # Unsupported member type: skip it.
                continue

            # Full path to where the member should be extracted
            member_path = os.path.join(target_directory, member.name)

            # Reject "../", absolute names and paths that run through a
            # previously extracted symlink pointing outside the destination.
            if not _is_within_directory(destination_root, member_path):
                raise tarfile.ExtractError(
                    "Refusing to extract {!r}: path escapes {!r}".format(
                        member.name, target_directory
                    )
                )

            if member.isdir():
                os.makedirs(member_path, exist_ok=True)
                continue

            # The archive is not guaranteed to contain (or to list first) an
            # entry for the parent directory, so create it on demand.
            parent_directory = os.path.dirname(member_path)
            if parent_directory:
                os.makedirs(parent_directory, exist_ok=True)

            if member.isfile():
                # Stream the file in chunks to avoid memory spikes
                with fh.extractfile(member) as source, open(
                    member_path, "wb"
                ) as target:
                    shutil.copyfileobj(
                        source, target, length=1024 * 1024
                    )  # 1 MB chunks
            elif member.issym():
                # Symlink targets are stored verbatim (relative to the link).
                os.symlink(member.linkname, member_path)
            else:
                # Hard link: linkname is relative to the archive root, i.e.
                # to target_directory, not to the current working directory.
                link_source = os.path.join(target_directory, member.linkname)
                if not _is_within_directory(destination_root, link_source):
                    raise tarfile.ExtractError(
                        "Refusing to link {!r} to {!r}: path escapes {!r}".format(
                            member.name, member.linkname, target_directory
                        )
                    )
                os.link(link_source, member_path)
| 100 | |
| 101 | |
| 48 def url_download(url, target_directory, meta): | 102 def url_download(url, target_directory, meta): |
| 49 | 103 |
| 50 # download the url | 104 # download the url |
| 51 url_parts = urlparse(url) | 105 url_parts = urlparse(url) |
| 52 tarball = os.path.abspath( | 106 tarball = os.path.abspath( |
| 57 try: | 111 try: |
| 58 req = Request(url) | 112 req = Request(url) |
| 59 src = urlopen(req) | 113 src = urlopen(req) |
| 60 with open(tarball, "wb") as dst: | 114 with open(tarball, "wb") as dst: |
| 61 while True: | 115 while True: |
| 62 chunk = src.read(2**10) | 116 chunk = src.read(2**16) # Read in 64 KB chunks instead of 1 KB |
| 63 if chunk: | 117 if chunk: |
| 64 dst.write(chunk) | 118 dst.write(chunk) |
| 65 else: | 119 else: |
| 66 break | 120 break |
| 67 except Exception as e: | 121 except Exception as e: |
| 72 | 126 |
| 73 # extract the metadata | 127 # extract the metadata |
| 74 if meta: | 128 if meta: |
| 75 # extract the content of *.tar.gz into the target dir | 129 # extract the content of *.tar.gz into the target dir |
| 76 if tarfile.is_tarfile(tarball): | 130 if tarfile.is_tarfile(tarball): |
| 77 fh = tarfile.open(tarball, "r:*") | 131 extract_tar_iteratively(tarball, target_directory) |
| 78 fh.extractall(target_directory) | |
| 79 fh.close() | |
| 80 os.remove(tarball) | 132 os.remove(tarball) |
| 81 return target_directory # return path to output folder | 133 return target_directory # return path to output folder |
| 82 # extract the content of *.gz into the target dir | 134 # extract the content of *.gz into the target dir |
| 83 elif ".gz" in tarball: | 135 elif ".gz" in tarball: |
| 84 with gzip.open(tarball, "rb") as f_in: | 136 with gzip.open(tarball, "rb") as f_in: |
| 94 ) | 146 ) |
| 95 else: | 147 else: |
| 96 # handle the DB | 148 # handle the DB |
| 97 # extract the content of the folder in the tar.gz into the target dir | 149 # extract the content of the folder in the tar.gz into the target dir |
| 98 if tarfile.is_tarfile(tarball): | 150 if tarfile.is_tarfile(tarball): |
| 99 fh = tarfile.open(tarball, "r:*") | 151 extract_tar_iteratively(tarball, target_directory) |
| 100 fh.extractall(target_directory) | |
| 101 fh.close() | |
| 102 os.remove(tarball) | 152 os.remove(tarball) |
| 103 else: | 153 else: |
| 104 # handle the test case for the DB | 154 # handle the test case for the DB |
| 105 return tarball | 155 return tarball |
| 106 | 156 |
