comparison data_manager/FROGS_data_manager.py @ 0:7403d6c4f510 draft default tip
"planemo upload for repository https://github.com/geraldinepascal/FROGS-wrappers/ commit 2024a13846ea6f9bd94ae62e3b2a5a3aba8cd304"
| author | frogs |
|---|---|
| date | Mon, 23 Aug 2021 10:21:10 +0000 |
| parents | |
| children | |
| previous revision | current revision |
|---|---|
| -1:000000000000 (none; new file) | 0:7403d6c4f510 |
```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (C) 2021 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

__author__ = 'David Christiany Migale Jouy en Josas / Maria Bernard - Sigenae Jouy en Josas'
__copyright__ = 'Copyright (C) 2020 INRAE'
__license__ = 'GNU General Public License'
__version__ = '3.2.3'
__email__ = 'frogs-support@inrae.fr'
__status__ = 'prod'

# import json
import argparse
import os
# import sys
import tarfile
import time
import urllib.request

from galaxy.util.json import from_json_string, to_json_string

import requests

# Example invocation against a local Galaxy instance:
# GALAXY_database=~/galaxy/galaxy-20.09/database
# FROGS_data_manager.py --database=frogs_db_data --all_dbs=false \
#     --date=0 --amplicons=16S --bases=SILVA --filters=Pintail100 \
#     --only_last_versions=true \
#     --tool_data=/home/maria/galaxy/galaxy-20.09/tool-data \
#     --output $GALAXY_database/objects/e/7/7/dataset_e7766c39-8f36-450c-adf5-3e4ee8d5c562.dat

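# Command-line arguments, as passed by the Galaxy data manager tool XML (the
# mapping below is inferred from how the values are used in this script, not
# from the wrapper itself):
#   --database                        selects the data table to populate (only
#                                     frogs_db_data is active in this version)
#   --all_dbs / --only_last_versions  arrive as the strings "true"/"false",
#                                     not Python booleans
#   --date                            integer cut-off, apparently YYYYMMDD; 0 disables it
#   --amplicons / --bases / --filters comma-separated lists, matched case-insensitively
#   --tool_data                       Galaxy tool-data directory, used to skip banks
#                                     already present under tool-data/frogs_db
#   --output                          JSON file pre-filled by Galaxy and overwritten
#                                     with the new data table entries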
def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--database")
    parser.add_argument("--all_dbs")
    parser.add_argument("--date")
    parser.add_argument("--amplicons")
    parser.add_argument("--bases")
    parser.add_argument("--filters")
    parser.add_argument("--only_last_versions")
    parser.add_argument("--tool_data")
    parser.add_argument("-o", "--output")
    args = parser.parse_args()

    return args


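# Galaxy data managers report new entries through a JSON document of the form
# {"data_tables": {"<table name>": [{<column>: <value>, ...}, ...]}}; the helper
# below accumulates one such entry list per data table in data_manager_dict.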
def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
    data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
    data_manager_dict['data_tables'][data_table].append(data_table_entry)
    return data_manager_dict


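# Keep a single entry per (amplicon, base, filter) combination. The first match
# wins, which assumes the remote index lists the most recent release of each
# bank first (an assumption inferred from the code, not from the index itself).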
def keep_only_last_version(db_index):
    db_dict = dict()
    for line in db_index:
        db_type = "_".join(line[1:4]) if line[3] != "" else "_".join(line[1:3])
        if db_type not in db_dict:
            db_dict[db_type] = line
    return list(db_dict.values())


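# Column layout of FROGS_databases.tsv as used below (inferred from this code;
# the index file itself is not documented here):
#   line[0] date (YYYYMMDD)    line[1] amplicon(s), comma-separated
#   line[2] reference base     line[3] filter (may be empty)
#   line[4] version token      line[5] value (bank/directory name)
#   line[6] download link (.tar.gz)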
def frogs_sources(data_manager_dict, target_directory):

    # variables
    amplicons_list = []
    bases_list = []
    filters_list = []
    if args.all_dbs == "false":
        amplicons_list = [amplicon.lower().strip() for amplicon in args.amplicons.split(",") if amplicon != ""]
        bases_list = [base.lower().strip() for base in args.bases.split(",") if base != ""]
        filters_list = [filter.lower().strip() for filter in args.filters.split(",") if filter != ""]
        bottom_date = int(args.date)
    tool_data_path = args.tool_data

    # get frogs database index
    frogs_db_index_link = "http://genoweb.toulouse.inra.fr/frogs_databanks/assignation/FROGS_databases.tsv"
    with requests.Session() as s:
        download = s.get(frogs_db_index_link)
        decoded_content = download.content.decode('utf-8')
        db_index = decoded_content.splitlines()
        db_index = [line.split("\t") for line in db_index[1:]]
        db_index = [[line[0], line[1].lower(), line[2].lower(), line[3].lower()] + line[4:] for line in db_index]

    # filter databases
    if args.all_dbs == "false":
        # filter by amplicons
        if len(amplicons_list) != 0:
            db_index = [line for line in db_index if any([amplicon in amplicons_list for amplicon in line[1].split(',')])]
        # filter by base
        if len(bases_list) != 0:
            db_index = [line for line in db_index if line[2] in bases_list]
        # filter by filters
        if len(filters_list) != 0:
            db_index = [line for line in db_index if line[3] in filters_list]
        # filter by date
        if bottom_date != 0:
            db_index = [line for line in db_index if int(line[0]) >= bottom_date]
        if args.only_last_versions == "true":
            # keep only last version
            db_index = keep_only_last_version(db_index)

    # get frogs dbs
    os.chdir(target_directory)
    dir_name = "frogs_db_" + time.strftime("%Y%m%d")
    os.mkdir(dir_name)
    dbs = set([])
    for line in db_index:
        value = line[5]
        name = value.replace("_", " ") if "_" not in line[4] else value.replace(line[4], "").replace("_", " ") + line[4]
        link = line[6]
        name_dir = link.replace(".tar.gz", "").split("/")[-1]
        file_path = tool_data_path + "/frogs_db/" + name_dir
        if not os.path.exists(file_path):  # if the bank is not already in the frogs_db directory

            # download frogs db
            urllib.request.urlretrieve(link, "tmp.tar.gz")

            # unzip frogs db
            with tarfile.open("tmp.tar.gz") as tar:
                tar.extractall(dir_name)
            os.remove('tmp.tar.gz')

            # get fasta file path: the newly extracted directory is the one not
            # seen in the previous iteration
            tmp = set(os.listdir(dir_name))
            new_db = dir_name + "/" + "".join(tmp.difference(dbs))
            files = os.listdir(new_db)
            fasta = "".join([file for file in files if file.endswith('.fasta')])
            path = new_db + '/' + fasta
            dbs = set(os.listdir(dir_name))
            # release = value + "_" + time.strftime("%Y-%m-%d")
            # date = time.strftime("%Y%m%d")
            path = os.path.join(target_directory, path)

            data_table_entry = dict(name=name, value=value, path=path)
            _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_db")

# def HVL_sources(data_manager_dict, target_directory):
#     HVL_dir = "http://genoweb.toulouse.inra.fr/frogs_databanks/HVL/ITS/UNITE_s_7.1_20112016"
#     os.chdir(target_directory)
#     for link in [HVL_dir + "/Unite_s_7.1_20112016_ITS1.fasta", HVL_dir + "/Unite_s_7.1_20112016_ITS2.fasta"]:
#         file_name = link.split("/")[-1].replace('.fasta', "_" + time.strftime("%Y-%m-%d") + ".fasta")
#         dl_file = urllib.URLopener()
#         dl_file.retrieve(link, file_name)

#         # get fasta file path
#         path = os.path.join(target_directory, file_name)
#         if link.endswith('ITS1.fasta'):
#             name = "UNITE 7.1 ITS1 " + time.strftime("%Y-%m-%d")
#         elif link.endswith('ITS2.fasta'):
#             name = "UNITE 7.1 ITS2 " + time.strftime("%Y-%m-%d")
#         value = file_name.replace('.fasta', '')

#         data_table_entry = dict(name=name, value=value, path=path)
#         _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_HVL_db")


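# Data manager protocol: Galaxy pre-fills the --output file with job parameters
# (only 'output_data'[0]['extra_files_path'] is read here), the script stores
# the downloaded banks under that directory, then rewrites the same file with
# the "data_tables" entries declared above so Galaxy can update the frogs_db
# data table.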
def main():

    # get args from command line
    global args
    args = get_args()

    # Extract json file params
    data_manager_dict = {}
    filename = args.output
    with open(filename) as param_file:
        params = from_json_string(param_file.read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    # if args.database=="frogs_db_data":
    frogs_sources(data_manager_dict, target_directory)
    # elif args.database=="HVL_db_data":
    #     HVL_sources(data_manager_dict, target_directory)

    # save info to json file
    with open(filename, 'wt') as out_file:
        out_file.write(to_json_string(data_manager_dict))


if __name__ == "__main__":
    main()
```
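
To make the exchange with Galaxy concrete, here is a minimal sketch of the JSON this data manager reads and writes. All paths, bank names and values are hypothetical placeholders, not output from a real run; only the overall structure (the `output_data`/`extra_files_path` input and the `data_tables`/`frogs_db` output with `name`, `value`, `path` columns) is taken from the code above.

```python
import json

# Input side: what Galaxy pre-fills in the --output file before the job runs.
# Only the key that main() reads is shown; the path is a hypothetical placeholder.
params = {
    "output_data": [
        {"extra_files_path": "/galaxy/database/jobs/000/42/dataset_42_files"}
    ]
}
target_directory = params["output_data"][0]["extra_files_path"]

# Output side: the same file, rewritten with one entry per downloaded bank.
# name/value/path are the columns appended to the frogs_db data table; the
# bank shown here (SILVA 138.1 16S, Pintail 100) is purely illustrative.
data_manager_dict = {
    "data_tables": {
        "frogs_db": [
            {
                "name": "silva 138.1 16S pintail100",
                "value": "silva_138.1_16S_pintail100",
                "path": target_directory + "/frogs_db_20210823/silva_138.1_16S_pintail100/silva_138.1_16S_pintail100.fasta",
            }
        ]
    }
}

print(json.dumps(data_manager_dict, indent=2))
```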
