comparison data_managers/data_manager_rgi_build_db/data_manager/import_data.py @ 0:715bc9aeef69 draft

planemo upload for repository https://github.com/arpcard/rgi commit 7a78289be23c5a14ae39f454610fa8eca3f05188
author card
date Wed, 27 Feb 2019 09:08:21 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:715bc9aeef69
1 import argparse
2 import datetime
3 import json
4 import os
5 import shutil
6 import sys
7 import tarfile
8 import urllib.request, urllib.error, urllib.parse
9 import zipfile
10 import logging
11
# Everything this script downloads or generates lives in ./rgi-database,
# resolved against the directory the script is started from.  Both names
# refer to that same folder.
data_path = path = os.path.join(os.getcwd(), 'rgi-database')

# Module logger with its threshold set to WARNING.
logger = logging.getLogger(__name__)
level = logging.WARNING
logger.setLevel(level)
18
def url_download(url, workdir):
    '''Download an archive and unpack its card.json into workdir.

    The payload at ``url`` is saved as ``<workdir>/download.dat``.  If it is
    a tar archive (any compression ``tarfile`` detects) or a zip archive,
    every regular member whose base name is ``card.json`` is written to
    ``<workdir>/card.json`` (directory components inside the archive are
    dropped; a later match overwrites an earlier one) and the temporary
    download is deleted.  If the payload is not a recognised archive the
    function returns and ``download.dat`` is left in place.

    Download errors are printed to stderr and not re-raised.

    Args:
        url: Location of the archive, passed to ``urllib.request.urlopen``.
        workdir: Destination directory; created when it does not exist.
    '''
    file_path = os.path.join(workdir, 'download.dat')
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    try:
        # The URL is opened before the output file so a failed request does
        # not leave an empty download.dat behind.
        with urllib.request.urlopen(urllib.request.Request(url)) as src, \
                open(file_path, 'wb') as dst:
            shutil.copyfileobj(src, dst)
    except Exception as e:
        print(str(e), file=sys.stderr)

    if tarfile.is_tarfile(file_path):
        archive = tarfile.open(file_path, 'r:*')
    elif zipfile.is_zipfile(file_path):
        archive = zipfile.ZipFile(file_path, 'r')
    else:
        return

    target = os.path.join(workdir, 'card.json')
    # Close the archive before deleting the download it reads from.
    with archive:
        # tarfile and zipfile expose different member APIs; normalise both to
        # (name, member) pairs plus a callable that opens a member for reading.
        if isinstance(archive, tarfile.TarFile):
            entries = [(m.name, m) for m in archive.getmembers() if m.isreg()]
            open_entry = archive.extractfile
        else:
            entries = [(zi.filename, zi) for zi in archive.infolist()
                       if not zi.is_dir()]
            open_entry = archive.open

        # extract only one file : card.json
        for name, entry in entries:
            base_name = os.path.basename(name)
            if base_name != 'card.json':
                continue
            print('[import_data] extracting file: {}'.format(base_name))
            # Copy the member's bytes to a fixed target path instead of
            # extracting by archive name, so the archive's own directory
            # layout never influences where the file is written.
            with open_entry(entry) as member_src, open(target, 'wb') as out:
                shutil.copyfileobj(member_src, out)
    os.remove(file_path)
56
def checkKeyExisted(key, my_dict):
    '''Tell whether my_dict[key] is present and holds something other than None.

    A missing key (KeyError) yields False; any other lookup error propagates.
    '''
    try:
        value = my_dict[key]
    except KeyError:
        return False
    return value is not None
63
def data_version():
    '''Return the '_version' entry of card.json, or '' when there is none.

    card.json is read from the module-level data_path directory.
    '''
    card_json = os.path.join(data_path, 'card.json')
    with open(card_json) as handle:
        card = json.load(handle)
    return card.get('_version', '')
73
def makeBlastDB():
    '''Build the BLAST protein database from proteindb.fsa.

    Does nothing when <path>/proteindb.fsa is not a file.  Otherwise runs
    makeblastdb through the shell, with its output sent to /dev/null, using
    <path>/protein.db as the output name.  The command's exit status is not
    checked.
    '''
    import shlex  # local import: only needed to quote the shell arguments

    if not os.path.isfile(os.path.join(path, 'proteindb.fsa')):
        return
    print('[import_data] create blast DB.')
    # Quote both paths so a working directory containing spaces or shell
    # metacharacters cannot split or alter the command line.  Paths made of
    # safe characters are passed through unchanged.
    fasta_arg = shlex.quote('{}/proteindb.fsa'.format(path))
    db_arg = shlex.quote('{}/protein.db'.format(path))
    os.system('makeblastdb -in {} -dbtype prot -out {} > /dev/null 2>&1'.format(fasta_arg, db_arg))
78
def makeDiamondDB():
    '''Build the DIAMOND protein database from proteindb.fsa.

    Does nothing when <path>/proteindb.fsa is not a file.  Otherwise runs
    ``diamond makedb --quiet`` through the shell, using <path>/protein.db as
    the database name.  The command's exit status is not checked.
    '''
    import shlex  # local import: only needed to quote the shell arguments

    if not os.path.isfile(os.path.join(path, 'proteindb.fsa')):
        return
    print('[import_data] create diamond DB.')
    # Quote both paths so a working directory containing spaces or shell
    # metacharacters cannot split or alter the command line.  Paths made of
    # safe characters are passed through unchanged.
    fasta_arg = shlex.quote('{}/proteindb.fsa'.format(path))
    db_arg = shlex.quote('{}/protein.db'.format(path))
    os.system('diamond makedb --quiet --in {} --db {}'.format(fasta_arg, db_arg))
83
def _card_model_fasta_lines(key, model):
    '''Build the FASTA lines (header, sequence, header, ...) for one CARD model.

    Returns None when the model is not one of the exported model types or
    when required data is missing; in the latter case a warning is logged
    and the whole model is omitted.
    '''
    model_type = model['model_type_id']
    # 40292: protein homolog model, 40293: protein variant model,
    # 41091: protein overexpression model.  Other model types are not exported.
    if model_type not in ('40292', '40293', '41091'):
        return None

    def omit(missing):
        logger.warning('No %s for model (%s, %s). RGI will omit this model and keep running.' \
            % (missing, model['model_id'], model['model_name']))
        logger.info('Please let the CARD Admins know! Email: card@mcmaster.ca')
        return None

    try:
        pass_bit_score = model['model_param']['blastp_bit_score']['param_value']
    except KeyError:
        return omit('bitscore')

    snp_list = None
    if model_type != '40292':
        try:
            snp_values = model['model_param']['snp']['param_value']
            snp_list = [snp_values[k] for k in snp_values]
        except Exception:
            # Stop here: carrying on would write this model with SNPs left
            # over from a previously processed model.
            return omit('snp')

    # Collect every line before anything is written so that a failure on a
    # later sequence cannot leave a partial record in the output file.
    lines = []
    try:
        sequences = model['model_sequences']['sequence']
        for seq in sequences:
            if model_type == '40292':
                lines.append('>%s_%s | model_type_id: 40292 | pass_bitscore: %s | %s\n'
                             % (key, seq, pass_bit_score, model['ARO_name']))
            else:
                lines.append('>%s_%s | model_type_id: %s | pass_bit_score: %s | SNP: %s | %s\n'
                             % (key, seq, model_type, pass_bit_score, ','.join(snp_list), model['ARO_name']))
            lines.append('%s\n' % (sequences[seq]['protein_sequence']['sequence']))
    except Exception:
        return omit('model sequences')
    return lines


def write_fasta_from_json():
    '''Creates a fasta file from card.json file.

    Reads <data_path>/card.json and writes <path>/proteindb.fsa with one
    record per protein sequence of every protein homolog, protein variant
    and protein overexpression model.  Only top-level keys made of digits
    are treated as models.  Nothing is done when proteindb.fsa already
    exists.  Models lacking a bit score, SNP data (variant and
    overexpression models) or sequences are skipped with a warning.

    Raises:
        SystemExit: with status 1 when card.json cannot be read or parsed
            (the error is logged first).
    '''
    fasta_path = os.path.join(path, 'proteindb.fsa')
    if os.path.isfile(fasta_path):
        return

    try:
        with open(os.path.join(data_path, 'card.json'), 'r') as jfile:
            j = json.load(jfile)
    except Exception as e:
        logger.error(e)
        # Exit with a non-zero status so the failure is visible to the caller.
        sys.exit(1)

    with open(fasta_path, 'w') as fout:
        for i in j:
            if not i.isdigit():
                continue
            lines = _card_model_fasta_lines(i, j[i])
            if lines:
                fout.writelines(lines)
169
def _main(args):
    '''Download the CARD data, build the derived files and return the data version.

    args is the parsed command-line namespace; when args.url is None the
    default CARD download URL is used.  After the download, the FASTA file,
    the BLAST database and the DIAMOND database are created in that order,
    and the version read back from card.json is printed and returned.
    '''
    if not os.path.exists(path):
        print('[import_data] mkdir: {}'.format(path))
        os.makedirs(path)
    print('[import_data] path: {}'.format(path))
    print(args)

    url = args.url if args.url is not None else 'https://card.mcmaster.ca/latest/data'
    print('[import_data] url: {}'.format(url))

    workdir = os.path.join(os.getcwd(), 'rgi-database')
    print('[import_data] working directory: {}'.format(workdir))
    url_download(url, workdir)

    # Each build step relies on the output of the one before it.
    for build_step in (write_fasta_from_json, makeBlastDB, makeDiamondDB):
        build_step()

    version = data_version()
    print('[import_data] data version: {}'.format(version))
    return version
191
def run():
    '''Parse the command line (optional --url) and run the import.'''
    cli = argparse.ArgumentParser(description='Create data manager json.')
    cli.add_argument(
        '--url',
        dest='url',
        action='store',
        help='Url for CARD data',
    )
    _main(cli.parse_args())


# Run the import only when executed as a script, not when imported.
if __name__ == '__main__':
    run()