diff data_managers/data_manager_rgi_build_db/data_manager/import_data.py @ 0:715bc9aeef69 draft

planemo upload for repository https://github.com/arpcard/rgi commit 7a78289be23c5a14ae39f454610fa8eca3f05188
author card
date Wed, 27 Feb 2019 09:08:21 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_managers/data_manager_rgi_build_db/data_manager/import_data.py	Wed Feb 27 09:08:21 2019 -0500
@@ -0,0 +1,199 @@
+import argparse
+import datetime
+import json
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tarfile
+import urllib.error
+import urllib.parse
+import urllib.request
+import zipfile
+
+# Directory under the current working directory that holds card.json and the
+# files derived from it; both names refer to the same location.
+data_path = path = os.path.join(os.getcwd(), 'rgi-database')
+
+# Module logger; messages below WARNING are filtered out by this level.
+level = logging.WARNING
+logger = logging.getLogger(__name__)
+logger.setLevel(level)
+
+def url_download(url, workdir):
+    '''Download an archive from ``url`` and unpack ``card.json`` into ``workdir``.
+
+    The payload is saved as ``<workdir>/download.dat``. When it is a tar
+    archive (any compression ``tarfile`` can detect) or a zip archive, every
+    regular member whose base name is ``card.json`` is written to
+    ``<workdir>/card.json`` (directories inside the archive are dropped) and
+    the downloaded archive is then deleted. A payload that is neither tar nor
+    zip is left in place untouched.
+
+    Download errors are reported on stderr rather than raised; in that case
+    any partial ``download.dat`` is removed and nothing is extracted.
+
+    Args:
+        url: Location of the archive; anything ``urllib.request`` can open.
+        workdir: Destination directory, created when missing.
+    '''
+    wanted = 'card.json'
+    file_path = os.path.join(workdir, 'download.dat')
+    target_path = os.path.join(workdir, wanted)
+    os.makedirs(workdir, exist_ok=True)
+
+    try:
+        with urllib.request.urlopen(urllib.request.Request(url)) as src:
+            with open(file_path, 'wb') as dst:
+                shutil.copyfileobj(src, dst)
+    except Exception as e:
+        # Best effort: report the problem, but never hand a missing or
+        # truncated download to the archive readers below.
+        print(str(e), file=sys.stderr)
+        if os.path.exists(file_path):
+            os.remove(file_path)
+        return
+
+    if tarfile.is_tarfile(file_path):
+        with tarfile.open(file_path, 'r:*') as archive:
+            for member in archive.getmembers():
+                # Only regular files qualify; directories and links are skipped.
+                if member.isreg() and os.path.basename(member.name) == wanted:
+                    print('[import_data] extracting file: {}'.format(wanted))
+                    with archive.extractfile(member) as src:
+                        with open(target_path, 'wb') as dst:
+                            shutil.copyfileobj(src, dst)
+    elif zipfile.is_zipfile(file_path):
+        with zipfile.ZipFile(file_path, 'r') as archive:
+            for info in archive.infolist():
+                # Directory entries end with '/', so their base name is empty.
+                if os.path.basename(info.filename) == wanted:
+                    print('[import_data] extracting file: {}'.format(wanted))
+                    with archive.open(info) as src:
+                        with open(target_path, 'wb') as dst:
+                            shutil.copyfileobj(src, dst)
+    else:
+        return
+    os.remove(file_path)
+
+def checkKeyExisted(key, my_dict):
+    '''Return True when ``key`` is present in ``my_dict`` with a non-None value.'''
+    try:
+        value = my_dict[key]
+    except KeyError:
+        return False
+    return value is not None
+
+def data_version():
+    '''Return the ``_version`` value stored in ``card.json``.
+
+    The file is read from ``data_path``. An empty string is returned when
+    the JSON object has no ``_version`` key.
+    '''
+    # JSON is read as UTF-8 explicitly so the result does not depend on locale.
+    with open(os.path.join(data_path, 'card.json'), encoding='utf-8') as json_file:
+        json_data = json.load(json_file)
+    return json_data.get('_version', '')
+
+def makeBlastDB():
+    '''Build the BLAST protein database from ``proteindb.fsa`` in ``path``.
+
+    Does nothing when the FASTA file is absent. ``makeblastdb`` is invoked
+    with an argument list (no shell), so directory names containing spaces or
+    shell metacharacters are passed through intact. Its own output is
+    discarded; a missing executable or non-zero exit status is reported on
+    stderr instead of being silently ignored.
+    '''
+    fasta = os.path.join(path, 'proteindb.fsa')
+    if not os.path.isfile(fasta):
+        return
+    print('[import_data] create blast DB.')
+    command = ['makeblastdb', '-in', fasta, '-dbtype', 'prot',
+               '-out', os.path.join(path, 'protein.db')]
+    try:
+        status = subprocess.run(command, stdout=subprocess.DEVNULL,
+                                stderr=subprocess.DEVNULL).returncode
+    except OSError as e:
+        # Typically raised when the executable is not on PATH.
+        print('[import_data] could not run makeblastdb: {}'.format(e), file=sys.stderr)
+        return
+    if status != 0:
+        print('[import_data] makeblastdb exited with status {}'.format(status), file=sys.stderr)
+
+def makeDiamondDB():
+    '''Build the DIAMOND protein database from ``proteindb.fsa`` in ``path``.
+
+    Does nothing when the FASTA file is absent. ``diamond makedb`` is invoked
+    with an argument list (no shell), so directory names containing spaces or
+    shell metacharacters are passed through intact. A missing executable or
+    non-zero exit status is reported on stderr instead of being silently
+    ignored.
+    '''
+    fasta = os.path.join(path, 'proteindb.fsa')
+    if not os.path.isfile(fasta):
+        return
+    print('[import_data] create diamond DB.')
+    command = ['diamond', 'makedb', '--quiet', '--in', fasta,
+               '--db', os.path.join(path, 'protein.db')]
+    try:
+        status = subprocess.run(command).returncode
+    except OSError as e:
+        # Typically raised when the executable is not on PATH.
+        print('[import_data] could not run diamond: {}'.format(e), file=sys.stderr)
+        return
+    if status != 0:
+        print('[import_data] diamond makedb exited with status {}'.format(status), file=sys.stderr)
+
+def _model_fasta_lines(model_key, model):
+    '''Return the FASTA lines for one model entry, or [] when it is skipped.
+
+    Only model types 40292 (protein homolog), 40293 (protein variant) and
+    41091 (protein overexpression) produce output. A model lacking a bit
+    score, lacking SNPs (variant / overexpression only) or lacking usable
+    sequences is logged and omitted as a whole, so the output never contains
+    a header without its sequence or SNPs taken from another model.
+    '''
+    model_type = model.get('model_type_id')
+    if model_type not in ('40292', '40293', '41091'):
+        return []
+    ident = (model.get('model_id'), model.get('model_name'))
+
+    try:
+        pass_bit_score = model['model_param']['blastp_bit_score']['param_value']
+    except (KeyError, TypeError):
+        logger.warning('No bitscore for model (%s, %s). RGI will omit this model and keep running.', *ident)
+        logger.info('Please let the CARD Admins know! Email: card@mcmaster.ca')
+        return []
+
+    if model_type == '40292':
+        # The homolog header spells the label 'pass_bitscore'; the other two
+        # model types use 'pass_bit_score'. Both spellings are kept unchanged.
+        details = 'pass_bitscore: %s' % (pass_bit_score,)
+    else:
+        try:
+            snps = ','.join(model['model_param']['snp']['param_value'].values())
+        except (KeyError, TypeError, AttributeError):
+            logger.warning('No snp for model (%s, %s). RGI will omit this model and keep running.', *ident)
+            logger.info('Please let the CARD Admins know! Email: card@mcmaster.ca')
+            return []
+        details = 'pass_bit_score: %s | SNP: %s' % (pass_bit_score, snps)
+
+    lines = []
+    try:
+        for seq_id, seq in model['model_sequences']['sequence'].items():
+            lines.append('>%s_%s | model_type_id: %s | %s | %s\n'
+                         % (model_key, seq_id, model_type, details, model['ARO_name']))
+            lines.append('%s\n' % (seq['protein_sequence']['sequence'],))
+    except (KeyError, TypeError, AttributeError):
+        logger.warning('No model sequences for model (%s, %s). RGI will omit this model and keep running.', *ident)
+        logger.info('Please let the CARD Admins know! Email: card@mcmaster.ca')
+        return []
+    return lines
+
+
+def write_fasta_from_json():
+    '''Create ``proteindb.fsa`` in ``path`` from ``card.json`` in ``data_path``.
+
+    Returns immediately when the FASTA file already exists. When card.json
+    cannot be read or parsed, or is not a JSON object, the error is logged
+    and the process exits with status 1. Only top-level entries whose key is
+    all digits are treated as models.
+    '''
+    fasta_path = os.path.join(path, 'proteindb.fsa')
+    if os.path.isfile(fasta_path):
+        return
+
+    try:
+        with open(os.path.join(data_path, 'card.json'), 'r', encoding='utf-8') as jfile:
+            j = json.load(jfile)
+    except (OSError, ValueError) as e:
+        logger.error(e)
+        sys.exit(1)
+    if not isinstance(j, dict):
+        logger.error('card.json does not contain a JSON object')
+        sys.exit(1)
+
+    # Collect every record first so the output file is written in one pass.
+    lines = []
+    for model_key, model in j.items():
+        if model_key.isdigit() and isinstance(model, dict):
+            lines.extend(_model_fasta_lines(model_key, model))
+
+    with open(fasta_path, 'w') as fout:
+        fout.writelines(lines)
+
+def _main(args):
+    '''Download the CARD data, build the derived databases, return the version.
+
+    Args:
+        args: Parsed command-line namespace; ``args.url`` selects the
+            download location. When it is None or empty, the default
+            ``https://card.mcmaster.ca/latest/data`` is used.
+
+    Returns:
+        The value reported by ``data_version()``.
+    '''
+    if not os.path.exists(path):
+        print('[import_data] mkdir: {}'.format(path))
+        os.makedirs(path)
+    print('[import_data] path: {}'.format(path))
+    print(args)
+
+    # An empty --url can never be opened, so it falls back to the default too.
+    url = args.url or 'https://card.mcmaster.ca/latest/data'
+    print('[import_data] url: {}'.format(url))
+    workdir = os.path.join(os.getcwd(), 'rgi-database')
+    print('[import_data] working directory: {}'.format(workdir))
+    url_download(url, workdir)
+    write_fasta_from_json()
+    makeBlastDB()
+    makeDiamondDB()
+    version = data_version()
+    print('[import_data] data version: {}'.format(version))
+    return version
+
+def run():
+    '''Parse the command line and hand the resulting namespace to ``_main``.'''
+    cli = argparse.ArgumentParser(description='Create data manager json.')
+    cli.add_argument('--url', dest='url', action='store', help='Url for CARD data')
+    _main(cli.parse_args())
+
+# Run the import only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    run()