Mercurial > repos > bgruening > data_manager_diamond_database_builder
diff data_manager/data_manager_diamond_database_builder.py @ 2:5558f74bd296 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_diamond_database_builder commit 75abf7d4b23ed7ae8abce80609d81b20bc882863"
author | iuc |
---|---|
date | Sun, 21 Mar 2021 12:07:34 +0000 |
parents | 5a0d0bee4f8d |
children |
line wrap: on
line diff
--- a/data_manager/data_manager_diamond_database_builder.py Tue Dec 03 17:39:48 2019 -0500 +++ b/data_manager/data_manager_diamond_database_builder.py Sun Mar 21 12:07:34 2021 +0000 @@ -1,17 +1,19 @@ #!/usr/bin/env python +import bz2 +import gzip import json -import sys -import os -import tempfile -import shutil import optparse -import urllib2 +import os +import shutil import subprocess -from ftplib import FTP +import sys import tarfile +import tempfile +import urllib.error +import urllib.parse +import urllib.request import zipfile -import gzip -import bz2 +from ftplib import FTP CHUNK_SIZE = 2**20 # 1mb @@ -21,11 +23,6 @@ shutil.rmtree(tmp_dir) -def stop_err(msg): - sys.stderr.write(msg) - sys.exit(1) - - def _get_files_in_ftp_path(ftp, path): path_contents = [] ftp.retrlines('MLSD %s' % (path), path_contents.append) @@ -54,12 +51,17 @@ return [bz2.BZ2File(file_obj.name, 'rb')] -def download_from_ncbi(data_manager_dict, params, target_directory, database_id, database_name): +def download_from_ncbi(data_manager_dict, params, target_directory, + database_id, database_name): NCBI_FTP_SERVER = 'ftp.ncbi.nlm.nih.gov' NCBI_DOWNLOAD_PATH = '/blast/db/FASTA/' - COMPRESSED_EXTENSIONS = [('.tar.gz', _get_stream_readers_for_tar), ('.tar.bz2', _get_stream_readers_for_tar), ('.zip', _get_stream_readers_for_zip), ('.gz', _get_stream_readers_for_gzip), ('.bz2', _get_stream_readers_for_bz2)] + COMPRESSED_EXTENSIONS = [('.tar.gz', _get_stream_readers_for_tar), + ('.tar.bz2', _get_stream_readers_for_tar), + ('.zip', _get_stream_readers_for_zip), + ('.gz', _get_stream_readers_for_gzip), + ('.bz2', _get_stream_readers_for_bz2)] - ncbi_identifier = params['param_dict']['reference_source']['requested_identifier'] + ncbi_identifier = params['reference_source']['requested_identifier'] ftp = FTP(NCBI_FTP_SERVER) ftp.login() @@ -79,9 +81,9 @@ tmp_dir = tempfile.mkdtemp(prefix='tmp-data-manager-ncbi-') ncbi_fasta_filename = os.path.join(tmp_dir, "%s%s" % (ncbi_identifier, ext)) - fasta_base_filename = "%s.fa" % database_id - fasta_filename = os.path.join(target_directory, fasta_base_filename) - fasta_writer = open(fasta_filename, 'wb+') + # fasta_base_filename = "%s.fa" % database_id + # fasta_filename = os.path.join(target_directory, fasta_base_filename) + # fasta_writer = open(fasta_filename, 'wb+') tmp_extract_dir = os.path.join(tmp_dir, 'extracted_fasta') os.mkdir(tmp_extract_dir) @@ -106,8 +108,8 @@ def download_from_url(data_manager_dict, params, target_directory, database_id, database_name): # TODO: we should automatically do decompression here - urls = filter(bool, map(lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split('\n'))) - fasta_reader = [urllib2.urlopen(url) for url in urls] + urls = list(filter(bool, [x.strip() for x in params['reference_source']['user_url'].split('\n')])) + fasta_reader = [urllib.request.urlopen(url) for url in urls] data_table_entry = _stream_fasta_to_file(fasta_reader, target_directory, database_id, database_name, params) _add_data_table_entry(data_manager_dict, data_table_entry) @@ -115,19 +117,19 @@ def download_from_history(data_manager_dict, params, target_directory, database_id, database_name): # TODO: allow multiple FASTA input files - input_filename = params['param_dict']['reference_source']['input_fasta'] + input_filename = params['reference_source']['input_fasta'] if isinstance(input_filename, list): fasta_reader = [open(filename, 'rb') for filename in input_filename] else: - fasta_reader = open(input_filename) + fasta_reader = open(input_filename, 'rb') data_table_entry = _stream_fasta_to_file(fasta_reader, target_directory, database_id, database_name, params) _add_data_table_entry(data_manager_dict, data_table_entry) def copy_from_directory(data_manager_dict, params, target_directory, database_id, database_name): - input_filename = params['param_dict']['reference_source']['fasta_filename'] - create_symlink = params['param_dict']['reference_source']['create_symlink'] == 'create_symlink' + input_filename = params['reference_source']['fasta_filename'] + create_symlink = params['reference_source']['create_symlink'] == 'create_symlink' if create_symlink: data_table_entry = _create_symlink(input_filename, target_directory, database_id, database_name) else: @@ -146,7 +148,8 @@ return data_manager_dict -def _stream_fasta_to_file(fasta_stream, target_directory, database_id, database_name, params, close_stream=True): +def _stream_fasta_to_file(fasta_stream, target_directory, database_id, + database_name, params, close_stream=True): fasta_base_filename = "%s.fa" % database_id fasta_filename = os.path.join(target_directory, fasta_base_filename) @@ -154,53 +157,70 @@ temp_fasta.close() fasta_writer = open(temp_fasta.name, 'wb+') - if isinstance(fasta_stream, list) and len(fasta_stream) == 1: - fasta_stream = fasta_stream[0] + if not isinstance(fasta_stream, list): + fasta_stream = [fasta_stream] - if isinstance(fasta_stream, list): - last_char = None - for fh in fasta_stream: - if last_char not in [None, '\n', '\r']: - fasta_writer.write('\n') - while True: - data = fh.read(CHUNK_SIZE) - if data: - fasta_writer.write(data) - last_char = data[-1] - else: - break - if close_stream: - fh.close() - else: + last_char = None + for fh in fasta_stream: + if last_char not in [None, '\n', '\r']: + fasta_writer.write('\n') while True: - data = fasta_stream.read(CHUNK_SIZE) + data = fh.read(CHUNK_SIZE) if data: fasta_writer.write(data) + last_char = data[-1] else: break if close_stream: - fasta_stream.close() + fh.close() fasta_writer.close() - args = ['diamond', 'makedb', '--in', temp_fasta.name, '--db', fasta_filename] + args = ['diamond', 'makedb', + '--in', temp_fasta.name, + '--db', fasta_filename] + if params['tax_cond']['tax_select'] == "history": + for i in ["taxonmap", "taxonnodes", "taxonnames"]: + args.extend(['--' + i, params['tax_cond'][i]]) + elif params['tax_cond']['tax_select'] == "ncbi": + if os.path.isfile(os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid.FULL.gz')): + args.extend(['--taxonmap', + os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid.FULL.gz')]) + elif os.path.isfile(os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid.FULL')): + args.extend(['--taxonmap', + os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid.FULL')]) + elif os.path.isfile(os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid.gz')): + args.extend(['--taxonmap', + os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid.gz')]) + elif os.path.isfile(os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid')): + args.extend(['--taxonmap', + os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid')]) + else: + raise Exception('Unable to find prot.accession2taxid file in %s' % (params['tax_cond']['ncbi_tax'])) + + args.extend(['--taxonnodes', + os.path.join(params['tax_cond']['ncbi_tax'], 'nodes.dmp')]) + args.extend(['--taxonnames', + os.path.join(params['tax_cond']['ncbi_tax'], 'names.dmp')]) tmp_stderr = tempfile.NamedTemporaryFile(prefix="tmp-data-manager-diamond-database-builder-stderr") - proc = subprocess.Popen(args=args, shell=False, cwd=target_directory, stderr=tmp_stderr.fileno()) + proc = subprocess.Popen(args=args, shell=False, cwd=target_directory, + stderr=tmp_stderr.fileno()) return_code = proc.wait() if return_code: tmp_stderr.flush() tmp_stderr.seek(0) - print >> sys.stderr, "Error building diamond database:" + print("Error building diamond database:", file=sys.stderr) while True: chunk = tmp_stderr.read(CHUNK_SIZE) if not chunk: break - sys.stderr.write(chunk) + sys.stderr.write(chunk.decode('utf-8')) sys.exit(return_code) tmp_stderr.close() os.remove(temp_fasta.name) - return dict(value=database_id, name=database_name, db_path="%s.dmnd" % fasta_base_filename) + return dict(value=database_id, name=database_name, + db_path="%s.dmnd" % fasta_base_filename) def _create_symlink(input_filename, target_directory, database_id, database_name): @@ -210,27 +230,36 @@ return dict(value=database_id, name=database_name, db_path=fasta_base_filename) -REFERENCE_SOURCE_TO_DOWNLOAD = dict(ncbi=download_from_ncbi, url=download_from_url, history=download_from_history, directory=copy_from_directory) +REFERENCE_SOURCE_TO_DOWNLOAD = dict(ncbi=download_from_ncbi, + url=download_from_url, + history=download_from_history, + directory=copy_from_directory) def main(): # Parse Command Line parser = optparse.OptionParser() - parser.add_option('-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description') + parser.add_option('-d', '--dbkey_description', dest='dbkey_description', + action='store', type="string", default=None, + help='dbkey_description') (options, args) = parser.parse_args() filename = args[0] - params = json.loads(open(filename).read()) + with open(filename) as fp: + params = json.load(fp) target_directory = params['output_data'][0]['extra_files_path'] os.mkdir(target_directory) data_manager_dict = {} - database_id = params['param_dict']['database_id'] - database_name = params['param_dict']['database_name'] + param_dict = params['param_dict'] + database_id = param_dict['database_id'] + database_name = param_dict['database_name'] + if param_dict['tax_cond']['tax_select'] == "ncbi": + param_dict['tax_cond']['ncbi_tax'] = args[1] # Fetch the FASTA - REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']](data_manager_dict, params, target_directory, database_id, database_name) + REFERENCE_SOURCE_TO_DOWNLOAD[param_dict['reference_source']['reference_source_selector']](data_manager_dict, param_dict, target_directory, database_id, database_name) # save info to json file open(filename, 'w').write(json.dumps(data_manager_dict, sort_keys=True))