Mercurial > repos > estrain > data_manager_mlst
view data_manager_mlst/data_manager/data_manager_mlst.py @ 2:1930c02df64e draft default tip
Uploaded
author | estrain |
---|---|
date | Thu, 18 Jan 2024 03:11:43 +0000 |
parents | a2da81d3378b |
children |
line wrap: on
line source
"""Download the PubMLST schemes, build a BLAST database and emit a data table entry.

The script is driven by a JSON parameter file passed via ``--out``; the same
path is overwritten at the end with the ``mlst`` data table entry.
"""

import argparse
import datetime
import json
import os
import shutil
import subprocess
import sys
from urllib.request import urlopen

# Location of the scheme -> species mapping shipped with tseemann/mlst.
SCHEME_SPECIES_MAP_URL = (
    'https://raw.githubusercontent.com/tseemann/mlst/master/db/scheme_species_map.tab'
)
# Seconds to wait on the network before giving up instead of hanging forever.
DOWNLOAD_TIMEOUT = 60


def download_pubmlst_databases():
    """Download databases from pubmlst.

    Runs ``mlst-download_pub_mlst -d pubmlst`` in the current working
    directory; ``make_blast_database`` later expects the result in
    ``<cwd>/pubmlst``.

    Exits the process with status 1 if the command fails or cannot be
    started (e.g. the executable is not on ``PATH``).
    """
    try:
        subprocess.run(["mlst-download_pub_mlst", "-d", "pubmlst"], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing/non-executable binary, which check=True
        # does not turn into CalledProcessError.
        print(f"Error downloading databases: {e}", file=sys.stderr)
        sys.exit(1)


def make_blast_database(output_directory):
    """Create a BLAST database from downloaded data.

    Moves ``<cwd>/pubmlst`` to ``<output_directory>/pubmlst`` (replacing any
    existing copy), concatenates every ``*.tfa`` file of every scheme
    directory into ``<output_directory>/blast/mlst.fa`` with each FASTA header
    prefixed by ``<scheme>.``, and then runs ``makeblastdb`` on that file.

    Args:
        output_directory: Directory that receives the ``pubmlst`` and
            ``blast`` sub-directories. It is resolved to an absolute path.

    Exits the process with status 1 if ``makeblastdb`` fails or cannot be
    started.
    """
    dir_path = os.getcwd()
    mlst_dir = os.path.join(dir_path, "pubmlst")
    output_directory = os.path.abspath(output_directory)
    output_mlst_dir = os.path.join(output_directory, "pubmlst")

    if os.path.exists(output_mlst_dir):
        shutil.rmtree(output_mlst_dir)
    shutil.move(mlst_dir, output_mlst_dir)

    blast_dir = os.path.join(output_directory, "blast")
    os.makedirs(blast_dir, exist_ok=True)
    blast_file = os.path.join(blast_dir, "mlst.fa")

    # Open once in write mode: appending would duplicate every sequence when
    # the output directory still holds an mlst.fa from a previous run.
    with open(blast_file, 'w') as outfile:
        # Sorted so the generated FASTA is reproducible across filesystems.
        for scheme in sorted(os.listdir(output_mlst_dir)):
            scheme_path = os.path.join(output_mlst_dir, scheme)
            if not os.path.isdir(scheme_path):
                continue
            for file_name in sorted(os.listdir(scheme_path)):
                if not file_name.endswith('.tfa'):
                    continue
                with open(os.path.join(scheme_path, file_name), 'r') as infile:
                    for line in infile:
                        # Lines mentioning 'not a locus' are dropped entirely.
                        if 'not a locus' in line:
                            continue
                        if line.startswith('>'):
                            line = f">{scheme}.{line[1:]}"
                        # A file without a trailing newline must not fuse its
                        # last line with the next file's first header.
                        if not line.endswith('\n'):
                            line += '\n'
                        outfile.write(line)

    try:
        subprocess.run(
            [
                "makeblastdb", "-hash_index", "-in", blast_file,
                "-dbtype", "nucl", "-title", "PubMLST", "-parse_seqids",
            ],
            check=True,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Error creating BLAST database: {e}", file=sys.stderr)
        sys.exit(1)


def write_json(version, args_path, args_name, args_out):
    """Write data table entry to JSON file.

    Args:
        version: Stored as the entry's ``value``.
        args_path: Stored as the entry's ``path``.
        args_name: Stored as the entry's ``name``.
        args_out: Path of the JSON file to (over)write.
    """
    data_table_entry = {
        'data_tables': {
            'mlst': [
                {
                    "value": version,
                    "name": args_name,
                    "path": args_path,
                }
            ]
        }
    }

    with open(args_out, 'w') as fh:
        json.dump(data_table_entry, fh, indent=2, sort_keys=True)


def main():
    """Parse ``--out``, build the databases and write the data table entry.

    The ``--out`` file is first read as JSON to find
    ``output_data[0].extra_files_path`` and finally overwritten with the
    data table entry. Exits with status 1 when any download or build step
    fails.
    """
    parser = argparse.ArgumentParser(description='Download and process pubmlst databases')
    parser.add_argument('--out', type=str, required=True, nargs=1, help='output file')
    args = parser.parse_args()

    with open(args.out[0], encoding='utf-8') as fh:
        params = json.load(fh)

    output_directory = params['output_data'][0]['extra_files_path']
    os.makedirs(output_directory, exist_ok=True)

    download_pubmlst_databases()
    make_blast_database(output_directory)

    stab = "scheme_species_map.tab"
    # Use urllib to send a GET request to the URL
    try:
        with urlopen(SCHEME_SPECIES_MAP_URL, timeout=DOWNLOAD_TIMEOUT) as response:
            response_content = response.read().decode('utf-8')
        # Explicit encoding so the write cannot fail or re-encode under a
        # non-UTF-8 locale.
        with open(stab, 'w', encoding='utf-8') as file:
            file.write(response_content)
    except Exception as e:
        # Stop here: continuing would copy a missing or truncated file and
        # fail later with a less helpful error.
        print(f"Failed to retrieve the file: {e}", file=sys.stderr)
        sys.exit(1)
    print("File downloaded successfully")

    shutil.copy(stab, output_directory)

    datetime_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    tablename = f"mlst_database_{datetime_str}"
    write_json(tablename, output_directory, tablename, args.out[0])


if __name__ == "__main__":
    main()