view data_manager_mlst/data_manager/data_manager_mlst.py @ 0:56271dcbc91c draft

Uploaded
author estrain
date Thu, 18 Jan 2024 02:22:30 +0000
parents
children a2da81d3378b
line wrap: on
line source

import argparse
import datetime
import json
import os
import shutil
import subprocess
import sys

import requests

def download_pubmlst_databases():
    """Copy the PubMLST database tree into ``./pubmlst``.

    Runs ``cp -R /mnt/data/mlst/db pubmlst`` relative to the current working
    directory.

    Raises:
        SystemExit: With status 1 when ``cp`` returns a non-zero exit code.
    """
    try:
        subprocess.run(["cp", "-R", "/mnt/data/mlst/db", "pubmlst"], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error downloading databases: {e}")
        # Requires ``sys`` at module level; without it this path raised
        # NameError instead of exiting with a clean status code.
        sys.exit(1)

def make_blast_database(output_directory):
    """Build the combined allele FASTA and index it with ``makeblastdb``.

    Moves ``./pubmlst`` (relative to the current working directory) to
    ``<output_directory>/pubmlst``, replacing any directory already there.
    Every ``*.tfa`` file in every scheme sub-directory is then concatenated
    into ``<output_directory>/blast/mlst.fa``. Header lines are rewritten
    from ``>allele`` to ``>scheme.allele`` and lines containing
    ``not a locus`` are dropped. Finally ``makeblastdb`` is run on the
    combined file.

    Args:
        output_directory: Directory that receives the ``pubmlst`` and
            ``blast`` sub-directories.

    Raises:
        SystemExit: With status 1 when ``makeblastdb`` fails or cannot be
            started.
    """
    mlst_dir = os.path.join(os.getcwd(), "pubmlst")
    output_directory = os.path.abspath(output_directory)
    output_mlst_dir = os.path.join(output_directory, "pubmlst")

    if os.path.exists(output_mlst_dir):
        shutil.rmtree(output_mlst_dir)
    shutil.move(mlst_dir, output_mlst_dir)

    blast_dir = os.path.join(output_directory, "blast")
    os.makedirs(blast_dir, exist_ok=True)

    blast_file = os.path.join(blast_dir, "mlst.fa")
    # Truncate instead of appending: a leftover mlst.fa from an earlier run
    # would otherwise yield duplicate sequence ids, which -parse_seqids rejects.
    with open(blast_file, 'w') as outfile:
        # Sorted so the combined FASTA is reproducible across file systems.
        for scheme in sorted(os.listdir(output_mlst_dir)):
            scheme_path = os.path.join(output_mlst_dir, scheme)
            if not os.path.isdir(scheme_path):
                continue
            for file_name in sorted(os.listdir(scheme_path)):
                if not file_name.endswith('.tfa'):
                    continue
                with open(os.path.join(scheme_path, file_name), 'r') as infile:
                    for line in infile:
                        if 'not a locus' in line:
                            continue
                        if line.startswith('>'):
                            line = f">{scheme}.{line[1:]}"
                        outfile.write(line)
                        # A file lacking a final newline must not fuse its
                        # last line with the next file's header.
                        if not line.endswith('\n'):
                            outfile.write('\n')

    try:
        subprocess.run(["makeblastdb", "-hash_index", "-in", blast_file, "-dbtype", "nucl", "-title", "PubMLST", "-parse_seqids"], check=True)
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # FileNotFoundError covers a makeblastdb binary missing from PATH.
        print(f"Error creating BLAST database: {e}")
        sys.exit(1)

def write_json(version, args_path, args_name, args_out):
    """Write a one-row ``mlst`` data table description to a JSON file.

    Args:
        version: Stored under the row's ``value`` key.
        args_path: Stored under the row's ``path`` key.
        args_name: Stored under the row's ``name`` key.
        args_out: Path of the JSON file to create or overwrite.
    """
    row = dict(value=version, name=args_name, path=args_path)
    payload = {'data_tables': {'mlst': [row]}}

    # Keys are sorted and indented by two spaces on output.
    with open(args_out, 'w') as handle:
        json.dump(payload, handle, indent=2, sort_keys=True)

def _fetch_scheme_species_map(url, destination, timeout=60):
    """Download *url* into *destination*, exiting with status 1 on failure."""
    try:
        # A timeout keeps the job from hanging forever on a stalled connection.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to retrieve the file: {e}")
        sys.exit(1)

    if response.status_code != 200:
        # Stop here: continuing would copy a missing or stale file.
        print("Failed to retrieve the file")
        sys.exit(1)

    with open(destination, 'w') as file:
        file.write(response.text)
    print("File downloaded successfully")


def main():
    """Build the MLST database and write its data table entry.

    Reads the JSON parameter file given by ``--out``, takes the output
    directory from ``params['output_data'][0]['extra_files_path']``, copies
    the PubMLST data, builds the BLAST database, downloads
    ``scheme_species_map.tab`` into that directory, and finally overwrites
    the ``--out`` file with a timestamped ``mlst`` data table entry.

    Raises:
        SystemExit: With status 1 when the scheme/species map cannot be
            downloaded.
    """
    parser = argparse.ArgumentParser(description='Download and process pubmlst databases')
    parser.add_argument('--out', type=str, required=True, nargs=1, help='output file')
    args = parser.parse_args()

    # The same file is read for parameters now and overwritten with the result later.
    with open(args.out[0]) as fh:
        params = json.load(fh)

    output_directory = params['output_data'][0]['extra_files_path']
    os.makedirs(output_directory, exist_ok=True)

    download_pubmlst_databases()
    make_blast_database(output_directory)

    url = 'https://raw.githubusercontent.com/tseemann/mlst/master/db/scheme_species_map.tab'
    stab = "scheme_species_map.tab"
    _fetch_scheme_species_map(url, stab)
    shutil.copy(stab, output_directory)

    datetime_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    tablename = f"mlst_database_{datetime_str}"

    write_json(tablename, output_directory, tablename, args.out[0])

# Run the data manager only when executed as a script, not when imported.
if __name__ == "__main__":
    main()