Mercurial > repos > estrain > data_manager_mlst
changeset 0:56271dcbc91c draft
Uploaded
author | estrain |
---|---|
date | Thu, 18 Jan 2024 02:22:30 +0000 |
parents | |
children | a2da81d3378b |
files | data_manager_mlst/data_manager/data_manager_mlst.py data_manager_mlst/data_manager/data_manager_mlst.xml data_manager_mlst/data_manager_conf.xml data_manager_mlst/test-data/mlst.loc data_manager_mlst/tool-data/mlst.loc data_manager_mlst/tool_data_table_conf.xml.sample data_manager_mlst/tool_data_table_conf.xml.test |
diffstat | 7 files changed, 181 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# File: data_manager_mlst/data_manager/data_manager_mlst.py
"""Galaxy data manager script that assembles an MLST BLAST database.

The script copies a directory of PubMLST schemes, concatenates every
scheme's ``.tfa`` allele files into one FASTA file, indexes that file with
``makeblastdb``, fetches ``scheme_species_map.tab`` from the tseemann/mlst
repository and finally writes the data table entry (as JSON) that registers
the new database in the ``mlst`` data table.
"""
import argparse
import datetime
import json
import os
import shutil
import subprocess
import sys
import urllib.request

# Local directory holding one sub-directory of ``.tfa`` files per scheme.
DEFAULT_PUBMLST_SOURCE = "/mnt/data/mlst/db"
SCHEME_SPECIES_MAP_NAME = "scheme_species_map.tab"
SCHEME_SPECIES_MAP_URL = (
    "https://raw.githubusercontent.com/tseemann/mlst/master/db/"
    + SCHEME_SPECIES_MAP_NAME
)


def _fail(message):
    """Report *message* on stderr and terminate with exit status 1."""
    print(message, file=sys.stderr)
    sys.exit(1)


def download_pubmlst_databases(source_dir=DEFAULT_PUBMLST_SOURCE,
                               destination="pubmlst"):
    """Copy the PubMLST scheme directory into the working directory.

    Despite the name, nothing is fetched over the network: ``source_dir`` is
    copied recursively with ``cp -R`` to ``destination`` (relative to the
    current working directory by default).

    Exits with status 1 if the copy fails or ``cp`` cannot be executed.
    """
    try:
        subprocess.run(["cp", "-R", source_dir, destination], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        _fail(f"Error downloading databases: {e}")


def _write_scheme_fasta(pubmlst_dir, fasta_path):
    """Concatenate all ``.tfa`` files below *pubmlst_dir* into *fasta_path*.

    Header lines are rewritten from ``>allele`` to ``>scheme.allele`` so that
    identifiers stay unique across schemes. Lines containing ``not a locus``
    are dropped. Schemes and files are visited in sorted order so the output
    is reproducible.
    """
    # Mode "w" (not "a"): a pre-existing file must not accumulate duplicates.
    with open(fasta_path, "w") as outfile:
        for scheme in sorted(os.listdir(pubmlst_dir)):
            scheme_path = os.path.join(pubmlst_dir, scheme)
            if not os.path.isdir(scheme_path):
                continue
            for file_name in sorted(os.listdir(scheme_path)):
                if not file_name.endswith(".tfa"):
                    continue
                with open(os.path.join(scheme_path, file_name)) as infile:
                    for line in infile:
                        if "not a locus" in line:
                            continue
                        if line.startswith(">"):
                            line = f">{scheme}.{line[1:]}"
                        # A file without a final newline would otherwise
                        # glue its last line to the next file's header.
                        if not line.endswith("\n"):
                            line += "\n"
                        outfile.write(line)


def make_blast_database(output_directory):
    """Create a BLAST database from the copied PubMLST data.

    Moves ``./pubmlst`` (relative to the current working directory) to
    ``<output_directory>/pubmlst``, replacing any directory already there,
    writes ``<output_directory>/blast/mlst.fa`` and runs ``makeblastdb`` on it.

    Exits with status 1 if ``makeblastdb`` fails or cannot be executed.
    """
    mlst_dir = os.path.join(os.getcwd(), "pubmlst")
    output_directory = os.path.abspath(output_directory)
    output_mlst_dir = os.path.join(output_directory, "pubmlst")

    if os.path.exists(output_mlst_dir):
        shutil.rmtree(output_mlst_dir)
    shutil.move(mlst_dir, output_mlst_dir)

    blast_dir = os.path.join(output_directory, "blast")
    os.makedirs(blast_dir, exist_ok=True)

    blast_file = os.path.join(blast_dir, "mlst.fa")
    _write_scheme_fasta(output_mlst_dir, blast_file)

    command = [
        "makeblastdb", "-hash_index",
        "-in", blast_file,
        "-dbtype", "nucl",
        "-title", "PubMLST",
        "-parse_seqids",
    ]
    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        _fail(f"Error creating BLAST database: {e}")


def _download_scheme_species_map(dest_dir, url=SCHEME_SPECIES_MAP_URL,
                                 timeout=60):
    """Download the scheme/species map into *dest_dir* and return its path.

    Exits with status 1 when the request fails (network error, timeout or a
    non-success HTTP status) instead of continuing without the file.
    """
    target = os.path.join(dest_dir, SCHEME_SPECIES_MAP_NAME)
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            payload = response.read()
    except OSError as e:  # URLError, HTTPError and timeouts are OSErrors
        _fail(f"Failed to retrieve the file: {e}")
    with open(target, "wb") as fh:
        fh.write(payload)
    print("File downloaded successfully")
    return target


def write_json(version, args_path, args_name, args_out):
    """Write the ``mlst`` data table entry to the JSON file *args_out*.

    The entry carries ``version`` as ``value``, ``args_name`` as ``name`` and
    ``args_path`` as ``path``.
    """
    data_table_entry = {
        'data_tables': {
            'mlst': [
                {
                    "value": version,
                    "name": args_name,
                    "path": args_path,
                }
            ]
        }
    }

    with open(args_out, 'w') as fh:
        json.dump(data_table_entry, fh, indent=2, sort_keys=True)


def main():
    """Command-line entry point: build the database and write the JSON entry.

    ``--out`` names a JSON file that is first read for its parameters
    (``output_data[0].extra_files_path`` is the target directory) and then
    overwritten with the resulting data table entry.
    """
    parser = argparse.ArgumentParser(description='Download and process pubmlst databases')
    parser.add_argument('--out', type=str, required=True, nargs=1, help='output file')
    args = parser.parse_args()
    out_path = args.out[0]

    with open(out_path) as fh:
        params = json.load(fh)

    output_directory = params['output_data'][0]['extra_files_path']
    os.makedirs(output_directory, exist_ok=True)

    download_pubmlst_databases()
    make_blast_database(output_directory)
    _download_scheme_species_map(output_directory)

    datetime_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    tablename = f"mlst_database_{datetime_str}"

    write_json(tablename, output_directory, tablename, out_path)


if __name__ == "__main__":
    main()
<!-- File: data_manager_mlst/data_manager/data_manager_mlst.xml -->
<!-- Galaxy data manager tool wrapper: runs data_manager_mlst.py, which reads
     its parameters from the output JSON file and overwrites that file with
     the new "mlst" data table entry. -->
<tool id="data_manager_mlst" name="mlst Data Manager" tool_type="manage_data" version="0.0.1" profile="20.01">
    <requirements>
        <requirement type="package">blast</requirement>
        <requirement type="package">mlst</requirement>
        <!-- The script is run with "python" and imports "requests". -->
        <requirement type="package">python</requirement>
        <requirement type="package">requests</requirement>
    </requirements>
    <!-- Paths are quoted so directories containing spaces do not split the
         command line. -->
    <command detect_errors="exit_code"><![CDATA[
        python '$__tool_directory__/data_manager_mlst.py' --out '${output_file}'
    ]]></command>
    <inputs>
    </inputs>
    <outputs>
        <data name="output_file" format="data_manager_json"/>
    </outputs>
    <tests>
        <test>
        </test>
    </tests>
    <help>
Builds an MLST database for the mlst tool.

The data manager copies the PubMLST scheme directory, concatenates the
allele (.tfa) files of every scheme into a single FASTA file, creates a
nucleotide BLAST database from it with makeblastdb, downloads
scheme_species_map.tab from the tseemann/mlst repository and registers the
result as a new entry in the "mlst" data table.
    </help>
    <citations>
        <citation type="bibtex">
            @UNPUBLISHED{Seemann2016,
                author = "Seemann T",
                title = "MLST: Scan contig files against PubMLST typing schemes",
                year = "2016",
                url = {https://github.com/tseemann/mlst}
            }
        </citation>
    </citations>
</tool>
<?xml version="1.0"?>
<!-- File: data_manager_mlst/data_manager_conf.xml -->
<!-- Connects the data_manager_mlst tool to the "mlst" data table. -->
<data_managers>
    <data_manager tool_file="data_manager/data_manager_mlst.xml" id="data_manager_mlst">
        <data_table name="mlst">
            <output>
                <column name="value"/>
                <column name="name"/>
                <!-- "path" is taken from the tool's output_file; the directory
                     it names is moved to mlst/${value} under the data manager
                     data path and the stored value is rewritten to that
                     location as an absolute path. -->
                <column name="path" output_ref="output_file">
                    <move type="directory" relativize_symlinks="True">
                        <src>${path}</src>
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">mlst/${value}</target>
                    </move>
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/mlst/${value}</value_translation>
                    <value_translation type="function">abspath</value_translation>
                </column>
            </output>
        </data_table>
    </data_manager>
</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_mlst/test-data/mlst.loc Thu Jan 18 02:22:30 2024 +0000 @@ -0,0 +1,6 @@ +# this is a tab separated file describing the location of mlst databases +# +# the columns are: +# value name path +# +# for example
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_mlst/tool-data/mlst.loc Thu Jan 18 02:22:30 2024 +0000 @@ -0,0 +1,6 @@ +# this is a tab separated file describing the location of mlst databases +# +# the columns are: +# value name path +# +# for example
<!-- File: data_manager_mlst/tool_data_table_conf.xml.sample -->
<!-- Declares the "mlst" data table (columns: value, name, path) backed by
     tool-data/mlst.loc. -->
<tables>
    <table name="mlst" comment_char="#" allow_duplicate_entries="False">
        <columns>value, name, path</columns>
        <file path="tool-data/mlst.loc"/>
    </table>
</tables>
<!-- File: data_manager_mlst/tool_data_table_conf.xml.test -->
<!-- Test variant of the "mlst" data table: same columns, but the .loc file
     is read from the test-data directory next to this file. -->
<tables>
    <table name="mlst" comment_char="#" allow_duplicate_entries="False">
        <columns>value,name, path</columns>
        <file path="${__HERE__}/test-data/mlst.loc"/>
    </table>
</tables>