Mercurial > repos > iuc > vsnp_build_tables
changeset 3:85587c8eb25f draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 524a39e08f2bea8b8754284df606ff8dd27ed24b"
author | iuc |
---|---|
date | Wed, 02 Dec 2020 09:10:07 +0000 |
parents | a52b819aa990 |
children | efb86aade548 |
files | test-data/Mcap_Deer_DE_SRR650221.fastq.gz test-data/NC_002945v4.yml test-data/forward.fastq.gz test-data/output_dbkey.txt test-data/output_metrics.txt test-data/paired_dbkey.txt test-data/paired_metrics.txt test-data/reverse.fastq.gz test-data/vsnp_dnaprints.loc tool-data/vsnp_dnaprints.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test vsnp_determine_ref_from_data.py |
diffstat | 13 files changed, 272 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/NC_002945v4.yml Wed Dec 02 09:10:07 2020 +0000 @@ -0,0 +1,5 @@ +bovis: + - '11001110' + - '11011110' + - '11001100' +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_dbkey.txt Wed Dec 02 09:10:07 2020 +0000 @@ -0,0 +1,1 @@ +AF2122 \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_metrics.txt Wed Dec 02 09:10:07 2020 +0000 @@ -0,0 +1,6 @@ +Sample: Mcap_Deer_DE_SRR650221 +Brucella counts: 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +TB counts: 2,2,0,0,4,5,0,0, +Para counts: 0,0,0, +Group: TB +dbkey: AF2122
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_dbkey.txt Wed Dec 02 09:10:07 2020 +0000 @@ -0,0 +1,1 @@ +AF2122 \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_metrics.txt Wed Dec 02 09:10:07 2020 +0000 @@ -0,0 +1,6 @@ +Sample: forward +Brucella counts: 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +TB counts: 4,4,0,0,8,10,0,0, +Para counts: 0,0,0, +Group: TB +dbkey: AF2122
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vsnp_dnaprints.loc Wed Dec 02 09:10:07 2020 +0000 @@ -0,0 +1,4 @@ +## vSNP DNAprints files +#Value Name Path Description +AF2122 Mycobacterium_AF2122/NC_002945v4.yml ${__HERE__}/NC_002945v4.yml DNAprints file for Mycobacterium bovis AF2122/97 +#NC_006932 Brucella_abortus1/NC_006932-NC_006933.yml /vsnp/NC_006932/Brucella_abortus1/NC_006932-NC_006933.yml DNAprints file for Brucella abortus bv. 1 str. 9-941
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/vsnp_dnaprints.loc.sample Wed Dec 02 09:10:07 2020 +0000 @@ -0,0 +1,4 @@ +## vSNP DNAprints files +#Value Name Path Description +#AF2122 Mycobacterium_AF2122/NC_002945v4.yml /vsnp/AF2122/Mycobacterium_AF2122/NC_002945v4.yml DNAprints file for Mycobacterium bovis AF2122/97 +#NC_006932 Brucella_abortus1/NC_006932-NC_006933.yml /vsnp/NC_006932/Brucella_abortus1/NC_006932-NC_006933.yml DNAprints file for Brucella abortus bv. 1 str. 9-941
--- a/tool_data_table_conf.xml.sample Wed Sep 30 19:07:23 2020 +0000 +++ b/tool_data_table_conf.xml.sample Wed Dec 02 09:10:07 2020 +0000 @@ -9,5 +9,10 @@ <columns>value, name, path, description</columns> <file path="tool-data/vsnp_genbank.loc" /> </table> + <!-- Location of dnaprints files for vsnp_dtermine_ref_from_data tool --> + <table name="vsnp_dnaprints" comment_char="#"> + <columns>value, name, path, description</columns> + <file path="tool-data/vsnp_dnaprints.loc" /> + </table> </tables>
--- a/tool_data_table_conf.xml.test Wed Sep 30 19:07:23 2020 +0000 +++ b/tool_data_table_conf.xml.test Wed Dec 02 09:10:07 2020 +0000 @@ -3,4 +3,13 @@ <columns>value, dbkey, name, path</columns> <file path="${__HERE__}/test-data/fasta_indexes.loc" /> </table> + <!-- Location of genbank files for vsnp_build_tables tool --> + <table name="vsnp_genbank" comment_char="#"> + <columns>value, name, path, description</columns> + <file path="${__HERE__}/test-data/vsnp_genbank.loc" /> + </table> + <table name="vsnp_dnaprints" comment_char="#"> + <columns>value, name, path, description</columns> + <file path="${__HERE__}/test-data/vsnp_dnaprints.loc" /> + </table> </tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vsnp_determine_ref_from_data.py Wed Dec 02 09:10:07 2020 +0000 @@ -0,0 +1,231 @@ +#!/usr/bin/env python + +import argparse +import gzip +import os +from collections import OrderedDict + +import yaml +from Bio.SeqIO.QualityIO import FastqGeneralIterator + +OUTPUT_DBKEY_DIR = 'output_dbkey' +OUTPUT_METRICS_DIR = 'output_metrics' + + +def get_base_file_name(file_path): + base_file_name = os.path.basename(file_path) + if base_file_name.find(".") > 0: + # Eliminate the extension. + return os.path.splitext(base_file_name)[0] + elif base_file_name.find("_fq") > 0: + # The "." character has likely + # changed to an "_" character. + return base_file_name.split("_fq")[0] + elif base_file_name.find("_fastq") > 0: + return base_file_name.split("_fastq")[0] + return base_file_name + + +def get_dbkey(dnaprints_dict, key, s): + # dnaprints_dict looks something like this: + # {'brucella': {'NC_002945v4': ['11001110', '11011110', '11001100']} + # {'bovis': {'NC_006895': ['11111110', '00010010', '01111011']}} + d = dnaprints_dict.get(key, {}) + for data_table_value, v_list in d.items(): + if s in v_list: + return data_table_value + return "" + + +def get_dnaprints_dict(dnaprint_fields): + # A dndprint_fields entry looks something liek this. + # [['AF2122', '/galaxy/tool-data/vsnp/AF2122/dnaprints/NC_002945v4.yml']] + dnaprints_dict = {} + for item in dnaprint_fields: + # Here item is a 2-element list of data + # table components, # value and path. + value = item[0] + path = item[1].strip() + with open(path, "rt") as fh: + # The format of all dnaprints yaml + # files is something like this: + # brucella: + # - 0111111111111111 + print_dict = yaml.load(fh, Loader=yaml.Loader) + for print_dict_k, print_dict_v in print_dict.items(): + dnaprints_v_dict = dnaprints_dict.get(print_dict_k, {}) + if len(dnaprints_v_dict) > 0: + # dnaprints_dict already contains k (e.g., 'brucella', + # and dnaprints_v_dict will be a dictionary # that + # looks something like this: + # {'NC_002945v4': ['11001110', '11011110', '11001100']} + value_list = dnaprints_v_dict.get(value, []) + value_list = value_list + print_dict_v + dnaprints_v_dict[value] = value_list + else: + # dnaprints_v_dict is an empty dictionary. + dnaprints_v_dict[value] = print_dict_v + dnaprints_dict[print_dict_k] = dnaprints_v_dict + # dnaprints_dict looks something like this: + # {'brucella': {'NC_002945v4': ['11001110', '11011110', '11001100']} + # {'bovis': {'NC_006895': ['11111110', '00010010', '01111011']}} + return dnaprints_dict + + +def get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum): + if brucella_sum > 3: + group = "Brucella" + dbkey = get_dbkey(dnaprints_dict, "brucella", brucella_string) + elif bovis_sum > 3: + group = "TB" + dbkey = get_dbkey(dnaprints_dict, "bovis", bovis_string) + elif para_sum >= 1: + group = "paraTB" + dbkey = get_dbkey(dnaprints_dict, "para", para_string) + else: + group = "" + dbkey = "" + return group, dbkey + + +def get_oligo_dict(): + oligo_dict = {} + oligo_dict["01_ab1"] = "AATTGTCGGATAGCCTGGCGATAACGACGC" + oligo_dict["02_ab3"] = "CACACGCGGGCCGGAACTGCCGCAAATGAC" + oligo_dict["03_ab5"] = "GCTGAAGCGGCAGACCGGCAGAACGAATAT" + oligo_dict["04_mel"] = "TGTCGCGCGTCAAGCGGCGTGAAATCTCTG" + oligo_dict["05_suis1"] = "TGCGTTGCCGTGAAGCTTAATTCGGCTGAT" + oligo_dict["06_suis2"] = "GGCAATCATGCGCAGGGCTTTGCATTCGTC" + oligo_dict["07_suis3"] = "CAAGGCAGATGCACATAATCCGGCGACCCG" + oligo_dict["08_ceti1"] = "GTGAATATAGGGTGAATTGATCTTCAGCCG" + oligo_dict["09_ceti2"] = "TTACAAGCAGGCCTATGAGCGCGGCGTGAA" + oligo_dict["10_canis4"] = "CTGCTACATAAAGCACCCGGCGACCGAGTT" + oligo_dict["11_canis"] = "ATCGTTTTGCGGCATATCGCTGACCACAGC" + oligo_dict["12_ovis"] = "CACTCAATCTTCTCTACGGGCGTGGTATCC" + oligo_dict["13_ether2"] = "CGAAATCGTGGTGAAGGACGGGACCGAACC" + oligo_dict["14_63B1"] = "CCTGTTTAAAAGAATCGTCGGAACCGCTCT" + oligo_dict["15_16M0"] = "TCCCGCCGCCATGCCGCCGAAAGTCGCCGT" + oligo_dict["16_mel1b"] = "TCTGTCCAAACCCCGTGACCGAACAATAGA" + oligo_dict["17_tb157"] = "CTCTTCGTATACCGTTCCGTCGTCACCATGGTCCT" + oligo_dict["18_tb7"] = "TCACGCAGCCAACGATATTCGTGTACCGCGACGGT" + oligo_dict["19_tbbov"] = "CTGGGCGACCCGGCCGACCTGCACACCGCGCATCA" + oligo_dict["20_tb5"] = "CCGTGGTGGCGTATCGGGCCCCTGGATCGCGCCCT" + oligo_dict["21_tb2"] = "ATGTCTGCGTAAAGAAGTTCCATGTCCGGGAAGTA" + oligo_dict["22_tb3"] = "GAAGACCTTGATGCCGATCTGGGTGTCGATCTTGA" + oligo_dict["23_tb4"] = "CGGTGTTGAAGGGTCCCCCGTTCCAGAAGCCGGTG" + oligo_dict["24_tb6"] = "ACGGTGATTCGGGTGGTCGACACCGATGGTTCAGA" + oligo_dict["25_para"] = "CCTTTCTTGAAGGGTGTTCG" + oligo_dict["26_para_sheep"] = "CGTGGTGGCGACGGCGGCGGGCCTGTCTAT" + oligo_dict["27_para_cattle"] = "TCTCCTCGGTCGGTGATTCGGGGGCGCGGT" + return oligo_dict + + +def get_seq_counts(value, fastq_list, gzipped): + count = 0 + for fastq_file in fastq_list: + if gzipped: + with gzip.open(fastq_file, 'rt') as fh: + for title, seq, qual in FastqGeneralIterator(fh): + count += seq.count(value) + else: + with open(fastq_file, 'r') as fh: + for title, seq, qual in FastqGeneralIterator(fh): + count += seq.count(value) + return(value, count) + + +def get_species_counts(fastq_list, gzipped): + count_summary = {} + oligo_dict = get_oligo_dict() + for v1 in oligo_dict.values(): + returned_value, count = get_seq_counts(v1, fastq_list, gzipped) + for key, v2 in oligo_dict.items(): + if returned_value == v2: + count_summary.update({key: count}) + count_list = [] + for v in count_summary.values(): + count_list.append(v) + brucella_sum = sum(count_list[:16]) + bovis_sum = sum(count_list[16:24]) + para_sum = sum(count_list[24:]) + return count_summary, count_list, brucella_sum, bovis_sum, para_sum + + +def get_species_strings(count_summary): + binary_dictionary = {} + for k, v in count_summary.items(): + if v > 1: + binary_dictionary.update({k: 1}) + else: + binary_dictionary.update({k: 0}) + binary_dictionary = OrderedDict(sorted(binary_dictionary.items())) + binary_list = [] + for v in binary_dictionary.values(): + binary_list.append(v) + brucella_binary = binary_list[:16] + brucella_string = ''.join(str(e) for e in brucella_binary) + bovis_binary = binary_list[16:24] + bovis_string = ''.join(str(e) for e in bovis_binary) + para_binary = binary_list[24:] + para_string = ''.join(str(e) for e in para_binary) + return brucella_string, bovis_string, para_string + + +def output_dbkey(file_name, dbkey, output_file): + # Output the dbkey. + with open(output_file, "w") as fh: + fh.write("%s" % dbkey) + + +def output_files(fastq_file, count_list, group, dbkey, dbkey_file, metrics_file): + base_file_name = get_base_file_name(fastq_file) + output_dbkey(base_file_name, dbkey, dbkey_file) + output_metrics(base_file_name, count_list, group, dbkey, metrics_file) + + +def output_metrics(file_name, count_list, group, dbkey, output_file): + # Output the metrics. + with open(output_file, "w") as fh: + fh.write("Sample: %s\n" % file_name) + fh.write("Brucella counts: ") + for i in count_list[:16]: + fh.write("%d," % i) + fh.write("\nTB counts: ") + for i in count_list[16:24]: + fh.write("%d," % i) + fh.write("\nPara counts: ") + for i in count_list[24:]: + fh.write("%d," % i) + fh.write("\nGroup: %s" % group) + fh.write("\ndbkey: %s\n" % dbkey) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument('--dnaprint_fields', action='append', dest='dnaprint_fields', nargs=2, required=False, default=None, help="List of dnaprints data table value, name and path fields") + parser.add_argument('--read1', action='store', dest='read1', required=True, default=None, help='Required: single read') + parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read') + parser.add_argument('--gzipped', action='store_true', dest='gzipped', default=False, help='Input files are gzipped') + parser.add_argument('--output_dbkey', action='store', dest='output_dbkey', required=True, default=None, help='Output reference file') + parser.add_argument('--output_metrics', action='store', dest='output_metrics', required=True, default=None, help='Output metrics file') + + args = parser.parse_args() + + fastq_list = [args.read1] + if args.read2 is not None: + fastq_list.append(args.read2) + + # The value of dnaprint_fields is a list of lists, where each list is + # the [value, name, path] components of the vsnp_dnaprints data table. + # The data_manager_vsnp_dnaprints tool assigns the dbkey column from the + # all_fasta data table to the value column in the vsnp_dnaprints data + # table to ensure a proper mapping for discovering the dbkey. + dnaprints_dict = get_dnaprints_dict(args.dnaprint_fields) + + # Here fastq_list consists of either a single read + # or a set of paired reads, producing single outputs. + count_summary, count_list, brucella_sum, bovis_sum, para_sum = get_species_counts(fastq_list, args.gzipped) + brucella_string, bovis_string, para_string = get_species_strings(count_summary) + group, dbkey = get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum) + output_files(args.read1, count_list, group, dbkey, dbkey_file=args.output_dbkey, metrics_file=args.output_metrics)