# HG changeset patch # User iuc # Date 1607613953 0 # Node ID efb86aade54838d71a304a79a3a39f76b4d90e72 # Parent 85587c8eb25fda5c12552882beb26b958783ff3a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 2e312886647244b416c64eca91e1a61dd1be939b" diff -r 85587c8eb25f -r efb86aade548 macros.xml --- a/macros.xml Wed Dec 02 09:10:07 2020 +0000 +++ b/macros.xml Thu Dec 10 15:25:53 2020 +0000 @@ -2,12 +2,6 @@ 1.0 19.09 - - - - - - diff -r 85587c8eb25f -r efb86aade548 test-data/CMC_20E1_R1.fastq.gz Binary file test-data/CMC_20E1_R1.fastq.gz has changed diff -r 85587c8eb25f -r efb86aade548 test-data/CMC_20E1_R2.fastq.gz Binary file test-data/CMC_20E1_R2.fastq.gz has changed diff -r 85587c8eb25f -r efb86aade548 test-data/forward.fastq.gz Binary file test-data/forward.fastq.gz has changed diff -r 85587c8eb25f -r efb86aade548 test-data/output_metrics.tabular --- a/test-data/output_metrics.tabular Wed Dec 02 09:10:07 2020 +0000 +++ b/test-data/output_metrics.tabular Thu Dec 10 15:25:53 2020 +0000 @@ -1,2 +1,3 @@ # File Number of Good SNPs Average Coverage Genome Coverage - 0 +vcf_input_vcf 0.659602 50.49% +vcf_input_vcf 0 diff -r 85587c8eb25f -r efb86aade548 test-data/output_metrics.txt --- a/test-data/output_metrics.txt Wed Dec 02 09:10:07 2020 +0000 +++ b/test-data/output_metrics.txt Thu Dec 10 15:25:53 2020 +0000 @@ -1,4 +1,4 @@ -Sample: Mcap_Deer_DE_SRR650221 +Sample: Mcap_Deer_DE_SRR650221_fastq_gz Brucella counts: 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, TB counts: 2,2,0,0,4,5,0,0, Para counts: 0,0,0, diff -r 85587c8eb25f -r efb86aade548 test-data/paired_collection_metrics.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/paired_collection_metrics.txt Thu Dec 10 15:25:53 2020 +0000 @@ -0,0 +1,6 @@ +Sample: CMC_20E1_R1_fastq_gz +Brucella counts: 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +TB counts: 4,4,0,0,8,10,0,0, +Para counts: 0,0,0, +Group: TB +dbkey: AF2122 diff -r 85587c8eb25f -r efb86aade548 test-data/paired_metrics.txt --- a/test-data/paired_metrics.txt Wed Dec 02 09:10:07 2020 +0000 +++ b/test-data/paired_metrics.txt Thu Dec 10 15:25:53 2020 +0000 @@ -1,4 +1,4 @@ -Sample: forward +Sample: CMC_20E1_R1_fastq_gz Brucella counts: 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, TB counts: 4,4,0,0,8,10,0,0, Para counts: 0,0,0, diff -r 85587c8eb25f -r efb86aade548 test-data/reverse.fastq.gz Binary file test-data/reverse.fastq.gz has changed diff -r 85587c8eb25f -r efb86aade548 vsnp_add_zero_coverage.py --- a/vsnp_add_zero_coverage.py Wed Dec 02 09:10:07 2020 +0000 +++ b/vsnp_add_zero_coverage.py Thu Dec 10 15:25:53 2020 +0000 @@ -1,9 +1,7 @@ #!/usr/bin/env python import argparse -import multiprocessing import os -import queue import re import shutil @@ -11,179 +9,124 @@ import pysam from Bio import SeqIO -INPUT_BAM_DIR = 'input_bam_dir' -INPUT_VCF_DIR = 'input_vcf_dir' -OUTPUT_VCF_DIR = 'output_vcf_dir' -OUTPUT_METRICS_DIR = 'output_metrics_dir' - -def get_base_file_name(file_path): +def get_sample_name(file_path): base_file_name = os.path.basename(file_path) if base_file_name.find(".") > 0: # Eliminate the extension. return os.path.splitext(base_file_name)[0] - elif base_file_name.endswith("_vcf"): - # The "." character has likely - # changed to an "_" character. - return base_file_name.rstrip("_vcf") return base_file_name -def get_coverage_and_snp_count(task_queue, reference, output_metrics, output_vcf, timeout): - while True: - try: - tup = task_queue.get(block=True, timeout=timeout) - except queue.Empty: - break - bam_file, vcf_file = tup - # Create a coverage dictionary. - coverage_dict = {} - coverage_list = pysam.depth(bam_file, split_lines=True) - for line in coverage_list: - chrom, position, depth = line.split('\t') - coverage_dict["%s-%s" % (chrom, position)] = depth - # Convert it to a data frame. - coverage_df = pandas.DataFrame.from_dict(coverage_dict, orient='index', columns=["depth"]) - # Create a zero coverage dictionary. - zero_dict = {} - for record in SeqIO.parse(reference, "fasta"): - chrom = record.id - total_len = len(record.seq) - for pos in list(range(1, total_len + 1)): - zero_dict["%s-%s" % (str(chrom), str(pos))] = 0 - # Convert it to a data frame with depth_x - # and depth_y columns - index is NaN. - zero_df = pandas.DataFrame.from_dict(zero_dict, orient='index', columns=["depth"]) - coverage_df = zero_df.merge(coverage_df, left_index=True, right_index=True, how='outer') - # depth_x "0" column no longer needed. - coverage_df = coverage_df.drop(columns=['depth_x']) - coverage_df = coverage_df.rename(columns={'depth_y': 'depth'}) - # Covert the NaN to 0 coverage and get some metrics. - coverage_df = coverage_df.fillna(0) - coverage_df['depth'] = coverage_df['depth'].apply(int) - total_length = len(coverage_df) - average_coverage = coverage_df['depth'].mean() - zero_df = coverage_df[coverage_df['depth'] == 0] - total_zero_coverage = len(zero_df) - total_coverage = total_length - total_zero_coverage - genome_coverage = "{:.2%}".format(total_coverage / total_length) - # Process the associated VCF input. - column_names = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "Sample"] - vcf_df = pandas.read_csv(vcf_file, sep='\t', header=None, names=column_names, comment='#') - good_snp_count = len(vcf_df[(vcf_df['ALT'].str.len() == 1) & (vcf_df['REF'].str.len() == 1) & (vcf_df['QUAL'] > 150)]) - base_file_name = get_base_file_name(vcf_file) - if total_zero_coverage > 0: - header_file = "%s_header.csv" % base_file_name - with open(header_file, 'w') as outfile: - with open(vcf_file) as infile: - for line in infile: - if re.search('^#', line): - outfile.write("%s" % line) - vcf_df_snp = vcf_df[vcf_df['REF'].str.len() == 1] - vcf_df_snp = vcf_df_snp[vcf_df_snp['ALT'].str.len() == 1] - vcf_df_snp['ABS_VALUE'] = vcf_df_snp['CHROM'].map(str) + "-" + vcf_df_snp['POS'].map(str) - vcf_df_snp = vcf_df_snp.set_index('ABS_VALUE') - cat_df = pandas.concat([vcf_df_snp, zero_df], axis=1, sort=False) - cat_df = cat_df.drop(columns=['CHROM', 'POS', 'depth']) - cat_df[['ID', 'ALT', 'QUAL', 'FILTER', 'INFO']] = cat_df[['ID', 'ALT', 'QUAL', 'FILTER', 'INFO']].fillna('.') - cat_df['REF'] = cat_df['REF'].fillna('N') - cat_df['FORMAT'] = cat_df['FORMAT'].fillna('GT') - cat_df['Sample'] = cat_df['Sample'].fillna('./.') - cat_df['temp'] = cat_df.index.str.rsplit('-', n=1) - cat_df[['CHROM', 'POS']] = pandas.DataFrame(cat_df.temp.values.tolist(), index=cat_df.index) - cat_df = cat_df[['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'Sample']] - cat_df['POS'] = cat_df['POS'].astype(int) - cat_df = cat_df.sort_values(['CHROM', 'POS']) - body_file = "%s_body.csv" % base_file_name - cat_df.to_csv(body_file, sep='\t', header=False, index=False) - if output_vcf is None: - output_vcf_file = os.path.join(OUTPUT_VCF_DIR, "%s.vcf" % base_file_name) - else: - output_vcf_file = output_vcf - with open(output_vcf_file, "w") as outfile: - for cf in [header_file, body_file]: - with open(cf, "r") as infile: - for line in infile: - outfile.write("%s" % line) - else: - if output_vcf is None: - output_vcf_file = os.path.join(OUTPUT_VCF_DIR, "%s.vcf" % base_file_name) - else: - output_vcf_file = output_vcf - shutil.copyfile(vcf_file, output_vcf_file) - bam_metrics = [base_file_name, "", "%4f" % average_coverage, genome_coverage] - vcf_metrics = [base_file_name, str(good_snp_count), "", ""] - if output_metrics is None: - output_metrics_file = os.path.join(OUTPUT_METRICS_DIR, "%s.tabular" % base_file_name) - else: - output_metrics_file = output_metrics - metrics_columns = ["File", "Number of Good SNPs", "Average Coverage", "Genome Coverage"] - with open(output_metrics_file, "w") as fh: - fh.write("# %s\n" % "\t".join(metrics_columns)) - fh.write("%s\n" % "\t".join(bam_metrics)) - fh.write("%s\n" % "\t".join(vcf_metrics)) - task_queue.task_done() +def get_coverage_df(bam_file): + # Create a coverage dictionary. + coverage_dict = {} + coverage_list = pysam.depth(bam_file, split_lines=True) + for line in coverage_list: + chrom, position, depth = line.split('\t') + coverage_dict["%s-%s" % (chrom, position)] = depth + # Convert it to a data frame. + coverage_df = pandas.DataFrame.from_dict(coverage_dict, orient='index', columns=["depth"]) + return coverage_df + + +def get_zero_df(reference): + # Create a zero coverage dictionary. + zero_dict = {} + for record in SeqIO.parse(reference, "fasta"): + chrom = record.id + total_len = len(record.seq) + for pos in list(range(1, total_len + 1)): + zero_dict["%s-%s" % (str(chrom), str(pos))] = 0 + # Convert it to a data frame with depth_x + # and depth_y columns - index is NaN. + zero_df = pandas.DataFrame.from_dict(zero_dict, orient='index', columns=["depth"]) + return zero_df -def set_num_cpus(num_files, processes): - num_cpus = int(multiprocessing.cpu_count()) - if num_files < num_cpus and num_files < processes: - return num_files - if num_cpus < processes: - half_cpus = int(num_cpus / 2) - if num_files < half_cpus: - return num_files - return half_cpus - return processes +def output_zc_vcf_file(base_file_name, vcf_file, zero_df, total_zero_coverage, output_vcf): + column_names = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "Sample"] + vcf_df = pandas.read_csv(vcf_file, sep='\t', header=None, names=column_names, comment='#') + good_snp_count = len(vcf_df[(vcf_df['ALT'].str.len() == 1) & (vcf_df['REF'].str.len() == 1) & (vcf_df['QUAL'] > 150)]) + if total_zero_coverage > 0: + header_file = "%s_header.csv" % base_file_name + with open(header_file, 'w') as outfile: + with open(vcf_file) as infile: + for line in infile: + if re.search('^#', line): + outfile.write("%s" % line) + vcf_df_snp = vcf_df[vcf_df['REF'].str.len() == 1] + vcf_df_snp = vcf_df_snp[vcf_df_snp['ALT'].str.len() == 1] + vcf_df_snp['ABS_VALUE'] = vcf_df_snp['CHROM'].map(str) + "-" + vcf_df_snp['POS'].map(str) + vcf_df_snp = vcf_df_snp.set_index('ABS_VALUE') + cat_df = pandas.concat([vcf_df_snp, zero_df], axis=1, sort=False) + cat_df = cat_df.drop(columns=['CHROM', 'POS', 'depth']) + cat_df[['ID', 'ALT', 'QUAL', 'FILTER', 'INFO']] = cat_df[['ID', 'ALT', 'QUAL', 'FILTER', 'INFO']].fillna('.') + cat_df['REF'] = cat_df['REF'].fillna('N') + cat_df['FORMAT'] = cat_df['FORMAT'].fillna('GT') + cat_df['Sample'] = cat_df['Sample'].fillna('./.') + cat_df['temp'] = cat_df.index.str.rsplit('-', n=1) + cat_df[['CHROM', 'POS']] = pandas.DataFrame(cat_df.temp.values.tolist(), index=cat_df.index) + cat_df = cat_df[['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'Sample']] + cat_df['POS'] = cat_df['POS'].astype(int) + cat_df = cat_df.sort_values(['CHROM', 'POS']) + body_file = "%s_body.csv" % base_file_name + cat_df.to_csv(body_file, sep='\t', header=False, index=False) + with open(output_vcf, "w") as outfile: + for cf in [header_file, body_file]: + with open(cf, "r") as infile: + for line in infile: + outfile.write("%s" % line) + else: + shutil.move(vcf_file, output_vcf) + return good_snp_count + + +def output_metrics_file(base_file_name, average_coverage, genome_coverage, good_snp_count, output_metrics): + bam_metrics = [base_file_name, "", "%4f" % average_coverage, genome_coverage] + vcf_metrics = [base_file_name, str(good_snp_count), "", ""] + metrics_columns = ["File", "Number of Good SNPs", "Average Coverage", "Genome Coverage"] + with open(output_metrics, "w") as fh: + fh.write("# %s\n" % "\t".join(metrics_columns)) + fh.write("%s\n" % "\t".join(bam_metrics)) + fh.write("%s\n" % "\t".join(vcf_metrics)) + + +def output_files(vcf_file, total_zero_coverage, zero_df, output_vcf, average_coverage, genome_coverage, output_metrics): + base_file_name = get_sample_name(vcf_file) + good_snp_count = output_zc_vcf_file(base_file_name, vcf_file, zero_df, total_zero_coverage, output_vcf) + output_metrics_file(base_file_name, average_coverage, genome_coverage, good_snp_count, output_metrics) + + +def get_coverage_and_snp_count(bam_file, vcf_file, reference, output_metrics, output_vcf): + coverage_df = get_coverage_df(bam_file) + zero_df = get_zero_df(reference) + coverage_df = zero_df.merge(coverage_df, left_index=True, right_index=True, how='outer') + # depth_x "0" column no longer needed. + coverage_df = coverage_df.drop(columns=['depth_x']) + coverage_df = coverage_df.rename(columns={'depth_y': 'depth'}) + # Covert the NaN to 0 coverage and get some metrics. + coverage_df = coverage_df.fillna(0) + coverage_df['depth'] = coverage_df['depth'].apply(int) + total_length = len(coverage_df) + average_coverage = coverage_df['depth'].mean() + zero_df = coverage_df[coverage_df['depth'] == 0] + total_zero_coverage = len(zero_df) + total_coverage = total_length - total_zero_coverage + genome_coverage = "{:.2%}".format(total_coverage / total_length) + # Output a zero-coverage vcf fil and the metrics file. + output_files(vcf_file, total_zero_coverage, zero_df, output_vcf, average_coverage, genome_coverage, output_metrics) if __name__ == '__main__': parser = argparse.ArgumentParser() + parser.add_argument('--bam_input', action='store', dest='bam_input', help='bam input file') parser.add_argument('--output_metrics', action='store', dest='output_metrics', required=False, default=None, help='Output metrics text file') parser.add_argument('--output_vcf', action='store', dest='output_vcf', required=False, default=None, help='Output VCF file') parser.add_argument('--reference', action='store', dest='reference', help='Reference dataset') - parser.add_argument('--processes', action='store', dest='processes', type=int, help='User-selected number of processes to use for job splitting') + parser.add_argument('--vcf_input', action='store', dest='vcf_input', help='vcf input file') args = parser.parse_args() - # The assumption here is that the list of files - # in both INPUT_BAM_DIR and INPUT_VCF_DIR are - # equal in number and named such that they are - # properly matched if the directories contain - # more than 1 file (i.e., hopefully the bam file - # names and vcf file names will be something like - # Mbovis-01D6_* so they can be # sorted and properly - # associated with each other). - bam_files = [] - for file_name in sorted(os.listdir(INPUT_BAM_DIR)): - file_path = os.path.abspath(os.path.join(INPUT_BAM_DIR, file_name)) - bam_files.append(file_path) - vcf_files = [] - for file_name in sorted(os.listdir(INPUT_VCF_DIR)): - file_path = os.path.abspath(os.path.join(INPUT_VCF_DIR, file_name)) - vcf_files.append(file_path) - - multiprocessing.set_start_method('spawn') - queue1 = multiprocessing.JoinableQueue() - num_files = len(bam_files) - cpus = set_num_cpus(num_files, args.processes) - # Set a timeout for get()s in the queue. - timeout = 0.05 - - # Add each associated bam and vcf file pair to the queue. - for i, bam_file in enumerate(bam_files): - vcf_file = vcf_files[i] - queue1.put((bam_file, vcf_file)) - - # Complete the get_coverage_and_snp_count task. - processes = [multiprocessing.Process(target=get_coverage_and_snp_count, args=(queue1, args.reference, args.output_metrics, args.output_vcf, timeout, )) for _ in range(cpus)] - for p in processes: - p.start() - for p in processes: - p.join() - queue1.join() - - if queue1.empty(): - queue1.close() - queue1.join_thread() + get_coverage_and_snp_count(args.bam_input, args.vcf_input, args.reference, args.output_metrics, args.output_vcf) diff -r 85587c8eb25f -r efb86aade548 vsnp_build_tables.py --- a/vsnp_build_tables.py Wed Dec 02 09:10:07 2020 +0000 +++ b/vsnp_build_tables.py Thu Dec 10 15:25:53 2020 +0000 @@ -1,18 +1,13 @@ #!/usr/bin/env python import argparse -import multiprocessing import os -import queue import re import pandas import pandas.io.formats.excel from Bio import SeqIO -INPUT_JSON_AVG_MQ_DIR = 'input_json_avg_mq_dir' -INPUT_JSON_DIR = 'input_json_dir' -INPUT_NEWICK_DIR = 'input_newick_dir' # Maximum columns allowed in a LibreOffice # spreadsheet is 1024. Excel allows for # 16,384 columns, but we'll set the lower @@ -145,18 +140,12 @@ return annotation_dict -def get_base_file_name(file_path): +def get_sample_name(file_path): base_file_name = os.path.basename(file_path) if base_file_name.find(".") > 0: # Eliminate the extension. return os.path.splitext(base_file_name)[0] - elif base_file_name.find("_") > 0: - # The dot extension was likely changed to - # the " character. - items = base_file_name.split("_") - return "_".join(items[0:-1]) - else: - return base_file_name + return base_file_name def output_cascade_table(cascade_order, mqdf, group, annotation_dict): @@ -169,17 +158,20 @@ # is used by the excel_formatter. if count is None: if group is None: - json_file_name = "%s_order_mq.json" % type_str + json_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_order_mq.json" % type_str) excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_table.xlsx" % type_str) else: - json_file_name = "%s_%s_order_mq.json" % (group, type_str) + json_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_%s_order_mq.json" % (group, type_str)) excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_%s_table.xlsx" % (group, type_str)) else: + # The table has more columns than is allowed by the + # MAXCOLS setting, so multiple files will be produced + # as an output collection. if group is None: - json_file_name = "%s_order_mq_%d.json" % (type_str, count) + json_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_order_mq_%d.json" % (type_str, count)) excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_table_%d.xlsx" % (type_str, count)) else: - json_file_name = "%s_%s_order_mq_%d.json" % (group, type_str, count) + json_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_%s_order_mq_%d.json" % (group, type_str, count)) excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_%s_table_%d.xlsx" % (group, type_str, count)) df.to_json(json_file_name, orient='split') # Output the Excel file. @@ -229,94 +221,74 @@ output_excel(df, type_str, group_str, annotation_dict) -def preprocess_tables(task_queue, annotation_dict, timeout): - while True: - try: - tup = task_queue.get(block=True, timeout=timeout) - except queue.Empty: - break - newick_file, json_file, json_avg_mq_file = tup - avg_mq_series = pandas.read_json(json_avg_mq_file, typ='series', orient='split') - # Map quality to dataframe. - mqdf = avg_mq_series.to_frame(name='MQ') - mqdf = mqdf.T - # Get the group. - group = get_base_file_name(newick_file) - snps_df = pandas.read_json(json_file, orient='split') - with open(newick_file, 'r') as fh: - for line in fh: - line = re.sub('[:,]', '\n', line) - line = re.sub('[)(]', '', line) - line = re.sub(r'[0-9].*\.[0-9].*\n', '', line) - line = re.sub('root\n', '', line) - sample_order = line.split('\n') - sample_order = list([_f for _f in sample_order if _f]) - sample_order.insert(0, 'root') - tree_order = snps_df.loc[sample_order] - # Count number of SNPs in each column. - snp_per_column = [] - for column_header in tree_order: - count = 0 - column = tree_order[column_header] - for element in column: - if element != column[0]: - count = count + 1 - snp_per_column.append(count) - row1 = pandas.Series(snp_per_column, tree_order.columns, name="snp_per_column") - # Count number of SNPS from the - # top of each column in the table. - snp_from_top = [] - for column_header in tree_order: - count = 0 - column = tree_order[column_header] - # for each element in the column - # skip the first element - for element in column[1:]: - if element == column[0]: - count = count + 1 - else: - break - snp_from_top.append(count) - row2 = pandas.Series(snp_from_top, tree_order.columns, name="snp_from_top") - tree_order = tree_order.append([row1]) - tree_order = tree_order.append([row2]) - # In pandas=0.18.1 even this does not work: - # abc = row1.to_frame() - # abc = abc.T --> tree_order.shape (5, 18), abc.shape (1, 18) - # tree_order.append(abc) - # Continue to get error: "*** ValueError: all the input arrays must have same number of dimensions" - tree_order = tree_order.T - tree_order = tree_order.sort_values(['snp_from_top', 'snp_per_column'], ascending=[True, False]) - tree_order = tree_order.T - # Remove snp_per_column and snp_from_top rows. - cascade_order = tree_order[:-2] - # Output the cascade table. - output_cascade_table(cascade_order, mqdf, group, annotation_dict) - # Output the sorted table. - output_sort_table(cascade_order, mqdf, group, annotation_dict) - task_queue.task_done() - - -def set_num_cpus(num_files, processes): - num_cpus = int(multiprocessing.cpu_count()) - if num_files < num_cpus and num_files < processes: - return num_files - if num_cpus < processes: - half_cpus = int(num_cpus / 2) - if num_files < half_cpus: - return num_files - return half_cpus - return processes +def preprocess_tables(newick_file, json_file, json_avg_mq_file, annotation_dict): + avg_mq_series = pandas.read_json(json_avg_mq_file, typ='series', orient='split') + # Map quality to dataframe. + mqdf = avg_mq_series.to_frame(name='MQ') + mqdf = mqdf.T + # Get the group. + group = get_sample_name(newick_file) + snps_df = pandas.read_json(json_file, orient='split') + with open(newick_file, 'r') as fh: + for line in fh: + line = re.sub('[:,]', '\n', line) + line = re.sub('[)(]', '', line) + line = re.sub(r'[0-9].*\.[0-9].*\n', '', line) + line = re.sub('root\n', '', line) + sample_order = line.split('\n') + sample_order = list([_f for _f in sample_order if _f]) + sample_order.insert(0, 'root') + tree_order = snps_df.loc[sample_order] + # Count number of SNPs in each column. + snp_per_column = [] + for column_header in tree_order: + count = 0 + column = tree_order[column_header] + for element in column: + if element != column[0]: + count = count + 1 + snp_per_column.append(count) + row1 = pandas.Series(snp_per_column, tree_order.columns, name="snp_per_column") + # Count number of SNPS from the + # top of each column in the table. + snp_from_top = [] + for column_header in tree_order: + count = 0 + column = tree_order[column_header] + # for each element in the column + # skip the first element + for element in column[1:]: + if element == column[0]: + count = count + 1 + else: + break + snp_from_top.append(count) + row2 = pandas.Series(snp_from_top, tree_order.columns, name="snp_from_top") + tree_order = tree_order.append([row1]) + tree_order = tree_order.append([row2]) + # In pandas=0.18.1 even this does not work: + # abc = row1.to_frame() + # abc = abc.T --> tree_order.shape (5, 18), abc.shape (1, 18) + # tree_order.append(abc) + # Continue to get error: "*** ValueError: all the input arrays must have same number of dimensions" + tree_order = tree_order.T + tree_order = tree_order.sort_values(['snp_from_top', 'snp_per_column'], ascending=[True, False]) + tree_order = tree_order.T + # Remove snp_per_column and snp_from_top rows. + cascade_order = tree_order[:-2] + # Output the cascade table. + output_cascade_table(cascade_order, mqdf, group, annotation_dict) + # Output the sorted table. + output_sort_table(cascade_order, mqdf, group, annotation_dict) if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--input_avg_mq_json', action='store', dest='input_avg_mq_json', required=False, default=None, help='Average MQ json file') - parser.add_argument('--input_newick', action='store', dest='input_newick', required=False, default=None, help='Newick file') - parser.add_argument('--input_snps_json', action='store', dest='input_snps_json', required=False, default=None, help='SNPs json file') parser.add_argument('--gbk_file', action='store', dest='gbk_file', required=False, default=None, help='Optional gbk file'), - parser.add_argument('--processes', action='store', dest='processes', type=int, help='User-selected number of processes to use for job splitting') + parser.add_argument('--input_avg_mq_json', action='store', dest='input_avg_mq_json', help='Average MQ json file') + parser.add_argument('--input_newick', action='store', dest='input_newick', help='Newick file') + parser.add_argument('--input_snps_json', action='store', dest='input_snps_json', help='SNPs json file') args = parser.parse_args() @@ -327,56 +299,4 @@ else: annotation_dict = None - # The assumption here is that the list of files - # in both INPUT_NEWICK_DIR and INPUT_JSON_DIR are - # named such that they are properly matched if - # the directories contain more than 1 file (i.e., - # hopefully the newick file names and json file names - # will be something like Mbovis-01D6_* so they can be - # sorted and properly associated with each other). - if args.input_newick is not None: - newick_files = [args.input_newick] - else: - newick_files = [] - for file_name in sorted(os.listdir(INPUT_NEWICK_DIR)): - file_path = os.path.abspath(os.path.join(INPUT_NEWICK_DIR, file_name)) - newick_files.append(file_path) - if args.input_snps_json is not None: - json_files = [args.input_snps_json] - else: - json_files = [] - for file_name in sorted(os.listdir(INPUT_JSON_DIR)): - file_path = os.path.abspath(os.path.join(INPUT_JSON_DIR, file_name)) - json_files.append(file_path) - if args.input_avg_mq_json is not None: - json_avg_mq_files = [args.input_avg_mq_json] - else: - json_avg_mq_files = [] - for file_name in sorted(os.listdir(INPUT_JSON_AVG_MQ_DIR)): - file_path = os.path.abspath(os.path.join(INPUT_JSON_AVG_MQ_DIR, file_name)) - json_avg_mq_files.append(file_path) - - multiprocessing.set_start_method('spawn') - queue1 = multiprocessing.JoinableQueue() - queue2 = multiprocessing.JoinableQueue() - num_files = len(newick_files) - cpus = set_num_cpus(num_files, args.processes) - # Set a timeout for get()s in the queue. - timeout = 0.05 - - for i, newick_file in enumerate(newick_files): - json_file = json_files[i] - json_avg_mq_file = json_avg_mq_files[i] - queue1.put((newick_file, json_file, json_avg_mq_file)) - - # Complete the preprocess_tables task. - processes = [multiprocessing.Process(target=preprocess_tables, args=(queue1, annotation_dict, timeout, )) for _ in range(cpus)] - for p in processes: - p.start() - for p in processes: - p.join() - queue1.join() - - if queue1.empty(): - queue1.close() - queue1.join_thread() + preprocess_tables(args.input_newick, args.input_snps_json, args.input_avg_mq_json, annotation_dict) diff -r 85587c8eb25f -r efb86aade548 vsnp_build_tables.xml --- a/vsnp_build_tables.xml Wed Dec 02 09:10:07 2020 +0000 +++ b/vsnp_build_tables.xml Thu Dec 10 15:25:53 2020 +0000 @@ -1,4 +1,4 @@ - + macros.xml @@ -10,75 +10,35 @@ - - - - - - - - - - - - - - - - - - - - - - - - - + + + @@ -93,9 +53,9 @@ - + + - @@ -109,8 +69,8 @@ - - + + @@ -119,42 +79,39 @@ - - - + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - + + + - - - - - - - + + + + + + + + + + + + + @@ -189,7 +146,6 @@ **Required Options** - * **Choose the category for the files to be analyzed** - select "Single files" or "Collections of files", then select the appropriate history items (single SNPs json, average MQ json and newick files, or collections of each) based on the selected option. * **Use Genbank file** - Select "yes" to annotate the tables using the information in the Genbank file. Locally cached files, if available, provide the most widely used annotations, but more custom Genbank files can be chosen from the current history. diff -r 85587c8eb25f -r efb86aade548 vsnp_determine_ref_from_data.py --- a/vsnp_determine_ref_from_data.py Wed Dec 02 09:10:07 2020 +0000 +++ b/vsnp_determine_ref_from_data.py Thu Dec 10 15:25:53 2020 +0000 @@ -12,17 +12,11 @@ OUTPUT_METRICS_DIR = 'output_metrics' -def get_base_file_name(file_path): +def get_sample_name(file_path): base_file_name = os.path.basename(file_path) if base_file_name.find(".") > 0: # Eliminate the extension. return os.path.splitext(base_file_name)[0] - elif base_file_name.find("_fq") > 0: - # The "." character has likely - # changed to an "_" character. - return base_file_name.split("_fq")[0] - elif base_file_name.find("_fastq") > 0: - return base_file_name.split("_fastq")[0] return base_file_name @@ -178,7 +172,7 @@ def output_files(fastq_file, count_list, group, dbkey, dbkey_file, metrics_file): - base_file_name = get_base_file_name(fastq_file) + base_file_name = get_sample_name(fastq_file) output_dbkey(base_file_name, dbkey, dbkey_file) output_metrics(base_file_name, count_list, group, dbkey, metrics_file)