Mercurial > repos > iuc > vsnp_determine_ref_from_data

--- a/macros.xml	Tue Nov 16 08:28:27 2021 +0000
+++ b/macros.xml	Tue Nov 16 20:12:56 2021 +0000
@@ -1,7 +1,35 @@
 <?xml version='1.0' encoding='UTF-8'?>
 <macros>
-    <token name="@WRAPPER_VERSION@">1.0</token>
-    <token name="@PROFILE@">19.09</token>
+    <token name="@TOOL_VERSION@">1.0</token>
+    <token name="@VERSION_SUFFIX@">2</token>
+    <token name="@PROFILE@">21.01</token>
+    <xml name="biopython_requirement">
+        <requirement type="package" version="1.79">biopython</requirement>
+    </xml>
+    <xml name="numpy_requirement">
+        <requirement type="package" version="1.21.1">numpy</requirement>
+    </xml>
+    <xml name="openpyxl_requirement">
+        <requirement type="package" version="3.0.9">openpyxl</requirement>
+    </xml>
+    <xml name="pandas_requirement">
+        <requirement type="package" version="1.3.0">pandas</requirement>
+    </xml>
+    <xml name="pysam_requirement">
+        <requirement type="package" version="0.15.4">pysam</requirement>
+    </xml>
+    <xml name="pyvcf_requirement">
+        <requirement type="package" version="0.6.8">pyvcf</requirement>
+    </xml>
+    <xml name="pyyaml_requirement">
+        <requirement type="package" version="5.4.1">pyyaml</requirement>
+    </xml>
+    <xml name="xlrd_requirement">
+        <requirement type="package" version="2.0.1">xlrd</requirement>
+    </xml>
+    <xml name="xlsxwriter_requirement">
+        <requirement type="package" version="1.4.4">xlsxwriter</requirement>
+    </xml>
     <xml name="param_reference_source">
         <param name="reference_source" type="select" label="Choose the source for the reference genome">
             <option value="cached" selected="true">locally cached</option>
Binary file test-data/Mbovis-01D6_cascade_table.xlsx has changed
Binary file test-data/Mbovis-01D6_sort_table.xlsx has changed
Binary file test-data/Mbovis-01D_cascade_table.xlsx has changed
Binary file test-data/Mbovis-01D_sort_table.xlsx has changed
Binary file test-data/Mbovis-01_cascade_table.xlsx has changed
Binary file test-data/Mbovis-01_sort_table.xlsx has changed
Binary file test-data/cascade_table.xlsx has changed
--- a/test-data/output_metrics.tabular	Tue Nov 16 08:28:27 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-# File	Number of Good SNPs	Average Coverage	Genome Coverage
-vcf_input_vcf		0.659602	50.49%
-vcf_input_vcf	0
--- a/test-data/output_vcf.vcf	Tue Nov 16 08:28:27 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,100 +0,0 @@
-##fileformat=VCFv4.2
-##fileDate=20200302
-##source=freeBayes v1.3.1-dirty
-##reference=/home/galaxy/galaxy/tool-data/AF2122/seq/AF2122.fa
-##contig=<ID=NC_002945.4,length=4349904>
-##phasing=none
-##commandline="freebayes --region NC_002945.4:0..4349904 --bam b_0.bam --fasta-reference /home/galaxy/galaxy/tool-data/AF2122/seq/AF2122.fa --vcf ./vcf_output/part_NC_002945.4:0..4349904.vcf -u -n 0 --haplotype-length -1 --min-repeat-size 5 --min-repeat-entropy 1 -m 1 -q 0 -R 0 -Y 0 -e 1 -F 0.05 -C 2 -G 1 --min-alternate-qsum 0"
-##filter="QUAL > 0"
-##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
-##INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth at the locus">
-##INFO=<ID=DPB,Number=1,Type=Float,Description="Total read depth per bp at the locus; bases in reads overlapping / bases in haplotype">
-##INFO=<ID=AC,Number=A,Type=Integer,Description="Total number of alternate alleles in called genotypes">
-##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
-##INFO=<ID=AF,Number=A,Type=Float,Description="Estimated allele frequency in the range (0,1]">
-##INFO=<ID=RO,Number=1,Type=Integer,Description="Count of full observations of the reference haplotype.">
-##INFO=<ID=AO,Number=A,Type=Integer,Description="Count of full observations of this alternate haplotype.">
-##INFO=<ID=PRO,Number=1,Type=Float,Description="Reference allele observation count, with partial observations recorded fractionally">
-##INFO=<ID=PAO,Number=A,Type=Float,Description="Alternate allele observations, with partial observations recorded fractionally">
-##INFO=<ID=QR,Number=1,Type=Integer,Description="Reference allele quality sum in phred">
-##INFO=<ID=QA,Number=A,Type=Integer,Description="Alternate allele quality sum in phred">
-##INFO=<ID=PQR,Number=1,Type=Float,Description="Reference allele quality sum in phred for partial observations">
-##INFO=<ID=PQA,Number=A,Type=Float,Description="Alternate allele quality sum in phred for partial observations">
-##INFO=<ID=SRF,Number=1,Type=Integer,Description="Number of reference observations on the forward strand">
-##INFO=<ID=SRR,Number=1,Type=Integer,Description="Number of reference observations on the reverse strand">
-##INFO=<ID=SAF,Number=A,Type=Integer,Description="Number of alternate observations on the forward strand">
-##INFO=<ID=SAR,Number=A,Type=Integer,Description="Number of alternate observations on the reverse strand">
-##INFO=<ID=SRP,Number=1,Type=Float,Description="Strand balance probability for the reference allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SRF and SRR given E(SRF/SRR) ~ 0.5, derived using Hoeffding's inequality">
-##INFO=<ID=SAP,Number=A,Type=Float,Description="Strand balance probability for the alternate allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SAF and SAR given E(SAF/SAR) ~ 0.5, derived using Hoeffding's inequality">
-##INFO=<ID=AB,Number=A,Type=Float,Description="Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous">
-##INFO=<ID=ABP,Number=A,Type=Float,Description="Allele balance probability at heterozygous sites: Phred-scaled upper-bounds estimate of the probability of observing the deviation between ABR and ABA given E(ABR/ABA) ~ 0.5, derived using Hoeffding's inequality">
-##INFO=<ID=RUN,Number=A,Type=Integer,Description="Run length: the number of consecutive repeats of the alternate allele in the reference genome">
-##INFO=<ID=RPP,Number=A,Type=Float,Description="Read Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality">
-##INFO=<ID=RPPR,Number=1,Type=Float,Description="Read Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality">
-##INFO=<ID=RPL,Number=A,Type=Float,Description="Reads Placed Left: number of reads supporting the alternate balanced to the left (5') of the alternate allele">
-##INFO=<ID=RPR,Number=A,Type=Float,Description="Reads Placed Right: number of reads supporting the alternate balanced to the right (3') of the alternate allele">
-##INFO=<ID=EPP,Number=A,Type=Float,Description="End Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality">
-##INFO=<ID=EPPR,Number=1,Type=Float,Description="End Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality">
-##INFO=<ID=DPRA,Number=A,Type=Float,Description="Alternate allele depth ratio.  Ratio between depth in samples with each called alternate allele and those without.">
-##INFO=<ID=ODDS,Number=1,Type=Float,Description="The log odds ratio of the best genotype combination to the second-best.">
-##INFO=<ID=GTI,Number=1,Type=Integer,Description="Number of genotyping iterations required to reach convergence or bailout.">
-##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">
-##INFO=<ID=CIGAR,Number=A,Type=String,Description="The extended CIGAR representation of each alternate allele, with the exception that '=' is replaced by 'M' to ease VCF parsing.  Note that INDEL alleles do not have the first matched base (which is provided by default, per the spec) referred to by the CIGAR.">
-##INFO=<ID=NUMALT,Number=1,Type=Integer,Description="Number of unique non-reference alleles in called genotypes at this position.">
-##INFO=<ID=MEANALT,Number=A,Type=Float,Description="Mean number of unique non-reference allele observations per sample with the corresponding alternate alleles.">
-##INFO=<ID=LEN,Number=A,Type=Integer,Description="allele length">
-##INFO=<ID=MQM,Number=A,Type=Float,Description="Mean mapping quality of observed alternate alleles">
-##INFO=<ID=MQMR,Number=1,Type=Float,Description="Mean mapping quality of observed reference alleles">
-##INFO=<ID=PAIRED,Number=A,Type=Float,Description="Proportion of observed alternate alleles which are supported by properly paired read fragments">
-##INFO=<ID=PAIREDR,Number=1,Type=Float,Description="Proportion of observed reference alleles which are supported by properly paired read fragments">
-##INFO=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum depth in gVCF output block.">
-##INFO=<ID=END,Number=1,Type=Integer,Description="Last position (inclusive) in gVCF output record.">
-##INFO=<ID=technology.ILLUMINA,Number=A,Type=Float,Description="Fraction of observations supporting the alternate observed in reads from ILLUMINA">
-##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
-##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype">
-##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy">
-##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
-##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Number of observation for each allele">
-##FORMAT=<ID=RO,Number=1,Type=Integer,Description="Reference allele observation count">
-##FORMAT=<ID=QR,Number=1,Type=Integer,Description="Sum of quality of the reference observations">
-##FORMAT=<ID=AO,Number=A,Type=Integer,Description="Alternate allele observation count">
-##FORMAT=<ID=QA,Number=A,Type=Integer,Description="Sum of quality of the alternate observations">
-##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum depth in gVCF output block.">
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	13-1941-6
-NC_002945.4	1	.	N	.	.	.	.	GT	./.
-NC_002945.4	2	.	N	.	.	.	.	GT	./.
-NC_002945.4	3	.	N	.	.	.	.	GT	./.
-NC_002945.4	4	.	N	.	.	.	.	GT	./.
-NC_002945.4	5	.	N	.	.	.	.	GT	./.
-NC_002945.4	6	.	N	.	.	.	.	GT	./.
-NC_002945.4	7	.	N	.	.	.	.	GT	./.
-NC_002945.4	8	.	N	.	.	.	.	GT	./.
-NC_002945.4	9	.	N	.	.	.	.	GT	./.
-NC_002945.4	10	.	N	.	.	.	.	GT	./.
-NC_002945.4	11	.	N	.	.	.	.	GT	./.
-NC_002945.4	12	.	N	.	.	.	.	GT	./.
-NC_002945.4	13	.	N	.	.	.	.	GT	./.
-NC_002945.4	14	.	N	.	.	.	.	GT	./.
-NC_002945.4	15	.	N	.	.	.	.	GT	./.
-NC_002945.4	16	.	N	.	.	.	.	GT	./.
-NC_002945.4	17	.	N	.	.	.	.	GT	./.
-NC_002945.4	18	.	N	.	.	.	.	GT	./.
-NC_002945.4	19	.	N	.	.	.	.	GT	./.
-NC_002945.4	20	.	N	.	.	.	.	GT	./.
-NC_002945.4	21	.	N	.	.	.	.	GT	./.
-NC_002945.4	22	.	N	.	.	.	.	GT	./.
-NC_002945.4	23	.	N	.	.	.	.	GT	./.
-NC_002945.4	24	.	N	.	.	.	.	GT	./.
-NC_002945.4	25	.	N	.	.	.	.	GT	./.
-NC_002945.4	26	.	N	.	.	.	.	GT	./.
-NC_002945.4	27	.	N	.	.	.	.	GT	./.
-NC_002945.4	28	.	N	.	.	.	.	GT	./.
-NC_002945.4	29	.	N	.	.	.	.	GT	./.
-NC_002945.4	30	.	N	.	.	.	.	GT	./.
-NC_002945.4	31	.	N	.	.	.	.	GT	./.
-NC_002945.4	32	.	N	.	.	.	.	GT	./.
-NC_002945.4	33	.	N	.	.	.	.	GT	./.
-NC_002945.4	34	.	N	.	.	.	.	GT	./.
-NC_002945.4	35	.	N	.	.	.	.	GT	./.
-NC_002945.4	36	.	N	.	.	.	.	GT	./.
-NC_002945.4	37	.	N	.	.	.	.	GT	./.
Binary file test-data/sort_table.xlsx has changed
--- a/test-data/vsnp_statistics1.tabular	Tue Nov 16 08:28:27 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-	Reference	File Size	Mean Read Length	Mean Read Quality	Reads Passing Q30	Total Reads	All Mapped Reads	Unmapped Reads	Unmapped Reads Percentage of Total	Reference with Coverage	Average Depth of Coverage	Good SNP Count
-Mcap_Deer_DE_SRR650221_fastq_gz	89	1.6 MB	121.0	29.7	      0.53	4317	17063	223	      0.05	8.27%	0.439436	36
--- a/test-data/vsnp_statistics2.tabular	Tue Nov 16 08:28:27 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-	Reference	File Size	Mean Read Length	Mean Read Quality	Reads Passing Q30	Total Reads	All Mapped Reads	Unmapped Reads	Unmapped Reads Percentage of Total	Reference with Coverage	Average Depth of Coverage	Good SNP Count
-13-1941-6_S4_L001_R1_600000_fastq_gz	89	8.7 KB	100.0	65.7	      1.00	25	45	5	      0.20	98.74%	10.338671	611
-13-1941-6_S4_L001_R2_600000_fastq_gz	89	8.5 KB	100.0	66.3	      1.00	25	45	5	      0.20	98.74%	10.338671	611
--- a/test-data/vsnp_statistics3.tabular	Tue Nov 16 08:28:27 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-	Reference	File Size	Mean Read Length	Mean Read Quality	Reads Passing Q30	Total Reads	All Mapped Reads	Unmapped Reads	Unmapped Reads Percentage of Total	Reference with Coverage	Average Depth of Coverage	Good SNP Count
-13-1941-6_S4_L001_R1_600000_fastq_gz	89	8.7 KB	100.0	65.7	      1.00	25	24	2	      0.08	0.13%	0.001252	0
-Mcap_Deer_DE_SRR650221_fastq_gz	89	1.6 MB	121.0	29.7	      0.53	4317	17063	223	      0.05	8.27%	0.439436	36
--- a/test-data/vsnp_statistics4.tabular	Tue Nov 16 08:28:27 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-	Reference	File Size	Mean Read Length	Mean Read Quality	Reads Passing Q30	Total Reads	All Mapped Reads	Unmapped Reads	Unmapped Reads Percentage of Total	Reference with Coverage	Average Depth of Coverage	Good SNP Count
-13-1941-6_S4_L001_R1_600000_fastq_gz	89	8.7 KB	100.0	65.7	      1.00	25	46	4	      0.16	0.16%	0.002146	0
-13-1941-6_S4_L001_R2_600000_fastq_gz	89	8.5 KB	100.0	66.3	      1.00	25	46	4	      0.16	0.16%	0.002146	0
--- a/vsnp_build_tables.py	Tue Nov 16 08:28:27 2021 +0000
+++ b/vsnp_build_tables.py	Tue Nov 16 20:12:56 2021 +0000
@@ -1,7 +1,9 @@
 #!/usr/bin/env python

 import argparse
+import multiprocessing
 import os
+import queue
 import re

 import pandas
@@ -16,6 +18,9 @@
 # to use LibreOffice for Excel spreadsheets.
 MAXCOLS = 1024
 OUTPUT_EXCEL_DIR = 'output_excel_dir'
+INPUT_JSON_AVG_MQ_DIR = 'input_json_avg_mq_dir'
+INPUT_JSON_DIR = 'input_json_dir'
+INPUT_NEWICK_DIR = 'input_newick_dir'


 def annotate_table(table_df, group, annotation_dict):
@@ -221,74 +226,94 @@
         output_excel(df, type_str, group_str, annotation_dict)


-def preprocess_tables(newick_file, json_file, json_avg_mq_file, annotation_dict):
-    avg_mq_series = pandas.read_json(json_avg_mq_file, typ='series', orient='split')
-    # Map quality to dataframe.
-    mqdf = avg_mq_series.to_frame(name='MQ')
-    mqdf = mqdf.T
-    # Get the group.
-    group = get_sample_name(newick_file)
-    snps_df = pandas.read_json(json_file, orient='split')
-    with open(newick_file, 'r') as fh:
-        for line in fh:
-            line = re.sub('[:,]', '\n', line)
-            line = re.sub('[)(]', '', line)
-            line = re.sub(r'[0-9].*\.[0-9].*\n', '', line)
-            line = re.sub('root\n', '', line)
-    sample_order = line.split('\n')
-    sample_order = list([_f for _f in sample_order if _f])
-    sample_order.insert(0, 'root')
-    tree_order = snps_df.loc[sample_order]
-    # Count number of SNPs in each column.
-    snp_per_column = []
-    for column_header in tree_order:
-        count = 0
-        column = tree_order[column_header]
-        for element in column:
-            if element != column[0]:
-                count = count + 1
-        snp_per_column.append(count)
-    row1 = pandas.Series(snp_per_column, tree_order.columns, name="snp_per_column")
-    # Count number of SNPS from the
-    # top of each column in the table.
-    snp_from_top = []
-    for column_header in tree_order:
-        count = 0
-        column = tree_order[column_header]
-        # for each element in the column
-        # skip the first element
-        for element in column[1:]:
-            if element == column[0]:
-                count = count + 1
-            else:
-                break
-        snp_from_top.append(count)
-    row2 = pandas.Series(snp_from_top, tree_order.columns, name="snp_from_top")
-    tree_order = tree_order.append([row1])
-    tree_order = tree_order.append([row2])
-    # In pandas=0.18.1 even this does not work:
-    # abc = row1.to_frame()
-    # abc = abc.T --> tree_order.shape (5, 18), abc.shape (1, 18)
-    # tree_order.append(abc)
-    # Continue to get error: "*** ValueError: all the input arrays must have same number of dimensions"
-    tree_order = tree_order.T
-    tree_order = tree_order.sort_values(['snp_from_top', 'snp_per_column'], ascending=[True, False])
-    tree_order = tree_order.T
-    # Remove snp_per_column and snp_from_top rows.
-    cascade_order = tree_order[:-2]
-    # Output the cascade table.
-    output_cascade_table(cascade_order, mqdf, group, annotation_dict)
-    # Output the sorted table.
-    output_sort_table(cascade_order, mqdf, group, annotation_dict)
+def preprocess_tables(task_queue, annotation_dict, timeout):
+    while True:
+        try:
+            tup = task_queue.get(block=True, timeout=timeout)
+        except queue.Empty:
+            break
+        newick_file, json_file, json_avg_mq_file = tup
+        avg_mq_series = pandas.read_json(json_avg_mq_file, typ='series', orient='split')
+        # Map quality to dataframe.
+        mqdf = avg_mq_series.to_frame(name='MQ')
+        mqdf = mqdf.T
+        # Get the group.
+        group = get_sample_name(newick_file)
+        snps_df = pandas.read_json(json_file, orient='split')
+        with open(newick_file, 'r') as fh:
+            for line in fh:
+                line = re.sub('[:,]', '\n', line)
+                line = re.sub('[)(]', '', line)
+                line = re.sub(r'[0-9].*\.[0-9].*\n', '', line)
+                line = re.sub('root\n', '', line)
+        sample_order = line.split('\n')
+        sample_order = list([_f for _f in sample_order if _f])
+        sample_order.insert(0, 'root')
+        tree_order = snps_df.loc[sample_order]
+        # Count number of SNPs in each column.
+        snp_per_column = []
+        for column_header in tree_order:
+            count = 0
+            column = tree_order[column_header]
+            for element in column:
+                if element != column[0]:
+                    count = count + 1
+            snp_per_column.append(count)
+        row1 = pandas.Series(snp_per_column, tree_order.columns, name="snp_per_column")
+        # Count number of SNPS from the
+        # top of each column in the table.
+        snp_from_top = []
+        for column_header in tree_order:
+            count = 0
+            column = tree_order[column_header]
+            # for each element in the column
+            # skip the first element
+            for element in column[1:]:
+                if element == column[0]:
+                    count = count + 1
+                else:
+                    break
+            snp_from_top.append(count)
+        row2 = pandas.Series(snp_from_top, tree_order.columns, name="snp_from_top")
+        tree_order = tree_order.append([row1])
+        tree_order = tree_order.append([row2])
+        # In pandas=0.18.1 even this does not work:
+        # abc = row1.to_frame()
+        # abc = abc.T --> tree_order.shape (5, 18), abc.shape (1, 18)
+        # tree_order.append(abc)
+        # Continue to get error: "*** ValueError: all the input arrays must have same number of dimensions"
+        tree_order = tree_order.T
+        tree_order = tree_order.sort_values(['snp_from_top', 'snp_per_column'], ascending=[True, False])
+        tree_order = tree_order.T
+        # Remove snp_per_column and snp_from_top rows.
+        cascade_order = tree_order[:-2]
+        # Output the cascade table.
+        output_cascade_table(cascade_order, mqdf, group, annotation_dict)
+        # Output the sorted table.
+        output_sort_table(cascade_order, mqdf, group, annotation_dict)
+        task_queue.task_done()
+
+
+def set_num_cpus(num_files, processes):
+    num_cpus = int(multiprocessing.cpu_count())
+    if num_files < num_cpus and num_files < processes:
+        return num_files
+    if num_cpus < processes:
+        half_cpus = int(num_cpus / 2)
+        if num_files < half_cpus:
+            return num_files
+        return half_cpus
+    return processes


 if __name__ == '__main__':
     parser = argparse.ArgumentParser()

+    parser.add_argument('--input_avg_mq_json', action='store', dest='input_avg_mq_json', required=False, default=None, help='Average MQ json file')
+    parser.add_argument('--input_newick', action='store', dest='input_newick', required=False, default=None, help='Newick file')
+    parser.add_argument('--input_snps_json', action='store', dest='input_snps_json', required=False, default=None, help='SNPs json file')
     parser.add_argument('--gbk_file', action='store', dest='gbk_file', required=False, default=None, help='Optional gbk file'),
-    parser.add_argument('--input_avg_mq_json', action='store', dest='input_avg_mq_json', help='Average MQ json file')
-    parser.add_argument('--input_newick', action='store', dest='input_newick', help='Newick file')
-    parser.add_argument('--input_snps_json', action='store', dest='input_snps_json', help='SNPs json file')
+    parser.add_argument('--processes', action='store', dest='processes', type=int, help='User-selected number of processes to use for job splitting')

     args = parser.parse_args()

@@ -299,4 +324,56 @@
     else:
         annotation_dict = None

-    preprocess_tables(args.input_newick, args.input_snps_json, args.input_avg_mq_json, annotation_dict)
+    # The assumption here is that the list of files
+    # in both INPUT_NEWICK_DIR and INPUT_JSON_DIR are
+    # named such that they are properly matched if
+    # the directories contain more than 1 file (i.e.,
+    # hopefully the newick file names and json file names
+    # will be something like Mbovis-01D6_* so they can be
+    # sorted and properly associated with each other).
+    if args.input_newick is not None:
+        newick_files = [args.input_newick]
+    else:
+        newick_files = []
+        for file_name in sorted(os.listdir(INPUT_NEWICK_DIR)):
+            file_path = os.path.abspath(os.path.join(INPUT_NEWICK_DIR, file_name))
+            newick_files.append(file_path)
+    if args.input_snps_json is not None:
+        json_files = [args.input_snps_json]
+    else:
+        json_files = []
+        for file_name in sorted(os.listdir(INPUT_JSON_DIR)):
+            file_path = os.path.abspath(os.path.join(INPUT_JSON_DIR, file_name))
+            json_files.append(file_path)
+    if args.input_avg_mq_json is not None:
+        json_avg_mq_files = [args.input_avg_mq_json]
+    else:
+        json_avg_mq_files = []
+        for file_name in sorted(os.listdir(INPUT_JSON_AVG_MQ_DIR)):
+            file_path = os.path.abspath(os.path.join(INPUT_JSON_AVG_MQ_DIR, file_name))
+            json_avg_mq_files.append(file_path)
+
+    multiprocessing.set_start_method('spawn')
+    queue1 = multiprocessing.JoinableQueue()
+    queue2 = multiprocessing.JoinableQueue()
+    num_files = len(newick_files)
+    cpus = set_num_cpus(num_files, args.processes)
+    # Set a timeout for get()s in the queue.
+    timeout = 0.05
+
+    for i, newick_file in enumerate(newick_files):
+        json_file = json_files[i]
+        json_avg_mq_file = json_avg_mq_files[i]
+        queue1.put((newick_file, json_file, json_avg_mq_file))
+
+    # Complete the preprocess_tables task.
+    processes = [multiprocessing.Process(target=preprocess_tables, args=(queue1, annotation_dict, timeout, )) for _ in range(cpus)]
+    for p in processes:
+        p.start()
+    for p in processes:
+        p.join()
+    queue1.join()
+
+    if queue1.empty():
+        queue1.close()
+        queue1.join_thread()
--- a/vsnp_determine_ref_from_data.xml	Tue Nov 16 08:28:27 2021 +0000
+++ b/vsnp_determine_ref_from_data.xml	Tue Nov 16 20:12:56 2021 +0000
@@ -1,11 +1,11 @@
-<tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="@WRAPPER_VERSION@.1+galaxy0" profile="@PROFILE@">
+<tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
     <description>from input data</description>
     <macros>
         <import>macros.xml</import>
     </macros>
     <requirements>
-        <requirement type="package" version="1.79">biopython</requirement>
-        <requirement type="package" version="5.4.1">pyyaml</requirement>
+        <expand macro="biopython_requirement"/>
+        <expand macro="pyyaml_requirement"/>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
 #import re
--- a/vsnp_get_snps.py	Tue Nov 16 08:28:27 2021 +0000
+++ b/vsnp_get_snps.py	Tue Nov 16 20:12:56 2021 +0000
@@ -440,7 +440,7 @@
     parser = argparse.ArgumentParser()

     parser.add_argument('--ac', action='store', dest='ac', type=int, help='Allele count value'),
-    parser.add_argument('--all_isolates', action='store_true', dest='all_isolates', required=False, default=False, help='Create table with all isolates'),
+    parser.add_argument('--all_isolates', action='store_true', dest='all_isolates', help='Create table with all isolates'),
     parser.add_argument('--input_excel', action='store', dest='input_excel', required=False, default=None, help='Optional Excel filter file'),
     parser.add_argument('--input_vcf_dir', action='store', dest='input_vcf_dir', help='Input vcf directory'),
     parser.add_argument('--min_mq', action='store', dest='min_mq', type=int, help='Minimum map quality value'),
--- a/vsnp_statistics.py	Tue Nov 16 08:28:27 2021 +0000
+++ b/vsnp_statistics.py	Tue Nov 16 20:12:56 2021 +0000
@@ -1,7 +1,6 @@
 #!/usr/bin/env python

 import argparse
-import csv
 import gzip
 import os
 from functools import partial
@@ -11,6 +10,18 @@
 from Bio import SeqIO


+class Statistics:
+
+    def __init__(self, reference, fastq_file, file_size, total_reads, mean_read_length, mean_read_quality, reads_passing_q30):
+        self.reference = reference
+        self.fastq_file = fastq_file
+        self.file_size = file_size
+        self.total_reads = total_reads
+        self.mean_read_length = mean_read_length
+        self.mean_read_quality = mean_read_quality
+        self.reads_passing_q30 = reads_passing_q30
+
+
 def nice_size(size):
     # Returns a readably formatted string with the size
     words = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']
@@ -32,81 +43,119 @@
     return '??? bytes'


-def output_statistics(fastq_files, idxstats_files, metrics_files, output_file, gzipped, dbkey):
-    # Produce an Excel spreadsheet that
-    # contains a row for each sample.
-    columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
-               'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
-               'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count']
-    data_frames = []
-    for i, fastq_file in enumerate(fastq_files):
-        idxstats_file = idxstats_files[i]
-        metrics_file = metrics_files[i]
-        file_name_base = os.path.basename(fastq_file)
-        # Read fastq_file into a data frame.
-        _open = partial(gzip.open, mode='rt') if gzipped else open
-        with _open(fastq_file) as fh:
-            identifiers = []
-            seqs = []
-            letter_annotations = []
-            for seq_record in SeqIO.parse(fh, "fastq"):
-                identifiers.append(seq_record.id)
-                seqs.append(seq_record.seq)
-                letter_annotations.append(seq_record.letter_annotations["phred_quality"])
-        # Convert lists to Pandas series.
-        s1 = pandas.Series(identifiers, name='id')
-        s2 = pandas.Series(seqs, name='seq')
-        # Gather Series into a data frame.
-        fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
-        total_reads = int(len(fastq_df.index) / 4)
-        current_sample_df = pandas.DataFrame(index=[file_name_base], columns=columns)
-        # Reference
-        current_sample_df.at[file_name_base, 'Reference'] = dbkey
-        # File Size
-        current_sample_df.at[file_name_base, 'File Size'] = nice_size(os.path.getsize(fastq_file))
-        # Mean Read Length
-        sampling_size = 10000
-        if sampling_size > total_reads:
-            sampling_size = total_reads
+def get_statistics(dbkey, fastq_file, gzipped):
+    sampling_size = 10000
+    # Read fastq_file into a data fram to
+    # get the phred quality scores.
+    _open = partial(gzip.open, mode='rt') if gzipped else open
+    with _open(fastq_file) as fh:
+        identifiers = []
+        seqs = []
+        letter_annotations = []
+        for seq_record in SeqIO.parse(fh, "fastq"):
+            identifiers.append(seq_record.id)
+            seqs.append(seq_record.seq)
+            letter_annotations.append(seq_record.letter_annotations["phred_quality"])
+    # Convert lists to Pandas series.
+    s1 = pandas.Series(identifiers, name='id')
+    s2 = pandas.Series(seqs, name='seq')
+    # Gather Series into a data frame.
+    fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
+    # Starting at row 3, keep every 4 row
+    # random sample specified number of rows.
+    file_size = nice_size(os.path.getsize(fastq_file))
+    total_reads = len(seqs)
+    # Mean Read Length
+    if sampling_size > total_reads:
+        sampling_size = total_reads
+    try:
         fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
-        dict_mean = {}
-        list_length = []
-        i = 0
-        for id, seq, in fastq_df.iterrows():
-            dict_mean[id] = numpy.mean(letter_annotations[i])
-            list_length.append(len(seq.array[0]))
-            i += 1
-        current_sample_df.at[file_name_base, 'Mean Read Length'] = '%.1f' % numpy.mean(list_length)
-        # Mean Read Quality
-        df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
-        current_sample_df.at[file_name_base, 'Mean Read Quality'] = '%.1f' % df_mean['ave'].mean()
-        # Reads Passing Q30
-        reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
-        reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
-        current_sample_df.at[file_name_base, 'Reads Passing Q30'] = reads_passing_q30
+    except ValueError:
+        fastq_df = fastq_df.iloc[3::4].sample(sampling_size, replace=True)
+    dict_mean = {}
+    list_length = []
+    i = 0
+    for id, seq, in fastq_df.iterrows():
+        dict_mean[id] = numpy.mean(letter_annotations[i])
+        list_length.append(len(seq.array[0]))
+        i += 1
+    mean_read_length = '%.1f' % numpy.mean(list_length)
+    # Mean Read Quality
+    df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
+    mean_read_quality = '%.1f' % df_mean['ave'].mean()
+    # Reads Passing Q30
+    reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
+    reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
+    stats = Statistics(dbkey, os.path.basename(fastq_file), file_size, total_reads, mean_read_length,
+                       mean_read_quality, reads_passing_q30)
+    return stats
+
+
+def accrue_statistics(dbkey, read1, read2, gzipped):
+    read1_stats = get_statistics(dbkey, read1, gzipped)
+    if read2 is None:
+        read2_stats = None
+    else:
+        read2_stats = get_statistics(dbkey, read2, gzipped)
+    return read1_stats, read2_stats
+
+
+def output_statistics(read1_stats, read2_stats, idxstats_file, metrics_file, output_file):
+    paired_reads = read2_stats is not None
+    if paired_reads:
+        columns = ['Read1 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality',
+                   'Reads Passing Q30', 'Read2 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality',
+                   'Reads Passing Q30', 'Total Reads', 'All Mapped Reads', 'Unmapped Reads',
+                   'Unmapped Reads Percentage of Total', 'Reference with Coverage', 'Average Depth of Coverage',
+                   'Good SNP Count', 'Reference']
+    else:
+        columns = ['FASTQ', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
+                   'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
+                   'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count', 'Reference']
+    with open(output_file, "w") as outfh:
+        # Make sure the header starts with a # so
+        # MultiQC can properly handle the output.
+        outfh.write("%s\n" % "\t".join(columns))
+        line_items = []
+        # Get the current stats and associated files.
+        # Get and output the statistics.
+        line_items.append(read1_stats.fastq_file)
+        line_items.append(read1_stats.file_size)
+        if paired_reads:
+            line_items.append(read1_stats.total_reads)
+        line_items.append(read1_stats.mean_read_length)
+        line_items.append(read1_stats.mean_read_quality)
+        line_items.append(read1_stats.reads_passing_q30)
+        if paired_reads:
+            line_items.append(read2_stats.fastq_file)
+            line_items.append(read2_stats.file_size)
+            line_items.append(read2_stats.total_reads)
+            line_items.append(read2_stats.mean_read_length)
+            line_items.append(read2_stats.mean_read_quality)
+            line_items.append(read2_stats.reads_passing_q30)
         # Total Reads
-        current_sample_df.at[file_name_base, 'Total Reads'] = total_reads
+        if paired_reads:
+            total_reads = read1_stats.total_reads + read2_stats.total_reads
+        else:
+            total_reads = read1_stats.total_reads
+        line_items.append(total_reads)
         # All Mapped Reads
         all_mapped_reads, unmapped_reads = process_idxstats_file(idxstats_file)
-        current_sample_df.at[file_name_base, 'All Mapped Reads'] = all_mapped_reads
-        # Unmapped Reads
-        current_sample_df.at[file_name_base, 'Unmapped Reads'] = unmapped_reads
+        line_items.append(all_mapped_reads)
+        line_items.append(unmapped_reads)
         # Unmapped Reads Percentage of Total
         if unmapped_reads > 0:
             unmapped_reads_percentage = '{:10.2f}'.format(unmapped_reads / total_reads)
         else:
             unmapped_reads_percentage = 0
-        current_sample_df.at[file_name_base, 'Unmapped Reads Percentage of Total'] = unmapped_reads_percentage
+        line_items.append(unmapped_reads_percentage)
         # Reference with Coverage
         ref_with_coverage, avg_depth_of_coverage, good_snp_count = process_metrics_file(metrics_file)
-        current_sample_df.at[file_name_base, 'Reference with Coverage'] = ref_with_coverage
-        # Average Depth of Coverage
-        current_sample_df.at[file_name_base, 'Average Depth of Coverage'] = avg_depth_of_coverage
-        # Good SNP Count
-        current_sample_df.at[file_name_base, 'Good SNP Count'] = good_snp_count
-        data_frames.append(current_sample_df)
-    output_df = pandas.concat(data_frames)
-    output_df.to_csv(output_file, sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\')
+        line_items.append(ref_with_coverage)
+        line_items.append(avg_depth_of_coverage)
+        line_items.append(good_snp_count)
+        line_items.append(read1_stats.reference)
+        outfh.write('%s\n' % '\t'.join(str(x) for x in line_items))


 def process_idxstats_file(idxstats_file):
@@ -150,44 +199,17 @@

 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey')
 parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped')
-parser.add_argument('--input_idxstats_dir', action='store', dest='input_idxstats_dir', required=False, default=None, help='Samtools idxstats input directory')
-parser.add_argument('--input_metrics_dir', action='store', dest='input_metrics_dir', required=False, default=None, help='vSNP add zero coverage metrics input directory')
-parser.add_argument('--input_reads_dir', action='store', dest='input_reads_dir', required=False, default=None, help='Samples input directory')
-parser.add_argument('--list_paired', action='store_true', dest='list_paired', required=False, default=False, help='Input samples is a list of paired reads')
 parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file')
 parser.add_argument('--read1', action='store', dest='read1', help='Required: single read')
 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read')
 parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', help='Output of samtools_idxstats')
-parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', help='Output of vsnp_add_zero_coverage')
+parser.add_argument('--vsnp_azc_metrics', action='store', dest='vsnp_azc_metrics', help='Output of vsnp_add_zero_coverage')

 args = parser.parse_args()

-fastq_files = []
+stats_list = []
 idxstats_files = []
 metrics_files = []
 # Accumulate inputs.
-if args.read1 is not None:
-    # The inputs are not dataset collections, so
-    # read1, read2 (possibly) and vsnp_azc will also
-    # not be None.
-    fastq_files.append(args.read1)
-    idxstats_files.append(args.samtools_idxstats)
-    metrics_files.append(args.vsnp_azc)
-    if args.read2 is not None:
-        fastq_files.append(args.read2)
-        idxstats_files.append(args.samtools_idxstats)
-        metrics_files.append(args.vsnp_azc)
-else:
-    for file_name in sorted(os.listdir(args.input_reads_dir)):
-        fastq_files.append(os.path.join(args.input_reads_dir, file_name))
-    for file_name in sorted(os.listdir(args.input_idxstats_dir)):
-        idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
-        if args.list_paired:
-            # Add the idxstats file for reverse.
-            idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
-    for file_name in sorted(os.listdir(args.input_metrics_dir)):
-        metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
-        if args.list_paired:
-            # Add the metrics file for reverse.
-            metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
-output_statistics(fastq_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey)
+read1_stats, read2_stats = accrue_statistics(args.dbkey, args.read1, args.read2, args.gzipped)
+output_statistics(read1_stats, read2_stats, args.samtools_idxstats, args.vsnp_azc_metrics, args.output)