Mercurial > repos > greg > vsnp_statistics

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Sun Jan 03 15:47:28 2021 +0000
@@ -0,0 +1,24 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<macros>
+    <token name="@WRAPPER_VERSION@">1.0</token>
+    <token name="@PROFILE@">19.09</token>
+    <xml name="param_reference_source">
+        <param name="reference_source" type="select" label="Choose the source for the reference genome">
+            <option value="cached" selected="true">locally cached</option>
+            <option value="history">from history</option>
+        </param>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="bibtex">
+                @misc{None,
+                journal = {None},
+                author = {1. Stuber T},
+                title = {Manuscript in preparation},
+                year = {None},
+                url = {https://github.com/USDA-VS/vSNP},}
+            </citation>
+        </citations>
+    </xml>
+</macros>
+
--- a/vsnp_statistics.py	Thu Apr 30 11:01:40 2020 -0400
+++ b/vsnp_statistics.py	Sun Jan 03 15:47:28 2021 +0000
@@ -2,14 +2,12 @@

 import argparse
 import gzip
-import numpy
 import os
-import pandas
 import shutil

-INPUT_IDXSTATS_DIR = 'input_idxstats'
-INPUT_METRICS_DIR = 'input_metrics'
-INPUT_READS_DIR = 'input_reads'
+import numpy
+import pandas
+
 QUALITYKEY = {'!': '0', '"': '1', '#': '2', '$': '3', '%': '4', '&': '5', "'": '6', '(': '7',
               ')': '8', '*': '9', '+': '10', ',': '11', '-': '12', '.': '13', '/': '14', '0': '15',
               '1': '16', '2': '17', '3': '18', '4': '19', '5': '20', '6': '21', '7': '22',
@@ -26,24 +24,9 @@


 def fastq_to_df(fastq_file, gzipped):
-    if gzipped.lower() == "true":
+    if gzipped:
         return pandas.read_csv(gzip.open(fastq_file, "r"), header=None, sep="^")
-    else:
-        return pandas.read_csv(open(fastq_file, "r"), header=None, sep="^")
-
-
-def get_base_file_name(file_path):
-    base_file_name = os.path.basename(file_path)
-    if base_file_name.find(".") > 0:
-        # Eliminate the extension.
-        return os.path.splitext(base_file_name)[0]
-    elif base_file_name.find("_") > 0:
-        # The dot extension was likely changed to
-        # the " character.
-        items = base_file_name.split("_")
-        return "_".join(items[0:-1])
-    else:
-        return base_file_name
+    return pandas.read_csv(open(fastq_file, "r"), header=None, sep="^")


 def nice_size(size):
@@ -67,14 +50,14 @@
     return '??? bytes'


-def output_statistics(reads_files, idxstats_files, metrics_files, output_file, gzipped, dbkey):
+def output_statistics(fastq_files, idxstats_files, metrics_files, output_file, gzipped, dbkey):
     # Produce an Excel spreadsheet that
     # contains a row for each sample.
     columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
                'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
                'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count']
     data_frames = []
-    for i, fastq_file in enumerate(reads_files):
+    for i, fastq_file in enumerate(fastq_files):
         idxstats_file = idxstats_files[i]
         metrics_file = metrics_files[i]
         file_name_base = os.path.basename(fastq_file)
@@ -171,44 +154,48 @@
     return ref_with_coverage, avg_depth_of_coverage, good_snp_count


-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
+parser = argparse.ArgumentParser()

-    parser.add_argument('--read1', action='store', dest='read1', required=False, default=None, help='Required: single read')
-    parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read')
-    parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey')
-    parser.add_argument('--gzipped', action='store', dest='gzipped', help='Input files are gzipped')
-    parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', required=False, default=None, help='Output of samtools_idxstats')
-    parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file')
-    parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', required=False, default=None, help='Output of vsnp_add_zero_coverage')
+parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey')
+parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped')
+parser.add_argument('--input_idxstats_dir', action='store', dest='input_idxstats_dir', required=False, default=None, help='Samtools idxstats input directory')
+parser.add_argument('--input_metrics_dir', action='store', dest='input_metrics_dir', required=False, default=None, help='vSNP add zero coverage metrics input directory')
+parser.add_argument('--input_reads_dir', action='store', dest='input_reads_dir', required=False, default=None, help='Samples input directory')
+parser.add_argument('--list_paired', action='store_true', dest='list_paired', required=False, default=False, help='Input samples is a list of paired reads')
+parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file')
+parser.add_argument('--read1', action='store', dest='read1', help='Required: single read')
+parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read')
+parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', help='Output of samtools_idxstats')
+parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', help='Output of vsnp_add_zero_coverage')

-    args = parser.parse_args()
-    print("args:\n%s\n" % str(args))
+args = parser.parse_args()

-    reads_files = []
-    idxstats_files = []
-    metrics_files = []
-    # Accumulate inputs.
-    if args.read1 is not None:
-        # The inputs are not dataset collections, so
-        # read1, read2 (possibly) and vsnp_azc will also
-        # not be None.
-        reads_files.append(args.read1)
+fastq_files = []
+idxstats_files = []
+metrics_files = []
+# Accumulate inputs.
+if args.read1 is not None:
+    # The inputs are not dataset collections, so
+    # samtools_idxstats and vsnp_azc (and possibly
+    # read2) are expected to be set as well.
+    fastq_files.append(args.read1)
+    idxstats_files.append(args.samtools_idxstats)
+    metrics_files.append(args.vsnp_azc)
+    if args.read2 is not None:
+        fastq_files.append(args.read2)
         idxstats_files.append(args.samtools_idxstats)
         metrics_files.append(args.vsnp_azc)
-        if args.read2 is not None:
-            reads_files.append(args.read2)
-            idxstats_files.append(args.samtools_idxstats)
-            metrics_files.append(args.vsnp_azc)
-    else:
-        for file_name in sorted(os.listdir(INPUT_READS_DIR)):
-            file_path = os.path.abspath(os.path.join(INPUT_READS_DIR, file_name))
-            reads_files.append(file_path)
-            base_file_name = get_base_file_name(file_path)
-        for file_name in sorted(os.listdir(INPUT_IDXSTATS_DIR)):
-            file_path = os.path.abspath(os.path.join(INPUT_IDXSTATS_DIR, file_name))
-            idxstats_files.append(file_path)
-        for file_name in sorted(os.listdir(INPUT_METRICS_DIR)):
-            file_path = os.path.abspath(os.path.join(INPUT_METRICS_DIR, file_name))
-            metrics_files.append(file_path)
-    output_statistics(reads_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey)
+else:
+    for file_name in sorted(os.listdir(args.input_reads_dir)):
+        fastq_files.append(os.path.join(args.input_reads_dir, file_name))
+    for file_name in sorted(os.listdir(args.input_idxstats_dir)):
+        idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
+        if args.list_paired:
+            # Add the idxstats file for reverse.
+            idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
+    for file_name in sorted(os.listdir(args.input_metrics_dir)):
+        metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
+        if args.list_paired:
+            # Add the metrics file for reverse.
+            metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
+output_statistics(fastq_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey)
--- a/vsnp_statistics.xml	Thu Apr 30 11:01:40 2020 -0400
+++ b/vsnp_statistics.xml	Sun Jan 03 15:47:28 2021 +0000
@@ -1,5 +1,8 @@
-<tool id="vsnp_statistics" name="vSNP: statistics" version="1.0.0">
+<tool id="vsnp_statistics" name="vSNP: statistics" version="@WRAPPER_VERSION@.1" profile="@PROFILE@">
     <description></description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
     <requirements>
         <requirement type="package" version="1.16.5">numpy</requirement>
         <requirement type="package" version="0.25.3">pandas</requirement>
@@ -7,102 +10,113 @@
         <requirement type="package" version="1.2.8">xlsxwriter</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
-#import os
 #import re
-#set gzipped = 'false'
-#set input_type = $input_type_cond.input_type
 #set input_idxstats_dir = 'input_idxstats'
 #set input_metrics_dir = 'input_metrics'
 #set input_reads_dir = 'input_reads'
 mkdir -p $input_idxstats_dir &&
 mkdir -p $input_metrics_dir &&
 mkdir -p $input_reads_dir &&
-#if str($input_type) == "single":
-    #set read_type_cond = $input_type_cond.read_type_cond
-    #set read1 = $read_type_cond.read1
+
+#if $input_type_cond.input_type  == 'single_files':
+    #set read1 = $input_type_cond.read_type_cond.read1
     #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
-    #if str($read_type_cond.read_type) == "single":
-        ln -s '${read1}' '${read1_identifier}' &&
-        #if $read1.is_of_type('fastqsanger.gz'):
-            #set gzipped = 'true'
-        #end if
+    ln -s '${read1}' '${read1_identifier}' &&
+    #if $input_type_cond.read_type_cond.read_type == 'pair':
+        #set read2 = $input_type_cond.read_type_cond.read2
+        #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
+        ln -s '${read2}' '${read2_identifier}' &&
     #else:
-        #set read2 = $read_type_cond.read2
-        #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
-        ln -s '${read1}' '${read1_identifier}' &&
-        ln -s '${read2}' '${read2_identifier}' &&
-        #if $read1.is_of_type('fastqsanger.gz') and $read2.is_of_type('fastqsanger.gz'):
-            #set gzipped = 'true'
-        #end if
+        #set read2 = None
     #end if
-    #set dbkey = $input_type_cond.vsnp_azc.metadata.dbkey
 #else:
-    #for $i in $input_type_cond.reads_collection:
-        #if $i.is_of_type('fastqsanger.gz'):
-            #set gzipped = 'true'
-        #end if
-        #set filename = $i.file_name
+    #if $input_type_cond.collection_type_cond.collection_type == 'single':
+        #for $i in $input_type_cond.collection_type_cond.reads_collection:
+            #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
+            ln -s '${i.file_name}' '$input_reads_dir/${identifier}' &&
+        #end for
+    #else:
+        #set read1 = $input_type_cond.collection_type_cond.reads_collection['forward']
+        #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.name))
+        ln -s '${read1}' '$input_reads_dir/${read1_identifier}' &&
+        #set read2 = $input_type_cond.collection_type_cond.reads_collection['reverse']
+        #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.name))
+        ln -s '${read2}' '$input_reads_dir/${read2_identifier}' &&
+    #end if
+    #for $i in $input_type_cond.samtools_idxstats:
         #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
-        ln -s '$filename' '$input_reads_dir/$identifier' &&
+        ln -s '${i.file_name}' '$input_idxstats_dir/${identifier}' &&
     #end for
-    #for $i in $input_type_cond.samtools_idxstats_collection:
-        #set filename = $i.file_name
+    #for $i in $input_type_cond.vsnp_azc:
         #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
-        ln -s '$filename' '$input_idxstats_dir/$identifier' &&
-    #end for
-    #for $i in $input_type_cond.azc_metrics_collection:
-        #set dbkey = $i.metadata.dbkey
-        #set filename = $i.file_name
-        #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
-        ln -s '$filename' '$input_metrics_dir/$identifier' &&
+        ln -s '${i.file_name}' '$input_metrics_dir/${identifier}' &&
     #end for
 #end if
+
 python '$__tool_directory__/vsnp_statistics.py'
---dbkey '$dbkey'
---gzipped '$gzipped'
-#if str($input_type) == "single":
-    #if str($read_type_cond.read_type) == "single":
-        --read1 '${read1_identifier}'
-    #else:
-        --read1 '${read1_identifier}'
-        --read2 '${read2_identifier}'
+#if $input_type_cond.input_type == 'single_files':
+    --dbkey '$input_type_cond.samtools_idxstats.metadata.dbkey'
+    #if $input_type_cond.read_type_cond.read1.is_of_type('fastqsanger.gz'):
+        --gzipped
+    #end if
+    --read1 '${read1_identifier}'
+    #if $input_type_cond.read_type_cond.read_type == 'pair':
+      --read2 '${read2_identifier}'
     #end if
-    --samtools_idxstats '$samtools_idxstats'
-    --vsnp_azc '$vsnp_azc'
+    --samtools_idxstats '$input_type_cond.samtools_idxstats'
+    --vsnp_azc '$input_type_cond.vsnp_azc'
+#else:
+    --dbkey '$input_type_cond.samtools_idxstats[0].metadata.dbkey'
+    #if $input_type_cond.collection_type_cond.reads_collection[0].is_of_type('fastqsanger.gz'):
+        --gzipped
+    #end if
+    #if $input_type_cond.collection_type_cond.collection_type == 'paired':
+        --list_paired
+    #end if
+    --input_idxstats_dir '$input_idxstats_dir'
+    --input_metrics_dir '$input_metrics_dir'
+    --input_reads_dir '$input_reads_dir'
 #end if
 --output '$output'
 ]]></command>
     <inputs>
         <conditional name="input_type_cond">
             <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
-                <option value="single" selected="true">Single files</option>
-                <option value="collection">Collections of files</option>
+                <option value="single_files" selected="true">Single files</option>
+                <option value="collections">Collections of files</option>
             </param>
-            <when value="single">
+            <when value="single_files">
                 <conditional name="read_type_cond">
                     <param name="read_type" type="select" label="Choose the read type">
-                        <option value="paired" selected="true">Paired</option>
-                        <option value="single">Single</option>
+                        <option value="single" selected="true">Single reads</option>
+                         <option value="pair">Paired reads</option>
                     </param>
-                    <when value="paired">
+                     <when value="single">
+                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+                    </when>
+                    <when value="pair">
                         <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
                         <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
                     </when>
+                </conditional>
+                <param name="samtools_idxstats" type="data" format="tabular" label="Samtools idxstats file"/>
+                <param name="vsnp_azc" type="data" format="tabular" label="vSNP: add zero coverage metrics file"/>
+            </when>
+            <when value="collections">
+                <conditional name="collection_type_cond">
+                    <param name="collection_type" type="select" label="Collections of single reads or paired reads?">
+                        <option value="single" selected="true">Single reads</option>
+                        <option value="paired">Paired reads in separate datasets</option>
+                    </param>
                     <when value="single">
-                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+                        <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/>
+                    </when>
+                    <when value="paired">
+                        <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
                     </when>
                 </conditional>
-                <param name="samtools_idxstats" type="data" format="tabular" label="Samtools idxstats file">
-                    <validator type="unspecified_build"/>
-                </param>
-                <param name="vsnp_azc" type="data" format="tabular" label="vSNP zero coverage metrics file">
-                    <validator type="unspecified_build"/>
-                </param>
-            </when>
-            <when value="collection">
-                <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/>
-                <param name="samtools_idxstats_collection" type="data_collection" format="tabular" collection_type="list" label="Collection of samtools idxstats files"/>
-                <param name="azc_metrics_collection" type="data_collection" format="tabular" collection_type="list" label="Collection of vSNP zero-coverage metrics files"/>
+                <param name="samtools_idxstats" type="data_collection" format="tabular" collection_type="list" label="Collection of samtools idxstats files"/>
+                <param name="vsnp_azc" type="data_collection" format="tabular" collection_type="list" label="Collection of vSNP: add zero coverage metrics files"/>
             </when>
         </conditional>
     </inputs>
@@ -110,36 +124,81 @@
         <data name="output" format="xlsx"/>
     </outputs>
     <tests>
-        <test>
+        <!-- A single fastq file -->
+        <test expect_num_outputs="1">
+            <param name="input_type" value="single_files"/>
+            <param name="read_type" value="single"/>
+            <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/>
+            <param name="samtools_idxstats" value="samtools_idxstats1.tabular" ftype="tabular" dbkey="89"/>
+            <param name="vsnp_azc" value="add_zc_metrics1.tabular" ftype="tabular" dbkey="89"/>
+            <output name="output" file="vsnp_statistics1.xlsx" ftype="xlsx" compare="sim_size"/>
+        </test>
+        <!-- A set of paired fastq files -->
+        <test expect_num_outputs="1">
+            <param name="input_type" value="single_files"/>
+            <param name="read_type" value="pair"/>
             <param name="read1" value="13-1941-6_S4_L001_R1_600000.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/>
             <param name="read2" value="13-1941-6_S4_L001_R2_600000.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/>
-            <param name="samtools_idxstats" value="samtools_idxstats.tabular" ftype="tabular" dbkey="89"/>
-            <param name="vsnp_azc" value="add_zc_metrics.tabular" ftype="tabular" dbkey="89"/>
-            <output name="output" file="vsnp_statistics.xlsx" ftype="xlsx" compare="sim_size"/>
+            <param name="samtools_idxstats" value="samtools_idxstats2.tabular" ftype="tabular" dbkey="89"/>
+            <param name="vsnp_azc" value="add_zc_metrics2.tabular" ftype="tabular" dbkey="89"/>
+            <output name="output" file="vsnp_statistics2.xlsx" ftype="xlsx" compare="sim_size"/>
+        </test>
+        <!-- A list collection of single-end (SE) fastq files; collection_type is the selector inside the "collections" branch -->
+        <test expect_num_outputs="1">
+            <param name="input_type" value="collections"/>
+            <param name="collection_type" value="single"/>
+            <param name="reads_collection">
+                <collection type="list">
+                    <element name="Mcap_Deer_DE_SRR650221.fastq.gz" value="Mcap_Deer_DE_SRR650221.fastq.gz" dbkey="89"/>
+                    <element name="13-1941-6_S4_L001_R1_600000.fastq.gz" value="13-1941-6_S4_L001_R1_600000.fastq.gz" dbkey="89"/>
+                </collection>
+            </param>
+            <param name="samtools_idxstats">
+                <collection type="list">
+                    <element name="13-1941-6_S4_L001_R1_600000.fastq.gz" value="samtools_idxstats3.tabular" dbkey="89"/>
+                    <element name="Mcap_Deer_DE_SRR650221.fastq.gz" value="samtools_idxstats4.tabular" dbkey="89"/>
+                </collection>
+            </param>
+            <param name="vsnp_azc">
+                <collection type="list">
+                    <element name="13-1941-6_S4_L001_R1_600000.fastq.gz" value="add_zc_metrics3.tabular" dbkey="89"/>
+                    <element name="Mcap_Deer_DE_SRR650221.fastq.gz" value="add_zc_metrics4.tabular" dbkey="89"/>
+                </collection>
+            </param>
+            <output name="output" file="vsnp_statistics3.xlsx" ftype="xlsx" compare="sim_size"/>
+        </test>
+        <!-- A collection of PE fastq files -->
+        <test expect_num_outputs="1">
+            <param name="input_type" value="collections"/>
+            <param name="collection_type" value="paired"/>
+            <param name="reads_collection">
+                <collection type="paired">
+                    <element name="forward" value="13-1941-6_S4_L001_R1_600000.fastq.gz" ftype="fastqsanger.gz"/>
+                    <element name="reverse" value="13-1941-6_S4_L001_R2_600000.fastq.gz" ftype="fastqsanger.gz"/>
+                </collection>
+            </param>
+            <param name="samtools_idxstats">
+                <collection type="list">
+                    <element name="13-1941-6_S4_L001_R1_600000.fastq" value="samtools_idxstats5.tabular" dbkey="89"/>
+                </collection>
+            </param>
+            <param name="vsnp_azc">
+                <collection type="list">
+                    <element name="13-1941-6_S4_L001_R1_600000.fastq" value="add_zc_metrics5.tabular" dbkey="89"/>
+                </collection>
+            </param>
+            <output name="output" file="vsnp_statistics4.xlsx" ftype="xlsx" compare="sim_size"/>
         </test>
     </tests>
     <help>
 **What it does**

-Accepts a single fastqsanger sample, a set of paired read samples, or a collections of samples along with associated
-SAMtools idxstats and vSNP zero coverage metrics files and extracts information from them to produce an Excel
-spreadsheet containing statistics for each sample.  Statistics include reference, file size, mean read length, mean
-read quality, reads passing Q30, total reads, all mapped reads, unmapped reads, unmapped reads percentage of total,
-reference with coverage, average depth of coverage and good SNP count.
-
-**Required options**
-
- * **Choose the type for files to be analyzed** - select "Single files" or "Collections of files", then select the appropriate history items (single or paired fastqsanger reads or collections of fastqsanger reads and associated idxstats and vSNP zero coverage metrics files) based on the selected option..
+Accepts associated fastq files, SAMtools idxstats files and **vSNP: add zero coverage** metrics files and extracts information from them
+to produce an Excel spreadsheet containing statistics for each sample.  The samples can be single or paired reads, and all associated inputs
+can be either single files or collections of files.  The output statistics include reference, file size, mean read length, mean read quality,
+reads passing Q30, total reads, all mapped reads, unmapped reads, unmapped reads percentage of total, reference with coverage, average depth
+of coverage and good SNP count.
     </help>
-    <citations>
-        <citation type="bibtex">
-            @misc{None,
-            journal = {None},
-            author = {1. Stuber T},
-            title = {Manuscript in preparation},
-            year = {None},
-            url = {https://github.com/USDA-VS/vSNP},}
-        </citation>
-    </citations>
+    <expand macro="citations"/>
 </tool>