# HG changeset patch
# User greg
# Date 1623865127 0
# Node ID d0fbdeaaa48890c5fe2c87003e4a2882c4a9bd5e
# Parent 2d6c6b01319efe3ca865352d10f91a6eea61b567
"planemo upload for repository https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/sequence_analysis/vsnp/vsnp_statistics commit 770e89322a15829580ed9577a853660f63233f32"

diff -r 2d6c6b01319e -r d0fbdeaaa488 .shed.yml
--- a/.shed.yml Sun Jan 03 15:47:28 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-name: vsnp_statistics
-owner: greg
-description: |
-  Contains a tool that produces an Excel spreadsheet containing statistics for samples and associated metrics files.
-homepage_url: https://github.com/USDA-VS/vSNP
-long_description: |
-  Contains a tool Accepts a single fastqsanger sample, a set of paired read samples, or a collections of samples
-  along with associated SAMtools idxstats and vSNP zero coverage metrics files and extracts information from them
-  to produce an Excel spreadsheet containing statistics for each sample.
-remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/sequence_analysis/vsnp/vsnp_statistics
-type: unrestricted
-categories:
-  - Sequence Analysis
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/13-1941-6_S4_L001_R1_600000.fastq.gz
Binary file test-data/13-1941-6_S4_L001_R1_600000.fastq.gz has changed
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/13-1941-6_S4_L001_R2_600000.fastq.gz
Binary file test-data/13-1941-6_S4_L001_R2_600000.fastq.gz has changed
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/Mcap_Deer_DE_SRR650221.fastq.gz
Binary file test-data/Mcap_Deer_DE_SRR650221.fastq.gz has changed
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/add_zc_metrics.tabular
--- a/test-data/add_zc_metrics.tabular Sun Jan 03 15:47:28 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-# File Number of Good SNPs Average Coverage Genome Coverage
-MarkDuplicates on data 4_ MarkDuplicates BAM output 10.338671 98.74%
-VCFfilter_ on data 7 611
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/add_zc_metrics1.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics1.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File Number of Good SNPs Average Coverage Genome Coverage
+Mcap_Deer_DE_SRR650221_fastq_gz 0.439436 8.27%
+Mcap_Deer_DE_SRR650221_fastq_gz 36
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/add_zc_metrics2.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics2.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File Number of Good SNPs Average Coverage Genome Coverage
+MarkDuplicates on data 4_ MarkDuplicates BAM output 10.338671 98.74%
+VCFfilter_ on data 7 611
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/add_zc_metrics3.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics3.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File Number of Good SNPs Average Coverage Genome Coverage
+13-1941-6_S4_L001_R1_600000_fastq_gz 0.001252 0.13%
+13-1941-6_S4_L001_R1_600000_fastq_gz 0
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/add_zc_metrics4.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics4.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File Number of Good SNPs Average Coverage Genome Coverage
+Mcap_Deer_DE_SRR650221_fastq_gz 0.439436 8.27%
+Mcap_Deer_DE_SRR650221_fastq_gz 36
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/add_zc_metrics5.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics5.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File Number of Good SNPs Average Coverage Genome Coverage
+13-1941-6_S4_L001_600000_fastq 0.002146 0.16%
+13-1941-6_S4_L001_600000_fastq 0
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/samtools_idxstats.tabular
--- a/test-data/samtools_idxstats.tabular Sun Jan 03 15:47:28 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-NC_002945.4 4349904 45 4047
-* 0 0 5
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/samtools_idxstats1.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats1.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4 4349904 17063 0
+* 0 0 223
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/samtools_idxstats2.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats2.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4 4349904 45 4047
+* 0 0 5
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/samtools_idxstats3.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats3.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4 4349904 24 0
+* 0 0 2
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/samtools_idxstats4.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats4.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4 4349904 17063 0
+* 0 0 223
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/samtools_idxstats5.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats5.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4 4349904 46 2
+* 0 0 4
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/vsnp_statistics.xlsx
Binary file test-data/vsnp_statistics.xlsx has changed
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/vsnp_statistics1.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/vsnp_statistics1.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+ Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
+Mcap_Deer_DE_SRR650221_fastq_gz 89 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/vsnp_statistics2.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/vsnp_statistics2.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+ Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
+13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 45 5 0.20 98.74% 10.338671 611
+13-1941-6_S4_L001_R2_600000_fastq_gz 89 8.5 KB 100.0 66.3 1.00 25 45 5 0.20 98.74% 10.338671 611
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/vsnp_statistics3.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/vsnp_statistics3.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+ Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
+13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 24 2 0.08 0.13% 0.001252 0
+Mcap_Deer_DE_SRR650221_fastq_gz 89 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/vsnp_statistics4.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/vsnp_statistics4.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+ Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
+13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 46 4 0.16 0.16% 0.002146 0
+13-1941-6_S4_L001_R2_600000_fastq_gz 89 8.5 KB 100.0 66.3 1.00 25 46 4 0.16 0.16% 0.002146 0
diff -r 2d6c6b01319e -r d0fbdeaaa488 vsnp_statistics.py
--- a/vsnp_statistics.py Sun Jan 03 15:47:28 2021 +0000
+++ b/vsnp_statistics.py Wed Jun 16 17:38:47 2021 +0000
@@ -1,32 +1,14 @@
 #!/usr/bin/env python

 import argparse
+import csv
 import gzip
 import os
-import shutil
+from functools import partial

 import numpy
 import pandas
-
-QUALITYKEY = {'!': '0', '"': '1', '#': '2', '$': '3', '%': '4', '&': '5', "'": '6', '(': '7',
-              ')': '8', '*': '9', '+': '10', ',': '11', '-': '12', '.': '13', '/': '14', '0': '15',
-              '1': '16', '2': '17', '3': '18', '4': '19', '5': '20', '6': '21', '7': '22',
-              '8': '23', '9': '24', ':': '25', ';': '26', '<': '27', '=': '28', '>': '29',
-              '?': '30', '@': '31', 'A': '32', 'B': '33', 'C': '34', 'D': '35', 'E': '36',
-              'F': '37', 'G': '38', 'H': '39', 'I': '40', 'J': '41', 'K': '42', 'L': '43',
-              'M': '44', 'N': '45', 'O': '46', 'P': '47', 'Q': '48', 'R': '49', 'S': '50',
-              'T': '51', 'U': '52', 'V': '53', 'W': '54', 'X': '55', 'Y': '56', 'Z': '57',
-              '_': '1', ']': '1', '[': '1', '\\': '1', '\n': '1', '`': '1', 'a': '1', 'b': '1',
-              'c': '1', 'd': '1', 'e': '1', 'f': '1', 'g': '1', 'h': '1', 'i': '1', 'j': '1',
-              'k': '1', 'l': '1', 'm': '1', 'n': '1', 'o': '1', 'p': '1', 'q': '1', 'r': '1',
-              's': '1', 't': '1', 'u': '1', 'v': '1', 'w': '1', 'x': '1', 'y': '1', 'z': '1',
-              ' ': '1'}
-
-
-def fastq_to_df(fastq_file, gzipped):
-    if gzipped:
-        return pandas.read_csv(gzip.open(fastq_file, "r"), header=None, sep="^")
-    return pandas.read_csv(open(fastq_file, "r"), header=None, sep="^")
+from Bio import SeqIO


 def nice_size(size):
@@ -62,7 +44,20 @@
         metrics_file = metrics_files[i]
         file_name_base = os.path.basename(fastq_file)
         # Read fastq_file into a data frame.
-        fastq_df = fastq_to_df(fastq_file, gzipped)
+        _open = partial(gzip.open, mode='rt') if gzipped else open
+        with _open(fastq_file) as fh:
+            identifiers = []
+            seqs = []
+            letter_annotations = []
+            for seq_record in SeqIO.parse(fh, "fastq"):
+                identifiers.append(seq_record.id)
+                seqs.append(seq_record.seq)
+                letter_annotations.append(seq_record.letter_annotations["phred_quality"])
+        # Convert lists to Pandas series.
+        s1 = pandas.Series(identifiers, name='id')
+        s2 = pandas.Series(seqs, name='seq')
+        # Gather Series into a data frame.
+        fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
         total_reads = int(len(fastq_df.index) / 4)
         current_sample_df = pandas.DataFrame(index=[file_name_base], columns=columns)
         # Reference
@@ -76,19 +71,18 @@
         fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
         dict_mean = {}
         list_length = []
-        for index, row in fastq_df.iterrows():
-            base_qualities = []
-            for base in list(row.array[0]):
-                base_qualities.append(int(QUALITYKEY[base]))
-            dict_mean[index] = numpy.mean(base_qualities)
-            list_length.append(len(row.array[0]))
-        current_sample_df.at[file_name_base, 'Mean Read Length'] = "%.1f" % numpy.mean(list_length)
+        i = 0
+        for id, seq, in fastq_df.iterrows():
+            dict_mean[id] = numpy.mean(letter_annotations[i])
+            list_length.append(len(seq.array[0]))
+            i += 1
+        current_sample_df.at[file_name_base, 'Mean Read Length'] = '%.1f' % numpy.mean(list_length)
         # Mean Read Quality
         df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
-        current_sample_df.at[file_name_base, 'Mean Read Quality'] = "%.1f" % df_mean['ave'].mean()
+        current_sample_df.at[file_name_base, 'Mean Read Quality'] = '%.1f' % df_mean['ave'].mean()
         # Reads Passing Q30
         reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
-        reads_passing_q30 = "{:10.2f}".format(reads_gt_q30 / sampling_size)
+        reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
         current_sample_df.at[file_name_base, 'Reads Passing Q30'] = reads_passing_q30
         # Total Reads
         current_sample_df.at[file_name_base, 'Total Reads'] = total_reads
@@ -99,7 +93,7 @@
         current_sample_df.at[file_name_base, 'Unmapped Reads'] = unmapped_reads
         # Unmapped Reads Percentage of Total
         if unmapped_reads > 0:
-            unmapped_reads_percentage = "{:10.2f}".format(unmapped_reads / total_reads)
+            unmapped_reads_percentage = '{:10.2f}'.format(unmapped_reads / total_reads)
         else:
             unmapped_reads_percentage = 0
         current_sample_df.at[file_name_base, 'Unmapped Reads Percentage of Total'] = unmapped_reads_percentage
@@ -111,12 +105,8 @@
         # Good SNP Count
         current_sample_df.at[file_name_base, 'Good SNP Count'] = good_snp_count
         data_frames.append(current_sample_df)
-    excel_df = pandas.concat(data_frames)
-    excel_file_name = "output.xlsx"
-    writer = pandas.ExcelWriter(excel_file_name, engine='xlsxwriter')
-    excel_df.to_excel(writer, sheet_name='Sheet1')
-    writer.save()
-    shutil.move(excel_file_name, output_file)
+    output_df = pandas.concat(data_frames)
+    output_df.to_csv(output_file, sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\')


 def process_idxstats_file(idxstats_file):
@@ -124,6 +114,7 @@
     unmapped_reads = 0
     with open(idxstats_file, "r") as fh:
         for i, line in enumerate(fh):
+            line = line.rstrip('\r\n')
             items = line.split("\t")
             if i == 0:
                 # NC_002945.4 4349904 213570 4047
@@ -143,6 +134,7 @@
             if i == 0:
                 # Skip comments.
                 continue
+            line = line.rstrip('\r\n')
             items = line.split("\t")
             if i == 1:
                 # MarkDuplicates 10.338671 98.74%
diff -r 2d6c6b01319e -r d0fbdeaaa488 vsnp_statistics.xml
--- a/vsnp_statistics.xml Sun Jan 03 15:47:28 2021 +0000
+++ b/vsnp_statistics.xml Wed Jun 16 17:38:47 2021 +0000
@@ -4,10 +4,10 @@
        macros.xml
+       biopython
        numpy
        pandas
        xlrd
-       xlsxwriter
-
+
@@ -131,7 +131,7 @@
-
+
@@ -141,7 +141,7 @@
-
+
@@ -165,7 +165,7 @@
-
+
@@ -187,7 +187,7 @@
-
+
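For context on the vsnp_statistics.py changes above: the patch drops the hand-rolled QUALITYKEY lookup table and instead parses FASTQ records with Biopython, which exposes decoded Phred scores directly, and it writes the final statistics as a tab-separated file rather than an Excel spreadsheet. The standalone sketch below only illustrates that parsing approach; the function name compute_fastq_stats, its parameters, and the default sampling size are illustrative assumptions and are not part of the tool.

#!/usr/bin/env python
# Illustrative sketch only (not part of the patched tool): it mirrors the
# patch's approach of using Bio.SeqIO on a plain or gzipped fastqsanger file
# to obtain decoded Phred qualities instead of a character lookup table.
import gzip
from functools import partial

import numpy
from Bio import SeqIO


def compute_fastq_stats(fastq_file, gzipped, sampling_size=10000):
    # Open in text mode, transparently handling gzip, as the patch does.
    _open = partial(gzip.open, mode='rt') if gzipped else open
    read_lengths = []
    mean_qualities = []
    with _open(fastq_file) as fh:
        for i, rec in enumerate(SeqIO.parse(fh, 'fastq')):
            if i >= sampling_size:
                break
            read_lengths.append(len(rec.seq))
            # letter_annotations['phred_quality'] holds the per-base scores.
            mean_qualities.append(numpy.mean(rec.letter_annotations['phred_quality']))
    # Assumes the file contains at least one read.
    reads_passing_q30 = sum(1 for q in mean_qualities if q >= 30) / len(mean_qualities)
    return {
        'Mean Read Length': '%.1f' % numpy.mean(read_lengths),
        'Mean Read Quality': '%.1f' % numpy.mean(mean_qualities),
        'Reads Passing Q30': '%.2f' % reads_passing_q30,
    }

Decoding qualities through SeqIO avoids maintaining a manual ASCII-to-score map and yields correct Sanger-encoded values across the whole printable range, which is the motivation for the biopython requirement added to vsnp_statistics.xml.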