Mercurial > repos > estrain > srst2

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scoreProfiles.py	Sun Dec 03 13:28:11 2017 -0500
@@ -0,0 +1,82 @@
+#!/usr/bin/env
+
+## Generate basic summary stats for SRST2 (v2) allele score output. Generate summaries for each profile defined in the database
+## author: errol strain, estrain@gmail.com
+
+from argparse import (ArgumentParser, FileType)
+import sys
+import glob
+from decimal import Decimal
+
+def parse_args():
+  "Parse the input arguments, use '-h' for help."
+
+  parser = ArgumentParser(description='Generate Summary Scores for SRST2 Allele Score Output')
+
+  # Read inputs
+  parser.add_argument('--mlst_definitions', type=str, required=True, nargs=1, help='MLST Definitions')
+  parser.add_argument('--output', type=str, required=True, nargs=1, help='MLST Definitions')
+  parser.add_argument('--profile_cov', type=str, required=False, help='Minimum Average Coverage to Report ST Profile',default="98")
+  parser.add_argument('--profile_max_mismatch', type=str, required=False, help='Maximum Number of Mismatches (SNPs)', default="1")
+
+  return parser.parse_args()
+
+args =parse_args()
+
+allHash = {}
+
+# Read in Profile Database
+profiles = open(args.mlst_definitions[0],"r")
+output = open(args.output[0],"w")
+
+# Minimum mean percent coverage for reporting profile
+min_per=float(args.profile_cov)
+# Maximum mean mismatch for reporting profile
+max_mis=float(args.profile_max_mismatch)
+
+# Read in Allele Scores
+# Scores should be in srts2*.scores file
+# Column 0:Allele, 1:Score, 2:Avg Depth, 5:% Coverage, 7:Mismatches, 8:Indels
+scoreFile = open(glob.glob("srst2*.scores")[0],"r")
+scoreFile.readline()
+
+for line in scoreFile.readlines() :
+  els = line.split("\t")
+  vals = els[0].split("_")
+  allHash.update({els[0]:line})
+
+
+# Allele names in first row
+als = profiles.readline().rstrip()
+filehead = als + str("\tMean_Score\tMean_Depth\tMean_%_Coverage\tTotal_Mismatches\tTotal_Indels\n")
+output.write(filehead)
+
+genes = als.split("\t")
+
+for line in profiles.readlines() :
+  line = line.rstrip()
+  els = line.split("\t")
+  alleles = []
+  for i in range(1,len(genes)) :
+    alleles.append(genes[i] + "_" + els[i])
+  mscore=0
+  mdepth=0
+  mcover=0
+  mmisma=0
+  mindel=0
+  for i in alleles :
+    if i in allHash :
+      vals=str(allHash[i]).split("\t")
+      mscore+=float(vals[1])
+      mdepth+=float(vals[2])
+      mcover+=float(vals[5])
+      mmisma+=int(vals[7])
+      mindel+=int(vals[8])
+
+  mscore=round(Decimal(mscore/(len(genes)-1)),5)
+  mdepth=round(Decimal(mdepth/(len(genes)-1)),2)
+  mcover=round(Decimal(mcover/(len(genes)-1)),2)
+
+  if mmisma<=max_mis and mcover>=min_per :
+    output.write(line + "\t" + str(mscore) + "\t" + str(mdepth) + "\t" + str(mcover) + "\t" + str(mmisma) + "\t" + str(mindel) + "\n")
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/srst2v2.xml	Sun Dec 03 13:28:11 2017 -0500
@@ -0,0 +1,168 @@
+<tool id="srst2" name="SRST2 - Short Read Sequence Typer (v2)" version="0.2.0">
+    <requirements>
+        <requirement type="package" version="0.2.0">srst2</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        #if $paired_conditional.sPaired == "paired"
+           ln -s $paired_conditional.fastq1 sample_1.fastq;
+           ln -s $paired_conditional.fastq2 sample_2.fastq;
+        #end if
+
+        srst2
+
+        #if $paired_conditional.sPaired == "single"
+            --input_se $paired_conditional.fastq1
+        #else if $paired_conditional.sPaired == "paired"
+            --input_pe sample_1.fastq sample_2.fastq
+        #end if
+
+        --output srst2out
+        --save_scores
+
+        #if $job_type.job == "mlst"
+          --mlst_definitions $job_type.mlst_definitions
+          --mlst_db $job_type.mlst_db
+          --mlst_delimiter $job_type.mlstdelim
+          --mlst_max_mismatch $job_type.mlst_max_mismatch
+        #else if $job_type.job == "gene"
+          --gene_db $job_type.genedb
+          --gene_max_mismatch $job_type.gene_max_mismatch
+        #end if
+
+        #if $options.select == "advanced"
+          #if $options.min_coverage
+              --min_coverage $options.min_coverage
+          #end if
+          #if $options.max_divergence
+              --max_divergence $options.max_divergence
+          #end if
+          #if $options.min_depth
+              --min_depth $options.min_depth
+          #end if
+          #if $options.min_edge_depth
+              --min_edge_depth $options.min_edge_depth
+          #end if
+          #if $options.prob_err
+              --prob_err $options.prob_err
+          #end if
+          #if $options.stop_after
+              --stop_after $options.stop_after
+          #end if
+              --other "'-p \${GALAXY_SLOTS:-1}
+          #if $options.maxins
+              --maxins $options.maxins
+              --minins $options.minins
+          #end if
+              '"
+          #if $options.mapq
+              --mapq $options.mapq
+          #end if
+          #if $options.baseq
+              --baseq $options.baseq
+          #end if
+      #else
+          --other "'-p \${GALAXY_SLOTS:-1}'"
+      #end if
+
+      #if $job_type.job == "mlst"
+        ; python $__tool_directory__/scoreProfiles.py --mlst_definitions $job_type.mlst_definitions --profile_cov $job_type.profile_cov --profile_max_mismatch $job_type.profile_max_mismatch --output srst2.pscores
+      #end if
+
+    ]]></command>
+    <inputs>
+      <conditional name="paired_conditional">
+        <param name="sPaired" type="select" label="Single-End or Paired-End FASTQ?">
+          <option value="single">Single-end</option>
+          <option value="paired">Paired-end</option>
+        </param>
+        <when value="single">
+          <param name="fastq1" type="data" format="fastq" label="FASTQ file" help="FASTQ" />
+        </when>
+        <when value="paired">
+          <param name="fastq1" type="data" format="fastq" label="Forward FASTQ file" help="FASTQ" />
+          <param name="fastq2" type="data" format="fastq" label="Reverse FASTQ file" help="FASTQ" />
+        </when>
+      </conditional>
+
+      <conditional name="job_type">
+        <param name="job" type="select" label="MLST or Gene Presence/Absence?">
+          <option value="mlst">MLST</option>
+          <option value="gene">Gene</option>
+        </param>
+        <when value="mlst">
+          <param type="data" name="mlst_db" label="Fasta file of MLST alleles" format="fasta" />
+          <param type="data" name="mlst_definitions" label="ST definitions for MLST scheme" format="tabular" />
+          <param type="text" name="mlstdelim" value="_" format="txt" label="Character(s) separating gene name from allele number in MLST database (default &apos;_&apos;)" />
+          <param type="integer" name="mlst_max_mismatch" value="10" format="txt" label="Maximum number of mismatches per read" />
+          <param type="float" name="profile_max_mismatch" value="1" format="txt" label="Maximum number of mismatches for reporting ST profile" />
+          <param type="float" name="profile_cov" value="98" format="txt" label="Minimum mean % coverage for reporting ST profile" />
+
+        </when>
+        <when value="gene">
+          <param name="genedb" type="data" format="fasta" label="Fasta file for gene database" />
+          <param name="gene_max_mismatch" type="integer" value="10" format="txt" label="Maximum number of mistaches per read (default 10)" />
+        </when>
+      </conditional>
+
+      <conditional name="options">
+        <param name="select" type="select" label="Options Type">
+          <option value="basic">Basic</option>
+          <option value="advanced">Advanced</option>
+        </param>
+        <when value="advanced">
+          <param name="min_coverage" type="integer" label="Minimum %coverage cutoff for gene reporting" value="90"/>
+          <param name="max_divergence" type="integer" label="Maximum %divergence cutoff for gene reporting" value="10"/>
+          <param name="min_depth" type="integer" label="Minimum mean depth to flag as dubious allele call" value="5"/>
+          <param name="min_edge_depth" type="integer" label="Minimum edge depth to flag as dubious allele call" value="2"/>
+          <param name="prob_err" type="float" label="Probability of sequencing error" value="0.01"/>
+          <param name="stop_after" type="integer" label="Stop mapping after this number of reads have been mapped (otherwise map all)" optional="true"/>
+          <param name="mapq" type="integer" label="Samtools -q parameter" value="1"/>
+          <param name="baseq" type="integer" label="Samtools -Q parameter" value="20"/>
+          <param name="minins" type="integer" label="Bowtie 2 -I parameter. The minimum fragment length for valid paired-end alignments." value="0" >
+            <validator type="in_range" message="Must be less than -X parameter." min="0"/>
+          </param>
+          <param name="maxins" type="integer" label="Bowtie 2 -X parameter. The maximum fragment length for valid paired-end alignments." value="1000" >
+            <validator type="in_range" message="Must be greater than -I parameter." min="0"/>
+          </param>
+          </when>
+        <when value="basic"/>
+      </conditional>
+
+    </inputs>
+
+    <outputs>
+      <data format="tabular" label="SRST2 Results" name="results" from_work_dir="*.txt"/>
+      <data format="tabular" label="SRST2 Allele Scores" name="scores" from_work_dir="*.scores">
+        <filter>job_type['job'] == "mlst"</filter>
+      </data>
+      <data format="tabular" label="SRST2 Profile Scores" name="pscores" from_work_dir="*.pscores">
+        <filter> job_type['job'] == "mlst"</filter>
+      </data>
+      <data format="tabular" label="SRST2 Predicted Alleles" name="alleles" from_work_dir="*results.txt">
+        <filter> job_type['job'] == "mlst"</filter>
+      </data>
+      <data format="tabular" label="SRST2 Gene Scores" name="gscores" from_work_dir="*.scores">
+        <filter>job_type['job'] == "gene"</filter>
+      </data>
+    </outputs>
+
+    <help><![CDATA[
+
+SRST2 - Short Read Sequence Typer (v2)
+
+This program is designed to take Illumina sequence data, a MLST database and/or a database of gene sequences (e.g. resistance genes, virulence genes, etc) and report the presence of STs and/or reference genes.
+
+    ]]></help>
+
+
+    <citations>
+        <citation type="bibtex">
+        @misc{pope_dashnow_zobel_holt_raven_schultz_inouye_tomita_2014,
+        title={SRST2: Rapid genomic surveillance for public health and hospital microbiology labs},
+        url={https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-014-0090-6},
+        journal={Genome Medicine}, publisher={BioMed Central},
+        author={Pope, Bernard J and Dashnow, Harriet and Zobel, Justin and Holt, Kathryn E and Raven, Lesley-Ann and Schultz, Mark B and Inouye, Michael and Tomita, Takehiro},
+        year={2014}, month={Nov}} ,
+       }</citation>
+    </citations>
+</tool>