diff pal_finder_wrapper.xml @ 8:4e625d3672ba draft

Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
author pjbriggs
date Wed, 16 May 2018 07:39:16 -0400
parents 5e133b7b79a6
children 52dbe2089d14
line wrap: on
line diff
--- a/pal_finder_wrapper.xml	Mon Mar 19 06:33:32 2018 -0400
+++ b/pal_finder_wrapper.xml	Wed May 16 07:39:16 2018 -0400
@@ -1,4 +1,4 @@
-<tool id="microsat_pal_finder" name="pal_finder" version="0.02.04.6">
+<tool id="microsat_pal_finder" name="pal_finder" version="0.02.04.7">
   <description>Find microsatellite repeat elements from sequencing reads and design PCR primers to amplify them</description>
   <macros>
     <import>pal_finder_macros.xml</import>
@@ -9,7 +9,7 @@
     <requirement type="package" version="1.65">biopython</requirement>
     <requirement type="package" version="2.8.1">pandaseq</requirement>
   </requirements>
-  <command><![CDATA[
+  <command detect_errors="exit_code"><![CDATA[
   @CONDA_PAL_FINDER_SCRIPT_DIR@ &&
   @CONDA_PAL_FINDER_DATA_DIR@ &&
   bash $__tool_directory__/pal_finder_wrapper.sh
@@ -26,6 +26,9 @@
     --454 "$platform.input_fasta"
   #end if
   $output_microsat_summary $output_pal_summary
+  #if $report_bad_primer_ranges
+    --bad_primer_ranges "$output_bad_primer_read_ids"
+  #end if
   #if $keep_config_file
     --output_config_file "$output_config_file"
   #end if
@@ -61,6 +64,10 @@
     #if str( $platform.assembly ) == '-assembly'
       $platform.assembly "$output_assembly"
     #end if
+    #set $use_all_reads = $platform.subset_conditional.use_all_reads
+    #if str( $use_all_reads ) != "yes"
+      --subset "$platform.subset_conditional.subset"
+    #end if
   #end if
   ]]></command>
   <inputs>
@@ -88,6 +95,13 @@
 		   label="Select FASTQ dataset collection with R1/R2 pair" />
 	  </when>
 	</conditional>
+	<conditional name="subset_conditional">
+	  <param name="use_all_reads" type="boolean" label="Use all reads for microsatellite detection?" checked="True" truevalue="yes" falsevalue="no" />
+	  <when value="no">
+	    <param name="subset" type="text" value="0.5" label="Number or fraction of reads to use" help="Either an integer number of reads or a decimal fraction (e.g. 0.5 to select 50% of reads)" />
+	  </when>
+	  <when value="yes" />
+	</conditional>
 	<param name="filters" type="select" display="checkboxes"
 	       multiple="True" label="Filters to apply to the pal_finder results"
 	       help="Apply none, one or more filters to refine results">
@@ -103,7 +117,7 @@
 	<param name="input_fasta" type="data" format="fasta" label="454 fasta file with raw reads" />
       </when>
     </conditional>
-    <param name="min_2mer_repeats" type="integer" value="6" label="Minimum number of 2-mer repeat units to detect" help="Set to zero to ignore repeats of this n-mer unit" />
+    <param name="min_2mer_repeats" type="integer" value="6" label="Minimum number of 2-mer repeat units to detect" min="1" help="Must detect at least one repeat of this n-mer unit" />
     <param name="min_3mer_repeats" type="integer" value="0" label="Minimum number of 3-mer repeat units" help="Set to zero to ignore repeats of this n-mer unit" />
     <param name="min_4mer_repeats" type="integer" value="0" label="Minimum number of 4-mer repeat units" help="Set to zero to ignore repeats of this n-mer unit" />
     <param name="min_5mer_repeats" type="integer" value="0" label="Minimum number of 5-mer repeat units" help="Set to zero to ignore repeats of this n-mer unit" />
@@ -155,7 +169,9 @@
 	       label="Maximum acceptable difference between melting temperatures of left and right primers (PRIMER_PAIR_MAX_DIFF_TM)"
 	       help="Temperature should be in degrees Celsius" />
       </when>
+      <when value="default" />
     </conditional>
+    <param name="report_bad_primer_ranges" type="boolean" truevalue="True" falsevalue="False" label="Output IDs for input reads which generate bad primer product size ranges" help="Can be used to screen reads in input Fastqs " />
     <param name="keep_config_file" type="boolean" truevalue="True" falsevalue="False"
 	   label="Output the config file to the history"
 	   help="Can be used to run pal_finder outside of Galaxy" />
@@ -169,6 +185,9 @@
     <data name="output_assembly" format="tabular" label="${tool.name} on ${on_string} for ${primer_prefix}: assembly">
       <filter>platform['assembly'] is True</filter>
     </data>
+    <data name="output_bad_primer_read_ids" format="tabular" label="${tool.name} on ${on_string} for ${primer_prefix}: read IDs generating bad primer ranges">
+      <filter>report_bad_primer_ranges is True</filter>
+    </data>
     <data name="output_config_file" format="txt" label="${tool.name} on ${on_string} for ${primer_prefix}: config file">
       <filter>keep_config_file is True</filter>
     </data>
@@ -247,6 +266,77 @@
       <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats.out.re_match" />
       <output name="output_filtered_microsats" compare="re_match" file="illuminaPE_filtered_microsats_rankmotifs.out.re_match" />
     </test>
+    <!-- Test with Illumina input using a subset of the reads (use_all_reads is "no", subset is 0.5) -->
+    <test>
+      <param name="platform_type" value="illumina" />
+      <param name="filters" value="" />
+      <param name="assembly" value="false" />
+      <param name="use_all_reads" value="no" />
+      <param name="subset" value="0.5" />
+      <param name="input_fastq_r1" value="illuminaPE_r1.fq" ftype="fastqsanger" />
+      <param name="input_fastq_r2" value="illuminaPE_r2.fq" ftype="fastqsanger" />
+      <expand macro="output_illumina_microsat_subset_summary" />
+      <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats_subset.out.re_match" />
+    </test>
+    <!-- Test with Illumina input in which no microsatellites are found:
+	 the job is expected to fail and report this on stderr -->
+    <test expect_failure="true">
+      <param name="platform_type" value="illumina" />
+      <param name="filters" value="" />
+      <param name="assembly" value="false" />
+      <param name="min_2mer_repeats" value="8" />
+      <param name="input_fastq_r1" value="illuminaPE_r1_no_microsats.fq" ftype="fastqsanger" />
+      <param name="input_fastq_r2" value="illuminaPE_r2_no_microsats.fq" ftype="fastqsanger" />
+      <assert_stderr>
+	<has_text text="pal_finder failed to locate any microsatellites" />
+      </assert_stderr>
+    </test>
+    <!-- Test with Illumina input generating bad ranges: uses custom primer
+         settings and also requests the dataset of offending read IDs -->
+    <test>
+      <param name="platform_type" value="illumina" />
+      <param name="filters" value="" />
+      <param name="assembly" value="false" />
+      <param name="input_fastq_r1" value="illuminaPE_r1_bad_ranges.fq" ftype="fastqsanger" />
+      <param name="input_fastq_r2" value="illuminaPE_r2_bad_ranges.fq" ftype="fastqsanger" />
+      <param name="min_2mer_repeats" value="8" />
+      <param name="min_3mer_repeats" value="8" />
+      <param name="min_4mer_repeats" value="8" />
+      <param name="min_5mer_repeats" value="8" />
+      <param name="min_6mer_repeats" value="8" />
+      <param name="primer_options" value="custom" />
+      <param name="primer_opt_size" value="25" />
+      <param name="primer_min_size" value="21" />
+      <param name="primer_max_size" value="30" />
+      <param name="primer_min_gc" value="40.0" />
+      <param name="primer_max_gc" value="60.0" />
+      <param name="primer_gc_clamp" value="3" />
+      <param name="primer_max_end_gc" value="5" />
+      <param name="primer_min_tm" value="60.0" />
+      <param name="primer_max_tm" value="80.0" />
+      <param name="primer_opt_tm" value="68.0" />
+      <param name="primer_pair_max_diff_tm" value="3.0" />
+      <param name="report_bad_primer_ranges" value="true" />
+      <expand macro="output_illumina_microsat_summary_bad_ranges" />
+      <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats_bad_ranges.out.re_match" />
+      <output name="output_bad_primer_read_ids" file="illuminaPE_bad_primer_read_ids.out" />
+    </test>
+    <!-- Test with invalid n-mer settings (4-mers set to zero while 5-mers is non-zero); the job is expected to fail -->
+    <test expect_failure="true">
+      <param name="platform_type" value="illumina" />
+      <param name="filters" value="" />
+      <param name="assembly" value="false" />
+      <param name="min_2mer_repeats" value="8" />
+      <param name="min_3mer_repeats" value="8" />
+      <param name="min_4mer_repeats" value="0" />
+      <param name="min_5mer_repeats" value="8" />
+      <param name="min_6mer_repeats" value="8" />
+      <param name="input_fastq_r1" value="illuminaPE_r1_no_microsats.fq" ftype="fastqsanger" />
+      <param name="input_fastq_r2" value="illuminaPE_r2_no_microsats.fq" ftype="fastqsanger" />
+      <assert_stderr>
+	<has_text text="Minimum number of 4-mers cannot be zero if number of 5-mers is non-zero" />
+      </assert_stderr>
+    </test>
     <!-- Test with 454 input -->
     <test>
       <param name="platform_type" value="454" />
@@ -282,6 +372,52 @@
 
 .. class:: infomark
 
+**Known issues**
+
+.. class:: warning
+
+**Low number of reads used for microsatellite detection/bad primer product size ranges**
+
+For some datasets pal_finder may generate 'bad' product size ranges (where the
+lower limit exceeds the upper limit) for one or more reads, for input into
+primer3_core. In these cases primer3_core will terminate prematurely, which can
+result in a substantially lower number of reads being used for microsatellite
+detection and potentially sub-optimal primer design.
+
+The number of reads generating the bad size ranges is reported in the
+*Summary of microsat types* output dataset as 'readsWithBadRanges'. Ideally
+the reported value should be zero.
+
+The conditions which cause this issue within pal_finder are still unclear;
+however, we believe it to be associated with short or low quality reads. If this
+problem affects your data then:
+
+* Ensure that the input data are sufficiently trimmed and filtered (using
+  e.g. the Trimmomatic tool) before rerunning pal_finder.
+
+* A list of read IDs for which pal_finder generates bad product size ranges can
+  be output by turning on *Output IDs for input reads which generate bad primer
+  product size ranges*. This outputs an additional dataset with a list of read IDs
+  which can be used to remove read pairs from the input Fastq files (using e.g. the
+  *Filter sequences by ID* tool) before rerunning pal_finder.
+
+.. class:: warning
+
+**Pal_finder takes a long time to run for large input datasets**
+
+pal_finder was originally developed using MiSeq data, and is not optimised for
+working with the larger Fastqs that are output from other platforms such as
+HiSeq and NextSeq. As a consequence pal_finder may take a very long time to
+complete when operating on larger datasets.
+
+If this is a problem then the tool can be run using a subset of the input reads
+by unchecking the *Use all reads...* option and entering either an integer number
+of reads to use, or a decimal fraction (e.g. 0.5 will select 50% of the reads).
+
+-------------
+
+.. class:: infomark
+
 **Credits**
 
 This Galaxy tool has been developed by Peter Briggs within the Bioinformatics Core