Mercurial > repos > pjbriggs > pal_finder

<tool id="microsat_pal_finder" name="pal_finder" version="0.02.04.8">
  <description>Find microsatellite repeat elements from sequencing reads and design PCR primers to amplify them</description>
  <!-- Shared macro definitions. The @CONDA_PAL_FINDER_*@ tokens used in the
       command and the output_*_microsat_summary macros expanded in the tests
       are not defined in this file; presumably they come from this import
       (TODO confirm against pal_finder_macros.xml). -->
  <macros>
    <import>pal_finder_macros.xml</import>
  </macros>
  <!-- Runtime dependencies. The help text states that the tool targets
       pal_finder 0.02.04 and that the bundled filtering/assembly utility
       relies on BioPython and PANDAseq. -->
  <requirements>
    <requirement type="package" version="0.02.04">pal_finder</requirement>
    <requirement type="package" version="2.7">python</requirement>
    <requirement type="package" version="1.65">biopython</requirement>
    <requirement type="package" version="2.8.1">pandaseq</requirement>
  </requirements>
  <!-- Builds the pal_finder_wrapper.sh command line: the input reads, the two
       outputs that are always produced, then the optional switches selected
       on the tool form. Lines starting with a double hash inside the template
       are Cheetah comments and are not part of the generated command. -->
  <command detect_errors="exit_code"><![CDATA[
  @CONDA_PAL_FINDER_SCRIPT_DIR@ &&
  @CONDA_PAL_FINDER_DATA_DIR@ &&
  bash "$__tool_directory__/pal_finder_wrapper.sh"
  ## Input reads: an R1/R2 pair for Illumina, a single fasta (after --454) for 454
  #if str( $platform.platform_type ) == "illumina"
    #set $paired_input_type = $platform.paired_input_type_conditional.paired_input_type
    #if str( $paired_input_type ) == "pair_of_files"
      "$platform.paired_input_type_conditional.input_fastq_r1"
      "$platform.paired_input_type_conditional.input_fastq_r2"
    #else
      "$platform.paired_input_type_conditional.input_fastq_pair.forward"
      "$platform.paired_input_type_conditional.input_fastq_pair.reverse"
    #end if
  #else
    --454 "$platform.input_fasta"
  #end if
  ## Outputs that are produced on every run
  "$output_microsat_summary" "$output_pal_summary"
  #if $report_bad_primer_ranges
    --bad_primer_ranges "$output_bad_primer_read_ids"
  #end if
  #if $keep_config_file
    --output_config_file "$output_config_file"
  #end if
  --primer-prefix "$primer_prefix"
  --2merMinReps $min_2mer_repeats
  --3merMinReps $min_3mer_repeats
  --4merMinReps $min_4mer_repeats
  --5merMinReps $min_5mer_repeats
  --6merMinReps $min_6mer_repeats
  #if str( $primer.primer_options ) == "custom"
  --primer-opt-size $primer.primer_opt_size
  --primer-min-size $primer.primer_min_size
  --primer-max-size $primer.primer_max_size
  --primer-min-gc $primer.primer_min_gc
  --primer-max-gc $primer.primer_max_gc
  --primer-gc-clamp $primer.primer_gc_clamp
  --primer-max-end-gc $primer.primer_max_end_gc
  --primer-min-tm $primer.primer_min_tm
  --primer-max-tm $primer.primer_max_tm
  --primer-opt-tm $primer.primer_opt_tm
  --primer-pair-max-diff-tm $primer.primer_pair_max_diff_tm
  #end if
  #if str( $mispriming.mispriming_options ) == "custom"
  --primer-mispriming-library "$mispriming.mispriming_library"
  #end if
  #if str( $platform.platform_type ) == "illumina"
    #if $platform.filters
      ## The filtered output is named once; each selected filter is then
      ## passed through as its own flag
      --filter_microsats "$output_filtered_microsats"
      #for $filter in str($platform.filters).split(',')
        $filter
      #end for
    #end if
    #if str( $platform.assembly ) == '-assembly'
      $platform.assembly "$output_assembly"
    #end if
    #set $use_all_reads = $platform.subset_conditional.use_all_reads
    #if str( $use_all_reads ) != "yes"
      --subset "$platform.subset_conditional.subset"
    #end if
  #end if
  ]]></command>
  <inputs>
    <!-- Free-text prefix passed to the wrapper script through its
         primer-prefix option; it is also interpolated into the label of
         every output dataset. -->
    <param name="primer_prefix" type="text" value="test" size="25" label="Primer prefix" help="This prefix will be added to the beginning of all primer names" />
    <!-- Platform selector. The Illumina branch carries the paired-read inputs
         plus the read-subset, filter and assembly options; the 454 branch
         only takes a single fasta file. -->
    <conditional name="platform">
      <param name="platform_type" type="select"
             label="Sequencing platform used to generate data"
             help="Currently pal_finder only handles Illumina paired-end reads and 454 single-end reads">
        <option value="illumina" selected="true">Illumina</option>
        <option value="454">454</option>
      </param>
      <when value="illumina">
        <!-- R1/R2 may be supplied as two datasets or as one paired collection -->
        <conditional name="paired_input_type_conditional">
          <param name="paired_input_type" type="select" label="Input Type">
            <option value="pair_of_files" selected="true">Pair of datasets</option>
            <option value="collection">Dataset collection pair</option>
          </param>
          <when value="pair_of_files">
            <param name="input_fastq_r1" type="data" format="fastqsanger" label="Illumina fastq file (read 1)" />
            <param name="input_fastq_r2" type="data" format="fastqsanger" label="Illumina fastq file (read 2)" />
          </when>
          <when value="collection">
            <param name="input_fastq_pair" format="fastqsanger" type="data_collection" collection_type="paired"
                   label="Select FASTQ dataset collection with R1/R2 pair" />
          </when>
        </conditional>
        <!-- Unchecking "use all reads" reveals the subset size/fraction field -->
        <conditional name="subset_conditional">
          <param name="use_all_reads" type="boolean"
                 label="Use all reads for microsatellite detection?"
                 checked="True" truevalue="yes" falsevalue="no" />
          <when value="no">
            <param name="subset" type="text" value="0.5"
                   label="Number or fraction of reads to use"
                   help="Either an integer number of reads or a decimal fraction (e.g. 0.5 to select 50% of reads)" />
          </when>
          <when value="yes" />
        </conditional>
        <!-- Each option value is the literal flag placed on the command line -->
        <param name="filters" type="select" display="checkboxes" multiple="True"
               label="Filters to apply to the pal_finder results"
               help="Apply none, one or more filters to refine results">
          <option value="-primers" selected="True">Only include loci with designed primers</option>
          <option value="-occurrences" selected="True">Exclude loci where the primer sequences occur more than once in the reads</option>
          <option value="-rankmotifs" selected="True">Only include loci with 'perfect' motifs, and rank by motif size</option>
        </param>
        <!-- The true value doubles as the command-line flag for the assembly step -->
        <param name="assembly" type="boolean" checked="True" truevalue="-assembly" falsevalue=""
               label="Use PANDAseq to assemble paired-end reads and confirm primer sequences are present in high-quality assembly" />
      </when>
      <when value="454">
        <param name="input_fasta" type="data" format="fasta" label="454 fasta file with raw reads" />
      </when>
    </conditional>
    <!-- Minimum repeat counts per n-mer unit size. The 2-mer count must be at
         least one; for the larger units zero means "ignore this unit size",
         so negative values are rejected at the form level. -->
    <param name="min_2mer_repeats" type="integer" value="6" min="1" label="Minimum number of 2-mer repeat units to detect" help="Must detect at least one repeat of this n-mer unit" />
    <param name="min_3mer_repeats" type="integer" value="0" min="0" label="Minimum number of 3-mer repeat units" help="Set to zero to ignore repeats of this n-mer unit" />
    <param name="min_4mer_repeats" type="integer" value="0" min="0" label="Minimum number of 4-mer repeat units" help="Set to zero to ignore repeats of this n-mer unit" />
    <param name="min_5mer_repeats" type="integer" value="0" min="0" label="Minimum number of 5-mer repeat units" help="Set to zero to ignore repeats of this n-mer unit" />
    <param name="min_6mer_repeats" type="integer" value="0" min="0" label="Minimum number of 6-mer repeat units" help="Set to zero to ignore repeats of this n-mer unit" />
    <!-- Mispriming library: keep pal_finder's default, or supply a fasta
         dataset from the history. -->
    <conditional name="mispriming">
      <param name="mispriming_options" type="select"
             label="Mispriming library to use"
             help="Specify file of nucleotide sequences to avoid amplifying (PRIMER_MISPRIMING_LIBRARY)">
        <option value="default">Default from pal_finder</option>
        <option value="custom">Custom sequences from history</option>
      </param>
      <when value="default" />
      <when value="custom">
        <param name="mispriming_library" type="data" format="fasta"
               label="Select mispriming library from history"
               help="Fasta file containing sequences to avoid amplifying" />
      </when>
    </conditional>
    <!-- Primer3 settings: either pal_finder's defaults or a custom set. The
         min/max bounds below enforce the limits already stated in the labels
         and help text (minimum size above zero, maximum size no larger than
         35, GC content as a percentage, end-GC count within the last five
         bases). -->
    <conditional name="primer">
      <param name="primer_options" type="select" label="Primer settings to use" help="Advanced users can customise the settings for primer3 for more control">
        <option value="default">Defaults for pal_finder</option>
        <option value="custom">Custom</option>
      </param>
      <when value="custom">
        <param name="primer_opt_size" type="integer" value="20"
               label="Optimum length (in bases) of a primer (PRIMER_OPT_SIZE)"
               help="Primer3 will attempt to pick primers close to this length" />
        <param name="primer_min_size" type="integer" value="18" min="1"
               label="Minimum acceptable length (in bases) of a primer (PRIMER_MIN_SIZE)"
               help="Must be greater than 0 and less than or equal to PRIMER_MAX_SIZE" />
        <param name="primer_max_size" type="integer" value="30" min="1" max="35"
               label="Maximum acceptable length (in bases) of a primer (PRIMER_MAX_SIZE)"
               help="Currently this parameter cannot be larger than 35. This limit is governed by maximum oligo size for which primer3's melting-temperature is valid" />
        <param name="primer_min_gc" type="float" value="30.0" min="0.0" max="100.0"
               label="Minimum allowable percentage of Gs and Cs in any primer (PRIMER_MIN_GC)" />
        <param name="primer_max_gc" type="float" value="80.0" min="0.0" max="100.0"
               label="Maximum allowable percentage of Gs and Cs in any primer (PRIMER_MAX_GC)" />
        <param name="primer_gc_clamp" type="integer" value="2" min="0"
               label="Specify number of consecutive Gs and Cs at 3' end of both the left and right primer (PRIMER_GC_CLAMP)" />
        <param name="primer_max_end_gc" type="integer" value="5" min="0" max="5"
               label="Maximum number of Gs or Cs allowed in last five 3' bases of a left or right primer (PRIMER_MAX_END_GC)" />
        <param name="primer_min_tm" type="float" value="58.0"
               label="Minimum acceptable melting temperature for a primer oligo (PRIMER_MIN_TM)"
               help="Temperature should be in degrees Celsius" />
        <param name="primer_max_tm" type="float" value="65.0"
               label="Maximum acceptable melting temperature for a primer oligo (PRIMER_MAX_TM)"
               help="Temperature should be in degrees Celsius" />
        <param name="primer_opt_tm" type="float" value="62.0"
               label="Optimum melting temperature for a primer (PRIMER_OPT_TM)"
               help="Temperature should be in degrees Celsius" />
        <param name="primer_pair_max_diff_tm" type="float" value="2.0"
               label="Maximum acceptable difference between melting temperatures of left and right primers (PRIMER_PAIR_MAX_DIFF_TM)"
               help="Temperature should be in degrees Celsius" />
      </when>
      <when value="default" />
    </conditional>
    <!-- Switches for the two optional outputs; each one is referenced by the
         filter of the matching dataset in the outputs section. -->
    <param name="report_bad_primer_ranges"
           type="boolean"
           truevalue="True"
           falsevalue="False"
           label="Output IDs for input reads which generate bad primer product size ranges"
           help="Can be used to screen reads in input Fastqs " />
    <param name="keep_config_file"
           type="boolean"
           truevalue="True"
           falsevalue="False"
           label="Output the config file to the history"
           help="Can be used to run pal_finder outside of Galaxy" />
  </inputs>
  <!-- Output datasets. The PAL summary and the microsatellite type summary
       are always produced; the remaining datasets are only created when the
       corresponding option is enabled on the form. -->
  <outputs>
    <data name="output_pal_summary" format="tabular" label="${tool.name} on ${on_string} for ${primer_prefix}: all microsatellites (full details)" />
    <data name="output_filtered_microsats" format="tabular" label="${tool.name} on ${on_string} for ${primer_prefix}: filtered microsatellites (full details)">
      <filter>platform['platform_type'] == 'illumina' and platform['filters'] is not None</filter>
    </data>
    <data name="output_microsat_summary" format="txt" label="${tool.name} on ${on_string} for ${primer_prefix}: summary of microsatellite types" />
    <data name="output_assembly" format="tabular" label="${tool.name} on ${on_string} for ${primer_prefix}: assembly">
      <!-- 'assembly' only exists in the Illumina branch of the platform
           conditional, so the platform is checked before it is read -->
      <filter>platform['platform_type'] == 'illumina' and platform['assembly'] is True</filter>
    </data>
    <data name="output_bad_primer_read_ids" format="tabular" label="${tool.name} on ${on_string} for ${primer_prefix}: read IDs generating bad primer ranges">
      <filter>report_bad_primer_ranges is True</filter>
    </data>
    <data name="output_config_file" format="txt" label="${tool.name} on ${on_string} for ${primer_prefix}: config file">
      <filter>keep_config_file is True</filter>
    </data>
  </outputs>
  <tests>
    <!-- Illumina R1/R2 datasets with every default left in place (all three
         filters selected, assembly enabled) -->
    <test>
      <param name="platform_type" value="illumina" />
      <param name="input_fastq_r1" value="illuminaPE_r1.fq" ftype="fastqsanger" />
      <param name="input_fastq_r2" value="illuminaPE_r2.fq" ftype="fastqsanger" />
      <expand macro="output_illumina_microsat_summary" />
      <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats.out.re_match" />
      <output name="output_filtered_microsats" compare="re_match" file="illuminaPE_filtered_microsats.out.re_match" />
      <output name="output_assembly" file="illuminaPE_assembly_after_filters.out" />
    </test>
    <!-- Same run as above, with the reads supplied as a paired collection -->
    <test>
      <param name="platform_type" value="illumina" />
      <param name="paired_input_type" value="collection" />
      <param name="input_fastq_pair">
        <collection type="paired">
          <element name="forward" value="illuminaPE_r1.fq" ftype="fastqsanger" />
          <element name="reverse" value="illuminaPE_r2.fq" ftype="fastqsanger" />
        </collection>
      </param>
      <expand macro="output_illumina_microsat_summary" />
      <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats.out.re_match" />
      <output name="output_filtered_microsats" compare="re_match" file="illuminaPE_filtered_microsats.out.re_match" />
      <output name="output_assembly" file="illuminaPE_assembly_after_filters.out" />
    </test>
    <!-- No filters selected; assembly stays at its default (enabled), so the
         PANDAseq assembly output is checked ('-assembly' option) -->
    <test>
      <param name="platform_type" value="illumina" />
      <param name="filters" value="" />
      <param name="input_fastq_r1" value="illuminaPE_r1.fq" ftype="fastqsanger" />
      <param name="input_fastq_r2" value="illuminaPE_r2.fq" ftype="fastqsanger" />
      <expand macro="output_illumina_microsat_summary" />
      <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats.out.re_match" />
      <output name="output_assembly" file="illuminaPE_assembly.out" />
    </test>
    <!-- Single filter: keep only loci with designed primers
         ('-primers' option) -->
    <test>
      <param name="platform_type" value="illumina" />
      <param name="filters" value="-primers" />
      <param name="assembly" value="false" />
      <param name="input_fastq_r1" value="illuminaPE_r1.fq" ftype="fastqsanger" />
      <param name="input_fastq_r2" value="illuminaPE_r2.fq" ftype="fastqsanger" />
      <expand macro="output_illumina_microsat_summary" />
      <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats.out.re_match" />
      <output name="output_filtered_microsats" compare="re_match" file="illuminaPE_filtered_microsats_primers.out.re_match" />
    </test>
    <!-- Single filter: keep only loci whose primers occur once in the reads
         ('-occurrences' option) -->
    <test>
      <param name="platform_type" value="illumina" />
      <param name="filters" value="-occurrences" />
      <param name="assembly" value="false" />
      <param name="input_fastq_r1" value="illuminaPE_r1.fq" ftype="fastqsanger" />
      <param name="input_fastq_r2" value="illuminaPE_r2.fq" ftype="fastqsanger" />
      <expand macro="output_illumina_microsat_summary" />
      <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats.out.re_match" />
      <output name="output_filtered_microsats" compare="re_match" file="illuminaPE_filtered_microsats_occurrences.out.re_match" />
    </test>
    <!-- Single filter: keep and rank loci with perfect motifs
         ('-rankmotifs' option) -->
    <test>
      <param name="platform_type" value="illumina" />
      <param name="filters" value="-rankmotifs" />
      <param name="assembly" value="false" />
      <param name="input_fastq_r1" value="illuminaPE_r1.fq" ftype="fastqsanger" />
      <param name="input_fastq_r2" value="illuminaPE_r2.fq" ftype="fastqsanger" />
      <expand macro="output_illumina_microsat_summary" />
      <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats.out.re_match" />
      <output name="output_filtered_microsats" compare="re_match" file="illuminaPE_filtered_microsats_rankmotifs.out.re_match" />
    </test>
    <!-- Illumina input using a subset (fraction 0.5) of the reads -->
    <test>
      <param name="platform_type" value="illumina" />
      <param name="filters" value="" />
      <param name="assembly" value="false" />
      <param name="use_all_reads" value="no" />
      <param name="subset" value="0.5" />
      <param name="input_fastq_r1" value="illuminaPE_r1.fq" ftype="fastqsanger" />
      <param name="input_fastq_r2" value="illuminaPE_r2.fq" ftype="fastqsanger" />
      <expand macro="output_illumina_microsat_subset_summary" />
      <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats_subset.out.re_match" />
    </test>
    <!-- Illumina input in which no microsatellites are found: the job is
         expected to fail with an explanatory message on stderr -->
    <test expect_failure="true">
      <param name="platform_type" value="illumina" />
      <param name="filters" value="" />
      <param name="assembly" value="false" />
      <param name="min_2mer_repeats" value="8" />
      <param name="input_fastq_r1" value="illuminaPE_r1_no_microsats.fq" ftype="fastqsanger" />
      <param name="input_fastq_r2" value="illuminaPE_r2_no_microsats.fq" ftype="fastqsanger" />
      <assert_stderr>
        <has_text text="pal_finder failed to locate any microsatellites" />
      </assert_stderr>
    </test>
    <!-- Illumina input that generates bad primer product size ranges: runs
         with custom primer settings and checks the optional list of
         offending read IDs. Each parameter is set exactly once. -->
    <test>
      <param name="platform_type" value="illumina" />
      <param name="filters" value="" />
      <param name="assembly" value="false" />
      <param name="input_fastq_r1" value="illuminaPE_r1_bad_ranges.fq" ftype="fastqsanger" />
      <param name="input_fastq_r2" value="illuminaPE_r2_bad_ranges.fq" ftype="fastqsanger" />
      <param name="min_2mer_repeats" value="8" />
      <param name="min_3mer_repeats" value="8" />
      <param name="min_4mer_repeats" value="8" />
      <param name="min_5mer_repeats" value="8" />
      <param name="min_6mer_repeats" value="8" />
      <param name="primer_options" value="custom" />
      <param name="primer_opt_size" value="25" />
      <param name="primer_min_size" value="21" />
      <param name="primer_max_size" value="30" />
      <param name="primer_min_gc" value="40.0" />
      <param name="primer_max_gc" value="60.0" />
      <param name="primer_gc_clamp" value="3" />
      <param name="primer_max_end_gc" value="5" />
      <param name="primer_min_tm" value="60.0" />
      <param name="primer_max_tm" value="80.0" />
      <param name="primer_opt_tm" value="68.0" />
      <param name="primer_pair_max_diff_tm" value="3.0" />
      <param name="report_bad_primer_ranges" value="true" />
      <expand macro="output_illumina_microsat_summary_bad_ranges" />
      <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats_bad_ranges.out.re_match" />
      <output name="output_bad_primer_read_ids" file="illuminaPE_bad_primer_read_ids.out" />
    </test>
    <!-- Invalid n-mer combination (4-mers set to zero while 5-mers are
         non-zero): the job is expected to fail with a message on stderr -->
    <test expect_failure="true">
      <param name="platform_type" value="illumina" />
      <param name="filters" value="" />
      <param name="assembly" value="false" />
      <param name="min_2mer_repeats" value="8" />
      <param name="min_3mer_repeats" value="8" />
      <param name="min_4mer_repeats" value="0" />
      <param name="min_5mer_repeats" value="8" />
      <param name="min_6mer_repeats" value="8" />
      <param name="input_fastq_r1" value="illuminaPE_r1_no_microsats.fq" ftype="fastqsanger" />
      <param name="input_fastq_r2" value="illuminaPE_r2_no_microsats.fq" ftype="fastqsanger" />
      <assert_stderr>
        <has_text text="Minimum number of 4-mers cannot be zero if number of 5-mers is non-zero" />
      </assert_stderr>
    </test>
    <!-- 454 single-end fasta input -->
    <test>
      <param name="platform_type" value="454" />
      <param name="input_fasta" value="454_in.fa" ftype="fasta" />
      <expand macro="output_454_microsat_summary" />
      <output name="output_pal_summary" compare="re_match" file="454_microsats.out.re_match" />
    </test>
  </tests>
  <help>
.. class:: infomark

**What it does**

This tool runs the pal_finder program, which finds microsatellite repeat elements
directly from raw 454 or Illumina paired-end sequencing reads. It then designs PCR
primers to amplify these repeat loci (Potentially Amplifiable Loci: PAL).

Optionally for Illumina data, one or more filters can be applied to the output from
pal_finder to:

 * Only include loci with designed primers
 * Exclude loci where the primer sequences occur more than once in the reads
 * Only include loci with 'perfect' motifs (and rank by motif size, largest to
   smallest)
 * Use PANDAseq to assemble paired-end reads and confirm primer sequences are
   present in high-quality assembly

Pal_finder runs the primer3_core program; information on the settings used in
primer3_core can be found in the Primer3 manual at
http://primer3.sourceforge.net/primer3_manual.htm

-------------

.. class:: infomark

**Known issues**

.. class:: warning

**Low number of reads used for microsatellite detection/bad primer product size ranges**

For some datasets pal_finder may generate 'bad' product size ranges (where the
lower limit exceeds the upper limit) for one or more reads, for input into
primer3_core. In these cases primer3_core will terminate prematurely, which can
result in a substantially lower number of reads being used for microsatellite
detection and potentially sub-optimal primer design.

The number of reads generating bad size ranges is reported in the
*summary of microsatellite types* output dataset as 'readsWithBadRanges'. Ideally
the reported value should be zero.

The conditions which cause this issue within pal_finder are still unclear;
however, we believe it to be associated with short or low quality reads. If this
problem affects your data then:

* Ensure that the input data are sufficiently trimmed and filtered (using
  e.g. the Trimmomatic tool) before rerunning pal_finder.

* A list of read IDs for which pal_finder generates bad product size ranges can
  be output by turning on *Output IDs for input reads which generate bad primer
  product size ranges*. This outputs an additional dataset with a list of read IDs
  which can be used to remove read pairs from the input Fastq files (using e.g. the
  *Filter sequences by ID* tool) before rerunning pal_finder.

.. class:: warning

**Pal_finder takes a long time to run for large input datasets**

pal_finder was originally developed using MiSeq data, and is not optimised for
working with the larger Fastqs that are output from other platforms such as
HiSeq and NextSeq. As a consequence pal_finder may take a very long time to
complete when operating on larger datasets.

If this is a problem then the tool can be run using a subset of the input reads
by unchecking the *Use all reads...* option and entering either an integer number
of reads to use, or a decimal fraction (e.g. 0.5 will select 50% of the reads).

-------------

.. class:: infomark

**Credits**

This Galaxy tool has been developed by Peter Briggs within the Bioinformatics Core
Facility at the University of Manchester. It runs the pal_finder package which can be
obtained from http://sourceforge.net/projects/palfinder/:

 * PLoS One. 2012; 7(2): e30953 "Rapid Microsatellite Identification from Illumina Paired-End
   Genomic Sequencing in Two Birds and a Snake" Todd A. Castoe, Alexander W. Poole, A. P.
   Jason de Koning, Kenneth L. Jones, Diana F. Tomback, Sara J. Oyler-McCance, Jennifer A.
   Fike, Stacey L. Lance, Jeffrey W. Streicher, Eric N. Smith, and David D. Pollock

The paper is available at http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3279355/

This tool is compatible with pal_finder version 0.02.04, which in turn runs the
primer3_core program (version 2.0.0-alpha is required, available from
http://primer3.sourceforge.net/releases.php):

 * Steve Rozen and Helen J. Skaletsky (2000) "Primer3 on the WWW for general users and for
   biologist programmers". In: Krawetz S, Misener S (eds) Bioinformatics Methods and
   Protocols: Methods in Molecular Biology. Humana Press, Totowa, NJ, pp 365-386

The paper is available at
http://purl.com/STEVEROZEN/papers/rozen-and-skaletsky-2000-primer3.pdf

The filtering and assembly of the pal_finder output for Illumina data is performed
using a Python utility written by Graeme Fox at the University of Manchester, and which
is included with this tool; this utility uses the BioPython and PANDAseq packages.

Please kindly acknowledge this Galaxy tool, the pal_finder and primer3 packages, and
the utility script and its dependencies if you use them in your work.
  </help>
  <citations>
    <!--
    See https://wiki.galaxyproject.org/Admin/Tools/ToolConfigSyntax#A.3Ccitations.3E_tag_set
    Can be either DOI or Bibtex
    Use http://www.bioinformatics.org/texmed/ to convert PubMed to Bibtex
    -->
    <!-- pal_finder paper; the article number e30953 matches the PLoS One
         reference given in the help text -->
    <citation type="doi">10.1371/journal.pone.0030953</citation>
    <!-- Primer3 (Rozen and Skaletsky, 2000), supplied as BibTeX rather than
         as a DOI -->
    <citation type="bibtex">@Article{pmid10547847,
    Author="Rozen, S. and Skaletsky, H. ",
    Title="{{P}rimer3 on the {W}{W}{W} for general users and for biologist programmers}",
    Journal="Methods Mol. Biol.",
    Year="2000",
    Volume="132",
    Pages="365--386",
    URL="{http://purl.com/STEVEROZEN/papers/rozen-and-skaletsky-2000-primer3.pdf}"
    }</citation>
    <!-- NOTE(review): presumably the Biopython and PANDAseq papers, i.e. the
         dependencies of the filtering utility named in the help text;
         confirm the DOIs -->
    <citation type="doi">10.1093/bioinformatics/btp163</citation>
    <citation type="doi">10.1186/1471-2105-13-31</citation>
  </citations>
</tool>
author	pjbriggs
date	Wed, 04 Jul 2018 06:05:52 -0400
parents	4e625d3672ba
children