view biohansel.xml @ 8:4fa9d5e748d4 draft

planemo upload for repository https://github.com/phac-nml/biohansel commit 759c65786566b73806fccaed05802fe01ec4226a
author nml
date Tue, 21 May 2019 15:05:40 -0400
parents 3360158bb0c5
children e305df0b1af7
line wrap: on
line source

<tool id="biohansel" name="biohansel" version="2.2.0">
  <description>SNP subtyping of genome sequence reads or assemblies</description>
  <requirements>
    <requirement type="package" version="2.2.0">bio_hansel</requirement>
  </requirements>
  <command detect_errors="exit_code">
<![CDATA[

#import re

## Illumina FASTQ naming regular expression (https://github.com/phac-nml/biohansel/issues/38)
## Matches whole names shaped like SAMPLE_S<digits>_L<3 digits>_R<digit>_001.fastq
## with an optional .gz suffix; group 1 is SAMPLE and group 2 is the read digit.
#set global $ILLUMINA_REGEX = $re.compile(r'^([\w\-\_]+)_S\d+_L\d{3}_R(\d)_001\.fastq(\.gz)?$')
## Matches names that start with ID:forward or ID:reverse, optionally followed by
## _<digits>; group 1 is ID. (Presumably the naming produced by fastq-dump style
## downloads, judging by the variable name - confirm against real dataset names.)
#set global $FASTQ_DUMP_REGEX = $re.compile(r'^(\w+|\d+):((forward|reverse)(_\d+)?)')

#def is_gzipped_fastq($data_input)
  ## Report whether a FASTQ data param has a gzip-compressed datatype,
  ## i.e. 'fastqsanger.gz' or 'fastq.gz'.
  #if $data_input.is_of_type('fastqsanger.gz')
    #return True
  #end if
  #return $data_input.is_of_type('fastq.gz')
#end def

#def get_fastq_ext($data_input)
  ## Pick the file extension to give a FASTQ data param's symlink:
  ## '.fastq.gz' for gzipped datatypes, '.fastq' for everything else.
  #if $is_gzipped_fastq($data_input)
    #return '.fastq.gz'
  #end if
  #return '.fastq'
#end def

#def base_sample_name($name)
  ## Reduce a reads dataset name to its base sample name by removing the
  ## read-direction marker. The naming patterns are tried in a fixed order and
  ## the first one that applies wins; a name matching none is returned as is.

  ## 1) Illumina naming: keep the sample part captured by group 1
  #set $hit = $ILLUMINA_REGEX.match($name)
  #if $hit
    #return $hit.group(1)
  #end if

  ## 2) ID:forward / ID:reverse naming: keep the ID captured by group 1
  #set $hit = $FASTQ_DUMP_REGEX.match($name)
  #if $hit
    #return $hit.group(1)
  #end if

  ## 3) An '_R1' / '_R2' marker: drop the marker and the file extension, keeping
  ##    any text that sat between the marker and the first following dot
  #if $re.search(r'_R(1|2)', $name)
    #return $re.sub(r'(.+)_R(1|2)([^\.]*)(\..+)', r'\1\3', $name)
  #end if

  ## 4) A trailing '_<digit>.' marker: drop the marker and the file extension
  #if $re.match(r'.+_\d\.', $name)
    #return $re.sub(r'(.+)_(\d)(\..+)', r'\1', $name)
  #end if

  #return $name
#end def

#def get_paired_fastq_filename($data_input, $name=None, $is_forward=True)
  ## Build the double-quoted symlink file name for one mate of a read pair:
  ## the base sample name followed by '_1' (forward) or '_2' (reverse) and the
  ## FASTQ extension matching the dataset's datatype. When $name is not given
  ## the dataset's own name is used. If the base name already contains that
  ## ending it is returned quoted without appending it again.
  #if $name is None
    #set $name = $data_input.name
  #end if
  #set $sample = $base_sample_name($name)
  #if $is_forward
    #set $mate = '1'
  #else
    #set $mate = '2'
  #end if
  #set $suffix = '_{}{}'.format($mate, $get_fastq_ext($data_input))
  #if $suffix in $sample
    #return '"{}"'.format($sample)
  #end if
  #return '"{}{}"'.format($sample, $suffix)
#end def

## Expose each Galaxy dataset (*.dat) under a sample-derived file name
## (<sample_name>.fasta / .fastq / .fastq.gz) through a symlink, and keep the
## quoted link name(s) in $input_files for the hansel call further down.
#if $input.type == 'paired' or $input.type == 'paired_collection'
  ## Both paired layouts are treated alike once the two mates are picked out
  #if $input.type == 'paired'
    #set $fwd_reads = $input.forward
    #set $rev_reads = $input.reverse
  #else
    #set $fwd_reads = $input.paired_collection.forward
    #set $rev_reads = $input.paired_collection.reverse
  #end if
  #set $fwd_link = $get_paired_fastq_filename($fwd_reads)
  #set $rev_link = $get_paired_fastq_filename($rev_reads, is_forward=False)
  #set $input_files = '{} {}'.format($fwd_link, $rev_link)
  ln -s "$fwd_reads" $fwd_link &&
  ln -s "$rev_reads" $rev_link &&
#elif $input.type == 'fasta'
  ## Assemblies keep the dataset name unchanged
  #set $input_files = '"{}"'.format($input.fasta.name)
  ln -s "$input.fasta" $input_files &&
#elif $input.type == 'single'
  ## Single-end reads keep the dataset name unchanged
  #set $input_files = '"{}"'.format($input.single.name)
  ln -s "$input.single" $input_files &&
#end if

## When the user supplies their own scheme and it is a FASTA dataset, link it
## into the working directory under its dataset name.
#if $type_of_scheme.scheme_check == "custom" and $type_of_scheme.scheme_input.is_of_type('fasta')
  ln -s '$type_of_scheme.scheme_input' '$type_of_scheme.scheme_input.name' &&
#end if

#def get_subtype_metadata_filename($data_input)
  ## Derive a file name for the subtype metadata table that ends in the
  ## extension matching its datatype: '.tab' for tabular, '.csv' for csv.
  ## Returns None for any other datatype.
  #set $dataset_name = $data_input.name
  #if $data_input.is_of_type('tabular')
    #return '{}.tab'.format($dataset_name)
  #end if
  #if $data_input.is_of_type('csv')
    #return '{}.csv'.format($dataset_name)
  #end if
  #return None
#end def

## Link the optional subtype metadata table into the working directory.
## $subtype_metadata_filename stays None unless a table was supplied and a
## file name could be derived for it.
#set global $subtype_metadata_filename = None
#if $subtype_metadata
  #set global $subtype_metadata_filename = $get_subtype_metadata_filename($subtype_metadata)
  #if $subtype_metadata_filename
    ln -s '$subtype_metadata' '$subtype_metadata_filename' &&
  #end if
#end if

#################################
## biohansel command starts here:
#################################
## Results are always written to results.tab, match_results.tab and
## tech_results.tab in the working directory; the thread count comes from
## GALAXY_SLOTS (falling back to 1).
##
## Every optional numeric threshold is tested with str(...) != '' instead of by
## truthiness: an unset optional parameter renders as an empty string, whereas a
## user-entered 0 or 0.0 is a legitimate value that a plain truthiness test
## would silently drop, leaving hansel to fall back to its own default.
hansel
  -vvv
  -t "\${GALAXY_SLOTS:-1}"
  -o results.tab
  -O match_results.tab
  -S tech_results.tab
  $dev_args.use_json
  $input_files
  --scheme
#if $type_of_scheme.scheme_check == "custom"
  '$type_of_scheme.scheme_input.name'
#else
  $type_of_scheme.scheme_type
#end if
#if $subtype_metadata_filename
  --scheme-metadata '$subtype_metadata_filename'
#end if
#if str($kmer_vals.kmer_min) != ''
  --min-kmer-freq $kmer_vals.kmer_min
#end if
#if str($kmer_vals.kmer_max) != ''
  --max-kmer-freq $kmer_vals.kmer_max
#end if
#if str($qc_vals.low_cov_depth_freq) != ''
  --low-cov-depth-freq $qc_vals.low_cov_depth_freq
#end if
#if str($qc_vals.max_missing_kmers) != ''
  --max-missing-kmers $qc_vals.max_missing_kmers
#end if
#if str($qc_vals.min_ambiguous_kmers) != ''
  --min-ambiguous-kmers $qc_vals.min_ambiguous_kmers
#end if
#if str($qc_vals.max_intermediate_kmers) != ''
  --max-intermediate-kmers $qc_vals.max_intermediate_kmers
#end if
#if str($qc_vals.low_coverage_warning) != ''
  --low-cov-warning $qc_vals.low_coverage_warning
#end if
]]>
  </command>
  <inputs>
    <!-- Sequence data selector. The chosen "type" decides which dataset
         parameter(s) are shown, and the command template branches on the same
         value to create the matching symlink(s) before calling hansel. -->
    <conditional name="input">
      <param name="type" type="select" label="Sequence Data Type">
        <option value="fasta">Contigs (FASTA)</option>
        <option value="paired">Paired-end reads (FASTQ)</option>
        <option value="single">Single-end reads (FASTQ)</option>
        <option value="paired_collection">Paired-end reads collection (FASTQ)</option>
      </param>
      <!-- One assembled genome as a FASTA dataset -->
      <when value="fasta">
        <param name="fasta"
          type="data" format="fasta"
          optional="false"
          label="Contigs (FASTA)"
          />
      </when>
      <!-- Forward and reverse reads chosen as two separate datasets -->
      <when value="paired">
        <param name="forward"
          type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz"
          optional="false"
          label="Forward reads (FASTQ)"
          help="Must have ASCII encoded quality scores"
          />
        <param name="reverse"
          type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz"
          optional="false"
          label="Reverse reads (FASTQ)"
          help="File format must match the Forward FASTQ file"
          />
      </when>
      <!-- A single FASTQ dataset of unpaired reads -->
      <when value="single">
        <param name="single"
          type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz"
          optional="false"
          label="Single-end reads (FASTQ)"
          />
      </when>
      <!-- Forward and reverse reads supplied together as one paired collection.
           NOTE(review): 'txt' is accepted here but by none of the other FASTQ
           inputs - confirm this is intentional. -->
      <when value="paired_collection">
        <param name="paired_collection"
          type="data_collection" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz,txt"
          collection_type="paired"
          optional="false"
          label="Paired-end reads collection (FASTQ)"
          />
      </when>
    </conditional>
    <!-- Subtyping scheme selector: either one of the scheme names listed
         below, passed to hansel as the scheme argument, or a user-supplied
         FASTA scheme that the command template symlinks under its dataset
         name and passes by that name. -->
    <conditional name="type_of_scheme">
      <param name="scheme_check" type="select"
        label="Subtyping Schemes"
        help="Use included scheme or custom">
        <option value="standard">Included Biohansel Schemes</option>
        <option value="custom">Select User Custom Scheme</option>
      </param>
      <when value="standard">
        <!-- The option value is the text placed after the scheme flag -->
        <param name="scheme_type" type="select"
          label="Included SNP Subtyping Scheme">
          <option value="heidelberg">Salmonella Heidelberg subtyping scheme</option>
          <option value="enteritidis">Salmonella Enteritidis subtyping scheme</option>
          <option value="typhi">Salmonella Typhi subtyping scheme</option>
          <option value="typhimurium">Salmonella Typhimurium subtyping scheme</option>
          <option value="tb_speciation">Mycobacterium Tuberculosis subtyping scheme</option>
        </param>
      </when>
      <when value="custom">
        <param name="scheme_input" 
          type="data" format="fasta" 
          label="Your Custom Biohansel SNP Subtyping Scheme"/>
      </when>
    </conditional>
    <!-- Optional subtype metadata table. When supplied, the command template
         symlinks it with a .tab or .csv extension (chosen from its datatype)
         and passes it to hansel via the scheme metadata option. -->
    <param name="subtype_metadata"
      type="data" format="tabular,csv"
      optional="true"
      label="Scheme Subtype Metadata Table [Optional]"
      help="CSV or tab-delimited format only. Must contain a 'subtype' column."
      />
    <!-- K-mer frequency thresholds. Both values are optional; the command
         template adds the corresponding hansel option (named in each
         "argument" attribute) only when the field holds a value. -->
    <section name="kmer_vals" title="K-mer Frequency Thresholds" expanded="False">
      <param name="kmer_min" type="integer"
        argument="--min-kmer-freq"
        optional="true"
        min="0" value="8"
        label="Min k-mer frequency/coverage"
        help="default = 8"/>
      <param name="kmer_max" type="integer"
        argument="--max-kmer-freq"
        optional="true"
        min="1" value="1000"
        label="Max k-mer frequency/coverage"
        help="default = 1000"/>
    </section>
    <!-- Quality checking thresholds. Every value is optional; the command
         template adds the corresponding hansel option (named in each
         "argument" attribute) only when the field holds a value. The two float
         parameters are proportions restricted to the range 0 to 1. -->
    <section name="qc_vals" title="Quality Checking Thresholds" expanded="False">
      <param name="low_cov_depth_freq" type="integer"
        argument="--low-cov-depth-freq"
        value="20" min="0"
        optional="true"
        label="QC: Frequency below this coverage are considered low coverage"
        help="default = 20"/>
      <param name="min_ambiguous_kmers" type="integer"
        argument="--min-ambiguous-kmers"
        optional="true"
        value="3" min="0"
        label="QC: Min number of kmers missing for Ambiguous Result"
        help="default = 3"/>
      <param name="max_missing_kmers" type="float"
        argument="--max-missing-kmers"
        optional="true"
        value="0.05" min="0" max="1"
        label="QC: Decimal Proportion of max allowed missing kmers" help="default = 0.05, valid values {0.0 - 1.0}"/>
      <param name="max_intermediate_kmers" type="float"
        argument="--max-intermediate-kmers"
        optional="true"
        value="0.05" min="0" max="1"
        label="QC: Decimal Proportion of max allowed missing kmers for an intermediate subtype"
        help="default = 0.05, valid values {0.0 - 1.0}"/> 
      <!-- NOTE(review): unlike the other thresholds this one declares no
           minimum - confirm whether negative values are meaningful. -->
      <param name="low_coverage_warning" type="integer"
        argument="--low-cov-warning"
        optional="true"
        value="20"
        label="QC: Overall kmer coverage below this value will trigger a low coverage warning"
        help="default = 20"/> 
    </section>
    <!-- Developer options. The boolean renders as the literal "--json" flag on
         the hansel command line when checked and as nothing when unchecked;
         the JSON outputs are only kept when it is checked. -->
    <section name="dev_args" title="Developer Options" expanded="False">
      <param name="use_json"
        type="boolean"
        checked="false"
        truevalue="--json"
        falsevalue=""
        label="Output JSON results"
        help="Use this option if you need JSON representations of the analysis details"/>
    </section>
  </inputs>
  <!-- The three tabular outputs are always collected from the files the
       command writes into the working directory (results.tab,
       match_results.tab and tech_results.tab). The three JSON outputs are
       collected from <table name>.json and only exist when the "Output JSON
       results" developer option is checked (presumably hansel writes those
       files next to each table when given the JSON flag - confirm against
       the hansel documentation). -->
  <outputs>
    <data format="tabular" name="results.tab" from_work_dir="results.tab" label="results.tab"/>
    <data format="tabular" name="match_results.tab" from_work_dir="match_results.tab" label="match_results.tab"/>
    <data format="tabular" name="tech_results.tab" from_work_dir="tech_results.tab" label="tech_results.tab"/>
    <data format="json" name="results.json" from_work_dir="results.tab.json" label="results.json">
      <filter>dev_args['use_json']</filter>
    </data>
    <data format="json" name="match_results.json" from_work_dir="match_results.tab.json" label="match_results.json">
      <filter>dev_args['use_json']</filter>
    </data>
    <data format="json" name="tech_results.json" from_work_dir="tech_results.tab.json" label="tech_results.json">
      <filter>dev_args['use_json']</filter>
    </data>
  </outputs>
  <tests>
    <test>
      <!-- Assembled contigs (FASTA) subtyped with the included Heidelberg
           scheme. results.tab and match_results.tab are compared by size
           within a byte tolerance; tech_results.tab must match line for line. -->
      <conditional name="input">
        <param name="type" value="fasta"/>
        <param name="fasta" value="SRR1002850_SMALL.fasta"/>
      </conditional>
      <!-- "type_of_scheme" is a conditional, not a parameter, so the scheme
           has to be chosen through its nested selector parameters. -->
      <conditional name="type_of_scheme">
        <param name="scheme_check" value="standard"/>
        <param name="scheme_type" value="heidelberg"/>
      </conditional>
      <output name="results.tab"
        value="SRR1002850_SMALL.fasta-results.tab"
        ftype="tabular"
        compare="sim_size"
        delta="1000">
      </output>
      <output name="match_results.tab"
        value="SRR1002850_SMALL.fasta-match_results.tab"
        ftype="tabular"
        compare="sim_size"
        delta="16000">
      </output>
      <output name="tech_results.tab"
        value="SRR1002850_SMALL.fasta-tech_results.tab"
        ftype="tabular"
        lines_diff="0">
      </output>
    </test>
    <test>
      <!-- Paired-end reads (two FASTQ datasets) subtyped with the included
           Heidelberg scheme. tech_results.tab must match line for line;
           results.tab and match_results.tab are compared by size within a
           byte tolerance. -->
      <conditional name="input">
        <param name="type" value="paired"/>
        <param name="forward" value="SRR5646583_SMALL_1.fastq"/>
        <param name="reverse" value="SRR5646583_SMALL_2.fastq"/>
      </conditional>
      <!-- "type_of_scheme" is a conditional, not a parameter, so the scheme
           has to be chosen through its nested selector parameters. -->
      <conditional name="type_of_scheme">
        <param name="scheme_check" value="standard"/>
        <param name="scheme_type" value="heidelberg"/>
      </conditional>
      <output name="tech_results.tab"
        value="SRR5646583_SMALL-tech_results.tab"
        ftype="tabular"
        lines_diff="0">
      </output>
      <output name="results.tab"
        value="SRR5646583_SMALL-results.tab"
        ftype="tabular"
        compare="sim_size"
        delta="1000">
      </output>
      <output name="match_results.tab"
        value="SRR5646583_SMALL-match_results.tab"
        ftype="tabular"
        compare="sim_size"
        delta="16000">
      </output>
    </test>
  </tests>
  <help><![CDATA[
Subtype microbial whole-genome sequencing (WGS) data using single-nucleotide polymorphism (SNP) targeting k-mer subtyping schemes.
 
**Usage**

1) Select the sequence data you wish to subtype (FASTAs or FASTQs)
2) Select subtyping scheme (e.g. *Salmonella enterica* serovar Heidelberg, Enteritidis, Typhimurium, or Typhi subtyping scheme, or your own custom `biohansel` compatible subtyping scheme)
3) [Optional] Select your subtype metadata information table to include subtype metadata along with your subtype results
4) Click ``Execute``

For more information, visit `the biohansel project page <https://github.com/phac-nml/biohansel>`_ or the `biohansel read the docs page <https://bio-hansel.readthedocs.io/en/readthedocs/>`_.


**Example analysis results of a single FASTA file**

Contents of ``results.tab``:

    +------------------+------------+----------------+-------------+------------------------------------------------+---------------------------------------------------------------+-------------------------+-----------------------+----------------------+-------------------------------+---------------------------+------------------------------------+--------------------------+-----------------------------------+------------------------+-----------+------------+
    | sample           | scheme     | scheme_version | subtype     | all_subtypes                                   | kmers_matching_subtype                                        | are_subtypes_consistent | inconsistent_subtypes | n_kmers_matching_all | n_kmers_matching_all_expected | n_kmers_matching_positive | n_kmers_matching_positive_expected | n_kmers_matching_subtype | n_kmers_matching_subtype_expected | file_path              | qc_status | qc_message |
    +------------------+------------+----------------+-------------+------------------------------------------------+---------------------------------------------------------------+-------------------------+-----------------------+----------------------+-------------------------------+---------------------------+------------------------------------+--------------------------+-----------------------------------+------------------------+-----------+------------+
    | SRR1002850_SMALL | heidelberg | 0.5.0          | 2.2.2.2.1.4 | 2; 2.2; 2.2.2; 2.2.2.2; 2.2.2.2.1; 2.2.2.2.1.4 | 2154958-2.2.2.2.1.4; 1037658-2.2.2.2.1.4; 3785187-2.2.2.2.1.4 | True                    |                       | 202                  | 202                           | 17                        | 17                                 | 3                        | 3                                 | SRR1002850_SMALL.fasta | PASS      |            |
    +------------------+------------+----------------+-------------+------------------------------------------------+---------------------------------------------------------------+-------------------------+-----------------------+----------------------+-------------------------------+---------------------------+------------------------------------+--------------------------+-----------------------------------+------------------------+-----------+------------+


Contents of ``match_results.tab``:

    +---------------------------+-----------------------------------+------------+---------------------------------------+-------------+-------------+-------------+-------------+------------------+------------------------+------------+----------------+-----------+------------+
    | kmername                  | seq                               | is_revcomp | contig_id                             | match_index | refposition | subtype     | is_pos_kmer | sample           | file_path              | scheme     | scheme_version | qc_status | qc_message |
    +---------------------------+-----------------------------------+------------+---------------------------------------+-------------+-------------+-------------+-------------+------------------+------------------------+------------+----------------+-----------+------------+
    | 2154958-2.2.2.2.1.4       | GGCGCGCCACGGTTACTCCCCGGTGGTCAGCCG | True       | NODE_1_length_726282_cov_40.4705_ID_1 | 13732       | 2154958     | 2.2.2.2.1.4 | True        | SRR1002850_SMALL | SRR1002850_SMALL.fasta | heidelberg | 0.5.0          | PASS      |            |
    +---------------------------+-----------------------------------+------------+---------------------------------------+-------------+-------------+-------------+-------------+------------------+------------------------+------------+----------------+-----------+------------+
    | negative2131791-2.2.3.1.3 | GCTGGGCGAAATGATGCAGTTCACCACTTGCTC | True       | NODE_1_length_726282_cov_40.4705_ID_1 | 36900       | 2131791     | 2.2.3.1.3   | False       | SRR1002850_SMALL | SRR1002850_SMALL.fasta | heidelberg | 0.5.0          | PASS      |            |
    +---------------------------+-----------------------------------+------------+---------------------------------------+-------------+-------------+-------------+-------------+------------------+------------------------+------------+----------------+-----------+------------+

    *Next 201 lines omitted.*



**Example analysis results of a single FASTQ readset**


Contents of ``results.tab``:

    +------------------+------------+----------------+-------------+------------------------------------------------+------------------------------------------+-------------------------+-----------------------+----------------------+-------------------------------+---------------------------+------------------------------------+--------------------------+-----------------------------------+----------------------------------------------------------+-------------------+-----------+------------+
    | sample           | scheme     | scheme_version | subtype     | all_subtypes                                   | kmers_matching_subtype                   | are_subtypes_consistent | inconsistent_subtypes | n_kmers_matching_all | n_kmers_matching_all_expected | n_kmers_matching_positive | n_kmers_matching_positive_expected | n_kmers_matching_subtype | n_kmers_matching_subtype_expected | file_path                                                | avg_kmer_coverage | qc_status | qc_message |
    +------------------+------------+----------------+-------------+------------------------------------------------+------------------------------------------+-------------------------+-----------------------+----------------------+-------------------------------+---------------------------+------------------------------------+--------------------------+-----------------------------------+----------------------------------------------------------+-------------------+-----------+------------+
    | SRR5646583_SMALL | heidelberg | 0.5.0          | 2.2.1.1.1.1 | 2; 2.2; 2.2.1; 2.2.1.1; 2.2.1.1.1; 2.2.1.1.1.1 | 1983064-2.2.1.1.1.1; 4211912-2.2.1.1.1.1 | True                    |                       | 202                  | 202                           | 20                        | 20                                 | 2                        | 2                                 | ['SRR5646583_SMALL_1.fastq', 'SRR5646583_SMALL_2.fastq'] | 42.631            | PASS      |            |
    +------------------+------------+----------------+-------------+------------------------------------------------+------------------------------------------+-------------------------+-----------------------+----------------------+-------------------------------+---------------------------+------------------------------------+--------------------------+-----------------------------------+----------------------------------------------------------+-------------------+-----------+------------+

Contents of ``match_results.tab``:

    +---------------------+-----------------------------------+------+-------------+-----------+-------------+-------------------+------------------+------------+----------------+-----------+------------+
    | kmername            | seq                               | freq | refposition | subtype   | is_pos_kmer | is_kmer_freq_okay | sample           | scheme     | scheme_version | qc_status | qc_message |
    +---------------------+-----------------------------------+------+-------------+-----------+-------------+-------------------+------------------+------------+----------------+-----------+------------+
    | negative4642573-1.2 | TACCAGGAAGTGCTGGAAGAGTTTAACGAACAT | 62   | 4642573     | 1.2       | False       | True              | SRR5646583_SMALL | heidelberg | 0.5.0          | PASS      |            |
    +---------------------+-----------------------------------+------+-------------+-----------+-------------+-------------------+------------------+------------+----------------+-----------+------------+
    | 21097-2.2.1.1.1     | GCAAATCGCGCCAGTCAAGTCCTCTTTTACCGT | 42   | 21097       | 2.2.1.1.1 | True        | True              | SRR5646583_SMALL | heidelberg | 0.5.0          | PASS      |            |
    +---------------------+-----------------------------------+------+-------------+-----------+-------------+-------------------+------------------+------------+----------------+-----------+------------+

    *Next 202 lines omitted.*


**Example Subtype Metadata**

A column with name `subtype` must exist and should have subtype designations that would appear in your biohansel results. There are no requirements for the number of columns or contents of those columns in the table - they can contain whatever you want.


    +-------------+-------+--------+------------------+
    | subtype     | clade | source | disease_symptoms |
    +-------------+-------+--------+------------------+
    | 1           | I     | geese  | death            |
    +-------------+-------+--------+------------------+
    | 1.1         | I     | moose  | burns            |
    +-------------+-------+--------+------------------+
    | 2.2.1.1.1   | II    | mouse  | boils            |
    +-------------+-------+--------+------------------+
    | 2.2.2.2.1.4 | IIa   | house  | rash             |
    +-------------+-------+--------+------------------+

The `biohansel` results table will be joined with the subtype metadata table on the `subtype` field so if there are subtype metadata for your `biohansel` results, it will show up in the final output table. For example, if you have a sample that produces a result with subtype "1", there will also be columns "clade", "source" and "disease_symptoms" with "I", "geese" and "death", respectively.


Galaxy wrapper written by Matthew Gopez and Peter Kruczkiewicz at the Public Health Agency of Canada, National Microbiology Laboratory.

    ]]></help>
  <citations>
    <!-- BibTeX entry for the biohansel genotyping scheme paper. Author names
         are joined with " and ", which is the separator BibTeX requires, and
         the entry is closed by exactly one brace. -->
    <citation type="bibtex">@ARTICLE{a1,
      title = {A robust genotyping scheme for Salmonella enterica serovar Heidelberg clones circulating in North America},
      author = {Geneviève Labbé and James Robertson and Peter Kruczkiewicz and Marisa Rankin and Matthew Gopez and Chad R. Laing and Philip Mabon and Kim Ziebell and Aleisha R. Reimer and Lorelee Tschetter and Gary Van Domselaar and Sadjia Bekal and Kimberley A. MacDonald and Linda Hoang and Linda Chui and Danielle Daignault and Durda Slavic and Frank Pollari and E. Jane Parmley and David Son and Darian Hole and Elissa Giang and Lok Kan Lee and Jonathan Moffat and Joanne MacKinnon and Roger Johnson and John H.E. Nash},
      url = {https://github.com/phac-nml/biohansel}
      }</citation>
  </citations>
</tool>