Mercurial > repos > nml > refseq_masher

<tool id="refseq_masher_contains" name="RefSeq Masher Contains" version="0.1.1">
  <!-- One-line summary displayed next to the tool name -->
  <description>Find NCBI RefSeq Genomes contained in your sequences</description>
  <!-- Package dependency that provides the refseq_masher executable called in
       the command section; its version is pinned to the same value as the
       version attribute of this tool (0.1.1). -->
  <requirements>
    <requirement type="package" version="0.1.1">refseq_masher</requirement>
  </requirements>
  <command detect_errors="exit_code">
<![CDATA[

## Stage the chosen input(s) in the job working directory under readable
## names, then run "refseq_masher contains" on the staged files.
##
## Dataset and collection names are user-controlled, so every character
## outside letters, digits, "_", "." and "-" is replaced with "_" before the
## name is used as a symlink name on the shell command line. Names that only
## use those characters are staged exactly as before.

#import re

#if $input.type == 'fasta'
#set $_fasta_name = $re.sub(r'[^\w.-]', '_', str($input.fasta.name))
#set $input_files = '"{}"'.format($_fasta_name)
  ln -s "$input.fasta" $input_files &&
#elif $input.type == 'paired'
#set $_forward_name = $re.sub(r'[^\w.-]', '_', str($input.forward.name))
#set $_reverse_name = $re.sub(r'[^\w.-]', '_', str($input.reverse.name))
## Keep a .gz suffix when the dataset name has one.
#set $_forward_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $_forward_name) else '.fastq'
#set $_reverse_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $_reverse_name) else '.fastq'
## Drop a trailing "_1.<ext>" or "_2.<ext>" so that mates named
## <prefix>_1.* and <prefix>_2.* are staged as <prefix>_1 and <prefix>_2.
#set $_forward = '"{}_1{}"'.format($re.sub(r'_[12]\..+$', '', $_forward_name), $_forward_ext)
#set $_reverse = '"{}_2{}"'.format($re.sub(r'_[12]\..+$', '', $_reverse_name), $_reverse_ext)
#set $input_files = '{} {}'.format($_forward, $_reverse)
  ln -s "$input.forward" $_forward &&
  ln -s "$input.reverse" $_reverse &&
#elif $input.type == 'single'
#set $_single_name = $re.sub(r'[^\w.-]', '_', str($input.single.name))
#set $input_files = '"{}"'.format($_single_name)
  ln -s "$input.single" $input_files &&
#elif $input.type == 'paired_collection'
#set $_pair_name = $re.sub(r'[^\w.-]', '_', str($input.paired_collection.name))
## str() of a dataset is its storage path, which normally does not end in
## ".gz", so gzip is detected from the dataset datatype extension or, failing
## that, from the dataset name (as the "paired" branch does).
#set $_forward_ext = '.fastq.gz' if (str($input.paired_collection.forward.ext).endswith('.gz') or $re.match(r'.*\.gz$', str($input.paired_collection.forward.name))) else '.fastq'
#set $_forward = '"{}_1{}"'.format($_pair_name, $_forward_ext)
#set $_reverse_ext = '.fastq.gz' if (str($input.paired_collection.reverse.ext).endswith('.gz') or $re.match(r'.*\.gz$', str($input.paired_collection.reverse.name))) else '.fastq'
#set $_reverse = '"{}_2{}"'.format($_pair_name, $_reverse_ext)
#set $input_files = '{} {}'.format($_forward, $_reverse)
  ln -s "$input.paired_collection.forward" $_forward &&
  ln -s "$input.paired_collection.reverse" $_reverse &&
#end if

## The three numeric options are declared optional in the inputs section, so
## each flag is only emitted when a value was supplied; otherwise
## refseq_masher falls back to its own default for that option.
refseq_masher
  $adv.verbosity
  contains
  --output refseq_masher-contains.${adv.output_type}
  --output-type $adv.output_type
#if str($adv.top_n_results) not in ('', 'None')
  --top-n-results $adv.top_n_results
#end if
  --parallelism "\${GALAXY_SLOTS:-1}"
#if str($adv.min_identity) not in ('', 'None')
  --min-identity $adv.min_identity
#end if
#if str($adv.max_pvalue) not in ('', 'None')
  --max-pvalue $adv.max_pvalue
#end if
  $input_files

]]>
  </command>
  <inputs>
    <!-- Input selector: the chosen "type" decides which <when> branch (and
         therefore which dataset parameters) is presented; the command section
         branches on the same value. -->
    <conditional name="input">
      <param name="type" type="select" label="Sequence input type">
        <option value="fasta">FASTA</option>
        <option value="paired">Paired-end FASTQs</option>
        <option value="single">Single-end FASTQ</option>
        <option value="paired_collection">Paired-end FASTQ collection</option>
      </param>

      <!-- A single FASTA dataset -->
      <when value="fasta">
        <param name="fasta" type="data" format="fasta" optional="false"
               label="FASTA file"/>
      </when>

      <!-- Two separately selected FASTQ datasets (forward and reverse reads) -->
      <when value="paired">
        <param name="forward" type="data"
               format="fastq,fastqsanger,fastqillumina,fastqsolexa"
               optional="false"
               label="Forward FASTQ file"
               help="Must have ASCII encoded quality scores"/>
        <param name="reverse" type="data"
               format="fastq,fastqsanger,fastqillumina,fastqsolexa"
               optional="false"
               label="Reverse FASTQ file"
               help="File format must match the Forward FASTQ file"/>
      </when>

      <!-- One FASTQ dataset of single-end reads -->
      <when value="single">
        <param name="single" type="data"
               format="fastq,fastqsanger,fastqillumina,fastqsolexa"
               optional="false"
               label="Single-end FASTQ file"/>
      </when>

      <!-- A "paired" dataset collection holding forward and reverse reads -->
      <when value="paired_collection">
        <param name="paired_collection" type="data_collection"
               format="fastq,fastqsanger,fastqillumina,fastqsolexa,fastq.gz,txt"
               collection_type="paired"
               optional="false"
               label="Paired-end FASTQ collection"
               help=""/>
      </when>
    </conditional>
    <!-- Advanced options; each parameter here is referenced as adv.NAME in the
         command section. -->
    <section name="adv" title="Advanced Options" expanded="false">
      <!-- Number of matches to report; 0 means report everything -->
      <param name="top_n_results"
        type="integer"
        label="Top N matches to report (0 to report all)"
        min="0"
        value="0"
        optional="true"
        />
      <!-- This tool runs the "contains" subcommand, which uses Mash Screen
           (see the usage text in the help section), so the label names
           Mash screen rather than Mash dist. -->
      <param name="min_identity"
        type="float" value="0.9" min="0.0" max="1.0"
        label="Mash screen min. identity to report"
        optional="true"
        />
      <param name="max_pvalue"
        type="float" value="0.01" min="0.0" max="1.0"
        label="Mash screen max. p-value to report"
        optional="true"
        />
      <!-- The selected value is used both as the output-type argument and as
           the extension of the file written by the command; the outputs
           section filters on it. -->
      <param name="output_type"
        type="select"
        label="Output type"
        multiple="false">
        <option value="tab" selected="true">Tabular (tab-delimited values)</option>
        <option value="csv">CSV (Comma Separated Values)</option>
      </param>
      <!-- The option value is inserted verbatim before the subcommand name;
           the empty value adds no verbosity flag. -->
      <param name="verbosity"
        type="select"
        label="Logging verbosity">
        <option value="">Error messages only</option>
        <option value="-v">Show warning messages</option>
        <option value="-vv" selected="true">Show info messages</option>
        <option value="-vvv">Show debug messages</option>
      </param>
    </section>
  </inputs>
  <!-- Two mutually exclusive outputs: the filter on each keeps only the one
       whose format matches the output type chosen under Advanced Options.
       Each is collected from the file the command writes into the working
       directory (refseq_masher-contains.csv or refseq_masher-contains.tab). -->
  <outputs>
    <data name="output_path_csv" format="csv"
          label="RefSeq Masher contains table"
          from_work_dir="refseq_masher-contains.csv">
      <filter>adv['output_type'] == 'csv'</filter>
    </data>
    <data name="output_path_tab" format="tabular"
          label="RefSeq Masher contains table"
          from_work_dir="refseq_masher-contains.tab">
      <filter>adv['output_type'] == 'tab'</filter>
    </data>
  </outputs>
  <tests>
    <!-- Single-end FASTQ input, top 5 matches, tabular output compared
         line-for-line against the expected file. -->
    <test>
      <conditional name="input">
        <param name="type" value="single"/>
        <param name="single" value="SRR1203042_1-head4000.fastq"/>
      </conditional>
      <section name="adv">
        <param name="top_n_results" value="5"/>
        <param name="output_type" value="tab"/>
        <param name="min_identity" value="0.9"/>
        <param name="max_pvalue" value="0.01"/>
      </section>
      <output name="output_path_tab"
              value="SRR1203042_1-head4000-contains.tab"
              ftype="tabular"
              lines_diff="0"/>
    </test>
  </tests>
  <help>
<![CDATA[
RefSeq Masher - Containment
===========================

Find what NCBI RefSeq genomes are contained within your sequence data using Mash_ with a Mash sketch database of 54,925 NCBI RefSeq Genomes.


Source code available on Github at https://github.com/phac-nml/refseq_masher


`contains` - find what NCBI RefSeq Genomes are contained in your input sequences
--------------------------------------------------------------------------------

If you have a metagenomic sample or maybe a sample with some contamination, you may be interested in seeing what's in your sample. You can do this with ``refseq_masher contains <INPUT>``::

    Usage: refseq_masher contains [OPTIONS] INPUT...

      Find the NCBI RefSeq genomes contained in your sequence files using Mash
      Screen

      Input is expected to be one or more FASTA/FASTQ files or one or more
      directories containing FASTA/FASTQ files. Files can be Gzipped.

    Options:
      --mash-bin TEXT              Mash binary path (default="mash")
      -o, --output PATH            Output file path (default="-"/stdout)
      --output-type [tab|csv]      Output file type (tab|csv)
      -n, --top-n-results INTEGER  Output top N results sorted by identity in
                                   ascending order (default=0/all)
      -i, --min-identity FLOAT     Mash screen min identity to report
                                   (default=0.9)
      -v, --max-pvalue FLOAT       Mash screen max p-value to report
                                   (default=0.01)
      -p, --parallelism INTEGER    Mash screen parallelism; number of threads to
                                   spawn (default=1)
      -h, --help                   Show this message and exit.


Example - metagenomic sample SAMEA1877340_
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

For this example, we're going to see what RefSeq genomes are contained within sample SAMEA1877340_ from BioProject PRJEB1775_.


Description from BioProject PRJEB1775_:

.. epigraph::

    Design, Setting and Patients Forty-five samples were selected from a set of fecal specimens obtained from patients with diarrhea during the 2011 outbreak of STEC O104:H4 in Germany. Samples were chosen to represent STEC-positive patients with a range of clinical conditions and colony counts together with a small number of patients with other infections (Campylobacter jejuni, Clostridium difficile and Salmonella enterica). Samples were subjected to high-throughput sequencing on the Illumina MiSeq and HiSeq 2500, followed by bioinformatics analysis.


We're going to download the FASTQ files for ERR260489_::

    wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR260/ERR260489/ERR260489_1.fastq.gz
    wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR260/ERR260489/ERR260489_2.fastq.gz


We're going to run `refseq_masher` against these FASTQ files::

    refseq_masher -vv contains --top-n-results 50 -p 12 -o containment-ERR260489.tab ERR260489_1.fastq.gz ERR260489_2.fastq.gz

**Log**::

    2018-01-29 10:59:25,849 INFO: Grouped 2 fastqs into 1 groups [in ...refseq_masher/refseq_masher/utils.py:174]
    2018-01-29 10:59:25,849 INFO: Collected 0 FASTA inputs and 1 read sets [in ...refseq_masher/refseq_masher/utils.py:185]
    2018-01-29 10:59:25,849 INFO: Running Mash Screen with NCBI RefSeq sketch database against sample "ERR260489" with inputs: ['../ERR260489_1.fastq.gz', '../ERR260489_2.fastq.gz'] [in ...refseq_masher/refseq_masher/mash/screen.py:44]
    Loading ...refseq_masher/refseq_masher/data/RefSeqSketches.msh...
       4669418 distinct hashes.
    Streaming from 2 inputs...
       Estimated distinct k-mers in pool: 206836855
    Summing shared...
    Computing coverage medians...
    Writing output...
    2018-01-29 11:00:19,665 INFO: Ran Mash Screen on all input. Merging NCBI taxonomic information into results output. [in ...refseq_masher/refseq_masher/cli.py:134]
    2018-01-29 11:00:19,666 INFO: Fetching all taxonomy info for 23 unique NCBI Taxonomy UIDs [in ...refseq_masher/refseq_masher/taxonomy.py:35]
    2018-01-29 11:00:19,669 INFO: Dropping columns with all NA values (ncol=32) [in ...refseq_masher/refseq_masher/taxonomy.py:38]
    2018-01-29 11:00:19,671 INFO: Columns with all NA values dropped (ncol=12) [in ...refseq_masher/refseq_masher/taxonomy.py:40]
    2018-01-29 11:00:19,671 INFO: Merging Mash results with relevant taxonomic information [in ...refseq_masher/refseq_masher/taxonomy.py:41]
    2018-01-29 11:00:19,674 INFO: Merged Mash results with taxonomy info [in ...refseq_masher/refseq_masher/taxonomy.py:43]
    2018-01-29 11:00:19,674 INFO: Merged taxonomic information into results output [in ...refseq_masher/refseq_masher/cli.py:136]
    2018-01-29 11:00:19,674 INFO: Reordering output columns [in ...refseq_masher/refseq_masher/cli.py:137]
    2018-01-29 11:00:19,677 INFO: Wrote output to "containment-ERR260489.tab" [in ...refseq_masher/refseq_masher/writers.py:20]


**Output**

+-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
|  sample   | top_taxonomy_name                    | identity | shared_hashes  | median_multiplicity  | pvalue | full_taxonomy                                                                                                                                    | taxonomic_subspecies  | taxonomic_species            | taxonomic_genus  | taxonomic_family   | taxonomic_order  | taxonomic_class     | taxonomic_phylum  | taxonomic_superkingdom  | subspecies | serovar | plasmid  | bioproject | biosample | taxid   | assembly_accession  | match_id                                                                                     | taxonomic_species group  | match_comment  |
+===========+======================================+==========+================+======================+========+==================================================================================================================================================+=======================+==============================+==================+====================+==================+=====================+===================+=========================+============+=========+==========+============+===========+=========+=====================+==============================================================================================+==========================+================+
| ERR260489 | Bacteroides fragilis                 | 1.0      | 400/400        | 786                  | 0.0    | Bacteria; FCB group; Bacteroidetes/Chlorobi group; Bacteroidetes; Bacteroidia; Bacteroidales; Bacteroidaceae; Bacteroides; fragilis              |                       | Bacteroides fragilis         | Bacteroides      | Bacteroidaceae     | Bacteroidales    | Bacteroidia         | Bacteroidetes     | Bacteria                |            |         | pLV22a   |            |           | 817     |                     | ./rcn/refseq-NG-817-.-.-.-pLV22a-Bacteroides_fragilis.fna                                    |                          |                |
+-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
| [1 row]   |                                      |          |                |                      |        |                                                                                                                                                  |                       |                              |                  |                    |                  |                     |                   |                         |            |         |          |            |           |         |                     |                                                                                              |                          |                |
+-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
| ERR260489 | Escherichia coli O104:H4 str. E92/11 | 1.0      | 400/400        | 48                   | 0.0    | Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae; Escherichia; coli; O104:H4; str. E92/11                     |                       | Escherichia coli             | Escherichia      | Enterobacteriaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria    | Bacteria                |            |         | pE9211p3 |            |           | 1090927 | NZ_AHAU             | ./rcn/refseq-NZ-1090927-.-.-NZ_AHAU-pE9211p3-Escherichia_coli_O104_H4_str._E92_11.fna        |                          |                |
+-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
| [3 rows]  |                                      |          |                |                      |        |                                                                                                                                                  |                       |                              |                  |                    |                  |                     |                   |                         |            |         |          |            |           |         |                     |                                                                                              |                          |                |
+-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
| ERR260489 | Kingella kingae KKC2005004457        | 1.0      | 400/400        | 5                    | 0.0    | Bacteria; Proteobacteria; Betaproteobacteria; Neisseriales; Neisseriaceae; Kingella; kingae; KKC2005004457                                       |                       | Kingella kingae              | Kingella         | Neisseriaceae      | Neisseriales     | Betaproteobacteria  | Proteobacteria    | Bacteria                |            |         | unnamed  |            |           | 1229911 |                     | ./rcn/refseq-NG-1229911-.-.-.-unnamed-Kingella_kingae_KKC2005004457.fna                      |                          |                |
+-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
| ERR260489 | Bacteroides cellulosilyticus WH2     | 0.99984  | 399/400        | 772                  | 0.0    | Bacteria; FCB group; Bacteroidetes/Chlorobi group; Bacteroidetes; Bacteroidia; Bacteroidales; Bacteroidaceae; Bacteroides; cellulosilyticus; WH2 |                       | Bacteroides cellulosilyticus | Bacteroides      | Bacteroidaceae     | Bacteroidales    | Bacteroidia         | Bacteroidetes     | Bacteria                |            |         | pBWH2B   |            |           | 1268240 | NZ_ATFI             | ./rcn/refseq-NZ-1268240-.-.-NZ_ATFI-pBWH2B-Bacteroides_cellulosilyticus_WH2.fna              |                          |                |
+-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
| [1 row]   |                                      |          |                |                      |        |                                                                                                                                                  |                       |                              |                  |                    |                  |                     |                   |                         |            |         |          |            |           |         |                     |                                                                                              |                          |                |
+-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
| ERR260489 | Klebsiella pneumoniae                | 0.99984  | 399/400        | 4                    | 0.0    | Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae; Klebsiella; pneumoniae                                      |                       | Klebsiella pneumoniae        | Klebsiella       | Enterobacteriaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria    | Bacteria                |            |         | pMRC151  |            |           | 573     |                     | ./rcn/refseq-NG-573-.-.-.-pMRC151-Klebsiella_pneumoniae.fna                                  |                          |                |
+-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
| [37 rows] |                                      |          |                |                      |        |                                                                                                                                                  |                       |                              |                  |                    |                  |                     |                   |                         |            |         |          |            |           |         |                     |                                                                                              |                          |                |
+-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+

Some of the top genomes contained in this sample, sorted by identity and median multiplicity, are:

- *Bacteroides fragilis* - fully contained (400/400) and high multiplicity (786)
- *Escherichia coli* O104:H4 - fully contained (400/400) and median multiplicity of 48
- *Kingella kingae* - fully contained (400/400) and median multiplicity of 5
- *Klebsiella pneumoniae* - 399/400 sketches contained with median multiplicity of 4

So with Mash we are able to find that the sample contained the expected genomic data (especially *E. coli* O104:H4).


Legal
-----

Copyright Government of Canada 2017

Written by: National Microbiology Laboratory, Public Health Agency of Canada

Licensed under the Apache License, Version 2.0 (the "License"); you may not use
this work except in compliance with the License. You may obtain a copy of the
License at:

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed
under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

Contact
-------

**Gary van Domselaar**: gary.vandomselaar@phac-aspc.gc.ca


.. _Mash: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x
.. _SAMEA1877340: https://www.ebi.ac.uk/ena/data/view/SAMEA1877340
.. _PRJEB1775: https://www.ebi.ac.uk/ena/data/view/PRJEB1775
.. _ERR260489: https://www.ebi.ac.uk/ena/data/view/ERR260489&display=html

]]>
  </help>
  <citations>
    <!-- Citation for the Mash paper (the same article the help text links to
         as Mash_), the tool that refseq_masher runs for the containment
         search. -->
    <citation type="doi">10.1186/s13059-016-0997-x</citation>
  </citations>
</tool>
author	nml
date	Thu, 15 Feb 2018 13:59:31 -0500
parents
children	2c1cb37a3ffe