Mercurial > repos > nml > refseq_masher

diff matches.xml @ 0:26df66c32861 draft
planemo upload commit 80c22275be05e29208e991019309dfffa9704f39
author: nml
date: Thu, 15 Feb 2018 13:59:31 -0500
children: 2c1cb37a3ffe
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matches.xml	Thu Feb 15 13:59:31 2018 -0500
@@ -0,0 +1,294 @@
+<tool id="refseq_masher_matches" name="RefSeq Masher Matches" version="0.1.1">
+  <description>
+    Find closest matching NCBI RefSeq Genomes to your sequences
+  </description>
+  <requirements>
+    <requirement type="package" version="0.1.1">refseq_masher</requirement>
+  </requirements>
+  <command detect_errors="exit_code">
+<![CDATA[
+
+#import re
+
+#if $input.type == 'fasta'
+#set $input_files = '"{}"'.format($input.fasta.name)
+  ln -s "$input.fasta" $input_files &&
+#elif $input.type == 'paired'
+#set $_forward_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.forward.name) else '.fastq'
+#set $_forward = '"{}_1{}"'.format($re.sub(r'_[12]\..+$', '', $input.forward.name), $_forward_ext)
+#set $_reverse_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.reverse.name) else '.fastq'
+#set $_reverse = '"{}_2{}"'.format($re.sub(r'_[12]\..+$', '', $input.reverse.name), $_reverse_ext)
+#set $input_files = '{} {}'.format($_forward, $_reverse)
+  ln -s "$input.forward" $_forward &&
+  ln -s "$input.reverse" $_reverse &&
+#elif $input.type == 'single'
+#set $input_files = '"{}"'.format($input.single.name)
+  ln -s "$input.single" $input_files &&
+#elif $input.type == 'paired_collection'
+#set $_forward_ext = '.fastq.gz' if $re.match(r'.*\.gz$', str($input.paired_collection.forward)) else '.fastq'
+#set $_forward = '"{}_1{}"'.format($input.paired_collection.name, $_forward_ext)
+#set $_reverse_ext = '.fastq.gz' if $re.match(r'.*\.gz$', str($input.paired_collection.reverse)) else '.fastq'
+#set $_reverse = '"{}_2{}"'.format($input.paired_collection.name, $_reverse_ext)
+#set $input_files = '{} {}'.format($_forward, $_reverse)
+  ln -s "$input.paired_collection.forward" $_forward &&
+  ln -s "$input.paired_collection.reverse" $_reverse &&
+#end if
+
+refseq_masher 
+  $adv.verbosity 
+  matches 
+  --output refseq_masher-matches.${adv.output_type}
+  --output-type $adv.output_type
+  --top-n-results $top_n_results
+#if $adv.min_kmer_threshold
+  --min-kmer-threshold $adv.min_kmer_threshold
+#end if
+  -T "\${TMPDIR:-/tmp}"
+  $input_files
+]]>
+  </command>
+  <inputs>
+    <conditional name="input">
+      <param name="type" type="select" label="Sequence input type">
+        <option value="fasta">Genome FASTA</option>
+        <option value="paired">Paired-end FASTQs</option>
+        <option value="single">Single-end FASTQ</option>
+        <option value="paired_collection">Paired-end FASTQ collection</option>
+      </param>
+      <when value="fasta">
+        <param name="fasta"
+          type="data" format="fasta"
+          optional="false"
+          label="Genome FASTA file"
+          />
+      </when>
+      <when value="paired">
+        <param name="forward"
+          type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa"
+          optional="false"
+          label="Forward FASTQ file"
+          />
+        <param name="reverse"
+          type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa"
+          optional="false"
+          label="Reverse FASTQ file"
+          help="File format must match the Forward FASTQ file"
+          />
+      </when>
+      <when value="single">
+        <param name="single"
+          type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa"
+          optional="false"
+          label="Single-end FASTQ file"
+          />
+      </when>
+      <when value="paired_collection">
+        <param name="paired_collection"
+          type="data_collection" format="fastq,fastqsanger,fastqillumina,fastqsolexa,fastq.gz,txt"
+          collection_type="paired"
+          optional="false"
+          label="Paired-end FASTQ collection"
+          />
+      </when>
+    </conditional>
+    <param name="top_n_results"
+      type="integer"
+      min="0"
+      value="20"
+      optional="true"
+      label="Top N matches to report (set to 0 to report all)"
+      />
+    <section name="adv" title="Advanced Options" expanded="false">
+      <param name="min_kmer_threshold"
+        type="integer"
+        min="1"
+        value="8"
+        optional="true"
+        label="Mash sketch of reads: Minimum copies of each k-mer required to pass noise filter for reads (default=8)"
+        />
+      <param name="output_type"
+        type="select"
+        label="Output type"
+        multiple="false">
+        <option value="tab" selected="true">
+          Tabular (tab-delimited values)
+        </option>
+        <option value="csv">
+          CSV (Comma Separated Values)
+        </option>
+      </param>
+      <param name="verbosity"
+        type="select"
+        label="Logging verbosity">
+        <option value="">Error messages only</option>
+        <option value="-v">Show warning messages</option>
+        <option value="-vv" selected="true">Show info messages</option>
+        <option value="-vvv">Show debug messages</option>
+      </param>
+    </section>
+  </inputs>
+  <outputs>
+    <data name="output_path_csv" 
+      format="csv" 
+      label="RefSeq Masher matches table"
+      from_work_dir="refseq_masher-matches.csv">
+      <filter>adv['output_type'] == 'csv'</filter>
+    </data>
+    <data name="output_path_tab" 
+      format="tabular" 
+      label="RefSeq Masher matches table"
+      from_work_dir="refseq_masher-matches.tab">
+      <filter>adv['output_type'] == 'tab'</filter>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <conditional name="input">
+        <param name="type" value="fasta"/>
+        <param name="fasta" value="Se-Enteritidis.fasta"/>
+      </conditional>
+      <param name="top_n_results" value="1"/>
+      <section name="adv">
+        <param name="output_type" value="tab"/>
+      </section>
+      <output name="output_path_tab"
+        value="Se-Enteritidis-refseq_masher-matches.tab"
+        ftype="tabular"
+        lines_diff="0">
+      </output>
+    </test>
+    <test>
+      <conditional name="input">
+        <param name="type" value="single"/>
+        <param name="single" value="SRR1203042_1-head4000.fastq"/>
+      </conditional>
+      <param name="top_n_results" value="1"/>
+      <section name="adv">
+        <param name="output_type" value="tab"/>
+        <param name="min_kmer_threshold" value="2"/>
+      </section>
+      <output name="output_path_tab"
+        value="SRR1203042_1-head4000-refseq_masher-matches-m2.tab"
+        ftype="tabular"
+        lines_diff="0">
+      </output>
+    </test>
+  </tests>
+  <help>
+<![CDATA[
+RefSeq Masher - Genomic Distance
+================================
+
+Find what NCBI RefSeq genomes most closely match your sequence data using Mash_ with a Mash sketch database of 54,925 NCBI RefSeq Genomes.
+
+
+Source code available on Github at https://github.com/phac-nml/refseq_masher
+
+
+`matches` - find the closest matching NCBI RefSeq Genomes in your input sequences
+---------------------------------------------------------------------------------
+
+Command-line usage information::
+
+    Usage: refseq_masher matches [OPTIONS] INPUT...
+
+      Find NCBI RefSeq genome matches for an input genome fasta file
+
+      Input is expected to be one or more FASTA/FASTQ files or one or more
+      directories containing FASTA/FASTQ files. Files can be Gzipped.
+
+    Options:
+      --mash-bin TEXT                 Mash binary path (default="mash")
+      -o, --output PATH               Output file path (default="-"/stdout)
+      --output-type [tab|csv]         Output file type (tab|csv)
+      -n, --top-n-results INTEGER     Output top N results sorted by distance in
+                                      ascending order (default=5)
+      -m, --min-kmer-threshold INTEGER
+                                      Mash sketch of reads: "Minimum copies of
+                                      each k-mer required to pass noise filter for
+                                      reads" (default=8)
+      -h, --help                      Show this message and exit.
+
+
+Example
+~~~~~~~
+
+With the FNA.GZ_ file for *Salmonella enterica* subsp. enterica serovar Enteritidis str. CHS44_::
+
+
+    # download sequence file
+    wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/GCF_000329025.1_ASM32902v1_genomic.fna.gz
+
+    # find RefSeq matches
+    refseq_masher -vv matches GCF_000329025.1_ASM32902v1_genomic.fna.gz
+
+
+**Log**::
+
+
+    2018-01-29 11:02:13,786 INFO: Collected 1 FASTA inputs and 0 read sets [in ...refseq_masher/refseq_masher/utils.py:185]
+    2018-01-29 11:02:13,786 INFO: Creating Mash sketch file for ...refseq_masher/GCF_000329025.1_ASM32902v1_genomic.fna.gz [in ...refseq_masher/refseq_masher/mash/sketch.py:24]
+    2018-01-29 11:02:14,055 INFO: Created Mash sketch file at "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" [in ...refseq_masher/refseq_masher/mash/sketch.py:40]
+    2018-01-29 11:02:14,613 INFO: Ran Mash dist successfully (output length=11647035). Parsing Mash dist output [in ...refseq_masher/refseq_masher/mash/dist.py:64]
+    2018-01-29 11:02:15,320 INFO: Parsed Mash dist output into Pandas DataFrame with 54924 rows [in ...refseq_masher/refseq_masher/mash/dist.py:67]
+    2018-01-29 11:02:15,321 INFO: Deleting temporary sketch file "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" [in ...refseq_masher/refseq_masher/mash/dist.py:72]
+    2018-01-29 11:02:15,321 INFO: Sketch file "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" deleted! [in ...refseq_masher/refseq_masher/mash/dist.py:74]
+    2018-01-29 11:02:15,322 INFO: Ran Mash dist on all input. Merging NCBI taxonomic information into results output. [in ...refseq_masher/refseq_masher/cli.py:88]
+    2018-01-29 11:02:15,323 INFO: Fetching all taxonomy info for 5 unique NCBI Taxonomy UIDs [in ...refseq_masher/refseq_masher/taxonomy.py:35]
+    2018-01-29 11:02:15,325 INFO: Dropping columns with all NA values (ncol=32) [in ...refseq_masher/refseq_masher/taxonomy.py:38]
+    2018-01-29 11:02:15,327 INFO: Columns with all NA values dropped (ncol=11) [in ...refseq_masher/refseq_masher/taxonomy.py:40]
+    2018-01-29 11:02:15,327 INFO: Merging Mash results with relevant taxonomic information [in ...refseq_masher/refseq_masher/taxonomy.py:41]
+    2018-01-29 11:02:15,329 INFO: Merged Mash results with taxonomy info [in ...refseq_masher/refseq_masher/taxonomy.py:43]
+    2018-01-29 11:02:15,329 INFO: Merged taxonomic info into results output [in ...refseq_masher/refseq_masher/cli.py:90]
+    2018-01-29 11:02:15,329 INFO: Reordering output columns [in ...refseq_masher/refseq_masher/cli.py:91]
+    2018-01-29 11:02:15,331 INFO: Writing output to stdout [in ...refseq_masher/refseq_masher/writers.py:16]
+
+
+**Output**
+
++---------------------------------------+--------------------------------------------------------------------+----------+--------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------+---------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+-------------+---------+-------------+--------------+--------+---------------------+------------------------------------------------------------------------------------------------------------------------------------------+
+|  sample                               | top_taxonomy_name                                                  | distance | pvalue | matching | full_taxonomy                                                                                                                                               | taxonomic_subspecies                | taxonomic_species   | taxonomic_genus  | taxonomic_family   | taxonomic_order  | taxonomic_class     | taxonomic_phylum  | taxonomic_superkingdom  | subspecies | serovar     | plasmid | bioproject  | biosample    | taxid  | assembly_accession  | match_id                                                                                                                                 |
++=======================================+====================================================================+==========+========+==========+=============================================================================================================================================================+=====================================+=====================+==================+====================+==================+=====================+===================+=========================+============+=============+=========+=============+==============+========+=====================+==========================================================================================================================================+
+| GCF_000329025.1_ASM32902v1_genomic    | Salmonella enterica subsp. enterica serovar Enteritidis str. CHS44 | 0.0      | 0.0    | 400/400  | Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae; Salmonella; enterica; subsp. enterica; serovar Enteritidis; str. CHS44 | Salmonella enterica subsp. enterica | Salmonella enterica | Salmonella       | Enterobacteriaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria    | Bacteria                | enterica   | Enteritidis |         | PRJNA185053 | SAMN01041154 | 702979 | NZ_ALFF             | ./rcn/refseq-NZ-702979-PRJNA185053-SAMN01041154-NZ_ALFF-.-Salmonella_enterica_subsp._enterica_serovar_Enteritidis_str._CHS44.fna         |
++---------------------------------------+--------------------------------------------------------------------+----------+--------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------+---------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+-------------+---------+-------------+--------------+--------+---------------------+------------------------------------------------------------------------------------------------------------------------------------------+
+
+
+The top match is *Salmonella enterica* subsp. enterica serovar Enteritidis str. CHS44_ with a distance of 0.0 and 400/400 sketches matching, which is what we expected. There's other taxonomic information available in the results table that may be useful. 
+
+
+
+Legal
+-----
+
+Copyright Government of Canada 2017
+
+Written by: National Microbiology Laboratory, Public Health Agency of Canada
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use
+this work except in compliance with the License. You may obtain a copy of the
+License at:
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+Contact
+-------
+
+**Gary van Domselaar**: gary.vandomselaar@phac-aspc.gc.ca
+
+.. _Mash: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x
+.. _FNA.GZ: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/GCF_000329025.1_ASM32902v1_genomic.fna.gz
+.. _CHS44: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/
+
+
+]]>
+  </help>
+  <citations>
+    <!-- Citation for Mash paper -->
+    <citation type="doi">10.1186/s13059-016-0997-x</citation>
+  </citations>
+</tool>
author	nml
date	Thu, 15 Feb 2018 13:59:31 -0500
parents
children	2c1cb37a3ffe