comparison contains.xml @ 0:26df66c32861 draft

planemo upload commit 80c22275be05e29208e991019309dfffa9704f39
author nml
date Thu, 15 Feb 2018 13:59:31 -0500
parents
children 2c1cb37a3ffe
comparison
equal deleted inserted replaced
-1:000000000000 0:26df66c32861
1 <tool id="refseq_masher_contains" name="RefSeq Masher Contains" version="0.1.1">
2 <description>
3 Find NCBI RefSeq Genomes contained in your sequences
4 </description>
5 <requirements> <!-- Pins refseq_masher 0.1.1, the same version string the <tool> element declares -->
6 <requirement type="package" version="0.1.1">refseq_masher</requirement>
7 </requirements>
8 <command detect_errors="exit_code">
9 <![CDATA[
10 ## Inputs are symlinked under shell-safe names (every character outside [A-Za-z0-9_.-] becomes "_") so user-chosen dataset names cannot break or inject into the command line.
11 #import re
12 #set $_unsafe = $re.compile(r'[^\w.-]')
13 #if $input.type == 'fasta'
14 #set $input_files = '"{}"'.format($_unsafe.sub('_', $input.fasta.name))
15 ln -s "$input.fasta" $input_files &&
16 #elif $input.type == 'paired'
17 #set $_forward_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.forward.name) else '.fastq'
18 #set $_forward = '"{}_1{}"'.format($_unsafe.sub('_', $re.sub(r'_[12]\..+$', '', $input.forward.name)), $_forward_ext)
19 #set $_reverse_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.reverse.name) else '.fastq'
20 #set $_reverse = '"{}_2{}"'.format($_unsafe.sub('_', $re.sub(r'_[12]\..+$', '', $input.reverse.name)), $_reverse_ext)
21 #set $input_files = '{} {}'.format($_forward, $_reverse)
22 ln -s "$input.forward" $_forward &&
23 ln -s "$input.reverse" $_reverse &&
24 #elif $input.type == 'single'
25 #set $input_files = '"{}"'.format($_unsafe.sub('_', $input.single.name))
26 ln -s "$input.single" $input_files &&
27 #elif $input.type == 'paired_collection'
28 #set $_forward_ext = '.fastq.gz' if ($re.match(r'.*\.gz$', $input.paired_collection.forward.name) or str($input.paired_collection.forward.ext).endswith('.gz')) else '.fastq'
29 #set $_forward = '"{}_1{}"'.format($_unsafe.sub('_', $input.paired_collection.name), $_forward_ext)
30 #set $_reverse_ext = '.fastq.gz' if ($re.match(r'.*\.gz$', $input.paired_collection.reverse.name) or str($input.paired_collection.reverse.ext).endswith('.gz')) else '.fastq'
31 #set $_reverse = '"{}_2{}"'.format($_unsafe.sub('_', $input.paired_collection.name), $_reverse_ext)
32 #set $input_files = '{} {}'.format($_forward, $_reverse)
33 ln -s "$input.paired_collection.forward" $_forward &&
34 ln -s "$input.paired_collection.reverse" $_reverse &&
35 #end if
36 ## Collection members are tested for gzip by dataset name or datatype extension: str(dataset) is the storage path used as the link target above, which normally does not keep a .gz suffix.
37 refseq_masher
38 $adv.verbosity
39 contains
40 --output refseq_masher-contains.${adv.output_type}
41 --output-type $adv.output_type
42 --top-n-results $adv.top_n_results
43 --parallelism "\${GALAXY_SLOTS:-1}"
44 --min-identity $adv.min_identity
45 --max-pvalue $adv.max_pvalue
46 $input_files
47
48 ]]>
49 </command>
50 <inputs> <!-- "input" selects how sequences are supplied; the "adv" section holds reporting thresholds, output type and log verbosity -->
51 <conditional name="input">
52 <param name="type" type="select" label="Sequence input type">
53 <option value="fasta">FASTA</option>
54 <option value="paired">Paired-end FASTQs</option>
55 <option value="single">Single-end FASTQ</option>
56 <option value="paired_collection">Paired-end FASTQ collection</option>
57 </param>
58 <when value="fasta">
59 <param name="fasta"
60 type="data" format="fasta"
61 optional="false"
62 label="FASTA file"
63 />
64 </when>
65 <when value="paired">
66 <param name="forward"
67 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa"
68 optional="false"
69 label="Forward FASTQ file"
70 help="Must have ASCII encoded quality scores"
71 />
72 <param name="reverse"
73 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa"
74 optional="false"
75 label="Reverse FASTQ file"
76 help="File format must match the Forward FASTQ file"
77 />
78 </when>
79 <when value="single">
80 <param name="single"
81 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa"
82 optional="false"
83 label="Single-end FASTQ file"
84 />
85 </when>
86 <when value="paired_collection">
87 <param name="paired_collection"
88 type="data_collection" format="fastq,fastqsanger,fastqillumina,fastqsolexa,fastq.gz,txt"
89 collection_type="paired"
90 optional="false"
91 label="Paired-end FASTQ collection"
92 help=""
93 />
94 </when>
95 </conditional>
96 <section name="adv" title="Advanced Options" expanded="false"> <!-- The command template passes every numeric value below unconditionally, so none of them may be left blank -->
97 <param name="top_n_results"
98 type="integer"
99 label="Top N matches to report (0 to report all)"
100 min="0"
101 value="0"
102 optional="false"
103 />
104 <param name="min_identity"
105 type="float" value="0.9" min="0.0" max="1.0"
106 label="Mash screen min. identity to report"
107 optional="false"
108 />
109 <param name="max_pvalue"
110 type="float" value="0.01" min="0.0" max="1.0"
111 label="Mash screen max. p-value to report"
112 optional="false"
113 />
114 <param name="output_type"
115 type="select"
116 label="Output type"
117 multiple="false">
118 <option value="tab" selected="true">Tabular (tab-delimited values)</option>
119 <option value="csv">CSV (Comma Separated Values)</option>
120 </param>
121 <param name="verbosity"
122 type="select"
123 label="Logging verbosity">
124 <option value="">Error messages only</option>
125 <option value="-v">Show warning messages</option>
126 <option value="-vv" selected="true">Show info messages</option>
127 <option value="-vvv">Show debug messages</option>
128 </param>
129 </section>
130 </inputs>
131 <outputs> <!-- One dataset per output type; each filter keeps only the dataset matching adv.output_type -->
132 <data
133 name="output_path_csv"
134 format="csv"
135 label="RefSeq Masher contains table"
136 from_work_dir="refseq_masher-contains.csv">
137 <filter>adv['output_type'] == 'csv'</filter> <!-- taken from refseq_masher-contains.csv in the job working directory -->
138 </data>
139 <data
140 name="output_path_tab"
141 format="tabular"
142 label="RefSeq Masher contains table"
143 from_work_dir="refseq_masher-contains.tab">
144 <filter>adv['output_type'] == 'tab'</filter> <!-- taken from refseq_masher-contains.tab in the job working directory -->
145 </data>
146 </outputs>
147 <tests>
148 <test> <!-- Single-end FASTQ input, top 5 results, tabular output; the result must match the expected file with zero differing lines -->
149 <conditional name="input">
150 <param name="type" value="single"/>
151 <param name="single" value="SRR1203042_1-head4000.fastq"/>
152 </conditional>
153 <section name="adv">
154 <param name="top_n_results" value="5"/>
155 <param name="output_type" value="tab"/>
156 <param name="min_identity" value="0.9"/>
157 <param name="max_pvalue" value="0.01"/>
158 </section>
159 <output name="output_path_tab"
160 value="SRR1203042_1-head4000-contains.tab"
161 ftype="tabular"
162 lines_diff="0">
163 </output>
164 </test>
165 </tests>
166 <help>
167 <![CDATA[
168 RefSeq Masher - Containment
169 ===========================
170
171 Find what NCBI RefSeq genomes are contained within your sequence data using Mash_ with a Mash sketch database of 54,925 NCBI RefSeq Genomes.
172
173
174 Source code available on Github at https://github.com/phac-nml/refseq_masher
175
176
177 `contains` - find what NCBI RefSeq Genomes are contained in your input sequences
178 --------------------------------------------------------------------------------
179
180 If you have a metagenomic sample, or a sample with some contamination, you may want to see what is in it. You can do this with `refseq_masher contains <INPUT>`::
181
182 Usage: refseq_masher contains [OPTIONS] INPUT...
183
184 Find the NCBI RefSeq genomes contained in your sequence files using Mash
185 Screen
186
187 Input is expected to be one or more FASTA/FASTQ files or one or more
188 directories containing FASTA/FASTQ files. Files can be Gzipped.
189
190 Options:
191 --mash-bin TEXT Mash binary path (default="mash")
192 -o, --output PATH Output file path (default="-"/stdout)
193 --output-type [tab|csv] Output file type (tab|csv)
194 -n, --top-n-results INTEGER Output top N results sorted by identity in
195 ascending order (default=0/all)
196 -i, --min-identity FLOAT Mash screen min identity to report
197 (default=0.9)
198 -v, --max-pvalue FLOAT Mash screen max p-value to report
199 (default=0.01)
200 -p, --parallelism INTEGER Mash screen parallelism; number of threads to
201 spawn (default=1)
202 -h, --help Show this message and exit.
203
204
205 Example - metagenomic sample SAMEA1877340_
206 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
207
208 For this example, we're going to see what RefSeq genomes are contained within sample SAMEA1877340_ from BioProject PRJEB1775_.
209
210
211 Description from BioProject PRJEB1775_:
212
213 .. epigraph::
214
215 Design, Setting and Patients Forty-five samples were selected from a set of fecal specimens obtained from patients with diarrhea during the 2011 outbreak of STEC O104:H4 in Germany. Samples were chosen to represent STEC-positive patients with a range of clinical conditions and colony counts together with a small number of patients with other infections (Campylobacter jejuni, Clostridium difficile and Salmonella enterica). Samples were subjected to high-throughput sequencing on the Illumina MiSeq and HiSeq 2500, followed by bioinformatics analysis.
216
217
218 We're going to download the FASTQ files for ERR260489_::
219
220 wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR260/ERR260489/ERR260489_1.fastq.gz
221 wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR260/ERR260489/ERR260489_2.fastq.gz
222
223
224 We're going to run `refseq_masher` against these FASTQ files::
225
226 refseq_masher -vv contains --top-n-results 50 -p 12 -o containment-ERR260489.tab ERR260489_1.fastq.gz ERR260489_2.fastq.gz
227
228 **Log**::
229
230 2018-01-29 10:59:25,849 INFO: Grouped 2 fastqs into 1 groups [in ...refseq_masher/refseq_masher/utils.py:174]
231 2018-01-29 10:59:25,849 INFO: Collected 0 FASTA inputs and 1 read sets [in ...refseq_masher/refseq_masher/utils.py:185]
232 2018-01-29 10:59:25,849 INFO: Running Mash Screen with NCBI RefSeq sketch database against sample "ERR260489" with inputs: ['../ERR260489_1.fastq.gz', '../ERR260489_2.fastq.gz'] [in ...refseq_masher/refseq_masher/mash/screen.py:44]
233 Loading ...refseq_masher/refseq_masher/data/RefSeqSketches.msh...
234 4669418 distinct hashes.
235 Streaming from 2 inputs...
236 Estimated distinct k-mers in pool: 206836855
237 Summing shared...
238 Computing coverage medians...
239 Writing output...
240 2018-01-29 11:00:19,665 INFO: Ran Mash Screen on all input. Merging NCBI taxonomic information into results output. [in ...refseq_masher/refseq_masher/cli.py:134]
241 2018-01-29 11:00:19,666 INFO: Fetching all taxonomy info for 23 unique NCBI Taxonomy UIDs [in ...refseq_masher/refseq_masher/taxonomy.py:35]
242 2018-01-29 11:00:19,669 INFO: Dropping columns with all NA values (ncol=32) [in ...refseq_masher/refseq_masher/taxonomy.py:38]
243 2018-01-29 11:00:19,671 INFO: Columns with all NA values dropped (ncol=12) [in ...refseq_masher/refseq_masher/taxonomy.py:40]
244 2018-01-29 11:00:19,671 INFO: Merging Mash results with relevant taxonomic information [in ...refseq_masher/refseq_masher/taxonomy.py:41]
245 2018-01-29 11:00:19,674 INFO: Merged Mash results with taxonomy info [in ...refseq_masher/refseq_masher/taxonomy.py:43]
246 2018-01-29 11:00:19,674 INFO: Merged taxonomic information into results output [in ...refseq_masher/refseq_masher/cli.py:136]
247 2018-01-29 11:00:19,674 INFO: Reordering output columns [in ...refseq_masher/refseq_masher/cli.py:137]
248 2018-01-29 11:00:19,677 INFO: Wrote output to "containment-ERR260489.tab" [in ...refseq_masher/refseq_masher/writers.py:20]
249
250
251
252 **Output**
253
254 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
255 | sample | top_taxonomy_name | identity | shared_hashes | median_multiplicity | pvalue | full_taxonomy | taxonomic_subspecies | taxonomic_species | taxonomic_genus | taxonomic_family | taxonomic_order | taxonomic_class | taxonomic_phylum | taxonomic_superkingdom | subspecies | serovar | plasmid | bioproject | biosample | taxid | assembly_accession | match_id | taxonomic_species group | match_comment |
256 +===========+======================================+==========+================+======================+========+==================================================================================================================================================+=======================+==============================+==================+====================+==================+=====================+===================+=========================+============+=========+==========+============+===========+=========+=====================+==============================================================================================+==========================+================+
257 | ERR260489 | Bacteroides fragilis | 1.0 | 400/400 | 786 | 0.0 | Bacteria; FCB group; Bacteroidetes/Chlorobi group; Bacteroidetes; Bacteroidia; Bacteroidales; Bacteroidaceae; Bacteroides; fragilis | | Bacteroides fragilis | Bacteroides | Bacteroidaceae | Bacteroidales | Bacteroidia | Bacteroidetes | Bacteria | | | pLV22a | | | 817 | | ./rcn/refseq-NG-817-.-.-.-pLV22a-Bacteroides_fragilis.fna | | |
258 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
259 | [1 row] | | | | | | | | | | | | | | | | | | | | | | | | |
260 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
261 | ERR260489 | Escherichia coli O104:H4 str. E92/11 | 1.0 | 400/400 | 48 | 0.0 | Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae; Escherichia; coli; O104:H4; str. E92/11 | | Escherichia coli | Escherichia | Enterobacteriaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria | Bacteria | | | pE9211p3 | | | 1090927 | NZ_AHAU | ./rcn/refseq-NZ-1090927-.-.-NZ_AHAU-pE9211p3-Escherichia_coli_O104_H4_str._E92_11.fna | | |
262 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
263 | [3 rows] | | | | | | | | | | | | | | | | | | | | | | | | |
264 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
265 | ERR260489 | Kingella kingae KKC2005004457 | 1.0 | 400/400 | 5 | 0.0 | Bacteria; Proteobacteria; Betaproteobacteria; Neisseriales; Neisseriaceae; Kingella; kingae; KKC2005004457 | | Kingella kingae | Kingella | Neisseriaceae | Neisseriales | Betaproteobacteria | Proteobacteria | Bacteria | | | unnamed | | | 1229911 | | ./rcn/refseq-NG-1229911-.-.-.-unnamed-Kingella_kingae_KKC2005004457.fna | | |
266 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
267 | ERR260489 | Bacteroides cellulosilyticus WH2 | 0.99984 | 399/400 | 772 | 0.0 | Bacteria; FCB group; Bacteroidetes/Chlorobi group; Bacteroidetes; Bacteroidia; Bacteroidales; Bacteroidaceae; Bacteroides; cellulosilyticus; WH2 | | Bacteroides cellulosilyticus | Bacteroides | Bacteroidaceae | Bacteroidales | Bacteroidia | Bacteroidetes | Bacteria | | | pBWH2B | | | 1268240 | NZ_ATFI | ./rcn/refseq-NZ-1268240-.-.-NZ_ATFI-pBWH2B-Bacteroides_cellulosilyticus_WH2.fna | | |
268 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
269 | [1 row] | | | | | | | | | | | | | | | | | | | | | | | | |
270 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
271 | ERR260489 | Klebsiella pneumoniae | 0.99984 | 399/400 | 4 | 0.0 | Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae; Klebsiella; pneumoniae | | Klebsiella pneumoniae | Klebsiella | Enterobacteriaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria | Bacteria | | | pMRC151 | | | 573 | | ./rcn/refseq-NG-573-.-.-.-pMRC151-Klebsiella_pneumoniae.fna | | |
272 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
273 | [37 rows] | | | | | | | | | | | | | | | | | | | | | | | | |
274 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+
275
276 Some of the top genomes contained in this sample, sorted by identity and median multiplicity, are:
277
278 - *Bacteroides fragilis* - fully contained (400/400) and high median multiplicity (786)
279 - *Escherichia coli* O104:H4 - fully contained (400/400) and median multiplicity of 48
280 - *Kingella kingae* - fully contained (400/400) and median multiplicity of 5
281 - *Klebsiella pneumoniae* - 399/400 hashes contained with median multiplicity of 4
282
283 So with Mash we are able to find that the sample contained the expected genomic data (especially *E. coli* O104:H4).
284
285
286
287 Legal
288 -----
289
290 Copyright Government of Canada 2017
291
292 Written by: National Microbiology Laboratory, Public Health Agency of Canada
293
294 Licensed under the Apache License, Version 2.0 (the "License"); you may not use
295 this work except in compliance with the License. You may obtain a copy of the
296 License at:
297
298 http://www.apache.org/licenses/LICENSE-2.0
299
300 Unless required by applicable law or agreed to in writing, software distributed
301 under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
302 CONDITIONS OF ANY KIND, either express or implied. See the License for the
303 specific language governing permissions and limitations under the License.
304
305 Contact
306 -------
307
308 **Gary van Domselaar**: gary.vandomselaar@phac-aspc.gc.ca
309
310
311
312 .. _Mash: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x
313 .. _SAMEA1877340: https://www.ebi.ac.uk/ena/data/view/SAMEA1877340
314 .. _PRJEB1775: https://www.ebi.ac.uk/ena/data/view/PRJEB1775
315 .. _ERR260489: https://www.ebi.ac.uk/ena/data/view/ERR260489&display=html
316
317 ]]>
318 </help>
319 <citations>
320 <!-- Citation for the Mash paper, identified by DOI; the same article the help text links as Mash_ -->
321 <citation type="doi">10.1186/s13059-016-0997-x</citation>
322 </citations>
323 </tool>