Mercurial > repos > nml > refseq_masher
comparison contains.xml @ 0:26df66c32861 draft
planemo upload commit 80c22275be05e29208e991019309dfffa9704f39
author | nml |
---|---|
date | Thu, 15 Feb 2018 13:59:31 -0500 |
parents | |
children | 2c1cb37a3ffe |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:26df66c32861 |
---|---|
1 <tool id="refseq_masher_contains" name="RefSeq Masher Contains" version="0.1.1"> | |
2 <description> | |
3 Find NCBI RefSeq Genomes contained in your sequences | |
4 </description> | |
5 <requirements> | |
6 <requirement type="package" version="0.1.1">refseq_masher</requirement> | |
7 </requirements> | |
8 <command detect_errors="exit_code"> | |
9 <![CDATA[ | |
10 | |
11 #import re | |
12 | |
13 #if $input.type == 'fasta' | |
14 #set $input_files = '"{}"'.format($input.fasta.name) | |
15 ln -s "$input.fasta" $input_files && | |
16 #elif $input.type == 'paired' | |
17 #set $_forward_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.forward.name) else '.fastq' | |
18 #set $_forward = '"{}_1{}"'.format($re.sub(r'_[12]\..+$', '', $input.forward.name), $_forward_ext) | |
19 #set $_reverse_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.reverse.name) else '.fastq' | |
20 #set $_reverse = '"{}_2{}"'.format($re.sub(r'_[12]\..+$', '', $input.reverse.name), $_reverse_ext) | |
21 #set $input_files = '{} {}'.format($_forward, $_reverse) | |
22 ln -s "$input.forward" $_forward && | |
23 ln -s "$input.reverse" $_reverse && | |
24 #elif $input.type == 'single' | |
25 #set $input_files = '"{}"'.format($input.single.name) | |
26 ln -s "$input.single" $input_files && | |
27 #elif $input.type == 'paired_collection' | |
28 #set $_forward_ext = '.fastq.gz' if $re.match(r'.*\.gz$', str($input.paired_collection.forward)) else '.fastq' | |
29 #set $_forward = '"{}_1{}"'.format($input.paired_collection.name, $_forward_ext) | |
30 #set $_reverse_ext = '.fastq.gz' if $re.match(r'.*\.gz$', str($input.paired_collection.reverse)) else '.fastq' | |
31 #set $_reverse = '"{}_2{}"'.format($input.paired_collection.name, $_reverse_ext) | |
32 #set $input_files = '{} {}'.format($_forward, $_reverse) | |
33 ln -s "$input.paired_collection.forward" $_forward && | |
34 ln -s "$input.paired_collection.reverse" $_reverse && | |
35 #end if | |
36 | |
37 refseq_masher | |
38 $adv.verbosity | |
39 contains | |
40 --output refseq_masher-contains.${adv.output_type} | |
41 --output-type $adv.output_type | |
42 --top-n-results $adv.top_n_results | |
43 --parallelism "\${GALAXY_SLOTS:-1}" | |
44 --min-identity $adv.min_identity | |
45 --max-pvalue $adv.max_pvalue | |
46 $input_files | |
47 | |
48 ]]> | |
49 </command> | |
50 <inputs> | |
51 <conditional name="input"> | |
52 <param name="type" type="select" label="Sequence input type"> | |
53 <option value="fasta">FASTA</option> | |
54 <option value="paired">Paired-end FASTQs</option> | |
55 <option value="single">Single-end FASTQ</option> | |
56 <option value="paired_collection">Paired-end FASTQ collection</option> | |
57 </param> | |
58 <when value="fasta"> | |
59 <param name="fasta" | |
60 type="data" format="fasta" | |
61 optional="false" | |
62 label="FASTA file" | |
63 /> | |
64 </when> | |
65 <when value="paired"> | |
66 <param name="forward" | |
67 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" | |
68 optional="false" | |
69 label="Forward FASTQ file" | |
70 help="Must have ASCII encoded quality scores" | |
71 /> | |
72 <param name="reverse" | |
73 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" | |
74 optional="false" | |
75 label="Reverse FASTQ file" | |
76 help="File format must match the Forward FASTQ file" | |
77 /> | |
78 </when> | |
79 <when value="single"> | |
80 <param name="single" | |
81 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" | |
82 optional="false" | |
83 label="Single-end FASTQ file" | |
84 /> | |
85 </when> | |
86 <when value="paired_collection"> | |
87 <param name="paired_collection" | |
88 type="data_collection" format="fastq,fastqsanger,fastqillumina,fastqsolexa,fastq.gz,txt" | |
89 collection_type="paired" | |
90 optional="false" | |
91 label="Paired-end FASTQ collection" | |
92 help="" | |
93 /> | |
94 </when> | |
95 </conditional> | |
96 <section name="adv" title="Advanced Options" expanded="false"> | |
97 <param name="top_n_results" | |
98 type="integer" | |
99 label="Top N matches to report (0 to report all)" | |
100 min="0" | |
101 value="0" | |
102 optional="true" | |
103 /> | |
104 <param name="min_identity" | |
105 type="float" value="0.9" min="0.0" max="1.0" | |
106 label="Mash screen min. identity to report" | |
107 optional="true" | |
108 /> | |
109 <param name="max_pvalue" | |
110 type="float" value="0.01" min="0.0" max="1.0" | |
111 label="Mash screen max. p-value to report" | |
112 optional="true" | |
113 /> | |
114 <param name="output_type" | |
115 type="select" | |
116 label="Output type" | |
117 multiple="false"> | |
118 <option value="tab" selected="true">Tabular (tab-delimited values)</option> | |
119 <option value="csv">CSV (Comma Separated Values)</option> | |
120 </param> | |
121 <param name="verbosity" | |
122 type="select" | |
123 label="Logging verbosity"> | |
124 <option value="">Error messages only</option> | |
125 <option value="-v">Show warning messages</option> | |
126 <option value="-vv" selected="true">Show info messages</option> | |
127 <option value="-vvv">Show debug messages</option> | |
128 </param> | |
129 </section> | |
130 </inputs> | |
131 <outputs> | |
132 <data | |
133 name="output_path_csv" | |
134 format="csv" | |
135 label="RefSeq Masher contains table" | |
136 from_work_dir="refseq_masher-contains.csv"> | |
137 <filter>adv['output_type'] == 'csv'</filter> | |
138 </data> | |
139 <data | |
140 name="output_path_tab" | |
141 format="tabular" | |
142 label="RefSeq Masher contains table" | |
143 from_work_dir="refseq_masher-contains.tab"> | |
144 <filter>adv['output_type'] == 'tab'</filter> | |
145 </data> | |
146 </outputs> | |
147 <tests> | |
148 <test> | |
149 <conditional name="input"> | |
150 <param name="type" value="single"/> | |
151 <param name="single" value="SRR1203042_1-head4000.fastq"/> | |
152 </conditional> | |
153 <section name="adv"> | |
154 <param name="top_n_results" value="5"/> | |
155 <param name="output_type" value="tab"/> | |
156 <param name="min_identity" value="0.9"/> | |
157 <param name="max_pvalue" value="0.01"/> | |
158 </section> | |
159 <output name="output_path_tab" | |
160 value="SRR1203042_1-head4000-contains.tab" | |
161 ftype="tabular" | |
162 lines_diff="0"> | |
163 </output> | |
164 </test> | |
165 </tests> | |
166 <help> | |
167 <![CDATA[ | |
168 RefSeq Masher - Containment | |
169 =========================== | |
170 | |
171 Find what NCBI RefSeq genomes are contained within your sequence data using Mash_ with a Mash sketch database of 54,925 NCBI RefSeq Genomes. | |
172 | |
173 | |
174 Source code available on Github at https://github.com/phac-nml/refseq_masher | |
175 | |
176 | |
177 `contains` - find what NCBI RefSeq Genomes are contained in your input sequences | |
178 -------------------------------------------------------------------------------- | |
179 | |
180 If you have a metagenomic sample or maybe a sample with some contamination, you may be interested in seeing what's in your sample. You can do this with `refseq_masher contains <INPUT>`:: | |
181 | |
182 Usage: refseq_masher contains [OPTIONS] INPUT... | |
183 | |
184 Find the NCBI RefSeq genomes contained in your sequence files using Mash | |
185 Screen | |
186 | |
187 Input is expected to be one or more FASTA/FASTQ files or one or more | |
188 directories containing FASTA/FASTQ files. Files can be Gzipped. | |
189 | |
190 Options: | |
191 --mash-bin TEXT Mash binary path (default="mash") | |
192 -o, --output PATH Output file path (default="-"/stdout) | |
193 --output-type [tab|csv] Output file type (tab|csv) | |
194 -n, --top-n-results INTEGER Output top N results sorted by identity in | |
195 ascending order (default=0/all) | |
196 -i, --min-identity FLOAT Mash screen min identity to report | |
197 (default=0.9) | |
198 -v, --max-pvalue FLOAT Mash screen max p-value to report | |
199 (default=0.01) | |
200 -p, --parallelism INTEGER Mash screen parallelism; number of threads to | |
201 spawn (default=1) | |
202 -h, --help Show this message and exit. | |
203 | |
204 | |
205 Example - metagenomic sample SAMEA1877340_ | |
206 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
207 | |
208 For this example, we're going to see what RefSeq genomes are contained within sample SAMEA1877340_ from BioProject PRJEB1775_. | |
209 | |
210 | |
211 Description from BioProject PRJEB1775_: | |
212 | |
213 .. epigraph:: | |
214 | |
215 Design, Setting and Patients Forty-five samples were selected from a set of fecal specimens obtained from patients with diarrhea during the 2011 outbreak of STEC O104:H4 in Germany. Samples were chosen to represent STEC-positive patients with a range of clinical conditions and colony counts together with a small number of patients with other infections (Campylobacter jejuni, Clostridium difficile and Salmonella enterica). Samples were subjected to high-throughput sequencing on the Illumina MiSeq and HiSeq 2500, followed by bioinformatics analysis. | |
216 | |
217 | |
218 We're going to download the FASTQ files for ERR260489_:: | |
219 | |
220 wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR260/ERR260489/ERR260489_1.fastq.gz | |
221 wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR260/ERR260489/ERR260489_2.fastq.gz | |
222 | |
223 | |
224 We're going to run `refseq_masher` against these FASTQ files:: | |
225 | |
226 refseq_masher -vv contains --top-n-results 50 -p 12 -o containment-ERR260489.tab ERR260489_1.fastq.gz ERR260489_2.fastq.gz | |
227 | |
228 **Log**:: | |
229 | |
230 2018-01-29 10:59:25,849 INFO: Grouped 2 fastqs into 1 groups [in ...refseq_masher/refseq_masher/utils.py:174] | |
231 2018-01-29 10:59:25,849 INFO: Collected 0 FASTA inputs and 1 read sets [in ...refseq_masher/refseq_masher/utils.py:185] | |
232 2018-01-29 10:59:25,849 INFO: Running Mash Screen with NCBI RefSeq sketch database against sample "ERR260489" with inputs: ['../ERR260489_1.fastq.gz', '../ERR260489_2.fastq.gz'] [in ...refseq_masher/refseq_masher/mash/screen.py:44] | |
233 Loading ...refseq_masher/refseq_masher/data/RefSeqSketches.msh... | |
234 4669418 distinct hashes. | |
235 Streaming from 2 inputs... | |
236 Estimated distinct k-mers in pool: 206836855 | |
237 Summing shared... | |
238 Computing coverage medians... | |
239 Writing output... | |
240 2018-01-29 11:00:19,665 INFO: Ran Mash Screen on all input. Merging NCBI taxonomic information into results output. [in ...refseq_masher/refseq_masher/cli.py:134] | |
241 2018-01-29 11:00:19,666 INFO: Fetching all taxonomy info for 23 unique NCBI Taxonomy UIDs [in ...refseq_masher/refseq_masher/taxonomy.py:35] | |
242 2018-01-29 11:00:19,669 INFO: Dropping columns with all NA values (ncol=32) [in ...refseq_masher/refseq_masher/taxonomy.py:38] | |
243 2018-01-29 11:00:19,671 INFO: Columns with all NA values dropped (ncol=12) [in ...refseq_masher/refseq_masher/taxonomy.py:40] | |
244 2018-01-29 11:00:19,671 INFO: Merging Mash results with relevant taxonomic information [in ...refseq_masher/refseq_masher/taxonomy.py:41] | |
245 2018-01-29 11:00:19,674 INFO: Merged Mash results with taxonomy info [in ...refseq_masher/refseq_masher/taxonomy.py:43] | |
246 2018-01-29 11:00:19,674 INFO: Merged taxonomic information into results output [in ...refseq_masher/refseq_masher/cli.py:136] | |
247 2018-01-29 11:00:19,674 INFO: Reordering output columns [in ...refseq_masher/refseq_masher/cli.py:137] | |
248 2018-01-29 11:00:19,677 INFO: Wrote output to "containment-ERR260489.tab" [in ...refseq_masher/refseq_masher/writers.py:20] | |
249 | |
250 | |
251 | |
252 **Output** | |
253 | |
254 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ | |
255 | sample | top_taxonomy_name | identity | shared_hashes | median_multiplicity | pvalue | full_taxonomy | taxonomic_subspecies | taxonomic_species | taxonomic_genus | taxonomic_family | taxonomic_order | taxonomic_class | taxonomic_phylum | taxonomic_superkingdom | subspecies | serovar | plasmid | bioproject | biosample | taxid | assembly_accession | match_id | taxonomic_species group | match_comment | | |
256 +===========+======================================+==========+================+======================+========+==================================================================================================================================================+=======================+==============================+==================+====================+==================+=====================+===================+=========================+============+=========+==========+============+===========+=========+=====================+==============================================================================================+==========================+================+ | |
257 | ERR260489 | Bacteroides fragilis | 1.0 | 400/400 | 786 | 0.0 | Bacteria; FCB group; Bacteroidetes/Chlorobi group; Bacteroidetes; Bacteroidia; Bacteroidales; Bacteroidaceae; Bacteroides; fragilis | | Bacteroides fragilis | Bacteroides | Bacteroidaceae | Bacteroidales | Bacteroidia | Bacteroidetes | Bacteria | | | pLV22a | | | 817 | | ./rcn/refseq-NG-817-.-.-.-pLV22a-Bacteroides_fragilis.fna | | | | |
258 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ | |
259 | [1 row] | | | | | | | | | | | | | | | | | | | | | | | | | | |
260 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ | |
261 | ERR260489 | Escherichia coli O104:H4 str. E92/11 | 1.0 | 400/400 | 48 | 0.0 | Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae; Escherichia; coli; O104:H4; str. E92/11 | | Escherichia coli | Escherichia | Enterobacteriaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria | Bacteria | | | pE9211p3 | | | 1090927 | NZ_AHAU | ./rcn/refseq-NZ-1090927-.-.-NZ_AHAU-pE9211p3-Escherichia_coli_O104_H4_str._E92_11.fna | | | | |
262 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ | |
263 | [3 rows] | | | | | | | | | | | | | | | | | | | | | | | | | | |
264 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ | |
265 | ERR260489 | Kingella kingae KKC2005004457 | 1.0 | 400/400 | 5 | 0.0 | Bacteria; Proteobacteria; Betaproteobacteria; Neisseriales; Neisseriaceae; Kingella; kingae; KKC2005004457 | | Kingella kingae | Kingella | Neisseriaceae | Neisseriales | Betaproteobacteria | Proteobacteria | Bacteria | | | unnamed | | | 1229911 | | ./rcn/refseq-NG-1229911-.-.-.-unnamed-Kingella_kingae_KKC2005004457.fna | | | | |
266 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ | |
267 | ERR260489 | Bacteroides cellulosilyticus WH2 | 0.99984 | 399/400 | 772 | 0.0 | Bacteria; FCB group; Bacteroidetes/Chlorobi group; Bacteroidetes; Bacteroidia; Bacteroidales; Bacteroidaceae; Bacteroides; cellulosilyticus; WH2 | | Bacteroides cellulosilyticus | Bacteroides | Bacteroidaceae | Bacteroidales | Bacteroidia | Bacteroidetes | Bacteria | | | pBWH2B | | | 1268240 | NZ_ATFI | ./rcn/refseq-NZ-1268240-.-.-NZ_ATFI-pBWH2B-Bacteroides_cellulosilyticus_WH2.fna | | | | |
268 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ | |
269 | [1 row] | | | | | | | | | | | | | | | | | | | | | | | | | | |
270 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ | |
271 | ERR260489 | Klebsiella pneumoniae | 0.99984 | 399/400 | 4 | 0.0 | Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae; Klebsiella; pneumoniae | | Klebsiella pneumoniae | Klebsiella | Enterobacteriaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria | Bacteria | | | pMRC151 | | | 573 | | ./rcn/refseq-NG-573-.-.-.-pMRC151-Klebsiella_pneumoniae.fna | | | | |
272 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ | |
273 | [37 rows] | | | | | | | | | | | | | | | | | | | | | | | | | | |
274 +-----------+--------------------------------------+----------+----------------+----------------------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+---------+----------+------------+-----------+---------+---------------------+----------------------------------------------------------------------------------------------+--------------------------+----------------+ | |
275 | |
276 Some of the top genomes contained in this sample, sorted by identity and median multiplicity, are: | |
277 | |
278 - *Bacteroides fragilis* - fully contained (400/400) and high multiplicity (786) | |
279 - *Escherichia coli* O104:H4 - fully contained (400/400) and median multiplicity of 48 | |
280 - *Kingella kingae* - fully contained (400/400) and median multiplicity of 5 | |
281 - *Klebsiella pneumoniae* - 399/400 sketches contained with median multiplicity of 4 | |
282 | |
283 So with Mash we are able to find that the sample contained the expected genomic data (especially *E. coli* O104:H4). | |
284 | |
285 | |
286 | |
287 Legal | |
288 ----- | |
289 | |
290 Copyright Government of Canada 2017 | |
291 | |
292 Written by: National Microbiology Laboratory, Public Health Agency of Canada | |
293 | |
294 Licensed under the Apache License, Version 2.0 (the "License"); you may not use | |
295 this work except in compliance with the License. You may obtain a copy of the | |
296 License at: | |
297 | |
298 http://www.apache.org/licenses/LICENSE-2.0 | |
299 | |
300 Unless required by applicable law or agreed to in writing, software distributed | |
301 under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |
302 CONDITIONS OF ANY KIND, either express or implied. See the License for the | |
303 specific language governing permissions and limitations under the License. | |
304 | |
305 Contact | |
306 ------- | |
307 | |
308 **Gary van Domselaar**: gary.vandomselaar@phac-aspc.gc.ca | |
309 | |
310 | |
311 | |
312 .. _Mash: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x | |
313 .. _SAMEA1877340: https://www.ebi.ac.uk/ena/data/view/SAMEA1877340 | |
314 .. _PRJEB1775: https://www.ebi.ac.uk/ena/data/view/PRJEB1775 | |
315 .. _ERR260489: https://www.ebi.ac.uk/ena/data/view/ERR260489&display=html | |
316 | |
317 ]]> | |
318 </help> | |
319 <citations> | |
320 <!-- Citation for Mash paper --> | |
321 <citation type="doi">10.1186/s13059-016-0997-x</citation> | |
322 </citations> | |
323 </tool> |