Mercurial > repos > iuc > vsnp_get_snps
comparison vsnp_get_snps.xml @ 0:ec6e02f4eab7 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
author | iuc |
---|---|
date | Tue, 16 Nov 2021 08:26:58 +0000 |
parents | |
children | 9ac0b1d5560d |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ec6e02f4eab7 |
---|---|
1 <tool id="vsnp_get_snps" name="vSNP: get SNPs" version="@WRAPPER_VERSION@.0" profile="@PROFILE@"> | |
2 <description></description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <requirements> | |
7 <requirement type="package" version="3.0.9">openpyxl</requirement> | |
8 <requirement type="package" version="1.3.4">pandas</requirement> | |
9 <requirement type="package" version="0.6.8">pyvcf</requirement> | |
10 <requirement type="package" version="2.0.1">xlrd</requirement> | |
11 </requirements> | |
12 <command detect_errors="exit_code"><![CDATA[ | |
13 #import re | |
14 | |
15 #set input_vcf_dir = 'input_vcf_dir' | |
16 #set output_json_avg_mq_dir = 'output_json_avg_mq_dir' | |
17 #set output_json_snps_dir = 'output_json_snps_dir' | |
18 #set output_snps_dir = 'output_snps_dir' | |
19 | |
20 mkdir -p $input_vcf_dir && | |
21 mkdir -p $output_json_avg_mq_dir && | |
22 mkdir -p $output_json_snps_dir && | |
23 mkdir -p $output_snps_dir && | |
24 | |
25 #set dbkey = '?' | |
26 #for $i in $input_vcf_collection: | |
27 #if str($dbkey) == '?': | |
28 #set dbkey = $i.metadata.dbkey | |
29 #else if str($dbkey) != $i.metadata.dbkey: | |
30 >&2 echo "The dbkeys associated with the zero coverage VCF files with SNPs found in closely related isolate groups are not unique" && | |
31 exit 1 | |
32 #end if | |
33 #set vcf_identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier)) | |
34 ln -s '${i}' '$input_vcf_dir/${vcf_identifier}' && | |
35 #end for | |
36 #if str($dbkey) == '?': | |
37 >&2 echo "The dbkey must be set for the zero coverage VCF files with SNPs found in closely related isolate groups" && exit 1 | |
38 #end if | |
39 #if str($input_zc_vcf_type_cond.input_zc_vcf_type) == "single": | |
40 #set zc_vcf_identifier = re.sub('[^\s\w\-]', '_', str($input_zc_vcf.element_identifier)) | |
41 ln -s '${input_zc_vcf}' '$input_vcf_dir/${zc_vcf_identifier}' && | |
42 #else | |
43 #for $i in $input_zc_vcf_type_cond.input_zc_vcf_collection: | |
44 #set zc_vcf_identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier)) | |
45 ln -s '${i}' '$input_vcf_dir/${zc_vcf_identifier}' && | |
46 #end for | |
47 #end if | |
48 #if str($input_excel_cond.input_excel_param) == 'yes': | |
49 #if str($input_excel_cond.excel_source_cond.excel_source) == 'cached': | |
50 #set excel_file = 'No genome specified for input VCF (database) file(s)' | |
51 #set excel_fields = $__app__.tool_data_tables['vsnp_excel'].get_fields() | |
52 ## The value of excel_fields is a nested list that looks like this. | |
53 ## [['AF2122', 'Mbovis_define_filter.xlsx', '~/tool-data/vsnp/AF2122/excel/Mbovis_define_filter.xlsx', 'Excel file for AF2122'],...] | |
54 #for $i in $excel_fields: | |
55 #if str($i[0]) == $dbkey: | |
56 #set excel_file = $i[2] | |
57 #break | |
58 #end if | |
59 #end for | |
60 #else: | |
61 #set excel_file = $input_excel_cond.excel_source_cond.input_excel | |
62 #end if | |
63 #end if | |
64 python '$__tool_directory__/vsnp_get_snps.py' | |
65 --ac $ac | |
66 #if str($input_excel_cond.input_excel_param) == 'yes': | |
67 --input_excel '$excel_file' | |
68 #end if | |
69 $all_isolates | |
70 --input_vcf_dir '$input_vcf_dir' | |
71 --min_mq $min_mq | |
72 --min_quality_score $min_quality_score | |
73 --output_json_avg_mq_dir '$output_json_avg_mq_dir' | |
74 --output_json_snps_dir '$output_json_snps_dir' | |
75 --output_snps_dir '$output_snps_dir' | |
76 --output_summary '$output_summary' | |
77 --processes \${GALAXY_SLOTS:-8} | |
78 --quality_score_n_threshold $quality_score_n_threshold | |
79 --dbkey '$dbkey' | |
80 ]]></command> | |
81 <inputs> | |
82 <conditional name="input_zc_vcf_type_cond"> | |
83 <param name="input_zc_vcf_type" type="select" label="Choose the category of the files to be analyzed"> | |
84 <option value="collection" selected="true">A collection of zero coverage VCF files</option> | |
85 <option value="single">A single zero coverage VCF file</option> | |
86 </param> | |
87 <when value="single"> | |
88 <param name="input_zc_vcf" type="data" format="vcf" label="Zero coverage VCF file"/> | |
89 </when> | |
90 <when value="collection"> | |
91 <param name="input_zc_vcf_collection" format="vcf" type="data_collection" collection_type="list" label="Collection of zero coverage VCF files"/> | |
92 </when> | |
93 </conditional> | |
94 <param name="input_vcf_collection" format="vcf" type="data_collection" collection_type="list" label="Collection of zero coverage VCF files with SNPs found in closely related isolate groups"/> | |
95 <param name="ac" type="integer" min="0" value="2" label="Allele count threshold" help="At least 1 position must have this value for a SNP to be added to a group"/> | |
96 <param name="min_mq" type="integer" min="0" value="56" label="Map quality threshold" help="At least 1 position must have a higher MQ value for a SNP to be added to a group"/> | |
97 <param name="min_quality_score" type="integer" min="0" value="150" label="Quality score threshold" help="At least 1 position must have a higher quality score for a SNP to be added to a group"/> | |
98 <param name="quality_score_n_threshold" type="integer" min="0" value="150" label="Minimum quality score N value for alleles" help="Alleles are marked as N for quality scores between this value and the minimum quality score value above"/> | |
99 <conditional name="input_excel_cond"> | |
100 <param name="input_excel_param" type="select" label="Use Excel file for grouping and filtering?"> | |
101 <option value="yes" selected="true">Yes</option> | |
102 <option value="no">No</option> | |
103 </param> | |
104 <when value="yes"> | |
105 <conditional name="excel_source_cond"> | |
106 <param name="excel_source" type="select" label="Choose the source for the Excel file"> | |
107 <option value="cached">locally cached</option> | |
108 <option value="history">from history</option> | |
109 </param> | |
110 <when value="cached"> | |
111 <param name="input_excel" type="select" label="Excel file"> | |
112 <options from_data_table="vsnp_excel"> | |
113 <filter type="data_meta" column="0" key="dbkey" ref="input_vcf_collection"/> | |
114 <validator type="no_options" message="No built-in Excel grouping and filtering datasets are available"/> | |
115 </options> | |
116 </param> | |
117 </when> | |
118 <when value="history"> | |
119 <param name="input_excel" type="data" format="xlsx" label="Excel file"/> | |
120 </when> | |
121 </conditional> | |
122 </when> | |
123 <when value="no"/> | |
124 </conditional> | |
125 <param name="all_isolates" type="boolean" truevalue="--all_isolates" falsevalue="" checked="false" label="Create a group containing all isolates?"/> | |
126 </inputs> | |
127 <outputs> | |
128 <collection name="snps" type="list" label="${tool.name} on ${on_string} (SNPs)"> | |
129 <discover_datasets pattern="__name_and_ext__" directory="output_snps_dir"/> | |
130 </collection> | |
131 <collection name="json_avg_mq" type="list" label="${tool.name} on ${on_string} (average mq)"> | |
132 <discover_datasets pattern="__name_and_ext__" directory="output_json_avg_mq_dir"/> | |
133 </collection> | |
134 <collection name="json_snps" type="list" label="${tool.name} on ${on_string} (SNPs as json)"> | |
135 <discover_datasets pattern="__name_and_ext__" directory="output_json_snps_dir"/> | |
136 </collection> | |
137 <data name="output_summary" format="html" label="${tool.name} on ${on_string} (summary)"/> | |
138 </outputs> | |
139 <tests> | |
140 <!-- | |
141 Unfortunately the test files cannot be gzipped since Galaxy changes the file names | |
142 to be something like 00-0121_WI_Cervid_99-A_vcf_gz, and the VCF Reader requires | |
143 gzipped files to have a .gz extension. The exception is | |
144 UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte | |
145 --> | |
146 <!-- A single vcf input, no excel file, all_isolates is False --> | |
147 <test expect_num_outputs="4"> | |
148 <param name="input_zc_vcf_type" value="single"/> | |
149 <param name="input_zc_vcf" value="input_zc_vcf.vcf" ftype="vcf" dbkey="89"/> | |
150 <param name="input_vcf_collection"> | |
151 <collection type="list"> | |
152 <element name="SRR8073662_zc.vcf" value="SRR8073662_zc.vcf" dbkey="89"/> | |
153 <element name="SRR1792272_zc.vcf" value="SRR1792272_zc.vcf" dbkey="89"/> | |
154 </collection> | |
155 </param> | |
156 <param name="input_excel_param" value="no"/> | |
157 <output_collection name="snps" type="list" count="1"> | |
158 <element name="all_vcf" ftype="fasta"> | |
159 <assert_contents> | |
160 <has_size value="150"/> | |
161 </assert_contents> | |
162 </element> | |
163 </output_collection> | |
164 <output_collection name="json_avg_mq" type="list" count="1"> | |
165 <element name="all_vcf" ftype="json"> | |
166 <assert_contents> | |
167 <has_size value="551"/> | |
168 </assert_contents> | |
169 </element> | |
170 </output_collection> | |
171 <output_collection name="json_snps" type="list" count="1"> | |
172 <element name="all_vcf" ftype="json"> | |
173 <assert_contents> | |
174 <has_size value="876"/> | |
175 </assert_contents> | |
176 </element> | |
177 </output_collection> | |
178 <output name="output_summary" ftype="html"> | |
179 <assert_contents> | |
180 <has_size value="303"/> | |
181 </assert_contents> | |
182 </output> | |
183 </test> | |
184 <!-- An input collection, no excel file, all_isolates is False --> | |
185 <test expect_num_outputs="4"> | |
186 <param name="input_zc_vcf_type" value="collection"/> | |
187 <param name="input_zc_vcf_collection"> | |
188 <collection type="list"> | |
189 <element name="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" value="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" dbkey="89"/> | |
190 <element name="BCG_Tokyo_Unknown_JP_DRR029468.vcf" value="BCG_Tokyo_Unknown_JP_DRR029468.vcf" dbkey="89"/> | |
191 </collection> | |
192 </param> | |
193 <param name="input_vcf_collection"> | |
194 <collection type="list"> | |
195 <element name="01_1787_FL_Zoo_Jaguar.vcf" value="01_1787_FL_Zoo_Jaguar.vcf" dbkey="89"/> | |
196 <element name="02_5877_MEX_TX_Fed.vcf" value="02_5877_MEX_TX_Fed.vcf" dbkey="89"/> | |
197 <element name="02_0585_COA_TX_Fed.vcf" value="02_0585_COA_TX_Fed.vcf" dbkey="89"/> | |
198 </collection> | |
199 </param> | |
200 <param name="input_excel_param" value="no"/> | |
201 <output_collection name="snps" type="list" count="1"> | |
202 <element name="all_vcf" ftype="fasta"> | |
203 <assert_contents> | |
204 <has_size value="5226"/> | |
205 </assert_contents> | |
206 </element> | |
207 </output_collection> | |
208 <output_collection name="json_avg_mq" type="list" count="1"> | |
209 <element name="all_vcf" ftype="json"> | |
210 <assert_contents> | |
211 <has_size value="24332"/> | |
212 </assert_contents> | |
213 </element> | |
214 </output_collection> | |
215 <output_collection name="json_snps" type="list" count="1"> | |
216 <element name="all_vcf" ftype="json"> | |
217 <assert_contents> | |
218 <has_size value="38798"/> | |
219 </assert_contents> | |
220 </element> | |
221 </output_collection> | |
222 <output name="output_summary" ftype="html"> | |
223 <assert_contents> | |
224 <has_size value="303"/> | |
225 </assert_contents> | |
226 </output> | |
227 </test> | |
228 <!-- An input collection, an excel file, all_isolates is False --> | |
229 <test expect_num_outputs="4"> | |
230 <param name="input_zc_vcf_type" value="collection"/> | |
231 <param name="input_zc_vcf_collection"> | |
232 <collection type="list"> | |
233 <element name="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" value="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" dbkey="89"/> | |
234 <element name="BCG_Tokyo_Unknown_JP_DRR029468.vcf" value="BCG_Tokyo_Unknown_JP_DRR029468.vcf" dbkey="89"/> | |
235 </collection> | |
236 </param> | |
237 <param name="input_vcf_collection"> | |
238 <collection type="list"> | |
239 <element name="01_1787_FL_Zoo_Jaguar.vcf" value="01_1787_FL_Zoo_Jaguar.vcf" dbkey="89"/> | |
240 <element name="02_5877_MEX_TX_Fed.vcf" value="02_5877_MEX_TX_Fed.vcf" dbkey="89"/> | |
241 <element name="02_0585_COA_TX_Fed.vcf" value="02_0585_COA_TX_Fed.vcf" dbkey="89"/> | |
242 </collection> | |
243 </param> | |
244 <param name="input_excel_param" value="yes"/> | |
245 <param name="input_excel" value="89"/> | |
246 <output_collection name="snps" type="list" count="1"> | |
247 <element name="Mbovis-17" ftype="fasta"> | |
248 <assert_contents> | |
249 <has_size value="749"/> | |
250 </assert_contents> | |
251 </element> | |
252 </output_collection> | |
253 <output_collection name="json_avg_mq" type="list" count="1"> | |
254 <element name="Mbovis-17" ftype="json"> | |
255 <assert_contents> | |
256 <has_size value="10884"/> | |
257 </assert_contents> | |
258 </element> | |
259 </output_collection> | |
260 <output_collection name="json_snps" type="list" count="1"> | |
261 <element name="Mbovis-17" ftype="json"> | |
262 <assert_contents> | |
263 <has_size value="6396"/> | |
264 </assert_contents> | |
265 </element> | |
266 </output_collection> | |
267 <output name="output_summary" ftype="html"> | |
268 <assert_contents> | |
269 <has_size value="1057"/> | |
270 </assert_contents> | |
271 </output> | |
272 </test> | |
273 <!-- An input collection, an excel file, all_isolates is True --> | |
274 <test expect_num_outputs="4"> | |
275 <param name="input_zc_vcf_type" value="collection"/> | |
276 <param name="input_zc_vcf_collection"> | |
277 <collection type="list"> | |
278 <element name="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" value="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" dbkey="89"/> | |
279 <element name="BCG_Tokyo_Unknown_JP_DRR029468.vcf" value="BCG_Tokyo_Unknown_JP_DRR029468.vcf" dbkey="89"/> | |
280 </collection> | |
281 </param> | |
282 <param name="input_vcf_collection"> | |
283 <collection type="list"> | |
284 <element name="01_1787_FL_Zoo_Jaguar.vcf" value="01_1787_FL_Zoo_Jaguar.vcf" dbkey="89"/> | |
285 <element name="02_5877_MEX_TX_Fed.vcf" value="02_5877_MEX_TX_Fed.vcf" dbkey="89"/> | |
286 <element name="02_0585_COA_TX_Fed.vcf" value="02_0585_COA_TX_Fed.vcf" dbkey="89"/> | |
287 </collection> | |
288 </param> | |
289 <param name="input_excel_param" value="yes"/> | |
290 <param name="input_excel" value="89"/> | |
291 <param name="all_isolates" value="--all_isolates"/> | |
292 <output_collection name="snps" type="list" count="2"> | |
293 <element name="Mbovis-17" ftype="fasta"> | |
294 <assert_contents> | |
295 <has_size value="749"/> | |
296 </assert_contents> | |
297 </element> | |
298 <element name="all_vcf" ftype="fasta"> | |
299 <assert_contents> | |
300 <has_size value="4920"/> | |
301 </assert_contents> | |
302 </element> | |
303 </output_collection> | |
304 <output_collection name="json_avg_mq" type="list" count="2"> | |
305 <element name="Mbovis-17" ftype="json"> | |
306 <assert_contents> | |
307 <has_size value="10884"/> | |
308 </assert_contents> | |
309 </element> | |
310 <element name="all_vcf" ftype="json"> | |
311 <assert_contents> | |
312 <has_size value="24332"/> | |
313 </assert_contents> | |
314 </element> | |
315 </output_collection> | |
316 <output_collection name="json_snps" type="list" count="2"> | |
317 <element name="Mbovis-17" ftype="json"> | |
318 <assert_contents> | |
319 <has_size value="6396"/> | |
320 </assert_contents> | |
321 </element> | |
322 <element name="all_vcf" ftype="json"> | |
323 <assert_contents> | |
324 <has_size value="36466"/> | |
325 </assert_contents> | |
326 </element> | |
327 </output_collection> | |
328 <output name="output_summary" ftype="html"> | |
329 <assert_contents> | |
330 <has_size value="1056"/> | |
331 </assert_contents> | |
332 </output> | |
333 </test> | |
334 </tests> | |
335 <help> | |
336 **What it does** | |
337 | |
338 Accepts a zero coverage VCF file produced by the **vSNP: add zero coverage** tool (or a collection of them) along with a collection | |
339 of zero coverage VCF files that have been aligned with the same reference and contain SNPs called between closely related isolate groups. | |
340 The tool produces fasta files containing SNP alignments, json files containing the SNP positions and additional json files containing | |
341 the average map quality values. | |
342 | |
343 The SNP alignments produced by this tool are used to create phylogenetic trees, so larger input collections result in more populated | |
344 phylogenetic trees. Both of the json outputs are used by the **vSNP: build tables** tool to produce annotated SNP tables in the form | |
345 of Excel spreadsheets. | |
346 | |
347 An Excel spreadsheet containing specified SNPs can optionally be used to filter desired SNP positions by group. Users can choose a | |
348 locally cached Excel spreadsheet or one from their current history. | |
349 | |
350 A SNP is added to a group if it has at least one position with a specified allele count value, a quality score greater than a specified | |
351 value, and a map quality greater than a specified value. | |
352 | |
353 If the allele count equals the specified value (2) and the quality score for a SNP position is greater than the minimum quality score | |
354 value (150), the alternate allele is called. | |
355 | |
356 However, if the allele count is 1, the position is called ambiguous. Deletions are called when the alternate allele is a gap. If the | |
357 quality score is less than or equal to the minimum quality score N value for alleles (150), the allele is marked "N". | |
358 | |
359 **Required Options** | |
360 | |
361 * **Zero coverage VCF file(s)** - Select a single zero coverage VCF file or a collection of zero coverage VCF files, typically produced by the **vSNP: add zero coverage** tool, from the current history. | |
362 * **Collection of zero coverage VCF files with SNPs found in closely related isolate groups** - Select a dataset collection of zero coverage vcf files from the current history. | |
363 | |
364 **Additional Options** | |
365 | |
366 * **Allele count threshold** - At least 1 position must have an allele count greater than this value for a SNP to be added to a group (2 is optimal). | |
367 * **Map quality threshold** - At least 1 position must have a higher MQ value for a SNP to be added to a group (56 is optimal). | |
368 * **Quality score threshold** - At least 1 position must have a higher quality score for a SNP to be added to a group (150 is optimal). | |
369 * **Minimum quality score N value for alleles** - If none of the above 3 requirements is met and the quality score is less than or equal to the minimum quality score N value for alleles, the allele is marked "N" (150 is optimal). | |
370 * **Use Excel file for grouping and filtering?** - select Yes to filter desired SNP positions by group. A cached Excel spreadsheet provides the most widely used SNP positions for grouping, but a custom spreadsheet can be selected from the current history. | |
371 * **Create a group containing all isolates?** - select Yes to output an additional group consisting of all isolates. | |
372 </help> | |
373 <expand macro="citations"/> | |
374 </tool> | |
375 |