comparison vsnp_get_snps.xml @ 0:ec6e02f4eab7 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
author iuc
date Tue, 16 Nov 2021 08:26:58 +0000
parents
children 9ac0b1d5560d
comparison
equal deleted inserted replaced
-1:000000000000 0:ec6e02f4eab7
1 <tool id="vsnp_get_snps" name="vSNP: get SNPs" version="@WRAPPER_VERSION@.0" profile="@PROFILE@">
2 <description></description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <requirements>
7 <requirement type="package" version="3.0.9">openpyxl</requirement>
8 <requirement type="package" version="1.3.4">pandas</requirement>
9 <requirement type="package" version="0.6.8">pyvcf</requirement>
10 <requirement type="package" version="2.0.1">xlrd</requirement>
11 </requirements>
12 <command detect_errors="exit_code"><![CDATA[
13 #import re
14 ## Relative directory names reused by the shell commands and the script arguments below.
15 #set input_vcf_dir = 'input_vcf_dir'
16 #set output_json_avg_mq_dir = 'output_json_avg_mq_dir'
17 #set output_json_snps_dir = 'output_json_snps_dir'
18 #set output_snps_dir = 'output_snps_dir'
19 ## Create the directory that receives the input links and the three output directories.
20 mkdir -p $input_vcf_dir &&
21 mkdir -p $output_json_avg_mq_dir &&
22 mkdir -p $output_json_snps_dir &&
23 mkdir -p $output_snps_dir &&
24 ## All collection elements must share one dbkey; each element is linked under a sanitized identifier.
25 #set dbkey = '?'
26 #for $i in $input_vcf_collection:
27 #if str($dbkey) == '?':
28 #set dbkey = $i.metadata.dbkey
29 #else if str($dbkey) != str($i.metadata.dbkey):
30 >&2 echo "The dbkeys associated with the zero coverage VCF files with SNPs found in closely related isolate groups are not unique" &&
31 exit 1 &&
32 #end if
33 #set vcf_identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
34 ln -s '${i}' '$input_vcf_dir/${vcf_identifier}' &&
35 #end for
36 #if str($dbkey) == '?':
37 >&2 echo "The dbkey must be set for the zero coverage VCF files with SNPs found in closely related isolate groups" && exit 1 &&
38 #end if
39 #if str($input_zc_vcf_type_cond.input_zc_vcf_type) == "single":
40 #set zc_vcf_identifier = re.sub('[^\s\w\-]', '_', str($input_zc_vcf_type_cond.input_zc_vcf.element_identifier))
41 ln -s '${input_zc_vcf_type_cond.input_zc_vcf}' '$input_vcf_dir/${zc_vcf_identifier}' &&
42 #else
43 #for $i in $input_zc_vcf_type_cond.input_zc_vcf_collection:
44 #set zc_vcf_identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
45 ln -s '${i}' '$input_vcf_dir/${zc_vcf_identifier}' &&
46 #end for
47 #end if
48 #if str($input_excel_cond.input_excel_param) == 'yes':
49 #if str($input_excel_cond.excel_source_cond.excel_source) == 'cached':
50 #set excel_file = 'No genome specified for input VCF (database) file(s)'
51 #set excel_fields = $__app__.tool_data_tables['vsnp_excel'].get_fields()
52 ## The value of excel_fields is a nested list that looks like this.
53 ## [['AF2122', 'Mbovis_define_filter.xlsx', '~/tool-data/vsnp/AF2122/excel/Mbovis_define_filter.xlsx', 'Excel file for AF2122'],...]
54 #for $i in $excel_fields:
55 #if str($i[0]) == str($dbkey):
56 #set excel_file = $i[2]
57 #break
58 #end if
59 #end for
60 #else:
61 #set excel_file = $input_excel_cond.excel_source_cond.input_excel
62 #end if
63 #end if
64 python '$__tool_directory__/vsnp_get_snps.py'
65 --ac $ac
66 #if str($input_excel_cond.input_excel_param) == 'yes':
67 --input_excel '$excel_file'
68 #end if
69 $all_isolates
70 --input_vcf_dir '$input_vcf_dir'
71 --min_mq $min_mq
72 --min_quality_score $min_quality_score
73 --output_json_avg_mq_dir '$output_json_avg_mq_dir'
74 --output_json_snps_dir '$output_json_snps_dir'
75 --output_snps_dir '$output_snps_dir'
76 --output_summary '$output_summary'
77 --processes \${GALAXY_SLOTS:-8}
78 --quality_score_n_threshold $quality_score_n_threshold
79 --dbkey '$dbkey'
80 ]]></command>
81 <inputs>
82 <conditional name="input_zc_vcf_type_cond"> <!-- Zero coverage VCF input: either one dataset or a list collection -->
83 <param name="input_zc_vcf_type" type="select" label="Choose the category of the files to be analyzed">
84 <option value="collection" selected="true">A collection of zero coverage VCF files</option>
85 <option value="single">A single zero coverage VCF file</option>
86 </param>
87 <when value="single">
88 <param name="input_zc_vcf" type="data" format="vcf" label="Zero coverage VCF file"/>
89 </when>
90 <when value="collection">
91 <param name="input_zc_vcf_collection" format="vcf" type="data_collection" collection_type="list" label="Collection of zero coverage VCF files"/>
92 </when>
93 </conditional>
94 <param name="input_vcf_collection" format="vcf" type="data_collection" collection_type="list" label="Collection of zero coverage VCF files with SNPs found in closely related isolate groups"/>
95 <param name="ac" type="integer" min="0" value="2" label="Allele count threshold" help="At least 1 position must have this value for a SNP to be added to a group"/>
96 <param name="min_mq" type="integer" min="0" value="56" label="Map quality threshold" help="At least 1 position must have a higher MQ value for a SNP to be added to a group"/>
97 <param name="min_quality_score" type="integer" min="0" value="150" label="Quality score threshold" help="At least 1 position must have a higher quality score for a SNP to be added to a group"/>
98 <param name="quality_score_n_threshold" type="integer" min="0" value="150" label="Minimum quality score N value for alleles" help="Alleles are marked as N for quality scores between this value and the minimum quality score value above"/>
99 <conditional name="input_excel_cond"> <!-- Optional Excel file for grouping and filtering; the "no" branch adds no inputs -->
100 <param name="input_excel_param" type="select" label="Use Excel file for grouping and filtering?">
101 <option value="yes" selected="true">Yes</option>
102 <option value="no">No</option>
103 </param>
104 <when value="yes">
105 <conditional name="excel_source_cond">
106 <param name="excel_source" type="select" label="Choose the source for the Excel file">
107 <option value="cached">locally cached</option>
108 <option value="history">from history</option>
109 </param>
110 <when value="cached">
111 <param name="input_excel" type="select" label="Excel file">
112 <options from_data_table="vsnp_excel"> <!-- Entries are filtered on the dbkey metadata of input_vcf_collection -->
113 <filter type="data_meta" column="0" key="dbkey" ref="input_vcf_collection"/>
114 <validator type="no_options" message="No built-in Excel grouping and filtering datasets are available"/>
115 </options>
116 </param>
117 </when>
118 <when value="history">
119 <param name="input_excel" type="data" format="xlsx" label="Excel file"/>
120 </when>
121 </conditional>
122 </when>
123 <when value="no"/>
124 </conditional>
125 <param name="all_isolates" type="boolean" truevalue="--all_isolates" falsevalue="" checked="false" label="Create a group containing all isolates?"/>
126 </inputs>
127 <outputs> <!-- Three list collections discovered from the named directories, plus one HTML summary dataset -->
128 <collection name="snps" type="list" label="${tool.name} on ${on_string} (SNPs)">
129 <discover_datasets pattern="__name_and_ext__" directory="output_snps_dir"/>
130 </collection>
131 <collection name="json_avg_mq" type="list" label="${tool.name} on ${on_string} (average mq)">
132 <discover_datasets pattern="__name_and_ext__" directory="output_json_avg_mq_dir"/>
133 </collection>
134 <collection name="json_snps" type="list" label="${tool.name} on ${on_string} (SNPs as json)">
135 <discover_datasets pattern="__name_and_ext__" directory="output_json_snps_dir"/>
136 </collection>
137 <data name="output_summary" format="html" label="${tool.name} on ${on_string} (summary)"/>
138 </outputs>
139 <tests>
140 <!--
141 Unfortunately the test files cannot be gzipped since Galaxy changes the file names
142 to be something like 00-0121_WI_Cervid_99-A_vcf_gz, and the VCF Reader requires
143 gzipped files to have a .gz extension. Without that extension, reading fails with
144 UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
145 -->
146 <!-- A single vcf input, no excel file, all_isolates is False -->
147 <test expect_num_outputs="4">
148 <param name="input_zc_vcf_type" value="single"/>
149 <param name="input_zc_vcf" value="input_zc_vcf.vcf" ftype="vcf" dbkey="89"/>
150 <param name="input_vcf_collection">
151 <collection type="list">
152 <element name="SRR8073662_zc.vcf" value="SRR8073662_zc.vcf" dbkey="89"/>
153 <element name="SRR1792272_zc.vcf" value="SRR1792272_zc.vcf" dbkey="89"/>
154 </collection>
155 </param>
156 <param name="input_excel_param" value="no"/>
157 <output_collection name="snps" type="list" count="1">
158 <element name="all_vcf" ftype="fasta">
159 <assert_contents>
160 <has_size value="150"/>
161 </assert_contents>
162 </element>
163 </output_collection>
164 <output_collection name="json_avg_mq" type="list" count="1">
165 <element name="all_vcf" ftype="json">
166 <assert_contents>
167 <has_size value="551"/>
168 </assert_contents>
169 </element>
170 </output_collection>
171 <output_collection name="json_snps" type="list" count="1">
172 <element name="all_vcf" ftype="json">
173 <assert_contents>
174 <has_size value="876"/>
175 </assert_contents>
176 </element>
177 </output_collection>
178 <output name="output_summary" ftype="html">
179 <assert_contents>
180 <has_size value="303"/>
181 </assert_contents>
182 </output>
183 </test>
184 <!-- An input collection, no excel file, all_isolates is False -->
185 <test expect_num_outputs="4">
186 <param name="input_zc_vcf_type" value="collection"/>
187 <param name="input_zc_vcf_collection">
188 <collection type="list">
189 <element name="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" value="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" dbkey="89"/>
190 <element name="BCG_Tokyo_Unknown_JP_DRR029468.vcf" value="BCG_Tokyo_Unknown_JP_DRR029468.vcf" dbkey="89"/>
191 </collection>
192 </param>
193 <param name="input_vcf_collection">
194 <collection type="list">
195 <element name="01_1787_FL_Zoo_Jaguar.vcf" value="01_1787_FL_Zoo_Jaguar.vcf" dbkey="89"/>
196 <element name="02_5877_MEX_TX_Fed.vcf" value="02_5877_MEX_TX_Fed.vcf" dbkey="89"/>
197 <element name="02_0585_COA_TX_Fed.vcf" value="02_0585_COA_TX_Fed.vcf" dbkey="89"/>
198 </collection>
199 </param>
200 <param name="input_excel_param" value="no"/>
201 <output_collection name="snps" type="list" count="1">
202 <element name="all_vcf" ftype="fasta">
203 <assert_contents>
204 <has_size value="5226"/>
205 </assert_contents>
206 </element>
207 </output_collection>
208 <output_collection name="json_avg_mq" type="list" count="1">
209 <element name="all_vcf" ftype="json">
210 <assert_contents>
211 <has_size value="24332"/>
212 </assert_contents>
213 </element>
214 </output_collection>
215 <output_collection name="json_snps" type="list" count="1">
216 <element name="all_vcf" ftype="json">
217 <assert_contents>
218 <has_size value="38798"/>
219 </assert_contents>
220 </element>
221 </output_collection>
222 <output name="output_summary" ftype="html">
223 <assert_contents>
224 <has_size value="303"/>
225 </assert_contents>
226 </output>
227 </test>
228 <!-- An input collection, an excel file, all_isolates is False -->
229 <test expect_num_outputs="4">
230 <param name="input_zc_vcf_type" value="collection"/>
231 <param name="input_zc_vcf_collection">
232 <collection type="list">
233 <element name="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" value="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" dbkey="89"/>
234 <element name="BCG_Tokyo_Unknown_JP_DRR029468.vcf" value="BCG_Tokyo_Unknown_JP_DRR029468.vcf" dbkey="89"/>
235 </collection>
236 </param>
237 <param name="input_vcf_collection">
238 <collection type="list">
239 <element name="01_1787_FL_Zoo_Jaguar.vcf" value="01_1787_FL_Zoo_Jaguar.vcf" dbkey="89"/>
240 <element name="02_5877_MEX_TX_Fed.vcf" value="02_5877_MEX_TX_Fed.vcf" dbkey="89"/>
241 <element name="02_0585_COA_TX_Fed.vcf" value="02_0585_COA_TX_Fed.vcf" dbkey="89"/>
242 </collection>
243 </param>
244 <param name="input_excel_param" value="yes"/>
245 <param name="input_excel" value="89"/>
246 <output_collection name="snps" type="list" count="1">
247 <element name="Mbovis-17" ftype="fasta">
248 <assert_contents>
249 <has_size value="749"/>
250 </assert_contents>
251 </element>
252 </output_collection>
253 <output_collection name="json_avg_mq" type="list" count="1">
254 <element name="Mbovis-17" ftype="json">
255 <assert_contents>
256 <has_size value="10884"/>
257 </assert_contents>
258 </element>
259 </output_collection>
260 <output_collection name="json_snps" type="list" count="1">
261 <element name="Mbovis-17" ftype="json">
262 <assert_contents>
263 <has_size value="6396"/>
264 </assert_contents>
265 </element>
266 </output_collection>
267 <output name="output_summary" ftype="html">
268 <assert_contents>
269 <has_size value="1057"/>
270 </assert_contents>
271 </output>
272 </test>
273 <!-- An input collection, an excel file, all_isolates is True -->
274 <test expect_num_outputs="4">
275 <param name="input_zc_vcf_type" value="collection"/>
276 <param name="input_zc_vcf_collection">
277 <collection type="list">
278 <element name="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" value="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" dbkey="89"/>
279 <element name="BCG_Tokyo_Unknown_JP_DRR029468.vcf" value="BCG_Tokyo_Unknown_JP_DRR029468.vcf" dbkey="89"/>
280 </collection>
281 </param>
282 <param name="input_vcf_collection">
283 <collection type="list">
284 <element name="01_1787_FL_Zoo_Jaguar.vcf" value="01_1787_FL_Zoo_Jaguar.vcf" dbkey="89"/>
285 <element name="02_5877_MEX_TX_Fed.vcf" value="02_5877_MEX_TX_Fed.vcf" dbkey="89"/>
286 <element name="02_0585_COA_TX_Fed.vcf" value="02_0585_COA_TX_Fed.vcf" dbkey="89"/>
287 </collection>
288 </param>
289 <param name="input_excel_param" value="yes"/>
290 <param name="input_excel" value="89"/>
291 <param name="all_isolates" value="true"/> <!-- boolean test values are given as true/false, not as the truevalue string -->
292 <output_collection name="snps" type="list" count="2">
293 <element name="Mbovis-17" ftype="fasta">
294 <assert_contents>
295 <has_size value="749"/>
296 </assert_contents>
297 </element>
298 <element name="all_vcf" ftype="fasta">
299 <assert_contents>
300 <has_size value="4920"/>
301 </assert_contents>
302 </element>
303 </output_collection>
304 <output_collection name="json_avg_mq" type="list" count="2">
305 <element name="Mbovis-17" ftype="json">
306 <assert_contents>
307 <has_size value="10884"/>
308 </assert_contents>
309 </element>
310 <element name="all_vcf" ftype="json">
311 <assert_contents>
312 <has_size value="24332"/>
313 </assert_contents>
314 </element>
315 </output_collection>
316 <output_collection name="json_snps" type="list" count="2">
317 <element name="Mbovis-17" ftype="json">
318 <assert_contents>
319 <has_size value="6396"/>
320 </assert_contents>
321 </element>
322 <element name="all_vcf" ftype="json">
323 <assert_contents>
324 <has_size value="36466"/>
325 </assert_contents>
326 </element>
327 </output_collection>
328 <output name="output_summary" ftype="html">
329 <assert_contents>
330 <has_size value="1056"/>
331 </assert_contents>
332 </output>
333 </test>
334 </tests>
335 <help>
336 **What it does**
337
338 Accepts a zero coverage VCF file produced by the **vSNP: add zero coverage** tool (or a collection of them) along with a collection
339 of zero coverage VCF files that have been aligned with the same reference and contain SNPs called between closely related isolate groups.
340 The tool produces fasta files containing SNP alignments, json files containing the SNP positions and additional json files containing
341 the average map quality values.
342
343 The SNP alignments produced by this tool are used to create phylogenetic trees, so larger input collections result in more populated
344 phylogenetic trees. Both of the json outputs are used by the **vSNP: build tables** tool to produce annotated SNP tables in the form
345 of Excel spreadsheets.
346
347 An Excel spreadsheet containing specified SNPs can optionally be used to filter desired SNP positions by group. Users can choose a
348 locally cached Excel spreadsheet or one from their current history.
349
350 A SNP is added to a group if it has at least one position with a specified allele count value, a quality score greater than a specified
351 value, and a map quality greater than a specified value.
352
353 If the allele count equals the specified value (2) and the quality score for a SNP position is greater than the minimum quality score
354 value (150), the alternate allele is called.
355
356 However, if the allele count is 1, the position is called ambiguous. Deletions are called when the alternate allele is a gap. If the
357 quality score is less than or equal to the minimum quality score N value for alleles (150), the allele is marked "N".
358
359 **Required Options**
360
361 * **Zero coverage VCF file(s)** - Select a single zero coverage VCF file or a collection of them, typically produced by the **vSNP: add zero coverage** tool, from the current history.
362 * **Collection of zero coverage VCF files with SNPs found in closely related isolate groups** - Select a dataset collection of zero coverage VCF files from the current history.
363
364 **Additional Options**
365
366 * **Allele count threshold** - At least 1 position must have an allele count greater than this value for a SNP to be added to a group (2 is optimal).
367 * **Map quality threshold** - At least 1 position must have a higher MQ value for a SNP to be added to a group (56 is optimal).
368 * **Quality score threshold** - At least 1 position must have a higher quality score for a SNP to be added to a group (150 is optimal).
369 * **Minimum quality score N value for alleles** - If none of the above 3 requirements is met and the quality score is less than or equal to the minimum quality score N value for alleles, the allele is marked "N" (150 is optimal).
370 * **Use Excel file for grouping and filtering?** - select Yes to filter desired SNP positions by group. A cached Excel spreadsheet provides the most widely used SNP positions for grouping, but a custom spreadsheet can be selected from the current history.
371 * **Create a group containing all isolates?** - select Yes to output an additional group consisting of all isolates.
372 </help>
373 <expand macro="citations"/>
374 </tool>
375