<tool id="vsnp_get_snps" name="vSNP: get SNPs" version="@WRAPPER_VERSION@.0" profile="@PROFILE@">
    <description></description>
    <!--
    Galaxy wrapper around vsnp_get_snps.py.  The command template links every
    input VCF file into a single working directory, optionally resolves an Excel
    grouping/filtering file (from the vsnp_excel data table or from the history),
    and runs the script.  The three output collections are discovered from the
    output directories created by the command.
    -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <requirements>
        <requirement type="package" version="0.25.3">pandas</requirement>
        <requirement type="package" version="0.6.8">pyvcf</requirement>
        <requirement type="package" version="1.2.0">xlrd</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
#import re

## Working directories: one holds the symlinked input VCF files, the other
## three are the directories the output collections are discovered from.
#set input_vcf_dir = 'input_vcf_dir'
#set output_json_avg_mq_dir = 'output_json_avg_mq_dir'
#set output_json_snps_dir = 'output_json_snps_dir'
#set output_snps_dir = 'output_snps_dir'

mkdir -p $input_vcf_dir &&
mkdir -p $output_json_avg_mq_dir &&
mkdir -p $output_json_snps_dir &&
mkdir -p $output_snps_dir &&

## Every dataset in input_vcf_collection must carry the same dbkey.  The
## placeholder '?' means that no dbkey has been seen yet.
#set dbkey = '?'
#for $i in $input_vcf_collection:
    #if str($dbkey) == '?':
        #set dbkey = str($i.metadata.dbkey)
    #else if str($dbkey) != str($i.metadata.dbkey):
        ## The trailing semicolon keeps the text rendered after this branch
        ## from being appended to the exit command as extra arguments.
        >&2 echo "The dbkeys associated with the zero coverage VCF files with SNPs found in closely related isolate groups are not unique" &&
        exit 1;
    #end if
    ## Replace every character other than whitespace, word characters and
    ## hyphens with an underscore before using the identifier as a file name.
    #set vcf_identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
    ln -s '${i}' '$input_vcf_dir/${vcf_identifier}' &&
#end for
#if str($dbkey) == '?':
    >&2 echo "The dbkey must be set for the zero coverage VCF files with SNPs found in closely related isolate groups" && exit 1;
#end if

## The zero coverage VCF input(s) are linked into the same directory.
#if str($input_zc_vcf_type_cond.input_zc_vcf_type) == "single":
    #set zc_vcf_identifier = re.sub('[^\s\w\-]', '_', str($input_zc_vcf_type_cond.input_zc_vcf.element_identifier))
    ln -s '${input_zc_vcf_type_cond.input_zc_vcf}' '$input_vcf_dir/${zc_vcf_identifier}' &&
#else
    #for $i in $input_zc_vcf_type_cond.input_zc_vcf_collection:
        #set zc_vcf_identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
        ln -s '${i}' '$input_vcf_dir/${zc_vcf_identifier}' &&
    #end for
#end if

#if str($input_excel_cond.input_excel_param) == 'yes':
    #if str($input_excel_cond.excel_source_cond.excel_source) == 'cached':
        ## If no data table row matches the dbkey, this message is what gets
        ## passed as the value of the input_excel option below.
        #set excel_file = 'No genome specified for input VCF (database) file(s)'
        #set excel_fields = $__app__.tool_data_tables['vsnp_excel'].get_fields()
        ## The value of excel_fields is a nested list that looks like this.
        ## [['AF2122', 'Mbovis_define_filter.xlsx', '~/tool-data/vsnp/AF2122/excel/Mbovis_define_filter.xlsx', 'Excel file for AF2122'],...]
        #for $i in $excel_fields:
            #if str($i[0]) == str($dbkey):
                #set excel_file = $i[2]
                #break
            #end if
        #end for
    #else:
        #set excel_file = $input_excel_cond.excel_source_cond.input_excel
    #end if
#end if

python '$__tool_directory__/vsnp_get_snps.py'
--ac $ac
#if str($input_excel_cond.input_excel_param) == 'yes':
    --input_excel '$excel_file'
#end if
$all_isolates
--input_vcf_dir '$input_vcf_dir'
--min_mq $min_mq
--min_quality_score $min_quality_score
--output_json_avg_mq_dir '$output_json_avg_mq_dir'
--output_json_snps_dir '$output_json_snps_dir'
--output_snps_dir '$output_snps_dir'
--output_summary '$output_summary'
--processes \${GALAXY_SLOTS:-8}
--quality_score_n_threshold $quality_score_n_threshold
--dbkey '$dbkey'
]]></command>
    <inputs>
        <conditional name="input_zc_vcf_type_cond">
            <param name="input_zc_vcf_type" type="select" label="Choose the category of the files to be analyzed">
                <option value="collection" selected="true">A collection of zero coverage VCF files</option>
                <option value="single">A single zero coverage VCF file</option>
            </param>
            <when value="single">
                <param name="input_zc_vcf" type="data" format="vcf" label="Zero coverage VCF file"/>
            </when>
            <when value="collection">
                <param name="input_zc_vcf_collection" format="vcf" type="data_collection" collection_type="list" label="Collection of zero coverage VCF files"/>
            </when>
        </conditional>
        <param name="input_vcf_collection" format="vcf" type="data_collection" collection_type="list" label="Collection of zero coverage VCF files with SNPs found in closely related isolate groups"/>
        <param name="ac" type="integer" min="0" value="2" label="Allele count threshold" help="At least 1 position must have this value for a SNP to be added to a group"/>
        <param name="min_mq" type="integer" min="0" value="56" label="Map quality threshold" help="At least 1 position must have a higher MQ value for a SNP to be added to a group"/>
        <param name="min_quality_score" type="integer" min="0" value="150" label="Quality score threshold" help="At least 1 position must have a higher quality score for a SNP to be added to a group"/>
        <param name="quality_score_n_threshold" type="integer" min="0" value="150" label="Minimum quality score N value for alleles" help="Alleles are marked as N for quality scores between this value and the minimum quality score value above"/>
        <conditional name="input_excel_cond">
            <param name="input_excel_param" type="select" label="Use Excel file for grouping and filtering?">
                <option value="yes" selected="true">Yes</option>
                <option value="no">No</option>
            </param>
            <when value="yes">
                <conditional name="excel_source_cond">
                    <param name="excel_source" type="select" label="Choose the source for the Excel file">
                        <option value="cached">locally cached</option>
                        <option value="history">from history</option>
                    </param>
                    <when value="cached">
                        <param name="input_excel" type="select" label="Excel file">
                            <options from_data_table="vsnp_excel">
                                <validator type="no_options" message="No built-in Excel grouping and filtering datasets are available"/>
                            </options>
                        </param>
                    </when>
                    <when value="history">
                        <param name="input_excel" type="data" format="xlsx" label="Excel file"/>
                    </when>
                </conditional>
            </when>
            <when value="no"/>
        </conditional>
        <!-- The explicit name keeps the Cheetah variable $all_isolates stable; argument documents the emitted flag. -->
        <param name="all_isolates" argument="--all_isolates" type="boolean" truevalue="--all_isolates" falsevalue="" checked="false" label="Create a group containing all isolates?"/>
    </inputs>
    <outputs>
        <collection name="snps" type="list" label="${tool.name} on ${on_string} (SNPs)">
            <discover_datasets pattern="__name_and_ext__" directory="output_snps_dir"/>
        </collection>
        <collection name="json_avg_mq" type="list" label="${tool.name} on ${on_string} (average mq)">
            <discover_datasets pattern="__name_and_ext__" directory="output_json_avg_mq_dir"/>
        </collection>
        <collection name="json_snps" type="list" label="${tool.name} on ${on_string} (SNPs as json)">
            <discover_datasets pattern="__name_and_ext__" directory="output_json_snps_dir"/>
        </collection>
        <data name="output_summary" format="html" label="${tool.name} on ${on_string} (summary)"/>
    </outputs>
    <tests>
        <!--
        Unfortunately the test files cannot be gzipped since Galaxy changes the file names
        to be something like 00-0121_WI_Cervid_99-A_vcf_gz, and the VCF Reader requires
        gzipped files to have a .gz extension.  The exception is
        UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
        -->
        <!-- A single vcf input, no excel file, all_isolates is False -->
        <test expect_num_outputs="4">
            <param name="input_zc_vcf_type" value="single"/>
            <param name="input_zc_vcf" value="input_zc_vcf.vcf" ftype="vcf" dbkey="89"/>
            <param name="input_vcf_collection">
                <collection type="list">
                    <element name="SRR8073662_zc.vcf" value="SRR8073662_zc.vcf" dbkey="89"/>
                    <element name="SRR1792272_zc.vcf" value="SRR1792272_zc.vcf" dbkey="89"/>
                </collection>
            </param>
            <param name="input_excel_param" value="no"/>
            <output_collection name="snps" type="list" count="1">
                <element name="all_vcf" file="all_vcf.fasta" ftype="fasta" compare="contains"/>
            </output_collection>
            <output_collection name="json_avg_mq" type="list" count="1">
                <element name="all_vcf" file="json_avg_mq_all_vcf.json" ftype="json" compare="contains"/>
            </output_collection>
            <output_collection name="json_snps" type="list" count="1">
                <element name="all_vcf" file="json_all_vcf.json" ftype="json" compare="contains"/>
            </output_collection>
            <output name="output_summary" file="output_summary.html" ftype="html" compare="contains"/>
        </test>
        <!-- An input collection, no excel file, all_isolates is False -->
        <test expect_num_outputs="4">
            <param name="input_zc_vcf_type" value="collection"/>
            <param name="input_zc_vcf_collection">
                <collection type="list">
                    <element name="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" value="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" dbkey="89"/>
                    <element name="BCG_Tokyo_Unknown_JP_DRR029468.vcf" value="BCG_Tokyo_Unknown_JP_DRR029468.vcf" dbkey="89"/>
                </collection>
            </param>
            <param name="input_vcf_collection">
                <collection type="list">
                    <element name="01_1787_FL_Zoo_Jaguar.vcf" value="01_1787_FL_Zoo_Jaguar.vcf" dbkey="89"/>
                    <element name="02_5877_MEX_TX_Fed.vcf" value="02_5877_MEX_TX_Fed.vcf" dbkey="89"/>
                    <element name="02_0585_COA_TX_Fed.vcf" value="02_0585_COA_TX_Fed.vcf" dbkey="89"/>
                </collection>
            </param>
            <param name="input_excel_param" value="no"/>
            <output_collection name="snps" type="list" count="1">
                <element name="all_vcf" file="all_vcf2.fasta" ftype="fasta" compare="contains"/>
            </output_collection>
            <output_collection name="json_avg_mq" type="list" count="1">
                <element name="all_vcf" file="json_avg_mq_all_vcf2.json" ftype="json" compare="contains"/>
            </output_collection>
            <output_collection name="json_snps" type="list" count="1">
                <element name="all_vcf" file="json_all_vcf2.json" ftype="json" compare="contains"/>
            </output_collection>
            <output name="output_summary" file="output_summary2.html" ftype="html" compare="contains"/>
        </test>
        <!-- An input collection, an excel file, all_isolates is False -->
        <test expect_num_outputs="4">
            <param name="input_zc_vcf_type" value="collection"/>
            <param name="input_zc_vcf_collection">
                <collection type="list">
                    <element name="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" value="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" dbkey="89"/>
                    <element name="BCG_Tokyo_Unknown_JP_DRR029468.vcf" value="BCG_Tokyo_Unknown_JP_DRR029468.vcf" dbkey="89"/>
                </collection>
            </param>
            <param name="input_vcf_collection">
                <collection type="list">
                    <element name="01_1787_FL_Zoo_Jaguar.vcf" value="01_1787_FL_Zoo_Jaguar.vcf" dbkey="89"/>
                    <element name="02_5877_MEX_TX_Fed.vcf" value="02_5877_MEX_TX_Fed.vcf" dbkey="89"/>
                    <element name="02_0585_COA_TX_Fed.vcf" value="02_0585_COA_TX_Fed.vcf" dbkey="89"/>
                </collection>
            </param>
            <param name="input_excel_param" value="yes"/>
            <param name="input_excel" value="89"/>
            <output_collection name="snps" type="list" count="1">
                <element name="Mbovis-17" file="Mbovis-17_snps.fasta" ftype="fasta"/>
            </output_collection>
            <output_collection name="json_avg_mq" type="list" count="1">
                <element name="Mbovis-17" file="Mbovis-17_avg_mq_json.json" ftype="json" compare="contains"/>
            </output_collection>
            <output_collection name="json_snps" type="list" count="1">
                <element name="Mbovis-17" file="Mbovis-17_snps_json.json" ftype="json" compare="contains"/>
            </output_collection>
            <output name="output_summary" file="output_summary3.html" ftype="html" compare="contains"/>
        </test>
        <!-- An input collection, an excel file, all_isolates is True -->
        <test expect_num_outputs="4">
            <param name="input_zc_vcf_type" value="collection"/>
            <param name="input_zc_vcf_collection">
                <collection type="list">
                    <element name="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" value="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" dbkey="89"/>
                    <element name="BCG_Tokyo_Unknown_JP_DRR029468.vcf" value="BCG_Tokyo_Unknown_JP_DRR029468.vcf" dbkey="89"/>
                </collection>
            </param>
            <param name="input_vcf_collection">
                <collection type="list">
                    <element name="01_1787_FL_Zoo_Jaguar.vcf" value="01_1787_FL_Zoo_Jaguar.vcf" dbkey="89"/>
                    <element name="02_5877_MEX_TX_Fed.vcf" value="02_5877_MEX_TX_Fed.vcf" dbkey="89"/>
                    <element name="02_0585_COA_TX_Fed.vcf" value="02_0585_COA_TX_Fed.vcf" dbkey="89"/>
                </collection>
            </param>
            <param name="input_excel_param" value="yes"/>
            <param name="input_excel" value="89"/>
            <!-- Boolean parameters are set with true/false in tests, not with their truevalue string. -->
            <param name="all_isolates" value="true"/>
            <output_collection name="snps" type="list" count="2">
                <element name="Mbovis-17" file="Mbovis-17_snps.fasta" ftype="fasta"/>
                <element name="all_vcf" file="all_vcf3.fasta" ftype="fasta"/>
            </output_collection>
            <output_collection name="json_avg_mq" type="list" count="2">
                <element name="Mbovis-17" file="Mbovis-17_avg_mq_json.json" ftype="json" compare="contains"/>
                <element name="all_vcf" file="Mbovis-17_avg_mq_json.json" ftype="json" compare="contains"/>
            </output_collection>
            <output_collection name="json_snps" type="list" count="2">
                <element name="Mbovis-17" file="Mbovis-17_snps_json.json" ftype="json" compare="contains"/>
                <element name="all_vcf" file="Mbovis-17_snps_json.json" ftype="json" compare="contains"/>
            </output_collection>
            <output name="output_summary" file="output_summary4.html" ftype="html" compare="contains"/>
        </test>
    </tests>
    <help>
**What it does**

Accepts a zero coverage VCF file produced by the **vSNP: add zero coverage** tool (or a collection of them) along with a collection
of zero coverage VCF files that have been aligned with the same reference and contain SNPs called between closely related isolate groups.
The tool produces fasta files containing SNP alignments, json files containing the SNP positions and additional json files containing
the average map quality values.

The SNP alignments produced by this tool are used to create phylogenetic trees, so larger input collections result in more populated
phylogenetic trees.  Both of the json outputs are used by the **vSNP: build tables** tool to produce annotated SNP tables in the form
of Excel spreadsheets.

An Excel spreadsheet containing specified SNPs can optionally be used to filter desired SNP positions by group.  Users can choose a
locally cached Excel spreadsheet or one from their current history.

A SNP is added to a group if it has at least one position with a specified allele count value, a quality score greater than a specified
value, and a map quality greater than a specified value.

If the allele count equals the specified value (2) and the quality score for a SNP position is greater than the minimum quality score
value (150), the alternate allele is called.

However, if the allele count is 1, the position is called ambiguous.  Deletions are called when the alternate allele is a gap.  If the
quality score is less than or equal to the minimum quality score N value for alleles (150), the allele is marked "N".

**Required Options**

* **Zero coverage VCF file(s)** - Select a single or collection of zero coverage VCF files, typically produced by the **vSNP: add zero coverage** tool, from the current history.
* **Collection of zero coverage VCF files with SNPs found in closely related isolate groups** - Select a dataset collection of zero coverage vcf files from the current history.

**Additional Options**

* **Allele count threshold** - At least 1 position must have an allele count equal to this value for a SNP to be added to a group (2 is optimal).
* **Map quality threshold** - At least 1 position must have a higher MQ value for a SNP to be added to a group (56 is optimal).
* **Quality score threshold** - At least 1 position must have a higher quality score for a SNP to be added to a group (150 is optimal).
* **Minimum quality score N value for alleles** - If none of the above 3 requirements is met and the quality score is less than or equal to the minimum quality score N value for alleles, the allele is marked "N" (150 is optimal).
* **Use Excel file for grouping and filtering?** - select Yes to filter desired SNP positions by group.  A cached Excel spreadsheet provides the most widely used SNP positions for grouping, but a custom spreadsheet can be selected from the current history.
* **Create a group containing all isolates?** - select Yes to output an additional group consisting of all isolates.
    </help>
    <expand macro="citations"/>
</tool>