comparison deepvariant.xml @ 0:7608209110d3 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deepvariant commit e46feb5432b28a9360a1d4e8a6618e6ed91008fe"
author iuc
date Mon, 06 Sep 2021 17:34:08 +0000
parents
children b778a18bd878
comparison
equal deleted inserted replaced
-1:000000000000 0:7608209110d3
1 <tool id='deepvariant' name='DeepVariant' version='@TOOL_VERSION@+galaxy@SUFFIX_VERSION@' profile='20.01'>
2 <description>deep learning-based variant caller</description>
3 <macros> <!-- macros expanded in this file are expected to be defined in macros.xml -->
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="edam_ontology"/>
7 <expand macro="requirements"/>
8 <command detect_errors='exit_code'><![CDATA[
9 ln -s '${reads}' reads_alignment.bam ## expose the alignment and its index under fixed names in the working directory
10 && ln -s '${reads.metadata.bam_index}' reads_alignment.bam.bai
11 #if $regions_conditional.regions_option == 'bed'
12 && ln -s '${regions_conditional.bed_file}' region.bed
13 #end if
14 #if $reference_genome.source == 'history':
15 #set $ref_genome = 'reference.fasta'
16 && ln -s -f '${reference_genome.history_item}' '$ref_genome'
17 && samtools faidx '$ref_genome' ## a reference taken from the history is indexed inside the job
18 #else:
19 #set $ref_genome = $reference_genome.index.fields.path
20 #end if
21 && run_deepvariant
22 --model_type='$model_type'
23 --ref='$ref_genome' ## quoted: for built-in genomes this is a path read from the data table
24 --reads=reads_alignment.bam
25 --output_vcf='./output.vcf.gz'
26 #if $output_gvcf
27 --output_gvcf='./output.g.vcf.gz'
28 #end if
29 #if $regions_conditional.regions_option == 'region'
30 --regions '$regions_conditional.region_literal' ## quoted: free-text user input must reach the tool as one shell word
31 #else if $regions_conditional.regions_option == 'bed'
32 --regions region.bed
33 #end if
34 --call_variants_extra_args="use_openvino=true" ## Setting this will use OpenVINO on Intel CPUs, which empirically reduces call_variants runtime by 15%-25%.
35 --num_shards=\${GALAXY_SLOTS:-2}
36 && gunzip './output.vcf.gz' ## outputs are collected uncompressed (output.vcf / output.g.vcf)
37 #if $output_gvcf
38 && gunzip './output.g.vcf.gz'
39 #end if
40 ]]> </command>
41 <inputs>
42 <conditional name="reference_genome"> <!-- reference comes either from the fasta_indexes data table or from a history FASTA -->
43 <param name="source" type="select" label="Source for the reference genome" help="Built-in references were created using default options.">
44 <option value="indexed" selected="true">Use a built-in genome</option>
45 <option value="history">Use a genome from history</option>
46 </param>
47 <when value="indexed">
48 <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team.">
49 <options from_data_table="fasta_indexes">
50 <filter type="sort_by" column="2" />
51 <validator type="no_options" message="No genomes are available for the selected input dataset" />
52 </options>
53 </param>
54 </when>
55 <when value="history">
56 <param name="history_item" type="data" format="fasta" label="Reference genome" help="A reference genome in FASTA format" />
57 </when>
58 </conditional>
59 <param argument="--reads" type="data" format="bam" label="BAM file" help="Aligned reads in BAM format. The reads must be aligned to the reference genome" />
60 <param argument="--model_type" type="select" label="Model type" help="Type of model to use for variant calling">
61 <option value="WGS">WGS: Illumina whole genome sequencing</option>
62 <option value="WES">WES: Illumina whole exome sequencing</option>
63 <option value="PACBIO">PacBio HiFi</option>
64 <option value="HYBRID_PACBIO_ILLUMINA">Hybrid PacBio HiFi-Illumina</option>
65 </param>
66 <conditional name="regions_conditional"> <!-- optional restriction of the analysis to a region literal or a BED file -->
67 <param name="regions_option" type="select" label="Select specific regions to process" help="Restrict the analysis to specific regions, given either as a region literal such as chr20:10-20 or as a BED file.">
68 <option value="disabled" selected="true">Disabled</option>
69 <option value="region">Specify region literals</option>
70 <option value="bed">Provide a BED file</option>
71 </param>
72 <when value="disabled"/>
73 <when value="region">
74 <param name="region_literal" argument="--regions" type="text" label="Regions" help="This option refers to contigs present in the reference genome. Enter a contig name or a region literal, for example 'chr20' or 'chr20:10,000,000-11,000,000'. Only letters, digits, ',', ':' and '-' are accepted, so space-separated lists are not supported.">
75 <sanitizer invalid_char=""> <!-- characters outside the list below are removed from the value -->
76 <valid initial="string.letters,string.digits">
77 <add value="," />
78 <add value=":" />
79 <add value="-" />
80 </valid>
81 </sanitizer>
82 <validator type="regex" message="Only letters, digits, ',', ':' and '-' are allowed">^[0-9a-zA-Z,:-]+$</validator> <!-- anchored so the whole value has to match, not only its beginning -->
83 </param>
84 </when>
85 <when value="bed">
86 <param name="bed_file" argument="--regions" type="data" format="bed" label="BED file" help="The BED file should store the genomic regions of interest" />
87 </when>
88 </conditional>
89 <param argument="--output_gvcf" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Generate genomic VCF (gVCF) output" help="The key difference between a regular VCF and a gVCF is that the gVCF has records for all sites, whether there is a variant call there or not. The goal is to have every site represented in the file in order to do joint analysis of a cohort in subsequent steps" />
90 </inputs>
91 <outputs> <!-- vcf_file and html_report are always declared; gvcf_file is filtered on the output_gvcf parameter -->
92 <data name="vcf_file" format="vcf" from_work_dir="output.vcf" label="${tool.name} on ${on_string}: VCF file"/>
93 <data name="html_report" format="html" from_work_dir="output.visual_report.html" label="${tool.name} on ${on_string}: HTML report"/> <!-- NOTE(review): this file name does not appear in the command section; presumably run_deepvariant writes it next to the VCF, confirm -->
94 <data name="gvcf_file" format="vcf" from_work_dir="output.g.vcf" label="${tool.name} on ${on_string}: gVCF file">
95 <filter>output_gvcf</filter> <!-- keep this output only when the output_gvcf boolean is set -->
96 </data>
97 </outputs>
98 <tests>
99 <test expect_num_outputs="2"> <!-- Test default settings: history reference, WGS model, no region restriction, no gVCF -->
100 <conditional name="reference_genome">
101 <param name="source" value="history"/>
102 <param name="history_item" value="reference.fasta"/>
103 </conditional>
104 <param name="reads" value="reads.bam"/>
105 <param name="model_type" value="WGS"/>
106 <param name="output_gvcf" value="False"/>
107 <conditional name="regions_conditional">
108 <param name="regions_option" value="disabled"/>
109 </conditional>
110 <output name="vcf_file" file="output.vcf" ftype="vcf">
111 <assert_contents>
112 <has_text text="##fileformat=VCFv4.2"/>
113 <has_size value="2473"/>
114 </assert_contents>
115 </output>
116 <output name="html_report" file="report.html" ftype="html">
117 <assert_contents>
118 <has_size value="19287" delta="100"/>
119 </assert_contents>
120 </output>
121 </test>
122 <!-- Test the region literal option: calling is restricted to K03455:1-2669 -->
123 <test expect_num_outputs="2">
124 <conditional name="reference_genome">
125 <param name="source" value="history"/>
126 <param name="history_item" value="reference.fasta"/>
127 </conditional>
128 <param name="reads" value="reads.bam"/>
129 <param name="model_type" value="WGS"/>
130 <conditional name="regions_conditional">
131 <param name="regions_option" value="region"/>
132 <param name="region_literal" value="K03455:1-2669"/>
133 </conditional>
134 <output name="vcf_file" ftype="vcf">
135 <assert_contents>
136 <has_text text="##fileformat=VCFv4.2"/>
137 <has_size value="1846"/>
138 </assert_contents>
139 </output>
140 <output name="html_report" ftype="html">
141 <assert_contents>
142 <has_size value="18864" delta="100"/>
143 </assert_contents>
144 </output>
145 </test>
146 <!-- Test the BED region option: regions are supplied through the region.bed test file -->
147 <test expect_num_outputs="2">
148 <conditional name="reference_genome">
149 <param name="source" value="history"/>
150 <param name="history_item" value="reference.fasta"/>
151 </conditional>
152 <param name="reads" value="reads.bam"/>
153 <param name="model_type" value="WGS"/>
154 <conditional name="regions_conditional">
155 <param name="regions_option" value="bed"/>
156 <param name="bed_file" value="region.bed"/>
157 </conditional>
158 <output name="vcf_file" ftype="vcf">
159 <assert_contents>
160 <has_text text="##fileformat=VCFv4.2"/>
161 <has_size value="1846"/>
162 </assert_contents>
163 </output>
164 <output name="html_report" ftype="html">
165 <assert_contents>
166 <has_size value="18864" delta="100"/>
167 </assert_contents>
168 </output>
169 </test>
170 <!-- Test the gVCF output option: a third output (gvcf_file) is expected in addition to the VCF and the HTML report -->
171 <test expect_num_outputs="3">
172 <conditional name="reference_genome">
173 <param name="source" value="history"/>
174 <param name="history_item" value="reference.fasta"/>
175 </conditional>
176 <param name="reads" value="reads.bam"/>
177 <param name="model_type" value="WGS"/>
178 <param name="output_gvcf" value="True"/>
179 <conditional name="regions_conditional">
180 <param name="regions_option" value="region"/>
181 <param name="region_literal" value="K03455:1-2669"/>
182 </conditional>
183 <output name="vcf_file" ftype="vcf">
184 <assert_contents>
185 <has_text text="##fileformat=VCFv4.2"/>
186 <has_size value="1846"/>
187 </assert_contents>
188 </output>
189 <output name="gvcf_file" file="output.g.vcf" ftype="vcf">
190 <assert_contents>
191 <has_text text="##fileformat=VCFv4.2"/>
192 <has_size value="3195"/>
193 </assert_contents>
194 </output>
195 <output name="html_report" ftype="html">
196 <assert_contents>
197 <has_size value="18864" delta="100"/>
198 </assert_contents>
199 </output>
200 </test>
201 <!-- Test CRAM format input: reads.cram is passed to the reads parameter; the expected sizes match the default BAM test -->
202 <test expect_num_outputs="2">
203 <conditional name="reference_genome">
204 <param name="source" value="history"/>
205 <param name="history_item" value="reference.fasta"/>
206 </conditional>
207 <param name="reads" value="reads.cram"/>
208 <param name="model_type" value="WGS"/>
209 <conditional name="regions_conditional">
210 <param name="regions_option" value="disabled"/>
211 </conditional>
212 <output name="vcf_file" ftype="vcf">
213 <assert_contents>
214 <has_text text="##fileformat=VCFv4.2"/>
215 <has_size value="2473"/>
216 </assert_contents>
217 </output>
218 <output name="html_report" ftype="html">
219 <assert_contents>
220 <has_size value="19287" delta="100"/>
221 </assert_contents>
222 </output>
223 </test>
224 <!-- Test a built-in (indexed) reference: the phix174 entry is assumed to be provided by the test data table (TODO confirm) -->
225 <test expect_num_outputs="2">
226 <conditional name="reference_genome">
227 <param name="source" value="indexed"/>
228 <param name="index" value="phix174"/>
229 </conditional>
230 <param name="reads" value="reads.bam"/>
231 <param name="model_type" value="WGS"/>
232 <conditional name="regions_conditional">
233 <param name="regions_option" value="disabled"/>
234 </conditional>
235 <output name="vcf_file" ftype="vcf">
236 <assert_contents>
237 <has_text text="##fileformat=VCFv4.2"/>
238 <has_size value="2473"/>
239 </assert_contents>
240 </output>
241 <output name="html_report" ftype="html">
242 <assert_contents>
243 <has_size value="19287" delta="100"/>
244 </assert_contents>
245 </output>
246 </test>
247 </tests>
248 <help><![CDATA[
249
250 .. class:: infomark
251
252 **Purpose**
253
254 DeepVariant is a deep learning-based variant caller that takes aligned reads (in BAM or CRAM format), produces pileup image tensors from them, classifies each tensor using a convolutional neural network, and finally reports the results in a standard VCF or gVCF file.
255
256 DeepVariant supports germline variant-calling in diploid organisms.
257
258 - NGS (Illumina) data for either a `whole genome <https://github.com/google/deepvariant/blob/r1.2/docs/deepvariant-case-study.md>`_ or `whole exome <https://github.com/google/deepvariant/blob/r1.2/docs/deepvariant-exome-case-study.md>`_.
259 - PacBio HiFi data, see the `PacBio case study <https://github.com/google/deepvariant/blob/r1.2/docs/deepvariant-pacbio-model-case-study.md>`_.
260 - Hybrid PacBio HiFi + Illumina WGS, see the `hybrid case study <https://github.com/google/deepvariant/blob/r1.2/docs/deepvariant-hybrid-case-study.md>`_.
261
262 Please also note:
263
264 For somatic data or any other samples where the genotypes go beyond two copies of DNA, DeepVariant will not work out of the box because the only genotypes supported are hom-alt, het, and hom-ref.
265
266 The models included with DeepVariant are only trained on human data. For other organisms, see the blog post on `non-human variant-calling <https://google.github.io/deepvariant/posts/2018-12-05-improved-non-human-variant-calling-using-species-specific-deepvariant-models/>`_ for some possible pitfalls and how to handle them.
267
268 ----
269
270 .. class:: infomark
271
272 **How DeepVariant works**
273
274 DeepVariant relies on `Nucleus <https://github.com/google/nucleus>`_, a library of Python and C++ code for reading and writing data in common genomics file formats (like SAM and VCF) designed for painless integration with the `TensorFlow <https://www.tensorflow.org/>`_ machine learning framework. Nucleus was built with DeepVariant in mind and open-sourced separately so it can be used by anyone in the genomics research community for other projects. See this blog post on `Using Nucleus and TensorFlow for DNA Sequencing Error Correction <https://google.github.io/deepvariant/posts/2019-01-31-using-nucleus-and-tensorflow-for-dna-sequencing-error-correction/>`_.
275
276
277 ]]> </help>
278 <expand macro="citations"/>
279 </tool>