0
|
1 <tool id="cg_calldiff" name="calldiff(beta) 1.5" version="1.0.0">
|
|
2 <!--
|
|
3 This tool creates a GUI for the calldiff function of cgatools from Complete Genomics, Inc.
|
|
4 written 6-18-2012 by bcrain@completegenomics.com
|
|
5 -->
|
|
6
|
|
7 <description>compares two Complete Genomics variant files.</description> <!--adds description in toolbar-->
|
|
8
|
|
9 <requirements> <!--declares the external cgatools binary that the command section invokes-->
|
|
10 <requirement type="binary">cgatools</requirement>
|
|
11 </requirements>
|
|
12
|
|
13 <command> <!--run executable: print the first line of the bare cgatools output, then run calldiff; the selected report names are joined into one comma-separated list (one or more spaces become a single comma), and path arguments are single-quoted so the shell passes each one as a single word-->
|
|
14 cgatools | head -1;
|
|
15 cgatools calldiff --beta
|
|
16 --reference '${crr.fields.path}'
|
|
17 --variantsA '$data_sources.inputA'
|
|
18 --variantsB '$data_sources.inputB'
|
|
19 $validation
|
|
20 $diploid
|
|
21 --locus-stats-column-count $column
|
|
22 --max-hypothesis-count $hypothesis
|
|
23 --output-prefix cg_
|
|
24 --reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | sed 's/[ ][ ]*/,/g'`
|
|
25 #if $somatic.report6 == "SomaticOutput"
|
|
26 --genome-rootA '$somatic.genomeA'
|
|
27 --genome-rootB '$somatic.genomeB'
|
|
28 --calibration-root '$somatic.calibration'
|
|
29 #end if
|
|
30 </command>
|
|
31
|
|
32 <outputs> <!--each dataset is taken from the matching cg_ prefixed file in the working directory and is created only when its filter on the report selection is true-->
|
|
33 <data format="tabular" name="output1" from_work_dir="cg_SuperlocusOutput.tsv" label="${tool.name} on ${on_string}: SuperlocusOutput">
|
|
34 <filter>(report1 == 'SuperlocusOutput')</filter>
|
|
35 </data>
|
|
36 <data format="tabular" name="output2" from_work_dir="cg_SuperlocusStats.tsv" label="${tool.name} on ${on_string}: SuperlocusStats">
|
|
37 <filter>(report2 == 'SuperlocusStats')</filter>
|
|
38 </data>
|
|
39 <data format="tabular" name="output3" from_work_dir="cg_LocusOutput.tsv" label="${tool.name} on ${on_string}: LocusOutput">
|
|
40 <filter>(report3 == 'LocusOutput')</filter>
|
|
41 </data>
|
|
42 <data format="tabular" name="output4" from_work_dir="cg_LocusStats.tsv" label="${tool.name} on ${on_string}: LocusStats">
|
|
43 <filter>(report4 == 'LocusStats')</filter>
|
|
44 </data>
|
|
45 <data format="tabular" name="output5a" from_work_dir="cg_VariantsA.tsv" label="${tool.name} on ${on_string}: VariantsA">
|
|
46 <filter>(report5 == 'VariantOutput')</filter>
|
|
47 </data>
|
|
48 <data format="tabular" name="output5b" from_work_dir="cg_VariantsB.tsv" label="${tool.name} on ${on_string}: VariantsB">
|
|
49 <filter>(report5 == 'VariantOutput')</filter>
|
|
50 </data>
|
|
51 <data format="tabular" name="output6" from_work_dir="cg_SomaticOutput.tsv" label="${tool.name} on ${on_string}: SomaticOutput">
|
|
52 <filter>(somatic['report6'] == 'SomaticOutput')</filter>
|
|
53 </data>
|
|
54 </outputs>
|
|
55
|
|
56 <inputs>
|
|
57 <!--form field to select crr file-->
|
|
58 <param name="crr" type="select" label="Reference genome (.crr file)">
|
|
59 <options from_data_table="cg_crr_files" />
|
|
60 </param>
|
|
61
|
|
62 <!--conditional to select variant file input: datasets from the Galaxy history or file paths typed in as text-->
|
|
63 <conditional name="data_sources">
|
|
64 <param name="data_source" type="select" label="Where are the input varfiles?">
|
|
65 <option value="in" selected="true">imported into Galaxy</option>
|
|
66 <option value="out">located outside Galaxy (available only for local Galaxy instances)</option>
|
|
67 </param>
|
|
68 <when value="in">
|
|
69 <!--form fields to select variant files from the history; both validators are applied to each dataset-->
|
|
70 <param name="inputA" type="data" format="cg_var" label="Var file A">
|
|
71 <validator type="unspecified_build" />
|
|
72 <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc"
|
|
73 metadata_name="dbkey" metadata_column="1"
|
|
74 message="cgatools is not currently available for this build."/>
|
|
75 </param>
|
|
76 <param name="inputB" type="data" format="cg_var" label="Var file B">
|
|
77 <validator type="unspecified_build" />
|
|
78 <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc"
|
|
79 metadata_name="dbkey" metadata_column="1"
|
|
80 message="cgatools is not currently available for this build."/>
|
|
81 </param>
|
|
82 </when>
|
|
83 <when value="out">
|
|
84 <!--form fields to enter the paths of variant files located outside Galaxy-->
|
|
85 <param name="inputA" type="text" label="Variant file A (/path/varfile)" size="300" help="Variant file can be compressed (gz, bz2), e.g. /harddrive/GS00000XXXX-DID/GS00000YYYY-ASM/GS00123-DNA_G01_2000/ASM/var-GS00000YYYY-ASM.tsv.bz2"/>
|
|
86 <param name="inputB" type="text" label="Variant file B (/path/varfile)" size="300" help="Variant file can be compressed (gz, bz2), e.g. /harddrive/GS00000XXXX-DID/GS00000YYYY-ASM/GS00123-DNA_G01_2000/ASM/var-GS00000YYYY-ASM.tsv.bz2."/>
|
|
87 </when>
|
|
88 </conditional>
|
|
89
|
|
90 <param name="diploid" type="select" label="Use diploid variant model" help="Uses varScoreEAF instead of varScoreVAF in somatic score computations. Also, uses diploid variant model instead of variable allele mixture model.">
|
|
91 <option value="">no</option>
|
|
92 <option value="--diploid">yes</option>
|
|
93 </param>
|
|
94
|
|
95 <param name="column" type="integer" label="Number of columns for locus compare classification in the locus stats file (default 15)" value="15"/>
|
|
96
|
|
97 <param name="hypothesis" type="integer" label="Maximum number of possible phasings to consider for a superlocus (default 32)" value="32"/>
|
|
98
|
|
99 <param name="validation" type="select" label="Reference cover validation" help="Turns on/off validation that all bases of a chromosome are covered by calls of the variant file.">
|
|
100 <option value="">on</option>
|
|
101 <option value="--no-reference-cover-validation">off</option>
|
|
102 </param>
|
|
103
|
|
104 <param name="report1" type="select" label="Create report SuperlocusOutput">
|
|
105 <option value="">no</option>
|
|
106 <option value="SuperlocusOutput">yes</option>
|
|
107 </param>
|
|
108 <param name="report2" type="select" label="Create report SuperlocusStats">
|
|
109 <option value="">no</option>
|
|
110 <option value="SuperlocusStats">yes</option>
|
|
111 </param>
|
|
112 <param name="report3" type="select" label="Create report LocusOutput">
|
|
113 <option value="">no</option>
|
|
114 <option value="LocusOutput">yes</option>
|
|
115 </param>
|
|
116 <param name="report4" type="select" label="Create report LocusStats">
|
|
117 <option value="">no</option>
|
|
118 <option value="LocusStats">yes</option>
|
|
119 </param>
|
|
120 <param name="report5" type="select" label="Create report VariantOutput" help="Both variant files annotated by comparison results. If the somatic output report is requested, file A is also annotated with the same score ranks as produced in that report."> <!--selecting yes enables both the VariantsA and VariantsB outputs-->
|
|
121 <option value="">no</option>
|
|
122 <option value="VariantOutput">yes</option>
|
|
123 </param>
|
|
124
|
|
125 <conditional name="somatic"> <!--the genome and calibration directory fields below are offered only when the SomaticOutput report is selected-->
|
|
126 <param name="report6" type="select" label="Create report SomaticOutput" help="This report can only be generated on local Galaxy instances. Report for the list of simple variations that are present only in file 'A', annotated with the score that indicates the probability of the variation being truly somatic. Note: generating this report slows calldiff by 10x-20x.">
|
|
127 <option value="">no</option>
|
|
128 <option value="SomaticOutput">yes</option>
|
|
129 </param>
|
|
130 <when value="SomaticOutput">
|
|
131 <param name="genomeA" type="text" size="300" label="Directory for genome A (/path/dir)" help="The 'A' genome directory, e.g. /harddrive/GS00000XXXX-DID/GS00000YYYY-ASM/GS00123-DNA_G01_2000; this directory is expected to contain ASM/REF and ASM/EVIDENCE subdirectories."/>
|
|
132 <param name="genomeB" type="text" size="300" label="Directory for genome B (/path/dir)" help="The 'B' genome directory, e.g. /harddrive/GS00000XXXX-DID/GS00000YYYY-ASM/GS00123-DNA_G01_2000; this directory is expected to contain ASM/REF and ASM/EVIDENCE subdirectories."/>
|
|
133 <param name="calibration" type="text" size="300" label="Directory calibration data (/path/dir)" help="The directory containing calibration data. For example, there should exist a file calibration-root/0.0.0/metrics.tsv. Calibration data can be downloaded from ftp://ftp.completegenomics.com/ScoreCalibrationFiles/var-calibration-v1.tgz"/>
|
|
134 </when>
|
|
135 </conditional>
|
|
136
|
|
137 </inputs>
|
|
138
|
|
139 <help>
|
|
140
|
|
141 **What it does**
|
|
142
|
|
143 This tool compares two Complete Genomics variant files.
|
|
144
|
|
145 **cgatools 1.5.0 Documentation**
|
|
146
|
|
147 Userguide: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-user-guide.pdf
|
|
148
|
|
149 Release notes: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-release-notes.pdf
|
|
150
|
|
151 **Command line reference**::
|
|
152
|
|
153 COMMAND NAME
|
|
154 calldiff - Compares two Complete Genomics variant files.
|
|
155
|
|
156 DESCRIPTION
|
|
157 Compares two Complete Genomics variant files. Divides the genome up into
|
|
158 superloci of nearby variants, then compares the superloci. Also refines the
|
|
159 comparison to determine per-call or per-locus comparison results.
|
|
160
|
|
161 Comparison results are usually described by a semi-colon separated string,
|
|
162 one per allele. Each allele's comparison result is one of the following
|
|
163 classifications:
|
|
164
|
|
165 ref-identical The alleles of the two variant files are identical, and
|
|
166 they are consistent with the reference.
|
|
167 alt-identical The alleles of the two variant files are identical, and
|
|
168 they are inconsistent with the reference.
|
|
169 ref-consistent The alleles of the two variant files are consistent,
|
|
170 and they are consistent with the reference.
|
|
171 alt-consistent The alleles of the two variant files are consistent,
|
|
172 and they are inconsistent with the reference.
|
|
173 onlyA The alleles of the two variant files are inconsistent,
|
|
174 and only file A is inconsistent with the reference.
|
|
175 onlyB The alleles of the two variant files are inconsistent,
|
|
176 and only file B is inconsistent with the reference.
|
|
177 mismatch The alleles of the two variant files are inconsistent,
|
|
178 and they are both inconsistent with the reference.
|
|
179 phase-mismatch The two variant files would be consistent if the
|
|
180 hapLink field had been empty, but they are
|
|
181 inconsistent.
|
|
182 ploidy-mismatch The superlocus did not have uniform ploidy.
|
|
183
|
|
184 In some contexts, this classification is rolled up into a simplified
|
|
185 classification, which is one of "identical", "consistent", "onlyA",
|
|
186 "onlyB", or "mismatch".
|
|
187
|
|
188 A good place to start looking at the results is the superlocus-output file.
|
|
189 It has columns defined as follows:
|
|
190
|
|
191 SuperlocusId An identifier given to the superlocus.
|
|
192 Chromosome The name of the chromosome.
|
|
193 Begin The 0-based offset of the start of the superlocus.
|
|
194 End The 0-based offset of the base one past the end of the
|
|
195 superlocus.
|
|
196 Classification The match classification of the superlocus.
|
|
197 Reference The reference sequence.
|
|
198 AllelesA A semicolon-separated list of the alleles (one per
|
|
199 haplotype) for variant file A, for the phasing with the
|
|
200 best comparison result.
|
|
201 AllelesB A semicolon-separated list of the alleles (one per
|
|
202 haplotype) for variant file B, for the phasing with the
|
|
203 best comparison result.
|
|
204
|
|
205 The locus-output file contains, for each locus in file A and file B that is
|
|
206 not consistent with the reference, an annotated set of calls for the locus.
|
|
207 The calls are annotated with the following columns:
|
|
208
|
|
209 SuperlocusId The id of the superlocus containing the locus.
|
|
210 File The variant file (A or B).
|
|
211 LocusClassification The locus classification is determined by the
|
|
212 varType column of the call that is inconsistent
|
|
213 with the reference, concatenated with a
|
|
214 modifier that describes whether the locus is
|
|
215 heterozygous, homozygous, or contains no-calls.
|
|
216 If there is no one variant in the locus (i.e.,
|
|
217 it is heterozygous alt-alt), the locus
|
|
218 classification begins with "other".
|
|
219 LocusDiffClassification The match classification for the locus. This is
|
|
220 defined to be the best of the comparison of the
|
|
221 locus to the same region in the other file, or
|
|
222 the comparison of the superlocus.
|
|
223
|
|
224 The somatic output file contains a list of putative somatic variations of
|
|
225 genome A. The output includes only those loci that can be classified as
|
|
226 snp, del, ins or sub in file A, and are called reference in the file B.
|
|
227 Every locus is annotated with the following columns:
|
|
228
|
|
229 VarCvgA The totalReadCount from file A for this locus
|
|
230 (computed on the fly if file A is not a
|
|
231 masterVar file).
|
|
232 VarScoreA The varScoreVAF from file A, or varScoreEAF if
|
|
233 the "--diploid" option is used.
|
|
234 RefCvgB The maximum of the uniqueSequenceCoverage
|
|
235 values for the locus in genome B.
|
|
236 RefScoreB Minimum of the reference scores of the locus in
|
|
237 genome B.
|
|
238 SomaticCategory The category used for determining the
|
|
239 calibrated scores and the SomaticRank.
|
|
240 VarScoreACalib The calibrated variant score of file A, under
|
|
241 the model selected by using or not using the
|
|
242 "--diploid" option, and corrected for the count
|
|
243 of heterozygous variants observed in this
|
|
244 genome. See user guide for more information.
|
|
245 VarScoreBCalib The calibrated reference score of file B, under
|
|
246 the model selected by using or not using the
|
|
247 "--diploid" option, and corrected for the count
|
|
248 of heterozygous variants observed in this
|
|
249 genome. See user guide for more information.
|
|
250 SomaticRank The estimated rank of this somatic mutation,
|
|
251 amongst all true somatic mutations within this
|
|
252 SomaticCategory. The value is a number between
|
|
253 0 and 1; a value of 0.012 means, for example,
|
|
254 that an estimated 1.2% of the true somatic
|
|
255 mutations in this somaticCategory have a
|
|
256 somaticScore less than the somaticScore for
|
|
257 this mutation. See user guide for more
|
|
258 information.
|
|
259 SomaticScore An integer that provides a total order on
|
|
260 quality for all somatic mutations. It is equal
|
|
261 to -10*log10( P(false)/P(true) ), under the
|
|
262 assumption that this genome has a rate of
|
|
263 somatic mutation equal to 1/Mb for
|
|
264 SomaticCategory snp, 1/10Mb for SomaticCategory
|
|
265 ins, 1/10Mb for SomaticCategory del, and 1/20Mb
|
|
266 for SomaticCategory sub. The computation is
|
|
267 based on the assumptions described in the user
|
|
268 guide, and is affected by choice of variant
|
|
269 model selected by using or not using the
|
|
270 "--diploid" option.
|
|
271 SomaticQuality Equal to VQHIGH for all somatic mutations where
|
|
272 SomaticScore >= -10. Otherwise, this column is
|
|
273 empty.
|
|
274
|
|
275 OPTIONS
|
|
276 -h [ --help ]
|
|
277 Print this help message.
|
|
278
|
|
279 --reference arg
|
|
280 The input crr file.
|
|
281
|
|
282 --variantsA arg
|
|
283 The "A" input variant file.
|
|
284
|
|
285 --variantsB arg
|
|
286 The "B" input variant file.
|
|
287
|
|
288 --output-prefix arg
|
|
289 The path prefix for all output reports.
|
|
290
|
|
291 --reports arg (=SuperlocusOutput,SuperlocusStats,LocusOutput,LocusStats)
|
|
292 Comma-separated list of reports to generate. (Beware any reports whose
|
|
293 name begins with "Debug".) A report is one of:
|
|
294 SuperlocusOutput Report for superlocus classification.
|
|
295 SuperlocusStats Report for superlocus classification stats.
|
|
296 LocusOutput Report for locus classification.
|
|
297 LocusStats Report for locus stats.
|
|
298 VariantOutput Both variant files annotated by comparison
|
|
299 results. If the somatic output report is
|
|
300 requested, file A is also annotated with the
|
|
301 same score ranks as produced in that report.
|
|
302 SomaticOutput Report for the list of simple variations that
|
|
303 are present only in file "A", annotated with
|
|
304 the score that indicates the probability of
|
|
305 the variation being truly somatic. Requires
|
|
306 beta, genome-rootA, and genome-rootB options
|
|
307 to be provided as well. Note: generating this
|
|
308 report slows calldiff by 10x-20x.
|
|
309 DebugCallOutput Report for call classification.
|
|
310 DebugSuperlocusOutput Report for debug superlocus information.
|
|
311 DebugSomaticOutput Report for distribution estimates used for
|
|
312 somatic rescoring. Only produced if
|
|
313 SomaticOutput is also turned on.
|
|
314
|
|
315 --diploid
|
|
316 Uses varScoreEAF instead of varScoreVAF in somatic score computations.
|
|
317 Also, uses diploid variant model instead of variable allele mixture
|
|
318 model.
|
|
319
|
|
320 --locus-stats-column-count arg (=15)
|
|
321 The number of columns for locus compare classification in the locus
|
|
322 stats file.
|
|
323
|
|
324 --max-hypothesis-count arg (=32)
|
|
325 The maximum number of possible phasings to consider for a superlocus.
|
|
326
|
|
327 --no-reference-cover-validation
|
|
328 Turns off validation that all bases of a chromosome are covered by
|
|
329 calls of the variant file.
|
|
330
|
|
331 --genome-rootA arg
|
|
332 The "A" genome directory, for example /data/GS00118-DNA_A01; this
|
|
333 directory is expected to contain ASM/REF and ASM/EVIDENCE
|
|
334 subdirectories.
|
|
335
|
|
336 --genome-rootB arg
|
|
337 The "B" genome directory.
|
|
338
|
|
339 --calibration-root arg
|
|
340 The directory containing calibration data. For example, there should
|
|
341 exist a file calibration-root/0.0.0/metrics.tsv.
|
|
342
|
|
343 --beta
|
|
344 This flag enables the SomaticOutput report, which is beta
|
|
345 functionality.
|
|
346
|
|
347 SUPPORTED FORMAT_VERSION
|
|
348 0.3 or later
|
|
349 </help>
|
|
350 </tool>
|