comparison tools/cgatools17/calldiff_v17.xml @ 1:3a2e0f376f26 draft

Minor change to tv2vcf.xml to allow for workflow automation
author dgdekoning
date Wed, 21 Oct 2015 10:09:15 -0400
parents
children
comparison
equal deleted inserted replaced
0:751b62d30ae1 1:3a2e0f376f26
1 <tool id="cg_calldiff" name="CallDiff" version="1.7.1">
2
3 <description>Compares two Complete Genomics variant files.</description>
4
5 <requirements> <!-- the cgatools executable is supplied by the cgatools17 package dependency, version 1 -->
6 <requirement type="package" version="1">cgatools17</requirement>
7 </requirements>
8
9 <command> <!-- prints the first line of cgatools' own output, then runs calldiff; the reports option receives the selected report names joined by commas (unselected reports are empty strings and vanish during shell word splitting) -->
10 cgatools | head -1;
11 cgatools calldiff
12 --beta
13 --reference ${crr.fields.crr_path}
14 --variantsA $inputA
15 --variantsB $inputB
16 $validation
17 $diploid
18 --locus-stats-column-count $column
19 --max-hypothesis-count $hypothesis
20 --output-prefix cg_
21 --reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} | tr -s ' ' ','`
22 </command>
23
24 <inputs>
25 <!-- Reference build selector: choices are read from the cg_anno_files data table, and the command line uses the chosen entry's crr_path field. -->
26 <param name="crr" type="select" label="Reference Build">
27 <options from_data_table="cg_anno_files" />
28 </param>
29
30 <!-- The two variant files to compare; each accepts cg_var or tabular datasets. -->
31 <param name="inputA" type="data" format="cg_var,tabular" label="Var file A"></param>
32 <param name="inputB" type="data" format="cg_var,tabular" label="Var file B"></param>
33
34 <!-- Report switches: the "yes" value is the report name that is placed in the comma-separated reports list on the command line, and "no" is an empty string. report5 lists "yes" first, unlike report1 to report4. -->
35 <param name="report1" type="select" label="Create report SuperlocusOutput">
36 <option value="">no</option>
37 <option value="SuperlocusOutput">yes</option>
38 </param>
39 <param name="report2" type="select" label="Create report SuperlocusStats">
40 <option value="">no</option>
41 <option value="SuperlocusStats">yes</option>
42 </param>
43 <param name="report3" type="select" label="Create report LocusOutput">
44 <option value="">no</option>
45 <option value="LocusOutput">yes</option>
46 </param>
47 <param name="report4" type="select" label="Create report LocusStats">
48 <option value="">no</option>
49 <option value="LocusStats">yes</option>
50 </param>
51 <param name="report5" type="select" label="Create report VariantOutput" help="Both variant files annotated by comparison results.If the somatic output report is requested, file A is also annotated with the same score ranks as produced in that report.">
52 <option value="VariantOutput">yes</option>
53 <option value="">no</option>
54 </param>
55
56 <!-- Remaining calldiff settings. The select values are written into the command line as-is, so the empty option contributes no flag. -->
57 <param name="diploid" type="select" label="Use diploid variant model" help="Uses varScoreEAF instead of varScoreVAF in somatic score computations. Also, uses diploid variant model instead of variable allele mixture model.">
58 <option value="">no</option>
59 <option value="--diploid">yes</option>
60 </param>
61
62 <param name="column" type="integer" label="Number of columns for locus compare classification in the locus stats file (default 15)" value="15"/>
63
64 <param name="hypothesis" type="integer" label="Maximum number of possible phasings to consider for a superlocus (default 32)" value="32"/>
65
66 <param name="validation" type="select" label="Reference cover validation" help="Turns on/off validation that all bases of a chromosome are covered by calls of the variant file.">
67 <option value="">on</option>
68 <option value="--no-reference-cover-validation">off</option>
69 </param>
70
71 <!-- Optional prefix that is prepended to every output label ($fname), so history items need not be renamed by hand. -->
72 <param name="fname" type="text" value="" label="Prefix for your output file" help="Optional"/>
73 </inputs>
74
75 <outputs> <!-- One tabular dataset per report, collected from the cg_ prefixed files in the working directory; each filter keeps the dataset only when its report was selected. VariantOutput yields two datasets (VariantsA and VariantsB). -->
76 <data format="tabular" name="output1" from_work_dir="cg_SuperlocusOutput.tsv" label="$fname ${tool.name} on ${on_string}: SuperlocusOutput">
77 <filter>(report1 == 'SuperlocusOutput')</filter>
78 </data>
79 <data format="tabular" name="output2" from_work_dir="cg_SuperlocusStats.tsv" label="$fname ${tool.name} on ${on_string}: SuperlocusStats">
80 <filter>(report2 == 'SuperlocusStats')</filter>
81 </data>
82 <data format="tabular" name="output3" from_work_dir="cg_LocusOutput.tsv" label="$fname ${tool.name} on ${on_string}: LocusOutput">
83 <filter>(report3 == 'LocusOutput')</filter>
84 </data>
85 <data format="tabular" name="output4" from_work_dir="cg_LocusStats.tsv" label="$fname ${tool.name} on ${on_string}: LocusStats">
86 <filter>(report4 == 'LocusStats')</filter>
87 </data>
88 <data format="tabular" name="output5a" from_work_dir="cg_VariantsA.tsv" label="$fname ${tool.name} on ${on_string}: VariantsA">
89 <filter>(report5 == 'VariantOutput')</filter>
90 </data>
91 <data format="tabular" name="output5b" from_work_dir="cg_VariantsB.tsv" label="$fname ${tool.name} on ${on_string}: VariantsB">
92 <filter>(report5 == 'VariantOutput')</filter>
93 </data>
94 </outputs>
95
96 <tests>
97 <test> <!-- compares two chr22 variant files against hg18, requesting only the SuperlocusOutput report -->
98 <param name="inputA" value="HCC1187_T_chr22.tsv" />
99 <param name="inputB" value="HCC1187_N_chr22.tsv" />
100 <param name="crr" value="hg18" />
101 <param name="report1" value="SuperlocusOutput" />
102 <param name="report2" value="" />
103 <param name="report3" value="" />
104 <param name="report4" value="" />
105 <param name="report5" value="" />
106 <param name="diploid" value="" />
107 <param name="column" value="15" />
108 <param name="hypothesis" value="32" />
109 <output name="output1" file="HCC1187_chr22_SuperLocusOutput.tsv" />
110 </test>
111 </tests>
112
113 <help>
114 **What it does**
115
116 This tool compares two Complete Genomics variant files.
117
118 **cgatools 1.7.1 Documentation**
119
120 Userguide: http://cgatools.sourceforge.net/docs/1.7.1/cgatools-user-guide.pdf
121
122 Release notes: http://cgatools.sourceforge.net/docs/1.7.1/cgatools-release-notes.pdf
123
124 **Command line reference**::
125
126 COMMAND NAME
127 calldiff - Compares two Complete Genomics variant files.
128
129 DESCRIPTION
130 Compares two Complete Genomics variant files. Divides the genome up into
131 superloci of nearby variants, then compares the superloci. Also refines the
132 comparison to determine per-call or per-locus comparison results.
133
134 Comparison results are usually described by a semi-colon separated string,
135 one per allele. Each allele's comparison result is one of the following
136 classifications:
137
138 ref-identical The alleles of the two variant files are identical, and
139 they are consistent with the reference.
140 alt-identical The alleles of the two variant files are identical, and
141 they are inconsistent with the reference.
142 ref-consistent The alleles of the two variant files are consistent,
143 and they are consistent with the reference.
144 alt-consistent The alleles of the two variant files are consistent,
145 and they are inconsistent with the reference.
146 onlyA The alleles of the two variant files are inconsistent,
147 and only file A is inconsistent with the reference.
148 onlyB The alleles of the two variant files are inconsistent,
149 and only file B is inconsistent with the reference.
150 mismatch The alleles of the two variant files are inconsistent,
151 and they are both inconsistent with the reference.
152 phase-mismatch The two variant files would be consistent if the
153 hapLink field had been empty, but they are
154 inconsistent.
155 ploidy-mismatch The superlocus did not have uniform ploidy.
156
157 In some contexts, this classification is rolled up into a simplified
158 classification, which is one of "identical", "consistent", "onlyA",
159 "onlyB", or "mismatch".
160
161 A good place to start looking at the results is the superlocus-output file.
162 It has columns defined as follows:
163
164 SuperlocusId An identifier given to the superlocus.
165 Chromosome The name of the chromosome.
166 Begin The 0-based offset of the start of the superlocus.
167 End The 0-based offset of the base one past the end of the
168 superlocus.
169 Classification The match classification of the superlocus.
170 Reference The reference sequence.
171 AllelesA A semicolon-separated list of the alleles (one per
172 haplotype) for variant file A, for the phasing with the
173 best comparison result.
174 AllelesB A semicolon-separated list of the alleles (one per
175 haplotype) for variant file B, for the phasing with the
176 best comparison result.
177
178 The locus-output file contains, for each locus in file A and file B that is
179 not consistent with the reference, an annotated set of calls for the locus.
180 The calls are annotated with the following columns:
181
182 SuperlocusId The id of the superlocus containing the locus.
183 File The variant file (A or B).
184 LocusClassification The locus classification is determined by the
185 varType column of the call that is inconsistent
186 with the reference, concatenated with a
187 modifier that describes whether the locus is
188 heterozygous, homozygous, or contains no-calls.
189 If there is no one variant in the locus (i.e.,
190 it is heterozygous alt-alt), the locus
191 classification begins with "other".
192 LocusDiffClassification The match classification for the locus. This is
193 defined to be the best of the comparison of the
194 locus to the same region in the other file, or
195 the comparison of the superlocus.
196
197 The somatic output file contains a list of putative somatic variations of
198 genome A. The output includes only those loci that can be classified as
199 snp, del, ins or sub in file A, and are called reference in the file B.
200 Every locus is annotated with the following columns:
201
202 VarCvgA The totalReadCount from file A for this locus
203 (computed on the fly if file A is not a
204 masterVar file).
205 VarScoreA The varScoreVAF from file A, or varScoreEAF if
206 the "--diploid" option is used.
207 RefCvgB The maximum of the uniqueSequenceCoverage
208 values for the locus in genome B.
209 RefScoreB Minimum of the reference scores of the locus in
210 genome B.
211 SomaticCategory The category used for determining the
212 calibrated scores and the SomaticRank.
213 VarScoreACalib The calibrated variant score of file A, under
214 the model selected by using or not using the
215 "--diploid" option, and corrected for the count
216 of heterozygous variants observed in this
217 genome. See user guide for more information.
218 VarScoreBCalib The calibrated reference score of file B, under
219 the model selected by using or not using the
220 "--diploid" option, and corrected for the count
221 of heterozygous variants observed in this
222 genome. See user guide for more information.
223 SomaticRank The estimated rank of this somatic mutation,
224 amongst all true somatic mutations within this
225 SomaticCategory. The value is a number between
226 0 and 1; a value of 0.012 means, for example,
227 that an estimated 1.2% of the true somatic
228 mutations in this somaticCategory have a
229 somaticScore less than the somaticScore for
230 this mutation. See user guide for more
231 information.
232 SomaticScore An integer that provides a total order on
233 quality for all somatic mutations. It is equal
234 to -10*log10( P(false)/P(true) ), under the
235 assumption that this genome has a rate of
236 somatic mutation equal to 1/Mb for
237 SomaticCategory snp, 1/10Mb for SomaticCategory
238 ins, 1/10Mb for SomaticCategory del, and 1/20Mb
239 for SomaticCategory sub. The computation is
240 based on the assumptions described in the user
241 guide, and is affected by choice of variant
242 model selected by using or not using the
243 "--diploid" option.
244 SomaticQuality Equal to VQHIGH for all somatic mutations where
245 SomaticScore &gt;= -10. Otherwise, this column is
246 empty.
247
248 OPTIONS
249 -h [ --help ]
250 Print this help message.
251
252 --reference arg
253 The input crr file.
254
255 --variantsA arg
256 The "A" input variant file.
257
258 --variantsB arg
259 The "B" input variant file.
260
261 --output-prefix arg
262 The path prefix for all output reports.
263
264 --reports arg (=SuperlocusOutput,SuperlocusStats,LocusOutput,LocusStats)
265 Comma-separated list of reports to generate. (Beware any reports whose
266 name begins with "Debug".) A report is one of:
267 SuperlocusOutput Report for superlocus classification.
268 SuperlocusStats Report for superlocus classification stats.
269 LocusOutput Report for locus classification.
270 LocusStats Report for locus stats.
271 VariantOutput Both variant files annotated by comparison
272 results. If the somatic output report is
273 requested, file A is also annotated with the
274 same score ranks as produced in that report.
275 SomaticOutput Report for the list of simple variations that
276 are present only in file "A", annotated with
277 the score that indicates the probability of
278 the variation being truly somatic. Requires
279 beta, genome-rootA, and genome-rootB options
280 to be provided as well. Note: generating this
281 report slows calldiff by 10x-20x.
282 DebugCallOutput Report for call classification.
283 DebugSuperlocusOutput Report for debug superlocus information.
284 DebugSomaticOutput Report for distribution estimates used for
285 somatic rescoring. Only produced if
286 SomaticOutput is also turned on.
287
288 --diploid
289 Uses varScoreEAF instead of varScoreVAF in somatic score computations.
290 Also, uses diploid variant model instead of variable allele mixture
291 model.
292
293 --locus-stats-column-count arg (=15)
294 The number of columns for locus compare classification in the locus
295 stats file.
296
297 --max-hypothesis-count arg (=32)
298 The maximum number of possible phasings to consider for a superlocus.
299
300 --no-reference-cover-validation
301 Turns off validation that all bases of a chromosome are covered by
302 calls of the variant file.
303
304 --genome-rootA arg
305 The "A" genome directory, for example /data/GS00118-DNA_A01; this
306 directory is expected to contain ASM/REF and ASM/EVIDENCE
307 subdirectories.
308
309 --genome-rootB arg
310 The "B" genome directory.
311
312 --calibration-root arg
313 The directory containing calibration data. For example, there should
314 exist a file calibration-root/0.0.0/metrics.tsv.
315
316 --beta
317 This flag enables the SomaticOutput report, which is beta
318 functionality.
319
320 SUPPORTED FORMAT_VERSION
321 0.3 or later
322 </help>
323 </tool>