<tool id="vcf_filter" name="VCF Filter" version="0.1.7.3">
  <description>Extracts lines from a vcf variant file based on field-specific filters</description>

  <!-- Shared macro definitions; the "requirements" macro expanded below is
       presumably defined in the imported file (not visible here). -->
  <macros>
    <import>toolshed_macros.xml</import>
  </macros>
  <expand macro="requirements" />

  <version_command>python3 -m MiModD version -q</version_command>

  <!-- Cheetah template that assembles the MiModD vcf-filter command line.
       Wrapped in CDATA so that the template text never needs XML escaping. -->
  <command><![CDATA[
    python3 -m MiModD vcf-filter
    "$inputfile"
    -o "$outputfile"

    ## Sample-specific filters: every option below receives one value per
    ## configured filter, always in the order of the repeat blocks, so the
    ## n-th value of each option belongs to the n-th sample name.
    #if len($datasets):
      -s
      #for $i in $datasets
        "$i.sample"
      #end for
      --gt
      #for $i in $datasets
        ## remove whitespace from free-text input; an empty pattern becomes ANY
        "#echo ("".join($i.GT.split()) or "ANY")#"
        #echo " "
      #end for
      --dp
      #for $i in $datasets
        "$i.DP"
      #end for
      --gq
      #for $i in $datasets
        "$i.GQ"
      #end for
      --af
      #for $i in $datasets
        ## an empty allelic fraction filter is passed on as ::
        "#echo ($i.AF or "::")#"
      #end for
    #end if

    ## Region filters: emitted as chrom:start-stop, or as chrom:start when no
    ## region end was entered.
    #if len($regions):
      -r
      #for $i in $regions
        #if $i.stop:
          "$i.chrom:$i.start-$i.stop"
        #else:
          "$i.chrom:$i.start"
        #end if
      #end for
    #end if

    #if $vfilter:
      --vfilter
      ## split the comma-separated sample names and emit each one as its own
      ## double-quoted argument
      "#echo ('" "'.join($vfilter.split(',')))#"
    #end if

    ## value of the selected variant type option (empty for all types)
    $vartype
  ]]></command>

  <inputs>
    <param format="vcf" label="VCF input file" name="inputfile" type="data" />

    <!-- Zero or more per-sample filters; all fields of one repeat instance
         refer to the sample named in its "sample" field. -->
    <repeat default="0" min="0" name="datasets" title="Sample-specific Filter">
      <param help="name of a sample as it appears in the VCF input file and that indicates the sample that this filter should be applied to." label="sample" name="sample" type="text" />
      <param help="keep only variants for which the genotype of the sample matches the specified pattern; format: x/x where x = 0 is wildtype and x = 1 is mutant. Multiple genotypes can be specified as a comma-separated list." label="genotype pattern(s) for the inclusion of variants" name="GT" type="text" />
      <param help="keep only variants with at least this sample-specific coverage at the variant site" label="depth of coverage for the sample at the variant site" name="DP" type="integer" value="0" />
      <param help="keep only variants for which the genotype prediction for the sample has at least this quality" label="genotype quality for the variant in the sample" name="GQ" type="integer" value="0" />
      <param help="expected format: [allele number]:[minimal fraction]:[maximal fraction]; keep only variants for which the fraction of sample-specific reads supporting a given allele number is between minimal and maximal fraction; if allele number is omitted, the filter operates on the most frequent non-reference allele instead" label="allelic fraction filter" name="AF" type="text" />
    </repeat>

    <!-- Zero or more genomic regions; "Region End" may be left empty. -->
    <repeat default="0" help="Filter variant sites by their position in the genome. If multiple Region Filters are specified, all variants that fall in ONE of the regions are reported." min="0" name="regions" title="Region Filter">
      <param label="Chromosome" name="chrom" type="text" />
      <param label="Region Start" name="start" type="text" />
      <param label="Region End" name="stop" type="text" />
    </repeat>

    <!-- The option values are inserted verbatim into the command line. -->
    <param label="Select the types of variants to include in the output" name="vartype" type="select">
      <option value="">all types of variants</option>
      <option value="--no-indels">exclude indels</option>
      <option value="--indels-only">only indels</option>
    </param>

    <param help="Filter output by sample name; only the sample-specific columns with their sample name matching any of the comma separated filters will be retained in the output." label="sample" name="vfilter" type="text" />
  </inputs>

  <outputs>
    <data format="vcf" name="outputfile" />
  </outputs>

  <help><![CDATA[
.. class:: infomark

**What it does**

The tool filters a variant file in VCF format to generate a new VCF file with only a subset of the original variants.

The following types of variant filters can be set up:

1) Sample-specific filters:

Filter variants based on their characteristics in the sequenced reads of a specific sample. Multiple sample-specific filters are combined by logical AND, i.e., only variants that pass ALL sample-specific filters are kept.

2) Region filters:

Filter variants based on the genomic region they affect. Multiple region filters are combined by logical OR, i.e., variants passing ANY region filter are kept.

3) Variant type filter:

Filter variants by their type, i.e. whether they are single nucleotide variations (SNVs) or indels.

In addition, the *sample* filter can be used to reduce the samples encoded in a multi-sample VCF file to just those specified by the filter.
The *sample* filter is included mainly for compatibility reasons: if an external tool cannot deal with the multisample file format, but instead looks only at the first sample-specific column of the file, you can use the filter to turn the multi-sample file into a single-sample file. Besides, the filter can also be used to change the order of the samples since it will sort the samples in the order specified in the filter field.

**Examples of sample-specific filters:**

*Simple genotype pattern*

genotype pattern: 1/1 ==> keep all variants in the vcf input file for which the specified sample's genotype is homozygous mutant

*Complex genotype pattern*

genotype pattern: 0/1, 0/0 ==> keep all variants for which the sample's genotype is either heterozygous or homozygous wildtype

*Multiple sample-specific filters*

Filter 1: genotype pattern: 0/0, Filter 2: genotype pattern 1/1:
==> keep all variants for which the first sample's genotype is homozygous wildtype **and** the second sample's genotype is homozygous mutant

*Combining sample-specific filter criteria*

genotype pattern: 1/1, depth of coverage: 3, genotype quality: 9
==> keep variants for which the sample's genotype is homozygous mutant **and** for which this genotype assignment is corroborated by a genotype quality score of at least 9
**and** at least three reads from the sample cover the variant site

**TIP:**

As in the example above, genotype quality is typically most useful in combination with a genotype pattern.
It acts then, effectively, to make the genotype filter more stringent.



  ]]></help>
</tool>