comparison snpSift_extractFields.xml @ 5:09d6806c609e draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpsift/snpsift commit 70ff70918368ff0deeb596c2190a770abe9e1c9b
author iuc
date Wed, 18 Apr 2018 07:28:51 -0400
parents 20c7d583fec1
children
comparison
equal deleted inserted replaced
4:b04635ebfab0 5:09d6806c609e
1 <tool id="snpSift_extractFields" name="SnpSift Extract Fields" version="@WRAPPER_VERSION@.0"> 1 <tool id="snpSift_extractFields" name="SnpSift Extract Fields" version="@WRAPPER_VERSION@.galaxy0">
2 <options sanitize="False" /> 2 <options sanitize="False" />
3 <description>from a VCF file into a tabular file</description> 3 <description>from a VCF file into a tabular file</description>
4 <macros> 4 <macros>
5 <import>snpSift_macros.xml</import> 5 <import>snpSift_macros.xml</import>
6 </macros> 6 </macros>
7 <expand macro="requirements" /> 7 <expand macro="requirements" />
8 <expand macro="stdio" /> 8 <expand macro="stdio" />
9 <expand macro="version_command" /> 9 <expand macro="version_command" />
10 <command><![CDATA[ 10 <command><![CDATA[
11 @CONDA_SNPSIFT_JAR_PATH@ && 11 @CONDA_SNPSIFT_JAR_PATH@ &&
12 cat '$input' 12 cat '${input}'
13 #if $one_effect_per_line: 13 #if $one_effect_per_line:
14 | "\$SNPSIFT_JAR_PATH/scripts/vcfEffOnePerLine.pl" 14 | perl "\$SNPSIFT_JAR_PATH/scripts/vcfEffOnePerLine.pl"
15 #end if 15 #end if
16 | SnpSift -Xmx6G extractFields 16 | SnpSift -Xmx6G extractFields
17 #if $separator: 17 #if $separator:
18 -s '$separator' 18 -s '${separator}'
19 #end if 19 #end if
20 #if $empty_text: 20 #if $empty_text:
21 -e '$empty_text' 21 -e '${empty_text}'
22 #end if 22 #end if
23 - 23 -
24 #echo ' '.join(['"%s"' % x for x in $extract.split()]) 24 #echo ' '.join(['"%s"' % x for x in $extract.split()])
25 > '$output' 25 > '${output}'
26 ]]></command> 26 ]]></command>
27 <inputs> 27 <inputs>
28 <param name="input" type="data" format="vcf" label="Variant input file in VCF format"/> 28 <param name="input" type="data" format="vcf" label="Variant input file in VCF format"/>
29 <param name="extract" type="text" label="Extract" help="Need help? See below a few examples." /> 29 <param name="extract" type="text" label="Fields to extract" value="CHROM POS ID REF ALT FILTER" help="Separated by spaces. See help below for an explanation" />
30 <param name="one_effect_per_line" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="One effect per line" help="When variants have more than one effect, lists one effect per line, while all other parameters in the line are repeated across multiple lines" /> 30 <param name="one_effect_per_line" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="One effect per line" help="When variants have more than one effect, lists one effect per line, while all other parameters in the line are repeated across multiple lines" />
31 <param name="separator" type="text" value="" label="multiple field separator" help="Separate multiple fields in one column with this character, e.g. a comma, rather than a column for each of the multiple values" /> 31 <param name="separator" type="text" value="" label="multiple field separator" help="Separate multiple fields in one column with this character, e.g. a comma, rather than a column for each of the multiple values" argument="-s" />
32 <param name="empty_text" type="text" value="" label="empty field text" help="Represent empty fields with this value, rather than leaving them blank" /> 32 <param name="empty_text" type="text" value="" label="empty field text" help="Represent empty fields with this value, rather than leaving them blank" argument="-e"/>
33 </inputs> 33 </inputs>
34 <outputs> 34 <outputs>
35 <data name="output" format="tabular" /> 35 <data name="output" format="tabular" />
36 </outputs> 36 </outputs>
37 <tests> 37 <tests>
38 <test> 38 <test>
39 <param name="input" ftype="vcf" value="test_rmInfo.vcf"/> 39 <param name="input" ftype="vcf" value="test_rmInfo.vcf"/>
40 <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/> 40 <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/>
41 <output name="output"> 41 <output name="output">
42 <assert_contents> 42 <assert_contents>
43 <has_text text="INTRAGENIC" /> 43 <has_text text="INTRAGENIC" />
44 <not_has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" /> 44 <not_has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" />
45 </assert_contents> 45 </assert_contents>
46 </output> 46 </output>
47 </test> 47 </test>
48
49 <test> 48 <test>
50 <param name="input" ftype="vcf" value="test_rmInfo.vcf"/> 49 <param name="input" ftype="vcf" value="test_rmInfo.vcf"/>
51 <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/> 50 <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/>
52 <param name="separator" value=","/> 51 <param name="separator" value=","/>
53 <output name="output"> 52 <output name="output">
54 <assert_contents> 53 <assert_contents>
55 <has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" /> 54 <has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" />
56 </assert_contents> 55 </assert_contents>
57 </output> 56 </output>
58 </test> 57 </test>
58 <test>
59 <param name="input" ftype="vcf" value="extFields_test3_in.vcf"/>
60 <param name="extract" value="CHROM POS ID REF ALT FILTER ANN[*].EFFECT"/>
61 <param name="one_effect_per_line" value="true"/>
62 <output name="output" value="extFields_test3_out.vcf"/>
63 </test>
59 </tests> 64 </tests>
60 <help><![CDATA[ 65 <help><![CDATA[
61 **SnpSift Extract Fields** 66 **What it does**
62 67
63 Extract fields from a VCF file to a TXT, tab separated format, that you can easily load in R, XLS, etc. 68 `SnpSift Extract Fields <http://snpeff.sourceforge.net/SnpSift.html#Extract>`_ selects columns from a VCF dataset into a Tab-delimited format.
64 69
65 http://snpeff.sourceforge.net/SnpSift.html#Extract 70 ------
66 71
67 You can also use sub-fields and genotype fields / sub-fields such as:: 72 .. class:: infomark
68 73
69 Standard VCF fields: 74 **How to know which fields to extract?**
70 CHROM 75
71 POS 76 A VCF dataset contains mandatory fields as well as optional fields. Mandatory fields are required by `VCF specifications <https://samtools.github.io/hts-specs/VCFv4.2.pdf>`_ and present in any valid VCF dataset. The **Fields to extract** input box of the tool above is already pre-filled with names of mandatory fields.
72 ID 77
73 REF 78 To know what other fields are available in a given VCF file simply look at its header. `INFO` and `FORMAT` lines will contain descriptions of existing fields. For example, if you see a line:
74 ALT 79
75 FILTER 80 ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
76 INFO fields: 81
77 AF 82 you can use *NS* as the field name.
78 AC 83
79 DP 84 ------
80 MQ 85
81 etc. (any info field available) 86 **Dealing with fields generated by SnpEff**
82 SnpEff 'ANN' fields: 87
83 "ANN[*].ALLELE" (alias GENOTYPE) 88 The current version of `SnpEff <http://snpeff.sourceforge.net/SnpEff_manual.html>`_ produces so-called *ANN* fields::
84 "ANN[*].EFFECT" (alias ANNOTATION): Effect in Sequence ontology terms (e.g. 'missense_variant', 'synonymous_variant', 'stop_gained', etc.) 89
85 "ANN[*].IMPACT" { HIGH, MODERATE, LOW, MODIFIER } 90 "ANN[*].ALLELE" (alias GENOTYPE)
86 "ANN[*].GENE" Gene name (e.g. 'PSD3') 91 "ANN[*].EFFECT" (alias ANNOTATION): Effect in Sequence ontology terms (e.g. 'missense_variant', 'synonymous_variant', 'stop_gained', etc.)
87 "ANN[*].GENEID" Gene ID 92 "ANN[*].IMPACT" { HIGH, MODERATE, LOW, MODIFIER }
88 "ANN[*].FEATURE" 93 "ANN[*].GENE" Gene name (e.g. 'PSD3')
89 "ANN[*].FEATUREID" (alias TRID: Transcript ID) 94 "ANN[*].GENEID" Gene ID
90 "ANN[*].BIOTYPE" Biotype, as described by the annotations (e.g. 'protein_coding') 95 "ANN[*].FEATURE"
91 "ANN[*].RANK" Exon or Intron rank (i.e. exon number in a transcript) 96 "ANN[*].FEATUREID" (alias TRID: Transcript ID)
92 "ANN[*].HGVS_C" (alias HGVS_DNA, CODON): Variant in HGVS (DNA) notation 97 "ANN[*].BIOTYPE" Biotype, as described by the annotations (e.g. 'protein_coding')
93 "ANN[*].HGVS_P" (alias HGVS, HGVS_PROT, AA): Variant in HGVS (protein) notation 98 "ANN[*].RANK" Exon or Intron rank (i.e. exon number in a transcript)
94 "ANN[*].CDNA_POS" (alias POS_CDNA) 99 "ANN[*].HGVS_C" (alias HGVS_DNA, CODON): Variant in HGVS (DNA) notation
95 "ANN[*].CDNA_LEN" (alias LEN_CDNA) 100 "ANN[*].HGVS_P" (alias HGVS, HGVS_PROT, AA): Variant in HGVS (protein) notation
96 "ANN[*].CDS_POS" (alias POS_CDS) 101 "ANN[*].CDNA_POS" (alias POS_CDNA)
97 "ANN[*].CDS_LEN" (alias LEN_CDS) 102 "ANN[*].CDNA_LEN" (alias LEN_CDNA)
98 "ANN[*].AA_POS" (alias POS_AA) 103 "ANN[*].CDS_POS" (alias POS_CDS)
99 "ANN[*].AA_LEN" (alias LEN_AA) 104 "ANN[*].CDS_LEN" (alias LEN_CDS)
100 "ANN[*].DISTANCE" 105 "ANN[*].AA_POS" (alias POS_AA)
101 "ANN[*].ERRORS" (alias WARNING, INFOS) 106 "ANN[*].AA_LEN" (alias LEN_AA)
102 SnpEff 'EFF' fields (this is for older SnpEff/SnpSift versions, new version use 'ANN' field): 107 "ANN[*].DISTANCE"
103 "EFF[*].EFFECT" 108 "ANN[*].ERRORS" (alias WARNING, INFOS)
104 "EFF[*].IMPACT" 109
105 "EFF[*].FUNCLASS" 110 Older versions produced *EFF* fields::
106 "EFF[*].CODON" 111
107 "EFF[*].AA" 112 "EFF[*].EFFECT"
108 "EFF[*].AA_LEN" 113 "EFF[*].IMPACT"
109 "EFF[*].GENE" 114 "EFF[*].FUNCLASS"
110 "EFF[*].BIOTYPE" 115 "EFF[*].CODON"
111 "EFF[*].CODING" 116 "EFF[*].AA"
112 "EFF[*].TRID" 117 "EFF[*].AA_LEN"
113 "EFF[*].RANK" 118 "EFF[*].GENE"
114 SnpEff 'LOF' fields: 119 "EFF[*].BIOTYPE"
115 "LOF[*].GENE" 120 "EFF[*].CODING"
116 "LOF[*].GENEID" 121 "EFF[*].TRID"
117 "LOF[*].NUMTR" 122 "EFF[*].RANK"
118 "LOF[*].PERC" 123
119 SnpEff' NMD' fields: 124 In addition there are *LOF* and *NMD* fields::
120 "NMD[*].GENE" 125
121 "NMD[*].GENEID" 126 "LOF[*].GENE"
122 "NMD[*].NUMTR" 127 "LOF[*].GENEID"
123 "NMD[*].PERC" 128 "LOF[*].NUMTR"
124 129 "LOF[*].PERC"
125 Some examples: 130
126 131 "NMD[*].GENE"
127 - *Extracting chromosome, position, ID and allele frequency from a VCF file*: 132 "NMD[*].GENEID"
128 133 "NMD[*].NUMTR"
129 **CHROM POS ID AF** 134 "NMD[*].PERC"
130 135
131 The result will look something like:: 136 To find out whether your VCF contains *ANN* or *EFF* annotations simply look at its header.
132 137
133 #CHROM POS ID AF 138 -----
134 1 69134 0.086 139
135 1 69496 rs150690004 0.001 140 **Usage examples**
136 141
137 - *Extracting genotype fields*: 142 *Extracting chromosome, position, ID and allele frequency from a VCF file*:
138 143
139 **CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT** 144 **CHROM POS ID AF**
140 145
141 This means to extract: 146 The result will look something like::
142 147
143 - CHROM POS ID: regular fields (as in the previous example) 148 #CHROM POS ID AF
144 - THETA : This one is from INFO 149 1 69134 0.086
145 - GEN[0].GL[1] : Second likelihood from first genotype 150 1 69496 rs150690004 0.001
146 - GEN[1].GL : The whole GL fiels (all entries without separating them) 151
147 - GEN[3].GL[*] : All likelihoods form genotype 3 (this time they will be tab separated, as opposed to the previous one). 152 *Extracting genotype fields*:
148 - GEN[*].GT : Genotype subfields (GT) from ALL samples (tab separated). 153
149 154 **CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT**
150 The result will look something like:: 155
151 156 This means to extract:
152 #CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT 157
153 1 10583 rs58108140 0.0046 -0.47 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|0 0|0 0|1 0|0 0|1 0|0 0|0 0|1 158 - CHROM POS ID: regular fields (as in the previous example)
154 1 10611 rs189107123 0.0077 -0.48 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 159 - THETA : This one is from INFO
155 1 13302 rs180734498 0.0048 -0.58 -2.45,-0.00,-5.00 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 1|0 0|0 0|1 0|0 160 - GEN[0].GL[1] : Second likelihood from first genotype
156 161 - GEN[1].GL : The whole GL field (all entries without separating them)
157 - *Extracting fields with multiple values*: 162 - GEN[3].GL[*] : All likelihoods from genotype 3 (this time they will be tab separated, as opposed to the previous one).
158 (notice that there are multiple effect columns per line because there are mutiple effects per variant) 163 - GEN[*].GT : Genotype subfields (GT) from ALL samples (tab separated).
159 164
160 **CHROM POS REF ALT ANN[*].EFFECT** 165 The result will look something like::
161 166
162 The result will look something like:: 167 #CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT
163 168 1 10583 rs58108140 0.0046 -0.47 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|0 0|0 0|1 0|0 0|1 0|0 0|0 0|1
164 #CHROM POS REF ALT ANN[*].EFFECT 169 1 10611 rs189107123 0.0077 -0.48 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0
165 22 17071756 T C 3_prime_UTR_variant downstream_gene_variant 170 1 13302 rs180734498 0.0048 -0.58 -2.45,-0.00,-5.00 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 1|0 0|0 0|1 0|0
166 22 17072035 C T missense_variant downstream_gene_variant 171
167 22 17072258 C A missense_variant downstream_gene_variant 172 *Extracting fields with multiple values*:
168 173 (notice that there are multiple effect columns per line because there are multiple effects per variant)
169 - *Extracting fields with multiple values using a comma as a multipe field separator:* 174
170 175 **CHROM POS REF ALT ANN[*].EFFECT**
171 **CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P** 176
172 177 The result will look something like::
173 The result will look something like:: 178
174 179 #CHROM POS REF ALT ANN[*].EFFECT
175 #CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P 180 22 17071756 T C 3_prime_UTR_variant downstream_gene_variant
176 22 17071756 T C 3_prime_UTR_variant,downstream_gene_variant .,. 181 22 17072035 C T missense_variant downstream_gene_variant
177 22 17072035 C T missense_variant,downstream_gene_variant p.Gly469Glu,. 182 22 17072258 C A missense_variant downstream_gene_variant
178 22 17072258 C A missense_variant,downstream_gene_variant p.Gly395Cys,. 183
179 184 *Extracting fields with multiple values using a comma as a multiple field separator:*
180 - *Extracting fields with multiple values, one effect per line:* 185
181 186 **CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P**
182 **CHROM POS REF ALT ANN[*].EFFECT** 187
183 188 The result will look something like::
184 The result will look something like:: 189
185 190 #CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P
186 #CHROM POS REF ALT ANN[*].EFFECT 191 22 17071756 T C 3_prime_UTR_variant,downstream_gene_variant .,.
187 22 17071756 T C 3_prime_UTR_variant 192 22 17072035 C T missense_variant,downstream_gene_variant p.Gly469Glu,.
188 22 17071756 T C downstream_gene_variant 193 22 17072258 C A missense_variant,downstream_gene_variant p.Gly395Cys,.
189 22 17072035 C T missense_variant 194
190 22 17072035 C T downstream_gene_variant 195 *Extracting fields with multiple values, one effect per line:*
191 22 17072258 C A missense_variant 196
192 22 17072258 C A downstream_gene_variant 197 **CHROM POS REF ALT ANN[*].EFFECT**
198
199 The result will look something like::
200
201 #CHROM POS REF ALT ANN[*].EFFECT
202 22 17071756 T C 3_prime_UTR_variant
203 22 17071756 T C downstream_gene_variant
204 22 17072035 C T missense_variant
205 22 17072035 C T downstream_gene_variant
206 22 17072258 C A missense_variant
207 22 17072258 C A downstream_gene_variant
193 208
194 @EXTERNAL_DOCUMENTATION@ 209 @EXTERNAL_DOCUMENTATION@
195 - http://snpeff.sourceforge.net/SnpSift.html#Extract 210 - http://snpeff.sourceforge.net/SnpSift.html#Extract
196 ]]></help> 211 ]]></help>
197 <expand macro="citations" /> 212 <expand macro="citations" />