Mercurial > repos > iuc > snpsift
comparison snpSift_extractFields.xml @ 5:09d6806c609e draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpsift/snpsift commit 70ff70918368ff0deeb596c2190a770abe9e1c9b
author | iuc |
---|---|
date | Wed, 18 Apr 2018 07:28:51 -0400 |
parents | 20c7d583fec1 |
children |
comparison
equal
deleted
inserted
replaced
4:b04635ebfab0 | 5:09d6806c609e |
---|---|
1 <tool id="snpSift_extractFields" name="SnpSift Extract Fields" version="@WRAPPER_VERSION@.0"> | 1 <tool id="snpSift_extractFields" name="SnpSift Extract Fields" version="@WRAPPER_VERSION@.galaxy0"> |
2 <options sanitize="False" /> | 2 <options sanitize="False" /> |
3 <description>from a VCF file into a tabular file</description> | 3 <description>from a VCF file into a tabular file</description> |
4 <macros> | 4 <macros> |
5 <import>snpSift_macros.xml</import> | 5 <import>snpSift_macros.xml</import> |
6 </macros> | 6 </macros> |
7 <expand macro="requirements" /> | 7 <expand macro="requirements" /> |
8 <expand macro="stdio" /> | 8 <expand macro="stdio" /> |
9 <expand macro="version_command" /> | 9 <expand macro="version_command" /> |
10 <command><![CDATA[ | 10 <command><![CDATA[ |
11 @CONDA_SNPSIFT_JAR_PATH@ && | 11 @CONDA_SNPSIFT_JAR_PATH@ && |
12 cat '$input' | 12 cat '${input}' |
13 #if $one_effect_per_line: | 13 #if $one_effect_per_line: |
14 | "\$SNPSIFT_JAR_PATH/scripts/vcfEffOnePerLine.pl" | 14 | perl "\$SNPSIFT_JAR_PATH/scripts/vcfEffOnePerLine.pl" |
15 #end if | 15 #end if |
16 | SnpSift -Xmx6G extractFields | 16 | SnpSift -Xmx6G extractFields |
17 #if $separator: | 17 #if $separator: |
18 -s '$separator' | 18 -s '${separator}' |
19 #end if | 19 #end if |
20 #if $empty_text: | 20 #if $empty_text: |
21 -e '$empty_text' | 21 -e '${empty_text}' |
22 #end if | 22 #end if |
23 - | 23 - |
24 #echo ' '.join(['"%s"' % x for x in $extract.split()]) | 24 #echo ' '.join(['"%s"' % x for x in $extract.split()]) |
25 > '$output' | 25 > '${output}' |
26 ]]></command> | 26 ]]></command> |
27 <inputs> | 27 <inputs> |
28 <param name="input" type="data" format="vcf" label="Variant input file in VCF format"/> | 28 <param name="input" type="data" format="vcf" label="Variant input file in VCF format"/> |
29 <param name="extract" type="text" label="Extract" help="Need help? See below a few examples." /> | 29 <param name="extract" type="text" label="Fields to extract" value="CHROM POS ID REF ALT FILTER" help="Separated by spaces. See help below for an explanation" /> |
30 <param name="one_effect_per_line" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="One effect per line" help="When variants have more than one effect, lists one effect per line, while all other parameters in the line are repeated across multiple lines" /> | 30 <param name="one_effect_per_line" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="One effect per line" help="When variants have more than one effect, lists one effect per line, while all other parameters in the line are repeated across multiple lines" /> |
31 <param name="separator" type="text" value="" label="multiple field separator" help="Separate multiple fields in one column with this character, e.g. a comma, rather than a column for each of the multiple values" /> | 31 <param name="separator" type="text" value="" label="multiple field separator" help="Separate multiple fields in one column with this character, e.g. a comma, rather than a column for each of the multiple values" argument="-s" /> |
32 <param name="empty_text" type="text" value="" label="empty field text" help="Represent empty fields with this value, rather than leaving them blank" /> | 32 <param name="empty_text" type="text" value="" label="empty field text" help="Represent empty fields with this value, rather than leaving them blank" argument="-e"/> |
33 </inputs> | 33 </inputs> |
34 <outputs> | 34 <outputs> |
35 <data name="output" format="tabular" /> | 35 <data name="output" format="tabular" /> |
36 </outputs> | 36 </outputs> |
37 <tests> | 37 <tests> |
38 <test> | 38 <test> |
39 <param name="input" ftype="vcf" value="test_rmInfo.vcf"/> | 39 <param name="input" ftype="vcf" value="test_rmInfo.vcf"/> |
40 <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/> | 40 <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/> |
41 <output name="output"> | 41 <output name="output"> |
42 <assert_contents> | 42 <assert_contents> |
43 <has_text text="INTRAGENIC" /> | 43 <has_text text="INTRAGENIC" /> |
44 <not_has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" /> | 44 <not_has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" /> |
45 </assert_contents> | 45 </assert_contents> |
46 </output> | 46 </output> |
47 </test> | 47 </test> |
48 | |
49 <test> | 48 <test> |
50 <param name="input" ftype="vcf" value="test_rmInfo.vcf"/> | 49 <param name="input" ftype="vcf" value="test_rmInfo.vcf"/> |
51 <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/> | 50 <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/> |
52 <param name="separator" value=","/> | 51 <param name="separator" value=","/> |
53 <output name="output"> | 52 <output name="output"> |
54 <assert_contents> | 53 <assert_contents> |
55 <has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" /> | 54 <has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" /> |
56 </assert_contents> | 55 </assert_contents> |
57 </output> | 56 </output> |
58 </test> | 57 </test> |
58 <test> | |
59 <param name="input" ftype="vcf" value="extFields_test3_in.vcf"/> | |
60 <param name="extract" value="CHROM POS ID REF ALT FILTER ANN[*].EFFECT"/> | |
61 <param name="one_effect_per_line" value="true"/> | |
62 <output name="output" value="extFields_test3_out.vcf"/> | |
63 </test> | |
59 </tests> | 64 </tests> |
60 <help><![CDATA[ | 65 <help><![CDATA[ |
61 **SnpSift Extract Fields** | 66 **What it does** |
62 | 67 |
63 Extract fields from a VCF file to a TXT, tab separated format, that you can easily load in R, XLS, etc. | 68 `SnpSift Extract Fields <http://snpeff.sourceforge.net/SnpSift.html#Extract>`_ selects columns from a VCF dataset into a Tab-delimited format. |
64 | 69 |
65 http://snpeff.sourceforge.net/SnpSift.html#Extract | 70 ------ |
66 | 71 |
67 You can also use sub-fields and genotype fields / sub-fields such as:: | 72 .. class:: infomark |
68 | 73 |
69 Standard VCF fields: | 74 **How to know which fields to extract?** |
70 CHROM | 75 |
71 POS | 76 A VCF dataset contains mandatory fields as well as optional fields. Mandatory fields are required by `VCF specifications <https://samtools.github.io/hts-specs/VCFv4.2.pdf>`_ and present in any valid VCF dataset. The **Fields to extract** input box of the tool above is already pre-filled with names of mandatory fields. |
72 ID | 77 |
73 REF | 78 To find out which other fields are available in a given VCF file, simply look at its header. The `INFO` and `FORMAT` lines contain descriptions of the existing fields. For example, if you see a line: |
74 ALT | 79 |
75 FILTER | 80 ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data"> |
76 INFO fields: | 81 |
77 AF | 82 you can use *NS* as the field name. |
78 AC | 83 |
79 DP | 84 ------ |
80 MQ | 85 |
81 etc. (any info field available) | 86 **Dealing with fields generated by SnpEff** |
82 SnpEff 'ANN' fields: | 87 |
83 "ANN[*].ALLELE" (alias GENOTYPE) | 88 The current version of `SnpEff <http://snpeff.sourceforge.net/SnpEff_manual.html>`_ produces so-called *ANN* fields:: |
84 "ANN[*].EFFECT" (alias ANNOTATION): Effect in Sequence ontology terms (e.g. 'missense_variant', 'synonymous_variant', 'stop_gained', etc.) | 89 |
85 "ANN[*].IMPACT" { HIGH, MODERATE, LOW, MODIFIER } | 90 "ANN[*].ALLELE" (alias GENOTYPE) |
86 "ANN[*].GENE" Gene name (e.g. 'PSD3') | 91 "ANN[*].EFFECT" (alias ANNOTATION): Effect in Sequence ontology terms (e.g. 'missense_variant', 'synonymous_variant', 'stop_gained', etc.) |
87 "ANN[*].GENEID" Gene ID | 92 "ANN[*].IMPACT" { HIGH, MODERATE, LOW, MODIFIER } |
88 "ANN[*].FEATURE" | 93 "ANN[*].GENE" Gene name (e.g. 'PSD3') |
89 "ANN[*].FEATUREID" (alias TRID: Transcript ID) | 94 "ANN[*].GENEID" Gene ID |
90 "ANN[*].BIOTYPE" Biotype, as described by the annotations (e.g. 'protein_coding') | 95 "ANN[*].FEATURE" |
91 "ANN[*].RANK" Exon or Intron rank (i.e. exon number in a transcript) | 96 "ANN[*].FEATUREID" (alias TRID: Transcript ID) |
92 "ANN[*].HGVS_C" (alias HGVS_DNA, CODON): Variant in HGVS (DNA) notation | 97 "ANN[*].BIOTYPE" Biotype, as described by the annotations (e.g. 'protein_coding') |
93 "ANN[*].HGVS_P" (alias HGVS, HGVS_PROT, AA): Variant in HGVS (protein) notation | 98 "ANN[*].RANK" Exon or Intron rank (i.e. exon number in a transcript) |
94 "ANN[*].CDNA_POS" (alias POS_CDNA) | 99 "ANN[*].HGVS_C" (alias HGVS_DNA, CODON): Variant in HGVS (DNA) notation |
95 "ANN[*].CDNA_LEN" (alias LEN_CDNA) | 100 "ANN[*].HGVS_P" (alias HGVS, HGVS_PROT, AA): Variant in HGVS (protein) notation |
96 "ANN[*].CDS_POS" (alias POS_CDS) | 101 "ANN[*].CDNA_POS" (alias POS_CDNA) |
97 "ANN[*].CDS_LEN" (alias LEN_CDS) | 102 "ANN[*].CDNA_LEN" (alias LEN_CDNA) |
98 "ANN[*].AA_POS" (alias POS_AA) | 103 "ANN[*].CDS_POS" (alias POS_CDS) |
99 "ANN[*].AA_LEN" (alias LEN_AA) | 104 "ANN[*].CDS_LEN" (alias LEN_CDS) |
100 "ANN[*].DISTANCE" | 105 "ANN[*].AA_POS" (alias POS_AA) |
101 "ANN[*].ERRORS" (alias WARNING, INFOS) | 106 "ANN[*].AA_LEN" (alias LEN_AA) |
102 SnpEff 'EFF' fields (this is for older SnpEff/SnpSift versions, new version use 'ANN' field): | 107 "ANN[*].DISTANCE" |
103 "EFF[*].EFFECT" | 108 "ANN[*].ERRORS" (alias WARNING, INFOS) |
104 "EFF[*].IMPACT" | 109 |
105 "EFF[*].FUNCLASS" | 110 Older versions produced *EFF* fields:: |
106 "EFF[*].CODON" | 111 |
107 "EFF[*].AA" | 112 "EFF[*].EFFECT" |
108 "EFF[*].AA_LEN" | 113 "EFF[*].IMPACT" |
109 "EFF[*].GENE" | 114 "EFF[*].FUNCLASS" |
110 "EFF[*].BIOTYPE" | 115 "EFF[*].CODON" |
111 "EFF[*].CODING" | 116 "EFF[*].AA" |
112 "EFF[*].TRID" | 117 "EFF[*].AA_LEN" |
113 "EFF[*].RANK" | 118 "EFF[*].GENE" |
114 SnpEff 'LOF' fields: | 119 "EFF[*].BIOTYPE" |
115 "LOF[*].GENE" | 120 "EFF[*].CODING" |
116 "LOF[*].GENEID" | 121 "EFF[*].TRID" |
117 "LOF[*].NUMTR" | 122 "EFF[*].RANK" |
118 "LOF[*].PERC" | 123 |
119 SnpEff' NMD' fields: | 124 In addition there are *LOF* and *NMD* fields:: |
120 "NMD[*].GENE" | 125 |
121 "NMD[*].GENEID" | 126 "LOF[*].GENE" |
122 "NMD[*].NUMTR" | 127 "LOF[*].GENEID" |
123 "NMD[*].PERC" | 128 "LOF[*].NUMTR" |
124 | 129 "LOF[*].PERC" |
125 Some examples: | 130 |
126 | 131 "NMD[*].GENE" |
127 - *Extracting chromosome, position, ID and allele frequency from a VCF file*: | 132 "NMD[*].GENEID" |
128 | 133 "NMD[*].NUMTR" |
129 **CHROM POS ID AF** | 134 "NMD[*].PERC" |
130 | 135 |
131 The result will look something like:: | 136 To find out whether your VCF contains *ANN* or *EFF* annotations, simply look at its header. |
132 | 137 |
133 #CHROM POS ID AF | 138 ----- |
134 1 69134 0.086 | 139 |
135 1 69496 rs150690004 0.001 | 140 **Usage examples** |
136 | 141 |
137 - *Extracting genotype fields*: | 142 *Extracting chromosome, position, ID and allele frequency from a VCF file*: |
138 | 143 |
139 **CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT** | 144 **CHROM POS ID AF** |
140 | 145 |
141 This means to extract: | 146 The result will look something like:: |
142 | 147 |
143 - CHROM POS ID: regular fields (as in the previous example) | 148 #CHROM POS ID AF |
144 - THETA : This one is from INFO | 149 1 69134 0.086 |
145 - GEN[0].GL[1] : Second likelihood from first genotype | 150 1 69496 rs150690004 0.001 |
146 - GEN[1].GL : The whole GL fiels (all entries without separating them) | 151 |
147 - GEN[3].GL[*] : All likelihoods form genotype 3 (this time they will be tab separated, as opposed to the previous one). | 152 *Extracting genotype fields*: |
148 - GEN[*].GT : Genotype subfields (GT) from ALL samples (tab separated). | 153 |
149 | 154 **CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT** |
150 The result will look something like:: | 155 |
151 | 156 This means to extract: |
152 #CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT | 157 |
153 1 10583 rs58108140 0.0046 -0.47 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|0 0|0 0|1 0|0 0|1 0|0 0|0 0|1 | 158 - CHROM POS ID: regular fields (as in the previous example) |
154 1 10611 rs189107123 0.0077 -0.48 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 | 159 - THETA : This one is from INFO |
155 1 13302 rs180734498 0.0048 -0.58 -2.45,-0.00,-5.00 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 1|0 0|0 0|1 0|0 | 160 - GEN[0].GL[1] : Second likelihood from first genotype |
156 | 161 - GEN[1].GL : The whole GL field (all entries without separating them) |
157 - *Extracting fields with multiple values*: | 162 - GEN[3].GL[*] : All likelihoods from genotype 3 (this time they will be tab separated, as opposed to the previous one). |
158 (notice that there are multiple effect columns per line because there are mutiple effects per variant) | 163 - GEN[*].GT : Genotype subfields (GT) from ALL samples (tab separated). |
159 | 164 |
160 **CHROM POS REF ALT ANN[*].EFFECT** | 165 The result will look something like:: |
161 | 166 |
162 The result will look something like:: | 167 #CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT |
163 | 168 1 10583 rs58108140 0.0046 -0.47 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|0 0|0 0|1 0|0 0|1 0|0 0|0 0|1 |
164 #CHROM POS REF ALT ANN[*].EFFECT | 169 1 10611 rs189107123 0.0077 -0.48 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 |
165 22 17071756 T C 3_prime_UTR_variant downstream_gene_variant | 170 1 13302 rs180734498 0.0048 -0.58 -2.45,-0.00,-5.00 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 1|0 0|0 0|1 0|0 |
166 22 17072035 C T missense_variant downstream_gene_variant | 171 |
167 22 17072258 C A missense_variant downstream_gene_variant | 172 *Extracting fields with multiple values*: |
168 | 173 (notice that there are multiple effect columns per line because there are multiple effects per variant) |
169 - *Extracting fields with multiple values using a comma as a multipe field separator:* | 174 |
170 | 175 **CHROM POS REF ALT ANN[*].EFFECT** |
171 **CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P** | 176 |
172 | 177 The result will look something like:: |
173 The result will look something like:: | 178 |
174 | 179 #CHROM POS REF ALT ANN[*].EFFECT |
175 #CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P | 180 22 17071756 T C 3_prime_UTR_variant downstream_gene_variant |
176 22 17071756 T C 3_prime_UTR_variant,downstream_gene_variant .,. | 181 22 17072035 C T missense_variant downstream_gene_variant |
177 22 17072035 C T missense_variant,downstream_gene_variant p.Gly469Glu,. | 182 22 17072258 C A missense_variant downstream_gene_variant |
178 22 17072258 C A missense_variant,downstream_gene_variant p.Gly395Cys,. | 183 |
179 | 184 *Extracting fields with multiple values using a comma as a multiple field separator:* |
180 - *Extracting fields with multiple values, one effect per line:* | 185 |
181 | 186 **CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P** |
182 **CHROM POS REF ALT ANN[*].EFFECT** | 187 |
183 | 188 The result will look something like:: |
184 The result will look something like:: | 189 |
185 | 190 #CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P |
186 #CHROM POS REF ALT ANN[*].EFFECT | 191 22 17071756 T C 3_prime_UTR_variant,downstream_gene_variant .,. |
187 22 17071756 T C 3_prime_UTR_variant | 192 22 17072035 C T missense_variant,downstream_gene_variant p.Gly469Glu,. |
188 22 17071756 T C downstream_gene_variant | 193 22 17072258 C A missense_variant,downstream_gene_variant p.Gly395Cys,. |
189 22 17072035 C T missense_variant | 194 |
190 22 17072035 C T downstream_gene_variant | 195 *Extracting fields with multiple values, one effect per line:* |
191 22 17072258 C A missense_variant | 196 |
192 22 17072258 C A downstream_gene_variant | 197 **CHROM POS REF ALT ANN[*].EFFECT** |
198 | |
199 The result will look something like:: | |
200 | |
201 #CHROM POS REF ALT ANN[*].EFFECT | |
202 22 17071756 T C 3_prime_UTR_variant | |
203 22 17071756 T C downstream_gene_variant | |
204 22 17072035 C T missense_variant | |
205 22 17072035 C T downstream_gene_variant | |
206 22 17072258 C A missense_variant | |
207 22 17072258 C A downstream_gene_variant | |
193 | 208 |
194 @EXTERNAL_DOCUMENTATION@ | 209 @EXTERNAL_DOCUMENTATION@ |
195 - http://snpeff.sourceforge.net/SnpSift.html#Extract | 210 - http://snpeff.sourceforge.net/SnpSift.html#Extract |
196 ]]></help> | 211 ]]></help> |
197 <expand macro="citations" /> | 212 <expand macro="citations" /> |