comparison snpSift_dbnsfp.xml @ 0:0624d484adba draft

Uploaded
author iuc
date Thu, 22 Jan 2015 09:08:45 -0500
parents
children 1f4ee04c0841
comparison
equal deleted inserted replaced
-1:000000000000 0:0624d484adba
1 <tool id="snpSift_dbnsfp_generic" name="SnpSift dbNSFP" version="4.0.0">
2 <description>Add Annotations from dbNSFP and similar annotation DBs</description>
3 <expand macro="requirements" />
4 <macros>
5 <import>snpSift_macros.xml</import>
6 </macros>
7 <command>
8 ## Annotate the input VCF with SnpSift dbnsfp: -db comes from the chosen database source, -f from the checked annotations.
9 java -Xmx6G -jar \$SNPEFF_JAR_PATH/SnpSift.jar dbnsfp -v
10 #if $db.dbsrc == 'cached' :
11 -db "$db.dbnsfp"
12 #else :
13 ## History dataset: the file named by the bgzip metadata is read from the dataset extra files directory.
14 -db "${db.dbnsfpdb.extra_files_path}/${db.dbnsfpdb.metadata.bgzip}"
15 #end if
16 ## Both branches of the db conditional define an annotations parameter, so the -f option is emitted once here.
17 #if $db.annotations and $db.annotations.__str__ != '':
18 -f "$db.annotations"
19 #end if
20 "$input" > "$output"
21 2> tmp.err &amp;&amp; grep -v file tmp.err
22 </command>
23 <inputs> <!-- Tool form: the VCF to annotate, the dbNSFP database source, and the annotation columns to add -->
24 <param name="input" type="data" format="vcf" label="Variant input file in VCF format"/>
25 <conditional name="db"> <!-- dbsrc picks the database source: "cached" (data table entry) or "history" (dataset) -->
26 <param name="dbsrc" type="select" label="dbNSFP ">
27 <option value="cached">Locally installed dbNSFP database </option>
28 <option value="history">dbNSFP database from your history</option>
29 </param>
30 <when value="cached">
31 <param name="dbnsfp" type="select" label="Genome"> <!-- selected value is passed to -db in the command -->
32 <options from_data_table="snpsift_dbnsfp">
33 <column name="name" index="2"/>
34 <column name="value" index="3"/>
35 </options>
36 </param>
37 <param name="annotations" type="select" multiple="true" display="checkboxes" label="Annotate with"> <!-- checked values are passed to -f in the command -->
38 <options from_data_table="snpsift_dbnsfp">
39 <column name="name" index="3"/>
40 <column name="value" index="3"/>
41 <filter type="param_value" ref="dbnsfp" column="2" /> <!-- NOTE(review): filters on column 2, while the dbnsfp select takes its value from column 3; confirm the indexes against the snpsift_dbnsfp data table -->
42 <filter type="multiple_splitter" column="3" separator=","/> <!-- column 3 holds a comma-separated list that is split into individual options -->
43 </options>
44 </param>
45 </when>
46 <when value="history">
47 <param name="dbnsfpdb" type="data" format="snpsiftdbnsfp" label="DbNSFP"/>
48 <param name="annotations" type="select" multiple="true" display="checkboxes" label="Annotate with"> <!-- same parameter name as in the cached branch; checked values are passed to -f -->
49 <options>
50 <filter type="data_meta" ref="dbnsfpdb" key="annotation" /> <!-- options come from the "annotation" metadata key of the selected dbnsfpdb dataset -->
51 </options>
52 </param>
53 </when>
54 </conditional>
55 </inputs>
56 <expand macro="stdio" />
57 <outputs>
58 <data format="vcf" name="output" /> <!-- VCF dataset that receives the redirected standard output of the SnpSift command -->
59 </outputs>
60 <tests>
61 <test> <!-- History-database case: annotate a small VCF using an uploaded dbNSFP-style table -->
62 <param name="input" ftype="vcf" value="test_annotate_in.vcf.vcf"/> <!-- NOTE(review): doubled ".vcf.vcf" extension; confirm it matches the file name in test-data -->
63 <param name="dbsrc" value="history"/>
64 <param name="dbnsfpdb" value="test_dbnsfpdb.tabular" ftype="dbnsfp.tabular" />
65 <param name="annotations" value="aaref,aaalt,genename,aapos,SIFT_score"/>
66 <output name="output">
67 <assert_contents>
68 <has_text text="dbNSFP_SIFT_score=0.15" />
69 </assert_contents>
70 </output>
71 </test>
72 </tests>
73 <help>
74
75 The dbNSFP is an integrated database of functional predictions from multiple algorithms (SIFT, Polyphen2, LRT and MutationTaster, PhyloP and GERP++, etc.).
76 It contains variant annotations such as:
77
78
79 1000Gp1_AC
80 Alternative allele counts in the whole 1000 genomes phase 1 (1000Gp1) data
81 1000Gp1_AF
82 Alternative allele frequency in the whole 1000Gp1 data
83 1000Gp1_AFR_AC
84 Alternative allele counts in the 1000Gp1 African descendent samples
85 1000Gp1_AFR_AF
86 Alternative allele frequency in the 1000Gp1 African descendent samples
87 1000Gp1_AMR_AC
88 Alternative allele counts in the 1000Gp1 American descendent samples
89 1000Gp1_AMR_AF
90 Alternative allele frequency in the 1000Gp1 American descendent samples
91 1000Gp1_ASN_AC
92 Alternative allele counts in the 1000Gp1 Asian descendent samples
93 1000Gp1_ASN_AF
94 Alternative allele frequency in the 1000Gp1 Asian descendent samples
95 1000Gp1_EUR_AC
96 Alternative allele counts in the 1000Gp1 European descendent samples
97 1000Gp1_EUR_AF
98 Alternative allele frequency in the 1000Gp1 European descendent samples
99 aaalt
100 Alternative amino acid. "." if the variant is a splicing site SNP (2bp on each end of an intron)
101 aapos
102 Amino acid position as to the protein. "-1" if the variant is a splicing site SNP (2bp on each end of an intron)
103 aapos_SIFT
104 ENSP id and amino acid positions corresponding to SIFT scores. Multiple entries separated by ";"
105 aapos_FATHMM
106 ENSP id and amino acid positions corresponding to FATHMM scores. Multiple entries separated by ";"
107 aaref
108 Reference amino acid. "." if the variant is a splicing site SNP (2bp on each end of an intron)
109 alt
110 Alternative nucleotide allele (as on the + strand)
111 Ancestral_allele
112 Ancestral allele (based on 1000 genomes reference data)
113 cds_strand
114 Coding sequence (CDS) strand (+ or -)
115 chr
116 Chromosome number
117 codonpos
118 Position on the codon (1, 2 or 3)
119 Ensembl_geneid
120 Ensembl gene ID
121 Ensembl_transcriptid
122 Ensembl transcript IDs (separated by ";")
123 ESP6500_AA_AF
124 Alternative allele frequency in the African American samples of the NHLBI GO Exome Sequencing Project (ESP6500 data set)
125 ESP6500_EA_AF
126 Alternative allele frequency in the European American samples of the NHLBI GO Exome Sequencing Project (ESP6500 data set)
127 FATHMM_pred
128 If a FATHMM_score is &lt;=-1.5 (or rankscore &lt;=0.81415) the corresponding non-synonymous SNP is predicted as "D(AMAGING)"; otherwise it is predicted as "T(OLERATED)". Multiple predictions separated by ";"
129 FATHMM_rankscore
130 FATHMMori scores were ranked among all FATHMMori scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of FATHMMori scores in dbNSFP. If there are multiple scores, only the most damaging (largest) rankscore is presented. The scores range from 0 to 1
131 FATHMM_score
132 FATHMM default score (FATHMMori)
133 fold-degenerate
134 Degenerate type (0, 2 or 3)
135 genename
136 Gene name; if the non-synonymous SNP can be assigned to multiple genes, gene names are separated by ";"
137 GERP++_NR
138 GERP++ neutral rate
139 GERP++_RS
140 GERP++ RS score, the larger the score, the more conserved the site
141 GERP++_RS_rankscore
142 GERP++ RS scores were ranked among all GERP++ RS scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of GERP++ RS scores in dbNSFP
143 hg18_pos(1-coor)
144 Physical position on the chromosome as to hg18 (1-based coordinate)
145 Interpro_domain
146 Domain or conserved site on which the variant locates
147 LR_pred
148 Prediction of our LR based ensemble prediction score, "T(olerated)" or "D(amaging)". The score cutoff between "D" and "T" is 0.5. The rankscore cutoff between "D" and "T" is 0.82268
149 LR_rankscore
150 LR scores were ranked among all LR scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of LR scores in dbNSFP. The scores range from 0 to 1
151 LR_score
152 Our logistic regression (LR) based ensemble prediction score, which incorporated 10 scores (SIFT, PolyPhen-2 HDIV, PolyPhen-2 HVAR, GERP++, MutationTaster, Mutation Assessor, FATHMM, LRT, SiPhy, PhyloP) and the maximum frequency observed in the 1000 genomes populations. Larger value means the SNV is more likely to be damaging. Scores range from 0 to 1
153 LRT_Omega
154 Estimated nonsynonymous-to-synonymous-rate ratio (Omega, reported by LRT)
155 LRT_converted_rankscore
156 LRTori scores were first converted as LRTnew=1-LRTori*0.5 if Omega&lt;1, or LRTnew=LRTori*0.5 if Omega&gt;=1. Then LRTnew scores were ranked among all LRTnew scores in dbNSFP. The rankscore is the ratio of the rank over the total number of the scores in dbNSFP. The scores range from 0.00166 to 0.85682
157 LRT_pred
158 LRT prediction, D(eleterious), N(eutral) or U(nknown), which is not solely determined by the score
159 LRT_score
160 The original LRT two-sided p-value (LRTori), ranges from 0 to 1
161 MutationAssessor_pred
162 MutationAssessor's functional impact of a variant
163 MutationAssessor_rankscore
164 MAori scores were ranked among all MAori scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of MAori scores in dbNSFP. The scores range from 0 to 1
165 MutationAssessor_score
166 MutationAssessor functional impact combined score (MAori)
167 MutationTaster_converted_rankscore
168 The MTori scores were first converted: if the prediction is "A" or "D" MTnew=MTori; if the prediction is "N" or "P", MTnew=1-MTori. Then MTnew scores were ranked among all MTnew scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of MTnew scores in dbNSFP. The scores range from 0.0931 to 0.80722
169 MutationTaster_pred
170 MutationTaster prediction
171 MutationTaster_score
172 MutationTaster p-value (MTori), ranges from 0 to 1
173 phastCons46way_placental
174 phastCons conservation score based on the multiple alignments of 33 placental mammal genomes (including human). The larger the score, the more conserved the site
175 phastCons46way_placental_rankscore
176 phastCons46way_placental scores were ranked among all phastCons46way_placental scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of phastCons46way_placental scores in dbNSFP
177 phastCons46way_primate
178 phastCons conservation score based on the multiple alignments of 10 primate genomes (including human). The larger the score, the more conserved the site
179 phastCons46way_primate_rankscore
180 phastCons46way_primate scores were ranked among all phastCons46way_primate scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of phastCons46way_primate scores in dbNSFP
181 phastCons100way_vertebrate
182 phastCons conservation score based on the multiple alignments of 100 vertebrate genomes (including human). The larger the score, the more conserved the site
183 phastCons100way_vertebrate_rankscore
184 phastCons100way_vertebrate scores were ranked among all phastCons100way_vertebrate scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of phastCons100way_vertebrate scores in dbNSFP
185 phyloP46way_placental
186 phyloP (phylogenetic p-values) conservation score based on the multiple alignments of 33 placental mammal genomes (including human). The larger the score, the more conserved the site
187 phyloP46way_placental_rankscore
188 phyloP46way_placental scores were ranked among all phyloP46way_placental scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of phyloP46way_placental scores in dbNSFP
189 phyloP46way_primate
190 phyloP (phylogenetic p-values) conservation score based on the multiple alignments of 10 primate genomes (including human). The larger the score, the more conserved the site
191 phyloP46way_primate_rankscore
192 phyloP46way_primate scores were ranked among all phyloP46way_primate scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of phyloP46way_primate scores in dbNSFP
193 phyloP100way_vertebrate
194 phyloP (phylogenetic p-values) conservation score based on the multiple alignments of 100 vertebrate genomes (including human). The larger the score, the more conserved the site
195 phyloP100way_vertebrate_rankscore
196 phyloP100way_vertebrate scores were ranked among all phyloP100way_vertebrate scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of phyloP100way_vertebrate scores in dbNSFP
197 Polyphen2_HDIV_pred
198 Polyphen2 prediction based on HumDiv
199 Polyphen2_HDIV_rankscore
200 Polyphen2 HDIV scores were first ranked among all HDIV scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of the scores in dbNSFP. If there are multiple scores, only the most damaging (largest) rankscore is presented. The scores range from 0.02656 to 0.89917
201 Polyphen2_HDIV_score
202 Polyphen2 score based on HumDiv, i.e. hdiv_prob. The score ranges from 0 to 1. Multiple entries separated by ";"
203 Polyphen2_HVAR_pred
204 Polyphen2 prediction based on HumVar
205 Polyphen2_HVAR_rankscore
206 Polyphen2 HVAR scores were first ranked among all HVAR scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of the scores in dbNSFP. If there are multiple scores, only the most damaging (largest) rankscore is presented. The scores range from 0.01281 to 0.9711
207 Polyphen2_HVAR_score
208 Polyphen2 score based on HumVar, i.e. hvar_prob. The score ranges from 0 to 1. Multiple entries separated by ";"
209 pos(1-coor)
210 Physical position on the chromosome as to hg19 (1-based coordinate)
211 RadialSVM_pred
212 Prediction of our SVM based ensemble prediction score, "T(olerated)" or "D(amaging)". The score cutoff between "D" and "T" is 0. The rankscore cutoff between "D" and "T" is 0.83357
213 RadialSVM_rankscore
214 RadialSVM scores were ranked among all RadialSVM scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of RadialSVM scores in dbNSFP. The scores range from 0 to 1
215 RadialSVM_score
216 Our support vector machine (SVM) based ensemble prediction score, which incorporated 10 scores (SIFT, PolyPhen-2 HDIV, PolyPhen-2 HVAR, GERP++, MutationTaster, Mutation Assessor, FATHMM, LRT, SiPhy, PhyloP) and the maximum frequency observed in the 1000 genomes populations. Larger value means the SNV is more likely to be damaging. Scores range from -2 to 3 in dbNSFP
217 ref
218 Reference nucleotide allele (as on the + strand)
219 refcodon
220 Reference codon
221 Reliability_index
222 Number of observed component scores (except the maximum frequency in the 1000 genomes populations) for RadialSVM and LR. Ranges from 1 to 10. As RadialSVM and LR scores are calculated based on imputed data, the fewer missing component scores, the higher the reliability of the scores and predictions
223 SIFT_converted_rankscore
224 SIFTori scores were first converted to SIFTnew=1-SIFTori, then ranked among all SIFTnew scores in dbNSFP. The rankscore is the ratio of the rank of the SIFTnew score over the total number of SIFTnew scores in dbNSFP. If there are multiple scores, only the most damaging (largest) rankscore is presented. The rankscores range from 0.02654 to 0.87932
225 SIFT_pred
226 If SIFTori is smaller than 0.05 (rankscore&gt;0.55) the corresponding non-synonymous SNP is predicted as "D(amaging)"; otherwise it is predicted as "T(olerated)". Multiple predictions separated by ";"
227 SIFT_score
228 SIFT score (SIFTori). Scores range from 0 to 1. The smaller the score the more likely the SNP has damaging effect. Multiple scores separated by ";"
229 SiPhy_29way_logOdds
230 SiPhy score based on 29 mammals genomes. The larger the score, the more conserved the site
231 SiPhy_29way_pi
232 The estimated stationary distribution of A, C, G and T at the site, using SiPhy algorithm based on 29 mammals genomes
233 SLR_test_statistic
234 SLR test statistic for testing natural selection on codons. A negative value indicates negative selection, and a positive value indicates positive selection. Larger magnitude of the value suggests stronger evidence
235 Uniprot_aapos
236 Amino acid position as to Uniprot. Multiple entries separated by ";"
237 Uniprot_acc
238 Uniprot accession number. Multiple entries separated by ";"
239 Uniprot_id
240 Uniprot ID number. Multiple entries separated by ";"
241 UniSNP_ids
242 rs numbers from UniSNP, which is a cleaned version of dbSNP build 129, in format: rs number1;rs number2;...
243
244
245
246 The procedure for preparing the dbNSFP data for use in SnpSift dbnsfp is in the SnpSift documentation:
247 http://snpeff.sourceforge.net/SnpSift.html#dbNSFP
248
249 A couple of dbNSFP databases are prebuilt for SnpSift at:
250 http://sourceforge.net/projects/snpeff/files/databases/dbNSFP/
251
252
253
254
255 **Uploading Your Own Annotations for any Genome**
256
257 The website for dbNSFP databases releases is:
258 https://sites.google.com/site/jpopgen/dbNSFP
259
260 Note that it provides annotations only for the human hg18, hg19, and hg38 genome builds.
261
262 However, any dbNSFP-like tabular file can be used with SnpSift dbnsfp if it meets these requirements:
263
264 - The first line of the file must be column headers that name the annotations.
265 - The first 4 columns are required and must be:
266
267 1. #chr - chromosome
268 2. pos(1-coor) - position in chromosome
269 3. ref - reference base
270 4. alt - alternate base
271
272
273 For example:
274
275 ::
276
277 #chr pos(1-coor) ref alt aaref aaalt genename SIFT_score
278 4 100239319 T A H L ADH1B 0
279 4 100239319 T C H R ADH1B 0.15
280 4 100239319 T G H P ADH1B 0
281
282
283 The custom galaxy datatypes for dbNSFP can automatically convert the specially formatted tabular file for use by SnpSift dbNSFP:
284 1. Upload the tabular file, set the datatype as: **"dbnsfp.tabular"**
285 2. Edit the history dataset attributes (pencil icon): Use "Convert Format" to convert the **"dbnsfp.tabular"** to the correct format for SnpSift dbnsfp: **"snpsiftdbnsfp"**.
286
287 The procedure for preparing the dbNSFP data for use in SnpSift dbnsfp is in the SnpSift documentation.
288
289
290 @EXTERNAL_DOCUMENTATION@
291 http://snpeff.sourceforge.net/SnpSift.html#dbNSFP
292
293 @CITATION_SECTION@
294
295
296 </help>
297 </tool>