Mercurial > repos > dereeper > sniplay
comparison egglib/egglib-2.1.5/include/egglib-cpp/NucleotideDiversity.hpp @ 3:345f88a8f483 draft
Uploaded
author | dereeper |
---|---|
date | Fri, 10 Jul 2015 10:38:43 -0400 |
parents | 420b57c3c185 |
children |
comparison
equal
deleted
inserted
replaced
2:feb40a9a8eae | 3:345f88a8f483 |
---|---|
1 /* | |
2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol | |
3 | |
4 This file is part of the EggLib library. | |
5 | |
6 EggLib is free software: you can redistribute it and/or modify | |
7 it under the terms of the GNU General Public License as published by | |
8 the Free Software Foundation, either version 3 of the License, or | |
9 (at your option) any later version. | |
10 | |
11 EggLib is distributed in the hope that it will be useful, | |
12 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 GNU General Public License for more details. | |
15 | |
16 You should have received a copy of the GNU General Public License | |
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>. | |
18 */ | |
19 | |
20 | |
21 #ifndef EGGLIB_NUCLEOTIDEDIVERSITY_HPP | |
22 #define EGGLIB_NUCLEOTIDEDIVERSITY_HPP | |
23 | |
24 | |
25 #include "BaseDiversity.hpp" | |
26 #include <string> | |
27 #include <vector> | |
28 | |
29 | |
30 | |
31 namespace egglib { | |
32 | |
33 | |
34 /** \brief Performs analyzes of population genetics | |
35 * | |
36 * \ingroup polymorphism | |
37 * | |
38 * This class computes several summary statistics based on | |
39 * nucleotide analysis. Note that it is possible to use the same | |
40 * object to analyze different data set. Calling the load() method | |
41 * erases all data preivously computed (if any). Calling the load() | |
42 * method is absolutely required to compute any statistics. Some | |
43 * statistics are not computed by default, but are if the | |
44 * corresponding accessor is used (only load() is required). | |
45 * | |
46 * Note that "unsecure" accessors don't perform out-of-bound checks. | |
47 * | |
48 * S is the number of varying sites (only in sites that were not | |
49 * rejected). | |
50 * | |
51 * eta is the minimum number of mutations, that is the sum of the | |
52 * number of alleles minus 1 for each varying site. eta = S if all | |
53 * sites have no variant or 2 alleles. eta is computed independently | |
54 * of the option multiple and IS NOT computed over lseff sites. | |
55 * | |
56 * Pi is the average number of pairwise differences between sequences | |
57 * (expressed here per site) or (as computed here) the mean per site | |
58 * (unbiased) heterozygosity. Pi is zero if no polymorphic sites. | |
59 * | |
60 * D is the Tajima's test of neutrality | |
61 * Ref. Tajima F.: Statistical method for testing the neutral | |
62 * mutation hypothesis by DNA polymorphism. Genetics 1989, 123:585-595. | |
63 * It is arbitrary set to 0 if no polymorphic sites. | |
64 * | |
65 * tW: thetaW: estimator of theta based on polymorphic sites (ref. | |
66 * e.g. Watterson 1975 Theor. Pop. Biol.). | |
67 * Both D and thetaW are computed assuming that rounded nseff samples | |
68 * have been sampled. | |
69 * The variance of D is computed using rounded nseff instead of ns. | |
70 * | |
71 * H is the Fay and Wu's test of neutrality. | |
72 * Z is the standardized version and E a similar test. | |
73 * Ref. Fay J. C., Wu C.-I.: Hitchhiking under positive Darwinian | |
74 * selection. Genetics 2000, 155:1405-1413. and Zeng K., Fu Y. X., | |
75 * Shi S., Wu C.-I.: Statistical tests for detecting positive | |
76 * selection by utilizing high-frequency variants. Genetics 2006, | |
77 * 174:1431-9. Both are arbitrary set to 0 if no polymorphic or | |
78 * orientable sites. | |
79 * | |
80 * tH and tL: theta H: estimators of theta based on derived | |
81 * polymorphic sites (ref in Fay and Wu and Zeng al.). The variance | |
82 * of H/Z are computed assuming that rounded nseff samples have | |
83 * been sampled. | |
84 * | |
85 */ | |
86 class NucleotideDiversity : public BaseDiversity { | |
87 | |
88 public: | |
89 | |
90 /** \brief Builds an object | |
91 * | |
92 */ | |
93 NucleotideDiversity(); | |
94 | |
95 | |
96 /** \brief Destroys an object | |
97 * | |
98 */ | |
99 virtual ~NucleotideDiversity(); | |
100 | |
101 | |
102 /** \brief Identifies polymorphic sites and computes basis | |
103 * statistics | |
104 * | |
105 * \param data an alignment object (subclass of CharMatrix). | |
106 * The presence of outgroup or of different populations will | |
107 * be detected based on the populationLabel members of the | |
108 * passed object. The populationLabel 999 will be interpreted | |
109 * as outgroups. If several outgroups are passed, sites were | |
110 * the outgroups are not consistent will be treated as "non- | |
111 * orientable". | |
112 * | |
113 * \param allowMultipleMutations if true, sites with more | |
114 * than two alleles will not be ignored. The sum of the | |
115 * frequencies of all alleles not matching the outgroup will | |
116 * treated as the derived allele frequency (for orientable | |
117 * sites). | |
118 * | |
119 * \param minimumExploitableData sites where the non-missing | |
120 * data (as defined by characterMapping) are at a frequency | |
121 * larger than this value will be removed from the analysis. | |
122 * Use 1. to take only 'complete' sites into account and 0. | |
123 * to use all sites. (The outgroup is not considered in this | |
124 * computation.) | |
125 * | |
126 * \param ignoreFrequency removes sites that are polymorph | |
127 * because of an allele at absolute frequency smaller than or | |
128 * equal to this value. If ignoreFrequency=1, no sites are | |
129 * removed, if ignoreFrequency=0, singleton sites are | |
130 * ignored. Such sites are completely removed from the | |
131 * analysis (not counted in lseff). Note that if more than | |
132 * one mutation is allowed, the site is removed only if all | |
133 * the alleles but one are smaller than or equal to this | |
134 * value. For example, an alignment column AAAAAAGAAT is | |
135 * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is | |
136 * conserved (including the third allele T which is a | |
137 * singleton). | |
138 * | |
139 * \param characterMapping a string giving the list of | |
140 * characters that should be considered as valid data. If a | |
141 * space is present in the string, the characters left of the | |
142 * space will be treated as valid data and the characters | |
143 * right of the space will be treated as missing data, that | |
144 * is tolerated but ignored. All characters not in the string | |
145 * will cause an EggInvalidCharacterError to be raised. | |
146 * | |
147 * \param useZeroAsAncestral if true, all outgroups (if | |
148 * present) will be ignored and the character "0" will be | |
149 * considered as ancestral for all sites, whatever the | |
150 * character mapping. | |
151 * | |
152 */ | |
153 virtual void load( | |
154 CharMatrix& data, | |
155 bool allowMultipleMutations=false, | |
156 double minimumExploitableData=1., | |
157 unsigned int ignoreFrequency=0, | |
158 std::string characterMapping=dnaMapping, | |
159 bool useZeroAsAncestral=false | |
160 ); | |
161 | |
162 | |
163 // accessors for the "site analysis" section | |
164 | |
165 /// Number of polymorphic sites | |
166 unsigned int S() const; | |
167 | |
168 /// Number of polymorphic orientable sites | |
169 unsigned int So() const; | |
170 | |
171 /// Minimum number of mutations | |
172 unsigned int eta() const; | |
173 | |
174 /// Average of per-site number of sequences effectively used | |
175 double nseff() const; | |
176 | |
177 /// Number of sites effectively used | |
178 unsigned int lseff() const; | |
179 | |
180 /// Average of number of sequences effectively used at orientable sites | |
181 double nseffo() const; | |
182 | |
183 /// Number of orientable sites | |
184 unsigned int lseffo() const; | |
185 | |
186 /// Number of detected populations | |
187 unsigned int npop() const; | |
188 | |
189 /// Label of the population with given index (unsecure) | |
190 unsigned int popLabel(unsigned int popIndex) const; // no check! | |
191 | |
192 | |
193 // accessors for the "diversity" section | |
194 | |
195 /// Nucleotide diversity | |
196 double Pi(); | |
197 | |
198 /// Watterson estimator of theta | |
199 double thetaW(); | |
200 | |
201 /// Average of Pi over populations | |
202 double average_Pi(); | |
203 | |
204 /// Pi of a given population (unsecure) | |
205 double pop_Pi(unsigned int popIndex); // no check! | |
206 | |
207 /// Tajima's D | |
208 double D(); | |
209 | |
210 // accessors for the "outgroup diversity" section | |
211 | |
212 /// Fay and Wu estimator of theta | |
213 double thetaH(); | |
214 | |
215 /// Zeng et al. estimator of theta | |
216 double thetaL(); | |
217 | |
218 /// Fay and Wu's H | |
219 double H(); | |
220 | |
221 /// Standardized H | |
222 double Z(); | |
223 | |
224 /// Zeng et al.'s E | |
225 double E(); | |
226 | |
227 // accessors for the "differentiation" section | |
228 | |
229 /// Number of sites with at least one fixed difference | |
230 unsigned int FixedDifferences(); | |
231 | |
232 /// Number of sites with at least one allele shared among at least two populations | |
233 unsigned int CommonAlleles(); | |
234 | |
235 /// Number of sites with at least one non-fixed allele shared among at least two populations | |
236 unsigned int SharedAlleles(); | |
237 | |
238 /// Number of sites with at least one allele specific to one population | |
239 unsigned int SpecificAlleles(); | |
240 | |
241 /// Number of sites with at least one derived allele specific to one population | |
242 unsigned int SpecificDerivedAlleles(); | |
243 | |
244 /// Number of polymorphisms in a given population (unsecure) | |
245 unsigned int Polymorphisms(unsigned int pop); | |
246 | |
247 /// Number of specific alleles for a given population (unsecure) | |
248 unsigned int SpecificAlleles(unsigned int pop); | |
249 | |
250 /// Number of specific derived allele for a given population (unsecure) | |
251 unsigned int SpecificDerivedAlleles(unsigned int pop); | |
252 | |
253 /// Number of fixed differences between a given pair of populations (unsecure; pop2 must be larger than pop1) | |
254 unsigned int FixedDifferences(unsigned int pop1, unsigned int pop2); | |
255 | |
256 /// Number of common alleles between a given pair of populations (unsecure; pop2 must be larger than pop1) | |
257 unsigned int CommonAlleles(unsigned int pop1, unsigned int pop2); | |
258 | |
259 /// Number of shared non-fixed alleles between a given pair of populations (unsecure; pop2 must be larger than pop1) | |
260 unsigned int SharedAlleles(unsigned int pop1, unsigned int pop2); | |
261 | |
262 | |
263 // accessor for the "triConfigurations" section | |
264 | |
265 /** \brief Number falling into one of the possible site configurations | |
266 * | |
267 * The statistics are limited to three populations. | |
268 * Assuming an unrooted A/G polymorphism (A and G can be | |
269 * substitued), the site configurations are: | |
270 * - 0: A&G A A specific 1 | |
271 * - 1: A&G A G specific 1 + fixed 2-3 | |
272 * - 2: A A&G A specific 2 | |
273 * - 3: A A&G G specific 2 + fixed 1-3 | |
274 * - 4: A A A&G specific 3 | |
275 * - 5: A G A&G specific 3 + fixed 1-2 | |
276 * - 6: A&G A&G A shared 1-2 | |
277 * - 7: A&G A A&G shared 1-3 | |
278 * - 8: A A&G A&G shared 2-3 | |
279 * - 9: A&G A&G A&G shared 1-2-3 | |
280 * - 10: A G G fixed 1 | |
281 * - 11: A G A fixed 2 | |
282 * - 12: A A G fixed 3 | |
283 * | |
284 * \param index must be an index from 0 to 12. | |
285 * | |
286 */ | |
287 unsigned int triConfiguration(unsigned int index); | |
288 | |
289 | |
290 /// Builds and returns the vector of positions of all polymorphic sites | |
291 std::vector<unsigned int> polymorphic_positions() const; | |
292 | |
293 | |
294 /** \brief Builds and returns the vector of positions of all singleton sites | |
295 * | |
296 * A site singleton when it is polymorphic according to | |
297 * parameter of the diversity analysis, when it has exactly two | |
298 * alleles and one of them is at absolute frequency 1 (one | |
299 * copy) disregarding the outgroup. | |
300 * | |
301 */ | |
302 std::vector<unsigned int> singleton_positions() const; | |
303 | |
304 | |
305 protected: | |
306 | |
307 /** \brief This class cannot be copied | |
308 * | |
309 */ | |
310 NucleotideDiversity(const NucleotideDiversity& source) { } | |
311 | |
312 | |
313 /** \brief This class cannot be copied | |
314 * | |
315 */ | |
316 NucleotideDiversity& operator=(const NucleotideDiversity& source) { return *this; } | |
317 | |
318 | |
319 void init(); // initializes values | |
320 void clear(); // free memory but doesn't initializes | |
321 | |
322 // diversity (without outgroup) | |
323 void diversity(); | |
324 | |
325 // diversity with outgroup | |
326 void outgroupDiversity(); | |
327 | |
328 // site patterns | |
329 void differentiation(); | |
330 | |
331 // triconfigurations | |
332 void triConfigurations(); | |
333 | |
334 | |
335 // holders for statistics, with booleans flagging groups of stats | |
336 | |
337 bool b_analysisSites; | |
338 | |
339 bool b_diversity; | |
340 | |
341 double v_Pi; // nucleotide diversity | |
342 double v_thetaW; // theta (Watterson estimator) | |
343 double v_average_Pi; // average diversity across populations | |
344 double *v_pop_Pi; // diversity per population | |
345 double v_D; // Tajima's D | |
346 | |
347 bool b_outgroupDiversity; | |
348 | |
349 double v_thetaH; // theta (Fay and Wu estimator) | |
350 double v_thetaL; // theta (Zeng estimator) | |
351 double v_H; // Fay and Wu's H | |
352 double v_Z; // normalized Fay and Wu's H | |
353 double v_E; // Zeng et al.'s E | |
354 | |
355 bool b_differentiation; | |
356 | |
357 unsigned int *v_pairwiseFixedDifferences; | |
358 unsigned int *v_pairwiseCommonAlleles; | |
359 unsigned int *v_pairwiseSharedAlleles; | |
360 unsigned int *v_popPolymorphic; | |
361 unsigned int *v_popSpecific; | |
362 unsigned int *v_popSpecificDerived; | |
363 unsigned int v_countFixedDifferences; | |
364 unsigned int v_countCommonAlleles; | |
365 unsigned int v_countSharedAlleles; | |
366 unsigned int v_countSpecificAlleles; | |
367 unsigned int v_countSpecificDerivedAlleles; | |
368 | |
369 | |
370 bool b_triConfigurations; | |
371 | |
372 unsigned int *v_triConfigurations; | |
373 | |
374 }; | |
375 } | |
376 | |
377 #endif |