comparison egglib/egglib-2.1.5/include/egglib-cpp/NucleotideDiversity.hpp @ 3:345f88a8f483 draft

Uploaded
author dereeper
date Fri, 10 Jul 2015 10:38:43 -0400
parents 420b57c3c185
children
comparison
equal deleted inserted replaced
2:feb40a9a8eae 3:345f88a8f483
1 /*
2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol
3
4 This file is part of the EggLib library.
5
6 EggLib is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 EggLib is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20
21 #ifndef EGGLIB_NUCLEOTIDEDIVERSITY_HPP
22 #define EGGLIB_NUCLEOTIDEDIVERSITY_HPP
23
24
25 #include "BaseDiversity.hpp"
26 #include <string>
27 #include <vector>
28
29
30
31 namespace egglib {
32
33
34 /** \brief Performs analyzes of population genetics
35 *
36 * \ingroup polymorphism
37 *
38 * This class computes several summary statistics based on
39 * nucleotide analysis. Note that it is possible to use the same
40 * object to analyze different data set. Calling the load() method
41 * erases all data preivously computed (if any). Calling the load()
42 * method is absolutely required to compute any statistics. Some
43 * statistics are not computed by default, but are if the
44 * corresponding accessor is used (only load() is required).
45 *
46 * Note that "unsecure" accessors don't perform out-of-bound checks.
47 *
48 * S is the number of varying sites (only in sites that were not
49 * rejected).
50 *
51 * eta is the minimum number of mutations, that is the sum of the
52 * number of alleles minus 1 for each varying site. eta = S if all
53 * sites have no variant or 2 alleles. eta is computed independently
54 * of the option multiple and IS NOT computed over lseff sites.
55 *
56 * Pi is the average number of pairwise differences between sequences
57 * (expressed here per site) or (as computed here) the mean per site
58 * (unbiased) heterozygosity. Pi is zero if no polymorphic sites.
59 *
60 * D is the Tajima's test of neutrality
61 * Ref. Tajima F.: Statistical method for testing the neutral
62 * mutation hypothesis by DNA polymorphism. Genetics 1989, 123:585-595.
63 * It is arbitrary set to 0 if no polymorphic sites.
64 *
65 * tW: thetaW: estimator of theta based on polymorphic sites (ref.
66 * e.g. Watterson 1975 Theor. Pop. Biol.).
67 * Both D and thetaW are computed assuming that rounded nseff samples
68 * have been sampled.
69 * The variance of D is computed using rounded nseff instead of ns.
70 *
71 * H is the Fay and Wu's test of neutrality.
72 * Z is the standardized version and E a similar test.
73 * Ref. Fay J. C., Wu C.-I.: Hitchhiking under positive Darwinian
74 * selection. Genetics 2000, 155:1405-1413. and Zeng K., Fu Y. X.,
75 * Shi S., Wu C.-I.: Statistical tests for detecting positive
76 * selection by utilizing high-frequency variants. Genetics 2006,
77 * 174:1431-9. Both are arbitrary set to 0 if no polymorphic or
78 * orientable sites.
79 *
80 * tH and tL: theta H: estimators of theta based on derived
81 * polymorphic sites (ref in Fay and Wu and Zeng al.). The variance
82 * of H/Z are computed assuming that rounded nseff samples have
83 * been sampled.
84 *
85 */
86 class NucleotideDiversity : public BaseDiversity {
87
88 public:
89
90 /** \brief Builds an object
91 *
92 */
93 NucleotideDiversity();
94
95
96 /** \brief Destroys an object
97 *
98 */
99 virtual ~NucleotideDiversity();
100
101
102 /** \brief Identifies polymorphic sites and computes basis
103 * statistics
104 *
105 * \param data an alignment object (subclass of CharMatrix).
106 * The presence of outgroup or of different populations will
107 * be detected based on the populationLabel members of the
108 * passed object. The populationLabel 999 will be interpreted
109 * as outgroups. If several outgroups are passed, sites were
110 * the outgroups are not consistent will be treated as "non-
111 * orientable".
112 *
113 * \param allowMultipleMutations if true, sites with more
114 * than two alleles will not be ignored. The sum of the
115 * frequencies of all alleles not matching the outgroup will
116 * treated as the derived allele frequency (for orientable
117 * sites).
118 *
119 * \param minimumExploitableData sites where the non-missing
120 * data (as defined by characterMapping) are at a frequency
121 * larger than this value will be removed from the analysis.
122 * Use 1. to take only 'complete' sites into account and 0.
123 * to use all sites. (The outgroup is not considered in this
124 * computation.)
125 *
126 * \param ignoreFrequency removes sites that are polymorph
127 * because of an allele at absolute frequency smaller than or
128 * equal to this value. If ignoreFrequency=1, no sites are
129 * removed, if ignoreFrequency=0, singleton sites are
130 * ignored. Such sites are completely removed from the
131 * analysis (not counted in lseff). Note that if more than
132 * one mutation is allowed, the site is removed only if all
133 * the alleles but one are smaller than or equal to this
134 * value. For example, an alignment column AAAAAAGAAT is
135 * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is
136 * conserved (including the third allele T which is a
137 * singleton).
138 *
139 * \param characterMapping a string giving the list of
140 * characters that should be considered as valid data. If a
141 * space is present in the string, the characters left of the
142 * space will be treated as valid data and the characters
143 * right of the space will be treated as missing data, that
144 * is tolerated but ignored. All characters not in the string
145 * will cause an EggInvalidCharacterError to be raised.
146 *
147 * \param useZeroAsAncestral if true, all outgroups (if
148 * present) will be ignored and the character "0" will be
149 * considered as ancestral for all sites, whatever the
150 * character mapping.
151 *
152 */
153 virtual void load(
154 CharMatrix& data,
155 bool allowMultipleMutations=false,
156 double minimumExploitableData=1.,
157 unsigned int ignoreFrequency=0,
158 std::string characterMapping=dnaMapping,
159 bool useZeroAsAncestral=false
160 );
161
162
163 // accessors for the "site analysis" section
164
165 /// Number of polymorphic sites
166 unsigned int S() const;
167
168 /// Number of polymorphic orientable sites
169 unsigned int So() const;
170
171 /// Minimum number of mutations
172 unsigned int eta() const;
173
174 /// Average of per-site number of sequences effectively used
175 double nseff() const;
176
177 /// Number of sites effectively used
178 unsigned int lseff() const;
179
180 /// Average of number of sequences effectively used at orientable sites
181 double nseffo() const;
182
183 /// Number of orientable sites
184 unsigned int lseffo() const;
185
186 /// Number of detected populations
187 unsigned int npop() const;
188
189 /// Label of the population with given index (unsecure)
190 unsigned int popLabel(unsigned int popIndex) const; // no check!
191
192
193 // accessors for the "diversity" section
194
195 /// Nucleotide diversity
196 double Pi();
197
198 /// Watterson estimator of theta
199 double thetaW();
200
201 /// Average of Pi over populations
202 double average_Pi();
203
204 /// Pi of a given population (unsecure)
205 double pop_Pi(unsigned int popIndex); // no check!
206
207 /// Tajima's D
208 double D();
209
210 // accessors for the "outgroup diversity" section
211
212 /// Fay and Wu estimator of theta
213 double thetaH();
214
215 /// Zeng et al. estimator of theta
216 double thetaL();
217
218 /// Fay and Wu's H
219 double H();
220
221 /// Standardized H
222 double Z();
223
224 /// Zeng et al.'s E
225 double E();
226
227 // accessors for the "differentiation" section
228
229 /// Number of sites with at least one fixed difference
230 unsigned int FixedDifferences();
231
232 /// Number of sites with at least one allele shared among at least two populations
233 unsigned int CommonAlleles();
234
235 /// Number of sites with at least one non-fixed allele shared among at least two populations
236 unsigned int SharedAlleles();
237
238 /// Number of sites with at least one allele specific to one population
239 unsigned int SpecificAlleles();
240
241 /// Number of sites with at least one derived allele specific to one population
242 unsigned int SpecificDerivedAlleles();
243
244 /// Number of polymorphisms in a given population (unsecure)
245 unsigned int Polymorphisms(unsigned int pop);
246
247 /// Number of specific alleles for a given population (unsecure)
248 unsigned int SpecificAlleles(unsigned int pop);
249
250 /// Number of specific derived allele for a given population (unsecure)
251 unsigned int SpecificDerivedAlleles(unsigned int pop);
252
253 /// Number of fixed differences between a given pair of populations (unsecure; pop2 must be larger than pop1)
254 unsigned int FixedDifferences(unsigned int pop1, unsigned int pop2);
255
256 /// Number of common alleles between a given pair of populations (unsecure; pop2 must be larger than pop1)
257 unsigned int CommonAlleles(unsigned int pop1, unsigned int pop2);
258
259 /// Number of shared non-fixed alleles between a given pair of populations (unsecure; pop2 must be larger than pop1)
260 unsigned int SharedAlleles(unsigned int pop1, unsigned int pop2);
261
262
263 // accessor for the "triConfigurations" section
264
265 /** \brief Number falling into one of the possible site configurations
266 *
267 * The statistics are limited to three populations.
268 * Assuming an unrooted A/G polymorphism (A and G can be
269 * substitued), the site configurations are:
270 * - 0: A&G A A specific 1
271 * - 1: A&G A G specific 1 + fixed 2-3
272 * - 2: A A&G A specific 2
273 * - 3: A A&G G specific 2 + fixed 1-3
274 * - 4: A A A&G specific 3
275 * - 5: A G A&G specific 3 + fixed 1-2
276 * - 6: A&G A&G A shared 1-2
277 * - 7: A&G A A&G shared 1-3
278 * - 8: A A&G A&G shared 2-3
279 * - 9: A&G A&G A&G shared 1-2-3
280 * - 10: A G G fixed 1
281 * - 11: A G A fixed 2
282 * - 12: A A G fixed 3
283 *
284 * \param index must be an index from 0 to 12.
285 *
286 */
287 unsigned int triConfiguration(unsigned int index);
288
289
290 /// Builds and returns the vector of positions of all polymorphic sites
291 std::vector<unsigned int> polymorphic_positions() const;
292
293
294 /** \brief Builds and returns the vector of positions of all singleton sites
295 *
296 * A site singleton when it is polymorphic according to
297 * parameter of the diversity analysis, when it has exactly two
298 * alleles and one of them is at absolute frequency 1 (one
299 * copy) disregarding the outgroup.
300 *
301 */
302 std::vector<unsigned int> singleton_positions() const;
303
304
305 protected:
306
307 /** \brief This class cannot be copied
308 *
309 */
310 NucleotideDiversity(const NucleotideDiversity& source) { }
311
312
313 /** \brief This class cannot be copied
314 *
315 */
316 NucleotideDiversity& operator=(const NucleotideDiversity& source) { return *this; }
317
318
319 void init(); // initializes values
320 void clear(); // free memory but doesn't initializes
321
322 // diversity (without outgroup)
323 void diversity();
324
325 // diversity with outgroup
326 void outgroupDiversity();
327
328 // site patterns
329 void differentiation();
330
331 // triconfigurations
332 void triConfigurations();
333
334
335 // holders for statistics, with booleans flagging groups of stats
336
337 bool b_analysisSites;
338
339 bool b_diversity;
340
341 double v_Pi; // nucleotide diversity
342 double v_thetaW; // theta (Watterson estimator)
343 double v_average_Pi; // average diversity across populations
344 double *v_pop_Pi; // diversity per population
345 double v_D; // Tajima's D
346
347 bool b_outgroupDiversity;
348
349 double v_thetaH; // theta (Fay and Wu estimator)
350 double v_thetaL; // theta (Zeng estimator)
351 double v_H; // Fay and Wu's H
352 double v_Z; // normalized Fay and Wu's H
353 double v_E; // Zeng et al.'s E
354
355 bool b_differentiation;
356
357 unsigned int *v_pairwiseFixedDifferences;
358 unsigned int *v_pairwiseCommonAlleles;
359 unsigned int *v_pairwiseSharedAlleles;
360 unsigned int *v_popPolymorphic;
361 unsigned int *v_popSpecific;
362 unsigned int *v_popSpecificDerived;
363 unsigned int v_countFixedDifferences;
364 unsigned int v_countCommonAlleles;
365 unsigned int v_countSharedAlleles;
366 unsigned int v_countSpecificAlleles;
367 unsigned int v_countSpecificDerivedAlleles;
368
369
370 bool b_triConfigurations;
371
372 unsigned int *v_triConfigurations;
373
374 };
375 }
376
377 #endif