Mercurial > repos > dereeper > sniplay
comparison egglib/egglib-2.1.5/include/egglib-cpp/NucleotideDiversity.hpp @ 3:345f88a8f483 draft
Uploaded
| author | dereeper | 
|---|---|
| date | Fri, 10 Jul 2015 10:38:43 -0400 | 
| parents | 420b57c3c185 | 
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 2:feb40a9a8eae | 3:345f88a8f483 | 
|---|---|
| 1 /* | |
| 2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol | |
| 3 | |
| 4 This file is part of the EggLib library. | |
| 5 | |
| 6 EggLib is free software: you can redistribute it and/or modify | |
| 7 it under the terms of the GNU General Public License as published by | |
| 8 the Free Software Foundation, either version 3 of the License, or | |
| 9 (at your option) any later version. | |
| 10 | |
| 11 EggLib is distributed in the hope that it will be useful, | |
| 12 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 GNU General Public License for more details. | |
| 15 | |
| 16 You should have received a copy of the GNU General Public License | |
| 17 along with EggLib. If not, see <http://www.gnu.org/licenses/>. | |
| 18 */ | |
| 19 | |
| 20 | |
| 21 #ifndef EGGLIB_NUCLEOTIDEDIVERSITY_HPP | |
| 22 #define EGGLIB_NUCLEOTIDEDIVERSITY_HPP | |
| 23 | |
| 24 | |
| 25 #include "BaseDiversity.hpp" | |
| 26 #include <string> | |
| 27 #include <vector> | |
| 28 | |
| 29 | |
| 30 | |
| 31 namespace egglib { | |
| 32 | |
| 33 | |
| 34 /** \brief Performs analyzes of population genetics | |
| 35 * | |
| 36 * \ingroup polymorphism | |
| 37 * | |
| 38 * This class computes several summary statistics based on | |
| 39 * nucleotide analysis. Note that it is possible to use the same | |
| 40 * object to analyze different data set. Calling the load() method | |
| 41 * erases all data preivously computed (if any). Calling the load() | |
| 42 * method is absolutely required to compute any statistics. Some | |
| 43 * statistics are not computed by default, but are if the | |
| 44 * corresponding accessor is used (only load() is required). | |
| 45 * | |
| 46 * Note that "unsecure" accessors don't perform out-of-bound checks. | |
| 47 * | |
| 48 * S is the number of varying sites (only in sites that were not | |
| 49 * rejected). | |
| 50 * | |
| 51 * eta is the minimum number of mutations, that is the sum of the | |
| 52 * number of alleles minus 1 for each varying site. eta = S if all | |
| 53 * sites have no variant or 2 alleles. eta is computed independently | |
| 54 * of the option multiple and IS NOT computed over lseff sites. | |
| 55 * | |
| 56 * Pi is the average number of pairwise differences between sequences | |
| 57 * (expressed here per site) or (as computed here) the mean per site | |
| 58 * (unbiased) heterozygosity. Pi is zero if no polymorphic sites. | |
| 59 * | |
| 60 * D is the Tajima's test of neutrality | |
| 61 * Ref. Tajima F.: Statistical method for testing the neutral | |
| 62 * mutation hypothesis by DNA polymorphism. Genetics 1989, 123:585-595. | |
| 63 * It is arbitrary set to 0 if no polymorphic sites. | |
| 64 * | |
| 65 * tW: thetaW: estimator of theta based on polymorphic sites (ref. | |
| 66 * e.g. Watterson 1975 Theor. Pop. Biol.). | |
| 67 * Both D and thetaW are computed assuming that rounded nseff samples | |
| 68 * have been sampled. | |
| 69 * The variance of D is computed using rounded nseff instead of ns. | |
| 70 * | |
| 71 * H is the Fay and Wu's test of neutrality. | |
| 72 * Z is the standardized version and E a similar test. | |
| 73 * Ref. Fay J. C., Wu C.-I.: Hitchhiking under positive Darwinian | |
| 74 * selection. Genetics 2000, 155:1405-1413. and Zeng K., Fu Y. X., | |
| 75 * Shi S., Wu C.-I.: Statistical tests for detecting positive | |
| 76 * selection by utilizing high-frequency variants. Genetics 2006, | |
| 77 * 174:1431-9. Both are arbitrary set to 0 if no polymorphic or | |
| 78 * orientable sites. | |
| 79 * | |
| 80 * tH and tL: theta H: estimators of theta based on derived | |
| 81 * polymorphic sites (ref in Fay and Wu and Zeng al.). The variance | |
| 82 * of H/Z are computed assuming that rounded nseff samples have | |
| 83 * been sampled. | |
| 84 * | |
| 85 */ | |
| 86 class NucleotideDiversity : public BaseDiversity { | |
| 87 | |
| 88 public: | |
| 89 | |
| 90 /** \brief Builds an object | |
| 91 * | |
| 92 */ | |
| 93 NucleotideDiversity(); | |
| 94 | |
| 95 | |
| 96 /** \brief Destroys an object | |
| 97 * | |
| 98 */ | |
| 99 virtual ~NucleotideDiversity(); | |
| 100 | |
| 101 | |
| 102 /** \brief Identifies polymorphic sites and computes basis | |
| 103 * statistics | |
| 104 * | |
| 105 * \param data an alignment object (subclass of CharMatrix). | |
| 106 * The presence of outgroup or of different populations will | |
| 107 * be detected based on the populationLabel members of the | |
| 108 * passed object. The populationLabel 999 will be interpreted | |
| 109 * as outgroups. If several outgroups are passed, sites were | |
| 110 * the outgroups are not consistent will be treated as "non- | |
| 111 * orientable". | |
| 112 * | |
| 113 * \param allowMultipleMutations if true, sites with more | |
| 114 * than two alleles will not be ignored. The sum of the | |
| 115 * frequencies of all alleles not matching the outgroup will | |
| 116 * treated as the derived allele frequency (for orientable | |
| 117 * sites). | |
| 118 * | |
| 119 * \param minimumExploitableData sites where the non-missing | |
| 120 * data (as defined by characterMapping) are at a frequency | |
| 121 * larger than this value will be removed from the analysis. | |
| 122 * Use 1. to take only 'complete' sites into account and 0. | |
| 123 * to use all sites. (The outgroup is not considered in this | |
| 124 * computation.) | |
| 125 * | |
| 126 * \param ignoreFrequency removes sites that are polymorph | |
| 127 * because of an allele at absolute frequency smaller than or | |
| 128 * equal to this value. If ignoreFrequency=1, no sites are | |
| 129 * removed, if ignoreFrequency=0, singleton sites are | |
| 130 * ignored. Such sites are completely removed from the | |
| 131 * analysis (not counted in lseff). Note that if more than | |
| 132 * one mutation is allowed, the site is removed only if all | |
| 133 * the alleles but one are smaller than or equal to this | |
| 134 * value. For example, an alignment column AAAAAAGAAT is | |
| 135 * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is | |
| 136 * conserved (including the third allele T which is a | |
| 137 * singleton). | |
| 138 * | |
| 139 * \param characterMapping a string giving the list of | |
| 140 * characters that should be considered as valid data. If a | |
| 141 * space is present in the string, the characters left of the | |
| 142 * space will be treated as valid data and the characters | |
| 143 * right of the space will be treated as missing data, that | |
| 144 * is tolerated but ignored. All characters not in the string | |
| 145 * will cause an EggInvalidCharacterError to be raised. | |
| 146 * | |
| 147 * \param useZeroAsAncestral if true, all outgroups (if | |
| 148 * present) will be ignored and the character "0" will be | |
| 149 * considered as ancestral for all sites, whatever the | |
| 150 * character mapping. | |
| 151 * | |
| 152 */ | |
| 153 virtual void load( | |
| 154 CharMatrix& data, | |
| 155 bool allowMultipleMutations=false, | |
| 156 double minimumExploitableData=1., | |
| 157 unsigned int ignoreFrequency=0, | |
| 158 std::string characterMapping=dnaMapping, | |
| 159 bool useZeroAsAncestral=false | |
| 160 ); | |
| 161 | |
| 162 | |
| 163 // accessors for the "site analysis" section | |
| 164 | |
| 165 /// Number of polymorphic sites | |
| 166 unsigned int S() const; | |
| 167 | |
| 168 /// Number of polymorphic orientable sites | |
| 169 unsigned int So() const; | |
| 170 | |
| 171 /// Minimum number of mutations | |
| 172 unsigned int eta() const; | |
| 173 | |
| 174 /// Average of per-site number of sequences effectively used | |
| 175 double nseff() const; | |
| 176 | |
| 177 /// Number of sites effectively used | |
| 178 unsigned int lseff() const; | |
| 179 | |
| 180 /// Average of number of sequences effectively used at orientable sites | |
| 181 double nseffo() const; | |
| 182 | |
| 183 /// Number of orientable sites | |
| 184 unsigned int lseffo() const; | |
| 185 | |
| 186 /// Number of detected populations | |
| 187 unsigned int npop() const; | |
| 188 | |
| 189 /// Label of the population with given index (unsecure) | |
| 190 unsigned int popLabel(unsigned int popIndex) const; // no check! | |
| 191 | |
| 192 | |
| 193 // accessors for the "diversity" section | |
| 194 | |
| 195 /// Nucleotide diversity | |
| 196 double Pi(); | |
| 197 | |
| 198 /// Watterson estimator of theta | |
| 199 double thetaW(); | |
| 200 | |
| 201 /// Average of Pi over populations | |
| 202 double average_Pi(); | |
| 203 | |
| 204 /// Pi of a given population (unsecure) | |
| 205 double pop_Pi(unsigned int popIndex); // no check! | |
| 206 | |
| 207 /// Tajima's D | |
| 208 double D(); | |
| 209 | |
| 210 // accessors for the "outgroup diversity" section | |
| 211 | |
| 212 /// Fay and Wu estimator of theta | |
| 213 double thetaH(); | |
| 214 | |
| 215 /// Zeng et al. estimator of theta | |
| 216 double thetaL(); | |
| 217 | |
| 218 /// Fay and Wu's H | |
| 219 double H(); | |
| 220 | |
| 221 /// Standardized H | |
| 222 double Z(); | |
| 223 | |
| 224 /// Zeng et al.'s E | |
| 225 double E(); | |
| 226 | |
| 227 // accessors for the "differentiation" section | |
| 228 | |
| 229 /// Number of sites with at least one fixed difference | |
| 230 unsigned int FixedDifferences(); | |
| 231 | |
| 232 /// Number of sites with at least one allele shared among at least two populations | |
| 233 unsigned int CommonAlleles(); | |
| 234 | |
| 235 /// Number of sites with at least one non-fixed allele shared among at least two populations | |
| 236 unsigned int SharedAlleles(); | |
| 237 | |
| 238 /// Number of sites with at least one allele specific to one population | |
| 239 unsigned int SpecificAlleles(); | |
| 240 | |
| 241 /// Number of sites with at least one derived allele specific to one population | |
| 242 unsigned int SpecificDerivedAlleles(); | |
| 243 | |
| 244 /// Number of polymorphisms in a given population (unsecure) | |
| 245 unsigned int Polymorphisms(unsigned int pop); | |
| 246 | |
| 247 /// Number of specific alleles for a given population (unsecure) | |
| 248 unsigned int SpecificAlleles(unsigned int pop); | |
| 249 | |
| 250 /// Number of specific derived allele for a given population (unsecure) | |
| 251 unsigned int SpecificDerivedAlleles(unsigned int pop); | |
| 252 | |
| 253 /// Number of fixed differences between a given pair of populations (unsecure; pop2 must be larger than pop1) | |
| 254 unsigned int FixedDifferences(unsigned int pop1, unsigned int pop2); | |
| 255 | |
| 256 /// Number of common alleles between a given pair of populations (unsecure; pop2 must be larger than pop1) | |
| 257 unsigned int CommonAlleles(unsigned int pop1, unsigned int pop2); | |
| 258 | |
| 259 /// Number of shared non-fixed alleles between a given pair of populations (unsecure; pop2 must be larger than pop1) | |
| 260 unsigned int SharedAlleles(unsigned int pop1, unsigned int pop2); | |
| 261 | |
| 262 | |
| 263 // accessor for the "triConfigurations" section | |
| 264 | |
| 265 /** \brief Number falling into one of the possible site configurations | |
| 266 * | |
| 267 * The statistics are limited to three populations. | |
| 268 * Assuming an unrooted A/G polymorphism (A and G can be | |
| 269 * substitued), the site configurations are: | |
| 270 * - 0: A&G A A specific 1 | |
| 271 * - 1: A&G A G specific 1 + fixed 2-3 | |
| 272 * - 2: A A&G A specific 2 | |
| 273 * - 3: A A&G G specific 2 + fixed 1-3 | |
| 274 * - 4: A A A&G specific 3 | |
| 275 * - 5: A G A&G specific 3 + fixed 1-2 | |
| 276 * - 6: A&G A&G A shared 1-2 | |
| 277 * - 7: A&G A A&G shared 1-3 | |
| 278 * - 8: A A&G A&G shared 2-3 | |
| 279 * - 9: A&G A&G A&G shared 1-2-3 | |
| 280 * - 10: A G G fixed 1 | |
| 281 * - 11: A G A fixed 2 | |
| 282 * - 12: A A G fixed 3 | |
| 283 * | |
| 284 * \param index must be an index from 0 to 12. | |
| 285 * | |
| 286 */ | |
| 287 unsigned int triConfiguration(unsigned int index); | |
| 288 | |
| 289 | |
| 290 /// Builds and returns the vector of positions of all polymorphic sites | |
| 291 std::vector<unsigned int> polymorphic_positions() const; | |
| 292 | |
| 293 | |
| 294 /** \brief Builds and returns the vector of positions of all singleton sites | |
| 295 * | |
| 296 * A site singleton when it is polymorphic according to | |
| 297 * parameter of the diversity analysis, when it has exactly two | |
| 298 * alleles and one of them is at absolute frequency 1 (one | |
| 299 * copy) disregarding the outgroup. | |
| 300 * | |
| 301 */ | |
| 302 std::vector<unsigned int> singleton_positions() const; | |
| 303 | |
| 304 | |
| 305 protected: | |
| 306 | |
| 307 /** \brief This class cannot be copied | |
| 308 * | |
| 309 */ | |
| 310 NucleotideDiversity(const NucleotideDiversity& source) { } | |
| 311 | |
| 312 | |
| 313 /** \brief This class cannot be copied | |
| 314 * | |
| 315 */ | |
| 316 NucleotideDiversity& operator=(const NucleotideDiversity& source) { return *this; } | |
| 317 | |
| 318 | |
| 319 void init(); // initializes values | |
| 320 void clear(); // free memory but doesn't initializes | |
| 321 | |
| 322 // diversity (without outgroup) | |
| 323 void diversity(); | |
| 324 | |
| 325 // diversity with outgroup | |
| 326 void outgroupDiversity(); | |
| 327 | |
| 328 // site patterns | |
| 329 void differentiation(); | |
| 330 | |
| 331 // triconfigurations | |
| 332 void triConfigurations(); | |
| 333 | |
| 334 | |
| 335 // holders for statistics, with booleans flagging groups of stats | |
| 336 | |
| 337 bool b_analysisSites; | |
| 338 | |
| 339 bool b_diversity; | |
| 340 | |
| 341 double v_Pi; // nucleotide diversity | |
| 342 double v_thetaW; // theta (Watterson estimator) | |
| 343 double v_average_Pi; // average diversity across populations | |
| 344 double *v_pop_Pi; // diversity per population | |
| 345 double v_D; // Tajima's D | |
| 346 | |
| 347 bool b_outgroupDiversity; | |
| 348 | |
| 349 double v_thetaH; // theta (Fay and Wu estimator) | |
| 350 double v_thetaL; // theta (Zeng estimator) | |
| 351 double v_H; // Fay and Wu's H | |
| 352 double v_Z; // normalized Fay and Wu's H | |
| 353 double v_E; // Zeng et al.'s E | |
| 354 | |
| 355 bool b_differentiation; | |
| 356 | |
| 357 unsigned int *v_pairwiseFixedDifferences; | |
| 358 unsigned int *v_pairwiseCommonAlleles; | |
| 359 unsigned int *v_pairwiseSharedAlleles; | |
| 360 unsigned int *v_popPolymorphic; | |
| 361 unsigned int *v_popSpecific; | |
| 362 unsigned int *v_popSpecificDerived; | |
| 363 unsigned int v_countFixedDifferences; | |
| 364 unsigned int v_countCommonAlleles; | |
| 365 unsigned int v_countSharedAlleles; | |
| 366 unsigned int v_countSpecificAlleles; | |
| 367 unsigned int v_countSpecificDerivedAlleles; | |
| 368 | |
| 369 | |
| 370 bool b_triConfigurations; | |
| 371 | |
| 372 unsigned int *v_triConfigurations; | |
| 373 | |
| 374 }; | |
| 375 } | |
| 376 | |
| 377 #endif | 
