| 1 | 1 /* | 
|  | 2     Copyright 2010 Stéphane De Mita, Mathieu Siol | 
|  | 3 | 
|  | 4     This file is part of the EggLib library. | 
|  | 5 | 
|  | 6     EggLib is free software: you can redistribute it and/or modify | 
|  | 7     it under the terms of the GNU General Public License as published by | 
|  | 8     the Free Software Foundation, either version 3 of the License, or | 
|  | 9     (at your option) any later version. | 
|  | 10 | 
|  | 11     EggLib is distributed in the hope that it will be useful, | 
|  | 12     but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|  | 14     GNU General Public License for more details. | 
|  | 15 | 
|  | 16     You should have received a copy of the GNU General Public License | 
|  | 17     along with EggLib.  If not, see <http://www.gnu.org/licenses/>. | 
|  | 18 */ | 
|  | 19 | 
|  | 20 #ifndef EGGLIB_HFSTATISTICS_HPP | 
|  | 21 #define EGGLIB_HFSTATISTICS_HPP | 
|  | 22 | 
|  | 23 | 
|  | 24 | 
|  | 25 namespace egglib { | 
|  | 26 | 
|  | 27 | 
|  | 28     /** \brief Computes Fst and Fit from haploid data | 
|  | 29     * | 
|  | 30     * The class requires loading data. Data are loaded by haploid | 
|  | 31     * (one genotype per individual). The analyses are cached: they are | 
|  | 32     * performed upon the first call to statistics accessors. The cache | 
|  | 33     * is emptied whenever a datum is loaded. | 
|  | 34     * | 
|  | 35     * The computations are performed after Weir and Cockerham. The | 
|  | 36     * statistic theta is generalized for multiple alleles. To allow | 
|  | 37     * computation of multi-locus statistics, variance components are | 
|  | 38     * also available. The two components of the variance are T1 and T2 | 
|  | 39     * and theta is T1/T2 (from Weir 1996 "Genetic Data Analysis II", | 
|  | 40     * Sinauer associates, Sunderland MA). | 
|  | 41     * | 
|  | 42     * \ingroup polymorphism | 
|  | 43     * | 
|  | 44     */ | 
|  | 45     class HFStatistics { | 
|  | 46 | 
|  | 47         public: | 
|  | 48 | 
|  | 49            /** \brief Constructor | 
|  | 50             * | 
|  | 51             */ | 
|  | 52             HFStatistics(); | 
|  | 53 | 
|  | 54 | 
|  | 55            /** \brief Destructor | 
|  | 56             * | 
|  | 57             */ | 
|  | 58             virtual ~HFStatistics(); | 
|  | 59 | 
|  | 60 | 
|  | 61            /** \brief Reserve sufficient memory for a given number of | 
|  | 62             * individuals. | 
|  | 63             * | 
|  | 64             * This method makes the load function faster by allocating | 
|  | 65             * all required memory at once. | 
|  | 66             * | 
|  | 67             * \param numberOfIndividuals a strictly positive integer. | 
|  | 68             * | 
|  | 69             */ | 
|  | 70             void reserve(unsigned int numberOfIndividuals); | 
|  | 71 | 
|  | 72 | 
|  | 73            /** \brief Loads the data for one individual | 
|  | 74             * | 
|  | 75             * \param genotype an integer giving the allele. | 
|  | 76             * \param populationLabel an integer indication belonging to | 
|  | 77             * a population. | 
|  | 78             * | 
|  | 79             * Genotypes and population labels are not required to be | 
|  | 80             * consecutive (both are labels, not indices). They are | 
|  | 81             * internally mapped to indices (the mapping can be obtained | 
|  | 82             * by accessors populationLabel and allele). | 
|  | 83             * | 
|  | 84             * All genotypes are considered to be valid (no missing data). | 
|  | 85             * If statistics were computed previous to call to this | 
|  | 86             * function, all data will be erased. | 
|  | 87             * | 
|  | 88             */ | 
|  | 89             void loadIndividual(unsigned int genotype, unsigned int populationLabel); | 
|  | 90 | 
|  | 91 | 
|  | 92            /** \brief Label of a population | 
|  | 93             * | 
|  | 94             * The index corresponds to the local mapping of populations | 
|  | 95             * regardless of the ranking of population labels. (No out | 
|  | 96             * of bound checking.) | 
|  | 97             * | 
|  | 98             */ | 
|  | 99             unsigned int populationLabel(unsigned int populationIndex); | 
|  | 100 | 
|  | 101 | 
|  | 102            /** \brief Value of an allele | 
|  | 103             * | 
|  | 104             * The index corresponds to the local mapping of alleles | 
|  | 105             * regardless of the ranking of allele values. (No out of | 
|  | 106             * bound checking.) | 
|  | 107             * | 
|  | 108             */ | 
|  | 109             unsigned int alleleValue(unsigned int alleleIndex); | 
|  | 110 | 
|  | 111 | 
|  | 112             /// Allele of a given individual (no checking) | 
|  | 113             unsigned int allele(unsigned int individualIndex) const; | 
|  | 114 | 
|  | 115             /// Population label of a given individual (no checking) | 
|  | 116             unsigned int individualLabel(unsigned int individualIndex) const; | 
|  | 117 | 
|  | 118 | 
|  | 119            /** \brief Number of alleles | 
|  | 120             * | 
|  | 121             */ | 
|  | 122             unsigned int numberOfAlleles(); | 
|  | 123 | 
|  | 124 | 
|  | 125            /** \brief Number of populations | 
|  | 126             * | 
|  | 127             */ | 
|  | 128             unsigned int numberOfPopulations(); | 
|  | 129 | 
|  | 130 | 
|  | 131            /** \brief Number of loaded genotypes | 
|  | 132             * | 
|  | 133             */ | 
|  | 134             unsigned int numberOfGenotypes() const; | 
|  | 135 | 
|  | 136 | 
|  | 137            /** \brief Absolute total allele frequency | 
|  | 138             * | 
|  | 139             */ | 
|  | 140             unsigned int alleleFrequencyTotal(unsigned int alleleIndex); | 
|  | 141 | 
|  | 142 | 
|  | 143            /** \brief Absolute allele frequency in a population | 
|  | 144             * | 
|  | 145             */ | 
|  | 146             unsigned int alleleFrequencyPerPopulation(unsigned int populationIndex, unsigned int alleleIndex); | 
|  | 147 | 
|  | 148 | 
|  | 149            /** \brief Sample size of a population | 
|  | 150             * | 
|  | 151             */ | 
|  | 152             unsigned int populationFrequency(unsigned int populationIndex); | 
|  | 153 | 
|  | 154 | 
|  | 155            /** \brief Weir-Cockerham theta-statistic | 
|  | 156             * | 
|  | 157             * Note: equivalent to Fst. | 
|  | 158             * | 
|  | 159             */ | 
|  | 160             double theta(); | 
|  | 161 | 
|  | 162 | 
|  | 163            /** \brief Between-population component of variance | 
|  | 164             * | 
|  | 165             */ | 
|  | 166             double T1(); | 
|  | 167 | 
|  | 168 | 
|  | 169            /** \brief Total variance | 
|  | 170             * | 
|  | 171             */ | 
|  | 172             double T2(); | 
|  | 173 | 
|  | 174 | 
|  | 175         protected: | 
|  | 176 | 
|  | 177             bool d_flag; | 
|  | 178             void d_init(); | 
|  | 179             void d_clear(); | 
|  | 180             unsigned int  d_reserved; | 
|  | 181             unsigned int  d_numberOfGenotypes; | 
|  | 182             unsigned int *d_genotypes; | 
|  | 183             unsigned int *d_populationLabels; | 
|  | 184 | 
|  | 185             bool s_flag; | 
|  | 186             void s_init(); | 
|  | 187             void s_clear(); | 
|  | 188             void s_compute(); | 
|  | 189             void processPopulations(); | 
|  | 190             void processAlleles(); | 
|  | 191             unsigned int getPopulationIndex(unsigned int) const; | 
|  | 192             unsigned int getAlleleIndex(unsigned int) const; | 
|  | 193             unsigned int    s_numberOfAlleles; | 
|  | 194             unsigned int   *s_alleleValueMapping; | 
|  | 195             unsigned int    s_numberOfPopulations; | 
|  | 196             unsigned int   *s_populationLabelMapping; | 
|  | 197             unsigned int   *s_populationFrequencies; | 
|  | 198             unsigned int   *s_alleleFrequenciesTotal; | 
|  | 199             unsigned int  **s_alleleFrequenciesPerPopulation; | 
|  | 200 | 
|  | 201             bool w_flag; | 
|  | 202             void w_init(); | 
|  | 203             void w_clear(); | 
|  | 204             void w_compute(); | 
|  | 205             double  w_T; | 
|  | 206             double *w_T1; | 
|  | 207             double *w_T2; | 
|  | 208             double  w_nbar; | 
|  | 209             double  w_nc; | 
|  | 210             double *w_pbar; | 
|  | 211             double *w_ssquare; | 
|  | 212             double  w_sum_T1; | 
|  | 213             double  w_sum_T2; | 
|  | 214 | 
|  | 215 | 
|  | 216         private: | 
|  | 217 | 
|  | 218             HFStatistics(const HFStatistics& source) { } | 
|  | 219 | 
|  | 220             HFStatistics& operator=(const HFStatistics& source) { | 
|  | 221                 return *this; | 
|  | 222             } | 
|  | 223 | 
|  | 224     }; | 
|  | 225 } | 
|  | 226 | 
|  | 227 #endif |