| 
1
 | 
     1 /*
 | 
| 
 | 
     2     Copyright 2009 Stéphane De Mita, Mathieu Siol
 | 
| 
 | 
     3 
 | 
| 
 | 
     4     This file is part of the EggLib library.
 | 
| 
 | 
     5 
 | 
| 
 | 
     6     EggLib is free software: you can redistribute it and/or modify
 | 
| 
 | 
     7     it under the terms of the GNU General Public License as published by
 | 
| 
 | 
     8     the Free Software Foundation, either version 3 of the License, or
 | 
| 
 | 
     9     (at your option) any later version.
 | 
| 
 | 
    10 
 | 
| 
 | 
    11     EggLib is distributed in the hope that it will be useful,
 | 
| 
 | 
    12     but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
| 
 | 
    13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
| 
 | 
    14     GNU General Public License for more details.
 | 
| 
 | 
    15 
 | 
| 
 | 
    16     You should have received a copy of the GNU General Public License
 | 
| 
 | 
    17     along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
 | 
| 
 | 
    18 */
 | 
| 
 | 
    19 
 | 
| 
 | 
    20 #ifndef EGGLIB_FSTATISTICS_HPP
 | 
| 
 | 
    21 #define EGGLIB_FSTATISTICS_HPP
 | 
| 
 | 
    22 
 | 
| 
 | 
    23 
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 namespace egglib {
 | 
| 
 | 
    26 
 | 
| 
 | 
    27 
 | 
| 
 | 
    28     /** \brief Computes Fis, Fst and Fit from diploid data
 | 
| 
 | 
    29     *
 | 
| 
 | 
    30     * The class requires loading data. Data are loaded by individual
 | 
| 
 | 
    31     * (two genotypes per individual). The analyses are cached: they are
 | 
| 
 | 
    32     * performed upon the first call to statistics accessors. The cache
 | 
| 
 | 
    33     * is emptied whenever a datum is loaded.
 | 
| 
 | 
    34     * 
 | 
| 
 | 
    35     * The computations are performed after Weir and Cockerham. The
 | 
| 
 | 
    36     * statistics F, theta and f are generalized for multiple alleles.
 | 
| 
 | 
    37     * To allow computation of multi-locus statistics, variance
 | 
| 
 | 
    38     * components are also available. The three components of the
 | 
| 
 | 
    39     * variance are Vpopulation (between-population), Vindividual
 | 
| 
 | 
    40     * (within-population, between-individual) and Vallele (within-
 | 
| 
 | 
    41     * individual). The formulas to compute the F-statistics are as
 | 
| 
 | 
    42     * follows:
 | 
| 
 | 
    43     *       - 1-F = Vallele/(Vpopulation+Vindividual+Vallele)
 | 
| 
 | 
    44     *       - theta = Vpopulation/(Vpopulation+Vindividual+Vallele)
 | 
| 
 | 
    45     *       - 1-f = Vallele/(Vindividual+Vallele).
 | 
| 
 | 
    46     * 
 | 
| 
 | 
    47     * \ingroup polymorphism
 | 
| 
 | 
    48     *
 | 
| 
 | 
    49     */
 | 
| 
 | 
    50     class FStatistics {
 | 
| 
 | 
    51     
 | 
| 
 | 
    52         public:
 | 
| 
 | 
    53     
 | 
| 
 | 
    54            /** \brief Constructor
 | 
| 
 | 
    55             * 
 | 
| 
 | 
    56             */ 
 | 
| 
 | 
    57             FStatistics();
 | 
| 
 | 
    58 
 | 
| 
 | 
    59             
 | 
| 
 | 
    60            /** \brief Destructor
 | 
| 
 | 
    61             * 
 | 
| 
 | 
    62             */ 
 | 
| 
 | 
    63             virtual ~FStatistics();
 | 
| 
 | 
    64 
 | 
| 
 | 
    65             
 | 
| 
 | 
    66            /** \brief Reserve sufficient memory for a given number of
 | 
| 
 | 
    67             * individuals.
 | 
| 
 | 
    68             * 
 | 
| 
 | 
    69             * This method makes the load function faster by allocating
 | 
| 
 | 
    70             * all required memory at once.
 | 
| 
 | 
    71             * 
 | 
| 
 | 
    72             * \param numberOfIndividuals a strictly positive integer.
 | 
| 
 | 
    73             * 
 | 
| 
 | 
    74             */
 | 
| 
 | 
    75             void reserve(unsigned int numberOfIndividuals);
 | 
| 
 | 
    76 
 | 
| 
 | 
    77 
 | 
| 
 | 
    78            /** \brief Loads the data for one individual
 | 
| 
 | 
    79             * 
 | 
| 
 | 
    80             * \param genotype1 an integer giving the first allele.
 | 
| 
 | 
    81             * \param genotype2 an integer giving the second allele.
 | 
| 
 | 
    82             * \param populationLabel an integer indication belonging to
 | 
| 
 | 
    83             * a population.
 | 
| 
 | 
    84             * 
 | 
| 
 | 
    85             * Genotypes and population labels are not required to be
 | 
| 
 | 
    86             * consecutive (both are labels, not indices). They are
 | 
| 
 | 
    87             * internally mapped to indices (the mapping can be obtained
 | 
| 
 | 
    88             * by accessors populationLabel and allele).
 | 
| 
 | 
    89             * 
 | 
| 
 | 
    90             * All genotypes are considered to be valid (no missing data).
 | 
| 
 | 
    91             * If statistics were computed previous to call to this
 | 
| 
 | 
    92             * function, all data will be erase.
 | 
| 
 | 
    93             * 
 | 
| 
 | 
    94             */
 | 
| 
 | 
    95             void loadIndividual(unsigned int genotype1,
 | 
| 
 | 
    96                     unsigned int genotype2, unsigned int populationLabel);
 | 
| 
 | 
    97 
 | 
| 
 | 
    98 
 | 
| 
 | 
    99            /** \brief Label of a population
 | 
| 
 | 
   100             * 
 | 
| 
 | 
   101             * The index corresponds to the local mapping of populations
 | 
| 
 | 
   102             * regardless of the ranking of population labels. (No out
 | 
| 
 | 
   103             * of bound checking.)
 | 
| 
 | 
   104             * 
 | 
| 
 | 
   105             */
 | 
| 
 | 
   106             unsigned int populationLabel(unsigned int populationIndex);
 | 
| 
 | 
   107 
 | 
| 
 | 
   108 
 | 
| 
 | 
   109            /** \brief Value of an allele
 | 
| 
 | 
   110             * 
 | 
| 
 | 
   111             * The index corresponds to the local mapping of alleles
 | 
| 
 | 
   112             * regardless of the ranking of allele values. (No out of
 | 
| 
 | 
   113             * bound checking.)
 | 
| 
 | 
   114             * 
 | 
| 
 | 
   115             */
 | 
| 
 | 
   116             unsigned int alleleValue(unsigned int alleleIndex);
 | 
| 
 | 
   117 
 | 
| 
 | 
   118 
 | 
| 
 | 
   119             /// First allele of a given individual (no checking)
 | 
| 
 | 
   120             unsigned int firstAllele(unsigned int individualIndex) const;
 | 
| 
 | 
   121 
 | 
| 
 | 
   122             /// Second allele of a given individual (no checking)
 | 
| 
 | 
   123             unsigned int secondAllele(unsigned int individualIndex) const;
 | 
| 
 | 
   124 
 | 
| 
 | 
   125             /// Population label of a given individual (no checking)
 | 
| 
 | 
   126             unsigned int individualLabel(unsigned int individualIndex) const;
 | 
| 
 | 
   127 
 | 
| 
 | 
   128 
 | 
| 
 | 
   129            /** \brief Number of alleles
 | 
| 
 | 
   130             * 
 | 
| 
 | 
   131             */
 | 
| 
 | 
   132             unsigned int numberOfAlleles();
 | 
| 
 | 
   133 
 | 
| 
 | 
   134 
 | 
| 
 | 
   135            /** \brief Number of populations
 | 
| 
 | 
   136             * 
 | 
| 
 | 
   137             */
 | 
| 
 | 
   138             unsigned int numberOfPopulations();
 | 
| 
 | 
   139 
 | 
| 
 | 
   140 
 | 
| 
 | 
   141            /** \brief Number of loaded genotypes
 | 
| 
 | 
   142             * 
 | 
| 
 | 
   143             */
 | 
| 
 | 
   144             unsigned int numberOfGenotypes() const;
 | 
| 
 | 
   145 
 | 
| 
 | 
   146 
 | 
| 
 | 
   147            /** \brief Absolute total allele frequency
 | 
| 
 | 
   148             * 
 | 
| 
 | 
   149             */
 | 
| 
 | 
   150             unsigned int alleleFrequencyTotal(unsigned int alleleIndex);
 | 
| 
 | 
   151             
 | 
| 
 | 
   152 
 | 
| 
 | 
   153            /** \brief Absolute allele frequency in a population
 | 
| 
 | 
   154             * 
 | 
| 
 | 
   155             */
 | 
| 
 | 
   156             unsigned int alleleFrequencyPerPopulation(unsigned int populationIndex, unsigned int alleleIndex);
 | 
| 
 | 
   157 
 | 
| 
 | 
   158 
 | 
| 
 | 
   159            /** \brief Absolute genotype frequency
 | 
| 
 | 
   160             * 
 | 
| 
 | 
   161             * Note that allele AB is considered different to BA (this
 | 
| 
 | 
   162             * means that values can be accessed both sides of the
 | 
| 
 | 
   163             * diagonal.
 | 
| 
 | 
   164             * 
 | 
| 
 | 
   165             */
 | 
| 
 | 
   166             unsigned int genotypeFrequencyTotal(unsigned int alleleIndex1, unsigned int alleleIndex2);
 | 
| 
 | 
   167 
 | 
| 
 | 
   168 
 | 
| 
 | 
   169            /** \brief Absolute genotype frequency in a population
 | 
| 
 | 
   170             * 
 | 
| 
 | 
   171             * Note that allele AB is considered different to BA (this
 | 
| 
 | 
   172             * means that values can be accessed both sides of the
 | 
| 
 | 
   173             * diagonal.
 | 
| 
 | 
   174             * 
 | 
| 
 | 
   175             */
 | 
| 
 | 
   176             unsigned int genotypeFrequencyPerPopulation(unsigned int populationIndex, unsigned int alleleIndex1, unsigned int alleleIndex2);
 | 
| 
 | 
   177 
 | 
| 
 | 
   178             
 | 
| 
 | 
   179            /** \brief Sample size of a population
 | 
| 
 | 
   180             * 
 | 
| 
 | 
   181             */
 | 
| 
 | 
   182             unsigned int populationFrequency(unsigned int populationIndex);
 | 
| 
 | 
   183 
 | 
| 
 | 
   184 
 | 
| 
 | 
   185            /** \brief Weir-Cockerham F-statistic
 | 
| 
 | 
   186             * 
 | 
| 
 | 
   187             * Note: equivalent to Fit.
 | 
| 
 | 
   188             * 
 | 
| 
 | 
   189             */
 | 
| 
 | 
   190             double F();
 | 
| 
 | 
   191 
 | 
| 
 | 
   192 
 | 
| 
 | 
   193            /** \brief Weir-Cockerham theta-statistic
 | 
| 
 | 
   194             * 
 | 
| 
 | 
   195             * Note: equivalent to Fst.
 | 
| 
 | 
   196             * 
 | 
| 
 | 
   197             */
 | 
| 
 | 
   198             double theta();
 | 
| 
 | 
   199 
 | 
| 
 | 
   200 
 | 
| 
 | 
   201            /** \brief Weir-Cockerham f-statistic
 | 
| 
 | 
   202             * 
 | 
| 
 | 
   203             * Note: equivalent to Fis.
 | 
| 
 | 
   204             * 
 | 
| 
 | 
   205             */
 | 
| 
 | 
   206             double f();
 | 
| 
 | 
   207             
 | 
| 
 | 
   208 
 | 
| 
 | 
   209            /** \brief Between-population component of variance
 | 
| 
 | 
   210             * 
 | 
| 
 | 
   211             */
 | 
| 
 | 
   212             double Vpopulation();
 | 
| 
 | 
   213 
 | 
| 
 | 
   214 
 | 
| 
 | 
   215            /** \brief Within-population, between-individual component of variance
 | 
| 
 | 
   216             * 
 | 
| 
 | 
   217             */
 | 
| 
 | 
   218             double Vindividual();
 | 
| 
 | 
   219             
 | 
| 
 | 
   220             
 | 
| 
 | 
   221            /** \brief Within-individual component of variance
 | 
| 
 | 
   222             * 
 | 
| 
 | 
   223             */
 | 
| 
 | 
   224             double Vallele();
 | 
| 
 | 
   225 
 | 
| 
 | 
   226 
 | 
| 
 | 
   227         protected:
 | 
| 
 | 
   228     
 | 
| 
 | 
   229             bool d_flag;
 | 
| 
 | 
   230             void d_init();
 | 
| 
 | 
   231             void d_clear();
 | 
| 
 | 
   232             unsigned int  d_reserved;
 | 
| 
 | 
   233             unsigned int  d_numberOfGenotypes;
 | 
| 
 | 
   234             unsigned int *d_genotypes;
 | 
| 
 | 
   235             unsigned int *d_populationLabels;
 | 
| 
 | 
   236 
 | 
| 
 | 
   237             bool s_flag;
 | 
| 
 | 
   238             void s_init();
 | 
| 
 | 
   239             void s_clear();
 | 
| 
 | 
   240             void s_compute();
 | 
| 
 | 
   241             void processPopulations();
 | 
| 
 | 
   242             void processAlleles();
 | 
| 
 | 
   243             unsigned int getPopulationIndex(unsigned int) const;
 | 
| 
 | 
   244             unsigned int getAlleleIndex(unsigned int) const;
 | 
| 
 | 
   245             unsigned int    s_numberOfAlleles;
 | 
| 
 | 
   246             unsigned int   *s_alleleValueMapping;
 | 
| 
 | 
   247             unsigned int    s_numberOfPopulations;
 | 
| 
 | 
   248             unsigned int   *s_populationLabelMapping;
 | 
| 
 | 
   249             unsigned int   *s_populationFrequencies;
 | 
| 
 | 
   250             unsigned int   *s_alleleFrequenciesTotal;
 | 
| 
 | 
   251             unsigned int  **s_alleleFrequenciesPerPopulation;
 | 
| 
 | 
   252             unsigned int  **s_genotypeFrequenciesTotal;
 | 
| 
 | 
   253             unsigned int ***s_genotypeFrequenciesPerPopulation;
 | 
| 
 | 
   254 
 | 
| 
 | 
   255             bool w_flag;
 | 
| 
 | 
   256             void w_init();
 | 
| 
 | 
   257             void w_clear();
 | 
| 
 | 
   258             void w_compute();
 | 
| 
 | 
   259             double  w_F;
 | 
| 
 | 
   260             double  w_T;
 | 
| 
 | 
   261             double  w_f;
 | 
| 
 | 
   262             double *w_a;
 | 
| 
 | 
   263             double *w_b;
 | 
| 
 | 
   264             double *w_c;
 | 
| 
 | 
   265             double  w_nbar;
 | 
| 
 | 
   266             double  w_nc;
 | 
| 
 | 
   267             double *w_pbar;
 | 
| 
 | 
   268             double *w_ssquare;
 | 
| 
 | 
   269             double *w_hbar;
 | 
| 
 | 
   270             double  w_sum_a;
 | 
| 
 | 
   271             double  w_sum_b;
 | 
| 
 | 
   272             double  w_sum_c;
 | 
| 
 | 
   273             double  w_sum_abc;
 | 
| 
 | 
   274             double  w_sum_bc;
 | 
| 
 | 
   275 
 | 
| 
 | 
   276     
 | 
| 
 | 
   277         private:
 | 
| 
 | 
   278             
 | 
| 
 | 
   279             FStatistics(const FStatistics& source) { }
 | 
| 
 | 
   280             
 | 
| 
 | 
   281             FStatistics& operator=(const FStatistics& source) {
 | 
| 
 | 
   282                 return *this;
 | 
| 
 | 
   283             }
 | 
| 
 | 
   284 
 | 
| 
 | 
   285     };
 | 
| 
 | 
   286 }
 | 
| 
 | 
   287 
 | 
| 
 | 
   288 #endif
 |