| 
1
 | 
     1 /*
 | 
| 
 | 
     2     Copyright 2010 Stéphane De Mita, Mathieu Siol
 | 
| 
 | 
     3 
 | 
| 
 | 
     4     This file is part of the EggLib library.
 | 
| 
 | 
     5 
 | 
| 
 | 
     6     EggLib is free software: you can redistribute it and/or modify
 | 
| 
 | 
     7     it under the terms of the GNU General Public License as published by
 | 
| 
 | 
     8     the Free Software Foundation, either version 3 of the License, or
 | 
| 
 | 
     9     (at your option) any later version.
 | 
| 
 | 
    10 
 | 
| 
 | 
    11     EggLib is distributed in the hope that it will be useful,
 | 
| 
 | 
    12     but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
| 
 | 
    13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
| 
 | 
    14     GNU General Public License for more details.
 | 
| 
 | 
    15 
 | 
| 
 | 
    16     You should have received a copy of the GNU General Public License
 | 
| 
 | 
    17     along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
 | 
| 
 | 
    18 */
 | 
| 
 | 
    19 
 | 
| 
 | 
    20 #ifndef EGGLIB_HFSTATISTICS_HPP
 | 
| 
 | 
    21 #define EGGLIB_HFSTATISTICS_HPP
 | 
| 
 | 
    22 
 | 
| 
 | 
    23 
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 namespace egglib {
 | 
| 
 | 
    26 
 | 
| 
 | 
    27 
 | 
| 
 | 
    28     /** \brief Computes Fst and Fit from haploid data
 | 
| 
 | 
    29     *
 | 
| 
 | 
    30     * The class requires loading data. Data are loaded by haploid
 | 
| 
 | 
    31     * (one genotype per individual). The analyses are cached: they are
 | 
| 
 | 
    32     * performed upon the first call to statistics accessors. The cache
 | 
| 
 | 
    33     * is emptied whenever a datum is loaded.
 | 
| 
 | 
    34     * 
 | 
| 
 | 
    35     * The computations are performed after Weir and Cockerham. The
 | 
| 
 | 
    36     * statistic theta is generalized for multiple alleles. To allow
 | 
| 
 | 
    37     * computation of multi-locus statistics, variance components are
 | 
| 
 | 
    38     * also available. The two components of the variance are T1 and T2
 | 
| 
 | 
    39     * and theta is T1/T2 (from Weir 1996 "Genetic Data Analysis II",
 | 
| 
 | 
    40     * Sinauer associates, Sunderland MA).
 | 
| 
 | 
    41     * 
 | 
| 
 | 
    42     * \ingroup polymorphism
 | 
| 
 | 
    43     *
 | 
| 
 | 
    44     */
 | 
| 
 | 
    45     class HFStatistics {
 | 
| 
 | 
    46     
 | 
| 
 | 
    47         public:
 | 
| 
 | 
    48     
 | 
| 
 | 
    49            /** \brief Constructor
 | 
| 
 | 
    50             * 
 | 
| 
 | 
    51             */ 
 | 
| 
 | 
    52             HFStatistics();
 | 
| 
 | 
    53 
 | 
| 
 | 
    54             
 | 
| 
 | 
    55            /** \brief Destructor
 | 
| 
 | 
    56             * 
 | 
| 
 | 
    57             */ 
 | 
| 
 | 
    58             virtual ~HFStatistics();
 | 
| 
 | 
    59 
 | 
| 
 | 
    60             
 | 
| 
 | 
    61            /** \brief Reserve sufficient memory for a given number of
 | 
| 
 | 
    62             * individuals.
 | 
| 
 | 
    63             * 
 | 
| 
 | 
    64             * This method makes the load function faster by allocating
 | 
| 
 | 
    65             * all required memory at once.
 | 
| 
 | 
    66             * 
 | 
| 
 | 
    67             * \param numberOfIndividuals a strictly positive integer.
 | 
| 
 | 
    68             * 
 | 
| 
 | 
    69             */
 | 
| 
 | 
    70             void reserve(unsigned int numberOfIndividuals);
 | 
| 
 | 
    71 
 | 
| 
 | 
    72 
 | 
| 
 | 
    73            /** \brief Loads the data for one individual
 | 
| 
 | 
    74             * 
 | 
| 
 | 
    75             * \param genotype an integer giving the allele.
 | 
| 
 | 
    76             * \param populationLabel an integer indication belonging to
 | 
| 
 | 
    77             * a population.
 | 
| 
 | 
    78             * 
 | 
| 
 | 
    79             * Genotypes and population labels are not required to be
 | 
| 
 | 
    80             * consecutive (both are labels, not indices). They are
 | 
| 
 | 
    81             * internally mapped to indices (the mapping can be obtained
 | 
| 
 | 
    82             * by accessors populationLabel and allele).
 | 
| 
 | 
    83             * 
 | 
| 
 | 
    84             * All genotypes are considered to be valid (no missing data).
 | 
| 
 | 
    85             * If statistics were computed previous to call to this
 | 
| 
 | 
    86             * function, all data will be erased.
 | 
| 
 | 
    87             * 
 | 
| 
 | 
    88             */
 | 
| 
 | 
    89             void loadIndividual(unsigned int genotype, unsigned int populationLabel);
 | 
| 
 | 
    90 
 | 
| 
 | 
    91 
 | 
| 
 | 
    92            /** \brief Label of a population
 | 
| 
 | 
    93             * 
 | 
| 
 | 
    94             * The index corresponds to the local mapping of populations
 | 
| 
 | 
    95             * regardless of the ranking of population labels. (No out
 | 
| 
 | 
    96             * of bound checking.)
 | 
| 
 | 
    97             * 
 | 
| 
 | 
    98             */
 | 
| 
 | 
    99             unsigned int populationLabel(unsigned int populationIndex);
 | 
| 
 | 
   100 
 | 
| 
 | 
   101 
 | 
| 
 | 
   102            /** \brief Value of an allele
 | 
| 
 | 
   103             * 
 | 
| 
 | 
   104             * The index corresponds to the local mapping of alleles
 | 
| 
 | 
   105             * regardless of the ranking of allele values. (No out of
 | 
| 
 | 
   106             * bound checking.)
 | 
| 
 | 
   107             * 
 | 
| 
 | 
   108             */
 | 
| 
 | 
   109             unsigned int alleleValue(unsigned int alleleIndex);
 | 
| 
 | 
   110 
 | 
| 
 | 
   111 
 | 
| 
 | 
   112             /// Allele of a given individual (no checking)
 | 
| 
 | 
   113             unsigned int allele(unsigned int individualIndex) const;
 | 
| 
 | 
   114 
 | 
| 
 | 
   115             /// Population label of a given individual (no checking)
 | 
| 
 | 
   116             unsigned int individualLabel(unsigned int individualIndex) const;
 | 
| 
 | 
   117 
 | 
| 
 | 
   118 
 | 
| 
 | 
   119            /** \brief Number of alleles
 | 
| 
 | 
   120             * 
 | 
| 
 | 
   121             */
 | 
| 
 | 
   122             unsigned int numberOfAlleles();
 | 
| 
 | 
   123 
 | 
| 
 | 
   124 
 | 
| 
 | 
   125            /** \brief Number of populations
 | 
| 
 | 
   126             * 
 | 
| 
 | 
   127             */
 | 
| 
 | 
   128             unsigned int numberOfPopulations();
 | 
| 
 | 
   129 
 | 
| 
 | 
   130 
 | 
| 
 | 
   131            /** \brief Number of loaded genotypes
 | 
| 
 | 
   132             * 
 | 
| 
 | 
   133             */
 | 
| 
 | 
   134             unsigned int numberOfGenotypes() const;
 | 
| 
 | 
   135 
 | 
| 
 | 
   136 
 | 
| 
 | 
   137            /** \brief Absolute total allele frequency
 | 
| 
 | 
   138             * 
 | 
| 
 | 
   139             */
 | 
| 
 | 
   140             unsigned int alleleFrequencyTotal(unsigned int alleleIndex);
 | 
| 
 | 
   141             
 | 
| 
 | 
   142 
 | 
| 
 | 
   143            /** \brief Absolute allele frequency in a population
 | 
| 
 | 
   144             * 
 | 
| 
 | 
   145             */
 | 
| 
 | 
   146             unsigned int alleleFrequencyPerPopulation(unsigned int populationIndex, unsigned int alleleIndex);
 | 
| 
 | 
   147 
 | 
| 
 | 
   148 
 | 
| 
 | 
   149            /** \brief Sample size of a population
 | 
| 
 | 
   150             * 
 | 
| 
 | 
   151             */
 | 
| 
 | 
   152             unsigned int populationFrequency(unsigned int populationIndex);
 | 
| 
 | 
   153 
 | 
| 
 | 
   154 
 | 
| 
 | 
   155            /** \brief Weir-Cockerham theta-statistic
 | 
| 
 | 
   156             * 
 | 
| 
 | 
   157             * Note: equivalent to Fst.
 | 
| 
 | 
   158             * 
 | 
| 
 | 
   159             */
 | 
| 
 | 
   160             double theta();
 | 
| 
 | 
   161 
 | 
| 
 | 
   162 
 | 
| 
 | 
   163            /** \brief Between-population component of variance
 | 
| 
 | 
   164             * 
 | 
| 
 | 
   165             */
 | 
| 
 | 
   166             double T1();
 | 
| 
 | 
   167 
 | 
| 
 | 
   168 
 | 
| 
 | 
   169            /** \brief Total variance
 | 
| 
 | 
   170             * 
 | 
| 
 | 
   171             */
 | 
| 
 | 
   172             double T2();
 | 
| 
 | 
   173             
 | 
| 
 | 
   174 
 | 
| 
 | 
   175         protected:
 | 
| 
 | 
   176     
 | 
| 
 | 
   177             bool d_flag;
 | 
| 
 | 
   178             void d_init();
 | 
| 
 | 
   179             void d_clear();
 | 
| 
 | 
   180             unsigned int  d_reserved;
 | 
| 
 | 
   181             unsigned int  d_numberOfGenotypes;
 | 
| 
 | 
   182             unsigned int *d_genotypes;
 | 
| 
 | 
   183             unsigned int *d_populationLabels;
 | 
| 
 | 
   184 
 | 
| 
 | 
   185             bool s_flag;
 | 
| 
 | 
   186             void s_init();
 | 
| 
 | 
   187             void s_clear();
 | 
| 
 | 
   188             void s_compute();
 | 
| 
 | 
   189             void processPopulations();
 | 
| 
 | 
   190             void processAlleles();
 | 
| 
 | 
   191             unsigned int getPopulationIndex(unsigned int) const;
 | 
| 
 | 
   192             unsigned int getAlleleIndex(unsigned int) const;
 | 
| 
 | 
   193             unsigned int    s_numberOfAlleles;
 | 
| 
 | 
   194             unsigned int   *s_alleleValueMapping;
 | 
| 
 | 
   195             unsigned int    s_numberOfPopulations;
 | 
| 
 | 
   196             unsigned int   *s_populationLabelMapping;
 | 
| 
 | 
   197             unsigned int   *s_populationFrequencies;
 | 
| 
 | 
   198             unsigned int   *s_alleleFrequenciesTotal;
 | 
| 
 | 
   199             unsigned int  **s_alleleFrequenciesPerPopulation;
 | 
| 
 | 
   200 
 | 
| 
 | 
   201             bool w_flag;
 | 
| 
 | 
   202             void w_init();
 | 
| 
 | 
   203             void w_clear();
 | 
| 
 | 
   204             void w_compute();
 | 
| 
 | 
   205             double  w_T;
 | 
| 
 | 
   206             double *w_T1;
 | 
| 
 | 
   207             double *w_T2;
 | 
| 
 | 
   208             double  w_nbar;
 | 
| 
 | 
   209             double  w_nc;
 | 
| 
 | 
   210             double *w_pbar;
 | 
| 
 | 
   211             double *w_ssquare;
 | 
| 
 | 
   212             double  w_sum_T1;
 | 
| 
 | 
   213             double  w_sum_T2;
 | 
| 
 | 
   214 
 | 
| 
 | 
   215     
 | 
| 
 | 
   216         private:
 | 
| 
 | 
   217             
 | 
| 
 | 
   218             HFStatistics(const HFStatistics& source) { }
 | 
| 
 | 
   219             
 | 
| 
 | 
   220             HFStatistics& operator=(const HFStatistics& source) {
 | 
| 
 | 
   221                 return *this;
 | 
| 
 | 
   222             }
 | 
| 
 | 
   223 
 | 
| 
 | 
   224     };
 | 
| 
 | 
   225 }
 | 
| 
 | 
   226 
 | 
| 
 | 
   227 #endif
 |