| 
1
 | 
     1 /*
 | 
| 
 | 
     2     Copyright 2008-2009 Stéphane De Mita, Mathieu Siol
 | 
| 
 | 
     3 
 | 
| 
 | 
     4     This file is part of the EggLib library.
 | 
| 
 | 
     5 
 | 
| 
 | 
     6     EggLib is free software: you can redistribute it and/or modify
 | 
| 
 | 
     7     it under the terms of the GNU General Public License as published by
 | 
| 
 | 
     8     the Free Software Foundation, either version 3 of the License, or
 | 
| 
 | 
     9     (at your option) any later version.
 | 
| 
 | 
    10 
 | 
| 
 | 
    11     EggLib is distributed in the hope that it will be useful,
 | 
| 
 | 
    12     but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
| 
 | 
    13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
| 
 | 
    14     GNU General Public License for more details.
 | 
| 
 | 
    15 
 | 
| 
 | 
    16     You should have received a copy of the GNU General Public License
 | 
| 
 | 
    17     along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
 | 
| 
 | 
    18 */
 | 
| 
 | 
    19 
 | 
| 
 | 
    20 
 | 
| 
 | 
    21 #ifndef EGGLIB_NUCLEOTIDEDIVERSITY_HPP
 | 
| 
 | 
    22 #define EGGLIB_NUCLEOTIDEDIVERSITY_HPP
 | 
| 
 | 
    23 
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 #include "BaseDiversity.hpp"
 | 
| 
 | 
    26 #include <string>
 | 
| 
 | 
    27 #include <vector>
 | 
| 
 | 
    28 
 | 
| 
 | 
    29 
 | 
| 
 | 
    30 
 | 
| 
 | 
    31 namespace egglib {
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 
 | 
| 
 | 
    34    /** \brief Performs analyzes of population genetics
 | 
| 
 | 
    35     *
 | 
| 
 | 
    36     * \ingroup polymorphism
 | 
| 
 | 
    37     * 
 | 
| 
 | 
    38     * This class computes several summary statistics based on
 | 
| 
 | 
    39     * nucleotide analysis. Note that it is possible to use the same
 | 
| 
 | 
    40     * object to analyze different data set. Calling the load() method
 | 
| 
 | 
    41     * erases all data preivously computed (if any). Calling the load()
 | 
| 
 | 
    42     * method is absolutely required to compute any statistics. Some
 | 
| 
 | 
    43     * statistics are not computed by default, but are if the
 | 
| 
 | 
    44     * corresponding accessor is used (only load() is required).
 | 
| 
 | 
    45     * 
 | 
| 
 | 
    46     * Note that "unsecure" accessors don't perform out-of-bound checks.
 | 
| 
 | 
    47     * 
 | 
| 
 | 
    48     * S is the number of varying sites (only in sites that were not
 | 
| 
 | 
    49     * rejected).
 | 
| 
 | 
    50     * 
 | 
| 
 | 
    51     * eta is the minimum number of mutations, that is the sum of the
 | 
| 
 | 
    52     * number of alleles minus 1 for each varying site. eta = S if all
 | 
| 
 | 
    53     * sites have no variant or 2 alleles. eta is computed independently
 | 
| 
 | 
    54     * of the option multiple and IS NOT computed over lseff sites.
 | 
| 
 | 
    55     *
 | 
| 
 | 
    56     * Pi is the average number of pairwise differences between sequences
 | 
| 
 | 
    57     * (expressed here per site) or (as computed here) the mean per site
 | 
| 
 | 
    58     * (unbiased) heterozygosity. Pi is zero if no polymorphic sites.
 | 
| 
 | 
    59     *
 | 
| 
 | 
    60     * D is the Tajima's test of neutrality
 | 
| 
 | 
    61     * Ref. Tajima F.: Statistical method for testing the neutral
 | 
| 
 | 
    62     * mutation hypothesis by DNA polymorphism. Genetics 1989, 123:585-595.
 | 
| 
 | 
    63     * It is arbitrary set to 0 if no polymorphic sites.
 | 
| 
 | 
    64     *
 | 
| 
 | 
    65     * tW: thetaW: estimator of theta based on polymorphic sites (ref.
 | 
| 
 | 
    66     * e.g. Watterson 1975 Theor. Pop. Biol.).
 | 
| 
 | 
    67     * Both D and thetaW are computed assuming that rounded nseff samples
 | 
| 
 | 
    68     * have been sampled.
 | 
| 
 | 
    69     * The variance of D is computed using rounded nseff instead of ns.
 | 
| 
 | 
    70     *
 | 
| 
 | 
    71     * H is the Fay and Wu's test of neutrality.
 | 
| 
 | 
    72     * Z is the standardized version and E a similar test.
 | 
| 
 | 
    73     * Ref. Fay J. C., Wu C.-I.: Hitchhiking under positive Darwinian
 | 
| 
 | 
    74     * selection. Genetics 2000, 155:1405-1413. and Zeng K., Fu Y. X.,
 | 
| 
 | 
    75     * Shi S., Wu C.-I.: Statistical tests for detecting positive
 | 
| 
 | 
    76     * selection by utilizing high-frequency variants. Genetics 2006,
 | 
| 
 | 
    77     * 174:1431-9. Both are arbitrary set to 0 if no polymorphic or
 | 
| 
 | 
    78     * orientable sites.
 | 
| 
 | 
    79     *
 | 
| 
 | 
    80     * tH and tL: theta H: estimators of theta based on derived
 | 
| 
 | 
    81     * polymorphic sites (ref in Fay and Wu and Zeng al.). The variance
 | 
| 
 | 
    82     * of H/Z are computed assuming that rounded nseff samples have
 | 
| 
 | 
    83     * been sampled.
 | 
| 
 | 
    84     * 
 | 
| 
 | 
    85     */
 | 
| 
 | 
    86     class NucleotideDiversity : public BaseDiversity {
 | 
| 
 | 
    87 
 | 
| 
 | 
    88         public:
 | 
| 
 | 
    89 
 | 
| 
 | 
    90            /** \brief Builds an object
 | 
| 
 | 
    91             * 
 | 
| 
 | 
    92             */
 | 
| 
 | 
    93             NucleotideDiversity();
 | 
| 
 | 
    94 
 | 
| 
 | 
    95 
 | 
| 
 | 
    96            /** \brief Destroys an object
 | 
| 
 | 
    97             * 
 | 
| 
 | 
    98             */
 | 
| 
 | 
    99             virtual ~NucleotideDiversity();
 | 
| 
 | 
   100 
 | 
| 
 | 
   101 
 | 
| 
 | 
   102            /** \brief Identifies polymorphic sites and computes basis
 | 
| 
 | 
   103             * statistics
 | 
| 
 | 
   104             * 
 | 
| 
 | 
   105             * \param data an alignment object (subclass of CharMatrix).
 | 
| 
 | 
   106             * The presence of outgroup or of different populations will
 | 
| 
 | 
   107             * be detected based on the populationLabel members of the
 | 
| 
 | 
   108             * passed object. The populationLabel 999 will be interpreted
 | 
| 
 | 
   109             * as outgroups. If several outgroups are passed, sites were
 | 
| 
 | 
   110             * the outgroups are not consistent will be treated as "non-
 | 
| 
 | 
   111             * orientable".
 | 
| 
 | 
   112             * 
 | 
| 
 | 
   113             * \param allowMultipleMutations if true, sites with more
 | 
| 
 | 
   114             * than two alleles will not be ignored. The sum of the
 | 
| 
 | 
   115             * frequencies of all alleles not matching the outgroup will
 | 
| 
 | 
   116             * treated as the derived allele frequency (for orientable
 | 
| 
 | 
   117             * sites).
 | 
| 
 | 
   118             * 
 | 
| 
 | 
   119             * \param minimumExploitableData sites where the non-missing
 | 
| 
 | 
   120             * data (as defined by characterMapping) are at a frequency
 | 
| 
 | 
   121             * larger than this value will be removed from the analysis.
 | 
| 
 | 
   122             * Use 1. to take only 'complete' sites into account and 0.
 | 
| 
 | 
   123             * to use all sites. (The outgroup is not considered in this
 | 
| 
 | 
   124             * computation.)
 | 
| 
 | 
   125             * 
 | 
| 
 | 
   126             * \param ignoreFrequency removes sites that are polymorph
 | 
| 
 | 
   127             * because of an allele at absolute frequency smaller than or
 | 
| 
 | 
   128             * equal to this value. If ignoreFrequency=1, no sites are
 | 
| 
 | 
   129             * removed, if ignoreFrequency=0, singleton sites are
 | 
| 
 | 
   130             * ignored. Such sites are completely removed from the
 | 
| 
 | 
   131             * analysis (not counted in lseff). Note that if more than
 | 
| 
 | 
   132             * one mutation is allowed, the site is removed only if all
 | 
| 
 | 
   133             * the alleles but one are smaller than or equal to this
 | 
| 
 | 
   134             * value. For example, an alignment column AAAAAAGAAT is
 | 
| 
 | 
   135             * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is
 | 
| 
 | 
   136             * conserved (including the third allele T which is a
 | 
| 
 | 
   137             * singleton).
 | 
| 
 | 
   138             * 
 | 
| 
 | 
   139             * \param characterMapping a string giving the list of
 | 
| 
 | 
   140             * characters that should be considered as valid data. If a
 | 
| 
 | 
   141             * space is present in the string, the characters left of the
 | 
| 
 | 
   142             * space will be treated as valid data and the characters
 | 
| 
 | 
   143             * right of the space will be treated as missing data, that
 | 
| 
 | 
   144             * is tolerated but ignored. All characters not in the string
 | 
| 
 | 
   145             * will cause an EggInvalidCharacterError to be raised.
 | 
| 
 | 
   146             * 
 | 
| 
 | 
   147             * \param useZeroAsAncestral if true, all outgroups (if
 | 
| 
 | 
   148             * present) will be ignored and the character "0" will be
 | 
| 
 | 
   149             * considered as ancestral for all sites, whatever the
 | 
| 
 | 
   150             * character mapping.
 | 
| 
 | 
   151             * 
 | 
| 
 | 
   152             */
 | 
| 
 | 
   153             virtual void load(
 | 
| 
 | 
   154                 CharMatrix& data,
 | 
| 
 | 
   155                 bool allowMultipleMutations=false,
 | 
| 
 | 
   156                 double minimumExploitableData=1.,
 | 
| 
 | 
   157                 unsigned int ignoreFrequency=0,
 | 
| 
 | 
   158                 std::string characterMapping=dnaMapping,
 | 
| 
 | 
   159                 bool useZeroAsAncestral=false
 | 
| 
 | 
   160             );
 | 
| 
 | 
   161 
 | 
| 
 | 
   162 
 | 
| 
 | 
   163         // accessors for the "site analysis" section
 | 
| 
 | 
   164 
 | 
| 
 | 
   165             /// Number of polymorphic sites
 | 
| 
 | 
   166             unsigned int S() const;
 | 
| 
 | 
   167             
 | 
| 
 | 
   168             /// Number of polymorphic orientable sites
 | 
| 
 | 
   169             unsigned int So() const;
 | 
| 
 | 
   170             
 | 
| 
 | 
   171             /// Minimum number of mutations
 | 
| 
 | 
   172             unsigned int eta() const;
 | 
| 
 | 
   173             
 | 
| 
 | 
   174             /// Average of per-site number of sequences effectively used
 | 
| 
 | 
   175             double nseff() const;
 | 
| 
 | 
   176             
 | 
| 
 | 
   177             /// Number of sites effectively used
 | 
| 
 | 
   178             unsigned int lseff() const;
 | 
| 
 | 
   179             
 | 
| 
 | 
   180             /// Average of number of sequences effectively used at orientable sites
 | 
| 
 | 
   181             double nseffo() const;
 | 
| 
 | 
   182             
 | 
| 
 | 
   183             /// Number of orientable sites
 | 
| 
 | 
   184             unsigned int lseffo() const;
 | 
| 
 | 
   185             
 | 
| 
 | 
   186             /// Number of detected populations
 | 
| 
 | 
   187             unsigned int  npop() const;
 | 
| 
 | 
   188             
 | 
| 
 | 
   189             /// Label of the population with given index (unsecure)
 | 
| 
 | 
   190             unsigned int popLabel(unsigned int popIndex) const; // no check!
 | 
| 
 | 
   191 
 | 
| 
 | 
   192 
 | 
| 
 | 
   193         // accessors for the "diversity" section
 | 
| 
 | 
   194 
 | 
| 
 | 
   195             /// Nucleotide diversity
 | 
| 
 | 
   196             double Pi();
 | 
| 
 | 
   197             
 | 
| 
 | 
   198             /// Watterson estimator of theta
 | 
| 
 | 
   199             double thetaW();
 | 
| 
 | 
   200             
 | 
| 
 | 
   201             /// Average of Pi over populations
 | 
| 
 | 
   202             double average_Pi();
 | 
| 
 | 
   203             
 | 
| 
 | 
   204             /// Pi of a given population (unsecure)
 | 
| 
 | 
   205             double pop_Pi(unsigned int popIndex);  // no check!
 | 
| 
 | 
   206             
 | 
| 
 | 
   207             /// Tajima's D
 | 
| 
 | 
   208             double D();
 | 
| 
 | 
   209 
 | 
| 
 | 
   210         // accessors for the "outgroup diversity" section
 | 
| 
 | 
   211         
 | 
| 
 | 
   212             /// Fay and Wu estimator of theta
 | 
| 
 | 
   213             double thetaH();
 | 
| 
 | 
   214             
 | 
| 
 | 
   215             /// Zeng et al. estimator of theta
 | 
| 
 | 
   216             double thetaL();
 | 
| 
 | 
   217             
 | 
| 
 | 
   218             /// Fay and Wu's H
 | 
| 
 | 
   219             double H();
 | 
| 
 | 
   220             
 | 
| 
 | 
   221             /// Standardized H
 | 
| 
 | 
   222             double Z();
 | 
| 
 | 
   223             
 | 
| 
 | 
   224             /// Zeng et al.'s E
 | 
| 
 | 
   225             double E();
 | 
| 
 | 
   226 
 | 
| 
 | 
   227          // accessors for the "differentiation" section
 | 
| 
 | 
   228          
 | 
| 
 | 
   229             /// Number of sites with at least one fixed difference
 | 
| 
 | 
   230             unsigned int FixedDifferences();
 | 
| 
 | 
   231             
 | 
| 
 | 
   232             /// Number of sites with at least one allele shared among at least two populations
 | 
| 
 | 
   233             unsigned int CommonAlleles();
 | 
| 
 | 
   234             
 | 
| 
 | 
   235             /// Number of sites with at least one non-fixed allele shared among at least two populations
 | 
| 
 | 
   236             unsigned int SharedAlleles();
 | 
| 
 | 
   237             
 | 
| 
 | 
   238             /// Number of sites with at least one allele specific to one population
 | 
| 
 | 
   239             unsigned int SpecificAlleles();
 | 
| 
 | 
   240             
 | 
| 
 | 
   241             /// Number of sites with at least one derived allele specific to one population
 | 
| 
 | 
   242             unsigned int SpecificDerivedAlleles();
 | 
| 
 | 
   243             
 | 
| 
 | 
   244             /// Number of polymorphisms in a given population (unsecure)
 | 
| 
 | 
   245             unsigned int Polymorphisms(unsigned int pop);
 | 
| 
 | 
   246             
 | 
| 
 | 
   247             /// Number of specific alleles for a given population (unsecure)
 | 
| 
 | 
   248             unsigned int SpecificAlleles(unsigned int pop);
 | 
| 
 | 
   249             
 | 
| 
 | 
   250             /// Number of specific derived allele for a given population (unsecure)
 | 
| 
 | 
   251             unsigned int SpecificDerivedAlleles(unsigned int pop);
 | 
| 
 | 
   252             
 | 
| 
 | 
   253             /// Number of fixed differences between a given pair of populations (unsecure; pop2 must be larger than pop1)
 | 
| 
 | 
   254             unsigned int FixedDifferences(unsigned int pop1, unsigned int pop2);
 | 
| 
 | 
   255 
 | 
| 
 | 
   256             /// Number of common alleles between a given pair of populations (unsecure; pop2 must be larger than pop1)
 | 
| 
 | 
   257             unsigned int CommonAlleles(unsigned int pop1, unsigned int pop2);
 | 
| 
 | 
   258 
 | 
| 
 | 
   259             /// Number of shared non-fixed alleles between a given pair of populations (unsecure; pop2 must be larger than pop1)
 | 
| 
 | 
   260             unsigned int SharedAlleles(unsigned int pop1, unsigned int pop2);
 | 
| 
 | 
   261 
 | 
| 
 | 
   262 
 | 
| 
 | 
   263         // accessor for the "triConfigurations" section
 | 
| 
 | 
   264 
 | 
| 
 | 
   265            /** \brief Number falling into one of the possible site configurations
 | 
| 
 | 
   266             *
 | 
| 
 | 
   267             * The statistics are limited to three populations.
 | 
| 
 | 
   268             * Assuming an unrooted A/G polymorphism (A and G can be
 | 
| 
 | 
   269             * substitued), the site configurations are:
 | 
| 
 | 
   270             *     -  0: A&G  A   A  specific 1
 | 
| 
 | 
   271             *     -  1: A&G  A   G  specific 1 + fixed 2-3
 | 
| 
 | 
   272             *     -  2:  A  A&G  A  specific 2
 | 
| 
 | 
   273             *     -  3:  A  A&G  G  specific 2 + fixed 1-3
 | 
| 
 | 
   274             *     -  4:  A   A  A&G specific 3
 | 
| 
 | 
   275             *     -  5:  A   G  A&G specific 3 + fixed 1-2
 | 
| 
 | 
   276             *     -  6: A&G A&G  A  shared 1-2
 | 
| 
 | 
   277             *     -  7: A&G  A  A&G shared 1-3
 | 
| 
 | 
   278             *     -  8:  A  A&G A&G shared 2-3
 | 
| 
 | 
   279             *     -  9: A&G A&G A&G shared 1-2-3
 | 
| 
 | 
   280             *     - 10:  A   G   G  fixed 1
 | 
| 
 | 
   281             *     - 11:  A   G   A  fixed 2
 | 
| 
 | 
   282             *     - 12:  A   A   G  fixed 3
 | 
| 
 | 
   283             *
 | 
| 
 | 
   284             * \param index must be an index from 0 to 12.
 | 
| 
 | 
   285             * 
 | 
| 
 | 
   286             */
 | 
| 
 | 
   287             unsigned int triConfiguration(unsigned int index);
 | 
| 
 | 
   288 
 | 
| 
 | 
   289 
 | 
| 
 | 
   290         /// Builds and returns the vector of positions of all polymorphic sites
 | 
| 
 | 
   291         std::vector<unsigned int> polymorphic_positions() const;
 | 
| 
 | 
   292 
 | 
| 
 | 
   293 
 | 
| 
 | 
   294         /** \brief Builds and returns the vector of positions of all singleton sites
 | 
| 
 | 
   295          * 
 | 
| 
 | 
   296          * A site singleton when it is polymorphic according to
 | 
| 
 | 
   297          * parameter of the diversity analysis, when it has exactly two
 | 
| 
 | 
   298          * alleles and one of them is at absolute frequency 1 (one
 | 
| 
 | 
   299          * copy) disregarding the outgroup.
 | 
| 
 | 
   300          * 
 | 
| 
 | 
   301          */
 | 
| 
 | 
   302         std::vector<unsigned int> singleton_positions() const;
 | 
| 
 | 
   303 
 | 
| 
 | 
   304 
 | 
| 
 | 
   305         protected:
 | 
| 
 | 
   306 
 | 
| 
 | 
   307            /** \brief This class cannot be copied
 | 
| 
 | 
   308             * 
 | 
| 
 | 
   309             */
 | 
| 
 | 
   310             NucleotideDiversity(const NucleotideDiversity& source) { }
 | 
| 
 | 
   311 
 | 
| 
 | 
   312 
 | 
| 
 | 
   313            /** \brief This class cannot be copied
 | 
| 
 | 
   314             * 
 | 
| 
 | 
   315             */
 | 
| 
 | 
   316             NucleotideDiversity& operator=(const NucleotideDiversity& source) { return *this; }
 | 
| 
 | 
   317 
 | 
| 
 | 
   318 
 | 
| 
 | 
   319             void init();  // initializes values
 | 
| 
 | 
   320             void clear();  // free memory but doesn't initializes
 | 
| 
 | 
   321             
 | 
| 
 | 
   322             // diversity (without outgroup)
 | 
| 
 | 
   323             void diversity();
 | 
| 
 | 
   324             
 | 
| 
 | 
   325             // diversity with outgroup
 | 
| 
 | 
   326             void outgroupDiversity();
 | 
| 
 | 
   327             
 | 
| 
 | 
   328             // site patterns
 | 
| 
 | 
   329             void differentiation();
 | 
| 
 | 
   330             
 | 
| 
 | 
   331             // triconfigurations
 | 
| 
 | 
   332             void triConfigurations();
 | 
| 
 | 
   333             
 | 
| 
 | 
   334 
 | 
| 
 | 
   335             // holders for statistics, with booleans flagging groups of stats
 | 
| 
 | 
   336             
 | 
| 
 | 
   337             bool b_analysisSites;
 | 
| 
 | 
   338             
 | 
| 
 | 
   339             bool b_diversity;
 | 
| 
 | 
   340             
 | 
| 
 | 
   341             double  v_Pi;             // nucleotide diversity
 | 
| 
 | 
   342             double  v_thetaW;         // theta (Watterson estimator)
 | 
| 
 | 
   343             double  v_average_Pi;     // average diversity across populations
 | 
| 
 | 
   344             double *v_pop_Pi;         // diversity per population
 | 
| 
 | 
   345             double  v_D;              // Tajima's D
 | 
| 
 | 
   346             
 | 
| 
 | 
   347             bool b_outgroupDiversity;
 | 
| 
 | 
   348             
 | 
| 
 | 
   349             double v_thetaH;        // theta (Fay and Wu estimator)
 | 
| 
 | 
   350             double v_thetaL;        // theta (Zeng estimator)
 | 
| 
 | 
   351             double v_H;             // Fay and Wu's H
 | 
| 
 | 
   352             double v_Z;             // normalized Fay and Wu's H
 | 
| 
 | 
   353             double v_E;             // Zeng et al.'s E
 | 
| 
 | 
   354             
 | 
| 
 | 
   355             bool b_differentiation;
 | 
| 
 | 
   356             
 | 
| 
 | 
   357             unsigned int  *v_pairwiseFixedDifferences;
 | 
| 
 | 
   358             unsigned int  *v_pairwiseCommonAlleles;
 | 
| 
 | 
   359             unsigned int  *v_pairwiseSharedAlleles;
 | 
| 
 | 
   360             unsigned int  *v_popPolymorphic;
 | 
| 
 | 
   361             unsigned int  *v_popSpecific;
 | 
| 
 | 
   362             unsigned int  *v_popSpecificDerived;
 | 
| 
 | 
   363             unsigned int   v_countFixedDifferences;
 | 
| 
 | 
   364             unsigned int   v_countCommonAlleles;
 | 
| 
 | 
   365             unsigned int   v_countSharedAlleles;
 | 
| 
 | 
   366             unsigned int   v_countSpecificAlleles;
 | 
| 
 | 
   367             unsigned int   v_countSpecificDerivedAlleles;
 | 
| 
 | 
   368         
 | 
| 
 | 
   369             
 | 
| 
 | 
   370             bool b_triConfigurations;
 | 
| 
 | 
   371             
 | 
| 
 | 
   372             unsigned int  *v_triConfigurations;
 | 
| 
 | 
   373 
 | 
| 
 | 
   374     };
 | 
| 
 | 
   375 }
 | 
| 
 | 
   376 
 | 
| 
 | 
   377 #endif
 |