| 1 | 1 /* | 
|  | 2     Copyright 2009 Stéphane De Mita, Mathieu Siol | 
|  | 3 | 
|  | 4     This file is part of the EggLib library. | 
|  | 5 | 
|  | 6     EggLib is free software: you can redistribute it and/or modify | 
|  | 7     it under the terms of the GNU General Public License as published by | 
|  | 8     the Free Software Foundation, either version 3 of the License, or | 
|  | 9     (at your option) any later version. | 
|  | 10 | 
|  | 11     EggLib is distributed in the hope that it will be useful, | 
|  | 12     but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|  | 14     GNU General Public License for more details. | 
|  | 15 | 
|  | 16     You should have received a copy of the GNU General Public License | 
|  | 17     along with EggLib.  If not, see <http://www.gnu.org/licenses/>. | 
|  | 18 */ | 
|  | 19 | 
|  | 20 #ifndef EGGLIB_LINKAGEDISEQUILIBRUM_HPP | 
|  | 21 #define EGGLIB_LINKAGEDISEQUILIBRUM_HPP | 
|  | 22 | 
|  | 23 | 
|  | 24 #include "BaseDiversity.hpp" | 
|  | 25 #include "EggException.hpp" | 
|  | 26 | 
|  | 27 | 
|  | 28 namespace egglib { | 
|  | 29 | 
|  | 30    /** \brief Analyzes linkage disequilibrium per pair of polymorphic sites | 
|  | 31     * | 
|  | 32     * \ingroup polymorphism | 
|  | 33     * | 
|  | 34     * The class considers an alignment and detects polymorphic sites | 
|  | 35     * using the BaseDiversity functionality (shared with other classes | 
|  | 36     * of the module). Only sites with exactly two alleles are | 
|  | 37     * considered. Statistics of pairwise linkage disequilibrium can | 
|  | 38     * be accessed by pair index (note that out-of-range errors are not | 
|  | 39     * checked). Population labels are ignored (but outgroups are | 
|  | 40     * excluded from the analysis). | 
|  | 41     * | 
|  | 42     */ | 
|  | 43     class LinkageDisequilibrium : public BaseDiversity { | 
|  | 44 | 
|  | 45       public: | 
|  | 46 | 
|  | 47         /// Default constructor | 
|  | 48         LinkageDisequilibrium(); | 
|  | 49 | 
|  | 50         /// Destructor | 
|  | 51         virtual ~LinkageDisequilibrium(); | 
|  | 52 | 
|  | 53        /** \brief Analyzes polymorphic sites of an alignment | 
|  | 54         * | 
|  | 55         * \param data an alignment object (subclass of CharMatrix). | 
|  | 56         * The presence of outgroup or of different populations will | 
|  | 57         * be detected based on the populationLabel members of the | 
|  | 58         * passed object. The populationLabel 999 will be interpreted | 
|  | 59         * as outgroups. If several outgroups are passed, sites were | 
|  | 60         * the outgroups are not consistent will be treated as "non- | 
|  | 61         * orientable". | 
|  | 62         * | 
|  | 63         * \param minimumExploitableData site where the non-missing | 
|  | 64         * data (as defined by characterMapping) are at a frequency | 
|  | 65         * larger than this value will be removed from the analysis. | 
|  | 66         * Use 1. to take only 'complete' sites into account and 0. | 
|  | 67         * to use all sites. | 
|  | 68         * | 
|  | 69         * \param ignoreFrequency removes sites that are polymorphic | 
|  | 70         * because of an allele at absolute frequency smaller than or | 
|  | 71         * equal to this value. If ignoreFrequency=1, no sites are | 
|  | 72         * removed, if ignoreFrequency=1, singleton sites are | 
|  | 73         * ignored. Such sites are completely removed from the | 
|  | 74         * analysis (not counted in lseff). Note that if more than | 
|  | 75         * one mutation is allowed, the site is removed only if all | 
|  | 76         * the alleles but one are smaller than or equal to this | 
|  | 77         * value. For example, an alignment column AAAAAAGAAT is | 
|  | 78         * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is | 
|  | 79         * conserved (including the third allele T which is a | 
|  | 80         * singleton). | 
|  | 81         * | 
|  | 82         * \param characterMapping a string giving the list of | 
|  | 83         * characters that should be considered as valid data. If a | 
|  | 84         * space is present in the string, the characters left of the | 
|  | 85         * space will be treated as valid data and the characters | 
|  | 86         * right of the space will be treated as missing data, that | 
|  | 87         * is tolerated but ignored. All characters not in the string | 
|  | 88         * will cause an EggInvalidCharacterError to be raised. | 
|  | 89         */ | 
|  | 90         void load(CharMatrix& data, | 
|  | 91                 double minimumExploitableData=1., | 
|  | 92                 unsigned int ignoreFrequency=0, | 
|  | 93                 std::string characterMapping=dnaMapping); | 
|  | 94 | 
|  | 95 | 
|  | 96         /// Number of pairs contained in the instance | 
|  | 97         unsigned int numberOfPairs() const; | 
|  | 98 | 
|  | 99         /// Alignment distance between a given pair | 
|  | 100         int d(unsigned int pair_index); | 
|  | 101 | 
|  | 102         /// D statistic for a given pair | 
|  | 103         double D(unsigned int pair_index); | 
|  | 104 | 
|  | 105         /// D' statistic for a given pair | 
|  | 106         double Dp(unsigned int pair_index); | 
|  | 107 | 
|  | 108         /// r statistic for a given pair | 
|  | 109         double r(unsigned int pair_index); | 
|  | 110 | 
|  | 111         /// r2 statistic for a given pair | 
|  | 112         double r2(unsigned int pair_index); | 
|  | 113 | 
|  | 114         /// position of the first site for a given pair | 
|  | 115         unsigned int site1(unsigned int pair_index); | 
|  | 116 | 
|  | 117         /// position of the second site for a given pair | 
|  | 118         unsigned int site2(unsigned int pair_index); | 
|  | 119 | 
|  | 120         /// correlation coefficient between r2 and distance | 
|  | 121         double correl() const; | 
|  | 122 | 
|  | 123        /** \brief Computes the minimal number of recombination events | 
|  | 124         * | 
|  | 125         * The computation is performed as described in Hudson, RR and | 
|  | 126         * NL Kaplan. 1985. Statistical properties of the number of | 
|  | 127         * recombination events in the history of a sample of DNA | 
|  | 128         * sequences. Genetics 111: 147-164. The returned parameter is | 
|  | 129         * the minimal number of recombination events, given by the | 
|  | 130         * number of non-overlapping pairs of segregating sites violating | 
|  | 131         * the rule of the four gamete. Only sites with two alleles are | 
|  | 132         * considered. Note that homoplasy (multiple mutations) mimicks | 
|  | 133         * recombination. The result of this function is not stored | 
|  | 134         * in this instance, and re-computed at each call. | 
|  | 135         * | 
|  | 136         * \param data the same CharMatrix instance as passed to the load | 
|  | 137         * method. The instance must not have been modified. | 
|  | 138         * | 
|  | 139         */ | 
|  | 140         unsigned int Rmin(CharMatrix& data) const; | 
|  | 141 | 
|  | 142 | 
|  | 143 | 
|  | 144       protected: | 
|  | 145 | 
|  | 146         // adds a pair of polymorphic sites | 
|  | 147         // assume position2>position1, | 
|  | 148         //  sites are polymorphic with exactly 2 alleles | 
|  | 149         void add(CharMatrix& data, unsigned int position1, unsigned int position2); | 
|  | 150 | 
|  | 151         // Constructor help | 
|  | 152         void init(); | 
|  | 153 | 
|  | 154         // Destructor helper | 
|  | 155         void clear(); | 
|  | 156 | 
|  | 157         // Resizes arrays | 
|  | 158         void reset(); | 
|  | 159 | 
|  | 160         // Small helper | 
|  | 161         inline double min(double a, double b) { return (a>b)?a:b;} | 
|  | 162 | 
|  | 163         // Small helper | 
|  | 164         inline double max(double a, double b) { return (a>b)?b:a;} | 
|  | 165 | 
|  | 166         // Small helper | 
|  | 167         inline void check(unsigned int pos) {  if (pos>=_n) throw EggRuntimeError("tried to access an invalid index"); } | 
|  | 168 | 
|  | 169        /* Performs correlation | 
|  | 170         * | 
|  | 171         * This function works independently from the rest of the class. | 
|  | 172         * | 
|  | 173         * \param n length of data arrays. | 
|  | 174         * \param x first data vector. | 
|  | 175         * \param y second data vector. | 
|  | 176         * \param r variable to receive the correlation coefficient. | 
|  | 177         * \param a variable to receive the regression slope. | 
|  | 178         */ | 
|  | 179         static void _correl(unsigned int n, const int* x, const double* y, double& r, double& a); | 
|  | 180 | 
|  | 181         // Distance between pairs | 
|  | 182         int* _d; | 
|  | 183 | 
|  | 184         // D (classical) measure of LD | 
|  | 185         double *_D; | 
|  | 186 | 
|  | 187         // D' | 
|  | 188         double *_Dp; | 
|  | 189 | 
|  | 190         // r, correlation coefficient | 
|  | 191         double *_r; | 
|  | 192 | 
|  | 193         // square r | 
|  | 194         double *_r2; | 
|  | 195 | 
|  | 196         // Data array (not managed by the instance) | 
|  | 197         unsigned int *_site1; | 
|  | 198 | 
|  | 199         // Data array (not managed by the instance) | 
|  | 200         unsigned int *_site2; | 
|  | 201 | 
|  | 202         // Number of pairs | 
|  | 203         unsigned int _n; | 
|  | 204 | 
|  | 205      private: | 
|  | 206 | 
|  | 207         /// Copy constructor not available | 
|  | 208         LinkageDisequilibrium(const LinkageDisequilibrium&) { } | 
|  | 209 | 
|  | 210         /// Assignment operator not available | 
|  | 211         LinkageDisequilibrium& operator=(const LinkageDisequilibrium&) { | 
|  | 212             return *this; | 
|  | 213         } | 
|  | 214 | 
|  | 215 | 
|  | 216         class Interval { | 
|  | 217             public: | 
|  | 218                 Interval(unsigned int, unsigned int); | 
|  | 219                 unsigned int a() const; | 
|  | 220                 unsigned int b() const; | 
|  | 221                 bool good() const; | 
|  | 222                 void set_false(); | 
|  | 223             private: | 
|  | 224                 unsigned int _a; | 
|  | 225                 unsigned int _b; | 
|  | 226                 unsigned int _good; | 
|  | 227         }; | 
|  | 228 | 
|  | 229 | 
|  | 230   }; | 
|  | 231 } | 
|  | 232 | 
|  | 233 #endif |