comparison egglib/egglib-2.1.5/include/egglib-cpp/LinkageDisequilibrium.hpp @ 9:98c37a5d67f4 draft

Uploaded
author dereeper
date Wed, 07 Feb 2018 22:08:47 -0500
parents 420b57c3c185
children
comparison
equal deleted inserted replaced
8:6bf69b40365c 9:98c37a5d67f4
1 /*
2 Copyright 2009 Stéphane De Mita, Mathieu Siol
3
4 This file is part of the EggLib library.
5
6 EggLib is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 EggLib is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef EGGLIB_LINKAGEDISEQUILIBRUM_HPP
21 #define EGGLIB_LINKAGEDISEQUILIBRUM_HPP
22
23
24 #include "BaseDiversity.hpp"
25 #include "EggException.hpp"
26
27
28 namespace egglib {
29
30 /** \brief Analyzes linkage disequilibrium per pair of polymorphic sites
31 *
32 * \ingroup polymorphism
33 *
34 * The class considers an alignment and detects polymorphic sites
35 * using the BaseDiversity functionality (shared with other classes
36 * of the module). Only sites with exactly two alleles are
37 * considered. Statistics of pairwise linkage disequilibrium can
38 * be accessed by pair index (note that out-of-range errors are not
39 * checked). Population labels are ignored (but outgroups are
40 * excluded from the analysis).
41 *
42 */
43 class LinkageDisequilibrium : public BaseDiversity {
44
45 public:
46
47 /// Default constructor
48 LinkageDisequilibrium();
49
50 /// Destructor
51 virtual ~LinkageDisequilibrium();
52
53 /** \brief Analyzes polymorphic sites of an alignment
54 *
55 * \param data an alignment object (subclass of CharMatrix).
56 * The presence of outgroup or of different populations will
57 * be detected based on the populationLabel members of the
58 * passed object. The populationLabel 999 will be interpreted
59 * as outgroups. If several outgroups are passed, sites were
60 * the outgroups are not consistent will be treated as "non-
61 * orientable".
62 *
63 * \param minimumExploitableData site where the non-missing
64 * data (as defined by characterMapping) are at a frequency
65 * larger than this value will be removed from the analysis.
66 * Use 1. to take only 'complete' sites into account and 0.
67 * to use all sites.
68 *
69 * \param ignoreFrequency removes sites that are polymorphic
70 * because of an allele at absolute frequency smaller than or
71 * equal to this value. If ignoreFrequency=1, no sites are
72 * removed, if ignoreFrequency=1, singleton sites are
73 * ignored. Such sites are completely removed from the
74 * analysis (not counted in lseff). Note that if more than
75 * one mutation is allowed, the site is removed only if all
76 * the alleles but one are smaller than or equal to this
77 * value. For example, an alignment column AAAAAAGAAT is
78 * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is
79 * conserved (including the third allele T which is a
80 * singleton).
81 *
82 * \param characterMapping a string giving the list of
83 * characters that should be considered as valid data. If a
84 * space is present in the string, the characters left of the
85 * space will be treated as valid data and the characters
86 * right of the space will be treated as missing data, that
87 * is tolerated but ignored. All characters not in the string
88 * will cause an EggInvalidCharacterError to be raised.
89 */
90 void load(CharMatrix& data,
91 double minimumExploitableData=1.,
92 unsigned int ignoreFrequency=0,
93 std::string characterMapping=dnaMapping);
94
95
96 /// Number of pairs contained in the instance
97 unsigned int numberOfPairs() const;
98
99 /// Alignment distance between a given pair
100 int d(unsigned int pair_index);
101
102 /// D statistic for a given pair
103 double D(unsigned int pair_index);
104
105 /// D' statistic for a given pair
106 double Dp(unsigned int pair_index);
107
108 /// r statistic for a given pair
109 double r(unsigned int pair_index);
110
111 /// r2 statistic for a given pair
112 double r2(unsigned int pair_index);
113
114 /// position of the first site for a given pair
115 unsigned int site1(unsigned int pair_index);
116
117 /// position of the second site for a given pair
118 unsigned int site2(unsigned int pair_index);
119
120 /// correlation coefficient between r2 and distance
121 double correl() const;
122
123 /** \brief Computes the minimal number of recombination events
124 *
125 * The computation is performed as described in Hudson, RR and
126 * NL Kaplan. 1985. Statistical properties of the number of
127 * recombination events in the history of a sample of DNA
128 * sequences. Genetics 111: 147-164. The returned parameter is
129 * the minimal number of recombination events, given by the
130 * number of non-overlapping pairs of segregating sites violating
131 * the rule of the four gamete. Only sites with two alleles are
132 * considered. Note that homoplasy (multiple mutations) mimicks
133 * recombination. The result of this function is not stored
134 * in this instance, and re-computed at each call.
135 *
136 * \param data the same CharMatrix instance as passed to the load
137 * method. The instance must not have been modified.
138 *
139 */
140 unsigned int Rmin(CharMatrix& data) const;
141
142
143
144 protected:
145
146 // adds a pair of polymorphic sites
147 // assume position2>position1,
148 // sites are polymorphic with exactly 2 alleles
149 void add(CharMatrix& data, unsigned int position1, unsigned int position2);
150
151 // Constructor help
152 void init();
153
154 // Destructor helper
155 void clear();
156
157 // Resizes arrays
158 void reset();
159
160 // Small helper
161 inline double min(double a, double b) { return (a>b)?a:b;}
162
163 // Small helper
164 inline double max(double a, double b) { return (a>b)?b:a;}
165
166 // Small helper
167 inline void check(unsigned int pos) { if (pos>=_n) throw EggRuntimeError("tried to access an invalid index"); }
168
169 /* Performs correlation
170 *
171 * This function works independently from the rest of the class.
172 *
173 * \param n length of data arrays.
174 * \param x first data vector.
175 * \param y second data vector.
176 * \param r variable to receive the correlation coefficient.
177 * \param a variable to receive the regression slope.
178 */
179 static void _correl(unsigned int n, const int* x, const double* y, double& r, double& a);
180
181 // Distance between pairs
182 int* _d;
183
184 // D (classical) measure of LD
185 double *_D;
186
187 // D'
188 double *_Dp;
189
190 // r, correlation coefficient
191 double *_r;
192
193 // square r
194 double *_r2;
195
196 // Data array (not managed by the instance)
197 unsigned int *_site1;
198
199 // Data array (not managed by the instance)
200 unsigned int *_site2;
201
202 // Number of pairs
203 unsigned int _n;
204
205 private:
206
207 /// Copy constructor not available
208 LinkageDisequilibrium(const LinkageDisequilibrium&) { }
209
210 /// Assignment operator not available
211 LinkageDisequilibrium& operator=(const LinkageDisequilibrium&) {
212 return *this;
213 }
214
215
216 class Interval {
217 public:
218 Interval(unsigned int, unsigned int);
219 unsigned int a() const;
220 unsigned int b() const;
221 bool good() const;
222 void set_false();
223 private:
224 unsigned int _a;
225 unsigned int _b;
226 unsigned int _good;
227 };
228
229
230 };
231 }
232
233 #endif