comparison egglib/egglib-2.1.5/include/egglib-cpp/Mutator.hpp @ 9:98c37a5d67f4 draft

Uploaded
author dereeper
date Wed, 07 Feb 2018 22:08:47 -0500
parents 420b57c3c185
children
comparison
equal deleted inserted replaced
8:6bf69b40365c 9:98c37a5d67f4
1 /*
2 Copyright 2009, 2010, 2012 Stéphane De Mita, Mathieu Siol
3
4 This file is part of the EggLib library.
5
6 EggLib is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 EggLib is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef EGGLIB_MUTATOR_HPP
21 #define EGGLIB_MUTATOR_HPP
22
23
24 #include "DataMatrix.hpp"
25 #include "Random.hpp"
26 #include "Arg.hpp"
27 #include "Mutation.hpp"
28
29
30 namespace egglib {
31
32
33 /** \brief Implements mutation models
34 *
35 * \ingroup coalesce
36 *
37 * Works with a previously built Ancestral Reconbination Graph. The
38 * user must sets options using the setter-based interface. After
39 * that he or she can call the method mute() that will generates
40 * a DataMatrix object.
41 *
42 * Genotype data are represented by integer numbers. Regardless of
43 * the mutation model, the ancestral state is always 0. The user can
44 * set the rate of mutation (or, alternatively, fix the number of
45 * mutations that occurred - which is the number of segregating sites
46 * only with an infinite site model).
47 *
48 * Other options fall into two separate groups: the positions of the
49 * mutated sites and the process of mutation (how new alleles are
50 * generated).
51 *
52 * Concerning allele generation, several mutation models are available
53 * (coded by single letters):
54 * - F: fixed number of alleles. Among other markers, this model is
55 * appropriate for simulating nucleotides. The user is able
56 * to choose the number of alleles (where 2 is the standard
57 * for an infinite site model and 4 for a finite site model).
58 * Mutator allows assigning independent weights between all
59 * different transition types and can draw randomly the
60 * ancestral states, providing a way to emulate evolution of
61 * nucleotides with multiple mutations at the same site and
62 * reversion.
63 * - I: infinite number of alleles. At a given site, each mutation
64 * raises a new allele. The value of the alleles is therefore
65 * irrelevant (it only denotes its order of appearance). This
66 * model does not permit homoplasy.
67 * - S: stepwise mutation model. In this model the value of the
68 * alleles are interpreted as a size (typically for simulating
69 * a microsatellite marker). Each mutation either increases
70 * or decreases the allele size by an increment of one.
71 * - T: two-phase mutation model. This model is a generalization
72 * of the stepwise mutation model (S). For a mutation, the
73 * increment (either increase or decrease) is 1 with the
74 * probability given by the parameter (1-TPMproba). With
75 * probability TPMproba, the increment is drawn from a
76 * geometric distribution of parameter given by the other
77 * parameter (TPMparam).
78 *
79 * By default, the program will assume an infinite site model (ISM).
80 * Each mutation will occur to a new position drawn from the interval
81 * [0,1]. It is possible to set any mutation model with an ISM
82 * (including microsatellite-like models I, S and T). Alternatively,
83 * the user can specify a finite number of sites available for
84 * mutation. For a microsatellite marker, the user will want to
85 * specify a single site. It is possible to set a finite number of
86 * sites for all mutation models. In all cases, the mutations will
87 * be forced to target these sites. It is possible to apply weights
88 * independently to all sites. The higher the weight value
89 * (comparatively to the other sites), the higher the probability
90 * that this site mutes. The weights needs not to be relative. In
91 * addition, the user can set the positions of the different sites.
92 * Nothings forces him or her to place them in order. Note that this
93 * does not affect the mutation process, but on the amount of
94 * recombination that will be allowed between sites.
95 *
96 */
97 class Mutator {
98
99 public:
100
101 /** \brief Initializes with default values
102 *
103 * List of default values:
104 * - theta = 0
105 * - fixedNumberOfMutations = 0
106 * - model = F (fixed number of alleles)
107 * - fixed number of alleles = 2
108 * - infinite site model
109 * - TPM parameters are both preset to 0.5
110 *
111 */
112 Mutator();
113
114
115 /** \brief Destroys the object
116 *
117 */
118 ~Mutator();
119
120
121 /** \brief Copy constructor
122 *
123 */
124 Mutator(const Mutator&);
125
126
127 /** \brief Assignement operator
128 *
129 */
130 Mutator& operator=(const Mutator&);
131
132
133 /** \brief Restores default values of all parameters
134 *
135 */
136 void reset();
137
138
139 /** \brief Gets the fixed number of mutations
140 *
141 */
142 unsigned int fixedNumberOfMutations() const;
143
144
145 /** \brief Sets the fixed number of mutations
146 *
147 * The value can be 0. It is not allowed to set both the
148 * fixed number of mutations and the mutation rate at
149 * non-zero value
150 *
151 */
152 void fixedNumberOfMutations(unsigned int);
153
154
155 /** \brief Gets the mutation rate
156 *
157 */
158 double mutationRate() const;
159
160
161 /** \brief Sets the mutation rate
162 *
163 * The value cannot be negative. The value can be 0. It is
164 * not allowed to set both the fixed number of mutations and
165 * the mutation rate at non-zero value
166 *
167 */
168 void mutationRate(double);
169
170
171 /** \brief Gets the mutation model
172 *
173 * See the class documentation for the signification of the
174 * different one-letter codes.
175 *
176 */
177 char mutationModel() const;
178
179
180 /** \brief Sets the mutation model
181 *
182 * The passed character must be one of F, I, S and T. See the
183 * class documentation for their signification.
184 *
185 */
186 void mutationModel(char);
187
188
189 /** \brief Gets the fixed number of possible alleles
190 *
191 */
192 unsigned int numberOfAlleles() const;
193
194
195 /** \brief Sets the fixed number of possible alleles
196 *
197 * The value must be larger than 1. This parameter is
198 * meaningful only for the fixed number allele model of
199 * mutation, and ignored otherwise.
200 *
201 */
202 void numberOfAlleles(unsigned int);
203
204
205 /** \brief Sets a transition weight
206 *
207 * \param i row (previous allele index).
208 * \param j column (next allele index).
209 * \param value weight to apply.
210 *
211 * Indices i and j must be different. Weights can be any
212 * strictly positive value.
213 *
214 */
215 void transitionWeight(unsigned int i, unsigned int j, double value);
216
217
218 /** \brief Gets a transition weight
219 *
220 * \param i row (previous allele index).
221 * \param j column (next allele index).
222 *
223 * Indices i and j must be different.
224 *
225 */
226 double transitionWeight(unsigned int i, unsigned int j);
227
228
229 /** \brief Set to true to draw ancestral alleles randomly
230 *
231 * By default, the ancestral allele is always 0. If this
232 * variable is set to true, the ancestral allele will be
233 * randomly drawn from the defined number of alleles. This
234 * option is always ignored unless in combination with the
235 * Fixed Allele Number model.
236 *
237 */
238 void randomAncestralAllele(bool flag);
239
240
241 /** \brief true if ancestral alleles must be drawn randomly
242 *
243 */
244 bool randomAncestralAllele() const;
245
246
247 /** \brief Gets the TPM probability parameter
248 *
249 */
250 double TPMproba() const;
251
252
253 /** \brief Sets the TPM probability parameter
254 *
255 * This parameter is considered only if the mutation model
256 * is T (two-phase mutation model). It gives the probability
257 * that a mutation step is not fixed to be 1. If TPMproba is
258 * 0, the mutation model is SMM.
259 *
260 * The value must be >=0. and <=1.
261 *
262 */
263 void TPMproba(double value);
264
265
266 /** \brief Gets the TPM distribution parameter
267 *
268 */
269 double TPMparam() const;
270
271
272 /** \brief Sets the TPM distribution parameter
273 *
274 * This parameter is considered only if the mutation model
275 * is T (two-phase mutation model). It gives the parameter
276 * of the geometric distribution which is used to generate
277 * the mutation step (if it is not one).
278 *
279 * The value must be >=0. and <=1.
280 *
281 */
282 void TPMparam(double value);
283
284
285 /** \brief Gets the number of mutable sites
286 *
287 * A value a zero must be interpreted as the infinite site
288 * model. Note that after all calls all data from the tables
289 * sitePositions and siteWeights will be reset.
290 *
291 */
292 unsigned int numberOfSites() const;
293
294
295 /** \brief Sets the number of mutable sites
296 *
297 * The value of zero is accepted and imposed the infinite
298 * site model.
299 *
300 */
301 void numberOfSites(unsigned int);
302
303
304 /** \brief Gets the position of a given site
305 *
306 */
307 double sitePosition(unsigned int siteIndex) const;
308
309
310 /** \brief Set the position of a given site
311 *
312 * The position must be >=0 and <=1
313 *
314 */
315 void sitePosition(unsigned int siteIndex, double position);
316
317
318 /** \brief Gets the mutation weight of a given site
319 *
320 */
321 double siteWeight(unsigned int siteIndex) const;
322
323
324 /** \brief Set the site weight of a given site
325 *
326 * The weight must be strictly positive.
327 *
328 */
329 void siteWeight(unsigned int siteIndex, double weight);
330
331
332 /** \brief Performs mutation
333 *
334 * \param arg Ancestral recombination graph instance. If the
335 * ARG is partially built or not a all, or improperly so,
336 * the behaviour of this method is not defined.
337 *
338 * \param random The address of a Random instance to be
339 * used for generating random numbers.
340 *
341 * \return A DataMatrix instance containing simulated data.
342 *
343 */
344 DataMatrix mute(Arg* arg, Random* random);
345
346
347 /** \brief Gets the last number of mutations
348 *
349 * Returns the number of mutations of the last call of mute( ).
350 * By default, this method returns 0.
351 *
352 */
353 unsigned int numberOfMutations() const;
354
355
356 private:
357
358 void clear();
359 void init();
360 void copy(const Mutator&);
361
362 //int nextAllele(int allele, Random* random);
363 int TPMstep(double inTPMproba, Random* random);
364 void apply_mutation(unsigned int matrixIndex,
365 unsigned int actualSite, DataMatrix& data,
366 const Edge* edge, int allele,
367 unsigned int segment, Random* random);
368
369
370 char _model;
371 double _mutationRate;
372 unsigned int _fixedNumberOfMutations;
373 unsigned int _numberOfAlleles;
374 double** _transitionWeights;
375 bool _randomAncestralAllele;
376 unsigned int _numberOfSites;
377 double* _sitePositions;
378 double* _siteWeights;
379 double _TPMproba;
380 double _TPMparam;
381 int maxAllele;
382 unsigned int _numberOfMutations;
383 std::vector<Mutation> _cache_mutations;
384 unsigned int _cache_mutations_reserved;
385
386 };
387
388
389 bool compare(Mutation mutation1, Mutation mutation2); // returns True if mutation1 is older
390
391 }
392
393
394
395
396 #endif
397