| 
1
 | 
     1 /*
 | 
| 
 | 
     2     Copyright 2009, 2010, 2012 Stéphane De Mita, Mathieu Siol
 | 
| 
 | 
     3 
 | 
| 
 | 
     4     This file is part of the EggLib library.
 | 
| 
 | 
     5 
 | 
| 
 | 
     6     EggLib is free software: you can redistribute it and/or modify
 | 
| 
 | 
     7     it under the terms of the GNU General Public License as published by
 | 
| 
 | 
     8     the Free Software Foundation, either version 3 of the License, or
 | 
| 
 | 
     9     (at your option) any later version.
 | 
| 
 | 
    10 
 | 
| 
 | 
    11     EggLib is distributed in the hope that it will be useful,
 | 
| 
 | 
    12     but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
| 
 | 
    13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
| 
 | 
    14     GNU General Public License for more details.
 | 
| 
 | 
    15 
 | 
| 
 | 
    16     You should have received a copy of the GNU General Public License
 | 
| 
 | 
    17     along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
 | 
| 
 | 
    18 */
 | 
| 
 | 
    19 
 | 
| 
 | 
    20 #ifndef EGGLIB_MUTATOR_HPP
 | 
| 
 | 
    21 #define EGGLIB_MUTATOR_HPP
 | 
| 
 | 
    22 
 | 
| 
 | 
    23 
 | 
| 
 | 
    24 #include "DataMatrix.hpp"
 | 
| 
 | 
    25 #include "Random.hpp"
 | 
| 
 | 
    26 #include "Arg.hpp"
 | 
| 
 | 
    27 #include "Mutation.hpp"
 | 
| 
 | 
    28 
 | 
| 
 | 
    29 
 | 
| 
 | 
    30 namespace egglib {
 | 
| 
 | 
    31     
 | 
| 
 | 
    32 
 | 
| 
 | 
    33    /** \brief Implements mutation models
 | 
| 
 | 
    34     *
 | 
| 
 | 
    35     * \ingroup coalesce
 | 
| 
 | 
    36     * 
 | 
| 
 | 
    37     * Works with a previously built Ancestral Reconbination Graph. The
 | 
| 
 | 
    38     * user must sets options using the setter-based interface. After
 | 
| 
 | 
    39     * that he or she can call the method mute() that will generates
 | 
| 
 | 
    40     * a DataMatrix object.
 | 
| 
 | 
    41     * 
 | 
| 
 | 
    42     * Genotype data are represented by integer numbers. Regardless of
 | 
| 
 | 
    43     * the mutation model, the ancestral state is always 0. The user can
 | 
| 
 | 
    44     * set the rate of mutation (or, alternatively, fix the number of
 | 
| 
 | 
    45     * mutations that occurred - which is the number of segregating sites
 | 
| 
 | 
    46     * only with an infinite site model).
 | 
| 
 | 
    47     * 
 | 
| 
 | 
    48     * Other options fall into two separate groups: the positions of the
 | 
| 
 | 
    49     * mutated sites and the process of mutation (how new alleles are
 | 
| 
 | 
    50     * generated).
 | 
| 
 | 
    51     * 
 | 
| 
 | 
    52     * Concerning allele generation, several mutation models are available
 | 
| 
 | 
    53     * (coded by single letters):
 | 
| 
 | 
    54     *   - F: fixed number of alleles. Among other markers, this model is
 | 
| 
 | 
    55     *        appropriate for simulating nucleotides. The user is able
 | 
| 
 | 
    56     *        to choose the number of alleles (where 2 is the standard
 | 
| 
 | 
    57     *        for an infinite site model and 4 for a finite site model).
 | 
| 
 | 
    58     *        Mutator allows assigning independent weights between all
 | 
| 
 | 
    59     *        different transition types and can draw randomly the
 | 
| 
 | 
    60     *        ancestral states, providing a way to emulate evolution of
 | 
| 
 | 
    61     *        nucleotides with multiple mutations at the same site and
 | 
| 
 | 
    62     *        reversion.
 | 
| 
 | 
    63     *   - I: infinite number of alleles. At a given site, each mutation
 | 
| 
 | 
    64     *        raises a new allele. The value of the alleles is therefore
 | 
| 
 | 
    65     *        irrelevant (it only denotes its order of appearance). This
 | 
| 
 | 
    66     *        model does not permit homoplasy.
 | 
| 
 | 
    67     *   - S: stepwise mutation model. In this model the value of the
 | 
| 
 | 
    68     *        alleles are interpreted as a size (typically for simulating
 | 
| 
 | 
    69     *        a microsatellite marker). Each mutation either increases
 | 
| 
 | 
    70     *        or decreases the allele size by an increment of one.
 | 
| 
 | 
    71     *   - T: two-phase mutation model. This model is a generalization
 | 
| 
 | 
    72     *        of the stepwise mutation model (S). For a mutation, the
 | 
| 
 | 
    73     *        increment (either increase or decrease) is 1 with the
 | 
| 
 | 
    74     *        probability given by the parameter (1-TPMproba). With
 | 
| 
 | 
    75     *        probability TPMproba, the increment is drawn from a
 | 
| 
 | 
    76     *        geometric distribution of parameter given by the other
 | 
| 
 | 
    77     *        parameter (TPMparam).
 | 
| 
 | 
    78     * 
 | 
| 
 | 
    79     * By default, the program will assume an infinite site model (ISM).
 | 
| 
 | 
    80     * Each mutation will occur to a new position drawn from the interval
 | 
| 
 | 
    81     * [0,1]. It is possible to set any mutation model with an ISM 
 | 
| 
 | 
    82     * (including microsatellite-like models I, S and T). Alternatively,
 | 
| 
 | 
    83     * the user can specify a finite number of sites available for
 | 
| 
 | 
    84     * mutation. For a microsatellite marker, the user will want to
 | 
| 
 | 
    85     * specify a single site. It is possible to set a finite number of
 | 
| 
 | 
    86     * sites for all mutation models. In all cases, the mutations will
 | 
| 
 | 
    87     * be forced to target these sites. It is possible to apply weights
 | 
| 
 | 
    88     * independently to all sites. The higher the weight value
 | 
| 
 | 
    89     * (comparatively to the other sites), the higher the probability
 | 
| 
 | 
    90     * that this site mutes. The weights needs not to be relative. In
 | 
| 
 | 
    91     * addition, the user can set the positions of the different sites.
 | 
| 
 | 
    92     * Nothings forces him or her to place them in order. Note that this
 | 
| 
 | 
    93     * does not affect the mutation process, but on the amount of
 | 
| 
 | 
    94     * recombination that will be allowed between sites.
 | 
| 
 | 
    95     *
 | 
| 
 | 
    96     */
 | 
| 
 | 
    97     class Mutator {
 | 
| 
 | 
    98 
 | 
| 
 | 
    99         public:
 | 
| 
 | 
   100         
 | 
| 
 | 
   101            /** \brief Initializes with default values
 | 
| 
 | 
   102             * 
 | 
| 
 | 
   103             * List of default values:
 | 
| 
 | 
   104             *   - theta = 0
 | 
| 
 | 
   105             *   - fixedNumberOfMutations = 0
 | 
| 
 | 
   106             *   - model = F (fixed number of alleles)
 | 
| 
 | 
   107             *   - fixed number of alleles = 2
 | 
| 
 | 
   108             *   - infinite site model
 | 
| 
 | 
   109             *   - TPM parameters are both preset to 0.5
 | 
| 
 | 
   110             * 
 | 
| 
 | 
   111             */
 | 
| 
 | 
   112             Mutator();
 | 
| 
 | 
   113 
 | 
| 
 | 
   114             
 | 
| 
 | 
   115            /** \brief Destroys the object
 | 
| 
 | 
   116             * 
 | 
| 
 | 
   117             */
 | 
| 
 | 
   118             ~Mutator();
 | 
| 
 | 
   119             
 | 
| 
 | 
   120             
 | 
| 
 | 
   121            /** \brief Copy constructor
 | 
| 
 | 
   122             * 
 | 
| 
 | 
   123             */
 | 
| 
 | 
   124             Mutator(const Mutator&);
 | 
| 
 | 
   125         
 | 
| 
 | 
   126         
 | 
| 
 | 
   127            /** \brief Assignement operator
 | 
| 
 | 
   128             * 
 | 
| 
 | 
   129             */
 | 
| 
 | 
   130             Mutator& operator=(const Mutator&);
 | 
| 
 | 
   131 
 | 
| 
 | 
   132 
 | 
| 
 | 
   133            /** \brief Restores default values of all parameters
 | 
| 
 | 
   134             * 
 | 
| 
 | 
   135             */
 | 
| 
 | 
   136             void reset();
 | 
| 
 | 
   137             
 | 
| 
 | 
   138             
 | 
| 
 | 
   139            /** \brief Gets the fixed number of mutations
 | 
| 
 | 
   140             * 
 | 
| 
 | 
   141             */
 | 
| 
 | 
   142             unsigned int fixedNumberOfMutations() const;
 | 
| 
 | 
   143         
 | 
| 
 | 
   144 
 | 
| 
 | 
   145            /** \brief Sets the fixed number of mutations
 | 
| 
 | 
   146             * 
 | 
| 
 | 
   147             * The value can be 0. It is not allowed to set both the
 | 
| 
 | 
   148             * fixed number of mutations and the mutation rate at
 | 
| 
 | 
   149             * non-zero value
 | 
| 
 | 
   150             * 
 | 
| 
 | 
   151             */
 | 
| 
 | 
   152             void fixedNumberOfMutations(unsigned int);
 | 
| 
 | 
   153 
 | 
| 
 | 
   154 
 | 
| 
 | 
   155            /** \brief Gets the mutation rate
 | 
| 
 | 
   156             * 
 | 
| 
 | 
   157             */
 | 
| 
 | 
   158             double mutationRate() const;
 | 
| 
 | 
   159         
 | 
| 
 | 
   160         
 | 
| 
 | 
   161            /** \brief Sets the mutation rate
 | 
| 
 | 
   162             * 
 | 
| 
 | 
   163             * The value cannot be negative. The value can be 0.  It is
 | 
| 
 | 
   164             * not allowed to set both the fixed number of mutations and
 | 
| 
 | 
   165             * the mutation rate at non-zero value
 | 
| 
 | 
   166             * 
 | 
| 
 | 
   167             */
 | 
| 
 | 
   168             void mutationRate(double);
 | 
| 
 | 
   169 
 | 
| 
 | 
   170 
 | 
| 
 | 
   171            /** \brief Gets the mutation model
 | 
| 
 | 
   172             * 
 | 
| 
 | 
   173             * See the class documentation for the signification of the
 | 
| 
 | 
   174             * different one-letter codes.
 | 
| 
 | 
   175             * 
 | 
| 
 | 
   176             */
 | 
| 
 | 
   177             char mutationModel() const;
 | 
| 
 | 
   178             
 | 
| 
 | 
   179             
 | 
| 
 | 
   180            /** \brief Sets the mutation model
 | 
| 
 | 
   181             * 
 | 
| 
 | 
   182             * The passed character must be one of F, I, S and T. See the
 | 
| 
 | 
   183             * class documentation for their signification.
 | 
| 
 | 
   184             * 
 | 
| 
 | 
   185             */
 | 
| 
 | 
   186             void mutationModel(char);
 | 
| 
 | 
   187             
 | 
| 
 | 
   188             
 | 
| 
 | 
   189            /** \brief Gets the fixed number of possible alleles
 | 
| 
 | 
   190             * 
 | 
| 
 | 
   191             */
 | 
| 
 | 
   192             unsigned int numberOfAlleles() const;
 | 
| 
 | 
   193            
 | 
| 
 | 
   194             
 | 
| 
 | 
   195            /** \brief Sets the fixed number of possible alleles
 | 
| 
 | 
   196             * 
 | 
| 
 | 
   197             * The value must be larger than 1. This parameter is
 | 
| 
 | 
   198             * meaningful only for the fixed number allele model of
 | 
| 
 | 
   199             * mutation, and ignored otherwise.
 | 
| 
 | 
   200             *
 | 
| 
 | 
   201             */
 | 
| 
 | 
   202             void numberOfAlleles(unsigned int);
 | 
| 
 | 
   203            
 | 
| 
 | 
   204            
 | 
| 
 | 
   205            /** \brief Sets a transition weight
 | 
| 
 | 
   206             * 
 | 
| 
 | 
   207             * \param i row (previous allele index).
 | 
| 
 | 
   208             * \param j column (next allele index).
 | 
| 
 | 
   209             * \param value weight to apply.
 | 
| 
 | 
   210             * 
 | 
| 
 | 
   211             * Indices i and j must be different. Weights can be any
 | 
| 
 | 
   212             * strictly positive value.
 | 
| 
 | 
   213             * 
 | 
| 
 | 
   214             */
 | 
| 
 | 
   215             void transitionWeight(unsigned int i, unsigned int j, double value);
 | 
| 
 | 
   216 
 | 
| 
 | 
   217 
 | 
| 
 | 
   218            /** \brief Gets a transition weight
 | 
| 
 | 
   219             * 
 | 
| 
 | 
   220             * \param i row (previous allele index).
 | 
| 
 | 
   221             * \param j column (next allele index).
 | 
| 
 | 
   222             * 
 | 
| 
 | 
   223             * Indices i and j must be different.
 | 
| 
 | 
   224             * 
 | 
| 
 | 
   225             */
 | 
| 
 | 
   226             double transitionWeight(unsigned int i, unsigned int j);
 | 
| 
 | 
   227             
 | 
| 
 | 
   228             
 | 
| 
 | 
   229            /** \brief Set to true to draw ancestral alleles randomly
 | 
| 
 | 
   230             * 
 | 
| 
 | 
   231             * By default, the ancestral allele is always 0. If this
 | 
| 
 | 
   232             * variable is set to true, the ancestral allele will be
 | 
| 
 | 
   233             * randomly drawn from the defined number of alleles. This
 | 
| 
 | 
   234             * option is always ignored unless in combination with the
 | 
| 
 | 
   235             * Fixed Allele Number model.
 | 
| 
 | 
   236             * 
 | 
| 
 | 
   237             */
 | 
| 
 | 
   238             void randomAncestralAllele(bool flag);
 | 
| 
 | 
   239 
 | 
| 
 | 
   240             
 | 
| 
 | 
   241             /** \brief true if ancestral alleles must be drawn randomly
 | 
| 
 | 
   242              * 
 | 
| 
 | 
   243              */
 | 
| 
 | 
   244             bool randomAncestralAllele() const;
 | 
| 
 | 
   245 
 | 
| 
 | 
   246            
 | 
| 
 | 
   247            /** \brief Gets the TPM probability parameter
 | 
| 
 | 
   248             * 
 | 
| 
 | 
   249             */
 | 
| 
 | 
   250             double TPMproba() const;
 | 
| 
 | 
   251             
 | 
| 
 | 
   252             
 | 
| 
 | 
   253            /** \brief Sets the TPM probability parameter
 | 
| 
 | 
   254             * 
 | 
| 
 | 
   255             * This parameter is considered only if the mutation model
 | 
| 
 | 
   256             * is T (two-phase mutation model). It gives the probability
 | 
| 
 | 
   257             * that a mutation step is not fixed to be 1. If TPMproba is
 | 
| 
 | 
   258             * 0, the mutation model is SMM.
 | 
| 
 | 
   259             * 
 | 
| 
 | 
   260             * The value must be >=0. and <=1. 
 | 
| 
 | 
   261             * 
 | 
| 
 | 
   262             */
 | 
| 
 | 
   263             void TPMproba(double value);
 | 
| 
 | 
   264            
 | 
| 
 | 
   265             
 | 
| 
 | 
   266            /** \brief Gets the TPM distribution parameter
 | 
| 
 | 
   267             * 
 | 
| 
 | 
   268             */
 | 
| 
 | 
   269             double TPMparam() const;
 | 
| 
 | 
   270             
 | 
| 
 | 
   271             
 | 
| 
 | 
   272            /** \brief Sets the TPM distribution parameter
 | 
| 
 | 
   273             * 
 | 
| 
 | 
   274             * This parameter is considered only if the mutation model
 | 
| 
 | 
   275             * is T (two-phase mutation model). It gives the parameter
 | 
| 
 | 
   276             * of the geometric distribution which is used to generate
 | 
| 
 | 
   277             * the mutation step (if it is not one).
 | 
| 
 | 
   278             * 
 | 
| 
 | 
   279             * The value must be >=0. and <=1. 
 | 
| 
 | 
   280             * 
 | 
| 
 | 
   281             */
 | 
| 
 | 
   282             void TPMparam(double value);
 | 
| 
 | 
   283 
 | 
| 
 | 
   284 
 | 
| 
 | 
   285            /** \brief Gets the number of mutable sites
 | 
| 
 | 
   286             * 
 | 
| 
 | 
   287             * A value a zero must be interpreted as the infinite site
 | 
| 
 | 
   288             * model. Note that after all calls all data from the tables
 | 
| 
 | 
   289             * sitePositions and siteWeights will be reset.
 | 
| 
 | 
   290             * 
 | 
| 
 | 
   291             */
 | 
| 
 | 
   292             unsigned int numberOfSites() const;
 | 
| 
 | 
   293            
 | 
| 
 | 
   294             
 | 
| 
 | 
   295            /** \brief Sets the number of mutable sites
 | 
| 
 | 
   296             * 
 | 
| 
 | 
   297             * The value of zero is accepted and imposed the infinite
 | 
| 
 | 
   298             * site model.
 | 
| 
 | 
   299             * 
 | 
| 
 | 
   300             */
 | 
| 
 | 
   301             void numberOfSites(unsigned int);
 | 
| 
 | 
   302             
 | 
| 
 | 
   303             
 | 
| 
 | 
   304            /** \brief Gets the position of a given site
 | 
| 
 | 
   305             * 
 | 
| 
 | 
   306             */
 | 
| 
 | 
   307             double sitePosition(unsigned int siteIndex) const;
 | 
| 
 | 
   308 
 | 
| 
 | 
   309             
 | 
| 
 | 
   310            /** \brief Set the position of a given site
 | 
| 
 | 
   311             * 
 | 
| 
 | 
   312             * The position must be >=0 and <=1
 | 
| 
 | 
   313             * 
 | 
| 
 | 
   314             */
 | 
| 
 | 
   315             void sitePosition(unsigned int siteIndex, double position);
 | 
| 
 | 
   316 
 | 
| 
 | 
   317 
 | 
| 
 | 
   318            /** \brief Gets the mutation weight of a given site
 | 
| 
 | 
   319             * 
 | 
| 
 | 
   320             */
 | 
| 
 | 
   321             double siteWeight(unsigned int siteIndex) const;
 | 
| 
 | 
   322 
 | 
| 
 | 
   323             
 | 
| 
 | 
   324            /** \brief Set the site weight of a given site
 | 
| 
 | 
   325             * 
 | 
| 
 | 
   326             * The weight must be strictly positive.
 | 
| 
 | 
   327             * 
 | 
| 
 | 
   328             */
 | 
| 
 | 
   329             void siteWeight(unsigned int siteIndex, double weight);
 | 
| 
 | 
   330 
 | 
| 
 | 
   331 
 | 
| 
 | 
   332            /** \brief Performs mutation
 | 
| 
 | 
   333             * 
 | 
| 
 | 
   334             * \param arg Ancestral recombination graph instance. If the
 | 
| 
 | 
   335             * ARG is partially built or not a all, or improperly so,
 | 
| 
 | 
   336             * the behaviour of this method is not defined.
 | 
| 
 | 
   337             * 
 | 
| 
 | 
   338             * \param random The address of a Random instance to be
 | 
| 
 | 
   339             * used for generating random numbers.
 | 
| 
 | 
   340             * 
 | 
| 
 | 
   341             * \return A DataMatrix instance containing simulated data.
 | 
| 
 | 
   342             * 
 | 
| 
 | 
   343             */
 | 
| 
 | 
   344             DataMatrix mute(Arg* arg, Random* random);
 | 
| 
 | 
   345 
 | 
| 
 | 
   346 
 | 
| 
 | 
   347           /** \brief Gets the last number of mutations
 | 
| 
 | 
   348             *
 | 
| 
 | 
   349             * Returns the number of mutations of the last call of mute( ).
 | 
| 
 | 
   350             * By default, this method returns 0.
 | 
| 
 | 
   351             *
 | 
| 
 | 
   352             */
 | 
| 
 | 
   353             unsigned int numberOfMutations() const; 
 | 
| 
 | 
   354 
 | 
| 
 | 
   355 
 | 
| 
 | 
   356         private:
 | 
| 
 | 
   357         
 | 
| 
 | 
   358             void clear();
 | 
| 
 | 
   359             void init();
 | 
| 
 | 
   360             void copy(const Mutator&);
 | 
| 
 | 
   361 
 | 
| 
 | 
   362             //int nextAllele(int allele, Random* random);
 | 
| 
 | 
   363             int TPMstep(double inTPMproba, Random* random);
 | 
| 
 | 
   364             void apply_mutation(unsigned int matrixIndex,
 | 
| 
 | 
   365                                 unsigned int actualSite, DataMatrix& data,
 | 
| 
 | 
   366                                 const Edge* edge, int allele,
 | 
| 
 | 
   367                                 unsigned int segment, Random* random);
 | 
| 
 | 
   368 
 | 
| 
 | 
   369         
 | 
| 
 | 
   370             char _model;
 | 
| 
 | 
   371             double _mutationRate;
 | 
| 
 | 
   372             unsigned int _fixedNumberOfMutations;
 | 
| 
 | 
   373             unsigned int _numberOfAlleles;
 | 
| 
 | 
   374             double** _transitionWeights;
 | 
| 
 | 
   375             bool _randomAncestralAllele;
 | 
| 
 | 
   376             unsigned int _numberOfSites;
 | 
| 
 | 
   377             double* _sitePositions;
 | 
| 
 | 
   378             double* _siteWeights;
 | 
| 
 | 
   379             double _TPMproba;
 | 
| 
 | 
   380             double _TPMparam;
 | 
| 
 | 
   381             int maxAllele;
 | 
| 
 | 
   382             unsigned int _numberOfMutations;
 | 
| 
 | 
   383             std::vector<Mutation> _cache_mutations;
 | 
| 
 | 
   384             unsigned int _cache_mutations_reserved;
 | 
| 
 | 
   385 
 | 
| 
 | 
   386     };
 | 
| 
 | 
   387 
 | 
| 
 | 
   388 
 | 
| 
 | 
   389     bool compare(Mutation mutation1, Mutation mutation2); // returns True if mutation1 is older
 | 
| 
 | 
   390 
 | 
| 
 | 
   391 }
 | 
| 
 | 
   392 
 | 
| 
 | 
   393 
 | 
| 
 | 
   394 
 | 
| 
 | 
   395 
 | 
| 
 | 
   396 #endif
 | 
| 
 | 
   397 
 |