| 
1
 | 
     1 /*
 | 
| 
 | 
     2     Copyright 2009 Stéphane De Mita, Mathieu Siol
 | 
| 
 | 
     3 
 | 
| 
 | 
     4     This file is part of the EggLib library.
 | 
| 
 | 
     5 
 | 
| 
 | 
     6     EggLib is free software: you can redistribute it and/or modify
 | 
| 
 | 
     7     it under the terms of the GNU General Public License as published by
 | 
| 
 | 
     8     the Free Software Foundation, either version 3 of the License, or
 | 
| 
 | 
     9     (at your option) any later version.
 | 
| 
 | 
    10 
 | 
| 
 | 
    11     EggLib is distributed in the hope that it will be useful,
 | 
| 
 | 
    12     but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
| 
 | 
    13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
| 
 | 
    14     GNU General Public License for more details.
 | 
| 
 | 
    15 
 | 
| 
 | 
    16     You should have received a copy of the GNU General Public License
 | 
| 
 | 
    17     along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
 | 
| 
 | 
    18 */
 | 
| 
 | 
    19 
 | 
| 
 | 
    20 #ifndef EGGLIB_BASEDIVERSITY_HPP
 | 
| 
 | 
    21 #define EGGLIB_BASEDIVERSITY_HPP
 | 
| 
 | 
    22 
 | 
| 
 | 
    23 #include "CharMatrix.hpp"
 | 
| 
 | 
    24 #include "SitePolymorphism.hpp"
 | 
| 
 | 
    25 #include <string>
 | 
| 
 | 
    26 
 | 
| 
 | 
    27 /** \defgroup polymorphism polymorphism
 | 
| 
 | 
    28  *
 | 
| 
 | 
    29  * \brief Diversity analyses
 | 
| 
 | 
    30  *
 | 
| 
 | 
    31  * Two classes are contained in this module: NucleotideDiversity, that
 | 
| 
 | 
    32  * performs site-centered polymorphism analyses, and HaplotypeDiversity,
 | 
| 
 | 
    33  * that performs haplotype-centered analyses. The detection of
 | 
| 
 | 
    34  * polymorphic sites is common to both, through the base class
 | 
| 
 | 
    35  * BaseDiversity. However this phase must be repeated when stats from
 | 
| 
 | 
    36  * the two classes are needed. To reduce the computational burden, the
 | 
| 
 | 
    37  * function reserve() can be used, which directly allocates needed memory
 | 
| 
 | 
    38  * when the eventual number of polymorphic sites is known prior to
 | 
| 
 | 
    39  * analysis (even if not precisely). For both classes, a set of
 | 
| 
 | 
    40  * statistics are computed immediately upon load of a data set. For
 | 
| 
 | 
    41  * NucleotideDiversity, additional statistics are computed per group
 | 
| 
 | 
    42  * upon use of the corresponding accessors. This number of operations
 | 
| 
 | 
    43  * performed several times is strictly limited. This is particularly
 | 
| 
 | 
    44  * useful when different statistics are needed for a given alignment.
 | 
| 
 | 
    45  * However, this system allows not computing unnecessary statistics to
 | 
| 
 | 
    46  * a certain extent.
 | 
| 
 | 
    47  * 
 | 
| 
 | 
    48  */
 | 
| 
 | 
    49 
 | 
| 
 | 
    50 namespace egglib {
 | 
| 
 | 
    51 
 | 
| 
 | 
    52     /** \brief Base class of diversity classes
 | 
| 
 | 
    53     *
 | 
| 
 | 
    54     * Mutualizes the analysis of polymorphic sites through the method
 | 
| 
 | 
    55     * importSites() and related accessors.
 | 
| 
 | 
    56     * 
 | 
| 
 | 
    57     * \ingroup polymorphism
 | 
| 
 | 
    58     *
 | 
| 
 | 
    59     */
 | 
| 
 | 
    60     class BaseDiversity {
 | 
| 
 | 
    61     
 | 
| 
 | 
    62         public:
 | 
| 
 | 
    63     
 | 
| 
 | 
    64            /** \brief Constructor
 | 
| 
 | 
    65             * 
 | 
| 
 | 
    66             */ 
 | 
| 
 | 
    67             BaseDiversity();
 | 
| 
 | 
    68             
 | 
| 
 | 
    69            /** \brief Destructor
 | 
| 
 | 
    70             * 
 | 
| 
 | 
    71             */ 
 | 
| 
 | 
    72             virtual ~BaseDiversity();
 | 
| 
 | 
    73             
 | 
| 
 | 
    74            /** \brief Reserve sufficient memory for a given number of
 | 
| 
 | 
    75             * polymorphic sites.
 | 
| 
 | 
    76             * 
 | 
| 
 | 
    77             * This method makes importSite function faster when you
 | 
| 
 | 
    78             * already know how many polymorphic sites to expect, since
 | 
| 
 | 
    79             * the necessary memory will be allocated prior the screening
 | 
| 
 | 
    80             * of data. It is possible to use reserve() even if with a
 | 
| 
 | 
    81             * number of sites that is not matching what importSites()
 | 
| 
 | 
    82             * will find.
 | 
| 
 | 
    83             * 
 | 
| 
 | 
    84             * \param numberOfSites a strictly positive integer.
 | 
| 
 | 
    85             * 
 | 
| 
 | 
    86             */
 | 
| 
 | 
    87             virtual void reserve(unsigned int numberOfSites);
 | 
| 
 | 
    88 
 | 
| 
 | 
    89             /// Gets a site
 | 
| 
 | 
    90             const SitePolymorphism* get_site(unsigned int index) const;
 | 
| 
 | 
    91 
 | 
| 
 | 
    92             /// Gets a site position
 | 
| 
 | 
    93             unsigned int get_position(unsigned int index) const;
 | 
| 
 | 
    94 
 | 
| 
 | 
    95            /** \brief Predefined mapping string for DNA data
 | 
| 
 | 
    96             * 
 | 
| 
 | 
    97             */
 | 
| 
 | 
    98             static const std::string dnaMapping;
 | 
| 
 | 
    99 
 | 
| 
 | 
   100 
 | 
| 
 | 
   101            /** \brief Predefined mapping string for RNA data
 | 
| 
 | 
   102             * 
 | 
| 
 | 
   103             */
 | 
| 
 | 
   104             static const std::string rnaMapping;
 | 
| 
 | 
   105 
 | 
| 
 | 
   106 
 | 
| 
 | 
   107            /** \brief Predefined mapping string for amino acid data
 | 
| 
 | 
   108             * 
 | 
| 
 | 
   109             */
 | 
| 
 | 
   110             static const std::string aaMapping;
 | 
| 
 | 
   111 
 | 
| 
 | 
   112 
 | 
| 
 | 
   113             /// Clears and re-initializes object
 | 
| 
 | 
   114             virtual void reset();
 | 
| 
 | 
   115 
 | 
| 
 | 
   116 
 | 
| 
 | 
   117         protected:
 | 
| 
 | 
   118     
 | 
| 
 | 
   119             virtual void init();
 | 
| 
 | 
   120             virtual void clear();
 | 
| 
 | 
   121     
 | 
| 
 | 
   122             // 
 | 
| 
 | 
   123             void importSites(CharMatrix& data, bool allowMultipleMutations,
 | 
| 
 | 
   124                 double minimumExploitableData, unsigned int ignoreFrequency,
 | 
| 
 | 
   125                 std::string characterMapping, bool useZeroAsAncestral,
 | 
| 
 | 
   126                 bool ignoreOutgroup);
 | 
| 
 | 
   127 
 | 
| 
 | 
   128             // 
 | 
| 
 | 
   129             void analyzeSite(CharMatrix& data, unsigned int index, double maxMissingData, bool ignoreOutgroup); // analyzes a site, adds a Site to the Site container if the site is polymorphic
 | 
| 
 | 
   130             unsigned int getPopIndex(unsigned int label) const;  // returns v_npop if not found
 | 
| 
 | 
   131             
 | 
| 
 | 
   132             SitePolymorphism** v_sites;  // holder of polymorphic site addresses
 | 
| 
 | 
   133             bool* v_orientables;         // stores whether the sites are orientable or not
 | 
| 
 | 
   134             unsigned int* v_sitePositions;   // stores position of sites
 | 
| 
 | 
   135 
 | 
| 
 | 
   136             unsigned int  v_reserved;
 | 
| 
 | 
   137             unsigned int  v_ns;       // maximum number of sequences analyzed (max of sites' ns)
 | 
| 
 | 
   138             unsigned int  v_S;        // number of polymorphic sites
 | 
| 
 | 
   139             unsigned int  v_So;       // number of orientable sites
 | 
| 
 | 
   140             unsigned int  v_eta;      // number of mutation (whatever multiple)
 | 
| 
 | 
   141             double        v_nseff;    // average number of analyzed sequence
 | 
| 
 | 
   142             unsigned int  v_lseff;    // number of analyzed sites
 | 
| 
 | 
   143             double        v_nseffo;   // average number of analyzed sequences for analyzes with outgroup
 | 
| 
 | 
   144             unsigned int  v_lseffo;   // number of analyzed sites for analyzes with outgroup
 | 
| 
 | 
   145             unsigned int  v_npop;     // number of populations
 | 
| 
 | 
   146             unsigned int *v_popLabel; // label of each pop
 | 
| 
 | 
   147 
 | 
| 
 | 
   148             // options
 | 
| 
 | 
   149             bool          p_allowMultipleMutations;
 | 
| 
 | 
   150             double        p_minimumExploitableData;
 | 
| 
 | 
   151             std::string   p_characterMapping;
 | 
| 
 | 
   152             unsigned int  p_pos_sep_mapping;
 | 
| 
 | 
   153             bool          p_useZeroAsAncestral;
 | 
| 
 | 
   154             unsigned int  p_ignoreFrequency;
 | 
| 
 | 
   155 
 | 
| 
 | 
   156 
 | 
| 
 | 
   157     
 | 
| 
 | 
   158         private:
 | 
| 
 | 
   159             
 | 
| 
 | 
   160             BaseDiversity(const BaseDiversity& source) { }
 | 
| 
 | 
   161             
 | 
| 
 | 
   162             BaseDiversity& operator=(const BaseDiversity& source) {
 | 
| 
 | 
   163                 return *this;
 | 
| 
 | 
   164             }
 | 
| 
 | 
   165 
 | 
| 
 | 
   166     };
 | 
| 
 | 
   167 }
 | 
| 
 | 
   168 
 | 
| 
 | 
   169 #endif
 |