| 1 | 1 /* | 
|  | 2     Copyright 2009 Stéphane De Mita, Mathieu Siol | 
|  | 3 | 
|  | 4     This file is part of the EggLib library. | 
|  | 5 | 
|  | 6     EggLib is free software: you can redistribute it and/or modify | 
|  | 7     it under the terms of the GNU General Public License as published by | 
|  | 8     the Free Software Foundation, either version 3 of the License, or | 
|  | 9     (at your option) any later version. | 
|  | 10 | 
|  | 11     EggLib is distributed in the hope that it will be useful, | 
|  | 12     but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|  | 14     GNU General Public License for more details. | 
|  | 15 | 
|  | 16     You should have received a copy of the GNU General Public License | 
|  | 17     along with EggLib.  If not, see <http://www.gnu.org/licenses/>. | 
|  | 18 */ | 
|  | 19 | 
|  | 20 #ifndef EGGLIB_BASEDIVERSITY_HPP | 
|  | 21 #define EGGLIB_BASEDIVERSITY_HPP | 
|  | 22 | 
|  | 23 #include "CharMatrix.hpp" | 
|  | 24 #include "SitePolymorphism.hpp" | 
|  | 25 #include <string> | 
|  | 26 | 
|  | 27 /** \defgroup polymorphism polymorphism | 
|  | 28  * | 
|  | 29  * \brief Diversity analyses | 
|  | 30  * | 
|  | 31  * Two classes are contained in this module: NucleotideDiversity, that | 
|  | 32  * performs site-centered polymorphism analyses, and HaplotypeDiversity, | 
|  | 33  * that performs haplotype-centered analyses. The detection of | 
|  | 34  * polymorphic sites is common to both, through the base class | 
|  | 35  * BaseDiversity. However this phase must be repeated when stats from | 
|  | 36  * the two classes are needed. To reduce the computational burden, the | 
|  | 37  * function reserve() can be used, that directly allocates needed memory | 
|  | 38  * when the eventual number of polymorphic sites is known prior to | 
|  | 39  * analysis (even if not precisely). For both classes, a set of | 
|  | 40  * statistics are computed immediately upon load of a data set. For | 
|  | 41  * NucleotideDiversity, additional statistics are computed per group | 
|  | 42  * upon use of the corresponding accessors. This number of operations | 
|  | 43  * performed several times is strictly limited. This is particularly | 
|  | 44  * useful when different statistics are needed for a given alignment. | 
|  | 45  * However, this system allows not computing unnecessary statistics to | 
|  | 46  * a certain extent. | 
|  | 47  * | 
|  | 48  */ | 
|  | 49 | 
|  | 50 namespace egglib { | 
|  | 51 | 
|  | 52     /** \brief Base class of diversity classes | 
|  | 53     * | 
|  | 54     * Mutualizes the analysis of polymorphic sites through the method | 
|  | 55     * importSites() and related accessors. | 
|  | 56     * | 
|  | 57     * \ingroup polymorphism | 
|  | 58     * | 
|  | 59     */ | 
|  | 60     class BaseDiversity { | 
|  | 61 | 
|  | 62         public: | 
|  | 63 | 
|  | 64            /** \brief Constructor | 
|  | 65             * | 
|  | 66             */ | 
|  | 67             BaseDiversity(); | 
|  | 68 | 
|  | 69            /** \brief Destructor | 
|  | 70             * | 
|  | 71             */ | 
|  | 72             virtual ~BaseDiversity(); | 
|  | 73 | 
|  | 74            /** \brief Reserve sufficient memory for a given number of | 
|  | 75             * polymorphic sites. | 
|  | 76             * | 
|  | 77             * This method makes importSite function faster when you | 
|  | 78             * already know how many polymorphic sites to expect, since | 
|  | 79             * the necessary memory will be allocated prior the screening | 
|  | 80             * of data. It is possible to use reserve() even if with a | 
|  | 81             * number of sites that is not matching what importSites() | 
|  | 82             * will find. | 
|  | 83             * | 
|  | 84             * \param numberOfSites a strictly positive integer. | 
|  | 85             * | 
|  | 86             */ | 
|  | 87             virtual void reserve(unsigned int numberOfSites); | 
|  | 88 | 
|  | 89             /// Gets a site | 
|  | 90             const SitePolymorphism* get_site(unsigned int index) const; | 
|  | 91 | 
|  | 92             /// Gets a site position | 
|  | 93             unsigned int get_position(unsigned int index) const; | 
|  | 94 | 
|  | 95            /** \brief Predefined mapping string for DNA data | 
|  | 96             * | 
|  | 97             */ | 
|  | 98             static const std::string dnaMapping; | 
|  | 99 | 
|  | 100 | 
|  | 101            /** \brief Predefined mapping string for RNA data | 
|  | 102             * | 
|  | 103             */ | 
|  | 104             static const std::string rnaMapping; | 
|  | 105 | 
|  | 106 | 
|  | 107            /** \brief Predefined mapping string for amino acid data | 
|  | 108             * | 
|  | 109             */ | 
|  | 110             static const std::string aaMapping; | 
|  | 111 | 
|  | 112 | 
|  | 113             /// Clears and re-initializes object | 
|  | 114             virtual void reset(); | 
|  | 115 | 
|  | 116 | 
|  | 117         protected: | 
|  | 118 | 
|  | 119             virtual void init(); | 
|  | 120             virtual void clear(); | 
|  | 121 | 
|  | 122             // | 
|  | 123             void importSites(CharMatrix& data, bool allowMultipleMutations, | 
|  | 124                 double minimumExploitableData, unsigned int ignoreFrequency, | 
|  | 125                 std::string characterMapping, bool useZeroAsAncestral, | 
|  | 126                 bool ignoreOutgroup); | 
|  | 127 | 
|  | 128             // | 
|  | 129             void analyzeSite(CharMatrix& data, unsigned int index, double maxMissingData, bool ignoreOutgroup); // analyzes a site, adds a Site to the Site container if the site is polymorphic | 
|  | 130             unsigned int getPopIndex(unsigned int label) const;  // returns v_npop if not found | 
|  | 131 | 
|  | 132             SitePolymorphism** v_sites;  // holder of polymorphic site addresses | 
|  | 133             bool* v_orientables;         // stores whether the sites are orientable or not | 
|  | 134             unsigned int* v_sitePositions;   // stores position of sites | 
|  | 135 | 
|  | 136             unsigned int  v_reserved; | 
|  | 137             unsigned int  v_ns;       // maximum number of sequences analyzed (max of sites' ns) | 
|  | 138             unsigned int  v_S;        // number of polymorphic sites | 
|  | 139             unsigned int  v_So;       // number of orientable sites | 
|  | 140             unsigned int  v_eta;      // number of mutation (whatever multiple) | 
|  | 141             double        v_nseff;    // average number of analyzed sequence | 
|  | 142             unsigned int  v_lseff;    // number of analyzed sites | 
|  | 143             double        v_nseffo;   // average number of analyzed sequences for analyzes with outgroup | 
|  | 144             unsigned int  v_lseffo;   // number of analyzed sites for analyzes with outgroup | 
|  | 145             unsigned int  v_npop;     // number of populations | 
|  | 146             unsigned int *v_popLabel; // label of each pop | 
|  | 147 | 
|  | 148             // options | 
|  | 149             bool          p_allowMultipleMutations; | 
|  | 150             double        p_minimumExploitableData; | 
|  | 151             std::string   p_characterMapping; | 
|  | 152             unsigned int  p_pos_sep_mapping; | 
|  | 153             bool          p_useZeroAsAncestral; | 
|  | 154             unsigned int  p_ignoreFrequency; | 
|  | 155 | 
|  | 156 | 
|  | 157 | 
|  | 158         private: | 
|  | 159 | 
|  | 160             BaseDiversity(const BaseDiversity& source) { } | 
|  | 161 | 
|  | 162             BaseDiversity& operator=(const BaseDiversity& source) { | 
|  | 163                 return *this; | 
|  | 164             } | 
|  | 165 | 
|  | 166     }; | 
|  | 167 } | 
|  | 168 | 
|  | 169 #endif |