diff egglib/egglib-2.1.5/include/egglib-cpp/LinkageDisequilibrium.hpp @ 9:98c37a5d67f4 draft

Uploaded
author dereeper
date Wed, 07 Feb 2018 22:08:47 -0500
parents 420b57c3c185
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/egglib/egglib-2.1.5/include/egglib-cpp/LinkageDisequilibrium.hpp	Wed Feb 07 22:08:47 2018 -0500
@@ -0,0 +1,233 @@
+/*
+    Copyright 2009 Stéphane De Mita, Mathieu Siol
+
+    This file is part of the EggLib library.
+
+    EggLib is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    EggLib is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef EGGLIB_LINKAGEDISEQUILIBRUM_HPP
+#define EGGLIB_LINKAGEDISEQUILIBRUM_HPP
+
+
+#include "BaseDiversity.hpp"
+#include "EggException.hpp"
+
+
+namespace egglib {
+
+   /** \brief Analyzes linkage disequilibrium per pair of polymorphic sites
+    *
+    * \ingroup polymorphism
+    * 
+    * The class considers an alignment and detects polymorphic sites
+    * using the BaseDiversity functionality (shared with other classes
+    * of the module). Only sites with exactly two alleles are
+    * considered. Statistics of pairwise linkage disequilibrium can
+    * be accessed by pair index (note that out-of-range errors are not
+    * checked). Population labels are ignored (but outgroups are
+    * excluded from the analysis).
+    *
+    */
+    class LinkageDisequilibrium : public BaseDiversity {
+
+      public:
+
+        /// Default constructor
+        LinkageDisequilibrium();
+
+        /// Destructor
+        virtual ~LinkageDisequilibrium();
+
+       /** \brief Analyzes polymorphic sites of an alignment
+        * 
+        * \param data an alignment object (subclass of CharMatrix).
+        * The presence of outgroup or of different populations will
+        * be detected based on the populationLabel members of the
+        * passed object. The populationLabel 999 will be interpreted
+        * as outgroups. If several outgroups are passed, sites were
+        * the outgroups are not consistent will be treated as "non-
+        * orientable".
+        * 
+        * \param minimumExploitableData site where the non-missing
+        * data (as defined by characterMapping) are at a frequency
+        * larger than this value will be removed from the analysis.
+        * Use 1. to take only 'complete' sites into account and 0.
+        * to use all sites.
+        * 
+        * \param ignoreFrequency removes sites that are polymorphic
+        * because of an allele at absolute frequency smaller than or
+        * equal to this value. If ignoreFrequency=1, no sites are
+        * removed, if ignoreFrequency=1, singleton sites are
+        * ignored. Such sites are completely removed from the
+        * analysis (not counted in lseff). Note that if more than
+        * one mutation is allowed, the site is removed only if all
+        * the alleles but one are smaller than or equal to this
+        * value. For example, an alignment column AAAAAAGAAT is
+        * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is
+        * conserved (including the third allele T which is a
+        * singleton).
+        * 
+        * \param characterMapping a string giving the list of
+        * characters that should be considered as valid data. If a
+        * space is present in the string, the characters left of the
+        * space will be treated as valid data and the characters
+        * right of the space will be treated as missing data, that
+        * is tolerated but ignored. All characters not in the string
+        * will cause an EggInvalidCharacterError to be raised.
+        */
+        void load(CharMatrix& data,
+                double minimumExploitableData=1.,
+                unsigned int ignoreFrequency=0,
+                std::string characterMapping=dnaMapping);
+    
+    
+        /// Number of pairs contained in the instance
+        unsigned int numberOfPairs() const;
+
+        /// Alignment distance between a given pair
+        int d(unsigned int pair_index);
+
+        /// D statistic for a given pair
+        double D(unsigned int pair_index);
+
+        /// D' statistic for a given pair
+        double Dp(unsigned int pair_index);
+
+        /// r statistic for a given pair
+        double r(unsigned int pair_index);
+
+        /// r2 statistic for a given pair
+        double r2(unsigned int pair_index);
+
+        /// position of the first site for a given pair
+        unsigned int site1(unsigned int pair_index);
+
+        /// position of the second site for a given pair
+        unsigned int site2(unsigned int pair_index);
+
+        /// correlation coefficient between r2 and distance
+        double correl() const;
+        
+       /** \brief Computes the minimal number of recombination events
+        * 
+        * The computation is performed as described in Hudson, RR and
+        * NL Kaplan. 1985. Statistical properties of the number of
+        * recombination events in the history of a sample of DNA
+        * sequences. Genetics 111: 147-164. The returned parameter is
+        * the minimal number of recombination events, given by the
+        * number of non-overlapping pairs of segregating sites violating
+        * the rule of the four gamete. Only sites with two alleles are
+        * considered. Note that homoplasy (multiple mutations) mimicks
+        * recombination. The result of this function is not stored
+        * in this instance, and re-computed at each call.
+        * 
+        * \param data the same CharMatrix instance as passed to the load
+        * method. The instance must not have been modified.
+        * 
+        */
+        unsigned int Rmin(CharMatrix& data) const;
+
+
+
+      protected:
+      
+        // adds a pair of polymorphic sites
+        // assume position2>position1,
+        //  sites are polymorphic with exactly 2 alleles
+        void add(CharMatrix& data, unsigned int position1, unsigned int position2);
+
+        // Constructor help
+        void init();
+                
+        // Destructor helper
+        void clear();
+        
+        // Resizes arrays
+        void reset();
+        
+        // Small helper
+        inline double min(double a, double b) { return (a>b)?a:b;}
+
+        // Small helper
+        inline double max(double a, double b) { return (a>b)?b:a;}
+
+        // Small helper
+        inline void check(unsigned int pos) {  if (pos>=_n) throw EggRuntimeError("tried to access an invalid index"); }
+
+       /* Performs correlation
+        *
+        * This function works independently from the rest of the class.
+        *
+        * \param n length of data arrays.
+        * \param x first data vector.
+        * \param y second data vector.
+        * \param r variable to receive the correlation coefficient.
+        * \param a variable to receive the regression slope.
+        */
+        static void _correl(unsigned int n, const int* x, const double* y, double& r, double& a);
+
+        // Distance between pairs
+        int* _d;
+        
+        // D (classical) measure of LD
+        double *_D;
+        
+        // D'
+        double *_Dp;
+        
+        // r, correlation coefficient
+        double *_r;
+        
+        // square r
+        double *_r2;
+        
+        // Data array (not managed by the instance)
+        unsigned int *_site1;
+
+        // Data array (not managed by the instance)
+        unsigned int *_site2;
+        
+        // Number of pairs
+        unsigned int _n;
+
+     private:
+     
+        /// Copy constructor not available
+        LinkageDisequilibrium(const LinkageDisequilibrium&) { }
+
+        /// Assignment operator not available
+        LinkageDisequilibrium& operator=(const LinkageDisequilibrium&) {
+            return *this;
+        }
+
+
+        class Interval {
+            public:
+                Interval(unsigned int, unsigned int);
+                unsigned int a() const;
+                unsigned int b() const;
+                bool good() const;
+                void set_false();
+            private:
+                unsigned int _a;
+                unsigned int _b;
+                unsigned int _good;
+        };
+
+
+  };
+}
+
+#endif