diff egglib/egglib-2.1.5/include/egglib-cpp/Convert.hpp @ 9:98c37a5d67f4 draft

Uploaded
author dereeper
date Wed, 07 Feb 2018 22:08:47 -0500
parents 420b57c3c185
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/egglib/egglib-2.1.5/include/egglib-cpp/Convert.hpp	Wed Feb 07 22:08:47 2018 -0500
@@ -0,0 +1,234 @@
+/*
+    Copyright 2009 Stéphane De Mita, Mathieu Siol
+
+    This file is part of the EggLib library.
+
+    EggLib is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    EggLib is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef EGGLIB_CONVERT_HPP
+#define EGGLIB_CONVERT_HPP
+
+
+#include "DataMatrix.hpp"
+#include "Align.hpp"
+#include "EggException.hpp"
+#include "Random.hpp"
+#include <string>
+
+#include "config.h"
+
+#ifdef HAVE_LIBBPP_SEQ
+#include <Bpp/Seq/Alphabet.all>
+#include <Bpp/Seq/Sequence.h>
+#include <Bpp/Seq/Container.all>
+#endif
+
+
+
+namespace egglib {
+
+
+   /** \brief Performs conversion between sequence holder types
+    *
+    * \ingroup core
+    * 
+    * Static methods of this class allows conversion between sequence
+    * holder types implying parametrizable modifications.
+    * 
+    */
+    class Convert {
+
+        public:
+        
+           /** \brief DataMatrix to Align conversion
+            * 
+            * By defaut, this method generates an Align instance
+            * containing only the polymorphic sites. The integers of
+            * the DataMatrix will be converted as follow: 0 to A, 1 to
+            * C, 2 to G and 3 to T. This behaviour can be largely
+            * modified using options.
+            * 
+            * \param dataMatrix DataMatrix instance.
+            * 
+            * \param length length of the desired alignment. Non-varying
+            * stretches of data will be introduced to reach the
+            * specified length. By default the positions of segregating
+            * sites will be determined from the positions given by the
+            * DataMatrix object. Those positions are expressed in a
+            * continuous range, and will be discretized. Mutations
+            * falling on the same site will be moved of one position
+            * left or right (always preserving the order of mutation
+            * sites). If positions are all zero (the default of the
+            * DataMatrix class) and if length is larger than the number
+            * of segregating sites, then all segregating sites will
+            * cluster on the left-hand side of the alignment.
+            * 
+            * \param random the address to a Random object allowing to 
+            * draw random numbers (for randomizing positions and/or
+            * non-varying states). If an address is provided but no
+            * random numbers are required, it is ignored. If no address
+            * if provided and random numbers are required, a Random
+            * instance is built internally.
+            * 
+            * \param randomizePositions if true, the positions specified
+            * in the DataMatrix objects are ignored and the positions of
+            * mutations are drawn randomly along the interval (only if
+            * the specified length is larger than the number of
+            * segregating sites). If randomizePositions and false and
+            * positions are not
+            * 
+            * \param enforceLength specify whether a
+            * EggRuntimeError should be thrown when the number of
+            * polymorphic sites is larger than the specified length. If
+            * false (the default) and in cases where the specified
+            * length is too short to harbor all polymorphic sites, the
+            * alignment length will be increased as needed.
+            * 
+            * \param randomizeNonVaryingStates if true, the stretches of
+            * conserved positions (between segregating sites) will be
+            * randomly drawn from the current symbol mapping. Otherwise,
+            * the symbol given by fixed will be used.
+            * 
+            * \param randomizeAlleles if true, alleles will be drawn
+            * randomly from the mapped characters. Note that if a
+            * genotype value is larger than the size of the mapping, it
+            * will be replaced by the character given by unknown,
+            * without randomization. In other words, with the mapping
+            * "ACGT", alleles 0, 1, 2 and 3 will be randomly assigned
+            * to these four characters, but larger and negative alleles
+            * will be assigned to the unknown character.
+            * 
+            * \param mapping a string given the character to assign to
+            * different character values read from the DataMatrix. If
+            * the read value is 0, the first character of the string
+            * will used, the the value is 1, the second character will
+            * be used, and so on. If the integer read is out of range
+            * (in particular, for any negative value), then the
+            * character given by unknown will be used. An empty string
+            * will always lead to alignments containing only the
+            * character given by unknown. The string "01" is suitable
+            * for binary data.
+            * 
+            * \param unknown the character to use if an integer genotype
+            * value is not mapped in the mapping string (that is, if
+            * the mapping string is too short).
+            * 
+            * \param nonVaryingState character to use for conserved
+            * stretches of data. It doesn't have to be included in the
+            * mapping. If randomizeNonVaryingState is true, this
+            * argument is ignored.
+            * 
+            * \return The resulting Align object.
+            * 
+            */
+            static Align align(
+                DataMatrix& dataMatrix,
+                unsigned int length=0,
+                Random* random=NULL,
+                bool randomizePositions=false,
+                bool randomizeNonVaryingStates=false,
+                bool randomizeAlleles=false,
+                bool enforceLength=false,
+                std::string mapping="ACGT",
+                char unknown='?',
+                char nonVaryingState='A'
+            );
+
+
+#ifdef HAVE_LIBBPP_SEQ
+
+            /** \brief Converts an alignment to the equivalent Bio++ type
+            *
+            * During conversion, name information is lost (arbitrary
+            * names are generated in order toprevent duplicate names).
+            * The object is attached to an alphabet matching the passed
+            * integer. The names are bare rank integers (starting at the
+            * value giving by *offset*).
+            *
+            * \param align the source alignment object.
+            * 
+            * \param alphabetID an integer indicating which alphabet to
+            * use:
+            *       - 1 for DNA
+            *       - 2 for RNA
+            *       - 3 for proteins
+            *       - 4 for standard codon
+            *       - 5 for vertebrate mitochondrial codon
+            *       - 6 for invertebrate mitochondrial codon
+            *       - 7 for echinoderm mitochondrial codon
+            *       .
+            * Other values will result in an exception.
+            * 
+            * \param outgroupFlag an integer indicating whether to
+            * include outgroup sequences:
+            *       - 0 use all sequences
+            *       - 1 use only sequences without 999 label (ingroup)
+            *       - 2 use only sequences with 999 label (outgroup)
+            *       .
+            * Other values will result in an exception.
+            * 
+            * \param offset enter an integer to shift the names of the
+            * resulting alignment (useful to merge alignment and ensure
+            * that names are not duplicated).
+            * 
+            * \return A Bio++ alignment.
+            * 
+            */
+            static bpp::AlignedSequenceContainer egglib2bpp(Align& align, unsigned int alphabetID, unsigned int outgroupFlag, unsigned int offset=0);
+
+#endif
+
+
+
+        protected:
+
+           /** \brief This class cannot be instantiated
+            * 
+            */
+            Convert() { }
+
+
+           /** \brief This class cannot be instantiated
+            * 
+            */
+            Convert(const Convert& source) { }
+
+
+           /** \brief This class cannot be instantiated
+            * 
+            */
+            Convert& operator=(const Convert& source) { return *this; }
+
+
+           /** \brief This class cannot be instantiated
+            * 
+            */
+            virtual ~Convert() { }
+
+#ifdef HAVE_LIBBPP_SEQ
+            static bpp::DNA dnaAlphabet;
+            static bpp::RNA rnaAlphabet;
+            static bpp::ProteicAlphabet proteicAlphabet;
+            static bpp::StandardCodonAlphabet standardCodonAlphabet;
+            static bpp::VertebrateMitochondrialCodonAlphabet vertebrateMitochondrialCodonAlphabet;
+            static bpp::InvertebrateMitochondrialCodonAlphabet invertebrateMitochondrialCodonAlphabet;
+            static bpp::EchinodermMitochondrialCodonAlphabet echinodermMitochondrialCodonAlphabet;
+#endif
+
+    };
+}
+
+#endif