| 
1
 | 
     1 /*
 | 
| 
 | 
     2     Copyright 2009 Stéphane De Mita, Mathieu Siol
 | 
| 
 | 
     3 
 | 
| 
 | 
     4     This file is part of the EggLib library.
 | 
| 
 | 
     5 
 | 
| 
 | 
     6     EggLib is free software: you can redistribute it and/or modify
 | 
| 
 | 
     7     it under the terms of the GNU General Public License as published by
 | 
| 
 | 
     8     the Free Software Foundation, either version 3 of the License, or
 | 
| 
 | 
     9     (at your option) any later version.
 | 
| 
 | 
    10 
 | 
| 
 | 
    11     EggLib is distributed in the hope that it will be useful,
 | 
| 
 | 
    12     but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
| 
 | 
    13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
| 
 | 
    14     GNU General Public License for more details.
 | 
| 
 | 
    15 
 | 
| 
 | 
    16     You should have received a copy of the GNU General Public License
 | 
| 
 | 
    17     along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
 | 
| 
 | 
    18 */
 | 
| 
 | 
    19 
 | 
| 
 | 
    20 
 | 
| 
 | 
    21 #ifndef EGGLIB_CONVERT_HPP
 | 
| 
 | 
    22 #define EGGLIB_CONVERT_HPP
 | 
| 
 | 
    23 
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 #include "DataMatrix.hpp"
 | 
| 
 | 
    26 #include "Align.hpp"
 | 
| 
 | 
    27 #include "EggException.hpp"
 | 
| 
 | 
    28 #include "Random.hpp"
 | 
| 
 | 
    29 #include <string>
 | 
| 
 | 
    30 
 | 
| 
 | 
    31 #include "config.h"
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 #ifdef HAVE_LIBBPP_SEQ
 | 
| 
 | 
    34 #include <Bpp/Seq/Alphabet.all>
 | 
| 
 | 
    35 #include <Bpp/Seq/Sequence.h>
 | 
| 
 | 
    36 #include <Bpp/Seq/Container.all>
 | 
| 
 | 
    37 #endif
 | 
| 
 | 
    38 
 | 
| 
 | 
    39 
 | 
| 
 | 
    40 
 | 
| 
 | 
    41 namespace egglib {
 | 
| 
 | 
    42 
 | 
| 
 | 
    43 
 | 
| 
 | 
    44    /** \brief Performs conversion between sequence holder types
 | 
| 
 | 
    45     *
 | 
| 
 | 
    46     * \ingroup core
 | 
| 
 | 
    47     * 
 | 
| 
 | 
     48     * Static methods of this class allow conversion between sequence
 | 
| 
 | 
    49     * holder types implying parametrizable modifications.
 | 
| 
 | 
    50     * 
 | 
| 
 | 
    51     */
 | 
| 
 | 
     52     class Convert {
 | 
| 
 | 
     53 
 | 
| 
 | 
     54         public:
 | 
| 
 | 
     55         
 | 
| 
 | 
     56            /** \brief DataMatrix to Align conversion
 | 
| 
 | 
     57             * 
 | 
| 
 | 
     58             * By default, this method generates an Align instance
 | 
| 
 | 
     59             * containing only the polymorphic sites. The integers of
 | 
| 
 | 
     60             * the DataMatrix will be converted as follows: 0 to A, 1 to
 | 
| 
 | 
     61             * C, 2 to G and 3 to T. This behaviour can be largely
 | 
| 
 | 
     62             * modified using options.
 | 
| 
 | 
     63             * 
 | 
| 
 | 
     64             * \param dataMatrix DataMatrix instance.
 | 
| 
 | 
     65             * 
 | 
| 
 | 
     66             * \param length length of the desired alignment. Non-varying
 | 
| 
 | 
     67             * stretches of data will be introduced to reach the
 | 
| 
 | 
     68             * specified length. By default the positions of segregating
 | 
| 
 | 
     69             * sites will be determined from the positions given by the
 | 
| 
 | 
     70             * DataMatrix object. Those positions are expressed in a
 | 
| 
 | 
     71             * continuous range, and will be discretized. Mutations
 | 
| 
 | 
     72             * falling on the same site will be moved by one position
 | 
| 
 | 
     73             * left or right (always preserving the order of mutation
 | 
| 
 | 
     74             * sites). If positions are all zero (the default of the
 | 
| 
 | 
     75             * DataMatrix class) and if length is larger than the number
 | 
| 
 | 
     76             * of segregating sites, then all segregating sites will
 | 
| 
 | 
     77             * cluster on the left-hand side of the alignment.
 | 
| 
 | 
     78             * 
 | 
| 
 | 
     79             * \param random the address of a Random object used to 
 | 
| 
 | 
     80             * draw random numbers (for randomizing positions and/or
 | 
| 
 | 
     81             * non-varying states). If an address is provided but no
 | 
| 
 | 
     82             * random numbers are required, it is ignored. If no address
 | 
| 
 | 
     83             * is provided and random numbers are required, a Random
 | 
| 
 | 
     84             * instance is built internally.
 | 
| 
 | 
     85             * 
 | 
| 
 | 
     86             * \param randomizePositions if true, the positions specified
 | 
| 
 | 
     87             * in the DataMatrix object are ignored and the positions of
 | 
| 
 | 
     88             * mutations are drawn randomly along the interval (only if
 | 
| 
 | 
     89             * the specified length is larger than the number of
 | 
| 
 | 
     90             * segregating sites). If randomizePositions is false, the
 | 
| 
 | 
     91             * DataMatrix positions are used (see the length parameter).
 | 
| 
 | 
     92             * 
 | 
| 
 | 
     93             * \param randomizeNonVaryingStates if true, the stretches of
 | 
| 
 | 
     94             * conserved positions (between segregating sites) will be
 | 
| 
 | 
     95             * randomly drawn from the current symbol mapping. Otherwise,
 | 
| 
 | 
     96             * the symbol given by nonVaryingState will be used.
 | 
| 
 | 
     97             * 
 | 
| 
 | 
     98             * \param randomizeAlleles if true, alleles will be drawn
 | 
| 
 | 
     99             * randomly from the mapped characters. Note that if a
 | 
| 
 | 
    100             * genotype value is larger than the size of the mapping, it
 | 
| 
 | 
    101             * will be replaced by the character given by unknown,
 | 
| 
 | 
    102             * without randomization. In other words, with the mapping
 | 
| 
 | 
    103             * "ACGT", alleles 0, 1, 2 and 3 will be randomly assigned
 | 
| 
 | 
    104             * to these four characters, but larger and negative alleles
 | 
| 
 | 
    105             * will be assigned to the unknown character.
 | 
| 
 | 
    106             * 
 | 
| 
 | 
    107             * \param enforceLength specify whether an
 | 
| 
 | 
    108             * EggRuntimeError should be thrown when the number of
 | 
| 
 | 
    109             * polymorphic sites is larger than the specified length. If
 | 
| 
 | 
    110             * false (the default) and in cases where the specified
 | 
| 
 | 
    111             * length is too short to harbor all polymorphic sites, the
 | 
| 
 | 
    112             * alignment length will be increased as needed.
 | 
| 
 | 
    113             * 
 | 
| 
 | 
    114             * \param mapping a string giving the characters to assign to
 | 
| 
 | 
    115             * different integer values read from the DataMatrix. If
 | 
| 
 | 
    116             * the read value is 0, the first character of the string
 | 
| 
 | 
    117             * will be used; if the value is 1, the second character will
 | 
| 
 | 
    118             * be used, and so on. If the integer read is out of range
 | 
| 
 | 
    119             * (in particular, for any negative value), then the
 | 
| 
 | 
    120             * character given by unknown will be used. An empty string
 | 
| 
 | 
    121             * will always lead to alignments containing only the
 | 
| 
 | 
    122             * character given by unknown. The string "01" is suitable
 | 
| 
 | 
    123             * for binary data.
 | 
| 
 | 
    124             * 
 | 
| 
 | 
    125             * \param unknown the character to use if an integer genotype
 | 
| 
 | 
    126             * value is not mapped in the mapping string (that is, if
 | 
| 
 | 
    127             * the value is negative or the mapping string is too short).
 | 
| 
 | 
    128             * 
 | 
| 
 | 
    129             * \param nonVaryingState character to use for conserved
 | 
| 
 | 
    130             * stretches of data. It doesn't have to be included in the
 | 
| 
 | 
    131             * mapping. If randomizeNonVaryingStates is true, this
 | 
| 
 | 
    132             * argument is ignored.
 | 
| 
 | 
    133             * 
 | 
| 
 | 
    134             * \return The resulting Align object.
 | 
| 
 | 
    135             * 
 | 
| 
 | 
    136             */
 | 
| 
 | 
    137             static Align align(
 | 
| 
 | 
    138                 DataMatrix& dataMatrix,
 | 
| 
 | 
    139                 unsigned int length=0,
 | 
| 
 | 
    140                 Random* random=NULL,
 | 
| 
 | 
    141                 bool randomizePositions=false,
 | 
| 
 | 
    142                 bool randomizeNonVaryingStates=false,
 | 
| 
 | 
    143                 bool randomizeAlleles=false,
 | 
| 
 | 
    144                 bool enforceLength=false,
 | 
| 
 | 
    145                 std::string mapping="ACGT",
 | 
| 
 | 
    146                 char unknown='?',
 | 
| 
 | 
    147                 char nonVaryingState='A'
 | 
| 
 | 
    148             );
 | 
| 
 | 
    149 
 | 
| 
 | 
    150 
 | 
| 
 | 
    151 #ifdef HAVE_LIBBPP_SEQ
 | 
| 
 | 
    152 
 | 
| 
 | 
    153             /** \brief Converts an alignment to the equivalent Bio++ type
 | 
| 
 | 
    154             *
 | 
| 
 | 
    155             * During conversion, name information is lost (arbitrary
 | 
| 
 | 
    156             * names are generated in order to prevent duplicate names).
 | 
| 
 | 
    157             * The object is attached to an alphabet matching the passed
 | 
| 
 | 
    158             * integer. The names are bare rank integers (starting at the
 | 
| 
 | 
    159             * value given by *offset*).
 | 
| 
 | 
    160             *
 | 
| 
 | 
    161             * \param align the source alignment object.
 | 
| 
 | 
    162             * 
 | 
| 
 | 
    163             * \param alphabetID an integer indicating which alphabet to
 | 
| 
 | 
    164             * use:
 | 
| 
 | 
    165             *       - 1 for DNA
 | 
| 
 | 
    166             *       - 2 for RNA
 | 
| 
 | 
    167             *       - 3 for proteins
 | 
| 
 | 
    168             *       - 4 for standard codon
 | 
| 
 | 
    169             *       - 5 for vertebrate mitochondrial codon
 | 
| 
 | 
    170             *       - 6 for invertebrate mitochondrial codon
 | 
| 
 | 
    171             *       - 7 for echinoderm mitochondrial codon
 | 
| 
 | 
    172             *       .
 | 
| 
 | 
    173             * Other values will result in an exception.
 | 
| 
 | 
    174             * 
 | 
| 
 | 
    175             * \param outgroupFlag an integer indicating whether to
 | 
| 
 | 
    176             * include outgroup sequences:
 | 
| 
 | 
    177             *       - 0 use all sequences
 | 
| 
 | 
    178             *       - 1 use only sequences without 999 label (ingroup)
 | 
| 
 | 
    179             *       - 2 use only sequences with 999 label (outgroup)
 | 
| 
 | 
    180             *       .
 | 
| 
 | 
    181             * Other values will result in an exception.
 | 
| 
 | 
    182             * 
 | 
| 
 | 
    183             * \param offset enter an integer to shift the names of the
 | 
| 
 | 
    184             * resulting alignment (useful to merge alignments and ensure
 | 
| 
 | 
    185             * that names are not duplicated).
 | 
| 
 | 
    186             * 
 | 
| 
 | 
    187             * \return A Bio++ alignment.
 | 
| 
 | 
    188             * 
 | 
| 
 | 
    189             */
 | 
| 
 | 
    190             static bpp::AlignedSequenceContainer egglib2bpp(Align& align, unsigned int alphabetID, unsigned int outgroupFlag, unsigned int offset=0);
 | 
| 
 | 
    191 
 | 
| 
 | 
    192 #endif
 | 
| 
 | 
    193 
 | 
| 
 | 
    194 
 | 
| 
 | 
    195 
 | 
| 
 | 
    196         protected:
 | 
| 
 | 
    197 
 | 
| 
 | 
    198            /** \brief Protected default constructor
 | 
| 
 | 
    199             * The public interface is made of static methods only; the class is not meant to be instantiated.
 | 
| 
 | 
    200             */
 | 
| 
 | 
    201             Convert() { }
 | 
| 
 | 
    202 
 | 
| 
 | 
    203 
 | 
| 
 | 
    204            /** \brief Protected copy constructor
 | 
| 
 | 
    205             * Empty body: the class declares no non-static data members, so there is nothing to copy.
 | 
| 
 | 
    206             */
 | 
| 
 | 
    207             Convert(const Convert& source) { }
 | 
| 
 | 
    208 
 | 
| 
 | 
    209 
 | 
| 
 | 
    210            /** \brief Protected copy assignment operator
 | 
| 
 | 
    211             * Copies nothing and returns a reference to the current object.
 | 
| 
 | 
    212             */
 | 
| 
 | 
    213             Convert& operator=(const Convert& source) { return *this; }
 | 
| 
 | 
    214 
 | 
| 
 | 
    215 
 | 
| 
 | 
    216            /** \brief Protected virtual destructor
 | 
| 
 | 
    217             * Empty body.
 | 
| 
 | 
    218             */
 | 
| 
 | 
    219             virtual ~Convert() { }
 | 
| 
 | 
    220             // Static Bio++ alphabet instances, declared only when HAVE_LIBBPP_SEQ is defined. NOTE(review): presumably the alphabets selected by egglib2bpp's alphabetID values 1 to 7 -- confirm in the implementation file.
 | 
| 
 | 
    221 #ifdef HAVE_LIBBPP_SEQ
 | 
| 
 | 
    222             static bpp::DNA dnaAlphabet;
 | 
| 
 | 
    223             static bpp::RNA rnaAlphabet;
 | 
| 
 | 
    224             static bpp::ProteicAlphabet proteicAlphabet;
 | 
| 
 | 
    225             static bpp::StandardCodonAlphabet standardCodonAlphabet;
 | 
| 
 | 
    226             static bpp::VertebrateMitochondrialCodonAlphabet vertebrateMitochondrialCodonAlphabet;
 | 
| 
 | 
    227             static bpp::InvertebrateMitochondrialCodonAlphabet invertebrateMitochondrialCodonAlphabet;
 | 
| 
 | 
    228             static bpp::EchinodermMitochondrialCodonAlphabet echinodermMitochondrialCodonAlphabet;
 | 
| 
 | 
    229 #endif
 | 
| 
 | 
    230 
 | 
| 
 | 
    231     };
 | 
| 
 | 
   232 }
 | 
| 
 | 
   233 
 | 
| 
 | 
   234 #endif
 |