Mercurial > repos > dereeper > sniplay
comparison egglib/egglib-2.1.5/include/egglib-cpp/Convert.hpp @ 1:420b57c3c185 draft
Uploaded
| author | dereeper |
|---|---|
| date | Fri, 10 Jul 2015 04:39:30 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:3e19d0dfcf3e | 1:420b57c3c185 |
|---|---|
| 1 /* | |
| 2 Copyright 2009 Stéphane De Mita, Mathieu Siol | |
| 3 | |
| 4 This file is part of the EggLib library. | |
| 5 | |
| 6 EggLib is free software: you can redistribute it and/or modify | |
| 7 it under the terms of the GNU General Public License as published by | |
| 8 the Free Software Foundation, either version 3 of the License, or | |
| 9 (at your option) any later version. | |
| 10 | |
| 11 EggLib is distributed in the hope that it will be useful, | |
| 12 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 GNU General Public License for more details. | |
| 15 | |
| 16 You should have received a copy of the GNU General Public License | |
| 17 along with EggLib. If not, see <http://www.gnu.org/licenses/>. | |
| 18 */ | |
| 19 | |
| 20 | |
| 21 #ifndef EGGLIB_CONVERT_HPP | |
| 22 #define EGGLIB_CONVERT_HPP | |
| 23 | |
| 24 | |
| 25 #include "DataMatrix.hpp" | |
| 26 #include "Align.hpp" | |
| 27 #include "EggException.hpp" | |
| 28 #include "Random.hpp" | |
| 29 #include <string> | |
| 30 | |
| 31 #include "config.h" | |
| 32 | |
| 33 #ifdef HAVE_LIBBPP_SEQ | |
| 34 #include <Bpp/Seq/Alphabet.all> | |
| 35 #include <Bpp/Seq/Sequence.h> | |
| 36 #include <Bpp/Seq/Container.all> | |
| 37 #endif | |
| 38 | |
| 39 | |
| 40 | |
| 41 namespace egglib { | |
| 42 | |
| 43 | |
| 44 /** \brief Performs conversion between sequence holder types | |
| 45 * | |
| 46 * \ingroup core | |
| 47 * | |
| 48 * Static methods of this class allow conversion between sequence | |
| 49 * holder types, involving parametrizable modifications. | |
| 50 * | |
| 51 */ | |
| 52 class Convert { | |
| 53 | |
| 54 public: | |
| 55 | |
| 56 /** \brief DataMatrix to Align conversion | |
| 57 * | |
| 58 * By default, this method generates an Align instance | |
| 59 * containing only the polymorphic sites. The integers of | |
| 60 * the DataMatrix will be converted as follows: 0 to A, 1 to | |
| 61 * C, 2 to G and 3 to T. This behaviour can be largely | |
| 62 * modified using options. | |
| 63 * | |
| 64 * \param dataMatrix DataMatrix instance. | |
| 65 * | |
| 66 * \param length length of the desired alignment. Non-varying | |
| 67 * stretches of data will be introduced to reach the | |
| 68 * specified length. By default the positions of segregating | |
| 69 * sites will be determined from the positions given by the | |
| 70 * DataMatrix object. Those positions are expressed in a | |
| 71 * continuous range, and will be discretized. Mutations | |
| 72 * falling on the same site will be moved by one position | |
| 73 * left or right (always preserving the order of mutation | |
| 74 * sites). If positions are all zero (the default of the | |
| 75 * DataMatrix class) and if length is larger than the number | |
| 76 * of segregating sites, then all segregating sites will | |
| 77 * cluster on the left-hand side of the alignment. | |
| 78 * | |
| 79 * \param random the address of a Random object used to | |
| 80 * draw random numbers (for randomizing positions and/or | |
| 81 * non-varying states). If an address is provided but no | |
| 82 * random numbers are required, it is ignored. If no address | |
| 83 * is provided and random numbers are required, a Random | |
| 84 * instance is built internally. | |
| 85 * | |
| 86 * \param randomizePositions if true, the positions specified | |
| 87 * in the DataMatrix objects are ignored and the positions of | |
| 88 * mutations are drawn randomly along the interval (only if | |
| 89 * the specified length is larger than the number of | |
| 90 * segregating sites). If randomizePositions is false, the | |
| 91 * positions given by the DataMatrix object are used. | |
| 92 * | |
| 93 * \param enforceLength specifies whether an | |
| 94 * EggRuntimeError should be thrown when the number of | |
| 95 * polymorphic sites is larger than the specified length. If | |
| 96 * false (the default) and in cases where the specified | |
| 97 * length is too short to harbor all polymorphic sites, the | |
| 98 * alignment length will be increased as needed. | |
| 99 * | |
| 100 * \param randomizeNonVaryingStates if true, the stretches of | |
| 101 * conserved positions (between segregating sites) will be | |
| 102 * randomly drawn from the current symbol mapping. Otherwise, | |
| 103 * the symbol given by nonVaryingState will be used. | |
| 104 * | |
| 105 * \param randomizeAlleles if true, alleles will be drawn | |
| 106 * randomly from the mapped characters. Note that if a | |
| 107 * genotype value is larger than the size of the mapping, it | |
| 108 * will be replaced by the character given by unknown, | |
| 109 * without randomization. In other words, with the mapping | |
| 110 * "ACGT", alleles 0, 1, 2 and 3 will be randomly assigned | |
| 111 * to these four characters, but larger and negative alleles | |
| 112 * will be assigned to the unknown character. | |
| 113 * | |
| 114 * \param mapping a string giving the characters to assign to | |
| 115 * the different integer values read from the DataMatrix. If | |
| 116 * the read value is 0, the first character of the string | |
| 117 * will be used; if the value is 1, the second character will | |
| 118 * be used, and so on. If the integer read is out of range | |
| 119 * (in particular, for any negative value), then the | |
| 120 * character given by unknown will be used. An empty string | |
| 121 * will always lead to alignments containing only the | |
| 122 * character given by unknown. The string "01" is suitable | |
| 123 * for binary data. | |
| 124 * | |
| 125 * \param unknown the character to use if an integer genotype | |
| 126 * value is not mapped in the mapping string (that is, if | |
| 127 * the mapping string is too short). | |
| 128 * | |
| 129 * \param nonVaryingState character to use for conserved | |
| 130 * stretches of data. It doesn't have to be included in the | |
| 131 * mapping. If randomizeNonVaryingStates is true, this | |
| 132 * argument is ignored. | |
| 133 * | |
| 134 * \return The resulting Align object. | |
| 135 * | |
| 136 */ | |
| 137 static Align align( | |
| 138 DataMatrix& dataMatrix, | |
| 139 unsigned int length=0, | |
| 140 Random* random=NULL, | |
| 141 bool randomizePositions=false, | |
| 142 bool randomizeNonVaryingStates=false, | |
| 143 bool randomizeAlleles=false, | |
| 144 bool enforceLength=false, | |
| 145 std::string mapping="ACGT", | |
| 146 char unknown='?', | |
| 147 char nonVaryingState='A' | |
| 148 ); | |
| 149 | |
| 150 | |
| 151 #ifdef HAVE_LIBBPP_SEQ | |
| 152 | |
| 153 /** \brief Converts an alignment to the equivalent Bio++ type | |
| 154 * | |
| 155 * During conversion, name information is lost (arbitrary | |
| 156 * names are generated in order to prevent duplicate names). | |
| 157 * The object is attached to an alphabet matching the passed | |
| 158 * integer. The names are bare rank integers (starting at the | |
| 159 * value given by *offset*). | |
| 160 * | |
| 161 * \param align the source alignment object. | |
| 162 * | |
| 163 * \param alphabetID an integer indicating which alphabet to | |
| 164 * use: | |
| 165 * - 1 for DNA | |
| 166 * - 2 for RNA | |
| 167 * - 3 for proteins | |
| 168 * - 4 for standard codon | |
| 169 * - 5 for vertebrate mitochondrial codon | |
| 170 * - 6 for invertebrate mitochondrial codon | |
| 171 * - 7 for echinoderm mitochondrial codon | |
| 172 * . | |
| 173 * Other values will result in an exception. | |
| 174 * | |
| 175 * \param outgroupFlag an integer indicating whether to | |
| 176 * include outgroup sequences: | |
| 177 * - 0 use all sequences | |
| 178 * - 1 use only sequences without 999 label (ingroup) | |
| 179 * - 2 use only sequences with 999 label (outgroup) | |
| 180 * . | |
| 181 * Other values will result in an exception. | |
| 182 * | |
| 183 * \param offset enter an integer to shift the names of the | |
| 184 * resulting alignment (useful to merge alignments and ensure | |
| 185 * that names are not duplicated). | |
| 186 * | |
| 187 * \return A Bio++ alignment. | |
| 188 * | |
| 189 */ | |
| 190 static bpp::AlignedSequenceContainer egglib2bpp(Align& align, unsigned int alphabetID, unsigned int outgroupFlag, unsigned int offset=0); | |
| 191 | |
| 192 #endif | |
| 193 | |
| 194 | |
| 195 | |
| 196 protected: | |
| 197 | |
| 198 /** \brief This class cannot be instantiated | |
| 199 * | |
| 200 */ | |
| 201 Convert() { } | |
| 202 | |
| 203 | |
| 204 /** \brief This class cannot be instantiated | |
| 205 * | |
| 206 */ | |
| 207 Convert(const Convert& source) { } | |
| 208 | |
| 209 | |
| 210 /** \brief This class cannot be instantiated | |
| 211 * | |
| 212 */ | |
| 213 Convert& operator=(const Convert& source) { return *this; } | |
| 214 | |
| 215 | |
| 216 /** \brief This class cannot be instantiated | |
| 217 * | |
| 218 */ | |
| 219 virtual ~Convert() { } | |
| 220 | |
| 221 #ifdef HAVE_LIBBPP_SEQ | |
| 222 static bpp::DNA dnaAlphabet; | |
| 223 static bpp::RNA rnaAlphabet; | |
| 224 static bpp::ProteicAlphabet proteicAlphabet; | |
| 225 static bpp::StandardCodonAlphabet standardCodonAlphabet; | |
| 226 static bpp::VertebrateMitochondrialCodonAlphabet vertebrateMitochondrialCodonAlphabet; | |
| 227 static bpp::InvertebrateMitochondrialCodonAlphabet invertebrateMitochondrialCodonAlphabet; | |
| 228 static bpp::EchinodermMitochondrialCodonAlphabet echinodermMitochondrialCodonAlphabet; | |
| 229 #endif | |
| 230 | |
| 231 }; | |
| 232 } | |
| 233 | |
| 234 #endif |
