Mercurial > repos > dereeper > sniplay
comparison egglib/egglib-2.1.5/include/egglib-cpp/Consensus.hpp @ 1:420b57c3c185 draft
Uploaded
| author | dereeper |
|---|---|
| date | Fri, 10 Jul 2015 04:39:30 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:3e19d0dfcf3e | 1:420b57c3c185 |
|---|---|
| 1 /* | |
| 2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol | |
| 3 | |
| 4 This file is part of the EggLib library. | |
| 5 | |
| 6 EggLib is free software: you can redistribute it and/or modify | |
| 7 it under the terms of the GNU General Public License as published by | |
| 8 the Free Software Foundation, either version 3 of the License, or | |
| 9 (at your option) any later version. | |
| 10 | |
| 11 EggLib is distributed in the hope that it will be useful, | |
| 12 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 GNU General Public License for more details. | |
| 15 | |
| 16 You should have received a copy of the GNU General Public License | |
| 17 along with EggLib. If not, see <http://www.gnu.org/licenses/>. | |
| 18 */ | |
| 19 | |
| 20 #ifndef EGGLIB_CONSENSUS_HPP | |
| 21 #define EGGLIB_CONSENSUS_HPP | |
| 22 | |
| 23 #include "Align.hpp" | |
| 24 #include <sstream> | |
| 25 #include <string> | |
| 26 #include <vector> | |
| 27 | |
| 28 namespace egglib { | |
| 29 | |
| 30 /** \brief Generates consensus sequences | |
| 31 * | |
| 32 * \ingroup polymorphism | |
| 33 * | |
| 34 * | |
| 35 * A consensus is generated when two sequences have the same name, | |
| 36 * ignoring everything after the first separator character (by | |
| 37 * default, "_"). Hence, the names "foo", "foo_goo" and "foo_third" | |
| 38 * will be treated as identical and the root will be "foo". The root | |
| 39 * will be used to name the resulting sequence. Note that the | |
| 40 * class works only for DNA sequences. | |
| 41 * | |
| 42 * Symbol convention: | |
| 43 * - A: adenosine | |
| 44 * - C: cytosine | |
| 45 * - G: guanine | |
| 46 * - T: thymine | |
| 47 * - M: A or C | |
| 48 * - R: A or G | |
| 49 * - W: A or T (weak) | |
| 50 * - S: C or G (strong) | |
| 51 * - Y: C or T | |
| 52 * - K: G or T | |
| 53 * - B: C or G or T(not A) | |
| 54 * - D: A or G or T (not C) | |
| 55 * - H: A or C or T (not G) | |
| 56 * - V: A or C or G (not T) | |
| 57 * - N: A or C or G or T | |
| 58 * - ?: nonsequenced position | |
| 59 * | |
| 60 * Other symbols will be treated as ? (lowercase are supported). | |
| 61 * | |
| 62 * Rigorous (alias liberal or strong) mode: | |
| 63 * - If two characters are the same, it is retained whatever it is | |
| 64 * (A + A = A) | |
| 65 * - Otherwise: | |
| 66 * - If one is the missing character (?) the other is retained | |
| 67 * whatever it is (A + ? = A). | |
| 68 * - If characters are consistent, that is one contains | |
| 69 * more information, that one is retained (A + M = A). | |
| 70 * - If characters are not consistent, the closest | |
| 71 * generic symbol is retained (A + C = M). | |
| 72 * . | |
| 73 * Note that the feedback of inconsistent characters in the | |
| 74 * outcome is not garanteed. | |
| 75 * In fact, (A + A + G) will result in R (as expected) but (A + | |
| 76 * G + A) will result in A, masking the problem. | |
| 77 * However, the position will indeed be counted as inconsistent. | |
| 78 * | |
| 79 * Not rigorous (conservative/weak) mode: | |
| 80 * - If two characters are the same, it is retained whatever it | |
| 81 * is (A + A = A). | |
| 82 * - Otherwise: | |
| 83 * - If one is ? the other is retained whatever it is (A + ? | |
| 84 * = A). | |
| 85 * - Otherwise an inconsistent character (by default, Z) is | |
| 86 * retained (A + C = Z). | |
| 87 * | |
| 88 * Iterative process of consensus: | |
| 89 * - Each sequence is taken in turn. | |
| 90 * - Each pair involving the focus sequence is processed and a | |
| 91 * consensus is generated. | |
| 92 * - When all pair have been processsed, the consensus already | |
| 93 * generated are themselves iteratively processed until only one | |
| 94 * remains. | |
| 95 * - Note that at each time the last two are taken first. | |
| 96 * | |
| 97 * A transparent interface gives access to the data for all steps of | |
| 98 * the consensus process, as vectors that covers all pairs (including | |
| 99 * intermediate steps of the iterative procedure described above) as | |
| 100 * well as singleton sequences. For the latter, the second name is | |
| 101 * not filled and all counts are set to 0. Note also that the name of | |
| 102 * such singleton sequence is shortened to the separator as well. | |
| 103 * | |
| 104 */ | |
| 105 class Consensus { | |
| 106 public: | |
| 107 /** \brief Constructor | |
| 108 * | |
| 109 */ | |
| 110 Consensus(); | |
| 111 | |
| 112 /** \brief Destructor | |
| 113 * | |
| 114 */ | |
| 115 virtual ~Consensus() {} | |
| 116 | |
| 117 /// Sets the character interpreted as missing (default: ?) | |
| 118 void setMissing(char); | |
| 119 | |
| 120 /// Sets the character used to point to disagreements (default: Z) | |
| 121 void setDisagreement(char); | |
| 122 | |
| 123 /// Checks all the characters | |
| 124 bool check_sequences(Align& align); | |
| 125 | |
| 126 /** \brief Reduces the sequence alignment by making consensus sequences | |
| 127 * | |
| 128 * \param align the original alignment. | |
| 129 * | |
| 130 * \param separator the character used to separated the root | |
| 131 * name of sequences to the variable part, as in (for the | |
| 132 * default value: "sequence_read1". | |
| 133 * | |
| 134 * \param rigorous consensus mode. | |
| 135 * | |
| 136 * \return An Align instance with duplicated sequences consensed. | |
| 137 * | |
| 138 */ | |
| 139 Align consensus(Align& align, char separator='_', bool rigorous=true); | |
| 140 | |
| 141 /// First name of consensed pairs | |
| 142 const std::vector<std::string>& firstSequenceNames(); | |
| 143 | |
| 144 /// Second names of consensed pairs | |
| 145 const std::vector<std::string>& secondSequenceNames(); | |
| 146 | |
| 147 /// Root names of consensed pairs | |
| 148 const std::vector<std::string>& roots(); | |
| 149 | |
| 150 /// Number of consistent positions for all consensed pairs | |
| 151 const std::vector<int>& consistentPositions(); | |
| 152 | |
| 153 /// Number of complementary positions for all consensed pairs | |
| 154 const std::vector<int>& complementaryPositions(); | |
| 155 | |
| 156 /// Number of uninformative positions for all consensed pairs | |
| 157 const std::vector<int>& uninformativePositions(); | |
| 158 | |
| 159 /// Number of ambiguous positions for all consensed pairs | |
| 160 const std::vector<int>& ambiguousPositions(); | |
| 161 | |
| 162 /// Number of at least partially resolved ambiguities for all consensed pairs | |
| 163 const std::vector<int>& atLeastPartiallyResolvedAmbiguities(); | |
| 164 | |
| 165 /// Vector of inconsistent positions ofr all consensed pairs | |
| 166 const std::vector<std::vector<int> >& inconsistentPositions(); | |
| 167 | |
| 168 | |
| 169 private: | |
| 170 | |
| 171 /** \brief Copying this class is not allowed | |
| 172 * | |
| 173 */ | |
| 174 Consensus(const Consensus& source) { } | |
| 175 | |
| 176 /** \brief Copying this class is not allowed | |
| 177 * | |
| 178 */ | |
| 179 Consensus& operator=(const Consensus& source) { return *this; } | |
| 180 | |
| 181 // A private helper | |
| 182 class PairwiseConsensus; | |
| 183 | |
| 184 // report data | |
| 185 std::vector<std::string> t_firstSequenceNames; | |
| 186 std::vector<std::string> t_secondSequenceNames; | |
| 187 std::vector<std::string> t_roots; | |
| 188 std::vector<int> t_consistentPositions; | |
| 189 std::vector<int> t_complementaryPositions; | |
| 190 std::vector<int> t_uninformativePositions; | |
| 191 std::vector<int> t_ambiguousPositions; | |
| 192 std::vector<int> t_atLeastPartiallyResolvedAmbiguities; | |
| 193 std::vector<std::vector<int> > t_inconsistentPositions; | |
| 194 | |
| 195 // Code for missing data (usually ?) | |
| 196 char MISSING; | |
| 197 | |
| 198 // Code for disgrement | |
| 199 char DISAGREEMENT; | |
| 200 | |
| 201 // Helper class managing a single pair | |
| 202 class PairwiseConsensus { | |
| 203 public: | |
| 204 // Default object creation | |
| 205 PairwiseConsensus(); | |
| 206 | |
| 207 // Object destruction | |
| 208 virtual ~PairwiseConsensus() {} | |
| 209 | |
| 210 // Usual object creation | |
| 211 PairwiseConsensus(std::string, std::string); | |
| 212 | |
| 213 // Fills an object created with the default constructor | |
| 214 void load(std::string,std::string); | |
| 215 | |
| 216 // Changes the MISSING character | |
| 217 void setUndeterminedCharacter(char); | |
| 218 | |
| 219 // Changes the DISAGREEMENT character | |
| 220 void setDisagreementCharacter(char); | |
| 221 | |
| 222 /* Uses the conservative mode of consensus | |
| 223 * | |
| 224 * Tries to avoid to make decisions, and adds the | |
| 225 * character set by DISAGREEMENT upon inconsistencies | |
| 226 * | |
| 227 * return The number of inconsistencies. | |
| 228 */ | |
| 229 int generateSoftConsensus(); | |
| 230 | |
| 231 /* Strict mode of consensus | |
| 232 * | |
| 233 * The number of inconsistencies. | |
| 234 */ | |
| 235 int generateHardConsensus(); | |
| 236 | |
| 237 // Two fully resolved (including gap) and identical characters | |
| 238 int getConsistentPositions(); | |
| 239 | |
| 240 // One informative (including gap) and one missing | |
| 241 int getComplementaryPositions(); | |
| 242 | |
| 243 // None missing, but different and incompatible | |
| 244 int getInconsistentPositions(); | |
| 245 | |
| 246 // Both are missing | |
| 247 int getUninformativePositions(); | |
| 248 | |
| 249 // Both identical or one missing, but not fully resolved | |
| 250 int getAmbiguousPositions(); | |
| 251 | |
| 252 // Different, not missing, complementary. | |
| 253 int getAtLeastPartiallyResolvedAmbiguities(); | |
| 254 | |
| 255 // Accessor | |
| 256 int getThisInconsistentPosition(unsigned int); | |
| 257 | |
| 258 // Generates the consensus sequence | |
| 259 std::string getConsensus(); | |
| 260 | |
| 261 private: | |
| 262 | |
| 263 inline bool isValid(char c) { | |
| 264 switch (c) { | |
| 265 case 'A': case 'a': | |
| 266 case 'C': case 'c': | |
| 267 case 'G': case 'g': | |
| 268 case 'T': case 't': | |
| 269 return true; | |
| 270 default: | |
| 271 return false; | |
| 272 } | |
| 273 } | |
| 274 | |
| 275 // This initiates a series of embedded objects | |
| 276 void setCharacterContainers(); | |
| 277 | |
| 278 // The first sequence | |
| 279 std::string seqA; | |
| 280 | |
| 281 // The second sequence | |
| 282 std::string seqB; | |
| 283 | |
| 284 // The resulting consensus | |
| 285 std::string cons; | |
| 286 | |
| 287 // The vecotr storing the inconsistent positions | |
| 288 std::vector<int> posIncons; | |
| 289 | |
| 290 // The length of the sequences | |
| 291 unsigned int ls; | |
| 292 | |
| 293 // Counter | |
| 294 int cntConsistentPositions; | |
| 295 | |
| 296 // Counter | |
| 297 int cntComplementaryPositions; | |
| 298 | |
| 299 // Counter | |
| 300 int cntAmbiguousPositions; | |
| 301 | |
| 302 // Counter | |
| 303 int cntInconsistentPositions; | |
| 304 | |
| 305 // Counter | |
| 306 int cntUninformativePositions; | |
| 307 | |
| 308 // Counter | |
| 309 int cntAtLeastPartiallyResolvedAmbiguities; | |
| 310 | |
| 311 // Code for missing data (usually ?) | |
| 312 char MISSING; | |
| 313 | |
| 314 // Code for disgrement | |
| 315 char DISAGREEMENT; | |
| 316 | |
| 317 public: | |
| 318 // This class manages relationships different symbols | |
| 319 class CharacterContainer { | |
| 320 public: | |
| 321 // Default value: @ | |
| 322 CharacterContainer(); | |
| 323 | |
| 324 // Initiates to a given symbol | |
| 325 CharacterContainer(const char&); | |
| 326 | |
| 327 // Assignment operator | |
| 328 CharacterContainer& operator=(const char&); | |
| 329 | |
| 330 // Sets the symbol | |
| 331 void setValue(char); | |
| 332 | |
| 333 // Set the descendants | |
| 334 void setSons(std::vector<CharacterContainer>); | |
| 335 | |
| 336 // Tests whether the symbol is the same | |
| 337 bool is(CharacterContainer); | |
| 338 | |
| 339 // Tests if the query is contained amongst the sons | |
| 340 bool has(CharacterContainer); | |
| 341 | |
| 342 // Tests if the query is contained amongst the sons | |
| 343 bool has(char); | |
| 344 | |
| 345 /* Tests whether the left character has the left one | |
| 346 * Should be called on the N object only. | |
| 347 */ | |
| 348 char lhas(CharacterContainer,CharacterContainer); | |
| 349 | |
| 350 /* Creates the object with the proper sons | |
| 351 * Should be called on the N object only. | |
| 352 */ | |
| 353 CharacterContainer init(char); | |
| 354 | |
| 355 // The symbol | |
| 356 char value; | |
| 357 | |
| 358 // The descendants | |
| 359 std::vector<CharacterContainer> sons; | |
| 360 }; | |
| 361 | |
| 362 private: | |
| 363 // Symbol ? | |
| 364 CharacterContainer ccQ; | |
| 365 | |
| 366 // Symbol A | |
| 367 CharacterContainer ccA; | |
| 368 | |
| 369 // Symbol C | |
| 370 CharacterContainer ccC; | |
| 371 | |
| 372 // Symbol G | |
| 373 CharacterContainer ccG; | |
| 374 | |
| 375 // Symbol T | |
| 376 CharacterContainer ccT; | |
| 377 | |
| 378 // Symbol U | |
| 379 CharacterContainer ccU; | |
| 380 | |
| 381 // Symbol M | |
| 382 CharacterContainer ccM; | |
| 383 | |
| 384 // Symbol R | |
| 385 CharacterContainer ccR; | |
| 386 | |
| 387 // Symbol W | |
| 388 CharacterContainer ccW; | |
| 389 | |
| 390 // Symbol S | |
| 391 CharacterContainer ccS; | |
| 392 | |
| 393 // Symbol Y | |
| 394 CharacterContainer ccY; | |
| 395 | |
| 396 // Symbol K | |
| 397 CharacterContainer ccK; | |
| 398 | |
| 399 // Symbol B | |
| 400 CharacterContainer ccB; | |
| 401 | |
| 402 // Symbol D | |
| 403 CharacterContainer ccD; | |
| 404 | |
| 405 // Symbol H | |
| 406 CharacterContainer ccH; | |
| 407 | |
| 408 // Symbol V | |
| 409 CharacterContainer ccV; | |
| 410 | |
| 411 // Symbol N | |
| 412 CharacterContainer ccN; | |
| 413 | |
| 414 // Symbol - | |
| 415 CharacterContainer ccGAP; | |
| 416 }; | |
| 417 }; | |
| 418 } | |
| 419 | |
| 420 #endif | |
| 421 |
