Mercurial > repos > dereeper > sniplay
comparison egglib/egglib-2.1.5/include/egglib-cpp/Consensus.hpp @ 1:420b57c3c185 draft
Uploaded
author | dereeper |
---|---|
date | Fri, 10 Jul 2015 04:39:30 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:3e19d0dfcf3e | 1:420b57c3c185 |
---|---|
1 /* | |
2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol | |
3 | |
4 This file is part of the EggLib library. | |
5 | |
6 EggLib is free software: you can redistribute it and/or modify | |
7 it under the terms of the GNU General Public License as published by | |
8 the Free Software Foundation, either version 3 of the License, or | |
9 (at your option) any later version. | |
10 | |
11 EggLib is distributed in the hope that it will be useful, | |
12 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 GNU General Public License for more details. | |
15 | |
16 You should have received a copy of the GNU General Public License | |
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>. | |
18 */ | |
19 | |
20 #ifndef EGGLIB_CONSENSUS_HPP | |
21 #define EGGLIB_CONSENSUS_HPP | |
22 | |
23 #include "Align.hpp" | |
24 #include <sstream> | |
25 #include <string> | |
26 #include <vector> | |
27 | |
28 namespace egglib { | |
29 | |
30 /** \brief Generates consensus sequences | |
31 * | |
32 * \ingroup polymorphism | |
33 * | |
34 * | |
35 * A consensus is generated when two sequences have the same name, | |
36 * ignoring everything after the first separator character (by | |
37 * default, "_"). Hence, the names "foo", "foo_goo" and "foo_third" | |
38 * will be treated as identical and the root will be "foo". The root | |
39 * will be used to name the resulting sequence. Note that the | |
40 * class works only for DNA sequences. | |
41 * | |
42 * Symbol convention: | |
43 * - A: adenosine | |
44 * - C: cytosine | |
45 * - G: guanine | |
46 * - T: thymine | |
47 * - M: A or C | |
48 * - R: A or G | |
49 * - W: A or T (weak) | |
50 * - S: C or G (strong) | |
51 * - Y: C or T | |
52 * - K: G or T | |
53 * - B: C or G or T(not A) | |
54 * - D: A or G or T (not C) | |
55 * - H: A or C or T (not G) | |
56 * - V: A or C or G (not T) | |
57 * - N: A or C or G or T | |
58 * - ?: nonsequenced position | |
59 * | |
60 * Other symbols will be treated as ? (lowercase are supported). | |
61 * | |
62 * Rigorous (alias liberal or strong) mode: | |
63 * - If two characters are the same, it is retained whatever it is | |
64 * (A + A = A) | |
65 * - Otherwise: | |
66 * - If one is the missing character (?) the other is retained | |
67 * whatever it is (A + ? = A). | |
68 * - If characters are consistent, that is one contains | |
69 * more information, that one is retained (A + M = A). | |
70 * - If characters are not consistent, the closest | |
71 * generic symbol is retained (A + C = M). | |
72 * . | |
73 * Note that the feedback of inconsistent characters in the | |
74 * outcome is not garanteed. | |
75 * In fact, (A + A + G) will result in R (as expected) but (A + | |
76 * G + A) will result in A, masking the problem. | |
77 * However, the position will indeed be counted as inconsistent. | |
78 * | |
79 * Not rigorous (conservative/weak) mode: | |
80 * - If two characters are the same, it is retained whatever it | |
81 * is (A + A = A). | |
82 * - Otherwise: | |
83 * - If one is ? the other is retained whatever it is (A + ? | |
84 * = A). | |
85 * - Otherwise an inconsistent character (by default, Z) is | |
86 * retained (A + C = Z). | |
87 * | |
88 * Iterative process of consensus: | |
89 * - Each sequence is taken in turn. | |
90 * - Each pair involving the focus sequence is processed and a | |
91 * consensus is generated. | |
92 * - When all pair have been processsed, the consensus already | |
93 * generated are themselves iteratively processed until only one | |
94 * remains. | |
95 * - Note that at each time the last two are taken first. | |
96 * | |
97 * A transparent interface gives access to the data for all steps of | |
98 * the consensus process, as vectors that covers all pairs (including | |
99 * intermediate steps of the iterative procedure described above) as | |
100 * well as singleton sequences. For the latter, the second name is | |
101 * not filled and all counts are set to 0. Note also that the name of | |
102 * such singleton sequence is shortened to the separator as well. | |
103 * | |
104 */ | |
105 class Consensus { | |
106 public: | |
107 /** \brief Constructor | |
108 * | |
109 */ | |
110 Consensus(); | |
111 | |
112 /** \brief Destructor | |
113 * | |
114 */ | |
115 virtual ~Consensus() {} | |
116 | |
117 /// Sets the character interpreted as missing (default: ?) | |
118 void setMissing(char); | |
119 | |
120 /// Sets the character used to point to disagreements (default: Z) | |
121 void setDisagreement(char); | |
122 | |
123 /// Checks all the characters | |
124 bool check_sequences(Align& align); | |
125 | |
126 /** \brief Reduces the sequence alignment by making consensus sequences | |
127 * | |
128 * \param align the original alignment. | |
129 * | |
130 * \param separator the character used to separated the root | |
131 * name of sequences to the variable part, as in (for the | |
132 * default value: "sequence_read1". | |
133 * | |
134 * \param rigorous consensus mode. | |
135 * | |
136 * \return An Align instance with duplicated sequences consensed. | |
137 * | |
138 */ | |
139 Align consensus(Align& align, char separator='_', bool rigorous=true); | |
140 | |
141 /// First name of consensed pairs | |
142 const std::vector<std::string>& firstSequenceNames(); | |
143 | |
144 /// Second names of consensed pairs | |
145 const std::vector<std::string>& secondSequenceNames(); | |
146 | |
147 /// Root names of consensed pairs | |
148 const std::vector<std::string>& roots(); | |
149 | |
150 /// Number of consistent positions for all consensed pairs | |
151 const std::vector<int>& consistentPositions(); | |
152 | |
153 /// Number of complementary positions for all consensed pairs | |
154 const std::vector<int>& complementaryPositions(); | |
155 | |
156 /// Number of uninformative positions for all consensed pairs | |
157 const std::vector<int>& uninformativePositions(); | |
158 | |
159 /// Number of ambiguous positions for all consensed pairs | |
160 const std::vector<int>& ambiguousPositions(); | |
161 | |
162 /// Number of at least partially resolved ambiguities for all consensed pairs | |
163 const std::vector<int>& atLeastPartiallyResolvedAmbiguities(); | |
164 | |
165 /// Vector of inconsistent positions ofr all consensed pairs | |
166 const std::vector<std::vector<int> >& inconsistentPositions(); | |
167 | |
168 | |
169 private: | |
170 | |
171 /** \brief Copying this class is not allowed | |
172 * | |
173 */ | |
174 Consensus(const Consensus& source) { } | |
175 | |
176 /** \brief Copying this class is not allowed | |
177 * | |
178 */ | |
179 Consensus& operator=(const Consensus& source) { return *this; } | |
180 | |
181 // A private helper | |
182 class PairwiseConsensus; | |
183 | |
184 // report data | |
185 std::vector<std::string> t_firstSequenceNames; | |
186 std::vector<std::string> t_secondSequenceNames; | |
187 std::vector<std::string> t_roots; | |
188 std::vector<int> t_consistentPositions; | |
189 std::vector<int> t_complementaryPositions; | |
190 std::vector<int> t_uninformativePositions; | |
191 std::vector<int> t_ambiguousPositions; | |
192 std::vector<int> t_atLeastPartiallyResolvedAmbiguities; | |
193 std::vector<std::vector<int> > t_inconsistentPositions; | |
194 | |
195 // Code for missing data (usually ?) | |
196 char MISSING; | |
197 | |
198 // Code for disgrement | |
199 char DISAGREEMENT; | |
200 | |
201 // Helper class managing a single pair | |
202 class PairwiseConsensus { | |
203 public: | |
204 // Default object creation | |
205 PairwiseConsensus(); | |
206 | |
207 // Object destruction | |
208 virtual ~PairwiseConsensus() {} | |
209 | |
210 // Usual object creation | |
211 PairwiseConsensus(std::string, std::string); | |
212 | |
213 // Fills an object created with the default constructor | |
214 void load(std::string,std::string); | |
215 | |
216 // Changes the MISSING character | |
217 void setUndeterminedCharacter(char); | |
218 | |
219 // Changes the DISAGREEMENT character | |
220 void setDisagreementCharacter(char); | |
221 | |
222 /* Uses the conservative mode of consensus | |
223 * | |
224 * Tries to avoid to make decisions, and adds the | |
225 * character set by DISAGREEMENT upon inconsistencies | |
226 * | |
227 * return The number of inconsistencies. | |
228 */ | |
229 int generateSoftConsensus(); | |
230 | |
231 /* Strict mode of consensus | |
232 * | |
233 * The number of inconsistencies. | |
234 */ | |
235 int generateHardConsensus(); | |
236 | |
237 // Two fully resolved (including gap) and identical characters | |
238 int getConsistentPositions(); | |
239 | |
240 // One informative (including gap) and one missing | |
241 int getComplementaryPositions(); | |
242 | |
243 // None missing, but different and incompatible | |
244 int getInconsistentPositions(); | |
245 | |
246 // Both are missing | |
247 int getUninformativePositions(); | |
248 | |
249 // Both identical or one missing, but not fully resolved | |
250 int getAmbiguousPositions(); | |
251 | |
252 // Different, not missing, complementary. | |
253 int getAtLeastPartiallyResolvedAmbiguities(); | |
254 | |
255 // Accessor | |
256 int getThisInconsistentPosition(unsigned int); | |
257 | |
258 // Generates the consensus sequence | |
259 std::string getConsensus(); | |
260 | |
261 private: | |
262 | |
263 inline bool isValid(char c) { | |
264 switch (c) { | |
265 case 'A': case 'a': | |
266 case 'C': case 'c': | |
267 case 'G': case 'g': | |
268 case 'T': case 't': | |
269 return true; | |
270 default: | |
271 return false; | |
272 } | |
273 } | |
274 | |
275 // This initiates a series of embedded objects | |
276 void setCharacterContainers(); | |
277 | |
278 // The first sequence | |
279 std::string seqA; | |
280 | |
281 // The second sequence | |
282 std::string seqB; | |
283 | |
284 // The resulting consensus | |
285 std::string cons; | |
286 | |
287 // The vecotr storing the inconsistent positions | |
288 std::vector<int> posIncons; | |
289 | |
290 // The length of the sequences | |
291 unsigned int ls; | |
292 | |
293 // Counter | |
294 int cntConsistentPositions; | |
295 | |
296 // Counter | |
297 int cntComplementaryPositions; | |
298 | |
299 // Counter | |
300 int cntAmbiguousPositions; | |
301 | |
302 // Counter | |
303 int cntInconsistentPositions; | |
304 | |
305 // Counter | |
306 int cntUninformativePositions; | |
307 | |
308 // Counter | |
309 int cntAtLeastPartiallyResolvedAmbiguities; | |
310 | |
311 // Code for missing data (usually ?) | |
312 char MISSING; | |
313 | |
314 // Code for disgrement | |
315 char DISAGREEMENT; | |
316 | |
317 public: | |
318 // This class manages relationships different symbols | |
319 class CharacterContainer { | |
320 public: | |
321 // Default value: @ | |
322 CharacterContainer(); | |
323 | |
324 // Initiates to a given symbol | |
325 CharacterContainer(const char&); | |
326 | |
327 // Assignment operator | |
328 CharacterContainer& operator=(const char&); | |
329 | |
330 // Sets the symbol | |
331 void setValue(char); | |
332 | |
333 // Set the descendants | |
334 void setSons(std::vector<CharacterContainer>); | |
335 | |
336 // Tests whether the symbol is the same | |
337 bool is(CharacterContainer); | |
338 | |
339 // Tests if the query is contained amongst the sons | |
340 bool has(CharacterContainer); | |
341 | |
342 // Tests if the query is contained amongst the sons | |
343 bool has(char); | |
344 | |
345 /* Tests whether the left character has the left one | |
346 * Should be called on the N object only. | |
347 */ | |
348 char lhas(CharacterContainer,CharacterContainer); | |
349 | |
350 /* Creates the object with the proper sons | |
351 * Should be called on the N object only. | |
352 */ | |
353 CharacterContainer init(char); | |
354 | |
355 // The symbol | |
356 char value; | |
357 | |
358 // The descendants | |
359 std::vector<CharacterContainer> sons; | |
360 }; | |
361 | |
362 private: | |
363 // Symbol ? | |
364 CharacterContainer ccQ; | |
365 | |
366 // Symbol A | |
367 CharacterContainer ccA; | |
368 | |
369 // Symbol C | |
370 CharacterContainer ccC; | |
371 | |
372 // Symbol G | |
373 CharacterContainer ccG; | |
374 | |
375 // Symbol T | |
376 CharacterContainer ccT; | |
377 | |
378 // Symbol U | |
379 CharacterContainer ccU; | |
380 | |
381 // Symbol M | |
382 CharacterContainer ccM; | |
383 | |
384 // Symbol R | |
385 CharacterContainer ccR; | |
386 | |
387 // Symbol W | |
388 CharacterContainer ccW; | |
389 | |
390 // Symbol S | |
391 CharacterContainer ccS; | |
392 | |
393 // Symbol Y | |
394 CharacterContainer ccY; | |
395 | |
396 // Symbol K | |
397 CharacterContainer ccK; | |
398 | |
399 // Symbol B | |
400 CharacterContainer ccB; | |
401 | |
402 // Symbol D | |
403 CharacterContainer ccD; | |
404 | |
405 // Symbol H | |
406 CharacterContainer ccH; | |
407 | |
408 // Symbol V | |
409 CharacterContainer ccV; | |
410 | |
411 // Symbol N | |
412 CharacterContainer ccN; | |
413 | |
414 // Symbol - | |
415 CharacterContainer ccGAP; | |
416 }; | |
417 }; | |
418 } | |
419 | |
420 #endif | |
421 |