/*
    Copyright 2008-2009 Stéphane De Mita, Mathieu Siol

    This file is part of the EggLib library.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
*/


#ifndef EGGLIB_CONTAINER_HPP
#define EGGLIB_CONTAINER_HPP


namespace egglib {

    /** \brief Handles a set of sequences (aligned or not)
     *
     * \ingroup core
     *
     * Creation from a file or string stream should be performed using
     * the class Fasta.
     *
     * Sequences are represented by two strings (name and sequence) and
     * an integer (group) that can be accessed or modified by index. The
     * order of sequences is guaranteed to be conserved, as if Container
     * was a list of triplets (name, sequence, group).
     *
     * The data matrix is implemented as continuous arrays (char**) and
     * allows efficient access and modification of data. For very large
     * data matrices you might claim immediately the required memory
     * using the constructor
     * Container(unsigned int, char const* const* const).
     *
     */
    class Container {

        public:

            /** \brief Creates an empty object
             *
             */
            Container();


            /** \brief Copy constructor
             *
             */
            Container(const Container& source);


            /** \brief Assignment operator
             *
             */
            Container& operator= (const Container& source);


            /** \brief Creates an object from a data matrix
             *
             * Allows you to create an object from data stored in an
             * array of C strings. The array's size must be passed to
             * the constructor. Since sequences can have different
             * lengths, you need to terminate each sequence by a null
             * character. This constructor is dedicated to very
             * performance-critical tasks. For usual tasks, using the
             * default constructor and subsequently adding sequences
             * with append() should be enough.
             *
             * \param number_of_sequences the number of sequences (the
             * length of the first dimension of the array).
             *
             * \param cstring_array the pointer to the data matrix.
             *
             */
            Container(unsigned int number_of_sequences, char const* const* const cstring_array);


            /** \brief Destructor
             *
             */
            virtual ~Container();


            /** \brief Clears all content of the object
             *
             */
            virtual void clear();


            /** \brief Adds a sequence to the object
             *
             * \param name the name of the sequence, as a C string.
             * \param sequence the sequence string, as a C string.
             * \param group the group index of the sequence.
             *
             * \return The new number of sequences.
             *
             */
            virtual unsigned int append(const char* name, const char* sequence, unsigned int group=0);


            /** \brief Removes a sequence from the object
             *
             * \param pos the index of the sequence to remove.
             *
             * \return The new number of sequences.
             *
             */
            virtual unsigned int remove(unsigned int pos);


            /** \brief Changes the name of a given sequence
             *
             * \param pos the sequence index.
             * \param name the new name, as a C string.
             *
             */
            virtual void name(unsigned int pos, const char* name);


            /** \brief Changes the sequence string of a given sequence
             *
             * \param pos the sequence index.
             * \param sequence the new sequence, as a C string.
             *
             */
            virtual void sequence(unsigned int pos, const char* sequence);


            /** \brief Appends a string to a given sequence
             *
             * \param pos the sequence index.
             * \param sequence the sequence to append at the end of the
             * current one.
             *
             */
            virtual void appendSequence(unsigned int pos, const char* sequence);


            /** \brief Changes a character
             *
             * \param sequence the sequence index.
             * \param position the character index.
             * \param ch the new character value.
             *
             * The positions must fit in the current ranges.
             *
             */
            virtual void set(unsigned int sequence, unsigned int position, char ch);


            /** \brief Gets a given character
             *
             * \param s the sequence index.
             * \param p the character index.
             *
             * \return The character value.
             *
             * The positions must fit in the current ranges.
             *
             */
            virtual char get(unsigned int s, unsigned int p) const;


            /** \brief Changes the group index of a given sequence
             *
             * \param pos the sequence index.
             * \param group the new group index value.
             *
             */
            virtual void group(unsigned int pos, unsigned int group);


            /** \brief Extracts a range of sequences
             *
             * \param a the index of the first sequence.
             *
             * \param b the index immediately past the last sequence to
             * extract.
             *
             * \return A copy of the object containing the specified
             * range of sequences.
             *
             * Sequences a to b-1 are extracted, provided that the
             * indices fit in the current number of sequences. To
             * extract all sequences, use
             * container.hslice(0, container.ns()).
             *
             * Note: invalid ranges will be silently supported. If
             * a>=ns or b<=a, an empty object is returned. If b>ns,
             * ns will be substituted for b.
             *
             * NOTE(review): an earlier revision of this note mixed up
             * ls/ns and a/b; the bounds above follow the parameter
             * descriptions -- confirm against the implementation.
             *
             */
            Container hslice(unsigned int a, unsigned int b) const;


            /** \brief Gets the number of sequences
             *
             */
            unsigned int ns() const;


            /** \brief Gets the length of a given sequence
             *
             * \param pos the index of the sequence.
             *
             * \return The length of that particular sequence.
             *
             */
            virtual unsigned int ls(unsigned int pos) const;


            /** \brief Gets the name of a given sequence
             *
             * \param pos the index of the sequence.
             *
             * \return The name of that particular sequence.
             *
             */
            virtual const char* name(unsigned int pos) const;


            /** \brief Gets the sequence string of a given sequence
             *
             * \param pos the index of the sequence.
             *
             * \return The sequence string for that particular sequence.
             *
             */
            virtual const char* sequence(unsigned int pos) const;


            /** \brief Gets the group index of a given sequence
             *
             * \param pos the index of the sequence.
             *
             * \return The group index of that particular sequence.
             *
             */
            virtual unsigned int group(unsigned int pos) const;


            /** \brief Checks if all lengths are equal
             *
             * Returns true if the lengths of all sequences are equal
             * or if there are fewer than two sequences.
             *
             */
            bool isEqual() const;


            /** \brief Equalizes sequence lengths
             *
             * Extends sequences as needed to ensure that all sequences
             * have the same length.
             *
             * \param ch the character to use for padding.
             *
             * \return The final length obtained, which is the length of
             * the longest sequence before the operation.
             *
             */
            unsigned int equalize(char ch='?');


            /** \brief Finds a sequence by its name
             *
             * Gets the position of the first sequence with the
             * specified name.
             *
             * \param string a sequence name.
             *
             * \param strict if true, seeks an exact match. If false,
             * compares only until the end of the requested name (for
             * example: ATCFF will match ATCFF_01 if strict is false).
             *
             * \return The lowest index where the name matches, -1 if no
             * sequence has such name.
             *
             */
            int find(const char* string, bool strict=true) const;


        protected:

            // The number of sequences
            unsigned int _ns;

            // The array of name lengths
            unsigned int* lnames;

            // The array of names
            char** names;

            // The array of sequences (as C strings)
            char** sequences;

            // The array of groups
            unsigned int* groups;

            // Imports an array of C strings
            virtual void setFromSource(unsigned int number_of_sequences, const char* const* const cstring_array);

            // Constructor helper
            virtual void copyObject(const Container&);

            // Constructor partial helper
            virtual void getNamesAndGroups(const Container&);

        private:

            // The array of sequence lengths
            unsigned int* lsequences;

            // Sets up a valid empty object
            virtual void init();
    };
}

#endif