comparison egglib/egglib-2.1.5/include/egglib-cpp/Fasta.hpp @ 9:98c37a5d67f4 draft

Uploaded
author dereeper
date Wed, 07 Feb 2018 22:08:47 -0500
parents 420b57c3c185
children
comparison
equal deleted inserted replaced
8:6bf69b40365c 9:98c37a5d67f4
1 /*
2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol
3
4 This file is part of the EggLib library.
5
6 EggLib is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 EggLib is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef EGGLIB_FASTA_HPP
21 #define EGGLIB_FASTA_HPP
22
23 #include <istream>
24 #include <iostream>
25 #include <string>
26 #include "Container.hpp"
27
28 namespace egglib {
29
30 /** \brief Fasta parser/formatter
31 *
32 * \ingroup core
33 *
34 * Reads a multifasta sequence file from a string, a stream or a file
35 * and returns a Container. See the description of the format below.
36 * Formats a fasta string from a sequence container object and places
37 * it in a string, a stream or a file. All methods are static and the
38 * class cannot be instantiated. The methods parsef and formatf will
39 * open the file for you while the others will read/write directly
40 * in a string or a stream.
41 *
42 * Specifications of the fasta format:
43 *
44 * - The number of sequences is not limited.
45 *
46 * - Each sequence is preceded by a header limited to a single
47 * line and starting with a ">" character.
48 *
49 * - The header length is not limited and all characters are
50 * allowed but white spaces and special characters are
51 * discouraged.
52 *
53 * - Group indices are specified by \@0, \@1, \@2... strings
54 * appearing at the end of the header string (just before the
55 * carriage return). Note that group labels are ignored by
56 * default.
57 *
58 * - Group indices are ignored unless specifically specified in a
59 * parser's options.
60 *
61 * - The sequence itself continues on following lines until the
62 * next ">" character or the end of the file.
63 *
64 * - White spaces, tabs and carriage returns are allowed at any
65 * position. There is no limitation in length and different
66 * sequences can have different lengths.
67 *
68 * - Although the standard is lower case characters, Fasta
69 * assumes upper case characters: lower case characters are
70 * accepted but are converted to upper case characters.
71 * Information coded by change in case is lost.
72 *
73 */
74 class Fasta {
75
76 public:
77
78 /** \brief Imports a fasta file
79 *
80 * Imports the content of the file as is. Calls the method
81 * parse(std::istream&, bool) by creating its own istream.
82 *
83 * \param fname the name of a fasta file.
84 *
85 * \param importGroupLabels if set to true, scan automatically
86 * for groups. The format is @ followed by an integer, placed
87 * at the end of the header string (sequences without labels
88 * will be treated as \@0).
89 *
90 * \return A Container object containing the sequences.
91 *
92 */
93 static Container parsef(const char* fname, bool importGroupLabels=false);
94
95
96 /** \brief Imports a fasta file into an existing Container
97 *
98 * Imports the content of the file as is. Calls the method
99 * parse(std::istream&, bool) by creating its own istream. This
100 * method expects a reference to a Container to which the
101 * sequences will be appended.
102 *
103 * \param fname the name of a fasta file.
104 *
105 * \param container a Container instance, empty or not.
106 *
107 * \param importGroupLabels if set to true, scan automatically
108 * for groups. The format is @ followed by an integer, placed
109 * at the end of the header string (sequences without labels
110 * will be treated as \@0).
111 *
112 * \return Nothing: the new sequences are appended to the
113 * Container passed as argument.
114 *
115 */
116 static void parsef(const char* fname, Container& container, bool importGroupLabels=false);
117
118
119 /** \brief Imports fasta data from a string
120 *
121 * Imports the content of the string as is. Calls the method
122 * parse(std::istream&, bool) by creating its own istream.
123 *
124 * \param str a string containing the data.
125 *
126 * \param importGroupLabels if set to true, scan automatically
127 * for groups. The format is @ followed by an integer, placed
128 * at the end of the header string (sequences without labels
129 * will be treated as \@0).
130 *
131 * \return A Container object containing the sequences.
132 *
133 */
134 static Container parse(const std::string& str, bool importGroupLabels=false);
135
136
137 /** \brief Imports fasta data from a string into an existing Container
138 *
139 * Imports the content of the string as is. Calls the method
140 * parse(std::istream&, bool) by creating its own istream. This
141 * method expects a reference to a Container to which the
142 * sequences will be appended.
143 *
144 * \param str a string containing the data.
145 *
146 * \param container a Container instance, empty or not.
147 *
148 * \param importGroupLabels if set to true, scan automatically
149 * for groups. The format is @ followed by an integer, placed
150 * at the end of the header string (sequences without labels
151 * will be treated as \@0).
152 *
153 * \return Nothing: new sequences are appended to the Container
154 * passed as argument.
155 *
156 */
157 static void parse(const std::string& str, Container& container, bool importGroupLabels=false);
158
159
160 /** \brief Imports fasta data from an open stream
161 *
162 * Imports the content of the stream as is.
163 *
164 * \param stream an open stream (file or string) containing the
165 * data.
166 *
167 * \param importGroupLabels if set to true, scan automatically
168 * for groups. The format is @ followed by an integer, placed
169 * at the end of the header string (sequences without labels
170 * will be treated as \@0).
171 *
172 * \return A Container object containing the sequences.
173 *
174 */
175 static Container parse(std::istream& stream, bool importGroupLabels=false);
176
177
178 /** \brief Imports fasta data from an open stream into an existing Container
179 *
180 * Imports the content of the stream as is. This
181 * method expects a reference to a Container to which the
182 * sequences will be appended.
183 *
184 * \param stream an open stream (file or string) containing the
185 * data.
186 *
187 * \param container a Container instance, empty or not.
188 *
189 * \param importGroupLabels if set to true, scan automatically
190 * for groups. The format is @ followed by an integer, placed
191 * at the end of the header string (sequences without labels
192 * will be treated as \@0).
193 *
194 * \return Nothing: the new sequences are appended to the
195 * Container passed as argument.
196 *
197 */
198 static void parse(std::istream& stream, Container& container, bool importGroupLabels=false);
199
200
201 /** \brief Exports sequences as fasta to a file
202 *
203 * \param fname the name of the file where to place the result.
204 *
205 * \param container Container object to export.
206 *
207 * \param exportGroupLabels if set to true, exports group
208 * indices as a \@x at the end of the sequence name, where x is
209 * the group index. Otherwise, this information is discarded.
210 *
211 * \param lineLength the number of characters to place on a
212 * single line. If zero, no newlines are inserted within
213 * sequences.
214 *
215 */
216 static void formatf(const char* fname, const Container& container, bool exportGroupLabels=false, unsigned int lineLength=50);
217
218
219 /** \brief Exports sequences as fasta to an open stream
220 *
221 * \param file an open stream.
222 *
223 * \param container Container object to export.
224 *
225 * \param exportGroupLabels if set to true, exports group
226 * indices as a \@x at the end of the sequence name, where x is
227 * the group index. Otherwise, this information is discarded.
228 *
229 * \param lineLength the number of characters to place on a
230 * single line. If zero, no newlines are inserted within
231 * sequences.
232 *
233 */
234 static void format(std::ostream& file, const Container& container, bool exportGroupLabels=false, unsigned int lineLength=50);
235
236
237 /** \brief Exports sequences as a fasta-formatted string
238 *
239 * This method internally creates an ostringstream, calls the
240 * method format(std::ostream&, const Container&, bool, unsigned int)
241 * and returns the resulting string.
242 *
243 * \param container Container object to export.
244 *
245 * \param exportGroupLabels if set to true, exports group
246 * indices as a \@x at the end of the sequence name, where x is
247 * the group index. Otherwise, this information is discarded.
248 *
249 * \param lineLength the number of characters to place on a
250 * single line. If zero, no newlines are inserted within
251 * sequences.
252 *
253 * \return The formatted string.
254 *
255 */
256 static std::string format(const Container& container, bool exportGroupLabels=false, unsigned int lineLength=50);
257
258
259
260 protected:
261
262 /// Protected default constructor: this class cannot be instantiated
263 Fasta() { }
264
265 /// Protected copy constructor: this class cannot be instantiated
266 Fasta(const Fasta& source) { }
267
268 /// Protected assignment operator: this class cannot be copied
269 Fasta& operator=(const Fasta& source) { return *this; }
270
271 /// Protected destructor: this class cannot be instantiated
272 virtual ~Fasta() { }
273
274
275 };
276 }
277
278 #endif