Mercurial > repos > dereeper > sniplay
diff egglib/egglib-2.1.5/include/egglib-cpp/Container.hpp @ 9:98c37a5d67f4 draft
Uploaded
author | dereeper |
---|---|
date | Wed, 07 Feb 2018 22:08:47 -0500 |
parents | 420b57c3c185 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/egglib/egglib-2.1.5/include/egglib-cpp/Container.hpp Wed Feb 07 22:08:47 2018 -0500 @@ -0,0 +1,326 @@ +/* + Copyright 2008-2009 Stéphane De Mita, Mathieu Siol + + This file is part of the EggLib library. + + EggLib is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + EggLib is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with EggLib. If not, see <http://www.gnu.org/licenses/>. +*/ + + +#ifndef EGGLIB_CONTAINER_HPP +#define EGGLIB_CONTAINER_HPP + + +namespace egglib { + + /** \brief Handles a set of sequence alignment (aligned or not) + * + * \ingroup core + * + * Creation from a file or string stream should be performed using + * the class Fasta. + * + * Sequences are represented by two strings (name and sequence) and + * an integer (group) that can be accessed or modified by index.The + * order of sequences is guaranteed to be conserved, as if Container + * was a list of triplets (name, sequence, group). + * + * The data matrix is implemented as continuous arrays (char**) and + * allows efficient access and modification of data. For very large + * data matrices you might claim immediately the required memory + * using the constructor Container(unsigned int, char**). + * + */ + class Container { + + public: + + /** \brief Creates an empty object + * + */ + Container(); + + + /** \brief Copy constructor + * + */ + Container(const Container& source); + + + /** \brief Assignment operator + * + */ + Container& operator= (const Container& source); + + + /** \brief Creates an object from a data matrix + * + * Allows you to create an object from data stored in a char* + * array. The array's size must be passed to the constructor. + * Since sequences can have different lengths, you need to + * terminate each sequence by a NULL character. This constructor + * is dedicated to very performance-critical tasks. For usual + * tasks, using the default constructor and subsequently adding + * sequences with addSeq should be enough. + * + * \param number_of_sequences the number of sequences (the length + * of the first dimension of the array). + * + * \param cstring_array the pointer to the data matrix. + * + */ + Container(unsigned int number_of_sequences, char const* const* const cstring_array); + + + /** \brief Destructor + * + */ + virtual ~Container(); + + + /** \brief Clears all content of the object + * + */ + virtual void clear(); + + + /** \brief Adds a sequence to the object + * + * \param name the name of the sequence, as a c-string. + * \param sequence the sequence string, as a c-string. + * \param group the group index of the sequence. + * + * \return The new number of sequences. + * + */ + virtual unsigned int append(const char* name, const char* sequence, unsigned int group=0); + + + /** \brief Removes a sequence from the object + * + * \param pos the index of the sequence to remove. + * + * \return The new number of sequences. + */ + virtual unsigned int remove(unsigned int pos); + + + /** \brief Changes the name of a given sequence + * + * \param pos the sequence index. + * \param name the new name as a C-like string. + * + */ + virtual void name(unsigned int pos, const char* name); + + + /** \brief Changes the sequence string of a given sequence + * + * \param pos the sequence index. + * \param sequence the new sequence as a C-like string. + * + */ + virtual void sequence(unsigned int pos, const char* sequence); + + + /** \brief Appends a string to the a given sequence + * + * \param pos the sequence index. + * \param sequence the sequence to append at the end of the + * current one. + * + */ + virtual void appendSequence(unsigned int pos, const char* sequence); + + + /** \brief Changes a character + * + * \param sequence the sequence index. + * \param position the character index. + * \param ch the new character value. + * + * The positions must fit in the current ranges. + * + */ + virtual void set(unsigned int sequence, unsigned position, char ch); + + + /** \brief Gets a given character + * + * \param s the sequence index. + * \param p the character index. + * + * \return the character value. + * + * The positions must fit in the current ranges. + * + */ + virtual char get(unsigned int s, unsigned int p) const; + + + /** \brief Changes the group index of a given sequence + * + * \param pos the sequence index. + * \param group the new group index value. + * + */ + virtual void group(unsigned int pos, unsigned int group); + + + /** \brief Extracts a range of sequences + * + * \param a the index of the first sequence. + * + * \param b the index immediately passed the last sequence to + * extract. + * + * \return A copy of the object containing the specified + * range of sequences. + * + * Sequences a to b-1 are extracted, provided that the + * indices fit in the current number of sequences. To extract + * all sequences, use container.hslice(0, container.ns()). + * + * Note: invalid ranges will be silently supported. If + * a>=ls or b<=a, an empty object is returned. If b>ns, + * ls will be substituted to a. + * + */ + Container hslice(unsigned int a, unsigned int b) const; + + + /** \brief Gets the number of sequences + * + */ + unsigned int ns() const; + + + /** \brief Gets the length of a given sequence + * + * \param pos the index of the sequence. + * + * \return The length of that particular sequence. + * + */ + virtual unsigned int ls(unsigned int pos) const ; + + + /** \brief Gets the name of the a given sequence + * + * \param pos the index of the sequence. + * + * \return The name of that particular sequence. + * + */ + virtual const char* name(unsigned int pos) const; + + + /** \brief Gets the name of a given sequence + * + * \param pos the index of the sequence. + * + * \return The sequence string for that particular sequence. + * + */ + virtual const char* sequence(unsigned int pos) const; + + + + /** \brief Gets the group index of a given sequence + * + * \param pos the index of the sequence. + * + * \return The group index of that particular sequence. + * + */ + virtual unsigned int group(unsigned int pos) const; + + + /** \brief Checks if all lengths are equal + * + * Returns true if the length of all sequences are equal or + * if there is less thant two sequences. + * + */ + bool isEqual() const; + + + /** \brief Equalizes sequence lengths + * + * Extends sequences as need to ensure that all sequences + * have the same length. + * + * \param ch the character to use for padding. + * + * \return The final length obtained, which is the length of + * the longest sequence before the operation. + * + */ + unsigned int equalize(char ch='?'); + + + /** \brief Finds a sequence by its name + * + * Gets the position of the first sequence with the specified + * name. + * + * \param string a sequence name. + * + * \param strict if true, seeks an exact match. If false, + * compares only until the end of the requested name (for + * example: ATCFF will match ATCFF_01 if strict is false). + * + * \return The lowest index where the name matches, -1 if no + * sequence has such name. + * + */ + int find(const char* string, bool strict=true) const; + + + protected: + // The number of sequences + unsigned int _ns; + + // The array of name lengths + unsigned int* lnames; + + // The array of names + char** names; + + // The array of sequences (as c-strings) + char** sequences; + + // The array of groups + unsigned int* groups; + + // Imports an array of c-strings + virtual void setFromSource(unsigned int number_of_sequences, const char* const* const cstring_array); + + // Constructor helper + virtual void copyObject(const Container&); + + // Constructor partial helper + virtual void getNamesAndGroups(const Container&); + + private: + + // The array of sequence lengths + unsigned int* lsequences; + + // Setup a valid empty object + virtual void init(); + }; +} + +#endif