| 
1
 | 
     1 /*
 | 
| 
 | 
     2     Copyright 2008-2009 Stéphane De Mita, Mathieu Siol
 | 
| 
 | 
     3 
 | 
| 
 | 
     4     This file is part of the EggLib library.
 | 
| 
 | 
     5 
 | 
| 
 | 
     6     EggLib is free software: you can redistribute it and/or modify
 | 
| 
 | 
     7     it under the terms of the GNU General Public License as published by
 | 
| 
 | 
     8     the Free Software Foundation, either version 3 of the License, or
 | 
| 
 | 
     9     (at your option) any later version.
 | 
| 
 | 
    10 
 | 
| 
 | 
    11     EggLib is distributed in the hope that it will be useful,
 | 
| 
 | 
    12     but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
| 
 | 
    13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
| 
 | 
    14     GNU General Public License for more details.
 | 
| 
 | 
    15 
 | 
| 
 | 
    16     You should have received a copy of the GNU General Public License
 | 
| 
 | 
    17     along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
 | 
| 
 | 
    18 */
 | 
| 
 | 
    19 
 | 
| 
 | 
    20 
 | 
| 
 | 
    21 #ifndef EGGLIB_ALIGN_HPP
 | 
| 
 | 
    22 #define EGGLIB_ALIGN_HPP
 | 
| 
 | 
    23 
 | 
| 
 | 
    24 #include "Container.hpp"
 | 
| 
 | 
    25 #include "CharMatrix.hpp"
 | 
| 
 | 
    26 #include <vector>
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 /** \mainpage Summary
 | 
| 
 | 
    29  * 
 | 
| 
 | 
    30  * This is the automatically-generated reference manual of the C++
 | 
| 
 | 
    31  * egglib-cpp library. The library is presented as several modules, but
 | 
| 
 | 
    32  * note that they are only used to structure the documentation.
 | 
| 
 | 
    33  * 
 | 
| 
 | 
    34  * There is a single namespace (egglib) in which all classes are
 | 
| 
 | 
    35  * defined. See an example of programming with egglib-cpp in the
 | 
| 
 | 
    36  * EggLib package main documentation. Use "Modules" or "Classes" above
 | 
| 
 | 
    37  * to navigate in the library reference manual.
 | 
| 
 | 
    38  * 
 | 
| 
 | 
    39  */
 | 
| 
 | 
    40 
 | 
| 
 | 
    41 
 | 
| 
 | 
    42 /** \defgroup core core
 | 
| 
 | 
    43  *
 | 
| 
 | 
     44  * \brief Central core of the C++ library of EggLib
 | 
| 
 | 
    45  *
 | 
| 
 | 
    46  * Data storage classes, parsers/formatters and tools, plus exception
 | 
| 
 | 
    47  * types.
 | 
| 
 | 
    48  * 
 | 
| 
 | 
    49  */
 | 
| 
 | 
    50 
 | 
| 
 | 
    51 namespace egglib {
 | 
| 
 | 
    52 
 | 
| 
 | 
    53 
 | 
| 
 | 
    54    /** \brief Handles a sequence alignment
 | 
| 
 | 
    55     *
 | 
| 
 | 
    56     * \ingroup core
 | 
| 
 | 
    57     * 
 | 
| 
 | 
    58     * Creation from a file or string stream should be performed using
 | 
| 
 | 
    59     * the class Fasta. Align objects can be created by deep copy from
 | 
| 
 | 
    60     * both Align and Container type. In the latter case, the length are
 | 
| 
 | 
    61     * artificially equalized by "?" characters. Align objects can be
 | 
| 
 | 
    62     * created from a DataMatrix object (and all the way arround) using
 | 
| 
 | 
    63     * the specific class DMAConverter.
 | 
| 
 | 
    64     *
 | 
| 
 | 
    65     * Sequences are represented by two strings (name and sequence) and
 | 
| 
 | 
    66     * an integer (group) that can be accessed or modified by index.The
 | 
| 
 | 
    67     * order of sequences is guaranteed to be conserved, as if Align was
 | 
| 
 | 
    68     * a list of triplets (name, sequence, group).
 | 
| 
 | 
    69     *
 | 
| 
 | 
    70     * The data matrix is implemented as continuous array (char**) and
 | 
| 
 | 
    71     * allows efficient access and modification of data. For very large
 | 
| 
 | 
    72     * data matrices you might claim immediately the required memory
 | 
| 
 | 
    73     * using the constructor Align(unsigned int, char**).
 | 
| 
 | 
    74     * 
 | 
| 
 | 
    75     */
 | 
| 
 | 
    76     class Align : public Container, public CharMatrix {
 | 
| 
 | 
    77         public:
 | 
| 
 | 
    78         
 | 
| 
 | 
    79            /** \brief Creates an empty alignment
 | 
| 
 | 
    80             * 
 | 
| 
 | 
    81             */
 | 
| 
 | 
    82             Align();
 | 
| 
 | 
    83 
 | 
| 
 | 
    84 
 | 
| 
 | 
    85            /** \brief Creates an alignment from a data matrix.
 | 
| 
 | 
    86             * 
 | 
| 
 | 
    87             * Allows you to create an object from data stored in a char*
 | 
| 
 | 
    88             * array. The array's dimensions must be passed to the
 | 
| 
 | 
    89             * constructor, and as a result there is not need to
 | 
| 
 | 
    90             * terminate each sequence by a NULL character.
 | 
| 
 | 
    91             * 
 | 
| 
 | 
    92             * \param number_of_sequences the number of sequences (the
 | 
| 
 | 
    93             * length of the first dimension of the array).
 | 
| 
 | 
    94             * 
 | 
| 
 | 
    95             * \param alignment_length the length of sequences (the
 | 
| 
 | 
    96             * length of all lines of the array).
 | 
| 
 | 
    97             * 
 | 
| 
 | 
    98             * \param cstring_array the pointer to the data matrix.
 | 
| 
 | 
    99             * 
 | 
| 
 | 
   100             */
 | 
| 
 | 
   101             Align(unsigned int number_of_sequences, unsigned int alignment_length, char const * const * const cstring_array);
 | 
| 
 | 
   102 
 | 
| 
 | 
   103 
 | 
| 
 | 
   104             /** \brief Creates an alignment with given dimensions
 | 
| 
 | 
   105              * 
 | 
| 
 | 
   106              * Allows you to allocate directly a data matrix of a given
 | 
| 
 | 
   107              * size. Names are empty strings, groups 0, and all
 | 
| 
 | 
   108              * characters are ?.
 | 
| 
 | 
   109              * 
 | 
| 
 | 
   110             * \param number_of_sequences the number of sequences (the
 | 
| 
 | 
   111             * length of the first dimension of the array).
 | 
| 
 | 
   112             * 
 | 
| 
 | 
   113             * \param alignment_length the length of sequences (the
 | 
| 
 | 
   114             * length of all lines of the array).
 | 
| 
 | 
   115             * 
 | 
| 
 | 
   116             */
 | 
| 
 | 
   117             Align(unsigned int number_of_sequences, unsigned int alignment_length);
 | 
| 
 | 
   118 
 | 
| 
 | 
   119 
 | 
| 
 | 
   120            /** \brief Copy constructor
 | 
| 
 | 
   121             * 
 | 
| 
 | 
   122             */
 | 
| 
 | 
   123             Align(const Align& align);
 | 
| 
 | 
   124 
 | 
| 
 | 
   125 
 | 
| 
 | 
   126            /** \brief Copy constructor accepting a Container object
 | 
| 
 | 
   127             * 
 | 
| 
 | 
   128             * All but the longest sequences are padded with ? to match
 | 
| 
 | 
   129             * the longest sequence's length.
 | 
| 
 | 
   130             * 
 | 
| 
 | 
   131             */
 | 
| 
 | 
   132             Align(const Container& container);
 | 
| 
 | 
   133 
 | 
| 
 | 
   134 
 | 
| 
 | 
   135            /** \brief Copy operator
 | 
| 
 | 
   136             * 
 | 
| 
 | 
   137             */
 | 
| 
 | 
   138             Align& operator=(const Align& align);
 | 
| 
 | 
   139 
 | 
| 
 | 
   140 
 | 
| 
 | 
   141            /** \brief Copy operator accepting a Container object
 | 
| 
 | 
   142             * 
 | 
| 
 | 
   143             * All but the longest sequences are padded with ? to match
 | 
| 
 | 
   144             * the longest sequence's length.
 | 
| 
 | 
   145             * 
 | 
| 
 | 
   146             */
 | 
| 
 | 
   147             Align& operator=(const Container& container);
 | 
| 
 | 
   148 
 | 
| 
 | 
   149 
 | 
| 
 | 
   150            /** \brief Destructor
 | 
| 
 | 
   151             * 
 | 
| 
 | 
   152             */
 | 
| 
 | 
   153             virtual ~Align();
 | 
| 
 | 
   154 
 | 
| 
 | 
   155 
 | 
| 
 | 
   156            /** \brief Adds a sequence
 | 
| 
 | 
   157             *
 | 
| 
 | 
   158             * If the object already contains at least one sequence, the
 | 
| 
 | 
   159             * new sequence must have the same length. Otherwise, a
 | 
| 
 | 
   160             * EggUnalignedError is raised.
 | 
| 
 | 
   161             *
 | 
| 
 | 
   162             * \param name the name of the sequence.
 | 
| 
 | 
   163             * \param sequence the sequence string.
 | 
| 
 | 
   164             * \param group the group index of the sequence.
 | 
| 
 | 
   165             * \return The new number of sequences.
 | 
| 
 | 
   166             * 
 | 
| 
 | 
   167             */
 | 
| 
 | 
   168             virtual unsigned int append(const char* name, const char* sequence, unsigned int group=0);
 | 
| 
 | 
   169 
 | 
| 
 | 
   170 
 | 
| 
 | 
   171            /** \brief Removes a position (column) of the alignment
 | 
| 
 | 
   172             *
 | 
| 
 | 
   173             * \param pos the position to remove in the alignment.
 | 
| 
 | 
   174             * \return The new length of the alignment.
 | 
| 
 | 
   175             *
 | 
| 
 | 
   176             */
 | 
| 
 | 
   177             virtual unsigned int removePosition(unsigned int pos);
 | 
| 
 | 
   178 
 | 
| 
 | 
   179 
 | 
| 
 | 
   180            /** \brief Removes a sequence from the alignment
 | 
| 
 | 
   181             *
 | 
| 
 | 
   182             * \param pos the index of the sequence to remove.
 | 
| 
 | 
   183             * \return The new number of sequences.
 | 
| 
 | 
   184             * 
 | 
| 
 | 
   185             */
 | 
| 
 | 
   186             virtual unsigned int remove(unsigned int pos);
 | 
| 
 | 
   187 
 | 
| 
 | 
   188 
 | 
| 
 | 
   189            /** \brief Replace a sequence string
 | 
| 
 | 
   190             * 
 | 
| 
 | 
   191             * The new sequence must have the same length than the
 | 
| 
 | 
   192             * alignment. Otherwise, a EggUnalignedError is raised.
 | 
| 
 | 
   193             * 
 | 
| 
 | 
   194             * \param seq the index of the sequence to change.
 | 
| 
 | 
   195             * \param sequence the new sequence.
 | 
| 
 | 
   196             * 
 | 
| 
 | 
   197             */
 | 
| 
 | 
   198             virtual void sequence(unsigned int seq, const char* sequence);
 | 
| 
 | 
   199 
 | 
| 
 | 
   200 
 | 
| 
 | 
   201            /** \brief Gets the name of a given sequence
 | 
| 
 | 
   202             * 
 | 
| 
 | 
   203             * \param pos the index of the sequence.
 | 
| 
 | 
   204             * 
 | 
| 
 | 
   205             * \return The sequence string for that particular sequence.
 | 
| 
 | 
   206             * 
 | 
| 
 | 
   207             */
 | 
| 
 | 
   208             virtual inline const char* sequence(unsigned int pos) const { return Container::sequence(pos); }
 | 
| 
 | 
   209             
 | 
| 
 | 
   210             
 | 
| 
 | 
   211            /** \brief Alignment length
 | 
| 
 | 
   212             * 
 | 
| 
 | 
   213             * Returns 0 if the alignment is empty.
 | 
| 
 | 
   214             * 
 | 
| 
 | 
   215             */
 | 
| 
 | 
   216             virtual  unsigned int ls() const;
 | 
| 
 | 
   217 
 | 
| 
 | 
   218 
 | 
| 
 | 
   219            /** \brief Length of a given sequence
 | 
| 
 | 
   220             * 
 | 
| 
 | 
   221             * Calling this function is exactly the same as calling ls()
 | 
| 
 | 
   222             * (without arguments), regardless of the index provided,
 | 
| 
 | 
   223             * except that an exception is thrown if the index is out of
 | 
| 
 | 
   224             * bounds. Provided for compatibility with Container.
 | 
| 
 | 
   225             * 
 | 
| 
 | 
   226             * \param pos the index of the sequence.
 | 
| 
 | 
   227             * \return the length of the alignment.
 | 
| 
 | 
   228             * 
 | 
| 
 | 
   229             */
 | 
| 
 | 
   230             virtual unsigned int ls(unsigned int pos) const;
 | 
| 
 | 
   231 
 | 
| 
 | 
   232 
 | 
| 
 | 
   233            /** \brief Fast and unsecure accessor
 | 
| 
 | 
   234             * 
 | 
| 
 | 
   235             * This accessor doesn't perform out-of-bound checking!
 | 
| 
 | 
   236             * 
 | 
| 
 | 
   237             * \param s the index of the sequence (line).
 | 
| 
 | 
   238             * \param p the position in the alignment (column).
 | 
| 
 | 
   239             * \return The character at the given position.
 | 
| 
 | 
   240             * 
 | 
| 
 | 
   241             */
 | 
| 
 | 
   242             inline char character(unsigned int s, unsigned int p) const { return sequences[s][p]; }
 | 
| 
 | 
   243 
 | 
| 
 | 
   244 
 | 
| 
 | 
   245            /** \brief Gets a nucleotide
 | 
| 
 | 
   246             * 
 | 
| 
 | 
   247             * This modifier does perform out-of-bound checking.
 | 
| 
 | 
   248             * The specified position must exist.
 | 
| 
 | 
   249             * 
 | 
| 
 | 
   250             * \param sequence the index of the sequence (line).
 | 
| 
 | 
   251             * \param position the position in the alignment (column).
 | 
| 
 | 
   252             * \return the character at the given position.
 | 
| 
 | 
   253             * 
 | 
| 
 | 
   254             */
 | 
| 
 | 
   255             virtual char get(unsigned int sequence, unsigned int position) const;
 | 
| 
 | 
   256 
 | 
| 
 | 
   257 
 | 
| 
 | 
   258            /** \brief Sets a matrix position to a new character
 | 
| 
 | 
   259             * 
 | 
| 
 | 
   260             * This modifier does perform out-of-bound checking.
 | 
| 
 | 
   261             * The specified position must exist.
 | 
| 
 | 
   262             * 
 | 
| 
 | 
   263             * \param sequence the index of the sequence (line).
 | 
| 
 | 
   264             * \param position the position in the alignment (column).
 | 
| 
 | 
   265             * \param ch the new character value.
 | 
| 
 | 
   266             */
 | 
| 
 | 
   267             virtual void set(unsigned int sequence, unsigned position, char ch);
 | 
| 
 | 
   268 
 | 
| 
 | 
   269 
 | 
| 
 | 
   270            /** \brief Reverse a given column in binary data
 | 
| 
 | 
   271             *
 | 
| 
 | 
   272             * The specified column must contain only "0" ans "1" characters.
 | 
| 
 | 
   273             * "0" is replaced by "1" and all the way around
 | 
| 
 | 
   274             * 
 | 
| 
 | 
   275             */
 | 
| 
 | 
   276             void binSwitch(unsigned int pos);
 | 
| 
 | 
   277 
 | 
| 
 | 
   278 
 | 
| 
 | 
   279            /** \brief Extracts specified positions (columns) of the alignment
 | 
| 
 | 
   280             *
 | 
| 
 | 
   281             * All the specified sites are extracted in the specified
 | 
| 
 | 
   282             * order. This function is suitable for bootstrap (resample
 | 
| 
 | 
   283             * allowing redrawing the same site) and permutations.
 | 
| 
 | 
   284             * 
 | 
| 
 | 
   285             * This function doesn't perform out-of-bound checking.
 | 
| 
 | 
   286             * 
 | 
| 
 | 
   287             * \param list_of_sites a vector containing alignment
 | 
| 
 | 
   288             * positions.
 | 
| 
 | 
   289             * 
 | 
| 
 | 
   290             * \return A copy of the object containing the specified
 | 
| 
 | 
   291             * set of positions.
 | 
| 
 | 
   292             * 
 | 
| 
 | 
   293             */
 | 
| 
 | 
   294             Align vslice(std::vector<unsigned int> list_of_sites);
 | 
| 
 | 
   295 
 | 
| 
 | 
   296 
 | 
| 
 | 
   297            /** \brief Extracts a range of positions (columns)
 | 
| 
 | 
   298             * 
 | 
| 
 | 
   299             * \param a the first position.
 | 
| 
 | 
   300             * 
 | 
| 
 | 
   301             * \param b the index immediately passed the last sequence to
 | 
| 
 | 
   302             * extract.
 | 
| 
 | 
   303             * 
 | 
| 
 | 
   304             * \return A copy of the object containing the specified
 | 
| 
 | 
   305             * range of sequences.
 | 
| 
 | 
   306             * 
 | 
| 
 | 
   307             * Positions a to b-1 are extracted, provided that the
 | 
| 
 | 
   308             * indices fit in the current length of sequences. To extract
 | 
| 
 | 
   309             * all sequences, use align.vslice(0, align.ls()).
 | 
| 
 | 
   310             * 
 | 
| 
 | 
   311             * Note: invalid ranges will be silently supported. If
 | 
| 
 | 
   312             * a>=ls or b<=a, an empty object is returned. If b>ns,
 | 
| 
 | 
   313             * ls will be substituted to a.
 | 
| 
 | 
   314             */
 | 
| 
 | 
   315             Align vslice(unsigned int a, unsigned int b);
 | 
| 
 | 
   316 
 | 
| 
 | 
   317 
 | 
| 
 | 
   318            /** \brief Deletes all the content of the object
 | 
| 
 | 
   319             * 
 | 
| 
 | 
   320             */
 | 
| 
 | 
   321             virtual void clear();
 | 
| 
 | 
   322 
 | 
| 
 | 
   323 
 | 
| 
 | 
   324            /** \brief Same as ns()
 | 
| 
 | 
   325             * 
 | 
| 
 | 
   326             */
 | 
| 
 | 
   327             inline unsigned int numberOfSequences() const {
 | 
| 
 | 
   328                 return _ns;
 | 
| 
 | 
   329             }
 | 
| 
 | 
   330 
 | 
| 
 | 
   331 
 | 
| 
 | 
   332            /** \brief Same as ls()
 | 
| 
 | 
   333             * 
 | 
| 
 | 
   334             */
 | 
| 
 | 
   335             inline unsigned int numberOfSites() const {
 | 
| 
 | 
   336                 return _ls;
 | 
| 
 | 
   337             }
 | 
| 
 | 
   338 
 | 
| 
 | 
   339 
 | 
| 
 | 
   340            /** \brief Gets a group label (insecure)
 | 
| 
 | 
   341             * 
 | 
| 
 | 
   342             */
 | 
| 
 | 
   343             inline unsigned int populationLabel(unsigned int sequenceIndex) const {
 | 
| 
 | 
   344                 return groups[sequenceIndex];
 | 
| 
 | 
   345             }
 | 
| 
 | 
   346             
 | 
| 
 | 
   347             
 | 
| 
 | 
   348            /** \brief Just return the passed value
 | 
| 
 | 
   349             *
 | 
| 
 | 
   350             */
 | 
| 
 | 
   351             inline double sitePosition(unsigned int position) const {
 | 
| 
 | 
   352                 return (double) position;
 | 
| 
 | 
   353             }
 | 
| 
 | 
   354 
 | 
| 
 | 
   355 
 | 
| 
 | 
   356         protected:
 | 
| 
 | 
   357         
 | 
| 
 | 
   358             /// This function is not available for alignments
 | 
| 
 | 
   359             virtual void appendSequence(unsigned int pos, const char* sequence) {}
 | 
| 
 | 
   360 
 | 
| 
 | 
   361             // Initializer (creates a valid empty alignment)
 | 
| 
 | 
   362             virtual void init();
 | 
| 
 | 
   363         
 | 
| 
 | 
   364             // Makes a deep copy of the specified data matrix - if cstring_array is NULL, then ignores it and pads with ?'s
 | 
| 
 | 
   365             virtual void setFromSource(unsigned int number_of_sequences, unsigned int alignment_length, const char* const * const cstring_array);
 | 
| 
 | 
   366 
 | 
| 
 | 
   367             // Copies from a Container
 | 
| 
 | 
   368             virtual void copyObject(const Container&);
 | 
| 
 | 
   369             
 | 
| 
 | 
   370             // Copies from an Align
 | 
| 
 | 
   371             virtual void copyObject(const Align&);
 | 
| 
 | 
   372             
 | 
| 
 | 
   373             // Alignment length
 | 
| 
 | 
   374             unsigned int _ls;
 | 
| 
 | 
   375     };
 | 
| 
 | 
   376 }
 | 
| 
 | 
   377 
 | 
| 
 | 
   378 #endif
 |