| 
1
 | 
     1 /*
 | 
| 
 | 
     2     Copyright 2008,2009,2011 Stéphane De Mita and Mathieu Siol
 | 
| 
 | 
     3 
 | 
| 
 | 
     4     This file is part of the EggLib library.
 | 
| 
 | 
     5 
 | 
| 
 | 
     6     EggLib is free software: you can redistribute it and/or modify
 | 
| 
 | 
     7     it under the terms of the GNU General Public License as published by
 | 
| 
 | 
     8     the Free Software Foundation, either version 3 of the License, or
 | 
| 
 | 
     9     (at your option) any later version.
 | 
| 
 | 
    10 
 | 
| 
 | 
    11     EggLib is distributed in the hope that it will be useful,
 | 
| 
 | 
    12     but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
| 
 | 
    13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
| 
 | 
    14     GNU General Public License for more details.
 | 
| 
 | 
    15 
 | 
| 
 | 
    16     You should have received a copy of the GNU General Public License
 | 
| 
 | 
    17     along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
 | 
| 
 | 
    18 */
 | 
| 
 | 
    19 
 | 
| 
 | 
    20 #ifndef EGGLIB_GMS_HPP
 | 
| 
 | 
    21 #define EGGLIB_GMS_HPP
 | 
| 
 | 
    22 
 | 
| 
 | 
#include "DataMatrix.hpp"
#include <istream>
#include <ostream>
#include <string>
 | 
| 
 | 
    26 
 | 
| 
 | 
    27 namespace egglib {
 | 
| 
 | 
    28 
 | 
| 
 | 
    29     /** \brief ms-like sequence format parser
 | 
| 
 | 
    30      * 
 | 
| 
 | 
    31      * The class provides parsing (input) and formatting (output)
 | 
| 
 | 
    32      * operations in ms format, that is the format used by Richard
 | 
| 
 | 
    33      * Hudson's program ms for outputting genotypes and by the
 | 
| 
 | 
    34      * associated program samplestat for reading them. Both types of
 | 
| 
 | 
    35      * operations are available through static methods using either
 | 
| 
 | 
    36      * a string or a stream (which can be a stream to or from a file
 | 
| 
 | 
    37      * or a string). In either case, types from the STL are used.
 | 
| 
 | 
    38      * Although ms deals only with data coded with 0 and 1, the class Ms
 | 
| 
 | 
    39      * offers the possibility of both importing and exporting data coded
 | 
| 
 | 
    40      * with by integer. All methods have an option named "separated". If
 | 
| 
 | 
    41      * this option is true, the parser or formatter introduces a slight
 | 
| 
 | 
    42      * modification of the format: genotypes individual data are
 | 
| 
 | 
    43      * separated by a white space ("1 0 1 1" instead of "1011", allowing
 | 
| 
 | 
    44      * genotype values larger than 9: "1 0 11 1").
 | 
| 
 | 
    45      *
 | 
| 
 | 
    46      * \ingroup core
 | 
| 
 | 
    47      *
 | 
| 
 | 
    48      */
 | 
| 
 | 
    49      class Ms {
 | 
| 
 | 
    50 
 | 
| 
 | 
    51       public:
 | 
| 
 | 
    52          
 | 
| 
 | 
    53        /** \brief Imports a sequence alignment
 | 
| 
 | 
    54         * 
 | 
| 
 | 
    55         * Creates a istringstream from the string and calls the
 | 
| 
 | 
    56         * overloaded method.
 | 
| 
 | 
    57         * 
 | 
| 
 | 
    58         * \param str the string to parse.
 | 
| 
 | 
    59         * \param ns the expected number of sequences.
 | 
| 
 | 
    60         * \param separated true if a white space separator is placed
 | 
| 
 | 
    61         * between genotype at each site.
 | 
| 
 | 
    62         *
 | 
| 
 | 
    63         * \return A sequence alignment as a data matrix.
 | 
| 
 | 
    64         */
 | 
| 
 | 
    65         static DataMatrix get(std::string, unsigned int ns, bool separated=false);
 | 
| 
 | 
    66 
 | 
| 
 | 
    67 
 | 
| 
 | 
    68        /** \brief Imports a sequence alignment
 | 
| 
 | 
    69         * 
 | 
| 
 | 
    70         * Attemps to generate a DataMatrix object from the stream.
 | 
| 
 | 
    71         * Reads only one simulation and throws a SeqlibFormatError
 | 
| 
 | 
    72         * exception in case of format error.
 | 
| 
 | 
    73         * 
 | 
| 
 | 
    74         * Allows any number of white lines before the //, but no other
 | 
| 
 | 
    75         * data. Supports \r at the end of lines (before the \n).
 | 
| 
 | 
    76         * Accepted symbols are all integers (0-9).
 | 
| 
 | 
    77         *
 | 
| 
 | 
    78         * \param stream the stream to parse.
 | 
| 
 | 
    79         * \param ns the expected number of sequences.
 | 
| 
 | 
    80         * \param separated true if a white space separator is placed
 | 
| 
 | 
    81         * between genotype at each site.
 | 
| 
 | 
    82         * 
 | 
| 
 | 
    83         * \return A sequence alignment as a data matrix.
 | 
| 
 | 
    84         */
 | 
| 
 | 
    85         static DataMatrix get(std::istream& stream, unsigned int ns, bool separated=false);
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 
 | 
| 
 | 
    88        /** \brief Exports a sequence alignment
 | 
| 
 | 
    89         * 
 | 
| 
 | 
    90         * Internally creates a stringstream, calls the overloaded method
 | 
| 
 | 
    91         * and returns the outcome.
 | 
| 
 | 
    92         *
 | 
| 
 | 
    93         * \param dataMatrix the alignment object to write.
 | 
| 
 | 
    94         * \param separated true if a white space separator must be placed
 | 
| 
 | 
    95         * between the genotype at each site.
 | 
| 
 | 
    96         * 
 | 
| 
 | 
    97         */
 | 
| 
 | 
    98         static std::string format(DataMatrix& dataMatrix, bool separated=false);
 | 
| 
 | 
    99         
 | 
| 
 | 
   100         
 | 
| 
 | 
   101        /** \brief Exports a sequence alignment
 | 
| 
 | 
   102         * 
 | 
| 
 | 
   103         * Writes the formatted string to the stream 'on the fly'. The
 | 
| 
 | 
   104         * formatted string is guaranteed to starts with a // line and
 | 
| 
 | 
   105         * ends with an empty line. The client is expected to take care
 | 
| 
 | 
   106         * of writing any header and add an additional white line between
 | 
| 
 | 
   107         * simulations if needed. The method throws a SeqlibRuntimeError
 | 
| 
 | 
   108         * if the stream is not writable. The data matrix should contain
 | 
| 
 | 
   109         * only data within range 0-9 if separated is false (default) and
 | 
| 
 | 
   110         * any positive (>=0) integer if separated is true. Note that
 | 
| 
 | 
   111         * output generated with separated=true is never compatible with
 | 
| 
 | 
   112         * the original ms format, and that output generated with
 | 
| 
 | 
   113         * separator=false is compatible with the original ms format only
 | 
| 
 | 
   114         * if all alleles are 0 or 1 (which is not checked by this
 | 
| 
 | 
   115         * formatted).
 | 
| 
 | 
   116         * 
 | 
| 
 | 
   117         * \param stream the stream (file or string stream) where to
 | 
| 
 | 
   118         * write the output.
 | 
| 
 | 
   119         * \param dataMatrix the alignment object to write.
 | 
| 
 | 
   120         * \param separated true if a white space separator must be placed
 | 
| 
 | 
   121         * between the genotype at each site.
 | 
| 
 | 
   122         * 
 | 
| 
 | 
   123         */
 | 
| 
 | 
   124         static void format(std::ostream& stream, DataMatrix& dataMatrix, bool separated=false);
 | 
| 
 | 
   125 
 | 
| 
 | 
   126 
 | 
| 
 | 
   127        /** \brief Returns the last tMRCA read by any Ms instance
 | 
| 
 | 
   128         * 
 | 
| 
 | 
   129         * If a tMRCA value was present in the last simulation read by
 | 
| 
 | 
   130         * any Ms instance, it will be returned by this method. A value
 | 
| 
 | 
   131         * of -1. is returned if no simulation was read, or if the last
 | 
| 
 | 
   132         * simulation didn't contain a tMRCA value or if the last
 | 
| 
 | 
   133         * simulation provoked an exception before reaching the tMRCA
 | 
| 
 | 
   134         * line.
 | 
| 
 | 
   135         * 
 | 
| 
 | 
   136         */
 | 
| 
 | 
   137         static double tMRCA();
 | 
| 
 | 
   138 
 | 
| 
 | 
   139 
 | 
| 
 | 
   140        /** \brief Returns the last "prob" read by any Ms instance
 | 
| 
 | 
   141         * 
 | 
| 
 | 
   142         * "prob" is returned by ms when a fixed number of segregating
 | 
| 
 | 
   143         * sites is used in conjunction with a theta value. If a "prob"
 | 
| 
 | 
   144         * value was present in the last simulation read by any Ms
 | 
| 
 | 
   145         * instance, it will be returned by this method. A value of -1
 | 
| 
 | 
   146         * is returned if no simulation was read, or if the last
 | 
| 
 | 
   147         * simulation didn't contain a "prob" value or if the last
 | 
| 
 | 
   148         * simulation provoked an exception before reaching the "prob"
 | 
| 
 | 
   149         * line.
 | 
| 
 | 
   150         * 
 | 
| 
 | 
   151         */
 | 
| 
 | 
   152         static double prob();
 | 
| 
 | 
   153     
 | 
| 
 | 
   154 
 | 
| 
 | 
   155        /** \brief Returns the tree string found in the last simulation read by any Ms instance
 | 
| 
 | 
   156         * 
 | 
| 
 | 
   157         * If one or more trees were present in the last simulation read
 | 
| 
 | 
   158         * by any Ms instance, they will be returned as a unique string
 | 
| 
 | 
   159         * by this method. An empty string is returned if no simulation
 | 
| 
 | 
   160         * was read, or if the last simulation, or if the last simulation
 | 
| 
 | 
   161         * didn't contain any tree value or if the last simulation
 | 
| 
 | 
   162         * provoked an exception before reaching the tree line.
 | 
| 
 | 
   163         * 
 | 
| 
 | 
   164         * Note: the trees are returned as a single line.
 | 
| 
 | 
   165         * 
 | 
| 
 | 
   166         */
 | 
| 
 | 
   167         static std::string trees();
 | 
| 
 | 
   168 
 | 
| 
 | 
   169          
 | 
| 
 | 
   170       private:
 | 
| 
 | 
   171         // Line parser (the last \n is extracted and discarded - no error upon EOF)
 | 
| 
 | 
   172         std::string next_line(std::istream& stream);
 | 
| 
 | 
   173         
 | 
| 
 | 
   174         /// tMRCA (-1 if not found in ms output)
 | 
| 
 | 
   175         static double _tMRCA;
 | 
| 
 | 
   176         
 | 
| 
 | 
   177         /// probability (-1 if not found in ms output)
 | 
| 
 | 
   178         static double _prob;
 | 
| 
 | 
   179         
 | 
| 
 | 
   180         /// tree string (maybe contain several trees) (empty string if not found in ms output)
 | 
| 
 | 
   181         static std::string _trees;
 | 
| 
 | 
   182 
 | 
| 
 | 
   183         
 | 
| 
 | 
   184         /// No instantiation allowed
 | 
| 
 | 
   185         Ms() { }
 | 
| 
 | 
   186         
 | 
| 
 | 
   187         /// A fortiori no destruction allowed
 | 
| 
 | 
   188         ~Ms() { }
 | 
| 
 | 
   189 
 | 
| 
 | 
   190         /// No copy allowed
 | 
| 
 | 
   191         Ms(const Ms&) { }
 | 
| 
 | 
   192 
 | 
| 
 | 
   193         /// No copy allowed
 | 
| 
 | 
   194         Ms& operator=(const Ms&) { return *this; }
 | 
| 
 | 
   195                 
 | 
| 
 | 
   196     };
 | 
| 
 | 
   197 }
 | 
| 
 | 
   198     
 | 
| 
 | 
   199 #endif
 |