| 1 | 1 /* | 
|  | 2     Copyright 2008,2009,2011 Stéphane De Mita and Mathieu Siol | 
|  | 3 | 
|  | 4     This file is part of the EggLib library. | 
|  | 5 | 
|  | 6     EggLib is free software: you can redistribute it and/or modify | 
|  | 7     it under the terms of the GNU General Public License as published by | 
|  | 8     the Free Software Foundation, either version 3 of the License, or | 
|  | 9     (at your option) any later version. | 
|  | 10 | 
|  | 11     EggLib is distributed in the hope that it will be useful, | 
|  | 12     but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|  | 14     GNU General Public License for more details. | 
|  | 15 | 
|  | 16     You should have received a copy of the GNU General Public License | 
|  | 17     along with EggLib.  If not, see <http://www.gnu.org/licenses/>. | 
|  | 18 */ | 
|  | 19 | 
|  | 20 #ifndef EGGLIB_GMS_HPP | 
|  | 21 #define EGGLIB_GMS_HPP | 
|  | 22 | 
|  | 23 #include "DataMatrix.hpp" | 
|  | 24 #include <string> | 
|  | 25 #include <istream> | 
|  | 26 | 
|  | 27 namespace egglib { | 
|  | 28 | 
|  | 29     /** \brief ms-like sequence format parser | 
|  | 30      * | 
|  | 31      * The class provides parsing (input) and formatting (output) | 
|  | 32      * operations in ms format, that is the format used by Richard | 
|  | 33      * Hudson's program ms for outputting genotypes and by the | 
|  | 34      * associated program samplestat for reading them. Both types of | 
|  | 35      * operations are available through static methods using either | 
|  | 36      * a string or a stream (which can be a stream to or from a file | 
|  | 37      * or a string). In either case, types from the STL are used. | 
|  | 38      * Although ms deals only with data coded with 0 and 1, the class Ms | 
|  | 39      * offers the possibility of both importing and exporting data coded | 
|  | 40      * with by integer. All methods have an option named "separated". If | 
|  | 41      * this option is true, the parser or formatter introduces a slight | 
|  | 42      * modification of the format: genotypes individual data are | 
|  | 43      * separated by a white space ("1 0 1 1" instead of "1011", allowing | 
|  | 44      * genotype values larger than 9: "1 0 11 1"). | 
|  | 45      * | 
|  | 46      * \ingroup core | 
|  | 47      * | 
|  | 48      */ | 
|  | 49      class Ms { | 
|  | 50 | 
|  | 51       public: | 
|  | 52 | 
|  | 53        /** \brief Imports a sequence alignment | 
|  | 54         * | 
|  | 55         * Creates a istringstream from the string and calls the | 
|  | 56         * overloaded method. | 
|  | 57         * | 
|  | 58         * \param str the string to parse. | 
|  | 59         * \param ns the expected number of sequences. | 
|  | 60         * \param separated true if a white space separator is placed | 
|  | 61         * between genotype at each site. | 
|  | 62         * | 
|  | 63         * \return A sequence alignment as a data matrix. | 
|  | 64         */ | 
|  | 65         static DataMatrix get(std::string, unsigned int ns, bool separated=false); | 
|  | 66 | 
|  | 67 | 
|  | 68        /** \brief Imports a sequence alignment | 
|  | 69         * | 
|  | 70         * Attemps to generate a DataMatrix object from the stream. | 
|  | 71         * Reads only one simulation and throws a SeqlibFormatError | 
|  | 72         * exception in case of format error. | 
|  | 73         * | 
|  | 74         * Allows any number of white lines before the //, but no other | 
|  | 75         * data. Supports \r at the end of lines (before the \n). | 
|  | 76         * Accepted symbols are all integers (0-9). | 
|  | 77         * | 
|  | 78         * \param stream the stream to parse. | 
|  | 79         * \param ns the expected number of sequences. | 
|  | 80         * \param separated true if a white space separator is placed | 
|  | 81         * between genotype at each site. | 
|  | 82         * | 
|  | 83         * \return A sequence alignment as a data matrix. | 
|  | 84         */ | 
|  | 85         static DataMatrix get(std::istream& stream, unsigned int ns, bool separated=false); | 
|  | 86 | 
|  | 87 | 
|  | 88        /** \brief Exports a sequence alignment | 
|  | 89         * | 
|  | 90         * Internally creates a stringstream, calls the overloaded method | 
|  | 91         * and returns the outcome. | 
|  | 92         * | 
|  | 93         * \param dataMatrix the alignment object to write. | 
|  | 94         * \param separated true if a white space separator must be placed | 
|  | 95         * between the genotype at each site. | 
|  | 96         * | 
|  | 97         */ | 
|  | 98         static std::string format(DataMatrix& dataMatrix, bool separated=false); | 
|  | 99 | 
|  | 100 | 
|  | 101        /** \brief Exports a sequence alignment | 
|  | 102         * | 
|  | 103         * Writes the formatted string to the stream 'on the fly'. The | 
|  | 104         * formatted string is guaranteed to starts with a // line and | 
|  | 105         * ends with an empty line. The client is expected to take care | 
|  | 106         * of writing any header and add an additional white line between | 
|  | 107         * simulations if needed. The method throws a SeqlibRuntimeError | 
|  | 108         * if the stream is not writable. The data matrix should contain | 
|  | 109         * only data within range 0-9 if separated is false (default) and | 
|  | 110         * any positive (>=0) integer if separated is true. Note that | 
|  | 111         * output generated with separated=true is never compatible with | 
|  | 112         * the original ms format, and that output generated with | 
|  | 113         * separator=false is compatible with the original ms format only | 
|  | 114         * if all alleles are 0 or 1 (which is not checked by this | 
|  | 115         * formatted). | 
|  | 116         * | 
|  | 117         * \param stream the stream (file or string stream) where to | 
|  | 118         * write the output. | 
|  | 119         * \param dataMatrix the alignment object to write. | 
|  | 120         * \param separated true if a white space separator must be placed | 
|  | 121         * between the genotype at each site. | 
|  | 122         * | 
|  | 123         */ | 
|  | 124         static void format(std::ostream& stream, DataMatrix& dataMatrix, bool separated=false); | 
|  | 125 | 
|  | 126 | 
|  | 127        /** \brief Returns the last tMRCA read by any Ms instance | 
|  | 128         * | 
|  | 129         * If a tMRCA value was present in the last simulation read by | 
|  | 130         * any Ms instance, it will be returned by this method. A value | 
|  | 131         * of -1. is returned if no simulation was read, or if the last | 
|  | 132         * simulation didn't contain a tMRCA value or if the last | 
|  | 133         * simulation provoked an exception before reaching the tMRCA | 
|  | 134         * line. | 
|  | 135         * | 
|  | 136         */ | 
|  | 137         static double tMRCA(); | 
|  | 138 | 
|  | 139 | 
|  | 140        /** \brief Returns the last "prob" read by any Ms instance | 
|  | 141         * | 
|  | 142         * "prob" is returned by ms when a fixed number of segregating | 
|  | 143         * sites is used in conjunction with a theta value. If a "prob" | 
|  | 144         * value was present in the last simulation read by any Ms | 
|  | 145         * instance, it will be returned by this method. A value of -1 | 
|  | 146         * is returned if no simulation was read, or if the last | 
|  | 147         * simulation didn't contain a "prob" value or if the last | 
|  | 148         * simulation provoked an exception before reaching the "prob" | 
|  | 149         * line. | 
|  | 150         * | 
|  | 151         */ | 
|  | 152         static double prob(); | 
|  | 153 | 
|  | 154 | 
|  | 155        /** \brief Returns the tree string found in the last simulation read by any Ms instance | 
|  | 156         * | 
|  | 157         * If one or more trees were present in the last simulation read | 
|  | 158         * by any Ms instance, they will be returned as a unique string | 
|  | 159         * by this method. An empty string is returned if no simulation | 
|  | 160         * was read, or if the last simulation, or if the last simulation | 
|  | 161         * didn't contain any tree value or if the last simulation | 
|  | 162         * provoked an exception before reaching the tree line. | 
|  | 163         * | 
|  | 164         * Note: the trees are returned as a single line. | 
|  | 165         * | 
|  | 166         */ | 
|  | 167         static std::string trees(); | 
|  | 168 | 
|  | 169 | 
|  | 170       private: | 
|  | 171         // Line parser (the last \n is extracted and discarded - no error upon EOF) | 
|  | 172         std::string next_line(std::istream& stream); | 
|  | 173 | 
|  | 174         /// tMRCA (-1 if not found in ms output) | 
|  | 175         static double _tMRCA; | 
|  | 176 | 
|  | 177         /// probability (-1 if not found in ms output) | 
|  | 178         static double _prob; | 
|  | 179 | 
|  | 180         /// tree string (maybe contain several trees) (empty string if not found in ms output) | 
|  | 181         static std::string _trees; | 
|  | 182 | 
|  | 183 | 
|  | 184         /// No instantiation allowed | 
|  | 185         Ms() { } | 
|  | 186 | 
|  | 187         /// A fortiori no destruction allowed | 
|  | 188         ~Ms() { } | 
|  | 189 | 
|  | 190         /// No copy allowed | 
|  | 191         Ms(const Ms&) { } | 
|  | 192 | 
|  | 193         /// No copy allowed | 
|  | 194         Ms& operator=(const Ms&) { return *this; } | 
|  | 195 | 
|  | 196     }; | 
|  | 197 } | 
|  | 198 | 
|  | 199 #endif |