Mercurial > repos > dereeper > sniplay
view egglib/egglib-2.1.5/include/egglib-cpp/Convert.hpp @ 9:98c37a5d67f4 draft
Uploaded
author | dereeper |
---|---|
date | Wed, 07 Feb 2018 22:08:47 -0500 |
parents | 420b57c3c185 |
children |
line wrap: on
line source
/* Copyright 2009 Stéphane De Mita, Mathieu Siol This file is part of the EggLib library. EggLib is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. EggLib is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with EggLib. If not, see <http://www.gnu.org/licenses/>. */ #ifndef EGGLIB_CONVERT_HPP #define EGGLIB_CONVERT_HPP #include "DataMatrix.hpp" #include "Align.hpp" #include "EggException.hpp" #include "Random.hpp" #include <string> #include "config.h" #ifdef HAVE_LIBBPP_SEQ #include <Bpp/Seq/Alphabet.all> #include <Bpp/Seq/Sequence.h> #include <Bpp/Seq/Container.all> #endif namespace egglib { /** \brief Performs conversion between sequence holder types * * \ingroup core * * Static methods of this class allows conversion between sequence * holder types implying parametrizable modifications. * */ class Convert { public: /** \brief DataMatrix to Align conversion * * By defaut, this method generates an Align instance * containing only the polymorphic sites. The integers of * the DataMatrix will be converted as follow: 0 to A, 1 to * C, 2 to G and 3 to T. This behaviour can be largely * modified using options. * * \param dataMatrix DataMatrix instance. * * \param length length of the desired alignment. Non-varying * stretches of data will be introduced to reach the * specified length. By default the positions of segregating * sites will be determined from the positions given by the * DataMatrix object. Those positions are expressed in a * continuous range, and will be discretized. Mutations * falling on the same site will be moved of one position * left or right (always preserving the order of mutation * sites). If positions are all zero (the default of the * DataMatrix class) and if length is larger than the number * of segregating sites, then all segregating sites will * cluster on the left-hand side of the alignment. * * \param random the address to a Random object allowing to * draw random numbers (for randomizing positions and/or * non-varying states). If an address is provided but no * random numbers are required, it is ignored. If no address * if provided and random numbers are required, a Random * instance is built internally. * * \param randomizePositions if true, the positions specified * in the DataMatrix objects are ignored and the positions of * mutations are drawn randomly along the interval (only if * the specified length is larger than the number of * segregating sites). If randomizePositions and false and * positions are not * * \param enforceLength specify whether a * EggRuntimeError should be thrown when the number of * polymorphic sites is larger than the specified length. If * false (the default) and in cases where the specified * length is too short to harbor all polymorphic sites, the * alignment length will be increased as needed. * * \param randomizeNonVaryingStates if true, the stretches of * conserved positions (between segregating sites) will be * randomly drawn from the current symbol mapping. Otherwise, * the symbol given by fixed will be used. * * \param randomizeAlleles if true, alleles will be drawn * randomly from the mapped characters. Note that if a * genotype value is larger than the size of the mapping, it * will be replaced by the character given by unknown, * without randomization. In other words, with the mapping * "ACGT", alleles 0, 1, 2 and 3 will be randomly assigned * to these four characters, but larger and negative alleles * will be assigned to the unknown character. * * \param mapping a string given the character to assign to * different character values read from the DataMatrix. If * the read value is 0, the first character of the string * will used, the the value is 1, the second character will * be used, and so on. If the integer read is out of range * (in particular, for any negative value), then the * character given by unknown will be used. An empty string * will always lead to alignments containing only the * character given by unknown. The string "01" is suitable * for binary data. * * \param unknown the character to use if an integer genotype * value is not mapped in the mapping string (that is, if * the mapping string is too short). * * \param nonVaryingState character to use for conserved * stretches of data. It doesn't have to be included in the * mapping. If randomizeNonVaryingState is true, this * argument is ignored. * * \return The resulting Align object. * */ static Align align( DataMatrix& dataMatrix, unsigned int length=0, Random* random=NULL, bool randomizePositions=false, bool randomizeNonVaryingStates=false, bool randomizeAlleles=false, bool enforceLength=false, std::string mapping="ACGT", char unknown='?', char nonVaryingState='A' ); #ifdef HAVE_LIBBPP_SEQ /** \brief Converts an alignment to the equivalent Bio++ type * * During conversion, name information is lost (arbitrary * names are generated in order toprevent duplicate names). * The object is attached to an alphabet matching the passed * integer. The names are bare rank integers (starting at the * value giving by *offset*). * * \param align the source alignment object. * * \param alphabetID an integer indicating which alphabet to * use: * - 1 for DNA * - 2 for RNA * - 3 for proteins * - 4 for standard codon * - 5 for vertebrate mitochondrial codon * - 6 for invertebrate mitochondrial codon * - 7 for echinoderm mitochondrial codon * . * Other values will result in an exception. * * \param outgroupFlag an integer indicating whether to * include outgroup sequences: * - 0 use all sequences * - 1 use only sequences without 999 label (ingroup) * - 2 use only sequences with 999 label (outgroup) * . * Other values will result in an exception. * * \param offset enter an integer to shift the names of the * resulting alignment (useful to merge alignment and ensure * that names are not duplicated). * * \return A Bio++ alignment. * */ static bpp::AlignedSequenceContainer egglib2bpp(Align& align, unsigned int alphabetID, unsigned int outgroupFlag, unsigned int offset=0); #endif protected: /** \brief This class cannot be instantiated * */ Convert() { } /** \brief This class cannot be instantiated * */ Convert(const Convert& source) { } /** \brief This class cannot be instantiated * */ Convert& operator=(const Convert& source) { return *this; } /** \brief This class cannot be instantiated * */ virtual ~Convert() { } #ifdef HAVE_LIBBPP_SEQ static bpp::DNA dnaAlphabet; static bpp::RNA rnaAlphabet; static bpp::ProteicAlphabet proteicAlphabet; static bpp::StandardCodonAlphabet standardCodonAlphabet; static bpp::VertebrateMitochondrialCodonAlphabet vertebrateMitochondrialCodonAlphabet; static bpp::InvertebrateMitochondrialCodonAlphabet invertebrateMitochondrialCodonAlphabet; static bpp::EchinodermMitochondrialCodonAlphabet echinodermMitochondrialCodonAlphabet; #endif }; } #endif