urgi-team/teiso: comparison of TEisotools-1.1.a/commons/core/seq/AlignedBioseqDB.py @ 13:feef9a0db09d (draft)

Changeset description: Uploaded

| author | urgi-team |
|---|---|
| date | Wed, 20 Jul 2016 09:04:42 -0400 |
| parents | |
| children | |

Compared revisions: 12:22b0494ec883 vs 13:feef9a0db09d

```python
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


import sys
from commons.core.seq.BioseqDB import BioseqDB
from commons.core.seq.Bioseq import Bioseq
from commons.core.coord.Align import Align
from commons.core.coord.Range import Range
from commons.core.stat.Stat import Stat
from math import log


## Multiple Sequence Alignment Representation
#
#
class AlignedBioseqDB( BioseqDB ):

    def __init__( self, name="" ):
        BioseqDB.__init__( self, name )
        seqLength = self.getLength()
        if self.getSize() > 1:
            for bs in self.db[1:]:
                if bs.getLength() != seqLength:
                    print "ERROR: aligned sequences have different length"


    ## Get length of the alignment
    #
    # @return length
    # @warning name before migration was 'length'
    #
    def getLength( self ):
        length = 0
        if self.db != []:
            length = self.db[0].getLength()
        return length


    ## Get the true length of a given sequence (without gaps)
    #
    # @param header string header of the sequence to analyze
    # @return length integer
    # @warning name before migration was 'true_length'
    #
    def getSeqLengthWithoutGaps( self, header ):
        bs = self.fetch( header )
        count = 0
        for pos in xrange(0,len(bs.sequence)):
            if bs.sequence[pos] != "-":
                count += 1
        return count
```
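As a quick illustration of the two length methods, here is a minimal sketch (Python 2, with the `commons` package importable). The toy headers and sequences are invented; the `Bioseq` objects are filled by attribute assignment and registered in `db`/`idx` directly, mirroring the bookkeeping that `getConsensus()` and `cleanMSA()` perform in this file, so no loader method outside this listing is assumed.

```python
from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB
from commons.core.seq.Bioseq import Bioseq

# build a tiny two-sequence alignment by hand (toy data)
msa = AlignedBioseqDB()
for header, sequence in [("seq1", "ATG--CA"), ("seq2", "ATGTACA")]:
    bs = Bioseq()
    bs.header = header
    bs.sequence = sequence
    msa.db.append(bs)
    msa.idx[header] = len(msa.db) - 1

print(msa.getLength())                      # 7, alignment width (gaps included)
print(msa.getSeqLengthWithoutGaps("seq1"))  # 5, gap-free length of 'seq1'
```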
```python
    def cleanMSA( self ):
        #TODO: Refactoring
        """clean the MSA"""
        i2del = []

        # for each sequence in the MSA
        for seqi in xrange(0,self.getSize()):
            if seqi in i2del:
                continue
            #define it as the reference
            ref = self.db[seqi].sequence
            refHeader = self.db[seqi].header
            # for each following sequence
            for seq_next in xrange(seqi+1,self.getSize()):
                if seq_next in i2del:
                    continue
                keep = 0
                # for each position along the MSA
                for posx in xrange(0,self.getLength()):
                    seq = self.db[seq_next].sequence
                    if seq[posx] != '-' and ref[posx] != '-':
                        keep = 1
                        break
                seqHeader = self.db[seq_next].header
                # if the two sequences never share an ungapped column (no overlap),
                # record the one with the shorter gap-free length in "i2del"
                if keep == 0:

                    if self.getSeqLengthWithoutGaps(refHeader) < self.getSeqLengthWithoutGaps(seqHeader):
                        if seqi not in i2del:
                            i2del.append( seqi )
                    else:
                        if seq_next not in i2del:
                            i2del.append( seq_next )

        # delete from the MSA each seq present in the list "i2del"
        for i in reversed(sorted(set(i2del))):
            del self.db[i]

        self.idx = {}
        count = 0
        for i in self.db:
            self.idx[i.header] = count
            count += 1
```
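A hedged sketch of what `cleanMSA()` does: when two aligned sequences never share an ungapped column, the one with the shorter gap-free length is dropped and `idx` is rebuilt. Toy data only, built by direct attribute assignment as in the previous sketch.

```python
from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB
from commons.core.seq.Bioseq import Bioseq

msa = AlignedBioseqDB()
# 'long' and 'short' never overlap: every column has a gap in at least one of them
for header, sequence in [("long", "ATGC----"), ("short", "-----TGC")]:
    bs = Bioseq()
    bs.header = header
    bs.sequence = sequence
    msa.db.append(bs)
    msa.idx[header] = len(msa.db) - 1

msa.cleanMSA()
print([bs.header for bs in msa.db])  # ['long'] -- 'short' (3 nt) is removed, 'long' (4 nt) kept
```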
```python
    ## Record the occurrences of symbols (A, T, G, C, N, -, ...) at each site
    #
    # @return: list of dicts whose keys are symbols and values are their occurrences
    #
    def getListOccPerSite( self ):
        lOccPerSite = []  # list of dictionaries, one per position on the sequence
        n = 0             # nb of sequences parsed from the input file
        firstSeq = True

        # for each sequence in the bank
        for bs in self.db:
            if bs.sequence == None:
                break
            n += 1

            # if it is the first to be parsed, create a dict at each site
            if firstSeq:
                for i in xrange(0,len(bs.sequence)):
                    lOccPerSite.append( {} )
                firstSeq = False

            # for each site, add its nucleotide
            for i in xrange(0,len(bs.sequence)):
                nuc = bs.sequence[i].upper()
                if lOccPerSite[i].has_key( nuc ):
                    lOccPerSite[i][nuc] += 1
                else:
                    lOccPerSite[i][nuc] = 1

        return lOccPerSite
```
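A small, hypothetical example of the per-site dictionaries returned by `getListOccPerSite()`; note that symbols are upper-cased before counting.

```python
from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB
from commons.core.seq.Bioseq import Bioseq

msa = AlignedBioseqDB()
for header, sequence in [("s1", "A-G"), ("s2", "ATG"), ("s3", "aTG")]:
    bs = Bioseq()
    bs.header = header
    bs.sequence = sequence
    msa.db.append(bs)
    msa.idx[header] = len(msa.db) - 1

print(msa.getListOccPerSite())
# per-site counts: {'A': 3}, {'T': 2, '-': 1}, {'G': 3}  (key order may vary)
```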
```python
    #TODO: review minNbNt !!! It should be at least 2 nucleotides to build a consensus...
    ## Make a consensus from the MSA
    #
    # @param minNbNt: minimum nb of nucleotides to edit a consensus
    # @param minPropNt: minimum proportion for the major nucleotide to be used, otherwise add 'N' (default=0.0)
    # @param verbose: level of information sent to stdout (default=0/1)
    # @return: consensus
    #
    def getConsensus( self, minNbNt, minPropNt=0.0, verbose=0 , isHeaderSAtannot=False):

        maxPropN = 0.40  # discard consensus if more than 40% of N's

        nbInSeq = self.getSize()
        if verbose > 0:
            print "nb of aligned sequences: %i" % ( nbInSeq ); sys.stdout.flush()
        if nbInSeq < 2:
            print "ERROR: can't make a consensus with less than 2 sequences"
            sys.exit(1)
        if minNbNt >= nbInSeq:
            minNbNt = nbInSeq - 1
            print "minNbNt=%i" % ( minNbNt )
        if minPropNt >= 1.0:
            print "ERROR: minPropNt=%.2f should be a proportion (below 1.0)" % ( minPropNt )
            sys.exit(1)

        lOccPerSite = self.getListOccPerSite()
        nbSites = len(lOccPerSite)
        if verbose > 0:
            print "nb of sites: %i" % ( nbSites ); sys.stdout.flush()

        seqConsensus = ""

        # for each site (i.e. each column of the MSA)
        nbRmvColumns = 0
        countSites = 0
        for dNt2Occ in lOccPerSite:
            countSites += 1
            if verbose > 1:
                print "site %s / %i" % ( str(countSites).zfill( len(str(nbSites)) ),
                                         nbSites )
                sys.stdout.flush()
            occMaxNt = 0  # occurrences of the predominant nucleotide at this site
            lBestNt = []
            nbNt = 0      # total nb of A, T, G and C (no gap)

            # for each distinct symbol at this site (A, T, G, C, N, -,...)
            for j in dNt2Occ.keys():
                if j != "-":
                    nbNt += dNt2Occ[j]
                    if verbose > 1:
                        print "%s: %i" % ( j, dNt2Occ[j] )
                    if dNt2Occ[j] > occMaxNt:
                        occMaxNt = dNt2Occ[j]
                        lBestNt = [ j ]
                    elif dNt2Occ[j] == occMaxNt:
                        lBestNt.append( j )
            if nbNt == 0:  # some MSA programs can remove some sequences (e.g. Muscle after Recon) or when using Refalign (non-alignable TE fragments put together via a refseq)
                nbRmvColumns += 1

            if len( lBestNt ) >= 1:
                bestNt = lBestNt[0]

                # if the predominant nucleotide occurs in less than x% of the sequences, put a "N"
                if minPropNt > 0.0 and nbNt != 0 and float(occMaxNt)/float(nbNt) < minPropNt:
                    bestNt = "N"

                if int(nbNt) >= int(minNbNt):
                    seqConsensus += bestNt
                    if verbose > 1:
                        print "-> %s" % ( bestNt )

        if nbRmvColumns:
            if nbRmvColumns == 1:
                print "WARNING: 1 site was removed (%.2f%%)" % (nbRmvColumns / float(nbSites) * 100)
            else:
                print "WARNING: %i sites were removed (%.2f%%)" % ( nbRmvColumns, nbRmvColumns / float(nbSites) * 100 )
            sys.stdout.flush()
        if seqConsensus == "":
            print "WARNING: no consensus can be built (no sequence left)"
            return

        propN = seqConsensus.count("N") / float(len(seqConsensus))
        if propN >= maxPropN:
            print "WARNING: no consensus can be built (%i%% of N's >= %i%%)" % ( propN * 100, maxPropN * 100 )
            return
        elif propN >= maxPropN * 0.5:
            print "WARNING: %i%% of N's" % ( propN * 100 )

        consensus = Bioseq()
        consensus.sequence = seqConsensus
        if isHeaderSAtannot:
            header = self.db[0].header
            pyramid = header.split("Gr")[1].split("Cl")[0]
            pile = header.split("Cl")[1].split(" ")[0]
            consensus.header = "consensus=%s length=%i nbAlign=%i pile=%s pyramid=%s" % (self.name, len(seqConsensus), self.getSize(), pile, pyramid)
        else:
            consensus.header = "consensus=%s length=%i nbAlign=%i" % ( self.name, len(seqConsensus), self.getSize() )

        if verbose > 0:

            statEntropy = self.getEntropy( verbose - 1 )
            print "entropy: %s" % ( statEntropy.stringQuantiles() )
            sys.stdout.flush()

        return consensus
```
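A hedged usage sketch for `getConsensus()` on an invented toy alignment: with `minNbNt=2` and `minPropNt=0.0` the majority nucleotide is kept at every column, while raising `minPropNt` turns weakly supported columns into 'N'. The `name` attribute is set only so the consensus header has something to show.

```python
from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB
from commons.core.seq.Bioseq import Bioseq

msa = AlignedBioseqDB()
msa.name = "toyFam"
for header, sequence in [("s1", "ATGCA-"), ("s2", "ATGTAA"), ("s3", "AT-TAA")]:
    bs = Bioseq()
    bs.header = header
    bs.sequence = sequence
    msa.db.append(bs)
    msa.idx[header] = len(msa.db) - 1

cons = msa.getConsensus(minNbNt=2, minPropNt=0.0)
print(cons.header)    # consensus=toyFam length=6 nbAlign=3
print(cons.sequence)  # ATGTAA -- plain majority rule at every column

cons = msa.getConsensus(minNbNt=2, minPropNt=0.7)
print(cons.sequence)  # ATGNAA -- the C/T/T column (2/3 = 67%) falls below the 70% threshold
```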
```python
    ## Get the entropy of the whole multiple alignment (only for A, T, G and C)
    #
    # @param verbose level of verbosity
    #
    # @return statistics about the entropy of the MSA
    #
    def getEntropy( self, verbose=0 ):

        stats = Stat()

        # get the occurrences of symbols at each site
        lOccPerSite = self.getListOccPerSite()

        countSite = 0

        # for each site
        for dSymbol2Occ in lOccPerSite:
            countSite += 1

            # count the number of nucleotides (A, T, G and C, doesn't count gap '-')
            nbNt = 0
            dATGC2Occ = {}
            for base in ["A","T","G","C"]:
                dATGC2Occ[ base ] = 0.0
            for nt in dSymbol2Occ.keys():
                if nt != "-":
                    nbNt += dSymbol2Occ[ nt ]
                    checkedNt = self.getATGCNFromIUPAC( nt )
                    if checkedNt in ["A","T","G","C"] and dSymbol2Occ.has_key( checkedNt ):
                        dATGC2Occ[ checkedNt ] += 1 * dSymbol2Occ[ checkedNt ]
                    else:  # for 'N'
                        if dSymbol2Occ.has_key( checkedNt ):
                            dATGC2Occ[ "A" ] += 0.25 * dSymbol2Occ[ checkedNt ]
                            dATGC2Occ[ "T" ] += 0.25 * dSymbol2Occ[ checkedNt ]
                            dATGC2Occ[ "G" ] += 0.25 * dSymbol2Occ[ checkedNt ]
                            dATGC2Occ[ "C" ] += 0.25 * dSymbol2Occ[ checkedNt ]
            if verbose > 2:
                for base in dATGC2Occ.keys():
                    print "%s: %i" % ( base, dATGC2Occ[ base ] )

            # compute the entropy for the site
            entropySite = 0.0
            for nt in dATGC2Occ.keys():
                entropySite += self.computeEntropy( dATGC2Occ[ nt ], nbNt )
            if verbose > 1:
                print "site %i (%i nt): entropy = %.3f" % ( countSite, nbNt, entropySite )
            stats.add( entropySite )

        return stats


    ## Get A, T, G, C or N from an IUPAC letter
    # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']
    #
    # @return A, T, G, C or N
    #
    def getATGCNFromIUPAC( self, nt ):
        iBs = Bioseq()
        return iBs.getATGCNFromIUPAC( nt )


    ## Compute the entropy based on the occurrences of a certain nucleotide and the total number of nucleotides
    #
    def computeEntropy( self, nbOcc, nbNt ):
        if nbOcc == 0.0:
            return 0.0
        else:
            freq = nbOcc / float(nbNt)
            return - freq * log(freq) / log(2)
```
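The per-site entropy is the sum over A, T, G and C of `-f * log2(f)`, where `f` is the nucleotide's frequency among the non-gap symbols of the column. A worked example with invented counts:

```python
from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB

msa = AlignedBioseqDB()
# a column with 3 A's and 1 G among 4 non-gap nucleotides:
term_A = msa.computeEntropy(3.0, 4)  # -0.75 * log2(0.75) ~= 0.311
term_G = msa.computeEntropy(1.0, 4)  # -0.25 * log2(0.25)  = 0.500
print(term_A + term_G)               # ~0.811 bits for this site
# a perfectly conserved column contributes 0 bits; an even 2/2 split contributes 1 bit
```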
```python
    ## Save the multiple alignment as a matrix with '0' if gap, '1' otherwise
    #
    def saveAsBinaryMatrix( self, outFile ):
        outFileHandler = open( outFile, "w" )
        for bs in self.db:
            string = "%s" % ( bs.header )
            for nt in bs.sequence:
                if nt != "-":
                    string += "\t%i" % ( 1 )
                else:
                    string += "\t%i" % ( 0 )
            outFileHandler.write( "%s\n" % ( string ) )
        outFileHandler.close()
```
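A sketch of the matrix layout written by `saveAsBinaryMatrix()`: one row per sequence, the header first, then one tab-separated 0/1 flag per alignment column. The output path and toy sequences are hypothetical.

```python
from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB
from commons.core.seq.Bioseq import Bioseq

msa = AlignedBioseqDB()
for header, sequence in [("s1", "AT-G"), ("s2", "A--G")]:
    bs = Bioseq()
    bs.header = header
    bs.sequence = sequence
    msa.db.append(bs)
    msa.idx[header] = len(msa.db) - 1

msa.saveAsBinaryMatrix("toy_matrix.tsv")
print(open("toy_matrix.tsv").read())
# s1    1    1    0    1
# s2    1    0    0    1
```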
```python
    ## Return a list of Align instances corresponding to the aligned regions (without gaps)
    #
    # @param query string header of the sequence considered as query
    # @param subject string header of the sequence considered as subject
    #
    def getAlignList( self, query, subject ):
        lAligns = []
        alignQ = self.fetch( query ).sequence
        alignS = self.fetch( subject ).sequence
        createNewAlign = True
        indexAlign = 0
        indexQ = 0
        indexS = 0
        while indexAlign < len(alignQ):
            if alignQ[ indexAlign ] != "-" and alignS[ indexAlign ] != "-":
                indexQ += 1
                indexS += 1
                if createNewAlign:
                    iAlign = Align( Range( query, indexQ, indexQ ),
                                    Range( subject, indexS, indexS ),
                                    0,
                                    int( alignQ[ indexAlign ] == alignS[ indexAlign ] ),
                                    int( alignQ[ indexAlign ] == alignS[ indexAlign ] ) )
                    lAligns.append( iAlign )
                    createNewAlign = False
                else:
                    lAligns[-1].range_query.end += 1
                    lAligns[-1].range_subject.end += 1
                    lAligns[-1].score += int( alignQ[ indexAlign ] == alignS[ indexAlign ] )
                    lAligns[-1].identity += int( alignQ[ indexAlign ] == alignS[ indexAlign ] )
            else:
                if not createNewAlign:
                    lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()
                    createNewAlign = True
                if alignQ[ indexAlign ] != "-":
                    indexQ += 1
                elif alignS[ indexAlign ] != "-":
                    indexS += 1
            indexAlign += 1
        if not createNewAlign:
            lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()
        return lAligns
```
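A hedged sketch for `getAlignList()` on two invented aligned sequences: each ungapped block becomes an `Align` whose identity is converted to a percentage of the block's query length when the block closes. Printing `range_query.start` and `range_subject.start` assumes `Range` exposes a `start` attribute alongside the `end` used above.

```python
from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB
from commons.core.seq.Bioseq import Bioseq

msa = AlignedBioseqDB()
for header, sequence in [("qry", "ATG--CAT"), ("sbj", "ATGTT-AA")]:
    bs = Bioseq()
    bs.header = header
    bs.sequence = sequence
    msa.db.append(bs)
    msa.idx[header] = len(msa.db) - 1

for a in msa.getAlignList("qry", "sbj"):
    print(a.range_query.start, a.range_query.end,
          a.range_subject.start, a.range_subject.end,
          a.score, a.identity)
# (1, 3, 1, 3, 3, 100) -- 'ATG' vs 'ATG': 3 matching columns, 100% identity
# (5, 6, 6, 7, 1, 50)  -- 'AT' vs 'AA': 1 match out of 2 columns
```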
```python
    def removeGaps(self):
        for iBs in self.db:
            iBs.removeSymbol( "-" )

    ## Compute mean per cent identity for MSA.
    # First sequence in MSA is considered as reference sequence.
    #
    #
    def computeMeanPcentIdentity(self):
        seqRef = self.db[0]
        sumPcentIdentity = 0

        for seq in self.db[1:]:
            pcentIdentity = self._computePcentIdentityBetweenSeqRefAndCurrentSeq(seqRef, seq)
            sumPcentIdentity = sumPcentIdentity + pcentIdentity

        nbSeq = len(self.db[1:])
        meanPcentIdentity = round (sumPcentIdentity/nbSeq)

        return meanPcentIdentity

    def _computePcentIdentityBetweenSeqRefAndCurrentSeq(self, seqRef, seq):
        indexOnSeqRef = 0
        sumIdentity = 0
        for nuclSeq in seq.sequence:
            nuclRef = seqRef.sequence[indexOnSeqRef]

            if nuclRef != "-" and nuclRef == nuclSeq:
                sumIdentity = sumIdentity + 1
            indexOnSeqRef = indexOnSeqRef + 1

        return float(sumIdentity) / float(seqRef.getLength()) * 100
```
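Finally, a sketch for `computeMeanPcentIdentity()` on invented sequences: the first sequence is the reference, and each identity is divided by the reference length including its gap columns, which is worth keeping in mind when reading the percentages.

```python
from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB
from commons.core.seq.Bioseq import Bioseq

msa = AlignedBioseqDB()
for header, sequence in [("ref", "ATGC-A"), ("s1", "ATGTTA"), ("s2", "ATGCTA")]:
    bs = Bioseq()
    bs.header = header
    bs.sequence = sequence
    msa.db.append(bs)
    msa.idx[header] = len(msa.db) - 1

print(msa.computeMeanPcentIdentity())
# 75.0 -- mean of 4/6*100 (s1) and 5/6*100 (s2), rounded; the gap column counts
# in the denominator because the reference length includes it
```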
