36
+ − 1 # Copyright INRA (Institut National de la Recherche Agronomique)
+ − 2 # http://www.inra.fr
+ − 3 # http://urgi.versailles.inra.fr
+ − 4 #
+ − 5 # This software is governed by the CeCILL license under French law and
+ − 6 # abiding by the rules of distribution of free software. You can use,
+ − 7 # modify and/ or redistribute the software under the terms of the CeCILL
+ − 8 # license as circulated by CEA, CNRS and INRIA at the following URL
+ − 9 # "http://www.cecill.info".
+ − 10 #
+ − 11 # As a counterpart to the access to the source code and rights to copy,
+ − 12 # modify and redistribute granted by the license, users are provided only
+ − 13 # with a limited warranty and the software's author, the holder of the
+ − 14 # economic rights, and the successive licensors have only limited
+ − 15 # liability.
+ − 16 #
+ − 17 # In this respect, the user's attention is drawn to the risks associated
+ − 18 # with loading, using, modifying and/or developing or reproducing the
+ − 19 # software by the user in light of its specific status of free software,
+ − 20 # that may mean that it is complicated to manipulate, and that also
+ − 21 # therefore means that it is reserved for developers and experienced
+ − 22 # professionals having in-depth computer knowledge. Users are therefore
+ − 23 # encouraged to load and test the software's suitability as regards their
+ − 24 # requirements in conditions enabling the security of their systems and/or
+ − 25 # data to be ensured and, more generally, to use and operate it in the
+ − 26 # same conditions as regards security.
+ − 27 #
+ − 28 # The fact that you are presently reading this means that you have had
+ − 29 # knowledge of the CeCILL license and that you accept its terms.
+ − 30
+ − 31
+ − 32 import sys
+ − 33 from commons.core.seq.BioseqDB import BioseqDB
+ − 34 from commons.core.seq.Bioseq import Bioseq
+ − 35 from commons.core.coord.Align import Align
+ − 36 from commons.core.coord.Range import Range
+ − 37 from commons.core.stat.Stat import Stat
+ − 38 from math import log
+ − 39
+ − 40
+ − 41 ## Multiple Sequence Alignment Representation
+ − 42 #
+ − 43 #
+ − 44 class AlignedBioseqDB( BioseqDB ):
+ − 45
+ − 46 def __init__( self, name="" ):
+ − 47 BioseqDB.__init__( self, name )
+ − 48 seqLength = self.getLength()
+ − 49 if self.getSize() > 1:
+ − 50 for bs in self.db[1:]:
+ − 51 if bs.getLength() != seqLength:
+ − 52 print "ERROR: aligned sequences have different length"
+ − 53
+ − 54
+ − 55 ## Get length of the alignment
+ − 56 #
+ − 57 # @return length
+ − 58 # @warning name before migration was 'length'
+ − 59 #
+ − 60 def getLength( self ):
+ − 61 length = 0
+ − 62 if self.db != []:
+ − 63 length = self.db[0].getLength()
+ − 64 return length
+ − 65
+ − 66
+ − 67 ## Get the true length of a given sequence (without gaps)
+ − 68 #
+ − 69 # @param header string header of the sequence to analyze
+ − 70 # @return length integer
+ − 71 # @warning name before migration was 'true_length'
+ − 72 #
+ − 73 def getSeqLengthWithoutGaps( self, header ):
+ − 74 bs = self.fetch( header )
+ − 75 count = 0
+ − 76 for pos in xrange(0,len(bs.sequence)):
+ − 77 if bs.sequence[pos] != "-":
+ − 78 count += 1
+ − 79 return count
+ − 80
+ − 81 def cleanMSA( self ):
+ − 82 #TODO: Refactoring
+ − 83 """clean the MSA"""
+ − 84 i2del = []
+ − 85
+ − 86 # for each sequence in the MSA
+ − 87 for seqi in xrange(0,self.getSize()):
+ − 88 if seqi in i2del:
+ − 89 continue
+ − 90 #define it as the reference
+ − 91 ref = self.db[seqi].sequence
+ − 92 refHeader = self.db[seqi].header
+ − 93 # for each following sequence
+ − 94 for seq_next in xrange(seqi+1,self.getSize()):
+ − 95 if seq_next in i2del:
+ − 96 continue
+ − 97 keep = 0
+ − 98 # for each position along the MSA
+ − 99 for posx in xrange(0,self.getLength()):
+ − 100 seq = self.db[seq_next].sequence
+ − 101 if seq[posx] != '-' and ref[posx] != '-':
+ − 102 keep = 1
+ − 103 break
+ − 104 seqHeader = self.db[seq_next].header
+ − 105 # if there is at least one gap between the ref seq and the other seq
+ − 106 # keep track of the shortest by recording it in "i2del"
+ − 107 if keep == 0:
+ − 108
+ − 109 if self.getSeqLengthWithoutGaps(refHeader) < self.getSeqLengthWithoutGaps(seqHeader):
+ − 110 if seqi not in i2del:
+ − 111 i2del.append( seqi )
+ − 112 else:
+ − 113 if seq_next not in i2del:
+ − 114 i2del.append( seq_next )
+ − 115
+ − 116 # delete from the MSA each seq present in the list "i2del"
+ − 117 for i in reversed(sorted(set(i2del))):
+ − 118 del self.db[i]
+ − 119
+ − 120 self.idx = {}
+ − 121 count = 0
+ − 122 for i in self.db:
+ − 123 self.idx[i.header] = count
+ − 124 count += 1
+ − 125
+ − 126 ## Record the occurrences of symbols (A, T, G, C, N, -, ...) at each site
+ − 127 #
+ − 128 # @return: list of dico whose keys are symbols and values are their occurrences
+ − 129 #
+ − 130 def getListOccPerSite( self ):
+ − 131 lOccPerSite = [] # list of dictionaries, one per position on the sequence
+ − 132 n = 0 # nb of sequences parsed from the input file
+ − 133 firstSeq = True
+ − 134
+ − 135 # for each sequence in the bank
+ − 136 for bs in self.db:
+ − 137 if bs.sequence == None:
+ − 138 break
+ − 139 n += 1
+ − 140
+ − 141 # if it is the first to be parsed, create a dico at each site
+ − 142 if firstSeq:
+ − 143 for i in xrange(0,len(bs.sequence)):
+ − 144 lOccPerSite.append( {} )
+ − 145 firstSeq = False
+ − 146
+ − 147 # for each site, add its nucleotide
+ − 148 for i in xrange(0,len(bs.sequence)):
+ − 149 nuc = bs.sequence[i].upper()
+ − 150 if lOccPerSite[i].has_key( nuc ):
+ − 151 lOccPerSite[i][nuc] += 1
+ − 152 else:
+ − 153 lOccPerSite[i][nuc] = 1
+ − 154
+ − 155 return lOccPerSite
+ − 156
+ − 157 #TODO: review minNbNt !!! It should be at least 2 nucleotides to build a consensus...
+ − 158 ## Make a consensus from the MSA
+ − 159 #
+ − 160 # @param minNbNt: minimum nb of nucleotides to edit a consensus
+ − 161 # @param minPropNt: minimum proportion for the major nucleotide to be used, otherwise add 'N' (default=0.0)
+ − 162 # @param verbose: level of information sent to stdout (default=0/1)
+ − 163 # @return: consensus
+ − 164 #
+ − 165 def getConsensus( self, minNbNt, minPropNt=0.0, verbose=0 , isHeaderSAtannot=False):
+ − 166
+ − 167 maxPropN = 0.40 # discard consensus if more than 40% of N's
+ − 168
+ − 169 nbInSeq = self.getSize()
+ − 170 if verbose > 0:
+ − 171 print "nb of aligned sequences: %i" % ( nbInSeq ); sys.stdout.flush()
+ − 172 if nbInSeq < 2:
+ − 173 print "ERROR: can't make a consensus with less than 2 sequences"
+ − 174 sys.exit(1)
+ − 175 if minNbNt >= nbInSeq:
+ − 176 minNbNt = nbInSeq - 1
+ − 177 print "minNbNt=%i" % ( minNbNt )
+ − 178 if minPropNt >= 1.0:
+ − 179 print "ERROR: minPropNt=%.2f should be a proportion (below 1.0)" % ( minPropNt )
+ − 180 sys.exit(1)
+ − 181
+ − 182 lOccPerSite = self.getListOccPerSite()
+ − 183 nbSites = len(lOccPerSite)
+ − 184 if verbose > 0:
+ − 185 print "nb of sites: %i" % ( nbSites ); sys.stdout.flush()
+ − 186
+ − 187 seqConsensus = ""
+ − 188
+ − 189 # for each site (i.e. each column of the MSA)
+ − 190 nbRmvColumns = 0
+ − 191 countSites = 0
+ − 192 for dNt2Occ in lOccPerSite:
+ − 193 countSites += 1
+ − 194 if verbose > 1:
+ − 195 print "site %s / %i" % ( str(countSites).zfill( len(str(nbSites)) ),
+ − 196 nbSites )
+ − 197 sys.stdout.flush()
+ − 198 occMaxNt = 0 # occurrences of the predominant nucleotide at this site
+ − 199 lBestNt = []
+ − 200 nbNt = 0 # total nb of A, T, G and C (no gap)
+ − 201
+ − 202 # for each distinct symbol at this site (A, T, G, C, N, -,...)
+ − 203 for j in dNt2Occ.keys():
+ − 204 if j != "-":
+ − 205 nbNt += dNt2Occ[j]
+ − 206 if verbose > 1:
+ − 207 print "%s: %i" % ( j, dNt2Occ[j] )
+ − 208 if dNt2Occ[j] > occMaxNt:
+ − 209 occMaxNt = dNt2Occ[j]
+ − 210 lBestNt = [ j ]
+ − 211 elif dNt2Occ[j] == occMaxNt:
+ − 212 lBestNt.append( j )
+ − 213 if nbNt == 0: # some MSA programs can remove some sequences (e.g. Muscle after Recon) or when using Refalign (non-alignable TE fragments put together via a refseq)
+ − 214 nbRmvColumns += 1
+ − 215
+ − 216 if len( lBestNt ) >= 1:
+ − 217 bestNt = lBestNt[0]
+ − 218
+ − 219 # if the predominant nucleotide occurs in less than x% of the sequences, put a "N"
+ − 220 if minPropNt > 0.0 and nbNt != 0 and float(occMaxNt)/float(nbNt) < minPropNt:
+ − 221 bestNt = "N"
+ − 222
+ − 223 if int(nbNt) >= int(minNbNt):
+ − 224 seqConsensus += bestNt
+ − 225 if verbose > 1:
+ − 226 print "-> %s" % ( bestNt )
+ − 227
+ − 228 if nbRmvColumns:
+ − 229 if nbRmvColumns == 1:
+ − 230 print "WARNING: 1 site was removed (%.2f%%)" % (nbRmvColumns / float(nbSites) * 100)
+ − 231 else:
+ − 232 print "WARNING: %i sites were removed (%.2f%%)" % ( nbRmvColumns, nbRmvColumns / float(nbSites) * 100 )
+ − 233 sys.stdout.flush()
+ − 234 if seqConsensus == "":
+ − 235 print "WARNING: no consensus can be built (no sequence left)"
+ − 236 return
+ − 237
+ − 238 propN = seqConsensus.count("N") / float(len(seqConsensus))
+ − 239 if propN >= maxPropN:
+ − 240 print "WARNING: no consensus can be built (%i%% of N's >= %i%%)" % ( propN * 100, maxPropN * 100 )
+ − 241 return
+ − 242 elif propN >= maxPropN * 0.5:
+ − 243 print "WARNING: %i%% of N's" % ( propN * 100 )
+ − 244
+ − 245 consensus = Bioseq()
+ − 246 consensus.sequence = seqConsensus
+ − 247 if isHeaderSAtannot:
+ − 248 header = self.db[0].header
+ − 249 pyramid = header.split("Gr")[1].split("Cl")[0]
+ − 250 pile = header.split("Cl")[1].split(" ")[0]
+ − 251 consensus.header = "consensus=%s length=%i nbAlign=%i pile=%s pyramid=%s" % (self.name, len(seqConsensus), self.getSize(), pile, pyramid)
+ − 252 else:
+ − 253 consensus.header = "consensus=%s length=%i nbAlign=%i" % ( self.name, len(seqConsensus), self.getSize() )
+ − 254
+ − 255 if verbose > 0:
+ − 256
+ − 257 statEntropy = self.getEntropy( verbose - 1 )
+ − 258 print "entropy: %s" % ( statEntropy.stringQuantiles() )
+ − 259 sys.stdout.flush()
+ − 260
+ − 261 return consensus
+ − 262
+ − 263
+ − 264 ## Get the entropy of the whole multiple alignment (only for A, T, G and C)
+ − 265 #
+ − 266 # @param verbose level of verbosity
+ − 267 #
+ − 268 # @return statistics about the entropy of the MSA
+ − 269 #
+ − 270 def getEntropy( self, verbose=0 ):
+ − 271
+ − 272 stats = Stat()
+ − 273
+ − 274 # get the occurrences of symbols at each site
+ − 275 lOccPerSite = self.getListOccPerSite()
+ − 276
+ − 277 countSite = 0
+ − 278
+ − 279 # for each site
+ − 280 for dSymbol2Occ in lOccPerSite:
+ − 281 countSite += 1
+ − 282
+ − 283 # count the number of nucleotides (A, T, G and C, doesn't count gap '-')
+ − 284 nbNt = 0
+ − 285 dATGC2Occ = {}
+ − 286 for base in ["A","T","G","C"]:
+ − 287 dATGC2Occ[ base ] = 0.0
+ − 288 for nt in dSymbol2Occ.keys():
+ − 289 if nt != "-":
+ − 290 nbNt += dSymbol2Occ[ nt ]
+ − 291 checkedNt = self.getATGCNFromIUPAC( nt )
+ − 292 if checkedNt in ["A","T","G","C"] and dSymbol2Occ.has_key( checkedNt ):
+ − 293 dATGC2Occ[ checkedNt ] += 1 * dSymbol2Occ[ checkedNt ]
+ − 294 else: # for 'N'
+ − 295 if dSymbol2Occ.has_key( checkedNt ):
+ − 296 dATGC2Occ[ "A" ] += 0.25 * dSymbol2Occ[ checkedNt ]
+ − 297 dATGC2Occ[ "T" ] += 0.25 * dSymbol2Occ[ checkedNt ]
+ − 298 dATGC2Occ[ "G" ] += 0.25 * dSymbol2Occ[ checkedNt ]
+ − 299 dATGC2Occ[ "C" ] += 0.25 * dSymbol2Occ[ checkedNt ]
+ − 300 if verbose > 2:
+ − 301 for base in dATGC2Occ.keys():
+ − 302 print "%s: %i" % ( base, dATGC2Occ[ base ] )
+ − 303
+ − 304 # compute the entropy for the site
+ − 305 entropySite = 0.0
+ − 306 for nt in dATGC2Occ.keys():
+ − 307 entropySite += self.computeEntropy( dATGC2Occ[ nt ], nbNt )
+ − 308 if verbose > 1:
+ − 309 print "site %i (%i nt): entropy = %.3f" % ( countSite, nbNt, entropySite )
+ − 310 stats.add( entropySite )
+ − 311
+ − 312 return stats
+ − 313
+ − 314
+ − 315 ## Get A, T, G, C or N from an IUPAC letter
+ − 316 # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']
+ − 317 #
+ − 318 # @return A, T, G, C or N
+ − 319 #
+ − 320 def getATGCNFromIUPAC( self, nt ):
+ − 321 iBs = Bioseq()
+ − 322 return iBs.getATGCNFromIUPAC( nt )
+ − 323
+ − 324
+ − 325 ## Compute the entropy based on the occurrences of a certain nucleotide and the total number of nucleotides
+ − 326 #
+ − 327 def computeEntropy( self, nbOcc, nbNt ):
+ − 328 if nbOcc == 0.0:
+ − 329 return 0.0
+ − 330 else:
+ − 331 freq = nbOcc / float(nbNt)
+ − 332 return - freq * log(freq) / log(2)
+ − 333
+ − 334
+ − 335 ## Save the multiple alignment as a matrix with '0' if gap, '1' otherwise
+ − 336 #
+ − 337 def saveAsBinaryMatrix( self, outFile ):
+ − 338 outFileHandler = open( outFile, "w" )
+ − 339 for bs in self.db:
+ − 340 string = "%s" % ( bs.header )
+ − 341 for nt in bs.sequence:
+ − 342 if nt != "-":
+ − 343 string += "\t%i" % ( 1 )
+ − 344 else:
+ − 345 string += "\t%i" % ( 0 )
+ − 346 outFileHandler.write( "%s\n" % ( string ) )
+ − 347 outFileHandler.close()
+ − 348
+ − 349
+ − 350 ## Return a list of Align instances corresponding to the aligned regions (without gaps)
+ − 351 #
+ − 352 # @param query string header of the sequence considered as query
+ − 353 # @param subject string header of the sequence considered as subject
+ − 354 #
+ − 355 def getAlignList( self, query, subject ):
+ − 356 lAligns = []
+ − 357 alignQ = self.fetch( query ).sequence
+ − 358 alignS = self.fetch( subject ).sequence
+ − 359 createNewAlign = True
+ − 360 indexAlign = 0
+ − 361 indexQ = 0
+ − 362 indexS = 0
+ − 363 while indexAlign < len(alignQ):
+ − 364 if alignQ[ indexAlign ] != "-" and alignS[ indexAlign ] != "-":
+ − 365 indexQ += 1
+ − 366 indexS += 1
+ − 367 if createNewAlign:
+ − 368 iAlign = Align( Range( query, indexQ, indexQ ),
+ − 369 Range( subject, indexS, indexS ),
+ − 370 0,
+ − 371 int( alignQ[ indexAlign ] == alignS[ indexAlign ] ),
+ − 372 int( alignQ[ indexAlign ] == alignS[ indexAlign ] ) )
+ − 373 lAligns.append( iAlign )
+ − 374 createNewAlign = False
+ − 375 else:
+ − 376 lAligns[-1].range_query.end += 1
+ − 377 lAligns[-1].range_subject.end += 1
+ − 378 lAligns[-1].score += int( alignQ[ indexAlign ] == alignS[ indexAlign ] )
+ − 379 lAligns[-1].identity += int( alignQ[ indexAlign ] == alignS[ indexAlign ] )
+ − 380 else:
+ − 381 if not createNewAlign:
+ − 382 lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()
+ − 383 createNewAlign = True
+ − 384 if alignQ[ indexAlign ] != "-":
+ − 385 indexQ += 1
+ − 386 elif alignS[ indexAlign ] != "-":
+ − 387 indexS += 1
+ − 388 indexAlign += 1
+ − 389 if not createNewAlign:
+ − 390 lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()
+ − 391 return lAligns
+ − 392
+ − 393
+ − 394 def removeGaps(self):
+ − 395 for iBs in self.db:
+ − 396 iBs.removeSymbol( "-" )
+ − 397
+ − 398 ## Compute mean per cent identity for MSA.
+ − 399 # First sequence in MSA is considered as reference sequence.
+ − 400 #
+ − 401 #
+ − 402 def computeMeanPcentIdentity(self):
+ − 403 seqRef = self.db[0]
+ − 404 sumPcentIdentity = 0
+ − 405
+ − 406 for seq in self.db[1:]:
+ − 407 pcentIdentity = self._computePcentIdentityBetweenSeqRefAndCurrentSeq(seqRef, seq)
+ − 408 sumPcentIdentity = sumPcentIdentity + pcentIdentity
+ − 409
+ − 410 nbSeq = len(self.db[1:])
+ − 411 meanPcentIdentity = round (sumPcentIdentity/nbSeq)
+ − 412
+ − 413 return meanPcentIdentity
+ − 414
+ − 415 def _computePcentIdentityBetweenSeqRefAndCurrentSeq(self, seqRef, seq):
+ − 416 indexOnSeqRef = 0
+ − 417 sumIdentity = 0
+ − 418 for nuclSeq in seq.sequence:
+ − 419 nuclRef = seqRef.sequence[indexOnSeqRef]
+ − 420
+ − 421 if nuclRef != "-" and nuclRef == nuclSeq:
+ − 422 sumIdentity = sumIdentity + 1
+ − 423 indexOnSeqRef = indexOnSeqRef + 1
+ − 424
+ − 425 return float(sumIdentity) / float(seqRef.getLength()) * 100
+ − 426
+ − 427
+ − 428
+ − 429
+ − 430
+ − 431
+ − 432
+ − 433
+ − 434
+ − 435
+ − 436
+ − 437
+ − 438
+ − 439
+ − 440