comparison commons/core/seq/AlignedBioseqDB.py @ 36:44d5973c188c

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 15:02:29 -0400
parents
children
comparison
equal deleted inserted replaced
35:d94018ca4ada 36:44d5973c188c
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import sys
33 from commons.core.seq.BioseqDB import BioseqDB
34 from commons.core.seq.Bioseq import Bioseq
35 from commons.core.coord.Align import Align
36 from commons.core.coord.Range import Range
37 from commons.core.stat.Stat import Stat
38 from math import log
39
40
41 ## Multiple Sequence Alignment Representation
42 #
43 #
44 class AlignedBioseqDB( BioseqDB ):
45
46 def __init__( self, name="" ):
47 BioseqDB.__init__( self, name )
48 seqLength = self.getLength()
49 if self.getSize() > 1:
50 for bs in self.db[1:]:
51 if bs.getLength() != seqLength:
52 print "ERROR: aligned sequences have different length"
53
54
55 ## Get length of the alignment
56 #
57 # @return length
58 # @warning name before migration was 'length'
59 #
60 def getLength( self ):
61 length = 0
62 if self.db != []:
63 length = self.db[0].getLength()
64 return length
65
66
67 ## Get the true length of a given sequence (without gaps)
68 #
69 # @param header string header of the sequence to analyze
70 # @return length integer
71 # @warning name before migration was 'true_length'
72 #
73 def getSeqLengthWithoutGaps( self, header ):
74 bs = self.fetch( header )
75 count = 0
76 for pos in xrange(0,len(bs.sequence)):
77 if bs.sequence[pos] != "-":
78 count += 1
79 return count
80
81 def cleanMSA( self ):
82 #TODO: Refactoring
83 """clean the MSA"""
84 i2del = []
85
86 # for each sequence in the MSA
87 for seqi in xrange(0,self.getSize()):
88 if seqi in i2del:
89 continue
90 #define it as the reference
91 ref = self.db[seqi].sequence
92 refHeader = self.db[seqi].header
93 # for each following sequence
94 for seq_next in xrange(seqi+1,self.getSize()):
95 if seq_next in i2del:
96 continue
97 keep = 0
98 # for each position along the MSA
99 for posx in xrange(0,self.getLength()):
100 seq = self.db[seq_next].sequence
101 if seq[posx] != '-' and ref[posx] != '-':
102 keep = 1
103 break
104 seqHeader = self.db[seq_next].header
105 # if there is at least one gap between the ref seq and the other seq
106 # keep track of the shortest by recording it in "i2del"
107 if keep == 0:
108
109 if self.getSeqLengthWithoutGaps(refHeader) < self.getSeqLengthWithoutGaps(seqHeader):
110 if seqi not in i2del:
111 i2del.append( seqi )
112 else:
113 if seq_next not in i2del:
114 i2del.append( seq_next )
115
116 # delete from the MSA each seq present in the list "i2del"
117 for i in reversed(sorted(set(i2del))):
118 del self.db[i]
119
120 self.idx = {}
121 count = 0
122 for i in self.db:
123 self.idx[i.header] = count
124 count += 1
125
126 ## Record the occurrences of symbols (A, T, G, C, N, -, ...) at each site
127 #
128 # @return: list of dico whose keys are symbols and values are their occurrences
129 #
130 def getListOccPerSite( self ):
131 lOccPerSite = [] # list of dictionaries, one per position on the sequence
132 n = 0 # nb of sequences parsed from the input file
133 firstSeq = True
134
135 # for each sequence in the bank
136 for bs in self.db:
137 if bs.sequence == None:
138 break
139 n += 1
140
141 # if it is the first to be parsed, create a dico at each site
142 if firstSeq:
143 for i in xrange(0,len(bs.sequence)):
144 lOccPerSite.append( {} )
145 firstSeq = False
146
147 # for each site, add its nucleotide
148 for i in xrange(0,len(bs.sequence)):
149 nuc = bs.sequence[i].upper()
150 if lOccPerSite[i].has_key( nuc ):
151 lOccPerSite[i][nuc] += 1
152 else:
153 lOccPerSite[i][nuc] = 1
154
155 return lOccPerSite
156
157 #TODO: review minNbNt !!! It should be at least 2 nucleotides to build a consensus...
158 ## Make a consensus from the MSA
159 #
160 # @param minNbNt: minimum nb of nucleotides to edit a consensus
161 # @param minPropNt: minimum proportion for the major nucleotide to be used, otherwise add 'N' (default=0.0)
162 # @param verbose: level of information sent to stdout (default=0/1)
163 # @return: consensus
164 #
165 def getConsensus( self, minNbNt, minPropNt=0.0, verbose=0 , isHeaderSAtannot=False):
166
167 maxPropN = 0.40 # discard consensus if more than 40% of N's
168
169 nbInSeq = self.getSize()
170 if verbose > 0:
171 print "nb of aligned sequences: %i" % ( nbInSeq ); sys.stdout.flush()
172 if nbInSeq < 2:
173 print "ERROR: can't make a consensus with less than 2 sequences"
174 sys.exit(1)
175 if minNbNt >= nbInSeq:
176 minNbNt = nbInSeq - 1
177 print "minNbNt=%i" % ( minNbNt )
178 if minPropNt >= 1.0:
179 print "ERROR: minPropNt=%.2f should be a proportion (below 1.0)" % ( minPropNt )
180 sys.exit(1)
181
182 lOccPerSite = self.getListOccPerSite()
183 nbSites = len(lOccPerSite)
184 if verbose > 0:
185 print "nb of sites: %i" % ( nbSites ); sys.stdout.flush()
186
187 seqConsensus = ""
188
189 # for each site (i.e. each column of the MSA)
190 nbRmvColumns = 0
191 countSites = 0
192 for dNt2Occ in lOccPerSite:
193 countSites += 1
194 if verbose > 1:
195 print "site %s / %i" % ( str(countSites).zfill( len(str(nbSites)) ),
196 nbSites )
197 sys.stdout.flush()
198 occMaxNt = 0 # occurrences of the predominant nucleotide at this site
199 lBestNt = []
200 nbNt = 0 # total nb of A, T, G and C (no gap)
201
202 # for each distinct symbol at this site (A, T, G, C, N, -,...)
203 for j in dNt2Occ.keys():
204 if j != "-":
205 nbNt += dNt2Occ[j]
206 if verbose > 1:
207 print "%s: %i" % ( j, dNt2Occ[j] )
208 if dNt2Occ[j] > occMaxNt:
209 occMaxNt = dNt2Occ[j]
210 lBestNt = [ j ]
211 elif dNt2Occ[j] == occMaxNt:
212 lBestNt.append( j )
213 if nbNt == 0: # some MSA programs can remove some sequences (e.g. Muscle after Recon) or when using Refalign (non-alignable TE fragments put together via a refseq)
214 nbRmvColumns += 1
215
216 if len( lBestNt ) >= 1:
217 bestNt = lBestNt[0]
218
219 # if the predominant nucleotide occurs in less than x% of the sequences, put a "N"
220 if minPropNt > 0.0 and nbNt != 0 and float(occMaxNt)/float(nbNt) < minPropNt:
221 bestNt = "N"
222
223 if int(nbNt) >= int(minNbNt):
224 seqConsensus += bestNt
225 if verbose > 1:
226 print "-> %s" % ( bestNt )
227
228 if nbRmvColumns:
229 if nbRmvColumns == 1:
230 print "WARNING: 1 site was removed (%.2f%%)" % (nbRmvColumns / float(nbSites) * 100)
231 else:
232 print "WARNING: %i sites were removed (%.2f%%)" % ( nbRmvColumns, nbRmvColumns / float(nbSites) * 100 )
233 sys.stdout.flush()
234 if seqConsensus == "":
235 print "WARNING: no consensus can be built (no sequence left)"
236 return
237
238 propN = seqConsensus.count("N") / float(len(seqConsensus))
239 if propN >= maxPropN:
240 print "WARNING: no consensus can be built (%i%% of N's >= %i%%)" % ( propN * 100, maxPropN * 100 )
241 return
242 elif propN >= maxPropN * 0.5:
243 print "WARNING: %i%% of N's" % ( propN * 100 )
244
245 consensus = Bioseq()
246 consensus.sequence = seqConsensus
247 if isHeaderSAtannot:
248 header = self.db[0].header
249 pyramid = header.split("Gr")[1].split("Cl")[0]
250 pile = header.split("Cl")[1].split(" ")[0]
251 consensus.header = "consensus=%s length=%i nbAlign=%i pile=%s pyramid=%s" % (self.name, len(seqConsensus), self.getSize(), pile, pyramid)
252 else:
253 consensus.header = "consensus=%s length=%i nbAlign=%i" % ( self.name, len(seqConsensus), self.getSize() )
254
255 if verbose > 0:
256
257 statEntropy = self.getEntropy( verbose - 1 )
258 print "entropy: %s" % ( statEntropy.stringQuantiles() )
259 sys.stdout.flush()
260
261 return consensus
262
263
264 ## Get the entropy of the whole multiple alignment (only for A, T, G and C)
265 #
266 # @param verbose level of verbosity
267 #
268 # @return statistics about the entropy of the MSA
269 #
270 def getEntropy( self, verbose=0 ):
271
272 stats = Stat()
273
274 # get the occurrences of symbols at each site
275 lOccPerSite = self.getListOccPerSite()
276
277 countSite = 0
278
279 # for each site
280 for dSymbol2Occ in lOccPerSite:
281 countSite += 1
282
283 # count the number of nucleotides (A, T, G and C, doesn't count gap '-')
284 nbNt = 0
285 dATGC2Occ = {}
286 for base in ["A","T","G","C"]:
287 dATGC2Occ[ base ] = 0.0
288 for nt in dSymbol2Occ.keys():
289 if nt != "-":
290 nbNt += dSymbol2Occ[ nt ]
291 checkedNt = self.getATGCNFromIUPAC( nt )
292 if checkedNt in ["A","T","G","C"] and dSymbol2Occ.has_key( checkedNt ):
293 dATGC2Occ[ checkedNt ] += 1 * dSymbol2Occ[ checkedNt ]
294 else: # for 'N'
295 if dSymbol2Occ.has_key( checkedNt ):
296 dATGC2Occ[ "A" ] += 0.25 * dSymbol2Occ[ checkedNt ]
297 dATGC2Occ[ "T" ] += 0.25 * dSymbol2Occ[ checkedNt ]
298 dATGC2Occ[ "G" ] += 0.25 * dSymbol2Occ[ checkedNt ]
299 dATGC2Occ[ "C" ] += 0.25 * dSymbol2Occ[ checkedNt ]
300 if verbose > 2:
301 for base in dATGC2Occ.keys():
302 print "%s: %i" % ( base, dATGC2Occ[ base ] )
303
304 # compute the entropy for the site
305 entropySite = 0.0
306 for nt in dATGC2Occ.keys():
307 entropySite += self.computeEntropy( dATGC2Occ[ nt ], nbNt )
308 if verbose > 1:
309 print "site %i (%i nt): entropy = %.3f" % ( countSite, nbNt, entropySite )
310 stats.add( entropySite )
311
312 return stats
313
314
315 ## Get A, T, G, C or N from an IUPAC letter
316 # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']
317 #
318 # @return A, T, G, C or N
319 #
320 def getATGCNFromIUPAC( self, nt ):
321 iBs = Bioseq()
322 return iBs.getATGCNFromIUPAC( nt )
323
324
325 ## Compute the entropy based on the occurrences of a certain nucleotide and the total number of nucleotides
326 #
327 def computeEntropy( self, nbOcc, nbNt ):
328 if nbOcc == 0.0:
329 return 0.0
330 else:
331 freq = nbOcc / float(nbNt)
332 return - freq * log(freq) / log(2)
333
334
335 ## Save the multiple alignment as a matrix with '0' if gap, '1' otherwise
336 #
337 def saveAsBinaryMatrix( self, outFile ):
338 outFileHandler = open( outFile, "w" )
339 for bs in self.db:
340 string = "%s" % ( bs.header )
341 for nt in bs.sequence:
342 if nt != "-":
343 string += "\t%i" % ( 1 )
344 else:
345 string += "\t%i" % ( 0 )
346 outFileHandler.write( "%s\n" % ( string ) )
347 outFileHandler.close()
348
349
350 ## Return a list of Align instances corresponding to the aligned regions (without gaps)
351 #
352 # @param query string header of the sequence considered as query
353 # @param subject string header of the sequence considered as subject
354 #
355 def getAlignList( self, query, subject ):
356 lAligns = []
357 alignQ = self.fetch( query ).sequence
358 alignS = self.fetch( subject ).sequence
359 createNewAlign = True
360 indexAlign = 0
361 indexQ = 0
362 indexS = 0
363 while indexAlign < len(alignQ):
364 if alignQ[ indexAlign ] != "-" and alignS[ indexAlign ] != "-":
365 indexQ += 1
366 indexS += 1
367 if createNewAlign:
368 iAlign = Align( Range( query, indexQ, indexQ ),
369 Range( subject, indexS, indexS ),
370 0,
371 int( alignQ[ indexAlign ] == alignS[ indexAlign ] ),
372 int( alignQ[ indexAlign ] == alignS[ indexAlign ] ) )
373 lAligns.append( iAlign )
374 createNewAlign = False
375 else:
376 lAligns[-1].range_query.end += 1
377 lAligns[-1].range_subject.end += 1
378 lAligns[-1].score += int( alignQ[ indexAlign ] == alignS[ indexAlign ] )
379 lAligns[-1].identity += int( alignQ[ indexAlign ] == alignS[ indexAlign ] )
380 else:
381 if not createNewAlign:
382 lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()
383 createNewAlign = True
384 if alignQ[ indexAlign ] != "-":
385 indexQ += 1
386 elif alignS[ indexAlign ] != "-":
387 indexS += 1
388 indexAlign += 1
389 if not createNewAlign:
390 lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()
391 return lAligns
392
393
394 def removeGaps(self):
395 for iBs in self.db:
396 iBs.removeSymbol( "-" )
397
398 ## Compute mean per cent identity for MSA.
399 # First sequence in MSA is considered as reference sequence.
400 #
401 #
402 def computeMeanPcentIdentity(self):
403 seqRef = self.db[0]
404 sumPcentIdentity = 0
405
406 for seq in self.db[1:]:
407 pcentIdentity = self._computePcentIdentityBetweenSeqRefAndCurrentSeq(seqRef, seq)
408 sumPcentIdentity = sumPcentIdentity + pcentIdentity
409
410 nbSeq = len(self.db[1:])
411 meanPcentIdentity = round (sumPcentIdentity/nbSeq)
412
413 return meanPcentIdentity
414
415 def _computePcentIdentityBetweenSeqRefAndCurrentSeq(self, seqRef, seq):
416 indexOnSeqRef = 0
417 sumIdentity = 0
418 for nuclSeq in seq.sequence:
419 nuclRef = seqRef.sequence[indexOnSeqRef]
420
421 if nuclRef != "-" and nuclRef == nuclSeq:
422 sumIdentity = sumIdentity + 1
423 indexOnSeqRef = indexOnSeqRef + 1
424
425 return float(sumIdentity) / float(seqRef.getLength()) * 100
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440