comparison commons/core/seq/AlignedBioseqDB.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children 94ab73e8a190
comparison
equal deleted inserted replaced
5:ea3082881bf8 6:769e306b7933
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import sys
33 from commons.core.seq.BioseqDB import BioseqDB
34 from commons.core.seq.Bioseq import Bioseq
35 from commons.core.coord.Align import Align
36 from commons.core.coord.Range import Range
37 from commons.core.stat.Stat import Stat
38 from math import log
39
40
41 ## Multiple Sequence Alignment Representation
42 #
43 #
44 class AlignedBioseqDB( BioseqDB ):
45
46 def __init__( self, name="" ):
47 BioseqDB.__init__( self, name )
48 seqLength = self.getLength()
49 if self.getSize() > 1:
50 for bs in self.db[1:]:
51 if bs.getLength() != seqLength:
52 print "ERROR: aligned sequences have different length"
53
54
55 ## Get length of the alignment
56 #
57 # @return length
58 # @warning name before migration was 'length'
59 #
60 def getLength( self ):
61 length = 0
62 if self.db != []:
63 length = self.db[0].getLength()
64 return length
65
66
67 ## Get the true length of a given sequence (without gaps)
68 #
69 # @param header string header of the sequence to analyze
70 # @return length integer
71 # @warning name before migration was 'true_length'
72 #
73 def getSeqLengthWithoutGaps( self, header ):
74 bs = self.fetch( header )
75 count = 0
76 for pos in xrange(0,len(bs.sequence)):
77 if bs.sequence[pos] != "-":
78 count += 1
79 return count
80
81
82 ## Record the occurrences of symbols (A, T, G, C, N, -, ...) at each site
83 #
84 # @return: list of dico whose keys are symbols and values are their occurrences
85 #
86 def getListOccPerSite( self ):
87 lOccPerSite = [] # list of dictionaries, one per position on the sequence
88 n = 0 # nb of sequences parsed from the input file
89 firstSeq = True
90
91 # for each sequence in the bank
92 for bs in self.db:
93 if bs.sequence == None:
94 break
95 n += 1
96
97 # if it is the first to be parsed, create a dico at each site
98 if firstSeq:
99 for i in xrange(0,len(bs.sequence)):
100 lOccPerSite.append( {} )
101 firstSeq = False
102
103 # for each site, add its nucleotide
104 for i in xrange(0,len(bs.sequence)):
105 nuc = bs.sequence[i].upper()
106 if lOccPerSite[i].has_key( nuc ):
107 lOccPerSite[i][nuc] += 1
108 else:
109 lOccPerSite[i][nuc] = 1
110
111 return lOccPerSite
112
113 #TODO: review minNbNt !!! It should be at least 2 nucleotides to build a consensus...
114 ## Make a consensus from the MSA
115 #
116 # @param minNbNt: minimum nb of nucleotides to edit a consensus
117 # @param minPropNt: minimum proportion for the major nucleotide to be used, otherwise add 'N' (default=0.0)
118 # @param verbose: level of information sent to stdout (default=0/1)
119 # @return: consensus
120 #
121 def getConsensus( self, minNbNt, minPropNt=0.0, verbose=0 , isHeaderSAtannot=False):
122
123 maxPropN = 0.40 # discard consensus if more than 40% of N's
124
125 nbInSeq = self.getSize()
126 if verbose > 0:
127 print "nb of aligned sequences: %i" % ( nbInSeq ); sys.stdout.flush()
128 if nbInSeq < 2:
129 print "ERROR: can't make a consensus with less than 2 sequences"
130 sys.exit(1)
131 if minNbNt >= nbInSeq:
132 minNbNt = nbInSeq - 1
133 print "minNbNt=%i" % ( minNbNt )
134 if minPropNt >= 1.0:
135 print "ERROR: minPropNt=%.2f should be a proportion (below 1.0)" % ( minPropNt )
136 sys.exit(1)
137
138 lOccPerSite = self.getListOccPerSite()
139 nbSites = len(lOccPerSite)
140 if verbose > 0:
141 print "nb of sites: %i" % ( nbSites ); sys.stdout.flush()
142
143 seqConsensus = ""
144
145 # for each site (i.e. each column of the MSA)
146 nbRmvColumns = 0
147 countSites = 0
148 for dNt2Occ in lOccPerSite:
149 countSites += 1
150 if verbose > 1:
151 print "site %s / %i" % ( str(countSites).zfill( len(str(nbSites)) ),
152 nbSites )
153 sys.stdout.flush()
154 occMaxNt = 0 # occurrences of the predominant nucleotide at this site
155 lBestNt = []
156 nbNt = 0 # total nb of A, T, G and C (no gap)
157
158 # for each distinct symbol at this site (A, T, G, C, N, -,...)
159 for j in dNt2Occ.keys():
160 if j != "-":
161 nbNt += dNt2Occ[j]
162 if verbose > 1:
163 print "%s: %i" % ( j, dNt2Occ[j] )
164 if dNt2Occ[j] > occMaxNt:
165 occMaxNt = dNt2Occ[j]
166 lBestNt = [ j ]
167 elif dNt2Occ[j] == occMaxNt:
168 lBestNt.append( j )
169 if nbNt == 0: # some MSA programs can remove some sequences (e.g. Muscle after Recon) or when using Refalign (non-alignable TE fragments put together via a refseq)
170 nbRmvColumns += 1
171
172 if len( lBestNt ) >= 1:
173 bestNt = lBestNt[0]
174
175 # if the predominant nucleotide occurs in less than x% of the sequences, put a "N"
176 if minPropNt > 0.0 and nbNt != 0 and float(occMaxNt)/float(nbNt) < minPropNt:
177 bestNt = "N"
178
179 if int(nbNt) >= int(minNbNt):
180 seqConsensus += bestNt
181 if verbose > 1:
182 print "-> %s" % ( bestNt )
183
184 if nbRmvColumns:
185 if nbRmvColumns == 1:
186 print "WARNING: 1 site was removed (%.2f%%)" % (nbRmvColumns / float(nbSites) * 100)
187 else:
188 print "WARNING: %i sites were removed (%.2f%%)" % ( nbRmvColumns, nbRmvColumns / float(nbSites) * 100 )
189 sys.stdout.flush()
190 if seqConsensus == "":
191 print "WARNING: no consensus can be built (no sequence left)"
192 return
193
194 propN = seqConsensus.count("N") / float(len(seqConsensus))
195 if propN >= maxPropN:
196 print "WARNING: no consensus can be built (%i%% of N's >= %i%%)" % ( propN * 100, maxPropN * 100 )
197 return
198 elif propN >= maxPropN * 0.5:
199 print "WARNING: %i%% of N's" % ( propN * 100 )
200
201 consensus = Bioseq()
202 consensus.sequence = seqConsensus
203 if isHeaderSAtannot:
204 header = self.db[0].header
205 pyramid = header.split("Gr")[1].split("Cl")[0]
206 pile = header.split("Cl")[1].split(" ")[0]
207 consensus.header = "consensus=%s length=%i nbAlign=%i pile=%s pyramid=%s" % (self.name, len(seqConsensus), self.getSize(), pile, pyramid)
208 else:
209 consensus.header = "consensus=%s length=%i nbAlign=%i" % ( self.name, len(seqConsensus), self.getSize() )
210
211 if verbose > 0:
212
213 statEntropy = self.getEntropy( verbose - 1 )
214 print "entropy: %s" % ( statEntropy.stringQuantiles() )
215 sys.stdout.flush()
216
217 return consensus
218
219
220 ## Get the entropy of the whole multiple alignment (only for A, T, G and C)
221 #
222 # @param verbose level of verbosity
223 #
224 # @return statistics about the entropy of the MSA
225 #
226 def getEntropy( self, verbose=0 ):
227
228 stats = Stat()
229
230 # get the occurrences of symbols at each site
231 lOccPerSite = self.getListOccPerSite()
232
233 countSite = 0
234
235 # for each site
236 for dSymbol2Occ in lOccPerSite:
237 countSite += 1
238
239 # count the number of nucleotides (A, T, G and C, doesn't count gap '-')
240 nbNt = 0
241 dATGC2Occ = {}
242 for base in ["A","T","G","C"]:
243 dATGC2Occ[ base ] = 0.0
244 for nt in dSymbol2Occ.keys():
245 if nt != "-":
246 nbNt += dSymbol2Occ[ nt ]
247 checkedNt = self.getATGCNFromIUPAC( nt )
248 if checkedNt in ["A","T","G","C"] and dSymbol2Occ.has_key( checkedNt ):
249 dATGC2Occ[ checkedNt ] += 1 * dSymbol2Occ[ checkedNt ]
250 else: # for 'N'
251 if dSymbol2Occ.has_key( checkedNt ):
252 dATGC2Occ[ "A" ] += 0.25 * dSymbol2Occ[ checkedNt ]
253 dATGC2Occ[ "T" ] += 0.25 * dSymbol2Occ[ checkedNt ]
254 dATGC2Occ[ "G" ] += 0.25 * dSymbol2Occ[ checkedNt ]
255 dATGC2Occ[ "C" ] += 0.25 * dSymbol2Occ[ checkedNt ]
256 if verbose > 2:
257 for base in dATGC2Occ.keys():
258 print "%s: %i" % ( base, dATGC2Occ[ base ] )
259
260 # compute the entropy for the site
261 entropySite = 0.0
262 for nt in dATGC2Occ.keys():
263 entropySite += self.computeEntropy( dATGC2Occ[ nt ], nbNt )
264 if verbose > 1:
265 print "site %i (%i nt): entropy = %.3f" % ( countSite, nbNt, entropySite )
266 stats.add( entropySite )
267
268 return stats
269
270
271 ## Get A, T, G, C or N from an IUPAC letter
272 # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']
273 #
274 # @return A, T, G, C or N
275 #
276 def getATGCNFromIUPAC( self, nt ):
277 iBs = Bioseq()
278 return iBs.getATGCNFromIUPAC( nt )
279
280
281 ## Compute the entropy based on the occurrences of a certain nucleotide and the total number of nucleotides
282 #
283 def computeEntropy( self, nbOcc, nbNt ):
284 if nbOcc == 0.0:
285 return 0.0
286 else:
287 freq = nbOcc / float(nbNt)
288 return - freq * log(freq) / log(2)
289
290
291 ## Save the multiple alignment as a matrix with '0' if gap, '1' otherwise
292 #
293 def saveAsBinaryMatrix( self, outFile ):
294 outFileHandler = open( outFile, "w" )
295 for bs in self.db:
296 string = "%s" % ( bs.header )
297 for nt in bs.sequence:
298 if nt != "-":
299 string += "\t%i" % ( 1 )
300 else:
301 string += "\t%i" % ( 0 )
302 outFileHandler.write( "%s\n" % ( string ) )
303 outFileHandler.close()
304
305
306 ## Return a list of Align instances corresponding to the aligned regions (without gaps)
307 #
308 # @param query string header of the sequence considered as query
309 # @param subject string header of the sequence considered as subject
310 #
311 def getAlignList( self, query, subject ):
312 lAligns = []
313 alignQ = self.fetch( query ).sequence
314 alignS = self.fetch( subject ).sequence
315 createNewAlign = True
316 indexAlign = 0
317 indexQ = 0
318 indexS = 0
319 while indexAlign < len(alignQ):
320 if alignQ[ indexAlign ] != "-" and alignS[ indexAlign ] != "-":
321 indexQ += 1
322 indexS += 1
323 if createNewAlign:
324 iAlign = Align( Range( query, indexQ, indexQ ),
325 Range( subject, indexS, indexS ),
326 0,
327 int( alignQ[ indexAlign ] == alignS[ indexAlign ] ),
328 int( alignQ[ indexAlign ] == alignS[ indexAlign ] ) )
329 lAligns.append( iAlign )
330 createNewAlign = False
331 else:
332 lAligns[-1].range_query.end += 1
333 lAligns[-1].range_subject.end += 1
334 lAligns[-1].score += int( alignQ[ indexAlign ] == alignS[ indexAlign ] )
335 lAligns[-1].identity += int( alignQ[ indexAlign ] == alignS[ indexAlign ] )
336 else:
337 if not createNewAlign:
338 lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()
339 createNewAlign = True
340 if alignQ[ indexAlign ] != "-":
341 indexQ += 1
342 elif alignS[ indexAlign ] != "-":
343 indexS += 1
344 indexAlign += 1
345 if not createNewAlign:
346 lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()
347 return lAligns
348
349
350 def removeGaps(self):
351 for iBs in self.db:
352 iBs.removeSymbol( "-" )
353
354 ## Compute mean per cent identity for MSA.
355 # First sequence in MSA is considered as reference sequence.
356 #
357 #
358 def computeMeanPcentIdentity(self):
359 seqRef = self.db[0]
360 sumPcentIdentity = 0
361
362 for seq in self.db[1:]:
363 pcentIdentity = self._computePcentIdentityBetweenSeqRefAndCurrentSeq(seqRef, seq)
364 sumPcentIdentity = sumPcentIdentity + pcentIdentity
365
366 nbSeq = len(self.db[1:])
367 meanPcentIdentity = round (sumPcentIdentity/nbSeq)
368
369 return meanPcentIdentity
370
371 def _computePcentIdentityBetweenSeqRefAndCurrentSeq(self, seqRef, seq):
372 indexOnSeqRef = 0
373 sumIdentity = 0
374 for nuclSeq in seq.sequence:
375 nuclRef = seqRef.sequence[indexOnSeqRef]
376
377 if nuclRef != "-" and nuclRef == nuclSeq:
378 sumIdentity = sumIdentity + 1
379 indexOnSeqRef = indexOnSeqRef + 1
380
381 return float(sumIdentity) / float(seqRef.getLength()) * 100
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396