6
|
1 # Copyright INRA (Institut National de la Recherche Agronomique)
|
|
2 # http://www.inra.fr
|
|
3 # http://urgi.versailles.inra.fr
|
|
4 #
|
|
5 # This software is governed by the CeCILL license under French law and
|
|
6 # abiding by the rules of distribution of free software. You can use,
|
|
7 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
8 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
9 # "http://www.cecill.info".
|
|
10 #
|
|
11 # As a counterpart to the access to the source code and rights to copy,
|
|
12 # modify and redistribute granted by the license, users are provided only
|
|
13 # with a limited warranty and the software's author, the holder of the
|
|
14 # economic rights, and the successive licensors have only limited
|
|
15 # liability.
|
|
16 #
|
|
17 # In this respect, the user's attention is drawn to the risks associated
|
|
18 # with loading, using, modifying and/or developing or reproducing the
|
|
19 # software by the user in light of its specific status of free software,
|
|
20 # that may mean that it is complicated to manipulate, and that also
|
|
21 # therefore means that it is reserved for developers and experienced
|
|
22 # professionals having in-depth computer knowledge. Users are therefore
|
|
23 # encouraged to load and test the software's suitability as regards their
|
|
24 # requirements in conditions enabling the security of their systems and/or
|
|
25 # data to be ensured and, more generally, to use and operate it in the
|
|
26 # same conditions as regards security.
|
|
27 #
|
|
28 # The fact that you are presently reading this means that you have had
|
|
29 # knowledge of the CeCILL license and that you accept its terms.
|
|
30
|
|
31
|
|
32 import sys
|
|
33 from commons.core.seq.BioseqDB import BioseqDB
|
|
34 from commons.core.seq.Bioseq import Bioseq
|
|
35 from commons.core.coord.Align import Align
|
|
36 from commons.core.coord.Range import Range
|
|
37 from commons.core.stat.Stat import Stat
|
|
38 from math import log
|
|
39
|
|
40
|
|
41 ## Multiple Sequence Alignment Representation
|
|
42 #
|
|
43 #
|
|
44 class AlignedBioseqDB( BioseqDB ):
|
|
45
|
|
46 def __init__( self, name="" ):
|
|
47 BioseqDB.__init__( self, name )
|
|
48 seqLength = self.getLength()
|
|
49 if self.getSize() > 1:
|
|
50 for bs in self.db[1:]:
|
|
51 if bs.getLength() != seqLength:
|
|
52 print "ERROR: aligned sequences have different length"
|
|
53
|
|
54
|
|
55 ## Get length of the alignment
|
|
56 #
|
|
57 # @return length
|
|
58 # @warning name before migration was 'length'
|
|
59 #
|
|
60 def getLength( self ):
|
|
61 length = 0
|
|
62 if self.db != []:
|
|
63 length = self.db[0].getLength()
|
|
64 return length
|
|
65
|
|
66
|
|
67 ## Get the true length of a given sequence (without gaps)
|
|
68 #
|
|
69 # @param header string header of the sequence to analyze
|
|
70 # @return length integer
|
|
71 # @warning name before migration was 'true_length'
|
|
72 #
|
|
73 def getSeqLengthWithoutGaps( self, header ):
|
|
74 bs = self.fetch( header )
|
|
75 count = 0
|
|
76 for pos in xrange(0,len(bs.sequence)):
|
|
77 if bs.sequence[pos] != "-":
|
|
78 count += 1
|
|
79 return count
|
|
80
|
|
81
|
|
82 ## Record the occurrences of symbols (A, T, G, C, N, -, ...) at each site
|
|
83 #
|
|
84 # @return: list of dico whose keys are symbols and values are their occurrences
|
|
85 #
|
|
86 def getListOccPerSite( self ):
|
|
87 lOccPerSite = [] # list of dictionaries, one per position on the sequence
|
|
88 n = 0 # nb of sequences parsed from the input file
|
|
89 firstSeq = True
|
|
90
|
|
91 # for each sequence in the bank
|
|
92 for bs in self.db:
|
|
93 if bs.sequence == None:
|
|
94 break
|
|
95 n += 1
|
|
96
|
|
97 # if it is the first to be parsed, create a dico at each site
|
|
98 if firstSeq:
|
|
99 for i in xrange(0,len(bs.sequence)):
|
|
100 lOccPerSite.append( {} )
|
|
101 firstSeq = False
|
|
102
|
|
103 # for each site, add its nucleotide
|
|
104 for i in xrange(0,len(bs.sequence)):
|
|
105 nuc = bs.sequence[i].upper()
|
|
106 if lOccPerSite[i].has_key( nuc ):
|
|
107 lOccPerSite[i][nuc] += 1
|
|
108 else:
|
|
109 lOccPerSite[i][nuc] = 1
|
|
110
|
|
111 return lOccPerSite
|
|
112
|
|
113 #TODO: review minNbNt !!! It should be at least 2 nucleotides to build a consensus...
|
|
114 ## Make a consensus from the MSA
|
|
115 #
|
|
116 # @param minNbNt: minimum nb of nucleotides to edit a consensus
|
|
117 # @param minPropNt: minimum proportion for the major nucleotide to be used, otherwise add 'N' (default=0.0)
|
|
118 # @param verbose: level of information sent to stdout (default=0/1)
|
|
119 # @return: consensus
|
|
120 #
|
|
121 def getConsensus( self, minNbNt, minPropNt=0.0, verbose=0 , isHeaderSAtannot=False):
|
|
122
|
|
123 maxPropN = 0.40 # discard consensus if more than 40% of N's
|
|
124
|
|
125 nbInSeq = self.getSize()
|
|
126 if verbose > 0:
|
|
127 print "nb of aligned sequences: %i" % ( nbInSeq ); sys.stdout.flush()
|
|
128 if nbInSeq < 2:
|
|
129 print "ERROR: can't make a consensus with less than 2 sequences"
|
|
130 sys.exit(1)
|
|
131 if minNbNt >= nbInSeq:
|
|
132 minNbNt = nbInSeq - 1
|
|
133 print "minNbNt=%i" % ( minNbNt )
|
|
134 if minPropNt >= 1.0:
|
|
135 print "ERROR: minPropNt=%.2f should be a proportion (below 1.0)" % ( minPropNt )
|
|
136 sys.exit(1)
|
|
137
|
|
138 lOccPerSite = self.getListOccPerSite()
|
|
139 nbSites = len(lOccPerSite)
|
|
140 if verbose > 0:
|
|
141 print "nb of sites: %i" % ( nbSites ); sys.stdout.flush()
|
|
142
|
|
143 seqConsensus = ""
|
|
144
|
|
145 # for each site (i.e. each column of the MSA)
|
|
146 nbRmvColumns = 0
|
|
147 countSites = 0
|
|
148 for dNt2Occ in lOccPerSite:
|
|
149 countSites += 1
|
|
150 if verbose > 1:
|
|
151 print "site %s / %i" % ( str(countSites).zfill( len(str(nbSites)) ),
|
|
152 nbSites )
|
|
153 sys.stdout.flush()
|
|
154 occMaxNt = 0 # occurrences of the predominant nucleotide at this site
|
|
155 lBestNt = []
|
|
156 nbNt = 0 # total nb of A, T, G and C (no gap)
|
|
157
|
|
158 # for each distinct symbol at this site (A, T, G, C, N, -,...)
|
|
159 for j in dNt2Occ.keys():
|
|
160 if j != "-":
|
|
161 nbNt += dNt2Occ[j]
|
|
162 if verbose > 1:
|
|
163 print "%s: %i" % ( j, dNt2Occ[j] )
|
|
164 if dNt2Occ[j] > occMaxNt:
|
|
165 occMaxNt = dNt2Occ[j]
|
|
166 lBestNt = [ j ]
|
|
167 elif dNt2Occ[j] == occMaxNt:
|
|
168 lBestNt.append( j )
|
|
169 if nbNt == 0: # some MSA programs can remove some sequences (e.g. Muscle after Recon) or when using Refalign (non-alignable TE fragments put together via a refseq)
|
|
170 nbRmvColumns += 1
|
|
171
|
|
172 if len( lBestNt ) >= 1:
|
|
173 bestNt = lBestNt[0]
|
|
174
|
|
175 # if the predominant nucleotide occurs in less than x% of the sequences, put a "N"
|
|
176 if minPropNt > 0.0 and nbNt != 0 and float(occMaxNt)/float(nbNt) < minPropNt:
|
|
177 bestNt = "N"
|
|
178
|
|
179 if int(nbNt) >= int(minNbNt):
|
|
180 seqConsensus += bestNt
|
|
181 if verbose > 1:
|
|
182 print "-> %s" % ( bestNt )
|
|
183
|
|
184 if nbRmvColumns:
|
|
185 if nbRmvColumns == 1:
|
|
186 print "WARNING: 1 site was removed (%.2f%%)" % (nbRmvColumns / float(nbSites) * 100)
|
|
187 else:
|
|
188 print "WARNING: %i sites were removed (%.2f%%)" % ( nbRmvColumns, nbRmvColumns / float(nbSites) * 100 )
|
|
189 sys.stdout.flush()
|
|
190 if seqConsensus == "":
|
|
191 print "WARNING: no consensus can be built (no sequence left)"
|
|
192 return
|
|
193
|
|
194 propN = seqConsensus.count("N") / float(len(seqConsensus))
|
|
195 if propN >= maxPropN:
|
|
196 print "WARNING: no consensus can be built (%i%% of N's >= %i%%)" % ( propN * 100, maxPropN * 100 )
|
|
197 return
|
|
198 elif propN >= maxPropN * 0.5:
|
|
199 print "WARNING: %i%% of N's" % ( propN * 100 )
|
|
200
|
|
201 consensus = Bioseq()
|
|
202 consensus.sequence = seqConsensus
|
|
203 if isHeaderSAtannot:
|
|
204 header = self.db[0].header
|
|
205 pyramid = header.split("Gr")[1].split("Cl")[0]
|
|
206 pile = header.split("Cl")[1].split(" ")[0]
|
|
207 consensus.header = "consensus=%s length=%i nbAlign=%i pile=%s pyramid=%s" % (self.name, len(seqConsensus), self.getSize(), pile, pyramid)
|
|
208 else:
|
|
209 consensus.header = "consensus=%s length=%i nbAlign=%i" % ( self.name, len(seqConsensus), self.getSize() )
|
|
210
|
|
211 if verbose > 0:
|
|
212
|
|
213 statEntropy = self.getEntropy( verbose - 1 )
|
|
214 print "entropy: %s" % ( statEntropy.stringQuantiles() )
|
|
215 sys.stdout.flush()
|
|
216
|
|
217 return consensus
|
|
218
|
|
219
|
|
220 ## Get the entropy of the whole multiple alignment (only for A, T, G and C)
|
|
221 #
|
|
222 # @param verbose level of verbosity
|
|
223 #
|
|
224 # @return statistics about the entropy of the MSA
|
|
225 #
|
|
226 def getEntropy( self, verbose=0 ):
|
|
227
|
|
228 stats = Stat()
|
|
229
|
|
230 # get the occurrences of symbols at each site
|
|
231 lOccPerSite = self.getListOccPerSite()
|
|
232
|
|
233 countSite = 0
|
|
234
|
|
235 # for each site
|
|
236 for dSymbol2Occ in lOccPerSite:
|
|
237 countSite += 1
|
|
238
|
|
239 # count the number of nucleotides (A, T, G and C, doesn't count gap '-')
|
|
240 nbNt = 0
|
|
241 dATGC2Occ = {}
|
|
242 for base in ["A","T","G","C"]:
|
|
243 dATGC2Occ[ base ] = 0.0
|
|
244 for nt in dSymbol2Occ.keys():
|
|
245 if nt != "-":
|
|
246 nbNt += dSymbol2Occ[ nt ]
|
|
247 checkedNt = self.getATGCNFromIUPAC( nt )
|
|
248 if checkedNt in ["A","T","G","C"] and dSymbol2Occ.has_key( checkedNt ):
|
|
249 dATGC2Occ[ checkedNt ] += 1 * dSymbol2Occ[ checkedNt ]
|
|
250 else: # for 'N'
|
|
251 if dSymbol2Occ.has_key( checkedNt ):
|
|
252 dATGC2Occ[ "A" ] += 0.25 * dSymbol2Occ[ checkedNt ]
|
|
253 dATGC2Occ[ "T" ] += 0.25 * dSymbol2Occ[ checkedNt ]
|
|
254 dATGC2Occ[ "G" ] += 0.25 * dSymbol2Occ[ checkedNt ]
|
|
255 dATGC2Occ[ "C" ] += 0.25 * dSymbol2Occ[ checkedNt ]
|
|
256 if verbose > 2:
|
|
257 for base in dATGC2Occ.keys():
|
|
258 print "%s: %i" % ( base, dATGC2Occ[ base ] )
|
|
259
|
|
260 # compute the entropy for the site
|
|
261 entropySite = 0.0
|
|
262 for nt in dATGC2Occ.keys():
|
|
263 entropySite += self.computeEntropy( dATGC2Occ[ nt ], nbNt )
|
|
264 if verbose > 1:
|
|
265 print "site %i (%i nt): entropy = %.3f" % ( countSite, nbNt, entropySite )
|
|
266 stats.add( entropySite )
|
|
267
|
|
268 return stats
|
|
269
|
|
270
|
|
271 ## Get A, T, G, C or N from an IUPAC letter
|
|
272 # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']
|
|
273 #
|
|
274 # @return A, T, G, C or N
|
|
275 #
|
|
276 def getATGCNFromIUPAC( self, nt ):
|
|
277 iBs = Bioseq()
|
|
278 return iBs.getATGCNFromIUPAC( nt )
|
|
279
|
|
280
|
|
281 ## Compute the entropy based on the occurrences of a certain nucleotide and the total number of nucleotides
|
|
282 #
|
|
283 def computeEntropy( self, nbOcc, nbNt ):
|
|
284 if nbOcc == 0.0:
|
|
285 return 0.0
|
|
286 else:
|
|
287 freq = nbOcc / float(nbNt)
|
|
288 return - freq * log(freq) / log(2)
|
|
289
|
|
290
|
|
291 ## Save the multiple alignment as a matrix with '0' if gap, '1' otherwise
|
|
292 #
|
|
293 def saveAsBinaryMatrix( self, outFile ):
|
|
294 outFileHandler = open( outFile, "w" )
|
|
295 for bs in self.db:
|
|
296 string = "%s" % ( bs.header )
|
|
297 for nt in bs.sequence:
|
|
298 if nt != "-":
|
|
299 string += "\t%i" % ( 1 )
|
|
300 else:
|
|
301 string += "\t%i" % ( 0 )
|
|
302 outFileHandler.write( "%s\n" % ( string ) )
|
|
303 outFileHandler.close()
|
|
304
|
|
305
|
|
306 ## Return a list of Align instances corresponding to the aligned regions (without gaps)
|
|
307 #
|
|
308 # @param query string header of the sequence considered as query
|
|
309 # @param subject string header of the sequence considered as subject
|
|
310 #
|
|
311 def getAlignList( self, query, subject ):
|
|
312 lAligns = []
|
|
313 alignQ = self.fetch( query ).sequence
|
|
314 alignS = self.fetch( subject ).sequence
|
|
315 createNewAlign = True
|
|
316 indexAlign = 0
|
|
317 indexQ = 0
|
|
318 indexS = 0
|
|
319 while indexAlign < len(alignQ):
|
|
320 if alignQ[ indexAlign ] != "-" and alignS[ indexAlign ] != "-":
|
|
321 indexQ += 1
|
|
322 indexS += 1
|
|
323 if createNewAlign:
|
|
324 iAlign = Align( Range( query, indexQ, indexQ ),
|
|
325 Range( subject, indexS, indexS ),
|
|
326 0,
|
|
327 int( alignQ[ indexAlign ] == alignS[ indexAlign ] ),
|
|
328 int( alignQ[ indexAlign ] == alignS[ indexAlign ] ) )
|
|
329 lAligns.append( iAlign )
|
|
330 createNewAlign = False
|
|
331 else:
|
|
332 lAligns[-1].range_query.end += 1
|
|
333 lAligns[-1].range_subject.end += 1
|
|
334 lAligns[-1].score += int( alignQ[ indexAlign ] == alignS[ indexAlign ] )
|
|
335 lAligns[-1].identity += int( alignQ[ indexAlign ] == alignS[ indexAlign ] )
|
|
336 else:
|
|
337 if not createNewAlign:
|
|
338 lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()
|
|
339 createNewAlign = True
|
|
340 if alignQ[ indexAlign ] != "-":
|
|
341 indexQ += 1
|
|
342 elif alignS[ indexAlign ] != "-":
|
|
343 indexS += 1
|
|
344 indexAlign += 1
|
|
345 if not createNewAlign:
|
|
346 lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()
|
|
347 return lAligns
|
|
348
|
|
349
|
|
350 def removeGaps(self):
|
|
351 for iBs in self.db:
|
|
352 iBs.removeSymbol( "-" )
|
|
353
|
|
354 ## Compute mean per cent identity for MSA.
|
|
355 # First sequence in MSA is considered as reference sequence.
|
|
356 #
|
|
357 #
|
|
358 def computeMeanPcentIdentity(self):
|
|
359 seqRef = self.db[0]
|
|
360 sumPcentIdentity = 0
|
|
361
|
|
362 for seq in self.db[1:]:
|
|
363 pcentIdentity = self._computePcentIdentityBetweenSeqRefAndCurrentSeq(seqRef, seq)
|
|
364 sumPcentIdentity = sumPcentIdentity + pcentIdentity
|
|
365
|
|
366 nbSeq = len(self.db[1:])
|
|
367 meanPcentIdentity = round (sumPcentIdentity/nbSeq)
|
|
368
|
|
369 return meanPcentIdentity
|
|
370
|
|
371 def _computePcentIdentityBetweenSeqRefAndCurrentSeq(self, seqRef, seq):
|
|
372 indexOnSeqRef = 0
|
|
373 sumIdentity = 0
|
|
374 for nuclSeq in seq.sequence:
|
|
375 nuclRef = seqRef.sequence[indexOnSeqRef]
|
|
376
|
|
377 if nuclRef != "-" and nuclRef == nuclSeq:
|
|
378 sumIdentity = sumIdentity + 1
|
|
379 indexOnSeqRef = indexOnSeqRef + 1
|
|
380
|
|
381 return float(sumIdentity) / float(seqRef.getLength()) * 100
|
|
382
|
|
383
|
|
384
|
|
385
|
|
386
|
|
387
|
|
388
|
|
389
|
|
390
|
|
391
|
|
392
|
|
393
|
|
394
|
|
395
|
|
396
|