Mercurial > repos > yufei-luo > s_mart
comparison commons/core/seq/AlignedBioseqDB.py @ 6:769e306b7933
Change the repository level.
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:54:14 -0500 |
parents | |
children | 94ab73e8a190 |
comparison
equal
deleted
inserted
replaced
5:ea3082881bf8 | 6:769e306b7933 |
---|---|
1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
2 # http://www.inra.fr | |
3 # http://urgi.versailles.inra.fr | |
4 # | |
5 # This software is governed by the CeCILL license under French law and | |
6 # abiding by the rules of distribution of free software. You can use, | |
7 # modify and/ or redistribute the software under the terms of the CeCILL | |
8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
9 # "http://www.cecill.info". | |
10 # | |
11 # As a counterpart to the access to the source code and rights to copy, | |
12 # modify and redistribute granted by the license, users are provided only | |
13 # with a limited warranty and the software's author, the holder of the | |
14 # economic rights, and the successive licensors have only limited | |
15 # liability. | |
16 # | |
17 # In this respect, the user's attention is drawn to the risks associated | |
18 # with loading, using, modifying and/or developing or reproducing the | |
19 # software by the user in light of its specific status of free software, | |
20 # that may mean that it is complicated to manipulate, and that also | |
21 # therefore means that it is reserved for developers and experienced | |
22 # professionals having in-depth computer knowledge. Users are therefore | |
23 # encouraged to load and test the software's suitability as regards their | |
24 # requirements in conditions enabling the security of their systems and/or | |
25 # data to be ensured and, more generally, to use and operate it in the | |
26 # same conditions as regards security. | |
27 # | |
28 # The fact that you are presently reading this means that you have had | |
29 # knowledge of the CeCILL license and that you accept its terms. | |
30 | |
31 | |
32 import sys | |
33 from commons.core.seq.BioseqDB import BioseqDB | |
34 from commons.core.seq.Bioseq import Bioseq | |
35 from commons.core.coord.Align import Align | |
36 from commons.core.coord.Range import Range | |
37 from commons.core.stat.Stat import Stat | |
38 from math import log | |
39 | |
40 | |
41 ## Multiple Sequence Alignment Representation | |
42 # | |
43 # | |
44 class AlignedBioseqDB( BioseqDB ): | |
45 | |
46 def __init__( self, name="" ): | |
47 BioseqDB.__init__( self, name ) | |
48 seqLength = self.getLength() | |
49 if self.getSize() > 1: | |
50 for bs in self.db[1:]: | |
51 if bs.getLength() != seqLength: | |
52 print "ERROR: aligned sequences have different length" | |
53 | |
54 | |
55 ## Get length of the alignment | |
56 # | |
57 # @return length | |
58 # @warning name before migration was 'length' | |
59 # | |
60 def getLength( self ): | |
61 length = 0 | |
62 if self.db != []: | |
63 length = self.db[0].getLength() | |
64 return length | |
65 | |
66 | |
67 ## Get the true length of a given sequence (without gaps) | |
68 # | |
69 # @param header string header of the sequence to analyze | |
70 # @return length integer | |
71 # @warning name before migration was 'true_length' | |
72 # | |
73 def getSeqLengthWithoutGaps( self, header ): | |
74 bs = self.fetch( header ) | |
75 count = 0 | |
76 for pos in xrange(0,len(bs.sequence)): | |
77 if bs.sequence[pos] != "-": | |
78 count += 1 | |
79 return count | |
80 | |
81 | |
82 ## Record the occurrences of symbols (A, T, G, C, N, -, ...) at each site | |
83 # | |
84 # @return: list of dico whose keys are symbols and values are their occurrences | |
85 # | |
86 def getListOccPerSite( self ): | |
87 lOccPerSite = [] # list of dictionaries, one per position on the sequence | |
88 n = 0 # nb of sequences parsed from the input file | |
89 firstSeq = True | |
90 | |
91 # for each sequence in the bank | |
92 for bs in self.db: | |
93 if bs.sequence == None: | |
94 break | |
95 n += 1 | |
96 | |
97 # if it is the first to be parsed, create a dico at each site | |
98 if firstSeq: | |
99 for i in xrange(0,len(bs.sequence)): | |
100 lOccPerSite.append( {} ) | |
101 firstSeq = False | |
102 | |
103 # for each site, add its nucleotide | |
104 for i in xrange(0,len(bs.sequence)): | |
105 nuc = bs.sequence[i].upper() | |
106 if lOccPerSite[i].has_key( nuc ): | |
107 lOccPerSite[i][nuc] += 1 | |
108 else: | |
109 lOccPerSite[i][nuc] = 1 | |
110 | |
111 return lOccPerSite | |
112 | |
113 #TODO: review minNbNt !!! It should be at least 2 nucleotides to build a consensus... | |
114 ## Make a consensus from the MSA | |
115 # | |
116 # @param minNbNt: minimum nb of nucleotides to edit a consensus | |
117 # @param minPropNt: minimum proportion for the major nucleotide to be used, otherwise add 'N' (default=0.0) | |
118 # @param verbose: level of information sent to stdout (default=0/1) | |
119 # @return: consensus | |
120 # | |
121 def getConsensus( self, minNbNt, minPropNt=0.0, verbose=0 , isHeaderSAtannot=False): | |
122 | |
123 maxPropN = 0.40 # discard consensus if more than 40% of N's | |
124 | |
125 nbInSeq = self.getSize() | |
126 if verbose > 0: | |
127 print "nb of aligned sequences: %i" % ( nbInSeq ); sys.stdout.flush() | |
128 if nbInSeq < 2: | |
129 print "ERROR: can't make a consensus with less than 2 sequences" | |
130 sys.exit(1) | |
131 if minNbNt >= nbInSeq: | |
132 minNbNt = nbInSeq - 1 | |
133 print "minNbNt=%i" % ( minNbNt ) | |
134 if minPropNt >= 1.0: | |
135 print "ERROR: minPropNt=%.2f should be a proportion (below 1.0)" % ( minPropNt ) | |
136 sys.exit(1) | |
137 | |
138 lOccPerSite = self.getListOccPerSite() | |
139 nbSites = len(lOccPerSite) | |
140 if verbose > 0: | |
141 print "nb of sites: %i" % ( nbSites ); sys.stdout.flush() | |
142 | |
143 seqConsensus = "" | |
144 | |
145 # for each site (i.e. each column of the MSA) | |
146 nbRmvColumns = 0 | |
147 countSites = 0 | |
148 for dNt2Occ in lOccPerSite: | |
149 countSites += 1 | |
150 if verbose > 1: | |
151 print "site %s / %i" % ( str(countSites).zfill( len(str(nbSites)) ), | |
152 nbSites ) | |
153 sys.stdout.flush() | |
154 occMaxNt = 0 # occurrences of the predominant nucleotide at this site | |
155 lBestNt = [] | |
156 nbNt = 0 # total nb of A, T, G and C (no gap) | |
157 | |
158 # for each distinct symbol at this site (A, T, G, C, N, -,...) | |
159 for j in dNt2Occ.keys(): | |
160 if j != "-": | |
161 nbNt += dNt2Occ[j] | |
162 if verbose > 1: | |
163 print "%s: %i" % ( j, dNt2Occ[j] ) | |
164 if dNt2Occ[j] > occMaxNt: | |
165 occMaxNt = dNt2Occ[j] | |
166 lBestNt = [ j ] | |
167 elif dNt2Occ[j] == occMaxNt: | |
168 lBestNt.append( j ) | |
169 if nbNt == 0: # some MSA programs can remove some sequences (e.g. Muscle after Recon) or when using Refalign (non-alignable TE fragments put together via a refseq) | |
170 nbRmvColumns += 1 | |
171 | |
172 if len( lBestNt ) >= 1: | |
173 bestNt = lBestNt[0] | |
174 | |
175 # if the predominant nucleotide occurs in less than x% of the sequences, put a "N" | |
176 if minPropNt > 0.0 and nbNt != 0 and float(occMaxNt)/float(nbNt) < minPropNt: | |
177 bestNt = "N" | |
178 | |
179 if int(nbNt) >= int(minNbNt): | |
180 seqConsensus += bestNt | |
181 if verbose > 1: | |
182 print "-> %s" % ( bestNt ) | |
183 | |
184 if nbRmvColumns: | |
185 if nbRmvColumns == 1: | |
186 print "WARNING: 1 site was removed (%.2f%%)" % (nbRmvColumns / float(nbSites) * 100) | |
187 else: | |
188 print "WARNING: %i sites were removed (%.2f%%)" % ( nbRmvColumns, nbRmvColumns / float(nbSites) * 100 ) | |
189 sys.stdout.flush() | |
190 if seqConsensus == "": | |
191 print "WARNING: no consensus can be built (no sequence left)" | |
192 return | |
193 | |
194 propN = seqConsensus.count("N") / float(len(seqConsensus)) | |
195 if propN >= maxPropN: | |
196 print "WARNING: no consensus can be built (%i%% of N's >= %i%%)" % ( propN * 100, maxPropN * 100 ) | |
197 return | |
198 elif propN >= maxPropN * 0.5: | |
199 print "WARNING: %i%% of N's" % ( propN * 100 ) | |
200 | |
201 consensus = Bioseq() | |
202 consensus.sequence = seqConsensus | |
203 if isHeaderSAtannot: | |
204 header = self.db[0].header | |
205 pyramid = header.split("Gr")[1].split("Cl")[0] | |
206 pile = header.split("Cl")[1].split(" ")[0] | |
207 consensus.header = "consensus=%s length=%i nbAlign=%i pile=%s pyramid=%s" % (self.name, len(seqConsensus), self.getSize(), pile, pyramid) | |
208 else: | |
209 consensus.header = "consensus=%s length=%i nbAlign=%i" % ( self.name, len(seqConsensus), self.getSize() ) | |
210 | |
211 if verbose > 0: | |
212 | |
213 statEntropy = self.getEntropy( verbose - 1 ) | |
214 print "entropy: %s" % ( statEntropy.stringQuantiles() ) | |
215 sys.stdout.flush() | |
216 | |
217 return consensus | |
218 | |
219 | |
220 ## Get the entropy of the whole multiple alignment (only for A, T, G and C) | |
221 # | |
222 # @param verbose level of verbosity | |
223 # | |
224 # @return statistics about the entropy of the MSA | |
225 # | |
226 def getEntropy( self, verbose=0 ): | |
227 | |
228 stats = Stat() | |
229 | |
230 # get the occurrences of symbols at each site | |
231 lOccPerSite = self.getListOccPerSite() | |
232 | |
233 countSite = 0 | |
234 | |
235 # for each site | |
236 for dSymbol2Occ in lOccPerSite: | |
237 countSite += 1 | |
238 | |
239 # count the number of nucleotides (A, T, G and C, doesn't count gap '-') | |
240 nbNt = 0 | |
241 dATGC2Occ = {} | |
242 for base in ["A","T","G","C"]: | |
243 dATGC2Occ[ base ] = 0.0 | |
244 for nt in dSymbol2Occ.keys(): | |
245 if nt != "-": | |
246 nbNt += dSymbol2Occ[ nt ] | |
247 checkedNt = self.getATGCNFromIUPAC( nt ) | |
248 if checkedNt in ["A","T","G","C"] and dSymbol2Occ.has_key( checkedNt ): | |
249 dATGC2Occ[ checkedNt ] += 1 * dSymbol2Occ[ checkedNt ] | |
250 else: # for 'N' | |
251 if dSymbol2Occ.has_key( checkedNt ): | |
252 dATGC2Occ[ "A" ] += 0.25 * dSymbol2Occ[ checkedNt ] | |
253 dATGC2Occ[ "T" ] += 0.25 * dSymbol2Occ[ checkedNt ] | |
254 dATGC2Occ[ "G" ] += 0.25 * dSymbol2Occ[ checkedNt ] | |
255 dATGC2Occ[ "C" ] += 0.25 * dSymbol2Occ[ checkedNt ] | |
256 if verbose > 2: | |
257 for base in dATGC2Occ.keys(): | |
258 print "%s: %i" % ( base, dATGC2Occ[ base ] ) | |
259 | |
260 # compute the entropy for the site | |
261 entropySite = 0.0 | |
262 for nt in dATGC2Occ.keys(): | |
263 entropySite += self.computeEntropy( dATGC2Occ[ nt ], nbNt ) | |
264 if verbose > 1: | |
265 print "site %i (%i nt): entropy = %.3f" % ( countSite, nbNt, entropySite ) | |
266 stats.add( entropySite ) | |
267 | |
268 return stats | |
269 | |
270 | |
271 ## Get A, T, G, C or N from an IUPAC letter | |
272 # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N'] | |
273 # | |
274 # @return A, T, G, C or N | |
275 # | |
276 def getATGCNFromIUPAC( self, nt ): | |
277 iBs = Bioseq() | |
278 return iBs.getATGCNFromIUPAC( nt ) | |
279 | |
280 | |
281 ## Compute the entropy based on the occurrences of a certain nucleotide and the total number of nucleotides | |
282 # | |
283 def computeEntropy( self, nbOcc, nbNt ): | |
284 if nbOcc == 0.0: | |
285 return 0.0 | |
286 else: | |
287 freq = nbOcc / float(nbNt) | |
288 return - freq * log(freq) / log(2) | |
289 | |
290 | |
291 ## Save the multiple alignment as a matrix with '0' if gap, '1' otherwise | |
292 # | |
293 def saveAsBinaryMatrix( self, outFile ): | |
294 outFileHandler = open( outFile, "w" ) | |
295 for bs in self.db: | |
296 string = "%s" % ( bs.header ) | |
297 for nt in bs.sequence: | |
298 if nt != "-": | |
299 string += "\t%i" % ( 1 ) | |
300 else: | |
301 string += "\t%i" % ( 0 ) | |
302 outFileHandler.write( "%s\n" % ( string ) ) | |
303 outFileHandler.close() | |
304 | |
305 | |
306 ## Return a list of Align instances corresponding to the aligned regions (without gaps) | |
307 # | |
308 # @param query string header of the sequence considered as query | |
309 # @param subject string header of the sequence considered as subject | |
310 # | |
311 def getAlignList( self, query, subject ): | |
312 lAligns = [] | |
313 alignQ = self.fetch( query ).sequence | |
314 alignS = self.fetch( subject ).sequence | |
315 createNewAlign = True | |
316 indexAlign = 0 | |
317 indexQ = 0 | |
318 indexS = 0 | |
319 while indexAlign < len(alignQ): | |
320 if alignQ[ indexAlign ] != "-" and alignS[ indexAlign ] != "-": | |
321 indexQ += 1 | |
322 indexS += 1 | |
323 if createNewAlign: | |
324 iAlign = Align( Range( query, indexQ, indexQ ), | |
325 Range( subject, indexS, indexS ), | |
326 0, | |
327 int( alignQ[ indexAlign ] == alignS[ indexAlign ] ), | |
328 int( alignQ[ indexAlign ] == alignS[ indexAlign ] ) ) | |
329 lAligns.append( iAlign ) | |
330 createNewAlign = False | |
331 else: | |
332 lAligns[-1].range_query.end += 1 | |
333 lAligns[-1].range_subject.end += 1 | |
334 lAligns[-1].score += int( alignQ[ indexAlign ] == alignS[ indexAlign ] ) | |
335 lAligns[-1].identity += int( alignQ[ indexAlign ] == alignS[ indexAlign ] ) | |
336 else: | |
337 if not createNewAlign: | |
338 lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery() | |
339 createNewAlign = True | |
340 if alignQ[ indexAlign ] != "-": | |
341 indexQ += 1 | |
342 elif alignS[ indexAlign ] != "-": | |
343 indexS += 1 | |
344 indexAlign += 1 | |
345 if not createNewAlign: | |
346 lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery() | |
347 return lAligns | |
348 | |
349 | |
350 def removeGaps(self): | |
351 for iBs in self.db: | |
352 iBs.removeSymbol( "-" ) | |
353 | |
354 ## Compute mean per cent identity for MSA. | |
355 # First sequence in MSA is considered as reference sequence. | |
356 # | |
357 # | |
358 def computeMeanPcentIdentity(self): | |
359 seqRef = self.db[0] | |
360 sumPcentIdentity = 0 | |
361 | |
362 for seq in self.db[1:]: | |
363 pcentIdentity = self._computePcentIdentityBetweenSeqRefAndCurrentSeq(seqRef, seq) | |
364 sumPcentIdentity = sumPcentIdentity + pcentIdentity | |
365 | |
366 nbSeq = len(self.db[1:]) | |
367 meanPcentIdentity = round (sumPcentIdentity/nbSeq) | |
368 | |
369 return meanPcentIdentity | |
370 | |
371 def _computePcentIdentityBetweenSeqRefAndCurrentSeq(self, seqRef, seq): | |
372 indexOnSeqRef = 0 | |
373 sumIdentity = 0 | |
374 for nuclSeq in seq.sequence: | |
375 nuclRef = seqRef.sequence[indexOnSeqRef] | |
376 | |
377 if nuclRef != "-" and nuclRef == nuclSeq: | |
378 sumIdentity = sumIdentity + 1 | |
379 indexOnSeqRef = indexOnSeqRef + 1 | |
380 | |
381 return float(sumIdentity) / float(seqRef.getLength()) * 100 | |
382 | |
383 | |
384 | |
385 | |
386 | |
387 | |
388 | |
389 | |
390 | |
391 | |
392 | |
393 | |
394 | |
395 | |
396 |