comparison commons/core/seq/BioseqDB.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children
comparison
equal deleted inserted replaced
5:ea3082881bf8 6:769e306b7933
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import sys
33 import re
34 from commons.core.seq.Bioseq import Bioseq
35 from commons.core.stat.Stat import Stat
36
37
38 ## Handle a collection of a Bioseq (header-sequence)
39 #
40 class BioseqDB( object ):
41
42 def __init__( self, name="" ):
43 self.idx = {}
44 self.idx_renamed = {}
45 self.db = []
46 self.name = name
47 if name != "":
48 faFile = open( name )
49 self.read( faFile )
50 faFile.close()
51 self.mean_seq_lgth = None
52 self.stat = Stat()
53
54
55 ## Equal operator
56 #
57 def __eq__( self, o ):
58 selfSize = self.getSize()
59 if selfSize != o.getSize():
60 return False
61 nbEqualInstances = 0
62 for i in self.db:
63 atLeastOneIsEqual = False
64 for j in o.db:
65 if i == j:
66 atLeastOneIsEqual = True
67 continue
68 if atLeastOneIsEqual:
69 nbEqualInstances += 1
70 if nbEqualInstances == selfSize:
71 return True
72 return False
73
74
75 ## Change the name of the BioseqDB
76 #
77 # @param name the BioseqDB name
78 #
79 def setName(self, name):
80 self.name = name
81
82
83 ## Record each sequence of the input file as a list of Bioseq instances
84 #
85 # @param faFileHandler handler of a fasta file
86 #
87 def read( self, faFileHandler ):
88 while True:
89 seq = Bioseq()
90 seq.read( faFileHandler )
91 if seq.sequence == None:
92 break
93 self.add( seq )
94
95
96 ## Write all Bioseq of BioseqDB in a formatted fasta file (60 character long)
97 #
98 # @param faFileHandler file handler of a fasta file
99 #
100 def write( self, faFileHandler ):
101 for bs in self.db:
102 bs.writeABioseqInAFastaFile( faFileHandler )
103
104
105 ## Write all Bioseq of BioseqDB in a formatted fasta file (60 character long)
106 #
107 # @param outFaFileName file name of fasta file
108 # @param mode 'write' or 'append'
109 #
110 def save( self, outFaFileName, mode="w" ):
111 outFaFile = open( outFaFileName, mode )
112 self.write( outFaFile )
113 outFaFile.close()
114
115
116 ## Read a formatted fasta file and load it in the BioseqDB instance
117 #
118 # @param inFaFileName file name of fasta file
119 #
120 def load(self, inFaFileName):
121 fichier = open(inFaFileName)
122 self.read(fichier)
123 fichier.close()
124
125
126 ## Reverse each sequence of the collection
127 #
128 def reverse( self ):
129 for bs in self.db:
130 bs.reverse()
131
132
133 ## Turn each sequence into its complement
134 #
135 def complement( self ):
136 for bs in self.db:
137 bs.complement()
138
139
140 ## Reverse and complement each sequence
141 #
142 def reverseComplement( self ):
143 for bs in self.db:
144 bs.reverseComplement()
145
146
147 ## Set the collection from a list of Bioseq instances
148 #
149 def setData( self, lBioseqs ):
150 for i in lBioseqs:
151 self.add( i )
152
153
154 ## Initialization of each attribute of the collection
155 #
156 def reset( self ):
157 self.db = []
158 self.idx = {}
159 self.name = None
160 self.mean_seq_lgth = None
161 self.stat.reset()
162
163
164 ## Remove all the gap of the sequences of the collection
165 #
166 def cleanGap(self):
167 for iBioSeq in self.db:
168 iBioSeq.cleanGap()
169
170
171 ## Add a Bioseq instance and update the attributes
172 #
173 # @param bs a Bioseq instance
174 #
175 def add( self, bs ):
176 if self.idx.has_key( bs.header ):
177 sys.stderr.write( "ERROR: two sequences with same header '%s'\n" % ( bs.header ) )
178 sys.exit(1)
179 self.db.append( bs )
180 self.idx[ bs.header ] = len(self.db) - 1
181 self.idx_renamed[ bs.header.replace("::","-").replace(":","-").replace(",","-").replace(" ","_") ] = len(self.db) - 1
182
183
184 ## Give the Bioseq instance corresponding to specified index
185 #
186 # @return a Bioseq instance
187 #
188 def __getitem__(self,index):
189 if index < len(self.db):
190 return self.db[index]
191
192
193 ## Give the number of sequences in the bank
194 #
195 # @return an integer
196 #
197 def getSize( self ):
198 return len( self.db )
199
200
201 ## Give the cumulative sequence length in the bank
202 #
203 # @return an integer
204 #
205 def getLength( self ):
206 cumLength = 0
207 for iBioseq in self.db:
208 cumLength += iBioseq.getLength()
209
210 return cumLength
211
212
213 ## Return the length of a given sequence via its header
214 #
215 # @return an integer
216 #
217 def getSeqLength( self, header ):
218 return self.fetch(header).getLength()
219
220
221 ## Return a list with the sequence headers
222 #
223 def getHeaderList( self ):
224 lHeaders = []
225 for bs in self.db:
226 lHeaders.append( bs.header )
227 return lHeaders
228
229
230 ## Return a list with the sequences
231 #
232 def getSequencesList( self ):
233 lSeqs = []
234 for bs in self.db:
235 lSeqs.append( bs.getSequence() )
236 return lSeqs
237
238
239 ## Give the Bioseq instance of the BioseqDB specified by its header
240 #
241 # @warning name of this method not appropriate getBioseqByHeader is proposed
242 # @param header string
243 # @return a Bioseq instance
244 #
245 def fetch( self, header ):
246 return self.db[self.idx[header]]
247
248
249 ## Give the Bioseq instance of the BioseqDB specified by its renamed header
250 # In renamed header "::", ":", "," character are been replaced by "-" and " " by "_"
251 #
252 # @param renamedHeader string
253 # @return a Bioseq instance
254 #
255 def getBioseqByRenamedHeader( self, renamedHeader ):
256 return self.db[self.idx_renamed[renamedHeader]]
257
258
259 ## Count the number of times the given nucleotide is present in the bank.
260 #
261 # @param nt character (nt or aa)
262 # @return an integer
263 #
264 def countNt( self, nt ):
265 total = 0
266 for iBioseq in self.db:
267 total+= iBioseq.countNt( nt )
268 return total
269
270
271 ## Count the number of times each nucleotide (A,T,G,C,N) is present in the bank.
272 #
273 # @return a dictionary with nucleotide as key and an integer as values
274 #
275 def countAllNt( self ):
276 dNt2Count = {}
277 for nt in ["A","T","G","C","N"]:
278 dNt2Count[ nt ] = self.countNt( nt )
279 return dNt2Count
280
281
282 ## Extract a sub BioseqDB of specified size which beginning at specified start
283 #
284 # @param start integer index of first included Bioseq
285 # @param size integer size of expected BioseqDB
286 # @return a BioseqDB
287 #
288 def extractPart(self, start, size):
289 iShorterBioseqDB = BioseqDB()
290 for iBioseq in self.db[start:(start + size)]:
291 iShorterBioseqDB.add(iBioseq)
292 return iShorterBioseqDB
293
294
295 ## Extract a sub BioseqDB with the specified number of best length Bioseq
296 #
297 # @param numBioseq integer the number of Bioseq searched
298 # @return a BioseqDB
299 #
300 def bestLength(self, numBioseq):
301 length_list = []
302 numseq = 0
303 for each_seq in self.db:
304 if each_seq.sequence == None:
305 l=0
306 else:
307 l = each_seq.getLength()
308 length_list.append(l)
309 numseq = numseq + 1
310
311 length_list.sort()
312 size = len(length_list)
313 if numBioseq < size:
314 len_min = length_list[size-numBioseq]
315 else:
316 len_min = length_list[0]
317
318 numseq = 0
319 nbsave = 0
320 bestSeqs = BioseqDB()
321 bestSeqs.setName(self.name)
322 for each_seq in self.db:
323 if each_seq.sequence == None:
324 l=0
325 else :
326 l = each_seq.getLength()
327 numseq = numseq + 1
328 if l >= len_min:
329 bestSeqs.add(each_seq)
330 nbsave = nbsave + 1
331 if nbsave == numBioseq :
332 break
333 return bestSeqs
334
335
336 ## Extract a sub BioseqDB from a file with Bioseq header containing the specified pattern
337 #
338 # @param pattern regular expression of wished Bioseq header
339 # @param inFileName name of fasta file in which we want extract the BioseqDB
340 #
341 def extractPatternOfFile(self, pattern, inFileName):
342 if pattern=="" :
343 return
344 srch=re.compile(pattern)
345 file_db=open(inFileName)
346 numseq=0
347 nbsave=0
348 while 1:
349 seq=Bioseq()
350 seq.read(file_db)
351 if seq.sequence==None:
352 break
353 numseq+=1
354 m=srch.search(seq.header)
355 if m:
356 self.add(seq)
357 nbsave+=1
358 file_db.close()
359
360
361 ## Extract a sub BioseqDB from the instance with all Bioseq header containing the specified pattern
362 #
363 # @param pattern regular expression of wished Bioseq header
364 #
365 # @return a BioseqDB
366 #
367 def getByPattern(self,pattern):
368 if pattern=="" :
369 return
370 iBioseqDB=BioseqDB()
371 srch=re.compile(pattern)
372 for iBioseq in self.db:
373 if srch.search(iBioseq.header):
374 iBioseqDB.add(iBioseq)
375 return iBioseqDB
376
377
378 ## Extract a sub BioseqDB from the instance with all Bioseq header not containing the specified pattern
379 #
380 # @param pattern regular expression of not wished Bioseq header
381 #
382 # @return a BioseqDB
383 #
384 def getDiffFromPattern(self,pattern):
385 if pattern=="" :
386 return
387 iBioseqDB=BioseqDB()
388 srch=re.compile(pattern)
389 for iBioseq in self.db:
390 if not srch.search(iBioseq.header):
391 iBioseqDB.add(iBioseq)
392 return iBioseqDB
393
394 #TODO: to run several times to remove all concerned sequences when big data. How to fix it ?
395 ## Remove from the instance all Bioseq which header contains the specified pattern
396 #
397 # @param pattern regular expression of not wished Bioseq header
398 #
399 def rmByPattern(self,pattern):
400 if pattern=="" :
401 return
402 srch=re.compile(pattern)
403 for seq in self.db:
404 if srch.search(seq.header):
405 self.db.remove(seq)
406
407
408 ## Copy a part from another BioseqDB in the BioseqDB if Bioseq have got header containing the specified pattern
409 #
410 # @warning this method is called extractPattern in pyRepet.seq.BioseqDB
411 #
412 # @param pattern regular expression of wished Bioseq header
413 # @param sourceBioseqDB the BioseqDB from which we want extract Bioseq
414 #
415 def addBioseqFromABioseqDBIfHeaderContainPattern(self, pattern, sourceBioseqDB):
416 if pattern=="" :
417 return
418 srch=re.compile(pattern)
419 for seq in sourceBioseqDB.db:
420 m=srch.search(seq.header)
421 if m:
422 self.add(seq)
423
424
425 ## Up-case the sequence characters in all sequences
426 #
427 def upCase( self ):
428 for bs in self.db:
429 bs.upCase()
430
431
432 ## Split each gapped Bioseq in a list and store all in a dictionary
433 #
434 # @return a dict, keys are bioseq headers, values are list of Map instances
435 #
436 def getDictOfLMapsWithoutGaps( self ):
437 dSeq2Maps = {}
438
439 for bs in self.db:
440 dSeq2Maps[ bs.header ] = bs.getLMapWhithoutGap()
441
442 return dSeq2Maps
443
444 ## Give the list of the sequence length in the bank
445 #
446 # @return an list
447 #
448 def getListOfSequencesLength( self ):
449 lLength = []
450 for iBioseq in self.db:
451 lLength.append(iBioseq.getLength())
452
453 return lLength
454
455 ## Return sequence length for a list of sequence header
456 #
457 def getSeqLengthByListOfName( self, lHeaderName ):
458 lseqLength=[]
459 for headerName in lHeaderName:
460 lseqLength.append(self.getSeqLength( headerName ))
461 return lseqLength