Mercurial > repos > yufei-luo > s_mart
comparison commons/core/parsing/Multifasta2SNPFile.py @ 6:769e306b7933
Change the repository level.
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:54:14 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:ea3082881bf8 | 6:769e306b7933 |
---|---|
1 import re | |
2 import os | |
3 import logging | |
4 from commons.core.utils.FileUtils import FileUtils | |
5 from commons.core.seq.BioseqDB import BioseqDB | |
6 from commons.core.seq.Bioseq import Bioseq | |
7 from commons.core.LoggerFactory import LoggerFactory | |
8 | |
9 DNA_ALPHABET_WITH_N_AND_DELS = set (['A','T','G','C','N','-']) | |
10 IUPAC = set(['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N', '-', 'a','t','g','c','u','r','y','m','k','w','s','b','d','h','v','n']) | |
11 | |
12 class Multifasta2SNPFile( object ): | |
13 | |
14 POLYM_TYPE_4_SNP = "SNP" | |
15 POLYM_TYPE_4_INSERTION = "INSERTION" | |
16 POLYM_TYPE_4_DELETION = "DELETION" | |
17 POLYM_DEFAULT_CONFIDENCE_VALUE = "A" | |
18 SNP_LENGTH = 1 | |
19 FLANK_LENGTH = 250 | |
20 | |
21 def __init__(self, taxon, batchName="", geneName=""): | |
22 | |
23 if(batchName): | |
24 self._batchName = batchName | |
25 | |
26 if(geneName): | |
27 self._geneName = geneName | |
28 | |
29 self._taxon = taxon | |
30 self._outSubSNPFileName = "SubSNP.csv" | |
31 self._outAlleleFileName = "Allele.csv" | |
32 self._outIndividualFileName = "Individual.csv" | |
33 self._outSequenceFSAFileName = "Sequences.fsa" | |
34 self._outSequenceCSVFileName = "Sequences.csv" | |
35 self._outBatchFileName = "Batch.txt" | |
36 self._outBatchLineFileName = "BatchLine.csv" | |
37 self._logFileName = "multifasta2SNP.log" | |
38 | |
39 self._lBatchFileResults = [] | |
40 self._lSubSNPFileResults = [] | |
41 self._lRefSequences = [] | |
42 self._lIndividualFileResults = [] | |
43 self._lBatchLineFileResults = [] | |
44 self._dIndividualNumbers4SubSNPResults = {} | |
45 self._dAlleleFileResults = {} | |
46 | |
47 | |
48 self.dcurrentIndel = {} | |
49 self.lIndelsOfTheCurrentLine = [] | |
50 self.lIndelsOverAllLines = [] | |
51 self.dSNPsPositions = {} | |
52 | |
53 self._iCurrentLineNumber = 0 | |
54 self._currentBatchNumber = 1 | |
55 self.currentLineName = "" | |
56 self.currentNucleotide = "" | |
57 self.currentPosition = 0 | |
58 self._sPolymConfidenceValue = Multifasta2SNPFile.POLYM_DEFAULT_CONFIDENCE_VALUE | |
59 self._sPolymType = Multifasta2SNPFile.POLYM_TYPE_4_SNP | |
60 self._iPolymLength = Multifasta2SNPFile.SNP_LENGTH | |
61 self._fileUtils = FileUtils() | |
62 | |
63 if self._fileUtils.isRessourceExists(self._logFileName): | |
64 os.remove(self._logFileName) | |
65 self._logFile = LoggerFactory.createLogger(self._logFileName, logging.INFO, "%(asctime)s %(levelname)s: %(message)s") | |
66 | |
67 def runOneBatch( self, inFileName): | |
68 self._currentFileName = inFileName | |
69 #TODO: methode a virer; n'utiliser au final que runOneBatchWithoutWriting | |
70 self._wrapper = self.createWrapperFromFile(inFileName) | |
71 self._lBatchFileResults = self.completeBatchList() | |
72 self.detectSNPsAndIndels(self._wrapper) | |
73 self._writeAllOutputFiles() | |
74 self._currentBatchNumber += 1 | |
75 | |
76 def runOneBatchWithoutWriting( self, inFileName): | |
77 self.lIndelsOverAllLines = [] | |
78 self._currentFileName = inFileName | |
79 self._wrapper = self.createWrapperFromFile(inFileName) | |
80 self._lBatchFileResults = self.completeBatchList() | |
81 self.detectSNPsAndIndels(self._wrapper) | |
82 self._currentBatchNumber += 1 | |
83 | |
84 | |
85 def _cleanOutputsInTheCurrentDir(self): | |
86 #TODO: create a list of files to be deleted | |
87 FileUtils.removeFilesByPattern("*.csv") | |
88 if (FileUtils.isRessourceExists(self._outBatchFileName)): | |
89 os.remove(self._outBatchFileName) | |
90 if (FileUtils.isRessourceExists(self._outSequenceFSAFileName)): | |
91 os.remove(self._outSequenceFSAFileName) | |
92 | |
93 | |
94 def _createOutputObjectsIteratingOnCurrentDir(self): | |
95 #TODO: gerer les extensions multiples | |
96 extList = [".fasta", ".fsa"] | |
97 for dirname, dirnames, filenames in os.walk("."): | |
98 filenames.sort() | |
99 for filename in filenames: | |
100 if os.path.splitext(filename)[1] in extList: | |
101 self._geneName = os.path.splitext(filename)[0] | |
102 self._batchName = "Batch_" + self._geneName | |
103 self.runOneBatchWithoutWriting(filename) | |
104 | |
105 def runSeveralBatches( self, inputDir): | |
106 #TODO: enlever les chdirs, appeler les fichiers en absolu et modifier les tests en consequences | |
107 os.chdir(inputDir) | |
108 self._cleanOutputsInTheCurrentDir() | |
109 self._createOutputObjectsIteratingOnCurrentDir() | |
110 self._writeAllOutputFiles() | |
111 os.chdir("../") | |
112 | |
113 | |
114 def _treatADeletionClosingWithAnotherBaseThanRefSeq(self, lineName, nucleotide, position): | |
115 if (self.isTheIndelOpen4ThisLine): | |
116 self._closeTheCurrentIndel(lineName, nucleotide, position) | |
117 self._manageSNPs(lineName, nucleotide, position) | |
118 self.addOnePolymorphicPosition(position) | |
119 | |
120 def _treatNucleotideDifferentThanRefSeqCase(self, refSeq, lineName, index, nucleotide, position): | |
121 if (nucleotide == "-" or refSeq[index] == "-"): | |
122 if (self.isTheIndelOpen4ThisLine): | |
123 self._expandTheCurrentIndel(position, nucleotide) | |
124 else: | |
125 self._startAnIndel(position, nucleotide) | |
126 else: | |
127 self._treatADeletionClosingWithAnotherBaseThanRefSeq(lineName, nucleotide, position) | |
128 | |
129 | |
130 def _treatSameNucleotideInOneIndel(self, refSeq, lineName, index, nucleotide, position): | |
131 if (self._sPolymType == Multifasta2SNPFile.POLYM_TYPE_4_DELETION): | |
132 self._closeTheCurrentIndel(lineName, nucleotide, position) | |
133 elif (self._sPolymType == Multifasta2SNPFile.POLYM_TYPE_4_INSERTION): | |
134 if (refSeq[index] == "-"): | |
135 self._expandTheCurrentIndel(position, nucleotide) | |
136 else: | |
137 self._closeTheCurrentIndel(lineName, nucleotide, position) | |
138 | |
139 def detectSNPsAndIndels(self, iRefAndLines): | |
140 refSeq = iRefAndLines.getReferenceSequence() | |
141 refSeqLength = len ( refSeq ) | |
142 self.dSNPsPositions = {} | |
143 | |
144 for iLineBioseq in iRefAndLines.getLinesBioseqInstances(): | |
145 lineSequence = iLineBioseq.sequence | |
146 self.currentLineName = iLineBioseq.header | |
147 self._manageCurrentIndividual(self.currentLineName) | |
148 | |
149 index = 0 | |
150 self.isTheIndelOpen4ThisLine = 0 | |
151 self.lIndelsOfTheCurrentLine = [] | |
152 for nucleotide in lineSequence: | |
153 position = index + 1 | |
154 if (index < refSeqLength) and self._isSNPDetected(refSeq, index, nucleotide): | |
155 self._treatNucleotideDifferentThanRefSeqCase(refSeq, self.currentLineName, index, nucleotide, position) | |
156 elif(index < refSeqLength and self.isTheIndelOpen4ThisLine) : | |
157 self._treatSameNucleotideInOneIndel(refSeq, self.currentLineName, index, nucleotide, position) | |
158 index = index + 1 | |
159 self.currentNucleotide = nucleotide | |
160 self.currentPosition = position | |
161 | |
162 self.lIndelsOverAllLines = self.lIndelsOverAllLines + self.lIndelsOfTheCurrentLine | |
163 | |
164 self._postTraitementDetectSNP(self.currentLineName, self.currentNucleotide, self.currentPosition) | |
165 | |
166 def _manageCurrentIndividual(self, lineName): | |
167 self._lIndividualFileResults = self._completeIndividualListWithCurrentIndividual(self._lIndividualFileResults, lineName) | |
168 self._lBatchLineFileResults = self._completeBatchLineListWithCurrentIndividual(self._lBatchLineFileResults, self._lIndividualFileResults, lineName) | |
169 if not self._dIndividualNumbers4SubSNPResults.__contains__(lineName): | |
170 self._dIndividualNumbers4SubSNPResults[lineName] = len(self._lIndividualFileResults) | |
171 | |
172 | |
173 def _manageLastPositionIndels(self, lineName, nucleotide, position): | |
174 if (self.isTheIndelOpen4ThisLine): | |
175 self._closeTheCurrentIndel(lineName, nucleotide, position) | |
176 self.lIndelsOverAllLines.append(self.lIndelsOfTheCurrentLine.pop()) | |
177 | |
178 def _postTraitementDetectSNP(self, lineName, nucleotide, position): | |
179 self._manageLastPositionIndels(lineName, nucleotide, position) | |
180 | |
181 self._mergeAllelesAndSubSNPsFromOverlappingIndels() | |
182 self._addMissingsAllelesAndSubSNPs() | |
183 | |
184 self._lSubSNPFileResults = self._sortSubSNPResultByBatchPositionAndLineName(self._lSubSNPFileResults) | |
185 | |
186 def _manageSNPs(self, lineName, nucleotide, position): | |
187 self._dAlleleFileResults = self._completeAlleleSetWithCurrentAllele(self._dAlleleFileResults, nucleotide) | |
188 truePosition = self.getUngappedPositionInRefSeq(position) | |
189 subSNPName = self._formatSubSNPName(lineName, truePosition, Multifasta2SNPFile.POLYM_TYPE_4_SNP) | |
190 iAlleleNumber = self._dAlleleFileResults[nucleotide] | |
191 self._sPolymType = Multifasta2SNPFile.POLYM_TYPE_4_SNP | |
192 flank5Prime, flank3Prime = self.getFlanksOfASubSNP(lineName, position, Multifasta2SNPFile.SNP_LENGTH, Multifasta2SNPFile.FLANK_LENGTH) | |
193 dSubSNPResult = {'subSNPName':subSNPName, 'position':truePosition, 'lineName':self._dIndividualNumbers4SubSNPResults[lineName], | |
194 'allele':iAlleleNumber, 'batchNumber': self._currentBatchNumber, 'confidenceValue':self._sPolymConfidenceValue, | |
195 'type':self._sPolymType, 'length': Multifasta2SNPFile.SNP_LENGTH, | |
196 '5flank':flank5Prime, '3flank':flank3Prime} | |
197 if(not self.subSNPExistsInSubSNPList(dSubSNPResult, self._lSubSNPFileResults)): | |
198 self._lSubSNPFileResults.append(dSubSNPResult) | |
199 | |
200 def _startAnIndel(self, position, nucleotide): | |
201 self.dcurrentIndel['start'] = position | |
202 self.dcurrentIndel['end'] = position | |
203 self.sCurrentIndelAllele = nucleotide | |
204 if(nucleotide == "-"): | |
205 self._sPolymType = Multifasta2SNPFile.POLYM_TYPE_4_DELETION | |
206 else: | |
207 self._sPolymType = Multifasta2SNPFile.POLYM_TYPE_4_INSERTION | |
208 self.isTheIndelOpen4ThisLine = 1 | |
209 | |
210 def _expandTheCurrentIndel(self, position, nucleotide): | |
211 self.sCurrentIndelAllele = self.sCurrentIndelAllele + nucleotide | |
212 self.dcurrentIndel['end'] = position | |
213 | |
214 def _closeTheCurrentIndel(self, lineName, nucleotide, position): | |
215 subSNPName = self._formatSubSNPName(lineName, self.dcurrentIndel['start'], self._sPolymType) | |
216 | |
217 dIndel4TheLine = {'name': subSNPName, 'lineName': lineName, 'start': self.dcurrentIndel['start'],'end' :self.dcurrentIndel['end'], | |
218 'allele': self.sCurrentIndelAllele, 'type': self._sPolymType, 'length': self._iPolymLength} | |
219 | |
220 dIndel4TheLine['length'] = self.getAnIndelLength(dIndel4TheLine) | |
221 | |
222 self.lIndelsOfTheCurrentLine.append(dIndel4TheLine) | |
223 | |
224 self.dcurrentIndel.clear() | |
225 self.isTheIndelOpen4ThisLine = 0 | |
226 | |
227 | |
228 def _mergeAllelesAndSubSNPsFromOverlappingIndels(self): | |
229 lIndelList = [] | |
230 for dIndel in self.lIndelsOverAllLines: | |
231 lIndelList = self.clusteriseIndels(dIndel, self.lIndelsOverAllLines) | |
232 | |
233 for dIndel in lIndelList: | |
234 oldAllele = dIndel['allele'] | |
235 start = dIndel['start'] | |
236 stop = dIndel['end'] | |
237 lineName = dIndel['lineName'] | |
238 | |
239 LineBioSeq = self._wrapper._iLinesBioseqDB.fetch(lineName) | |
240 dIndel = self.updateAllele(oldAllele, start, stop, LineBioSeq, dIndel) | |
241 dSubSNPResult = self.createSubSNPFromAMissingPolym(dIndel, lineName) | |
242 if(not self.subSNPExistsInSubSNPList(dSubSNPResult, self._lSubSNPFileResults)): | |
243 self._lSubSNPFileResults.append(dSubSNPResult) | |
244 | |
245 def updateAllele(self, oldAllele, start, stop, LineBioSeq, dIndel): | |
246 #TODO: creer le test | |
247 newAllele = LineBioSeq.subseq(start, stop).sequence | |
248 if newAllele != oldAllele: | |
249 dIndel['allele'] = newAllele | |
250 self._dAlleleFileResults = self._completeAlleleSetWithCurrentAllele(self._dAlleleFileResults, newAllele) | |
251 return dIndel | |
252 | |
253 | |
254 def getFlanksOfASubSNP(self, lineName, subsnpPosition, polymLength, flankLength): | |
255 bioSeqOfTheLine = self._wrapper._iLinesBioseqDB.fetch(lineName) | |
256 flank5Prime = bioSeqOfTheLine.get5PrimeFlank(subsnpPosition, flankLength) | |
257 flank3Prime = bioSeqOfTheLine.get3PrimeFlank(subsnpPosition, flankLength, polymLength) | |
258 | |
259 return flank5Prime, flank3Prime | |
260 | |
261 def createSubSNPFromAMissingPolym(self, dIndel, lineName): | |
262 if(dIndel['type'] == Multifasta2SNPFile.POLYM_TYPE_4_INSERTION): | |
263 start = self.getUngappedPositionInRefSeq(dIndel['start']-1) | |
264 else: | |
265 start = self.getUngappedPositionInRefSeq(dIndel['start']) | |
266 | |
267 subSNPName = self._formatSubSNPName(dIndel['lineName'], start, dIndel['type']) | |
268 | |
269 iAlleleNumber = self._dAlleleFileResults[dIndel['allele']] | |
270 | |
271 iPolymLength = self.getAnIndelLength(dIndel) | |
272 | |
273 flank5Prime, flank3Prime = self.getFlanksOfASubSNP(lineName, dIndel['start'], iPolymLength, Multifasta2SNPFile.FLANK_LENGTH) | |
274 | |
275 dSubSNPResult = {'subSNPName':subSNPName, 'position':start, 'lineName':self._dIndividualNumbers4SubSNPResults[lineName], 'allele':iAlleleNumber, | |
276 'batchNumber': self._currentBatchNumber, 'confidenceValue':self._sPolymConfidenceValue, 'type':dIndel['type'], | |
277 'length': iPolymLength, '5flank':flank5Prime, '3flank':flank3Prime} | |
278 | |
279 return dSubSNPResult | |
280 | |
281 def clusteriseIndels(self, dIndel, lIndelsOverAllLines): | |
282 iIndice = 0 | |
283 for dIndel in lIndelsOverAllLines: | |
284 iIndice2Compare = 0 | |
285 for dIndel2Compare in lIndelsOverAllLines: | |
286 dIndel, dIndel2Compare = self.mergeBoundsForTwoOverlappingIndels(dIndel, dIndel2Compare) | |
287 lIndelsOverAllLines = self.updateBoundsForAnIndelInAnIndelList(lIndelsOverAllLines, dIndel) | |
288 lIndelsOverAllLines = self.updateBoundsForAnIndelInAnIndelList(lIndelsOverAllLines, dIndel2Compare) | |
289 iIndice2Compare = iIndice2Compare + 1 | |
290 iIndice = iIndice + 1 | |
291 | |
292 return lIndelsOverAllLines | |
293 | |
294 def mergeBoundsForTwoOverlappingIndels(self, dIndel1, dIndel2): | |
295 if((dIndel2['start'] <= dIndel1['start']) and (dIndel2['end'] >= dIndel1['start']) or | |
296 (dIndel1['start'] <= dIndel2['start']) and (dIndel1['end'] >= dIndel2['start'])): | |
297 if(dIndel1['start'] <= dIndel2['start']): | |
298 iStart = dIndel1['start'] | |
299 else: | |
300 iStart = dIndel2['start'] | |
301 | |
302 if(dIndel1['end'] >= dIndel2['end']): | |
303 iEnd = dIndel1['end'] | |
304 else: | |
305 iEnd = dIndel2['end'] | |
306 | |
307 dIndel1['start'] = iStart | |
308 dIndel1['end'] = iEnd | |
309 dIndel2['start'] = iStart | |
310 dIndel2['end'] = iEnd | |
311 | |
312 return dIndel1, dIndel2 | |
313 | |
314 def updateBoundsForAnIndelInAnIndelList(self, lIndelsList, dIndelWithNewBounds): | |
315 name = dIndelWithNewBounds['name'] | |
316 dIndelInTheList, iIndice = self.findAnIndelInAListWithHisName(name, lIndelsList) | |
317 lIndelsList.remove(dIndelInTheList) | |
318 lIndelsList.insert(iIndice, dIndelWithNewBounds) | |
319 return lIndelsList | |
320 | |
321 | |
322 def findASubSNPInAListWithHisName(self, name, lSubSNPList): | |
323 dSubSNP2Find = {} | |
324 indice = 0 | |
325 indice2Find = -1 | |
326 for dSubSNP in lSubSNPList: | |
327 if(dSubSNP['subSNPName'] == name): | |
328 dSubSNP2Find = dSubSNP | |
329 indice2Find = indice | |
330 indice = indice + 1 | |
331 | |
332 if dSubSNP2Find == {} or indice2Find == -1: | |
333 msg = "trying to find a SubSNP not existing: " + name | |
334 self._logFile.error(msg) | |
335 raise Exception ("trying to find a SubSNP not existing: " + name) | |
336 else: | |
337 return dSubSNP2Find, indice2Find | |
338 | |
339 def subSNPExistsInSubSNPList(self, dSubSNP2Find, lSubSNPList): | |
340 flag = 0 | |
341 for dSubSNP in lSubSNPList: | |
342 if(dSubSNP2Find['subSNPName'] == dSubSNP['subSNPName']): | |
343 flag = 1 | |
344 | |
345 if flag == 1: | |
346 return True | |
347 else: | |
348 return False | |
349 | |
350 | |
351 def findAnIndelInAListWithHisName(self, name, lIndelList): | |
352 dIndel2Find = {} | |
353 indice = 0 | |
354 indice2Find = -1 | |
355 for dIndel in lIndelList: | |
356 if(dIndel['name'] == name): | |
357 dIndel2Find = dIndel | |
358 indice2Find = indice | |
359 indice = indice + 1 | |
360 | |
361 if dIndel2Find == {} or indice2Find == -1: | |
362 msg = "trying to find an indel not existing: " + name | |
363 self._logFile.error(msg) | |
364 raise Exception (msg) | |
365 else: | |
366 return dIndel2Find, indice2Find | |
367 | |
368 def _addMissingsAllelesAndSubSNPs(self): | |
369 for dIndel in self.lIndelsOverAllLines: | |
370 start = dIndel['start'] | |
371 end = dIndel['end'] | |
372 type = dIndel['type'] | |
373 self.addMissingAllelesAndSubSNPsForOnePolym(start, end, type) | |
374 | |
375 for position in self.dSNPsPositions: | |
376 self.addMissingAllelesAndSubSNPsForOnePolym(position, position, "SNP") | |
377 | |
378 def addMissingAllelesAndSubSNPsForOnePolym(self, start, end, polymType): | |
379 refSeqAllele = self._wrapper._iReferenceBioseq.subseq(start, end).sequence | |
380 BioSeqDb = self._wrapper.getLinesBioseqInstances() | |
381 lBioSeqDbAlleles = self.getAllelesOfASubSeq(BioSeqDb, start, end) | |
382 for subSequence in lBioSeqDbAlleles: | |
383 if(subSequence['allele'] == refSeqAllele): | |
384 lineName = subSequence['header'] | |
385 dMissingPolym = {'lineName': lineName, 'start': start,'end' :end, | |
386 'allele': subSequence['allele'], 'type':polymType} | |
387 self._dAlleleFileResults = self._completeAlleleSetWithCurrentAllele(self._dAlleleFileResults, subSequence['allele']) | |
388 dSubSNPResult = self.createSubSNPFromAMissingPolym(dMissingPolym, lineName) | |
389 if(not self.subSNPExistsInSubSNPList(dSubSNPResult, self._lSubSNPFileResults)): | |
390 self._lSubSNPFileResults.append(dSubSNPResult) | |
391 | |
392 def addOnePolymorphicPosition(self, position): | |
393 if(not self.dSNPsPositions.has_key(position)): | |
394 self.dSNPsPositions[position] = 1 | |
395 | |
396 def getUngappedPositionInRefSeq(self, position): | |
397 if(position ==1): | |
398 nbOfGaps = 0 | |
399 else: | |
400 seqIn5Prime = self._wrapper._iReferenceBioseq.subseq(1, position-1).sequence | |
401 nbOfGaps = seqIn5Prime.count("-") | |
402 | |
403 return position - nbOfGaps | |
404 | |
405 def getAllelesOfASubSeq(self, BioSeqDb, start, end): | |
406 lAlleles = [] | |
407 for iBioSeq in BioSeqDb: | |
408 dAlleles = {} | |
409 dAlleles['header'] = iBioSeq.header | |
410 dAlleles['allele'] = iBioSeq.subseq(start, end).sequence | |
411 lAlleles.append(dAlleles) | |
412 | |
413 return lAlleles | |
414 | |
415 def getAnIndelLength(self, dIndel): | |
416 length = 0 | |
417 if(dIndel['type'] == Multifasta2SNPFile.POLYM_TYPE_4_DELETION): | |
418 length = dIndel['end'] - dIndel['start'] + 1 | |
419 else: | |
420 length = len(dIndel['allele']) | |
421 | |
422 return length | |
423 | |
424 def createWrapperFromFile(self, inFileName): | |
425 faF = open(inFileName, "r") | |
426 iBioSeqDB = self._extractSequencesFromInputFile(faF) | |
427 faF.close() | |
428 | |
429 iBioSeqDB.upCase() | |
430 referenceBioseq = iBioSeqDB[0] | |
431 linesBioSeqDB = iBioSeqDB.extractPart(1, iBioSeqDB.getSize() - 1) | |
432 | |
433 try: | |
434 if(FileUtils.isEmpty(inFileName)): | |
435 msg = "The input file is empty!" | |
436 self._logFile.error(self._prefixeWithLineNumber (msg)) | |
437 raise Exception (self._prefixeWithFileName (msg)) | |
438 if(self.isHeaderInRefSeqList(referenceBioseq.header)): | |
439 msg = "This reference sequence already exists in one previous file!" | |
440 self._logFile.error(self._prefixeWithLineNumber (msg)) | |
441 raise Exception (self._prefixeWithLineNumber (msg)) | |
442 except Exception, e : | |
443 raise Exception ("Problem with one input file: \n" + str(e)) | |
444 | |
445 self._lRefSequences.append(referenceBioseq) | |
446 | |
447 return ReferenceBioseqAndLinesBioseqDBWrapper(referenceBioseq, linesBioSeqDB, self._logFile, inFileName) | |
448 | |
449 def isHeaderInRefSeqList(self, header): | |
450 isHeader = False | |
451 for item in self._lRefSequences: | |
452 if item.header == header: | |
453 isHeader = True | |
454 return isHeader | |
455 | |
456 def completeBatchList(self): | |
457 dBatchResults = {'BatchNumber' : self._currentBatchNumber, 'BatchName' : self._batchName, 'GeneName' : self._geneName,'ContactNumber' : "1", | |
458 'ProtocolNumber' : "1", 'ThematicNumber' : "1", 'RefSeqName': self._wrapper._iReferenceBioseq.header} | |
459 | |
460 self._lBatchFileResults.append(dBatchResults) | |
461 | |
462 return self._lBatchFileResults | |
463 | |
464 def getLineAsAHeader(self, lineToBeCheck, lineNumber = 0): | |
465 """ | |
466 header line begin with the tag(or token) '>' tag | |
467 ended with an carriage return | |
468 contain The name of sequence must respect this alphabet [a-zA-Z0-9_-:] | |
469 """ | |
470 obsHeader = lineToBeCheck | |
471 if obsHeader[0]!=">" : | |
472 msg = "tag '>' omitted before header" | |
473 self._logFile.error(self._prefixeWithLineNumber (msg)) | |
474 raise Exception (self._prefixeWithLineNumber (msg)) | |
475 else : | |
476 obsHeader = obsHeader[1:] | |
477 obsHeader = obsHeader.replace ("\n","") | |
478 obsHeader = self._removeRepeatedBlanksInAStr(obsHeader) | |
479 obsHeader = self._replaceBlankByUnderScoreInAStr(obsHeader) | |
480 if self.checkHeaderAlphabet(obsHeader) : | |
481 return obsHeader | |
482 self._logFile.error(self._prefixeWithLineNumber ("fatal error on header")) | |
483 raise Exception (self._prefixeWithLineNumber ("fatal error on header")) | |
484 | |
485 def getLineAsASeq(self, lineToBeCheck): | |
486 """ | |
487 Sequence line | |
488 ended with an carriage return | |
489 contain only character of the IUPAC alphabet | |
490 """ | |
491 obsSeq = str.upper(lineToBeCheck) | |
492 obsSeq = obsSeq.replace ("\n","") | |
493 obsSeq = obsSeq.replace ("\r","") | |
494 obsLine = obsSeq.replace("-","") | |
495 if not self.isIUPAC_bases(obsLine) : | |
496 msg = "the sequence contain a non nucleic character " | |
497 self._logFile.error(self._prefixeWithLineNumber (msg)) | |
498 raise Exception (self._prefixeWithLineNumber (msg)) | |
499 return obsSeq | |
500 | |
501 def checkHeaderAlphabet( self, strToCheck): | |
502 """ | |
503 Check the string | |
504 the string is not a header when founding a pattern not corresponding to the regexp | |
505 \W Matches any non-alphanumeric character; this is equivalent to the class [^a-zA-Z0-9_-:]. | |
506 """ | |
507 if strToCheck=="": | |
508 return False | |
509 p = re.compile('[^a-zA-Z0-9_:\-]', re.IGNORECASE) #p = re.compile('(\W|-|:)+', re.IGNORECASE) | |
510 errList=p.findall(strToCheck) | |
511 if len( errList ) > 0 : | |
512 return False | |
513 else: | |
514 return True | |
515 | |
516 ## Check the string is nucleotides sequence from the DNA_ALPHABET_WITH_N = ["A","T","G","C","N"] of IUPAC nomenclature. | |
517 # @return True if sequence contain A, T, G, C or N False otherwise | |
518 # | |
519 def isDNA_bases( self, sequence): | |
520 if sequence == "" : | |
521 return False | |
522 | |
523 setFromString = set() | |
524 | |
525 for nt in sequence : | |
526 setFromString.add(nt) | |
527 | |
528 return setFromString.issubset(DNA_ALPHABET_WITH_N_AND_DELS) | |
529 | |
530 ## Check if the string is nucleotides sequence from the IUPAC ALPHABET . | |
531 # @return True if sequence contain IUPAC letters False otherwise | |
532 # | |
533 def isIUPAC_bases( self, sequence): | |
534 if sequence == "" : | |
535 return False | |
536 | |
537 setFromString = set() | |
538 | |
539 for nt in sequence : | |
540 setFromString.add(nt) | |
541 | |
542 return setFromString.issubset(IUPAC) | |
543 | |
544 def _writeAllOutputFiles(self): | |
545 writer = Multifasta2SNPFileWriter() | |
546 writer.write(self) | |
547 | |
548 def _sortSubSNPResultByBatchPositionAndLineName(self, lSubSNPResults): | |
549 return sorted(lSubSNPResults, key=lambda SNPresults: (SNPresults['batchNumber'], SNPresults['position'], SNPresults['lineName'])) | |
550 | |
551 def _formatSubSNPName(self, currentLineHeader, position, polymType): | |
552 shortPolymType = polymType[:3] | |
553 return self._batchName + "_" + shortPolymType + "_" + str(position) + "_" + currentLineHeader | |
554 | |
555 def _isSNPDetected(self, referenceSequence, index, nt): | |
556 if((nt != referenceSequence[index]) and (nt.upper() != "N") and (referenceSequence[index].upper() != "N")): | |
557 return True | |
558 else: | |
559 return False | |
560 | |
561 def _extractSequencesFromInputFile(self, inFile): | |
562 # attention : DNA_ALPHABET_WITH_N_AND_DELS = Set (['A','T','G','C','N']) no including "gap" | |
563 lInFileLines = inFile.readlines() | |
564 nbOfLines = len(lInFileLines) - 1 | |
565 #premiere lecture | |
566 self._iCurrentLineNumber = 0 | |
567 isSameSeq = False | |
568 newSeq = "" | |
569 bioseqDB = BioseqDB () | |
570 while self._iCurrentLineNumber < nbOfLines : | |
571 bioseq = Bioseq() | |
572 bioseq.header = self.getLineAsAHeader( lInFileLines[self._iCurrentLineNumber] ) | |
573 isSameSeq = True | |
574 while isSameSeq and (self._iCurrentLineNumber < nbOfLines) : | |
575 self._iCurrentLineNumber +=1 | |
576 if lInFileLines[self._iCurrentLineNumber][0] == ">" : | |
577 isSameSeq = False | |
578 else : | |
579 newSeq = newSeq + self.getLineAsASeq( lInFileLines[self._iCurrentLineNumber] ) | |
580 isSameSeq = True | |
581 bioseq.setSequence(newSeq) | |
582 newSeq = "" | |
583 bioseqDB.add(bioseq) | |
584 return bioseqDB | |
585 | |
586 def _removeRepeatedBlanksInAStr (self, StrToClean ): | |
587 resStr=StrToClean.expandtabs(2) | |
588 compResStr=resStr.replace (" "," ") | |
589 while compResStr != resStr : | |
590 resStr=compResStr | |
591 compResStr=resStr.replace (" "," ") | |
592 return resStr | |
593 | |
594 def _replaceBlankByUnderScoreInAStr (self, StrToClean ): | |
595 resStr = StrToClean.replace (" ","_") | |
596 return resStr | |
597 | |
598 def _prefixeWithLineNumber (self, strMsg): | |
599 resStr = "File: " + self._currentFileName + "\t" | |
600 resStr += "Line %i " % (self._iCurrentLineNumber+1 ) + strMsg | |
601 return resStr | |
602 | |
603 def _prefixeWithFileName (self, strMsg): | |
604 resStr = "File: " + self._currentFileName + "\n" + strMsg | |
605 return resStr | |
606 | |
607 | |
608 def _completeAlleleSetWithCurrentAllele(self, dAlleleFileResults, dnaBase): | |
609 if dAlleleFileResults.has_key(dnaBase): | |
610 return dAlleleFileResults | |
611 else: | |
612 iAlleleNumber = len(dAlleleFileResults) + 1 | |
613 dAlleleFileResults[dnaBase] = iAlleleNumber | |
614 return dAlleleFileResults | |
615 | |
616 def _completeIndividualListWithCurrentIndividual(self, lIndividualResults, lineName): | |
617 if lIndividualResults == []: | |
618 iIndividualNumber = 1 | |
619 else: | |
620 iIndividualNumber = len(lIndividualResults) + 1 | |
621 | |
622 #TODO: transformer la liste de dictionnaire en liste d'objets | |
623 if not (self._checkIfALineExistInList(lIndividualResults, lineName)): | |
624 dIndividual2Add = {'individualNumber': iIndividualNumber, 'individualName': lineName, 'scientificName': self._taxon} | |
625 lIndividualResults.append(dIndividual2Add) | |
626 | |
627 return lIndividualResults | |
628 | |
629 def _completeBatchLineListWithCurrentIndividual(self, lBatchLineResults, lIndividualResults, lineName): | |
630 lineDict = self._getALineDictFromADictListWithALineName(lIndividualResults, lineName) | |
631 | |
632 if len(lineDict) != 0: | |
633 if(lineDict.has_key('individualNumber')): | |
634 indivNumberOfTheLineDict = lineDict['individualNumber'] | |
635 else: | |
636 msg = "Problem with the batchLine results construction: individual named " + lineName + " has no individual number!" | |
637 self._logFile.error(msg) | |
638 raise Exception (msg) | |
639 else: | |
640 msg = "Problem with the batchLine results construction: individual named " + lineName + " not in individual list!" | |
641 self._logFile.error(msg) | |
642 raise Exception (msg) | |
643 | |
644 dResults2Add = {'IndividualNumber': str(indivNumberOfTheLineDict), 'BatchNumber' : self._currentBatchNumber} | |
645 lBatchLineResults.append(dResults2Add) | |
646 return lBatchLineResults | |
647 | |
648 def _getALineDictFromADictListWithALineName(self, lDictList, lineName): | |
649 dictToReturn = {} | |
650 for myDict in lDictList: | |
651 if myDict['individualName'] == lineName: | |
652 dictToReturn = myDict | |
653 | |
654 return dictToReturn | |
655 | |
656 def _checkIfALineExistInList(self, lDictList, lineName): | |
657 for myDict in lDictList: | |
658 if myDict['individualName'] == lineName: | |
659 return True | |
660 return False | |
661 | |
662 def _getCurrentBatchResult(self): | |
663 return self._lBatchFileResults[self._currentBatchNumber-1] | |
664 | |
665 | |
666 | |
667 | |
668 class ReferenceBioseqAndLinesBioseqDBWrapper (object): | |
669 | |
670 def __init__ (self, iReferenceBioseq, iLinesBioSeqDB, logger, fileName): | |
671 self._iReferenceBioseq = iReferenceBioseq | |
672 self._iLinesBioseqDB = iLinesBioSeqDB | |
673 self._logger = logger | |
674 self._currentFileName = fileName | |
675 self._checkAllSeqs() | |
676 | |
677 | |
678 def _checkAllSeqs(self): | |
679 self._iReferenceBioseq.checkEOF() | |
680 refSeqLen = self._iReferenceBioseq.getLength() | |
681 | |
682 for seq in self._iLinesBioseqDB.db: | |
683 seq.checkEOF() | |
684 if(not seq.getLength() == refSeqLen): | |
685 msg = "File: " + self._currentFileName + ", problem with the sequence " + seq.header + ": its length is different from the reference seq! All the sequences must have the same length.\n" | |
686 msg += "refseq length: " + str(refSeqLen) + "\n" | |
687 msg += "seq length: " + str(seq.getLength()) + "\n" | |
688 self._logger.error(msg) | |
689 raise Exception (msg) | |
690 | |
691 def getLinesBioseqInstances(self): | |
692 return self._iLinesBioseqDB.db | |
693 | |
694 def getReferenceSequence(self): | |
695 return self._iReferenceBioseq.sequence | |
696 | |
697 class Multifasta2SNPFileWriter(object): | |
698 | |
699 SUB_SNP_FILE_HEADER = ["SubSNPName","ConfidenceValue","Type","Position","5flank", | |
700 "3flank","Length","BatchNumber","IndividualNumber","PrimerType","PrimerNumber","Forward_or_Reverse","AlleleNumber"] | |
701 | |
702 ALLELE_FILE_HEADER = ["AlleleNumber","Value","Motif","NbCopy","Comment"] | |
703 | |
704 INDIVIDUAL_FILE_HEADER = ["IndividualNumber","IndividualName","Description","AberrAneuploide", | |
705 "FractionLength","DeletionLineSynthesis","UrlEarImage","TypeLine","ChromNumber","ArmChrom","DeletionBin","ScientificName", | |
706 "local_germplasm_name","submitter_code","local_institute","donor_institute","donor_acc_id"] | |
707 | |
708 SEQUENCE_CSV_FILE_HEADER = ["SequenceName","SeqType","BankName","BankVersion","ACNumber","Locus","ScientificName"] | |
709 | |
710 BATCH_TXT_FILE_HEADER = ["BatchNumber", "BatchName", "GeneName", "Description", "ContactNumber", "ProtocolNumber", "ThematicNumber", "RefSeqName", "AlignmentFileName", "SeqName"] | |
711 | |
712 BATCH_LINE_FILE_HEADER = ["IndividualNumber", "Pos5", "Pos3", "BatchNumber", "Sequence"] | |
713 | |
714 def __init__(self): | |
715 self._csvFieldSeparator = ";" | |
716 self._txtSubFieldSeparator = ": " | |
717 self._txtFieldSeparator = "\n" | |
718 self._primerType = "Sequence" | |
719 self._csvLineSeparator = "\n" | |
720 self._txtLineSeparator = "//\n" | |
721 | |
722 def write(self, iMultifasta2SNPFile): | |
723 self._writeSubSNPFile(iMultifasta2SNPFile._outSubSNPFileName, iMultifasta2SNPFile._lSubSNPFileResults) | |
724 self._writeAlleleFile(iMultifasta2SNPFile._outAlleleFileName, iMultifasta2SNPFile._dAlleleFileResults) | |
725 self._writeIndividualFile(iMultifasta2SNPFile._outIndividualFileName, iMultifasta2SNPFile._lIndividualFileResults) | |
726 self._writeSequenceFiles(iMultifasta2SNPFile._outSequenceFSAFileName, iMultifasta2SNPFile._outSequenceCSVFileName, iMultifasta2SNPFile._lRefSequences, iMultifasta2SNPFile._taxon) | |
727 self._writeBatchFile(iMultifasta2SNPFile._outBatchFileName, iMultifasta2SNPFile._lBatchFileResults) | |
728 self._writeBatchLineFile(iMultifasta2SNPFile._outBatchLineFileName, iMultifasta2SNPFile._lBatchLineFileResults) | |
729 | |
730 def sortAlleleResultByAlleleNumber(self, dAlleleFileResults): | |
731 return sorted(dAlleleFileResults.items(), key=lambda(k,v):(v,k)) | |
732 | |
733 def _writeSubSNPFile(self, subSNPFileName, lSNP2Write): | |
734 outF = open(subSNPFileName, "w") | |
735 self._writeSNPFileHeader(outF) | |
736 for dSNP in lSNP2Write: | |
737 self._writeSNPFileLine(outF, dSNP) | |
738 outF.close() | |
739 | |
740 def _writeAlleleFile(self, alleleFileName, dAllele2Write): | |
741 outF = open(alleleFileName, "w") | |
742 self._writeAlleleFileHeader(outF) | |
743 lAlleleSortedResults = self.sortAlleleResultByAlleleNumber(dAllele2Write) | |
744 for tAllele in lAlleleSortedResults: | |
745 self._writeAlleleFileLine(outF, tAllele[0], tAllele[1]) | |
746 | |
747 outF.close() | |
748 | |
749 def _writeIndividualFile(self, individualFileName, lIndividual2Write): | |
750 sorted(lIndividual2Write, key=lambda Individual: (Individual['individualNumber'])) | |
751 outF = open(individualFileName, "w") | |
752 self._writeIndividualFileHeader(outF) | |
753 | |
754 for dIndiv in lIndividual2Write: | |
755 self._writeIndividualFileLine(outF, dIndiv) | |
756 | |
757 outF.close() | |
758 | |
759 def _writeSequenceFiles(self, sequenceFSAFileName, sequenceCSVFileName, lRefSequences, taxon): | |
760 outFSA = open(sequenceFSAFileName, "w") | |
761 outCSV = open(sequenceCSVFileName, "w") | |
762 self._writeSequenceCSVHeader(outCSV) | |
763 | |
764 for refSeq in lRefSequences: | |
765 refSeq.cleanGap() | |
766 self._writeSequenceFSAFile(outFSA, refSeq) | |
767 self._writeSequenceCSVLine(outCSV, refSeq, taxon) | |
768 | |
769 outFSA.close() | |
770 outCSV.close() | |
771 | |
772 def _writeSequenceFSAFile(self, outF, refSeq): | |
773 outF.write( ">%s\n" % ( refSeq.header ) ) | |
774 outF.write( "%s\n" % ( refSeq.sequence[0:refSeq.getLength()] ) ) | |
775 | |
776 | |
777 def _writeBatchFile(self, batchFileName, lBatchResults): | |
778 outF = open(batchFileName, "w") | |
779 for dBatchResults in lBatchResults: | |
780 for head in Multifasta2SNPFileWriter.BATCH_TXT_FILE_HEADER[:]: | |
781 if dBatchResults.has_key(head): | |
782 outF.write(head + self._txtSubFieldSeparator + str(dBatchResults[head]) + self._txtFieldSeparator) | |
783 else: | |
784 outF.write(head + self._txtSubFieldSeparator + self._txtFieldSeparator) | |
785 | |
786 outF.write(self._txtLineSeparator) | |
787 | |
788 outF.close() | |
789 | |
790 def _writeBatchLineFile(self, batchLineFileName, lBatchLineResults): | |
791 outF = open(batchLineFileName, "w") | |
792 self._writeBatchLineFileHeader(outF) | |
793 for dResult in lBatchLineResults: | |
794 self._writeBatchLineFileLine(outF, dResult) | |
795 outF.close() | |
796 | |
797 def _writeSNPFileHeader(self, outF): | |
798 for head in Multifasta2SNPFileWriter.SUB_SNP_FILE_HEADER[:-1]: | |
799 outF.write(head + self._csvFieldSeparator) | |
800 outF.write(Multifasta2SNPFileWriter.SUB_SNP_FILE_HEADER[-1] + self._csvLineSeparator) | |
801 | |
802 def _writeAlleleFileHeader(self, outF): | |
803 for head in Multifasta2SNPFileWriter.ALLELE_FILE_HEADER[:-1]: | |
804 outF.write(head + self._csvFieldSeparator) | |
805 outF.write(Multifasta2SNPFileWriter.ALLELE_FILE_HEADER[-1] + self._csvLineSeparator) | |
806 | |
807 def _writeIndividualFileHeader(self, outF): | |
808 for head in Multifasta2SNPFileWriter.INDIVIDUAL_FILE_HEADER[:-1]: | |
809 outF.write(head + self._csvFieldSeparator) | |
810 outF.write(Multifasta2SNPFileWriter.INDIVIDUAL_FILE_HEADER[-1] + self._csvLineSeparator) | |
811 | |
812 def _writeSequenceCSVHeader(self, outF): | |
813 for head in Multifasta2SNPFileWriter.SEQUENCE_CSV_FILE_HEADER[:-1]: | |
814 outF.write(head + self._csvFieldSeparator) | |
815 outF.write(Multifasta2SNPFileWriter.SEQUENCE_CSV_FILE_HEADER[-1] + self._csvLineSeparator) | |
816 | |
817 def _writeBatchLineFileHeader(self, outF): | |
818 for head in Multifasta2SNPFileWriter.BATCH_LINE_FILE_HEADER[:-1]: | |
819 outF.write(head + self._csvFieldSeparator) | |
820 outF.write(Multifasta2SNPFileWriter.BATCH_LINE_FILE_HEADER[-1] + self._csvLineSeparator) | |
821 | |
822 def _writeSNPFileLine(self, outF, dSNP): | |
823 outF.write(dSNP['subSNPName'] + self._csvFieldSeparator) | |
824 outF.write(dSNP['confidenceValue'] + self._csvFieldSeparator + dSNP['type'] + self._csvFieldSeparator) | |
825 outF.write(str(dSNP['position']) + self._csvFieldSeparator + dSNP['5flank'] + self._csvFieldSeparator + dSNP['3flank'] + self._csvFieldSeparator) | |
826 outF.write(str(dSNP['length']) + self._csvFieldSeparator + str(dSNP['batchNumber']) + self._csvFieldSeparator) | |
827 outF.write(str(dSNP['lineName']) + self._csvFieldSeparator) | |
828 outF.write(self._primerType + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator + str(dSNP['allele']) + self._csvLineSeparator) | |
829 | |
830 def _writeAlleleFileLine(self, outF, sAllele2Write, iAlleleNumber): | |
831 outF.write(str(iAlleleNumber) + self._csvFieldSeparator) | |
832 outF.write(sAllele2Write + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator + self._csvLineSeparator) | |
833 | |
834 def _writeIndividualFileLine(self, outF, dIndividual): | |
835 outF.write(str(dIndividual['individualNumber']) + self._csvFieldSeparator) | |
836 outF.write(dIndividual['individualName'] + self._csvFieldSeparator + self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator) | |
837 outF.write(dIndividual['scientificName'] + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator+ self._csvFieldSeparator + self._csvFieldSeparator + self._csvLineSeparator) | |
838 | |
839 def _writeSequenceCSVLine(self, outF, refSeq, taxon): | |
840 outF.write(refSeq.header + self._csvFieldSeparator) | |
841 outF.write("Reference" + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator) | |
842 outF.write(taxon + self._csvLineSeparator) | |
843 | |
844 def _writeBatchLineFileLine(self, outF, dResult): | |
845 outF.write(str(dResult['IndividualNumber']) + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator) | |
846 outF.write(str(dResult['BatchNumber']) + self._csvFieldSeparator + self._csvLineSeparator) |