3
|
'''
Created on Jan 1, 2011

@author: John L. Herndon
@contact: herndon@cs.colostate.edu
@organization: Colorado State University
@group: Computer Science Department, Asa Ben-Hur's laboratory
'''
|
|
9
|
|
10
|
|
11 import fastaparser
|
|
12 import utils
|
|
13 import os
|
|
14 import programs
|
|
15 import nucmerparser
|
|
16 import copy
|
|
17
|
|
class IncludeFileManager( object ):
    """
    A class to manage include files.

    The first include file handed to processIncludeFile( ) becomes the
    reference file: it is aligned against the exclude file with nucmer and
    its non-matched sub-sequences are stored as the unique sequences. Later
    include files are aligned against those stored sequences.
    """
    #This class needs some work. Need to come up with a way to find unique sequences between all include files....

    def __init__( self ):
        """
        Constructor. Creates the nucmer wrapper and resets all state.
        """
        self.includeFiles = [ ]
        self.nucmer = programs.Nucmer( )
        self.isExcludeFileInitialized = False
        self.isReferenceFileInitialized = False
        # Defined up front so the attribute exists before setExcludeFile( ) is called.
        self.excludeFileName = None
        self.referenceFile = None
        self.referenceSequence = None
        self.uniqueSequences = None

    def setExcludeFile( self, excludeFileName ):
        """
        Set the exclude file that will be used when nucmer is called.

        Must be called before processIncludeFile( ).
        """
        utils.logMessage( "IncludeFileManager::setExcludeFile( )", "fileName {0}".format( excludeFileName ) )
        self.excludeFileName = excludeFileName
        self.isExcludeFileInitialized = True

    def _annotateSequencesWithMatches( self, queryFile, otherFile, warnOnUnknownID ):
        """
        Run nucmer on ( queryFile, otherFile ), parse the resulting coords
        file, and attach every match to the sequence of queryFile it refers to.

        Returns the mapping produced by
        fastaparser.parseFastaFileAsPrimerSequence( queryFile ), after
        addMatch( ) has been called on each sequence that has a match.
        When warnOnUnknownID is true, a match whose seqID is not among the
        parsed sequences is reported on stdout and in the log; otherwise it
        is skipped silently.
        """
        coordFile = self.nucmer.execute( [ queryFile, otherFile ] )

        matches = nucmerparser.parseCoordMatchFile( coordFile )
        sequences = fastaparser.parseFastaFileAsPrimerSequence( queryFile )

        for match in matches:
            if match.seqID in sequences:
                sequences[ match.seqID ].addMatch( match )
            elif warnOnUnknownID:
                print( "Warning: id from .coords file not found in sequence data..." )
                utils.logMessage( "IncludeFileManager::findUniqueSequence( )", "WARNING - an ID was read in a Match that does not correspond to a sequence read from the fasta file!" )

        return sequences

    def findUniqueSequencesInFile(self, doWantFile, doNotWantFile ):
        """
        Return a list of the sub-sequences of doWantFile that nucmer did not
        match against doNotWantFile.

        Match IDs that do not correspond to a sequence in doWantFile are
        reported as warnings and otherwise ignored.
        """
        utils.logMessage( "IncludeFileManager::findUniqueSequence( )", "running nucmer for reference file: {0}".format( doWantFile ) )
        sequences = self._annotateSequencesWithMatches( doWantFile, doNotWantFile, True )

        returnValue = [ ]
        for sequence in sequences.values( ):
            returnValue.extend( sequence.getNonMatchedSubSequences( ) )

        return returnValue

    def findCommonSequencesInFile(self, want, alsoWant ):
        """
        Return a list of the sub-sequences of want that nucmer matched
        against alsoWant.

        Match IDs that do not correspond to a sequence in want are skipped
        silently.
        """
        utils.logMessage( "IncludeFileManager::findCommonSequencesInFile( )", "running nucmer for reference file: {0}".format( want ) )

        # Same stdout output as the former "print want, alsoWant" statement,
        # written so that it parses under both Python 2 and Python 3.
        print( "{0} {1}".format( want, alsoWant ) )
        sequences = self._annotateSequencesWithMatches( want, alsoWant, False )

        returnValue = [ ]
        for sequence in sequences.values( ):
            returnValue.extend( sequence.getMatchedSubSequences( ) )

        return returnValue

    def processIncludeFile( self, includeFileName ):
        """
        Add and process an include file.

        The first file processed becomes the reference file and defines the
        unique sequences. Each later file is aligned against those sequences.

        Raises utils.ModuleNotInitializedException if no exclude file has
        been set with setExcludeFile( ).
        """
        utils.logMessage( "IncludeFileManager::processIncludeFile( )", "processing {0}".format( includeFileName ) )

        if not self.isExcludeFileInitialized:
            utils.logMessage( "IncludeFileManager::processIncludeFile( )", "no exclude file set" )
            raise utils.ModuleNotInitializedException( "includefilemanager", "no exclude file set" )

        if not self.isReferenceFileInitialized:
            utils.logMessage( "IncludeFileManager::processIncludeFile( )", "running nucmer for reference file: {0}".format( includeFileName ) )
            self.uniqueSequences = self.findUniqueSequencesInFile( includeFileName, self.excludeFileName )

            self.referenceFile = includeFileName
            self.isReferenceFileInitialized = True
        else:
            #write the unique sequences to a temp file
            tempSequences = os.path.join( utils.getTemporaryDirectory( ), "tempSequences.fasta" )
            fastaparser.writeFastaFile( self.uniqueSequences, tempSequences )
            # NOTE(review): the common sequences are computed but the result is
            # discarded, so additional include files never change
            # self.uniqueSequences. Kept as-is because the intended merge
            # strategy is not defined yet (see the to-do on the class).
            self.findCommonSequencesInFile( includeFileName, tempSequences )
            # NOTE(review): assumed to belong to this branch, i.e. the reference
            # file is tracked only in self.referenceFile -- confirm.
            self.includeFiles.append( includeFileName )

    def getUniqueSequences( self ):
        """
        Return the unique sequences computed for the reference include file.

        This is the list built by findUniqueSequencesInFile( ), or None if no
        include file has been processed yet.
        """
        return self.uniqueSequences
|
|
130
|
|
131
|
|
132 |