annotate uniqprimer-0.5.0/primertools/includefilemanager.py @ 3:3249d78ecfc2 draft

Uploaded
author dereeper
date Mon, 03 Jan 2022 09:56:55 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
1 '''
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
2 Created on Jan 1, 2011
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
3
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
4 @author: John L. Herndon
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
5 @contact: herndon@cs.colostate.edu
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
6 @organization: Colorado State University
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
7 @group: Computer Science Department, Asa Ben-Hur's laboratory
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
8 '''
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
9
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
10
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
11 import fastaparser
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
12 import utils
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
13 import os
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
14 import programs
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
15 import nucmerparser
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
16 import copy
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
17
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
class IncludeFileManager( object ):
    """
    A class to manage include files.

    The first include file handed to processIncludeFile( ) becomes the
    reference: nucmer is run against the exclude file and every region of the
    reference that nucmer did not match is kept as a unique sequence.
    Later include files are aligned against those unique sequences.
    """
    #This class needs some work. Need to come up with a way to find unique sequences between all include files....

    def __init__( self ):
        """
        Constructor
        """
        self.includeFiles = [ ]
        self.nucmer = programs.Nucmer( )
        self.isExcludeFileInitialized = False
        self.isReferenceFileInitialized = False
        # Defined up front so the attribute always exists, even before
        # setExcludeFile( ) has been called.
        self.excludeFileName = None
        self.referenceFile = None
        self.referenceSequence = None
        self.uniqueSequences = None

    def setExcludeFile( self, excludeFileName ):
        """
        A function to set the exclude file that will be used when nucmer is called
        """

        utils.logMessage( "IncludeFileManager::setExcludeFile( )", "fileName {0}".format( excludeFileName ) )
        self.excludeFileName = excludeFileName
        self.isExcludeFileInitialized = True

    def _alignAndAnnotate( self, referenceFileName, queryFileName, warnOnUnknownId ):
        """
        Run nucmer on the two files and attach every reported match to the
        sequence of referenceFileName it belongs to.

        referenceFileName - fasta file whose sequences are parsed and annotated
        queryFileName     - fasta file passed to nucmer as the second input
        warnOnUnknownId   - when true, print and log a warning for each match
                            whose seqID is not among the parsed sequences;
                            when false such matches are skipped silently

        Returns the mapping produced by
        fastaparser.parseFastaFileAsPrimerSequence( referenceFileName ).
        """
        coordFile = self.nucmer.execute( [ referenceFileName, queryFileName ] )

        matches = nucmerparser.parseCoordMatchFile( coordFile )
        sequences = fastaparser.parseFastaFileAsPrimerSequence( referenceFileName )

        for match in matches:
            # "in" replaces the Python 2 only dict.has_key( ).
            if match.seqID in sequences:
                sequences[ match.seqID ].addMatch( match )
            elif warnOnUnknownId:
                print( "Warning: id from .coords file not found in sequence data..." )
                utils.logMessage( "IncludeFileManager::processMatches( )", "WARNING - an ID was read in a Match that does not correspond to a sequence read from the fasta file!" )

        return sequences

    def findUniqueSequencesInFile(self, doWantFile, doNotWantFile ):
        """
        Return a list with the sub-sequences of doWantFile that nucmer did not
        match against doNotWantFile.

        A warning is printed and logged for every match whose sequence id is
        not present in doWantFile.
        """
        utils.logMessage( "IncludeFileManager::findUniqueSequence( )", "running nucmer for reference file: {0}".format( doWantFile ) )
        sequences = self._alignAndAnnotate( doWantFile, doNotWantFile, True )

        returnValue = [ ]

        for key in sequences:
            returnValue.extend( sequences[ key ].getNonMatchedSubSequences( ) )

        return returnValue

    def findCommonSequencesInFile(self, want, alsoWant ):
        """
        Return a list with the sub-sequences of want that nucmer matched
        against alsoWant.

        Matches whose sequence id is not present in want are ignored.
        """
        utils.logMessage( "IncludeFileManager::findCommonSequencesInFile( )", "running nucmer for reference file: {0}".format( want ) )

        # Single formatted argument: prints "want alsoWant" on Python 2 and 3.
        print( "{0} {1}".format( want, alsoWant ) )
        sequences = self._alignAndAnnotate( want, alsoWant, False )

        returnValue = [ ]
        for key in sequences:
            returnValue.extend( sequences[ key ].getMatchedSubSequences( ) )

        return returnValue

    def processIncludeFile( self, includeFileName ):
        """
        A function that adds and processes an include file.
        An exclude file must be set for this function to be called.

        Raises utils.ModuleNotInitializedException when no exclude file has
        been set.
        """

        utils.logMessage( "IncludeFileManager::processIncludeFile( )", "processing {0}".format( includeFileName ) )

        if not self.isExcludeFileInitialized:
            utils.logMessage( "IncludeFileManager::processIncludeFile( )", "no exclude file set" )
            raise utils.ModuleNotInitializedException( "includefilemanager", "no exclude file set" )

        if not self.isReferenceFileInitialized:
            # The first include file becomes the reference; its regions that
            # are absent from the exclude file are the unique sequences.
            utils.logMessage( "IncludeFileManager::processIncludeFile( )", "running nucmer for reference file: {0}".format( includeFileName ) )
            self.uniqueSequences = self.findUniqueSequencesInFile( includeFileName, self.excludeFileName )

            self.referenceFile = includeFileName
            self.isReferenceFileInitialized = True

        else:
            #write the unique sequences to a temp file
            tempSequences = utils.getTemporaryDirectory( ) + "/tempSequences.fasta"
            fastaparser.writeFastaFile( self.uniqueSequences, tempSequences )
            # NOTE(review): the returned common sequences are discarded, so
            # self.uniqueSequences still reflects only the reference file.
            # Kept as-is because narrowing the result changes what callers
            # receive - confirm the intended behaviour before assigning it.
            self.findCommonSequencesInFile( includeFileName, tempSequences )
            # NOTE(review): assumed to belong to this branch, i.e. the
            # reference file itself is not recorded in includeFiles - confirm.
            self.includeFiles.append( includeFileName )

    def getUniqueSequences( self ):
        """
        getUniqueSequences - return the list of unique sequences computed when
        the reference include file was processed, or None when no include file
        has been processed yet.
        """

        return self.uniqueSequences
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
130
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
131
3249d78ecfc2 Uploaded
dereeper
parents:
diff changeset
132