comparison commons/tools/OrientSequences.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
1 #!/usr/bin/env python
2
3 """
4 Interface to orient sequences before making a multiple alignment.
5 Use hashing or suffix tree to get an idea of the appropriate strand.
6 Use 'orienter' by default, otherwise use 'mummer'.
7 """
8
9 import sys
10 import os
11 import glob
12 import getopt
13
14 from commons.core.seq.BioseqDB import BioseqDB
15 import pyRepet.seq.fastaDB
16 from commons.core.checker.CheckerUtils import CheckerUtils
17
18 class OrientSequences( object ):
19 """
20 Interface to orient sequences before making a multiple alignment.
21 Use hashing or suffix tree to get an idea of the appropriate strand.
22 Use 'orienter' by default, otherwise use 'mummer'.
23 """
24
25 def __init__(self, inFileName="", minMatchLength=10, prgToOrient = "orienter", outFileName="", clean=False, verbosity=1):
26 """
27 Constructor.
28 """
29 self._inFileName = inFileName
30 self._minMatchLength = minMatchLength
31 self._prgToOrient = prgToOrient
32 self._outFileName = outFileName
33 self._clean = clean
34 self._verbose = verbosity
35
36 def help( self ):
37 """
38 Display the help on stdout.
39 """
40 print
41 print "usage:",sys.argv[0].split("/")[-1],"[options]"
42 print "options:"
43 print " -h: this help"
44 print " -i: name of the input file (format='fasta')"
45 print " -m: minimum match length (default=10)"
46 print " -p: program to use first (default=orienter/mummer)"
47 print " -o: name of the output file (default=inFileName+'.oriented')"
48 print " -c: clean"
49 print " -v: verbosity level (0/default=1/2)"
50 print
51
52 def setAttributesFromCmdLine( self ):
53 """
54 Set the attributes from the command-line.
55 """
56 try:
57 opts, args = getopt.getopt(sys.argv[1:],"hi:m:p:o:cv:")
58 except getopt.GetoptError, err:
59 print str(err); self.help(); sys.exit(1)
60 for o,a in opts:
61 if o == "-h":
62 self.help(); sys.exit(0)
63 elif o == "-i":
64 self.setInputFileName( a )
65 elif o == "-m":
66 self.setMinMatchLength( a )
67 elif o == "-p":
68 self.setPrgToOrient( a )
69 elif o == "-o":
70 self.setOutputFileName( a )
71 elif o == "-c":
72 self.setClean()
73 elif o == "-v":
74 self.setVerbosityLevel( a )
75
76 def setInputFileName( self, inFileName ):
77 self._inFileName = inFileName
78
79 def setMinMatchLength( self, minMatchLength ):
80 self._minMatchLength = int(minMatchLength)
81
82 def setPrgToOrient( self, prgToOrient ):
83 self._prgToOrient = prgToOrient
84
85 def setOutputFileName( self, outFileName ):
86 self._outFileName = outFileName
87
88 def setClean( self ):
89 self._clean = True
90
91 def setVerbosityLevel( self, verbose ):
92 self._verbose = int(verbose)
93
94 def checkAttributes( self ):
95 """
96 Check the attributes are valid before running the algorithm.
97 """
98 if self._inFileName == "":
99 print "ERROR: missing input file name"
100 self.help(); sys.exit(1)
101 if not os.path.exists( self._inFileName ):
102 print "ERROR: input file '%s' doesn't exist" % ( self._inFileName )
103 self.help(); sys.exit(1)
104 if self._prgToOrient not in [ "orienter", "mummer" ]:
105 print "ERROR: unknown program '%s'" % ( self._prgToOrient )
106 self.help(); sys.exit(1)
107 if self._outFileName == "":
108 self._outFileName = "%s.oriented" % ( self._inFileName )
109
110 def useOrienter( self ):
111 """
112 Use 'orienter'.
113 @return: exit value of 'orienter'
114 """
115 prg = "orienter"
116 cmd = prg
117 cmd += " -k"
118 # cmd += " -l %i" % ( self._minMatchLength )
119 cmd += " -v %i" % ( self._verbose )
120 cmd += " %s" % ( self._inFileName )
121 log = os.system( cmd )
122 if log == 0 and self._outFileName.split(".")[-1] != "oriented":
123 os.rename( self._inFileName + ".oriented", self._outFileName )
124 return log
125
126 def compareInputSequencesWithMummer( self, nbInSeq ):
127 """
128 Launch MUmmer on two single-sequence fasta files to find all maximal matches regardless of their uniqueness and record stdout.
129 Only N(N-1)/2 comparisons are made.
130 @param nbInSeq: number of input sequences
131 @type nbInSeq: integer
132 """
133 if self._verbose > 0:
134 print "aligning input sequences..."
135 sys.stdout.flush()
136 if not CheckerUtils.isExecutableInUserPath( "mummer" ):
137 msg = "ERROR: 'mummer' is not in your PATH"
138 sys.stderr.write( "%s\n" % ( msg ) )
139 sys.exit(1)
140 lInFiles = glob.glob( "batch_*.fa" )
141 for i in range( 1, nbInSeq+1 ):
142 for j in range( i+1, nbInSeq+1 ):
143 if self._verbose > 1:
144 print "launch MUmmer on %i versus %i" % ( i, j )
145 sys.stdout.flush()
146 prg = "mummer"
147 cmd = prg
148 cmd += " -maxmatch"
149 cmd += " -l %i" % ( self._minMatchLength )
150 cmd += " -b"
151 cmd += " -F"
152 cmd += " batch_%s.fa" % ( str(j).zfill( len( str( len(lInFiles) ) ) ) )
153 cmd += " batch_%s.fa" % ( str(i).zfill( len( str( len(lInFiles) ) ) ) )
154 cmd += " > mummer_%i_vs_%i.txt" % ( i, j )
155 if self._verbose < 3:
156 cmd += " 2> /dev/null"
157 returnStatus = os.system( cmd )
158 if returnStatus != 0:
159 msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus )
160 sys.stderr.write( "%s\n" % ( msg ) )
161 sys.exit(1)
162
163 def parseMummerOutput( self, mummerFileName, nameSeq1, nameSeq2 ):
164 """
165 Parse the output from MUmmer.
166 @param mummerFileName: file with the output from MUmmer
167 @type mummerFileName: string
168 @param nameSeq1: name of the first sequence in the pairwise comparison
169 @type nameSeq1: string
170 @param nameSeq2: name of the first sequence in the pairwise comparison
171 @type nameSeq2: string
172 @return: dictionary whose keys are strands and values the number of maximal matches
173 """
174 if self._verbose > 1:
175 print "parse '%s'" % ( mummerFileName )
176 sys.stdout.flush()
177 dStrand2NbMatches = {}
178 mummerF = open( mummerFileName, "r" )
179 strand = "direct"
180 nbMatches = 0
181 line = mummerF.readline()
182 while True:
183 if line == "":
184 break
185 if line[0] == ">":
186 if nameSeq1 not in line:
187 print "ERROR"
188 print nameSeq1
189 print nameSeq2
190 sys.exit(1)
191 if "Reverse" in line:
192 dStrand2NbMatches[ strand ] = nbMatches
193 strand = "reverse"
194 nbMatches = 0
195 else:
196 nbMatches += 1
197 line = mummerF.readline()
198 dStrand2NbMatches[ strand ] = nbMatches
199 mummerF.close()
200 if self._verbose > 1:
201 print " direct: %i maximal matches" % ( dStrand2NbMatches["direct"] )
202 print " reverse: %i maximal matches" % ( dStrand2NbMatches["reverse"] )
203 sys.stdout.flush()
204 return dStrand2NbMatches
205
206 def getCumulativeMatchLengthsOnBothStrandForEachPairwiseComparison( self, lInHeaders, nbInSeq ):
207 """
208 For each pairwise comparison, retrieve the number of matches on both strand.
209 @param lInHeaders: list of the sequence headers
210 @type lInHeaders: list of strings
211 @param nbInSeq: number of input sequences
212 @type nbInSeq: integer
213 @return: dictionary whose keys are pairwise comparisons and values are number of matches on both strands
214 """
215 dMatrix = {}
216 for i in range( 1, nbInSeq+1 ):
217 for j in range( i+1, nbInSeq+1 ):
218 pairComp = "%i_vs_%i" % ( i, j )
219 dStrand2NbMatches = self.parseMummerOutput( "mummer_%s.txt" % ( pairComp ), lInHeaders[i-1], lInHeaders[j-1] )
220 dMatrix[ pairComp ] = dStrand2NbMatches
221 return dMatrix
222
223 def showResultsAsOrienter( self, nbInSeq, dMatrix ):
224 """
225 Not used for the moment but can be useful...
226 """
227 for i in range( 1, nbInSeq+1 ):
228 for j in range( i+1, nbInSeq+1 ):
229 pairComp = "%i_vs_%i" % ( i, j )
230 string = "aligning %i with %i" % ( i, j )
231 string += " %i %i" % ( dMatrix[pairComp]["direct"], dMatrix[pairComp]["reverse"] )
232 string += " max=%i" % ( max( dMatrix[pairComp]["direct"], dMatrix[pairComp]["reverse"] ) )
233 if dMatrix[pairComp]["reverse"] > dMatrix[pairComp]["direct"]:
234 string += " strand=-"
235 string += " nb=%i" % ( dMatrix[pairComp]["reverse"] )
236 else:
237 string += " strand=+"
238 string += " nb=%i" % ( dMatrix[pairComp]["direct"] )
239 print string; sys.stdout.flush()
240
241 def getSequencesToReverseFromMatrix( self, dMatrix, lNewHeaders ):
242 """
243 Retrieve the sequences than need to be re-oriented.
244 @param dMatrix:
245 @type dMatrix:
246 @param lInHeaders: list of the new sequence headers
247 @type lInHeaders: list of strings
248 @return: list of headers corresponding to sequences than need to be re-oriented
249 """
250 lSeqToOrient = []
251
252 for i in range( 1, len(lNewHeaders)+1 ):
253 for j in range( i+1, len(lNewHeaders)+1 ):
254 comp = "%i_vs_%i" % ( i, j )
255 nbDirectMatches = dMatrix[ comp ][ "direct" ]
256 nbReverseMatches = dMatrix[ comp ][ "reverse" ]
257 if self._verbose > 1:
258 print "%s: direct=%i reverse=%i" % ( comp, nbDirectMatches, nbReverseMatches )
259 if nbReverseMatches > nbDirectMatches and lNewHeaders[i-1] not in lSeqToOrient:
260 if lNewHeaders[ j-1 ] not in lSeqToOrient:
261 if self._verbose > 2:
262 "need to reverse '%s'" % ( lNewHeaders[j-1] )
263 lSeqToOrient.append( lNewHeaders[ j-1 ] )
264 return lSeqToOrient
265
266
267 def getSequencesToReverse( self ):
268 """
269 Return a list with the headers of the sequences to reverse.
270 """
271 lSequenceHeadersToReverse = []
272
273 if self._prgToOrient == "orienter":
274 exitStatus = self.useOrienter()
275 if exitStatus == 0:
276 self.end()
277 sys.exit(0)
278 if exitStatus != 0:
279 print "\nWARNING: 'orienter' had a problem, switching to 'mummer'"
280 sys.stdout.flush()
281
282 lInHeaders = pyRepet.seq.fastaDB.dbHeaders( self._inFileName )
283 nbInSeq = len( lInHeaders )
284 if self._verbose > 0:
285 print "nb of input sequences: %i" % ( nbInSeq )
286 sys.stdout.flush()
287
288 pyRepet.seq.fastaDB.shortenSeqHeaders( self._inFileName, 1 )
289 tmpFileName = "%s.shortH" % ( self._inFileName )
290 lNewHeaders = pyRepet.seq.fastaDB.dbHeaders( tmpFileName )
291 dNew2Init = pyRepet.seq.fastaDB.retrieveLinksNewInitialHeaders( "%slink" % ( tmpFileName ) )
292
293 pyRepet.seq.fastaDB.dbSplit( tmpFileName, nbSeqPerBatch=1, newDir=True )
294 os.chdir( "batches" )
295 self.compareInputSequencesWithMummer( nbInSeq )
296 dMatrix = self.getCumulativeMatchLengthsOnBothStrandForEachPairwiseComparison( lNewHeaders, nbInSeq )
297 os.chdir( ".." )
298
299 lNewHeadersToReverse = self.getSequencesToReverseFromMatrix( dMatrix, lNewHeaders )
300 for newH in lNewHeadersToReverse:
301 lSequenceHeadersToReverse.append( dNew2Init[ newH ] )
302 if self._verbose > 0:
303 print "nb of sequences to reverse: %i" % ( len(lNewHeadersToReverse) )
304 for initH in lSequenceHeadersToReverse: print " %s" % ( initH )
305 sys.stdout.flush()
306
307 if self._clean:
308 os.remove( tmpFileName )
309 os.remove( "%slink" % ( tmpFileName ) )
310
311 return lSequenceHeadersToReverse
312
313 def orientInputSequences( self, lSequenceHeadersToReverse, tmpFileName="" ):
314 """
315 Save input sequences while re-orienting those needing it.
316 @param lSequenceHeadersToReverse: list of headers corresponding to sequences than need to be re-oriented
317 @type lSequenceHeadersToReverse: list of strings
318 @param tmpFileName: name of a fasta file (inFileName by default)
319 @type tmpFileName: string
320 """
321 if self._verbose > 0:
322 print "saving oriented sequences..."
323 sys.stdout.flush()
324 if tmpFileName == "":
325 tmpFileName = self._inFileName
326 inDB = BioseqDB( tmpFileName )
327 outDB = BioseqDB()
328 for bs in inDB.db:
329 if bs.header in lSequenceHeadersToReverse:
330 bs.reverseComplement()
331 bs.header += " re-oriented"
332 outDB.add( bs )
333 outDB.save( self._outFileName )
334
335 def clean( self ):
336 if os.path.exists( "batches" ):
337 os.system( "rm -rf batches" )
338 if os.path.exists( "orienter_error.log" ):
339 os.remove( "orienter_error.log" )
340 for f in glob.glob( "core.*" ):
341 os.remove( f )
342
343 def start( self ):
344 """
345 Useful commands before running the program.
346 """
347 self.checkAttributes()
348 if self._verbose > 0:
349 print "START %s" % ( type(self).__name__ )
350 print "input file: %s" % ( self._inFileName )
351 sys.stdout.flush()
352
353 def end( self ):
354 """
355 Useful commands before ending the program.
356 """
357 if self._clean:
358 self.clean()
359 if self._verbose > 0:
360 print "END %s" % ( type(self).__name__ )
361 sys.stdout.flush()
362
363 def run( self ):
364 """
365 Run the program.
366 """
367 self.start()
368 lSequenceHeadersToReverse = self.getSequencesToReverse()
369 self.orientInputSequences( lSequenceHeadersToReverse )
370 self.end()
371
372 if __name__ == "__main__":
373 i = OrientSequences()
374 i.setAttributesFromCmdLine()
375 i.run()