Mercurial > repos > yufei-luo > s_mart
comparison commons/tools/OrientSequences.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:b0e8584489e6 | 18:94ab73e8a190 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 Interface to orient sequences before making a multiple alignment. | |
5 Use hashing or suffix tree to get an idea of the appropriate strand. | |
6 Use 'orienter' by default, otherwise use 'mummer'. | |
7 """ | |
8 | |
9 import sys | |
10 import os | |
11 import glob | |
12 import getopt | |
13 | |
14 from commons.core.seq.BioseqDB import BioseqDB | |
15 import pyRepet.seq.fastaDB | |
16 from commons.core.checker.CheckerUtils import CheckerUtils | |
17 | |
18 class OrientSequences( object ): | |
19 """ | |
20 Interface to orient sequences before making a multiple alignment. | |
21 Use hashing or suffix tree to get an idea of the appropriate strand. | |
22 Use 'orienter' by default, otherwise use 'mummer'. | |
23 """ | |
24 | |
25 def __init__(self, inFileName="", minMatchLength=10, prgToOrient = "orienter", outFileName="", clean=False, verbosity=1): | |
26 """ | |
27 Constructor. | |
28 """ | |
29 self._inFileName = inFileName | |
30 self._minMatchLength = minMatchLength | |
31 self._prgToOrient = prgToOrient | |
32 self._outFileName = outFileName | |
33 self._clean = clean | |
34 self._verbose = verbosity | |
35 | |
36 def help( self ): | |
37 """ | |
38 Display the help on stdout. | |
39 """ | |
40 print | |
41 print "usage:",sys.argv[0].split("/")[-1],"[options]" | |
42 print "options:" | |
43 print " -h: this help" | |
44 print " -i: name of the input file (format='fasta')" | |
45 print " -m: minimum match length (default=10)" | |
46 print " -p: program to use first (default=orienter/mummer)" | |
47 print " -o: name of the output file (default=inFileName+'.oriented')" | |
48 print " -c: clean" | |
49 print " -v: verbosity level (0/default=1/2)" | |
50 print | |
51 | |
52 def setAttributesFromCmdLine( self ): | |
53 """ | |
54 Set the attributes from the command-line. | |
55 """ | |
56 try: | |
57 opts, args = getopt.getopt(sys.argv[1:],"hi:m:p:o:cv:") | |
58 except getopt.GetoptError, err: | |
59 print str(err); self.help(); sys.exit(1) | |
60 for o,a in opts: | |
61 if o == "-h": | |
62 self.help(); sys.exit(0) | |
63 elif o == "-i": | |
64 self.setInputFileName( a ) | |
65 elif o == "-m": | |
66 self.setMinMatchLength( a ) | |
67 elif o == "-p": | |
68 self.setPrgToOrient( a ) | |
69 elif o == "-o": | |
70 self.setOutputFileName( a ) | |
71 elif o == "-c": | |
72 self.setClean() | |
73 elif o == "-v": | |
74 self.setVerbosityLevel( a ) | |
75 | |
76 def setInputFileName( self, inFileName ): | |
77 self._inFileName = inFileName | |
78 | |
79 def setMinMatchLength( self, minMatchLength ): | |
80 self._minMatchLength = int(minMatchLength) | |
81 | |
82 def setPrgToOrient( self, prgToOrient ): | |
83 self._prgToOrient = prgToOrient | |
84 | |
85 def setOutputFileName( self, outFileName ): | |
86 self._outFileName = outFileName | |
87 | |
88 def setClean( self ): | |
89 self._clean = True | |
90 | |
91 def setVerbosityLevel( self, verbose ): | |
92 self._verbose = int(verbose) | |
93 | |
94 def checkAttributes( self ): | |
95 """ | |
96 Check the attributes are valid before running the algorithm. | |
97 """ | |
98 if self._inFileName == "": | |
99 print "ERROR: missing input file name" | |
100 self.help(); sys.exit(1) | |
101 if not os.path.exists( self._inFileName ): | |
102 print "ERROR: input file '%s' doesn't exist" % ( self._inFileName ) | |
103 self.help(); sys.exit(1) | |
104 if self._prgToOrient not in [ "orienter", "mummer" ]: | |
105 print "ERROR: unknown program '%s'" % ( self._prgToOrient ) | |
106 self.help(); sys.exit(1) | |
107 if self._outFileName == "": | |
108 self._outFileName = "%s.oriented" % ( self._inFileName ) | |
109 | |
110 def useOrienter( self ): | |
111 """ | |
112 Use 'orienter'. | |
113 @return: exit value of 'orienter' | |
114 """ | |
115 prg = "orienter" | |
116 cmd = prg | |
117 cmd += " -k" | |
118 # cmd += " -l %i" % ( self._minMatchLength ) | |
119 cmd += " -v %i" % ( self._verbose ) | |
120 cmd += " %s" % ( self._inFileName ) | |
121 log = os.system( cmd ) | |
122 if log == 0 and self._outFileName.split(".")[-1] != "oriented": | |
123 os.rename( self._inFileName + ".oriented", self._outFileName ) | |
124 return log | |
125 | |
126 def compareInputSequencesWithMummer( self, nbInSeq ): | |
127 """ | |
128 Launch MUmmer on two single-sequence fasta files to find all maximal matches regardless of their uniqueness and record stdout. | |
129 Only N(N-1)/2 comparisons are made. | |
130 @param nbInSeq: number of input sequences | |
131 @type nbInSeq: integer | |
132 """ | |
133 if self._verbose > 0: | |
134 print "aligning input sequences..." | |
135 sys.stdout.flush() | |
136 if not CheckerUtils.isExecutableInUserPath( "mummer" ): | |
137 msg = "ERROR: 'mummer' is not in your PATH" | |
138 sys.stderr.write( "%s\n" % ( msg ) ) | |
139 sys.exit(1) | |
140 lInFiles = glob.glob( "batch_*.fa" ) | |
141 for i in range( 1, nbInSeq+1 ): | |
142 for j in range( i+1, nbInSeq+1 ): | |
143 if self._verbose > 1: | |
144 print "launch MUmmer on %i versus %i" % ( i, j ) | |
145 sys.stdout.flush() | |
146 prg = "mummer" | |
147 cmd = prg | |
148 cmd += " -maxmatch" | |
149 cmd += " -l %i" % ( self._minMatchLength ) | |
150 cmd += " -b" | |
151 cmd += " -F" | |
152 cmd += " batch_%s.fa" % ( str(j).zfill( len( str( len(lInFiles) ) ) ) ) | |
153 cmd += " batch_%s.fa" % ( str(i).zfill( len( str( len(lInFiles) ) ) ) ) | |
154 cmd += " > mummer_%i_vs_%i.txt" % ( i, j ) | |
155 if self._verbose < 3: | |
156 cmd += " 2> /dev/null" | |
157 returnStatus = os.system( cmd ) | |
158 if returnStatus != 0: | |
159 msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus ) | |
160 sys.stderr.write( "%s\n" % ( msg ) ) | |
161 sys.exit(1) | |
162 | |
163 def parseMummerOutput( self, mummerFileName, nameSeq1, nameSeq2 ): | |
164 """ | |
165 Parse the output from MUmmer. | |
166 @param mummerFileName: file with the output from MUmmer | |
167 @type mummerFileName: string | |
168 @param nameSeq1: name of the first sequence in the pairwise comparison | |
169 @type nameSeq1: string | |
170 @param nameSeq2: name of the first sequence in the pairwise comparison | |
171 @type nameSeq2: string | |
172 @return: dictionary whose keys are strands and values the number of maximal matches | |
173 """ | |
174 if self._verbose > 1: | |
175 print "parse '%s'" % ( mummerFileName ) | |
176 sys.stdout.flush() | |
177 dStrand2NbMatches = {} | |
178 mummerF = open( mummerFileName, "r" ) | |
179 strand = "direct" | |
180 nbMatches = 0 | |
181 line = mummerF.readline() | |
182 while True: | |
183 if line == "": | |
184 break | |
185 if line[0] == ">": | |
186 if nameSeq1 not in line: | |
187 print "ERROR" | |
188 print nameSeq1 | |
189 print nameSeq2 | |
190 sys.exit(1) | |
191 if "Reverse" in line: | |
192 dStrand2NbMatches[ strand ] = nbMatches | |
193 strand = "reverse" | |
194 nbMatches = 0 | |
195 else: | |
196 nbMatches += 1 | |
197 line = mummerF.readline() | |
198 dStrand2NbMatches[ strand ] = nbMatches | |
199 mummerF.close() | |
200 if self._verbose > 1: | |
201 print " direct: %i maximal matches" % ( dStrand2NbMatches["direct"] ) | |
202 print " reverse: %i maximal matches" % ( dStrand2NbMatches["reverse"] ) | |
203 sys.stdout.flush() | |
204 return dStrand2NbMatches | |
205 | |
206 def getCumulativeMatchLengthsOnBothStrandForEachPairwiseComparison( self, lInHeaders, nbInSeq ): | |
207 """ | |
208 For each pairwise comparison, retrieve the number of matches on both strand. | |
209 @param lInHeaders: list of the sequence headers | |
210 @type lInHeaders: list of strings | |
211 @param nbInSeq: number of input sequences | |
212 @type nbInSeq: integer | |
213 @return: dictionary whose keys are pairwise comparisons and values are number of matches on both strands | |
214 """ | |
215 dMatrix = {} | |
216 for i in range( 1, nbInSeq+1 ): | |
217 for j in range( i+1, nbInSeq+1 ): | |
218 pairComp = "%i_vs_%i" % ( i, j ) | |
219 dStrand2NbMatches = self.parseMummerOutput( "mummer_%s.txt" % ( pairComp ), lInHeaders[i-1], lInHeaders[j-1] ) | |
220 dMatrix[ pairComp ] = dStrand2NbMatches | |
221 return dMatrix | |
222 | |
223 def showResultsAsOrienter( self, nbInSeq, dMatrix ): | |
224 """ | |
225 Not used for the moment but can be useful... | |
226 """ | |
227 for i in range( 1, nbInSeq+1 ): | |
228 for j in range( i+1, nbInSeq+1 ): | |
229 pairComp = "%i_vs_%i" % ( i, j ) | |
230 string = "aligning %i with %i" % ( i, j ) | |
231 string += " %i %i" % ( dMatrix[pairComp]["direct"], dMatrix[pairComp]["reverse"] ) | |
232 string += " max=%i" % ( max( dMatrix[pairComp]["direct"], dMatrix[pairComp]["reverse"] ) ) | |
233 if dMatrix[pairComp]["reverse"] > dMatrix[pairComp]["direct"]: | |
234 string += " strand=-" | |
235 string += " nb=%i" % ( dMatrix[pairComp]["reverse"] ) | |
236 else: | |
237 string += " strand=+" | |
238 string += " nb=%i" % ( dMatrix[pairComp]["direct"] ) | |
239 print string; sys.stdout.flush() | |
240 | |
241 def getSequencesToReverseFromMatrix( self, dMatrix, lNewHeaders ): | |
242 """ | |
243 Retrieve the sequences than need to be re-oriented. | |
244 @param dMatrix: | |
245 @type dMatrix: | |
246 @param lInHeaders: list of the new sequence headers | |
247 @type lInHeaders: list of strings | |
248 @return: list of headers corresponding to sequences than need to be re-oriented | |
249 """ | |
250 lSeqToOrient = [] | |
251 | |
252 for i in range( 1, len(lNewHeaders)+1 ): | |
253 for j in range( i+1, len(lNewHeaders)+1 ): | |
254 comp = "%i_vs_%i" % ( i, j ) | |
255 nbDirectMatches = dMatrix[ comp ][ "direct" ] | |
256 nbReverseMatches = dMatrix[ comp ][ "reverse" ] | |
257 if self._verbose > 1: | |
258 print "%s: direct=%i reverse=%i" % ( comp, nbDirectMatches, nbReverseMatches ) | |
259 if nbReverseMatches > nbDirectMatches and lNewHeaders[i-1] not in lSeqToOrient: | |
260 if lNewHeaders[ j-1 ] not in lSeqToOrient: | |
261 if self._verbose > 2: | |
262 "need to reverse '%s'" % ( lNewHeaders[j-1] ) | |
263 lSeqToOrient.append( lNewHeaders[ j-1 ] ) | |
264 return lSeqToOrient | |
265 | |
266 | |
267 def getSequencesToReverse( self ): | |
268 """ | |
269 Return a list with the headers of the sequences to reverse. | |
270 """ | |
271 lSequenceHeadersToReverse = [] | |
272 | |
273 if self._prgToOrient == "orienter": | |
274 exitStatus = self.useOrienter() | |
275 if exitStatus == 0: | |
276 self.end() | |
277 sys.exit(0) | |
278 if exitStatus != 0: | |
279 print "\nWARNING: 'orienter' had a problem, switching to 'mummer'" | |
280 sys.stdout.flush() | |
281 | |
282 lInHeaders = pyRepet.seq.fastaDB.dbHeaders( self._inFileName ) | |
283 nbInSeq = len( lInHeaders ) | |
284 if self._verbose > 0: | |
285 print "nb of input sequences: %i" % ( nbInSeq ) | |
286 sys.stdout.flush() | |
287 | |
288 pyRepet.seq.fastaDB.shortenSeqHeaders( self._inFileName, 1 ) | |
289 tmpFileName = "%s.shortH" % ( self._inFileName ) | |
290 lNewHeaders = pyRepet.seq.fastaDB.dbHeaders( tmpFileName ) | |
291 dNew2Init = pyRepet.seq.fastaDB.retrieveLinksNewInitialHeaders( "%slink" % ( tmpFileName ) ) | |
292 | |
293 pyRepet.seq.fastaDB.dbSplit( tmpFileName, nbSeqPerBatch=1, newDir=True ) | |
294 os.chdir( "batches" ) | |
295 self.compareInputSequencesWithMummer( nbInSeq ) | |
296 dMatrix = self.getCumulativeMatchLengthsOnBothStrandForEachPairwiseComparison( lNewHeaders, nbInSeq ) | |
297 os.chdir( ".." ) | |
298 | |
299 lNewHeadersToReverse = self.getSequencesToReverseFromMatrix( dMatrix, lNewHeaders ) | |
300 for newH in lNewHeadersToReverse: | |
301 lSequenceHeadersToReverse.append( dNew2Init[ newH ] ) | |
302 if self._verbose > 0: | |
303 print "nb of sequences to reverse: %i" % ( len(lNewHeadersToReverse) ) | |
304 for initH in lSequenceHeadersToReverse: print " %s" % ( initH ) | |
305 sys.stdout.flush() | |
306 | |
307 if self._clean: | |
308 os.remove( tmpFileName ) | |
309 os.remove( "%slink" % ( tmpFileName ) ) | |
310 | |
311 return lSequenceHeadersToReverse | |
312 | |
313 def orientInputSequences( self, lSequenceHeadersToReverse, tmpFileName="" ): | |
314 """ | |
315 Save input sequences while re-orienting those needing it. | |
316 @param lSequenceHeadersToReverse: list of headers corresponding to sequences than need to be re-oriented | |
317 @type lSequenceHeadersToReverse: list of strings | |
318 @param tmpFileName: name of a fasta file (inFileName by default) | |
319 @type tmpFileName: string | |
320 """ | |
321 if self._verbose > 0: | |
322 print "saving oriented sequences..." | |
323 sys.stdout.flush() | |
324 if tmpFileName == "": | |
325 tmpFileName = self._inFileName | |
326 inDB = BioseqDB( tmpFileName ) | |
327 outDB = BioseqDB() | |
328 for bs in inDB.db: | |
329 if bs.header in lSequenceHeadersToReverse: | |
330 bs.reverseComplement() | |
331 bs.header += " re-oriented" | |
332 outDB.add( bs ) | |
333 outDB.save( self._outFileName ) | |
334 | |
335 def clean( self ): | |
336 if os.path.exists( "batches" ): | |
337 os.system( "rm -rf batches" ) | |
338 if os.path.exists( "orienter_error.log" ): | |
339 os.remove( "orienter_error.log" ) | |
340 for f in glob.glob( "core.*" ): | |
341 os.remove( f ) | |
342 | |
343 def start( self ): | |
344 """ | |
345 Useful commands before running the program. | |
346 """ | |
347 self.checkAttributes() | |
348 if self._verbose > 0: | |
349 print "START %s" % ( type(self).__name__ ) | |
350 print "input file: %s" % ( self._inFileName ) | |
351 sys.stdout.flush() | |
352 | |
353 def end( self ): | |
354 """ | |
355 Useful commands before ending the program. | |
356 """ | |
357 if self._clean: | |
358 self.clean() | |
359 if self._verbose > 0: | |
360 print "END %s" % ( type(self).__name__ ) | |
361 sys.stdout.flush() | |
362 | |
363 def run( self ): | |
364 """ | |
365 Run the program. | |
366 """ | |
367 self.start() | |
368 lSequenceHeadersToReverse = self.getSequencesToReverse() | |
369 self.orientInputSequences( lSequenceHeadersToReverse ) | |
370 self.end() | |
371 | |
372 if __name__ == "__main__": | |
373 i = OrientSequences() | |
374 i.setAttributesFromCmdLine() | |
375 i.run() |