comparison bin/last-map-probs @ 0:06f8460885ff

migrate from GitHub
author yutaka-saito
date Sun, 19 Apr 2015 20:51:13 +0900
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:06f8460885ff
1 #! /usr/bin/env python
2
3 # Copyright 2010, 2011, 2012, 2014 Martin C. Frith
4
5 # Read query-genome alignments: write them along with the probability
6 # that each alignment is not the true mapping of its query. These
7 # probabilities make the risky assumption that one of the alignments
8 # reported for each query is correct.
9
10 import sys, os, fileinput, math, optparse, signal
11
12 def logsum(numbers):
13 """Adds numbers, in log space, to avoid overflow."""
14 m = max(numbers)
15 s = sum(math.exp(i - m) for i in numbers)
16 return m + math.log(s)
17
18 def mafScore(aLine):
19 for word in aLine.split():
20 if word.startswith("score="):
21 return float(word[6:])
22 raise Exception("found an alignment without a score")
23
24 def namesAndScores(lines):
25 queryNames = []
26 scores = []
27 for line in lines:
28 if line[0] == "a": # faster than line.startswith("a")
29 s = mafScore(line)
30 scores.append(s)
31 sLineCount = 0
32 elif line[0] == "s":
33 sLineCount += 1
34 if sLineCount == 2: queryNames.append(line.split(None, 2)[1])
35 elif line[0].isdigit(): # we have an alignment in tabular format
36 w = line.split(None, 7)
37 scores.append(float(w[0]))
38 queryNames.append(w[6])
39 return queryNames, scores
40
41 def scoreTotals(queryNames, scores, temperature):
42 queryLists = {}
43 for n, s in zip(queryNames, scores):
44 queryLists.setdefault(n, []).append(s / temperature)
45 return dict((k, logsum(v)) for k, v in queryLists.iteritems())
46
47 def writeOneBatch(lines, queryNames, scores, denominators, opts, temperature):
48 isWanted = True
49 i = 0
50 for line in lines:
51 if line[0] == "a":
52 s = scores[i]
53 p = 1.0 - math.exp(s / temperature - denominators[queryNames[i]])
54 i += 1
55 if s < opts.score or p > opts.mismap:
56 isWanted = False
57 else:
58 newLineEnd = " mismap=%.3g\n" % p
59 line = line.rstrip() + newLineEnd
60 elif line[0].isdigit(): # we have an alignment in tabular format
61 s = scores[i]
62 p = 1.0 - math.exp(s / temperature - denominators[queryNames[i]])
63 i += 1
64 if s < opts.score or p > opts.mismap: continue
65 newLineEnd = "\t%.3g\n" % p
66 line = line.rstrip() + newLineEnd
67 if isWanted: print line,
68 if line.isspace(): isWanted = True # reset at end of maf paragraph
69
70 def doOneBatch(lines, opts, temperature):
71 queryNames, scores = namesAndScores(lines)
72 denominators = scoreTotals(queryNames, scores, temperature)
73 writeOneBatch(lines, queryNames, scores, denominators, opts, temperature)
74
75 def readHeaderOrDie(lines):
76 t = 0.0
77 e = -1
78 for line in lines:
79 if line.startswith("#") or line.isspace():
80 print line,
81 for i in line.split():
82 if i.startswith("t="): t = float(i[2:])
83 elif i.startswith("e="): e = int(i[2:])
84 if t > 0 and e >= 0: break
85 else:
86 raise Exception("I need a header with t= and e=")
87 return t, e
88
89 def lastMapProbs(opts, args):
90 f = fileinput.input(args)
91 temperature, e = readHeaderOrDie(f)
92 if opts.score < 0: opts.score = e + round(temperature * math.log(1000))
93 lines = []
94
95 for line in f:
96 if line.startswith("# batch"):
97 doOneBatch(lines, opts, temperature)
98 lines = []
99 lines.append(line)
100 doOneBatch(lines, opts, temperature)
101
102 if __name__ == "__main__":
103 signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message
104
105 usage = """
106 %prog --help
107 %prog [options] lastal-alignments"""
108
109 description = "Calculate a mismap probability for each alignment. This is the probability that the alignment does not reflect the origin of the query sequence, assuming that one reported alignment does reflect the origin of each query."
110
111 op = optparse.OptionParser(usage=usage, description=description)
112 op.add_option("-m", "--mismap", type="float", default=0.01, metavar="M",
113 help="don't write alignments with mismap probability > M (default: %default)")
114 op.add_option("-s", "--score", type="float", default=-1, metavar="S",
115 help="don't write alignments with score < S (default: e+t*ln[1000])")
116 (opts, args) = op.parse_args()
117 if not args and sys.stdin.isatty():
118 op.print_help()
119 op.exit()
120
121 try: lastMapProbs(opts, args)
122 except KeyboardInterrupt: pass # avoid silly error message
123 except Exception, e:
124 prog = os.path.basename(sys.argv[0])
125 sys.exit(prog + ": error: " + str(e))