annotate bin/maf-cull @ 0:06f8460885ff

migrate from GitHub
author yutaka-saito
date Sun, 19 Apr 2015 20:51:13 +0900
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
1 #! /usr/bin/env python
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
2
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
3 # Read MAF-format alignments. Write them, omitting alignments whose
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
4 # coordinates in the top-most sequence are contained in those of >=
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
5 # cullingLimit higher-scoring alignments.
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
6
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
7 # Alignments on opposite strands are not considered to contain each
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
8 # other.
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
9
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
10 # The alignments must be sorted by sequence name, then strand, then
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
11 # start coordinate.
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
12
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
13 # This algorithm is not theoretically optimal, but it is simple and
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
14 # probably fast in practice. Optimal algorithms are described in:
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
15 # Winnowing sequences from a database search.
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
16 # Berman P, Zhang Z, Wolf YI, Koonin EV, Miller W.
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
17 # J Comput Biol. 2000 Feb-Apr;7(1-2):293-302.
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
18 # (Use a "priority search tree" or an "interval tree".)
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
19
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
20 # Seems to work with Python 2.x, x>=4
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
21
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
22 import fileinput, itertools, operator, optparse, os, signal, sys
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
23
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
24 # The intervals must have size > 0.
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
25
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
26 def isFresh(oldInterval, newInterval):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
27 return oldInterval.end > newInterval.start
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
28
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
29 def freshIntervals(storedIntervals, newInterval):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
30 """Yields those storedIntervals that overlap newInterval."""
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
31 return [i for i in storedIntervals if isFresh(i, newInterval)]
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
32
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
33 def isDominated(dog, queen):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
34 return dog.score < queen.score and dog.end <= queen.end
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
35
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
36 def isWanted(newInterval, storedIntervals, cullingLimit):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
37 """Is newInterval dominated by < cullingLimit storedIntervals?"""
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
38 dominators = (i for i in storedIntervals if isDominated(newInterval, i))
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
39 return len(list(dominators)) < cullingLimit
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
40
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
41 # Check that the intervals are sorted by start position, and further
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
42 # sort them in descending order of score.
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
43 def sortedIntervals(intervals):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
44 oldStart = ()
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
45 for k, v in itertools.groupby(intervals, operator.attrgetter("start")):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
46 if k < oldStart: raise Exception("the input is not sorted properly")
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
47 oldStart = k
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
48 for i in sorted(v, key=operator.attrgetter("score"), reverse=True):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
49 yield i
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
50
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
51 def culledIntervals(intervals, cullingLimit):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
52 """Yield intervals contained in < cullingLimit higher-scoring intervals."""
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
53 storedIntervals = []
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
54 for i in sortedIntervals(intervals):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
55 storedIntervals = freshIntervals(storedIntervals, i)
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
56 if isWanted(i, storedIntervals, cullingLimit):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
57 yield i
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
58 storedIntervals.append(i)
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
59
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
60 class Maf:
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
61 def __init__(self, lines):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
62 self.lines = lines
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
63 try:
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
64 aLine = lines[0]
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
65 aWords = aLine.split()
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
66 scoreGen = (i for i in aWords if i.startswith("score="))
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
67 scoreWord = scoreGen.next()
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
68 self.score = float(scoreWord.split("=")[1])
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
69 except: raise Exception("missing score")
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
70 try:
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
71 sLine = lines[1]
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
72 sWords = sLine.split()
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
73 seqName = sWords[1]
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
74 alnStart = int(sWords[2])
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
75 alnSize = int(sWords[3])
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
76 strand = sWords[4]
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
77 self.start = seqName, strand, alnStart
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
78 self.end = seqName, strand, alnStart + alnSize
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
79 except: raise Exception("can't interpret the MAF data")
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
80
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
81 def mafInput(lines):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
82 for k, v in itertools.groupby(lines, str.isspace):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
83 if not k: yield Maf(list(v))
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
84
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
85 def filterComments(lines):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
86 for i in lines:
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
87 if i.startswith("#"): print i,
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
88 else: yield i
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
89
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
90 def mafCull(opts, args):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
91 inputLines = fileinput.input(args)
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
92 inputMafs = mafInput(filterComments(inputLines))
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
93 for maf in culledIntervals(inputMafs, opts.limit):
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
94 for i in maf.lines: print i,
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
95 print # blank line after each alignment
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
96
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
97 if __name__ == "__main__":
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
98 signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
99
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
100 usage = "%prog [options] my-alignments.maf"
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
101 description = "Cull alignments whose top-sequence coordinates are contained in LIMIT or more higher-scoring alignments."
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
102 op = optparse.OptionParser(usage=usage, description=description)
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
103 op.add_option("-l", "--limit", type="int", default=2,
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
104 help="culling limit (default: %default)")
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
105 (opts, args) = op.parse_args()
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
106 if opts.limit < 1: op.error("option -l: should be >= 1")
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
107
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
108 try: mafCull(opts, args)
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
109 except KeyboardInterrupt: pass # avoid silly error message
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
110 except Exception, e:
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
111 prog = os.path.basename(sys.argv[0])
06f8460885ff migrate from GitHub
yutaka-saito
parents:
diff changeset
112 sys.exit(prog + ": error: " + str(e))