Mercurial > repos > yutaka-saito > bsfcall
diff bin/maf-join @ 0:06f8460885ff
migrate from GitHub
author | yutaka-saito |
---|---|
date | Sun, 19 Apr 2015 20:51:13 +0900 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/maf-join Sun Apr 19 20:51:13 2015 +0900 @@ -0,0 +1,244 @@ +#! /usr/bin/env python + +# Copyright 2009, 2010, 2011 Martin C. Frith + +# Join two or more sets of MAF-format multiple alignments into bigger +# multiple alignments. The 'join field' is the top genome, which +# should be the same for each input. Each input should be sorted by +# position in the top genome. + +# WARNING: Alignment columns with a gap in the top genome are joined +# arbitrarily!!! + +import sys, os, fileinput, optparse, signal + +signal.signal(signal.SIGPIPE, signal.SIG_DFL) # stop spurious error message + +class peekable: # Adapted from Python Cookbook 2nd edition + """An iterator that supports a peek operation.""" + def __init__(self, iterable): + self.it = iter(iterable) + self.cache = [] + def __iter__(self): + return self + def next(self): + if self.cache: return self.cache.pop() + else: return self.it.next() + def peek(self): + if not self.cache: self.cache.append(self.it.next()) + return self.cache[0] + +def maxLen(things): return max(map(len, things)) + +class MafBlock: + def __init__(self, chr, beg, end, strand, chrSize, seq, prob): + self.chr = chr # chromosome names + self.beg = beg # alignment begin coordinates + self.end = end # alignment end coordinates + self.strand = strand + self.chrSize = chrSize # chromosome sizes + self.seq = seq # aligned sequences, including gaps + self.prob = prob # probabilities (may be empty) + + def __nonzero__(self): + return len(self.seq) > 0 + + def __cmp__(self, other): + return cmp(self.chr[:1] + self.beg[:1], other.chr[:1] + other.beg[:1]) + + def before(self, other): + return (self.chr[0], self.end[0]) <= (other.chr[0], other.beg[0]) + + def after(self, other): + return (self.chr[0], self.beg[0]) >= (other.chr[0], other.end[0]) + + def addLine(self, line): + words = line.split() + if line.startswith('s'): + self.chr.append(words[1]) + self.beg.append(int(words[2])) + self.end.append(int(words[2]) + int(words[3])) + self.strand.append(words[4]) + self.chrSize.append(words[5]) + self.seq.append(list(words[6])) + elif line.startswith('p'): + self.prob.append(words[1]) + + def write(self): + beg = map(str, self.beg) + size = [str(e-b) for b, e in zip(self.beg, self.end)] + seq = [''.join(i) for i in self.seq] + columns = self.chr, beg, size, self.strand, self.chrSize, seq + widths = map(maxLen, columns) + print 'a' + for row in zip(*columns): + widthsAndFields = zip(widths, row) + field0 = "%-*s" % widthsAndFields[0] # left-justify + fields = ["%*s" % i for i in widthsAndFields[1:]] # right-justify + print 's', field0, ' '.join(fields) + pad = ' '.join(' ' * i for i in widths[:-1]) + for i in self.prob: + print 'p', pad, i + print # blank line afterwards + +def topSeqBeg(maf): return maf.beg[0] +def topSeqEnd(maf): return maf.end[0] +def emptyMaf(): return MafBlock([], [], [], [], [], [], []) + +def joinOnFirstItem(x, y): + if x[0] != y[0]: + raise ValueError('join fields not equal:\n'+str(x[0])+'\n'+str(y[0])) + return x + y[1:] + +def mafEasyJoin(x, y): + '''Join two MAF blocks on the top sequence.''' + xJoin = zip(x.chr, x.beg, x.end, x.strand, x.chrSize, x.seq) + yJoin = zip(y.chr, y.beg, y.end, y.strand, y.chrSize, y.seq) + joined = joinOnFirstItem(xJoin, yJoin) + chr, beg, end, strand, chrSize, seq = zip(*joined) + prob = x.prob + y.prob + return MafBlock(chr, beg, end, strand, chrSize, seq, prob) + +def countNonGaps(s): return len(s) - s.count('-') + +def nthNonGap(s, n): + '''Get the start position of the n-th non-gap.''' + for i, x in enumerate(s): + if x != '-': + if n == 0: return i + n -= 1 + raise ValueError('non-gap not found') + +def nthLastNonGap(s, n): + '''Get the end position of the n-th last non-gap.''' + return len(s) - nthNonGap(s[::-1], n) + +def mafSlice(maf, alnBeg, alnEnd): + '''Return a slice of a MAF block, using coordinates in the alignment.''' + beg = [b + countNonGaps(s[:alnBeg]) for b, s in zip(maf.beg, maf.seq)] + end = [e - countNonGaps(s[alnEnd:]) for e, s in zip(maf.end, maf.seq)] + seq = [i[alnBeg:alnEnd] for i in maf.seq] + prob = [i[alnBeg:alnEnd] for i in maf.prob] + return MafBlock(maf.chr, beg, end, maf.strand, maf.chrSize, seq, prob) + +def mafSliceTopSeq(maf, newTopSeqBeg, newTopSeqEnd): + '''Return a slice of a MAF block, using coordinates in the top sequence.''' + lettersFromBeg = newTopSeqBeg - topSeqBeg(maf) + lettersFromEnd = topSeqEnd(maf) - newTopSeqEnd + alnBeg = nthNonGap(maf.seq[0], lettersFromBeg) + alnEnd = nthLastNonGap(maf.seq[0], lettersFromEnd) + return mafSlice(maf, alnBeg, alnEnd) + +def jumpGaps(sequence, index): + '''Return the next index of the sequence where there is a non-gap.''' + nextIndex = index + while sequence[nextIndex] == '-': nextIndex += 1 + return nextIndex + +def gapsToAdd(sequences): + '''Return new gaps and their positions, needed to align the non-gaps.''' + gapInfo = [[] for i in sequences] + gapBeg = [0 for i in sequences] + try: + while True: + gapEnd = [jumpGaps(s, p) for s, p in zip(sequences, gapBeg)] + gapSize = [e-b for b, e in zip(gapBeg, gapEnd)] + maxGapSize = max(gapSize) + for s, e, i in zip(gapSize, gapEnd, gapInfo): + if s < maxGapSize: + newGap = maxGapSize - s + i.append((newGap, e)) + gapBeg = [e+1 for e in gapEnd] + except IndexError: return gapInfo + +def chunksAndGaps(s, gapsAndPositions, oneGap): + '''Yield chunks of "s" interspersed with gaps at given positions.''' + oldPosition = 0 + for gapLen, position in gapsAndPositions: + yield s[oldPosition:position] + yield oneGap * gapLen + oldPosition = position + yield s[oldPosition:] + +def mafAddGaps(maf, gapsAndPositions): + '''Add the given gaps at the given positions to a MAF block.''' + maf.seq = [sum(chunksAndGaps(i, gapsAndPositions, ['-']), []) + for i in maf.seq] + maf.prob = [''.join(chunksAndGaps(i, gapsAndPositions, '~')) + for i in maf.prob] + +def mafJoin(mafs): + '''Intersect and join overlapping MAF blocks.''' + newTopSeqBeg = max(map(topSeqBeg, mafs)) + newTopSeqEnd = min(map(topSeqEnd, mafs)) + mafs = [mafSliceTopSeq(i, newTopSeqBeg, newTopSeqEnd) for i in mafs] + topSeqs = [i.seq[0] for i in mafs] + gapInfo = gapsToAdd(topSeqs) + for maf, gapsAndPositions in zip(mafs, gapInfo): + mafAddGaps(maf, gapsAndPositions) + return reduce(mafEasyJoin, mafs) + +def mafInput(lines): + '''Read lines and yield MAF blocks.''' + maf = emptyMaf() + for line in lines: + if line.isspace(): + if maf: yield maf + maf = emptyMaf() + else: + maf.addLine(line) + if maf: yield maf + +def sortedMafInput(lines): + '''Read lines and yield MAF blocks, checking that they are in order.''' + old = emptyMaf() + for maf in mafInput(lines): + if maf < old: sys.exit(progName + ": MAF blocks not sorted properly") + yield maf + old = maf + +def allOverlaps(sequences, beg, end): + '''Yield all combinations of MAF blocks that overlap in the top genome.''' + assert beg < end + if not sequences: + yield () + return + for i in sequences[0]: + if topSeqEnd(i) <= beg: continue + if topSeqBeg(i) >= end: break # assumes they're sorted by position + newBeg = max(beg, topSeqBeg(i)) + newEnd = min(end, topSeqEnd(i)) + for j in allOverlaps(sequences[1:], newBeg, newEnd): + yield (i,) + j + +def nextWindow(window, input, referenceMaf): + '''Yield "relevant" MAF blocks, based on overlap with referenceMaf.''' + for maf in window: + if not maf.before(referenceMaf): yield maf + try: + while True: + maf = input.peek() + if maf.after(referenceMaf): break + maf = input.next() + if not maf.before(referenceMaf): yield maf + except StopIteration: pass + +def overlappingMafs(sortedMafInputs): + '''Yield all combinations of MAF blocks that overlap in the top genome.''' + if not sortedMafInputs: return + head, tail = sortedMafInputs[0], sortedMafInputs[1:] + windows = [[] for t in tail] + for h in head: # iterate over MAF blocks in the first input + windows = [list(nextWindow(w, t, h)) for w, t in zip(windows, tail)] + for i in allOverlaps(windows, topSeqBeg(h), topSeqEnd(h)): + yield (h,) + i + +op = optparse.OptionParser(usage="%prog sorted-file1.maf sorted-file2.maf ...") +(opts, args) = op.parse_args() + +progName = os.path.basename(sys.argv[0]) + +inputs = [peekable(sortedMafInput(fileinput.input(i))) for i in args] + +for mafs in overlappingMafs(inputs): + mafJoin(mafs).write()