Mercurial > repos > yutaka-saito > bsfcall
comparison bin/last-dotplot @ 0:06f8460885ff
migrate from GitHub
author | yutaka-saito |
---|---|
date | Sun, 19 Apr 2015 20:51:13 +0900 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:06f8460885ff |
---|---|
1 #! /usr/bin/env python | |
2 | |
3 # Read pair-wise alignments in MAF or LAST tabular format: write an | |
4 # "Oxford grid", a.k.a. dotplot. | |
5 | |
6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and | |
7 # pixels with one or more aligned nt-pairs are black. This can look | |
8 # too crowded for large genome alignments. I tried shading each pixel | |
9 # according to the number of aligned nt-pairs within it, but the | |
10 # result is too faint. How can this be done better? | |
11 | |
12 import fileinput, itertools, optparse, os, re, sys | |
13 | |
14 # Try to make PIL/PILLOW work: | |
15 try: from PIL import Image, ImageDraw, ImageFont, ImageColor | |
16 except ImportError: import Image, ImageDraw, ImageFont, ImageColor | |
17 | |
18 my_name = os.path.basename(sys.argv[0]) | |
19 usage = """ | |
20 %prog --help | |
21 %prog [options] last-tabular-output dotplot.png | |
22 %prog [options] last-tabular-output dotplot.gif | |
23 etc.""" | |
24 parser = optparse.OptionParser(usage=usage) | |
25 # Replace "width" & "height" with a single "length" option? | |
26 parser.add_option("-x", "--width", type="int", dest="width", default=1000, | |
27 help="maximum width in pixels (default: %default)") | |
28 parser.add_option("-y", "--height", type="int", dest="height", default=1000, | |
29 help="maximum height in pixels (default: %default)") | |
30 parser.add_option("-f", "--fontfile", dest="fontfile", | |
31 help="TrueType or OpenType font file") | |
32 parser.add_option("-s", "--fontsize", type="int", dest="fontsize", default=11, | |
33 help="TrueType or OpenType font size (default: %default)") | |
34 parser.add_option("-c", "--forwardcolor", dest="forwardcolor", default="red", | |
35 help="Color for forward alignments (default: %default)") | |
36 parser.add_option("-r", "--reversecolor", dest="reversecolor", default="blue", | |
37 help="Color for reverse alignments (default: %default)") | |
38 (opts, args) = parser.parse_args() | |
39 if len(args) != 2: parser.error("2 arguments needed") | |
40 | |
41 if opts.fontfile: font = ImageFont.truetype(opts.fontfile, opts.fontsize) | |
42 else: font = ImageFont.load_default() | |
43 | |
44 # Make these options too? | |
45 text_color = "black" | |
46 background_color = "white" | |
47 pix_tween_seqs = 2 # number of border pixels between sequences | |
48 border_shade = 239, 239, 239 # the shade of grey to use for border pixels | |
49 label_space = 5 # minimum number of pixels between axis labels | |
50 | |
51 image_mode = 'RGB' | |
52 forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode) | |
53 reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode) | |
54 overlap_color = tuple([(i+j)//2 for i, j in zip(forward_color, reverse_color)]) | |
55 | |
56 def tabBlocks(beg1, beg2, blocks): | |
57 '''Get the gapless blocks of an alignment, from LAST tabular format.''' | |
58 for i in blocks.split(","): | |
59 if ":" in i: | |
60 x, y = i.split(":") | |
61 beg1 += int(x) | |
62 beg2 += int(y) | |
63 else: | |
64 size = int(i) | |
65 yield beg1, beg2, size | |
66 beg1 += size | |
67 beg2 += size | |
68 | |
69 def mafBlocks(beg1, beg2, seq1, seq2): | |
70 '''Get the gapless blocks of an alignment, from MAF format.''' | |
71 size = 0 | |
72 for x, y in itertools.izip(seq1, seq2): | |
73 if x == "-": | |
74 if size: | |
75 yield beg1, beg2, size | |
76 beg1 += size | |
77 beg2 += size | |
78 size = 0 | |
79 beg2 += 1 | |
80 elif y == "-": | |
81 if size: | |
82 yield beg1, beg2, size | |
83 beg1 += size | |
84 beg2 += size | |
85 size = 0 | |
86 beg1 += 1 | |
87 else: | |
88 size += 1 | |
89 if size: yield beg1, beg2, size | |
90 | |
91 def alignmentInput(lines): | |
92 '''Get alignments and sequence lengths, from MAF or tabular format.''' | |
93 mafCount = 0 | |
94 for line in lines: | |
95 w = line.split() | |
96 if line[0].isdigit(): # tabular format | |
97 chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5]) | |
98 if w[4] == "-": beg1 -= seqlen1 | |
99 chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10]) | |
100 if w[9] == "-": beg2 -= seqlen2 | |
101 blocks = tabBlocks(beg1, beg2, w[11]) | |
102 yield chr1, seqlen1, chr2, seqlen2, blocks | |
103 elif line[0] == "s": # MAF format | |
104 if mafCount == 0: | |
105 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6] | |
106 if w[4] == "-": beg1 -= seqlen1 | |
107 mafCount = 1 | |
108 else: | |
109 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6] | |
110 if w[4] == "-": beg2 -= seqlen2 | |
111 blocks = mafBlocks(beg1, beg2, seq1, seq2) | |
112 yield chr1, seqlen1, chr2, seqlen2, blocks | |
113 mafCount = 0 | |
114 | |
115 def readAlignments(lines): | |
116 '''Get alignments and sequence lengths, from MAF or tabular format.''' | |
117 alignments = [] | |
118 seqLengths1 = {} | |
119 seqLengths2 = {} | |
120 for chr1, seqlen1, chr2, seqlen2, blocks in alignmentInput(lines): | |
121 aln = chr1, chr2, blocks | |
122 alignments.append(aln) | |
123 seqLengths1[chr1] = seqlen1 | |
124 seqLengths2[chr2] = seqlen2 | |
125 return alignments, seqLengths1, seqLengths2 | |
126 | |
127 sys.stderr.write(my_name + ": reading alignments...\n") | |
128 input = fileinput.input(args[0]) | |
129 alignments, seq_size_dic1, seq_size_dic2 = readAlignments(input) | |
130 sys.stderr.write(my_name + ": done\n") | |
131 | |
132 if not alignments: | |
133 sys.exit(my_name + ": there are no alignments") | |
134 | |
135 def natural_sort_key(my_string): | |
136 '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.''' | |
137 parts = re.split(r'(\d+)', my_string) | |
138 parts[1::2] = map(int, parts[1::2]) | |
139 return parts | |
140 | |
141 def get_text_sizes(my_strings): | |
142 '''Get widths & heights, in pixels, of some strings.''' | |
143 if opts.fontsize == 0: return [(0, 0) for i in my_strings] | |
144 image_size = 1, 1 | |
145 im = Image.new(image_mode, image_size) | |
146 draw = ImageDraw.Draw(im) | |
147 return [draw.textsize(i, font=font) for i in my_strings] | |
148 | |
149 def get_seq_info(seq_size_dic): | |
150 '''Return miscellaneous information about the sequences.''' | |
151 seq_names = seq_size_dic.keys() | |
152 seq_names.sort(key=natural_sort_key) | |
153 seq_sizes = [seq_size_dic[i] for i in seq_names] | |
154 name_sizes = get_text_sizes(seq_names) | |
155 margin = max(zip(*name_sizes)[1]) # maximum text height | |
156 return seq_names, seq_sizes, name_sizes, margin | |
157 | |
158 seq_names1, seq_sizes1, name_sizes1, margin1 = get_seq_info(seq_size_dic1) | |
159 seq_names2, seq_sizes2, name_sizes2, margin2 = get_seq_info(seq_size_dic2) | |
160 | |
161 def div_ceil(x, y): | |
162 '''Return x / y rounded up.''' | |
163 q, r = divmod(x, y) | |
164 return q + (r != 0) | |
165 | |
166 def tot_seq_pix(seq_sizes, bp_per_pix): | |
167 '''Return the total pixels needed for sequences of the given sizes.''' | |
168 return sum([div_ceil(i, bp_per_pix) for i in seq_sizes]) | |
169 | |
170 def get_bp_per_pix(seq_sizes, pix_limit): | |
171 '''Get the minimum bp-per-pixel that fits in the size limit.''' | |
172 seq_num = len(seq_sizes) | |
173 seq_pix_limit = pix_limit - pix_tween_seqs * (seq_num - 1) | |
174 if seq_pix_limit < seq_num: | |
175 sys.exit(my_name + ": can't fit the image: too many sequences?") | |
176 lower_bound = div_ceil(sum(seq_sizes), seq_pix_limit) | |
177 for bp_per_pix in itertools.count(lower_bound): # slow linear search | |
178 if tot_seq_pix(seq_sizes, bp_per_pix) <= seq_pix_limit: break | |
179 return bp_per_pix | |
180 | |
181 sys.stderr.write(my_name + ": choosing bp per pixel...\n") | |
182 bp_per_pix1 = get_bp_per_pix(seq_sizes1, opts.width - margin1) | |
183 bp_per_pix2 = get_bp_per_pix(seq_sizes2, opts.height - margin2) | |
184 bp_per_pix = max(bp_per_pix1, bp_per_pix2) | |
185 sys.stderr.write(my_name + ": bp per pixel = " + str(bp_per_pix) + "\n") | |
186 | |
187 def get_seq_starts(seq_pix, pix_tween_seqs, margin): | |
188 '''Get the start pixel for each sequence.''' | |
189 seq_starts = [] | |
190 pix_tot = margin - pix_tween_seqs | |
191 for i in seq_pix: | |
192 pix_tot += pix_tween_seqs | |
193 seq_starts.append(pix_tot) | |
194 pix_tot += i | |
195 return seq_starts | |
196 | |
197 def get_pix_info(seq_sizes, margin): | |
198 '''Return pixel information about the sequences.''' | |
199 seq_pix = [div_ceil(i, bp_per_pix) for i in seq_sizes] | |
200 seq_starts = get_seq_starts(seq_pix, pix_tween_seqs, margin) | |
201 tot_pix = seq_starts[-1] + seq_pix[-1] | |
202 return seq_pix, seq_starts, tot_pix | |
203 | |
204 seq_pix1, seq_starts1, width = get_pix_info(seq_sizes1, margin1) | |
205 seq_pix2, seq_starts2, height = get_pix_info(seq_sizes2, margin2) | |
206 seq_start_dic1 = dict(zip(seq_names1, seq_starts1)) | |
207 seq_start_dic2 = dict(zip(seq_names2, seq_starts2)) | |
208 hits = [0] * (width * height) # the image data | |
209 | |
210 sys.stderr.write(my_name + ": processing alignments...\n") | |
211 for seq1, seq2, blocks in alignments: | |
212 seq_start1 = seq_start_dic1[seq1] | |
213 seq_start2 = seq_start_dic2[seq2] | |
214 my_start = seq_start2 * width + seq_start1 | |
215 for beg1, beg2, size in blocks: | |
216 end1 = beg1 + size | |
217 end2 = beg2 + size | |
218 if beg1 >= 0: j = xrange(beg1, end1) | |
219 else: j = xrange(-1 - beg1, -1 - end1, -1) | |
220 if beg2 >= 0: k = xrange(beg2, end2) | |
221 else: k = xrange(-1 - beg2, -1 - end2, -1) | |
222 if beg2 >= 0: store_value = 1 | |
223 else: store_value = 2 | |
224 for real_pos1, real_pos2 in itertools.izip(j, k): | |
225 pix1 = real_pos1 // bp_per_pix | |
226 pix2 = real_pos2 // bp_per_pix | |
227 hits[my_start + pix2 * width + pix1] |= store_value | |
228 sys.stderr.write(my_name + ": done\n") | |
229 | |
230 def make_label(text, text_size, range_start, range_size): | |
231 '''Return an axis label with endpoint & sort-order information.''' | |
232 text_width = text_size[0] | |
233 label_start = range_start + (range_size - text_width) // 2 | |
234 label_end = label_start + text_width | |
235 sort_key = text_width - range_size | |
236 return sort_key, label_start, label_end, text | |
237 | |
238 def get_nonoverlapping_labels(labels): | |
239 '''Get a subset of non-overlapping axis labels, greedily.''' | |
240 nonoverlapping_labels = [] | |
241 for i in labels: | |
242 if True not in [i[1] < j[2] + label_space and j[1] < i[2] + label_space | |
243 for j in nonoverlapping_labels]: | |
244 nonoverlapping_labels.append(i) | |
245 return nonoverlapping_labels | |
246 | |
247 def get_axis_image(seq_names, name_sizes, seq_starts, seq_pix): | |
248 '''Make an image of axis labels.''' | |
249 min_pos = seq_starts[0] | |
250 max_pos = seq_starts[-1] + seq_pix[-1] | |
251 height = max(zip(*name_sizes)[1]) | |
252 labels = [make_label(i, j, k, l) for i, j, k, l in | |
253 zip(seq_names, name_sizes, seq_starts, seq_pix)] | |
254 labels = [i for i in labels if i[1] >= min_pos and i[2] <= max_pos] | |
255 labels.sort() | |
256 labels = get_nonoverlapping_labels(labels) | |
257 image_size = max_pos, height | |
258 im = Image.new(image_mode, image_size, border_shade) | |
259 draw = ImageDraw.Draw(im) | |
260 for i in labels: | |
261 position = i[1], 0 | |
262 draw.text(position, i[3], font=font, fill=text_color) | |
263 return im | |
264 | |
265 image_size = width, height | |
266 im = Image.new(image_mode, image_size, background_color) | |
267 | |
268 for i in range(height): | |
269 for j in range(width): | |
270 store_value = hits[i * width + j] | |
271 xy = j, i | |
272 if store_value == 1: im.putpixel(xy, forward_color) | |
273 elif store_value == 2: im.putpixel(xy, reverse_color) | |
274 elif store_value == 3: im.putpixel(xy, overlap_color) | |
275 | |
276 if opts.fontsize != 0: | |
277 axis1 = get_axis_image(seq_names1, name_sizes1, seq_starts1, seq_pix1) | |
278 axis2 = get_axis_image(seq_names2, name_sizes2, seq_starts2, seq_pix2) | |
279 axis2 = axis2.rotate(270) | |
280 im.paste(axis1, (0, 0)) | |
281 im.paste(axis2, (0, 0)) | |
282 | |
283 for i in seq_starts1[1:]: | |
284 box = i - pix_tween_seqs, margin2, i, height | |
285 im.paste(border_shade, box) | |
286 | |
287 for i in seq_starts2[1:]: | |
288 box = margin1, i - pix_tween_seqs, width, i | |
289 im.paste(border_shade, box) | |
290 | |
291 im.save(args[1]) |