Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/docutils/parsers/rst/tableparser.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author | shellac |
---|---|
date | Sat, 02 May 2020 07:14:21 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:26e78fe6e8c4 |
---|---|
1 # $Id: tableparser.py 8373 2019-08-27 12:11:30Z milde $ | |
2 # Author: David Goodger <goodger@python.org> | |
3 # Copyright: This module has been placed in the public domain. | |
4 | |
5 """ | |
6 This module defines table parser classes, which parse plaintext-graphic tables | |
7 and produce a well-formed data structure suitable for building a CALS table. | |
8 | |
9 :Classes: | |
10 - `GridTableParser`: Parse fully-formed tables represented with a grid. | |
11 - `SimpleTableParser`: Parse simple tables, delimited by top & bottom | |
12 borders. | |
13 | |
14 :Exception class: `TableMarkupError` | |
15 | |
16 :Function: | |
17 `update_dict_of_lists()`: Merge two dictionaries containing list values. | |
18 """ | |
19 | |
20 __docformat__ = 'reStructuredText' | |
21 | |
22 | |
23 import re | |
24 import sys | |
25 from docutils import DataError | |
26 from docutils.utils import strip_combining_chars | |
27 | |
28 | |
class TableMarkupError(DataError):

    """
    Signal a problem found in the markup of a table.

    An optional `offset` keyword argument records how far the problem lies
    from the table's first line.  It is stored on the instance as `offset`
    and defaults to 0 when not supplied.
    """

    def __init__(self, *args, **kwargs):
        # Only `offset` is read from the keyword arguments; any others are
        # ignored.  Positional arguments are handed to the base class as-is.
        self.offset = kwargs.get('offset', 0)
        super(TableMarkupError, self).__init__(*args)
41 | |
42 | |
class TableParser(object):

    """
    Abstract superclass for the common parts of the syntax-specific parsers.

    `parse()` drives the process and relies on the subclass to provide
    `setup()`, `parse_table()` and `structure_from_cells()`, as well as a
    compiled `head_body_separator_pat`.
    """

    head_body_separator_pat = None
    """Matches the row separator between head rows and body rows."""

    double_width_pad_char = '\x00'
    """Padding character for East Asian double-width text."""

    def parse(self, block):
        """
        Analyze the text `block` and return a table data structure.

        Given a plaintext-graphic table in `block` (list of lines of text; no
        whitespace padding), parse the table, construct and return the data
        necessary to construct a CALS table or equivalent.

        Raise `TableMarkupError` if there is any problem with the markup.
        """
        self.setup(block)
        self.find_head_body_sep()
        self.parse_table()
        return self.structure_from_cells()

    def find_head_body_sep(self):
        """
        Look for a head/body row separator line; store the line index.

        The index is stored in `self.head_body_sep` (which `setup()` is
        expected to have initialised to None) and the separator line in
        `self.block` is rewritten with '-' in place of '=' so that it can be
        processed like any other row separator afterwards.

        Raise `TableMarkupError` if more than one separator is present, or
        if the separator is the first or last line of the table.
        """
        last = len(self.block) - 1
        for i, line in enumerate(self.block):
            if not self.head_body_separator_pat.match(line):
                continue
            # Compare against None explicitly: a separator found on line 0
            # is stored as index 0, which is falsy, and must still count as
            # "already seen" when a second separator turns up.
            if self.head_body_sep is not None:
                raise TableMarkupError(
                    'Multiple head/body row separators '
                    '(table lines %s and %s); only one allowed.'
                    % (self.head_body_sep+1, i+1), offset=i)
            self.head_body_sep = i
            self.block[i] = line.replace('=', '-')
        # A value of None never equals an index, so tables without a
        # separator pass through here untouched.
        if self.head_body_sep in (0, last):
            # The problem is reported at the table's last line, as before.
            raise TableMarkupError('The head/body row separator may not be '
                                   'the first or last line of the table.',
                                   offset=last)
89 | |
90 | |
class GridTableParser(TableParser):

    """
    Parser for grid tables; use it through `parse()`.

    A grid table looks like this::

        +------------------------+------------+----------+----------+
        | Header row, column 1   | Header 2   | Header 3 | Header 4 |
        +========================+============+==========+==========+
        | body row 1, column 1   | column 2   | column 3 | column 4 |
        +------------------------+------------+----------+----------+
        | body row 2             | Cells may span columns.          |
        +------------------------+------------+---------------------+
        | body row 3             | Cells may  | - Table cells       |
        +------------------------+ span rows. | - contain           |
        | body row 4             |            | - body elements.    |
        +------------------------+------------+---------------------+

    '+' marks an intersection, '-' a row separator (the single optional
    head/body separator uses '=' instead) and '|' a column separator.

    Feeding the table above to `parse()` yields::

        ([24, 12, 10, 10],
         [[(0, 0, 1, ['Header row, column 1']),
           (0, 0, 1, ['Header 2']),
           (0, 0, 1, ['Header 3']),
           (0, 0, 1, ['Header 4'])]],
         [[(0, 0, 3, ['body row 1, column 1']),
           (0, 0, 3, ['column 2']),
           (0, 0, 3, ['column 3']),
           (0, 0, 3, ['column 4'])],
          [(0, 0, 5, ['body row 2']),
           (0, 2, 5, ['Cells may span columns.']),
           None,
           None],
          [(0, 0, 7, ['body row 3']),
           (1, 0, 7, ['Cells may', 'span rows.', '']),
           (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
           None],
          [(0, 0, 9, ['body row 4']), None, None, None]])

    The result is a 3-tuple: the column widths (colspecs), the list of head
    rows and the list of body rows.  A row is a list of cells; a cell is
    None when its slot is covered by another cell's span, otherwise a
    4-tuple of: extra rows spanned (morerows), extra columns spanned
    (morecols), line offset of the first content line, and the content as a
    list of text lines.
    """

    head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')

    def setup(self, block):
        """Reset all parser state for a fresh parse of `block`."""
        self.block = block[:]           # work on a copy; it may be modified
        self.block.disconnect()         # keep edits away from the parent
        self.bottom = len(block) - 1    # index of the last line
        width = len(block[0])
        self.right = width - 1          # index of the last text column
        self.head_body_sep = None
        # per text column: last line known to be covered by a cell
        self.done = [-1] * width
        self.cells = []
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        Discover every cell of the table.

        A work list of candidate upper-left corners is seeded with the
        table's own upper-left corner.  For each candidate one rectangular
        cell is traced; if that succeeds the cell is recorded and its
        upper-right and lower-left corners become new candidates.  The list
        is kept sorted so that candidates are handled top to bottom, while
        `self.done` tracks how far down each text column has been covered.

        Afterwards all row and column boundaries, and the position and
        extent of every cell, are known.  Raise `TableMarkupError` when
        some part of the table was never covered.
        """
        pending = [(0, 0)]
        while pending:
            top, left = pending.pop(0)
            on_edge = top == self.bottom or left == self.right
            if on_edge or top <= self.done[left]:
                # bottom/right border, or territory already covered
                continue
            traced = self.scan_cell(top, left)
            if not traced:
                continue
            bottom, right, rowseps, colseps = traced
            update_dict_of_lists(self.rowseps, rowseps)
            update_dict_of_lists(self.colseps, colseps)
            self.mark_done(top, left, bottom, right)
            cellblock = self.block.get_2D_block(top + 1, left + 1,
                                                bottom, right)
            cellblock.disconnect()      # cell lines must not sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            self.cells.append((top, left, bottom, right, cellblock))
            pending += [(top, right), (bottom, left)]
            pending.sort()
        if not self.check_parse_complete():
            raise TableMarkupError('Malformed table; parse incomplete.')

    def mark_done(self, top, left, bottom, right):
        """Record that text columns `left`..`right`-1 are covered down to
        the line just above `bottom`."""
        expected = top - 1
        covered = bottom - 1
        for col in range(left, right):
            # every column must have been covered exactly up to this cell
            assert self.done[col] == expected
            self.done[col] = covered

    def check_parse_complete(self):
        """Return True if every text column was covered to the last row."""
        last = self.bottom - 1
        return all(seen == last for seen in self.done[:self.right])

    def scan_cell(self, top, left):
        """Trace the cell whose upper-left corner is at (`top`, `left`).

        Return whatever `scan_right()` returns: a
        ``(bottom, right, rowseps, colseps)`` tuple, or None if no cell
        starts here.
        """
        assert self.block[top][left] == '+'
        return self.scan_right(top, left)

    def scan_right(self, top, left):
        """
        Walk right along the cell's top edge looking for its upper-right
        corner, recording every column boundary ('+') passed on the way.
        """
        colseps = {}
        edge = self.block[top]
        for col in range(left + 1, self.right + 1):
            char = edge[col]
            if char == '+':
                colseps[col] = [top]
                traced = self.scan_down(top, left, col)
                if traced:
                    bottom, rowseps, newcolseps = traced
                    update_dict_of_lists(colseps, newcolseps)
                    return bottom, col, rowseps, colseps
                # not a usable corner: keep walking right
            elif char != '-':
                return None
        return None

    def scan_down(self, top, left, right):
        """
        Walk down the cell's right edge looking for its lower-right corner,
        recording every row boundary passed on the way.
        """
        rowseps = {}
        for row in range(top + 1, self.bottom + 1):
            char = self.block[row][right]
            if char == '+':
                rowseps[row] = [right]
                traced = self.scan_left(top, left, row, right)
                if traced:
                    newrowseps, colseps = traced
                    update_dict_of_lists(rowseps, newrowseps)
                    return row, rowseps, colseps
                # not a usable corner: keep walking down
            elif char != '|':
                return None
        return None

    def scan_left(self, top, left, bottom, right):
        """
        Walk left along the cell's bottom edge, recording column boundaries,
        until the lower-left corner, which must sit directly below the
        starting point.
        """
        colseps = {}
        edge = self.block[bottom]
        for col in range(right - 1, left, -1):
            char = edge[col]
            if char == '+':
                colseps[col] = [bottom]
            elif char != '-':
                return None
        if edge[left] != '+':
            return None
        rowseps = self.scan_up(top, left, bottom, right)
        if rowseps is None:
            return None
        return rowseps, colseps

    def scan_up(self, top, left, bottom, right):
        """
        Walk up the cell's left edge, recording row boundaries, and check
        that the starting point can be reached.

        Return the row boundaries found (possibly an empty dict), or None
        if the edge is broken.
        """
        rowseps = {}
        for row in range(bottom - 1, top, -1):
            char = self.block[row][left]
            if char == '+':
                rowseps[row] = [left]
            elif char != '|':
                return None
        return rowseps

    def structure_from_cells(self):
        """
        Turn the data gathered by `scan_cell()` into the final
        ``(colspecs, headrows, bodyrows)`` structure.
        """
        rowseps = sorted(self.rowseps)      # row boundaries, in order
        # row boundary -> row number
        rowindex = dict((sep, num) for num, sep in enumerate(rowseps))
        colseps = sorted(self.colseps)      # column boundaries, in order
        # column boundary -> column number
        colindex = dict((sep, num) for num, sep in enumerate(colseps))
        # width of a column: distance between its boundaries, borders excluded
        colspecs = [after - before - 1
                    for before, after in zip(colseps, colseps[1:])]
        numrows = len(rowseps) - 1
        numcols = len(colseps) - 1
        # an empty table of the right shape
        rows = [[None] * numcols for _ in range(numrows)]
        # number of grid slots not yet claimed by a cell; must end at zero
        remaining = numrows * numcols
        for top, left, bottom, right, block in self.cells:
            rownum = rowindex[top]
            colnum = colindex[left]
            assert rows[rownum][colnum] is None, (
                'Cell (row %s, column %s) already used.'
                % (rownum + 1, colnum + 1))
            morerows = rowindex[bottom] - rownum - 1
            morecols = colindex[right] - colnum - 1
            remaining -= (morerows + 1) * (morecols + 1)
            # place the cell in its upper-left slot
            rows[rownum][colnum] = (morerows, morecols, top + 1, block)
        assert remaining == 0, 'Unused cells remaining.'
        if not self.head_body_sep:
            # no separator: everything is body
            return (colspecs, [], rows)
        numheadrows = rowindex[self.head_body_sep]
        return (colspecs, rows[:numheadrows], rows[numheadrows:])
324 | |
325 | |
class SimpleTableParser(TableParser):

    """
    Parse a simple table using `parse()`.

    Here's an example of a simple table::

        =====  =====
        col 1  col 2
        =====  =====
        1      Second column of row 1.
        2      Second column of row 2.
               Second line of paragraph.
        3      - Second column of row 3.

               - Second item in bullet
                 list (row 3, column 2).
        4 is a span
        ------------
        5
        =====  =====

    Top and bottom borders use '=', column span underlines use '-', column
    separation is indicated with spaces.

    Passing the above table to the `parse()` method will result in the
    following data structure, whose interpretation is the same as for
    `GridTableParser`::

        ([5, 25],
         [[(0, 0, 1, ['col 1']),
           (0, 0, 1, ['col 2'])]],
         [[(0, 0, 3, ['1']),
           (0, 0, 3, ['Second column of row 1.'])],
          [(0, 0, 4, ['2']),
           (0, 0, 4, ['Second column of row 2.',
                      'Second line of paragraph.'])],
          [(0, 0, 6, ['3']),
           (0, 0, 6, ['- Second column of row 3.',
                      '',
                      '- Second item in bullet',
                      '  list (row 3, column 2).'])],
          [(0, 1, 10, ['4 is a span'])],
          [(0, 0, 12, ['5']),
           (0, 0, 12, [''])]])
    """

    head_body_separator_pat = re.compile('=[ =]*$')
    span_pat = re.compile('-[ -]*$')

    def setup(self, block):
        """Reset all parser state for a fresh parse of `block`."""
        self.block = block[:]           # make a copy; it will be modified
        self.block.disconnect()         # don't propagate changes to parent
        # Convert top & bottom borders to column span underlines:
        self.block[0] = self.block[0].replace('=', '-')
        self.block[-1] = self.block[-1].replace('=', '-')
        self.head_body_sep = None
        self.columns = []
        self.border_end = None
        self.table = []
        self.done = [-1] * len(block[0])
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        First determine the column boundaries from the top border, then
        process rows.  Each row may consist of multiple lines; accumulate
        lines until a row is complete.  Call `self.parse_row` to finish the
        job.
        """
        # Top border must fully describe all table columns.
        self.columns = self.parse_columns(self.block[0], 0)
        self.border_end = self.columns[-1][1]
        firststart, firstend = self.columns[0]
        start = 1                       # first line of the pending row
        text_found = False              # has the pending row any text yet?
        for offset in range(1, len(self.block)):    # skip top border
            line = self.block[offset]
            if self.span_pat.match(line):
                # Column span underline or border; row is complete.
                self.parse_row(self.block[start:offset], start,
                               (line.rstrip(), offset))
                start = offset + 1
                text_found = False
            elif line[firststart:firstend].strip():
                # First column not blank, therefore it's a new row.
                if text_found and offset != start:
                    self.parse_row(self.block[start:offset], start)
                start = offset
                text_found = True
            elif not text_found:
                # Blank first column before any text: skip the line.
                start = offset + 1

    def parse_columns(self, line, offset):
        """
        Given a column span underline, return a list of (begin, end) pairs.

        `offset` is the index of `line` within the table and is only used
        for error reporting.  Once the table's columns are known (i.e. for
        every line but the top border), raise `TableMarkupError` unless the
        last span ends at the border's end; the last span is then widened
        to the current end of the table's rightmost column.
        """
        cols = []
        end = 0
        while True:
            begin = line.find('-', end)
            if begin < 0:
                break
            end = line.find(' ', begin)
            if end < 0:
                end = len(line)
            cols.append((begin, end))
        if self.columns:
            if cols[-1][1] != self.border_end:
                raise TableMarkupError('Column span incomplete in table '
                                       'line %s.' % (offset+1),
                                       offset=offset)
            # Allow for an unbounded rightmost column:
            cols[-1] = (cols[-1][0], self.columns[-1][1])
        return cols

    def init_row(self, colspec, offset):
        """
        Return a list of empty cells for a row laid out as `colspec`.

        `colspec` is a list of (start, end) pairs.  Each pair must start at
        the start of a table column and end at the end of the same or a
        later table column; the number of additional columns covered is
        recorded as the cell's `morecols`.  Each cell is a list
        ``[0, morecols, offset, []]``.

        Raise `TableMarkupError` if a pair does not line up with the
        table's columns.
        """
        i = 0
        cells = []
        numcols = len(self.columns)
        for start, end in colspec:
            morecols = 0
            # Explicit checks instead of `assert`: assertions are removed
            # when Python runs with -O, which would let misaligned spans
            # through.
            aligned = i < numcols and self.columns[i][0] == start
            while aligned and self.columns[i][1] != end:
                i += 1
                morecols += 1
                aligned = i < numcols
            if not aligned:
                raise TableMarkupError('Column span alignment problem '
                                       'in table line %s.' % (offset+2),
                                       offset=offset+1)
            cells.append([0, morecols, offset, []])
            i += 1
        return cells

    def parse_row(self, lines, start, spanline=None):
        """
        Given the text `lines` of a row, parse it and append to `self.table`.

        The row is parsed according to the current column spec (either
        `spanline` if provided or `self.columns`).  For each column, extract
        text from each line, and check for text in column margins.  Finally,
        adjust for insignificant whitespace.

        `start` is the index of the row's first line within the table;
        `spanline`, if given, is a ``(text, offset)`` pair describing the
        column span underline that terminates the row.
        """
        if not (lines or spanline):
            # No new row, just blank lines.
            return
        if spanline:
            columns = self.parse_columns(*spanline)
        else:
            columns = self.columns[:]
        self.check_columns(lines, start, columns)
        row = self.init_row(columns, start)
        for i, (begin, end) in enumerate(columns):
            cellblock = lines.get_2D_block(0, begin, len(lines), end)
            cellblock.disconnect()      # lines in cell can't sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            row[i][3] = cellblock
        self.table.append(row)

    def check_columns(self, lines, first_line, columns):
        """
        Check for text in column margins and text overflow in the last column.
        Raise TableMarkupError if anything but whitespace is in column margins.
        Adjust the end value for the last column if there is text overflow.

        `columns` is modified in place; `self.columns[-1]` is widened too
        when overflowing text reaches beyond its current end.
        """
        # "Infinite" value for a dummy last column's beginning, used to
        # check for text overflow:
        columns.append((sys.maxsize, None))
        lastcol = len(columns) - 2
        # combining characters do not contribute to the column width
        lines = [strip_combining_chars(line) for line in lines]

        for i in range(len(columns) - 1):
            start, end = columns[i]
            nextstart = columns[i+1][0]
            for offset, line in enumerate(lines):
                if i == lastcol and line[end:].strip():
                    # Text runs past the end of the last column: widen it.
                    text = line[start:].rstrip()
                    new_end = start + len(text)
                    main_start, main_end = self.columns[-1]
                    columns[i] = (start, max(main_end, new_end))
                    if new_end > main_end:
                        self.columns[-1] = (main_start, new_end)
                elif line[end:nextstart].strip():
                    raise TableMarkupError('Text in column margin '
                        'in table line %s.' % (first_line+offset+1),
                        offset=first_line+offset)
        columns.pop()                   # remove the dummy column again

    def structure_from_cells(self):
        """
        Return the final ``(colspecs, headrows, bodyrows)`` structure.

        Column widths are taken from `self.columns`.  If a head/body
        separator was found, the first row whose recorded line offset lies
        beyond the separator starts the body; if there is no such row (or
        no separator), all rows are returned as body rows.
        """
        colspecs = [end - start for start, end in self.columns]
        first_body_row = 0
        if self.head_body_sep:
            for i, row in enumerate(self.table):
                if row[0][2] > self.head_body_sep:
                    first_body_row = i
                    break
        return (colspecs, self.table[:first_body_row],
                self.table[first_body_row:])
533 | |
534 | |
def update_dict_of_lists(master, newdata):
    """
    Merge the list values of `newdata` into `master`, in place.

    Both arguments must be dictionaries whose values are lists.  For a key
    already present in `master` the new items are appended to the existing
    list; for a new key `master` receives a fresh list holding the items.
    """
    for key in newdata:
        additions = newdata[key]
        if key in master:
            master[key].extend(additions)
        else:
            # copy, so `master` never shares a list object with `newdata`
            master[key] = list(additions)