comparison env/lib/python3.7/site-packages/docutils/parsers/rst/tableparser.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 # $Id: tableparser.py 8373 2019-08-27 12:11:30Z milde $
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
4
5 """
6 This module defines table parser classes, which parse plaintext-graphic tables
7 and produce a well-formed data structure suitable for building a CALS table.
8
9 :Classes:
10 - `GridTableParser`: Parse fully-formed tables represented with a grid.
11 - `SimpleTableParser`: Parse simple tables, delimited by top & bottom
12 borders.
13
14 :Exception class: `TableMarkupError`
15
16 :Function:
17 `update_dict_of_lists()`: Merge two dictionaries containing list values.
18 """
19
20 __docformat__ = 'reStructuredText'
21
22
23 import re
24 import sys
25 from docutils import DataError
26 from docutils.utils import strip_combining_chars
27
28
class TableMarkupError(DataError):

    """
    Signal a problem with the markup of a table.

    The optional keyword argument `offset` gives the position of the problem
    relative to the table's start line. It defaults to 0 and is stored as
    the `offset` attribute. Positional arguments are passed on to
    `DataError`; any other keyword arguments are ignored.
    """

    def __init__(self, *args, **kwargs):
        # `kwargs` is local to this call, so a plain lookup is enough.
        self.offset = kwargs.get('offset', 0)
        DataError.__init__(self, *args)
41
42
class TableParser(object):

    """
    Abstract superclass for the common parts of the syntax-specific parsers.

    `parse()` drives the work by calling `setup()`, `find_head_body_sep()`,
    `parse_table()` and `structure_from_cells()` in that order; everything
    except `find_head_body_sep()` must be supplied by a subclass.
    """

    head_body_separator_pat = None
    """Matches the row separator between head rows and body rows."""

    double_width_pad_char = '\x00'
    """Padding character for East Asian double-width text."""

    def parse(self, block):
        """
        Analyze the text `block` and return a table data structure.

        Given a plaintext-graphic table in `block` (list of lines of text; no
        whitespace padding), parse the table, construct and return the data
        necessary to construct a CALS table or equivalent.

        Raise `TableMarkupError` if there is any problem with the markup.
        """
        self.setup(block)
        self.find_head_body_sep()
        self.parse_table()
        return self.structure_from_cells()

    def find_head_body_sep(self):
        """
        Look for a head/body row separator line; store the line index.

        The index is stored in `self.head_body_sep` (left untouched when no
        separator is found) and the separator line in `self.block` is
        rewritten with '-' in place of '=' so that the rest of the parser can
        treat it as an ordinary row separator.

        Raise `TableMarkupError` if more than one separator is present or if
        the separator is the first or last line of the table.
        """
        for i in range(len(self.block)):
            line = self.block[i]
            if not self.head_body_separator_pat.match(line):
                continue
            # Compare against None explicitly: a separator found on line 0
            # has a falsy index but must still count as "already seen".
            if self.head_body_sep is not None:
                raise TableMarkupError(
                    'Multiple head/body row separators '
                    '(table lines %s and %s); only one allowed.'
                    % (self.head_body_sep+1, i+1), offset=i)
            self.head_body_sep = i
            self.block[i] = line.replace('=', '-')
        if self.head_body_sep in (0, len(self.block) - 1):
            # Report the separator's own line, not the last line scanned.
            raise TableMarkupError('The head/body row separator may not be '
                                   'the first or last line of the table.',
                                   offset=self.head_body_sep)
89
90
class GridTableParser(TableParser):

    """
    Parser for grid tables; call `parse()` to process one.

    A grid table looks like this::

        +------------------------+------------+----------+----------+
        | Header row, column 1   | Header 2   | Header 3 | Header 4 |
        +========================+============+==========+==========+
        | body row 1, column 1   | column 2   | column 3 | column 4 |
        +------------------------+------------+----------+----------+
        | body row 2             | Cells may span columns.          |
        +------------------------+------------+---------------------+
        | body row 3             | Cells may  | - Table cells       |
        +------------------------+ span rows. | - contain           |
        | body row 4             |            | - body elements.    |
        +------------------------+------------+---------------------+

    '+' marks intersections, '-' draws row separators (a single optional
    head/body row separator is drawn with '=' instead), and '|' draws column
    separators.

    Feeding the table above to `parse()` yields this data structure::

        ([24, 12, 10, 10],
         [[(0, 0, 1, ['Header row, column 1']),
           (0, 0, 1, ['Header 2']),
           (0, 0, 1, ['Header 3']),
           (0, 0, 1, ['Header 4'])]],
         [[(0, 0, 3, ['body row 1, column 1']),
           (0, 0, 3, ['column 2']),
           (0, 0, 3, ['column 3']),
           (0, 0, 3, ['column 4'])],
          [(0, 0, 5, ['body row 2']),
           (0, 2, 5, ['Cells may span columns.']),
           None,
           None],
          [(0, 0, 7, ['body row 3']),
           (1, 0, 7, ['Cells may', 'span rows.', '']),
           (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
           None],
          [(0, 0, 9, ['body row 4']), None, None, None]])

    Item one is the list of column widths (colspecs), item two the list of
    head rows and item three the list of body rows. A row is a list of
    cells. A cell is None when its slot is taken up by another cell's span;
    otherwise it is a 4-tuple: the count of additional rows the cell covers
    (morerows), the count of additional columns it covers (morecols), the
    line offset of the first line of the cell's contents, and the contents
    themselves as a list of text lines.
    """

    head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')

    def setup(self, block):
        """Initialise the parser state for the table in `block`."""
        # Work on a disconnected copy: the parser may modify it, and the
        # changes must not propagate to the parent.
        self.block = block[:]
        self.block.disconnect()
        self.bottom = len(block) - 1
        width = len(block[0])
        self.right = width - 1
        self.head_body_sep = None
        # Per text column: the last line already covered by a cell.
        self.done = [-1] * width
        self.cells = []
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        Trace out every cell of the table.

        A queue of candidate upper-left corners is seeded with the table's
        own upper-left corner. For each candidate one rectangular cell is
        traced; the cell is recorded and its upper-right and lower-left
        corners join the queue as candidates for further cells. The queue
        is handled in top-to-bottom order while `self.done` tracks how far
        down each text column has been covered.

        Afterwards all row and column boundaries as well as the position
        and extent of each cell are known. Raise `TableMarkupError` when
        some text column has not been covered completely.
        """
        pending = [(0, 0)]
        while pending:
            top, left = pending.pop(0)
            on_outer_edge = top == self.bottom or left == self.right
            if on_outer_edge or top <= self.done[left]:
                continue
            traced = self.scan_cell(top, left)
            if not traced:
                continue
            bottom, right, rowseps, colseps = traced
            update_dict_of_lists(self.rowseps, rowseps)
            update_dict_of_lists(self.colseps, colseps)
            self.mark_done(top, left, bottom, right)
            contents = self.block.get_2D_block(top + 1, left + 1,
                                               bottom, right)
            contents.disconnect()   # lines in cell can't sync with parent
            contents.replace(self.double_width_pad_char, '')
            self.cells.append((top, left, bottom, right, contents))
            pending.extend([(top, right), (bottom, left)])
            pending.sort()
        if not self.check_parse_complete():
            raise TableMarkupError('Malformed table; parse incomplete.')

    def mark_done(self, top, left, bottom, right):
        """Record in `self.done` that columns `left`..`right`-1 are covered
        down to the line above `bottom`."""
        expected = top - 1
        covered_to = bottom - 1
        for column in range(left, right):
            assert self.done[column] == expected
            self.done[column] = covered_to

    def check_parse_complete(self):
        """Return True if every text column has been covered completely."""
        final_line = self.bottom - 1
        return all(self.done[column] == final_line
                   for column in range(self.right))

    def scan_cell(self, top, left):
        """
        Trace a cell whose upper-left corner is at (`top`, `left`).

        Return a ``(bottom, right, rowseps, colseps)`` tuple, or None if no
        cell could be traced from that corner.
        """
        assert self.block[top][left] == '+'
        return self.scan_right(top, left)

    def scan_right(self, top, left):
        """
        Walk right along line `top` in search of the cell's top-right
        corner, recording every column boundary ('+') passed on the way.
        """
        colseps = {}
        line = self.block[top]
        for column in range(left + 1, self.right + 1):
            char = line[column]
            if char == '+':
                colseps[column] = [top]
                found = self.scan_down(top, left, column)
                if found:
                    bottom, rowseps, newcolseps = found
                    update_dict_of_lists(colseps, newcolseps)
                    return bottom, column, rowseps, colseps
            elif char != '-':
                return None
        return None

    def scan_down(self, top, left, right):
        """
        Walk down text column `right` in search of the cell's bottom-right
        corner, recording every row boundary passed on the way.
        """
        rowseps = {}
        for row in range(top + 1, self.bottom + 1):
            char = self.block[row][right]
            if char == '+':
                rowseps[row] = [right]
                found = self.scan_left(top, left, row, right)
                if found:
                    newrowseps, colseps = found
                    update_dict_of_lists(rowseps, newrowseps)
                    return row, rowseps, colseps
            elif char != '|':
                return None
        return None

    def scan_left(self, top, left, bottom, right):
        """
        Walk left along line `bottom`, recording column boundaries, in
        search of the cell's bottom-left corner. That corner has to sit
        directly below the starting point.
        """
        colseps = {}
        line = self.block[bottom]
        for column in range(right - 1, left, -1):
            char = line[column]
            if char == '+':
                colseps[column] = [bottom]
            elif char != '-':
                return None
        if line[left] != '+':
            return None
        rowseps = self.scan_up(top, left, bottom, right)
        # An empty dict is a valid result, so test for None explicitly.
        if rowseps is None:
            return None
        return rowseps, colseps

    def scan_up(self, top, left, bottom, right):
        """
        Walk up text column `left`, recording row boundaries, to check that
        the outline leads back to the starting point.
        """
        rowseps = {}
        for row in range(bottom - 1, top, -1):
            char = self.block[row][left]
            if char == '+':
                rowseps[row] = [left]
            elif char != '|':
                return None
        return rowseps

    def structure_from_cells(self):
        """
        Turn the data gathered by `scan_cell()` into the final
        ``(colspecs, headrows, bodyrows)`` data structure.
        """
        row_bounds = sorted(self.rowseps.keys())    # list of row boundaries
        col_bounds = sorted(self.colseps.keys())    # list of col boundaries
        # boundary position -> row / column number
        rowindex = {bound: num for num, bound in enumerate(row_bounds)}
        colindex = {bound: num for num, bound in enumerate(col_bounds)}
        # column widths: the text between two neighbouring boundaries
        colspecs = [after - before - 1
                    for before, after in zip(col_bounds, col_bounds[1:])]
        numrows = len(row_bounds) - 1
        numcols = len(col_bounds) - 1
        # an empty table with the correct number of rows & columns
        rows = [[None] * numcols for _ in range(numrows)]
        # number of grid slots not yet accounted for; must end up at zero
        remaining = numrows * numcols
        for top, left, bottom, right, block in self.cells:
            rownum = rowindex[top]
            colnum = colindex[left]
            assert rows[rownum][colnum] is None, (
                  'Cell (row %s, column %s) already used.'
                  % (rownum + 1, colnum + 1))
            morerows = rowindex[bottom] - rownum - 1
            morecols = colindex[right] - colnum - 1
            remaining -= (morerows + 1) * (morecols + 1)
            # write the cell into the table
            rows[rownum][colnum] = (morerows, morecols, top + 1, block)
        assert remaining == 0, 'Unused cells remaining.'
        if not self.head_body_sep:
            return (colspecs, [], rows)
        # separate head rows from body rows
        numheadrows = rowindex[self.head_body_sep]
        return (colspecs, rows[:numheadrows], rows[numheadrows:])
324
325
class SimpleTableParser(TableParser):

    """
    Parse a simple table using `parse()`.

    Here's an example of a simple table::

        =====  =====
        col 1  col 2
        =====  =====
        1      Second column of row 1.
        2      Second column of row 2.
               Second line of paragraph.
        3      - Second column of row 3.

               - Second item in bullet
                 list (row 3, column 2).
        4 is a span
        ------------
        5
        =====  =====

    Top and bottom borders use '=', column span underlines use '-', column
    separation is indicated with spaces.

    Passing the above table to the `parse()` method will result in the
    following data structure, whose interpretation is the same as for
    `GridTableParser`::

        ([5, 25],
         [[(0, 0, 1, ['col 1']),
           (0, 0, 1, ['col 2'])]],
         [[(0, 0, 3, ['1']),
           (0, 0, 3, ['Second column of row 1.'])],
          [(0, 0, 4, ['2']),
           (0, 0, 4, ['Second column of row 2.',
                      'Second line of paragraph.'])],
          [(0, 0, 6, ['3']),
           (0, 0, 6, ['- Second column of row 3.',
                      '',
                      '- Second item in bullet',
                      '  list (row 3, column 2).'])],
          [(0, 1, 10, ['4 is a span'])],
          [(0, 0, 12, ['5']),
           (0, 0, 12, [''])]])
    """

    head_body_separator_pat = re.compile('=[ =]*$')
    span_pat = re.compile('-[ -]*$')

    def setup(self, block):
        """Initialise the parser state for the table in `block`."""
        self.block = block[:]           # make a copy; it will be modified
        self.block.disconnect()         # don't propagate changes to parent
        # Convert top & bottom borders to column span underlines:
        self.block[0] = self.block[0].replace('=', '-')
        self.block[-1] = self.block[-1].replace('=', '-')
        self.head_body_sep = None
        self.columns = []
        self.border_end = None
        self.table = []
        self.done = [-1] * len(block[0])
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        First determine the column boundaries from the top border, then
        process rows.  Each row may consist of multiple lines; accumulate
        lines until a row is complete.  Call `self.parse_row` to finish the
        job.
        """
        # Top border must fully describe all table columns.
        self.columns = self.parse_columns(self.block[0], 0)
        self.border_end = self.columns[-1][1]
        firststart, firstend = self.columns[0]
        start = 1                       # first line of the row being built
        text_found = False
        for offset in range(1, len(self.block)):    # skip top border
            line = self.block[offset]
            if self.span_pat.match(line):
                # Column span underline or border; row is complete.
                self.parse_row(self.block[start:offset], start,
                               (line.rstrip(), offset))
                start = offset + 1
                text_found = False
            elif line[firststart:firstend].strip():
                # First column not blank, therefore it's a new row.
                if text_found and offset != start:
                    self.parse_row(self.block[start:offset], start)
                start = offset
                text_found = True
            elif not text_found:
                # Nothing but blank lines so far; the row can't start here.
                start = offset + 1

    def parse_columns(self, line, offset):
        """
        Given a column span underline, return a list of (begin, end) pairs.

        `offset` is the index of `line` within the table and is only used
        for error reporting.  Once the table's columns are known
        (`self.columns` is set), the underline must end where the top border
        ends, otherwise `TableMarkupError` is raised; the last pair is then
        stretched to the current end of the table's last column.
        """
        cols = []
        end = 0
        while True:
            begin = line.find('-', end)
            if begin < 0:
                break
            end = line.find(' ', begin)
            if end < 0:
                end = len(line)
            cols.append((begin, end))
        if self.columns:
            if cols[-1][1] != self.border_end:
                raise TableMarkupError('Column span incomplete in table '
                                       'line %s.' % (offset+1),
                                       offset=offset)
            # Allow for an unbounded rightmost column:
            cols[-1] = (cols[-1][0], self.columns[-1][1])
        return cols

    def init_row(self, colspec, offset):
        """
        Return a list of empty cells for a row laid out by `colspec`.

        `colspec` is a list of (start, end) pairs.  Each pair must begin at
        the start of one of the table's columns and finish at the end of the
        same or a later column; the number of additional columns covered
        becomes the cell's `morecols`.  Each cell is a list
        ``[0, morecols, offset, []]``.

        Raise `TableMarkupError` if a pair does not line up with the table's
        columns.
        """
        def alignment_problem():
            return TableMarkupError('Column span alignment problem '
                                    'in table line %s.' % (offset+2),
                                    offset=offset+1)

        numcols = len(self.columns)
        i = 0
        cells = []
        for start, end in colspec:
            # Explicit checks rather than `assert`, so that the validation
            # also happens when Python runs with optimisation (-O).
            if i >= numcols or start != self.columns[i][0]:
                raise alignment_problem()
            morecols = 0
            while end != self.columns[i][1]:
                i += 1
                morecols += 1
                if i >= numcols:
                    raise alignment_problem()
            cells.append([0, morecols, offset, []])
            i += 1
        return cells

    def parse_row(self, lines, start, spanline=None):
        """
        Given the text `lines` of a row, parse it and append to `self.table`.

        The row is parsed according to the current column spec (either
        `spanline` if provided or `self.columns`).  For each column, extract
        text from each line, and check for text in column margins.  Finally,
        adjust for insignificant whitespace.

        `start` is the offset of the row's first line within the table;
        `spanline`, if given, is a ``(line, offset)`` pair as accepted by
        `parse_columns()`.
        """
        if not (lines or spanline):
            # No new row, just blank lines.
            return
        if spanline:
            columns = self.parse_columns(*spanline)
        else:
            columns = self.columns[:]
        self.check_columns(lines, start, columns)
        row = self.init_row(columns, start)
        for cell, (col_start, col_end) in zip(row, columns):
            cellblock = lines.get_2D_block(0, col_start,
                                           len(lines), col_end)
            cellblock.disconnect()      # lines in cell can't sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            cell[3] = cellblock
        self.table.append(row)

    def check_columns(self, lines, first_line, columns):
        """
        Check for text in column margins and text overflow in the last column.
        Raise TableMarkupError if anything but whitespace is in column margins.
        Adjust the end value for the last column if there is text overflow.

        `columns` is updated in place; `self.columns[-1]` is widened as well
        when the overflowing text reaches beyond its current end.
        """
        # "Infinite" value for a dummy last column's beginning, used to
        # check for text overflow:
        columns.append((sys.maxsize, None))
        lastcol = len(columns) - 2
        # combining characters do not contribute to the column width
        lines = [strip_combining_chars(line) for line in lines]

        for i in range(len(columns) - 1):
            start, end = columns[i]
            nextstart = columns[i+1][0]
            offset = 0
            for line in lines:
                if i == lastcol and line[end:].strip():
                    text = line[start:].rstrip()
                    new_end = start + len(text)
                    main_start, main_end = self.columns[-1]
                    columns[i] = (start, max(main_end, new_end))
                    if new_end > main_end:
                        self.columns[-1] = (main_start, new_end)
                elif line[end:nextstart].strip():
                    raise TableMarkupError('Text in column margin '
                        'in table line %s.' % (first_line+offset+1),
                        offset=first_line+offset)
                offset += 1
        columns.pop()                   # drop the dummy column again

    def structure_from_cells(self):
        """
        Return the ``(colspecs, headrows, bodyrows)`` data structure built
        from `self.columns` and `self.table`.
        """
        colspecs = [end - start for start, end in self.columns]
        first_body_row = 0
        if self.head_body_sep:
            # The body starts with the first row located below the
            # head/body separator line.
            for i in range(len(self.table)):
                if self.table[i][0][2] > self.head_body_sep:
                    first_body_row = i
                    break
        return (colspecs, self.table[:first_body_row],
                self.table[first_body_row:])
533
534
def update_dict_of_lists(master, newdata):
    """
    Merge `newdata` into `master`, list by list.

    Both arguments are dictionaries whose values are lists.  For every key
    of `newdata` its list is appended to the list stored under the same key
    in `master`; keys missing from `master` start out with an empty list.
    `master` is modified in place and nothing is returned.
    """
    for key in newdata:
        target = master.setdefault(key, [])
        target.extend(newdata[key])