Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/docutils/parsers/rst/tableparser.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author | shellac |
---|---|
date | Sat, 02 May 2020 07:14:21 -0400 |
parents | |
children |
line wrap: on
line diff
# $Id: tableparser.py 8373 2019-08-27 12:11:30Z milde $
# Author: David Goodger <goodger@python.org>
# Copyright: This module has been placed in the public domain.

"""
This module defines table parser classes, which parse plaintext-graphic tables
and produce a well-formed data structure suitable for building a CALS table.

:Classes:
    - `GridTableParser`: Parse fully-formed tables represented with a grid.
    - `SimpleTableParser`: Parse simple tables, delimited by top & bottom
      borders.

:Exception class: `TableMarkupError`

:Function:
    `update_dict_of_lists()`: Merge two dictionaries containing list values.
"""

__docformat__ = 'reStructuredText'


import re
import sys
from docutils import DataError
from docutils.utils import strip_combining_chars


class TableMarkupError(DataError):

    """
    Raise if there is any problem with table markup.

    The keyword argument `offset` denotes the offset of the problem
    from the table's start line.
    """

    def __init__(self, *args, **kwargs):
        # `offset` is taken from the keyword arguments (default 0) and kept
        # on the instance; only the positional arguments reach `DataError`.
        self.offset = kwargs.pop('offset', 0)
        DataError.__init__(self, *args)


class TableParser(object):

    """
    Abstract superclass for the common parts of the syntax-specific parsers.

    The subclasses in this module supply `head_body_separator_pat` and the
    `setup()`, `parse_table()` and `structure_from_cells()` methods which
    `parse()` calls.
    """

    head_body_separator_pat = None
    """Matches the row separator between head rows and body rows."""

    double_width_pad_char = '\x00'
    """Padding character for East Asian double-width text."""

    def parse(self, block):
        """
        Analyze the text `block` and return a table data structure.

        Given a plaintext-graphic table in `block` (list of lines of text; no
        whitespace padding), parse the table, construct and return the data
        necessary to construct a CALS table or equivalent.

        Raise `TableMarkupError` if there is any problem with the markup.
        """
        self.setup(block)
        self.find_head_body_sep()
        self.parse_table()
        return self.structure_from_cells()

    def find_head_body_sep(self):
        """
        Look for a head/body row separator line; store the line index.

        The matching line is rewritten in `self.block` with '=' replaced by
        '-', so that later scanning sees an ordinary row separator.

        Raise `TableMarkupError` if more than one separator is present, or if
        the separator is the first or last line of the table.
        """
        for i, line in enumerate(self.block):
            if not self.head_body_separator_pat.match(line):
                continue
            # Compare against None: index 0 is falsy but is still a
            # separator that has already been seen.
            if self.head_body_sep is not None:
                raise TableMarkupError(
                    'Multiple head/body row separators '
                    '(table lines %s and %s); only one allowed.'
                    % (self.head_body_sep+1, i+1), offset=i)
            self.head_body_sep = i
            self.block[i] = line.replace('=', '-')
        if self.head_body_sep in (0, len(self.block) - 1):
            # Report the offending separator line itself.
            raise TableMarkupError('The head/body row separator may not be '
                                   'the first or last line of the table.',
                                   offset=self.head_body_sep)


class GridTableParser(TableParser):

    """
    Parse a grid table using `parse()`.

    Here's an example of a grid table::

        +------------------------+------------+----------+----------+
        | Header row, column 1   | Header 2   | Header 3 | Header 4 |
        +========================+============+==========+==========+
        | body row 1, column 1   | column 2   | column 3 | column 4 |
        +------------------------+------------+----------+----------+
        | body row 2             | Cells may span columns.          |
        +------------------------+------------+---------------------+
        | body row 3             | Cells may  | - Table cells       |
        +------------------------+ span rows. | - contain           |
        | body row 4             |            | - body elements.    |
        +------------------------+------------+---------------------+

    Intersections use '+', row separators use '-' (except for one optional
    head/body row separator, which uses '='), and column separators use '|'.

    Passing the above table to the `parse()` method will result in the
    following data structure::

        ([24, 12, 10, 10],
         [[(0, 0, 1, ['Header row, column 1']),
           (0, 0, 1, ['Header 2']),
           (0, 0, 1, ['Header 3']),
           (0, 0, 1, ['Header 4'])]],
         [[(0, 0, 3, ['body row 1, column 1']),
           (0, 0, 3, ['column 2']),
           (0, 0, 3, ['column 3']),
           (0, 0, 3, ['column 4'])],
          [(0, 0, 5, ['body row 2']),
           (0, 2, 5, ['Cells may span columns.']),
           None,
           None],
          [(0, 0, 7, ['body row 3']),
           (1, 0, 7, ['Cells may', 'span rows.', '']),
           (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
           None],
          [(0, 0, 9, ['body row 4']), None, None, None]])

    The first item is a list containing column widths (colspecs). The second
    item is a list of head rows, and the third is a list of body rows. Each
    row contains a list of cells. Each cell is either None (for a cell unused
    because of another cell's span), or a tuple. A cell tuple contains four
    items: the number of extra rows used by the cell in a vertical span
    (morerows); the number of extra columns used by the cell in a horizontal
    span (morecols); the line offset of the first line of the cell contents;
    and the cell contents, a list of lines of text.
    """

    head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')

    def setup(self, block):
        """Copy `block` and reset all per-table parsing state."""
        self.block = block[:]           # make a copy; it may be modified
        self.block.disconnect()         # don't propagate changes to parent
        self.bottom = len(block) - 1    # index of the last table line
        self.right = len(block[0]) - 1  # index of the last text column
        self.head_body_sep = None
        # Per text column: index of the last line accounted for so far.
        self.done = [-1] * len(block[0])
        self.cells = []
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        Start with a queue of upper-left corners, containing the upper-left
        corner of the table itself. Trace out one rectangular cell, remember
        it, and add its upper-right and lower-left corners to the queue of
        potential upper-left corners of further cells. Process the queue in
        top-to-bottom order, keeping track of how much of each text column has
        been seen.

        We'll end up knowing all the row and column boundaries, cell positions
        and their dimensions.

        Raise `TableMarkupError` if some text column was not completely
        covered by cells.
        """
        corners = [(0, 0)]
        while corners:
            top, left = corners.pop(0)
            if (top == self.bottom or left == self.right
                    or top <= self.done[left]):
                # On the bottom/right border, or already inside a known cell.
                continue
            result = self.scan_cell(top, left)
            if not result:
                continue
            bottom, right, rowseps, colseps = result
            update_dict_of_lists(self.rowseps, rowseps)
            update_dict_of_lists(self.colseps, colseps)
            self.mark_done(top, left, bottom, right)
            cellblock = self.block.get_2D_block(top + 1, left + 1,
                                                bottom, right)
            cellblock.disconnect()      # lines in cell can't sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            self.cells.append((top, left, bottom, right, cellblock))
            corners.extend([(top, right), (bottom, left)])
            corners.sort()
        if not self.check_parse_complete():
            raise TableMarkupError('Malformed table; parse incomplete.')

    def mark_done(self, top, left, bottom, right):
        """For keeping track of how much of each text column has been seen."""
        before = top - 1
        after = bottom - 1
        for col in range(left, right):
            # Internal consistency check: cells must be visited contiguously.
            assert self.done[col] == before
            self.done[col] = after

    def check_parse_complete(self):
        """Each text column should have been completely seen."""
        last = self.bottom - 1
        return all(self.done[col] == last for col in range(self.right))

    def scan_cell(self, top, left):
        """
        Starting at the top-left corner, start tracing out a cell.

        Return ``(bottom, right, rowseps, colseps)`` for the cell found, or
        None if no cell can be traced from this corner.
        """
        assert self.block[top][left] == '+'
        return self.scan_right(top, left)

    def scan_right(self, top, left):
        """
        Look for the top-right corner of the cell, and make note of all column
        boundaries ('+').
        """
        colseps = {}
        line = self.block[top]
        for i in range(left + 1, self.right + 1):
            if line[i] == '+':
                colseps[i] = [top]
                result = self.scan_down(top, left, i)
                if result:
                    bottom, rowseps, newcolseps = result
                    update_dict_of_lists(colseps, newcolseps)
                    return bottom, i, rowseps, colseps
                # Otherwise this '+' is not our corner; keep going right.
            elif line[i] != '-':
                return None
        return None

    def scan_down(self, top, left, right):
        """
        Look for the bottom-right corner of the cell, making note of all row
        boundaries.
        """
        rowseps = {}
        for i in range(top + 1, self.bottom + 1):
            char = self.block[i][right]
            if char == '+':
                rowseps[i] = [right]
                result = self.scan_left(top, left, i, right)
                if result:
                    newrowseps, colseps = result
                    update_dict_of_lists(rowseps, newrowseps)
                    return i, rowseps, colseps
                # Otherwise this '+' is not our corner; keep going down.
            elif char != '|':
                return None
        return None

    def scan_left(self, top, left, bottom, right):
        """
        Noting column boundaries, look for the bottom-left corner of the cell.
        It must line up with the starting point.
        """
        colseps = {}
        line = self.block[bottom]
        for i in range(right - 1, left, -1):
            if line[i] == '+':
                colseps[i] = [bottom]
            elif line[i] != '-':
                return None
        if line[left] != '+':
            return None
        rowseps = self.scan_up(top, left, bottom, right)
        if rowseps is None:
            return None
        return rowseps, colseps

    def scan_up(self, top, left, bottom, right):
        """
        Noting row boundaries, see if we can return to the starting point.

        Return a dictionary of row boundaries (possibly empty) on success,
        None on failure.
        """
        rowseps = {}
        for i in range(bottom - 1, top, -1):
            char = self.block[i][left]
            if char == '+':
                rowseps[i] = [left]
            elif char != '|':
                return None
        return rowseps

    def structure_from_cells(self):
        """
        From the data collected by `scan_cell()`, convert to the final data
        structure.

        Return a ``(colspecs, headrows, bodyrows)`` tuple as described in the
        class docstring.
        """
        rowseps = sorted(self.rowseps)  # list of row boundaries
        # row boundary -> row number mapping
        rowindex = {sep: i for i, sep in enumerate(rowseps)}
        colseps = sorted(self.colseps)  # list of column boundaries
        # column boundary -> column number mapping
        colindex = {sep: i for i, sep in enumerate(colseps)}
        # list of column widths
        colspecs = [colseps[i] - colseps[i - 1] - 1
                    for i in range(1, len(colseps))]
        # prepare an empty table with the correct number of rows & columns
        numrows = len(rowseps) - 1
        numcols = len(colseps) - 1
        rows = [[None] * numcols for _ in range(numrows)]
        # keep track of # of cells remaining; should reduce to zero
        remaining = numrows * numcols
        for top, left, bottom, right, block in self.cells:
            rownum = rowindex[top]
            colnum = colindex[left]
            assert rows[rownum][colnum] is None, (
                'Cell (row %s, column %s) already used.'
                % (rownum + 1, colnum + 1))
            morerows = rowindex[bottom] - rownum - 1
            morecols = colindex[right] - colnum - 1
            remaining -= (morerows + 1) * (morecols + 1)
            # write the cell into the table
            rows[rownum][colnum] = (morerows, morecols, top + 1, block)
        assert remaining == 0, 'Unused cells remaining.'
        if self.head_body_sep is not None:
            # separate head rows from body rows
            numheadrows = rowindex[self.head_body_sep]
            headrows = rows[:numheadrows]
            bodyrows = rows[numheadrows:]
        else:
            headrows = []
            bodyrows = rows
        return (colspecs, headrows, bodyrows)


class SimpleTableParser(TableParser):

    """
    Parse a simple table using `parse()`.

    Here's an example of a simple table::

        =====  =====
        col 1  col 2
        =====  =====
        1      Second column of row 1.
        2      Second column of row 2.
               Second line of paragraph.
        3      - Second column of row 3.

               - Second item in bullet
                 list (row 3, column 2).
        4 is a span
        ------------
        5
        =====  =====

    Top and bottom borders use '=', column span underlines use '-', column
    separation is indicated with spaces.

    Passing the above table to the `parse()` method will result in the
    following data structure, whose interpretation is the same as for
    `GridTableParser`::

        ([5, 25],
         [[(0, 0, 1, ['col 1']),
           (0, 0, 1, ['col 2'])]],
         [[(0, 0, 3, ['1']),
           (0, 0, 3, ['Second column of row 1.'])],
          [(0, 0, 4, ['2']),
           (0, 0, 4, ['Second column of row 2.',
                      'Second line of paragraph.'])],
          [(0, 0, 6, ['3']),
           (0, 0, 6, ['- Second column of row 3.',
                      '',
                      '- Second item in bullet',
                      '  list (row 3, column 2).'])],
          [(0, 1, 10, ['4 is a span'])],
          [(0, 0, 12, ['5']),
           (0, 0, 12, [''])]])
    """

    head_body_separator_pat = re.compile('=[ =]*$')
    span_pat = re.compile('-[ -]*$')

    def setup(self, block):
        """Copy `block`, normalize its borders and reset parsing state."""
        self.block = block[:]           # make a copy; it will be modified
        self.block.disconnect()         # don't propagate changes to parent
        # Convert top & bottom borders to column span underlines:
        self.block[0] = self.block[0].replace('=', '-')
        self.block[-1] = self.block[-1].replace('=', '-')
        self.head_body_sep = None
        self.columns = []
        self.border_end = None
        self.table = []
        self.done = [-1] * len(block[0])
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        First determine the column boundaries from the top border, then
        process rows.  Each row may consist of multiple lines; accumulate
        lines until a row is complete.  Call `self.parse_row` to finish the
        job.
        """
        # Top border must fully describe all table columns.
        self.columns = self.parse_columns(self.block[0], 0)
        self.border_end = self.columns[-1][1]
        firststart, firstend = self.columns[0]
        start = 1                       # first line of the pending row
        text_found = False
        for offset in range(1, len(self.block)):    # skip top border
            line = self.block[offset]
            if self.span_pat.match(line):
                # Column span underline or border; row is complete.
                self.parse_row(self.block[start:offset], start,
                               (line.rstrip(), offset))
                start = offset + 1
                text_found = False
            elif line[firststart:firstend].strip():
                # First column not blank, therefore it's a new row.
                if text_found and offset != start:
                    self.parse_row(self.block[start:offset], start)
                start = offset
                text_found = True
            elif not text_found:
                # Leading blank line; the row has not started yet.
                start = offset + 1

    def parse_columns(self, line, offset):
        """
        Given a column span underline, return a list of (begin, end) pairs.

        `offset` is the index of `line` within the table; it is only used for
        error reporting.  Once the table columns are known (`self.columns`),
        raise `TableMarkupError` if the underline does not end where the top
        border ends.
        """
        cols = []
        end = 0
        while True:
            begin = line.find('-', end)
            if begin < 0:
                break
            end = line.find(' ', begin)
            if end < 0:
                end = len(line)
            cols.append((begin, end))
        if self.columns:
            if cols[-1][1] != self.border_end:
                raise TableMarkupError('Column span incomplete in table '
                                       'line %s.' % (offset+1),
                                       offset=offset)
            # Allow for an unbounded rightmost column:
            cols[-1] = (cols[-1][0], self.columns[-1][1])
        return cols

    def init_row(self, colspec, offset):
        """
        Return a list of empty cells for one row.

        `colspec` is a list of (start, end) pairs for the row's cells, and
        `offset` the index of the row's first line.  Each cell is a list
        ``[0, morecols, offset, []]``, where `morecols` is the number of
        extra table columns (from `self.columns`) the cell spans.

        Raise `TableMarkupError` if a cell does not begin and end on table
        column boundaries.
        """
        message = ('Column span alignment problem '
                   'in table line %s.' % (offset+2))
        numcols = len(self.columns)
        i = 0
        cells = []
        for start, end in colspec:
            morecols = 0
            # Explicit checks instead of `assert`, so that the validation is
            # not stripped when Python runs with optimization (-O).
            if i >= numcols or start != self.columns[i][0]:
                raise TableMarkupError(message, offset=offset+1)
            while end != self.columns[i][1]:
                i += 1
                morecols += 1
                if i >= numcols:
                    raise TableMarkupError(message, offset=offset+1)
            cells.append([0, morecols, offset, []])
            i += 1
        return cells

    def parse_row(self, lines, start, spanline=None):
        """
        Given the text `lines` of a row, parse it and append to `self.table`.

        The row is parsed according to the current column spec (either
        `spanline` if provided or `self.columns`).  For each column, extract
        text from each line, and check for text in column margins.  Finally,
        adjust for insignificant whitespace.

        `start` is the index of the row's first line within the table;
        `spanline`, if given, is a ``(text, offset)`` pair for the column
        span underline that ends the row.
        """
        if not (lines or spanline):
            # No new row, just blank lines.
            return
        if spanline:
            columns = self.parse_columns(*spanline)
        else:
            columns = self.columns[:]
        self.check_columns(lines, start, columns)
        row = self.init_row(columns, start)
        for i, (begin, end) in enumerate(columns):
            cellblock = lines.get_2D_block(0, begin, len(lines), end)
            cellblock.disconnect()      # lines in cell can't sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            row[i][3] = cellblock
        self.table.append(row)

    def check_columns(self, lines, first_line, columns):
        """
        Check for text in column margins and text overflow in the last column.
        Raise TableMarkupError if anything but whitespace is in column margins.
        Adjust the end value for the last column if there is text overflow.

        `columns` is modified in place; `self.columns` is widened as well
        when the overflow extends past the current table width.
        """
        # "Infinite" value for a dummy last column's beginning, used to
        # check for text overflow:
        columns.append((sys.maxsize, None))
        lastcol = len(columns) - 2
        # combining characters do not contribute to the column width
        lines = [strip_combining_chars(line) for line in lines]

        for i in range(len(columns) - 1):
            start, end = columns[i]
            nextstart = columns[i+1][0]
            for offset, line in enumerate(lines):
                if i == lastcol and line[end:].strip():
                    text = line[start:].rstrip()
                    new_end = start + len(text)
                    main_start, main_end = self.columns[-1]
                    columns[i] = (start, max(main_end, new_end))
                    if new_end > main_end:
                        self.columns[-1] = (main_start, new_end)
                elif line[end:nextstart].strip():
                    raise TableMarkupError('Text in column margin '
                                           'in table line %s.'
                                           % (first_line+offset+1),
                                           offset=first_line+offset)
        columns.pop()                   # remove the dummy column again

    def structure_from_cells(self):
        """
        Return the ``(colspecs, headrows, bodyrows)`` tuple for the table.

        Rows whose first cell starts after the head/body separator line are
        body rows; without a separator every row is a body row.
        """
        colspecs = [end - start for start, end in self.columns]
        first_body_row = 0
        if self.head_body_sep is not None:
            for i, row in enumerate(self.table):
                if row[0][2] > self.head_body_sep:
                    first_body_row = i
                    break
        return (colspecs, self.table[:first_body_row],
                self.table[first_body_row:])


def update_dict_of_lists(master, newdata):
    """
    Extend the list values of `master` with those from `newdata`.

    Both parameters must be dictionaries containing list values.
    `master` is modified in place; keys missing from it are added.
    """
    for key, values in newdata.items():
        master.setdefault(key, []).extend(values)