Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/docutils/parsers/rst/tableparser.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author | shellac |
---|---|
date | Mon, 01 Jun 2020 08:59:25 -0400 (2020-06-01) |
parents | 79f47841a781 |
children |
line wrap: on
line diff
--- a/env/lib/python3.7/site-packages/docutils/parsers/rst/tableparser.py Thu May 14 16:47:39 2020 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,542 +0,0 @@ -# $Id: tableparser.py 8373 2019-08-27 12:11:30Z milde $ -# Author: David Goodger <goodger@python.org> -# Copyright: This module has been placed in the public domain. - -""" -This module defines table parser classes,which parse plaintext-graphic tables -and produce a well-formed data structure suitable for building a CALS table. - -:Classes: - - `GridTableParser`: Parse fully-formed tables represented with a grid. - - `SimpleTableParser`: Parse simple tables, delimited by top & bottom - borders. - -:Exception class: `TableMarkupError` - -:Function: - `update_dict_of_lists()`: Merge two dictionaries containing list values. -""" - -__docformat__ = 'reStructuredText' - - -import re -import sys -from docutils import DataError -from docutils.utils import strip_combining_chars - - -class TableMarkupError(DataError): - - """ - Raise if there is any problem with table markup. - - The keyword argument `offset` denotes the offset of the problem - from the table's start line. - """ - - def __init__(self, *args, **kwargs): - self.offset = kwargs.pop('offset', 0) - DataError.__init__(self, *args) - - -class TableParser(object): - - """ - Abstract superclass for the common parts of the syntax-specific parsers. - """ - - head_body_separator_pat = None - """Matches the row separator between head rows and body rows.""" - - double_width_pad_char = '\x00' - """Padding character for East Asian double-width text.""" - - def parse(self, block): - """ - Analyze the text `block` and return a table data structure. - - Given a plaintext-graphic table in `block` (list of lines of text; no - whitespace padding), parse the table, construct and return the data - necessary to construct a CALS table or equivalent. - - Raise `TableMarkupError` if there is any problem with the markup. - """ - self.setup(block) - self.find_head_body_sep() - self.parse_table() - structure = self.structure_from_cells() - return structure - - def find_head_body_sep(self): - """Look for a head/body row separator line; store the line index.""" - for i in range(len(self.block)): - line = self.block[i] - if self.head_body_separator_pat.match(line): - if self.head_body_sep: - raise TableMarkupError( - 'Multiple head/body row separators ' - '(table lines %s and %s); only one allowed.' - % (self.head_body_sep+1, i+1), offset=i) - else: - self.head_body_sep = i - self.block[i] = line.replace('=', '-') - if self.head_body_sep == 0 or self.head_body_sep == (len(self.block) - - 1): - raise TableMarkupError('The head/body row separator may not be ' - 'the first or last line of the table.', - offset=i) - - -class GridTableParser(TableParser): - - """ - Parse a grid table using `parse()`. - - Here's an example of a grid table:: - - +------------------------+------------+----------+----------+ - | Header row, column 1 | Header 2 | Header 3 | Header 4 | - +========================+============+==========+==========+ - | body row 1, column 1 | column 2 | column 3 | column 4 | - +------------------------+------------+----------+----------+ - | body row 2 | Cells may span columns. | - +------------------------+------------+---------------------+ - | body row 3 | Cells may | - Table cells | - +------------------------+ span rows. | - contain | - | body row 4 | | - body elements. | - +------------------------+------------+---------------------+ - - Intersections use '+', row separators use '-' (except for one optional - head/body row separator, which uses '='), and column separators use '|'. - - Passing the above table to the `parse()` method will result in the - following data structure:: - - ([24, 12, 10, 10], - [[(0, 0, 1, ['Header row, column 1']), - (0, 0, 1, ['Header 2']), - (0, 0, 1, ['Header 3']), - (0, 0, 1, ['Header 4'])]], - [[(0, 0, 3, ['body row 1, column 1']), - (0, 0, 3, ['column 2']), - (0, 0, 3, ['column 3']), - (0, 0, 3, ['column 4'])], - [(0, 0, 5, ['body row 2']), - (0, 2, 5, ['Cells may span columns.']), - None, - None], - [(0, 0, 7, ['body row 3']), - (1, 0, 7, ['Cells may', 'span rows.', '']), - (1, 1, 7, ['- Table cells', '- contain', '- body elements.']), - None], - [(0, 0, 9, ['body row 4']), None, None, None]]) - - The first item is a list containing column widths (colspecs). The second - item is a list of head rows, and the third is a list of body rows. Each - row contains a list of cells. Each cell is either None (for a cell unused - because of another cell's span), or a tuple. A cell tuple contains four - items: the number of extra rows used by the cell in a vertical span - (morerows); the number of extra columns used by the cell in a horizontal - span (morecols); the line offset of the first line of the cell contents; - and the cell contents, a list of lines of text. - """ - - head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$') - - def setup(self, block): - self.block = block[:] # make a copy; it may be modified - self.block.disconnect() # don't propagate changes to parent - self.bottom = len(block) - 1 - self.right = len(block[0]) - 1 - self.head_body_sep = None - self.done = [-1] * len(block[0]) - self.cells = [] - self.rowseps = {0: [0]} - self.colseps = {0: [0]} - - def parse_table(self): - """ - Start with a queue of upper-left corners, containing the upper-left - corner of the table itself. Trace out one rectangular cell, remember - it, and add its upper-right and lower-left corners to the queue of - potential upper-left corners of further cells. Process the queue in - top-to-bottom order, keeping track of how much of each text column has - been seen. - - We'll end up knowing all the row and column boundaries, cell positions - and their dimensions. - """ - corners = [(0, 0)] - while corners: - top, left = corners.pop(0) - if top == self.bottom or left == self.right \ - or top <= self.done[left]: - continue - result = self.scan_cell(top, left) - if not result: - continue - bottom, right, rowseps, colseps = result - update_dict_of_lists(self.rowseps, rowseps) - update_dict_of_lists(self.colseps, colseps) - self.mark_done(top, left, bottom, right) - cellblock = self.block.get_2D_block(top + 1, left + 1, - bottom, right) - cellblock.disconnect() # lines in cell can't sync with parent - cellblock.replace(self.double_width_pad_char, '') - self.cells.append((top, left, bottom, right, cellblock)) - corners.extend([(top, right), (bottom, left)]) - corners.sort() - if not self.check_parse_complete(): - raise TableMarkupError('Malformed table; parse incomplete.') - - def mark_done(self, top, left, bottom, right): - """For keeping track of how much of each text column has been seen.""" - before = top - 1 - after = bottom - 1 - for col in range(left, right): - assert self.done[col] == before - self.done[col] = after - - def check_parse_complete(self): - """Each text column should have been completely seen.""" - last = self.bottom - 1 - for col in range(self.right): - if self.done[col] != last: - return False - return True - - def scan_cell(self, top, left): - """Starting at the top-left corner, start tracing out a cell.""" - assert self.block[top][left] == '+' - result = self.scan_right(top, left) - return result - - def scan_right(self, top, left): - """ - Look for the top-right corner of the cell, and make note of all column - boundaries ('+'). - """ - colseps = {} - line = self.block[top] - for i in range(left + 1, self.right + 1): - if line[i] == '+': - colseps[i] = [top] - result = self.scan_down(top, left, i) - if result: - bottom, rowseps, newcolseps = result - update_dict_of_lists(colseps, newcolseps) - return bottom, i, rowseps, colseps - elif line[i] != '-': - return None - return None - - def scan_down(self, top, left, right): - """ - Look for the bottom-right corner of the cell, making note of all row - boundaries. - """ - rowseps = {} - for i in range(top + 1, self.bottom + 1): - if self.block[i][right] == '+': - rowseps[i] = [right] - result = self.scan_left(top, left, i, right) - if result: - newrowseps, colseps = result - update_dict_of_lists(rowseps, newrowseps) - return i, rowseps, colseps - elif self.block[i][right] != '|': - return None - return None - - def scan_left(self, top, left, bottom, right): - """ - Noting column boundaries, look for the bottom-left corner of the cell. - It must line up with the starting point. - """ - colseps = {} - line = self.block[bottom] - for i in range(right - 1, left, -1): - if line[i] == '+': - colseps[i] = [bottom] - elif line[i] != '-': - return None - if line[left] != '+': - return None - result = self.scan_up(top, left, bottom, right) - if result is not None: - rowseps = result - return rowseps, colseps - return None - - def scan_up(self, top, left, bottom, right): - """ - Noting row boundaries, see if we can return to the starting point. - """ - rowseps = {} - for i in range(bottom - 1, top, -1): - if self.block[i][left] == '+': - rowseps[i] = [left] - elif self.block[i][left] != '|': - return None - return rowseps - - def structure_from_cells(self): - """ - From the data collected by `scan_cell()`, convert to the final data - structure. - """ - rowseps = sorted(self.rowseps.keys()) # list of row boundaries - rowindex = {} - for i in range(len(rowseps)): - rowindex[rowseps[i]] = i # row boundary -> row number mapping - colseps = sorted(self.colseps.keys()) # list of column boundaries - colindex = {} - for i in range(len(colseps)): - colindex[colseps[i]] = i # column boundary -> col number map - colspecs = [(colseps[i] - colseps[i - 1] - 1) - for i in range(1, len(colseps))] # list of column widths - # prepare an empty table with the correct number of rows & columns - onerow = [None for i in range(len(colseps) - 1)] - rows = [onerow[:] for i in range(len(rowseps) - 1)] - # keep track of # of cells remaining; should reduce to zero - remaining = (len(rowseps) - 1) * (len(colseps) - 1) - for top, left, bottom, right, block in self.cells: - rownum = rowindex[top] - colnum = colindex[left] - assert rows[rownum][colnum] is None, ( - 'Cell (row %s, column %s) already used.' - % (rownum + 1, colnum + 1)) - morerows = rowindex[bottom] - rownum - 1 - morecols = colindex[right] - colnum - 1 - remaining -= (morerows + 1) * (morecols + 1) - # write the cell into the table - rows[rownum][colnum] = (morerows, morecols, top + 1, block) - assert remaining == 0, 'Unused cells remaining.' - if self.head_body_sep: # separate head rows from body rows - numheadrows = rowindex[self.head_body_sep] - headrows = rows[:numheadrows] - bodyrows = rows[numheadrows:] - else: - headrows = [] - bodyrows = rows - return (colspecs, headrows, bodyrows) - - -class SimpleTableParser(TableParser): - - """ - Parse a simple table using `parse()`. - - Here's an example of a simple table:: - - ===== ===== - col 1 col 2 - ===== ===== - 1 Second column of row 1. - 2 Second column of row 2. - Second line of paragraph. - 3 - Second column of row 3. - - - Second item in bullet - list (row 3, column 2). - 4 is a span - ------------ - 5 - ===== ===== - - Top and bottom borders use '=', column span underlines use '-', column - separation is indicated with spaces. - - Passing the above table to the `parse()` method will result in the - following data structure, whose interpretation is the same as for - `GridTableParser`:: - - ([5, 25], - [[(0, 0, 1, ['col 1']), - (0, 0, 1, ['col 2'])]], - [[(0, 0, 3, ['1']), - (0, 0, 3, ['Second column of row 1.'])], - [(0, 0, 4, ['2']), - (0, 0, 4, ['Second column of row 2.', - 'Second line of paragraph.'])], - [(0, 0, 6, ['3']), - (0, 0, 6, ['- Second column of row 3.', - '', - '- Second item in bullet', - ' list (row 3, column 2).'])], - [(0, 1, 10, ['4 is a span'])], - [(0, 0, 12, ['5']), - (0, 0, 12, [''])]]) - """ - - head_body_separator_pat = re.compile('=[ =]*$') - span_pat = re.compile('-[ -]*$') - - def setup(self, block): - self.block = block[:] # make a copy; it will be modified - self.block.disconnect() # don't propagate changes to parent - # Convert top & bottom borders to column span underlines: - self.block[0] = self.block[0].replace('=', '-') - self.block[-1] = self.block[-1].replace('=', '-') - self.head_body_sep = None - self.columns = [] - self.border_end = None - self.table = [] - self.done = [-1] * len(block[0]) - self.rowseps = {0: [0]} - self.colseps = {0: [0]} - - def parse_table(self): - """ - First determine the column boundaries from the top border, then - process rows. Each row may consist of multiple lines; accumulate - lines until a row is complete. Call `self.parse_row` to finish the - job. - """ - # Top border must fully describe all table columns. - self.columns = self.parse_columns(self.block[0], 0) - self.border_end = self.columns[-1][1] - firststart, firstend = self.columns[0] - offset = 1 # skip top border - start = 1 - text_found = None - while offset < len(self.block): - line = self.block[offset] - if self.span_pat.match(line): - # Column span underline or border; row is complete. - self.parse_row(self.block[start:offset], start, - (line.rstrip(), offset)) - start = offset + 1 - text_found = None - elif line[firststart:firstend].strip(): - # First column not blank, therefore it's a new row. - if text_found and offset != start: - self.parse_row(self.block[start:offset], start) - start = offset - text_found = 1 - elif not text_found: - start = offset + 1 - offset += 1 - - def parse_columns(self, line, offset): - """ - Given a column span underline, return a list of (begin, end) pairs. - """ - cols = [] - end = 0 - while True: - begin = line.find('-', end) - end = line.find(' ', begin) - if begin < 0: - break - if end < 0: - end = len(line) - cols.append((begin, end)) - if self.columns: - if cols[-1][1] != self.border_end: - raise TableMarkupError('Column span incomplete in table ' - 'line %s.' % (offset+1), - offset=offset) - # Allow for an unbounded rightmost column: - cols[-1] = (cols[-1][0], self.columns[-1][1]) - return cols - - def init_row(self, colspec, offset): - i = 0 - cells = [] - for start, end in colspec: - morecols = 0 - try: - assert start == self.columns[i][0] - while end != self.columns[i][1]: - i += 1 - morecols += 1 - except (AssertionError, IndexError): - raise TableMarkupError('Column span alignment problem ' - 'in table line %s.' % (offset+2), - offset=offset+1) - cells.append([0, morecols, offset, []]) - i += 1 - return cells - - def parse_row(self, lines, start, spanline=None): - """ - Given the text `lines` of a row, parse it and append to `self.table`. - - The row is parsed according to the current column spec (either - `spanline` if provided or `self.columns`). For each column, extract - text from each line, and check for text in column margins. Finally, - adjust for insignificant whitespace. - """ - if not (lines or spanline): - # No new row, just blank lines. - return - if spanline: - columns = self.parse_columns(*spanline) - span_offset = spanline[1] - else: - columns = self.columns[:] - span_offset = start - self.check_columns(lines, start, columns) - row = self.init_row(columns, start) - for i in range(len(columns)): - start, end = columns[i] - cellblock = lines.get_2D_block(0, start, len(lines), end) - cellblock.disconnect() # lines in cell can't sync with parent - cellblock.replace(self.double_width_pad_char, '') - row[i][3] = cellblock - self.table.append(row) - - def check_columns(self, lines, first_line, columns): - """ - Check for text in column margins and text overflow in the last column. - Raise TableMarkupError if anything but whitespace is in column margins. - Adjust the end value for the last column if there is text overflow. - """ - # "Infinite" value for a dummy last column's beginning, used to - # check for text overflow: - columns.append((sys.maxsize, None)) - lastcol = len(columns) - 2 - # combining characters do not contribute to the column width - lines = [strip_combining_chars(line) for line in lines] - - for i in range(len(columns) - 1): - start, end = columns[i] - nextstart = columns[i+1][0] - offset = 0 - for line in lines: - if i == lastcol and line[end:].strip(): - text = line[start:].rstrip() - new_end = start + len(text) - main_start, main_end = self.columns[-1] - columns[i] = (start, max(main_end, new_end)) - if new_end > main_end: - self.columns[-1] = (main_start, new_end) - elif line[end:nextstart].strip(): - raise TableMarkupError('Text in column margin ' - 'in table line %s.' % (first_line+offset+1), - offset=first_line+offset) - offset += 1 - columns.pop() - - def structure_from_cells(self): - colspecs = [end - start for start, end in self.columns] - first_body_row = 0 - if self.head_body_sep: - for i in range(len(self.table)): - if self.table[i][0][2] > self.head_body_sep: - first_body_row = i - break - return (colspecs, self.table[:first_body_row], - self.table[first_body_row:]) - - -def update_dict_of_lists(master, newdata): - """ - Extend the list values of `master` with those from `newdata`. - - Both parameters must be dictionaries containing list values. - """ - for key, values in newdata.items(): - master.setdefault(key, []).extend(values)