diff env/lib/python3.7/site-packages/rdflib/plugins/parsers/structureddata.py @ 5:9b1c78e6ba9c draft default tip

"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author shellac
date Mon, 01 Jun 2020 08:59:25 -0400
parents 79f47841a781
children
line wrap: on
line diff
--- a/env/lib/python3.7/site-packages/rdflib/plugins/parsers/structureddata.py	Thu May 14 16:47:39 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,308 +0,0 @@
-#!/usr/bin/env python
-"""
-Extraction parsers for structured data embedded into HTML or XML files.
-The former may include RDFa or microdata. The syntax and the extraction
-procedures are based on:
-
-* The RDFa specifications: http://www.w3.org/TR/#tr_RDFa
-* The microdata specification: http://www.w3.org/TR/microdata/
-* The specification of the microdata to RDF conversion:
-http://www.w3.org/TR/microdata-rdf/
-
-License: W3C Software License,
-http://www.w3.org/Consortium/Legal/copyright-software
-Author: Ivan Herman
-Copyright: W3C
-
-"""
-
-from rdflib.parser import (
-    Parser, StringInputSource, URLInputSource, FileInputSource)
-
-try:
-    import html5lib
-    assert html5lib
-    html5lib = True
-except ImportError:
-    import warnings
-    warnings.warn(
-        'html5lib not found! RDFa and Microdata ' +
-        'parsers will not be available.')
-    html5lib = False
-
-
-def _get_orig_source(source):
-    """
-    A bit of a hack; the RDFa/microdata parsers need more than what the
-    upper layers of RDFLib provide...
-    This method returns the original source references.
-    """
-    if isinstance(source, StringInputSource):
-        orig_source = source.getByteStream()
-    elif isinstance(source, URLInputSource):
-        orig_source = source.url
-    elif isinstance(source, FileInputSource):
-        orig_source = source.file.name
-        source.file.close()
-    else:
-        orig_source = source.getByteStream()
-    baseURI = source.getPublicId()
-    return (baseURI, orig_source)
-
-
-def _check_error(graph):
-    from .pyRdfa import RDFA_Error, ns_rdf
-    from .pyRdfa.options import ns_dc
-    for (s, p, o) in graph.triples((None, ns_rdf["type"], RDFA_Error)):
-        for (x, y, msg) in graph.triples((s, ns_dc["description"], None)):
-            raise Exception("RDFa parsing Error! %s" % msg)
-
-
-# This is the parser interface as it would look when called from the
-# rest of RDFLib
-class RDFaParser(Parser):
-    """
-    Wrapper around the RDFa 1.1 parser. For further details on the RDFa 1.1
-    processing, see the relevant W3C documents at
-    http://www.w3.org/TR/#tr_RDFa. RDFa 1.1 is defined for XHTML, HTML5, SVG
-    and, in general, for any XML language.
-
-    Note that the parser can also handle RDFa 1.0 if the extra parameter is
-    used and/or the input source uses RDFa 1.0 specific @version or DTD-s.
-    """
-    def parse(self, source, graph,
-              pgraph=None,
-              media_type="",
-              rdfa_version=None,
-              embedded_rdf=False,
-              space_preserve=True,
-              vocab_expansion=False,
-              vocab_cache=False,
-              refresh_vocab_cache=False,
-              vocab_cache_report=False,
-              check_lite=False):
-        """
-        @param source: one of the input sources that the RDFLib package defined
-        @type source: InputSource class instance
-        @param graph: target graph for the triples; output graph, in RDFa spec.
-        parlance
-        @type graph: RDFLib Graph
-        @keyword pgraph: target for error and warning triples; processor graph,
-        in RDFa spec. parlance. If set to None, these triples are ignored
-        @type pgraph: RDFLib Graph
-        @keyword media_type: explicit setting of the preferred media type
-        (a.k.a. content type) of the the RDFa source. None means the content
-        type of the HTTP result is used, or a guess is made based on the
-        suffix of a file
-        @type media_type: string
-        @keyword rdfa_version: 1.0 or 1.1. If the value is "", then, by
-        default, 1.1 is used unless the source has explicit signals to use
-        1.0 (e.g., using a @version attribute, using a DTD set up for 1.0, etc)
-        @type rdfa_version: string
-        @keyword embedded_rdf: some formats allow embedding RDF in other
-        formats: (X)HTML can contain turtle in a special <script> element,
-        SVG can have RDF/XML embedded in a <metadata> element. This flag
-        controls whether those triples should be interpreted and added to
-        the output graph. Some languages (e.g., SVG) require this, and the
-        flag is ignored.
-        @type embedded_rdf: Boolean
-        @keyword space_preserve: by default, space in the HTML source must be preserved in the generated literal;
-        this behavior can be switched off
-        @type space_preserve: Boolean
-        @keyword vocab_expansion: whether the RDFa @vocab attribute should
-        also mean vocabulary expansion (see the RDFa 1.1 spec for further
-        details)
-        @type vocab_expansion: Boolean
-        @keyword vocab_cache: in case vocab expansion is used, whether the
-        expansion data (i.e., vocabulary) should be cached locally. This
-        requires the ability for the local application to write on the
-        local file system
-        @type vocab_chache: Boolean
-        @keyword vocab_cache_report: whether the details of vocabulary file caching process should be reported
-        in the processor graph as information (mainly useful for debug)
-        @type vocab_cache_report: Boolean
-        @keyword refresh_vocab_cache: whether the caching checks of vocabs should be by-passed, ie, if caches should be re-generated regardless of the stored date (important for vocab development)
-        @type refresh_vocab_cache: Boolean
-        @keyword check_lite: generate extra warnings in case the input source is not RDFa 1.1 check_lite
-        @type check_lite: Boolean
-        """
-
-        if html5lib is False:
-            raise ImportError(
-                'html5lib is not installed, cannot use ' +
-                'RDFa and Microdata parsers.')
-
-        (baseURI, orig_source) = _get_orig_source(source)
-        self._process(graph, pgraph, baseURI, orig_source,
-                      media_type=media_type,
-                      rdfa_version=rdfa_version,
-                      embedded_rdf=embedded_rdf,
-                      space_preserve=space_preserve,
-                      vocab_expansion=vocab_expansion,
-                      vocab_cache=vocab_cache,
-                      vocab_cache_report=vocab_cache_report,
-                      refresh_vocab_cache=refresh_vocab_cache,
-                      check_lite=check_lite
-                      )
-
-    def _process(self, graph, pgraph, baseURI, orig_source,
-                 media_type="",
-                 rdfa_version=None,
-                 embedded_rdf=False,
-                 space_preserve=True,
-                 vocab_expansion=False,
-                 vocab_cache=False,
-                 vocab_cache_report=False,
-                 refresh_vocab_cache=False,
-                 check_lite=False):
-        from .pyRdfa import pyRdfa, Options
-        from rdflib import Graph
-        processor_graph = pgraph if pgraph is not None else Graph()
-        self.options = Options(output_processor_graph=True,
-                               embedded_rdf=embedded_rdf,
-                               space_preserve=space_preserve,
-                               vocab_expansion=vocab_expansion,
-                               vocab_cache=vocab_cache,
-                               vocab_cache_report=vocab_cache_report,
-                               refresh_vocab_cache=refresh_vocab_cache,
-                               check_lite=check_lite)
-
-        if media_type is None:
-            media_type = ""
-        processor = pyRdfa(self.options,
-                           base=baseURI,
-                           media_type=media_type,
-                           rdfa_version=rdfa_version)
-        processor.graph_from_source(orig_source, graph=graph, pgraph=processor_graph, rdfOutput=False)
-        # This may result in an exception if the graph parsing led to an error
-        _check_error(processor_graph)
-
-
-class RDFa10Parser(Parser):
-    """
-    This is just a convenience class to wrap around the RDFa 1.0 parser.
-    """
-    def parse(self, source, graph, pgraph=None, media_type=""):
-        """
-        @param source: one of the input sources that the RDFLib package defined
-        @type source: InputSource class instance
-        @param graph: target graph for the triples; output graph, in RDFa
-        spec. parlance
-        @type graph: RDFLib Graph
-        @keyword pgraph: target for error and warning triples; processor
-        graph, in RDFa spec. parlance. If set to None, these triples are
-        ignored
-        @type pgraph: RDFLib Graph
-        @keyword media_type: explicit setting of the preferred media type
-        (a.k.a. content type) of the the RDFa source. None means the content
-        type of the HTTP result is used, or a guess is made based on the
-        suffix of a file
-        @type media_type: string
-        @keyword rdfOutput: whether Exceptions should be catched and added,
-        as triples, to the processor graph, or whether they should be raised.
-        @type rdfOutput: Boolean
-        """
-        RDFaParser().parse(source, graph, pgraph=pgraph,
-                           media_type=media_type, rdfa_version="1.0")
-
-
-class MicrodataParser(Parser):
-    """
-    Wrapper around an HTML5 microdata, extracted and converted into RDF. For
-    the specification of microdata, see the relevant section of the HTML5
-    spec: http://www.w3.org/TR/microdata/; for the algorithm used to extract
-    microdata into RDF, see http://www.w3.org/TR/microdata-rdf/.
-    """
-    def parse(self, source, graph, vocab_expansion=False, vocab_cache=False):
-        """
-        @param source: one of the input sources that the RDFLib package defined
-        @type source: InputSource class instance
-        @param graph: target graph for the triples; output graph, in RDFa
-        spec. parlance
-        @type graph: RDFLib Graph
-        @keyword vocab_expansion: whether the RDFa @vocab attribute should
-        also mean vocabulary expansion (see the RDFa 1.1 spec for further
-            details)
-        @type vocab_expansion: Boolean
-        @keyword vocab_cache: in case vocab expansion is used, whether the
-        expansion data (i.e., vocabulary) should be cached locally. This
-        requires the ability for the local application to write on the
-        local file system
-        @type vocab_chache: Boolean
-        @keyword rdfOutput: whether Exceptions should be catched and added,
-        as triples, to the processor graph, or whether they should be raised.
-        @type rdfOutput: Boolean
-        """
-        if html5lib is False:
-            raise ImportError(
-                'html5lib is not installed, cannot use RDFa ' +
-                'and Microdata parsers.')
-
-        (baseURI, orig_source) = _get_orig_source(source)
-        self._process(graph, baseURI, orig_source,
-                      vocab_expansion=vocab_expansion,
-                      vocab_cache=vocab_cache)
-
-    def _process(self, graph, baseURI, orig_source,
-                 vocab_expansion=False, vocab_cache=False):
-        from .pyMicrodata import pyMicrodata
-        processor = pyMicrodata(base=baseURI, vocab_expansion=vocab_expansion,
-                                vocab_cache=vocab_cache)
-        processor.graph_from_source(
-            orig_source, graph=graph, rdfOutput=False)
-
-
-class StructuredDataParser(Parser):
-    """
-    Convenience parser to extract both RDFa (including embedded Turtle)
-    and microdata from an HTML file.
-    It is simply a wrapper around the specific parsers.
-    """
-    def parse(self, source, graph,
-              pgraph=None,
-              rdfa_version="",
-              vocab_expansion=False,
-              vocab_cache=False,
-              media_type='text/html'
-              ):
-        """
-        @param source: one of the input sources that the RDFLib package defined
-        @type source: InputSource class instance
-        @param graph: target graph for the triples; output graph, in RDFa
-        spec. parlance
-        @keyword rdfa_version: 1.0 or 1.1. If the value is "", then, by
-        default, 1.1 is used unless the source has explicit signals to use 1.0
-        (e.g., using a @version attribute, using a DTD set up for 1.0, etc)
-        @type rdfa_version: string
-        @type graph: RDFLib Graph
-        @keyword pgraph: target for error and warning triples; processor
-        graph, in RDFa spec. parlance. If set to None, these triples are
-        ignored
-        @type pgraph: RDFLib Graph
-        @keyword vocab_expansion: whether the RDFa @vocab attribute should
-        also mean vocabulary expansion (see the RDFa 1.1 spec for further
-            details)
-        @type vocab_expansion: Boolean
-        @keyword vocab_cache: in case vocab expansion is used, whether the
-        expansion data (i.e., vocabulary) should be cached locally. This
-        requires the ability for the local application to write on the
-        local file system
-        @type vocab_chache: Boolean
-        @keyword rdfOutput: whether Exceptions should be catched and added,
-        as triples, to the processor graph, or whether they should be raised.
-        @type rdfOutput: Boolean
-        """
-        # Note that the media_type argument is ignored, and is here only to avoid an 'unexpected argument' error.
-        # This parser works for text/html only anyway...
-        (baseURI, orig_source) = _get_orig_source(source)
-        if rdfa_version == "" : rdfa_version = "1.1"
-        RDFaParser()._process(graph, pgraph, baseURI, orig_source,
-                              media_type='text/html',
-                              rdfa_version=rdfa_version,
-                              vocab_expansion=vocab_expansion,
-                              vocab_cache=vocab_cache)
-        MicrodataParser()._process(graph, baseURI, orig_source,
-                                   vocab_expansion=vocab_expansion,
-                                   vocab_cache=vocab_cache)
-        from .hturtle import HTurtleParser
-        HTurtleParser()._process(graph, baseURI, orig_source, media_type='text/html')