diff planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/hturtle.py @ 1:56ad4e20f292 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:32:28 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/hturtle.py	Fri Jul 31 00:32:28 2020 -0400
@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+"""
+Extraction parser for RDF embedded verbatim in HTML or XML files. This is based
+on:
+
+* The specification on embedding turtle into html:
+    http://www.w3.org/TR/turtle/#in-html
+
+For SVG (and currently SVG only) the method also extracts embedded RDF/XML
+data, per the SVG specification.
+
+License: W3C Software License,
+http://www.w3.org/Consortium/Legal/copyright-software
+Author: Ivan Herman
+Copyright: W3C
+"""
+
+from rdflib.parser import Parser
+from .pyRdfa import pyRdfa, Options
+from .pyRdfa.state import ExecutionContext
+from .pyRdfa.embeddedRDF import handle_embeddedRDF
+from .structureddata import _get_orig_source, _check_error
+
+try:
+    import html5lib
+    assert html5lib
+    html5lib = True
+except ImportError:
+    import warnings
+    warnings.warn(
+        'html5lib not found! RDFa and Microdata parsers ' +
+        'will not be available.')
+    html5lib = False
+
+
+class HTurtle(pyRdfa):
+    """
+    Bastardizing the RDFa 1.1 parser to do a hturtle extractions
+    """
+    def __init__(self, options=None, base="", media_type=""):
+        pyRdfa.__init__(self, options=options, base=base,
+                        media_type=media_type, rdfa_version="1.1")
+
+    def graph_from_DOM(self, dom, graph, pgraph=None):
+        """
+        Stealing the parsing function from the original class, to do
+        turtle extraction only
+        """
+
+        def copyGraph(tog, fromg):
+            for t in fromg:
+                tog.add(t)
+            for k, ns in fromg.namespaces():
+                tog.bind(k, ns)
+
+        def _process_one_node(node, graph, state):
+            if handle_embeddedRDF(node, graph, state):
+                # we got an RDF content that has been extracted into Graph;
+                # the recursion should stop
+                return
+            else:
+                # recurse through all the child elements of the current node
+                for n in node.childNodes:
+                    if n.nodeType == node.ELEMENT_NODE:
+                        _process_one_node(n, graph, state)
+
+        topElement = dom.documentElement
+        state = ExecutionContext(topElement, graph, base=self.base,
+                                 options=self.options, rdfa_version="1.1")
+        _process_one_node(topElement, graph, state)
+        if pgraph is not None:
+            copyGraph(pgraph, self.options.processor_graph.graph)
+
+# This is the parser interface as it would look when called from the rest of
+# RDFLib
+
+
+class HTurtleParser(Parser):
+    def parse(self, source, graph, pgraph=None, media_type=""):
+        """
+        @param source: one of the input sources that the RDFLib package defined
+        @type source: InputSource class instance
+        @param graph: target graph for the triples; output graph, in RDFa spec.
+        parlance
+        @type graph: RDFLib Graph
+        @keyword media_type: explicit setting of the preferred media type
+        (a.k.a. content type) of the the RDFa source. None means the content
+        type of the HTTP result is used, or a guess is made based on the
+        suffix of a file
+        @type media_type: string
+        """
+        if html5lib is False:
+            raise ImportError(
+                'html5lib is not installed, cannot ' +
+                'use RDFa and Microdata parsers.')
+
+        (baseURI, orig_source) = _get_orig_source(source)
+        self._process(
+            graph, pgraph, baseURI, orig_source, media_type=media_type)
+
+    def _process(self, graph, baseURI, orig_source, media_type=""):
+        self.options = Options(output_processor_graph=None,
+                               embedded_rdf=True,
+                               vocab_expansion=False,
+                               vocab_cache=False)
+
+        if media_type is None:
+            media_type = ""
+        processor = HTurtle(
+            self.options, base=baseURI, media_type=media_type)
+        processor.graph_from_source(
+            orig_source, graph=graph, pgraph=None, rdfOutput=False)
+        # get possible error triples to raise exceptions
+        _check_error(graph)