comparison env/lib/python3.7/site-packages/rdflib/plugins/parsers/hturtle.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 # -*- coding: utf-8 -*-
2 """
3 Extraction parser for RDF embedded verbatim into HTML or XML files. This is based
4 on:
5
6 * The specification on embedding turtle into html:
7 http://www.w3.org/TR/turtle/#in-html
8
9 For SVG (and currently SVG only) the method also extracts embedded RDF/XML
10 data, per the SVG specification.
11
12 License: W3C Software License,
13 http://www.w3.org/Consortium/Legal/copyright-software
14 Author: Ivan Herman
15 Copyright: W3C
16 """
17
18 from rdflib.parser import Parser
19 from .pyRdfa import pyRdfa, Options
20 from .pyRdfa.state import ExecutionContext
21 from .pyRdfa.embeddedRDF import handle_embeddedRDF
22 from .structureddata import _get_orig_source, _check_error
23
# Optional dependency probe: after this block the module-level name
# ``html5lib`` is a plain boolean flag (True when the package could be
# imported, False otherwise), not the module object itself.
try:
    import html5lib
except ImportError:
    import warnings

    _missing_html5lib_message = (
        'html5lib not found! RDFa and Microdata parsers '
        'will not be available.'
    )
    warnings.warn(_missing_html5lib_message)
    html5lib = False
else:
    # Reference the imported name once before it is rebound to the flag.
    assert html5lib
    html5lib = True
34
35
class HTurtle(pyRdfa):
    """
    The RDFa 1.1 processor, repurposed so that it only performs the
    extraction of embedded RDF (hturtle) content.
    """

    def __init__(self, options=None, base="", media_type=""):
        """
        Initialise the underlying pyRdfa processor with the RDFa version
        fixed to "1.1".

        @param options: processing options handed on to pyRdfa
        @param base: base URI handed on to pyRdfa
        @param media_type: media type handed on to pyRdfa
        """
        pyRdfa.__init__(
            self,
            options=options,
            base=base,
            media_type=media_type,
            rdfa_version="1.1",
        )

    def graph_from_DOM(self, dom, graph, pgraph=None):
        """
        Replacement for the parent's DOM processing: walk the element tree
        and only extract embedded RDF content into ``graph``.

        @param dom: DOM document; its ``documentElement`` is the walk's root
        @param graph: target graph receiving the extracted triples
        @param pgraph: optional graph; when given, the triples and namespace
            bindings of the options' processor graph are copied into it
        """

        def _walk(element, context):
            # A true result means the element's RDF content has already been
            # extracted into the graph, so its subtree is not visited.
            if handle_embeddedRDF(element, graph, context):
                return
            # Otherwise descend into every child that is an element node.
            for child in element.childNodes:
                if child.nodeType == element.ELEMENT_NODE:
                    _walk(child, context)

        root = dom.documentElement
        context = ExecutionContext(
            root,
            graph,
            base=self.base,
            options=self.options,
            rdfa_version="1.1",
        )
        _walk(root, context)

        if pgraph is None:
            return

        # Mirror the processor graph (triples first, then prefix bindings)
        # into the caller-supplied graph.
        processor_graph = self.options.processor_graph.graph
        for triple in processor_graph:
            pgraph.add(triple)
        for prefix, namespace in processor_graph.namespaces():
            pgraph.bind(prefix, namespace)
73
74 # This is the parser interface as it would look when called from the rest of
75 # RDFLib
76
77
class HTurtleParser(Parser):
    """
    RDFLib parser plugin that extracts RDF embedded in HTML or XML sources
    by delegating to the HTurtle processor.
    """

    def parse(self, source, graph, pgraph=None, media_type=""):
        """
        @param source: one of the input sources that the RDFLib package defined
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa spec.
        parlance
        @type graph: RDFLib Graph
        @param pgraph: optional graph handed on to the processor together
        with the target graph (presumably it receives the processor graph's
        content; confirm against pyRdfa.graph_from_source)
        @type pgraph: RDFLib Graph
        @keyword media_type: explicit setting of the preferred media type
        (a.k.a. content type) of the RDFa source. None means the content
        type of the HTTP result is used, or a guess is made based on the
        suffix of a file
        @type media_type: string
        @raise ImportError: if html5lib is not installed
        """
        if html5lib is False:
            raise ImportError(
                'html5lib is not installed, cannot ' +
                'use RDFa and Microdata parsers.')

        (baseURI, orig_source) = _get_orig_source(source)
        # The positional arguments must line up with _process's signature
        # (graph, baseURI, orig_source); pgraph travels as a keyword so it
        # can no longer shift the others and collide with media_type.
        self._process(
            graph, baseURI, orig_source,
            media_type=media_type, pgraph=pgraph)

    def _process(self, graph, baseURI, orig_source, media_type="",
                 pgraph=None):
        """
        Run the HTurtle processor over the source and fill ``graph``.

        @param graph: target graph for the extracted triples
        @param baseURI: base URI handed to the HTurtle processor
        @param orig_source: source handed to the processor's
        graph_from_source
        @param media_type: media type; None is treated as the empty string
        @param pgraph: optional graph forwarded to graph_from_source;
        defaults to None, which is what was always passed before this
        parameter existed
        """
        self.options = Options(output_processor_graph=None,
                               embedded_rdf=True,
                               vocab_expansion=False,
                               vocab_cache=False)

        if media_type is None:
            media_type = ""
        processor = HTurtle(
            self.options, base=baseURI, media_type=media_type)
        processor.graph_from_source(
            orig_source, graph=graph, pgraph=pgraph, rdfOutput=False)
        # get possible error triples to raise exceptions
        _check_error(graph)