Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/structureddata.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:d30785e31577 | 1:56ad4e20f292 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Extraction parsers for structured data embedded into HTML or XML files. | |
4 The former may include RDFa or microdata. The syntax and the extraction | |
5 procedures are based on: | |
6 | |
7 * The RDFa specifications: http://www.w3.org/TR/#tr_RDFa | |
8 * The microdata specification: http://www.w3.org/TR/microdata/ | |
9 * The specification of the microdata to RDF conversion: | |
10 http://www.w3.org/TR/microdata-rdf/ | |
11 | |
12 License: W3C Software License, | |
13 http://www.w3.org/Consortium/Legal/copyright-software | |
14 Author: Ivan Herman | |
15 Copyright: W3C | |
16 | |
17 """ | |
18 | |
19 from rdflib.parser import ( | |
20 Parser, StringInputSource, URLInputSource, FileInputSource) | |
21 | |
# Probe for html5lib once, at import time.  After this block the module-level
# name ``html5lib`` is a plain boolean flag (not the module object); the
# parsers below test it before doing any work.
try:
    import html5lib  # noqa: F401 -- imported only to test availability
except ImportError:
    import warnings
    warnings.warn(
        'html5lib not found! RDFa and Microdata ' +
        'parsers will not be available.')
    html5lib = False
else:
    html5lib = True
32 | |
33 | |
def _get_orig_source(source):
    """
    Recover the original source reference from an RDFLib input source.

    A bit of a hack: the RDFa/microdata processors need more than what the
    upper layers of RDFLib provide, so this helper digs the underlying
    reference back out of the input source object.

    @param source: the input source handed to the parser
    @type source: InputSource class instance
    @return: a C{(baseURI, orig_source)} tuple. C{baseURI} is the value of
    C{source.getPublicId()}. C{orig_source} is the URL for a URLInputSource,
    the file name for a FileInputSource (whose file object is closed here),
    and the byte stream for a StringInputSource or any other kind of source.
    """
    if isinstance(source, StringInputSource):
        reference = source.getByteStream()
    elif isinstance(source, URLInputSource):
        reference = source.url
    elif isinstance(source, FileInputSource):
        # Only the name is handed on; the already-open file object is closed.
        handle = source.file
        reference = handle.name
        handle.close()
    else:
        reference = source.getByteStream()
    return (source.getPublicId(), reference)
51 | |
52 | |
def _check_error(graph):
    """
    Raise if the given processor graph contains a described RDFa error.

    Looks for subjects typed as C{RDFA_Error}; for the first such subject
    that also carries a C{dc:description}, an exception is raised with that
    description as its message. If nothing matches, returns silently.

    @param graph: the processor graph to inspect
    @type graph: RDFLib Graph
    @raise Exception: with the text "RDFa parsing Error! <description>"
    """
    from .pyRdfa import RDFA_Error, ns_rdf
    from .pyRdfa.options import ns_dc
    error_pattern = (None, ns_rdf["type"], RDFA_Error)
    for (error_node, _pred, _obj) in graph.triples(error_pattern):
        descriptions = graph.triples(
            (error_node, ns_dc["description"], None))
        for (_subj, _desc_pred, message) in descriptions:
            raise Exception("RDFa parsing Error! %s" % message)
59 | |
60 | |
61 # This is the parser interface as it would look when called from the | |
62 # rest of RDFLib | |
class RDFaParser(Parser):
    """
    Wrapper around the RDFa 1.1 parser. For further details on the RDFa 1.1
    processing, see the relevant W3C documents at
    http://www.w3.org/TR/#tr_RDFa. RDFa 1.1 is defined for XHTML, HTML5, SVG
    and, in general, for any XML language.

    Note that the parser can also handle RDFa 1.0 if the extra parameter is
    used and/or the input source uses RDFa 1.0 specific @version or DTD-s.
    """
    def parse(self, source, graph,
              pgraph=None,
              media_type="",
              rdfa_version=None,
              embedded_rdf=False,
              space_preserve=True,
              vocab_expansion=False,
              vocab_cache=False,
              refresh_vocab_cache=False,
              vocab_cache_report=False,
              check_lite=False):
        """
        Extract RDFa from the source into the target graph.

        @param source: one of the input sources that the RDFLib package
        defined
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa
        spec. parlance
        @type graph: RDFLib Graph
        @keyword pgraph: target for error and warning triples; processor
        graph, in RDFa spec. parlance. If set to None, a throw-away graph is
        used internally, so these triples are not returned to the caller
        @type pgraph: RDFLib Graph
        @keyword media_type: explicit setting of the preferred media type
        (a.k.a. content type) of the RDFa source. None is treated like the
        empty string; presumably the processor then relies on the content
        type of the HTTP result or on the suffix of the file
        @type media_type: string
        @keyword rdfa_version: 1.0 or 1.1. If left unset (None), the choice
        is left to the processor; by default, 1.1 is used unless the source
        has explicit signals to use 1.0 (e.g., using a @version attribute,
        using a DTD set up for 1.0, etc)
        @type rdfa_version: string
        @keyword embedded_rdf: some formats allow embedding RDF in other
        formats: (X)HTML can contain turtle in a special <script> element,
        SVG can have RDF/XML embedded in a <metadata> element. This flag
        controls whether those triples should be interpreted and added to
        the output graph. Some languages (e.g., SVG) require this, and the
        flag is ignored.
        @type embedded_rdf: Boolean
        @keyword space_preserve: by default, space in the HTML source must
        be preserved in the generated literal; this behavior can be switched
        off
        @type space_preserve: Boolean
        @keyword vocab_expansion: whether the RDFa @vocab attribute should
        also mean vocabulary expansion (see the RDFa 1.1 spec for further
        details)
        @type vocab_expansion: Boolean
        @keyword vocab_cache: in case vocab expansion is used, whether the
        expansion data (i.e., vocabulary) should be cached locally. This
        requires the ability for the local application to write on the
        local file system
        @type vocab_cache: Boolean
        @keyword refresh_vocab_cache: whether the caching checks of vocabs
        should be by-passed, i.e., if caches should be re-generated
        regardless of the stored date (important for vocab development)
        @type refresh_vocab_cache: Boolean
        @keyword vocab_cache_report: whether the details of the vocabulary
        file caching process should be reported in the processor graph as
        information (mainly useful for debug)
        @type vocab_cache_report: Boolean
        @keyword check_lite: generate extra warnings in case the input
        source is not RDFa 1.1 Lite
        @type check_lite: Boolean
        @raise ImportError: if html5lib is not installed
        """
        if html5lib is False:
            raise ImportError(
                'html5lib is not installed, cannot use ' +
                'RDFa and Microdata parsers.')

        # Everything besides the graphs and the source is passed through
        # unchanged to _process.
        settings = dict(
            media_type=media_type,
            rdfa_version=rdfa_version,
            embedded_rdf=embedded_rdf,
            space_preserve=space_preserve,
            vocab_expansion=vocab_expansion,
            vocab_cache=vocab_cache,
            vocab_cache_report=vocab_cache_report,
            refresh_vocab_cache=refresh_vocab_cache,
            check_lite=check_lite,
        )
        baseURI, orig_source = _get_orig_source(source)
        self._process(graph, pgraph, baseURI, orig_source, **settings)

    def _process(self, graph, pgraph, baseURI, orig_source,
                 media_type="",
                 rdfa_version=None,
                 embedded_rdf=False,
                 space_preserve=True,
                 vocab_expansion=False,
                 vocab_cache=False,
                 vocab_cache_report=False,
                 refresh_vocab_cache=False,
                 check_lite=False):
        """
        Run the pyRdfa processor on an already resolved source.

        The keyword arguments have the same meaning as in L{parse}. The
        options object built here is kept on the instance as
        C{self.options}.

        @param graph: output graph receiving the extracted triples
        @param pgraph: processor graph, or None to use a temporary one
        @param baseURI: base URI handed to the processor
        @param orig_source: the original source reference (URL, file name
        or byte stream)
        @raise Exception: if the processor graph reports an RDFa error
        """
        from .pyRdfa import pyRdfa, Options
        from rdflib import Graph

        # The processor always writes its messages somewhere; without a
        # caller-supplied processor graph a private one is used so that
        # errors can still be detected below.
        if pgraph is None:
            processor_graph = Graph()
        else:
            processor_graph = pgraph

        self.options = Options(
            output_processor_graph=True,
            embedded_rdf=embedded_rdf,
            space_preserve=space_preserve,
            vocab_expansion=vocab_expansion,
            vocab_cache=vocab_cache,
            vocab_cache_report=vocab_cache_report,
            refresh_vocab_cache=refresh_vocab_cache,
            check_lite=check_lite)

        processor = pyRdfa(
            self.options,
            base=baseURI,
            media_type="" if media_type is None else media_type,
            rdfa_version=rdfa_version)
        processor.graph_from_source(
            orig_source,
            graph=graph,
            pgraph=processor_graph,
            rdfOutput=False)
        # This may result in an exception if the graph parsing led to an
        # error
        _check_error(processor_graph)
179 | |
180 | |
class RDFa10Parser(Parser):
    """
    This is just a convenience class to wrap around the RDFa 1.0 parser.
    """
    def parse(self, source, graph, pgraph=None, media_type=""):
        """
        Parse the source as RDFa 1.0 by delegating to a fresh RDFaParser
        with the RDFa version fixed to "1.0".

        @param source: one of the input sources that the RDFLib package
        defined
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa
        spec. parlance
        @type graph: RDFLib Graph
        @keyword pgraph: target for error and warning triples; processor
        graph, in RDFa spec. parlance. Passed on unchanged to RDFaParser
        @type pgraph: RDFLib Graph
        @keyword media_type: explicit setting of the preferred media type
        (a.k.a. content type) of the RDFa source. Passed on unchanged to
        RDFaParser
        @type media_type: string
        """
        delegate = RDFaParser()
        delegate.parse(
            source,
            graph,
            pgraph=pgraph,
            media_type=media_type,
            rdfa_version="1.0")
207 | |
208 | |
class MicrodataParser(Parser):
    """
    Wrapper around an HTML5 microdata, extracted and converted into RDF. For
    the specification of microdata, see the relevant section of the HTML5
    spec: http://www.w3.org/TR/microdata/; for the algorithm used to extract
    microdata into RDF, see http://www.w3.org/TR/microdata-rdf/.
    """
    def parse(self, source, graph, vocab_expansion=False, vocab_cache=False):
        """
        Extract microdata from the source into the target graph.

        @param source: one of the input sources that the RDFLib package
        defined
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa
        spec. parlance
        @type graph: RDFLib Graph
        @keyword vocab_expansion: whether vocabulary expansion should be
        performed; passed on to the pyMicrodata processor
        @type vocab_expansion: Boolean
        @keyword vocab_cache: in case vocab expansion is used, whether the
        expansion data (i.e., vocabulary) should be cached locally. This
        requires the ability for the local application to write on the
        local file system
        @type vocab_cache: Boolean
        @raise ImportError: if html5lib is not installed
        """
        if html5lib is False:
            raise ImportError(
                'html5lib is not installed, cannot use RDFa ' +
                'and Microdata parsers.')

        baseURI, orig_source = _get_orig_source(source)
        expansion_settings = {
            "vocab_expansion": vocab_expansion,
            "vocab_cache": vocab_cache,
        }
        self._process(graph, baseURI, orig_source, **expansion_settings)

    def _process(self, graph, baseURI, orig_source,
                 vocab_expansion=False, vocab_cache=False):
        """
        Run the pyMicrodata processor on an already resolved source.

        @param graph: output graph receiving the extracted triples
        @param baseURI: base URI handed to the processor
        @param orig_source: the original source reference (URL, file name
        or byte stream)
        @keyword vocab_expansion: see L{parse}
        @keyword vocab_cache: see L{parse}
        """
        from .pyMicrodata import pyMicrodata
        extractor = pyMicrodata(
            base=baseURI,
            vocab_expansion=vocab_expansion,
            vocab_cache=vocab_cache)
        extractor.graph_from_source(orig_source, graph=graph, rdfOutput=False)
253 | |
254 | |
class StructuredDataParser(Parser):
    """
    Convenience parser to extract both RDFa (including embedded Turtle)
    and microdata from an HTML file.
    It is simply a wrapper around the specific parsers.
    """
    def parse(self, source, graph,
              pgraph=None,
              rdfa_version="",
              vocab_expansion=False,
              vocab_cache=False,
              media_type='text/html'
              ):
        """
        Run the RDFa, microdata and embedded-Turtle processors, in that
        order, on the same source, adding all triples to the same graph.

        @param source: one of the input sources that the RDFLib package
        defined
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa
        spec. parlance
        @type graph: RDFLib Graph
        @keyword pgraph: target for error and warning triples of the RDFa
        processing; processor graph, in RDFa spec. parlance. If set to None,
        these triples are not returned to the caller
        @type pgraph: RDFLib Graph
        @keyword rdfa_version: 1.0 or 1.1. The empty string (the default)
        is replaced by "1.1"
        @type rdfa_version: string
        @keyword vocab_expansion: whether the RDFa @vocab attribute should
        also mean vocabulary expansion (see the RDFa 1.1 spec for further
        details)
        @type vocab_expansion: Boolean
        @keyword vocab_cache: in case vocab expansion is used, whether the
        expansion data (i.e., vocabulary) should be cached locally. This
        requires the ability for the local application to write on the
        local file system
        @type vocab_cache: Boolean
        @keyword media_type: ignored; accepted only to avoid an 'unexpected
        argument' error. This parser works for text/html only
        @type media_type: string
        @raise ImportError: if html5lib is not installed
        @raise Exception: if the RDFa processing reports an error
        """
        # Same guard as RDFaParser.parse and MicrodataParser.parse: this
        # method calls their _process methods directly and would otherwise
        # bypass the check.
        if html5lib is False:
            raise ImportError(
                'html5lib is not installed, cannot use RDFa ' +
                'and Microdata parsers.')

        # Note that the media_type argument is ignored, and is here only to
        # avoid an 'unexpected argument' error.
        # This parser works for text/html only anyway...
        (baseURI, orig_source) = _get_orig_source(source)
        if rdfa_version == "":
            rdfa_version = "1.1"

        # The same source object is handed to three processors in a row.
        # When it is a byte stream, a processor presumably leaves it at the
        # end of the data (NOTE(review): confirm against pyRdfa/pyMicrodata),
        # so the starting position is remembered and restored between runs.
        start = self._stream_position(orig_source)

        RDFaParser()._process(graph, pgraph, baseURI, orig_source,
                              media_type='text/html',
                              rdfa_version=rdfa_version,
                              vocab_expansion=vocab_expansion,
                              vocab_cache=vocab_cache)
        self._rewind(orig_source, start)
        MicrodataParser()._process(graph, baseURI, orig_source,
                                   vocab_expansion=vocab_expansion,
                                   vocab_cache=vocab_cache)
        self._rewind(orig_source, start)
        from .hturtle import HTurtleParser
        HTurtleParser()._process(graph, baseURI, orig_source,
                                 media_type='text/html')

    @staticmethod
    def _stream_position(orig_source):
        """
        Return the current position of a stream-like source, or None when
        the source has no usable C{tell()} (e.g. it is a URL or file name).
        """
        tell = getattr(orig_source, "tell", None)
        if tell is None:
            return None
        try:
            return tell()
        except (IOError, OSError, ValueError):
            return None

    @staticmethod
    def _rewind(orig_source, position):
        """
        Best-effort: move a stream-like source back to C{position}. Does
        nothing when no position was recorded or the stream cannot seek.
        """
        if position is None:
            return
        try:
            orig_source.seek(position)
        except (AttributeError, IOError, OSError, ValueError):
            # Non-seekable or already closed stream: keep the previous
            # behavior and let the next processor read whatever is left.
            pass