comparison planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/pyRdfa/state.py @ 1:56ad4e20f292 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:32:28 -0400
parents
children
comparison
equal deleted inserted replaced
0:d30785e31577 1:56ad4e20f292
1 # -*- coding: utf-8 -*-
2 """
3 Parser's execution context (a.k.a. state) object and handling. The state includes:
4
5 - language, retrieved from C{@xml:lang} or C{@lang}
6 - URI base, determined by C{<base>} or set explicitly. This is a little bit superfluous, because the current RDFa syntax does not make use of C{@xml:base}; i.e., this could be a global value. But the structure is prepared to add C{@xml:base} easily, if needed.
7 - options, in the form of an L{options<pyRdfa.options>} instance
8 - a separate vocabulary/CURIE handling resource, in the form of an L{termorcurie<pyRdfa.TermOrCurie>} instance
9
10 The execution context object is also used to handle URI-s, CURIE-s, terms, etc.
11
12 @summary: RDFa parser execution context
13 @organization: U{World Wide Web Consortium<http://www.w3.org>}
14 @author: U{Ivan Herman<http://www.w3.org/People/Ivan/>}
15 @license: This software is available for use under the
16 U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}
17 """
18
19 """
20 $Id: state.py,v 1.23 2013-10-16 11:48:54 ivan Exp $
21 $Date: 2013-10-16 11:48:54 $
22 """
23 import sys
24 (py_v_major, py_v_minor, py_v_micro, py_v_final, py_v_serial) = sys.version_info
25
26 import rdflib
27 from rdflib import URIRef
28 from rdflib import Literal
29 from rdflib import BNode
30 from rdflib import Namespace
31 if rdflib.__version__ >= "3.0.0" :
32 from rdflib import RDF as ns_rdf
33 from rdflib import RDFS as ns_rdfs
34 else :
35 from rdflib.RDFS import RDFSNS as ns_rdfs
36 from rdflib.RDF import RDFNS as ns_rdf
37
38 from .options import Options
39 from .utils import quote_URI
40 from .host import HostLanguage, accept_xml_base, accept_xml_lang, beautifying_prefixes
41
42 from .termorcurie import TermOrCurie
43 from . import UnresolvablePrefix, UnresolvableTerm
44
45 from . import err_lang
46 from . import err_URI_scheme
47 from . import err_illegal_safe_CURIE
48 from . import err_no_CURIE_in_safe_CURIE
49 from . import err_undefined_terms
50 from . import err_non_legal_CURIE_ref
51 from . import err_undefined_CURIE
52
53 if py_v_major >= 3 :
54 from urllib.parse import urlparse, urlunparse, urlsplit, urljoin
55 else :
56 from urllib.parse import urlparse, urlunparse, urlsplit, urljoin
57
class ListStructure:
    """Container for the C{@inlist} bookkeeping of RDFa 1.1.

    @ivar origin: the "origin" of the list, i.e., where the list will be attached to; C{None} until set
    @ivar mapping: the mappings as defined in the spec; starts out as an empty dictionary
    """

    def __init__(self):
        # Every instance gets its own, initially empty, state
        self.origin = None
        self.mapping = dict()
65
66 #### Core Class definition
class ExecutionContext:
    """State at a specific node, including the current set of namespaces in the RDFLib sense, current language,
    the base, vocabularies, etc. The class is also used to interpret URI-s and CURIE-s to produce
    URI references for RDFLib.

    @ivar options: reference to the overall options
    @type options: L{Options}
    @ivar base: the 'base' URI
    @ivar parsedBase: the parsed version of base, as produced by urlsplit
    @ivar defaultNS: default namespace (if defined via @xmlns) to be used for XML Literals
    @ivar lang: language tag (possibly None)
    @ivar term_or_curie: vocabulary management class instance
    @type term_or_curie: L{termorcurie.TermOrCurie}
    @ivar list_mapping: list structure (origin plus a dictionary of arrays key-ed via properties) shared with the
    inherited state, unless this is the top level state or the mapping has been reset
    @ivar new_list: True if this state created its own list structure (top level or after a reset)
    @ivar node: the node to which this state belongs
    @type node: DOM node instance
    @ivar rdfa_version: RDFa version of the content
    @type rdfa_version: String
    @ivar supress_lang: in some cases, the effect of the lang attribute should be supressed for the given node,
    although it should be inherited down below (example: @value attribute of the data element in HTML5).
    Initialized to False here.
    @type supress_lang: Boolean
    @cvar _list: list of attributes that allow for lists of values and should be treated as such
    @cvar _resource_type: dictionary; mapping table from attribute name to the exact method to retrieve the URI(s).
    Is initialized at first instantiation.
    """

    # list of attributes that allow for lists of values and should be treated as such
    _list = ["rel", "rev", "property", "typeof", "role"]
    # mapping table from attribute name to the exact method to retrieve the URI(s).
    _resource_type = {}

    def __init__(self, node, graph, inherited_state=None, base="", options=None, rdfa_version=None):
        """
        @param node: the current DOM Node
        @param graph: the RDFLib Graph
        @keyword inherited_state: the state as inherited
        from upper layers. This inherited_state is mixed with the state information
        retrieved from the current node.
        @type inherited_state: L{state.ExecutionContext}
        @keyword base: string denoting the base URI for the specific node; it is used at the top level
        only, and only if the document itself does not set a base (via C{<base>} or C{@xml:base})
        @keyword options: invocation options, and references to warning graphs; used at the top level only
        @type options: L{Options<pyRdfa.options>}
        @keyword rdfa_version: RDFa version to use at the top level; may be overridden by a C{@version} attribute
        """
        def remove_frag_id(uri):
            """
            The fragment ID for self.base must be removed
            """
            try:
                t = urlparse(uri)
                return urlunparse((t[0], t[1], t[2], t[3], t[4], ""))
            except Exception:
                # Deliberately lenient: a value that cannot be parsed is used as is
                return uri

        # This is, conceptually, an additional class initialization, but it must be done run time,
        # otherwise import errors show up
        if not ExecutionContext._resource_type:
            ExecutionContext._resource_type = {
                "href": ExecutionContext._URI,
                "src": ExecutionContext._URI,
                "vocab": ExecutionContext._URI,

                "about": ExecutionContext._CURIEorURI,
                "resource": ExecutionContext._CURIEorURI,

                "rel": ExecutionContext._TERMorCURIEorAbsURI,
                "rev": ExecutionContext._TERMorCURIEorAbsURI,
                "datatype": ExecutionContext._TERMorCURIEorAbsURI,
                "typeof": ExecutionContext._TERMorCURIEorAbsURI,
                "property": ExecutionContext._TERMorCURIEorAbsURI,
                "role": ExecutionContext._TERMorCURIEorAbsURI,
            }

        # -----------------------------------------------------------------
        self.node = node

        # -----------------------------------------------------------------
        # Settling the base. In a generic XML, xml:base should be accepted at all levels (though this is not the
        # case in, say, XHTML...)
        if inherited_state:
            self.rdfa_version = inherited_state.rdfa_version
            self.base = inherited_state.base
            self.options = inherited_state.options

            # The list structure is shared with the parent until reset_list_mapping is called
            self.list_mapping = inherited_state.list_mapping
            self.new_list = False

            # for generic XML versions the xml:base attribute should be handled
            if self.options.host_language in accept_xml_base and node.hasAttribute("xml:base"):
                self.base = remove_frag_id(node.getAttribute("xml:base"))
        else:
            # this is the branch called from the very top
            self.list_mapping = ListStructure()
            self.new_list = True

            if rdfa_version is not None:
                self.rdfa_version = rdfa_version
            else:
                from . import rdfa_current_version
                self.rdfa_version = rdfa_current_version

            # This value can be overwritten by a @version attribute
            if node.hasAttribute("version"):
                top_version = node.getAttribute("version")
                if "RDFa 1.0" in top_version or "RDFa1.0" in top_version:
                    self.rdfa_version = "1.0"
                elif "RDFa 1.1" in top_version or "RDFa1.1" in top_version:
                    self.rdfa_version = "1.1"

            # this is just to play safe. I believe this should actually not happen...
            if options is None:
                from . import Options
                self.options = Options()
            else:
                self.options = options

            self.base = ""
            # handle the base element case for HTML
            if self.options.host_language in (HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5):
                for base_element in node.getElementsByTagName("base"):
                    if base_element.hasAttribute("href"):
                        self.base = remove_frag_id(base_element.getAttribute("href"))
                        # HTML uses the first <base> element carrying an @href; later ones are ignored
                        break
            elif self.options.host_language in accept_xml_base and node.hasAttribute("xml:base"):
                self.base = remove_frag_id(node.getAttribute("xml:base"))

            # If no local setting for base occurs, the input argument has it
            if self.base == "":
                self.base = base

            # Perform an extra beautification in RDFLib
            if self.options.host_language in beautifying_prefixes:
                prefixes = beautifying_prefixes[self.options.host_language]
                for key in prefixes:
                    graph.bind(key, prefixes[key])

            input_info = "Input Host Language:%s, RDFa version:%s, base:%s" % (self.options.host_language, self.rdfa_version, self.base)
            self.options.add_info(input_info)

        # -----------------------------------------------------------------
        # this will be used repeatedly, better store it once and for all...
        self.parsedBase = urlsplit(self.base)

        # -----------------------------------------------------------------
        # generate and store the local CURIE handling class instance
        self.term_or_curie = TermOrCurie(self, graph, inherited_state)

        # -----------------------------------------------------------------
        # Settling the language tags.
        # The three fundamental modes (xhtml, html, or xml) are all slightly different.
        # First get the inherited state's language, if any
        self.lang = inherited_state.lang if inherited_state else None
        self.supress_lang = False

        if self.options.host_language in (HostLanguage.xhtml, HostLanguage.xhtml5, HostLanguage.html5):
            # we may have lang and xml:lang
            lang = node.getAttribute("lang").lower() if node.hasAttribute("lang") else None
            xmllang = node.getAttribute("xml:lang").lower() if node.hasAttribute("xml:lang") else None
            # If both are present, @xml:lang is the one that is used. An attribute that is present
            # but empty resets the language to None.
            if xmllang is not None:
                self.lang = xmllang if len(xmllang) != 0 else None
            elif lang is not None:
                self.lang = lang if len(lang) != 0 else None
            # Ideally, a warning should be generated if lang and xmllang are both present with different
            # values. But the HTML5 Parser does its magic by overriding a lang value if xmllang is present,
            # so the potential error situations are simply swallowed...
        elif self.options.host_language in accept_xml_lang and node.hasAttribute("xml:lang"):
            self.lang = node.getAttribute("xml:lang").lower()
            if len(self.lang) == 0:
                self.lang = None

        # -----------------------------------------------------------------
        # Set the default namespace. Used when generating XML Literals
        if node.hasAttribute("xmlns"):
            self.defaultNS = node.getAttribute("xmlns")
        elif inherited_state and inherited_state.defaultNS is not None:
            self.defaultNS = inherited_state.defaultNS
        else:
            self.defaultNS = None
    # end __init__

    def _URI(self, val):
        """Returns a URI for a 'pure' URI (ie, not a CURIE). The method resolves possible relative URI-s. It also
        checks whether the URI uses an unusual URI scheme (and issues a warning); this may be the result of an
        uninterpreted CURIE...
        @param val: attribute value to be interpreted
        @type val: string
        @return: an RDFLib URIRef instance
        """
        def create_URIRef(uri, check=True):
            """
            Mini helping function: it checks whether a uri is using a usual scheme before a URIRef is created.
            In case there is something unusual, a warning is generated (though the URIRef is created nevertheless)
            @param uri: (absolute) URI string
            @param check: whether the URI should be checked against the list of 'existing' URI schemes
            @return: an RDFLib URIRef instance
            """
            from . import uri_schemes
            stripped = uri.strip()
            if check and urlsplit(stripped)[0] not in uri_schemes:
                self.options.add_warning(err_URI_scheme % stripped, node=self.node.nodeName)
            return URIRef(stripped)

        def join(base, v, check=True):
            """
            Mini helping function: it makes a urljoin for the paths. Based on the python library, but
            that one, in some cases, swallows the '#' or '?' character at the end. This is clearly a
            problem with Semantic Web URI-s, so this is checked, too
            @param base: base URI string
            @param v: local part
            @param check: whether the URI should be checked against the list of 'existing' URI schemes
            @return: an RDFLib URIRef instance
            """
            joined = urljoin(base, v)
            # Put back a trailing '#' or '?' if urljoin has dropped it. The emptiness guards
            # replace the former catch-all exception handler around the indexing.
            if v and joined and v[-1] in ("#", "?") and v[-1] != joined[-1]:
                joined = joined + v[-1]
            return create_URIRef(joined, check)

        if val == "":
            # The base is stored without its fragment ID
            return URIRef(self.base)

        # fall back on good old traditional URI-s.
        # To be on the safe side, let us use the Python libraries
        if self.parsedBase[0] == "":
            # base is, in fact, a local file name
            # The following call is just to be sure that some pathological cases when
            # the ':' _does_ appear in the URI but not in a scheme position is taken
            # care of properly...
            if urlsplit(val)[0] == "":
                # relative URI, to be combined with local file name:
                return join(self.base, val, check=False)
            return create_URIRef(val)

        return join(self.base, val)
    # end _URI

    def _CURIEorURI(self, val):
        """Returns a URI for a (safe or not safe) CURIE. In case it is a safe CURIE but the CURIE itself
        is not defined, an error message is issued. Otherwise, if it is not a CURIE, it is taken to be a URI
        @param val: attribute value to be interpreted
        @type val: string
        @return: an RDFLib URIRef instance or None
        """
        if val == "":
            return URIRef(self.base)

        safe_curie = False
        if val.startswith('['):
            # If a safe CURIE is asked for, a pure URI is not acceptable.
            # Is checked below, and that is why the safe_curie flag is necessary
            if not val.endswith(']'):
                # that is certainly forbidden: an incomplete safe CURIE
                self.options.add_warning(err_illegal_safe_CURIE % val, UnresolvablePrefix, node=self.node.nodeName)
                return None
            val = val[1:-1]
            safe_curie = True

        # There is a branch here depending on whether we are in 1.1 or 1.0 mode
        if self.rdfa_version >= "1.1":
            retval = self.term_or_curie.CURIE_to_URI(val)
            if retval is None:
                # the value could not be interpreted as a CURIE, ie, it did not produce any valid URI.
                # The rule says that then the whole value should be considered as a URI
                # except if it was part of a safe CURIE. In that case it should be ignored...
                if safe_curie:
                    self.options.add_warning(err_no_CURIE_in_safe_CURIE % val, UnresolvablePrefix, node=self.node.nodeName)
                    return None
                return self._URI(val)
            # there is an unlikely case where the retval is actually a URIRef with a relative URI.
            # Better filter that one out
            if not isinstance(retval, BNode) and urlsplit(str(retval))[0] == "":
                # yep, there is something wrong, a new URIRef has to be created:
                return URIRef(self.base + str(retval))
            return retval

        # in 1.0 mode a CURIE can be considered only in case of a safe CURIE
        if safe_curie:
            return self.term_or_curie.CURIE_to_URI(val)
        return self._URI(val)
    # end _CURIEorURI

    def _TERMorCURIEorAbsURI(self, val):
        """Returns a URI either for a term or for a CURIE. The value must match the term pattern to be
        handled as a term; otherwise the method falls back on a CURIE or (in RDFa 1.1) an absolute URI.
        @param val: attribute value to be interpreted
        @type val: string
        @return: an RDFLib URIRef instance or None
        """
        # This case excludes the pure base, ie, the empty value
        if val == "":
            return None

        from . import uri_schemes
        from .termorcurie import termname

        if termname.match(val):
            # This is a term, must be handled as such...
            retval = self.term_or_curie.term_to_URI(val)
            if not retval:
                self.options.add_warning(err_undefined_terms % val, UnresolvableTerm, node=self.node.nodeName, buggy_value=val)
                return None
            return retval

        # try a CURIE
        retval = self.term_or_curie.CURIE_to_URI(val)
        if retval:
            return retval

        if self.rdfa_version >= "1.1":
            # See if it is an absolute URI
            scheme = urlsplit(val)[0]
            if scheme == "":
                # bug; there should be no relative URIs here
                self.options.add_warning(err_non_legal_CURIE_ref % val, UnresolvablePrefix, node=self.node.nodeName)
                return None
            if scheme not in uri_schemes:
                # unusual scheme: warn, but create the URIRef nevertheless
                self.options.add_warning(err_URI_scheme % val.strip(), node=self.node.nodeName)
            return URIRef(val)

        # rdfa 1.0 case
        self.options.add_warning(err_undefined_CURIE % val.strip(), UnresolvablePrefix, node=self.node.nodeName)
        return None
    # end _TERMorCURIEorAbsURI

    # -----------------------------------------------------------------------------------------------

    def getURI(self, attr):
        """Get the URI(s) for the attribute. The name of the attribute determines whether the value should be
        a pure URI, a CURIE, etc, and whether the return is a single element of a list of those. This is done
        using the L{ExecutionContext._resource_type} table.
        @param attr: attribute name
        @type attr: string
        @return: an RDFLib URIRef instance (or None) or a list of those. For an attribute that is not
        present on the node, the return is an empty list for list-type attributes and None otherwise.
        """
        is_list = attr in ExecutionContext._list
        if not self.node.hasAttribute(attr):
            return [] if is_list else None
        val = self.node.getAttribute(attr)

        # An attribute missing from the table should not happen if the code is correct;
        # it is then interpreted as a pure URI
        func = ExecutionContext._resource_type.get(attr, ExecutionContext._URI)

        if is_list:
            # Allows for a (whitespace separated) list; values that cannot be interpreted are dropped
            resources = [func(self, v) for v in val.split()]
            return [r for r in resources if r is not None]
        return func(self, val.strip())
    # end getURI

    def getResource(self, *args):
        """Get single resources from several different attributes. The first one that returns a valid URI wins.
        @param args: variable list of attribute names, or a single attribute being a list itself.
        @return: an RDFLib URIRef instance (or None)
        """
        if len(args) == 0:
            return None
        rargs = args[0] if isinstance(args[0], (tuple, list)) else args

        for resource in rargs:
            uri = self.getURI(resource)
            if uri is not None:
                return uri
        return None

    # -----------------------------------------------------------------------------------------------
    def reset_list_mapping(self, origin=None):
        """
        Reset, ie, create a new empty list structure for the list mapping.
        @param origin: if set (and not falsy), it is stored as the origin of the new list structure
        """
        self.list_mapping = ListStructure()
        if origin:
            self.set_list_origin(origin)
        self.new_list = True

    def list_empty(self):
        """
        Checks whether the list is empty.
        @return: Boolean
        """
        return len(self.list_mapping.mapping) == 0

    def get_list_props(self):
        """
        Return the list of property values in the list structure
        @return: list of the keys of the list mapping
        """
        return list(self.list_mapping.mapping.keys())

    def get_list_value(self, prop):
        """
        Return the list of values in the list structure for a specific property. Raises a KeyError if
        the property is not in the mapping.
        @return: list of RDF nodes, or None if only a placeholder has been registered for the property
        """
        return self.list_mapping.mapping[prop]

    def set_list_origin(self, origin):
        """
        Set the origin of the list, ie, the subject to attach the final list(s) to
        @param origin: URIRef
        """
        self.list_mapping.origin = origin

    def get_list_origin(self):
        """
        Return the origin of the list, ie, the subject to attach the final list(s) to
        @return: URIRef
        """
        return self.list_mapping.origin

    def add_to_list_mapping(self, property, resource):
        """Add a new property-resource on the list mapping structure. The latter is a dictionary of arrays;
        if the array does not exist yet, it will be created on the fly.

        @param property: the property URI, used as a key in the dictionary
        @param resource: the resource to be added to the relevant array in the dictionary. Can be None; this is a
        dummy placeholder for C{<span rel="property" inlist>...</span>} constructions that may be filled in by
        children or siblings; if not an empty list has to be generated.
        """
        mapping = self.list_mapping.mapping
        if resource is None:
            # A placeholder never overrides anything that is already registered
            mapping.setdefault(property, None)
        elif mapping.get(property) is None:
            # either a brand new property, or a dummy placeholder replaced with real content
            mapping[property] = [resource]
        else:
            mapping[property].append(resource)
550
551
552 ####################