comparison planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/pyMicrodata/microdata.py @ 1:56ad4e20f292 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:32:28 -0400
parents
children
comparison
equal deleted inserted replaced
0:d30785e31577 1:56ad4e20f292
1 # -*- coding: utf-8 -*-
2 """
3
4 The core of the Microdata->RDF conversion, a more or less verbatim implementation of the
5 U{W3C IG Note<http://www.w3.org/TR/microdata-rdf/>}. Because the implementation was also used to check
6 the note itself, it tries to be fairly close to the text.
7
8
9 @organization: U{World Wide Web Consortium<http://www.w3.org>}
10 @author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
11 @license: This software is available for use under the
12 U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
13 """
14
15 """
16 $Id: microdata.py,v 1.4 2012/09/05 16:40:43 ivan Exp $
17 $Date: 2012/09/05 16:40:43 $
18
19 Added a reaction on the RDFaStopParsing exception: if raised while setting up the local execution context, parsing
20 is stopped (on the whole subtree)
21 """
22
23 import sys
24 if sys.version_info[0] >= 3 :
25 from urllib.parse import urlsplit, urlunsplit
26 else :
27 from urllib.parse import urlsplit, urlunsplit
28
29 from types import *
30
31 import rdflib
32 from rdflib import URIRef
33 from rdflib import Literal
34 from rdflib import BNode
35 from rdflib import Namespace
36 if rdflib.__version__ >= "3.0.0" :
37 from rdflib import Graph
38 from rdflib import RDF as ns_rdf
39 from rdflib import RDFS as ns_rdfs
40 from rdflib import XSD as ns_xsd
41 else :
42 from rdflib.Graph import Graph
43 from rdflib.RDFS import RDFSNS as ns_rdfs
44 from rdflib.Literal import _XSD_NS as ns_xsd
45 from rdflib.RDF import RDFNS as ns_rdf
46
47 ns_owl = Namespace("http://www.w3.org/2002/07/owl#")
48
49 from .registry import registry, vocab_names
50 from .utils import generate_RDF_collection, get_Literal, get_time_type
51 from .utils import get_lang_from_hierarchy, is_absolute_URI, generate_URI, fragment_escape
52
53 MD_VOCAB = "http://www.w3.org/ns/md#"
54 RDFA_VOCAB = URIRef("http://www.w3.org/ns/rdfa#usesVocabulary")
55
56 from . import debug
57
58 # Existing predicate schemes
class PropertySchemes:
    """
    Names of the schemes that can be used to build a predicate URI from a property name.
    The values are compared against the "propertyURI" entries of the vocabulary registry.
    """
    # the property name is simply appended to the vocabulary URI
    vocabulary = "vocabulary"
    # the property URI is built from the current type/name held by the evaluation context
    contextual = "contextual"
62
class ValueMethod:
    """
    Names of the methods that can be used to emit several values of the same property.
    The values are compared against the "multipleValues" entries of the vocabulary registry.
    """
    # one triple per value
    unordered = "unordered"
    # all values gathered into a single RDF collection
    # (the attribute name is part of the public interface, hence the shadowed builtin)
    list = "list"
66
67 # ----------------------------------------------------------------------------
68
class Evaluation_Context:
    """
    Evaluation context structure. See Section 4.1 of the U{W3C IG Note<http://www.w3.org/TR/microdata-rdf/>} for the details.

    @ivar current_type: an absolute URL for the current type, used when an item does not contain an item type
    @ivar memory: mapping from items to RDF subjects
    @type memory: dictionary
    @ivar current_name: an absolute URL for the in-scope name, used for generating URIs for properties of items without an item type
    @ivar current_vocabulary: an absolute URL for the current vocabulary, from the registry
    """
    def __init__(self):
        self.current_type = None
        self.memory = {}
        self.current_name = None
        self.current_vocabulary = None

    def get_memory(self, item):
        """
        Look up the RDF subject stored for 'item'.
        @param item: an 'item', in microdata terminology
        @type item: DOM Element Node
        @return: the stored RDF Subject (URIRef or BNode), or None if nothing has been stored for the item
        """
        return self.memory.get(item)

    def set_memory(self, item, subject):
        """
        Store the RDF subject for 'item', replacing any previous entry.
        @param item: an 'item', in microdata terminology
        @type item: DOM Element Node
        @param subject: RDF Subject
        @type subject: URIRef or Blank Node
        """
        self.memory[item] = subject

    def new_copy(self, itype):
        """
        Create a new context that carries the same name, vocabulary and memory content, but a new current type.

        The memory is copied shallowly: the new context gets its own dictionary holding the same
        item/subject pairs, so later additions to either context are not seen by the other.
        @param itype: an absolute URL for the current type
        @return: a new evaluation context instance
        """
        clone = Evaluation_Context()
        clone.memory.update(self.memory)
        clone.current_vocabulary = self.current_vocabulary
        clone.current_name = self.current_name
        clone.current_type = itype
        return clone

    def __str__(self):
        """Multi-line, human readable dump of the context (used in debug output)."""
        pieces = [
            "Evaluation context:\n",
            " current type: %s\n" % self.current_type,
            " current name: %s\n" % self.current_name,
            " current vocabulary: %s\n" % self.current_vocabulary,
            " memory: %s\n" % self.memory,
            "----\n",
        ]
        return "".join(pieces)
133
class Microdata:
    """
    This class encapsulates methods that are defined by the U{microdata spec<http://dev.w3.org/html5/md/Overview.html>},
    as opposed to the RDF conversion note.

    @ivar document: top of the DOM tree, as returned by the HTML5 parser
    @ivar base: the base URI of the Dom tree, either set from the outside or via a @base element
    """
    def __init__(self, document, base=None):
        """
        @param document: top of the DOM tree, as returned by the HTML5 parser
        @param base: the base URI of the Dom tree; only used if the document has no <base> element with an @href
        """
        self.document = document

        # The caller supplied value is the fallback; the first <base href="..."> of the document,
        # if there is one, takes precedence and is used to generate top level URIs
        self.base = base
        for set_base in document.getElementsByTagName("base"):
            if set_base.hasAttribute("href"):
                self.base = set_base.getAttribute("href")
                break

    @staticmethod
    def _element_children(node):
        """Return the list of the direct children of C{node} that are element nodes, in document order."""
        return [child for child in node.childNodes if child.nodeType == node.ELEMENT_NODE]

    def get_top_level_items(self):
        """
        A top level item is an element that has the @itemscope set, but no @itemprop. They are
        collected in pre-order, depth-first fashion, ie, in document order: an item always precedes
        the top level items nested in it.

        @return: list of items (ie, DOM Nodes)
        """
        items = []
        # Explicit stack instead of recursion: children are pushed in reverse so that they are popped,
        # ie, visited, in document order
        stack = [self.document]
        while stack:
            node = stack.pop()
            if node.hasAttribute("itemscope") and not node.hasAttribute("itemprop"):
                items.append(node)
            stack.extend(reversed(self._element_children(node)))
        return items

    def get_item_properties(self, item):
        """
        Collect the item's properties, ie, all DOM descendent nodes with a non-empty @itemprop until the subtree
        hits another @itemscope. The elements referred to by @itemref are crawled, too, after the item's own children.

        @param item: current item
        @type item: DOM Node
        @return: array of items, ie, DOM Nodes
        """
        results = []
        # Nodes already handled. Meeting a node twice is, in general, an error (the same item cannot
        # be there twice); the second occurrence is simply ignored
        seen = set([item])

        to_visit = self._element_children(item)
        if item.hasAttribute("itemref"):
            for ref_id in item.getAttribute("itemref").strip().split():
                obj = self.getElementById(ref_id)
                if obj is not None:
                    to_visit.append(obj)

        # 'pending' is a stack whose top (the end of the list) is the next node to handle; the children of a
        # node are handled before the remaining pending nodes
        pending = to_visit[::-1]
        while pending:
            current = pending.pop()
            if current in seen:
                continue
            seen.add(current)

            # @itemscope is the barrier: the properties below it belong to another item
            if not current.hasAttribute("itemscope"):
                pending.extend(reversed(self._element_children(current)))

            if current.hasAttribute("itemprop") and current.getAttribute("itemprop").strip() != "":
                results.append(current)

        return results

    def getElementById(self, id):
        """This is a method defined for DOM 2 HTML, but the HTML5 parser does not seem to define it. Oh well...
        @param id: value of an @id attribute to look for
        @return: the first element, in document order, whose @id attribute matches C{id}, or None if there is none
        """
        # Pre-order walk that stops at the first hit
        stack = [self.document]
        while stack:
            node = stack.pop()
            if node.hasAttribute("id") and node.getAttribute("id") == id:
                return node
            stack.extend(reversed(self._element_children(node)))
        return None
245
class MicrodataConversion(Microdata):
    """
    Top level class encapsulating the conversion algorithms as described in the W3C note.

    @ivar graph: an RDF graph; an RDFLib Graph
    @type graph: RDFLib Graph
    @ivar document: top of the DOM tree, as returned by the HTML5 parser
    @ivar ns_md: the Namespace for the microdata vocabulary
    @ivar base: the base of the Dom tree, either set from the outside or via a @base element
    @ivar vocabularies_used: set to True as soon as an item type matches a vocabulary of the registry
    """
    # Elements whose value is the URI found in the named attribute (when that attribute is present)
    _URI_ATTRS = {
        "audio": "src",
        "embed": "src",
        "iframe": "src",
        "img": "src",
        "source": "src",
        "track": "src",
        "video": "src",
        "data": "src",
        "a": "href",
        "area": "href",
        "link": "href",
        "object": "data"
    }

    def __init__(self, document, graph, base=None, vocab_expansion=False, vocab_cache=True):
        """
        @param graph: an RDF graph; an RDFLib Graph
        @type graph: RDFLib Graph
        @param document: top of the DOM tree, as returned by the HTML5 parser
        @keyword base: the base of the Dom tree, either set from the outside or via a @base element
        @keyword vocab_expansion: whether vocab expansion should be performed or not
        @type vocab_expansion: Boolean
        @keyword vocab_cache: if vocabulary expansion is done, then perform caching of the vocabulary data
        @type vocab_cache: Boolean
        """
        Microdata.__init__(self, document, base)
        self.vocab_expansion = vocab_expansion
        self.vocab_cache = vocab_cache
        self.graph = graph
        self.ns_md = Namespace(MD_VOCAB)
        self.graph.bind("md", MD_VOCAB)
        self.vocabularies_used = False

        # Get the vocabularies defined in the registry bound to proper names, if any...
        for vocab in registry:
            if vocab in vocab_names:
                self.graph.bind(vocab_names[vocab], vocab)
            else:
                hvocab = vocab + '#'
                if hvocab in vocab_names:
                    self.graph.bind(vocab_names[hvocab], hvocab)

        # Add the prefixes defined in the RDFa initial context to improve the outlook of the output.
        # This is best effort only: the pyRdfa package may not be available, in which case the
        # extra prefixes are simply not bound.
        try:
            try:
                from ..pyRdfa.initialcontext import initial_context
            except Exception:
                # depending on the installation (and Python version) the relative import may fail
                # with something else than ImportError, hence the broad clause
                from pyRdfa.initialcontext import initial_context
            vocabs = initial_context["http://www.w3.org/2011/rdfa-context/rdfa-1.1"].ns
            for prefix, uri in list(vocabs.items()):
                if uri not in registry:
                    # if it is in the registry, then it may have needed some special microdata massage...
                    self.graph.bind(prefix, uri)
        except Exception:
            pass

    def convert(self):
        """
        Top level entry to convert and generate all the triples. It finds the top level items,
        and generates triples for each of them; additionally, it generates a top level entry point
        to the items from base in the form of an RDF list.
        """
        item_list = [self.generate_triples(top_level_item, Evaluation_Context())
                     for top_level_item in self.get_top_level_items()]
        collection = generate_RDF_collection(self.graph, item_list)
        self.graph.add((URIRef(self.base), self.ns_md["item"], collection))

        # If a vocabulary of the registry was used, this is the time for the vocabulary related processing:
        # the basic expansion is always there; the follow-your-nose inclusion of vocabulary is optional.
        if self.vocabularies_used:
            # Best effort: if the pyRdfa package cannot be imported (or the processing fails)
            # the graph is left as it is
            try:
                try:
                    from ..pyRdfa.rdfs.process import MiniOWL, process_rdfa_sem
                    from ..pyRdfa.options import Options
                except Exception:
                    from pyRdfa.rdfs.process import MiniOWL, process_rdfa_sem
                    from pyRdfa.options import Options
                if self.vocab_expansion:
                    # This is the full deal
                    options = Options(vocab_expansion=self.vocab_expansion, vocab_cache=self.vocab_cache)
                    process_rdfa_sem(self.graph, options)
                else:
                    MiniOWL(self.graph).closure()
            except Exception:
                pass

    def generate_triples(self, item, context):
        """
        Generate the triples for a specific item. See the W3C Note for the details.

        Note that the incoming context is updated (current name and current vocabulary) as a side effect.

        @param item: the DOM Node for the specific item
        @type item: DOM Node
        @param context: an instance of an evaluation context
        @type context: L{Evaluation_Context}
        @return: a URIRef or a BNode for the (RDF) subject
        """
        # Step 1,2: if the subject has to be set, store it in memory
        subject = context.get_memory(item)
        if subject is None:
            # nop, there is no subject set. If there is a valid @itemid, that carries it
            if item.hasAttribute("itemid") and is_absolute_URI(item.getAttribute("itemid")):
                subject = URIRef(item.getAttribute("itemid").strip())
            else:
                subject = BNode()
            context.set_memory(item, subject)

        # Step 3: set the type triples if any
        types = []
        if item.hasAttribute("itemtype"):
            types = item.getAttribute("itemtype").strip().split()
            for t in types:
                if is_absolute_URI(t):
                    self.graph.add((subject, ns_rdf["type"], URIRef(t)))

        # Step 4, 5 and 6 to set the typing variable
        itype = None
        if types:
            if is_absolute_URI(types[0]):
                itype = types[0]
                context.current_name = None
            elif context.current_type is not None:
                itype = context.current_type

        # Step 7, 8, 9: Check the registry for possible keys and set the vocab
        vocab = None
        if itype is not None:
            for key in registry:
                if itype.startswith(key):
                    # There is a predefined vocabulary for this type...
                    vocab = key
                    # Step 7: Issue an rdfa usesVocabulary triple
                    self.graph.add((URIRef(self.base), RDFA_VOCAB, URIRef(vocab)))
                    self.vocabularies_used = True
                    break
            # The registry has not set the vocabulary; has to be extracted from the type
            if vocab is None:
                parsed = urlsplit(itype)
                if parsed.fragment != "":
                    vocab = urlunsplit((parsed.scheme, parsed.netloc, parsed.path, parsed.query, "")) + '#'
                elif parsed.path == "" and parsed.query == "":
                    vocab = itype
                    if not vocab.endswith('/'):
                        vocab += '/'
                else:
                    vocab = itype.rsplit('/', 1)[0] + '/'

        # Step 9: update vocab in the context
        if vocab is not None:
            context.current_vocabulary = vocab
        elif item.hasAttribute("itemtype"):
            context.current_vocabulary = None

        # Step 10: set up a property list; this will be used to generate triples later.
        # each entry in the dictionary is an array of RDF objects
        property_list = {}

        # Step 11: Get the item properties and run a cycle on those
        for prop in self.get_item_properties(item):
            for name in prop.getAttribute("itemprop").strip().split():
                # 11.1.1. set a new context
                new_context = context.new_copy(itype)
                # 11.1.2, generate the URI for the property name, that will be the predicate
                # Also update the context
                predicate = self.generate_predicate_URI(name, new_context)
                new_context.current_name = predicate
                # 11.1.3, generate the property value.
                # Note that the 11.1.4 step is done in the method itself, ie, a recursion may occur there
                # if a new item is hit (in which case the return value is a RDF resource chaining to a subject)
                value = self.get_property_value(prop, new_context)
                # 11.1.5, store all the values
                property_list.setdefault(predicate, []).append(value)

        # step 12: generate the triples
        for predicate, values in property_list.items():
            self.generate_property_values(subject, URIRef(predicate), values, context)

        # Step 13: return the subject to the caller
        return subject

    def generate_predicate_URI(self, name, context):
        """
        Generate a full URI for a predicate, using the type, the vocabulary, etc.

        If the registry holds subPropertyOf or equivalentProperty mappings for the property, the
        corresponding triples are added to the graph as a side effect.

        For details of this entry, see Section 4.4
        @param name: name of the property, ie, what appears in @itemprop
        @param context: an instance of an evaluation context
        @type context: L{Evaluation_Context}
        @return: the URI of the predicate, as a string
        """
        if debug:
            print("name: %s, %s" % (name, context))

        # Step 1: absolute URI-s are fine, take them as they are
        if is_absolute_URI(name):
            return name

        # Step 2: if neither the type nor the vocabulary is known, the name is just used as a fragment on base
        if context.current_type is None and context.current_vocabulary is None:
            if self.base.endswith('#'):
                b = self.base[:-1]
            else:
                b = self.base
            return b + '#' + fragment_escape(name)

        # Step 3: set the scheme
        try:
            if context.current_vocabulary in registry and "propertyURI" in registry[context.current_vocabulary]:
                scheme = registry[context.current_vocabulary]["propertyURI"]
            else:
                scheme = PropertySchemes.vocabulary
        except Exception:
            # This is when the structure of the registry is broken
            scheme = PropertySchemes.vocabulary

        name = fragment_escape(name)
        if scheme == PropertySchemes.contextual:
            # Step 5.1
            s = context.current_name
            if s is not None and s.startswith("http://www.w3.org/ns/md?type="):
                # Step 5.2
                expandedURI = s + '.' + name
            else:
                # Step 5.3
                expandedURI = "http://www.w3.org/ns/md?type=" + fragment_escape(context.current_type) + "&prop=" + name
        else:
            # Step 4
            if context.current_vocabulary.endswith(('#', '/')):
                expandedURI = context.current_vocabulary + name
            else:
                expandedURI = context.current_vocabulary + '#' + name

        # see if there are subproperty/equivalentproperty relations
        try:
            vocab_mapping = registry[context.current_vocabulary]["properties"][name]
            # if we got that far, we may have some mappings
            expandedURIRef = URIRef(expandedURI)
            for relation, namespace in (("subPropertyOf", ns_rdfs), ("equivalentProperty", ns_owl)):
                try:
                    targets = vocab_mapping[relation]
                    if targets is not None:
                        # the registry may hold one single value or a list of values
                        if not isinstance(targets, list):
                            targets = [targets]
                        for p in targets:
                            self.graph.add((expandedURIRef, namespace[relation], URIRef(p)))
                except Exception:
                    # Ok, no such relation for this property
                    pass
        except Exception:
            # no harm done, no extra vocabulary term
            pass

        return expandedURI

    def get_property_value(self, node, context):
        """
        Generate an RDF object, ie, the value of a property. Note that if this element contains
        an @itemscope, then a recursive call to L{MicrodataConversion.generate_triples} is done and the
        return value of that method (ie, the subject for the corresponding item) is returned as an
        object.

        Otherwise, either URIRefs are created for <a>, <img>, etc, elements, or a Literal; the latter
        gets a time-related type for the <time> element, and a numeric type for <meter> and <data>
        when the @value can be interpreted as a number.

        @param node: the DOM Node for which the property values should be generated
        @type node: DOM Node
        @param context: an instance of an evaluation context
        @type context: L{Evaluation_Context}
        @return: an RDF resource (URIRef, BNode, or Literal)
        """
        lang = get_lang_from_hierarchy(self.document, node)

        if node.hasAttribute("itemscope"):
            # THIS IS A RECURSION ENTRY POINT!
            return self.generate_triples(node, context)

        uri_attr = self._URI_ATTRS.get(node.tagName)
        if uri_attr is not None and node.hasAttribute(uri_attr):
            return URIRef(generate_URI(self.base, node.getAttribute(uri_attr).strip()))

        if node.tagName == "meta" and node.hasAttribute("content"):
            content = node.getAttribute("content")
            return Literal(content, lang=lang) if lang else Literal(content)

        if node.tagName == "meter" or node.tagName == "data":
            if not node.hasAttribute("value"):
                return Literal("")
            val = node.getAttribute("value")
            # check whether the attribute value can be defined as an integer or, failing that, as a float;
            # the lexical form of the literal is the original attribute value in all cases
            dt = None
            for caster, xsd_name in ((int, "integer"), (float, "float")):
                try:
                    caster(val)
                except (ValueError, TypeError):
                    continue
                dt = ns_xsd[xsd_name]
                break
            # Not a valid number: let it go through as a plain literal nevertheless
            return Literal(val, datatype=dt) if dt else Literal(val)

        if node.tagName == "time" and node.hasAttribute("datetime"):
            litval = node.getAttribute("datetime")
            dtype = get_time_type(litval)
            return Literal(litval, datatype=dtype) if dtype else Literal(litval)

        text = get_Literal(node)
        return Literal(text, lang=lang) if lang else Literal(text)

    def generate_property_values(self, subject, predicate, objects, context):
        """
        Generate the property values for a specific subject and predicate. The registry entry of the
        predicate's vocabulary specifies whether the objects should be added in an RDF list or
        each as an individual triple; the default is individual triples.

        @param subject: RDF subject
        @type subject: RDFLib Node (URIRef or blank node)
        @param predicate: RDF predicate
        @type predicate: RDFLib URIRef
        @param objects: RDF objects
        @type objects: list of RDFLib nodes (URIRefs, Blank Nodes, or literals)
        @param context: evaluation context (not used by the current implementation)
        @type context: L{Evaluation_Context}
        """
        # generate triples with a list, or a bunch of triples, depending on the registry
        # The biggest complication is to find the method...
        method = ValueMethod.unordered

        # Plain string version of the predicate, used for the prefix comparison and slicing below
        pred_key = "%s" % predicate
        for key in registry:
            if pred_key.startswith(key):
                # This the part of the registry corresponding to the predicate's vocabulary
                registry_object = registry[key]
                try:
                    if "multipleValues" in registry_object:
                        method = registry_object["multipleValues"]
                    # The generic definition can be overwritten for a specific property. The simplest is to rely on a 'try'
                    # with the right structure...
                    try:
                        method = registry_object["properties"][pred_key[len(key):]]["multipleValues"]
                    except Exception:
                        pass
                except Exception:
                    pass

        if method == ValueMethod.unordered:
            for value in objects:
                self.graph.add((subject, predicate, value))
        else:
            self.graph.add((subject, predicate, generate_RDF_collection(self.graph, objects)))
658
659
660
661
662
663