comparison planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/rdfxml.py @ 1:56ad4e20f292 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:32:28 -0400 (2020-07-31)
parents
children
comparison
equal deleted inserted replaced
0:d30785e31577 1:56ad4e20f292
1 """
2 An RDF/XML parser for RDFLib
3 """
4
5 from xml.sax import make_parser
6 from xml.sax.handler import ErrorHandler
7 from xml.sax.saxutils import handler, quoteattr, escape
8 from urllib.parse import urljoin, urldefrag
9
10 from rdflib.namespace import RDF, is_ncname
11 from rdflib.term import URIRef
12 from rdflib.term import BNode
13 from rdflib.term import Literal
14 from rdflib.exceptions import ParserError, Error
15 from rdflib.parser import Parser
16
__all__ = [
    "create_parser",
    "BagID",
    "ElementHandler",
    "RDFXMLHandler",
    "RDFXMLParser",
]

# Alias for the RDF namespace; this module indexes it both by attribute
# name (``RDFNS["about"]``) and by ordinal (``RDFNS[1]``).
RDFNS = RDF

# http://www.w3.org/TR/rdf-syntax-grammar/#eventterm-attribute-URI
# Unqualified attribute names and the qualified RDF term each stands for.
UNQUALIFIED = {
    "about": RDF.about,
    "ID": RDF.ID,
    "type": RDF.type,
    "resource": RDF.resource,
    "parseType": RDF.parseType,
}

# http://www.w3.org/TR/rdf-syntax-grammar/#coreSyntaxTerms
CORE_SYNTAX_TERMS = [
    RDF.RDF,
    RDF.ID,
    RDF.about,
    RDF.parseType,
    RDF.resource,
    RDF.nodeID,
    RDF.datatype,
]

# http://www.w3.org/TR/rdf-syntax-grammar/#syntaxTerms
SYNTAX_TERMS = [*CORE_SYNTAX_TERMS, RDF.Description, RDF.li]

# http://www.w3.org/TR/rdf-syntax-grammar/#oldTerms
_RDF_SYNTAX_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
OLD_TERMS = [
    URIRef(_RDF_SYNTAX_NS + local_name)
    for local_name in ("aboutEach", "aboutEachPrefix", "bagID")
]

# Terms that may not be used as a node element name, and the attributes
# that identify a node element's subject.
NODE_ELEMENT_EXCEPTIONS = [*CORE_SYNTAX_TERMS, RDF.li, *OLD_TERMS]
NODE_ELEMENT_ATTRIBUTES = [RDF.ID, RDF.nodeID, RDF.about]

# Terms that may not be used as a property element name / property
# attribute name, and the RDF attributes a property element may carry.
PROPERTY_ELEMENT_EXCEPTIONS = [*CORE_SYNTAX_TERMS, RDF.Description, *OLD_TERMS]
PROPERTY_ATTRIBUTE_EXCEPTIONS = [
    *CORE_SYNTAX_TERMS,
    RDF.Description,
    RDF.li,
    *OLD_TERMS,
]
PROPERTY_ELEMENT_ATTRIBUTES = [RDF.ID, RDF.resource, RDF.nodeID]

# SAX (namespace-URI, local-name) keys for xml:base and xml:lang.
XMLNS = "http://www.w3.org/XML/1998/namespace"
BASE = (XMLNS, "base")
LANG = (XMLNS, "lang")
55
56
class BagID(URIRef):
    """A URI reference that also hands out successive ``rdf:_n`` terms.

    ``li`` counts how many container-membership properties have been
    issued so far through :meth:`next_li`.
    """
    __slots__ = ['li']

    def __init__(self, val):
        """Initialise the membership counter.

        :param val: the URI text.  It is not used here: for a str-derived
            class the value is already fixed by ``__new__`` when
            ``__init__`` runs.
        """
        # The previous ``super(URIRef, self).__init__(val)`` skipped URIRef
        # in the MRO and forwarded ``val`` to the next initialiser, which
        # (assuming none of URIRef's bases define ``__init__``) is
        # ``object.__init__`` and rejects extra arguments with TypeError.
        super().__init__()
        self.li = 0

    def next_li(self):
        """Advance the counter and return ``RDFNS[n]`` for the new value."""
        self.li += 1
        return RDFNS[self.li]
67
68
class ElementHandler(object):
    """Per-element parse state kept on the handler stack.

    ``start``, ``char`` and ``end`` hold the callbacks to run for the
    element's start tag, character data and end tag; the remaining slots
    carry the values accumulated while the element is open.
    """
    __slots__ = ['start', 'char', 'end', 'li', 'id',
                 'base', 'subject', 'predicate', 'object',
                 'list', 'language', 'datatype', 'declared', 'data']

    def __init__(self):
        """Create an empty frame: every slot is ``None`` except ``li`` (0)."""
        self.start = None
        self.char = None
        self.end = None
        self.li = 0
        self.id = None
        self.base = None
        self.subject = None
        # ``predicate`` is declared in __slots__ but used to be left unset,
        # so reading it before assignment raised AttributeError.
        self.predicate = None
        self.object = None
        self.list = None
        self.language = None
        self.datatype = None
        self.declared = None
        self.data = None

    def next_li(self):
        """Advance the ``rdf:li`` counter and return ``RDFNS[n]`` for it."""
        self.li += 1
        return RDFNS[self.li]
92
93
class RDFXMLHandler(handler.ContentHandler):
    """SAX content handler that adds the triples of an RDF/XML document
    to ``store``.

    Parsing state lives in ``self.stack``, a list of ``ElementHandler``
    frames.  ``startElementNS`` pushes one frame per start tag and
    ``endElementNS`` pops one per end tag.  Relative to the element being
    processed, ``stack[-2]`` is its own frame (``current``), ``stack[-3]``
    its parent's (``parent``) and ``stack[-1]`` the frame that will serve
    its next child (``next``).  Each frame's ``start``/``char``/``end``
    callbacks are installed by the enclosing element, which is how the
    handler alternates between node elements and property elements.
    """

    def __init__(self, store):
        """
        :param store: object receiving ``add((s, p, o))`` and
            ``bind(prefix, namespace, override=False)`` calls.
        """
        super().__init__()
        self.store = store
        self.preserve_bnode_ids = False
        # Replaced through setDocumentLocator(); None until then.
        self.locator = None
        self.reset()

    def reset(self):
        """Discard all parse state and install the document-element frame."""
        document_element = ElementHandler()
        document_element.start = self.document_element_start
        document_element.end = lambda name, qname: None
        self.stack = [None, document_element]
        self.ids = {}  # remember IDs we have already seen
        self.bnode = {}  # rdf:nodeID label -> BNode issued for it
        self._ns_contexts = [{}]  # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]

    # ContentHandler methods

    def setDocumentLocator(self, locator):
        """Remember the locator used for base URIs and error positions."""
        self.locator = locator

    def startDocument(self):
        """No-op."""
        pass

    def startPrefixMapping(self, prefix, namespace):
        """Record ``namespace -> prefix`` and bind the prefix on the store."""
        # Save a snapshot of the mapping as it was before this declaration
        # so endPrefixMapping() can restore it.
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[namespace] = prefix
        self.store.bind(prefix, namespace or "", override=False)

    def endPrefixMapping(self, prefix):
        """Restore the namespace mapping saved by startPrefixMapping()."""
        self._current_context = self._ns_contexts[-1]
        del self._ns_contexts[-1]

    def startElementNS(self, name, qname, attrs):
        """Push a frame, resolve its xml:base / xml:lang and dispatch to
        the frame's ``start`` callback."""
        self.stack.append(ElementHandler())
        current = self.current
        parent = self.parent

        base = attrs.get(BASE, None)
        if base is not None:
            base, _frag = urldefrag(base)
            if parent and parent.base:
                base = urljoin(parent.base, base)
            else:
                document_uri = self._document_uri()
                if document_uri:
                    base = urljoin(document_uri, base)
        else:
            if parent:
                base = parent.base
            if base is None:
                document_uri = self._document_uri()
                if document_uri:
                    base, _frag = urldefrag(document_uri)
        current.base = base

        language = attrs.get(LANG, None)
        if language is None and parent:
            language = parent.language
        current.language = language

        current.start(name, qname, attrs)

    def endElementNS(self, name, qname):
        """Run the current frame's ``end`` callback, then pop one frame."""
        self.current.end(name, qname)
        self.stack.pop()

    def characters(self, content):
        """Forward character data to the current frame's ``char`` callback,
        if it has one."""
        char = self.current.char
        if char:
            char(content)

    def ignorableWhitespace(self, content):
        """No-op."""
        pass

    def processingInstruction(self, target, data):
        """No-op."""
        pass

    def add_reified(self, sid, xxx_todo_changeme):
        """Add the four reification triples describing a statement.

        :param sid: node that identifies the statement.
        :param xxx_todo_changeme: the ``(subject, predicate, object)``
            triple being reified.
        """
        (s, p, o) = xxx_todo_changeme
        self.store.add((sid, RDF.type, RDF.Statement))
        self.store.add((sid, RDF.subject, s))
        self.store.add((sid, RDF.predicate, p))
        self.store.add((sid, RDF.object, o))

    def error(self, message):
        """Raise ``ParserError`` for ``message``.

        The message is prefixed with ``systemId:line:column: `` taken from
        the locator; when no locator has been set the bare message is
        raised instead of failing on the missing locator.
        """
        locator = self.locator
        if locator is None:
            raise ParserError(message)
        info = "%s:%s:%s: " % (locator.getSystemId(),
                               locator.getLineNumber(),
                               locator.getColumnNumber())
        raise ParserError(info + message)

    def _document_uri(self):
        """Return the locator's public id, else its system id, else None."""
        locator = self.locator
        if locator is None:
            return None
        return locator.getPublicId() or locator.getSystemId()

    def _bnode_for(self, node_id):
        """Return the blank node denoted by the rdf:nodeID value ``node_id``.

        Raises ``ParserError`` (via :meth:`error`) when ``node_id`` is not
        an NCName.  Unless ``preserve_bnode_ids`` is set, each label is
        mapped to one freshly generated BNode that is reused for every
        later occurrence of the label; otherwise the label itself becomes
        the BNode id.
        """
        if not is_ncname(node_id):
            self.error(
                "rdf:nodeID value is not a valid NCName: %s" % node_id)
        if self.preserve_bnode_ids is False:
            if node_id not in self.bnode:
                self.bnode[node_id] = BNode()
            return self.bnode[node_id]
        return BNode(node_id)

    def get_current(self):
        return self.stack[-2]
    # Create a read only property called current so that self.current
    # give the current element handler.
    current = property(get_current)

    def get_next(self):
        return self.stack[-1]
    # Create a read only property that gives the element handler to be
    # used for the next element.
    next = property(get_next)

    def get_parent(self):
        return self.stack[-3]
    # Create a read only property that gives the current parent
    # element handler
    parent = property(get_parent)

    def absolutize(self, uri):
        """Resolve ``uri`` against the current frame's base; return a URIRef."""
        result = urljoin(self.current.base, uri, allow_fragments=1)
        # Put back a trailing "#" if joining dropped it.
        if uri and uri[-1] == "#" and result[-1] != "#":
            result = "%s#" % result
        return URIRef(result)

    def convert(self, name, qname, attrs):
        """Turn SAX ``(uri, localname)`` names into URIRefs.

        :returns: ``(element_uri, atts)`` where ``atts`` maps attribute
            URIRefs to their values.  Attributes in the XML namespace, or
            whose name starts with "xml" (any case), are dropped; the
            unqualified names listed in ``UNQUALIFIED`` are mapped into
            the RDF namespace.
        """
        if name[0] is None:
            name = URIRef(name[1])
        else:
            name = URIRef("".join(name))

        atts = {}
        for (attr_name, value) in attrs.items():
            if attr_name[0] is None:
                att = attr_name[1]
            else:
                att = "".join(attr_name)
            if att.startswith(XMLNS) or att[0:3].lower() == "xml":
                continue
            if att in UNQUALIFIED:
                atts[RDFNS[att]] = value
            else:
                atts[URIRef(att)] = value
        return name, atts

    def document_element_start(self, name, qname, attrs):
        """Start callback for the document element.

        An ``rdf:RDF`` root only arranges for its children to be parsed as
        node elements; any other root is itself parsed as a node element.
        """
        if name[0] and URIRef("".join(name)) == RDF.RDF:
            child = self.next
            child.start = self.node_element_start
            child.end = self.node_element_end
        else:
            self.node_element_start(name, qname, attrs)
            # self.current.end = self.node_element_end
            # TODO... set end to something that sets start such that
            # another element will cause error

    def node_element_start(self, name, qname, attrs):
        """Start callback for a node element.

        Determines the subject from rdf:ID / rdf:nodeID / rdf:about (a new
        BNode when none is given), adds an rdf:type triple for typed node
        elements and one triple per property attribute, and stores the
        subject on the current frame.  Children are parsed as property
        elements.
        """
        name, atts = self.convert(name, qname, attrs)
        current = self.current
        absolutize = self.absolutize

        child = self.next
        child.start = self.property_element_start
        child.end = self.property_element_end

        if name in NODE_ELEMENT_EXCEPTIONS:
            self.error("Invalid node element URI: %s" % name)

        identifying = [a for a in (RDF.ID, RDF.nodeID, RDF.about)
                       if a in atts]
        if len(identifying) > 1:
            self.error(
                "Can have at most one of rdf:ID, rdf:about, and rdf:nodeID"
            )

        if RDF.ID in atts:
            rdf_id = atts[RDF.ID]
            if not is_ncname(rdf_id):
                self.error("rdf:ID value is not a valid NCName: %s" % rdf_id)
            subject = absolutize("#%s" % rdf_id)
            if subject in self.ids:
                self.error(
                    "two elements cannot use the same ID: '%s'" % subject)
            self.ids[subject] = 1  # IDs can only appear once within a document
        elif RDF.nodeID in atts:
            subject = self._bnode_for(atts[RDF.nodeID])
        elif RDF.about in atts:
            subject = absolutize(atts[RDF.about])
        else:
            subject = BNode()

        if name != RDF.Description:  # S1
            self.store.add((subject, RDF.type, absolutize(name)))

        language = current.language
        rdf_ns = str(RDFNS)
        for att in atts:
            if att.startswith(rdf_ns):
                if att == RDF.type:  # S2
                    self.store.add(
                        (subject, RDF.type, absolutize(atts[RDF.type])))
                    continue
                if att in NODE_ELEMENT_ATTRIBUTES:
                    continue
                if att in PROPERTY_ATTRIBUTE_EXCEPTIONS:  # S3
                    self.error("Invalid property attribute URI: %s" % att)
                    continue  # for when error does not throw an exception
            predicate = absolutize(att)
            try:
                value = Literal(atts[att], language)
            except Error as e:
                self.error(e.msg)
            self.store.add((subject, predicate, value))

        current.subject = subject

    def node_element_end(self, name, qname):
        """End callback for a node element: make its subject the object of
        the enclosing property element."""
        # repeat node-elements are only allowed
        # at top-level
        if self.parent.object and self.current != self.stack[2]:
            # name[0] is None for an element without a namespace, so skip
            # empty parts instead of letting str.join raise TypeError.
            self.error("Repeat node-elements inside property elements: %s"
                       % "".join(part for part in name if part))

        self.parent.object = self.current.subject

    def property_element_start(self, name, qname, attrs):
        """Start callback for a property element.

        Sets the frame's predicate and optional reification id, then
        decides from rdf:resource / rdf:nodeID / rdf:parseType how the
        object is obtained and which callbacks the child frame gets.
        """
        name, atts = self.convert(name, qname, attrs)
        current = self.current
        absolutize = self.absolutize
        child = self.next

        obj = None
        current.data = None
        current.list = None

        rdf_ns = str(RDFNS)
        if not name.startswith(rdf_ns):
            current.predicate = absolutize(name)
        elif name == RDF.li:
            current.predicate = current.next_li()
        elif name in PROPERTY_ELEMENT_EXCEPTIONS:
            self.error("Invalid property element URI: %s" % name)
        else:
            current.predicate = absolutize(name)

        rdf_id = atts.get(RDF.ID, None)
        if rdf_id is not None:
            if not is_ncname(rdf_id):
                self.error("rdf:ID value is not a valid NCName: %s" % rdf_id)
            current.id = absolutize("#%s" % rdf_id)
        else:
            current.id = None

        resource = atts.get(RDF.resource, None)
        node_id = atts.get(RDF.nodeID, None)
        parse_type = atts.get(RDF.parseType, None)
        if resource is not None and node_id is not None:
            self.error(
                "Property element cannot have both rdf:nodeID and rdf:resource"
            )

        if resource is not None:
            obj = absolutize(resource)
            child.start = self.node_element_start
            child.end = self.node_element_end
        elif node_id is not None:
            obj = self._bnode_for(node_id)
            child.start = self.node_element_start
            child.end = self.node_element_end
        elif parse_type is not None:
            for att in atts:
                if att != RDF.parseType and att != RDF.ID:
                    self.error("Property attr '%s' not allowed here" % att)
            if parse_type == "Resource":
                current.subject = obj = BNode()
                current.char = self.property_element_char
                child.start = self.property_element_start
                child.end = self.property_element_end
            elif parse_type == "Collection":
                current.char = None
                obj = current.list = RDF.nil  # BNode()
                # self.parent.subject
                child.start = self.node_element_start
                child.end = self.list_node_element_end
            else:  # if parse_type=="Literal":
                # All other values are treated as Literal
                # See: http://www.w3.org/TR/rdf-syntax-grammar/
                # parseTypeOtherPropertyElt
                obj = Literal("", datatype=RDF.XMLLiteral)
                current.char = self.literal_element_char
                current.declared = {XMLNS: 'xml'}
                child.start = self.literal_element_start
                child.char = self.literal_element_char
                child.end = self.literal_element_end
            current.object = obj
            return
        else:
            current.char = self.property_element_char
            child.start = self.node_element_start
            child.end = self.node_element_end

        datatype = current.datatype = atts.get(RDF.datatype, None)
        language = current.language
        if datatype is not None:
            # TODO: check that there are no atts other than datatype and id
            # NOTE(review): the resolved value is not stored anywhere;
            # property_element_end() reads the raw attribute value from
            # current.datatype.  Left unchanged to keep the output as is.
            datatype = absolutize(datatype)
        else:
            # Property attributes describe the object node; when no object
            # exists yet, a blank node is created for the first of them.
            for att in atts:
                if att.startswith(rdf_ns):
                    if att in PROPERTY_ELEMENT_ATTRIBUTES:
                        continue
                    if att in PROPERTY_ATTRIBUTE_EXCEPTIONS:
                        self.error(
                            """Invalid property attribute URI: %s""" % att)
                predicate = absolutize(att)

                if att == RDF.type:
                    value = URIRef(atts[att])
                else:
                    value = Literal(atts[att], language)

                if obj is None:
                    obj = BNode()
                self.store.add((obj, predicate, value))

        if obj is None:
            # No object yet: collect character data for a literal.
            current.data = ""
            current.object = None
        else:
            current.data = None
            current.object = obj

    def property_element_char(self, data):
        """Append ``data`` to the pending literal text, if one is being
        collected."""
        current = self.current
        if current.data is not None:
            current.data += data

    def property_element_end(self, name, qname):
        """End callback for a property element: build the literal object if
        needed, close an open collection and add the resulting triple
        (plus its reification when an rdf:ID was given)."""
        current = self.current
        if current.data is not None and current.object is None:
            # A datatyped literal carries no language tag.
            literal_lang = current.language
            if current.datatype is not None:
                literal_lang = None
            current.object = Literal(
                current.data, literal_lang, current.datatype)
            current.data = None
        if self.next.end == self.list_node_element_end:
            if current.object != RDF.nil:
                self.store.add((current.list, RDF.rest, RDF.nil))
        if current.object is not None:
            statement = (
                self.parent.subject, current.predicate, current.object)
            self.store.add(statement)
            if current.id is not None:
                self.add_reified(current.id, statement)
        current.subject = None

    def list_node_element_end(self, name, qname):
        """End callback for a member of a parseType="Collection" property:
        append the member's subject to the RDF list being built."""
        current = self.current
        cell = BNode()
        # Removed between 20030123 and 20030905
        # self.store.add((cell, RDF.type, LIST))
        if self.parent.list == RDF.nil:
            # First member: the new cell is the head of the list and so
            # becomes the property's object.
            self.parent.list = cell
            self.store.add((self.parent.list, RDF.first, current.subject))
            self.parent.object = cell
            self.parent.char = None
        else:
            self.store.add((self.parent.list, RDF.rest, cell))
            self.store.add((cell, RDF.first, current.subject))
            self.parent.list = cell

    def literal_element_start(self, name, qname, attrs):
        """Start callback inside an XML literal: begin serialising the
        start tag into the current frame's ``object`` string."""
        current = self.current
        child = self.next
        child.start = self.literal_element_start
        child.char = self.literal_element_char
        child.end = self.literal_element_end
        current.declared = self.parent.declared.copy()

        if name[0]:
            prefix = self._current_context[name[0]]
            if prefix:
                current.object = "<%s:%s" % (prefix, name[1])
            else:
                current.object = "<%s" % name[1]
            # Emit a namespace declaration the first time a namespace is
            # used within this literal.
            if name[0] not in current.declared:
                current.declared[name[0]] = prefix
                if prefix:
                    current.object += (' xmlns:%s="%s"' % (prefix, name[0]))
                else:
                    current.object += (' xmlns="%s"' % name[0])
        else:
            current.object = "<%s" % name[1]

        for (attr_name, value) in attrs.items():
            if attr_name[0]:
                if attr_name[0] not in current.declared:
                    current.declared[attr_name[0]] = \
                        self._current_context[attr_name[0]]
                attr_text = current.declared[attr_name[0]] + ":" + attr_name[1]
            else:
                attr_text = attr_name[1]
            current.object += (' %s=%s' % (attr_text, quoteattr(value)))
        current.object += ">"

    def literal_element_char(self, data):
        """Append XML-escaped character data to the literal being built."""
        self.current.object += escape(data)

    def literal_element_end(self, name, qname):
        """End callback inside an XML literal: append this element's
        serialisation and its end tag to the parent's ``object``."""
        prefix = self._current_context[name[0]] if name[0] else None
        if prefix:
            end_tag = "</%s:%s>" % (prefix, name[1])
        else:
            end_tag = "</%s>" % name[1]
        self.parent.object += self.current.object + end_tag
544
545
def create_parser(target, store):
    """Build a namespace-aware SAX parser that feeds ``store``.

    :param target: passed to the handler's ``setDocumentLocator`` as the
        initial locator.
    :param store: handed to the ``RDFXMLHandler`` that receives the
        parse events.
    :returns: the configured SAX parser; its content handler is the
        ``RDFXMLHandler`` created here.
    """
    sax_parser = make_parser()
    try:
        # Workaround for bug in expatreader.py. Needed when
        # expatreader is trying to guess a prefix.
        sax_parser.start_namespace_decl(
            "xml", "http://www.w3.org/XML/1998/namespace")
    except AttributeError:
        pass  # Not present in Jython (at least)
    sax_parser.setFeature(handler.feature_namespaces, 1)

    rdf_handler = RDFXMLHandler(store)
    rdf_handler.setDocumentLocator(target)
    # rdf_handler.setDocumentLocator(_Locator(self.url, self.parser))

    sax_parser.setContentHandler(rdf_handler)
    sax_parser.setErrorHandler(ErrorHandler())
    return sax_parser
562
563
class RDFXMLParser(Parser):
    """Parser plugin that reads RDF/XML through a SAX parser."""

    def __init__(self):
        """Nothing to set up; the SAX parser is created by parse()."""

    def parse(self, source, sink, **args):
        """Parse ``source`` and add its triples to ``sink``.

        :param source: input handed both to ``create_parser`` and to the
            SAX parser's ``parse``.
        :param sink: store that receives the triples.
        :param args: optional ``preserve_bnode_ids``; when given and not
            None it is copied onto the content handler.
        """
        sax_parser = create_parser(source, sink)
        self._parser = sax_parser
        rdf_handler = sax_parser.getContentHandler()

        keep_ids = args.get("preserve_bnode_ids", None)
        if keep_ids is not None:
            rdf_handler.preserve_bnode_ids = keep_ids

        # # We're only using it once now
        # rdf_handler.reset()
        # self._parser.reset()
        sax_parser.parse(source)