Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/rdfxml.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 (2020-07-31) |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:d30785e31577 | 1:56ad4e20f292 |
---|---|
1 """ | |
2 An RDF/XML parser for RDFLib | |
3 """ | |
4 | |
5 from xml.sax import make_parser | |
6 from xml.sax.handler import ErrorHandler | |
7 from xml.sax.saxutils import handler, quoteattr, escape | |
8 from urllib.parse import urljoin, urldefrag | |
9 | |
10 from rdflib.namespace import RDF, is_ncname | |
11 from rdflib.term import URIRef | |
12 from rdflib.term import BNode | |
13 from rdflib.term import Literal | |
14 from rdflib.exceptions import ParserError, Error | |
15 from rdflib.parser import Parser | |
16 | |
__all__ = [
    "create_parser",
    "BagID",
    "ElementHandler",
    "RDFXMLHandler",
    "RDFXMLParser",
]

# Short alias for the RDF namespace used throughout this module.
RDFNS = RDF

# Unqualified attribute names and the qualified RDF term each stands for.
# See http://www.w3.org/TR/rdf-syntax-grammar/#eventterm-attribute-URI
UNQUALIFIED = dict(
    about=RDF.about,
    ID=RDF.ID,
    type=RDF.type,
    resource=RDF.resource,
    parseType=RDF.parseType,
)

# See http://www.w3.org/TR/rdf-syntax-grammar/#coreSyntaxTerms
CORE_SYNTAX_TERMS = [
    RDF.RDF,
    RDF.ID,
    RDF.about,
    RDF.parseType,
    RDF.resource,
    RDF.nodeID,
    RDF.datatype,
]

# See http://www.w3.org/TR/rdf-syntax-grammar/#syntaxTerms
SYNTAX_TERMS = CORE_SYNTAX_TERMS + [RDF.Description, RDF.li]

# See http://www.w3.org/TR/rdf-syntax-grammar/#oldTerms
OLD_TERMS = [
    URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"),
    URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"),
    URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID"),
]

# URIs rejected as node element names, and the attributes that identify
# the subject of a node element.
NODE_ELEMENT_EXCEPTIONS = CORE_SYNTAX_TERMS + [RDF.li] + OLD_TERMS
NODE_ELEMENT_ATTRIBUTES = [RDF.ID, RDF.nodeID, RDF.about]

# URIs rejected as property element names / property attribute names, and
# the RDF attributes a property element handles itself.
PROPERTY_ELEMENT_EXCEPTIONS = CORE_SYNTAX_TERMS + [RDF.Description] + OLD_TERMS
PROPERTY_ATTRIBUTE_EXCEPTIONS = (
    CORE_SYNTAX_TERMS + [RDF.Description, RDF.li] + OLD_TERMS
)
PROPERTY_ELEMENT_ATTRIBUTES = [RDF.ID, RDF.resource, RDF.nodeID]

# (namespace, local name) keys under which SAX reports xml:base / xml:lang.
XMLNS = "http://www.w3.org/XML/1998/namespace"
BASE = (XMLNS, "base")
LANG = (XMLNS, "lang")
55 | |
56 | |
class BagID(URIRef):
    """A URIRef that also carries a running ``li`` counter."""

    __slots__ = ['li']

    def __init__(self, val):
        """Initialise the counter.

        :param val: the URI text; accepted for signature compatibility.
        """
        # The previous ``super(URIRef, self).__init__(val)`` named the wrong
        # class and forwarded ``val`` up the chain. If that chain ends in
        # ``object.__init__`` the extra argument raises TypeError on
        # Python 3, making the class impossible to instantiate.
        # NOTE(review): assumes URIRef builds its string value in __new__
        # and defines no __init__ that needs ``val`` - confirm against
        # rdflib.term.
        super().__init__()
        self.li = 0

    def next_li(self):
        """Advance the counter and return ``RDFNS[counter]``."""
        self.li += 1
        return RDFNS[self.li]
67 | |
68 | |
class ElementHandler(object):
    """Mutable record holding the parser state for one element level.

    ``start``, ``char`` and ``end`` are the callbacks RDFXMLHandler invokes
    for SAX events at this level; the remaining slots are scratch state
    filled in by those callbacks.
    """

    __slots__ = ['start', 'char', 'end', 'li', 'id',
                 'base', 'subject', 'predicate', 'object',
                 'list', 'language', 'datatype', 'declared', 'data']

    def __init__(self):
        """Set every slot to its empty value (``li`` starts at 0)."""
        self.start = None
        self.char = None
        self.end = None
        self.li = 0
        self.id = None
        self.base = None
        self.subject = None
        # ``predicate`` is a declared slot; it used to be left unset, so
        # reading it before the first assignment raised AttributeError.
        self.predicate = None
        self.object = None
        self.list = None
        self.language = None
        self.datatype = None
        self.declared = None
        self.data = None

    def next_li(self):
        """Advance the ``li`` counter and return ``RDFNS[counter]``."""
        self.li += 1
        return RDFNS[self.li]
92 | |
93 | |
class RDFXMLHandler(handler.ContentHandler):
    """SAX content handler that adds the triples of an RDF/XML document
    to ``store``.

    Parsing state lives in ``self.stack``, a list of ElementHandler
    records.  ``startElementNS`` pushes a fresh record and ``endElementNS``
    pops one, so relative to the element being processed:

    * ``self.current`` (``stack[-2]``) is the record whose ``start`` /
      ``char`` / ``end`` callbacks receive the element's events,
    * ``self.next`` (``stack[-1]``) is the record that will serve the
      element's children,
    * ``self.parent`` (``stack[-3]``) is the record of the enclosing
      element.

    A record is pushed when an element starts and popped when that element
    ends, so all children of one element share the same record; this is
    why the ``*_start`` callbacks reset per-element fields on ``current``.
    """

    _SUBJECT_ATTR_CONFLICT = \
        "Can have at most one of rdf:ID, rdf:about, and rdf:nodeID"

    def __init__(self, store):
        """Remember ``store`` (the sink for triples) and reset the state."""
        self.store = store
        self.preserve_bnode_ids = False
        self.reset()

    def reset(self):
        """Re-create the element stack and the per-document bookkeeping."""
        document_element = ElementHandler()
        document_element.start = self.document_element_start
        document_element.end = lambda name, qname: None
        self.stack = [None, document_element]
        self.ids = {}  # remember IDs we have already seen
        self.bnode = {}  # rdf:nodeID value -> BNode created for it
        self._ns_contexts = [{}]  # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]

    # ContentHandler methods

    def setDocumentLocator(self, locator):
        """Store the object queried for public/system id and position."""
        self.locator = locator

    def startDocument(self):
        pass

    def startPrefixMapping(self, prefix, namespace):
        """Record ``namespace -> prefix`` and bind it on the store."""
        # Save a snapshot of the mappings as they were before this one.
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[namespace] = prefix
        self.store.bind(prefix, namespace or "", override=False)

    def endPrefixMapping(self, prefix):
        """Restore the mappings that were active before the last start."""
        self._current_context = self._ns_contexts[-1]
        del self._ns_contexts[-1]

    def startElementNS(self, name, qname, attrs):
        """Push a record for the children, resolve xml:base / xml:lang for
        this element and dispatch to the current ``start`` callback."""
        self.stack.append(ElementHandler())
        current = self.current
        parent = self.parent

        base = attrs.get(BASE, None)
        if base is not None:
            # Explicit xml:base: drop its fragment, then resolve it against
            # the inherited base or, failing that, the document id.
            base, _frag = urldefrag(base)
            if parent and parent.base:
                base = urljoin(parent.base, base)
            else:
                system_id = self._document_id()
                if system_id:
                    base = urljoin(system_id, base)
        else:
            if parent:
                base = parent.base
            if base is None:
                system_id = self._document_id()
                if system_id:
                    base, _frag = urldefrag(system_id)
        current.base = base

        language = attrs.get(LANG, None)
        if language is None and parent:
            language = parent.language
        current.language = language

        current.start(name, qname, attrs)

    def endElementNS(self, name, qname):
        """Dispatch to the current ``end`` callback, then pop the record
        that served this element's children."""
        self.current.end(name, qname)
        self.stack.pop()

    def characters(self, content):
        """Forward text to the current ``char`` callback, if one is set."""
        char = self.current.char
        if char:
            char(content)

    def ignorableWhitespace(self, content):
        pass

    def processingInstruction(self, target, data):
        pass

    # Helpers

    def _document_id(self):
        """Return the locator's public id, or else its system id."""
        return self.locator.getPublicId() or self.locator.getSystemId()

    def _bnode_for(self, node_id):
        """Return the blank node standing for the rdf:nodeID ``node_id``.

        When ``preserve_bnode_ids`` is exactly ``False`` a fresh BNode is
        created the first time an id is seen and reused afterwards;
        otherwise the id itself is used as the BNode identifier.
        """
        if self.preserve_bnode_ids is False:
            if node_id not in self.bnode:
                self.bnode[node_id] = BNode()
            return self.bnode[node_id]
        return BNode(node_id)

    def add_reified(self, sid, xxx_todo_changeme):
        """Add the four reification triples describing the statement
        ``xxx_todo_changeme`` (an ``(s, p, o)`` tuple) under ``sid``."""
        (s, p, o) = xxx_todo_changeme
        self.store.add((sid, RDF.type, RDF.Statement))
        self.store.add((sid, RDF.subject, s))
        self.store.add((sid, RDF.predicate, p))
        self.store.add((sid, RDF.object, o))

    def error(self, message):
        """Raise ParserError with ``message`` prefixed by the locator's
        ``systemId:line:column``."""
        locator = self.locator
        info = "%s:%s:%s: " % (locator.getSystemId(),
                               locator.getLineNumber(),
                               locator.getColumnNumber())
        raise ParserError(info + message)

    def get_current(self):
        return self.stack[-2]
    # Create a read only property called current so that self.current
    # give the current element handler.
    current = property(get_current)

    def get_next(self):
        return self.stack[-1]
    # Create a read only property that gives the element handler to be
    # used for the next element.
    next = property(get_next)

    def get_parent(self):
        return self.stack[-3]
    # Create a read only property that gives the current parent
    # element handler
    parent = property(get_parent)

    def absolutize(self, uri):
        """Resolve ``uri`` against the current base and return a URIRef."""
        result = urljoin(self.current.base, uri, allow_fragments=1)
        # urljoin can drop a bare trailing "#"; put it back.
        if uri and uri[-1] == "#" and result[-1] != "#":
            result = "%s#" % result
        return URIRef(result)

    def convert(self, name, qname, attrs):
        """Turn SAX ``(namespace, local)`` names into URIRefs.

        Returns ``(name, atts)`` where ``name`` is the element URIRef and
        ``atts`` maps attribute URIRefs to their values.  Attributes whose
        name starts with the XML namespace or with "xml" (any case) are
        dropped; names listed in UNQUALIFIED are mapped into the RDF
        namespace.
        """
        if name[0] is None:
            name = URIRef(name[1])
        else:
            name = URIRef("".join(name))
        atts = {}
        for (n, v) in attrs.items():
            if n[0] is None:
                att = n[1]
            else:
                att = "".join(n)
            if att.startswith(XMLNS) or att[0:3].lower() == "xml":
                continue
            if att in UNQUALIFIED:
                atts[RDFNS[att]] = v
            else:
                atts[URIRef(att)] = v
        return name, atts

    # Element callbacks

    def document_element_start(self, name, qname, attrs):
        """Handle the document element: an rdf:RDF wrapper just arms the
        node-element callbacks for its children, anything else is itself
        treated as a node element."""
        if name[0] and URIRef("".join(name)) == RDF.RDF:
            next_handler = self.next
            next_handler.start = self.node_element_start
            next_handler.end = self.node_element_end
        else:
            self.node_element_start(name, qname, attrs)
            # self.current.end = self.node_element_end
            # TODO... set end to something that sets start such that
            # another element will cause error

    def node_element_start(self, name, qname, attrs):
        """Start a node element: determine its subject, add its rdf:type
        and property-attribute triples, and arm the property-element
        callbacks for its children."""
        name, atts = self.convert(name, qname, attrs)
        current = self.current
        absolutize = self.absolutize

        next_handler = self.next
        next_handler.start = self.property_element_start
        next_handler.end = self.property_element_end

        if name in NODE_ELEMENT_EXCEPTIONS:
            self.error("Invalid node element URI: %s" % name)

        if RDF.ID in atts:
            if RDF.about in atts or RDF.nodeID in atts:
                self.error(self._SUBJECT_ATTR_CONFLICT)
            rdf_id = atts[RDF.ID]
            if not is_ncname(rdf_id):
                self.error("rdf:ID value is not a valid NCName: %s" % rdf_id)
            subject = absolutize("#%s" % rdf_id)
            if subject in self.ids:
                self.error(
                    "two elements cannot use the same ID: '%s'" % subject)
            self.ids[subject] = 1  # IDs can only appear once within a document
        elif RDF.nodeID in atts:
            # rdf:ID is known to be absent here, so only rdf:about can clash.
            if RDF.about in atts:
                self.error(self._SUBJECT_ATTR_CONFLICT)
            node_id = atts[RDF.nodeID]
            if not is_ncname(node_id):
                self.error(
                    "rdf:nodeID value is not a valid NCName: %s" % node_id)
            subject = self._bnode_for(node_id)
        elif RDF.about in atts:
            subject = absolutize(atts[RDF.about])
        else:
            subject = BNode()

        if name != RDF.Description:  # S1
            self.store.add((subject, RDF.type, absolutize(name)))

        language = current.language
        rdf_prefix = str(RDFNS)
        for att, value in atts.items():
            if att.startswith(rdf_prefix):
                if att == RDF.type:  # S2
                    self.store.add((subject, RDF.type, absolutize(value)))
                    continue
                if att in NODE_ELEMENT_ATTRIBUTES:
                    continue
                if att in PROPERTY_ATTRIBUTE_EXCEPTIONS:  # S3
                    self.error("Invalid property attribute URI: %s" % att)
                    continue  # for when error does not throw an exception
            # Every other attribute is a property with a literal value.
            predicate = absolutize(att)
            try:
                literal = Literal(value, language)
            except Error as e:
                self.error(e.msg)
            self.store.add((subject, predicate, literal))

        current.subject = subject

    def node_element_end(self, name, qname):
        """Finish a node element and hand its subject to the enclosing
        property element as that property's object."""
        # repeat node-elements are only allowed
        # at top-level
        if self.parent.object and self.current != self.stack[2]:
            self.error("Repeat node-elements inside property elements: %s"%"".join(name))

        self.parent.object = self.current.subject

    def property_element_start(self, name, qname, attrs):
        """Start a property element: work out its predicate, any object
        given through attributes, and which callbacks its content needs."""
        name, atts = self.convert(name, qname, attrs)
        current = self.current
        absolutize = self.absolutize
        next_handler = self.next

        obj = None
        # ``current`` is shared with earlier sibling properties: clear it.
        current.data = None
        current.list = None

        if not name.startswith(str(RDFNS)):
            current.predicate = absolutize(name)
        elif name == RDF.li:
            current.predicate = current.next_li()
        elif name in PROPERTY_ELEMENT_EXCEPTIONS:
            self.error("Invalid property element URI: %s" % name)
        else:
            current.predicate = absolutize(name)

        rdf_id = atts.get(RDF.ID, None)
        if rdf_id is not None:
            if not is_ncname(rdf_id):
                self.error("rdf:ID value is not a valid NCName: %s" % rdf_id)
            current.id = absolutize("#%s" % rdf_id)
        else:
            current.id = None

        resource = atts.get(RDF.resource, None)
        node_id = atts.get(RDF.nodeID, None)
        parse_type = atts.get(RDF.parseType, None)
        if resource is not None and node_id is not None:
            self.error(
                "Property element cannot have both rdf:nodeID and rdf:resource"
            )
        if resource is not None:
            obj = absolutize(resource)
            next_handler.start = self.node_element_start
            next_handler.end = self.node_element_end
        elif node_id is not None:
            if not is_ncname(node_id):
                self.error(
                    "rdf:nodeID value is not a valid NCName: %s" % node_id)
            obj = self._bnode_for(node_id)
            next_handler.start = self.node_element_start
            next_handler.end = self.node_element_end
        elif parse_type is not None:
            for att in atts:
                if att != RDF.parseType and att != RDF.ID:
                    self.error("Property attr '%s' not allowed here" % att)
            if parse_type == "Resource":
                current.subject = obj = BNode()
                current.char = self.property_element_char
                next_handler.start = self.property_element_start
                next_handler.end = self.property_element_end
            elif parse_type == "Collection":
                current.char = None
                obj = current.list = RDF.nil  # BNode()
                # self.parent.subject
                next_handler.start = self.node_element_start
                next_handler.end = self.list_node_element_end
            else:  # if parse_type=="Literal":
                # All other values are treated as Literal
                # See: http://www.w3.org/TR/rdf-syntax-grammar/
                # parseTypeOtherPropertyElt
                obj = Literal("", datatype=RDF.XMLLiteral)
                current.char = self.literal_element_char
                current.declared = {XMLNS: 'xml'}
                next_handler.start = self.literal_element_start
                next_handler.char = self.literal_element_char
                next_handler.end = self.literal_element_end
            current.object = obj
            return
        else:
            current.char = self.property_element_char
            next_handler.start = self.node_element_start
            next_handler.end = self.node_element_end

        datatype = current.datatype = atts.get(RDF.datatype, None)
        language = current.language
        if datatype is not None:
            # TODO: check that there are no atts other than datatype and id
            # NOTE(review): the absolutized value is not used afterwards;
            # property_element_end builds the literal from the raw
            # ``current.datatype``.  Left as-is pending confirmation of
            # the intended behaviour.
            datatype = absolutize(datatype)
        else:
            for att, value in atts.items():
                if not att.startswith(str(RDFNS)):
                    predicate = absolutize(att)
                elif att in PROPERTY_ELEMENT_ATTRIBUTES:
                    continue
                elif att in PROPERTY_ATTRIBUTE_EXCEPTIONS:
                    self.error("""Invalid property attribute URI: %s""" % att)
                else:
                    predicate = absolutize(att)

                if att == RDF.type:
                    o = URIRef(value)
                else:
                    # No rdf:datatype on this branch, so the element's
                    # language applies to the literal.
                    o = Literal(value, language)

                # Property attributes describe an implicit blank node.
                if obj is None:
                    obj = BNode()
                self.store.add((obj, predicate, o))

        if obj is None:
            # No object yet: start collecting character data.
            current.data = ""
            current.object = None
        else:
            current.data = None
            current.object = obj

    def property_element_char(self, data):
        """Append text to the pending literal, if one is being collected."""
        current = self.current
        if current.data is not None:
            current.data += data

    def property_element_end(self, name, qname):
        """Finish a property element and add its triple (plus the
        reification triples when the element carried an rdf:ID)."""
        current = self.current
        if current.data is not None and current.object is None:
            # Text-only content: a datatype suppresses the language tag.
            literal_lang = current.language
            if current.datatype is not None:
                literal_lang = None
            current.object = Literal(
                current.data, literal_lang, current.datatype)
            current.data = None
        if self.next.end == self.list_node_element_end:
            # A collection that received members is closed with rdf:nil.
            if current.object != RDF.nil:
                self.store.add((current.list, RDF.rest, RDF.nil))
        if current.object is not None:
            statement = (self.parent.subject, current.predicate,
                         current.object)
            self.store.add(statement)
            if current.id is not None:
                self.add_reified(current.id, statement)
        current.subject = None

    def list_node_element_end(self, name, qname):
        """Finish a member of a parseType="Collection" property by linking
        a new list cell for it."""
        current = self.current
        parent = self.parent
        cell = BNode()
        if parent.list == RDF.nil:
            # First member: the new cell becomes the property's object.
            # Removed between 20030123 and 20030905
            # self.store.add((list, RDF.type, LIST))
            parent.list = cell
            self.store.add((cell, RDF.first, current.subject))
            parent.object = cell
            parent.char = None
        else:
            # Later member: chain the new cell after the previous one.
            # Removed between 20030123 and 20030905
            # self.store.add((list, RDF.type, LIST))
            self.store.add((parent.list, RDF.rest, cell))
            self.store.add((cell, RDF.first, current.subject))
            parent.list = cell

    def literal_element_start(self, name, qname, attrs):
        """Serialise the start tag of an element nested inside an XML
        literal into ``current.object``, emitting an xmlns declaration the
        first time the element's namespace is seen."""
        current = self.current
        next_handler = self.next
        next_handler.start = self.literal_element_start
        next_handler.char = self.literal_element_char
        next_handler.end = self.literal_element_end
        current.declared = self.parent.declared.copy()
        if name[0]:
            prefix = self._current_context[name[0]]
            if prefix:
                current.object = "<%s:%s" % (prefix, name[1])
            else:
                current.object = "<%s" % name[1]
            if name[0] not in current.declared:
                current.declared[name[0]] = prefix
                if prefix:
                    current.object += (' xmlns:%s="%s"' % (prefix, name[0]))
                else:
                    current.object += (' xmlns="%s"' % name[0])
        else:
            current.object = "<%s" % name[1]

        for (attr_name, value) in attrs.items():
            if attr_name[0]:
                if attr_name[0] not in current.declared:
                    current.declared[attr_name[0]] = \
                        self._current_context[attr_name[0]]
                attr_qname = \
                    current.declared[attr_name[0]] + ":" + attr_name[1]
            else:
                attr_qname = attr_name[1]
            current.object += (' %s=%s' % (attr_qname, quoteattr(value)))
        current.object += ">"

    def literal_element_char(self, data):
        """Append XML-escaped text to the literal being serialised."""
        self.current.object += escape(data)

    def literal_element_end(self, name, qname):
        """Append this element's serialisation plus its end tag to the
        enclosing record's ``object``."""
        prefix = self._current_context[name[0]] if name[0] else None
        if prefix:
            end = "</%s:%s>" % (prefix, name[1])
        else:
            end = "</%s>" % name[1]
        self.parent.object += self.current.object + end
544 | |
545 | |
def create_parser(target, store):
    """Build a namespace-aware SAX parser wired to a new RDFXMLHandler.

    :param target: object installed as the handler's document locator.
    :param store: sink handed to the RDFXMLHandler.
    :returns: the configured SAX parser; the handler is reachable through
        its ``getContentHandler()``.
    """
    sax_parser = make_parser()
    try:
        # Workaround for bug in expatreader.py. Needed when
        # expatreader is trying to guess a prefix.
        sax_parser.start_namespace_decl(
            "xml", "http://www.w3.org/XML/1998/namespace")
    except AttributeError:
        pass  # Not present in Jython (at least)
    sax_parser.setFeature(handler.feature_namespaces, 1)

    content_handler = RDFXMLHandler(store)
    content_handler.setDocumentLocator(target)
    # content_handler.setDocumentLocator(_Locator(self.url, self.parser))
    sax_parser.setContentHandler(content_handler)
    sax_parser.setErrorHandler(ErrorHandler())
    return sax_parser
562 | |
563 | |
class RDFXMLParser(Parser):
    """Parser plugin that feeds an RDF/XML source through a SAX parser."""

    def __init__(self):
        pass

    def parse(self, source, sink, **args):
        """Parse ``source`` and add the resulting triples to ``sink``.

        The only keyword argument read is ``preserve_bnode_ids``; when it
        is given and not ``None`` it is copied onto the content handler.
        The SAX parser is kept on ``self._parser``.
        """
        sax_parser = create_parser(source, sink)
        self._parser = sax_parser
        content_handler = sax_parser.getContentHandler()
        keep_ids = args.get("preserve_bnode_ids", None)
        if keep_ids is not None:
            content_handler.preserve_bnode_ids = keep_ids
        # # We're only using it once now
        # content_handler.reset()
        # self._parser.reset()
        sax_parser.parse(source)