Mercurial > repos > guerler > springsuite
diff planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/rdfxml.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
line wrap: on
line diff
"""
An RDF/XML parser for RDFLib
"""

from xml.sax import make_parser
from xml.sax.handler import ErrorHandler
from xml.sax.saxutils import handler, quoteattr, escape
from urllib.parse import urljoin, urldefrag

from rdflib.namespace import RDF, is_ncname
from rdflib.term import URIRef
from rdflib.term import BNode
from rdflib.term import Literal
from rdflib.exceptions import ParserError, Error
from rdflib.parser import Parser

__all__ = ['create_parser', 'BagID', 'ElementHandler',
           'RDFXMLHandler', 'RDFXMLParser']

RDFNS = RDF

# http://www.w3.org/TR/rdf-syntax-grammar/#eventterm-attribute-URI
# A mapping from unqualified terms to their qualified version.
UNQUALIFIED = {"about": RDF.about,
               "ID": RDF.ID,
               "type": RDF.type,
               "resource": RDF.resource,
               "parseType": RDF.parseType}

# http://www.w3.org/TR/rdf-syntax-grammar/#coreSyntaxTerms
CORE_SYNTAX_TERMS = [RDF.RDF, RDF.ID, RDF.about, RDF.parseType,
                     RDF.resource, RDF.nodeID, RDF.datatype]

# http://www.w3.org/TR/rdf-syntax-grammar/#syntaxTerms
SYNTAX_TERMS = CORE_SYNTAX_TERMS + [RDF.Description, RDF.li]

# http://www.w3.org/TR/rdf-syntax-grammar/#oldTerms
OLD_TERMS = [
    URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEach"),
    URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#aboutEachPrefix"),
    URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#bagID")]

NODE_ELEMENT_EXCEPTIONS = CORE_SYNTAX_TERMS + [RDF.li, ] + OLD_TERMS
NODE_ELEMENT_ATTRIBUTES = [RDF.ID, RDF.nodeID, RDF.about]

PROPERTY_ELEMENT_EXCEPTIONS = \
    CORE_SYNTAX_TERMS + [RDF.Description, ] + OLD_TERMS
PROPERTY_ATTRIBUTE_EXCEPTIONS = \
    CORE_SYNTAX_TERMS + [RDF.Description, RDF.li] + OLD_TERMS
PROPERTY_ELEMENT_ATTRIBUTES = [RDF.ID, RDF.resource, RDF.nodeID]

XMLNS = "http://www.w3.org/XML/1998/namespace"
BASE = (XMLNS, "base")
LANG = (XMLNS, "lang")


class BagID(URIRef):
    """A URIRef that also hands out successive ``rdf:_n`` member terms."""

    __slots__ = ['li']

    def __init__(self, val):
        # ``val`` has already been consumed by ``URIRef.__new__``.  Passing
        # it further up the MRO would end at ``object.__init__``, which
        # rejects extra arguments on Python 3, so call without arguments.
        super(BagID, self).__init__()
        self.li = 0

    def next_li(self):
        """Advance the member counter and return ``RDFNS[counter]``."""
        self.li += 1
        return RDFNS[self.li]


class ElementHandler(object):
    """Per-element parser state kept on ``RDFXMLHandler.stack``.

    ``start``, ``char`` and ``end`` hold the callbacks used for the
    element's SAX events; the remaining slots carry the values collected
    while the element is open.
    """

    __slots__ = ['start', 'char', 'end', 'li', 'id',
                 'base', 'subject', 'predicate', 'object',
                 'list', 'language', 'datatype', 'declared', 'data']

    def __init__(self):
        self.start = None
        self.char = None
        self.end = None
        self.li = 0
        self.id = None
        self.base = None
        self.subject = None
        # Initialised like every other slot so that reading it before a
        # property element assigns it yields None, not AttributeError.
        self.predicate = None
        self.object = None
        self.list = None
        self.language = None
        self.datatype = None
        self.declared = None
        self.data = None

    def next_li(self):
        """Advance the ``rdf:li`` counter and return ``RDFNS[counter]``."""
        self.li += 1
        return RDFNS[self.li]


class RDFXMLHandler(handler.ContentHandler):
    """SAX content handler that adds the triples it parses to ``store``.

    A stack of ``ElementHandler`` objects tracks the open elements.  The
    top of the stack is always a pre-allocated handler for the *next*
    child element, so the handler of an open element can decide which
    callbacks its children will use.
    """

    def __init__(self, store):
        super(RDFXMLHandler, self).__init__()
        self.store = store
        self.preserve_bnode_ids = False
        self.reset()

    def reset(self):
        """Reinitialise the element stack and all per-document state."""
        document_element = ElementHandler()
        document_element.start = self.document_element_start
        document_element.end = lambda name, qname: None
        self.stack = [None, document_element, ]
        self.ids = {}  # remember IDs we have already seen
        self.bnode = {}
        self._ns_contexts = [{}]  # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]

    # ContentHandler methods

    def setDocumentLocator(self, locator):
        self.locator = locator

    def startDocument(self):
        pass

    def startPrefixMapping(self, prefix, namespace):
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[namespace] = prefix
        self.store.bind(prefix, namespace or "", override=False)

    def endPrefixMapping(self, prefix):
        self._current_context = self._ns_contexts[-1]
        del self._ns_contexts[-1]

    def _system_id(self):
        """Return the locator's public id, falling back to its system id."""
        return self.locator.getPublicId() or self.locator.getSystemId()

    def startElementNS(self, name, qname, attrs):
        """Push a handler for the next level, resolve ``xml:base`` and
        ``xml:lang`` for the element, then call its ``start`` callback."""
        self.stack.append(ElementHandler())
        current = self.current
        parent = self.parent

        base = attrs.get(BASE, None)
        if base is not None:
            base, _frag = urldefrag(base)
            if parent and parent.base:
                base = urljoin(parent.base, base)
            else:
                system_id = self._system_id()
                if system_id:
                    base = urljoin(system_id, base)
        else:
            if parent:
                base = parent.base
            if base is None:
                system_id = self._system_id()
                if system_id:
                    base, _frag = urldefrag(system_id)
        current.base = base

        language = attrs.get(LANG, None)
        if language is None and parent:
            language = parent.language
        current.language = language

        current.start(name, qname, attrs)

    def endElementNS(self, name, qname):
        self.current.end(name, qname)
        self.stack.pop()

    def characters(self, content):
        char = self.current.char
        if char:
            char(content)

    def ignorableWhitespace(self, content):
        pass

    def processingInstruction(self, target, data):
        pass

    def add_reified(self, sid, xxx_todo_changeme):
        """Add the four reification triples describing the statement
        ``xxx_todo_changeme`` (an ``(s, p, o)`` tuple) with ``sid`` as
        their subject."""
        (s, p, o) = xxx_todo_changeme
        self.store.add((sid, RDF.type, RDF.Statement))
        self.store.add((sid, RDF.subject, s))
        self.store.add((sid, RDF.predicate, p))
        self.store.add((sid, RDF.object, o))

    def error(self, message):
        """Raise ``ParserError`` with ``message`` prefixed by the
        locator's ``system id:line:column``."""
        locator = self.locator
        info = "%s:%s:%s: " % (locator.getSystemId(),
                               locator.getLineNumber(),
                               locator.getColumnNumber())
        raise ParserError(info + message)

    def get_current(self):
        return self.stack[-2]
    # Create a read only property called current so that self.current
    # give the current element handler.
    current = property(get_current)

    def get_next(self):
        return self.stack[-1]
    # Create a read only property that gives the element handler to be
    # used for the next element.
    next = property(get_next)

    def get_parent(self):
        return self.stack[-3]
    # Create a read only property that gives the current parent
    # element handler
    parent = property(get_parent)

    def absolutize(self, uri):
        """Resolve ``uri`` against the current element's base and return
        it as a ``URIRef``."""
        result = urljoin(self.current.base, uri, allow_fragments=1)
        # Re-append a trailing "#" when the joined result does not end
        # with one although the input did.
        if uri and uri[-1] == "#" and result[-1] != "#":
            result = "%s#" % result
        return URIRef(result)

    def convert(self, name, qname, attrs):
        """Turn SAX ``(namespace, localname)`` names into URIRefs.

        Returns ``(name, atts)`` where ``atts`` maps attribute URIRefs to
        values.  Attributes whose name starts with the XML namespace or
        with "xml" (any case) are skipped; unqualified names listed in
        ``UNQUALIFIED`` are mapped into the RDF namespace.
        """
        if name[0] is None:
            name = URIRef(name[1])
        else:
            name = URIRef("".join(name))
        atts = {}
        for (n, v) in attrs.items():
            if n[0] is None:
                att = n[1]
            else:
                att = "".join(n)
            if att.startswith(XMLNS) or att[0:3].lower() == "xml":
                pass
            elif att in UNQUALIFIED:
                # if not RDFNS[att] in atts:
                atts[RDFNS[att]] = v
            else:
                atts[URIRef(att)] = v
        return name, atts

    def _bnode_for_node_id(self, nodeID):
        """Validate an ``rdf:nodeID`` value and return its blank node.

        Unless ``preserve_bnode_ids`` is set, each distinct nodeID gets
        one fresh ``BNode`` that is remembered in ``self.bnode`` and
        reused on later references.
        """
        if not is_ncname(nodeID):
            self.error(
                "rdf:nodeID value is not a valid NCName: %s" % nodeID)
        if self.preserve_bnode_ids is not False:
            return BNode(nodeID)
        node = self.bnode.get(nodeID)
        if node is None:
            node = self.bnode[nodeID] = BNode()
        return node

    def document_element_start(self, name, qname, attrs):
        if name[0] and URIRef("".join(name)) == RDF.RDF:
            next_handler = self.next
            next_handler.start = self.node_element_start
            next_handler.end = self.node_element_end
        else:
            self.node_element_start(name, qname, attrs)
            # self.current.end = self.node_element_end
            # TODO... set end to something that sets start such that
            # another element will cause error

    def node_element_start(self, name, qname, attrs):
        """Handle the start tag of a node element.

        Determines the subject from rdf:ID / rdf:nodeID / rdf:about (or
        a fresh blank node), adds the rdf:type triple for typed node
        elements and one triple per property attribute, and stores the
        subject on the current handler.
        """
        name, atts = self.convert(name, qname, attrs)
        current = self.current
        absolutize = self.absolutize

        next_handler = self.next
        next_handler.start = self.property_element_start
        next_handler.end = self.property_element_end

        if name in NODE_ELEMENT_EXCEPTIONS:
            self.error("Invalid node element URI: %s" % name)

        if sum(1 for att in NODE_ELEMENT_ATTRIBUTES if att in atts) > 1:
            self.error(
                "Can have at most one of rdf:ID, rdf:about, and rdf:nodeID"
            )

        if RDF.ID in atts:
            rdf_id = atts[RDF.ID]
            if not is_ncname(rdf_id):
                self.error("rdf:ID value is not a valid NCName: %s" % rdf_id)
            subject = absolutize("#%s" % rdf_id)
            if subject in self.ids:
                self.error(
                    "two elements cannot use the same ID: '%s'" % subject)
            self.ids[subject] = 1  # IDs can only appear once within a document
        elif RDF.nodeID in atts:
            subject = self._bnode_for_node_id(atts[RDF.nodeID])
        elif RDF.about in atts:
            subject = absolutize(atts[RDF.about])
        else:
            subject = BNode()

        if name != RDF.Description:  # S1
            self.store.add((subject, RDF.type, absolutize(name)))

        language = current.language
        rdf_ns = str(RDFNS)
        for att in atts:
            if not att.startswith(rdf_ns):
                predicate = absolutize(att)
                try:
                    obj = Literal(atts[att], language)
                except Error as e:
                    self.error(e.msg)
            elif att == RDF.type:  # S2
                predicate = RDF.type
                obj = absolutize(atts[RDF.type])
            elif att in NODE_ELEMENT_ATTRIBUTES:
                continue
            elif att in PROPERTY_ATTRIBUTE_EXCEPTIONS:  # S3
                self.error("Invalid property attribute URI: %s" % att)
                continue  # for when error does not throw an exception
            else:
                predicate = absolutize(att)
                try:
                    obj = Literal(atts[att], language)
                except Error as e:
                    self.error(e.msg)
            self.store.add((subject, predicate, obj))

        current.subject = subject

    def node_element_end(self, name, qname):
        # repeat node-elements are only allowed
        # at at top-level
        if self.parent.object and self.current != self.stack[2]:
            # name[0] is None for an element without a namespace; skip
            # it so the intended ParserError is raised, not a TypeError.
            self.error("Repeat node-elements inside property elements: %s"
                       % "".join(part for part in name if part))

        self.parent.object = self.current.subject

    def property_element_start(self, name, qname, attrs):
        """Handle the start tag of a property element.

        Sets the predicate and optional reification id on the current
        handler, selects the callbacks for child elements according to
        rdf:resource / rdf:nodeID / rdf:parseType, and adds triples for
        any property attributes.
        """
        name, atts = self.convert(name, qname, attrs)
        current = self.current
        absolutize = self.absolutize
        rdf_ns = str(RDFNS)

        next_handler = self.next
        obj = None
        current.data = None
        current.list = None

        if not name.startswith(rdf_ns):
            current.predicate = absolutize(name)
        elif name == RDF.li:
            current.predicate = current.next_li()
        elif name in PROPERTY_ELEMENT_EXCEPTIONS:
            self.error("Invalid property element URI: %s" % name)
        else:
            current.predicate = absolutize(name)

        rdf_id = atts.get(RDF.ID, None)
        if rdf_id is not None:
            if not is_ncname(rdf_id):
                self.error("rdf:ID value is not a valid NCName: %s" % rdf_id)
            current.id = absolutize("#%s" % rdf_id)
        else:
            current.id = None

        resource = atts.get(RDF.resource, None)
        nodeID = atts.get(RDF.nodeID, None)
        parse_type = atts.get(RDF.parseType, None)
        if resource is not None and nodeID is not None:
            self.error(
                "Property element cannot have both rdf:nodeID and rdf:resource"
            )
        if resource is not None:
            obj = absolutize(resource)
            next_handler.start = self.node_element_start
            next_handler.end = self.node_element_end
        elif nodeID is not None:
            obj = self._bnode_for_node_id(nodeID)
            next_handler.start = self.node_element_start
            next_handler.end = self.node_element_end
        elif parse_type is not None:
            for att in atts:
                if att != RDF.parseType and att != RDF.ID:
                    self.error("Property attr '%s' not allowed here" % att)
            if parse_type == "Resource":
                current.subject = obj = BNode()
                current.char = self.property_element_char
                next_handler.start = self.property_element_start
                next_handler.end = self.property_element_end
            elif parse_type == "Collection":
                current.char = None
                obj = current.list = RDF.nil  # BNode()
                # self.parent.subject
                next_handler.start = self.node_element_start
                next_handler.end = self.list_node_element_end
            else:  # if parse_type=="Literal":
                # All other values are treated as Literal
                # See: http://www.w3.org/TR/rdf-syntax-grammar/
                # parseTypeOtherPropertyElt
                obj = Literal("", datatype=RDF.XMLLiteral)
                current.char = self.literal_element_char
                current.declared = {XMLNS: 'xml'}
                next_handler.start = self.literal_element_start
                next_handler.char = self.literal_element_char
                next_handler.end = self.literal_element_end
            current.object = obj
            return
        else:
            obj = None
            current.char = self.property_element_char
            next_handler.start = self.node_element_start
            next_handler.end = self.node_element_end

        datatype = current.datatype = atts.get(RDF.datatype, None)
        language = current.language
        if datatype is not None:
            # TODO: check that there are no atts other than datatype and id
            # NOTE(review): the absolutized value is never stored;
            # property_element_end reads the raw current.datatype.
            # Confirm whether that is intended.
            datatype = absolutize(datatype)
        else:
            for att in atts:
                if not att.startswith(rdf_ns):
                    predicate = absolutize(att)
                elif att in PROPERTY_ELEMENT_ATTRIBUTES:
                    continue
                elif att in PROPERTY_ATTRIBUTE_EXCEPTIONS:
                    self.error("""Invalid property attribute URI: %s""" % att)
                else:
                    predicate = absolutize(att)

                if att == RDF.type:
                    o = URIRef(atts[att])
                else:
                    # No rdf:datatype on this branch: plain literal that
                    # carries the inherited language.
                    o = Literal(atts[att], language)

                if obj is None:
                    obj = BNode()
                self.store.add((obj, predicate, o))

        if obj is None:
            # No object yet: start collecting character data so that
            # property_element_end can build a literal from it.
            current.data = ""
            current.object = None
        else:
            current.data = None
            current.object = obj

    def property_element_char(self, data):
        current = self.current
        if current.data is not None:
            current.data += data

    def property_element_end(self, name, qname):
        """Finish a property element: build the literal from collected
        text if needed, terminate an open collection, then add the
        triple and, when an rdf:ID was given, its reification."""
        current = self.current
        if current.data is not None and current.object is None:
            literal_lang = current.language
            if current.datatype is not None:
                literal_lang = None
            current.object = Literal(
                current.data, literal_lang, current.datatype)
            current.data = None
        if self.next.end == self.list_node_element_end:
            if current.object != RDF.nil:
                self.store.add((current.list, RDF.rest, RDF.nil))
        if current.object is not None:
            self.store.add(
                (self.parent.subject, current.predicate, current.object))
            if current.id is not None:
                self.add_reified(current.id, (self.parent.subject,
                                              current.predicate,
                                              current.object))
        current.subject = None

    def list_node_element_end(self, name, qname):
        """Append the finished node element to the parent's collection."""
        current = self.current
        cell = BNode()
        if self.parent.list == RDF.nil:
            # First member: the new cell becomes the property's object.
            # Removed between 20030123 and 20030905
            # self.store.add((list, RDF.type, LIST))
            self.parent.list = cell
            self.store.add((self.parent.list, RDF.first, current.subject))
            self.parent.object = cell
            self.parent.char = None
        else:
            # Later member: link the new cell behind the previous one.
            # Removed between 20030123 and 20030905
            # self.store.add((list, RDF.type, LIST))
            self.store.add((self.parent.list, RDF.rest, cell))
            self.store.add((cell, RDF.first, current.subject))
            self.parent.list = cell

    def literal_element_start(self, name, qname, attrs):
        """Begin serialising an element nested inside an XML literal.

        Builds the start tag as a string in ``current.object``, adding
        an ``xmlns`` declaration for a namespace not yet recorded in
        ``current.declared``.
        """
        current = self.current
        self.next.start = self.literal_element_start
        self.next.char = self.literal_element_char
        self.next.end = self.literal_element_end
        current.declared = self.parent.declared.copy()
        if name[0]:
            prefix = self._current_context[name[0]]
            if prefix:
                current.object = "<%s:%s" % (prefix, name[1])
            else:
                current.object = "<%s" % name[1]
            if name[0] not in current.declared:
                current.declared[name[0]] = prefix
                if prefix:
                    current.object += (' xmlns:%s="%s"' % (prefix, name[0]))
                else:
                    current.object += (' xmlns="%s"' % name[0])
        else:
            current.object = "<%s" % name[1]

        for (attr_name, value) in attrs.items():
            if attr_name[0]:
                if attr_name[0] not in current.declared:
                    current.declared[attr_name[0]] = \
                        self._current_context[attr_name[0]]
                attr_name = current.declared[attr_name[0]] + ":" + attr_name[1]
            else:
                attr_name = attr_name[1]
            current.object += (' %s=%s' % (attr_name, quoteattr(value)))
        current.object += ">"

    def literal_element_char(self, data):
        self.current.object += escape(data)

    def literal_element_end(self, name, qname):
        """Append the serialised element and its end tag to the parent."""
        if name[0]:
            prefix = self._current_context[name[0]]
            if prefix:
                end = "</%s:%s>" % (prefix, name[1])
            else:
                end = "</%s>" % name[1]
        else:
            end = "</%s>" % name[1]
        self.parent.object += self.current.object + end


def create_parser(target, store):
    """Return a namespace-aware SAX parser whose content handler is an
    ``RDFXMLHandler`` writing to ``store``.

    ``target`` is installed as the handler's initial document locator.
    """
    parser = make_parser()
    try:
        # Workaround for bug in expatreader.py. Needed when
        # expatreader is trying to guess a prefix.
        parser.start_namespace_decl(
            "xml", "http://www.w3.org/XML/1998/namespace")
    except AttributeError:
        pass  # Not present in Jython (at least)
    parser.setFeature(handler.feature_namespaces, 1)
    rdfxml = RDFXMLHandler(store)
    rdfxml.setDocumentLocator(target)
    # rdfxml.setDocumentLocator(_Locator(self.url, self.parser))
    parser.setContentHandler(rdfxml)
    parser.setErrorHandler(ErrorHandler())
    return parser


class RDFXMLParser(Parser):
    """RDFLib parser plugin for RDF/XML."""

    def __init__(self):
        pass

    def parse(self, source, sink, **args):
        """Parse ``source`` and add the resulting triples to ``sink``.

        The optional keyword ``preserve_bnode_ids`` is copied onto the
        content handler when it is given and not None.
        """
        self._parser = create_parser(source, sink)
        content_handler = self._parser.getContentHandler()
        preserve_bnode_ids = args.get("preserve_bnode_ids", None)
        if preserve_bnode_ids is not None:
            content_handler.preserve_bnode_ids = preserve_bnode_ids
        # # We're only using it once now
        # content_handler.reset()
        # self._parser.reset()
        self._parser.parse(source)