view env/lib/python3.7/site-packages/rdflib_jsonld/serializer.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
line wrap: on
line source

# -*- coding: utf-8 -*-
"""
This serialiser will output an RDF Graph as a JSON-LD formatted document. See:

    http://json-ld.org/

Example usage::

    >>> from rdflib.plugin import register, Serializer
    >>> register('json-ld', Serializer, 'rdflib_jsonld.serializer', 'JsonLDSerializer')

    >>> from rdflib import Graph

    >>> testrdf = '''
    ... @prefix dc: <http://purl.org/dc/terms/> .
    ... <http://example.org/about>
    ...     dc:title "Someone's Homepage"@en .
    ... '''

    >>> g = Graph().parse(data=testrdf, format='n3')

    >>> print((g.serialize(format='json-ld', indent=4).decode()))
    [
        {
            "@id": "http://example.org/about",
            "http://purl.org/dc/terms/title": [
                {
                    "@language": "en",
                    "@value": "Someone's Homepage"
                }
            ]
        }
    ]

"""

# NOTE: This code writes the entire JSON object into memory before serialising,
# but we should consider streaming the output to deal with arbitrarily large
# graphs.

import warnings

from rdflib.serializer import Serializer
from rdflib.graph import Graph
from rdflib.term import URIRef, Literal, BNode
from rdflib.namespace import RDF, XSD

from .context import Context, UNDEF
from .util import json
from .keys import CONTEXT, GRAPH, ID, VOCAB, LIST, SET, LANG

# Names exported by ``from rdflib_jsonld.serializer import *``.
__all__ = ['JsonLDSerializer', 'from_rdf']


# Literal datatypes that Converter.to_raw_value converts with
# ``Literal.toPython()`` (instead of ``str``) when native types are enabled.
PLAIN_LITERAL_TYPES = {
    XSD.boolean,
    XSD.integer,
    XSD.double,
    XSD.string,
}


class JsonLDSerializer(Serializer):
    """rdflib ``Serializer`` that writes ``self.store`` out as JSON-LD text."""

    def __init__(self, store):
        super(JsonLDSerializer, self).__init__(store)

    def serialize(self, stream, base=None, encoding=None, **kwargs):
        """Convert the store with :func:`from_rdf` and write it to *stream*.

        :param stream: object with a ``write`` method accepting ``bytes``;
            it receives the encoded JSON document in a single call.
        :param base: base IRI handed on to ``from_rdf``.
        :param encoding: output encoding; ``'utf-8'`` when falsy. Any value
            other than ``'utf-8'`` or ``'utf-16'`` triggers a warning but is
            still used. Characters the encoding cannot represent are
            replaced rather than raising.
        :param kwargs: recognised keys:

            * ``context``, ``use_native_types`` (default ``False``),
              ``use_rdf_type`` (default ``False``) and ``auto_compact``
              (default ``False``) are passed to ``from_rdf``;
            * ``indent`` (default ``2``), ``separators`` (default
              ``(',', ': ')``), ``sort_keys`` (default ``True``) and
              ``ensure_ascii`` (default ``False``) are passed to
              ``json.dumps``.
        :return: ``None``; the result is written to *stream*.
        """
        encoding = encoding or 'utf-8'
        if encoding not in ('utf-8', 'utf-16'):
            warnings.warn("JSON should be encoded as unicode. " +
                          "Given encoding was: %s" % encoding)

        context_data = kwargs.get('context')
        # No trailing comma here: a one-element tuple such as ``(False,)`` is
        # always truthy and would switch native types on unconditionally.
        use_native_types = kwargs.get('use_native_types', False)
        use_rdf_type = kwargs.get('use_rdf_type', False)
        auto_compact = kwargs.get('auto_compact', False)

        indent = kwargs.get('indent', 2)
        separators = kwargs.get('separators', (',', ': '))
        sort_keys = kwargs.get('sort_keys', True)
        ensure_ascii = kwargs.get('ensure_ascii', False)

        obj = from_rdf(self.store, context_data, base,
                use_native_types, use_rdf_type,
                auto_compact=auto_compact)

        data = json.dumps(obj, indent=indent, separators=separators,
                          sort_keys=sort_keys, ensure_ascii=ensure_ascii)

        stream.write(data.encode(encoding, 'replace'))


def from_rdf(graph, context_data=None, base=None,
        use_native_types=False, use_rdf_type=False,
        auto_compact=False, startnode=None, index=False):
    """Convert *graph* into a JSON-LD structure of plain dicts and lists.

    :param graph: the graph to convert; handed to ``Converter.convert``.
    :param context_data: either a ``Context`` instance or the data to build
        one from.
    :param base: base IRI used when a ``Context`` has to be constructed.
    :param use_native_types: passed on to ``Converter``.
    :param use_rdf_type: passed on to ``Converter``.
    :param auto_compact: when true and no *context_data* was given, a context
        is derived from the graph's prefixed namespace bindings (the XML
        namespace is skipped).
    :param startnode: accepted but not used by this function yet.
    :param index: accepted but not used by this function yet.
    :return: whatever ``Converter.convert`` produced; when the context is
        active the result is a dict that also carries the context data
        under the ``CONTEXT`` key (a list result is first wrapped under the
        context's graph key).
    """
    # TODO: support for index and startnode

    if not context_data and auto_compact:
        context_data = {}
        for prefix, namespace in graph.namespaces():
            if prefix and str(namespace) != "http://www.w3.org/XML/1998/namespace":
                context_data[prefix] = str(namespace)

    if not isinstance(context_data, Context):
        context = Context(context_data, base=base)
    else:
        # A ready-made Context is used as is; its dict form is what gets
        # embedded in the output below.
        context = context_data
        context_data = context.to_dict()

    converter = Converter(context, use_native_types, use_rdf_type)
    document = converter.convert(graph)

    if not converter.context.active:
        return document

    if isinstance(document, list):
        document = {context.get_key(GRAPH): document}
    document[CONTEXT] = context_data
    return document


class Converter(object):

    def __init__(self, context, use_native_types, use_rdf_type):
        self.context = context
        self.use_native_types = context.active or use_native_types
        self.use_rdf_type = use_rdf_type

    def convert(self, graph):
        """Convert *graph* (plain or context-aware) to JSON-LD objects.

        For a context-aware graph every context whose identifier is a
        ``URIRef`` is converted as a named graph, while all other contexts
        are folded into a single default graph.

        :return: a list of objects, or a single dict when the context is
            active and exactly one object was produced.
        """
        # TODO: bug in rdflib dataset parsing (nquads et al):
        # plain triples end up in separate unnamed graphs (rdflib issue #436)
        if not graph.context_aware:
            graphs = [graph]
        else:
            default_graph = Graph()
            graphs = [default_graph]
            for member in graph.contexts():
                if isinstance(member.identifier, URIRef):
                    graphs.append(member)
                else:
                    default_graph += member

        context = self.context

        objs = []
        for current in graphs:
            entry = {}
            name = None

            if isinstance(current.identifier, URIRef):
                name = context.shrink_iri(current.identifier)
                entry[context.id_key] = name

            nodes = self.from_graph(current)

            # Graphs without any nodes contribute nothing to the output.
            if not nodes:
                continue

            if name or len(nodes) != 1:
                entry[context.graph_key] = nodes
            else:
                # A lone node of an unnamed graph is inlined directly.
                entry.update(nodes[0])

            # Merge into the first object when it carries the same id
            # (both being without id counts as the same).
            if objs and objs[0].get(context.get_key(ID)) == name:
                objs[0].update(entry)
            else:
                objs.append(entry)

        if len(objs) == 1:
            if len(graphs) == 1 and not self.context.active:
                only = objs[0]
                members = only.get(context.graph_key)
                # Unwrap an object that holds nothing but a graph listing.
                if len(only) == 1 and members:
                    objs = members
            elif self.context.active:
                objs = objs[0]

        return objs

    def from_graph(self, graph):
        """Return the list of node objects built from *graph*.

        Only IRI subjects and blank-node subjects that never occur as an
        object are used as starting points; other subjects are added to the
        node map while the objects referring to them are processed.
        """
        nodemap = {}

        for subject in set(graph.subjects()):
            if isinstance(subject, URIRef):
                is_root = True
            elif isinstance(subject, BNode):
                # a blank node qualifies only if nothing points at it
                is_root = not any(graph.subjects(None, subject))
            else:
                is_root = False

            if is_root:
                self.process_subject(graph, subject, nodemap)

        return list(nodemap.values())

    def process_subject(self, graph, s, nodemap):
        """Build the node object for subject *s* and register it in *nodemap*.

        The node id is the shrunk IRI for a ``URIRef``, the ``n3()`` form for
        a ``BNode`` and ``None`` for anything else.

        :return: the new node dict, or ``None`` when a node with the same id
            is already present in *nodemap* (nothing is changed then).
        """
        node_id = None
        if isinstance(s, URIRef):
            node_id = self.context.shrink_iri(s)
        elif isinstance(s, BNode):
            node_id = s.n3()

        if node_id in nodemap:
            return None

        # Register the node before filling it, so that the lookup above
        # already finds it when this subject is reached again while its own
        # objects are being processed.
        node = {self.context.id_key: node_id}
        nodemap[node_id] = node

        for predicate, obj in graph.predicate_objects(s):
            self.add_to_node(graph, s, predicate, obj, node, nodemap)

        return node

    def add_to_node(self, graph, s, p, o, s_node, nodemap):
        """Add the value of the triple (*s*, *p*, *o*) to the node dict *s_node*.

        The key is taken from a matching context term when one is found,
        otherwise from ``context.to_symbol(p)``; the value is a coerced
        compact form when the term allows it and the result of
        :meth:`to_raw_value` otherwise.

        :param graph: graph the triple comes from; consulted for collections.
        :param s: subject of the triple.
        :param p: predicate of the triple.
        :param o: object of the triple.
        :param s_node: dict for *s*; modified in place.
        :param nodemap: node map passed through to :meth:`to_raw_value`.
        :return: ``None``.
        """
        context = self.context

        # Look up a context term for the predicate. Literals are matched on
        # datatype and language; other objects try the ID, VOCAB and UNDEF
        # coercions in that order, with a LIST container first when the
        # object has an rdf:first value.
        if isinstance(o, Literal):
            datatype = str(o.datatype) if o.datatype else None
            language = o.language
            term = context.find_term(str(p), datatype, language=language)
        else:
            containers = [LIST, None] if graph.value(o, RDF.first) else [None]
            for container in containers:
                for coercion in (ID, VOCAB, UNDEF):
                    term = context.find_term(str(p), coercion, container)
                    if term:
                        break
                if term:
                    break

        node = None
        # Without an active context every value is wrapped in a list.
        use_set = not context.active

        if term:
            p_key = term.name

            # Try to produce a compact value; ``node`` stays None when the
            # term does not apply, and the raw form is used further down.
            if term.type:
                node = self.type_coerce(o, term.type)
            elif term.language and o.language == term.language:
                node = str(o)
            elif context.language and (
                    term.language is None and o.language is None):
                node = str(o)

            if term.container == SET:
                use_set = True
            elif term.container == LIST:
                # Members that cannot be coerced (or coerce to a falsy
                # value) fall back to their raw form.
                node = [self.type_coerce(v, term.type) or self.to_raw_value(graph, s, v, nodemap)
                        for v in self.to_collection(graph, o)]
            # NOTE(review): ``language`` is only bound in the Literal branch
            # above; this test assumes a LANG-container term is never found
            # for a non-literal object -- confirm against Context.find_term.
            elif term.container == LANG and language:
                # Language map: {language: value-or-list-of-values}; it is
                # stored on s_node directly, so return without the generic
                # merge below.
                value = s_node.setdefault(p_key, {})
                values = value.get(language)
                node = str(o)
                if values:
                    if not isinstance(values, list):
                        value[language] = values = [values]
                    values.append(node)
                else:
                    value[language] = node
                return

        else:
            p_key = context.to_symbol(p)
            # TODO: for coercing curies - quite clumsy; unify to_symbol and find_term?
            # If the symbol names a term that has a type or container, the
            # predicate itself is used as the key instead (presumably so the
            # value is not read back with that term's coercion -- confirm).
            key_term = context.terms.get(p_key)
            if key_term and (key_term.type or key_term.container):
                p_key = p
            # rdf:type goes under the context's type key unless use_rdf_type
            # is set; IRI objects are written as symbols.
            if not term and p == RDF.type and not self.use_rdf_type:
                if isinstance(o, URIRef):
                    node = context.to_symbol(o)
                p_key = context.type_key

        if node is None:
            node = self.to_raw_value(graph, s, o, nodemap)

        # Merge with what is already stored under the key: an existing
        # (truthy) value becomes or stays a list and the new one is appended.
        value = s_node.get(p_key)
        if value:
            if not isinstance(value, list):
                value = [value]
            value.append(node)
        elif use_set:
            value = [node]
        else:
            value = node
        s_node[p_key] = value

    def type_coerce(self, o, coerce_type):
        """Return the compact form of *o* under *coerce_type*, or ``None``.

        * ``ID``: a ``URIRef`` is shrunk through the context, a ``BNode`` is
          given in its ``n3()`` form, anything else is returned unchanged.
        * ``VOCAB``: a ``URIRef`` is turned into a symbol.
        * otherwise: a ``Literal`` whose datatype string equals
          *coerce_type* is returned unchanged.

        ``None`` signals that no coercion applies.
        """
        if coerce_type == ID:
            if isinstance(o, URIRef):
                return self.context.shrink_iri(o)
            if isinstance(o, BNode):
                return o.n3()
            return o

        if coerce_type == VOCAB and isinstance(o, URIRef):
            return self.context.to_symbol(o)

        if isinstance(o, Literal) and str(o.datatype) == coerce_type:
            return o

        return None

    def to_raw_value(self, graph, s, o, nodemap):
        """Return the uncoerced JSON-LD value for object *o* of subject *s*.

        * A well-formed RDF collection becomes ``{list_key: [...]}`` with
          every member converted recursively.
        * A ``BNode`` is registered in *nodemap* via :meth:`process_subject`
          and referenced as ``{id_key: o.n3()}``.
        * A ``URIRef`` becomes ``{id_key: <shrunk IRI>}``.
        * A ``Literal`` becomes a value object, or a bare value where the
          active context makes the wrapper unnecessary.

        :param graph: graph that *o* belongs to.
        :param s: subject currently being processed.
        :param o: object to convert.
        :param nodemap: node map shared with :meth:`process_subject`.
        :return: dict or scalar as described above; ``None`` when *o* is
            none of the handled kinds.
        """
        context = self.context
        coll = self.to_collection(graph, o)
        if coll is not None:
            # Reuse the members gathered above rather than walking the
            # rdf:first/rdf:rest chain a second time.
            return {context.list_key: [
                self.to_raw_value(graph, s, lo, nodemap) for lo in coll]}
        elif isinstance(o, BNode):
            embed = False # TODO: self.context.active or using startnode and only one ref
            onode = self.process_subject(graph, o, nodemap)
            if onode:
                if embed and not any(s2 for s2 in graph.subjects(None, o) if s2 != s):
                    return onode
                else:
                    nodemap[onode[context.id_key]] = onode
            return {context.id_key: o.n3()}
        elif isinstance(o, URIRef):
            # TODO: embed if o != startnode (else reverse)
            return {context.id_key: context.shrink_iri(o)}
        elif isinstance(o, Literal):
            # TODO: if compact
            # Only the datatypes in PLAIN_LITERAL_TYPES are converted to
            # Python values; everything else is kept as its string form.
            native = self.use_native_types and o.datatype in PLAIN_LITERAL_TYPES
            if native:
                v = o.toPython()
            else:
                v = str(o)
            if o.datatype:
                if native:
                    # A native value needs no type annotation; with an
                    # active context it can even drop the value wrapper.
                    if self.context.active:
                        return v
                    else:
                        return {context.value_key: v}
                return {context.type_key: context.to_symbol(o.datatype),
                        context.value_key: v}
            elif o.language and o.language != context.language:
                return {context.lang_key: o.language,
                        context.value_key: v}
            elif not context.active or context.language and not o.language:
                # Keep the wrapper when there is no active context, or when
                # the context has a default language this literal lacks.
                return {context.value_key: v}
            else:
                return v
        return None

    def to_collection(self, graph, l):
        """Return the members of the RDF collection headed by *l*, or ``None``.

        The chain of ``rdf:first`` / ``rdf:rest`` links is followed from *l*
        until ``rdf:nil`` is reached.

        :param graph: graph holding the list triples.
        :param l: candidate head node of the collection.
        :return: list of members in order (empty for ``rdf:nil``), or
            ``None`` when *l* is not a well-formed collection: it has no
            ``rdf:first``, a list node is a ``URIRef``, a list node carries
            properties other than ``rdf:first``, ``rdf:rest`` and
            ``rdf:type rdf:List`` (or repeats one of those links), the
            chain is cyclic, or it ends without reaching ``rdf:nil``.
        """
        # Test for presence with ``is None`` instead of truthiness: a
        # present but falsy member (e.g. an empty-string literal) must not
        # make the node look as if it had no rdf:first at all.
        if l != RDF.nil and graph.value(l, RDF.first) is None:
            return None
        list_nodes = []
        chain = {l}
        while l:
            if l == RDF.nil:
                return list_nodes
            if isinstance(l, URIRef):
                return None
            first, rest = None, None
            for p, o in graph.predicate_objects(l):
                if first is None and p == RDF.first:
                    first = o
                elif rest is None and p == RDF.rest:
                    rest = o
                elif p != RDF.type or o != RDF.List:
                    # any other property (or a repeated link) disqualifies
                    # the node as a pure list node
                    return None
            list_nodes.append(first)
            l = rest
            # a node seen before means the chain loops back on itself
            if l in chain:
                return None
            chain.add(l)
        # the chain ended without reaching rdf:nil
        return None