Mercurial > repos > guerler > springsuite
diff planemo/lib/python3.7/site-packages/rdflib/plugins/sparql/algebra.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/planemo/lib/python3.7/site-packages/rdflib/plugins/sparql/algebra.py Fri Jul 31 00:32:28 2020 -0400 @@ -0,0 +1,821 @@ + +""" +Converting the 'parse-tree' output of pyparsing to a SPARQL Algebra expression + +http://www.w3.org/TR/sparql11-query/#sparqlQuery + +""" + +import functools +import operator +import collections + +from rdflib import Literal, Variable, URIRef, BNode + +from rdflib.plugins.sparql.sparql import Prologue, Query +from rdflib.plugins.sparql.parserutils import CompValue, Expr +from rdflib.plugins.sparql.operators import ( + and_, TrueFilter, simplify as simplifyFilters) +from rdflib.paths import ( + InvPath, AlternativePath, SequencePath, MulPath, NegatedPath) + +from pyparsing import ParseResults +from functools import reduce + + +# --------------------------- +# Some convenience methods +def OrderBy(p, expr): + return CompValue('OrderBy', p=p, expr=expr) + + +def ToMultiSet(p): + return CompValue('ToMultiSet', p=p) + + +def Union(p1, p2): + return CompValue('Union', p1=p1, p2=p2) + + +def Join(p1, p2): + return CompValue('Join', p1=p1, p2=p2) + + +def Minus(p1, p2): + return CompValue('Minus', p1=p1, p2=p2) + + +def Graph(term, graph): + return CompValue('Graph', term=term, p=graph) + + +def BGP(triples=None): + return CompValue('BGP', triples=triples or []) + + +def LeftJoin(p1, p2, expr): + return CompValue('LeftJoin', p1=p1, p2=p2, expr=expr) + + +def Filter(expr, p): + return CompValue('Filter', expr=expr, p=p) + + +def Extend(p, expr, var): + return CompValue('Extend', p=p, expr=expr, var=var) + +def Values(res): + return CompValue('values', res=res) + +def Project(p, PV): + return CompValue('Project', p=p, PV=PV) + + +def Group(p, expr=None): + return CompValue('Group', p=p, expr=expr) + + +def _knownTerms(triple, varsknown, varscount): + return (len([_f for _f in (x not in varsknown and + isinstance( + x, (Variable, BNode)) for x in triple) if _f]), + -sum(varscount.get(x, 0) for x in triple), + not isinstance(triple[2], Literal), + ) + + +def reorderTriples(l): + """ + Reorder triple patterns so that we execute the + ones with most bindings first + """ + + def _addvar(term, varsknown): + if isinstance(term, (Variable, BNode)): + varsknown.add(term) + + l = [(None, x) for x in l] + varsknown = set() + varscount = collections.defaultdict(int) + for t in l: + for c in t[1]: + if isinstance(c, (Variable, BNode)): + varscount[c] += 1 + i = 0 + + # Done in steps, sort by number of bound terms + # the top block of patterns with the most bound terms is kept + # the rest is resorted based on the vars bound after the first + # block is evaluated + + # we sort by decorate/undecorate, since we need the value of the sort keys + + while i < len(l): + l[i:] = sorted((_knownTerms(x[ + 1], varsknown, varscount), x[1]) for x in l[i:]) + t = l[i][0][0] # top block has this many terms bound + j = 0 + while i+j < len(l) and l[i+j][0][0] == t: + for c in l[i+j][1]: + _addvar(c, varsknown) + j += 1 + i += 1 + + return [x[1] for x in l] + + +def triples(l): + + l = reduce(lambda x, y: x + y, l) + if (len(l) % 3) != 0: + raise Exception('these aint triples') + return reorderTriples((l[x], l[x + 1], l[x + 2]) + for x in range(0, len(l), 3)) + + +def translatePName(p, prologue): + """ + Expand prefixed/relative URIs + """ + if isinstance(p, CompValue): + if p.name == 'pname': + return prologue.absolutize(p) + if p.name == 'literal': + return Literal(p.string, lang=p.lang, + datatype=prologue.absolutize(p.datatype)) + elif isinstance(p, URIRef): + return prologue.absolutize(p) + + +def translatePath(p): + + """ + Translate PropertyPath expressions + """ + + if isinstance(p, CompValue): + if p.name == 'PathAlternative': + if len(p.part) == 1: + return p.part[0] + else: + return AlternativePath(*p.part) + + elif p.name == 'PathSequence': + if len(p.part) == 1: + return p.part[0] + else: + return SequencePath(*p.part) + + elif p.name == 'PathElt': + if not p.mod: + return p.part + else: + if isinstance(p.part, list): + if len(p.part) != 1: + raise Exception('Denkfehler!') + + return MulPath(p.part[0], p.mod) + else: + return MulPath(p.part, p.mod) + + elif p.name == 'PathEltOrInverse': + if isinstance(p.part, list): + if len(p.part) != 1: + raise Exception('Denkfehler!') + return InvPath(p.part[0]) + else: + return InvPath(p.part) + + elif p.name == 'PathNegatedPropertySet': + if isinstance(p.part, list): + return NegatedPath(AlternativePath(*p.part)) + else: + return NegatedPath(p.part) + + +def translateExists(e): + + """ + Translate the graph pattern used by EXISTS and NOT EXISTS + http://www.w3.org/TR/sparql11-query/#sparqlCollectFilters + """ + + def _c(n): + if isinstance(n, CompValue): + if n.name in ('Builtin_EXISTS', 'Builtin_NOTEXISTS'): + n.graph = translateGroupGraphPattern(n.graph) + if n.graph.name == 'Filter': + # filters inside (NOT) EXISTS can see vars bound outside + n.graph.no_isolated_scope = True + + e = traverse(e, visitPost=_c) + + return e + + +def collectAndRemoveFilters(parts): + + """ + + FILTER expressions apply to the whole group graph pattern in which + they appear. + + http://www.w3.org/TR/sparql11-query/#sparqlCollectFilters + """ + + filters = [] + + i = 0 + while i < len(parts): + p = parts[i] + if p.name == 'Filter': + filters.append(translateExists(p.expr)) + parts.pop(i) + else: + i += 1 + + if filters: + return and_(*filters) + + return None + + +def translateGroupOrUnionGraphPattern(graphPattern): + A = None + + for g in graphPattern.graph: + g = translateGroupGraphPattern(g) + if not A: + A = g + else: + A = Union(A, g) + return A + + +def translateGraphGraphPattern(graphPattern): + return Graph(graphPattern.term, + translateGroupGraphPattern(graphPattern.graph)) + + +def translateInlineData(graphPattern): + return ToMultiSet(translateValues(graphPattern)) + + +def translateGroupGraphPattern(graphPattern): + """ + http://www.w3.org/TR/sparql11-query/#convertGraphPattern + """ + + if graphPattern.name == 'SubSelect': + return ToMultiSet(translate(graphPattern)[0]) + + if not graphPattern.part: + graphPattern.part = [] # empty { } + + filters = collectAndRemoveFilters(graphPattern.part) + + g = [] + for p in graphPattern.part: + if p.name == 'TriplesBlock': + # merge adjacent TripleBlocks + if not (g and g[-1].name == 'BGP'): + g.append(BGP()) + g[-1]["triples"] += triples(p.triples) + else: + g.append(p) + + G = BGP() + for p in g: + if p.name == 'OptionalGraphPattern': + A = translateGroupGraphPattern(p.graph) + if A.name == 'Filter': + G = LeftJoin(G, A.p, A.expr) + else: + G = LeftJoin(G, A, TrueFilter) + elif p.name == 'MinusGraphPattern': + G = Minus(p1=G, p2=translateGroupGraphPattern(p.graph)) + elif p.name == 'GroupOrUnionGraphPattern': + G = Join(p1=G, p2=translateGroupOrUnionGraphPattern(p)) + elif p.name == 'GraphGraphPattern': + G = Join(p1=G, p2=translateGraphGraphPattern(p)) + elif p.name == 'InlineData': + G = Join(p1=G, p2=translateInlineData(p)) + elif p.name == 'ServiceGraphPattern': + G = Join(p1=G, p2=p) + elif p.name in ('BGP', 'Extend'): + G = Join(p1=G, p2=p) + elif p.name == 'Bind': + G = Extend(G, p.expr, p.var) + + else: + raise Exception('Unknown part in GroupGraphPattern: %s - %s' % + (type(p), p.name)) + + if filters: + G = Filter(expr=filters, p=G) + + return G + + +class StopTraversal(Exception): + def __init__(self, rv): + self.rv = rv + + +def _traverse(e, visitPre=lambda n: None, visitPost=lambda n: None): + """ + Traverse a parse-tree, visit each node + + if visit functions return a value, replace current node + """ + _e = visitPre(e) + if _e is not None: + return _e + + if e is None: + return None + + if isinstance(e, (list, ParseResults)): + return [_traverse(x, visitPre, visitPost) for x in e] + elif isinstance(e, tuple): + return tuple([_traverse(x, visitPre, visitPost) for x in e]) + + elif isinstance(e, CompValue): + for k, val in e.items(): + e[k] = _traverse(val, visitPre, visitPost) + + _e = visitPost(e) + if _e is not None: + return _e + + return e + + +def _traverseAgg(e, visitor=lambda n, v: None): + """ + Traverse a parse-tree, visit each node + + if visit functions return a value, replace current node + """ + + res = [] + + if isinstance(e, (list, ParseResults, tuple)): + res = [_traverseAgg(x, visitor) for x in e] + + elif isinstance(e, CompValue): + for k, val in e.items(): + if val != None: + res.append(_traverseAgg(val, visitor)) + + return visitor(e, res) + + +def traverse( + tree, visitPre=lambda n: None, + visitPost=lambda n: None, complete=None): + """ + Traverse tree, visit each node with visit function + visit function may raise StopTraversal to stop traversal + if complete!=None, it is returned on complete traversal, + otherwise the transformed tree is returned + """ + try: + r = _traverse(tree, visitPre, visitPost) + if complete is not None: + return complete + return r + except StopTraversal as st: + return st.rv + + +def _hasAggregate(x): + """ + Traverse parse(sub)Tree + return true if any aggregates are used + """ + + if isinstance(x, CompValue): + if x.name.startswith('Aggregate_'): + raise StopTraversal(True) + + +def _aggs(e, A): + """ + Collect Aggregates in A + replaces aggregates with variable references + """ + + # TODO: nested Aggregates? + + if isinstance(e, CompValue) and e.name.startswith('Aggregate_'): + A.append(e) + aggvar = Variable('__agg_%d__' % len(A)) + e["res"] = aggvar + return aggvar + + +def _findVars(x, res): + """ + Find all variables in a tree + """ + if isinstance(x, Variable): + res.add(x) + if isinstance(x, CompValue): + if x.name == "Bind": + res.add(x.var) + return x # stop recursion and finding vars in the expr + elif x.name == 'SubSelect': + if x.projection: + res.update(v.var or v.evar for v in x.projection) + return x + + +def _addVars(x, children): + """ + find which variables may be bound by this part of the query + """ + if isinstance(x, Variable): + return set([x]) + elif isinstance(x, CompValue): + if x.name == "RelationalExpression": + x["_vars"] = set() + elif x.name == "Extend": + # vars only used in the expr for a bind should not be included + x["_vars"] = reduce(operator.or_, [ child for child,part in zip(children,x) if part!='expr' ], set()) + + else: + x["_vars"] = set(reduce(operator.or_, children, set())) + + if x.name == 'SubSelect': + if x.projection: + s = set(v.var or v.evar for v in x.projection) + else: + s = set() + + return s + + return x["_vars"] + + return reduce(operator.or_, children, set()) + + +def _sample(e, v=None): + """ + For each unaggregated variable V in expr + Replace V with Sample(V) + """ + if isinstance(e, CompValue) and e.name.startswith("Aggregate_"): + return e # do not replace vars in aggregates + if isinstance(e, Variable) and v != e: + return CompValue('Aggregate_Sample', vars=e) + + +def _simplifyFilters(e): + if isinstance(e, Expr): + return simplifyFilters(e) + + +def translateAggregates(q, M): + E = [] + A = [] + + # collect/replace aggs in : + # select expr as ?var + if q.projection: + for v in q.projection: + if v.evar: + v.expr = traverse(v.expr, functools.partial(_sample, v=v.evar)) + v.expr = traverse(v.expr, functools.partial(_aggs, A=A)) + + + # having clause + if traverse(q.having, _hasAggregate, complete=False): + q.having = traverse(q.having, _sample) + traverse(q.having, functools.partial(_aggs, A=A)) + + # order by + if traverse(q.orderby, _hasAggregate, complete=False): + q.orderby = traverse(q.orderby, _sample) + traverse(q.orderby, functools.partial(_aggs, A=A)) + + # sample all other select vars + # TODO: only allowed for vars in group-by? + if q.projection: + for v in q.projection: + if v.var: + rv = Variable('__agg_%d__' % (len(A) + 1)) + A.append(CompValue('Aggregate_Sample', vars=v.var, res=rv)) + E.append((rv, v.var)) + + return CompValue('AggregateJoin', A=A, p=M), E + + +def translateValues(v): + # if len(v.var)!=len(v.value): + # raise Exception("Unmatched vars and values in ValueClause: "+str(v)) + + res = [] + if not v.var: + return res + if not v.value: + return res + if not isinstance(v.value[0], list): + + for val in v.value: + res.append({v.var[0]: val}) + else: + for vals in v.value: + res.append(dict(list(zip(v.var, vals)))) + + return Values(res) + + +def translate(q): + """ + http://www.w3.org/TR/sparql11-query/#convertSolMod + + """ + + _traverse(q, _simplifyFilters) + + q.where = traverse(q.where, visitPost=translatePath) + + # TODO: Var scope test + VS = set() + traverse(q.where, functools.partial(_findVars, res=VS)) + + # all query types have a where part + M = translateGroupGraphPattern(q.where) + + aggregate = False + if q.groupby: + conditions = [] + # convert "GROUP BY (?expr as ?var)" to an Extend + for c in q.groupby.condition: + if isinstance(c, CompValue) and c.name == 'GroupAs': + M = Extend(M, c.expr, c.var) + c = c.var + conditions.append(c) + + M = Group(p=M, expr=conditions) + aggregate = True + elif traverse(q.having, _hasAggregate, complete=False) or \ + traverse(q.orderby, _hasAggregate, complete=False) or \ + any(traverse(x.expr, _hasAggregate, complete=False) + for x in q.projection or [] if x.evar): + # if any aggregate is used, implicit group by + M = Group(p=M) + aggregate = True + + if aggregate: + M, E = translateAggregates(q, M) + else: + E = [] + + # HAVING + if q.having: + M = Filter(expr=and_(*q.having.condition), p=M) + + # VALUES + if q.valuesClause: + M = Join(p1=M, p2=ToMultiSet(translateValues(q.valuesClause))) + + if not q.projection: + # select * + PV = list(VS) + else: + PV = list() + for v in q.projection: + if v.var: + if v not in PV: + PV.append(v.var) + elif v.evar: + if v not in PV: + PV.append(v.evar) + + E.append((v.expr, v.evar)) + else: + raise Exception("I expected a var or evar here!") + + for e, v in E: + M = Extend(M, e, v) + + # ORDER BY + if q.orderby: + M = OrderBy(M, [CompValue('OrderCondition', expr=c.expr, + order=c.order) for c in q.orderby.condition]) + + # PROJECT + M = Project(M, PV) + + if q.modifier: + if q.modifier == 'DISTINCT': + M = CompValue('Distinct', p=M) + elif q.modifier == 'REDUCED': + M = CompValue('Reduced', p=M) + + if q.limitoffset: + offset = 0 + if q.limitoffset.offset != None: + offset = q.limitoffset.offset.toPython() + + if q.limitoffset.limit != None: + M = CompValue('Slice', p=M, start=offset, + length=q.limitoffset.limit.toPython()) + else: + M = CompValue('Slice', p=M, start=offset) + + return M, PV + + +def simplify(n): + """Remove joins to empty BGPs""" + if isinstance(n, CompValue): + if n.name == 'Join': + if n.p1.name == 'BGP' and len(n.p1.triples) == 0: + return n.p2 + if n.p2.name == 'BGP' and len(n.p2.triples) == 0: + return n.p1 + elif n.name == 'BGP': + n["triples"] = reorderTriples(n.triples) + return n + + +def analyse(n, children): + + """ + Some things can be lazily joined. + This propegates whether they can up the tree + and sets lazy flags for all joins + """ + + if isinstance(n, CompValue): + if n.name == 'Join': + n["lazy"] = all(children) + return False + elif n.name in ('Slice', 'Distinct'): + return False + else: + return all(children) + else: + return True + + +def translatePrologue(p, base, initNs=None, prologue=None): + + if prologue is None: + prologue = Prologue() + prologue.base = "" + if base: + prologue.base = base + if initNs: + for k, v in initNs.items(): + prologue.bind(k, v) + + for x in p: + if x.name == 'Base': + prologue.base = x.iri + elif x.name == 'PrefixDecl': + prologue.bind(x.prefix, prologue.absolutize(x.iri)) + + return prologue + + +def translateQuads(quads): + if quads.triples: + alltriples = triples(quads.triples) + else: + alltriples = [] + + allquads = collections.defaultdict(list) + + if quads.quadsNotTriples: + for q in quads.quadsNotTriples: + if q.triples: + allquads[q.term] += triples(q.triples) + + return alltriples, allquads + + +def translateUpdate1(u, prologue): + if u.name in ('Load', 'Clear', 'Drop', 'Create'): + pass # no translation needed + elif u.name in ('Add', 'Move', 'Copy'): + pass + elif u.name in ('InsertData', 'DeleteData', 'DeleteWhere'): + t, q = translateQuads(u.quads) + u["quads"] = q + u["triples"] = t + if u.name in ('DeleteWhere', 'DeleteData'): + pass # TODO: check for bnodes in triples + elif u.name == 'Modify': + if u.delete: + u.delete["triples"], u.delete[ + "quads"] = translateQuads(u.delete.quads) + if u.insert: + u.insert["triples"], u.insert[ + "quads"] = translateQuads(u.insert.quads) + u["where"] = translateGroupGraphPattern(u.where) + else: + raise Exception('Unknown type of update operation: %s' % u) + + u.prologue = prologue + return u + + +def translateUpdate(q, base=None, initNs=None): + """ + Returns a list of SPARQL Update Algebra expressions + """ + + res = [] + prologue = None + if not q.request: + return res + for p, u in zip(q.prologue, q.request): + prologue = translatePrologue(p, base, initNs, prologue) + + # absolutize/resolve prefixes + u = traverse( + u, visitPost=functools.partial(translatePName, prologue=prologue)) + u = _traverse(u, _simplifyFilters) + + u = traverse(u, visitPost=translatePath) + + res.append(translateUpdate1(u, prologue)) + + return res + + +def translateQuery(q, base=None, initNs=None): + """ + Translate a query-parsetree to a SPARQL Algebra Expression + + Return a rdflib.plugins.sparql.sparql.Query object + """ + + # We get in: (prologue, query) + + prologue = translatePrologue(q[0], base, initNs) + + # absolutize/resolve prefixes + q[1] = traverse( + q[1], visitPost=functools.partial(translatePName, prologue=prologue)) + + P, PV = translate(q[1]) + datasetClause = q[1].datasetClause + if q[1].name == 'ConstructQuery': + + template = triples(q[1].template) if q[1].template else None + + res = CompValue(q[1].name, p=P, + template=template, + datasetClause=datasetClause) + else: + res = CompValue(q[1].name, p=P, datasetClause=datasetClause, PV=PV) + + res = traverse(res, visitPost=simplify) + _traverseAgg(res, visitor=analyse) + _traverseAgg(res, _addVars) + + return Query(prologue, res) + + +def pprintAlgebra(q): + def pp(p, ind=" "): + # if isinstance(p, list): + # print "[ " + # for x in p: pp(x,ind) + # print "%s ]"%ind + # return + if not isinstance(p, CompValue): + print(p) + return + print("%s(" % (p.name, )) + for k in p: + print("%s%s =" % (ind, k,), end=' ') + pp(p[k], ind + " ") + print("%s)" % ind) + + try: + pp(q.algebra) + except AttributeError: + # it's update, just a list + for x in q: + pp(x) + +if __name__ == '__main__': + import sys + from rdflib.plugins.sparql import parser + import os.path + + if os.path.exists(sys.argv[1]): + q = file(sys.argv[1]) + else: + q = sys.argv[1] + + pq = parser.parseQuery(q) + print(pq) + tq = translateQuery(pq) + print(pprintAlgebra(tq))