Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/rdflib/void.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author | shellac |
---|---|
date | Sat, 02 May 2020 07:14:21 -0400 |
parents | |
children |
line wrap: on
line diff
import collections

from rdflib import URIRef, Graph, Literal
from rdflib.namespace import VOID, RDF


def generateVoID(g, dataset=None, res=None, distinctForPartitions=True):
    """
    Build a VoID (Vocabulary of Interlinked Datasets) description of *g*
    and return it as a ``(description_graph, dataset_uri)`` pair.

    For background on VoID see: http://vocab.deri.ie/void

    The input graph is scanned twice: once to map resources to their
    rdf:types, and once to gather triple/subject/object/property
    statistics.  Several intermediate sets are held in memory along the
    way, so expect a footprint of roughly a few copies of the input graph.

    :param g: the graph to describe
    :param dataset: URIRef naming the dataset (an example URI by default)
    :param res: graph to write the description into (new one if omitted)
    :param distinctForPartitions: when true, also record
        distinctSubjects/distinctObjects per class/property partition,
        which costs additional memory
    """
    # First pass: which classes does each resource belong to?
    types_of = collections.defaultdict(set)    # resource -> {class}
    members_of = collections.defaultdict(set)  # class -> {resource}
    for resource, cls in g.subject_objects(RDF.type):
        members_of[cls].add(resource)
        types_of[resource].add(cls)

    triples = 0
    all_subjects = set()
    all_objects = set()
    all_properties = set()
    class_triples = collections.defaultdict(int)
    prop_triples = collections.defaultdict(int)

    class_props = collections.defaultdict(set)
    class_objects = collections.defaultdict(set)
    prop_subjects = collections.defaultdict(set)
    prop_objects = collections.defaultdict(set)

    # Second pass: accumulate the statistics.
    for subj, pred, obj in g:
        triples += 1
        all_subjects.add(subj)
        all_properties.add(pred)
        all_objects.add(obj)

        # class partitions: credit this triple to every class of subj
        for cls in types_of.get(subj, ()):
            class_triples[cls] += 1
            if distinctForPartitions:
                class_objects[cls].add(obj)
                class_props[cls].add(pred)

        # property partitions
        prop_triples[pred] += 1
        if distinctForPartitions:
            prop_objects[pred].add(obj)
            prop_subjects[pred].add(subj)

    dataset = dataset or URIRef("http://example.org/Dataset")
    res = res or Graph()

    res.add((dataset, RDF.type, VOID.Dataset))

    # dataset-wide statistics
    res.add((dataset, VOID.triples, Literal(triples)))
    res.add((dataset, VOID.classes, Literal(len(members_of))))
    res.add((dataset, VOID.distinctObjects, Literal(len(all_objects))))
    res.add((dataset, VOID.distinctSubjects, Literal(len(all_subjects))))
    res.add((dataset, VOID.properties, Literal(len(all_properties))))

    # one classPartition per encountered class
    for i, cls in enumerate(members_of):
        part = URIRef(dataset + "_class%d" % i)
        members = members_of[cls]

        res.add((dataset, VOID.classPartition, part))
        res.add((part, RDF.type, VOID.Dataset))
        res.add((part, VOID.triples, Literal(class_triples[cls])))
        res.add((part, VOID.classes, Literal(1)))
        res.add((part, VOID["class"], cls))
        res.add((part, VOID.entities, Literal(len(members))))
        res.add((part, VOID.distinctSubjects, Literal(len(members))))

        if distinctForPartitions:
            res.add((part, VOID.properties,
                     Literal(len(class_props[cls]))))
            res.add((part, VOID.distinctObjects,
                     Literal(len(class_objects[cls]))))

    # one propertyPartition per encountered predicate
    for i, pred in enumerate(all_properties):
        part = URIRef(dataset + "_property%d" % i)

        res.add((dataset, VOID.propertyPartition, part))
        res.add((part, RDF.type, VOID.Dataset))
        res.add((part, VOID.triples, Literal(prop_triples[pred])))
        res.add((part, VOID.properties, Literal(1)))
        res.add((part, VOID.property, pred))

        if distinctForPartitions:
            # entities: subjects of this property that have an rdf:type;
            # classes: the union of those subjects' types
            typed_subjects = [s for s in prop_subjects[pred]
                              if s in types_of]
            prop_classes = set()
            for s in typed_subjects:
                prop_classes.update(types_of[s])

            res.add((part, VOID.entities, Literal(len(typed_subjects))))
            res.add((part, VOID.classes, Literal(len(prop_classes))))
            res.add((part, VOID.distinctSubjects,
                     Literal(len(prop_subjects[pred]))))
            res.add((part, VOID.distinctObjects,
                     Literal(len(prop_objects[pred]))))

    return res, dataset