comparison env/lib/python3.7/site-packages/rdflib/void.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
import collections

from rdflib import URIRef, Graph, Literal
from rdflib.namespace import VOID, RDF


def generateVoID(g, dataset=None, res=None, distinctForPartitions=True):
8 """
9 Returns a new graph with a VoID description of the passed dataset
10
11 For more info on Vocabulary of Interlinked Datasets (VoID), see:
12 http://vocab.deri.ie/void
13
14 This only makes two passes through the triples (once to detect the types
15 of things)
16
17 The tradeoff is that lots of temporary structures are built up in memory
18 meaning lots of memory may be consumed :)
19 I imagine at least a few copies of your original graph.
20
21 the distinctForPartitions parameter controls whether
22 distinctSubjects/objects are tracked for each class/propertyPartition
23 this requires more memory again
24
25 """
26
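    # First pass: index every typed resource, recording which classes each
    # subject belongs to (typeMap) and which entities instantiate each
    # class (classes).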
    typeMap = collections.defaultdict(set)
    classes = collections.defaultdict(set)
    for e, c in g.subject_objects(RDF.type):
        classes[c].add(e)
        typeMap[e].add(c)

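    # Dataset-wide counters plus the per-class / per-property sets used to
    # build the partition statistics below.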
    triples = 0
    subjects = set()
    objects = set()
    properties = set()
    classCount = collections.defaultdict(int)
    propCount = collections.defaultdict(int)

    classProps = collections.defaultdict(set)
    classObjects = collections.defaultdict(set)
    propSubjects = collections.defaultdict(set)
    propObjects = collections.defaultdict(set)

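    # Second pass: count triples and accumulate the figures for the class
    # and property partitions.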
    for s, p, o in g:

        triples += 1
        subjects.add(s)
        properties.add(p)
        objects.add(o)

        # class partitions
        if s in typeMap:
            for c in typeMap[s]:
                classCount[c] += 1
                if distinctForPartitions:
                    classObjects[c].add(o)
                    classProps[c].add(p)

        # property partitions
        propCount[p] += 1
        if distinctForPartitions:
            propObjects[p].add(o)
            propSubjects[p].add(s)

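    # Fall back to a placeholder dataset URI and a fresh result graph when
    # the caller does not supply them.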
    if not dataset:
        dataset = URIRef("http://example.org/Dataset")

    if not res:
        res = Graph()

    res.add((dataset, RDF.type, VOID.Dataset))

    # basic stats
    res.add((dataset, VOID.triples, Literal(triples)))
    res.add((dataset, VOID.classes, Literal(len(classes))))

    res.add((dataset, VOID.distinctObjects, Literal(len(objects))))
    res.add((dataset, VOID.distinctSubjects, Literal(len(subjects))))
    res.add((dataset, VOID.properties, Literal(len(properties))))

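    # One void:classPartition per class seen in the data, each described as
    # a void:Dataset in its own right.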
    for i, c in enumerate(classes):
        part = URIRef(dataset + "_class%d" % i)
        res.add((dataset, VOID.classPartition, part))
        res.add((part, RDF.type, VOID.Dataset))

        res.add((part, VOID.triples, Literal(classCount[c])))
        res.add((part, VOID.classes, Literal(1)))

        res.add((part, VOID["class"], c))

        res.add((part, VOID.entities, Literal(len(classes[c]))))
        res.add((part, VOID.distinctSubjects, Literal(len(classes[c]))))

        if distinctForPartitions:
            res.add(
                (part, VOID.properties, Literal(len(classProps[c]))))
            res.add((part, VOID.distinctObjects,
                     Literal(len(classObjects[c]))))

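    # One void:propertyPartition per predicate seen in the data.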
    for i, p in enumerate(properties):
        part = URIRef(dataset + "_property%d" % i)
        res.add((dataset, VOID.propertyPartition, part))
        res.add((part, RDF.type, VOID.Dataset))

        res.add((part, VOID.triples, Literal(propCount[p])))
        res.add((part, VOID.properties, Literal(1)))

        res.add((part, VOID.property, p))

        if distinctForPartitions:

            entities = 0
            propClasses = set()
            for s in propSubjects[p]:
                if s in typeMap:
                    entities += 1
                    for c in typeMap[s]:
                        propClasses.add(c)

            res.add((part, VOID.entities, Literal(entities)))
            res.add((part, VOID.classes, Literal(len(propClasses))))

            res.add((part, VOID.distinctSubjects,
                     Literal(len(propSubjects[p]))))
            res.add((part, VOID.distinctObjects,
                     Literal(len(propObjects[p]))))

    return res, dataset
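

# Minimal usage sketch, not part of the upstream module: build a small
# in-memory graph and print its VoID description. The example namespace,
# resources, and dataset URI below are assumptions for illustration only.
if __name__ == "__main__":
    from rdflib import Namespace

    EX = Namespace("http://example.org/")
    g = Graph()
    g.add((EX.alice, RDF.type, EX.Person))
    g.add((EX.alice, EX.knows, EX.bob))
    g.add((EX.bob, RDF.type, EX.Person))

    void_graph, ds = generateVoID(g, dataset=EX.exampleDataset)
    # serialize() returns bytes in older rdflib releases and str in newer
    # ones, so just hand whatever comes back to print() for a quick look.
    print(void_graph.serialize(format="turtle"))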