Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/rdflib/void.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
| author | shellac |
|---|---|
| date | Sat, 02 May 2020 07:14:21 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:26e78fe6e8c4 |
|---|---|
| 1 import collections | |
| 2 | |
| 3 from rdflib import URIRef, Graph, Literal | |
| 4 from rdflib.namespace import VOID, RDF | |
| 5 | |
| 6 | |
def generateVoID(g, dataset=None, res=None, distinctForPartitions=True):
    """
    Return a graph holding a VoID description of the graph ``g``.

    For more info on Vocabulary of Interlinked Datasets (VoID), see:
    http://vocab.deri.ie/void

    The statistics are gathered in two passes over ``g``: one over the
    ``rdf:type`` statements to learn the type(s) of each subject, and one
    over every triple to accumulate the counts.

    The tradeoff is that lots of temporary structures are built up in memory,
    so memory consumption can reach a few copies of the original graph.

    :param g: the graph to describe; it is iterated as ``(s, p, o)`` triples
        and queried with ``subject_objects(RDF.type)``.
    :param dataset: node used as the subject of the dataset description.
        Defaults to ``URIRef("http://example.org/Dataset")`` when falsy.
        Partition nodes are named ``<dataset>_class<i>`` and
        ``<dataset>_property<i>``.
    :param res: graph the description is added to.  A new ``Graph`` is
        created only when this is ``None``.
    :param distinctForPartitions: when true, distinct subjects/objects (and
        properties/classes/entities) are also tracked per class partition
        and per property partition.  This requires more memory again.
    :return: a ``(res, dataset)`` tuple - the graph containing the
        description and the dataset node that was described.
    """

    # First pass: map every typed entity to its classes and vice versa.
    typeMap = collections.defaultdict(set)
    classes = collections.defaultdict(set)
    for e, c in g.subject_objects(RDF.type):
        classes[c].add(e)
        typeMap[e].add(c)

    triples = 0
    subjects = set()
    objects = set()
    properties = set()
    classCount = collections.defaultdict(int)
    propCount = collections.defaultdict(int)

    classProps = collections.defaultdict(set)
    classObjects = collections.defaultdict(set)
    propSubjects = collections.defaultdict(set)
    propObjects = collections.defaultdict(set)

    # Second pass: accumulate global and per-partition statistics.
    for s, p, o in g:

        triples += 1
        subjects.add(s)
        properties.add(p)
        objects.add(o)

        # class partitions: a triple counts towards every class of its subject
        if s in typeMap:
            for c in typeMap[s]:
                classCount[c] += 1
                if distinctForPartitions:
                    classObjects[c].add(o)
                    classProps[c].add(p)

        # property partitions
        propCount[p] += 1
        if distinctForPartitions:
            propObjects[p].add(o)
            propSubjects[p].add(s)

    if not dataset:
        dataset = URIRef("http://example.org/Dataset")

    # Test for None rather than truthiness: Graph defines __len__, so an
    # empty graph passed in by the caller is falsy and would otherwise be
    # silently replaced by a brand-new Graph.
    if res is None:
        res = Graph()

    res.add((dataset, RDF.type, VOID.Dataset))

    # basic stats
    res.add((dataset, VOID.triples, Literal(triples)))
    res.add((dataset, VOID.classes, Literal(len(classes))))

    res.add((dataset, VOID.distinctObjects, Literal(len(objects))))
    res.add((dataset, VOID.distinctSubjects, Literal(len(subjects))))
    res.add((dataset, VOID.properties, Literal(len(properties))))

    # one sub-dataset per class
    for i, c in enumerate(classes):
        part = URIRef(dataset + "_class%d" % i)
        res.add((dataset, VOID.classPartition, part))
        res.add((part, RDF.type, VOID.Dataset))

        res.add((part, VOID.triples, Literal(classCount[c])))
        res.add((part, VOID.classes, Literal(1)))

        # "class" is a Python keyword, hence item access instead of VOID.class
        res.add((part, VOID["class"], c))

        res.add((part, VOID.entities, Literal(len(classes[c]))))
        res.add((part, VOID.distinctSubjects, Literal(len(classes[c]))))

        if distinctForPartitions:
            res.add(
                (part, VOID.properties, Literal(len(classProps[c]))))
            res.add((part, VOID.distinctObjects,
                     Literal(len(classObjects[c]))))

    # one sub-dataset per property
    for i, p in enumerate(properties):
        part = URIRef(dataset + "_property%d" % i)
        res.add((dataset, VOID.propertyPartition, part))
        res.add((part, RDF.type, VOID.Dataset))

        res.add((part, VOID.triples, Literal(propCount[p])))
        res.add((part, VOID.properties, Literal(1)))

        res.add((part, VOID.property, p))

        if distinctForPartitions:

            # entities: subjects of this property that have at least one
            # rdf:type; propClasses: union of the classes of those subjects
            entities = 0
            propClasses = set()
            for s in propSubjects[p]:
                if s in typeMap:
                    entities += 1
                    propClasses.update(typeMap[s])

            res.add((part, VOID.entities, Literal(entities)))
            res.add((part, VOID.classes, Literal(len(propClasses))))

            res.add((part, VOID.distinctSubjects,
                     Literal(len(propSubjects[p]))))
            res.add((part, VOID.distinctObjects,
                     Literal(len(propObjects[p]))))

    return res, dataset
