comparison env/lib/python3.7/site-packages/rdflib/void.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
import collections

from rdflib import URIRef, Graph, Literal
from rdflib.namespace import VOID, RDF


def generateVoID(g, dataset=None, res=None, distinctForPartitions=True):
8 """
9 Returns a new graph with a VoID description of the passed dataset
10
11 For more info on Vocabulary of Interlinked Datasets (VoID), see:
12 http://vocab.deri.ie/void
13
14 This only makes two passes through the triples (once to detect the types
15 of things)
16
17 The tradeoff is that lots of temporary structures are built up in memory
18 meaning lots of memory may be consumed :)
19 I imagine at least a few copies of your original graph.
20
21 the distinctForPartitions parameter controls whether
22 distinctSubjects/objects are tracked for each class/propertyPartition
23 this requires more memory again
24
25 """
26
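    # First pass: index every typed resource, recording which classes each
    # subject belongs to (typeMap) and which entities instantiate each
    # class (classes).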
    typeMap = collections.defaultdict(set)
    classes = collections.defaultdict(set)
    for e, c in g.subject_objects(RDF.type):
        classes[c].add(e)
        typeMap[e].add(c)

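    # Dataset-wide counters plus the per-class / per-property sets used to
    # build the partition statistics below.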
    triples = 0
    subjects = set()
    objects = set()
    properties = set()
    classCount = collections.defaultdict(int)
    propCount = collections.defaultdict(int)

    classProps = collections.defaultdict(set)
    classObjects = collections.defaultdict(set)
    propSubjects = collections.defaultdict(set)
    propObjects = collections.defaultdict(set)

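    # Second pass: count triples and accumulate the figures for the class
    # and property partitions.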
    for s, p, o in g:

        triples += 1
        subjects.add(s)
        properties.add(p)
        objects.add(o)

        # class partitions
        if s in typeMap:
            for c in typeMap[s]:
                classCount[c] += 1
                if distinctForPartitions:
                    classObjects[c].add(o)
                    classProps[c].add(p)

        # property partitions
        propCount[p] += 1
        if distinctForPartitions:
            propObjects[p].add(o)
            propSubjects[p].add(s)

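    # Fall back to a placeholder dataset URI and a fresh result graph when
    # the caller does not supply them.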
    if not dataset:
        dataset = URIRef("http://example.org/Dataset")

    if not res:
        res = Graph()

    res.add((dataset, RDF.type, VOID.Dataset))

    # basic stats
    res.add((dataset, VOID.triples, Literal(triples)))
    res.add((dataset, VOID.classes, Literal(len(classes))))

    res.add((dataset, VOID.distinctObjects, Literal(len(objects))))
    res.add((dataset, VOID.distinctSubjects, Literal(len(subjects))))
    res.add((dataset, VOID.properties, Literal(len(properties))))

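    # One void:classPartition per class seen in the data, each described as
    # a void:Dataset in its own right.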
    for i, c in enumerate(classes):
        part = URIRef(dataset + "_class%d" % i)
        res.add((dataset, VOID.classPartition, part))
        res.add((part, RDF.type, VOID.Dataset))

        res.add((part, VOID.triples, Literal(classCount[c])))
        res.add((part, VOID.classes, Literal(1)))

        res.add((part, VOID["class"], c))

        res.add((part, VOID.entities, Literal(len(classes[c]))))
        res.add((part, VOID.distinctSubjects, Literal(len(classes[c]))))

        if distinctForPartitions:
            res.add(
                (part, VOID.properties, Literal(len(classProps[c]))))
            res.add((part, VOID.distinctObjects,
                     Literal(len(classObjects[c]))))

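    # One void:propertyPartition per predicate seen in the data.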
    for i, p in enumerate(properties):
        part = URIRef(dataset + "_property%d" % i)
        res.add((dataset, VOID.propertyPartition, part))
        res.add((part, RDF.type, VOID.Dataset))

        res.add((part, VOID.triples, Literal(propCount[p])))
        res.add((part, VOID.properties, Literal(1)))

        res.add((part, VOID.property, p))

        if distinctForPartitions:

            entities = 0
            propClasses = set()
            for s in propSubjects[p]:
                if s in typeMap:
                    entities += 1
                    for c in typeMap[s]:
                        propClasses.add(c)

            res.add((part, VOID.entities, Literal(entities)))
            res.add((part, VOID.classes, Literal(len(propClasses))))

            res.add((part, VOID.distinctSubjects,
                     Literal(len(propSubjects[p]))))
            res.add((part, VOID.distinctObjects,
                     Literal(len(propObjects[p]))))

    return res, dataset
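

# Minimal usage sketch, not part of the upstream module: build a small
# in-memory graph and print its VoID description. The example namespace,
# resources, and dataset URI below are assumptions for illustration only.
if __name__ == "__main__":
    from rdflib import Namespace

    EX = Namespace("http://example.org/")
    g = Graph()
    g.add((EX.alice, RDF.type, EX.Person))
    g.add((EX.alice, EX.knows, EX.bob))
    g.add((EX.bob, RDF.type, EX.Person))

    void_graph, ds = generateVoID(g, dataset=EX.exampleDataset)
    # serialize() returns bytes in older rdflib releases and str in newer
    # ones, so just hand whatever comes back to print() for a quick look.
    print(void_graph.serialize(format="turtle"))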