comparison env/lib/python3.7/site-packages/rdflib/void.py @ 5:9b1c78e6ba9c draft default tip

"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author shellac
date Mon, 01 Jun 2020 08:59:25 -0400
parents 79f47841a781
children
comparison
equal deleted inserted replaced
4:79f47841a781 5:9b1c78e6ba9c
1 import collections
2
3 from rdflib import URIRef, Graph, Literal
4 from rdflib.namespace import VOID, RDF
5
6
def generateVoID(g, dataset=None, res=None, distinctForPartitions=True):
    """
    Return a graph holding a VoID description of the passed dataset.

    For more info on Vocabulary of Interlinked Datasets (VoID), see:
    http://vocab.deri.ie/void

    This only makes two passes through the triples (once to detect the
    types of things, once to gather the statistics).

    The tradeoff is that lots of temporary structures are built up in
    memory, meaning lots of memory may be consumed -- expect at least a
    few copies of the original graph.

    :param g: the graph to describe; it must provide
        ``subject_objects(predicate)`` and iterate as ``(s, p, o)`` triples.
    :param dataset: the node used as the subject of the description.
        When omitted (or falsy) ``<http://example.org/Dataset>`` is used.
        Partition nodes are named by appending ``_class<N>`` /
        ``_property<N>`` to it.
    :param res: the graph the description is added to.  When ``None`` a
        new ``Graph`` is created.  An *empty* graph passed by the caller
        is used as-is (it is not replaced by a new one).
    :param distinctForPartitions: when true, distinct subjects/objects
        (and properties/classes/entities) are also tracked for every
        class and property partition.  This requires more memory again.
    :return: a ``(res, dataset)`` tuple.
    """

    # First pass: map every typed subject to its classes and vice versa.
    typeMap = collections.defaultdict(set)
    classes = collections.defaultdict(set)
    for e, c in g.subject_objects(RDF.type):
        classes[c].add(e)
        typeMap[e].add(c)

    triples = 0
    subjects = set()
    objects = set()
    properties = set()
    classCount = collections.defaultdict(int)
    propCount = collections.defaultdict(int)

    classProps = collections.defaultdict(set)
    classObjects = collections.defaultdict(set)
    propSubjects = collections.defaultdict(set)
    propObjects = collections.defaultdict(set)

    # Second pass: gather dataset-wide and per-partition statistics.
    for s, p, o in g:

        triples += 1
        subjects.add(s)
        properties.add(p)
        objects.add(o)

        # class partitions
        # .get() rather than indexing: indexing a defaultdict inserts an
        # empty entry for untyped subjects, which would make them look
        # typed to every later ``in typeMap`` check.
        for c in typeMap.get(s, ()):
            classCount[c] += 1
            if distinctForPartitions:
                classObjects[c].add(o)
                classProps[c].add(p)

        # property partitions
        propCount[p] += 1
        if distinctForPartitions:
            propObjects[p].add(o)
            propSubjects[p].add(s)

    if not dataset:
        dataset = URIRef("http://example.org/Dataset")

    # Compare against None explicitly: a graph without triples is falsy,
    # so ``if not res`` would silently discard an empty graph supplied by
    # the caller and return a different one.
    if res is None:
        res = Graph()

    res.add((dataset, RDF.type, VOID.Dataset))

    # basic stats
    res.add((dataset, VOID.triples, Literal(triples)))
    res.add((dataset, VOID.classes, Literal(len(classes))))

    res.add((dataset, VOID.distinctObjects, Literal(len(objects))))
    res.add((dataset, VOID.distinctSubjects, Literal(len(subjects))))
    res.add((dataset, VOID.properties, Literal(len(properties))))

    # one void:classPartition per class seen in an rdf:type triple
    for i, c in enumerate(classes):
        part = URIRef(dataset + "_class%d" % i)
        res.add((dataset, VOID.classPartition, part))
        res.add((part, RDF.type, VOID.Dataset))

        res.add((part, VOID.triples, Literal(classCount[c])))
        res.add((part, VOID.classes, Literal(1)))

        res.add((part, VOID["class"], c))

        res.add((part, VOID.entities, Literal(len(classes[c]))))
        res.add((part, VOID.distinctSubjects, Literal(len(classes[c]))))

        if distinctForPartitions:
            res.add((part, VOID.properties, Literal(len(classProps[c]))))
            res.add(
                (part, VOID.distinctObjects, Literal(len(classObjects[c])))
            )

    # one void:propertyPartition per distinct predicate
    for i, p in enumerate(properties):
        part = URIRef(dataset + "_property%d" % i)
        res.add((dataset, VOID.propertyPartition, part))
        res.add((part, RDF.type, VOID.Dataset))

        res.add((part, VOID.triples, Literal(propCount[p])))
        res.add((part, VOID.properties, Literal(1)))

        res.add((part, VOID.property, p))

        if distinctForPartitions:

            # entities: subjects of this property that have at least one
            # rdf:type; propClasses: the union of those subjects' classes.
            entities = 0
            propClasses = set()
            for s in propSubjects[p]:
                # .get() keeps typeMap free of spurious empty entries, so
                # untyped subjects are never counted as entities here or
                # in the partitions processed afterwards.
                subjectClasses = typeMap.get(s)
                if subjectClasses is not None:
                    entities += 1
                    propClasses.update(subjectClasses)

            res.add((part, VOID.entities, Literal(entities)))
            res.add((part, VOID.classes, Literal(len(propClasses))))

            res.add(
                (part, VOID.distinctSubjects, Literal(len(propSubjects[p])))
            )
            res.add(
                (part, VOID.distinctObjects, Literal(len(propObjects[p])))
            )

    return res, dataset