Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/rdflib/void.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author | shellac |
---|---|
date | Sat, 02 May 2020 07:14:21 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:26e78fe6e8c4 |
---|---|
1 import collections | |
2 | |
3 from rdflib import URIRef, Graph, Literal | |
4 from rdflib.namespace import VOID, RDF | |
5 | |
6 | |
def generateVoID(g, dataset=None, res=None, distinctForPartitions=True):
    """
    Build a VoID description of the passed graph.

    For more info on Vocabulary of Interlinked Datasets (VoID), see:
    http://vocab.deri.ie/void

    This only makes two passes through the triples (once to detect the types
    of things, once to gather the statistics).

    The tradeoff is that lots of temporary structures are built up in memory,
    meaning lots of memory may be consumed.

    :param g: the graph to describe; it is iterated as (s, p, o) triples and
        queried through ``g.subject_objects(RDF.type)``.
    :param dataset: node used as the subject of the description. A falsy
        value (``None`` or an empty URI) is replaced by
        ``URIRef("http://example.org/Dataset")``.
    :param res: graph that receives the VoID triples. When ``None`` a new
        ``Graph`` is created; any graph passed in (including an empty one)
        is populated in place.
    :param distinctForPartitions: when true, distinct subjects/objects,
        properties, entities and classes are also tracked and emitted for
        each class/property partition; this requires more memory again.
    :return: a ``(res, dataset)`` tuple: the graph holding the description
        and the dataset node that was described.
    """

    # First pass: map each class to its instances and each instance to its
    # classes, based on the rdf:type statements.
    typeMap = collections.defaultdict(set)
    classes = collections.defaultdict(set)
    for e, c in g.subject_objects(RDF.type):
        classes[c].add(e)
        typeMap[e].add(c)

    triples = 0
    subjects = set()
    objects = set()
    properties = set()
    classCount = collections.defaultdict(int)
    propCount = collections.defaultdict(int)

    classProps = collections.defaultdict(set)
    classObjects = collections.defaultdict(set)
    propSubjects = collections.defaultdict(set)
    propObjects = collections.defaultdict(set)

    # Second pass: global statistics plus per-class / per-property counters.
    for s, p, o in g:

        triples += 1
        subjects.add(s)
        properties.add(p)
        objects.add(o)

        # class partitions
        # (membership test first, so the defaultdict does not grow an empty
        # entry for every untyped subject)
        if s in typeMap:
            for c in typeMap[s]:
                classCount[c] += 1
                if distinctForPartitions:
                    classObjects[c].add(o)
                    classProps[c].add(p)

        # property partitions
        propCount[p] += 1
        if distinctForPartitions:
            propObjects[p].add(o)
            propSubjects[p].add(s)

    if not dataset:
        dataset = URIRef("http://example.org/Dataset")

    # Compare against None explicitly: an empty Graph is falsy (its length
    # is 0), so a truthiness test would discard a fresh graph supplied by
    # the caller and leave it unpopulated.
    if res is None:
        res = Graph()

    res.add((dataset, RDF.type, VOID.Dataset))

    # basic stats
    res.add((dataset, VOID.triples, Literal(triples)))
    res.add((dataset, VOID.classes, Literal(len(classes))))

    res.add((dataset, VOID.distinctObjects, Literal(len(objects))))
    res.add((dataset, VOID.distinctSubjects, Literal(len(subjects))))
    res.add((dataset, VOID.properties, Literal(len(properties))))

    # one sub-dataset per class
    for i, c in enumerate(classes):
        part = URIRef(dataset + "_class%d" % i)
        res.add((dataset, VOID.classPartition, part))
        res.add((part, RDF.type, VOID.Dataset))

        res.add((part, VOID.triples, Literal(classCount[c])))
        res.add((part, VOID.classes, Literal(1)))

        # "class" is a Python keyword, hence item access instead of VOID.class
        res.add((part, VOID["class"], c))

        res.add((part, VOID.entities, Literal(len(classes[c]))))
        res.add((part, VOID.distinctSubjects, Literal(len(classes[c]))))

        if distinctForPartitions:
            res.add(
                (part, VOID.properties, Literal(len(classProps[c]))))
            res.add((part, VOID.distinctObjects,
                     Literal(len(classObjects[c]))))

    # one sub-dataset per property
    for i, p in enumerate(properties):
        part = URIRef(dataset + "_property%d" % i)
        res.add((dataset, VOID.propertyPartition, part))
        res.add((part, RDF.type, VOID.Dataset))

        res.add((part, VOID.triples, Literal(propCount[p])))
        res.add((part, VOID.properties, Literal(1)))

        res.add((part, VOID.property, p))

        if distinctForPartitions:

            # entities: subjects of this property that have at least one
            # rdf:type; propClasses: the union of those subjects' classes.
            entities = 0
            propClasses = set()
            for s in propSubjects[p]:
                if s in typeMap:
                    entities += 1
                    propClasses.update(typeMap[s])

            res.add((part, VOID.entities, Literal(entities)))
            res.add((part, VOID.classes, Literal(len(propClasses))))

            res.add((part, VOID.distinctSubjects,
                     Literal(len(propSubjects[p]))))
            res.add((part, VOID.distinctObjects,
                     Literal(len(propObjects[p]))))

    return res, dataset