Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/rdflib/util.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 (2021-03-22) |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 """ | |
2 Some utility functions. | |
3 | |
4 Miscellaneous utilities | |
5 | |
6 * list2set | |
7 * first | |
8 * uniq | |
9 * more_than | |
10 | |
11 Term characterisation and generation | |
12 | |
13 * to_term | |
14 * from_n3 | |
15 | |
16 Date/time utilities | |
17 | |
18 * date_time | |
19 * parse_date_time | |
20 | |
21 Statement and component type checkers | |
22 | |
23 * check_context | |
24 * check_subject | |
25 * check_predicate | |
26 * check_object | |
27 * check_statement | |
28 * check_pattern | |
29 | |
30 """ | |
31 from __future__ import absolute_import | |
32 from __future__ import division | |
33 from __future__ import print_function | |
34 | |
35 from calendar import timegm | |
36 from time import altzone | |
37 # from time import daylight | |
38 from time import gmtime | |
39 from time import localtime | |
40 from time import time | |
41 from time import timezone | |
42 | |
43 from os.path import splitext | |
44 | |
45 from rdflib.exceptions import ContextTypeError | |
46 from rdflib.exceptions import ObjectTypeError | |
47 from rdflib.exceptions import PredicateTypeError | |
48 from rdflib.exceptions import SubjectTypeError | |
49 from rdflib.graph import Graph | |
50 from rdflib.graph import QuotedGraph | |
51 from rdflib.namespace import Namespace | |
52 from rdflib.namespace import NamespaceManager | |
53 from rdflib.term import BNode | |
54 from rdflib.term import Literal | |
55 from rdflib.term import URIRef | |
56 from rdflib.compat import sign | |
57 | |
# Public names exported by ``from <this module> import *``.
__all__ = [
    "list2set",
    "first",
    "uniq",
    "more_than",
    "to_term",
    "from_n3",
    "date_time",
    "parse_date_time",
    "check_context",
    "check_subject",
    "check_predicate",
    "check_object",
    "check_statement",
    "check_pattern",
    "guess_format",
    "find_roots",
    "get_tree",
]
63 | |
64 | |
def list2set(seq):
    """
    Return the items of ``seq`` as a new list with duplicates dropped.

    Unlike ``set(seq)``, the order of first occurrence is preserved.
    Items are tracked in a set, so they must be hashable.
    """
    already_seen = set()
    unique_items = []
    for item in seq:
        if item not in already_seen:
            already_seen.add(item)
            unique_items.append(item)
    return unique_items
72 | |
73 | |
def first(seq):
    """
    Return the first item produced by iterating over ``seq``, or ``None``
    when ``seq`` yields nothing.

    For graphs, use graph.value instead.
    """
    return next(iter(seq), None)
82 | |
83 | |
def uniq(sequence, strip=0):
    """Return the distinct items of ``sequence`` as a set.

    When ``strip`` is true, ``.strip()`` is called on every item before
    it is added, so items differing only in surrounding whitespace
    collapse into one.
    """
    if not strip:
        return set(sequence)
    return {entry.strip() for entry in sequence}
90 | |
91 | |
def more_than(sequence, number):
    """Return 1 if ``sequence`` has more than ``number`` items, else 0.

    Iteration stops as soon as the answer is known, so at most
    ``number + 1`` items are consumed.
    """
    for count, _item in enumerate(sequence, 1):
        if count > number:
            return 1
    return 0
100 | |
101 | |
def to_term(s, default=None):
    """
    Build the Identifier matching the surface syntax of the string ``s``:

    ''      gives the ``default`` keyword argument value (``None`` if unset)

    '<s>'   gives ``URIRef(s)`` (i.e. without the angle brackets)

    '"s"'   gives ``Literal(s)`` (i.e. without the doublequotes)

    '_s'    gives ``BNode('_s')`` (the whole string, underscore included)

    Any other non-empty string raises ``Exception``.
    """
    if not s:
        return default
    if s.startswith("<") and s.endswith(">"):
        return URIRef(s[1:-1])
    if s.startswith('"') and s.endswith('"'):
        return Literal(s[1:-1])
    if s.startswith("_"):
        return BNode(s)
    raise Exception("Unrecognised term syntax: '%s'" % s)
127 | |
128 | |
def from_n3(s, default=None, backend=None, nsm=None):
    r'''
    Build the Identifier (or graph) denoted by the n3 string ``s``.

    An empty ``s`` gives ``default``.  Prefixed names are looked up in
    ``nsm``; when ``nsm`` is ``None`` a fresh ``NamespaceManager(Graph())``
    is created and its default bindings are used.  ``backend`` is passed
    to ``QuotedGraph`` / ``Graph`` for ``{...}`` / ``[...]`` input; the
    identifier between the brackets is parsed with default arguments.

    >>> from_n3('<http://ex.com/foo>') == URIRef('http://ex.com/foo')
    True
    >>> from_n3('"foo"@de') == Literal('foo', lang='de')
    True
    >>> from_n3('"""multi\nline\nstring"""@en') == Literal(
    ...     'multi\nline\nstring', lang='en')
    True
    >>> from_n3('42') == Literal(42)
    True
    >>> from_n3(Literal(42).n3()) == Literal(42)
    True
    >>> from_n3('"42"^^xsd:integer') == Literal(42)
    True
    >>> from rdflib import RDFS
    >>> from_n3('rdfs:label') == RDFS['label']
    True
    >>> nsm = NamespaceManager(Graph())
    >>> nsm.bind('dbpedia', 'http://dbpedia.org/resource/')
    >>> berlin = URIRef('http://dbpedia.org/resource/Berlin')
    >>> from_n3('dbpedia:Berlin', nsm=nsm) == berlin
    True

    '''
    if not s:
        return default

    if s.startswith('<'):
        # Hack: this should correctly handle strings with either native
        # unicode characters, or \u1234 unicode escapes.
        iri = s[1:-1]
        return URIRef(
            iri.encode("raw-unicode-escape").decode("unicode-escape"))

    if s.startswith('"'):
        delimiter = '"""' if s.startswith('"""') else '"'
        # Split on the *closing* delimiter: text on the left (still with
        # its opening quotes), language tag / datatype on the right.
        body, suffix = s.rsplit(delimiter, 1)
        body = body[len(delimiter):]  # strip leading quotes
        datatype = None
        language = None

        # A datatype overrules a lang-tag, so look for it first.  It has
        # to come after any lang-tag, hence only the text following the
        # last '^^' is used.  See:
        # http://www.w3.org/TR/2011/WD-turtle-20110809/
        #        #prod-turtle2-RDFLiteral
        _before, marker, datatype_n3 = suffix.rpartition('^^')
        if marker:
            datatype = from_n3(datatype_n3, default, backend, nsm)
        elif suffix.startswith("@"):
            language = suffix[1:]  # strip leading at sign

        body = body.replace(r'\"', '"')
        # Hack: this should correctly handle strings with either native
        # unicode characters, or \u1234 unicode escapes.
        body = body.encode("raw-unicode-escape").decode("unicode-escape")
        return Literal(body, language, datatype)

    if s in ('true', 'false'):
        return Literal(s == 'true')

    if s.isdigit():
        return Literal(int(s))

    if s.startswith('{'):
        return QuotedGraph(backend, from_n3(s[1:-1]))

    if s.startswith('['):
        return Graph(backend, from_n3(s[1:-1]))

    if s.startswith("_:"):
        return BNode(s[2:])

    if ':' not in s:
        return BNode(s)

    if nsm is None:
        # instantiate default NamespaceManager and rely on its defaults
        nsm = NamespaceManager(Graph())
    prefix, local_name = s.split(':', 1)
    namespace_uri = dict(nsm.namespaces())[prefix]
    return Namespace(namespace_uri)[local_name]
210 | |
211 | |
def check_context(c):
    """Raise ``ContextTypeError`` unless c is a ``URIRef`` or ``BNode``."""
    if isinstance(c, (URIRef, BNode)):
        return
    raise ContextTypeError("%s:%s" % (c, type(c)))
216 | |
217 | |
def check_subject(s):
    """Raise ``SubjectTypeError`` unless s is a ``URIRef`` or ``BNode``."""
    if isinstance(s, (URIRef, BNode)):
        return
    raise SubjectTypeError(s)
222 | |
223 | |
def check_predicate(p):
    """Raise ``PredicateTypeError`` unless p is a ``URIRef``."""
    if isinstance(p, URIRef):
        return
    raise PredicateTypeError(p)
228 | |
229 | |
def check_object(o):
    """Raise ``ObjectTypeError`` unless o is a ``URIRef``, ``Literal``
    or ``BNode``."""
    if isinstance(o, (URIRef, Literal, BNode)):
        return
    raise ObjectTypeError(o)
236 | |
237 | |
def check_statement(triple):
    """Validate every term of a ``(subject, predicate, object)`` triple.

    The terms are checked in that order and the first offending one
    raises ``SubjectTypeError``, ``PredicateTypeError`` or
    ``ObjectTypeError`` respectively.
    """
    subj, pred, obj = triple

    if not isinstance(subj, (URIRef, BNode)):
        raise SubjectTypeError(subj)

    if not isinstance(pred, URIRef):
        raise PredicateTypeError(pred)

    if not isinstance(obj, (URIRef, Literal, BNode)):
        raise ObjectTypeError(obj)
250 | |
251 | |
def check_pattern(triple):
    """Validate a ``(subject, predicate, object)`` triple pattern.

    Works like a statement check, except that any term which is falsy
    (e.g. ``None``) is skipped instead of being type-checked.
    """
    subj, pred, obj = triple

    bad_subject = subj and not isinstance(subj, (URIRef, BNode))
    if bad_subject:
        raise SubjectTypeError(subj)

    bad_predicate = pred and not isinstance(pred, URIRef)
    if bad_predicate:
        raise PredicateTypeError(pred)

    bad_object = obj and not isinstance(obj, (URIRef, Literal, BNode))
    if bad_object:
        raise ObjectTypeError(obj)
264 | |
265 | |
def date_time(t=None, local_time_zone=False):
    """http://www.w3.org/TR/NOTE-datetime ex: 1997-07-16T19:20:30Z

    Format ``t`` (seconds since the epoch; the current time when ``None``)
    as ``YYYY-MM-DDThh:mm:ss`` followed by a time zone designator.

    With ``local_time_zone`` false (the default) the time is rendered in
    UTC and suffixed with ``Z``.  Otherwise it is rendered in the local
    zone and suffixed with the signed offset from UTC: ``-hh:mm`` for
    zones west of Greenwich, ``+hh:mm`` for zones east of it and for a
    zero offset.

    >>> date_time(1126482850)
    '2005-09-11T23:54:10Z'

    @@ this will change depending on where it is run
    #>>> date_time(1126482850, local_time_zone=True)
    #'2005-09-11T19:54:10-04:00'

    >>> date_time(1)
    '1970-01-01T00:00:01Z'

    >>> date_time(0)
    '1970-01-01T00:00:00Z'
    """
    if t is None:
        t = time()

    if local_time_zone:
        time_tuple = localtime(t)
        # Prefer the offset that localtime() computed for this very
        # instant (seconds *east* of UTC).  Where struct_time lacks
        # tm_gmtoff, fall back to the module constants, which count
        # seconds *west* of UTC (altzone applies while DST is active).
        seconds_east = getattr(time_tuple, "tm_gmtoff", None)
        if seconds_east is None:
            seconds_east = -(altzone if time_tuple[8] else timezone)
        offset_mins = seconds_east // 60
        # Emit the sign explicitly and format the magnitude, so zones
        # east of UTC give "+01:00" rather than a mangled "--1:00".
        sign_char = "-" if offset_mins < 0 else "+"
        hrs, mins = divmod(abs(offset_mins), 60)
        tzd = "%s%02d:%02d" % (sign_char, hrs, mins)
    else:
        time_tuple = gmtime(t)
        tzd = "Z"

    year, month, day, hh, mm, ss = time_tuple[:6]
    return "%04d-%02d-%02dT%02d:%02d:%02d%s" % (
        year, month, day, hh, mm, ss, tzd)
300 | |
301 | |
def parse_date_time(val):
    """always returns seconds in UTC

    ``val`` is ``YYYY-MM-DD`` optionally followed by ``Thh:mm:ss`` and a
    time zone designator: nothing or ``Z`` (both taken as UTC), or a
    signed offset written ``+hh:mm``, ``+hhmm`` or ``+hh`` (likewise with
    ``-``).  A bare date is read as midnight UTC.  The result is the
    integer number of seconds since the epoch.

    Raises ``ValueError`` when a field is not numeric or the time zone
    designator is malformed.

    # tests are written like this to make any errors easier to understand
    >>> parse_date_time('2005-09-11T23:54:10Z') - 1126482850.0
    0.0

    >>> parse_date_time('2005-09-11T16:54:10-07:00') - 1126482850.0
    0.0

    >>> parse_date_time('1970-01-01T00:00:01Z') - 1.0
    0.0

    >>> parse_date_time('1970-01-01T00:00:00Z') - 0.0
    0.0
    >>> parse_date_time("2005-09-05T10:42:00") - 1125916920.0
    0.0
    >>> parse_date_time('1970-01-01T00:00:00-00:30') - 1800.0
    0.0
    """

    if "T" not in val:
        val += "T00:00:00Z"

    ymd, clock = val.split("T")
    hms, tz_str = clock[0:8], clock[8:]

    if not tz_str or tz_str == "Z":
        tz_offset = 0
    else:
        # Take the direction from the sign character itself: deriving it
        # from int("-00") would lose the sign of offsets like "-00:30".
        direction = tz_str[0]
        digits = tz_str[1:].replace(":", "")
        if (direction not in ("+", "-") or len(digits) not in (2, 4)
                or not digits.isdigit()):
            raise ValueError(
                "Unrecognised time zone designator: %r" % tz_str)
        offset_secs = (int(digits[:2]) * 60 + int(digits[2:] or 0)) * 60
        # A zone ahead of UTC ("+") means UTC is earlier than the wall
        # clock time, so the offset is subtracted; "-" adds it.
        tz_offset = offset_secs if direction == "-" else -offset_secs

    year, month, day = ymd.split("-")
    hour, minute, second = hms.split(":")

    t = timegm((int(year), int(month), int(day), int(hour),
                int(minute), int(second), 0, 0, 0))
    return t + tz_offset
343 | |
344 | |
# Default lookup table for ``guess_format``: file-name suffix (lower
# case, without the leading dot) -> name of the RDF serialization.
SUFFIX_FORMAT_MAP = {
    "rdf": "xml",
    "rdfs": "xml",
    "owl": "xml",
    "n3": "n3",
    "ttl": "turtle",
    "nt": "nt",
    "trix": "trix",
    "xhtml": "rdfa",
    "html": "rdfa",
    "svg": "rdfa",
    "nq": "nquads",
    "trig": "trig",
}
359 | |
360 | |
def guess_format(fpath, fmap=None):
    """
    Guess the RDF serialization from the suffix of ``fpath``.

    The suffix is looked up in ``fmap``, or in ``SUFFIX_FORMAT_MAP`` when
    ``fmap`` is ``None`` or empty.  If the extension gives no match, the
    whole lower-cased ``fpath`` is tried as a key.  Returns ``None`` when
    neither lookup succeeds.  Examples:

    >>> guess_format('path/to/file.rdf')
    'xml'
    >>> guess_format('path/to/file.owl')
    'xml'
    >>> guess_format('path/to/file.ttl')
    'turtle'
    >>> guess_format('path/to/file.xhtml')
    'rdfa'
    >>> guess_format('path/to/file.svg')
    'rdfa'
    >>> guess_format('path/to/file.xhtml', {'xhtml': 'grddl'})
    'grddl'

    This also works with just the suffixes, with or without leading dot, and
    regardless of letter case::

    >>> guess_format('.rdf')
    'xml'
    >>> guess_format('rdf')
    'xml'
    >>> guess_format('RDF')
    'xml'
    """
    mapping = fmap if fmap else SUFFIX_FORMAT_MAP
    by_extension = mapping.get(_get_ext(fpath))
    if by_extension:
        return by_extension
    # No usable extension: maybe fpath itself is a bare suffix ("rdf").
    return mapping.get(fpath.lower())
391 | |
392 | |
393 def _get_ext(fpath, lower=True): | |
394 """ | |
395 Gets the file extension from a file(path); stripped of leading '.' and in | |
396 lower case. Examples: | |
397 | |
398 >>> _get_ext("path/to/file.txt") | |
399 'txt' | |
400 >>> _get_ext("OTHER.PDF") | |
401 'pdf' | |
402 >>> _get_ext("noext") | |
403 '' | |
404 >>> _get_ext(".rdf") | |
405 'rdf' | |
406 """ | |
407 ext = splitext(fpath)[-1] | |
408 if ext == '' and fpath.startswith("."): | |
409 ext = fpath | |
410 if lower: | |
411 ext = ext.lower() | |
412 if ext.startswith('.'): | |
413 ext = ext[1:] | |
414 return ext | |
415 | |
416 | |
def find_roots(graph, prop, roots=None):
    """
    Collect the roots of a transitive hierarchy spanned by ``prop``.

    find_roots(graph, rdflib.RDFS.subClassOf)
    will return a set of all roots of the sub-class hierarchy

    Triples are assumed to have the form (child, prop, parent), i.e. the
    direction of RDFS.subClassOf or SKOS.broader.

    If a ``roots`` set is passed in, it is updated in place and returned;
    otherwise a new set is created.
    """
    if roots is None:
        roots = set()

    seen_children = set()
    for child, parent in graph.subject_objects(prop):
        # Anything that occurs as a child cannot be a root ...
        seen_children.add(child)
        roots.discard(child)
        # ... and a parent is a candidate until it shows up as a child.
        if parent not in seen_children:
            roots.add(parent)
    return roots
439 | |
440 | |
def get_tree(graph,
             root,
             prop,
             mapper=lambda x: x,
             sortkey=None,
             done=None,
             dir='down'):
    """
    Return a nested ``(mapper(node), [subtrees])`` structure for the tree
    spanned by the transitive property ``prop``, starting at ``root``.

    i.e.

    get_tree(graph,
       rdflib.URIRef("http://xmlns.com/foaf/0.1/Person"),
       rdflib.RDFS.subClassOf)

    will return the structure for the subClassTree below person.

    dir='down' assumes triples of the form (child, prop, parent),
    i.e. the direction of RDFS.subClassOf or SKOS.broader.
    Any other dir traverses in the other direction.

    ``done`` is the set of nodes already visited; a node found in it
    yields ``None`` and is left out of its parent's subtree list.
    Subtrees are ordered with ``sorted(..., key=sortkey)``.
    """
    if done is None:
        done = set()
    if root in done:
        return None
    done.add(root)

    if dir == 'down':
        neighbours = graph.subjects(prop, root)
    else:
        neighbours = graph.objects(root, prop)

    # Recurse lazily, one neighbour at a time, dropping the ``None``
    # results that mark already-visited nodes.
    visited = (
        get_tree(graph, node, prop, mapper, sortkey, done, dir)
        for node in neighbours
    )
    subtrees = [subtree for subtree in visited if subtree]

    label = mapper(root)
    subtrees.sort(key=sortkey)
    return (label, subtrees)
484 | |
485 | |
def test():
    """Run ``doctest.testmod()``, which, called without arguments,
    checks the docstring examples of the ``__main__`` module."""
    from doctest import testmod
    testmod()
489 | |
490 | |
if __name__ == "__main__":
    # Run the doctests when this file is executed as a script.
    #
    # NOTE: an earlier attempt to make the tests work outside of the time
    # zone they were written in set os.environ['TZ'] = 'US/Pacific' and
    # then called time.tzset(), catching AttributeError.  It is disabled
    # because tzset can be missing; see
    # http://mail.python.org/pipermail/python-dev/2003-April/034480.html
    test()  # pragma: no cover