comparison env/lib/python3.9/site-packages/rdflib/util.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000 (2021-03-22)
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 """
2 Some utility functions.
3
4 Miscellaneous utilities
5
6 * list2set
7 * first
8 * uniq
9 * more_than
10
11 Term characterisation and generation
12
13 * to_term
14 * from_n3
15
16 Date/time utilities
17
18 * date_time
19 * parse_date_time
20
21 Statement and component type checkers
22
23 * check_context
24 * check_subject
25 * check_predicate
26 * check_object
27 * check_statement
28 * check_pattern
29
30 """
31 from __future__ import absolute_import
32 from __future__ import division
33 from __future__ import print_function
34
35 from calendar import timegm
36 from time import altzone
37 # from time import daylight
38 from time import gmtime
39 from time import localtime
40 from time import time
41 from time import timezone
42
43 from os.path import splitext
44
45 from rdflib.exceptions import ContextTypeError
46 from rdflib.exceptions import ObjectTypeError
47 from rdflib.exceptions import PredicateTypeError
48 from rdflib.exceptions import SubjectTypeError
49 from rdflib.graph import Graph
50 from rdflib.graph import QuotedGraph
51 from rdflib.namespace import Namespace
52 from rdflib.namespace import NamespaceManager
53 from rdflib.term import BNode
54 from rdflib.term import Literal
55 from rdflib.term import URIRef
56 from rdflib.compat import sign
57
58 __all__ = [
59 'list2set', 'first', 'uniq', 'more_than', 'to_term', 'from_n3',
60 'date_time', 'parse_date_time', 'check_context', 'check_subject',
61 'check_predicate', 'check_object', 'check_statement', 'check_pattern',
62 'guess_format', 'find_roots', 'get_tree']
63
64
def list2set(seq):
    """
    Return the items of ``seq`` as a new list with duplicates dropped.

    Only the first occurrence of each item is kept, so the relative order
    of the input is preserved (unlike ``set(seq)``).
    """
    seen = set()
    unique = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
72
73
def first(seq):
    """
    Return the first item obtained by iterating ``seq``, or ``None`` when
    ``seq`` yields nothing.

    For graphs, use ``graph.value`` instead.
    """
    return next(iter(seq), None)
82
83
def uniq(sequence, strip=0):
    """
    Return the distinct items of ``sequence`` as a set.

    When ``strip`` is truthy, ``.strip()`` is applied to every item before
    duplicates are collapsed.
    """
    if not strip:
        return set(sequence)
    return {item.strip() for item in sequence}
90
91
def more_than(sequence, number):
    """
    Return 1 if ``sequence`` yields more than ``number`` items, else 0.

    Iteration stops as soon as the count exceeds ``number``.
    """
    for count, _ in enumerate(sequence, 1):
        if count > number:
            return 1
    return 0
100
101
def to_term(s, default=None):
    """
    Create and return the Identifier whose kind is indicated by the shape
    of the string ``s``:

    ''    returns ``default`` (``None`` unless given)

    '<s>' returns ``URIRef(s)`` (i.e. without the angle brackets)

    '"s"' returns ``Literal(s)`` (i.e. without the double quotes)

    '_s'  returns ``BNode('_s')`` (the whole string, underscore included,
          is used as the node identifier)

    Any other non-empty string raises ``Exception``.
    """
    if not s:
        return default
    if s.startswith("<") and s.endswith(">"):
        return URIRef(s[1:-1])
    if s.startswith('"') and s.endswith('"'):
        return Literal(s[1:-1])
    if s.startswith("_"):
        return BNode(s)
    raise Exception("Unrecognised term syntax: '%s'" % s)
127
128
def from_n3(s, default=None, backend=None, nsm=None):
    r'''
    Build the Identifier denoted by the n3 string ``s``.

    An empty ``s`` yields ``default``.  ``backend`` is handed to the
    ``Graph``/``QuotedGraph`` created for ``[...]``/``{...}`` input, and
    ``nsm`` is the ``NamespaceManager`` used to expand prefixed names
    (a fresh default one is created when it is ``None``).

    >>> from_n3('<http://ex.com/foo>') == URIRef('http://ex.com/foo')
    True
    >>> from_n3('"foo"@de') == Literal('foo', lang='de')
    True
    >>> from_n3('"""multi\nline\nstring"""@en') == Literal(
    ...     'multi\nline\nstring', lang='en')
    True
    >>> from_n3('42') == Literal(42)
    True
    >>> from_n3(Literal(42).n3()) == Literal(42)
    True
    >>> from_n3('"42"^^xsd:integer') == Literal(42)
    True
    >>> from rdflib import RDFS
    >>> from_n3('rdfs:label') == RDFS['label']
    True
    >>> nsm = NamespaceManager(Graph())
    >>> nsm.bind('dbpedia', 'http://dbpedia.org/resource/')
    >>> berlin = URIRef('http://dbpedia.org/resource/Berlin')
    >>> from_n3('dbpedia:Berlin', nsm=nsm) == berlin
    True

    '''
    if not s:
        return default

    if s.startswith('<'):
        # Hack: the encode/decode round trip is meant to cope with both
        # native unicode characters and \u1234 style escapes.
        iri = s[1:-1]
        return URIRef(iri.encode("raw-unicode-escape").decode("unicode-escape"))

    if s.startswith('"'):
        quotes = '"""' if s.startswith('"""') else '"'
        # Everything after the last closing quote is the lang-tag/datatype.
        body, suffix = s.rsplit(quotes, 1)
        body = body[len(quotes):]  # drop the opening quotes
        datatype = None
        language = None

        # A datatype overrules a lang-tag, so look for it first.  It has to
        # come after any lang-tag, hence everything before '^^' is ignored;
        # see http://www.w3.org/TR/2011/WD-turtle-20110809/
        #     #prod-turtle2-RDFLiteral
        marker = suffix.rfind('^^')
        if marker >= 0:
            datatype = from_n3(suffix[marker + 2:], default, backend, nsm)
        elif suffix.startswith("@"):
            language = suffix[1:]  # drop the leading at sign

        body = body.replace(r'\"', '"')
        # Hack: same unicode round trip as for IRIs above.
        body = body.encode("raw-unicode-escape").decode("unicode-escape")
        return Literal(body, language, datatype)

    if s in ('true', 'false'):
        return Literal(s == 'true')

    if s.isdigit():
        return Literal(int(s))

    if s.startswith('{'):
        return QuotedGraph(backend, from_n3(s[1:-1]))

    if s.startswith('['):
        return Graph(backend, from_n3(s[1:-1]))

    if s.startswith("_:"):
        return BNode(s[2:])

    if ':' in s:
        if nsm is None:
            # instantiate default NamespaceManager and rely on its defaults
            nsm = NamespaceManager(Graph())
        prefix, local_name = s.split(':', 1)
        namespace = dict(nsm.namespaces())[prefix]
        return Namespace(namespace)[local_name]

    return BNode(s)
210
211
def check_context(c):
    """Raise ``ContextTypeError`` unless ``c`` is a ``URIRef`` or a ``BNode``."""
    if isinstance(c, (URIRef, BNode)):
        return
    raise ContextTypeError("%s:%s" % (c, type(c)))
216
217
def check_subject(s):
    """Raise ``SubjectTypeError`` unless ``s`` is a ``URIRef`` or a ``BNode``."""
    if isinstance(s, (URIRef, BNode)):
        return
    raise SubjectTypeError(s)
222
223
def check_predicate(p):
    """Raise ``PredicateTypeError`` unless ``p`` is a ``URIRef``."""
    if isinstance(p, URIRef):
        return
    raise PredicateTypeError(p)
228
229
def check_object(o):
    """
    Raise ``ObjectTypeError`` unless ``o`` is a ``URIRef``, a ``Literal``
    or a ``BNode``.
    """
    if isinstance(o, (URIRef, Literal, BNode)):
        return
    raise ObjectTypeError(o)
236
237
def check_statement(triple):
    """
    Validate the terms of a ``(subject, predicate, object)`` triple.

    The terms are checked in that order and the first offending one raises
    ``SubjectTypeError``, ``PredicateTypeError`` or ``ObjectTypeError``
    respectively.
    """
    subject, predicate, obj = triple

    if not isinstance(subject, (URIRef, BNode)):
        raise SubjectTypeError(subject)

    if not isinstance(predicate, URIRef):
        raise PredicateTypeError(predicate)

    if not isinstance(obj, (URIRef, Literal, BNode)):
        raise ObjectTypeError(obj)
250
251
def check_pattern(triple):
    """
    Validate the terms of a ``(subject, predicate, object)`` pattern.

    Works like a statement check, except that a falsy term (such as
    ``None``) is accepted in any position without being type checked.
    """
    subject, predicate, obj = triple

    if subject and not isinstance(subject, (URIRef, BNode)):
        raise SubjectTypeError(subject)

    if predicate and not isinstance(predicate, URIRef):
        raise PredicateTypeError(predicate)

    if obj and not isinstance(obj, (URIRef, Literal, BNode)):
        raise ObjectTypeError(obj)
264
265
def date_time(t=None, local_time_zone=False):
    """http://www.w3.org/TR/NOTE-datetime ex: 1997-07-16T19:20:30Z

    Format ``t`` (seconds since the epoch, defaulting to the current time)
    as ``YYYY-MM-DDThh:mm:ss`` followed by a time zone designator.

    With ``local_time_zone`` false (the default) the time is rendered in
    UTC and suffixed with ``Z``.  Otherwise it is rendered in the local
    time zone of the process and suffixed with the local UTC offset as
    ``+hh:mm`` (east of UTC, or UTC itself) or ``-hh:mm`` (west of UTC).

    >>> date_time(1126482850)
    '2005-09-11T23:54:10Z'

    The local-time result depends on where the code is run; where the
    local offset is UTC-4, ``date_time(1126482850, local_time_zone=True)``
    gives ``'2005-09-11T19:54:10-04:00'``.

    >>> date_time(1)
    '1970-01-01T00:00:01Z'

    >>> date_time(0)
    '1970-01-01T00:00:00Z'
    """
    if t is None:
        t = time()

    if local_time_zone:
        time_tuple = localtime(t)
        # ``timezone``/``altzone`` count seconds *west* of UTC, so a positive
        # value corresponds to a negative designator and vice versa.
        # Index 8 of the time tuple is the DST flag.
        west_secs = altzone if time_tuple[8] else timezone
        offset_sign = "-" if west_secs > 0 else "+"
        # Split the absolute offset; flooring a negative minute count
        # would produce wrong hour/minute values.
        offset_hrs, offset_mins = divmod(abs(west_secs) // 60, 60)
        tzd = "%s%02d:%02d" % (offset_sign, offset_hrs, offset_mins)
    else:
        time_tuple = gmtime(t)
        tzd = "Z"

    year, month, day, hh, mm, ss = time_tuple[:6]
    return "%04d-%02d-%02dT%02d:%02d:%02d%s" % (
        year, month, day, hh, mm, ss, tzd)
300
301
def parse_date_time(val):
    """always returns seconds in UTC

    ``val`` is either a date (``YYYY-MM-DD``, taken as midnight UTC) or a
    date and time ``YYYY-MM-DDThh:mm:ss`` optionally followed by ``Z`` or
    an offset of the form ``+hh:mm``/``-hh:mm``.  A value without any time
    zone designator is treated as UTC.

    # tests are written like this to make any errors easier to understand
    >>> parse_date_time('2005-09-11T23:54:10Z') - 1126482850.0
    0.0

    >>> parse_date_time('2005-09-11T16:54:10-07:00') - 1126482850.0
    0.0

    >>> parse_date_time('1970-01-01T00:00:01Z') - 1.0
    0.0

    >>> parse_date_time('1970-01-01T00:00:00Z') - 0.0
    0.0
    >>> parse_date_time("2005-09-05T10:42:00") - 1125916920.0
    0.0
    >>> parse_date_time('1970-01-01T00:30:00+00:30') - 0.0
    0.0
    """
    if "T" not in val:
        val += "T00:00:00Z"

    ymd, clock = val.split("T")
    hms, tz_str = clock[0:8], clock[8:]

    if not tz_str or tz_str == "Z":
        tz_offset = 0
    else:
        # Take the direction from the sign character rather than from the
        # parsed hours: int("-00") is 0, which would drop the minutes of
        # offsets such as "-00:30".
        direction = -1 if tz_str[0] == "-" else 1
        hrs = abs(int(tz_str[:3]))
        mins = int(tz_str[4:6])
        # Local time is ahead of UTC for positive offsets, so subtract.
        tz_offset = -direction * (hrs * 60 + mins) * 60

    year, month, day = ymd.split("-")
    hour, minute, second = hms.split(":")

    t = timegm((int(year), int(month), int(day), int(hour),
                int(minute), int(second), 0, 0, 0))
    return t + tz_offset
343
344
# Maps a lower-case file suffix (without the leading dot) to the name of
# the RDF serialization format guessed for it.
SUFFIX_FORMAT_MAP = {
    'rdf': 'xml', 'rdfs': 'xml', 'owl': 'xml',
    'n3': 'n3',
    'ttl': 'turtle',
    'nt': 'nt',
    'trix': 'trix',
    'xhtml': 'rdfa', 'html': 'rdfa', 'svg': 'rdfa',
    'nq': 'nquads',
    'trig': 'trig',
}
359
360
def guess_format(fpath, fmap=None):
    """
    Guess the RDF serialization of ``fpath`` from its file suffix.

    The suffix is looked up in ``fmap`` or, when ``fmap`` is missing or
    empty, in ``SUFFIX_FORMAT_MAP``.  ``None`` is returned when nothing
    matches.  Examples:

        >>> guess_format('path/to/file.rdf')
        'xml'
        >>> guess_format('path/to/file.owl')
        'xml'
        >>> guess_format('path/to/file.ttl')
        'turtle'
        >>> guess_format('path/to/file.xhtml')
        'rdfa'
        >>> guess_format('path/to/file.svg')
        'rdfa'
        >>> guess_format('path/to/file.xhtml', {'xhtml': 'grddl'})
        'grddl'

    This also works with just the suffixes, with or without leading dot, and
    regardless of letter case::

        >>> guess_format('.rdf')
        'xml'
        >>> guess_format('rdf')
        'xml'
        >>> guess_format('RDF')
        'xml'
    """
    suffix_map = fmap or SUFFIX_FORMAT_MAP
    by_extension = suffix_map.get(_get_ext(fpath))
    if by_extension:
        return by_extension
    # Fall back to treating the whole argument as a bare suffix.
    return suffix_map.get(fpath.lower())
391
392
393 def _get_ext(fpath, lower=True):
394 """
395 Gets the file extension from a file(path); stripped of leading '.' and in
396 lower case. Examples:
397
398 >>> _get_ext("path/to/file.txt")
399 'txt'
400 >>> _get_ext("OTHER.PDF")
401 'pdf'
402 >>> _get_ext("noext")
403 ''
404 >>> _get_ext(".rdf")
405 'rdf'
406 """
407 ext = splitext(fpath)[-1]
408 if ext == '' and fpath.startswith("."):
409 ext = fpath
410 if lower:
411 ext = ext.lower()
412 if ext.startswith('.'):
413 ext = ext[1:]
414 return ext
415
416
def find_roots(graph, prop, roots=None):
    """
    Find the roots in some sort of transitive hierarchy.

    find_roots(graph, rdflib.RDFS.subClassOf)
    will return a set of all roots of the sub-class hierarchy

    Assumes triples of the form (child, prop, parent), i.e. the direction
    of RDFS.subClassOf or SKOS.broader.

    If a ``roots`` set is passed in, it is updated in place and returned;
    otherwise a new set is created.
    """
    if roots is None:
        roots = set()
    children = set()
    for child, parent in graph.subject_objects(prop):
        # Anything seen as a child cannot be a root.
        children.add(child)
        roots.discard(child)
        if parent not in children:
            roots.add(parent)
    return roots
439
440
def get_tree(graph,
             root,
             prop,
             mapper=lambda x: x,
             sortkey=None,
             done=None,
             dir='down'):
    """
    Return a nested ``(mapper(node), [subtrees])`` structure representing
    the tree built by the transitive property given, starting from the
    root given

    i.e.

    get_tree(graph,
       rdflib.URIRef("http://xmlns.com/foaf/0.1/Person"),
       rdflib.RDFS.subClassOf)

    will return the structure for the subClassTree below person.

    dir='down' assumes triples of the form (child, prop, parent),
    i.e. the direction of RDFS.subClassOf or SKOS.broader.
    Any other dir traverses in the other direction.

    ``done`` is the set of nodes already visited; a node found in it
    yields ``None`` and is left out of its parent's subtree list.  The
    subtrees of every node are sorted using ``sortkey``.
    """
    visited = set() if done is None else done
    if root in visited:
        return None
    visited.add(root)

    neighbours = (graph.subjects(prop, root) if dir == 'down'
                  else graph.objects(root, prop))
    candidates = (
        get_tree(graph, node, prop, mapper, sortkey, visited, dir)
        for node in neighbours)
    subtrees = [subtree for subtree in candidates if subtree]

    return (mapper(root), sorted(subtrees, key=sortkey))
484
485
def test():
    """Run the doctests found in this module's docstrings."""
    from doctest import testmod
    testmod()
489
490
if __name__ == "__main__":
    # Some doctests depend on the time zone they were written in.  An
    # earlier, disabled attempt to make them portable set
    # os.environ['TZ'] = 'US/Pacific' and called time.tzset(), but tzset
    # is missing on some platforms; see
    # http://mail.python.org/pipermail/python-dev/2003-April/034480.html
    test()  # pragma: no cover