comparison env/lib/python3.7/site-packages/rdflib/util.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 """
2 Some utility functions.
3
4 Miscellaneous utilities
5
6 * list2set
7 * first
8 * uniq
9 * more_than
10
11 Term characterisation and generation
12
13 * to_term
14 * from_n3
15
16 Date/time utilities
17
18 * date_time
19 * parse_date_time
20
21 Statement and component type checkers
22
23 * check_context
24 * check_subject
25 * check_predicate
26 * check_object
27 * check_statement
28 * check_pattern
29
30 """
31
32 from calendar import timegm
33 from time import altzone
34 # from time import daylight
35 from time import gmtime
36 from time import localtime
37 from time import time
38 from time import timezone
39
40 from os.path import splitext
41 from io import StringIO
42
43 from rdflib.exceptions import ContextTypeError
44 from rdflib.exceptions import ObjectTypeError
45 from rdflib.exceptions import PredicateTypeError
46 from rdflib.exceptions import SubjectTypeError
47 from rdflib.graph import Graph
48 from rdflib.graph import QuotedGraph
49 from rdflib.namespace import Namespace
50 from rdflib.namespace import NamespaceManager
51 from rdflib.term import BNode
52 from rdflib.term import Literal
53 from rdflib.term import URIRef
54 from rdflib.py3compat import sign
55
# Public names exported by a star-import of this module.
__all__ = [
    'list2set', 'first', 'uniq', 'more_than', 'to_term', 'from_n3',
    'date_time', 'parse_date_time', 'check_context', 'check_subject',
    'check_predicate', 'check_object', 'check_statement', 'check_pattern',
    'guess_format', 'find_roots', 'get_tree']
61
62
def list2set(seq):
    """
    Build a duplicate-free list from ``seq``.

    Only the first occurrence of each item is kept, and the items stay in
    the order in which they were first encountered (which ``set(seq)``
    would not guarantee).  Items must be hashable.
    """
    unique_items = []
    already_seen = set()
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items
70
71
def first(seq):
    """
    Return the first item produced by iterating over ``seq``.

    ``None`` is returned when ``seq`` yields nothing.
    For graphs, use graph.value instead.
    """
    return next(iter(seq), None)
80
81
def uniq(sequence, strip=0):
    """
    Return the distinct strings of ``sequence`` as a set.

    When ``strip`` is true, each string has its surrounding whitespace
    removed before duplicates are collapsed.
    """
    if not strip:
        return set(sequence)
    return {entry.strip() for entry in sequence}
88
89
def more_than(sequence, number):
    """
    Return 1 if ``sequence`` yields more than ``number`` items, else 0.

    Iteration stops as soon as the threshold is exceeded, so the sequence
    is not necessarily consumed to the end.
    """
    for seen_so_far, _ in enumerate(sequence, 1):
        if seen_so_far > number:
            return 1
    return 0
98
99
def to_term(s, default=None):
    """
    Build the term whose kind is implied by the shape of the string ``s``.

    * an empty (falsy) ``s`` yields ``default`` (``None`` unless given)
    * ``'<x>'`` yields ``URIRef('x')`` -- the angle brackets are removed
    * ``'"x"'`` yields ``Literal('x')`` -- the double quotes are removed
    * a string starting with ``'_'`` yields ``BNode(s)``; the string is
      passed through unchanged, leading underscore included

    Any other string raises ``Exception`` with an "Unrecognised term
    syntax" message.
    """
    if not s:
        return default

    # Delimited forms: drop the first and the last character.
    if s.startswith("<") and s.endswith(">"):
        return URIRef(s[1:-1])
    if s.startswith('"') and s.endswith('"'):
        return Literal(s[1:-1])

    if s.startswith("_"):
        return BNode(s)

    raise Exception("Unrecognised term syntax: '%s'" % s)
125
126
def from_n3(s, default=None, backend=None, nsm=None):
    r'''
    Creates the Identifier corresponding to the given n3 string.

    >>> from_n3('<http://ex.com/foo>') == URIRef('http://ex.com/foo')
    True
    >>> from_n3('"foo"@de') == Literal('foo', lang='de')
    True
    >>> from_n3('"""multi\nline\nstring"""@en') == Literal(
    ...     'multi\nline\nstring', lang='en')
    True
    >>> from_n3('42') == Literal(42)
    True
    >>> from_n3(Literal(42).n3()) == Literal(42)
    True
    >>> from_n3('"42"^^xsd:integer') == Literal(42)
    True
    >>> from rdflib import RDFS
    >>> from_n3('rdfs:label') == RDFS['label']
    True
    >>> nsm = NamespaceManager(Graph())
    >>> nsm.bind('dbpedia', 'http://dbpedia.org/resource/')
    >>> berlin = URIRef('http://dbpedia.org/resource/Berlin')
    >>> from_n3('dbpedia:Berlin', nsm=nsm) == berlin
    True

    :param s: the n3 string to convert
    :param default: value returned when ``s`` is empty (or otherwise falsy)
    :param backend: first argument handed to ``Graph`` / ``QuotedGraph``
        when ``s`` is a ``[...]`` or ``{...}`` form
    :param nsm: ``NamespaceManager`` used to expand ``prefix:name`` strings;
        a fresh ``NamespaceManager(Graph())`` is used when omitted
    :raises KeyError: if ``s`` is a ``prefix:name`` string whose prefix is
        not bound in the namespace manager

    Only unsigned all-digit strings are turned into integer literals;
    anything that matches none of the recognised forms becomes a ``BNode``.
    '''
    if not s:
        return default
    if s.startswith('<'):
        return URIRef(s[1:-1])
    elif s.startswith('"'):
        quotes = '"""' if s.startswith('"""') else '"'
        # Split on the *last* closing quote so quotes inside the value stay.
        value, rest = s.rsplit(quotes, 1)
        value = value[len(quotes):]  # strip leading quotes
        datatype = None
        language = None

        # as a given datatype overrules lang-tag check for it first
        dtoffset = rest.rfind('^^')
        if dtoffset >= 0:
            # found a datatype
            # datatype has to come after lang-tag so ignore everything before
            # see: http://www.w3.org/TR/2011/WD-turtle-20110809/
            # #prod-turtle2-RDFLiteral
            datatype = from_n3(rest[dtoffset + 2:], default, backend, nsm)
        elif rest.startswith("@"):
            language = rest[1:]  # strip leading at sign

        value = value.replace(r'\"', '"')
        # Hack: this should correctly handle strings with either native unicode
        # characters, or \u1234 unicode escapes.
        value = value.encode("raw-unicode-escape").decode("unicode-escape")
        return Literal(value, language, datatype)
    elif s == 'true' or s == 'false':
        return Literal(s == 'true')
    elif s.isdigit():
        return Literal(int(s))
    elif s.startswith('{'):
        # Forward the namespace manager so a prefixed identifier inside the
        # braces is expanded with the caller's bindings, exactly like the
        # datatype recursion above.
        identifier = from_n3(s[1:-1], nsm=nsm)
        return QuotedGraph(backend, identifier)
    elif s.startswith('['):
        identifier = from_n3(s[1:-1], nsm=nsm)
        return Graph(backend, identifier)
    elif s.startswith("_:"):
        return BNode(s[2:])
    elif ':' in s:
        if nsm is None:
            # instantiate default NamespaceManager and rely on its defaults
            nsm = NamespaceManager(Graph())
        prefix, last_part = s.split(':', 1)
        ns = dict(nsm.namespaces())[prefix]
        return Namespace(ns)[last_part]
    else:
        return BNode(s)
206
207
def check_context(c):
    """Raise ContextTypeError unless c is a URIRef or a BNode."""
    if isinstance(c, (URIRef, BNode)):
        return
    raise ContextTypeError("%s:%s" % (c, type(c)))
212
213
def check_subject(s):
    """Raise SubjectTypeError unless s is a URIRef or a BNode."""
    if isinstance(s, (URIRef, BNode)):
        return
    raise SubjectTypeError(s)
218
219
def check_predicate(p):
    """Raise PredicateTypeError unless p is a URIRef."""
    if isinstance(p, URIRef):
        return
    raise PredicateTypeError(p)
224
225
def check_object(o):
    """Raise ObjectTypeError unless o is a URIRef, a Literal or a BNode."""
    if isinstance(o, (URIRef, Literal, BNode)):
        return
    raise ObjectTypeError(o)
232
233
def check_statement(triple):
    """
    Validate the three components of a (subject, predicate, object) triple.

    The components are checked in order and the first offending one raises
    SubjectTypeError, PredicateTypeError or ObjectTypeError respectively.
    """
    subj, pred, obj = triple

    if not isinstance(subj, (URIRef, BNode)):
        raise SubjectTypeError(subj)

    if not isinstance(pred, URIRef):
        raise PredicateTypeError(pred)

    if not isinstance(obj, (URIRef, Literal, BNode)):
        raise ObjectTypeError(obj)
246
247
def check_pattern(triple):
    """
    Validate a (subject, predicate, object) triple pattern.

    Works like a statement check, except that any component which is falsy
    (e.g. ``None``) is skipped instead of being type-checked.
    """
    subj, pred, obj = triple

    if subj and not isinstance(subj, (URIRef, BNode)):
        raise SubjectTypeError(subj)

    if pred and not isinstance(pred, URIRef):
        raise PredicateTypeError(pred)

    if obj and not isinstance(obj, (URIRef, Literal, BNode)):
        raise ObjectTypeError(obj)
260
261
def date_time(t=None, local_time_zone=False):
    """http://www.w3.org/TR/NOTE-datetime ex: 1997-07-16T19:20:30Z

    Format a POSIX timestamp as ``YYYY-MM-DDThh:mm:ssTZD``.

    :param t: seconds since the epoch; the current time when ``None``
    :param local_time_zone: when true, render the local wall-clock time
        followed by a ``+hh:mm`` / ``-hh:mm`` designator; otherwise render
        UTC followed by ``Z``
    :returns: the formatted string

    >>> date_time(1126482850)
    '2005-09-11T23:54:10Z'

    @@ this will change depending on where it is run
    #>>> date_time(1126482850, local_time_zone=True)
    #'2005-09-11T19:54:10-04:00'

    >>> date_time(1)
    '1970-01-01T00:00:01Z'

    >>> date_time(0)
    '1970-01-01T00:00:00Z'
    """
    if t is None:
        t = time()

    if local_time_zone:
        time_tuple = localtime(t)
        # Offset of local time from UTC at instant ``t`` (positive east of
        # Greenwich).  Comparing the two broken-down times yields the right
        # sign and the right DST state for that instant, instead of relying
        # on process-wide constants and a hard-coded "-".
        offset_secs = timegm(time_tuple) - timegm(gmtime(t))
        tz_sign = "-" if offset_secs < 0 else "+"
        tz_hours, tz_mins = divmod(abs(offset_secs) // 60, 60)
        tzd = "%s%02d:%02d" % (tz_sign, tz_hours, tz_mins)
    else:
        time_tuple = gmtime(t)
        tzd = "Z"

    year, month, day, hh, mm, ss = time_tuple[:6]
    return "%04d-%02d-%02dT%02d:%02d:%02d%s" % (
        year, month, day, hh, mm, ss, tzd)
296
297
def parse_date_time(val):
    """always returns seconds in UTC

    Parse ``YYYY-MM-DD`` or ``YYYY-MM-DDThh:mm:ss`` optionally followed by
    ``Z`` or a ``+hh:mm`` / ``-hh:mm`` zone designator, and return the
    matching number of seconds since the epoch.  A value without a time
    part is taken as midnight UTC; a time without a designator is taken
    as UTC.

    # tests are written like this to make any errors easier to understand
    >>> parse_date_time('2005-09-11T23:54:10Z') - 1126482850.0
    0.0

    >>> parse_date_time('2005-09-11T16:54:10-07:00') - 1126482850.0
    0.0

    >>> parse_date_time('1970-01-01T00:00:01Z') - 1.0
    0.0

    >>> parse_date_time('1970-01-01T00:00:00Z') - 0.0
    0.0
    >>> parse_date_time("2005-09-05T10:42:00") - 1125916920.0
    0.0
    >>> parse_date_time('1970-01-01T00:00:00-00:30') - 1800.0
    0.0
    """

    if "T" not in val:
        val += "T00:00:00Z"

    ymd, time_part = val.split("T")
    # "hh:mm:ss" is always eight characters; the remainder is the zone.
    hms, tz_str = time_part[:8], time_part[8:]

    if not tz_str or tz_str == "Z":
        tz_offset = 0
    else:
        # Take the direction from the sign character itself: the numeric
        # value of "-00" is 0 and would lose the sign of offsets whose hour
        # part is zero (e.g. "-00:30").
        direction = -1 if tz_str[0] == "-" else 1
        hrs = abs(int(tz_str[:3]))
        mins = int(tz_str[4:6])
        # Local time is ahead of UTC for positive offsets, so subtract.
        tz_offset = -direction * (hrs * 60 + mins) * 60

    year, month, day = ymd.split("-")
    hour, minute, second = hms.split(":")

    t = timegm((int(year), int(month), int(day), int(hour),
                int(minute), int(second), 0, 0, 0))
    return t + tz_offset
339
340
341
342
343
# Lower-case file suffix (without the dot) -> RDF serialization format name.
SUFFIX_FORMAT_MAP = {
    # RDF/XML family
    "rdf": "xml",
    "rdfs": "xml",
    "owl": "xml",
    # Notation3 / Turtle / N-Triples
    "n3": "n3",
    "ttl": "turtle",
    "nt": "nt",
    "trix": "trix",
    # markup with embedded RDFa
    "xhtml": "rdfa",
    "html": "rdfa",
    "svg": "rdfa",
    # quad formats
    "nq": "nquads",
    "trig": "trig",
}
358
359
def guess_format(fpath, fmap=None):
    """
    Guess the RDF serialization of ``fpath`` from its file suffix.

    The suffix is looked up in ``fmap`` or, when ``fmap`` is omitted or
    empty, in ``SUFFIX_FORMAT_MAP``.  ``None`` is returned for an unknown
    suffix.  Examples:

        >>> guess_format('path/to/file.rdf')
        'xml'
        >>> guess_format('path/to/file.owl')
        'xml'
        >>> guess_format('path/to/file.ttl')
        'turtle'
        >>> guess_format('path/to/file.xhtml')
        'rdfa'
        >>> guess_format('path/to/file.svg')
        'rdfa'
        >>> guess_format('path/to/file.xhtml', {'xhtml': 'grddl'})
        'grddl'

    A bare suffix is accepted too, with or without its leading dot and in
    any letter case::

        >>> guess_format('.rdf')
        'xml'
        >>> guess_format('rdf')
        'xml'
        >>> guess_format('RDF')
        'xml'
    """
    mapping = fmap or SUFFIX_FORMAT_MAP
    by_extension = mapping.get(_get_ext(fpath))
    if by_extension:
        return by_extension
    # No usable extension: treat the whole argument as the suffix.
    return mapping.get(fpath.lower())
390
391
392 def _get_ext(fpath, lower=True):
393 """
394 Gets the file extension from a file(path); stripped of leading '.' and in
395 lower case. Examples:
396
397 >>> _get_ext("path/to/file.txt")
398 'txt'
399 >>> _get_ext("OTHER.PDF")
400 'pdf'
401 >>> _get_ext("noext")
402 ''
403 >>> _get_ext(".rdf")
404 'rdf'
405 """
406 ext = splitext(fpath)[-1]
407 if ext == '' and fpath.startswith("."):
408 ext = fpath
409 if lower:
410 ext = ext.lower()
411 if ext.startswith('.'):
412 ext = ext[1:]
413 return ext
414
415
def find_roots(graph, prop, roots=None):
    """
    Collect the roots of a hierarchy expressed with ``prop``.

    find_roots(graph, rdflib.RDFS.subClassOf)
    will return a set of all roots of the sub-class hierarchy

    Triples are expected to read (child, prop, parent), i.e. the direction
    of RDFS.subClassOf or SKOS.broader.

    If a ``roots`` set is supplied it is updated in place and returned;
    otherwise a new set is created.
    """
    if roots is None:
        roots = set()

    seen_children = set()
    for child, parent in graph.subject_objects(prop):
        # Anything that occurs as a child cannot be a root.
        seen_children.add(child)
        if child in roots:
            roots.remove(child)
        # A parent is a candidate until it shows up as a child itself.
        if parent not in seen_children:
            roots.add(parent)
    return roots
438
439
def get_tree(graph,
             root,
             prop,
             mapper=lambda x: x,
             sortkey=None,
             done=None,
             dir='down'):
    """
    Build a nested ``(node, [subtrees])`` structure for the tree spanned
    by the transitive property ``prop``, starting at ``root``.

    i.e.

    get_tree(graph,
       rdflib.URIRef("http://xmlns.com/foaf/0.1/Person"),
       rdflib.RDFS.subClassOf)

    will return the structure for the subClassTree below person.

    Each node is passed through ``mapper`` and the subtrees of every node
    are sorted with ``sortkey``.  ``done`` holds the nodes already visited;
    a node that is reached again yields ``None`` and is left out.

    dir='down' assumes triple of the form (child, prop, parent),
    i.e. the direction of RDFS.subClassOf or SKOS.broader
    Any other dir traverses in the other direction
    """
    if done is None:
        done = set()
    if root in done:
        return None
    done.add(root)

    if dir == 'down':
        neighbours = graph.subjects(prop, root)
    else:
        neighbours = graph.objects(root, prop)

    # Recurse lazily, one neighbour at a time, dropping the empty results
    # produced by nodes that were already visited.
    candidates = (
        get_tree(graph, node, prop, mapper, sortkey, done, dir)
        for node in neighbours)
    subtrees = [subtree for subtree in candidates if subtree]

    return (mapper(root), sorted(subtrees, key=sortkey))
483
484
485
486
def test():
    """Run ``doctest.testmod()`` with its default arguments."""
    from doctest import testmod
    testmod()
490
if __name__ == "__main__":
    # Run the doctests when this file is executed as a script.
    #
    # A disabled experiment used to live here: it set TZ to 'US/Pacific'
    # and called time.tzset() to try to make the tests work outside of the
    # time zone they were written in.  tzset can be missing, see
    # http://mail.python.org/pipermail/python-dev/2003-April/034480.html
    test()  # pragma: no cover