comparison env/lib/python3.7/site-packages/schema_salad/ref_resolver.py @ 5:9b1c78e6ba9c draft default tip

"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author shellac
date Mon, 01 Jun 2020 08:59:25 -0400
parents 79f47841a781
children
from __future__ import absolute_import

import copy
import logging
import os
import re
import sys
import xml.sax
from io import open
from typing import Callable  # pylint: disable=unused-import
from typing import (
    Any,
    Dict,
    Iterable,
    List,
    MutableMapping,
    MutableSequence,
    Optional,
    Set,
    Tuple,
    TypeVar,
    Union,
    cast,
)

import requests
from cachecontrol.caches import FileCache
from cachecontrol.wrapper import CacheControl
from future.utils import raise_from
from rdflib.graph import Graph
from rdflib.namespace import OWL, RDF, RDFS
from rdflib.plugins.parsers.notation3 import BadSyntax
from six import StringIO, iteritems, string_types
from six.moves import range, urllib
from typing_extensions import Text  # pylint: disable=unused-import

from ruamel import yaml
from ruamel.yaml.comments import CommentedMap, CommentedSeq, LineCol

from .exceptions import ValidationException, SchemaSaladException
from .sourceline import SourceLine, add_lc_filename, relname
from .utils import aslist, onWindows

# move to a regular typing import when Python 3.3-3.6 is no longer supported


_logger = logging.getLogger("salad")
ContextType = Dict[Text, Union[Dict[Text, Any], Text, Iterable[Text]]]
DocumentType = TypeVar("DocumentType", CommentedSeq, CommentedMap)
DocumentOrStrType = TypeVar("DocumentOrStrType", CommentedSeq, CommentedMap, Text)

_re_drive = re.compile(r"/([a-zA-Z]):")


def file_uri(path, split_frag=False):  # type: (str, bool) -> str
    if path.startswith("file://"):
        return path
    if split_frag:
        pathsp = path.split("#", 2)
        frag = "#" + urllib.parse.quote(str(pathsp[1])) if len(pathsp) == 2 else ""
        urlpath = urllib.request.pathname2url(str(pathsp[0]))
    else:
        urlpath = urllib.request.pathname2url(path)
        frag = ""
    if urlpath.startswith("//"):
        return "file:{}{}".format(urlpath, frag)
    return "file://{}{}".format(urlpath, frag)
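
# An illustrative sketch of file_uri's behavior (hypothetical paths, shown
# doctest-style; the exact escaping comes from urllib's pathname2url):
#
#     >>> file_uri("/tmp/job.yml")
#     'file:///tmp/job.yml'
#     >>> file_uri("/tmp/job.yml#main", split_frag=True)
#     'file:///tmp/job.yml#main'
#     >>> file_uri("/tmp/job.yml#main")  # "#" is percent-encoded when not split out
#     'file:///tmp/job.yml%23main'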


def uri_file_path(url):  # type: (str) -> str
    split = urllib.parse.urlsplit(url)
    if split.scheme == "file":
        return urllib.request.url2pathname(str(split.path)) + (
            "#" + urllib.parse.unquote(str(split.fragment))
            if bool(split.fragment)
            else ""
        )
    raise ValidationException("Not a file URI: {}".format(url))


def to_validation_exception(
    e,
):  # type: (yaml.error.MarkedYAMLError) -> ValidationException
    fname_regex = re.compile(r"^file://" + re.escape(os.getcwd()) + "/")

    exc = ValidationException(e.problem)
    mark = e.problem_mark
    exc.file = re.sub(fname_regex, "", mark.name)
    exc.start = (mark.line + 1, mark.column + 1)
    exc.end = None

    if e.context:
        parent = ValidationException(e.context)
        mark = e.context_mark
        parent.file = re.sub(fname_regex, "", mark.name)
        parent.start = (mark.line + 1, mark.column + 1)
        parent.end = None
        parent.children = [exc]
        return parent
    else:
        return exc
103
104 class NormDict(CommentedMap):
105 """A Dict where all keys are normalized using the provided function."""
106
107 def __init__(self, normalize=Text): # type: (Callable[[Text], Text]) -> None
108 super(NormDict, self).__init__()
109 self.normalize = normalize
110
111 def __getitem__(self, key): # type: (Any) -> Any
112 return super(NormDict, self).__getitem__(self.normalize(key))
113
114 def __setitem__(self, key, value): # type: (Any, Any) -> Any
115 return super(NormDict, self).__setitem__(self.normalize(key), value)
116
117 def __delitem__(self, key): # type: (Any) -> Any
118 return super(NormDict, self).__delitem__(self.normalize(key))
119
120 def __contains__(self, key): # type: (Any) -> Any
121 return super(NormDict, self).__contains__(self.normalize(key))
122
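
# A minimal usage sketch of NormDict (illustrative): every key passes
# through the normalizer on each access, so lookups are insensitive to
# whatever differences the normalizer erases.
#
#     >>> nd = NormDict(lambda key: key.lower())
#     >>> nd["Foo"] = 1
#     >>> nd["foo"], "FOO" in nd
#     (1, True)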


def merge_properties(a, b):
    # type: (Dict[Any, Any], Dict[Any, Any]) -> Dict[Any, Any]
    c = {}
    for i in a:
        if i not in b:
            c[i] = a[i]
    for i in b:
        if i not in a:
            c[i] = b[i]
    for i in a:
        if i in b:
            c[i] = aslist(a[i]) + aslist(b[i])  # type: ignore

    return c


def SubLoader(loader):  # type: (Loader) -> Loader
    return Loader(
        loader.ctx,
        schemagraph=loader.graph,
        foreign_properties=loader.foreign_properties,
        idx=loader.idx,
        cache=loader.cache,
        fetcher_constructor=loader.fetcher_constructor,
        skip_schemas=loader.skip_schemas,
        url_fields=loader.url_fields,
        allow_attachments=loader.allow_attachments,
    )


class Fetcher(object):
    def fetch_text(self, url):  # type: (Text) -> Text
        raise NotImplementedError()

    def check_exists(self, url):  # type: (Text) -> bool
        raise NotImplementedError()

    def urljoin(self, base_url, url):  # type: (Text, Text) -> Text
        raise NotImplementedError()

    schemes = [u"file", u"http", u"https", u"mailto"]

    def supported_schemes(self):  # type: () -> List[Text]
        return self.schemes
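
# A hedged sketch of a custom Fetcher; the class and its canned documents
# are hypothetical, but the three-method contract is the one defined above,
# and Loader accepts such a class via its fetcher_constructor argument.
#
# class InMemoryFetcher(Fetcher):
#     def __init__(self, cache, session):  # same signature as DefaultFetcher
#         self.documents = {"mem://example": "hello: world"}
#
#     def fetch_text(self, url):
#         return self.documents[url]
#
#     def check_exists(self, url):
#         return url in self.documents
#
#     def urljoin(self, base_url, url):
#         return urllib.parse.urljoin(base_url, url)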


class DefaultFetcher(Fetcher):
    def __init__(
        self,
        cache,  # type: Dict[Text, Union[Text, bool]]
        session,  # type: Optional[requests.sessions.Session]
    ):  # type: (...) -> None
        self.cache = cache
        self.session = session

    def fetch_text(self, url):
        # type: (Text) -> Text
        if url in self.cache and self.cache[url] is not True:
            # Treat True as a placeholder that indicates something exists but
            # not necessarily what its contents are.
            return cast(Text, self.cache[url])

        split = urllib.parse.urlsplit(url)
        scheme, path = split.scheme, split.path

        if scheme in [u"http", u"https"] and self.session is not None:
            try:
                resp = self.session.get(url)
                resp.raise_for_status()
            except Exception as e:
                raise_from(
                    ValidationException("Error fetching {}: {}".format(url, e)), e
                )
            return resp.text
        if scheme == "file":
            try:
                # On Windows, url.path will be /drive:/path ; on Unix systems,
                # /path. As we want drive:/path instead of /drive:/path on
                # Windows, remove the leading /.
                if os.path.isabs(
                    path[1:]
                ):  # check whether the path is still valid after removing the leading "/"
                    path = path[1:]
                with open(
                    urllib.request.url2pathname(str(path)), encoding="utf-8"
                ) as fp:
                    return Text(fp.read())

            except (OSError, IOError) as err:
                if err.filename == path:
                    raise_from(ValidationException(Text(err)), err)
                else:
                    raise_from(
                        ValidationException("Error reading {}: {}".format(url, err)),
                        err,
                    )
        raise ValidationException("Unsupported scheme in url: {}".format(url))

    def check_exists(self, url):  # type: (Text) -> bool
        if url in self.cache:
            return True

        split = urllib.parse.urlsplit(url)
        scheme, path = split.scheme, split.path

        if scheme in [u"http", u"https"] and self.session is not None:
            try:
                resp = self.session.head(url)
                resp.raise_for_status()
            except Exception:
                return False
            self.cache[url] = True
            return True
        if scheme == "file":
            return os.path.exists(urllib.request.url2pathname(str(path)))
        if scheme == "mailto":
            return True
        raise ValidationException("Unsupported scheme in url: {}".format(url))

    def urljoin(self, base_url, url):  # type: (Text, Text) -> Text
        if url.startswith("_:"):
            return url

        basesplit = urllib.parse.urlsplit(base_url)
        split = urllib.parse.urlsplit(url)
        if basesplit.scheme and basesplit.scheme != "file" and split.scheme == "file":
            raise ValidationException(
                "Not resolving potential remote exploit {} from base {}".format(
                    url, base_url
                )
            )

        if sys.platform == "win32":
            if base_url == url:
                return url
            basesplit = urllib.parse.urlsplit(base_url)
            # note that below might split
            # "C:" with "C" as URI scheme
            split = urllib.parse.urlsplit(url)

            has_drive = split.scheme and len(split.scheme) == 1

            if basesplit.scheme == "file":
                # Special handling of relative file references on Windows
                # as urllib seems to not be quite up to the job

                # netloc MIGHT appear in equivalents of UNC Strings
                # \\server1.example.com\path as
                # file:///server1.example.com/path
                # https://tools.ietf.org/html/rfc8089#appendix-E.3.2
                # (TODO: test this)
                netloc = split.netloc or basesplit.netloc

                # Check if url is a local path like "C:/Users/fred"
                # or actually an absolute URI like http://example.com/fred
                if has_drive:
                    # Assume split.scheme is actually a drive, e.g. "C:"
                    # so we'll recombine into a path
                    path_with_drive = urllib.parse.urlunsplit(
                        (split.scheme, "", split.path, "", "")
                    )
                    # Compose new file:/// URI with path_with_drive
                    # .. carrying over any #fragment (?query just in case..)
                    return urllib.parse.urlunsplit(
                        ("file", netloc, path_with_drive, split.query, split.fragment)
                    )
                if (
                    not split.scheme
                    and not netloc
                    and split.path
                    and split.path.startswith("/")
                ):
                    # Relative - but does it have a drive?
                    base_drive = _re_drive.match(basesplit.path)
                    drive = _re_drive.match(split.path)
                    if base_drive and not drive:
                        # Keep drive letter from base_url
                        # https://tools.ietf.org/html/rfc8089#appendix-E.2.1
                        # e.g. urljoin("file:///D:/bar/a.txt", "/foo/b.txt")
                        # == file:///D:/foo/b.txt
                        path_with_drive = "/{}:{}".format(
                            base_drive.group(1), split.path
                        )
                        return urllib.parse.urlunsplit(
                            (
                                "file",
                                netloc,
                                path_with_drive,
                                split.query,
                                split.fragment,
                            )
                        )

                # else: fall-through to resolve as relative URI
            elif has_drive:
                # Base is http://something but url is C:/something - which urllib
                # would wrongly resolve as an absolute path that could later be used
                # to access local files
                raise ValidationException(
                    "Not resolving potential remote exploit {} from base {}".format(
                        url, base_url
                    )
                )

        return urllib.parse.urljoin(base_url, url)
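
# Worked examples of the branches above (doctest-style sketch with
# hypothetical URLs; the first two illustrate the Windows-only handling,
# so they apply when sys.platform == "win32"):
#
#     >>> f = DefaultFetcher({}, None)
#     >>> f.urljoin("file:///C:/Users/fred/foo.cwl", "soup.cwl")
#     'file:///C:/Users/fred/soup.cwl'
#     >>> f.urljoin("file:///D:/bar/a.txt", "/foo/b.txt")  # keeps the base drive
#     'file:///D:/foo/b.txt'
#     >>> f.urljoin("http://example.com/base", "file:///C:/secret.txt")  # any platform
#     Traceback (most recent call last):
#     ...
#     ValidationException: Not resolving potential remote exploit ...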


idx_type = Dict[Text, Union[CommentedMap, CommentedSeq, Text, None]]
fetcher_sig = Callable[
    [Dict[Text, Union[Text, bool]], requests.sessions.Session], Fetcher
]
attachements_sig = Callable[[Union[CommentedMap, CommentedSeq]], bool]


class Loader(object):
    def __init__(
        self,
        ctx,  # type: ContextType
        schemagraph=None,  # type: Optional[Graph]
        foreign_properties=None,  # type: Optional[Set[Text]]
        idx=None,  # type: Optional[idx_type]
        cache=None,  # type: Optional[Dict[Text, Any]]
        session=None,  # type: Optional[requests.sessions.Session]
        fetcher_constructor=None,  # type: Optional[fetcher_sig]
        skip_schemas=None,  # type: Optional[bool]
        url_fields=None,  # type: Optional[Set[Text]]
        allow_attachments=None,  # type: Optional[attachements_sig]
    ):
        # type: (...) -> None

        if idx is not None:
            self.idx = idx
        else:
            self.idx = NormDict(lambda url: urllib.parse.urlsplit(url).geturl())

        self.ctx = {}  # type: ContextType
        if schemagraph is not None:
            self.graph = schemagraph
        else:
            self.graph = Graph()

        if foreign_properties is not None:
            self.foreign_properties = set(foreign_properties)
        else:
            self.foreign_properties = set()

        if cache is not None:
            self.cache = cache
        else:
            self.cache = {}

        if skip_schemas is not None:
            self.skip_schemas = skip_schemas
        else:
            self.skip_schemas = False

        if session is None:
            if "HOME" in os.environ:
                self.session = CacheControl(
                    requests.Session(),
                    cache=FileCache(
                        os.path.join(os.environ["HOME"], ".cache", "salad")
                    ),
                )
            elif "TMP" in os.environ:
                self.session = CacheControl(
                    requests.Session(),
                    cache=FileCache(os.path.join(os.environ["TMP"], ".cache", "salad")),
                )
            else:
                self.session = CacheControl(
                    requests.Session(),
                    cache=FileCache(os.path.join("/tmp", ".cache", "salad")),
                )
        else:
            self.session = session

        if fetcher_constructor is not None:
            self.fetcher_constructor = fetcher_constructor
        else:
            self.fetcher_constructor = DefaultFetcher
        self.fetcher = self.fetcher_constructor(self.cache, self.session)
        self.fetch_text = self.fetcher.fetch_text
        self.check_exists = self.fetcher.check_exists

        if url_fields is None:
            self.url_fields = set()  # type: Set[Text]
        else:
            self.url_fields = set(url_fields)

        self.scoped_ref_fields = {}  # type: Dict[Text, int]
        self.vocab_fields = set()  # type: Set[Text]
        self.identifiers = []  # type: List[Text]
        self.identity_links = set()  # type: Set[Text]
        self.standalone = None  # type: Optional[Set[Text]]
        self.nolinkcheck = set()  # type: Set[Text]
        self.vocab = {}  # type: Dict[Text, Text]
        self.rvocab = {}  # type: Dict[Text, Text]
        self.idmap = {}  # type: Dict[Text, Any]
        self.mapPredicate = {}  # type: Dict[Text, Text]
        self.type_dsl_fields = set()  # type: Set[Text]
        self.subscopes = {}  # type: Dict[Text, Text]
        self.secondaryFile_dsl_fields = set()  # type: Set[Text]
        self.allow_attachments = allow_attachments

        self.add_context(ctx)

    def expand_url(
        self,
        url,  # type: Text
        base_url,  # type: Text
        scoped_id=False,  # type: bool
        vocab_term=False,  # type: bool
        scoped_ref=None,  # type: Optional[int]
    ):
        # type: (...) -> Text
        if url in (u"@id", u"@type") or url is None:
            return url

        if vocab_term and url in self.vocab:
            return url

        if url.startswith("_:"):
            return url

        if bool(self.vocab) and u":" in url:
            prefix = url.split(u":")[0]
            if prefix in self.vocab:
                url = self.vocab[prefix] + url[len(prefix) + 1 :]
            elif prefix not in self.fetcher.supported_schemes():
                _logger.warning(
                    "URI prefix '%s' of '%s' not recognized, are you missing a "
                    "$namespaces section?",
                    prefix,
                    url,
                )

        split = urllib.parse.urlsplit(url)

        if (
            (bool(split.scheme) and split.scheme in [u"http", u"https", u"file"])
            or url.startswith(u"$(")
            or url.startswith(u"${")
        ):
            pass
        elif scoped_id and not bool(split.fragment):
            splitbase = urllib.parse.urlsplit(base_url)
            frg = u""
            if bool(splitbase.fragment):
                frg = splitbase.fragment + u"/" + split.path
            else:
                frg = split.path
            pt = splitbase.path if splitbase.path != "" else "/"
            url = urllib.parse.urlunsplit(
                (splitbase.scheme, splitbase.netloc, pt, splitbase.query, frg)
            )
        elif scoped_ref is not None and not split.fragment:
            pass
        else:
            url = self.fetcher.urljoin(base_url, url)

        if vocab_term and url in self.rvocab:
            return self.rvocab[url]
        else:
            return url
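
    # An illustrative sketch of expand_url (hypothetical namespace and paths,
    # doctest-style):
    #
    #     >>> ldr = Loader({"edam": "http://edamontology.org/"})
    #     >>> ldr.expand_url("edam:format_2330", "")
    #     'http://edamontology.org/format_2330'
    #     >>> ldr.expand_url("#step1", "file:///tmp/wf.cwl", scoped_id=True)
    #     'file:///tmp/wf.cwl#step1'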

    def _add_properties(self, s):  # type: (Text) -> None
        for _, _, rng in self.graph.triples((s, RDFS.range, None)):
            literal = (
                Text(rng).startswith(u"http://www.w3.org/2001/XMLSchema#")
                and not Text(rng) == u"http://www.w3.org/2001/XMLSchema#anyURI"
            ) or Text(rng) == u"http://www.w3.org/2000/01/rdf-schema#Literal"
            if not literal:
                self.url_fields.add(Text(s))
        self.foreign_properties.add(Text(s))

    def add_namespaces(self, ns):  # type: (Dict[Text, Text]) -> None
        self.vocab.update(ns)

    def add_schemas(self, ns, base_url):
        # type: (Union[List[Text], Text], Text) -> None
        if self.skip_schemas:
            return
        for sch in aslist(ns):
            try:
                fetchurl = self.fetcher.urljoin(base_url, sch)
                if fetchurl not in self.cache or self.cache[fetchurl] is True:
                    _logger.debug("Getting external schema %s", fetchurl)
                    content = self.fetch_text(fetchurl)
                    self.cache[fetchurl] = Graph()
                    for fmt in ["xml", "turtle", "rdfa"]:
                        try:
                            self.cache[fetchurl].parse(
                                data=content, format=fmt, publicID=str(fetchurl)
                            )
                            self.graph += self.cache[fetchurl]
                            break
                        except xml.sax.SAXParseException:
                            pass
                        except TypeError:
                            pass
                        except BadSyntax:
                            pass
            except Exception as e:
                _logger.warning(
                    "Could not load extension schema %s: %s", fetchurl, Text(e)
                )

        for s, _, _ in self.graph.triples((None, RDF.type, RDF.Property)):
            self._add_properties(s)
        for s, _, o in self.graph.triples((None, RDFS.subPropertyOf, None)):
            self._add_properties(s)
            self._add_properties(o)
        for s, _, _ in self.graph.triples((None, RDFS.range, None)):
            self._add_properties(s)
        for s, _, _ in self.graph.triples((None, RDF.type, OWL.ObjectProperty)):
            self._add_properties(s)

        for s, _, _ in self.graph.triples((None, None, None)):
            self.idx[Text(s)] = None

    def add_context(self, newcontext, baseuri=""):
        # type: (ContextType, Text) -> None
        if bool(self.vocab):
            raise ValidationException("Refreshing context that already has stuff in it")

        self.url_fields = set(("$schemas",))
        self.scoped_ref_fields = {}
        self.vocab_fields = set()
        self.identifiers = []
        self.identity_links = set()
        self.standalone = set()
        self.nolinkcheck = set()
        self.idmap = {}
        self.mapPredicate = {}
        self.vocab = {}
        self.rvocab = {}
        self.type_dsl_fields = set()
        self.secondaryFile_dsl_fields = set()
        self.subscopes = {}

        self.ctx.update(_copy_dict_without_key(newcontext, u"@context"))

        _logger.debug("ctx is %s", self.ctx)

        for key, value in self.ctx.items():
            if value == u"@id":
                self.identifiers.append(key)
                self.identity_links.add(key)
            elif isinstance(value, MutableMapping):
                if value.get(u"@type") == u"@id":
                    self.url_fields.add(key)
                    if u"refScope" in value:
                        self.scoped_ref_fields[key] = value[u"refScope"]
                    if value.get(u"identity", False):
                        self.identity_links.add(key)

                if value.get(u"@type") == u"@vocab":
                    self.url_fields.add(key)
                    self.vocab_fields.add(key)
                    if u"refScope" in value:
                        self.scoped_ref_fields[key] = value[u"refScope"]
                    if value.get(u"typeDSL"):
                        self.type_dsl_fields.add(key)

                if value.get(u"secondaryFilesDSL"):
                    self.secondaryFile_dsl_fields.add(key)

                if value.get(u"noLinkCheck"):
                    self.nolinkcheck.add(key)

                if value.get(u"mapSubject"):
                    self.idmap[key] = value[u"mapSubject"]

                if value.get(u"mapPredicate"):
                    self.mapPredicate[key] = value[u"mapPredicate"]

                if value.get(u"@id"):
                    self.vocab[key] = value[u"@id"]

                if value.get(u"subscope"):
                    self.subscopes[key] = value[u"subscope"]

            elif isinstance(value, string_types):
                self.vocab[key] = value

        for k, v in self.vocab.items():
            self.rvocab[self.expand_url(v, u"", scoped_id=False)] = k

        self.identifiers.sort()

        _logger.debug("identifiers is %s", self.identifiers)
        _logger.debug("identity_links is %s", self.identity_links)
        _logger.debug("url_fields is %s", self.url_fields)
        _logger.debug("vocab_fields is %s", self.vocab_fields)
        _logger.debug("vocab is %s", self.vocab)
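
    # A small illustrative context of the shape add_context consumes (the
    # field names here are hypothetical):
    #
    #     >>> ctx = {
    #     ...     "name": "@id",                           # identifier field
    #     ...     "run": {"@type": "@id", "refScope": 1},  # scoped URI reference
    #     ...     "type": {"@type": "@vocab", "typeDSL": True},
    #     ... }
    #     >>> ldr = Loader(ctx)
    #     >>> ldr.identifiers, ldr.scoped_ref_fields["run"]
    #     (['name'], 1)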

    resolved_ref_type = Tuple[
        Optional[Union[CommentedMap, CommentedSeq, Text]], CommentedMap
    ]

    def resolve_ref(
        self,
        ref,  # type: Union[CommentedMap, CommentedSeq, Text]
        base_url=None,  # type: Optional[Text]
        checklinks=True,  # type: bool
        strict_foreign_properties=False,  # type: bool
    ):
        # type: (...) -> Loader.resolved_ref_type

        lref = ref  # type: Union[CommentedMap, CommentedSeq, Text, None]
        obj = None  # type: Optional[CommentedMap]
        resolved_obj = None  # type: Optional[Union[CommentedMap, CommentedSeq, Text]]
        inc = False
        mixin = None  # type: Optional[MutableMapping[Text, Any]]

        if not base_url:
            base_url = file_uri(os.getcwd()) + "/"

        sl = SourceLine(obj, None)
        # If `ref` is a dict, look for special directives.
        if isinstance(lref, CommentedMap):
            obj = lref
            if "$import" in obj:
                sl = SourceLine(obj, "$import")
                if len(obj) == 1:
                    lref = obj[u"$import"]
                    obj = None
                else:
                    raise ValidationException(
                        u"'$import' must be the only field in {}".format(obj), sl
                    )
            elif "$include" in obj:
                sl = SourceLine(obj, "$include")
                if len(obj) == 1:
                    lref = obj[u"$include"]
                    inc = True
                    obj = None
                else:
                    raise ValidationException(
                        u"'$include' must be the only field in {}".format(obj), sl
                    )
            elif "$mixin" in obj:
                sl = SourceLine(obj, "$mixin")
                lref = obj[u"$mixin"]
                mixin = obj
                obj = None
            else:
                lref = None
                for identifier in self.identifiers:
                    if identifier in obj:
                        lref = obj[identifier]
                        break
                if not lref:
                    raise ValidationException(
                        u"Object `{}` does not have identifier field in {}".format(
                            obj, self.identifiers
                        ),
                        sl,
                    )

        if not isinstance(lref, string_types):
            raise ValidationException(
                u"Expected CommentedMap or string, got {}: `{}`".format(
                    type(lref), lref
                )
            )

        if isinstance(lref, string_types) and os.sep == "\\":
            # Convert Windows path separator in ref
            lref = lref.replace("\\", "/")

        url = self.expand_url(lref, base_url, scoped_id=(obj is not None))
        # Has this reference been loaded already?
        if url in self.idx and (not mixin):
            resolved_obj = self.idx[url]
            if isinstance(resolved_obj, MutableMapping):
                metadata = self.idx.get(urllib.parse.urldefrag(url)[0], CommentedMap())
                if isinstance(metadata, MutableMapping):
                    if u"$graph" in resolved_obj:
                        metadata = _copy_dict_without_key(resolved_obj, u"$graph")
                        return resolved_obj[u"$graph"], metadata
                    else:
                        return resolved_obj, metadata
                else:
                    raise ValidationException(
                        u"Expected CommentedMap, got {}: `{}`".format(
                            type(metadata), metadata
                        )
                    )
            elif isinstance(resolved_obj, MutableSequence):
                metadata = self.idx.get(urllib.parse.urldefrag(url)[0], CommentedMap())
                if isinstance(metadata, MutableMapping):
                    return resolved_obj, metadata
                else:
                    return resolved_obj, CommentedMap()
            elif isinstance(resolved_obj, string_types):
                return resolved_obj, CommentedMap()
            else:
                raise ValidationException(
                    u"Expected MutableMapping or MutableSequence, got {}: `{}`".format(
                        type(resolved_obj), resolved_obj
                    )
                )

        # "$include" directive means load raw text
        if inc:
            return self.fetch_text(url), CommentedMap()

        doc = None
        if isinstance(obj, MutableMapping):
            for identifier in self.identifiers:
                obj[identifier] = url
            doc_url = url
        else:
            # Load structured document
            doc_url, frg = urllib.parse.urldefrag(url)
            if doc_url in self.idx and (not mixin):
                # If the base document is in the index, it was already loaded,
                # so if we didn't find the reference earlier then it must not
                # exist.
                raise ValidationException(
                    u"Reference `#{}` not found in file `{}`.".format(frg, doc_url), sl
                )
            doc = self.fetch(doc_url, inject_ids=(not mixin))

        # Recursively expand urls and resolve directives
        if bool(mixin):
            doc = copy.deepcopy(doc)
            if doc is not None and mixin is not None:
                doc.update(mixin)
                del doc["$mixin"]
            resolved_obj, metadata = self.resolve_all(
                doc,
                base_url,
                file_base=doc_url,
                checklinks=checklinks,
                strict_foreign_properties=strict_foreign_properties,
            )
        else:
            if doc:
                resolve_target = doc
            else:
                resolve_target = obj
            resolved_obj, metadata = self.resolve_all(
                resolve_target,
                doc_url,
                checklinks=checklinks,
                strict_foreign_properties=strict_foreign_properties,
            )

        # Requested reference should be in the index now, otherwise it's a bad
        # reference
        if not bool(mixin):
            if url in self.idx:
                resolved_obj = self.idx[url]
            else:
                raise ValidationException(
                    "Reference `{}` is not in the index. Index contains: {}".format(
                        url, ", ".join(self.idx)
                    )
                )

        if isinstance(resolved_obj, CommentedMap):
            if u"$graph" in resolved_obj:
                metadata = _copy_dict_without_key(resolved_obj, u"$graph")
                return resolved_obj[u"$graph"], metadata
            else:
                return resolved_obj, metadata
        else:
            return resolved_obj, metadata
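
    # A sketch of the directives resolve_ref understands, as they would appear
    # in a source document (YAML shown in comments; the file names are
    # hypothetical):
    #
    #     steps: {$import: other.cwl}   # replaced by the parsed other.cwl
    #     script: {$include: run.sh}    # replaced by the raw text of run.sh
    #     tool: {$mixin: base.cwl, label: override}
    #         # parsed base.cwl with the sibling fields laid over it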

    def _resolve_idmap(
        self,
        document,  # type: CommentedMap
        loader,  # type: Loader
    ):
        # type: (...) -> None
        # Convert fields with mapSubject into lists
        # use mapPredicate if the mapped value isn't a dict.
        for idmapField in loader.idmap:
            if idmapField in document:
                idmapFieldValue = document[idmapField]
                if (
                    isinstance(idmapFieldValue, MutableMapping)
                    and "$import" not in idmapFieldValue
                    and "$include" not in idmapFieldValue
                ):
                    ls = CommentedSeq()
                    for k in sorted(idmapFieldValue.keys()):
                        val = idmapFieldValue[k]
                        v = None  # type: Optional[CommentedMap]
                        if not isinstance(val, CommentedMap):
                            if idmapField in loader.mapPredicate:
                                v = CommentedMap(
                                    ((loader.mapPredicate[idmapField], val),)
                                )
                                v.lc.add_kv_line_col(
                                    loader.mapPredicate[idmapField],
                                    document[idmapField].lc.data[k],
                                )
                                v.lc.filename = document.lc.filename
                            else:
                                raise ValidationException(
                                    "mapSubject '{}' value '{}' is not a dict "
                                    "and does not have a mapPredicate.".format(k, val)
                                )
                        else:
                            v = val

                        v[loader.idmap[idmapField]] = k
                        v.lc.add_kv_line_col(
                            loader.idmap[idmapField], document[idmapField].lc.data[k]
                        )
                        v.lc.filename = document.lc.filename

                        ls.lc.add_kv_line_col(len(ls), document[idmapField].lc.data[k])

                        ls.lc.filename = document.lc.filename
                        ls.append(v)

                    document[idmapField] = ls
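
    # Worked example of the normalization above: with mapSubject "id" and
    # mapPredicate "type" on a hypothetical field "fields", the shorthand map
    #
    #     fields: {count: int, name: {type: string, doc: "..."}}
    #
    # is rewritten (keys in sorted order) to the canonical list form
    #
    #     fields:
    #       - {type: int, id: count}
    #       - {type: string, doc: "...", id: name}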

    typeDSLregex = re.compile(Text(r"^([^[?]+)(\[\])?(\?)?$"))

    def _type_dsl(
        self,
        t,  # type: Union[Text, Dict[Text, Text], List[Text]]
        lc,  # type: LineCol
        filename,  # type: Text
    ):  # type: (...) -> Union[Text, Dict[Text, Text], List[Text]]

        if not isinstance(t, string_types):
            return t

        m = Loader.typeDSLregex.match(t)
        if not m:
            return t
        first = m.group(1)
        second = third = None
        if bool(m.group(2)):
            second = CommentedMap((("type", "array"), ("items", first)))
            second.lc.add_kv_line_col("type", lc)
            second.lc.add_kv_line_col("items", lc)
            second.lc.filename = filename
        if bool(m.group(3)):
            third = CommentedSeq([u"null", second or first])
            third.lc.add_kv_line_col(0, lc)
            third.lc.add_kv_line_col(1, lc)
            third.lc.filename = filename
        return third or second or first
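
    # Worked examples of the type DSL above (illustrative; results shown as
    # plain JSON equivalents rather than exact CommentedMap/CommentedSeq reprs):
    #
    #     "string"    ->  "string"
    #     "string?"   ->  ["null", "string"]
    #     "string[]"  ->  {"type": "array", "items": "string"}
    #     "string[]?" ->  ["null", {"type": "array", "items": "string"}]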

    def _secondaryFile_dsl(
        self,
        t,  # type: Union[Text, Dict[Text, Text], List[Text]]
        lc,  # type: LineCol
        filename,  # type: Text
    ):  # type: (...) -> Union[Text, Dict[Text, Text], List[Text]]

        if not isinstance(t, string_types):
            return t
        pat = t
        req = None
        if t.endswith("?"):
            pat = t[0:-1]
            req = False

        second = CommentedMap((("pattern", pat), ("required", req)))
        second.lc.add_kv_line_col("pattern", lc)
        second.lc.add_kv_line_col("required", lc)
        second.lc.filename = filename
        return second
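
    # Likewise for the secondaryFiles DSL (illustrative; a trailing "?" marks
    # the pattern as optional):
    #
    #     ".bai"   ->  {"pattern": ".bai", "required": None}
    #     ".bai?"  ->  {"pattern": ".bai", "required": False}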

    def _apply_dsl(
        self,
        datum,  # type: Union[Text, Dict[Any, Any], List[Any]]
        d,  # type: Text
        loader,  # type: Loader
        lc,  # type: LineCol
        filename,  # type: Text
    ):
        # type: (...) -> Union[Text, Dict[Any, Any], List[Any]]
        if d in loader.type_dsl_fields:
            return self._type_dsl(datum, lc, filename)
        elif d in loader.secondaryFile_dsl_fields:
            return self._secondaryFile_dsl(datum, lc, filename)
        else:
            return datum

    def _resolve_dsl(
        self,
        document,  # type: CommentedMap
        loader,  # type: Loader
    ):
        # type: (...) -> None
        fields = list(loader.type_dsl_fields)
        fields.extend(loader.secondaryFile_dsl_fields)

        for d in fields:
            if d in document:
                datum2 = datum = document[d]
                if isinstance(datum, string_types):
                    datum2 = self._apply_dsl(
                        datum, d, loader, document.lc.data[d], document.lc.filename
                    )
                elif isinstance(datum, CommentedSeq):
                    datum2 = CommentedSeq()
                    for n, t in enumerate(datum):
                        if datum.lc and datum.lc.data:
                            datum2.lc.add_kv_line_col(len(datum2), datum.lc.data[n])
                            datum2.append(
                                self._apply_dsl(
                                    t, d, loader, datum.lc.data[n], document.lc.filename
                                )
                            )
                        else:
                            datum2.append(self._apply_dsl(t, d, loader, LineCol(), ""))
                if isinstance(datum2, CommentedSeq):
                    datum3 = CommentedSeq()
                    seen = []  # type: List[Text]
                    for i, item in enumerate(datum2):
                        if isinstance(item, CommentedSeq):
                            for j, v in enumerate(item):
                                if v not in seen:
                                    datum3.lc.add_kv_line_col(
                                        len(datum3), item.lc.data[j]
                                    )
                                    datum3.append(v)
                                    seen.append(v)
                        else:
                            if item not in seen:
                                if datum2.lc and datum2.lc.data:
                                    datum3.lc.add_kv_line_col(
                                        len(datum3), datum2.lc.data[i]
                                    )
                                datum3.append(item)
                                seen.append(item)
                    document[d] = datum3
                else:
                    document[d] = datum2

    def _resolve_identifier(self, document, loader, base_url):
        # type: (CommentedMap, Loader, Text) -> Text
        # Expand identifier field (usually 'id') to resolve scope
        for identifer in loader.identifiers:
            if identifer in document:
                if isinstance(document[identifer], string_types):
                    document[identifer] = loader.expand_url(
                        document[identifer], base_url, scoped_id=True
                    )
                    if document[identifer] not in loader.idx or isinstance(
                        loader.idx[document[identifer]], string_types
                    ):
                        loader.idx[document[identifer]] = document
                    base_url = document[identifer]
                else:
                    raise ValidationException(
                        "identifier field '{}' must be a string".format(
                            document[identifer]
                        )
                    )
        return base_url

    def _resolve_identity(self, document, loader, base_url):
        # type: (Dict[Text, List[Text]], Loader, Text) -> None
        # Resolve scope for identity fields (fields where the value is the
        # identity of a standalone node, such as enum symbols)
        for identifer in loader.identity_links:
            if identifer in document and isinstance(
                document[identifer], MutableSequence
            ):
                for n, _v in enumerate(document[identifer]):
                    if isinstance(document[identifer][n], string_types):
                        document[identifer][n] = loader.expand_url(
                            document[identifer][n], base_url, scoped_id=True
                        )
                        if document[identifer][n] not in loader.idx:
                            loader.idx[document[identifer][n]] = document[identifer][n]

    def _normalize_fields(self, document, loader):
        # type: (CommentedMap, Loader) -> None
        # Normalize fields which are prefixed or full URIs to vocabulary terms
        for d in list(document.keys()):
            d2 = loader.expand_url(d, u"", scoped_id=False, vocab_term=True)
            if d != d2:
                document[d2] = document[d]
                document.lc.add_kv_line_col(d2, document.lc.data[d])
                del document[d]

    def _resolve_uris(
        self,
        document,  # type: Dict[Text, Union[Text, List[Text]]]
        loader,  # type: Loader
        base_url,  # type: Text
    ):
        # type: (...) -> None
        # Resolve remaining URLs based on document base
        for d in loader.url_fields:
            if d in document:
                datum = document[d]
                if isinstance(datum, string_types):
                    document[d] = loader.expand_url(
                        datum,
                        base_url,
                        scoped_id=False,
                        vocab_term=(d in loader.vocab_fields),
                        scoped_ref=loader.scoped_ref_fields.get(d),
                    )
                elif isinstance(datum, MutableSequence):
                    for i, url in enumerate(datum):
                        if isinstance(url, string_types):
                            datum[i] = loader.expand_url(
                                url,
                                base_url,
                                scoped_id=False,
                                vocab_term=(d in loader.vocab_fields),
                                scoped_ref=loader.scoped_ref_fields.get(d),
                            )

    def resolve_all(
        self,
        document,  # type: Union[CommentedMap, CommentedSeq]
        base_url,  # type: Text
        file_base=None,  # type: Optional[Text]
        checklinks=True,  # type: bool
        strict_foreign_properties=False,  # type: bool
    ):
        # type: (...) -> Loader.resolved_ref_type
        loader = self
        metadata = CommentedMap()  # type: CommentedMap
        if file_base is None:
            file_base = base_url

        if isinstance(document, CommentedMap):
            # Handle $import and $include
            if u"$import" in document or u"$include" in document:
                return self.resolve_ref(
                    document,
                    base_url=file_base,
                    checklinks=checklinks,
                    strict_foreign_properties=strict_foreign_properties,
                )
            elif u"$mixin" in document:
                return self.resolve_ref(
                    document,
                    base_url=base_url,
                    checklinks=checklinks,
                    strict_foreign_properties=strict_foreign_properties,
                )
        elif isinstance(document, CommentedSeq):
            pass
        elif isinstance(document, (list, dict)):
            raise ValidationException(
                "Expected CommentedMap or CommentedSeq, got {}: `{}`".format(
                    type(document), document
                )
            )
        else:
            return (document, metadata)

        newctx = None  # type: Optional[Loader]
        if isinstance(document, CommentedMap):
            # Handle $base, $profile, $namespaces, $schemas and $graph
            if u"$base" in document:
                base_url = document[u"$base"]

            if u"$profile" in document:
                if newctx is None:
                    newctx = SubLoader(self)
                newctx.add_namespaces(document.get(u"$namespaces", CommentedMap()))
                newctx.add_schemas(document.get(u"$schemas", []), document[u"$profile"])

            if u"$namespaces" in document:
                if newctx is None:
                    newctx = SubLoader(self)
                newctx.add_namespaces(document[u"$namespaces"])

            if u"$schemas" in document:
                if newctx is None:
                    newctx = SubLoader(self)
                newctx.add_schemas(document[u"$schemas"], file_base)

            if newctx is not None:
                loader = newctx

            for identifer in loader.identity_links:
                if identifer in document:
                    if isinstance(document[identifer], string_types):
                        document[identifer] = loader.expand_url(
                            document[identifer], base_url, scoped_id=True
                        )
                        loader.idx[document[identifer]] = document

            metadata = document
            if u"$graph" in document:
                document = document[u"$graph"]

        if isinstance(document, CommentedMap):
            self._normalize_fields(document, loader)
            self._resolve_idmap(document, loader)
            self._resolve_dsl(document, loader)
            base_url = self._resolve_identifier(document, loader, base_url)
            self._resolve_identity(document, loader, base_url)
            self._resolve_uris(document, loader, base_url)

            try:
                for key, val in document.items():
                    subscope = ""  # type: Text
                    if key in loader.subscopes:
                        subscope = "/" + loader.subscopes[key]
                    document[key], _ = loader.resolve_all(
                        val, base_url + subscope, file_base=file_base, checklinks=False
                    )
            except ValidationException as v:
                _logger.warning("loader is %s", id(loader), exc_info=True)
                raise_from(
                    ValidationException(
                        "({}) ({}) Validation error in field {}:".format(
                            id(loader), file_base, key
                        ),
                        None,
                        [v],
                    ),
                    v,
                )

        elif isinstance(document, CommentedSeq):
            i = 0
            try:
                while i < len(document):
                    val = document[i]
                    if isinstance(val, CommentedMap) and (
                        u"$import" in val or u"$mixin" in val
                    ):
                        l, import_metadata = loader.resolve_ref(
                            val, base_url=file_base, checklinks=False
                        )
                        metadata.setdefault("$import_metadata", {})
                        for identifier in loader.identifiers:
                            if identifier in import_metadata:
                                metadata["$import_metadata"][
                                    import_metadata[identifier]
                                ] = import_metadata
                        if isinstance(l, CommentedSeq):
                            lc = document.lc.data[i]
                            del document[i]
                            llen = len(l)
                            for j in range(len(document) + llen, i + llen, -1):
                                document.lc.data[j - 1] = document.lc.data[j - llen]
                            for item in l:
                                document.insert(i, item)
                                document.lc.data[i] = lc
                                i += 1
                        else:
                            document[i] = l
                            i += 1
                    else:
                        document[i], _ = loader.resolve_all(
                            val, base_url, file_base=file_base, checklinks=False
                        )
                        i += 1
            except ValidationException as v:
                _logger.warning("failed", exc_info=True)
                raise_from(
                    ValidationException(
                        "({}) ({}) Validation error in position {}:".format(
                            id(loader), file_base, i
                        ),
                        None,
                        [v],
                    ),
                    v,
                )

        if checklinks:
            all_doc_ids = {}  # type: Dict[Text, Text]
            loader.validate_links(
                document,
                u"",
                all_doc_ids,
                strict_foreign_properties=strict_foreign_properties,
            )

        return document, metadata

    def fetch(self, url, inject_ids=True):  # type: (Text, bool) -> Any
        if url in self.idx:
            return self.idx[url]
        try:
            text = self.fetch_text(url)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode("utf-8"))
            else:
                textIO = StringIO(text)
            textIO.name = str(url)
            attachments = yaml.round_trip_load_all(textIO, preserve_quotes=True)
            result = next(attachments)

            if self.allow_attachments is not None and self.allow_attachments(result):
                i = 1
                for a in attachments:
                    self.idx["{}#attachment-{}".format(url, i)] = a
                    i += 1
            add_lc_filename(result, url)
        except yaml.error.MarkedYAMLError as e:
            raise_from(to_validation_exception(e), e)
        if isinstance(result, CommentedMap) and inject_ids and bool(self.identifiers):
            for identifier in self.identifiers:
                if identifier not in result:
                    result[identifier] = url
                self.idx[
                    self.expand_url(result[identifier], url, scoped_id=True)
                ] = result
        self.idx[url] = result
        return result
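
    # Behavior sketch: fetch() memoizes parsed documents in self.idx keyed by
    # URL, so repeated fetches of the same (hypothetical) URL return the same
    # object:
    #
    #     >>> ldr = Loader({"id": "@id"})
    #     >>> doc = ldr.fetch("file:///tmp/doc.yml")  # parsed and indexed
    #     >>> ldr.fetch("file:///tmp/doc.yml") is doc
    #     True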

    FieldType = TypeVar("FieldType", Text, CommentedSeq, CommentedMap)

    def validate_scoped(self, field, link, docid):
        # type: (Text, Text, Text) -> Text
        split = urllib.parse.urlsplit(docid)
        sp = split.fragment.split(u"/")
        n = self.scoped_ref_fields[field]
        while n > 0 and len(sp) > 0:
            sp.pop()
            n -= 1
        tried = []
        while True:
            sp.append(link)
            url = urllib.parse.urlunsplit(
                (split.scheme, split.netloc, split.path, split.query, u"/".join(sp))
            )
            tried.append(url)
            if url in self.idx:
                return url
            sp.pop()
            if len(sp) == 0:
                break
            sp.pop()
        if onWindows() and link.startswith("file:"):
            link = link.lower()
        raise ValidationException(
            "Field `{}` references unknown identifier `{}`, tried {}".format(
                field, link, ", ".join(tried)
            )
        )

    def validate_link(self, field, link, docid, all_doc_ids):
        # type: (Text, Loader.FieldType, Text, Dict[Text, Text]) -> Loader.FieldType
        if field in self.nolinkcheck:
            return link
        if isinstance(link, string_types):
            if field in self.vocab_fields:
                if (
                    link not in self.vocab
                    and link not in self.idx
                    and link not in self.rvocab
                ):
                    if field in self.scoped_ref_fields:
                        return self.validate_scoped(field, link, docid)
                    elif not self.check_exists(link):
                        raise ValidationException(
                            "Field `{}` contains undefined reference to `{}`".format(
                                field, link
                            )
                        )
            elif link not in self.idx and link not in self.rvocab:
                if field in self.scoped_ref_fields:
                    return self.validate_scoped(field, link, docid)
                elif not self.check_exists(link):
                    raise ValidationException(
                        "Field `{}` contains undefined reference to `{}`".format(
                            field, link
                        )
                    )
        elif isinstance(link, CommentedSeq):
            errors = []
            for n, i in enumerate(link):
                try:
                    link[n] = self.validate_link(field, i, docid, all_doc_ids)
                except ValidationException as v:
                    errors.append(v)
            if bool(errors):
                raise ValidationException("", None, errors)
        elif isinstance(link, CommentedMap):
            self.validate_links(link, docid, all_doc_ids)
        else:
            raise ValidationException(
                "`{}` field is {}, expected string, list, or a dict.".format(
                    field, type(link).__name__
                )
            )
        return link

    def getid(self, d):  # type: (Any) -> Optional[Text]
        if isinstance(d, MutableMapping):
            for i in self.identifiers:
                if i in d:
                    idd = d[i]
                    if isinstance(idd, string_types):
                        return idd
        return None

    def validate_links(
        self,
        document,  # type: Union[CommentedMap, CommentedSeq, Text, None]
        base_url,  # type: Text
        all_doc_ids,  # type: Dict[Text, Text]
        strict_foreign_properties=False,  # type: bool
    ):  # type: (...) -> None
        docid = self.getid(document)
        if not docid:
            docid = base_url

        errors = []  # type: List[SchemaSaladException]
        iterator = None  # type: Any
        if isinstance(document, MutableSequence):
            iterator = enumerate(document)
        elif isinstance(document, MutableMapping):
            for d in self.url_fields:
                sl = SourceLine(document, d, Text)
                try:
                    if d in document and d not in self.identity_links:
                        document[d] = self.validate_link(
                            d, document[d], docid, all_doc_ids
                        )
                except SchemaSaladException as v:
                    v = v.with_sourceline(sl)
                    if d == "$schemas" or (
                        d in self.foreign_properties and not strict_foreign_properties
                    ):
                        _logger.warning(v)
                    else:
                        errors.append(v)
            # TODO: Validator should local scope only in which
            # duplicated keys are prohibited.
            # See also https://github.com/common-workflow-language/common-workflow-language/issues/734  # noqa: B950
            # In the future, it should raise
            # ValidationException instead of _logger.warn
            try:
                for (
                    identifier
                ) in self.identifiers:  # validate that each id is defined uniquely
                    if identifier in document:
                        sl = SourceLine(document, identifier, Text)
                        if (
                            document[identifier] in all_doc_ids
                            and sl.makeLead() != all_doc_ids[document[identifier]]
                        ):
                            _logger.warning(
                                "%s object %s `%s` previously defined",
                                all_doc_ids[document[identifier]],
                                identifier,
                                relname(document[identifier]),
                            )
                        else:
                            all_doc_ids[document[identifier]] = sl.makeLead()
                            break
            except ValidationException as v:
                errors.append(v.with_sourceline(sl))

            if hasattr(document, "iteritems"):
                iterator = iteritems(document)
            else:
                iterator = list(document.items())
        else:
            return

        for key, val in iterator:
            sl = SourceLine(document, key, Text)
            try:
                self.validate_links(
                    val,
                    docid,
                    all_doc_ids,
                    strict_foreign_properties=strict_foreign_properties,
                )
            except ValidationException as v:
                if key in self.nolinkcheck or (
                    isinstance(key, string_types) and ":" in key
                ):
                    _logger.warning(v)
                else:
                    docid2 = self.getid(val)
                    if docid2 is not None:
                        errors.append(
                            ValidationException(
                                "checking object `{}`".format(relname(docid2)), sl, [v]
                            )
                        )
                    else:
                        if isinstance(key, string_types):
                            errors.append(
                                ValidationException(
                                    "checking field `{}`".format(key), sl, [v]
                                )
                            )
                        else:
                            errors.append(ValidationException("checking item", sl, [v]))
        if bool(errors):
            if len(errors) > 1:
                raise ValidationException("", None, errors)
            else:
                raise errors[0]
        return


D = TypeVar("D", CommentedMap, ContextType)


def _copy_dict_without_key(from_dict, filtered_key):
    # type: (D, Any) -> D
    new_dict = CommentedMap(from_dict.items())
    if filtered_key in new_dict:
        del new_dict[filtered_key]
    if isinstance(from_dict, CommentedMap):
        new_dict.lc.data = copy.copy(from_dict.lc.data)
        new_dict.lc.filename = from_dict.lc.filename
    return new_dict
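

# A minimal end-to-end sketch of this module's intended use, with a
# hypothetical context and an in-memory document (guarded so it never runs
# on import):
if __name__ == "__main__":
    demo_loader = Loader({"id": "@id", "label": "string"})
    demo_doc, demo_metadata = demo_loader.resolve_all(
        CommentedMap([("label", "demo")]), file_uri(os.getcwd()) + "/"
    )
    print(demo_doc, demo_metadata)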