diff env/lib/python3.9/site-packages/schema_salad/schema.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author:   shellac
date:     Mon, 22 Mar 2021 18:12:50 +0000
parents:
children:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/env/lib/python3.9/site-packages/schema_salad/schema.py	Mon Mar 22 18:12:50 2021 +0000
@@ -0,0 +1,770 @@
+"""Functions to process Schema Salad schemas."""
+
+import copy
+import hashlib
+from typing import (
+    IO,
+    Any,
+    Dict,
+    List,
+    Mapping,
+    MutableMapping,
+    MutableSequence,
+    Optional,
+    Set,
+    Tuple,
+    TypeVar,
+    Union,
+    cast,
+)
+from urllib.parse import urldefrag, urlparse
+
+from pkg_resources import resource_stream
+from ruamel import yaml
+from ruamel.yaml.comments import CommentedMap, CommentedSeq
+
+from schema_salad.utils import (
+    CacheType,
+    ResolveType,
+    add_dictlist,
+    aslist,
+    convert_to_dict,
+    flatten,
+    json_dumps,
+)
+
+from . import _logger, jsonld_context, ref_resolver, validate
+from .avro.schema import Names, SchemaParseException, make_avsc_object
+from .exceptions import (
+    ClassValidationException,
+    SchemaSaladException,
+    ValidationException,
+)
+from .ref_resolver import Loader
+from .sourceline import SourceLine, add_lc_filename, relname
+
+SALAD_FILES = (
+    "metaschema.yml",
+    "metaschema_base.yml",
+    "salad.md",
+    "field_name.yml",
+    "import_include.md",
+    "link_res.yml",
+    "ident_res.yml",
+    "vocab_res.yml",
+    "field_name_schema.yml",
+    "field_name_src.yml",
+    "field_name_proc.yml",
+    "ident_res_schema.yml",
+    "ident_res_src.yml",
+    "ident_res_proc.yml",
+    "link_res_schema.yml",
+    "link_res_src.yml",
+    "link_res_proc.yml",
+    "vocab_res_schema.yml",
+    "vocab_res_src.yml",
+    "vocab_res_proc.yml",
+    "map_res.yml",
+    "map_res_schema.yml",
+    "map_res_src.yml",
+    "map_res_proc.yml",
+    "typedsl_res.yml",
+    "typedsl_res_schema.yml",
+    "typedsl_res_src.yml",
+    "typedsl_res_proc.yml",
+    "sfdsl_res.yml",
+    "sfdsl_res_schema.yml",
+    "sfdsl_res_src.yml",
+    "sfdsl_res_proc.yml",
+)
+
+saladp = "https://w3id.org/cwl/salad#"
+
+
+def get_metaschema() -> Tuple[Names, List[Dict[str, str]], Loader]:
+    """Instantiate the metaschema."""
+    loader = ref_resolver.Loader(
+        {
+            "Any": saladp + "Any",
+            "ArraySchema": saladp + "ArraySchema",
+            "Array_symbol": saladp + "ArraySchema/type/Array_symbol",
+            "DocType": saladp + "DocType",
+            "Documentation": saladp + "Documentation",
+            "Documentation_symbol": saladp + "Documentation/type/Documentation_symbol",
+            "Documented": saladp + "Documented",
+            "EnumSchema": saladp + "EnumSchema",
+            "Enum_symbol": saladp + "EnumSchema/type/Enum_symbol",
+            "JsonldPredicate": saladp + "JsonldPredicate",
+            "NamedType": saladp + "NamedType",
+            "PrimitiveType": saladp + "PrimitiveType",
+            "RecordField": saladp + "RecordField",
+            "RecordSchema": saladp + "RecordSchema",
+            "Record_symbol": saladp + "RecordSchema/type/Record_symbol",
+            "SaladEnumSchema": saladp + "SaladEnumSchema",
+            "SaladRecordField": saladp + "SaladRecordField",
+            "SaladRecordSchema": saladp + "SaladRecordSchema",
+            "SchemaDefinedType": saladp + "SchemaDefinedType",
+            "SpecializeDef": saladp + "SpecializeDef",
+            "_container": saladp + "JsonldPredicate/_container",
+            "_id": {"@id": saladp + "_id", "@type": "@id", "identity": True},
+            "_type": saladp + "JsonldPredicate/_type",
+            "abstract": saladp + "SaladRecordSchema/abstract",
+            "array": saladp + "array",
+            "boolean": "http://www.w3.org/2001/XMLSchema#boolean",
+            "dct": "http://purl.org/dc/terms/",
+            "default": {"@id": saladp + "default", "noLinkCheck": True},
+            "doc": "rdfs:comment",
+            "docAfter": {"@id": saladp + "docAfter", "@type": "@id"},
+            "docChild": {"@id": saladp + "docChild", "@type": "@id"},
+            "docParent": {"@id": saladp + "docParent", "@type": "@id"},
+            "documentRoot": saladp + "SchemaDefinedType/documentRoot",
+            "documentation": saladp + "documentation",
+            "double": "http://www.w3.org/2001/XMLSchema#double",
+            "enum": saladp + "enum",
+            "extends": {"@id": saladp + "extends", "@type": "@id", "refScope": 1},
+            "fields": {
+                "@id": saladp + "fields",
+                "mapPredicate": "type",
+                "mapSubject": "name",
+            },
+            "float": "http://www.w3.org/2001/XMLSchema#float",
+            "identity": saladp + "JsonldPredicate/identity",
+            "inVocab": saladp + "NamedType/inVocab",
+            "int": "http://www.w3.org/2001/XMLSchema#int",
+            "items": {"@id": saladp + "items", "@type": "@vocab", "refScope": 2},
+            "jsonldPredicate": "sld:jsonldPredicate",
+            "long": "http://www.w3.org/2001/XMLSchema#long",
+            "mapPredicate": saladp + "JsonldPredicate/mapPredicate",
+            "mapSubject": saladp + "JsonldPredicate/mapSubject",
+            "name": "@id",
+            "noLinkCheck": saladp + "JsonldPredicate/noLinkCheck",
+            "null": saladp + "null",
+            "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+            "record": saladp + "record",
+            "refScope": saladp + "JsonldPredicate/refScope",
+            "sld": saladp,
+            "specialize": {
+                "@id": saladp + "specialize",
+                "mapPredicate": "specializeTo",
+                "mapSubject": "specializeFrom",
+            },
+            "specializeFrom": {
+                "@id": saladp + "specializeFrom",
+                "@type": "@id",
+                "refScope": 1,
+            },
+            "specializeTo": {
+                "@id": saladp + "specializeTo",
+                "@type": "@id",
+                "refScope": 1,
+            },
+            "string": "http://www.w3.org/2001/XMLSchema#string",
+            "subscope": saladp + "JsonldPredicate/subscope",
+            "symbols": {"@id": saladp + "symbols", "@type": "@id", "identity": True},
+            "type": {
+                "@id": saladp + "type",
+                "@type": "@vocab",
+                "refScope": 2,
+                "typeDSL": True,
+            },
+            "typeDSL": saladp + "JsonldPredicate/typeDSL",
+            "xsd": "http://www.w3.org/2001/XMLSchema#",
+        }
+    )
+
+    for salad in SALAD_FILES:
+        with resource_stream("schema_salad", "metaschema/" + salad) as stream:
+            loader.cache["https://w3id.org/cwl/" + salad] = stream.read().decode(
+                "UTF-8"
+            )
+
+    with resource_stream("schema_salad", "metaschema/metaschema.yml") as stream:
+        loader.cache["https://w3id.org/cwl/salad"] = stream.read().decode("UTF-8")
+
+    j = yaml.main.round_trip_load(loader.cache["https://w3id.org/cwl/salad"])
+    add_lc_filename(j, "metaschema.yml")
+    j2 = loader.resolve_all(j, saladp)[0]
+
+    if not isinstance(j2, list):
+        _logger.error("%s", j2)
+        raise SchemaParseException(f"Not a list: {j2}")
+    else:
+        sch_obj = make_avro(j2, loader)
+    try:
+        sch_names = make_avro_schema_from_avro(sch_obj)
+    except SchemaParseException:
+        _logger.error("Metaschema error, avro was:\n%s", json_dumps(sch_obj, indent=4))
+        raise
+    validate_doc(sch_names, j2, loader, strict=True)
+    return (sch_names, j2, loader)
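+
+# A minimal usage sketch (illustrative, not in the original file): the bundled
+# metaschema can be loaded once and its Avro names inspected.
+#
+#     metaschema_names, metaschema_doc, metaschema_loader = get_metaschema()
+#     print(sorted(metaschema_names.names))  # Avro names, e.g. "RecordSchema"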
"@id"}, + "documentRoot": saladp + "SchemaDefinedType/documentRoot", + "documentation": saladp + "documentation", + "double": "http://www.w3.org/2001/XMLSchema#double", + "enum": saladp + "enum", + "extends": {"@id": saladp + "extends", "@type": "@id", "refScope": 1}, + "fields": { + "@id": saladp + "fields", + "mapPredicate": "type", + "mapSubject": "name", + }, + "float": "http://www.w3.org/2001/XMLSchema#float", + "identity": saladp + "JsonldPredicate/identity", + "inVocab": saladp + "NamedType/inVocab", + "int": "http://www.w3.org/2001/XMLSchema#int", + "items": {"@id": saladp + "items", "@type": "@vocab", "refScope": 2}, + "jsonldPredicate": "sld:jsonldPredicate", + "long": "http://www.w3.org/2001/XMLSchema#long", + "mapPredicate": saladp + "JsonldPredicate/mapPredicate", + "mapSubject": saladp + "JsonldPredicate/mapSubject", + "name": "@id", + "noLinkCheck": saladp + "JsonldPredicate/noLinkCheck", + "null": saladp + "null", + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "record": saladp + "record", + "refScope": saladp + "JsonldPredicate/refScope", + "sld": saladp, + "specialize": { + "@id": saladp + "specialize", + "mapPredicate": "specializeTo", + "mapSubject": "specializeFrom", + }, + "specializeFrom": { + "@id": saladp + "specializeFrom", + "@type": "@id", + "refScope": 1, + }, + "specializeTo": { + "@id": saladp + "specializeTo", + "@type": "@id", + "refScope": 1, + }, + "string": "http://www.w3.org/2001/XMLSchema#string", + "subscope": saladp + "JsonldPredicate/subscope", + "symbols": {"@id": saladp + "symbols", "@type": "@id", "identity": True}, + "type": { + "@id": saladp + "type", + "@type": "@vocab", + "refScope": 2, + "typeDSL": True, + }, + "typeDSL": saladp + "JsonldPredicate/typeDSL", + "xsd": "http://www.w3.org/2001/XMLSchema#", + } + ) + + for salad in SALAD_FILES: + with resource_stream("schema_salad", "metaschema/" + salad) as stream: + loader.cache["https://w3id.org/cwl/" + salad] = stream.read().decode( + "UTF-8" + ) + + with resource_stream("schema_salad", "metaschema/metaschema.yml") as stream: + loader.cache["https://w3id.org/cwl/salad"] = stream.read().decode("UTF-8") + + j = yaml.main.round_trip_load(loader.cache["https://w3id.org/cwl/salad"]) + add_lc_filename(j, "metaschema.yml") + j2 = loader.resolve_all(j, saladp)[0] + + if not isinstance(j2, list): + _logger.error("%s", j2) + raise SchemaParseException(f"Not a list: {j2}") + else: + sch_obj = make_avro(j2, loader) + try: + sch_names = make_avro_schema_from_avro(sch_obj) + except SchemaParseException: + _logger.error("Metaschema error, avro was:\n%s", json_dumps(sch_obj, indent=4)) + raise + validate_doc(sch_names, j2, loader, strict=True) + return (sch_names, j2, loader) + + +def add_namespaces( + metadata: Mapping[str, Any], namespaces: MutableMapping[str, str] +) -> None: + """Collect the provided namespaces, checking for conflicts.""" + for key, value in metadata.items(): + if key not in namespaces: + namespaces[key] = value + elif namespaces[key] != value: + raise ValidationException( + "Namespace prefix '{}' has conflicting definitions '{}'" + " and '{}'.".format(key, namespaces[key], value) + ) + + +def collect_namespaces(metadata: Mapping[str, Any]) -> Dict[str, str]: + """Walk through the metadata object, collecting namespace declarations.""" + namespaces = {} # type: Dict[str, str] + if "$import_metadata" in metadata: + for value in metadata["$import_metadata"].values(): + add_namespaces(collect_namespaces(value), namespaces) + if 
"$namespaces" in metadata: + add_namespaces(metadata["$namespaces"], namespaces) + return namespaces + + +schema_type = Tuple[Loader, Union[Names, SchemaParseException], Dict[str, Any], Loader] + + +def load_schema( + schema_ref: ResolveType, + cache: Optional[CacheType] = None, +) -> schema_type: + """ + Load a schema that can be used to validate documents using load_and_validate. + + return: document_loader, avsc_names, schema_metadata, metaschema_loader + """ + + metaschema_names, _metaschema_doc, metaschema_loader = get_metaschema() + if cache is not None: + metaschema_loader.cache.update(cache) + schema_doc, schema_metadata = metaschema_loader.resolve_ref(schema_ref, "") + + if not isinstance(schema_doc, MutableSequence): + raise ValidationException("Schema reference must resolve to a list.") + + validate_doc(metaschema_names, schema_doc, metaschema_loader, True) + metactx = schema_metadata.get("@context", {}) + metactx.update(collect_namespaces(schema_metadata)) + schema_ctx = jsonld_context.salad_to_jsonld_context(schema_doc, metactx)[0] + + # Create the loader that will be used to load the target document. + document_loader = Loader(schema_ctx, cache=cache) + + # Make the Avro validation that will be used to validate the target + # document + avsc_names = make_avro_schema(schema_doc, document_loader) + + return document_loader, avsc_names, schema_metadata, metaschema_loader + + +def load_and_validate( + document_loader: Loader, + avsc_names: Names, + document: Union[CommentedMap, str], + strict: bool, + strict_foreign_properties: bool = False, +) -> Tuple[Any, Dict[str, Any]]: + """Load a document and validate it with the provided schema. + + return data, metadata + """ + try: + if isinstance(document, CommentedMap): + data, metadata = document_loader.resolve_all( + document, + document["id"], + checklinks=True, + strict_foreign_properties=strict_foreign_properties, + ) + else: + data, metadata = document_loader.resolve_ref( + document, + checklinks=True, + strict_foreign_properties=strict_foreign_properties, + ) + + validate_doc( + avsc_names, + data, + document_loader, + strict, + strict_foreign_properties=strict_foreign_properties, + ) + except ValidationException as exc: + raise ValidationException("", None, [exc]) from exc + return data, metadata + + +def validate_doc( + schema_names: Names, + doc: ResolveType, + loader: Loader, + strict: bool, + strict_foreign_properties: bool = False, +) -> None: + """Validate a document using the provided schema.""" + has_root = False + for root in schema_names.names.values(): + if (hasattr(root, "get_prop") and root.get_prop("documentRoot")) or ( + "documentRoot" in root.props + ): + has_root = True + break + + if not has_root: + raise ValidationException("No document roots defined in the schema") + + if isinstance(doc, MutableSequence): + vdoc = doc + elif isinstance(doc, CommentedMap): + vdoc = CommentedSeq([doc]) + vdoc.lc.add_kv_line_col(0, [doc.lc.line, doc.lc.col]) + vdoc.lc.filename = doc.lc.filename + else: + raise ValidationException("Document must be dict or list") + + roots = [] + for root in schema_names.names.values(): + if (hasattr(root, "get_prop") and root.get_prop("documentRoot")) or ( + root.props.get("documentRoot") + ): + roots.append(root) + + anyerrors = [] + for pos, item in enumerate(vdoc): + sourceline = SourceLine(vdoc, pos, str) + success = False + for root in roots: + success = validate.validate_ex( + root, + item, + loader.identifiers, + strict, + foreign_properties=loader.foreign_properties, + raise_ex=False, 
+
+
+def validate_doc(
+    schema_names: Names,
+    doc: ResolveType,
+    loader: Loader,
+    strict: bool,
+    strict_foreign_properties: bool = False,
+) -> None:
+    """Validate a document using the provided schema."""
+    has_root = False
+    for root in schema_names.names.values():
+        if (hasattr(root, "get_prop") and root.get_prop("documentRoot")) or (
+            "documentRoot" in root.props
+        ):
+            has_root = True
+            break
+
+    if not has_root:
+        raise ValidationException("No document roots defined in the schema")
+
+    if isinstance(doc, MutableSequence):
+        vdoc = doc
+    elif isinstance(doc, CommentedMap):
+        vdoc = CommentedSeq([doc])
+        vdoc.lc.add_kv_line_col(0, [doc.lc.line, doc.lc.col])
+        vdoc.lc.filename = doc.lc.filename
+    else:
+        raise ValidationException("Document must be dict or list")
+
+    roots = []
+    for root in schema_names.names.values():
+        if (hasattr(root, "get_prop") and root.get_prop("documentRoot")) or (
+            root.props.get("documentRoot")
+        ):
+            roots.append(root)
+
+    anyerrors = []
+    for pos, item in enumerate(vdoc):
+        sourceline = SourceLine(vdoc, pos, str)
+        success = False
+        for root in roots:
+            success = validate.validate_ex(
+                root,
+                item,
+                loader.identifiers,
+                strict,
+                foreign_properties=loader.foreign_properties,
+                raise_ex=False,
+                skip_foreign_properties=loader.skip_schemas,
+                strict_foreign_properties=strict_foreign_properties,
+            )
+            if success:
+                break
+
+        if not success:
+            errors = []  # type: List[SchemaSaladException]
+            for root in roots:
+                if hasattr(root, "get_prop"):
+                    name = root.get_prop("name")
+                elif hasattr(root, "name"):
+                    name = root.name
+
+                try:
+                    validate.validate_ex(
+                        root,
+                        item,
+                        loader.identifiers,
+                        strict,
+                        foreign_properties=loader.foreign_properties,
+                        raise_ex=True,
+                        skip_foreign_properties=loader.skip_schemas,
+                        strict_foreign_properties=strict_foreign_properties,
+                    )
+                except ClassValidationException as exc1:
+                    errors = [
+                        ClassValidationException(
+                            f"tried `{name}` but", sourceline, [exc1]
+                        )
+                    ]
+                    break
+                except ValidationException as exc2:
+                    errors.append(
+                        ValidationException(f"tried `{name}` but", sourceline, [exc2])
+                    )
+
+            objerr = "Invalid"
+            for ident in loader.identifiers:
+                if ident in item:
+                    objerr = "Object `{}` is not valid because".format(
+                        relname(item[ident])
+                    )
+                    break
+            anyerrors.append(ValidationException(objerr, sourceline, errors, "-"))
+    if anyerrors:
+        raise ValidationException("", None, anyerrors, "*")
+
+
+def get_anon_name(
+    rec: MutableMapping[str, Union[str, Dict[str, str], List[str]]]
+) -> str:
+    """Calculate a reproducible name for anonymous types."""
+    if "name" in rec:
+        name = rec["name"]
+        if isinstance(name, str):
+            return name
+        raise ValidationException(f"Expected name field to be a string, was {name}")
+    anon_name = ""
+    if rec["type"] in ("enum", saladp + "enum"):
+        for sym in rec["symbols"]:
+            anon_name += sym
+        return "enum_" + hashlib.sha1(anon_name.encode("UTF-8")).hexdigest()  # nosec
+    if rec["type"] in ("record", saladp + "record"):
+        for field in rec["fields"]:
+            if isinstance(field, Mapping):
+                anon_name += field["name"]
+            else:
+                raise ValidationException(
+                    "Expected entries in 'fields' to also be maps, was {}.".format(
+                        field
+                    )
+                )
+        return "record_" + hashlib.sha1(anon_name.encode("UTF-8")).hexdigest()  # nosec
+    if rec["type"] in ("array", saladp + "array"):
+        return ""
+    raise ValidationException("Expected enum or record, was {}".format(rec["type"]))
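+
+# Example of the reproducible naming (accurate by construction of the function
+# above; the input dict is made up):
+#
+#     get_anon_name({"type": "enum", "symbols": ["a", "b"]})
+#     # == "enum_" + hashlib.sha1(b"ab").hexdigest()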
+
+
+def replace_type(
+    items: Any,
+    spec: Dict[str, Any],
+    loader: Loader,
+    found: Set[str],
+    find_embeds: bool = True,
+    deepen: bool = True,
+) -> Any:
+    """Go through 'items' and replace any types that appear in the 'spec' mapping."""
+    if isinstance(items, MutableMapping):
+        # recursively check these fields for types to replace
+        if items.get("type") in ("record", "enum") and items.get("name"):
+            if items["name"] in found:
+                return items["name"]
+            found.add(items["name"])
+
+        if not deepen:
+            return items
+
+        items = copy.copy(items)
+        if not items.get("name"):
+            items["name"] = get_anon_name(items)
+        for name in ("type", "items", "fields"):
+            if name in items:
+                items[name] = replace_type(
+                    items[name],
+                    spec,
+                    loader,
+                    found,
+                    find_embeds=find_embeds,
+                    deepen=find_embeds,
+                )
+                if isinstance(items[name], MutableSequence):
+                    items[name] = flatten(items[name])
+
+        return items
+    if isinstance(items, MutableSequence):
+        # recursively transform list
+        return [
+            replace_type(i, spec, loader, found, find_embeds=find_embeds, deepen=deepen)
+            for i in items
+        ]
+    if isinstance(items, str):
+        # found a string which is a symbol corresponding to a type.
+        replace_with = None
+        if items in loader.vocab:
+            # If it's a vocabulary term, first expand it to its fully qualified
+            # URI
+            items = loader.vocab[items]
+
+        if items in spec:
+            # Look up in specialization map
+            replace_with = spec[items]
+
+        if replace_with:
+            return replace_type(
+                replace_with, spec, loader, found, find_embeds=find_embeds
+            )
+        found.add(items)
+    return items
+
+
+def avro_name(url: str) -> str:
+    """
+    Turn a URL into an Avro-safe name.
+
+    If the URL has no fragment, return the plain URL.
+
+    Otherwise, return the part of the fragment after the last slash, or the
+    whole fragment if it contains no slash.
+    """
+    frg = urldefrag(url)[1]
+    if frg != "":
+        if "/" in frg:
+            return frg[frg.rindex("/") + 1 :]
+        return frg
+    return url
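+
+# avro_name() keeps only the fragment text after its last slash; sketched with
+# a made-up URL:
+#
+#     avro_name("https://example.com/schema#Base/Derived")  # -> "Derived"
+#     avro_name("https://example.com/schema#Record")        # -> "Record"
+#     avro_name("https://example.com/schema")               # -> unchanged URL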
+ """ + + items2 = deepcopy_strip(items) + types = {i["name"]: i for i in items2} # type: Dict[str, Any] + results = [] + + for stype in items2: + if "extends" in stype: + specs = {} # type: Dict[str, str] + if "specialize" in stype: + for spec in aslist(stype["specialize"]): + specs[spec["specializeFrom"]] = spec["specializeTo"] + + exfields = [] # type: List[str] + exsym = [] # type: List[str] + for ex in aslist(stype["extends"]): + if ex not in types: + raise ValidationException( + "Extends {} in {} refers to invalid base type.".format( + stype["extends"], stype["name"] + ) + ) + + basetype = copy.copy(types[ex]) + + if stype["type"] == "record": + if specs: + basetype["fields"] = replace_type( + basetype.get("fields", []), specs, loader, set() + ) + + for field in basetype.get("fields", []): + if "inherited_from" not in field: + field["inherited_from"] = ex + + exfields.extend(basetype.get("fields", [])) + elif stype["type"] == "enum": + exsym.extend(basetype.get("symbols", [])) + + if stype["type"] == "record": + stype = copy.copy(stype) + exfields.extend(stype.get("fields", [])) + stype["fields"] = exfields + + fieldnames = set() # type: Set[str] + for field in stype["fields"]: + if field["name"] in fieldnames: + raise ValidationException( + "Field name {} appears twice in {}".format( + field["name"], stype["name"] + ) + ) + else: + fieldnames.add(field["name"]) + elif stype["type"] == "enum": + stype = copy.copy(stype) + exsym.extend(stype.get("symbols", [])) + stype["symbol"] = exsym + + types[stype["name"]] = stype + + results.append(stype) + + ex_types = {} + for result in results: + ex_types[result["name"]] = result + + extended_by = {} # type: Dict[str, str] + for result in results: + if "extends" in result: + for ex in aslist(result["extends"]): + if ex_types[ex].get("abstract"): + add_dictlist(extended_by, ex, ex_types[result["name"]]) + add_dictlist(extended_by, avro_name(ex), ex_types[ex]) + + for result in results: + if result.get("abstract") and result["name"] not in extended_by: + raise ValidationException( + "{} is abstract but missing a concrete subtype".format(result["name"]) + ) + + for result in results: + if "fields" in result: + result["fields"] = replace_type( + result["fields"], extended_by, loader, set() + ) + + return results + + +def make_avro( + i: List[Dict[str, Any]], + loader: Loader, +) -> List[Any]: + + j = extend_and_specialize(i, loader) + + name_dict = {} # type: Dict[str, Dict[str, Any]] + for entry in j: + name_dict[entry["name"]] = entry + avro = make_valid_avro(j, name_dict, set()) + + return [ + t + for t in avro + if isinstance(t, MutableMapping) + and not t.get("abstract") + and t.get("type") != "documentation" + ] + + +def make_avro_schema( + i: List[Any], + loader: Loader, +) -> Names: + """ + All in one convenience function. + + Call make_avro() and make_avro_schema_from_avro() separately if you need + the intermediate result for diagnostic output. 
+ """ + names = Names() + avro = make_avro(i, loader) + make_avsc_object(convert_to_dict(avro), names) + return names + + +def make_avro_schema_from_avro(avro: List[Union[Avro, Dict[str, str], str]]) -> Names: + names = Names() + make_avsc_object(convert_to_dict(avro), names) + return names + + +def shortname(inputid: str) -> str: + """Returns the last segment of the provided fragment or path.""" + parsed_id = urlparse(inputid) + if parsed_id.fragment: + return parsed_id.fragment.split("/")[-1] + return parsed_id.path.split("/")[-1] + + +def print_inheritance(doc: List[Dict[str, Any]], stream: IO[Any]) -> None: + """Write a Grapviz inheritance graph for the supplied document.""" + stream.write("digraph {\n") + for entry in doc: + if entry["type"] == "record": + label = name = shortname(entry["name"]) + fields = entry.get("fields", []) + if fields: + label += "\\n* {}\\l".format( + "\\l* ".join(shortname(field["name"]) for field in fields) + ) + shape = "ellipse" if entry.get("abstract") else "box" + stream.write(f'"{name}" [shape={shape} label="{label}"];\n') + if "extends" in entry: + for target in aslist(entry["extends"]): + stream.write('"{}" -> "{}";\n'.format(shortname(target), name)) + stream.write("}\n") + + +def print_fieldrefs(doc: List[Dict[str, Any]], loader: Loader, stream: IO[Any]) -> None: + """Write a GraphViz graph of the relationships between the fields.""" + obj = extend_and_specialize(doc, loader) + + primitives = { + "http://www.w3.org/2001/XMLSchema#string", + "http://www.w3.org/2001/XMLSchema#boolean", + "http://www.w3.org/2001/XMLSchema#int", + "http://www.w3.org/2001/XMLSchema#long", + saladp + "null", + saladp + "enum", + saladp + "array", + saladp + "record", + saladp + "Any", + } + + stream.write("digraph {\n") + for entry in obj: + if entry.get("abstract"): + continue + if entry["type"] == "record": + label = shortname(entry["name"]) + for field in entry.get("fields", []): + found = set() # type: Set[str] + field_name = shortname(field["name"]) + replace_type(field["type"], {}, loader, found, find_embeds=False) + for each_type in found: + if each_type not in primitives: + stream.write( + '"{}" -> "{}" [label="{}"];\n'.format( + label, shortname(each_type), field_name + ) + ) + stream.write("}\n")