Mercurial > repos > shellac > sam_consensus_v3
view env/lib/python3.9/site-packages/schema_salad/schema.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
line wrap: on
line source
"""Functions to process Schema Salad schemas.""" import copy import hashlib from typing import ( IO, Any, Dict, List, Mapping, MutableMapping, MutableSequence, Optional, Set, Tuple, TypeVar, Union, cast, ) from urllib.parse import urldefrag, urlparse from pkg_resources import resource_stream from ruamel import yaml from ruamel.yaml.comments import CommentedMap, CommentedSeq from schema_salad.utils import ( CacheType, ResolveType, add_dictlist, aslist, convert_to_dict, flatten, json_dumps, ) from . import _logger, jsonld_context, ref_resolver, validate from .avro.schema import Names, SchemaParseException, make_avsc_object from .exceptions import ( ClassValidationException, SchemaSaladException, ValidationException, ) from .ref_resolver import Loader from .sourceline import SourceLine, add_lc_filename, relname SALAD_FILES = ( "metaschema.yml", "metaschema_base.yml", "salad.md", "field_name.yml", "import_include.md", "link_res.yml", "ident_res.yml", "vocab_res.yml", "vocab_res.yml", "field_name_schema.yml", "field_name_src.yml", "field_name_proc.yml", "ident_res_schema.yml", "ident_res_src.yml", "ident_res_proc.yml", "link_res_schema.yml", "link_res_src.yml", "link_res_proc.yml", "vocab_res_schema.yml", "vocab_res_src.yml", "vocab_res_proc.yml", "map_res.yml", "map_res_schema.yml", "map_res_src.yml", "map_res_proc.yml", "typedsl_res.yml", "typedsl_res_schema.yml", "typedsl_res_src.yml", "typedsl_res_proc.yml", "sfdsl_res.yml", "sfdsl_res_schema.yml", "sfdsl_res_src.yml", "sfdsl_res_proc.yml", ) saladp = "https://w3id.org/cwl/salad#" def get_metaschema() -> Tuple[Names, List[Dict[str, str]], Loader]: """Instantiate the metaschema.""" loader = ref_resolver.Loader( { "Any": saladp + "Any", "ArraySchema": saladp + "ArraySchema", "Array_symbol": saladp + "ArraySchema/type/Array_symbol", "DocType": saladp + "DocType", "Documentation": saladp + "Documentation", "Documentation_symbol": saladp + "Documentation/type/Documentation_symbol", "Documented": saladp + "Documented", "EnumSchema": saladp + "EnumSchema", "Enum_symbol": saladp + "EnumSchema/type/Enum_symbol", "JsonldPredicate": saladp + "JsonldPredicate", "NamedType": saladp + "NamedType", "PrimitiveType": saladp + "PrimitiveType", "RecordField": saladp + "RecordField", "RecordSchema": saladp + "RecordSchema", "Record_symbol": saladp + "RecordSchema/type/Record_symbol", "SaladEnumSchema": saladp + "SaladEnumSchema", "SaladRecordField": saladp + "SaladRecordField", "SaladRecordSchema": saladp + "SaladRecordSchema", "SchemaDefinedType": saladp + "SchemaDefinedType", "SpecializeDef": saladp + "SpecializeDef", "_container": saladp + "JsonldPredicate/_container", "_id": {"@id": saladp + "_id", "@type": "@id", "identity": True}, "_type": saladp + "JsonldPredicate/_type", "abstract": saladp + "SaladRecordSchema/abstract", "array": saladp + "array", "boolean": "http://www.w3.org/2001/XMLSchema#boolean", "dct": "http://purl.org/dc/terms/", "default": {"@id": saladp + "default", "noLinkCheck": True}, "doc": "rdfs:comment", "docAfter": {"@id": saladp + "docAfter", "@type": "@id"}, "docChild": {"@id": saladp + "docChild", "@type": "@id"}, "docParent": {"@id": saladp + "docParent", "@type": "@id"}, "documentRoot": saladp + "SchemaDefinedType/documentRoot", "documentation": saladp + "documentation", "double": "http://www.w3.org/2001/XMLSchema#double", "enum": saladp + "enum", "extends": {"@id": saladp + "extends", "@type": "@id", "refScope": 1}, "fields": { "@id": saladp + "fields", "mapPredicate": "type", "mapSubject": "name", }, "float": "http://www.w3.org/2001/XMLSchema#float", "identity": saladp + "JsonldPredicate/identity", "inVocab": saladp + "NamedType/inVocab", "int": "http://www.w3.org/2001/XMLSchema#int", "items": {"@id": saladp + "items", "@type": "@vocab", "refScope": 2}, "jsonldPredicate": "sld:jsonldPredicate", "long": "http://www.w3.org/2001/XMLSchema#long", "mapPredicate": saladp + "JsonldPredicate/mapPredicate", "mapSubject": saladp + "JsonldPredicate/mapSubject", "name": "@id", "noLinkCheck": saladp + "JsonldPredicate/noLinkCheck", "null": saladp + "null", "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", "rdfs": "http://www.w3.org/2000/01/rdf-schema#", "record": saladp + "record", "refScope": saladp + "JsonldPredicate/refScope", "sld": saladp, "specialize": { "@id": saladp + "specialize", "mapPredicate": "specializeTo", "mapSubject": "specializeFrom", }, "specializeFrom": { "@id": saladp + "specializeFrom", "@type": "@id", "refScope": 1, }, "specializeTo": { "@id": saladp + "specializeTo", "@type": "@id", "refScope": 1, }, "string": "http://www.w3.org/2001/XMLSchema#string", "subscope": saladp + "JsonldPredicate/subscope", "symbols": {"@id": saladp + "symbols", "@type": "@id", "identity": True}, "type": { "@id": saladp + "type", "@type": "@vocab", "refScope": 2, "typeDSL": True, }, "typeDSL": saladp + "JsonldPredicate/typeDSL", "xsd": "http://www.w3.org/2001/XMLSchema#", } ) for salad in SALAD_FILES: with resource_stream("schema_salad", "metaschema/" + salad) as stream: loader.cache["https://w3id.org/cwl/" + salad] = stream.read().decode( "UTF-8" ) with resource_stream("schema_salad", "metaschema/metaschema.yml") as stream: loader.cache["https://w3id.org/cwl/salad"] = stream.read().decode("UTF-8") j = yaml.main.round_trip_load(loader.cache["https://w3id.org/cwl/salad"]) add_lc_filename(j, "metaschema.yml") j2 = loader.resolve_all(j, saladp)[0] if not isinstance(j2, list): _logger.error("%s", j2) raise SchemaParseException(f"Not a list: {j2}") else: sch_obj = make_avro(j2, loader) try: sch_names = make_avro_schema_from_avro(sch_obj) except SchemaParseException: _logger.error("Metaschema error, avro was:\n%s", json_dumps(sch_obj, indent=4)) raise validate_doc(sch_names, j2, loader, strict=True) return (sch_names, j2, loader) def add_namespaces( metadata: Mapping[str, Any], namespaces: MutableMapping[str, str] ) -> None: """Collect the provided namespaces, checking for conflicts.""" for key, value in metadata.items(): if key not in namespaces: namespaces[key] = value elif namespaces[key] != value: raise ValidationException( "Namespace prefix '{}' has conflicting definitions '{}'" " and '{}'.".format(key, namespaces[key], value) ) def collect_namespaces(metadata: Mapping[str, Any]) -> Dict[str, str]: """Walk through the metadata object, collecting namespace declarations.""" namespaces = {} # type: Dict[str, str] if "$import_metadata" in metadata: for value in metadata["$import_metadata"].values(): add_namespaces(collect_namespaces(value), namespaces) if "$namespaces" in metadata: add_namespaces(metadata["$namespaces"], namespaces) return namespaces schema_type = Tuple[Loader, Union[Names, SchemaParseException], Dict[str, Any], Loader] def load_schema( schema_ref: ResolveType, cache: Optional[CacheType] = None, ) -> schema_type: """ Load a schema that can be used to validate documents using load_and_validate. return: document_loader, avsc_names, schema_metadata, metaschema_loader """ metaschema_names, _metaschema_doc, metaschema_loader = get_metaschema() if cache is not None: metaschema_loader.cache.update(cache) schema_doc, schema_metadata = metaschema_loader.resolve_ref(schema_ref, "") if not isinstance(schema_doc, MutableSequence): raise ValidationException("Schema reference must resolve to a list.") validate_doc(metaschema_names, schema_doc, metaschema_loader, True) metactx = schema_metadata.get("@context", {}) metactx.update(collect_namespaces(schema_metadata)) schema_ctx = jsonld_context.salad_to_jsonld_context(schema_doc, metactx)[0] # Create the loader that will be used to load the target document. document_loader = Loader(schema_ctx, cache=cache) # Make the Avro validation that will be used to validate the target # document avsc_names = make_avro_schema(schema_doc, document_loader) return document_loader, avsc_names, schema_metadata, metaschema_loader def load_and_validate( document_loader: Loader, avsc_names: Names, document: Union[CommentedMap, str], strict: bool, strict_foreign_properties: bool = False, ) -> Tuple[Any, Dict[str, Any]]: """Load a document and validate it with the provided schema. return data, metadata """ try: if isinstance(document, CommentedMap): data, metadata = document_loader.resolve_all( document, document["id"], checklinks=True, strict_foreign_properties=strict_foreign_properties, ) else: data, metadata = document_loader.resolve_ref( document, checklinks=True, strict_foreign_properties=strict_foreign_properties, ) validate_doc( avsc_names, data, document_loader, strict, strict_foreign_properties=strict_foreign_properties, ) except ValidationException as exc: raise ValidationException("", None, [exc]) from exc return data, metadata def validate_doc( schema_names: Names, doc: ResolveType, loader: Loader, strict: bool, strict_foreign_properties: bool = False, ) -> None: """Validate a document using the provided schema.""" has_root = False for root in schema_names.names.values(): if (hasattr(root, "get_prop") and root.get_prop("documentRoot")) or ( "documentRoot" in root.props ): has_root = True break if not has_root: raise ValidationException("No document roots defined in the schema") if isinstance(doc, MutableSequence): vdoc = doc elif isinstance(doc, CommentedMap): vdoc = CommentedSeq([doc]) vdoc.lc.add_kv_line_col(0, [doc.lc.line, doc.lc.col]) vdoc.lc.filename = doc.lc.filename else: raise ValidationException("Document must be dict or list") roots = [] for root in schema_names.names.values(): if (hasattr(root, "get_prop") and root.get_prop("documentRoot")) or ( root.props.get("documentRoot") ): roots.append(root) anyerrors = [] for pos, item in enumerate(vdoc): sourceline = SourceLine(vdoc, pos, str) success = False for root in roots: success = validate.validate_ex( root, item, loader.identifiers, strict, foreign_properties=loader.foreign_properties, raise_ex=False, skip_foreign_properties=loader.skip_schemas, strict_foreign_properties=strict_foreign_properties, ) if success: break if not success: errors = [] # type: List[SchemaSaladException] for root in roots: if hasattr(root, "get_prop"): name = root.get_prop("name") elif hasattr(root, "name"): name = root.name try: validate.validate_ex( root, item, loader.identifiers, strict, foreign_properties=loader.foreign_properties, raise_ex=True, skip_foreign_properties=loader.skip_schemas, strict_foreign_properties=strict_foreign_properties, ) except ClassValidationException as exc1: errors = [ ClassValidationException( f"tried `{name}` but", sourceline, [exc1] ) ] break except ValidationException as exc2: errors.append( ValidationException(f"tried `{name}` but", sourceline, [exc2]) ) objerr = "Invalid" for ident in loader.identifiers: if ident in item: objerr = "Object `{}` is not valid because".format( relname(item[ident]) ) break anyerrors.append(ValidationException(objerr, sourceline, errors, "-")) if anyerrors: raise ValidationException("", None, anyerrors, "*") def get_anon_name( rec: MutableMapping[str, Union[str, Dict[str, str], List[str]]] ) -> str: """Calculate a reproducible name for anonymous types.""" if "name" in rec: name = rec["name"] if isinstance(name, str): return name raise ValidationException(f"Expected name field to be a string, was {name}") anon_name = "" if rec["type"] in ("enum", saladp + "enum"): for sym in rec["symbols"]: anon_name += sym return "enum_" + hashlib.sha1(anon_name.encode("UTF-8")).hexdigest() # nosec if rec["type"] in ("record", saladp + "record"): for field in rec["fields"]: if isinstance(field, Mapping): anon_name += field["name"] else: raise ValidationException( "Expected entries in 'fields' to also be maps, was {}.".format( field ) ) return "record_" + hashlib.sha1(anon_name.encode("UTF-8")).hexdigest() # nosec if rec["type"] in ("array", saladp + "array"): return "" raise ValidationException("Expected enum or record, was {}".format(rec["type"])) def replace_type( items: Any, spec: Dict[str, Any], loader: Loader, found: Set[str], find_embeds: bool = True, deepen: bool = True, ) -> Any: """ Go through and replace types in the 'spec' mapping""" if isinstance(items, MutableMapping): # recursively check these fields for types to replace if items.get("type") in ("record", "enum") and items.get("name"): if items["name"] in found: return items["name"] found.add(items["name"]) if not deepen: return items items = copy.copy(items) if not items.get("name"): items["name"] = get_anon_name(items) for name in ("type", "items", "fields"): if name in items: items[name] = replace_type( items[name], spec, loader, found, find_embeds=find_embeds, deepen=find_embeds, ) if isinstance(items[name], MutableSequence): items[name] = flatten(items[name]) return items if isinstance(items, MutableSequence): # recursively transform list return [ replace_type(i, spec, loader, found, find_embeds=find_embeds, deepen=deepen) for i in items ] if isinstance(items, str): # found a string which is a symbol corresponding to a type. replace_with = None if items in loader.vocab: # If it's a vocabulary term, first expand it to its fully qualified # URI items = loader.vocab[items] if items in spec: # Look up in specialization map replace_with = spec[items] if replace_with: return replace_type( replace_with, spec, loader, found, find_embeds=find_embeds ) found.add(items) return items def avro_name(url: str) -> str: """ Turn a URL into an Avro-safe name. If the URL has no fragment, return this plain URL. Extract either the last part of the URL fragment past the slash, otherwise the whole fragment. """ frg = urldefrag(url)[1] if frg != "": if "/" in frg: return frg[frg.rindex("/") + 1 :] return frg return url Avro = TypeVar("Avro", MutableMapping[str, Any], MutableSequence[Any], str) def make_valid_avro( items: Avro, alltypes: Dict[str, Dict[str, Any]], found: Set[str], union: bool = False, ) -> Union[ Avro, MutableMapping[str, str], str, List[Union[Any, MutableMapping[str, str], str]] ]: """Convert our schema to be more avro like.""" # Possibly could be integrated into our fork of avro/schema.py? if isinstance(items, MutableMapping): avro = copy.copy(items) if avro.get("name") and avro.get("inVocab", True): avro["name"] = avro_name(avro["name"]) if "type" in avro and avro["type"] in ( saladp + "record", saladp + "enum", "record", "enum", ): if (hasattr(avro, "get") and avro.get("abstract")) or ("abstract" in avro): return avro if avro["name"] in found: return cast(str, avro["name"]) found.add(avro["name"]) for field in ("type", "items", "values", "fields"): if field in avro: avro[field] = make_valid_avro(avro[field], alltypes, found, union=True) if "symbols" in avro: avro["symbols"] = [avro_name(sym) for sym in avro["symbols"]] return avro if isinstance(items, MutableSequence): ret = [] for i in items: ret.append(make_valid_avro(i, alltypes, found, union=union)) return ret if union and isinstance(items, str): if items in alltypes and avro_name(items) not in found: return make_valid_avro(alltypes[items], alltypes, found, union=union) return avro_name(items) else: return items def deepcopy_strip(item: Any) -> Any: """ Make a deep copy of list and dict objects. Intentionally do not copy attributes. This is to discard CommentedMap and CommentedSeq metadata which is very expensive with regular copy.deepcopy. """ if isinstance(item, MutableMapping): return {k: deepcopy_strip(v) for k, v in item.items()} if isinstance(item, MutableSequence): return [deepcopy_strip(k) for k in item] return item def extend_and_specialize( items: List[Dict[str, Any]], loader: Loader ) -> List[Dict[str, Any]]: """ Apply 'extend' and 'specialize' to fully materialize derived record types. """ items2 = deepcopy_strip(items) types = {i["name"]: i for i in items2} # type: Dict[str, Any] results = [] for stype in items2: if "extends" in stype: specs = {} # type: Dict[str, str] if "specialize" in stype: for spec in aslist(stype["specialize"]): specs[spec["specializeFrom"]] = spec["specializeTo"] exfields = [] # type: List[str] exsym = [] # type: List[str] for ex in aslist(stype["extends"]): if ex not in types: raise ValidationException( "Extends {} in {} refers to invalid base type.".format( stype["extends"], stype["name"] ) ) basetype = copy.copy(types[ex]) if stype["type"] == "record": if specs: basetype["fields"] = replace_type( basetype.get("fields", []), specs, loader, set() ) for field in basetype.get("fields", []): if "inherited_from" not in field: field["inherited_from"] = ex exfields.extend(basetype.get("fields", [])) elif stype["type"] == "enum": exsym.extend(basetype.get("symbols", [])) if stype["type"] == "record": stype = copy.copy(stype) exfields.extend(stype.get("fields", [])) stype["fields"] = exfields fieldnames = set() # type: Set[str] for field in stype["fields"]: if field["name"] in fieldnames: raise ValidationException( "Field name {} appears twice in {}".format( field["name"], stype["name"] ) ) else: fieldnames.add(field["name"]) elif stype["type"] == "enum": stype = copy.copy(stype) exsym.extend(stype.get("symbols", [])) stype["symbol"] = exsym types[stype["name"]] = stype results.append(stype) ex_types = {} for result in results: ex_types[result["name"]] = result extended_by = {} # type: Dict[str, str] for result in results: if "extends" in result: for ex in aslist(result["extends"]): if ex_types[ex].get("abstract"): add_dictlist(extended_by, ex, ex_types[result["name"]]) add_dictlist(extended_by, avro_name(ex), ex_types[ex]) for result in results: if result.get("abstract") and result["name"] not in extended_by: raise ValidationException( "{} is abstract but missing a concrete subtype".format(result["name"]) ) for result in results: if "fields" in result: result["fields"] = replace_type( result["fields"], extended_by, loader, set() ) return results def make_avro( i: List[Dict[str, Any]], loader: Loader, ) -> List[Any]: j = extend_and_specialize(i, loader) name_dict = {} # type: Dict[str, Dict[str, Any]] for entry in j: name_dict[entry["name"]] = entry avro = make_valid_avro(j, name_dict, set()) return [ t for t in avro if isinstance(t, MutableMapping) and not t.get("abstract") and t.get("type") != "documentation" ] def make_avro_schema( i: List[Any], loader: Loader, ) -> Names: """ All in one convenience function. Call make_avro() and make_avro_schema_from_avro() separately if you need the intermediate result for diagnostic output. """ names = Names() avro = make_avro(i, loader) make_avsc_object(convert_to_dict(avro), names) return names def make_avro_schema_from_avro(avro: List[Union[Avro, Dict[str, str], str]]) -> Names: names = Names() make_avsc_object(convert_to_dict(avro), names) return names def shortname(inputid: str) -> str: """Returns the last segment of the provided fragment or path.""" parsed_id = urlparse(inputid) if parsed_id.fragment: return parsed_id.fragment.split("/")[-1] return parsed_id.path.split("/")[-1] def print_inheritance(doc: List[Dict[str, Any]], stream: IO[Any]) -> None: """Write a Grapviz inheritance graph for the supplied document.""" stream.write("digraph {\n") for entry in doc: if entry["type"] == "record": label = name = shortname(entry["name"]) fields = entry.get("fields", []) if fields: label += "\\n* {}\\l".format( "\\l* ".join(shortname(field["name"]) for field in fields) ) shape = "ellipse" if entry.get("abstract") else "box" stream.write(f'"{name}" [shape={shape} label="{label}"];\n') if "extends" in entry: for target in aslist(entry["extends"]): stream.write('"{}" -> "{}";\n'.format(shortname(target), name)) stream.write("}\n") def print_fieldrefs(doc: List[Dict[str, Any]], loader: Loader, stream: IO[Any]) -> None: """Write a GraphViz graph of the relationships between the fields.""" obj = extend_and_specialize(doc, loader) primitives = { "http://www.w3.org/2001/XMLSchema#string", "http://www.w3.org/2001/XMLSchema#boolean", "http://www.w3.org/2001/XMLSchema#int", "http://www.w3.org/2001/XMLSchema#long", saladp + "null", saladp + "enum", saladp + "array", saladp + "record", saladp + "Any", } stream.write("digraph {\n") for entry in obj: if entry.get("abstract"): continue if entry["type"] == "record": label = shortname(entry["name"]) for field in entry.get("fields", []): found = set() # type: Set[str] field_name = shortname(field["name"]) replace_type(field["type"], {}, loader, found, find_embeds=False) for each_type in found: if each_type not in primitives: stream.write( '"{}" -> "{}" [label="{}"];\n'.format( label, shortname(each_type), field_name ) ) stream.write("}\n")