diff planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/pyRdfa/state.py @ 1:56ad4e20f292 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:32:28 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/pyRdfa/state.py	Fri Jul 31 00:32:28 2020 -0400
@@ -0,0 +1,552 @@
+# -*- coding: utf-8 -*-
+"""
+Parser's execution context (a.k.a. state) object and handling. The state includes:
+
+  - language, retrieved from C{@xml:lang} or C{@lang}
+  - URI base, determined by C{<base>} or set explicitly. This is a little bit superfluous, because the current RDFa syntax does not make use of C{@xml:base}; i.e., this could be a global value.  But the structure is prepared to add C{@xml:base} easily, if needed.
+  - options, in the form of an L{options<pyRdfa.options>} instance
+  - a separate vocabulary/CURIE handling resource, in the form of an L{termorcurie<pyRdfa.TermOrCurie>} instance
+
+The execution context object is also used to handle URI-s, CURIE-s, terms, etc.
+
+@summary: RDFa parser execution context
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+"""
+
+"""
+$Id: state.py,v 1.23 2013-10-16 11:48:54 ivan Exp $
+$Date: 2013-10-16 11:48:54 $
+"""
+import sys
+(py_v_major, py_v_minor, py_v_micro, py_v_final, py_v_serial) = sys.version_info
+
+import rdflib
+from rdflib	import URIRef
+from rdflib	import Literal
+from rdflib	import BNode
+from rdflib	import Namespace
+if rdflib.__version__ >= "3.0.0" :
+	from rdflib	import RDF  as ns_rdf
+	from rdflib	import RDFS as ns_rdfs
+else :
+	from rdflib.RDFS	import RDFSNS as ns_rdfs
+	from rdflib.RDF		import RDFNS  as ns_rdf
+
+from .options	import Options
+from .utils 	import quote_URI
+from .host 		import HostLanguage, accept_xml_base, accept_xml_lang, beautifying_prefixes
+
+from .termorcurie	import TermOrCurie
+from .				import UnresolvablePrefix, UnresolvableTerm
+
+from . import err_lang
+from . import err_URI_scheme
+from . import err_illegal_safe_CURIE
+from . import err_no_CURIE_in_safe_CURIE
+from . import err_undefined_terms
+from . import err_non_legal_CURIE_ref
+from . import err_undefined_CURIE
+
+if py_v_major >= 3 :
+	from urllib.parse import urlparse, urlunparse, urlsplit, urljoin
+else :
+	from urllib.parse import urlparse, urlunparse, urlsplit, urljoin
+
+class ListStructure :
+	"""Special class to handle the C{@inlist} type structures in RDFa 1.1; stores the "origin", i.e,
+	where the list will be attached to, and the mappings as defined in the spec.
+	"""
+	def __init__(self) :
+		self.mapping = {}
+		self.origin	  = None
+
+#### Core Class definition
+class ExecutionContext :
+	"""State at a specific node, including the current set of namespaces in the RDFLib sense, current language,
+	the base, vocabularies, etc. The class is also used to interpret URI-s and CURIE-s to produce
+	URI references for RDFLib.
+
+	@ivar options: reference to the overall options
+	@type options: L{Options}
+	@ivar base: the 'base' URI
+	@ivar parsedBase: the parsed version of base, as produced by urlparse.urlsplit
+	@ivar defaultNS: default namespace (if defined via @xmlns) to be used for XML Literals
+	@ivar lang: language tag (possibly None)
+	@ivar term_or_curie: vocabulary management class instance
+	@type term_or_curie: L{termorcurie.TermOrCurie}
+	@ivar list_mapping: dictionary of arrays, containing a list of URIs key-ed via properties for lists
+	@ivar node: the node to which this state belongs
+	@type node: DOM node instance
+	@ivar rdfa_version: RDFa version of the content
+	@type rdfa_version: String
+	@ivar supress_lang: in some cases, the effect of the lang attribute should be supressed for the given node, although it should be inherited down below (example: @value attribute of the data element in HTML5)
+	@type supress_lang: Boolean
+	@cvar _list: list of attributes that allow for lists of values and should be treated as such
+	@cvar _resource_type: dictionary; mapping table from attribute name to the exact method to retrieve the URI(s). Is initialized at first instantiation.
+	"""
+
+	# list of attributes that allow for lists of values and should be treated as such
+	_list = [ "rel", "rev", "property", "typeof", "role" ]
+	# mapping table from attribute name to the exact method to retrieve the URI(s).
+	_resource_type = {}
+
+	def __init__(self, node, graph, inherited_state=None, base="", options=None, rdfa_version = None) :
+		"""
+		@param node: the current DOM Node
+		@param graph: the RDFLib Graph
+		@keyword inherited_state: the state as inherited
+		from upper layers. This inherited_state is mixed with the state information
+		retrieved from the current node.
+		@type inherited_state: L{state.ExecutionContext}
+		@keyword base: string denoting the base URI for the specific node. This overrides the possible
+		base inherited from the upper layers. The
+		current XHTML+RDFa syntax does not allow the usage of C{@xml:base}, but SVG1.2 does, so this is
+		necessary for SVG (and other possible XML dialects that accept C{@xml:base})
+		@keyword options: invocation options, and references to warning graphs
+		@type options: L{Options<pyRdfa.options>}
+		"""
+		def remove_frag_id(uri) :
+			"""
+			The fragment ID for self.base must be removed
+			"""
+			try :
+				# To be on the safe side:-)
+				t = urlparse(uri)
+				return urlunparse((t[0],t[1],t[2],t[3],t[4],""))
+			except :
+				return uri
+
+		# This is, conceptually, an additional class initialization, but it must be done run time, otherwise import errors show up
+		if len(	ExecutionContext._resource_type ) == 0 :
+			ExecutionContext._resource_type = {
+				"href"		:	ExecutionContext._URI,
+				"src"		:	ExecutionContext._URI,
+				"vocab"	    :   ExecutionContext._URI,
+
+				"about"		:	ExecutionContext._CURIEorURI,
+				"resource"	:	ExecutionContext._CURIEorURI,
+
+				"rel"		:	ExecutionContext._TERMorCURIEorAbsURI,
+				"rev"		:	ExecutionContext._TERMorCURIEorAbsURI,
+				"datatype"	:	ExecutionContext._TERMorCURIEorAbsURI,
+				"typeof"	:	ExecutionContext._TERMorCURIEorAbsURI,
+				"property"	:	ExecutionContext._TERMorCURIEorAbsURI,
+				"role"		:	ExecutionContext._TERMorCURIEorAbsURI,
+			}
+		#-----------------------------------------------------------------
+		self.node = node
+
+		#-----------------------------------------------------------------
+		# Settling the base. In a generic XML, xml:base should be accepted at all levels (though this is not the
+		# case in, say, XHTML...)
+		# At the moment, it is invoked with a 'None' at the top level of parsing, that is
+		# when the <base> element is looked for (for the HTML cases, that is)
+		if inherited_state :
+			self.rdfa_version		= inherited_state.rdfa_version
+			self.base				= inherited_state.base
+			self.options			= inherited_state.options
+
+			self.list_mapping 		= inherited_state.list_mapping
+			self.new_list			= False
+
+			# for generic XML versions the xml:base attribute should be handled
+			if self.options.host_language in accept_xml_base and node.hasAttribute("xml:base") :
+				self.base = remove_frag_id(node.getAttribute("xml:base"))
+		else :
+			# this is the branch called from the very top
+			self.list_mapping = ListStructure()
+			self.new_list	  = True
+
+			if rdfa_version is not None :
+				self.rdfa_version = rdfa_version
+			else :
+				from . import rdfa_current_version
+				self.rdfa_version = rdfa_current_version
+
+			# This value can be overwritten by a @version attribute
+			if node.hasAttribute("version") :
+				top_version = node.getAttribute("version")
+				if top_version.find("RDFa 1.0") != -1 or top_version.find("RDFa1.0") != -1 :
+					self.rdfa_version = "1.0"
+				elif top_version.find("RDFa 1.1") != -1 or top_version.find("RDFa1.1") != -1 :
+					self.rdfa_version = "1.1"
+
+			# this is just to play safe. I believe this should actually not happen...
+			if options == None :
+				from . import Options
+				self.options = Options()
+			else :
+				self.options = options
+
+			self.base = ""
+			# handle the base element case for HTML
+			if self.options.host_language in [ HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5  ] :
+				for bases in node.getElementsByTagName("base") :
+					if bases.hasAttribute("href") :
+						self.base = remove_frag_id(bases.getAttribute("href"))
+						continue
+			elif self.options.host_language in accept_xml_base and node.hasAttribute("xml:base") :
+				self.base = remove_frag_id(node.getAttribute("xml:base"))
+
+			# If no local setting for base occurs, the input argument has it
+			if self.base == "" :
+				self.base = base
+
+			# Perform an extra beautification in RDFLib
+			if self.options.host_language in beautifying_prefixes :
+				dict = beautifying_prefixes[self.options.host_language]
+				for key in dict :
+					graph.bind(key,dict[key])
+
+			input_info = "Input Host Language:%s, RDFa version:%s, base:%s" % (self.options.host_language, self.rdfa_version, self.base)
+			self.options.add_info(input_info)
+
+
+		#-----------------------------------------------------------------
+		# this will be used repeatedly, better store it once and for all...
+		self.parsedBase = urlsplit(self.base)
+
+		#-----------------------------------------------------------------
+		# generate and store the local CURIE handling class instance
+		self.term_or_curie = TermOrCurie(self, graph, inherited_state)
+
+		#-----------------------------------------------------------------
+		# Settling the language tags
+		# @lang has priority over @xml:lang
+		# it is a bit messy: the three fundamental modes (xhtml, html, or xml) are all slightly different:-(
+		# first get the inherited state's language, if any
+		if inherited_state :
+			self.lang = inherited_state.lang
+		else :
+			self.lang = None
+
+		self.supress_lang = False
+
+
+		if self.options.host_language in [ HostLanguage.xhtml, HostLanguage.xhtml5, HostLanguage.html5 ] :
+			# we may have lang and xml:lang
+			if node.hasAttribute("lang") :
+				lang = node.getAttribute("lang").lower()
+			else :
+				lang = None
+			if node.hasAttribute("xml:lang") :
+				xmllang = node.getAttribute("xml:lang").lower()
+			else :
+				xmllang = None
+			# First of all, set the value, if any
+			if xmllang != None :
+				# this has priority
+				if len(xmllang) != 0 :
+					self.lang = xmllang
+				else :
+					self.lang = None
+			elif lang != None :
+				if len(lang) != 0 :
+					self.lang = lang
+				else :
+					self.lang = None
+			# Ideally, a warning should be generated if lang and xmllang are both present with different values. But
+			# the HTML5 Parser does its magic by overriding a lang value if xmllang is present, so the potential
+			# error situations are simply swallowed...
+
+		elif self.options.host_language in accept_xml_lang and node.hasAttribute("xml:lang") :
+				self.lang = node.getAttribute("xml:lang").lower()
+				if len(self.lang) == 0 : self.lang = None
+
+		#-----------------------------------------------------------------
+		# Set the default namespace. Used when generating XML Literals
+		if node.hasAttribute("xmlns") :
+			self.defaultNS = node.getAttribute("xmlns")
+		elif inherited_state and inherited_state.defaultNS != None :
+			self.defaultNS = inherited_state.defaultNS
+		else :
+			self.defaultNS = None
+	# end __init__
+
+	def _URI(self, val) :
+		"""Returns a URI for a 'pure' URI (ie, not a CURIE). The method resolves possible relative URI-s. It also
+		checks whether the URI uses an unusual URI scheme (and issues a warning); this may be the result of an
+		uninterpreted CURIE...
+		@param val: attribute value to be interpreted
+		@type val: string
+		@return: an RDFLib URIRef instance
+		"""
+		def create_URIRef(uri, check = True) :
+			"""
+			Mini helping function: it checks whether a uri is using a usual scheme before a URIRef is created. In case
+			there is something unusual, a warning is generated (though the URIRef is created nevertheless)
+			@param uri: (absolute) URI string
+			@return: an RDFLib URIRef instance
+			"""
+			from .	import uri_schemes
+			val = uri.strip()
+			if check and urlsplit(val)[0] not in uri_schemes :
+				self.options.add_warning(err_URI_scheme % val.strip(), node=self.node.nodeName)
+			return URIRef(val)
+
+		def join(base, v, check = True) :
+			"""
+			Mini helping function: it makes a urljoin for the paths. Based on the python library, but
+			that one has a bug: in some cases it
+			swallows the '#' or '?' character at the end. This is clearly a problem with
+			Semantic Web URI-s, so this is checked, too
+			@param base: base URI string
+			@param v: local part
+			@param check: whether the URI should be checked against the list of 'existing' URI schemes
+			@return: an RDFLib URIRef instance
+			"""
+			# UGLY!!! There is a bug for a corner case in python version <= 2.5.X
+			if len(v) > 0 and v[0] == '?' and (py_v_major < 3 and py_v_minor <= 5) :
+				return create_URIRef(base+v, check)
+			####
+
+			joined = urljoin(base, v)
+			try :
+				if v[-1] != joined[-1] and (v[-1] == "#" or v[-1] == "?") :
+					return create_URIRef(joined + v[-1], check)
+				else :
+					return create_URIRef(joined, check)
+			except :
+				return create_URIRef(joined, check)
+
+		if val == "" :
+			# The fragment ID must be removed...
+			return URIRef(self.base)
+
+		# fall back on good old traditional URI-s.
+		# To be on the safe side, let us use the Python libraries
+		if self.parsedBase[0] == "" :
+			# base is, in fact, a local file name
+			# The following call is just to be sure that some pathological cases when
+			# the ':' _does_ appear in the URI but not in a scheme position is taken
+			# care of properly...
+
+			key = urlsplit(val)[0]
+			if key == "" :
+				# relative URI, to be combined with local file name:
+				return join(self.base, val, check = False)
+			else :
+				return create_URIRef(val)
+		else :
+			# Trust the python library...
+			# Well, not quite:-) there is what is, in my view, a bug in the urljoin; in some cases it
+			# swallows the '#' or '?' character at the end. This is clearly a problem with
+			# Semantic Web URI-s
+			return join(self.base, val)
+	# end _URI
+
+	def _CURIEorURI(self, val) :
+		"""Returns a URI for a (safe or not safe) CURIE. In case it is a safe CURIE but the CURIE itself
+		is not defined, an error message is issued. Otherwise, if it is not a CURIE, it is taken to be a URI
+		@param val: attribute value to be interpreted
+		@type val: string
+		@return: an RDFLib URIRef instance or None
+		"""
+		if val == "" :
+			return URIRef(self.base)
+
+		safe_curie = False
+		if val[0] == '[' :
+			# If a safe CURIE is asked for, a pure URI is not acceptable.
+			# Is checked below, and that is why the safe_curie flag is necessary
+			if val[-1] != ']' :
+				# that is certainly forbidden: an incomplete safe CURIE
+				self.options.add_warning(err_illegal_safe_CURIE % val, UnresolvablePrefix, node=self.node.nodeName)
+				return None
+			else :
+				val = val[1:-1]
+				safe_curie = True
+		# There is a branch here depending on whether we are in 1.1 or 1.0 mode
+		if self.rdfa_version >= "1.1" :
+			retval = self.term_or_curie.CURIE_to_URI(val)
+			if retval == None :
+				# the value could not be interpreted as a CURIE, ie, it did not produce any valid URI.
+				# The rule says that then the whole value should be considered as a URI
+				# except if it was part of a safe CURIE. In that case it should be ignored...
+				if safe_curie :
+					self.options.add_warning(err_no_CURIE_in_safe_CURIE % val, UnresolvablePrefix, node=self.node.nodeName)
+					return None
+				else :
+					return self._URI(val)
+			else :
+				# there is an unlikely case where the retval is actually a URIRef with a relative URI. Better filter that one out
+				if isinstance(retval, BNode) == False and urlsplit(str(retval))[0] == "" :
+					# yep, there is something wrong, a new URIRef has to be created:
+					return URIRef(self.base+str(retval))
+				else :
+					return retval
+		else :
+			# in 1.0 mode a CURIE can be considered only in case of a safe CURIE
+			if safe_curie :
+				return self.term_or_curie.CURIE_to_URI(val)
+			else :
+				return self._URI(val)
+	# end _CURIEorURI
+
+	def _TERMorCURIEorAbsURI(self, val) :
+		"""Returns a URI either for a term or for a CURIE. The value must be an NCNAME to be handled as a term; otherwise
+		the method falls back on a CURIE or an absolute URI.
+		@param val: attribute value to be interpreted
+		@type val: string
+		@return: an RDFLib URIRef instance or None
+		"""
+		from . import uri_schemes
+		# This case excludes the pure base, ie, the empty value
+		if val == "" :
+			return None
+
+		from .termorcurie import ncname, termname
+		if termname.match(val) :
+			# This is a term, must be handled as such...
+			retval = self.term_or_curie.term_to_URI(val)
+			if not retval :
+				self.options.add_warning(err_undefined_terms % val, UnresolvableTerm, node=self.node.nodeName, buggy_value = val)
+				return None
+			else :
+				return retval
+		else :
+			# try a CURIE
+			retval = self.term_or_curie.CURIE_to_URI(val)
+			if retval :
+				return retval
+			elif self.rdfa_version >= "1.1" :
+				# See if it is an absolute URI
+				scheme = urlsplit(val)[0]
+				if scheme == "" :
+					# bug; there should be no relative URIs here
+					self.options.add_warning(err_non_legal_CURIE_ref % val, UnresolvablePrefix, node=self.node.nodeName)
+					return None
+				else :
+					if scheme not in uri_schemes :
+						self.options.add_warning(err_URI_scheme % val.strip(), node=self.node.nodeName)
+					return URIRef(val)
+			else :
+				# rdfa 1.0 case
+				self.options.add_warning(err_undefined_CURIE % val.strip(), UnresolvablePrefix, node=self.node.nodeName)
+				return None
+	# end _TERMorCURIEorAbsURI
+
+	# -----------------------------------------------------------------------------------------------
+
+	def getURI(self, attr) :
+		"""Get the URI(s) for the attribute. The name of the attribute determines whether the value should be
+		a pure URI, a CURIE, etc, and whether the return is a single element of a list of those. This is done
+		using the L{ExecutionContext._resource_type} table.
+		@param attr: attribute name
+		@type attr: string
+		@return: an RDFLib URIRef instance (or None) or a list of those
+		"""
+		if self.node.hasAttribute(attr) :
+			val = self.node.getAttribute(attr)
+		else :
+			if attr in ExecutionContext._list :
+				return []
+			else :
+				return None
+
+		# This may raise an exception if the attr has no key. This, actually,
+		# should not happen if the code is correct, but it does not harm having it here...
+		try :
+			func = ExecutionContext._resource_type[attr]
+		except :
+			# Actually, this should not happen...
+			func = ExecutionContext._URI
+
+		if attr in ExecutionContext._list :
+			# Allows for a list
+			resources = [ func(self, v.strip()) for v in val.strip().split() if v != None ]
+			retval = [ r for r in resources if r != None ]
+		else :
+			retval = func(self, val.strip())
+		return retval
+	# end getURI
+
+	def getResource(self, *args) :
+		"""Get single resources from several different attributes. The first one that returns a valid URI wins.
+		@param args: variable list of attribute names, or a single attribute being a list itself.
+		@return: an RDFLib URIRef instance (or None) :
+		"""
+		if len(args) == 0 :
+			return None
+		if isinstance(args[0], tuple) or isinstance(args[0], list) :
+			rargs = args[0]
+		else :
+			rargs = args
+
+		for resource in rargs :
+			uri = self.getURI(resource)
+			if uri != None : return uri
+		return None
+
+	# -----------------------------------------------------------------------------------------------
+	def reset_list_mapping(self, origin=None) :
+		"""
+		Reset, ie, create a new empty dictionary for the list mapping.
+		"""
+		self.list_mapping = ListStructure()
+		if origin: self.set_list_origin(origin)
+		self.new_list = True
+
+	def list_empty(self) :
+		"""
+		Checks whether the list is empty.
+		@return: Boolean
+		"""
+		return len(self.list_mapping.mapping) == 0
+
+	def get_list_props(self) :
+		"""
+		Return the list of property values in the list structure
+		@return: list of URIRef
+		"""
+		return list(self.list_mapping.mapping.keys())
+
+	def get_list_value(self,prop) :
+		"""
+		Return the list of values in the list structure for a specific property
+		@return: list of RDF nodes
+		"""
+		return self.list_mapping.mapping[prop]
+
+	def set_list_origin(self, origin) :
+		"""
+		Set the origin of the list, ie, the subject to attach the final list(s) to
+		@param origin: URIRef
+		"""
+		self.list_mapping.origin = origin
+
+	def get_list_origin(self) :
+		"""
+		Return the origin of the list, ie, the subject to attach the final list(s) to
+		@return: URIRef
+		"""
+		return self.list_mapping.origin
+
+	def add_to_list_mapping(self, property, resource) :
+		"""Add a new property-resource on the list mapping structure. The latter is a dictionary of arrays;
+		if the array does not exist yet, it will be created on the fly.
+
+		@param property: the property URI, used as a key in the dictionary
+		@param resource: the resource to be added to the relevant array in the dictionary. Can be None; this is a dummy
+		placeholder for C{<span rel="property" inlist>...</span>} constructions that may be filled in by children or siblings; if not
+		an empty list has to be generated.
+		"""
+		if property in self.list_mapping.mapping :
+			if resource != None :
+				# indeed, if it is None, than it should not override anything
+				if self.list_mapping.mapping[property] == None :
+					# replacing a dummy with real content
+					self.list_mapping.mapping[property] = [ resource ]
+				else :
+					self.list_mapping.mapping[property].append(resource)
+		else :
+			if resource != None :
+				self.list_mapping.mapping[property] = [ resource ]
+			else :
+				self.list_mapping.mapping[property] = None
+
+
+####################