diff planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/pyMicrodata/utils.py @ 1:56ad4e20f292 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:32:28 -0400
parents
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/pyMicrodata/utils.py	Fri Jul 31 00:32:28 2020 -0400
@@ -0,0 +1,310 @@
+# -*- coding: utf-8 -*-
+"""
+Various utilities for pyMicrodata
+
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+"""
+
+"""
+$Id: utils.py,v 1.7 2012/09/01 15:17:28 ivan Exp $
+$Date: 2012/09/01 15:17:28 $
+"""
+import os, os.path, sys
+(py_v_major, py_v_minor, py_v_micro, py_v_final, py_v_serial) = sys.version_info
+
+if py_v_major >= 3 :
+	from urllib.request import Request, urlopen
+	from urllib.parse   import urljoin, quote, urlparse
+	from http.server    import BaseHTTPRequestHandler
+	from urllib.error   import HTTPError as urllib_HTTPError
+else :
+	from urllib2        import Request, urlopen
+	from urllib2        import HTTPError as urllib_HTTPError
+	from urlparse       import urljoin, urlparse
+	from urllib         import quote
+	from BaseHTTPServer import BaseHTTPRequestHandler
+
+import re
+from datetime import datetime
+
+from rdflib	import BNode
+import rdflib
+if rdflib.__version__ >= "3.0.0" :
+	from rdflib	import RDF as ns_rdf
+else :
+	from rdflib.RDF	import RDFNS  as ns_rdf
+
+#################################################################################
+def is_absolute_URI( uri ) :
+	return urlparse(uri)[0] != ""
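+# Illustrative usage (doctest-style sketch, not part of the upstream module; the URIs below are arbitrary examples):
+#   >>> is_absolute_URI("http://www.w3.org/")
+#   True
+#   >>> is_absolute_URI("../relative/path")
+#   False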
+
+#################################################################################
+
+def fragment_escape( name ) :
+	return quote(name, '/~:-.')
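+# Illustrative usage (doctest-style sketch, not part of the upstream module): characters outside the safe set
+# '/~:-.' (plus the characters quote() never escapes) are percent-encoded:
+#   >>> fragment_escape("a b")
+#   'a%20b'
+#   >>> fragment_escape("name.with-safe:chars")
+#   'name.with-safe:chars'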
+
+#################################################################################
+
+def generate_URI(base, v) :
+	"""
+	Generate an (absolute) URI: if v is a relative URI or a fragment, resolve it against base;
+	otherwise just return the value.
+	@param base: Absolute URI for base
+	@param v: relative or absolute URI
+	"""
+	if is_absolute_URI( v ) :
+		return v
+	else :
+		# UGLY!!! There is a bug for a corner case in python versions <= 2.5.X
+		if len(v) > 0 and v[0] == '?' and (py_v_major < 3 and py_v_minor <= 5) :
+			return base + v
+		####
+
+		# Trust the python library...
+		# Well, not quite:-) there is what is, in my view, a bug in the urljoin; in some cases it
+		# swallows the '#' or '?' character at the end. This is clearly a problem with
+		# Semantic Web URI-s
+		v = fragment_escape(v.strip())
+		joined = urljoin(base, v)
+		try :
+			if v[-1] != joined[-1] and (v[-1] == "#" or v[-1] == "?") :
+				return joined + v[-1]
+			else :
+				return joined
+		except IndexError :
+			# v or joined is the empty string; nothing to restore
+			return joined
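+# Illustrative usage (doctest-style sketch, not part of the upstream module; the URIs below are arbitrary examples):
+#   >>> generate_URI("http://www.example.org/base/doc", "http://www.w3.org/")
+#   'http://www.w3.org/'
+#   >>> generate_URI("http://www.example.org/base/", "data/item")
+#   'http://www.example.org/base/data/item'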
+
+#################################################################################
+def generate_RDF_collection( graph, vals ) :
+	"""
+	Generate an RDF List from vals; return the head of the list.
+	@param graph: RDF graph
+	@type graph: RDFLib Graph
+	@param vals: array of RDF Resources
+	@return: head of the List (an RDF Resource)
+	"""
+	# generate an RDF List, returns the head
+	# list has all the elements in RDF format already
+	heads = [ BNode() for r in vals ] + [ ns_rdf["nil"] ]
+	for i in range(0, len(vals)) :
+		graph.add( (heads[i], ns_rdf["first"], vals[i]) )
+		graph.add( (heads[i], ns_rdf["rest"],  heads[i+1]) )
+	return heads[0]
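+# Illustrative usage (sketch, not part of the upstream module; the graph and literals below are arbitrary):
+#   from rdflib import Graph, Literal
+#   g = Graph()
+#   head = generate_RDF_collection(g, [Literal("a"), Literal("b")])
+#   # 'head' is the first cell of an rdf:first/rdf:rest chain added to g, terminated by rdf:nil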
+
+#################################################################################
+def get_Literal(Pnode):
+	"""
+	Get (recursively) the full text from a DOM Node.
+
+	@param Pnode: DOM Node
+	@return: string
+	"""
+	rc = ""
+	for node in Pnode.childNodes:
+		if node.nodeType == node.TEXT_NODE:
+			rc = rc + node.data
+		elif node.nodeType == node.ELEMENT_NODE :
+			rc = rc + get_Literal(node)
+
+	# This presupposes that all spaces and such should be stripped. I am not sure it is true in the spec,
+	# but this is what the examples show
+	# return re.sub(r'(\r| |\n|\t)+'," ",rc).strip()
+
+	# at present, the agreement seems to say that white spaces are maintained:
+	return rc
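+# Illustrative usage (sketch, not part of the upstream module; uses xml.dom.minidom with arbitrary markup):
+#   from xml.dom.minidom import parseString
+#   dom = parseString("<p>Hello <b>World</b>!</p>")
+#   get_Literal(dom.documentElement)    # -> 'Hello World!'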
+
+#################################################################################
+def get_lang(node) :
+	# we may have lang and xml:lang
+	retval  = None
+	if node.hasAttribute("lang") :
+		retval = node.getAttribute("lang")
+	if retval and node.hasAttribute("xml:lang") :
+		xmllang = node.getAttribute("xml:lang").lower()
+		if xmllang != retval.lower() :
+			# This is an error, in which case retval must be invalidated...
+			retval = None
+	return retval
+
+def get_lang_from_hierarchy(document, node) :
+	lang = get_lang(node)
+	if lang == None :
+		parent = node.parentNode
+		if parent != None and parent != document :
+			return get_lang_from_hierarchy(document, parent)
+		else :
+			return get_lang(document)
+	else :
+		return lang
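+# Illustrative usage (sketch, not part of the upstream module; uses xml.dom.minidom with arbitrary markup).
+# Note that lang and xml:lang must agree (case-insensitively) for a value to be returned:
+#   from xml.dom.minidom import parseString
+#   dom = parseString('<div lang="en"><span>text</span></div>')
+#   get_lang_from_hierarchy(dom, dom.getElementsByTagName("span")[0])    # -> 'en'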
+
+#################################################################################
+datetime_type 	= "http://www.w3.org/2001/XMLSchema#dateTime"
+time_type 	 	= "http://www.w3.org/2001/XMLSchema#time"
+date_type 	 	= "http://www.w3.org/2001/XMLSchema#date"
+date_gYear		= "http://www.w3.org/2001/XMLSchema#gYear"
+date_gYearMonth	= "http://www.w3.org/2001/XMLSchema#gYearMonth"
+date_gMonthDay	= "http://www.w3.org/2001/XMLSchema#gMonthDay"
+duration_type	= "http://www.w3.org/2001/XMLSchema#duration"
+
+_formats = {
+	date_gMonthDay	  : [ "%m-%d" ],
+	date_gYearMonth	  : [ "%Y-%m"],
+	date_gYear     	  : [ "%Y" ],
+	date_type      	  : [ "%Y-%m-%d", "%Y-%m-%dZ" ],
+	time_type      	  : [ "%H:%M",
+					      "%H:%M:%S",
+					      "%H:%M:%SZ",
+					      "%H:%M:%S.%f" ],
+	datetime_type  	  : [ "%Y-%m-%dT%H:%M",
+					      "%Y-%m-%dT%H:%M:%S",
+					      "%Y-%m-%dT%H:%M:%S.%f",
+					      "%Y-%m-%dT%H:%MZ",
+					      "%Y-%m-%dT%H:%M:%SZ",
+					      "%Y-%m-%dT%H:%M:%S.%fZ" ],
+	duration_type     : [ "P%dD",
+						  "P%YY%mM%dD",
+						  "P%YY%mM",
+						  "P%YY%dD",
+						  "P%YY",
+						  "P%mM",
+						  "P%mM%dD",
+						 ],
+}
+
+_dur_times = [ "%HH%MM%SS", "%HH", "%MM", "%SS", "%HH%MM", "%HH%SS", "%MM%SS" ]
+
+def get_time_type(string) :
+	"""
+	Check whether the string conforms to one of the accepted time-related datatypes, and return that datatype if so.
+	@param string: the attribute value to be checked
+	@return: a datatype URI or None
+	"""
+	for key in _formats :
+		for format in _formats[key] :
+			try :
+				# try to check if the syntax is fine
+				d = datetime.strptime(string, format)
+				# bingo!
+				return key
+			except ValueError :
+				pass
+
+	# Now come the special cases:-(
+	# Check first for the duration stuff, that is the nastiest.
+	if len(string) > 2 and (string[0] == 'P' or (string[0] == '-' and string[1] == 'P')) :
+		# this is meant to be a duration type
+		# first of all, get rid of a possible leading '-' and check the plain duration forms again
+		if string[0] == '-' :
+			string = string[1:]
+		for format in _formats[duration_type] :
+			try :
+				# try to check if the syntax is fine
+				d = datetime.strptime(string, format)
+				# bingo!
+				return duration_type
+			except ValueError :
+				pass
+		# Let us see if the value contains a separate time portion, and cut that one
+		durs = string.split('T')
+		if len(durs) == 2 :
+			# yep, so we should check again
+			dur = durs[0]
+			tm  = durs[1]
+			# Check the duration part
+			td = False
+			for format in _formats[duration_type] :
+				try :
+					# try to check if the syntax is fine
+					d = datetime.strptime(dur, format)
+					# bingo!
+					td = True
+					break
+				except ValueError :
+					pass
+			if td == True :
+				# Getting there...
+				for format in _dur_times :
+					try :
+						# try to check if the syntax is fine
+						d = datetime.strptime(tm, format)
+						# bingo!
+						return duration_type
+					except ValueError :
+						pass
+			# something went wrong...
+			return None
+		else :
+			# Well, no more tricks, this is a plain type
+			return None
+
+	# If we got here, we should check the time zone
+	# there is a discrepancy between the python and the HTML5/XSD lexical string,
+	# which means that this has to be handled separately for the date and the timezone portion
+	try :
+		# The time-zone-less portion of the string
+		dt_string = string[0:-6]
+		# The time-zone portion
+		tz = string[-5:]
+		try :
+			t = datetime.strptime(tz,"%H:%M")
+		except ValueError :
+			# Bummer, this is not a correct time
+			return None
+		# The time-zone is fine, the datetime portion has to be checked
+		for format in _formats[datetime_type] :
+			try :
+				# try to check if it is fine
+				d = datetime.strptime(dt_string, format)
+				# Bingo!
+				return datetime_type
+			except ValueError :
+				pass
+	except :
+		pass
+	return None
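+# Illustrative usage (doctest-style sketch, not part of the upstream module; the literals below are arbitrary examples):
+#   >>> get_time_type("2012-09-01")
+#   'http://www.w3.org/2001/XMLSchema#date'
+#   >>> get_time_type("2012-09-01T15:17:28Z")
+#   'http://www.w3.org/2001/XMLSchema#dateTime'
+#   >>> get_time_type("P1DT2H")
+#   'http://www.w3.org/2001/XMLSchema#duration'
+#   >>> get_time_type("not a date") is None
+#   True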
+
+
+#########################################################################################################
+# Handling URIs
+class URIOpener :
+	"""A wrapper around urllib's Request/urlopen machinery to open a resource. Beyond accessing the data
+	itself, the class sets the content location.
+	The class also adds an Accept header to the outgoing request, namely
+	text/html and application/xhtml+xml.
+
+	@ivar data: the real data, i.e., a file-like object
+	@ivar headers: the return headers as sent back by the server
+	@ivar location: the real location of the data (i.e., after possible redirection and content negotiation)
+	"""
+	CONTENT_LOCATION	= 'Content-Location'
+	def __init__(self, name) :
+		"""
+		@param name: URL to be opened
+		"""
+		try :
+			# Note the removal of the fragment ID. This is necessary, per the HTTP spec
+			req = Request(url=name.split('#')[0])
+
+			req.add_header('Accept', 'text/html, application/xhtml+xml')
+
+			self.data		= urlopen(req)
+			self.headers	= self.data.info()
+
+			if URIOpener.CONTENT_LOCATION in self.headers :
+				self.location = urljoin(self.data.geturl(),self.headers[URIOpener.CONTENT_LOCATION])
+			else :
+				self.location = name
+
+		except urllib_HTTPError :
+			e = sys.exc_info()[1]
+			from pyMicrodata import HTTPError
+			msg = BaseHTTPRequestHandler.responses[e.code]
+			raise HTTPError('%s' % msg[1], e.code)
+		except Exception :
+			e = sys.exc_info()[1]
+			from pyMicrodata import MicrodataError
+			raise MicrodataError('%s' % e)
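+# Illustrative usage (sketch, not part of the upstream module; performs a real network request, the URL below is arbitrary):
+#   opener = URIOpener("http://www.w3.org/")
+#   opener.location             # location derived from the Content-Location header (or the requested URL)
+#   opener.headers              # response headers
+#   content = opener.data.read()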
+