comparison planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/pyMicrodata/utils.py @ 1:56ad4e20f292 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:32:28 -0400
parents
children
comparison
equal deleted inserted replaced
0:d30785e31577 1:56ad4e20f292
1 # -*- coding: utf-8 -*-
2 """
3 Various utilities for pyMicrodata
4
5 @organization: U{World Wide Web Consortium<http://www.w3.org>}
6 @author: U{Ivan Herman<http://www.w3.org/People/Ivan/>}
7 @license: This software is available for use under the
8 U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}
9 """
10
11 """
12 $Id: utils.py,v 1.7 2012/09/01 15:17:28 ivan Exp $
13 $Date: 2012/09/01 15:17:28 $
14 """
15 import os, os.path, sys
16 (py_v_major, py_v_minor, py_v_micro, py_v_final, py_v_serial) = sys.version_info
17
18 if py_v_major >= 3 :
19 from urllib.request import Request, urlopen
20 from urllib.parse import urljoin, quote, urlparse
21 from http.server import BaseHTTPRequestHandler
22 from urllib.error import HTTPError as urllib_HTTPError
23 else :
24 from urllib.request import Request, urlopen
25 from urllib.error import HTTPError as urllib_HTTPError
26 from urllib.parse import urljoin, urlparse
27 from urllib.parse import quote
28 from http.server import BaseHTTPRequestHandler
29
30 import re
31 from datetime import datetime
32
33 from rdflib import BNode
34 import rdflib
35 if rdflib.__version__ >= "3.0.0" :
36 from rdflib import RDF as ns_rdf
37 else :
38 from rdflib.RDF import RDFNS as ns_rdf
39
40 #################################################################################
def is_absolute_URI(uri):
    """
    Tell whether a URI reference is absolute, i.e. whether it carries a scheme.

    @param uri: the URI reference to examine
    @return: True if the parsed URI has a non-empty scheme component, False otherwise
    """
    scheme = urlparse(uri).scheme
    return scheme != ""
43
44 #################################################################################
45
def fragment_escape(name):
    """
    Percent-encode a string, leaving only the characters C{/ ~ : - .}
    (plus those C{quote} never encodes) untouched.

    @param name: the string to be escaped
    @return: the percent-encoded string
    """
    untouched = '/~:-.'
    return quote(name, safe=untouched)
48
49 #################################################################################
50
def generate_URI(base, v):
    """
    Generate an (absolute) URI; if v is already absolute it is returned as is,
    otherwise it is escaped and resolved against base.

    @param base: Absolute URI for base
    @param v: relative or absolute URI
    @return: the resulting URI string
    """
    if is_absolute_URI(v):
        return v

    # There is a bug for a corner case in python version <= 2.5.X: a reference
    # consisting of a query only is not joined properly, so it is appended by hand.
    # (The original code returned the undefined name 'val' here.)
    if len(v) > 0 and v[0] == '?' and (py_v_major < 3 and py_v_minor <= 5):
        return base + v

    # Escape the (stripped) reference, then let the python library resolve it
    v = fragment_escape(v.strip())
    joined = urljoin(base, v)

    # urljoin may swallow a '#' or '?' character at the end, which is a problem
    # for Semantic Web URI-s; put it back if that happened.
    # NOTE(review): fragment_escape percent-encodes '#' and '?', so after the
    # escaping above v does not seem to be able to end with either character;
    # confirm whether this check is still reachable.
    # Empty strings are guarded explicitly instead of relying on a bare except.
    if v and joined and v[-1] != joined[-1] and v[-1] in ("#", "?"):
        return joined + v[-1]
    return joined
79
80 #################################################################################
def generate_RDF_collection(graph, vals):
    """
    Build an RDF List out of vals inside graph and hand back its head.

    One blank node is created per value; each gets an rdf:first triple pointing
    to the value and an rdf:rest triple pointing to the next cell, the last cell
    pointing to rdf:nil. For an empty vals the head is rdf:nil itself.

    @param graph: RDF graph
    @type graph: RDFLib Graph
    @param vals: array of RDF Resources (already in RDF format)
    @return: head of the List (an RDF Resource)
    """
    cells = [BNode() for _ in vals]
    cells.append(ns_rdf["nil"])
    # walk the values together with their own cell and the cell that follows
    for cell, value, following in zip(cells, vals, cells[1:]):
        graph.add((cell, ns_rdf["first"], value))
        graph.add((cell, ns_rdf["rest"], following))
    return cells[0]
96
97 #################################################################################
def get_Literal(Pnode):
    """
    Collect, depth first, the text content found below a DOM Node.

    Only text nodes and element nodes contribute; other node types are skipped.
    White space is kept exactly as found (an earlier variant collapsed and
    stripped white space, but the current agreement is to maintain it).

    @param Pnode: DOM Node
    @return: string
    """
    pieces = []
    for child in Pnode.childNodes:
        kind = child.nodeType
        if kind == child.TEXT_NODE:
            pieces.append(child.data)
        elif kind == child.ELEMENT_NODE:
            pieces.append(get_Literal(child))
    return "".join(pieces)
118
119 #################################################################################
120 def get_lang(node) :
121 # we may have lang and xml:lang
122 retval = None
123 if node.hasAttribute("lang") :
124 retval = node.getAttribute("lang")
125 if retval and node.hasAttribute("xml:lang") :
126 xmllang = node.getAttribute("xml:lang").lower()
127 if not( xmllang != None and xmllang == retval.lower() ) :
128 # This is an error, in which case retval must be invalidated...
129 retval = None
130 return retval
131
def get_lang_from_hierarchy(document, node):
    """
    Find the language that applies to a node by climbing up its ancestors.

    The node itself is checked first, then each parent in turn; when the climb
    reaches document (or runs out of parents), the language of document is
    returned, whatever it is.

    @param document: the node at which the upward search stops
    @param node: the DOM node whose language is sought
    @return: the language value found, or None
    """
    current = node
    while True:
        lang = get_lang(current)
        if lang is not None:
            return lang
        parent = current.parentNode
        if parent is None or parent == document:
            return get_lang(document)
        current = parent
142
143 #################################################################################
# XML Schema datatype URIs that a time related attribute value may be mapped to
datetime_type = "http://www.w3.org/2001/XMLSchema#dateTime"
time_type = "http://www.w3.org/2001/XMLSchema#time"
date_type = "http://www.w3.org/2001/XMLSchema#date"
date_gYear = "http://www.w3.org/2001/XMLSchema#gYear"
date_gYearMonth = "http://www.w3.org/2001/XMLSchema#gYearMonth"
date_gMonthDay = "http://www.w3.org/2001/XMLSchema#gMonthDay"
duration_type = "http://www.w3.org/2001/XMLSchema#duration"

# For each datatype, the strptime patterns that are accepted as its lexical form
_formats = {
    date_gMonthDay: ["%m-%d"],
    date_gYearMonth: ["%Y-%m"],
    date_gYear: ["%Y"],
    date_type: ["%Y-%m-%d", "%Y-%m-%dZ"],
    time_type: ["%H:%M",
                "%H:%M:%S",
                "%H:%M:%SZ",
                "%H:%M:%S.%f"],
    datetime_type: ["%Y-%m-%dT%H:%M",
                    "%Y-%m-%dT%H:%M:%S",
                    "%Y-%m-%dT%H:%M:%S.%f",
                    "%Y-%m-%dT%H:%MZ",
                    "%Y-%m-%dT%H:%M:%SZ",
                    "%Y-%m-%dT%H:%M:%S.%fZ"],
    duration_type: ["P%dD",
                    "P%YY%mM%dD",
                    "P%YY%mM",
                    "P%YY%dD",
                    "P%YY",
                    "P%mM",
                    "P%mM%dD",
                    ],
}

# strptime patterns accepted for the time portion (after the 'T') of a duration
_dur_times = ["%HH%MM%SS", "%HH", "%MM", "%SS", "%HH%MM", "%HH%SS", "%MM%SS"]


def _matches_any(string, formats):
    """Return True if string can be parsed by strptime with at least one of formats."""
    for fmt in formats:
        try:
            datetime.strptime(string, fmt)
            return True
        except ValueError:
            pass
    return False


def get_time_type(string):
    """
    Check whether the string abides to one of the accepted time related datatypes, and returns that one if yes.

    All checks are purely syntactic and rely on C{datetime.strptime}, i.e. the
    individual fields must be values strptime itself accepts.

    @param string: the attribute value to be checked
    @return : a datatype URI or None
    """
    # The straightforward cases: the value matches one of the patterns as is
    for key in _formats:
        if _matches_any(string, _formats[key]):
            return key

    # Now come the special cases:-(
    # Check first for the duration stuff, that is the nastiest. A duration may
    # have a leading '-'; startswith is used so that empty or one character
    # strings cannot lead to an IndexError.
    negative = string.startswith('-')
    unsigned = string[1:] if negative else string
    if unsigned.startswith('P'):
        # this is meant to be a duration type
        # first of all, get rid of the leading '-' and check again
        if negative and _matches_any(unsigned, _formats[duration_type]):
            return duration_type
        # Let us see if the value contains a separate time portion, and cut that one
        durs = unsigned.split('T')
        if len(durs) == 2:
            # yep, so both the date-like and the time-like halves must be fine
            if _matches_any(durs[0], _formats[duration_type]) and _matches_any(durs[1], _dur_times):
                return duration_type
        # something went wrong, or no more tricks: this is a plain type
        return None

    # If we got here, we should check the time zone.
    # There is a discrepancy between the python and the HTML5/XSD lexical string,
    # which means that the date and the time zone portions are handled separately.
    # The value must end with a sign followed by hh:mm.
    if len(string) > 6 and string[-6] in "+-":
        # The time-zone-less portion of the string
        plain = string[:-6]
        # The time-zone portion, without its sign
        tz = string[-5:]
        if _matches_any(tz, ["%H:%M"]) and _matches_any(plain, _formats[datetime_type]):
            return datetime_type
    return None
267
268
269 #########################################################################################################
270 # Handling URIs
class URIOpener:
    """A wrapper around the urllib method to open a resource. Beyond accessing the data itself, the class
    sets the content location.
    The class also adds an accept header to the outgoing request, namely
    text/html and application/xhtml+xml (unless set explicitly by the caller).

    @ivar data: the real data, ie, a file-like object
    @ivar headers: the return headers as sent back by the server
    @ivar location: the real location of the data (ie, after possible redirection and content negotiation)
    """
    CONTENT_LOCATION = 'Content-Location'

    def __init__(self, name, additional_headers=None):
        """
        @param name: URL to be opened
        @keyword additional_headers: additional HTTP request headers to be added to the call
            (a dictionary mapping header names to values; optional)
        @raise HTTPError: (the pyMicrodata one) if the server answers with an HTTP error
        @raise MicrodataError: for any other problem encountered while opening the resource
        """
        try:
            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
            req = Request(url=name.split('#')[0])

            for key, value in (additional_headers or {}).items():
                req.add_header(key, value)
            # The default accept header is used only if the caller did not provide one
            if not req.has_header('Accept'):
                req.add_header('Accept', 'text/html, application/xhtml+xml')

            self.data = urlopen(req)
            self.headers = self.data.info()

            if URIOpener.CONTENT_LOCATION in self.headers:
                # The content location may be relative to the final URL
                self.location = urljoin(self.data.geturl(), self.headers[URIOpener.CONTENT_LOCATION])
            else:
                self.location = name

        except urllib_HTTPError as e:
            # NOTE(review): this absolute import presumes a top level 'pyMicrodata'
            # package; confirm it resolves where this module is installed.
            from pyMicrodata import HTTPError
            # Use the standard long description of the status code; fall back on the
            # error's own text for codes that are not in the table
            fallback = (str(e), str(e))
            msg = BaseHTTPRequestHandler.responses.get(e.code, fallback)
            raise HTTPError('%s' % msg[1], e.code)
        except Exception as e:
            # NOTE(review): same remark on the import as above
            from pyMicrodata import MicrodataError
            raise MicrodataError('%s' % e)
310