Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/pyMicrodata/utils.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:d30785e31577 | 1:56ad4e20f292 |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """ | |
3 Various utilities for pyMicrodata | |
4 | |
5 @organization: U{World Wide Web Consortium<http://www.w3.org>} | |
6 @author: U{Ivan Herman<http://www.w3.org/People/Ivan/>} | |
7 @license: This software is available for use under the | |
8 U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>} | |
9 """ | |
10 | |
11 """ | |
12 $Id: utils.py,v 1.7 2012/09/01 15:17:28 ivan Exp $ | |
13 $Date: 2012/09/01 15:17:28 $ | |
14 """ | |
15 import os, os.path, sys | |
16 (py_v_major, py_v_minor, py_v_micro, py_v_final, py_v_serial) = sys.version_info | |
17 | |
18 if py_v_major >= 3 : | |
19 from urllib.request import Request, urlopen | |
20 from urllib.parse import urljoin, quote, urlparse | |
21 from http.server import BaseHTTPRequestHandler | |
22 from urllib.error import HTTPError as urllib_HTTPError | |
23 else : | |
24 from urllib.request import Request, urlopen | |
25 from urllib.error import HTTPError as urllib_HTTPError | |
26 from urllib.parse import urljoin, urlparse | |
27 from urllib.parse import quote | |
28 from http.server import BaseHTTPRequestHandler | |
29 | |
30 import re | |
31 from datetime import datetime | |
32 | |
33 from rdflib import BNode | |
34 import rdflib | |
35 if rdflib.__version__ >= "3.0.0" : | |
36 from rdflib import RDF as ns_rdf | |
37 else : | |
38 from rdflib.RDF import RDFNS as ns_rdf | |
39 | |
40 ################################################################################# | |
def is_absolute_URI(uri):
    """
    Tell whether a URI reference carries a scheme component.

    @param uri: the URI reference to examine
    @return: True if parsing the reference yields a non-empty scheme
    """
    scheme = urlparse(uri).scheme
    return scheme != ""
43 | |
44 ################################################################################# | |
45 | |
def fragment_escape(name):
    """
    Percent-encode a string, leaving the characters '/', '~', ':', '-' and '.'
    (in addition to the ones quote() never encodes) untouched.

    @param name: the string to be escaped
    @return: the percent-encoded string
    """
    untouched = '/~:-.'
    return quote(name, safe=untouched)
48 | |
49 ################################################################################# | |
50 | |
def generate_URI(base, v):
    """
    Generate an (absolute) URI; if v is not absolute it is escaped and resolved
    against base, otherwise v is returned unchanged.

    @param base: Absolute URI for base
    @param v: relative or absolute URI
    @return: the resulting URI
    """
    if is_absolute_URI(v):
        return v

    # UGLY!!! There is a bug for a corner case in python version <= 2.5.X:
    # a reference that only consists of a query part is simply appended.
    # (This branch used to reference an undefined name, 'val'.)
    if len(v) > 0 and v[0] == '?' and (py_v_major < 3 and py_v_minor <= 5):
        return base + v

    # Trust the python library...
    # Well, not quite:-) urljoin may swallow the '#' or '?' character at the
    # end of the reference, which is a problem with Semantic Web URI-s, so a
    # lost trailing character is put back.
    # NOTE(review): fragment_escape() is called with a safe set that does not
    # include '#' or '?', so after escaping v presumably never ends with
    # either of them any more -- confirm whether the check below can trigger.
    v = fragment_escape(v.strip())
    joined = urljoin(base, v)

    # An empty v or an empty join result has no last character to compare;
    # in that case the join result is returned as is.
    if v and joined and v[-1] != joined[-1] and v[-1] in ("#", "?"):
        return joined + v[-1]
    return joined
79 | |
80 ################################################################################# | |
def generate_RDF_collection(graph, vals):
    """
    Build an RDF List out of vals inside graph and hand back its head.

    One fresh blank node is created per value; each of them gets an
    rdf:first triple pointing at the value and an rdf:rest triple pointing
    at the next cell, the last cell pointing at rdf:nil.

    @param graph: RDF graph
    @type graph: RDFLib Graph
    @param vals: array of RDF Resources (already in RDF format)
    @return: head of the List (an RDF Resource); rdf:nil if vals is empty
    """
    cells = [BNode() for _ in vals]
    cells.append(ns_rdf["nil"])
    # Pair every cell with its successor and with the value it carries
    for cell, successor, value in zip(cells, cells[1:], vals):
        graph.add((cell, ns_rdf["first"], value))
        graph.add((cell, ns_rdf["rest"], successor))
    return cells[0]
96 | |
97 ################################################################################# | |
def get_Literal(Pnode):
    """
    Collect (recursively) the complete text content below a DOM Node.

    Text children contribute their data, element children contribute their
    own collected text; every other kind of child is skipped. White space
    is kept exactly as found.

    @param Pnode: DOM Node
    @return: string
    """
    pieces = []
    for child in Pnode.childNodes:
        kind = child.nodeType
        if kind == child.TEXT_NODE:
            pieces.append(child.data)
        elif kind == child.ELEMENT_NODE:
            pieces.append(get_Literal(child))

    # at present, the agreement seems to say that white spaces are maintained,
    # so the collected text is returned without any normalization
    return "".join(pieces)
118 | |
119 ################################################################################# | |
def get_lang(node):
    """
    Return the language set on a node through its lang attribute.

    If the node also has an xml:lang attribute, the two values must agree
    (compared case-insensitively); a mismatch is an error and invalidates
    the language.

    @param node: DOM element node
    @return: the value of the lang attribute, or None if there is no lang
    attribute or if it conflicts with xml:lang
    """
    if not node.hasAttribute("lang"):
        return None

    lang = node.getAttribute("lang")
    # xml:lang only matters when there is a non-empty lang to compare with
    if not lang or not node.hasAttribute("xml:lang"):
        return lang

    xml_lang = node.getAttribute("xml:lang").lower()
    if xml_lang is None or xml_lang != lang.lower():
        # This is an error, in which case the value must be invalidated...
        return None
    return lang
131 | |
def get_lang_from_hierarchy(document, node):
    """
    Find the language that applies to a node by walking up its ancestors.

    The first node on the way up for which get_lang() yields a value wins.
    When the walk runs out of parents, or reaches document, the result of
    get_lang(document) is returned instead.

    @param document: the node at which the upward walk stops
    @param node: the DOM node the search starts from
    @return: the language value, or whatever get_lang(document) returns
    """
    current = node
    while True:
        lang = get_lang(current)
        if lang is not None:
            return lang
        parent = current.parentNode
        if parent is None or parent == document:
            return get_lang(document)
        current = parent
142 | |
143 ################################################################################# | |
# Datatype URIs that get_time_type() may return
datetime_type = "http://www.w3.org/2001/XMLSchema#dateTime"
time_type = "http://www.w3.org/2001/XMLSchema#time"
date_type = "http://www.w3.org/2001/XMLSchema#date"
date_gYear = "http://www.w3.org/2001/XMLSchema#gYear"
date_gYearMonth = "http://www.w3.org/2001/XMLSchema#gYearMonth"
date_gMonthDay = "http://www.w3.org/2001/XMLSchema#gMonthDay"
duration_type = "http://www.w3.org/2001/XMLSchema#duration"

# For each datatype, the strptime patterns whose syntax is accepted for it.
# The datatypes are tried in the order in which they are listed here.
_formats = {
    date_gMonthDay: ["%m-%d"],
    date_gYearMonth: ["%Y-%m"],
    date_gYear: ["%Y"],
    date_type: ["%Y-%m-%d", "%Y-%m-%dZ"],
    time_type: [
        "%H:%M",
        "%H:%M:%S",
        "%H:%M:%SZ",
        "%H:%M:%S.%f",
    ],
    datetime_type: [
        "%Y-%m-%dT%H:%M",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M:%S.%f",
        "%Y-%m-%dT%H:%MZ",
        "%Y-%m-%dT%H:%M:%SZ",
        "%Y-%m-%dT%H:%M:%S.%fZ",
    ],
    duration_type: [
        "P%dD",
        "P%YY%mM%dD",
        "P%YY%mM",
        "P%YY%dD",
        "P%YY",
        "P%mM",
        "P%mM%dD",
    ],
}

# Patterns accepted for the time portion of a duration (what follows the 'T')
_dur_times = ["%HH%MM%SS", "%HH", "%MM", "%SS", "%HH%MM", "%HH%SS", "%MM%SS"]


def _matches_any_format(value, formats):
    """Return True if value can be parsed with at least one of the strptime formats."""
    for fmt in formats:
        try:
            datetime.strptime(value, fmt)
        except ValueError:
            continue
        return True
    return False


def get_time_type(string):
    """
    Check whether the string abides to one of the accepted time related datatypes, and returns that one if yes

    The check is purely syntactic and relies on datetime.strptime, using the
    patterns of _formats and _dur_times.

    @param string: the attribute value to be checked
    @return : a datatype URI or None
    """
    # The simple cases: the whole string matches one of the patterns
    for key in _formats:
        if _matches_any_format(string, _formats[key]):
            return key

    # Now come the special cases:-(
    # Check first for the duration stuff, that is the nastiest.
    # The length test guards both index accesses, so that empty or one
    # character strings do not raise an IndexError.
    if len(string) > 2 and (string[0] == 'P' or string[0:2] == '-P'):
        # this is meant to be a duration type
        # first of all, get rid of the leading '-' and check again
        if string[0] == '-':
            body = string[1:]
            if _matches_any_format(body, _formats[duration_type]):
                return duration_type
        else:
            body = string

        # Let us see if the value contains a separate time portion, and cut that one
        durs = body.split('T')
        if len(durs) != 2:
            # Well, no more tricks, this is a plain type
            return None

        dur, tm = durs
        # A bare 'P' means that the duration has a time portion only (e.g., 'PT2H')
        date_ok = dur == 'P' or _matches_any_format(dur, _formats[duration_type])
        if date_ok and _matches_any_format(tm, _dur_times):
            return duration_type
        # something went wrong...
        return None

    # If we got here, we should check the time zone
    # there is a discrepancy between the python and the HTML5/XSD lexical string,
    # which means that this has to be handled separately for the date and the timezone portion

    # The time zone offset must be introduced by a sign; the slice is simply
    # empty (and hence rejected) when the string is too short to have one
    if string[-6:-5] not in ('+', '-'):
        return None
    # The time-zone-less portion of the string
    base = string[0:-6]
    # The time-zone portion
    tz = string[-5:]
    if not _matches_any_format(tz, ["%H:%M"]):
        # Bummer, this is not a correct time
        return None
    # The time-zone is fine, the datetime portion has to be checked
    if _matches_any_format(base, _formats[datetime_type]):
        return datetime_type
    return None
267 | |
268 | |
269 ######################################################################################################### | |
270 # Handling URIs | |
class URIOpener:
    """A wrapper around the urllib method to open a resource. Beyond accessing the data itself, the class
    sets the content location.
    The class also adds an accept header to the outgoing request, namely
    text/html and application/xhtml+xml.

    @ivar data: the real data, ie, a file-like object
    @ivar headers: the return headers as sent back by the server
    @ivar location: the location of the data: the Content-Location header resolved against the final URL
    if the server sent that header, otherwise the name used to create the instance
    """
    CONTENT_LOCATION = 'Content-Location'

    def __init__(self, name):
        """
        Open the resource; self.data, self.headers and self.location are set on success.

        @param name: URL to be opened
        @raise HTTPError: if the server answered with an HTTP error status
        @raise MicrodataError: if anything else went wrong while opening the resource
        """
        try:
            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
            req = Request(url=name.split('#')[0])
            req.add_header('Accept', 'text/html, application/xhtml+xml')

            self.data = urlopen(req)
            self.headers = self.data.info()

            if URIOpener.CONTENT_LOCATION in self.headers:
                self.location = urljoin(self.data.geturl(), self.headers[URIOpener.CONTENT_LOCATION])
            else:
                self.location = name

        except urllib_HTTPError as e:
            # NOTE(review): this absolute import presumes that a top level
            # 'pyMicrodata' package is importable -- confirm for this copy.
            from pyMicrodata import HTTPError
            # Status codes that are missing from the standard table must not
            # hide the HTTP error behind a KeyError; use the text of the
            # original exception for those.
            entry = BaseHTTPRequestHandler.responses.get(e.code)
            msg = entry[1] if entry else str(e)
            raise HTTPError('%s' % msg, e.code)
        except Exception as e:
            from pyMicrodata import MicrodataError
            raise MicrodataError('%s' % e)
310 |