comparison planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/pyRdfa/utils.py @ 1:56ad4e20f292 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:32:28 -0400
parents
children
comparison
equal deleted inserted replaced
0:d30785e31577 1:56ad4e20f292
1 # -*- coding: utf-8 -*-
2 """
3 Various utilities for pyRdfa.
4
5 Most of the utilities are straightforward.
6
7 @organization: U{World Wide Web Consortium<http://www.w3.org>}
8 @author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
9 @license: This software is available for use under the
10 U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
11
12
13 """
14
15 """
16 $Id: utils.py,v 1.9 2012/11/16 17:51:53 ivan Exp $
17 $Date: 2012/11/16 17:51:53 $
18 """
19 import os, os.path, sys, imp, datetime
20
21 # Python 3 vs. 2 switch
22 if sys.version_info[0] >= 3 :
23 from urllib.request import Request, urlopen
24 from urllib.parse import urljoin, quote
25 from http.server import BaseHTTPRequestHandler
26 from urllib.error import HTTPError as urllib_HTTPError
27 else :
28 from urllib.request import Request, urlopen
29 from urllib.error import HTTPError as urllib_HTTPError
30 from urllib.parse import urljoin
31 from urllib.parse import quote
32 from http.server import BaseHTTPRequestHandler
33
34 from .extras.httpheader import content_type, parse_http_datetime
35
import rdflib

# Pick the RDF namespace object according to the installed rdflib generation.
# The major version is compared numerically: comparing the raw version strings
# would order "10.0.0" before "3.0.0" and select the legacy import by mistake.
try:
    _rdflib_major = int(rdflib.__version__.split('.')[0])
except ValueError:
    # Unparseable version string; assume a modern (>= 3) rdflib layout.
    _rdflib_major = 3

if _rdflib_major >= 3:
    from rdflib import RDF as ns_rdf
else:
    from rdflib.RDF import RDFNS as ns_rdf
41
42 from .host import HostLanguage, preferred_suffixes
43
44 #########################################################################################################
45 # Handling URIs
class URIOpener:
    """A wrapper around C{urlopen} to open a resource. Beyond accessing the data itself, the class
    sets a number of instance variables that might be relevant for processing.
    The class also adds an accept header to the outgoing request, namely
    text/html and application/xhtml+xml (unless set explicitly by the caller).

    If the content type is set by the server, the relevant HTTP response field is used. Otherwise,
    common suffixes are used (see L{host.preferred_suffixes}) to set the content type (this is really of importance
    for C{file:///} URI-s). If none of these works, the content type is empty.

    Interpretation of the content type for the return is done by Deron Meranda's U{httpheader module<http://deron.meranda.us/>}.

    @ivar data: the real data, ie, a file-like object
    @ivar headers: the return headers as sent back by the server
    @ivar content_type: the content type of the resource or the empty string, if the content type cannot be determined
    @ivar charset: the C{charset} parameter of the Content-Type header, or None if there is none
    @ivar location: the real location of the data (ie, after possible redirection and content negotiation)
    @ivar last_modified_date: sets the last modified date if set in the header, None otherwise
    @ivar expiration_date: sets the expiration date if set in the header, I{current UTC plus one day} otherwise (this is used for caching purposes, hence this artificial setting)
    """
    CONTENT_LOCATION = 'Content-Location'
    CONTENT_TYPE = 'Content-Type'
    LAST_MODIFIED = 'Last-Modified'
    EXPIRES = 'Expires'

    def __init__(self, name, additional_headers=None):
        """
        @param name: URL to be opened
        @keyword additional_headers: additional HTTP request headers to be added to the call
            (a dictionary; None means no extra headers)
        @raise HTTPError: (the package's own class) if the server answers with an HTTP error status
        @raise RDFaError: for any other problem while opening or inspecting the resource
        """
        # A fresh dictionary per call; avoids sharing one mutable default between calls
        if additional_headers is None:
            additional_headers = {}

        try:
            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
            req = Request(url=name.split('#')[0])

            for key in additional_headers:
                req.add_header(key, additional_headers[key])
            if 'Accept' not in additional_headers:
                req.add_header('Accept', 'text/html, application/xhtml+xml')

            self.data = urlopen(req)
            self.headers = self.data.info()

            if URIOpener.CONTENT_TYPE in self.headers:
                # The call below will remove the possible media type parameters, like charset settings
                ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
                self.content_type = ct.media_type
                if 'charset' in ct.parmdict:
                    self.charset = ct.parmdict['charset']
                else:
                    self.charset = None
            else:
                # check if the suffix can be used for the content type; this may be important
                # for file:// type URI or if the server is not properly set up to return the right
                # mime type
                self.charset = None
                self.content_type = ""
                for suffix in preferred_suffixes:
                    if name.endswith(suffix):
                        self.content_type = preferred_suffixes[suffix]
                        break

            if URIOpener.CONTENT_LOCATION in self.headers:
                self.location = urljoin(self.data.geturl(), self.headers[URIOpener.CONTENT_LOCATION])
            else:
                self.location = name

            # Artificial default (now + one day); replaced below if a usable Expires header exists
            self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
            if URIOpener.EXPIRES in self.headers:
                try:
                    # Thanks to Deron Meranda for the HTTP date conversion method...
                    self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
                except Exception:
                    # The Expires date format was wrong, sorry, forget it...
                    pass

            self.last_modified_date = None
            if URIOpener.LAST_MODIFIED in self.headers:
                try:
                    # Thanks to Deron Meranda for the HTTP date conversion method...
                    self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
                except Exception:
                    # The last modified date format was wrong, sorry, forget it...
                    pass

        except urllib_HTTPError as e:
            from . import HTTPError
            # The second item of a 'responses' entry is used as the message. Status codes
            # missing from that table must not surface as a KeyError, so fall back to the
            # textual form of the original error.
            known = BaseHTTPRequestHandler.responses.get(e.code)
            msg = known[1] if known else str(e)
            raise HTTPError('%s' % msg, e.code)
        except Exception as e:
            from . import RDFaError
            raise RDFaError('%s' % e)
138
139 #########################################################################################################
140
141 # 'safe' characters for the URI quoting, ie, characters that can safely stay as they are. Other
142 # special characters are converted to their %.. equivalents for namespace prefixes
143 _unquotedChars = ':/\?=#~'
144 _warnChars = [' ','\n','\r','\t']
145
def quote_URI(uri, options=None):
    """
    'quote' a URI, ie, exchange special characters for their '%..' equivalents. Some of the characters
    may stay as they are (listed in L{_unquotedChars}). If one of the characters listed in L{_warnChars}
    is also in the uri (after stripping leading and trailing whitespace), an extra warning is also
    generated, at most once per call.
    @param uri: URI
    @param options: if not None, its C{add_warning} method receives the warning
    @type options: L{Options<pyRdfa.Options>}
    @return: the stripped and quoted URI
    """
    # Imported here rather than at module level, as in the original code
    # (presumably to avoid a circular import with the package - TODO confirm)
    from . import err_unusual_char_in_URI
    suri = uri.strip()
    if options is not None and any(c in suri for c in _warnChars):
        options.add_warning(err_unusual_char_in_URI % suri)
    return quote(suri, _unquotedChars)
163
164 #########################################################################################################
165
def create_file_name(uri):
    """
    Build a file name out of an (absolute) URI; used, eg, to name the cache file of a vocabulary.

    The URI is stripped and quoted (keeping the characters of L{_unquotedChars}), then a fixed
    set of characters is replaced by underscores.
    @param uri: (absolute) URI
    @return: the derived file name
    """
    candidate = quote(uri.strip(), _unquotedChars)
    # Replace some potentially dangerous characters, one by one
    for unsafe in (' ', '%', '-', '+', '/', '?', ':', '=', '#'):
        candidate = candidate.replace(unsafe, '_')
    return candidate
174
175 #########################################################################################################
def has_one_of_attributes(node, *args):
    """
    Check whether one of the listed attributes is present on a (DOM) node.
    @param node: DOM element node
    @param args: possible attribute names; they can be passed either one by one or
        as a single tuple or list in the first positional argument
    @return: True if the node has at least one of the attributes, False otherwise
        (including the case when no attribute name is given)
    @rtype: Boolean
    """
    if not args:
        return False
    # A single tuple/list argument stands for the whole list of names
    if isinstance(args[0], (tuple, list)):
        names = args[0]
    else:
        names = args

    # Stops at the first attribute that is present
    return any(node.hasAttribute(attr) for attr in names)
192
193 #########################################################################################################
def traverse_tree(node, func):
    """Walk through the element tree rooted at C{node}, calling C{func} on each element, parents before children.
    @param node: DOM element node
    @param func: function to be called on the node. Input parameter is a DOM Element Node. If the function returns a boolean True, the children of that node are not visited.
    """
    if not func(node):
        for child in node.childNodes:
            # Only element children are visited (text, comments, etc, are skipped)
            if child.nodeType == node.ELEMENT_NODE:
                traverse_tree(child, func)
205
206 #########################################################################################################
def return_XML(state, inode, base=True, xmlns=True):
    """
    Get (recursively) the XML Literal content of a DOM Element Node. (Most of the processing is done
    via a C{node.toxml} call of the xml minidom implementation.)

    The work is done on a deep copy of the node; the node passed in is not modified.

    @param inode: DOM Node
    @param state: L{pyRdfa.state.ExecutionContext}
    @param base: whether the base element should be added to the output
    @type base: Boolean
    @param xmlns: whether the namespace declarations should be repeated in the generated node
    @type xmlns: Boolean
    @return: string
    """
    node = inode.cloneNode(True)
    # Decorate the element with namespaces.lang values and, optionally, base
    if base:
        node.setAttribute("xml:base", state.base)
    if xmlns:
        for prefix in state.term_or_curie.xmlns:
            # Declarations already present on the node take precedence
            if not node.hasAttribute("xmlns:%s" % prefix):
                node.setAttribute("xmlns:%s" % prefix, "%s" % state.term_or_curie.xmlns[prefix])
        # Set the default namespace, if not done (and is available)
        if not node.getAttribute("xmlns") and state.defaultNS is not None:
            node.setAttribute("xmlns", state.defaultNS)
    # Get the lang, if necessary
    if state.lang:
        if state.options.host_language in [HostLanguage.xhtml, HostLanguage.xhtml5, HostLanguage.html5]:
            if not node.getAttribute("lang"):
                node.setAttribute("lang", state.lang)
        else:
            if not node.getAttribute("xml:lang"):
                node.setAttribute("xml:lang", state.lang)

    # The former Python 2 branch has been dropped: this module imports urllib.request
    # unconditionally, so it can only be loaded on Python 3, where toxml() without an
    # encoding argument already returns a (unicode) string.
    return node.toxml()
245
246 #########################################################################################################
247
def dump(node):
    """
    Debugging aid: print the serialized content of the tree starting at node
    (no extra indentation or newlines are added between the nodes).

    @param node: DOM node
    """
    serialized = node.toprettyxml(indent="", newl="")
    print(serialized)
255
256
257