Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/ntriples.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:d30785e31577 | 1:56ad4e20f292 |
---|---|
1 #!/usr/bin/env python | |
2 __doc__ = """ | |
3 N-Triples Parser | |
4 License: GPL 2, W3C, BSD, or MIT | |
5 Author: Sean B. Palmer, inamidst.com | |
6 """ | |
7 | |
8 import re | |
9 import codecs | |
10 | |
11 from rdflib.term import URIRef as URI | |
12 from rdflib.term import BNode as bNode | |
13 from rdflib.term import Literal | |
14 | |
15 from rdflib.py3compat import cast_bytes, decodeUnicodeEscape | |
16 | |
17 __all__ = ['unquote', 'uriquote', 'Sink', 'NTriplesParser'] | |
18 | |
# A URI reference in angle brackets. Group 1 is the text between the brackets:
# something containing a colon, with no whitespace, quotes or angle brackets.
uriref = r'<([^:]+:[^\s"<>]+)>'
# A double-quoted literal. Group 1 is the raw content with backslash escapes
# still in place.
literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
# Optional literal suffix: either "@lang" (language tag captured) or
# "^^<datatype>" (datatype URI captured through ``uriref``).
# The N-Triples 1.1 LANGTAG production allows upper-case letters in the
# primary subtag as well, so "@EN" must be accepted just like "@en".
litinfo = r'(?:@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)|\^\^' + uriref + r')?'

# One physical line (group 1) followed by a CRLF, CR or LF terminator.
r_line = re.compile(r'([^\r\n]*)(?:\r\n|\r|\n)')
r_wspace = re.compile(r'[ \t]*')
r_wspaces = re.compile(r'[ \t]+')
# The terminating "." of a statement, with an optional trailing comment.
r_tail = re.compile(r'[ \t]*\.[ \t]*(#.*)?')
r_uriref = re.compile(uriref)
# Blank node label after "_:". Group 1 is the label. The label must be
# non-empty, may contain "_", ":", "-" and ".", and may not end in "." so that
# the statement terminator is never swallowed.
r_nodeid = re.compile(r'_:([A-Za-z0-9_:]([-A-Za-z0-9_:\.]*[-A-Za-z0-9_:])?)')
# Groups: 1 = literal content, 2 = language tag, 3 = datatype URI.
r_literal = re.compile(literal + litinfo)

# Number passed to each ``read()`` call when refilling the line buffer.
bufsiz = 2048
# When true, unquote/uriquote use the strict hand-written escape handling.
validate = False
33 | |
34 | |
class Node(str):
    """Plain ``str`` subclass; adds no behaviour of its own."""
37 | |
38 | |
class ParseError(Exception):
    """Raised when input cannot be parsed as N-Triples."""
41 | |
42 | |
class Sink(object):
    """Default triple receiver: counts triples and prints each one."""

    def __init__(self):
        # Number of triples passed to ``triple`` so far.
        self.length = 0

    def triple(self, s, p, o):
        """Count one triple and write it to stdout as a 3-tuple."""
        self.length = self.length + 1
        statement = (s, p, o)
        print(statement)
50 | |
# Single-character escapes, keyed by the character following the backslash.
quot = {
    't': '\t',
    'n': '\n',
    'r': '\r',
    '"': '"',
    '\\': '\\',
}
# A run of printable ASCII that needs no unescaping ('"' and '\' excluded).
r_safe = re.compile(r'([\x20\x21\x23-\x5B\x5D-\x7E]+)')
# A backslash followed by one of the characters listed in ``quot``.
r_quot = re.compile(r'\\(t|n|r|"|\\)')
# \uXXXX (group 1) or \UXXXXXXXX (group 2), upper-case hex digits only.
r_uniquot = re.compile(r'\\u([0-9A-F]{4})|\\U([0-9A-F]{8})')
56 | |
57 | |
def unquote(s):
    """Unquote an N-Triples string.

    When the module-level ``validate`` flag is false, decoding is delegated:
    ``str`` input goes through ``decodeUnicodeEscape`` and anything else is
    decoded with the ``unicode-escape`` codec.

    When ``validate`` is true the string is scanned piece by piece, accepting
    only safe printable ASCII runs, the escapes listed in ``quot`` and
    ``\\uXXXX`` / ``\\UXXXXXXXX`` sequences.

    Raises:
        ParseError: in validating mode, on a code point above U+10FFFF, an
            unknown backslash escape, or a character outside the safe set.
    """
    if not validate:
        if isinstance(s, str):  # nquads
            return decodeUnicodeEscape(s)
        return s.decode('unicode-escape')

    pieces = []
    rest = s
    while rest:
        safe = r_safe.match(rest)
        if safe:
            pieces.append(safe.group(1))
            rest = rest[safe.end():]
            continue

        simple = r_quot.match(rest)
        if simple:
            pieces.append(quot[simple.group(1)])
            # A simple escape is always a backslash plus one character.
            rest = rest[2:]
            continue

        uni = r_uniquot.match(rest)
        if uni is None:
            if rest.startswith('\\'):
                raise ParseError("Illegal escape at: %s..." % rest[:10])
            raise ParseError("Illegal literal character: %r" % rest[0])

        short_hex, long_hex = uni.groups()
        codepoint = int(short_hex or long_hex, 16)
        if codepoint > 0x10FFFF:
            raise ParseError("Disallowed codepoint: %08X" % codepoint)
        pieces.append(chr(codepoint))
        rest = rest[uni.end():]
    return ''.join(pieces)
96 | |
# Any single character in the range U+0080..U+00FF.
r_hibyte = re.compile(r'([\x80-\xFF])')


def uriquote(uri):
    """Return *uri*, percent-escaping high characters when validating.

    With the module-level ``validate`` flag false the URI is returned
    unchanged. Otherwise every character matched by ``r_hibyte`` is replaced
    by ``%XX``, the two-digit upper-case hex of its code point.
    """
    if validate:
        def _escape(match):
            return '%%%02X' % ord(match.group(1))

        return r_hibyte.sub(_escape, uri)
    return uri
106 | |
107 | |
class NTriplesParser(object):
    """An N-Triples Parser.

    Usage::

          p = NTriplesParser(sink=MySink())
          sink = p.parse(f)  # file; use parsestring for a string

    Each parsed statement is handed to ``sink.triple(s, p, o)``. Blank node
    labels are re-mapped to freshly generated BNodes; the mapping is kept per
    parser instance so labels from unrelated parsers never collide.
    """

    # Fallback only, for subclasses that override __init__ without calling
    # it. Instances normally get their own mapping in __init__, because a
    # shared class-level dict would make every parser reuse the same BNode
    # for the same label, even across different documents.
    _bnode_ids = {}

    def __init__(self, sink=None):
        """Create a parser that reports triples to *sink* (default: Sink())."""
        self.sink = sink if sink is not None else Sink()
        # Label -> BNode mapping private to this parser.
        self._bnode_ids = {}

    def parse(self, f):
        """Parse f as an N-Triples file.

        *f* must be a byte stream with a ``read`` method; it is decoded as
        UTF-8. Returns the sink.

        Raises:
            ParseError: if *f* has no ``read`` attribute or a line is invalid.
        """
        if not hasattr(f, 'read'):
            raise ParseError("Item to parse must be a file-like object.")

        # since N-Triples 1.1 files can and should be utf-8 encoded
        f = codecs.getreader('utf-8')(f)

        self.file = f
        self.buffer = ''
        while True:
            self.line = self.readline()
            if self.line is None:
                break
            try:
                self.parseline()
            except ParseError as err:
                # self.line holds whatever was left unconsumed when parsing
                # failed; the original error is kept as the cause.
                raise ParseError("Invalid line: %r" % self.line) from err
        return self.sink

    def parsestring(self, s):
        """Parse s as an N-Triples string and return the sink.

        Raises:
            ParseError: if *s* is not a ``str`` or its content is invalid.
        """
        if not isinstance(s, str):
            raise ParseError("Item to parse must be a string instance.")
        from io import BytesIO

        return self.parse(BytesIO(cast_bytes(s)))

    def readline(self):
        """Read an N-Triples line from buffered input.

        Returns the line without its terminator, or None at end of input.
        """
        # N-Triples lines end in either CRLF, CR, or LF
        # Therefore, we can't just use f.readline()
        if not self.buffer:
            chunk = self.file.read(bufsiz)
            if not chunk:
                return None
            self.buffer = chunk

        while True:
            m = r_line.match(self.buffer)
            if m:  # the more likely prospect
                self.buffer = self.buffer[m.end():]
                return m.group(1)

            chunk = self.file.read(bufsiz)
            if not chunk:
                if self.buffer.isspace():
                    # Only whitespace is left: nothing more to parse.
                    return None
                # Last line does not need to be terminated with a newline
                chunk = "\n"
            self.buffer += chunk

    def parseline(self):
        """Parse ``self.line`` as one statement and pass it to the sink.

        Empty lines and comment lines are skipped.

        Raises:
            ParseError: if the line is not a well-formed statement.
        """
        self.eat(r_wspace)
        if (not self.line) or self.line.startswith('#'):
            return  # The line is empty or a comment

        subject = self.subject()
        self.eat(r_wspaces)

        predicate = self.predicate()
        self.eat(r_wspaces)

        obj = self.object()
        self.eat(r_tail)

        if self.line:
            raise ParseError("Trailing garbage")
        self.sink.triple(subject, predicate, obj)

    def peek(self, token):
        """Return True if the unparsed rest of the line starts with *token*."""
        return self.line.startswith(token)

    def eat(self, pattern):
        """Consume *pattern* from the start of the line and return the match.

        Raises:
            ParseError: if *pattern* does not match at the current position.
        """
        m = pattern.match(self.line)
        if not m:
            raise ParseError(
                "Failed to eat %s at %s" % (pattern.pattern, self.line))
        self.line = self.line[m.end():]
        return m

    def subject(self):
        """Parse a subject: a URI reference or a blank node."""
        subj = self.uriref() or self.nodeid()
        if not subj:
            raise ParseError("Subject must be uriref or nodeID")
        return subj

    def predicate(self):
        """Parse a predicate, which must be a URI reference."""
        pred = self.uriref()
        if not pred:
            raise ParseError("Predicate must be uriref")
        return pred

    def object(self):
        """Parse an object: a URI reference, blank node or literal."""
        objt = self.uriref() or self.nodeid() or self.literal()
        # Compare with ``is False``: an empty literal is falsy but valid.
        if objt is False:
            raise ParseError("Unrecognised object type")
        return objt

    def uriref(self):
        """Parse a ``<...>`` URI reference, or return False if none is next."""
        if self.peek('<'):
            uri = self.eat(r_uriref).group(1)
            uri = unquote(uri)
            uri = uriquote(uri)
            return URI(uri)
        return False

    def nodeid(self):
        """Parse a ``_:label`` blank node, or return False if none is next."""
        if self.peek('_'):
            # Fix for https://github.com/RDFLib/rdflib/issues/204
            bnode_id = self.eat(r_nodeid).group(1)
            new_id = self._bnode_ids.get(bnode_id)
            if new_id is not None:
                # Re-map to id specific to this doc
                return bNode(new_id)
            # Replace with freshly-generated document-specific BNode id
            bnode = bNode()
            # Store the mapping
            self._bnode_ids[bnode_id] = bnode
            return bnode
        return False

    def literal(self):
        """Parse a quoted literal, or return False if none is next.

        Raises:
            ParseError: if both a language tag and a datatype are present.
        """
        if self.peek('"'):
            lit, lang, dtype = self.eat(r_literal).groups()
            # Normalise missing or empty captures to None.
            lang = lang or None
            dtype = dtype or None
            if lang and dtype:
                raise ParseError("Can't have both a language and a datatype")
            lit = unquote(lit)
            return Literal(lit, lang, dtype)
        return False
273 | |
274 # # Obsolete, unused | |
275 # def parseURI(uri): | |
276 # import urllib | |
277 # parser = NTriplesParser() | |
278 # u = urllib.urlopen(uri) | |
279 # sink = parser.parse(u) | |
280 # u.close() | |
281 # # for triple in sink: | |
282 # # print triple | |
283 # print 'Length of input:', sink.length |