comparison planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/ntriples.py @ 1:56ad4e20f292 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:32:28 -0400
parents
children
comparison
equal deleted inserted replaced
0:d30785e31577 1:56ad4e20f292
1 #!/usr/bin/env python
2 __doc__ = """
3 N-Triples Parser
4 License: GPL 2, W3C, BSD, or MIT
5 Author: Sean B. Palmer, inamidst.com
6 """
7
8 import re
9 import codecs
10
11 from rdflib.term import URIRef as URI
12 from rdflib.term import BNode as bNode
13 from rdflib.term import Literal
14
15 from rdflib.py3compat import cast_bytes, decodeUnicodeEscape
16
17 __all__ = ['unquote', 'uriquote', 'Sink', 'NTriplesParser']
18
# Source fragments for the N-Triples term patterns; kept as plain strings so
# they can be combined before compilation.
uriref = r'<([^:]+:[^\s"<>]+)>'
literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
# Optional suffix on a literal: either "@lang" or "^^<datatype-uri>".
litinfo = ''.join((r'(?:@([a-z]+(?:-[a-zA-Z0-9]+)*)|\^\^', uriref, r')?'))

# One input line followed by its terminator (CRLF, CR or LF).
r_line = re.compile(r'([^\r\n]*)(?:\r\n|\r|\n)')
# Optional / mandatory runs of blanks between terms.
r_wspace = re.compile(r'[ \t]*')
r_wspaces = re.compile(r'[ \t]+')
# Statement terminator: the final "." plus an optional trailing comment.
r_tail = re.compile(r'[ \t]*\.[ \t]*(#.*)?')
r_uriref = re.compile(uriref)
r_nodeid = re.compile(r'_:([A-Za-z0-9]*)')
r_literal = re.compile(''.join((literal, litinfo)))

# Size argument passed to each read() call on the input stream.
bufsiz = 2048
# When False, unquote()/uriquote() take their non-validating fast paths.
validate = False
33
34
class Node(str):
    """Plain ``str`` subclass; adds no behaviour of its own."""
37
38
class ParseError(Exception):
    """Raised when the input cannot be parsed as N-Triples."""
41
42
class Sink(object):
    """Default triple consumer: counts triples and echoes each one to stdout."""

    def __init__(self):
        # Number of triples received through triple() so far.
        self.length = 0

    def triple(self, s, p, o):
        """Count one (subject, predicate, object) statement and print it."""
        statement = (s, p, o)
        self.length = self.length + 1
        print(statement)
50
# Single-character escapes recognised in validating mode.
quot = {'t': '\t', 'n': '\n', 'r': '\r', '"': '"', '\\': '\\'}
# Run of characters that may appear unescaped (printable ASCII except
# the double quote and the backslash).
r_safe = re.compile(r'([\x20\x21\x23-\x5B\x5D-\x7E]+)')
r_quot = re.compile(r'\\(t|n|r|"|\\)')
# \uXXXX or \UXXXXXXXX with upper-case hex digits only.
r_uniquot = re.compile(r'\\u([0-9A-F]{4})|\\U([0-9A-F]{8})')


def unquote(s):
    """Unquote an N-Triples string.

    When the module-level ``validate`` flag is false the input is decoded in
    one step: ``str`` input goes through ``decodeUnicodeEscape`` and anything
    else through its own ``decode('unicode-escape')``.

    When ``validate`` is true the string is scanned token by token and only
    safe characters, the escapes listed in ``quot`` and ``\\u``/``\\U``
    escapes are accepted.

    :param s: the raw (still escaped) text of a URI or literal.
    :returns: the unescaped string.
    :raises ParseError: in validating mode, on an unknown escape, a
        disallowed character, or a code point above U+10FFFF.
    """
    if not validate:
        if isinstance(s, str):  # nquads
            return decodeUnicodeEscape(s)
        return s.decode('unicode-escape')

    result = []
    # Scan with an index instead of re-slicing ``s`` after every token, so a
    # long literal is processed in linear rather than quadratic time.
    pos = 0
    end = len(s)
    while pos < end:
        m = r_safe.match(s, pos)
        if m:
            result.append(m.group(1))
            pos = m.end()
            continue

        m = r_quot.match(s, pos)
        if m:
            result.append(quot[m.group(1)])
            pos = m.end()  # a simple escape is always two characters
            continue

        m = r_uniquot.match(s, pos)
        if m:
            u, U = m.groups()
            codepoint = int(u or U, 16)
            if codepoint > 0x10FFFF:
                raise ParseError("Disallowed codepoint: %08X" % codepoint)
            result.append(chr(codepoint))
            pos = m.end()
        elif s.startswith('\\', pos):
            raise ParseError("Illegal escape at: %s..." % s[pos:pos + 10])
        else:
            raise ParseError("Illegal literal character: %r" % s[pos])
    return ''.join(result)
96
# Characters in the range U+0080..U+00FF.
r_hibyte = re.compile(r'([\x80-\xFF])')


def uriquote(uri):
    """Return *uri*, percent-escaping ``r_hibyte`` matches when validating.

    With the module-level ``validate`` flag off the URI is returned
    untouched; otherwise every character matched by ``r_hibyte`` is replaced
    by ``%XX`` built from its ordinal.
    """
    if validate:
        def _escape(match):
            return '%%%02X' % ord(match.group(1))

        return r_hibyte.sub(_escape, uri)
    return uri
106
107
class NTriplesParser(object):
    """An N-Triples Parser.

    Usage::

        p = NTriplesParser(sink=MySink())
        sink = p.parse(f)  # file; use parsestring for a string

    Each parsed statement is handed to ``sink.triple(s, p, o)``.
    """

    # Class-level fallback so subclasses that override ``__init__`` without
    # calling this one still find the attribute.  ``__init__`` shadows it
    # with a per-instance mapping so labels are not shared between parsers.
    _bnode_ids = {}

    def __init__(self, sink=None):
        """Create a parser.

        :param sink: object with a ``triple(s, p, o)`` method; a default
            ``Sink`` is created when omitted.
        """
        self.sink = sink if sink is not None else Sink()
        # Maps blank-node labels seen in the input to the nodes generated
        # for them.  Kept per instance: a mutable class attribute would make
        # the same label in unrelated documents resolve to the same node.
        self._bnode_ids = {}

    def parse(self, f):
        """Parse f as an N-Triples file.

        :param f: object with a ``read`` method yielding bytes; it is
            wrapped in a UTF-8 stream reader.
        :returns: the sink that received the triples.
        :raises ParseError: if ``f`` has no ``read`` attribute or a line
            cannot be parsed.
        """
        if not hasattr(f, 'read'):
            raise ParseError("Item to parse must be a file-like object.")

        # since N-Triples 1.1 files can and should be utf-8 encoded
        f = codecs.getreader('utf-8')(f)

        self.file = f
        self.buffer = ''
        while True:
            self.line = self.readline()
            if self.line is None:
                break
            try:
                self.parseline()
            except ParseError:
                raise ParseError("Invalid line: %r" % self.line)
        return self.sink

    def parsestring(self, s):
        """Parse s as an N-Triples string.

        :param s: the document as a ``str``.
        :returns: the sink that received the triples, as ``parse`` does.
        :raises ParseError: if ``s`` is not a ``str`` or a line is invalid.
        """
        if not isinstance(s, str):
            raise ParseError("Item to parse must be a string instance.")
        from io import BytesIO

        # parse() expects a byte stream, so encode the text first.
        return self.parse(BytesIO(cast_bytes(s)))

    def readline(self):
        """Read an N-Triples line from buffered input.

        :returns: the next line without its terminator, or ``None`` once the
            input is exhausted.
        """
        # N-Triples lines end in either CRLF, CR, or LF
        # Therefore, we can't just use f.readline()
        if not self.buffer:
            buffer = self.file.read(bufsiz)
            if not buffer:
                return None
            self.buffer = buffer

        while True:
            m = r_line.match(self.buffer)
            if m:  # the more likely prospect
                self.buffer = self.buffer[m.end():]
                return m.group(1)

            # No complete line buffered yet: pull in more input.
            buffer = self.file.read(bufsiz)
            if not buffer and not self.buffer.isspace():
                # Last line does not need to be terminated with a newline
                buffer += "\n"
            elif not buffer:
                # Only whitespace is left at end of input.
                return None
            self.buffer += buffer

    def parseline(self):
        """Parse ``self.line`` and pass the resulting triple to the sink.

        Empty lines and comment lines are skipped.

        :raises ParseError: if the line is not a well-formed statement.
        """
        self.eat(r_wspace)
        if (not self.line) or self.line.startswith('#'):
            return  # The line is empty or a comment

        subject = self.subject()
        self.eat(r_wspaces)

        predicate = self.predicate()
        self.eat(r_wspaces)

        obj = self.object()
        self.eat(r_tail)

        if self.line:
            raise ParseError("Trailing garbage")
        self.sink.triple(subject, predicate, obj)

    def peek(self, token):
        """Return True if the unparsed rest of the line starts with token."""
        return self.line.startswith(token)

    def eat(self, pattern):
        """Consume ``pattern`` from the front of the line.

        :param pattern: compiled regular expression.
        :returns: the match object; the matched text is removed from
            ``self.line``.
        :raises ParseError: if the pattern does not match at the start.
        """
        m = pattern.match(self.line)
        if not m:
            raise ParseError("Failed to eat %s at %s" % (pattern.pattern, self.line))
        self.line = self.line[m.end():]
        return m

    def subject(self):
        """Parse a subject term: a URI reference or a blank node."""
        # @@ Consider using dictionary cases
        subj = self.uriref() or self.nodeid()
        if not subj:
            raise ParseError("Subject must be uriref or nodeID")
        return subj

    def predicate(self):
        """Parse a predicate term, which must be a URI reference."""
        pred = self.uriref()
        if not pred:
            raise ParseError("Predicate must be uriref")
        return pred

    def object(self):
        """Parse an object term: URI reference, blank node or literal."""
        objt = self.uriref() or self.nodeid() or self.literal()
        # Compare with ``is False`` rather than truthiness: the last
        # alternative may legitimately yield a falsy (empty) value.
        if objt is False:
            raise ParseError("Unrecognised object type")
        return objt

    def uriref(self):
        """Parse ``<...>`` at the start of the line, or return False."""
        if self.peek('<'):
            uri = self.eat(r_uriref).group(1)
            uri = unquote(uri)
            uri = uriquote(uri)
            return URI(uri)
        return False

    def nodeid(self):
        """Parse ``_:label`` at the start of the line, or return False.

        The first occurrence of a label gets a freshly generated node; later
        occurrences handled by this parser are mapped to that same node.
        """
        if self.peek('_'):
            # Fix for https://github.com/RDFLib/rdflib/issues/204
            bnode_id = self.eat(r_nodeid).group(1)
            new_id = self._bnode_ids.get(bnode_id, None)
            if new_id is not None:
                # Re-map to id specific to this doc
                return bNode(new_id)
            # Replace with freshly-generated document-specific BNode id
            bnode = bNode()
            # Store the mapping
            self._bnode_ids[bnode_id] = bnode
            return bnode
        return False

    def literal(self):
        """Parse a quoted literal at the start of the line, or return False.

        :raises ParseError: if both a language tag and a datatype are given.
        """
        if self.peek('"'):
            lit, lang, dtype = self.eat(r_literal).groups()
            # Normalise missing/empty language and datatype to None.
            lang = lang or None
            dtype = dtype or None
            if lang and dtype:
                raise ParseError("Can't have both a language and a datatype")
            lit = unquote(lit)
            return Literal(lit, lang, dtype)
        return False
273
274 # # Obsolete, unused
275 # def parseURI(uri):
276 # import urllib
277 # parser = NTriplesParser()
278 # u = urllib.urlopen(uri)
279 # sink = parser.parse(u)
280 # u.close()
281 # # for triple in sink:
282 # # print triple
283 # print 'Length of input:', sink.length