Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/rdflib/tools/csv2rdf.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:d30785e31577 | 1:56ad4e20f292 |
---|---|
1 """ | |
2 A commandline tool for semi-automatically converting CSV to RDF | |
3 | |
4 try: ``csv2rdf --help`` | |
5 | |
6 """ | |
7 | |
8 | |
9 import sys | |
10 import re | |
11 import csv | |
12 import getopt | |
13 import configparser | |
14 import fileinput | |
15 import codecs | |
16 import time | |
17 import datetime | |
18 import warnings | |
19 import urllib.request, urllib.error, urllib.parse | |
20 | |
21 import rdflib | |
22 | |
23 from rdflib import RDF, RDFS | |
24 from rdflib.namespace import split_uri | |
25 | |
26 __all__ = [ 'CSV2RDF' ] | |
27 | |
# Usage text printed by ``csv2rdf --help``.
# NOTE: this is a plain (non-raw) literal, so a backslash at the very end of a
# line is a line continuation and the following line is joined onto it.
HELP = """
csv2rdf.py \
    -b <instance-base> \
    -p <property-base> \
    [-c <classname>] \
    [-i <identity column(s)>] \
    [-l <label columns>] \
    [-s <N>] [-o <output>] \
    [-f configfile] \
    [--col<N> <colspec>] \
    [--prop<N> <property>] \
    [-d <delim>] \
    [-C] [files...]

Reads csv files from stdin or given files
if -d is given, use this delimiter
if -s is given, skips N lines at the start
Creates a URI from the columns given to -i, or automatically by numbering if
none is given
Outputs RDFS labels from the columns given to -l
if -c is given adds a type triple with the given classname
if -C is given, the class is defined as rdfs:Class
Outputs one RDF triple per column in each row.
Output is in n3 format.
Output is stdout, unless -o is specified

Long options also supported: \
    --base, \
    --propbase, \
    --ident, \
    --class, \
    --label, \
    --out, \
    --defineclass

Long options --col0, --col1, ...
can be used to specify conversion for columns.
Conversions can be:
    float(), int(), split(sep, [more]), uri(base, [class]), date(format)

Long options --prop0, --prop1, ...
can be used to use specific properties, rather than ones auto-generated
from the headers

-f says to read config from a .ini/config file - the file must contain one
section called csv2rdf, with keys like the long options, i.e.:

[csv2rdf]
out=output.n3
base=http://example.org/
col0=split(";")
col1=split(";", uri("http://example.org/things/",
                    "http://xmlns.com/foaf/0.1/Person"))
col2=float()
col3=int()
col4=date("%Y-%b-%d %H:%M:%S")

"""
86 | |
# Module-level registry (an admittedly ugly global): prefixuri() stores every
# value it converts here as ``value -> (URIRef, class)``, and
# CSV2RDF.convert() reads it back to emit labels/types for generated URIs.
uris = dict()
89 | |
90 | |
91 def toProperty(label): | |
92 """ | |
93 CamelCase + lowercase inital a string | |
94 | |
95 | |
96 FIRST_NM => firstNm | |
97 | |
98 firstNm => firstNm | |
99 | |
100 """ | |
101 label = re.sub("[^\w]", " ", label) | |
102 label = re.sub("([a-z])([A-Z])", "\\1 \\2", label) | |
103 label = label.split(" ") | |
104 return "".join([label[0].lower()] + [x.capitalize() for x in label[1:]]) | |
105 | |
106 | |
def toPropertyLabel(label):
    """
    Lowercase the first character of ``label`` unless its second character
    is uppercase (so e.g. an all-caps acronym is returned unchanged).
    """
    second_is_upper = label[1:2].isupper()
    if second_is_upper:
        return label
    head, tail = label[:1], label[1:]
    return head.lower() + tail
111 | |
112 | |
def index(l, i):
    """Pick the items of ``l`` at the positions listed in ``i``, as a tuple.

    >>> index([1,2,3],(0,2))
    (1, 3)
    """
    return tuple(l[position] for position in i)
119 | |
120 | |
def csv_reader(csv_data, dialect=csv.excel, **kwargs):
    """
    Iterate over the rows of ``csv_data`` as lists of ``str`` cells.

    :param csv_data: iterable of lines; ``bytes`` lines are decoded as UTF-8
        (undecodable bytes are replaced), ``str`` lines are used as-is.
    :param dialect: csv dialect handed to :func:`csv.reader`.
    :param kwargs: extra formatting parameters for :func:`csv.reader`
        (e.g. ``delimiter``).
    :return: generator yielding one list of cells per row.
    """
    # On Python 3 csv.reader only accepts text and already yields str cells,
    # so decoding has to happen on the input lines, not on the parsed cells
    # (calling str(cell, 'utf-8') on a str raises TypeError).
    text_lines = (
        line.decode("utf-8", errors="replace")
        if isinstance(line, bytes) else line
        for line in csv_data
    )
    for row in csv.reader(text_lines, dialect=dialect, **kwargs):
        yield list(row)
128 | |
129 | |
def prefixuri(x, prefix, class_=None):
    """
    Build a URIRef for the cell value ``x`` and remember it in ``uris``.

    :param x: the cell text.
    :param prefix: if truthy, the URI is ``prefix`` followed by ``x`` with
        spaces turned into underscores and everything else percent-quoted;
        otherwise ``x`` itself is used as the URI.
    :param class_: optional class recorded alongside the URI.
    :return: the ``rdflib.URIRef`` that was created.
    """
    if prefix:
        # Replace on the str, then quote: quote() UTF-8-encodes str input
        # itself, whereas bytes.replace(" ", "_") raises TypeError on
        # Python 3.
        r = rdflib.URIRef(
            prefix + urllib.parse.quote(x.replace(" ", "_"), safe=""))
    else:
        r = rdflib.URIRef(x)
    # Side effect: register the mapping in the module-level ``uris`` dict.
    uris[x] = (r, class_)
    return r
139 | |
140 # meta-language for config | |
141 | |
142 | |
class NodeMaker(object):
    """Base column converter: wraps a cell value in a plain literal."""

    def __call__(self, x):
        """Return ``x`` wrapped in an ``rdflib.Literal``."""
        literal = rdflib.Literal(x)
        return literal

    def range(self):
        """Return the term reported as the range of columns using this maker."""
        return rdflib.RDFS.Literal
149 | |
150 | |
class NodeUri(NodeMaker):
    """Column converter that turns cell values into URIs via ``prefixuri``."""

    def __init__(self, prefix, class_):
        """Store the URI prefix and, if given, the class as a URIRef."""
        self.prefix = prefix
        self.class_ = rdflib.URIRef(class_) if class_ else None

    def range(self):
        """Return the configured class, or ``rdf:Resource`` when there is none."""
        if self.class_:
            return self.class_
        return rdflib.RDF.Resource

    def __call__(self, x):
        """Convert the cell value ``x`` to a URI (and register it)."""
        return prefixuri(x, self.prefix, self.class_)
164 | |
165 | |
class NodeLiteral(NodeMaker):
    """Literal-producing converter holding one optional argument ``f``.

    Subclasses decide what ``f`` means (a pre-processing callable for the
    numeric converters, a format string for the date converter).
    """

    def __init__(self, f=None):
        """Remember ``f`` for use by subclasses."""
        self.f = f
169 | |
170 | |
class NodeFloat(NodeLiteral):
    """Converter producing float literals, optionally pre-processing with ``f``."""

    def __call__(self, x):
        """Return ``float(x)`` (or ``float(f(x))`` when ``f`` is set) as a Literal.

        Raises ``Exception`` when ``f`` is set but is not callable.
        """
        if self.f:
            if not callable(self.f):
                raise Exception("Function passed to float is not callable")
            x = self.f(x)
        return rdflib.Literal(float(x))

    def range(self):
        """Return ``xsd:double`` as the column range."""
        return rdflib.XSD.double
181 | |
182 | |
class NodeInt(NodeLiteral):
    """Converter producing integer literals, optionally pre-processing with ``f``."""

    def __call__(self, x):
        """Return ``int(x)`` (or ``int(f(x))`` when ``f`` is set) as a Literal.

        Raises ``Exception`` when ``f`` is set but is not callable.
        """
        if self.f:
            if not callable(self.f):
                raise Exception("Function passed to int is not callable")
            x = self.f(x)
        return rdflib.Literal(int(x))

    def range(self):
        """Return ``xsd:int`` as the column range."""
        return rdflib.XSD.int
193 | |
194 | |
class NodeReplace(NodeMaker):
    """Converter that substitutes one substring for another.

    Note that calling it returns a plain string, not an rdflib term.
    """

    def __init__(self, a, b):
        """Remember the text to search for (``a``) and its replacement (``b``)."""
        self.a, self.b = a, b

    def __call__(self, x):
        """Return ``x`` with every occurrence of ``a`` replaced by ``b``."""
        old, new = self.a, self.b
        return x.replace(old, new)
202 | |
203 | |
class NodeDate(NodeLiteral):
    """Converter parsing cell text into datetime literals.

    The inherited ``f`` attribute is used as the ``strptime`` format string.
    """

    def __call__(self, x):
        """Parse ``x`` with the stored format and wrap the result in a Literal."""
        parsed = datetime.datetime.strptime(x, self.f)
        return rdflib.Literal(parsed)

    def range(self):
        """Return ``xsd:dateTime`` as the column range."""
        return rdflib.XSD.dateTime
210 | |
211 | |
class NodeSplit(NodeMaker):
    """Converter that splits a cell on a separator and converts each piece."""

    def __init__(self, sep, f):
        """Remember the separator and the per-piece converter ``f``."""
        self.sep, self.f = sep, f

    def __call__(self, x):
        """Return a list with one converted node per non-empty piece of ``x``.

        When no converter was given, ``rdflib.Literal`` is installed on the
        instance on first use. Raises ``Exception`` if the converter is not
        callable.
        """
        if not self.f:
            self.f = rdflib.Literal
        if not callable(self.f):
            raise Exception("Function passed to split is not callable!")
        pieces = (piece.strip() for piece in x.split(self.sep))
        return [self.f(piece) for piece in pieces if piece != ""]

    def range(self):
        """Return the inner converter's range when it is a NodeMaker,
        otherwise the default literal range."""
        inner = self.f
        if inner and isinstance(inner, NodeMaker):
            return inner.range()
        return NodeMaker.range(self)
229 | |
# Fallback converter: CSV2RDF.convert() asks it for the range of columns that
# have no explicit conversion configured.
default_node_make = NodeMaker()
231 | |
232 | |
233 def _config_ignore(*args, **kwargs): | |
234 return "ignore" | |
235 | |
236 | |
def _config_uri(prefix=None, class_=None):
    """Config-language ``uri(base, [class])``: build a ``NodeUri`` converter."""
    return NodeUri(prefix=prefix, class_=class_)
239 | |
240 | |
def _config_literal():
    """Config-language ``literal()``: build a plain-literal converter.

    Returns an *instance* of ``NodeLiteral``. Returning the class itself
    (as before) meant that applying the column converter to a cell built a
    ``NodeLiteral`` object rather than an RDF literal, and that asking the
    converter for its ``range()`` failed for lack of an instance.
    """
    return NodeLiteral()
243 | |
244 | |
def _config_float(f=None):
    """Config-language ``float([f])``: build a ``NodeFloat`` converter."""
    return NodeFloat(f=f)
247 | |
248 | |
def _config_replace(a, b):
    """Config-language ``replace(a, b)``: build a ``NodeReplace`` converter."""
    return NodeReplace(a=a, b=b)
251 | |
252 | |
def _config_int(f=None):
    """Config-language ``int([f])``: build a ``NodeInt`` converter."""
    return NodeInt(f=f)
255 | |
256 | |
def _config_date(format_):
    """Config-language ``date(format)``: build a ``NodeDate`` converter whose
    stored argument is the given format string."""
    return NodeDate(f=format_)
259 | |
260 | |
def _config_split(sep=None, f=None):
    """Config-language ``split(sep, [more])``: build a ``NodeSplit`` converter."""
    return NodeSplit(sep=sep, f=f)
263 | |
# Names available to column specifications: column() evaluates a spec with
# this dict as its global namespace.
config_functions = dict(
    ignore=_config_ignore,
    uri=_config_uri,
    literal=_config_literal,
    float=_config_float,
    int=_config_int,
    date=_config_date,
    split=_config_split,
    replace=_config_replace,
)
273 | |
274 | |
def column(v):
    """Return a function for column mapping.

    ``v`` is a column specification such as ``split(";")``; it is evaluated
    as a Python expression with ``config_functions`` as its globals.
    """
    # NOTE(review): eval() executes arbitrary expressions; only use with
    # trusted command lines / config files.
    mapper = eval(v, config_functions)
    return mapper
279 | |
280 | |
class CSV2RDF(object):
    """
    Converts rows from a CSV reader into triples written to ``OUT``.

    Configure the instance through its upper-case attributes, then call
    :meth:`convert` with an iterator of rows (lists of strings).
    """

    def __init__(self):
        self.CLASS = None          # URIRef used for the per-row type triple
        self.BASE = None           # namespace for row (instance) URIs
        self.PROPBASE = None       # namespace for auto-generated properties
        self.IDENT = "auto"        # "auto" or column index/indexes for URIs
        self.LABEL = None          # column index/indexes used for rdfs:label
        self.DEFINECLASS = False   # also emit class/property definitions
        self.SKIP = 0              # number of leading rows to discard
        self.DELIM = ","           # delimiter (read by main(), not by convert)

        self.COLUMNS = {}          # column index -> converter
        self.PROPS = {}            # column index -> explicit property URIRef

        # The codecs writer produces UTF-8 *bytes*, so it has to wrap the
        # binary layer of stdout; writing bytes to the text-mode sys.stdout
        # raises TypeError on Python 3.
        binary_stdout = getattr(sys.stdout, "buffer", None)
        if binary_stdout is None:
            # stdout was replaced by a text-only stream: write str directly.
            self.OUT = sys.stdout
        else:
            self.OUT = codecs.getwriter("utf-8")(
                binary_stdout, errors="replace")

        self.triples = 0           # number of triples written so far

    def triple(self, s, p, o):
        """Write one ``s p o .`` line (n3 form of each term) and count it."""
        self.OUT.write("%s %s %s .\n" % (s.n3(), p.n3(), o.n3()))
        self.triples += 1

    def convert(self, csvreader):
        """
        Read all rows from ``csvreader`` and write the resulting triples.

        The first ``SKIP`` rows are discarded, the next row is the header,
        every following row becomes one subject. Progress and a summary go
        to stderr. ``OUT`` is closed when the conversion finishes.

        :param csvreader: iterator yielding rows as lists of strings.
        :raises Exception: any error while processing a row is re-raised
            after the row number has been reported; errors converting a
            single cell are only reported as warnings.
        """
        start = time.time()

        if self.OUT:
            sys.stderr.write(
                "Output to %s\n" % getattr(self.OUT, "name", "<unknown>"))

        if self.IDENT != "auto" and not isinstance(self.IDENT, tuple):
            self.IDENT = (self.IDENT,)

        # Same normalisation for a bare column number given as LABEL, which
        # otherwise cannot be iterated by index() (and 0 would test false).
        if isinstance(self.LABEL, int):
            self.LABEL = (self.LABEL,)

        if not self.BASE:
            warnings.warn("No base given, using http://example.org/instances/")
            self.BASE = rdflib.Namespace("http://example.org/instances/")

        if not self.PROPBASE:
            # The message names the namespace that is actually used below.
            warnings.warn(
                "No property base given, using http://example.org/props/")
            self.PROPBASE = rdflib.Namespace("http://example.org/props/")

        # skip lines at the start
        for _ in range(self.SKIP):
            next(csvreader)

        # read header line
        header_labels = list(next(csvreader))
        headers = dict(
            enumerate([self.PROPBASE[toProperty(x)] for x in header_labels]))
        # override header properties if some are given
        for k, v in self.PROPS.items():
            headers[k] = v
            header_labels[k] = split_uri(v)[1]

        if self.DEFINECLASS:
            # output class/property definitions
            self.triple(self.CLASS, RDF.type, RDFS.Class)
            for i in range(len(headers)):
                h, label = headers[i], header_labels[i]
                if h == "" or label == "":
                    continue
                if self.COLUMNS.get(i) == _config_ignore:
                    continue
                self.triple(h, RDF.type, RDF.Property)
                self.triple(
                    h, RDFS.label, rdflib.Literal(toPropertyLabel(label)))
                self.triple(h, RDFS.domain, self.CLASS)
                self.triple(h, RDFS.range,
                            self.COLUMNS.get(i, default_node_make).range())

        rows = 0
        for row in csvreader:
            try:
                if self.IDENT == "auto":
                    uri = self.BASE["%d" % rows]
                else:
                    # Replace spaces on the str before quoting; quote()
                    # does the UTF-8 encoding itself.
                    uri = self.BASE["_".join(
                        urllib.parse.quote(x.replace(" ", "_"), safe="")
                        for x in index(row, self.IDENT))]

                if self.LABEL:
                    self.triple(uri, RDFS.label, rdflib.Literal(
                        " ".join(index(row, self.LABEL))))

                if self.CLASS:
                    # type triple
                    self.triple(uri, RDF.type, self.CLASS)

                for i, x in enumerate(row):
                    x = x.strip()
                    if x == "":
                        continue
                    if self.COLUMNS.get(i) == _config_ignore:
                        continue
                    try:
                        o = self.COLUMNS.get(i, rdflib.Literal)(x)
                        if isinstance(o, list):
                            for _o in o:
                                self.triple(uri, headers[i], _o)
                        else:
                            self.triple(uri, headers[i], o)
                    except Exception as e:
                        # Exceptions have no ``.message`` on Python 3;
                        # format the exception itself. headers.get() keeps
                        # the handler from failing on an unknown column.
                        warnings.warn(
                            "Could not process value for column " +
                            "%d:%s in row %d, ignoring: %s " % (
                                i, headers.get(i), rows, e))

                rows += 1
                if rows % 100000 == 0:
                    sys.stderr.write(
                        "%d rows, %d triples, elapsed %.2fs.\n" % (
                            rows, self.triples, time.time() - start))
            except Exception:
                sys.stderr.write("Error processing line: %d\n" % rows)
                raise

        # output types/labels for generated URIs
        classes = set()
        for value, (u, c) in uris.items():
            self.triple(u, RDFS.label, rdflib.Literal(value))
            if c:
                c = rdflib.URIRef(c)
                classes.add(c)
                self.triple(u, RDF.type, c)

        for c in classes:
            self.triple(c, RDF.type, RDFS.Class)

        self.OUT.close()
        sys.stderr.write(
            "Converted %d rows into %d triples.\n" % (rows, self.triples))
        sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))
414 | |
415 | |
def main():
    """
    Command-line entry point: configure a ``CSV2RDF`` from an optional
    config file (``-f``) and from command-line options (which take
    precedence), then convert the given files (or stdin).

    NOTE(review): the ``ident``/``label`` values and the column specs are
    passed to ``eval``; only run this with trusted arguments and config.
    """
    csv2rdf = CSV2RDF()

    long_options = ["out=", "base=", "delim=", "propbase=", "class=",
                    "ident=", "label=", "skip=", "defineclass", "help"]
    # --col<N> / --prop<N> are open-ended, so getopt cannot know them up
    # front; register exactly the ones present on this command line.
    for arg in sys.argv[1:]:
        numbered = re.match(r"--((?:col|prop)\d+)(?:=|$)", arg)
        if numbered and numbered.group(1) + "=" not in long_options:
            long_options.append(numbered.group(1) + "=")

    opts, files = getopt.getopt(
        sys.argv[1:], "hc:b:p:i:o:Cf:l:s:d:", long_options)
    opts = dict(opts)

    if "-h" in opts or "--help" in opts:
        print(HELP)
        sys.exit(-1)

    if "-f" in opts:
        config = configparser.ConfigParser()
        # read_file() replaces readfp(), which newer Python versions no
        # longer provide; the with-block closes the file again.
        with open(opts["-f"]) as config_file:
            config.read_file(config_file)
        try:
            items = config.items("csv2rdf")
        except configparser.InterpolationError:
            # Values such as date("%Y-%m-%d") contain bare '%' signs that
            # the interpolating reader rejects; fall back to raw values.
            items = config.items("csv2rdf", raw=True)
        for k, v in items:
            if k == "out":
                csv2rdf.OUT = codecs.open(v, "w", "utf-8")
            elif k == "base":
                csv2rdf.BASE = rdflib.Namespace(v)
            elif k == "propbase":
                csv2rdf.PROPBASE = rdflib.Namespace(v)
            elif k == "class":
                csv2rdf.CLASS = rdflib.URIRef(v)
            elif k == "defineclass":
                # bool("False") is True, so recognise the usual "off"
                # spellings; any other non-empty value still enables it.
                csv2rdf.DEFINECLASS = v.strip().lower() not in (
                    "", "0", "false", "no", "off")
            elif k == "ident":
                csv2rdf.IDENT = eval(v)
            elif k == "label":
                csv2rdf.LABEL = eval(v)
            elif k == "delim":
                csv2rdf.DELIM = v
            elif k == "skip":
                csv2rdf.SKIP = int(v)
            elif k.startswith("col"):
                csv2rdf.COLUMNS[int(k[3:])] = column(v)
            elif k.startswith("prop"):
                csv2rdf.PROPS[int(k[4:])] = rdflib.URIRef(v)

    # Command-line options override the config file; for each setting the
    # long form wins when both forms are given.
    if "-o" in opts:
        csv2rdf.OUT = codecs.open(opts["-o"], "w", "utf-8")
    if "--out" in opts:
        csv2rdf.OUT = codecs.open(opts["--out"], "w", "utf-8")

    if "-b" in opts:
        csv2rdf.BASE = rdflib.Namespace(opts["-b"])
    if "--base" in opts:
        csv2rdf.BASE = rdflib.Namespace(opts["--base"])

    if "-d" in opts:
        csv2rdf.DELIM = opts["-d"]
    if "--delim" in opts:
        csv2rdf.DELIM = opts["--delim"]

    if "-p" in opts:
        csv2rdf.PROPBASE = rdflib.Namespace(opts["-p"])
    if "--propbase" in opts:
        csv2rdf.PROPBASE = rdflib.Namespace(opts["--propbase"])

    if "-l" in opts:
        csv2rdf.LABEL = eval(opts["-l"])
    if "--label" in opts:
        csv2rdf.LABEL = eval(opts["--label"])

    if "-i" in opts:
        csv2rdf.IDENT = eval(opts["-i"])
    if "--ident" in opts:
        csv2rdf.IDENT = eval(opts["--ident"])

    if "-s" in opts:
        csv2rdf.SKIP = int(opts["-s"])
    if "--skip" in opts:
        csv2rdf.SKIP = int(opts["--skip"])

    if "-c" in opts:
        csv2rdf.CLASS = rdflib.URIRef(opts["-c"])
    if "--class" in opts:
        csv2rdf.CLASS = rdflib.URIRef(opts["--class"])

    for k, v in opts.items():
        if k.startswith("--col"):
            csv2rdf.COLUMNS[int(k[5:])] = column(v)
        elif k.startswith("--prop"):
            csv2rdf.PROPS[int(k[6:])] = rdflib.URIRef(v)

    # Class definitions are only emitted when a class was actually given.
    if csv2rdf.CLASS and ("-C" in opts or "--defineclass" in opts):
        csv2rdf.DEFINECLASS = True

    csv2rdf.convert(
        csv_reader(fileinput.input(files), delimiter=csv2rdf.DELIM))
508 | |
509 | |
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()