comparison env/lib/python3.7/site-packages/rdflib/tools/csv2rdf.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 """
2 A commandline tool for semi-automatically converting CSV to RDF
3
4 try: ``csv2rdf --help``
5
6 """
7
8
9 import sys
10 import re
11 import csv
12 import getopt
13 import configparser
14 import fileinput
15 import codecs
16 import time
17 import datetime
18 import warnings
19 import urllib.request, urllib.error, urllib.parse
20
21 import rdflib
22
23 from rdflib import RDF, RDFS
24 from rdflib.namespace import split_uri
25
26 __all__ = [ 'CSV2RDF' ]
27
# Usage text printed by ``main`` for -h / --help.  The trailing backslashes
# are line continuations inside the (non-raw) string, so the synopsis and the
# list of long options are each emitted as a single line.
HELP = """
csv2rdf.py \
-b <instance-base> \
-p <property-base> \
[-c <classname>] \
[-i <identity column(s)>] \
[-l <label columns>] \
[-s <N>] [-o <output>] \
[-f configfile] \
[--col<N> <colspec>] \
[--prop<N> <property>] \
[-d <delim>] \
[-C] [files...]

Reads csv files from stdin or given files
if -d is given, use this delimiter
if -s is given, skips N lines at the start
Creates a URI from the columns given to -i, or automatically by numbering if
none is given
Outputs RDFS labels from the columns given to -l
if -c is given adds a type triple with the given classname
if -C is given, the class is defined as rdfs:Class
Outputs one RDF triple per column in each row.
Output is in n3 format.
Output is stdout, unless -o is specified

Long options also supported: \
--base, \
--propbase, \
--ident, \
--class, \
--label, \
--out, \
--defineclass

Long options --col0, --col1, ...
can be used to specify conversion for columns.
Conversions can be:
float(), int(), split(sep, [more]), uri(base, [class]), date(format)

Long options --prop0, --prop1, ...
can be used to use specific properties, rather than ones auto-generated
from the headers

-f says to read config from a .ini/config file - the file must contain one
section called csv2rdf, with keys like the long options, i.e.:

[csv2rdf]
out=output.n3
base=http://example.org/
col0=split(";")
col1=split(";", uri("http://example.org/things/",
"http://xmlns.com/foaf/0.1/Person"))
col2=float()
col3=int()
col4=date("%Y-%b-%d %H:%M:%S")

"""
86
# Module-wide registry (an admittedly ugly global): maps each raw cell value
# that was turned into a URI to a ``(uri, class)`` pair.
uris = dict()
89
90
def toProperty(label):
    """
    CamelCase + lowercase initial a string

    Every character that is not a letter, digit or underscore is replaced
    by a space, a space is inserted at each lower-case/upper-case boundary,
    and the resulting words are joined with the first one lower-cased and
    the others capitalized.

    FIRST NM => firstNm

    firstNm => firstNm

    Underscores count as word characters, so they are kept:

    FIRST_NM => first_nm

    """
    # Raw strings: "\w" in a plain literal is an invalid escape sequence.
    label = re.sub(r"[^\w]", " ", label)
    label = re.sub(r"([a-z])([A-Z])", r"\1 \2", label)
    words = label.split(" ")
    return "".join([words[0].lower()] + [w.capitalize() for w in words[1:]])
105
106
def toPropertyLabel(label):
    """
    Lower-case the first character of ``label`` unless its second
    character is upper case, in which case the label is returned unchanged.
    """
    if label[1:2].isupper():
        return label
    return label[:1].lower() + label[1:]
111
112
def index(l, i):
    """Pick the items of ``l`` at the positions listed in ``i``, as a tuple

    >>> index([1,2,3],(0,2))
    (1, 3)
    """
    return tuple(l[pos] for pos in i)
119
120
def csv_reader(csv_data, dialect=csv.excel, **kwargs):
    """
    Yield each row of ``csv_data`` as a list of strings.

    ``csv_data`` is any iterable of text lines accepted by ``csv.reader``;
    ``dialect`` and the remaining keyword arguments (e.g. ``delimiter``) are
    passed through to it.
    """
    reader = csv.reader(csv_data, dialect=dialect, **kwargs)
    for row in reader:
        # On Python 3 the csv module already yields str cells; the former
        # str(cell, 'utf-8') decode step raised TypeError on every row.
        yield list(row)
128
129
def prefixuri(x, prefix, class_=None):
    """
    Build a URIRef for the cell value ``x`` and remember it in ``uris``.

    With a ``prefix`` the value has its spaces replaced by underscores, is
    percent-quoted (no safe characters) and appended to the prefix; without
    one, ``x`` itself is used as the URI.  The result is recorded in the
    module-level ``uris`` dict together with ``class_``.
    """
    if prefix:
        # Replace on the str before quoting: the old encode()-then-replace
        # called bytes.replace with str arguments, a TypeError on Python 3.
        # urllib.parse.quote encodes str input as UTF-8 by default.
        r = rdflib.URIRef(
            prefix + urllib.parse.quote(x.replace(" ", "_"), safe=""))
    else:
        r = rdflib.URIRef(x)
    uris[x] = (r, class_)
    return r
139
140 # meta-language for config
141
142
class NodeMaker(object):
    """Base column converter: wraps a cell value in a plain rdflib Literal."""

    def __call__(self, x):
        """Return ``x`` as an ``rdflib.Literal``."""
        return rdflib.Literal(x)

    def range(self):
        """Return the term used as rdfs:range for columns made by this maker."""
        return rdflib.RDFS.Literal
149
150
class NodeUri(NodeMaker):
    """Column converter that turns cell values into URIs via ``prefixuri``."""

    def __init__(self, prefix, class_):
        """Store the URI prefix and, when given, the class as a URIRef."""
        self.prefix = prefix
        self.class_ = rdflib.URIRef(class_) if class_ else None

    def __call__(self, x):
        """Return the URI for ``x``, built with this maker's prefix/class."""
        return prefixuri(x, self.prefix, self.class_)

    def range(self):
        """Return the configured class, or rdf:Resource when none was set."""
        if self.class_:
            return self.class_
        return rdflib.RDF.Resource
164
165
class NodeLiteral(NodeMaker):
    """Literal-producing converter holding an optional argument ``f``.

    Subclasses decide what ``f`` means (a pre-processing callable for the
    numeric converters, a format string for dates).
    """

    def __init__(self, f=None):
        self.f = f
169
170
class NodeFloat(NodeLiteral):
    """Converter producing float literals, optionally pre-processed by ``f``."""

    def __call__(self, x):
        """Return ``float(x)`` (after applying ``f`` if set) as a Literal.

        Raises ``Exception`` when ``f`` is set but not callable.
        """
        if self.f:
            if not callable(self.f):
                raise Exception("Function passed to float is not callable")
            x = self.f(x)
        return rdflib.Literal(float(x))

    def range(self):
        """Return xsd:double as the rdfs:range for this column."""
        return rdflib.XSD.double
181
182
class NodeInt(NodeLiteral):
    """Converter producing integer literals, optionally pre-processed by ``f``."""

    def __call__(self, x):
        """Return ``int(x)`` (after applying ``f`` if set) as a Literal.

        Raises ``Exception`` when ``f`` is set but not callable.
        """
        if self.f:
            if not callable(self.f):
                raise Exception("Function passed to int is not callable")
            x = self.f(x)
        return rdflib.Literal(int(x))

    def range(self):
        """Return xsd:int as the rdfs:range for this column."""
        return rdflib.XSD.int
193
194
class NodeReplace(NodeMaker):
    """String pre-processor replacing every ``a`` with ``b``.

    Unlike the other makers this returns a plain string, not an RDF term.
    NOTE(review): presumably meant to be nested, e.g. inside float()/int();
    confirm before using it directly as a column spec.
    """

    def __init__(self, a, b):
        self.a, self.b = a, b

    def __call__(self, x):
        """Return ``x`` with occurrences of ``a`` replaced by ``b``."""
        old, new = self.a, self.b
        return x.replace(old, new)
202
203
class NodeDate(NodeLiteral):
    """Converter parsing cell values as datetimes; ``f`` is the strptime format."""

    def __call__(self, x):
        """Parse ``x`` with the stored format and return it as a Literal."""
        parsed = datetime.datetime.strptime(x, self.f)
        return rdflib.Literal(parsed)

    def range(self):
        """Return xsd:dateTime as the rdfs:range for this column."""
        return rdflib.XSD.dateTime
210
211
class NodeSplit(NodeMaker):
    """Converter splitting a cell on ``sep`` and converting each piece with ``f``."""

    def __init__(self, sep, f):
        self.sep, self.f = sep, f

    def __call__(self, x):
        """Return a list with one converted node per non-empty piece of ``x``.

        Pieces are stripped of surrounding whitespace first.  When no ``f``
        was configured, ``rdflib.Literal`` is installed as the converter.
        Raises ``Exception`` when ``f`` is set but not callable.
        """
        if not self.f:
            self.f = rdflib.Literal
        if not callable(self.f):
            raise Exception("Function passed to split is not callable!")
        pieces = (raw.strip() for raw in x.split(self.sep))
        return [self.f(piece) for piece in pieces if piece != ""]

    def range(self):
        """Return the inner maker's range, or the default literal range."""
        inner = self.f
        if inner and isinstance(inner, NodeMaker):
            return inner.range()
        return NodeMaker.range(self)
229
# Fallback maker; CSV2RDF.convert asks it for the rdfs:range of columns that
# have no explicit conversion configured.
default_node_make = NodeMaker()
231
232
233 def _config_ignore(*args, **kwargs):
234 return "ignore"
235
236
def _config_uri(prefix=None, class_=None):
    """Config function ``uri``: build a NodeUri for the prefix and class."""
    maker = NodeUri(prefix, class_)
    return maker
239
240
def _config_literal():
    """Config function ``literal``: build a plain literal maker.

    Returns an instance, like the other config functions.  Returning the
    ``NodeLiteral`` class itself meant that calling the column maker on a
    cell constructed a ``NodeLiteral`` object instead of an RDF literal.
    """
    return NodeLiteral()
243
244
def _config_float(f=None):
    """Config function ``float``: build a NodeFloat, optionally wrapping ``f``."""
    maker = NodeFloat(f)
    return maker
247
248
def _config_replace(a, b):
    """Config function ``replace``: build a NodeReplace swapping ``a`` for ``b``."""
    maker = NodeReplace(a, b)
    return maker
251
252
def _config_int(f=None):
    """Config function ``int``: build a NodeInt, optionally wrapping ``f``."""
    maker = NodeInt(f)
    return maker
255
256
def _config_date(format_):
    """Config function ``date``: build a NodeDate using ``format_``."""
    maker = NodeDate(format_)
    return maker
259
260
def _config_split(sep=None, f=None):
    """Config function ``split``: build a NodeSplit on ``sep`` converting with ``f``."""
    maker = NodeSplit(sep, f)
    return maker
263
# Names available to column specs; ``column`` evaluates a spec with this
# dict as its global namespace.
config_functions = dict(
    ignore=_config_ignore,
    uri=_config_uri,
    literal=_config_literal,
    float=_config_float,
    int=_config_int,
    date=_config_date,
    split=_config_split,
    replace=_config_replace,
)
273
274
def column(v):
    """Return a function for column mapping

    ``v`` is a column spec such as ``split(";")``; it is evaluated as a
    Python expression with ``config_functions`` as its globals.

    NOTE: this uses ``eval`` - only pass specs from trusted command lines
    or config files.
    """
    mapping = eval(v, config_functions)
    return mapping
279
280
class CSV2RDF(object):
    """
    Converts the rows of a CSV reader into triples, written one per line.

    Configuration is held in upper-case attributes that callers set after
    construction (``main`` fills them from options / a config file):

    CLASS        class URI added as rdf:type of every row, or None
    BASE         namespace for row URIs (a default is used when unset)
    PROPBASE     namespace for column properties (default when unset)
    IDENT        'auto' to number rows, else column index(es) forming the URI
    LABEL        column indexes joined into an rdfs:label, or None
    DEFINECLASS  also emit class/property definitions
    SKIP         number of leading rows to discard before the header
    DELIM        delimiter (stored here; applied by whoever builds the reader)
    COLUMNS      column index -> conversion callable
    PROPS        column index -> property URI overriding the header
    OUT          text stream the triples are written to
    """

    def __init__(self):

        self.CLASS = None
        self.BASE = None
        self.PROPBASE = None
        self.IDENT = 'auto'
        self.LABEL = None
        self.DEFINECLASS = False
        self.SKIP = 0
        self.DELIM = ","

        self.COLUMNS = {}
        self.PROPS = {}

        # sys.stdout is already a text stream on Python 3.  Wrapping it in a
        # codecs UTF-8 writer hands it bytes, which fails on the first write.
        self.OUT = sys.stdout

        # running count of triples written
        self.triples = 0

    def triple(self, s, p, o):
        """Write one ``s p o .`` line (terms serialized via ``n3()``) to OUT."""
        self.OUT.write("%s %s %s .\n" % (s.n3(), p.n3(), o.n3()))
        self.triples += 1

    def convert(self, csvreader):
        """
        Consume ``csvreader`` (an iterator of rows, each a list of strings)
        and write the resulting triples to ``self.OUT``.

        After ``SKIP`` rows the next row is taken as the header.  Progress
        and a final summary go to stderr.  A cell that cannot be converted
        is reported with a warning and skipped; any other error while
        handling a row is reported on stderr and re-raised.
        """

        start = time.time()

        if self.OUT:
            # not every stream has a name (e.g. io.StringIO)
            sys.stderr.write(
                "Output to %s\n" % getattr(self.OUT, "name", "<stream>"))

        if self.IDENT != "auto" and not isinstance(self.IDENT, tuple):
            self.IDENT = (self.IDENT,)

        if not self.BASE:
            warnings.warn("No base given, using http://example.org/instances/")
            self.BASE = rdflib.Namespace("http://example.org/instances/")

        if not self.PROPBASE:
            # message now names the namespace that is actually used
            warnings.warn(
                "No property base given, using http://example.org/props/")
            self.PROPBASE = rdflib.Namespace("http://example.org/props/")

        # skip lines at the start
        for _ in range(self.SKIP):
            next(csvreader)

        # read header line
        header_labels = list(next(csvreader))
        headers = dict(
            enumerate([self.PROPBASE[toProperty(x)] for x in header_labels]))
        # override header properties if some are given
        for k, v in self.PROPS.items():
            headers[k] = v
            header_labels[k] = split_uri(v)[1]

        if self.DEFINECLASS:
            # output class/property definitions
            self.triple(self.CLASS, RDF.type, RDFS.Class)
            for i in range(len(headers)):
                h, l = headers[i], header_labels[i]
                if h == "" or l == "":
                    continue
                if self.COLUMNS.get(i) == _config_ignore:
                    continue
                self.triple(h, RDF.type, RDF.Property)
                self.triple(h, RDFS.label, rdflib.Literal(toPropertyLabel(l)))
                self.triple(h, RDFS.domain, self.CLASS)
                self.triple(h, RDFS.range,
                            self.COLUMNS.get(i, default_node_make).range())

        rows = 0
        for l in csvreader:
            try:
                if self.IDENT == 'auto':
                    uri = self.BASE["%d" % rows]
                else:
                    # Replace spaces on the str, then quote (quote encodes
                    # str as UTF-8); encode()-then-replace mixed bytes and
                    # str and raised TypeError on Python 3.
                    uri = self.BASE["_".join(
                        [urllib.parse.quote(x.replace(" ", "_"), safe="")
                         for x in index(l, self.IDENT)])]

                if self.LABEL:
                    self.triple(uri, RDFS.label, rdflib.Literal(
                        " ".join(index(l, self.LABEL))))

                if self.CLASS:
                    # type triple
                    self.triple(uri, RDF.type, self.CLASS)

                for i, x in enumerate(l):
                    x = x.strip()
                    if x != '':
                        if self.COLUMNS.get(i) == _config_ignore:
                            continue
                        try:
                            o = self.COLUMNS.get(i, rdflib.Literal)(x)
                            if isinstance(o, list):
                                for _o in o:
                                    self.triple(uri, headers[i], _o)
                            else:
                                self.triple(uri, headers[i], o)

                        except Exception as e:
                            # exceptions have no .message on Python 3
                            warnings.warn(
                                "Could not process value for column " +
                                "%d:%s in row %d, ignoring: %s " % (
                                    i, headers[i], rows, e))

                rows += 1
                if rows % 100000 == 0:
                    sys.stderr.write(
                        "%d rows, %d triples, elapsed %.2fs.\n" % (
                            rows, self.triples, time.time() - start))
            except Exception:
                sys.stderr.write("Error processing line: %d\n" % rows)
                raise

        # output types/labels for generated URIs
        classes = set()
        for l, x in uris.items():
            u, c = x
            self.triple(u, RDFS.label, rdflib.Literal(l))
            if c:
                c = rdflib.URIRef(c)
                classes.add(c)
                self.triple(u, RDF.type, c)

        for c in classes:
            self.triple(c, RDF.type, RDFS.Class)

        # Never close the process-wide stdout; just flush it.
        if self.OUT is sys.stdout:
            self.OUT.flush()
        else:
            self.OUT.close()
        sys.stderr.write(
            "Converted %d rows into %d triples.\n" % (rows, self.triples))
        sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))
414
415
def main():
    """
    Command-line entry point: configure a CSV2RDF from an optional config
    file (-f) and from the command-line options, then convert the given
    files (or stdin).  Command-line options are applied after the config
    file and therefore override it.  See ``HELP`` for the option list.
    """
    csv2rdf = CSV2RDF()

    args = sys.argv[1:]
    long_opts = ["out=", "base=", "delim=", "propbase=", "class=",
                 "ident=", "label=", "skip=", "defineclass", "help"]
    # --col<N> / --prop<N> are open-ended, so register the ones actually
    # present on the command line; getopt otherwise rejects them as unknown.
    for arg in args:
        numbered = re.match(r"--((?:col|prop)\d+)(?:=|$)", arg)
        if numbered and numbered.group(1) + "=" not in long_opts:
            long_opts.append(numbered.group(1) + "=")

    opts, files = getopt.getopt(args, "hc:b:p:i:o:Cf:l:s:d:", long_opts)
    opts = dict(opts)

    if "-h" in opts or "--help" in opts:
        print(HELP)
        sys.exit(-1)

    if "-f" in opts:
        config = configparser.ConfigParser()
        # read_file replaces the deprecated readfp; the with-block closes
        # the config file once it has been parsed.
        with open(opts["-f"]) as config_file:
            config.read_file(config_file)
        for k, v in config.items("csv2rdf"):
            if k == "out":
                csv2rdf.OUT = codecs.open(v, "w", "utf-8")
            elif k == "base":
                csv2rdf.BASE = rdflib.Namespace(v)
            elif k == "propbase":
                csv2rdf.PROPBASE = rdflib.Namespace(v)
            elif k == "class":
                csv2rdf.CLASS = rdflib.URIRef(v)
            elif k == "defineclass":
                # bool("false") is True; treat the usual "off" spellings as
                # False and, as before, any other non-empty value as True.
                csv2rdf.DEFINECLASS = v.strip().lower() not in (
                    "", "0", "no", "false", "off")
            elif k == "ident":
                # SECURITY: eval of config text - trusted input only.
                csv2rdf.IDENT = eval(v)
            elif k == "label":
                # SECURITY: eval of config text - trusted input only.
                csv2rdf.LABEL = eval(v)
            elif k == "delim":
                csv2rdf.DELIM = v
            elif k == "skip":
                csv2rdf.SKIP = int(v)
            elif k.startswith("col"):
                csv2rdf.COLUMNS[int(k[3:])] = column(v)
            elif k.startswith("prop"):
                csv2rdf.PROPS[int(k[4:])] = rdflib.URIRef(v)

    if "-o" in opts:
        csv2rdf.OUT = codecs.open(opts["-o"], "w", "utf-8")
    if "--out" in opts:
        csv2rdf.OUT = codecs.open(opts["--out"], "w", "utf-8")

    if "-b" in opts:
        csv2rdf.BASE = rdflib.Namespace(opts["-b"])
    if "--base" in opts:
        csv2rdf.BASE = rdflib.Namespace(opts["--base"])

    if "-d" in opts:
        csv2rdf.DELIM = opts["-d"]
    if "--delim" in opts:
        csv2rdf.DELIM = opts["--delim"]

    if "-p" in opts:
        csv2rdf.PROPBASE = rdflib.Namespace(opts["-p"])
    if "--propbase" in opts:
        csv2rdf.PROPBASE = rdflib.Namespace(opts["--propbase"])

    # SECURITY: -l/--label and -i/--ident values are passed to eval so that
    # tuples of column indexes can be given; only use with trusted input.
    if "-l" in opts:
        csv2rdf.LABEL = eval(opts["-l"])
    if "--label" in opts:
        csv2rdf.LABEL = eval(opts["--label"])

    if "-i" in opts:
        csv2rdf.IDENT = eval(opts["-i"])
    if "--ident" in opts:
        csv2rdf.IDENT = eval(opts["--ident"])

    if "-s" in opts:
        csv2rdf.SKIP = int(opts["-s"])
    if "--skip" in opts:
        csv2rdf.SKIP = int(opts["--skip"])

    if "-c" in opts:
        csv2rdf.CLASS = rdflib.URIRef(opts["-c"])
    if "--class" in opts:
        csv2rdf.CLASS = rdflib.URIRef(opts["--class"])

    for k, v in opts.items():
        # Match the numbered options exactly: a plain startswith("--prop")
        # also caught --propbase and crashed on int("base").
        numbered = re.match(r"--(col|prop)(\d+)$", k)
        if not numbered:
            continue
        position = int(numbered.group(2))
        if numbered.group(1) == "col":
            csv2rdf.COLUMNS[position] = column(v)
        else:
            csv2rdf.PROPS[position] = rdflib.URIRef(v)

    if csv2rdf.CLASS and ("-C" in opts or "--defineclass" in opts):
        csv2rdf.DEFINECLASS = True

    csv2rdf.convert(
        csv_reader(fileinput.input(files), delimiter=csv2rdf.DELIM))
508
509
# Run the converter only when executed as a script, not on import.
if __name__ == "__main__":
    main()