Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/rdflib/tools/csv2rdf.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
| author | shellac |
|---|---|
| date | Sat, 02 May 2020 07:14:21 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:26e78fe6e8c4 |
|---|---|
| 1 """ | |
| 2 A commandline tool for semi-automatically converting CSV to RDF | |
| 3 | |
| 4 try: ``csv2rdf --help`` | |
| 5 | |
| 6 """ | |
| 7 | |
| 8 | |
| 9 import sys | |
| 10 import re | |
| 11 import csv | |
| 12 import getopt | |
| 13 import configparser | |
| 14 import fileinput | |
| 15 import codecs | |
| 16 import time | |
| 17 import datetime | |
| 18 import warnings | |
| 19 import urllib.request, urllib.error, urllib.parse | |
| 20 | |
| 21 import rdflib | |
| 22 | |
| 23 from rdflib import RDF, RDFS | |
| 24 from rdflib.namespace import split_uri | |
| 25 | |
# Public API of this module: only the converter class is exported.
__all__ = [ 'CSV2RDF' ]
| 27 | |
# Usage text printed by ``main()`` for -h/--help.  A backslash at the end
# of a line inside the literal is a line continuation, so the synopsis and
# the long-option list are each emitted as one long line.
HELP = """
csv2rdf.py \
    -b <instance-base> \
    -p <property-base> \
    [-c <classname>] \
    [-i <identity column(s)>] \
    [-l <label columns>] \
    [-s <N>] [-o <output>] \
    [-f configfile] \
    [--col<N> <colspec>] \
    [--prop<N> <property>] \
    [-d <delim>] \
    [-C] [files...]

Reads csv files from stdin or given files
if -d is given, use this delimiter
if -s is given, skips N lines at the start
Creates a URI from the columns given to -i, or automatically by numbering if
none is given
Outputs RDFS labels from the columns given to -l
if -c is given adds a type triple with the given classname
if -C is given, the class is defined as rdfs:Class
Outputs one RDF triple per column in each row.
Output is in n3 format.
Output is stdout, unless -o is specified

Long options also supported: \
    --base, \
    --propbase, \
    --ident, \
    --class, \
    --label, \
    --out, \
    --delim, \
    --skip, \
    --defineclass, \
    --help

Long options --col0, --col1, ...
can be used to specify conversion for columns.
Conversions can be:
    float(), int(), split(sep, [more]), uri(base, [class]), date(format)

Long options --prop0, --prop1, ...
can be used to use specific properties, rather than ones auto-generated
from the headers

-f says to read config from a .ini/config file - the file must contain one
section called csv2rdf, with keys like the long options, i.e.:

[csv2rdf]
out=output.n3
base=http://example.org/
col0=split(";")
col1=split(";", uri("http://example.org/things/",
                    "http://xmlns.com/foaf/0.1/Person"))
col2=float()
col3=int()
col4=date("%Y-%b-%d %H:%M:%S")

"""
| 86 | |
# bah - ugly global
# Module-wide registry filled by prefixuri(): maps the raw cell text to a
# (URIRef, class-or-None) pair.  CSV2RDF.convert() reads it after the last
# row to emit labels and types for the generated URIs.
uris = {}
| 89 | |
| 90 | |
def toProperty(label):
    """
    CamelCase + lowercase initial a string

    The label is split into words at every non-alphanumeric character
    (underscores included) and at every lower-case/upper-case boundary;
    the first word is lower-cased and each following word is capitalised.

    FIRST_NM => firstNm

    firstNm => firstNm

    :param label: header text to convert.
    :return: the lowerCamelCase form of ``label`` as a ``str``.
    """
    # ``\w`` also matches "_", so a plain ``[^\w]`` would leave FIRST_NM in
    # one piece and produce "first_nm"; the underscore is listed explicitly
    # so that the documented mapping above actually holds.
    spaced = re.sub(r"[\W_]", " ", label)
    spaced = re.sub(r"([a-z])([A-Z])", r"\1 \2", spaced)
    words = spaced.split(" ")
    return words[0].lower() + "".join(w.capitalize() for w in words[1:])
| 105 | |
| 106 | |
def toPropertyLabel(label):
    """Lower-case the first character of ``label`` unless the second
    character is upper-case, in which case the label is returned as is.

    :param label: text to turn into a property label.
    :return: the adjusted label.
    """
    second = label[1:2]
    if second.isupper():
        return label
    head, tail = label[:1], label[1:]
    return head.lower() + tail
| 111 | |
| 112 | |
def index(l, i):
    """Return, as a tuple, the items of ``l`` at the positions listed in ``i``

    >>> index([1,2,3],(0,2))
    (1, 3)
    """
    return tuple(l[position] for position in i)
| 119 | |
| 120 | |
def csv_reader(csv_data, dialect=csv.excel, **kwargs):
    """Yield the rows of ``csv_data`` as lists of ``str`` cells.

    :param csv_data: iterable of text lines (e.g. an open text file or a
        ``fileinput`` object).
    :param dialect: csv dialect handed to :func:`csv.reader`.
    :param kwargs: further formatting options for :func:`csv.reader`,
        such as ``delimiter``.
    :return: a generator producing one list of cells per row.
    """
    # On Python 3 the csv module already yields str cells; the previous
    # ``str(cell, 'utf-8')`` decode step raised TypeError on every row.
    for row in csv.reader(csv_data, dialect=dialect, **kwargs):
        yield list(row)
| 128 | |
| 129 | |
def prefixuri(x, prefix, class_=None):
    """Build a URIRef for the cell value ``x`` and record it in ``uris``.

    :param x: cell text.
    :param prefix: when truthy, the URI is ``prefix`` followed by ``x``
        with spaces turned into underscores and everything else
        percent-quoted; when falsy, ``x`` itself is used as the URI.
    :param class_: optional class stored alongside the URI in ``uris``.
    :return: the resulting ``rdflib.URIRef``.
    """
    if prefix:
        # quote() encodes str input as UTF-8 itself.  Encoding first and
        # then calling .replace(" ", "_") on the bytes raised TypeError.
        r = rdflib.URIRef(
            prefix + urllib.parse.quote(x.replace(" ", "_"), safe=""))
    else:
        r = rdflib.URIRef(x)
    # Remembered so the converter can emit a label/type for it later.
    uris[x] = (r, class_)
    return r
| 139 | |
| 140 # meta-language for config | |
| 141 | |
| 142 | |
class NodeMaker(object):
    """Base column converter: turns a cell value into a plain literal."""

    def __call__(self, x):
        """Wrap the cell value ``x`` in an ``rdflib.Literal``."""
        literal = rdflib.Literal(x)
        return literal

    def range(self):
        """Return the term used as rdfs:range for this kind of column."""
        return rdflib.RDFS.Literal
| 149 | |
| 150 | |
class NodeUri(NodeMaker):
    """Column converter producing URIs via ``prefixuri``."""

    def __init__(self, prefix, class_):
        """Remember the URI prefix and the optional class of the URIs."""
        self.prefix = prefix
        self.class_ = rdflib.URIRef(class_) if class_ else None

    def range(self):
        """Return the configured class, or rdf:Resource without one."""
        if self.class_:
            return self.class_
        return rdflib.RDF.Resource

    def __call__(self, x):
        """Turn the cell value ``x`` into a URI under the prefix."""
        return prefixuri(x, self.prefix, self.class_)
| 164 | |
| 165 | |
class NodeLiteral(NodeMaker):
    """Literal-producing converter that carries an optional argument ``f``.

    Subclasses decide what ``f`` means: NodeFloat/NodeInt call it as a
    pre-processing function, NodeDate passes it to strptime as the format.
    """

    def __init__(self, f=None):
        # Stored untouched; interpretation is left to the subclass.
        self.f = f
| 169 | |
| 170 | |
class NodeFloat(NodeLiteral):
    """Column converter producing float literals."""

    def __call__(self, x):
        """Convert ``x`` to a float literal, applying ``self.f`` first
        when one was given."""
        pre = self.f
        if pre:
            if not callable(pre):
                raise Exception("Function passed to float is not callable")
            x = pre(x)
        return rdflib.Literal(float(x))

    def range(self):
        """Return xsd:double as the range of the column."""
        return rdflib.XSD.double
| 181 | |
| 182 | |
class NodeInt(NodeLiteral):
    """Column converter producing integer literals."""

    def __call__(self, x):
        """Convert ``x`` to an int literal, applying ``self.f`` first
        when one was given."""
        pre = self.f
        if pre:
            if not callable(pre):
                raise Exception("Function passed to int is not callable")
            x = pre(x)
        return rdflib.Literal(int(x))

    def range(self):
        """Return xsd:int as the range of the column."""
        return rdflib.XSD.int
| 193 | |
| 194 | |
class NodeReplace(NodeMaker):
    """Converter that substitutes one substring for another."""

    def __init__(self, a, b):
        """Remember the text to look for (``a``) and its replacement."""
        self.a, self.b = a, b

    def __call__(self, x):
        """Return ``x`` with every ``self.a`` replaced by ``self.b``.

        The result is whatever ``x.replace`` returns, not an RDF term.
        """
        old, new = self.a, self.b
        return x.replace(old, new)
| 202 | |
| 203 | |
class NodeDate(NodeLiteral):
    """Column converter producing dateTime literals; ``self.f`` holds the
    strptime format string."""

    def __call__(self, x):
        """Parse ``x`` with the stored format and wrap it in a literal."""
        parsed = datetime.datetime.strptime(x, self.f)
        return rdflib.Literal(parsed)

    def range(self):
        """Return xsd:dateTime as the range of the column."""
        return rdflib.XSD.dateTime
| 210 | |
| 211 | |
class NodeSplit(NodeMaker):
    """Column converter that splits a cell and converts each piece."""

    def __init__(self, sep, f):
        """Remember the separator and the per-piece converter."""
        self.sep = sep
        self.f = f

    def __call__(self, x):
        """Split ``x`` on ``self.sep`` and return the converted non-empty
        pieces as a list.

        A missing converter is replaced by ``rdflib.Literal`` and that
        choice is kept on the instance.
        """
        self.f = self.f or rdflib.Literal
        if not callable(self.f):
            raise Exception("Function passed to split is not callable!")
        pieces = (chunk.strip() for chunk in x.split(self.sep))
        return [self.f(piece) for piece in pieces if piece != ""]

    def range(self):
        """Return the range of the per-piece converter when it is a
        NodeMaker, else the default literal range."""
        inner = self.f
        if not (inner and isinstance(inner, NodeMaker)):
            return NodeMaker.range(self)
        return inner.range()
| 229 | |
# Shared fallback converter; convert() asks it for the rdfs:range of
# columns that have no explicit converter configured.
default_node_make = NodeMaker()
| 231 | |
| 232 | |
def _config_ignore(*args, **kwargs):
    """Config-language ``ignore``: accept anything, return "ignore"."""
    return "ignore"
| 235 | |
| 236 | |
def _config_uri(prefix=None, class_=None):
    """Config-language ``uri(base, [class])``: build a NodeUri."""
    maker = NodeUri(prefix=prefix, class_=class_)
    return maker
| 239 | |
| 240 | |
def _config_literal():
    """Config-language ``literal()``: build a plain-literal converter.

    :return: a ``NodeLiteral`` instance, whose inherited ``__call__``
        wraps the cell value in ``rdflib.Literal``.
    """
    # The class itself used to be returned here; calling that with a cell
    # value constructed a NodeLiteral object instead of an RDF literal, and
    # ``.range()`` failed for lack of an instance.
    return NodeLiteral()
| 243 | |
| 244 | |
def _config_float(f=None):
    """Config-language ``float([f])``: build a NodeFloat."""
    maker = NodeFloat(f=f)
    return maker
| 247 | |
| 248 | |
def _config_replace(a, b):
    """Config-language ``replace(a, b)``: build a NodeReplace."""
    maker = NodeReplace(a=a, b=b)
    return maker
| 251 | |
| 252 | |
def _config_int(f=None):
    """Config-language ``int([f])``: build a NodeInt."""
    maker = NodeInt(f=f)
    return maker
| 255 | |
| 256 | |
def _config_date(format_):
    """Config-language ``date(format)``: build a NodeDate that parses
    cells with the given strptime format."""
    maker = NodeDate(f=format_)
    return maker
| 259 | |
| 260 | |
def _config_split(sep=None, f=None):
    """Config-language ``split(sep, [more])``: build a NodeSplit."""
    maker = NodeSplit(sep=sep, f=f)
    return maker
| 263 | |
# Names available inside a column specification; column() evaluates the
# specification text with this mapping as its global namespace.
config_functions = dict(
    ignore=_config_ignore,
    uri=_config_uri,
    literal=_config_literal,
    float=_config_float,
    int=_config_int,
    date=_config_date,
    split=_config_split,
    replace=_config_replace,
)
| 273 | |
| 274 | |
def column(v):
    """Return a function for column mapping

    :param v: column specification text such as ``float()`` or
        ``split(";")``, evaluated with ``config_functions`` as globals.
    :return: whatever the expression evaluates to, normally a converter
        object that is later called with each cell value.
    """
    # NOTE(security): eval() runs arbitrary Python.  Specifications come
    # from the command line or the config file, so only use this with
    # input you trust.
    # A copy is passed because eval() inserts "__builtins__" into the
    # globals mapping it is given, which would pollute the shared registry.
    return eval(v, dict(config_functions))
| 279 | |
| 280 | |
class CSV2RDF(object):
    """Convert rows from a CSV reader into triples written in N3 syntax.

    Settings are plain attributes, assigned before calling ``convert``:

    * ``CLASS`` - class used in the rdf:type triple of every row, or None.
    * ``BASE`` / ``PROPBASE`` - namespaces for row URIs and properties.
    * ``IDENT`` - ``'auto'`` to number rows, else column index(es) whose
      values form the row URI.
    * ``LABEL`` - column index(es) joined into an rdfs:label, or None.
    * ``DEFINECLASS`` - also emit class/property definitions.
    * ``SKIP`` - number of leading rows to discard before the header.
    * ``DELIM`` - delimiter; not used by this class itself.
    * ``COLUMNS`` - column index -> converter callable.
    * ``PROPS`` - column index -> property URI overriding the header.
    * ``OUT`` - text stream the triples are written to.
    * ``triples`` - number of triples written so far.
    """

    def __init__(self):
        self.CLASS = None
        self.BASE = None
        self.PROPBASE = None
        self.IDENT = 'auto'
        self.LABEL = None
        self.DEFINECLASS = False
        self.SKIP = 0
        self.DELIM = ","

        self.COLUMNS = {}
        self.PROPS = {}

        # The UTF-8 writer emits bytes, so it has to wrap the binary buffer
        # beneath sys.stdout; wrapping the text stream itself fails with
        # TypeError on the first write.  A replacement stdout without a
        # buffer is used directly.
        raw_stdout = getattr(sys.stdout, "buffer", None)
        if raw_stdout is not None:
            self.OUT = codecs.getwriter("utf-8")(raw_stdout, errors='replace')
        else:
            self.OUT = sys.stdout

        self.triples = 0

    def triple(self, s, p, o):
        """Write one ``s p o .`` line to ``self.OUT`` and count it.

        Each argument must provide an ``n3()`` method.
        """
        self.OUT.write("%s %s %s .\n" % (s.n3(), p.n3(), o.n3()))
        self.triples += 1

    def _is_ignored(self, i):
        """Tell whether column ``i`` was configured to be ignored."""
        spec = self.COLUMNS.get(i)
        # The specification "ignore()" evaluates to the string "ignore",
        # the bare name "ignore" to the function itself; honour both.
        return spec == "ignore" or spec is _config_ignore

    def convert(self, csvreader):
        """Read all rows from ``csvreader`` and write their triples.

        ``self.SKIP`` rows are discarded first, the next row is the header
        and every remaining row becomes one resource.  Progress and
        summary messages go to stderr and ``self.OUT`` is closed at the
        end.

        :param csvreader: iterator yielding one list of ``str`` cells per
            row.
        :raises Exception: any error while processing a row is re-raised
            after the row number has been reported on stderr; failures
            converting a single cell only produce a warning.
        """
        start = time.time()

        if self.OUT:
            sys.stderr.write(
                "Output to %s\n" % getattr(self.OUT, "name", self.OUT))

        # A single column index is accepted for both IDENT and LABEL.
        if self.IDENT != "auto" and not isinstance(
                self.IDENT, (tuple, list)):
            self.IDENT = (self.IDENT,)
        # ``is not None`` so that column 0 is not mistaken for "no label".
        if self.LABEL is not None and not isinstance(
                self.LABEL, (tuple, list)):
            self.LABEL = (self.LABEL,)

        if not self.BASE:
            warnings.warn("No base given, using http://example.org/instances/")
            self.BASE = rdflib.Namespace("http://example.org/instances/")

        if not self.PROPBASE:
            # The message names the namespace that is actually used.
            warnings.warn(
                "No property base given, using http://example.org/props/")
            self.PROPBASE = rdflib.Namespace("http://example.org/props/")

        # skip lines at the start
        for _ in range(self.SKIP):
            next(csvreader)

        # read header line
        header_labels = list(next(csvreader))
        headers = dict(
            enumerate([self.PROPBASE[toProperty(x)] for x in header_labels]))
        # override header properties if some are given
        for k, v in self.PROPS.items():
            headers[k] = v
            header_labels[k] = split_uri(v)[1]

        if self.DEFINECLASS:
            # output class/property definitions
            self.triple(self.CLASS, RDF.type, RDFS.Class)
            for i, l in enumerate(header_labels):
                h = headers[i]
                if h == "" or l == "":
                    continue
                if self._is_ignored(i):
                    continue
                self.triple(h, RDF.type, RDF.Property)
                self.triple(h, RDFS.label, rdflib.Literal(toPropertyLabel(l)))
                self.triple(h, RDFS.domain, self.CLASS)
                self.triple(h, RDFS.range,
                            self.COLUMNS.get(i, default_node_make).range())

        rows = 0
        for l in csvreader:
            try:
                if self.IDENT == 'auto':
                    uri = self.BASE["%d" % rows]
                else:
                    # quote() encodes str as UTF-8 itself; encoding first
                    # made the following .replace() fail on bytes.
                    uri = self.BASE["_".join(
                        urllib.parse.quote(x.replace(" ", "_"), safe="")
                        for x in index(l, self.IDENT))]

                if self.LABEL:
                    self.triple(uri, RDFS.label, rdflib.Literal(
                        " ".join(index(l, self.LABEL))))

                if self.CLASS:
                    # type triple
                    self.triple(uri, RDF.type, self.CLASS)

                for i, x in enumerate(l):
                    x = x.strip()
                    if x == '' or self._is_ignored(i):
                        continue
                    try:
                        o = self.COLUMNS.get(i, rdflib.Literal)(x)
                        if isinstance(o, list):
                            for _o in o:
                                self.triple(uri, headers[i], _o)
                        else:
                            self.triple(uri, headers[i], o)

                    except Exception as e:
                        # Exceptions have no ``.message`` on Python 3, and
                        # ``headers[i]`` may be the very lookup that
                        # failed, hence str(e) and .get() here.
                        warnings.warn(
                            "Could not process value for column "
                            "%d:%s in row %d, ignoring: %s " % (
                                i, headers.get(i), rows, e))

                rows += 1
                if rows % 100000 == 0:
                    sys.stderr.write(
                        "%d rows, %d triples, elapsed %.2fs.\n" % (
                            rows, self.triples, time.time() - start))
            except:
                # Report where it happened, then let the error propagate.
                sys.stderr.write("Error processing line: %d\n" % rows)
                raise

        # output types/labels for generated URIs
        classes = set()
        for l, x in uris.items():
            u, c = x
            self.triple(u, RDFS.label, rdflib.Literal(l))
            if c:
                c = rdflib.URIRef(c)
                classes.add(c)
                self.triple(u, RDF.type, c)

        for c in classes:
            self.triple(c, RDF.type, RDFS.Class)

        self.OUT.close()
        sys.stderr.write(
            "Converted %d rows into %d triples.\n" % (rows, self.triples))
        sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))
| 414 | |
| 415 | |
def main():
    """Command-line entry point.

    Parses ``sys.argv``, optionally loads settings from the config file
    named by ``-f`` (section ``csv2rdf``), applies command-line options on
    top of those, and converts the given files (or stdin) with a
    ``CSV2RDF`` instance.  ``-h``/``--help`` prints ``HELP`` and exits.

    :raises getopt.GetoptError: on unknown or malformed options.
    """
    csv2rdf = CSV2RDF()

    argv = sys.argv[1:]
    long_opts = ["out=", "base=", "delim=", "propbase=", "class=",
                 "ident=", "label=", "skip=", "defineclass", "help"]
    # --col<N> / --prop<N> are open-ended, so they cannot be listed up
    # front; without registering the ones actually present, getopt rejects
    # them and the per-column loop below could never run.
    numbered = re.compile(r"--((?:col|prop)\d+)(?:=|$)")
    for arg in argv:
        found = numbered.match(arg)
        if found and found.group(1) + "=" not in long_opts:
            long_opts.append(found.group(1) + "=")

    opts, files = getopt.getopt(argv, "hc:b:p:i:o:Cf:l:s:d:", long_opts)
    opts = dict(opts)

    if "-h" in opts or "--help" in opts:
        print(HELP)
        sys.exit(-1)

    # NOTE(security): ident/label values and column specifications are
    # passed to eval(); only use trusted config files and arguments.

    if "-f" in opts:
        config = configparser.ConfigParser()
        # read_file() replaces the deprecated readfp(); the with-block
        # closes the handle that was previously leaked.
        with open(opts["-f"]) as config_file:
            config.read_file(config_file)
        # raw=True: values such as date("%Y-%b-%d") contain "%" and must
        # not go through configparser's interpolation.
        for k, v in config.items("csv2rdf", raw=True):
            if k == "out":
                csv2rdf.OUT = codecs.open(v, "w", "utf-8")
            elif k == "base":
                csv2rdf.BASE = rdflib.Namespace(v)
            elif k == "propbase":
                csv2rdf.PROPBASE = rdflib.Namespace(v)
            elif k == "class":
                csv2rdf.CLASS = rdflib.URIRef(v)
            elif k == "defineclass":
                # bool("false") is True; parse the usual spellings and
                # fall back to truthiness for anything else.
                try:
                    csv2rdf.DEFINECLASS = config.getboolean(
                        "csv2rdf", k, raw=True)
                except ValueError:
                    csv2rdf.DEFINECLASS = bool(v)
            elif k == "ident":
                csv2rdf.IDENT = eval(v)
            elif k == "label":
                csv2rdf.LABEL = eval(v)
            elif k == "delim":
                csv2rdf.DELIM = v
            elif k == "skip":
                csv2rdf.SKIP = int(v)
            elif k.startswith("col"):
                csv2rdf.COLUMNS[int(k[3:])] = column(v)
            elif k.startswith("prop"):
                csv2rdf.PROPS[int(k[4:])] = rdflib.URIRef(v)

    def option(short, long_):
        # Value given for either spelling, or None; the long form wins
        # when both are present, as it did before.
        if long_ in opts:
            return opts[long_]
        return opts.get(short)

    value = option("-o", "--out")
    if value is not None:
        csv2rdf.OUT = codecs.open(value, "w", "utf-8")

    value = option("-b", "--base")
    if value is not None:
        csv2rdf.BASE = rdflib.Namespace(value)

    value = option("-d", "--delim")
    if value is not None:
        csv2rdf.DELIM = value

    value = option("-p", "--propbase")
    if value is not None:
        csv2rdf.PROPBASE = rdflib.Namespace(value)

    value = option("-l", "--label")
    if value is not None:
        csv2rdf.LABEL = eval(value)

    value = option("-i", "--ident")
    if value is not None:
        csv2rdf.IDENT = eval(value)

    value = option("-s", "--skip")
    if value is not None:
        csv2rdf.SKIP = int(value)

    value = option("-c", "--class")
    if value is not None:
        csv2rdf.CLASS = rdflib.URIRef(value)

    # Match the number explicitly: a plain startswith("--prop") test also
    # catches "--propbase" and then crashes in int("base").
    for k, v in opts.items():
        col = re.fullmatch(r"--col(\d+)", k)
        prop = re.fullmatch(r"--prop(\d+)", k)
        if col:
            csv2rdf.COLUMNS[int(col.group(1))] = column(v)
        elif prop:
            csv2rdf.PROPS[int(prop.group(1))] = rdflib.URIRef(v)

    if csv2rdf.CLASS and ("-C" in opts or "--defineclass" in opts):
        csv2rdf.DEFINECLASS = True

    csv2rdf.convert(
        csv_reader(fileinput.input(files), delimiter=csv2rdf.DELIM))
| 509 | |
# Run the converter only when executed as a script, not on import.
if __name__ == '__main__':
    main()
