Mercurial > repos > guerler > springsuite
view planemo/lib/python3.7/site-packages/rdflib/tools/csv2rdf.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
line wrap: on
line source
"""
A commandline tool for semi-automatically converting CSV to RDF

try: ``csv2rdf --help``

"""

import sys
import re
import csv
import getopt
import configparser
import fileinput
import codecs
import time
import datetime
import warnings
import urllib.request
import urllib.error
import urllib.parse

import rdflib

from rdflib import RDF, RDFS
from rdflib.namespace import split_uri

__all__ = ['CSV2RDF']

HELP = """
csv2rdf.py \
    -b <instance-base> \
    -p <property-base> \
    [-c <classname>] \
    [-i <identity column(s)>] \
    [-l <label columns>] \
    [-s <N>] [-o <output>] \
    [-f configfile] \
    [--col<N> <colspec>] \
    [--prop<N> <property>] \
    [-d <delim>] \
    [-C] [files...]

Reads csv files from stdin or given files
if -d is given, use this delimiter
if -s is given, skips N lines at the start
Creates a URI from the columns given to -i, or automatically by numbering if
none is given
Outputs RDFS labels from the columns given to -l
if -c is given adds a type triple with the given classname
if -C is given, the class is defined as rdfs:Class
Outputs one RDF triple per column in each row.
Output is in n3 format.
Output is stdout, unless -o is specified

Long options also supported: \
    --base, \
    --propbase, \
    --ident, \
    --class, \
    --label, \
    --out, \
    --defineclass

Long options --col0, --col1, ...
can be used to specify conversion for columns.
Conversions can be:
    float(), int(), split(sep, [more]), uri(base, [class]), date(format)

Long options --prop0, --prop1, ...
can be used to use specific properties, rather than ones auto-generated
from the headers

-f says to read config from a .ini/config file - the file must contain one
section called csv2rdf, with keys like the long options, i.e.:

[csv2rdf]
out=output.n3
base=http://example.org/
col0=split(";")
col1=split(";", uri("http://example.org/things/",
                    "http://xmlns.com/foaf/0.1/Person"))
col2=float()
col3=int()
col4=date("%Y-%b-%d %H:%M:%S")

"""

# bah - ugly global
# Maps every raw cell value turned into a URI by prefixuri() to a
# (URIRef, class-or-None) pair; convert() emits labels/types for these at the
# end of a run.
uris = {}

# Matches the per-column long options ``--col<N>`` / ``--prop<N>``, with or
# without an attached ``=value``.
_COLUMN_OPTION = re.compile(r"^--(col|prop)(\d+)(?:=.*)?$", re.DOTALL)

# Config-file spellings that switch ``defineclass`` off.
_FALSE_STRINGS = ("", "0", "false", "no", "off")


def toProperty(label):
    """
    CamelCase + lowercase initial a string

    FIRST NM => firstNm
    firstNm => firstNm

    Every non-word character becomes a word break.  The underscore is a word
    character for ``\\w`` and is therefore kept: FIRST_NM => first_nm
    """
    label = re.sub(r"[^\w]", " ", label)
    label = re.sub("([a-z])([A-Z])", "\\1 \\2", label)
    words = label.split(" ")
    return "".join([words[0].lower()] + [x.capitalize() for x in words[1:]])


def toPropertyLabel(label):
    """Lowercase the first character unless the second one is uppercase."""
    if not label[1:2].isupper():
        return label[0:1].lower() + label[1:]
    return label


def index(l, i):
    """return a set of indexes from a list
    >>> index([1,2,3],(0,2))
    (1, 3)
    """
    return tuple([l[x] for x in i])


def csv_reader(csv_data, dialect=csv.excel, **kwargs):
    """Yield each row of *csv_data* as a list of ``str`` cells.

    *dialect* and any extra keyword arguments are passed to ``csv.reader``.
    """
    # On Python 3 the csv module already yields text; the former
    # ``str(cell, 'utf-8')`` re-decoding raised TypeError for every cell.
    for row in csv.reader(csv_data, dialect=dialect, **kwargs):
        yield list(row)


def prefixuri(x, prefix, class_=None):
    """Build a URIRef for the cell value *x* and remember it in ``uris``.

    With a *prefix*, spaces in *x* become underscores and the result is
    percent-quoted (nothing is treated as safe) before being appended to the
    prefix; without one, *x* itself is used as the URI.
    """
    if prefix:
        # quote() encodes str as UTF-8 itself; calling .encode() first left
        # bytes whose .replace(" ", "_") raised TypeError.
        r = rdflib.URIRef(
            prefix + urllib.parse.quote(x.replace(" ", "_"), safe=""))
    else:
        r = rdflib.URIRef(x)
    uris[x] = (r, class_)
    return r


# meta-language for config


class NodeMaker(object):
    """Base column converter: turns a cell into a plain literal."""

    def range(self):
        return rdflib.RDFS.Literal

    def __call__(self, x):
        return rdflib.Literal(x)


class NodeUri(NodeMaker):
    """Column converter producing URIs, optionally typed with *class_*."""

    def __init__(self, prefix, class_):
        self.prefix = prefix
        if class_:
            self.class_ = rdflib.URIRef(class_)
        else:
            self.class_ = None

    def __call__(self, x):
        return prefixuri(x, self.prefix, self.class_)

    def range(self):
        # rdfs:Resource lives in the RDFS vocabulary; the RDF namespace has no
        # term called "Resource".
        return self.class_ or rdflib.RDFS.Resource


class NodeLiteral(NodeMaker):
    """Literal converter holding an optional pre-processing argument *f*."""

    def __init__(self, f=None):
        self.f = f


class NodeFloat(NodeLiteral):
    """Convert a cell to a float literal, optionally via ``f(cell)`` first."""

    def __call__(self, x):
        if not self.f:
            return rdflib.Literal(float(x))
        if callable(self.f):
            return rdflib.Literal(float(self.f(x)))
        raise Exception("Function passed to float is not callable")

    def range(self):
        return rdflib.XSD.double


class NodeInt(NodeLiteral):
    """Convert a cell to an int literal, optionally via ``f(cell)`` first."""

    def __call__(self, x):
        if not self.f:
            return rdflib.Literal(int(x))
        if callable(self.f):
            return rdflib.Literal(int(self.f(x)))
        raise Exception("Function passed to int is not callable")

    def range(self):
        return rdflib.XSD.int


class NodeReplace(NodeMaker):
    """Substring replacement on a cell.

    Returns a plain ``str`` rather than an RDF node, so it is only usable as
    the inner function of another converter, e.g. ``float(replace(",", "."))``.
    """

    def __init__(self, a, b):
        self.a = a
        self.b = b

    def __call__(self, x):
        return x.replace(self.a, self.b)


class NodeDate(NodeLiteral):
    """Parse a cell with the ``strptime`` format stored in ``f``."""

    def __call__(self, x):
        return rdflib.Literal(datetime.datetime.strptime(x, self.f))

    def range(self):
        return rdflib.XSD.dateTime


class NodeSplit(NodeMaker):
    """Split a cell on *sep* and convert every non-empty piece with *f*."""

    def __init__(self, sep, f):
        self.sep = sep
        self.f = f

    def __call__(self, x):
        if not self.f:
            self.f = rdflib.Literal
        if not callable(self.f):
            raise Exception("Function passed to split is not callable!")
        return [
            self.f(y.strip()) for y in x.split(self.sep) if y.strip() != ""]

    def range(self):
        if self.f and isinstance(self.f, NodeMaker):
            return self.f.range()
        return NodeMaker.range(self)


default_node_make = NodeMaker()


def _config_ignore(*args, **kwargs):
    return "ignore"


def _config_uri(prefix=None, class_=None):
    return NodeUri(prefix, class_)


def _config_literal():
    # Return an instance: the bare class is not a usable converter (calling
    # it builds a NodeLiteral object instead of an RDF literal).
    return NodeLiteral()


def _config_float(f=None):
    return NodeFloat(f)


def _config_replace(a, b):
    return NodeReplace(a, b)


def _config_int(f=None):
    return NodeInt(f)


def _config_date(format_):
    return NodeDate(format_)


def _config_split(sep=None, f=None):
    return NodeSplit(sep, f)


config_functions = {"ignore": _config_ignore,
                    "uri": _config_uri,
                    "literal": _config_literal,
                    "float": _config_float,
                    "int": _config_int,
                    "date": _config_date,
                    "split": _config_split,
                    "replace": _config_replace
                    }


def _is_ignored(spec):
    """True when a column spec means "skip this column".

    Covers both spellings: the bare name ``ignore`` evaluates to the
    ``_config_ignore`` function, the call ``ignore()`` to the string
    ``"ignore"``.
    """
    return spec is _config_ignore or spec == "ignore"


def column(v):
    """Return a function for column mapping"""
    # SECURITY: eval() of a user-supplied expression.  Only feed this column
    # specs from a trusted command line or config file.
    return eval(v, config_functions)


def _default_output():
    """Return a UTF-8 writer on standard output.

    The codecs writer emits bytes, so it has to wrap the binary buffer of
    sys.stdout.  When stdout was replaced by an object without a ``buffer``,
    that text stream is used as is.
    """
    raw = getattr(sys.stdout, "buffer", None)
    if raw is None:
        return sys.stdout
    return codecs.getwriter("utf-8")(raw, errors='replace')


def _wraps_stdout(out):
    """True if *out* is sys.stdout or a codecs writer around its buffer."""
    if out is sys.stdout:
        return True
    raw = getattr(sys.stdout, "buffer", None)
    return raw is not None and getattr(out, "stream", None) is raw


def _as_columns(value):
    """Normalise a column selection (single index, tuple or list) to a tuple."""
    if isinstance(value, (tuple, list)):
        return tuple(value)
    return (value,)


class CSV2RDF(object):
    """Converter state plus the ``convert`` driver.

    The upper-case attributes are the knobs ``main`` fills in from the command
    line / config file: CLASS, BASE, PROPBASE, IDENT, LABEL, DEFINECLASS,
    SKIP, DELIM, COLUMNS (index -> converter), PROPS (index -> property URI)
    and OUT (the output stream).
    """

    def __init__(self):
        self.CLASS = None
        self.BASE = None
        self.PROPBASE = None
        self.IDENT = 'auto'
        self.LABEL = None
        self.DEFINECLASS = False
        self.SKIP = 0
        self.DELIM = ","

        self.COLUMNS = {}
        self.PROPS = {}

        self.OUT = _default_output()

        self.triples = 0

    def triple(self, s, p, o):
        """Write one triple in n3 syntax to OUT and count it."""
        self.OUT.write("%s %s %s .\n" % (s.n3(), p.n3(), o.n3()))
        self.triples += 1

    def convert(self, csvreader):
        """Convert the rows produced by *csvreader* to triples on OUT.

        After SKIP rows are skipped, the first row is read as the header;
        each remaining row yields one subject URI and one triple per
        non-empty, non-ignored cell.  Progress and timing go to stderr.
        Errors while converting a single cell are reported with
        ``warnings.warn`` and the cell is skipped; any other error is
        re-raised after the failing row number is written to stderr.
        """
        start = time.time()

        if self.OUT:
            sys.stderr.write(
                "Output to %s\n" % getattr(self.OUT, "name", self.OUT))

        if self.IDENT != "auto":
            self.IDENT = _as_columns(self.IDENT)
        # A single label column may be given as a bare index (0 included).
        if self.LABEL is not None:
            self.LABEL = _as_columns(self.LABEL)

        if not self.BASE:
            warnings.warn("No base given, using http://example.org/instances/")
            self.BASE = rdflib.Namespace("http://example.org/instances/")

        if not self.PROPBASE:
            warnings.warn(
                "No property base given, using http://example.org/props/")
            self.PROPBASE = rdflib.Namespace("http://example.org/props/")

        # skip lines at the start
        for _ in range(self.SKIP):
            next(csvreader)

        # read header line
        header_labels = list(next(csvreader))
        headers = {
            i: self.PROPBASE[toProperty(x)]
            for i, x in enumerate(header_labels)}
        # override header properties if some are given
        for k, v in self.PROPS.items():
            headers[k] = v
            header_labels[k] = split_uri(v)[1]

        if self.DEFINECLASS:
            if self.CLASS is None:
                # Without a class there is nothing to define or use as domain.
                warnings.warn(
                    "defineclass requested but no class given, "
                    "skipping class/property definitions")
            else:
                self._define_schema(headers, header_labels)

        rows = 0
        for l in csvreader:
            try:
                self._convert_row(l, rows, headers)
                rows += 1
                if rows % 100000 == 0:
                    sys.stderr.write(
                        "%d rows, %d triples, elapsed %.2fs.\n" % (
                            rows, self.triples, time.time() - start))
            except Exception:
                sys.stderr.write("Error processing line: %d\n" % rows)
                raise

        # output types/labels for generated URIs
        classes = set()
        for l, x in uris.items():
            u, c = x
            self.triple(u, RDFS.label, rdflib.Literal(l))
            if c:
                c = rdflib.URIRef(c)
                classes.add(c)
                self.triple(u, RDF.type, c)

        for c in classes:
            self.triple(c, RDF.type, RDFS.Class)

        # Standard output belongs to the process: flush it, never close it.
        if _wraps_stdout(self.OUT):
            self.OUT.flush()
        else:
            self.OUT.close()
        sys.stderr.write(
            "Converted %d rows into %d triples.\n" % (rows, self.triples))
        sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))

    def _define_schema(self, headers, header_labels):
        """Emit the rdfs:Class triple and one property definition per column."""
        # output class/property definitions
        self.triple(self.CLASS, RDF.type, RDFS.Class)
        for i in range(len(headers)):
            h, l = headers[i], header_labels[i]
            if h == "" or l == "":
                continue
            if _is_ignored(self.COLUMNS.get(i)):
                continue
            self.triple(h, RDF.type, RDF.Property)
            self.triple(h, RDFS.label, rdflib.Literal(toPropertyLabel(l)))
            self.triple(h, RDFS.domain, self.CLASS)
            self.triple(h, RDFS.range,
                        self.COLUMNS.get(i, default_node_make).range())

    def _convert_row(self, l, rows, headers):
        """Emit the label, type and per-cell triples of data row *l*."""
        if self.IDENT == 'auto':
            uri = self.BASE["%d" % rows]
        else:
            uri = self.BASE["_".join(
                [urllib.parse.quote(x.replace(" ", "_"), safe="")
                 for x in index(l, self.IDENT)])]

        if self.LABEL:
            self.triple(uri, RDFS.label, rdflib.Literal(
                " ".join(index(l, self.LABEL))))

        if self.CLASS:
            # type triple
            self.triple(uri, RDF.type, self.CLASS)

        for i, x in enumerate(l):
            x = x.strip()
            if x == '':
                continue
            if _is_ignored(self.COLUMNS.get(i)):
                continue
            try:
                o = self.COLUMNS.get(i, rdflib.Literal)(x)
                if isinstance(o, list):
                    for _o in o:
                        self.triple(uri, headers[i], _o)
                else:
                    self.triple(uri, headers[i], o)
            except Exception as e:
                # headers.get(): a row longer than the header must not raise
                # again from inside this handler.
                warnings.warn(
                    "Could not process value for column " +
                    "%d:%s in row %d, ignoring: %s " % (
                        i, headers.get(i), rows, e))


def _column_longopts(argv):
    """Return getopt long-option specs for every --col<N>/--prop<N> in *argv*.

    getopt only accepts long options it was told about, so the numbered
    per-column options have to be discovered up front.
    """
    specs = []
    for arg in argv:
        if arg == "--":
            break
        m = _COLUMN_OPTION.match(arg)
        if m:
            spec = m.group(1) + m.group(2) + "="
            if spec not in specs:
                specs.append(spec)
    return specs


def _read_config(csv2rdf, path):
    """Apply the ``[csv2rdf]`` section of the config file *path* to *csv2rdf*."""
    config = configparser.ConfigParser()
    with open(path) as fh:
        config.read_file(fh)

    for k in config.options("csv2rdf"):
        try:
            v = config.get("csv2rdf", k)
        except configparser.InterpolationError:
            # Values such as date("%Y-%b-%d") are not valid interpolation
            # syntax; take them literally.
            v = config.get("csv2rdf", k, raw=True)

        if k == "out":
            csv2rdf.OUT = codecs.open(v, "w", "utf-8")
        elif k == "base":
            csv2rdf.BASE = rdflib.Namespace(v)
        elif k == "propbase":
            csv2rdf.PROPBASE = rdflib.Namespace(v)
        elif k == "class":
            csv2rdf.CLASS = rdflib.URIRef(v)
        elif k == "defineclass":
            # bool("false") is True; interpret the usual "off" spellings.
            csv2rdf.DEFINECLASS = v.strip().lower() not in _FALSE_STRINGS
        elif k == "ident":
            # SECURITY: eval() of config text - trusted config files only.
            csv2rdf.IDENT = eval(v)
        elif k == "label":
            # SECURITY: eval() of config text - trusted config files only.
            csv2rdf.LABEL = eval(v)
        elif k == "delim":
            csv2rdf.DELIM = v
        elif k == "skip":
            csv2rdf.SKIP = int(v)
        elif k.startswith("col"):
            csv2rdf.COLUMNS[int(k[3:])] = column(v)
        elif k.startswith("prop"):
            csv2rdf.PROPS[int(k[4:])] = rdflib.URIRef(v)


def main(argv=None):
    """Command line entry point.

    *argv* is the argument list without the program name; it defaults to
    ``sys.argv[1:]``.  Config-file settings (-f) are applied first and then
    overridden by command line options.
    """
    if argv is None:
        argv = sys.argv[1:]
    argv = list(argv)

    csv2rdf = CSV2RDF()

    opts, files = getopt.getopt(
        argv,
        "hc:b:p:i:o:Cf:l:s:d:",
        ["out=", "base=", "delim=", "propbase=", "class=",
         "ident=", "label=", "skip=", "defineclass", "help"]
        + _column_longopts(argv))
    opts = dict(opts)

    if "-h" in opts or "--help" in opts:
        print(HELP)
        sys.exit(-1)

    if "-f" in opts:
        _read_config(csv2rdf, opts["-f"])

    def given(short, long_):
        """Value of an option; the long form wins when both are present."""
        value = None
        for name in (short, long_):
            if name in opts:
                value = opts[name]
        return value

    out = given("-o", "--out")
    if out is not None:
        csv2rdf.OUT = codecs.open(out, "w", "utf-8")

    base = given("-b", "--base")
    if base is not None:
        csv2rdf.BASE = rdflib.Namespace(base)

    delim = given("-d", "--delim")
    if delim is not None:
        csv2rdf.DELIM = delim

    propbase = given("-p", "--propbase")
    if propbase is not None:
        csv2rdf.PROPBASE = rdflib.Namespace(propbase)

    label = given("-l", "--label")
    if label is not None:
        # SECURITY: eval() of a command line value - trusted input only.
        csv2rdf.LABEL = eval(label)

    ident = given("-i", "--ident")
    if ident is not None:
        # SECURITY: eval() of a command line value - trusted input only.
        csv2rdf.IDENT = eval(ident)

    skip = given("-s", "--skip")
    if skip is not None:
        csv2rdf.SKIP = int(skip)

    class_ = given("-c", "--class")
    if class_ is not None:
        csv2rdf.CLASS = rdflib.URIRef(class_)

    for k, v in opts.items():
        # Match the numbered options exactly: a plain startswith("--prop")
        # test would also swallow --propbase.
        m = _COLUMN_OPTION.match(k)
        if not m:
            continue
        if m.group(1) == "col":
            csv2rdf.COLUMNS[int(m.group(2))] = column(v)
        else:
            csv2rdf.PROPS[int(m.group(2))] = rdflib.URIRef(v)

    if csv2rdf.CLASS and ("-C" in opts or "--defineclass" in opts):
        csv2rdf.DEFINECLASS = True

    # Named files are decoded as UTF-8 (undecodable bytes replaced); the hook
    # is not applied to standard input.
    lines = fileinput.input(
        files, openhook=fileinput.hook_encoded("utf-8", "replace"))
    csv2rdf.convert(csv_reader(lines, delimiter=csv2rdf.DELIM))


if __name__ == '__main__':
    main()