Mercurial > repos > guerler > springsuite
diff planemo/lib/python3.7/site-packages/rdflib/tools/csv2rdf.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
line wrap: on
line diff
"""
A commandline tool for semi-automatically converting CSV to RDF

try: ``csv2rdf --help``

"""


import sys
import re
import csv
import getopt
import configparser
import fileinput
import codecs
import time
import datetime
import warnings
import urllib.request
import urllib.error
import urllib.parse

import rdflib

from rdflib import RDF, RDFS
from rdflib.namespace import split_uri

__all__ = ["CSV2RDF"]

HELP = """
csv2rdf.py \
    -b <instance-base> \
    -p <property-base> \
    [-c <classname>] \
    [-i <identity column(s)>] \
    [-l <label columns>] \
    [-s <N>] [-o <output>] \
    [-f configfile] \
    [--col<N> <colspec>] \
    [--prop<N> <property>] \
    [-d <delim>] \
    [-C] [files...]

Reads csv files from stdin or given files
if -d is given, use this delimiter
if -s is given, skips N lines at the start
Creates a URI from the columns given to -i, or automatically by numbering if
none is given
Outputs RDFS labels from the columns given to -l
if -c is given adds a type triple with the given classname
if -C is given, the class is defined as rdfs:Class
Outputs one RDF triple per column in each row.
Output is in n3 format.
Output is stdout, unless -o is specified

Long options also supported: \
    --base, \
    --propbase, \
    --ident, \
    --class, \
    --label, \
    --out, \
    --defineclass

Long options --col0, --col1, ...
can be used to specify conversion for columns.
Conversions can be:
    float(), int(), split(sep, [more]), uri(base, [class]), date(format)

Long options --prop0, --prop1, ...
can be used to use specific properties, rather than ones auto-generated
from the headers

-f says to read config from a .ini/config file - the file must contain one
section called csv2rdf, with keys like the long options, i.e.:

[csv2rdf]
out=output.n3
base=http://example.org/
col0=split(";")
col1=split(";", uri("http://example.org/things/",
                "http://xmlns.com/foaf/0.1/Person"))
col2=float()
col3=int()
col4=date("%Y-%b-%d %H:%M:%S")

"""

# bah - ugly global
# Maps every raw cell value turned into a URI by prefixuri() to a
# (URIRef, class-or-None) pair; convert() emits labels/types for these
# after all rows have been processed.
uris = {}


def toProperty(label):
    """
    Turn a header label into a camelCase local name with a lowercase
    first word.

    Every non-word character becomes a word break, as does every
    lower-to-upper case transition.  Underscores count as word
    characters, so they are kept and do not split words.

    FIRST NM => firstNm

    firstNm => firstNm

    FIRST_NM => first_nm

    """
    label = re.sub(r"[^\w]", " ", label)
    label = re.sub("([a-z])([A-Z])", "\\1 \\2", label)
    words = label.split(" ")
    return "".join([words[0].lower()] + [w.capitalize() for w in words[1:]])


def toPropertyLabel(label):
    """Lowercase the first character of *label* unless the second
    character is uppercase (which suggests an acronym such as ``URL``)."""
    if not label[1:2].isupper():
        return label[0:1].lower() + label[1:]
    return label


def index(l, i):
    """return a tuple with the elements of ``l`` found at the indexes ``i``
    >>> index([1,2,3],(0,2))
    (1, 3)
    """
    return tuple(l[x] for x in i)


def csv_reader(csv_data, dialect=csv.excel, **kwargs):
    """Yield each row of *csv_data* as a list of ``str`` cells.

    *csv_data* is any iterable of text lines; *dialect* and **kwargs
    are passed straight through to :func:`csv.reader`.
    """
    # On Python 3 the csv module already produces str cells; the former
    # ``str(cell, 'utf-8')`` decode step raised TypeError for every row.
    for row in csv.reader(csv_data, dialect=dialect, **kwargs):
        yield list(row)


def prefixuri(x, prefix, class_=None):
    """Build a URIRef for the cell value *x* and record it in ``uris``.

    With a truthy *prefix* the value is appended to it after replacing
    spaces by underscores and percent-quoting everything else; otherwise
    *x* is used verbatim as the URI.  *class_* is stored alongside so
    that convert() can emit a type triple later.
    """
    if prefix:
        # quote() encodes str input as UTF-8 itself; encoding first and
        # then calling .replace(" ", "_") on bytes raised TypeError.
        r = rdflib.URIRef(
            prefix + urllib.parse.quote(x.replace(" ", "_"), safe=""))
    else:
        r = rdflib.URIRef(x)
    uris[x] = (r, class_)
    return r

# meta-language for config


class NodeMaker(object):
    """Base column converter: turns a cell string into a plain Literal."""

    def range(self):
        """Return the rdfs:range advertised for columns using this maker."""
        return rdflib.RDFS.Literal

    def __call__(self, x):
        return rdflib.Literal(x)


class NodeUri(NodeMaker):
    """Converter producing URIs via prefixuri(), optionally typed."""

    def __init__(self, prefix, class_):
        self.prefix = prefix
        if class_:
            self.class_ = rdflib.URIRef(class_)
        else:
            self.class_ = None

    def __call__(self, x):
        return prefixuri(x, self.prefix, self.class_)

    def range(self):
        # rdfs:Resource lives in the RDFS namespace; the RDF namespace
        # has no "Resource" term.
        return self.class_ or rdflib.RDFS.Resource


class NodeLiteral(NodeMaker):
    """Literal converter with an optional pre-processing callable ``f``."""

    def __init__(self, f=None):
        self.f = f


class NodeFloat(NodeLiteral):
    """Converter producing float literals."""

    def __call__(self, x):
        if not self.f:
            return rdflib.Literal(float(x))
        if callable(self.f):
            return rdflib.Literal(float(self.f(x)))
        raise Exception("Function passed to float is not callable")

    def range(self):
        return rdflib.XSD.double


class NodeInt(NodeLiteral):
    """Converter producing integer literals."""

    def __call__(self, x):
        if not self.f:
            return rdflib.Literal(int(x))
        if callable(self.f):
            return rdflib.Literal(int(self.f(x)))
        raise Exception("Function passed to int is not callable")

    def range(self):
        return rdflib.XSD.int


class NodeReplace(NodeMaker):
    """String pre-processor replacing ``a`` by ``b``.

    Returns a plain ``str`` rather than an RDF node, so it is only
    useful as the inner function of another converter, e.g.
    ``float(replace(",", "."))``.
    """

    def __init__(self, a, b):
        self.a = a
        self.b = b

    def __call__(self, x):
        return x.replace(self.a, self.b)


class NodeDate(NodeLiteral):
    """Converter parsing the cell with ``strptime``; ``f`` is the format."""

    def __call__(self, x):
        return rdflib.Literal(datetime.datetime.strptime(x, self.f))

    def range(self):
        return rdflib.XSD.dateTime


class NodeSplit(NodeMaker):
    """Converter splitting a cell on ``sep`` and converting every
    non-empty, stripped part with ``f`` (default: plain Literal)."""

    def __init__(self, sep, f):
        self.sep = sep
        self.f = f

    def __call__(self, x):
        if not self.f:
            self.f = rdflib.Literal
        if not callable(self.f):
            raise Exception("Function passed to split is not callable!")
        return [
            self.f(y.strip()) for y in x.split(self.sep) if y.strip() != ""]

    def range(self):
        if self.f and isinstance(self.f, NodeMaker):
            return self.f.range()
        return NodeMaker.range(self)

default_node_make = NodeMaker()


def _config_ignore(*args, **kwargs):
    return "ignore"


def _config_uri(prefix=None, class_=None):
    return NodeUri(prefix, class_)


def _config_literal():
    # Return an instance: handing back the class itself meant that
    # calling the "converter" built a NodeLiteral object, not a Literal.
    return NodeLiteral()


def _config_float(f=None):
    return NodeFloat(f)


def _config_replace(a, b):
    return NodeReplace(a, b)


def _config_int(f=None):
    return NodeInt(f)


def _config_date(format_):
    return NodeDate(format_)


def _config_split(sep=None, f=None):
    return NodeSplit(sep, f)

config_functions = {"ignore": _config_ignore,
                    "uri": _config_uri,
                    "literal": _config_literal,
                    "float": _config_float,
                    "int": _config_int,
                    "date": _config_date,
                    "split": _config_split,
                    "replace": _config_replace
                    }


def column(v):
    """Return a function for column mapping"""
    # SECURITY: the column spec is evaluated as a Python expression.
    # eval() injects __builtins__ into config_functions, so this does
    # not sandbox anything - only use specs from trusted sources.
    return eval(v, config_functions)


def _wraps_stdout(stream):
    """Tell whether *stream* is sys.stdout or a codec wrapper around it."""
    target = getattr(stream, "stream", stream)
    if target is sys.stdout:
        return True
    raw = getattr(sys.stdout, "buffer", None)
    return raw is not None and target is raw


class CSV2RDF(object):
    """Stateful converter; configure the upper-case attributes, then
    call :meth:`convert` with an iterator of CSV rows."""

    def __init__(self):

        self.CLASS = None
        self.BASE = None
        self.PROPBASE = None
        self.IDENT = 'auto'
        self.LABEL = None
        self.DEFINECLASS = False
        self.SKIP = 0
        self.DELIM = ","

        self.COLUMNS = {}
        self.PROPS = {}

        # The codec writer emits bytes, so it has to sit on top of the
        # binary layer of stdout; wrapping the text-mode sys.stdout made
        # every write fail with TypeError.  Fall back to the text stream
        # when stdout has been replaced by an object without .buffer.
        raw_stdout = getattr(sys.stdout, "buffer", None)
        if raw_stdout is not None:
            self.OUT = codecs.getwriter("utf-8")(raw_stdout, errors='replace')
        else:
            self.OUT = sys.stdout

        self.triples = 0

    def triple(self, s, p, o):
        """Write one triple in N3 syntax to ``self.OUT`` and count it."""
        self.OUT.write("%s %s %s .\n" % (s.n3(), p.n3(), o.n3()))
        self.triples += 1

    def convert(self, csvreader):
        """Convert all rows from *csvreader* and write them to ``self.OUT``.

        *csvreader* is an iterator of rows (lists of str); the first row
        after ``self.SKIP`` skipped rows is taken as the header.  Progress
        and a summary are written to stderr.  The output stream is closed
        at the end unless it is stdout, which is only flushed.
        """

        start = time.time()

        if self.OUT:
            sys.stderr.write(
                "Output to %s\n" % getattr(self.OUT, "name", "<stream>"))

        if self.IDENT != "auto" and not isinstance(self.IDENT, tuple):
            self.IDENT = (self.IDENT,)

        # Same normalisation for labels: a scalar such as 0 would
        # otherwise be falsy (ignored) or not iterable in index().
        if self.LABEL is not None and \
                not isinstance(self.LABEL, (tuple, list)):
            self.LABEL = (self.LABEL,)

        if not self.BASE:
            warnings.warn("No base given, using http://example.org/instances/")
            self.BASE = rdflib.Namespace("http://example.org/instances/")

        if not self.PROPBASE:
            warnings.warn(
                "No property base given, using http://example.org/props/")
            self.PROPBASE = rdflib.Namespace("http://example.org/props/")

        # skip lines at the start
        for _ in range(self.SKIP):
            next(csvreader)

        # read header line
        header_labels = list(next(csvreader))
        headers = dict(
            enumerate([self.PROPBASE[toProperty(x)] for x in header_labels]))
        # override header properties if some are given
        for k, v in self.PROPS.items():
            headers[k] = v
            header_labels[k] = split_uri(v)[1]

        if self.DEFINECLASS and self.CLASS is None:
            # Mirrors the command line, where -C without -c is ignored.
            warnings.warn("defineclass requested but no class given, "
                          "skipping class/property definitions")
        elif self.DEFINECLASS:
            # output class/property definitions
            self.triple(self.CLASS, RDF.type, RDFS.Class)
            for i in range(len(headers)):
                h, label = headers[i], header_labels[i]
                if h == "" or label == "":
                    continue
                if self.COLUMNS.get(i) == _config_ignore:
                    continue
                self.triple(h, RDF.type, RDF.Property)
                self.triple(
                    h, RDFS.label, rdflib.Literal(toPropertyLabel(label)))
                self.triple(h, RDFS.domain, self.CLASS)
                self.triple(h, RDFS.range,
                            self.COLUMNS.get(i, default_node_make).range())

        rows = 0
        for row in csvreader:
            try:
                if self.IDENT == 'auto':
                    uri = self.BASE["%d" % rows]
                else:
                    # quote() takes str directly (encoded as UTF-8).
                    uri = self.BASE["_".join(
                        [urllib.parse.quote(x.replace(" ", "_"), safe="")
                         for x in index(row, self.IDENT)])]

                if self.LABEL:
                    self.triple(uri, RDFS.label, rdflib.Literal(
                        " ".join(index(row, self.LABEL))))

                if self.CLASS:
                    # type triple
                    self.triple(uri, RDF.type, self.CLASS)

                for i, x in enumerate(row):
                    x = x.strip()
                    if x != '':
                        if self.COLUMNS.get(i) == _config_ignore:
                            continue
                        try:
                            o = self.COLUMNS.get(i, rdflib.Literal)(x)
                            if isinstance(o, list):
                                for _o in o:
                                    self.triple(uri, headers[i], _o)
                            else:
                                self.triple(uri, headers[i], o)

                        except Exception as e:
                            # Exceptions have no .message on Python 3,
                            # and headers.get() keeps this handler from
                            # failing when a row is wider than the header.
                            warnings.warn(
                                "Could not process value for column " +
                                "%d:%s in row %d, ignoring: %s " % (
                                    i, headers.get(i), rows, e))

                rows += 1
                if rows % 100000 == 0:
                    sys.stderr.write(
                        "%d rows, %d triples, elapsed %.2fs.\n" % (
                            rows, self.triples, time.time() - start))
            except Exception:
                sys.stderr.write("Error processing line: %d\n" % rows)
                raise

        # output types/labels for generated URIs
        classes = set()
        for raw_value, (u, c) in uris.items():
            self.triple(u, RDFS.label, rdflib.Literal(raw_value))
            if c:
                c = rdflib.URIRef(c)
                classes.add(c)
                self.triple(u, RDF.type, c)

        for c in classes:
            self.triple(c, RDF.type, RDFS.Class)

        # Never close the process-wide stdout; just make sure it is flushed.
        if _wraps_stdout(self.OUT):
            self.OUT.flush()
        else:
            self.OUT.close()
        sys.stderr.write(
            "Converted %d rows into %d triples.\n" % (rows, self.triples))
        sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))


def _dynamic_long_options(argv):
    """Collect getopt long-option specs for --colN / --propN in *argv*.

    getopt only accepts long options it has been told about, and the
    column numbers are open-ended, so they are discovered from the
    actual arguments (scanning stops at a literal ``--``).
    """
    found = []
    for arg in argv:
        if arg == "--":
            break
        match = re.match(r"--((?:col|prop)\d+)(?:=|$)", arg)
        if match:
            spec = match.group(1) + "="
            if spec not in found:
                found.append(spec)
    return found


def _parse_bool(text):
    """Interpret a config value as a flag; common "off" spellings and
    the empty string are False, everything else is True."""
    return text.strip().lower() not in ("", "0", "false", "no", "off")


def main():
    """Command line entry point: parse options/config and run convert()."""
    csv2rdf = CSV2RDF()

    argv = sys.argv[1:]
    opts, files = getopt.getopt(
        argv,
        "hc:b:p:i:o:Cf:l:s:d:",
        ["out=", "base=", "delim=", "propbase=", "class=",
         "ident=", "label=", "skip=", "defineclass", "help"]
        + _dynamic_long_options(argv))
    opts = dict(opts)

    if "-h" in opts or "--help" in opts:
        print(HELP)
        sys.exit(-1)

    if "-f" in opts:
        config = configparser.ConfigParser()
        # read_file() replaces the deprecated readfp(); the context
        # manager makes sure the config file handle is closed.
        with open(opts["-f"]) as config_file:
            config.read_file(config_file)
        for k, v in config.items("csv2rdf"):
            if k == "out":
                csv2rdf.OUT = codecs.open(v, "w", "utf-8")
            elif k == "base":
                csv2rdf.BASE = rdflib.Namespace(v)
            elif k == "propbase":
                csv2rdf.PROPBASE = rdflib.Namespace(v)
            elif k == "class":
                csv2rdf.CLASS = rdflib.URIRef(v)
            elif k == "defineclass":
                # bool("false") is True, so parse the text explicitly.
                csv2rdf.DEFINECLASS = _parse_bool(v)
            elif k == "ident":
                # SECURITY: evaluates config text as Python - trusted
                # config files only.
                csv2rdf.IDENT = eval(v)
            elif k == "label":
                # SECURITY: see "ident" above.
                csv2rdf.LABEL = eval(v)
            elif k == "delim":
                csv2rdf.DELIM = v
            elif k == "skip":
                csv2rdf.SKIP = int(v)
            else:
                col_key = re.fullmatch(r"col(\d+)", k)
                prop_key = re.fullmatch(r"prop(\d+)", k)
                if col_key:
                    csv2rdf.COLUMNS[int(col_key.group(1))] = column(v)
                elif prop_key:
                    csv2rdf.PROPS[int(prop_key.group(1))] = rdflib.URIRef(v)

    if "-o" in opts:
        csv2rdf.OUT = codecs.open(opts["-o"], "w", "utf-8")
    if "--out" in opts:
        csv2rdf.OUT = codecs.open(opts["--out"], "w", "utf-8")

    if "-b" in opts:
        csv2rdf.BASE = rdflib.Namespace(opts["-b"])
    if "--base" in opts:
        csv2rdf.BASE = rdflib.Namespace(opts["--base"])

    if "-d" in opts:
        csv2rdf.DELIM = opts["-d"]
    if "--delim" in opts:
        csv2rdf.DELIM = opts["--delim"]

    if "-p" in opts:
        csv2rdf.PROPBASE = rdflib.Namespace(opts["-p"])
    if "--propbase" in opts:
        csv2rdf.PROPBASE = rdflib.Namespace(opts["--propbase"])

    # SECURITY: -l/-i values are evaluated as Python expressions (e.g.
    # "0" or "(0, 1)"); do not feed them untrusted input.
    if "-l" in opts:
        csv2rdf.LABEL = eval(opts["-l"])
    if "--label" in opts:
        csv2rdf.LABEL = eval(opts["--label"])

    if "-i" in opts:
        csv2rdf.IDENT = eval(opts["-i"])
    if "--ident" in opts:
        csv2rdf.IDENT = eval(opts["--ident"])

    if "-s" in opts:
        csv2rdf.SKIP = int(opts["-s"])
    if "--skip" in opts:
        csv2rdf.SKIP = int(opts["--skip"])

    if "-c" in opts:
        csv2rdf.CLASS = rdflib.URIRef(opts["-c"])
    if "--class" in opts:
        csv2rdf.CLASS = rdflib.URIRef(opts["--class"])

    for k, v in opts.items():
        # Match the numeric suffix explicitly: a bare startswith("--prop")
        # also caught "--propbase" and crashed on int("base").
        col_opt = re.fullmatch(r"--col(\d+)", k)
        prop_opt = re.fullmatch(r"--prop(\d+)", k)
        if col_opt:
            csv2rdf.COLUMNS[int(col_opt.group(1))] = column(v)
        elif prop_opt:
            csv2rdf.PROPS[int(prop_opt.group(1))] = rdflib.URIRef(v)

    if csv2rdf.CLASS and ("-C" in opts or "--defineclass" in opts):
        csv2rdf.DEFINECLASS = True

    csv2rdf.convert(
        csv_reader(fileinput.input(files), delimiter=csv2rdf.DELIM))


if __name__ == '__main__':
    main()