guppy_basecaller: env/lib/python3.7/site-packages/rdflib/tools/csv2rdf.py comparison

comparison env/lib/python3.7/site-packages/rdflib/tools/csv2rdf.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"

author	shellac
date	Sat, 02 May 2020 07:14:21 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:26e78fe6e8c4
+"""
+A commandline tool for semi-automatically converting CSV to RDF
+try: ``csv2rdf --help``
+"""
+import sys
+import re
+import csv
+import getopt
+import configparser
+import fileinput
+import codecs
+import time
+import datetime
+import warnings
+import urllib.request, urllib.error, urllib.parse
+import rdflib
+from rdflib import RDF, RDFS
+from rdflib.namespace import split_uri
# Public API of this module: only the converter class is exported.
__all__ = ["CSV2RDF"]
# Usage text printed by ``main`` for -h/--help.  A backslash at the end of
# a line joins it with the next one, so the whole synopsis (and the list of
# long options) is emitted as a single line each.
HELP = """
csv2rdf.py \
-b <instance-base> \
-p <property-base> \
[-c <classname>] \
[-i <identity column(s)>] \
[-l <label columns>] \
[-s <N>] [-o <output>] \
[-f configfile] \
[--col<N> <colspec>] \
[--prop<N> <property>] \
[-d <delim>] \
[-C] [files...]
Reads csv files from stdin or given files
if -d is given, use this delimiter
if -s is given, skips N lines at the start
Creates a URI from the columns given to -i, or automatically by numbering if
none is given
Outputs RDFS labels from the columns given to -l
if -c is given adds a type triple with the given classname
if -C is given, the class is defined as rdfs:Class
Outputs one RDF triple per column in each row.
Output is in n3 format.
Output is stdout, unless -o is specified
Long options also supported: \
--base, \
--propbase, \
--ident, \
--class, \
--label, \
--out, \
--delim, \
--skip, \
--defineclass
Long options --col0, --col1, ...
can be used to specify conversion for columns.
Conversions can be:
float(), int(), split(sep, [more]), uri(base, [class]), date(format)
Long options --prop0, --prop1, ...
can be used to use specific properties, rather than ones auto-generated
from the headers
-f says to read config from a .ini/config file - the file must contain one
section called csv2rdf, with keys like the long options, i.e.:
[csv2rdf]
out=output.n3
base=http://example.org/
col0=split(";")
col1=split(";", uri("http://example.org/things/",
"http://xmlns.com/foaf/0.1/Person"))
col2=float()
col3=int()
col4=date("%Y-%b-%d %H:%M:%S")
"""

# Module-level registry of every URI minted by ``prefixuri``: it maps the raw
# cell text to a ``(URIRef, class-or-None)`` pair.  ``CSV2RDF.convert`` reads
# it after the last row to emit labels/types for those URIs.  It is never
# cleared, so it accumulates across conversions in the same process.
uris = {}
def toProperty(label):
    """
    Turn a column header into a lowerCamelCase property name.

    The label is cut into words at every character that is not a letter or
    digit (underscores included) and at every lower-case/upper-case
    boundary.  The first word is lower-cased, every following word is
    capitalised, and the words are glued back together.

    FIRST_NM => firstNm
    firstNm => firstNm

    :param label: header text of one CSV column
    :return: the property local name; ``""`` if the label has no words
    """
    # "_" is a word character for the regex engine, so it has to be listed
    # explicitly; otherwise FIRST_NM would come out as "first_nm".
    spaced = re.sub(r"[\W_]", " ", label)
    spaced = re.sub(r"([a-z])([A-Z])", r"\1 \2", spaced)
    # split() without an argument drops the empty pieces produced by
    # leading, trailing or repeated separators.
    words = spaced.split()
    if not words:
        return ""
    return words[0].lower() + "".join(word.capitalize() for word in words[1:])
def toPropertyLabel(label):
    """
    Lower-case the first character of ``label`` for use as a property label.

    A label whose second character is upper case (e.g. an acronym such as
    ``URL``) is returned unchanged.
    """
    second_is_upper = label[1:2].isupper()
    if second_is_upper:
        return label
    head, tail = label[:1], label[1:]
    return head.lower() + tail
def index(l, i):
    """Pick the items of ``l`` found at the positions listed in ``i``.

    The result is a tuple, in the order the positions were given.

    >>> index([1,2,3],(0,2))
    (1, 3)
    """
    return tuple(l[position] for position in i)
def csv_reader(csv_data, dialect=csv.excel, **kwargs):
    """
    Yield the rows of ``csv_data`` as lists of text cells.

    :param csv_data: iterable of lines; each line may be ``str`` or
        UTF-8 encoded ``bytes``
    :param dialect: csv dialect handed to ``csv.reader``
    :param kwargs: further formatting parameters for ``csv.reader``
        (for instance ``delimiter``)
    :return: generator of rows, each a list of ``str``
    """
    # On Python 3 the csv module only accepts text and already returns text
    # cells, so the UTF-8 decoding has to happen on the incoming lines (and
    # only for those that are bytes) rather than on the parsed cells.
    # Undecodable bytes are replaced instead of aborting the conversion.
    text_lines = (
        line.decode("utf-8", errors="replace") if isinstance(line, bytes) else line
        for line in csv_data
    )
    for row in csv.reader(text_lines, dialect=dialect, **kwargs):
        yield row
def prefixuri(x, prefix, class_=None):
    """
    Build a URIRef for the cell value ``x`` and remember it in ``uris``.

    :param x: raw cell text
    :param prefix: base URI; when falsy, ``x`` itself is used as the URI
    :param class_: optional class recorded next to the URI in ``uris``
    :return: the ``rdflib.URIRef`` that was created
    """
    if prefix:
        # Replace spaces on the text itself, then let quote() do the UTF-8
        # percent-encoding (its default for str input).  Encoding to bytes
        # first and calling .replace(" ", "_") on the bytes is a TypeError
        # on Python 3.
        local_name = urllib.parse.quote(x.replace(" ", "_"), safe="")
        r = rdflib.URIRef(prefix + local_name)
    else:
        r = rdflib.URIRef(x)
    # keyed by the raw value, which is later used as the label text
    uris[x] = (r, class_)
    return r
+# meta-language for config
class NodeMaker(object):
    """Base column converter: wraps a cell value in a plain ``rdflib.Literal``."""

    def __call__(self, x):
        """Convert one cell value ``x`` into an RDF node."""
        node = rdflib.Literal(x)
        return node

    def range(self):
        """Return the term used as ``rdfs:range`` for columns of this kind."""
        return rdflib.RDFS.Literal
class NodeUri(NodeMaker):
    """Column converter that turns cell values into URIs (``uri(...)`` in a colspec)."""

    def __init__(self, prefix, class_):
        """
        :param prefix: base URI prepended to the quoted cell value; may be
            falsy, in which case the cell value is used as the URI as-is
        :param class_: optional class URI (string) for the created resources
        """
        self.prefix = prefix
        if class_:
            self.class_ = rdflib.URIRef(class_)
        else:
            self.class_ = None

    def __call__(self, x):
        """Return the URIRef for cell value ``x`` (also registered via ``prefixuri``)."""
        return prefixuri(x, self.prefix, self.class_)

    def range(self):
        """Return the configured class, or ``rdfs:Resource`` when none was given."""
        # The generic "any resource" class lives in the RDFS vocabulary;
        # the RDF vocabulary defines no ``Resource`` term.
        return self.class_ or rdflib.RDFS.Resource
class NodeLiteral(NodeMaker):
    """Literal-producing converter that carries one optional argument ``f``.

    Subclasses decide what ``f`` means: a pre-processing callable for
    ``NodeFloat``/``NodeInt``, a format string for ``NodeDate``.
    """

    def __init__(self, f=None):
        # stored untouched; it is interpreted when the converter is called
        self.f = f
class NodeFloat(NodeLiteral):
    """Converter for ``float(...)`` columns: emits the cell as a float literal."""

    def range(self):
        """Columns of this kind range over ``xsd:double``."""
        return rdflib.XSD.double

    def __call__(self, x):
        """Convert ``x`` to a float literal, running it through ``self.f`` first if set."""
        preprocess = self.f
        if preprocess:
            if not callable(preprocess):
                raise Exception("Function passed to float is not callable")
            x = preprocess(x)
        return rdflib.Literal(float(x))
class NodeInt(NodeLiteral):
    """Converter for ``int(...)`` columns: emits the cell as an integer literal."""

    def range(self):
        """Columns of this kind range over ``xsd:int``."""
        return rdflib.XSD.int

    def __call__(self, x):
        """Convert ``x`` to an int literal, running it through ``self.f`` first if set."""
        preprocess = self.f
        if preprocess:
            if not callable(preprocess):
                raise Exception("Function passed to int is not callable")
            x = preprocess(x)
        return rdflib.Literal(int(x))
class NodeReplace(NodeMaker):
    """Text substitution step for ``replace(a, b)`` in a colspec.

    Calling it returns a plain string (not an RDF node), which is what the
    ``f`` argument of ``NodeFloat``/``NodeInt`` feeds into ``float``/``int``.
    """

    def __init__(self, a, b):
        self.a, self.b = a, b

    def __call__(self, x):
        """Return ``x`` with every occurrence of ``self.a`` replaced by ``self.b``."""
        old, new = self.a, self.b
        return x.replace(old, new)
class NodeDate(NodeLiteral):
    """Converter for ``date(format)`` columns; ``self.f`` is the strptime format."""

    def range(self):
        """Columns of this kind range over ``xsd:dateTime``."""
        return rdflib.XSD.dateTime

    def __call__(self, x):
        """Parse ``x`` with the configured format and wrap the datetime in a literal."""
        parsed = datetime.datetime.strptime(x, self.f)
        return rdflib.Literal(parsed)
class NodeSplit(NodeMaker):
    """Converter for ``split(sep, [more])`` columns: one node per separated piece."""

    def __init__(self, sep, f):
        # sep: separator passed to str.split; f: converter applied to each piece
        self.sep = sep
        self.f = f

    def range(self):
        """Delegate to the per-piece converter when it is a NodeMaker, else use the default."""
        inner = self.f
        if inner and isinstance(inner, NodeMaker):
            return inner.range()
        return NodeMaker.range(self)

    def __call__(self, x):
        """Split ``x``, strip each piece, drop empty ones and convert the rest.

        Returns a list of converted pieces.
        """
        # with no per-piece converter configured, fall back to plain literals
        # (the fallback is stored on the instance the first time it is needed)
        if not self.f:
            self.f = rdflib.Literal
        if not callable(self.f):
            raise Exception("Function passed to split is not callable!")
        nodes = []
        for piece in x.split(self.sep):
            piece = piece.strip()
            if piece != "":
                nodes.append(self.f(piece))
        return nodes
# Converter used for columns that have no explicit colspec.
default_node_make = NodeMaker()


# The functions below are the vocabulary of the colspec mini-language; they
# are the only names put in scope when ``column`` evaluates a spec.

def _config_ignore(*args, **kwargs):
    """``ignore(...)``: evaluates to the marker string "ignore"."""
    return "ignore"


def _config_uri(prefix=None, class_=None):
    """``uri(base, [class])``: make URIs out of the cell values."""
    return NodeUri(prefix, class_)


def _config_literal():
    """``literal()``: emit the cell as a plain literal."""
    # Must be an instance: returning the class itself would make the
    # converter call construct a NodeLiteral object (with the cell value as
    # its ``f``) instead of producing an RDF literal.
    return NodeLiteral()


def _config_float(f=None):
    """``float([f])``: emit a float literal, optionally pre-processing with ``f``."""
    return NodeFloat(f)


def _config_replace(a, b):
    """``replace(a, b)``: text substitution, meant as the ``f`` of float()/int()."""
    return NodeReplace(a, b)


def _config_int(f=None):
    """``int([f])``: emit an integer literal, optionally pre-processing with ``f``."""
    return NodeInt(f)


def _config_date(format_):
    """``date(format)``: parse the cell with a strptime format."""
    return NodeDate(format_)


def _config_split(sep=None, f=None):
    """``split(sep, [more])``: split the cell and convert every piece with ``more``."""
    return NodeSplit(sep, f)


# name used in a colspec -> implementing function
config_functions = {
    "ignore": _config_ignore,
    "uri": _config_uri,
    "literal": _config_literal,
    "float": _config_float,
    "int": _config_int,
    "date": _config_date,
    "split": _config_split,
    "replace": _config_replace,
}
def column(v):
    """Return a function for column mapping.

    ``v`` is a colspec expression such as ``split(";")`` or ``float()``; it
    is evaluated with the names from ``config_functions`` in scope and the
    resulting converter object is returned.
    """
    # SECURITY NOTE(review): this eval()s text taken from the command line or
    # the config file, so a colspec can run arbitrary Python.  Only use
    # colspecs from trusted sources.
    # A copy is passed as the globals because eval() inserts "__builtins__"
    # into the dict it is given, which would pollute the shared registry.
    return eval(v, dict(config_functions))
class CSV2RDF(object):
    """
    Converts rows from a CSV reader into triples written to ``OUT``.

    The upper-case attributes are the configuration; ``main`` fills them in
    from the command line / config file before calling ``convert``.
    """

    def __init__(self):
        self.CLASS = None  # class URI added as rdf:type of every row
        self.BASE = None  # namespace for row URIs
        self.PROPBASE = None  # namespace for auto-generated properties
        self.IDENT = 'auto'  # 'auto' or column index(es) forming the row URI
        self.LABEL = None  # column index(es) joined into the rdfs:label
        self.DEFINECLASS = False  # also emit class/property definitions
        self.SKIP = 0  # number of lines to drop before the header
        self.DELIM = ","
        self.COLUMNS = {}  # column index -> converter
        self.PROPS = {}  # column index -> property URI override

        # sys.stdout is a text stream on Python 3, so the UTF-8 writer has to
        # wrap its binary buffer; writing encoded bytes to the text layer
        # raises TypeError.  Replaced stdout objects without a buffer are
        # used directly.
        binary_stdout = getattr(sys.stdout, "buffer", None)
        if binary_stdout is not None:
            self.OUT = codecs.getwriter("utf-8")(binary_stdout, errors='replace')
        else:
            self.OUT = sys.stdout

        self.triples = 0  # number of triples written so far

    def triple(self, s, p, o):
        """Write one triple (each term in its n3 form) and count it."""
        self.OUT.write("%s %s %s .\n" % (s.n3(), p.n3(), o.n3()))
        self.triples += 1

    @staticmethod
    def _as_index_tuple(value):
        """Normalise a single column index or a sequence of them to a tuple."""
        if isinstance(value, (tuple, list)):
            return tuple(value)
        return (value,)

    def _finish_output(self):
        """Close the output, but only flush it when it is the process stdout."""
        raw = getattr(self.OUT, "stream", self.OUT)
        if raw is sys.stdout or raw is getattr(sys.stdout, "buffer", None):
            self.OUT.flush()
        else:
            self.OUT.close()

    def convert(self, csvreader):
        """
        Read the header and all rows from ``csvreader`` and write the triples.

        :param csvreader: iterator of rows (lists of strings); after
            ``SKIP`` dropped rows, the next row is taken as the header
        :raises: whatever a failing row raises, after reporting the row
            number on stderr; conversion errors of a single cell only
            produce a warning
        """
        start = time.time()

        if self.OUT:
            sys.stderr.write("Output to %s\n" % self.OUT.name)

        if self.IDENT != "auto":
            self.IDENT = self._as_index_tuple(self.IDENT)
        # A bare index such as 0 must become (0,): it is not iterable, and
        # index 0 would otherwise be treated as "no label column" below.
        if self.LABEL is not None:
            self.LABEL = self._as_index_tuple(self.LABEL)

        if not self.BASE:
            warnings.warn("No base given, using http://example.org/instances/")
            self.BASE = rdflib.Namespace("http://example.org/instances/")

        if not self.PROPBASE:
            warnings.warn(
                "No property base given, using http://example.org/props/")
            self.PROPBASE = rdflib.Namespace("http://example.org/props/")

        # skip lines at the start
        for _ in range(self.SKIP):
            next(csvreader)

        # read header line
        header_labels = list(next(csvreader))
        headers = dict(
            enumerate([self.PROPBASE[toProperty(x)] for x in header_labels]))
        # override header properties if some are given
        for k, v in self.PROPS.items():
            headers[k] = v
            header_labels[k] = split_uri(v)[1]

        if self.DEFINECLASS:
            # output class/property definitions
            self.triple(self.CLASS, RDF.type, RDFS.Class)
            for i in range(len(headers)):
                h, label = headers[i], header_labels[i]
                if h == "" or label == "":
                    continue
                if self.COLUMNS.get(i) == _config_ignore:
                    continue
                self.triple(h, RDF.type, RDF.Property)
                self.triple(
                    h, RDFS.label, rdflib.Literal(toPropertyLabel(label)))
                self.triple(h, RDFS.domain, self.CLASS)
                self.triple(h, RDFS.range,
                            self.COLUMNS.get(i, default_node_make).range())

        rows = 0
        for row in csvreader:
            try:
                if self.IDENT == 'auto':
                    uri = self.BASE["%d" % rows]
                else:
                    # quote() percent-encodes str input as UTF-8 itself;
                    # spaces are turned into "_" on the text beforehand.
                    uri = self.BASE["_".join(
                        [urllib.parse.quote(x.replace(" ", "_"), safe="")
                         for x in index(row, self.IDENT)])]

                if self.LABEL:
                    self.triple(uri, RDFS.label, rdflib.Literal(
                        " ".join(index(row, self.LABEL))))

                if self.CLASS:
                    # type triple
                    self.triple(uri, RDF.type, self.CLASS)

                for i, x in enumerate(row):
                    x = x.strip()
                    if x == '':
                        continue
                    if self.COLUMNS.get(i) == _config_ignore:
                        continue
                    try:
                        o = self.COLUMNS.get(i, rdflib.Literal)(x)
                        if isinstance(o, list):
                            for _o in o:
                                self.triple(uri, headers[i], _o)
                        else:
                            self.triple(uri, headers[i], o)
                    except Exception as e:
                        # Exceptions have no ``.message`` on Python 3; format
                        # the exception itself.  headers.get keeps this path
                        # from raising when the column has no header.
                        warnings.warn(
                            "Could not process value for column " +
                            "%d:%s in row %d, ignoring: %s " % (
                                i, headers.get(i), rows, e))

                rows += 1
                if rows % 100000 == 0:
                    sys.stderr.write(
                        "%d rows, %d triples, elapsed %.2fs.\n" % (
                            rows, self.triples, time.time() - start))
            except Exception:
                sys.stderr.write("Error processing line: %d\n" % rows)
                raise

        # output types/labels for generated URIs
        classes = set()
        for raw_value, (u, c) in uris.items():
            self.triple(u, RDFS.label, rdflib.Literal(raw_value))
            if c:
                c = rdflib.URIRef(c)
                classes.add(c)
                self.triple(u, RDF.type, c)

        for c in classes:
            self.triple(c, RDF.type, RDFS.Class)

        self._finish_output()
        sys.stderr.write(
            "Converted %d rows into %d triples.\n" % (rows, self.triples))
        sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))
def main():
    """
    Command line entry point: configure a ``CSV2RDF`` from ``sys.argv`` and
    an optional config file (``-f``), then convert the given files (or
    stdin).  Command line options override values from the config file.
    """
    csv2rdf = CSV2RDF()

    argv = sys.argv[1:]
    # --col<N> / --prop<N> form an open-ended family, so getopt only accepts
    # them if the ones actually present are registered as long options.
    numbered_arg = re.compile(r"--((?:col|prop)\d+)(?:=|$)")
    numbered_longopts = sorted(
        {m.group(1) + "=" for m in map(numbered_arg.match, argv) if m})

    opts, files = getopt.getopt(
        argv,
        "hc:b:p:i:o:Cf:l:s:d:",
        ["out=", "base=", "delim=", "propbase=", "class=",
         "ident=", "label=", "skip=", "defineclass", "help"]
        + numbered_longopts)
    opts = dict(opts)

    if "-h" in opts or "--help" in opts:
        print(HELP)
        sys.exit(-1)

    # SECURITY NOTE(review): ident/label values (and colspecs, via column())
    # are eval()ed below, so both the command line and the config file can
    # run arbitrary Python.  Only use them with trusted input.

    if "-f" in opts:
        config = configparser.ConfigParser()
        with open(opts["-f"]) as config_file:
            config.read_file(config_file)
        try:
            items = config.items("csv2rdf")
        except configparser.InterpolationError:
            # Values such as date("%Y-%b-%d") contain bare "%" characters
            # that are not valid interpolation syntax; read them verbatim.
            items = config.items("csv2rdf", raw=True)

        for k, v in items:
            if k == "out":
                csv2rdf.OUT = codecs.open(v, "w", "utf-8")
            elif k == "base":
                csv2rdf.BASE = rdflib.Namespace(v)
            elif k == "propbase":
                csv2rdf.PROPBASE = rdflib.Namespace(v)
            elif k == "class":
                csv2rdf.CLASS = rdflib.URIRef(v)
            elif k == "defineclass":
                # bool("false") is True, so spell out the negative values
                csv2rdf.DEFINECLASS = v.strip().lower() not in (
                    "", "0", "false", "no", "off")
            elif k == "ident":
                csv2rdf.IDENT = eval(v)
            elif k == "label":
                csv2rdf.LABEL = eval(v)
            elif k == "delim":
                csv2rdf.DELIM = v
            elif k == "skip":
                csv2rdf.SKIP = int(v)
            else:
                numbered_key = re.match(r"(col|prop)(\d+)$", k)
                if numbered_key is None:
                    continue
                position = int(numbered_key.group(2))
                if numbered_key.group(1) == "col":
                    csv2rdf.COLUMNS[position] = column(v)
                else:
                    csv2rdf.PROPS[position] = rdflib.URIRef(v)

    def option(short, long_):
        """Value of an option given in either form (long form wins), else None."""
        return opts.get(long_, opts.get(short))

    out = option("-o", "--out")
    if out is not None:
        csv2rdf.OUT = codecs.open(out, "w", "utf-8")

    base = option("-b", "--base")
    if base is not None:
        csv2rdf.BASE = rdflib.Namespace(base)

    delim = option("-d", "--delim")
    if delim is not None:
        csv2rdf.DELIM = delim

    propbase = option("-p", "--propbase")
    if propbase is not None:
        csv2rdf.PROPBASE = rdflib.Namespace(propbase)

    label = option("-l", "--label")
    if label is not None:
        csv2rdf.LABEL = eval(label)

    ident = option("-i", "--ident")
    if ident is not None:
        csv2rdf.IDENT = eval(ident)

    skip = option("-s", "--skip")
    if skip is not None:
        csv2rdf.SKIP = int(skip)

    class_ = option("-c", "--class")
    if class_ is not None:
        csv2rdf.CLASS = rdflib.URIRef(class_)

    # Match the numeric suffix explicitly: a plain startswith("--prop")
    # test would also catch "--propbase" and fail on int("base").
    numbered_opt = re.compile(r"--(col|prop)(\d+)$")
    for k, v in opts.items():
        m = numbered_opt.match(k)
        if m is None:
            continue
        position = int(m.group(2))
        if m.group(1) == "col":
            csv2rdf.COLUMNS[position] = column(v)
        else:
            csv2rdf.PROPS[position] = rdflib.URIRef(v)

    if csv2rdf.CLASS and ("-C" in opts or "--defineclass" in opts):
        csv2rdf.DEFINECLASS = True

    csv2rdf.convert(
        csv_reader(fileinput.input(files), delimiter=csv2rdf.DELIM))


if __name__ == '__main__':
    main()

Mercurial > repos > shellac > guppy_basecaller

comparison env/lib/python3.7/site-packages/rdflib/tools/csv2rdf.py @ 0:26e78fe6e8c4 draft