Mercurial > repos > guerler > springsuite
diff planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/structureddata.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/planemo/lib/python3.7/site-packages/rdflib/plugins/parsers/structureddata.py Fri Jul 31 00:32:28 2020 -0400 @@ -0,0 +1,308 @@ +#!/usr/bin/env python +""" +Extraction parsers for structured data embedded into HTML or XML files. +The former may include RDFa or microdata. The syntax and the extraction +procedures are based on: + +* The RDFa specifications: http://www.w3.org/TR/#tr_RDFa +* The microdata specification: http://www.w3.org/TR/microdata/ +* The specification of the microdata to RDF conversion: +http://www.w3.org/TR/microdata-rdf/ + +License: W3C Software License, +http://www.w3.org/Consortium/Legal/copyright-software +Author: Ivan Herman +Copyright: W3C + +""" + +from rdflib.parser import ( + Parser, StringInputSource, URLInputSource, FileInputSource) + +try: + import html5lib + assert html5lib + html5lib = True +except ImportError: + import warnings + warnings.warn( + 'html5lib not found! RDFa and Microdata ' + + 'parsers will not be available.') + html5lib = False + + +def _get_orig_source(source): + """ + A bit of a hack; the RDFa/microdata parsers need more than what the + upper layers of RDFLib provide... + This method returns the original source references. + """ + if isinstance(source, StringInputSource): + orig_source = source.getByteStream() + elif isinstance(source, URLInputSource): + orig_source = source.url + elif isinstance(source, FileInputSource): + orig_source = source.file.name + source.file.close() + else: + orig_source = source.getByteStream() + baseURI = source.getPublicId() + return (baseURI, orig_source) + + +def _check_error(graph): + from .pyRdfa import RDFA_Error, ns_rdf + from .pyRdfa.options import ns_dc + for (s, p, o) in graph.triples((None, ns_rdf["type"], RDFA_Error)): + for (x, y, msg) in graph.triples((s, ns_dc["description"], None)): + raise Exception("RDFa parsing Error! %s" % msg) + + +# This is the parser interface as it would look when called from the +# rest of RDFLib +class RDFaParser(Parser): + """ + Wrapper around the RDFa 1.1 parser. For further details on the RDFa 1.1 + processing, see the relevant W3C documents at + http://www.w3.org/TR/#tr_RDFa. RDFa 1.1 is defined for XHTML, HTML5, SVG + and, in general, for any XML language. + + Note that the parser can also handle RDFa 1.0 if the extra parameter is + used and/or the input source uses RDFa 1.0 specific @version or DTD-s. + """ + def parse(self, source, graph, + pgraph=None, + media_type="", + rdfa_version=None, + embedded_rdf=False, + space_preserve=True, + vocab_expansion=False, + vocab_cache=False, + refresh_vocab_cache=False, + vocab_cache_report=False, + check_lite=False): + """ + @param source: one of the input sources that the RDFLib package defined + @type source: InputSource class instance + @param graph: target graph for the triples; output graph, in RDFa spec. + parlance + @type graph: RDFLib Graph + @keyword pgraph: target for error and warning triples; processor graph, + in RDFa spec. parlance. If set to None, these triples are ignored + @type pgraph: RDFLib Graph + @keyword media_type: explicit setting of the preferred media type + (a.k.a. content type) of the the RDFa source. None means the content + type of the HTTP result is used, or a guess is made based on the + suffix of a file + @type media_type: string + @keyword rdfa_version: 1.0 or 1.1. If the value is "", then, by + default, 1.1 is used unless the source has explicit signals to use + 1.0 (e.g., using a @version attribute, using a DTD set up for 1.0, etc) + @type rdfa_version: string + @keyword embedded_rdf: some formats allow embedding RDF in other + formats: (X)HTML can contain turtle in a special <script> element, + SVG can have RDF/XML embedded in a <metadata> element. This flag + controls whether those triples should be interpreted and added to + the output graph. Some languages (e.g., SVG) require this, and the + flag is ignored. + @type embedded_rdf: Boolean + @keyword space_preserve: by default, space in the HTML source must be preserved in the generated literal; + this behavior can be switched off + @type space_preserve: Boolean + @keyword vocab_expansion: whether the RDFa @vocab attribute should + also mean vocabulary expansion (see the RDFa 1.1 spec for further + details) + @type vocab_expansion: Boolean + @keyword vocab_cache: in case vocab expansion is used, whether the + expansion data (i.e., vocabulary) should be cached locally. This + requires the ability for the local application to write on the + local file system + @type vocab_chache: Boolean + @keyword vocab_cache_report: whether the details of vocabulary file caching process should be reported + in the processor graph as information (mainly useful for debug) + @type vocab_cache_report: Boolean + @keyword refresh_vocab_cache: whether the caching checks of vocabs should be by-passed, ie, if caches should be re-generated regardless of the stored date (important for vocab development) + @type refresh_vocab_cache: Boolean + @keyword check_lite: generate extra warnings in case the input source is not RDFa 1.1 check_lite + @type check_lite: Boolean + """ + + if html5lib is False: + raise ImportError( + 'html5lib is not installed, cannot use ' + + 'RDFa and Microdata parsers.') + + (baseURI, orig_source) = _get_orig_source(source) + self._process(graph, pgraph, baseURI, orig_source, + media_type=media_type, + rdfa_version=rdfa_version, + embedded_rdf=embedded_rdf, + space_preserve=space_preserve, + vocab_expansion=vocab_expansion, + vocab_cache=vocab_cache, + vocab_cache_report=vocab_cache_report, + refresh_vocab_cache=refresh_vocab_cache, + check_lite=check_lite + ) + + def _process(self, graph, pgraph, baseURI, orig_source, + media_type="", + rdfa_version=None, + embedded_rdf=False, + space_preserve=True, + vocab_expansion=False, + vocab_cache=False, + vocab_cache_report=False, + refresh_vocab_cache=False, + check_lite=False): + from .pyRdfa import pyRdfa, Options + from rdflib import Graph + processor_graph = pgraph if pgraph is not None else Graph() + self.options = Options(output_processor_graph=True, + embedded_rdf=embedded_rdf, + space_preserve=space_preserve, + vocab_expansion=vocab_expansion, + vocab_cache=vocab_cache, + vocab_cache_report=vocab_cache_report, + refresh_vocab_cache=refresh_vocab_cache, + check_lite=check_lite) + + if media_type is None: + media_type = "" + processor = pyRdfa(self.options, + base=baseURI, + media_type=media_type, + rdfa_version=rdfa_version) + processor.graph_from_source(orig_source, graph=graph, pgraph=processor_graph, rdfOutput=False) + # This may result in an exception if the graph parsing led to an error + _check_error(processor_graph) + + +class RDFa10Parser(Parser): + """ + This is just a convenience class to wrap around the RDFa 1.0 parser. + """ + def parse(self, source, graph, pgraph=None, media_type=""): + """ + @param source: one of the input sources that the RDFLib package defined + @type source: InputSource class instance + @param graph: target graph for the triples; output graph, in RDFa + spec. parlance + @type graph: RDFLib Graph + @keyword pgraph: target for error and warning triples; processor + graph, in RDFa spec. parlance. If set to None, these triples are + ignored + @type pgraph: RDFLib Graph + @keyword media_type: explicit setting of the preferred media type + (a.k.a. content type) of the the RDFa source. None means the content + type of the HTTP result is used, or a guess is made based on the + suffix of a file + @type media_type: string + @keyword rdfOutput: whether Exceptions should be catched and added, + as triples, to the processor graph, or whether they should be raised. + @type rdfOutput: Boolean + """ + RDFaParser().parse(source, graph, pgraph=pgraph, + media_type=media_type, rdfa_version="1.0") + + +class MicrodataParser(Parser): + """ + Wrapper around an HTML5 microdata, extracted and converted into RDF. For + the specification of microdata, see the relevant section of the HTML5 + spec: http://www.w3.org/TR/microdata/; for the algorithm used to extract + microdata into RDF, see http://www.w3.org/TR/microdata-rdf/. + """ + def parse(self, source, graph, vocab_expansion=False, vocab_cache=False): + """ + @param source: one of the input sources that the RDFLib package defined + @type source: InputSource class instance + @param graph: target graph for the triples; output graph, in RDFa + spec. parlance + @type graph: RDFLib Graph + @keyword vocab_expansion: whether the RDFa @vocab attribute should + also mean vocabulary expansion (see the RDFa 1.1 spec for further + details) + @type vocab_expansion: Boolean + @keyword vocab_cache: in case vocab expansion is used, whether the + expansion data (i.e., vocabulary) should be cached locally. This + requires the ability for the local application to write on the + local file system + @type vocab_chache: Boolean + @keyword rdfOutput: whether Exceptions should be catched and added, + as triples, to the processor graph, or whether they should be raised. + @type rdfOutput: Boolean + """ + if html5lib is False: + raise ImportError( + 'html5lib is not installed, cannot use RDFa ' + + 'and Microdata parsers.') + + (baseURI, orig_source) = _get_orig_source(source) + self._process(graph, baseURI, orig_source, + vocab_expansion=vocab_expansion, + vocab_cache=vocab_cache) + + def _process(self, graph, baseURI, orig_source, + vocab_expansion=False, vocab_cache=False): + from .pyMicrodata import pyMicrodata + processor = pyMicrodata(base=baseURI, vocab_expansion=vocab_expansion, + vocab_cache=vocab_cache) + processor.graph_from_source( + orig_source, graph=graph, rdfOutput=False) + + +class StructuredDataParser(Parser): + """ + Convenience parser to extract both RDFa (including embedded Turtle) + and microdata from an HTML file. + It is simply a wrapper around the specific parsers. + """ + def parse(self, source, graph, + pgraph=None, + rdfa_version="", + vocab_expansion=False, + vocab_cache=False, + media_type='text/html' + ): + """ + @param source: one of the input sources that the RDFLib package defined + @type source: InputSource class instance + @param graph: target graph for the triples; output graph, in RDFa + spec. parlance + @keyword rdfa_version: 1.0 or 1.1. If the value is "", then, by + default, 1.1 is used unless the source has explicit signals to use 1.0 + (e.g., using a @version attribute, using a DTD set up for 1.0, etc) + @type rdfa_version: string + @type graph: RDFLib Graph + @keyword pgraph: target for error and warning triples; processor + graph, in RDFa spec. parlance. If set to None, these triples are + ignored + @type pgraph: RDFLib Graph + @keyword vocab_expansion: whether the RDFa @vocab attribute should + also mean vocabulary expansion (see the RDFa 1.1 spec for further + details) + @type vocab_expansion: Boolean + @keyword vocab_cache: in case vocab expansion is used, whether the + expansion data (i.e., vocabulary) should be cached locally. This + requires the ability for the local application to write on the + local file system + @type vocab_chache: Boolean + @keyword rdfOutput: whether Exceptions should be catched and added, + as triples, to the processor graph, or whether they should be raised. + @type rdfOutput: Boolean + """ + # Note that the media_type argument is ignored, and is here only to avoid an 'unexpected argument' error. + # This parser works for text/html only anyway... + (baseURI, orig_source) = _get_orig_source(source) + if rdfa_version == "" : rdfa_version = "1.1" + RDFaParser()._process(graph, pgraph, baseURI, orig_source, + media_type='text/html', + rdfa_version=rdfa_version, + vocab_expansion=vocab_expansion, + vocab_cache=vocab_cache) + MicrodataParser()._process(graph, baseURI, orig_source, + vocab_expansion=vocab_expansion, + vocab_cache=vocab_cache) + from .hturtle import HTurtleParser + HTurtleParser()._process(graph, baseURI, orig_source, media_type='text/html')