diff --git a/lod/README.md b/lod/README.md
new file mode 100644
index 0000000..12543e6
--- /dev/null
+++ b/lod/README.md
@@ -0,0 +1,11 @@
+# Files included #
+
+* `validate.py` validates and serializes a turtle dataset
+* `sparql.py` runs a custom sparql query on a given dataset (by default, `reviews.ttl`)
+* `extract_data.py` extracts RDFa, micro-data and JSON-LD data from a given URL
+
+# Installation #
+
+```
+pip install -r requirements.txt
+```
diff --git a/lod/extract_data.py b/lod/extract_data.py
new file mode 100644
index 0000000..5d48156
--- /dev/null
+++ b/lod/extract_data.py
@@ -0,0 +1,63 @@
+"""Extract RDFa, microdata and JSON-LD annotations from a web page.
+
+Usage: python extract_data.py <url>
+
+Every annotation found is merged into one graph, which is sanitized and
+printed to standard output as Turtle.
+"""
+import sys
+from urllib import request, parse
+from rdflib import Graph, term
+from lxml import etree
+
+if len(sys.argv) < 2:
+    print('Usage: python {} <url>'.format(sys.argv[0]))
+    print('')
+    print('Extract rdfa, microdata and json-ld annotations from a website')
+    # sys.exit is always available; the bare exit() helper is only
+    # injected by the site module.
+    sys.exit(1)
+
+url = sys.argv[1]
+
+# RDFa and microdata are parsed by rdflib directly from the URL.
+g = Graph()
+g.parse(url, format='rdfa')
+g.parse(url, format='microdata')
+
+
+def sanitize_triple(t):
+    """Return the triple `t` with bad URIs percent-quoted.
+
+    URIs that contain no '/' would otherwise make the serialization fail.
+    """
+    def sanitize_triple_item(item):
+        if isinstance(item, term.URIRef) and '/' not in item:
+            return term.URIRef(parse.quote(str(item)))
+        return item
+
+    return (sanitize_triple_item(t[0]),
+            sanitize_triple_item(t[1]),
+            sanitize_triple_item(t[2]))
+
+
+with request.urlopen(url) as response:
+    # Get all json-ld objects embedded in the html file
+    html = response.read().decode('utf-8')
+    parser = etree.XMLParser(recover=True)
+    root = etree.fromstring(html, parser=parser)
+    for jsonld in root.findall(".//script[@type='application/ld+json']"):
+        # An empty <script> element has no text; there is nothing to parse.
+        if jsonld.text:
+            g.parse(data=jsonld.text, publicID=url, format='json-ld')
+
+
+# Serialize the sanitized copy: the raw graph may still hold the bad URIs.
+fixedgraph = Graph()
+fixedgraph += [sanitize_triple(s) for s in g]
+
+output = fixedgraph.serialize(format='turtle')
+# Older rdflib releases return bytes here, newer ones return str.
+if isinstance(output, bytes):
+    output = output.decode('utf-8')
+print(output)
diff --git a/lod/reviews.ttl b/lod/reviews.ttl
new file mode 100644
index 0000000..d5246a9
--- /dev/null
+++ b/lod/reviews.ttl
@@ -0,0 +1,29 @@
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix schema: <http://schema.org/> .
+
+
+_:Hotel1 a schema:Hotel ;
+    schema:description "A fictitious hotel" .
+
+
+_:Review1 a schema:Review ;
+    schema:reviewBody "This is a great review" ;
+    schema:reviewRating [
+        a schema:Rating ;
+        schema:author ;  # NOTE(review): the object of schema:author appears to be missing here (presumably an IRI) - restore it, this statement is not valid Turtle without one
+        schema:ratingValue "0.7"
+
+    ] ;
+    schema:itemReviewed _:Hotel1 .
+
+
+_:Review2 a schema:Review ;
+    schema:reviewBody "This is a not so great review" ;
+    schema:reviewRating [
+        a schema:Rating ;
+        schema:author [ a schema:Person ;
+                        schema:givenName "anonymous" ] ;
+        schema:ratingValue "0.3"
+    ] ;
+    schema:itemReviewed _:Hotel1 .
\ No newline at end of file
diff --git a/lod/server.py b/lod/server.py
new file mode 100644
index 0000000..6ed5dd8
--- /dev/null
+++ b/lod/server.py
@@ -0,0 +1,24 @@
+# !/bin/env python #
+# Example of SPARQL queries over a Turtle dataset #
+# Usage: python server.py [dataset] (defaults to reviews.ttl) #
+import rdflib
+import sys
+
+dataset = sys.argv[1] if len(sys.argv) > 1 else 'reviews.ttl'
+g = rdflib.Graph()
+
+schema = rdflib.Namespace("http://schema.org/")
+
+# Read Turtle file #
+g.parse(dataset, format='turtle')
+
+# List every property and value of each schema:Review in the dataset #
+results = g.query(
+    """SELECT DISTINCT ?review ?p ?o
+    WHERE {
+        ?review a schema:Review.
+        ?review ?p ?o.
+    }""", initNs={'schema': schema})
+
+for row in results:
+    print("%s %s %s" % row)
diff --git a/lod/validate.py b/lod/validate.py
new file mode 100644
index 0000000..99db6f3
--- /dev/null
+++ b/lod/validate.py
@@ -0,0 +1,13 @@
+# Validate a dataset by parsing it as N3, then print it re-serialized.
+import rdflib
+import sys
+
+g = rdflib.Graph()
+dataset = sys.argv[1] if len(sys.argv) > 1 else 'reviews.ttl'
+# g.parse is the validation step: a malformed dataset makes it raise.
+g.parse(dataset, format="n3")
+output = g.serialize(format="n3")
+# Older rdflib releases return bytes here, newer ones return str.
+if isinstance(output, bytes):
+    output = output.decode('utf-8')
+print(output)