From a70ee9af584b2da63a3da9e194b6df6357a4bfe1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=2E=20Fernando=20S=C3=A1nchez?= <balkian@gmail.com>
Date: Tue, 21 Feb 2017 18:02:21 +0100
Subject: [PATCH] Added LOD files

---
 lod/README.md       | 11 +++++++++++
 lod/extract_data.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
 lod/reviews.ttl     | 29 +++++++++++++++++++++++++++++
 lod/server.py       | 23 +++++++++++++++++++++++
 lod/validate.py     |  6 ++++++
 5 files changed, 114 insertions(+)
 create mode 100644 lod/README.md
 create mode 100644 lod/extract_data.py
 create mode 100644 lod/reviews.ttl
 create mode 100644 lod/server.py
 create mode 100644 lod/validate.py
diff --git a/lod/README.md b/lod/README.md
new file mode 100644
index 0000000..12543e6
--- /dev/null
+++ b/lod/README.md
@@ -0,0 +1,11 @@
+# Files included #
+
+* `validate.py` validates and serializes a turtle dataset
+* `server.py` runs a custom SPARQL query on a given dataset (by default, `reviews.ttl`)
+* `extract_data.py` extracts RDFa, micro-data and JSON-LD data from a given URL
+
+# Installation #
+
+```
+pip install -r requirements.txt
+```
diff --git a/lod/extract_data.py b/lod/extract_data.py
new file mode 100644
index 0000000..5d48156
--- /dev/null
+++ b/lod/extract_data.py
@@ -0,0 +1,45 @@
+
+import sys
+from urllib import request, parse
+from rdflib import Graph, term
+from lxml import etree
+
+if len(sys.argv) < 2:
+    print('Usage: python {} <URL>'.format(sys.argv[0]))
+    print('')
+    print('Extract rdfa, microdata and json-ld annotations from a website')
+    sys.exit(1)  # sys.exit, not the site-provided exit() helper
+
+url = sys.argv[1]
+# Load the RDFa and microdata annotations by parsing the URL once per format.
+g = Graph()
+g.parse(url, format='rdfa')
+g.parse(url, format='microdata')
+
+
+def sanitize_triple(t):
+    """Return the triple `t` with bad URIs quoted, because leaving them
+    untouched would make the serialization fail."""
+    def _quoted(item):
+        is_bad_uri = isinstance(item, term.URIRef) and '/' not in item
+        if not is_bad_uri:
+            return item
+        # A URIRef without any slash is re-created from its quoted string.
+        return term.URIRef(parse.quote(str(item)))
+
+    return tuple(_quoted(t[i]) for i in (0, 1, 2))
+
+
+with request.urlopen(url) as response:
+    # Get all json-ld objects embedded in the html file
+    html = response.read().decode('utf-8')
+    parser = etree.XMLParser(recover=True)
+    root = etree.fromstring(html, parser=parser)
+    for jsonld in root.findall(".//script[@type='application/ld+json']"):
+        g.parse(data=jsonld.text, publicID=url, format='json-ld')
+
+# Serialize the sanitized copy rather than `g`, which may still hold bad URIs.
+fixedgraph = Graph()
+fixedgraph += [sanitize_triple(s) for s in g]
+
+print(fixedgraph.serialize(format='turtle').decode('utf-8'))
diff --git a/lod/reviews.ttl b/lod/reviews.ttl
new file mode 100644
index 0000000..d5246a9
--- /dev/null
+++ b/lod/reviews.ttl
@@ -0,0 +1,29 @@
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix schema: <http://schema.org/> .
+
+
+_:Hotel1 a schema:Hotel ;
+         schema:description "A fictitious hotel" .
+
+
+_:Review1 a schema:Review ;
+          schema:reviewBody "This is a great review" ;
+          schema:reviewRating [
+           a schema:Rating ;
+           schema:author <http://jfernando.es/me> ;
+           schema:ratingValue "0.7"
+           
+          ] ;
+          schema:itemReviewed _:Hotel1 .
+
+
+_:Review2 a schema:Review ;
+          schema:reviewBody "This is a not so great review" ;
+          schema:reviewRating [
+           a schema:Rating ;
+           schema:author [ a schema:Person ;
+           schema:givenName "anonymous" ] ;
+           schema:ratingValue "0.3"
+          ] ;
+          schema:itemReviewed _:Hotel1 .
\ No newline at end of file
diff --git a/lod/server.py b/lod/server.py
new file mode 100644
index 0000000..6ed5dd8
--- /dev/null
+++ b/lod/server.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# Example of a SPARQL query over a turtle dataset #
+# Usage: python server.py [dataset] #
+import rdflib
+import sys
+
+# Dataset path is the first argument; reviews.ttl is the default #
+dataset = sys.argv[1] if len(sys.argv) > 1 else 'reviews.ttl'
+schema = rdflib.Namespace("http://schema.org/")
+# Read Turtle file #
+g = rdflib.Graph()
+g.parse(dataset, format='turtle')
+
+# Fetch every property/value pair of each schema:Review #
+results = g.query(
+    """SELECT DISTINCT ?review ?p ?o
+       WHERE {
+          ?review a schema:Review.
+          ?review ?p ?o.
+       }""", initNs={'schema': schema})
+
+for row in results:
+    print("%s %s %s" % row)
diff --git a/lod/validate.py b/lod/validate.py
new file mode 100644
index 0000000..99db6f3
--- /dev/null
+++ b/lod/validate.py
@@ -0,0 +1,6 @@
+import rdflib
+import sys
+# Parse the dataset (reviews.ttl unless a path is given), print it back as N3.
+dataset = 'reviews.ttl' if len(sys.argv) <= 1 else sys.argv[1]
+g = rdflib.Graph().parse(dataset, format="n3")
+print(g.serialize(format="n3").decode('utf-8'))