Added LOD files

2025-08-23 02:02:20 +00:00 · 2017-02-21 18:02:21 +01:00
parent b5bbde8f3c
commit a70ee9af58
5 changed files with 114 additions and 0 deletions
--- a/lod/README.md
+++ b/lod/README.md
@@ -0,0 +1,11 @@
+# Files included #
+
+* `validate.py` validates and serializes a turtle dataset
+* `server.py` runs a custom SPARQL query on a given dataset (by default, `reviews.ttl`)
+* `extract_data.py` extracts RDFa, micro-data and JSON-LD data from a given URL
+
+# Installation #
+
+```
+pip install -r requirements.txt
+```
--- a/lod/extract_data.py
+++ b/lod/extract_data.py
@@ -0,0 +1,45 @@
+
+import sys
+from urllib import request, parse
+from rdflib import Graph, term
+from lxml import etree
+
+if len(sys.argv) < 2:
+    print('Usage: python {} <URL>'.format(sys.argv[0]))
+    print('')
+    print('Extract rdfa, microdata and json-ld annotations from a website')
+    sys.exit(1)  # sys.exit: the bare exit() helper only exists via site
+
+url = sys.argv[1]
+# Gather the page's RDFa and microdata annotations into a single graph.
+g = Graph()
+g.parse(url, format='rdfa')
+g.parse(url, format='microdata')
+
+
+def sanitize_triple(t):
+    """Return a copy of triple `t` whose slash-less URIRefs are quoted.
+
+    Such bad URIs would otherwise make the serialization fail.
+    """
+    def quote_if_bad(node):
+        if not isinstance(node, term.URIRef) or '/' in node:
+            return node
+        return term.URIRef(parse.quote(str(node)))
+
+    return tuple(quote_if_bad(t[i]) for i in range(3))
+
+
+with request.urlopen(url) as response:
+    # Get all json-ld objects embedded in the html file. Pages are HTML, not
+    # necessarily well-formed XML, so use the HTML parser (UTF-8 assumed).
+    parser = etree.HTMLParser(encoding='utf-8')
+    root = etree.fromstring(response.read(), parser=parser)
+    for jsonld in root.findall(".//script[@type='application/ld+json']"):
+        g.parse(data=jsonld.text, publicID=url, format='json-ld')
+
+
+# Serialize the sanitized copy: raw `g` may hold URIs that break turtle.
+fixedgraph = Graph()
+fixedgraph += [sanitize_triple(s) for s in g]
+print(fixedgraph.serialize(format='turtle').decode('utf-8'))
--- a/lod/reviews.ttl
+++ b/lod/reviews.ttl
@@ -0,0 +1,29 @@
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix schema: <http://schema.org/> .
+
+
+_:Hotel1 a schema:Hotel ;
+         schema:description "A fictitious hotel" .
+
+
+_:Review1 a schema:Review ;
+          schema:reviewBody "This is a great review" ;
+          schema:reviewRating [
+           a schema:Rating ;
+           schema:author <http://jfernando.es/me> ;
+           schema:ratingValue "0.7"
+           
+          ] ;
+          schema:itemReviewed _:Hotel1 .
+
+
+_:Review2 a schema:Review ;
+          schema:reviewBody "This is a not so great review" ;
+          schema:reviewRating [
+           a schema:Rating ;
+           schema:author [ a schema:Person ;
+           schema:givenName "anonymous" ] ;
+           schema:ratingValue "0.3"
+          ] ;
+          schema:itemReviewed _:Hotel1 .
--- a/lod/server.py
+++ b/lod/server.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# Example SPARQL queries over a Turtle dataset #
+# Usage: python server.py [dataset.ttl] #
+import rdflib
+import sys
+
+# Dataset path: first CLI argument, falling back to 'reviews.ttl'.
+dataset = sys.argv[1] if len(sys.argv) > 1 else 'reviews.ttl'
+schema = rdflib.Namespace("http://schema.org/")
+
+# Selects every (review, predicate, object) row for each schema:Review.
+QUERY = """SELECT DISTINCT ?review ?p ?o
+       WHERE {
+          ?review a schema:Review.
+          ?review ?p ?o.
+       }"""
+
+# Read Turtle file #
+g = rdflib.Graph()
+g.parse(dataset, format='turtle')
+
+for review, p, o in g.query(QUERY, initNs={'schema': schema}):
+    print("%s %s %s" % (review, p, o))
--- a/lod/validate.py
+++ b/lod/validate.py
@@ -0,0 +1,6 @@
+import rdflib
+import sys
+# Parse the dataset (default: reviews.ttl) as N3 and print it re-serialized.
+dataset = sys.argv[1] if len(sys.argv) > 1 else 'reviews.ttl'
+g = rdflib.Graph().parse(dataset, format="n3")
+print(g.serialize(format="n3").decode('utf-8'))