
Added LOD files

J. Fernando Sánchez 2017-02-21 18:02:21 +01:00
parent b5bbde8f3c
commit a70ee9af58
5 changed files with 114 additions and 0 deletions

lod/README.md Normal file

@@ -0,0 +1,11 @@
# Files included #
* `validate.py` validates and serializes a Turtle dataset
* `sparql.py` runs a custom SPARQL query on a given dataset (by default, `reviews.ttl`)
* `extract_data.py` extracts RDFa, microdata and JSON-LD annotations from a given URL
# Installation #
```
pip install -r requirements.txt
```
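# Usage #
A minimal usage sketch (the arguments below are only illustrative; `reviews.ttl` is the default dataset for the validation and query scripts):
```
python validate.py reviews.ttl
python sparql.py reviews.ttl
python extract_data.py <URL>
```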

lod/extract_data.py Normal file

@@ -0,0 +1,45 @@
import sys

from urllib import request, parse

from rdflib import Graph, term
from lxml import etree

if len(sys.argv) < 2:
    print('Usage: python {} <URL>'.format(sys.argv[0]))
    print('')
    print('Extract RDFa, microdata and JSON-LD annotations from a website')
    exit(1)

url = sys.argv[1]

# RDFa and microdata annotations can be parsed directly from the URL
g = Graph()
g.parse(url, format='rdfa')
g.parse(url, format='microdata')


def sanitize_triple(t):
    """Remove bad URIs from the graph that would otherwise
    make the serialization fail."""
    def sanitize_triple_item(item):
        if isinstance(item, term.URIRef) and '/' not in item:
            return term.URIRef(parse.quote(str(item)))
        return item

    return (sanitize_triple_item(t[0]),
            sanitize_triple_item(t[1]),
            sanitize_triple_item(t[2]))


with request.urlopen(url) as response:
    # Get all JSON-LD objects embedded in the HTML file
    html = response.read().decode('utf-8')
    parser = etree.XMLParser(recover=True)
    root = etree.fromstring(html, parser=parser)
    for jsonld in root.findall(".//script[@type='application/ld+json']"):
        g.parse(data=jsonld.text, publicID=url, format='json-ld')

# Copy the triples into a new graph, sanitizing bad URIs along the way
fixedgraph = Graph()
fixedgraph += [sanitize_triple(s) for s in g]

# Serialize the sanitized graph (not the original, unsanitized one)
print(fixedgraph.serialize(format='turtle').decode('utf-8'))

lod/reviews.ttl Normal file

@@ -0,0 +1,29 @@
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <http://schema.org/> .

_:Hotel1 a schema:Hotel ;
    schema:description "A fictitious hotel" .

_:Review1 a schema:Review ;
    schema:reviewBody "This is a great review" ;
    schema:reviewRating [
        a schema:Rating ;
        schema:author <http://jfernando.es/me> ;
        schema:ratingValue "0.7"
    ] ;
    schema:itemReviewed _:Hotel1 .

_:Review2 a schema:Review ;
    schema:reviewBody "This is a not so great review" ;
    schema:reviewRating [
        a schema:Rating ;
        schema:author [ a schema:Person ;
                        schema:givenName "anonymous" ] ;
        schema:ratingValue "0.3"
    ] ;
    schema:itemReviewed _:Hotel1 .

lod/server.py Normal file

@@ -0,0 +1,23 @@
#!/usr/bin/env python
# Example of SPARQL queries over a Turtle dataset
# Usage: python server.py [dataset]

import sys

import rdflib

dataset = sys.argv[1] if len(sys.argv) > 1 else 'reviews.ttl'

g = rdflib.Graph()
schema = rdflib.Namespace("http://schema.org/")

# Read the Turtle file
g.parse(dataset, format='turtle')

# Select every property of every schema:Review in the dataset
results = g.query(
    """SELECT DISTINCT ?review ?p ?o
       WHERE {
         ?review a schema:Review.
         ?review ?p ?o.
       }""", initNs={'schema': schema})

for row in results:
    print("%s %s %s" % row)

lod/validate.py Normal file

@@ -0,0 +1,6 @@
import sys

import rdflib

# Parse (and thereby validate) the dataset, then print it re-serialized
dataset = sys.argv[1] if len(sys.argv) > 1 else 'reviews.ttl'
g = rdflib.Graph()
g.parse(dataset, format="n3")
print(g.serialize(format="n3").decode('utf-8'))