mirror of
https://github.com/gsi-upm/sitc
synced 2024-11-22 06:22:29 +00:00
Added LOD files
This commit is contained in:
parent
b5bbde8f3c
commit
a70ee9af58
11
lod/README.md
Normal file
11
lod/README.md
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
# Files included #
|
||||||
|
|
||||||
|
* `validate.py` validates and serializes a turtle dataset
|
||||||
|
* `server.py` runs a custom SPARQL query on a given dataset (by default, `reviews.ttl`)
|
||||||
|
* `extract_data.py` extracts RDFa, microdata and JSON-LD data from a given URL
|
||||||
|
|
||||||
|
# Installation #
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
45
lod/extract_data.py
Normal file
45
lod/extract_data.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
|
||||||
|
import sys
|
||||||
|
from urllib import request, parse
|
||||||
|
from rdflib import Graph, term
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
# A URL argument is mandatory: print a short usage text and stop otherwise.
if len(sys.argv) < 2:
    print('Usage: python {} <URL>'.format(sys.argv[0]))
    print('')
    print('Extract rdfa, microdata and json-ld annotations from a website')
    # sys.exit is used instead of the bare ``exit`` builtin, which is only
    # injected by the ``site`` module and is meant for interactive sessions.
    sys.exit(1)

url = sys.argv[1]

# Collect every annotation found at the URL into a single graph.
# NOTE(review): the 'rdfa' and 'microdata' parser plugins are presumably only
# shipped with older rdflib releases -- confirm against requirements.txt.
g = Graph()
g.parse(url, format='rdfa')
g.parse(url, format='microdata')
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_triple(t):
    """Return triple ``t`` with problematic URIs percent-quoted.

    Bad URIs left in the graph would otherwise make the serialization fail.
    Each of the first three items of ``t`` is inspected: a ``URIRef`` that
    contains no ``/`` is replaced by a new ``URIRef`` built from its
    percent-quoted text; any other item is passed through untouched.
    """
    def sanitize_triple_item(item):
        # Only URI references without any '/' are rewritten.
        needs_quoting = isinstance(item, term.URIRef) and '/' not in item
        if not needs_quoting:
            return item
        quoted = parse.quote(str(item))
        return term.URIRef(quoted)

    return tuple(sanitize_triple_item(t[position]) for position in range(3))
|
||||||
|
|
||||||
|
|
||||||
|
with request.urlopen(url) as response:
    # Get all json-ld objects embedded in the html file
    html = response.read().decode('utf-8')
    # recover=True lets lxml keep going on markup that is not well-formed.
    parser = etree.XMLParser(recover=True)
    root = etree.fromstring(html, parser=parser)
    # A recovering parser may yield no root element at all for unusable input.
    scripts = [] if root is None else root.findall(
        ".//script[@type='application/ld+json']")
    for jsonld in scripts:
        # Skip empty <script> elements: there is nothing to parse in them.
        if not jsonld.text or not jsonld.text.strip():
            continue
        g.parse(data=jsonld.text, publicID=url, format='json-ld')

# Copy every triple into a fresh graph, quoting the URIs that would break
# the serializer.
fixedgraph = Graph()
fixedgraph += [sanitize_triple(s) for s in g]

# Serialize the sanitized graph (the original printed the raw ``g``, which
# left ``fixedgraph`` unused and defeated the sanitization step).
output = fixedgraph.serialize(format='turtle')
# Depending on the rdflib release, serialize() returns bytes or str.
if isinstance(output, bytes):
    output = output.decode('utf-8')
print(output)
|
29
lod/reviews.ttl
Normal file
29
lod/reviews.ttl
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
|
||||||
|
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
|
||||||
|
@prefix schema: <http://schema.org/> .
|
||||||
|
|
||||||
|
|
||||||
|
_:Hotel1 a schema:Hotel ;
|
||||||
|
schema:description "A fictitious hotel" .
|
||||||
|
|
||||||
|
|
||||||
|
_:Review1 a schema:Review ;
|
||||||
|
schema:reviewBody "This is a great review" ;
|
||||||
|
schema:reviewRating [
|
||||||
|
a schema:Rating ;
|
||||||
|
schema:author <http://jfernando.es/me> ;
|
||||||
|
schema:ratingValue "0.7"
|
||||||
|
|
||||||
|
] ;
|
||||||
|
schema:itemReviewed _:Hotel1 .
|
||||||
|
|
||||||
|
|
||||||
|
_:Review2 a schema:Review ;
|
||||||
|
schema:reviewBody "This is a not so great review" ;
|
||||||
|
schema:reviewRating [
|
||||||
|
a schema:Rating ;
|
||||||
|
schema:author [ a schema:Person ;
|
||||||
|
schema:givenName "anonymous" ] ;
|
||||||
|
schema:ratingValue "0.3"
|
||||||
|
] ;
|
||||||
|
schema:itemReviewed _:Hotel1 .
|
23
lod/server.py
Normal file
23
lod/server.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
# !/bin/env python #
|
||||||
|
# Example SPARQL queries over a Turtle dataset #
|
||||||
|
# Usage: python server.py [dataset] #
|
||||||
|
import rdflib
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Dataset path: first command-line argument, or the bundled sample file.
if len(sys.argv) > 1:
    dataset = sys.argv[1]
else:
    dataset = 'reviews.ttl'

# Namespace bound to the "schema" prefix used inside the SPARQL query.
schema = rdflib.Namespace("http://schema.org/")

# Load the Turtle file into an in-memory graph.
g = rdflib.Graph()
g.parse(dataset, format='turtle')

# Select every (predicate, object) pair of each resource typed schema:Review.
query = """SELECT DISTINCT ?review ?p ?o
WHERE {
?review a schema:Review.
?review ?p ?o.
}"""
results = g.query(query, initNs={'schema': schema})

# One result row per line: review, predicate, object.
for review, p, o in results:
    print("%s %s %s" % (review, p, o))
|
6
lod/validate.py
Normal file
6
lod/validate.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
import rdflib
|
||||||
|
import sys
|
||||||
|
# Dataset to check: first command-line argument, or the bundled sample file.
dataset = sys.argv[1] if len(sys.argv) > 1 else 'reviews.ttl'

# Parsing is the validation step: the file is loaded with the N3 parser.
g = rdflib.Graph()
g.parse(dataset, format="n3")

# Print the graph re-serialized as N3. Depending on the rdflib release,
# serialize() returns bytes or str, so only decode when bytes come back.
serialized = g.serialize(format="n3")
if isinstance(serialized, bytes):
    serialized = serialized.decode('utf-8')
print(serialized)
|
Loading…
Reference in New Issue
Block a user