1
0
mirror of https://github.com/gsi-upm/sitc synced 2024-11-22 14:32:28 +00:00

Fixed encoding and jsonld issues

This commit is contained in:
J. Fernando Sánchez 2017-02-21 18:22:14 +01:00
parent a70ee9af58
commit f5d44e4a18

View File

@@ -32,14 +32,15 @@ def sanitize_triple(t):
with request.urlopen(url) as response: with request.urlopen(url) as response:
# Get all json-ld objects embedded in the html file # Get all json-ld objects embedded in the html file
html = response.read().decode('utf-8') html = response.read().decode('utf-8', errors='ignore')
parser = etree.XMLParser(recover=True) parser = etree.XMLParser(recover=True)
root = etree.fromstring(html, parser=parser) root = etree.fromstring(html, parser=parser)
for jsonld in root.findall(".//script[@type='application/ld+json']"): if root:
g.parse(data=jsonld.text, publicID=url, format='json-ld') for jsonld in root.findall(".//script[@type='application/ld+json']"):
g.parse(data=jsonld.text, publicID=url, format='json-ld')
fixedgraph = Graph() fixedgraph = Graph()
fixedgraph += [sanitize_triple(s) for s in g] fixedgraph += [sanitize_triple(s) for s in g]
print(g.serialize(format='turtle').decode('utf-8')) print(g.serialize(format='turtle').decode('utf-8', errors='ignore'))