From f5d44e4a18d73cfab01431b83a91e502692ab88a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Fernando=20S=C3=A1nchez?= Date: Tue, 21 Feb 2017 18:22:14 +0100 Subject: [PATCH] Fixed encoding and jsonld issues --- lod/extract_data.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lod/extract_data.py b/lod/extract_data.py index 5d48156..ffd531a 100644 --- a/lod/extract_data.py +++ b/lod/extract_data.py @@ -32,14 +32,15 @@ def sanitize_triple(t): with request.urlopen(url) as response: # Get all json-ld objects embedded in the html file - html = response.read().decode('utf-8') + html = response.read().decode('utf-8', errors='ignore') parser = etree.XMLParser(recover=True) root = etree.fromstring(html, parser=parser) - for jsonld in root.findall(".//script[@type='application/ld+json']"): - g.parse(data=jsonld.text, publicID=url, format='json-ld') + if root: + for jsonld in root.findall(".//script[@type='application/ld+json']"): + g.parse(data=jsonld.text, publicID=url, format='json-ld') fixedgraph = Graph() fixedgraph += [sanitize_triple(s) for s in g] -print(g.serialize(format='turtle').decode('utf-8')) +print(g.serialize(format='turtle').decode('utf-8', errors='ignore'))