mirror of
https://github.com/gsi-upm/sitc
synced 2024-11-22 14:32:28 +00:00
Fixed encoding and jsonld issues
This commit is contained in:
parent
a70ee9af58
commit
f5d44e4a18
@@ -32,14 +32,15 @@ def sanitize_triple(t):
|
|||||||
|
|
||||||
with request.urlopen(url) as response:
|
with request.urlopen(url) as response:
|
||||||
# Get all json-ld objects embedded in the html file
|
# Get all json-ld objects embedded in the html file
|
||||||
html = response.read().decode('utf-8')
|
html = response.read().decode('utf-8', errors='ignore')
|
||||||
parser = etree.XMLParser(recover=True)
|
parser = etree.XMLParser(recover=True)
|
||||||
root = etree.fromstring(html, parser=parser)
|
root = etree.fromstring(html, parser=parser)
|
||||||
for jsonld in root.findall(".//script[@type='application/ld+json']"):
|
if root:
|
||||||
g.parse(data=jsonld.text, publicID=url, format='json-ld')
|
for jsonld in root.findall(".//script[@type='application/ld+json']"):
|
||||||
|
g.parse(data=jsonld.text, publicID=url, format='json-ld')
|
||||||
|
|
||||||
|
|
||||||
fixedgraph = Graph()
|
fixedgraph = Graph()
|
||||||
fixedgraph += [sanitize_triple(s) for s in g]
|
fixedgraph += [sanitize_triple(s) for s in g]
|
||||||
|
|
||||||
print(g.serialize(format='turtle').decode('utf-8'))
|
print(g.serialize(format='turtle').decode('utf-8', errors='ignore'))
|
||||||
|
Loading…
Reference in New Issue
Block a user