diff --git a/lod/SPARQL.ipynb b/lod/SPARQL.ipynb new file mode 100644 index 0000000..6e99077 --- /dev/null +++ b/lod/SPARQL.ipynb @@ -0,0 +1,1825 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "7276f055a8c504d3c80098c62ed41a4f", + "grade": false, + "grade_id": "cell-0bfe38f97f6ab2d2", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "
\n", + "
\n", + "

Course Notes for Learning Intelligent Systems

\n", + "

Department of Telematic Engineering Systems

\n", + "
Universidad Politécnica de Madrid
\n", + "
\n", + " \"UPM\"\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "a273399fb0e4a7752cea07a36562def1", + "grade": false, + "grade_id": "cell-0cd673883ee592d1", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "## Introduction to Linked Data\n", + "\n", + "This lecture provides a quick introduction to semantic queries in Python.\n", + "We will be using DBpedia, a semantic version of Wikipedia.\n", + "\n", + "The language we will use to query DBpedia is SPARQL, a semantic query language inspired by SQL.\n", + "For convenience, the examples in the notebook are executable, and they are accompanied by some code to test the results.\n", + "If the tests pass, you probably got the answer right.\n", + "\n", + "However, you can also use any other method to write and send your queries.\n", + "You may find online query editors particularly useful.\n", + "In addition to running queries from your browser, they provide useful features such as syntax highlighting and autocompletion.\n", + "Some examples are:\n", + "\n", + "* DBpedia's virtuoso query editor https://dbpedia.org/sparql\n", + "* A javascript based client hosted at GSI: http://yasgui.cluster.gsi.dit.upm.es/" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "255c4bd678939b4448860dc5e0afdae6", + "grade": false, + "grade_id": "cell-10264483046abcc4", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "## Objectives\n", + "\n", + "* Learning SPARQL and the Linked Data principles by defining queries to answer a set of problems of increasing difficulty\n", + "* Verifying the usefulness of the Linked Open Data initiative by querying data from different RDF graphs and endpoints\n", + "* Learning how to use integrated SPARQL editors and programming interfaces to SPARQL." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "f04dd27e103bacc5166763900527901e", + "grade": false, + "grade_id": "cell-4f8492996e74bf20", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "## Tools\n", + "\n", + "* This notebook\n", + "* SPARQL editors (optional)\n", + " * YASGUI-GSI http://yasgui.cluster.gsi.dit.upm.es\n", + " * DBpedia virtuoso http://dbpedia.org/sparql\n", + "\n", + "Using the YASGUI-GSI editor has several advantages over other options.\n", + "It features:\n", + "\n", + "* Selection of data source, either by specifying the URL or by selecting from a dropdown menu\n", + "* Interactive query editing\n", + " * A set of pre-defined queries\n", + " * Syntax errors\n", + " * Auto-complete\n", + "* Data visualization\n", + " * Total number of results\n", + " * Different formats (table, pivot table, raw response, etc.)\n", + " * Pagination of results\n", + " * Search and filter results" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "c5f8646518bd832a47d71f9d3218237a", + "grade": false, + "grade_id": "cell-eb13908482825e42", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "Run this line to enable the `%%sparql` magic command." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from helpers import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `%%sparql` magic command will allow us to use SPARQL inside normal Jupyter cells.\n", + "\n", + "For instance, the following code:\n", + "\n", + "```\n", + "%%sparql\n", + "\n", + "MY QUERY\n", + "``` \n", + "\n", + "is the same as `run_query('MY QUERY', endpoint='http://dbpedia.org/sparql')` plus some additional steps, such as saving the results in a nice table format so that they can be used later and storing the results in a variable (`LAST_QUERY`), which we will use in our tests.\n", + "\n", + "You do not need to worry about it, and **you can always use one of the suggested online editors if you wish**." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercises\n", + "\n", + "The following exercises cover the basics of SPARQL with simple use cases.\n", + "We will provide you with some example code to get you started, the *question* you will have to answer using SPARQL, and the skeleton for the answer.\n", + "\n", + "After every query, you will find some Python code to test the results of the query.\n", + "Make sure you've run the tests before moving to the next exercise.\n", + "If the test gives you an error, you've probably done something wrong.\n", + "You **do not need to understand or modify the test code**.\n", + "\n", + "\n", + "In case you're interested, the tests rely on the `LAST_QUERY` variable, which is updated by the `%%sparql` magic after every query.\n", + "This variable contains the full query used (`LAST_QUERY[\"query\"]`), the endpoint it was sent to (`LAST_QUERY[\"endpoint\"]`), and a dictionary with the response of the endpoint (`LAST_QUERY[\"results\"]`).\n", + "For convenience, the results are also given as tuples (`LAST_QUERY[\"tuples\"]`), and as a dictionary of `{column:[values]}` 
(LAST_QUERY[\"columns\"]`)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### First Select\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start with a simple query. We will get a list of cities and towns in Madrid.\n", + "If we take a look at the DBpedia ontology or the page of any town we already know, we discover that the property that links towns to their community is [`isPartOf`](http://dbpedia.org/ontology/isPartOf), and [the Community of Madrid is also a resource in DBpedia](http://dbpedia.org/resource/Community_of_Madrid).\n", + "\n", + "Since there are potentially many cities to get, we will limit our results to the first 10 results:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "SELECT ?localidad\n", + "WHERE {\n", + " ?localidad \n", + "}\n", + "LIMIT 10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, that query is very verbose because we are using full URIs.\n", + "To simplify it, we will make use of SPARQL prefixes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX dbo: \n", + "PREFIX dbr: \n", + " \n", + "SELECT ?localidad\n", + "WHERE {\n", + " ?localidad dbo:isPartOf dbr:Community_of_Madrid.\n", + "}\n", + "LIMIT 10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To make sure that the query returned something sensible, we can test it with some Python code:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert 'localidad' in LAST_QUERY['columns']\n", + "assert len(LAST_QUERY['tuples']) == 10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that you have some experience under your belt, it is time to design your own query.\n", + 
"\n", + "Your first task is to get a list of Spanish novelists, using the skeleton below and the previous query to guide you.\n", + "\n", + "Pages for Spanish novelists are grouped in the *Spanish novelists* DBpedia category. You can use that fact to get your list.\n", + "In other words, the difference from the previous query will be using `dct:subject` instead of `dbo:isPartOf`, and `dbc:Spanish_novelists` instead of `dbr:Community_of_Madrid`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "d73b49b84482f51dc199b0e22763e9cc", + "grade": false, + "grade_id": "cell-7a9509ff3c34127e", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "\n", + "SELECT ?escritor\n", + "\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "LIMIT 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "a5aafd75ac7fa036fe5dafc4ed30c535", + "grade": true, + "grade_id": "cell-91240ded2cac7b6d", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert len(LAST_QUERY['columns']) == 1 # We only use one variable, ?escritor\n", + "assert len(LAST_QUERY['tuples']) == 10 # There should be 10 results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using more criteria\n", + "\n", + "We can get more than one property in the same query. Let us modify our query to get the population of the cities as well." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dbo: \n", + "PREFIX dbr: \n", + " \n", + "SELECT ?localidad ?pop ?when\n", + "\n", + "WHERE {\n", + " ?localidad dbo:populationTotal ?pop .\n", + " ?localidad dbo:isPartOf dbr:Community_of_Madrid.\n", + " ?localidad dbp:populationAsOf ?when .\n", + "}\n", + "\n", + "LIMIT 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert 'localidad' in LAST_QUERY['columns']\n", + "assert 'http://dbpedia.org/resource/Parla' in LAST_QUERY['columns']['localidad']\n", + "assert ('http://dbpedia.org/resource/San_Sebastián_de_los_Reyes', '75912', '2009') in LAST_QUERY['tuples']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Time to try it yourself.\n", + "\n", + "Get the list of Spanish novelists AND their name (using rdfs:label)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "7cbf5260bbc6121b4ec1ec0f62e814c1", + "grade": false, + "grade_id": "cell-83dcaae0d09657b5", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs:\n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "\n", + "SELECT ?escritor ?name\n", + "\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "LIMIT 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "5c7bee95c0c08a8ede47fcaad597f51f", + "grade": true, + "grade_id": "cell-8afd28aada7a896c", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert 'escritor' in LAST_QUERY['columns']\n", + "assert 'http://dbpedia.org/resource/Eduardo_Mendoza_Garriga' in 
LAST_QUERY['columns']['escritor']\n", + "assert ('http://dbpedia.org/resource/Eduardo_Mendoza_Garriga', 'Eduardo Mendoza') in LAST_QUERY['tuples']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering and ordering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the previous example, we saw that we got what seemed to be duplicated answers.\n", + "\n", + "This happens because entities can have labels in different languages (e.g. English, Spanish).\n", + "To restrict the search to only those results we're interested in, we can use filtering.\n", + "\n", + "We can also decide the order in which our results are shown.\n", + "\n", + "For instance, this is how we could use filtering to get only large cities in our example, ordered by population:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dbo: \n", + "PREFIX dbr: \n", + " \n", + "SELECT ?localidad ?pop ?when\n", + "\n", + "WHERE {\n", + " ?localidad dbo:populationTotal ?pop .\n", + " ?localidad dbo:isPartOf dbr:Community_of_Madrid.\n", + " ?localidad dbp:populationAsOf ?when .\n", + " FILTER(?pop > 100000)\n", + "}\n", + "ORDER BY ?pop\n", + "LIMIT 100" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that ordering happens before limits." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "c6080c3ed1dd3e9c3a224ac74e9dedc6", + "grade": true, + "grade_id": "cell-cb7b8283568cd349", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "# We still have the biggest city\n", + "assert ('http://dbpedia.org/resource/Madrid', '3141991', '2014') in LAST_QUERY['tuples']\n", + "# But the smaller ones are gone\n", + "assert 'http://dbpedia.org/resource/Tres_Cantos' not in LAST_QUERY['columns']['localidad']\n", + "assert 'http://dbpedia.org/resource/San_Sebastián_de_los_Reyes' not in LAST_QUERY['columns']['localidad']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, try filtering to get a list of novelists and their name in Spanish, ordered by name (use `FILTER (LANG(?nombre) = \"es\")` and `ORDER BY`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "8b1697739ecd76d45b6597a28429f13d", + "grade": false, + "grade_id": "cell-ff3d611cb0304b01", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "\n", + "SELECT ?escritor, ?nombre\n", + "\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "# YOUR CODE HERE\n", + "LIMIT 1000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "2300be1911eb9cfddc6e2a82dcb244c2", + "grade": true, + "grade_id": "cell-d70cc6ea394741bc", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert len(LAST_QUERY['tuples']) >= 50\n", + "assert 'Adelaida García Morales' in LAST_QUERY['columns']['nombre']\n", + "assert sum(1 for 
k in LAST_QUERY['columns']['escritor'] if k == 'http://dbpedia.org/resource/Adelaida_García_Morales') == 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dates" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From now on, we will focus on our Writers example.\n", + "\n", + "First, search for writers born in the XX century.\n", + "You can use a special filter, knowing that `\"2000\"^^xsd:date` is the first date of year 2000." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "1764314669c1e3ad131a0930fa33549c", + "grade": false, + "grade_id": "cell-ab7755944d46f9ca", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "PREFIX dbo:\n", + "\n", + "SELECT ?escritor, ?nombre, year(?fechaNac) as ?nac\n", + "\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "# YOUR CODE HERE\n", + "LIMIT 1000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "7a3c047b64ce4ffd02c87878f73f212a", + "grade": true, + "grade_id": "cell-cf3821f2d33fb0f6", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert 'Camilo José Cela' in LAST_QUERY['columns']['nombre']\n", + "assert 'Javier Marías' in LAST_QUERY['columns']['nombre']\n", + "assert all(int(x) > 1899 and int(x) < 2001 for x in LAST_QUERY['columns']['nac'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Optional\n", + "\n", + "In our last example, we were missing all the novelists that are missing their birth information in DBpedia.\n", + "\n", + "We can specify optional values in a query using the `OPTIONAL` keyword.\n", + "When a set of clauses are inside an 
OPTIONAL group, the SPARQL endpoint will try to use them in the query\n", + "If there are no results for that part of the query, the variables it specifies will not be bound (i.e. they will be empty).\n", + "\n", + "Using that, let us retrieve all the novelists born between 1900 and 2000, and the date they died (if they are available)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "429b902d4da0f40aefebba0ab722645e", + "grade": false, + "grade_id": "cell-254a18dd973e82ed", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "PREFIX dbo:\n", + "\n", + "SELECT ?escritor, ?nombre, ?fechaNac, ?fechaDef\n", + "\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "# YOUR CODE HERE\n", + "LIMIT 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "ed9321d104cd1d6e7839e3bcac78a8f1", + "grade": true, + "grade_id": "cell-4d6a64dde67f0e11", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert 'Miguel de Cervantes' in LAST_QUERY['columns']['nombre']\n", + "assert '1547-1-1' in LAST_QUERY['columns']['fechaNac']\n", + "assert '' not in LAST_QUERY['columns']['fechaNac'] # All birthdates are defined\n", + "assert '' in LAST_QUERY['columns']['fechaDef'] # Some deathdates are not defined" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bound" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can check whether the optional value for a key was bound in a SPARQL query using `BOUND(?key)`.\n", + "\n", + "This is very useful for two purposes.\n", + "First, it allows us to look for patterns that **do not occur** in the graph, such as 
missing properties.\n", + "For instance, we could search for the authors with missing birth information so we can add it.\n", + "Secondly, we can use `BOUND` in filters to get conditional filters.\n", + "We will explore both uses in this exercise." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the list of Spanish novelists that are still alive.\n", + "A person is alive if their death date is not defined and they were born less than 100 years ago." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "555154c87d8722bfeacd0e5cf5abc1a7", + "grade": false, + "grade_id": "cell-474b1a72dec6827c", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "PREFIX dbo:\n", + "\n", + "SELECT ?escritor, ?nombre, year(?fechaNac) as ?nac\n", + "\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "# YOUR CODE HERE\n", + "LIMIT 1000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "fd420f3d8b7eca269eaba715b3999893", + "grade": true, + "grade_id": "cell-46b62dd2856bc919", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert 'Fernando Arrabal' in LAST_QUERY['columns']['nombre']\n", + "assert 'Albert Espinosa' in LAST_QUERY['columns']['nombre']\n", + "for year in LAST_QUERY['columns']['nac']:\n", + " assert int(year) >= 1918" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, get the list of Spanish novelists that died before their fifties (i.e. younger than 50 years old), or that aren't 50 years old yet.\n", + "\n", + "Hint: you can use boolean logic in your filters (e.g. 
`&&` and `||`).\n", + "\n", + "Hint 2: Some dates are not formatted properly, which makes some queries fail when they shouldn't. You might need to convert between different types as a workaround. For instance, you could get the year from a date like this: `year(xsd:dateTime(str(?date)))`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "22505aa8eab7f771bf30ed12fe13f80c", + "grade": false, + "grade_id": "cell-ceefd3c8fbd39d79", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "PREFIX dbo:\n", + "\n", + "SELECT ?escritor, ?nombre, year(?fechaNac) as ?nac, ?fechaDef\n", + "\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "# YOUR CODE HERE\n", + "LIMIT 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "f11cf03b1c9ae7dbdaac314579b6c4bf", + "grade": true, + "grade_id": "cell-461cd6ccc6c2dc79", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert 'Javier Sierra' in LAST_QUERY['columns']['nombre']\n", + "assert 'http://dbpedia.org/resource/Sanmao_(author)' in LAST_QUERY['columns']['escritor']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Finding unique elements" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In our last example, our results show some authors more than once.\n", + "This is because some properties are defined more than once.\n", + "For instance, the birth date is given in different formats.\n", + "Even if we exclude that property from our results by not adding it in our `SELECT`, we will get duplicated lines.\n", + "\n", + "To solve this, we can use the `DISTINCT` keyword." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Modify your last query to remove duplicated lines.\n", + "In other words, authors should only appear once." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "1380346cba93b5641132ba21f102e116", + "grade": false, + "grade_id": "cell-2a39adc71d26ae73", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "PREFIX dbo:\n", + "\n", + "SELECT DISTINCT ?escritor, ?nombre, year(?fechaNac) as ?nac\n", + "\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "# YOUR CODE HERE\n", + "LIMIT 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "c8e5bf05e9d050389b2f8e7f142fdab0", + "grade": true, + "grade_id": "cell-542e0e36347fd5d1", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert 'Javier Sierra' in LAST_QUERY['columns']['nombre']\n", + "assert 'http://dbpedia.org/resource/Albert_Espinosa' in LAST_QUERY['columns']['escritor']\n", + "\n", + "from collections import Counter\n", + "c = Counter(LAST_QUERY['columns']['nombre'])\n", + "for count in c.values():\n", + " assert count == 1\n", + " \n", + "c1 = Counter(LAST_QUERY['columns']['escritor'])\n", + "assert all(count==1 for count in c1.values())\n", + "# c = Counter(LAST_QUERY['columns']['nombre'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using other resources" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the list of living Spanish novelists born in Madrid.\n", + "\n", + "Hint: use `dbr:Madrid` and `dbo:birthPlace`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { 
+ "deletable": false, + "nbgrader": { + "checksum": "32e2c9b0ce32483960f5ca794da54fa8", + "grade": false, + "grade_id": "cell-d175e41da57c889b", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "PREFIX dbr:\n", + "PREFIX dbo:\n", + "\n", + "SELECT DISTINCT ?escritor, ?nombre, ?lugarNac, year(?fechaNac) as ?nac\n", + "\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "# YOUR CODE HERE\n", + "LIMIT 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "db2cdda5575af942f110d85e2dbe02b5", + "grade": true, + "grade_id": "cell-fadd095862db6bc8", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert 'José Ángel Mañas' in LAST_QUERY['columns']['nombre']\n", + "assert 'http://dbpedia.org/resource/Madrid' in LAST_QUERY['columns']['lugarNac']\n", + "MADRID_QUERY = LAST_QUERY['columns'].copy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Traversing the graph" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the list of works of the authors in the previous query (i.e. authors born in Madrid), if they have any.\n", + "\n", + "Hint: use `dbo:author`, which is a **property of a literary work** that points to the author." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "abd3d09bdf5801d6d0b27d80326dfead", + "grade": false, + "grade_id": "cell-e4b99af9ef91ff6f", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "PREFIX dbr:\n", + "PREFIX dbo:\n", + "\n", + "SELECT DISTINCT ?escritor, ?nombre, ?obra\n", + "\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "# YOUR CODE HERE\n", + "LIMIT 10000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "d1305aa44456d51e3c52d78a9381f73a", + "grade": true, + "grade_id": "cell-68661b73c2140e4f", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert 'http://dbpedia.org/resource/A_Heart_So_White' in LAST_QUERY['columns']['obra']\n", + "assert 'http://dbpedia.org/resource/Tomorrow_in_the_Battle_Think_on_Me' in LAST_QUERY['columns']['obra']\n", + "assert '' in LAST_QUERY['columns']['obra'] # Some authors don't have works in dbpedia" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also get a list of the works in string format using `GROUP_CONCAT`.\n", + "For instance, `GROUP_CONCAT(?obra, \",\")`, to separate works with a comma.\n", + "\n", + "Try it yourself:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "f0ab8a246687b926fb919abbafaf3b53", + "grade": false, + "grade_id": "cell-e13fae23ccb78bb8", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "PREFIX dbr:\n", + "PREFIX dbo:\n", + "\n", + "# YOUR CODE HERE\n", + 
"\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "# YOUR CODE HERE\n", + "LIMIT 10000" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Traversing the graph" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get a list of living Spanish novelists born in Madrid, their name in Spanish, a link to their photo and a website (if they have one).\n", + "\n", + "If the query is right, you should see a list of writers after running the test code.\n", + "\n", + "Hint: `foaf:depiction` and `foaf:homepage`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "4ffc5d79f79c2079e93843838e91e053", + "grade": false, + "grade_id": "cell-b1f71c67dd71dad4", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "PREFIX dbr:\n", + "PREFIX dbo:\n", + "\n", + "SELECT ?escritor ?web ?foto\n", + "\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "ORDER BY ?nombre\n", + "LIMIT 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "bc497e6eaebe05e31248e3479df43c0c", + "grade": true, + "grade_id": "cell-8b8ba7cca701c652", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "fotos = set(filter(lambda x: x != '', LAST_QUERY['columns']['foto']))\n", + "assert len(fotos) > 2\n", + "show_photos(fotos) #show the pictures of the writers!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Union" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can merge the results of several queries, just like using `UNION` in SQL.\n", + "The keyword in SPARQL is `UNION`, because we are merging graphs.\n", + "\n", + "`UNION` is useful in many situations.\n", + "For instance, when there are equivalent properties, or when you want to use two search terms and `FILTER` would be too inefficient.\n", + "\n", + "The syntax is as follows:\n", + "\n", + "```sparql\n", + "SELECT ?title\n", + "WHERE {\n", + " { ?book dc10:title ?title }\n", + " UNION\n", + " { ?book dc11:title ?title }\n", + " \n", + " ... REST OF YOUR QUERY ...\n", + "\n", + "}\n", + "```\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using `UNION`, get a list of distinct Spanish novelists AND poets.\n", + "\n", + "Hint: use the category `dbc:Spanish_poets`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "5606810420d8cd259da74a3cc17fa824", + "grade": false, + "grade_id": "cell-21eb6323b6d0011d", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "PREFIX dbr:\n", + "PREFIX dbo:\n", + "\n", + "SELECT DISTINCT ?escritor, ?nombre\n", + "\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "# YOUR CODE HERE\n", + "LIMIT 10000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "eec248e71a855a5e713d31ae470f3fd4", + "grade": true, + "grade_id": "cell-004e021e877c6ace", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert 'Garcilaso de la Vega' in LAST_QUERY['columns']['nombre']" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also get the count of results either by inspecting the result (we will not cover this) or by aggregating the results using the `COUNT` operation.\n", + "\n", + "The syntax is:\n", + " \n", + "```sparql\n", + "SELECT COUNT(?variable) as ?count_name\n", + "```\n", + "\n", + "Try it yourself with our previous example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "2452c6213ad156deb5adbcfaeef74b8b", + "grade": false, + "grade_id": "cell-e35414e191c5bf16", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "PREFIX dbr:\n", + "PREFIX dbo:\n", + "\n", + "# YOUR CODE HERE\n", + "\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "LIMIT 10000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "f8b76d57ce959522a3914a442835393a", + "grade": true, + "grade_id": "cell-7a7ef8255a5662e2", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert len(LAST_QUERY['columns']) == 1\n", + "column_name = list(LAST_QUERY['columns'].keys())[0]\n", + "assert int(LAST_QUERY['columns'][column_name][0]) > 200" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Regular expressions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "580570dba869801272f9948f1e901bfd", + "grade": false, + "grade_id": "cell-a57d3546a812f689", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX dct:\n", + "PREFIX dbc:\n", + "PREFIX dbr:\n", + "PREFIX 
dbo:\n", + "\n", + "# YOUR CODE HERE\n", + "\n", + "WHERE {\n", + "# YOUR CODE HERE\n", + "}\n", + "# YOUR CODE HERE\n", + "LIMIT 1000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "71b5b187bb147c0e7444b29a4f413720", + "grade": true, + "grade_id": "cell-c149fe65008f39a9", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert len(LAST_QUERY['columns']['nombre']) > 15\n", + "for i in LAST_QUERY['columns']['nombre']:\n", + " assert 'Juan' in i" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Additional exercises" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Find out if there are more DBpedia entries for writers (`dbo:Writer`) than for football players (`dbo:SoccerPlayer`)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get a list of European countries with a population higher than 20 million, in decreasing order of population, including their URI, name in English and population." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Find the country in the world that speaks the most languages. Show its name in Spanish, if available." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Querying custom data\n", + "\n", + "In the last part of this course, we will query the data annotated in the previous course on RDF.\n", + "\n", + "The goal is to try SPARQL with data annotated by users with limited knowledge of vocabularies and semantics, and to compare the experience with similar queries to a more structured dataset.\n", + "\n", + "Hence, there are two parts.\n", + "First, you will query a set of graphs annotated by students of this course.\n", + "Then, you will query a synthetic dataset that contains similar information." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In particular, you need to run five queries, each one will answer one of the following questions:\n", + "\n", + "* Number of hotels (or entities) with reviews\n", + "* Number of reviews\n", + "* The hotel with the lowest average score\n", + "* The hotel with the highest average score\n", + "* A list of hotels with their addresses and telephone numbers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Manually annotated" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Querying the manually annotated dataset is slightly different from querying DBpedia.\n", + "The main difference is that this dataset uses different graphs to separate the annotations from different students.\n", + "\n", + "First, let us get a list of graphs available:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/ejerciciohoteles\n", + " \n", + "SELECT ?g WHERE {\n", + " GRAPH ?g {\n", + " ?s ?p ?o .\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have this list, you can query specific graphs like so:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/ejerciciohoteles\n", + " \n", + "SELECT *\n", + "WHERE {\n", + " GRAPH {\n", + " ?s ?p ?o .\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, design five queries to answer the questions in the description, and run each of them in at least five of these graphs.\n", + "\n", + "You can manually run the queries or use the code below, where you only need to specify your queries and the graphs you have identified.\n", + "\n", + "If you need additional prefixes, feel free to modify the TEMPLATE variable." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display\n", + "\n", + "QUERIES = {\n", + " 'highest score': '''\n", + " ?s ?p ?o\n", + "''',\n", + " 'lowest score': '''\n", + " ?s ?p ?o\n", + " ''',\n", + " 'number of hotels': '''\n", + " ?s ?p ?o\n", + " ''',\n", + " 'number of reviews': '''\n", + " ?s ?p ?o\n", + " ''',\n", + " 'telephones and addresses': '''\n", + " ?s ?p ?o\n", + " ''',\n", + " \n", + "}\n", + "\n", + "TEMPLATE = '''\n", + "SELECT * WHERE {{\n", + " GRAPH <{graph}>{{\n", + " {query}\n", + " }}\n", + " }}\n", + "'''\n", + "\n", + "GRAPHS = ['http://fuseki.cluster.gsi.dit.upm.es/36de86e6754934381d935f10618fe985',\n", + " ]\n", + "\n", + "for name, query in QUERIES.items():\n", + " for graph in GRAPHS:\n", + " print(name, '@', graph)\n", + " display(sparql('http://fuseki.cluster.gsi.dit.upm.es/ejerciciohoteles', TEMPLATE.format(graph=graph,\n", + " query=query)\n", + " ))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Synthetic dataset\n", + "\n", + "Now, run the same queries in the synthetic dataset.\n", + "\n", + "The query below should get you started:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/hotelessintetico \n", + "\n", + "SELECT *\n", + "WHERE {\n", + " ?s ?p ?o .\n", + "}\n", + "LIMIT 10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Discussion\n", + "\n", + "Compare the results of the synthetic and the manual dataset, and answer these questions:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Both datasets should use the same schema. Are there any differences when it comes to using them?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "11e7e2b7d3dfb45f9534506761f896f9", + "grade": true, + "grade_id": "cell-9bd08e4f5842cb89", + "locked": false, + "points": 0, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "# YOUR CODE HERE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Are data correctly annotated in both datasets?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "f676f18c71297e8429448fa0f0833db1", + "grade": true, + "grade_id": "cell-9dc1c9033198bb18", + "locked": false, + "points": 0, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "# YOUR CODE HERE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Has any of the datasets been harder to query? Why?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "2a24b20a338d18f4879540f5e03f5889", + "grade": true, + "grade_id": "cell-0e63b8e9dcb24676", + "locked": false, + "points": 0, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "# YOUR CODE HERE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Has any of the datasets been harder to query? 
Why?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "2ec2cf74959db9112c189a4e7a0b3609", + "grade": true, + "grade_id": "cell-6c18003ced54be23", + "locked": false, + "points": 0, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "# YOUR CODE HERE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Are data correctly annotated in both datasets?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "4a062d17043e5459a48314b1177cb8f1", + "grade": true, + "grade_id": "cell-cdce24ef5f581981", + "locked": false, + "points": 0, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "# YOUR CODE HERE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* [RDFLib documentation](https://rdflib.readthedocs.io/en/stable/).\n", + "* [Wikidata Query Service query examples](https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Licence\n", + "The notebook is freely licensed under the [Creative Commons Attribution Share-Alike license](https://creativecommons.org/licenses/by/2.0/). \n", + "\n", + "© 2018 Universidad Politécnica de Madrid." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lod/helpers.py b/lod/helpers.py new file mode 100644 index 0000000..b92f3d2 --- /dev/null +++ b/lod/helpers.py @@ -0,0 +1,94 @@ +from IPython.core.magic import (register_line_magic, register_cell_magic, + register_line_cell_magic) + +from IPython.display import HTML, display, Image +from urllib.request import Request, urlopen +from urllib.parse import quote_plus, urlencode +from urllib.error import HTTPError + +import json + + +def send_query(query, endpoint): + FORMATS = ",".join(["application/sparql-results+json", "text/javascript", "application/json"]) + + data = {'query': query} + # b = quote_plus(query) + + r = Request(endpoint, + data=urlencode(data).encode('utf-8'), + headers={'content-type': 'application/x-www-form-urlencoded', + 'accept': FORMATS}, + method='POST') + return json.loads(urlopen(r).read().decode('utf-8')); + + +def tabulate(tuples, header=None): + if not header: + header, tuples = tuples[0], tuples[1:] + header = '{}'.format(''.join('{}'.format(h) for h in header)) + rows = [] + for row in tuples: + inner = ''.join('{}'.format(c) for c in row) + rows.append('{}'.format(inner)) + body = ''.join(rows) + table = '{header}{body}
'.format(body=body, + header=header) + return table + + +LAST_QUERY = {} + + +def query(query, endpoint=None, print_table=False): + global LAST_QUERY + + endpoint = endpoint or "http://dbpedia.org/sparql" + results = send_query(query, endpoint) + tuples = to_table(results) + + + columns = {} + header, values = tuples[0], tuples[1:] + + for ix, h in enumerate(header): + columns[h] = [] + for row in values: + columns[h].append(row[ix]) + + LAST_QUERY.update({ + 'query': query, + 'endpoint': query, + 'results': results, + 'tuples': values, + 'columns': columns + }) + + if not print_table: + return tuples + return HTML(tabulate(tuples)) + + +def to_table(results): + table = [] + header = results['head']['vars'] + table.append(header) + for result in results["results"]["bindings"]: + table.append(tuple(result.get(h, {}).get('value', "") for h in header)) + return table + + +@register_cell_magic +def sparql(line, cell): + try: + return query(cell, endpoint=line, print_table=True) + except HTTPError as ex: + error_message = ex.read().decode('utf-8') + print('Error {}. Reason: {}'.format(ex.status, ex.reason)) + print(error_message) + + +def show_photos(values): + for value in values: + if 'http://' in value: + display(Image(url=value)) diff --git a/logo.jpg b/logo.jpg new file mode 100644 index 0000000..ff723a7 Binary files /dev/null and b/logo.jpg differ