diff --git a/lod/01_SPARQL_Introduction.ipynb b/lod/01_SPARQL_Introduction.ipynb new file mode 100644 index 0000000..a3a9b88 --- /dev/null +++ b/lod/01_SPARQL_Introduction.ipynb @@ -0,0 +1,1870 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "7276f055a8c504d3c80098c62ed41a4f", + "grade": false, + "grade_id": "cell-0bfe38f97f6ab2d2", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "
\n", + "
\n", + "

Course Notes for Learning Intelligent Systems

\n", + "

Department of Telematic Engineering Systems

\n", + "
Universidad Politécnica de Madrid
\n", + "
\n", + " \"UPM\"\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "6a78a7c2cbcad6ec014af585a381f1ff", + "grade": false, + "grade_id": "cell-0cd673883ee592d1", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "## Introduction to Linked Open Data\n", + "\n", + "This lecture provides a quick introduction to semantic queries in Python using SPARQL.\n", + "SPARQL is a semantic query language inspired by SQL.\n", + "\n", + "This is the first in a series of notebooks about SPARQL, which consists of:\n", + "\n", + "* This notebook, which introduces basic concepts using a small public dataset.\n", + "* [A notebook with queries to a custom dataset](02_SPARQL_Custom_Endpoint.ipynb), which links to the RDF exercises.\n", + "* [A notebook with queries to DBpedia](03_SPARQL_Writers.ipynb). DBpedia is the semantic version of Wikipedia. It is very useful, as it contains much more data. However, finding the right properties to query can be challenging.\n", + "* [A notebook with more advanced SPARQL concepts](04_SPARQL_Advanced.ipynb), which extends the previous notebook with more advanced concepts, such as regular expressions and dealing with dates." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "bc0ca2e21254707344c60f895cb204b4", + "grade": false, + "grade_id": "cell-10264483046abcc4", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "## Objectives\n", + "\n", + "* Learning SPARQL and the Linked Data principles by defining queries to answer a set of problems of increasing difficulty\n", + "* Learning how to use integrated SPARQL editors and programming interfaces to SPARQL." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "99aecbad8f94966d92d72dc911d3ff99", + "grade": false, + "grade_id": "cell-4f8492996e74bf20", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "## Tools\n", + "\n", + "* This notebook\n", + "* External SPARQL editors (optional)\n", + " * YASGUI-GSI http://yasgui.cluster.gsi.dit.upm.es\n", + " * DBpedia virtuoso http://dbpedia.org/sparql\n", + "\n", + "Using the YASGUI-GSI editor has several advantages over other options.\n", + "It features:\n", + "\n", + "* Selection of data source, either by specifying the URL or by selecting from a dropdown menu\n", + "* Interactive query editing\n", + " * A set of pre-defined queries\n", + " * Syntax errors\n", + " * Auto-complete\n", + "* Data visualization\n", + " * Total number of results\n", + " * Different formats (table, pivot table, raw response, etc.)\n", + " * Pagination of results\n", + " * Search and filter results" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "99e3107f9987cdddae7866dded27f165", + "grade": false, + "grade_id": "cell-70ac24910356c3cf", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "## Instructions\n", + "\n", + "We will be using a semantic server, available at: http://fuseki.cluster.gsi.dit.upm.es/sitc.\n", + "\n", + "This server contains a dataset about [Beatles songs](http://www.snee.com/bobdc.blog/2017/11/sparql-queries-of-beatles-reco.html), which we will query with SPARQL.\n", + "\n", + "We will provide you with some example code to get you started, the *question* you will have to answer using SPARQL, and a template for the answer.\n", + "\n", + "After every query, you will find some python code to test the results of the query.\n", + "**Make sure you've run the tests before moving to the next exercise**.\n", + "If the 
test gives you an error, you've probably done something wrong.\n", + "You do not need to understand or modify the test code." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "1d332d3d11fd6b57f0ec0ac3c358c6cb", + "grade": false, + "grade_id": "cell-eb13908482825e42", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "For convenience, the examples in the notebook are executable (using the `%%sparql` magic command), and they are accompanied by some code to test the results.\n", + "If the tests pass, you probably got the answer right.\n", + "\n", + "**Run this line to enable the `%%sparql` magic command.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "aca7c5538b8fc53e99c92e94e6818c83", + "grade": false, + "grade_id": "cell-b3f3d92fa2100c3d", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "from helpers import sparql, solution, show_photos" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "e896b6560e45d5c385a43aa85e3523c7", + "grade": false, + "grade_id": "cell-04410e75828c388d", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "The `%%sparql` magic command will allow us to use SPARQL inside normal jupyter cells.\n", + "\n", + "For instance, the following code:\n", + "\n", + "```python \n", + "%%sparql http://dbpedia.org/sparql\n", + "\n", + "\n", + "``` \n", + "\n", + "Is the same as `run_query('', endpoint='http://dbpedia.org/sparql')` plus some additional steps, such as saving the results in a nice table format so that they can be used later and storing the results in a variable (`solution()`), which we will use in our tests.\n", + "\n", + "You do not need to worry about it, 
and **you can always use one of the suggested online editors if you wish**." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "96ca90572d6b275fa515c6b976115257", + "grade": false, + "grade_id": "cell-2a44c0da2c206d01", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "You can also use any other method to write your queries.\n", + "Just make sure to copy the working query back into the notebook so you can test it.\n", + "\n", + "You may find online query editors particularly useful.\n", + "In addition to running queries from your browser, they provide useful features such as syntax highlighting and autocompletion.\n", + "Some examples are:\n", + "\n", + "* DBpedia's virtuoso query editor https://dbpedia.org/sparql\n", + "* A javascript based client hosted at GSI: http://yasgui.cluster.gsi.dit.upm.es/\n", + "\n", + "[^1]: http://www.snee.com/bobdc.blog/2017/11/sparql-queries-of-beatles-reco.html" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "79c60bd3d4c13f380aae5778c5ce7245", + "grade": false, + "grade_id": "cell-d645128d3af18117", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "## Exercises\n", + "\n", + "The following exercises cover the basics of SPARQL with simple use cases." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "f7428fe79cd33383dfd3b09a0d951b6e", + "grade": false, + "grade_id": "cell-8391a5322a9ad4a7", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "#### First select - Exploring the dataset\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "f6b5da583694dd5cc9326c670830875d", + "grade": false, + "grade_id": "cell-4f56a152e4d70c02", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "Let's start with a simple query to explore the dataset using SPARQL.\n", + "We will get a list of the types of entities in the dataset.\n", + "\n", + "SPARQL syntax is similar to SQL, mixed with turtle.\n", + "A SPARQL query has two main parts: the `SELECT` block, which specifies what variables we want to get; and the `WHERE` block which, loosely speaking, defines how the variables will be obtained from the graph.\n", + "\n", + "In order to construct the `WHERE` block, we have to know how the data we want to extract would be represented in Turtle.\n", + "\n", + "In particular, to write an entity and its type, we would write this triple:\n", + "\n", + "```turtle\n", + " a .\n", + "```\n", + "\n", + "For example:\n", + "\n", + "```turtle\n", + "example:Timmy a example:Boy\n", + "```\n", + "\n", + "In SPARQL, the parts that we wish to extract are replaced with a variable (e.g. 
`?name`, `?type`).\n", + "Hence, we would have something like this:\n", + "\n", + "```turtle\n", + "?entity a ?type\n", + "```\n", + "\n", + "The name of the variable has no effect on the query, but you should use a sensible name.\n", + "In these notebooks, try to use the names provided in the templates, because they might be used in the tests.\n", + "\n", + "There are additional parts in the query.\n", + "For now, we will only cover the `LIMIT` statement, which limits the number of results we will get.\n", + "Using `LIMIT` is usually a good idea, especially when trying new queries, because the dataset may be too big. \n", + "\n", + "Using all these concepts, we will run our first query, to get the list of entities and their type:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "7a9dc62ab639143c9fc13593e50500d4", + "grade": false, + "grade_id": "cell-8ce8c954513f17e7", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "SELECT ?entity ?type\n", + "WHERE {\n", + " ?entity a ?type\n", + "}\n", + "LIMIT 10" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "d6a79c2f5fd005a9e15a8f67dcfd4784", + "grade": false, + "grade_id": "cell-3d6d622c717c3950", + "locked": true, + "schema_version": 1, + "solution": false + } + }, + "source": [ + "You can check that the results you got match our expectations:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert len(solution()['tuples']) == 10 # Make sure we got 10 results \n", + "assert len(solution()['columns']) >= 1 # In 2 columns (?entity and ?type)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, use the same concepts to write a 
query that gets the **list of entities and their properties**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "3414fee2b6ccfc90a87a62697b35fbda", + "grade": false, + "grade_id": "cell-6e904d692b5facad", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "SELECT DISTINCT ?entity ?prop\n", + "WHERE {\n", + "# YOUR ANSWER HERE\n", + "}\n", + "LIMIT 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "97bd5d5383bd94a72c7452bc33e4b0f9", + "grade": true, + "grade_id": "cell-3fc0d3c43dfd04a3", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "s = solution()\n", + "assert len(s['tuples']) >= 100 # There are at least 100 results\n", + "assert 'entity' in s['columns'] # A column named entity exists\n", + "assert 'http://learningsparql.com/ns/musician/RaymondBrown' in s['columns']['entity'] # RaymondBrown is an entity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Getting a list of DISTINCT types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To get a better grip of the dataset, we will get a list of types.\n", + "\n", + "We may try to do so with a simple query: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "SELECT ?type\n", + "WHERE {\n", + " ?entity a ?type\n", + "}\n", + "LIMIT 10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, this list has many duplicates.\n", + "In fact, we only get one type (`Musician`).\n", + "\n", + "To remove duplicates, we will need the 
`DISTINCT` statement, which only shows unique (distinct) rows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "SELECT DISTINCT ?type\n", + "WHERE {\n", + " ?entity a ?type\n", + "}\n", + "LIMIT 100" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We should see only three types now (`Musician`, `Song`, and `Instrument`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert 'type' in solution()['columns']\n", + "assert len(solution()['tuples']) == 3\n", + "assert 'http://learningsparql.com/ns/schema/Musician' in solution()['columns']['type']\n", + "assert 'http://learningsparql.com/ns/schema/Song' in solution()['columns']['type']\n", + "assert 'http://learningsparql.com/ns/schema/Instrument' in solution()['columns']['type']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, **build a query to get the list of unique properties**:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "47c4f68e342ffe59a3804de7b6a3909b", + "grade": false, + "grade_id": "cell-e615f9a77c4bc9a5", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "SELECT DISTINCT ?property\n", + "WHERE {\n", + "# YOUR ANSWER HERE\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "c9ffeba2d4ffc3e0b95f15a0ec6012c5", + "grade": true, + "grade_id": "cell-9168718938ab7347", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert len(solution()['tuples']) == 182\n", + "assert 
'http://learningsparql.com/ns/instrument/bass' in solution()['columns']['property']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Getting all properties for songs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `WHERE` statement can contain more than one line.\n", + "\n", + "For example, we can restrict the list of properties from the previous exercise, to only get properties of musicians:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX s: \n", + "PREFIX rdfs: \n", + "\n", + "SELECT DISTINCT ?prop\n", + "WHERE {\n", + " ?song a s:Musician .\n", + " ?song ?prop ?value .\n", + "}\n", + "LIMIT 20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There should be two results:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert len(solution()['tuples']) == 2 # There are exactly two results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice the use of prefixes, just like in turtle.\n", + "Also, these two options are equivalent:\n", + "\n", + "```turtle\n", + "?song a s:Musician ;\n", + " ?prop ?value .\n", + "\n", + "# And\n", + "\n", + "?song a s:Musician .\n", + "?song ?prop ?value .\n", + "```\n", + "\n", + "The first one is just shorter to write.\n", + "\n", + "Alternatively, in this example we can also replace the variables we are not using with square brackets `[]`:\n", + "\n", + "```turtle\n", + "[] a s:Musician ;\n", + " ?prop [] .\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, use the same concepts to get a list of **songs and properties**, without duplicates:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + 
"checksum": "8b0faf938efc1a64a70515da3c132605", + "grade": false, + "grade_id": "cell-0223a51f609edcf9", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX s: \n", + "PREFIX rdfs: \n", + "\n", + "# YOUR ANSWER HERE\n", + "WHERE {\n", + "# YOUR ANSWER HERE\n", + "}\n", + "LIMIT 20" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "e93d7336fd125d95996e60fd312a4e4d", + "grade": true, + "grade_id": "cell-3c7943c6382c62f5", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "s = solution()\n", + "assert len(set(s['tuples'])) == len(s['tuples']) # There are no duplicates\n", + "assert len(s['tuples']) >= 20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Getting a list of song names" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the previous exercise, we saw the properties for Songs.\n", + "One of them is `rdfs:label`, which gives a human readable name for the entity.\n", + "\n", + "Using `rdfs:label`, get a list of song names:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "271f2b194c2db4c558a46e8312b593e6", + "grade": false, + "grade_id": "cell-8f43547dd788bb33", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX s: \n", + "PREFIX rdfs: \n", + "\n", + "SELECT ?name\n", + "WHERE {\n", + "# YOUR ANSWER HERE\n", + "}\n", + "LIMIT 20" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": 
"9f1f7cec8ce4674971543728ada86674", + "grade": true, + "grade_id": "cell-e13a1c921af2f6eb", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "s = solution()\n", + "assert 'Besame Mucho' in s['columns']['name']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Getting an ordered list of songs (ORDER BY)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `ORDER BY` statement allows us to determine the way results will be sorted.\n", + "This makes it easier to find errors, or missing data.\n", + "\n", + "The syntax is the following:\n", + "\n", + "```sparql\n", + "\n", + "SELECT *\n", + "WHERE { ... }\n", + "ORDER BY ... DESC() ASC()\n", + "... other statements like LIMIT ...\n", + "```\n", + "\n", + "The results can be sorted in ascending or descending order, and using several variables." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use `ORDER BY` to get a list of songs in **descending order**:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "9dcd9c6d51a61ac129cffa06e1463c66", + "grade": false, + "grade_id": "cell-a0f0b9d9b05c9631", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX s: \n", + "PREFIX rdfs: \n", + "\n", + "SELECT ?name\n", + "WHERE {\n", + "# YOUR ANSWER HERE\n", + "}\n", + "# YOUR ANSWER HERE\n", + "LIMIT 50" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "a044b3fd6b8bd4e098bbe4d818cb4e9f", + "grade": true, + "grade_id": "cell-bc012ca9d7ad2867", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "s = solution()\n", + 
"assert len(s['tuples']) >= 20\n", + "assert s['columns']['name'][0][0] > s['columns']['name'][-1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get a list of musicians who collaborated in at least one song (Traversing the graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From our inspection of the properties in previous exercises, we know that each song has a list of properties that link to musicians, and each musician has a name. For example:\n", + "\n", + "\n", + "```turtle\n", + "song:HeyJude a schema:Song ;\n", + " instrument:guitar musician:RingoStarr .\n", + "\n", + "musician:RingoStarr a schema:Musician ;\n", + " rdfs:label \"Ringo Starr\" .\n", + "```\n", + "\n", + "Use this structure, and the SPARQL statements you already know, to get the **names** of all musicians that collaborated in at least one song.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "7be32a274bb576eb4c154c2737bc5a26", + "grade": false, + "grade_id": "cell-523b963fa4e288d0", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX s: \n", + "PREFIX rdfs: \n", + "\n", + "SELECT DISTINCT ?musician\n", + "WHERE {\n", + " ?song a s:Song .\n", + "# YOUR ANSWER HERE\n", + " ]\n", + "}\n", + "ORDER BY ?name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "c8e3a929faf2afa72207c6921382654c", + "grade": true, + "grade_id": "cell-aa9a4e18d6fda225", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "s = solution()\n", + "assert 'musician' in s['columns']\n", + "assert 'Paul McCartney' in s['columns']['musician']\n", + "assert 'Peter Coe' in 
s['columns']['musician']\n", + "assert len(solution()['tuples']) >= 200" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### In how many songs did Ringo collaborate? (COUNT)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Results can be aggregated using different functions.\n", + "One of the simplest functions is `COUNT`.\n", + "The syntax for COUNT is:\n", + " \n", + "```sparql\n", + "SELECT (COUNT(?variable) AS ?count_name)\n", + "```\n", + "\n", + "Use `COUNT` to get the number of songs in which Ringo collaborated. Use `?number` for the count." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "d8419711d2db43ad657e2658a1ea86c4", + "grade": false, + "grade_id": "cell-e89d08031e30b299", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX s: \n", + "PREFIX m: \n", + "PREFIX rdfs: \n", + "\n", + "# YOUR ANSWER HERE\n", + "WHERE {\n", + " ?song a s:Song .\n", + " ?song ?instrument m:RingoStarr .\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "29404e07edf639cdc0ce0d82e654ec31", + "grade": true, + "grade_id": "cell-903d2be00885e1d2", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert solution()['columns']['number'][0] == '412'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Getting the frequency of each instrument (GROUP BY)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Results can be grouped by one or more of the variables.\n", + "\n", + "Grouping is achieved with the `GROUP BY` statement. 
\n", + "The syntax for `GROUP BY` is:\n", + "\n", + " \n", + "```sparql\n", + "SELECT GROUP BY ?variable1 ?variable2 ...\n", + "```\n", + "\n", + "Once results are grouped, they can be aggregated using any aggregation function, such as `COUNT`.\n", + "\n", + "Using `GROUP BY` and `COUNT`, get the count of songs that use each instrument:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "7a0a7206384e7e1d9eb4450dd9e9871f", + "grade": false, + "grade_id": "cell-1429e4eb5400dbc7", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX s: \n", + "PREFIX m: \n", + "PREFIX rdfs: \n", + "\n", + "SELECT ?instrument (COUNT(?song) as ?number)\n", + "WHERE {\n", + " ?song a s:Song .\n", + " ?song ?instrument m:RingoStarr .\n", + "}\n", + "# YOUR ANSWER HERE\n", + "ORDER BY DESC(?number)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "bd4dc379fea969d513be0ea97ee75922", + "grade": true, + "grade_id": "cell-907aaf6001e27e50", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "s = solution()\n", + "assert len(s['tuples']) == 37\n", + "assert s['columns']['number'][-1] == '1'\n", + "assert s['columns']['number'][0] == '233'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How many different instruments are there in every song?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use other keywords inside our aggregation.\n", + "For example, we could use `DISTINCT` to remove duplicates before aggregating.\n", + "\n", + "Here is an example, which shows the number of songs each musician collaborated in.\n", + "It has to use `DISTINCT` because some artists play multiple instruments in a song." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX s: \n", + "PREFIX rdfs: \n", + "\n", + "SELECT ?artist (COUNT(DISTINCT ?song) as ?number)\n", + "WHERE {\n", + " ?artist a s:Musician .\n", + " ?song ?instrument ?artist .\n", + "}\n", + "GROUP BY ?artist\n", + "ORDER BY DESC(?number)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, use the same principle to get the count of **different** instruments in each song.\n", + "Some songs have several musicians playing the same instrument, but we only care about *different* instruments in each song.\n", + "\n", + "Use `?number` for the count." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "4a231b4d6874dad435512b988c17c39e", + "grade": false, + "grade_id": "cell-ee208c762d00da9c", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX s: \n", + "PREFIX rdfs: \n", + "\n", + "# YOUR ANSWER HERE\n", + "WHERE {\n", + " [] a s:Song ;\n", + " rdfs:label ?song ;\n", + " ?instrument ?musician .\n", + "}\n", + "# YOUR ANSWER HERE\n", + "ORDER BY DESC(?number)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "8118099bf14d9f0eb241c4d93ea6f0b9", + "grade": true, + "grade_id": "cell-ddeec32b8ac3d894", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "s = solution()\n", + "assert s['columns']['number'][0] == '27'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Who is the vocalist in every song? 
(using OPTIONAL)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this exercise, we will get a list of songs and their vocalists.\n", + "\n", + "We could start with this query:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX s: \n", + "PREFIX i: \n", + "PREFIX m: \n", + "PREFIX rdfs: \n", + "\n", + "SELECT ?song ?vocalist\n", + "WHERE {\n", + " ?song a s:Song .\n", + " ?song i:vocals ?vocalist\n", + "}\n", + "LIMIT 100" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, there are some songs that do not have a vocalist (at least, in the dataset).\n", + "Those songs will not appear in the list above, because they do not match part of the `WHERE` clause.\n", + "\n", + "In these cases, we can specify optional values in a query using the `OPTIONAL` keyword.\n", + "When a set of clauses is inside an OPTIONAL group, the SPARQL endpoint will try to use them in the query.\n", + "If there are no results for that part of the query, the variables it specifies will not be bound (i.e. 
they will be empty).\n", + "\n", + "To exemplify this, we can use a property that **does not exist in the dataset**:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX s: \n", + "PREFIX i: \n", + "PREFIX m: \n", + "PREFIX rdfs: \n", + "\n", + "SELECT ?song ?musician\n", + "WHERE {\n", + " ?song a s:Song .\n", + " OPTIONAL {\n", + " ?song i:a_made_up_instrument ?musician\n", + " }\n", + "}\n", + "LIMIT 100" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Although the property does not exist, the query will still return all the songs.\n", + "In the column for our instrument, it returns an empty value.\n", + "\n", + "Now, use the same concept, to get a list of the **names** of the vocalists (if any) in each song." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "4b0a0854457c37640aad67f375ed3a17", + "grade": false, + "grade_id": "cell-dcd68c45c1608a28", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX s: \n", + "PREFIX i: \n", + "PREFIX m: \n", + "PREFIX rdfs: \n", + "\n", + "SELECT ?song ?vocalist\n", + "WHERE {\n", + " ?s a s:Song .\n", + " ?s rdfs:label ?song .\n", + "# YOUR ANSWER HERE\n", + "}\n", + "LIMIT 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "f7122b2284b5d59d59ce4a2925f0bb21", + "grade": true, + "grade_id": "cell-1e706b9c1c1331bc", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "s = solution()\n", + "assert 'Paul McCartney' in s['columns']['vocalist']\n", + "assert 'Paul McCartney' in 
s['columns']['vocalist']\n", + "assert ('Besame Mucho', 'Paul McCartney') in s['tuples']\n", + "assert '' in s['columns']['vocalist'] # Some songs do not have a vocalist" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What songs do not have a vocalist? (Bound)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we only want to list those songs that **do not** have a vocalist.\n", + "\n", + "To do so, we can copy the query from the previous exercise, and filter the results with the `BOUND` function.\n", + "\n", + "`BOUND` will return `true` if the variable has a value, and `false` otherwise.\n", + "\n", + "This is very useful for two purposes.\n", + "Firstly, it allows us to look for patterns that **do not occur** in the graph, such as missing properties.\n", + "For instance, we could search for the authors with missing birth information so we can add it.\n", + "Secondly, we can use bound in filters to get conditional filters.\n", + "\n", + "Add a filter below to only get songs without a vocalist:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "09621e7af911faf39a834e8281bc6d1f", + "grade": false, + "grade_id": "cell-0c7cc924a13d792a", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX s: \n", + "PREFIX i: \n", + "PREFIX m: \n", + "PREFIX rdfs: \n", + "\n", + "SELECT ?song\n", + "WHERE {\n", + " ?s a s:Song .\n", + " ?s rdfs:label ?song .\n", + " OPTIONAL {\n", + " ?s i:vocals ?vocalist\n", + " }\n", + "# YOUR ANSWER HERE\n", + "}\n", + "LIMIT 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "cebff8ce42f3f36923e81e083a23d24c", + "grade": true, + "grade_id": "cell-2541abc93ab4d506", + "locked": 
true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "s = solution()\n", + "assert len(s['tuples']) == 23" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Who played guitar OR bass in the most songs? (Advanced FILTER with GROUP)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this exercise, we want a table with the name of musicians that played either the guitar (`i:guitar`) or the bass (`i:bass`), the instrument they played, and the times they played it.\n", + "\n", + "If a musician played both instruments, it should appear twice." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "ea9797f3b2d001ea41d7fa7a5170d5fb", + "grade": false, + "grade_id": "cell-d750b6d64c6aa0a7", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX s: \n", + "PREFIX i: \n", + "PREFIX m: \n", + "\n", + "SELECT ?musician ?instrument (COUNT(DISTINCT ?song) AS ?number)\n", + "WHERE {\n", + " ?song ?ins ?player .\n", + " ?ins rdfs:label ?instrument .\n", + " ?player rdfs:label ?musician .\n", + "# YOUR ANSWER HERE\n", + "}\n", + "# YOUR ANSWER HERE\n", + "\n", + "ORDER BY DESC(?instrument) DESC(?number)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s = solution()\n", + "assert ('George Harrison', 'guitar', '27') in s['tuples']\n", + "assert ('Stuart Sutcliffe', 'bass', '3') in s['tuples']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Who played the most instruments? 
(Advanced FILTER II)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, count how many instruments each musician has played in a song.\n", + "\n", + "**Do not count lead (`i:vocals`) or backing vocals (`i:backingvocals`) as instruments**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "2d82df272d43f678d3b19bf0b41530c1", + "grade": false, + "grade_id": "cell-2f5aa516f8191787", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX s: \n", + "PREFIX i: \n", + "PREFIX m: \n", + "\n", + "# YOUR ANSWER HERE\n", + "WHERE {\n", + " ?song ?ins ?player .\n", + " ?ins rdfs:label ?instrument .\n", + " ?player rdfs:label ?musician .\n", + "# YOUR ANSWER HERE\n", + "}\n", + "GROUP BY ?musician\n", + "ORDER BY DESC(?instrument) DESC(?number)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "bc83dd9577c9111b1f0ef5bd40c4ec08", + "grade": true, + "grade_id": "cell-bcd0f7e26b6c11c2", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "s = solution()\n", + "assert ('John Lennon', '52') in s['tuples']\n", + "assert ('Andy White', '2') in s['tuples']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Which songs had Ringo on drums OR Lennon on lead vocals? 
(UNION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can merge the results of several queries, just like using `UNION` in SQL.\n", + "The keyword in SPARQL is also `UNION`, because we are merging graphs.\n", + "\n", + "`UNION` is useful in many situations.\n", + "For instance, when there are equivalent properties, or when you want to use two search terms and FILTER would be too inefficient.\n", + "\n", + "The syntax is as follows:\n", + "\n", + "```sparql\n", + "SELECT ?title\n", + "WHERE {\n", + " { ?book dc10:title ?title }\n", + " UNION\n", + " { ?book dc11:title ?title }\n", + " \n", + " ... REST OF YOUR QUERY ...\n", + "\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "a1e20e2be817a592683dea89eed0120e", + "grade": false, + "grade_id": "cell-d3a742bd87d9c793", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX rdfs: \n", + "PREFIX s: \n", + "PREFIX i: \n", + "PREFIX m: \n", + "\n", + "SELECT DISTINCT ?song\n", + "WHERE {\n", + "# YOUR ANSWER HERE\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "087630476d73bb415b065fafbd6024f0", + "grade": true, + "grade_id": "cell-409402df0e801d09", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "assert len(solution()['tuples']) == 246" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### In how many songs has each musician collaborated at least 10 times? 
(HAVING)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can filter results after an aggregation, using the `HAVING` statement.\n", + "Its syntax is:\n", + " \n", + "\n", + "```sparql\n", + "SELECT ...\n", + "WHERE ...\n", + "GROUP BY ...\n", + "HAVING ()\n", + "```\n", + "\n", + "e.g.\n", + "\n", + "```sparql\n", + "HAVING (?count > 10)\n", + "```\n", + "\n", + "Use this new statement to get the list of artists that played at least 10 times with the Beatles, and the number of times they did:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "1d2cb88412c89c35861a4f9fccea3bf2", + "grade": false, + "grade_id": "cell-9d1ec854eb530235", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "\n", + "PREFIX rdfs: \n", + "\n", + "SELECT ?musician (COUNT(DISTINCT ?song) AS ?number) \n", + "WHERE {\n", + " ?song ?instrument [\n", + " rdfs:label ?musician \n", + " ]\n", + "}\n", + "GROUP BY ?musician\n", + "# YOUR ANSWER HERE\n", + "ORDER BY DESC(?number)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "checksum": "aa20aa4d11632ea5bd6004df3187d979", + "grade": true, + "grade_id": "cell-a79c688b4566dbe8", + "locked": true, + "points": 0, + "schema_version": 1, + "solution": false + } + }, + "outputs": [], + "source": [ + "s = solution()\n", + "assert len(s['tuples']) == 7\n", + "assert s['columns']['musician'][0] == 'Paul McCartney'\n", + "assert s['columns']['musician'][-1] == 'Mal Evans'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## **Optional** exercises" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These are additional exercises that can be solved with more advanced concepts.\n", + "\n", + "If you 
are curious, you could also check the notebook on Advanced SPARQL concepts." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What instruments could each musician play? (GROUP_CONCAT)\n", + "\n", + "\n", + "Another option to aggregate results is to concatenate them.\n", + "You can do so with:\n", + "\n", + "```sparql\n", + "GROUP_CONCAT(?name; separator=\",\")\n", + "```\n", + "\n", + "Using `GROUP_CONCAT`, get a list of the instruments that each musician could play." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "508b7f8656e849838aa93cd38f1c6635", + "grade": false, + "grade_id": "cell-7ea1f5154cdd8324", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "PREFIX rdfs: \n", + "PREFIX s: \n", + "PREFIX i: \n", + "PREFIX m: \n", + "\n", + "# YOUR ANSWER HERE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What types of vocals are there? (REGEX)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In one of the exercises, we excluded lead and backing vocals from the list of instruments.\n", + "However, are those the only types of vocals?\n", + "\n", + "You can check if a string or URI matches a regular expression with `regex(?variable, \"\")`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "checksum": "cff1f9c034393f8af055e1f930d5fe32", + "grade": false, + "grade_id": "cell-b6bee887a1b1fc60", + "locked": false, + "schema_version": 1, + "solution": true + } + }, + "outputs": [], + "source": [ + "%%sparql http://fuseki.cluster.gsi.dit.upm.es/sitc/\n", + "PREFIX rdfs: \n", + "PREFIX s: \n", + "PREFIX i: \n", + "PREFIX m: \n", + "\n", + "# YOUR ANSWER HERE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* [SPARQL queries of Beatles recording sessions](http://www.snee.com/bobdc.blog/2017/11/sparql-queries-of-beatles-reco.html)\n", + "* [RDFLib documentation](https://rdflib.readthedocs.io/en/stable/).\n", + "* [Wikidata Query Service query examples](https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Licence\n", + "The notebook is freely licensed under the [Creative Commons Attribution Share-Alike license](https://creativecommons.org/licenses/by/2.0/). \n", + "\n", + "© 2018 Universidad Politécnica de Madrid." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lod/SPARQL.ipynb b/lod/SPARQL.ipynb deleted file mode 100644 index 3e0f071..0000000 --- a/lod/SPARQL.ipynb +++ /dev/null @@ -1,1880 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "7276f055a8c504d3c80098c62ed41a4f", - "grade": false, - "grade_id": "cell-0bfe38f97f6ab2d2", - "locked": true, - "schema_version": 1, - "solution": false - } - }, - "source": [ - "
\n", - "
\n", - "

Course Notes for Learning Intelligent Systems

\n", - "

Department of Telematic Engineering Systems

\n", - "
Universidad Politécnica de Madrid
\n", - "
\n", - " \"UPM\"\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "a273399fb0e4a7752cea07a36562def1", - "grade": false, - "grade_id": "cell-0cd673883ee592d1", - "locked": true, - "schema_version": 1, - "solution": false - } - }, - "source": [ - "## Introduction to Linked Data\n", - "\n", - "This lecture provides a quick introduction to semantic queries in Python.\n", - "We will be using DBpedia, a semantic version of Wikipedia.\n", - "\n", - "The language we will use to query DBpedia is SPARQL, a semantic query language inspired by SQL.\n", - "For convenience, the examples in the notebook are executable, and they are accompanied by some code to test the results.\n", - "If the tests pass, you probably got the answer right.\n", - "\n", - "However, you can also use any other method to write and send your queries.\n", - "You may find online query editors particularly useful.\n", - "In addition to running queries from your browser, they provide useful features such as syntax highlighting and autocompletion.\n", - "Some examples are:\n", - "\n", - "* DBpedia's virtuoso query editor https://dbpedia.org/sparql\n", - "* A javascript based client hosted at GSI: http://yasgui.cluster.gsi.dit.upm.es/" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "255c4bd678939b4448860dc5e0afdae6", - "grade": false, - "grade_id": "cell-10264483046abcc4", - "locked": true, - "schema_version": 1, - "solution": false - } - }, - "source": [ - "## Objectives\n", - "\n", - "* Learning SPARQL and the Linked Data principles by defining queries to answer a set of problems of increasing difficulty\n", - "* Verifying the usefulness of the Linked Open Data initiative by querying data from different RDF graphs and endpoints\n", - "* Learning how to use integrated SPARQL editors and programming interfaces to SPARQL." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "f04dd27e103bacc5166763900527901e", - "grade": false, - "grade_id": "cell-4f8492996e74bf20", - "locked": true, - "schema_version": 1, - "solution": false - } - }, - "source": [ - "## Tools\n", - "\n", - "* This notebook\n", - "* SPARQL editors (optional)\n", - " * YASGUI-GSI http://yasgui.cluster.gsi.dit.upm.es\n", - " * DBpedia virtuoso http://dbpedia.org/sparql\n", - "\n", - "Using the YASGUI-GSI editor has several advantages over other options.\n", - "It features:\n", - "\n", - "* Selection of data source, either by specifying the URL or by selecting from a dropdown menu\n", - "* Interactive query editing\n", - " * A set of pre-defined queries\n", - " * Syntax errors\n", - " * Auto-complete\n", - "* Data visualization\n", - " * Total number of results\n", - " * Different formats (table, pivot table, raw response, etc.)\n", - " * Pagination of results\n", - " * Search and filter results" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "c5f8646518bd832a47d71f9d3218237a", - "grade": false, - "grade_id": "cell-eb13908482825e42", - "locked": true, - "schema_version": 1, - "solution": false - } - }, - "source": [ - "Run this line to enable the `%%sparql` magic command." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from helpers import *" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `%%sparql` magic command will allow us to use SPARQL inside normal jupyter cells.\n", - "\n", - "For instance, the following code:\n", - "\n", - "```\n", - "%%sparql\n", - "\n", - "MY QUERY\n", - "``` \n", - "\n", - "Is the same as `run_query('MY QUERY', endpoint='http://dbpedia.org/sparql')` plus some additional steps, such as saving the results in a nice table format so that they can be used later and storing the results in a variable (`LAST_QUERY`), which we will use in our tests.\n", - "\n", - "You do not need to worry about it, and **you can always use one of the suggested online editors if you wish**." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exercises\n", - "\n", - "The following exercises cover the basics of SPARQL with simple use cases.\n", - "We will provide you some example code to get you started, the *question* you will have to answer using SPARQL, and the skeleton for the answer.\n", - "\n", - "After every query, you will find some python code to test the results of the query.\n", - "Make sure you've run the tests before moving to the next exercise.\n", - "If the test gives you an error, you've probably done something wrong.\n", - "You **do not need to understand or modify the test code**.\n", - "\n", - "\n", - "In case you're interested, the tests rely on the `LAST_QUERY` variable, which is updated by the `%%sparql` magic after every query.\n", - "This variable contains the full query used (`LAST_QUERY[\"query\"]`), the endpoint it was sent to (`LAST_QUERY[\"endpoint\"]`), and a dictionary with the response of the endpoint (`LAST_QUERY[\"results\"]`).\n", - "For convenience, the results are also given as tuples (`LAST_QUERY[\"tuples\"]`), and as a dictionary of of `{column:[values]}` 
(`LAST_QUERY[\"columns\"]`)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### First Select\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's start with a simple query. We will get a list of cities and towns in Madrid.\n", - "If we take a look at the DBpedia ontology or the page of any town we already know, we discover that the property that links towns to their community is [`isPartOf`](http://dbpedia.org/ontology/isPartOf), and [the Community of Madrid is also a resource in DBpedia](http://dbpedia.org/resource/Community_of_Madrid)\n", - "\n", - "Since there are potentially many cities to get, we will limit our results to the first 10 results:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "SELECT ?localidad\n", - "WHERE {\n", - " ?localidad \n", - "}\n", - "LIMIT 10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "However, that query is very verbose because we are using full URIs.\n", - "To simplify it, we will make use of SPARQL prefixes:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX dbo: \n", - "PREFIX dbr: \n", - " \n", - "SELECT ?localidad\n", - "WHERE {\n", - " ?localidad dbo:isPartOf dbr:Community_of_Madrid.\n", - "}\n", - "LIMIT 10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To make sure that the query returned something sensible, we can test it with some python code:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "assert 'localidad' in LAST_QUERY['columns']\n", - "assert len(LAST_QUERY['tuples']) == 10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that you have some experience under your belt, it is time to design your own query.\n", - 
"\n", - "Your first task it to get a list of Spanish Novelits, using the skeleton below and the previous query to guide you.\n", - "\n", - "Pages for Spanish novelists are grouped in the *Spanish novelists* DBpedia category. You can use that fact to get your list.\n", - "In other words, the difference from the previous query will be using `dct:subject` instead of `dbo:isPartOf`, and `dbc:Spanish_novelists` instead of `dbr:Community_of_Madrid`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "d73b49b84482f51dc199b0e22763e9cc", - "grade": false, - "grade_id": "cell-7a9509ff3c34127e", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "\n", - "SELECT ?escritor\n", - "\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "LIMIT 10" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "a5aafd75ac7fa036fe5dafc4ed30c535", - "grade": true, - "grade_id": "cell-91240ded2cac7b6d", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "assert len(LAST_QUERY['columns']) == 1 # We only use one variable, ?escritor\n", - "assert len(LAST_QUERY['tuples']) == 10 # There should be 10 results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using more criteria\n", - "\n", - "We can get more than one property in the same query. Let us modify our query to get the population of the cities as well." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dbo: \n", - "PREFIX dbr: \n", - " \n", - "SELECT ?localidad ?pop ?when\n", - "\n", - "WHERE {\n", - " ?localidad dbo:populationTotal ?pop .\n", - " ?localidad dbo:isPartOf dbr:Community_of_Madrid.\n", - " ?localidad dbp:populationAsOf ?when .\n", - "}\n", - "\n", - "LIMIT 100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "assert 'localidad' in LAST_QUERY['columns']\n", - "assert 'http://dbpedia.org/resource/Parla' in LAST_QUERY['columns']['localidad']\n", - "assert ('http://dbpedia.org/resource/San_Sebastián_de_los_Reyes', '75912', '2009') in LAST_QUERY['tuples']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Time to try it yourself.\n", - "\n", - "Get the list of Spanish novelists AND their name (using rdfs:label)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "7cbf5260bbc6121b4ec1ec0f62e814c1", - "grade": false, - "grade_id": "cell-83dcaae0d09657b5", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs:\n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "\n", - "SELECT ?escritor ?name\n", - "\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "LIMIT 10" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "5c7bee95c0c08a8ede47fcaad597f51f", - "grade": true, - "grade_id": "cell-8afd28aada7a896c", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "assert 'escritor' in LAST_QUERY['columns']\n", - "assert 'http://dbpedia.org/resource/Eduardo_Mendoza_Garriga' in 
LAST_QUERY['columns']['escritor']\n", - "assert ('http://dbpedia.org/resource/Eduardo_Mendoza_Garriga', 'Eduardo Mendoza') in LAST_QUERY['tuples']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Filtering and ordering" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the previous example, we saw that we got what seemed to be duplicated answers.\n", - "\n", - "This happens because entities can have labels in different languages (e.g. English, Spanish).\n", - "To restrict the search to only those results we're interested in, we can use filtering.\n", - "\n", - "We can also decide the order in which our results are shown.\n", - "\n", - "For instance, this is how we could use filtering to get only large cities in our example, ordered by population:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dbo: \n", - "PREFIX dbr: \n", - " \n", - "SELECT ?localidad ?pop ?when\n", - "\n", - "WHERE {\n", - " ?localidad dbo:populationTotal ?pop .\n", - " ?localidad dbo:isPartOf dbr:Community_of_Madrid.\n", - " ?localidad dbp:populationAsOf ?when .\n", - " FILTER(?pop > 100000)\n", - "}\n", - "ORDER BY ?pop\n", - "LIMIT 100" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that ordering happens before limits." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "c6080c3ed1dd3e9c3a224ac74e9dedc6", - "grade": true, - "grade_id": "cell-cb7b8283568cd349", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "# We still have the biggest city\n", - "assert ('http://dbpedia.org/resource/Madrid', '3141991', '2014') in LAST_QUERY['tuples']\n", - "# But the smaller ones are gone\n", - "assert 'http://dbpedia.org/resource/Tres_Cantos' not in LAST_QUERY['columns']['localidad']\n", - "assert 'http://dbpedia.org/resource/San_Sebastián_de_los_Reyes' not in LAST_QUERY['columns']['localidad']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, try filtering to get a list of novelists and their name in Spanish, ordered by name `(FILTER (LANG(?nombre) = \"es\") y ORDER BY`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "8b1697739ecd76d45b6597a28429f13d", - "grade": false, - "grade_id": "cell-ff3d611cb0304b01", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "\n", - "SELECT ?escritor, ?nombre\n", - "\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "# YOUR CODE HERE\n", - "LIMIT 1000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "2300be1911eb9cfddc6e2a82dcb244c2", - "grade": true, - "grade_id": "cell-d70cc6ea394741bc", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "assert len(LAST_QUERY['tuples']) >= 50\n", - "assert 'Adelaida García Morales' in LAST_QUERY['columns']['nombre']\n", - "assert sum(1 for 
k in LAST_QUERY['columns']['escritor'] if k == 'http://dbpedia.org/resource/Adelaida_García_Morales') == 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dates" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From now on, we will focus on our Writers example.\n", - "\n", - "First, search for writers born in the XX century.\n", - "You can use a special filter, knowing that `\"2000\"^^xsd:date` is the first date of year 2000." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "1764314669c1e3ad131a0930fa33549c", - "grade": false, - "grade_id": "cell-ab7755944d46f9ca", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "PREFIX dbo:\n", - "\n", - "SELECT ?escritor, ?nombre, year(?fechaNac) as ?nac\n", - "\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "# YOUR CODE HERE\n", - "LIMIT 1000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "7a3c047b64ce4ffd02c87878f73f212a", - "grade": true, - "grade_id": "cell-cf3821f2d33fb0f6", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "assert 'Camilo José Cela' in LAST_QUERY['columns']['nombre']\n", - "assert 'Javier Marías' in LAST_QUERY['columns']['nombre']\n", - "assert all(int(x) > 1899 and int(x) < 2001 for x in LAST_QUERY['columns']['nac'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Optional\n", - "\n", - "In our last example, we were missing all the novelists that are missing their birth information in DBpedia.\n", - "\n", - "We can specify optional values in a query using the `OPTIONAL` keyword.\n", - "When a set of clauses are inside an 
OPTIONAL group, the SPARQL endpoint will try to use them in the query\n", - "If there are no results for that part of the query, the variables it specifies will not be bound (i.e. they will be empty).\n", - "\n", - "Using that, let us retrieve all the novelists born between 1900 and 2000, and the date they died (if they are available)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "429b902d4da0f40aefebba0ab722645e", - "grade": false, - "grade_id": "cell-254a18dd973e82ed", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "PREFIX dbo:\n", - "\n", - "SELECT ?escritor, ?nombre, ?fechaNac, ?fechaDef\n", - "\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "# YOUR CODE HERE\n", - "LIMIT 100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "a6db5e4879286b0617be04711002ad63", - "grade": true, - "grade_id": "cell-4d6a64dde67f0e11", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "assert 'Camilo José Cela' in LAST_QUERY['columns']['nombre']\n", - "assert '1916-05-11' in LAST_QUERY['columns']['fechaNac']\n", - "assert '' not in LAST_QUERY['columns']['fechaNac'] # All birthdates are defined\n", - "assert '' in LAST_QUERY['columns']['fechaDef'] # Some deathdates are not defined" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Bound" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can check whether the optional value for a key was bound in a SPARQL query using `BOUND(?key)`.\n", - "\n", - "This is very useful for two purposes.\n", - "First, it allows us to look for patterns that **do not occur** in the graph, such as missing 
properties.\n", - "For instance, we could search for the authors with missing birth information so we can add it.\n", - "Secondly, we can use bound in filters to get conditional filters.\n", - "We will explore both uses in this exercise." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get the list of Spanish novelists that are still alive.\n", - "A person is alive if their death date is not defined and the were born less than 100 years ago" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "555154c87d8722bfeacd0e5cf5abc1a7", - "grade": false, - "grade_id": "cell-474b1a72dec6827c", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "PREFIX dbo:\n", - "\n", - "SELECT ?escritor, ?nombre, year(?fechaNac) as ?nac\n", - "\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "# YOUR CODE HERE\n", - "LIMIT 1000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "fd420f3d8b7eca269eaba715b3999893", - "grade": true, - "grade_id": "cell-46b62dd2856bc919", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "assert 'Fernando Arrabal' in LAST_QUERY['columns']['nombre']\n", - "assert 'Albert Espinosa' in LAST_QUERY['columns']['nombre']\n", - "for year in LAST_QUERY['columns']['nac']:\n", - " assert int(year) >= 1918" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, get the list of Spanish novelists that died before their fifties (i.e. younger than 50 years old), or that aren't 50 years old yet.\n", - "\n", - "Hint: you can use boolean logic in your filters (e.g. 
`&&` and `||`).\n", - "\n", - "Hint 2: Some dates are not formatted properly, which makes some queries fail when they shouldn't. You might need to convert between different types as a workaround. For instance, you could get the year from a date like this: `year(xsd:dateTime(str(?date)))`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "22505aa8eab7f771bf30ed12fe13f80c", - "grade": false, - "grade_id": "cell-ceefd3c8fbd39d79", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "PREFIX dbo:\n", - "\n", - "SELECT ?escritor, ?nombre, year(?fechaNac) as ?nac, ?fechaDef\n", - "\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "# YOUR CODE HERE\n", - "LIMIT 100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "f11cf03b1c9ae7dbdaac314579b6c4bf", - "grade": true, - "grade_id": "cell-461cd6ccc6c2dc79", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "assert 'Javier Sierra' in LAST_QUERY['columns']['nombre']\n", - "assert 'http://dbpedia.org/resource/Sanmao_(author)' in LAST_QUERY['columns']['escritor']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Finding unique elements" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In our last example, our results show some authors more than once.\n", - "This is because some properties are defined more than once.\n", - "For instance, birth date is giving using different formats.\n", - "Even if we exclude that property from our results by not adding it in our `SELECT`, we will get duplicated lines.\n", - "\n", - "To solve this, we can use the `DISTINCT` keyword." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Modify your last query to remove duplicated lines.\n", - "In other words, authors should only appear once." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "1380346cba93b5641132ba21f102e116", - "grade": false, - "grade_id": "cell-2a39adc71d26ae73", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "PREFIX dbo:\n", - "\n", - "SELECT DISTINCT ?escritor, ?nombre, year(?fechaNac) as ?nac\n", - "\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "# YOUR CODE HERE\n", - "LIMIT 100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "c8e5bf05e9d050389b2f8e7f142fdab0", - "grade": true, - "grade_id": "cell-542e0e36347fd5d1", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "assert 'Javier Sierra' in LAST_QUERY['columns']['nombre']\n", - "assert 'http://dbpedia.org/resource/Albert_Espinosa' in LAST_QUERY['columns']['escritor']\n", - "\n", - "from collections import Counter\n", - "c = Counter(LAST_QUERY['columns']['nombre'])\n", - "for count in c.values():\n", - " assert count == 1\n", - " \n", - "c1 = Counter(LAST_QUERY['columns']['escritor'])\n", - "assert all(count==1 for count in c1.values())\n", - "# c = Counter(LAST_QUERY['columns']['nombre'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using other resources" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get the list of living Spanish novelists born in Madrid.\n", - "\n", - "Hint: use `dbr:Madrid` and `dbo:birthPlace`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { 
- "deletable": false, - "nbgrader": { - "checksum": "32e2c9b0ce32483960f5ca794da54fa8", - "grade": false, - "grade_id": "cell-d175e41da57c889b", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "PREFIX dbr:\n", - "PREFIX dbo:\n", - "\n", - "SELECT DISTINCT ?escritor, ?nombre, ?lugarNac, year(?fechaNac) as ?nac\n", - "\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "# YOUR CODE HERE\n", - "LIMIT 100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "db2cdda5575af942f110d85e2dbe02b5", - "grade": true, - "grade_id": "cell-fadd095862db6bc8", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "assert 'José Ángel Mañas' in LAST_QUERY['columns']['nombre']\n", - "assert 'http://dbpedia.org/resource/Madrid' in LAST_QUERY['columns']['lugarNac']\n", - "MADRID_QUERY = LAST_QUERY['columns'].copy()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Traversing the graph" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get the list of works of the authors in the previous query (i.e. authors born in Madrid), if they have any.\n", - "\n", - "Hint: use `dbo:author`, which is a **property of a literary work** that points to the author." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "abd3d09bdf5801d6d0b27d80326dfead", - "grade": false, - "grade_id": "cell-e4b99af9ef91ff6f", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "PREFIX dbr:\n", - "PREFIX dbo:\n", - "\n", - "SELECT DISTINCT ?escritor, ?nombre, ?obra\n", - "\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "# YOUR CODE HERE\n", - "LIMIT 10000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "d1305aa44456d51e3c52d78a9381f73a", - "grade": true, - "grade_id": "cell-68661b73c2140e4f", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "assert 'http://dbpedia.org/resource/A_Heart_So_White' in LAST_QUERY['columns']['obra']\n", - "assert 'http://dbpedia.org/resource/Tomorrow_in_the_Battle_Think_on_Me' in LAST_QUERY['columns']['obra']\n", - "assert '' in LAST_QUERY['columns']['obra'] # Some authors don't have works in dbpedia" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also get a list of the works in string format using GROUP_CONCAT.\n", - "For instance, `GROUP_CONCAT(?obra, \",\")`, to separate works with a comma.\n", - "\n", - "Try it yourself:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "f0ab8a246687b926fb919abbafaf3b53", - "grade": false, - "grade_id": "cell-e13fae23ccb78bb8", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "PREFIX dbr:\n", - "PREFIX dbo:\n", - "\n", - "# YOUR CODE HERE\n", - 
"\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "# YOUR CODE HERE\n", - "LIMIT 10000" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Traversing the graph" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get a list of living Spanish novelists born in Madrid, their name in Spanish, a link to their foto and a website (if they have one).\n", - "\n", - "If the query is right, you should see a list of writers after running the test code.\n", - "\n", - "Hint: `foaf:depiction` and `foaf: homepage`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "4ffc5d79f79c2079e93843838e91e053", - "grade": false, - "grade_id": "cell-b1f71c67dd71dad4", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "PREFIX dbr:\n", - "PREFIX dbo:\n", - "\n", - "SELECT ?escritor ?web ?foto\n", - "\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "ORDER BY ?nombre\n", - "LIMIT 100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "bc497e6eaebe05e31248e3479df43c0c", - "grade": true, - "grade_id": "cell-8b8ba7cca701c652", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "fotos = set(filter(lambda x: x != '', LAST_QUERY['columns']['foto']))\n", - "assert len(fotos) > 2\n", - "show_photos(fotos) #show the pictures of the writers!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Union" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can merge the results of several queries, just like using `JOIN` in SQL.\n", - "The keyword in SPARQL is `UNION`, because we are merging graphs.\n", - "\n", - "`UNION` is useful in many situations.\n", - "For instance, when there are equivalent properties, or when you want to use two search terms and FILTER would be too inefficient.\n", - "\n", - "The syntax is as follows:\n", - "\n", - "```sparql\n", - "SELECT ?title\n", - "WHERE {\n", - " { ?book dc10:title ?title }\n", - " UNION\n", - " { ?book dc11:title ?title }\n", - " \n", - " ... REST OF YOUR QUERY ...\n", - "\n", - "}\n", - "```\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using UNION, get a list of distinct spanish novelists AND poets.\n", - "\n", - "Hint: Category: Spanish_poets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "5606810420d8cd259da74a3cc17fa824", - "grade": false, - "grade_id": "cell-21eb6323b6d0011d", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "PREFIX dbr:\n", - "PREFIX dbo:\n", - "\n", - "SELECT DISTINCT ?escritor, ?nombre\n", - "\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "# YOUR CODE HERE\n", - "LIMIT 10000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "eec248e71a855a5e713d31ae470f3fd4", - "grade": true, - "grade_id": "cell-004e021e877c6ace", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "assert 'Garcilaso de la Vega' in LAST_QUERY['columns']['nombre']" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also get the count of results either by inspecting the result (we will not cover this) or by aggregating the results using the `COUNT` operation.\n", - "\n", - "The syntax is:\n", - " \n", - "```sparql\n", - "SELECT COUNT(?variable) as ?count_name\n", - "```\n", - "\n", - "Try it yourself with our previous example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "2452c6213ad156deb5adbcfaeef74b8b", - "grade": false, - "grade_id": "cell-e35414e191c5bf16", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "PREFIX dbr:\n", - "PREFIX dbo:\n", - "\n", - "# YOUR CODE HERE\n", - "\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "LIMIT 10000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "f8b76d57ce959522a3914a442835393a", - "grade": true, - "grade_id": "cell-7a7ef8255a5662e2", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "assert len(LAST_QUERY['columns']) == 1\n", - "column_name = list(LAST_QUERY['columns'].keys())[0]\n", - "assert int(LAST_QUERY['columns'][column_name][0]) > 200" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Regular expressions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The last SPARQL concept we will cover are [regular expressions](https://www.w3.org/TR/rdf-sparql-query/#funcex-regex) (`regex`).\n", - "Regular expressions are a very powerful tool, but we will only cover the basics in this exercise.\n", - "\n", - "In essence, regular expressions match strings against patterns.\n", - "In their simplest form, they can be used to 
find substrings within a variable.\n", - "For instance, using `regex(?label, \"substring\")` would only match if and only if the `?label` variable contains `substring`.\n", - "But regular expressions can be more complex than that.\n", - "For instance, we can find patterns such as: a 10 digit number, a 5 character long string, or variables without whitespaces.\n", - "\n", - "The syntax of the regex function is the following:\n", - "\n", - "```\n", - "regex(?variable, \"pattern\", \"flags\")\n", - "```\n", - "\n", - "Flags are optional configuration options for the regular expression, such as *do not care about case* (`i` flag).\n", - "\n", - "As an example, let us find the cities in Madrid that contain \"de\" in their name." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "SELECT ?localidad\n", - "WHERE {\n", - " ?localidad .\n", - " ?localidad rdfs:label ?nombre .\n", - " FILTER (lang(?nombre) = \"es\" ).\n", - " FILTER regex(?nombre, \"de\", \"i\")\n", - "}\n", - "LIMIT 10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, use regular expressions to find Spanish novelists whose **first name** is Juan.\n", - "In other words, their name **starts with** \"Juan\"." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "580570dba869801272f9948f1e901bfd", - "grade": false, - "grade_id": "cell-a57d3546a812f689", - "locked": false, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "%%sparql\n", - "\n", - "PREFIX rdfs: \n", - "PREFIX dct:\n", - "PREFIX dbc:\n", - "PREFIX dbr:\n", - "PREFIX dbo:\n", - "\n", - "# YOUR CODE HERE\n", - "\n", - "WHERE {\n", - "# YOUR CODE HERE\n", - "}\n", - "# YOUR CODE HERE\n", - "LIMIT 1000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false, - "nbgrader": { - "checksum": "6632242d1d5055e12c3df37941b9e434", - "grade": true, - "grade_id": "cell-c149fe65008f39a9", - "locked": true, - "points": 0, - "schema_version": 1, - "solution": false - } - }, - "outputs": [], - "source": [ - "assert len(LAST_QUERY['columns']['nombre']) > 15\n", - "for i in LAST_QUERY['columns']['nombre']:\n", - " assert 'Juan' in i\n", - "assert \"Robert Juan-Cantavella\" not in LAST_QUERY['columns']['nombre']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Additional exercises" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Find out if there are more dbpedia entries for writers (dbo:Writer) than for football players (dbo:SoccerPlayers)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get a list of European countries with a population higher than 20 million, in decreasing order of population, including their URI, name in English and population." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Find the country in the world that speaks the most languages. Show its name in Spanish, if available." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Querying custom data\n", - "\n", - "In the last part of this course, we will query the data annotated in the previous course on RDF.\n", - "\n", - "The goal is to try SPARQL with data annotated by users with limited knowledge of vocabularies and semantics, and to compare the experience with similar queries to a more structured dataset.\n", - "\n", - "Hence, there are two parts.\n", - "First, you will query a set of graphs annotated by students of this course.\n", - "Then, you will query a synthetic dataset that contains similar information." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In particular, you need to run five queries, each one will answer one of the following questions:\n", - "\n", - "* Number of hotels (or entities) with reviews\n", - "* Number of reviews\n", - "* The hotel with the lowest average score\n", - "* The hotel with the highest average score\n", - "* A list of hotels with their addresses and telephone numbers" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Manually annotated" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Querying the manually annotated dataset is slightly different from querying DBpedia.\n", - "The main difference is that this dataset uses different graphs to separate the annotations from different students.\n", - "\n", - "**Each graph is a separate set of triples**.\n", - "For this exercise, you could think of graphs as individual endpoints.\n", - "\n", - "\n", - "First, let us get a list of graphs available:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sparql http://fuseki.cluster.gsi.dit.upm.es/ejerciciohoteles\n", - " \n", - "SELECT ?g WHERE {\n", - " GRAPH ?g {\n", - " ?s ?p ?o .\n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once you 
have this list, you can query specific graphs like so:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sparql http://fuseki.cluster.gsi.dit.upm.es/ejerciciohoteles\n", - " \n", - "SELECT *\n", - "WHERE {\n", - " GRAPH {\n", - " ?s ?p ?o .\n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, design five queries to answer the questions in the description, and run each of them in at least five of these graphs.\n", - "\n", - "You can manually run the queries or use the code below, where you only need to specify your queries and the graphs you have identified.\n", - "\n", - "If you need additional prefixes, feel free to modify the TEMPLATE variable." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import display\n", - "\n", - "QUERIES = {\n", - " 'highest score': '''\n", - " ?s ?p ?o\n", - "''',\n", - " 'lowest score': '''\n", - " ?s ?p ?o\n", - " ''',\n", - " 'number of hotels': '''\n", - " ?s ?p ?o\n", - " ''',\n", - " 'number of reviews': '''\n", - " ?s ?p ?o\n", - " ''',\n", - " 'telephones and addresses': '''\n", - " ?s ?p ?o\n", - " ''',\n", - " \n", - "}\n", - "\n", - "TEMPLATE = '''\n", - "SELECT * WHERE {{\n", - " GRAPH <{graph}>{{\n", - " {query}\n", - " }}\n", - " }}\n", - "'''\n", - "\n", - "GRAPHS = ['http://fuseki.cluster.gsi.dit.upm.es/36de86e6754934381d935f10618fe985',\n", - " ]\n", - "\n", - "for name, query in QUERIES.items():\n", - " for graph in GRAPHS:\n", - " print(name, '@', graph)\n", - " display(sparql('http://fuseki.cluster.gsi.dit.upm.es/ejerciciohoteles', TEMPLATE.format(graph=graph,\n", - " query=query)\n", - " ))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Synthetic dataset\n", - "\n", - "Now, run the same queries in the synthetic dataset.\n", - "\n", - "The query below should get you started:" - ] - }, 
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%sparql http://fuseki.cluster.gsi.dit.upm.es/hotelessintetico \n", - "\n", - "SELECT *\n", - "WHERE {\n", - " ?s ?p ?o .\n", - "}\n", - "LIMIT 10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Discussion\n", - "\n", - "Compare the results of the synthetic and the manual dataset, and answer these questions:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Both datasets should use the same schema. Are there any differences when it comes to using them?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "11e7e2b7d3dfb45f9534506761f896f9", - "grade": true, - "grade_id": "cell-9bd08e4f5842cb89", - "locked": false, - "points": 0, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "# YOUR CODE HERE" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Are data correctly annotated in both datasets?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "f676f18c71297e8429448fa0f0833db1", - "grade": true, - "grade_id": "cell-9dc1c9033198bb18", - "locked": false, - "points": 0, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "# YOUR CODE HERE" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Has any of the datasets been harder to query? Why?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "2a24b20a338d18f4879540f5e03f5889", - "grade": true, - "grade_id": "cell-0e63b8e9dcb24676", - "locked": false, - "points": 0, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "# YOUR CODE HERE" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Has any of the datasets been harder to query? Why" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "2ec2cf74959db9112c189a4e7a0b3609", - "grade": true, - "grade_id": "cell-6c18003ced54be23", - "locked": false, - "points": 0, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "# YOUR CODE HERE" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Are data correctly annotated in both datasets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "nbgrader": { - "checksum": "4a062d17043e5459a48314b1177cb8f1", - "grade": true, - "grade_id": "cell-cdce24ef5f581981", - "locked": false, - "points": 0, - "schema_version": 1, - "solution": true - } - }, - "outputs": [], - "source": [ - "# YOUR CODE HERE" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## References" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* [RDFLib documentation](https://rdflib.readthedocs.io/en/stable/).\n", - "* [Wikidata Query Service query examples](https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Licence\n", - "The notebook is freely licensed under under the [Creative Commons Attribution Share-Alike license](https://creativecommons.org/licenses/by/2.0/). \n", - "\n", - "© 2018 Universidad Politécnica de Madrid." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/lod/extract_data.py b/lod/extract_data.py deleted file mode 100644 index cb1da9b..0000000 --- a/lod/extract_data.py +++ /dev/null @@ -1,49 +0,0 @@ - -import sys -from future.standard_library import install_aliases -install_aliases() - -from urllib import request, parse -from rdflib import Graph, term -from lxml import etree - -if len(sys.argv) < 2: - print('Usage: python {} '.format(sys.argv[0])) - print('') - print('Extract rdfa, microdata and json-ld annotations from a website') - exit(1) - -url = sys.argv[1] - -g = Graph() -g.parse(url, format='rdfa') -g.parse(url, format='microdata') - - -def sanitize_triple(t): - """Function to remove bad URIs from the graph that would otherwise - make the serialization fail.""" - def sanitize_triple_item(item): - if isinstance(item, term.URIRef) and '/' not in item: - return term.URIRef(parse.quote(str(item))) - return item - - return (sanitize_triple_item(t[0]), - sanitize_triple_item(t[1]), - sanitize_triple_item(t[2])) - - -with request.urlopen(url) as response: - # Get all json-ld objects embedded in the html file - html = response.read().decode('utf-8', errors='ignore') - parser = etree.XMLParser(recover=True) - root = etree.fromstring(html, parser=parser) - if root: - for jsonld in root.findall(".//script[@type='application/ld+json']"): - g.parse(data=jsonld.text, publicID=url, format='json-ld') - - -fixedgraph = Graph() -fixedgraph += [sanitize_triple(s) for s in g] - -print(g.serialize(format='turtle').decode('utf-8', errors='ignore')) diff --git a/lod/helpers.py b/lod/helpers.py index 
b92f3d2..7798189 100644 --- a/lod/helpers.py +++ b/lod/helpers.py @@ -1,12 +1,22 @@ +''' +Helper functions and ipython magic for the SPARQL exercises. + +The tests in the notebooks rely on the `LAST_QUERY` variable, which is updated by the `%%sparql` magic after every query. +This variable contains the full query used (`LAST_QUERY["query"]`), the endpoint it was sent to (`LAST_QUERY["endpoint"]`), and a dictionary with the response of the endpoint (`LAST_QUERY["results"]`). +For convenience, the results are also given as tuples (`LAST_QUERY["tuples"]`), and as a dictionary of `{column:[values]}` (`LAST_QUERY["columns"]`). +''' from IPython.core.magic import (register_line_magic, register_cell_magic, register_line_cell_magic) - -from IPython.display import HTML, display, Image +from IPython.display import HTML, display, Image, display_javascript from urllib.request import Request, urlopen from urllib.parse import quote_plus, urlencode from urllib.error import HTTPError import json +import sys + +js = "IPython.CodeCell.options_default.highlight_modes['magic_sparql'] = {'reg':[/^%%sparql/]};" +display_javascript(js, raw=True) def send_query(query, endpoint): @@ -20,7 +30,11 @@ def send_query(query, endpoint): headers={'content-type': 'application/x-www-form-urlencoded', 'accept': FORMATS}, method='POST') - return json.loads(urlopen(r).read().decode('utf-8')); + res = urlopen(r) + data = res.read().decode('utf-8') + if res.getcode() == 200: + return json.loads(data) + raise Exception('Error getting results: {}'.format(data)) def tabulate(tuples, header=None): @@ -39,11 +53,14 @@ def tabulate(tuples, header=None): LAST_QUERY = {} +def solution(): + return LAST_QUERY + def query(query, endpoint=None, print_table=False): global LAST_QUERY - endpoint = endpoint or "http://dbpedia.org/sparql" + endpoint = endpoint or "http://fuseki.cluster.gsi.dit.upm.es/sitc/" results = send_query(query, endpoint) tuples = to_table(results) @@ -80,12 +97,30 @@ def to_table(results): 
@register_cell_magic def sparql(line, cell): + ''' + Sparql magic command for ipython. It can be used in a cell like this: + + ``` + %%sparql + + ... Your SPARQL query ... + + ``` + + By default, it will use the default endpoint set in `query` (`http://fuseki.cluster.gsi.dit.upm.es/sitc/`), but you can use a different endpoint like this: + + ``` + %%sparql http://my-sparql-endpoint... + + ... Your SPARQL query ... + ``` + ''' try: return query(cell, endpoint=line, print_table=True) except HTTPError as ex: error_message = ex.read().decode('utf-8') print('Error {}. Reason: {}'.format(ex.status, ex.reason)) - print(error_message) + print(error_message, file=sys.stderr) def show_photos(values): diff --git a/lod/reviews.ttl b/lod/reviews.ttl deleted file mode 100644 index d5246a9..0000000 --- a/lod/reviews.ttl +++ /dev/null @@ -1,29 +0,0 @@ -@prefix rdf: . -@prefix rdfs: . -@prefix schema: . - - -_:Hotel1 a schema:Hotel ; - schema:description "A fictitious hotel" . - - -_:Review1 a schema:Review ; - schema:reviewBody "This is a great review" ; - schema:reviewRating [ - a schema:Rating ; - schema:author ; - schema:ratingValue "0.7" - - ] ; - schema:itemReviewed _:Hotel1 . - - -_:Review2 a schema:Review ; - schema:reviewBody "This is a not so great review" ; - schema:reviewRating [ - a schema:Rating ; - schema:author [ a schema:Person ; - schema:givenName "anonymous" ] ; - schema:ratingValue "0.3" - ] ; - schema:itemReviewed _:Hotel1 . \ No newline at end of file diff --git a/lod/sparql.py b/lod/sparql.py deleted file mode 100644 index 6ed5dd8..0000000 --- a/lod/sparql.py +++ /dev/null @@ -1,23 +0,0 @@ -# !/bin/env python # -# Ejemplo de consultas SPARQL sobre turtle # -# python consultas.py # -import rdflib -import sys - -dataset = sys.argv[1] if len(sys.argv) > 1 else 'reviews.ttl' -g = rdflib.Graph() - -schema = rdflib.Namespace("http://schema.org/") - -# Read Turtle file # -g.parse(dataset, format='turtle') - -results = g.query( - """SELECT DISTINCT ?review ?p ?o - WHERE { - ?review a schema:Review. - ?review ?p ?o. 
- }""", initNs={'schema': schema}) - -for row in results: - print("%s %s %s" % row) diff --git a/lod/validate.py b/lod/validate.py deleted file mode 100644 index 99db6f3..0000000 --- a/lod/validate.py +++ /dev/null @@ -1,6 +0,0 @@ -import rdflib -import sys -g = rdflib.Graph() -dataset = sys.argv[1] if len(sys.argv) > 1 else 'reviews.ttl' -g.parse(dataset, format="n3") -print(g.serialize(format="n3").decode('utf-8'))