1
0
mirror of https://github.com/gsi-upm/sitc synced 2025-08-23 18:22:19 +00:00

Remove outputs and metadata

This commit is contained in:
J. Fernando Sánchez
2019-02-28 15:30:33 +01:00
parent a1be167cc0
commit c1d3ca38ea
25 changed files with 989 additions and 14268 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -84,25 +84,9 @@
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 5\n",
"1 10\n",
"2 15\n",
"dtype: int64"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
@@ -124,25 +108,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"a 5\n",
"b 10\n",
"c 15\n",
"dtype: int64"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"d = {'a': 5, 'b': 10, 'c': 15}\n",
"s = Series(d)\n",
@@ -151,22 +119,9 @@
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['a', 'b', 'c'], dtype='object')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We can get the list of indexes\n",
"s.index"
@@ -174,22 +129,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 5, 10, 15])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# and the values\n",
"s.values"
@@ -204,28 +146,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Madrid 3141991\n",
"Barcelona 1604555\n",
"Valencia 786189\n",
"Sevilla 693878\n",
"Zaragoza 664953\n",
"Malaga 569130\n",
"dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Series with the 2015 population of the most populated cities in Spain\n",
"s = Series([3141991, 1604555, 786189, 693878, 664953, 569130], index=['Madrid', 'Barcelona', 'Valencia', 'Sevilla', \n",
@@ -235,22 +158,9 @@
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"3141991"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Population of Madrid\n",
"s['Madrid']"
@@ -272,28 +182,9 @@
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Madrid True\n",
"Barcelona True\n",
"Valencia False\n",
"Sevilla False\n",
"Zaragoza False\n",
"Malaga False\n",
"dtype: bool"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Boolean condition\n",
"s > 1000000"
@@ -301,24 +192,9 @@
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Madrid 3141991\n",
"Barcelona 1604555\n",
"dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cities with population greater than 1.000.000\n",
"s[s > 1000000]"
@@ -333,24 +209,9 @@
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Madrid 3141991\n",
"Barcelona 1604555\n",
"dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cities with population greater than the mean\n",
"s[s > s.mean()]"
@@ -358,25 +219,9 @@
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Madrid 3141991\n",
"Barcelona 1604555\n",
"Valencia 786189\n",
"dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cities with population greater than the median\n",
"s[s > s.median()]"
@@ -384,28 +229,9 @@
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Madrid True\n",
"Barcelona True\n",
"Valencia True\n",
"Sevilla False\n",
"Zaragoza False\n",
"Malaga False\n",
"dtype: bool"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check cities with a population greater than 700.000\n",
"s > 700000"
@@ -413,25 +239,9 @@
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Madrid 3141991\n",
"Barcelona 1604555\n",
"Valencia 786189\n",
"dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List cities with a population greater than 700.000\n",
"s[s > 700000]"
@@ -439,28 +249,9 @@
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Madrid True\n",
"Barcelona True\n",
"Valencia True\n",
"Sevilla False\n",
"Zaragoza False\n",
"Malaga False\n",
"dtype: bool"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Another way to write the same boolean indexing selection\n",
"bigger_than_700000 = s > 700000\n",
@@ -469,25 +260,9 @@
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Madrid 3141991\n",
"Barcelona 1604555\n",
"Valencia 786189\n",
"dtype: int64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Cities with population > 700000\n",
"s[bigger_than_700000]"
@@ -509,28 +284,9 @@
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Madrid 1570995.5\n",
"Barcelona 802277.5\n",
"Valencia 393094.5\n",
"Sevilla 346939.0\n",
"Zaragoza 332476.5\n",
"Malaga 284565.0\n",
"dtype: float64"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Divide population by 2\n",
"s / 2"
@@ -538,22 +294,9 @@
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1243449.3333333333"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get the average population\n",
"s.mean()"
@@ -561,22 +304,9 @@
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"3141991"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get the highest population\n",
"s.max()"
@@ -598,28 +328,9 @@
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Madrid 3320000\n",
"Barcelona 1604555\n",
"Valencia 786189\n",
"Sevilla 693878\n",
"Zaragoza 664953\n",
"Malaga 569130\n",
"dtype: int64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Change population of one city\n",
"s['Madrid'] = 3320000\n",
@@ -628,28 +339,9 @@
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Madrid 3652000.0\n",
"Barcelona 1765010.5\n",
"Valencia 864807.9\n",
"Sevilla 693878.0\n",
"Zaragoza 664953.0\n",
"Malaga 569130.0\n",
"dtype: float64"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Increase by 10% cities with population greater than 700000\n",
"s[s > 700000] = 1.1 * s[s > 700000]\n",
@@ -672,61 +364,9 @@
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>one</th>\n",
" <th>two</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <td>3.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d</th>\n",
" <td>NaN</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" one two\n",
"a 1.0 1.0\n",
"b 2.0 2.0\n",
"c 3.0 3.0\n",
"d NaN 4.0"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We are going to create a DataFrame from a dict of Series\n",
"d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),\n",
@@ -748,55 +388,9 @@
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>one</th>\n",
" <th>two</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>d</th>\n",
" <td>NaN</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" one two\n",
"d NaN 4.0\n",
"b 2.0 2.0\n",
"a 1.0 1.0"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We can filter\n",
"df = DataFrame(d, index=['d', 'b', 'a'])\n",
@@ -812,55 +406,9 @@
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>two</th>\n",
" <th>three</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>d</th>\n",
" <td>4.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" two three\n",
"d 4.0 NaN\n",
"b 2.0 NaN\n",
"a 1.0 NaN"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])\n",
"df"

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@@ -46,10 +46,8 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
@@ -82,9 +80,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": []
},
@@ -105,9 +101,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": []
},
@@ -121,9 +115,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": []
},
@@ -137,9 +129,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": []
},
@@ -153,17 +143,13 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"metadata": {},
"source": [
"How many passengers have survived? List them grouped by Sex and Pclass.\n",
"\n",
@@ -173,17 +159,13 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"metadata": {},
"source": [
"Visualise df_1 as a histogram."
]
@@ -191,17 +173,13 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"metadata": {},
"source": [
"# Feature Engineering"
]
@@ -232,9 +210,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"df['FamilySize'] = df['SibSp'] + df['Parch']\n",
@@ -258,9 +234,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"df['Alone'] = (df.FamilySize == 0)\n",
@@ -284,9 +258,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"#Taken from http://www.analyticsvidhya.com/blog/2014/09/data-munging-python-using-pandas-baby-steps-python/\n",
@@ -307,9 +279,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"df['Salutation'].unique()"
@@ -318,9 +288,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"df.groupby(['Salutation']).size()"
@@ -336,9 +304,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"def group_salutation(old_salutation):\n",
@@ -362,9 +328,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"# Distribution\n",
@@ -375,9 +339,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"df.boxplot(column='Age', by = 'Salutation', sym='k.')"
@@ -393,9 +355,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"# Specific features for Children and Female since there are more survivors\n",
@@ -413,9 +373,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"# Group ages to simplify machine learning algorithms. 0: 0-5, 1: 6-10, 2: 11-15, 3: 16-59 and 4: 60-80\n",
@@ -437,10 +395,8 @@
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def substrings_in_string(big_string, substrings):\n",
@@ -475,9 +431,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"df['FarePerPerson']= df['Fare'] / (df['FamilySize'] + 1)"
@@ -500,9 +454,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"df['AgeClass']=df['Age']*df['Pclass']"

File diff suppressed because one or more lines are too long

View File

@@ -19,11 +19,10 @@ samples.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.learning_curve import learning_curve
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
@@ -53,7 +52,7 @@ def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
cv : integer, cross-validation generator, optional
If an integer is passed, it is the number of folds (defaults to 3).
Specific cross-validation objects can be passed, see
sklearn.cross_validation module for the list of possible objects
sklearn.model_selection module for the list of possible objects
n_jobs : integer, optional
Number of jobs to run in parallel (default 1).