mirror of
https://github.com/gsi-upm/sitc
synced 2024-11-18 04:22:28 +00:00
674 lines
20 KiB
Plaintext
674 lines
20 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "skip"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"![](images/EscUpmPolit_p.gif \"UPM\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "skip"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"# Course Notes for Learning Intelligent Systems"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "skip"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"Department of Telematic Engineering Systems, Universidad Politécnica de Madrid, © Carlos A. Iglesias"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "skip"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"## [Introduction to Preprocessing](00_Intro_Preprocessing.ipynb)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "slide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"# Datacleaner\n",
|
|||
|
"[Datacleaner](https://github.com/rhiever/datacleaner) supports:\n",
|
|||
|
"\n",
|
|||
|
"* dropping rows with missing values\n",
|
|||
|
"* replacing missing values with the mode or median on a column-by-column basis\n",
|
|||
|
"* encoding non-numeric variables with numerical equivalents\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"Install with\n",
|
|||
|
"\n",
|
|||
|
"**pip install datacleaner**"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "slide"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>PassengerId</th>\n",
|
|||
|
" <th>Survived</th>\n",
|
|||
|
" <th>Pclass</th>\n",
|
|||
|
" <th>Name</th>\n",
|
|||
|
" <th>Sex</th>\n",
|
|||
|
" <th>Age</th>\n",
|
|||
|
" <th>SibSp</th>\n",
|
|||
|
" <th>Parch</th>\n",
|
|||
|
" <th>Ticket</th>\n",
|
|||
|
" <th>Fare</th>\n",
|
|||
|
" <th>Cabin</th>\n",
|
|||
|
" <th>Embarked</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>Braund, Mr. Owen Harris</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>22.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>A/5 21171</td>\n",
|
|||
|
" <td>7.2500</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>S</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>38.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>PC 17599</td>\n",
|
|||
|
" <td>71.2833</td>\n",
|
|||
|
" <td>C85</td>\n",
|
|||
|
" <td>C</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>Heikkinen, Miss. Laina</td>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>26.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>STON/O2. 3101282</td>\n",
|
|||
|
" <td>7.9250</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>S</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>35.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>113803</td>\n",
|
|||
|
" <td>53.1000</td>\n",
|
|||
|
" <td>C123</td>\n",
|
|||
|
" <td>S</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>Allen, Mr. William Henry</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>35.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>373450</td>\n",
|
|||
|
" <td>8.0500</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>S</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>886</th>\n",
|
|||
|
" <td>887</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>Montvila, Rev. Juozas</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>27.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>211536</td>\n",
|
|||
|
" <td>13.0000</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>S</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>887</th>\n",
|
|||
|
" <td>888</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>Graham, Miss. Margaret Edith</td>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>19.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>112053</td>\n",
|
|||
|
" <td>30.0000</td>\n",
|
|||
|
" <td>B42</td>\n",
|
|||
|
" <td>S</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>888</th>\n",
|
|||
|
" <td>889</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>W./C. 6607</td>\n",
|
|||
|
" <td>23.4500</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>S</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>889</th>\n",
|
|||
|
" <td>890</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>Behr, Mr. Karl Howell</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>26.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>111369</td>\n",
|
|||
|
" <td>30.0000</td>\n",
|
|||
|
" <td>C148</td>\n",
|
|||
|
" <td>C</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>890</th>\n",
|
|||
|
" <td>891</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>Dooley, Mr. Patrick</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>32.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>370376</td>\n",
|
|||
|
" <td>7.7500</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>Q</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>891 rows × 12 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" PassengerId Survived Pclass \\\n",
|
|||
|
"0 1 0 3 \n",
|
|||
|
"1 2 1 1 \n",
|
|||
|
"2 3 1 3 \n",
|
|||
|
"3 4 1 1 \n",
|
|||
|
"4 5 0 3 \n",
|
|||
|
".. ... ... ... \n",
|
|||
|
"886 887 0 2 \n",
|
|||
|
"887 888 1 1 \n",
|
|||
|
"888 889 0 3 \n",
|
|||
|
"889 890 1 1 \n",
|
|||
|
"890 891 0 3 \n",
|
|||
|
"\n",
|
|||
|
" Name Sex Age SibSp \\\n",
|
|||
|
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
|
|||
|
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
|
|||
|
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
|
|||
|
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
|
|||
|
"4 Allen, Mr. William Henry male 35.0 0 \n",
|
|||
|
".. ... ... ... ... \n",
|
|||
|
"886 Montvila, Rev. Juozas male 27.0 0 \n",
|
|||
|
"887 Graham, Miss. Margaret Edith female 19.0 0 \n",
|
|||
|
"888 Johnston, Miss. Catherine Helen \"Carrie\" female NaN 1 \n",
|
|||
|
"889 Behr, Mr. Karl Howell male 26.0 0 \n",
|
|||
|
"890 Dooley, Mr. Patrick male 32.0 0 \n",
|
|||
|
"\n",
|
|||
|
" Parch Ticket Fare Cabin Embarked \n",
|
|||
|
"0 0 A/5 21171 7.2500 NaN S \n",
|
|||
|
"1 0 PC 17599 71.2833 C85 C \n",
|
|||
|
"2 0 STON/O2. 3101282 7.9250 NaN S \n",
|
|||
|
"3 0 113803 53.1000 C123 S \n",
|
|||
|
"4 0 373450 8.0500 NaN S \n",
|
|||
|
".. ... ... ... ... ... \n",
|
|||
|
"886 0 211536 13.0000 NaN S \n",
|
|||
|
"887 0 112053 30.0000 B42 S \n",
|
|||
|
"888 2 W./C. 6607 23.4500 NaN S \n",
|
|||
|
"889 0 111369 30.0000 C148 C \n",
|
|||
|
"890 0 370376 7.7500 NaN Q \n",
|
|||
|
"\n",
|
|||
|
"[891 rows x 12 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"\n",
|
|||
|
"from datacleaner import autoclean\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv('https://raw.githubusercontent.com/gsi-upm/sitc/master/ml2/data-titanic/train.csv')\n",
|
|||
|
"df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 12,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "slide"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>PassengerId</th>\n",
|
|||
|
" <th>Survived</th>\n",
|
|||
|
" <th>Pclass</th>\n",
|
|||
|
" <th>Name</th>\n",
|
|||
|
" <th>Sex</th>\n",
|
|||
|
" <th>Age</th>\n",
|
|||
|
" <th>SibSp</th>\n",
|
|||
|
" <th>Parch</th>\n",
|
|||
|
" <th>Ticket</th>\n",
|
|||
|
" <th>Fare</th>\n",
|
|||
|
" <th>Cabin</th>\n",
|
|||
|
" <th>Embarked</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>108</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>22.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>523</td>\n",
|
|||
|
" <td>7.2500</td>\n",
|
|||
|
" <td>47</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>190</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>38.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>596</td>\n",
|
|||
|
" <td>71.2833</td>\n",
|
|||
|
" <td>81</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>353</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>26.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>669</td>\n",
|
|||
|
" <td>7.9250</td>\n",
|
|||
|
" <td>47</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>272</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>35.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>49</td>\n",
|
|||
|
" <td>53.1000</td>\n",
|
|||
|
" <td>55</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>15</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>35.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>472</td>\n",
|
|||
|
" <td>8.0500</td>\n",
|
|||
|
" <td>47</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>886</th>\n",
|
|||
|
" <td>887</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>548</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>27.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>101</td>\n",
|
|||
|
" <td>13.0000</td>\n",
|
|||
|
" <td>47</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>887</th>\n",
|
|||
|
" <td>888</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>303</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>19.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>14</td>\n",
|
|||
|
" <td>30.0000</td>\n",
|
|||
|
" <td>30</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>888</th>\n",
|
|||
|
" <td>889</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>413</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>28.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>675</td>\n",
|
|||
|
" <td>23.4500</td>\n",
|
|||
|
" <td>47</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>889</th>\n",
|
|||
|
" <td>890</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>81</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>26.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>30.0000</td>\n",
|
|||
|
" <td>60</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>890</th>\n",
|
|||
|
" <td>891</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>220</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>32.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>466</td>\n",
|
|||
|
" <td>7.7500</td>\n",
|
|||
|
" <td>47</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>891 rows × 12 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket \\\n",
|
|||
|
"0 1 0 3 108 1 22.0 1 0 523 \n",
|
|||
|
"1 2 1 1 190 0 38.0 1 0 596 \n",
|
|||
|
"2 3 1 3 353 0 26.0 0 0 669 \n",
|
|||
|
"3 4 1 1 272 0 35.0 1 0 49 \n",
|
|||
|
"4 5 0 3 15 1 35.0 0 0 472 \n",
|
|||
|
".. ... ... ... ... ... ... ... ... ... \n",
|
|||
|
"886 887 0 2 548 1 27.0 0 0 101 \n",
|
|||
|
"887 888 1 1 303 0 19.0 0 0 14 \n",
|
|||
|
"888 889 0 3 413 0 28.0 1 2 675 \n",
|
|||
|
"889 890 1 1 81 1 26.0 0 0 8 \n",
|
|||
|
"890 891 0 3 220 1 32.0 0 0 466 \n",
|
|||
|
"\n",
|
|||
|
" Fare Cabin Embarked \n",
|
|||
|
"0 7.2500 47 2 \n",
|
|||
|
"1 71.2833 81 0 \n",
|
|||
|
"2 7.9250 47 2 \n",
|
|||
|
"3 53.1000 55 2 \n",
|
|||
|
"4 8.0500 47 2 \n",
|
|||
|
".. ... ... ... \n",
|
|||
|
"886 13.0000 47 2 \n",
|
|||
|
"887 30.0000 30 2 \n",
|
|||
|
"888 23.4500 47 2 \n",
|
|||
|
"889 30.0000 60 0 \n",
|
|||
|
"890 7.7500 47 1 \n",
|
|||
|
"\n",
|
|||
|
"[891 rows x 12 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 12,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df_clean = autoclean(df, copy=True)\n",
|
|||
|
"df_clean"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "skip"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"# References\n",
|
|||
|
"* [Cleaning and Prepping Data with Python for Data Science — Best Practices and Helpful Packages](https://medium.com/@rrfd/cleaning-and-prepping-data-with-python-for-data-science-best-practices-and-helpful-packages-af1edfbe2a3), DeFilippi, 2019.\n",
|
|||
|
"* [Data Preprocessing for Machine learning in Python, GeeksForGeeks](https://www.geeksforgeeks.org/data-preprocessing-machine-learning-python/), A. Sharma, 2018.\n",
|
|||
|
"* [Handy Python Libraries for Formatting and Cleaning Data](https://mode.com/blog/python-data-cleaning-libraries), M. Bierly, 2016.\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "skip"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"## Licence\n",
|
|||
|
"The notebook is freely licensed under the [Creative Commons Attribution Share-Alike license](https://creativecommons.org/licenses/by/2.0/). \n",
|
|||
|
"\n",
|
|||
|
"© Carlos A. Iglesias, Universidad Politécnica de Madrid."
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"celltoolbar": "Slideshow",
|
|||
|
"datacleaner": {
|
|||
|
"position": {
|
|||
|
"top": "50px"
|
|||
|
},
|
|||
|
"python": {
|
|||
|
"varRefreshCmd": "try:\n print(_datacleaner.dataframe_metadata())\nexcept:\n print([])"
|
|||
|
},
|
|||
|
"window_display": true
|
|||
|
},
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3 (ipykernel)",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.11.7"
|
|||
|
},
|
|||
|
"latex_envs": {
|
|||
|
"LaTeX_envs_menu_present": true,
|
|||
|
"autocomplete": true,
|
|||
|
"bibliofile": "biblio.bib",
|
|||
|
"cite_by": "apalike",
|
|||
|
"current_citInitial": 1,
|
|||
|
"eqLabelWithNumbers": true,
|
|||
|
"eqNumInitial": 1,
|
|||
|
"hotkeys": {
|
|||
|
"equation": "Ctrl-E",
|
|||
|
"itemize": "Ctrl-I"
|
|||
|
},
|
|||
|
"labels_anchors": false,
|
|||
|
"latex_user_defs": false,
|
|||
|
"report_style_numbering": false,
|
|||
|
"user_envs_cfg": false
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 4
|
|||
|
}
|