mirror of
https://github.com/gsi-upm/sitc
synced 2025-01-09 20:41:27 +00:00
Compare commits
No commits in common. "f0278aea336a6b81159db0f53a5a705e358c5a34" and "bf21e3ceabaad7f1dba55ca6ec550ee5b28a34ce" have entirely different histories.
f0278aea33
...
bf21e3ceab
@ -92,7 +92,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"display_name": "Python 3",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -106,7 +106,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.12"
|
"version": "3.7.1"
|
||||||
},
|
},
|
||||||
"latex_envs": {
|
"latex_envs": {
|
||||||
"LaTeX_envs_menu_present": true,
|
"LaTeX_envs_menu_present": true,
|
||||||
|
@ -433,10 +433,10 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"* [Pandas](http://pandas.pydata.org/)\n",
|
"* [Pandas](http://pandas.pydata.org/)\n",
|
||||||
"* [Learning Pandas, Michael Heydt, Packt Publishing, 2017](https://learning.oreilly.com/library/view/learning-pandas/9781787123137/)\n",
|
"* [Learning Pandas, Michael Heydt, Packt Publishing, 2015](http://proquest.safaribooksonline.com/book/programming/python/9781783985128)\n",
|
||||||
"* [Pandas. Introduction to Data Structures](https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html)\n",
|
"* [Pandas. Introduction to Data Structures](http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dsintro)\n",
|
||||||
"* [Introducing Pandas Objects](https://www.oreilly.com/learning/introducing-pandas-objects)\n",
|
"* [Introducing Pandas Objects](https://www.oreilly.com/learning/introducing-pandas-objects)\n",
|
||||||
"* [Boolean Operators in Pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-operators)"
|
"* [Boolean Operators in Pandas](http://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-operators)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -458,7 +458,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"display_name": "Python 3",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -472,7 +472,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.12"
|
"version": "3.7.1"
|
||||||
},
|
},
|
||||||
"latex_envs": {
|
"latex_envs": {
|
||||||
"LaTeX_envs_menu_present": true,
|
"LaTeX_envs_menu_present": true,
|
||||||
|
@ -404,7 +404,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"#Mean Age and SibSp of passengers grouped by passenger class and sex\n",
|
"#Mean Age and SibSp of passengers grouped by passenger class and sex\n",
|
||||||
"df.groupby(['Pclass', 'Sex'])[['Age','SibSp']].mean()"
|
"df.groupby(['Pclass', 'Sex'])['Age','SibSp'].mean()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -414,7 +414,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"#Show mean Age and SibSp for passengers older than 25 grouped by Passenger Class and Sex\n",
|
"#Show mean Age and SibSp for passengers older than 25 grouped by Passenger Class and Sex\n",
|
||||||
"df[df.Age > 25].groupby(['Pclass', 'Sex'])[['Age','SibSp']].mean()"
|
"df[df.Age > 25].groupby(['Pclass', 'Sex'])['Age','SibSp'].mean()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -424,7 +424,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Mean age, SibSp , Survived of passengers older than 25 which survived, grouped by Passenger Class and Sex \n",
|
"# Mean age, SibSp , Survived of passengers older than 25 which survived, grouped by Passenger Class and Sex \n",
|
||||||
"df[(df.Age > 25 & (df.Survived == 1))].groupby(['Pclass', 'Sex'])[['Age','SibSp','Survived']].mean()"
|
"df[(df.Age > 25 & (df.Survived == 1))].groupby(['Pclass', 'Sex'])['Age','SibSp','Survived'].mean()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -436,7 +436,7 @@
|
|||||||
"# We can also decide which function apply in each column\n",
|
"# We can also decide which function apply in each column\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Show mean Age, mean SibSp, and number of passengers older than 25 that survived, grouped by Passenger Class and Sex\n",
|
"#Show mean Age, mean SibSp, and number of passengers older than 25 that survived, grouped by Passenger Class and Sex\n",
|
||||||
"df[(df.Age > 25 & (df.Survived == 1))].groupby(['Pclass', 'Sex'])[['Age','SibSp','Survived']].agg({'Age': np.mean, \n",
|
"df[(df.Age > 25 & (df.Survived == 1))].groupby(['Pclass', 'Sex'])['Age','SibSp','Survived'].agg({'Age': np.mean, \n",
|
||||||
" 'SibSp': np.mean, 'Survived': np.sum})"
|
" 'SibSp': np.mean, 'Survived': np.sum})"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -600,8 +600,8 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Fill missing values with the median, we avoid empty (None) values with numeric_only\n",
|
"# Fill missing values with the median\n",
|
||||||
"df_filled = df.fillna(df.median(numeric_only=True))\n",
|
"df_filled = df.fillna(df.median())\n",
|
||||||
"df_filled[-5:]"
|
"df_filled[-5:]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -685,7 +685,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# But we are working on a copy, so we get a warning\n",
|
"# But we are working on a copy \n",
|
||||||
"df.iloc[889]['Sex'] = np.nan"
|
"df.iloc[889]['Sex'] = np.nan"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -695,7 +695,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# If we want to change it, we should not chain selections\n",
|
"# If we want to change, we should not chain selections\n",
|
||||||
"# The selection can be done with the column name\n",
|
"# The selection can be done with the column name\n",
|
||||||
"df.loc[889, 'Sex']"
|
"df.loc[889, 'Sex']"
|
||||||
]
|
]
|
||||||
@ -932,11 +932,11 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"* [Pandas](http://pandas.pydata.org/)\n",
|
"* [Pandas](http://pandas.pydata.org/)\n",
|
||||||
"* [Learning Pandas, Michael Heydt, Packt Publishing, 2017](https://learning.oreilly.com/library/view/learning-pandas/9781787123137/)\n",
|
"* [Learning Pandas, Michael Heydt, Packt Publishing, 2015](http://proquest.safaribooksonline.com/book/programming/python/9781783985128)\n",
|
||||||
"* [Pandas. Introduction to Data Structures](https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html)\n",
|
"* [Useful Pandas Snippets](https://gist.github.com/bsweger/e5817488d161f37dcbd2)\n",
|
||||||
|
"* [Pandas. Introduction to Data Structures](http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dsintro)\n",
|
||||||
"* [Introducing Pandas Objects](https://www.oreilly.com/learning/introducing-pandas-objects)\n",
|
"* [Introducing Pandas Objects](https://www.oreilly.com/learning/introducing-pandas-objects)\n",
|
||||||
"* [Boolean Operators in Pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-operators)\n",
|
"* [Boolean Operators in Pandas](http://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-operators)"
|
||||||
"* [Useful Pandas Snippets](https://gist.github.com/bsweger/e5817488d161f37dcbd2)"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -958,7 +958,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"display_name": "Python 3",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -972,7 +972,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.12"
|
"version": "3.7.1"
|
||||||
},
|
},
|
||||||
"latex_envs": {
|
"latex_envs": {
|
||||||
"LaTeX_envs_menu_present": true,
|
"LaTeX_envs_menu_present": true,
|
||||||
|
@ -367,7 +367,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Now we visualise age and survived to see if there is some relationship\n",
|
"# Now we visualise age and survived to see if there is some relationship\n",
|
||||||
"sns.FacetGrid(df, hue=\"Survived\", height=5).map(sns.kdeplot, \"Age\").add_legend()"
|
"sns.FacetGrid(df, hue=\"Survived\", size=5).map(sns.kdeplot, \"Age\").add_legend()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -567,7 +567,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Plot with seaborn\n",
|
"# Plot with seaborn\n",
|
||||||
"sns.countplot(x='Sex', data=df)"
|
"sns.countplot('Sex', data=df)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -683,6 +683,16 @@
|
|||||||
"df.groupby('Pclass').size()"
|
"df.groupby('Pclass').size()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Distribution\n",
|
||||||
|
"sns.countplot('Pclass', data=df)"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -715,7 +725,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"sns.catplot(x='Pclass',data=df,hue='Sex',kind='count')"
|
"sns.factorplot('Pclass',data=df,hue='Sex',kind='count')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -896,7 +906,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Distribution\n",
|
"# Distribution\n",
|
||||||
"sns.countplot(x='Embarked', data=df)"
|
"sns.countplot('Embarked', data=df)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -987,7 +997,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Distribution\n",
|
"# Distribution\n",
|
||||||
"sns.countplot(x='SibSp', data=df)"
|
"sns.countplot('SibSp', data=df)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -1170,7 +1180,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Distribution\n",
|
"# Distribution\n",
|
||||||
"sns.countplot(x='Parch', data=df)"
|
"sns.countplot('Parch', data=df)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -1223,7 +1233,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"df.groupby(['Pclass', 'Sex', 'Parch'])[['Parch', 'SibSp', 'Survived']].agg({'Parch': np.size, 'SibSp': np.mean, 'Survived': np.mean})"
|
"df.groupby(['Pclass', 'Sex', 'Parch'])['Parch', 'SibSp', 'Survived'].agg({'Parch': np.size, 'SibSp': np.mean, 'Survived': np.mean})"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -1566,7 +1576,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"display_name": "Python 3",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -1580,7 +1590,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.12"
|
"version": "3.7.1"
|
||||||
},
|
},
|
||||||
"latex_envs": {
|
"latex_envs": {
|
||||||
"LaTeX_envs_menu_present": true,
|
"LaTeX_envs_menu_present": true,
|
||||||
|
@ -46,7 +46,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 11,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -72,7 +72,7 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Assign the variable *df* a Dataframe with the Titanic Dataset from the URL https://raw.githubusercontent.com/gsi-upm/sitc/master/ml2/data-titanic/train.csv.\n",
|
"Assign the variable *df* a Dataframe with the Titanic Dataset from the URL https://raw.githubusercontent.com/gsi-upm/sitc/master/ml2/data-titanic/train.csv\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Print *df*."
|
"Print *df*."
|
||||||
]
|
]
|
||||||
@ -209,9 +209,312 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 20,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>PassengerId</th>\n",
|
||||||
|
" <th>Survived</th>\n",
|
||||||
|
" <th>Pclass</th>\n",
|
||||||
|
" <th>Name</th>\n",
|
||||||
|
" <th>Sex</th>\n",
|
||||||
|
" <th>Age</th>\n",
|
||||||
|
" <th>SibSp</th>\n",
|
||||||
|
" <th>Parch</th>\n",
|
||||||
|
" <th>Ticket</th>\n",
|
||||||
|
" <th>Fare</th>\n",
|
||||||
|
" <th>Cabin</th>\n",
|
||||||
|
" <th>Embarked</th>\n",
|
||||||
|
" <th>FamilySize</th>\n",
|
||||||
|
" <th>AgeGroup</th>\n",
|
||||||
|
" <th>Deck</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>3</td>\n",
|
||||||
|
" <td>Braund, Mr. Owen Harris</td>\n",
|
||||||
|
" <td>male</td>\n",
|
||||||
|
" <td>22.0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>A/5 21171</td>\n",
|
||||||
|
" <td>7.2500</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>S</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" <td>X</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
||||||
|
" <td>female</td>\n",
|
||||||
|
" <td>38.0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>PC 17599</td>\n",
|
||||||
|
" <td>71.2833</td>\n",
|
||||||
|
" <td>C85</td>\n",
|
||||||
|
" <td>C</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" <td>C</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>3</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>3</td>\n",
|
||||||
|
" <td>Heikkinen, Miss. Laina</td>\n",
|
||||||
|
" <td>female</td>\n",
|
||||||
|
" <td>26.0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>STON/O2. 3101282</td>\n",
|
||||||
|
" <td>7.9250</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>S</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" <td>X</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>4</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
|
||||||
|
" <td>female</td>\n",
|
||||||
|
" <td>35.0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>113803</td>\n",
|
||||||
|
" <td>53.1000</td>\n",
|
||||||
|
" <td>C123</td>\n",
|
||||||
|
" <td>S</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" <td>C</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>5</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>3</td>\n",
|
||||||
|
" <td>Allen, Mr. William Henry</td>\n",
|
||||||
|
" <td>male</td>\n",
|
||||||
|
" <td>35.0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>373450</td>\n",
|
||||||
|
" <td>8.0500</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>S</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" <td>X</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>...</th>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>886</th>\n",
|
||||||
|
" <td>887</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>Montvila, Rev. Juozas</td>\n",
|
||||||
|
" <td>male</td>\n",
|
||||||
|
" <td>27.0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>211536</td>\n",
|
||||||
|
" <td>13.0000</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>S</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" <td>X</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>887</th>\n",
|
||||||
|
" <td>888</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>Graham, Miss. Margaret Edith</td>\n",
|
||||||
|
" <td>female</td>\n",
|
||||||
|
" <td>19.0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>112053</td>\n",
|
||||||
|
" <td>30.0000</td>\n",
|
||||||
|
" <td>B42</td>\n",
|
||||||
|
" <td>S</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" <td>B</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>888</th>\n",
|
||||||
|
" <td>889</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>3</td>\n",
|
||||||
|
" <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
|
||||||
|
" <td>female</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>W./C. 6607</td>\n",
|
||||||
|
" <td>23.4500</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>S</td>\n",
|
||||||
|
" <td>3</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>X</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>889</th>\n",
|
||||||
|
" <td>890</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>Behr, Mr. Karl Howell</td>\n",
|
||||||
|
" <td>male</td>\n",
|
||||||
|
" <td>26.0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>111369</td>\n",
|
||||||
|
" <td>30.0000</td>\n",
|
||||||
|
" <td>C148</td>\n",
|
||||||
|
" <td>C</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" <td>C</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>890</th>\n",
|
||||||
|
" <td>891</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>3</td>\n",
|
||||||
|
" <td>Dooley, Mr. Patrick</td>\n",
|
||||||
|
" <td>male</td>\n",
|
||||||
|
" <td>32.0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>370376</td>\n",
|
||||||
|
" <td>7.7500</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>Q</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" <td>X</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"<p>891 rows × 15 columns</p>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" PassengerId Survived Pclass \\\n",
|
||||||
|
"0 1 0 3 \n",
|
||||||
|
"1 2 1 1 \n",
|
||||||
|
"2 3 1 3 \n",
|
||||||
|
"3 4 1 1 \n",
|
||||||
|
"4 5 0 3 \n",
|
||||||
|
".. ... ... ... \n",
|
||||||
|
"886 887 0 2 \n",
|
||||||
|
"887 888 1 1 \n",
|
||||||
|
"888 889 0 3 \n",
|
||||||
|
"889 890 1 1 \n",
|
||||||
|
"890 891 0 3 \n",
|
||||||
|
"\n",
|
||||||
|
" Name Sex Age SibSp \\\n",
|
||||||
|
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
|
||||||
|
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
|
||||||
|
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
|
||||||
|
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
|
||||||
|
"4 Allen, Mr. William Henry male 35.0 0 \n",
|
||||||
|
".. ... ... ... ... \n",
|
||||||
|
"886 Montvila, Rev. Juozas male 27.0 0 \n",
|
||||||
|
"887 Graham, Miss. Margaret Edith female 19.0 0 \n",
|
||||||
|
"888 Johnston, Miss. Catherine Helen \"Carrie\" female NaN 1 \n",
|
||||||
|
"889 Behr, Mr. Karl Howell male 26.0 0 \n",
|
||||||
|
"890 Dooley, Mr. Patrick male 32.0 0 \n",
|
||||||
|
"\n",
|
||||||
|
" Parch Ticket Fare Cabin Embarked FamilySize AgeGroup \\\n",
|
||||||
|
"0 0 A/5 21171 7.2500 NaN S 1 3.0 \n",
|
||||||
|
"1 0 PC 17599 71.2833 C85 C 1 3.0 \n",
|
||||||
|
"2 0 STON/O2. 3101282 7.9250 NaN S 0 3.0 \n",
|
||||||
|
"3 0 113803 53.1000 C123 S 1 3.0 \n",
|
||||||
|
"4 0 373450 8.0500 NaN S 0 3.0 \n",
|
||||||
|
".. ... ... ... ... ... ... ... \n",
|
||||||
|
"886 0 211536 13.0000 NaN S 0 3.0 \n",
|
||||||
|
"887 0 112053 30.0000 B42 S 0 3.0 \n",
|
||||||
|
"888 2 W./C. 6607 23.4500 NaN S 3 NaN \n",
|
||||||
|
"889 0 111369 30.0000 C148 C 0 3.0 \n",
|
||||||
|
"890 0 370376 7.7500 NaN Q 0 3.0 \n",
|
||||||
|
"\n",
|
||||||
|
" Deck \n",
|
||||||
|
"0 X \n",
|
||||||
|
"1 C \n",
|
||||||
|
"2 X \n",
|
||||||
|
"3 C \n",
|
||||||
|
"4 X \n",
|
||||||
|
".. ... \n",
|
||||||
|
"886 X \n",
|
||||||
|
"887 B \n",
|
||||||
|
"888 X \n",
|
||||||
|
"889 C \n",
|
||||||
|
"890 X \n",
|
||||||
|
"\n",
|
||||||
|
"[891 rows x 15 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"df['FamilySize'] = df['SibSp'] + df['Parch']\n",
|
"df['FamilySize'] = df['SibSp'] + df['Parch']\n",
|
||||||
"df"
|
"df"
|
||||||
@ -303,9 +606,23 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 2,
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
"outputs": [],
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "NameError",
|
||||||
|
"evalue": "name 'df' is not defined",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[0;32m<ipython-input-2-515fd9f54fd1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Others'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Salutation'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Salutation'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgroup_salutation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Salutation'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[0;31mNameError\u001b[0m: name 'df' is not defined"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"def group_salutation(old_salutation):\n",
|
"def group_salutation(old_salutation):\n",
|
||||||
" if old_salutation == 'Mr':\n",
|
" if old_salutation == 'Mr':\n",
|
||||||
@ -372,7 +689,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 12,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -395,7 +712,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 14,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -488,7 +805,7 @@
|
|||||||
"window_display": false
|
"window_display": false
|
||||||
},
|
},
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"display_name": "Python 3",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -502,7 +819,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.12"
|
"version": "3.8.8"
|
||||||
},
|
},
|
||||||
"latex_envs": {
|
"latex_envs": {
|
||||||
"LaTeX_envs_menu_present": true,
|
"LaTeX_envs_menu_present": true,
|
||||||
|
@ -78,7 +78,7 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"* [Python Machine Learning](https://learning.oreilly.com/library/view/python-machine-learning/9781789955750/), Sebastian Raschka and Vahid Mirjalili, Packt Publishing, 2019."
|
"* [Python Machine Learning](http://proquest.safaribooksonline.com/book/programming/python/9781783555130), Sebastian Raschka, Packt Publishing, 2015."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -100,7 +100,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"display_name": "Python 3",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -114,7 +114,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.12"
|
"version": "3.7.1"
|
||||||
},
|
},
|
||||||
"latex_envs": {
|
"latex_envs": {
|
||||||
"LaTeX_envs_menu_present": true,
|
"LaTeX_envs_menu_present": true,
|
||||||
|
@ -535,13 +535,13 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"# This step will take some time\n",
|
"# This step will take some time\n",
|
||||||
"# Cross-validationt\n",
|
"# Cross-validationt\n",
|
||||||
"cv = KFold(n_splits=5, shuffle=True, random_state=33)\n",
|
"cv = KFold(n_splits=5, shuffle=False, random_state=33)\n",
|
||||||
"# StratifiedKFold has is a variation of k-fold which returns stratified folds:\n",
|
"# StratifiedKFold has is a variation of k-fold which returns stratified folds:\n",
|
||||||
"# each set contains approximately the same percentage of samples of each target class as the complete set.\n",
|
"# each set contains approximately the same percentage of samples of each target class as the complete set.\n",
|
||||||
"#cv = StratifiedKFold(y, n_folds=3, shuffle=True, random_state=33)\n",
|
"#cv = StratifiedKFold(y, n_folds=3, shuffle=False, random_state=33)\n",
|
||||||
"scores = cross_val_score(model, X, y, cv=cv)\n",
|
"scores = cross_val_score(model, X, y, cv=cv)\n",
|
||||||
"print(\"Scores in every iteration\", scores)\n",
|
"print(\"Scores in every iteration\", scores)\n",
|
||||||
"print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))"
|
"print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -644,7 +644,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"* [Titanic Machine Learning from Disaster](https://www.kaggle.com/c/titanic/forums/t/5105/ipython-notebook-tutorial-for-titanic-machine-learning-from-disaster)\n",
|
"* [Titanic Machine Learning from Disaster](https://www.kaggle.com/c/titanic/forums/t/5105/ipython-notebook-tutorial-for-titanic-machine-learning-from-disaster)\n",
|
||||||
"* [API SVC scikit-learn](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)\n",
|
"* [API SVC scikit-learn](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)\n",
|
||||||
"* [How to choose the right metric for evaluating an ML model](https://www.kaggle.com/vipulgandhi/how-to-choose-right-metric-for-evaluating-ml-model)"
|
"* [Better evaluation of classification models](http://blog.kaggle.com/2015/10/23/scikit-learn-video-9-better-evaluation-of-classification-models/)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -666,7 +666,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"display_name": "Python 3",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -680,7 +680,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.12"
|
"version": "3.7.1"
|
||||||
},
|
},
|
||||||
"latex_envs": {
|
"latex_envs": {
|
||||||
"LaTeX_envs_menu_present": true,
|
"LaTeX_envs_menu_present": true,
|
||||||
|
@ -39,7 +39,7 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"In this exercise, we are going to put in practice what we have learnt in the notebooks of the session. \n",
|
"In this exercise we are going to put in practice what we have learnt in the notebooks of the session. \n",
|
||||||
"\n",
|
"\n",
|
||||||
"In the previous notebook we have been applying the SVM machine learning algorithm.\n",
|
"In the previous notebook we have been applying the SVM machine learning algorithm.\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -67,7 +67,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"display_name": "Python 3",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -81,7 +81,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.12"
|
"version": "3.7.1"
|
||||||
},
|
},
|
||||||
"latex_envs": {
|
"latex_envs": {
|
||||||
"LaTeX_envs_menu_present": true,
|
"LaTeX_envs_menu_present": true,
|
||||||
|
@ -1,21 +1,21 @@
|
|||||||
"""
|
"""
|
||||||
|
Taken from http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
|
||||||
|
|
||||||
========================
|
========================
|
||||||
Plotting Learning Curves
|
Plotting Learning Curves
|
||||||
========================
|
========================
|
||||||
In the first column, first row the learning curve of a naive Bayes classifier
|
|
||||||
is shown for the digits dataset. Note that the training score and the
|
|
||||||
cross-validation score are both not very good at the end. However, the shape
|
|
||||||
of the curve can be found in more complex datasets very often: the training
|
|
||||||
score is very high at the beginning and decreases and the cross-validation
|
|
||||||
score is very low at the beginning and increases. In the second column, first
|
|
||||||
row we see the learning curve of an SVM with RBF kernel. We can see clearly
|
|
||||||
that the training score is still around the maximum and the validation score
|
|
||||||
could be increased with more training samples. The plots in the second row
|
|
||||||
show the times required by the models to train with various sizes of training
|
|
||||||
dataset. The plots in the third row show how much time was required to train
|
|
||||||
the models for each training size.
|
|
||||||
|
|
||||||
|
On the left side the learning curve of a naive Bayes classifier is shown for
|
||||||
|
the digits dataset. Note that the training score and the cross-validation score
|
||||||
|
are both not very good at the end. However, the shape of the curve can be found
|
||||||
|
in more complex datasets very often: the training score is very high at the
|
||||||
|
beginning and decreases and the cross-validation score is very low at the
|
||||||
|
beginning and increases. On the right side we see the learning curve of an SVM
|
||||||
|
with RBF kernel. We can see clearly that the training score is still around
|
||||||
|
the maximum and the validation score could be increased with more training
|
||||||
|
samples.
|
||||||
"""
|
"""
|
||||||
|
#print(__doc__)
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
@ -23,181 +23,86 @@ from sklearn.naive_bayes import GaussianNB
|
|||||||
from sklearn.svm import SVC
|
from sklearn.svm import SVC
|
||||||
from sklearn.datasets import load_digits
|
from sklearn.datasets import load_digits
|
||||||
from sklearn.model_selection import learning_curve
|
from sklearn.model_selection import learning_curve
|
||||||
from sklearn.model_selection import ShuffleSplit
|
|
||||||
|
|
||||||
|
|
||||||
def plot_learning_curve(
|
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
|
||||||
estimator,
|
n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
|
||||||
title,
|
|
||||||
X,
|
|
||||||
y,
|
|
||||||
axes=None,
|
|
||||||
ylim=None,
|
|
||||||
cv=None,
|
|
||||||
n_jobs=None,
|
|
||||||
train_sizes=np.linspace(0.1, 1.0, 5),
|
|
||||||
):
|
|
||||||
"""
|
"""
|
||||||
Generate 3 plots: the test and training learning curve, the training
|
Generate a simple plot of the test and training learning curve.
|
||||||
samples vs fit times curve, the fit times vs score curve.
|
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
estimator : estimator instance
|
estimator : object type that implements the "fit" and "predict" methods
|
||||||
An estimator instance implementing `fit` and `predict` methods which
|
An object of that type which is cloned for each validation.
|
||||||
will be cloned for each validation.
|
|
||||||
|
|
||||||
title : str
|
title : string
|
||||||
Title for the chart.
|
Title for the chart.
|
||||||
|
|
||||||
X : array-like of shape (n_samples, n_features)
|
X : array-like, shape (n_samples, n_features)
|
||||||
Training vector, where ``n_samples`` is the number of samples and
|
Training vector, where n_samples is the number of samples and
|
||||||
``n_features`` is the number of features.
|
n_features is the number of features.
|
||||||
|
|
||||||
y : array-like of shape (n_samples) or (n_samples, n_features)
|
y : array-like, shape (n_samples) or (n_samples, n_features), optional
|
||||||
Target relative to ``X`` for classification or regression;
|
Target relative to X for classification or regression;
|
||||||
None for unsupervised learning.
|
None for unsupervised learning.
|
||||||
|
|
||||||
axes : array-like of shape (3,), default=None
|
ylim : tuple, shape (ymin, ymax), optional
|
||||||
Axes to use for plotting the curves.
|
Defines minimum and maximum y-values plotted.
|
||||||
|
|
||||||
ylim : tuple of shape (2,), default=None
|
cv : integer, cross-validation generator, optional
|
||||||
Defines minimum and maximum y-values plotted, e.g. (ymin, ymax).
|
If an integer is passed, it is the number of folds (defaults to 3).
|
||||||
|
Specific cross-validation objects can be passed, see
|
||||||
|
sklearn.model_selection module for the list of possible objects
|
||||||
|
|
||||||
cv : int, cross-validation generator or an iterable, default=None
|
n_jobs : integer, optional
|
||||||
Determines the cross-validation splitting strategy.
|
Number of jobs to run in parallel (default 1).
|
||||||
Possible inputs for cv are:
|
|
||||||
|
|
||||||
- None, to use the default 5-fold cross-validation,
|
|
||||||
- integer, to specify the number of folds.
|
|
||||||
- :term:`CV splitter`,
|
|
||||||
- An iterable yielding (train, test) splits as arrays of indices.
|
|
||||||
|
|
||||||
For integer/None inputs, if ``y`` is binary or multiclass,
|
|
||||||
:class:`StratifiedKFold` is used. If the estimator is not a classifier
|
|
||||||
or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
|
|
||||||
|
|
||||||
Refer to the :ref:`User Guide <cross_validation>` for the various
|
|
||||||
cross-validators that can be used here.
|
|
||||||
|
|
||||||
n_jobs : int or None, default=None
|
|
||||||
Number of jobs to run in parallel.
|
|
||||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
|
||||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
|
||||||
for more details.
|
|
||||||
|
|
||||||
train_sizes : array-like of shape (n_ticks,)
|
|
||||||
Relative or absolute numbers of training examples that will be used to
|
|
||||||
generate the learning curve. If the ``dtype`` is float, it is regarded
|
|
||||||
as a fraction of the maximum size of the training set (that is
|
|
||||||
determined by the selected validation method), i.e. it has to be within
|
|
||||||
(0, 1]. Otherwise it is interpreted as absolute sizes of the training
|
|
||||||
sets. Note that for classification the number of samples usually has
|
|
||||||
to be big enough to contain at least one sample from each class.
|
|
||||||
(default: np.linspace(0.1, 1.0, 5))
|
|
||||||
"""
|
"""
|
||||||
if axes is None:
|
plt.figure()
|
||||||
_, axes = plt.subplots(1, 3, figsize=(20, 5))
|
plt.title(title)
|
||||||
|
|
||||||
axes[0].set_title(title)
|
|
||||||
if ylim is not None:
|
if ylim is not None:
|
||||||
axes[0].set_ylim(*ylim)
|
plt.ylim(*ylim)
|
||||||
axes[0].set_xlabel("Training examples")
|
plt.xlabel("Training examples")
|
||||||
axes[0].set_ylabel("Score")
|
plt.ylabel("Score")
|
||||||
|
train_sizes, train_scores, test_scores = learning_curve(
|
||||||
train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
|
estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
|
||||||
estimator,
|
|
||||||
X,
|
|
||||||
y,
|
|
||||||
cv=cv,
|
|
||||||
n_jobs=n_jobs,
|
|
||||||
train_sizes=train_sizes,
|
|
||||||
return_times=True,
|
|
||||||
)
|
|
||||||
train_scores_mean = np.mean(train_scores, axis=1)
|
train_scores_mean = np.mean(train_scores, axis=1)
|
||||||
train_scores_std = np.std(train_scores, axis=1)
|
train_scores_std = np.std(train_scores, axis=1)
|
||||||
test_scores_mean = np.mean(test_scores, axis=1)
|
test_scores_mean = np.mean(test_scores, axis=1)
|
||||||
test_scores_std = np.std(test_scores, axis=1)
|
test_scores_std = np.std(test_scores, axis=1)
|
||||||
fit_times_mean = np.mean(fit_times, axis=1)
|
plt.grid()
|
||||||
fit_times_std = np.std(fit_times, axis=1)
|
|
||||||
|
|
||||||
# Plot learning curve
|
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
|
||||||
axes[0].grid()
|
train_scores_mean + train_scores_std, alpha=0.1,
|
||||||
axes[0].fill_between(
|
color="r")
|
||||||
train_sizes,
|
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
|
||||||
train_scores_mean - train_scores_std,
|
test_scores_mean + test_scores_std, alpha=0.1, color="g")
|
||||||
train_scores_mean + train_scores_std,
|
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
|
||||||
alpha=0.1,
|
label="Training score")
|
||||||
color="r",
|
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
|
||||||
)
|
label="Cross-validation score")
|
||||||
axes[0].fill_between(
|
|
||||||
train_sizes,
|
|
||||||
test_scores_mean - test_scores_std,
|
|
||||||
test_scores_mean + test_scores_std,
|
|
||||||
alpha=0.1,
|
|
||||||
color="g",
|
|
||||||
)
|
|
||||||
axes[0].plot(
|
|
||||||
train_sizes, train_scores_mean, "o-", color="r", label="Training score"
|
|
||||||
)
|
|
||||||
axes[0].plot(
|
|
||||||
train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score"
|
|
||||||
)
|
|
||||||
axes[0].legend(loc="best")
|
|
||||||
|
|
||||||
# Plot n_samples vs fit_times
|
|
||||||
axes[1].grid()
|
|
||||||
axes[1].plot(train_sizes, fit_times_mean, "o-")
|
|
||||||
axes[1].fill_between(
|
|
||||||
train_sizes,
|
|
||||||
fit_times_mean - fit_times_std,
|
|
||||||
fit_times_mean + fit_times_std,
|
|
||||||
alpha=0.1,
|
|
||||||
)
|
|
||||||
axes[1].set_xlabel("Training examples")
|
|
||||||
axes[1].set_ylabel("fit_times")
|
|
||||||
axes[1].set_title("Scalability of the model")
|
|
||||||
|
|
||||||
# Plot fit_time vs score
|
|
||||||
fit_time_argsort = fit_times_mean.argsort()
|
|
||||||
fit_time_sorted = fit_times_mean[fit_time_argsort]
|
|
||||||
test_scores_mean_sorted = test_scores_mean[fit_time_argsort]
|
|
||||||
test_scores_std_sorted = test_scores_std[fit_time_argsort]
|
|
||||||
axes[2].grid()
|
|
||||||
axes[2].plot(fit_time_sorted, test_scores_mean_sorted, "o-")
|
|
||||||
axes[2].fill_between(
|
|
||||||
fit_time_sorted,
|
|
||||||
test_scores_mean_sorted - test_scores_std_sorted,
|
|
||||||
test_scores_mean_sorted + test_scores_std_sorted,
|
|
||||||
alpha=0.1,
|
|
||||||
)
|
|
||||||
axes[2].set_xlabel("fit_times")
|
|
||||||
axes[2].set_ylabel("Score")
|
|
||||||
axes[2].set_title("Performance of the model")
|
|
||||||
|
|
||||||
|
plt.legend(loc="best")
|
||||||
return plt
|
return plt
|
||||||
|
|
||||||
|
|
||||||
fig, axes = plt.subplots(3, 2, figsize=(10, 15))
|
#digits = load_digits()
|
||||||
|
#X, y = digits.data, digits.target
|
||||||
|
|
||||||
X, y = load_digits(return_X_y=True)
|
|
||||||
|
|
||||||
title = "Learning Curves (Naive Bayes)"
|
#title = "Learning Curves (Naive Bayes)"
|
||||||
# Cross validation with 50 iterations to get smoother mean test and train
|
# Cross validation with 100 iterations to get smoother mean test and train
|
||||||
# score curves, each time with 20% data randomly selected as a validation set.
|
# score curves, each time with 20% data randomly selected as a validation set.
|
||||||
cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)
|
#cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=100,
|
||||||
|
# test_size=0.2, random_state=0)
|
||||||
|
|
||||||
estimator = GaussianNB()
|
#estimator = GaussianNB()
|
||||||
plot_learning_curve(
|
#plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4)
|
||||||
estimator, title, X, y, axes=axes[:, 0], ylim=(0.7, 1.01), cv=cv, n_jobs=4
|
|
||||||
)
|
|
||||||
|
|
||||||
title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
|
#title = "Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
|
||||||
# SVC is more expensive so we do a lower number of CV iterations:
|
# SVC is more expensive so we do a lower number of CV iterations:
|
||||||
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
|
#cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=10,
|
||||||
estimator = SVC(gamma=0.001)
|
# test_size=0.2, random_state=0)
|
||||||
plot_learning_curve(
|
#estimator = SVC(gamma=0.001)
|
||||||
estimator, title, X, y, axes=axes[:, 1], ylim=(0.7, 1.01), cv=cv, n_jobs=4
|
#plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)
|
||||||
)
|
|
||||||
|
|
||||||
plt.show()
|
#plt.show()
|
||||||
|
Loading…
Reference in New Issue
Block a user