diff --git a/ml2/3_3_Data_Munging_with_Pandas.ipynb b/ml2/3_3_Data_Munging_with_Pandas.ipynb index 1ecd01d..99347bc 100644 --- a/ml2/3_3_Data_Munging_with_Pandas.ipynb +++ b/ml2/3_3_Data_Munging_with_Pandas.ipynb @@ -451,7 +451,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Pivot tables are an intuitive way to analyze data, and alternative to group columns." + "Pivot tables are an intuitive way to analyze data, and an alternative to group columns.\n", + "\n", + "This command makes a table with rows Sex and columns Pclass, and\n", + "averages the result of the column Survived, thereby giving the percentage of survivors in each grouping." ] }, { @@ -460,7 +463,14 @@ "metadata": {}, "outputs": [], "source": [ - "pd.pivot_table(df, index='Sex')" + "pd.pivot_table(df, index='Sex', columns='Pclass', values=['Survived'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we want to analyze with a multi-index: the percentage of survivors, given sex and age, and distributed by Pclass." ] }, { @@ -469,7 +479,14 @@ "metadata": {}, "outputs": [], "source": [ - "pd.pivot_table(df, index=['Sex', 'Pclass'])" + "pd.pivot_table(df, index=['Sex', 'Age'], columns=['Pclass'], values=['Survived'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nevertheless, this is not very useful since we have a row per age. Thus, we define a partition."
] }, { @@ -478,7 +495,8 @@ "metadata": {}, "outputs": [], "source": [ - "pd.pivot_table(df, index=['Sex', 'Pclass'], values=['Age', 'SibSp'])" + "# Partition each of the passengers into 3 categories based on their age\n", + "age = pd.cut(df['Age'], [0,12,18,80])" ] }, { @@ -487,7 +505,14 @@ "metadata": {}, "outputs": [], "source": [ - "pd.pivot_table(df, index=['Sex', 'Pclass'], values=['Age', 'SibSp'], aggfunc=np.mean)" + "pd.pivot_table(df, index=['Sex', age], columns=['Pclass'], values=['Survived'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can change the function used for aggregating each group." ] }, { @@ -496,8 +521,18 @@ "metadata": {}, "outputs": [], "source": [ - "# Try np.sum, np.size, len\n", - "pd.pivot_table(df, index=['Sex', 'Pclass'], values=['Age', 'SibSp'], aggfunc=[np.mean, np.sum])" + "# default\n", + "pd.pivot_table(df, index=['Sex', age], columns=['Pclass'], values=['Survived'], aggfunc=np.mean)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Two agg functions\n", + "pd.pivot_table(df, index=['Sex', age], columns=['Pclass'], values=['Survived'], aggfunc=[np.mean, np.sum])" ] }, { @@ -972,7 +1007,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.11.5" }, "latex_envs": { "LaTeX_envs_menu_present": true,