diff --git a/ml2/3_5_Exercise_1.ipynb b/ml2/3_5_Exercise_1.ipynb
index 7bd52c0..122fbcc 100644
--- a/ml2/3_5_Exercise_1.ipynb
+++ b/ml2/3_5_Exercise_1.ipynb
@@ -46,7 +46,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -209,12 +209,315 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " Survived | \n",
+ " Pclass | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Ticket | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked | \n",
+ " FamilySize | \n",
+ " AgeGroup | \n",
+ " Deck | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Braund, Mr. Owen Harris | \n",
+ " male | \n",
+ " 22.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " A/5 21171 | \n",
+ " 7.2500 | \n",
+ " NaN | \n",
+ " S | \n",
+ " 1 | \n",
+ " 3.0 | \n",
+ " X | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
+ " female | \n",
+ " 38.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " PC 17599 | \n",
+ " 71.2833 | \n",
+ " C85 | \n",
+ " C | \n",
+ " 1 | \n",
+ " 3.0 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " Heikkinen, Miss. Laina | \n",
+ " female | \n",
+ " 26.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " STON/O2. 3101282 | \n",
+ " 7.9250 | \n",
+ " NaN | \n",
+ " S | \n",
+ " 0 | \n",
+ " 3.0 | \n",
+ " X | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
+ " female | \n",
+ " 35.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 113803 | \n",
+ " 53.1000 | \n",
+ " C123 | \n",
+ " S | \n",
+ " 1 | \n",
+ " 3.0 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Allen, Mr. William Henry | \n",
+ " male | \n",
+ " 35.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 373450 | \n",
+ " 8.0500 | \n",
+ " NaN | \n",
+ " S | \n",
+ " 0 | \n",
+ " 3.0 | \n",
+ " X | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 886 | \n",
+ " 887 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " Montvila, Rev. Juozas | \n",
+ " male | \n",
+ " 27.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 211536 | \n",
+ " 13.0000 | \n",
+ " NaN | \n",
+ " S | \n",
+ " 0 | \n",
+ " 3.0 | \n",
+ " X | \n",
+ "
\n",
+ " \n",
+ " 887 | \n",
+ " 888 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Graham, Miss. Margaret Edith | \n",
+ " female | \n",
+ " 19.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 112053 | \n",
+ " 30.0000 | \n",
+ " B42 | \n",
+ " S | \n",
+ " 0 | \n",
+ " 3.0 | \n",
+ " B | \n",
+ "
\n",
+ " \n",
+ " 888 | \n",
+ " 889 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Johnston, Miss. Catherine Helen \"Carrie\" | \n",
+ " female | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " W./C. 6607 | \n",
+ " 23.4500 | \n",
+ " NaN | \n",
+ " S | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " X | \n",
+ "
\n",
+ " \n",
+ " 889 | \n",
+ " 890 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Behr, Mr. Karl Howell | \n",
+ " male | \n",
+ " 26.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 111369 | \n",
+ " 30.0000 | \n",
+ " C148 | \n",
+ " C | \n",
+ " 0 | \n",
+ " 3.0 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " 890 | \n",
+ " 891 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Dooley, Mr. Patrick | \n",
+ " male | \n",
+ " 32.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 370376 | \n",
+ " 7.7500 | \n",
+ " NaN | \n",
+ " Q | \n",
+ " 0 | \n",
+ " 3.0 | \n",
+ " X | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
891 rows × 15 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId Survived Pclass \\\n",
+ "0 1 0 3 \n",
+ "1 2 1 1 \n",
+ "2 3 1 3 \n",
+ "3 4 1 1 \n",
+ "4 5 0 3 \n",
+ ".. ... ... ... \n",
+ "886 887 0 2 \n",
+ "887 888 1 1 \n",
+ "888 889 0 3 \n",
+ "889 890 1 1 \n",
+ "890 891 0 3 \n",
+ "\n",
+ " Name Sex Age SibSp \\\n",
+ "0 Braund, Mr. Owen Harris male 22.0 1 \n",
+ "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
+ "2 Heikkinen, Miss. Laina female 26.0 0 \n",
+ "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
+ "4 Allen, Mr. William Henry male 35.0 0 \n",
+ ".. ... ... ... ... \n",
+ "886 Montvila, Rev. Juozas male 27.0 0 \n",
+ "887 Graham, Miss. Margaret Edith female 19.0 0 \n",
+ "888 Johnston, Miss. Catherine Helen \"Carrie\" female NaN 1 \n",
+ "889 Behr, Mr. Karl Howell male 26.0 0 \n",
+ "890 Dooley, Mr. Patrick male 32.0 0 \n",
+ "\n",
+ " Parch Ticket Fare Cabin Embarked FamilySize AgeGroup \\\n",
+ "0 0 A/5 21171 7.2500 NaN S 1 3.0 \n",
+ "1 0 PC 17599 71.2833 C85 C 1 3.0 \n",
+ "2 0 STON/O2. 3101282 7.9250 NaN S 0 3.0 \n",
+ "3 0 113803 53.1000 C123 S 1 3.0 \n",
+ "4 0 373450 8.0500 NaN S 0 3.0 \n",
+ ".. ... ... ... ... ... ... ... \n",
+ "886 0 211536 13.0000 NaN S 0 3.0 \n",
+ "887 0 112053 30.0000 B42 S 0 3.0 \n",
+ "888 2 W./C. 6607 23.4500 NaN S 3 NaN \n",
+ "889 0 111369 30.0000 C148 C 0 3.0 \n",
+ "890 0 370376 7.7500 NaN Q 0 3.0 \n",
+ "\n",
+ " Deck \n",
+ "0 X \n",
+ "1 C \n",
+ "2 X \n",
+ "3 C \n",
+ "4 X \n",
+ ".. ... \n",
+ "886 X \n",
+ "887 B \n",
+ "888 X \n",
+ "889 C \n",
+ "890 X \n",
+ "\n",
+ "[891 rows x 15 columns]"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"df['FamilySize'] = df['SibSp'] + df['Parch']\n",
- "df.head()"
+ "df"
]
},
{
@@ -303,9 +606,31 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "KeyError",
+ "evalue": "'Salutation'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 3079\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3080\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3081\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;31mKeyError\u001b[0m: 'Salutation'",
+ "\nThe above exception was the direct cause of the following exception:\n",
+ "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Others'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Salutation'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Salutation'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgroup_salutation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Salutation'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3022\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnlevels\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3023\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_multilevel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3024\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3025\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3026\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 3080\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3081\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3082\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3083\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3084\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtolerance\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mKeyError\u001b[0m: 'Salutation'"
+ ]
+ }
+ ],
"source": [
"def group_salutation(old_salutation):\n",
" if old_salutation == 'Mr':\n",
@@ -372,13 +697,13 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Group ages to simplify machine learning algorithms. 0: 0-5, 1: 6-10, 2: 11-15, 3: 16-59 and 4: 60-80\n",
- "df['AgeGroup'] = 0\n",
- "df.loc[(.Age<6),'AgeGroup'] = 0\n",
+ "df['AgeGroup'] = np.nan\n",
+ "df.loc[(df.Age<6),'AgeGroup'] = 0\n",
"df.loc[(df.Age>=6) & (df.Age < 11),'AgeGroup'] = 1\n",
"df.loc[(df.Age>=11) & (df.Age < 16),'AgeGroup'] = 2\n",
"df.loc[(df.Age>=16) & (df.Age < 60),'AgeGroup'] = 3\n",
@@ -395,7 +720,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -404,8 +729,8 @@
" if np.isnan(big_string):\n",
" return 'X'\n",
" for substring in substrings:\n",
- " if big_string.find(substring) != 1:\n",
- " return substring\n",
+ " if substring in big_string:\n",
+ " return substring[0::]\n",
" print(big_string)\n",
" return 'X'\n",
" \n",
@@ -478,6 +803,15 @@
}
],
"metadata": {
+ "datacleaner": {
+ "position": {
+ "top": "50px"
+ },
+ "python": {
+ "varRefreshCmd": "try:\n print(_datacleaner.dataframe_metadata())\nexcept:\n print([])"
+ },
+ "window_display": false
+ },
"kernelspec": {
"display_name": "Python 3",
"language": "python",
@@ -493,7 +827,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.1"
+ "version": "3.7.9"
},
"latex_envs": {
"LaTeX_envs_menu_present": true,