From 4d87b07ed9bacf32dab5d2236b6697ac2ef7b662 Mon Sep 17 00:00:00 2001
From: cif
Date: Mon, 7 Mar 2022 14:16:14 +0100
Subject: [PATCH] Updated visualization

---
 ml2/3_7_SVM.ipynb          |  12 +-
 ml2/plot_learning_curve.py | 233 ++++++++++++++++++++++++++-----------
 2 files changed, 170 insertions(+), 75 deletions(-)

diff --git a/ml2/3_7_SVM.ipynb b/ml2/3_7_SVM.ipynb
index de69670..4886db7 100644
--- a/ml2/3_7_SVM.ipynb
+++ b/ml2/3_7_SVM.ipynb
@@ -535,13 +535,13 @@
    "source": [
     "# This step will take some time\n",
     "# Cross-validation\n",
-    "cv = KFold(n_splits=5, shuffle=False, random_state=33)\n",
+    "cv = KFold(n_splits=5, shuffle=True, random_state=33)\n",
     "# StratifiedKFold is a variation of k-fold which returns stratified folds:\n",
     "# each set contains approximately the same percentage of samples of each target class as the complete set.\n",
-    "#cv = StratifiedKFold(y, n_folds=3, shuffle=False, random_state=33)\n",
+    "#cv = StratifiedKFold(y, n_folds=3, shuffle=True, random_state=33)\n",
     "scores = cross_val_score(model, X, y, cv=cv)\n",
     "print(\"Scores in every iteration\", scores)\n",
-    "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))\n"
+    "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))"
    ]
   },
   {
@@ -644,7 +644,7 @@
    "source": [
     "* [Titanic Machine Learning from Disaster](https://www.kaggle.com/c/titanic/forums/t/5105/ipython-notebook-tutorial-for-titanic-machine-learning-from-disaster)\n",
     "* [API SVC scikit-learn](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)\n",
-    "* [Better evaluation of classification models](http://blog.kaggle.com/2015/10/23/scikit-learn-video-9-better-evaluation-of-classification-models/)"
+    "* [How to choose the right metric for evaluating an ML model](https://www.kaggle.com/vipulgandhi/how-to-choose-right-metric-for-evaluating-ml-model)"
    ]
   },
   {
@@ -666,7 +666,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -680,7 +680,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.1"
+   "version": "3.8.12"
   },
   "latex_envs": {
    "LaTeX_envs_menu_present": true,
diff --git a/ml2/plot_learning_curve.py b/ml2/plot_learning_curve.py
index 7bcc318..25f43d8 100644
--- a/ml2/plot_learning_curve.py
+++ b/ml2/plot_learning_curve.py
@@ -1,21 +1,21 @@
 """
-Taken from http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
-
 ========================
 Plotting Learning Curves
 ========================
+In the first column, first row, the learning curve of a naive Bayes classifier
+is shown for the digits dataset. Note that the training score and the
+cross-validation score are both not very good at the end. However, the shape
+of the curve can be found in more complex datasets very often: the training
+score is very high at the beginning and decreases, and the cross-validation
+score is very low at the beginning and increases. In the second column, first
+row, we see the learning curve of an SVM with RBF kernel. We can see clearly
+that the training score is still around the maximum and the validation score
+could be increased with more training samples. The plots in the second row
+show the times required by the models to train with various sizes of the
+training dataset. The plots in the third row show how the cross-validation
+score varies with the time required to train the models.
-On the left side the learning curve of a naive Bayes classifier is shown for
-the digits dataset. Note that the training score and the cross-validation score
-are both not very good at the end. However, the shape of the curve can be found
-in more complex datasets very often: the training score is very high at the
-beginning and decreases and the cross-validation score is very low at the
-beginning and increases. On the right side we see the learning curve of an SVM
-with RBF kernel. We can see clearly that the training score is still around
-the maximum and the validation score could be increased with more training
-samples.
 """
-#print(__doc__)

 import numpy as np
 import matplotlib.pyplot as plt
@@ -23,86 +23,181 @@ from sklearn.naive_bayes import GaussianNB
 from sklearn.svm import SVC
 from sklearn.datasets import load_digits
 from sklearn.model_selection import learning_curve
-
-
-def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
-                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
+from sklearn.model_selection import ShuffleSplit
+
+
+def plot_learning_curve(
+    estimator,
+    title,
+    X,
+    y,
+    axes=None,
+    ylim=None,
+    cv=None,
+    n_jobs=None,
+    train_sizes=np.linspace(0.1, 1.0, 5),
+):
     """
-    Generate a simple plot of the test and traning learning curve.
+    Generate 3 plots: the test and training learning curve, the training
+    samples vs fit times curve, the fit times vs score curve.

     Parameters
     ----------
-    estimator : object type that implements the "fit" and "predict" methods
-        An object of that type which is cloned for each validation.
+    estimator : estimator instance
+        An estimator instance implementing `fit` and `predict` methods which
+        will be cloned for each validation.

-    title : string
+    title : str
         Title for the chart.

-    X : array-like, shape (n_samples, n_features)
-        Training vector, where n_samples is the number of samples and
-        n_features is the number of features.
+    X : array-like of shape (n_samples, n_features)
+        Training vector, where ``n_samples`` is the number of samples and
+        ``n_features`` is the number of features.

-    y : array-like, shape (n_samples) or (n_samples, n_features), optional
-        Target relative to X for classification or regression;
+    y : array-like of shape (n_samples) or (n_samples, n_features)
+        Target relative to ``X`` for classification or regression;
         None for unsupervised learning.

-    ylim : tuple, shape (ymin, ymax), optional
-        Defines minimum and maximum yvalues plotted.
-
-    cv : integer, cross-validation generator, optional
-        If an integer is passed, it is the number of folds (defaults to 3).
-        Specific cross-validation objects can be passed, see
-        sklearn.model_selection module for the list of possible objects
-
-    n_jobs : integer, optional
-        Number of jobs to run in parallel (default 1).
+    axes : array-like of shape (3,), default=None
+        Axes to use for plotting the curves.
+
+    ylim : tuple of shape (2,), default=None
+        Defines minimum and maximum y-values plotted, e.g. (ymin, ymax).
+
+    cv : int, cross-validation generator or an iterable, default=None
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+
+        - None, to use the default 5-fold cross-validation,
+        - integer, to specify the number of folds.
+        - :term:`CV splitter`,
+        - An iterable yielding (train, test) splits as arrays of indices.
+
+        For integer/None inputs, if ``y`` is binary or multiclass,
+        :class:`StratifiedKFold` is used. If the estimator is not a classifier
+        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
+
+        Refer to the :ref:`User Guide <cross_validation>` for the various
+        cross-validators that can be used here.
+
+    n_jobs : int or None, default=None
+        Number of jobs to run in parallel.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    train_sizes : array-like of shape (n_ticks,)
+        Relative or absolute numbers of training examples that will be used to
+        generate the learning curve. If the ``dtype`` is float, it is regarded
+        as a fraction of the maximum size of the training set (that is
+        determined by the selected validation method), i.e. it has to be within
+        (0, 1]. Otherwise it is interpreted as absolute sizes of the training
+        sets. Note that for classification the number of samples usually has
+        to be big enough to contain at least one sample from each class.
+        (default: np.linspace(0.1, 1.0, 5))
     """
-    plt.figure()
-    plt.title(title)
+    if axes is None:
+        _, axes = plt.subplots(1, 3, figsize=(20, 5))
+
+    axes[0].set_title(title)
     if ylim is not None:
-        plt.ylim(*ylim)
-    plt.xlabel("Training examples")
-    plt.ylabel("Score")
-    train_sizes, train_scores, test_scores = learning_curve(
-        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
+        axes[0].set_ylim(*ylim)
+    axes[0].set_xlabel("Training examples")
+    axes[0].set_ylabel("Score")
+
+    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
+        estimator,
+        X,
+        y,
+        cv=cv,
+        n_jobs=n_jobs,
+        train_sizes=train_sizes,
+        return_times=True,
+    )
     train_scores_mean = np.mean(train_scores, axis=1)
     train_scores_std = np.std(train_scores, axis=1)
     test_scores_mean = np.mean(test_scores, axis=1)
     test_scores_std = np.std(test_scores, axis=1)
-    plt.grid()
-
-    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
-                     train_scores_mean + train_scores_std, alpha=0.1,
-                     color="r")
-    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
-                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
-    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
-             label="Training score")
-    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
-             label="Cross-validation score")
-
-    plt.legend(loc="best")
+    fit_times_mean = np.mean(fit_times, axis=1)
+    fit_times_std = np.std(fit_times, axis=1)
+
+    # Plot learning curve
+    axes[0].grid()
+    axes[0].fill_between(
+        train_sizes,
+        train_scores_mean - train_scores_std,
+        train_scores_mean + train_scores_std,
+        alpha=0.1,
+        color="r",
+    )
+    axes[0].fill_between(
+        train_sizes,
+        test_scores_mean - test_scores_std,
+        test_scores_mean + test_scores_std,
+        alpha=0.1,
+        color="g",
+    )
+    axes[0].plot(
+        train_sizes, train_scores_mean, "o-", color="r", label="Training score"
+    )
+    axes[0].plot(
+        train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score"
+    )
+    axes[0].legend(loc="best")
+
+    # Plot n_samples vs fit_times
+    axes[1].grid()
+    axes[1].plot(train_sizes, fit_times_mean, "o-")
+    axes[1].fill_between(
+        train_sizes,
+        fit_times_mean - fit_times_std,
+        fit_times_mean + fit_times_std,
+        alpha=0.1,
+    )
+    axes[1].set_xlabel("Training examples")
+    axes[1].set_ylabel("fit_times")
+    axes[1].set_title("Scalability of the model")
+
+    # Plot fit_time vs score
+    fit_time_argsort = fit_times_mean.argsort()
+    fit_time_sorted = fit_times_mean[fit_time_argsort]
+    test_scores_mean_sorted = test_scores_mean[fit_time_argsort]
+    test_scores_std_sorted = test_scores_std[fit_time_argsort]
+    axes[2].grid()
+    axes[2].plot(fit_time_sorted, test_scores_mean_sorted, "o-")
+    axes[2].fill_between(
+        fit_time_sorted,
+        test_scores_mean_sorted - test_scores_std_sorted,
+        test_scores_mean_sorted + test_scores_std_sorted,
+        alpha=0.1,
+    )
+    axes[2].set_xlabel("fit_times")
+    axes[2].set_ylabel("Score")
+    axes[2].set_title("Performance of the model")
+
     return plt


-#digits = load_digits()
-#X, y = digits.data, digits.target
+fig, axes = plt.subplots(3, 2, figsize=(10, 15))
+X, y = load_digits(return_X_y=True)

-#title = "Learning Curves (Naive Bayes)"
-# Cross validation with 100 iterations to get smoother mean test and train
+title = "Learning Curves (Naive Bayes)"
+# Cross validation with 50 iterations to get smoother mean test and train
 # score curves, each time with 20% data randomly selected as a validation set.
-#cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=100,
-#                                   test_size=0.2, random_state=0)
+cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)

-#estimator = GaussianNB()
-#plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4)
+estimator = GaussianNB()
+plot_learning_curve(
+    estimator, title, X, y, axes=axes[:, 0], ylim=(0.7, 1.01), cv=cv, n_jobs=4
+)

-#title = "Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
+title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
 # SVC is more expensive so we do a lower number of CV iterations:
-#cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=10,
-#                                   test_size=0.2, random_state=0)
+cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
-#estimator = SVC(gamma=0.001)
-#plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)
+estimator = SVC(gamma=0.001)
+plot_learning_curve(
+    estimator, title, X, y, axes=axes[:, 1], ylim=(0.7, 1.01), cv=cv, n_jobs=4
+)

-#plt.show()
+plt.show()
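Usage note (not part of the patch): a minimal sketch of how the updated plot_learning_curve helper could be driven outside the digits demo, for example from the notebook's SVM workflow, reusing the shuffled KFold introduced in the notebook cell. The synthetic dataset, the SVC hyperparameters, and the import path (assuming the ml2/ directory is the working directory) are illustrative assumptions, not taken from the repository.

# Sketch only: call the patched plot_learning_curve with a shuffled KFold.
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.svm import SVC

from plot_learning_curve import plot_learning_curve  # assumes ml2/ is the cwd

# Stand-in data and model for the notebook's feature matrix, target and SVC.
X, y = make_classification(n_samples=600, n_features=10, random_state=33)
model = SVC(kernel="rbf", gamma="scale")  # assumed hyperparameters

# Shuffling matters when the rows are ordered; random_state keeps folds reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=33)

# One row of three panels: learning curve, scalability, performance.
fig, axes = plt.subplots(1, 3, figsize=(20, 5))
plot_learning_curve(model, "Learning Curves (SVC)", X, y,
                    axes=axes, ylim=(0.5, 1.01), cv=cv, n_jobs=2)
plt.show()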
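A related note on the commented-out StratifiedKFold line in the notebook cell: it still uses the old pre-0.18 signature (passing y and n_folds to the constructor). A brief sketch of the modern equivalent, with arbitrary stand-in data and model, follows.

# Sketch only: StratifiedKFold with the current scikit-learn API. The splitter
# is built without y; the labels are supplied at split time (here via
# cross_val_score), and each fold keeps roughly the original class proportions.
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC

X, y = make_classification(n_samples=300, random_state=33)  # stand-in data
model = SVC(kernel="rbf", gamma="scale")                    # stand-in model

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=33)
scores = cross_val_score(model, X, y, cv=cv)
print("Scores in every iteration", scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))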