diff --git a/ml2/3_7_SVM.ipynb b/ml2/3_7_SVM.ipynb index de69670..4886db7 100644 --- a/ml2/3_7_SVM.ipynb +++ b/ml2/3_7_SVM.ipynb @@ -535,13 +535,13 @@ "source": [ "# This step will take some time\n", "# Cross-validationt\n", - "cv = KFold(n_splits=5, shuffle=False, random_state=33)\n", + "cv = KFold(n_splits=5, shuffle=True, random_state=33)\n", "# StratifiedKFold has is a variation of k-fold which returns stratified folds:\n", "# each set contains approximately the same percentage of samples of each target class as the complete set.\n", - "#cv = StratifiedKFold(y, n_folds=3, shuffle=False, random_state=33)\n", + "#cv = StratifiedKFold(y, n_folds=3, shuffle=True, random_state=33)\n", "scores = cross_val_score(model, X, y, cv=cv)\n", "print(\"Scores in every iteration\", scores)\n", - "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))\n" + "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))" ] }, { @@ -644,7 +644,7 @@ "source": [ "* [Titanic Machine Learning from Disaster](https://www.kaggle.com/c/titanic/forums/t/5105/ipython-notebook-tutorial-for-titanic-machine-learning-from-disaster)\n", "* [API SVC scikit-learn](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)\n", - "* [Better evaluation of classification models](http://blog.kaggle.com/2015/10/23/scikit-learn-video-9-better-evaluation-of-classification-models/)" + "* [How to choose the right metric for evaluating an ML model](https://www.kaggle.com/vipulgandhi/how-to-choose-right-metric-for-evaluating-ml-model)" ] }, { @@ -666,7 +666,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -680,7 +680,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.8.12" }, "latex_envs": { "LaTeX_envs_menu_present": true, diff --git a/ml2/plot_learning_curve.py 
# ml2/plot_learning_curve.py -- contents after the patch (blob 25f43d8).
"""
========================
Plotting Learning Curves
========================

Adapted from
http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

In the first column, first row the learning curve of a naive Bayes classifier
is shown for the digits dataset. Note that the training score and the
cross-validation score are both not very good at the end. However, the shape
of the curve can be found in more complex datasets very often: the training
score is very high at the beginning and decreases and the cross-validation
score is very low at the beginning and increases. In the second column, first
row we see the learning curve of an SVM with RBF kernel. We can see clearly
that the training score is still around the maximum and the validation score
could be increased with more training samples. The plots in the second row
show the times required by the models to train with various sizes of training
dataset. The plots in the third row show the cross-validation score obtained
as a function of the time required to fit the models.

The demo only runs when this file is executed as a script; importing the
module just makes ``plot_learning_curve`` available.
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(
    estimator,
    title,
    X,
    y,
    axes=None,
    ylim=None,
    cv=None,
    n_jobs=None,
    train_sizes=np.linspace(0.1, 1.0, 5),
):
    """
    Generate 3 plots: the test and training learning curve, the training
    samples vs fit times curve, the fit times vs score curve.

    Parameters
    ----------
    estimator : estimator instance
        An estimator instance implementing `fit` and `predict` methods which
        will be cloned for each validation.

    title : str
        Title for the chart (applied to the first of the three plots).

    X : array-like of shape (n_samples, n_features)
        Training vector, where ``n_samples`` is the number of samples and
        ``n_features`` is the number of features.

    y : array-like of shape (n_samples) or (n_samples, n_features)
        Target relative to ``X`` for classification or regression;
        None for unsupervised learning.

    axes : array-like of shape (3,), default=None
        Axes to use for plotting the curves. When None, a new figure with
        one row of three axes is created.

    ylim : tuple of shape (2,), default=None
        Defines minimum and maximum y-values plotted, e.g. (ymin, ymax).
        Only applied to the learning-curve plot (``axes[0]``).

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

          - None, to use the default 5-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, default=None
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like of shape (n_ticks,)
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the ``dtype`` is float, it is regarded
        as a fraction of the maximum size of the training set (that is
        determined by the selected validation method), i.e. it has to be within
        (0, 1]. Otherwise it is interpreted as absolute sizes of the training
        sets. Note that for classification the number of samples usually have
        to be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))

    Returns
    -------
    plt : module
        The ``matplotlib.pyplot`` module, returned as in the earlier
        single-plot version of this function so that code using the return
        value keeps working.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    # return_times=True adds fit and score times to the result; only the fit
    # times are used below, the score times are discarded.
    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        return_times=True,
    )
    # Aggregate over axis 1 (the cross-validation runs) so that there is one
    # mean / standard deviation per training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(
        train_sizes,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.1,
        color="r",
    )
    axes[0].fill_between(
        train_sizes,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.1,
        color="g",
    )
    axes[0].plot(
        train_sizes, train_scores_mean, "o-", color="r", label="Training score"
    )
    axes[0].plot(
        train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score"
    )
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, "o-")
    axes[1].fill_between(
        train_sizes,
        fit_times_mean - fit_times_std,
        fit_times_mean + fit_times_std,
        alpha=0.1,
    )
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score. Points are reordered by mean fit time so the
    # x-axis of the line plot is increasing.
    fit_time_argsort = fit_times_mean.argsort()
    fit_time_sorted = fit_times_mean[fit_time_argsort]
    test_scores_mean_sorted = test_scores_mean[fit_time_argsort]
    test_scores_std_sorted = test_scores_std[fit_time_argsort]
    axes[2].grid()
    axes[2].plot(fit_time_sorted, test_scores_mean_sorted, "o-")
    axes[2].fill_between(
        fit_time_sorted,
        test_scores_mean_sorted - test_scores_std_sorted,
        test_scores_mean_sorted + test_scores_std_sorted,
        alpha=0.1,
    )
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt


def main():
    """Run the digits demo: naive Bayes (left column) vs. RBF SVM (right)."""
    _, axes = plt.subplots(3, 2, figsize=(10, 15))

    X, y = load_digits(return_X_y=True)

    title = "Learning Curves (Naive Bayes)"
    # Cross validation with 50 iterations to get smoother mean test and train
    # score curves, each time with 20% data randomly selected as a validation set.
    cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)

    estimator = GaussianNB()
    plot_learning_curve(
        estimator, title, X, y, axes=axes[:, 0], ylim=(0.7, 1.01), cv=cv, n_jobs=4
    )

    title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
    # SVC is more expensive so we do a lower number of CV iterations:
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    estimator = SVC(gamma=0.001)
    plot_learning_curve(
        estimator, title, X, y, axes=axes[:, 1], ylim=(0.7, 1.01), cv=cv, n_jobs=4
    )

    plt.show()


# Guarded so that importing this module for `plot_learning_curve` does not
# run the (slow) cross-validation demo or open a plot window.
if __name__ == "__main__":
    main()