mirror of
				https://github.com/gsi-upm/sitc
				synced 2025-10-25 20:58:19 +00:00 
			
		
		
		
	Updated visualization
This commit is contained in:
		| @@ -535,13 +535,13 @@ | ||||
|    "source": [ | ||||
|     "# This step will take some time\n", | ||||
|     "# Cross-validation\n", | ||||
|     "cv = KFold(n_splits=5, shuffle=False, random_state=33)\n", | ||||
|     "cv = KFold(n_splits=5, shuffle=True, random_state=33)\n", | ||||
|     "# StratifiedKFold is a variation of k-fold which returns stratified folds:\n", | ||||
|     "# each set contains approximately the same percentage of samples of each target class as the complete set.\n", | ||||
|     "#cv = StratifiedKFold(y, n_folds=3, shuffle=False, random_state=33)\n", | ||||
|     "#cv = StratifiedKFold(y, n_folds=3, shuffle=True, random_state=33)\n", | ||||
|     "scores = cross_val_score(model, X, y, cv=cv)\n", | ||||
|     "print(\"Scores in every iteration\", scores)\n", | ||||
|     "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))\n" | ||||
|     "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
| @@ -644,7 +644,7 @@ | ||||
|    "source": [ | ||||
|     "* [Titanic Machine Learning from Disaster](https://www.kaggle.com/c/titanic/forums/t/5105/ipython-notebook-tutorial-for-titanic-machine-learning-from-disaster)\n", | ||||
|     "* [API SVC scikit-learn](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)\n", | ||||
|     "* [Better evaluation of classification models](http://blog.kaggle.com/2015/10/23/scikit-learn-video-9-better-evaluation-of-classification-models/)" | ||||
|     "* [How to choose the right metric for evaluating an ML model](https://www.kaggle.com/vipulgandhi/how-to-choose-right-metric-for-evaluating-ml-model)" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
| @@ -666,7 +666,7 @@ | ||||
|  ], | ||||
|  "metadata": { | ||||
|   "kernelspec": { | ||||
|    "display_name": "Python 3", | ||||
|    "display_name": "Python 3 (ipykernel)", | ||||
|    "language": "python", | ||||
|    "name": "python3" | ||||
|   }, | ||||
| @@ -680,7 +680,7 @@ | ||||
|    "name": "python", | ||||
|    "nbconvert_exporter": "python", | ||||
|    "pygments_lexer": "ipython3", | ||||
|    "version": "3.7.1" | ||||
|    "version": "3.8.12" | ||||
|   }, | ||||
|   "latex_envs": { | ||||
|    "LaTeX_envs_menu_present": true, | ||||
|   | ||||
| @@ -1,21 +1,21 @@ | ||||
| """ | ||||
| Taken from http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html | ||||
|  | ||||
| ======================== | ||||
| Plotting Learning Curves | ||||
| ======================== | ||||
| In the first column, first row the learning curve of a naive Bayes classifier | ||||
| is shown for the digits dataset. Note that the training score and the | ||||
| cross-validation score are both not very good at the end. However, the shape | ||||
| of the curve can be found in more complex datasets very often: the training | ||||
| score is very high at the beginning and decreases and the cross-validation | ||||
| score is very low at the beginning and increases. In the second column, first | ||||
| row we see the learning curve of an SVM with RBF kernel. We can see clearly | ||||
| that the training score is still around the maximum and the validation score | ||||
| could be increased with more training samples. The plots in the second row | ||||
| show the times required by the models to train with various sizes of training | ||||
| dataset. The plots in the third row show how much time was required to train | ||||
| the models for each training size. | ||||
|  | ||||
| On the left side the learning curve of a naive Bayes classifier is shown for | ||||
| the digits dataset. Note that the training score and the cross-validation score | ||||
| are both not very good at the end. However, the shape of the curve can be found | ||||
| in more complex datasets very often: the training score is very high at the | ||||
| beginning and decreases and the cross-validation score is very low at the | ||||
| beginning and increases. On the right side we see the learning curve of an SVM | ||||
| with RBF kernel. We can see clearly that the training score is still around | ||||
| the maximum and the validation score could be increased with more training | ||||
| samples. | ||||
| """ | ||||
| #print(__doc__) | ||||
|  | ||||
| import numpy as np | ||||
| import matplotlib.pyplot as plt | ||||
| @@ -23,86 +23,181 @@ from sklearn.naive_bayes import GaussianNB | ||||
| from sklearn.svm import SVC | ||||
| from sklearn.datasets import load_digits | ||||
| from sklearn.model_selection import learning_curve | ||||
| from sklearn.model_selection import ShuffleSplit | ||||
|  | ||||
|  | ||||
| def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, | ||||
|                         n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)): | ||||
| def plot_learning_curve( | ||||
|     estimator, | ||||
|     title, | ||||
|     X, | ||||
|     y, | ||||
|     axes=None, | ||||
|     ylim=None, | ||||
|     cv=None, | ||||
|     n_jobs=None, | ||||
|     train_sizes=np.linspace(0.1, 1.0, 5), | ||||
| ): | ||||
|     """ | ||||
|     Generate a simple plot of the test and training learning curve. | ||||
|     Generate 3 plots: the test and training learning curve, the training | ||||
|     samples vs fit times curve, the fit times vs score curve. | ||||
|  | ||||
|     Parameters | ||||
|     ---------- | ||||
|     estimator : object type that implements the "fit" and "predict" methods | ||||
|         An object of that type which is cloned for each validation. | ||||
|     estimator : estimator instance | ||||
|         An estimator instance implementing `fit` and `predict` methods which | ||||
|         will be cloned for each validation. | ||||
|  | ||||
|     title : string | ||||
|     title : str | ||||
|         Title for the chart. | ||||
|  | ||||
|     X : array-like, shape (n_samples, n_features) | ||||
|         Training vector, where n_samples is the number of samples and | ||||
|         n_features is the number of features. | ||||
|     X : array-like of shape (n_samples, n_features) | ||||
|         Training vector, where ``n_samples`` is the number of samples and | ||||
|         ``n_features`` is the number of features. | ||||
|  | ||||
|     y : array-like, shape (n_samples) or (n_samples, n_features), optional | ||||
|         Target relative to X for classification or regression; | ||||
|     y : array-like of shape (n_samples) or (n_samples, n_features) | ||||
|         Target relative to ``X`` for classification or regression; | ||||
|         None for unsupervised learning. | ||||
|  | ||||
|     ylim : tuple, shape (ymin, ymax), optional | ||||
|         Defines minimum and maximum yvalues plotted. | ||||
|     axes : array-like of shape (3,), default=None | ||||
|         Axes to use for plotting the curves. | ||||
|  | ||||
|     cv : integer, cross-validation generator, optional | ||||
|         If an integer is passed, it is the number of folds (defaults to 3). | ||||
|         Specific cross-validation objects can be passed, see | ||||
|         sklearn.model_selection module for the list of possible objects | ||||
|     ylim : tuple of shape (2,), default=None | ||||
|         Defines minimum and maximum y-values plotted, e.g. (ymin, ymax). | ||||
|  | ||||
|     n_jobs : integer, optional | ||||
|         Number of jobs to run in parallel (default 1). | ||||
|     cv : int, cross-validation generator or an iterable, default=None | ||||
|         Determines the cross-validation splitting strategy. | ||||
|         Possible inputs for cv are: | ||||
|  | ||||
|           - None, to use the default 5-fold cross-validation, | ||||
|           - integer, to specify the number of folds. | ||||
|           - :term:`CV splitter`, | ||||
|           - An iterable yielding (train, test) splits as arrays of indices. | ||||
|  | ||||
|         For integer/None inputs, if ``y`` is binary or multiclass, | ||||
|         :class:`StratifiedKFold` is used. If the estimator is not a classifier | ||||
|         or if ``y`` is neither binary nor multiclass, :class:`KFold` is used. | ||||
|  | ||||
|         Refer to the :ref:`User Guide <cross_validation>` for the various | ||||
|         cross-validators that can be used here. | ||||
|  | ||||
|     n_jobs : int or None, default=None | ||||
|         Number of jobs to run in parallel. | ||||
|         ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. | ||||
|         ``-1`` means using all processors. See :term:`Glossary <n_jobs>` | ||||
|         for more details. | ||||
|  | ||||
|     train_sizes : array-like of shape (n_ticks,) | ||||
|         Relative or absolute numbers of training examples that will be used to | ||||
|         generate the learning curve. If the ``dtype`` is float, it is regarded | ||||
|         as a fraction of the maximum size of the training set (that is | ||||
|         determined by the selected validation method), i.e. it has to be within | ||||
|         (0, 1]. Otherwise it is interpreted as absolute sizes of the training | ||||
|         sets. Note that for classification the number of samples usually has | ||||
|         to be big enough to contain at least one sample from each class. | ||||
|         (default: np.linspace(0.1, 1.0, 5)) | ||||
|     """ | ||||
|     plt.figure() | ||||
|     plt.title(title) | ||||
|     if axes is None: | ||||
|         _, axes = plt.subplots(1, 3, figsize=(20, 5)) | ||||
|  | ||||
|     axes[0].set_title(title) | ||||
|     if ylim is not None: | ||||
|         plt.ylim(*ylim) | ||||
|     plt.xlabel("Training examples") | ||||
|     plt.ylabel("Score") | ||||
|     train_sizes, train_scores, test_scores = learning_curve( | ||||
|         estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes) | ||||
|         axes[0].set_ylim(*ylim) | ||||
|     axes[0].set_xlabel("Training examples") | ||||
|     axes[0].set_ylabel("Score") | ||||
|  | ||||
|     train_sizes, train_scores, test_scores, fit_times, _ = learning_curve( | ||||
|         estimator, | ||||
|         X, | ||||
|         y, | ||||
|         cv=cv, | ||||
|         n_jobs=n_jobs, | ||||
|         train_sizes=train_sizes, | ||||
|         return_times=True, | ||||
|     ) | ||||
|     train_scores_mean = np.mean(train_scores, axis=1) | ||||
|     train_scores_std = np.std(train_scores, axis=1) | ||||
|     test_scores_mean = np.mean(test_scores, axis=1) | ||||
|     test_scores_std = np.std(test_scores, axis=1) | ||||
|     plt.grid() | ||||
|     fit_times_mean = np.mean(fit_times, axis=1) | ||||
|     fit_times_std = np.std(fit_times, axis=1) | ||||
|  | ||||
|     plt.fill_between(train_sizes, train_scores_mean - train_scores_std, | ||||
|                      train_scores_mean + train_scores_std, alpha=0.1, | ||||
|                      color="r") | ||||
|     plt.fill_between(train_sizes, test_scores_mean - test_scores_std, | ||||
|                      test_scores_mean + test_scores_std, alpha=0.1, color="g") | ||||
|     plt.plot(train_sizes, train_scores_mean, 'o-', color="r", | ||||
|              label="Training score") | ||||
|     plt.plot(train_sizes, test_scores_mean, 'o-', color="g", | ||||
|              label="Cross-validation score") | ||||
|     # Plot learning curve | ||||
|     axes[0].grid() | ||||
|     axes[0].fill_between( | ||||
|         train_sizes, | ||||
|         train_scores_mean - train_scores_std, | ||||
|         train_scores_mean + train_scores_std, | ||||
|         alpha=0.1, | ||||
|         color="r", | ||||
|     ) | ||||
|     axes[0].fill_between( | ||||
|         train_sizes, | ||||
|         test_scores_mean - test_scores_std, | ||||
|         test_scores_mean + test_scores_std, | ||||
|         alpha=0.1, | ||||
|         color="g", | ||||
|     ) | ||||
|     axes[0].plot( | ||||
|         train_sizes, train_scores_mean, "o-", color="r", label="Training score" | ||||
|     ) | ||||
|     axes[0].plot( | ||||
|         train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score" | ||||
|     ) | ||||
|     axes[0].legend(loc="best") | ||||
|  | ||||
|     # Plot n_samples vs fit_times | ||||
|     axes[1].grid() | ||||
|     axes[1].plot(train_sizes, fit_times_mean, "o-") | ||||
|     axes[1].fill_between( | ||||
|         train_sizes, | ||||
|         fit_times_mean - fit_times_std, | ||||
|         fit_times_mean + fit_times_std, | ||||
|         alpha=0.1, | ||||
|     ) | ||||
|     axes[1].set_xlabel("Training examples") | ||||
|     axes[1].set_ylabel("fit_times") | ||||
|     axes[1].set_title("Scalability of the model") | ||||
|  | ||||
|     # Plot fit_time vs score | ||||
|     fit_time_argsort = fit_times_mean.argsort() | ||||
|     fit_time_sorted = fit_times_mean[fit_time_argsort] | ||||
|     test_scores_mean_sorted = test_scores_mean[fit_time_argsort] | ||||
|     test_scores_std_sorted = test_scores_std[fit_time_argsort] | ||||
|     axes[2].grid() | ||||
|     axes[2].plot(fit_time_sorted, test_scores_mean_sorted, "o-") | ||||
|     axes[2].fill_between( | ||||
|         fit_time_sorted, | ||||
|         test_scores_mean_sorted - test_scores_std_sorted, | ||||
|         test_scores_mean_sorted + test_scores_std_sorted, | ||||
|         alpha=0.1, | ||||
|     ) | ||||
|     axes[2].set_xlabel("fit_times") | ||||
|     axes[2].set_ylabel("Score") | ||||
|     axes[2].set_title("Performance of the model") | ||||
|  | ||||
|     plt.legend(loc="best") | ||||
|     return plt | ||||
|  | ||||
|  | ||||
| #digits = load_digits() | ||||
| #X, y = digits.data, digits.target | ||||
| fig, axes = plt.subplots(3, 2, figsize=(10, 15)) | ||||
|  | ||||
| X, y = load_digits(return_X_y=True) | ||||
|  | ||||
| #title = "Learning Curves (Naive Bayes)" | ||||
| # Cross validation with 100 iterations to get smoother mean test and train | ||||
| title = "Learning Curves (Naive Bayes)" | ||||
| # Cross validation with 50 iterations to get smoother mean test and train | ||||
| # score curves, each time with 20% data randomly selected as a validation set. | ||||
| #cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=100, | ||||
| #                                   test_size=0.2, random_state=0) | ||||
| cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0) | ||||
|  | ||||
| #estimator = GaussianNB() | ||||
| #plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4) | ||||
| estimator = GaussianNB() | ||||
| plot_learning_curve( | ||||
|     estimator, title, X, y, axes=axes[:, 0], ylim=(0.7, 1.01), cv=cv, n_jobs=4 | ||||
| ) | ||||
|  | ||||
| #title = "Learning Curves (SVM, RBF kernel, $\gamma=0.001$)" | ||||
| title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)" | ||||
| # SVC is more expensive so we do a lower number of CV iterations: | ||||
| #cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=10, | ||||
| #	                                   test_size=0.2, random_state=0) | ||||
| #estimator = SVC(gamma=0.001) | ||||
| #plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4) | ||||
| cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0) | ||||
| estimator = SVC(gamma=0.001) | ||||
| plot_learning_curve( | ||||
|     estimator, title, X, y, axes=axes[:, 1], ylim=(0.7, 1.01), cv=cv, n_jobs=4 | ||||
| ) | ||||
|  | ||||
| #plt.show() | ||||
| plt.show() | ||||
|   | ||||
		Reference in New Issue
	
	Block a user