From d05c48d51bfeddba2a10541b7d8a47b664386397 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Fernando=20S=C3=A1nchez?= Date: Fri, 5 May 2017 11:50:56 +0200 Subject: [PATCH] --- Demo.ipynb | 569 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 569 insertions(+) create mode 100644 Demo.ipynb diff --git a/Demo.ipynb b/Demo.ipynb new file mode 100644 index 0000000..897a505 --- /dev/null +++ b/Demo.ipynb @@ -0,0 +1,569 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook provides a tutorial on how to use the library." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%load_ext autoreload" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "logging.basicConfig(level=logging.DEBUG)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Datasets management is made simple. 
You can view the available datasets:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "- sentiment140:\n", + " \t Downloaded: True\n", + " \t # instances: 1600000\n", + "\n", + "\n" + ] + } + ], + "source": [ + "from gsitk.datasets.datasets import DatasetManager\n", + "\n", + "dm = DatasetManager()\n", + "dm.view_datasets()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Preparing the data:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:gsitk.datasets.datasets:Preparing data: sentiment140\n", + "DEBUG:gsitk.datasets.utils:Checking data path: /data/sentiment140\n", + "DEBUG:gsitk.datasets.utils:Verified: trainingandtestdata.zip\n", + "DEBUG:gsitk.datasets.datasets:sentiment140 data is ready\n" + ] + } + ], + "source": [ + "data = dm.prepare_datasets()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['sentiment140'])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The prepared data is a dictionary keyed by dataset name; each dataset is a simple pandas DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
polaritytext
0-1['user', 'url', 'aw', 'elong', ',', 'thats', '...
1-1['is', 'upset', 'that', 'he', 'cant', 'update'...
2-1['user', 'i', 'dived', 'many', 'times', 'for',...
3-1['my', 'whole', 'body', 'feels', 'itchy', 'and...
4-1['user', 'no', ',', 'its', 'not', 'behaving', ...
\n", + "
" + ], + "text/plain": [ + " polarity text\n", + "0 -1 ['user', 'url', 'aw', 'elong', ',', 'thats', '...\n", + "1 -1 ['is', 'upset', 'that', 'he', 'cant', 'update'...\n", + "2 -1 ['user', 'i', 'dived', 'many', 'times', 'for',...\n", + "3 -1 ['my', 'whole', 'body', 'feels', 'itchy', 'and...\n", + "4 -1 ['user', 'no', ',', 'its', 'not', 'behaving', ..." + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['sentiment140'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use a word2vec model as a feature extractor, write:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:gensim.utils:loading Word2Vec object from /data/w2vmodel_500d_5mc\n", + "INFO:gensim.utils:loading syn0 from /data/w2vmodel_500d_5mc.syn0.npy with mmap=None\n", + "INFO:gensim.utils:loading syn1 from /data/w2vmodel_500d_5mc.syn1.npy with mmap=None\n", + "INFO:gensim.utils:setting ignored attribute syn0norm to None\n", + "INFO:gensim.utils:setting ignored attribute cum_table to None\n", + "INFO:gensim.utils:loaded /data/w2vmodel_500d_5mc\n" + ] + } + ], + "source": [ + "from gsitk.features.word2vec import Word2VecFeatures\n", + "\n", + "w2v_feat = Word2VecFeatures(w2v_model_path='/data/w2vmodel_500d_5mc')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Features are extracted with the `transform` method. All feature extractors implement `transform`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(1600000, 500)" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transformed = w2v_feat.transform(data['sentiment140']['text'].values)\n", + "transformed.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If extracting the features is time-consuming, you can save the features locally:" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gsitk.features import features\n", + "\n", + "features.save_features(transformed, 'w2v__sentiment40')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And you can load them later:" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:gsitk.features.utils:Reading features from w2v__sentiment\n", + "DEBUG:gsitk.features.utils:Features are in /data/features/w2v__sentiment40.npy\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n", + " 0.00937027, 0.21814214],\n", + " [-0.06142361, -0.03791333, 0.18094143, ..., 0.00306141,\n", + " 0.08196757, 0.02467711],\n", + " [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n", + " 0.00937027, 0.21814214],\n", + " ..., \n", + " [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n", + " 0.00937027, 0.21814214],\n", + " [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n", + " 0.00937027, 0.21814214],\n", + " [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n", + " 0.00937027, 0.21814214]])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"utils.load_features('w2v__sentiment')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pipes and Evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The evaluation process uses pipes. Pipes are a way of organizing the different elements of the evaluation. Pipes are represented by EvalTuples, which are a way of specifying which datasets, features and classifiers we want to evaluate." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we want to include a classifier in our evaluation:" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,\n", + " eta0=0.0, fit_intercept=True, l1_ratio=0.15,\n", + " learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=-1,\n", + " penalty='l2', power_t=0.5, random_state=None, shuffle=True,\n", + " verbose=0, warm_start=False)" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from gsitk.pipe import Model, Features, EvalTuple\n", + "from sklearn.linear_model import SGDClassifier\n", + "\n", + "sgd = SGDClassifier(n_jobs=-1)\n", + "sgd.fit(transformed, data['sentiment140']['polarity'].values)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "models = [Model(name='sgd', classifier=sgd)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Including features:" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "feats = [Features(name='w2v__sentiment140', dataset='sentiment140', values=transformed)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Putting them together:" + ] + }, + { + "cell_type": 
"code", + "execution_count": 52, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "ets = [EvalTuple(classifier='sgd', features='w2v__sentiment140', labels='sentiment140')]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running the evaluation:" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from gsitk.evaluation.evaluation import Evaluation\n", + "\n", + "ev = Evaluation(datasets=data, features=feats, models=models, tuples=ets)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:gsitk.evaluation.evaluation:Model sgd predicting from features w2v__sentiment140\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DatasetFeaturesModelAccuracyPrecisionRecallF1-Score
0sentiment140w2v__sentiment140sgd0.5890350.5775540.6630560.617359
\n", + "
" + ], + "text/plain": [ + " Dataset Features Model Accuracy Precision Recall \\\n", + "0 sentiment140 w2v__sentiment140 sgd 0.589035 0.577554 0.663056 \n", + "\n", + " F1-Score \n", + "0 0.617359 " + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ev.evaluate()\n", + "ev.results" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}