{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook provides a tutorial on how to use the library." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%load_ext autoreload" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import logging\n", "\n", "logging.basicConfig(level=logging.DEBUG)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Datasets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Dataset management is made simple. You can view the available datasets:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "- sentiment140:\n", " \t Downloaded: True\n", " \t # instances: 1600000\n", "\n", "\n" ] } ], "source": [ "from gsitk.datasets.datasets import DatasetManager\n", "\n", "dm = DatasetManager()\n", "dm.view_datasets()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Preparing the data:" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "DEBUG:gsitk.datasets.datasets:Preparing data: sentiment140\n", "DEBUG:gsitk.datasets.utils:Checking data path: /data/sentiment140\n", "DEBUG:gsitk.datasets.utils:Verified: trainingandtestdata.zip\n", "DEBUG:gsitk.datasets.datasets:sentiment140 data is ready\n" ] } ], "source": [ "data = dm.prepare_datasets()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "dict_keys(['sentiment140'])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.keys()" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "Each prepared dataset is a simple pandas DataFrame." ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
polaritytext
0-1['user', 'url', 'aw', 'elong', ',', 'thats', '...
1-1['is', 'upset', 'that', 'he', 'cant', 'update'...
2-1['user', 'i', 'dived', 'many', 'times', 'for',...
3-1['my', 'whole', 'body', 'feels', 'itchy', 'and...
4-1['user', 'no', ',', 'its', 'not', 'behaving', ...
\n", "
" ], "text/plain": [ " polarity text\n", "0 -1 ['user', 'url', 'aw', 'elong', ',', 'thats', '...\n", "1 -1 ['is', 'upset', 'that', 'he', 'cant', 'update'...\n", "2 -1 ['user', 'i', 'dived', 'many', 'times', 'for',...\n", "3 -1 ['my', 'whole', 'body', 'feels', 'itchy', 'and...\n", "4 -1 ['user', 'no', ',', 'its', 'not', 'behaving', ..." ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data['sentiment140'].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Preprocessing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Features" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To use a word2vec model as a feature extractor, write:" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:gensim.utils:loading Word2Vec object from /data/w2vmodel_500d_5mc\n", "INFO:gensim.utils:loading syn0 from /data/w2vmodel_500d_5mc.syn0.npy with mmap=None\n", "INFO:gensim.utils:loading syn1 from /data/w2vmodel_500d_5mc.syn1.npy with mmap=None\n", "INFO:gensim.utils:setting ignored attribute syn0norm to None\n", "INFO:gensim.utils:setting ignored attribute cum_table to None\n", "INFO:gensim.utils:loaded /data/w2vmodel_500d_5mc\n" ] } ], "source": [ "from gsitk.features.word2vec import Word2VecFeatures\n", "\n", "w2v_feat = Word2VecFeatures(w2v_model_path='/data/w2vmodel_500d_5mc')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Features are extracted with the `transform` method. All feature extractors implement `transform`." 
] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(1600000, 500)" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transformed = w2v_feat.transform(data['sentiment140']['text'].values)\n", "transformed.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If extracting the features is time consuming, you can save the features locally:" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from gsitk.features import features\n", "\n", "features.save_features(transformed, 'w2v__sentiment40')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And you can load them later:" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "DEBUG:gsitk.features.utils:Reading features from w2v__sentiment\n", "DEBUG:gsitk.features.utils:Features are in /data/features/w2v__sentiment40.npy\n" ] }, { "data": { "text/plain": [ "array([[-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n", " 0.00937027, 0.21814214],\n", " [-0.06142361, -0.03791333, 0.18094143, ..., 0.00306141,\n", " 0.08196757, 0.02467711],\n", " [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n", " 0.00937027, 0.21814214],\n", " ..., \n", " [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n", " 0.00937027, 0.21814214],\n", " [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n", " 0.00937027, 0.21814214],\n", " [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n", " 0.00937027, 0.21814214]])" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "utils.load_features('w2v__sentiment')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Pipes and Evaluation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The evaluation 
process uses pipes. Pipes are a way of organizing the different elements of the evaluation. Pipes are represented by EvalTuples, which are a way of specifying which datasets, features and classifiers we want to evaluate." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If we want to include a classifier in our evaluation:" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,\n", " eta0=0.0, fit_intercept=True, l1_ratio=0.15,\n", " learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=-1,\n", " penalty='l2', power_t=0.5, random_state=None, shuffle=True,\n", " verbose=0, warm_start=False)" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from gsitk.pipe import Model, Features, EvalTuple\n", "from sklearn.linear_model import SGDClassifier\n", "\n", "sgd = SGDClassifier(n_jobs=-1)\n", "sgd.fit(transformed, data['sentiment140']['polarity'].values)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": true }, "outputs": [], "source": [ "models = [Model(name='sgd', classifier=sgd)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Including features:" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": true }, "outputs": [], "source": [ "feats = [Features(name='w2v__sentiment140', dataset='sentiment140', values=transformed)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Putting them together:" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": true }, "outputs": [], "source": [ "ets = [EvalTuple(classifier='sgd', features='w2v__sentiment140', labels='sentiment140')]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Running the evaluation:" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [], "source": 
[ "from gsitk.evaluation.evaluation import Evaluation\n", "\n", "ev = Evaluation(datasets=data, features=feats, models=models, tuples=ets)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "DEBUG:gsitk.evaluation.evaluation:Model sgd predicting from features w2v__sentiment140\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatasetFeaturesModelAccuracyPrecisionRecallF1-Score
0sentiment140w2v__sentiment140sgd0.5890350.5775540.6630560.617359
\n", "
" ], "text/plain": [ " Dataset Features Model Accuracy Precision Recall \\\n", "0 sentiment140 w2v__sentiment140 sgd 0.589035 0.577554 0.663056 \n", "\n", " F1-Score \n", "0 0.617359 " ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ev.evaluate()\n", "ev.results" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }