diff --git a/repos/004008aea84ab19b153b4cecd40e1461/Demo.ipynb b/repos/004008aea84ab19b153b4cecd40e1461/Demo.ipynb
new file mode 100644
index 0000000..897a505
--- /dev/null
+++ b/repos/004008aea84ab19b153b4cecd40e1461/Demo.ipynb
@@ -0,0 +1,569 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This notebook is a tutorial on how to use the gsitk library: managing datasets, extracting features, and evaluating classifiers with pipes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "%load_ext autoreload"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import logging\n",
+ "\n",
+ "logging.basicConfig(level=logging.DEBUG)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Datasets"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Dataset management is made simple. You can view the available datasets:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "- sentiment140:\n",
+ " \t Downloaded: True\n",
+ " \t # instances: 1600000\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "from gsitk.datasets.datasets import DatasetManager\n",
+ "\n",
+ "dm = DatasetManager()\n",
+ "dm.view_datasets()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Preparing the data:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "DEBUG:gsitk.datasets.datasets:Preparing data: sentiment140\n",
+ "DEBUG:gsitk.datasets.utils:Checking data path: /data/sentiment140\n",
+ "DEBUG:gsitk.datasets.utils:Verified: trainingandtestdata.zip\n",
+ "DEBUG:gsitk.datasets.datasets:sentiment140 data is ready\n"
+ ]
+ }
+ ],
+ "source": [
+ "data = dm.prepare_datasets()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['sentiment140'])"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.keys()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`data` is a dictionary keyed by dataset name; each entry is a plain pandas DataFrame:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>polarity</th>\n",
+ "      <th>text</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>-1</td>\n",
+ "      <td>['user', 'url', 'aw', 'elong', ',', 'thats', '...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>-1</td>\n",
+ "      <td>['is', 'upset', 'that', 'he', 'cant', 'update'...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>-1</td>\n",
+ "      <td>['user', 'i', 'dived', 'many', 'times', 'for',...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>-1</td>\n",
+ "      <td>['my', 'whole', 'body', 'feels', 'itchy', 'and...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>4</th>\n",
+ "      <td>-1</td>\n",
+ "      <td>['user', 'no', ',', 'its', 'not', 'behaving', ...</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " polarity text\n",
+ "0 -1 ['user', 'url', 'aw', 'elong', ',', 'thats', '...\n",
+ "1 -1 ['is', 'upset', 'that', 'he', 'cant', 'update'...\n",
+ "2 -1 ['user', 'i', 'dived', 'many', 'times', 'for',...\n",
+ "3 -1 ['my', 'whole', 'body', 'feels', 'itchy', 'and...\n",
+ "4 -1 ['user', 'no', ',', 'its', 'not', 'behaving', ..."
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data['sentiment140'].head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Preprocessing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Features"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To use a word2vec model as a feature extractor, write:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:gensim.utils:loading Word2Vec object from /data/w2vmodel_500d_5mc\n",
+ "INFO:gensim.utils:loading syn0 from /data/w2vmodel_500d_5mc.syn0.npy with mmap=None\n",
+ "INFO:gensim.utils:loading syn1 from /data/w2vmodel_500d_5mc.syn1.npy with mmap=None\n",
+ "INFO:gensim.utils:setting ignored attribute syn0norm to None\n",
+ "INFO:gensim.utils:setting ignored attribute cum_table to None\n",
+ "INFO:gensim.utils:loaded /data/w2vmodel_500d_5mc\n"
+ ]
+ }
+ ],
+ "source": [
+ "from gsitk.features.word2vec import Word2VecFeatures\n",
+ "\n",
+ "w2v_feat = Word2VecFeatures(w2v_model_path='/data/w2vmodel_500d_5mc')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Features are extracted with the `transform` method. All feature extractors implement `transform`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1600000, 500)"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "transformed = w2v_feat.transform(data['sentiment140']['text'].values)\n",
+ "transformed.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If extracting the features is time-consuming, you can save them locally:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "from gsitk.features import features\n",
+ "\n",
+ "features.save_features(transformed, 'w2v__sentiment40')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And you can load them later:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "DEBUG:gsitk.features.utils:Reading features from w2v__sentiment\n",
+ "DEBUG:gsitk.features.utils:Features are in /data/features/w2v__sentiment40.npy\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([[-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
+ " 0.00937027, 0.21814214],\n",
+ " [-0.06142361, -0.03791333, 0.18094143, ..., 0.00306141,\n",
+ " 0.08196757, 0.02467711],\n",
+ " [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
+ " 0.00937027, 0.21814214],\n",
+ " ..., \n",
+ " [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
+ " 0.00937027, 0.21814214],\n",
+ " [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
+ " 0.00937027, 0.21814214],\n",
+ " [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
+ " 0.00937027, 0.21814214]])"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from gsitk.features import utils\n",
+ "\n",
+ "utils.load_features('w2v__sentiment')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Pipes and Evaluation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The evaluation process uses pipes. Pipes are a way of organizing the different elements of the evaluation. They are represented by EvalTuples, which specify which datasets, features and classifiers we want to evaluate."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To include a classifier in the evaluation, fit it and wrap it in a `Model`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,\n",
+ " eta0=0.0, fit_intercept=True, l1_ratio=0.15,\n",
+ " learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=-1,\n",
+ " penalty='l2', power_t=0.5, random_state=None, shuffle=True,\n",
+ " verbose=0, warm_start=False)"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from gsitk.pipe import Model, Features, EvalTuple\n",
+ "from sklearn.linear_model import SGDClassifier\n",
+ "\n",
+ "sgd = SGDClassifier(n_jobs=-1)\n",
+ "sgd.fit(transformed, data['sentiment140']['polarity'].values)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "models = [Model(name='sgd', classifier=sgd)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Including features:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "feats = [Features(name='w2v__sentiment140', dataset='sentiment140', values=transformed)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Putting them together:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "ets = [EvalTuple(classifier='sgd', features='w2v__sentiment140', labels='sentiment140')]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Running the evaluation:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "from gsitk.evaluation.evaluation import Evaluation\n",
+ "\n",
+ "ev = Evaluation(datasets=data, features=feats, models=models, tuples=ets)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "DEBUG:gsitk.evaluation.evaluation:Model sgd predicting from features w2v__sentiment140\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
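+ "<div>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>Dataset</th>\n",
+ "      <th>Features</th>\n",
+ "      <th>Model</th>\n",
+ "      <th>Accuracy</th>\n",
+ "      <th>Precision</th>\n",
+ "      <th>Recall</th>\n",
+ "      <th>F1-Score</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>sentiment140</td>\n",
+ "      <td>w2v__sentiment140</td>\n",
+ "      <td>sgd</td>\n",
+ "      <td>0.589035</td>\n",
+ "      <td>0.577554</td>\n",
+ "      <td>0.663056</td>\n",
+ "      <td>0.617359</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"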
+ ],
+ "text/plain": [
+ " Dataset Features Model Accuracy Precision Recall \\\n",
+ "0 sentiment140 w2v__sentiment140 sgd 0.589035 0.577554 0.663056 \n",
+ "\n",
+ " F1-Score \n",
+ "0 0.617359 "
+ ]
+ },
+ "execution_count": 58,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ev.evaluate()\n",
+ "ev.results"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.5.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}