mirror of
https://github.com/balkian/gists.git
synced 2024-11-25 02:32:28 +00:00
3279606988
git-subtree-dir: repos/004008aea84ab19b153b4cecd40e1461 git-subtree-mainline:49aeda804b
git-subtree-split:d05c48d51b
570 lines
13 KiB
Plaintext
570 lines
13 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"This notebook provides a tutorial on how to use the library."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"%load_ext autoreload"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"%autoreload 2"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import logging\n",
|
|
"\n",
|
|
"logging.basicConfig(level=logging.DEBUG)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Datasets"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Dataset management is made simple. You can view the available datasets:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"- sentiment140:\n",
|
|
" \t Downloaded: True\n",
|
|
" \t # instances: 1600000\n",
|
|
"\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from gsitk.datasets.datasets import DatasetManager\n",
|
|
"\n",
|
|
"dm = DatasetManager()\n",
|
|
"dm.view_datasets()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Preparing the data:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"DEBUG:gsitk.datasets.datasets:Preparing data: sentiment140\n",
|
|
"DEBUG:gsitk.datasets.utils:Checking data path: /data/sentiment140\n",
|
|
"DEBUG:gsitk.datasets.utils:Verified: trainingandtestdata.zip\n",
|
|
"DEBUG:gsitk.datasets.datasets:sentiment140 data is ready\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"data = dm.prepare_datasets()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"dict_keys(['sentiment140'])"
|
|
]
|
|
},
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data.keys()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Data is a simple pandas DataFrame."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>polarity</th>\n",
|
|
" <th>text</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>-1</td>\n",
|
|
" <td>['user', 'url', 'aw', 'elong', ',', 'thats', '...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>-1</td>\n",
|
|
" <td>['is', 'upset', 'that', 'he', 'cant', 'update'...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>-1</td>\n",
|
|
" <td>['user', 'i', 'dived', 'many', 'times', 'for',...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>-1</td>\n",
|
|
" <td>['my', 'whole', 'body', 'feels', 'itchy', 'and...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>-1</td>\n",
|
|
" <td>['user', 'no', ',', 'its', 'not', 'behaving', ...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" polarity text\n",
|
|
"0 -1 ['user', 'url', 'aw', 'elong', ',', 'thats', '...\n",
|
|
"1 -1 ['is', 'upset', 'that', 'he', 'cant', 'update'...\n",
|
|
"2 -1 ['user', 'i', 'dived', 'many', 'times', 'for',...\n",
|
|
"3 -1 ['my', 'whole', 'body', 'feels', 'itchy', 'and...\n",
|
|
"4 -1 ['user', 'no', ',', 'its', 'not', 'behaving', ..."
|
|
]
|
|
},
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data['sentiment140'].head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Preprocessing"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Features"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"To use a word2vec model as a feature extractor, write:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"INFO:gensim.utils:loading Word2Vec object from /data/w2vmodel_500d_5mc\n",
|
|
"INFO:gensim.utils:loading syn0 from /data/w2vmodel_500d_5mc.syn0.npy with mmap=None\n",
|
|
"INFO:gensim.utils:loading syn1 from /data/w2vmodel_500d_5mc.syn1.npy with mmap=None\n",
|
|
"INFO:gensim.utils:setting ignored attribute syn0norm to None\n",
|
|
"INFO:gensim.utils:setting ignored attribute cum_table to None\n",
|
|
"INFO:gensim.utils:loaded /data/w2vmodel_500d_5mc\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from gsitk.features.word2vec import Word2VecFeatures\n",
|
|
"\n",
|
|
"w2v_feat = Word2VecFeatures(w2v_model_path='/data/w2vmodel_500d_5mc')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Features are extracted with the `transform` method. All feature extractors implement `transform`."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 48,
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(1600000, 500)"
|
|
]
|
|
},
|
|
"execution_count": 48,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"transformed = w2v_feat.transform(data['sentiment140']['text'].values)\n",
|
|
"transformed.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"If extracting the features is time-consuming, you can save the features locally:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 59,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from gsitk.features import features\n",
|
|
"\n",
|
|
"features.save_features(transformed, 'w2v__sentiment40')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"And you can load them later:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"DEBUG:gsitk.features.utils:Reading features from w2v__sentiment\n",
|
|
"DEBUG:gsitk.features.utils:Features are in /data/features/w2v__sentiment40.npy\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"array([[-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
|
|
" 0.00937027, 0.21814214],\n",
|
|
" [-0.06142361, -0.03791333, 0.18094143, ..., 0.00306141,\n",
|
|
" 0.08196757, 0.02467711],\n",
|
|
" [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
|
|
" 0.00937027, 0.21814214],\n",
|
|
" ..., \n",
|
|
" [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
|
|
" 0.00937027, 0.21814214],\n",
|
|
" [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
|
|
" 0.00937027, 0.21814214],\n",
|
|
" [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
|
|
" 0.00937027, 0.21814214]])"
|
|
]
|
|
},
|
|
"execution_count": 29,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from gsitk.features import utils\n",
"\n",
"utils.load_features('w2v__sentiment40')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Pipes and Evaluation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"The evaluation process uses pipes. Pipes are a way of organizing the different elements of the evaluation. Pipes are represented by EvalTuples, which specify the datasets, features and classifiers we want to evaluate."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"If we want to include a classifier in our evaluation:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 49,
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,\n",
|
|
" eta0=0.0, fit_intercept=True, l1_ratio=0.15,\n",
|
|
" learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=-1,\n",
|
|
" penalty='l2', power_t=0.5, random_state=None, shuffle=True,\n",
|
|
" verbose=0, warm_start=False)"
|
|
]
|
|
},
|
|
"execution_count": 49,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from gsitk.pipe import Model, Features, EvalTuple\n",
|
|
"from sklearn.linear_model import SGDClassifier\n",
|
|
"\n",
|
|
"sgd = SGDClassifier(n_jobs=-1)\n",
|
|
"sgd.fit(transformed, data['sentiment140']['polarity'].values)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 50,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"models = [Model(name='sgd', classifier=sgd)]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Including features:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 51,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"feats = [Features(name='w2v__sentiment140', dataset='sentiment140', values=transformed)]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Putting them together:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 52,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"ets = [EvalTuple(classifier='sgd', features='w2v__sentiment140', labels='sentiment140')]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Running the evaluation:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 57,
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from gsitk.evaluation.evaluation import Evaluation\n",
|
|
"\n",
|
|
"ev = Evaluation(datasets=data, features=feats, models=models, tuples=ets)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 58,
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"DEBUG:gsitk.evaluation.evaluation:Model sgd predicting from features w2v__sentiment140\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Dataset</th>\n",
|
|
" <th>Features</th>\n",
|
|
" <th>Model</th>\n",
|
|
" <th>Accuracy</th>\n",
|
|
" <th>Precision</th>\n",
|
|
" <th>Recall</th>\n",
|
|
" <th>F1-Score</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>sentiment140</td>\n",
|
|
" <td>w2v__sentiment140</td>\n",
|
|
" <td>sgd</td>\n",
|
|
" <td>0.589035</td>\n",
|
|
" <td>0.577554</td>\n",
|
|
" <td>0.663056</td>\n",
|
|
" <td>0.617359</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Dataset Features Model Accuracy Precision Recall \\\n",
|
|
"0 sentiment140 w2v__sentiment140 sgd 0.589035 0.577554 0.663056 \n",
|
|
"\n",
|
|
" F1-Score \n",
|
|
"0 0.617359 "
|
|
]
|
|
},
|
|
"execution_count": 58,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"ev.evaluate()\n",
|
|
"ev.results"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.5.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|