{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook provides a tutorial on how to use the `gsitk` library."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%load_ext autoreload"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import logging\n",
"\n",
"logging.basicConfig(level=logging.DEBUG)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Datasets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Dataset management is made simple. You can view the available datasets:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"- sentiment140:\n",
" \t Downloaded: True\n",
" \t # instances: 1600000\n",
"\n",
"\n"
]
}
],
"source": [
"from gsitk.datasets.datasets import DatasetManager\n",
"\n",
"dm = DatasetManager()\n",
"dm.view_datasets()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Preparing the data:"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"DEBUG:gsitk.datasets.datasets:Preparing data: sentiment140\n",
"DEBUG:gsitk.datasets.utils:Checking data path: /data/sentiment140\n",
"DEBUG:gsitk.datasets.utils:Verified: trainingandtestdata.zip\n",
"DEBUG:gsitk.datasets.datasets:sentiment140 data is ready\n"
]
}
],
"source": [
"data = dm.prepare_datasets()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['sentiment140'])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.keys()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`data` is a dictionary keyed by dataset name; each value is a simple pandas DataFrame."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>polarity</th>\n",
"      <th>text</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>-1</td>\n",
"      <td>['user', 'url', 'aw', 'elong', ',', 'thats', '...</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>-1</td>\n",
"      <td>['is', 'upset', 'that', 'he', 'cant', 'update'...</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>-1</td>\n",
"      <td>['user', 'i', 'dived', 'many', 'times', 'for',...</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>-1</td>\n",
"      <td>['my', 'whole', 'body', 'feels', 'itchy', 'and...</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>-1</td>\n",
"      <td>['user', 'no', ',', 'its', 'not', 'behaving', ...</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" polarity text\n",
"0 -1 ['user', 'url', 'aw', 'elong', ',', 'thats', '...\n",
"1 -1 ['is', 'upset', 'that', 'he', 'cant', 'update'...\n",
"2 -1 ['user', 'i', 'dived', 'many', 'times', 'for',...\n",
"3 -1 ['my', 'whole', 'body', 'feels', 'itchy', 'and...\n",
"4 -1 ['user', 'no', ',', 'its', 'not', 'behaving', ..."
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['sentiment140'].head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Preprocessing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To use a word2vec model as a feature extractor, write:"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:gensim.utils:loading Word2Vec object from /data/w2vmodel_500d_5mc\n",
"INFO:gensim.utils:loading syn0 from /data/w2vmodel_500d_5mc.syn0.npy with mmap=None\n",
"INFO:gensim.utils:loading syn1 from /data/w2vmodel_500d_5mc.syn1.npy with mmap=None\n",
"INFO:gensim.utils:setting ignored attribute syn0norm to None\n",
"INFO:gensim.utils:setting ignored attribute cum_table to None\n",
"INFO:gensim.utils:loaded /data/w2vmodel_500d_5mc\n"
]
}
],
"source": [
"from gsitk.features.word2vec import Word2VecFeatures\n",
"\n",
"w2v_feat = Word2VecFeatures(w2v_model_path='/data/w2vmodel_500d_5mc')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Features are extracted with the `transform` method. All feature extractors implement `transform`."
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(1600000, 500)"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transformed = w2v_feat.transform(data['sentiment140']['text'].values)\n",
"transformed.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If extracting the features is time-consuming, you can save the features locally:"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from gsitk.features import features\n",
"\n",
"features.save_features(transformed, 'w2v__sentiment40')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And you can load them later:"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"DEBUG:gsitk.features.utils:Reading features from w2v__sentiment\n",
"DEBUG:gsitk.features.utils:Features are in /data/features/w2v__sentiment40.npy\n"
]
},
{
"data": {
"text/plain": [
"array([[-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
" 0.00937027, 0.21814214],\n",
" [-0.06142361, -0.03791333, 0.18094143, ..., 0.00306141,\n",
" 0.08196757, 0.02467711],\n",
" [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
" 0.00937027, 0.21814214],\n",
" ..., \n",
" [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
" 0.00937027, 0.21814214],\n",
" [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
" 0.00937027, 0.21814214],\n",
" [-0.03798573, 0.03630935, 0.08243822, ..., -0.0287797 ,\n",
" 0.00937027, 0.21814214]])"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# `utils` is not imported anywhere else in this notebook; import it here so\n",
"# the cell also works after Restart Kernel -> Run All.\n",
"from gsitk.features import utils\n",
"\n",
"utils.load_features('w2v__sentiment')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pipes and Evaluation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The evaluation process uses pipes. Pipes are a way of organizing the different elements of the evaluation. Pipes are represented by `EvalTuple`s, which specify which datasets, features and classifiers we want to evaluate."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we want to include a classifier in our evaluation:"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,\n",
" eta0=0.0, fit_intercept=True, l1_ratio=0.15,\n",
" learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=-1,\n",
" penalty='l2', power_t=0.5, random_state=None, shuffle=True,\n",
" verbose=0, warm_start=False)"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from gsitk.pipe import Model, Features, EvalTuple\n",
"from sklearn.linear_model import SGDClassifier\n",
"\n",
"sgd = SGDClassifier(n_jobs=-1)\n",
"sgd.fit(transformed, data['sentiment140']['polarity'].values)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"models = [Model(name='sgd', classifier=sgd)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Including features:"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"feats = [Features(name='w2v__sentiment140', dataset='sentiment140', values=transformed)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Putting them together:"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ets = [EvalTuple(classifier='sgd', features='w2v__sentiment140', labels='sentiment140')]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Running the evaluation:"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from gsitk.evaluation.evaluation import Evaluation\n",
"\n",
"ev = Evaluation(datasets=data, features=feats, models=models, tuples=ets)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"DEBUG:gsitk.evaluation.evaluation:Model sgd predicting from features w2v__sentiment140\n"
]
},
{
"data": {
"text/html": [
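"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>Dataset</th>\n",
"      <th>Features</th>\n",
"      <th>Model</th>\n",
"      <th>Accuracy</th>\n",
"      <th>Precision</th>\n",
"      <th>Recall</th>\n",
"      <th>F1-Score</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>sentiment140</td>\n",
"      <td>w2v__sentiment140</td>\n",
"      <td>sgd</td>\n",
"      <td>0.589035</td>\n",
"      <td>0.577554</td>\n",
"      <td>0.663056</td>\n",
"      <td>0.617359</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"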
],
"text/plain": [
" Dataset Features Model Accuracy Precision Recall \\\n",
"0 sentiment140 w2v__sentiment140 sgd 0.589035 0.577554 0.663056 \n",
"\n",
" F1-Score \n",
"0 0.617359 "
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ev.evaluate()\n",
"ev.results"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}