1
0
mirror of https://github.com/gsi-upm/sitc synced 2025-08-24 02:22:21 +00:00

Remove outputs and metadata

This commit is contained in:
J. Fernando Sánchez
2019-02-28 15:30:33 +01:00
parent a1be167cc0
commit c1d3ca38ea
25 changed files with 989 additions and 14268 deletions

View File

@@ -123,183 +123,9 @@
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>essay_id</th>\n",
" <th>essay_set</th>\n",
" <th>essay</th>\n",
" <th>rater1_domain1</th>\n",
" <th>rater2_domain1</th>\n",
" <th>rater3_domain1</th>\n",
" <th>domain1_score</th>\n",
" <th>rater1_domain2</th>\n",
" <th>rater2_domain2</th>\n",
" <th>domain2_score</th>\n",
" <th>...</th>\n",
" <th>rater2_trait3</th>\n",
" <th>rater2_trait4</th>\n",
" <th>rater2_trait5</th>\n",
" <th>rater2_trait6</th>\n",
" <th>rater3_trait1</th>\n",
" <th>rater3_trait2</th>\n",
" <th>rater3_trait3</th>\n",
" <th>rater3_trait4</th>\n",
" <th>rater3_trait5</th>\n",
" <th>rater3_trait6</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Dear local newspaper, I think effects computer...</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>8</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>Dear @CAPS1 @CAPS2, I believe that using compu...</td>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>9</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>7</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>Dear Local Newspaper, @CAPS1 I have found that...</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>NaN</td>\n",
" <td>10</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4 rows × 28 columns</p>\n",
"</div>"
],
"text/plain": [
" essay_id essay_set essay \\\n",
"0 1 1 Dear local newspaper, I think effects computer... \n",
"1 2 1 Dear @CAPS1 @CAPS2, I believe that using compu... \n",
"2 3 1 Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl... \n",
"3 4 1 Dear Local Newspaper, @CAPS1 I have found that... \n",
"\n",
" rater1_domain1 rater2_domain1 rater3_domain1 domain1_score \\\n",
"0 4 4 NaN 8 \n",
"1 5 4 NaN 9 \n",
"2 4 3 NaN 7 \n",
"3 5 5 NaN 10 \n",
"\n",
" rater1_domain2 rater2_domain2 domain2_score ... \\\n",
"0 NaN NaN NaN ... \n",
"1 NaN NaN NaN ... \n",
"2 NaN NaN NaN ... \n",
"3 NaN NaN NaN ... \n",
"\n",
" rater2_trait3 rater2_trait4 rater2_trait5 rater2_trait6 rater3_trait1 \\\n",
"0 NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN NaN \n",
"\n",
" rater3_trait2 rater3_trait3 rater3_trait4 rater3_trait5 rater3_trait6 \n",
"0 NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN NaN \n",
"\n",
"[4 rows x 28 columns]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
@@ -311,44 +137,18 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(12976, 28)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_orig.shape"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(1783, 3)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We filter the data of the essay_set number 1, and we keep only two columns for this \n",
"# example\n",
@@ -359,83 +159,17 @@
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>essay_id</th>\n",
" <th>essay</th>\n",
" <th>domain1_score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Dear local newspaper, I think effects computer...</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Dear @CAPS1 @CAPS2, I believe that using compu...</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Dear Local Newspaper, @CAPS1 I have found that...</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Dear @LOCATION1, I know having computers has a...</td>\n",
" <td>8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" essay_id essay domain1_score\n",
"0 1 Dear local newspaper, I think effects computer... 8\n",
"1 2 Dear @CAPS1 @CAPS2, I believe that using compu... 9\n",
"2 3 Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl... 7\n",
"3 4 Dear Local Newspaper, @CAPS1 I have found that... 10\n",
"4 5 Dear @LOCATION1, I know having computers has a... 8"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[0:5]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Define X and Y\n",
@@ -468,10 +202,8 @@
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Generic Transformer \n",
@@ -509,10 +241,8 @@
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sample of statistics using nltk\n",
@@ -541,10 +271,8 @@
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
@@ -581,10 +309,8 @@
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
@@ -635,10 +361,8 @@
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.pipeline import Pipeline, FeatureUnion\n",
@@ -674,23 +398,12 @@
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Scores in every iteration [ 0.39798206 0.27497194]\n",
"Accuracy: 0.34 (+/- 0.12)\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.cross_validation import cross_val_score, KFold\n",
"from sklearn.model_selection import cross_val_score, KFold\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.feature_extraction import DictVectorizer\n",
"from sklearn.preprocessing import FunctionTransformer\n",
@@ -726,7 +439,7 @@
"\n",
"# Using KFold validation\n",
"\n",
"cv = KFold(X.shape[0], 2, shuffle=True, random_state=33)\n",
"cv = KFold(2, shuffle=True, random_state=33)\n",
"scores = cross_val_score(pipeline, X, y, cv=cv)\n",
"print(\"Scores in every iteration\", scores)\n",
"print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))"
@@ -734,9 +447,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"metadata": {},
"source": [
"The result is not very good :(."
]
@@ -789,9 +500,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 0
"nbformat_minor": 1
}