Remove outputs and metadata

2026-02-08 23:58:17 +00:00 · 2019-02-28 15:30:33 +01:00
parent a1be167cc0
commit c1d3ca38ea
25 changed files with 989 additions and 14268 deletions
--- a/nlp/4_6_Combining_Features.ipynb
+++ b/nlp/4_6_Combining_Features.ipynb
@@ -123,183 +123,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>essay_id</th>\n",
-       "      <th>essay_set</th>\n",
-       "      <th>essay</th>\n",
-       "      <th>rater1_domain1</th>\n",
-       "      <th>rater2_domain1</th>\n",
-       "      <th>rater3_domain1</th>\n",
-       "      <th>domain1_score</th>\n",
-       "      <th>rater1_domain2</th>\n",
-       "      <th>rater2_domain2</th>\n",
-       "      <th>domain2_score</th>\n",
-       "      <th>...</th>\n",
-       "      <th>rater2_trait3</th>\n",
-       "      <th>rater2_trait4</th>\n",
-       "      <th>rater2_trait5</th>\n",
-       "      <th>rater2_trait6</th>\n",
-       "      <th>rater3_trait1</th>\n",
-       "      <th>rater3_trait2</th>\n",
-       "      <th>rater3_trait3</th>\n",
-       "      <th>rater3_trait4</th>\n",
-       "      <th>rater3_trait5</th>\n",
-       "      <th>rater3_trait6</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Dear local newspaper, I think effects computer...</td>\n",
-       "      <td>4</td>\n",
-       "      <td>4</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>8</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>2</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Dear @CAPS1 @CAPS2, I believe that using compu...</td>\n",
-       "      <td>5</td>\n",
-       "      <td>4</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>9</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>3</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...</td>\n",
-       "      <td>4</td>\n",
-       "      <td>3</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>7</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>4</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Dear Local Newspaper, @CAPS1 I have found that...</td>\n",
-       "      <td>5</td>\n",
-       "      <td>5</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>10</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>4 rows × 28 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   essay_id  essay_set                                              essay  \\\n",
-       "0         1          1  Dear local newspaper, I think effects computer...   \n",
-       "1         2          1  Dear @CAPS1 @CAPS2, I believe that using compu...   \n",
-       "2         3          1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...   \n",
-       "3         4          1  Dear Local Newspaper, @CAPS1 I have found that...   \n",
-       "\n",
-       "   rater1_domain1  rater2_domain1  rater3_domain1  domain1_score  \\\n",
-       "0               4               4             NaN              8   \n",
-       "1               5               4             NaN              9   \n",
-       "2               4               3             NaN              7   \n",
-       "3               5               5             NaN             10   \n",
-       "\n",
-       "   rater1_domain2  rater2_domain2  domain2_score      ...        \\\n",
-       "0             NaN             NaN            NaN      ...         \n",
-       "1             NaN             NaN            NaN      ...         \n",
-       "2             NaN             NaN            NaN      ...         \n",
-       "3             NaN             NaN            NaN      ...         \n",
-       "\n",
-       "   rater2_trait3  rater2_trait4  rater2_trait5  rater2_trait6  rater3_trait1  \\\n",
-       "0            NaN            NaN            NaN            NaN            NaN   \n",
-       "1            NaN            NaN            NaN            NaN            NaN   \n",
-       "2            NaN            NaN            NaN            NaN            NaN   \n",
-       "3            NaN            NaN            NaN            NaN            NaN   \n",
-       "\n",
-       "   rater3_trait2  rater3_trait3  rater3_trait4  rater3_trait5  rater3_trait6  \n",
-       "0            NaN            NaN            NaN            NaN            NaN  \n",
-       "1            NaN            NaN            NaN            NaN            NaN  \n",
-       "2            NaN            NaN            NaN            NaN            NaN  \n",
-       "3            NaN            NaN            NaN            NaN            NaN  \n",
-       "\n",
-       "[4 rows x 28 columns]"
-      ]
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
@@ -311,44 +137,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(12976, 28)"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "df_orig.shape"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(1783, 3)"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "# We filter the data of the essay_set number 1, and we keep only two columns for this \n",
    "# example\n",
@@ -359,83 +159,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>essay_id</th>\n",
-       "      <th>essay</th>\n",
-       "      <th>domain1_score</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>Dear local newspaper, I think effects computer...</td>\n",
-       "      <td>8</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>2</td>\n",
-       "      <td>Dear @CAPS1 @CAPS2, I believe that using compu...</td>\n",
-       "      <td>9</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>3</td>\n",
-       "      <td>Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...</td>\n",
-       "      <td>7</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>4</td>\n",
-       "      <td>Dear Local Newspaper, @CAPS1 I have found that...</td>\n",
-       "      <td>10</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>5</td>\n",
-       "      <td>Dear @LOCATION1, I know having computers has a...</td>\n",
-       "      <td>8</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   essay_id                                              essay  domain1_score\n",
-       "0         1  Dear local newspaper, I think effects computer...              8\n",
-       "1         2  Dear @CAPS1 @CAPS2, I believe that using compu...              9\n",
-       "2         3  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...              7\n",
-       "3         4  Dear Local Newspaper, @CAPS1 I have found that...             10\n",
-       "4         5  Dear @LOCATION1, I know having computers has a...              8"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "df[0:5]"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": null,
+   "metadata": {},
   "outputs": [],
   "source": [
    "# Define X and Y\n",
@@ -468,10 +202,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": null,
+   "metadata": {},
   "outputs": [],
   "source": [
    "# Generic Transformer \n",
@@ -509,10 +241,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": null,
+   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample of statistics using nltk\n",
@@ -541,10 +271,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": null,
+   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
@@ -581,10 +309,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": null,
+   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
@@ -635,10 +361,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": null,
+   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline, FeatureUnion\n",
@@ -674,23 +398,12 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Scores in every iteration [ 0.39798206  0.27497194]\n",
-      "Accuracy: 0.34 (+/- 0.12)\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import MultinomialNB\n",
-    "from sklearn.cross_validation import cross_val_score, KFold\n",
+    "from sklearn.model_selection import cross_val_score, KFold\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.feature_extraction import DictVectorizer\n",
    "from sklearn.preprocessing import FunctionTransformer\n",
@@ -726,7 +439,7 @@
    "\n",
    "# Using KFold validation\n",
    "\n",
-    "cv = KFold(X.shape[0], 2, shuffle=True, random_state=33)\n",
+    "cv = KFold(2, shuffle=True, random_state=33)\n",
    "scores = cross_val_score(pipeline, X, y, cv=cv)\n",
    "print(\"Scores in every iteration\", scores)\n",
    "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))"
@@ -734,9 +447,7 @@
  },
  {
   "cell_type": "markdown",
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "source": [
    "The result is not very good :(."
   ]
@@ -789,9 +500,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.5.1"
+   "version": "3.6.7"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
 }