mirror of https://github.com/gsi-upm/sitc

Remove outputs and metadata

This commit is contained in:
J. Fernando Sánchez
2019-02-28 15:30:33 +01:00
parent a1be167cc0
commit c1d3ca38ea
25 changed files with 989 additions and 14268 deletions

View File

@@ -68,9 +68,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"review = \"\"\"I purchased this monitor because of budgetary concerns. This item was the most inexpensive 17 inch monitor \n",
@@ -111,9 +109,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
@@ -171,9 +167,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from nltk.tokenize import sent_tokenize, word_tokenize\n",
@@ -199,10 +193,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"metadata": {},
"outputs": [],
"source": [
"words = [word_tokenize(t) for t in sent_tokenize(review)]\n",
@@ -219,9 +210,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"words = word_tokenize(review)\n",
@@ -239,9 +228,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from nltk.tokenize import TweetTokenizer\n",
@@ -268,9 +255,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer\n",
@@ -304,9 +289,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"metadata": {},
"source": [
"As we can see, we get the forms *are* and *is* instead of *be*. This is because we have not introduce the Part-Of-Speech (POS), and the default POS is 'n' (name).\n",
"\n",
@@ -316,9 +299,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"verbs = \"are crying is have has\"\n",
@@ -327,9 +308,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"metadata": {},
"source": [
"Depending of the application, we can select stemmers or lemmatizers. \n",
"\n",
@@ -341,9 +320,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"def preprocess(words, type='doc'):\n",
@@ -376,9 +353,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
@@ -390,9 +365,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"def preprocess(words, type='doc'):\n",
@@ -428,9 +401,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"import string\n",
@@ -474,9 +445,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"frec = nltk.FreqDist(nltk.word_tokenize(review))\n",

View File

@@ -62,9 +62,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"review = \"\"\"I purchased this Dell monitor because of budgetary concerns. This item was the most inexpensive 17 inch Apple monitor \n",
@@ -110,9 +108,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from nltk import pos_tag, word_tokenize\n",
@@ -129,9 +125,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"print (pos_tag(word_tokenize(review)))"
@@ -147,9 +141,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
@@ -166,9 +158,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from nltk.stem import WordNetLemmatizer\n",
@@ -199,9 +189,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from nltk import ne_chunk, pos_tag, word_tokenize\n",
@@ -246,9 +234,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from nltk.app import srparser_app\n",
@@ -265,9 +251,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"from nltk.app import rdparser_app\n",
@@ -288,9 +272,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from nltk.chunk.regexp import *\n",
@@ -316,9 +298,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"def extractTrees(parsed_tree, category='NP'):\n",
@@ -330,9 +310,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"def extractStrings(parsed_tree, category='NP'):\n",

View File

@@ -60,9 +60,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"doc1 = 'Summer is coming but Summer is short'\n",
@@ -73,9 +71,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"metadata": {},
"source": [
"# Tools"
]
@@ -110,9 +106,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
@@ -123,9 +117,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"metadata": {},
"source": [
"As we can see, [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer) comes with many options. We can define many configuration options, such as the maximum or minimum frequency of a term (*min_fd*, *max_df*), maximum number of features (*max_features*), if we analyze words or characters (*analyzer*), or if the output is binary or not (*binary*). *CountVectorizer* also allows us to include if we want to preprocess the input (*preprocessor*) before tokenizing it (*tokenizer*) and exclude stop words (*stop_words*).\n",
"\n",
@@ -137,9 +129,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"vectors = vectorizer.fit_transform(documents)\n",
@@ -148,9 +138,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"metadata": {},
"source": [
"We see the vectors are stored as a sparse matrix of 3x6 dimensions.\n",
"We can print the matrix as well as the feature names."
@@ -159,9 +147,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"print(vectors.toarray())\n",
@@ -170,9 +156,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"metadata": {},
"source": [
"As you can see, the pronoun 'I' has been removed because of the default token_pattern. \n",
"We can change this as follows."
@@ -181,9 +165,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words=None, token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
@@ -201,9 +183,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
@@ -214,9 +194,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"#stop words in scikit-learn for English\n",
@@ -226,9 +204,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"# Vectors\n",
@@ -246,9 +222,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from scipy.spatial.distance import cosine\n",
@@ -275,9 +249,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', binary=True) \n",
@@ -288,9 +260,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"vectors.toarray()"
@@ -313,9 +283,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', ngram_range=[2,2]) \n",
@@ -326,9 +294,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"vectors.toarray()"
@@ -351,9 +317,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
@@ -366,9 +330,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"vectors.toarray()"
@@ -384,9 +346,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"train = [doc1, doc2, doc3]\n",
@@ -400,10 +360,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"metadata": {},
"outputs": [],
"source": [
"vectors.toarray()"
@@ -419,9 +376,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics.pairwise import cosine_similarity\n",
@@ -445,9 +400,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics.pairwise import linear_kernel\n",

View File

@@ -74,19 +74,9 @@
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"\n",
@@ -100,19 +90,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Number of categories\n",
"print(len(newsgroups_train.target_names))"
@@ -120,28 +100,9 @@
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Category id 4 comp.sys.mac.hardware\n",
"Doc A fair number of brave souls who upgraded their SI clock oscillator have\n",
"shared their experiences for this poll. Please send a brief message detailing\n",
"your experiences with the procedure. Top speed attained, CPU rated speed,\n",
"add on cards and adapters, heat sinks, hour of usage per day, floppy disk\n",
"functionality with 800 and 1.4 m floppies are especially requested.\n",
"\n",
"I will be summarizing in the next two days, so please add to the network\n",
"knowledge base if you have done the clock upgrade and haven't answered this\n",
"poll. Thanks.\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show a document\n",
"docid = 1\n",
@@ -154,22 +115,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(11314,)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Number of files\n",
"newsgroups_train.filenames.shape"
@@ -177,30 +125,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n",
" VisibleDeprecationWarning)\n"
]
},
{
"data": {
"text/plain": [
"(11314, 101323)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Obtain a vector\n",
"\n",
@@ -214,22 +141,9 @@
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"66.80510871486653"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The tf-idf vectors are very sparse with an average of 66 non zero components in 101.323 dimensions (.06%)\n",
"vectors_train.nnz / float(vectors_train.shape[0])"
@@ -251,30 +165,9 @@
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n",
" VisibleDeprecationWarning)\n"
]
},
{
"data": {
"text/plain": [
"0.69545360719001303"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"\n",
@@ -302,20 +195,9 @@
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dimensionality: 101323\n",
"density: 1.000000\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.utils.extmath import density\n",
"\n",
@@ -325,38 +207,9 @@
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"alt.atheism: islam atheists say just religion atheism think don people god\n",
"comp.graphics: looking format 3d know program file files thanks image graphics\n",
"comp.os.ms-windows.misc: card problem thanks driver drivers use files dos file windows\n",
"comp.sys.ibm.pc.hardware: monitor disk thanks pc ide controller bus card scsi drive\n",
"comp.sys.mac.hardware: know monitor does quadra simms thanks problem drive apple mac\n",
"comp.windows.x: using windows x11r5 use application thanks widget server motif window\n",
"misc.forsale: asking email sell price condition new shipping offer 00 sale\n",
"rec.autos: don ford new good dealer just engine like cars car\n",
"rec.motorcycles: don just helmet riding like motorcycle ride bikes dod bike\n",
"rec.sport.baseball: braves players pitching hit runs games game baseball team year\n",
"rec.sport.hockey: league year nhl games season players play hockey team game\n",
"sci.crypt: people use escrow nsa keys government chip clipper encryption key\n",
"sci.electronics: don thanks voltage used know does like circuit power use\n",
"sci.med: skepticism cadre dsl banks chastity n3jxp pitt gordon geb msg\n",
"sci.space: just lunar earth shuttle like moon launch orbit nasa space\n",
"soc.religion.christian: believe faith christian christ bible people christians church jesus god\n",
"talk.politics.guns: just law firearms government fbi don weapons people guns gun\n",
"talk.politics.mideast: said arabs arab turkish people armenians armenian jews israeli israel\n",
"talk.politics.misc: know state clinton president just think tax don government people\n",
"talk.religion.misc: think don koresh objective christians bible people christian jesus god\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We can review the top features per topic in Bayes (attribute coef_)\n",
"import numpy as np\n",
@@ -373,28 +226,9 @@
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 2 15]\n",
"['comp.os.ms-windows.misc', 'soc.religion.christian']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n",
" VisibleDeprecationWarning)\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We try the classifier in two new docs\n",
"\n",

View File

@@ -77,9 +77,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
@@ -123,9 +121,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from gensim import matutils\n",
@@ -152,10 +148,8 @@
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from gensim.models.ldamodel import LdaModel\n",
@@ -169,9 +163,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"# check the topics\n",
@@ -188,9 +180,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"# import the gensim.corpora module to generate dictionary\n",
@@ -222,9 +212,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"# You can save the dictionary\n",
@@ -236,9 +224,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"# Generate a list of docs, where each doc is a list of words\n",
@@ -249,9 +235,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"# import the gensim.corpora module to generate dictionary\n",
@@ -263,9 +247,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"# You can optionally save the dictionary \n",
@@ -277,9 +259,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"# We can print the dictionary, it is a mappying of id and tokens\n",
@@ -290,9 +270,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"# construct the corpus representing each document as a bag-of-words (bow) vector\n",
@@ -302,9 +280,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from gensim.models import TfidfModel\n",
@@ -317,9 +293,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"#print tf-idf of first document\n",
@@ -329,9 +303,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from gensim.models.ldamodel import LdaModel\n",
@@ -344,9 +316,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"# check the topics\n",
@@ -356,9 +326,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"# check the lsa vector for the first document\n",
@@ -369,9 +337,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"#predict topics of a new doc\n",
@@ -384,9 +350,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"#transform into LDA space\n",
@@ -397,9 +361,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"# print the document's single most prominent LDA topic\n",
@@ -409,9 +371,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"lda_vector_tfidf = lda_model[tfidf_model[bow_vector]]\n",
@@ -430,9 +390,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"from gensim.models.lsimodel import LsiModel\n",
@@ -448,9 +406,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"# check the topics\n",
@@ -460,9 +416,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"# check the lsi vector for the first document\n",

View File

@@ -123,183 +123,9 @@
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>essay_id</th>\n",
" <th>essay_set</th>\n",
" <th>essay</th>\n",
" <th>rater1_domain1</th>\n",
" <th>rater2_domain1</th>\n",
" <th>rater3_domain1</th>\n",
" <th>domain1_score</th>\n",
" <th>rater1_domain2</th>\n",
" <th>rater2_domain2</th>\n",
" <th>domain2_score</th>\n",
" <th>...</th>\n",
" <th>rater2_trait3</th>\n",
" <th>rater2_trait4</th>\n",
" <th>rater2_trait5</th>\n",
" <th>rater2_trait6</th>\n",
" <th>rater3_trait1</th>\n",
" <th>rater3_trait2</th>\n",
" <th>rater3_trait3</th>\n",
" <th>rater3_trait4</th>\n",
" <th>rater3_trait5</th>\n",
" <th>rater3_trait6</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Dear local newspaper, I think effects computer...</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>8</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>Dear @CAPS1 @CAPS2, I believe that using compu...</td>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>9</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>7</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>Dear Local Newspaper, @CAPS1 I have found that...</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>NaN</td>\n",
" <td>10</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4 rows × 28 columns</p>\n",
"</div>"
],
"text/plain": [
" essay_id essay_set essay \\\n",
"0 1 1 Dear local newspaper, I think effects computer... \n",
"1 2 1 Dear @CAPS1 @CAPS2, I believe that using compu... \n",
"2 3 1 Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl... \n",
"3 4 1 Dear Local Newspaper, @CAPS1 I have found that... \n",
"\n",
" rater1_domain1 rater2_domain1 rater3_domain1 domain1_score \\\n",
"0 4 4 NaN 8 \n",
"1 5 4 NaN 9 \n",
"2 4 3 NaN 7 \n",
"3 5 5 NaN 10 \n",
"\n",
" rater1_domain2 rater2_domain2 domain2_score ... \\\n",
"0 NaN NaN NaN ... \n",
"1 NaN NaN NaN ... \n",
"2 NaN NaN NaN ... \n",
"3 NaN NaN NaN ... \n",
"\n",
" rater2_trait3 rater2_trait4 rater2_trait5 rater2_trait6 rater3_trait1 \\\n",
"0 NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN NaN \n",
"\n",
" rater3_trait2 rater3_trait3 rater3_trait4 rater3_trait5 rater3_trait6 \n",
"0 NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN NaN \n",
"\n",
"[4 rows x 28 columns]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
@@ -311,44 +137,18 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(12976, 28)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_orig.shape"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(1783, 3)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We filter the data of the essay_set number 1, and we keep only two columns for this \n",
"# example\n",
@@ -359,83 +159,17 @@
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>essay_id</th>\n",
" <th>essay</th>\n",
" <th>domain1_score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Dear local newspaper, I think effects computer...</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Dear @CAPS1 @CAPS2, I believe that using compu...</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Dear Local Newspaper, @CAPS1 I have found that...</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Dear @LOCATION1, I know having computers has a...</td>\n",
" <td>8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" essay_id essay domain1_score\n",
"0 1 Dear local newspaper, I think effects computer... 8\n",
"1 2 Dear @CAPS1 @CAPS2, I believe that using compu... 9\n",
"2 3 Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl... 7\n",
"3 4 Dear Local Newspaper, @CAPS1 I have found that... 10\n",
"4 5 Dear @LOCATION1, I know having computers has a... 8"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[0:5]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Define X and Y\n",
@@ -468,10 +202,8 @@
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Generic Transformer \n",
@@ -509,10 +241,8 @@
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sample of statistics using nltk\n",
@@ -541,10 +271,8 @@
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
@@ -581,10 +309,8 @@
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
@@ -635,10 +361,8 @@
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.pipeline import Pipeline, FeatureUnion\n",
@@ -674,23 +398,12 @@
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Scores in every iteration [ 0.39798206 0.27497194]\n",
"Accuracy: 0.34 (+/- 0.12)\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.cross_validation import cross_val_score, KFold\n",
"from sklearn.model_selection import cross_val_score, KFold\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.feature_extraction import DictVectorizer\n",
"from sklearn.preprocessing import FunctionTransformer\n",
@@ -726,7 +439,7 @@
"\n",
"# Using KFold validation\n",
"\n",
"cv = KFold(X.shape[0], 2, shuffle=True, random_state=33)\n",
"cv = KFold(2, shuffle=True, random_state=33)\n",
"scores = cross_val_score(pipeline, X, y, cv=cv)\n",
"print(\"Scores in every iteration\", scores)\n",
"print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))"
@@ -734,9 +447,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"metadata": {},
"source": [
"The result is not very good :(."
]
@@ -789,9 +500,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 0
"nbformat_minor": 1
}
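The last hunks replace the deprecated sklearn.cross_validation import with sklearn.model_selection and drop the sample count from the KFold constructor; a small hedged sketch of that updated API on a toy pipeline (the data and the plain tf-idf pipeline are illustrative, not the notebook's full FeatureUnion):

from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

X = ['an essay about computers', 'another essay', 'a third essay', 'one more']
y = [8, 9, 7, 10]   # toy scores standing in for domain1_score

pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

# New-style KFold: n_splits is passed directly, not the number of samples
cv = KFold(n_splits=2, shuffle=True, random_state=33)
scores = cross_val_score(pipeline, X, y, cv=cv)
print('Scores in every iteration', scores)
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))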