Included installation of nltk

2026-03-27 21:38:16 +00:00 · 2017-04-20 12:56:39 +02:00
parent cb40531dc4
commit e88e144a50
5 changed files with 140 additions and 987 deletions
--- a/nlp/4_5_Semantic_Models.ipynb
+++ b/nlp/4_5_Semantic_Models.ipynb
@@ -76,22 +76,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(2034, 2807)"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "from sklearn.datasets import fetch_20newsgroups\n",
    "\n",
@@ -133,7 +122,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
@@ -163,7 +152,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
@@ -179,29 +168,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[(0,\n",
-       "  '0.004*objects + 0.004*obtained + 0.003*comets + 0.003*manhattan + 0.003*member + 0.003*beginning + 0.003*center + 0.003*groups + 0.003*aware + 0.003*increased'),\n",
-       " (1,\n",
-       "  '0.003*activity + 0.002*objects + 0.002*professional + 0.002*eyes + 0.002*manhattan + 0.002*pressure + 0.002*netters + 0.002*chosen + 0.002*attempted + 0.002*medical'),\n",
-       " (2,\n",
-       "  '0.003*mechanism + 0.003*led + 0.003*platform + 0.003*frank + 0.003*mormons + 0.003*aeronautics + 0.002*concepts + 0.002*header + 0.002*forces + 0.002*profit'),\n",
-       " (3,\n",
-       "  '0.005*diameter + 0.005*having + 0.004*complex + 0.004*conclusions + 0.004*activity + 0.004*looking + 0.004*action + 0.004*inflatable + 0.004*defined + 0.004*association')]"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# check the topics\n",
    "lda.print_topics(4)"
@@ -216,7 +187,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
@@ -250,19 +221,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Dictionary(10913 unique tokens: ['whose', 'used', 'hoc', 'transfinite', 'newtek']...)\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "# You can save the dictionary\n",
    "dictionary.save('newsgroup.dict')\n",
@@ -272,7 +235,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
@@ -285,7 +248,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
@@ -299,7 +262,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
@@ -313,19 +276,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Dictionary(10913 unique tokens: ['whose', 'used', 'hoc', 'transfinite', 'newtek']...)\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "# We can print the dictionary, it is a mappying of id and tokens\n",
    "\n",
@@ -334,7 +289,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
@@ -346,7 +301,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
@@ -361,19 +316,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[(0, 0.1598114653031772), (1, 0.10438175896914427), (2, 0.5700978153855775), (3, 0.24093628445650234), (4, 0.722808853369507), (5, 0.24093628445650234)]\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "#print tf-idf of first document\n",
    "print(corpus_tfidf[0])"
@@ -381,7 +328,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
@@ -396,29 +343,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[(0,\n",
-       "  '0.010*targa + 0.007*ns + 0.006*thanks + 0.006*davidian + 0.006*ssrt + 0.006*yayayay + 0.005*craig + 0.005*bull + 0.005*gerald + 0.005*sorry'),\n",
-       " (1,\n",
-       "  '0.011*god + 0.010*mary + 0.008*baptist + 0.008*islam + 0.006*zoroastrians + 0.006*joseph + 0.006*lucky + 0.006*khomeini + 0.006*samaritan + 0.005*crusades'),\n",
-       " (2,\n",
-       "  '0.007*koresh + 0.007*moon + 0.007*western + 0.006*plane + 0.006*jeff + 0.006*unix + 0.005*bible + 0.005*also + 0.005*basically + 0.005*bob'),\n",
-       " (3,\n",
-       "  '0.011*whatever + 0.008*joy + 0.007*happy + 0.006*virtual + 0.006*reality + 0.004*really + 0.003*samuel___ + 0.003*oh + 0.003*virtually + 0.003*toaster')]"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# check the topics\n",
    "lda_model.print_topics(4)"
@@ -426,19 +355,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[(0, 0.085176135689180726), (1, 0.6919655173835938), (2, 0.1377903468164027), (3, 0.0850680001108228)]\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "# check the lsa vector for the first document\n",
    "corpus_lda = lda_model[corpus_tfidf]\n",
@@ -447,19 +368,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[('lord', 1), ('god', 2)]\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "#predict topics of a new doc\n",
    "new_doc = \"God is love and God is the Lord\"\n",
@@ -470,19 +383,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[(0, 0.062509420435514051), (1, 0.81246608790618835), (2, 0.062508281488992554), (3, 0.062516210169305114)]\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "#transform into LDA space\n",
    "lda_vector = lda_model[bow_vector]\n",
@@ -491,19 +396,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0.011*god + 0.010*mary + 0.008*baptist + 0.008*islam + 0.006*zoroastrians + 0.006*joseph + 0.006*lucky + 0.006*khomeini + 0.006*samaritan + 0.005*crusades\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "# print the document's single most prominent LDA topic\n",
    "print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))"
@@ -511,20 +408,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[(0, 0.10392179866025079), (1, 0.68822094221870811), (2, 0.10391916429993264), (3, 0.10393809482110833)]\n",
-      "0.011*god + 0.010*mary + 0.008*baptist + 0.008*islam + 0.006*zoroastrians + 0.006*joseph + 0.006*lucky + 0.006*khomeini + 0.006*samaritan + 0.005*crusades\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "lda_vector_tfidf = lda_model[tfidf_model[bow_vector]]\n",
    "print(lda_vector_tfidf)\n",
@@ -541,7 +429,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
@@ -559,29 +447,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[(0,\n",
-       "  '0.769*\"god\" + 0.346*\"jesus\" + 0.235*\"bible\" + 0.204*\"christian\" + 0.149*\"christians\" + 0.107*\"christ\" + 0.090*\"well\" + 0.085*\"koresh\" + 0.081*\"kent\" + 0.080*\"christianity\"'),\n",
-       " (1,\n",
-       "  '-0.863*\"thanks\" + -0.255*\"please\" + -0.159*\"hello\" + -0.153*\"hi\" + 0.123*\"god\" + -0.112*\"sorry\" + -0.087*\"could\" + -0.074*\"windows\" + -0.067*\"jpeg\" + -0.063*\"vga\"'),\n",
-       " (2,\n",
-       "  '0.780*\"well\" + -0.229*\"god\" + 0.165*\"yes\" + -0.153*\"thanks\" + 0.133*\"ico\" + 0.133*\"tek\" + 0.130*\"bronx\" + 0.130*\"beauchaine\" + 0.130*\"queens\" + 0.129*\"manhattan\"'),\n",
-       " (3,\n",
-       "  '0.340*\"well\" + -0.335*\"ico\" + -0.334*\"tek\" + -0.328*\"beauchaine\" + -0.328*\"bronx\" + -0.328*\"queens\" + -0.326*\"manhattan\" + -0.305*\"bob\" + -0.305*\"com\" + -0.072*\"god\"')]"
-      ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# check the topics\n",
    "lsi_model.print_topics(4)"
@@ -589,19 +459,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[(0, 0.1598114653031772), (1, 0.10438175896914427), (2, 0.5700978153855775), (3, 0.24093628445650234), (4, 0.722808853369507), (5, 0.24093628445650234)]\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "# check the lsi vector for the first document\n",
    "print(corpus_tfidf[0])"
@@ -655,7 +517,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.5.1"
+   "version": "3.5.2"
  }
 },
 "nbformat": 4,