Included installation of nltk

2025-08-24 02:22:21 +00:00 · 2017-04-20 12:56:39 +02:00
parent cb40531dc4
commit e88e144a50
5 changed files with 140 additions and 987 deletions
--- a/nlp/4_3_Vector_Representation.ipynb
+++ b/nlp/4_3_Vector_Representation.ipynb
@@ -59,7 +59,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
@@ -109,27 +109,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
-       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
-       "        lowercase=True, max_df=1.0, max_features=5000, min_df=1,\n",
-       "        ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
-       "        strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
-       "        tokenizer=None, vocabulary=None)"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
@@ -152,23 +136,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<3x10 sparse matrix of type '<class 'numpy.int64'>'\n",
-       "\twith 15 stored elements in Compressed Sparse Row format>"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "vectors = vectorizer.fit_transform(documents)\n",
    "vectors"
@@ -186,22 +158,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[[0 1 1 2 0 0 1 2 0 0]\n",
-      " [1 0 0 0 2 0 0 1 2 1]\n",
-      " [1 0 0 0 2 1 0 0 1 1]]\n",
-      "['and', 'but', 'coming', 'is', 'like', 'sandwiches', 'short', 'summer', 'the', 'winter']\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "print(vectors.toarray())\n",
    "print(vectorizer.get_feature_names())"
@@ -219,32 +180,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['and',\n",
-       " 'but',\n",
-       " 'coming',\n",
-       " 'i',\n",
-       " 'is',\n",
-       " 'like',\n",
-       " 'sandwiches',\n",
-       " 'short',\n",
-       " 'summer',\n",
-       " 'the',\n",
-       " 'winter']"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "vectorizer = CountVectorizer(analyzer=\"word\", stop_words=None, token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
    "vectors = vectorizer.fit_transform(documents)\n",
@@ -260,22 +200,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
    "vectors = vectorizer.fit_transform(documents)\n",
@@ -284,19 +213,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "frozenset({'could', 'sixty', 'onto', 'by', 'against', 'up', 'a', 'everything', 'other', 'otherwise', 'ourselves', 'beside', 'nowhere', 'then', 'below', 'put', 'ten', 'such', 'cannot', 'either', 'due', 'hasnt', 'whereupon', 'were', 'once', 'at', 'for', 'front', 'get', 'whereas', 'that', 'eight', 'another', 'except', 'of', 'wherever', 'over', 'to', 'whom', 'you', 'former', 'behind', 'yours', 'yourself', 'what', 'even', 'however', 'go', 'less', 'bottom', 'may', 'along', 'is', 'can', 'move', 'eg', 'somewhere', 'latterly', 'seemed', 'thence', 'becoming', 'himself', 'whether', 'six', 'first', 'off', 'do', 'many', 'namely', 'never', 'because', 'mostly', 'nevertheless', 'thereupon', 'here', 'least', 'anyone', 'one', 'others', 'cry', 'they', 'thereby', 'ie', 'am', 'this', 'would', 'any', 'while', 'see', 'too', 'your', 'somehow', 'within', 'same', 'sometimes', 'thereafter', 'must', 'take', 're', 'both', 'fill', 'nor', 'sometime', 'he', 'third', 'more', 'also', 'most', 'during', 'much', 'our', 'thick', 'enough', 'full', 'toward', 'with', 'mill', 'anyhow', 'nobody', 'why', 'thru', 'although', 'nothing', 'meanwhile', 'or', 'some', 'ltd', 'wherein', 'thus', 'someone', 'whereby', 'who', 'un', 'are', 'hundred', 'whereafter', 'fire', 'twenty', 'only', 'several', 'among', 'no', 'than', 'before', 'been', 'else', 'find', 'fifteen', 'hence', 'ours', 'already', 'be', 'besides', 'next', 'interest', 'whither', 'whole', 'eleven', 'without', 'five', 'show', 'in', 'throughout', 'own', 'amongst', 'will', 'neither', 'everywhere', 'part', 'give', 'my', 'hers', 'his', 'upon', 'well', 'him', 'yourselves', 'whatever', 'cant', 'though', 'had', 'again', 'every', 'noone', 'top', 'which', 'de', 'almost', 'system', 'under', 'down', 'latter', 'above', 'whence', 'found', 'myself', 'three', 'those', 'become', 'moreover', 'but', 'anyway', 'beyond', 'from', 'now', 'as', 'seeming', 'con', 'themselves', 'hereupon', 'each', 'serious', 'two', 'across', 'out', 'the', 'therein', 'between', 'inc', 'where', 'anything', 'seem', 'co', 'therefore', 'whoever', 'herein', 'about', 'herself', 'should', 'anywhere', 'how', 'we', 'after', 'describe', 'being', 'etc', 'very', 'not', 'an', 'me', 'call', 'per', 'detail', 'still', 'around', 'hereby', 'sincere', 'their', 'has', 'became', 'beforehand', 'everyone', 'hereafter', 'made', 'ever', 'indeed', 'itself', 'something', 'afterwards', 'none', 'done', 'nine', 'alone', 'please', 'its', 'name', 'since', 'on', 'she', 'bill', 'have', 'mine', 'few', 'her', 'seems', 'always', 'side', 'forty', 'further', 'via', 'last', 'amount', 'towards', 'fify', 'through', 'whose', 'couldnt', 'perhaps', 'thin', 'until', 'becomes', 'elsewhere', 'and', 'i', 'them', 'together', 'us', 'was', 'when', 'rather', 'whenever', 'formerly', 'keep', 'so', 'back', 'there', 'amoungst', 'might', 'these', 'all', 'empty', 'often', 'into', 'it', 'twelve', 'yet', 'if', 'four'})\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "#stop words in scikit-learn for English\n",
    "print(vectorizer.get_stop_words())"
@@ -304,24 +225,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[1, 0, 0, 1, 2, 0],\n",
-       "       [0, 2, 0, 0, 1, 1],\n",
-       "       [0, 2, 1, 0, 0, 1]], dtype=int64)"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Vectors\n",
    "f_array = vectors.toarray()\n",
@@ -337,19 +245,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0.666666666667 1.0 0.166666666667\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "from scipy.spatial.distance import cosine\n",
    "d12 = cosine(f_array[0], f_array[1])\n",
@@ -374,22 +274,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', binary=True) \n",
    "vectors = vectorizer.fit_transform(documents)\n",
@@ -398,24 +287,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[1, 0, 0, 1, 1, 0],\n",
-       "       [0, 1, 0, 0, 1, 1],\n",
-       "       [0, 1, 1, 0, 0, 1]], dtype=int64)"
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "vectors.toarray()"
   ]
@@ -436,29 +312,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['coming summer',\n",
-       " 'like sandwiches',\n",
-       " 'like summer',\n",
-       " 'like winter',\n",
-       " 'sandwiches like',\n",
-       " 'summer coming',\n",
-       " 'summer like',\n",
-       " 'summer short']"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', ngram_range=[2,2]) \n",
    "vectors = vectorizer.fit_transform(documents)\n",
@@ -467,24 +325,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[1, 0, 0, 0, 0, 1, 0, 1],\n",
-       "       [0, 0, 1, 1, 0, 0, 1, 0],\n",
-       "       [0, 1, 0, 1, 1, 0, 0, 0]], dtype=int64)"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "vectors.toarray()"
   ]
@@ -505,22 +350,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
@@ -531,27 +365,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[ 0.48148213,  0.        ,  0.        ,  0.48148213,  0.73235914,\n",
-       "         0.        ],\n",
-       "       [ 0.        ,  0.81649658,  0.        ,  0.        ,  0.40824829,\n",
-       "         0.40824829],\n",
-       "       [ 0.        ,  0.77100584,  0.50689001,  0.        ,  0.        ,\n",
-       "         0.38550292]])"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "vectors.toarray()"
   ]
@@ -565,22 +383,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
-      ]
-     },
-     "execution_count": 30,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "train = [doc1, doc2, doc3]\n",
    "vectorizer = TfidfVectorizer(analyzer=\"word\", stop_words='english')\n",
@@ -592,28 +399,12 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[ 0.48148213,  0.        ,  0.        ,  0.48148213,  0.73235914,\n",
-       "         0.        ],\n",
-       "       [ 0.        ,  0.81649658,  0.        ,  0.        ,  0.40824829,\n",
-       "         0.40824829],\n",
-       "       [ 0.        ,  0.77100584,  0.50689001,  0.        ,  0.        ,\n",
-       "         0.38550292]])"
-      ]
-     },
-     "execution_count": 31,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "vectors.toarray()"
   ]
@@ -627,22 +418,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[ 0.38324078,  0.24713249,  0.23336362]])"
-      ]
-     },
-     "execution_count": 33,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
@@ -664,22 +444,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([ 0.38324078,  0.24713249,  0.23336362])"
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "from sklearn.metrics.pairwise import linear_kernel\n",
    "cosine_similarity = linear_kernel(vector_query, vectors).flatten()\n",
@@ -734,7 +503,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.5.1"
+   "version": "3.5.2"
  }
 },
 "nbformat": 4,