Updated 4_4 to use get_feature_names_out() instead of get_feature_names

Updated 4_4 - using feature_log_prob_ instead of coef_ (deprecated)
2025-06-13 11:42:21 +00:00 · 2023-04-23 16:41:53 +02:00 · 2023-04-23 16:37:48 +02:00
3 changed files with 305 additions and 101 deletions
--- a/nlp/4_3_Vector_Representation.ipynb
+++ b/nlp/4_3_Vector_Representation.ipynb
@ -105,9 +105,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>CountVectorizer(max_features=5000)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">CountVectorizer</label><div class=\"sk-toggleable__content\"><pre>CountVectorizer(max_features=5000)</pre></div></div></div></div></div>"
+      ],
+      "text/plain": [
+       "CountVectorizer(max_features=5000)"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
@ -128,9 +142,21 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<3x10 sparse matrix of type '<class 'numpy.int64'>'\n",
+       "\twith 15 stored elements in Compressed Sparse Row format>"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "vectors = vectorizer.fit_transform(documents)\n",
    "vectors"
@ -146,12 +172,24 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[0 1 1 2 0 0 1 2 0 0]\n",
+      " [1 0 0 0 2 0 0 1 2 1]\n",
+      " [1 0 0 0 2 1 0 0 1 1]]\n",
+      "['and' 'but' 'coming' 'is' 'like' 'sandwiches' 'short' 'summer' 'the'\n",
+      " 'winter']\n"
+     ]
+    }
+   ],
   "source": [
    "print(vectors.toarray())\n",
-    "print(vectorizer.get_feature_names())"
+    "print(vectorizer.get_feature_names_out())"
   ]
  },
  {
@ -164,13 +202,25 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['and', 'but', 'coming', 'i', 'is', 'like', 'sandwiches', 'short',\n",
+       "       'summer', 'the', 'winter'], dtype=object)"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "vectorizer = CountVectorizer(analyzer=\"word\", stop_words=None, token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
    "vectors = vectorizer.fit_transform(documents)\n",
-    "vectorizer.get_feature_names()"
+    "vectorizer.get_feature_names_out()"
   ]
  },
  {
@ -182,20 +232,47 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/cif/anaconda3/lib/python3.10/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n",
+      "  warnings.warn(msg, category=FutureWarning)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
    "vectors = vectorizer.fit_transform(documents)\n",
-    "vectorizer.get_feature_names()"
+    "vectorizer.get_feature_names_out()"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "frozenset({'or', 'be', 'least', 'ours', 'very', 'noone', 'more', 'can', 'front', 'last', 'co', 'where', 'beyond', 'you', 'was', 'to', 'nine', 'here', 'describe', 'than', 'rather', 'therefore', 'except', 'at', 'again', 'ourselves', 'most', 'anyway', 'thick', 'whither', 'thereupon', 'someone', 'hereupon', 'besides', 'among', 'hasnt', 'across', 'namely', 'because', 'is', 'out', 'same', 'yourself', 'somehow', 'sincere', 'con', 'hereby', 'towards', 'interest', 'much', 'up', 'why', 'myself', 'all', 'nobody', 'though', 'every', 'show', 'not', 'there', 'whether', 'still', 'name', 'when', 'the', 'each', 'six', 'nor', 'and', 'under', 'thereby', 'less', 'either', 'thence', 'into', 'seemed', 'something', 'four', 'sometimes', 'himself', 'those', 'nowhere', 'almost', 'are', 'empty', 'must', 'while', 'afterwards', 'perhaps', 'from', 'detail', 'through', 'any', 'have', 'may', 'he', 'anywhere', 'alone', 'without', 'beforehand', 'had', 'too', 'yourselves', 'our', 'see', 'how', 'please', 'what', 'am', 'do', 'it', 'serious', 'yet', 'down', 'top', 'amount', 'then', 'both', 'fire', 'been', 'wherein', 'done', 'etc', 'whose', 'whereafter', 'who', 'ltd', 'meanwhile', 'further', 'few', 'first', 'behind', 'made', 'yours', 'until', 'toward', 'amoungst', 'anyhow', 'we', 'with', 'give', 'go', 'no', 'back', 'else', 'becomes', 'your', 'fill', 'together', 'another', 'throughout', 'onto', 'de', 'me', 'ten', 'system', 'became', 'per', 'therein', 'everyone', 'often', 'ie', 'put', 'hers', 'herself', 'nevertheless', 'itself', 'eg', 'herein', 'his', 'this', 'cry', 'due', 'bill', 'one', 'on', 'being', 'themselves', 'of', 'some', 'their', 'neither', 'elsewhere', 'since', 'whole', 'eight', 'i', 'a', 'whoever', 'own', 'call', 'them', 'mostly', 'she', 'my', 'cannot', 'us', 'never', 'as', 'thin', 'upon', 'cant', 'un', 'before', 'her', 'otherwise', 'full', 'these', 'next', 'they', 'side', 'somewhere', 'fifty', 'hence', 'so', 'along', 'already', 'three', 'latter', 'anything', 'whom', 'could', 'indeed', 'nothing', 'whereby', 'which', 'sometime', 'become', 'ever', 'amongst', 'by', 'in', 'five', 'after', 'mine', 'fifteen', 'wherever', 'found', 'thereafter', 'third', 'keep', 'anyone', 'will', 'bottom', 'off', 'seem', 'none', 'an', 'whatever', 'over', 'during', 'also', 'latterly', 'via', 'take', 'former', 'above', 'now', 'becoming', 'hereafter', 'such', 'two', 'only', 'about', 'sixty', 're', 'everything', 'others', 'hundred', 'twelve', 'thus', 'even', 'well', 'always', 'once', 'beside', 'get', 'mill', 'seems', 'if', 'whereupon', 'find', 'forty', 'inc', 'whenever', 'around', 'other', 'should', 'many', 'enough', 'however', 'move', 'against', 'several', 'everywhere', 'has', 'whereas', 'that', 'whence', 'eleven', 'its', 'within', 'twenty', 'part', 'although', 'thru', 'couldnt', 'moreover', 'him', 'formerly', 'might', 'seeming', 'but', 'below', 'would', 'between', 'were', 'for'})\n"
+     ]
+    }
+   ],
   "source": [
    "#stop words in scikit-learn for English\n",
    "print(vectorizer.get_stop_words())"
@ -442,7 +519,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -456,7 +533,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.1"
+   "version": "3.10.10"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
--- a/nlp/4_4_Classification.ipynb
+++ b/nlp/4_4_Classification.ipynb
@ -74,9 +74,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n"
+     ]
+    }
+   ],
   "source": [
    "from sklearn.datasets import fetch_20newsgroups\n",
    "\n",
@ -90,9 +98,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "20\n"
+     ]
+    }
+   ],
   "source": [
    "#Number of categories\n",
    "print(len(newsgroups_train.target_names))"
@ -100,9 +116,26 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Category id 4 comp.sys.mac.hardware\n",
+      "Doc A fair number of brave souls who upgraded their SI clock oscillator have\n",
+      "shared their experiences for this poll. Please send a brief message detailing\n",
+      "your experiences with the procedure. Top speed attained, CPU rated speed,\n",
+      "add on cards and adapters, heat sinks, hour of usage per day, floppy disk\n",
+      "functionality with 800 and 1.4 m floppies are especially requested.\n",
+      "\n",
+      "I will be summarizing in the next two days, so please add to the network\n",
+      "knowledge base if you have done the clock upgrade and haven't answered this\n",
+      "poll. Thanks.\n"
+     ]
+    }
+   ],
   "source": [
    "# Show a document\n",
    "docid = 1\n",
@ -115,9 +148,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(11314,)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "#Number of files\n",
    "newsgroups_train.filenames.shape"
@ -125,9 +169,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(11314, 101322)"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "# Obtain a vector\n",
    "\n",
@ -141,9 +196,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "66.802987449178"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "# The tf-idf vectors are very sparse with an average of 66 non zero components in 101.323 dimensions (.06%)\n",
    "vectors_train.nnz / float(vectors_train.shape[0])"
@ -165,9 +231,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.695453607190013"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
@ -195,29 +272,44 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "alt.atheism: islam atheists say just religion atheism think don people god\n",
+      "comp.graphics: looking format 3d know program file files thanks image graphics\n",
+      "comp.os.ms-windows.misc: card problem thanks driver drivers use files dos file windows\n",
+      "comp.sys.ibm.pc.hardware: monitor disk thanks pc ide controller bus card scsi drive\n",
+      "comp.sys.mac.hardware: know monitor does quadra simms thanks problem drive apple mac\n",
+      "comp.windows.x: using windows x11r5 use application thanks widget server motif window\n",
+      "misc.forsale: asking email sell price condition new shipping offer 00 sale\n",
+      "rec.autos: don ford new good dealer just engine like cars car\n",
+      "rec.motorcycles: don just helmet riding like motorcycle ride bikes dod bike\n",
+      "rec.sport.baseball: braves players pitching hit runs games game baseball team year\n",
+      "rec.sport.hockey: league year nhl games season players play hockey team game\n",
+      "sci.crypt: people use escrow nsa keys government chip clipper encryption key\n",
+      "sci.electronics: don thanks voltage used know does like circuit power use\n",
+      "sci.med: skepticism cadre dsl banks chastity n3jxp pitt gordon geb msg\n",
+      "sci.space: just lunar earth shuttle like moon launch orbit nasa space\n",
+      "soc.religion.christian: believe faith christian christ bible people christians church jesus god\n",
+      "talk.politics.guns: just law firearms government fbi don weapons people guns gun\n",
+      "talk.politics.mideast: said arabs arab turkish people armenians armenian jews israeli israel\n",
+      "talk.politics.misc: know state clinton president just think tax don government people\n",
+      "talk.religion.misc: think don koresh objective christians bible people christian jesus god\n"
+     ]
+    }
+   ],
   "source": [
-    "from sklearn.utils.extmath import density\n",
-    "\n",
-    "print(\"dimensionality: %d\" % model.coef_.shape[1])\n",
-    "print(\"density: %f\" % density(model.coef_))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# We can review the top features per topic in Bayes (attribute coef_)\n",
+    "# We can review the top features per topic in Bayes (attribute feature_log_prob_)\n",
    "import numpy as np\n",
    "\n",
    "def show_top10(classifier, vectorizer, categories):\n",
-    "    feature_names = np.asarray(vectorizer.get_feature_names())\n",
+    "    feature_names = np.asarray(vectorizer.get_feature_names_out())\n",
    "    for i, category in enumerate(categories):\n",
-    "        top10 = np.argsort(classifier.coef_[i])[-10:]\n",
+    "        top10 = np.argsort(classifier.feature_log_prob_[i, :])[-10:]\n",
    "        print(\"%s: %s\" % (category, \" \".join(feature_names[top10])))\n",
    "\n",
    "        \n",
@ -226,9 +318,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[ 2 15]\n",
+      "['comp.os.ms-windows.misc', 'soc.religion.christian']\n"
+     ]
+    }
+   ],
   "source": [
    "# We try the classifier in two new docs\n",
    "\n",
@ -275,7 +376,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -289,7 +390,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.1"
+   "version": "3.10.10"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
--- a/nlp/4_5_Semantic_Models.ipynb
+++ b/nlp/4_5_Semantic_Models.ipynb
@ -76,7 +76,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
@ -85,7 +85,7 @@
       "(2034, 2807)"
      ]
     },
-     "execution_count": 33,
+     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -134,15 +134,41 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: gensim in /home/cif/anaconda3/lib/python3.10/site-packages (4.3.1)\n",
+      "Requirement already satisfied: scipy>=1.7.0 in /home/cif/anaconda3/lib/python3.10/site-packages (from gensim) (1.10.1)\n",
+      "Requirement already satisfied: smart-open>=1.8.1 in /home/cif/anaconda3/lib/python3.10/site-packages (from gensim) (6.3.0)\n",
+      "Requirement already satisfied: numpy>=1.18.5 in /home/cif/anaconda3/lib/python3.10/site-packages (from gensim) (1.24.2)\n",
+      "Note: you may need to restart the kernel to use updated packages.\n",
+      "Requirement already satisfied: python-Levenshtein in /home/cif/anaconda3/lib/python3.10/site-packages (0.21.0)\n",
+      "Requirement already satisfied: Levenshtein==0.21.0 in /home/cif/anaconda3/lib/python3.10/site-packages (from python-Levenshtein) (0.21.0)\n",
+      "Requirement already satisfied: rapidfuzz<4.0.0,>=2.3.0 in /home/cif/anaconda3/lib/python3.10/site-packages (from Levenshtein==0.21.0->python-Levenshtein) (3.0.0)\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install gensim\n",
+    "%pip install python-Levenshtein"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim import matutils\n",
    "\n",
-    "vocab = vectorizer.get_feature_names()\n",
+    "vocab = vectorizer.get_feature_names_out()\n",
    "\n",
-    "dictionary = dict([(i, s) for i, s in enumerate(vectorizer.get_feature_names())])\n",
+    "dictionary = dict([(i, s) for i, s in enumerate(vectorizer.get_feature_names_out())])\n",
    "corpus_tfidf = matutils.Sparse2Corpus(vectors_train)"
   ]
  },
@ -162,7 +188,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@ -176,23 +202,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 61,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0,\n",
-       "  '0.011*\"baptist\" + 0.010*\"koresh\" + 0.009*\"bible\" + 0.006*\"reality\" + 0.006*\"virtual\" + 0.005*\"scarlet\" + 0.005*\"shag\" + 0.004*\"tootsie\" + 0.004*\"kinda\" + 0.004*\"captain\"'),\n",
+       "  '0.004*\"central\" + 0.004*\"assumptions\" + 0.004*\"matthew\" + 0.004*\"define\" + 0.004*\"holes\" + 0.003*\"killing\" + 0.003*\"item\" + 0.003*\"curious\" + 0.003*\"going\" + 0.003*\"presentations\"'),\n",
       " (1,\n",
-       "  '0.010*\"targa\" + 0.008*\"thanks\" + 0.008*\"moon\" + 0.007*\"craig\" + 0.007*\"zoroastrians\" + 0.006*\"yayayay\" + 0.005*\"unfortunately\" + 0.005*\"windows\" + 0.005*\"rayshade\" + 0.004*\"tdb\"'),\n",
+       "  '0.002*\"mechanism\" + 0.002*\"led\" + 0.002*\"apple\" + 0.002*\"color\" + 0.002*\"mormons\" + 0.002*\"activity\" + 0.002*\"concepts\" + 0.002*\"frank\" + 0.002*\"platform\" + 0.002*\"fault\"'),\n",
       " (2,\n",
-       "  '0.009*\"mary\" + 0.007*\"whatever\" + 0.006*\"god\" + 0.005*\"ns\" + 0.005*\"lucky\" + 0.005*\"joseph\" + 0.005*\"ssrt\" + 0.005*\"samaritan\" + 0.005*\"crusades\" + 0.004*\"phobos\"'),\n",
+       "  '0.005*\"objects\" + 0.005*\"obtained\" + 0.003*\"manhattan\" + 0.003*\"capability\" + 0.003*\"education\" + 0.003*\"men\" + 0.003*\"photo\" + 0.003*\"decent\" + 0.003*\"environmental\" + 0.003*\"pain\"'),\n",
       " (3,\n",
-       "  '0.009*\"islam\" + 0.008*\"western\" + 0.008*\"plane\" + 0.008*\"jeff\" + 0.007*\"cheers\" + 0.007*\"kent\" + 0.007*\"joy\" + 0.007*\"khomeini\" + 0.007*\"davidian\" + 0.006*\"basically\"')]"
+       "  '0.004*\"car\" + 0.004*\"contain\" + 0.004*\"groups\" + 0.004*\"center\" + 0.004*\"evil\" + 0.004*\"maintain\" + 0.004*\"comets\" + 0.004*\"88\" + 0.004*\"density\" + 0.003*\"company\"')]"
      ]
     },
-     "execution_count": 61,
+     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -211,7 +237,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@ -243,14 +269,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Dictionary(10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...)\n"
+      "Dictionary<10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...>\n"
     ]
    }
   ],
@ -263,7 +289,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 64,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
@ -274,7 +300,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 65,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
@ -286,14 +312,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Dictionary(10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...)\n"
+      "Dictionary<10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...>\n"
     ]
    }
   ],
@ -305,7 +331,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 72,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
@ -315,7 +341,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
@ -328,7 +354,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 74,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
@ -346,7 +372,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
@ -359,23 +385,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0,\n",
-       "  '0.009*\"whatever\" + 0.007*\"plane\" + 0.007*\"ns\" + 0.007*\"joy\" + 0.006*\"happy\" + 0.005*\"bob\" + 0.004*\"phil\" + 0.004*\"nasa\" + 0.003*\"purdue\" + 0.003*\"neie\"'),\n",
+       "  '0.011*\"mary\" + 0.007*\"ns\" + 0.006*\"joseph\" + 0.006*\"lucky\" + 0.006*\"ssrt\" + 0.005*\"god\" + 0.005*\"unfortunately\" + 0.004*\"rayshade\" + 0.004*\"phil\" + 0.004*\"nasa\"'),\n",
       " (1,\n",
-       "  '0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"'),\n",
+       "  '0.009*\"thanks\" + 0.009*\"targa\" + 0.008*\"whatever\" + 0.008*\"baptist\" + 0.007*\"islam\" + 0.006*\"cheers\" + 0.006*\"kent\" + 0.006*\"zoroastrians\" + 0.006*\"joy\" + 0.006*\"lot\"'),\n",
       " (2,\n",
-       "  '0.010*\"moon\" + 0.007*\"phobos\" + 0.006*\"unfortunately\" + 0.006*\"martian\" + 0.006*\"russian\" + 0.005*\"rayshade\" + 0.005*\"anybody\" + 0.005*\"perturbations\" + 0.005*\"thanks\" + 0.004*\"apollo\"'),\n",
+       "  '0.008*\"moon\" + 0.008*\"really\" + 0.008*\"western\" + 0.007*\"plane\" + 0.006*\"samaritan\" + 0.006*\"crusades\" + 0.006*\"baltimore\" + 0.005*\"bob\" + 0.005*\"septuagint\" + 0.005*\"virtual\"'),\n",
       " (3,\n",
-       "  '0.008*\"islam\" + 0.008*\"western\" + 0.007*\"jeff\" + 0.007*\"zoroastrians\" + 0.006*\"davidian\" + 0.006*\"basically\" + 0.005*\"bull\" + 0.005*\"gerald\" + 0.005*\"sorry\" + 0.004*\"kent\"')]"
+       "  '0.009*\"koresh\" + 0.008*\"bible\" + 0.008*\"jeff\" + 0.007*\"basically\" + 0.006*\"gerald\" + 0.006*\"bull\" + 0.005*\"pd\" + 0.004*\"also\" + 0.003*\"dam\" + 0.003*\"feiner\"')]"
      ]
     },
-     "execution_count": 76,
+     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -387,14 +413,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[(0, 0.7154438), (1, 0.10569019), (2, 0.09522807), (3, 0.08363795)]\n"
+      "[(0, 0.09161347), (1, 0.1133858), (2, 0.103424065), (3, 0.69157666)]\n"
     ]
    }
   ],
@ -406,7 +432,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
@ -427,14 +453,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 79,
+   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[(0, 0.06320839), (1, 0.80878526), (2, 0.06274223), (3, 0.065264106)]\n"
+      "[(0, 0.066217005), (1, 0.8084562), (2, 0.062542014), (3, 0.0627848)]\n"
     ]
    }
   ],
@ -446,14 +472,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 80,
+   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"\n"
+      "0.009*\"thanks\" + 0.009*\"targa\" + 0.008*\"whatever\" + 0.008*\"baptist\" + 0.007*\"islam\" + 0.006*\"cheers\" + 0.006*\"kent\" + 0.006*\"zoroastrians\" + 0.006*\"joy\" + 0.006*\"lot\"\n"
     ]
    }
   ],
@ -464,15 +490,15 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 81,
+   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[(0, 0.10564032), (1, 0.67894983), (2, 0.104482815), (3, 0.11092702)]\n",
-      "0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"\n"
+      "[(0, 0.11006463), (1, 0.6813435), (2, 0.10399808), (3, 0.10459379)]\n",
+      "0.009*\"thanks\" + 0.009*\"targa\" + 0.008*\"whatever\" + 0.008*\"baptist\" + 0.007*\"islam\" + 0.006*\"cheers\" + 0.006*\"kent\" + 0.006*\"zoroastrians\" + 0.006*\"joy\" + 0.006*\"lot\"\n"
     ]
    }
   ],
@ -492,7 +518,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 82,
+   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
@ -508,23 +534,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 83,
+   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0,\n",
-       "  '0.769*\"god\" + 0.346*\"jesus\" + 0.235*\"bible\" + 0.204*\"christian\" + 0.148*\"christians\" + 0.107*\"christ\" + 0.090*\"well\" + 0.085*\"koresh\" + 0.081*\"kent\" + 0.080*\"christianity\"'),\n",
+       "  '-0.769*\"god\" + -0.345*\"jesus\" + -0.235*\"bible\" + -0.203*\"christian\" + -0.149*\"christians\" + -0.107*\"christ\" + -0.089*\"well\" + -0.085*\"koresh\" + -0.082*\"kent\" + -0.081*\"christianity\"'),\n",
       " (1,\n",
-       "  '-0.863*\"thanks\" + -0.255*\"please\" + -0.159*\"hello\" + -0.152*\"hi\" + 0.124*\"god\" + -0.111*\"sorry\" + -0.088*\"could\" + -0.074*\"windows\" + -0.067*\"jpeg\" + -0.063*\"gif\"'),\n",
+       "  '-0.863*\"thanks\" + -0.255*\"please\" + -0.159*\"hello\" + -0.152*\"hi\" + 0.123*\"god\" + -0.112*\"sorry\" + -0.088*\"could\" + -0.074*\"windows\" + -0.067*\"jpeg\" + -0.063*\"gif\"'),\n",
       " (2,\n",
-       "  '-0.780*\"well\" + 0.229*\"god\" + -0.165*\"yes\" + 0.154*\"thanks\" + -0.133*\"ico\" + -0.133*\"tek\" + -0.130*\"queens\" + -0.130*\"bronx\" + -0.130*\"beauchaine\" + -0.130*\"manhattan\"'),\n",
+       "  '0.779*\"well\" + -0.229*\"god\" + 0.165*\"yes\" + -0.154*\"thanks\" + 0.135*\"ico\" + 0.134*\"tek\" + 0.131*\"queens\" + 0.131*\"bronx\" + 0.131*\"beauchaine\" + 0.131*\"manhattan\"'),\n",
       " (3,\n",
-       "  '-0.338*\"well\" + 0.336*\"ico\" + 0.334*\"tek\" + 0.328*\"bronx\" + 0.328*\"beauchaine\" + 0.328*\"queens\" + 0.326*\"manhattan\" + 0.305*\"com\" + 0.305*\"bob\" + 0.072*\"god\"')]"
+       "  '-0.342*\"well\" + 0.335*\"ico\" + 0.333*\"tek\" + 0.327*\"bronx\" + 0.327*\"queens\" + 0.327*\"beauchaine\" + 0.325*\"manhattan\" + 0.305*\"bob\" + 0.304*\"com\" + 0.073*\"god\"')]"
      ]
     },
-     "execution_count": 83,
+     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -536,7 +562,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 84,
+   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
@ -595,7 +621,7 @@
   "window_display": false
  },
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -609,7 +635,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.8"
+   "version": "3.10.10"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
Author	SHA1	Message	Date
cif	380340d66d	Updated 4_4 to use get_feature_names_out() instead of get_feature_names	2023-04-23 16:41:53 +02:00
cif	7f49f8990b	Updated 4_4 - using feature_log_prob_ instead of coef_ (deprecated)	2023-04-23 16:37:48 +02:00