1
0
mirror of https://github.com/gsi-upm/sitc synced 2024-12-22 11:48:12 +00:00

Updated 4_4 - using feature_log_prob_ instead of coef_ (deprecated)

This commit is contained in:
cif 2023-04-23 16:37:48 +02:00
parent 419ea57824
commit 7f49f8990b
2 changed files with 211 additions and 84 deletions

View File

@ -74,9 +74,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n"
]
}
],
"source": [ "source": [
"from sklearn.datasets import fetch_20newsgroups\n", "from sklearn.datasets import fetch_20newsgroups\n",
"\n", "\n",
@ -90,9 +98,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20\n"
]
}
],
"source": [ "source": [
"#Number of categories\n", "#Number of categories\n",
"print(len(newsgroups_train.target_names))" "print(len(newsgroups_train.target_names))"
@ -100,9 +116,26 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Category id 4 comp.sys.mac.hardware\n",
"Doc A fair number of brave souls who upgraded their SI clock oscillator have\n",
"shared their experiences for this poll. Please send a brief message detailing\n",
"your experiences with the procedure. Top speed attained, CPU rated speed,\n",
"add on cards and adapters, heat sinks, hour of usage per day, floppy disk\n",
"functionality with 800 and 1.4 m floppies are especially requested.\n",
"\n",
"I will be summarizing in the next two days, so please add to the network\n",
"knowledge base if you have done the clock upgrade and haven't answered this\n",
"poll. Thanks.\n"
]
}
],
"source": [ "source": [
"# Show a document\n", "# Show a document\n",
"docid = 1\n", "docid = 1\n",
@ -115,9 +148,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"(11314,)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"#Number of files\n", "#Number of files\n",
"newsgroups_train.filenames.shape" "newsgroups_train.filenames.shape"
@ -125,9 +169,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"(11314, 101322)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"# Obtain a vector\n", "# Obtain a vector\n",
"\n", "\n",
@ -141,9 +196,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"66.802987449178"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"# The tf-idf vectors are very sparse with an average of 66 non zero components in 101.323 dimensions (.06%)\n", "# The tf-idf vectors are very sparse with an average of 66 non zero components in 101.323 dimensions (.06%)\n",
"vectors_train.nnz / float(vectors_train.shape[0])" "vectors_train.nnz / float(vectors_train.shape[0])"
@ -165,9 +231,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"0.695453607190013"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.naive_bayes import MultinomialNB\n",
"\n", "\n",
@ -195,29 +272,44 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"alt.atheism: islam atheists say just religion atheism think don people god\n",
"comp.graphics: looking format 3d know program file files thanks image graphics\n",
"comp.os.ms-windows.misc: card problem thanks driver drivers use files dos file windows\n",
"comp.sys.ibm.pc.hardware: monitor disk thanks pc ide controller bus card scsi drive\n",
"comp.sys.mac.hardware: know monitor does quadra simms thanks problem drive apple mac\n",
"comp.windows.x: using windows x11r5 use application thanks widget server motif window\n",
"misc.forsale: asking email sell price condition new shipping offer 00 sale\n",
"rec.autos: don ford new good dealer just engine like cars car\n",
"rec.motorcycles: don just helmet riding like motorcycle ride bikes dod bike\n",
"rec.sport.baseball: braves players pitching hit runs games game baseball team year\n",
"rec.sport.hockey: league year nhl games season players play hockey team game\n",
"sci.crypt: people use escrow nsa keys government chip clipper encryption key\n",
"sci.electronics: don thanks voltage used know does like circuit power use\n",
"sci.med: skepticism cadre dsl banks chastity n3jxp pitt gordon geb msg\n",
"sci.space: just lunar earth shuttle like moon launch orbit nasa space\n",
"soc.religion.christian: believe faith christian christ bible people christians church jesus god\n",
"talk.politics.guns: just law firearms government fbi don weapons people guns gun\n",
"talk.politics.mideast: said arabs arab turkish people armenians armenian jews israeli israel\n",
"talk.politics.misc: know state clinton president just think tax don government people\n",
"talk.religion.misc: think don koresh objective christians bible people christian jesus god\n"
]
}
],
"source": [ "source": [
"from sklearn.utils.extmath import density\n", "# We can review the top features per topic in Bayes (attribute feature_log_prob_)\n",
"\n",
"print(\"dimensionality: %d\" % model.coef_.shape[1])\n",
"print(\"density: %f\" % density(model.coef_))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We can review the top features per topic in Bayes (attribute coef_)\n",
"import numpy as np\n", "import numpy as np\n",
"\n", "\n",
"def show_top10(classifier, vectorizer, categories):\n", "def show_top10(classifier, vectorizer, categories):\n",
" feature_names = np.asarray(vectorizer.get_feature_names())\n", " feature_names = np.asarray(vectorizer.get_feature_names_out())\n",
" for i, category in enumerate(categories):\n", " for i, category in enumerate(categories):\n",
" top10 = np.argsort(classifier.coef_[i])[-10:]\n", " top10 = np.argsort(classifier.feature_log_prob_[i, :])[-10:]\n",
" print(\"%s: %s\" % (category, \" \".join(feature_names[top10])))\n", " print(\"%s: %s\" % (category, \" \".join(feature_names[top10])))\n",
"\n", "\n",
" \n", " \n",
@ -226,9 +318,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 10,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 2 15]\n",
"['comp.os.ms-windows.misc', 'soc.religion.christian']\n"
]
}
],
"source": [ "source": [
"# We try the classifier in two new docs\n", "# We try the classifier in two new docs\n",
"\n", "\n",
@ -275,7 +376,7 @@
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "Python 3", "display_name": "Python 3 (ipykernel)",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
@ -289,7 +390,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.7.1" "version": "3.10.10"
}, },
"latex_envs": { "latex_envs": {
"LaTeX_envs_menu_present": true, "LaTeX_envs_menu_present": true,

View File

@ -76,7 +76,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 33, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -85,7 +85,7 @@
"(2034, 2807)" "(2034, 2807)"
] ]
}, },
"execution_count": 33, "execution_count": 1,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -134,15 +134,41 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 34, "execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: gensim in /home/cif/anaconda3/lib/python3.10/site-packages (4.3.1)\n",
"Requirement already satisfied: scipy>=1.7.0 in /home/cif/anaconda3/lib/python3.10/site-packages (from gensim) (1.10.1)\n",
"Requirement already satisfied: smart-open>=1.8.1 in /home/cif/anaconda3/lib/python3.10/site-packages (from gensim) (6.3.0)\n",
"Requirement already satisfied: numpy>=1.18.5 in /home/cif/anaconda3/lib/python3.10/site-packages (from gensim) (1.24.2)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: python-Levenshtein in /home/cif/anaconda3/lib/python3.10/site-packages (0.21.0)\n",
"Requirement already satisfied: Levenshtein==0.21.0 in /home/cif/anaconda3/lib/python3.10/site-packages (from python-Levenshtein) (0.21.0)\n",
"Requirement already satisfied: rapidfuzz<4.0.0,>=2.3.0 in /home/cif/anaconda3/lib/python3.10/site-packages (from Levenshtein==0.21.0->python-Levenshtein) (3.0.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install gensim\n",
"%pip install python-Levenshtein"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from gensim import matutils\n", "from gensim import matutils\n",
"\n", "\n",
"vocab = vectorizer.get_feature_names()\n", "vocab = vectorizer.get_feature_names_out()\n",
"\n", "\n",
"dictionary = dict([(i, s) for i, s in enumerate(vectorizer.get_feature_names())])\n", "dictionary = dict([(i, s) for i, s in enumerate(vectorizer.get_feature_names_out())])\n",
"corpus_tfidf = matutils.Sparse2Corpus(vectors_train)" "corpus_tfidf = matutils.Sparse2Corpus(vectors_train)"
] ]
}, },
@ -162,7 +188,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 60, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -176,23 +202,23 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 61, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"[(0,\n", "[(0,\n",
" '0.011*\"baptist\" + 0.010*\"koresh\" + 0.009*\"bible\" + 0.006*\"reality\" + 0.006*\"virtual\" + 0.005*\"scarlet\" + 0.005*\"shag\" + 0.004*\"tootsie\" + 0.004*\"kinda\" + 0.004*\"captain\"'),\n", " '0.004*\"central\" + 0.004*\"assumptions\" + 0.004*\"matthew\" + 0.004*\"define\" + 0.004*\"holes\" + 0.003*\"killing\" + 0.003*\"item\" + 0.003*\"curious\" + 0.003*\"going\" + 0.003*\"presentations\"'),\n",
" (1,\n", " (1,\n",
" '0.010*\"targa\" + 0.008*\"thanks\" + 0.008*\"moon\" + 0.007*\"craig\" + 0.007*\"zoroastrians\" + 0.006*\"yayayay\" + 0.005*\"unfortunately\" + 0.005*\"windows\" + 0.005*\"rayshade\" + 0.004*\"tdb\"'),\n", " '0.002*\"mechanism\" + 0.002*\"led\" + 0.002*\"apple\" + 0.002*\"color\" + 0.002*\"mormons\" + 0.002*\"activity\" + 0.002*\"concepts\" + 0.002*\"frank\" + 0.002*\"platform\" + 0.002*\"fault\"'),\n",
" (2,\n", " (2,\n",
" '0.009*\"mary\" + 0.007*\"whatever\" + 0.006*\"god\" + 0.005*\"ns\" + 0.005*\"lucky\" + 0.005*\"joseph\" + 0.005*\"ssrt\" + 0.005*\"samaritan\" + 0.005*\"crusades\" + 0.004*\"phobos\"'),\n", " '0.005*\"objects\" + 0.005*\"obtained\" + 0.003*\"manhattan\" + 0.003*\"capability\" + 0.003*\"education\" + 0.003*\"men\" + 0.003*\"photo\" + 0.003*\"decent\" + 0.003*\"environmental\" + 0.003*\"pain\"'),\n",
" (3,\n", " (3,\n",
" '0.009*\"islam\" + 0.008*\"western\" + 0.008*\"plane\" + 0.008*\"jeff\" + 0.007*\"cheers\" + 0.007*\"kent\" + 0.007*\"joy\" + 0.007*\"khomeini\" + 0.007*\"davidian\" + 0.006*\"basically\"')]" " '0.004*\"car\" + 0.004*\"contain\" + 0.004*\"groups\" + 0.004*\"center\" + 0.004*\"evil\" + 0.004*\"maintain\" + 0.004*\"comets\" + 0.004*\"88\" + 0.004*\"density\" + 0.003*\"company\"')]"
] ]
}, },
"execution_count": 61, "execution_count": 4,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -211,7 +237,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 62, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -243,14 +269,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 63, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Dictionary(10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...)\n" "Dictionary<10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...>\n"
] ]
} }
], ],
@ -263,7 +289,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 64, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -274,7 +300,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 65, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -286,14 +312,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 71, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Dictionary(10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...)\n" "Dictionary<10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...>\n"
] ]
} }
], ],
@ -305,7 +331,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 72, "execution_count": 10,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -315,7 +341,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 73, "execution_count": 11,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -328,7 +354,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 74, "execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -346,7 +372,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 75, "execution_count": 13,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -359,23 +385,23 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 76, "execution_count": 14,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"[(0,\n", "[(0,\n",
" '0.009*\"whatever\" + 0.007*\"plane\" + 0.007*\"ns\" + 0.007*\"joy\" + 0.006*\"happy\" + 0.005*\"bob\" + 0.004*\"phil\" + 0.004*\"nasa\" + 0.003*\"purdue\" + 0.003*\"neie\"'),\n", " '0.011*\"mary\" + 0.007*\"ns\" + 0.006*\"joseph\" + 0.006*\"lucky\" + 0.006*\"ssrt\" + 0.005*\"god\" + 0.005*\"unfortunately\" + 0.004*\"rayshade\" + 0.004*\"phil\" + 0.004*\"nasa\"'),\n",
" (1,\n", " (1,\n",
" '0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"'),\n", " '0.009*\"thanks\" + 0.009*\"targa\" + 0.008*\"whatever\" + 0.008*\"baptist\" + 0.007*\"islam\" + 0.006*\"cheers\" + 0.006*\"kent\" + 0.006*\"zoroastrians\" + 0.006*\"joy\" + 0.006*\"lot\"'),\n",
" (2,\n", " (2,\n",
" '0.010*\"moon\" + 0.007*\"phobos\" + 0.006*\"unfortunately\" + 0.006*\"martian\" + 0.006*\"russian\" + 0.005*\"rayshade\" + 0.005*\"anybody\" + 0.005*\"perturbations\" + 0.005*\"thanks\" + 0.004*\"apollo\"'),\n", " '0.008*\"moon\" + 0.008*\"really\" + 0.008*\"western\" + 0.007*\"plane\" + 0.006*\"samaritan\" + 0.006*\"crusades\" + 0.006*\"baltimore\" + 0.005*\"bob\" + 0.005*\"septuagint\" + 0.005*\"virtual\"'),\n",
" (3,\n", " (3,\n",
" '0.008*\"islam\" + 0.008*\"western\" + 0.007*\"jeff\" + 0.007*\"zoroastrians\" + 0.006*\"davidian\" + 0.006*\"basically\" + 0.005*\"bull\" + 0.005*\"gerald\" + 0.005*\"sorry\" + 0.004*\"kent\"')]" " '0.009*\"koresh\" + 0.008*\"bible\" + 0.008*\"jeff\" + 0.007*\"basically\" + 0.006*\"gerald\" + 0.006*\"bull\" + 0.005*\"pd\" + 0.004*\"also\" + 0.003*\"dam\" + 0.003*\"feiner\"')]"
] ]
}, },
"execution_count": 76, "execution_count": 14,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -387,14 +413,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 77, "execution_count": 15,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[(0, 0.7154438), (1, 0.10569019), (2, 0.09522807), (3, 0.08363795)]\n" "[(0, 0.09161347), (1, 0.1133858), (2, 0.103424065), (3, 0.69157666)]\n"
] ]
} }
], ],
@ -406,7 +432,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 78, "execution_count": 16,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -427,14 +453,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 79, "execution_count": 17,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[(0, 0.06320839), (1, 0.80878526), (2, 0.06274223), (3, 0.065264106)]\n" "[(0, 0.066217005), (1, 0.8084562), (2, 0.062542014), (3, 0.0627848)]\n"
] ]
} }
], ],
@ -446,14 +472,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 80, "execution_count": 18,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"\n" "0.009*\"thanks\" + 0.009*\"targa\" + 0.008*\"whatever\" + 0.008*\"baptist\" + 0.007*\"islam\" + 0.006*\"cheers\" + 0.006*\"kent\" + 0.006*\"zoroastrians\" + 0.006*\"joy\" + 0.006*\"lot\"\n"
] ]
} }
], ],
@ -464,15 +490,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 81, "execution_count": 19,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[(0, 0.10564032), (1, 0.67894983), (2, 0.104482815), (3, 0.11092702)]\n", "[(0, 0.11006463), (1, 0.6813435), (2, 0.10399808), (3, 0.10459379)]\n",
"0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"\n" "0.009*\"thanks\" + 0.009*\"targa\" + 0.008*\"whatever\" + 0.008*\"baptist\" + 0.007*\"islam\" + 0.006*\"cheers\" + 0.006*\"kent\" + 0.006*\"zoroastrians\" + 0.006*\"joy\" + 0.006*\"lot\"\n"
] ]
} }
], ],
@ -492,7 +518,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 82, "execution_count": 20,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -508,23 +534,23 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 83, "execution_count": 21,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"[(0,\n", "[(0,\n",
" '0.769*\"god\" + 0.346*\"jesus\" + 0.235*\"bible\" + 0.204*\"christian\" + 0.148*\"christians\" + 0.107*\"christ\" + 0.090*\"well\" + 0.085*\"koresh\" + 0.081*\"kent\" + 0.080*\"christianity\"'),\n", " '-0.769*\"god\" + -0.345*\"jesus\" + -0.235*\"bible\" + -0.203*\"christian\" + -0.149*\"christians\" + -0.107*\"christ\" + -0.089*\"well\" + -0.085*\"koresh\" + -0.082*\"kent\" + -0.081*\"christianity\"'),\n",
" (1,\n", " (1,\n",
" '-0.863*\"thanks\" + -0.255*\"please\" + -0.159*\"hello\" + -0.152*\"hi\" + 0.124*\"god\" + -0.111*\"sorry\" + -0.088*\"could\" + -0.074*\"windows\" + -0.067*\"jpeg\" + -0.063*\"gif\"'),\n", " '-0.863*\"thanks\" + -0.255*\"please\" + -0.159*\"hello\" + -0.152*\"hi\" + 0.123*\"god\" + -0.112*\"sorry\" + -0.088*\"could\" + -0.074*\"windows\" + -0.067*\"jpeg\" + -0.063*\"gif\"'),\n",
" (2,\n", " (2,\n",
" '-0.780*\"well\" + 0.229*\"god\" + -0.165*\"yes\" + 0.154*\"thanks\" + -0.133*\"ico\" + -0.133*\"tek\" + -0.130*\"queens\" + -0.130*\"bronx\" + -0.130*\"beauchaine\" + -0.130*\"manhattan\"'),\n", " '0.779*\"well\" + -0.229*\"god\" + 0.165*\"yes\" + -0.154*\"thanks\" + 0.135*\"ico\" + 0.134*\"tek\" + 0.131*\"queens\" + 0.131*\"bronx\" + 0.131*\"beauchaine\" + 0.131*\"manhattan\"'),\n",
" (3,\n", " (3,\n",
" '-0.338*\"well\" + 0.336*\"ico\" + 0.334*\"tek\" + 0.328*\"bronx\" + 0.328*\"beauchaine\" + 0.328*\"queens\" + 0.326*\"manhattan\" + 0.305*\"com\" + 0.305*\"bob\" + 0.072*\"god\"')]" " '-0.342*\"well\" + 0.335*\"ico\" + 0.333*\"tek\" + 0.327*\"bronx\" + 0.327*\"queens\" + 0.327*\"beauchaine\" + 0.325*\"manhattan\" + 0.305*\"bob\" + 0.304*\"com\" + 0.073*\"god\"')]"
] ]
}, },
"execution_count": 83, "execution_count": 21,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -536,7 +562,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 84, "execution_count": 22,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -595,7 +621,7 @@
"window_display": false "window_display": false
}, },
"kernelspec": { "kernelspec": {
"display_name": "Python 3", "display_name": "Python 3 (ipykernel)",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
@ -609,7 +635,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.8.8" "version": "3.10.10"
}, },
"latex_envs": { "latex_envs": {
"LaTeX_envs_menu_present": true, "LaTeX_envs_menu_present": true,