1
0
mirror of https://github.com/gsi-upm/sitc synced 2025-08-24 02:22:21 +00:00

Updated 4_4 - using feature_log_prob_ instead of coef_ (deprecated)

This commit is contained in:
cif
2023-04-23 16:37:48 +02:00
parent 419ea57824
commit 7f49f8990b
2 changed files with 211 additions and 84 deletions

View File

@@ -76,7 +76,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 1,
"metadata": {},
"outputs": [
{
@@ -85,7 +85,7 @@
"(2034, 2807)"
]
},
"execution_count": 33,
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
@@ -134,15 +134,41 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: gensim in /home/cif/anaconda3/lib/python3.10/site-packages (4.3.1)\n",
"Requirement already satisfied: scipy>=1.7.0 in /home/cif/anaconda3/lib/python3.10/site-packages (from gensim) (1.10.1)\n",
"Requirement already satisfied: smart-open>=1.8.1 in /home/cif/anaconda3/lib/python3.10/site-packages (from gensim) (6.3.0)\n",
"Requirement already satisfied: numpy>=1.18.5 in /home/cif/anaconda3/lib/python3.10/site-packages (from gensim) (1.24.2)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: python-Levenshtein in /home/cif/anaconda3/lib/python3.10/site-packages (0.21.0)\n",
"Requirement already satisfied: Levenshtein==0.21.0 in /home/cif/anaconda3/lib/python3.10/site-packages (from python-Levenshtein) (0.21.0)\n",
"Requirement already satisfied: rapidfuzz<4.0.0,>=2.3.0 in /home/cif/anaconda3/lib/python3.10/site-packages (from Levenshtein==0.21.0->python-Levenshtein) (3.0.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install gensim\n",
"%pip install python-Levenshtein"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"from gensim import matutils\n",
"\n",
"vocab = vectorizer.get_feature_names()\n",
"vocab = vectorizer.get_feature_names_out()\n",
"\n",
"dictionary = dict([(i, s) for i, s in enumerate(vectorizer.get_feature_names())])\n",
"dictionary = dict([(i, s) for i, s in enumerate(vectorizer.get_feature_names_out())])\n",
"corpus_tfidf = matutils.Sparse2Corpus(vectors_train)"
]
},
@@ -162,7 +188,7 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -176,23 +202,23 @@
},
{
"cell_type": "code",
"execution_count": 61,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" '0.011*\"baptist\" + 0.010*\"koresh\" + 0.009*\"bible\" + 0.006*\"reality\" + 0.006*\"virtual\" + 0.005*\"scarlet\" + 0.005*\"shag\" + 0.004*\"tootsie\" + 0.004*\"kinda\" + 0.004*\"captain\"'),\n",
" '0.004*\"central\" + 0.004*\"assumptions\" + 0.004*\"matthew\" + 0.004*\"define\" + 0.004*\"holes\" + 0.003*\"killing\" + 0.003*\"item\" + 0.003*\"curious\" + 0.003*\"going\" + 0.003*\"presentations\"'),\n",
" (1,\n",
" '0.010*\"targa\" + 0.008*\"thanks\" + 0.008*\"moon\" + 0.007*\"craig\" + 0.007*\"zoroastrians\" + 0.006*\"yayayay\" + 0.005*\"unfortunately\" + 0.005*\"windows\" + 0.005*\"rayshade\" + 0.004*\"tdb\"'),\n",
" '0.002*\"mechanism\" + 0.002*\"led\" + 0.002*\"apple\" + 0.002*\"color\" + 0.002*\"mormons\" + 0.002*\"activity\" + 0.002*\"concepts\" + 0.002*\"frank\" + 0.002*\"platform\" + 0.002*\"fault\"'),\n",
" (2,\n",
" '0.009*\"mary\" + 0.007*\"whatever\" + 0.006*\"god\" + 0.005*\"ns\" + 0.005*\"lucky\" + 0.005*\"joseph\" + 0.005*\"ssrt\" + 0.005*\"samaritan\" + 0.005*\"crusades\" + 0.004*\"phobos\"'),\n",
" '0.005*\"objects\" + 0.005*\"obtained\" + 0.003*\"manhattan\" + 0.003*\"capability\" + 0.003*\"education\" + 0.003*\"men\" + 0.003*\"photo\" + 0.003*\"decent\" + 0.003*\"environmental\" + 0.003*\"pain\"'),\n",
" (3,\n",
" '0.009*\"islam\" + 0.008*\"western\" + 0.008*\"plane\" + 0.008*\"jeff\" + 0.007*\"cheers\" + 0.007*\"kent\" + 0.007*\"joy\" + 0.007*\"khomeini\" + 0.007*\"davidian\" + 0.006*\"basically\"')]"
" '0.004*\"car\" + 0.004*\"contain\" + 0.004*\"groups\" + 0.004*\"center\" + 0.004*\"evil\" + 0.004*\"maintain\" + 0.004*\"comets\" + 0.004*\"88\" + 0.004*\"density\" + 0.003*\"company\"')]"
]
},
"execution_count": 61,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -211,7 +237,7 @@
},
{
"cell_type": "code",
"execution_count": 62,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -243,14 +269,14 @@
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary(10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...)\n"
"Dictionary<10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...>\n"
]
}
],
@@ -263,7 +289,7 @@
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -274,7 +300,7 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -286,14 +312,14 @@
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary(10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...)\n"
"Dictionary<10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...>\n"
]
}
],
@@ -305,7 +331,7 @@
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -315,7 +341,7 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -328,7 +354,7 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -346,7 +372,7 @@
},
{
"cell_type": "code",
"execution_count": 75,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -359,23 +385,23 @@
},
{
"cell_type": "code",
"execution_count": 76,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" '0.009*\"whatever\" + 0.007*\"plane\" + 0.007*\"ns\" + 0.007*\"joy\" + 0.006*\"happy\" + 0.005*\"bob\" + 0.004*\"phil\" + 0.004*\"nasa\" + 0.003*\"purdue\" + 0.003*\"neie\"'),\n",
" '0.011*\"mary\" + 0.007*\"ns\" + 0.006*\"joseph\" + 0.006*\"lucky\" + 0.006*\"ssrt\" + 0.005*\"god\" + 0.005*\"unfortunately\" + 0.004*\"rayshade\" + 0.004*\"phil\" + 0.004*\"nasa\"'),\n",
" (1,\n",
" '0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"'),\n",
" '0.009*\"thanks\" + 0.009*\"targa\" + 0.008*\"whatever\" + 0.008*\"baptist\" + 0.007*\"islam\" + 0.006*\"cheers\" + 0.006*\"kent\" + 0.006*\"zoroastrians\" + 0.006*\"joy\" + 0.006*\"lot\"'),\n",
" (2,\n",
" '0.010*\"moon\" + 0.007*\"phobos\" + 0.006*\"unfortunately\" + 0.006*\"martian\" + 0.006*\"russian\" + 0.005*\"rayshade\" + 0.005*\"anybody\" + 0.005*\"perturbations\" + 0.005*\"thanks\" + 0.004*\"apollo\"'),\n",
" '0.008*\"moon\" + 0.008*\"really\" + 0.008*\"western\" + 0.007*\"plane\" + 0.006*\"samaritan\" + 0.006*\"crusades\" + 0.006*\"baltimore\" + 0.005*\"bob\" + 0.005*\"septuagint\" + 0.005*\"virtual\"'),\n",
" (3,\n",
" '0.008*\"islam\" + 0.008*\"western\" + 0.007*\"jeff\" + 0.007*\"zoroastrians\" + 0.006*\"davidian\" + 0.006*\"basically\" + 0.005*\"bull\" + 0.005*\"gerald\" + 0.005*\"sorry\" + 0.004*\"kent\"')]"
" '0.009*\"koresh\" + 0.008*\"bible\" + 0.008*\"jeff\" + 0.007*\"basically\" + 0.006*\"gerald\" + 0.006*\"bull\" + 0.005*\"pd\" + 0.004*\"also\" + 0.003*\"dam\" + 0.003*\"feiner\"')]"
]
},
"execution_count": 76,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -387,14 +413,14 @@
},
{
"cell_type": "code",
"execution_count": 77,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.7154438), (1, 0.10569019), (2, 0.09522807), (3, 0.08363795)]\n"
"[(0, 0.09161347), (1, 0.1133858), (2, 0.103424065), (3, 0.69157666)]\n"
]
}
],
@@ -406,7 +432,7 @@
},
{
"cell_type": "code",
"execution_count": 78,
"execution_count": 16,
"metadata": {},
"outputs": [
{
@@ -427,14 +453,14 @@
},
{
"cell_type": "code",
"execution_count": 79,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.06320839), (1, 0.80878526), (2, 0.06274223), (3, 0.065264106)]\n"
"[(0, 0.066217005), (1, 0.8084562), (2, 0.062542014), (3, 0.0627848)]\n"
]
}
],
@@ -446,14 +472,14 @@
},
{
"cell_type": "code",
"execution_count": 80,
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"\n"
"0.009*\"thanks\" + 0.009*\"targa\" + 0.008*\"whatever\" + 0.008*\"baptist\" + 0.007*\"islam\" + 0.006*\"cheers\" + 0.006*\"kent\" + 0.006*\"zoroastrians\" + 0.006*\"joy\" + 0.006*\"lot\"\n"
]
}
],
@@ -464,15 +490,15 @@
},
{
"cell_type": "code",
"execution_count": 81,
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.10564032), (1, 0.67894983), (2, 0.104482815), (3, 0.11092702)]\n",
"0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"\n"
"[(0, 0.11006463), (1, 0.6813435), (2, 0.10399808), (3, 0.10459379)]\n",
"0.009*\"thanks\" + 0.009*\"targa\" + 0.008*\"whatever\" + 0.008*\"baptist\" + 0.007*\"islam\" + 0.006*\"cheers\" + 0.006*\"kent\" + 0.006*\"zoroastrians\" + 0.006*\"joy\" + 0.006*\"lot\"\n"
]
}
],
@@ -492,7 +518,7 @@
},
{
"cell_type": "code",
"execution_count": 82,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -508,23 +534,23 @@
},
{
"cell_type": "code",
"execution_count": 83,
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" '0.769*\"god\" + 0.346*\"jesus\" + 0.235*\"bible\" + 0.204*\"christian\" + 0.148*\"christians\" + 0.107*\"christ\" + 0.090*\"well\" + 0.085*\"koresh\" + 0.081*\"kent\" + 0.080*\"christianity\"'),\n",
" '-0.769*\"god\" + -0.345*\"jesus\" + -0.235*\"bible\" + -0.203*\"christian\" + -0.149*\"christians\" + -0.107*\"christ\" + -0.089*\"well\" + -0.085*\"koresh\" + -0.082*\"kent\" + -0.081*\"christianity\"'),\n",
" (1,\n",
" '-0.863*\"thanks\" + -0.255*\"please\" + -0.159*\"hello\" + -0.152*\"hi\" + 0.124*\"god\" + -0.111*\"sorry\" + -0.088*\"could\" + -0.074*\"windows\" + -0.067*\"jpeg\" + -0.063*\"gif\"'),\n",
" '-0.863*\"thanks\" + -0.255*\"please\" + -0.159*\"hello\" + -0.152*\"hi\" + 0.123*\"god\" + -0.112*\"sorry\" + -0.088*\"could\" + -0.074*\"windows\" + -0.067*\"jpeg\" + -0.063*\"gif\"'),\n",
" (2,\n",
" '-0.780*\"well\" + 0.229*\"god\" + -0.165*\"yes\" + 0.154*\"thanks\" + -0.133*\"ico\" + -0.133*\"tek\" + -0.130*\"queens\" + -0.130*\"bronx\" + -0.130*\"beauchaine\" + -0.130*\"manhattan\"'),\n",
" '0.779*\"well\" + -0.229*\"god\" + 0.165*\"yes\" + -0.154*\"thanks\" + 0.135*\"ico\" + 0.134*\"tek\" + 0.131*\"queens\" + 0.131*\"bronx\" + 0.131*\"beauchaine\" + 0.131*\"manhattan\"'),\n",
" (3,\n",
" '-0.338*\"well\" + 0.336*\"ico\" + 0.334*\"tek\" + 0.328*\"bronx\" + 0.328*\"beauchaine\" + 0.328*\"queens\" + 0.326*\"manhattan\" + 0.305*\"com\" + 0.305*\"bob\" + 0.072*\"god\"')]"
" '-0.342*\"well\" + 0.335*\"ico\" + 0.333*\"tek\" + 0.327*\"bronx\" + 0.327*\"queens\" + 0.327*\"beauchaine\" + 0.325*\"manhattan\" + 0.305*\"bob\" + 0.304*\"com\" + 0.073*\"god\"')]"
]
},
"execution_count": 83,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@@ -536,7 +562,7 @@
},
{
"cell_type": "code",
"execution_count": 84,
"execution_count": 22,
"metadata": {},
"outputs": [
{
@@ -595,7 +621,7 @@
"window_display": false
},
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -609,7 +635,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
"version": "3.10.10"
},
"latex_envs": {
"LaTeX_envs_menu_present": true,