1
0
mirror of https://github.com/gsi-upm/sitc synced 2024-11-21 14:02:28 +00:00

Updated with the new libraries

This commit is contained in:
cif2cif 2021-05-07 11:10:21 +02:00
parent 2ba0e2f3d9
commit ae8d3d3ba2

View File

@ -76,7 +76,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 33,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -85,7 +85,7 @@
"(2034, 2807)" "(2034, 2807)"
] ]
}, },
"execution_count": 1, "execution_count": 33,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -126,12 +126,15 @@
"source": [ "source": [
"Although scikit-learn provides an LDA implementation, it is more popular the package *gensim*, which also provides an LSI implementation, as well as other functionalities. Fortunately, scikit-learn sparse matrices can be used in Gensim using the function *matutils.Sparse2Corpus()*. Anyway, if you are using intensively LDA,it can be convenient to create the corpus with their functions.\n", "Although scikit-learn provides an LDA implementation, it is more popular the package *gensim*, which also provides an LSI implementation, as well as other functionalities. Fortunately, scikit-learn sparse matrices can be used in Gensim using the function *matutils.Sparse2Corpus()*. Anyway, if you are using intensively LDA,it can be convenient to create the corpus with their functions.\n",
"\n", "\n",
"You should install first *gensim*. Run 'conda install -c anaconda gensim=0.12.4' in a terminal." "You should install first:\n",
"\n",
"* *gensim*. Run 'conda install gensim' in a terminal.\n",
"* *python-Levenshtein*. Run 'conda install python-Levenshtein' in a terminal"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 34,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -159,7 +162,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 60,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -173,23 +176,23 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 61,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"[(0,\n", "[(0,\n",
" '0.007*\"car\" + 0.006*\"increased\" + 0.006*\"closely\" + 0.006*\"groups\" + 0.006*\"center\" + 0.006*\"88\" + 0.006*\"offer\" + 0.005*\"archie\" + 0.005*\"beginning\" + 0.005*\"comets\"'),\n", " '0.011*\"baptist\" + 0.010*\"koresh\" + 0.009*\"bible\" + 0.006*\"reality\" + 0.006*\"virtual\" + 0.005*\"scarlet\" + 0.005*\"shag\" + 0.004*\"tootsie\" + 0.004*\"kinda\" + 0.004*\"captain\"'),\n",
" (1,\n", " (1,\n",
" '0.005*\"allow\" + 0.005*\"discuss\" + 0.005*\"condition\" + 0.004*\"certain\" + 0.004*\"member\" + 0.004*\"manipulation\" + 0.004*\"little\" + 0.003*\"proposal\" + 0.003*\"heavily\" + 0.003*\"obvious\"'),\n", " '0.010*\"targa\" + 0.008*\"thanks\" + 0.008*\"moon\" + 0.007*\"craig\" + 0.007*\"zoroastrians\" + 0.006*\"yayayay\" + 0.005*\"unfortunately\" + 0.005*\"windows\" + 0.005*\"rayshade\" + 0.004*\"tdb\"'),\n",
" (2,\n", " (2,\n",
" '0.002*\"led\" + 0.002*\"mechanism\" + 0.002*\"frank\" + 0.002*\"platform\" + 0.002*\"mormons\" + 0.002*\"concepts\" + 0.002*\"proton\" + 0.002*\"aeronautics\" + 0.002*\"header\" + 0.002*\"foreign\"'),\n", " '0.009*\"mary\" + 0.007*\"whatever\" + 0.006*\"god\" + 0.005*\"ns\" + 0.005*\"lucky\" + 0.005*\"joseph\" + 0.005*\"ssrt\" + 0.005*\"samaritan\" + 0.005*\"crusades\" + 0.004*\"phobos\"'),\n",
" (3,\n", " (3,\n",
" '0.004*\"objects\" + 0.003*\"activity\" + 0.003*\"manhattan\" + 0.003*\"obtained\" + 0.003*\"eyes\" + 0.003*\"education\" + 0.003*\"netters\" + 0.003*\"complex\" + 0.003*\"europe\" + 0.002*\"missions\"')]" " '0.009*\"islam\" + 0.008*\"western\" + 0.008*\"plane\" + 0.008*\"jeff\" + 0.007*\"cheers\" + 0.007*\"kent\" + 0.007*\"joy\" + 0.007*\"khomeini\" + 0.007*\"davidian\" + 0.006*\"basically\"')]"
] ]
}, },
"execution_count": 4, "execution_count": 61,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -208,7 +211,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 62,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -240,7 +243,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 63,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -253,14 +256,14 @@
], ],
"source": [ "source": [
"# You can save the dictionary\n", "# You can save the dictionary\n",
"dictionary.save('newsgroup.dict')\n", "dictionary.save('newsgroup.dict.texts')\n",
"\n", "\n",
"print(dictionary)" "print(dictionary)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 64,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -271,7 +274,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 65,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -283,28 +286,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 71,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:random_state not set so using default value\n",
"WARNING:root:failed to load state from newsgroups.dict.state: [Errno 2] No such file or directory: 'newsgroups.dict.state'\n"
]
}
],
"source": [
"# You can optionally save the dictionary \n",
"\n",
"dictionary.save('newsgroups.dict')\n",
"lda = LdaModel.load('newsgroups.dict')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -323,7 +305,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 72,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -333,7 +315,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 73,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -346,7 +328,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 74,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -364,7 +346,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 75,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -377,23 +359,23 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 21, "execution_count": 76,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"[(0,\n", "[(0,\n",
" '0.011*\"thanks\" + 0.010*\"targa\" + 0.008*\"mary\" + 0.008*\"western\" + 0.007*\"craig\" + 0.007*\"jeff\" + 0.006*\"yayayay\" + 0.006*\"phobos\" + 0.005*\"unfortunately\" + 0.005*\"martian\"'),\n", " '0.009*\"whatever\" + 0.007*\"plane\" + 0.007*\"ns\" + 0.007*\"joy\" + 0.006*\"happy\" + 0.005*\"bob\" + 0.004*\"phil\" + 0.004*\"nasa\" + 0.003*\"purdue\" + 0.003*\"neie\"'),\n",
" (1,\n", " (1,\n",
" '0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"'),\n", " '0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"'),\n",
" (2,\n", " (2,\n",
" '0.009*\"whatever\" + 0.009*\"baptist\" + 0.007*\"cheers\" + 0.007*\"kent\" + 0.006*\"khomeini\" + 0.006*\"davidian\" + 0.005*\"gerald\" + 0.005*\"bull\" + 0.005*\"sorry\" + 0.005*\"jesus\"'),\n", " '0.010*\"moon\" + 0.007*\"phobos\" + 0.006*\"unfortunately\" + 0.006*\"martian\" + 0.006*\"russian\" + 0.005*\"rayshade\" + 0.005*\"anybody\" + 0.005*\"perturbations\" + 0.005*\"thanks\" + 0.004*\"apollo\"'),\n",
" (3,\n", " (3,\n",
" '0.005*\"pd\" + 0.004*\"baltimore\" + 0.004*\"also\" + 0.003*\"ipx\" + 0.003*\"dam\" + 0.003*\"feiner\" + 0.003*\"foley\" + 0.003*\"ideally\" + 0.003*\"srgp\" + 0.003*\"thank\"')]" " '0.008*\"islam\" + 0.008*\"western\" + 0.007*\"jeff\" + 0.007*\"zoroastrians\" + 0.006*\"davidian\" + 0.006*\"basically\" + 0.005*\"bull\" + 0.005*\"gerald\" + 0.005*\"sorry\" + 0.004*\"kent\"')]"
] ]
}, },
"execution_count": 21, "execution_count": 76,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -405,14 +387,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 22, "execution_count": 77,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[(0, 0.09401487), (1, 0.08991001), (2, 0.08514047), (3, 0.7309346)]\n" "[(0, 0.7154438), (1, 0.10569019), (2, 0.09522807), (3, 0.08363795)]\n"
] ]
} }
], ],
@ -424,7 +406,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 24, "execution_count": 78,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -445,14 +427,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 25, "execution_count": 79,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[(0, 0.06678458), (1, 0.8006135), (2, 0.06974816), (3, 0.062853776)]\n" "[(0, 0.06320839), (1, 0.80878526), (2, 0.06274223), (3, 0.065264106)]\n"
] ]
} }
], ],
@ -464,14 +446,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26, "execution_count": 80,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"\n" "0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"\n"
] ]
} }
], ],
@ -482,15 +464,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": 81,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[(0, 0.110989906), (1, 0.670005), (2, 0.11422917), (3, 0.10477593)]\n", "[(0, 0.10564032), (1, 0.67894983), (2, 0.104482815), (3, 0.11092702)]\n",
"0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"\n" "0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"\n"
] ]
} }
], ],
@ -510,7 +492,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 28, "execution_count": 82,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -526,23 +508,23 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 29, "execution_count": 83,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"[(0,\n", "[(0,\n",
" '0.769*\"god\" + 0.345*\"jesus\" + 0.235*\"bible\" + 0.203*\"christian\" + 0.149*\"christians\" + 0.108*\"christ\" + 0.089*\"well\" + 0.085*\"koresh\" + 0.081*\"kent\" + 0.080*\"christianity\"'),\n", " '0.769*\"god\" + 0.346*\"jesus\" + 0.235*\"bible\" + 0.204*\"christian\" + 0.148*\"christians\" + 0.107*\"christ\" + 0.090*\"well\" + 0.085*\"koresh\" + 0.081*\"kent\" + 0.080*\"christianity\"'),\n",
" (1,\n", " (1,\n",
" '-0.863*\"thanks\" + -0.255*\"please\" + -0.160*\"hello\" + -0.153*\"hi\" + 0.123*\"god\" + -0.112*\"sorry\" + -0.088*\"could\" + -0.075*\"windows\" + -0.068*\"jpeg\" + -0.062*\"gif\"'),\n", " '-0.863*\"thanks\" + -0.255*\"please\" + -0.159*\"hello\" + -0.152*\"hi\" + 0.124*\"god\" + -0.111*\"sorry\" + -0.088*\"could\" + -0.074*\"windows\" + -0.067*\"jpeg\" + -0.063*\"gif\"'),\n",
" (2,\n", " (2,\n",
" '-0.779*\"well\" + 0.229*\"god\" + -0.164*\"yes\" + 0.153*\"thanks\" + -0.135*\"ico\" + -0.135*\"tek\" + -0.132*\"beauchaine\" + -0.132*\"queens\" + -0.132*\"bronx\" + -0.131*\"manhattan\"'),\n", " '-0.780*\"well\" + 0.229*\"god\" + -0.165*\"yes\" + 0.154*\"thanks\" + -0.133*\"ico\" + -0.133*\"tek\" + -0.130*\"queens\" + -0.130*\"bronx\" + -0.130*\"beauchaine\" + -0.130*\"manhattan\"'),\n",
" (3,\n", " (3,\n",
" '0.343*\"well\" + -0.335*\"ico\" + -0.334*\"tek\" + -0.328*\"bronx\" + -0.328*\"beauchaine\" + -0.328*\"queens\" + -0.325*\"manhattan\" + -0.305*\"com\" + -0.303*\"bob\" + -0.073*\"god\"')]" " '-0.338*\"well\" + 0.336*\"ico\" + 0.334*\"tek\" + 0.328*\"bronx\" + 0.328*\"beauchaine\" + 0.328*\"queens\" + 0.326*\"manhattan\" + 0.305*\"com\" + 0.305*\"bob\" + 0.072*\"god\"')]"
] ]
}, },
"execution_count": 29, "execution_count": 83,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -554,7 +536,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 30, "execution_count": 84,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -603,6 +585,15 @@
} }
], ],
"metadata": { "metadata": {
"datacleaner": {
"position": {
"top": "50px"
},
"python": {
"varRefreshCmd": "try:\n print(_datacleaner.dataframe_metadata())\nexcept:\n print([])"
},
"window_display": false
},
"kernelspec": { "kernelspec": {
"display_name": "Python 3", "display_name": "Python 3",
"language": "python", "language": "python",
@ -618,7 +609,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.7.1" "version": "3.8.8"
}, },
"latex_envs": { "latex_envs": {
"LaTeX_envs_menu_present": true, "LaTeX_envs_menu_present": true,