diff --git a/nlp/4_5_Semantic_Models.ipynb b/nlp/4_5_Semantic_Models.ipynb index e89e2dc..cef544f 100644 --- a/nlp/4_5_Semantic_Models.ipynb +++ b/nlp/4_5_Semantic_Models.ipynb @@ -76,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -85,7 +85,7 @@ "(2034, 2807)" ] }, - "execution_count": 1, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -126,12 +126,15 @@ "source": [ "Although scikit-learn provides an LDA implementation, it is more popular the package *gensim*, which also provides an LSI implementation, as well as other functionalities. Fortunately, scikit-learn sparse matrices can be used in Gensim using the function *matutils.Sparse2Corpus()*. Anyway, if you are using intensively LDA,it can be convenient to create the corpus with their functions.\n", "\n", - "You should install first *gensim*. Run 'conda install -c anaconda gensim=0.12.4' in a terminal." + "You should install first:\n", + "\n", + "* *gensim*. Run 'conda install gensim' in a terminal.\n", + "* *python-Levenshtein*. Run 'conda install python-Levenshtein' in a terminal" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -159,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -173,23 +176,23 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0,\n", - " '0.007*\"car\" + 0.006*\"increased\" + 0.006*\"closely\" + 0.006*\"groups\" + 0.006*\"center\" + 0.006*\"88\" + 0.006*\"offer\" + 0.005*\"archie\" + 0.005*\"beginning\" + 0.005*\"comets\"'),\n", + " '0.011*\"baptist\" + 0.010*\"koresh\" + 0.009*\"bible\" + 0.006*\"reality\" + 0.006*\"virtual\" + 0.005*\"scarlet\" + 0.005*\"shag\" + 0.004*\"tootsie\" + 0.004*\"kinda\" + 0.004*\"captain\"'),\n", " (1,\n", - " '0.005*\"allow\" + 0.005*\"discuss\" + 0.005*\"condition\" + 0.004*\"certain\" + 0.004*\"member\" + 0.004*\"manipulation\" + 0.004*\"little\" + 0.003*\"proposal\" + 0.003*\"heavily\" + 0.003*\"obvious\"'),\n", + " '0.010*\"targa\" + 0.008*\"thanks\" + 0.008*\"moon\" + 0.007*\"craig\" + 0.007*\"zoroastrians\" + 0.006*\"yayayay\" + 0.005*\"unfortunately\" + 0.005*\"windows\" + 0.005*\"rayshade\" + 0.004*\"tdb\"'),\n", " (2,\n", - " '0.002*\"led\" + 0.002*\"mechanism\" + 0.002*\"frank\" + 0.002*\"platform\" + 0.002*\"mormons\" + 0.002*\"concepts\" + 0.002*\"proton\" + 0.002*\"aeronautics\" + 0.002*\"header\" + 0.002*\"foreign\"'),\n", + " '0.009*\"mary\" + 0.007*\"whatever\" + 0.006*\"god\" + 0.005*\"ns\" + 0.005*\"lucky\" + 0.005*\"joseph\" + 0.005*\"ssrt\" + 0.005*\"samaritan\" + 0.005*\"crusades\" + 0.004*\"phobos\"'),\n", " (3,\n", - " '0.004*\"objects\" + 0.003*\"activity\" + 0.003*\"manhattan\" + 0.003*\"obtained\" + 0.003*\"eyes\" + 0.003*\"education\" + 0.003*\"netters\" + 0.003*\"complex\" + 0.003*\"europe\" + 0.002*\"missions\"')]" + " '0.009*\"islam\" + 0.008*\"western\" + 0.008*\"plane\" + 0.008*\"jeff\" + 0.007*\"cheers\" + 0.007*\"kent\" + 0.007*\"joy\" + 0.007*\"khomeini\" + 0.007*\"davidian\" + 0.006*\"basically\"')]" ] }, - "execution_count": 4, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } @@ -208,7 +211,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -240,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 63, "metadata": {}, "outputs": [ { @@ -253,14 +256,14 @@ ], "source": [ "# You can save the dictionary\n", - "dictionary.save('newsgroup.dict')\n", + "dictionary.save('newsgroup.dict.texts')\n", "\n", "print(dictionary)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -271,7 +274,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -283,28 +286,7 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:random_state not set so using default value\n", - "WARNING:root:failed to load state from newsgroups.dict.state: [Errno 2] No such file or directory: 'newsgroups.dict.state'\n" - ] - } - ], - "source": [ - "# You can optionally save the dictionary \n", - "\n", - "dictionary.save('newsgroups.dict')\n", - "lda = LdaModel.load('newsgroups.dict')" - ] - }, - { - "cell_type": "code", - "execution_count": 16, + "execution_count": 71, "metadata": {}, "outputs": [ { @@ -323,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -333,7 +315,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -346,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 74, "metadata": {}, "outputs": [ { @@ -364,7 +346,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -377,23 +359,23 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0,\n", - " '0.011*\"thanks\" + 0.010*\"targa\" + 0.008*\"mary\" + 0.008*\"western\" + 0.007*\"craig\" + 0.007*\"jeff\" + 0.006*\"yayayay\" + 0.006*\"phobos\" + 0.005*\"unfortunately\" + 0.005*\"martian\"'),\n", + " '0.009*\"whatever\" + 0.007*\"plane\" + 0.007*\"ns\" + 0.007*\"joy\" + 0.006*\"happy\" + 0.005*\"bob\" + 0.004*\"phil\" + 0.004*\"nasa\" + 0.003*\"purdue\" + 0.003*\"neie\"'),\n", " (1,\n", - " '0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"'),\n", + " '0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"'),\n", " (2,\n", - " '0.009*\"whatever\" + 0.009*\"baptist\" + 0.007*\"cheers\" + 0.007*\"kent\" + 0.006*\"khomeini\" + 0.006*\"davidian\" + 0.005*\"gerald\" + 0.005*\"bull\" + 0.005*\"sorry\" + 0.005*\"jesus\"'),\n", + " '0.010*\"moon\" + 0.007*\"phobos\" + 0.006*\"unfortunately\" + 0.006*\"martian\" + 0.006*\"russian\" + 0.005*\"rayshade\" + 0.005*\"anybody\" + 0.005*\"perturbations\" + 0.005*\"thanks\" + 0.004*\"apollo\"'),\n", " (3,\n", - " '0.005*\"pd\" + 0.004*\"baltimore\" + 0.004*\"also\" + 0.003*\"ipx\" + 0.003*\"dam\" + 0.003*\"feiner\" + 0.003*\"foley\" + 0.003*\"ideally\" + 0.003*\"srgp\" + 0.003*\"thank\"')]" + " '0.008*\"islam\" + 0.008*\"western\" + 0.007*\"jeff\" + 0.007*\"zoroastrians\" + 0.006*\"davidian\" + 0.006*\"basically\" + 0.005*\"bull\" + 0.005*\"gerald\" + 0.005*\"sorry\" + 0.004*\"kent\"')]" ] }, - "execution_count": 21, + "execution_count": 76, "metadata": {}, "output_type": "execute_result" } @@ -405,14 +387,14 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 77, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[(0, 0.09401487), (1, 0.08991001), (2, 0.08514047), (3, 0.7309346)]\n" + "[(0, 0.7154438), (1, 0.10569019), (2, 0.09522807), (3, 0.08363795)]\n" ] } ], @@ -424,7 +406,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 78, "metadata": {}, "outputs": [ { @@ -445,14 +427,14 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 79, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[(0, 0.06678458), (1, 0.8006135), (2, 0.06974816), (3, 0.062853776)]\n" + "[(0, 0.06320839), (1, 0.80878526), (2, 0.06274223), (3, 0.065264106)]\n" ] } ], @@ -464,14 +446,14 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"\n" + "0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"\n" ] } ], @@ -482,15 +464,15 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 81, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[(0, 0.110989906), (1, 0.670005), (2, 0.11422917), (3, 0.10477593)]\n", - "0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"\n" + "[(0, 0.10564032), (1, 0.67894983), (2, 0.104482815), (3, 0.11092702)]\n", + "0.009*\"god\" + 0.008*\"mary\" + 0.008*\"targa\" + 0.007*\"baptist\" + 0.007*\"thanks\" + 0.007*\"koresh\" + 0.006*\"really\" + 0.006*\"bible\" + 0.005*\"lot\" + 0.005*\"lucky\"\n" ] } ], @@ -510,7 +492,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -526,23 +508,23 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 83, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0,\n", - " '0.769*\"god\" + 0.345*\"jesus\" + 0.235*\"bible\" + 0.203*\"christian\" + 0.149*\"christians\" + 0.108*\"christ\" + 0.089*\"well\" + 0.085*\"koresh\" + 0.081*\"kent\" + 0.080*\"christianity\"'),\n", + " '0.769*\"god\" + 0.346*\"jesus\" + 0.235*\"bible\" + 0.204*\"christian\" + 0.148*\"christians\" + 0.107*\"christ\" + 0.090*\"well\" + 0.085*\"koresh\" + 0.081*\"kent\" + 0.080*\"christianity\"'),\n", " (1,\n", - " '-0.863*\"thanks\" + -0.255*\"please\" + -0.160*\"hello\" + -0.153*\"hi\" + 0.123*\"god\" + -0.112*\"sorry\" + -0.088*\"could\" + -0.075*\"windows\" + -0.068*\"jpeg\" + -0.062*\"gif\"'),\n", + " '-0.863*\"thanks\" + -0.255*\"please\" + -0.159*\"hello\" + -0.152*\"hi\" + 0.124*\"god\" + -0.111*\"sorry\" + -0.088*\"could\" + -0.074*\"windows\" + -0.067*\"jpeg\" + -0.063*\"gif\"'),\n", " (2,\n", - " '-0.779*\"well\" + 0.229*\"god\" + -0.164*\"yes\" + 0.153*\"thanks\" + -0.135*\"ico\" + -0.135*\"tek\" + -0.132*\"beauchaine\" + -0.132*\"queens\" + -0.132*\"bronx\" + -0.131*\"manhattan\"'),\n", + " '-0.780*\"well\" + 0.229*\"god\" + -0.165*\"yes\" + 0.154*\"thanks\" + -0.133*\"ico\" + -0.133*\"tek\" + -0.130*\"queens\" + -0.130*\"bronx\" + -0.130*\"beauchaine\" + -0.130*\"manhattan\"'),\n", " (3,\n", - " '0.343*\"well\" + -0.335*\"ico\" + -0.334*\"tek\" + -0.328*\"bronx\" + -0.328*\"beauchaine\" + -0.328*\"queens\" + -0.325*\"manhattan\" + -0.305*\"com\" + -0.303*\"bob\" + -0.073*\"god\"')]" + " '-0.338*\"well\" + 0.336*\"ico\" + 0.334*\"tek\" + 0.328*\"bronx\" + 0.328*\"beauchaine\" + 0.328*\"queens\" + 0.326*\"manhattan\" + 0.305*\"com\" + 0.305*\"bob\" + 0.072*\"god\"')]" ] }, - "execution_count": 29, + "execution_count": 83, "metadata": {}, "output_type": "execute_result" } @@ -554,7 +536,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 84, "metadata": {}, "outputs": [ { @@ -603,6 +585,15 @@ } ], "metadata": { + "datacleaner": { + "position": { + "top": "50px" + }, + "python": { + "varRefreshCmd": "try:\n print(_datacleaner.dataframe_metadata())\nexcept:\n print([])" + }, + "window_display": false + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -618,7 +609,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.8.8" }, "latex_envs": { "LaTeX_envs_menu_present": true,