1
0
mirror of https://github.com/gsi-upm/sitc synced 2024-11-21 22:12:30 +00:00

Cambiado nombre diccionario

This commit is contained in:
Carlos A. Iglesias 2019-04-23 10:39:56 +02:00
parent e42299ac7a
commit 2c8238f1f2

View File

@ -76,9 +76,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"(2034, 2807)"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"from sklearn.datasets import fetch_20newsgroups\n", "from sklearn.datasets import fetch_20newsgroups\n",
"\n", "\n",
@ -120,7 +131,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -148,7 +159,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -162,9 +173,27 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" '0.007*\"car\" + 0.006*\"increased\" + 0.006*\"closely\" + 0.006*\"groups\" + 0.006*\"center\" + 0.006*\"88\" + 0.006*\"offer\" + 0.005*\"archie\" + 0.005*\"beginning\" + 0.005*\"comets\"'),\n",
" (1,\n",
" '0.005*\"allow\" + 0.005*\"discuss\" + 0.005*\"condition\" + 0.004*\"certain\" + 0.004*\"member\" + 0.004*\"manipulation\" + 0.004*\"little\" + 0.003*\"proposal\" + 0.003*\"heavily\" + 0.003*\"obvious\"'),\n",
" (2,\n",
" '0.002*\"led\" + 0.002*\"mechanism\" + 0.002*\"frank\" + 0.002*\"platform\" + 0.002*\"mormons\" + 0.002*\"concepts\" + 0.002*\"proton\" + 0.002*\"aeronautics\" + 0.002*\"header\" + 0.002*\"foreign\"'),\n",
" (3,\n",
" '0.004*\"objects\" + 0.003*\"activity\" + 0.003*\"manhattan\" + 0.003*\"obtained\" + 0.003*\"eyes\" + 0.003*\"education\" + 0.003*\"netters\" + 0.003*\"complex\" + 0.003*\"europe\" + 0.002*\"missions\"')]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"# check the topics\n", "# check the topics\n",
"lda.print_topics(4)" "lda.print_topics(4)"
@ -179,7 +208,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -211,9 +240,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary(10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...)\n"
]
}
],
"source": [ "source": [
"# You can save the dictionary\n", "# You can save the dictionary\n",
"dictionary.save('newsgroup.dict')\n", "dictionary.save('newsgroup.dict')\n",
@ -223,7 +260,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -234,7 +271,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -246,21 +283,38 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 15,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:random_state not set so using default value\n",
"WARNING:root:failed to load state from newsgroups.dict.state: [Errno 2] No such file or directory: 'newsgroups.dict.state'\n"
]
}
],
"source": [ "source": [
"# You can optionally save the dictionary \n", "# You can optionally save the dictionary \n",
"\n", "\n",
"dictionary.save('newsgroups.dict')\n", "dictionary.save('newsgroups.dict')\n",
"lda = LdaModel.load('newsgroups.lda')" "lda = LdaModel.load('newsgroups.dict')"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 16,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary(10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...)\n"
]
}
],
"source": [ "source": [
"# We can print the dictionary, it is a mappying of id and tokens\n", "# We can print the dictionary, it is a mappying of id and tokens\n",
"\n", "\n",
@ -269,7 +323,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 17,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -279,7 +333,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 18,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -292,9 +346,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 19,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.24093628445650234), (1, 0.5700978153855775), (2, 0.10438175896914427), (3, 0.1598114653031772), (4, 0.722808853369507), (5, 0.24093628445650234)]\n"
]
}
],
"source": [ "source": [
"#print tf-idf of first document\n", "#print tf-idf of first document\n",
"print(corpus_tfidf[0])" "print(corpus_tfidf[0])"
@ -302,7 +364,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 20,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -315,9 +377,27 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 21,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" '0.011*\"thanks\" + 0.010*\"targa\" + 0.008*\"mary\" + 0.008*\"western\" + 0.007*\"craig\" + 0.007*\"jeff\" + 0.006*\"yayayay\" + 0.006*\"phobos\" + 0.005*\"unfortunately\" + 0.005*\"martian\"'),\n",
" (1,\n",
" '0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"'),\n",
" (2,\n",
" '0.009*\"whatever\" + 0.009*\"baptist\" + 0.007*\"cheers\" + 0.007*\"kent\" + 0.006*\"khomeini\" + 0.006*\"davidian\" + 0.005*\"gerald\" + 0.005*\"bull\" + 0.005*\"sorry\" + 0.005*\"jesus\"'),\n",
" (3,\n",
" '0.005*\"pd\" + 0.004*\"baltimore\" + 0.004*\"also\" + 0.003*\"ipx\" + 0.003*\"dam\" + 0.003*\"feiner\" + 0.003*\"foley\" + 0.003*\"ideally\" + 0.003*\"srgp\" + 0.003*\"thank\"')]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"# check the topics\n", "# check the topics\n",
"lda_model.print_topics(4)" "lda_model.print_topics(4)"
@ -325,9 +405,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 22,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.09401487), (1, 0.08991001), (2, 0.08514047), (3, 0.7309346)]\n"
]
}
],
"source": [ "source": [
"# check the lsa vector for the first document\n", "# check the lsa vector for the first document\n",
"corpus_lda = lda_model[corpus_tfidf]\n", "corpus_lda = lda_model[corpus_tfidf]\n",
@ -336,9 +424,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 24,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('lord', 1), ('god', 2)]\n"
]
}
],
"source": [ "source": [
"#predict topics of a new doc\n", "#predict topics of a new doc\n",
"new_doc = \"God is love and God is the Lord\"\n", "new_doc = \"God is love and God is the Lord\"\n",
@ -349,9 +445,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 25,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.06678458), (1, 0.8006135), (2, 0.06974816), (3, 0.062853776)]\n"
]
}
],
"source": [ "source": [
"#transform into LDA space\n", "#transform into LDA space\n",
"lda_vector = lda_model[bow_vector]\n", "lda_vector = lda_model[bow_vector]\n",
@ -360,9 +464,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 26,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"\n"
]
}
],
"source": [ "source": [
"# print the document's single most prominent LDA topic\n", "# print the document's single most prominent LDA topic\n",
"print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))" "print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))"
@ -370,9 +482,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 27,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.110989906), (1, 0.670005), (2, 0.11422917), (3, 0.10477593)]\n",
"0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"\n"
]
}
],
"source": [ "source": [
"lda_vector_tfidf = lda_model[tfidf_model[bow_vector]]\n", "lda_vector_tfidf = lda_model[tfidf_model[bow_vector]]\n",
"print(lda_vector_tfidf)\n", "print(lda_vector_tfidf)\n",
@ -389,7 +510,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 28,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -405,9 +526,27 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 29,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" '0.769*\"god\" + 0.345*\"jesus\" + 0.235*\"bible\" + 0.203*\"christian\" + 0.149*\"christians\" + 0.108*\"christ\" + 0.089*\"well\" + 0.085*\"koresh\" + 0.081*\"kent\" + 0.080*\"christianity\"'),\n",
" (1,\n",
" '-0.863*\"thanks\" + -0.255*\"please\" + -0.160*\"hello\" + -0.153*\"hi\" + 0.123*\"god\" + -0.112*\"sorry\" + -0.088*\"could\" + -0.075*\"windows\" + -0.068*\"jpeg\" + -0.062*\"gif\"'),\n",
" (2,\n",
" '-0.779*\"well\" + 0.229*\"god\" + -0.164*\"yes\" + 0.153*\"thanks\" + -0.135*\"ico\" + -0.135*\"tek\" + -0.132*\"beauchaine\" + -0.132*\"queens\" + -0.132*\"bronx\" + -0.131*\"manhattan\"'),\n",
" (3,\n",
" '0.343*\"well\" + -0.335*\"ico\" + -0.334*\"tek\" + -0.328*\"bronx\" + -0.328*\"beauchaine\" + -0.328*\"queens\" + -0.325*\"manhattan\" + -0.305*\"com\" + -0.303*\"bob\" + -0.073*\"god\"')]"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"# check the topics\n", "# check the topics\n",
"lsi_model.print_topics(4)" "lsi_model.print_topics(4)"
@ -415,9 +554,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 30,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.24093628445650234), (1, 0.5700978153855775), (2, 0.10438175896914427), (3, 0.1598114653031772), (4, 0.722808853369507), (5, 0.24093628445650234)]\n"
]
}
],
"source": [ "source": [
"# check the lsi vector for the first document\n", "# check the lsi vector for the first document\n",
"print(corpus_tfidf[0])" "print(corpus_tfidf[0])"