1
0
mirror of https://github.com/gsi-upm/sitc synced 2025-08-24 02:22:21 +00:00

Included installation of nltk

This commit is contained in:
cif2cif
2017-04-20 12:56:39 +02:00
parent cb40531dc4
commit e88e144a50
5 changed files with 140 additions and 987 deletions

View File

@@ -76,22 +76,11 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(2034, 2807)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"\n",
@@ -133,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {
"collapsed": false
},
@@ -163,7 +152,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"metadata": {
"collapsed": false
},
@@ -179,29 +168,11 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" '0.004*objects + 0.004*obtained + 0.003*comets + 0.003*manhattan + 0.003*member + 0.003*beginning + 0.003*center + 0.003*groups + 0.003*aware + 0.003*increased'),\n",
" (1,\n",
" '0.003*activity + 0.002*objects + 0.002*professional + 0.002*eyes + 0.002*manhattan + 0.002*pressure + 0.002*netters + 0.002*chosen + 0.002*attempted + 0.002*medical'),\n",
" (2,\n",
" '0.003*mechanism + 0.003*led + 0.003*platform + 0.003*frank + 0.003*mormons + 0.003*aeronautics + 0.002*concepts + 0.002*header + 0.002*forces + 0.002*profit'),\n",
" (3,\n",
" '0.005*diameter + 0.005*having + 0.004*complex + 0.004*conclusions + 0.004*activity + 0.004*looking + 0.004*action + 0.004*inflatable + 0.004*defined + 0.004*association')]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# check the topics\n",
"lda.print_topics(4)"
@@ -216,7 +187,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {
"collapsed": false
},
@@ -250,19 +221,11 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary(10913 unique tokens: ['whose', 'used', 'hoc', 'transfinite', 'newtek']...)\n"
]
}
],
"outputs": [],
"source": [
"# You can save the dictionary\n",
"dictionary.save('newsgroup.dict')\n",
@@ -272,7 +235,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {
"collapsed": false
},
@@ -285,7 +248,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {
"collapsed": false
},
@@ -299,7 +262,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {
"collapsed": true
},
@@ -313,19 +276,11 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary(10913 unique tokens: ['whose', 'used', 'hoc', 'transfinite', 'newtek']...)\n"
]
}
],
"outputs": [],
"source": [
"# We can print the dictionary; it is a mapping between ids and tokens\n",
"\n",
@@ -334,7 +289,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {
"collapsed": true
},
@@ -346,7 +301,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {
"collapsed": false
},
@@ -361,19 +316,11 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.1598114653031772), (1, 0.10438175896914427), (2, 0.5700978153855775), (3, 0.24093628445650234), (4, 0.722808853369507), (5, 0.24093628445650234)]\n"
]
}
],
"outputs": [],
"source": [
"#print tf-idf of first document\n",
"print(corpus_tfidf[0])"
@@ -381,7 +328,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {
"collapsed": false
},
@@ -396,29 +343,11 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" '0.010*targa + 0.007*ns + 0.006*thanks + 0.006*davidian + 0.006*ssrt + 0.006*yayayay + 0.005*craig + 0.005*bull + 0.005*gerald + 0.005*sorry'),\n",
" (1,\n",
" '0.011*god + 0.010*mary + 0.008*baptist + 0.008*islam + 0.006*zoroastrians + 0.006*joseph + 0.006*lucky + 0.006*khomeini + 0.006*samaritan + 0.005*crusades'),\n",
" (2,\n",
" '0.007*koresh + 0.007*moon + 0.007*western + 0.006*plane + 0.006*jeff + 0.006*unix + 0.005*bible + 0.005*also + 0.005*basically + 0.005*bob'),\n",
" (3,\n",
" '0.011*whatever + 0.008*joy + 0.007*happy + 0.006*virtual + 0.006*reality + 0.004*really + 0.003*samuel___ + 0.003*oh + 0.003*virtually + 0.003*toaster')]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# check the topics\n",
"lda_model.print_topics(4)"
@@ -426,19 +355,11 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.085176135689180726), (1, 0.6919655173835938), (2, 0.1377903468164027), (3, 0.0850680001108228)]\n"
]
}
],
"outputs": [],
"source": [
"# check the LDA vector for the first document\n",
"corpus_lda = lda_model[corpus_tfidf]\n",
@@ -447,19 +368,11 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('lord', 1), ('god', 2)]\n"
]
}
],
"outputs": [],
"source": [
"#predict topics of a new doc\n",
"new_doc = \"God is love and God is the Lord\"\n",
@@ -470,19 +383,11 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.062509420435514051), (1, 0.81246608790618835), (2, 0.062508281488992554), (3, 0.062516210169305114)]\n"
]
}
],
"outputs": [],
"source": [
"#transform into LDA space\n",
"lda_vector = lda_model[bow_vector]\n",
@@ -491,19 +396,11 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.011*god + 0.010*mary + 0.008*baptist + 0.008*islam + 0.006*zoroastrians + 0.006*joseph + 0.006*lucky + 0.006*khomeini + 0.006*samaritan + 0.005*crusades\n"
]
}
],
"outputs": [],
"source": [
"# print the document's single most prominent LDA topic\n",
"print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))"
@@ -511,20 +408,11 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.10392179866025079), (1, 0.68822094221870811), (2, 0.10391916429993264), (3, 0.10393809482110833)]\n",
"0.011*god + 0.010*mary + 0.008*baptist + 0.008*islam + 0.006*zoroastrians + 0.006*joseph + 0.006*lucky + 0.006*khomeini + 0.006*samaritan + 0.005*crusades\n"
]
}
],
"outputs": [],
"source": [
"lda_vector_tfidf = lda_model[tfidf_model[bow_vector]]\n",
"print(lda_vector_tfidf)\n",
@@ -541,7 +429,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"metadata": {
"collapsed": false
},
@@ -559,29 +447,11 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" '0.769*\"god\" + 0.346*\"jesus\" + 0.235*\"bible\" + 0.204*\"christian\" + 0.149*\"christians\" + 0.107*\"christ\" + 0.090*\"well\" + 0.085*\"koresh\" + 0.081*\"kent\" + 0.080*\"christianity\"'),\n",
" (1,\n",
" '-0.863*\"thanks\" + -0.255*\"please\" + -0.159*\"hello\" + -0.153*\"hi\" + 0.123*\"god\" + -0.112*\"sorry\" + -0.087*\"could\" + -0.074*\"windows\" + -0.067*\"jpeg\" + -0.063*\"vga\"'),\n",
" (2,\n",
" '0.780*\"well\" + -0.229*\"god\" + 0.165*\"yes\" + -0.153*\"thanks\" + 0.133*\"ico\" + 0.133*\"tek\" + 0.130*\"bronx\" + 0.130*\"beauchaine\" + 0.130*\"queens\" + 0.129*\"manhattan\"'),\n",
" (3,\n",
" '0.340*\"well\" + -0.335*\"ico\" + -0.334*\"tek\" + -0.328*\"beauchaine\" + -0.328*\"bronx\" + -0.328*\"queens\" + -0.326*\"manhattan\" + -0.305*\"bob\" + -0.305*\"com\" + -0.072*\"god\"')]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# check the topics\n",
"lsi_model.print_topics(4)"
@@ -589,19 +459,11 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.1598114653031772), (1, 0.10438175896914427), (2, 0.5700978153855775), (3, 0.24093628445650234), (4, 0.722808853369507), (5, 0.24093628445650234)]\n"
]
}
],
"outputs": [],
"source": [
"# check the lsi vector for the first document\n",
"print(corpus_tfidf[0])"
@@ -655,7 +517,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
"version": "3.5.2"
}
},
"nbformat": 4,