mirror of
https://github.com/gsi-upm/sitc
synced 2025-08-24 02:22:21 +00:00
added import nltk
This commit is contained in:
@@ -74,11 +74,19 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.datasets import fetch_20newsgroups\n",
|
||||
"\n",
|
||||
@@ -92,11 +100,19 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"20\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#Number of categories\n",
|
||||
"print(len(newsgroups_train.target_names))"
|
||||
@@ -104,11 +120,28 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Category id 4 comp.sys.mac.hardware\n",
|
||||
"Doc A fair number of brave souls who upgraded their SI clock oscillator have\n",
|
||||
"shared their experiences for this poll. Please send a brief message detailing\n",
|
||||
"your experiences with the procedure. Top speed attained, CPU rated speed,\n",
|
||||
"add on cards and adapters, heat sinks, hour of usage per day, floppy disk\n",
|
||||
"functionality with 800 and 1.4 m floppies are especially requested.\n",
|
||||
"\n",
|
||||
"I will be summarizing in the next two days, so please add to the network\n",
|
||||
"knowledge base if you have done the clock upgrade and haven't answered this\n",
|
||||
"poll. Thanks.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Show a document\n",
|
||||
"docid = 1\n",
|
||||
@@ -121,11 +154,22 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(11314,)"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#Number of files\n",
|
||||
"newsgroups_train.filenames.shape"
|
||||
@@ -133,11 +177,30 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n",
|
||||
" VisibleDeprecationWarning)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(11314, 101323)"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Obtain a vector\n",
|
||||
"\n",
|
||||
@@ -151,11 +214,22 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"66.80510871486653"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# The tf-idf vectors are very sparse with an average of 66 non zero components in 101.323 dimensions (.06%)\n",
|
||||
"vectors_train.nnz / float(vectors_train.shape[0])"
|
||||
@@ -177,11 +251,30 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n",
|
||||
" VisibleDeprecationWarning)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.69545360719001303"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.naive_bayes import MultinomialNB\n",
|
||||
"\n",
|
||||
@@ -209,11 +302,20 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"dimensionality: 101323\n",
|
||||
"density: 1.000000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.utils.extmath import density\n",
|
||||
"\n",
|
||||
@@ -223,11 +325,38 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"alt.atheism: islam atheists say just religion atheism think don people god\n",
|
||||
"comp.graphics: looking format 3d know program file files thanks image graphics\n",
|
||||
"comp.os.ms-windows.misc: card problem thanks driver drivers use files dos file windows\n",
|
||||
"comp.sys.ibm.pc.hardware: monitor disk thanks pc ide controller bus card scsi drive\n",
|
||||
"comp.sys.mac.hardware: know monitor does quadra simms thanks problem drive apple mac\n",
|
||||
"comp.windows.x: using windows x11r5 use application thanks widget server motif window\n",
|
||||
"misc.forsale: asking email sell price condition new shipping offer 00 sale\n",
|
||||
"rec.autos: don ford new good dealer just engine like cars car\n",
|
||||
"rec.motorcycles: don just helmet riding like motorcycle ride bikes dod bike\n",
|
||||
"rec.sport.baseball: braves players pitching hit runs games game baseball team year\n",
|
||||
"rec.sport.hockey: league year nhl games season players play hockey team game\n",
|
||||
"sci.crypt: people use escrow nsa keys government chip clipper encryption key\n",
|
||||
"sci.electronics: don thanks voltage used know does like circuit power use\n",
|
||||
"sci.med: skepticism cadre dsl banks chastity n3jxp pitt gordon geb msg\n",
|
||||
"sci.space: just lunar earth shuttle like moon launch orbit nasa space\n",
|
||||
"soc.religion.christian: believe faith christian christ bible people christians church jesus god\n",
|
||||
"talk.politics.guns: just law firearms government fbi don weapons people guns gun\n",
|
||||
"talk.politics.mideast: said arabs arab turkish people armenians armenian jews israeli israel\n",
|
||||
"talk.politics.misc: know state clinton president just think tax don government people\n",
|
||||
"talk.religion.misc: think don koresh objective christians bible people christian jesus god\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# We can review the top features per topic in Bayes (attribute coef_)\n",
|
||||
"import numpy as np\n",
|
||||
@@ -244,11 +373,28 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 10,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[ 2 15]\n",
|
||||
"['comp.os.ms-windows.misc', 'soc.religion.christian']\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n",
|
||||
" VisibleDeprecationWarning)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# We try the classifier in two new docs\n",
|
||||
"\n",
|
||||
|
Reference in New Issue
Block a user