mirror of
https://github.com/gsi-upm/sitc
synced 2024-11-24 23:42:29 +00:00
added import nltk
This commit is contained in:
parent
c55a1c077b
commit
b24f866056
@ -102,7 +102,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1+"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -76,7 +76,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 1,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": true
|
"collapsed": true
|
||||||
},
|
},
|
||||||
@ -145,7 +145,6 @@
|
|||||||
"plt.hist(iris.target, bins=10)\n",
|
"plt.hist(iris.target, bins=10)\n",
|
||||||
"plt.xlabel('Number of species')\n",
|
"plt.xlabel('Number of species')\n",
|
||||||
"plt.ylabel('iris class')\n"
|
"plt.ylabel('iris class')\n"
|
||||||
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -379,7 +378,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1+"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -3838,7 +3838,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -924,7 +924,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1+"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -5403,7 +5403,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1+"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -4763,7 +4763,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.4.3+"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -114,7 +114,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1+"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -1170,7 +1170,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1+"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -162,157 +162,13 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 29,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"$: dollar\n",
|
|
||||||
" $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$\n",
|
|
||||||
"'': closing quotation mark\n",
|
|
||||||
" ' ''\n",
|
|
||||||
"(: opening parenthesis\n",
|
|
||||||
" ( [ {\n",
|
|
||||||
"): closing parenthesis\n",
|
|
||||||
" ) ] }\n",
|
|
||||||
",: comma\n",
|
|
||||||
" ,\n",
|
|
||||||
"--: dash\n",
|
|
||||||
" --\n",
|
|
||||||
".: sentence terminator\n",
|
|
||||||
" . ! ?\n",
|
|
||||||
":: colon or ellipsis\n",
|
|
||||||
" : ; ...\n",
|
|
||||||
"CC: conjunction, coordinating\n",
|
|
||||||
" & 'n and both but either et for less minus neither nor or plus so\n",
|
|
||||||
" therefore times v. versus vs. whether yet\n",
|
|
||||||
"CD: numeral, cardinal\n",
|
|
||||||
" mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-\n",
|
|
||||||
" seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025\n",
|
|
||||||
" fifteen 271,124 dozen quintillion DM2,000 ...\n",
|
|
||||||
"DT: determiner\n",
|
|
||||||
" all an another any both del each either every half la many much nary\n",
|
|
||||||
" neither no some such that the them these this those\n",
|
|
||||||
"EX: existential there\n",
|
|
||||||
" there\n",
|
|
||||||
"FW: foreign word\n",
|
|
||||||
" gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous\n",
|
|
||||||
" lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte\n",
|
|
||||||
" terram fiche oui corporis ...\n",
|
|
||||||
"IN: preposition or conjunction, subordinating\n",
|
|
||||||
" astride among uppon whether out inside pro despite on by throughout\n",
|
|
||||||
" below within for towards near behind atop around if like until below\n",
|
|
||||||
" next into if beside ...\n",
|
|
||||||
"JJ: adjective or numeral, ordinal\n",
|
|
||||||
" third ill-mannered pre-war regrettable oiled calamitous first separable\n",
|
|
||||||
" ectoplasmic battery-powered participatory fourth still-to-be-named\n",
|
|
||||||
" multilingual multi-disciplinary ...\n",
|
|
||||||
"JJR: adjective, comparative\n",
|
|
||||||
" bleaker braver breezier briefer brighter brisker broader bumper busier\n",
|
|
||||||
" calmer cheaper choosier cleaner clearer closer colder commoner costlier\n",
|
|
||||||
" cozier creamier crunchier cuter ...\n",
|
|
||||||
"JJS: adjective, superlative\n",
|
|
||||||
" calmest cheapest choicest classiest cleanest clearest closest commonest\n",
|
|
||||||
" corniest costliest crassest creepiest crudest cutest darkest deadliest\n",
|
|
||||||
" dearest deepest densest dinkiest ...\n",
|
|
||||||
"LS: list item marker\n",
|
|
||||||
" A A. B B. C C. D E F First G H I J K One SP-44001 SP-44002 SP-44005\n",
|
|
||||||
" SP-44007 Second Third Three Two * a b c d first five four one six three\n",
|
|
||||||
" two\n",
|
|
||||||
"MD: modal auxiliary\n",
|
|
||||||
" can cannot could couldn't dare may might must need ought shall should\n",
|
|
||||||
" shouldn't will would\n",
|
|
||||||
"NN: noun, common, singular or mass\n",
|
|
||||||
" common-carrier cabbage knuckle-duster Casino afghan shed thermostat\n",
|
|
||||||
" investment slide humour falloff slick wind hyena override subhumanity\n",
|
|
||||||
" machinist ...\n",
|
|
||||||
"NNP: noun, proper, singular\n",
|
|
||||||
" Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos\n",
|
|
||||||
" Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA\n",
|
|
||||||
" Shannon A.K.C. Meltex Liverpool ...\n",
|
|
||||||
"NNPS: noun, proper, plural\n",
|
|
||||||
" Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists\n",
|
|
||||||
" Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques\n",
|
|
||||||
" Apache Apaches Apocrypha ...\n",
|
|
||||||
"NNS: noun, common, plural\n",
|
|
||||||
" undergraduates scotches bric-a-brac products bodyguards facets coasts\n",
|
|
||||||
" divestitures storehouses designs clubs fragrances averages\n",
|
|
||||||
" subjectivists apprehensions muses factory-jobs ...\n",
|
|
||||||
"PDT: pre-determiner\n",
|
|
||||||
" all both half many quite such sure this\n",
|
|
||||||
"POS: genitive marker\n",
|
|
||||||
" ' 's\n",
|
|
||||||
"PRP: pronoun, personal\n",
|
|
||||||
" hers herself him himself hisself it itself me myself one oneself ours\n",
|
|
||||||
" ourselves ownself self she thee theirs them themselves they thou thy us\n",
|
|
||||||
"PRP$: pronoun, possessive\n",
|
|
||||||
" her his mine my our ours their thy your\n",
|
|
||||||
"RB: adverb\n",
|
|
||||||
" occasionally unabatingly maddeningly adventurously professedly\n",
|
|
||||||
" stirringly prominently technologically magisterially predominately\n",
|
|
||||||
" swiftly fiscally pitilessly ...\n",
|
|
||||||
"RBR: adverb, comparative\n",
|
|
||||||
" further gloomier grander graver greater grimmer harder harsher\n",
|
|
||||||
" healthier heavier higher however larger later leaner lengthier less-\n",
|
|
||||||
" perfectly lesser lonelier longer louder lower more ...\n",
|
|
||||||
"RBS: adverb, superlative\n",
|
|
||||||
" best biggest bluntest earliest farthest first furthest hardest\n",
|
|
||||||
" heartiest highest largest least less most nearest second tightest worst\n",
|
|
||||||
"RP: particle\n",
|
|
||||||
" aboard about across along apart around aside at away back before behind\n",
|
|
||||||
" by crop down ever fast for forth from go high i.e. in into just later\n",
|
|
||||||
" low more off on open out over per pie raising start teeth that through\n",
|
|
||||||
" under unto up up-pp upon whole with you\n",
|
|
||||||
"SYM: symbol\n",
|
|
||||||
" % & ' '' ''. ) ). * + ,. < = > @ A[fj] U.S U.S.S.R * ** ***\n",
|
|
||||||
"TO: \"to\" as preposition or infinitive marker\n",
|
|
||||||
" to\n",
|
|
||||||
"UH: interjection\n",
|
|
||||||
" Goodbye Goody Gosh Wow Jeepers Jee-sus Hubba Hey Kee-reist Oops amen\n",
|
|
||||||
" huh howdy uh dammit whammo shucks heck anyways whodunnit honey golly\n",
|
|
||||||
" man baby diddle hush sonuvabitch ...\n",
|
|
||||||
"VB: verb, base form\n",
|
|
||||||
" ask assemble assess assign assume atone attention avoid bake balkanize\n",
|
|
||||||
" bank begin behold believe bend benefit bevel beware bless boil bomb\n",
|
|
||||||
" boost brace break bring broil brush build ...\n",
|
|
||||||
"VBD: verb, past tense\n",
|
|
||||||
" dipped pleaded swiped regummed soaked tidied convened halted registered\n",
|
|
||||||
" cushioned exacted snubbed strode aimed adopted belied figgered\n",
|
|
||||||
" speculated wore appreciated contemplated ...\n",
|
|
||||||
"VBG: verb, present participle or gerund\n",
|
|
||||||
" telegraphing stirring focusing angering judging stalling lactating\n",
|
|
||||||
" hankerin' alleging veering capping approaching traveling besieging\n",
|
|
||||||
" encrypting interrupting erasing wincing ...\n",
|
|
||||||
"VBN: verb, past participle\n",
|
|
||||||
" multihulled dilapidated aerosolized chaired languished panelized used\n",
|
|
||||||
" experimented flourished imitated reunifed factored condensed sheared\n",
|
|
||||||
" unsettled primed dubbed desired ...\n",
|
|
||||||
"VBP: verb, present tense, not 3rd person singular\n",
|
|
||||||
" predominate wrap resort sue twist spill cure lengthen brush terminate\n",
|
|
||||||
" appear tend stray glisten obtain comprise detest tease attract\n",
|
|
||||||
" emphasize mold postpone sever return wag ...\n",
|
|
||||||
"VBZ: verb, present tense, 3rd person singular\n",
|
|
||||||
" bases reconstructs marks mixes displeases seals carps weaves snatches\n",
|
|
||||||
" slumps stretches authorizes smolders pictures emerges stockpiles\n",
|
|
||||||
" seduces fizzes uses bolsters slaps speaks pleads ...\n",
|
|
||||||
"WDT: WH-determiner\n",
|
|
||||||
" that what whatever which whichever\n",
|
|
||||||
"WP: WH-pronoun\n",
|
|
||||||
" that what whatever whatsoever which who whom whosoever\n",
|
|
||||||
"WP$: WH-pronoun, possessive\n",
|
|
||||||
" whose\n",
|
|
||||||
"WRB: Wh-adverb\n",
|
|
||||||
" how however whence whenever where whereby whereever wherein whereof why\n",
|
|
||||||
"``: opening quotation mark\n",
|
|
||||||
" ` ``\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
|
"import nltk\n",
|
||||||
"nltk.help.upenn_tagset()"
|
"nltk.help.upenn_tagset()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -74,11 +74,19 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 1,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.datasets import fetch_20newsgroups\n",
|
"from sklearn.datasets import fetch_20newsgroups\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -92,11 +100,19 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 2,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"20\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Number of categories\n",
|
"#Number of categories\n",
|
||||||
"print(len(newsgroups_train.target_names))"
|
"print(len(newsgroups_train.target_names))"
|
||||||
@ -104,11 +120,28 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 3,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Category id 4 comp.sys.mac.hardware\n",
|
||||||
|
"Doc A fair number of brave souls who upgraded their SI clock oscillator have\n",
|
||||||
|
"shared their experiences for this poll. Please send a brief message detailing\n",
|
||||||
|
"your experiences with the procedure. Top speed attained, CPU rated speed,\n",
|
||||||
|
"add on cards and adapters, heat sinks, hour of usage per day, floppy disk\n",
|
||||||
|
"functionality with 800 and 1.4 m floppies are especially requested.\n",
|
||||||
|
"\n",
|
||||||
|
"I will be summarizing in the next two days, so please add to the network\n",
|
||||||
|
"knowledge base if you have done the clock upgrade and haven't answered this\n",
|
||||||
|
"poll. Thanks.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# Show a document\n",
|
"# Show a document\n",
|
||||||
"docid = 1\n",
|
"docid = 1\n",
|
||||||
@ -121,11 +154,22 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 4,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"(11314,)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Number of files\n",
|
"#Number of files\n",
|
||||||
"newsgroups_train.filenames.shape"
|
"newsgroups_train.filenames.shape"
|
||||||
@ -133,11 +177,30 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 5,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n",
|
||||||
|
" VisibleDeprecationWarning)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"(11314, 101323)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# Obtain a vector\n",
|
"# Obtain a vector\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -151,11 +214,22 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 6,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"66.80510871486653"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# The tf-idf vectors are very sparse with an average of 66 non zero components in 101.323 dimensions (.06%)\n",
|
"# The tf-idf vectors are very sparse with an average of 66 non zero components in 101.323 dimensions (.06%)\n",
|
||||||
"vectors_train.nnz / float(vectors_train.shape[0])"
|
"vectors_train.nnz / float(vectors_train.shape[0])"
|
||||||
@ -177,11 +251,30 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 7,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n",
|
||||||
|
" VisibleDeprecationWarning)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0.69545360719001303"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.naive_bayes import MultinomialNB\n",
|
"from sklearn.naive_bayes import MultinomialNB\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -209,11 +302,20 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 8,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"dimensionality: 101323\n",
|
||||||
|
"density: 1.000000\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.utils.extmath import density\n",
|
"from sklearn.utils.extmath import density\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -223,11 +325,38 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 9,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"alt.atheism: islam atheists say just religion atheism think don people god\n",
|
||||||
|
"comp.graphics: looking format 3d know program file files thanks image graphics\n",
|
||||||
|
"comp.os.ms-windows.misc: card problem thanks driver drivers use files dos file windows\n",
|
||||||
|
"comp.sys.ibm.pc.hardware: monitor disk thanks pc ide controller bus card scsi drive\n",
|
||||||
|
"comp.sys.mac.hardware: know monitor does quadra simms thanks problem drive apple mac\n",
|
||||||
|
"comp.windows.x: using windows x11r5 use application thanks widget server motif window\n",
|
||||||
|
"misc.forsale: asking email sell price condition new shipping offer 00 sale\n",
|
||||||
|
"rec.autos: don ford new good dealer just engine like cars car\n",
|
||||||
|
"rec.motorcycles: don just helmet riding like motorcycle ride bikes dod bike\n",
|
||||||
|
"rec.sport.baseball: braves players pitching hit runs games game baseball team year\n",
|
||||||
|
"rec.sport.hockey: league year nhl games season players play hockey team game\n",
|
||||||
|
"sci.crypt: people use escrow nsa keys government chip clipper encryption key\n",
|
||||||
|
"sci.electronics: don thanks voltage used know does like circuit power use\n",
|
||||||
|
"sci.med: skepticism cadre dsl banks chastity n3jxp pitt gordon geb msg\n",
|
||||||
|
"sci.space: just lunar earth shuttle like moon launch orbit nasa space\n",
|
||||||
|
"soc.religion.christian: believe faith christian christ bible people christians church jesus god\n",
|
||||||
|
"talk.politics.guns: just law firearms government fbi don weapons people guns gun\n",
|
||||||
|
"talk.politics.mideast: said arabs arab turkish people armenians armenian jews israeli israel\n",
|
||||||
|
"talk.politics.misc: know state clinton president just think tax don government people\n",
|
||||||
|
"talk.religion.misc: think don koresh objective christians bible people christian jesus god\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# We can review the top features per topic in Bayes (attribute coef_)\n",
|
"# We can review the top features per topic in Bayes (attribute coef_)\n",
|
||||||
"import numpy as np\n",
|
"import numpy as np\n",
|
||||||
@ -244,11 +373,28 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 10,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[ 2 15]\n",
|
||||||
|
"['comp.os.ms-windows.misc', 'soc.religion.christian']\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n",
|
||||||
|
" VisibleDeprecationWarning)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# We try the classifier in two new docs\n",
|
"# We try the classifier in two new docs\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -115,7 +115,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -193,7 +193,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
Loading…
Reference in New Issue
Block a user