1
0
mirror of https://github.com/gsi-upm/sitc synced 2024-12-22 03:38:13 +00:00

added import nltk

This commit is contained in:
cif2cif 2017-04-20 16:07:10 +02:00
parent c55a1c077b
commit b24f866056
12 changed files with 180 additions and 179 deletions

View File

@ -102,7 +102,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1+"
"version": "3.5.2"
}
},
"nbformat": 4,

View File

@ -76,7 +76,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"metadata": {
"collapsed": true
},
@ -145,7 +145,6 @@
"plt.hist(iris.target, bins=10)\n",
"plt.xlabel('Number of species')\n",
"plt.ylabel('iris class')\n"
]
},
{
@ -379,7 +378,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1+"
"version": "3.5.2"
}
},
"nbformat": 4,

View File

@ -3838,7 +3838,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
"version": "3.5.2"
}
},
"nbformat": 4,

View File

@ -924,7 +924,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1+"
"version": "3.5.2"
}
},
"nbformat": 4,

View File

@ -5403,7 +5403,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1+"
"version": "3.5.2"
}
},
"nbformat": 4,

View File

@ -4763,7 +4763,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3+"
"version": "3.5.2"
}
},
"nbformat": 4,

View File

@ -114,7 +114,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1+"
"version": "3.5.2"
}
},
"nbformat": 4,

View File

@ -1170,7 +1170,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1+"
"version": "3.5.2"
}
},
"nbformat": 4,

View File

@ -162,157 +162,13 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"$: dollar\n",
" $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$\n",
"'': closing quotation mark\n",
" ' ''\n",
"(: opening parenthesis\n",
" ( [ {\n",
"): closing parenthesis\n",
" ) ] }\n",
",: comma\n",
" ,\n",
"--: dash\n",
" --\n",
".: sentence terminator\n",
" . ! ?\n",
":: colon or ellipsis\n",
" : ; ...\n",
"CC: conjunction, coordinating\n",
" & 'n and both but either et for less minus neither nor or plus so\n",
" therefore times v. versus vs. whether yet\n",
"CD: numeral, cardinal\n",
" mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-\n",
" seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025\n",
" fifteen 271,124 dozen quintillion DM2,000 ...\n",
"DT: determiner\n",
" all an another any both del each either every half la many much nary\n",
" neither no some such that the them these this those\n",
"EX: existential there\n",
" there\n",
"FW: foreign word\n",
" gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous\n",
" lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte\n",
" terram fiche oui corporis ...\n",
"IN: preposition or conjunction, subordinating\n",
" astride among uppon whether out inside pro despite on by throughout\n",
" below within for towards near behind atop around if like until below\n",
" next into if beside ...\n",
"JJ: adjective or numeral, ordinal\n",
" third ill-mannered pre-war regrettable oiled calamitous first separable\n",
" ectoplasmic battery-powered participatory fourth still-to-be-named\n",
" multilingual multi-disciplinary ...\n",
"JJR: adjective, comparative\n",
" bleaker braver breezier briefer brighter brisker broader bumper busier\n",
" calmer cheaper choosier cleaner clearer closer colder commoner costlier\n",
" cozier creamier crunchier cuter ...\n",
"JJS: adjective, superlative\n",
" calmest cheapest choicest classiest cleanest clearest closest commonest\n",
" corniest costliest crassest creepiest crudest cutest darkest deadliest\n",
" dearest deepest densest dinkiest ...\n",
"LS: list item marker\n",
" A A. B B. C C. D E F First G H I J K One SP-44001 SP-44002 SP-44005\n",
" SP-44007 Second Third Three Two * a b c d first five four one six three\n",
" two\n",
"MD: modal auxiliary\n",
" can cannot could couldn't dare may might must need ought shall should\n",
" shouldn't will would\n",
"NN: noun, common, singular or mass\n",
" common-carrier cabbage knuckle-duster Casino afghan shed thermostat\n",
" investment slide humour falloff slick wind hyena override subhumanity\n",
" machinist ...\n",
"NNP: noun, proper, singular\n",
" Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos\n",
" Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA\n",
" Shannon A.K.C. Meltex Liverpool ...\n",
"NNPS: noun, proper, plural\n",
" Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists\n",
" Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques\n",
" Apache Apaches Apocrypha ...\n",
"NNS: noun, common, plural\n",
" undergraduates scotches bric-a-brac products bodyguards facets coasts\n",
" divestitures storehouses designs clubs fragrances averages\n",
" subjectivists apprehensions muses factory-jobs ...\n",
"PDT: pre-determiner\n",
" all both half many quite such sure this\n",
"POS: genitive marker\n",
" ' 's\n",
"PRP: pronoun, personal\n",
" hers herself him himself hisself it itself me myself one oneself ours\n",
" ourselves ownself self she thee theirs them themselves they thou thy us\n",
"PRP$: pronoun, possessive\n",
" her his mine my our ours their thy your\n",
"RB: adverb\n",
" occasionally unabatingly maddeningly adventurously professedly\n",
" stirringly prominently technologically magisterially predominately\n",
" swiftly fiscally pitilessly ...\n",
"RBR: adverb, comparative\n",
" further gloomier grander graver greater grimmer harder harsher\n",
" healthier heavier higher however larger later leaner lengthier less-\n",
" perfectly lesser lonelier longer louder lower more ...\n",
"RBS: adverb, superlative\n",
" best biggest bluntest earliest farthest first furthest hardest\n",
" heartiest highest largest least less most nearest second tightest worst\n",
"RP: particle\n",
" aboard about across along apart around aside at away back before behind\n",
" by crop down ever fast for forth from go high i.e. in into just later\n",
" low more off on open out over per pie raising start teeth that through\n",
" under unto up up-pp upon whole with you\n",
"SYM: symbol\n",
" % & ' '' ''. ) ). * + ,. < = > @ A[fj] U.S U.S.S.R * ** ***\n",
"TO: \"to\" as preposition or infinitive marker\n",
" to\n",
"UH: interjection\n",
" Goodbye Goody Gosh Wow Jeepers Jee-sus Hubba Hey Kee-reist Oops amen\n",
" huh howdy uh dammit whammo shucks heck anyways whodunnit honey golly\n",
" man baby diddle hush sonuvabitch ...\n",
"VB: verb, base form\n",
" ask assemble assess assign assume atone attention avoid bake balkanize\n",
" bank begin behold believe bend benefit bevel beware bless boil bomb\n",
" boost brace break bring broil brush build ...\n",
"VBD: verb, past tense\n",
" dipped pleaded swiped regummed soaked tidied convened halted registered\n",
" cushioned exacted snubbed strode aimed adopted belied figgered\n",
" speculated wore appreciated contemplated ...\n",
"VBG: verb, present participle or gerund\n",
" telegraphing stirring focusing angering judging stalling lactating\n",
" hankerin' alleging veering capping approaching traveling besieging\n",
" encrypting interrupting erasing wincing ...\n",
"VBN: verb, past participle\n",
" multihulled dilapidated aerosolized chaired languished panelized used\n",
" experimented flourished imitated reunifed factored condensed sheared\n",
" unsettled primed dubbed desired ...\n",
"VBP: verb, present tense, not 3rd person singular\n",
" predominate wrap resort sue twist spill cure lengthen brush terminate\n",
" appear tend stray glisten obtain comprise detest tease attract\n",
" emphasize mold postpone sever return wag ...\n",
"VBZ: verb, present tense, 3rd person singular\n",
" bases reconstructs marks mixes displeases seals carps weaves snatches\n",
" slumps stretches authorizes smolders pictures emerges stockpiles\n",
" seduces fizzes uses bolsters slaps speaks pleads ...\n",
"WDT: WH-determiner\n",
" that what whatever which whichever\n",
"WP: WH-pronoun\n",
" that what whatever whatsoever which who whom whosoever\n",
"WP$: WH-pronoun, possessive\n",
" whose\n",
"WRB: Wh-adverb\n",
" how however whence whenever where whereby whereever wherein whereof why\n",
"``: opening quotation mark\n",
" ` ``\n"
]
}
],
"outputs": [],
"source": [
"import nltk\n",
"nltk.help.upenn_tagset()"
]
},

View File

@ -74,11 +74,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n"
]
}
],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"\n",
@ -92,11 +100,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20\n"
]
}
],
"source": [
"#Number of categories\n",
"print(len(newsgroups_train.target_names))"
@ -104,11 +120,28 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Category id 4 comp.sys.mac.hardware\n",
"Doc A fair number of brave souls who upgraded their SI clock oscillator have\n",
"shared their experiences for this poll. Please send a brief message detailing\n",
"your experiences with the procedure. Top speed attained, CPU rated speed,\n",
"add on cards and adapters, heat sinks, hour of usage per day, floppy disk\n",
"functionality with 800 and 1.4 m floppies are especially requested.\n",
"\n",
"I will be summarizing in the next two days, so please add to the network\n",
"knowledge base if you have done the clock upgrade and haven't answered this\n",
"poll. Thanks.\n"
]
}
],
"source": [
"# Show a document\n",
"docid = 1\n",
@ -121,11 +154,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"(11314,)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Number of files\n",
"newsgroups_train.filenames.shape"
@ -133,11 +177,30 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n",
" VisibleDeprecationWarning)\n"
]
},
{
"data": {
"text/plain": [
"(11314, 101323)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Obtain a vector\n",
"\n",
@ -151,11 +214,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"66.80510871486653"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The tf-idf vectors are very sparse with an average of 66 non zero components in 101.323 dimensions (.06%)\n",
"vectors_train.nnz / float(vectors_train.shape[0])"
@ -177,11 +251,30 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n",
" VisibleDeprecationWarning)\n"
]
},
{
"data": {
"text/plain": [
"0.69545360719001303"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"\n",
@ -209,11 +302,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dimensionality: 101323\n",
"density: 1.000000\n"
]
}
],
"source": [
"from sklearn.utils.extmath import density\n",
"\n",
@ -223,11 +325,38 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"alt.atheism: islam atheists say just religion atheism think don people god\n",
"comp.graphics: looking format 3d know program file files thanks image graphics\n",
"comp.os.ms-windows.misc: card problem thanks driver drivers use files dos file windows\n",
"comp.sys.ibm.pc.hardware: monitor disk thanks pc ide controller bus card scsi drive\n",
"comp.sys.mac.hardware: know monitor does quadra simms thanks problem drive apple mac\n",
"comp.windows.x: using windows x11r5 use application thanks widget server motif window\n",
"misc.forsale: asking email sell price condition new shipping offer 00 sale\n",
"rec.autos: don ford new good dealer just engine like cars car\n",
"rec.motorcycles: don just helmet riding like motorcycle ride bikes dod bike\n",
"rec.sport.baseball: braves players pitching hit runs games game baseball team year\n",
"rec.sport.hockey: league year nhl games season players play hockey team game\n",
"sci.crypt: people use escrow nsa keys government chip clipper encryption key\n",
"sci.electronics: don thanks voltage used know does like circuit power use\n",
"sci.med: skepticism cadre dsl banks chastity n3jxp pitt gordon geb msg\n",
"sci.space: just lunar earth shuttle like moon launch orbit nasa space\n",
"soc.religion.christian: believe faith christian christ bible people christians church jesus god\n",
"talk.politics.guns: just law firearms government fbi don weapons people guns gun\n",
"talk.politics.mideast: said arabs arab turkish people armenians armenian jews israeli israel\n",
"talk.politics.misc: know state clinton president just think tax don government people\n",
"talk.religion.misc: think don koresh objective christians bible people christian jesus god\n"
]
}
],
"source": [
"# We can review the top features per topic in Bayes (attribute coef_)\n",
"import numpy as np\n",
@ -244,11 +373,28 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 2 15]\n",
"['comp.os.ms-windows.misc', 'soc.religion.christian']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n",
" VisibleDeprecationWarning)\n"
]
}
],
"source": [
"# We try the classifier in two new docs\n",
"\n",

View File

@ -115,7 +115,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
"version": "3.5.2"
}
},
"nbformat": 4,

View File

@ -193,7 +193,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
"version": "3.5.2"
}
},
"nbformat": 4,