From b24f866056061208e6a8019b43294629c7a1d98b Mon Sep 17 00:00:00 2001 From: cif2cif Date: Thu, 20 Apr 2017 16:07:10 +0200 Subject: [PATCH] added import nltk --- ml1/2_0_0_Intro_ML.ipynb | 2 +- ml1/2_3_0_Visualisation.ipynb | 5 +- ml2/3_1_Read_Data.ipynb | 2 +- ml2/3_2_Pandas.ipynb | 2 +- ml2/3_3_Data_Munging_with_Pandas.ipynb | 2 +- ml2/3_4_Visualisation_Pandas.ipynb | 2 +- ml2/3_6_Machine_Learning.ipynb | 2 +- ml2/3_7_SVM.ipynb | 2 +- nlp/4_2_Syntactic_Processing.ipynb | 150 +------------------- nlp/4_4_Classification.ipynb | 186 ++++++++++++++++++++++--- python/1_0_Intro_Python.ipynb | 2 +- python/1_1_Notebooks.ipynb | 2 +- 12 files changed, 180 insertions(+), 179 deletions(-) diff --git a/ml1/2_0_0_Intro_ML.ipynb b/ml1/2_0_0_Intro_ML.ipynb index f9fc770..b141a39 100644 --- a/ml1/2_0_0_Intro_ML.ipynb +++ b/ml1/2_0_0_Intro_ML.ipynb @@ -102,7 +102,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1+" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/ml1/2_3_0_Visualisation.ipynb b/ml1/2_3_0_Visualisation.ipynb index 2689f85..645b394 100644 --- a/ml1/2_3_0_Visualisation.ipynb +++ b/ml1/2_3_0_Visualisation.ipynb @@ -76,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": { "collapsed": true }, @@ -145,7 +145,6 @@ "plt.hist(iris.target, bins=10)\n", "plt.xlabel('Number of species')\n", "plt.ylabel('iris class')\n" - ] }, { @@ -379,7 +378,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1+" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/ml2/3_1_Read_Data.ipynb b/ml2/3_1_Read_Data.ipynb index 0e4244c..88a110c 100644 --- a/ml2/3_1_Read_Data.ipynb +++ b/ml2/3_1_Read_Data.ipynb @@ -3838,7 +3838,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/ml2/3_2_Pandas.ipynb b/ml2/3_2_Pandas.ipynb index e35d3c7..c8aeb04 100644 --- a/ml2/3_2_Pandas.ipynb +++ b/ml2/3_2_Pandas.ipynb @@ -924,7 +924,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1+" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/ml2/3_3_Data_Munging_with_Pandas.ipynb b/ml2/3_3_Data_Munging_with_Pandas.ipynb index 34021fd..20c1a66 100644 --- a/ml2/3_3_Data_Munging_with_Pandas.ipynb +++ b/ml2/3_3_Data_Munging_with_Pandas.ipynb @@ -5403,7 +5403,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1+" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/ml2/3_4_Visualisation_Pandas.ipynb b/ml2/3_4_Visualisation_Pandas.ipynb index 6b913a2..22a268c 100644 --- a/ml2/3_4_Visualisation_Pandas.ipynb +++ b/ml2/3_4_Visualisation_Pandas.ipynb @@ -4763,7 +4763,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.3+" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/ml2/3_6_Machine_Learning.ipynb b/ml2/3_6_Machine_Learning.ipynb index 305bc7d..c996a61 100644 --- a/ml2/3_6_Machine_Learning.ipynb +++ b/ml2/3_6_Machine_Learning.ipynb @@ -114,7 +114,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1+" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/ml2/3_7_SVM.ipynb b/ml2/3_7_SVM.ipynb index ffe3710..d6b0dc7 100644 --- a/ml2/3_7_SVM.ipynb +++ b/ml2/3_7_SVM.ipynb @@ -1170,7 +1170,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1+" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/nlp/4_2_Syntactic_Processing.ipynb b/nlp/4_2_Syntactic_Processing.ipynb index 314e432..a0b8d2b 100644 --- a/nlp/4_2_Syntactic_Processing.ipynb +++ b/nlp/4_2_Syntactic_Processing.ipynb @@ -162,157 +162,13 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "$: dollar\n", - " $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$\n", - "'': closing quotation mark\n", - " ' ''\n", - "(: opening parenthesis\n", - " ( [ {\n", - "): closing parenthesis\n", - " ) ] }\n", - ",: comma\n", - " ,\n", - "--: dash\n", - " --\n", - ".: sentence terminator\n", - " . ! ?\n", - ":: colon or ellipsis\n", - " : ; ...\n", - "CC: conjunction, coordinating\n", - " & 'n and both but either et for less minus neither nor or plus so\n", - " therefore times v. versus vs. whether yet\n", - "CD: numeral, cardinal\n", - " mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-\n", - " seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025\n", - " fifteen 271,124 dozen quintillion DM2,000 ...\n", - "DT: determiner\n", - " all an another any both del each either every half la many much nary\n", - " neither no some such that the them these this those\n", - "EX: existential there\n", - " there\n", - "FW: foreign word\n", - " gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous\n", - " lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte\n", - " terram fiche oui corporis ...\n", - "IN: preposition or conjunction, subordinating\n", - " astride among uppon whether out inside pro despite on by throughout\n", - " below within for towards near behind atop around if like until below\n", - " next into if beside ...\n", - "JJ: adjective or numeral, ordinal\n", - " third ill-mannered pre-war regrettable oiled calamitous first separable\n", - " ectoplasmic battery-powered participatory fourth still-to-be-named\n", - " multilingual multi-disciplinary ...\n", - "JJR: adjective, comparative\n", - " bleaker braver breezier briefer brighter brisker broader bumper busier\n", - " calmer cheaper choosier cleaner clearer closer colder commoner costlier\n", - " cozier creamier crunchier cuter ...\n", - "JJS: adjective, superlative\n", - " calmest cheapest choicest classiest cleanest clearest closest commonest\n", - " corniest costliest crassest creepiest crudest cutest darkest deadliest\n", - " dearest deepest densest dinkiest ...\n", - "LS: list item marker\n", - " A A. B B. C C. D E F First G H I J K One SP-44001 SP-44002 SP-44005\n", - " SP-44007 Second Third Three Two * a b c d first five four one six three\n", - " two\n", - "MD: modal auxiliary\n", - " can cannot could couldn't dare may might must need ought shall should\n", - " shouldn't will would\n", - "NN: noun, common, singular or mass\n", - " common-carrier cabbage knuckle-duster Casino afghan shed thermostat\n", - " investment slide humour falloff slick wind hyena override subhumanity\n", - " machinist ...\n", - "NNP: noun, proper, singular\n", - " Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos\n", - " Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA\n", - " Shannon A.K.C. Meltex Liverpool ...\n", - "NNPS: noun, proper, plural\n", - " Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists\n", - " Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques\n", - " Apache Apaches Apocrypha ...\n", - "NNS: noun, common, plural\n", - " undergraduates scotches bric-a-brac products bodyguards facets coasts\n", - " divestitures storehouses designs clubs fragrances averages\n", - " subjectivists apprehensions muses factory-jobs ...\n", - "PDT: pre-determiner\n", - " all both half many quite such sure this\n", - "POS: genitive marker\n", - " ' 's\n", - "PRP: pronoun, personal\n", - " hers herself him himself hisself it itself me myself one oneself ours\n", - " ourselves ownself self she thee theirs them themselves they thou thy us\n", - "PRP$: pronoun, possessive\n", - " her his mine my our ours their thy your\n", - "RB: adverb\n", - " occasionally unabatingly maddeningly adventurously professedly\n", - " stirringly prominently technologically magisterially predominately\n", - " swiftly fiscally pitilessly ...\n", - "RBR: adverb, comparative\n", - " further gloomier grander graver greater grimmer harder harsher\n", - " healthier heavier higher however larger later leaner lengthier less-\n", - " perfectly lesser lonelier longer louder lower more ...\n", - "RBS: adverb, superlative\n", - " best biggest bluntest earliest farthest first furthest hardest\n", - " heartiest highest largest least less most nearest second tightest worst\n", - "RP: particle\n", - " aboard about across along apart around aside at away back before behind\n", - " by crop down ever fast for forth from go high i.e. in into just later\n", - " low more off on open out over per pie raising start teeth that through\n", - " under unto up up-pp upon whole with you\n", - "SYM: symbol\n", - " % & ' '' ''. ) ). * + ,. < = > @ A[fj] U.S U.S.S.R * ** ***\n", - "TO: \"to\" as preposition or infinitive marker\n", - " to\n", - "UH: interjection\n", - " Goodbye Goody Gosh Wow Jeepers Jee-sus Hubba Hey Kee-reist Oops amen\n", - " huh howdy uh dammit whammo shucks heck anyways whodunnit honey golly\n", - " man baby diddle hush sonuvabitch ...\n", - "VB: verb, base form\n", - " ask assemble assess assign assume atone attention avoid bake balkanize\n", - " bank begin behold believe bend benefit bevel beware bless boil bomb\n", - " boost brace break bring broil brush build ...\n", - "VBD: verb, past tense\n", - " dipped pleaded swiped regummed soaked tidied convened halted registered\n", - " cushioned exacted snubbed strode aimed adopted belied figgered\n", - " speculated wore appreciated contemplated ...\n", - "VBG: verb, present participle or gerund\n", - " telegraphing stirring focusing angering judging stalling lactating\n", - " hankerin' alleging veering capping approaching traveling besieging\n", - " encrypting interrupting erasing wincing ...\n", - "VBN: verb, past participle\n", - " multihulled dilapidated aerosolized chaired languished panelized used\n", - " experimented flourished imitated reunifed factored condensed sheared\n", - " unsettled primed dubbed desired ...\n", - "VBP: verb, present tense, not 3rd person singular\n", - " predominate wrap resort sue twist spill cure lengthen brush terminate\n", - " appear tend stray glisten obtain comprise detest tease attract\n", - " emphasize mold postpone sever return wag ...\n", - "VBZ: verb, present tense, 3rd person singular\n", - " bases reconstructs marks mixes displeases seals carps weaves snatches\n", - " slumps stretches authorizes smolders pictures emerges stockpiles\n", - " seduces fizzes uses bolsters slaps speaks pleads ...\n", - "WDT: WH-determiner\n", - " that what whatever which whichever\n", - "WP: WH-pronoun\n", - " that what whatever whatsoever which who whom whosoever\n", - "WP$: WH-pronoun, possessive\n", - " whose\n", - "WRB: Wh-adverb\n", - " how however whence whenever where whereby whereever wherein whereof why\n", - "``: opening quotation mark\n", - " ` ``\n" - ] - } - ], + "outputs": [], "source": [ + "import nltk\n", "nltk.help.upenn_tagset()" ] }, diff --git a/nlp/4_4_Classification.ipynb b/nlp/4_4_Classification.ipynb index b854c5a..e714b59 100644 --- a/nlp/4_4_Classification.ipynb +++ b/nlp/4_4_Classification.ipynb @@ -74,11 +74,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n" + ] + } + ], "source": [ "from sklearn.datasets import fetch_20newsgroups\n", "\n", @@ -92,11 +100,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20\n" + ] + } + ], "source": [ "#Number of categories\n", "print(len(newsgroups_train.target_names))" @@ -104,11 +120,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Category id 4 comp.sys.mac.hardware\n", + "Doc A fair number of brave souls who upgraded their SI clock oscillator have\n", + "shared their experiences for this poll. Please send a brief message detailing\n", + "your experiences with the procedure. Top speed attained, CPU rated speed,\n", + "add on cards and adapters, heat sinks, hour of usage per day, floppy disk\n", + "functionality with 800 and 1.4 m floppies are especially requested.\n", + "\n", + "I will be summarizing in the next two days, so please add to the network\n", + "knowledge base if you have done the clock upgrade and haven't answered this\n", + "poll. Thanks.\n" + ] + } + ], "source": [ "# Show a document\n", "docid = 1\n", @@ -121,11 +154,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(11314,)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "#Number of files\n", "newsgroups_train.filenames.shape" @@ -133,11 +177,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n", + " VisibleDeprecationWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "(11314, 101323)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Obtain a vector\n", "\n", @@ -151,11 +214,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "66.80510871486653" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# The tf-idf vectors are very sparse with an average of 66 non zero components in 101.323 dimensions (.06%)\n", "vectors_train.nnz / float(vectors_train.shape[0])" @@ -177,11 +251,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n", + " VisibleDeprecationWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "0.69545360719001303" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.naive_bayes import MultinomialNB\n", "\n", @@ -209,11 +302,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dimensionality: 101323\n", + "density: 1.000000\n" + ] + } + ], "source": [ "from sklearn.utils.extmath import density\n", "\n", @@ -223,11 +325,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "alt.atheism: islam atheists say just religion atheism think don people god\n", + "comp.graphics: looking format 3d know program file files thanks image graphics\n", + "comp.os.ms-windows.misc: card problem thanks driver drivers use files dos file windows\n", + "comp.sys.ibm.pc.hardware: monitor disk thanks pc ide controller bus card scsi drive\n", + "comp.sys.mac.hardware: know monitor does quadra simms thanks problem drive apple mac\n", + "comp.windows.x: using windows x11r5 use application thanks widget server motif window\n", + "misc.forsale: asking email sell price condition new shipping offer 00 sale\n", + "rec.autos: don ford new good dealer just engine like cars car\n", + "rec.motorcycles: don just helmet riding like motorcycle ride bikes dod bike\n", + "rec.sport.baseball: braves players pitching hit runs games game baseball team year\n", + "rec.sport.hockey: league year nhl games season players play hockey team game\n", + "sci.crypt: people use escrow nsa keys government chip clipper encryption key\n", + "sci.electronics: don thanks voltage used know does like circuit power use\n", + "sci.med: skepticism cadre dsl banks chastity n3jxp pitt gordon geb msg\n", + "sci.space: just lunar earth shuttle like moon launch orbit nasa space\n", + "soc.religion.christian: believe faith christian christ bible people christians church jesus god\n", + "talk.politics.guns: just law firearms government fbi don weapons people guns gun\n", + "talk.politics.mideast: said arabs arab turkish people armenians armenian jews israeli israel\n", + "talk.politics.misc: know state clinton president just think tax don government people\n", + "talk.religion.misc: think don koresh objective christians bible people christian jesus god\n" + ] + } + ], "source": [ "# We can review the top features per topic in Bayes (attribute coef_)\n", "import numpy as np\n", @@ -244,11 +373,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 2 15]\n", + "['comp.os.ms-windows.misc', 'soc.religion.christian']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/cif/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n", + " VisibleDeprecationWarning)\n" + ] + } + ], "source": [ "# We try the classifier in two new docs\n", "\n", diff --git a/python/1_0_Intro_Python.ipynb b/python/1_0_Intro_Python.ipynb index 83f5840..5c2579e 100644 --- a/python/1_0_Intro_Python.ipynb +++ b/python/1_0_Intro_Python.ipynb @@ -115,7 +115,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4, diff --git a/python/1_1_Notebooks.ipynb b/python/1_1_Notebooks.ipynb index 671d8db..42cdade 100644 --- a/python/1_1_Notebooks.ipynb +++ b/python/1_1_Notebooks.ipynb @@ -193,7 +193,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4,