diff --git a/nlp/4_2_Syntactic_Processing.ipynb b/nlp/4_2_Syntactic_Processing.ipynb index 042afcd..314e432 100644 --- a/nlp/4_2_Syntactic_Processing.ipynb +++ b/nlp/4_2_Syntactic_Processing.ipynb @@ -61,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "collapsed": false }, @@ -109,30 +109,235 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('I', 'PRON'), ('purchased', 'VERB'), ('this', 'DET'), ('Dell', 'NOUN'), ('monitor', 'NOUN'), ('because', 'ADP'), ('of', 'ADP'), ('budgetary', 'ADJ'), ('concerns', 'NOUN'), ('.', '.'), ('This', 'DET'), ('item', 'NOUN'), ('was', 'VERB'), ('the', 'DET'), ('most', 'ADV'), ('inexpensive', 'ADJ'), ('17', 'NUM'), ('inch', 'NOUN'), ('Apple', 'NOUN'), ('monitor', 'NOUN'), ('available', 'ADJ'), ('to', 'PRT'), ('me', 'PRON'), ('at', 'ADP'), ('the', 'DET'), ('time', 'NOUN'), ('I', 'PRON'), ('made', 'VERB'), ('the', 'DET'), ('purchase', 'NOUN'), ('.', '.'), ('My', 'PRON'), ('overall', 'ADJ'), ('experience', 'NOUN'), ('with', 'ADP'), ('this', 'DET'), ('monitor', 'NOUN'), ('was', 'VERB'), ('very', 'ADV'), ('poor', 'ADJ'), ('.', '.'), ('When', 'ADV'), ('the', 'DET'), ('screen', 'NOUN'), ('was', 'VERB'), (\"n't\", 'ADV'), ('contracting', 'VERB'), ('or', 'CONJ'), ('glitching', 'VERB'), ('the', 'DET'), ('overall', 'ADJ'), ('picture', 'NOUN'), ('quality', 'NOUN'), ('was', 'VERB'), ('poor', 'ADJ'), ('to', 'PRT'), ('fair', 'VERB'), ('.', '.'), ('I', 'PRON'), (\"'ve\", 'VERB'), ('viewed', 'VERB'), ('numerous', 'ADJ'), ('different', 'ADJ'), ('monitor', 'NOUN'), ('models', 'NOUN'), ('since', 'ADP'), ('I', 'PRON'), (\"'m\", 'VERB'), ('a', 'DET'), ('college', 'NOUN'), ('student', 'NOUN'), ('at', 'ADP'), ('UPM', 'NOUN'), ('in', 'ADP'), ('Madrid', 'NOUN'), ('and', 'CONJ'), ('this', 'DET'), ('particular', 'ADJ'), ('monitor', 'NOUN'), ('had', 'VERB'), ('as', 'ADP'), ('poor', 'ADJ'), ('of', 'ADP'), ('picture', 'NOUN'), ('quality', 'NOUN'), ('as', 'ADP'), ('any', 'DET'), ('I', 'PRON'), (\"'ve\", 'VERB'), ('seen', 'VERB'), ('.', '.')]\n" + ] + } + ], "source": [ "from nltk import pos_tag, word_tokenize\n", - "print (pos_tag(word_tokenize(review), tagset='universal'))" + "print (pos_tag(word_tokenize(review), tagset='universal'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Based on this POS info, we could use correctly now the WordNetLemmatizer. The WordNetLemmatizer only is interesting for 4 POS categories: ADJ, ADV, NOUN, and VERB." + "We could have used another tagset for POS, such as UPenn." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('I', 'PRP'), ('purchased', 'VBD'), ('this', 'DT'), ('Dell', 'NNP'), ('monitor', 'NN'), ('because', 'IN'), ('of', 'IN'), ('budgetary', 'JJ'), ('concerns', 'NNS'), ('.', '.'), ('This', 'DT'), ('item', 'NN'), ('was', 'VBD'), ('the', 'DT'), ('most', 'RBS'), ('inexpensive', 'JJ'), ('17', 'CD'), ('inch', 'NN'), ('Apple', 'NNP'), ('monitor', 'NN'), ('available', 'JJ'), ('to', 'TO'), ('me', 'PRP'), ('at', 'IN'), ('the', 'DT'), ('time', 'NN'), ('I', 'PRP'), ('made', 'VBD'), ('the', 'DT'), ('purchase', 'NN'), ('.', '.'), ('My', 'PRP$'), ('overall', 'JJ'), ('experience', 'NN'), ('with', 'IN'), ('this', 'DT'), ('monitor', 'NN'), ('was', 'VBD'), ('very', 'RB'), ('poor', 'JJ'), ('.', '.'), ('When', 'WRB'), ('the', 'DT'), ('screen', 'NN'), ('was', 'VBD'), (\"n't\", 'RB'), ('contracting', 'VBG'), ('or', 'CC'), ('glitching', 'VBG'), ('the', 'DT'), ('overall', 'JJ'), ('picture', 'NN'), ('quality', 'NN'), ('was', 'VBD'), ('poor', 'JJ'), ('to', 'TO'), ('fair', 'VB'), ('.', '.'), ('I', 'PRP'), (\"'ve\", 'VBP'), ('viewed', 'VBN'), ('numerous', 'JJ'), ('different', 'JJ'), ('monitor', 'NN'), ('models', 'NNS'), ('since', 'IN'), ('I', 'PRP'), (\"'m\", 'VBP'), ('a', 'DT'), ('college', 'NN'), ('student', 'NN'), ('at', 'IN'), ('UPM', 'NNP'), ('in', 'IN'), ('Madrid', 'NNP'), ('and', 'CC'), ('this', 'DT'), ('particular', 'JJ'), ('monitor', 'NN'), ('had', 'VBD'), ('as', 'IN'), ('poor', 'JJ'), ('of', 'IN'), ('picture', 'NN'), ('quality', 'NN'), ('as', 'IN'), ('any', 'DT'), ('I', 'PRP'), (\"'ve\", 'VBP'), ('seen', 'VBN'), ('.', '.')]\n" + ] + } + ], + "source": [ + "print (pos_tag(word_tokenize(review)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The meaning of these tags can be obtained here:" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "$: dollar\n", + " $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$\n", + "'': closing quotation mark\n", + " ' ''\n", + "(: opening parenthesis\n", + " ( [ {\n", + "): closing parenthesis\n", + " ) ] }\n", + ",: comma\n", + " ,\n", + "--: dash\n", + " --\n", + ".: sentence terminator\n", + " . ! ?\n", + ":: colon or ellipsis\n", + " : ; ...\n", + "CC: conjunction, coordinating\n", + " & 'n and both but either et for less minus neither nor or plus so\n", + " therefore times v. versus vs. whether yet\n", + "CD: numeral, cardinal\n", + " mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-\n", + " seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025\n", + " fifteen 271,124 dozen quintillion DM2,000 ...\n", + "DT: determiner\n", + " all an another any both del each either every half la many much nary\n", + " neither no some such that the them these this those\n", + "EX: existential there\n", + " there\n", + "FW: foreign word\n", + " gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous\n", + " lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte\n", + " terram fiche oui corporis ...\n", + "IN: preposition or conjunction, subordinating\n", + " astride among uppon whether out inside pro despite on by throughout\n", + " below within for towards near behind atop around if like until below\n", + " next into if beside ...\n", + "JJ: adjective or numeral, ordinal\n", + " third ill-mannered pre-war regrettable oiled calamitous first separable\n", + " ectoplasmic battery-powered participatory fourth still-to-be-named\n", + " multilingual multi-disciplinary ...\n", + "JJR: adjective, comparative\n", + " bleaker braver breezier briefer brighter brisker broader bumper busier\n", + " calmer cheaper choosier cleaner clearer closer colder commoner costlier\n", + " cozier creamier crunchier cuter ...\n", + "JJS: adjective, superlative\n", + " calmest cheapest choicest classiest cleanest clearest closest commonest\n", + " corniest costliest crassest creepiest crudest cutest darkest deadliest\n", + " dearest deepest densest dinkiest ...\n", + "LS: list item marker\n", + " A A. B B. C C. D E F First G H I J K One SP-44001 SP-44002 SP-44005\n", + " SP-44007 Second Third Three Two * a b c d first five four one six three\n", + " two\n", + "MD: modal auxiliary\n", + " can cannot could couldn't dare may might must need ought shall should\n", + " shouldn't will would\n", + "NN: noun, common, singular or mass\n", + " common-carrier cabbage knuckle-duster Casino afghan shed thermostat\n", + " investment slide humour falloff slick wind hyena override subhumanity\n", + " machinist ...\n", + "NNP: noun, proper, singular\n", + " Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos\n", + " Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA\n", + " Shannon A.K.C. Meltex Liverpool ...\n", + "NNPS: noun, proper, plural\n", + " Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists\n", + " Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques\n", + " Apache Apaches Apocrypha ...\n", + "NNS: noun, common, plural\n", + " undergraduates scotches bric-a-brac products bodyguards facets coasts\n", + " divestitures storehouses designs clubs fragrances averages\n", + " subjectivists apprehensions muses factory-jobs ...\n", + "PDT: pre-determiner\n", + " all both half many quite such sure this\n", + "POS: genitive marker\n", + " ' 's\n", + "PRP: pronoun, personal\n", + " hers herself him himself hisself it itself me myself one oneself ours\n", + " ourselves ownself self she thee theirs them themselves they thou thy us\n", + "PRP$: pronoun, possessive\n", + " her his mine my our ours their thy your\n", + "RB: adverb\n", + " occasionally unabatingly maddeningly adventurously professedly\n", + " stirringly prominently technologically magisterially predominately\n", + " swiftly fiscally pitilessly ...\n", + "RBR: adverb, comparative\n", + " further gloomier grander graver greater grimmer harder harsher\n", + " healthier heavier higher however larger later leaner lengthier less-\n", + " perfectly lesser lonelier longer louder lower more ...\n", + "RBS: adverb, superlative\n", + " best biggest bluntest earliest farthest first furthest hardest\n", + " heartiest highest largest least less most nearest second tightest worst\n", + "RP: particle\n", + " aboard about across along apart around aside at away back before behind\n", + " by crop down ever fast for forth from go high i.e. in into just later\n", + " low more off on open out over per pie raising start teeth that through\n", + " under unto up up-pp upon whole with you\n", + "SYM: symbol\n", + " % & ' '' ''. ) ). * + ,. < = > @ A[fj] U.S U.S.S.R * ** ***\n", + "TO: \"to\" as preposition or infinitive marker\n", + " to\n", + "UH: interjection\n", + " Goodbye Goody Gosh Wow Jeepers Jee-sus Hubba Hey Kee-reist Oops amen\n", + " huh howdy uh dammit whammo shucks heck anyways whodunnit honey golly\n", + " man baby diddle hush sonuvabitch ...\n", + "VB: verb, base form\n", + " ask assemble assess assign assume atone attention avoid bake balkanize\n", + " bank begin behold believe bend benefit bevel beware bless boil bomb\n", + " boost brace break bring broil brush build ...\n", + "VBD: verb, past tense\n", + " dipped pleaded swiped regummed soaked tidied convened halted registered\n", + " cushioned exacted snubbed strode aimed adopted belied figgered\n", + " speculated wore appreciated contemplated ...\n", + "VBG: verb, present participle or gerund\n", + " telegraphing stirring focusing angering judging stalling lactating\n", + " hankerin' alleging veering capping approaching traveling besieging\n", + " encrypting interrupting erasing wincing ...\n", + "VBN: verb, past participle\n", + " multihulled dilapidated aerosolized chaired languished panelized used\n", + " experimented flourished imitated reunifed factored condensed sheared\n", + " unsettled primed dubbed desired ...\n", + "VBP: verb, present tense, not 3rd person singular\n", + " predominate wrap resort sue twist spill cure lengthen brush terminate\n", + " appear tend stray glisten obtain comprise detest tease attract\n", + " emphasize mold postpone sever return wag ...\n", + "VBZ: verb, present tense, 3rd person singular\n", + " bases reconstructs marks mixes displeases seals carps weaves snatches\n", + " slumps stretches authorizes smolders pictures emerges stockpiles\n", + " seduces fizzes uses bolsters slaps speaks pleads ...\n", + "WDT: WH-determiner\n", + " that what whatever which whichever\n", + "WP: WH-pronoun\n", + " that what whatever whatsoever which who whom whosoever\n", + "WP$: WH-pronoun, possessive\n", + " whose\n", + "WRB: Wh-adverb\n", + " how however whence whenever where whereby whereever wherein whereof why\n", + "``: opening quotation mark\n", + " ` ``\n" + ] + } + ], + "source": [ + "nltk.help.upenn_tagset()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are going to use the Univeral tagset in this example. Based on this POS info, we could use correctly now the WordNetLemmatizer. The WordNetLemmatizer only is interesting for 4 POS categories: ADJ, ADV, NOUN, and VERB. This is because WordNet lemmatizer will only lemmatize adjectives, adverbs, nouns and verbs, and it needs that all the provided tags are in [n, a, r, v]." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['I', 'purchase', 'Dell', 'monitor', 'because', 'of', 'budgetary', 'concern', 'item', 'be', 'most', 'inexpensive', '17', 'inch', 'Apple', 'monitor', 'available', 'me', 'at', 'time', 'I', 'make', 'purchase', 'My', 'overall', 'experience', 'with', 'monitor', 'be', 'very', 'poor', 'When', 'screen', 'be', \"n't\", 'contract', 'or', 'glitching', 'overall', 'picture', 'quality', 'be', 'poor', 'fair', 'I', \"'ve\", 'view', 'numerous', 'different', 'monitor', 'model', 'since', 'I', \"'m\", 'college', 'student', 'at', 'UPM', 'in', 'Madrid', 'and', 'particular', 'monitor', 'have', 'a', 'poor', 'of', 'picture', 'quality', 'a', 'I', \"'ve\", 'see']\n" + ] + } + ], "source": [ "from nltk.stem import WordNetLemmatizer\n", "\n", @@ -156,16 +361,115 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Named Entity Recognition (NER) is an information retrieval for identifying named entities of places, organisation of persons. NER usually relies in a tagged corpus. NER algorithms can be trained for new corpora." + "Named Entity Recognition (NER) is an information retrieval for identifying named entities of places, organisation of persons. NER usually relies in a tagged corpus. NER algorithms can be trained for new corpora. Here we are using the Brown tagset (http://www.comp.leeds.ac.uk/ccalas/tagsets/brown.html)." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(S\n", + " I/PRP\n", + " purchased/VBD\n", + " this/DT\n", + " (ORGANIZATION Dell/NNP)\n", + " monitor/NN\n", + " because/IN\n", + " of/IN\n", + " budgetary/JJ\n", + " concerns/NNS\n", + " ./.\n", + " This/DT\n", + " item/NN\n", + " was/VBD\n", + " the/DT\n", + " most/RBS\n", + " inexpensive/JJ\n", + " 17/CD\n", + " inch/NN\n", + " Apple/NNP\n", + " monitor/NN\n", + " available/JJ\n", + " to/TO\n", + " me/PRP\n", + " at/IN\n", + " the/DT\n", + " time/NN\n", + " I/PRP\n", + " made/VBD\n", + " the/DT\n", + " purchase/NN\n", + " ./.\n", + " My/PRP$\n", + " overall/JJ\n", + " experience/NN\n", + " with/IN\n", + " this/DT\n", + " monitor/NN\n", + " was/VBD\n", + " very/RB\n", + " poor/JJ\n", + " ./.\n", + " When/WRB\n", + " the/DT\n", + " screen/NN\n", + " was/VBD\n", + " n't/RB\n", + " contracting/VBG\n", + " or/CC\n", + " glitching/VBG\n", + " the/DT\n", + " overall/JJ\n", + " picture/NN\n", + " quality/NN\n", + " was/VBD\n", + " poor/JJ\n", + " to/TO\n", + " fair/VB\n", + " ./.\n", + " I/PRP\n", + " 've/VBP\n", + " viewed/VBN\n", + " numerous/JJ\n", + " different/JJ\n", + " monitor/NN\n", + " models/NNS\n", + " since/IN\n", + " I/PRP\n", + " 'm/VBP\n", + " a/DT\n", + " college/NN\n", + " student/NN\n", + " at/IN\n", + " (ORGANIZATION UPM/NNP)\n", + " in/IN\n", + " (GPE Madrid/NNP)\n", + " and/CC\n", + " this/DT\n", + " particular/JJ\n", + " monitor/NN\n", + " had/VBD\n", + " as/IN\n", + " poor/JJ\n", + " of/IN\n", + " picture/NN\n", + " quality/NN\n", + " as/IN\n", + " any/DT\n", + " I/PRP\n", + " 've/VBP\n", + " seen/VBN\n", + " ./.)\n" + ] + } + ], "source": [ "from nltk import ne_chunk, pos_tag, word_tokenize\n", "ne_tagged = ne_chunk(pos_tag(word_tokenize(review)), binary=False)\n", @@ -206,7 +510,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "collapsed": false }, @@ -229,11 +533,90 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(S\n", + " I/PRON\n", + " purchased/VERB\n", + " (NP this/DET Dell/NOUN monitor/NOUN)\n", + " because/ADP\n", + " of/ADP\n", + " (NP budgetary/ADJ concerns/NOUN)\n", + " ./.\n", + " (NP This/DET item/NOUN)\n", + " was/VERB\n", + " (NP\n", + " the/DET\n", + " most/ADV\n", + " inexpensive/ADJ\n", + " 17/NUM\n", + " inch/NOUN\n", + " Apple/NOUN\n", + " monitor/NOUN)\n", + " available/ADJ\n", + " to/PRT\n", + " me/PRON\n", + " at/ADP\n", + " (NP the/DET time/NOUN)\n", + " I/PRON\n", + " made/VERB\n", + " (NP the/DET purchase/NOUN)\n", + " ./.\n", + " (NP My/PRON overall/ADJ experience/NOUN)\n", + " with/ADP\n", + " (NP this/DET monitor/NOUN)\n", + " was/VERB\n", + " very/ADV\n", + " poor/ADJ\n", + " ./.\n", + " When/ADV\n", + " (NP the/DET screen/NOUN)\n", + " was/VERB\n", + " n't/ADV\n", + " contracting/VERB\n", + " or/CONJ\n", + " glitching/VERB\n", + " (NP the/DET overall/ADJ picture/NOUN quality/NOUN)\n", + " was/VERB\n", + " poor/ADJ\n", + " to/PRT\n", + " fair/VERB\n", + " ./.\n", + " I/PRON\n", + " 've/VERB\n", + " viewed/VERB\n", + " (NP numerous/ADJ different/ADJ monitor/NOUN models/NOUN)\n", + " since/ADP\n", + " I/PRON\n", + " 'm/VERB\n", + " (NP a/DET college/NOUN student/NOUN)\n", + " at/ADP\n", + " (NP UPM/NOUN)\n", + " in/ADP\n", + " (NP Madrid/NOUN)\n", + " and/CONJ\n", + " (NP this/DET particular/ADJ monitor/NOUN)\n", + " had/VERB\n", + " as/ADP\n", + " poor/ADJ\n", + " of/ADP\n", + " (NP picture/NOUN quality/NOUN)\n", + " as/ADP\n", + " any/DET\n", + " I/PRON\n", + " 've/VERB\n", + " seen/VERB\n", + " ./.)\n" + ] + } + ], "source": [ "from nltk.chunk.regexp import *\n", "pattern = \"\"\"NP: {+} \n", @@ -257,11 +640,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[Tree('NP', [('this', 'DET'), ('Dell', 'NOUN'), ('monitor', 'NOUN')]),\n", + " Tree('NP', [('budgetary', 'ADJ'), ('concerns', 'NOUN')]),\n", + " Tree('NP', [('This', 'DET'), ('item', 'NOUN')]),\n", + " Tree('NP', [('the', 'DET'), ('most', 'ADV'), ('inexpensive', 'ADJ'), ('17', 'NUM'), ('inch', 'NOUN'), ('Apple', 'NOUN'), ('monitor', 'NOUN')]),\n", + " Tree('NP', [('the', 'DET'), ('time', 'NOUN')]),\n", + " Tree('NP', [('the', 'DET'), ('purchase', 'NOUN')]),\n", + " Tree('NP', [('My', 'PRON'), ('overall', 'ADJ'), ('experience', 'NOUN')]),\n", + " Tree('NP', [('this', 'DET'), ('monitor', 'NOUN')]),\n", + " Tree('NP', [('the', 'DET'), ('screen', 'NOUN')]),\n", + " Tree('NP', [('the', 'DET'), ('overall', 'ADJ'), ('picture', 'NOUN'), ('quality', 'NOUN')]),\n", + " Tree('NP', [('numerous', 'ADJ'), ('different', 'ADJ'), ('monitor', 'NOUN'), ('models', 'NOUN')]),\n", + " Tree('NP', [('a', 'DET'), ('college', 'NOUN'), ('student', 'NOUN')]),\n", + " Tree('NP', [('UPM', 'NOUN')]),\n", + " Tree('NP', [('Madrid', 'NOUN')]),\n", + " Tree('NP', [('this', 'DET'), ('particular', 'ADJ'), ('monitor', 'NOUN')]),\n", + " Tree('NP', [('picture', 'NOUN'), ('quality', 'NOUN')])]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def extractTrees(parsed_tree, category='NP'):\n", " return list(parsed_tree.subtrees(filter=lambda x: x.label()==category))\n", @@ -271,11 +680,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['this Dell monitor',\n", + " 'budgetary concerns',\n", + " 'This item',\n", + " 'the most inexpensive 17 inch Apple monitor',\n", + " 'the time',\n", + " 'the purchase',\n", + " 'My overall experience',\n", + " 'this monitor',\n", + " 'the screen',\n", + " 'the overall picture quality',\n", + " 'numerous different monitor models',\n", + " 'a college student',\n", + " 'UPM',\n", + " 'Madrid',\n", + " 'this particular monitor',\n", + " 'picture quality']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def extractStrings(parsed_tree, category='NP'):\n", " return [\" \".join(word for word, pos in vp.leaves()) for vp in extractTrees(parsed_tree, category)]\n",