Included installation of nltk

2026-02-09 08:08:17 +00:00 · 2017-04-20 12:56:39 +02:00
parent cb40531dc4
commit e88e144a50
5 changed files with 140 additions and 987 deletions
--- a/nlp/4_2_Syntactic_Processing.ipynb
+++ b/nlp/4_2_Syntactic_Processing.ipynb
@@ -61,7 +61,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
@@ -109,19 +109,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[('I', 'PRON'), ('purchased', 'VERB'), ('this', 'DET'), ('Dell', 'NOUN'), ('monitor', 'NOUN'), ('because', 'ADP'), ('of', 'ADP'), ('budgetary', 'ADJ'), ('concerns', 'NOUN'), ('.', '.'), ('This', 'DET'), ('item', 'NOUN'), ('was', 'VERB'), ('the', 'DET'), ('most', 'ADV'), ('inexpensive', 'ADJ'), ('17', 'NUM'), ('inch', 'NOUN'), ('Apple', 'NOUN'), ('monitor', 'NOUN'), ('available', 'ADJ'), ('to', 'PRT'), ('me', 'PRON'), ('at', 'ADP'), ('the', 'DET'), ('time', 'NOUN'), ('I', 'PRON'), ('made', 'VERB'), ('the', 'DET'), ('purchase', 'NOUN'), ('.', '.'), ('My', 'PRON'), ('overall', 'ADJ'), ('experience', 'NOUN'), ('with', 'ADP'), ('this', 'DET'), ('monitor', 'NOUN'), ('was', 'VERB'), ('very', 'ADV'), ('poor', 'ADJ'), ('.', '.'), ('When', 'ADV'), ('the', 'DET'), ('screen', 'NOUN'), ('was', 'VERB'), (\"n't\", 'ADV'), ('contracting', 'VERB'), ('or', 'CONJ'), ('glitching', 'VERB'), ('the', 'DET'), ('overall', 'ADJ'), ('picture', 'NOUN'), ('quality', 'NOUN'), ('was', 'VERB'), ('poor', 'ADJ'), ('to', 'PRT'), ('fair', 'VERB'), ('.', '.'), ('I', 'PRON'), (\"'ve\", 'VERB'), ('viewed', 'VERB'), ('numerous', 'ADJ'), ('different', 'ADJ'), ('monitor', 'NOUN'), ('models', 'NOUN'), ('since', 'ADP'), ('I', 'PRON'), (\"'m\", 'VERB'), ('a', 'DET'), ('college', 'NOUN'), ('student', 'NOUN'), ('at', 'ADP'), ('UPM', 'NOUN'), ('in', 'ADP'), ('Madrid', 'NOUN'), ('and', 'CONJ'), ('this', 'DET'), ('particular', 'ADJ'), ('monitor', 'NOUN'), ('had', 'VERB'), ('as', 'ADP'), ('poor', 'ADJ'), ('of', 'ADP'), ('picture', 'NOUN'), ('quality', 'NOUN'), ('as', 'ADP'), ('any', 'DET'), ('I', 'PRON'), (\"'ve\", 'VERB'), ('seen', 'VERB'), ('.', '.')]\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "from nltk import pos_tag, word_tokenize\n",
    "print (pos_tag(word_tokenize(review),  tagset='universal'))"
@@ -136,19 +128,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "['I', 'purchase', 'Dell', 'monitor', 'because', 'of', 'budgetary', 'concern', 'item', 'be', 'most', 'inexpensive', '17', 'inch', 'Apple', 'monitor', 'available', 'me', 'at', 'time', 'I', 'make', 'purchase', 'My', 'overall', 'experience', 'with', 'monitor', 'be', 'very', 'poor', 'When', 'screen', 'be', \"n't\", 'contract', 'or', 'glitching', 'overall', 'picture', 'quality', 'be', 'poor', 'fair', 'I', \"'ve\", 'view', 'numerous', 'different', 'monitor', 'model', 'since', 'I', \"'m\", 'college', 'student', 'at', 'UPM', 'in', 'Madrid', 'and', 'particular', 'monitor', 'have', 'a', 'poor', 'of', 'picture', 'quality', 'a', 'I', \"'ve\", 'see']\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "from nltk.stem import WordNetLemmatizer\n",
    "\n",
@@ -177,110 +161,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(S\n",
-      "  I/PRP\n",
-      "  purchased/VBD\n",
-      "  this/DT\n",
-      "  (ORGANIZATION Dell/NNP)\n",
-      "  monitor/NN\n",
-      "  because/IN\n",
-      "  of/IN\n",
-      "  budgetary/JJ\n",
-      "  concerns/NNS\n",
-      "  ./.\n",
-      "  This/DT\n",
-      "  item/NN\n",
-      "  was/VBD\n",
-      "  the/DT\n",
-      "  most/RBS\n",
-      "  inexpensive/JJ\n",
-      "  17/CD\n",
-      "  inch/NN\n",
-      "  Apple/NNP\n",
-      "  monitor/NN\n",
-      "  available/JJ\n",
-      "  to/TO\n",
-      "  me/PRP\n",
-      "  at/IN\n",
-      "  the/DT\n",
-      "  time/NN\n",
-      "  I/PRP\n",
-      "  made/VBD\n",
-      "  the/DT\n",
-      "  purchase/NN\n",
-      "  ./.\n",
-      "  My/PRP$\n",
-      "  overall/JJ\n",
-      "  experience/NN\n",
-      "  with/IN\n",
-      "  this/DT\n",
-      "  monitor/NN\n",
-      "  was/VBD\n",
-      "  very/RB\n",
-      "  poor/JJ\n",
-      "  ./.\n",
-      "  When/WRB\n",
-      "  the/DT\n",
-      "  screen/NN\n",
-      "  was/VBD\n",
-      "  n't/RB\n",
-      "  contracting/VBG\n",
-      "  or/CC\n",
-      "  glitching/VBG\n",
-      "  the/DT\n",
-      "  overall/JJ\n",
-      "  picture/NN\n",
-      "  quality/NN\n",
-      "  was/VBD\n",
-      "  poor/JJ\n",
-      "  to/TO\n",
-      "  fair/VB\n",
-      "  ./.\n",
-      "  I/PRP\n",
-      "  've/VBP\n",
-      "  viewed/VBN\n",
-      "  numerous/JJ\n",
-      "  different/JJ\n",
-      "  monitor/NN\n",
-      "  models/NNS\n",
-      "  since/IN\n",
-      "  I/PRP\n",
-      "  'm/VBP\n",
-      "  a/DT\n",
-      "  college/NN\n",
-      "  student/NN\n",
-      "  at/IN\n",
-      "  (ORGANIZATION UPM/NNP)\n",
-      "  in/IN\n",
-      "  (GPE Madrid/NNP)\n",
-      "  and/CC\n",
-      "  this/DT\n",
-      "  particular/JJ\n",
-      "  monitor/NN\n",
-      "  had/VBD\n",
-      "  as/IN\n",
-      "  poor/JJ\n",
-      "  of/IN\n",
-      "  picture/NN\n",
-      "  quality/NN\n",
-      "  as/IN\n",
-      "  any/DT\n",
-      "  I/PRP\n",
-      "  've/VBP\n",
-      "  seen/VBN\n",
-      "  ./.)\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "from nltk import ne_chunk, pos_tag, word_tokenize\n",
    "ne_tagged = ne_chunk(pos_tag(word_tokenize(review)), binary=False)\n",
@@ -321,7 +206,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
@@ -344,90 +229,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(S\n",
-      "  I/PRON\n",
-      "  purchased/VERB\n",
-      "  (NP this/DET Dell/NOUN monitor/NOUN)\n",
-      "  because/ADP\n",
-      "  of/ADP\n",
-      "  (NP budgetary/ADJ concerns/NOUN)\n",
-      "  ./.\n",
-      "  (NP This/DET item/NOUN)\n",
-      "  was/VERB\n",
-      "  (NP\n",
-      "    the/DET\n",
-      "    most/ADV\n",
-      "    inexpensive/ADJ\n",
-      "    17/NUM\n",
-      "    inch/NOUN\n",
-      "    Apple/NOUN\n",
-      "    monitor/NOUN)\n",
-      "  available/ADJ\n",
-      "  to/PRT\n",
-      "  me/PRON\n",
-      "  at/ADP\n",
-      "  (NP the/DET time/NOUN)\n",
-      "  I/PRON\n",
-      "  made/VERB\n",
-      "  (NP the/DET purchase/NOUN)\n",
-      "  ./.\n",
-      "  (NP My/PRON overall/ADJ experience/NOUN)\n",
-      "  with/ADP\n",
-      "  (NP this/DET monitor/NOUN)\n",
-      "  was/VERB\n",
-      "  very/ADV\n",
-      "  poor/ADJ\n",
-      "  ./.\n",
-      "  When/ADV\n",
-      "  (NP the/DET screen/NOUN)\n",
-      "  was/VERB\n",
-      "  n't/ADV\n",
-      "  contracting/VERB\n",
-      "  or/CONJ\n",
-      "  glitching/VERB\n",
-      "  (NP the/DET overall/ADJ picture/NOUN quality/NOUN)\n",
-      "  was/VERB\n",
-      "  poor/ADJ\n",
-      "  to/PRT\n",
-      "  fair/VERB\n",
-      "  ./.\n",
-      "  I/PRON\n",
-      "  've/VERB\n",
-      "  viewed/VERB\n",
-      "  (NP numerous/ADJ different/ADJ monitor/NOUN models/NOUN)\n",
-      "  since/ADP\n",
-      "  I/PRON\n",
-      "  'm/VERB\n",
-      "  (NP a/DET college/NOUN student/NOUN)\n",
-      "  at/ADP\n",
-      "  (NP UPM/NOUN)\n",
-      "  in/ADP\n",
-      "  (NP Madrid/NOUN)\n",
-      "  and/CONJ\n",
-      "  (NP this/DET particular/ADJ monitor/NOUN)\n",
-      "  had/VERB\n",
-      "  as/ADP\n",
-      "  poor/ADJ\n",
-      "  of/ADP\n",
-      "  (NP picture/NOUN quality/NOUN)\n",
-      "  as/ADP\n",
-      "  any/DET\n",
-      "  I/PRON\n",
-      "  've/VERB\n",
-      "  seen/VERB\n",
-      "  ./.)\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "from nltk.chunk.regexp import *\n",
    "pattern = \"\"\"NP: {<PRON><ADJ><NOUN>+} \n",
@@ -451,37 +257,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[Tree('NP', [('this', 'DET'), ('Dell', 'NOUN'), ('monitor', 'NOUN')]),\n",
-       " Tree('NP', [('budgetary', 'ADJ'), ('concerns', 'NOUN')]),\n",
-       " Tree('NP', [('This', 'DET'), ('item', 'NOUN')]),\n",
-       " Tree('NP', [('the', 'DET'), ('most', 'ADV'), ('inexpensive', 'ADJ'), ('17', 'NUM'), ('inch', 'NOUN'), ('Apple', 'NOUN'), ('monitor', 'NOUN')]),\n",
-       " Tree('NP', [('the', 'DET'), ('time', 'NOUN')]),\n",
-       " Tree('NP', [('the', 'DET'), ('purchase', 'NOUN')]),\n",
-       " Tree('NP', [('My', 'PRON'), ('overall', 'ADJ'), ('experience', 'NOUN')]),\n",
-       " Tree('NP', [('this', 'DET'), ('monitor', 'NOUN')]),\n",
-       " Tree('NP', [('the', 'DET'), ('screen', 'NOUN')]),\n",
-       " Tree('NP', [('the', 'DET'), ('overall', 'ADJ'), ('picture', 'NOUN'), ('quality', 'NOUN')]),\n",
-       " Tree('NP', [('numerous', 'ADJ'), ('different', 'ADJ'), ('monitor', 'NOUN'), ('models', 'NOUN')]),\n",
-       " Tree('NP', [('a', 'DET'), ('college', 'NOUN'), ('student', 'NOUN')]),\n",
-       " Tree('NP', [('UPM', 'NOUN')]),\n",
-       " Tree('NP', [('Madrid', 'NOUN')]),\n",
-       " Tree('NP', [('this', 'DET'), ('particular', 'ADJ'), ('monitor', 'NOUN')]),\n",
-       " Tree('NP', [('picture', 'NOUN'), ('quality', 'NOUN')])]"
-      ]
-     },
-     "execution_count": 54,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "def extractTrees(parsed_tree, category='NP'):\n",
    "    return list(parsed_tree.subtrees(filter=lambda x: x.label()==category))\n",
@@ -491,37 +271,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 90,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['this Dell monitor',\n",
-       " 'budgetary concerns',\n",
-       " 'This item',\n",
-       " 'the most inexpensive 17 inch Apple monitor',\n",
-       " 'the time',\n",
-       " 'the purchase',\n",
-       " 'My overall experience',\n",
-       " 'this monitor',\n",
-       " 'the screen',\n",
-       " 'the overall picture quality',\n",
-       " 'numerous different monitor models',\n",
-       " 'a college student',\n",
-       " 'UPM',\n",
-       " 'Madrid',\n",
-       " 'this particular monitor',\n",
-       " 'picture quality']"
-      ]
-     },
-     "execution_count": 90,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "def extractStrings(parsed_tree, category='NP'):\n",
    "    return [\" \".join(word for word, pos in vp.leaves()) for vp in extractTrees(parsed_tree, category)]\n",
@@ -529,15 +283,6 @@
    "extractStrings(chunks_np)"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
-  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -587,7 +332,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.5.1"
+   "version": "3.5.2"
  }
 },
 "nbformat": 4,