mirror of https://github.com/gsi-upm/sitc synced 2025-09-19 05:12:20 +00:00

Compare commits

7 Commits

Author SHA1 Message Date
Dani Vera
19ea5dff09 Update 4_1_Lexical_Processing.ipynb 2019-11-26 15:14:40 +01:00
Carlos A. Iglesias
e70689072f Merge pull request #4 from gsi-upm/dveni-patch-1
Update 3_3_Data_Munging_with_Pandas.ipynb
2019-09-19 10:46:19 +02:00
Dani Vera
344e054ba4 Update 3_3_Data_Munging_with_Pandas.ipynb
np.size is used in the last column. This computes the size of the series (of the non-null values, I believe), but I think the intent is to compute the number of survivors, for which np.sum could be used.
2019-09-18 15:39:16 +02:00
Carlos A. Iglesias
2c8238f1f2 Changed dictionary name 2019-04-23 10:39:56 +02:00
Carlos A. Iglesias
e42299ac7a changed n_topics to n_components for compatibility 2019-04-22 23:50:16 +02:00
Oscar Araque
9d1b88dfea Makefile updated 2019-03-28 14:13:22 +01:00
Oscar Araque
ae3c34f94c description about parameter h added 2019-03-21 19:35:50 +01:00
6 changed files with 208 additions and 57 deletions

View File

@@ -1,10 +1,11 @@
 FOLDER:=.
+ERROR:=255
 exec:
-	find $(FOLDER) -iname '*.ipynb' -print0 | xargs -n 1 -0 sh -c 'jupyter nbconvert --execute --ClearOutputPreprocessor.enabled=True --inplace $$0 || exit 255'
+	find $(FOLDER) -iname '*.ipynb' -print0 | xargs -n 1 -0 sh -c 'jupyter nbconvert --execute --ClearOutputPreprocessor.enabled=True --inplace $$0 || exit $(ERROR)'
 clean:
-	find $(FOLDER) -iname '*.ipynb' -print0 | xargs -n 1 -0 sh -c 'jupyter nbconvert --ClearOutputPreprocessor.enabled=True --inplace $$0 || exit 255'
+	find $(FOLDER) -iname '*.ipynb' -print0 | xargs -n 1 -0 sh -c 'nbstripout $$0 || exit $(ERROR)'
 .PHONY: exec clean

View File

@@ -437,7 +437,7 @@
"\n", "\n",
"#Show mean Age, mean SibSp, and number of passengers older than 25 that survived, grouped by Passenger Class and Sex\n", "#Show mean Age, mean SibSp, and number of passengers older than 25 that survived, grouped by Passenger Class and Sex\n",
"df[(df.Age > 25 & (df.Survived == 1))].groupby(['Pclass', 'Sex'])['Age','SibSp','Survived'].agg({'Age': np.mean, \n", "df[(df.Age > 25 & (df.Survived == 1))].groupby(['Pclass', 'Sex'])['Age','SibSp','Survived'].agg({'Age': np.mean, \n",
" 'SibSp': np.mean, 'Survived': np.size})" " 'SibSp': np.mean, 'Survived': np.sum})"
] ]
}, },
{ {
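The commit message in the list above explains the intent: np.size counts the rows in each group whether or not the passenger survived, while np.sum over the 0/1 Survived flags counts only the survivors. A minimal sketch on invented data showing the difference (the column names follow the Titanic notebook; the DataFrame itself is made up):

import numpy as np
import pandas as pd

# Invented sample shaped like the Titanic data
df = pd.DataFrame({
    'Pclass':   [1, 1, 3, 3, 3],
    'Sex':      ['female', 'male', 'female', 'male', 'male'],
    'Survived': [1, 0, 1, 0, 1],  # 0/1 survival flags
})

g = df.groupby(['Pclass', 'Sex'])['Survived']
print(g.agg(np.size))  # rows per group, survivors or not
print(g.agg(np.sum))   # sum of the 0/1 flags: survivors per group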

View File

@@ -275,7 +275,10 @@
"print(classification_report(y_test, lr_preds))\n", "print(classification_report(y_test, lr_preds))\n",
"\n", "\n",
"plt.figure(figsize=(10,7))\n", "plt.figure(figsize=(10,7))\n",
"plot_decision_surface(X, y, lr)" "# This methods outputs a visualization\n",
"# the h parameter adjusts the precision of the visualization\n",
"# if you find memory errors, set h to a higher value (e.g., h=0.1)\n",
"plot_decision_surface(X, y, lr, h=0.02) "
] ]
}, },
{ {
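The plot_decision_surface helper itself is not part of this diff, but decision-surface plots are conventionally drawn by classifying every point of an np.meshgrid whose step size is h, which is why the grid (and memory use) grows roughly as (range/h)^2 as h shrinks. A hedged sketch of such a helper, assuming two features and a fitted scikit-learn classifier; the repository's actual implementation may differ:

import numpy as np
import matplotlib.pyplot as plt

def plot_decision_surface(X, y, clf, h=0.02):
    # Grid over the feature space; a larger h (e.g., 0.1) means far fewer points
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Classify every grid point and colour the resulting regions
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')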

View File

@@ -326,7 +326,7 @@
"def preprocess(words, type='doc'):\n", "def preprocess(words, type='doc'):\n",
" if (type == 'tweet'):\n", " if (type == 'tweet'):\n",
" tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)\n", " tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)\n",
" tokens = tknzr.tokenize(tweet)\n", " tokens = tknzr.tokenize(words)\n",
" else:\n", " else:\n",
" tokens = nltk.word_tokenize(words.lower())\n", " tokens = nltk.word_tokenize(words.lower())\n",
" porter = nltk.PorterStemmer()\n", " porter = nltk.PorterStemmer()\n",

View File

@@ -76,9 +76,20 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 1,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"data": {
+"text/plain": [
+"(2034, 2807)"
+]
+},
+"execution_count": 1,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "from sklearn.datasets import fetch_20newsgroups\n",
 "\n",
@@ -120,7 +131,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 2,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -148,7 +159,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 3,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -162,9 +173,27 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 4,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[(0,\n",
+" '0.007*\"car\" + 0.006*\"increased\" + 0.006*\"closely\" + 0.006*\"groups\" + 0.006*\"center\" + 0.006*\"88\" + 0.006*\"offer\" + 0.005*\"archie\" + 0.005*\"beginning\" + 0.005*\"comets\"'),\n",
+" (1,\n",
+" '0.005*\"allow\" + 0.005*\"discuss\" + 0.005*\"condition\" + 0.004*\"certain\" + 0.004*\"member\" + 0.004*\"manipulation\" + 0.004*\"little\" + 0.003*\"proposal\" + 0.003*\"heavily\" + 0.003*\"obvious\"'),\n",
+" (2,\n",
+" '0.002*\"led\" + 0.002*\"mechanism\" + 0.002*\"frank\" + 0.002*\"platform\" + 0.002*\"mormons\" + 0.002*\"concepts\" + 0.002*\"proton\" + 0.002*\"aeronautics\" + 0.002*\"header\" + 0.002*\"foreign\"'),\n",
+" (3,\n",
+" '0.004*\"objects\" + 0.003*\"activity\" + 0.003*\"manhattan\" + 0.003*\"obtained\" + 0.003*\"eyes\" + 0.003*\"education\" + 0.003*\"netters\" + 0.003*\"complex\" + 0.003*\"europe\" + 0.002*\"missions\"')]"
+]
+},
+"execution_count": 4,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "# check the topics\n",
 "lda.print_topics(4)"
@@ -179,7 +208,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 5,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -211,9 +240,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 6,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Dictionary(10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...)\n"
+]
+}
+],
 "source": [
 "# You can save the dictionary\n",
 "dictionary.save('newsgroup.dict')\n",
@@ -223,7 +260,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 7,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -234,7 +271,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 8,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -246,21 +283,38 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 15,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"WARNING:root:random_state not set so using default value\n",
+"WARNING:root:failed to load state from newsgroups.dict.state: [Errno 2] No such file or directory: 'newsgroups.dict.state'\n"
+]
+}
+],
 "source": [
 "# You can optionally save the dictionary \n",
 "\n",
 "dictionary.save('newsgroups.dict')\n",
-"lda = LdaModel.load('newsgroups.lda')"
+"lda = LdaModel.load('newsgroups.dict')"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 16,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Dictionary(10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...)\n"
+]
+}
+],
 "source": [
 "# We can print the dictionary, it is a mappying of id and tokens\n",
 "\n",
@@ -269,7 +323,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 17,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -279,7 +333,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 18,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -292,9 +346,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 19,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[(0, 0.24093628445650234), (1, 0.5700978153855775), (2, 0.10438175896914427), (3, 0.1598114653031772), (4, 0.722808853369507), (5, 0.24093628445650234)]\n"
+]
+}
+],
 "source": [
 "#print tf-idf of first document\n",
 "print(corpus_tfidf[0])"
@@ -302,7 +364,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 20,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -315,9 +377,27 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 21,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[(0,\n",
+" '0.011*\"thanks\" + 0.010*\"targa\" + 0.008*\"mary\" + 0.008*\"western\" + 0.007*\"craig\" + 0.007*\"jeff\" + 0.006*\"yayayay\" + 0.006*\"phobos\" + 0.005*\"unfortunately\" + 0.005*\"martian\"'),\n",
+" (1,\n",
+" '0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"'),\n",
+" (2,\n",
+" '0.009*\"whatever\" + 0.009*\"baptist\" + 0.007*\"cheers\" + 0.007*\"kent\" + 0.006*\"khomeini\" + 0.006*\"davidian\" + 0.005*\"gerald\" + 0.005*\"bull\" + 0.005*\"sorry\" + 0.005*\"jesus\"'),\n",
+" (3,\n",
+" '0.005*\"pd\" + 0.004*\"baltimore\" + 0.004*\"also\" + 0.003*\"ipx\" + 0.003*\"dam\" + 0.003*\"feiner\" + 0.003*\"foley\" + 0.003*\"ideally\" + 0.003*\"srgp\" + 0.003*\"thank\"')]"
+]
+},
+"execution_count": 21,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "# check the topics\n",
 "lda_model.print_topics(4)"
@@ -325,9 +405,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 22,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[(0, 0.09401487), (1, 0.08991001), (2, 0.08514047), (3, 0.7309346)]\n"
+]
+}
+],
 "source": [
 "# check the lsa vector for the first document\n",
 "corpus_lda = lda_model[corpus_tfidf]\n",
@@ -336,9 +424,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 24,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[('lord', 1), ('god', 2)]\n"
+]
+}
+],
 "source": [
 "#predict topics of a new doc\n",
 "new_doc = \"God is love and God is the Lord\"\n",
@@ -349,9 +445,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 25,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[(0, 0.06678458), (1, 0.8006135), (2, 0.06974816), (3, 0.062853776)]\n"
+]
+}
+],
 "source": [
 "#transform into LDA space\n",
 "lda_vector = lda_model[bow_vector]\n",
@@ -360,9 +464,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 26,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"\n"
+]
+}
+],
 "source": [
 "# print the document's single most prominent LDA topic\n",
 "print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))"
@@ -370,9 +482,18 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 27,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[(0, 0.110989906), (1, 0.670005), (2, 0.11422917), (3, 0.10477593)]\n",
+"0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"\n"
+]
+}
+],
 "source": [
 "lda_vector_tfidf = lda_model[tfidf_model[bow_vector]]\n",
 "print(lda_vector_tfidf)\n",
@@ -389,7 +510,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 28,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -405,9 +526,27 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 29,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[(0,\n",
+" '0.769*\"god\" + 0.345*\"jesus\" + 0.235*\"bible\" + 0.203*\"christian\" + 0.149*\"christians\" + 0.108*\"christ\" + 0.089*\"well\" + 0.085*\"koresh\" + 0.081*\"kent\" + 0.080*\"christianity\"'),\n",
+" (1,\n",
+" '-0.863*\"thanks\" + -0.255*\"please\" + -0.160*\"hello\" + -0.153*\"hi\" + 0.123*\"god\" + -0.112*\"sorry\" + -0.088*\"could\" + -0.075*\"windows\" + -0.068*\"jpeg\" + -0.062*\"gif\"'),\n",
+" (2,\n",
+" '-0.779*\"well\" + 0.229*\"god\" + -0.164*\"yes\" + 0.153*\"thanks\" + -0.135*\"ico\" + -0.135*\"tek\" + -0.132*\"beauchaine\" + -0.132*\"queens\" + -0.132*\"bronx\" + -0.131*\"manhattan\"'),\n",
+" (3,\n",
+" '0.343*\"well\" + -0.335*\"ico\" + -0.334*\"tek\" + -0.328*\"bronx\" + -0.328*\"beauchaine\" + -0.328*\"queens\" + -0.325*\"manhattan\" + -0.305*\"com\" + -0.303*\"bob\" + -0.073*\"god\"')]"
+]
+},
+"execution_count": 29,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "# check the topics\n",
 "lsi_model.print_topics(4)"
@@ -415,9 +554,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 30,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[(0, 0.24093628445650234), (1, 0.5700978153855775), (2, 0.10438175896914427), (3, 0.1598114653031772), (4, 0.722808853369507), (5, 0.24093628445650234)]\n"
+]
+}
+],
 "source": [
 "# check the lsi vector for the first document\n",
 "print(corpus_tfidf[0])"

View File

@@ -84,17 +84,17 @@
"\n", "\n",
"Each of these files contains 28 columns:\n", "Each of these files contains 28 columns:\n",
"\n", "\n",
"* essay_id: A unique identifier for each individual student essay\n", "* **essay_id**: A unique identifier for each individual student essay\n",
"* essay_set: 1-8, an id for each set of essays\n", "* **essay_set**: 1-8, an id for each set of essays\n",
"* essay: The ascii text of a student's response\n", "* **essay**: The ascii text of a student's response\n",
"* rater1_domain1: Rater 1's domain 1 score; all essays have this\n", "* **rater1_domain1**: Rater 1's domain 1 score; all essays have this\n",
"* rater2_domain1: Rater 2's domain 1 score; all essays have this\n", "* **rater2_domain1**: Rater 2's domain 1 score; all essays have this\n",
"* rater3_domain1: Rater 3's domain 1 score; only some essays in set 8 have this.\n", "* **rater3_domain1**: Rater 3's domain 1 score; only some essays in set 8 have this.\n",
"* domain1_score: Resolved score between the raters; all essays have this\n", "* **domain1_score**: Resolved score between the raters; all essays have this\n",
"* rater1_domain2: Rater 1's domain 2 score; only essays in set 2 have this\n", "* **rater1_domain2**: Rater 1's domain 2 score; only essays in set 2 have this\n",
"* rater2_domain2: Rater 2's domain 2 score; only essays in set 2 have this\n", "* **rater2_domain2**: Rater 2's domain 2 score; only essays in set 2 have this\n",
"* domain2_score: Resolved score between the raters; only essays in set 2 have this\n", "* **domain2_score**: Resolved score between the raters; only essays in set 2 have this\n",
"* rater1_trait1 score - rater3_trait6 score: trait scores for sets 7-8\n", "* **rater1_trait1 score - rater3_trait6 score**: trait scores for sets 7-8\n",
"\n", "\n",
"The dataset is provided in the folder *data-kaggle/training_set_rel3.tsv*.\n", "The dataset is provided in the folder *data-kaggle/training_set_rel3.tsv*.\n",
"\n", "\n",
@@ -102,7 +102,7 @@
"\n", "\n",
"The dataset has been anonymized to remove personally identifying information from the essays using the Named Entity Recognizer (NER) from the Stanford Natural Language Processing group and a variety of other approaches. The relevant entities are identified in the text and then replaced with a string such as \"@PERSON1.\"\n", "The dataset has been anonymized to remove personally identifying information from the essays using the Named Entity Recognizer (NER) from the Stanford Natural Language Processing group and a variety of other approaches. The relevant entities are identified in the text and then replaced with a string such as \"@PERSON1.\"\n",
"\n", "\n",
"The entitities identified by NER are: \"PERSON\", \"ORGANIZATION\", \"LOCATION\", \"DATE\", \"TIME\", \"MONEY\", \"PERCENT\"\n", "The entities identified by NER are: \"PERSON\", \"ORGANIZATION\", \"LOCATION\", \"DATE\", \"TIME\", \"MONEY\", \"PERCENT\"\n",
"\n", "\n",
"Other replacements made: \"MONTH\" (any month name not tagged as a date by the NER), \"EMAIL\" (anything that looks like an e-mail address), \"NUM\" (word containing digits or non-alphanumeric symbols), and \"CAPS\" (any capitalized word that doesn't begin a sentence, except in essays where more than 20% of the characters are capitalized letters), \"DR\" (any word following \"Dr.\" with or without the period, with any capitalization, that doesn't fall into any of the above), \"CITY\" and \"STATE\" (various cities and states)." "Other replacements made: \"MONTH\" (any month name not tagged as a date by the NER), \"EMAIL\" (anything that looks like an e-mail address), \"NUM\" (word containing digits or non-alphanumeric symbols), and \"CAPS\" (any capitalized word that doesn't begin a sentence, except in essays where more than 20% of the characters are capitalized letters), \"DR\" (any word following \"Dr.\" with or without the period, with any capitalization, that doesn't fall into any of the above), \"CITY\" and \"STATE\" (various cities and states)."
] ]
@@ -427,7 +427,7 @@
" ])),\n", " ])),\n",
" ('lda', Pipeline([ \n", " ('lda', Pipeline([ \n",
" ('count', CountVectorizer(tokenizer=custom_tokenizer)),\n", " ('count', CountVectorizer(tokenizer=custom_tokenizer)),\n",
" ('lda', LatentDirichletAllocation(n_topics=4, max_iter=5,\n", " ('lda', LatentDirichletAllocation(n_components=4, max_iter=5,\n",
" learning_method='online', \n", " learning_method='online', \n",
" learning_offset=50.,\n", " learning_offset=50.,\n",
" random_state=0))\n", " random_state=0))\n",