mirror of https://github.com/gsi-upm/sitc synced 2025-09-19 05:12:20 +00:00

Compare commits

7 Commits

Author SHA1 Message Date
Dani Vera
19ea5dff09 Update 4_1_Lexical_Processing.ipynb 2019-11-26 15:14:40 +01:00
Carlos A. Iglesias
e70689072f Merge pull request #4 from gsi-upm/dveni-patch-1
Update 3_3_Data_Munging_with_Pandas.ipynb
2019-09-19 10:46:19 +02:00
Dani Vera
344e054ba4 Update 3_3_Data_Munging_with_Pandas.ipynb
np.size is used in the last column. This computes the size of the series (of the non-null values, I believe), but I think the intent is to compute the number of survivors, for which np.sum could be used.
2019-09-18 15:39:16 +02:00
Carlos A. Iglesias
2c8238f1f2 Changed dictionary name 2019-04-23 10:39:56 +02:00
Carlos A. Iglesias
e42299ac7a changed n_topics to n_components for compatibility 2019-04-22 23:50:16 +02:00
Oscar Araque
9d1b88dfea Makefile updated 2019-03-28 14:13:22 +01:00
Oscar Araque
ae3c34f94c description about parameter h added 2019-03-21 19:35:50 +01:00
6 changed files with 208 additions and 57 deletions

View File

@@ -1,10 +1,11 @@
 FOLDER:=.
+ERROR:=255
 exec:
-	find $(FOLDER) -iname '*.ipynb' -print0 | xargs -n 1 -0 sh -c 'jupyter nbconvert --execute --ClearOutputPreprocessor.enabled=True --inplace $$0 || exit 255'
+	find $(FOLDER) -iname '*.ipynb' -print0 | xargs -n 1 -0 sh -c 'jupyter nbconvert --execute --ClearOutputPreprocessor.enabled=True --inplace $$0 || exit $(ERROR)'
 clean:
-	find $(FOLDER) -iname '*.ipynb' -print0 | xargs -n 1 -0 sh -c 'jupyter nbconvert --ClearOutputPreprocessor.enabled=True --inplace $$0 || exit 255'
+	find $(FOLDER) -iname '*.ipynb' -print0 | xargs -n 1 -0 sh -c 'nbstripout $$0 || exit $(ERROR)'
 .PHONY: exec clean

View File

@@ -437,7 +437,7 @@
"\n", "\n",
"#Show mean Age, mean SibSp, and number of passengers older than 25 that survived, grouped by Passenger Class and Sex\n", "#Show mean Age, mean SibSp, and number of passengers older than 25 that survived, grouped by Passenger Class and Sex\n",
"df[(df.Age > 25 & (df.Survived == 1))].groupby(['Pclass', 'Sex'])['Age','SibSp','Survived'].agg({'Age': np.mean, \n", "df[(df.Age > 25 & (df.Survived == 1))].groupby(['Pclass', 'Sex'])['Age','SibSp','Survived'].agg({'Age': np.mean, \n",
" 'SibSp': np.mean, 'Survived': np.size})" " 'SibSp': np.mean, 'Survived': np.sum})"
] ]
}, },
{ {
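The commit message in the list above explains the intent: np.size counts the rows in each group whether or not the passenger survived, while np.sum over the 0/1 Survived flags counts only the survivors. A minimal sketch on invented data showing the difference (the column names follow the Titanic notebook; the DataFrame itself is made up):

import numpy as np
import pandas as pd

# Invented sample shaped like the Titanic data
df = pd.DataFrame({
    'Pclass':   [1, 1, 3, 3, 3],
    'Sex':      ['female', 'male', 'female', 'male', 'male'],
    'Survived': [1, 0, 1, 0, 1],  # 0/1 survival flags
})

g = df.groupby(['Pclass', 'Sex'])['Survived']
print(g.agg(np.size))  # rows per group, survivors or not
print(g.agg(np.sum))   # sum of the 0/1 flags: survivors per group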

View File

@@ -275,7 +275,10 @@
"print(classification_report(y_test, lr_preds))\n", "print(classification_report(y_test, lr_preds))\n",
"\n", "\n",
"plt.figure(figsize=(10,7))\n", "plt.figure(figsize=(10,7))\n",
"plot_decision_surface(X, y, lr)" "# This methods outputs a visualization\n",
"# the h parameter adjusts the precision of the visualization\n",
"# if you find memory errors, set h to a higher value (e.g., h=0.1)\n",
"plot_decision_surface(X, y, lr, h=0.02) "
] ]
}, },
{ {
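The plot_decision_surface helper itself is not part of this diff, but decision-surface plots are conventionally drawn by classifying every point of an np.meshgrid whose step size is h, which is why the grid (and memory use) grows roughly as (range/h)^2 as h shrinks. A hedged sketch of such a helper, assuming two features and a fitted scikit-learn classifier; the repository's actual implementation may differ:

import numpy as np
import matplotlib.pyplot as plt

def plot_decision_surface(X, y, clf, h=0.02):
    # Grid over the feature space; a larger h (e.g., 0.1) means far fewer points
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Classify every grid point and colour the resulting regions
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')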

View File

@@ -326,7 +326,7 @@
"def preprocess(words, type='doc'):\n", "def preprocess(words, type='doc'):\n",
" if (type == 'tweet'):\n", " if (type == 'tweet'):\n",
" tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)\n", " tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)\n",
" tokens = tknzr.tokenize(tweet)\n", " tokens = tknzr.tokenize(words)\n",
" else:\n", " else:\n",
" tokens = nltk.word_tokenize(words.lower())\n", " tokens = nltk.word_tokenize(words.lower())\n",
" porter = nltk.PorterStemmer()\n", " porter = nltk.PorterStemmer()\n",

View File

@@ -76,9 +76,20 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 1,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"data": {
+"text/plain": [
+"(2034, 2807)"
+]
+},
+"execution_count": 1,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "from sklearn.datasets import fetch_20newsgroups\n",
 "\n",
@@ -120,7 +131,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 2,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -148,7 +159,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 3,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -162,9 +173,27 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 4,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[(0,\n",
+" '0.007*\"car\" + 0.006*\"increased\" + 0.006*\"closely\" + 0.006*\"groups\" + 0.006*\"center\" + 0.006*\"88\" + 0.006*\"offer\" + 0.005*\"archie\" + 0.005*\"beginning\" + 0.005*\"comets\"'),\n",
+" (1,\n",
+" '0.005*\"allow\" + 0.005*\"discuss\" + 0.005*\"condition\" + 0.004*\"certain\" + 0.004*\"member\" + 0.004*\"manipulation\" + 0.004*\"little\" + 0.003*\"proposal\" + 0.003*\"heavily\" + 0.003*\"obvious\"'),\n",
+" (2,\n",
+" '0.002*\"led\" + 0.002*\"mechanism\" + 0.002*\"frank\" + 0.002*\"platform\" + 0.002*\"mormons\" + 0.002*\"concepts\" + 0.002*\"proton\" + 0.002*\"aeronautics\" + 0.002*\"header\" + 0.002*\"foreign\"'),\n",
+" (3,\n",
+" '0.004*\"objects\" + 0.003*\"activity\" + 0.003*\"manhattan\" + 0.003*\"obtained\" + 0.003*\"eyes\" + 0.003*\"education\" + 0.003*\"netters\" + 0.003*\"complex\" + 0.003*\"europe\" + 0.002*\"missions\"')]"
+]
+},
+"execution_count": 4,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "# check the topics\n",
 "lda.print_topics(4)"
@@ -179,7 +208,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 5,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -211,9 +240,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 6,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Dictionary(10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...)\n"
+]
+}
+],
 "source": [
 "# You can save the dictionary\n",
 "dictionary.save('newsgroup.dict')\n",
@@ -223,7 +260,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 7,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -234,7 +271,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 8,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -246,21 +283,38 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 15,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"WARNING:root:random_state not set so using default value\n",
+"WARNING:root:failed to load state from newsgroups.dict.state: [Errno 2] No such file or directory: 'newsgroups.dict.state'\n"
+]
+}
+],
 "source": [
 "# You can optionally save the dictionary \n",
 "\n",
 "dictionary.save('newsgroups.dict')\n",
-"lda = LdaModel.load('newsgroups.lda')"
+"lda = LdaModel.load('newsgroups.dict')"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 16,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Dictionary(10913 unique tokens: ['cel', 'ds', 'hi', 'nothing', 'prj']...)\n"
+]
+}
+],
 "source": [
 "# We can print the dictionary, it is a mappying of id and tokens\n",
 "\n",
@@ -269,7 +323,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 17,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -279,7 +333,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 18,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -292,9 +346,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 19,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[(0, 0.24093628445650234), (1, 0.5700978153855775), (2, 0.10438175896914427), (3, 0.1598114653031772), (4, 0.722808853369507), (5, 0.24093628445650234)]\n"
+]
+}
+],
 "source": [
 "#print tf-idf of first document\n",
 "print(corpus_tfidf[0])"
@@ -302,7 +364,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 20,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -315,9 +377,27 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 21,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[(0,\n",
+" '0.011*\"thanks\" + 0.010*\"targa\" + 0.008*\"mary\" + 0.008*\"western\" + 0.007*\"craig\" + 0.007*\"jeff\" + 0.006*\"yayayay\" + 0.006*\"phobos\" + 0.005*\"unfortunately\" + 0.005*\"martian\"'),\n",
+" (1,\n",
+" '0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"'),\n",
+" (2,\n",
+" '0.009*\"whatever\" + 0.009*\"baptist\" + 0.007*\"cheers\" + 0.007*\"kent\" + 0.006*\"khomeini\" + 0.006*\"davidian\" + 0.005*\"gerald\" + 0.005*\"bull\" + 0.005*\"sorry\" + 0.005*\"jesus\"'),\n",
+" (3,\n",
+" '0.005*\"pd\" + 0.004*\"baltimore\" + 0.004*\"also\" + 0.003*\"ipx\" + 0.003*\"dam\" + 0.003*\"feiner\" + 0.003*\"foley\" + 0.003*\"ideally\" + 0.003*\"srgp\" + 0.003*\"thank\"')]"
+]
+},
+"execution_count": 21,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "# check the topics\n",
 "lda_model.print_topics(4)"
@@ -325,9 +405,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 22,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[(0, 0.09401487), (1, 0.08991001), (2, 0.08514047), (3, 0.7309346)]\n"
+]
+}
+],
 "source": [
 "# check the lsa vector for the first document\n",
 "corpus_lda = lda_model[corpus_tfidf]\n",
@@ -336,9 +424,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 24,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[('lord', 1), ('god', 2)]\n"
+]
+}
+],
 "source": [
 "#predict topics of a new doc\n",
 "new_doc = \"God is love and God is the Lord\"\n",
@@ -349,9 +445,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 25,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[(0, 0.06678458), (1, 0.8006135), (2, 0.06974816), (3, 0.062853776)]\n"
+]
+}
+],
 "source": [
 "#transform into LDA space\n",
 "lda_vector = lda_model[bow_vector]\n",
@@ -360,9 +464,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 26,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"\n"
+]
+}
+],
 "source": [
 "# print the document's single most prominent LDA topic\n",
 "print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))"
@@ -370,9 +482,18 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 27,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[(0, 0.110989906), (1, 0.670005), (2, 0.11422917), (3, 0.10477593)]\n",
+"0.007*\"islam\" + 0.006*\"koresh\" + 0.006*\"moon\" + 0.006*\"bible\" + 0.006*\"plane\" + 0.006*\"ns\" + 0.005*\"zoroastrians\" + 0.005*\"joy\" + 0.005*\"lucky\" + 0.005*\"ssrt\"\n"
+]
+}
+],
 "source": [
 "lda_vector_tfidf = lda_model[tfidf_model[bow_vector]]\n",
 "print(lda_vector_tfidf)\n",
@@ -389,7 +510,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 28,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -405,9 +526,27 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 29,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[(0,\n",
+" '0.769*\"god\" + 0.345*\"jesus\" + 0.235*\"bible\" + 0.203*\"christian\" + 0.149*\"christians\" + 0.108*\"christ\" + 0.089*\"well\" + 0.085*\"koresh\" + 0.081*\"kent\" + 0.080*\"christianity\"'),\n",
+" (1,\n",
+" '-0.863*\"thanks\" + -0.255*\"please\" + -0.160*\"hello\" + -0.153*\"hi\" + 0.123*\"god\" + -0.112*\"sorry\" + -0.088*\"could\" + -0.075*\"windows\" + -0.068*\"jpeg\" + -0.062*\"gif\"'),\n",
+" (2,\n",
+" '-0.779*\"well\" + 0.229*\"god\" + -0.164*\"yes\" + 0.153*\"thanks\" + -0.135*\"ico\" + -0.135*\"tek\" + -0.132*\"beauchaine\" + -0.132*\"queens\" + -0.132*\"bronx\" + -0.131*\"manhattan\"'),\n",
+" (3,\n",
+" '0.343*\"well\" + -0.335*\"ico\" + -0.334*\"tek\" + -0.328*\"bronx\" + -0.328*\"beauchaine\" + -0.328*\"queens\" + -0.325*\"manhattan\" + -0.305*\"com\" + -0.303*\"bob\" + -0.073*\"god\"')]"
+]
+},
+"execution_count": 29,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "# check the topics\n",
 "lsi_model.print_topics(4)"
@@ -415,9 +554,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 30,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[(0, 0.24093628445650234), (1, 0.5700978153855775), (2, 0.10438175896914427), (3, 0.1598114653031772), (4, 0.722808853369507), (5, 0.24093628445650234)]\n"
+]
+}
+],
 "source": [
 "# check the lsi vector for the first document\n",
 "print(corpus_tfidf[0])"

View File

@@ -84,17 +84,17 @@
"\n", "\n",
"Each of these files contains 28 columns:\n", "Each of these files contains 28 columns:\n",
"\n", "\n",
"* essay_id: A unique identifier for each individual student essay\n", "* **essay_id**: A unique identifier for each individual student essay\n",
"* essay_set: 1-8, an id for each set of essays\n", "* **essay_set**: 1-8, an id for each set of essays\n",
"* essay: The ascii text of a student's response\n", "* **essay**: The ascii text of a student's response\n",
"* rater1_domain1: Rater 1's domain 1 score; all essays have this\n", "* **rater1_domain1**: Rater 1's domain 1 score; all essays have this\n",
"* rater2_domain1: Rater 2's domain 1 score; all essays have this\n", "* **rater2_domain1**: Rater 2's domain 1 score; all essays have this\n",
"* rater3_domain1: Rater 3's domain 1 score; only some essays in set 8 have this.\n", "* **rater3_domain1**: Rater 3's domain 1 score; only some essays in set 8 have this.\n",
"* domain1_score: Resolved score between the raters; all essays have this\n", "* **domain1_score**: Resolved score between the raters; all essays have this\n",
"* rater1_domain2: Rater 1's domain 2 score; only essays in set 2 have this\n", "* **rater1_domain2**: Rater 1's domain 2 score; only essays in set 2 have this\n",
"* rater2_domain2: Rater 2's domain 2 score; only essays in set 2 have this\n", "* **rater2_domain2**: Rater 2's domain 2 score; only essays in set 2 have this\n",
"* domain2_score: Resolved score between the raters; only essays in set 2 have this\n", "* **domain2_score**: Resolved score between the raters; only essays in set 2 have this\n",
"* rater1_trait1 score - rater3_trait6 score: trait scores for sets 7-8\n", "* **rater1_trait1 score - rater3_trait6 score**: trait scores for sets 7-8\n",
"\n", "\n",
"The dataset is provided in the folder *data-kaggle/training_set_rel3.tsv*.\n", "The dataset is provided in the folder *data-kaggle/training_set_rel3.tsv*.\n",
"\n", "\n",
@@ -102,7 +102,7 @@
"\n", "\n",
"The dataset has been anonymized to remove personally identifying information from the essays using the Named Entity Recognizer (NER) from the Stanford Natural Language Processing group and a variety of other approaches. The relevant entities are identified in the text and then replaced with a string such as \"@PERSON1.\"\n", "The dataset has been anonymized to remove personally identifying information from the essays using the Named Entity Recognizer (NER) from the Stanford Natural Language Processing group and a variety of other approaches. The relevant entities are identified in the text and then replaced with a string such as \"@PERSON1.\"\n",
"\n", "\n",
"The entitities identified by NER are: \"PERSON\", \"ORGANIZATION\", \"LOCATION\", \"DATE\", \"TIME\", \"MONEY\", \"PERCENT\"\n", "The entities identified by NER are: \"PERSON\", \"ORGANIZATION\", \"LOCATION\", \"DATE\", \"TIME\", \"MONEY\", \"PERCENT\"\n",
"\n", "\n",
"Other replacements made: \"MONTH\" (any month name not tagged as a date by the NER), \"EMAIL\" (anything that looks like an e-mail address), \"NUM\" (word containing digits or non-alphanumeric symbols), and \"CAPS\" (any capitalized word that doesn't begin a sentence, except in essays where more than 20% of the characters are capitalized letters), \"DR\" (any word following \"Dr.\" with or without the period, with any capitalization, that doesn't fall into any of the above), \"CITY\" and \"STATE\" (various cities and states)." "Other replacements made: \"MONTH\" (any month name not tagged as a date by the NER), \"EMAIL\" (anything that looks like an e-mail address), \"NUM\" (word containing digits or non-alphanumeric symbols), and \"CAPS\" (any capitalized word that doesn't begin a sentence, except in essays where more than 20% of the characters are capitalized letters), \"DR\" (any word following \"Dr.\" with or without the period, with any capitalization, that doesn't fall into any of the above), \"CITY\" and \"STATE\" (various cities and states)."
] ]
@@ -427,7 +427,7 @@
" ])),\n", " ])),\n",
" ('lda', Pipeline([ \n", " ('lda', Pipeline([ \n",
" ('count', CountVectorizer(tokenizer=custom_tokenizer)),\n", " ('count', CountVectorizer(tokenizer=custom_tokenizer)),\n",
" ('lda', LatentDirichletAllocation(n_topics=4, max_iter=5,\n", " ('lda', LatentDirichletAllocation(n_components=4, max_iter=5,\n",
" learning_method='online', \n", " learning_method='online', \n",
" learning_offset=50.,\n", " learning_offset=50.,\n",
" random_state=0))\n", " random_state=0))\n",