mirror of https://github.com/gsi-upm/sitc synced 2024-12-26 05:18:11 +00:00

Included installation of nltk

This commit is contained in:
cif2cif 2017-04-20 12:56:39 +02:00
parent cb40531dc4
commit e88e144a50
5 changed files with 140 additions and 987 deletions

View File

@ -67,7 +67,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": null,
"metadata": {
"collapsed": false
},
@ -102,21 +102,22 @@
"\n",
"\n",
"In order to use nltk, we should download first the lexical resources we are going to use. We can updated them later. For this, you need:\n",
"* install nltk. Execute 'conda install nltk'\n",
"* import nltk\n",
"* Run *nltk.download()* (the first time we use it). A window will appear. You should select just the corpus 'book' and press download.\n",
"If you inspect the window, you can get an overview of available lexical resources (corpora, lexicons and grammars). For example, you can find some relevant sentiment lexicons in corpora (SentiWordNet, Sentence Polarity Dataset, Vader, Opinion Lexicon or VADER Sentiment Lexicon)."
"If you inspect the window, you can get an overview of available lexical resources (corpora, lexicons and grammars). For example, you can find some relevant sentiment lexicons in corpora (SentiWordNet, Sentence Polarity Dataset, Vader, Opinion Lexicon or VADER Sentiment Lexicon). Don't forget to close the window once the data has been downloaded."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"import nltk\n",
"ntlk.download()"
"nltk.download()"
]
},
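
For headless environments where the downloader GUI cannot open, the same resources can be fetched non-interactively; a minimal sketch, assuming the 'book' collection id described above:

```python
import nltk

# Non-interactive download of the 'book' collection (no GUI window needed);
# individual resources such as 'punkt' or 'stopwords' can be named instead.
nltk.download('book')
```
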
{
@ -169,19 +170,11 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['I purchased this monitor because of budgetary concerns.', 'This item was the most inexpensive 17 inch monitor \\navailable to me at the time I made the purchase.', 'My overall experience with this monitor was very poor.', \"When the \\nscreen wasn't contracting or glitching the overall picture quality was poor to fair.\", \"I've viewed numerous different \\nmonitor models since I 'm a college student and this particular monitor had as poor of picture quality as \\nany I 've seen.\"]\n"
]
}
],
"outputs": [],
"source": [
"from nltk.tokenize import sent_tokenize, word_tokenize\n",
"\n",
@ -205,20 +198,12 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['I', 'purchased', 'this', 'monitor', 'because', 'of', 'budgetary', 'concerns', '.'], ['This', 'item', 'was', 'the', 'most', 'inexpensive', '17', 'inch', 'monitor', 'available', 'to', 'me', 'at', 'the', 'time', 'I', 'made', 'the', 'purchase', '.'], ['My', 'overall', 'experience', 'with', 'this', 'monitor', 'was', 'very', 'poor', '.'], ['When', 'the', 'screen', 'was', \"n't\", 'contracting', 'or', 'glitching', 'the', 'overall', 'picture', 'quality', 'was', 'poor', 'to', 'fair', '.'], ['I', \"'ve\", 'viewed', 'numerous', 'different', 'monitor', 'models', 'since', 'I', \"'m\", 'a', 'college', 'student', 'and', 'this', 'particular', 'monitor', 'had', 'as', 'poor', 'of', 'picture', 'quality', 'as', 'any', 'I', \"'ve\", 'seen', '.']]\n"
]
}
],
"outputs": [],
"source": [
"words = [word_tokenize(t) for t in sent_tokenize(review)]\n",
"print(words)"
@ -233,19 +218,11 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['I', 'purchased', 'this', 'monitor', 'because', 'of', 'budgetary', 'concerns', '.', 'This', 'item', 'was', 'the', 'most', 'inexpensive', '17', 'inch', 'monitor', 'available', 'to', 'me', 'at', 'the', 'time', 'I', 'made', 'the', 'purchase', '.', 'My', 'overall', 'experience', 'with', 'this', 'monitor', 'was', 'very', 'poor', '.', 'When', 'the', 'screen', 'was', \"n't\", 'contracting', 'or', 'glitching', 'the', 'overall', 'picture', 'quality', 'was', 'poor', 'to', 'fair', '.', 'I', \"'ve\", 'viewed', 'numerous', 'different', 'monitor', 'models', 'since', 'I', \"'m\", 'a', 'college', 'student', 'and', 'this', 'particular', 'monitor', 'had', 'as', 'poor', 'of', 'picture', 'quality', 'as', 'any', 'I', \"'ve\", 'seen', '.']\n"
]
}
],
"outputs": [],
"source": [
"words = word_tokenize(review)\n",
"print(words)"
@ -261,20 +238,11 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"With TweetTokenizer Lady Gaga is actually at the Britney Spears Femme Fatale Concert tonight ! ! ! She still listens to her music ! ! ! WOW ! ! ! #ladygaga #britney\n",
"With word_tokenizer @ concert Lady Gaga is actually at the Britney Spears Femme Fatale Concert tonight ! ! ! She still listens to her music ! ! ! ! WOW ! ! ! # ladygaga # britney\n"
]
}
],
"outputs": [],
"source": [
"from nltk.tokenize import TweetTokenizer\n",
"tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)\n",
@ -299,26 +267,11 @@
},
{
"cell_type": "code",
"execution_count": 75,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Porter: boy children are have is ha Madrid\n",
"Execution time: 0.00093841552734375\n",
"Lancaster: boy childr ar hav is has madrid\n",
"Execution time: 0.0014300346374511719\n",
"WordNet: boy child are have is ha Madrid\n",
"Execution time: 0.0008304119110107422\n",
"SnowBall: boy children are have is has madrid\n",
"Execution time: 0.0017843246459960938\n"
]
}
],
"outputs": [],
"source": [
"from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer\n",
"from nltk.stem.snowball import EnglishStemmer\n",
@ -362,19 +315,11 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WordNet: be cry be have have\n"
]
}
],
"outputs": [],
"source": [
"verbs = \"are crying is have has\"\n",
"print(\"WordNet: \" + \" \".join([wordnet.lemmatize(w, pos='v') for w in word_tokenize(verbs)]))\n"
@ -395,20 +340,11 @@
},
{
"cell_type": "code",
"execution_count": 82,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['i', 'purchas', 'thi', 'monitor', 'becaus', 'of', 'budgetari', 'concern', '.', 'thi', 'item', 'wa', 'the', 'most', 'inexpens', '17', 'inch', 'monitor', 'avail', 'to', 'me', 'at', 'the', 'time', 'i', 'made', 'the', 'purchas', '.', 'my', 'overal', 'experi', 'with', 'thi', 'monitor', 'wa', 'veri', 'poor', '.', 'when', 'the', 'screen', 'wa', \"n't\", 'contract', 'or', 'glitch', 'the', 'overal', 'pictur', 'qualiti', 'wa', 'poor', 'to', 'fair', '.', 'i', \"'ve\", 'view', 'numer', 'differ', 'monitor', 'model', 'sinc', 'i', \"'m\", 'a', 'colleg', 'student', 'and', 'thi', 'particular', 'monitor', 'had', 'as', 'poor', 'of', 'pictur', 'qualiti', 'as', 'ani', 'i', \"'ve\", 'seen', '.']\n",
"['Ladi', 'Gaga', 'is', 'actual', 'at', 'the', 'Britney', 'Spear', 'Femm', 'Fatal', 'Concert', 'tonight', '!', '!', '!', 'She', 'still', 'listen', 'to', 'her', 'music', '!', '!', '!', 'WOW', '!', '!', '!', '#ladygaga', '#britney']\n"
]
}
],
"outputs": [],
"source": [
"def preprocess(words, type='doc'):\n",
" if (type == 'tweet'):\n",
@ -439,19 +375,11 @@
},
{
"cell_type": "code",
"execution_count": 86,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn']\n"
]
}
],
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
"\n",
@ -461,20 +389,11 @@
},
{
"cell_type": "code",
"execution_count": 89,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['purchas', 'thi', 'monitor', 'becaus', 'budgetari', 'concern', '.', 'thi', 'item', 'wa', 'inexpens', '17', 'inch', 'monitor', 'avail', 'time', 'made', 'purchas', '.', 'overal', 'experi', 'thi', 'monitor', 'wa', 'veri', 'poor', '.', 'screen', 'wa', \"n't\", 'contract', 'glitch', 'overal', 'pictur', 'qualiti', 'wa', 'poor', 'fair', '.', \"'ve\", 'view', 'numer', 'differ', 'monitor', 'model', 'sinc', \"'m\", 'colleg', 'student', 'thi', 'particular', 'monitor', 'poor', 'pictur', 'qualiti', 'ani', \"'ve\", 'seen', '.']\n",
"['Ladi', 'Gaga', 'actual', 'Britney', 'Spear', 'Femm', 'Fatal', 'Concert', 'tonight', '!', '!', '!', 'She', 'still', 'listen', 'music', '!', '!', '!', 'WOW', '!', '!', '!', '#ladygaga', '#britney']\n"
]
}
],
"outputs": [],
"source": [
"def preprocess(words, type='doc'):\n",
" if (type == 'tweet'):\n",
@ -508,20 +427,11 @@
},
{
"cell_type": "code",
"execution_count": 104,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['purchas', 'thi', 'monitor', 'becaus', 'budgetari', 'concern', 'thi', 'item', 'wa', 'inexpens', '17', 'inch', 'monitor', 'avail', 'time', 'made', 'purchas', 'overal', 'experi', 'thi', 'monitor', 'wa', 'veri', 'poor', 'screen', 'wa', \"n't\", 'contract', 'glitch', 'overal', 'pictur', 'qualiti', 'wa', 'poor', 'fair', \"'ve\", 'view', 'numer', 'differ', 'monitor', 'model', 'sinc', \"'m\", 'colleg', 'student', 'thi', 'particular', 'monitor', 'poor', 'pictur', 'qualiti', 'ani', \"'ve\", 'seen']\n",
"['Ladi', 'Gaga', 'actual', 'Britney', 'Spear', 'Femm', 'Fatal', 'Concert', 'tonight', 'She', 'still', 'listen', 'music', 'WOW', '#ladygaga', '#britney']\n"
]
}
],
"outputs": [],
"source": [
"import string\n",
"\n",
@ -563,22 +473,11 @@
},
{
"cell_type": "code",
"execution_count": 124,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Most frequent\n",
"[('.', 5), ('I', 5), ('the', 5), ('monitor', 5), ('was', 4), ('this', 3), ('poor', 3), ('as', 2), ('overall', 2), ('picture', 2)]\n",
"Least frequent\n",
"['models', '17', 'purchase', 'different', 'most', \"n't\", 'monitor', \"'m\", 'was', 'My']\n"
]
}
],
"outputs": [],
"source": [
"frec = nltk.FreqDist(nltk.word_tokenize(review))\n",
"print(\"Most frequent\")\n",
@ -636,7 +535,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
"version": "3.5.2"
}
},
"nbformat": 4,

View File

@ -61,7 +61,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {
"collapsed": false
},
@ -109,19 +109,11 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('I', 'PRON'), ('purchased', 'VERB'), ('this', 'DET'), ('Dell', 'NOUN'), ('monitor', 'NOUN'), ('because', 'ADP'), ('of', 'ADP'), ('budgetary', 'ADJ'), ('concerns', 'NOUN'), ('.', '.'), ('This', 'DET'), ('item', 'NOUN'), ('was', 'VERB'), ('the', 'DET'), ('most', 'ADV'), ('inexpensive', 'ADJ'), ('17', 'NUM'), ('inch', 'NOUN'), ('Apple', 'NOUN'), ('monitor', 'NOUN'), ('available', 'ADJ'), ('to', 'PRT'), ('me', 'PRON'), ('at', 'ADP'), ('the', 'DET'), ('time', 'NOUN'), ('I', 'PRON'), ('made', 'VERB'), ('the', 'DET'), ('purchase', 'NOUN'), ('.', '.'), ('My', 'PRON'), ('overall', 'ADJ'), ('experience', 'NOUN'), ('with', 'ADP'), ('this', 'DET'), ('monitor', 'NOUN'), ('was', 'VERB'), ('very', 'ADV'), ('poor', 'ADJ'), ('.', '.'), ('When', 'ADV'), ('the', 'DET'), ('screen', 'NOUN'), ('was', 'VERB'), (\"n't\", 'ADV'), ('contracting', 'VERB'), ('or', 'CONJ'), ('glitching', 'VERB'), ('the', 'DET'), ('overall', 'ADJ'), ('picture', 'NOUN'), ('quality', 'NOUN'), ('was', 'VERB'), ('poor', 'ADJ'), ('to', 'PRT'), ('fair', 'VERB'), ('.', '.'), ('I', 'PRON'), (\"'ve\", 'VERB'), ('viewed', 'VERB'), ('numerous', 'ADJ'), ('different', 'ADJ'), ('monitor', 'NOUN'), ('models', 'NOUN'), ('since', 'ADP'), ('I', 'PRON'), (\"'m\", 'VERB'), ('a', 'DET'), ('college', 'NOUN'), ('student', 'NOUN'), ('at', 'ADP'), ('UPM', 'NOUN'), ('in', 'ADP'), ('Madrid', 'NOUN'), ('and', 'CONJ'), ('this', 'DET'), ('particular', 'ADJ'), ('monitor', 'NOUN'), ('had', 'VERB'), ('as', 'ADP'), ('poor', 'ADJ'), ('of', 'ADP'), ('picture', 'NOUN'), ('quality', 'NOUN'), ('as', 'ADP'), ('any', 'DET'), ('I', 'PRON'), (\"'ve\", 'VERB'), ('seen', 'VERB'), ('.', '.')]\n"
]
}
],
"outputs": [],
"source": [
"from nltk import pos_tag, word_tokenize\n",
"print (pos_tag(word_tokenize(review), tagset='universal'))"
@ -136,19 +128,11 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['I', 'purchase', 'Dell', 'monitor', 'because', 'of', 'budgetary', 'concern', 'item', 'be', 'most', 'inexpensive', '17', 'inch', 'Apple', 'monitor', 'available', 'me', 'at', 'time', 'I', 'make', 'purchase', 'My', 'overall', 'experience', 'with', 'monitor', 'be', 'very', 'poor', 'When', 'screen', 'be', \"n't\", 'contract', 'or', 'glitching', 'overall', 'picture', 'quality', 'be', 'poor', 'fair', 'I', \"'ve\", 'view', 'numerous', 'different', 'monitor', 'model', 'since', 'I', \"'m\", 'college', 'student', 'at', 'UPM', 'in', 'Madrid', 'and', 'particular', 'monitor', 'have', 'a', 'poor', 'of', 'picture', 'quality', 'a', 'I', \"'ve\", 'see']\n"
]
}
],
"outputs": [],
"source": [
"from nltk.stem import WordNetLemmatizer\n",
"\n",
@ -177,110 +161,11 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(S\n",
" I/PRP\n",
" purchased/VBD\n",
" this/DT\n",
" (ORGANIZATION Dell/NNP)\n",
" monitor/NN\n",
" because/IN\n",
" of/IN\n",
" budgetary/JJ\n",
" concerns/NNS\n",
" ./.\n",
" This/DT\n",
" item/NN\n",
" was/VBD\n",
" the/DT\n",
" most/RBS\n",
" inexpensive/JJ\n",
" 17/CD\n",
" inch/NN\n",
" Apple/NNP\n",
" monitor/NN\n",
" available/JJ\n",
" to/TO\n",
" me/PRP\n",
" at/IN\n",
" the/DT\n",
" time/NN\n",
" I/PRP\n",
" made/VBD\n",
" the/DT\n",
" purchase/NN\n",
" ./.\n",
" My/PRP$\n",
" overall/JJ\n",
" experience/NN\n",
" with/IN\n",
" this/DT\n",
" monitor/NN\n",
" was/VBD\n",
" very/RB\n",
" poor/JJ\n",
" ./.\n",
" When/WRB\n",
" the/DT\n",
" screen/NN\n",
" was/VBD\n",
" n't/RB\n",
" contracting/VBG\n",
" or/CC\n",
" glitching/VBG\n",
" the/DT\n",
" overall/JJ\n",
" picture/NN\n",
" quality/NN\n",
" was/VBD\n",
" poor/JJ\n",
" to/TO\n",
" fair/VB\n",
" ./.\n",
" I/PRP\n",
" 've/VBP\n",
" viewed/VBN\n",
" numerous/JJ\n",
" different/JJ\n",
" monitor/NN\n",
" models/NNS\n",
" since/IN\n",
" I/PRP\n",
" 'm/VBP\n",
" a/DT\n",
" college/NN\n",
" student/NN\n",
" at/IN\n",
" (ORGANIZATION UPM/NNP)\n",
" in/IN\n",
" (GPE Madrid/NNP)\n",
" and/CC\n",
" this/DT\n",
" particular/JJ\n",
" monitor/NN\n",
" had/VBD\n",
" as/IN\n",
" poor/JJ\n",
" of/IN\n",
" picture/NN\n",
" quality/NN\n",
" as/IN\n",
" any/DT\n",
" I/PRP\n",
" 've/VBP\n",
" seen/VBN\n",
" ./.)\n"
]
}
],
"outputs": [],
"source": [
"from nltk import ne_chunk, pos_tag, word_tokenize\n",
"ne_tagged = ne_chunk(pos_tag(word_tokenize(review)), binary=False)\n",
@ -321,7 +206,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {
"collapsed": false
},
@ -344,90 +229,11 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(S\n",
" I/PRON\n",
" purchased/VERB\n",
" (NP this/DET Dell/NOUN monitor/NOUN)\n",
" because/ADP\n",
" of/ADP\n",
" (NP budgetary/ADJ concerns/NOUN)\n",
" ./.\n",
" (NP This/DET item/NOUN)\n",
" was/VERB\n",
" (NP\n",
" the/DET\n",
" most/ADV\n",
" inexpensive/ADJ\n",
" 17/NUM\n",
" inch/NOUN\n",
" Apple/NOUN\n",
" monitor/NOUN)\n",
" available/ADJ\n",
" to/PRT\n",
" me/PRON\n",
" at/ADP\n",
" (NP the/DET time/NOUN)\n",
" I/PRON\n",
" made/VERB\n",
" (NP the/DET purchase/NOUN)\n",
" ./.\n",
" (NP My/PRON overall/ADJ experience/NOUN)\n",
" with/ADP\n",
" (NP this/DET monitor/NOUN)\n",
" was/VERB\n",
" very/ADV\n",
" poor/ADJ\n",
" ./.\n",
" When/ADV\n",
" (NP the/DET screen/NOUN)\n",
" was/VERB\n",
" n't/ADV\n",
" contracting/VERB\n",
" or/CONJ\n",
" glitching/VERB\n",
" (NP the/DET overall/ADJ picture/NOUN quality/NOUN)\n",
" was/VERB\n",
" poor/ADJ\n",
" to/PRT\n",
" fair/VERB\n",
" ./.\n",
" I/PRON\n",
" 've/VERB\n",
" viewed/VERB\n",
" (NP numerous/ADJ different/ADJ monitor/NOUN models/NOUN)\n",
" since/ADP\n",
" I/PRON\n",
" 'm/VERB\n",
" (NP a/DET college/NOUN student/NOUN)\n",
" at/ADP\n",
" (NP UPM/NOUN)\n",
" in/ADP\n",
" (NP Madrid/NOUN)\n",
" and/CONJ\n",
" (NP this/DET particular/ADJ monitor/NOUN)\n",
" had/VERB\n",
" as/ADP\n",
" poor/ADJ\n",
" of/ADP\n",
" (NP picture/NOUN quality/NOUN)\n",
" as/ADP\n",
" any/DET\n",
" I/PRON\n",
" 've/VERB\n",
" seen/VERB\n",
" ./.)\n"
]
}
],
"outputs": [],
"source": [
"from nltk.chunk.regexp import *\n",
"pattern = \"\"\"NP: {<PRON><ADJ><NOUN>+} \n",
@ -451,37 +257,11 @@
},
{
"cell_type": "code",
"execution_count": 54,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[Tree('NP', [('this', 'DET'), ('Dell', 'NOUN'), ('monitor', 'NOUN')]),\n",
" Tree('NP', [('budgetary', 'ADJ'), ('concerns', 'NOUN')]),\n",
" Tree('NP', [('This', 'DET'), ('item', 'NOUN')]),\n",
" Tree('NP', [('the', 'DET'), ('most', 'ADV'), ('inexpensive', 'ADJ'), ('17', 'NUM'), ('inch', 'NOUN'), ('Apple', 'NOUN'), ('monitor', 'NOUN')]),\n",
" Tree('NP', [('the', 'DET'), ('time', 'NOUN')]),\n",
" Tree('NP', [('the', 'DET'), ('purchase', 'NOUN')]),\n",
" Tree('NP', [('My', 'PRON'), ('overall', 'ADJ'), ('experience', 'NOUN')]),\n",
" Tree('NP', [('this', 'DET'), ('monitor', 'NOUN')]),\n",
" Tree('NP', [('the', 'DET'), ('screen', 'NOUN')]),\n",
" Tree('NP', [('the', 'DET'), ('overall', 'ADJ'), ('picture', 'NOUN'), ('quality', 'NOUN')]),\n",
" Tree('NP', [('numerous', 'ADJ'), ('different', 'ADJ'), ('monitor', 'NOUN'), ('models', 'NOUN')]),\n",
" Tree('NP', [('a', 'DET'), ('college', 'NOUN'), ('student', 'NOUN')]),\n",
" Tree('NP', [('UPM', 'NOUN')]),\n",
" Tree('NP', [('Madrid', 'NOUN')]),\n",
" Tree('NP', [('this', 'DET'), ('particular', 'ADJ'), ('monitor', 'NOUN')]),\n",
" Tree('NP', [('picture', 'NOUN'), ('quality', 'NOUN')])]"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"def extractTrees(parsed_tree, category='NP'):\n",
" return list(parsed_tree.subtrees(filter=lambda x: x.label()==category))\n",
@ -491,37 +271,11 @@
},
{
"cell_type": "code",
"execution_count": 90,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['this Dell monitor',\n",
" 'budgetary concerns',\n",
" 'This item',\n",
" 'the most inexpensive 17 inch Apple monitor',\n",
" 'the time',\n",
" 'the purchase',\n",
" 'My overall experience',\n",
" 'this monitor',\n",
" 'the screen',\n",
" 'the overall picture quality',\n",
" 'numerous different monitor models',\n",
" 'a college student',\n",
" 'UPM',\n",
" 'Madrid',\n",
" 'this particular monitor',\n",
" 'picture quality']"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"def extractStrings(parsed_tree, category='NP'):\n",
" return [\" \".join(word for word, pos in vp.leaves()) for vp in extractTrees(parsed_tree, category)]\n",
@ -529,15 +283,6 @@
"extractStrings(chunks_np)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
@ -587,7 +332,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
"version": "3.5.2"
}
},
"nbformat": 4,

View File

@ -59,7 +59,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"collapsed": false
},
@ -109,27 +109,11 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=5000, min_df=1,\n",
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
" tokenizer=None, vocabulary=None)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
@ -152,23 +136,11 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<3x10 sparse matrix of type '<class 'numpy.int64'>'\n",
"\twith 15 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectors = vectorizer.fit_transform(documents)\n",
"vectors"
@ -186,22 +158,11 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0 1 1 2 0 0 1 2 0 0]\n",
" [1 0 0 0 2 0 0 1 2 1]\n",
" [1 0 0 0 2 1 0 0 1 1]]\n",
"['and', 'but', 'coming', 'is', 'like', 'sandwiches', 'short', 'summer', 'the', 'winter']\n"
]
}
],
"outputs": [],
"source": [
"print(vectors.toarray())\n",
"print(vectorizer.get_feature_names())"
@ -219,32 +180,11 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['and',\n",
" 'but',\n",
" 'coming',\n",
" 'i',\n",
" 'is',\n",
" 'like',\n",
" 'sandwiches',\n",
" 'short',\n",
" 'summer',\n",
" 'the',\n",
" 'winter']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words=None, token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
"vectors = vectorizer.fit_transform(documents)\n",
@ -260,22 +200,11 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
"vectors = vectorizer.fit_transform(documents)\n",
@ -284,19 +213,11 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"frozenset({'could', 'sixty', 'onto', 'by', 'against', 'up', 'a', 'everything', 'other', 'otherwise', 'ourselves', 'beside', 'nowhere', 'then', 'below', 'put', 'ten', 'such', 'cannot', 'either', 'due', 'hasnt', 'whereupon', 'were', 'once', 'at', 'for', 'front', 'get', 'whereas', 'that', 'eight', 'another', 'except', 'of', 'wherever', 'over', 'to', 'whom', 'you', 'former', 'behind', 'yours', 'yourself', 'what', 'even', 'however', 'go', 'less', 'bottom', 'may', 'along', 'is', 'can', 'move', 'eg', 'somewhere', 'latterly', 'seemed', 'thence', 'becoming', 'himself', 'whether', 'six', 'first', 'off', 'do', 'many', 'namely', 'never', 'because', 'mostly', 'nevertheless', 'thereupon', 'here', 'least', 'anyone', 'one', 'others', 'cry', 'they', 'thereby', 'ie', 'am', 'this', 'would', 'any', 'while', 'see', 'too', 'your', 'somehow', 'within', 'same', 'sometimes', 'thereafter', 'must', 'take', 're', 'both', 'fill', 'nor', 'sometime', 'he', 'third', 'more', 'also', 'most', 'during', 'much', 'our', 'thick', 'enough', 'full', 'toward', 'with', 'mill', 'anyhow', 'nobody', 'why', 'thru', 'although', 'nothing', 'meanwhile', 'or', 'some', 'ltd', 'wherein', 'thus', 'someone', 'whereby', 'who', 'un', 'are', 'hundred', 'whereafter', 'fire', 'twenty', 'only', 'several', 'among', 'no', 'than', 'before', 'been', 'else', 'find', 'fifteen', 'hence', 'ours', 'already', 'be', 'besides', 'next', 'interest', 'whither', 'whole', 'eleven', 'without', 'five', 'show', 'in', 'throughout', 'own', 'amongst', 'will', 'neither', 'everywhere', 'part', 'give', 'my', 'hers', 'his', 'upon', 'well', 'him', 'yourselves', 'whatever', 'cant', 'though', 'had', 'again', 'every', 'noone', 'top', 'which', 'de', 'almost', 'system', 'under', 'down', 'latter', 'above', 'whence', 'found', 'myself', 'three', 'those', 'become', 'moreover', 'but', 'anyway', 'beyond', 'from', 'now', 'as', 'seeming', 'con', 'themselves', 'hereupon', 'each', 'serious', 'two', 'across', 'out', 'the', 'therein', 'between', 'inc', 'where', 'anything', 'seem', 'co', 'therefore', 'whoever', 'herein', 'about', 'herself', 'should', 'anywhere', 'how', 'we', 'after', 'describe', 'being', 'etc', 'very', 'not', 'an', 'me', 'call', 'per', 'detail', 'still', 'around', 'hereby', 'sincere', 'their', 'has', 'became', 'beforehand', 'everyone', 'hereafter', 'made', 'ever', 'indeed', 'itself', 'something', 'afterwards', 'none', 'done', 'nine', 'alone', 'please', 'its', 'name', 'since', 'on', 'she', 'bill', 'have', 'mine', 'few', 'her', 'seems', 'always', 'side', 'forty', 'further', 'via', 'last', 'amount', 'towards', 'fify', 'through', 'whose', 'couldnt', 'perhaps', 'thin', 'until', 'becomes', 'elsewhere', 'and', 'i', 'them', 'together', 'us', 'was', 'when', 'rather', 'whenever', 'formerly', 'keep', 'so', 'back', 'there', 'amoungst', 'might', 'these', 'all', 'empty', 'often', 'into', 'it', 'twelve', 'yet', 'if', 'four'})\n"
]
}
],
"outputs": [],
"source": [
"#stop words in scikit-learn for English\n",
"print(vectorizer.get_stop_words())"
@ -304,24 +225,11 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 0, 0, 1, 2, 0],\n",
" [0, 2, 0, 0, 1, 1],\n",
" [0, 2, 1, 0, 0, 1]], dtype=int64)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Vectors\n",
"f_array = vectors.toarray()\n",
@ -337,19 +245,11 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.666666666667 1.0 0.166666666667\n"
]
}
],
"outputs": [],
"source": [
"from scipy.spatial.distance import cosine\n",
"d12 = cosine(f_array[0], f_array[1])\n",
@ -374,22 +274,11 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', binary=True) \n",
"vectors = vectorizer.fit_transform(documents)\n",
@ -398,24 +287,11 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 0, 0, 1, 1, 0],\n",
" [0, 1, 0, 0, 1, 1],\n",
" [0, 1, 1, 0, 0, 1]], dtype=int64)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectors.toarray()"
]
@ -436,29 +312,11 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['coming summer',\n",
" 'like sandwiches',\n",
" 'like summer',\n",
" 'like winter',\n",
" 'sandwiches like',\n",
" 'summer coming',\n",
" 'summer like',\n",
" 'summer short']"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', ngram_range=[2,2]) \n",
"vectors = vectorizer.fit_transform(documents)\n",
@ -467,24 +325,11 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 0, 0, 0, 0, 1, 0, 1],\n",
" [0, 0, 1, 1, 0, 0, 1, 0],\n",
" [0, 1, 0, 1, 1, 0, 0, 0]], dtype=int64)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectors.toarray()"
]
@ -505,22 +350,11 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
@ -531,27 +365,11 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.48148213, 0. , 0. , 0.48148213, 0.73235914,\n",
" 0. ],\n",
" [ 0. , 0.81649658, 0. , 0. , 0.40824829,\n",
" 0.40824829],\n",
" [ 0. , 0.77100584, 0.50689001, 0. , 0. ,\n",
" 0.38550292]])"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectors.toarray()"
]
@ -565,22 +383,11 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"train = [doc1, doc2, doc3]\n",
"vectorizer = TfidfVectorizer(analyzer=\"word\", stop_words='english')\n",
@ -592,28 +399,12 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.48148213, 0. , 0. , 0.48148213, 0.73235914,\n",
" 0. ],\n",
" [ 0. , 0.81649658, 0. , 0. , 0.40824829,\n",
" 0.40824829],\n",
" [ 0. , 0.77100584, 0.50689001, 0. , 0. ,\n",
" 0.38550292]])"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectors.toarray()"
]
@ -627,22 +418,11 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.38324078, 0.24713249, 0.23336362]])"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from sklearn.metrics.pairwise import cosine_similarity\n",
"\n",
@ -664,22 +444,11 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.38324078, 0.24713249, 0.23336362])"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from sklearn.metrics.pairwise import linear_kernel\n",
"cosine_similarity = linear_kernel(vector_query, vectors).flatten()\n",
@ -734,7 +503,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
"version": "3.5.2"
}
},
"nbformat": 4,

View File

@ -74,19 +74,11 @@
},
{
"cell_type": "code",
"execution_count": 93,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n"
]
}
],
"outputs": [],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"\n",
@ -100,19 +92,11 @@
},
{
"cell_type": "code",
"execution_count": 106,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20\n"
]
}
],
"outputs": [],
"source": [
"#Number of categories\n",
"print(len(newsgroups_train.target_names))"
@ -120,28 +104,11 @@
},
{
"cell_type": "code",
"execution_count": 94,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Category id 4 comp.sys.mac.hardware\n",
"Doc A fair number of brave souls who upgraded their SI clock oscillator have\n",
"shared their experiences for this poll. Please send a brief message detailing\n",
"your experiences with the procedure. Top speed attained, CPU rated speed,\n",
"add on cards and adapters, heat sinks, hour of usage per day, floppy disk\n",
"functionality with 800 and 1.4 m floppies are especially requested.\n",
"\n",
"I will be summarizing in the next two days, so please add to the network\n",
"knowledge base if you have done the clock upgrade and haven't answered this\n",
"poll. Thanks.\n"
]
}
],
"outputs": [],
"source": [
"# Show a document\n",
"docid = 1\n",
@ -154,22 +121,11 @@
},
{
"cell_type": "code",
"execution_count": 95,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(11314,)"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"#Number of files\n",
"newsgroups_train.filenames.shape"
@ -177,22 +133,11 @@
},
{
"cell_type": "code",
"execution_count": 96,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(11314, 101323)"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Obtain a vector\n",
"\n",
@ -206,22 +151,11 @@
},
{
"cell_type": "code",
"execution_count": 97,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"66.80510871486653"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# The tf-idf vectors are very sparse with an average of 66 non zero components in 101.323 dimensions (.06%)\n",
"vectors_train.nnz / float(vectors_train.shape[0])"
@ -243,22 +177,11 @@
},
{
"cell_type": "code",
"execution_count": 138,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.69545360719001303"
]
},
"execution_count": 138,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"\n",
@ -271,7 +194,7 @@
"model = MultinomialNB(alpha=.01)\n",
"model.fit(vectors_train, newsgroups_train.target)\n",
"\n",
"\n",
"vectors_test = vectorizer.transform(newsgroups_test.data)\n",
"pred = model.predict(vectors_test)\n",
"\n",
"metrics.f1_score(newsgroups_test.target, pred, average='weighted')\n"
@ -286,20 +209,11 @@
},
{
"cell_type": "code",
"execution_count": 141,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dimensionality: 101323\n",
"density: 1.000000\n"
]
}
],
"outputs": [],
"source": [
"from sklearn.utils.extmath import density\n",
"\n",
@ -309,38 +223,11 @@
},
{
"cell_type": "code",
"execution_count": 140,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"alt.atheism: islam atheists say just religion atheism think don people god\n",
"comp.graphics: looking format 3d know program file files thanks image graphics\n",
"comp.os.ms-windows.misc: card problem thanks driver drivers use files dos file windows\n",
"comp.sys.ibm.pc.hardware: monitor disk thanks pc ide controller bus card scsi drive\n",
"comp.sys.mac.hardware: know monitor does quadra simms thanks problem drive apple mac\n",
"comp.windows.x: using windows x11r5 use application thanks widget server motif window\n",
"misc.forsale: asking email sell price condition new shipping offer 00 sale\n",
"rec.autos: don ford new good dealer just engine like cars car\n",
"rec.motorcycles: don just helmet riding like motorcycle ride bikes dod bike\n",
"rec.sport.baseball: braves players pitching hit runs games game baseball team year\n",
"rec.sport.hockey: league year nhl games season players play hockey team game\n",
"sci.crypt: people use escrow nsa keys government chip clipper encryption key\n",
"sci.electronics: don thanks voltage used know does like circuit power use\n",
"sci.med: skepticism cadre dsl banks chastity n3jxp pitt gordon geb msg\n",
"sci.space: just lunar earth shuttle like moon launch orbit nasa space\n",
"soc.religion.christian: believe faith christian christ bible people christians church jesus god\n",
"talk.politics.guns: just law firearms government fbi don weapons people guns gun\n",
"talk.politics.mideast: said arabs arab turkish people armenians armenian jews israeli israel\n",
"talk.politics.misc: know state clinton president just think tax don government people\n",
"talk.religion.misc: think don koresh objective christians bible people christian jesus god\n"
]
}
],
"outputs": [],
"source": [
"# We can review the top features per topic in Bayes (attribute coef_)\n",
"import numpy as np\n",
@ -357,20 +244,11 @@
},
{
"cell_type": "code",
"execution_count": 136,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 3 15]\n",
"['comp.sys.ibm.pc.hardware', 'soc.religion.christian']\n"
]
}
],
"outputs": [],
"source": [
"# We try the classifier in two new docs\n",
"\n",
@ -431,7 +309,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
"version": "3.5.2"
}
},
"nbformat": 4,

View File

@ -76,22 +76,11 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(2034, 2807)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"\n",
@ -133,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {
"collapsed": false
},
@ -163,7 +152,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"metadata": {
"collapsed": false
},
@ -179,29 +168,11 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" '0.004*objects + 0.004*obtained + 0.003*comets + 0.003*manhattan + 0.003*member + 0.003*beginning + 0.003*center + 0.003*groups + 0.003*aware + 0.003*increased'),\n",
" (1,\n",
" '0.003*activity + 0.002*objects + 0.002*professional + 0.002*eyes + 0.002*manhattan + 0.002*pressure + 0.002*netters + 0.002*chosen + 0.002*attempted + 0.002*medical'),\n",
" (2,\n",
" '0.003*mechanism + 0.003*led + 0.003*platform + 0.003*frank + 0.003*mormons + 0.003*aeronautics + 0.002*concepts + 0.002*header + 0.002*forces + 0.002*profit'),\n",
" (3,\n",
" '0.005*diameter + 0.005*having + 0.004*complex + 0.004*conclusions + 0.004*activity + 0.004*looking + 0.004*action + 0.004*inflatable + 0.004*defined + 0.004*association')]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# check the topics\n",
"lda.print_topics(4)"
@ -216,7 +187,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {
"collapsed": false
},
@ -250,19 +221,11 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary(10913 unique tokens: ['whose', 'used', 'hoc', 'transfinite', 'newtek']...)\n"
]
}
],
"outputs": [],
"source": [
"# You can save the dictionary\n",
"dictionary.save('newsgroup.dict')\n",
@ -272,7 +235,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {
"collapsed": false
},
@ -285,7 +248,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {
"collapsed": false
},
@ -299,7 +262,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {
"collapsed": true
},
@ -313,19 +276,11 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary(10913 unique tokens: ['whose', 'used', 'hoc', 'transfinite', 'newtek']...)\n"
]
}
],
"outputs": [],
"source": [
"# We can print the dictionary, it is a mappying of id and tokens\n",
"\n",
@ -334,7 +289,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {
"collapsed": true
},
@ -346,7 +301,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {
"collapsed": false
},
@ -361,19 +316,11 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.1598114653031772), (1, 0.10438175896914427), (2, 0.5700978153855775), (3, 0.24093628445650234), (4, 0.722808853369507), (5, 0.24093628445650234)]\n"
]
}
],
"outputs": [],
"source": [
"#print tf-idf of first document\n",
"print(corpus_tfidf[0])"
@ -381,7 +328,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {
"collapsed": false
},
@ -396,29 +343,11 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" '0.010*targa + 0.007*ns + 0.006*thanks + 0.006*davidian + 0.006*ssrt + 0.006*yayayay + 0.005*craig + 0.005*bull + 0.005*gerald + 0.005*sorry'),\n",
" (1,\n",
" '0.011*god + 0.010*mary + 0.008*baptist + 0.008*islam + 0.006*zoroastrians + 0.006*joseph + 0.006*lucky + 0.006*khomeini + 0.006*samaritan + 0.005*crusades'),\n",
" (2,\n",
" '0.007*koresh + 0.007*moon + 0.007*western + 0.006*plane + 0.006*jeff + 0.006*unix + 0.005*bible + 0.005*also + 0.005*basically + 0.005*bob'),\n",
" (3,\n",
" '0.011*whatever + 0.008*joy + 0.007*happy + 0.006*virtual + 0.006*reality + 0.004*really + 0.003*samuel___ + 0.003*oh + 0.003*virtually + 0.003*toaster')]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# check the topics\n",
"lda_model.print_topics(4)"
@ -426,19 +355,11 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.085176135689180726), (1, 0.6919655173835938), (2, 0.1377903468164027), (3, 0.0850680001108228)]\n"
]
}
],
"outputs": [],
"source": [
"# check the lsa vector for the first document\n",
"corpus_lda = lda_model[corpus_tfidf]\n",
@ -447,19 +368,11 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('lord', 1), ('god', 2)]\n"
]
}
],
"outputs": [],
"source": [
"#predict topics of a new doc\n",
"new_doc = \"God is love and God is the Lord\"\n",
@ -470,19 +383,11 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.062509420435514051), (1, 0.81246608790618835), (2, 0.062508281488992554), (3, 0.062516210169305114)]\n"
]
}
],
"outputs": [],
"source": [
"#transform into LDA space\n",
"lda_vector = lda_model[bow_vector]\n",
@ -491,19 +396,11 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.011*god + 0.010*mary + 0.008*baptist + 0.008*islam + 0.006*zoroastrians + 0.006*joseph + 0.006*lucky + 0.006*khomeini + 0.006*samaritan + 0.005*crusades\n"
]
}
],
"outputs": [],
"source": [
"# print the document's single most prominent LDA topic\n",
"print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))"
@ -511,20 +408,11 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.10392179866025079), (1, 0.68822094221870811), (2, 0.10391916429993264), (3, 0.10393809482110833)]\n",
"0.011*god + 0.010*mary + 0.008*baptist + 0.008*islam + 0.006*zoroastrians + 0.006*joseph + 0.006*lucky + 0.006*khomeini + 0.006*samaritan + 0.005*crusades\n"
]
}
],
"outputs": [],
"source": [
"lda_vector_tfidf = lda_model[tfidf_model[bow_vector]]\n",
"print(lda_vector_tfidf)\n",
@ -541,7 +429,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"metadata": {
"collapsed": false
},
@ -559,29 +447,11 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" '0.769*\"god\" + 0.346*\"jesus\" + 0.235*\"bible\" + 0.204*\"christian\" + 0.149*\"christians\" + 0.107*\"christ\" + 0.090*\"well\" + 0.085*\"koresh\" + 0.081*\"kent\" + 0.080*\"christianity\"'),\n",
" (1,\n",
" '-0.863*\"thanks\" + -0.255*\"please\" + -0.159*\"hello\" + -0.153*\"hi\" + 0.123*\"god\" + -0.112*\"sorry\" + -0.087*\"could\" + -0.074*\"windows\" + -0.067*\"jpeg\" + -0.063*\"vga\"'),\n",
" (2,\n",
" '0.780*\"well\" + -0.229*\"god\" + 0.165*\"yes\" + -0.153*\"thanks\" + 0.133*\"ico\" + 0.133*\"tek\" + 0.130*\"bronx\" + 0.130*\"beauchaine\" + 0.130*\"queens\" + 0.129*\"manhattan\"'),\n",
" (3,\n",
" '0.340*\"well\" + -0.335*\"ico\" + -0.334*\"tek\" + -0.328*\"beauchaine\" + -0.328*\"bronx\" + -0.328*\"queens\" + -0.326*\"manhattan\" + -0.305*\"bob\" + -0.305*\"com\" + -0.072*\"god\"')]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# check the topics\n",
"lsi_model.print_topics(4)"
@ -589,19 +459,11 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.1598114653031772), (1, 0.10438175896914427), (2, 0.5700978153855775), (3, 0.24093628445650234), (4, 0.722808853369507), (5, 0.24093628445650234)]\n"
]
}
],
"outputs": [],
"source": [
"# check the lsi vector for the first document\n",
"print(corpus_tfidf[0])"
@ -655,7 +517,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
"version": "3.5.2"
}
},
"nbformat": 4,