mirror of
https://github.com/gsi-upm/sitc
synced 2024-11-24 23:42:29 +00:00
Included installation of nltk
This commit is contained in:
parent
cb40531dc4
commit
e88e144a50
@ -67,7 +67,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 36,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
@ -102,21 +102,22 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"In order to use nltk, we should download first the lexical resources we are going to use. We can updated them later. For this, you need:\n",
|
"In order to use nltk, we should download first the lexical resources we are going to use. We can updated them later. For this, you need:\n",
|
||||||
|
"* install nltk. Execute 'conda install nltk'\n",
|
||||||
"* import nltk\n",
|
"* import nltk\n",
|
||||||
"* Run *nltk.download()* (the first time we use it). A window will appear. You should select just the corpus 'book' and press download.\n",
|
"* Run *nltk.download()* (the first time we use it). A window will appear. You should select just the corpus 'book' and press download.\n",
|
||||||
"If you inspect the window, you can get an overview of available lexical resources (corpora, lexicons and grammars). For example, you can find some relevant sentiment lexicons in corpora (SentiWordNet, Sentence Polarity Dataset, Vader, Opinion Lexicon or VADER Sentiment Lexicon)."
|
"If you inspect the window, you can get an overview of available lexical resources (corpora, lexicons and grammars). For example, you can find some relevant sentiment lexicons in corpora (SentiWordNet, Sentence Polarity Dataset, Vader, Opinion Lexicon or VADER Sentiment Lexicon). Don't forget to close the window once the data has been downloaded."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": true
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import nltk\n",
|
"import nltk\n",
|
||||||
"ntlk.download()"
|
"nltk.download()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -169,19 +170,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 22,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"['I purchased this monitor because of budgetary concerns.', 'This item was the most inexpensive 17 inch monitor \\navailable to me at the time I made the purchase.', 'My overall experience with this monitor was very poor.', \"When the \\nscreen wasn't contracting or glitching the overall picture quality was poor to fair.\", \"I've viewed numerous different \\nmonitor models since I 'm a college student and this particular monitor had as poor of picture quality as \\nany I 've seen.\"]\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from nltk.tokenize import sent_tokenize, word_tokenize\n",
|
"from nltk.tokenize import sent_tokenize, word_tokenize\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -205,20 +198,12 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 23,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"[['I', 'purchased', 'this', 'monitor', 'because', 'of', 'budgetary', 'concerns', '.'], ['This', 'item', 'was', 'the', 'most', 'inexpensive', '17', 'inch', 'monitor', 'available', 'to', 'me', 'at', 'the', 'time', 'I', 'made', 'the', 'purchase', '.'], ['My', 'overall', 'experience', 'with', 'this', 'monitor', 'was', 'very', 'poor', '.'], ['When', 'the', 'screen', 'was', \"n't\", 'contracting', 'or', 'glitching', 'the', 'overall', 'picture', 'quality', 'was', 'poor', 'to', 'fair', '.'], ['I', \"'ve\", 'viewed', 'numerous', 'different', 'monitor', 'models', 'since', 'I', \"'m\", 'a', 'college', 'student', 'and', 'this', 'particular', 'monitor', 'had', 'as', 'poor', 'of', 'picture', 'quality', 'as', 'any', 'I', \"'ve\", 'seen', '.']]\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"words = [word_tokenize(t) for t in sent_tokenize(review)]\n",
|
"words = [word_tokenize(t) for t in sent_tokenize(review)]\n",
|
||||||
"print(words)"
|
"print(words)"
|
||||||
@ -233,19 +218,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 20,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"['I', 'purchased', 'this', 'monitor', 'because', 'of', 'budgetary', 'concerns', '.', 'This', 'item', 'was', 'the', 'most', 'inexpensive', '17', 'inch', 'monitor', 'available', 'to', 'me', 'at', 'the', 'time', 'I', 'made', 'the', 'purchase', '.', 'My', 'overall', 'experience', 'with', 'this', 'monitor', 'was', 'very', 'poor', '.', 'When', 'the', 'screen', 'was', \"n't\", 'contracting', 'or', 'glitching', 'the', 'overall', 'picture', 'quality', 'was', 'poor', 'to', 'fair', '.', 'I', \"'ve\", 'viewed', 'numerous', 'different', 'monitor', 'models', 'since', 'I', \"'m\", 'a', 'college', 'student', 'and', 'this', 'particular', 'monitor', 'had', 'as', 'poor', 'of', 'picture', 'quality', 'as', 'any', 'I', \"'ve\", 'seen', '.']\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"words = word_tokenize(review)\n",
|
"words = word_tokenize(review)\n",
|
||||||
"print(words)"
|
"print(words)"
|
||||||
@ -261,20 +238,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 35,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"With TweetTokenizer Lady Gaga is actually at the Britney Spears Femme Fatale Concert tonight ! ! ! She still listens to her music ! ! ! WOW ! ! ! #ladygaga #britney\n",
|
|
||||||
"With word_tokenizer @ concert Lady Gaga is actually at the Britney Spears Femme Fatale Concert tonight ! ! ! She still listens to her music ! ! ! ! WOW ! ! ! # ladygaga # britney\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from nltk.tokenize import TweetTokenizer\n",
|
"from nltk.tokenize import TweetTokenizer\n",
|
||||||
"tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)\n",
|
"tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)\n",
|
||||||
@ -299,26 +267,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 75,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Porter: boy children are have is ha Madrid\n",
|
|
||||||
"Execution time: 0.00093841552734375\n",
|
|
||||||
"Lancaster: boy childr ar hav is has madrid\n",
|
|
||||||
"Execution time: 0.0014300346374511719\n",
|
|
||||||
"WordNet: boy child are have is ha Madrid\n",
|
|
||||||
"Execution time: 0.0008304119110107422\n",
|
|
||||||
"SnowBall: boy children are have is has madrid\n",
|
|
||||||
"Execution time: 0.0017843246459960938\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer\n",
|
"from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer\n",
|
||||||
"from nltk.stem.snowball import EnglishStemmer\n",
|
"from nltk.stem.snowball import EnglishStemmer\n",
|
||||||
@ -362,19 +315,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 74,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"WordNet: be cry be have have\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"verbs = \"are crying is have has\"\n",
|
"verbs = \"are crying is have has\"\n",
|
||||||
"print(\"WordNet: \" + \" \".join([wordnet.lemmatize(w, pos='v') for w in word_tokenize(verbs)]))\n"
|
"print(\"WordNet: \" + \" \".join([wordnet.lemmatize(w, pos='v') for w in word_tokenize(verbs)]))\n"
|
||||||
@ -395,20 +340,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 82,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"['i', 'purchas', 'thi', 'monitor', 'becaus', 'of', 'budgetari', 'concern', '.', 'thi', 'item', 'wa', 'the', 'most', 'inexpens', '17', 'inch', 'monitor', 'avail', 'to', 'me', 'at', 'the', 'time', 'i', 'made', 'the', 'purchas', '.', 'my', 'overal', 'experi', 'with', 'thi', 'monitor', 'wa', 'veri', 'poor', '.', 'when', 'the', 'screen', 'wa', \"n't\", 'contract', 'or', 'glitch', 'the', 'overal', 'pictur', 'qualiti', 'wa', 'poor', 'to', 'fair', '.', 'i', \"'ve\", 'view', 'numer', 'differ', 'monitor', 'model', 'sinc', 'i', \"'m\", 'a', 'colleg', 'student', 'and', 'thi', 'particular', 'monitor', 'had', 'as', 'poor', 'of', 'pictur', 'qualiti', 'as', 'ani', 'i', \"'ve\", 'seen', '.']\n",
|
|
||||||
"['Ladi', 'Gaga', 'is', 'actual', 'at', 'the', 'Britney', 'Spear', 'Femm', 'Fatal', 'Concert', 'tonight', '!', '!', '!', 'She', 'still', 'listen', 'to', 'her', 'music', '!', '!', '!', 'WOW', '!', '!', '!', '#ladygaga', '#britney']\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"def preprocess(words, type='doc'):\n",
|
"def preprocess(words, type='doc'):\n",
|
||||||
" if (type == 'tweet'):\n",
|
" if (type == 'tweet'):\n",
|
||||||
@ -439,19 +375,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 86,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn']\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from nltk.corpus import stopwords\n",
|
"from nltk.corpus import stopwords\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -461,20 +389,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 89,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"['purchas', 'thi', 'monitor', 'becaus', 'budgetari', 'concern', '.', 'thi', 'item', 'wa', 'inexpens', '17', 'inch', 'monitor', 'avail', 'time', 'made', 'purchas', '.', 'overal', 'experi', 'thi', 'monitor', 'wa', 'veri', 'poor', '.', 'screen', 'wa', \"n't\", 'contract', 'glitch', 'overal', 'pictur', 'qualiti', 'wa', 'poor', 'fair', '.', \"'ve\", 'view', 'numer', 'differ', 'monitor', 'model', 'sinc', \"'m\", 'colleg', 'student', 'thi', 'particular', 'monitor', 'poor', 'pictur', 'qualiti', 'ani', \"'ve\", 'seen', '.']\n",
|
|
||||||
"['Ladi', 'Gaga', 'actual', 'Britney', 'Spear', 'Femm', 'Fatal', 'Concert', 'tonight', '!', '!', '!', 'She', 'still', 'listen', 'music', '!', '!', '!', 'WOW', '!', '!', '!', '#ladygaga', '#britney']\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"def preprocess(words, type='doc'):\n",
|
"def preprocess(words, type='doc'):\n",
|
||||||
" if (type == 'tweet'):\n",
|
" if (type == 'tweet'):\n",
|
||||||
@ -508,20 +427,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 104,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"['purchas', 'thi', 'monitor', 'becaus', 'budgetari', 'concern', 'thi', 'item', 'wa', 'inexpens', '17', 'inch', 'monitor', 'avail', 'time', 'made', 'purchas', 'overal', 'experi', 'thi', 'monitor', 'wa', 'veri', 'poor', 'screen', 'wa', \"n't\", 'contract', 'glitch', 'overal', 'pictur', 'qualiti', 'wa', 'poor', 'fair', \"'ve\", 'view', 'numer', 'differ', 'monitor', 'model', 'sinc', \"'m\", 'colleg', 'student', 'thi', 'particular', 'monitor', 'poor', 'pictur', 'qualiti', 'ani', \"'ve\", 'seen']\n",
|
|
||||||
"['Ladi', 'Gaga', 'actual', 'Britney', 'Spear', 'Femm', 'Fatal', 'Concert', 'tonight', 'She', 'still', 'listen', 'music', 'WOW', '#ladygaga', '#britney']\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"import string\n",
|
"import string\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -563,22 +473,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 124,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Most frequent\n",
|
|
||||||
"[('.', 5), ('I', 5), ('the', 5), ('monitor', 5), ('was', 4), ('this', 3), ('poor', 3), ('as', 2), ('overall', 2), ('picture', 2)]\n",
|
|
||||||
"Least frequent\n",
|
|
||||||
"['models', '17', 'purchase', 'different', 'most', \"n't\", 'monitor', \"'m\", 'was', 'My']\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"frec = nltk.FreqDist(nltk.word_tokenize(review))\n",
|
"frec = nltk.FreqDist(nltk.word_tokenize(review))\n",
|
||||||
"print(\"Most frequent\")\n",
|
"print(\"Most frequent\")\n",
|
||||||
@ -636,7 +535,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -61,7 +61,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
@ -109,19 +109,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"[('I', 'PRON'), ('purchased', 'VERB'), ('this', 'DET'), ('Dell', 'NOUN'), ('monitor', 'NOUN'), ('because', 'ADP'), ('of', 'ADP'), ('budgetary', 'ADJ'), ('concerns', 'NOUN'), ('.', '.'), ('This', 'DET'), ('item', 'NOUN'), ('was', 'VERB'), ('the', 'DET'), ('most', 'ADV'), ('inexpensive', 'ADJ'), ('17', 'NUM'), ('inch', 'NOUN'), ('Apple', 'NOUN'), ('monitor', 'NOUN'), ('available', 'ADJ'), ('to', 'PRT'), ('me', 'PRON'), ('at', 'ADP'), ('the', 'DET'), ('time', 'NOUN'), ('I', 'PRON'), ('made', 'VERB'), ('the', 'DET'), ('purchase', 'NOUN'), ('.', '.'), ('My', 'PRON'), ('overall', 'ADJ'), ('experience', 'NOUN'), ('with', 'ADP'), ('this', 'DET'), ('monitor', 'NOUN'), ('was', 'VERB'), ('very', 'ADV'), ('poor', 'ADJ'), ('.', '.'), ('When', 'ADV'), ('the', 'DET'), ('screen', 'NOUN'), ('was', 'VERB'), (\"n't\", 'ADV'), ('contracting', 'VERB'), ('or', 'CONJ'), ('glitching', 'VERB'), ('the', 'DET'), ('overall', 'ADJ'), ('picture', 'NOUN'), ('quality', 'NOUN'), ('was', 'VERB'), ('poor', 'ADJ'), ('to', 'PRT'), ('fair', 'VERB'), ('.', '.'), ('I', 'PRON'), (\"'ve\", 'VERB'), ('viewed', 'VERB'), ('numerous', 'ADJ'), ('different', 'ADJ'), ('monitor', 'NOUN'), ('models', 'NOUN'), ('since', 'ADP'), ('I', 'PRON'), (\"'m\", 'VERB'), ('a', 'DET'), ('college', 'NOUN'), ('student', 'NOUN'), ('at', 'ADP'), ('UPM', 'NOUN'), ('in', 'ADP'), ('Madrid', 'NOUN'), ('and', 'CONJ'), ('this', 'DET'), ('particular', 'ADJ'), ('monitor', 'NOUN'), ('had', 'VERB'), ('as', 'ADP'), ('poor', 'ADJ'), ('of', 'ADP'), ('picture', 'NOUN'), ('quality', 'NOUN'), ('as', 'ADP'), ('any', 'DET'), ('I', 'PRON'), (\"'ve\", 'VERB'), ('seen', 'VERB'), ('.', '.')]\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from nltk import pos_tag, word_tokenize\n",
|
"from nltk import pos_tag, word_tokenize\n",
|
||||||
"print (pos_tag(word_tokenize(review), tagset='universal'))"
|
"print (pos_tag(word_tokenize(review), tagset='universal'))"
|
||||||
@ -136,19 +128,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"['I', 'purchase', 'Dell', 'monitor', 'because', 'of', 'budgetary', 'concern', 'item', 'be', 'most', 'inexpensive', '17', 'inch', 'Apple', 'monitor', 'available', 'me', 'at', 'time', 'I', 'make', 'purchase', 'My', 'overall', 'experience', 'with', 'monitor', 'be', 'very', 'poor', 'When', 'screen', 'be', \"n't\", 'contract', 'or', 'glitching', 'overall', 'picture', 'quality', 'be', 'poor', 'fair', 'I', \"'ve\", 'view', 'numerous', 'different', 'monitor', 'model', 'since', 'I', \"'m\", 'college', 'student', 'at', 'UPM', 'in', 'Madrid', 'and', 'particular', 'monitor', 'have', 'a', 'poor', 'of', 'picture', 'quality', 'a', 'I', \"'ve\", 'see']\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from nltk.stem import WordNetLemmatizer\n",
|
"from nltk.stem import WordNetLemmatizer\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -177,110 +161,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"(S\n",
|
|
||||||
" I/PRP\n",
|
|
||||||
" purchased/VBD\n",
|
|
||||||
" this/DT\n",
|
|
||||||
" (ORGANIZATION Dell/NNP)\n",
|
|
||||||
" monitor/NN\n",
|
|
||||||
" because/IN\n",
|
|
||||||
" of/IN\n",
|
|
||||||
" budgetary/JJ\n",
|
|
||||||
" concerns/NNS\n",
|
|
||||||
" ./.\n",
|
|
||||||
" This/DT\n",
|
|
||||||
" item/NN\n",
|
|
||||||
" was/VBD\n",
|
|
||||||
" the/DT\n",
|
|
||||||
" most/RBS\n",
|
|
||||||
" inexpensive/JJ\n",
|
|
||||||
" 17/CD\n",
|
|
||||||
" inch/NN\n",
|
|
||||||
" Apple/NNP\n",
|
|
||||||
" monitor/NN\n",
|
|
||||||
" available/JJ\n",
|
|
||||||
" to/TO\n",
|
|
||||||
" me/PRP\n",
|
|
||||||
" at/IN\n",
|
|
||||||
" the/DT\n",
|
|
||||||
" time/NN\n",
|
|
||||||
" I/PRP\n",
|
|
||||||
" made/VBD\n",
|
|
||||||
" the/DT\n",
|
|
||||||
" purchase/NN\n",
|
|
||||||
" ./.\n",
|
|
||||||
" My/PRP$\n",
|
|
||||||
" overall/JJ\n",
|
|
||||||
" experience/NN\n",
|
|
||||||
" with/IN\n",
|
|
||||||
" this/DT\n",
|
|
||||||
" monitor/NN\n",
|
|
||||||
" was/VBD\n",
|
|
||||||
" very/RB\n",
|
|
||||||
" poor/JJ\n",
|
|
||||||
" ./.\n",
|
|
||||||
" When/WRB\n",
|
|
||||||
" the/DT\n",
|
|
||||||
" screen/NN\n",
|
|
||||||
" was/VBD\n",
|
|
||||||
" n't/RB\n",
|
|
||||||
" contracting/VBG\n",
|
|
||||||
" or/CC\n",
|
|
||||||
" glitching/VBG\n",
|
|
||||||
" the/DT\n",
|
|
||||||
" overall/JJ\n",
|
|
||||||
" picture/NN\n",
|
|
||||||
" quality/NN\n",
|
|
||||||
" was/VBD\n",
|
|
||||||
" poor/JJ\n",
|
|
||||||
" to/TO\n",
|
|
||||||
" fair/VB\n",
|
|
||||||
" ./.\n",
|
|
||||||
" I/PRP\n",
|
|
||||||
" 've/VBP\n",
|
|
||||||
" viewed/VBN\n",
|
|
||||||
" numerous/JJ\n",
|
|
||||||
" different/JJ\n",
|
|
||||||
" monitor/NN\n",
|
|
||||||
" models/NNS\n",
|
|
||||||
" since/IN\n",
|
|
||||||
" I/PRP\n",
|
|
||||||
" 'm/VBP\n",
|
|
||||||
" a/DT\n",
|
|
||||||
" college/NN\n",
|
|
||||||
" student/NN\n",
|
|
||||||
" at/IN\n",
|
|
||||||
" (ORGANIZATION UPM/NNP)\n",
|
|
||||||
" in/IN\n",
|
|
||||||
" (GPE Madrid/NNP)\n",
|
|
||||||
" and/CC\n",
|
|
||||||
" this/DT\n",
|
|
||||||
" particular/JJ\n",
|
|
||||||
" monitor/NN\n",
|
|
||||||
" had/VBD\n",
|
|
||||||
" as/IN\n",
|
|
||||||
" poor/JJ\n",
|
|
||||||
" of/IN\n",
|
|
||||||
" picture/NN\n",
|
|
||||||
" quality/NN\n",
|
|
||||||
" as/IN\n",
|
|
||||||
" any/DT\n",
|
|
||||||
" I/PRP\n",
|
|
||||||
" 've/VBP\n",
|
|
||||||
" seen/VBN\n",
|
|
||||||
" ./.)\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from nltk import ne_chunk, pos_tag, word_tokenize\n",
|
"from nltk import ne_chunk, pos_tag, word_tokenize\n",
|
||||||
"ne_tagged = ne_chunk(pos_tag(word_tokenize(review)), binary=False)\n",
|
"ne_tagged = ne_chunk(pos_tag(word_tokenize(review)), binary=False)\n",
|
||||||
@ -321,7 +206,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
@ -344,90 +229,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 42,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"(S\n",
|
|
||||||
" I/PRON\n",
|
|
||||||
" purchased/VERB\n",
|
|
||||||
" (NP this/DET Dell/NOUN monitor/NOUN)\n",
|
|
||||||
" because/ADP\n",
|
|
||||||
" of/ADP\n",
|
|
||||||
" (NP budgetary/ADJ concerns/NOUN)\n",
|
|
||||||
" ./.\n",
|
|
||||||
" (NP This/DET item/NOUN)\n",
|
|
||||||
" was/VERB\n",
|
|
||||||
" (NP\n",
|
|
||||||
" the/DET\n",
|
|
||||||
" most/ADV\n",
|
|
||||||
" inexpensive/ADJ\n",
|
|
||||||
" 17/NUM\n",
|
|
||||||
" inch/NOUN\n",
|
|
||||||
" Apple/NOUN\n",
|
|
||||||
" monitor/NOUN)\n",
|
|
||||||
" available/ADJ\n",
|
|
||||||
" to/PRT\n",
|
|
||||||
" me/PRON\n",
|
|
||||||
" at/ADP\n",
|
|
||||||
" (NP the/DET time/NOUN)\n",
|
|
||||||
" I/PRON\n",
|
|
||||||
" made/VERB\n",
|
|
||||||
" (NP the/DET purchase/NOUN)\n",
|
|
||||||
" ./.\n",
|
|
||||||
" (NP My/PRON overall/ADJ experience/NOUN)\n",
|
|
||||||
" with/ADP\n",
|
|
||||||
" (NP this/DET monitor/NOUN)\n",
|
|
||||||
" was/VERB\n",
|
|
||||||
" very/ADV\n",
|
|
||||||
" poor/ADJ\n",
|
|
||||||
" ./.\n",
|
|
||||||
" When/ADV\n",
|
|
||||||
" (NP the/DET screen/NOUN)\n",
|
|
||||||
" was/VERB\n",
|
|
||||||
" n't/ADV\n",
|
|
||||||
" contracting/VERB\n",
|
|
||||||
" or/CONJ\n",
|
|
||||||
" glitching/VERB\n",
|
|
||||||
" (NP the/DET overall/ADJ picture/NOUN quality/NOUN)\n",
|
|
||||||
" was/VERB\n",
|
|
||||||
" poor/ADJ\n",
|
|
||||||
" to/PRT\n",
|
|
||||||
" fair/VERB\n",
|
|
||||||
" ./.\n",
|
|
||||||
" I/PRON\n",
|
|
||||||
" 've/VERB\n",
|
|
||||||
" viewed/VERB\n",
|
|
||||||
" (NP numerous/ADJ different/ADJ monitor/NOUN models/NOUN)\n",
|
|
||||||
" since/ADP\n",
|
|
||||||
" I/PRON\n",
|
|
||||||
" 'm/VERB\n",
|
|
||||||
" (NP a/DET college/NOUN student/NOUN)\n",
|
|
||||||
" at/ADP\n",
|
|
||||||
" (NP UPM/NOUN)\n",
|
|
||||||
" in/ADP\n",
|
|
||||||
" (NP Madrid/NOUN)\n",
|
|
||||||
" and/CONJ\n",
|
|
||||||
" (NP this/DET particular/ADJ monitor/NOUN)\n",
|
|
||||||
" had/VERB\n",
|
|
||||||
" as/ADP\n",
|
|
||||||
" poor/ADJ\n",
|
|
||||||
" of/ADP\n",
|
|
||||||
" (NP picture/NOUN quality/NOUN)\n",
|
|
||||||
" as/ADP\n",
|
|
||||||
" any/DET\n",
|
|
||||||
" I/PRON\n",
|
|
||||||
" 've/VERB\n",
|
|
||||||
" seen/VERB\n",
|
|
||||||
" ./.)\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from nltk.chunk.regexp import *\n",
|
"from nltk.chunk.regexp import *\n",
|
||||||
"pattern = \"\"\"NP: {<PRON><ADJ><NOUN>+} \n",
|
"pattern = \"\"\"NP: {<PRON><ADJ><NOUN>+} \n",
|
||||||
@ -451,37 +257,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 54,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"[Tree('NP', [('this', 'DET'), ('Dell', 'NOUN'), ('monitor', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('budgetary', 'ADJ'), ('concerns', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('This', 'DET'), ('item', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('the', 'DET'), ('most', 'ADV'), ('inexpensive', 'ADJ'), ('17', 'NUM'), ('inch', 'NOUN'), ('Apple', 'NOUN'), ('monitor', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('the', 'DET'), ('time', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('the', 'DET'), ('purchase', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('My', 'PRON'), ('overall', 'ADJ'), ('experience', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('this', 'DET'), ('monitor', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('the', 'DET'), ('screen', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('the', 'DET'), ('overall', 'ADJ'), ('picture', 'NOUN'), ('quality', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('numerous', 'ADJ'), ('different', 'ADJ'), ('monitor', 'NOUN'), ('models', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('a', 'DET'), ('college', 'NOUN'), ('student', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('UPM', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('Madrid', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('this', 'DET'), ('particular', 'ADJ'), ('monitor', 'NOUN')]),\n",
|
|
||||||
" Tree('NP', [('picture', 'NOUN'), ('quality', 'NOUN')])]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 54,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"def extractTrees(parsed_tree, category='NP'):\n",
|
"def extractTrees(parsed_tree, category='NP'):\n",
|
||||||
" return list(parsed_tree.subtrees(filter=lambda x: x.label()==category))\n",
|
" return list(parsed_tree.subtrees(filter=lambda x: x.label()==category))\n",
|
||||||
@ -491,37 +271,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 90,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['this Dell monitor',\n",
|
|
||||||
" 'budgetary concerns',\n",
|
|
||||||
" 'This item',\n",
|
|
||||||
" 'the most inexpensive 17 inch Apple monitor',\n",
|
|
||||||
" 'the time',\n",
|
|
||||||
" 'the purchase',\n",
|
|
||||||
" 'My overall experience',\n",
|
|
||||||
" 'this monitor',\n",
|
|
||||||
" 'the screen',\n",
|
|
||||||
" 'the overall picture quality',\n",
|
|
||||||
" 'numerous different monitor models',\n",
|
|
||||||
" 'a college student',\n",
|
|
||||||
" 'UPM',\n",
|
|
||||||
" 'Madrid',\n",
|
|
||||||
" 'this particular monitor',\n",
|
|
||||||
" 'picture quality']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 90,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"def extractStrings(parsed_tree, category='NP'):\n",
|
"def extractStrings(parsed_tree, category='NP'):\n",
|
||||||
" return [\" \".join(word for word, pos in vp.leaves()) for vp in extractTrees(parsed_tree, category)]\n",
|
" return [\" \".join(word for word, pos in vp.leaves()) for vp in extractTrees(parsed_tree, category)]\n",
|
||||||
@ -529,15 +283,6 @@
|
|||||||
"extractStrings(chunks_np)"
|
"extractStrings(chunks_np)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -587,7 +332,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -59,7 +59,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
@ -109,27 +109,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
|
|
||||||
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
|
|
||||||
" lowercase=True, max_df=1.0, max_features=5000, min_df=1,\n",
|
|
||||||
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
|
|
||||||
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
|
|
||||||
" tokenizer=None, vocabulary=None)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -152,23 +136,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"<3x10 sparse matrix of type '<class 'numpy.int64'>'\n",
|
|
||||||
"\twith 15 stored elements in Compressed Sparse Row format>"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"vectors = vectorizer.fit_transform(documents)\n",
|
"vectors = vectorizer.fit_transform(documents)\n",
|
||||||
"vectors"
|
"vectors"
|
||||||
@ -186,22 +158,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"[[0 1 1 2 0 0 1 2 0 0]\n",
|
|
||||||
" [1 0 0 0 2 0 0 1 2 1]\n",
|
|
||||||
" [1 0 0 0 2 1 0 0 1 1]]\n",
|
|
||||||
"['and', 'but', 'coming', 'is', 'like', 'sandwiches', 'short', 'summer', 'the', 'winter']\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"print(vectors.toarray())\n",
|
"print(vectors.toarray())\n",
|
||||||
"print(vectorizer.get_feature_names())"
|
"print(vectorizer.get_feature_names())"
|
||||||
@ -219,32 +180,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['and',\n",
|
|
||||||
" 'but',\n",
|
|
||||||
" 'coming',\n",
|
|
||||||
" 'i',\n",
|
|
||||||
" 'is',\n",
|
|
||||||
" 'like',\n",
|
|
||||||
" 'sandwiches',\n",
|
|
||||||
" 'short',\n",
|
|
||||||
" 'summer',\n",
|
|
||||||
" 'the',\n",
|
|
||||||
" 'winter']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 7,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words=None, token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
|
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words=None, token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
|
||||||
"vectors = vectorizer.fit_transform(documents)\n",
|
"vectors = vectorizer.fit_transform(documents)\n",
|
||||||
@ -260,22 +200,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 8,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
|
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
|
||||||
"vectors = vectorizer.fit_transform(documents)\n",
|
"vectors = vectorizer.fit_transform(documents)\n",
|
||||||
@ -284,19 +213,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"frozenset({'could', 'sixty', 'onto', 'by', 'against', 'up', 'a', 'everything', 'other', 'otherwise', 'ourselves', 'beside', 'nowhere', 'then', 'below', 'put', 'ten', 'such', 'cannot', 'either', 'due', 'hasnt', 'whereupon', 'were', 'once', 'at', 'for', 'front', 'get', 'whereas', 'that', 'eight', 'another', 'except', 'of', 'wherever', 'over', 'to', 'whom', 'you', 'former', 'behind', 'yours', 'yourself', 'what', 'even', 'however', 'go', 'less', 'bottom', 'may', 'along', 'is', 'can', 'move', 'eg', 'somewhere', 'latterly', 'seemed', 'thence', 'becoming', 'himself', 'whether', 'six', 'first', 'off', 'do', 'many', 'namely', 'never', 'because', 'mostly', 'nevertheless', 'thereupon', 'here', 'least', 'anyone', 'one', 'others', 'cry', 'they', 'thereby', 'ie', 'am', 'this', 'would', 'any', 'while', 'see', 'too', 'your', 'somehow', 'within', 'same', 'sometimes', 'thereafter', 'must', 'take', 're', 'both', 'fill', 'nor', 'sometime', 'he', 'third', 'more', 'also', 'most', 'during', 'much', 'our', 'thick', 'enough', 'full', 'toward', 'with', 'mill', 'anyhow', 'nobody', 'why', 'thru', 'although', 'nothing', 'meanwhile', 'or', 'some', 'ltd', 'wherein', 'thus', 'someone', 'whereby', 'who', 'un', 'are', 'hundred', 'whereafter', 'fire', 'twenty', 'only', 'several', 'among', 'no', 'than', 'before', 'been', 'else', 'find', 'fifteen', 'hence', 'ours', 'already', 'be', 'besides', 'next', 'interest', 'whither', 'whole', 'eleven', 'without', 'five', 'show', 'in', 'throughout', 'own', 'amongst', 'will', 'neither', 'everywhere', 'part', 'give', 'my', 'hers', 'his', 'upon', 'well', 'him', 'yourselves', 'whatever', 'cant', 'though', 'had', 'again', 'every', 'noone', 'top', 'which', 'de', 'almost', 'system', 'under', 'down', 'latter', 'above', 'whence', 'found', 'myself', 'three', 'those', 'become', 'moreover', 'but', 'anyway', 'beyond', 'from', 'now', 'as', 'seeming', 'con', 'themselves', 'hereupon', 'each', 'serious', 'two', 'across', 'out', 'the', 'therein', 'between', 'inc', 'where', 'anything', 'seem', 'co', 'therefore', 'whoever', 'herein', 'about', 'herself', 'should', 'anywhere', 'how', 'we', 'after', 'describe', 'being', 'etc', 'very', 'not', 'an', 'me', 'call', 'per', 'detail', 'still', 'around', 'hereby', 'sincere', 'their', 'has', 'became', 'beforehand', 'everyone', 'hereafter', 'made', 'ever', 'indeed', 'itself', 'something', 'afterwards', 'none', 'done', 'nine', 'alone', 'please', 'its', 'name', 'since', 'on', 'she', 'bill', 'have', 'mine', 'few', 'her', 'seems', 'always', 'side', 'forty', 'further', 'via', 'last', 'amount', 'towards', 'fify', 'through', 'whose', 'couldnt', 'perhaps', 'thin', 'until', 'becomes', 'elsewhere', 'and', 'i', 'them', 'together', 'us', 'was', 'when', 'rather', 'whenever', 'formerly', 'keep', 'so', 'back', 'there', 'amoungst', 'might', 'these', 'all', 'empty', 'often', 'into', 'it', 'twelve', 'yet', 'if', 'four'})\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"#stop words in scikit-learn for English\n",
|
"#stop words in scikit-learn for English\n",
|
||||||
"print(vectorizer.get_stop_words())"
|
"print(vectorizer.get_stop_words())"
|
||||||
@ -304,24 +225,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"array([[1, 0, 0, 1, 2, 0],\n",
|
|
||||||
" [0, 2, 0, 0, 1, 1],\n",
|
|
||||||
" [0, 2, 1, 0, 0, 1]], dtype=int64)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 10,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# Vectors\n",
|
"# Vectors\n",
|
||||||
"f_array = vectors.toarray()\n",
|
"f_array = vectors.toarray()\n",
|
||||||
@ -337,19 +245,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"0.666666666667 1.0 0.166666666667\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from scipy.spatial.distance import cosine\n",
|
"from scipy.spatial.distance import cosine\n",
|
||||||
"d12 = cosine(f_array[0], f_array[1])\n",
|
"d12 = cosine(f_array[0], f_array[1])\n",
|
||||||
@ -374,22 +274,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 12,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 12,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', binary=True) \n",
|
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', binary=True) \n",
|
||||||
"vectors = vectorizer.fit_transform(documents)\n",
|
"vectors = vectorizer.fit_transform(documents)\n",
|
||||||
@ -398,24 +287,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 13,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"array([[1, 0, 0, 1, 1, 0],\n",
|
|
||||||
" [0, 1, 0, 0, 1, 1],\n",
|
|
||||||
" [0, 1, 1, 0, 0, 1]], dtype=int64)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 13,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"vectors.toarray()"
|
"vectors.toarray()"
|
||||||
]
|
]
|
||||||
@ -436,29 +312,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 14,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['coming summer',\n",
|
|
||||||
" 'like sandwiches',\n",
|
|
||||||
" 'like summer',\n",
|
|
||||||
" 'like winter',\n",
|
|
||||||
" 'sandwiches like',\n",
|
|
||||||
" 'summer coming',\n",
|
|
||||||
" 'summer like',\n",
|
|
||||||
" 'summer short']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 14,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', ngram_range=[2,2]) \n",
|
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', ngram_range=[2,2]) \n",
|
||||||
"vectors = vectorizer.fit_transform(documents)\n",
|
"vectors = vectorizer.fit_transform(documents)\n",
|
||||||
@ -467,24 +325,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 15,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"array([[1, 0, 0, 0, 0, 1, 0, 1],\n",
|
|
||||||
" [0, 0, 1, 1, 0, 0, 1, 0],\n",
|
|
||||||
" [0, 1, 0, 1, 1, 0, 0, 0]], dtype=int64)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 15,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"vectors.toarray()"
|
"vectors.toarray()"
|
||||||
]
|
]
|
||||||
@ -505,22 +350,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 16,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 16,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -531,27 +365,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 17,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"array([[ 0.48148213, 0. , 0. , 0.48148213, 0.73235914,\n",
|
|
||||||
" 0. ],\n",
|
|
||||||
" [ 0. , 0.81649658, 0. , 0. , 0.40824829,\n",
|
|
||||||
" 0.40824829],\n",
|
|
||||||
" [ 0. , 0.77100584, 0.50689001, 0. , 0. ,\n",
|
|
||||||
" 0.38550292]])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 17,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"vectors.toarray()"
|
"vectors.toarray()"
|
||||||
]
|
]
|
||||||
@ -565,22 +383,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 30,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 30,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"train = [doc1, doc2, doc3]\n",
|
"train = [doc1, doc2, doc3]\n",
|
||||||
"vectorizer = TfidfVectorizer(analyzer=\"word\", stop_words='english')\n",
|
"vectorizer = TfidfVectorizer(analyzer=\"word\", stop_words='english')\n",
|
||||||
@ -592,28 +399,12 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 31,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"array([[ 0.48148213, 0. , 0. , 0.48148213, 0.73235914,\n",
|
|
||||||
" 0. ],\n",
|
|
||||||
" [ 0. , 0.81649658, 0. , 0. , 0.40824829,\n",
|
|
||||||
" 0.40824829],\n",
|
|
||||||
" [ 0. , 0.77100584, 0.50689001, 0. , 0. ,\n",
|
|
||||||
" 0.38550292]])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 31,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"vectors.toarray()"
|
"vectors.toarray()"
|
||||||
]
|
]
|
||||||
@ -627,22 +418,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 33,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"array([[ 0.38324078, 0.24713249, 0.23336362]])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 33,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -664,22 +444,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 29,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"array([ 0.38324078, 0.24713249, 0.23336362])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 29,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.metrics.pairwise import linear_kernel\n",
|
"from sklearn.metrics.pairwise import linear_kernel\n",
|
||||||
"cosine_similarity = linear_kernel(vector_query, vectors).flatten()\n",
|
"cosine_similarity = linear_kernel(vector_query, vectors).flatten()\n",
|
||||||
@ -734,7 +503,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -74,19 +74,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 93,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.datasets import fetch_20newsgroups\n",
|
"from sklearn.datasets import fetch_20newsgroups\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -100,19 +92,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 106,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"20\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"#Number of categories\n",
|
"#Number of categories\n",
|
||||||
"print(len(newsgroups_train.target_names))"
|
"print(len(newsgroups_train.target_names))"
|
||||||
@ -120,28 +104,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 94,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Category id 4 comp.sys.mac.hardware\n",
|
|
||||||
"Doc A fair number of brave souls who upgraded their SI clock oscillator have\n",
|
|
||||||
"shared their experiences for this poll. Please send a brief message detailing\n",
|
|
||||||
"your experiences with the procedure. Top speed attained, CPU rated speed,\n",
|
|
||||||
"add on cards and adapters, heat sinks, hour of usage per day, floppy disk\n",
|
|
||||||
"functionality with 800 and 1.4 m floppies are especially requested.\n",
|
|
||||||
"\n",
|
|
||||||
"I will be summarizing in the next two days, so please add to the network\n",
|
|
||||||
"knowledge base if you have done the clock upgrade and haven't answered this\n",
|
|
||||||
"poll. Thanks.\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# Show a document\n",
|
"# Show a document\n",
|
||||||
"docid = 1\n",
|
"docid = 1\n",
|
||||||
@ -154,22 +121,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 95,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"(11314,)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 95,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"#Number of files\n",
|
"#Number of files\n",
|
||||||
"newsgroups_train.filenames.shape"
|
"newsgroups_train.filenames.shape"
|
||||||
@ -177,22 +133,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 96,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"(11314, 101323)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 96,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# Obtain a vector\n",
|
"# Obtain a vector\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -206,22 +151,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 97,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"66.80510871486653"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 97,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# The tf-idf vectors are very sparse with an average of 66 non zero components in 101.323 dimensions (.06%)\n",
|
"# The tf-idf vectors are very sparse with an average of 66 non zero components in 101.323 dimensions (.06%)\n",
|
||||||
"vectors_train.nnz / float(vectors_train.shape[0])"
|
"vectors_train.nnz / float(vectors_train.shape[0])"
|
||||||
@ -243,22 +177,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 138,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"0.69545360719001303"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 138,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.naive_bayes import MultinomialNB\n",
|
"from sklearn.naive_bayes import MultinomialNB\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -271,7 +194,7 @@
|
|||||||
"model = MultinomialNB(alpha=.01)\n",
|
"model = MultinomialNB(alpha=.01)\n",
|
||||||
"model.fit(vectors_train, newsgroups_train.target)\n",
|
"model.fit(vectors_train, newsgroups_train.target)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"vectors_test = vectorizer.transform(newsgroups_test.data)\n",
|
||||||
"pred = model.predict(vectors_test)\n",
|
"pred = model.predict(vectors_test)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"metrics.f1_score(newsgroups_test.target, pred, average='weighted')\n"
|
"metrics.f1_score(newsgroups_test.target, pred, average='weighted')\n"
|
||||||
@ -286,20 +209,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 141,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"dimensionality: 101323\n",
|
|
||||||
"density: 1.000000\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.utils.extmath import density\n",
|
"from sklearn.utils.extmath import density\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -309,38 +223,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 140,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"alt.atheism: islam atheists say just religion atheism think don people god\n",
|
|
||||||
"comp.graphics: looking format 3d know program file files thanks image graphics\n",
|
|
||||||
"comp.os.ms-windows.misc: card problem thanks driver drivers use files dos file windows\n",
|
|
||||||
"comp.sys.ibm.pc.hardware: monitor disk thanks pc ide controller bus card scsi drive\n",
|
|
||||||
"comp.sys.mac.hardware: know monitor does quadra simms thanks problem drive apple mac\n",
|
|
||||||
"comp.windows.x: using windows x11r5 use application thanks widget server motif window\n",
|
|
||||||
"misc.forsale: asking email sell price condition new shipping offer 00 sale\n",
|
|
||||||
"rec.autos: don ford new good dealer just engine like cars car\n",
|
|
||||||
"rec.motorcycles: don just helmet riding like motorcycle ride bikes dod bike\n",
|
|
||||||
"rec.sport.baseball: braves players pitching hit runs games game baseball team year\n",
|
|
||||||
"rec.sport.hockey: league year nhl games season players play hockey team game\n",
|
|
||||||
"sci.crypt: people use escrow nsa keys government chip clipper encryption key\n",
|
|
||||||
"sci.electronics: don thanks voltage used know does like circuit power use\n",
|
|
||||||
"sci.med: skepticism cadre dsl banks chastity n3jxp pitt gordon geb msg\n",
|
|
||||||
"sci.space: just lunar earth shuttle like moon launch orbit nasa space\n",
|
|
||||||
"soc.religion.christian: believe faith christian christ bible people christians church jesus god\n",
|
|
||||||
"talk.politics.guns: just law firearms government fbi don weapons people guns gun\n",
|
|
||||||
"talk.politics.mideast: said arabs arab turkish people armenians armenian jews israeli israel\n",
|
|
||||||
"talk.politics.misc: know state clinton president just think tax don government people\n",
|
|
||||||
"talk.religion.misc: think don koresh objective christians bible people christian jesus god\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# We can review the top features per topic in Bayes (attribute coef_)\n",
|
"# We can review the top features per topic in Bayes (attribute coef_)\n",
|
||||||
"import numpy as np\n",
|
"import numpy as np\n",
|
||||||
@ -357,20 +244,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 136,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"[ 3 15]\n",
|
|
||||||
"['comp.sys.ibm.pc.hardware', 'soc.religion.christian']\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# We try the classifier in two new docs\n",
|
"# We try the classifier in two new docs\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -431,7 +309,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -76,22 +76,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"(2034, 2807)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.datasets import fetch_20newsgroups\n",
|
"from sklearn.datasets import fetch_20newsgroups\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -133,7 +122,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
@ -163,7 +152,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 3,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
@ -179,29 +168,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"[(0,\n",
|
|
||||||
" '0.004*objects + 0.004*obtained + 0.003*comets + 0.003*manhattan + 0.003*member + 0.003*beginning + 0.003*center + 0.003*groups + 0.003*aware + 0.003*increased'),\n",
|
|
||||||
" (1,\n",
|
|
||||||
" '0.003*activity + 0.002*objects + 0.002*professional + 0.002*eyes + 0.002*manhattan + 0.002*pressure + 0.002*netters + 0.002*chosen + 0.002*attempted + 0.002*medical'),\n",
|
|
||||||
" (2,\n",
|
|
||||||
" '0.003*mechanism + 0.003*led + 0.003*platform + 0.003*frank + 0.003*mormons + 0.003*aeronautics + 0.002*concepts + 0.002*header + 0.002*forces + 0.002*profit'),\n",
|
|
||||||
" (3,\n",
|
|
||||||
" '0.005*diameter + 0.005*having + 0.004*complex + 0.004*conclusions + 0.004*activity + 0.004*looking + 0.004*action + 0.004*inflatable + 0.004*defined + 0.004*association')]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 7,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# check the topics\n",
|
"# check the topics\n",
|
||||||
"lda.print_topics(4)"
|
"lda.print_topics(4)"
|
||||||
@ -216,7 +187,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
@ -250,19 +221,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Dictionary(10913 unique tokens: ['whose', 'used', 'hoc', 'transfinite', 'newtek']...)\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# You can save the dictionary\n",
|
"# You can save the dictionary\n",
|
||||||
"dictionary.save('newsgroup.dict')\n",
|
"dictionary.save('newsgroup.dict')\n",
|
||||||
@ -272,7 +235,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
@ -285,7 +248,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
@ -299,7 +262,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 12,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": true
|
"collapsed": true
|
||||||
},
|
},
|
||||||
@ -313,19 +276,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 13,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Dictionary(10913 unique tokens: ['whose', 'used', 'hoc', 'transfinite', 'newtek']...)\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# We can print the dictionary, it is a mappying of id and tokens\n",
|
"# We can print the dictionary, it is a mappying of id and tokens\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -334,7 +289,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 14,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": true
|
"collapsed": true
|
||||||
},
|
},
|
||||||
@ -346,7 +301,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 15,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
@ -361,19 +316,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 16,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"[(0, 0.1598114653031772), (1, 0.10438175896914427), (2, 0.5700978153855775), (3, 0.24093628445650234), (4, 0.722808853369507), (5, 0.24093628445650234)]\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"#print tf-idf of first document\n",
|
"#print tf-idf of first document\n",
|
||||||
"print(corpus_tfidf[0])"
|
"print(corpus_tfidf[0])"
|
||||||
@ -381,7 +328,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 17,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
@ -396,29 +343,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 18,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"[(0,\n",
|
|
||||||
" '0.010*targa + 0.007*ns + 0.006*thanks + 0.006*davidian + 0.006*ssrt + 0.006*yayayay + 0.005*craig + 0.005*bull + 0.005*gerald + 0.005*sorry'),\n",
|
|
||||||
" (1,\n",
|
|
||||||
" '0.011*god + 0.010*mary + 0.008*baptist + 0.008*islam + 0.006*zoroastrians + 0.006*joseph + 0.006*lucky + 0.006*khomeini + 0.006*samaritan + 0.005*crusades'),\n",
|
|
||||||
" (2,\n",
|
|
||||||
" '0.007*koresh + 0.007*moon + 0.007*western + 0.006*plane + 0.006*jeff + 0.006*unix + 0.005*bible + 0.005*also + 0.005*basically + 0.005*bob'),\n",
|
|
||||||
" (3,\n",
|
|
||||||
" '0.011*whatever + 0.008*joy + 0.007*happy + 0.006*virtual + 0.006*reality + 0.004*really + 0.003*samuel___ + 0.003*oh + 0.003*virtually + 0.003*toaster')]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 18,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# check the topics\n",
|
"# check the topics\n",
|
||||||
"lda_model.print_topics(4)"
|
"lda_model.print_topics(4)"
|
||||||
@ -426,19 +355,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 19,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"[(0, 0.085176135689180726), (1, 0.6919655173835938), (2, 0.1377903468164027), (3, 0.0850680001108228)]\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# check the lsa vector for the first document\n",
|
"# check the lsa vector for the first document\n",
|
||||||
"corpus_lda = lda_model[corpus_tfidf]\n",
|
"corpus_lda = lda_model[corpus_tfidf]\n",
|
||||||
@ -447,19 +368,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 20,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"[('lord', 1), ('god', 2)]\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"#predict topics of a new doc\n",
|
"#predict topics of a new doc\n",
|
||||||
"new_doc = \"God is love and God is the Lord\"\n",
|
"new_doc = \"God is love and God is the Lord\"\n",
|
||||||
@ -470,19 +383,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 21,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"[(0, 0.062509420435514051), (1, 0.81246608790618835), (2, 0.062508281488992554), (3, 0.062516210169305114)]\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"#transform into LDA space\n",
|
"#transform into LDA space\n",
|
||||||
"lda_vector = lda_model[bow_vector]\n",
|
"lda_vector = lda_model[bow_vector]\n",
|
||||||
@ -491,19 +396,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 22,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"0.011*god + 0.010*mary + 0.008*baptist + 0.008*islam + 0.006*zoroastrians + 0.006*joseph + 0.006*lucky + 0.006*khomeini + 0.006*samaritan + 0.005*crusades\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# print the document's single most prominent LDA topic\n",
|
"# print the document's single most prominent LDA topic\n",
|
||||||
"print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))"
|
"print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))"
|
||||||
@ -511,20 +408,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 23,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"[(0, 0.10392179866025079), (1, 0.68822094221870811), (2, 0.10391916429993264), (3, 0.10393809482110833)]\n",
|
|
||||||
"0.011*god + 0.010*mary + 0.008*baptist + 0.008*islam + 0.006*zoroastrians + 0.006*joseph + 0.006*lucky + 0.006*khomeini + 0.006*samaritan + 0.005*crusades\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"lda_vector_tfidf = lda_model[tfidf_model[bow_vector]]\n",
|
"lda_vector_tfidf = lda_model[tfidf_model[bow_vector]]\n",
|
||||||
"print(lda_vector_tfidf)\n",
|
"print(lda_vector_tfidf)\n",
|
||||||
@ -541,7 +429,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 25,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
@ -559,29 +447,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 26,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"[(0,\n",
|
|
||||||
" '0.769*\"god\" + 0.346*\"jesus\" + 0.235*\"bible\" + 0.204*\"christian\" + 0.149*\"christians\" + 0.107*\"christ\" + 0.090*\"well\" + 0.085*\"koresh\" + 0.081*\"kent\" + 0.080*\"christianity\"'),\n",
|
|
||||||
" (1,\n",
|
|
||||||
" '-0.863*\"thanks\" + -0.255*\"please\" + -0.159*\"hello\" + -0.153*\"hi\" + 0.123*\"god\" + -0.112*\"sorry\" + -0.087*\"could\" + -0.074*\"windows\" + -0.067*\"jpeg\" + -0.063*\"vga\"'),\n",
|
|
||||||
" (2,\n",
|
|
||||||
" '0.780*\"well\" + -0.229*\"god\" + 0.165*\"yes\" + -0.153*\"thanks\" + 0.133*\"ico\" + 0.133*\"tek\" + 0.130*\"bronx\" + 0.130*\"beauchaine\" + 0.130*\"queens\" + 0.129*\"manhattan\"'),\n",
|
|
||||||
" (3,\n",
|
|
||||||
" '0.340*\"well\" + -0.335*\"ico\" + -0.334*\"tek\" + -0.328*\"beauchaine\" + -0.328*\"bronx\" + -0.328*\"queens\" + -0.326*\"manhattan\" + -0.305*\"bob\" + -0.305*\"com\" + -0.072*\"god\"')]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 26,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# check the topics\n",
|
"# check the topics\n",
|
||||||
"lsi_model.print_topics(4)"
|
"lsi_model.print_topics(4)"
|
||||||
@ -589,19 +459,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 27,
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"[(0, 0.1598114653031772), (1, 0.10438175896914427), (2, 0.5700978153855775), (3, 0.24093628445650234), (4, 0.722808853369507), (5, 0.24093628445650234)]\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# check the lsi vector for the first document\n",
|
"# check the lsi vector for the first document\n",
|
||||||
"print(corpus_tfidf[0])"
|
"print(corpus_tfidf[0])"
|
||||||
@ -655,7 +517,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1"
|
"version": "3.5.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
Loading…
Reference in New Issue
Block a user