@ -67,7 +67,7 @@
},
{
"cell_type": "code",
"execution_count": 36 ,
"execution_count": null ,
"metadata": {
"collapsed": false
},
@ -102,21 +102,22 @@
"\n",
"\n",
"In order to use nltk, we should download first the lexical resources we are going to use. We can updated them later. For this, you need:\n",
"* install nltk. Execute 'conda install nltk'\n",
"* import nltk\n",
"* Run *nltk.download()* (the first time we use it). A window will appear. You should select just the corpus 'book' and press download.\n",
"If you inspect the window, you can get an overview of available lexical resources (corpora, lexicons and grammars). For example, you can find some relevant sentiment lexicons in corpora (SentiWordNet, Sentence Polarity Dataset, Vader, Opinion Lexicon or VADER Sentiment Lexicon)."
"If you inspect the window, you can get an overview of available lexical resources (corpora, lexicons and grammars). For example, you can find some relevant sentiment lexicons in corpora (SentiWordNet, Sentence Polarity Dataset, Vader, Opinion Lexicon or VADER Sentiment Lexicon). Don't forget to close the window once the data has been downloaded. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": tru e
"collapsed": fals e
},
"outputs": [],
"source": [
"import nltk\n",
"nt lk.download()"
"nlt k.download()"
]
},
{
@ -169,19 +170,11 @@
},
{
"cell_type": "code",
"execution_count": 22 ,
"execution_count": null ,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['I purchased this monitor because of budgetary concerns.', 'This item was the most inexpensive 17 inch monitor \\navailable to me at the time I made the purchase.', 'My overall experience with this monitor was very poor.', \"When the \\nscreen wasn't contracting or glitching the overall picture quality was poor to fair.\", \"I've viewed numerous different \\nmonitor models since I 'm a college student and this particular monitor had as poor of picture quality as \\nany I 've seen.\"]\n"
]
}
],
"outputs": [],
"source": [
"from nltk.tokenize import sent_tokenize, word_tokenize\n",
"\n",
@ -205,20 +198,12 @@
},
{
"cell_type": "code",
"execution_count": 23 ,
"execution_count": null ,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['I', 'purchased', 'this', 'monitor', 'because', 'of', 'budgetary', 'concerns', '.'], ['This', 'item', 'was', 'the', 'most', 'inexpensive', '17', 'inch', 'monitor', 'available', 'to', 'me', 'at', 'the', 'time', 'I', 'made', 'the', 'purchase', '.'], ['My', 'overall', 'experience', 'with', 'this', 'monitor', 'was', 'very', 'poor', '.'], ['When', 'the', 'screen', 'was', \"n't\", 'contracting', 'or', 'glitching', 'the', 'overall', 'picture', 'quality', 'was', 'poor', 'to', 'fair', '.'], ['I', \"'ve\", 'viewed', 'numerous', 'different', 'monitor', 'models', 'since', 'I', \"'m\", 'a', 'college', 'student', 'and', 'this', 'particular', 'monitor', 'had', 'as', 'poor', 'of', 'picture', 'quality', 'as', 'any', 'I', \"'ve\", 'seen', '.']]\n"
]
}
],
"outputs": [],
"source": [
"words = [word_tokenize(t) for t in sent_tokenize(review)]\n",
"print(words)"
@ -233,19 +218,11 @@
},
{
"cell_type": "code",
"execution_count": 20 ,
"execution_count": null ,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['I', 'purchased', 'this', 'monitor', 'because', 'of', 'budgetary', 'concerns', '.', 'This', 'item', 'was', 'the', 'most', 'inexpensive', '17', 'inch', 'monitor', 'available', 'to', 'me', 'at', 'the', 'time', 'I', 'made', 'the', 'purchase', '.', 'My', 'overall', 'experience', 'with', 'this', 'monitor', 'was', 'very', 'poor', '.', 'When', 'the', 'screen', 'was', \"n't\", 'contracting', 'or', 'glitching', 'the', 'overall', 'picture', 'quality', 'was', 'poor', 'to', 'fair', '.', 'I', \"'ve\", 'viewed', 'numerous', 'different', 'monitor', 'models', 'since', 'I', \"'m\", 'a', 'college', 'student', 'and', 'this', 'particular', 'monitor', 'had', 'as', 'poor', 'of', 'picture', 'quality', 'as', 'any', 'I', \"'ve\", 'seen', '.']\n"
]
}
],
"outputs": [],
"source": [
"words = word_tokenize(review)\n",
"print(words)"
@ -261,20 +238,11 @@
},
{
"cell_type": "code",
"execution_count": 35 ,
"execution_count": null ,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"With TweetTokenizer Lady Gaga is actually at the Britney Spears Femme Fatale Concert tonight ! ! ! She still listens to her music ! ! ! WOW ! ! ! #ladygaga #britney\n",
"With word_tokenizer @ concert Lady Gaga is actually at the Britney Spears Femme Fatale Concert tonight ! ! ! She still listens to her music ! ! ! ! WOW ! ! ! # ladygaga # britney\n"
]
}
],
"outputs": [],
"source": [
"from nltk.tokenize import TweetTokenizer\n",
"tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)\n",
@ -299,26 +267,11 @@
},
{
"cell_type": "code",
"execution_count": 75 ,
"execution_count": null ,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Porter: boy children are have is ha Madrid\n",
"Execution time: 0.00093841552734375\n",
"Lancaster: boy childr ar hav is has madrid\n",
"Execution time: 0.0014300346374511719\n",
"WordNet: boy child are have is ha Madrid\n",
"Execution time: 0.0008304119110107422\n",
"SnowBall: boy children are have is has madrid\n",
"Execution time: 0.0017843246459960938\n"
]
}
],
"outputs": [],
"source": [
"from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer\n",
"from nltk.stem.snowball import EnglishStemmer\n",
@ -362,19 +315,11 @@
},
{
"cell_type": "code",
"execution_count": 74 ,
"execution_count": null ,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WordNet: be cry be have have\n"
]
}
],
"outputs": [],
"source": [
"verbs = \"are crying is have has\"\n",
"print(\"WordNet: \" + \" \".join([wordnet.lemmatize(w, pos='v') for w in word_tokenize(verbs)]))\n"
@ -395,20 +340,11 @@
},
{
"cell_type": "code",
"execution_count": 82 ,
"execution_count": null ,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['i', 'purchas', 'thi', 'monitor', 'becaus', 'of', 'budgetari', 'concern', '.', 'thi', 'item', 'wa', 'the', 'most', 'inexpens', '17', 'inch', 'monitor', 'avail', 'to', 'me', 'at', 'the', 'time', 'i', 'made', 'the', 'purchas', '.', 'my', 'overal', 'experi', 'with', 'thi', 'monitor', 'wa', 'veri', 'poor', '.', 'when', 'the', 'screen', 'wa', \"n't\", 'contract', 'or', 'glitch', 'the', 'overal', 'pictur', 'qualiti', 'wa', 'poor', 'to', 'fair', '.', 'i', \"'ve\", 'view', 'numer', 'differ', 'monitor', 'model', 'sinc', 'i', \"'m\", 'a', 'colleg', 'student', 'and', 'thi', 'particular', 'monitor', 'had', 'as', 'poor', 'of', 'pictur', 'qualiti', 'as', 'ani', 'i', \"'ve\", 'seen', '.']\n",
"['Ladi', 'Gaga', 'is', 'actual', 'at', 'the', 'Britney', 'Spear', 'Femm', 'Fatal', 'Concert', 'tonight', '!', '!', '!', 'She', 'still', 'listen', 'to', 'her', 'music', '!', '!', '!', 'WOW', '!', '!', '!', '#ladygaga', '#britney']\n"
]
}
],
"outputs": [],
"source": [
"def preprocess(words, type='doc'):\n",
" if (type == 'tweet'):\n",
@ -439,19 +375,11 @@
},
{
"cell_type": "code",
"execution_count": 86 ,
"execution_count": null ,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn']\n"
]
}
],
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
"\n",
@ -461,20 +389,11 @@
},
{
"cell_type": "code",
"execution_count": 89 ,
"execution_count": null ,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['purchas', 'thi', 'monitor', 'becaus', 'budgetari', 'concern', '.', 'thi', 'item', 'wa', 'inexpens', '17', 'inch', 'monitor', 'avail', 'time', 'made', 'purchas', '.', 'overal', 'experi', 'thi', 'monitor', 'wa', 'veri', 'poor', '.', 'screen', 'wa', \"n't\", 'contract', 'glitch', 'overal', 'pictur', 'qualiti', 'wa', 'poor', 'fair', '.', \"'ve\", 'view', 'numer', 'differ', 'monitor', 'model', 'sinc', \"'m\", 'colleg', 'student', 'thi', 'particular', 'monitor', 'poor', 'pictur', 'qualiti', 'ani', \"'ve\", 'seen', '.']\n",
"['Ladi', 'Gaga', 'actual', 'Britney', 'Spear', 'Femm', 'Fatal', 'Concert', 'tonight', '!', '!', '!', 'She', 'still', 'listen', 'music', '!', '!', '!', 'WOW', '!', '!', '!', '#ladygaga', '#britney']\n"
]
}
],
"outputs": [],
"source": [
"def preprocess(words, type='doc'):\n",
" if (type == 'tweet'):\n",
@ -508,20 +427,11 @@
},
{
"cell_type": "code",
"execution_count": 104 ,
"execution_count": null ,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['purchas', 'thi', 'monitor', 'becaus', 'budgetari', 'concern', 'thi', 'item', 'wa', 'inexpens', '17', 'inch', 'monitor', 'avail', 'time', 'made', 'purchas', 'overal', 'experi', 'thi', 'monitor', 'wa', 'veri', 'poor', 'screen', 'wa', \"n't\", 'contract', 'glitch', 'overal', 'pictur', 'qualiti', 'wa', 'poor', 'fair', \"'ve\", 'view', 'numer', 'differ', 'monitor', 'model', 'sinc', \"'m\", 'colleg', 'student', 'thi', 'particular', 'monitor', 'poor', 'pictur', 'qualiti', 'ani', \"'ve\", 'seen']\n",
"['Ladi', 'Gaga', 'actual', 'Britney', 'Spear', 'Femm', 'Fatal', 'Concert', 'tonight', 'She', 'still', 'listen', 'music', 'WOW', '#ladygaga', '#britney']\n"
]
}
],
"outputs": [],
"source": [
"import string\n",
"\n",
@ -563,22 +473,11 @@
},
{
"cell_type": "code",
"execution_count": 124 ,
"execution_count": null ,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Most frequent\n",
"[('.', 5), ('I', 5), ('the', 5), ('monitor', 5), ('was', 4), ('this', 3), ('poor', 3), ('as', 2), ('overall', 2), ('picture', 2)]\n",
"Least frequent\n",
"['models', '17', 'purchase', 'different', 'most', \"n't\", 'monitor', \"'m\", 'was', 'My']\n"
]
}
],
"outputs": [],
"source": [
"frec = nltk.FreqDist(nltk.word_tokenize(review))\n",
"print(\"Most frequent\")\n",
@ -636,7 +535,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1 "
"version": "3.5.2 "
}
},
"nbformat": 4,