mirror of https://github.com/balkian/gists
master
commit
6b2d0f3a6e
@ -0,0 +1,596 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Download DepecheMood from GitHub"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "SyntaxError",
|
||||||
|
"evalue": "invalid syntax (<ipython-input-15-537542d5c5d8>, line 1)",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[1;36m File \u001b[1;32m\"<ipython-input-15-537542d5c5d8>\"\u001b[1;36m, line \u001b[1;32m1\u001b[0m\n\u001b[1;33m wget https://github.com/marcoguerini/DepecheMood/releases/download/v1.0/DepecheMood_V1.0.zip -O latest-DepecheMood.zip\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!wget https://github.com/marcoguerini/DepecheMood/releases/download/v1.0/DepecheMood_V1.0.zip -O latest-DepecheMood.zip\n",
|
||||||
|
"!yes 2>/dev/null | unzip latest-DepecheMood.zip"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 283,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"DepecheMood_freq.txt DepecheMood_normfreq.txt DepecheMood_tfidf.tsv DepecheMood_tfidf.txt README.txt\r\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"ls DepecheMood_V1.0/"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 150,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"This package contains the three versions of the Lexicon 'DepecheMood' as described in the paper: \r\n",
|
||||||
|
"\r\n",
|
||||||
|
"Staiano, J., & Guerini, M. (2014). \"DepecheMood: a Lexicon for Emotion Analysis from Crowd-Annotated News\". Proceedings of ACL-2014. \r\n",
|
||||||
|
"\r\n",
|
||||||
|
"Each version of DepecheMood is built starting from word-by-document matrices either using raw frequencies (DepecheMood_freq.txt), normalized frequencies (DepecheMood_normfreq.txt) or tf-idf (DepecheMood_tfidf.txt). \r\n",
|
||||||
|
"\r\n",
|
||||||
|
"The files are tab-separated; each row contains one Lemma#PoS followed by the scores for the following emotions: AFRAID, AMUSED, ANGRY, ANNOYED, DONT_CARE, HAPPY, INSPIRED, SAD.\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"This resource is freely available for research purposes. \r\n",
|
||||||
|
"\r\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"cat DepecheMood_V1.0/README.txt"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Lemma#PoS\r\n",
|
||||||
|
"50#n\r\n",
|
||||||
|
"500#n\r\n",
|
||||||
|
"50th#a\r\n",
|
||||||
|
"55th#a\r\n",
|
||||||
|
"5th#a\r\n",
|
||||||
|
"6#n\r\n",
|
||||||
|
"60#n\r\n",
|
||||||
|
"60th#a\r\n",
|
||||||
|
"64th#a\r\n",
|
||||||
|
"65th#a\r\n",
|
||||||
|
"6th#a\r\n",
|
||||||
|
"7#n\r\n",
|
||||||
|
"70#n\r\n",
|
||||||
|
"70th#a\r\n",
|
||||||
|
"75th#a\r\n",
|
||||||
|
"78#n\r\n",
|
||||||
|
"7th#a\r\n",
|
||||||
|
"8#n\r\n",
|
||||||
|
"80#n\r\n",
|
||||||
|
"80th#a\r\n",
|
||||||
|
"85th#a\r\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!sed -n '1p;100,120p' DepecheMood_V1.0/DepecheMood_tfidf.txt | cut -f 1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import nltk\n",
|
||||||
|
"import pandas as pd"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 398,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"lexicon = pd.DataFrame.from_csv(\"DepecheMood_V1.0/DepecheMood_normfreq.txt\", sep=\"\\t\", index_col=False)\n",
|
||||||
|
"s = lexicon[\"Lemma#PoS\"].str.split(\"#\")\n",
|
||||||
|
"lexicon[\"Lemma\"] = s.str[0]\n",
|
||||||
|
"lexicon[\"PoS\"] = s.str[1]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 399,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"scrolled": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 399,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"s=lexicon.index.to_series()\n",
|
||||||
|
"s.str.contains(\"#\", False).count() # All words contain # (POS tag)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 400,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"array(['n', 'a', 'r', 'v'], dtype=object)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 400,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"lexicon[\"PoS\"].unique() # According to the paper, there's verbs (v), nouns (n), adjectives (a) and adverbs (r)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 435,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#lexicon[lexicon.groupby(\"Lemma\")[\"PoS\"].transform(len) > 1] # Getting lemmas with different PoS"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 436,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"lexdict = {}\n",
|
||||||
|
"header = []\n",
|
||||||
|
"with open(\"DepecheMood_V1.0/DepecheMood_tfidf.tsv\") as f:\n",
|
||||||
|
" header = f.next().strip().split()\n",
|
||||||
|
" for l in f:\n",
|
||||||
|
" temp = l.strip().split(\"\\t\")\n",
|
||||||
|
" lexdict[temp[0]] = map(float, temp[1:])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Compare performance"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Pandas is great for analysis, but for sentiment analysis we need something that is quick at finding entries in the lexicon and calculating the values of each emotion. Hence, we need to compare pandas with a plain dictionary for this task."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 438,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import random\n",
|
||||||
|
"keys = lexdict.keys()\n",
|
||||||
|
"random.shuffle(keys)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 439,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"10 loops, best of 3: 16.9 ms per loop\n",
|
||||||
|
"10 loops, best of 3: 35.9 ms per loop\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"def search_all(d):\n",
|
||||||
|
" for i in keys:\n",
|
||||||
|
" if i in d:\n",
|
||||||
|
" ja = d[i][0] + d[i][1]\n",
|
||||||
|
" \n",
|
||||||
|
"%timeit search_all(lexdict)\n",
|
||||||
|
"%timeit search_all(lexicon)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 440,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"10 loops, best of 3: 34.6 ms per loop\n",
|
||||||
|
"10 loops, best of 3: 39 ms per loop\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"def search_all(d):\n",
|
||||||
|
" for i in keys:\n",
|
||||||
|
" if i in d:\n",
|
||||||
|
" ja = 0\n",
|
||||||
|
" for j in d[i]:\n",
|
||||||
|
" ja += j\n",
|
||||||
|
" \n",
|
||||||
|
"%timeit search_all(lexdict)\n",
|
||||||
|
"%timeit search_all(lexicon.ix[:, lexicon.columns - [\"Lemma#PoS\", \"Lemma\", \"PoS\"]])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Apparently, it is slightly better to use a python dictionary. Plus, we avoid dependencies."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Sentiment Analysis"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 441,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"mapping = {\n",
|
||||||
|
" \n",
|
||||||
|
"#CC - Coordinating conjunction\n",
|
||||||
|
"\"CD\": \"n\", # - Cardinal number\n",
|
||||||
|
"#DT - Determiner\n",
|
||||||
|
"#EX - Existential there\n",
|
||||||
|
"\"FW\": \"n\", # - Foreign word\n",
|
||||||
|
"#IN #- Preposition or subordinating conjunction\n",
|
||||||
|
"\"JJ\": \"a\", #- Adjective\n",
|
||||||
|
"\"JJR\": \"a\", # - Adjective, comparative\n",
|
||||||
|
"\"JJS\": \"a\", # - Adjective, superlative\n",
|
||||||
|
"#LS - List item marker\n",
|
||||||
|
"#MD - Modal\n",
|
||||||
|
"\"NN\": \"n\", # - Noun, singular or mass\n",
|
||||||
|
"\"NNS\": \"n\", # - Noun, plural\n",
|
||||||
|
"\"NNP\": \"n\", # - Proper noun, singular\n",
|
||||||
|
"\"NNPS\": \"n\", # - Proper noun, plural\n",
|
||||||
|
"#PDT - Predeterminer\n",
|
||||||
|
"#POS - Possessive ending\n",
|
||||||
|
"\"PRP\": \"n\", # - Personal pronoun\n",
|
||||||
|
"#PRP$ - Possessive pronoun (prolog version PRP-S)\n",
|
||||||
|
"\"RB\": \"r\", # - Adverb\n",
|
||||||
|
"\"RBR\": \"r\", # - Adverb, comparative\n",
|
||||||
|
"\"RBS\": \"r\", #- Adverb, superlative\n",
|
||||||
|
"#RP - Particle\n",
|
||||||
|
"#SYM - Symbol\n",
|
||||||
|
"#TO - to\n",
|
||||||
|
"#\"UH\": \"n\", - Interjection\n",
|
||||||
|
"\"VB\": \"v\", # - Verb, base form\n",
|
||||||
|
"\"VBD\": \"v\", # - Verb, past tense\n",
|
||||||
|
"\"VBG\": \"v\", #- Verb, gerund or present participle\n",
|
||||||
|
"\"VBN\": \"v\", #- Verb, past participle\n",
|
||||||
|
"\"VBP\": \"v\", #- Verb, non-3rd person singular present\n",
|
||||||
|
"\"VBZ\": \"v\", #- Verb, 3rd person singular present\n",
|
||||||
|
"#\"WDT\": \"n\" # - Wh-determiner\n",
|
||||||
|
"#\"WP: - Wh-pronoun\n",
|
||||||
|
"#WP$ - Possessive wh-pronoun (prolog version WP-S)\n",
|
||||||
|
"\"WRB\": \"r\", # - Wh-adverb\n",
|
||||||
|
" }\n",
|
||||||
|
"def simplify(tag):\n",
|
||||||
|
" return mapping.get(tag, \"n\")\n",
|
||||||
|
"\n",
|
||||||
|
"#Alternative:\n",
|
||||||
|
"#from nltk.tag.simplify import simplify_wsj_tag"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 442,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"scrolled": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"image/png": [
|
||||||
|
"iVBORw0KGgoAAAANSUhEUgAAAW8AAAExCAYAAACzuCOTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\n",
|
||||||
|
"AAALEgAACxIB0t1+/AAAF+hJREFUeJzt3Xu4HHV9x/H3ISd4w4AR1ApiqlKJF7xjqFajWAVspcUq\n",
|
||||||
|
"iFXjFapovSN9Wom1rbXeeBAe5FEs2lapt1ovFLxxoLWiVSGAJMotj0RRAS+AYk3K6R/fWc5kM7tn\n",
|
||||||
|
"z8nOzu+bvF/PM2FmZ9n9ZjPnc2Z/v9/8BiRJkiRJkiRJkiRJkiRJ0k7ug8CPgUuHPOdk4ApgHfCI\n",
|
||||||
|
"SRQlSRru94hAHhTehwFnV+uPBS6cRFGSpPmtYHB4vw84sra9Abhn2wVJ0s5slzG8xt7AtbXtTcA+\n",
|
||||||
|
"Y3hdSdIA02N6nam+7dmG51wJ3H9M7ydJO4t1wMMX+z+vYHizyVG17UHNJk2BXqq1XRewAGu7LmCB\n",
|
||||||
|
"1nb8/rMwO+Jy4gKe2/nxvbbj91+ItV0XsABruy6AAcfWOJpNPgM8v1pfBfycGJ0iSWrJKM0mHwWe\n",
|
||||||
|
"COxJtG2fCCyt9p1OjDQ5jGgW+SXwwvGXKUnqStdfKxdiddcFLMDqrgtYoNUdv/8CmkLOy9Rssrrj\n",
|
||||||
|
"91+I1V0XsACruy6AAcdWf0dj2wVM8v2kJrPt5OzU7X9IY9aYneNo85YkTZjhLUkJGd6SlJDhLUkJ\n",
|
||||||
|
"Gd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6S\n",
|
||||||
|
"lJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDh\n",
|
||||||
|
"LUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlNAo4X0IsAG4Aji+Yf+ewDnAxcBlwJpxFSdJWpwlwJXA\n",
|
||||||
|
"CmApEdAr+56zFnhbtb4ncCMw3fBas61UKC3MLMy2sHh8qzWNx9Z8Z94HEuG9EdgMnAUc3vec64Bl\n",
|
||||||
|
"1foyIry3LLZKSdL8ms6Q6/YGrq1tbwIe2/ec9wNfAX4I3BV49tiqkyQ1mi+8R/kq+BdEc8pq4P7A\n",
|
||||||
|
"F4GHATc3PHdtbX2mWiRJc1ZXy3ZZRXRG9pzAtp2WZwOPq21/GXh0w2vZJqgS2OatbBbV5v1NYD+i\n",
|
||||||
|
"w3JX4EjgM33P2QA8pVq/J/BA4OrFVilJGo9Dge8SHZcnVI8dUy0QI0w+C6wDLgWOHvA6npmoBJ55\n",
|
||||||
|
"K5vGY2tqwgVM8v2kJrPt5OzU7X9IY9aYnV5hKUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJ\n",
|
||||||
|
"Gd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6S\n",
|
||||||
|
"lJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDh\n",
|
||||||
|
"LUkJGd6SlJDhLUkJjRLehwAbgCuA4wc8ZzVwEXAZMDOOwiRJi7cEuBJYASwFLgZW9j1nD+A7wD7V\n",
|
||||||
|
"9p4DXmu2hfqkhZqF2RYWj2+1pvHYmu/M+0AivDcCm4GzgMP7nnM08ElgU7V9w6JLlCSNZL7w3hu4\n",
|
||||||
|
"tra9qXqsbj9gOXAe8E3geWOrTpLUaHqe/aN8FVwKPBI4GLgz8DXgQqKNXJLUgvnC+wfAfWrb92Gu\n",
|
||||||
|
"eaTnWqKp5NZquQB4GM3hvba2PoOdm5LUb3W1bJdp4Cqiw3JXmjss9we+RHRu3hm4FHhQw2vZoaMS\n",
|
||||||
|
"2GGpbBZ9bB0KfJfouDyheuyYaul5PTHi5FLgVeMuQBojw1vZNB5bUxMuYJLvJzWZbSdnp27/Qxqz\n",
|
||||||
|
"xuz0CktJSsjwlqSEDG9JSsjwlqSEDG9JSsjwlqSEDG9JSsjwlqSEDG9JSsjwlqSEDG9JSsjwlqSE\n",
|
||||||
|
"DG9JSsjwlqSEDG9JSsjwlqSEDG9JSsjwlqSEDG+NwfRNxK2axrhM3zTZv4OUi/ew1Di0cF/I1u4J\n",
|
||||||
|
"6T0slY33sJSkHYXhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDh\n",
|
||||||
|
"LUkJGd6SlJDhLUkJGd6SlJDhLUkJjRLehwAbgCuA44c87zHAFuCIMdQlSdoOS4ArgRXAUuBiYOWA\n",
|
||||||
|
"530F+BzwzAGv1cbtS1SGWZgd89La8dJCra3WKzUeW/OdeR9IhPdGYDNwFnB4w/NeCXwCuH7x9UmS\n",
|
||||||
|
"RjVfeO8NXFvb3lQ91v+cw4HTqm3PQCSpZdPz7B8liE8C3sTcTTKH3YR1bW19plomZPom2HLXFl73\n",
|
||||||
|
"ZtiybPyvK2kntbpatssq4Jza9gls22l5NXBNtdwM/Bh4RsNrdX1GbltnezJ9rh4HymZRx9Y0cBXR\n",
|
||||||
|
"Ybkrgzsse/6RwaNNuj64/aFtT6bP1eNA2TQeW/M1m2wBjgPOJUaUnAGsB46p9p8+ruokSaMb1j49\n",
|
||||||
|
"brMTfr+G92/j5Gjq9j92Yi18tq19rh4HyqYxO73CUpISMrwlKSHDW5ISMrwlKSHDW5ISMrwlKSHD\n",
|
||||||
|
"W5ISMrwlKSHDW5ISMrwlKSHDW5ISMrwlKSHDW5ISMrwlKSHDW5ISMrwlKSHDW5ISMrwlKSHDW5IS\n",
|
||||||
|
"MryLNX0Tce+6MS7TN0327yCpLd6AeLtlulGutZLvOJC8AbEk7SgMb0lKyPCWpIQMb0lKyPCWpIQM\n",
|
||||||
|
"b0lKyPCWpIQMb0lKyPCWpIQMb0lKyPCWpIQMb0lKyPCWpIQMb0lKyPCWpIRGDe9DgA3AFcDxDfuf\n",
|
||||||
|
"C6wDLgG+ChwwluokSYu2BLgSWAEsBS4GVvY95yBg92r9EODChtdpYwb8hZiF2RaW1v5e1pqm1lbr\n",
|
||||||
|
"lRqPrVHOvA8kwnsjsBk4Czi87zlfA35RrX8d2GdRJUqSRjJKeO8NXFvb3lQ9NsiLgbO3pyhJ0nDT\n",
|
||||||
|
"IzxnIV8HnwS8CHjcgP1ra+sz1SJJmrO6WrbbKuCc2vYJNHdaHkA0rzxgwOt03SaYra3TWtPUapu3\n",
|
||||||
|
"WrXoY2sauIrosNyV5g7LfYngXtVGAWOS7YfWWtPUanirVY3H1ijNJluA44BziZEnZwDrgWOq/acD\n",
|
||||||
|
"bwbuBpxWPbaZ6OiUJLVgaoLvNTvh92t4/zZOjqZu/2PMWqjXWsl3HEiN2ekVlpKUkOEtSQkZ3pKU\n",
|
||||||
|
"kOEtFWv6JqK9c8zL9E0T/WuoFXZYbrdMHWvWSq7jIFOtao8dlpK0ozC8JSkhw1uSEjK8JSkhw1uS\n",
|
||||||
|
"EjK8JSkhw1uSEjK8JSkhw1uSEjK8JSkhw1uSEjK8JSkhw1uSEjK8JSkhw1vSTqiNudInO0+683lv\n",
|
||||||
|
"t0zzTlsruY6DTLVmk+yYdT5vSdohGN6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJ\n",
|
||||||
|
"Gd6SlJDhLUkJGd6SxqCNiZ4mP9lTJk5Mtd0yTaBkreQ6Dqw11XHgxFSSpHmMEt6HABuAK4DjBzzn\n",
|
||||||
|
"5Gr/OuAR4ylNkjTIfOG9BDiFCPAHAc8BVvY95zDgAcB+wMuA08ZcYwdmui5gAWa6LmCBZrouYAFm\n",
|
||||||
|
"ui5gAWa6LmABZrouYAFmui5goPnC+0DgSmAjsBk4Czi87znPAD5UrX8d2AO45/hK7MJM1wUswEzX\n",
|
||||||
|
"BSzQTNcFLMBM1wUswEzXBSzATNcFLMBM1wUMNF947w1cW9veVD0233P22f7SJEmDzBfeo3bH9veE\n",
|
||||||
|
"ttHtLEka0SrgnNr2CWzbafk+4Kja9gaam02upJVxoC4uLi479HIxizANXAWsAHatXqSpw/Lsan0V\n",
|
||||||
|
"cOFi3kiSNF6HAt8lzpxPqB47plp6Tqn2rwMeOdHqJEmSJEmSWtHlXCNSKfYnOtoB7gj8uravtH6c\n",
|
||||||
|
"JcD/dV3EAuxPXLy3f7V9OfB+oim2RAcQtc4C64HLui1n8tYA3wZ+VS3fBF7QZUFDLCUuNHoj8Abg\n",
|
||||||
|
"D4iO2tKs6LqARcjy2V5UW//2kH0lWAf8btdFjOgg4DrgLcTFfX8M/HX12EEd1tVkd+KKnKuBfwM+\n",
|
||||||
|
"TQzWOA9Y1l1Zk/UC4oB/EnG15d2AJwPfAp7fYV1N9ibOAM4H3gOcBFxQPXbvDutq0uswLjH8mmT6\n",
|
||||||
|
"bC8asN603bXHAt8gzl7v1nEt8zkHWN3w+BOB/5hsKfN6L/BOtr72ZQnwD9W+ncLXgd9ueHxFta8k\n",
|
||||||
|
"HwJe3fD4q5i75L8Uy4gAvAR4Qse1jCLTZ5spvCEC5uXEWeIpRLi8l5ggriTfG7KvtGaT9cQ3xX5L\n",
|
||||||
|
"mWtSK0obZ3F3Ba5peHxjta8kq2huznkv5R1cNxFh+GjgS8APgNuqfbNEW11JMn22+xDBN0V8Y+it\n",
|
||||||
|
"w7bTQZRgOXEc/IT4RnsbUe9sl0U1uGXIvl9NrIrR/IaYv6nfZuB/J1zLSNoI718vcl8Xbh3w+Czl\n",
|
||||||
|
"HVwABxNn3x8ATqW8H9a6TJ/tG4i6pogw7H2uU0R/TUmOJep9J/Biyj4G7sPWvwjrSvuleAfiGpX+\n",
|
||||||
|
"X4JT1b7itBHeK4FLB+y7fwvvtz2WAUew9cHV+yEurZPiLOKH4WgGf74lyfTZnjlkX2mTrD2e6Oz7\n",
|
||||||
|
"SdeFjKD3SxG2DsUSfyn+CHjXgH3XTbKQUbUxVHDFPPs3tvCei3Umw89cXjihOkbxUqKTqt9uwHHA\n",
|
||||||
|
"30+2nHmdSZ7PFuBRwP2IoWzfIX5R/hUxl/2+HdbVbxVwOjGH/iXE2fflnVa0OEtpbqYo0a5Es4q0\n",
|
||||||
|
"KPcm2ovPJnrAdwNeQ0zBW1pHVTZ/Q3RYfZQYHvYuot/m1cS475J8C/h9oq5nAed2W85Q/1Vb/6e+\n",
|
||||||
|
"ff1DMkszBTwFOAP4cce1TMwtwM0DltLuBH1Sbf3P+/adOcE6RvElYC1xJngS8Q3mLOBe3ZU0VKbP\n",
|
||||||
|
"9nLmQno58EvKHVefYTRMT7ZRPBBNUicD3yeybA1xTKgwmQ6u/mkhNxHjUEuV6bPtr2dRU3BOyNVE\n",
|
||||||
|
"X8Izq6W+fUSHdTXJdAy8jRjaeA7wIiKwm0bNFWNSF3zchTiwjgKePqH33NHswtwZwBTwU+KqsJ6f\n",
|
||||||
|
"TryiHcf9gM/WtlfUtmeJq0RLcQHwh0O2PzXZcobanblO6946te2SvIRokjqNuICo+DbuNsP7DkRQ\n",
|
||||||
|
"Pwd4GnFQva/F91uMJUQgTtXWqW2XZBlxcPVM1bZniQAqSabPtv++rPVRB6UNxVvTdQELUP/F0v9L\n",
|
||||||
|
"5vzJlzPUbxF9CUcRFz7NAHciV8fqdnsa0ab5feDDxD/Yxg7rGWYj8dXomr713qLF20j+z3ZfYl6W\n",
|
||||||
|
"kqwi5jf5JfA14EHdlrNoJU+Kd0fgT4BPEJ2VH+m2nMm5DfgMW89fkeWHtWTTxAiTnlXEZfJPoLwr\n",
|
||||||
|
"VyHPHCz97gG8ghgpcTWDx/52JdNok5692Dqsj6a82foOJM6+e15A5NgZlDcnU2seDrwduIJo/H8x\n",
|
||||||
|
"cRZeovsSk2f1PJnoaX4tMbazJO9i6/uHXkO0y36R+LxLk2n2u2VEc8S5zA0V/EGXBQ1Resdf3RHA\n",
|
||||||
|
"DcRFLpuIvoNvEzP2lXbHrYuYa9p7AlHzM4lhpJ/oqqiuTAGPI9qPriOC/GWdVrStbzD3DeHhwI3A\n",
|
||||||
|
"64jmng90VdQAF7P1xDm9H9op4KuTL2demWa/u5U4y1pVe6zUb4uZRptcSlxMBHER1Ba2bvcuybra\n",
|
||||||
|
"+qnEsNymfcWYVLvTEmJejiOJM/FSXMLchE7vJJp83kiM7FgHPLSjuprUawV4KvCFan0d8LCJVzS/\n",
|
||||||
|
"XYi5OF5PXFzU6/ybJWYXLMWriY71pcDHgI8T4+qbZsfs2plsO/dGfbukK1cvAh5R274MeEhHtczn\n",
|
||||||
|
"MqLWzcTEaS9jrlP1O8CDO6proLbaJe9BNElcRQxhuyMxC9pTW3q/xar/8jqYuRss39bw3K4tJb7e\n",
|
||||||
|
"9y506gX37hQ6cQ55Zr87qVruT4w2+DTR/nk8MTH/sKlNJ23NkH2lXbC1F9EE2fs526O2PQu8u6O6\n",
|
||||||
|
"mnyUCOsbiInT/rN6fD/g510VNWkvJ3pov1b995VEiJ/E1h0CJTiZOMs6mfia3GvnvjflTZzzWuLs\n",
|
||||||
|
"9b61x1YQY1Jf30VB8ziW+Hf/M8oeWTDIQ4G/I/4OJduDGKP8ZeCHHdfSby1wYm1Z2/ff0hxE3O3n\n",
|
||||||
|
"LrXHfofy2udbczlzDf/3JebCfVR35Qy1C/F1+TVsPUXlI4ghj6U5luj8/Wm1fJ8IxxL9M/ENDGKU\n",
|
||||||
|
"zG5Dntu1/YjZ+vo9nrk225LcmThuP0McAz8n7lhT2vh5taiNM6L+dq5S22Mz602pWtpcMf1eDryJ\n",
|
||||||
|
"ueC+hRgZc2pnFTX7PNFkdknf4wcAf0tZnWwfJTqDv0C0z59P3CKvxPb5YbcPK63fI5022rzrdyWB\n",
|
||||||
|
"aCrpbZf2D3YLW7e/zhJtXl8h2jtv7KKoAV7A8LbiD0+qkBH9JTFUcDUxIgLiKtCTiW9mb+2mrEb3\n",
|
||||||
|
"ZNvgpnqstFBcSfQhrK+Wku8k339jC5ib0720fo902jjzXsPwf7DS7l/YbznxdziIuAiiFKew7QE/\n",
|
||||||
|
"RZwV7kN5X5m/R3zj6r+jzp2IUNxv4hUNdiWDm0eG7evKSqLZ5NnA9dX2Q4gbCpSmN3jhCnaijr+s\n",
|
||||||
|
"mm7imVHJFz/sAvwpMY72Xynv/pUw/Katpd3Q9Syar0F4KfH5luzRxEVF3wf+u+Na+r2E+JbQG7zQ\n",
|
||||||
|
"P4eMClOfZH1Ym1fJltL8NbprS4kfiA3EN5gHdlvOUF8hJrPvdzBw3oRrmc+9iIA5nxi+9u5q/ULK\n",
|
||||||
|
"GyE1yC7ElYEl+Q4xXBCiyezCDmvZ4bTR5l1vimnqwS/JM5lr0um5G3ExUWmXxB5H9Bd8GTiUcq8A\n",
|
||||||
|
"7Hkl8O/EHCHfIj7jRxHHRGlnYD8i2uefRDQ/zAKfI34B1S2n+6l36ydE/cfuLDF7Xyl+QzTrQPR7\n",
|
||||||
|
"lHo9QkptjzbpH3lSmjPZtsPyRmI6yM93UM8wtxFfQa9v2DdLmU0ndyImIXowUePlxAxtg+4sX7oS\n",
|
||||||
|
"juc1zIX2W4A3s3XfUkl9StcTo2N69R1JNFGVOHghnTbC+1aikwfiirX6RQ6lhkwGK/q2ez/A+xLD\n",
|
||||||
|
"8Q6bdEE7oRLCu660evqtYdtL+XtK+0WTThvNJiv7tvtDpiTvZduvnj2lnRlsrK0/khht8Kzq8U92\n",
|
||||||
|
"UM98hjXrlHjzCI3fmQ2PLQd+hkMFt1sb4b2xtl56yBxLTEjzMeYuLa5/BS3JA4nP8kji6+jHiU6q\n",
|
||||||
|
"1R3WNMxjauuzRK1HEpfyl37ncI3HicTP1nqivfscYvjoFuC5xHTGKsgDibkL1hOdJ6+k3Pm89yQu\n",
|
||||||
|
"Lz+PmEXupWw9v3dJeje52Lf2WOmdlhChvYYYefAvlHnnl1GHt5YwfPQW4OZq2VJbv5nyrri9nLmT\n",
|
||||||
|
"oZcRfUlLiG/n/9NRTRoia8jsQ5wV/hB4Xse1NPkjYszxRuJeoAdT7u3lICb5OpaYXvMMyrvQpW7U\n",
|
||||||
|
"bwJ3b7WKHU/9l92niOOhaZ8KkS1kIIawvYO44cEZlHl22LMb8ZXzc8R9DE+jvKl2Ie6cspGY9Kt3\n",
|
||||||
|
"o4BSbxpgkLTjQmJ2xr2IIZb1fo7vdlLRDqTNqTp3I8bzPocYP/thYm7kLwz7nybsrcQojfXEEKZz\n",
|
||||||
|
"yXWn6OXEjVKPIm7hVpIzq/8O6jso6aYBm4gLcwZ1XJc073Qmq4gRJXsB72FuPpunE1cIP6ejunYI\n",
|
||||||
|
"k5pnudSQuY1o0vlVwz6HNe48riO+JQ7ylkkVIo0q4yT547Sib9ux0+PTNAtifTa5kmZBLH28dFb9\n",
|
||||||
|
"x0D/SK6SjoF02roNWhYba+ulD2vM5jEMnwWxpB/cnf0kpi2ZjgElk2lYY2alz4K4fP6naDuVfgwo\n",
|
||||||
|
"mazDGrPIMgvizQOWWyn7ZgcZZDkGlEzGYY1ZHEfckOE0yrsbzXx2I/o8riHmytbiZD4GimdbX8gw\n",
|
||||||
|
"rDGbjLMg7gG8muho+wgxRLCkW+Flk/EYSMPw3lapwxqzWdG3XfJInr2A1xFzr3yQuM/mLzqtaMew\n",
|
||||||
|
"Yp79GydQww7L8NYkNI3kKekuS78kbjz9QeZuSl0f1uZFOirOzj5UUO3JNAviO2rru3VWxY6n94uw\n",
|
||||||
|
"ySywbIK1SBqRI3mkFu3SdQHaYR1BDLW7gLmRPCU30x1G1HpjtZxPzMEhSTulDLMgvhT4JtFBvXu1\n",
|
||||||
|
"PBn4BnBMh3VJUhGWE5Py99+VvWvraZ6r++7ExSWSpAKtX+Q+qTO2eUtx+7CHNzz+MOIyeak4S7ou\n",
|
||||||
|
"QCrABmIo4x7AXYH9gaOBtxO37nKyMhWn5N5/aZLuBbyCuVvgXQ6cCvyos4okSSO7R7VIkgo3Rczr\n",
|
||||||
|
"fgPws2q5ATgRv51KUrFeC3yRractvR8xq+RrO6lIkjSvi4mZBfvtVe2TiuNQQSkmaGuac/p6nLxN\n",
|
||||||
|
"hTK8Jdi8yH1SZ+yMkeI+lb8asO9OePYtSZIkSZIkSZIkSdKc/wcYrs3L+RfeGwAAAABJRU5ErkJg\n",
|
||||||
|
"gg==\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<matplotlib.figure.Figure at 0x7fdb05bc9910>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from nltk.tokenize import word_tokenize\n",
|
||||||
|
"\n",
|
||||||
|
"ex = \"\"\" The father of a woman who died after a savage gang-rape in Delhi said he thought everyone should watch a documentary about the attack broadcast by the BBC but banned in India \"If a man can speak like that in jail, imagine what he would say if he was walking free,\" said the father of the victim \"\"\"\n",
|
||||||
|
"# ex = \"The bombing killed 20 people and wounded more that 30.\"\n",
|
||||||
|
"def get_emotions(text):\n",
|
||||||
|
" pos = nltk.pos_tag(word_tokenize(text))\n",
|
||||||
|
" simply = [(i,simplify(j)) for i,j in pos]\n",
|
||||||
|
"# print header\n",
|
||||||
|
" total = None\n",
|
||||||
|
" numwords = 0\n",
|
||||||
|
" for (i,j) in simply:\n",
|
||||||
|
" key = u\"{}#{}\".format(i,j)\n",
|
||||||
|
" if key in lexdict:\n",
|
||||||
|
" numwords += 1\n",
|
||||||
|
" total = map(sum, zip(total, lexdict[key])) if total else lexdict[key]\n",
|
||||||
|
"# print \"{}\\n- {}\".format(key, lexdict[key])\n",
|
||||||
|
"# print(\"The total is\")\n",
|
||||||
|
" minemotion = min(total)\n",
|
||||||
|
" maxemotion = max(total)-minemotion\n",
|
||||||
|
" total = map(lambda x: (x-minemotion)/maxemotion, total)\n",
|
||||||
|
"# maxemotion = max(total)\n",
|
||||||
|
"# total = map(lambda x: (x)/maxemotion, total)\n",
|
||||||
|
"# print(total)\n",
|
||||||
|
" return total\n",
|
||||||
|
"total = get_emotions(ex)\n",
|
||||||
|
"ind = np.arange(len(total))\n",
|
||||||
|
"width = 0.5\n",
|
||||||
|
"fig, ax = plt.subplots(1, 1)\n",
|
||||||
|
"ax.bar(ind, total, width=width)\n",
|
||||||
|
"ax.set_xticks(ind+width/2)\n",
|
||||||
|
"ax.set_xticklabels(header[1:], rotation=\"vertical\")\n",
|
||||||
|
"None"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 434,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AFRAID 0.112249\n",
|
||||||
|
"AMUSED 0.123500\n",
|
||||||
|
"ANGRY 0.138816\n",
|
||||||
|
"ANNOYED 0.116850\n",
|
||||||
|
"DONT_CARE 0.145988\n",
|
||||||
|
"HAPPY 0.116959\n",
|
||||||
|
"INSPIRED 0.112220\n",
|
||||||
|
"SAD 0.133418\n",
|
||||||
|
"dtype: float64"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 434,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"ex = \"\"\" The father of a woman who died after a savage gang-rape in Delhi said he thought everyone should watch a documentary about the attack broadcast by the BBC but banned in India \"If a man can speak like that in jail, imagine what he would say if he was walking free,\" said the father of the victim \"\"\"\n",
|
||||||
|
"def get_emotions_pd(text):\n",
|
||||||
|
" df = pd.DataFrame(columns=header[1:])\n",
|
||||||
|
" pos = nltk.pos_tag(word_tokenize(text))\n",
|
||||||
|
" simply = [(i,simplify(j)) for i,j in pos]\n",
|
||||||
|
" for (i,j) in simply:\n",
|
||||||
|
" key = u\"{}#{}\".format(i,j)\n",
|
||||||
|
" if key in lexdict:\n",
|
||||||
|
" df.loc[key] = lexdict[key]\n",
|
||||||
|
" return df\n",
|
||||||
|
"\n",
|
||||||
|
"df = get_emotions_pd(ex)\n",
|
||||||
|
"df.mean()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 2",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python2"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 2
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython2",
|
||||||
|
"version": "2.7.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0
|
||||||
|
}
|
Loading…
Reference in New Issue