You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

596 lines
22 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download DepecheMood from GitHub"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "invalid syntax (<ipython-input-15-537542d5c5d8>, line 1)",
"output_type": "error",
"traceback": [
"\u001b[1;36m File \u001b[1;32m\"<ipython-input-15-537542d5c5d8>\"\u001b[1;36m, line \u001b[1;32m1\u001b[0m\n\u001b[1;33m wget https://github.com/marcoguerini/DepecheMood/releases/download/v1.0/DepecheMood_V1.0.zip -O latest-DepecheMood.zip\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
]
}
],
"source": [
"!wget https://github.com/marcoguerini/DepecheMood/releases/download/v1.0/DepecheMood_V1.0.zip -O latest-DepecheMood.zip\n",
"!yes 2>/dev/null | unzip latest-DepecheMood.zip"
]
},
{
"cell_type": "code",
"execution_count": 283,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DepecheMood_freq.txt DepecheMood_normfreq.txt DepecheMood_tfidf.tsv DepecheMood_tfidf.txt README.txt\r\n"
]
}
],
"source": [
"ls DepecheMood_V1.0/"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"This package contains the three versions of the Lexicon 'DepecheMood' as described in the paper: \r\n",
"\r\n",
"Staiano, J., & Guerini, M. (2014). \"DepecheMood: a Lexicon for Emotion Analysis from Crowd-Annotated News\". Proceedings of ACL-2014. \r\n",
"\r\n",
"Each version of DepecheMood is built starting from word-by-document matrices either using raw frequencies (DepecheMood_freq.txt), normalized frequencies (DepecheMood_normfreq.txt) or tf-idf (DepecheMood_tfidf.txt). \r\n",
"\r\n",
"The files are tab-separated; each row contains one Lemma#PoS followed by the scores for the following emotions: AFRAID, AMUSED, ANGRY, ANNOYED, DONT_CARE, HAPPY, INSPIRED, SAD.\r\n",
"\r\n",
"This resource is freely available for research purposes. \r\n",
"\r\n"
]
}
],
"source": [
"cat DepecheMood_V1.0/README.txt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Lemma#PoS\r\n",
"50#n\r\n",
"500#n\r\n",
"50th#a\r\n",
"55th#a\r\n",
"5th#a\r\n",
"6#n\r\n",
"60#n\r\n",
"60th#a\r\n",
"64th#a\r\n",
"65th#a\r\n",
"6th#a\r\n",
"7#n\r\n",
"70#n\r\n",
"70th#a\r\n",
"75th#a\r\n",
"78#n\r\n",
"7th#a\r\n",
"8#n\r\n",
"80#n\r\n",
"80th#a\r\n",
"85th#a\r\n"
]
}
],
"source": [
"!sed -n '1p;100,120p' DepecheMood_V1.0/DepecheMood_tfidf.txt | cut -f 1"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import nltk\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 398,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"lexicon = pd.DataFrame.from_csv(\"DepecheMood_V1.0/DepecheMood_normfreq.txt\", sep=\"\\t\", index_col=False)\n",
"s = lexicon[\"Lemma#PoS\"].str.split(\"#\")\n",
"lexicon[\"Lemma\"] = s.str[0]\n",
"lexicon[\"PoS\"] = s.str[1]"
]
},
{
"cell_type": "code",
"execution_count": 399,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 399,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s=lexicon.index.to_series()\n",
"s.str.contains(\"#\", False).count() # All words contain # (POS tag)"
]
},
{
"cell_type": "code",
"execution_count": 400,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array(['n', 'a', 'r', 'v'], dtype=object)"
]
},
"execution_count": 400,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lexicon[\"PoS\"].unique() # According to the paper, there's verbs (v), nouns (n), adjectives (a) and adverbs (r)"
]
},
{
"cell_type": "code",
"execution_count": 435,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#lexicon[lexicon.groupby(\"Lemma\")[\"PoS\"].transform(len) > 1] # Getting lemmas with different PoS"
]
},
{
"cell_type": "code",
"execution_count": 436,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"lexdict = {}\n",
"header = []\n",
"with open(\"DepecheMood_V1.0/DepecheMood_tfidf.tsv\") as f:\n",
" header = f.next().strip().split()\n",
" for l in f:\n",
" temp = l.strip().split(\"\\t\")\n",
" lexdict[temp[0]] = map(float, temp[1:])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compare performance"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Pandas is great for analysis, but for sentiment analysis we need something that is quick at finding entries in the lexicon and calculating the values of each emotion. Hence, we need to compare pandas with a plain dictionary for this task."
]
},
{
"cell_type": "code",
"execution_count": 438,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import random\n",
"keys = lexdict.keys()\n",
"random.shuffle(keys)"
]
},
{
"cell_type": "code",
"execution_count": 439,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 loops, best of 3: 16.9 ms per loop\n",
"10 loops, best of 3: 35.9 ms per loop\n"
]
}
],
"source": [
"def search_all(d):\n",
" for i in keys:\n",
" if i in d:\n",
" ja = d[i][0] + d[i][1]\n",
" \n",
"%timeit search_all(lexdict)\n",
"%timeit search_all(lexicon)"
]
},
{
"cell_type": "code",
"execution_count": 440,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 loops, best of 3: 34.6 ms per loop\n",
"10 loops, best of 3: 39 ms per loop\n"
]
}
],
"source": [
"def search_all(d):\n",
" for i in keys:\n",
" if i in d:\n",
" ja = 0\n",
" for j in d[i]:\n",
" ja += j\n",
" \n",
"%timeit search_all(lexdict)\n",
"%timeit search_all(lexicon.ix[:, lexicon.columns - [\"Lemma#PoS\", \"Lemma\", \"PoS\"]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Apparently, it is slightly better to use a python dictionary. Plus, we avoid dependencies."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Sentiment Analysis"
]
},
{
"cell_type": "code",
"execution_count": 441,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"mapping = {\n",
" \n",
"#CC - Coordinating conjunction\n",
"\"CD\": \"n\", # - Cardinal number\n",
"#DT - Determiner\n",
"#EX - Existential there\n",
"\"FW\": \"n\", # - Foreign word\n",
"#IN #- Preposition or subordinating conjunction\n",
"\"JJ\": \"a\", #- Adjective\n",
"\"JJR\": \"a\", # - Adjective, comparative\n",
"\"JJS\": \"a\", # - Adjective, superlative\n",
"#LS - List item marker\n",
"#MD - Modal\n",
"\"NN\": \"n\", # - Noun, singular or mass\n",
"\"NNS\": \"n\", # - Noun, plural\n",
"\"NNP\": \"n\", # - Proper noun, singular\n",
"\"NNPS\": \"n\", # - Proper noun, plural\n",
"#PDT - Predeterminer\n",
"#POS - Possessive ending\n",
"\"PRP\": \"n\", # - Personal pronoun\n",
"#PRP$ - Possessive pronoun (prolog version PRP-S)\n",
"\"RB\": \"r\", # - Adverb\n",
"\"RBR\": \"r\", # - Adverb, comparative\n",
"\"RBS\": \"r\", #- Adverb, superlative\n",
"#RP - Particle\n",
"#SYM - Symbol\n",
"#TO - to\n",
"#\"UH\": \"n\", - Interjection\n",
"\"VB\": \"v\", # - Verb, base form\n",
"\"VBD\": \"v\", # - Verb, past tense\n",
"\"VBG\": \"v\", #- Verb, gerund or present participle\n",
"\"VBN\": \"v\", #- Verb, past participle\n",
"\"VBP\": \"v\", #- Verb, non-3rd person singular present\n",
"\"VBZ\": \"v\", #- Verb, 3rd person singular present\n",
"#\"WDT\": \"n\" # - Wh-determiner\n",
"#\"WP: - Wh-pronoun\n",
"#WP$ - Possessive wh-pronoun (prolog version WP-S)\n",
"\"WRB\": \"r\", # - Wh-adverb\n",
" }\n",
"def simplify(tag):\n",
" return mapping.get(tag, \"n\")\n",
"\n",
"#Alternative:\n",
"#from nltk.tag.simplify import simplify_wsj_tag"
]
},
{
"cell_type": "code",
"execution_count": 442,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"data": {
"image/png": [
"iVBORw0KGgoAAAANSUhEUgAAAW8AAAExCAYAAACzuCOTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\n",
"AAALEgAACxIB0t1+/AAAF+hJREFUeJzt3Xu4HHV9x/H3ISd4w4AR1ApiqlKJF7xjqFajWAVspcUq\n",
"iFXjFapovSN9Wom1rbXeeBAe5FEs2lapt1ovFLxxoLWiVSGAJMotj0RRAS+AYk3K6R/fWc5kM7tn\n",
"z8nOzu+bvF/PM2FmZ9n9ZjPnc2Z/v9/8BiRJkiRJkiRJkiRJkiRJ0k7ug8CPgUuHPOdk4ApgHfCI\n",
"SRQlSRru94hAHhTehwFnV+uPBS6cRFGSpPmtYHB4vw84sra9Abhn2wVJ0s5slzG8xt7AtbXtTcA+\n",
"Y3hdSdIA02N6nam+7dmG51wJ3H9M7ydJO4t1wMMX+z+vYHizyVG17UHNJk2BXqq1XRewAGu7LmCB\n",
"1nb8/rMwO+Jy4gKe2/nxvbbj91+ItV0XsABruy6AAcfWOJpNPgM8v1pfBfycGJ0iSWrJKM0mHwWe\n",
"COxJtG2fCCyt9p1OjDQ5jGgW+SXwwvGXKUnqStdfKxdiddcFLMDqrgtYoNUdv/8CmkLOy9Rssrrj\n",
"91+I1V0XsACruy6AAcdWf0dj2wVM8v2kJrPt5OzU7X9IY9aYneNo85YkTZjhLUkJGd6SlJDhLUkJ\n",
"Gd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6S\n",
"lJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDh\n",
"LUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlNAo4X0IsAG4Aji+Yf+ewDnAxcBlwJpxFSdJWpwlwJXA\n",
"CmApEdAr+56zFnhbtb4ncCMw3fBas61UKC3MLMy2sHh8qzWNx9Z8Z94HEuG9EdgMnAUc3vec64Bl\n",
"1foyIry3LLZKSdL8ms6Q6/YGrq1tbwIe2/ec9wNfAX4I3BV49tiqkyQ1mi+8R/kq+BdEc8pq4P7A\n",
"F4GHATc3PHdtbX2mWiRJc1ZXy3ZZRXRG9pzAtp2WZwOPq21/GXh0w2vZJqgS2OatbBbV5v1NYD+i\n",
"w3JX4EjgM33P2QA8pVq/J/BA4OrFVilJGo9Dge8SHZcnVI8dUy0QI0w+C6wDLgWOHvA6npmoBJ55\n",
"K5vGY2tqwgVM8v2kJrPt5OzU7X9IY9aYnV5hKUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJ\n",
"Gd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6S\n",
"lJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDh\n",
"LUkJGd6SlJDhLUkJjRLehwAbgCuA4wc8ZzVwEXAZMDOOwiRJi7cEuBJYASwFLgZW9j1nD+A7wD7V\n",
"9p4DXmu2hfqkhZqF2RYWj2+1pvHYmu/M+0AivDcCm4GzgMP7nnM08ElgU7V9w6JLlCSNZL7w3hu4\n",
"tra9qXqsbj9gOXAe8E3geWOrTpLUaHqe/aN8FVwKPBI4GLgz8DXgQqKNXJLUgvnC+wfAfWrb92Gu\n",
"eaTnWqKp5NZquQB4GM3hvba2PoOdm5LUb3W1bJdp4Cqiw3JXmjss9we+RHRu3hm4FHhQw2vZoaMS\n",
"2GGpbBZ9bB0KfJfouDyheuyYaul5PTHi5FLgVeMuQBojw1vZNB5bUxMuYJLvJzWZbSdnp27/Qxqz\n",
"xuz0CktJSsjwlqSEDG9JSsjwlqSEDG9JSsjwlqSEDG9JSsjwlqSEDG9JSsjwlqSEDG9JSsjwlqSE\n",
"DG9JSsjwlqSEDG9JSsjwlqSEDG9JSsjwlqSEDG+NwfRNxK2axrhM3zTZv4OUi/ew1Di0cF/I1u4J\n",
"6T0slY33sJSkHYXhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDh\n",
"LUkJGd6SlJDhLUkJGd6SlJDhLUkJjRLehwAbgCuA44c87zHAFuCIMdQlSdoOS4ArgRXAUuBiYOWA\n",
"530F+BzwzAGv1cbtS1SGWZgd89La8dJCra3WKzUeW/OdeR9IhPdGYDNwFnB4w/NeCXwCuH7x9UmS\n",
"RjVfeO8NXFvb3lQ91v+cw4HTqm3PQCSpZdPz7B8liE8C3sTcTTKH3YR1bW19plomZPom2HLXFl73\n",
"ZtiybPyvK2kntbpatssq4Jza9gls22l5NXBNtdwM/Bh4RsNrdX1GbltnezJ9rh4HymZRx9Y0cBXR\n",
"Ybkrgzsse/6RwaNNuj64/aFtT6bP1eNA2TQeW/M1m2wBjgPOJUaUnAGsB46p9p8+ruokSaMb1j49\n",
"brMTfr+G92/j5Gjq9j92Yi18tq19rh4HyqYxO73CUpISMrwlKSHDW5ISMrwlKSHDW5ISMrwlKSHD\n",
"W5ISMrwlKSHDW5ISMrwlKSHDW5ISMrwlKSHDW5ISMrwlKSHDW5ISMrwlKSHDW5ISMrwlKSHDW5IS\n",
"MryLNX0Tce+6MS7TN0327yCpLd6AeLtlulGutZLvOJC8AbEk7SgMb0lKyPCWpIQMb0lKyPCWpIQM\n",
"b0lKyPCWpIQMb0lKyPCWpIQMb0lKyPCWpIQMb0lKyPCWpIQMb0lKyPCWpIRGDe9DgA3AFcDxDfuf\n",
"C6wDLgG+ChwwluokSYu2BLgSWAEsBS4GVvY95yBg92r9EODChtdpYwb8hZiF2RaW1v5e1pqm1lbr\n",
"lRqPrVHOvA8kwnsjsBk4Czi87zlfA35RrX8d2GdRJUqSRjJKeO8NXFvb3lQ9NsiLgbO3pyhJ0nDT\n",
"IzxnIV8HnwS8CHjcgP1ra+sz1SJJmrO6WrbbKuCc2vYJNHdaHkA0rzxgwOt03SaYra3TWtPUapu3\n",
"WrXoY2sauIrosNyV5g7LfYngXtVGAWOS7YfWWtPUanirVY3H1ijNJluA44BziZEnZwDrgWOq/acD\n",
"bwbuBpxWPbaZ6OiUJLVgaoLvNTvh92t4/zZOjqZu/2PMWqjXWsl3HEiN2ekVlpKUkOEtSQkZ3pKU\n",
"kOEtFWv6JqK9c8zL9E0T/WuoFXZYbrdMHWvWSq7jIFOtao8dlpK0ozC8JSkhw1uSEjK8JSkhw1uS\n",
"EjK8JSkhw1uSEjK8JSkhw1uSEjK8JSkhw1uSEjK8JSkhw1uSEjK8JSkhw1vSTqiNudInO0+683lv\n",
"t0zzTlsruY6DTLVmk+yYdT5vSdohGN6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJGd6SlJDhLUkJ\n",
"Gd6SlJDhLUkJGd6SxqCNiZ4mP9lTJk5Mtd0yTaBkreQ6Dqw11XHgxFSSpHmMEt6HABuAK4DjBzzn\n",
"5Gr/OuAR4ylNkjTIfOG9BDiFCPAHAc8BVvY95zDgAcB+wMuA08ZcYwdmui5gAWa6LmCBZrouYAFm\n",
"ui5gAWa6LmABZrouYAFmui5goPnC+0DgSmAjsBk4Czi87znPAD5UrX8d2AO45/hK7MJM1wUswEzX\n",
"BSzQTNcFLMBM1wUswEzXBSzATNcFLMBM1wUMNF947w1cW9veVD0233P22f7SJEmDzBfeo3bH9veE\n",
"ttHtLEka0SrgnNr2CWzbafk+4Kja9gaam02upJVxoC4uLi479HIxizANXAWsAHatXqSpw/Lsan0V\n",
"cOFi3kiSNF6HAt8lzpxPqB47plp6Tqn2rwMeOdHqJEmSJEmSWtHlXCNSKfYnOtoB7gj8uravtH6c\n",
"JcD/dV3EAuxPXLy3f7V9OfB+oim2RAcQtc4C64HLui1n8tYA3wZ+VS3fBF7QZUFDLCUuNHoj8Abg\n",
"D4iO2tKs6LqARcjy2V5UW//2kH0lWAf8btdFjOgg4DrgLcTFfX8M/HX12EEd1tVkd+KKnKuBfwM+\n",
"TQzWOA9Y1l1Zk/UC4oB/EnG15d2AJwPfAp7fYV1N9ibOAM4H3gOcBFxQPXbvDutq0uswLjH8mmT6\n",
"bC8asN603bXHAt8gzl7v1nEt8zkHWN3w+BOB/5hsKfN6L/BOtr72ZQnwD9W+ncLXgd9ueHxFta8k\n",
"HwJe3fD4q5i75L8Uy4gAvAR4Qse1jCLTZ5spvCEC5uXEWeIpRLi8l5ggriTfG7KvtGaT9cQ3xX5L\n",
"mWtSK0obZ3F3Ba5peHxjta8kq2huznkv5R1cNxFh+GjgS8APgNuqfbNEW11JMn22+xDBN0V8Y+it\n",
"w7bTQZRgOXEc/IT4RnsbUe9sl0U1uGXIvl9NrIrR/IaYv6nfZuB/J1zLSNoI718vcl8Xbh3w+Czl\n",
"HVwABxNn3x8ATqW8H9a6TJ/tG4i6pogw7H2uU0R/TUmOJep9J/Biyj4G7sPWvwjrSvuleAfiGpX+\n",
"X4JT1b7itBHeK4FLB+y7fwvvtz2WAUew9cHV+yEurZPiLOKH4WgGf74lyfTZnjlkX2mTrD2e6Oz7\n",
"SdeFjKD3SxG2DsUSfyn+CHjXgH3XTbKQUbUxVHDFPPs3tvCei3Umw89cXjihOkbxUqKTqt9uwHHA\n",
"30+2nHmdSZ7PFuBRwP2IoWzfIX5R/hUxl/2+HdbVbxVwOjGH/iXE2fflnVa0OEtpbqYo0a5Es4q0\n",
"KPcm2ovPJnrAdwNeQ0zBW1pHVTZ/Q3RYfZQYHvYuot/m1cS475J8C/h9oq5nAed2W85Q/1Vb/6e+\n",
"ff1DMkszBTwFOAP4cce1TMwtwM0DltLuBH1Sbf3P+/adOcE6RvElYC1xJngS8Q3mLOBe3ZU0VKbP\n",
"9nLmQno58EvKHVefYTRMT7ZRPBBNUicD3yeybA1xTKgwmQ6u/mkhNxHjUEuV6bPtr2dRU3BOyNVE\n",
"X8Izq6W+fUSHdTXJdAy8jRjaeA7wIiKwm0bNFWNSF3zchTiwjgKePqH33NHswtwZwBTwU+KqsJ6f\n",
"TryiHcf9gM/WtlfUtmeJq0RLcQHwh0O2PzXZcobanblO6946te2SvIRokjqNuICo+DbuNsP7DkRQ\n",
"Pwd4GnFQva/F91uMJUQgTtXWqW2XZBlxcPVM1bZniQAqSabPtv++rPVRB6UNxVvTdQELUP/F0v9L\n",
"5vzJlzPUbxF9CUcRFz7NAHciV8fqdnsa0ab5feDDxD/Yxg7rGWYj8dXomr713qLF20j+z3ZfYl6W\n",
"kqwi5jf5JfA14EHdlrNoJU+Kd0fgT4BPEJ2VH+m2nMm5DfgMW89fkeWHtWTTxAiTnlXEZfJPoLwr\n",
"VyHPHCz97gG8ghgpcTWDx/52JdNok5692Dqsj6a82foOJM6+e15A5NgZlDcnU2seDrwduIJo/H8x\n",
"cRZeovsSk2f1PJnoaX4tMbazJO9i6/uHXkO0y36R+LxLk2n2u2VEc8S5zA0V/EGXBQ1Resdf3RHA\n",
"DcRFLpuIvoNvEzP2lXbHrYuYa9p7AlHzM4lhpJ/oqqiuTAGPI9qPriOC/GWdVrStbzD3DeHhwI3A\n",
"64jmng90VdQAF7P1xDm9H9op4KuTL2demWa/u5U4y1pVe6zUb4uZRptcSlxMBHER1Ba2bvcuybra\n",
"+qnEsNymfcWYVLvTEmJejiOJM/FSXMLchE7vJJp83kiM7FgHPLSjuprUawV4KvCFan0d8LCJVzS/\n",
"XYi5OF5PXFzU6/ybJWYXLMWriY71pcDHgI8T4+qbZsfs2plsO/dGfbukK1cvAh5R274MeEhHtczn\n",
"MqLWzcTEaS9jrlP1O8CDO6proLbaJe9BNElcRQxhuyMxC9pTW3q/xar/8jqYuRss39bw3K4tJb7e\n",
"9y506gX37hQ6cQ55Zr87qVruT4w2+DTR/nk8MTH/sKlNJ23NkH2lXbC1F9EE2fs526O2PQu8u6O6\n",
"mnyUCOsbiInT/rN6fD/g510VNWkvJ3pov1b995VEiJ/E1h0CJTiZOMs6mfia3GvnvjflTZzzWuLs\n",
"9b61x1YQY1Jf30VB8ziW+Hf/M8oeWTDIQ4G/I/4OJduDGKP8ZeCHHdfSby1wYm1Z2/ff0hxE3O3n\n",
"LrXHfofy2udbczlzDf/3JebCfVR35Qy1C/F1+TVsPUXlI4ghj6U5luj8/Wm1fJ8IxxL9M/ENDGKU\n",
"zG5Dntu1/YjZ+vo9nrk225LcmThuP0McAz8n7lhT2vh5taiNM6L+dq5S22Mz602pWtpcMf1eDryJ\n",
"ueC+hRgZc2pnFTX7PNFkdknf4wcAf0tZnWwfJTqDv0C0z59P3CKvxPb5YbcPK63fI5022rzrdyWB\n",
"aCrpbZf2D3YLW7e/zhJtXl8h2jtv7KKoAV7A8LbiD0+qkBH9JTFUcDUxIgLiKtCTiW9mb+2mrEb3\n",
"ZNvgpnqstFBcSfQhrK+Wku8k339jC5ib0720fo902jjzXsPwf7DS7l/YbznxdziIuAiiFKew7QE/\n",
"RZwV7kN5X5m/R3zj6r+jzp2IUNxv4hUNdiWDm0eG7evKSqLZ5NnA9dX2Q4gbCpSmN3jhCnaijr+s\n",
"mm7imVHJFz/sAvwpMY72Xynv/pUw/Katpd3Q9Syar0F4KfH5luzRxEVF3wf+u+Na+r2E+JbQG7zQ\n",
"P4eMClOfZH1Ym1fJltL8NbprS4kfiA3EN5gHdlvOUF8hJrPvdzBw3oRrmc+9iIA5nxi+9u5q/ULK\n",
"GyE1yC7ElYEl+Q4xXBCiyezCDmvZ4bTR5l1vimnqwS/JM5lr0um5G3ExUWmXxB5H9Bd8GTiUcq8A\n",
"7Hkl8O/EHCHfIj7jRxHHRGlnYD8i2uefRDQ/zAKfI34B1S2n+6l36ydE/cfuLDF7Xyl+QzTrQPR7\n",
"lHo9QkptjzbpH3lSmjPZtsPyRmI6yM93UM8wtxFfQa9v2DdLmU0ndyImIXowUePlxAxtg+4sX7oS\n",
"juc1zIX2W4A3s3XfUkl9StcTo2N69R1JNFGVOHghnTbC+1aikwfiirX6RQ6lhkwGK/q2ez/A+xLD\n",
"8Q6bdEE7oRLCu660evqtYdtL+XtK+0WTThvNJiv7tvtDpiTvZduvnj2lnRlsrK0/khht8Kzq8U92\n",
"UM98hjXrlHjzCI3fmQ2PLQd+hkMFt1sb4b2xtl56yBxLTEjzMeYuLa5/BS3JA4nP8kji6+jHiU6q\n",
"1R3WNMxjauuzRK1HEpfyl37ncI3HicTP1nqivfscYvjoFuC5xHTGKsgDibkL1hOdJ6+k3Pm89yQu\n",
"Lz+PmEXupWw9v3dJeje52Lf2WOmdlhChvYYYefAvlHnnl1GHt5YwfPQW4OZq2VJbv5nyrri9nLmT\n",
"oZcRfUlLiG/n/9NRTRoia8jsQ5wV/hB4Xse1NPkjYszxRuJeoAdT7u3lICb5OpaYXvMMyrvQpW7U\n",
"bwJ3b7WKHU/9l92niOOhaZ8KkS1kIIawvYO44cEZlHl22LMb8ZXzc8R9DE+jvKl2Ie6cspGY9Kt3\n",
"o4BSbxpgkLTjQmJ2xr2IIZb1fo7vdlLRDqTNqTp3I8bzPocYP/thYm7kLwz7nybsrcQojfXEEKZz\n",
"yXWn6OXEjVKPIm7hVpIzq/8O6jso6aYBm4gLcwZ1XJc073Qmq4gRJXsB72FuPpunE1cIP6ejunYI\n",
"k5pnudSQuY1o0vlVwz6HNe48riO+JQ7ylkkVIo0q4yT547Sib9ux0+PTNAtifTa5kmZBLH28dFb9\n",
"x0D/SK6SjoF02roNWhYba+ulD2vM5jEMnwWxpB/cnf0kpi2ZjgElk2lYY2alz4K4fP6naDuVfgwo\n",
"mazDGrPIMgvizQOWWyn7ZgcZZDkGlEzGYY1ZHEfckOE0yrsbzXx2I/o8riHmytbiZD4GimdbX8gw\n",
"rDGbjLMg7gG8muho+wgxRLCkW+Flk/EYSMPw3lapwxqzWdG3XfJInr2A1xFzr3yQuM/mLzqtaMew\n",
"Yp79GydQww7L8NYkNI3kKekuS78kbjz9QeZuSl0f1uZFOirOzj5UUO3JNAviO2rru3VWxY6n94uw\n",
"ySywbIK1SBqRI3mkFu3SdQHaYR1BDLW7gLmRPCU30x1G1HpjtZxPzMEhSTulDLMgvhT4JtFBvXu1\n",
"PBn4BnBMh3VJUhGWE5Py99+VvWvraZ6r++7ExSWSpAKtX+Q+qTO2eUtx+7CHNzz+MOIyeak4S7ou\n",
"QCrABmIo4x7AXYH9gaOBtxO37nKyMhWn5N5/aZLuBbyCuVvgXQ6cCvyos4okSSO7R7VIkgo3Rczr\n",
"fgPws2q5ATgRv51KUrFeC3yRractvR8xq+RrO6lIkjSvi4mZBfvtVe2TiuNQQSkmaGuac/p6nLxN\n",
"hTK8Jdi8yH1SZ+yMkeI+lb8asO9OePYtSZIkSZIkSZIkSdKc/wcYrs3L+RfeGwAAAABJRU5ErkJg\n",
"gg==\n"
],
"text/plain": [
"<matplotlib.figure.Figure at 0x7fdb05bc9910>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from nltk.tokenize import word_tokenize\n",
"\n",
"ex = \"\"\" The father of a woman who died after a savage gang-rape in Delhi said he thought everyone should watch a documentary about the attack broadcast by the BBC but banned in India \"If a man can speak like that in jail, imagine what he would say if he was walking free,\" said the father of the victim \"\"\"\n",
"# ex = \"The bombing killed 20 people and wounded more that 30.\"\n",
"def get_emotions(text):\n",
" pos = nltk.pos_tag(word_tokenize(text))\n",
" simply = [(i,simplify(j)) for i,j in pos]\n",
"# print header\n",
" total = None\n",
" numwords = 0\n",
" for (i,j) in simply:\n",
" key = u\"{}#{}\".format(i,j)\n",
" if key in lexdict:\n",
" numwords += 1\n",
" total = map(sum, zip(total, lexdict[key])) if total else lexdict[key]\n",
"# print \"{}\\n- {}\".format(key, lexdict[key])\n",
"# print(\"The total is\")\n",
" minemotion = min(total)\n",
" maxemotion = max(total)-minemotion\n",
" total = map(lambda x: (x-minemotion)/maxemotion, total)\n",
"# maxemotion = max(total)\n",
"# total = map(lambda x: (x)/maxemotion, total)\n",
"# print(total)\n",
" return total\n",
"total = get_emotions(ex)\n",
"ind = np.arange(len(total))\n",
"width = 0.5\n",
"fig, ax = plt.subplots(1, 1)\n",
"ax.bar(ind, total, width=width)\n",
"ax.set_xticks(ind+width/2)\n",
"ax.set_xticklabels(header[1:], rotation=\"vertical\")\n",
"None"
]
},
{
"cell_type": "code",
"execution_count": 434,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"AFRAID 0.112249\n",
"AMUSED 0.123500\n",
"ANGRY 0.138816\n",
"ANNOYED 0.116850\n",
"DONT_CARE 0.145988\n",
"HAPPY 0.116959\n",
"INSPIRED 0.112220\n",
"SAD 0.133418\n",
"dtype: float64"
]
},
"execution_count": 434,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ex = \"\"\" The father of a woman who died after a savage gang-rape in Delhi said he thought everyone should watch a documentary about the attack broadcast by the BBC but banned in India \"If a man can speak like that in jail, imagine what he would say if he was walking free,\" said the father of the victim \"\"\"\n",
"def get_emotions_pd(text):\n",
" df = pd.DataFrame(columns=header[1:])\n",
" pos = nltk.pos_tag(word_tokenize(text))\n",
" simply = [(i,simplify(j)) for i,j in pos]\n",
" for (i,j) in simply:\n",
" key = u\"{}#{}\".format(i,j)\n",
" if key in lexdict:\n",
" df.loc[key] = lexdict[key]\n",
" return df\n",
"\n",
"df = get_emotions_pd(ex)\n",
"df.mean()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}