1
0
mirror of https://github.com/gsi-upm/senpy synced 2024-09-21 06:01:43 +00:00
senpy/emotion-depechemood/depechemood_plugin.py

182 lines
6.7 KiB
Python
Raw Normal View History

2019-01-09 16:19:22 +00:00
#!/usr/local/bin/python
# coding: utf-8
2019-04-04 10:56:46 +00:00
from future import standard_library
standard_library.install_aliases()
2019-01-09 16:19:22 +00:00
import os
import re
2019-01-09 18:29:24 +00:00
import sys
2019-01-09 16:19:22 +00:00
import string
import numpy as np
from six.moves import urllib
from nltk.corpus import stopwords
2019-04-04 10:56:46 +00:00
from senpy import EmotionBox, models
2019-01-09 16:19:22 +00:00
2019-01-09 18:29:24 +00:00
def ignore(dchars):
    '''
    Build a function that removes a fixed set of characters from a string.

    :param dchars: iterable of strings; every character contained in any of
        them will be deleted by the returned function.
    :return: a one-argument callable mapping a string to the same string
        with all of those characters removed.
    '''
    # Mapping a character to None in a translation table deletes it.
    table = str.maketrans(dict.fromkeys("".join(dchars)))

    def strip_chars(text):
        return text.translate(table)

    return strip_chars
2019-04-04 10:56:46 +00:00
class DepecheMood(EmotionBox):
    '''
    Plugin that uses the DepecheMood emotion lexicon.
    DepecheMood is an emotion lexicon automatically generated from news articles where users expressed their associated emotions. It contains two languages (English and Italian), as well as three types of word representations (token, lemma and lemma#PoS). For English, the lexicon contains 165k tokens, while the Italian version contains 116k. Unsupervised techniques can be applied to generate simple but effective baselines. To learn more, please visit https://github.com/marcoguerini/DepecheMood and http://www.depechemood.eu/
    '''
    # NOTE(review): the docstring text above is kept verbatim because the
    # framework may surface it as the plugin description -- confirm before
    # rewording it.

    author = 'Oscar Araque'
    name = 'emotion-depechemood'
    version = '0.1'
    requirements = ['pandas']
    nltk_resources = ["stopwords"]

    onyx__usesEmotionModel = 'wna:WNAModel'

    # EMOTIONS and DM_EMOTIONS are parallel lists: position i of DM_EMOTIONS
    # is the lexicon column whose score is reported for EMOTIONS[i] (this is
    # the order used by `estimate_all_emotions` and by `test_cases` below).
    EMOTIONS = ['wna:negative-fear',
                'wna:amusement',
                'wna:anger',
                'wna:annoyance',
                'wna:indifference',
                'wna:joy',
                'wna:awe',
                'wna:sadness']

    DM_EMOTIONS = ['AFRAID', 'AMUSED', 'ANGRY', 'ANNOYED', 'DONT_CARE', 'HAPPY', 'INSPIRED', 'SAD',]

    def __init__(self, *args, **kwargs):
        '''
        Set up the plugin without loading the lexicon.

        The lexicon itself is only loaded by `activate`; until then `_lex`
        and `_lex_vocab` are None and no stop words are filtered.
        '''
        super(DepecheMood, self).__init__(*args, **kwargs)
        self.LEXICON_URL = "https://github.com/marcoguerini/DepecheMood/raw/master/DepecheMood%2B%2B/DepecheMood_english_token_full.tsv"
        # Callable that deletes ASCII punctuation and guillemets from a string.
        self._denoise = ignore(set(string.punctuation) | set('«»'))
        self._stop_words = set()
        self._lex_vocab = None
        self._lex = None

    def activate(self):
        '''
        Load the lexicon and the English stop-word list.

        After this call `_lex` maps each token to its per-emotion scores and
        `_lex_vocab` holds the set of known tokens.
        '''
        self._lex = self.download_lex()
        self._lex_vocab = set(self._lex)
        # A set gives constant-time membership tests in `preprocess`. The
        # empty string is included so that the empty tokens produced by
        # splitting on single spaces are discarded too.
        self._stop_words = set(stopwords.words('english')) | {''}

    def clean_str(self, string):
        '''
        Normalise raw text before tokenisation.

        Characters outside a small whitelist become spaces, digit runs become
        the placeholder ``num``, common English clitics and some punctuation
        marks are split off with spaces, whitespace is collapsed, and the
        result is stripped and lower-cased.

        :param string: text to clean (the name shadows the `string` module
            inside this method only; it is kept for caller compatibility).
        :return: the cleaned, lower-cased text.
        '''
        # Order matters: later rules operate on the output of earlier ones.
        substitutions = (
            (r"[^A-Za-z0-9().,!?\'\`]", " "),
            (r"[0-9]+", " num "),
            (r"\'s", " \'s"),
            (r"\'ve", " \'ve"),
            (r"n\'t", " n\'t"),
            (r"\'re", " \'re"),
            (r"\'d", " \'d"),
            (r"\'ll", " \'ll"),
            (r"\.", " . "),
            (r",", " , "),
            (r"!", " ! "),
            (r"\(", " ( "),
            (r"\)", " ) "),
            (r"\?", " ? "),
            (r"\s{2,}", " "),
        )
        for pattern, replacement in substitutions:
            string = re.sub(pattern, replacement, string)
        return string.strip().lower()

    def preprocess(self, text):
        '''
        Turn raw text into a list of content tokens.

        :param text: input text, or None.
        :return: None when *text* is None; otherwise the cleaned text with
            punctuation removed, split on single spaces, without stop words.
        '''
        if text is None:
            return None
        tokens = self._denoise(self.clean_str(text)).split(' ')
        return [tok for tok in tokens if tok not in self._stop_words]

    def estimate_emotion(self, tokens, emotion):
        '''
        Average the lexicon score of one emotion over the given tokens.

        :param tokens: iterable of tokens, all of which must be lexicon keys.
        :param emotion: lexicon column name (one of `DM_EMOTIONS`).
        :return: the mean score, or 0 when *tokens* is empty.
        :raises KeyError: if a token or the emotion is not in the lexicon.
        '''
        scores = [self._lex[tok][emotion] for tok in tokens]
        # Divide by 1 for an empty list so the result is 0 instead of NaN.
        return np.sum(scores) / (len(scores) or 1)

    def estimate_all_emotions(self, tokens):
        '''
        Score every emotion in `DM_EMOTIONS` for a token list.

        Only distinct tokens present in the lexicon contribute.

        :param tokens: list of tokens, or None (as returned by `preprocess`
            for missing text), which is treated as no tokens.
        :return: list of scores, in `DM_EMOTIONS` order.
        '''
        known = set(tokens or ()) & self._lex_vocab
        return [self.estimate_emotion(known, emotion)
                for emotion in self.DM_EMOTIONS]

    def download_lex(self, file_path='DepecheMood_english_token_full.tsv', freq_threshold=10):
        '''
        Load the lexicon, downloading it first when no local copy is found.

        :param file_path: name of the lexicon TSV file.
        :param freq_threshold: rows whose ``freq`` column is below this value
            are discarded.
        :return: dict mapping each token to a dict of its remaining columns.
        '''
        import pandas as pd

        try:
            file_path = self.find_file(file_path)
        except IOError:
            # No local copy: fetch it from LEXICON_URL into the location
            # given by `self.path` (presumably the plugin data folder --
            # TODO confirm against the base class).
            file_path = self.path(file_path)
            urllib.request.urlretrieve(self.LEXICON_URL, file_path)

        lexicon = pd.read_csv(file_path, sep='\t', index_col=0)
        lexicon = lexicon[lexicon['freq'] >= freq_threshold]
        lexicon = lexicon.drop('freq', axis=1)
        # Transpose so that tokens become the outer dictionary keys.
        return lexicon.T.to_dict()

    def predict_one(self, features, **kwargs):
        '''
        Score the emotions of a single input.

        :param features: sequence whose first element is the text to analyse.
        :return: list of scores, in `DM_EMOTIONS` order.
        '''
        tokens = self.preprocess(features[0])
        return self.estimate_all_emotions(tokens)

    test_cases = [
        {
            'entry': {
                'nif:isString': 'My cat is very happy',
            },
            'expected': {
                'onyx:hasEmotionSet': [
                    {
                        'onyx:hasEmotion': [
                            {
                                'onyx:hasEmotionCategory': 'wna:negative-fear',
                                'onyx:hasEmotionIntensity': 0.05278117640010922
                            },
                            {
                                'onyx:hasEmotionCategory': 'wna:amusement',
                                'onyx:hasEmotionIntensity': 0.2114806151413433,
                            },
                            {
                                'onyx:hasEmotionCategory': 'wna:anger',
                                'onyx:hasEmotionIntensity': 0.05726119426520887
                            },
                            {
                                'onyx:hasEmotionCategory': 'wna:annoyance',
                                'onyx:hasEmotionIntensity': 0.12295990731053638,
                            },
                            {
                                'onyx:hasEmotionCategory': 'wna:indifference',
                                'onyx:hasEmotionIntensity': 0.1860159893608025,
                            },
                            {
                                'onyx:hasEmotionCategory': 'wna:joy',
                                'onyx:hasEmotionIntensity': 0.12904050973724163,
                            },
                            {
                                'onyx:hasEmotionCategory': 'wna:awe',
                                'onyx:hasEmotionIntensity': 0.17973650399862967,
                            },
                            {
                                'onyx:hasEmotionCategory': 'wna:sadness',
                                'onyx:hasEmotionIntensity': 0.060724103786128455,
                            },
                        ]
                    }
                ]
            }
        }
    ]
if __name__ == '__main__':
    # Imported here rather than at module level so the helpers are only
    # needed when this file is executed as a script.
    from senpy.utils import easy, easy_load, easy_test

    # Presumably runs the test cases declared by the plugin(s) in this
    # module -- confirm against senpy.utils.easy_test.
    easy_test(debug=False)