# -*- coding: utf-8 -*-

import re
import nltk
import csv
import sys
import os
import unicodedata
import string
import xml.etree.ElementTree as ET
import math
from sklearn.svm import LinearSVC
from sklearn.feature_extraction import DictVectorizer
from nltk import bigrams
from nltk import trigrams
from nltk.corpus import stopwords
from pattern.en import parse as parse_en
from pattern.es import parse as parse_es
from senpy.plugins import EmotionPlugin, SenpyPlugin
from senpy.models import Results, EmotionSet, Entry, Emotion

# Matches a whole http/https URL up to the next whitespace character.
# The previous patterns used an unescaped '.', which matches any character,
# so the word following a URL (e.g. " happy" in "http://t.co happy") was
# swallowed together with the link.
_URL_RE = re.compile(r'https?://\S+')

# Matches an @mention together with the symbol/alphanumeric runs after it.
_MENTION_RE = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')

# Characters stripped by ANEW._delete_punctuation; built once at import time.
_PUNCTUATION = frozenset(string.punctuation)


class ANEW(EmotionPlugin):
    """Emotion plugin that scores text with the ANEW lexicon.

    The mean valence, arousal and dominance (VAD) of the lexicon words found
    in the input are attached to each entry as PAD dimensions.
    """

    description = (
        "This plugin consists on an emotion classifier using ANEW lexicon "
        "dictionary to calculate VAD (valence-arousal-dominance) of the "
        "sentence and determinate which emotion is closer to this value. "
        "Each emotion has a centroid, calculated according to this article: "
        "http://www.aclweb.org/anthology/W10-0208. The plugin is going to "
        "look for the words in the sentence that appear in the ANEW "
        "dictionary and calculate the average VAD score for the sentence. "
        "To obtain a categorical value (e.g., happy) use the emotion "
        "conversion API (e.g., `emotion-model=emoml:big6`)."
    )
    author = "@icorcuera"
    version = "0.5.2"
    name = "emotion-anew"

    extra_params = {
        "language": {
            "description": "language of the input",
            "aliases": ["language", "l"],
            "required": True,
            "options": ["es", "en"],
            "default": "en"
        }
    }

    anew_path_es = "Dictionary/Redondo(2007).csv"
    anew_path_en = "Dictionary/ANEW2010All.txt"
    onyx__usesEmotionModel = "emoml:pad-dimensions"
    nltk_resources = ['stopwords']

    def activate(self, *args, **kwargs):
        """Load the stopword list and both lexicons into memory.

        Sets ``self._stopwords`` and ``self._dictionary``; the latter maps a
        language code ('es' or 'en') to ``{word: {'V': ..., 'A': ..., 'D': ...}}``
        where the scores are kept as the raw strings read from the file.
        """
        # A frozenset gives O(1) membership tests in _extract_ngrams.
        # NOTE(review): only English stopwords are loaded, and they are also
        # applied to Spanish input -- confirm this is intended.
        self._stopwords = frozenset(stopwords.words('english'))
        self._dictionary = {
            # Column positions differ between the two lexicon files.
            'es': self._load_lexicon(self.anew_path_es, 2, 3, 5, 7),
            'en': self._load_lexicon(self.anew_path_en, 0, 2, 4, 6),
        }

    def _load_lexicon(self, path, word_col, valence_col, arousal_col,
                      dominance_col):
        """Read a tab-separated lexicon file into ``{word: {'V','A','D'}}``.

        Scores are stored unconverted; they are turned into floats only when
        a word is actually matched in _extract_features.
        """
        lexicon = {}
        with self.open(path, 'r') as tabfile:
            for row in csv.reader(tabfile, delimiter='\t'):
                lexicon[row[word_col]] = {
                    'V': row[valence_col],
                    'A': row[arousal_col],
                    'D': row[dominance_col],
                }
        return lexicon

    def _my_preprocessor(self, text):
        """Strip URLs, @mentions, the 'RT : ' marker, digits and punctuation."""
        text = _URL_RE.sub('', text)
        text = _MENTION_RE.sub('', text)
        # 'RT : ' is what remains of "RT @user: ..." once the mention is gone,
        # so this step has to run after the mention removal.
        text = text.replace('RT : ', '')
        text = re.sub('[0-9]', '', text)
        return self._delete_punctuation(text)

    def _delete_punctuation(self, text):
        """Return *text* without any character from ``string.punctuation``."""
        return ''.join(ch for ch in text if ch not in _PUNCTUATION)

    def _extract_ngrams(self, text, lang):
        """Parse *text* and collect the non-stopword tokens.

        Returns three parallel lists ``(lemmas, words, pos_tags)``. The
        Spanish parser is used when *lang* is 'es', the English one otherwise.
        """
        unigrams_lemmas = []
        unigrams_words = []
        pos_tagged = []
        parse = parse_es if lang == 'es' else parse_en
        for sentence in parse(text, lemmata=True).split():
            for token in sentence:
                word = token[0].lower()
                if word in self._stopwords:
                    continue
                # The three lists must stay index-aligned: _extract_features
                # pairs each lemma with the word at the same position.
                unigrams_words.append(word)
                unigrams_lemmas.append(token[4])
                pos_tagged.append(token[1])
        return unigrams_lemmas, unigrams_words, pos_tagged

    def _find_ngrams(self, input_list, n):
        """Return the consecutive n-grams of *input_list* as a zip of tuples."""
        return zip(*[input_list[i:] for i in range(n)])

    def _extract_features(self, tweet, dictionary, lang):
        """Compute the mean VAD of the lexicon words found in *tweet*.

        Each token is looked up by lemma first and by surface form second.
        Returns ``{'V': ..., 'A': ..., 'D': ...}``; all three values are 0
        when no token is found in *dictionary*.
        """
        lemmas, words, _pos_tags = self._extract_ngrams(tweet, lang)
        totals = [0, 0, 0]
        matches = 0
        for lemma, word in zip(lemmas, words):
            scores = dictionary.get(lemma)
            if scores is None:
                scores = dictionary.get(word)
            if scores is None:
                continue
            matches += 1
            totals[0] += float(scores['V'])
            totals[1] += float(scores['A'])
            totals[2] += float(scores['D'])
        if matches:
            totals = [total / matches for total in totals]
        return {'V': totals[0], 'A': totals[1], 'D': totals[2]}

    def analyse_entry(self, entry, activity):
        """Attach the averaged VAD scores of ``entry.text`` to *entry*.

        The lexicon is chosen by ``activity.params['language']``. Yields the
        same entry with ``entry.emotions`` set to a single EmotionSet.
        """
        params = activity.params
        language = params['language']
        text = self._my_preprocessor(entry.text)
        feature_set = self._extract_features(
            text, self._dictionary[language], language)

        emotions = EmotionSet()
        emotions.id = "Emotions0"

        emotion1 = Emotion(id="Emotion0")
        emotion1["emoml:pad-dimensions_pleasure"] = feature_set['V']
        emotion1["emoml:pad-dimensions_arousal"] = feature_set['A']
        emotion1["emoml:pad-dimensions_dominance"] = feature_set['D']

        emotion1.prov(activity)
        emotions.prov(activity)

        emotions.onyx__hasEmotion.append(emotion1)
        entry.emotions = [emotions, ]

        yield entry

    ontology = "http://gsi.dit.upm.es/ontologies/wnaffect/ns#"

    test_cases = [
        {
            'name': 'anger with VAD=(2.12, 6.95, 5.05)',
            'input': 'I hate you',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#arousal": 6.95,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#dominance": 5.05,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#valence": 2.12,
                    }]
                }]
            }
        }, {
            'input': 'i am sad',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#arousal": 4.13,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#dominance": 3.45,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#valence": 1.61,
                    }]
                }]
            }
        }, {
            'name': 'joy',
            'input': 'i am happy with my marks',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#arousal": 6.49,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#dominance": 6.63,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#valence": 8.21,
                    }]
                }]
            }
        }, {
            'name': 'negative-feat',
            'input': 'This movie is scary',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#arousal": 5.8100000000000005,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#dominance": 4.33,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#valence": 5.050000000000001,
                    }]
                }]
            }
        }, {
            'name': 'negative-fear',
            'input': 'this cake is disgusting',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#arousal": 5.09,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#dominance": 4.4,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#valence": 5.109999999999999,
                    }]
                }]
            }
        }
    ]