# -*- coding: utf-8 -*-

import re
import nltk
import csv
import sys
import os
import unicodedata
import string
import xml.etree.ElementTree as ET
import math

from sklearn.svm import LinearSVC
from sklearn.feature_extraction import DictVectorizer

from nltk import bigrams
from nltk import trigrams
from nltk.corpus import stopwords

from pattern.en import parse as parse_en
from pattern.es import parse as parse_es
from senpy.plugins import EmotionPlugin, SenpyPlugin
from senpy.models import Results, EmotionSet, Entry, Emotion

### BEGIN WORKAROUND FOR PATTERN
# See: https://github.com/clips/pattern/issues/308

import os.path

import pattern.text
from pattern.helpers import decode_string
from codecs import BOM_UTF8

BOM_UTF8 = BOM_UTF8.decode("utf-8")
decode_utf8 = decode_string

# NOTE(review): MODEL already ends with "_" and the f-strings below add a
# second one, so the keys contain a double underscore
# ("emoml:pad-dimensions__valence"). The test cases use the same constants,
# so this is left untouched -- confirm against the emotion model vocabulary.
MODEL = "emoml:pad-dimensions_"
VALENCE = f"{MODEL}_valence"
AROUSAL = f"{MODEL}_arousal"
DOMINANCE = f"{MODEL}_dominance"


def _read(path, encoding="utf-8", comment=";;;"):
    """Returns an iterator over the lines in the file at the given path,
        stripping comments and decoding each line to Unicode.

    ``path`` may be a path to an existing file, a string holding the content
    itself, or any iterable of lines (file object, buffer, list). Blank lines
    and lines starting with ``comment`` are skipped. A falsy ``path`` yields
    nothing.
    """
    if not path:
        return
    opened = None
    if isinstance(path, str) and os.path.exists(path):
        # From file path. The handle is owned by this generator and is
        # closed in the ``finally`` clause below.
        opened = open(path, "r", encoding="utf-8")
        f = opened
    elif isinstance(path, str):
        # From string.
        f = path.splitlines()
    else:
        # From file or buffer (owned by the caller, never closed here).
        f = path
    try:
        for i, line in enumerate(f):
            # Only the very first line can carry a byte-order mark.
            line = line.strip(BOM_UTF8) if i == 0 and isinstance(line, str) else line
            line = line.strip()
            line = decode_utf8(line, encoding)
            if not line or (comment and line.startswith(comment)):
                continue
            yield line
    finally:
        if opened is not None:
            opened.close()


pattern.text._read = _read

## END WORKAROUND

# Patterns used by ANEW._my_preprocessor, compiled once at import time.
_REG_HTTP = re.compile('(http://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
_REG_HTTPS = re.compile('(https://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
_REG_AT = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')
_REG_RT = re.compile('RT : ')
_REG_DIGIT = re.compile('[0-9]')

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


class ANEW(EmotionPlugin):
    description = (
        "This plugin consists on an emotion classifier using ANEW lexicon dictionary. "
        "It averages the VAD (valence-arousal-dominance) value of each word in the text "
        "that is also in the ANEW dictionary. "
        "To obtain a categorical value (e.g., happy) use the emotion conversion API "
        "(e.g., `emotion-model=emoml:big6`)."
    )
    author = "@icorcuera"
    version = "0.5.2"
    name = "emotion-anew"

    extra_params = {
        "language": {
            "description": "language of the input",
            "aliases": ["language", "l"],
            "required": True,
            "options": ["es","en"],
            "default": "en"
        }
    }

    anew_path_es = "Dictionary/Redondo(2007).csv"
    anew_path_en = "Dictionary/ANEW2010All.txt"
    onyx__usesEmotionModel = MODEL
    nltk_resources = ['stopwords']

    def activate(self, *args, **kwargs):
        """Load the stopword list and both ANEW lexicons into memory.

        After this call ``self._dictionary[lang][word]`` is a dict with the
        keys ``'V'``, ``'A'`` and ``'D'`` holding the raw (string) cell
        values read from the lexicon file for ``lang`` in ``('es', 'en')``.
        """
        # A frozenset gives O(1) membership tests in _extract_ngrams.
        self._stopwords = frozenset(stopwords.words('english'))
        dictionary = {}
        dictionary['es'] = self._load_lexicon(self.anew_path_es, 2, 3, 5, 7)
        dictionary['en'] = self._load_lexicon(self.anew_path_en, 0, 2, 4, 6)
        self._dictionary = dictionary

    def _load_lexicon(self, path, word_col, v_col, a_col, d_col):
        """Read a tab-separated lexicon file into ``{word: {'V','A','D'}}``.

        The column indices say where the word and its valence, arousal and
        dominance cells live in each row. Cell values are kept as strings
        and converted to float only when a word is actually looked up. If a
        word occurs in several rows, the last row wins.
        """
        lexicon = {}
        with self.open(path, 'r') as tabfile:
            for row in csv.reader(tabfile, delimiter='\t'):
                lexicon[row[word_col]] = {
                    'V': row[v_col],
                    'A': row[a_col],
                    'D': row[d_col],
                }
        return lexicon

    def _my_preprocessor(self, text):
        """Strip URLs, @mentions, the 'RT : ' marker, digits and punctuation.

        The substitutions run in a fixed order (http, mentions, 'RT : ',
        https, digits, punctuation); later steps see the output of earlier
        ones, so the order must not be changed.
        """
        text = _REG_HTTP.sub('', text)
        text = _REG_AT.sub('', text)
        text = _REG_RT.sub('', text)
        text = _REG_HTTPS.sub('', text)
        text = _REG_DIGIT.sub('', text)
        return self._delete_punctuation(text)

    def _delete_punctuation(self, text):
        """Return ``text`` without any character of ``string.punctuation``."""
        return text.translate(_PUNCTUATION_TABLE)

    def _extract_ngrams(self, text, lang):
        """Tokenise ``text`` and return parallel lists of its unigrams.

        The text is parsed with the Spanish parser when ``lang == 'es'`` and
        with the English parser otherwise. Tokens whose lowercased form is in
        ``self._stopwords`` are dropped.

        Returns a tuple ``(lemmas, words, pos_tags)`` of equally long lists,
        taken from token fields 4, 0 (lowercased) and 1 respectively.
        """
        unigrams_lemmas = []
        unigrams_words = []
        pos_tagged = []
        parse = parse_es if lang == 'es' else parse_en
        for sentence in parse(text, lemmata=True).split():
            for token in sentence:
                word = token[0].lower()
                if word in self._stopwords:
                    continue
                unigrams_words.append(word)
                unigrams_lemmas.append(token[4])
                pos_tagged.append(token[1])
        return unigrams_lemmas, unigrams_words, pos_tagged

    def _find_ngrams(self, input_list, n):
        """Return an iterator over the consecutive ``n``-tuples of ``input_list``."""
        return zip(*[input_list[i:] for i in range(n)])

    def _extract_features(self, tweet, dictionary, lang):
        """Average the VAD scores of the words of ``tweet`` found in ``dictionary``.

        Each token is looked up by lemma first and by surface word second;
        tokens found neither way are ignored.

        Returns a dict with the keys ``'V'``, ``'A'`` and ``'D'``. When no
        token matches the lexicon all three values are 0, so callers can
        always read the three keys.
        """
        lemmas, words, _pos_tagged = self._extract_ngrams(tweet, lang)

        total_v = total_a = total_d = 0
        matches = 0
        for lemma, word in zip(lemmas, words):
            if lemma in dictionary:
                scores = dictionary[lemma]
            elif word in dictionary:
                scores = dictionary[word]
            else:
                continue
            matches += 1
            total_v += float(scores['V'])
            total_a += float(scores['A'])
            total_d += float(scores['D'])

        if matches:
            total_v /= matches
            total_a /= matches
            total_d /= matches

        return {'V': total_v, 'A': total_a, 'D': total_d}

    def analyse_entry(self, entry, activity):
        """Annotate ``entry`` with one emotion holding its averaged VAD values.

        The language is read from ``activity.params['language']``. The entry
        text is preprocessed, scored against the lexicon for that language,
        and the result is attached as a single-element ``entry.emotions``
        list before the entry is yielded.
        """
        language = activity.params['language']
        text = self._my_preprocessor(entry.text)
        feature_set = self._extract_features(
            text, self._dictionary[language], language)

        emotions = EmotionSet()
        emotions.id = "Emotions0"

        emotion1 = Emotion(id="Emotion0")
        emotion1[VALENCE] = feature_set['V']
        emotion1[AROUSAL] = feature_set['A']
        emotion1[DOMINANCE] = feature_set['D']

        emotion1.prov(activity)
        emotions.prov(activity)

        emotions.onyx__hasEmotion.append(emotion1)
        entry.emotions = [emotions, ]

        yield entry

    test_cases = [
        {
            'name': 'anger with VAD=(2.12, 6.95, 5.05)',
            'input': 'I hate you',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        AROUSAL: 6.95,
                        DOMINANCE: 5.05,
                        VALENCE: 2.12,
                    }]
                }]
            }
        }, {
            'input': 'i am sad',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        f"{MODEL}_arousal": 4.13,
                    }]
                }]
            }
        }, {
            'name': 'joy',
            'input': 'i am happy with my marks',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        AROUSAL: 6.49,
                        DOMINANCE: 6.63,
                        VALENCE: 8.21,
                    }]
                }]
            }
        }, {
            'name': 'negative-feat',
            'input': 'This movie is scary',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        AROUSAL: 5.8100000000000005,
                        DOMINANCE: 4.33,
                        VALENCE: 5.050000000000001,
                    }]
                }]
            }
        }, {
            'name': 'negative-fear',
            'input': 'this cake is disgusting' ,
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        AROUSAL: 5.09,
                        DOMINANCE: 4.4,
                        VALENCE: 5.109999999999999,
                    }]
                }]
            }
        }
    ]