mirror of https://github.com/gsi-upm/senpy
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
270 lines · 8.8 KiB · Python
# -*- coding: utf-8 -*-
|
|
|
|
import re
|
|
import nltk
|
|
import csv
|
|
import sys
|
|
import os
|
|
import unicodedata
|
|
import string
|
|
import xml.etree.ElementTree as ET
|
|
import math
|
|
|
|
from sklearn.svm import LinearSVC
|
|
from sklearn.feature_extraction import DictVectorizer
|
|
|
|
from nltk import bigrams
|
|
from nltk import trigrams
|
|
from nltk.corpus import stopwords
|
|
|
|
from pattern.en import parse as parse_en
|
|
from pattern.es import parse as parse_es
|
|
from senpy.plugins import EmotionPlugin, SenpyPlugin
|
|
from senpy.models import Results, EmotionSet, Entry, Emotion
|
|
|
|
|
|
### BEGIN WORKAROUND FOR PATTERN
|
|
# See: https://github.com/clips/pattern/issues/308
|
|
|
|
import os.path
|
|
|
|
import pattern.text
|
|
|
|
from pattern.helpers import decode_string
|
|
from codecs import BOM_UTF8
|
|
|
|
# pattern ships BOM_UTF8 as bytes; decode it once so it can be used to
# strip a BOM from already-decoded text lines in _read() below.
BOM_UTF8 = BOM_UTF8.decode("utf-8")
# Alias matching pattern's own helper name, used by the _read workaround.
decode_utf8 = decode_string

# Base IRI of the EmotionML PAD (pleasure-arousal-dominance) vocabulary
# this plugin annotates with.
MODEL = "emoml:pad-dimensions_"
# NOTE(review): MODEL already ends in "_", so these f-strings produce a
# double underscore (e.g. "emoml:pad-dimensions__valence"). The in-file
# test cases use these same constants, so it is at least self-consistent —
# confirm against the emoml vocabulary before changing.
VALENCE = f"{MODEL}_valence"
AROUSAL = f"{MODEL}_arousal"
DOMINANCE = f"{MODEL}_dominance"
|
|
|
|
### BEGIN WORKAROUND FOR PATTERN
# See: https://github.com/clips/pattern/issues/308

def _read(path, encoding="utf-8", comment=";;;"):
    """Return an iterator over the lines in *path*, stripping comments
    and decoding each line to Unicode.

    *path* may be a filesystem path, a string containing the data
    itself, or an already-open file/buffer. Empty lines and lines
    starting with *comment* are skipped; a UTF-8 BOM on the first line
    is stripped.
    """
    if not path:
        return
    opened_here = False
    if isinstance(path, str) and os.path.exists(path):
        # From file path.
        f = open(path, "r", encoding="utf-8")
        opened_here = True
    elif isinstance(path, str):
        # From string.
        f = path.splitlines()
    else:
        # From file or buffer.
        f = path
    try:
        for i, line in enumerate(f):
            # Strip the BOM characters from the first line, if present.
            line = line.strip(BOM_UTF8) if i == 0 and isinstance(line, str) else line
            line = line.strip()
            line = decode_utf8(line, encoding)
            if not line or (comment and line.startswith(comment)):
                continue
            yield line
    finally:
        # Fix: the original leaked the file descriptor when it opened the
        # file itself. Only close handles we opened; caller-supplied
        # buffers stay open.
        if opened_here:
            f.close()


# Replace pattern's module-level reader with the Python-3-safe version above.
pattern.text._read = _read
## END WORKAROUND
|
|
|
|
|
|
class ANEW(EmotionPlugin):
    """Dimensional (VAD) emotion classifier based on the ANEW lexicon.

    Averages the valence/arousal/dominance of every word of the input
    that appears in the ANEW dictionary for the requested language.
    """

    description = "This plugin consists on an emotion classifier using ANEW lexicon dictionary. It averages the VAD (valence-arousal-dominance) value of each word in the text that is also in the ANEW dictionary. To obtain a categorical value (e.g., happy) use the emotion conversion API (e.g., `emotion-model=emoml:big6`)."
    author = "@icorcuera"
    version = "0.5.2"
    name = "emotion-anew"

    extra_params = {
        "language": {
            "description": "language of the input",
            "aliases": ["language", "l"],
            "required": True,
            "options": ["es","en"],
            "default": "en"
        }
    }

    anew_path_es = "Dictionary/Redondo(2007).csv"
    anew_path_en = "Dictionary/ANEW2010All.txt"
    onyx__usesEmotionModel = MODEL
    nltk_resources = ['stopwords']

    # Compiled once instead of on every call to _my_preprocessor.
    # (Patterns reproduced verbatim; the unescaped '.' is pre-existing.)
    _REG_HTTP = re.compile('(http://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
    _REG_HTTPS = re.compile('(https://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
    _REG_AT = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')

    def _load_lexicon(self, path, word, v, a, d):
        """Read a tab-separated ANEW lexicon file.

        *word*, *v*, *a* and *d* are the column indices of the word and
        its valence/arousal/dominance scores. Values are kept as strings
        (as in the original) and converted to float at lookup time.
        """
        lexicon = {}
        with self.open(path, 'r') as tabfile:
            for row in csv.reader(tabfile, delimiter='\t'):
                lexicon[row[word]] = {'V': row[v], 'A': row[a], 'D': row[d]}
        return lexicon

    def activate(self, *args, **kwargs):
        """Load stopwords and both ANEW lexicons into self._dictionary."""
        # NOTE(review): only English stopwords are loaded, even though
        # Spanish input is supported — confirm this is intended.
        self._stopwords = stopwords.words('english')
        self._dictionary = {
            # Spanish lexicon: word in column 2, V/A/D in columns 3/5/7.
            'es': self._load_lexicon(self.anew_path_es, word=2, v=3, a=5, d=7),
            # English lexicon: word in column 0, V/A/D in columns 2/4/6.
            'en': self._load_lexicon(self.anew_path_en, word=0, v=2, a=4, d=6),
        }

    def _my_preprocessor(self, text):
        """Strip URLs, @mentions, retweet markers, digits and punctuation."""
        text = self._REG_HTTP.sub('', text)
        text = self._REG_AT.sub('', text)
        text = re.sub('RT : ', '', text)
        text = self._REG_HTTPS.sub('', text)
        text = re.sub('[0-9]', '', text)
        return self._delete_punctuation(text)

    def _delete_punctuation(self, text):
        """Return *text* with every ASCII punctuation character removed."""
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def _extract_ngrams(self, text, lang):
        """Tokenize *text* with pattern's parser.

        Returns three parallel lists — lemmas, lowercased surface forms
        and POS tags — with stopword tokens filtered out.
        """
        parser = parse_es if lang == 'es' else parse_en
        lemmas, words, tags = [], [], []
        for sentence in parser(text, lemmata=True).split():
            for token in sentence:
                surface = token[0].lower()
                if surface in self._stopwords:
                    continue
                words.append(surface)
                lemmas.append(token[4])
                tags.append(token[1])
        return lemmas, words, tags

    def _find_ngrams(self, input_list, n):
        """Return an iterator over the n-gram tuples of *input_list*."""
        return zip(*[input_list[i:] for i in range(n)])

    def _extract_features(self, tweet, dictionary, lang):
        """Average the VAD of every token of *tweet* found in *dictionary*.

        The lemma is looked up first, then the surface form. Returns a
        dict with keys 'V', 'A' and 'D'.

        Fix: the original returned an EMPTY dict when no token matched
        (the 'neutral' assignment was dead code), making analyse_entry
        crash with KeyError; a neutral midpoint is now returned instead.
        """
        lemmas, words, _ = self._extract_ngrams(tweet, lang)
        totals = [0.0, 0.0, 0.0]
        matches = 0
        for lemma, word in zip(lemmas, words):
            if lemma in dictionary:
                key = lemma
            elif word in dictionary:
                key = word
            else:
                continue
            matches += 1
            scores = dictionary[key]
            totals[0] += float(scores['V'])
            totals[1] += float(scores['A'])
            totals[2] += float(scores['D'])
        if matches:
            vad = [total / matches for total in totals]
        else:
            # No lexicon hit: report the neutral midpoint of the 1-9
            # ANEW scale rather than an empty feature set.
            vad = [5.0, 5.0, 5.0]
        return {'V': vad[0], 'A': vad[1], 'D': vad[2]}

    def analyse_entry(self, entry, activity):
        """Annotate *entry* with an EmotionSet holding the averaged VAD
        values of its text, then yield the annotated entry."""
        params = activity.params
        lang = params['language']

        text = self._my_preprocessor(entry.text)
        feature_set = self._extract_features(text, self._dictionary[lang], lang)

        emotions = EmotionSet()
        emotions.id = "Emotions0"

        emotion1 = Emotion(id="Emotion0")
        emotion1[VALENCE] = feature_set['V']
        emotion1[AROUSAL] = feature_set['A']
        emotion1[DOMINANCE] = feature_set['D']

        emotion1.prov(activity)
        emotions.prov(activity)

        emotions.onyx__hasEmotion.append(emotion1)
        entry.emotions = [emotions, ]

        yield entry

    test_cases = [
        {
            'name': 'anger with VAD=(2.12, 6.95, 5.05)',
            'input': 'I hate you',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        AROUSAL: 6.95,
                        DOMINANCE: 5.05,
                        VALENCE: 2.12,
                    }]
                }]
            }
        }, {
            'input': 'i am sad',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        f"{MODEL}_arousal": 4.13,
                    }]
                }]
            }
        }, {
            'name': 'joy',
            'input': 'i am happy with my marks',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        AROUSAL: 6.49,
                        DOMINANCE: 6.63,
                        VALENCE: 8.21,
                    }]
                }]
            }
        }, {
            'name': 'negative-feat',
            'input': 'This movie is scary',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        AROUSAL: 5.8100000000000005,
                        DOMINANCE: 4.33,
                        VALENCE: 5.050000000000001,
                    }]
                }]
            }
        }, {
            'name': 'negative-fear',
            'input': 'this cake is disgusting' ,
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        AROUSAL: 5.09,
                        DOMINANCE: 4.4,
                        VALENCE: 5.109999999999999,
                    }]
                }]
            }
        }
    ]
|