senpy/senpy/plugins/emotion/anew/emotion-anew.py

# -*- coding: utf-8 -*-

import re
import nltk
import csv
import sys
import os
import unicodedata
import string
import xml.etree.ElementTree as ET
import math

from sklearn.svm import LinearSVC
from sklearn.feature_extraction import DictVectorizer

from nltk import bigrams
from nltk import trigrams
from nltk.corpus import stopwords

from pattern.en import parse as parse_en
from pattern.es import parse as parse_es
from senpy.plugins import EmotionPlugin, SenpyPlugin
from senpy.models import Results, EmotionSet, Entry, Emotion


### BEGIN WORKAROUND FOR PATTERN
# See: https://github.com/clips/pattern/issues/308

import os.path

import pattern.text

from pattern.helpers import decode_string
from codecs import BOM_UTF8

BOM_UTF8 = BOM_UTF8.decode("utf-8")
decode_utf8 = decode_string

MODEL = "emoml:pad-dimensions_"
VALENCE = f"{MODEL}_valence"
AROUSAL = f"{MODEL}_arousal"
DOMINANCE = f"{MODEL}_dominance"

def _read(path, encoding="utf-8", comment=";;;"):
    """Returns an iterator over the lines in the file at the given path,
    strippping comments and decoding each line to Unicode.
    """
    if path:
        if isinstance(path, str) and os.path.exists(path):
            # From file path.
            f = open(path, "r", encoding="utf-8")
        elif isinstance(path, str):
            # From string.
            f = path.splitlines()
        else:
            # From file or buffer.
            f = path
        for i, line in enumerate(f):
            line = line.strip(BOM_UTF8) if i == 0 and isinstance(line, str) else line
            line = line.strip()
            line = decode_utf8(line, encoding)
            if not line or (comment and line.startswith(comment)):
                continue
            yield line


pattern.text._read = _read
## END WORKAROUND


class ANEW(EmotionPlugin):
    description = "This plugin consists on an emotion classifier using ANEW lexicon dictionary. It averages the VAD (valence-arousal-dominance) value of each word in the text that is also in the ANEW dictionary. To obtain a categorical value (e.g., happy) use the emotion conversion API (e.g., `emotion-model=emoml:big6`)."
    author = "@icorcuera"
    version = "0.5.2"
    name = "emotion-anew"

    extra_params = {
        "language": {
            "description": "language of the input",
            "aliases": ["language", "l"],
            "required": True,
            "options": ["es","en"],
            "default": "en"
        }
    }

    anew_path_es = "Dictionary/Redondo(2007).csv"
    anew_path_en = "Dictionary/ANEW2010All.txt"
    onyx__usesEmotionModel = MODEL
    nltk_resources = ['stopwords']

    def activate(self, *args, **kwargs):
        self._stopwords = stopwords.words('english')
        dictionary={}
        dictionary['es'] = {}
        with self.open(self.anew_path_es,'r') as tabfile:
            reader = csv.reader(tabfile, delimiter='\t')
            for row in reader:
                dictionary['es'][row[2]]={}
                dictionary['es'][row[2]]['V']=row[3]
                dictionary['es'][row[2]]['A']=row[5]
                dictionary['es'][row[2]]['D']=row[7]
        dictionary['en'] = {}
        with self.open(self.anew_path_en,'r') as tabfile:
            reader = csv.reader(tabfile, delimiter='\t')
            for row in reader:
                dictionary['en'][row[0]]={}
                dictionary['en'][row[0]]['V']=row[2]
                dictionary['en'][row[0]]['A']=row[4]
                dictionary['en'][row[0]]['D']=row[6]
        self._dictionary = dictionary

    def _my_preprocessor(self, text):

        regHttp = re.compile('(http://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
        regHttps = re.compile('(https://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
        regAt = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')
        text = re.sub(regHttp, '', text)
        text = re.sub(regAt, '', text)
        text = re.sub('RT : ', '', text)
        text = re.sub(regHttps, '', text)
        text = re.sub('[0-9]', '', text)
        text = self._delete_punctuation(text)
        return text

    def _delete_punctuation(self, text):

        exclude = set(string.punctuation)
        s = ''.join(ch for ch in text if ch not in exclude)
        return s

    def _extract_ngrams(self, text, lang):
        unigrams_lemmas = []
        unigrams_words = []
        pos_tagged = []
        if lang == 'es':
            sentences = list(parse_es(text, lemmata=True).split())
        else:
            sentences = list(parse_en(text, lemmata=True).split())

        for sentence in sentences:
            for token in sentence:
                if token[0].lower() not in self._stopwords:
                    unigrams_words.append(token[0].lower())
                    unigrams_lemmas.append(token[4])
                    pos_tagged.append(token[1])

        return unigrams_lemmas,unigrams_words,pos_tagged

    def _find_ngrams(self, input_list, n):
        return zip(*[input_list[i:] for i in range(n)])

    def _extract_features(self, tweet,dictionary,lang):
        feature_set={}
        ngrams_lemmas,ngrams_words,pos_tagged = self._extract_ngrams(tweet,lang)
        pos_tags={'NN':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB',
         'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VGB':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'}
        totalVAD=[0,0,0]
        matches=0
        for word in range(len(ngrams_lemmas)):
            VAD=[]
            if ngrams_lemmas[word] in dictionary:
                matches+=1
                totalVAD = [totalVAD[0]+float(dictionary[ngrams_lemmas[word]]['V']),
                            totalVAD[1]+float(dictionary[ngrams_lemmas[word]]['A']),
                            totalVAD[2]+float(dictionary[ngrams_lemmas[word]]['D'])]
            elif ngrams_words[word] in dictionary:
                matches+=1
                totalVAD = [totalVAD[0]+float(dictionary[ngrams_words[word]]['V']),
                            totalVAD[1]+float(dictionary[ngrams_words[word]]['A']),
                            totalVAD[2]+float(dictionary[ngrams_words[word]]['D'])]
        if matches==0:
            emotion='neutral'
        else:
            totalVAD=[totalVAD[0]/matches,totalVAD[1]/matches,totalVAD[2]/matches]
        feature_set['V'] = totalVAD[0]
        feature_set['A'] = totalVAD[1]
        feature_set['D'] = totalVAD[2]
        return feature_set

    def analyse_entry(self, entry, activity):
        params = activity.params

        text_input = entry.text

        text = self._my_preprocessor(text_input)
        dictionary = self._dictionary[params['language']]

        feature_set=self._extract_features(text, dictionary, params['language'])

        emotions = EmotionSet()
        emotions.id = "Emotions0"

        emotion1 = Emotion(id="Emotion0")
        emotion1[VALENCE] = feature_set['V']
        emotion1[AROUSAL] = feature_set['A']
        emotion1[DOMINANCE] = feature_set['D']

        emotion1.prov(activity)
        emotions.prov(activity)

        emotions.onyx__hasEmotion.append(emotion1)
        entry.emotions = [emotions, ]

        yield entry

    test_cases = [
        {
            'name': 'anger with VAD=(2.12, 6.95, 5.05)',
            'input': 'I hate you',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        AROUSAL: 6.95,
                        DOMINANCE: 5.05,
                        VALENCE: 2.12,
                    }]
                }]
            }
        }, {
            'input': 'i am sad',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        f"{MODEL}_arousal": 4.13,

                    }]
                }]
            }
        }, {
            'name': 'joy',
            'input': 'i am happy with my marks',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        AROUSAL: 6.49,
                        DOMINANCE: 6.63,
                        VALENCE: 8.21,
                    }]
                }]
            }
        }, {
            'name': 'negative-feat',
            'input': 'This movie is scary',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                    AROUSAL: 5.8100000000000005,
                    DOMINANCE: 4.33,
                    VALENCE: 5.050000000000001,

                    }]
                }]
            }
        }, {
            'name': 'negative-fear',
            'input': 'this cake is disgusting' ,
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        AROUSAL: 5.09,
                        DOMINANCE: 4.4,
                        VALENCE: 5.109999999999999,

                    }]
                }]
            }
        }
    ]
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago			`# -- coding: utf-8 --`

			`import re`
			`import nltk`
			`import csv`
			`import sys`
			`import os`
			`import unicodedata`
			`import string`
			`import xml.etree.ElementTree as ET`
			`import math`

			`from sklearn.svm import LinearSVC`
			`from sklearn.feature_extraction import DictVectorizer`

			`from nltk import bigrams`
			`from nltk import trigrams`
			`from nltk.corpus import stopwords`

			`from pattern.en import parse as parse_en`
			`from pattern.es import parse as parse_es`
Update to senpy 0.20 5 years ago			`from senpy.plugins import EmotionPlugin, SenpyPlugin`
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago			`from senpy.models import Results, EmotionSet, Entry, Emotion`


update to pass tests with community plugins 8 months ago			`### BEGIN WORKAROUND FOR PATTERN`
			`# See: https://github.com/clips/pattern/issues/308`

			`import os.path`

			`import pattern.text`

			`from pattern.helpers import decode_string`
			`from codecs import BOM_UTF8`

			`BOM_UTF8 = BOM_UTF8.decode("utf-8")`
			`decode_utf8 = decode_string`

			`MODEL = "emoml:pad-dimensions_"`
			`VALENCE = f"{MODEL}_valence"`
			`AROUSAL = f"{MODEL}_arousal"`
			`DOMINANCE = f"{MODEL}_dominance"`

			`def _read(path, encoding="utf-8", comment=";;;"):`
			`"""Returns an iterator over the lines in the file at the given path,`
			`strippping comments and decoding each line to Unicode.`
			`"""`
			`if path:`
			`if isinstance(path, str) and os.path.exists(path):`
			`# From file path.`
			`f = open(path, "r", encoding="utf-8")`
			`elif isinstance(path, str):`
			`# From string.`
			`f = path.splitlines()`
			`else:`
			`# From file or buffer.`
			`f = path`
			`for i, line in enumerate(f):`
			`line = line.strip(BOM_UTF8) if i == 0 and isinstance(line, str) else line`
			`line = line.strip()`
			`line = decode_utf8(line, encoding)`
			`if not line or (comment and line.startswith(comment)):`
			`continue`
			`yield line`


			`pattern.text._read = _read`
			`## END WORKAROUND`


Update to senpy 0.20 5 years ago			`class ANEW(EmotionPlugin):`
update emotion-anew description 5 years ago			description = "This plugin consists on an emotion classifier using ANEW lexicon dictionary. It averages the VAD (valence-arousal-dominance) value of each word in the text that is also in the ANEW dictionary. To obtain a categorical value (e.g., happy) use the emotion conversion API (e.g., `emotion-model=emoml:big6`)."
Merged into monorepo 6 years ago			`author = "@icorcuera"`
Clean up emotion-anew 5 years ago			`version = "0.5.2"`
Merged into monorepo 6 years ago			`name = "emotion-anew"`

			`extra_params = {`
			`"language": {`
Update to senpy 0.20 5 years ago			`"description": "language of the input",`
Merged into monorepo 6 years ago			`"aliases": ["language", "l"],`
			`"required": True,`
			`"options": ["es","en"],`
			`"default": "en"`
			`}`
			`}`

			`anew_path_es = "Dictionary/Redondo(2007).csv"`
			`anew_path_en = "Dictionary/ANEW2010All.txt"`
update to pass tests with community plugins 8 months ago			`onyx__usesEmotionModel = MODEL`
Merged into monorepo 6 years ago			`nltk_resources = ['stopwords']`
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago
			`def activate(self, args, *kwargs):`
			`self._stopwords = stopwords.words('english')`
Merged into monorepo 6 years ago			`dictionary={}`
			`dictionary['es'] = {}`
tweaks for py2/py3 compatibility 5 years ago			`with self.open(self.anew_path_es,'r') as tabfile:`
Merged into monorepo 6 years ago			`reader = csv.reader(tabfile, delimiter='\t')`
			`for row in reader:`
			`dictionary['es'][row[2]]={}`
			`dictionary['es'][row[2]]['V']=row[3]`
			`dictionary['es'][row[2]]['A']=row[5]`
			`dictionary['es'][row[2]]['D']=row[7]`
			`dictionary['en'] = {}`
tweaks for py2/py3 compatibility 5 years ago			`with self.open(self.anew_path_en,'r') as tabfile:`
Merged into monorepo 6 years ago			`reader = csv.reader(tabfile, delimiter='\t')`
			`for row in reader:`
			`dictionary['en'][row[0]]={}`
			`dictionary['en'][row[0]]['V']=row[2]`
			`dictionary['en'][row[0]]['A']=row[4]`
			`dictionary['en'][row[0]]['D']=row[6]`
			`self._dictionary = dictionary`
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago
			`def _my_preprocessor(self, text):`

			`regHttp = re.compile('(http://)[a-zA-Z0-9].[a-zA-Z0-9/](.[a-zA-Z0-9]*)?')`
			`regHttps = re.compile('(https://)[a-zA-Z0-9].[a-zA-Z0-9/](.[a-zA-Z0-9]*)?')`
			`regAt = re.compile('@([a-zA-Z0-9][_/&%#@$])[a-zA-Z0-9]*')`
			`text = re.sub(regHttp, '', text)`
			`text = re.sub(regAt, '', text)`
			`text = re.sub('RT : ', '', text)`
			`text = re.sub(regHttps, '', text)`
			`text = re.sub('[0-9]', '', text)`
			`text = self._delete_punctuation(text)`
			`return text`

			`def _delete_punctuation(self, text):`

			`exclude = set(string.punctuation)`
			`s = ''.join(ch for ch in text if ch not in exclude)`
			`return s`

			`def _extract_ngrams(self, text, lang):`
			`unigrams_lemmas = []`
			`unigrams_words = []`
			`pos_tagged = []`
			`if lang == 'es':`
Update to senpy 0.20 5 years ago			`sentences = list(parse_es(text, lemmata=True).split())`
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago			`else:`
Update to senpy 0.20 5 years ago			`sentences = list(parse_en(text, lemmata=True).split())`
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago
			`for sentence in sentences:`
			`for token in sentence:`
			`if token[0].lower() not in self._stopwords:`
			`unigrams_words.append(token[0].lower())`
Merged into monorepo 6 years ago			`unigrams_lemmas.append(token[4])`
			`pos_tagged.append(token[1])`
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago
			`return unigrams_lemmas,unigrams_words,pos_tagged`

			`def _find_ngrams(self, input_list, n):`
			`return zip(*[input_list[i:] for i in range(n)])`

			`def _extract_features(self, tweet,dictionary,lang):`
			`feature_set={}`
			`ngrams_lemmas,ngrams_words,pos_tagged = self._extract_ngrams(tweet,lang)`
			`pos_tags={'NN':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB',`
			`'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VGB':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'}`
			`totalVAD=[0,0,0]`
			`matches=0`
			`for word in range(len(ngrams_lemmas)):`
			`VAD=[]`
			`if ngrams_lemmas[word] in dictionary:`
			`matches+=1`
			`totalVAD = [totalVAD[0]+float(dictionary[ngrams_lemmas[word]]['V']),`
			`totalVAD[1]+float(dictionary[ngrams_lemmas[word]]['A']),`
			`totalVAD[2]+float(dictionary[ngrams_lemmas[word]]['D'])]`
			`elif ngrams_words[word] in dictionary:`
			`matches+=1`
			`totalVAD = [totalVAD[0]+float(dictionary[ngrams_words[word]]['V']),`
			`totalVAD[1]+float(dictionary[ngrams_words[word]]['A']),`
			`totalVAD[2]+float(dictionary[ngrams_words[word]]['D'])]`
			`if matches==0:`
			`emotion='neutral'`
			`else:`
			`totalVAD=[totalVAD[0]/matches,totalVAD[1]/matches,totalVAD[2]/matches]`
Update to senpy 0.20 5 years ago			`feature_set['V'] = totalVAD[0]`
			`feature_set['A'] = totalVAD[1]`
			`feature_set['D'] = totalVAD[2]`
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago			`return feature_set`

Update to senpy 0.20 5 years ago			`def analyse_entry(self, entry, activity):`
			`params = activity.params`
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago
Merged into monorepo 6 years ago			`text_input = entry.text`
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago
Merged into monorepo 6 years ago			`text = self._my_preprocessor(text_input)`
			`dictionary = self._dictionary[params['language']]`
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago
Merged into monorepo 6 years ago			`feature_set=self._extract_features(text, dictionary, params['language'])`
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago
			`emotions = EmotionSet()`
			`emotions.id = "Emotions0"`

			`emotion1 = Emotion(id="Emotion0")`
update to pass tests with community plugins 8 months ago			`emotion1[VALENCE] = feature_set['V']`
			`emotion1[AROUSAL] = feature_set['A']`
			`emotion1[DOMINANCE] = feature_set['D']`
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago
Update to senpy 0.20 5 years ago			`emotion1.prov(activity)`
			`emotions.prov(activity)`
Merged into monorepo 6 years ago
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago			`emotions.onyx__hasEmotion.append(emotion1)`
Merged into monorepo 6 years ago			`entry.emotions = [emotions, ]`
Squashed 'emotion-anew/' content from commit e8a3c83 git-subtree-dir: emotion-anew git-subtree-split: e8a3c837e3543a5f5f19086e1fcaa34b22be639e 6 years ago
			`yield entry`
Merged into monorepo 6 years ago
			`test_cases = [`
			`{`
Update to senpy 0.20 5 years ago			`'name': 'anger with VAD=(2.12, 6.95, 5.05)',`
Merged into monorepo 6 years ago			`'input': 'I hate you',`
			`'expected': {`
Update to senpy 0.20 5 years ago			`'onyx:hasEmotionSet': [{`
Merged into monorepo 6 years ago			`'onyx:hasEmotion': [{`
update to pass tests with community plugins 8 months ago			`AROUSAL: 6.95,`
			`DOMINANCE: 5.05,`
			`VALENCE: 2.12,`
Merged into monorepo 6 years ago			`}]`
			`}]`
			`}`
			`}, {`
			`'input': 'i am sad',`
			`'expected': {`
Update to senpy 0.20 5 years ago			`'onyx:hasEmotionSet': [{`
Merged into monorepo 6 years ago			`'onyx:hasEmotion': [{`
update to pass tests with community plugins 8 months ago			`f"{MODEL}_arousal": 4.13,`
Update to senpy 0.20 5 years ago
Merged into monorepo 6 years ago			`}]`
			`}]`
			`}`
			`}, {`
Update to senpy 0.20 5 years ago			`'name': 'joy',`
Merged into monorepo 6 years ago			`'input': 'i am happy with my marks',`
			`'expected': {`
Update to senpy 0.20 5 years ago			`'onyx:hasEmotionSet': [{`
Merged into monorepo 6 years ago			`'onyx:hasEmotion': [{`
update to pass tests with community plugins 8 months ago			`AROUSAL: 6.49,`
			`DOMINANCE: 6.63,`
			`VALENCE: 8.21,`
Merged into monorepo 6 years ago			`}]`
			`}]`
			`}`
			`}, {`
Update to senpy 0.20 5 years ago			`'name': 'negative-feat',`
Merged into monorepo 6 years ago			`'input': 'This movie is scary',`
			`'expected': {`
Update to senpy 0.20 5 years ago			`'onyx:hasEmotionSet': [{`
Merged into monorepo 6 years ago			`'onyx:hasEmotion': [{`
update to pass tests with community plugins 8 months ago			`AROUSAL: 5.8100000000000005,`
			`DOMINANCE: 4.33,`
			`VALENCE: 5.050000000000001,`
Update to senpy 0.20 5 years ago
Merged into monorepo 6 years ago			`}]`
			`}]`
			`}`
			`}, {`
Update to senpy 0.20 5 years ago			`'name': 'negative-fear',`
Merged into monorepo 6 years ago			`'input': 'this cake is disgusting' ,`
			`'expected': {`
Update to senpy 0.20 5 years ago			`'onyx:hasEmotionSet': [{`
Merged into monorepo 6 years ago			`'onyx:hasEmotion': [{`
update to pass tests with community plugins 8 months ago			`AROUSAL: 5.09,`
			`DOMINANCE: 4.4,`
			`VALENCE: 5.109999999999999,`
Update to senpy 0.20 5 years ago
Merged into monorepo 6 years ago			`}]`
			`}]`
			`}`
			`}`
			`]`