senpy/emotion-depechemood/depechemood_plugin.py

#!/usr/local/bin/python
# coding: utf-8

from future import standard_library
standard_library.install_aliases()

import os
import re
import sys
import string
import numpy as np
from six.moves import urllib
from nltk.corpus import stopwords

from senpy import EmotionBox, models


def ignore(dchars):
    '''
    Build a function that strips a fixed set of characters from a string.

    :param dchars: iterable of strings; every character of every element
        is deleted by the returned function.
    :return: a one-argument callable mapping a string to a copy of it with
        those characters removed.
    '''
    # A translation table that maps each unwanted character to None lets
    # str.translate remove all of them in a single pass.
    removal_table = str.maketrans("", "", "".join(dchars))

    def strip_chars(text):
        return text.translate(removal_table)

    return strip_chars


class DepecheMood(EmotionBox):
    '''
    Plugin that uses the DepecheMood emotion lexicon.

    DepecheMood is an emotion lexicon automatically generated from news articles where users expressed their associated emotions. It contains two languages (English and Italian), as well as three types of word representations (token, lemma and lemma#PoS). For English, the lexicon contains 165k tokens, while the Italian version contains 116k. Unsupervised techniques can be applied to generate simple but effective baselines. To learn more, please visit https://github.com/marcoguerini/DepecheMood and http://www.depechemood.eu/
    '''

    author = 'Oscar Araque'
    name = 'emotion-depechemood'
    version = '0.1'
    requirements = ['pandas']
    nltk_resources = ["stopwords"]

    onyx__usesEmotionModel = 'wna:WNAModel'

    # Emotion categories reported by the plugin.
    EMOTIONS = ['wna:negative-fear',
                'wna:amusement',
                'wna:anger',
                'wna:annoyance',
                'wna:indifference',
                'wna:joy',
                'wna:awe',
                'wna:sadness']

    # Lexicon column names, listed in the same order as EMOTIONS; scores are
    # produced in this order by `estimate_all_emotions`.
    DM_EMOTIONS = ['AFRAID', 'AMUSED', 'ANGRY', 'ANNOYED', 'DONT_CARE', 'HAPPY', 'INSPIRED', 'SAD',]

    def __init__(self, *args, **kwargs):
        '''Set up the lexicon URL and the empty state filled in by `activate`.'''
        super(DepecheMood, self).__init__(*args, **kwargs)
        self.LEXICON_URL = "https://github.com/marcoguerini/DepecheMood/raw/master/DepecheMood%2B%2B/DepecheMood_english_token_full.tsv"
        # Removes ASCII punctuation plus the guillemets from cleaned text.
        self._denoise = ignore(set(string.punctuation)|set('«»'))
        self._stop_words = set()
        self._lex_vocab = None
        self._lex = None

    def activate(self):
        '''Load the lexicon, its vocabulary and the English stop words.'''
        self._lex = self.download_lex()
        self._lex_vocab = set(self._lex)
        # Kept as a set for constant-time membership tests in `preprocess`.
        # The empty string is included so that the empty tokens produced by
        # splitting on single spaces are discarded together with stop words.
        self._stop_words = set(stopwords.words('english')) | {''}

    def clean_str(self, string):
        '''
        Normalise raw text before tokenisation.

        Replaces every character outside the allowed set with a space,
        replaces digit runs with the placeholder `num`, detaches common
        English contractions and a few punctuation marks from the adjacent
        word, collapses repeated whitespace, and lower-cases the result.

        :param string: text to clean.
        :return: the cleaned, stripped, lower-cased text.
        '''
        string = re.sub(r"[^A-Za-z0-9().,!?\'\`]", " ", string)
        string = re.sub(r"[0-9]+", " num ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r"\.", " . ", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " ( ", string)
        string = re.sub(r"\)", " ) ", string)
        string = re.sub(r"\?", " ? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip().lower()

    def preprocess(self, text):
        '''
        Turn raw text into a list of tokens without stop words.

        :param text: text to tokenise, or None.
        :return: list of tokens, or None when `text` is None.
        '''
        if text is None:
            return None
        tokens = self._denoise(self.clean_str(text)).split(' ')
        return [tok for tok in tokens if tok not in self._stop_words]

    def estimate_emotion(self, tokens, emotion):
        '''
        Average the lexicon score of one emotion over the given tokens.

        :param tokens: iterable of tokens, all of which must be lexicon keys
            (a missing token raises KeyError).
        :param emotion: lexicon column name, e.g. one of DM_EMOTIONS.
        :return: mean score over the tokens; 0 when there are no tokens.
        '''
        scores = [self._lex[tok][emotion] for tok in tokens]
        # Dividing by 1 for an empty list yields 0 instead of a division error.
        divisor = len(scores) or 1
        return np.sum(scores) / divisor

    def estimate_all_emotions(self, tokens):
        '''
        Score every emotion in DM_EMOTIONS for a tokenised text.

        Only tokens present in the lexicon contribute, and each distinct
        token is counted once because the tokens are reduced to a set.

        :param tokens: iterable of tokens, or None (treated as no tokens,
            matching what `preprocess` returns for missing text).
        :return: list of scores, in DM_EMOTIONS order.
        '''
        known_tokens = set(tokens or ()) & self._lex_vocab
        return [self.estimate_emotion(known_tokens, emotion)
                for emotion in self.DM_EMOTIONS]

    def download_lex(self, file_path='DepecheMood_english_token_full.tsv', freq_threshold=10):
        '''
        Load the lexicon, downloading it from LEXICON_URL if it is not found.

        :param file_path: name of the lexicon TSV file to look for.
        :param freq_threshold: rows whose `freq` value is below this number
            are discarded.
        :return: dict mapping each index value of the TSV (its first column)
            to a dict of the remaining columns, with `freq` removed.
        '''
        # pandas is imported lazily; it is declared in `requirements`.
        import pandas as pd

        try:
            file_path = self.find_file(file_path)
        except IOError:
            # Not available locally: fetch it to the location given by
            # `self.path` (presumably inside the plugin's data folder;
            # confirm against senpy).
            file_path = self.path(file_path)
            urllib.request.urlretrieve(self.LEXICON_URL, file_path)

        lexicon = pd.read_csv(file_path, sep='\t', index_col=0)
        lexicon = lexicon[lexicon['freq'] >= freq_threshold]
        # Reassign instead of dropping in place: the filtered frame may be a
        # view of the original, and in-place edits on it can trigger
        # pandas' SettingWithCopyWarning.
        lexicon = lexicon.drop('freq', axis=1)
        return lexicon.T.to_dict()

    def predict_one(self, features, **kwargs):
        '''
        Estimate the emotion scores for one input.

        :param features: sequence whose first element is the text to analyse.
        :return: list of scores, in DM_EMOTIONS order.
        '''
        tokens = self.preprocess(features[0])
        return self.estimate_all_emotions(tokens)

    test_cases = [
        {
            'entry': {
                'nif:isString': 'My cat is very happy',
            },
            'expected': {
                'onyx:hasEmotionSet': [
                    {
                        'onyx:hasEmotion': [
                            {
                                'onyx:hasEmotionCategory': 'wna:negative-fear',
                                'onyx:hasEmotionIntensity': 0.05278117640010922
                            },
                            {
                                'onyx:hasEmotionCategory': 'wna:amusement',
                                'onyx:hasEmotionIntensity': 0.2114806151413433,
                            },
                            {
                                'onyx:hasEmotionCategory': 'wna:anger',
                                'onyx:hasEmotionIntensity': 0.05726119426520887
                            },
                            {
                                'onyx:hasEmotionCategory': 'wna:annoyance',
                                'onyx:hasEmotionIntensity': 0.12295990731053638,
                            },
                            {
                                'onyx:hasEmotionCategory': 'wna:indifference',
                                'onyx:hasEmotionIntensity': 0.1860159893608025,
                            },
                            {
                                'onyx:hasEmotionCategory': 'wna:joy',
                                'onyx:hasEmotionIntensity': 0.12904050973724163,
                            },
                            {
                                'onyx:hasEmotionCategory': 'wna:awe',
                                'onyx:hasEmotionIntensity': 0.17973650399862967,
                            },
                            {
                                'onyx:hasEmotionCategory': 'wna:sadness',
                                'onyx:hasEmotionIntensity': 0.060724103786128455,
                            },
                        ]
                    }
                ]
            }
        }
    ]


if __name__ == '__main__':
    # When executed as a script, hand over to senpy's `easy_test` helper
    # (presumably runs the plugin's `test_cases`; see the senpy docs).
    from senpy.utils import easy_test
    easy_test(debug=False)
depechemood updated 2019-01-09 16:19:22 +00:00			`#!/usr/local/bin/python`
			`# coding: utf-8`

Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`from future import standard_library`
			`standard_library.install_aliases()`

depechemood updated 2019-01-09 16:19:22 +00:00			`import os`
			`import re`
tweaks for py2/py3 compatibility 2019-01-09 18:29:24 +00:00			`import sys`
depechemood updated 2019-01-09 16:19:22 +00:00			`import string`
			`import numpy as np`
			`from six.moves import urllib`
			`from nltk.corpus import stopwords`

Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`from senpy import EmotionBox, models`
depechemood updated 2019-01-09 16:19:22 +00:00

tweaks for py2/py3 compatibility 2019-01-09 18:29:24 +00:00			`def ignore(dchars):`
			`deletechars = "".join(dchars)`
Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`tbl = str.maketrans("", "", deletechars)`
			`ignore = lambda s: s.translate(tbl)`
tweaks for py2/py3 compatibility 2019-01-09 18:29:24 +00:00			`return ignore`


Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`class DepecheMood(EmotionBox):`
			`'''`
			`Plugin that uses the DepecheMood emotion lexicon.`

			DepecheMood is an emotion lexicon automatically generated from news articles where users expressed their associated emotions. It contains two languages (English and Italian), as well as three types of word representations (token, lemma and lemma#PoS). For English, the lexicon contains 165k tokens, while the Italian version contains 116k. Unsupervised techniques can be applied to generate simple but effective baselines. To learn more, please visit https://github.com/marcoguerini/DepecheMood and http://www.depechemood.eu/
			`'''`
depechemood updated 2019-01-09 16:19:22 +00:00
			`author = 'Oscar Araque'`
Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`name = 'emotion-depechemood'`
depechemood updated 2019-01-09 16:19:22 +00:00			`version = '0.1'`
Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`requirements = ['pandas']`
			`nltk_resources = ["stopwords"]`

			`onyx__usesEmotionModel = 'wna:WNAModel'`

			`EMOTIONS = ['wna:negative-fear',`
			`'wna:amusement',`
			`'wna:anger',`
			`'wna:annoyance',`
			`'wna:indifference',`
			`'wna:joy',`
			`'wna:awe',`
			`'wna:sadness']`

			`DM_EMOTIONS = ['AFRAID', 'AMUSED', 'ANGRY', 'ANNOYED', 'DONT_CARE', 'HAPPY', 'INSPIRED', 'SAD',]`
depechemood updated 2019-01-09 16:19:22 +00:00
			`def __init__(self, args, *kwargs):`
			`super(DepecheMood, self).__init__(args, *kwargs)`
			`self.LEXICON_URL = "https://github.com/marcoguerini/DepecheMood/raw/master/DepecheMood%2B%2B/DepecheMood_english_token_full.tsv"`
tweaks for py2/py3 compatibility 2019-01-09 18:29:24 +00:00			`self._denoise = ignore(set(string.punctuation)\|set('«»'))`
			`self._stop_words = []`
depechemood updated 2019-01-09 16:19:22 +00:00			`self._lex_vocab = None`
			`self._lex = None`

			`def activate(self):`
			`self._lex = self.download_lex()`
			`self._lex_vocab = set(list(self._lex.keys()))`
tweaks for py2/py3 compatibility 2019-01-09 18:29:24 +00:00			`self._stop_words = stopwords.words('english') + ['']`
depechemood updated 2019-01-09 16:19:22 +00:00
			`def clean_str(self, string):`
			string = re.sub(r"[^A-Za-z0-9().,!?\'\`]", " ", string)
			`string = re.sub(r"[0-9]+", " num ", string)`
			`string = re.sub(r"\'s", " \'s", string)`
			`string = re.sub(r"\'ve", " \'ve", string)`
			`string = re.sub(r"n\'t", " n\'t", string)`
			`string = re.sub(r"\'re", " \'re", string)`
			`string = re.sub(r"\'d", " \'d", string)`
			`string = re.sub(r"\'ll", " \'ll", string)`
			`string = re.sub(r"\.", " . ", string)`
			`string = re.sub(r",", " , ", string)`
			`string = re.sub(r"!", " ! ", string)`
			`string = re.sub(r"\(", " ( ", string)`
			`string = re.sub(r"\)", " ) ", string)`
			`string = re.sub(r"\?", " ? ", string)`
			`string = re.sub(r"\s{2,}", " ", string)`
			`return string.strip().lower()`

			`def preprocess(self, text):`
			`if text is None:`
			`return None`
tweaks for py2/py3 compatibility 2019-01-09 18:29:24 +00:00			`tokens = self._denoise(self.clean_str(text)).split(' ')`
depechemood updated 2019-01-09 16:19:22 +00:00			`tokens = [tok for tok in tokens if tok not in self._stop_words]`
			`return tokens`

			`def estimate_emotion(self, tokens, emotion):`
			`s = []`
			`for tok in tokens:`
			`s.append(self._lex[tok][emotion])`
			`dividend = np.sum(s) if np.sum(s) > 0 else 0`
			`divisor = len(s) if len(s) > 0 else 1`
			`S = np.sum(s) / divisor`
			`return S`

			`def estimate_all_emotions(self, tokens):`
Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`S = []`
depechemood updated 2019-01-09 16:19:22 +00:00			`intersection = set(tokens) & self._lex_vocab`
Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`for emotion in self.DM_EMOTIONS:`
depechemood updated 2019-01-09 16:19:22 +00:00			`s = self.estimate_emotion(intersection, emotion)`
Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`S.append(s)`
depechemood updated 2019-01-09 16:19:22 +00:00			`return S`

			`def download_lex(self, file_path='DepecheMood_english_token_full.tsv', freq_threshold=10):`

Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`import pandas as pd`

depechemood updated 2019-01-09 16:19:22 +00:00			`try:`
			`file_path = self.find_file(file_path)`
			`except IOError:`
Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`file_path = self.path(file_path)`
depechemood updated 2019-01-09 16:19:22 +00:00			`filename, _ = urllib.request.urlretrieve(self.LEXICON_URL, file_path)`

			`lexicon = pd.read_csv(file_path, sep='\t', index_col=0)`
			`lexicon = lexicon[lexicon['freq'] >= freq_threshold]`
			`lexicon.drop('freq', axis=1, inplace=True)`
			`lexicon = lexicon.T.to_dict()`
			`return lexicon`

Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`def predict_one(self, features, **kwargs):`
			`tokens = self.preprocess(features[0])`
depechemood updated 2019-01-09 16:19:22 +00:00			`estimation = self.estimate_all_emotions(tokens)`
			`return estimation`

			`test_cases = [`
			`{`
			`'entry': {`
			`'nif:isString': 'My cat is very happy',`
			`},`
			`'expected': {`
Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`'onyx:hasEmotionSet': [`
depechemood updated 2019-01-09 16:19:22 +00:00			`{`
			`'onyx:hasEmotion': [`
Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`{`
			`'onyx:hasEmotionCategory': 'wna:negative-fear',`
			`'onyx:hasEmotionIntensity': 0.05278117640010922`
			`},`
			`{`
			`'onyx:hasEmotionCategory': 'wna:amusement',`
			`'onyx:hasEmotionIntensity': 0.2114806151413433,`
			`},`
			`{`
			`'onyx:hasEmotionCategory': 'wna:anger',`
			`'onyx:hasEmotionIntensity': 0.05726119426520887`
			`},`
			`{`
			`'onyx:hasEmotionCategory': 'wna:annoyance',`
			`'onyx:hasEmotionIntensity': 0.12295990731053638,`
			`},`
			`{`
			`'onyx:hasEmotionCategory': 'wna:indifference',`
			`'onyx:hasEmotionIntensity': 0.1860159893608025,`
			`},`
			`{`
			`'onyx:hasEmotionCategory': 'wna:joy',`
			`'onyx:hasEmotionIntensity': 0.12904050973724163,`
			`},`
			`{`
			`'onyx:hasEmotionCategory': 'wna:awe',`
			`'onyx:hasEmotionIntensity': 0.17973650399862967,`
			`},`
			`{`
			`'onyx:hasEmotionCategory': 'wna:sadness',`
			`'onyx:hasEmotionIntensity': 0.060724103786128455,`
			`},`
depechemood updated 2019-01-09 16:19:22 +00:00			`]`
			`}`
			`]`
			`}`
			`}`
			`]`


			`if __name__ == '__main__':`
			`from senpy.utils import easy, easy_load, easy_test`
			`# sp, app = easy_load()`
			`# for plug in sp.analysis_plugins:`
			`# plug.test()`
Update to senpy 0.20 2019-04-04 10:56:46 +00:00			`easy_test(debug=False)`