Added WordNet-Affect plugin and Makefile

2026-06-14 02:01:59 +00:00 · 2016-09-21 21:48:57 +02:00
parent 0e9db7081c
commit 5e8bc717a8
11 changed files with 527 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -55,3 +55,11 @@ docs/_build/
 # PyBuilder
 target/
 .*
 *.pyc
 **/__pycache__
 */wordnet1.6
 */Corpus
 */a-hierarchy.xml
 */a-synsets.xml
 */wn16.txt
--- a/15
+++ b/15
@@ -0,0 +1,15 @@
 # Image with the plugins of this repository installed on top of senpy 0.6.1 (Python 2.7).
 from gsiupm/senpy:0.6.1-python2.7
 RUN mkdir -p /senpy-plugins
 # NLTK and the data packages downloaded for it.
 RUN pip install nltk
 RUN python -m nltk.downloader stopwords
 RUN python -m nltk.downloader punkt
 RUN python -m nltk.downloader maxent_treebank_pos_tagger
 RUN python -m nltk.downloader wordnet
 # Test tooling.
 RUN pip install pytest
 RUN pip install mock
 # Copy the build context into the plugin folder.
 ADD . /senpy-plugins
 # NOTE(review): presumably installs the requirements declared by each plugin without starting the server - confirm against the senpy CLI.
 RUN senpy -f /senpy-plugins --only-install
 WORKDIR /senpy-plugins/
--- a/25
+++ b/25
@@ -0,0 +1,25 @@
 # Python version that is part of the image tag.
 PYVERSION=2.7
 NAME=senpycommunity
 REPO=gsiupm
 VERSION=test
 # Every top-level directory is treated as a plugin (entries keep their trailing slash).
 PLUGINS= $(filter %/, $(wildcard */))
 all: build run
 # Build the image, tagged $(REPO)/$(NAME):$(VERSION)-python$(PYVERSION).
 build: clean Dockerfile
 	docker build -t '$(REPO)/$(NAME):$(VERSION)-python$(PYVERSION)' -f Dockerfile .;
 # Run py.test for one plugin directory, mounted over /senpy-plugins/ in a throw-away container.
 # NOTE(review): the command targets test.py - confirm that every plugin ships a test file with that name.
 test-%:
 	docker run -v $$PWD/$*:/senpy-plugins/ --rm --entrypoint=/usr/local/bin/py.test -ti '$(REPO)/$(NAME):$(VERSION)-python$(PYVERSION)' test.py
 test: $(addprefix test-,$(PLUGINS))
 # Remove this project's containers/images for which the text before the first '-' in
 # column 2 of the docker listing differs from $(VERSION); errors are ignored.
 clean:
 	@docker ps -a | awk '/$(REPO)\/$(NAME)/{ split($$2, vers, "-"); if(vers[1] != "${VERSION}"){ print $$1;}}' | xargs docker rm 2>/dev/null|| true
 	@docker images | awk '/$(REPO)\/$(NAME)/{ split($$2, vers, "-"); if(vers[1] != "${VERSION}"){ print $$1":"$$2;}}' | xargs docker rmi 2>/dev/null|| true
 # Start the image produced by `build`. The tag has to use PYVERSION (the variable
 # used when building); the previously referenced PYMAIN is not defined in this file.
 run: build
 	docker run --rm -p 5000:5000 -ti '$(REPO)/$(NAME):$(VERSION)-python$(PYVERSION)'
 .PHONY: test test-% build-% build test test_pip run clean
--- a/emoTextWNA/README.rst
+++ b/emoTextWNA/README.rst
@@ -0,0 +1,10 @@
 This plugin uses WordNet-Affect (WNAffect) labels for emotion analysis.
 The emotextWAF.senpy file can be copied and modified to use a different version of WordNet-Affect with the same Python code.
 Known issues
 ============
  * This plugin uses the pattern library, which means it will only run on Python 2.7.
  * The WordNet-Affect and corpora files are not included in the repository, but they can easily be added to the docker image, either through a volume or by building a new docker image.
--- a/emoTextWNA/emotextWAF.py
+++ b/emoTextWNA/emotextWAF.py
@@ -0,0 +1,185 @@
 # -*- coding: utf-8 -*-
 from __future__ import division
 import re
 import nltk
 import logging
 import os
 import string
 import xml.etree.ElementTree as ET
 from nltk.corpus import stopwords
 from nltk.corpus import WordNetCorpusReader
 from emotion import Emotion as Emo
 from pattern.en import parse
 from senpy.plugins import EmotionPlugin, SenpyPlugin
 from senpy.models import Results, EmotionSet, Entry, Emotion
 logger = logging.getLogger(__name__)
 class EmotionTextPlugin(EmotionPlugin):
    def __init__(self, info, *args, **kwargs):
        """Set up the plugin from its descriptor.

        info -- plugin descriptor; the keys 'module', 'hierarchy_path',
                'synsets_path' and 'wn16_path' are read here. The three
                paths are appended to the directory of this file.
        """
        super(EmotionTextPlugin, self).__init__(info, *args, **kwargs)
        self.id = info['module']
        self.info = info
        self._stopwords = stopwords.words('english')
        local_path=os.path.dirname(os.path.abspath(__file__))
        # Output emotion -> WordNet-Affect category names counted towards it
        # (compared against the level-5 ancestor name in _extract_features).
        # NOTE(review): 'ingrattitude' and 'affective' look like misspellings
        # of WordNet-Affect categories - confirm against a-hierarchy.xml.
        self._categories = {'anger': ['general-dislike',],
                            'fear': ['negative-fear',],
                            'disgust': ['shame',],
                            'joy': ['gratitude','affective','enthusiasm','love','joy','liking'],
                            'sadness': ['ingrattitude','daze','humility','compassion','despair','anxiety','sadness']}
        # Output emotion -> category label written into the response by analyse().
        self._wnaffect_mappings = {'anger': 'anger',
                                   'fear': 'negative-fear',
                                   'disgust': 'disgust',
                                   'joy': 'joy',
                                   'sadness': 'sadness'}
        # The hierarchy has to be loaded first: _load_synsets looks the
        # categories up in Emo.emotions, which _load_emotions fills.
        self._load_emotions(local_path+self.info['hierarchy_path'])     
        self._total_synsets = self._load_synsets(local_path+self.info['synsets_path'])
        self._wn16_path = local_path+self.info['wn16_path']
        # This placeholder is overwritten by the next statement.
        self._wn16= None
        self._wn16 = WordNetCorpusReader(os.path.abspath("{0}".format(self._wn16_path)), nltk.data.find(self._wn16_path))
    def _load_synsets(self, synsets_path):
        """Parse the synsets XML into a nested lookup table.

        Returns a dict: coarse POS tag ('NN', 'JJ', 'VB', 'RB') -> synset
        offset (int) -> entry of Emo.emotions, or None when the category
        of the synset is not a known emotion.
        """
        root = ET.parse(synsets_path).getroot()
        # Nouns come first: the other parts of speech may point at a noun entry.
        tag_by_pos = (("noun", "NN"), ("adj", "JJ"), ("verb", "VB"), ("adv", "RB"))
        synsets = {}
        for pos, tag in tag_by_pos:
            by_offset = synsets[tag] = {}
            for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos)):
                # The id attribute carries a two character prefix before the offset.
                offset = int(elem.get("id")[2:])
                if not offset:
                    continue
                categ = elem.get("categ")
                noun_id = elem.get("noun-id")
                if categ:
                    by_offset[offset] = Emo.emotions.get(categ)
                elif noun_id:
                    # Reuse the emotion already stored for the referenced noun synset.
                    by_offset[offset] = synsets["NN"][int(noun_id[2:])]
        return synsets
    def _load_emotions(self, hierarchy_path):
        """Fill Emo.emotions from the 'categ' elements of the hierarchy XML.

        The element named "root" is registered without a parent; every
        other element is attached to the emotion named by its 'isa' attribute.
        """
        root = ET.parse(hierarchy_path).getroot()
        for categ in root.findall("categ"):
            name = categ.get("name")
            parent_name = None if name == "root" else categ.get("isa")
            Emo.emotions[name] = Emo(name, parent_name)
    def activate(self, *args, **kwargs):
        """Log that the plugin is ready; the arguments are ignored."""
        logger.info("EmoText plugin is ready to go!")
    def deactivate(self, *args, **kwargs):
        """Log that the plugin is being deactivated; the arguments are ignored."""
        logger.info("EmoText plugin is being deactivated...")
    def _my_preprocessor(self, text):
        regHttp = re.compile('(http://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
        regHttps = re.compile('(https://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
        regAt = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')
        text = re.sub(regHttp, '', text)
        text = re.sub(regAt, '', text)
        text = re.sub('RT : ', '', text)
        text = re.sub(regHttps, '', text)
        text = re.sub('[0-9]', '', text)
        text = self._delete_punctuation(text)
        return text
    def _delete_punctuation(self, text):
        exclude = set(string.punctuation)
        s = ''.join(ch for ch in text if ch not in exclude)
        return s
    def _extract_ngrams(self, text):
        """Tokenise text and drop the stopwords.

        Returns three parallel lists: lower-cased words, the value at
        index 4 of each token (the lemma requested with lemmata=True) and
        the value at index 1 (the POS tag).
        """
        words, lemmas, tags = [], [], []
        for sentence in parse(text,lemmata=True).split():
            for token in sentence:
                word = token[0].lower()
                if word in self._stopwords:
                    continue
                words.append(word)
                lemmas.append(token[4])
                tags.append(token[1])
        return words,lemmas,tags
    def _find_ngrams(self, input_list, n):
        return zip(*[input_list[i:] for i in range(n)])
    def _clean_pos(self, pos_tagged):
        pos_tags={'NN':'NN', 'NNP':'NN','NNP-LOC':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB',
        'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VGB':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'}
        for i in range(len(pos_tagged)):
            if pos_tagged[i] in pos_tags:
                pos_tagged[i]=pos_tags[pos_tagged[i]]
        return pos_tagged
    def _extract_features(self, text):
        feature_set={k:0 for k in self._categories}
        ngrams_words,ngrams_lemmas,pos_tagged = self._extract_ngrams(text)
        matches=0
        pos_tagged=self._clean_pos(pos_tagged)
        tag_wn={'NN':self._wn16.NOUN,'JJ':self._wn16.ADJ,'VB':self._wn16.VERB,'RB':self._wn16.ADV}
        for i in range(len(pos_tagged)):
            if pos_tagged[i] in tag_wn:
                synsets = self._wn16.synsets(ngrams_words[i], tag_wn[pos_tagged[i]])   
                if synsets:
                    offset = synsets[0].offset()
                    if offset in self._total_synsets[pos_tagged[i]]:
                        if self._total_synsets[pos_tagged[i]][offset] is None:
                            continue
                        else:
                            emotion = self._total_synsets[pos_tagged[i]][offset].get_level(5).name
                            matches+=1
                            for i in self._categories:
                                if emotion in self._categories[i]:
                                    feature_set[i]+=1
        if matches == 0:
            matches=1                
        for i in feature_set:
            feature_set[i] = (feature_set[i]/matches)*100
        return feature_set
    def analyse(self, **params):
        """Build the Results for the text passed as the 'input' parameter.

        The response holds one Entry carrying the original text and one
        EmotionSet with an Emotion per key of the feature dictionary.
        """
        logger.debug("Analysing with params {}".format(params))
        text_input = params.get("input", None)
        # Scores are computed on the cleaned text; the entry keeps the raw one.
        scores = self._extract_features(self._my_preprocessor(text_input))
        response = Results()
        entry = Entry(id="Entry",
                      text=text_input)
        emotion_set = EmotionSet(id="Emotions0")
        detected = emotion_set.onyx__hasEmotion
        for category, intensity in scores.items():
            label = self._wnaffect_mappings[category]
            detected.append(Emotion(onyx__hasEmotionCategory=label,
                                    onyx__hasEmotionIntensity=intensity))
        entry.emotions = [emotion_set]
        response.entries.append(entry)
        return response
--- a/emoTextWNA/emotextWAF.senpy
+++ b/emoTextWNA/emotextWAF.senpy
@@ -0,0 +1,29 @@
 {
    "name": "EmoTextWAF",
    "module": "emotextWAF",
    "description": "Emotion classifier using rule-based classification.",
    "author": "@icorcuera @balkian",
    "version": "0.2",
    "extra_params": {
        "language": {
            "aliases": ["language", "l"],
            "required": true,
            "options": ["en"],
            "default": "en"
        }
    },
    "synsets_path": "/a-synsets.xml",
    "hierarchy_path": "/a-hierarchy.xml",
    "wn16_path": "/wordnet1.6/dict",
    "requirements": [
        "nltk>=3.0.5",
        "numpy>=1.8.2",
        "scipy>=0.14.0",
        "scikit-learn>=0.14.1",
        "lxml>=3.4.2",
        "pandas",
        "senpy",
        "pattern"
    ]
 }
--- a/emoTextWNA/emotion.py
+++ b/emoTextWNA/emotion.py
@@ -0,0 +1,97 @@
 # coding: utf-8
 """
 Clement Michard (c) 2015
 """
 class Emotion:
    """Node of the emotion hierarchy.

    Each instance stores its name, its parent, its depth (``level``, 0 for
    a node without parent) and its direct children. The class attribute
    ``emotions`` is a registry filled by the callers.
    """
    emotions = {} # name to emotion (str -> Emotion)
    def __init__(self, name, parent_name=None):
        """Initializes an Emotion object.
            name -- name of the emotion (str)
            parent_name -- name of the parent emotion (str); it has to be
                           registered in Emotion.emotions already, a
                           KeyError is raised otherwise
        """
        self.name = name
        self.parent = None
        self.level = 0
        self.children = []
        if parent_name:
            self.parent = Emotion.emotions[parent_name]
            self.parent.children.append(self)
            self.level = self.parent.level + 1
    def get_level(self, level):
        """Returns the ancestor of self at the given level.
            level -- level in the hierarchy (int)
           self is returned when it is not deeper than level; the topmost
           ancestor is returned when level is negative.
        """
        em = self
        # Stop at a node without parent instead of dereferencing None.
        while em.level > level and em.parent is not None:
            em = em.parent
        return em
    def __str__(self):
        """Returns the emotion string formatted."""
        return self.name
    def nb_children(self):
        """Returns the size of the subtree rooted at this emotion, itself included."""
        return sum(child.nb_children() for child in self.children) + 1
    @staticmethod
    def printTree(emotion=None, indent="", last='updown'):
        """Prints the hierarchy of emotions.
            emotion -- root emotion (Emotion); Emotion.emotions["root"] when omitted
            indent -- prefix printed before the node (str)
            last -- position marker of the node ('up', 'down', 'updown' or '')
        """
        if emotion is None:
            emotion = Emotion.emotions["root"]
        size_branch = {child: child.nb_children() for child in emotion.children}
        leaves = sorted(emotion.children, key=lambda child: size_branch[child])
        up, down = [], []
        if leaves:
            # Move the biggest branches below the node until both sides balance.
            while sum(size_branch[e] for e in down) < sum(size_branch[e] for e in leaves):
                down.append(leaves.pop())
            up = leaves
        padding = " " * len(emotion.name)
        # Positions come from enumerate and are compared with ==; the former
        # 'is' comparison of integers relied on interpreter object caching.
        for position, leaf in enumerate(up):
            next_last = 'up' if position == 0 else ''
            next_indent = '{0}{1}{2}'.format(indent, ' ' if 'up' in last else '│', padding)
            Emotion.printTree(leaf, indent=next_indent, last=next_last)
        if last == 'up':
            start_shape = '┌'
        elif last == 'down':
            start_shape = '└'
        elif last == 'updown':
            start_shape = ' '
        else:
            start_shape = '├'
        if up:
            end_shape = '┤'
        elif down:
            end_shape = '┐'
        else:
            end_shape = ''
        # A single parenthesised argument prints the same on Python 2 and 3.
        print('{0}{1}{2}{3}'.format(indent, start_shape, emotion.name, end_shape))
        for position, leaf in enumerate(down):
            next_last = 'down' if position == len(down) - 1 else ''
            next_indent = '{0}{1}{2}'.format(indent, ' ' if 'down' in last else '│', padding)
            Emotion.printTree(leaf, indent=next_indent, last=next_last)
--- a/emoTextWNA/test_wna.py
+++ b/emoTextWNA/test_wna.py
@@ -0,0 +1,42 @@
 import os
 import logging
 logging.basicConfig()
 try:
    import unittest.mock as mock
 except ImportError:
    import mock
 from senpy.extensions import Senpy
 from flask import Flask
 import unittest
 class emoTextWAFTest(unittest.TestCase):
    """Checks the EmoTextWAF plugin through a Senpy instance bound to a Flask app."""
    def setUp(self):
        """Load the plugins found in the directory of this test module."""
        self.app = Flask("test_plugin")
        self.dir = os.path.dirname(__file__)
        self.senpy = Senpy(plugin_folder=self.dir, default_plugins=False)
        self.senpy.init_app(self.app)
    def tearDown(self):
        """Deactivate the plugin synchronously after each test."""
        self.senpy.deactivate_plugin("EmoTextWAF", sync=True)
    def test_analyse(self):
        """The emotion with the highest intensity has to be the expected one."""
        plugin = self.senpy.plugins["EmoTextWAF"]
        plugin.activate()
        expectations = {'I hate you': 'anger',
                        'i am sad': 'sadness',
                        'i am happy with my marks': 'joy',
                        'This movie is scary': 'negative-fear'}
        for text, expected in expectations.items():
            response = plugin.analyse(input=text)
            emotion_set = response.entries[0].emotions[0]
            strongest = max(emotion_set['onyx:hasEmotion'],
                            key=lambda emo: emo['onyx:hasEmotionIntensity'])
            assert strongest['onyx:hasEmotionCategory'] == expected
        plugin.deactivate()
 # Run the tests of this module when it is executed as a script.
 if __name__ == '__main__':
    unittest.main()
--- a/emoTextWNA/wnaffect.py
+++ b/emoTextWNA/wnaffect.py
@@ -0,0 +1,92 @@
 # coding: utf-8
 # In[1]:
 # -*- coding: utf-8 -*-
 """
 Clement Michard (c) 2015
 """
 import os
 import sys
 import nltk
 from emotion import Emotion
 from nltk.corpus import WordNetCorpusReader
 import xml.etree.ElementTree as ET
 class WNAffect:
    """WordNet-Affect resource: resolves a (word, POS tag) pair to an emotion."""
    # Penn Treebank tag -> coarse tag used as key of self.synsets and self.wn_pos.
    # 'VBG' (gerund / present participle) is the Treebank tag; the misspelt
    # 'VGB' that used to stand in its place is kept so that no previously
    # accepted tag stops working.
    FLAT_POS = {'NN':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB', 'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VBG':'VB', 'VGB':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'}
    def __init__(self, wordnet16_dir, wn_domains_dir):
        """Initializes the WordNet-Affect object.
            wordnet16_dir -- WordNet directory, relative to the current
                             working directory; its 'dict' folder is read
            wn_domains_dir -- directory holding a-hierarchy.xml and a-synsets.xml
        """
        cwd = os.getcwd()
        nltk.data.path.append(cwd)
        wn16_path = "{0}/dict".format(wordnet16_dir)
        self.wn16 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn16_path)), nltk.data.find(wn16_path))
        self.flat_pos = dict(self.FLAT_POS)
        self.wn_pos = {'NN':self.wn16.NOUN, 'JJ':self.wn16.ADJ, 'VB':self.wn16.VERB, 'RB':self.wn16.ADV}
        # The hierarchy is loaded first: _load_synsets reads Emotion.emotions.
        self._load_emotions(wn_domains_dir)
        self.synsets = self._load_synsets(wn_domains_dir)
    def _load_synsets(self, wn_domains_dir):
        """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> Emotion or None).
           None is stored when the category of a synset is not in Emotion.emotions.
        """
        tree = ET.parse("{0}/a-synsets.xml".format(wn_domains_dir))
        root = tree.getroot()
        pos_map = { "noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB" }
        synsets = {}
        # Nouns are processed first: the other entries may refer to a noun synset.
        for pos in ["noun", "adj", "verb", "adv"]:
            tag = pos_map[pos]
            synsets[tag] = {}
            for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos)):
                # The id attribute carries a two character prefix before the offset.
                offset = int(elem.get("id")[2:])
                if not offset:
                    continue
                categ = elem.get("categ")
                noun_id = elem.get("noun-id")
                if categ:
                    synsets[tag][offset] = Emotion.emotions.get(categ)
                elif noun_id:
                    synsets[tag][offset] = synsets[pos_map["noun"]][int(noun_id[2:])]
        return synsets
    def _load_emotions(self, wn_domains_dir):
        """Loads the hierarchy of emotions from the WordNet-Affect xml."""
        tree = ET.parse("{0}/a-hierarchy.xml".format(wn_domains_dir))
        root = tree.getroot()
        for elem in root.findall("categ"):
            name = elem.get("name")
            if name == "root":
                Emotion.emotions["root"] = Emotion("root")
            else:
                Emotion.emotions[name] = Emotion(name, elem.get("isa"))
    def get_emotion(self, word, pos):
        """Returns the emotion of the word, or None when there is none.
            word -- the word (str)
            pos -- part-of-speech (str)
           Only the first synset found for the word is considered.
        """
        if pos not in self.flat_pos:
            return None
        pos = self.flat_pos[pos]
        synsets = self.wn16.synsets(word, self.wn_pos[pos])
        if not synsets:
            return None
        return self.synsets[pos].get(synsets[0].offset())
 # Command line use: <wordnet16_dir> <wn_domains_dir> <word> <pos>
 if __name__ == "__main__":
    wordnet16, wndomains32, word, pos = sys.argv[1:5]
    wna = WNAffect(wordnet16, wndomains32)
    # A single parenthesised argument prints the same on Python 2 and 3.
    print(wna.get_emotion(word, pos))
--- a/example-plugin/example.senpy
+++ b/example-plugin/example.senpy
@@ -12,5 +12,6 @@
            "default": 42
        }
     },
    "requirements": ["noop"],
    "custom_attribute": "42"
 }
--- a/example-plugin/test_example.py
+++ b/example-plugin/test_example.py
@@ -0,0 +1,23 @@
 import unittest
 from flask import Flask
 import os
 from senpy.extensions import Senpy
 class emoTextWAFTest(unittest.TestCase):
    """Checks that a Senpy instance loads exactly one plugin from this directory."""
    # NOTE(review): the class name matches the WordNet-Affect plugin tests although
    # this case deactivates "ExamplePlugin" - confirm whether a rename is wanted.
    def setUp(self):
        """Create a Flask app and a Senpy instance reading plugins from this directory."""
        self.app = Flask("Example")
        self.dir = os.path.join(os.path.dirname(__file__))
        self.senpy = Senpy(plugin_folder=self.dir, default_plugins=False)
        self.senpy.init_app(self.app)
    def tearDown(self):
        """Deactivate the example plugin synchronously after each test."""
        self.senpy.deactivate_plugin("ExamplePlugin", sync=True)
    def test_analyse(self):
        """Only one plugin is expected to be registered."""
        assert len(self.senpy.plugins.keys()) == 1
        assert True
 # Run the tests of this module when it is executed as a script.
 if __name__ == '__main__':
    unittest.main()