From 5e8bc717a844773862e9f656fc30f2c75d19eb76 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=2E=20Fernando=20S=C3=A1nchez?= <balkian@gmail.com>
Date: Wed, 21 Sep 2016 21:48:57 +0200
Subject: [PATCH] Added WordNet-Affect plugin and Makefile

---
 .gitignore                     |   8 ++
 Dockerfile                     |  15 +++
 Makefile                       |  25 +++++
 emoTextWNA/README.rst          |  10 ++
 emoTextWNA/emotextWAF.py       | 185 +++++++++++++++++++++++++++++++++
 emoTextWNA/emotextWAF.senpy    |  29 ++++++
 emoTextWNA/emotion.py          |  97 +++++++++++++++++
 emoTextWNA/test_wna.py         |  42 ++++++++
 emoTextWNA/wnaffect.py         |  92 ++++++++++++++++
 example-plugin/example.senpy   |   1 +
 example-plugin/test_example.py |  23 ++++
 11 files changed, 527 insertions(+)
 create mode 100644 Dockerfile
 create mode 100644 Makefile
 create mode 100644 emoTextWNA/README.rst
 create mode 100644 emoTextWNA/emotextWAF.py
 create mode 100644 emoTextWNA/emotextWAF.senpy
 create mode 100644 emoTextWNA/emotion.py
 create mode 100644 emoTextWNA/test_wna.py
 create mode 100644 emoTextWNA/wnaffect.py
 create mode 100644 example-plugin/test_example.py

diff --git a/.gitignore b/.gitignore
index ba74660..108d10f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -55,3 +55,11 @@ docs/_build/
 
 # PyBuilder
 target/
+.*
+*.pyc
+**/__pycache__
+*/wordnet1.6
+*/Corpus
+*/a-hierarchy.xml
+*/a-synsets.xml
+*/wn16.txt
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..8232327
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,15 @@
+from gsiupm/senpy:0.6.1-python2.7
+
+RUN mkdir -p /senpy-plugins
+RUN pip install nltk
+RUN python -m nltk.downloader stopwords
+RUN python -m nltk.downloader punkt
+RUN python -m nltk.downloader maxent_treebank_pos_tagger
+RUN python -m nltk.downloader wordnet
+
+RUN pip install pytest
+RUN pip install mock
+ADD . /senpy-plugins
+RUN senpy -f /senpy-plugins --only-install
+
+WORKDIR /senpy-plugins/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..9c33135
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,25 @@
+PYVERSION=2.7
+NAME=senpycommunity
+REPO=gsiupm
+VERSION=test
+PLUGINS= $(filter %/, $(wildcard */))
+
+
+all: build run
+
+build: clean Dockerfile
+	docker build -t '$(REPO)/$(NAME):$(VERSION)-python$(PYVERSION)' -f Dockerfile .;
+
+test-%:
+	docker run -v $$PWD/$*:/senpy-plugins/ --rm --entrypoint=/usr/local/bin/py.test -ti '$(REPO)/$(NAME):$(VERSION)-python$(PYVERSION)' test.py
+
+test: $(addprefix test-,$(PLUGINS))
+
+clean:
+	@docker ps -a | awk '/$(REPO)\/$(NAME)/{ split($$2, vers, "-"); if(vers[1] != "${VERSION}"){ print $$1;}}' | xargs docker rm 2>/dev/null|| true
+	@docker images | awk '/$(REPO)\/$(NAME)/{ split($$2, vers, "-"); if(vers[1] != "${VERSION}"){ print $$1":"$$2;}}' | xargs docker rmi 2>/dev/null|| true
+
+run: build
+	docker run --rm -p 5000:5000 -ti '$(REPO)/$(NAME):$(VERSION)-python$(PYVERSION)'
+
+.PHONY: test test-% build-% build test test_pip run clean
diff --git a/emoTextWNA/README.rst b/emoTextWNA/README.rst
new file mode 100644
index 0000000..1ab8599
--- /dev/null
+++ b/emoTextWNA/README.rst
@@ -0,0 +1,10 @@
+This plugin uses WNAffect labels for emotion analysis.
+
+The emotextWAF.senpy file can be copied and modified to use different versions of wnaffect with the same python code.
+
+
+Known issues
+============
+
+  * This plugin uses the pattern library, which means it will only run on Python 2.7.
+  * The WordNet-Affect (WNAffect) and corpora files are not included in the repository, but they can easily be added to the Docker image (using a volume) or baked into a new Docker image.
diff --git a/emoTextWNA/emotextWAF.py b/emoTextWNA/emotextWAF.py
new file mode 100644
index 0000000..a1e23f0
--- /dev/null
+++ b/emoTextWNA/emotextWAF.py
@@ -0,0 +1,185 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import division
+import re
+import nltk
+import logging
+import os
+import string
+import xml.etree.ElementTree as ET
+from nltk.corpus import stopwords
+from nltk.corpus import WordNetCorpusReader
+
+from emotion import Emotion as Emo
+from pattern.en import parse
+from senpy.plugins import EmotionPlugin, SenpyPlugin
+from senpy.models import Results, EmotionSet, Entry, Emotion
+
+logger = logging.getLogger(__name__)
+
+class EmotionTextPlugin(EmotionPlugin):
+    
+    def __init__(self, info, *args, **kwargs):
+        """Load the stopwords, the WordNet-Affect hierarchy and synsets, and the WordNet reader."""
+        super(EmotionTextPlugin, self).__init__(info, *args, **kwargs)
+        self.id = info['module']
+        self.info = info
+        self._stopwords = stopwords.words('english')
+        local_path = os.path.dirname(os.path.abspath(__file__))
+        # Output emotion -> WordNet-Affect category names, matched against get_level(5).name.
+        self._categories = {'anger': ['general-dislike'],
+                            'fear': ['negative-fear'],
+                            'disgust': ['shame'],
+                            'joy': ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'],
+                            'sadness': ['ingratitude', 'daze', 'humility', 'compassion', 'despair', 'anxiety', 'sadness']}
+        # NOTE(review): 'affective' above is probably meant to be 'affection' - confirm against a-hierarchy.xml.
+        self._wnaffect_mappings = {'anger': 'anger',
+                                   'fear': 'negative-fear',
+                                   'disgust': 'disgust',
+                                   'joy': 'joy',
+                                   'sadness': 'sadness'}
+        self._load_emotions(local_path + self.info['hierarchy_path'])
+        self._total_synsets = self._load_synsets(local_path + self.info['synsets_path'])
+        self._wn16_path = local_path + self.info['wn16_path']
+        self._wn16 = WordNetCorpusReader(os.path.abspath("{0}".format(self._wn16_path)), nltk.data.find(self._wn16_path))
+        
+
+    def _load_synsets(self, synsets_path):
+        """Map POS tag -> synset offset -> Emo object (None for unknown categories)."""
+        root = ET.parse(synsets_path).getroot()
+        # Nouns go first: other parts of speech may point at a noun synset via "noun-id".
+        tagged_pos = [("noun", "NN"), ("adj", "JJ"), ("verb", "VB"), ("adv", "RB")]
+        synsets = {}
+        for pos, tag in tagged_pos:
+            by_offset = synsets[tag] = {}
+            for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos)):
+                offset = int(elem.get("id")[2:])  # drop the 2-character prefix of the id
+                categ, noun_id = elem.get("categ"), elem.get("noun-id")
+                if offset == 0:
+                    continue
+                if categ:
+                    by_offset[offset] = Emo.emotions.get(categ)
+                elif noun_id:
+                    by_offset[offset] = synsets["NN"][int(noun_id[2:])]
+        return synsets
+
+    def _load_emotions(self, hierarchy_path):
+        """Fill Emo.emotions from the <categ> elements of the WordNet-Affect hierarchy xml."""
+        # Each non-root category names its parent in "isa"; Emo() looks that parent
+        # up in Emo.emotions, so parents have to appear before their children.
+        categories = ET.parse(hierarchy_path).getroot().findall("categ")
+        for categ in categories:
+            name = categ.get("name")
+            if name != "root":
+                Emo.emotions[name] = Emo(name, categ.get("isa"))
+            else:
+                Emo.emotions["root"] = Emo("root")
+
+    def activate(self, *args, **kwargs):
+        logger.info("EmoText plugin is ready to go!")
+
+    def deactivate(self, *args, **kwargs):
+        """Log that the plugin is being deactivated; no other cleanup is done here."""
+        logger.info("EmoText plugin is being deactivated...")
+
+    def _my_preprocessor(self, text):
+        """Strip URLs, @mentions, the 'RT : ' marker, digits and punctuation from text."""
+        # One pattern covers both schemes and stops at whitespace; the old patterns
+        # used an unescaped "." that also swallowed the space and word after a URL.
+        reg_url = re.compile(r'https?://\S+')
+        reg_at = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')
+        text = re.sub(reg_url, '', text)
+        text = re.sub(reg_at, '', text)
+        text = re.sub('RT : ', '', text)
+        text = re.sub('[0-9]', '', text)
+        text = self._delete_punctuation(text)
+        return text
+
+    def _delete_punctuation(self, text):
+        """Return text without any of the characters in string.punctuation."""
+        punctuation = frozenset(string.punctuation)
+        kept = [char for char in text if char not in punctuation]
+        return ''.join(kept)
+
+    def _extract_ngrams(self, text):
+        """Return (words, lemmas, POS tags) for the tokens of text that are not stopwords."""
+        words, lemmas, tags = [], [], []
+        # Token fields as read here: [0] word, [1] POS tag, [4] lemma.
+        for sentence in parse(text, lemmata=True).split():
+            for token in sentence:
+                word = token[0].lower()
+                if word in self._stopwords:
+                    continue
+                words.append(word)
+                lemmas.append(token[4])
+                tags.append(token[1])
+
+        return words, lemmas, tags
+
+    def _find_ngrams(self, input_list, n):
+        return zip(*(input_list[start:] for start in range(n)))  # tuples of n consecutive items
+
+    def _clean_pos(self, pos_tagged):
+        """Collapse Penn Treebank tags to NN/JJ/RB/VB in place (other tags are kept) and return the list."""
+        pos_tags = {'NN': 'NN', 'NNP': 'NN', 'NNP-LOC': 'NN', 'NNS': 'NN',
+                    'JJ': 'JJ', 'JJR': 'JJ', 'JJS': 'JJ', 'RB': 'RB', 'RBR': 'RB', 'RBS': 'RB',
+                    # 'VBG' (gerund) was misspelled 'VGB', so gerunds were never mapped to 'VB'.
+                    'VB': 'VB', 'VBD': 'VB', 'VBG': 'VB', 'VBN': 'VB', 'VBP': 'VB', 'VBZ': 'VB'}
+        for i, tag in enumerate(pos_tagged):
+            pos_tagged[i] = pos_tags.get(tag, tag)
+        return pos_tagged
+    
+    def _extract_features(self, text):
+        """Return, per output emotion, its share (0-100) of the WordNet-Affect matches found in text."""
+        words, _lemmas, tags = self._extract_ngrams(text)
+        tags = self._clean_pos(tags)
+        wn_pos = {'NN': self._wn16.NOUN, 'JJ': self._wn16.ADJ,
+                  'VB': self._wn16.VERB, 'RB': self._wn16.ADV}
+        counts = {category: 0 for category in self._categories}
+        matches = 0
+        for word, tag in zip(words, tags):
+            if tag not in wn_pos:
+                continue
+            # Only the first synset returned for the word is considered.
+            synsets = self._wn16.synsets(word, wn_pos[tag])
+            if not synsets:
+                continue
+            # Unknown offsets and categories missing from the hierarchy (None) are skipped.
+            affect = self._total_synsets[tag].get(synsets[0].offset())
+            if affect is None:
+                continue
+            emotion = affect.get_level(5).name
+            matches += 1
+            for category, members in self._categories.items():
+                if emotion in members:
+                    counts[category] += 1
+        # max(..., 1) avoids dividing by zero when nothing matched (all scores stay 0).
+        total = max(matches, 1)
+        for category in counts:
+            counts[category] = (counts[category] / total) * 100
+        return counts
+
+    def analyse(self, **params):
+        """Score the "input" text and return a Results holding one Entry.
+
+        The entry carries a single EmotionSet with one Emotion per output category,
+        whose intensity is the score computed by _extract_features.
+        """
+        logger.debug("Analysing with params {}".format(params))
+
+        text_input = params.get("input", None)
+        scores = self._extract_features(self._my_preprocessor(text_input))
+
+        response = Results()
+        # The entry keeps the raw input text, not the preprocessed one.
+        entry = Entry(id="Entry", text=text_input)
+        emotionSet = EmotionSet(id="Emotions0")
+        emotions = emotionSet.onyx__hasEmotion
+        for category in scores:
+            emotion = Emotion(onyx__hasEmotionCategory=self._wnaffect_mappings[category],
+                              onyx__hasEmotionIntensity=scores[category])
+            emotions.append(emotion)
+
+        entry.emotions = [emotionSet]
+        response.entries.append(entry)
+        return response
diff --git a/emoTextWNA/emotextWAF.senpy b/emoTextWNA/emotextWAF.senpy
new file mode 100644
index 0000000..ae9f812
--- /dev/null
+++ b/emoTextWNA/emotextWAF.senpy
@@ -0,0 +1,29 @@
+{
+    "name": "EmoTextWAF",
+    "module": "emotextWAF",
+    "description": "Emotion classifier using rule-based classification.",
+    "author": "@icorcuera @balkian",
+    "version": "0.2",
+    "extra_params": {
+        "language": {
+            "aliases": ["language", "l"],
+            "required": true,
+            "options": ["en"],
+            "default": "en"
+        }
+    },
+    "requirements": {},
+    "synsets_path": "/a-synsets.xml",
+    "hierarchy_path": "/a-hierarchy.xml",
+    "wn16_path": "/wordnet1.6/dict",
+    "requirements": [
+        "nltk>=3.0.5",
+        "numpy>=1.8.2",
+        "scipy>=0.14.0",
+        "scikit-learn>=0.14.1",
+        "lxml>=3.4.2",
+        "pandas",
+        "senpy",
+        "pattern"
+    ]
+}
diff --git a/emoTextWNA/emotion.py b/emoTextWNA/emotion.py
new file mode 100644
index 0000000..31a4534
--- /dev/null
+++ b/emoTextWNA/emotion.py
@@ -0,0 +1,97 @@
+
+# coding: utf-8
+
+"""
+Clement Michard (c) 2015
+"""
+
+class Emotion:
+    """Defines an emotion."""
+    
+    emotions = {} # name to emotion (str -> Emotion)
+    
+    def __init__(self, name, parent_name=None):
+        """Creates an emotion and, when a parent is named, attaches it below that parent.
+            name -- name of the emotion (str)
+            parent_name -- name of an emotion already stored in Emotion.emotions (str)
+        """
+        self.name = name
+        self.children = []
+        if not parent_name:
+            # No parent given: the emotion sits at level 0 with no parent.
+            self.parent = None
+            self.level = 0
+            return
+        self.parent = Emotion.emotions[parent_name]
+        self.parent.children.append(self)
+        self.level = self.parent.level + 1
+            
+            
+    def get_level(self, level):
+        """Returns self, or its closest ancestor that is no deeper than the given level.
+            level -- level in the hierarchy (int)
+        """
+        # Climb towards the root until the node is no deeper than the requested level.
+        node = self
+        while node.level >= 0 and node.level > level:
+            node = node.parent
+        return node
+    
+    
+    def __str__(self):
+        """Returns the emotion string formatted."""
+        
+        return self.name
+        
+        
+    def nb_children(self):
+        """Returns the size of the subtree rooted at this emotion: all of its
+        descendants plus the emotion itself (so an emotion without children yields 1)."""
+        return sum(child.nb_children() for child in self.children) + 1
+        
+        
+    @staticmethod
+    def printTree(emotion=None, indent="", last='updown'):
+        """Prints the hierarchy of emotions as a tree drawn sideways.
+            emotion -- root of the (sub)tree to print; defaults to Emotion.emotions["root"]
+            indent, last -- drawing state handed down to the recursive calls
+        """
+        if not emotion:
+            emotion = Emotion.emotions["root"]
+
+        size_branch = {child: child.nb_children() for child in emotion.children}
+        leaves = sorted(emotion.children, key=lambda emotion: emotion.nb_children())
+        up, down = [], []
+        if leaves:
+            while sum(size_branch[e] for e in down) < sum(size_branch[e] for e in leaves):
+                down.append(leaves.pop())
+            up = leaves
+
+        for position, leaf in enumerate(up):
+            next_last = 'up' if position == 0 else ''  # "==": "is" tests identity, not value
+            next_indent = '{0}{1}{2}'.format(indent, ' ' if 'up' in last else '│', " " * len(emotion.name))
+            Emotion.printTree(leaf, indent=next_indent, last=next_last)
+        if last == 'up':
+            start_shape = '┌'
+        elif last == 'down':
+            start_shape = '└'
+        elif last == 'updown':
+            start_shape = ' '
+        else:
+            start_shape = '├'
+        if up:
+            end_shape = '┤'
+        elif down:
+            end_shape = '┐'
+        else:
+            end_shape = ''
+        print '{0}{1}{2}{3}'.format(indent, start_shape, emotion.name, end_shape)
+        for position, leaf in enumerate(down):
+            next_last = 'down' if position == len(down) - 1 else ''
+            next_indent = '{0}{1}{2}'.format(indent, ' ' if 'down' in last else '│', " " * len(emotion.name))
+            Emotion.printTree(leaf, indent=next_indent, last=next_last)
+
+
+
+
+
diff --git a/emoTextWNA/test_wna.py b/emoTextWNA/test_wna.py
new file mode 100644
index 0000000..0f782ef
--- /dev/null
+++ b/emoTextWNA/test_wna.py
@@ -0,0 +1,42 @@
+import os
+import logging
+logging.basicConfig()
+try:
+    import unittest.mock as mock
+except ImportError:
+    import mock
+from senpy.extensions import Senpy
+from flask import Flask
+import unittest
+
+class emoTextWAFTest(unittest.TestCase):
+
+    def setUp(self):
+        self.app = Flask("test_plugin")
+        self.dir = os.path.join(os.path.dirname(__file__))
+        self.senpy = Senpy(plugin_folder=self.dir, default_plugins=False)
+        self.senpy.init_app(self.app)
+
+    def tearDown(self):
+        self.senpy.deactivate_plugin("EmoTextWAF", sync=True)
+
+    def test_analyse(self):
+        plugin = self.senpy.plugins["EmoTextWAF"]
+        plugin.activate()
+
+        texts = {'I hate you': 'anger',
+                 'i am sad': 'sadness',
+                 'i am happy with my marks': 'joy',
+                 'This movie is scary': 'negative-fear'}
+
+        for text in texts:
+            response = plugin.analyse(input=text)
+            expected = texts[text]
+            emotionSet = response.entries[0].emotions[0]
+            max_emotion = max(emotionSet['onyx:hasEmotion'], key=lambda x: x['onyx:hasEmotionIntensity'])
+            assert max_emotion['onyx:hasEmotionCategory'] == expected
+
+        plugin.deactivate()
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/emoTextWNA/wnaffect.py b/emoTextWNA/wnaffect.py
new file mode 100644
index 0000000..29cf64d
--- /dev/null
+++ b/emoTextWNA/wnaffect.py
@@ -0,0 +1,92 @@
+
+# coding: utf-8
+
+# In[1]:
+
+
+# -*- coding: utf-8 -*-
+"""
+Clement Michard (c) 2015
+"""
+
+import os
+import sys
+import nltk
+from emotion import Emotion
+from nltk.corpus import WordNetCorpusReader
+import xml.etree.ElementTree as ET
+
+class WNAffect:
+    """WordNet-Affect ressource."""
+    
+    def __init__(self, wordnet16_dir, wn_domains_dir):
+        """Loads the WordNet corpus from <cwd>/<wordnet16_dir>/dict and the
+        WordNet-Affect xml files (a-hierarchy.xml, a-synsets.xml) from wn_domains_dir."""
+        cwd = os.getcwd()
+        nltk.data.path.append(cwd)
+        wn16_path = "{0}/dict".format(wordnet16_dir)
+        self.wn16 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn16_path)), nltk.data.find(wn16_path))
+        self.flat_pos = {'NN':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB', 'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VBG':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'}  # 'VBG' was misspelled 'VGB'
+        self.wn_pos = {'NN':self.wn16.NOUN, 'JJ':self.wn16.ADJ, 'VB':self.wn16.VERB, 'RB':self.wn16.ADV}
+        self._load_emotions(wn_domains_dir)
+        self.synsets = self._load_synsets(wn_domains_dir)
+        
+
+
+    def _load_synsets(self, wn_domains_dir):
+        """Returns POS tag -> synset offset -> Emotion (None for unknown categories).
+
+        Reads a-synsets.xml from wn_domains_dir.
+        """
+        root = ET.parse("{0}/a-synsets.xml".format(wn_domains_dir)).getroot()
+        synsets = {}
+        # Nouns first: the other parts of speech may refer to a noun synset via "noun-id".
+        for pos, tag in [("noun", "NN"), ("adj", "JJ"), ("verb", "VB"), ("adv", "RB")]:
+            by_offset = synsets[tag] = {}
+            for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos)):
+                offset = int(elem.get("id")[2:])
+                categ, noun_id = elem.get("categ"), elem.get("noun-id")
+                if offset == 0:
+                    continue
+                if categ:
+                    by_offset[offset] = Emotion.emotions.get(categ)
+                elif noun_id:
+                    by_offset[offset] = synsets["NN"][int(noun_id[2:])]
+        return synsets
+        
+    def _load_emotions(self, wn_domains_dir):
+        """Registers every <categ> of a-hierarchy.xml in Emotion.emotions, keyed by name."""
+        # Emotion() resolves the "isa" parent through Emotion.emotions, so the xml
+        # has to list each parent before its children.
+        hierarchy = ET.parse("{0}/a-hierarchy.xml".format(wn_domains_dir))
+        for categ in hierarchy.getroot().findall("categ"):
+            name = categ.get("name")
+            if name != "root":
+                Emotion.emotions[name] = Emotion(name, categ.get("isa"))
+            else:
+                Emotion.emotions["root"] = Emotion("root")
+    
+    def get_emotion(self, word, pos):
+        """Returns the Emotion stored for the word's first synset, or None.
+            word -- the word (str)
+            pos -- part-of-speech tag, looked up in self.flat_pos (str)
+        """
+        flat = self.flat_pos.get(pos)
+        if flat is None:
+            return None
+        synsets = self.wn16.synsets(word, self.wn_pos[flat])
+        if not synsets:
+            return None
+        # Only the first synset is looked up; offsets without a WordNet-Affect
+        # entry yield None, like unsupported tags and unknown words.
+        return self.synsets[flat].get(synsets[0].offset())
+
+            
+
+            
+if __name__ == "__main__":
+    wordnet16, wndomains32, word, pos = sys.argv[1:5]
+    wna = WNAffect(wordnet16, wndomains32)
+    print wna.get_emotion(word, pos)
+
+
diff --git a/example-plugin/example.senpy b/example-plugin/example.senpy
index 3d1a7db..6c667f4 100644
--- a/example-plugin/example.senpy
+++ b/example-plugin/example.senpy
@@ -12,5 +12,6 @@
             "default": 42
         }
      },
+    "requirements": ["noop"],
     "custom_attribute": "42"
 }
diff --git a/example-plugin/test_example.py b/example-plugin/test_example.py
new file mode 100644
index 0000000..f6dafc5
--- /dev/null
+++ b/example-plugin/test_example.py
@@ -0,0 +1,23 @@
+import unittest
+from flask import Flask
+import os
+
+from senpy.extensions import Senpy
+
+class emoTextWAFTest(unittest.TestCase):
+
+    def setUp(self):
+        self.app = Flask("Example")
+        self.dir = os.path.join(os.path.dirname(__file__))
+        self.senpy = Senpy(plugin_folder=self.dir, default_plugins=False)
+        self.senpy.init_app(self.app)
+
+    def tearDown(self):
+        self.senpy.deactivate_plugin("ExamplePlugin", sync=True)
+
+    def test_analyse(self):
+        assert len(self.senpy.plugins.keys()) == 1
+        assert True
+
+if __name__ == '__main__':
+    unittest.main()