From 5e8bc717a844773862e9f656fc30f2c75d19eb76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Fernando=20S=C3=A1nchez?= Date: Wed, 21 Sep 2016 21:48:57 +0200 Subject: [PATCH] Added WordNet-Affect plugin and Makefile --- .gitignore | 8 ++ Dockerfile | 15 +++ Makefile | 25 +++++ emoTextWNA/README.rst | 10 ++ emoTextWNA/emotextWAF.py | 185 +++++++++++++++++++++++++++++++++ emoTextWNA/emotextWAF.senpy | 29 ++++++ emoTextWNA/emotion.py | 97 +++++++++++++++++ emoTextWNA/test_wna.py | 42 ++++++++ emoTextWNA/wnaffect.py | 92 ++++++++++++++++ example-plugin/example.senpy | 1 + example-plugin/test_example.py | 23 ++++ 11 files changed, 527 insertions(+) create mode 100644 Dockerfile create mode 100644 Makefile create mode 100644 emoTextWNA/README.rst create mode 100644 emoTextWNA/emotextWAF.py create mode 100644 emoTextWNA/emotextWAF.senpy create mode 100644 emoTextWNA/emotion.py create mode 100644 emoTextWNA/test_wna.py create mode 100644 emoTextWNA/wnaffect.py create mode 100644 example-plugin/test_example.py diff --git a/.gitignore b/.gitignore index ba74660..108d10f 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,11 @@ docs/_build/ # PyBuilder target/ +.* +*.pyc +**/__pycache__ +*/wordnet1.6 +*/Corpus +*/a-hierarchy.xml +*/a-synsets.xml +*/wn16.txt \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8232327 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,15 @@ +from gsiupm/senpy:0.6.1-python2.7 + +RUN mkdir -p /senpy-plugins +RUN pip install nltk +RUN python -m nltk.downloader stopwords +RUN python -m nltk.downloader punkt +RUN python -m nltk.downloader maxent_treebank_pos_tagger +RUN python -m nltk.downloader wordnet + +RUN pip install pytest +RUN pip install mock +ADD . /senpy-plugins +RUN senpy -f /senpy-plugins --only-install + +WORKDIR /senpy-plugins/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..9c33135 --- /dev/null +++ b/Makefile @@ -0,0 +1,25 @@ +PYVERSION=2.7 +NAME=senpycommunity +REPO=gsiupm +VERSION=test +PLUGINS= $(filter %/, $(wildcard */)) + + +all: build run + +build: clean Dockerfile + docker build -t '$(REPO)/$(NAME):$(VERSION)-python$(PYVERSION)' -f Dockerfile .; + +test-%: + docker run -v $$PWD/$*:/senpy-plugins/ --rm --entrypoint=/usr/local/bin/py.test -ti '$(REPO)/$(NAME):$(VERSION)-python$(PYVERSION)' test.py + +test: $(addprefix test-,$(PLUGINS)) + +clean: + @docker ps -a | awk '/$(REPO)\/$(NAME)/{ split($$2, vers, "-"); if(vers[1] != "${VERSION}"){ print $$1;}}' | xargs docker rm 2>/dev/null|| true + @docker images | awk '/$(REPO)\/$(NAME)/{ split($$2, vers, "-"); if(vers[1] != "${VERSION}"){ print $$1":"$$2;}}' | xargs docker rmi 2>/dev/null|| true + +run: build + docker run --rm -p 5000:5000 -ti '$(REPO)/$(NAME):$(VERSION)-python$(PYMAIN)' + +.PHONY: test test-% build-% build test test_pip run clean diff --git a/emoTextWNA/README.rst b/emoTextWNA/README.rst new file mode 100644 index 0000000..1ab8599 --- /dev/null +++ b/emoTextWNA/README.rst @@ -0,0 +1,10 @@ +This plugin uses WNAffect labels for emotion analysis. + +The emotextWAF.senpy file can be copied and modified to use different versions of wnaffect with the same python code. + + +Known issues +============ + + * This plugin uses the pattern library, which means it will only run on python 2.7 + * Wnaffect and corpora files are not included in the repository, but can be easily added either to the docker image (using a volume) or in a new docker image. diff --git a/emoTextWNA/emotextWAF.py b/emoTextWNA/emotextWAF.py new file mode 100644 index 0000000..a1e23f0 --- /dev/null +++ b/emoTextWNA/emotextWAF.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- + +from __future__ import division +import re +import nltk +import logging +import os +import string +import xml.etree.ElementTree as ET +from nltk.corpus import stopwords +from nltk.corpus import WordNetCorpusReader + +from emotion import Emotion as Emo +from pattern.en import parse +from senpy.plugins import EmotionPlugin, SenpyPlugin +from senpy.models import Results, EmotionSet, Entry, Emotion + +logger = logging.getLogger(__name__) + +class EmotionTextPlugin(EmotionPlugin): + + def __init__(self, info, *args, **kwargs): + super(EmotionTextPlugin, self).__init__(info, *args, **kwargs) + self.id = info['module'] + self.info = info + self._stopwords = stopwords.words('english') + local_path=os.path.dirname(os.path.abspath(__file__)) + self._categories = {'anger': ['general-dislike',], + 'fear': ['negative-fear',], + 'disgust': ['shame',], + 'joy': ['gratitude','affective','enthusiasm','love','joy','liking'], + 'sadness': ['ingrattitude','daze','humility','compassion','despair','anxiety','sadness']} + + self._wnaffect_mappings = {'anger': 'anger', + 'fear': 'negative-fear', + 'disgust': 'disgust', + 'joy': 'joy', + 'sadness': 'sadness'} + + self._load_emotions(local_path+self.info['hierarchy_path']) + self._total_synsets = self._load_synsets(local_path+self.info['synsets_path']) + self._wn16_path = local_path+self.info['wn16_path'] + self._wn16= None + self._wn16 = WordNetCorpusReader(os.path.abspath("{0}".format(self._wn16_path)), nltk.data.find(self._wn16_path)) + + + def _load_synsets(self, synsets_path): + """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str).""" + tree = ET.parse(synsets_path) + root = tree.getroot() + pos_map = { "noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB" } + + synsets = {} + for pos in ["noun", "adj", "verb", "adv"]: + tag = pos_map[pos] + synsets[tag] = {} + for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos, pos)): + offset = int(elem.get("id")[2:]) + if not offset: continue + if elem.get("categ"): + synsets[tag][offset] = Emo.emotions[elem.get("categ")] if elem.get("categ") in Emo.emotions else None + elif elem.get("noun-id"): + synsets[tag][offset] = synsets[pos_map["noun"]][int(elem.get("noun-id")[2:])] + return synsets + + def _load_emotions(self, hierarchy_path): + """Loads the hierarchy of emotions from the WordNet-Affect xml.""" + + tree = ET.parse(hierarchy_path) + root = tree.getroot() + for elem in root.findall("categ"): + name = elem.get("name") + if name == "root": + Emo.emotions["root"] = Emo("root") + else: + Emo.emotions[name] = Emo(name, elem.get("isa")) + + def activate(self, *args, **kwargs): + logger.info("EmoText plugin is ready to go!") + + def deactivate(self, *args, **kwargs): + + logger.info("EmoText plugin is being deactivated...") + + def _my_preprocessor(self, text): + + regHttp = re.compile('(http://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?') + regHttps = re.compile('(https://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?') + regAt = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*') + text = re.sub(regHttp, '', text) + text = re.sub(regAt, '', text) + text = re.sub('RT : ', '', text) + text = re.sub(regHttps, '', text) + text = re.sub('[0-9]', '', text) + text = self._delete_punctuation(text) + return text + + def _delete_punctuation(self, text): + + exclude = set(string.punctuation) + s = ''.join(ch for ch in text if ch not in exclude) + return s + + def _extract_ngrams(self, text): + + unigrams_lemmas = [] + pos_tagged = [] + unigrams_words = [] + sentences = parse(text,lemmata=True).split() + for sentence in sentences: + for token in sentence: + if token[0].lower() not in self._stopwords: + unigrams_words.append(token[0].lower()) + unigrams_lemmas.append(token[4]) + pos_tagged.append(token[1]) + + return unigrams_words,unigrams_lemmas,pos_tagged + + def _find_ngrams(self, input_list, n): + return zip(*[input_list[i:] for i in range(n)]) + + def _clean_pos(self, pos_tagged): + + pos_tags={'NN':'NN', 'NNP':'NN','NNP-LOC':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB', + 'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VGB':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'} + + for i in range(len(pos_tagged)): + if pos_tagged[i] in pos_tags: + pos_tagged[i]=pos_tags[pos_tagged[i]] + return pos_tagged + + def _extract_features(self, text): + + feature_set={k:0 for k in self._categories} + ngrams_words,ngrams_lemmas,pos_tagged = self._extract_ngrams(text) + matches=0 + pos_tagged=self._clean_pos(pos_tagged) + + tag_wn={'NN':self._wn16.NOUN,'JJ':self._wn16.ADJ,'VB':self._wn16.VERB,'RB':self._wn16.ADV} + for i in range(len(pos_tagged)): + if pos_tagged[i] in tag_wn: + synsets = self._wn16.synsets(ngrams_words[i], tag_wn[pos_tagged[i]]) + if synsets: + offset = synsets[0].offset() + if offset in self._total_synsets[pos_tagged[i]]: + if self._total_synsets[pos_tagged[i]][offset] is None: + continue + else: + emotion = self._total_synsets[pos_tagged[i]][offset].get_level(5).name + matches+=1 + for i in self._categories: + if emotion in self._categories[i]: + feature_set[i]+=1 + if matches == 0: + matches=1 + + for i in feature_set: + feature_set[i] = (feature_set[i]/matches)*100 + + return feature_set + + def analyse(self, **params): + + logger.debug("Analysing with params {}".format(params)) + + text_input = params.get("input", None) + + text=self._my_preprocessor(text_input) + + feature_text=self._extract_features(text) + + response = Results() + + entry = Entry(id="Entry", + text=text_input) + emotionSet = EmotionSet(id="Emotions0") + emotions = emotionSet.onyx__hasEmotion + + for i in feature_text: + emotions.append(Emotion(onyx__hasEmotionCategory=self._wnaffect_mappings[i], + onyx__hasEmotionIntensity=feature_text[i])) + + entry.emotions = [emotionSet] + response.entries.append(entry) + return response diff --git a/emoTextWNA/emotextWAF.senpy b/emoTextWNA/emotextWAF.senpy new file mode 100644 index 0000000..ae9f812 --- /dev/null +++ b/emoTextWNA/emotextWAF.senpy @@ -0,0 +1,29 @@ +{ + "name": "EmoTextWAF", + "module": "emotextWAF", + "description": "Emotion classifier using rule-based classification.", + "author": "@icorcuera @balkian", + "version": "0.2", + "extra_params": { + "language": { + "aliases": ["language", "l"], + "required": true, + "options": ["en"], + "default": "en" + } + }, + "requirements": {}, + "synsets_path": "/a-synsets.xml", + "hierarchy_path": "/a-hierarchy.xml", + "wn16_path": "/wordnet1.6/dict", + "requirements": [ + "nltk>=3.0.5", + "numpy>=1.8.2", + "scipy>=0.14.0", + "scikit-learn>=0.14.1", + "lxml>=3.4.2", + "pandas", + "senpy", + "pattern" + ] +} diff --git a/emoTextWNA/emotion.py b/emoTextWNA/emotion.py new file mode 100644 index 0000000..31a4534 --- /dev/null +++ b/emoTextWNA/emotion.py @@ -0,0 +1,97 @@ + +# coding: utf-8 + +""" +Clement Michard (c) 2015 +""" + +class Emotion: + """Defines an emotion.""" + + emotions = {} # name to emotion (str -> Emotion) + + def __init__(self, name, parent_name=None): + """Initializes an Emotion object. + name -- name of the emotion (str) + parent_name -- name of the parent emotion (str) + """ + + self.name = name + self.parent = None + self.level = 0 + self.children = [] + + if parent_name: + self.parent = Emotion.emotions[parent_name] if parent_name else None + self.parent.children.append(self) + self.level = self.parent.level + 1 + + + def get_level(self, level): + """Returns the parent of self at the given level. + level -- level in the hierarchy (int) + """ + + em = self + while em.level > level and em.level >= 0: + em = em.parent + return em + + + def __str__(self): + """Returns the emotion string formatted.""" + + return self.name + + + def nb_children(self): + """Returns the number of children of the emotion.""" + + return sum(child.nb_children() for child in self.children) + 1 + + + @staticmethod + def printTree(emotion=None, indent="", last='updown'): + """Prints the hierarchy of emotions. + emotion -- root emotion (Emotion) + """ + + if not emotion: + emotion = Emotion.emotions["root"] + + size_branch = {child: child.nb_children() for child in emotion.children} + leaves = sorted(emotion.children, key=lambda emotion: emotion.nb_children()) + up, down = [], [] + if leaves: + while sum(size_branch[e] for e in down) < sum(size_branch[e] for e in leaves): + down.append(leaves.pop()) + up = leaves + + for leaf in up: + next_last = 'up' if up.index(leaf) is 0 else '' + next_indent = '{0}{1}{2}'.format(indent, ' ' if 'up' in last else '│', " " * len(emotion.name)) + Emotion.printTree(leaf, indent=next_indent, last=next_last) + if last == 'up': + start_shape = '┌' + elif last == 'down': + start_shape = '└' + elif last == 'updown': + start_shape = ' ' + else: + start_shape = '├' + if up: + end_shape = '┤' + elif down: + end_shape = '┐' + else: + end_shape = '' + print '{0}{1}{2}{3}'.format(indent, start_shape, emotion.name, end_shape) + for leaf in down: + next_last = 'down' if down.index(leaf) is len(down) - 1 else '' + next_indent = '{0}{1}{2}'.format(indent, ' ' if 'down' in last else '│', " " * len(emotion.name)) + Emotion.printTree(leaf, indent=next_indent, last=next_last) + + + + + diff --git a/emoTextWNA/test_wna.py b/emoTextWNA/test_wna.py new file mode 100644 index 0000000..0f782ef --- /dev/null +++ b/emoTextWNA/test_wna.py @@ -0,0 +1,42 @@ +import os +import logging +logging.basicConfig() +try: + import unittest.mock as mock +except ImportError: + import mock +from senpy.extensions import Senpy +from flask import Flask +import unittest + +class emoTextWAFTest(unittest.TestCase): + + def setUp(self): + self.app = Flask("test_plugin") + self.dir = os.path.join(os.path.dirname(__file__)) + self.senpy = Senpy(plugin_folder=self.dir, default_plugins=False) + self.senpy.init_app(self.app) + + def tearDown(self): + self.senpy.deactivate_plugin("EmoTextWAF", sync=True) + + def test_analyse(self): + plugin = self.senpy.plugins["EmoTextWAF"] + plugin.activate() + + texts = {'I hate you': 'anger', + 'i am sad': 'sadness', + 'i am happy with my marks': 'joy', + 'This movie is scary': 'negative-fear'} + + for text in texts: + response = plugin.analyse(input=text) + expected = texts[text] + emotionSet = response.entries[0].emotions[0] + max_emotion = max(emotionSet['onyx:hasEmotion'], key=lambda x: x['onyx:hasEmotionIntensity']) + assert max_emotion['onyx:hasEmotionCategory'] == expected + + plugin.deactivate() + +if __name__ == '__main__': + unittest.main() diff --git a/emoTextWNA/wnaffect.py b/emoTextWNA/wnaffect.py new file mode 100644 index 0000000..29cf64d --- /dev/null +++ b/emoTextWNA/wnaffect.py @@ -0,0 +1,92 @@ + +# coding: utf-8 + +# In[1]: + + +# -*- coding: utf-8 -*- +""" +Clement Michard (c) 2015 +""" + +import os +import sys +import nltk +from emotion import Emotion +from nltk.corpus import WordNetCorpusReader +import xml.etree.ElementTree as ET + +class WNAffect: + """WordNet-Affect ressource.""" + + def __init__(self, wordnet16_dir, wn_domains_dir): + """Initializes the WordNet-Affect object.""" + + cwd = os.getcwd() + nltk.data.path.append(cwd) + wn16_path = "{0}/dict".format(wordnet16_dir) + self.wn16 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn16_path)), nltk.data.find(wn16_path)) + self.flat_pos = {'NN':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB', 'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VGB':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'} + self.wn_pos = {'NN':self.wn16.NOUN, 'JJ':self.wn16.ADJ, 'VB':self.wn16.VERB, 'RB':self.wn16.ADV} + self._load_emotions(wn_domains_dir) + self.synsets = self._load_synsets(wn_domains_dir) + + + + def _load_synsets(self, wn_domains_dir): + """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str).""" + + tree = ET.parse("{0}/a-synsets.xml".format(wn_domains_dir)) + root = tree.getroot() + pos_map = { "noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB" } + + synsets = {} + for pos in ["noun", "adj", "verb", "adv"]: + tag = pos_map[pos] + synsets[tag] = {} + for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos, pos)): + offset = int(elem.get("id")[2:]) + if not offset: continue + if elem.get("categ"): + synsets[tag][offset] = Emotion.emotions[elem.get("categ")] if elem.get("categ") in Emotion.emotions else None + elif elem.get("noun-id"): + synsets[tag][offset] = synsets[pos_map["noun"]][int(elem.get("noun-id")[2:])] + + return synsets + + def _load_emotions(self, wn_domains_dir): + """Loads the hierarchy of emotions from the WordNet-Affect xml.""" + + tree = ET.parse("{0}/a-hierarchy.xml".format(wn_domains_dir)) + root = tree.getroot() + for elem in root.findall("categ"): + name = elem.get("name") + if name == "root": + Emotion.emotions["root"] = Emotion("root") + else: + Emotion.emotions[name] = Emotion(name, elem.get("isa")) + + def get_emotion(self, word, pos): + """Returns the emotion of the word. + word -- the word (str) + pos -- part-of-speech (str) + """ + + if pos in self.flat_pos: + pos = self.flat_pos[pos] + synsets = self.wn16.synsets(word, self.wn_pos[pos]) + if synsets: + offset = synsets[0].offset() + if offset in self.synsets[pos]: + return self.synsets[pos][offset] + return None + + + + +if __name__ == "__main__": + wordnet16, wndomains32, word, pos = sys.argv[1:5] + wna = WNAffect(wordnet16, wndomains32) + print wna.get_emotion(word, pos) + + diff --git a/example-plugin/example.senpy b/example-plugin/example.senpy index 3d1a7db..6c667f4 100644 --- a/example-plugin/example.senpy +++ b/example-plugin/example.senpy @@ -12,5 +12,6 @@ "default": 42 } }, + "requirements": ["noop"], "custom_attribute": "42" } diff --git a/example-plugin/test_example.py b/example-plugin/test_example.py new file mode 100644 index 0000000..f6dafc5 --- /dev/null +++ b/example-plugin/test_example.py @@ -0,0 +1,23 @@ +import unittest +from flask import Flask +import os + +from senpy.extensions import Senpy + +class emoTextWAFTest(unittest.TestCase): + + def setUp(self): + self.app = Flask("Example") + self.dir = os.path.join(os.path.dirname(__file__)) + self.senpy = Senpy(plugin_folder=self.dir, default_plugins=False) + self.senpy.init_app(self.app) + + def tearDown(self): + self.senpy.deactivate_plugin("ExamplePlugin", sync=True) + + def test_analyse(self): + assert len(self.senpy.plugins.keys()) == 1 + assert True + +if __name__ == '__main__': + unittest.main()