1
0
mirror of https://github.com/gsi-upm/senpy synced 2024-11-22 00:02:28 +00:00

Added WordNet-Affect plugin and Makefile

This commit is contained in:
J. Fernando Sánchez 2016-09-21 21:48:57 +02:00
parent 0e9db7081c
commit 5e8bc717a8
11 changed files with 527 additions and 0 deletions

8
.gitignore vendored
View File

@ -55,3 +55,11 @@ docs/_build/
# PyBuilder # PyBuilder
target/ target/
.*
*.pyc
**/__pycache__
*/wordnet1.6
*/Corpus
*/a-hierarchy.xml
*/a-synsets.xml
*/wn16.txt

15
Dockerfile Normal file
View File

@ -0,0 +1,15 @@
from gsiupm/senpy:0.6.1-python2.7
RUN mkdir -p /senpy-plugins
RUN pip install nltk
RUN python -m nltk.downloader stopwords
RUN python -m nltk.downloader punkt
RUN python -m nltk.downloader maxent_treebank_pos_tagger
RUN python -m nltk.downloader wordnet
RUN pip install pytest
RUN pip install mock
ADD . /senpy-plugins
RUN senpy -f /senpy-plugins --only-install
WORKDIR /senpy-plugins/

25
Makefile Normal file
View File

@ -0,0 +1,25 @@
PYVERSION=2.7
NAME=senpycommunity
REPO=gsiupm
VERSION=test
PLUGINS= $(filter %/, $(wildcard */))
all: build run
build: clean Dockerfile
docker build -t '$(REPO)/$(NAME):$(VERSION)-python$(PYVERSION)' -f Dockerfile .;
test-%:
docker run -v $$PWD/$*:/senpy-plugins/ --rm --entrypoint=/usr/local/bin/py.test -ti '$(REPO)/$(NAME):$(VERSION)-python$(PYVERSION)' test.py
test: $(addprefix test-,$(PLUGINS))
clean:
@docker ps -a | awk '/$(REPO)\/$(NAME)/{ split($$2, vers, "-"); if(vers[1] != "${VERSION}"){ print $$1;}}' | xargs docker rm 2>/dev/null|| true
@docker images | awk '/$(REPO)\/$(NAME)/{ split($$2, vers, "-"); if(vers[1] != "${VERSION}"){ print $$1":"$$2;}}' | xargs docker rmi 2>/dev/null|| true
run: build
docker run --rm -p 5000:5000 -ti '$(REPO)/$(NAME):$(VERSION)-python$(PYMAIN)'
.PHONY: test test-% build-% build test test_pip run clean

10
emoTextWNA/README.rst Normal file
View File

@ -0,0 +1,10 @@
This plugin uses WNAffect labels for emotion analysis.
The emotextWAF.senpy file can be copied and modified to use different versions of wnaffect with the same python code.
Known issues
============
* This plugin uses the pattern library, which means it will only run on python 2.7
* Wnaffect and corpora files are not included in the repository, but can be easily added either to the docker image (using a volume) or in a new docker image.

185
emoTextWNA/emotextWAF.py Normal file
View File

@ -0,0 +1,185 @@
# -*- coding: utf-8 -*-
from __future__ import division
import re
import nltk
import logging
import os
import string
import xml.etree.ElementTree as ET
from nltk.corpus import stopwords
from nltk.corpus import WordNetCorpusReader
from emotion import Emotion as Emo
from pattern.en import parse
from senpy.plugins import EmotionPlugin, SenpyPlugin
from senpy.models import Results, EmotionSet, Entry, Emotion
logger = logging.getLogger(__name__)
class EmotionTextPlugin(EmotionPlugin):
def __init__(self, info, *args, **kwargs):
super(EmotionTextPlugin, self).__init__(info, *args, **kwargs)
self.id = info['module']
self.info = info
self._stopwords = stopwords.words('english')
local_path=os.path.dirname(os.path.abspath(__file__))
self._categories = {'anger': ['general-dislike',],
'fear': ['negative-fear',],
'disgust': ['shame',],
'joy': ['gratitude','affective','enthusiasm','love','joy','liking'],
'sadness': ['ingrattitude','daze','humility','compassion','despair','anxiety','sadness']}
self._wnaffect_mappings = {'anger': 'anger',
'fear': 'negative-fear',
'disgust': 'disgust',
'joy': 'joy',
'sadness': 'sadness'}
self._load_emotions(local_path+self.info['hierarchy_path'])
self._total_synsets = self._load_synsets(local_path+self.info['synsets_path'])
self._wn16_path = local_path+self.info['wn16_path']
self._wn16= None
self._wn16 = WordNetCorpusReader(os.path.abspath("{0}".format(self._wn16_path)), nltk.data.find(self._wn16_path))
def _load_synsets(self, synsets_path):
"""Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str)."""
tree = ET.parse(synsets_path)
root = tree.getroot()
pos_map = { "noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB" }
synsets = {}
for pos in ["noun", "adj", "verb", "adv"]:
tag = pos_map[pos]
synsets[tag] = {}
for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos, pos)):
offset = int(elem.get("id")[2:])
if not offset: continue
if elem.get("categ"):
synsets[tag][offset] = Emo.emotions[elem.get("categ")] if elem.get("categ") in Emo.emotions else None
elif elem.get("noun-id"):
synsets[tag][offset] = synsets[pos_map["noun"]][int(elem.get("noun-id")[2:])]
return synsets
def _load_emotions(self, hierarchy_path):
"""Loads the hierarchy of emotions from the WordNet-Affect xml."""
tree = ET.parse(hierarchy_path)
root = tree.getroot()
for elem in root.findall("categ"):
name = elem.get("name")
if name == "root":
Emo.emotions["root"] = Emo("root")
else:
Emo.emotions[name] = Emo(name, elem.get("isa"))
def activate(self, *args, **kwargs):
logger.info("EmoText plugin is ready to go!")
def deactivate(self, *args, **kwargs):
logger.info("EmoText plugin is being deactivated...")
def _my_preprocessor(self, text):
regHttp = re.compile('(http://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
regHttps = re.compile('(https://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
regAt = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')
text = re.sub(regHttp, '', text)
text = re.sub(regAt, '', text)
text = re.sub('RT : ', '', text)
text = re.sub(regHttps, '', text)
text = re.sub('[0-9]', '', text)
text = self._delete_punctuation(text)
return text
def _delete_punctuation(self, text):
exclude = set(string.punctuation)
s = ''.join(ch for ch in text if ch not in exclude)
return s
def _extract_ngrams(self, text):
unigrams_lemmas = []
pos_tagged = []
unigrams_words = []
sentences = parse(text,lemmata=True).split()
for sentence in sentences:
for token in sentence:
if token[0].lower() not in self._stopwords:
unigrams_words.append(token[0].lower())
unigrams_lemmas.append(token[4])
pos_tagged.append(token[1])
return unigrams_words,unigrams_lemmas,pos_tagged
def _find_ngrams(self, input_list, n):
return zip(*[input_list[i:] for i in range(n)])
def _clean_pos(self, pos_tagged):
pos_tags={'NN':'NN', 'NNP':'NN','NNP-LOC':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB',
'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VGB':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'}
for i in range(len(pos_tagged)):
if pos_tagged[i] in pos_tags:
pos_tagged[i]=pos_tags[pos_tagged[i]]
return pos_tagged
def _extract_features(self, text):
feature_set={k:0 for k in self._categories}
ngrams_words,ngrams_lemmas,pos_tagged = self._extract_ngrams(text)
matches=0
pos_tagged=self._clean_pos(pos_tagged)
tag_wn={'NN':self._wn16.NOUN,'JJ':self._wn16.ADJ,'VB':self._wn16.VERB,'RB':self._wn16.ADV}
for i in range(len(pos_tagged)):
if pos_tagged[i] in tag_wn:
synsets = self._wn16.synsets(ngrams_words[i], tag_wn[pos_tagged[i]])
if synsets:
offset = synsets[0].offset()
if offset in self._total_synsets[pos_tagged[i]]:
if self._total_synsets[pos_tagged[i]][offset] is None:
continue
else:
emotion = self._total_synsets[pos_tagged[i]][offset].get_level(5).name
matches+=1
for i in self._categories:
if emotion in self._categories[i]:
feature_set[i]+=1
if matches == 0:
matches=1
for i in feature_set:
feature_set[i] = (feature_set[i]/matches)*100
return feature_set
def analyse(self, **params):
logger.debug("Analysing with params {}".format(params))
text_input = params.get("input", None)
text=self._my_preprocessor(text_input)
feature_text=self._extract_features(text)
response = Results()
entry = Entry(id="Entry",
text=text_input)
emotionSet = EmotionSet(id="Emotions0")
emotions = emotionSet.onyx__hasEmotion
for i in feature_text:
emotions.append(Emotion(onyx__hasEmotionCategory=self._wnaffect_mappings[i],
onyx__hasEmotionIntensity=feature_text[i]))
entry.emotions = [emotionSet]
response.entries.append(entry)
return response

View File

@ -0,0 +1,29 @@
{
"name": "EmoTextWAF",
"module": "emotextWAF",
"description": "Emotion classifier using rule-based classification.",
"author": "@icorcuera @balkian",
"version": "0.2",
"extra_params": {
"language": {
"aliases": ["language", "l"],
"required": true,
"options": ["en"],
"default": "en"
}
},
"requirements": {},
"synsets_path": "/a-synsets.xml",
"hierarchy_path": "/a-hierarchy.xml",
"wn16_path": "/wordnet1.6/dict",
"requirements": [
"nltk>=3.0.5",
"numpy>=1.8.2",
"scipy>=0.14.0",
"scikit-learn>=0.14.1",
"lxml>=3.4.2",
"pandas",
"senpy",
"pattern"
]
}

97
emoTextWNA/emotion.py Normal file
View File

@ -0,0 +1,97 @@
# coding: utf-8
"""
Clement Michard (c) 2015
"""
class Emotion:
"""Defines an emotion."""
emotions = {} # name to emotion (str -> Emotion)
def __init__(self, name, parent_name=None):
"""Initializes an Emotion object.
name -- name of the emotion (str)
parent_name -- name of the parent emotion (str)
"""
self.name = name
self.parent = None
self.level = 0
self.children = []
if parent_name:
self.parent = Emotion.emotions[parent_name] if parent_name else None
self.parent.children.append(self)
self.level = self.parent.level + 1
def get_level(self, level):
"""Returns the parent of self at the given level.
level -- level in the hierarchy (int)
"""
em = self
while em.level > level and em.level >= 0:
em = em.parent
return em
def __str__(self):
"""Returns the emotion string formatted."""
return self.name
def nb_children(self):
"""Returns the number of children of the emotion."""
return sum(child.nb_children() for child in self.children) + 1
@staticmethod
def printTree(emotion=None, indent="", last='updown'):
"""Prints the hierarchy of emotions.
emotion -- root emotion (Emotion)
"""
if not emotion:
emotion = Emotion.emotions["root"]
size_branch = {child: child.nb_children() for child in emotion.children}
leaves = sorted(emotion.children, key=lambda emotion: emotion.nb_children())
up, down = [], []
if leaves:
while sum(size_branch[e] for e in down) < sum(size_branch[e] for e in leaves):
down.append(leaves.pop())
up = leaves
for leaf in up:
next_last = 'up' if up.index(leaf) is 0 else ''
next_indent = '{0}{1}{2}'.format(indent, ' ' if 'up' in last else '', " " * len(emotion.name))
Emotion.printTree(leaf, indent=next_indent, last=next_last)
if last == 'up':
start_shape = ''
elif last == 'down':
start_shape = ''
elif last == 'updown':
start_shape = ' '
else:
start_shape = ''
if up:
end_shape = ''
elif down:
end_shape = ''
else:
end_shape = ''
print '{0}{1}{2}{3}'.format(indent, start_shape, emotion.name, end_shape)
for leaf in down:
next_last = 'down' if down.index(leaf) is len(down) - 1 else ''
next_indent = '{0}{1}{2}'.format(indent, ' ' if 'down' in last else '', " " * len(emotion.name))
Emotion.printTree(leaf, indent=next_indent, last=next_last)

42
emoTextWNA/test_wna.py Normal file
View File

@ -0,0 +1,42 @@
import os
import logging
logging.basicConfig()
try:
import unittest.mock as mock
except ImportError:
import mock
from senpy.extensions import Senpy
from flask import Flask
import unittest
class emoTextWAFTest(unittest.TestCase):
def setUp(self):
self.app = Flask("test_plugin")
self.dir = os.path.join(os.path.dirname(__file__))
self.senpy = Senpy(plugin_folder=self.dir, default_plugins=False)
self.senpy.init_app(self.app)
def tearDown(self):
self.senpy.deactivate_plugin("EmoTextWAF", sync=True)
def test_analyse(self):
plugin = self.senpy.plugins["EmoTextWAF"]
plugin.activate()
texts = {'I hate you': 'anger',
'i am sad': 'sadness',
'i am happy with my marks': 'joy',
'This movie is scary': 'negative-fear'}
for text in texts:
response = plugin.analyse(input=text)
expected = texts[text]
emotionSet = response.entries[0].emotions[0]
max_emotion = max(emotionSet['onyx:hasEmotion'], key=lambda x: x['onyx:hasEmotionIntensity'])
assert max_emotion['onyx:hasEmotionCategory'] == expected
plugin.deactivate()
if __name__ == '__main__':
unittest.main()

92
emoTextWNA/wnaffect.py Normal file
View File

@ -0,0 +1,92 @@
# coding: utf-8
# In[1]:
# -*- coding: utf-8 -*-
"""
Clement Michard (c) 2015
"""
import os
import sys
import nltk
from emotion import Emotion
from nltk.corpus import WordNetCorpusReader
import xml.etree.ElementTree as ET
class WNAffect:
"""WordNet-Affect ressource."""
def __init__(self, wordnet16_dir, wn_domains_dir):
"""Initializes the WordNet-Affect object."""
cwd = os.getcwd()
nltk.data.path.append(cwd)
wn16_path = "{0}/dict".format(wordnet16_dir)
self.wn16 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn16_path)), nltk.data.find(wn16_path))
self.flat_pos = {'NN':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB', 'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VGB':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'}
self.wn_pos = {'NN':self.wn16.NOUN, 'JJ':self.wn16.ADJ, 'VB':self.wn16.VERB, 'RB':self.wn16.ADV}
self._load_emotions(wn_domains_dir)
self.synsets = self._load_synsets(wn_domains_dir)
def _load_synsets(self, wn_domains_dir):
"""Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str)."""
tree = ET.parse("{0}/a-synsets.xml".format(wn_domains_dir))
root = tree.getroot()
pos_map = { "noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB" }
synsets = {}
for pos in ["noun", "adj", "verb", "adv"]:
tag = pos_map[pos]
synsets[tag] = {}
for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos, pos)):
offset = int(elem.get("id")[2:])
if not offset: continue
if elem.get("categ"):
synsets[tag][offset] = Emotion.emotions[elem.get("categ")] if elem.get("categ") in Emotion.emotions else None
elif elem.get("noun-id"):
synsets[tag][offset] = synsets[pos_map["noun"]][int(elem.get("noun-id")[2:])]
return synsets
def _load_emotions(self, wn_domains_dir):
"""Loads the hierarchy of emotions from the WordNet-Affect xml."""
tree = ET.parse("{0}/a-hierarchy.xml".format(wn_domains_dir))
root = tree.getroot()
for elem in root.findall("categ"):
name = elem.get("name")
if name == "root":
Emotion.emotions["root"] = Emotion("root")
else:
Emotion.emotions[name] = Emotion(name, elem.get("isa"))
def get_emotion(self, word, pos):
"""Returns the emotion of the word.
word -- the word (str)
pos -- part-of-speech (str)
"""
if pos in self.flat_pos:
pos = self.flat_pos[pos]
synsets = self.wn16.synsets(word, self.wn_pos[pos])
if synsets:
offset = synsets[0].offset()
if offset in self.synsets[pos]:
return self.synsets[pos][offset]
return None
if __name__ == "__main__":
wordnet16, wndomains32, word, pos = sys.argv[1:5]
wna = WNAffect(wordnet16, wndomains32)
print wna.get_emotion(word, pos)

View File

@ -12,5 +12,6 @@
"default": 42 "default": 42
} }
}, },
"requirements": ["noop"],
"custom_attribute": "42" "custom_attribute": "42"
} }

View File

@ -0,0 +1,23 @@
import unittest
from flask import Flask
import os
from senpy.extensions import Senpy
class emoTextWAFTest(unittest.TestCase):
def setUp(self):
self.app = Flask("Example")
self.dir = os.path.join(os.path.dirname(__file__))
self.senpy = Senpy(plugin_folder=self.dir, default_plugins=False)
self.senpy.init_app(self.app)
def tearDown(self):
self.senpy.deactivate_plugin("ExamplePlugin", sync=True)
def test_analyse(self):
assert len(self.senpy.plugins.keys()) == 1
assert True
if __name__ == '__main__':
unittest.main()