from __future__ import division

import logging
import os
import re
import string
import xml.etree.ElementTree as ET

import nltk
from nltk.corpus import stopwords
from nltk.corpus import WordNetCorpusReader
from nltk.stem import wordnet

from emotion import Emotion as Emo
from senpy.plugins import EmotionPlugin, AnalysisPlugin, ShelfMixin
from senpy.models import Results, EmotionSet, Entry, Emotion

# Patterns used by the text preprocessor; compiled once at import time.
_HTTP_URL_RE = re.compile(
    '(http://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
_HTTPS_URL_RE = re.compile(
    '(https://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
_MENTION_RE = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')
_RETWEET_RE = re.compile('RT : ')
_DIGIT_RE = re.compile('[0-9]')

# Characters removed by ``_delete_punctuation``.
_PUNCTUATION = frozenset(string.punctuation)

# Collapses fine-grained POS tags onto the four coarse tags that key the
# synset tables (NN, JJ, RB, VB).  'VBG' is the Penn Treebank gerund tag
# emitted by ``nltk.pos_tag``.
_COARSE_POS = {
    'NN': 'NN',
    'NNP': 'NN',
    'NNP-LOC': 'NN',
    'NNS': 'NN',
    'JJ': 'JJ',
    'JJR': 'JJ',
    'JJS': 'JJ',
    'RB': 'RB',
    'RBR': 'RB',
    'RBS': 'RB',
    'VB': 'VB',
    'VBD': 'VB',
    'VBG': 'VB',
    'VBN': 'VB',
    'VBP': 'VB',
    'VBZ': 'VB',
}


# Emotion plugin that scores a text against five emotion categories by
# looking its words up in a WordNet corpus and mapping the first synset of
# each word to an emotion loaded from XML resources.
class EmotionTextPlugin(EmotionPlugin, ShelfMixin):

    def _load_synsets(self, synsets_path):
        """Returns a dictionary POS tag -> synset offset -> emotion
        (str -> int -> str).

        ``synsets_path`` is the XML file to parse.  Entries are read for
        nouns first, because the other parts of speech may refer to a noun
        synset through their ``noun-id`` attribute.  A ``categ`` that is not
        present in ``Emo.emotions`` is stored as ``None``.  A ``noun-id``
        that was not loaded as a noun raises ``KeyError``.
        """
        root = ET.parse(synsets_path).getroot()
        pos_map = {"noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB"}
        noun_tag = pos_map["noun"]
        synsets = {}
        for pos in ("noun", "adj", "verb", "adv"):
            by_offset = synsets[pos_map[pos]] = {}
            for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos)):
                # The id carries a two-character prefix before the offset.
                offset = int(elem.get("id")[2:])
                if not offset:
                    continue
                categ = elem.get("categ")
                noun_id = elem.get("noun-id")
                if categ:
                    by_offset[offset] = (Emo.emotions[categ]
                                         if categ in Emo.emotions else None)
                elif noun_id:
                    by_offset[offset] = synsets[noun_tag][int(noun_id[2:])]
        return synsets

    def _load_emotions(self, hierarchy_path):
        """Loads the hierarchy of emotions from the WordNet-Affect xml.

        Every ``categ`` element found directly under the document root is
        registered in ``Emo.emotions`` under its ``name``; all but the one
        named ``root`` are created with their ``isa`` attribute as second
        argument.
        """
        root = ET.parse(hierarchy_path).getroot()
        for elem in root.findall("categ"):
            name = elem.get("name")
            if name == "root":
                Emo.emotions["root"] = Emo("root")
            else:
                Emo.emotions[name] = Emo(name, elem.get("isa"))

    def activate(self, *args, **kwargs):
        """Downloads the NLTK resources and loads every lookup table.

        The synset table is computed only when it is not already stored in
        the shelf (``self.sh``) under ``'total_synsets'``.
        """
        nltk.download(['stopwords', 'averaged_perceptron_tagger', 'wordnet'])
        self._stopwords = stopwords.words('english')
        self._wnlemma = wordnet.WordNetLemmatizer()
        # First letter of a POS tag -> part of speech given to the lemmatizer.
        self._syntactics = {'N': 'n', 'V': 'v', 'J': 'a', 'S': 's', 'R': 'r'}
        local_path = os.path.dirname(os.path.abspath(__file__))
        # Output category -> emotion names counted towards it.
        self._categories = {
            'anger': [
                'general-dislike',
            ],
            'fear': [
                'negative-fear',
            ],
            'disgust': [
                'shame',
            ],
            # NOTE(review): 'affective' may be meant to be 'affection';
            # confirm against the hierarchy file before changing it.
            'joy': [
                'gratitude', 'affective', 'enthusiasm', 'love', 'joy',
                'liking'
            ],
            'sadness': [
                'ingratitude', 'daze', 'humility', 'compassion', 'despair',
                'anxiety', 'sadness'
            ],
        }
        # Output category -> category name reported in the results.
        self._wnaffect_mappings = {
            'anger': 'anger',
            'fear': 'negative-fear',
            'disgust': 'disgust',
            'joy': 'joy',
            'sadness': 'sadness',
        }

        self._load_emotions(local_path + self.hierarchy_path)

        if 'total_synsets' not in self.sh:
            total_synsets = self._load_synsets(local_path + self.synsets_path)
            self.sh['total_synsets'] = total_synsets

        self._total_synsets = self.sh['total_synsets']

        self._wn16_path = self.wn16_path
        self._wn16 = WordNetCorpusReader(
            os.path.abspath("{0}".format(local_path + self._wn16_path)),
            nltk.data.find(local_path + self._wn16_path))

    def deactivate(self, *args, **kwargs):
        """Saves the plugin state by calling ``self.save()``."""
        self.save()

    def _my_preprocessor(self, text):
        """Returns ``text`` without URLs, @-mentions, 'RT : ' markers,
        digits and punctuation.

        The substitutions run in a fixed order: the 'RT : ' marker is only
        looked for after the mentions have been removed.
        """
        text = _HTTP_URL_RE.sub('', text)
        text = _MENTION_RE.sub('', text)
        text = _RETWEET_RE.sub('', text)
        text = _HTTPS_URL_RE.sub('', text)
        text = _DIGIT_RE.sub('', text)
        return self._delete_punctuation(text)

    def _delete_punctuation(self, text):
        """Returns ``text`` with every ``string.punctuation`` character
        removed."""
        return ''.join(ch for ch in text if ch not in _PUNCTUATION)

    def _extract_ngrams(self, text):
        """Splits ``text`` on whitespace and POS-tags the tokens.

        Returns three parallel lists: the words, their lemmas and their POS
        tags.  A word is lemmatized only when the first letter of its tag is
        a key of ``self._syntactics``; otherwise the word itself is used as
        its lemma.
        """
        unigrams_words = []
        unigrams_lemmas = []
        pos_tagged = []
        for word, tag in nltk.pos_tag(text.split()):
            unigrams_words.append(word)
            pos_tagged.append(tag)
            wn_pos = self._syntactics.get(tag[0])
            if wn_pos is None:
                unigrams_lemmas.append(word)
            else:
                unigrams_lemmas.append(self._wnlemma.lemmatize(word, wn_pos))
        return unigrams_words, unigrams_lemmas, pos_tagged

    def _find_ngrams(self, input_list, n):
        """Returns the n-grams of ``input_list`` as tuples of ``n``
        consecutive items."""
        return zip(*[input_list[i:] for i in range(n)])

    def _clean_pos(self, pos_tagged):
        """Replaces, in place, every known fine-grained tag of
        ``pos_tagged`` by its coarse tag (NN, JJ, RB or VB) and returns the
        same list.  Unknown tags are left untouched.
        """
        for index, tag in enumerate(pos_tagged):
            pos_tagged[index] = _COARSE_POS.get(tag, tag)
        return pos_tagged

    def _extract_features(self, text):
        """Returns a dict mapping each category of ``self._categories`` to
        the percentage of matched words that belong to it.

        A word is a match when the first synset found for it in
        ``self._wn16`` has an emotion in ``self._total_synsets``.  That
        emotion is reduced to the name of ``get_level(5)`` before it is
        compared with the category lists.  All values are 0 when no word
        matches.
        """
        feature_set = {k: 0 for k in self._categories}
        words, _lemmas, pos_tagged = self._extract_ngrams(text)
        pos_tagged = self._clean_pos(pos_tagged)
        tag_wn = {
            'NN': self._wn16.NOUN,
            'JJ': self._wn16.ADJ,
            'VB': self._wn16.VERB,
            'RB': self._wn16.ADV,
        }

        matches = 0
        for word, tag in zip(words, pos_tagged):
            if tag not in tag_wn:
                continue
            synsets = self._wn16.synsets(word, tag_wn[tag])
            if not synsets:
                continue
            # Only the first synset returned for the word is considered.
            node = self._total_synsets[tag].get(synsets[0].offset())
            if node is None:
                # Unknown offset, or an offset whose category is unknown.
                continue
            emotion = node.get_level(5).name
            matches += 1
            for category, members in self._categories.items():
                if emotion in members:
                    feature_set[category] += 1

        # Avoid dividing by zero when nothing matched.
        if matches == 0:
            matches = 1
        for category in feature_set:
            feature_set[category] = (feature_set[category] / matches) * 100
        return feature_set

    def analyse_entry(self, entry, params):
        """Attaches to ``entry`` an ``EmotionSet`` holding one ``Emotion``
        per category, with the computed percentage as intensity, and yields
        the entry.
        """
        text_input = entry.get("text", None)

        text = self._my_preprocessor(text_input)

        feature_text = self._extract_features(text)

        emotionSet = EmotionSet(id="Emotions0")
        emotions = emotionSet.onyx__hasEmotion

        for category, intensity in feature_text.items():
            emotions.append(
                Emotion(
                    onyx__hasEmotionCategory=self._wnaffect_mappings[category],
                    onyx__hasEmotionIntensity=intensity))

        entry.emotions = [emotionSet]

        yield entry