You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
senpy/emotion-anew/emotion-anew.py

155 lines
5.8 KiB
Python

# -*- coding: utf-8 -*-
import re
import nltk
import csv
import sys
import os
import unicodedata
import string
import xml.etree.ElementTree as ET
import math
from sklearn.svm import LinearSVC
from sklearn.feature_extraction import DictVectorizer
from nltk import bigrams
from nltk import trigrams
from nltk.corpus import stopwords
from pattern.en import parse as parse_en
from pattern.es import parse as parse_es
from senpy.plugins import SentimentPlugin, SenpyPlugin
from senpy.models import Results, EmotionSet, Entry, Emotion
class EmotionTextPlugin(SentimentPlugin):
def activate(self, *args, **kwargs):
self._stopwords = stopwords.words('english')
self._local_path=os.path.dirname(os.path.abspath(__file__))
def _my_preprocessor(self, text):
regHttp = re.compile('(http://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
regHttps = re.compile('(https://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
regAt = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')
text = re.sub(regHttp, '', text)
text = re.sub(regAt, '', text)
text = re.sub('RT : ', '', text)
text = re.sub(regHttps, '', text)
text = re.sub('[0-9]', '', text)
text = self._delete_punctuation(text)
return text
def _delete_punctuation(self, text):
exclude = set(string.punctuation)
s = ''.join(ch for ch in text if ch not in exclude)
return s
def _extract_ngrams(self, text, lang):
unigrams_lemmas = []
unigrams_words = []
pos_tagged = []
if lang == 'es':
sentences = parse_es(text,lemmata=True).split()
else:
sentences = parse_en(text,lemmata=True).split()
for sentence in sentences:
for token in sentence:
if token[0].lower() not in self._stopwords:
unigrams_words.append(token[0].lower())
unigrams_lemmas.append(token[4])
pos_tagged.append(token[1])
return unigrams_lemmas,unigrams_words,pos_tagged
def _find_ngrams(self, input_list, n):
return zip(*[input_list[i:] for i in range(n)])
def _emotion_calculate(self, VAD):
emotion=''
value=10000000000000000000000.0
for state in self.centroids:
valence=VAD[0]-self.centroids[state]['V']
arousal=VAD[1]-self.centroids[state]['A']
dominance=VAD[2]-self.centroids[state]['D']
new_value=math.sqrt((valence*valence)+(arousal*arousal)+(dominance*dominance))
if new_value < value:
value=new_value
emotion=state
return emotion
def _extract_features(self, tweet,dictionary,lang):
feature_set={}
ngrams_lemmas,ngrams_words,pos_tagged = self._extract_ngrams(tweet,lang)
pos_tags={'NN':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB',
'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VGB':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'}
totalVAD=[0,0,0]
matches=0
for word in range(len(ngrams_lemmas)):
VAD=[]
if ngrams_lemmas[word] in dictionary:
matches+=1
totalVAD = [totalVAD[0]+float(dictionary[ngrams_lemmas[word]]['V']),
totalVAD[1]+float(dictionary[ngrams_lemmas[word]]['A']),
totalVAD[2]+float(dictionary[ngrams_lemmas[word]]['D'])]
elif ngrams_words[word] in dictionary:
matches+=1
totalVAD = [totalVAD[0]+float(dictionary[ngrams_words[word]]['V']),
totalVAD[1]+float(dictionary[ngrams_words[word]]['A']),
totalVAD[2]+float(dictionary[ngrams_words[word]]['D'])]
if matches==0:
emotion='neutral'
else:
totalVAD=[totalVAD[0]/matches,totalVAD[1]/matches,totalVAD[2]/matches]
emotion=self._emotion_calculate(totalVAD)
feature_set['emotion']=emotion
feature_set['V']=totalVAD[0]
feature_set['A']=totalVAD[1]
feature_set['D']=totalVAD[2]
return feature_set
def analyse_entry(self, entry, params):
text_input = entry.get("text", None)
text= self._my_preprocessor(text_input)
dictionary={}
lang = params.get("language", "auto")
if lang == 'es':
with open(self._local_path+self.anew_path_es,'rb') as tabfile:
reader = csv.reader(tabfile, delimiter='\t')
for row in reader:
dictionary[row[2]]={}
dictionary[row[2]]['V']=row[3]
dictionary[row[2]]['A']=row[5]
dictionary[row[2]]['D']=row[7]
else:
with open(self._local_path+self.anew_path_en,'rb') as tabfile:
reader = csv.reader(tabfile, delimiter='\t')
for row in reader:
dictionary[row[0]]={}
dictionary[row[0]]['V']=row[2]
dictionary[row[0]]['A']=row[4]
dictionary[row[0]]['D']=row[6]
feature_set=self._extract_features(text,dictionary,lang)
emotions = EmotionSet()
emotions.id = "Emotions0"
emotion1 = Emotion(id="Emotion0")
emotion1["onyx:hasEmotionCategory"] = self.emotions_ontology[feature_set['emotion']]
emotion1["http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#valence"] = feature_set['V']
emotion1["http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#arousal"] = feature_set['A']
emotion1["http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#dominance"] = feature_set['D']
emotions.onyx__hasEmotion.append(emotion1)
entry.emotions = [emotions,]
yield entry