# -*- coding: utf-8 -*-

import re
import csv
import string

from nltk.corpus import stopwords

from pattern.en import parse as parse_en
from pattern.es import parse as parse_es

from senpy.plugins import EmotionPlugin
from senpy.models import EmotionSet, Emotion


class ANEW(EmotionPlugin):
    description = ("An emotion classifier that uses the ANEW lexicon. "
                   "It averages the VAD (valence-arousal-dominance) values "
                   "of every word in the text that also appears in the ANEW "
                   "dictionary. To obtain a categorical value (e.g., happy), "
                   "use the emotion conversion API (e.g., "
                   "`emotion-model=emoml:big6`).")
    author = "@icorcuera"
    version = "0.5.2"
    name = "emotion-anew"

    extra_params = {
        "language": {
            "description": "language of the input",
            "aliases": ["language", "l"],
            "required": True,
            "options": ["es", "en"],
            "default": "en"
        }
    }

    # Tab-separated lexicon files: the Spanish ANEW adaptation
    # (Redondo et al., 2007) and the English ANEW 2010 norms.
    anew_path_es = "Dictionary/Redondo(2007).csv"
    anew_path_en = "Dictionary/ANEW2010All.txt"

    onyx__usesEmotionModel = "emoml:pad-dimensions"

    nltk_resources = ['stopwords']
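
    # Example request, assuming a senpy server with this plugin loaded
    # (`i` is senpy's usual alias for the input-text parameter):
    #
    #   GET /api/?algo=emotion-anew&language=en&i=I+hate+you
    #
    # Adding `emotion-model=emoml:big6` converts the PAD dimensions into a
    # categorical emotion, as mentioned in `description` above.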

    def activate(self, *args, **kwargs):
        self._stopwords = stopwords.words('english')
        # Load both lexicons into nested dicts:
        #   dictionary[lang][word] -> {'V': ..., 'A': ..., 'D': ...}
        # The column layout differs per file: the Spanish file keeps the word
        # in column 2 and V/A/D in columns 3/5/7; the English file keeps the
        # word in column 0 and V/A/D in columns 2/4/6.
        dictionary = {}
        dictionary['es'] = {}
        with self.open(self.anew_path_es, 'r') as tabfile:
            reader = csv.reader(tabfile, delimiter='\t')
            for row in reader:
                dictionary['es'][row[2]] = {}
                dictionary['es'][row[2]]['V'] = row[3]
                dictionary['es'][row[2]]['A'] = row[5]
                dictionary['es'][row[2]]['D'] = row[7]
        dictionary['en'] = {}
        with self.open(self.anew_path_en, 'r') as tabfile:
            reader = csv.reader(tabfile, delimiter='\t')
            for row in reader:
                dictionary['en'][row[0]] = {}
                dictionary['en'][row[0]]['V'] = row[2]
                dictionary['en'][row[0]]['A'] = row[4]
                dictionary['en'][row[0]]['D'] = row[6]
        self._dictionary = dictionary

    def _my_preprocessor(self, text):
        # Strip URLs, @-mentions, retweet markers, digits and punctuation
        # before the lexicon lookup. The dots in the URL patterns are escaped
        # so they only match literal dots.
        regHttp = re.compile(r'(http://)[a-zA-Z0-9]*\.[a-zA-Z0-9/]*(\.[a-zA-Z0-9]*)?')
        regHttps = re.compile(r'(https://)[a-zA-Z0-9]*\.[a-zA-Z0-9/]*(\.[a-zA-Z0-9]*)?')
        regAt = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')
        text = re.sub(regHttp, '', text)
        text = re.sub(regAt, '', text)
        text = re.sub('RT :', '', text)
        text = re.sub(regHttps, '', text)
        text = re.sub('[0-9]', '', text)
        text = self._delete_punctuation(text)
        return text

    def _delete_punctuation(self, text):
        exclude = set(string.punctuation)
        s = ''.join(ch for ch in text if ch not in exclude)
        return s
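
    # Illustrative run of the two helpers above (worked out by hand from the
    # substitution rules; exact whitespace may differ):
    #   _my_preprocessor('RT : @user check https://t.co/x 10 times!')
    # drops the retweet marker, the mention, the URL, the digits and the
    # trailing '!', leaving roughly 'check times' plus leftover spaces.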

    def _extract_ngrams(self, text, lang):
        unigrams_lemmas = []
        unigrams_words = []
        pos_tagged = []
        if lang == 'es':
            sentences = list(parse_es(text, lemmata=True).split())
        else:
            sentences = list(parse_en(text, lemmata=True).split())
        for sentence in sentences:
            for token in sentence:
                if token[0].lower() not in self._stopwords:
                    unigrams_words.append(token[0].lower())
                    unigrams_lemmas.append(token[4])
                    pos_tagged.append(token[1])
        return unigrams_lemmas, unigrams_words, pos_tagged
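
    # pattern's parse(..., lemmata=True).split() returns one list per
    # sentence, and one [word, POS, chunk, PNP, lemma] list per token, which
    # is why _extract_ngrams reads the word from token[0], the POS tag from
    # token[1] and the lemma from token[4]. Roughly:
    #   parse_en('the dogs barked', lemmata=True).split()
    #   # -> [[['the', 'DT', 'B-NP', 'O', 'the'],
    #   #      ['dogs', 'NNS', 'I-NP', 'O', 'dog'],
    #   #      ['barked', 'VBD', 'B-VP', 'O', 'bark']]]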

    def _find_ngrams(self, input_list, n):
        return zip(*[input_list[i:] for i in range(n)])
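
    # _find_ngrams slides an n-wide window over a list, e.g.
    #   list(self._find_ngrams(['a', 'b', 'c', 'd'], 2))
    #   # -> [('a', 'b'), ('b', 'c'), ('c', 'd')]
    # It is currently not called by _extract_features, which only uses
    # unigrams.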

    def _extract_features(self, tweet, dictionary, lang):
        feature_set = {}
        ngrams_lemmas, ngrams_words, pos_tagged = self._extract_ngrams(tweet, lang)
        # Coarse POS groups (currently unused by the lookup below; the Penn
        # Treebank tag for gerunds is VBG).
        pos_tags = {'NN': 'NN', 'NNS': 'NN',
                    'JJ': 'JJ', 'JJR': 'JJ', 'JJS': 'JJ',
                    'RB': 'RB', 'RBR': 'RB', 'RBS': 'RB',
                    'VB': 'VB', 'VBD': 'VB', 'VBG': 'VB', 'VBN': 'VB',
                    'VBP': 'VB', 'VBZ': 'VB'}
        totalVAD = [0, 0, 0]
        matches = 0
        # Look up each token, preferring the lemma and falling back to the
        # surface form, and accumulate the V/A/D values of every hit.
        for lemma, surface in zip(ngrams_lemmas, ngrams_words):
            if lemma in dictionary:
                matches += 1
                entry_vad = dictionary[lemma]
            elif surface in dictionary:
                matches += 1
                entry_vad = dictionary[surface]
            else:
                continue
            totalVAD = [totalVAD[0] + float(entry_vad['V']),
                        totalVAD[1] + float(entry_vad['A']),
                        totalVAD[2] + float(entry_vad['D'])]
        if matches == 0:
            # No lexicon hits: the VAD vector stays at (0, 0, 0).
            emotion = 'neutral'
        else:
            totalVAD = [totalVAD[0] / matches,
                        totalVAD[1] / matches,
                        totalVAD[2] / matches]
        feature_set['V'] = totalVAD[0]
        feature_set['A'] = totalVAD[1]
        feature_set['D'] = totalVAD[2]
        return feature_set
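
    # Worked example: for 'I hate you', 'i' and 'you' are NLTK stopwords, so
    # only 'hate' reaches the lexicon lookup and the averages equal that
    # single entry's values: V=2.12, A=6.95, D=5.05 (see the first entry in
    # test_cases below).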

    def analyse_entry(self, entry, activity):
        params = activity.params

        text_input = entry.text
        text = self._my_preprocessor(text_input)
        dictionary = self._dictionary[params['language']]

        feature_set = self._extract_features(text, dictionary, params['language'])

        emotions = EmotionSet()
        emotions.id = "Emotions0"
        emotion1 = Emotion(id="Emotion0")
        # Map the averaged VAD values onto the PAD dimensions of the emotion
        # model declared in onyx__usesEmotionModel above.
        emotion1["emoml:pad-dimensions_pleasure"] = feature_set['V']
        emotion1["emoml:pad-dimensions_arousal"] = feature_set['A']
        emotion1["emoml:pad-dimensions_dominance"] = feature_set['D']

        emotion1.prov(activity)
        emotions.prov(activity)

        emotions.onyx__hasEmotion.append(emotion1)
        entry.emotions = [emotions]

        yield entry

    ontology = "http://gsi.dit.upm.es/ontologies/wnaffect/ns#"

    test_cases = [
        {
            'name': 'anger with VAD=(2.12, 6.95, 5.05)',
            'input': 'I hate you',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#arousal": 6.95,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#dominance": 5.05,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#valence": 2.12,
                    }]
                }]
            }
        }, {
            'name': 'sadness',
            'input': 'i am sad',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#arousal": 4.13,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#dominance": 3.45,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#valence": 1.61,
                    }]
                }]
            }
        }, {
            'name': 'joy',
            'input': 'i am happy with my marks',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#arousal": 6.49,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#dominance": 6.63,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#valence": 8.21,
                    }]
                }]
            }
        }, {
            'name': 'negative-feat',
            'input': 'This movie is scary',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#arousal": 5.8100000000000005,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#dominance": 4.33,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#valence": 5.050000000000001,
                    }]
                }]
            }
        }, {
            'name': 'negative-fear',
            'input': 'this cake is disgusting',
            'expected': {
                'onyx:hasEmotionSet': [{
                    'onyx:hasEmotion': [{
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#arousal": 5.09,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#dominance": 4.4,
                        "http://www.gsi.dit.upm.es/ontologies/onyx/vocabularies/anew/ns#valence": 5.109999999999999,
                    }]
                }]
            }
        }
    ]
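

# Quick self-check sketch: senpy's plugin-development docs provide an
# easy_test() helper that loads the plugin defined in this file and runs its
# test_cases (this assumes a senpy installation that exports easy_test).
if __name__ == '__main__':
    from senpy import easy_test
    easy_test()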