1
0
mirror of https://github.com/gsi-upm/senpy synced 2024-09-21 06:01:43 +00:00
senpy/sentiment-basic/sentiment-basic.py

178 lines
5.9 KiB
Python
Raw Normal View History

2018-06-14 17:38:08 +00:00
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
2019-01-09 18:29:24 +00:00
import sys
import string
import nltk
import pickle
from sentiwn import SentiWordNet
from nltk.corpus import wordnet as wn
from textblob import TextBlob
from scipy.interpolate import interp1d
from os import path
2019-04-04 10:56:46 +00:00
from senpy.plugins import SentimentBox, SenpyPlugin
from senpy.models import Results, Entry, Sentiment, Error
2019-01-09 18:29:24 +00:00
# Python 3 has no ``unicode`` builtin. Alias it to ``str`` so the rest of this
# module can call ``unicode(...)`` on either major version.
if sys.version_info[0] >= 3:
    unicode = str
2019-04-04 10:56:46 +00:00
class SentimentBasic(SentimentBox):
    '''
    Sentiment classifier using rule-based classification for Spanish. Based on english to spanish translation and SentiWordNet sentiment knowledge. This is a demo plugin that uses only some features from the TASS 2015 classifier. To use the entirely functional classifier you can use the service in: http://senpy.cluster.gsi.dit.upm.es.
    '''
    name = "sentiment-basic"
    author = "github.com/nachtkatze"
    version = "0.1.1"

    # Request parameters accepted by the plugin. ``language`` selects the
    # language of the input text and is read in ``predict_one``.
    extra_params = {
        "language": {
            "description": "language of the text",
            "aliases": ["language", "l"],
            "required": True,
            "options": ["en","es", "it", "fr"],
            "default": "en"
        }
    }

    # File names of the bundled resources; they are resolved with
    # ``self.find_file`` when the plugin is activated.
    sentiword_path = "SentiWordNet_3.0.txt"
    pos_path = "unigram_spanish.pickle"

    maxPolarityValue = 1
    minPolarityValue = -1

    # NOTE(review): presumably the NLTK corpora the framework must make
    # available before activation -- confirm against the senpy plugin docs.
    nltk_resources = ['punkt','wordnet', 'omw']

    # NOTE(review): not read anywhere in this class; presumably consumed by
    # the base class -- confirm.
    with_polarity = False

    def _load_swn(self):
        '''
        Resolve the SentiWordNet file and build the lexicon object.

        Stores the resolved path in ``self.swn_path`` and returns the
        ``SentiWordNet`` instance built from it.
        '''
        self.swn_path = self.find_file(self.sentiword_path)
        return SentiWordNet(self.swn_path)

    def _load_pos_tagger(self):
        '''
        Resolve the pickled POS tagger file and unpickle it.

        Overwrites ``self.pos_path`` with the resolved path and returns the
        unpickled tagger object.
        '''
        self.pos_path = self.find_file(self.pos_path)
        # NOTE(review): ``pickle.load`` can execute arbitrary code; this is
        # only safe as long as the pickle is a trusted, bundled resource.
        with open(self.pos_path, 'rb') as f:
            return pickle.load(f)

    def activate(self, *args, **kwargs):
        '''Load the SentiWordNet lexicon and the POS tagger into the plugin.'''
        self._swn = self._load_swn()
        self._pos_tagger = self._load_pos_tagger()

    def _remove_punctuation(self, tokens):
        '''
        Return ``tokens`` without the ones made only of punctuation.

        A token is dropped when every character belongs to
        ``string.punctuation``; this also covers multi-character tokens such
        as ``...`` as well as the empty string.
        '''
        punctuation = set(string.punctuation)
        return [t for t in tokens
                if not all(ch in punctuation for ch in t)]

    def _tokenize(self, text):
        '''
        Split ``text`` into lower-cased word tokens.

        Returns a dict with the original text under ``'sentence'`` and the
        punctuation-free token list under ``'tokens'``.
        '''
        lowered = [w.lower() for w in nltk.word_tokenize(text)]
        return {
            'sentence': text,
            'tokens': self._remove_punctuation(lowered),
        }

    def _pos(self, tokens):
        '''
        Replace ``tokens['tokens']`` with the tagger output, in place.

        The same dict is returned for convenience. Callers index each tagged
        item with ``[0]`` to get the word back.
        '''
        tokens['tokens'] = self._pos_tagger.tag(tokens['tokens'])
        return tokens

    def _compare_synsets(self, synsets, tokens):
        '''
        Return the first synset of ``synsets`` shared with the source lemmas.

        ``tokens['lemmas']`` maps each source word to its WordNet lemmas. The
        synsets of those lemmas are collected once, then ``synsets`` is
        scanned in order. Returns ``None`` when nothing matches.
        '''
        # Assumes synsets are hashable consistently with ``==`` (as NLTK
        # WordNet objects are), so a set lookup matches the pairwise test.
        known = set()
        for lemmas in tokens['lemmas'].values():
            known.update(lemma.synset() for lemma in lemmas)
        for synset in synsets:
            if synset in known:
                return synset
        return None

    def _collect_lemmas(self, tagged_tokens, language):
        '''Map each tagged source word to its non-empty WordNet lemma list.'''
        # Two-letter plugin codes -> codes passed as ``lang`` to ``wn.lemmas``.
        sufixes = {'es':'spa','en':'eng','it':'ita','fr':'fra'}
        wordnet_lang = sufixes[language]
        lemmas_by_word = {}
        for tagged in tagged_tokens:
            word = tagged[0]
            lemmas = wn.lemmas(word, lang=wordnet_lang)
            if lemmas:
                lemmas_by_word[word] = lemmas
        return lemmas_by_word

    def _to_english(self, text, language):
        '''
        Return ``text`` as an English ``TextBlob``.

        English input is wrapped as is; any other language is translated.
        Raises ``Error`` when building or translating the blob fails.
        '''
        if language == "en":
            return TextBlob(unicode(text))
        try:
            return TextBlob(unicode(text)).translate(from_lang=language,to='en')
        except Exception as ex:
            raise Error('Could not translate the text from "{}" to "{}": {}'.format(language,
                                                                                'en',
                                                                                str(ex)))

    def _match_synsets(self, english, tokens):
        '''
        Map each English word to the synset it shares with the source text.

        Words without any WordNet synset are skipped; words whose synsets do
        not match any source lemma map to ``None``.
        '''
        sentences = english.sentences
        if not sentences:
            # Nothing to score (e.g. empty text): avoid indexing an empty list.
            return {}
        matched = {}
        # NOTE(review): only the first sentence is scored, as before; later
        # sentences of a multi-sentence text are ignored.
        for word in sentences[0].words:
            synsets = wn.synsets(word)
            if not synsets:
                continue
            matched[word] = self._compare_synsets(synsets, tokens)
        return matched

    def _label_words(self, matched):
        '''
        Label each matched word as ``'pos'``, ``'neg'`` or ``'neu'``.

        The label is the sign of ``pos - neg`` in the SentiWordNet entry whose
        synset equals the matched one. Words with no such entry get no label.
        '''
        labels = {}
        for word, synset in matched.items():
            if synset is None:
                continue
            lemma_name = synset.name().split('.')[0]
            for entry in self._swn.get_score(lemma_name):
                if entry['synset'] == synset:
                    polarity = entry['pos'] - entry['neg']
                    if polarity > 0:
                        labels[word] = 'pos'
                    elif polarity < 0:
                        labels[word] = 'neg'
                    else:
                        labels[word] = 'neu'
                    break
        return labels

    def predict_one(self, features, activity):
        '''
        Classify the polarity of one text.

        ``features[0]`` is the text and ``activity.param("language")`` its
        language. The text is tokenized and tagged, its WordNet lemmas are
        collected, the text is translated to English, and every English word
        that shares a synset with the source lemmas is labelled using
        SentiWordNet.

        Returns ``[1, 0, 0]`` when positive words outnumber negative ones,
        ``[0, 0, 1]`` when negative words outnumber positive ones and
        ``[0, 1, 0]`` otherwise (including when no word could be scored).

        Raises ``Error`` if the translation fails.
        '''
        language = activity.param("language")
        text = features[0]

        tokens = self._pos(self._tokenize(text))
        tokens['lemmas'] = self._collect_lemmas(tokens['tokens'], language)

        english = self._to_english(text, language)
        labels = self._label_words(self._match_synsets(english, tokens))

        n_pos = sum(1 for label in labels.values() if label == 'pos')
        n_neg = sum(1 for label in labels.values() if label == 'neg')

        # Same decision as mapping (n_pos - n_neg) / (n_pos + n_neg) from
        # [-1, 1] onto [0, 1] and comparing with 0.5, without the division
        # by zero when there are no polar words.
        if n_pos > n_neg:  # Positive
            return [1, 0, 0]
        if n_pos < n_neg:  # Negative
            return [0, 0, 1]
        return [0, 1, 0]

    test_cases = [
        {
            'input': 'Odio ir al cine',
            'params': {'language': 'es'},
            'polarity': 'marl:Negative'
        },
        {
            'input': 'El cielo está nublado',
            'params': {'language': 'es'},
            'polarity': 'marl:Neutral'
        },
        {
            'input': 'Esta tarta está muy buena',
            'params': {'language': 'es'},
            'polarity': 'marl:Negative'  # SURPRISINGLY!
        }
    ]