1
0
mirror of https://github.com/gsi-upm/senpy synced 2025-08-24 02:22:20 +00:00

Update to senpy 0.20

This commit is contained in:
J. Fernando Sánchez
2019-04-04 12:56:46 +02:00
parent fa993c6e2a
commit 4f286057c9
13 changed files with 239 additions and 247 deletions

View File

@@ -12,14 +12,14 @@ from textblob import TextBlob
from scipy.interpolate import interp1d
from os import path
from senpy.plugins import SentimentPlugin, SenpyPlugin
from senpy.models import Results, Entry, Sentiment
from senpy.plugins import SentimentBox, SenpyPlugin
from senpy.models import Results, Entry, Sentiment, Error
if sys.version_info[0] >= 3:
unicode = str
class SentimentBasic(SentimentPlugin):
class SentimentBasic(SentimentBox):
'''
Sentiment classifier using rule-based classification for Spanish. Based on english to spanish translation and SentiWordNet sentiment knowledge. This is a demo plugin that uses only some features from the TASS 2015 classifier. To use the entirely functional classifier you can use the service in: http://senpy.cluster.gsi.dit.upm.es.
'''
@@ -28,10 +28,11 @@ class SentimentBasic(SentimentPlugin):
version = "0.1.1"
# Extra request parameters accepted by this plugin.
# The diff view listed both the removed and the added "options"/"default"
# entries; this is the post-commit definition (no "auto", default "en").
extra_params = {
    "language": {
        "description": "language of the text",
        "aliases": ["language", "l"],
        "required": True,
        "options": ["en", "es", "it", "fr"],
        "default": "en"
    }
}
sentiword_path = "SentiWordNet_3.0.txt"
@@ -40,6 +41,8 @@ class SentimentBasic(SentimentPlugin):
minPolarityValue = -1
nltk_resources = ['punkt','wordnet', 'omw']
with_polarity = False
def _load_swn(self):
self.swn_path = self.find_file(self.sentiword_path)
swn = SentiWordNet(self.swn_path)
@@ -59,128 +62,116 @@ class SentimentBasic(SentimentPlugin):
return [t for t in tokens if t not in string.punctuation]
def _tokenize(self, text):
    '''
    Tokenize ``text`` as a single unit.

    Returns a dict with two keys:
      * 'sentence': the text exactly as received.
      * 'tokens': the lower-cased word tokens produced by
        ``nltk.word_tokenize``, filtered through
        ``self._remove_punctuation``.

    The diff view interleaved the removed per-sentence implementation
    (which returned a dict indexed by sentence number) with this one;
    only the post-commit implementation is kept.
    '''
    lowered = [word.lower() for word in nltk.word_tokenize(text)]
    return {
        'sentence': text,
        'tokens': self._remove_punctuation(lowered),
    }
def _pos(self, tokens):
    '''
    Run the part-of-speech tagger over ``tokens['tokens']``.

    ``tokens`` is modified in place: its 'tokens' entry is replaced by
    whatever ``self._pos_tagger.tag`` returns for it. The same dict is
    returned for convenience.

    The diff view also contained the removed loop that indexed ``tokens``
    by sentence number; only the post-commit statement is kept.
    '''
    tokens['tokens'] = self._pos_tagger.tag(tokens['tokens'])
    return tokens
def _compare_synsets(self, synsets, tokens):
    '''
    Return the first element of ``synsets`` that equals the synset of any
    lemma stored in ``tokens['lemmas']``, or None when nothing matches.

    ``tokens['lemmas']`` maps a word to a list of lemma objects; each lemma
    must provide a ``synset()`` method. Candidates are checked in the order
    given by ``synsets``.

    The diff view contained both the old signature (with a sentence index
    ``i``) and the new one; only the post-commit form is kept.
    '''
    # Collect the lemma synsets once instead of re-walking the lemma lists
    # for every candidate. A list (not a set) keeps the original ``==``
    # comparison semantics.
    lemma_synsets = [
        lemma.synset()
        for lemmas in tokens['lemmas'].values()
        for lemma in lemmas
    ]
    for synset in synsets:
        if synset in lemma_synsets:
            return synset
    return None
def predict_one(self, features, activity):
    '''
    Classify one text and return its polarity as a one-hot list.

    Parameters:
      * features: sequence whose first element is the text to classify.
      * activity: object whose ``param("language")`` gives the language
        code of the text ('en', 'es', 'it' or 'fr').

    Returns ``[1, 0, 0]`` for positive, ``[0, 1, 0]`` for neutral and
    ``[0, 0, 1]`` for negative.

    Raises ``Error`` when a non-English text cannot be translated.

    The diff view interleaved the removed ``analyse_entry`` implementation
    with this one; only the post-commit method is kept, with the leftover
    per-sentence scaffolding removed.
    '''
    language = activity.param("language")
    text = features[0]

    tokens = self._pos(self._tokenize(text))

    # WordNet language codes for the supported languages.
    sufixes = {'es': 'spa', 'en': 'eng', 'it': 'ita', 'fr': 'fra'}
    tokens['lemmas'] = {}
    for w in tokens['tokens']:
        # w[0] is the word of the tagged token.
        lemmas = wn.lemmas(w[0], lang=sufixes[language])
        if len(lemmas) == 0:
            continue
        tokens['lemmas'][w[0]] = lemmas

    if language == "en":
        trans = TextBlob(unicode(text))
    else:
        try:
            trans = TextBlob(unicode(text)).translate(from_lang=language, to='en')
        except Exception as ex:
            raise Error('Could not translate the text from "{}" to "{}": {}'.format(language,
                                                                                'en',
                                                                                str(ex)))

    # Map every translated word to the synset it shares with the original
    # text (or None). All words of the translation are used: indexing only
    # ``trans.sentences[0]`` ignored every sentence after the first and
    # raised IndexError when the translation had no sentences.
    useful_synsets = {}
    for t_w in trans.words:
        synsets = wn.synsets(t_w)
        if len(synsets) == 0:
            continue
        useful_synsets[t_w] = self._compare_synsets(synsets, tokens)

    # Label each matched word 'pos', 'neg' or 'neu' from its SentiWordNet
    # entry.
    scores = {}
    for word, synset in useful_synsets.items():
        if synset is None:
            continue
        # NOTE(review): the replace() below is a no-op as it appears in the
        # source view; it is kept verbatim -- confirm the intended characters.
        temp_scores = self._swn.get_score(synset.name().split('.')[0].replace(' ',' '))
        for score in temp_scores:
            if score['synset'] == synset:
                t_score = score['pos'] - score['neg']
                f_score = 'neu'
                if t_score > 0:
                    f_score = 'pos'
                elif t_score < 0:
                    f_score = 'neg'
                score['score'] = f_score
                scores[word] = score
                break

    n_pos = 0.0
    n_neg = 0.0
    for score in scores.values():
        if score['score'] == 'pos':
            n_pos += 1.0
        elif score['score'] == 'neg':
            n_neg += 1.0

    # With no polar words the ratio is undefined, so the text is neutral
    # (this replaces the bare ``except`` that caught the division by zero).
    g_score = 0.5
    if n_pos + n_neg > 0:
        # Rescale the ratio from [-1, 1] to [0, 1].
        inter = interp1d([-1.0, 1.0], [0.0, 1.0])
        g_score = float(inter((n_pos - n_neg) / (n_pos + n_neg)))

    if g_score > 0.5:  # Positive
        return [1, 0, 0]
    elif g_score < 0.5:  # Negative
        return [0, 0, 1]
    else:
        return [0, 1, 0]
# Test cases declared by the plugin: input text, request parameters and the
# expected polarity. The diff view listed both the removed and the added
# 'input'/'polarity' entries; these are the post-commit expectations.
test_cases = [
    {
        'input': 'Odio ir al cine',
        'params': {'language': 'es'},
        'polarity': 'marl:Negative'
    },
    {
        'input': 'El cielo está nublado',
        'params': {'language': 'es'},
        'polarity': 'marl:Neutral'
    },
    {
        'input': 'Esta tarta está muy buena',
        'params': {'language': 'es'},
        # The sentence is positive, but the expected value records what the
        # classifier currently outputs.
        'polarity': 'marl:Negative'  # SURPRISINGLY!
    }
]