Update to senpy 0.20

2025-10-19 09:48:26 +00:00 · 2019-04-04 12:56:46 +02:00
parent fa993c6e2a
commit 4f286057c9
13 changed files with 239 additions and 247 deletions
--- a/sentiment-basic/sentiment-basic.py
+++ b/sentiment-basic/sentiment-basic.py
@@ -12,14 +12,14 @@ from textblob import TextBlob
 from scipy.interpolate import interp1d
 from os import path

-from senpy.plugins import SentimentPlugin, SenpyPlugin
-from senpy.models import Results, Entry, Sentiment
+from senpy.plugins import SentimentBox, SenpyPlugin
+from senpy.models import Results, Entry, Sentiment, Error

 if sys.version_info[0] >= 3:
    unicode = str


-class SentimentBasic(SentimentPlugin):
+class SentimentBasic(SentimentBox):
    '''
    Sentiment classifier using rule-based classification for Spanish. Based on english to spanish translation and SentiWordNet sentiment knowledge. This is a demo plugin that uses only some features from the TASS 2015 classifier. To use the entirely functional classifier you can use the service in: http://senpy.cluster.gsi.dit.upm.es.
    '''
@@ -28,10 +28,11 @@ class SentimentBasic(SentimentPlugin):
    version = "0.1.1"
    extra_params = {
        "language": {
+            "description": "language of the text",
            "aliases": ["language", "l"],
            "required": True,
-            "options": ["en","es", "it", "fr", "auto"],
-            "default": "auto"
+            "options": ["en","es", "it", "fr"],
+            "default": "en"
        }
    }
    sentiword_path = "SentiWordNet_3.0.txt"
@@ -40,6 +41,8 @@ class SentimentBasic(SentimentPlugin):
    minPolarityValue = -1
    nltk_resources = ['punkt','wordnet', 'omw']

+    with_polarity = False
+
    def _load_swn(self):
        self.swn_path = self.find_file(self.sentiword_path)
        swn = SentiWordNet(self.swn_path)
@@ -59,128 +62,116 @@ class SentimentBasic(SentimentPlugin):
        return [t for t in tokens if t not in string.punctuation]

    def _tokenize(self, text):
-        data = {}
-        sentences = nltk.sent_tokenize(text)
-        for i, sentence in enumerate(sentences):
-            sentence_ = {}
-            words = nltk.word_tokenize(sentence)
-            sentence_['sentence'] = sentence
-            tokens_ = [w.lower() for w in words]
-            sentence_['tokens'] = self._remove_punctuation(tokens_)
-            data[i] = sentence_
-        return data
+        lowered = [word.lower() for word in nltk.word_tokenize(text)]
+        # The whole text is handled as one unit: raw text plus its cleaned tokens.
+        return {
+            'sentence': text,
+            'tokens': self._remove_punctuation(lowered),
+        }

    def _pos(self, tokens):
-        for i in tokens:
-            tokens[i]['tokens'] = self._pos_tagger.tag(tokens[i]['tokens'])
+        tokens['tokens'] = self._pos_tagger.tag(tokens['tokens'])  # overwrite the token list in place with the tagger's output
        return tokens

-    def _compare_synsets(self, synsets, tokens, i):
+    def _compare_synsets(self, synsets, tokens):
        for synset in synsets:
-            for word in tokens[i]['lemmas']:
-                for lemma in tokens[i]['lemmas'][word]:
+            for lemmas in tokens['lemmas'].values():  # only the lemma lists are needed, not the words
+                for lemma in lemmas:
                    synset_ = lemma.synset() 
                    if synset == synset_:
                        return synset
        return None

-
-    def analyse_entry(self, entry, params):
-        language = params.get("language")
-        text = entry.text
+    def predict_one(self, features, activity):
+        '''Classify the text in features[0]; returns a one-hot [positive, neutral, negative] list.'''
+        language = activity.param("language")
+        text = features[0]
        tokens = self._tokenize(text)
        tokens = self._pos(tokens)
        sufixes = {'es':'spa','en':'eng','it':'ita','fr':'fra'}
-        for i in tokens:
-            tokens[i]['lemmas'] = {}
-            for w in tokens[i]['tokens']:
-                lemmas = wn.lemmas(w[0], lang=sufixes[language])
-                if len(lemmas) == 0:
-                    continue
-                tokens[i]['lemmas'][w[0]] = lemmas
+        tokens['lemmas'] = {}
+        for w in tokens['tokens']:
+            lemmas = wn.lemmas(w[0], lang=sufixes[language])
+            if len(lemmas) == 0:
+                continue
+            tokens['lemmas'][w[0]] = lemmas
        if language == "en":
            trans = TextBlob(unicode(text))
        else:
-            trans = TextBlob(unicode(text)).translate(from_lang=language,to='en')
+            try:
+                trans = TextBlob(unicode(text)).translate(from_lang=language,to='en')
+            except Exception as ex:
+                raise Error('Could not translate the text from "{}" to "{}": {}'.format(language,
+                                                                                    'en',
+                                                                                    str(ex)))
        useful_synsets = {}
-        for s_i, t_s in enumerate(trans.sentences):
-            useful_synsets[s_i] = {}
-            for w_i, t_w in enumerate(trans.sentences[s_i].words):
-                synsets = wn.synsets(trans.sentences[s_i].words[w_i])
-                if len(synsets) == 0:
-                    continue
-                eq_synset = self._compare_synsets(synsets, tokens, s_i)
-                useful_synsets[s_i][t_w] = eq_synset
+        for t_w in trans.words:  # every translated word, not just the first sentence; safe on empty text
+            synsets = wn.synsets(t_w)
+            if len(synsets) == 0:
+                continue
+            eq_synset = self._compare_synsets(synsets, tokens)
+            useful_synsets[t_w] = eq_synset
        scores = {}
-        for i in tokens:
-            scores[i] = {}
-            if useful_synsets != None:   
-                for word in useful_synsets[i]:
-                    if useful_synsets[i][word] is None:
-                        continue
-                    temp_scores = self._swn.get_score(useful_synsets[i][word].name().split('.')[0].replace(' ',' '))
-                    for score in temp_scores:
-                        if score['synset'] == useful_synsets[i][word]:
-                            t_score = score['pos'] - score['neg']
-                            f_score = 'neu'
-                            if t_score > 0:
-                                f_score = 'pos'
-                            elif t_score < 0:
-                                f_score = 'neg'
-                            score['score'] = f_score
-                            scores[i][word] = score
-                            break
-        p = params.get("prefix", None)
+        # scores: translated word -> SentiWordNet entry of its matched synset, tagged 'pos'/'neg'/'neu'
+        if useful_synsets is not None:
+            for word in useful_synsets:
+                if useful_synsets[word] is None:
+                    continue
+                temp_scores = self._swn.get_score(useful_synsets[word].name().split('.')[0].replace(' ',' '))
+                for score in temp_scores:
+                    if score['synset'] == useful_synsets[word]:
+                        t_score = score['pos'] - score['neg']
+                        f_score = 'neu'
+                        if t_score > 0:
+                            f_score = 'pos'
+                        elif t_score < 0:
+                            f_score = 'neg'
+                        score['score'] = f_score
+                        scores[word] = score
+                        break
+        g_score = 0.5  # neutral midpoint, kept when no word gets a polarity score
        for i in scores:
            n_pos = 0.0
            n_neg = 0.0
-            for w in scores[i]:
-                if scores[i][w]['score'] == 'pos':
+            for w in scores:
+                if scores[w]['score'] == 'pos':
                    n_pos += 1.0
-                elif scores[i][w]['score'] == 'neg':
+                elif scores[w]['score'] == 'neg':
                    n_neg += 1.0
            inter = interp1d([-1.0, 1.0], [0.0, 1.0])
+
            try:
                g_score = (n_pos - n_neg) / (n_pos + n_neg)
                g_score = float(inter(g_score))
            except:
                if n_pos == 0 and n_neg == 0:
                    g_score = 0.5
-            if g_score >= 0.5:
-                polarity = 'marl:Positive'
-                polarity_value = 1
-            elif g_score < 0.5:
-                polarity = 'marl:Negative'
-                polarity_value = -1
-            else:
-                polarity = 'marl:Neutral'
-                polarity_value = 0
-            opinion = Sentiment(id="Opinion0"+'_'+str(i),
-                          marl__hasPolarity=polarity,
-                          marl__polarityValue=polarity_value)

-            opinion.prov(self)
-            entry.sentiments.append(opinion)
+        if g_score > 0.5:  # Positive
+            return [1, 0, 0]
+        elif g_score < 0.5:  # Negative
+            return [0, 0, 1]
+        else:  # Neutral
+            return [0, 1, 0]

-        yield entry

    test_cases = [
        {
-            'input': u'Odio ir al cine',
+            'input': 'Odio ir al cine',
            'params': {'language': 'es'},
            'polarity': 'marl:Negative'

        },
        {
-            'input': u'El cielo está nublado',
+            'input': 'El cielo está nublado',
            'params': {'language': 'es'},
-            'polarity': 'marl:Positive'
+            'polarity': 'marl:Neutral'

        },
        {
-            'input': u'Esta tarta está muy buena',
+            'input': 'Esta tarta está muy buena',
            'params': {'language': 'es'},
-            'polarity': 'marl:Negative'
+            'polarity': 'marl:Negative' # SURPRISINGLY! The sentence praises the cake, yet this case pins a Negative result

        }
    ]