Added SentiText plugin (for Spanish)

2025-11-28 18:28:16 +00:00 · 2015-10-30 17:58:37 +01:00
parent 94d82238b8
commit 17976d85b1
5 changed files with 213240 additions and 0 deletions
--- a/sentiText/SentiWordNet_3.0.txt
+++ b/sentiText/SentiWordNet_3.0.txt
--- a/sentiText/sentitext.py
+++ b/sentiText/sentitext.py
@@ -0,0 +1,175 @@
+import os
+import logging
+import string
+import nltk
+import pickle
+
+from sentiwn import SentiWordNet
+from nltk.corpus import wordnet as wn
+from textblob import TextBlob
+from scipy.interpolate import interp1d
+from os import path
+
+from senpy.plugins import SentimentPlugin, SenpyPlugin
+from senpy.models import Response, Opinion, Entry
+
+
+# Module-level logger named after this module; used by the plugin class below.
+logger = logging.getLogger(__name__)
+
+class SentiTextPlugin(SentimentPlugin):
+    """Rule-based sentiment plugin for Spanish text.
+
+    The input is split into sentences, every token is looked up in the
+    Spanish WordNet, and the whole text is translated to English so that
+    the synsets shared by both versions can be scored with SentiWordNet.
+    """
+
+    def __init__(self, info, *args, **kwargs):
+        """Store the resource paths taken from the plugin description.
+
+        :param info: plugin description; must provide ``module``,
+            ``sentiword_path`` and ``pos_path``. Both paths are resolved
+            relative to the directory that contains this file.
+        """
+        super(SentiTextPlugin, self).__init__(info, *args, **kwargs)
+        self.id = info['module']
+        base = path.abspath(path.dirname(__file__))
+        self.swn_path = path.join(base, info['sentiword_path'])
+        self.pos_path = path.join(base, info['pos_path'])
+        # Heavy resources are only loaded by activate().
+        self._swn = None
+        self._pos_tagger = None
+
+    def _load_swn(self):
+        """Build the SentiWordNet index from ``self.swn_path``."""
+        swn = SentiWordNet(self.swn_path)
+        return swn
+
+    def _load_pos_tagger(self):
+        """Unpickle and return the tagger stored at ``self.pos_path``."""
+        # Pickles are binary data: text mode corrupts them on platforms that
+        # translate newlines and fails outright on Python 3.
+        # NOTE(security): pickle.load can execute arbitrary code, so the
+        # file must come from a trusted source.
+        with open(self.pos_path, 'rb') as f:
+            tagger = pickle.load(f)
+        return tagger
+
+    def activate(self, *args, **kwargs):
+        """Load SentiWordNet and the tagger so analyse() can be called."""
+        self._swn = self._load_swn()
+        self._pos_tagger = self._load_pos_tagger()
+        logger.info("SentiText plugin is ready to go!")
+
+    def deactivate(self, *args, **kwargs):
+        """Log the deactivation; loaded resources are left in place."""
+        logger.info("SentiText plugin is being deactivated...")
+
+    def _remove_punctuation(self, tokens):
+        """Return ``tokens`` without the ones made only of punctuation.
+
+        A per-character test is used (instead of a substring test against
+        ``string.punctuation``) so that multi-character tokens such as
+        ``...`` or the tokenizer's paired quote marks are dropped as well.
+        """
+        punctuation = set(string.punctuation)
+        return [t for t in tokens
+                if not all(ch in punctuation for ch in t)]
+
+    def _tokenize(self, text):
+        """Split ``text`` into sentences and lower-cased word tokens.
+
+        :returns: dict mapping the sentence index to a dict with the keys
+            ``sentence`` (original sentence) and ``tokens`` (lower-cased
+            tokens without punctuation).
+        """
+        data = {}
+        sentences = nltk.sent_tokenize(text)
+        for i, sentence in enumerate(sentences):
+            sentence_ = {}
+            words = nltk.word_tokenize(sentence)
+            sentence_['sentence'] = sentence
+            tokens_ = [w.lower() for w in words]
+            sentence_['tokens'] = self._remove_punctuation(tokens_)
+            data[i] = sentence_
+        return data
+
+    def _pos(self, tokens):
+        """Replace every sentence's token list with the tagger's output.
+
+        The tagger presumably yields ``(token, tag)`` pairs; analyse()
+        relies on element ``[0]`` being the word. Mutates and returns
+        ``tokens``.
+        """
+        for i in tokens:
+            tokens[i]['tokens'] = self._pos_tagger.tag(tokens[i]['tokens'])
+        return tokens
+
+    # def _stopwords(sentences, lang='english'):
+    #     for i in sentences:
+    #         sentences[i]['tokens'] = [t for t in sentences[i]['tokens'] if t not in nltk.corpus.stopwords.words(lang)]
+    #     return sentences
+
+    def _compare_synsets(self, synsets, tokens, i):
+        """Return the first of ``synsets`` shared with sentence ``i``.
+
+        :param synsets: candidate synsets of a translated (English) word.
+        :param tokens: per-sentence data carrying a ``lemmas`` mapping.
+        :param i: sentence index.
+        :returns: the first candidate that is also the synset of one of
+            the sentence's lemmas, or ``None`` when there is no such
+            synset or the sentence index is unknown.
+        """
+        sentence = tokens.get(i)
+        if sentence is None:
+            # The translation produced more sentences than the original.
+            return None
+        # Resolve each lemma's synset once instead of once per candidate.
+        lemma_synsets = [lemma.synset()
+                         for lemmas in sentence.get('lemmas', {}).values()
+                         for lemma in lemmas]
+        for synset in synsets:
+            if synset in lemma_synsets:
+                return synset
+        return None
+
+    def _attach_lemmas(self, tokens):
+        """Add to each sentence a ``lemmas`` dict: word -> Spanish lemmas."""
+        for i in tokens:
+            tokens[i]['lemmas'] = {}
+            for w in tokens[i]['tokens']:
+                lemmas = wn.lemmas(w[0], lang='spa')
+                if not lemmas:
+                    continue
+                tokens[i]['lemmas'][w[0]] = lemmas
+
+    def _find_useful_synsets(self, text, tokens):
+        """Translate ``text`` to English and match synsets per sentence.
+
+        :returns: dict mapping the translated sentence index to a dict of
+            translated word -> matching synset (or ``None``).
+        """
+        trans = TextBlob(text).translate(from_lang='es', to='en')
+        useful_synsets = {}
+        for s_i, t_s in enumerate(trans.sentences):
+            useful_synsets[s_i] = {}
+            for t_w in t_s.words:
+                synsets = wn.synsets(t_w)
+                if not synsets:
+                    continue
+                useful_synsets[s_i][t_w] = self._compare_synsets(
+                    synsets, tokens, s_i)
+        return useful_synsets
+
+    def _score_synsets(self, tokens, useful_synsets):
+        """Look up the SentiWordNet score of every matched synset.
+
+        :returns: dict mapping the sentence index to a dict of word ->
+            score dict; each score dict gains a ``score`` key holding
+            ``'pos'``, ``'neg'`` or ``'neu'``.
+        """
+        scores = {}
+        for i in tokens:
+            scores[i] = {}
+            # The translation may split into fewer sentences than the
+            # original text; such sentences simply get no scores.
+            for word, synset in useful_synsets.get(i, {}).items():
+                if synset is None:
+                    continue
+                head_word = synset.name().split('.')[0]
+                for score in self._swn.get_score(head_word):
+                    if score['synset'] == synset:
+                        t_score = score['pos'] - score['neg']
+                        f_score = 'neu'
+                        if t_score > 0:
+                            f_score = 'pos'
+                        elif t_score < 0:
+                            f_score = 'neg'
+                        score['score'] = f_score
+                        scores[i][word] = score
+                        break
+        return scores
+
+    def analyse(self, **params):
+        """Return a Response holding one Entry and Opinion per sentence.
+
+        Recognised params: ``input`` (the Spanish text), ``language``
+        (copied onto every entry, default ``"auto"``) and ``prefix``.
+        A missing, empty or sentence-less ``input`` yields a Response
+        without entries.
+        """
+        logger.debug("Analysing with params {}".format(params))
+
+        lang = params.get("language", "auto")
+        p = params.get("prefix", None)
+        response = Response(prefix=p)
+
+        text = params.get("input", None)
+        if not text:
+            return response
+        if isinstance(text, bytes):
+            # Decode once at the boundary; relying on the implicit ASCII
+            # codec fails for accented Spanish characters.
+            text = text.decode('utf-8')
+
+        tokens = self._pos(self._tokenize(text))
+        if not tokens:
+            return response
+        self._attach_lemmas(tokens)
+        logger.debug("Tokens: {}".format(tokens))
+
+        useful_synsets = self._find_useful_synsets(text, tokens)
+        logger.debug("Synsets used for analysis: {}".format(useful_synsets))
+
+        scores = self._score_synsets(tokens, useful_synsets)
+        logger.debug("All scores (some not used): {}".format(scores))
+
+        # Maps the [-1, 1] balance of positive/negative words onto [0, 1].
+        inter = interp1d([-1.0, 1.0], [0.0, 1.0])
+        # Sorted so that the entries follow the order of the sentences.
+        for i in sorted(scores):
+            n_pos = 0.0
+            n_neg = 0.0
+            for w in scores[i]:
+                if scores[i][w]['score'] == 'pos':
+                    n_pos += 1.0
+                elif scores[i][w]['score'] == 'neg':
+                    n_neg += 1.0
+
+            total = n_pos + n_neg
+            if total == 0:
+                # No polar words at all: the sentence is neutral.
+                g_score = 0.5
+            else:
+                g_score = float(inter((n_pos - n_neg) / total))
+
+            polarity = 'marl:Neutral'
+            if g_score > 0.5:
+                polarity = 'marl:Positive'
+            elif g_score < 0.5:
+                polarity = 'marl:Negative'
+
+            entry = Entry(id="Entry"+str(i),
+                          text=tokens[i]['sentence'],
+                          prefix=p)
+            opinion = Opinion(id="Opinion0"+'_'+str(i),
+                              prefix=p,
+                              hasPolarity=polarity,
+                              polarityValue=float("{0:.2f}".format(g_score)))
+
+            opinion["prov:wasGeneratedBy"] = self.id
+            entry.opinions.append(opinion)
+            entry.language = lang
+            response.entries.append(entry)
+        return response
--- a/sentiText/sentitext.senpy
+++ b/sentiText/sentitext.senpy
@@ -0,0 +1,18 @@
+{
+    "name": "SentiText",
+    "module": "sentitext",
+    "description": "Sentiment classifier using rule-based classification for Spanish. Based on Spanish-to-English translation and SentiWordNet sentiment knowledge.",
+    "author": "github.com/nachtkatze",
+    "version": "0.1",
+    "extra_params": {
+        "language": {
+            "aliases": ["language", "l"],
+            "required": true,
+            "options": ["es"],
+            "default": "es"
+        }
+    },
+    "requirements": {},
+    "sentiword_path": "SentiWordNet_3.0.txt",
+    "pos_path": "unigram_spanish.pickle"
+}
--- a/sentiText/sentiwn.py
+++ b/sentiText/sentiwn.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+"""
+Author : Jaganadh Gopinadhan <jaganadhg@gmail.com>
+Copyright (C) : Jaganadh Gopinadhan
+
+ Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+ 
+      http://www.apache.org/licenses/LICENSE-2.0
+ 
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+"""
+
+import sys,os
+import re
+
+from nltk.corpus import wordnet
+
+class SentiWordNet(object):
+    """
+    Interface to SentiWordNet
+
+    Parses a SentiWordNet data file once and answers score queries for
+    the WordNet synsets of a word.
+    """
+    def __init__(self, swn_file):
+        """
+        Parse ``swn_file`` and keep the result in ``self.pos_synset``, a
+        dict mapping ``(pos, synset_offset)`` to ``(pos_score, neg_score)``.
+        """
+        self.swn_file = swn_file
+        self.pos_synset = self.__parse_swn_file()
+
+    def __parse_swn_file(self):
+        """
+        Parse the SentiWordNet file and populate the POS and SynsetID hash
+
+        Comment lines (starting with ``#``) and blank lines are skipped.
+        Lines that do not have exactly six tab-separated fields are
+        reported and skipped.
+        """
+        pos_synset_hash = {}
+        # The context manager closes the file; iterating it avoids holding
+        # a second full copy of the (large) data file in memory.
+        with open(self.swn_file, 'r') as swn_data:
+            for data in swn_data:
+                if re.search(r"^\s*#", data):
+                    continue
+                data = data.strip()
+                if not data:
+                    continue
+                fields = data.split("\t")
+                try:
+                    pos, syn_set_id, pos_score, neg_score, syn_set_score, \
+                        gloss = fields
+                except ValueError:
+                    # Skip the line: falling through would re-use the
+                    # previous line's fields (or fail with unbound names
+                    # when the very first data line is malformed).
+                    print("Found data without all details")
+                    continue
+
+                if pos and syn_set_score:
+                    pos_synset_hash[(pos, int(syn_set_id))] = (
+                        float(pos_score), float(neg_score))
+
+        return pos_synset_hash
+
+    def get_score(self, word, pos=None):
+        """
+        Get score for a given word/word pos combination
+
+        Returns a list with one dict per scored synset of ``word``; each
+        dict has the keys ``pos``, ``neg``, ``obj`` and ``synset``.
+        """
+        senti_scores = []
+        for synset in wordnet.synsets(word, pos):
+            synset_pos = synset.pos()
+            if synset_pos == 's':
+                # WordNet marks satellite adjectives with 's', whereas the
+                # SentiWordNet file lists them under 'a'.
+                synset_pos = 'a'
+            key = (synset_pos, synset.offset())
+            if key in self.pos_synset:
+                pos_val, neg_val = self.pos_synset[key]
+                # SentiWordNet defines ObjScore = 1 - (PosScore + NegScore).
+                senti_scores.append({"pos": pos_val, "neg": neg_val,
+                                     "obj": 1.0 - (pos_val + neg_val),
+                                     'synset': synset})
+
+        return senti_scores
--- a/sentiText/unigram_spanish.pickle
+++ b/sentiText/unigram_spanish.pickle