Mirror of https://github.com/gsi-upm/senpy (synced 2025-11-04 09:18:16 +00:00)

Added SentiText plugin (for Spanish)

117687  sentiText/SentiWordNet_3.0.txt  Normal file
    File diff suppressed because it is too large
							
								
								
									
175  sentiText/sentitext.py  Normal file
							@@ -0,0 +1,175 @@
import os
import logging
import string
import nltk
import pickle

from sentiwn import SentiWordNet
from nltk.corpus import wordnet as wn
from textblob import TextBlob
from scipy.interpolate import interp1d
from os import path

from senpy.plugins import SentimentPlugin, SenpyPlugin
from senpy.models import Response, Opinion, Entry


logger = logging.getLogger(__name__)

class SentiTextPlugin(SentimentPlugin):

    def __init__(self, info, *args, **kwargs):
        super(SentiTextPlugin, self).__init__(info, *args, **kwargs)
        self.id = info['module']
        base = path.abspath(path.dirname(__file__))
        self.swn_path = path.join(base, info['sentiword_path'])
        self.pos_path = path.join(base, info['pos_path'])
        self._swn = None
        self._pos_tagger = None

    def _load_swn(self):
        swn = SentiWordNet(self.swn_path)
        return swn

    def _load_pos_tagger(self):
        with open(self.pos_path, 'r') as f:
            tagger = pickle.load(f)
        return tagger

    def activate(self, *args, **kwargs):
        self._swn = self._load_swn()
        self._pos_tagger = self._load_pos_tagger()
        logger.info("SentiText plugin is ready to go!")

    def deactivate(self, *args, **kwargs):
        logger.info("SentiText plugin is being deactivated...")



    def _remove_punctuation(self, tokens):
        return [t for t in tokens if t not in string.punctuation]

    def _tokenize(self, text):
        data = {}
        sentences = nltk.sent_tokenize(text)
        for i, sentence in enumerate(sentences):
            sentence_ = {}
            words = nltk.word_tokenize(sentence)
            sentence_['sentence'] = sentence
            tokens_ = [w.lower() for w in words]
            sentence_['tokens'] = self._remove_punctuation(tokens_)
            data[i] = sentence_
        return data

    def _pos(self, tokens):
        for i in tokens:
            tokens[i]['tokens'] = self._pos_tagger.tag(tokens[i]['tokens'])
        return tokens

    # def _stopwords(sentences, lang='english'):
    #     for i in sentences:
    #         sentences[i]['tokens'] = [t for t in sentences[i]['tokens'] if t not in nltk.corpus.stopwords.words(lang)]
    #     return sentences

    def _compare_synsets(self, synsets, tokens, i):
        for synset in synsets:
            for word in tokens[i]['lemmas']:
                for lemma in tokens[i]['lemmas'][word]:
                    synset_ = lemma.synset()
                    if synset == synset_:
                        return synset
        return None


    def analyse(self, **params):
        logger.debug("Analysing with params {}".format(params))

        text = params.get("input", None)
        tokens = self._tokenize(text)
        tokens = self._pos(tokens)


        for i in tokens:
            tokens[i]['lemmas'] = {}
            for w in tokens[i]['tokens']:
                lemmas = wn.lemmas(w[0], lang='spa')
                if len(lemmas) == 0:
                    continue
                tokens[i]['lemmas'][w[0]] = lemmas
        logger.debug("Tokens: {}".format(tokens))

        trans = TextBlob(unicode(text)).translate(from_lang='es',to='en')
        useful_synsets = {}
        for s_i, t_s in enumerate(trans.sentences):
            useful_synsets[s_i] = {}
            for w_i, t_w in enumerate(trans.sentences[s_i].words):
                synsets = wn.synsets(trans.sentences[s_i].words[w_i])
                if len(synsets) == 0:
                    continue
                eq_synset = self._compare_synsets(synsets, tokens, s_i)
                useful_synsets[s_i][t_w] = eq_synset
        logger.debug("Synsets used for analysis: {}".format(useful_synsets))

        scores = {}
        for i in tokens:
            scores[i] = {}
            for word in useful_synsets[i]:
                if useful_synsets[i][word] is None:
                    continue
                temp_scores = self._swn.get_score(useful_synsets[i][word].name().split('.')[0].replace(' ',' '))
                for score in temp_scores:
                    if score['synset'] == useful_synsets[i][word]:
                        t_score = score['pos'] - score['neg']
                        f_score = 'neu'
                        if t_score > 0:
                            f_score = 'pos'
                        elif t_score < 0:
                            f_score = 'neg'
                        score['score'] = f_score
                        scores[i][word] = score
                        break
        logger.debug("All scores (some not used): {}".format(scores))


        lang = params.get("language", "auto")
        p = params.get("prefix", None)
        response = Response(prefix=p)

        for i in scores:
            n_pos = 0.0
            n_neg = 0.0
            for w in scores[i]:
                if scores[i][w]['score'] == 'pos':
                    n_pos += 1.0
                elif scores[i][w]['score'] == 'neg':
                    n_neg += 1.0

            inter = interp1d([-1.0, 1.0], [0.0, 1.0])
            try:
                g_score = (n_pos - n_neg) / (n_pos + n_neg)
                g_score = float(inter(g_score))
            except:
                if n_pos == 0 and n_neg == 0:
                    g_score = 0.5

            polarity = 'marl:Neutral'
            if g_score > 0.5:
                polarity = 'marl:Positive'
            elif g_score < 0.5:
                polarity = 'marl:Negative'

            entry = Entry(id="Entry"+str(i),
                      text=tokens[i]['sentence'],
                      prefix=p)
            polarity
            opinion = Opinion(id="Opinion0"+'_'+str(i),
                          prefix=p,
                          hasPolarity=polarity,
                          polarityValue=float("{0:.2f}".format(g_score)))


            opinion["prov:wasGeneratedBy"] = self.id
            entry.opinions.append(opinion)
            entry.language = lang
            response.entries.append(entry)
        return response
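For reference, the per-sentence aggregation at the end of analyse() counts positively and negatively scored words, maps the ratio (n_pos - n_neg) / (n_pos + n_neg) from [-1, 1] onto [0, 1] with interp1d, and then thresholds at 0.5 to pick a marl polarity. A minimal standalone sketch of that step (the function name and the counts below are only for illustration, not part of the plugin):

    from scipy.interpolate import interp1d

    def aggregate(n_pos, n_neg):
        # Same mapping analyse() uses: raw ratio in [-1, 1] -> score in [0, 1]
        inter = interp1d([-1.0, 1.0], [0.0, 1.0])
        if n_pos == 0 and n_neg == 0:
            return 0.5, 'marl:Neutral'    # no scored words: neutral midpoint
        g_score = float(inter((n_pos - n_neg) / (n_pos + n_neg)))
        if g_score > 0.5:
            return g_score, 'marl:Positive'
        elif g_score < 0.5:
            return g_score, 'marl:Negative'
        return g_score, 'marl:Neutral'

    print(aggregate(3.0, 1.0))   # -> (0.75, 'marl:Positive')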
							
								
								
									
18  sentiText/sentitext.senpy  Normal file
							@@ -0,0 +1,18 @@
{
    "name": "SentiText",
    "module": "sentitext",
    "description": "Sentiment classifier using rule-based classification for Spanish. Based on english to spanish translation and SentiWordNet sentiment knowledge.",
    "author": "github.com/nachtkatze",
    "version": "0.1",
    "extra_params": {
        "language": {
            "aliases": ["language", "l"],
            "required": true,
            "options": ["es"],
            "default": "es"
        }
    },
    "requirements": {},
    "sentiword_path": "SentiWordNet_3.0.txt",
    "pos_path": "unigram_spanish.pickle"
}
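As a rough sketch of how this configuration and the plugin class fit together, the snippet below loads the info dict from sentitext.senpy and drives the plugin directly, outside a running senpy server. This is only illustrative: plugin loading is normally handled by senpy itself, the base class may expect additional fields, and it assumes the NLTK data, the pickled tagger and network access for TextBlob translation are available. The example sentence is made up.

    import json
    from sentitext import SentiTextPlugin

    with open('sentitext.senpy') as f:
        info = json.load(f)

    plugin = SentiTextPlugin(info)
    plugin.activate()   # loads SentiWordNet_3.0.txt and the POS tagger pickle
    resp = plugin.analyse(input=u"La comida estaba muy buena", language="es")
    print(resp)
    plugin.deactivate()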
							
								
								
									
70  sentiText/sentiwn.py  Normal file
							@@ -0,0 +1,70 @@
#!/usr/bin/env python
"""
Author : Jaganadh Gopinadhan <jaganadhg@gmail.com>
Copywright (C) : Jaganadh Gopinadhan

 Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
"""

import sys,os
import re

from nltk.corpus import wordnet

class SentiWordNet(object):
    """
    Interface to SentiWordNet
    """
    def __init__(self,swn_file):
        """
        """
        self.swn_file = swn_file
        self.pos_synset = self.__parse_swn_file()

    def __parse_swn_file(self):
        """
        Parse the SentiWordNet file and populate the POS and SynsetID hash
        """
        pos_synset_hash = {}
        swn_data = open(self.swn_file,'r').readlines()
        head_less_swn_data = filter((lambda line: not re.search(r"^\s*#",\
        line)), swn_data)

        for data in head_less_swn_data:
            fields = data.strip().split("\t")
            try:
                pos,syn_set_id,pos_score,neg_score,syn_set_score,\
                gloss = fields
            except:
                print "Found data without all details"
                pass

            if pos and syn_set_score:
                pos_synset_hash[(pos,int(syn_set_id))] = (float(pos_score),\
                float(neg_score))

        return pos_synset_hash

    def get_score(self,word,pos=None):
        """
        Get score for a given word/word pos combination
        """
        senti_scores = []
        synsets = wordnet.synsets(word,pos)
        for synset in synsets:
            if self.pos_synset.has_key((synset.pos(), synset.offset())):
                pos_val, neg_val = self.pos_synset[(synset.pos(), synset.offset())]
                senti_scores.append({"pos":pos_val,"neg":neg_val,\
                "obj": 1.0 - (pos_val - neg_val),'synset':synset})

        return senti_scores
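A quick, hypothetical way to sanity-check the SentiWordNet helper on its own (assumes the NLTK WordNet corpus is installed and SentiWordNet_3.0.txt sits in the working directory):

    from sentiwn import SentiWordNet

    swn = SentiWordNet("SentiWordNet_3.0.txt")
    # get_score() returns one dict per WordNet synset of the word,
    # with its positive, negative and objective scores.
    for entry in swn.get_score("good"):
        print(entry)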
							
								
								
									
95290  sentiText/unigram_spanish.pickle  Normal file
    File diff suppressed because it is too large