Merge commit '7c959aace896e9d318497a417e0eec8f78b62314' as 'sentiment-basic'

2026-06-02 05:21:59 +00:00 · 2018-06-12 10:01:45 +02:00
parent 2a4cc96905 7c959aace8
commit e51b659030
7 changed files with 316 additions and 0 deletions
--- a/sentiment-basic/.gitmodules
+++ b/sentiment-basic/.gitmodules
@@ -0,0 +1,3 @@
 [submodule "data"]
 	path = data
 	url = ../data/sentiment-basic
--- a/sentiment-basic/README.md
+++ b/sentiment-basic/README.md
@@ -0,0 +1,28 @@
 # Sentiment basic plugin
 This plugin is based on the classifier developed for the TASS 2015 competition. It has been developed for Spanish and English. This is a demo plugin that uses only some features from the TASS 2015 classifier. To use the entirely functional classifier you can use the service in: http://senpy.cluster.gsi.dit.upm.es
 There is more information available in:
 	- Aspect based Sentiment Analysis of Spanish Tweets, Oscar Araque and Ignacio Corcuera-Platas and Constantino Román-Gómez and Carlos A. Iglesias and J. Fernando Sánchez-Rada. http://gsi.dit.upm.es/es/investigacion/publicaciones?view=publication&task=show&id=376
 ## Usage
 Params accepted:
 - Language: Spanish (es).
 - Input: text to analyse.
 Example request: 
 ```
 http://senpy.cluster.gsi.dit.upm.es/api/?algo=sentiment-basic&language=es&input=I%20love%20Madrid
 ```
 Example response: This plugin follows the standard for the senpy plugin response. For more information, please visit [senpy documentation](http://senpy.readthedocs.io). Specifically, NIF API section. 
 This plugin only supports **python2**
 ![alt GSI Logo][logoGSI]
 [logoGSI]: http://www.gsi.dit.upm.es/images/stories/logos/gsi.png "GSI Logo"
--- a/sentiment-basic/data
+++ b/sentiment-basic/data
--- a/sentiment-basic/sentiment-basic.py
+++ b/sentiment-basic/sentiment-basic.py
@@ -0,0 +1,148 @@
 import os
 import logging
 import string
 import nltk
 import pickle
 from sentiwn import SentiWordNet
 from nltk.corpus import wordnet as wn
 from textblob import TextBlob
 from scipy.interpolate import interp1d
 from os import path
 from senpy.plugins import SentimentPlugin, SenpyPlugin
 from senpy.models import Results, Entry, Sentiment
 logger = logging.getLogger(__name__)
 class SentiTextPlugin(SentimentPlugin):
    def _load_swn(self):
        self.swn_path = path.join(path.abspath(path.dirname(__file__)), self.sentiword_path)
        swn = SentiWordNet(self.swn_path)
        return swn
    def _load_pos_tagger(self):
        self.pos_path = path.join(path.abspath(path.dirname(__file__)), self.pos_path)
        with open(self.pos_path, 'r') as f:
            tagger = pickle.load(f)
        return tagger
    def activate(self, *args, **kwargs):
        nltk.download(['punkt','wordnet'])
        self._swn = self._load_swn()
        self._pos_tagger = self._load_pos_tagger()
    def _remove_punctuation(self, tokens):
        return [t for t in tokens if t not in string.punctuation]
    def _tokenize(self, text):
        data = {}
        sentences = nltk.sent_tokenize(text)
        for i, sentence in enumerate(sentences):
            sentence_ = {}
            words = nltk.word_tokenize(sentence)
            sentence_['sentence'] = sentence
            tokens_ = [w.lower() for w in words]
            sentence_['tokens'] = self._remove_punctuation(tokens_)
            data[i] = sentence_
        return data
    def _pos(self, tokens):
        for i in tokens:
            tokens[i]['tokens'] = self._pos_tagger.tag(tokens[i]['tokens'])
        return tokens
    # def _stopwords(sentences, lang='english'):
    #     for i in sentences:
    #         sentences[i]['tokens'] = [t for t in sentences[i]['tokens'] if t not in nltk.corpus.stopwords.words(lang)]
    #     return sentences
    def _compare_synsets(self, synsets, tokens, i):
        for synset in synsets:
            for word in tokens[i]['lemmas']:
                for lemma in tokens[i]['lemmas'][word]:
                    synset_ = lemma.synset() 
                    if synset == synset_:
                        return synset
        return None
    def analyse_entry(self, entry, params):
        language = params.get("language")
        text = entry.get("text", None)
        tokens = self._tokenize(text)
        tokens = self._pos(tokens)
        sufixes = {'es':'spa','en':'eng','it':'ita','fr':'fra'}
        for i in tokens:
            tokens[i]['lemmas'] = {}
            for w in tokens[i]['tokens']:
                lemmas = wn.lemmas(w[0], lang=sufixes[language])
                if len(lemmas) == 0:
                    continue
                tokens[i]['lemmas'][w[0]] = lemmas
        if language == "en":
            trans = TextBlob(unicode(text))
        else:
            trans = TextBlob(unicode(text)).translate(from_lang=language,to='en')
        useful_synsets = {}
        for s_i, t_s in enumerate(trans.sentences):
            useful_synsets[s_i] = {}
            for w_i, t_w in enumerate(trans.sentences[s_i].words):
                synsets = wn.synsets(trans.sentences[s_i].words[w_i])
                if len(synsets) == 0:
                    continue
                eq_synset = self._compare_synsets(synsets, tokens, s_i)
                useful_synsets[s_i][t_w] = eq_synset
        scores = {}
        for i in tokens:
            scores[i] = {}
            if useful_synsets != None:   
                for word in useful_synsets[i]:
                    if useful_synsets[i][word] is None:
                        continue
                    temp_scores = self._swn.get_score(useful_synsets[i][word].name().split('.')[0].replace(' ',' '))
                    for score in temp_scores:
                        if score['synset'] == useful_synsets[i][word]:
                            t_score = score['pos'] - score['neg']
                            f_score = 'neu'
                            if t_score > 0:
                                f_score = 'pos'
                            elif t_score < 0:
                                f_score = 'neg'
                            score['score'] = f_score
                            scores[i][word] = score
                            break
        p = params.get("prefix", None)
        for i in scores:
            n_pos = 0.0
            n_neg = 0.0
            for w in scores[i]:
                if scores[i][w]['score'] == 'pos':
                    n_pos += 1.0
                elif scores[i][w]['score'] == 'neg':
                    n_neg += 1.0
            inter = interp1d([-1.0, 1.0], [0.0, 1.0])
            try:
                g_score = (n_pos - n_neg) / (n_pos + n_neg)
                g_score = float(inter(g_score))
            except:
                if n_pos == 0 and n_neg == 0:
                    g_score = 0.5
            polarity = 'marl:Neutral'
            polarity_value = 0
            if g_score > 0.5:
                polarity = 'marl:Positive'
                polarity_value = 1
            elif g_score < 0.5:
                polarity = 'marl:Negative'
                polarity_value = -1
            opinion = Sentiment(id="Opinion0"+'_'+str(i),
                          marl__hasPolarity=polarity,
                          marl__polarityValue=polarity_value)
            entry.sentiments.append(opinion)
        yield entry
--- a/sentiment-basic/sentiment-basic.senpy
+++ b/sentiment-basic/sentiment-basic.senpy
@@ -0,0 +1,24 @@
 {
    "name": "sentiment-basic",
    "module": "sentiment-basic",
    "description": "Sentiment classifier using rule-based classification for Spanish. Based on english to spanish translation and SentiWordNet sentiment knowledge. This is a demo plugin that uses only some features from the TASS 2015 classifier. To use the entirely functional classifier you can use the service in: http://senpy.cluster.gsi.dit.upm.es.",
    "author": "github.com/nachtkatze",
    "version": "0.1",
    "requirements": [
        "nltk>=3.0.5",
        "scipy>=0.14.0",
        "textblob"
    ],
    "extra_params": {
        "language": {
            "aliases": ["language", "l"],
            "required": true,
            "options": ["en","es", "it", "fr", "auto"],
            "default": "auto"
        },
    },
    "sentiword_path": "data/SentiWordNet_3.0.txt",
    "pos_path": "data/unigram_spanish.pickle",
    "maxPolarityValue": "1",
    "minPolarityValue": "-1"
 }
--- a/sentiment-basic/sentiwn.py
+++ b/sentiment-basic/sentiwn.py
@@ -0,0 +1,70 @@
 #!/usr/bin/env python
 """
 Author : Jaganadh Gopinadhan <jaganadhg@gmail.com>
 Copywright (C) : Jaganadh Gopinadhan
 Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at
      http://www.apache.org/licenses/LICENSE-2.0
  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
 """
 import sys,os
 import re
 from nltk.corpus import wordnet
 class SentiWordNet(object):
    """
    Interface to SentiWordNet
    """
    def __init__(self,swn_file):
        """
        """
        self.swn_file = swn_file
        self.pos_synset = self.__parse_swn_file()
    def __parse_swn_file(self):
        """
        Parse the SentiWordNet file and populate the POS and SynsetID hash
        """
        pos_synset_hash = {}
        swn_data = open(self.swn_file,'r').readlines()
        head_less_swn_data = filter((lambda line: not re.search(r"^\s*#",\
        line)), swn_data)
        for data in head_less_swn_data:
            fields = data.strip().split("\t")
            try:
                pos,syn_set_id,pos_score,neg_score,syn_set_score,\
                gloss = fields
            except:
                print "Found data without all details"
                pass
            if pos and syn_set_score:
                pos_synset_hash[(pos,int(syn_set_id))] = (float(pos_score),\
                float(neg_score))
        return pos_synset_hash
    def get_score(self,word,pos=None):
        """
        Get score for a given word/word pos combination
        """
        senti_scores = []
        synsets = wordnet.synsets(word,pos)
        for synset in synsets:
            if self.pos_synset.has_key((synset.pos(), synset.offset())):
                pos_val, neg_val = self.pos_synset[(synset.pos(), synset.offset())]
                senti_scores.append({"pos":pos_val,"neg":neg_val,\
                "obj": 1.0 - (pos_val - neg_val),'synset':synset})
        return senti_scores
--- a/sentiment-basic/test.py
+++ b/sentiment-basic/test.py
@@ -0,0 +1,42 @@
 import os
 import logging
 logging.basicConfig()
 try:
    import unittest.mock as mock
 except ImportError:
    import mock
 from senpy.extensions import Senpy
 from flask import Flask
 import unittest
 class SentiTextTest(unittest.TestCase):
    def setUp(self):
        self.app = Flask("test_plugin")
        self.dir = os.path.join(os.path.dirname(__file__))
        self.senpy = Senpy(plugin_folder=self.dir, default_plugins=False)
        self.senpy.init_app(self.app)
    def tearDown(self):
        self.senpy.deactivate_plugin("SentiText", sync=True)
    def test_analyse(self):
        plugin = self.senpy.plugins["SentiText"]
        plugin.activate()
        texts = {'Odio ir al cine' :  'marl:Neutral',
                 'El cielo esta nublado' : 'marl:Positive',
                 'Esta tarta esta muy buena' : 'marl:Neutral'}
        for text in texts:
            response = plugin.analyse(input=text)
            sentimentSet = response.entries[0].sentiments[0]
            print sentimentSet
            expected = texts[text]
            assert sentimentSet['marl:hasPolarity'] == expected
        plugin.deactivate()
 # Run the test suite when this file is executed directly.
 if __name__ == '__main__':
    unittest.main()