From 7c959aace896e9d318497a417e0eec8f78b62314 Mon Sep 17 00:00:00 2001
From: J. Fernando Sánchez
Date: Tue, 12 Jun 2018 10:01:45 +0200
Subject: [PATCH] Squashed 'sentiment-basic/' content from commit beb8e31

git-subtree-dir: sentiment-basic
git-subtree-split: beb8e311619059a0c660411edef1cf95b3826c0a
---
 .gitmodules           |   3 +
 README.md             |  28 ++++
 data                  |   1 +
 sentiment-basic.py    | 148 ++++++++++++++++++++++++++++++++++++++++++
 sentiment-basic.senpy |  24 +++++++
 sentiwn.py            |  70 ++++++++++++++++++++
 test.py               |  42 ++++++++++++
 7 files changed, 316 insertions(+)
 create mode 100644 .gitmodules
 create mode 100755 README.md
 create mode 160000 data
 create mode 100644 sentiment-basic.py
 create mode 100644 sentiment-basic.senpy
 create mode 100644 sentiwn.py
 create mode 100644 test.py

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..3f87a17
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "data"]
+	path = data
+	url = ../data/sentiment-basic
diff --git a/README.md b/README.md
new file mode 100755
index 0000000..701af8d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+# Sentiment basic plugin
+
+This plugin is based on the classifier developed for the TASS 2015 competition, and supports Spanish and English. It is a demo plugin that uses only some of the features of the TASS 2015 classifier. The fully functional classifier is available as a service at http://senpy.cluster.gsi.dit.upm.es
+
+More information is available in:
+
+- Aspect based Sentiment Analysis of Spanish Tweets. Oscar Araque, Ignacio Corcuera-Platas, Constantino Román-Gómez, Carlos A. Iglesias and J. Fernando Sánchez-Rada. http://gsi.dit.upm.es/es/investigacion/publicaciones?view=publication&task=show&id=376
+
+## Usage
+Accepted parameters:
+
+- Language: `es`, `en`, `it`, `fr` or `auto` (see `sentiment-basic.senpy`).
+- Input: text to analyse.
+
+
+Example request:
+```
+http://senpy.cluster.gsi.dit.upm.es/api/?algo=sentiment-basic&language=es&input=I%20love%20Madrid
+```
+
+Example response: this plugin follows the standard senpy plugin response format. For more information, please visit the [senpy documentation](http://senpy.readthedocs.io), specifically the NIF API section.
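+A simplified sketch of the response shape (the exact envelope is defined by senpy; the sentiment fields below match the names used in this plugin's code and tests):
+
+```json
+{
+  "entries": [
+    {
+      "sentiments": [
+        {
+          "@id": "Opinion0_0",
+          "marl:hasPolarity": "marl:Positive",
+          "marl:polarityValue": 1
+        }
+      ]
+    }
+  ]
+}
+```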
+
+This plugin only supports **Python 2**.
+
+
+![alt GSI Logo][logoGSI]
+
+[logoGSI]: http://www.gsi.dit.upm.es/images/stories/logos/gsi.png "GSI Logo"
diff --git a/data b/data
new file mode 160000
index 0000000..7f99680
--- /dev/null
+++ b/data
@@ -0,0 +1 @@
+Subproject commit 7f99680db607fd06dc46009a7dae13ca4fc4e6ce
diff --git a/sentiment-basic.py b/sentiment-basic.py
new file mode 100644
index 0000000..9dbe4a9
--- /dev/null
+++ b/sentiment-basic.py
@@ -0,0 +1,148 @@
+import os
+import logging
+import string
+import nltk
+import pickle
+
+from sentiwn import SentiWordNet
+from nltk.corpus import wordnet as wn
+from textblob import TextBlob
+from scipy.interpolate import interp1d
+from os import path
+
+from senpy.plugins import SentimentPlugin, SenpyPlugin
+from senpy.models import Results, Entry, Sentiment
+
+logger = logging.getLogger(__name__)
+
+
+class SentiTextPlugin(SentimentPlugin):
+
+    def _load_swn(self):
+        self.swn_path = path.join(path.abspath(path.dirname(__file__)),
+                                  self.sentiword_path)
+        swn = SentiWordNet(self.swn_path)
+        return swn
+
+    def _load_pos_tagger(self):
+        # Use a local variable so repeated activation does not re-join the path.
+        pos_path = path.join(path.abspath(path.dirname(__file__)), self.pos_path)
+        with open(pos_path, 'rb') as f:  # pickles must be opened in binary mode
+            tagger = pickle.load(f)
+        return tagger
+
+    def activate(self, *args, **kwargs):
+        nltk.download(['punkt', 'wordnet'])
+        self._swn = self._load_swn()
+        self._pos_tagger = self._load_pos_tagger()
+
+    def _remove_punctuation(self, tokens):
+        return [t for t in tokens if t not in string.punctuation]
+
+    def _tokenize(self, text):
+        data = {}
+        sentences = nltk.sent_tokenize(text)
+        for i, sentence in enumerate(sentences):
+            sentence_ = {}
+            words = nltk.word_tokenize(sentence)
+            sentence_['sentence'] = sentence
+            tokens_ = [w.lower() for w in words]
+            sentence_['tokens'] = self._remove_punctuation(tokens_)
+            data[i] = sentence_
+        return data
+
+    def _pos(self, tokens):
+        for i in tokens:
+            tokens[i]['tokens'] = self._pos_tagger.tag(tokens[i]['tokens'])
+        return tokens
+
+    # def _stopwords(sentences, lang='english'):
+    #     for i in sentences:
+    #         sentences[i]['tokens'] = [t for t in sentences[i]['tokens']
+    #                                   if t not in nltk.corpus.stopwords.words(lang)]
+    #     return sentences
+
+    def _compare_synsets(self, synsets, tokens, i):
+        # Return the first synset of the translated word that also appears
+        # among the lemmas of the original (untranslated) sentence.
+        for synset in synsets:
+            for word in tokens[i]['lemmas']:
+                for lemma in tokens[i]['lemmas'][word]:
+                    synset_ = lemma.synset()
+                    if synset == synset_:
+                        return synset
+        return None
+
+    def analyse_entry(self, entry, params):
+        language = params.get("language")
+        text = entry.get("text", None)
+        tokens = self._tokenize(text)
+        tokens = self._pos(tokens)
+        suffixes = {'es': 'spa', 'en': 'eng', 'it': 'ita', 'fr': 'fra'}
+        for i in tokens:
+            tokens[i]['lemmas'] = {}
+            for w in tokens[i]['tokens']:
+                lemmas = wn.lemmas(w[0], lang=suffixes[language])
+                if len(lemmas) == 0:
+                    continue
+                tokens[i]['lemmas'][w[0]] = lemmas
+        if language == "en":
+            trans = TextBlob(unicode(text))
+        else:
+            trans = TextBlob(unicode(text)).translate(from_lang=language, to='en')
+        useful_synsets = {}
+        for s_i, t_s in enumerate(trans.sentences):
+            useful_synsets[s_i] = {}
+            for w_i, t_w in enumerate(t_s.words):
+                synsets = wn.synsets(t_w)
+                if len(synsets) == 0:
+                    continue
+                eq_synset = self._compare_synsets(synsets, tokens, s_i)
+                useful_synsets[s_i][t_w] = eq_synset
+        scores = {}
+        for i in tokens:
+            scores[i] = {}
+            if useful_synsets is not None:
+                for word in useful_synsets[i]:
+                    if useful_synsets[i][word] is None:
+                        continue
+                    temp_scores = self._swn.get_score(
+                        useful_synsets[i][word].name().split('.')[0])
+                    for score in temp_scores:
+                        if score['synset'] == useful_synsets[i][word]:
+                            t_score = score['pos'] - score['neg']
+                            f_score = 'neu'
+                            if t_score > 0:
+                                f_score = 'pos'
+                            elif t_score < 0:
+                                f_score = 'neg'
+                            score['score'] = f_score
+                            scores[i][word] = score
+                            break
+        p = params.get("prefix", None)
+        for i in scores:
+            n_pos = 0.0
+            n_neg = 0.0
+            for w in scores[i]:
+                if scores[i][w]['score'] == 'pos':
+                    n_pos += 1.0
+                elif scores[i][w]['score'] == 'neg':
+                    n_neg += 1.0
+            inter = interp1d([-1.0, 1.0], [0.0, 1.0])
+            if n_pos == 0 and n_neg == 0:
+                # No opinionated words were found: default to the neutral midpoint.
+                g_score = 0.5
+            else:
+                g_score = float(inter((n_pos - n_neg) / (n_pos + n_neg)))
+            polarity = 'marl:Neutral'
+            polarity_value = 0
+            if g_score > 0.5:
+                polarity = 'marl:Positive'
+                polarity_value = 1
+            elif g_score < 0.5:
+                polarity = 'marl:Negative'
+                polarity_value = -1
+            opinion = Sentiment(id="Opinion0_" + str(i),
+                                marl__hasPolarity=polarity,
+                                marl__polarityValue=polarity_value)
+
+            entry.sentiments.append(opinion)
+
+        yield entry
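For reference, a minimal standalone sketch of the sentence-level aggregation implemented in analyse_entry above (illustrative only, not part of the commit): positive and negative word counts are combined into a score in [-1, 1], rescaled into [0, 1] with interp1d, and thresholded at 0.5.

```python
# Sketch of the aggregation step: pos/neg counts -> [-1, 1] -> [0, 1] -> polarity.
from scipy.interpolate import interp1d

def sentence_polarity(n_pos, n_neg):
    inter = interp1d([-1.0, 1.0], [0.0, 1.0])
    if n_pos == 0 and n_neg == 0:
        g_score = 0.5  # no opinionated words: neutral midpoint
    else:
        g_score = float(inter((n_pos - n_neg) / (n_pos + n_neg)))
    if g_score > 0.5:
        return 'marl:Positive', 1
    elif g_score < 0.5:
        return 'marl:Negative', -1
    return 'marl:Neutral', 0

print sentence_polarity(3.0, 1.0)  # -> ('marl:Positive', 1)
```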
diff --git a/sentiment-basic.senpy b/sentiment-basic.senpy
new file mode 100644
index 0000000..13b2429
--- /dev/null
+++ b/sentiment-basic.senpy
@@ -0,0 +1,24 @@
+{
+  "name": "sentiment-basic",
+  "module": "sentiment-basic",
+  "description": "Sentiment classifier using rule-based classification for Spanish. Based on Spanish-to-English translation and the SentiWordNet sentiment lexicon. This is a demo plugin that uses only some features from the TASS 2015 classifier. The fully functional classifier is available as a service at http://senpy.cluster.gsi.dit.upm.es.",
+  "author": "github.com/nachtkatze",
+  "version": "0.1",
+  "requirements": [
+    "nltk>=3.0.5",
+    "scipy>=0.14.0",
+    "textblob"
+  ],
+  "extra_params": {
+    "language": {
+      "aliases": ["language", "l"],
+      "required": true,
+      "options": ["en", "es", "it", "fr", "auto"],
+      "default": "auto"
+    }
+  },
+  "sentiword_path": "data/SentiWordNet_3.0.txt",
+  "pos_path": "data/unigram_spanish.pickle",
+  "maxPolarityValue": "1",
+  "minPolarityValue": "-1"
+}
diff --git a/sentiwn.py b/sentiwn.py
new file mode 100644
index 0000000..42631e5
--- /dev/null
+++ b/sentiwn.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+"""
+Author    : Jaganadh Gopinadhan
+Copyright : Jaganadh Gopinadhan
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import sys, os
+import re
+
+from nltk.corpus import wordnet
+
+
+class SentiWordNet(object):
+    """
+    Interface to SentiWordNet
+    """
+    def __init__(self, swn_file):
+        self.swn_file = swn_file
+        self.pos_synset = self.__parse_swn_file()
+
+    def __parse_swn_file(self):
+        """
+        Parse the SentiWordNet file and populate the (POS, synset id) hash
+        """
+        pos_synset_hash = {}
+        with open(self.swn_file, 'r') as swn:
+            swn_data = swn.readlines()
+        head_less_swn_data = filter(lambda line: not re.search(r"^\s*#", line),
+                                    swn_data)
+
+        for data in head_less_swn_data:
+            fields = data.strip().split("\t")
+            try:
+                pos, syn_set_id, pos_score, neg_score, syn_set_score, \
+                    gloss = fields
+            except ValueError:
+                print "Found data without all details"
+                continue  # skip malformed lines instead of reusing stale fields
+
+            if pos and syn_set_score:
+                pos_synset_hash[(pos, int(syn_set_id))] = (float(pos_score),
+                                                           float(neg_score))
+
+        return pos_synset_hash
+
+    def get_score(self, word, pos=None):
+        """
+        Get scores for a given word/POS combination
+        """
+        senti_scores = []
+        synsets = wordnet.synsets(word, pos)
+        for synset in synsets:
+            if (synset.pos(), synset.offset()) in self.pos_synset:
+                pos_val, neg_val = self.pos_synset[(synset.pos(), synset.offset())]
+                senti_scores.append({"pos": pos_val, "neg": neg_val,
+                                     # objectivity = 1 - (pos + neg) per SentiWordNet
+                                     "obj": 1.0 - (pos_val + neg_val),
+                                     'synset': synset})
+
+        return senti_scores
\ No newline at end of file
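For reference, a minimal usage sketch for the SentiWordNet class above (illustrative only; the data path comes from sentiment-basic.senpy and assumes the data submodule is checked out):

```python
from sentiwn import SentiWordNet

# Path taken from the .senpy config; an assumption about the working directory.
swn = SentiWordNet("data/SentiWordNet_3.0.txt")

# Each entry pairs a WordNet synset with its positive/negative/objective scores.
for entry in swn.get_score("good"):
    print entry['synset'], entry['pos'], entry['neg'], entry['obj']
```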
+""" + +import sys,os +import re + +from nltk.corpus import wordnet + +class SentiWordNet(object): + """ + Interface to SentiWordNet + """ + def __init__(self,swn_file): + """ + """ + self.swn_file = swn_file + self.pos_synset = self.__parse_swn_file() + + def __parse_swn_file(self): + """ + Parse the SentiWordNet file and populate the POS and SynsetID hash + """ + pos_synset_hash = {} + swn_data = open(self.swn_file,'r').readlines() + head_less_swn_data = filter((lambda line: not re.search(r"^\s*#",\ + line)), swn_data) + + for data in head_less_swn_data: + fields = data.strip().split("\t") + try: + pos,syn_set_id,pos_score,neg_score,syn_set_score,\ + gloss = fields + except: + print "Found data without all details" + pass + + if pos and syn_set_score: + pos_synset_hash[(pos,int(syn_set_id))] = (float(pos_score),\ + float(neg_score)) + + return pos_synset_hash + + def get_score(self,word,pos=None): + """ + Get score for a given word/word pos combination + """ + senti_scores = [] + synsets = wordnet.synsets(word,pos) + for synset in synsets: + if self.pos_synset.has_key((synset.pos(), synset.offset())): + pos_val, neg_val = self.pos_synset[(synset.pos(), synset.offset())] + senti_scores.append({"pos":pos_val,"neg":neg_val,\ + "obj": 1.0 - (pos_val - neg_val),'synset':synset}) + + return senti_scores \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..1bb0c75 --- /dev/null +++ b/test.py @@ -0,0 +1,42 @@ +import os +import logging +logging.basicConfig() +try: + import unittest.mock as mock +except ImportError: + import mock +from senpy.extensions import Senpy +from flask import Flask +import unittest + +class SentiTextTest(unittest.TestCase): + + def setUp(self): + self.app = Flask("test_plugin") + self.dir = os.path.join(os.path.dirname(__file__)) + self.senpy = Senpy(plugin_folder=self.dir, default_plugins=False) + self.senpy.init_app(self.app) + + def tearDown(self): + self.senpy.deactivate_plugin("SentiText", sync=True) + + def test_analyse(self): + plugin = self.senpy.plugins["SentiText"] + plugin.activate() + + texts = {'Odio ir al cine' : 'marl:Neutral', + 'El cielo esta nublado' : 'marl:Positive', + 'Esta tarta esta muy buena' : 'marl:Neutral'} + + for text in texts: + response = plugin.analyse(input=text) + sentimentSet = response.entries[0].sentiments[0] + print sentimentSet + expected = texts[text] + + assert sentimentSet['marl:hasPolarity'] == expected + + plugin.deactivate() + +if __name__ == '__main__': + unittest.main() \ No newline at end of file