mirror of
https://github.com/gsi-upm/senpy
synced 2024-11-22 00:02:28 +00:00
Squashed 'sentiment-basic/' content from commit beb8e31
git-subtree-dir: sentiment-basic git-subtree-split: beb8e311619059a0c660411edef1cf95b3826c0a
This commit is contained in:
commit
7c959aace8
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
[submodule "data"]
|
||||||
|
path = data
|
||||||
|
url = ../data/sentiment-basic
|
28
README.md
Executable file
28
README.md
Executable file
@ -0,0 +1,28 @@
|
|||||||
|
# Sentiment basic plugin
|
||||||
|
|
||||||
|
This plugin is based on the classifier developed for the TASS 2015 competition. It has been developed for Spanish and English. This is a demo plugin that uses only some of the features of the TASS 2015 classifier. To use the fully functional classifier, use the service at: http://senpy.cluster.gsi.dit.upm.es
|
||||||
|
|
||||||
|
There is more information available in:
|
||||||
|
|
||||||
|
- Aspect based Sentiment Analysis of Spanish Tweets, Oscar Araque and Ignacio Corcuera-Platas and Constantino Román-Gómez and Carlos A. Iglesias and J. Fernando Sánchez-Rada. http://gsi.dit.upm.es/es/investigacion/publicaciones?view=publication&task=show&id=376
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
Params accepted:
|
||||||
|
|
||||||
|
- Language: Spanish (es).
|
||||||
|
- Input: text to analyse.
|
||||||
|
|
||||||
|
|
||||||
|
Example request:
|
||||||
|
```
|
||||||
|
http://senpy.cluster.gsi.dit.upm.es/api/?algo=sentiment-basic&language=es&input=I%20love%20Madrid
|
||||||
|
```
|
||||||
|
|
||||||
|
Example response: This plugin follows the standard senpy plugin response format. For more information, please visit the [senpy documentation](http://senpy.readthedocs.io), specifically the NIF API section.
|
||||||
|
|
||||||
|
This plugin only supports **python2**
|
||||||
|
|
||||||
|
|
||||||
|
![alt GSI Logo][logoGSI]
|
||||||
|
|
||||||
|
[logoGSI]: http://www.gsi.dit.upm.es/images/stories/logos/gsi.png "GSI Logo"
|
1
data
Submodule
1
data
Submodule
@ -0,0 +1 @@
|
|||||||
|
Subproject commit 7f99680db607fd06dc46009a7dae13ca4fc4e6ce
|
148
sentiment-basic.py
Normal file
148
sentiment-basic.py
Normal file
@ -0,0 +1,148 @@
|
|||||||
|
import os
|
||||||
|
import logging
|
||||||
|
import string
|
||||||
|
import nltk
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
from sentiwn import SentiWordNet
|
||||||
|
from nltk.corpus import wordnet as wn
|
||||||
|
from textblob import TextBlob
|
||||||
|
from scipy.interpolate import interp1d
|
||||||
|
from os import path
|
||||||
|
|
||||||
|
from senpy.plugins import SentimentPlugin, SenpyPlugin
|
||||||
|
from senpy.models import Results, Entry, Sentiment
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

# Language codes accepted through the ``language`` parameter, mapped to the
# codes that NLTK's multilingual WordNet expects.
_WORDNET_LANGUAGES = {'es': 'spa', 'en': 'eng', 'it': 'ita', 'fr': 'fra'}


class SentiTextPlugin(SentimentPlugin):
    """Rule-based sentiment plugin built on WordNet and SentiWordNet.

    The input text is split into sentences and tokens, and every token is
    looked up in the multilingual WordNet. The text is then translated to
    English with TextBlob; only the English synsets that are shared with the
    lemmas of the original sentence are scored with SentiWordNet. One
    ``Sentiment`` per sentence is appended to the analysed entry.

    The attributes ``sentiword_path`` and ``pos_path`` are read from the
    instance (presumably populated from the plugin definition file) and are
    resolved relative to the directory of this module.
    """

    def _load_swn(self):
        """Load SentiWordNet from ``self.sentiword_path``.

        The resolved absolute path is stored in ``self.swn_path``.
        """
        self.swn_path = path.join(path.abspath(path.dirname(__file__)),
                                  self.sentiword_path)
        return SentiWordNet(self.swn_path)

    def _load_pos_tagger(self):
        """Unpickle and return the POS tagger stored at ``self.pos_path``.

        ``self.pos_path`` is replaced by its absolute form.
        """
        self.pos_path = path.join(path.abspath(path.dirname(__file__)),
                                  self.pos_path)
        # Pickles are binary data: text mode fails on Python 3 and corrupts
        # the stream on platforms that translate line endings.
        # NOTE(review): pickle.load can execute arbitrary code, so pos_path
        # must only ever point at a trusted file.
        with open(self.pos_path, 'rb') as f:
            tagger = pickle.load(f)
        return tagger

    def activate(self, *args, **kwargs):
        """Fetch the required NLTK data and load the plugin resources."""
        # 'omw' (Open Multilingual WordNet) is needed by wn.lemmas(..., lang=)
        # for every language other than English.
        nltk.download(['punkt', 'wordnet', 'omw'])
        self._swn = self._load_swn()
        self._pos_tagger = self._load_pos_tagger()

    def _remove_punctuation(self, tokens):
        """Return ``tokens`` without empty or punctuation-only tokens.

        Each token is checked character by character, so multi-character
        punctuation such as ``...`` is removed as well.
        """
        punctuation = set(string.punctuation)
        return [t for t in tokens
                if not all(ch in punctuation for ch in t)]

    def _tokenize(self, text):
        """Split ``text`` into sentences and lower-cased word tokens.

        Returns a dict keyed by sentence index. Each value is a dict holding
        the raw ``'sentence'`` and its punctuation-free ``'tokens'``.
        """
        data = {}
        for i, sentence in enumerate(nltk.sent_tokenize(text)):
            words = [w.lower() for w in nltk.word_tokenize(sentence)]
            data[i] = {
                'sentence': sentence,
                'tokens': self._remove_punctuation(words),
            }
        return data

    def _pos(self, tokens):
        """Replace the tokens of every sentence with the tagger output.

        ``tokens`` is modified in place and also returned. The rest of the
        class only reads element ``[0]`` (the word) of each tagged token.
        """
        for sentence in tokens.values():
            sentence['tokens'] = self._pos_tagger.tag(sentence['tokens'])
        return tokens

    def _compare_synsets(self, synsets, tokens, i):
        """Return the first of ``synsets`` that sentence ``i`` also contains.

        A synset matches when it equals the synset of any lemma stored under
        ``tokens[i]['lemmas']``. Returns ``None`` when nothing matches or when
        sentence ``i`` does not exist in ``tokens``.
        """
        sentence = tokens.get(i)
        if sentence is None:
            return None
        for synset in synsets:
            for lemmas in sentence['lemmas'].values():
                if any(lemma.synset() == synset for lemma in lemmas):
                    return synset
        return None

    def _add_lemmas(self, tokens, wn_language):
        """Store, per sentence, the WordNet lemmas of every known word.

        The result is written to ``tokens[i]['lemmas']`` as a dict mapping a
        word to its non-empty list of lemmas in ``wn_language``.
        """
        for sentence in tokens.values():
            found = {}
            for tagged in sentence['tokens']:
                word = tagged[0]
                lemmas = wn.lemmas(word, lang=wn_language)
                if lemmas:
                    found[word] = lemmas
            sentence['lemmas'] = found

    def _translate(self, text, language):
        """Return ``text`` as a ``TextBlob`` in English."""
        try:
            text = unicode(text)  # noqa: F821 -- Python 2 only
        except NameError:
            # Python 3: there is no separate unicode type to convert to.
            pass
        blob = TextBlob(text)
        if language == "en":
            return blob
        return blob.translate(from_lang=language, to='en')

    def _match_synsets(self, translation, tokens):
        """Map every translated word to the synset it shares with the source.

        Returns ``{sentence_index: {word: synset_or_None}}``. Words without
        any English synset are left out.
        """
        useful_synsets = {}
        for s_i, sentence in enumerate(translation.sentences):
            matches = useful_synsets[s_i] = {}
            for word in sentence.words:
                synsets = wn.synsets(word)
                if not synsets:
                    continue
                matches[word] = self._compare_synsets(synsets, tokens, s_i)
        return useful_synsets

    def _score_sentences(self, tokens, useful_synsets):
        """Look up the SentiWordNet score of every matched synset.

        Returns ``{sentence_index: {word: score}}`` where ``score`` is the
        dict returned by SentiWordNet, extended with a ``'score'`` key set to
        ``'pos'``, ``'neg'`` or ``'neu'``.
        """
        scores = {}
        for i in tokens:
            scores[i] = {}
            # The translation may contain fewer sentences than the original
            # text, hence the defaulted lookup.
            for word, synset in useful_synsets.get(i, {}).items():
                if synset is None:
                    continue
                lemma_name = synset.name().split('.')[0]
                for score in self._swn.get_score(lemma_name):
                    if score['synset'] != synset:
                        continue
                    balance = score['pos'] - score['neg']
                    if balance > 0:
                        score['score'] = 'pos'
                    elif balance < 0:
                        score['score'] = 'neg'
                    else:
                        score['score'] = 'neu'
                    scores[i][word] = score
                    break
        return scores

    def _polarity(self, word_scores):
        """Return ``(marl polarity, polarity value)`` for one sentence.

        The share of positive versus negative words is rescaled from
        ``[-1, 1]`` to ``[0, 1]``; a sentence with no polar words is neutral.
        """
        labels = [score['score'] for score in word_scores.values()]
        n_pos = float(labels.count('pos'))
        n_neg = float(labels.count('neg'))
        total = n_pos + n_neg
        if total == 0:
            # Explicit guard instead of catching the division by zero.
            g_score = 0.5
        else:
            inter = interp1d([-1.0, 1.0], [0.0, 1.0])
            g_score = float(inter((n_pos - n_neg) / total))
        if g_score > 0.5:
            return 'marl:Positive', 1
        if g_score < 0.5:
            return 'marl:Negative', -1
        return 'marl:Neutral', 0

    def analyse_entry(self, entry, params):
        """Append one ``Sentiment`` per sentence to ``entry`` and yield it.

        ``params['language']`` selects the language of the text. An entry
        without text is yielded unchanged.

        Raises:
            KeyError: if the language is not one of the supported codes.
        """
        language = params.get("language")
        try:
            wn_language = _WORDNET_LANGUAGES[language]
        except KeyError:
            raise KeyError("Unsupported language %r; expected one of: %s"
                           % (language,
                              ', '.join(sorted(_WORDNET_LANGUAGES))))

        text = entry.get("text", None)
        if not text:
            # Nothing to analyse: hand the entry back untouched.
            yield entry
            return

        tokens = self._pos(self._tokenize(text))
        self._add_lemmas(tokens, wn_language)
        translation = self._translate(text, language)
        useful_synsets = self._match_synsets(translation, tokens)
        scores = self._score_sentences(tokens, useful_synsets)

        for i in sorted(scores):
            polarity, polarity_value = self._polarity(scores[i])
            opinion = Sentiment(id="Opinion0" + '_' + str(i),
                                marl__hasPolarity=polarity,
                                marl__polarityValue=polarity_value)
            entry.sentiments.append(opinion)

        yield entry
|
24
sentiment-basic.senpy
Normal file
24
sentiment-basic.senpy
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
{
|
||||||
|
"name": "sentiment-basic",
|
||||||
|
"module": "sentiment-basic",
|
||||||
|
"description": "Sentiment classifier using rule-based classification for Spanish. Based on english to spanish translation and SentiWordNet sentiment knowledge. This is a demo plugin that uses only some features from the TASS 2015 classifier. To use the entirely functional classifier you can use the service in: http://senpy.cluster.gsi.dit.upm.es.",
|
||||||
|
"author": "github.com/nachtkatze",
|
||||||
|
"version": "0.1",
|
||||||
|
"requirements": [
|
||||||
|
"nltk>=3.0.5",
|
||||||
|
"scipy>=0.14.0",
|
||||||
|
"textblob"
|
||||||
|
],
|
||||||
|
"extra_params": {
|
||||||
|
"language": {
|
||||||
|
"aliases": ["language", "l"],
|
||||||
|
"required": true,
|
||||||
|
"options": ["en","es", "it", "fr", "auto"],
|
||||||
|
"default": "auto"
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"sentiword_path": "data/SentiWordNet_3.0.txt",
|
||||||
|
"pos_path": "data/unigram_spanish.pickle",
|
||||||
|
"maxPolarityValue": "1",
|
||||||
|
"minPolarityValue": "-1"
|
||||||
|
}
|
70
sentiwn.py
Normal file
70
sentiwn.py
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
"""
|
||||||
|
Author : Jaganadh Gopinadhan <jaganadhg@gmail.com>
|
||||||
|
Copyright (C) : Jaganadh Gopinadhan
|
||||||
|
|
||||||
|
Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys,os
|
||||||
|
import re
|
||||||
|
|
||||||
|
from nltk.corpus import wordnet
|
||||||
|
|
||||||
|
class SentiWordNet(object):
    """
    Interface to SentiWordNet
    """

    def __init__(self, swn_file):
        """Parse ``swn_file`` and keep the scores in ``self.pos_synset``."""
        self.swn_file = swn_file
        self.pos_synset = self.__parse_swn_file()

    def __parse_swn_file(self):
        """
        Parse the SentiWordNet file and populate the POS and SynsetID hash

        Returns a dict mapping ``(pos, synset_id)`` to
        ``(positive_score, negative_score)``. Comment lines (starting with
        ``#``) and blank lines are ignored; rows that do not have exactly six
        tab-separated fields are reported and skipped.
        """
        pos_synset_hash = {}
        # The context manager closes the file even if parsing fails.
        with open(self.swn_file, 'r') as swn_data:
            for line in swn_data:
                if re.search(r"^\s*#", line):
                    continue
                stripped = line.strip()
                if not stripped:
                    continue
                fields = stripped.split("\t")
                if len(fields) != 6:
                    # Skip the row: carrying on would reuse the values of the
                    # previous row (or fail if there was none).
                    print("Found data without all details")
                    continue
                pos, syn_set_id, pos_score, neg_score, syn_set_score, \
                    gloss = fields
                if pos and syn_set_score:
                    pos_synset_hash[(pos, int(syn_set_id))] = (
                        float(pos_score), float(neg_score))
        return pos_synset_hash

    def get_score(self, word, pos=None):
        """
        Get score for a given word/word pos combination

        Returns a list with one dict per WordNet synset of ``word`` that is
        present in the parsed file. Each dict has the keys ``pos``, ``neg``,
        ``obj`` and ``synset``.
        """
        senti_scores = []
        for synset in wordnet.synsets(word, pos):
            key = (synset.pos(), synset.offset())
            if key not in self.pos_synset:
                continue
            pos_val, neg_val = self.pos_synset[key]
            senti_scores.append({
                "pos": pos_val,
                "neg": neg_val,
                # Objectivity is what is left once both polarities are
                # taken out: 1 - (pos + neg).
                "obj": 1.0 - (pos_val + neg_val),
                'synset': synset,
            })
        return senti_scores
|
42
test.py
Normal file
42
test.py
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
import os
|
||||||
|
import logging
|
||||||
|
logging.basicConfig()
|
||||||
|
try:
|
||||||
|
import unittest.mock as mock
|
||||||
|
except ImportError:
|
||||||
|
import mock
|
||||||
|
from senpy.extensions import Senpy
|
||||||
|
from flask import Flask
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
class SentiTextTest(unittest.TestCase):
    """Integration test that runs the plugin found next to this file."""

    # Name under which the plugin is registered; it has to match the "name"
    # field of the plugin definition file in this directory.
    PLUGIN_NAME = "sentiment-basic"

    def setUp(self):
        """Create a Flask app with a Senpy instance that loads this folder."""
        self.app = Flask("test_plugin")
        self.dir = os.path.join(os.path.dirname(__file__))
        self.senpy = Senpy(plugin_folder=self.dir, default_plugins=False)
        self.senpy.init_app(self.app)

    def tearDown(self):
        """Deactivate the plugin so that every test starts from scratch."""
        self.senpy.deactivate_plugin(self.PLUGIN_NAME, sync=True)

    def test_analyse(self):
        """Check the polarity assigned to a few Spanish sentences."""
        plugin = self.senpy.plugins[self.PLUGIN_NAME]
        plugin.activate()

        texts = {'Odio ir al cine': 'marl:Neutral',
                 'El cielo esta nublado': 'marl:Positive',
                 'Esta tarta esta muy buena': 'marl:Neutral'}

        for text, expected in texts.items():
            response = plugin.analyse(input=text)
            sentimentSet = response.entries[0].sentiments[0]
            # Parenthesised form is valid on both Python 2 and Python 3.
            print(sentimentSet)
            self.assertEqual(sentimentSet['marl:hasPolarity'], expected)

        plugin.deactivate()


if __name__ == '__main__':
    unittest.main()
|
Loading…
Reference in New Issue
Block a user