Added SentiText plugin (for Spanish)

master
Oscar Araque 9 years ago
parent 94d82238b8
commit 17976d85b1

File diff suppressed because it is too large Load Diff

@ -0,0 +1,175 @@
import os
import logging
import string
import nltk
import pickle
from sentiwn import SentiWordNet
from nltk.corpus import wordnet as wn
from textblob import TextBlob
from scipy.interpolate import interp1d
from os import path
from senpy.plugins import SentimentPlugin, SenpyPlugin
from senpy.models import Response, Opinion, Entry
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class SentiTextPlugin(SentimentPlugin):
    """Rule-based sentiment plugin for Spanish text.

    The input is split into sentences, tokenized and POS-tagged, and every
    token is looked up in WordNet with ``lang='spa'``.  The whole text is
    also translated from Spanish to English with TextBlob; for each
    translated word, the first English synset that is shared with one of
    the Spanish lemmas of the same sentence is scored with SentiWordNet.
    Each sentence then gets a polarity from the balance between positively
    and negatively scored words.
    """

    def __init__(self, info, *args, **kwargs):
        """Store the resource paths declared in the plugin description.

        :param info: plugin description; the keys ``module``,
            ``sentiword_path`` and ``pos_path`` are read here.  Both paths
            are resolved relative to the directory containing this file.
        """
        super(SentiTextPlugin, self).__init__(info, *args, **kwargs)
        self.id = info['module']
        base = path.abspath(path.dirname(__file__))
        self.swn_path = path.join(base, info['sentiword_path'])
        self.pos_path = path.join(base, info['pos_path'])
        # Both resources are loaded in activate(), not at construction time.
        self._swn = None
        self._pos_tagger = None

    def _load_swn(self):
        """Build the SentiWordNet interface from ``self.swn_path``."""
        return SentiWordNet(self.swn_path)

    def _load_pos_tagger(self):
        """Unpickle the POS tagger stored at ``self.pos_path``.

        NOTE(security): ``pickle.load`` can execute arbitrary code, so
        ``pos_path`` must only ever point to a trusted file.
        """
        # Pickles are binary data: text mode breaks on Python 3 and on
        # platforms that translate line endings.
        with open(self.pos_path, 'rb') as f:
            return pickle.load(f)

    def activate(self, *args, **kwargs):
        """Load SentiWordNet and the POS tagger."""
        self._swn = self._load_swn()
        self._pos_tagger = self._load_pos_tagger()
        logger.info("SentiText plugin is ready to go!")

    def deactivate(self, *args, **kwargs):
        """Log the deactivation; the loaded resources are left in place."""
        logger.info("SentiText plugin is being deactivated...")

    def _remove_punctuation(self, tokens):
        """Return ``tokens`` without the ones made up only of punctuation.

        Every character is checked against ``string.punctuation`` so that
        multi-character tokens such as ``...`` are dropped as well as
        single punctuation marks.  Empty tokens are dropped too.
        """
        punctuation = set(string.punctuation)
        return [t for t in tokens
                if not all(ch in punctuation for ch in t)]

    def _tokenize(self, text):
        """Split ``text`` into sentences and lower-cased word tokens.

        :returns: dict mapping the sentence index to a dict with the keys
            ``sentence`` (the raw sentence) and ``tokens`` (lower-cased
            words with punctuation tokens removed).
        """
        data = {}
        for i, sentence in enumerate(nltk.sent_tokenize(text)):
            lowered = [w.lower() for w in nltk.word_tokenize(sentence)]
            data[i] = {
                'sentence': sentence,
                'tokens': self._remove_punctuation(lowered),
            }
        return data

    def _pos(self, tokens):
        """Replace each sentence's token list with the tagger output.

        ``tokens`` is modified in place and also returned.  The result of
        ``tag()`` is later indexed with ``[0]`` to get the word, so it is
        presumably a list of ``(word, tag)`` pairs -- TODO confirm against
        the pickled tagger.
        """
        for i in tokens:
            tokens[i]['tokens'] = self._pos_tagger.tag(tokens[i]['tokens'])
        return tokens

    def _compare_synsets(self, synsets, tokens, i):
        """Return the first of ``synsets`` shared with sentence ``i``.

        :param synsets: candidate synsets, checked in order.
        :param tokens: sentence dict as filled in by ``analyse``; the
            ``lemmas`` entry of sentence ``i`` maps words to lemma lists.
        :param i: sentence index.
        :returns: the first candidate equal to the synset of any lemma of
            the sentence, or ``None`` when there is no match.
        """
        # Resolve the lemma synsets once instead of once per candidate.
        sentence_synsets = [lemma.synset()
                            for lemmas in tokens[i]['lemmas'].values()
                            for lemma in lemmas]
        for synset in synsets:
            if synset in sentence_synsets:
                return synset
        return None

    def _attach_lemmas(self, tokens):
        """Add a ``lemmas`` dict (word -> Spanish lemmas) to each sentence.

        Words for which WordNet returns no Spanish lemma are left out.
        """
        for i in tokens:
            lemmas_by_word = {}
            for tagged in tokens[i]['tokens']:
                word = tagged[0]
                lemmas = wn.lemmas(word, lang='spa')
                if lemmas:
                    lemmas_by_word[word] = lemmas
            tokens[i]['lemmas'] = lemmas_by_word

    def _find_useful_synsets(self, text, tokens):
        """Translate ``text`` to English and align synsets per sentence.

        :returns: dict mapping the translated sentence index to a dict of
            ``translated word -> matching synset or None``.  Words without
            any English synset are left out.
        """
        # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
        text_type = type(u'')
        trans = TextBlob(text_type(text)).translate(from_lang='es', to='en')
        useful_synsets = {}
        for s_i, sentence in enumerate(trans.sentences):
            useful_synsets[s_i] = {}
            if s_i not in tokens:
                # The translation produced more sentences than the source
                # text; there are no Spanish lemmas to compare against.
                continue
            for word in sentence.words:
                synsets = wn.synsets(word)
                if not synsets:
                    continue
                useful_synsets[s_i][word] = self._compare_synsets(
                    synsets, tokens, s_i)
        return useful_synsets

    def _score_synsets(self, tokens, useful_synsets):
        """Look up the SentiWordNet score of every aligned synset.

        :returns: dict mapping the sentence index to a dict of
            ``word -> score dict``; each score dict is the matching entry
            returned by ``get_score`` with an extra ``score`` key set to
            ``'pos'``, ``'neg'`` or ``'neu'``.
        """
        scores = {}
        for i in tokens:
            scores[i] = {}
            # .get(): the translation may have fewer sentences than the
            # source text.
            for word, synset in useful_synsets.get(i, {}).items():
                if synset is None:
                    continue
                # NOTE(review): as written, replace(' ',' ') swaps a space
                # for a space and changes nothing -- confirm the intent.
                name = synset.name().split('.')[0].replace(' ',' ')
                for score in self._swn.get_score(name):
                    if score['synset'] != synset:
                        continue
                    balance = score['pos'] - score['neg']
                    if balance > 0:
                        score['score'] = 'pos'
                    elif balance < 0:
                        score['score'] = 'neg'
                    else:
                        score['score'] = 'neu'
                    scores[i][word] = score
                    break
        return scores

    def analyse(self, **params):
        """Classify the polarity of every sentence of the input text.

        :param params: request parameters; ``input`` (the text),
            ``language`` (default ``"auto"``) and ``prefix`` are read.
        :returns: a ``Response`` with one ``Entry`` per sentence, each
            carrying one ``Opinion`` whose ``polarityValue`` lies in
            ``[0, 1]`` (0.5 is neutral).
        """
        logger.debug("Analysing with params {}".format(params))

        text = params.get("input", None)
        tokens = self._pos(self._tokenize(text))
        self._attach_lemmas(tokens)
        logger.debug("Tokens: {}".format(tokens))

        useful_synsets = self._find_useful_synsets(text, tokens)
        logger.debug("Synsets used for analysis: {}".format(useful_synsets))

        scores = self._score_synsets(tokens, useful_synsets)
        logger.debug("All scores (some not used): {}".format(scores))

        lang = params.get("language", "auto")
        p = params.get("prefix", None)
        response = Response(prefix=p)

        # Maps the word balance from [-1, 1] onto [0, 1]; built once for
        # all sentences.
        to_unit_interval = interp1d([-1.0, 1.0], [0.0, 1.0])
        for i in scores:
            labels = [s['score'] for s in scores[i].values()]
            n_pos = labels.count('pos')
            n_neg = labels.count('neg')
            total = n_pos + n_neg
            if total == 0:
                # No polar word in the sentence: neutral by definition.
                g_score = 0.5
            else:
                balance = (n_pos - n_neg) / float(total)
                g_score = float(to_unit_interval(balance))

            polarity = 'marl:Neutral'
            if g_score > 0.5:
                polarity = 'marl:Positive'
            elif g_score < 0.5:
                polarity = 'marl:Negative'

            entry = Entry(id="Entry"+str(i),
                          text=tokens[i]['sentence'],
                          prefix=p)
            opinion = Opinion(id="Opinion0"+'_'+str(i),
                              prefix=p,
                              hasPolarity=polarity,
                              polarityValue=float("{0:.2f}".format(g_score)))
            opinion["prov:wasGeneratedBy"] = self.id
            entry.opinions.append(opinion)
            entry.language = lang
            response.entries.append(entry)
        return response

@ -0,0 +1,18 @@
{
"name": "SentiText",
"module": "sentitext",
"description": "Sentiment classifier using rule-based classification for Spanish. Based on Spanish-to-English translation and SentiWordNet sentiment knowledge.",
"author": "github.com/nachtkatze",
"version": "0.1",
"extra_params": {
"language": {
"aliases": ["language", "l"],
"required": true,
"options": ["es"],
"default": "es"
}
},
"requirements": {},
"sentiword_path": "SentiWordNet_3.0.txt",
"pos_path": "unigram_spanish.pickle"
}

@ -0,0 +1,70 @@
#!/usr/bin/env python
"""
Author : Jaganadh Gopinadhan <jaganadhg@gmail.com>
Copyright (C) : Jaganadh Gopinadhan
Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import sys,os
import re
from nltk.corpus import wordnet
class SentiWordNet(object):
    """
    Interface to SentiWordNet

    The data file is parsed once at construction time into
    ``pos_synset``, a dict mapping ``(pos, synset_id)`` to
    ``(positive_score, negative_score)``.
    """

    def __init__(self, swn_file):
        """
        Parse the SentiWordNet data file located at ``swn_file``.
        """
        self.swn_file = swn_file
        self.pos_synset = self.__parse_swn_file()

    def __parse_swn_file(self):
        """
        Parse the SentiWordNet file and populate the POS and SynsetID hash

        Comment lines (first non-blank character ``#``) and blank lines
        are skipped.  Rows that do not have exactly six tab-separated
        fields are reported on stdout and skipped, so they can neither
        reuse the values of the previous row nor fail on unbound names.
        """
        pos_synset_hash = {}
        # The context manager closes the file even if a row fails to parse.
        with open(self.swn_file, 'r') as swn_data:
            for line in swn_data:
                record = line.strip()
                if not record or record.startswith("#"):
                    continue
                fields = record.split("\t")
                try:
                    pos, syn_set_id, pos_score, neg_score, syn_set_score, \
                        gloss = fields
                except ValueError:
                    print("Found data without all details")
                    continue
                if pos and syn_set_score:
                    pos_synset_hash[(pos, int(syn_set_id))] = (
                        float(pos_score), float(neg_score))
        return pos_synset_hash

    def get_score(self, word, pos=None):
        """
        Get score for a given word/word pos combination

        Returns a list with one dict per WordNet synset of ``word`` that
        is present in the parsed file.  Each dict has the keys ``pos``,
        ``neg``, ``obj`` (``1 - (pos + neg)``) and ``synset``.
        """
        senti_scores = []
        for synset in wordnet.synsets(word, pos):
            synset_pos = synset.pos()
            if synset_pos == 's':
                # WordNet tags satellite adjectives as 's', whereas the
                # SentiWordNet file lists every adjective under 'a'.
                synset_pos = 'a'
            key = (synset_pos, synset.offset())
            if key in self.pos_synset:
                pos_val, neg_val = self.pos_synset[key]
                senti_scores.append({"pos": pos_val, "neg": neg_val,
                                     "obj": 1.0 - (pos_val + neg_val),
                                     'synset': synset})
        return senti_scores

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save