1
0
mirror of https://github.com/gsi-upm/senpy synced 2024-11-21 15:52:28 +00:00

Add evaluation tests

This commit is contained in:
J. Fernando Sánchez 2018-04-24 19:36:50 +02:00
parent 5e2ada1654
commit c0aa7ddc3c
7 changed files with 121 additions and 76 deletions

View File

@ -18,7 +18,7 @@ class BasicBox(SentimentBox):
'default': 'marl:Neutral'
}
def predict(self, input):
def predict_one(self, input):
output = basic.get_polarity(input)
return self.mappings.get(output, self.mappings['default'])

View File

@ -18,7 +18,7 @@ class Basic(MappingMixin, SentimentBox):
'default': 'marl:Neutral'
}
def predict(self, input):
def predict_one(self, input):
return basic.get_polarity(input)
test_cases = [{

View File

@ -18,7 +18,7 @@ class PipelineSentiment(MappingMixin, SentimentBox):
-1: 'marl:Negative'
}
def predict(self, input):
def predict_one(self, input):
return pipeline.predict([input, ])[0]
test_cases = [

View File

@ -6,7 +6,7 @@ from future import standard_library
standard_library.install_aliases()
from . import plugins, api
from .plugins import Plugin
from .plugins import Plugin, evaluate
from .models import Error, AggregatedEvaluation
from .blueprints import api_blueprint, demo_blueprint, ns_blueprint
@ -17,7 +17,6 @@ import copy
import errno
import logging
#Correct this import for managing the datasets
from gsitk.datasets.datasets import DatasetManager
@ -197,13 +196,13 @@ class Senpy(object):
if dataset not in self.datasets:
logger.debug(("The dataset '{}' is not valid\n"
"Valid datasets: {}").format(dataset,
self.datasets.keys()))
self.datasets.keys()))
raise Error(
status=404,
message="The dataset '{}' is not valid".format(dataset))
datasets = self._dm.prepare_datasets(datasets_name)
return datasets
@property
def datasets(self):
self._dataset_list = {}
@ -219,29 +218,17 @@ class Senpy(object):
def evaluate(self, params):
logger.debug("evaluating request: {}".format(params))
try:
results = AggregatedEvaluation()
results.parameters = params
datasets = self._get_datasets(results)
plugins = self._get_plugins(results)
collector = list()
for plugin in plugins:
for eval in plugin.score(datasets):
results.evaluations.append(eval)
if 'with_parameters' not in results.parameters:
del results.parameters
logger.debug("Returning evaluation result: {}".format(results))
except (Error,Exception) as ex:
if not isinstance(ex, Error):
msg = "Error during evaluation: {} \n\t{}".format(ex,
traceback.format_exc())
ex = Error(message=msg, status=500)
logger.exception('Error returning evaluation result')
raise ex
#results.evaluations = collector
results = AggregatedEvaluation()
results.parameters = params
datasets = self._get_datasets(results)
plugins = self._get_plugins(results)
for eval in evaluate(plugins, datasets):
results.evaluations.append(eval)
if 'with_parameters' not in results.parameters:
del results.parameters
logger.debug("Returning evaluation result: {}".format(results))
return results
def _conversion_candidates(self, fromModel, toModel):
candidates = self.plugins(plugin_type='emotionConversionPlugin')
for candidate in candidates:

View File

@ -25,6 +25,8 @@ from .. import api
from gsitk.evaluation.evaluation import Evaluation as Eval
from sklearn.pipeline import Pipeline
import numpy as np
logger = logging.getLogger(__name__)
@ -254,7 +256,7 @@ class Box(AnalysisPlugin):
.. code-block::
entry --> input() --> predict() --> output() --> entry'
entry --> input() --> predict_one() --> output() --> entry'
In other words: their ``input`` method convers a query (entry and a set of parameters) into
@ -270,15 +272,33 @@ class Box(AnalysisPlugin):
'''Transforms the results of the black box into an entry'''
return output
def predict(self, input):
def predict_one(self, input):
raise NotImplementedError('You should define the behavior of this plugin')
def analyse_entries(self, entries, params):
for entry in entries:
input = self.input(entry=entry, params=params)
results = self.predict(input=input)
results = self.predict_one(input=input)
yield self.output(output=results, entry=entry, params=params)
def fit(self, X=None, y=None):
return self
def transform(self, X):
return np.array([self.predict_one(x) for x in X])
def predict(self, X):
return self.transform(X)
def fit_transform(self, X, y):
self.fit(X, y)
return self.transform(X)
def as_pipe(self):
pipe = Pipeline([('plugin', self)])
pipe.name = self.name
return pipe
class TextBox(Box):
'''A black box plugin that takes only text as input'''
@ -323,48 +343,6 @@ class EmotionBox(TextBox, EmotionPlugin):
return entry
class EvaluationBox():
'''
A box plugin where it is implemented the evaluation. It is necessary to have a pipeline.
'''
def score(self, datasets):
pipelines = [self._pipeline]
ev = Eval(tuples = None,
datasets = datasets,
pipelines = pipelines)
ev.evaluate()
results = ev.results
evaluations = self._evaluations_toJSONLD(results)
return evaluations
def _evaluations_toJSONLD(self, results):
'''
Map the evaluation results to a JSONLD scheme
'''
evaluations = list()
metric_names = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'f1_weighted', 'f1_micro', 'f1_macro']
for index, row in results.iterrows():
evaluation = models.Evaluation()
if row['CV'] == False:
evaluation['@type'] = ['StaticCV', 'Evaluation']
evaluation.evaluatesOn = row['Dataset']
evaluation.evaluates = row['Model']
i = 0
for name in metric_names:
metric = models.Metric()
metric['@id'] = 'Metric' + str(i)
metric['@type'] = name.capitalize()
metric.value = row[name]
evaluation.metrics.append(metric)
i+=1
evaluations.append(evaluation)
return evaluations
class MappingMixin(object):
@property
@ -605,3 +583,47 @@ def _from_loaded_module(module, info=None, **kwargs):
yield cls(info=info, **kwargs)
for instance in _instances_in_module(module):
yield instance
def evaluate(plugins, datasets, **kwargs):
ev = Eval(tuples=None,
datasets=datasets,
pipelines=[plugin.as_pipe() for plugin in plugins])
ev.evaluate()
results = ev.results
evaluations = evaluations_to_JSONLD(results, **kwargs)
return evaluations
def evaluations_to_JSONLD(results, flatten=False):
'''
Map the evaluation results to a JSONLD scheme
'''
evaluations = list()
metric_names = ['accuracy', 'precision_macro', 'recall_macro',
'f1_macro', 'f1_weighted', 'f1_micro', 'f1_macro']
for index, row in results.iterrows():
evaluation = models.Evaluation()
if row.get('CV', True):
evaluation['@type'] = ['StaticCV', 'Evaluation']
evaluation.evaluatesOn = row['Dataset']
evaluation.evaluates = row['Model']
i = 0
if flatten:
metric = models.Metric()
for name in metric_names:
metric[name] = row[name]
evaluation.metrics.append(metric)
else:
# We should probably discontinue this representation
for name in metric_names:
metric = models.Metric()
metric['@id'] = 'Metric' + str(i)
metric['@type'] = name.capitalize()
metric.value = row[name]
evaluation.metrics.append(metric)
i += 1
evaluations.append(evaluation)
return evaluations

View File

@ -43,7 +43,7 @@
"$ref": "response.json"
},
"AggregatedEvaluation": {
"$ref": "aggregatedevaluation.json"
"$ref": "aggregatedEvaluation.json"
},
"Evaluation": {
"$ref": "evaluation.json"

View File

@ -10,6 +10,8 @@ from senpy.models import Results, Entry, EmotionSet, Emotion, Plugins
from senpy import plugins
from senpy.plugins.conversion.emotion.centroids import CentroidConversion
import pandas as pd
class ShelfDummyPlugin(plugins.SentimentPlugin, plugins.ShelfMixin):
'''Dummy plugin for tests.'''
@ -212,7 +214,7 @@ class PluginsTest(TestCase):
def input(self, entry, **kwargs):
return entry.text
def predict(self, input):
def predict_one(self, input):
return 'SIGN' in input
def output(self, output, entry, **kwargs):
@ -242,7 +244,7 @@ class PluginsTest(TestCase):
mappings = {'happy': 'marl:Positive', 'sad': 'marl:Negative'}
def predict(self, input, **kwargs):
def predict_one(self, input, **kwargs):
return 'happy' if ':)' in input else 'sad'
test_cases = [
@ -309,6 +311,40 @@ class PluginsTest(TestCase):
res = c._backwards_conversion(e)
assert res["onyx:hasEmotionCategory"] == "c2"
def test_evaluation(self):
testdata = []
for i in range(50):
testdata.append(["good", 1])
for i in range(50):
testdata.append(["bad", 0])
dataset = pd.DataFrame(testdata, columns=['text', 'polarity'])
class DummyPlugin(plugins.TextBox):
description = 'Plugin to test evaluation'
version = 0
def predict_one(self, input):
return 0
class SmartPlugin(plugins.TextBox):
description = 'Plugin to test evaluation'
version = 0
def predict_one(self, input):
if input == 'good':
return 1
return 0
dpipe = DummyPlugin()
results = plugins.evaluate(datasets={'testdata': dataset}, plugins=[dpipe], flatten=True)
dumb_metrics = results[0].metrics[0]
assert abs(dumb_metrics['accuracy'] - 0.5) < 0.01
spipe = SmartPlugin()
results = plugins.evaluate(datasets={'testdata': dataset}, plugins=[spipe], flatten=True)
smart_metrics = results[0].metrics[0]
assert abs(smart_metrics['accuracy'] - 1) < 0.01
def make_mini_test(fpath):
def mini_test(self):