From c0aa7ddc3c918fa578733e38f4b2a4e9555e9bae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=2E=20Fernando=20S=C3=A1nchez?=
Date: Tue, 24 Apr 2018 19:36:50 +0200
Subject: [PATCH] Add evaluation tests

---
 example-plugins/basic_box_plugin.py        |   2 +-
 example-plugins/basic_plugin.py            |   2 +-
 example-plugins/sklearn/pipeline_plugin.py |   2 +-
 senpy/extensions.py                        |  37 +++----
 senpy/plugins/__init__.py                  | 112 ++++++++++++---------
 senpy/schemas/definitions.json             |   2 +-
 tests/test_plugins.py                      |  40 +++++++-
 7 files changed, 121 insertions(+), 76 deletions(-)

diff --git a/example-plugins/basic_box_plugin.py b/example-plugins/basic_box_plugin.py
index 0b85951..3b18cf9 100644
--- a/example-plugins/basic_box_plugin.py
+++ b/example-plugins/basic_box_plugin.py
@@ -18,7 +18,7 @@ class BasicBox(SentimentBox):
         'default': 'marl:Neutral'
     }
 
-    def predict(self, input):
+    def predict_one(self, input):
         output = basic.get_polarity(input)
         return self.mappings.get(output, self.mappings['default'])
 
diff --git a/example-plugins/basic_plugin.py b/example-plugins/basic_plugin.py
index 35af16a..3c91e76 100644
--- a/example-plugins/basic_plugin.py
+++ b/example-plugins/basic_plugin.py
@@ -18,7 +18,7 @@ class Basic(MappingMixin, SentimentBox):
         'default': 'marl:Neutral'
     }
 
-    def predict(self, input):
+    def predict_one(self, input):
         return basic.get_polarity(input)
 
     test_cases = [{
diff --git a/example-plugins/sklearn/pipeline_plugin.py b/example-plugins/sklearn/pipeline_plugin.py
index a8eca0d..29e8f36 100644
--- a/example-plugins/sklearn/pipeline_plugin.py
+++ b/example-plugins/sklearn/pipeline_plugin.py
@@ -18,7 +18,7 @@ class PipelineSentiment(MappingMixin, SentimentBox):
         -1: 'marl:Negative'
     }
 
-    def predict(self, input):
+    def predict_one(self, input):
         return pipeline.predict([input, ])[0]
 
     test_cases = [
diff --git a/senpy/extensions.py b/senpy/extensions.py
index 8ed6e58..a98fa55 100644
--- a/senpy/extensions.py
+++ b/senpy/extensions.py
@@ -6,7 +6,7 @@ from future import standard_library
 standard_library.install_aliases()
 
 from . import plugins, api
-from .plugins import Plugin
+from .plugins import Plugin, evaluate
 from .models import Error, AggregatedEvaluation
 from .blueprints import api_blueprint, demo_blueprint, ns_blueprint
 
@@ -17,7 +17,6 @@ import copy
 import errno
 import logging
 
-#Correct this import for managing the datasets
 from gsitk.datasets.datasets import DatasetManager
 
 
@@ -197,13 +196,13 @@ class Senpy(object):
         if dataset not in self.datasets:
             logger.debug(("The dataset '{}' is not valid\n"
                           "Valid datasets: {}").format(dataset,
-                                                   self.datasets.keys()))
+                                                       self.datasets.keys()))
             raise Error(
                 status=404,
                 message="The dataset '{}' is not valid".format(dataset))
         datasets = self._dm.prepare_datasets(datasets_name)
         return datasets
-    
+
     @property
     def datasets(self):
         self._dataset_list = {}
@@ -219,29 +218,17 @@ class Senpy(object):
 
     def evaluate(self, params):
         logger.debug("evaluating request: {}".format(params))
-        try:
-            results = AggregatedEvaluation()
-            results.parameters = params
-            datasets = self._get_datasets(results)
-            plugins = self._get_plugins(results)
-            collector = list()
-            for plugin in plugins:
-                for eval in plugin.score(datasets):
-                    results.evaluations.append(eval)
-            if 'with_parameters' not in results.parameters:
-                del results.parameters
-            logger.debug("Returning evaluation result: {}".format(results))
-        except (Error,Exception) as ex:
-            if not isinstance(ex, Error):
-                msg = "Error during evaluation: {} \n\t{}".format(ex,
-                                                                  traceback.format_exc())
-                ex = Error(message=msg, status=500)
-            logger.exception('Error returning evaluation result')
-            raise ex
-        #results.evaluations = collector
+        results = AggregatedEvaluation()
+        results.parameters = params
+        datasets = self._get_datasets(results)
+        plugins = self._get_plugins(results)
+        for eval in evaluate(plugins, datasets):
+            results.evaluations.append(eval)
+        if 'with_parameters' not in results.parameters:
+            del results.parameters
+        logger.debug("Returning evaluation result: {}".format(results))
         return results
 
-
     def _conversion_candidates(self, fromModel, toModel):
         candidates = self.plugins(plugin_type='emotionConversionPlugin')
         for candidate in candidates:
diff --git a/senpy/plugins/__init__.py b/senpy/plugins/__init__.py
index c498a6e..6a8c7be 100644
--- a/senpy/plugins/__init__.py
+++ b/senpy/plugins/__init__.py
@@ -25,6 +25,8 @@ from .. import api
 from gsitk.evaluation.evaluation import Evaluation as Eval
 from sklearn.pipeline import Pipeline
 
+import numpy as np
+
 logger = logging.getLogger(__name__)
 
 
@@ -254,7 +256,7 @@ class Box(AnalysisPlugin):
 
     .. code-block::
 
-                   entry --> input() --> predict() --> output() --> entry'
+                   entry --> input() --> predict_one() --> output() --> entry'
 
 
     In other words: their ``input`` method convers a query (entry and a set of parameters) into
@@ -270,15 +272,33 @@ class Box(AnalysisPlugin):
         '''Transforms the results of the black box into an entry'''
         return output
 
-    def predict(self, input):
+    def predict_one(self, input):
         raise NotImplementedError('You should define the behavior of this plugin')
 
     def analyse_entries(self, entries, params):
         for entry in entries:
             input = self.input(entry=entry, params=params)
-            results = self.predict(input=input)
+            results = self.predict_one(input=input)
             yield self.output(output=results, entry=entry, params=params)
 
+    def fit(self, X=None, y=None):
+        return self
+
+    def transform(self, X):
+        return np.array([self.predict_one(x) for x in X])
+
+    def predict(self, X):
+        return self.transform(X)
+
+    def fit_transform(self, X, y):
+        self.fit(X, y)
+        return self.transform(X)
+
+    def as_pipe(self):
+        pipe = Pipeline([('plugin', self)])
+        pipe.name = self.name
+        return pipe
+
 
 class TextBox(Box):
     '''A black box plugin that takes only text as input'''
@@ -323,48 +343,6 @@ class EmotionBox(TextBox, EmotionPlugin):
         return entry
 
 
-class EvaluationBox():
-    '''
-    A box plugin where it is implemented the evaluation. It is necessary to have a pipeline.
-    '''
-
-    def score(self, datasets):
-        pipelines = [self._pipeline]
-
-        ev = Eval(tuples = None,
-                  datasets = datasets,
-                  pipelines = pipelines)
-        ev.evaluate()
-        results = ev.results
-        evaluations = self._evaluations_toJSONLD(results)
-        return evaluations
-
-    def _evaluations_toJSONLD(self, results):
-        '''
-        Map the evaluation results to a JSONLD scheme
-        '''
-
-        evaluations = list()
-        metric_names = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'f1_weighted', 'f1_micro', 'f1_macro']
-
-        for index, row in results.iterrows():
-
-            evaluation = models.Evaluation()
-            if row['CV'] == False:
-                evaluation['@type'] = ['StaticCV', 'Evaluation']
-            evaluation.evaluatesOn = row['Dataset']
-            evaluation.evaluates = row['Model']
-            i = 0
-            for name in metric_names:
-                metric = models.Metric()
-                metric['@id'] = 'Metric' + str(i)
-                metric['@type'] = name.capitalize()
-                metric.value = row[name]
-                evaluation.metrics.append(metric)
-                i+=1
-            evaluations.append(evaluation)
-        return evaluations
-
 class MappingMixin(object):
 
     @property
@@ -605,3 +583,47 @@ def _from_loaded_module(module, info=None, **kwargs):
         yield cls(info=info, **kwargs)
     for instance in _instances_in_module(module):
         yield instance
+
+
+def evaluate(plugins, datasets, **kwargs):
+    ev = Eval(tuples=None,
+              datasets=datasets,
+              pipelines=[plugin.as_pipe() for plugin in plugins])
+    ev.evaluate()
+    results = ev.results
+    evaluations = evaluations_to_JSONLD(results, **kwargs)
+    return evaluations
+
+
+def evaluations_to_JSONLD(results, flatten=False):
+    '''
+    Map the evaluation results to a JSONLD scheme
+    '''
+
+    evaluations = list()
+    metric_names = ['accuracy', 'precision_macro', 'recall_macro',
+                    'f1_macro', 'f1_weighted', 'f1_micro', 'f1_macro']
+
+    for index, row in results.iterrows():
+        evaluation = models.Evaluation()
+        if row.get('CV', True):
+            evaluation['@type'] = ['StaticCV', 'Evaluation']
+        evaluation.evaluatesOn = row['Dataset']
+        evaluation.evaluates = row['Model']
+        i = 0
+        if flatten:
+            metric = models.Metric()
+            for name in metric_names:
+                metric[name] = row[name]
+            evaluation.metrics.append(metric)
+        else:
+            # We should probably discontinue this representation
+            for name in metric_names:
+                metric = models.Metric()
+                metric['@id'] = 'Metric' + str(i)
+                metric['@type'] = name.capitalize()
+                metric.value = row[name]
+                evaluation.metrics.append(metric)
+                i += 1
+        evaluations.append(evaluation)
+    return evaluations
diff --git a/senpy/schemas/definitions.json b/senpy/schemas/definitions.json
index b74e8d0..4db5e24 100644
--- a/senpy/schemas/definitions.json
+++ b/senpy/schemas/definitions.json
@@ -43,7 +43,7 @@
         "$ref": "response.json"
     },
     "AggregatedEvaluation": {
-        "$ref": "aggregatedevaluation.json"
+        "$ref": "aggregatedEvaluation.json"
     },
     "Evaluation": {
         "$ref": "evaluation.json"
diff --git a/tests/test_plugins.py b/tests/test_plugins.py
index 1ec08d2..7f42605 100644
--- a/tests/test_plugins.py
+++ b/tests/test_plugins.py
@@ -10,6 +10,8 @@ from senpy.models import Results, Entry, EmotionSet, Emotion, Plugins
 from senpy import plugins
 from senpy.plugins.conversion.emotion.centroids import CentroidConversion
 
+import pandas as pd
+
 
 class ShelfDummyPlugin(plugins.SentimentPlugin, plugins.ShelfMixin):
     '''Dummy plugin for tests.'''
@@ -212,7 +214,7 @@ class PluginsTest(TestCase):
             def input(self, entry, **kwargs):
                 return entry.text
 
-            def predict(self, input):
+            def predict_one(self, input):
                 return 'SIGN' in input
 
             def output(self, output, entry, **kwargs):
@@ -242,7 +244,7 @@ class PluginsTest(TestCase):
             mappings = {'happy': 'marl:Positive',
                         'sad': 'marl:Negative'}
 
-            def predict(self, input, **kwargs):
+            def predict_one(self, input, **kwargs):
                 return 'happy' if ':)' in input else 'sad'
 
             test_cases = [
@@ -309,6 +311,40 @@ class PluginsTest(TestCase):
         res = c._backwards_conversion(e)
         assert res["onyx:hasEmotionCategory"] == "c2"
 
+    def test_evaluation(self):
+        testdata = []
+        for i in range(50):
+            testdata.append(["good", 1])
+        for i in range(50):
+            testdata.append(["bad", 0])
+        dataset = pd.DataFrame(testdata, columns=['text', 'polarity'])
+
+        class DummyPlugin(plugins.TextBox):
+            description = 'Plugin to test evaluation'
+            version = 0
+
+            def predict_one(self, input):
+                return 0
+
+        class SmartPlugin(plugins.TextBox):
+            description = 'Plugin to test evaluation'
+            version = 0
+
+            def predict_one(self, input):
+                if input == 'good':
+                    return 1
+                return 0
+
+        dpipe = DummyPlugin()
+        results = plugins.evaluate(datasets={'testdata': dataset}, plugins=[dpipe], flatten=True)
+        dumb_metrics = results[0].metrics[0]
+        assert abs(dumb_metrics['accuracy'] - 0.5) < 0.01
+
+        spipe = SmartPlugin()
+        results = plugins.evaluate(datasets={'testdata': dataset}, plugins=[spipe], flatten=True)
+        smart_metrics = results[0].metrics[0]
+        assert abs(smart_metrics['accuracy'] - 1) < 0.01
+
 
 def make_mini_test(fpath):
     def mini_test(self):