Add evaluation tests

2025-08-23 18:12:20 +00:00 · 2018-04-24 19:36:50 +02:00
parent 5e2ada1654
commit c0aa7ddc3c
7 changed files with 121 additions and 76 deletions
--- a/example-plugins/basic_box_plugin.py
+++ b/example-plugins/basic_box_plugin.py
@@ -18,7 +18,7 @@ class BasicBox(SentimentBox):
        'default': 'marl:Neutral'
    }
-    def predict(self, input):
+    def predict_one(self, input):
        output = basic.get_polarity(input)
        return self.mappings.get(output, self.mappings['default'])
--- a/example-plugins/basic_plugin.py
+++ b/example-plugins/basic_plugin.py
@@ -18,7 +18,7 @@ class Basic(MappingMixin, SentimentBox):
        'default': 'marl:Neutral'
    }
-    def predict(self, input):
+    def predict_one(self, input):
        return basic.get_polarity(input)
    test_cases = [{
--- a/example-plugins/sklearn/pipeline_plugin.py
+++ b/example-plugins/sklearn/pipeline_plugin.py
@@ -18,7 +18,7 @@ class PipelineSentiment(MappingMixin, SentimentBox):
        -1: 'marl:Negative'
    }
-    def predict(self, input):
+    def predict_one(self, input):
        return pipeline.predict([input, ])[0]
    test_cases = [
--- a/senpy/extensions.py
+++ b/senpy/extensions.py
@@ -6,7 +6,7 @@ from future import standard_library
 standard_library.install_aliases()
 from . import plugins, api
-from .plugins import Plugin
+from .plugins import Plugin, evaluate
 from .models import Error, AggregatedEvaluation
 from .blueprints import api_blueprint, demo_blueprint, ns_blueprint
@@ -17,7 +17,6 @@ import copy
 import errno
 import logging
 #Correct this import for managing the datasets
 from gsitk.datasets.datasets import DatasetManager
@@ -197,7 +196,7 @@ class Senpy(object):
            if dataset not in self.datasets:
                logger.debug(("The dataset '{}' is not valid\n"
                              "Valid datasets: {}").format(dataset,
-                                                            self.datasets.keys()))
+                                                           self.datasets.keys()))
                raise Error(
                    status=404,
                    message="The dataset '{}' is not valid".format(dataset))
@@ -219,29 +218,17 @@ class Senpy(object):
    def evaluate(self, params):
        logger.debug("evaluating request: {}".format(params))
-        try:
+        results = AggregatedEvaluation()
-            results = AggregatedEvaluation()
+        results.parameters = params
-            results.parameters = params
+        datasets = self._get_datasets(results)
-            datasets = self._get_datasets(results)
+        plugins = self._get_plugins(results)
-            plugins = self._get_plugins(results)
+        for eval in evaluate(plugins, datasets):
-            collector = list()
+            results.evaluations.append(eval)
-            for plugin in plugins:
+        if 'with_parameters' not in results.parameters:
-                for eval in plugin.score(datasets):
+            del results.parameters
-                    results.evaluations.append(eval)
+        logger.debug("Returning evaluation result: {}".format(results))
            if 'with_parameters' not in results.parameters:
                del results.parameters
            logger.debug("Returning evaluation result: {}".format(results))
        except (Error,Exception) as ex:
            if not isinstance(ex, Error):
                msg = "Error during evaluation: {} \n\t{}".format(ex,
                                                                traceback.format_exc())
                ex = Error(message=msg, status=500)
            logger.exception('Error returning evaluation result')
            raise ex
        #results.evaluations = collector
        return results
    def _conversion_candidates(self, fromModel, toModel):
        candidates = self.plugins(plugin_type='emotionConversionPlugin')
        for candidate in candidates:
--- a/senpy/plugins/init.py
+++ b/senpy/plugins/init.py
@@ -25,6 +25,8 @@ from .. import api
 from gsitk.evaluation.evaluation import Evaluation as Eval
 from sklearn.pipeline import Pipeline
 import numpy as np
 logger = logging.getLogger(__name__)
@@ -254,7 +256,7 @@ class Box(AnalysisPlugin):
    .. code-block::
-                   entry --> input() --> predict() --> output() --> entry'
+                   entry --> input() --> predict_one() --> output() --> entry'
    In other words: their ``input`` method converts a query (entry and a set of parameters) into
@@ -270,15 +272,33 @@ class Box(AnalysisPlugin):
        '''Transforms the results of the black box into an entry'''
        return output
-    def predict(self, input):
+    def predict_one(self, input):
        raise NotImplementedError('You should define the behavior of this plugin')
    def analyse_entries(self, entries, params):
        for entry in entries:
            input = self.input(entry=entry, params=params)
-            results = self.predict(input=input)
+            results = self.predict_one(input=input)
            yield self.output(output=results, entry=entry, params=params)
    def fit(self, X=None, y=None):
        '''
        No-op training step: ``X`` and ``y`` are ignored and the plugin
        itself is returned unchanged.
        '''
        return self
    def transform(self, X):
        '''
        Run ``predict_one`` on every element of ``X``, in order, and return
        the collected results as a numpy array.
        '''
        predictions = []
        for sample in X:
            predictions.append(self.predict_one(sample))
        return np.array(predictions)
    def predict(self, X):
        '''
        Batch prediction over ``X``; simply delegates to ``transform``.
        Per-item prediction lives in ``predict_one``.
        '''
        return self.transform(X)
    def fit_transform(self, X, y):
        '''
        Call ``fit(X, y)`` and then return ``transform(X)``.
        The return value of ``fit`` is deliberately not used.
        '''
        self.fit(X, y)
        return self.transform(X)
    def as_pipe(self):
        '''
        Wrap this plugin in a single-step ``Pipeline`` (step name
        ``'plugin'``) and copy the plugin's ``name`` onto the pipeline.
        '''
        steps = [('plugin', self)]
        wrapped = Pipeline(steps)
        wrapped.name = self.name
        return wrapped
 class TextBox(Box):
    '''A black box plugin that takes only text as input'''
@@ -323,48 +343,6 @@ class EmotionBox(TextBox, EmotionPlugin):
        return entry
 class EvaluationBox():
    '''
    Mixin that adds evaluation support to a box plugin.

    The host class must provide a ``_pipeline`` attribute, which is the
    only pipeline handed to the evaluator.
    '''

    def score(self, datasets):
        '''
        Evaluate ``self._pipeline`` on ``datasets`` and return the results
        as a list of ``models.Evaluation`` objects.
        '''
        ev = Eval(tuples=None,
                  datasets=datasets,
                  pipelines=[self._pipeline])
        ev.evaluate()
        return self._evaluations_toJSONLD(ev.results)

    def _evaluations_toJSONLD(self, results):
        '''
        Map the evaluation results to a JSONLD scheme

        ``results`` is iterated with ``iterrows()``; every row must provide
        ``CV``, ``Dataset``, ``Model`` and one value per metric name.
        One ``models.Evaluation`` is produced per row, holding one
        ``models.Metric`` per metric name.
        '''
        # Each metric appears exactly once: a repeated name would emit the
        # same metric twice under two different '@id' values.
        metric_names = ['accuracy', 'precision_macro', 'recall_macro',
                        'f1_macro', 'f1_weighted', 'f1_micro']
        evaluations = list()
        for _, row in results.iterrows():
            evaluation = models.Evaluation()
            # Kept as an equality test (not ``is``/``not``) so that only
            # values comparing equal to False tag the row as StaticCV.
            if row['CV'] == False:  # noqa: E712
                evaluation['@type'] = ['StaticCV', 'Evaluation']
            evaluation.evaluatesOn = row['Dataset']
            evaluation.evaluates = row['Model']
            for i, name in enumerate(metric_names):
                metric = models.Metric()
                metric['@id'] = 'Metric' + str(i)
                metric['@type'] = name.capitalize()
                metric.value = row[name]
                evaluation.metrics.append(metric)
            evaluations.append(evaluation)
        return evaluations
 class MappingMixin(object):
    @property
@@ -605,3 +583,47 @@ def _from_loaded_module(module, info=None, **kwargs):
        yield cls(info=info, **kwargs)
    for instance in _instances_in_module(module):
        yield instance
 def evaluate(plugins, datasets, **kwargs):
    '''
    Evaluate ``plugins`` on ``datasets``.

    Every plugin is converted with ``as_pipe()`` and the resulting
    pipelines are run through ``Eval``. The evaluator's ``results`` are
    then converted with ``evaluations_to_JSONLD``, which also receives any
    extra keyword arguments (e.g. ``flatten``).
    '''
    pipelines = [plugin.as_pipe() for plugin in plugins]
    runner = Eval(tuples=None, datasets=datasets, pipelines=pipelines)
    runner.evaluate()
    return evaluations_to_JSONLD(runner.results, **kwargs)
 def evaluations_to_JSONLD(results, flatten=False):
    '''
    Map the evaluation results to a JSONLD scheme

    ``results`` is iterated with ``iterrows()``; every row must provide
    ``Dataset``, ``Model`` and one value per metric name, and may provide
    ``CV``. One ``models.Evaluation`` is produced per row.

    If ``flatten`` is true, each evaluation gets a single ``models.Metric``
    whose keys are the metric names. Otherwise it gets one ``models.Metric``
    per metric name, with ``@id``, ``@type`` and ``value`` set.
    '''
    # Each metric appears exactly once: a repeated name would emit the same
    # metric twice (under two different '@id' values) in the non-flattened
    # representation.
    metric_names = ['accuracy', 'precision_macro', 'recall_macro',
                    'f1_macro', 'f1_weighted', 'f1_micro']
    evaluations = list()
    for _, row in results.iterrows():
        evaluation = models.Evaluation()
        # NOTE(review): rows are tagged StaticCV when 'CV' is truthy or
        # missing -- confirm this is the intended condition.
        if row.get('CV', True):
            evaluation['@type'] = ['StaticCV', 'Evaluation']
        evaluation.evaluatesOn = row['Dataset']
        evaluation.evaluates = row['Model']
        if flatten:
            metric = models.Metric()
            for name in metric_names:
                metric[name] = row[name]
            evaluation.metrics.append(metric)
        else:
            # We should probably discontinue this representation
            for i, name in enumerate(metric_names):
                metric = models.Metric()
                metric['@id'] = 'Metric' + str(i)
                metric['@type'] = name.capitalize()
                metric.value = row[name]
                evaluation.metrics.append(metric)
        evaluations.append(evaluation)
    return evaluations
--- a/senpy/schemas/definitions.json
+++ b/senpy/schemas/definitions.json
@@ -43,7 +43,7 @@
    "$ref": "response.json"
  },
  "AggregatedEvaluation": {
-    "$ref": "aggregatedevaluation.json"
+    "$ref": "aggregatedEvaluation.json"
  },
  "Evaluation": {
    "$ref": "evaluation.json"
--- a/tests/test_plugins.py
+++ b/tests/test_plugins.py
@@ -10,6 +10,8 @@ from senpy.models import Results, Entry, EmotionSet, Emotion, Plugins
 from senpy import plugins
 from senpy.plugins.conversion.emotion.centroids import CentroidConversion
 import pandas as pd
 class ShelfDummyPlugin(plugins.SentimentPlugin, plugins.ShelfMixin):
    '''Dummy plugin for tests.'''
@@ -212,7 +214,7 @@ class PluginsTest(TestCase):
            def input(self, entry, **kwargs):
                return entry.text
-            def predict(self, input):
+            def predict_one(self, input):
                return 'SIGN' in input
            def output(self, output, entry, **kwargs):
@@ -242,7 +244,7 @@ class PluginsTest(TestCase):
            mappings = {'happy': 'marl:Positive', 'sad': 'marl:Negative'}
-            def predict(self, input, **kwargs):
+            def predict_one(self, input, **kwargs):
                return 'happy' if ':)' in input else 'sad'
            test_cases = [
@@ -309,6 +311,40 @@ class PluginsTest(TestCase):
        res = c._backwards_conversion(e)
        assert res["onyx:hasEmotionCategory"] == "c2"
    def test_evaluation(self):
        '''
        Evaluate a constant classifier and an exact classifier on a balanced
        two-class dataset and check the reported accuracy of each.
        '''
        # 50 positive rows followed by 50 negative rows.
        rows = [["good", 1]] * 50 + [["bad", 0]] * 50
        dataset = pd.DataFrame(rows, columns=['text', 'polarity'])

        class DummyPlugin(plugins.TextBox):
            description = 'Plugin to test evaluation'
            version = 0

            def predict_one(self, input):
                return 0

        class SmartPlugin(plugins.TextBox):
            description = 'Plugin to test evaluation'
            version = 0

            def predict_one(self, input):
                return 1 if input == 'good' else 0

        def accuracy_of(plugin):
            # Flattened output: a single Metric keyed by metric name.
            results = plugins.evaluate(datasets={'testdata': dataset},
                                       plugins=[plugin],
                                       flatten=True)
            return results[0].metrics[0]['accuracy']

        # Always answering 0 is right for exactly half of the rows.
        assert abs(accuracy_of(DummyPlugin()) - 0.5) < 0.01
        assert abs(accuracy_of(SmartPlugin()) - 1) < 0.01
 def make_mini_test(fpath):
    def mini_test(self):