Add evaluation tests

2025-10-13 16:52:22 +00:00 · 2018-04-24 19:36:50 +02:00
parent 5e2ada1654
commit c0aa7ddc3c
7 changed files with 121 additions and 76 deletions
--- a/example-plugins/basic_box_plugin.py
+++ b/example-plugins/basic_box_plugin.py
@@ -18,7 +18,7 @@ class BasicBox(SentimentBox):
        'default': 'marl:Neutral'
    }

-    def predict(self, input):
+    def predict_one(self, input):
        output = basic.get_polarity(input)
        return self.mappings.get(output, self.mappings['default'])

--- a/example-plugins/basic_plugin.py
+++ b/example-plugins/basic_plugin.py
@@ -18,7 +18,7 @@ class Basic(MappingMixin, SentimentBox):
        'default': 'marl:Neutral'
    }

-    def predict(self, input):
+    def predict_one(self, input):
        return basic.get_polarity(input)

    test_cases = [{
--- a/example-plugins/sklearn/pipeline_plugin.py
+++ b/example-plugins/sklearn/pipeline_plugin.py
@@ -18,7 +18,7 @@ class PipelineSentiment(MappingMixin, SentimentBox):
        -1: 'marl:Negative'
    }

-    def predict(self, input):
+    def predict_one(self, input):
        return pipeline.predict([input, ])[0]

    test_cases = [
--- a/senpy/extensions.py
+++ b/senpy/extensions.py
@@ -6,7 +6,7 @@ from future import standard_library
 standard_library.install_aliases()

 from . import plugins, api
-from .plugins import Plugin
+from .plugins import Plugin, evaluate
 from .models import Error, AggregatedEvaluation
 from .blueprints import api_blueprint, demo_blueprint, ns_blueprint

@@ -17,7 +17,6 @@ import copy
 import errno
 import logging

-#Correct this import for managing the datasets
 from gsitk.datasets.datasets import DatasetManager


@@ -197,13 +196,13 @@ class Senpy(object):
            if dataset not in self.datasets:
                logger.debug(("The dataset '{}' is not valid\n"
                              "Valid datasets: {}").format(dataset,
-                                                            self.datasets.keys()))
+                                                           self.datasets.keys()))
                raise Error(
                    status=404,
                    message="The dataset '{}' is not valid".format(dataset))
        datasets = self._dm.prepare_datasets(datasets_name)
        return datasets
-        
+
    @property
    def datasets(self):
        self._dataset_list = {}
@@ -219,29 +218,17 @@ class Senpy(object):
    def evaluate(self, params):

        logger.debug("evaluating request: {}".format(params))
-        try:
-            results = AggregatedEvaluation()
-            results.parameters = params
-            datasets = self._get_datasets(results)
-            plugins = self._get_plugins(results)
-            collector = list()
-            for plugin in plugins:
-                for eval in plugin.score(datasets):
-                    results.evaluations.append(eval)
-            if 'with_parameters' not in results.parameters:
-                del results.parameters
-            logger.debug("Returning evaluation result: {}".format(results))
-        except (Error,Exception) as ex:
-            if not isinstance(ex, Error):
-                msg = "Error during evaluation: {} \n\t{}".format(ex,
-                                                                traceback.format_exc())
-                ex = Error(message=msg, status=500)
-            logger.exception('Error returning evaluation result')
-            raise ex
-        #results.evaluations = collector
+        results = AggregatedEvaluation()
+        results.parameters = params
+        datasets = self._get_datasets(results)
+        plugins = self._get_plugins(results)
+        for eval in evaluate(plugins, datasets):
+            results.evaluations.append(eval)
+        if 'with_parameters' not in results.parameters:
+            del results.parameters
+        logger.debug("Returning evaluation result: {}".format(results))
        return results

-
    def _conversion_candidates(self, fromModel, toModel):
        candidates = self.plugins(plugin_type='emotionConversionPlugin')
        for candidate in candidates:
--- a/senpy/plugins/__init__.py
+++ b/senpy/plugins/__init__.py
@@ -25,6 +25,8 @@ from .. import api
 from gsitk.evaluation.evaluation import Evaluation as Eval
 from sklearn.pipeline import Pipeline

+import numpy as np
+
 logger = logging.getLogger(__name__)


@@ -254,7 +256,7 @@ class Box(AnalysisPlugin):

    .. code-block::

-                   entry --> input() --> predict() --> output() --> entry'
+                   entry --> input() --> predict_one() --> output() --> entry'


    In other words: their ``input`` method converts a query (entry and a set of parameters) into
@@ -270,15 +272,33 @@ class Box(AnalysisPlugin):
        '''Transforms the results of the black box into an entry'''
        return output

-    def predict(self, input):
+    def predict_one(self, input):
        raise NotImplementedError('You should define the behavior of this plugin')

    def analyse_entries(self, entries, params):
        for entry in entries:
            input = self.input(entry=entry, params=params)
-            results = self.predict(input=input)
+            results = self.predict_one(input=input)
            yield self.output(output=results, entry=entry, params=params)

+    def fit(self, X=None, y=None):
+        '''No-op fit: ignores ``X`` and ``y`` and returns the plugin itself.'''
+        return self
+
+    def transform(self, X):
+        '''Run ``predict_one`` on every item of ``X`` and return the results as a numpy array.'''
+        return np.array([self.predict_one(x) for x in X])
+
+    def predict(self, X):
+        '''Batch prediction over ``X``; delegates directly to ``transform``.'''
+        return self.transform(X)
+
+    def fit_transform(self, X, y):
+        '''Call ``fit(X, y)`` and then return ``transform(X)``.'''
+        # NOTE(review): unlike ``fit``, ``y`` has no default value here, so it
+        # must always be passed explicitly -- confirm that this is intended.
+        self.fit(X, y)
+        return self.transform(X)
+
+    def as_pipe(self):
+        '''Return a ``Pipeline`` whose only step, called 'plugin', is this plugin.'''
+        pipe = Pipeline([('plugin', self)])
+        # Copy the plugin's name onto the pipeline object
+        pipe.name = self.name
+        return pipe
+

 class TextBox(Box):
    '''A black box plugin that takes only text as input'''
@@ -323,48 +343,6 @@ class EmotionBox(TextBox, EmotionPlugin):
        return entry


-class EvaluationBox():
-    '''
-    A box plugin where it is implemented the evaluation. It is necessary to have a pipeline.
-    '''
-
-    def score(self, datasets):
-        pipelines = [self._pipeline]
-
-        ev = Eval(tuples = None,
-            datasets = datasets,
-            pipelines = pipelines)
-        ev.evaluate()
-        results = ev.results
-        evaluations = self._evaluations_toJSONLD(results)
-        return evaluations
-
-    def _evaluations_toJSONLD(self, results):
-        '''
-        Map the evaluation results to a JSONLD scheme
-        '''
-
-        evaluations = list()
-        metric_names = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'f1_weighted', 'f1_micro', 'f1_macro']
-        
-        for index, row in results.iterrows():
-            
-            evaluation = models.Evaluation()
-            if row['CV'] == False:
-                evaluation['@type'] = ['StaticCV', 'Evaluation']
-            evaluation.evaluatesOn = row['Dataset']
-            evaluation.evaluates = row['Model']
-            i = 0
-            for name in metric_names:
-                metric = models.Metric()
-                metric['@id'] = 'Metric' + str(i)
-                metric['@type'] = name.capitalize()
-                metric.value = row[name]
-                evaluation.metrics.append(metric)
-                i+=1
-            evaluations.append(evaluation)
-        return evaluations
-
 class MappingMixin(object):

    @property
@@ -605,3 +583,47 @@ def _from_loaded_module(module, info=None, **kwargs):
        yield cls(info=info, **kwargs)
    for instance in _instances_in_module(module):
        yield instance
+
+
+def evaluate(plugins, datasets, **kwargs):
+    '''
+    Evaluate a collection of plugins on the given datasets.
+
+    Every plugin is converted with its ``as_pipe()`` method and passed,
+    together with ``datasets``, to an ``Eval`` instance. After running
+    ``ev.evaluate()``, ``ev.results`` is converted with
+    ``evaluations_to_JSONLD`` and that converted list is returned.
+
+    Extra keyword arguments (e.g. ``flatten``) are forwarded unchanged to
+    ``evaluations_to_JSONLD``.
+    '''
+    ev = Eval(tuples=None,
+              datasets=datasets,
+              pipelines=[plugin.as_pipe() for plugin in plugins])
+    ev.evaluate()
+    results = ev.results
+    evaluations = evaluations_to_JSONLD(results, **kwargs)
+    return evaluations
+
+
+def evaluations_to_JSONLD(results, flatten=False):
+    '''
+    Map the evaluation results to a JSONLD scheme
+
+    One ``models.Evaluation`` is built per row of ``results``, taking the
+    'Dataset' and 'Model' values of the row plus the metrics listed in
+    ``metric_names``.
+
+    :param results: table iterated with ``iterrows()``; presumably a pandas
+        DataFrame -- confirm against the caller.
+    :param flatten: if true, each evaluation gets a single ``models.Metric``
+        with one key per metric name; otherwise it gets one
+        ``models.Metric`` per metric name, with '@id', '@type' and ``value``.
+    :return: list with the ``models.Evaluation`` objects, in row order.
+    '''
+
+    evaluations = list()
+    # NOTE(review): 'f1_macro' is listed twice, so the non-flattened output
+    # contains two metrics for it -- confirm whether this is intended.
+    metric_names = ['accuracy', 'precision_macro', 'recall_macro',
+                    'f1_macro', 'f1_weighted', 'f1_micro', 'f1_macro']
+
+    for index, row in results.iterrows():
+        evaluation = models.Evaluation()
+        # NOTE(review): the 'StaticCV' type is added when 'CV' is truthy or
+        # missing from the row -- confirm that this polarity is the intended one.
+        if row.get('CV', True):
+            evaluation['@type'] = ['StaticCV', 'Evaluation']
+        evaluation.evaluatesOn = row['Dataset']
+        evaluation.evaluates = row['Model']
+        # Counter used to number the metric ids in the non-flattened branch
+        i = 0
+        if flatten:
+            # A single Metric that holds every score under its own name
+            metric = models.Metric()
+            for name in metric_names:
+                metric[name] = row[name]
+            evaluation.metrics.append(metric)
+        else:
+            # We should probably discontinue this representation
+            for name in metric_names:
+                metric = models.Metric()
+                metric['@id'] = 'Metric' + str(i)
+                metric['@type'] = name.capitalize()
+                metric.value = row[name]
+                evaluation.metrics.append(metric)
+                i += 1
+        evaluations.append(evaluation)
+    return evaluations
--- a/senpy/schemas/definitions.json
+++ b/senpy/schemas/definitions.json
@@ -43,7 +43,7 @@
    "$ref": "response.json"
  },
  "AggregatedEvaluation": {
-    "$ref": "aggregatedevaluation.json"
+    "$ref": "aggregatedEvaluation.json"
  },
  "Evaluation": {
    "$ref": "evaluation.json"
--- a/tests/test_plugins.py
+++ b/tests/test_plugins.py
@@ -10,6 +10,8 @@ from senpy.models import Results, Entry, EmotionSet, Emotion, Plugins
 from senpy import plugins
 from senpy.plugins.conversion.emotion.centroids import CentroidConversion

+import pandas as pd
+

 class ShelfDummyPlugin(plugins.SentimentPlugin, plugins.ShelfMixin):
    '''Dummy plugin for tests.'''
@@ -212,7 +214,7 @@ class PluginsTest(TestCase):
            def input(self, entry, **kwargs):
                return entry.text

-            def predict(self, input):
+            def predict_one(self, input):
                return 'SIGN' in input

            def output(self, output, entry, **kwargs):
@@ -242,7 +244,7 @@ class PluginsTest(TestCase):

            mappings = {'happy': 'marl:Positive', 'sad': 'marl:Negative'}

-            def predict(self, input, **kwargs):
+            def predict_one(self, input, **kwargs):
                return 'happy' if ':)' in input else 'sad'

            test_cases = [
@@ -309,6 +311,40 @@ class PluginsTest(TestCase):
        res = c._backwards_conversion(e)
        assert res["onyx:hasEmotionCategory"] == "c2"

+    def test_evaluation(self):
+        '''Check the accuracy that ``plugins.evaluate`` reports for two toy plugins.'''
+        # Balanced toy dataset: 50 "good" rows labelled 1 and 50 "bad" rows labelled 0
+        testdata = []
+        for i in range(50):
+            testdata.append(["good", 1])
+        for i in range(50):
+            testdata.append(["bad", 0])
+        dataset = pd.DataFrame(testdata, columns=['text', 'polarity'])
+
+        # Always answers 0, so it can only match the rows labelled 0
+        class DummyPlugin(plugins.TextBox):
+            description = 'Plugin to test evaluation'
+            version = 0
+
+            def predict_one(self, input):
+                return 0
+
+        # Answers 1 for the text 'good' and 0 for anything else
+        class SmartPlugin(plugins.TextBox):
+            description = 'Plugin to test evaluation'
+            version = 0
+
+            def predict_one(self, input):
+                if input == 'good':
+                    return 1
+                return 0
+
+        # With flatten=True the scores are read by name from the first metric
+        dpipe = DummyPlugin()
+        results = plugins.evaluate(datasets={'testdata': dataset}, plugins=[dpipe], flatten=True)
+        dumb_metrics = results[0].metrics[0]
+        assert abs(dumb_metrics['accuracy'] - 0.5) < 0.01
+
+        spipe = SmartPlugin()
+        results = plugins.evaluate(datasets={'testdata': dataset}, plugins=[spipe], flatten=True)
+        smart_metrics = results[0].metrics[0]
+        assert abs(smart_metrics['accuracy'] - 1) < 0.01
+

 def make_mini_test(fpath):
    def mini_test(self):