From c0aa7ddc3c918fa578733e38f4b2a4e9555e9bae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=2E=20Fernando=20S=C3=A1nchez?=
Date: Tue, 24 Apr 2018 19:36:50 +0200
Subject: [PATCH] Add evaluation tests

---
 example-plugins/basic_box_plugin.py        |   2 +-
 example-plugins/basic_plugin.py            |   2 +-
 example-plugins/sklearn/pipeline_plugin.py |   2 +-
 senpy/extensions.py                        |  37 +++----
 senpy/plugins/__init__.py                  | 112 ++++++++++++---------
 senpy/schemas/definitions.json             |   2 +-
 tests/test_plugins.py                      |  40 +++++++-
 7 files changed, 121 insertions(+), 76 deletions(-)

diff --git a/example-plugins/basic_box_plugin.py b/example-plugins/basic_box_plugin.py
index 0b85951..3b18cf9 100644
--- a/example-plugins/basic_box_plugin.py
+++ b/example-plugins/basic_box_plugin.py
@@ -18,7 +18,7 @@ class BasicBox(SentimentBox):
         'default': 'marl:Neutral'
     }
 
-    def predict(self, input):
+    def predict_one(self, input):
         output = basic.get_polarity(input)
         return self.mappings.get(output, self.mappings['default'])
 
diff --git a/example-plugins/basic_plugin.py b/example-plugins/basic_plugin.py
index 35af16a..3c91e76 100644
--- a/example-plugins/basic_plugin.py
+++ b/example-plugins/basic_plugin.py
@@ -18,7 +18,7 @@ class Basic(MappingMixin, SentimentBox):
         'default': 'marl:Neutral'
     }
 
-    def predict(self, input):
+    def predict_one(self, input):
         return basic.get_polarity(input)
 
     test_cases = [{
diff --git a/example-plugins/sklearn/pipeline_plugin.py b/example-plugins/sklearn/pipeline_plugin.py
index a8eca0d..29e8f36 100644
--- a/example-plugins/sklearn/pipeline_plugin.py
+++ b/example-plugins/sklearn/pipeline_plugin.py
@@ -18,7 +18,7 @@ class PipelineSentiment(MappingMixin, SentimentBox):
         -1: 'marl:Negative'
     }
 
-    def predict(self, input):
+    def predict_one(self, input):
         return pipeline.predict([input, ])[0]
 
     test_cases = [
diff --git a/senpy/extensions.py b/senpy/extensions.py
index 8ed6e58..a98fa55 100644
--- a/senpy/extensions.py
+++ b/senpy/extensions.py
@@ -6,7 +6,7 @@ from future import standard_library
 standard_library.install_aliases()
 
 from . import plugins, api
-from .plugins import Plugin
+from .plugins import Plugin, evaluate
 from .models import Error, AggregatedEvaluation
 from .blueprints import api_blueprint, demo_blueprint, ns_blueprint
 
@@ -17,7 +17,6 @@ import copy
 import errno
 import logging
 
-#Correct this import for managing the datasets
 from gsitk.datasets.datasets import DatasetManager
 
 
@@ -197,13 +196,13 @@ class Senpy(object):
         if dataset not in self.datasets:
             logger.debug(("The dataset '{}' is not valid\n"
                           "Valid datasets: {}").format(dataset,
-                                                   self.datasets.keys()))
+                                                       self.datasets.keys()))
             raise Error(
                 status=404,
                 message="The dataset '{}' is not valid".format(dataset))
         datasets = self._dm.prepare_datasets(datasets_name)
         return datasets
-    
+
     @property
     def datasets(self):
         self._dataset_list = {}
@@ -219,29 +218,17 @@ class Senpy(object):
 
     def evaluate(self, params):
         logger.debug("evaluating request: {}".format(params))
-        try:
-            results = AggregatedEvaluation()
-            results.parameters = params
-            datasets = self._get_datasets(results)
-            plugins = self._get_plugins(results)
-            collector = list()
-            for plugin in plugins:
-                for eval in plugin.score(datasets):
-                    results.evaluations.append(eval)
-            if 'with_parameters' not in results.parameters:
-                del results.parameters
-            logger.debug("Returning evaluation result: {}".format(results))
-        except (Error,Exception) as ex:
-            if not isinstance(ex, Error):
-                msg = "Error during evaluation: {} \n\t{}".format(ex,
-                                                                  traceback.format_exc())
-                ex = Error(message=msg, status=500)
-            logger.exception('Error returning evaluation result')
-            raise ex
-        #results.evaluations = collector
+        results = AggregatedEvaluation()
+        results.parameters = params
+        datasets = self._get_datasets(results)
+        plugins = self._get_plugins(results)
+        for eval in evaluate(plugins, datasets):
+            results.evaluations.append(eval)
+        if 'with_parameters' not in results.parameters:
+            del results.parameters
+        logger.debug("Returning evaluation result: {}".format(results))
         return results
 
-
     def _conversion_candidates(self, fromModel, toModel):
         candidates = self.plugins(plugin_type='emotionConversionPlugin')
         for candidate in candidates:
diff --git a/senpy/plugins/__init__.py b/senpy/plugins/__init__.py
index c498a6e..6a8c7be 100644
--- a/senpy/plugins/__init__.py
+++ b/senpy/plugins/__init__.py
@@ -25,6 +25,8 @@ from .. import api
 from gsitk.evaluation.evaluation import Evaluation as Eval
 from sklearn.pipeline import Pipeline
 
+import numpy as np
+
 logger = logging.getLogger(__name__)
 
 
@@ -254,7 +256,7 @@ class Box(AnalysisPlugin):
 
     .. code-block::
 
-                   entry --> input() --> predict() --> output() --> entry'
+                   entry --> input() --> predict_one() --> output() --> entry'
 
 
     In other words: their ``input`` method convers a query (entry and a set of parameters) into
@@ -270,15 +272,33 @@ class Box(AnalysisPlugin):
         '''Transforms the results of the black box into an entry'''
         return output
 
-    def predict(self, input):
+    def predict_one(self, input):
         raise NotImplementedError('You should define the behavior of this plugin')
 
     def analyse_entries(self, entries, params):
         for entry in entries:
             input = self.input(entry=entry, params=params)
-            results = self.predict(input=input)
+            results = self.predict_one(input=input)
             yield self.output(output=results, entry=entry, params=params)
 
+    def fit(self, X=None, y=None):
+        return self
+
+    def transform(self, X):
+        return np.array([self.predict_one(x) for x in X])
+
+    def predict(self, X):
+        return self.transform(X)
+
+    def fit_transform(self, X, y):
+        self.fit(X, y)
+        return self.transform(X)
+
+    def as_pipe(self):
+        pipe = Pipeline([('plugin', self)])
+        pipe.name = self.name
+        return pipe
+
 
 class TextBox(Box):
     '''A black box plugin that takes only text as input'''
@@ -323,48 +343,6 @@ class EmotionBox(TextBox, EmotionPlugin):
         return entry
 
 
-class EvaluationBox():
-    '''
-    A box plugin where it is implemented the evaluation. It is necessary to have a pipeline.
-    '''
-
-    def score(self, datasets):
-        pipelines = [self._pipeline]
-
-        ev = Eval(tuples = None,
-                  datasets = datasets,
-                  pipelines = pipelines)
-        ev.evaluate()
-        results = ev.results
-        evaluations = self._evaluations_toJSONLD(results)
-        return evaluations
-
-    def _evaluations_toJSONLD(self, results):
-        '''
-        Map the evaluation results to a JSONLD scheme
-        '''
-
-        evaluations = list()
-        metric_names = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'f1_weighted', 'f1_micro', 'f1_macro']
-
-        for index, row in results.iterrows():
-
-            evaluation = models.Evaluation()
-            if row['CV'] == False:
-                evaluation['@type'] = ['StaticCV', 'Evaluation']
-            evaluation.evaluatesOn = row['Dataset']
-            evaluation.evaluates = row['Model']
-            i = 0
-            for name in metric_names:
-                metric = models.Metric()
-                metric['@id'] = 'Metric' + str(i)
-                metric['@type'] = name.capitalize()
-                metric.value = row[name]
-                evaluation.metrics.append(metric)
-                i+=1
-            evaluations.append(evaluation)
-        return evaluations
-
 class MappingMixin(object):
 
     @property
@@ -605,3 +583,47 @@ def _from_loaded_module(module, info=None, **kwargs):
         yield cls(info=info, **kwargs)
     for instance in _instances_in_module(module):
         yield instance
+
+
+def evaluate(plugins, datasets, **kwargs):
+    ev = Eval(tuples=None,
+              datasets=datasets,
+              pipelines=[plugin.as_pipe() for plugin in plugins])
+    ev.evaluate()
+    results = ev.results
+    evaluations = evaluations_to_JSONLD(results, **kwargs)
+    return evaluations
+
+
+def evaluations_to_JSONLD(results, flatten=False):
+    '''
+    Map the evaluation results to a JSONLD scheme
+    '''
+
+    evaluations = list()
+    metric_names = ['accuracy', 'precision_macro', 'recall_macro',
+                    'f1_macro', 'f1_weighted', 'f1_micro', 'f1_macro']
+
+    for index, row in results.iterrows():
+        evaluation = models.Evaluation()
+        if row.get('CV', True):
+            evaluation['@type'] = ['StaticCV', 'Evaluation']
+        evaluation.evaluatesOn = row['Dataset']
+        evaluation.evaluates = row['Model']
+        i = 0
+        if flatten:
+            metric = models.Metric()
+            for name in metric_names:
+                metric[name] = row[name]
+            evaluation.metrics.append(metric)
+        else:
+            # We should probably discontinue this representation
+            for name in metric_names:
+                metric = models.Metric()
+                metric['@id'] = 'Metric' + str(i)
+                metric['@type'] = name.capitalize()
+                metric.value = row[name]
+                evaluation.metrics.append(metric)
+                i += 1
+        evaluations.append(evaluation)
+    return evaluations
diff --git a/senpy/schemas/definitions.json b/senpy/schemas/definitions.json
index b74e8d0..4db5e24 100644
--- a/senpy/schemas/definitions.json
+++ b/senpy/schemas/definitions.json
@@ -43,7 +43,7 @@
         "$ref": "response.json"
     },
     "AggregatedEvaluation": {
-        "$ref": "aggregatedevaluation.json"
+        "$ref": "aggregatedEvaluation.json"
     },
     "Evaluation": {
         "$ref": "evaluation.json"
diff --git a/tests/test_plugins.py b/tests/test_plugins.py
index 1ec08d2..7f42605 100644
--- a/tests/test_plugins.py
+++ b/tests/test_plugins.py
@@ -10,6 +10,8 @@ from senpy.models import Results, Entry, EmotionSet, Emotion, Plugins
 from senpy import plugins
 from senpy.plugins.conversion.emotion.centroids import CentroidConversion
 
+import pandas as pd
+
 
 class ShelfDummyPlugin(plugins.SentimentPlugin, plugins.ShelfMixin):
     '''Dummy plugin for tests.'''
@@ -212,7 +214,7 @@ class PluginsTest(TestCase):
             def input(self, entry, **kwargs):
                 return entry.text
 
-            def predict(self, input):
+            def predict_one(self, input):
                 return 'SIGN' in input
 
             def output(self, output, entry, **kwargs):
@@ -242,7 +244,7 @@ class PluginsTest(TestCase):
             mappings = {'happy': 'marl:Positive',
                         'sad': 'marl:Negative'}
 
-            def predict(self, input, **kwargs):
+            def predict_one(self, input, **kwargs):
                 return 'happy' if ':)' in input else 'sad'
 
             test_cases = [
@@ -309,6 +311,40 @@ class PluginsTest(TestCase):
         res = c._backwards_conversion(e)
         assert res["onyx:hasEmotionCategory"] == "c2"
 
+    def test_evaluation(self):
+        testdata = []
+        for i in range(50):
+            testdata.append(["good", 1])
+        for i in range(50):
+            testdata.append(["bad", 0])
+        dataset = pd.DataFrame(testdata, columns=['text', 'polarity'])
+
+        class DummyPlugin(plugins.TextBox):
+            description = 'Plugin to test evaluation'
+            version = 0
+
+            def predict_one(self, input):
+                return 0
+
+        class SmartPlugin(plugins.TextBox):
+            description = 'Plugin to test evaluation'
+            version = 0
+
+            def predict_one(self, input):
+                if input == 'good':
+                    return 1
+                return 0
+
+        dpipe = DummyPlugin()
+        results = plugins.evaluate(datasets={'testdata': dataset}, plugins=[dpipe], flatten=True)
+        dumb_metrics = results[0].metrics[0]
+        assert abs(dumb_metrics['accuracy'] - 0.5) < 0.01
+
+        spipe = SmartPlugin()
+        results = plugins.evaluate(datasets={'testdata': dataset}, plugins=[spipe], flatten=True)
+        smart_metrics = results[0].metrics[0]
+        assert abs(smart_metrics['accuracy'] - 1) < 0.01
+
 
 def make_mini_test(fpath):
     def mini_test(self):