Add evaluation tests

44-add-basic-evaluation-with-gsitk
J. Fernando Sánchez 6 years ago
parent 5e2ada1654
commit c0aa7ddc3c

@@ -18,7 +18,7 @@ class BasicBox(SentimentBox):
         'default': 'marl:Neutral'
     }

-    def predict(self, input):
+    def predict_one(self, input):
        output = basic.get_polarity(input)
        return self.mappings.get(output, self.mappings['default'])

@@ -18,7 +18,7 @@ class Basic(MappingMixin, SentimentBox):
         'default': 'marl:Neutral'
     }

-    def predict(self, input):
+    def predict_one(self, input):
        return basic.get_polarity(input)

    test_cases = [{

@@ -18,7 +18,7 @@ class PipelineSentiment(MappingMixin, SentimentBox):
         -1: 'marl:Negative'
     }

-    def predict(self, input):
+    def predict_one(self, input):
        return pipeline.predict([input, ])[0]

    test_cases = [
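Note: the hunks above all make the same change. Box-style plugins now implement `predict_one` (one label per input) instead of `predict`, which the `Box` base class repurposes further down for batch, scikit-learn-style prediction. As a quick illustration of the new convention, here is a minimal sentiment plugin sketch; the `ToyBox` name and its polarity rule are illustrative, not part of this commit:

    from senpy.plugins import SentimentBox

    class ToyBox(SentimentBox):
        '''Illustrative plugin following the renamed predict_one hook.'''
        description = 'Toy plugin, not part of this commit'
        version = 0

        mappings = {
            'pos': 'marl:Positive',
            'neg': 'marl:Negative',
            'default': 'marl:Neutral'
        }

        def predict_one(self, input):
            # One text in, one polarity out, mapped like BasicBox above
            output = 'pos' if ':)' in input else 'neg'
            return self.mappings.get(output, self.mappings['default'])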

@@ -6,7 +6,7 @@ from future import standard_library
 standard_library.install_aliases()

 from . import plugins, api
-from .plugins import Plugin
+from .plugins import Plugin, evaluate
 from .models import Error, AggregatedEvaluation
 from .blueprints import api_blueprint, demo_blueprint, ns_blueprint
@@ -17,7 +17,6 @@ import copy
 import errno
 import logging

-#Correct this import for managing the datasets
 from gsitk.datasets.datasets import DatasetManager
@@ -197,13 +196,13 @@ class Senpy(object):
             if dataset not in self.datasets:
                 logger.debug(("The dataset '{}' is not valid\n"
                               "Valid datasets: {}").format(dataset,
                                                            self.datasets.keys()))
                 raise Error(
                     status=404,
                     message="The dataset '{}' is not valid".format(dataset))
         datasets = self._dm.prepare_datasets(datasets_name)
         return datasets

     @property
     def datasets(self):
         self._dataset_list = {}
@@ -219,29 +218,17 @@ class Senpy(object):
     def evaluate(self, params):
         logger.debug("evaluating request: {}".format(params))
-        try:
-            results = AggregatedEvaluation()
-            results.parameters = params
-            datasets = self._get_datasets(results)
-            plugins = self._get_plugins(results)
-            collector = list()
-            for plugin in plugins:
-                for eval in plugin.score(datasets):
-                    results.evaluations.append(eval)
-            if 'with_parameters' not in results.parameters:
-                del results.parameters
-            logger.debug("Returning evaluation result: {}".format(results))
-        except (Error,Exception) as ex:
-            if not isinstance(ex, Error):
-                msg = "Error during evaluation: {} \n\t{}".format(ex,
-                    traceback.format_exc())
-                ex = Error(message=msg, status=500)
-            logger.exception('Error returning evaluation result')
-            raise ex
-        #results.evaluations = collector
+        results = AggregatedEvaluation()
+        results.parameters = params
+        datasets = self._get_datasets(results)
+        plugins = self._get_plugins(results)
+        for eval in evaluate(plugins, datasets):
+            results.evaluations.append(eval)
+        if 'with_parameters' not in results.parameters:
+            del results.parameters
+        logger.debug("Returning evaluation result: {}".format(results))
         return results

     def _conversion_candidates(self, fromModel, toModel):
         candidates = self.plugins(plugin_type='emotionConversionPlugin')
         for candidate in candidates:

@@ -25,6 +25,8 @@ from .. import api
 from gsitk.evaluation.evaluation import Evaluation as Eval
 from sklearn.pipeline import Pipeline
+import numpy as np

 logger = logging.getLogger(__name__)
@@ -254,7 +256,7 @@ class Box(AnalysisPlugin):
     .. code-block::

-        entry --> input() --> predict() --> output() --> entry'
+        entry --> input() --> predict_one() --> output() --> entry'

     In other words: their ``input`` method converts a query (entry and a set of parameters) into
@@ -270,15 +272,33 @@ class Box(AnalysisPlugin):
         '''Transforms the results of the black box into an entry'''
         return output

-    def predict(self, input):
+    def predict_one(self, input):
         raise NotImplementedError('You should define the behavior of this plugin')

     def analyse_entries(self, entries, params):
         for entry in entries:
             input = self.input(entry=entry, params=params)
-            results = self.predict(input=input)
+            results = self.predict_one(input=input)
             yield self.output(output=results, entry=entry, params=params)

+    def fit(self, X=None, y=None):
+        return self
+
+    def transform(self, X):
+        return np.array([self.predict_one(x) for x in X])
+
+    def predict(self, X):
+        return self.transform(X)
+
+    def fit_transform(self, X, y):
+        self.fit(X, y)
+        return self.transform(X)
+
+    def as_pipe(self):
+        pipe = Pipeline([('plugin', self)])
+        pipe.name = self.name
+        return pipe
+

 class TextBox(Box):
     '''A black box plugin that takes only text as input'''
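The block of added methods gives every `Box` plugin a scikit-learn-compatible surface: `transform` and `predict` map `predict_one` over a sequence and return a numpy array, and `as_pipe` wraps the plugin as a single-step `sklearn.pipeline.Pipeline`, which is the shape gsitk's evaluator consumes. A rough usage sketch, reusing the illustrative `ToyBox` plugin from earlier (not part of the commit):

    toy = ToyBox()

    # Batch interface: one mapped label per input text, as a numpy array
    labels = toy.predict(['nice :)', 'this is broken'])

    # gsitk-style interface: a one-step Pipeline named after the plugin
    pipe = toy.as_pipe()
    same_labels = pipe.transform(['nice :)', 'this is broken'])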
@@ -323,48 +343,6 @@ class EmotionBox(TextBox, EmotionPlugin):
         return entry

-class EvaluationBox():
-    '''
-    A box plugin where it is implemented the evaluation. It is necessary to have a pipeline.
-    '''
-
-    def score(self, datasets):
-        pipelines = [self._pipeline]
-        ev = Eval(tuples = None,
-                  datasets = datasets,
-                  pipelines = pipelines)
-        ev.evaluate()
-        results = ev.results
-        evaluations = self._evaluations_toJSONLD(results)
-        return evaluations
-
-    def _evaluations_toJSONLD(self, results):
-        '''
-        Map the evaluation results to a JSONLD scheme
-        '''
-        evaluations = list()
-        metric_names = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'f1_weighted', 'f1_micro', 'f1_macro']
-        for index, row in results.iterrows():
-            evaluation = models.Evaluation()
-            if row['CV'] == False:
-                evaluation['@type'] = ['StaticCV', 'Evaluation']
-            evaluation.evaluatesOn = row['Dataset']
-            evaluation.evaluates = row['Model']
-            i = 0
-            for name in metric_names:
-                metric = models.Metric()
-                metric['@id'] = 'Metric' + str(i)
-                metric['@type'] = name.capitalize()
-                metric.value = row[name]
-                evaluation.metrics.append(metric)
-                i+=1
-            evaluations.append(evaluation)
-        return evaluations
-

 class MappingMixin(object):

     @property
@@ -605,3 +583,47 @@ def _from_loaded_module(module, info=None, **kwargs):
         yield cls(info=info, **kwargs)
     for instance in _instances_in_module(module):
         yield instance
+
+
+def evaluate(plugins, datasets, **kwargs):
+    ev = Eval(tuples=None,
+              datasets=datasets,
+              pipelines=[plugin.as_pipe() for plugin in plugins])
+    ev.evaluate()
+    results = ev.results
+    evaluations = evaluations_to_JSONLD(results, **kwargs)
+    return evaluations
+
+
+def evaluations_to_JSONLD(results, flatten=False):
+    '''
+    Map the evaluation results to a JSONLD scheme
+    '''
+    evaluations = list()
+    metric_names = ['accuracy', 'precision_macro', 'recall_macro',
+                    'f1_macro', 'f1_weighted', 'f1_micro', 'f1_macro']
+    for index, row in results.iterrows():
+        evaluation = models.Evaluation()
+        if row.get('CV', True):
+            evaluation['@type'] = ['StaticCV', 'Evaluation']
+        evaluation.evaluatesOn = row['Dataset']
+        evaluation.evaluates = row['Model']
+        i = 0
+        if flatten:
+            metric = models.Metric()
+            for name in metric_names:
+                metric[name] = row[name]
+            evaluation.metrics.append(metric)
+        else:
+            # We should probably discontinue this representation
+            for name in metric_names:
+                metric = models.Metric()
+                metric['@id'] = 'Metric' + str(i)
+                metric['@type'] = name.capitalize()
+                metric.value = row[name]
+                evaluation.metrics.append(metric)
+                i += 1
+        evaluations.append(evaluation)
+    return evaluations

@@ -43,7 +43,7 @@
       "$ref": "response.json"
     },
     "AggregatedEvaluation": {
-      "$ref": "aggregatedevaluation.json"
+      "$ref": "aggregatedEvaluation.json"
     },
     "Evaluation": {
       "$ref": "evaluation.json"

@@ -10,6 +10,8 @@ from senpy.models import Results, Entry, EmotionSet, Emotion, Plugins
 from senpy import plugins
 from senpy.plugins.conversion.emotion.centroids import CentroidConversion

+import pandas as pd
+

 class ShelfDummyPlugin(plugins.SentimentPlugin, plugins.ShelfMixin):
     '''Dummy plugin for tests.'''
@@ -212,7 +214,7 @@ class PluginsTest(TestCase):
             def input(self, entry, **kwargs):
                 return entry.text

-            def predict(self, input):
+            def predict_one(self, input):
                 return 'SIGN' in input

             def output(self, output, entry, **kwargs):
@@ -242,7 +244,7 @@ class PluginsTest(TestCase):
             mappings = {'happy': 'marl:Positive', 'sad': 'marl:Negative'}

-            def predict(self, input, **kwargs):
+            def predict_one(self, input, **kwargs):
                 return 'happy' if ':)' in input else 'sad'

             test_cases = [
@@ -309,6 +311,40 @@ class PluginsTest(TestCase):
         res = c._backwards_conversion(e)
         assert res["onyx:hasEmotionCategory"] == "c2"

+    def test_evaluation(self):
+        testdata = []
+        for i in range(50):
+            testdata.append(["good", 1])
+        for i in range(50):
+            testdata.append(["bad", 0])
+        dataset = pd.DataFrame(testdata, columns=['text', 'polarity'])
+
+        class DummyPlugin(plugins.TextBox):
+            description = 'Plugin to test evaluation'
+            version = 0
+
+            def predict_one(self, input):
+                return 0
+
+        class SmartPlugin(plugins.TextBox):
+            description = 'Plugin to test evaluation'
+            version = 0
+
+            def predict_one(self, input):
+                if input == 'good':
+                    return 1
+                return 0
+
+        dpipe = DummyPlugin()
+        results = plugins.evaluate(datasets={'testdata': dataset}, plugins=[dpipe], flatten=True)
+        dumb_metrics = results[0].metrics[0]
+        assert abs(dumb_metrics['accuracy'] - 0.5) < 0.01
+
+        spipe = SmartPlugin()
+        results = plugins.evaluate(datasets={'testdata': dataset}, plugins=[spipe], flatten=True)
+        smart_metrics = results[0].metrics[0]
+        assert abs(smart_metrics['accuracy'] - 1) < 0.01
+

 def make_mini_test(fpath):
     def mini_test(self):
