diff --git a/Dockerfile.template b/Dockerfile.template
index 438b99f..1aa5236 100644
--- a/Dockerfile.template
+++ b/Dockerfile.template
@@ -20,8 +20,8 @@ ONBUILD WORKDIR /senpy-plugins/
WORKDIR /usr/src/app
-COPY test-requirements.txt requirements.txt /usr/src/app/
-RUN pip install --no-cache-dir --use-wheel -r test-requirements.txt -r requirements.txt
+COPY test-requirements.txt requirements.txt extra-requirements.txt /usr/src/app/
+RUN pip install --no-cache-dir -r test-requirements.txt -r requirements.txt -r extra-requirements.txt
COPY . /usr/src/app/
RUN pip install --no-cache-dir --no-index --no-deps --editable .
diff --git a/docs/senpy.rst b/docs/senpy.rst
index 67cdba7..bbdfb0d 100644
--- a/docs/senpy.rst
+++ b/docs/senpy.rst
@@ -1,8 +1,11 @@
What is Senpy?
--------------
-Web services can get really complex: data validation, user interaction, formatting, logging., etc.
-The figure below summarizes the typical features in an analysis service.
+Senpy is a framework for text analysis using Linked Data. There are three main applications of Senpy so far: sentiment and emotion analysis, user profiling and entity recognition. Annotations and Services are compliant with NIF (NLP Interchange Format).
+
+Senpy aims at providing a framework where analysis modules can be integrated easily as plugins, and providing a core functionality for managing tasks such as data validation, user interaction, formatting, logging, translation to linked data, etc.
+
+The figure below summarizes the typical features in a text analysis service.
Senpy implements all the common blocks, so developers can focus on what really matters: great analysis algorithms that solve real problems.
.. image:: senpy-framework.png
diff --git a/docs/vocabularies.rst b/docs/vocabularies.rst
index c47c69a..e55ad36 100644
--- a/docs/vocabularies.rst
+++ b/docs/vocabularies.rst
@@ -1,8 +1,24 @@
Vocabularies and model
======================
-The model used in Senpy is based on the following vocabularies:
+The model used in Senpy is based on NIF 2.0 [1], which defines a semantic format and API for improving interoperability among natural language processing services.
-* Marl, a vocabulary designed to annotate and describe subjetive opinions expressed on the web or in information systems.
-* Onyx, which is built one the same principles as Marl to annotate and describe emotions, and provides interoperability with Emotion Markup Language.
-* NIF 2.0, which defines a semantic format and APO for improving interoperability among natural language processing services
+Senpy has been applied to sentiment and emotion analysis services using the following vocabularies:
+
+* Marl [2,6], a vocabulary designed to annotate and describe subjective opinions expressed on the web or in information systems.
+* Onyx [3,5], which is built on the same principles as Marl to annotate and describe emotions, and provides interoperability with Emotion Markup Language.
+
+An overview of the vocabularies and their use can be found in [4].
+
+
+[1] Guidelines for developing NIF-based NLP services, Final Community Group Report 22 December 2015 Available at: https://www.w3.org/2015/09/bpmlod-reports/nif-based-nlp-webservices/
+
+[2] Marl Ontology Specification, available at http://www.gsi.dit.upm.es/ontologies/marl/
+
+[3] Onyx Ontology Specification, available at http://www.gsi.dit.upm.es/ontologies/onyx/
+
+[4] Iglesias, C. A., Sánchez-Rada, J. F., Vulcu, G., & Buitelaar, P. (2017). Linked Data Models for Sentiment and Emotion Analysis in Social Networks. In Sentiment Analysis in Social Networks (pp. 49-69).
+
+[5] Sánchez-Rada, J. F., & Iglesias, C. A. (2016). Onyx: A linked data approach to emotion representation. Information Processing & Management, 52(1), 99-114.
+
+[6] Westerski, A., Iglesias Fernandez, C. A., & Tapia Rico, F. (2011). Linked opinions: Describing sentiments on the structured web of data.
diff --git a/example-plugins/basic_box_plugin.py b/example-plugins/basic_box_plugin.py
index 0b85951..3b18cf9 100644
--- a/example-plugins/basic_box_plugin.py
+++ b/example-plugins/basic_box_plugin.py
@@ -18,7 +18,7 @@ class BasicBox(SentimentBox):
'default': 'marl:Neutral'
}
- def predict(self, input):
+ def predict_one(self, input):
output = basic.get_polarity(input)
return self.mappings.get(output, self.mappings['default'])
diff --git a/example-plugins/basic_plugin.py b/example-plugins/basic_plugin.py
index 35af16a..3c91e76 100644
--- a/example-plugins/basic_plugin.py
+++ b/example-plugins/basic_plugin.py
@@ -18,7 +18,7 @@ class Basic(MappingMixin, SentimentBox):
'default': 'marl:Neutral'
}
- def predict(self, input):
+ def predict_one(self, input):
return basic.get_polarity(input)
test_cases = [{
diff --git a/example-plugins/sklearn/pipeline_plugin.py b/example-plugins/sklearn/pipeline_plugin.py
index a8eca0d..29e8f36 100644
--- a/example-plugins/sklearn/pipeline_plugin.py
+++ b/example-plugins/sklearn/pipeline_plugin.py
@@ -18,7 +18,7 @@ class PipelineSentiment(MappingMixin, SentimentBox):
-1: 'marl:Negative'
}
- def predict(self, input):
+ def predict_one(self, input):
return pipeline.predict([input, ])[0]
test_cases = [
diff --git a/extra-requirements.txt b/extra-requirements.txt
new file mode 100644
index 0000000..4f16b72
--- /dev/null
+++ b/extra-requirements.txt
@@ -0,0 +1 @@
+gsitk
diff --git a/senpy/api.py b/senpy/api.py
index 358bb66..08a8a6d 100644
--- a/senpy/api.py
+++ b/senpy/api.py
@@ -53,6 +53,21 @@ API_PARAMS = {
}
}
+EVAL_PARAMS = {
+ "algorithm": {
+ "aliases": ["plug", "p", "plugins", "algorithms", 'algo', 'a', 'plugin'],
+ "description": "Plugins to be evaluated",
+ "required": True,
+ "help": "See activated plugins in /plugins"
+ },
+ "dataset": {
+ "aliases": ["datasets", "data", "d"],
+ "description": "Datasets to be evaluated",
+ "required": True,
+ "help": "See avalaible datasets in /datasets"
+ }
+}
+
PLUGINS_PARAMS = {
"plugin_type": {
"@id": "pluginType",
diff --git a/senpy/blueprints.py b/senpy/blueprints.py
index 6a4fbe0..c453665 100644
--- a/senpy/blueprints.py
+++ b/senpy/blueprints.py
@@ -19,7 +19,7 @@ Blueprints for Senpy
"""
from flask import (Blueprint, request, current_app, render_template, url_for,
jsonify)
-from .models import Error, Response, Help, Plugins, read_schema
+from .models import Error, Response, Help, Plugins, read_schema, Datasets
from . import api
from .version import __version__
from functools import wraps
@@ -133,6 +133,17 @@ def api_root():
req = api.parse_call(request.parameters)
return current_app.senpy.analyse(req)
+@api_blueprint.route('/evaluate/', methods=['POST', 'GET'])
+@basic_api
+def evaluate():
+ if request.parameters['help']:
+ dic = dict(api.EVAL_PARAMS)
+ response = Help(parameters=dic)
+ return response
+ else:
+ params = api.parse_params(request.parameters, api.EVAL_PARAMS)
+ response = current_app.senpy.evaluate(params)
+ return response
@api_blueprint.route('/plugins/', methods=['POST', 'GET'])
@basic_api
@@ -150,3 +161,12 @@ def plugins():
def plugin(plugin=None):
sp = current_app.senpy
return sp.get_plugin(plugin)
+
+
+@api_blueprint.route('/datasets/', methods=['POST','GET'])
+@basic_api
+def datasets():
+ sp = current_app.senpy
+ datasets = sp.datasets
+ dic = Datasets(datasets = list(datasets.values()))
+ return dic
\ No newline at end of file
diff --git a/senpy/client.py b/senpy/client.py
index ae1e375..892e77a 100644
--- a/senpy/client.py
+++ b/senpy/client.py
@@ -12,10 +12,17 @@ class Client(object):
def analyse(self, input, method='GET', **kwargs):
return self.request('/', method=method, input=input, **kwargs)
+ def evaluate(self, input, method='GET', **kwargs):
+ return self.request('/evaluate', method = method, input=input, **kwargs)
+
def plugins(self, *args, **kwargs):
resp = self.request(path='/plugins').plugins
return {p.name: p for p in resp}
+ def datasets(self):
+ resp = self.request(path='/datasets').datasets
+ return {d.name: d for d in resp}
+
def request(self, path=None, method='GET', **params):
url = '{}{}'.format(self.endpoint, path)
response = requests.request(method=method, url=url, params=params)
diff --git a/senpy/extensions.py b/senpy/extensions.py
index 462be2a..f3ec2b7 100644
--- a/senpy/extensions.py
+++ b/senpy/extensions.py
@@ -6,8 +6,8 @@ from future import standard_library
standard_library.install_aliases()
from . import plugins, api
-from .plugins import Plugin
-from .models import Error
+from .plugins import Plugin, evaluate
+from .models import Error, AggregatedEvaluation
from .blueprints import api_blueprint, demo_blueprint, ns_blueprint
from threading import Thread
@@ -17,12 +17,19 @@ import copy
import errno
import logging
+
logger = logging.getLogger(__name__)
+try:
+ from gsitk.datasets.datasets import DatasetManager
+ GSITK_AVAILABLE = True
+except ImportError:
+ logger.warn('GSITK is not installed. Some functions will be unavailable.')
+ GSITK_AVAILABLE = False
+
class Senpy(object):
""" Default Senpy extension for Flask """
-
def __init__(self,
app=None,
plugin_folder=".",
@@ -181,6 +188,55 @@ class Senpy(object):
results.analysis = [i['plugin'].id for i in results.analysis]
return results
+ def _get_datasets(self, request):
+ if not self.datasets:
+ raise Error(
+ status=404,
+ message=("No datasets found."
+ " Please verify DatasetManager"))
+ datasets_name = request.parameters.get('dataset', None).split(',')
+ for dataset in datasets_name:
+ if dataset not in self.datasets:
+ logger.debug(("The dataset '{}' is not valid\n"
+ "Valid datasets: {}").format(dataset,
+ self.datasets.keys()))
+ raise Error(
+ status=404,
+ message="The dataset '{}' is not valid".format(dataset))
+ dm = DatasetManager()
+ datasets = dm.prepare_datasets(datasets_name)
+ return datasets
+
+ @property
+ def datasets(self):
+ if not GSITK_AVAILABLE:
+ raise Exception('GSITK is not available. Install it to use this function.')
+ self._dataset_list = {}
+ dm = DatasetManager()
+ for item in dm.get_datasets():
+ for key in item:
+ if key in self._dataset_list:
+ continue
+ properties = item[key]
+ properties['@id'] = key
+ self._dataset_list[key] = properties
+ return self._dataset_list
+
+ def evaluate(self, params):
+ if not GSITK_AVAILABLE:
+ raise Exception('GSITK is not available. Install it to use this function.')
+ logger.debug("evaluating request: {}".format(params))
+ results = AggregatedEvaluation()
+ results.parameters = params
+ datasets = self._get_datasets(results)
+ plugins = self._get_plugins(results)
+ for eval in evaluate(plugins, datasets):
+ results.evaluations.append(eval)
+ if 'with_parameters' not in results.parameters:
+ del results.parameters
+ logger.debug("Returning evaluation result: {}".format(results))
+ return results
+
def _conversion_candidates(self, fromModel, toModel):
candidates = self.plugins(plugin_type='emotionConversionPlugin')
for candidate in candidates:
diff --git a/senpy/models.py b/senpy/models.py
index d8f6eb9..25636a0 100644
--- a/senpy/models.py
+++ b/senpy/models.py
@@ -335,5 +335,11 @@ for i in [
'results',
'sentimentPlugin',
'suggestion',
+ 'aggregatedEvaluation',
+ 'evaluation',
+ 'metric',
+ 'dataset',
+ 'datasets',
+
]:
_add_class_from_schema(i)
diff --git a/senpy/plugins/__init__.py b/senpy/plugins/__init__.py
index 6072597..621ae76 100644
--- a/senpy/plugins/__init__.py
+++ b/senpy/plugins/__init__.py
@@ -19,11 +19,22 @@ import importlib
import yaml
import threading
+import numpy as np
+
from .. import models, utils
from .. import api
+
logger = logging.getLogger(__name__)
+try:
+ from gsitk.evaluation.evaluation import Evaluation as Eval
+ from sklearn.pipeline import Pipeline
+ GSITK_AVAILABLE = True
+except ImportError:
+ logger.warn('GSITK is not installed. Some functions will be unavailable.')
+ GSITK_AVAILABLE = False
+
class PluginMeta(models.BaseMeta):
_classes = {}
@@ -251,7 +262,7 @@ class Box(AnalysisPlugin):
.. code-block::
- entry --> input() --> predict() --> output() --> entry'
+ entry --> input() --> predict_one() --> output() --> entry'
In other words: their ``input`` method convers a query (entry and a set of parameters) into
@@ -267,15 +278,33 @@ class Box(AnalysisPlugin):
'''Transforms the results of the black box into an entry'''
return output
- def predict(self, input):
+ def predict_one(self, input):
raise NotImplementedError('You should define the behavior of this plugin')
def analyse_entries(self, entries, params):
for entry in entries:
input = self.input(entry=entry, params=params)
- results = self.predict(input=input)
+ results = self.predict_one(input=input)
yield self.output(output=results, entry=entry, params=params)
+ def fit(self, X=None, y=None):
+ return self
+
+ def transform(self, X):
+ return np.array([self.predict_one(x) for x in X])
+
+ def predict(self, X):
+ return self.transform(X)
+
+ def fit_transform(self, X, y):
+ self.fit(X, y)
+ return self.transform(X)
+
+ def as_pipe(self):
+ pipe = Pipeline([('plugin', self)])
+ pipe.name = self.name
+ return pipe
+
class TextBox(Box):
'''A black box plugin that takes only text as input'''
@@ -438,7 +467,7 @@ def install_deps(*plugins):
for info in plugins:
requirements = info.get('requirements', [])
if requirements:
- pip_args = [sys.executable, '-m', 'pip', 'install', '--use-wheel']
+ pip_args = [sys.executable, '-m', 'pip', 'install']
for req in requirements:
pip_args.append(req)
logger.info('Installing requirements: ' + str(requirements))
@@ -560,3 +589,50 @@ def _from_loaded_module(module, info=None, **kwargs):
yield cls(info=info, **kwargs)
for instance in _instances_in_module(module):
yield instance
+
+
+def evaluate(plugins, datasets, **kwargs):
+ if not GSITK_AVAILABLE:
+ raise Exception('GSITK is not available. Install it to use this function.')
+
+ ev = Eval(tuples=None,
+ datasets=datasets,
+ pipelines=[plugin.as_pipe() for plugin in plugins])
+ ev.evaluate()
+ results = ev.results
+ evaluations = evaluations_to_JSONLD(results, **kwargs)
+ return evaluations
+
+
+def evaluations_to_JSONLD(results, flatten=False):
+ '''
+ Map the evaluation results to a JSONLD scheme
+ '''
+
+ evaluations = list()
+ metric_names = ['accuracy', 'precision_macro', 'recall_macro',
+ 'f1_macro', 'f1_weighted', 'f1_micro', 'f1_macro']
+
+ for index, row in results.iterrows():
+ evaluation = models.Evaluation()
+ if row.get('CV', True):
+ evaluation['@type'] = ['StaticCV', 'Evaluation']
+ evaluation.evaluatesOn = row['Dataset']
+ evaluation.evaluates = row['Model']
+ i = 0
+ if flatten:
+ metric = models.Metric()
+ for name in metric_names:
+ metric[name] = row[name]
+ evaluation.metrics.append(metric)
+ else:
+ # We should probably discontinue this representation
+ for name in metric_names:
+ metric = models.Metric()
+ metric['@id'] = 'Metric' + str(i)
+ metric['@type'] = name.capitalize()
+ metric.value = row[name]
+ evaluation.metrics.append(metric)
+ i += 1
+ evaluations.append(evaluation)
+ return evaluations
diff --git a/senpy/schemas/aggregatedEvaluation.json b/senpy/schemas/aggregatedEvaluation.json
new file mode 100644
index 0000000..4560aab
--- /dev/null
+++ b/senpy/schemas/aggregatedEvaluation.json
@@ -0,0 +1,38 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "allOf": [
+ {"$ref": "response.json"},
+ {
+ "title": "AggregatedEvaluation",
+ "description": "The results of the evaluation",
+ "type": "object",
+ "properties": {
+ "@context": {
+ "$ref": "context.json"
+ },
+ "@type": {
+ "default": "AggregatedEvaluation"
+ },
+ "@id": {
+ "description": "ID of the aggregated evaluation",
+ "type": "string"
+ },
+ "evaluations": {
+ "default": [],
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "$ref": "evaluation.json"
+ },{
+ "type": "string"
+ }
+ ]
+ }
+ }
+
+ },
+ "required": ["@id", "evaluations"]
+ }
+ ]
+}
diff --git a/senpy/schemas/dataset.json b/senpy/schemas/dataset.json
new file mode 100644
index 0000000..6786d8f
--- /dev/null
+++ b/senpy/schemas/dataset.json
@@ -0,0 +1,29 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "name": "Dataset",
+ "properties": {
+ "@id": {
+ "type": "string"
+ },
+ "name": {
+ "type": "string"
+ },
+ "compression": {
+ "type": "string"
+ },
+ "expected_bytes": {
+ "type": "int"
+ },
+ "filename": {
+ "description": "Name of the dataset",
+ "type": "string"
+ },
+ "url": {
+ "description": "Classifier or plugin evaluated",
+ "type": "string"
+ },
+ "stats": {
+ }
+ },
+ "required": ["@id"]
+}
diff --git a/senpy/schemas/datasets.json b/senpy/schemas/datasets.json
new file mode 100644
index 0000000..98bec82
--- /dev/null
+++ b/senpy/schemas/datasets.json
@@ -0,0 +1,18 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "allOf": [
+ {"$ref": "response.json"},
+ {
+ "required": ["datasets"],
+ "properties": {
+ "datasets": {
+ "type": "array",
+ "default": [],
+ "items": {
+ "$ref": "dataset.json"
+ }
+ }
+ }
+ }
+ ]
+}
diff --git a/senpy/schemas/definitions.json b/senpy/schemas/definitions.json
index 0b748d6..4db5e24 100644
--- a/senpy/schemas/definitions.json
+++ b/senpy/schemas/definitions.json
@@ -41,5 +41,20 @@
},
"Response": {
"$ref": "response.json"
+ },
+ "AggregatedEvaluation": {
+ "$ref": "aggregatedEvaluation.json"
+ },
+ "Evaluation": {
+ "$ref": "evaluation.json"
+ },
+ "Metric": {
+ "$ref": "metric.json"
+ },
+ "Dataset": {
+ "$ref": "dataset.json"
+ },
+ "Datasets": {
+ "$ref": "datasets.json"
}
}
diff --git a/senpy/schemas/evaluation.json b/senpy/schemas/evaluation.json
new file mode 100644
index 0000000..c8816e9
--- /dev/null
+++ b/senpy/schemas/evaluation.json
@@ -0,0 +1,28 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "name": "Evalation",
+ "properties": {
+ "@id": {
+ "type": "string"
+ },
+ "@type": {
+ "type": "array",
+ "default": "Evaluation"
+
+ },
+ "metrics": {
+ "type": "array",
+ "items": {"$ref": "metric.json" },
+ "default": []
+ },
+ "evaluatesOn": {
+ "description": "Name of the dataset evaluated ",
+ "type": "string"
+ },
+ "evaluates": {
+ "description": "Classifier or plugin evaluated",
+ "type": "string"
+ }
+ },
+ "required": ["@id", "metrics"]
+}
diff --git a/senpy/schemas/metric.json b/senpy/schemas/metric.json
new file mode 100644
index 0000000..842e099
--- /dev/null
+++ b/senpy/schemas/metric.json
@@ -0,0 +1,24 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "properties": {
+ "@id": {
+ "type": "string"
+ },
+ "@type": {
+ "type": "string"
+ },
+ "maxValue": {
+ "type": "number"
+ },
+ "minValue": {
+ "type": "number"
+ },
+ "value": {
+ "type": "number"
+ },
+ "deviation": {
+ "type": "number"
+ }
+ },
+ "required": ["@id"]
+}
diff --git a/senpy/static/js/main.js b/senpy/static/js/main.js
index be243a4..2499149 100644
--- a/senpy/static/js/main.js
+++ b/senpy/static/js/main.js
@@ -33,6 +33,10 @@ function get_plugins(response){
plugins = response.plugins;
}
+function get_datasets(response){
+ datasets = response.datasets
+}
+
function group_plugins(){
for (r in plugins){
ptype = plugins[r]['@type'];
@@ -77,7 +81,10 @@ function draw_plugins_selection(){
}
}
html += ""
- document.getElementById('plugins').innerHTML = html;
+ // Two elements with plugin class
+ // One from the evaluate tab and another one from the analyse tab
+ document.getElementsByClassName('plugin')[0].innerHTML = html;
+ document.getElementsByClassName('plugin')[1].innerHTML = html;
}
function draw_plugins_list(){
@@ -98,15 +105,29 @@ function draw_plugins_list(){
}
}
+function draw_datasets(){
+ html = "";
+ repeated_html = ""+datasets[dataset]["@id"];
+ html += "
"
+ }
+ document.getElementById("datasets").innerHTML = html;
+}
+
$(document).ready(function() {
var response = JSON.parse($.ajax({type: "GET", url: "/api/plugins/" , async: false}).responseText);
defaultPlugin= JSON.parse($.ajax({type: "GET", url: "/api/plugins/default" , async: false}).responseText);
+ var response2 = JSON.parse($.ajax({type: "GET", url: "/api/datasets/" , async: false}).responseText);
+
get_plugins(response);
get_default_parameters();
+ get_datasets(response2);
draw_plugins_list();
draw_plugins_selection();
draw_parameters();
+ draw_datasets();
$(window).on('hashchange', hashchanged);
hashchanged();
@@ -129,7 +150,7 @@ function draw_default_parameters(){
}
function draw_extra_parameters(){
- var plugin = document.getElementById("plugins").options[document.getElementById("plugins").selectedIndex].value;
+ var plugin = document.getElementsByClassName('plugin')[0].options[document.getElementsByClassName('plugin')[0].selectedIndex].value;
get_parameters();
var extra_params = document.getElementById("extra_params");
@@ -240,13 +261,16 @@ function add_param(key, value){
return "&"+key+"="+value;
}
+
function load_JSON(){
url = "/api";
var container = document.getElementById('results');
var rawcontainer = document.getElementById("jsonraw");
rawcontainer.innerHTML = '';
container.innerHTML = '';
- var plugin = document.getElementById("plugins").options[document.getElementById("plugins").selectedIndex].value;
+
+ var plugin = document.getElementsByClassName("plugin")[0].options[document.getElementsByClassName("plugin")[0].selectedIndex].value;
+
var input = encodeURIComponent(document.getElementById("input").value);
url += "?algo="+plugin+"&i="+input
@@ -278,3 +302,85 @@ function load_JSON(){
// location.hash = 'raw';
}
}
+
+function get_datasets_from_checkbox(){
+ var checks = document.getElementsByClassName("checks-datasets");
+
+ datasets = "";
+ for (var i = 0; i < checks.length; i++){
+ if (checks[i].checked){
+ datasets += checks[i].value + ",";
+ }
+ }
+ datasets = datasets.slice(0, -1);
+}
+
+
+function create_body_metrics(evaluations){
+ var new_tbody = document.createElement('tbody')
+ var metric_html = ""
+ for (var eval in evaluations){
+ metric_html += "