1
0
mirror of https://github.com/gsi-upm/senpy synced 2025-09-18 12:32:21 +00:00

WIP simpler pipeline

This commit is contained in:
J. Fernando Sánchez
2017-06-21 19:58:18 +02:00
parent fca0ac00c4
commit a243f68bfc
19 changed files with 369 additions and 227 deletions

View File

@@ -76,7 +76,7 @@ class AnalysisPlugin(Plugin):
Note that this method may yield an annotated entry or a list of
entries (e.g. in a tokenizer)
"""
text = entry['text']
text = entry['nif:isString']
params = copy.copy(parameters)
params['input'] = text
results = self.analyse(**params)

View File

@@ -0,0 +1,64 @@
from senpy.plugins import AnalysisPlugin
from senpy.models import Entry
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.simple import LineTokenizer
import nltk
class SplitPlugin(AnalysisPlugin):
def activate(self):
nltk.download('punkt')
def analyse_entry(self, entry, params):
chunker_type = params.get("delimiter", "sentence")
original_text = entry.get('nif:isString', None)
if chunker_type == "sentence":
tokenizer = PunktSentenceTokenizer()
if chunker_type == "paragraph":
tokenizer = LineTokenizer()
chars = tokenizer.span_tokenize(original_text)
for i, chunk in enumerate(tokenizer.tokenize(original_text)):
e = Entry()
e['nif:isString'] = chunk
if entry.id:
e.id = entry.id + "#char={},{}".format(chars[i][0], chars[i][1])
yield e
test_cases = [
{
'entry': {
'nif:isString': 'Hello. World.'
},
'params': {
'delimiter': 'sentence',
},
'expected': [
{
'nif:isString': 'Hello.'
},
{
'nif:isString': 'World.'
}
]
},
{
'entry': {
"id": ":test",
'nif:isString': 'Hello. World.'
},
'params': {
'delimiter': 'sentence',
},
'expected': [
{
"@id": ":test#char=0,6",
'nif:isString': 'Hello.'
},
{
"@id": ":test#char=7,13",
'nif:isString': 'World.'
}
]
}
]

View File

@@ -0,0 +1,19 @@
---
name: split
module: senpy.plugins.misc.split
description: A sample plugin that chunks input text
author: "@militarpancho"
version: '0.2'
url: "https://github.com/gsi-upm/senpy"
requirements:
- nltk
extra_params:
delimiter:
aliases:
- type
- t
required: false
default: sentence
options:
- sentence
- paragraph

View File

@@ -12,7 +12,7 @@ class Sentiment140Plugin(SentimentPlugin):
json.dumps({
"language": lang,
"data": [{
"text": entry.nif__isString
"text": entry['nif:isString']
}]
}))
p = params.get("prefix", None)