mirror of
https://github.com/gsi-upm/senpy
synced 2024-09-28 17:01:43 +00:00
21a5a3f201
* Fixed Options for extra_params in UI * Enhanced meta-programming for models * Plugins can be imported from a python file if they're named `senpy_<whatever>.py` (no need for `.senpy` anymore!) * Add docstrings and tests to most plugins * Read plugin description from the docstring * Refactor code to get rid of unnecessary `.senpy`s * Load models, plugins and utils into the main namespace (see __init__.py) * Enhanced plugin development/experience with utils (easy_test, easy_serve) * Fix bug in check_template that wouldn't check objects * Make model defaults a private variable * Add option to list loaded plugins in CLI * Update docs
67 lines
1.9 KiB
Python
67 lines
1.9 KiB
Python
from senpy.plugins import AnalysisPlugin
|
|
from senpy.models import Entry
|
|
from nltk.tokenize.punkt import PunktSentenceTokenizer
|
|
from nltk.tokenize.simple import LineTokenizer
|
|
import nltk
|
|
|
|
|
|
class SplitPlugin(AnalysisPlugin):
    '''description: A sample plugin that chunks input text'''
    # NOTE: the docstring above is intentionally left verbatim -- it may be
    # consumed at runtime as the plugin description, so rewording it could
    # change what the plugin reports about itself.

    def activate(self):
        """Download the NLTK 'punkt' resource."""
        nltk.download('punkt')

    def analyse_entry(self, entry, params):
        """Split the text of ``entry`` and yield one new Entry per chunk.

        ``params["delimiter"]`` selects the tokenizer: ``"sentence"`` uses
        ``PunktSentenceTokenizer`` and ``"paragraph"`` uses ``LineTokenizer``.
        The text to split is read from ``entry['nif:isString']``.

        Every yielded Entry carries its chunk in ``'nif:isString'``. When the
        input entry has a truthy ``id``, the chunk's id is the input id
        followed by ``#char=<start>,<end>``, using the offsets reported by
        the tokenizer's ``span_tokenize``.

        Raises:
            KeyError: if ``params`` has no ``"delimiter"`` key.
            ValueError: if the delimiter is not one of the supported values.

        Because this is a generator, errors surface when iteration starts,
        not when the method is called.
        """
        chunker_type = params["delimiter"]
        original_text = entry['nif:isString']

        if chunker_type == "sentence":
            tokenizer = PunktSentenceTokenizer()
        elif chunker_type == "paragraph":
            tokenizer = LineTokenizer()
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on ``tokenizer``; fail with a clear message.
            raise ValueError(
                "Unknown delimiter {!r}: expected 'sentence' or "
                "'paragraph'".format(chunker_type))

        # Character offsets of each chunk, used only to build fragment ids.
        spans = list(tokenizer.span_tokenize(original_text))
        for i, chunk in enumerate(tokenizer.tokenize(original_text)):
            e = Entry()
            e['nif:isString'] = chunk
            if entry.id:
                start, end = spans[i]
                e.id = entry.id + "#char={},{}".format(start, end)
            yield e

    # Sample inputs with their expected outputs: one case per supported
    # delimiter, the second one also checking the generated fragment ids.
    test_cases = [
        {
            'entry': {
                'nif:isString': 'Hello. World.'
            },
            'params': {
                'delimiter': 'sentence',
            },
            'expected': [
                {
                    'nif:isString': 'Hello.'
                },
                {
                    'nif:isString': 'World.'
                }
            ]
        },
        {
            'entry': {
                "@id": ":test",
                'nif:isString': 'Hello\nWorld'
            },
            'params': {
                'delimiter': 'paragraph',
            },
            'expected': [
                {
                    "@id": ":test#char=0,5",
                    'nif:isString': 'Hello'
                },
                {
                    "@id": ":test#char=6,11",
                    'nif:isString': 'World'
                }
            ]
        }
    ]
|