mirror of
https://github.com/gsi-upm/senpy
synced 2025-09-18 12:32:21 +00:00
WIP simpler pipeline
@@ -76,7 +76,7 @@ class AnalysisPlugin(Plugin):
         Note that this method may yield an annotated entry or a list of
         entries (e.g. in a tokenizer)
         """
-        text = entry['text']
+        text = entry['nif:isString']
         params = copy.copy(parameters)
         params['input'] = text
         results = self.analyse(**params)
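This hunk is the compatibility shim in AnalysisPlugin: it now reads the input text from the entry's nif:isString field instead of a bare text key, feeds it to the legacy analyse(**params) API, and may yield one entry or several (as a tokenizer would). Below is a minimal sketch of how a generator-based plugin of this kind gets consumed; DummyPlugin, run_pipeline and the plain-dict entries are illustrative names only, not part of this commit:

# Illustrative sketch (assumed names, not from this commit): a plugin whose
# analyse_entry yields one or more entries, and a caller that drains the generator.

class DummyPlugin:
    def analyse_entry(self, entry, params):
        text = entry['nif:isString']        # read the input the same way as the hunk above
        for sentence in text.split('. '):   # naive chunking, stand-in for self.analyse(**params)
            yield {'nif:isString': sentence}


def run_pipeline(plugin, entries, params):
    """Collect every entry yielded by the plugin, whether one or many per input."""
    results = []
    for entry in entries:
        results.extend(plugin.analyse_entry(entry, params))
    return results


print(run_pipeline(DummyPlugin(), [{'nif:isString': 'Hello. World'}], {}))
# [{'nif:isString': 'Hello'}, {'nif:isString': 'World'}]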
senpy/plugins/misc/split.py (new file, 64 lines)
@@ -0,0 +1,64 @@
+from senpy.plugins import AnalysisPlugin
+from senpy.models import Entry
+from nltk.tokenize.punkt import PunktSentenceTokenizer
+from nltk.tokenize.simple import LineTokenizer
+import nltk
+
+
+class SplitPlugin(AnalysisPlugin):
+
+    def activate(self):
+        nltk.download('punkt')
+
+    def analyse_entry(self, entry, params):
+        chunker_type = params.get("delimiter", "sentence")
+        original_text = entry.get('nif:isString', None)
+        if chunker_type == "sentence":
+            tokenizer = PunktSentenceTokenizer()
+        if chunker_type == "paragraph":
+            tokenizer = LineTokenizer()
+        chars = tokenizer.span_tokenize(original_text)
+        for i, chunk in enumerate(tokenizer.tokenize(original_text)):
+            e = Entry()
+            e['nif:isString'] = chunk
+            if entry.id:
+                e.id = entry.id + "#char={},{}".format(chars[i][0], chars[i][1])
+            yield e
+
+    test_cases = [
+        {
+            'entry': {
+                'nif:isString': 'Hello. World.'
+            },
+            'params': {
+                'delimiter': 'sentence',
+            },
+            'expected': [
+                {
+                    'nif:isString': 'Hello.'
+                },
+                {
+                    'nif:isString': 'World.'
+                }
+            ]
+        },
+        {
+            'entry': {
+                "id": ":test",
+                'nif:isString': 'Hello. World.'
+            },
+            'params': {
+                'delimiter': 'sentence',
+            },
+            'expected': [
+                {
+                    "@id": ":test#char=0,6",
+                    'nif:isString': 'Hello.'
+                },
+                {
+                    "@id": ":test#char=7,13",
+                    'nif:isString': 'World.'
+                }
+            ]
+        }
+    ]
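SplitPlugin yields one new Entry per chunk, splitting on sentences (Punkt) or on lines, and when the parent entry has an id it appends a #char=start,end fragment with the chunk's character offsets. The snippet below reproduces that offset bookkeeping with nltk alone; note that span_tokenize returns a lazy generator in current NLTK releases, so it has to be materialized (e.g. with list(...)) before it can be indexed the way chars[i] is above. Everything besides the nltk calls is illustrative:

from nltk.tokenize.punkt import PunktSentenceTokenizer

# Illustrative: reproduce SplitPlugin's chunk + offset bookkeeping outside senpy.
text = 'Hello. World.'
tokenizer = PunktSentenceTokenizer()

spans = list(tokenizer.span_tokenize(text))   # materialize the generator before indexing
chunks = tokenizer.tokenize(text)

for (start, end), chunk in zip(spans, chunks):
    print('#char={},{}'.format(start, end), repr(chunk))
# #char=0,6 'Hello.'
# #char=7,13 'World.'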
senpy/plugins/misc/split.senpy (new file, 19 lines)
@@ -0,0 +1,19 @@
+---
+name: split
+module: senpy.plugins.misc.split
+description: A sample plugin that chunks input text
+author: "@militarpancho"
+version: '0.2'
+url: "https://github.com/gsi-upm/senpy"
+requirements:
+    - nltk
+extra_params:
+    delimiter:
+        aliases:
+            - type
+            - t
+        required: false
+        default: sentence
+        options:
+            - sentence
+            - paragraph
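The extra_params block is what exposes the delimiter option (aliases type and t, default sentence) that analyse_entry reads with params.get("delimiter", ...). Against a running senpy server it would travel as an ordinary query parameter; the endpoint URL below is an assumption for illustration, not something defined in this commit:

import requests

# Assumed local senpy instance; 'algo', 'i' and the declared 'delimiter'
# extra_param are passed as plain query parameters.
resp = requests.get('http://localhost:5000/api/', params={
    'algo': 'split',
    'i': 'Hello. World.',
    'delimiter': 'paragraph',   # or via its aliases 'type' / 't'
})
print(resp.json())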
@@ -12,7 +12,7 @@ class Sentiment140Plugin(SentimentPlugin):
             json.dumps({
                 "language": lang,
                 "data": [{
-                    "text": entry.nif__isString
+                    "text": entry['nif:isString']
                 }]
             }))
         p = params.get("prefix", None)
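This hunk moves Sentiment140 from attribute access (entry.nif__isString, where the double underscore stands in for the ':' of the JSON-LD key) to plain dict-style access, matching the rest of the commit. A toy class showing that naming convention in isolation (not senpy's actual model implementation):

class ToyEntry(dict):
    """Toy stand-in: attribute names with '__' map to ':'-separated dict keys."""
    def __getattr__(self, name):
        try:
            return self[name.replace('__', ':', 1)]
        except KeyError:
            raise AttributeError(name)


e = ToyEntry({'nif:isString': 'I love Titanic'})
assert e['nif:isString'] == e.nif__isString   # both spellings reach the same value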