# senpy/plugins/misc/split.py (mirror of https://github.com/gsi-upm/senpy)
from senpy.plugins import AnalysisPlugin
from senpy.models import Entry

from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.simple import LineTokenizer
import nltk


class SplitPlugin(AnalysisPlugin):
    '''Split each entry into several entries, either by sentence or by paragraph.'''

    def activate(self):
        # PunktSentenceTokenizer needs the pre-trained 'punkt' models
        nltk.download('punkt')

    def analyse_entry(self, entry, params):
        chunker_type = params.get("delimiter", "sentence")
        original_text = entry['nif:isString']
        if chunker_type == "sentence":
            tokenizer = PunktSentenceTokenizer()
        if chunker_type == "paragraph":
            tokenizer = LineTokenizer()
        # Character offsets of every chunk, used to build NIF-style fragment ids
        chars = list(tokenizer.span_tokenize(original_text))
        for i, chunk in enumerate(tokenizer.tokenize(original_text)):
            print(chunk)
            e = Entry()
            e['nif:isString'] = chunk
            if entry.id:
                e.id = entry.id + "#char={},{}".format(chars[i][0], chars[i][1])
            yield e

    test_cases = [
        {
            'entry': {
                'nif:isString': 'Hello. World.'
            },
            'params': {
                'delimiter': 'sentence',
            },
            'expected': [
                {
                    'nif:isString': 'Hello.'
                },
                {
                    'nif:isString': 'World.'
                }
            ]
        },
        {
            'entry': {
                "id": ":test",
                'nif:isString': 'Hello\nWorld'
            },
            'params': {
                'delimiter': 'paragraph',
            },
            'expected': [
                {
                    "@id": ":test#char=0,5",
                    'nif:isString': 'Hello'
                },
                {
                    "@id": ":test#char=6,11",
                    'nif:isString': 'World'
                }
            ]
        }
    ]
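

# --- Usage sketch (not part of the original file) ---
# Senpy can exercise a plugin's `test_cases` directly; the `easy_test` helper and
# its import path below are assumptions and may differ between senpy versions.
if __name__ == '__main__':
    from senpy.utils import easy_test
    easy_test()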