2017-06-21 17:58:18 +00:00
|
|
|
from senpy.plugins import AnalysisPlugin
|
|
|
|
from senpy.models import Entry
|
|
|
|
from nltk.tokenize.punkt import PunktSentenceTokenizer
|
|
|
|
from nltk.tokenize.simple import LineTokenizer
|
|
|
|
import nltk
|
|
|
|
|
|
|
|
|
|
|
|
class SplitPlugin(AnalysisPlugin):
    '''description: A sample plugin that chunks input text'''
    # NOTE(review): the docstring above uses a "key: value" form and is kept
    # byte-for-byte; presumably senpy reads it as plugin metadata -- confirm
    # before rewording it.

    def activate(self):
        """Fetch the NLTK ``punkt`` resource via ``nltk.download``."""
        nltk.download('punkt')

    def analyse_entry(self, entry, params):
        """Split the text of ``entry`` into chunks and yield one Entry each.

        Args:
            entry: input entry. Its text is read from
                ``entry['nif:isString']`` and its identifier from
                ``entry.id``.
            params: mapping with a ``"delimiter"`` key, either
                ``"sentence"`` (uses ``PunktSentenceTokenizer``) or
                ``"paragraph"`` (uses ``LineTokenizer``).

        Yields:
            A new ``Entry`` per chunk, with the chunk text stored under
            ``'nif:isString'``. When the input entry has a truthy ``id``,
            the chunk id is that id followed by ``#char=<start>,<end>``,
            where the offsets come from the tokenizer's ``span_tokenize``.

        Raises:
            ValueError: if ``params["delimiter"]`` is neither
                ``"sentence"`` nor ``"paragraph"``. Because this method is
                a generator, the error surfaces on the first iteration.
        """
        chunker_type = params["delimiter"]
        original_text = entry['nif:isString']

        if chunker_type == "sentence":
            tokenizer = PunktSentenceTokenizer()
        elif chunker_type == "paragraph":
            tokenizer = LineTokenizer()
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # the first use of ``tokenizer`` below.
            raise ValueError(
                "Unsupported delimiter {!r}: expected 'sentence' or "
                "'paragraph'".format(chunker_type))

        # Character offsets of the chunks; only consulted when the input
        # entry has an id to derive fragment ids from.
        # NOTE(review): spans and tokens are paired by position; this assumes
        # span_tokenize() and tokenize() agree on ordering -- confirm for
        # inputs with blank or whitespace-only lines.
        spans = list(tokenizer.span_tokenize(original_text))
        for i, chunk in enumerate(tokenizer.tokenize(original_text)):
            e = Entry()
            e['nif:isString'] = chunk
            if entry.id:
                start, end = spans[i]
                e.id = entry.id + "#char={},{}".format(start, end)
            yield e

    # Sample inputs with the chunks each one is expected to produce.
    test_cases = [
        {
            'entry': {
                'nif:isString': 'Hello. World.'
            },
            'params': {
                'delimiter': 'sentence',
            },
            'expected': [
                {
                    'nif:isString': 'Hello.'
                },
                {
                    'nif:isString': 'World.'
                }
            ]
        },
        {
            'entry': {
                "@id": ":test",
                'nif:isString': 'Hello\nWorld'
            },
            'params': {
                'delimiter': 'paragraph',
            },
            'expected': [
                {
                    "@id": ":test#char=0,5",
                    'nif:isString': 'Hello'
                },
                {
                    "@id": ":test#char=6,11",
                    'nif:isString': 'World'
                }
            ]
        }
    ]
|