1
0
Mirror of https://github.com/gsi-upm/senpy, synced 2024-11-22 00:02:28 +00:00

Merge branch 'split-fix'

Fix #48
This commit is contained in:
J. Fernando Sánchez 2017-12-13 12:30:59 +01:00
commit 869c00f709

View File

@ -12,13 +12,14 @@ class SplitPlugin(AnalysisPlugin):
def analyse_entry(self, entry, params): def analyse_entry(self, entry, params):
chunker_type = params.get("delimiter", "sentence") chunker_type = params.get("delimiter", "sentence")
original_text = entry.get('nif:isString', None) original_text = entry['nif:isString']
if chunker_type == "sentence": if chunker_type == "sentence":
tokenizer = PunktSentenceTokenizer() tokenizer = PunktSentenceTokenizer()
if chunker_type == "paragraph": if chunker_type == "paragraph":
tokenizer = LineTokenizer() tokenizer = LineTokenizer()
chars = tokenizer.span_tokenize(original_text) chars = list(tokenizer.span_tokenize(original_text))
for i, chunk in enumerate(tokenizer.tokenize(original_text)): for i, chunk in enumerate(tokenizer.tokenize(original_text)):
print(chunk)
e = Entry() e = Entry()
e['nif:isString'] = chunk e['nif:isString'] = chunk
if entry.id: if entry.id:
@ -45,19 +46,19 @@ class SplitPlugin(AnalysisPlugin):
{ {
'entry': { 'entry': {
"id": ":test", "id": ":test",
'nif:isString': 'Hello. World.' 'nif:isString': 'Hello\nWorld'
}, },
'params': { 'params': {
'delimiter': 'sentence', 'delimiter': 'paragraph',
}, },
'expected': [ 'expected': [
{ {
"@id": ":test#char=0,6", "@id": ":test#char=0,5",
'nif:isString': 'Hello.' 'nif:isString': 'Hello'
}, },
{ {
"@id": ":test#char=7,13", "@id": ":test#char=6,11",
'nif:isString': 'World.' 'nif:isString': 'World'
} }
] ]
} }