pre-1.0
militarpancho 7 years ago
parent 694201d8d3
commit 55be0e57da

@ -12,13 +12,14 @@ class SplitPlugin(AnalysisPlugin):
def analyse_entry(self, entry, params): def analyse_entry(self, entry, params):
chunker_type = params.get("delimiter", "sentence") chunker_type = params.get("delimiter", "sentence")
original_text = entry.get('nif:isString', None) original_text = entry['nif:isString']
if chunker_type == "sentence": if chunker_type == "sentence":
tokenizer = PunktSentenceTokenizer() tokenizer = PunktSentenceTokenizer()
if chunker_type == "paragraph": if chunker_type == "paragraph":
tokenizer = LineTokenizer() tokenizer = LineTokenizer()
chars = tokenizer.span_tokenize(original_text) chars = list(tokenizer.span_tokenize(original_text))
for i, chunk in enumerate(tokenizer.tokenize(original_text)): for i, chunk in enumerate(tokenizer.tokenize(original_text)):
print(chunk)
e = Entry() e = Entry()
e['nif:isString'] = chunk e['nif:isString'] = chunk
if entry.id: if entry.id:
@ -45,19 +46,19 @@ class SplitPlugin(AnalysisPlugin):
{ {
'entry': { 'entry': {
"id": ":test", "id": ":test",
'nif:isString': 'Hello. World.' 'nif:isString': 'Hello\nWorld'
}, },
'params': { 'params': {
'delimiter': 'sentence', 'delimiter': 'paragraph',
}, },
'expected': [ 'expected': [
{ {
"@id": ":test#char=0,6", "@id": ":test#char=0,5",
'nif:isString': 'Hello.' 'nif:isString': 'Hello'
}, },
{ {
"@id": ":test#char=7,13", "@id": ":test#char=6,11",
'nif:isString': 'World.' 'nif:isString': 'World'
} }
] ]
} }

Loading…
Cancel
Save