From aa35e62a2724f9a3c33765d88de01fe8e44ba5f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Fernando=20S=C3=A1nchez?= Date: Mon, 20 Aug 2018 14:07:33 +0200 Subject: [PATCH] Avoid duplication in split plugin --- senpy/plugins/misc/split_plugin.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/senpy/plugins/misc/split_plugin.py b/senpy/plugins/misc/split_plugin.py index 4c11f3a..d54a6fb 100644 --- a/senpy/plugins/misc/split_plugin.py +++ b/senpy/plugins/misc/split_plugin.py @@ -9,7 +9,7 @@ class Split(AnalysisPlugin): '''description: A sample plugin that chunks input text''' author = ["@militarpancho", '@balkian'] - version = '0.2' + version = '0.3' url = "https://github.com/gsi-upm/senpy" extra_params = { @@ -33,12 +33,15 @@ class Split(AnalysisPlugin): if chunker_type == "paragraph": tokenizer = LineTokenizer() chars = list(tokenizer.span_tokenize(original_text)) - for i, chunk in enumerate(tokenizer.tokenize(original_text)): - print(chunk) + if len(chars) == 1: + # This sentence was already split + return + for i, chunk in enumerate(chars): + start, end = chunk e = Entry() - e['nif:isString'] = chunk + e['nif:isString'] = original_text[start:end] if entry.id: - e.id = entry.id + "#char={},{}".format(chars[i][0], chars[i][1]) + e.id = entry.id + "#char={},{}".format(start, end) yield e test_cases = [