1
0
mirror of https://github.com/gsi-upm/senpy synced 2024-11-23 08:32:29 +00:00

Avoid duplication in split plugin

This commit is contained in:
J. Fernando Sánchez 2018-08-20 14:07:33 +02:00
parent 6dd4a44924
commit aa35e62a27

View File

@ -9,7 +9,7 @@ class Split(AnalysisPlugin):
'''description: A sample plugin that chunks input text''' '''description: A sample plugin that chunks input text'''
author = ["@militarpancho", '@balkian'] author = ["@militarpancho", '@balkian']
version = '0.2' version = '0.3'
url = "https://github.com/gsi-upm/senpy" url = "https://github.com/gsi-upm/senpy"
extra_params = { extra_params = {
@ -33,12 +33,15 @@ class Split(AnalysisPlugin):
if chunker_type == "paragraph": if chunker_type == "paragraph":
tokenizer = LineTokenizer() tokenizer = LineTokenizer()
chars = list(tokenizer.span_tokenize(original_text)) chars = list(tokenizer.span_tokenize(original_text))
for i, chunk in enumerate(tokenizer.tokenize(original_text)): if len(chars) == 1:
print(chunk) # This sentence was already split
return
for i, chunk in enumerate(chars):
start, end = chunk
e = Entry() e = Entry()
e['nif:isString'] = chunk e['nif:isString'] = original_text[start:end]
if entry.id: if entry.id:
e.id = entry.id + "#char={},{}".format(chars[i][0], chars[i][1]) e.id = entry.id + "#char={},{}".format(start, end)
yield e yield e
test_cases = [ test_cases = [