mirror of
https://github.com/gsi-upm/senpy
synced 2024-11-23 08:32:29 +00:00
Avoid duplication in split plugin
This commit is contained in:
parent
6dd4a44924
commit
aa35e62a27
@@ -9,7 +9,7 @@ class Split(AnalysisPlugin):
|
|||||||
'''description: A sample plugin that chunks input text'''
|
'''description: A sample plugin that chunks input text'''
|
||||||
|
|
||||||
author = ["@militarpancho", '@balkian']
|
author = ["@militarpancho", '@balkian']
|
||||||
version = '0.2'
|
version = '0.3'
|
||||||
url = "https://github.com/gsi-upm/senpy"
|
url = "https://github.com/gsi-upm/senpy"
|
||||||
|
|
||||||
extra_params = {
|
extra_params = {
|
||||||
@@ -33,12 +33,15 @@ class Split(AnalysisPlugin):
|
|||||||
if chunker_type == "paragraph":
|
if chunker_type == "paragraph":
|
||||||
tokenizer = LineTokenizer()
|
tokenizer = LineTokenizer()
|
||||||
chars = list(tokenizer.span_tokenize(original_text))
|
chars = list(tokenizer.span_tokenize(original_text))
|
||||||
for i, chunk in enumerate(tokenizer.tokenize(original_text)):
|
if len(chars) == 1:
|
||||||
print(chunk)
|
# This sentence was already split
|
||||||
|
return
|
||||||
|
for i, chunk in enumerate(chars):
|
||||||
|
start, end = chunk
|
||||||
e = Entry()
|
e = Entry()
|
||||||
e['nif:isString'] = chunk
|
e['nif:isString'] = original_text[start:end]
|
||||||
if entry.id:
|
if entry.id:
|
||||||
e.id = entry.id + "#char={},{}".format(chars[i][0], chars[i][1])
|
e.id = entry.id + "#char={},{}".format(start, end)
|
||||||
yield e
|
yield e
|
||||||
|
|
||||||
test_cases = [
|
test_cases = [
|
||||||
|
Loading…
Reference in New Issue
Block a user