mirror of
https://github.com/gsi-upm/senpy
synced 2024-11-22 08:12:27 +00:00
commit
869c00f709
@ -12,13 +12,14 @@ class SplitPlugin(AnalysisPlugin):
|
|||||||
|
|
||||||
def analyse_entry(self, entry, params):
|
def analyse_entry(self, entry, params):
|
||||||
chunker_type = params.get("delimiter", "sentence")
|
chunker_type = params.get("delimiter", "sentence")
|
||||||
original_text = entry.get('nif:isString', None)
|
original_text = entry['nif:isString']
|
||||||
if chunker_type == "sentence":
|
if chunker_type == "sentence":
|
||||||
tokenizer = PunktSentenceTokenizer()
|
tokenizer = PunktSentenceTokenizer()
|
||||||
if chunker_type == "paragraph":
|
if chunker_type == "paragraph":
|
||||||
tokenizer = LineTokenizer()
|
tokenizer = LineTokenizer()
|
||||||
chars = tokenizer.span_tokenize(original_text)
|
chars = list(tokenizer.span_tokenize(original_text))
|
||||||
for i, chunk in enumerate(tokenizer.tokenize(original_text)):
|
for i, chunk in enumerate(tokenizer.tokenize(original_text)):
|
||||||
|
print(chunk)
|
||||||
e = Entry()
|
e = Entry()
|
||||||
e['nif:isString'] = chunk
|
e['nif:isString'] = chunk
|
||||||
if entry.id:
|
if entry.id:
|
||||||
@ -45,19 +46,19 @@ class SplitPlugin(AnalysisPlugin):
|
|||||||
{
|
{
|
||||||
'entry': {
|
'entry': {
|
||||||
"id": ":test",
|
"id": ":test",
|
||||||
'nif:isString': 'Hello. World.'
|
'nif:isString': 'Hello\nWorld'
|
||||||
},
|
},
|
||||||
'params': {
|
'params': {
|
||||||
'delimiter': 'sentence',
|
'delimiter': 'paragraph',
|
||||||
},
|
},
|
||||||
'expected': [
|
'expected': [
|
||||||
{
|
{
|
||||||
"@id": ":test#char=0,6",
|
"@id": ":test#char=0,5",
|
||||||
'nif:isString': 'Hello.'
|
'nif:isString': 'Hello'
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"@id": ":test#char=7,13",
|
"@id": ":test#char=6,11",
|
||||||
'nif:isString': 'World.'
|
'nif:isString': 'World'
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user