Mirror of https://github.com/gsi-upm/senpy, synced 2024-11-22 08:12:27 +00:00
Added chunker plugin to tokenize texts
This commit is contained in:
parent 312e7f7f12
commit f8ca595bc9
senpy/plugins/chunker/chunker.py (new file, 30 lines)
@@ -0,0 +1,30 @@
from senpy.plugins import AnalysisPlugin
from senpy.models import Entry

from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.simple import LineTokenizer
import nltk


class ChunkerPlugin(AnalysisPlugin):

    def activate(self):
        # Fetch the NLTK Punkt models used by the sentence tokenizer
        nltk.download('punkt')

    def analyse_entry(self, entry, params):
        chunker_type = params.get("type", "sentence")
        original_id = entry.id
        original_text = entry.get("text", None)
        if chunker_type == "sentence":
            tokenizer = PunktSentenceTokenizer()
            # span_tokenize returns (start, end) character offsets; materialise
            # the generator so it can be indexed alongside the tokens below
            chars = list(tokenizer.span_tokenize(original_text))
            for i, sentence in enumerate(tokenizer.tokenize(original_text)):
                e = Entry()
                e.text = sentence
                # Identify each chunk by its character span in the original entry
                e.id = original_id + "#char={},{}".format(chars[i][0], chars[i][1])
                yield e
        if chunker_type == "paragraph":
            tokenizer = LineTokenizer()
            chars = list(tokenizer.span_tokenize(original_text))
            for i, paragraph in enumerate(tokenizer.tokenize(original_text)):
                e = Entry()
                e.text = paragraph
                e.id = original_id + "#char={},{}".format(chars[i][0], chars[i][1])
                yield e
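For context, a minimal sketch of how the plugin behaves once loaded. Instantiating the class directly with an inline info dict is an assumption for illustration only; a running Senpy server would normally load the plugin through the chunker.senpy descriptor below.

from senpy.models import Entry
from chunker import ChunkerPlugin

# Assumed: direct instantiation with the metadata that chunker.senpy
# would otherwise supply (illustration only).
plugin = ChunkerPlugin(info={'name': 'chunker', 'version': '0.1'})
plugin.activate()  # downloads the NLTK 'punkt' models

entry = Entry(text="First sentence. Second sentence.")
entry.id = "http://example.com/entry1"  # hypothetical entry id

for chunk in plugin.analyse_entry(entry, {"type": "sentence"}):
    # Each chunk carries a character-offset fragment in its id,
    # e.g. http://example.com/entry1#char=0,15
    print(chunk.id, chunk.text)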
senpy/plugins/chunker/chunker.senpy (new file, 18 lines)
@@ -0,0 +1,18 @@
---
name: chunker
module: chunker
description: A sample plugin that chunks input text
author: "@militarpancho"
version: '0.1'
url: "https://github.com/gsi-upm/senpy"
requirements: {nltk}
extra_params:
  type:
    aliases:
      - type
      - t
    required: false
    default: sentence
    options:
      - sentence
      - paragraph
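With this descriptor, the chunking granularity can be chosen per request via the type parameter (alias t), falling back to sentence when omitted. As a rough sketch, such a request might look like the following against a locally running Senpy instance; the host, port and use of Senpy's usual algo/i query parameters are assumptions here, not part of this commit.

import requests

# Hypothetical local Senpy endpoint (host and port are assumptions).
response = requests.get(
    "http://localhost:5000/api/",
    params={
        "algo": "chunker",                # select this plugin
        "i": "One line.\nAnother line.",  # input text to analyse
        "type": "paragraph",              # extra parameter defined above
    },
)
print(response.json())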