1
0
mirror of https://github.com/gsi-upm/senpy synced 2024-11-22 08:12:27 +00:00

Added chunker plugin to tokenize texts

This commit is contained in:
militarpancho 2017-06-13 14:00:40 +02:00
parent 312e7f7f12
commit f8ca595bc9
2 changed files with 48 additions and 0 deletions

View File

@ -0,0 +1,30 @@
from senpy.plugins import AnalysisPlugin
from senpy.models import Entry
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.simple import LineTokenizer
import nltk
class ChunkerPlugin(AnalysisPlugin):
def activate(self):
nltk.download('punkt')
def analyse_entry(self, entry, params):
chunker_type = params.get("type", "sentence")
original_id = entry.id
original_text = entry.get("text", None)
if chunker_type == "sentence":
tokenizer = PunktSentenceTokenizer()
chars = tokenizer.span_tokenize(original_text)
for i, sentence in enumerate(tokenizer.tokenize(original_text)):
e = Entry()
e.text = sentence
e.id = original_id + "#char={},{}".format(chars[i][0], chars[i][1])
yield e
if chunker_type == "paragraph":
tokenizer = LineTokenizer()
chars = tokenizer.span_tokenize(original_text)
for i, paragraph in enumerate(tokenizer.tokenize(original_text)):
e = Entry()
e.text = paragraph
chars = [char for char in chars]
e.id = original_id + "#char={},{}".format(chars[i][0], chars[i][1])
yield e

View File

@ -0,0 +1,18 @@
---
name: chunker
module: chunker
description: A sample plugin that chunks input text
author: "@militarpancho"
version: '0.1'
url: "https://github.com/gsi-upm/senpy"
requirements: {nltk}
extra_params:
type:
aliases:
- type
- t
required: false
default: sentence
options:
- sentence
- paragraph