diff --git a/senpy/plugins/chunker/chunker.py b/senpy/plugins/chunker/chunker.py
new file mode 100644
index 0000000..375498b
--- /dev/null
+++ b/senpy/plugins/chunker/chunker.py
@@ -0,0 +1,49 @@
+from senpy.plugins import AnalysisPlugin
+from senpy.models import Entry
+from nltk.tokenize.punkt import PunktSentenceTokenizer
+from nltk.tokenize.simple import LineTokenizer
+import nltk
+
+
+class ChunkerPlugin(AnalysisPlugin):
+    """Split the text of an entry into sentence or paragraph entries."""
+
+    def activate(self):
+        """Download the NLTK 'punkt' resource when the plugin is activated."""
+        nltk.download('punkt')
+
+    def analyse_entry(self, entry, params):
+        """Yield one Entry per chunk of the text of ``entry``.
+
+        ``params["type"]`` selects the tokenizer: ``"sentence"`` (default)
+        uses PunktSentenceTokenizer, ``"paragraph"`` uses LineTokenizer.
+        Any other value yields nothing.
+
+        Each yielded entry carries the chunk text and an id made of the
+        original id plus a ``#char=start,end`` suffix holding the chunk's
+        character offsets in the original text.
+        """
+        chunker_type = params.get("type", "sentence")
+        if chunker_type == "sentence":
+            tokenizer = PunktSentenceTokenizer()
+        elif chunker_type == "paragraph":
+            tokenizer = LineTokenizer()
+        else:
+            return
+        original_id = entry.id
+        original_text = entry.get("text", None)
+        if not original_text:
+            # Missing or empty text: there is nothing to split.
+            return
+        # span_tokenize may return a one-shot iterator, so it is consumed
+        # exactly once and never indexed. Slicing the text with each span
+        # keeps the chunk and the offsets in its id in agreement.
+        for start, end in tokenizer.span_tokenize(original_text):
+            chunk = original_text[start:end]
+            if not chunk.strip():
+                # Skip empty or whitespace-only spans: no blank chunks.
+                continue
+            e = Entry()
+            e.text = chunk
+            e.id = original_id + "#char={},{}".format(start, end)
+            yield e
diff --git a/senpy/plugins/chunker/chunker.senpy b/senpy/plugins/chunker/chunker.senpy
new file mode 100644
index 0000000..b2bac33
--- /dev/null
+++ b/senpy/plugins/chunker/chunker.senpy
@@ -0,0 +1,18 @@
+---
+name: chunker
+module: chunker
+description: A sample plugin that chunks input text
+author: "@militarpancho"
+version: '0.1'
+url: "https://github.com/gsi-upm/senpy"
+requirements: {nltk}
+extra_params:
+    type:
+        aliases:
+            - type
+            - t
+        required: false
+        default: sentence
+        options:
+            - sentence
+            - paragraph