Change name to split, according to issue #37

Added chunker plugin to tokenize texts
Avoid python temporary files in pip tests
2025-11-08 19:08:15 +00:00 · 2017-06-13 19:44:40 +02:00 · 2017-06-13 14:00:40 +02:00 · 2017-06-12 21:50:51 +02:00 · 2017-06-12 21:27:02 +02:00
4 changed files with 51 additions and 5 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -98,7 +98,7 @@ clean :
    - make -e clean
  when: manual

-cleanup_pypirc:
+cleanup_py:
   stage: clean
   when: always   # this is important; run even if preceding stages failed.
   script:
--- a/6
+++ b/6
@@ -76,14 +76,12 @@ test-%:
 test: test-$(PYMAIN)

 dist/$(TARNAME): version
-	docker run --rm -v $$PWD:/usr/src/app/ -w /usr/src/app/ python:$(PYMAIN) python setup.py sdist;
-	docker run --rm -v $$PWD:/usr/src/app/ -w /usr/src/app/ python:$(PYMAIN) chmod -R a+rwx dist;
-
+	python setup.py sdist;

 sdist: dist/$(TARNAME)

 pip_test-%: sdist
-	docker run --rm -v $$PWD/dist:/dist/ -ti python:$* pip install /dist/$(TARNAME);
+	docker run --rm -v $$PWD/dist:/dist/ python:$* pip install /dist/$(TARNAME);

 pip_test: $(addprefix pip_test-,$(PYVERSIONS))

--- a/senpy/plugins/split/split.py
+++ b/senpy/plugins/split/split.py
@@ -0,0 +1,30 @@
+from senpy.plugins import AnalysisPlugin
+from senpy.models import Entry
+from nltk.tokenize.punkt import PunktSentenceTokenizer
+from nltk.tokenize.simple import LineTokenizer
+import nltk
+class SplitPlugin(AnalysisPlugin):
+    """Analysis plugin that chunks an entry's text into sentences or paragraphs."""
+    def activate(self):
+        nltk.download('punkt')  # fetch the NLTK 'punkt' resource on activation
+
+    def analyse_entry(self, entry, params):
+        """Yield one Entry per chunk of ``entry``'s text.
+
+        ``params["delimiter"]`` selects "sentence" (default) or "paragraph";
+        each chunk's id is the original id plus "#char=<start>,<end>".
+        """
+        tokenizers = {"sentence": PunktSentenceTokenizer, "paragraph": LineTokenizer}
+        chunker_type = params.get("delimiter", "sentence")
+        original_id = entry.id
+        original_text = entry.get("text", None)
+        if not original_text or chunker_type not in tokenizers:
+            return  # nothing to split, or a delimiter this plugin does not handle
+        tokenizer = tokenizers[chunker_type]()
+        # span_tokenize may be a generator (not indexable): pair chunks and spans with zip.
+        for chunk, (start, end) in zip(tokenizer.tokenize(original_text),
+                                       tokenizer.span_tokenize(original_text)):
+            e = Entry()
+            e.text = chunk
+            e.id = original_id + "#char={},{}".format(start, end)
+            yield e
--- a/senpy/plugins/split/split.senpy
+++ b/senpy/plugins/split/split.senpy
@@ -0,0 +1,18 @@
+---
+name: split
+module: split
+description: A sample plugin that chunks input text
+author: "@militarpancho"
+version: '0.1'
+url: "https://github.com/gsi-upm/senpy"
+requirements: {nltk}
+extra_params:
+  delimiter:
+    aliases:
+    - type
+    - t
+    required: false
+    default: sentence
+    options:
+    - sentence
+    - paragraph
Author	SHA1	Message	Date
militarpancho	83e2d415a1	Change name to split, according to issue #37	2017-06-13 19:44:40 +02:00
militarpancho	f8ca595bc9	Added chunker plugin to tokenize texts	2017-06-13 14:00:40 +02:00
J. Fernando Sánchez	312e7f7f12	Avoid python temporary files in pip tests	2017-06-12 21:50:51 +02:00
J. Fernando Sánchez	c555b9547e	Non-interactive pip test	2017-06-12 21:27:02 +02:00