Change name to split, according to issue #37

Added chunker plugin to tokenize texts
Avoid python temporary files in pip tests
2025-11-09 11:08:15 +00:00 · 2017-06-13 19:44:40 +02:00 · 2017-06-13 14:00:40 +02:00 · 2017-06-12 21:50:51 +02:00 · 2017-06-12 21:27:02 +02:00 · 2017-06-12 21:20:07 +02:00
7 changed files with 58 additions and 8 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -72,7 +72,7 @@ deploy_pypi:
    - make pip_upload
    - echo "" > ~/.pypirc && rm ~/.pypirc  # If the above fails, this won't run.
  only:
-    - /^v\d+\.\d+\.\d+([abc]\d*)?$/  # PEP-440 compliant version (tags)
+    - /^v?\d+\.\d+\.\d+([abc]\d*)?$/  # PEP-440 compliant version (tags)
  except:
    - branches
@@ -98,7 +98,7 @@ clean :
    - make -e clean
  when: manual
-cleanup_pypirc:
+cleanup_py:
   stage: clean
   when: always   # this is important; run even if preceding stages failed.
   script:
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,7 +7,6 @@ language: python
 env:
  - PYV=2.7
  - PYV=3.4
  - PYV=3.5
 # run nosetests - Tests
 script: make test-$PYV
--- a/Makefile
+++ b/Makefile
@@ -76,14 +76,12 @@ test-%:
 test: test-$(PYMAIN)
 dist/$(TARNAME): version
-	docker run --rm -ti -v $$PWD:/usr/src/app/ -w /usr/src/app/ python:$(PYMAIN) python setup.py sdist;
+	python setup.py sdist;
 	docker run --rm -ti -v $$PWD:/usr/src/app/ -w /usr/src/app/ python:$(PYMAIN) chmod -R a+rwx dist;
 sdist: dist/$(TARNAME)
 pip_test-%: sdist
-	docker run --rm -v $$PWD/dist:/dist/ -ti python:$* pip install /dist/$(TARNAME);
+	docker run --rm -v $$PWD/dist:/dist/ python:$* pip install /dist/$(TARNAME);
 pip_test: $(addprefix pip_test-,$(PYVERSIONS))
@@ -133,6 +131,7 @@ push-github:
 	@echo "$$GITHUB_DEPLOY_KEY" > $(KEY_FILE)
 	@git remote rm github-deploy || true
 	git remote add github-deploy $(GITHUB_REPO)
 	@GIT_SSH_COMMAND="ssh -i $(KEY_FILE)" git fetch github-deploy $(CI_COMMIT_REF_NAME) || true
 	@GIT_SSH_COMMAND="ssh -i $(KEY_FILE)" git push github-deploy $(CI_COMMIT_REF_NAME)
 	rm $(KEY_FILE)
--- a/k8s/senpy-deployment.yaml
+++ b/k8s/senpy-deployment.yaml
@@ -14,6 +14,7 @@ spec:
      containers:
      - name: senpy-latest
        image: gsiupm/senpy:latest
        imagePullPolicy: Always
        args:
          - "--default-plugins"
        resources:
--- a/senpy/plugins/split/split.py
+++ b/senpy/plugins/split/split.py
@@ -0,0 +1,30 @@
 from senpy.plugins import AnalysisPlugin
 from senpy.models import Entry
 from nltk.tokenize.punkt import PunktSentenceTokenizer
 from nltk.tokenize.simple import LineTokenizer
 import nltk
 class SplitPlugin(AnalysisPlugin):
    """Analysis plugin that splits the text of an entry into smaller entries.

    The ``delimiter`` parameter selects the chunking strategy: ``sentence``
    uses ``PunktSentenceTokenizer`` and ``paragraph`` uses ``LineTokenizer``.
    """

    def activate(self):
        """Download the NLTK ``punkt`` resource when the plugin is activated."""
        nltk.download('punkt')

    def analyse_entry(self, entry, params):
        """Yield a new ``Entry`` for every chunk found in ``entry``'s text.

        :param entry: entry to split; its ``id`` and ``text`` are read.
        :param params: request parameters; ``delimiter`` may be ``sentence``
            (the default) or ``paragraph``.
        :return: generator of ``Entry`` objects whose ``text`` is the chunk
            and whose ``id`` is the original id followed by
            ``#char=<start>,<end>``, the span reported by the tokenizer.

        Nothing is yielded when the entry has no text or when the delimiter
        is not one of the supported values.
        """
        chunker_type = params.get("delimiter", "sentence")
        original_id = entry.id
        original_text = entry.get("text", None)
        if original_text is None:
            # The tokenizers cannot process None; there is nothing to split.
            return
        if chunker_type == "sentence":
            tokenizer = PunktSentenceTokenizer()
        elif chunker_type == "paragraph":
            tokenizer = LineTokenizer()
        else:
            # Unsupported delimiter: produce no chunks.
            return
        # span_tokenize() may return a generator rather than a list, so the
        # spans are paired with the chunks via zip() instead of being indexed.
        chunks = tokenizer.tokenize(original_text)
        spans = tokenizer.span_tokenize(original_text)
        for chunk, (start, end) in zip(chunks, spans):
            e = Entry()
            e.text = chunk
            e.id = original_id + "#char={},{}".format(start, end)
            yield e
--- a/senpy/plugins/split/split.senpy
+++ b/senpy/plugins/split/split.senpy
@@ -0,0 +1,18 @@
 ---
 # Descriptor for the "split" plugin, which chunks the input text.
 name: split
 # presumably the Python module that holds the plugin code (split.py) - confirm
 module: split
 description: A sample plugin that chunks input text
 author: "@militarpancho"
 version: '0.1'
 url: "https://github.com/gsi-upm/senpy"
 # NOTE(review): `{nltk}` is a YAML flow mapping (key `nltk` with a null
 # value), not a list - confirm this is the form the plugin loader expects.
 requirements: {nltk}
 # Extra request parameters accepted by the plugin.
 extra_params:
  # Chunking strategy; split.py reads it as params.get("delimiter", "sentence").
  delimiter:
    # Alternative names for the parameter.
    aliases:
    - type
    - t
    required: false
    default: sentence
    # The two values split.py handles.
    options:
    - sentence
    - paragraph
--- a/setup.cfg
+++ b/setup.cfg
@@ -12,3 +12,6 @@ max-line-length = 100
 universal=1
 [tool:pytest]
 addopts = --cov=senpy --cov-report term-missing
 [coverage:report]
 omit = senpy/__main__.py
Author	SHA1	Message	Date
militarpancho	83e2d415a1	Change name to split, according to issue #37	2017-06-13 19:44:40 +02:00
militarpancho	f8ca595bc9	Added chunker plugin to tokenize texts	2017-06-13 14:00:40 +02:00
J. Fernando Sánchez	312e7f7f12	Avoid python temporary files in pip tests	2017-06-12 21:50:51 +02:00
J. Fernando Sánchez	c555b9547e	Non-interactive pip test	2017-06-12 21:27:02 +02:00
J. Fernando Sánchez	991ade8f4d	Make sdist non-interactive non-tty	2017-06-12 21:20:07 +02:00
J. Fernando Sánchez	1104e816cb	Push pip for tags without a preceding v	2017-06-12 21:06:34 +02:00
J. Fernando Sánchez	c19d03b41d	Added SSH access to github fetch	2017-06-12 20:47:46 +02:00
J. Fernando Sánchez	42c9068991	Add pull policy to k8s deployment * Add git fetch to (try to) fix github push from gitlab	2017-06-12 20:43:39 +02:00
J. Fernando Sánchez	96843827bd	Removed __main__ from test coverage reports	2017-06-12 20:29:29 +02:00
J. Fernando Sánchez	d76e4618fe	Removed python 3.4 from travis versions	2017-06-12 20:18:56 +02:00