1
0
mirror of https://github.com/gsi-upm/senpy synced 2025-10-22 19:28:23 +00:00

Compare commits

...

10 Commits

Author SHA1 Message Date
militarpancho
83e2d415a1 Change name to split, according to issue #37 2017-06-13 19:44:40 +02:00
militarpancho
f8ca595bc9 Added chunker plugin to tokenize texts 2017-06-13 14:00:40 +02:00
J. Fernando Sánchez
312e7f7f12 Avoid python temporary files in pip tests 2017-06-12 21:50:51 +02:00
J. Fernando Sánchez
c555b9547e Non-interactive pip test 2017-06-12 21:27:02 +02:00
J. Fernando Sánchez
991ade8f4d Make sdist non-interactive non-tty 2017-06-12 21:20:07 +02:00
J. Fernando Sánchez
1104e816cb Push pip for tags without a preceding v 2017-06-12 21:06:34 +02:00
J. Fernando Sánchez
c19d03b41d Added SSH access to github fetch 2017-06-12 20:47:46 +02:00
J. Fernando Sánchez
42c9068991 Add pull policy to k8s deployment
* Add git fetch to (try to) fix github push from gitlab
2017-06-12 20:43:39 +02:00
J. Fernando Sánchez
96843827bd Removed __main__ from test coverage reports 2017-06-12 20:29:29 +02:00
J. Fernando Sánchez
d76e4618fe Removed python 3.4 from travis versions 2017-06-12 20:18:56 +02:00
7 changed files with 58 additions and 8 deletions

View File

@@ -72,7 +72,7 @@ deploy_pypi:
- make pip_upload
- echo "" > ~/.pypirc && rm ~/.pypirc # If the above fails, this won't run.
only:
- /^v\d+\.\d+\.\d+([abc]\d*)?$/ # PEP-440 compliant version (tags)
- /^v?\d+\.\d+\.\d+([abc]\d*)?$/ # PEP-440 compliant version (tags)
except:
- branches
@@ -98,7 +98,7 @@ clean :
- make -e clean
when: manual
cleanup_pypirc:
cleanup_py:
stage: clean
when: always # this is important; run even if preceding stages failed.
script:

View File

@@ -7,7 +7,6 @@ language: python
env:
- PYV=2.7
- PYV=3.4
- PYV=3.5
# run nosetests - Tests
script: make test-$PYV

View File

@@ -76,14 +76,12 @@ test-%:
test: test-$(PYMAIN)
dist/$(TARNAME): version
docker run --rm -ti -v $$PWD:/usr/src/app/ -w /usr/src/app/ python:$(PYMAIN) python setup.py sdist;
docker run --rm -ti -v $$PWD:/usr/src/app/ -w /usr/src/app/ python:$(PYMAIN) chmod -R a+rwx dist;
python setup.py sdist;
sdist: dist/$(TARNAME)
pip_test-%: sdist
docker run --rm -v $$PWD/dist:/dist/ -ti python:$* pip install /dist/$(TARNAME);
docker run --rm -v $$PWD/dist:/dist/ python:$* pip install /dist/$(TARNAME);
pip_test: $(addprefix pip_test-,$(PYVERSIONS))
@@ -133,6 +131,7 @@ push-github:
@echo "$$GITHUB_DEPLOY_KEY" > $(KEY_FILE)
@git remote rm github-deploy || true
git remote add github-deploy $(GITHUB_REPO)
@GIT_SSH_COMMAND="ssh -i $(KEY_FILE)" git fetch github-deploy $(CI_COMMIT_REF_NAME) || true
@GIT_SSH_COMMAND="ssh -i $(KEY_FILE)" git push github-deploy $(CI_COMMIT_REF_NAME)
rm $(KEY_FILE)

View File

@@ -14,6 +14,7 @@ spec:
containers:
- name: senpy-latest
image: gsiupm/senpy:latest
imagePullPolicy: Always
args:
- "--default-plugins"
resources:

View File

@@ -0,0 +1,30 @@
from senpy.plugins import AnalysisPlugin
from senpy.models import Entry
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.simple import LineTokenizer
import nltk
class SplitPlugin(AnalysisPlugin):
    """Analysis plugin that splits the text of an entry into smaller entries.

    The ``delimiter`` parameter selects the chunking strategy:

    * ``"sentence"`` (default): NLTK's ``PunktSentenceTokenizer``.
    * ``"paragraph"``: NLTK's ``LineTokenizer``.

    Every chunk is yielded as a new ``Entry`` whose id is the id of the
    original entry followed by ``#char=<start>,<end>``, the character span
    of the chunk inside the original text.
    """

    def activate(self):
        """Download the NLTK ``punkt`` resource when the plugin is activated."""
        nltk.download('punkt')

    def analyse_entry(self, entry, params):
        """Yield one ``Entry`` per chunk found in the text of ``entry``.

        :param entry: entry to split; its ``id`` and ``text`` are read.
        :param params: request parameters; ``delimiter`` chooses between
            ``"sentence"`` (default) and ``"paragraph"``.
        :returns: generator of ``Entry`` objects. Nothing is yielded for any
            other ``delimiter`` value.
        """
        chunker_type = params.get("delimiter", "sentence")
        original_id = entry.id
        original_text = entry.get("text", None)

        if chunker_type == "sentence":
            tokenizer = PunktSentenceTokenizer()
        elif chunker_type == "paragraph":
            tokenizer = LineTokenizer()
        else:
            # Unknown delimiter: produce no chunks, as before.
            return

        # Iterate over the spans directly instead of indexing them:
        # ``span_tokenize`` may return a generator (not subscriptable), and
        # slicing the text from each span guarantees that the chunk text and
        # the ``#char=`` offsets in its id always refer to the same range.
        for start, end in tokenizer.span_tokenize(original_text):
            chunk = original_text[start:end]
            if not chunk.strip():
                # Span tokenizers can emit empty/blank spans (e.g. around a
                # trailing newline) that ``tokenize()`` would have discarded.
                continue
            e = Entry()
            e.text = chunk
            e.id = original_id + "#char={},{}".format(start, end)
            yield e

View File

@@ -0,0 +1,18 @@
---
# Senpy plugin descriptor for the "split" plugin, which chunks input text.
name: split
module: split
description: A sample plugin that chunks input text
author: "@militarpancho"
version: '0.1'
url: "https://github.com/gsi-upm/senpy"
# NOTE(review): `{nltk}` is YAML flow-mapping syntax (key `nltk` with a null
# value), not a list -- confirm this is the form the plugin loader expects.
requirements: {nltk}
# Extra request parameters accepted by the plugin.
extra_params:
# `delimiter` picks the chunking strategy; the plugin code branches on the
# values "sentence" and "paragraph".
delimiter:
# Alternative names for the parameter.
aliases:
- type
- t
required: false
# Value used when the parameter is not supplied.
default: sentence
# Values listed for this parameter.
options:
- sentence
- paragraph

View File

@@ -11,4 +11,7 @@ max-line-length = 100
[bdist_wheel]
universal=1
[tool:pytest]
addopts = --cov=senpy --cov-report term-missing
addopts = --cov=senpy --cov-report term-missing
[coverage:report]
omit = senpy/__main__.py