Mirror of https://github.com/gsi-upm/senpy (synced 2025-09-17 12:02:21 +00:00)

Compare commits (4 commits)
Author | SHA1 | Date
---|---|---
 | 83e2d415a1 |
 | f8ca595bc9 |
 | 312e7f7f12 |
 | c555b9547e |
```diff
@@ -98,7 +98,7 @@ clean :
   - make -e clean
   when: manual
 
-cleanup_pypirc:
+cleanup_py:
   stage: clean
   when: always # this is important; run even if preceding stages failed.
   script:
```
Makefile (6 lines changed)

```diff
@@ -76,14 +76,12 @@ test-%:
 test: test-$(PYMAIN)
 
 dist/$(TARNAME): version
-	docker run --rm -v $$PWD:/usr/src/app/ -w /usr/src/app/ python:$(PYMAIN) python setup.py sdist;
-	docker run --rm -v $$PWD:/usr/src/app/ -w /usr/src/app/ python:$(PYMAIN) chmod -R a+rwx dist;
+	python setup.py sdist;
 
 sdist: dist/$(TARNAME)
 
 pip_test-%: sdist
-	docker run --rm -v $$PWD/dist:/dist/ -ti python:$* pip install /dist/$(TARNAME);
+	docker run --rm -v $$PWD/dist:/dist/ python:$* pip install /dist/$(TARNAME);
 
 pip_test: $(addprefix pip_test-,$(PYVERSIONS))
```
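The first hunk builds the source distribution with the local interpreter instead of spinning up a `python:$(PYMAIN)` container (which also made the follow-up `chmod` unnecessary); the second drops docker's `-ti` flags, which request an interactive TTY that CI runners do not provide. As a rough sketch of the check `pip_test-%` performs, with the docker layer stripped away; the tarball glob and the `import senpy` probe are illustrative assumptions, not part of the Makefile:

```python
# Hypothetical stand-in for `make pip_test-X`: install the freshly built
# sdist and verify that the package imports, using the local interpreter
# instead of a python:X docker image.
import glob
import subprocess
import sys

tarball = sorted(glob.glob("dist/senpy-*.tar.gz"))[-1]  # stands in for $(TARNAME)
subprocess.check_call([sys.executable, "-m", "pip", "install", tarball])
subprocess.check_call([sys.executable, "-c", "import senpy"])
```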
senpy/plugins/split/split.py (new file, 30 lines)

```python
from senpy.plugins import AnalysisPlugin
from senpy.models import Entry

from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.simple import LineTokenizer
import nltk


class SplitPlugin(AnalysisPlugin):

    def activate(self):
        # Fetch the pretrained Punkt sentence model when the plugin loads.
        nltk.download('punkt')

    def analyse_entry(self, entry, params):
        chunker_type = params.get("delimiter", "sentence")
        original_id = entry.id
        original_text = entry.get("text", None)
        if chunker_type == "paragraph":
            tokenizer = LineTokenizer()
        else:  # default: split into sentences
            tokenizer = PunktSentenceTokenizer()
        # span_tokenize returns a lazy generator; materialize it so the
        # character offsets can be indexed in step with tokenize() below.
        chars = list(tokenizer.span_tokenize(original_text))
        for i, chunk in enumerate(tokenizer.tokenize(original_text)):
            e = Entry()
            e.text = chunk
            # Identify each chunk by its character offsets in the original.
            e.id = original_id + "#char={},{}".format(chars[i][0], chars[i][1])
            yield e
```
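One detail worth noting in the plugin above: NLTK's `span_tokenize` returns a lazy generator of `(start, end)` offsets, so it has to be materialized before it can be indexed alongside the output of `tokenize`. A minimal standalone check, assuming only `nltk` is installed (the sample text and printed format are illustrative):

```python
from nltk.tokenize.punkt import PunktSentenceTokenizer

text = "Senpy splits text. Each chunk becomes a new entry."
tokenizer = PunktSentenceTokenizer()

# span_tokenize yields (start, end) offsets lazily; list() makes them indexable.
spans = list(tokenizer.span_tokenize(text))
for sentence, (start, end) in zip(tokenizer.tokenize(text), spans):
    print("#char={},{} -> {!r}".format(start, end, sentence))
# Expected output:
# #char=0,18 -> 'Senpy splits text.'
# #char=19,50 -> 'Each chunk becomes a new entry.'
```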
senpy/plugins/split/split.senpy (new file, 18 lines)

```yaml
---
name: split
module: split
description: A sample plugin that chunks input text
author: "@militarpancho"
version: '0.1'
url: "https://github.com/gsi-upm/senpy"
requirements: {nltk}
extra_params:
  delimiter:
    aliases:
      - type
      - t
    required: false
    default: sentence
    options:
      - sentence
      - paragraph
```
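With both files in place, the plugin can be exercised through senpy's HTTP API. A hedged usage sketch, assuming a server is running locally on the default port (for instance via `senpy -f .` from the plugin's directory) and using the `delimiter` parameter declared above; the exact response layout follows senpy's JSON-LD models:

```python
import requests

# Ask the split plugin to chunk a short text by sentence. "delimiter"
# (aliases "type"/"t") is the extra_param defined in split.senpy.
resp = requests.get(
    "http://localhost:5000/api/",
    params={
        "algo": "split",
        "i": "First sentence. Second sentence.",
        "delimiter": "sentence",
    },
)
resp.raise_for_status()

# One entry per chunk, each carrying a "#char=start,end" fragment id.
for entry in resp.json().get("entries", []):
    print(entry)
```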