# senpy/plugins/misc/split.py (mirror of https://github.com/gsi-upm/senpy)
from senpy.plugins import AnalysisPlugin
from senpy.models import Entry

from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.simple import LineTokenizer
import nltk


class SplitPlugin(AnalysisPlugin):
    '''Split each entry into several entries, either by sentence or by paragraph.'''

    def activate(self):
        # PunktSentenceTokenizer needs the pre-trained 'punkt' models
        nltk.download('punkt')

    def analyse_entry(self, entry, params):
        chunker_type = params.get("delimiter", "sentence")
        original_text = entry['nif:isString']
        if chunker_type == "sentence":
            tokenizer = PunktSentenceTokenizer()
        if chunker_type == "paragraph":
            tokenizer = LineTokenizer()
        # Character offsets of every chunk, used to build NIF-style fragment ids
        chars = list(tokenizer.span_tokenize(original_text))
        for i, chunk in enumerate(tokenizer.tokenize(original_text)):
            print(chunk)
            e = Entry()
            e['nif:isString'] = chunk
            if entry.id:
                e.id = entry.id + "#char={},{}".format(chars[i][0], chars[i][1])
            yield e

    test_cases = [
        {
            'entry': {
                'nif:isString': 'Hello. World.'
            },
            'params': {
                'delimiter': 'sentence',
            },
            'expected': [
                {
                    'nif:isString': 'Hello.'
                },
                {
                    'nif:isString': 'World.'
                }
            ]
        },
        {
            'entry': {
                "id": ":test",
                'nif:isString': 'Hello\nWorld'
            },
            'params': {
                'delimiter': 'paragraph',
            },
            'expected': [
                {
                    "@id": ":test#char=0,5",
                    'nif:isString': 'Hello'
                },
                {
                    "@id": ":test#char=6,11",
                    'nif:isString': 'World'
                }
            ]
        }
    ]
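

# --- Usage sketch (not part of the original file) ---
# Senpy can exercise a plugin's `test_cases` directly; the `easy_test` helper and
# its import path below are assumptions and may differ between senpy versions.
if __name__ == '__main__':
    from senpy.utils import easy_test
    easy_test()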