mirror of
https://github.com/gsi-upm/senpy
synced 2025-09-18 12:32:21 +00:00
WIP simpler pipeline
@@ -76,7 +76,7 @@ class AnalysisPlugin(Plugin):
         Note that this method may yield an annotated entry or a list of
         entries (e.g. in a tokenizer)
         """
-        text = entry['text']
+        text = entry['nif:isString']
         params = copy.copy(parameters)
         params['input'] = text
         results = self.analyse(**params)
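This hunk is the compatibility shim in AnalysisPlugin: it now reads the input text from the entry's nif:isString field instead of a bare text key, feeds it to the legacy analyse(**params) API, and may yield one entry or several (as a tokenizer would). Below is a minimal sketch of how a generator-based plugin of this kind gets consumed; DummyPlugin, run_pipeline and the plain-dict entries are illustrative names only, not part of this commit:

# Illustrative sketch (assumed names, not from this commit): a plugin whose
# analyse_entry yields one or more entries, and a caller that drains the generator.

class DummyPlugin:
    def analyse_entry(self, entry, params):
        text = entry['nif:isString']        # read the input the same way as the hunk above
        for sentence in text.split('. '):   # naive chunking, stand-in for self.analyse(**params)
            yield {'nif:isString': sentence}


def run_pipeline(plugin, entries, params):
    """Collect every entry yielded by the plugin, whether one or many per input."""
    results = []
    for entry in entries:
        results.extend(plugin.analyse_entry(entry, params))
    return results


print(run_pipeline(DummyPlugin(), [{'nif:isString': 'Hello. World'}], {}))
# [{'nif:isString': 'Hello'}, {'nif:isString': 'World'}]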
senpy/plugins/misc/split.py (new file, 64 lines)
@@ -0,0 +1,64 @@
+from senpy.plugins import AnalysisPlugin
+from senpy.models import Entry
+from nltk.tokenize.punkt import PunktSentenceTokenizer
+from nltk.tokenize.simple import LineTokenizer
+import nltk
+
+
+class SplitPlugin(AnalysisPlugin):
+
+    def activate(self):
+        nltk.download('punkt')
+
+    def analyse_entry(self, entry, params):
+        chunker_type = params.get("delimiter", "sentence")
+        original_text = entry.get('nif:isString', None)
+        if chunker_type == "sentence":
+            tokenizer = PunktSentenceTokenizer()
+        if chunker_type == "paragraph":
+            tokenizer = LineTokenizer()
+        chars = tokenizer.span_tokenize(original_text)
+        for i, chunk in enumerate(tokenizer.tokenize(original_text)):
+            e = Entry()
+            e['nif:isString'] = chunk
+            if entry.id:
+                e.id = entry.id + "#char={},{}".format(chars[i][0], chars[i][1])
+            yield e
+
+    test_cases = [
+        {
+            'entry': {
+                'nif:isString': 'Hello. World.'
+            },
+            'params': {
+                'delimiter': 'sentence',
+            },
+            'expected': [
+                {
+                    'nif:isString': 'Hello.'
+                },
+                {
+                    'nif:isString': 'World.'
+                }
+            ]
+        },
+        {
+            'entry': {
+                "id": ":test",
+                'nif:isString': 'Hello. World.'
+            },
+            'params': {
+                'delimiter': 'sentence',
+            },
+            'expected': [
+                {
+                    "@id": ":test#char=0,6",
+                    'nif:isString': 'Hello.'
+                },
+                {
+                    "@id": ":test#char=7,13",
+                    'nif:isString': 'World.'
+                }
+            ]
+        }
+    ]
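SplitPlugin yields one new Entry per chunk, splitting on sentences (Punkt) or on lines, and when the parent entry has an id it appends a #char=start,end fragment with the chunk's character offsets. The snippet below reproduces that offset bookkeeping with nltk alone; note that span_tokenize returns a lazy generator in current NLTK releases, so it has to be materialized (e.g. with list(...)) before it can be indexed the way chars[i] is above. Everything besides the nltk calls is illustrative:

from nltk.tokenize.punkt import PunktSentenceTokenizer

# Illustrative: reproduce SplitPlugin's chunk + offset bookkeeping outside senpy.
text = 'Hello. World.'
tokenizer = PunktSentenceTokenizer()

spans = list(tokenizer.span_tokenize(text))   # materialize the generator before indexing
chunks = tokenizer.tokenize(text)

for (start, end), chunk in zip(spans, chunks):
    print('#char={},{}'.format(start, end), repr(chunk))
# #char=0,6 'Hello.'
# #char=7,13 'World.'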
senpy/plugins/misc/split.senpy (new file, 19 lines)
@@ -0,0 +1,19 @@
+---
+name: split
+module: senpy.plugins.misc.split
+description: A sample plugin that chunks input text
+author: "@militarpancho"
+version: '0.2'
+url: "https://github.com/gsi-upm/senpy"
+requirements:
+    - nltk
+extra_params:
+    delimiter:
+        aliases:
+            - type
+            - t
+        required: false
+        default: sentence
+        options:
+            - sentence
+            - paragraph
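The extra_params block is what exposes the delimiter option (aliases type and t, default sentence) that analyse_entry reads with params.get("delimiter", ...). Against a running senpy server it would travel as an ordinary query parameter; the endpoint URL below is an assumption for illustration, not something defined in this commit:

import requests

# Assumed local senpy instance; 'algo', 'i' and the declared 'delimiter'
# extra_param are passed as plain query parameters.
resp = requests.get('http://localhost:5000/api/', params={
    'algo': 'split',
    'i': 'Hello. World.',
    'delimiter': 'paragraph',   # or via its aliases 'type' / 't'
})
print(resp.json())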
@@ -12,7 +12,7 @@ class Sentiment140Plugin(SentimentPlugin):
             json.dumps({
                 "language": lang,
                 "data": [{
-                    "text": entry.nif__isString
+                    "text": entry['nif:isString']
                 }]
             }))
         p = params.get("prefix", None)
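This hunk moves Sentiment140 from attribute access (entry.nif__isString, where the double underscore stands in for the ':' of the JSON-LD key) to plain dict-style access, matching the rest of the commit. A toy class showing that naming convention in isolation (not senpy's actual model implementation):

class ToyEntry(dict):
    """Toy stand-in: attribute names with '__' map to ':'-separated dict keys."""
    def __getattr__(self, name):
        try:
            return self[name.replace('__', ':', 1)]
        except KeyError:
            raise AttributeError(name)


e = ToyEntry({'nif:isString': 'I love Titanic'})
assert e['nif:isString'] == e.nif__isString   # both spellings reach the same value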