2019-07-17 14:29:30 +00:00
|
|
|
#
|
|
|
|
# Copyright 2014 Grupo de Sistemas Inteligentes (GSI) DIT, UPM
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
#
|
|
|
|
|
2018-01-07 22:01:07 +00:00
|
|
|
from sklearn.pipeline import Pipeline
|
|
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
|
|
from mydata import text, labels
|
|
|
|
|
|
|
|
# Hold out 12% of the samples for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    text,
    labels,
    test_size=0.12,
    random_state=42,
)
|
|
|
|
|
|
|
|
from sklearn.naive_bayes import MultinomialNB
|
|
|
|
|
|
|
|
|
|
|
|
# Bag-of-words features built from whitespace-separated tokens.
# * `str.split` tokenizes exactly like the former `lambda x: x.split()`, but
#   it is a picklable built-in, so the fitted pipeline can be serialized.
# * `token_pattern=None` states explicitly that the default regex is unused
#   when a custom tokenizer is supplied, which avoids scikit-learn's
#   "token_pattern will not be used" UserWarning at fit time.
count_vec = CountVectorizer(tokenizer=str.split, token_pattern=None)

# Multinomial naive Bayes classifier fed with the token counts.
clf3 = MultinomialNB()

# Vectorizer followed by classifier; step names are 'cv' and 'clf'.
pipeline = Pipeline([
    ('cv', count_vec),
    ('clf', clf3),
])
|
|
|
|
|
|
|
|
# Train the vectorizer and the classifier on the training split. This runs
# at import time, so importing the module yields a fitted `pipeline`.
pipeline.fit(X_train, y_train)

# Report what was learned: the vocabulary seen by the vectorizer and the
# per-class counts recorded by the naive Bayes classifier.
_feature_names = count_vec.get_feature_names_out()
print('Feature names: {}'.format(_feature_names))

_class_count = clf3.class_count_
print('Class count: {}'.format(_class_count))
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: print the prediction next to the expected label
    # for one sentence per polarity.
    print('--Results--')

    samples = (
        ('The sentiment for senpy should be positive :)', 1),
        ('The sentiment for anything else should be negative :()', -1),
    )
    report = 'Input: {}\nExpected: {}\nGot: {}'

    for sentence, polarity in samples:
        # predict() takes a collection of documents, so wrap the sentence.
        prediction = pipeline.predict([sentence])
        print(report.format(sentence, polarity, prediction))