From 380340d66d363cf5d6f1da1e202b3d5c938e8590 Mon Sep 17 00:00:00 2001 From: cif Date: Sun, 23 Apr 2023 16:41:53 +0200 Subject: [PATCH] Updated 4_4 to use get_feature_names_out() instead of get_feature_names --- nlp/4_3_Vector_Representation.ipynb | 123 ++++++++++++++++++++++------ 1 file changed, 100 insertions(+), 23 deletions(-) diff --git a/nlp/4_3_Vector_Representation.ipynb b/nlp/4_3_Vector_Representation.ipynb index c7261ff..e8a3e60 100644 --- a/nlp/4_3_Vector_Representation.ipynb +++ b/nlp/4_3_Vector_Representation.ipynb @@ -105,9 +105,23 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
CountVectorizer(max_features=5000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "CountVectorizer(max_features=5000)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", @@ -128,9 +142,21 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<3x10 sparse matrix of type ''\n", + "\twith 15 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "vectors = vectorizer.fit_transform(documents)\n", "vectors" @@ -146,12 +172,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0 1 1 2 0 0 1 2 0 0]\n", + " [1 0 0 0 2 0 0 1 2 1]\n", + " [1 0 0 0 2 1 0 0 1 1]]\n", + "['and' 'but' 'coming' 'is' 'like' 'sandwiches' 'short' 'summer' 'the'\n", + " 'winter']\n" + ] + } + ], "source": [ "print(vectors.toarray())\n", - "print(vectorizer.get_feature_names())" + "print(vectorizer.get_feature_names_out())" ] }, { @@ -164,13 +202,25 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['and', 'but', 'coming', 'i', 'is', 'like', 'sandwiches', 'short',\n", + " 'summer', 'the', 'winter'], dtype=object)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "vectorizer = CountVectorizer(analyzer=\"word\", stop_words=None, token_pattern='(?u)\\\\b\\\\w+\\\\b') \n", "vectors = vectorizer.fit_transform(documents)\n", - "vectorizer.get_feature_names()" + "vectorizer.get_feature_names_out()" ] }, { @@ -182,20 +232,47 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/cif/anaconda3/lib/python3.10/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n", + " warnings.warn(msg, category=FutureWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', token_pattern='(?u)\\\\b\\\\w+\\\\b') \n", "vectors = vectorizer.fit_transform(documents)\n", - "vectorizer.get_feature_names()" + "vectorizer.get_feature_names_out()" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "frozenset({'or', 'be', 'least', 'ours', 'very', 'noone', 'more', 'can', 'front', 'last', 'co', 'where', 'beyond', 'you', 'was', 'to', 'nine', 'here', 'describe', 'than', 'rather', 'therefore', 'except', 'at', 'again', 'ourselves', 'most', 'anyway', 'thick', 'whither', 'thereupon', 'someone', 'hereupon', 'besides', 'among', 'hasnt', 'across', 'namely', 'because', 'is', 'out', 'same', 'yourself', 'somehow', 'sincere', 'con', 'hereby', 'towards', 'interest', 'much', 'up', 'why', 'myself', 'all', 'nobody', 'though', 'every', 'show', 'not', 'there', 'whether', 'still', 'name', 'when', 'the', 'each', 'six', 'nor', 'and', 'under', 'thereby', 'less', 'either', 'thence', 'into', 'seemed', 'something', 'four', 'sometimes', 'himself', 'those', 'nowhere', 'almost', 'are', 'empty', 'must', 'while', 'afterwards', 'perhaps', 'from', 'detail', 'through', 'any', 'have', 'may', 'he', 'anywhere', 'alone', 'without', 'beforehand', 'had', 'too', 'yourselves', 'our', 'see', 'how', 'please', 'what', 'am', 'do', 'it', 'serious', 'yet', 'down', 'top', 'amount', 'then', 'both', 'fire', 'been', 'wherein', 'done', 'etc', 'whose', 'whereafter', 'who', 'ltd', 'meanwhile', 'further', 'few', 'first', 'behind', 'made', 'yours', 'until', 'toward', 'amoungst', 'anyhow', 'we', 'with', 'give', 'go', 'no', 'back', 'else', 'becomes', 'your', 'fill', 'together', 'another', 'throughout', 'onto', 'de', 'me', 'ten', 'system', 'became', 'per', 'therein', 'everyone', 'often', 'ie', 'put', 'hers', 'herself', 'nevertheless', 'itself', 'eg', 'herein', 'his', 'this', 'cry', 'due', 'bill', 'one', 'on', 'being', 'themselves', 'of', 'some', 'their', 'neither', 'elsewhere', 'since', 'whole', 'eight', 'i', 'a', 'whoever', 'own', 'call', 'them', 'mostly', 'she', 'my', 'cannot', 'us', 'never', 'as', 'thin', 'upon', 'cant', 'un', 'before', 'her', 'otherwise', 'full', 'these', 'next', 'they', 'side', 'somewhere', 'fifty', 'hence', 'so', 'along', 'already', 'three', 'latter', 'anything', 'whom', 'could', 'indeed', 'nothing', 'whereby', 'which', 'sometime', 'become', 'ever', 'amongst', 'by', 'in', 'five', 'after', 'mine', 'fifteen', 'wherever', 'found', 'thereafter', 'third', 'keep', 'anyone', 'will', 'bottom', 'off', 'seem', 'none', 'an', 'whatever', 'over', 'during', 'also', 'latterly', 'via', 'take', 'former', 'above', 'now', 'becoming', 'hereafter', 'such', 'two', 'only', 'about', 'sixty', 're', 'everything', 'others', 'hundred', 'twelve', 'thus', 'even', 'well', 'always', 'once', 'beside', 'get', 'mill', 'seems', 'if', 'whereupon', 'find', 'forty', 'inc', 'whenever', 'around', 'other', 'should', 'many', 'enough', 'however', 'move', 'against', 'several', 'everywhere', 'has', 'whereas', 'that', 'whence', 'eleven', 'its', 'within', 'twenty', 'part', 'although', 'thru', 'couldnt', 'moreover', 'him', 'formerly', 'might', 'seeming', 'but', 'below', 'would', 'between', 'were', 'for'})\n" + ] + } + ], "source": [ "#stop words in scikit-learn for English\n", "print(vectorizer.get_stop_words())" @@ -442,7 +519,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -456,7 +533,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.10.10" }, "latex_envs": { "LaTeX_envs_menu_present": true,