mirror of
https://github.com/gsi-upm/sitc
synced 2025-08-24 02:22:21 +00:00
Remove outputs and metadata
This commit is contained in:
@@ -60,9 +60,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"doc1 = 'Summer is coming but Summer is short'\n",
|
||||
@@ -73,9 +71,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Tools"
|
||||
]
|
||||
@@ -110,9 +106,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||||
@@ -123,9 +117,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As we can see, [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer) comes with many options. We can define many configuration options, such as the maximum or minimum frequency of a term (*min_fd*, *max_df*), maximum number of features (*max_features*), if we analyze words or characters (*analyzer*), or if the output is binary or not (*binary*). *CountVectorizer* also allows us to include if we want to preprocess the input (*preprocessor*) before tokenizing it (*tokenizer*) and exclude stop words (*stop_words*).\n",
|
||||
"\n",
|
||||
@@ -137,9 +129,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectors = vectorizer.fit_transform(documents)\n",
|
||||
@@ -148,9 +138,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We see the vectors are stored as a sparse matrix of 3x6 dimensions.\n",
|
||||
"We can print the matrix as well as the feature names."
|
||||
@@ -159,9 +147,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(vectors.toarray())\n",
|
||||
@@ -170,9 +156,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As you can see, the pronoun 'I' has been removed because of the default token_pattern. \n",
|
||||
"We can change this as follows."
|
||||
@@ -181,9 +165,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words=None, token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
|
||||
@@ -201,9 +183,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
|
||||
@@ -214,9 +194,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#stop words in scikit-learn for English\n",
|
||||
@@ -226,9 +204,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Vectors\n",
|
||||
@@ -246,9 +222,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from scipy.spatial.distance import cosine\n",
|
||||
@@ -275,9 +249,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', binary=True) \n",
|
||||
@@ -288,9 +260,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectors.toarray()"
|
||||
@@ -313,9 +283,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', ngram_range=[2,2]) \n",
|
||||
@@ -326,9 +294,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectors.toarray()"
|
||||
@@ -351,9 +317,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
@@ -366,9 +330,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectors.toarray()"
|
||||
@@ -384,9 +346,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train = [doc1, doc2, doc3]\n",
|
||||
@@ -400,10 +360,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"scrolled": true
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectors.toarray()"
|
||||
@@ -419,9 +376,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
||||
@@ -445,9 +400,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.metrics.pairwise import linear_kernel\n",
|
||||
|
Reference in New Issue
Block a user