1
0
mirror of https://github.com/gsi-upm/sitc synced 2025-08-24 02:22:21 +00:00

Included installation of nltk

This commit is contained in:
cif2cif
2017-04-20 12:56:39 +02:00
parent cb40531dc4
commit e88e144a50
5 changed files with 140 additions and 987 deletions

View File

@@ -59,7 +59,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"collapsed": false
},
@@ -109,27 +109,11 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=5000, min_df=1,\n",
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
" tokenizer=None, vocabulary=None)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
@@ -152,23 +136,11 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<3x10 sparse matrix of type '<class 'numpy.int64'>'\n",
"\twith 15 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectors = vectorizer.fit_transform(documents)\n",
"vectors"
@@ -186,22 +158,11 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0 1 1 2 0 0 1 2 0 0]\n",
" [1 0 0 0 2 0 0 1 2 1]\n",
" [1 0 0 0 2 1 0 0 1 1]]\n",
"['and', 'but', 'coming', 'is', 'like', 'sandwiches', 'short', 'summer', 'the', 'winter']\n"
]
}
],
"outputs": [],
"source": [
"print(vectors.toarray())\n",
"print(vectorizer.get_feature_names())"
@@ -219,32 +180,11 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['and',\n",
" 'but',\n",
" 'coming',\n",
" 'i',\n",
" 'is',\n",
" 'like',\n",
" 'sandwiches',\n",
" 'short',\n",
" 'summer',\n",
" 'the',\n",
" 'winter']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words=None, token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
"vectors = vectorizer.fit_transform(documents)\n",
@@ -260,22 +200,11 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', token_pattern='(?u)\\\\b\\\\w+\\\\b') \n",
"vectors = vectorizer.fit_transform(documents)\n",
@@ -284,19 +213,11 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"frozenset({'could', 'sixty', 'onto', 'by', 'against', 'up', 'a', 'everything', 'other', 'otherwise', 'ourselves', 'beside', 'nowhere', 'then', 'below', 'put', 'ten', 'such', 'cannot', 'either', 'due', 'hasnt', 'whereupon', 'were', 'once', 'at', 'for', 'front', 'get', 'whereas', 'that', 'eight', 'another', 'except', 'of', 'wherever', 'over', 'to', 'whom', 'you', 'former', 'behind', 'yours', 'yourself', 'what', 'even', 'however', 'go', 'less', 'bottom', 'may', 'along', 'is', 'can', 'move', 'eg', 'somewhere', 'latterly', 'seemed', 'thence', 'becoming', 'himself', 'whether', 'six', 'first', 'off', 'do', 'many', 'namely', 'never', 'because', 'mostly', 'nevertheless', 'thereupon', 'here', 'least', 'anyone', 'one', 'others', 'cry', 'they', 'thereby', 'ie', 'am', 'this', 'would', 'any', 'while', 'see', 'too', 'your', 'somehow', 'within', 'same', 'sometimes', 'thereafter', 'must', 'take', 're', 'both', 'fill', 'nor', 'sometime', 'he', 'third', 'more', 'also', 'most', 'during', 'much', 'our', 'thick', 'enough', 'full', 'toward', 'with', 'mill', 'anyhow', 'nobody', 'why', 'thru', 'although', 'nothing', 'meanwhile', 'or', 'some', 'ltd', 'wherein', 'thus', 'someone', 'whereby', 'who', 'un', 'are', 'hundred', 'whereafter', 'fire', 'twenty', 'only', 'several', 'among', 'no', 'than', 'before', 'been', 'else', 'find', 'fifteen', 'hence', 'ours', 'already', 'be', 'besides', 'next', 'interest', 'whither', 'whole', 'eleven', 'without', 'five', 'show', 'in', 'throughout', 'own', 'amongst', 'will', 'neither', 'everywhere', 'part', 'give', 'my', 'hers', 'his', 'upon', 'well', 'him', 'yourselves', 'whatever', 'cant', 'though', 'had', 'again', 'every', 'noone', 'top', 'which', 'de', 'almost', 'system', 'under', 'down', 'latter', 'above', 'whence', 'found', 'myself', 'three', 'those', 'become', 'moreover', 'but', 'anyway', 'beyond', 'from', 'now', 'as', 'seeming', 'con', 'themselves', 'hereupon', 'each', 'serious', 'two', 'across', 'out', 'the', 'therein', 'between', 'inc', 'where', 'anything', 'seem', 'co', 'therefore', 'whoever', 'herein', 'about', 'herself', 'should', 'anywhere', 'how', 'we', 'after', 'describe', 'being', 'etc', 'very', 'not', 'an', 'me', 'call', 'per', 'detail', 'still', 'around', 'hereby', 'sincere', 'their', 'has', 'became', 'beforehand', 'everyone', 'hereafter', 'made', 'ever', 'indeed', 'itself', 'something', 'afterwards', 'none', 'done', 'nine', 'alone', 'please', 'its', 'name', 'since', 'on', 'she', 'bill', 'have', 'mine', 'few', 'her', 'seems', 'always', 'side', 'forty', 'further', 'via', 'last', 'amount', 'towards', 'fify', 'through', 'whose', 'couldnt', 'perhaps', 'thin', 'until', 'becomes', 'elsewhere', 'and', 'i', 'them', 'together', 'us', 'was', 'when', 'rather', 'whenever', 'formerly', 'keep', 'so', 'back', 'there', 'amoungst', 'might', 'these', 'all', 'empty', 'often', 'into', 'it', 'twelve', 'yet', 'if', 'four'})\n"
]
}
],
"outputs": [],
"source": [
"#stop words in scikit-learn for English\n",
"print(vectorizer.get_stop_words())"
@@ -304,24 +225,11 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 0, 0, 1, 2, 0],\n",
" [0, 2, 0, 0, 1, 1],\n",
" [0, 2, 1, 0, 0, 1]], dtype=int64)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Vectors\n",
"f_array = vectors.toarray()\n",
@@ -337,19 +245,11 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.666666666667 1.0 0.166666666667\n"
]
}
],
"outputs": [],
"source": [
"from scipy.spatial.distance import cosine\n",
"d12 = cosine(f_array[0], f_array[1])\n",
@@ -374,22 +274,11 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', binary=True) \n",
"vectors = vectorizer.fit_transform(documents)\n",
@@ -398,24 +287,11 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 0, 0, 1, 1, 0],\n",
" [0, 1, 0, 0, 1, 1],\n",
" [0, 1, 1, 0, 0, 1]], dtype=int64)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectors.toarray()"
]
@@ -436,29 +312,11 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['coming summer',\n",
" 'like sandwiches',\n",
" 'like summer',\n",
" 'like winter',\n",
" 'sandwiches like',\n",
" 'summer coming',\n",
" 'summer like',\n",
" 'summer short']"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectorizer = CountVectorizer(analyzer=\"word\", stop_words='english', ngram_range=[2,2]) \n",
"vectors = vectorizer.fit_transform(documents)\n",
@@ -467,24 +325,11 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 0, 0, 0, 0, 1, 0, 1],\n",
" [0, 0, 1, 1, 0, 0, 1, 0],\n",
" [0, 1, 0, 1, 1, 0, 0, 0]], dtype=int64)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectors.toarray()"
]
@@ -505,22 +350,11 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
@@ -531,27 +365,11 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.48148213, 0. , 0. , 0.48148213, 0.73235914,\n",
" 0. ],\n",
" [ 0. , 0.81649658, 0. , 0. , 0.40824829,\n",
" 0.40824829],\n",
" [ 0. , 0.77100584, 0.50689001, 0. , 0. ,\n",
" 0.38550292]])"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectors.toarray()"
]
@@ -565,22 +383,11 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['coming', 'like', 'sandwiches', 'short', 'summer', 'winter']"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"train = [doc1, doc2, doc3]\n",
"vectorizer = TfidfVectorizer(analyzer=\"word\", stop_words='english')\n",
@@ -592,28 +399,12 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.48148213, 0. , 0. , 0.48148213, 0.73235914,\n",
" 0. ],\n",
" [ 0. , 0.81649658, 0. , 0. , 0.40824829,\n",
" 0.40824829],\n",
" [ 0. , 0.77100584, 0.50689001, 0. , 0. ,\n",
" 0.38550292]])"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"vectors.toarray()"
]
@@ -627,22 +418,11 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.38324078, 0.24713249, 0.23336362]])"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from sklearn.metrics.pairwise import cosine_similarity\n",
"\n",
@@ -664,22 +444,11 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.38324078, 0.24713249, 0.23336362])"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from sklearn.metrics.pairwise import linear_kernel\n",
"cosine_similarity = linear_kernel(vector_query, vectors).flatten()\n",
@@ -734,7 +503,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
"version": "3.5.2"
}
},
"nbformat": 4,