mirror of
				https://github.com/gsi-upm/sitc
				synced 2025-10-30 23:18:18 +00:00 
			
		
		
		
	Compare commits
	
		
			2 Commits
		
	
	
		
			36d117e417
			...
			6e8448f22f
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 6e8448f22f | ||
|  | 8f2a5c17d8 | ||
| @@ -89,7 +89,7 @@ | |||||||
|     } |     } | ||||||
|    }, |    }, | ||||||
|    "source": [ |    "source": [ | ||||||
|     "In this session we are going to learn to process text so that can apply machine learning techniques." |     "In this session, we are going to learn to process text so that we can apply machine learning techniques." | ||||||
|    ] |    ] | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
| @@ -101,7 +101,7 @@ | |||||||
|    }, |    }, | ||||||
|    "source": [ |    "source": [ | ||||||
|     "# NLP Basics\n", |     "# NLP Basics\n", | ||||||
|     "In this notebook we are going to use two popular NLP libraries:\n", |     "In this notebook, we are going to use two popular NLP libraries:\n", | ||||||
|     "* NLTK (Natural Language Toolkit, https://www.nltk.org/) \n", |     "* NLTK (Natural Language Toolkit, https://www.nltk.org/) \n", | ||||||
|     "* Spacy (https://spacy.io/)" |     "* Spacy (https://spacy.io/)" | ||||||
|    ] |    ] | ||||||
| @@ -116,7 +116,7 @@ | |||||||
|    "source": [ |    "source": [ | ||||||
|     "Main characteristics:\n", |     "Main characteristics:\n", | ||||||
|     "* both are open source and very popular\n", |     "* both are open source and very popular\n", | ||||||
|     "* NLTK was released in 2001 while Spacy was in 2015\n", |     "* NLTK was released in 2001, while Spacy was in 2015\n", | ||||||
|     "* Spacy provides very efficient implementations" |     "* Spacy provides very efficient implementations" | ||||||
|    ] |    ] | ||||||
|   }, |   }, | ||||||
| @@ -130,7 +130,7 @@ | |||||||
|    "source": [ |    "source": [ | ||||||
|     "# Spacy installation\n", |     "# Spacy installation\n", | ||||||
|     "\n", |     "\n", | ||||||
|     "You need to install previously spacy if not installed:\n", |     "You need to install spacy if not installed:\n", | ||||||
|     "* `pip install spacy`\n", |     "* `pip install spacy`\n", | ||||||
|     "* or `conda install -c conda-forge spacy`\n", |     "* or `conda install -c conda-forge spacy`\n", | ||||||
|     "\n", |     "\n", | ||||||
| @@ -148,7 +148,7 @@ | |||||||
|    "source": [ |    "source": [ | ||||||
|     "# Spacy pipelines\n", |     "# Spacy pipelines\n", | ||||||
|     "\n", |     "\n", | ||||||
|     "The function **nlp** takes a raw text and perform several operations (tokenization, tagger, NER, ...)\n", |     "The function **nlp** takes a raw text and performs several operations (tokenization, tagger, NER, ...)\n", | ||||||
|     "" |     "" | ||||||
|    ] |    ] | ||||||
|   }, |   }, | ||||||
| @@ -160,7 +160,7 @@ | |||||||
|     } |     } | ||||||
|    }, |    }, | ||||||
|    "source": [ |    "source": [ | ||||||
|     "From text to doc trough the pipeline" |     "From text to doc through the pipeline" | ||||||
|    ] |    ] | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
| @@ -205,7 +205,7 @@ | |||||||
|     "\n", |     "\n", | ||||||
|     "* **Tokenizer exception:** Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied.\n", |     "* **Tokenizer exception:** Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied.\n", | ||||||
|     "* **Prefix:** Character(s) at the beginning, e.g. $, (, “, ¿.\n", |     "* **Prefix:** Character(s) at the beginning, e.g. $, (, “, ¿.\n", | ||||||
|     "* **Suffix:** Character(s) at the end, e.g. km, ), ”, !.\n", |     "* **Suffix:** Character(s) at the end, e.g. km, ”, !.\n", | ||||||
|     "* **Infix:** Character(s) in between, e.g. -, --, /, …." |     "* **Infix:** Character(s) in between, e.g. -, --, /, …." | ||||||
|    ] |    ] | ||||||
|   }, |   }, | ||||||
|   | |||||||
| @@ -82,7 +82,7 @@ | |||||||
|     } |     } | ||||||
|    }, |    }, | ||||||
|    "source": [ |    "source": [ | ||||||
|     "### 1. List the first 10 tokens of the doc" |     "### 1. List the first 10 tokens of the doc." | ||||||
|    ] |    ] | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
| @@ -149,7 +149,7 @@ | |||||||
|     } |     } | ||||||
|    }, |    }, | ||||||
|    "source": [ |    "source": [ | ||||||
|     "###  7. Visualize the dependency grammar analysis of the second sentence" |     "###  7. Visualize the dependency grammar analysis of the second sentence." | ||||||
|    ] |    ] | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
| @@ -178,7 +178,7 @@ | |||||||
|     } |     } | ||||||
|    }, |    }, | ||||||
|    "source": [ |    "source": [ | ||||||
|     "### 9. List frequencies of POS in the document in a table " |     "### 9. List the frequencies of POS in the document in a table." | ||||||
|    ] |    ] | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
| @@ -191,7 +191,7 @@ | |||||||
|    "source": [ |    "source": [ | ||||||
|     "### 10. Preprocessing\n", |     "### 10. Preprocessing\n", | ||||||
|     "\n", |     "\n", | ||||||
|     "Remove from the doc stopwords, digits and punctuation.\n", |     "Remove from the doc stopwords, digits, and punctuation.\n", | ||||||
|     "\n", |     "\n", | ||||||
|     "Hint: check the token api https://spacy.io/api/token\n", |     "Hint: check the token api https://spacy.io/api/token\n", | ||||||
|     "\n", |     "\n", | ||||||
| @@ -207,7 +207,7 @@ | |||||||
|    }, |    }, | ||||||
|    "source": [ |    "source": [ | ||||||
|     "### 11. Entities of the document\n", |     "### 11. Entities of the document\n", | ||||||
|     "Print the entities of the document, the type of the entity and what the explanation of the entity in a table with three columns.\n", |     "Print the entities of the document, the type of the entity, and the explanation of the entity in a table with three columns.\n", | ||||||
|     "\n", |     "\n", | ||||||
|     "Example:\n", |     "Example:\n", | ||||||
|     "\n", |     "\n", | ||||||
| @@ -223,7 +223,7 @@ | |||||||
|    }, |    }, | ||||||
|    "source": [ |    "source": [ | ||||||
|     "### 12. Visualize the entities\n", |     "### 12. Visualize the entities\n", | ||||||
|     "Show the entities in a graph." |     "Show the entities highlighted in the text." | ||||||
|    ] |    ] | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
| @@ -236,7 +236,7 @@ | |||||||
|    "source": [ |    "source": [ | ||||||
|     "# Movie review\n", |     "# Movie review\n", | ||||||
|     "\n", |     "\n", | ||||||
|     "Classify the rmoview reviews from the following dataset  https://data.world/rajeevsharma993/movie-reviews" |     "Classify the movie reviews from the following dataset  https://data.world/rajeevsharma993/movie-reviews" | ||||||
|    ] |    ] | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user