{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Part-of-Speech (POS) Tagging / Categorización Léxica\n", "\n", "* *30 min* | Última modificación: Diciembre 1, 2020" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "http://www.nltk.org/book/\n", "\n", "Text Analytics with Python" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "TAG Descripción Ejemplo\n", "------------------------------------------------------------\n", "CC Coordinating conjunction and, or\n", "CD Cardinal number one, two, 3\n", "DT Determiner a, the\n", "EX Existential there there were two cars \n", "FW Foreign word hola mundo cruel \n", "IN Preposition/subordinating conjunction of, in, on, that\n", "JJ Adjective quick, lazy\n", "JJR Adjective, comparative quicker, lazier\n", "JJS Adjective, superlative quickest, laziest\n", "NN Noun, singular or mass fox, dog\n", "NNS Noun, plural foxes, dogs\n", "NNP Noun, proper singular John, Alice \n", "NNPS Noun, proper plural Vikings, Indians, Germans\n", "\n", "```" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package averaged_perceptron_tagger to\n", "[nltk_data] /root/nltk_data...\n", "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n", "[nltk_data] date!\n", "[nltk_data] Downloading package brown to /root/nltk_data...\n", "[nltk_data] Package brown is already up-to-date!\n", "[nltk_data] Downloading package universal_tagset to /root/nltk_data...\n", "[nltk_data] Package universal_tagset is already up-to-date!\n", "[nltk_data] Downloading package treebank to /root/nltk_data...\n", "[nltk_data] Package treebank is already up-to-date!\n", "[nltk_data] Downloading package gutenberg to /root/nltk_data...\n", "[nltk_data] Package gutenberg is already up-to-date!\n", "[nltk_data] Downloading package words to /root/nltk_data...\n", "[nltk_data] Package words is already 
up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import nltk\n", "\n", "nltk.download(\"averaged_perceptron_tagger\")\n", "nltk.download(\"brown\")\n", "nltk.download(\"universal_tagset\")\n", "nltk.download(\"treebank\")\n", "nltk.download(\"gutenberg\")\n", "nltk.download(\"words\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "##\n", "## Para ver todos los posibles tags ejecute el siguiente codigo\n", "##\n", "# nltk.download('tagsets')\n", "# nltk.help.upenn_tagset()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('And', 'CC'),\n", " ('now', 'RB'),\n", " ('for', 'IN'),\n", " ('something', 'NN'),\n", " ('completely', 'RB'),\n", " ('different', 'JJ')]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Obtención de tags para un texto\n", "##\n", "from nltk.tokenize import word_tokenize\n", "\n", "text = word_tokenize(\"And now for something completely different\")\n", "nltk.pos_tag(text)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('They', 'PRP'),\n", " ('refuse', 'VBP'),\n", " ('to', 'TO'),\n", " ('permit', 'VB'),\n", " ('us', 'PRP'),\n", " ('to', 'TO'),\n", " ('obtain', 'VB'),\n", " ('the', 'DT'),\n", " ('refuse', 'NN'),\n", " ('permit', 'NN')]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Obtención de tags para un texto\n", "##\n", "text = word_tokenize(\"They refuse to permit us to obtain the refuse permit\")\n", "nltk.pos_tag(text)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "man time day year car moment world house family child country boy\n", "state job place way war girl work word\n" 
] } ], "source": [ "##\n", "## Ejemplo de obtención de texto similares\n", "##\n", "text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())\n", "text.similar(\"woman\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "made said done put had seen found given left heard was been brought\n", "set got that took in told felt\n" ] } ], "source": [ "##\n", "## Ejemplo de obtención de texto similares\n", "##\n", "text.similar(\"bought\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "in on to of and for with from at by that into as up out down through\n", "is all about\n" ] } ], "source": [ "##\n", "## Ejemplo de obtención de texto similares\n", "##\n", "text.similar(\"over\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a his this their its her an that our any all one these my in your no\n", "some other and\n" ] } ], "source": [ "##\n", "## Ejemplo de obtención de texto similares\n", "##\n", "text.similar(\"the\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "##\n", "## Ejemplo --- POS tagging usando spaCy\n", "##\n", "sentence = \"US unveils world's most powerful supercomputer, beats China.\"\n", "\n", "# nlp = spacy.load(\"en_core_web_sm\", parse=True, tag=True, entity=True)\n", "\n", "# sentence_nlp = nlp(sentence)\n", "\n", "# spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]\n", "# pd.DataFrame(spacy_pos_tagged, columns=[\"Word\", \"POS tag\", \"Tag type\"])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Word | \n", "POS tag | \n", "
---|---|---|
0 | \n", "US | \n", "NNP | \n", "
1 | \n", "unveils | \n", "JJ | \n", "
2 | \n", "world | \n", "NN | \n", "
3 | \n", "'s | \n", "POS | \n", "
4 | \n", "most | \n", "RBS | \n", "
5 | \n", "powerful | \n", "JJ | \n", "
6 | \n", "supercomputer | \n", "NN | \n", "
7 | \n", ", | \n", ", | \n", "
8 | \n", "beats | \n", "VBZ | \n", "
9 | \n", "China | \n", "NNP | \n", "
10 | \n", ". | \n", ". | \n", "