# %% [markdown]
# # Tagging automático
#
# * *30 min* | Última modificación: Diciembre 4, 2020

# %% [markdown]
# http://www.nltk.org/book/
#
# Text Analytics with Python

# %%
##
## Setup
##
from nltk.corpus import brown
import nltk

##
## Tagged sentences: every sentence of the Brown "news" category as a
## list of (word, tag) pairs. The bare expression at the end shows a
## truncated preview as the cell output.
##
NEWS_CATEGORY = "news"
brown_tagged_sents = brown.tagged_sents(categories=NEWS_CATEGORY)
brown_tagged_sents
# %%
##
## Tokenized sentences (words only, no tags) of the "news" category
##
brown_sents = brown.sents(categories="news")
brown_sents

# %%
##
## Default tagger
## Every token receives the single most common tag.
## Step 1: find that tag in the corpus.
##
tags = [tag for (_, tag) in brown.tagged_words(categories="news")]
most_common_tag = nltk.FreqDist(tags).max()
most_common_tag

# %%
##
## Tagging
##
raw = "I do not like green eggs and ham, I do not like them Sam I am!"
tokens = nltk.word_tokenize(raw)

## Build the tagger from the tag computed in the previous cell instead of
## repeating it as a hard-coded literal.
default_tagger = nltk.DefaultTagger(most_common_tag)

## Apply the tagger
default_tagger.tag(tokens)

# %%
##
## Evaluate the default tagger against the gold-standard tags.
## The value is the fraction (0-1, not a percentage) of tokens whose tag
## matches the gold tag.
## NOTE(review): newer NLTK releases appear to deprecate `evaluate` in
## favour of `accuracy` -- confirm against the installed version.
##
default_tagger.evaluate(brown_tagged_sents)

# %%
##
## Regular expression tagger
## Assigns a tag according to the first pattern (in list order) that
## matches the token, so specific patterns must precede the catch-all.
##
patterns = [
    (r".*ing$", "VBG"),  # gerunds
    (r".*ed$", "VBD"),  # simple past
    (r".*es$", "VBZ"),  # 3rd singular present
    (r".*ould$", "MD"),  # modals
    (r".*\'s$", "NN$"),  # possessive nouns
    (r".*s$", "NNS"),  # plural nouns
    (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
    (r".*", "NN"),  # nouns (default)
]

## Build the tagger
regexp_tagger = nltk.RegexpTagger(patterns)

## Apply the tagger to one sentence
regexp_tagger.tag(brown_sents[3])

# %%
##
## Accuracy of the regexp tagger (fraction of correctly tagged tokens)
##
regexp_tagger.evaluate(brown_tagged_sents)

# %%
##
## Lookup tagger
## Takes the N most frequent words and stores the most likely tag of
## each one; any other word is left untagged (None).
##

## Most frequent words: list of (word, count) pairs
fd = nltk.FreqDist(brown.words(categories="news"))
most_freq_words = fd.most_common(100)

## Most frequent tag of each of those words
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news"))
likely_tags = {word: cfd[word].max() for (word, _) in most_freq_words}

## Build the tagger from the word -> tag table
baseline_tagger = nltk.UnigramTagger(model=likely_tags)

## Evaluation
baseline_tagger.evaluate(brown_tagged_sents)
# %%
##
## Apply the lookup tagger to one sentence
##
baseline_tagger.tag(brown_sents[3])

# %%
##
## Many of the tags above are None: the lookup table only knows the most
## frequent words. A backoff tagger can be used to classify the rest.
## `baseline_tagger` is rebound here, so the previous cell shows the
## no-backoff behaviour only when the notebook is run top to bottom.
##
baseline_tagger = nltk.UnigramTagger(
    model=likely_tags, backoff=nltk.DefaultTagger("NN")
)

## Same sentence as before: tokens that were None now fall back to "NN"
baseline_tagger.tag(brown_sents[3])

# %%
##
## Evaluates the lookup tagger for different numbers of words in its
## lookup table.
## NOTE(review): the name `display` shadows IPython's built-in `display`
## helper for every later cell; it is kept only so existing calls work.
##
def display(category="news", num_sizes=15):
    """Plot lookup-tagger accuracy against the size of its lookup table.

    For k = 0 .. num_sizes - 1, a lookup tagger is built from the 2**k
    most frequent words of the given Brown category (most likely tag per
    word, with ``DefaultTagger("NN")`` as backoff) and scored on the
    tagged sentences of that same category.

    Parameters
    ----------
    category : str
        Brown corpus category used for the word counts, the tag table
        and the evaluation. Defaults to "news".
    num_sizes : int
        Number of model sizes to evaluate. Defaults to 15.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    # Local import mirrors the original cell; pyplot replaces the
    # discouraged `pylab` interface.
    import matplotlib.pyplot as plt

    # Corpus data is loop-invariant: fetch it once, not once per model size.
    gold_sents = brown.tagged_sents(categories=category)
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories=category))
    word_freqs = nltk.FreqDist(brown.words(categories=category)).most_common()
    words_by_freq = [w for (w, _) in word_freqs]

    def performance(wordlist):
        """Return the accuracy of a lookup tagger restricted to `wordlist`."""
        lookup = {word: cfd[word].max() for word in wordlist}
        tagger = nltk.UnigramTagger(model=lookup, backoff=nltk.DefaultTagger("NN"))
        return tagger.evaluate(gold_sents)

    sizes = []
    perfs = []
    for exponent in range(num_sizes):
        wordlist = words_by_freq[: 2 ** exponent]
        # Record the real table size: the slice is shorter than 2**exponent
        # once the vocabulary is exhausted, and the x value must say so.
        sizes.append(len(wordlist))
        perfs.append(performance(wordlist))

    fig, ax = plt.subplots()
    ax.plot(sizes, perfs, "-bo")
    ax.set_title("Lookup Tagger Performance with Varying Model Size")
    ax.set_xlabel("Model Size")
    ax.set_ylabel("Performance")
    plt.show()


display()

# %%
##
## Unigram tagging
## Assigns to each word the tag that is most frequent for that word.
## A unigram tagger can be *trained* from tagged sentences.
##
## `brown`, `brown_tagged_sents` and `brown_sents` come from the setup
## cells at the top of the notebook; they are not re-imported or rebuilt.

## The constructor receives the tagged sentences (here the whole category)
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)

## Tag one sentence with the trained tagger
unigram_tagger.tag(brown_sents[2007])

# %%
## Scored on the same sentences the tagger was trained on, so this
## figure is optimistic; the held-out evaluation follows below.
unigram_tagger.evaluate(brown_tagged_sents)

# %%
##
## Training / validation split
##
TRAIN_FRACTION = 0.9  # share of sentences used for training
size = int(len(brown_tagged_sents) * TRAIN_FRACTION)
size

# %%
##
## Training and validation
##
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
assert len(train_sents) + len(test_sents) == len(brown_tagged_sents)

## `unigram_tagger` is rebound: from here on it is trained on the
## training split only and scored on unseen sentences.
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)
# %%
##
## General N-gram tagging
## Chooses the tag from the current word together with the n-1
## preceding tags, i.e. it takes context into account.
##
bigram_tagger = nltk.BigramTagger(train_sents)
seen_sent = brown_sents[2007]  # index below the training cut-off
bigram_tagger.tag(seen_sent)

# %%
##
## Using the trained tagger on a sentence beyond the training split.
## In the recorded output every token after the first untagged one is
## None as well.
##
unseen_sent = brown_sents[4203]
bigram_tagger.tag(unseen_sent)

# %%
##
## Evaluation of the bigram tagger on the held-out sentences
##
bigram_tagger.evaluate(test_sents)

# %%
##
## Combining taggers
## `backoff` names the tagger that is used when the current tagger
## cannot assign a tag to a token.
## Chain: BigramTagger -> UnigramTagger -> DefaultTagger("NN"); each
## tagger hands the token to the next one when it has no tag for it.
##
t0 = nltk.DefaultTagger("NN")
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)

# %%
##
## Storing a tagger on disk with pickle
##
import pickle

TAGGER_PATH = "t2.pkl"

## `with` closes the file even if pickling fails.
## HIGHEST_PROTOCOL is what the original protocol argument -1 selects.
with open(TAGGER_PATH, "wb") as tagger_file:
    pickle.dump(t2, tagger_file, pickle.HIGHEST_PROTOCOL)

# %%
##
## Loading the stored tagger back from disk.
## The handle is no longer called `input`, which shadowed the builtin.
## Only unpickle files you wrote yourself: pickle.load can execute
## arbitrary code from an untrusted file.
##
with open(TAGGER_PATH, "rb") as tagger_file:
    tagger = pickle.load(tagger_file)

# %%
##
## Using the restored tagger
##
text = """The board's action shows what free enterprise
    is up against in our complex maze of regulatory laws ."""
tokens = text.split()
tagger.tag(tokens)