{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Tagging automático\n", "\n", "* *30 min* | Última modificación: Diciembre 4, 2020" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "http://www.nltk.org/book/\n", "\n", "Text Analytics with Python" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), (\"Atlanta's\", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), (\"''\", \"''\"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlanta', 'NP-TL'), (\"''\", \"''\"), ('for', 'IN'), ('the', 'AT'), ('manner', 'NN'), ('in', 'IN'), ('which', 'WDT'), ('the', 'AT'), ('election', 'NN'), ('was', 'BEDZ'), ('conducted', 'VBN'), ('.', '.')], ...]" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Preparación\n", "##\n", "import nltk\n", "from nltk.corpus import brown\n", "\n", "##\n", "## Tags por frase\n", "##\n", "brown_tagged_sents = brown.tagged_sents(categories=\"news\")\n", "brown_tagged_sents" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', \"Atlanta's\", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', \"''\", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', \"''\", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Sentencias tokenizadas\n", "##\n", "brown_sents = brown.sents(categories=\"news\")\n", "brown_sents" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'NN'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Default tagger\n", "## Se asigna el tag mas común.\n", "## Cómputo del tag más común\n", "##\n", "tags = [tag for (word, tag) in brown.tagged_words(categories=\"news\")]\n", "nltk.FreqDist(tags).max()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('I', 'NN'),\n", " ('do', 'NN'),\n", " ('not', 'NN'),\n", " ('like', 'NN'),\n", " ('green', 'NN'),\n", " ('eggs', 'NN'),\n", " ('and', 'NN'),\n", " ('ham', 'NN'),\n", " (',', 'NN'),\n", " ('I', 'NN'),\n", " ('do', 'NN'),\n", " ('not', 'NN'),\n", " ('like', 'NN'),\n", " ('them', 'NN'),\n", " ('Sam', 'NN'),\n", " ('I', 'NN'),\n", " ('am', 'NN'),\n", " ('!', 'NN')]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Tagging\n", "##\n", "raw = \"I do not like green eggs and ham, I do not like them Sam I am!\"\n", "tokens = nltk.word_tokenize(raw)\n", "\n", "## Crea el tagger\n", "default_tagger = nltk.DefaultTagger(\"NN\")\n", "\n", "## Aplica el tagger\n", "default_tagger.tag(tokens)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.13089484257215028" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Evaluación del DefaultTagger('NN') vs las\n", "## palabras ya clasificadas. El valor indica\n", "## el porcentaje de tags correctamente asignados\n", "##\n", "default_tagger.evaluate(brown_tagged_sents)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('``', 'NN'),\n", " ('Only', 'NN'),\n", " ('a', 'NN'),\n", " ('relative', 'NN'),\n", " ('handful', 'NN'),\n", " ('of', 'NN'),\n", " ('such', 'NN'),\n", " ('reports', 'NNS'),\n", " ('was', 'NNS'),\n", " ('received', 'VBD'),\n", " (\"''\", 'NN'),\n", " (',', 'NN'),\n", " ('the', 'NN'),\n", " ('jury', 'NN'),\n", " ('said', 'NN'),\n", " (',', 'NN'),\n", " ('``', 'NN'),\n", " ('considering', 'VBG'),\n", " ('the', 'NN'),\n", " ('widespread', 'NN'),\n", " ('interest', 'NN'),\n", " ('in', 'NN'),\n", " ('the', 'NN'),\n", " ('election', 'NN'),\n", " (',', 'NN'),\n", " ('the', 'NN'),\n", " ('number', 'NN'),\n", " ('of', 'NN'),\n", " ('voters', 'NNS'),\n", " ('and', 'NN'),\n", " ('the', 'NN'),\n", " ('size', 'NN'),\n", " ('of', 'NN'),\n", " ('this', 'NNS'),\n", " ('city', 'NN'),\n", " (\"''\", 'NN'),\n", " ('.', 'NN')]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Regular expression tagger\n", "## Asigna los tokens de acuerdo a los patrones\n", "## dados como expresiones regulares\n", "##\n", "patterns = [\n", " (r\".*ing$\", \"VBG\"), # gerunds\n", " (r\".*ed$\", \"VBD\"), # simple past\n", " (r\".*es$\", \"VBZ\"), # 3rd singular present\n", " (r\".*ould$\", \"MD\"), # modals\n", " (r\".*\\'s$\", \"NN$\"), # possessive nouns\n", " (r\".*s$\", \"NNS\"), # plural nouns\n", " (r\"^-?[0-9]+(\\.[0-9]+)?$\", \"CD\"), # cardinal numbers\n", " (r\".*\", \"NN\"), # nouns (default)\n", "]\n", "\n", "## crea el tagger\n", "regexp_tagger = nltk.RegexpTagger(patterns)\n", "\n", "## aplica el tagger a una sentencia\n", "regexp_tagger.tag(brown_sents[3])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.20186168625812995" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Evaluación de la precisión del tagger\n", "##\n", "regexp_tagger.evaluate(brown_tagged_sents)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.45578495136941344" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Lookup tagger\n", "## Busca las N palabras más frecuentes y almacena\n", "## su tag más probable\n", "##\n", "\n", "## Computa las palabras más frecuentes\n", "fd = nltk.FreqDist(brown.words(categories=\"news\"))\n", "most_freq_words = fd.most_common(100)\n", "\n", "## computa el tag más frecuente para cada palabra\n", "cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories=\"news\"))\n", "likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)\n", "\n", "## construye el tagger\n", "baseline_tagger = nltk.UnigramTagger(model=likely_tags)\n", "\n", "## Evaluación\n", "baseline_tagger.evaluate(brown_tagged_sents)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('``', '``'),\n", " ('Only', None),\n", " ('a', 'AT'),\n", " ('relative', None),\n", " ('handful', None),\n", " ('of', 'IN'),\n", " ('such', None),\n", " ('reports', None),\n", " ('was', 'BEDZ'),\n", " ('received', None),\n", " (\"''\", \"''\"),\n", " (',', ','),\n", " ('the', 'AT'),\n", " ('jury', None),\n", " ('said', 'VBD'),\n", " (',', ','),\n", " ('``', '``'),\n", " ('considering', None),\n", " ('the', 'AT'),\n", " ('widespread', None),\n", " ('interest', None),\n", " ('in', 'IN'),\n", " ('the', 'AT'),\n", " ('election', None),\n", " (',', ','),\n", " ('the', 'AT'),\n", " ('number', None),\n", " ('of', 'IN'),\n", " ('voters', None),\n", " ('and', 'CC'),\n", " ('the', 'AT'),\n", " ('size', None),\n", " ('of', 'IN'),\n", " ('this', 'DT'),\n", " ('city', None),\n", " (\"''\", \"''\"),\n", " ('.', '.')]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Aplica el tagger\n", "##\n", "baseline_tagger.tag(brown.sents(categories=\"news\")[3])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "##\n", "## Note que muchos de los tags anteriores son None.\n", "## En este caso se puede utilizar otro tagger para que\n", "## clasifique estos tokens\n", "##\n", "baseline_tagger = nltk.UnigramTagger(\n", " model=likely_tags, backoff=nltk.DefaultTagger(\"NN\")\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deZwcdZ3/8dc7IRxCOBNcyDEJGMQIqDgg/kBlFRBQElx1FwwihyIqiD9012D8KbJmvd1Vl10MyiVxwyFiUARhFREVyHCGBIEYEpJwTSCc4Ujg8/vj+22n0tMz0zN0zdXv5+PRj+46uupT1dX1qfpW1feriMDMzJrXiIEOwMzMBpYTgZlZk3MiMDNrck4EZmZNzonAzKzJORGYmTU5J4JekhSSXjPQcQxnkl4t6XpJT0v6zkDHM5hIekbSTt0MXybpgP6MqR6SZkj6zUDHUS9Jk/J/faM6xj1G0g19nM9Zkv5fX77bSMM6EQzWP0V38h/mmfx6TtLLhe5nBjq+ruR1/VyO8xFJ50naoo+TOwFYDWwZEZ9tYJhDXkRsERFLAfI6/mpfpiNpH0nP1vqNJN0m6aRXGmtRRMyNiIMaOc2KvO29KGlMVf/b8s58UhnzrZek4yX9JR/YPCLpSkmjASLixIj414GMD4Z5IhiK8h9mi4jYAjgEeLDSnfsNKCVdbTeH5Rj3BFqBL/Zx2i3A4ujD0471HMEZRMSNwErgA8X+knYDpgL/05vpDYL1fj9wZKVD0u7AqwYunL/F8Q7g34AjI2I08DrgooGNqrOmTASSNpH0H5IezK//kLRJYfjHJC2R9Lik+ZJ27GI6+0laIWl/SadLurAwbINTS0nXSfqapJslPSXpF5K27WXcMyX9NR9ZLJb0vsKwkZK+I2m1pPslnVQ1/8mF4pZrJZ1ZFe8+kv4k6QlJd0javzDsOkmzJf0RWAt0WTQBEBGrgF8Du/Vh2hcAHwH+JZ9dHNDd75XX/UpJn5f0MHBu/i0ukXRhXt6FknaRdJqkR/NvdlAhhmMl3Z3HXSrp44Vhlel/Nn/3IUnHFoZvltf7cklPSrpB0mY9LXfV73qspCsK3fdJuqTQvULSG/PnkPQaSScAMwrr6YrCJN8o6c4cz0WSNu3ipzofOLqq39HAlRHxmKTv5Xk/JekWSW8rxHS6pEvzOn4KmClpraTtCuPsKald0ihVFZ/k5TgxL+sTeXtUHtbtttyFn1Qty0dI21JxPW8l6YIc03JJX1Q+qMnz/Hae51LgPTW+++P8+6+S9FVJI7uJp2Iv4M8RcRtARDweEedHxNN5un87q5N0hQpn/0qlAcfkYbtKukZpn3SPpH+sY971i4hh+wKWAQfU6H8GcCOwPTAW+BPwr3nYO0nFEnsCmwA/AK4vfDeA1wAHAyuAvXP/04ELC+NNyuNulLuvA1aRdo6bAz8rjt9F/PsDKwvdHwR2JCXwfwKeBXbIw04EFgPjgW2Aa6vm/2fg28DGwH7AU5X5A+OAx4BD87QPzN1jC7E/ALwe2AgY1d26BiYAi4B/7cu0gfOAr9b5e+0PrAe+kX+vzfJv8Tzw7jzNC0hHjLPy9D8G3F+Y/nuAnQEB7yAlpD2rpn9G/u6hefg2efiZeRnGASOB/5Pj6Ha5q9bdTsATebwdgeWV3z0PWwOMKG5/+fMG66nwO9ycp7MtcDdwYhfb14S8bBNy9wjSWcLhufsoYLu8Dj8LPAxsWtje1wGH5+9tBlwJfKIw/X8HfpA/HwPcUPU/+iWwNTARaAcOrmdb7mrbA+4hHXGPzMvRkr83KY93AfALYDTp/3kvcHxhnn/J62Rb4Hds+P/5OfBD0n93+7yOP15r2apiexvwHPAVYF9gk6rhnX7D3P8Q4MEcz+akfc2x+bd4E2kfNbVh+8r+3DH394uuE8FfgUML3e8GluXPPwa+WRi2Rd7gKxtTAKeR/qy7FcY7nZ4TwdcLw6cCLwIju4l/fwqJoMbw24Hp+fNvKxtm7j6gMv/8R1sPvKow/EI6EsHngZ9UTftq4COF2M+oY10/Q9qhLQf+i7Rz6PW0q/8cPfxe++f1uGnVb3FNofuwHNvI3D06r5utu1iWy4FTCtN/jsJOCHgU2Ie0A3wOeEONaXS73DXGX0E6+DgCmEPa0exK+vPPL4xXTyI4qtD9TeCsbn63a4Ev5M8HknbInRJ9Hr6msqx5HV9fNfyfgD/mzyNJiaNyoHQMnRPBfoXui4GZPW3L3f3PSUWRXyMdpF1D2vaD9F8cmbeTqYXvfRy4rjDPEwvDDqLj//Nq4AVgs8LwI4Hf1Vq2GvEdAlxB+m88A3y3sC3W+g13ydvYfoX1+oeqcX4IfLm7/2RvXgNdrjdQKkddFctzv8qwWysDIuIZSY+RjvCW5d6fAS6IiLt6Od8VVfMcBYwBHqnny5KOBk4lbdiQklTlAtmOVdMvft4ReDwi1lYNn5A/twAflHRYYfgo0lFRrel15fCIuLYq5kZMu7vfC6A9Ip6v+k5xnT4HrI6IlwrdkNbfE5IOAb5M+gOOIJUtLyx8/7GIWF/oXkvHut+UlKiq1bPcRb8nJZ3X5M9PkM5O3pq7e+PhqlhrFm1m5wNfIJVjfxiYFxHrACR9Djg+fz+ALenY3qDz7/YL4CxJk4HXAk9GxM29iLNyDay7bbk7PwGuByZTVSyU4x5F5+1oXBfzLI7Xkr/7UC69grSd1BVXRPwa+HUuhvp74BLS2csPq8eVtBVpPX4xIipFaS3AWyQ9URh1I9LyNkSzJoIHSSt3Ue6emPsVhwEgaXPS6fGqwvc/CPxY0sqI+F7u9ywbXpz6uxrznVD4PJF0prG6noDzDvVs4F2kMseXJN1OKs4AeIh0Kl1rXg8B20p6VSEZFIevIB29fqybEKKeOGtoxLS7+71eSWwoXWv4Gal8+RcRsU7S5XSs1+6sJhVB7QzcUTWsnuUu+j3pzGUyaaf8BOkawFuB/+ziO31e7oLLgP+S9PfAP5CSEfl6wL+QtrdFEfGypDVsuF42mH9EPC/pYlKR0q70fUfV3bbcpYhYLul+UnHc8VWDV5P+by2kYidI21Hlf/0Qnf+fFStIZwRjqg4IeiUiXgb+V9JvydfPinKi+CnpTGNO1fx/HxEH9nXePWmGi8WjJG1aeG1EuiPii5LGKt1y9iVSUQl52LGS3ph3Ev8G3BQRywrTfJD0BzlF0idyv9uBt0uamLP6aTViOUrSVEmvIpU5X1o4Su3J5qQ/XjukC4xsuDFdnOMZJ2lrUtEEkP4gQBtwuqSNJb2VtNOpuBA4TNK780WzTZUukhb/jH3ViGl393u9UhuTyvTbgfX57KCu2xzzH/sc4LuSdszL99a83fR2uX9POlrcLCJWAn8gFXFsB9zWxXceoYcL93Usw7PApcC5wPKIaMuDRpOKE9uBjSR9iXRG0JMLSEUl0+h7IuhyW67D8cA783L9Tf6fXQzMljQ6H1idSsd2dDHwaUnjJW0DzCx89yHgN8B3JG0paYSknZXuCOqWpOmSjpC0jZK9SWd6N9YYfTbpf35KVf9fArtI+rDShfdRkvaS9Lo61kddmiERXEkqCqi8Tge+Stox3kkqArg19yMXbfw/0lHiQ6SjvSOqJxoRD5CSwUxJH42Ia0i3hd0J3EL68ar9hFQm+DCpSOHT9S5ERCwGvkO66PsIsDvwx8IoZ5M21jtJO44rSX/kSqKpHF0+lpf1ItJRDhGxAphOKiJoJx2B/DMN2D4aNO0uf68GxPc06Xe4mFQG/iFgfi8m8bkc0wLgcdJF6xG9Xe6IuJdUfvyH3P0UsJRU5t7VwcKPgalKd91c3ouYq51POlIuFqdcDVxFuqC6nHTm02NRSET8EXgZuDUfgPRFT9tyd/P/ayGZVTuZdOa+FLiBdPR9TmGeV5PO7G4lnSkVHU06aFhM2k4uBXaoY1nWkG5OuI98gwbwrYiYW2PcI0nXntYU7hyakbfRg0j7oQdJ+4/KzRENoXzhwUom6TrSxdkf9dP8DiFdJGzpYvhFwF8i4sv9EY81j1z08dNGbes9bcv2yjXDGUFTULqf/VBJG0kaR7r4+fPC8L3y6ewISQeTjlZfyVGkWSeS9iLd/dTnh6Z62pat8ZwIhg+R7lVeQzqdvptUll7xd6RbNZ8Bvk+637ursmezXpN0Pul21M/k4ow+T4rut2VrMBcNmZk1OZ8RmJk1uSH3HMGYMWNi0qRJAx2GmdmQcsstt6yOiLG1hg25RDBp0iTa2rq6O8zMzGqR1OXtvC4aMjNrck4EZmZNzonAzKzJORGYmTU5JwIzsybnRGBmNsjNnQuTJsGIEel9bq0q616BIXf7qJlZM5k7F044AdbmlkSWL0/dADNmNGYePiMwMxtEImDNGrjnHvjDH+Azn+lIAhVr18KsWY2bp88IzMxKtnYtPPpofa/2dlhfRztoDzzQuPicCMzMemndOli9uvudebH72WdrT2eLLWD77dOrpQX22quje+zY9H700fDQQ52/O3Fi53595URgZk2vUhzT1Y68+vX447WnM2rUhjvxKVM6duzVr7Fj4VWvqj2dom99a8NrBJC+N3t2Y5YdnAjMbJhqVHHMdtt17Lx3373rHfv228NWW4HU2OWoXBCeNSsVB02cmJJAoy4UwxBsj6C1tTVc6ZxZ8+mpOKZ6x15PcUx3R+vbbw9jxsBGw+RwWdItEdFaa9gwWUQzG2pefhmeeKK+MvZ6imMqO/BddnnlxTHNptREkNvG/R4wEvhRRHy9angLcA4wFngcOCoiVpYZk5mV59ln6y9nH8zFMc2mtEQgaSRwJnAgsBJYIGl+RCwujPZt4IKIOF/SO4GvAR8uKyYz653eFMc8+mjn+90rurs7ZjgXxwwVZa7uvYElEbEUQNI8YDpQTARTgVPz598Bl5cYj1nT6644plbRTD3FMdtv7+KYoa7MRDAOWFHoXgm8pWqcO4B/IBUfvQ8YLWm7iHisOJKkE4ATACY28uZZs2GgWBzTUzm7i2OsloE+Afsc8J+SjgGuB1YBL1WPFBFzgDmQ7hrqzwDNejJ3bmNv7Vu3Lu2weypjb0RxTOWI3cUxza3Mn34VMKHQPT73+5uIeJB0RoCkLYD3R8QTJcZk1lD1VAhWb3FM5bVmTe15uTjGylLacwSSNgLuBd5FSgALgA9FxKLCOGOAxyPiZUmzgZci4kvdTdfPEdhg0tJSu86XTTeF1762d8UxPb1cHGOvxIA8RxAR6yWdBFxNun30nIhYJOkMoC0i5gP7A1+TFKSioU+VFY/ZK/XEE7BwIdx5Z8d7VxV/Pf+8i2Ns6PCTxWZVXnwxVQFc3OkvXAgrCrc+bLMN7LEH3HorPP1052m0tMCyZf0WslmP/GSxWQ0RsGrVhjv7O++Ev/wlXbCFVC7/utfBO96R7qTZffeUAHbcMRXTVF8jgMZXCGZWNicCawpPPw133dW5aOeJwq0JEyaknfx73pPed989lfOPGtX1dPujQjCzsrloyIaV9evhvvs2PMJfuBDuv79jnNGjO47sK++77QZbbz1wcZuVzUVDNuxEwCOPdD7CX7wYXnghjTNyZDqi33tv+OhHO4p2Wlp8941ZkROBDXpr18KiRZ0v3ra3d4yzww5pJ3/yyR1H+bvumm7jNLPuORHYoPHyy7B06YZH+AsXwpIl6QwA0oXY3XaDadM6inZ23z3dimlmfeNEYANi9erO5fh33dVx940Er3lN2tnPmNFxlL/TTjBixMDGbjbcOBFYqZ5/Hu6+u/NOv9gY95gxaSf/sY91HOW//vWuIsGsvzgRWENEpHp2qi/e3nsvvJSrEdxkE5g6FQ46aMO7dl79al+8NRtITgTWSU+1aVaqWqgu1nnqqY5xJk9OO/n3v7+jHH/KFFepYDYY+W9pG6hVm+Zxx8FFF6Uj++qqFrbeOh3Zf/jDG96TP3r0wMRvZr3nRGAb+MIXOtdv/+KLcMUVaUf/trdt+CDWuHEu1jEb6pwIDEgPYp17bte1aUqpGMjMhh8ngib25JMwb15KADfdlMrvN9sMnnuu87huIdRs+PId2U3m5Zfh2mvTxd+/+zs48cTU5u13v5tq4jz77M63bbo2TbPhzWcETeL+++G88+D889MF4K23TheBjz0W3vzmjnJ+16Zp1nycCIaxZ5+Fn/0sFf1cd13a2R90EHzjGzB9etf18MyY4R2/WTNxIhhmIuDPf047/4suSvXw77wzfPWrcPTRqc59M7OiUhOBpIOB75HaLP5RRHy9avhE4Hxg6zzOzIi4ssyYhqsHH4QLLkjFP/fcA5tvDv/4j6noZ7/9fIunmXWttEQgaSRwJnAgsBJYIGl+RCwujPZF4OKI+G9JU4ErgUllxTTcvPBCur//3HPhqqvSheC3vQ0+/3n44Adhiy0GOkIzGwrKPCPYG1gSEUsBJM0DpgPFRBDAlvnzVsCDJcYzbNx2W9r5z50Ljz+eHuo67TQ45phUY6eZWW+UmQjGAYXKCFgJvKVqnNOB30g6GdgcOKDWhCSdAJwAMLFJb2hfvRp++lM45xy4445Ugdvhh6einwMOSK1xmZn1xUA/R3AkcF5EjAcOBX4iqVNMETEnIlojonXs2LH9HuRAWb8efvUr+MAHYMcd4ZRTUkPqZ56ZqnGeNw/e/W4nATN7ZcpMBKuA4j0q43O/ouOBiwEi4s/ApkDTtDU1dy5MmpQaWpk0KXVDutg7c2a6h/+974Xrr4eTTkpVPCxYAJ/8JGyzzUBGbmbDSZlFQwuAKZImkxLAEcCHqsZ5AHgXcJ6k15ESQTtNoKtaPk8/PTXNOHIkvOc9qejn0ENh440HNFwzG8ZKSwQRsV7SScDVpFtDz4mIRZLOANoiYj7wWeBsSf+XdOH4mIhK67TD26xZtWv5XLYMvvUtOOqoVAWEmVnZNNT2u62trdHW1jbQYbxiI0Z0NMheJKXbQM3MGknSLRHRWmvYQF8sbkrPPNN1e7xNelOUmQ0gJ4J+duutsOeeqR6gUaM2HOZaPs1sIDgR9JMI+N734K1vTdcGrrsuPRTW0pKKg1paYM4cV/ZmZv3Plc71g9Wr090/v/wlHHZYSgDbbZeGecdvZgPNZwQlu+46eMMb4De/ge9/H37xi44kYGY2GDgRlGT9evjSl+Cd70yVv914I5x8smsBNbPBx0VDJVixAj70IbjhhlQR3A9+4JpAzWzwciJosMsvT08Ir1sHF17oawBmNvi5aKhBnn8+1Qf0vvfBTjulqqKdBMxsKHAiaIC774a3vCXVCnrqqfCnP7ldADMbOpwIeqlYY2hLS6o4rrU1NRX5q1/Bd77jCuLMbGjxNYJeqK4x9IEH4OyzYepUuOaa1GaAmdlQ4zOCXqhVYyikuoOcBMxsqHIi6IUHHqjdf8WK2v3NzIYCJ4JemDChdn/XGGpmQ5kTQS/stVfnfq4x1MyGOieCOl11FVx2Gey7bzoDcI2hZjZclHrXkKSDge+Rmqr8UUR8vWr4vwN/nztfBWwfEVuXGVNf3H9/qjJi991T5XFdNSpjZjYUlZYIJI0EzgQOBFYCCyTNj4jFlXEi4v8Wxj8ZeFNZ8fTVc8/B+9+fmo+87DInATMbfsosGtobWBIRSyPiRWAeML2b8Y8E/qfEeHotAj75yVRdxIUXws47D3REZmaNV2YiGAcUb6xcmft1IqkFmAz8tovhJ0hqk9TW3t7e8EC7MmcOnHdeqk76ve/tt9mamfWrwXKx+Ajg0oh4qdbAiJgTEa0R0Tp27Nh+Ceimm1L7AQcfnBKBmdlwVWYiWAUU77wfn/vVcgSDoFioWI/QvvvCVlulfiNHDnRkZmblKTMRLACmSJosaWPSzn5+9UiSdgW2Af5cYiw9qtQjtHx5ujbw0kup6ohf/3ogozIzK19piSAi1gMnAVcDdwMXR8QiSWdImlYY9QhgXkREWbHUo1Y9Qs8/n/qbmQ1nGuD9b6+1trZGW1tbw6c7YkQ6E6gmpVtHzcyGMkm3RERrrWGD5WLxgOuqviDXI2Rmw50TQXbGGZ37uR4hM2sGTgTZ6NHpfexY1yNkZs2lriomJL0K+CwwMSI+JmkK8NqI+GWp0fWjs86C8eNTvUIbud02M2si9Z4RnAu8ALw1d68CvlpKRANg6dJUmdxHP+okYGbNp95EsHNEfBNYBxARawGVFlU/mzMnPTT20Y8OdCRmZv2v3kTwoqTNgACQtDPpDGHIe/FFOOccOOwwGFezJiQzs+Gt3oKQLwNXARMkzQX2BY4pK6j+9POfQ3s7fPzjAx2JmdnAqCsRRMQ1km4F9iEVCZ0SEatLjaxkc+emp4aXL0/FQquH9NKYmfVdXUVDkt4HrI+IX+U7hdZLOrzc0MpTrFcIUr1CH/946m9m1mzqvUbw5Yh4stIREU+QiouGpFr1Cq1d63qFzKw51ZsIao03ZG+0fOCB3vU3MxvO6k0EbZK+K2nn/PoucEuZgZXJ9QqZmXWoNxGcDLwIXJRfLwCfKiuoss2eDZtssmE/1ytkZs2q3ruGngVmlhxLv5kxAy6/HC69NNUrNHFiSgKuV8jMmlG9dQ3tAnwOmFT8TkS8s5ywyvf887DrrnD33QMdiZnZwKr3gu8lwFnAj4CaDcwPJRGpcfpDDx3oSMzMBl69iWB9RPx3qZH0o2XL0tPEb3nLQEdiZjbw6r1YfIWkT0raQdK2lVdPX5J0sKR7JC2RVPMag6R/lLRY0iJJP+1V9H10443pfZ99+mNuZmaDW71nBB/J7/9c6BfATl19QdJI4EzgQGAlsEDS/IhYXBhnCnAasG9ErJG0fW+C74u5c+ETn0ifDz8c/u3ffJHYzJpbvXcNTe7DtPcGlkTEUgBJ84DpwOLCOB8DzoyINXk+j/ZhPnWrVC1Rear4gQdSNzgZmFnzqrupSkm75WKcoyuvHr4yDlhR6F6Z+xXtAuwi6Y+SbpR0cBfzPkFSm6S29vb2ekPuxFVLmJl1Vu/to18G9gemAlcChwA3ABc0YP5T8rTHA9dL2j3XZfQ3ETEHmAPQ2toafZ2Zq5YwM+us3jOCDwDvAh6OiGOBNwBb9fCdVcCEQvf43K9oJTA/ItZFxP3AvaTEUApXLWFm1lm9ieC5iHiZVP30lsCjbLiTr2UBMEXSZEkbA0cA86vGuZx0NoCkMaSioqV1xtRrs2fDxhtv2M9VS5hZs+tNpXNbA2eTKpu7Ffhzd1+IiPXAScDVwN3AxRGxSNIZkqbl0a4GHpO0GPgd8M8R8VgflqMuM2bAtDxnCVpaUnvFvlBsZs1MEb0rcpc0CdgyIu4sI6CetLa2RltbW5+//4lPwCWXuEUyM2sukm6JiNZaw+puU0DSHhTqGpL0moi4rCER9qOVK2H8+IGOwsxs8Kj3rqFzgD2ARcDLuXcATgRmZkNcvWcE+0TE1FIj6ScrV7qOITOzonovFv9Z0pBOBHPnpttEV6+GefPcUL2ZWUW9ZwQXkJLBw6TWyQREROxRWmQNVF21xJNPumoJM7OKuu4akrQEOBVYSMc1AiJieXmh1daXu4YmTYLlNSJtaUlVUpuZDXeNuGuoPSKqHwYbMly1hJlZ1+pNBLfltgKuIBUNATBUbh+dOLH2GYGrljAzq/9i8WakBHAQcFh+vbesoBpt9uxUlUSRq5YwM0t6PCPIDcw8FhGf64d4SlG5IHzyybBmTXqO4Otf94ViMzOoIxFExEuS9u2PYMo0Y0Z6hmDmTLjnns5nCGZmzareawS3S5oPXAI8W+k5VK4RVDz3XHrfdNOBjcPMbDCpNxFsCjwGvLPQb8hVMbF2LWy2GYyou102M7Phr942i48tO5D+sHati4TMzKrVdWwsabykn0t6NL9+JmnIVd323HNOBGZm1eotJDmX1LrYjvl1Re43pFSKhszMrEO9iWBsRJwbEevz6zxgbIlxlcJFQ2ZmndWbCB6TdJSkkfl1FOnicbckHSzpHklLJM2sMfwYSe2Sbs+vj/Z2AXrDicDMrLN67xo6DvgB8O+ku4X+BHR7ATk/iHYmcCCwElggaX5ELK4a9aKIOKlXUfeRrxGYmXXW7RmBpG/kj3tHxLSIGBsR20fE4RHRU5VtewNLImJpRLwIzAOmNyDmPpk7F26+Ga69NtVG6vYIzMySnoqGDpUk4LQ+THscsKLQvTL3q/Z+SXdKulTShFoTknSCpDZJbe3t7b0OpNIewbp1qXv58tTtZGBm1nMiuApYA+wh6SlJTxffGzD/K4BJuYGba4Dza40UEXMiojUiWseO7f016lmzOhqlqVi7NvU3M2t23SaCiPjniNga+FVEbBkRo4vvPUx7FVA8wh+f+xWn/1hEVKq1/hHw5l7GXxe3R2Bm1rUe7xrKF3172unXsgCYImmypI2BI0jPIhSnvUOhcxpwdx/m06Ou2h1wewRmZnUkgoh4CXhZ0la9mXBErAdOAq4m7eAvjohFks6QNC2P9mlJiyTdAXwaOKZX0dfJ7RGYmXWt3jaLfwG8iVSOX6x99NPlhVZbX9oshnRh+MMfhojUVvHs2W6PwMyaRyPaLL6MIVbTaLUZM+C44+DUU+FrXxvoaMzMBo96ax89X9JmwMSIuKfkmEqzbh2MGjXQUZiZDS711j56GHA76XZSJL0xN1QzZLz0UioW2qjecyAzsyZRb11Dp5OeFH4CICJuB3YqKaZSrF+f3n1GYGa2oXoTwbqIeLKq38uNDqZMlaeKnQjMzDZUb0HJIkkfAkZKmkK61fNP5YXVeJVE4KIhM7MN1XtGcDLweuAF4KfAk8BnygqqDC4aMjOrrdvjY0mbAicCrwEWAm/ND4oNOT4jMDOrraczgvOBVlISOAT4dukRlcTXCMzMauspEUyNiKMi4ofAB4C390NMpbgsPw53/PFuj8DMrKinRLCu8mGoFglB2ul/4Qsd3W6PwMysQ0+J4A25/YGnJD1NVbsE/RFgI8yaBc8/v2E/t0dgZpZ0e+k0Ikb2VyBlcnsEZmZdq/f20SHN7RGYmXWtKRLB7Nmw6aYb9nN7BGZmSVMkghkzYObM9FlK7RHMmeP2CMzMoEkSAcCBB6b3q66CZcucBMzMKkpNBJIOlnSPpCWSZnYz3vslhaSarec0wksvpfcRTZP6zMzqU5b5it0AAA1TSURBVNpuMTd6fybpieSpwJGSptYYbzRwCnBTWbEAvJzrSh05LO6DMjNrnDKPj/cGlkTE0oh4EZgHTK8x3r8C3wCerzGsYXxGYGZWW5m7xXHAikL3ytzvbyTtCUyIiF91NyFJJ0hqk9TW3t7ep2B8RmBmVtuAHR9LGgF8F/hsT+NGxJyIaI2I1rFjx/Zpfj4jMDOrrczd4ipgQqF7fO5XMRrYDbhO0jJgH2B+WReMfUZgZlZbmYlgATBF0mRJGwNHAH9r8D4inoyIMRExKSImATcC0yKirYxgfEZgZlZbabvFXFvpScDVwN3AxRGxSNIZkqaVNd9a5s6F445Ln6dNc62jZmZFpbbXFRFXAldW9ftSF+PuX0YMc+emKqfXrk3dDz+cusEPlZmZQRM8WTxrVkcSqHAV1GZmHYZ9InAV1GZm3Rv2icBVUJuZdW/YJ4LZs1OV00WugtrMrMOwTwQzZqQqp8eMSd077ugqqM3Mikq9a2iwmDEjtUMwYwb89rfw2tcOdERmZoPHsD8jqIhI79LAxmFmNtg4EZiZNTknAjOzJudEYGbW5JwIzMyanBOBmVmTcyIwM2tyTgRmZk3OicDMrMk5EZiZNTknAjOzJtcUiWDuXJg5M33eZx83VWlmVlRqIpB0sKR7JC2RNLPG8BMlLZR0u6QbJE1tdAyVpioffzx1r1qVup0MzMwSRaXMpNETlkYC9wIHAiuBBcCREbG4MM6WEfFU/jwN+GREHNzddFtbW6Otra3uOCZNguXLO/dvaYFly+qejJnZkCbplohorTWszDOCvYElEbE0Il4E5gHTiyNUkkC2OdDwrOSmKs3MuldmIhgHrCh0r8z9NiDpU5L+CnwT+HStCUk6QVKbpLb29vZeBeGmKs3MujfgF4sj4syI2Bn4PPDFLsaZExGtEdE6duzYXk3fTVWamXWvzESwCphQ6B6f+3VlHnB4o4OoNFW5zTY5iPFuqtLMrKjMpioXAFMkTSYlgCOADxVHkDQlIu7Lne8B7qMEM2bAk0/Cpz4Ft9wC229fxlzMzIam0hJBRKyXdBJwNTASOCciFkk6A2iLiPnASZIOANYBa4CPlBdPWVM2MxvaSm28PiKuBK6s6velwudTypx/LX6y2MxsQwN+sbi/+IzAzKy2pkkEFT4jMDPbUNMkAp8RmJnV1jSJoMJnBGZmG2qaROAzAjOz2pomEVT4jMDMbENNkwh8RmBmVltTJIK5c+ErX0mf3/AGt0VgZlZU6gNlg0GlYZq1a1P3ypWpG1zfkJkZNMEZwaxZHUmgYu3a1N/MzJogEbhhGjOz7g37ROCGaczMujfsE4EbpjEz696wTwTVDdNMmOCGaczMiob9XUOQdvqPPAKf/SzcdRdsueVAR2RmNngM+zMCMzPrXtMkAj9ZbGZWW6mJQNLBku6RtETSzBrDT5W0WNKdkv5XUkuZ8aR5lj0HM7OhpbREIGkkcCZwCDAVOFLS1KrRbgNaI2IP4FLgm2XF4zMCM7Payjwj2BtYEhFLI+JFYB4wvThCRPwuIirP/d4IjC8xHsBnBGZm1cpMBOOAFYXulblfV44Hfl1rgKQTJLVJamtvb29giGZmNiguFks6CmgFvlVreETMiYjWiGgdO3Zsr6c/d27HA2RTp7r2UTOzojKfI1gFTCh0j8/9NiDpAGAW8I6IeKHRQVTXPrpihWsfNTMrKvOMYAEwRdJkSRsDRwDziyNIehPwQ2BaRDxaRhCufdTMrHulJYKIWA+cBFwN3A1cHBGLJJ0haVoe7VvAFsAlkm6XNL+LyfWZax81M+teqVVMRMSVwJVV/b5U+HxAmfOHVMvo8uW1+5uZ2SC5WFwm1z5qZta9YZ8IKrWPtrSkZwhaWlz7qJlZUdPUPuodv5lZbcP+jMDMzLrnRGBm1uScCMzMmpwTgZlZk3MiMDNrcoohVlG/pHagxiNidRkDrG5gOP1pqMY+VOOGoRu74+5/QyH2loioWWvnkEsEr4SktohoHeg4+mKoxj5U44ahG7vj7n9DOXZw0ZCZWdNzIjAza3LNlgjmDHQAr8BQjX2oxg1DN3bH3f+GcuzNdY3AzMw6a7YzAjMzq+JEYGbW5JomEUg6WNI9kpZImjkI4pkg6XeSFktaJOmU3P90Satyi223Szq08J3Tcvz3SHp3oX+/LpukZZIW5vjacr9tJV0j6b78vk3uL0nfz7HdKWnPwnQ+kse/T9JH+iHu1xbW6+2SnpL0mcG4ziWdI+lRSXcV+jVsHUt6c/4Nl+TvquTYvyXpLzm+n0vaOvefJOm5wro/q6cYu1oPJcXdsG1Dqdnem3L/i5Sa8B0cImLYv4CRwF+BnYCNgTuAqQMc0w7AnvnzaOBeYCpwOvC5GuNPzXFvAkzOyzNyIJYNWAaMqer3TWBm/jwT+Eb+fCjwa0DAPsBNuf+2wNL8vk3+vE0/bxMPAy2DcZ0Dbwf2BO4qYx0DN+dxlb97SMmxHwRslD9/oxD7pOJ4VdOpGWNX66GkuBu2bQAXA0fkz2cBn+iv7b2nV7OcEewNLImIpRHxIjAPmD6QAUXEQxFxa/78NKld53HdfGU6MC8iXoiI+4ElpOUaLMs2HTg/fz4fOLzQ/4JIbgS2lrQD8G7gmoh4PCLWANcAB/djvO8C/hoR3T2lPmDrPCKuBx6vEc8rXsd52JYRcWOkvdIFhWmVEntE/CZSO+YANwLju5tGDzF2tR4aHnc3erVt5LOZdwKXNjruRmiWRDAOWFHoXkn3O91+JWkS8CbgptzrpHwKfU7htLerZRiIZQvgN5JukXRC7vfqiHgof34YeHX+PJjiLjoC+J9C92Bf59C4dTwuf67u31+OIx3hV0yWdJuk30t6W+7XXYxdrYeyNGLb2A54opAMB9U+qFkSwaAlaQvgZ8BnIuIp4L+BnYE3Ag8B3xnA8LqyX0TsCRwCfErS24sD8xHcoL0vOZfNTgMuyb2GwjrfwGBfx12RNAtYD8zNvR4CJkbEm4BTgZ9K2rLe6fXDehhy20ZfNEsiWAVMKHSPz/0GlKRRpCQwNyIuA4iIRyLipYh4GTibdKoJXS9Dvy9bRKzK748CP88xPpJP5yun9Y8OtrgLDgFujYhHYGis86xR63gVGxbN9Ev8ko4B3gvMyDtwctHKY/nzLaTy9V16iLGr9dBwDdw2HiMV2W1U1X9QaJZEsACYkq/ab0wqFpg/kAHlMsMfA3dHxHcL/XcojPY+oHIHw3zgCEmbSJoMTCFdTOvXZZO0uaTRlc+ki4B35XlW7kr5CPCLQtxH5ztb9gGezKf1VwMHSdomn24flPv1hyMpFAsN9nVe0JB1nIc9JWmfvB0eXZhWKSQdDPwLMC0i1hb6j5U0Mn/eibSOl/YQY1froYy4G7Jt5MT3O+AD/RF3rw301er+epHurLiXdMQxaxDEsx/plPZO4Pb8OhT4CbAw958P7FD4zqwc/z0U7vLoz2Uj3Q1xR34tqsyPVAb6v8B9wLXAtrm/gDNzbAuB1sK0jiNdZFsCHNtP631z0tHZVoV+g26dkxLVQ8A6Unny8Y1cx0Araaf2V+A/ybUMlBj7ElLZeWVbPyuP+/68Hd0O3Aoc1lOMXa2HkuJu2LaR/zs353VxCbBJf2zz9bxcxYSZWZNrlqIhMzPrghOBmVmTcyIwM2tyTgRmZk3OicDMrMk5EVhTkBSSLix0bySpXdIvezmdZZLG9GUcSccp1aZ5p6S7JE3P/c+QdEBv4jBrpI16HsVsWHgW2E3SZhHxHHAg/fhkp6TxpPvO94yIJ3PVImMBIuJL/RWHWS0+I7BmciXwnvy5+unibSVdno/Wb5S0R+6/naTfKLUZ8SPSw1uV7xwl6Waleup/WHlCtgvbA08DzwBExDORaq1E0nmSPiCpVR313i+UFHn4zpKuypX8/UHSrg1cJ2ZOBNZU5pGqBdgU2IOO2l4BvgLcFhF7AF8gVXsM8GXghoh4PalepYkAkl4H/BOwb0S8EXgJmNHNvO8AHgHul3SupMOqR4iItoh4Y57eVcC386A5wMkR8Wbgc8B/9X7RzbrmoiFrGhFxp1KV30eSzg6K9iNVd0BE/DafCWxJaqzkH3L/X0lak8d/F/BmYEGqCofN6Kbys4h4Kde3s1f+7r9LenNEnF49rqR/IjWQclAuQvo/wCXqaERsk94tuVn3nAis2cwnHWnvT6qzpq8EnB8Rp9X7hUj1udwM3CzpGuBcUgtYHROVdsv93p6TxwhSPfZvfAWxmnXLRUPWbM4BvhIRC6v6/4FctCNpf2B1pPYhrgc+lPsfQmryEVKlZx+QtH0etq2klq5mKmlHFdoSJtVvv7xqnK1J1y2Ojoh2gBzD/ZI+mMeRpDf0eqnNuuEzAmsqEbES+H6NQacD50i6E1hLRzXHXwH+R9Ii4E/AA3k6iyV9kdRS2whSjZWfomrnXjAK+LakHYHngXbgxKpxppPaUD67UgyUzwRmAP+d5zeKdK3jjt4tuVnXXPuomVmTc9GQmVmTcyIwM2tyTgRmZk3OicDMrMk5EZiZNTknAjOzJudEYGbW5P4/ixw++7SSv5kAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "##\n", "## El siguiente fragmento de código permite evaluar\n", "## el tagger para diferentes cantidades de palabras\n", "##\n", "def display():\n", " def performance(cfd, wordlist):\n", " lt = dict((word, cfd[word].max()) for word in wordlist)\n", " baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger(\"NN\"))\n", " return baseline_tagger.evaluate(brown.tagged_sents(categories=\"news\"))\n", "\n", " import pylab\n", "\n", " word_freqs = nltk.FreqDist(brown.words(categories=\"news\")).most_common()\n", " words_by_freq = [w for (w, _) in word_freqs]\n", " cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories=\"news\"))\n", " sizes = 2 ** pylab.arange(15)\n", " perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]\n", " pylab.plot(sizes, perfs, \"-bo\")\n", " pylab.title(\"Lookup Tagger Performance with Varying Model Size\")\n", " pylab.xlabel(\"Model Size\")\n", " pylab.ylabel(\"Performance\")\n", " pylab.show()\n", "\n", "\n", "display()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Various', 'JJ'),\n", " ('of', 'IN'),\n", " ('the', 'AT'),\n", " ('apartments', 'NNS'),\n", " ('are', 'BER'),\n", " ('of', 'IN'),\n", " ('the', 'AT'),\n", " ('terrace', 'NN'),\n", " ('type', 'NN'),\n", " (',', ','),\n", " ('being', 'BEG'),\n", " ('on', 'IN'),\n", " ('the', 'AT'),\n", " ('ground', 'NN'),\n", " ('floor', 'NN'),\n", " ('so', 'QL'),\n", " ('that', 'CS'),\n", " ('entrance', 'NN'),\n", " ('is', 'BEZ'),\n", " ('direct', 'JJ'),\n", " ('.', '.')]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Unigram tagging\n", "## Se basa en asignar el tag que es más frecuente para\n", "## la palabra analizada. Este tagger puede ser\n", "## *entrenado*\n", "##\n", "from nltk.corpus import brown\n", "\n", "## Obtiene las palabras y sus tags\n", "brown_tagged_sents = brown.tagged_sents(categories=\"news\")\n", "\n", "## obtiene las sentencias\n", "brown_sents = brown.sents(categories=\"news\")\n", "\n", "## El constructor recibe las sentencias taggeadas\n", "unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)\n", "\n", "##\n", "unigram_tagger.tag(brown_sents[2007])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9349006503968017" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unigram_tagger.evaluate(brown_tagged_sents)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4160" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Creación de conjuntos de entrenamiento y\n", "## validación\n", "##\n", "size = int(len(brown_tagged_sents) * 0.9)\n", "size" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8121200039868434" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Entrenamiento y validación\n", "##\n", "train_sents = brown_tagged_sents[:size]\n", "test_sents = brown_tagged_sents[size:]\n", "unigram_tagger = nltk.UnigramTagger(train_sents)\n", "unigram_tagger.evaluate(test_sents)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Various', 'JJ'),\n", " ('of', 'IN'),\n", " ('the', 'AT'),\n", " ('apartments', 'NNS'),\n", " ('are', 'BER'),\n", " ('of', 'IN'),\n", " ('the', 'AT'),\n", " ('terrace', 'NN'),\n", " ('type', 'NN'),\n", " (',', ','),\n", " ('being', 'BEG'),\n", " ('on', 'IN'),\n", " ('the', 'AT'),\n", " ('ground', 'NN'),\n", " ('floor', 'NN'),\n", " ('so', 'CS'),\n", " ('that', 'CS'),\n", " ('entrance', 'NN'),\n", " ('is', 'BEZ'),\n", " ('direct', 'JJ'),\n", " ('.', '.')]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## General N-gram tagging\n", "## Asigna el tag com base en los n-1 tags previos\n", "## teniendo en cuenta el contexto\n", "##\n", "bigram_tagger = nltk.BigramTagger(train_sents)\n", "bigram_tagger.tag(brown_sents[2007])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('The', 'AT'),\n", " ('population', 'NN'),\n", " ('of', 'IN'),\n", " ('the', 'AT'),\n", " ('Congo', 'NP'),\n", " ('is', 'BEZ'),\n", " ('13.5', None),\n", " ('million', None),\n", " (',', None),\n", " ('divided', None),\n", " ('into', None),\n", " ('at', None),\n", " ('least', None),\n", " ('seven', None),\n", " ('major', None),\n", " ('``', None),\n", " ('culture', None),\n", " ('clusters', None),\n", " (\"''\", None),\n", " ('and', None),\n", " ('innumerable', None),\n", " ('tribes', None),\n", " ('speaking', None),\n", " ('400', None),\n", " ('separate', None),\n", " ('dialects', None),\n", " ('.', None)]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## uso del tagger entrenado\n", "##\n", "unseen_sent = brown_sents[4203]\n", "bigram_tagger.tag(unseen_sent)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.10206319146815508" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Evaluación del tagger\n", "##\n", "bigram_tagger.evaluate(test_sents)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8452108043456593" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Combinación de taggers\n", "## backoff indica el tagger que se usa si el\n", "## tagger actual no encuentra un token adecuado\n", "##\n", "t0 = nltk.DefaultTagger(\"NN\")\n", "t1 = nltk.UnigramTagger(train_sents, backoff=t0)\n", "t2 = nltk.BigramTagger(train_sents, backoff=t1)\n", "t2.evaluate(test_sents)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "##\n", "## Almacenamiento de taggers usando pickle\n", "##\n", "from pickle import dump\n", "\n", "output = open(\"t2.pkl\", \"wb\")\n", "dump(t2, output, -1)\n", "output.close()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "##\n", "## Recuperación del tagger almacenado en disco\n", "##\n", "from pickle import load\n", "\n", "input = open(\"t2.pkl\", \"rb\")\n", "tagger = load(input)\n", "input.close()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('The', 'AT'),\n", " (\"board's\", 'NN$'),\n", " ('action', 'NN'),\n", " ('shows', 'NNS'),\n", " ('what', 'WDT'),\n", " ('free', 'JJ'),\n", " ('enterprise', 'NN'),\n", " ('is', 'BEZ'),\n", " ('up', 'RP'),\n", " ('against', 'IN'),\n", " ('in', 'IN'),\n", " ('our', 'PP$'),\n", " ('complex', 'JJ'),\n", " ('maze', 'NN'),\n", " ('of', 'IN'),\n", " ('regulatory', 'NN'),\n", " ('laws', 'NNS'),\n", " ('.', '.')]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Uso del tagger recuperado\n", "##\n", "text = \"\"\"The board's action shows what free enterprise\n", " is up against in our complex maze of regulatory laws .\"\"\"\n", "tokens = text.split()\n", "tagger.tag(tokens)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }