{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Clasificación POS-tagging\n", "\n", "* *30 min* | Última modificación: Diciembre 9, 2020" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "http://www.nltk.org/book/\n", "\n", "Text Analytics with Python" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package brown to /root/nltk_data...\n", "[nltk_data] Package brown is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import nltk\n", "\n", "nltk.download(\"brown\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## En este ejemplo se construye un clasificador que\n", "## indica el tag de la palabra con base en sus últimas\n", "## letras\n", "##\n", "from nltk.corpus import brown\n", "\n", "##\n", "## Crea el objeto vacio\n", "##\n", "suffix_fdist = nltk.FreqDist()\n", "\n", "##\n", "## Computa la frecuencia de la última, dos últimas y\n", "## tres últimas letras de la palabra\n", "##\n", "for word in brown.words():\n", " word = word.lower()\n", " suffix_fdist[word[-1:]] += 1\n", " suffix_fdist[word[-2:]] += 1\n", " suffix_fdist[word[-3:]] += 1\n", "\n", "##\n", "## Sufijos más comunes\n", "##\n", "common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]\n", "common_suffixes[:10]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6270512182993535" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Representación usando bag-of-words\n", "##\n", "def pos_features(word):\n", " features = {}\n", " for suffix in common_suffixes:\n", " features[\"endswith({})\".format(suffix)] = word.lower().endswith(suffix)\n", " return features\n", "\n", "\n", "##\n", "## Conjunto de palabras taggeadas\n", "##\n", "tagged_words = brown.tagged_words(categories=\"news\")\n", "\n", "##\n", "## Preparación de los datos\n", "##\n", "featuresets = [(pos_features(n), g) for (n, g) in tagged_words]\n", "\n", "##\n", "## Conjuntos de entrenamiento y validación\n", "##\n", "size = int(len(featuresets) * 0.1)\n", "train_set, test_set = featuresets[size:], featuresets[:size]\n", "\n", "##\n", "## Entrenamiento y evaluación del clasificador\n", "##\n", "classifier = nltk.DecisionTreeClassifier.train(train_set)\n", "nltk.classify.accuracy(classifier, test_set)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'NNS'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Clasificación de una palabra\n", "##\n", "classifier.classify(pos_features(\"cats\"))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "if endswith(the) == False: \n", " if endswith(,) == False: \n", " if endswith(s) == False: \n", " if endswith(.) == False: return '.'\n", " if endswith(.) 
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Extended feature set: the previous word is\n", "## also taken into account\n", "##\n", "def pos_features(sentence, i):\n", "    features = {\n", "        \"suffix(1)\": sentence[i][-1:],\n", "        \"suffix(2)\": sentence[i][-2:],\n", "        \"suffix(3)\": sentence[i][-3:],\n", "    }\n", "    if i == 0:\n", "        features[\"prev-word\"] = \"\"\n", "    else:\n", "        features[\"prev-word\"] = sentence[i - 1]\n", "    return features\n", "\n", "\n", "pos_features(brown.sents()[0], 8)" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.7891596220785678" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Load the sentences\n", "##\n", "tagged_sents = brown.tagged_sents(categories=\"news\")\n", "\n", "##\n", "## Feature extraction\n", "##\n", "featuresets = []\n", "for tagged_sent in tagged_sents:\n", "    untagged_sent = nltk.tag.untag(tagged_sent)\n", "    for i, (word, tag) in enumerate(tagged_sent):\n", "        featuresets.append((pos_features(untagged_sent, i), tag))\n", "\n", "##\n", "## Training and test sets\n", "##\n", "size = int(len(featuresets) * 0.1)\n", "train_set, test_set = featuresets[size:], featuresets[:size]\n", "\n", "##\n", "## Train and evaluate the classifier\n", "##\n", "classifier = nltk.NaiveBayesClassifier.train(train_set)\n", "nltk.classify.accuracy(classifier, test_set)" ] },
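{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "##\n", "## A quick inspection (sketch): show_most_informative_features() is a\n", "## standard method of nltk.NaiveBayesClassifier; it lists the features\n", "## whose values most strongly separate one tag from the others.\n", "##\n", "classifier.show_most_informative_features(10)" ] },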
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.7980528511821975\n" ] } ], "source": [ "##\n", "## Sequence classification: the previous tag (history)\n", "## is added to the feature set\n", "##\n", "def pos_features(sentence, i, history):\n", "    features = {\n", "        \"suffix(1)\": sentence[i][-1:],\n", "        \"suffix(2)\": sentence[i][-2:],\n", "        \"suffix(3)\": sentence[i][-3:],\n", "    }\n", "    if i == 0:\n", "        features[\"prev-word\"] = \"\"\n", "        features[\"prev-tag\"] = \"\"\n", "    else:\n", "        features[\"prev-word\"] = sentence[i - 1]\n", "        features[\"prev-tag\"] = history[i - 1]\n", "    return features\n", "\n", "\n", "class ConsecutivePosTagger(nltk.TaggerI):\n", "    ##\n", "    ## Constructor:\n", "    ## builds the training set and trains the classifier\n", "    ##\n", "    def __init__(self, train_sents):\n", "        train_set = []\n", "        for tagged_sent in train_sents:\n", "            untagged_sent = nltk.tag.untag(tagged_sent)\n", "            history = []\n", "            for i, (word, tag) in enumerate(tagged_sent):\n", "                featureset = pos_features(untagged_sent, i, history)\n", "                train_set.append((featureset, tag))\n", "                history.append(tag)\n", "        self.classifier = nltk.NaiveBayesClassifier.train(train_set)\n", "\n", "    ##\n", "    ## Tags a single sentence\n", "    ##\n", "    def tag(self, sentence):\n", "        history = []\n", "        for i, word in enumerate(sentence):\n", "            featureset = pos_features(sentence, i, history)\n", "            tag = self.classifier.classify(featureset)\n", "            history.append(tag)\n", "        # return the (word, tag) pairs as a list\n", "        return list(zip(sentence, history))\n", "\n", "\n", "##\n", "## Load the data\n", "##\n", "tagged_sents = brown.tagged_sents(categories=\"news\")\n", "\n", "##\n", "## Training and test sets\n", "##\n", "size = int(len(tagged_sents) * 0.1)\n", "train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]\n", "\n", "##\n", "## Train and evaluate the tagger\n", "##\n", "tagger = ConsecutivePosTagger(train_sents)\n", "print(tagger.evaluate(test_sents))" ] }
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }