{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clasificación de documentos\n",
    "\n",
    "* *30 min* | Última modificación: Diciembre 9, 2020"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "http://www.nltk.org/book/\n",
    "\n",
    "Text Analytics with Python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package movie_reviews to /root/nltk_data...\n",
      "[nltk_data] Package movie_reviews is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import random\n",
    "\n",
    "import nltk\n",
    "from nltk.corpus import movie_reviews\n",
    "\n",
    "nltk.download(\"movie_reviews\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tunable settings, gathered in one place so a full re-run is reproducible.\n",
    "SEED = 12345  # fixes the shuffle and therefore the train/test split\n",
    "VOCABULARY_SIZE = 2000  # number of most frequent words used as features\n",
    "TEST_SIZE = 100  # number of shuffled documents held out for evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the movie-review corpus as (token list, category) pairs.\n",
    "documents = [\n",
    "    (list(movie_reviews.words(fileid)), category)\n",
    "    for category in movie_reviews.categories()\n",
    "    for fileid in movie_reviews.fileids(category)\n",
    "]\n",
    "\n",
    "# Shuffle with a dedicated, seeded generator: the split made further down\n",
    "# is then identical on every run and the global random state is untouched.\n",
    "random.Random(SEED).shuffle(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count how often each lower-cased token occurs in the whole corpus.\n",
    "all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())\n",
    "\n",
    "# Keep the VOCABULARY_SIZE most frequent words. Iterating a FreqDist\n",
    "# (list(all_words)) yields words in first-seen order, not by frequency,\n",
    "# so most_common() is required to actually rank them by count.\n",
    "word_features = [word for word, _ in all_words.most_common(VOCABULARY_SIZE)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def document_features(document):\n",
    "    \"\"\"Return the binary bag-of-words representation of one review.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    document : iterable of str\n",
    "        Tokens of the review.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Maps 'contains(<word>)' to True or False for every word in the\n",
    "        module-level ``word_features`` list.\n",
    "    \"\"\"\n",
    "    # word_features holds lower-cased words, so compare against lower-cased\n",
    "    # tokens; building a set keeps each membership test cheap.\n",
    "    document_words = {token.lower() for token in document}\n",
    "    return {\n",
    "        \"contains({})\".format(word): word in document_words\n",
    "        for word in word_features\n",
    "    }\n",
    "\n",
    "\n",
    "# Example of the representation of one review. The output is very long\n",
    "# (one entry per vocabulary word), so the call is left commented out:\n",
    "#\n",
    "# document_features(movie_reviews.words('pos/cv957_8737.txt'))\n",
    "#\n",
    "# {'contains(<word>)': True or False,\n",
    "#  ...}\n",
    "#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the feature sets, hold out the first TEST_SIZE shuffled documents\n",
    "# for testing, train on the rest and report the accuracy on the test set.\n",
    "featuresets = [(document_features(d), c) for (d, c) in documents]\n",
    "train_set, test_set = featuresets[TEST_SIZE:], featuresets[:TEST_SIZE]\n",
    "classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
    "nltk.classify.accuracy(classifier, test_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier.show_most_informative_features(5)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}