{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Part-of-Speech (POS) Tagging / Categorización Léxica\n", "\n", "* *30 min* | Última modificación: Diciembre 1, 2020" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "http://www.nltk.org/book/\n", "\n", "Text Analytics with Python" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "TAG Descripción Ejemplo\n", "------------------------------------------------------------\n", "CC Coordinating conjunction and, or\n", "CD Cardinal number one, two, 3\n", "DT Determiner a, the\n", "EX Existential there there were two cars \n", "FW Foreign word hola mundo cruel \n", "IN Preposition/subordinating conjunction of, in, on, that\n", "JJ Adjective quick, lazy\n", "JJR Adjective, comparative quicker, lazier\n", "JJS Adjective, superlative quickest, laziest\n", "NN Noun, singular or mass fox, dog\n", "NNS Noun, plural foxes, dogs\n", "NNP Noun, proper singular John, Alice \n", "NNPS Noun, proper plural Vikings, Indians, Germans\n", "\n", "```" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package averaged_perceptron_tagger to\n", "[nltk_data] /root/nltk_data...\n", "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n", "[nltk_data] date!\n", "[nltk_data] Downloading package brown to /root/nltk_data...\n", "[nltk_data] Package brown is already up-to-date!\n", "[nltk_data] Downloading package universal_tagset to /root/nltk_data...\n", "[nltk_data] Package universal_tagset is already up-to-date!\n", "[nltk_data] Downloading package treebank to /root/nltk_data...\n", "[nltk_data] Package treebank is already up-to-date!\n", "[nltk_data] Downloading package gutenberg to /root/nltk_data...\n", "[nltk_data] Package gutenberg is already up-to-date!\n", "[nltk_data] Downloading package words to /root/nltk_data...\n", "[nltk_data] Package words is already 
up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import nltk\n", "\n", "nltk.download(\"averaged_perceptron_tagger\")\n", "nltk.download(\"brown\")\n", "nltk.download(\"universal_tagset\")\n", "nltk.download(\"treebank\")\n", "nltk.download(\"gutenberg\")\n", "nltk.download(\"words\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "##\n", "## Para ver todos los posibles tags ejecute el siguiente codigo\n", "##\n", "#  nltk.download('tagsets')\n", "#  nltk.help.upenn_tagset()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('And', 'CC'),\n", " ('now', 'RB'),\n", " ('for', 'IN'),\n", " ('something', 'NN'),\n", " ('completely', 'RB'),\n", " ('different', 'JJ')]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Obtención de tags para un texto\n", "##\n", "from nltk.tokenize import word_tokenize\n", "\n", "text = word_tokenize(\"And now for something completely different\")\n", "nltk.pos_tag(text)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('They', 'PRP'),\n", " ('refuse', 'VBP'),\n", " ('to', 'TO'),\n", " ('permit', 'VB'),\n", " ('us', 'PRP'),\n", " ('to', 'TO'),\n", " ('obtain', 'VB'),\n", " ('the', 'DT'),\n", " ('refuse', 'NN'),\n", " ('permit', 'NN')]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Obtención de tags para un texto\n", "##\n", "text = word_tokenize(\"They refuse to permit us to obtain the refuse permit\")\n", "nltk.pos_tag(text)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "man time day year car moment world house family child country boy\n", "state job place way war girl work 
word\n" ] } ], "source": [ "##\n", "## Ejemplo de obtención de texto similares\n", "##\n", "text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())\n", "text.similar(\"woman\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "made said done put had seen found given left heard was been brought\n", "set got that took in told felt\n" ] } ], "source": [ "##\n", "## Ejemplo de obtención de texto similares\n", "##\n", "text.similar(\"bought\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "in on to of and for with from at by that into as up out down through\n", "is all about\n" ] } ], "source": [ "##\n", "## Ejemplo de obtención de texto similares\n", "##\n", "text.similar(\"over\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a his this their its her an that our any all one these my in your no\n", "some other and\n" ] } ], "source": [ "##\n", "## Ejemplo de obtención de texto similares\n", "##\n", "text.similar(\"the\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "##\n", "## Ejemplo --- POS tagging usando spaCy\n", "##\n", "sentence = \"US unveils world's most powerful supercomputer, beats China.\"\n", "\n", "#  nlp = spacy.load(\"en_core_web_sm\", parse=True, tag=True, entity=True)\n", "\n", "#  sentence_nlp = nlp(sentence)\n", "\n", "# spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]\n", "#  pd.DataFrame(spacy_pos_tagged, columns=[\"Word\", \"POS tag\", \"Tag type\"])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
WordPOS tag
0USNNP
1unveilsJJ
2worldNN
3'sPOS
4mostRBS
5powerfulJJ
6supercomputerNN
7,,
8beatsVBZ
9ChinaNNP
10..
\n", "
" ], "text/plain": [ " Word POS tag\n", "0 US NNP\n", "1 unveils JJ\n", "2 world NN\n", "3 's POS\n", "4 most RBS\n", "5 powerful JJ\n", "6 supercomputer NN\n", "7 , ,\n", "8 beats VBZ\n", "9 China NNP\n", "10 . ." ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Ejemplo --- POS tagging usando NLTK\n", "##\n", "import pandas as pd\n", "\n", "sentence = \"US unveils world's most powerful supercomputer, beats China.\"\n", "nltk_pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))\n", "pd.DataFrame(nltk_pos_tagged, columns=[\"Word\", \"POS tag\"])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('fly', 'NN')" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Representación de tokens tageados\n", "##\n", "tagged_token = nltk.tag.str2tuple(\"fly/NN\")\n", "tagged_token" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'fly'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Obtención del token\n", "##\n", "tagged_token[0]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'NN'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Obtención del tag\n", "##\n", "tagged_token[1]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('The', 'AT'),\n", " ('grand', 'JJ'),\n", " ('jury', 'NN'),\n", " ('commented', 'VBD'),\n", " ('on', 'IN'),\n", " ('a', 'AT'),\n", " ('number', 'NN'),\n", " ('of', 'IN'),\n", " ('other', 'AP'),\n", " ('topics', 'NNS'),\n", " (',', ','),\n", " ('AMONG', 'IN'),\n", " ('them', 'PPO'),\n", " ('the', 'AT'),\n", " ('Atlanta', 'NP'),\n", " ('and', 'CC'),\n", " ('Fulton', 'NP-TL'),\n", " ('County', 
'NN-TL'),\n", " ('purchasing', 'VBG'),\n", " ('departments', 'NNS'),\n", " ('which', 'WDT'),\n", " ('it', 'PPS'),\n", " ('said', 'VBD'),\n", " ('``', '``'),\n", " ('ARE', 'BER'),\n", " ('well', 'QL'),\n", " ('operated', 'VBN'),\n", " ('and', 'CC'),\n", " ('follow', 'VB'),\n", " ('generally', 'RB'),\n", " ('accepted', 'VBN'),\n", " ('practices', 'NNS'),\n", " ('which', 'WDT'),\n", " ('inure', 'VB'),\n", " ('to', 'IN'),\n", " ('the', 'AT'),\n", " ('best', 'JJT'),\n", " ('interest', 'NN'),\n", " ('of', 'IN'),\n", " ('both', 'ABX'),\n", " ('governments', 'NNS'),\n", " (\"''\", \"''\"),\n", " ('.', '.')]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Representación de un texto taggeado\n", "##\n", "sent = \"\"\"\n", "The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN\n", "other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC\n", "Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS\n", "said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB\n", "accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT\n", "interest/NN of/IN both/ABX governments/NNS ''/'' ./.\n", "\"\"\"\n", "[nltk.tag.str2tuple(t) for t in sent.split()]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Tagset simplificado**\n", "\n", "```\n", "ADJ adjective new, good, high, special, big, local\n", "ADP adposition on, of, at, with, by, into, under\n", "ADV adverb really, already, still, early, now\n", "CONJ conjunction and, or, but, if, while, although\n", "DET determiner, article the, a, some, most, every, no, which\n", "NOUN noun year, home, costs, time, Africa\n", "NUM numeral twenty-four, fourth, 1991, 14:24\n", "PRT particle at, on, out, over per, that, up, with\n", "PRON pronoun he, their, her, its, my, I, us\n", "VERB verb is, say, told, given, playing, would\n", ". punctuation marks . 
, ; !\n", "X other ersatz, esprit, dunno, gr8, univeristy\n", "\n", "\n", "```" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('NOUN', 30654),\n", " ('VERB', 14399),\n", " ('ADP', 12355),\n", " ('.', 11928),\n", " ('DET', 11389),\n", " ('ADJ', 6706),\n", " ('ADV', 3349),\n", " ('CONJ', 2717),\n", " ('PRON', 2535),\n", " ('PRT', 2264),\n", " ('NUM', 2166),\n", " ('X', 92)]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Principales categorias usadas en la categoría news\n", "##\n", "from nltk.corpus import brown\n", "\n", "brown_news_tagged = brown.tagged_words(categories=\"news\", tagset=\"universal\")\n", "tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)\n", "tag_fd.most_common()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['NOUN',\n", " 'DET',\n", " 'ADJ',\n", " 'ADP',\n", " '.',\n", " 'VERB',\n", " 'CONJ',\n", " 'NUM',\n", " 'ADV',\n", " 'PRT',\n", " 'PRON',\n", " 'X']" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Sustantivos\n", "## Se refieren a personas, cosas o conceptos. 
Los sustantivos pueden\n", "## aparecer después de determinantes y adjetivos, y pueden ser el\n", "## sujeto u objeto del verbo.\n", "##\n", "\n", "## Selecciona los bigramas del corpus\n", "word_tag_pairs = nltk.bigrams(brown_news_tagged)\n", "\n", "## Selecciona los predecesores de los sustantivos\n", "noun_preceders = [a[1] for (a, b) in word_tag_pairs if b[1] == \"NOUN\"]\n", "\n", "## Obtiene los tags más comunes de los predecesores de los sustantivos\n", "[tag for (tag, _) in nltk.FreqDist(noun_preceders).most_common()]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['is',\n", " 'said',\n", " 'was',\n", " 'are',\n", " 'be',\n", " 'has',\n", " 'have',\n", " 'will',\n", " 'says',\n", " 'would',\n", " 'were',\n", " 'had',\n", " 'been',\n", " 'could',\n", " \"'s\",\n", " 'can',\n", " 'do',\n", " 'say',\n", " 'make',\n", " 'may']" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Verbos\n", "## Describen acciones o eventos. 
En el contexto de una sentencia, los verbos\n", "## tipicamente expresan una relación que involucra a los referentes de una\n", "## o más frases nominales\n", "##\n", "\n", "## Obtiene las palabras taggeadas\n", "wsj = nltk.corpus.treebank.tagged_words(tagset=\"universal\")\n", "\n", "[wt[0] for (wt, _) in nltk.FreqDist(wsj).most_common() if wt[1] == \"VERB\"][:20]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NN [('year', 137), ('time', 97), ('state', 88), ('week', 85), ('man', 72)]\n", "NN$ [(\"year's\", 13), (\"world's\", 8), (\"state's\", 7), (\"nation's\", 6), (\"city's\", 6)]\n", "NN$-HL [(\"Golf's\", 1), (\"Navy's\", 1)]\n", "NN$-TL [(\"President's\", 11), (\"Administration's\", 3), (\"Army's\", 3), (\"League's\", 3), (\"University's\", 3)]\n", "NN-HL [('sp.', 2), ('problem', 2), ('Question', 2), ('cut', 2), ('party', 2)]\n", "NN-NC [('ova', 1), ('eva', 1), ('aya', 1)]\n", "NN-TL [('President', 88), ('House', 68), ('State', 59), ('University', 42), ('City', 41)]\n", "NN-TL-HL [('Fort', 2), ('Mayor', 1), ('Commissioner', 1), ('City', 1), ('Oak', 1)]\n", "NNS [('years', 101), ('members', 69), ('people', 52), ('sales', 51), ('men', 46)]\n", "NNS$ [(\"children's\", 7), (\"women's\", 5), (\"men's\", 3), (\"janitors'\", 3), (\"taxpayers'\", 2)]\n" ] } ], "source": [ "##\n", "## Tags no simplificados\n", "##\n", "\n", "## Para cada prefijo encuentra los cinco tags más comunes\n", "def findtags(tag_prefix, tagged_text):\n", " cfd = nltk.ConditionalFreqDist(\n", " (tag, word) for (word, tag) in tagged_text if tag.startswith(tag_prefix)\n", " )\n", " return dict((tag, cfd[tag].most_common(5)) for tag in cfd.conditions())\n", "\n", "\n", "## Obtiene los tags que inician con NN\n", "tagdict = findtags(\"NN\", nltk.corpus.brown.tagged_words(categories=\"news\"))\n", "\n", "for tag in sorted(tagdict)[0:10]:\n", " print(tag, tagdict[tag])" ] }, { "cell_type": "code", 
"execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[',',\n", " '.',\n", " 'accomplished',\n", " 'analytically',\n", " 'appear',\n", " 'apt',\n", " 'associated',\n", " 'assuming',\n", " 'became',\n", " 'become',\n", " 'been',\n", " 'began',\n", " 'call',\n", " 'called',\n", " 'carefully',\n", " 'chose',\n", " 'classified',\n", " 'colorful',\n", " 'composed',\n", " 'contain',\n", " 'differed',\n", " 'difficult',\n", " 'encountered',\n", " 'enough',\n", " 'equate',\n", " 'extremely',\n", " 'found',\n", " 'happens',\n", " 'have',\n", " 'ignored',\n", " 'in',\n", " 'involved',\n", " 'more',\n", " 'needed',\n", " 'nightly',\n", " 'observed',\n", " 'of',\n", " 'on',\n", " 'out',\n", " 'quite',\n", " 'represent',\n", " 'responsible',\n", " 'revamped',\n", " 'seclude',\n", " 'set',\n", " 'shortened',\n", " 'sing',\n", " 'sounded',\n", " 'stated',\n", " 'still',\n", " 'sung',\n", " 'supported',\n", " 'than',\n", " 'to',\n", " 'when',\n", " 'work']" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Exploración de corpus taggeados\n", "## Análisis de la palabra 'often'\n", "##\n", "\n", "## Obtiene tl texto\n", "brown_learned_text = brown.words(categories=\"learned\")\n", "\n", "## Obtiene y ordena los bigramas que empiezan por often\n", "sorted(set(b for (a, b) in nltk.bigrams(brown_learned_text) if a == \"often\"))" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "VERB ADV ADP ADJ . 
PRT \n", " 37 8 7 6 4 2 \n" ] } ], "source": [ "##\n", "## Tags que se usan después de la palabra often\n", "##\n", "brown_lrnd_tagged = brown.tagged_words(categories=\"learned\", tagset=\"universal\")\n", "tags = [b[1] for (a, b) in nltk.bigrams(brown_lrnd_tagged) if a[0] == \"often\"]\n", "nltk.FreqDist(tags).tabulate()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "combined to achieve\n", "continue to place\n", "serve to protect\n", "wanted to wait\n", "allowed to place\n", "expected to become\n", "expected to approve\n", "expected to make\n", "intends to make\n", "seek to set\n", "like to see\n" ] } ], "source": [ "##\n", "## Análisis del caso \" to \"\n", "##\n", "from nltk.corpus import brown\n", "\n", "\n", "def process(sentence):\n", " for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):\n", " if t1.startswith(\"V\") and t2 == \"TO\" and t3.startswith(\"V\"):\n", " print(w1, w2, w3)\n", "\n", "\n", "for tagged_sent in brown.tagged_sents()[:100]:\n", " process(tagged_sent)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "best ADJ ADV VERB NOUN\n", "close ADV ADJ VERB NOUN\n", "open ADJ VERB NOUN ADV\n", "present ADJ ADV NOUN VERB\n", "that ADP DET PRON ADV\n" ] } ], "source": [ "##\n", "## Palabras que tienen distinta clasificación basadas en el contexto\n", "##\n", "brown_news_tagged = brown.tagged_words(categories=\"news\", tagset=\"universal\")\n", "data = nltk.ConditionalFreqDist(\n", " (word.lower(), tag) for (word, tag) in brown_news_tagged\n", ")\n", "for word in sorted(data.conditions()):\n", " if len(data[word]) > 3:\n", " tags = [tag for (tag, _) in data[word].most_common()]\n", " print(word, \" \".join(tags))" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['[',\n", " 'Alice',\n", " \"'\",\n", " 's',\n", 
" 'Adventures',\n", " 'in',\n", " 'Wonderland',\n", " 'by',\n", " 'UNK',\n", " 'UNK',\n", " 'UNK',\n", " 'UNK',\n", " 'CHAPTER',\n", " 'I',\n", " '.',\n", " 'Down',\n", " 'the',\n", " 'Rabbit',\n", " '-',\n", " 'UNK',\n", " 'Alice',\n", " 'was',\n", " 'beginning',\n", " 'to',\n", " 'get',\n", " 'very',\n", " 'tired',\n", " 'of',\n", " 'sitting',\n", " 'by',\n", " 'her',\n", " 'sister',\n", " 'on',\n", " 'the',\n", " 'bank',\n", " ',',\n", " 'and',\n", " 'of',\n", " 'having',\n", " 'nothing',\n", " 'to',\n", " 'do',\n", " ':',\n", " 'once',\n", " 'or',\n", " 'twice',\n", " 'she',\n", " 'had',\n", " 'peeped',\n", " 'into']" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Uso de defaultdict\n", "##\n", "from collections import defaultdict\n", "\n", "alice = nltk.corpus.gutenberg.words(\"carroll-alice.txt\")\n", "vocab = nltk.FreqDist(alice)\n", "v1000 = [word for (word, _) in vocab.most_common(1000)]\n", "mapping = defaultdict(lambda: \"UNK\")\n", "for v in v1000:\n", " mapping[v] = v\n", "\n", "alice2 = [mapping[v] for v in alice]\n", "alice2[:50]" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1001" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(set(alice2))" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "30654" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Conteo de la cantidad por tipo de tag\n", "##\n", "from collections import defaultdict\n", "\n", "counts = defaultdict(int)\n", "from nltk.corpus import brown\n", "\n", "for (word, tag) in brown.tagged_words(categories=\"news\", tagset=\"universal\"):\n", " counts[tag] += 1\n", "\n", "counts[\"NOUN\"]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['.',\n", " 
'ADJ',\n", " 'ADP',\n", " 'ADV',\n", " 'CONJ',\n", " 'DET',\n", " 'NOUN',\n", " 'NUM',\n", " 'PRON',\n", " 'PRT',\n", " 'VERB',\n", " 'X']" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted(counts)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('NOUN', 30654),\n", " ('VERB', 14399),\n", " ('ADP', 12355),\n", " ('.', 11928),\n", " ('DET', 11389),\n", " ('ADJ', 6706),\n", " ('ADV', 3349),\n", " ('CONJ', 2717),\n", " ('PRON', 2535),\n", " ('PRT', 2264),\n", " ('NUM', 2166),\n", " ('X', 92)]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Conteo ordenado por cantidad\n", "##\n", "from operator import itemgetter\n", "\n", "sorted(counts.items(), key=itemgetter(1), reverse=True)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['NOUN',\n", " 'VERB',\n", " 'ADP',\n", " '.',\n", " 'DET',\n", " 'ADJ',\n", " 'ADV',\n", " 'CONJ',\n", " 'PRON',\n", " 'PRT',\n", " 'NUM',\n", " 'X']" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Extracción de los tags ordenados\n", "##\n", "[t for t, _ in sorted(counts.items(), key=itemgetter(1), reverse=True)]" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['abactinally',\n", " 'abandonedly',\n", " 'abasedly',\n", " 'abashedly',\n", " 'abashlessly',\n", " 'abbreviately',\n", " 'abdominally',\n", " 'abhorrently',\n", " 'abidingly',\n", " 'abiogenetically',\n", " 'abiologically',\n", " 'abjectly',\n", " 'ableptically',\n", " 'ably',\n", " 'abnormally']" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Ejemplo de indexación por las últimas dos letras\n", "##\n", "last_letters = defaultdict(list)\n", "words = 
nltk.corpus.words.words(\"en\")\n", "for word in words:\n", " key = word[-2:]\n", " last_letters[key].append(word)\n", "\n", "last_letters[\"ly\"][:15]" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['blazy',\n", " 'bleezy',\n", " 'blowzy',\n", " 'boozy',\n", " 'breezy',\n", " 'bronzy',\n", " 'buzzy',\n", " 'Chazy',\n", " 'cozy',\n", " 'crazy',\n", " 'dazy',\n", " 'dizzy',\n", " 'dozy',\n", " 'enfrenzy',\n", " 'fezzy']" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "last_letters[\"zy\"][:15]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Creación de un diccionario de anagramas\n", "##\n", "anagrams = defaultdict(list)\n", "for word in words:\n", " key = \"\".join(sorted(word))\n", " anagrams[key].append(word)\n", "\n", "anagrams[\"aeilnrt\"]" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Código equivalente a defaultdict(list) en NLTK\n", "## con nltk.Index(). 
Note que nltk.Index() recibe una\n", "## tupla (clave, valor)\n", "##\n", "anagrams = nltk.Index((\"\".join(sorted(w)), w) for w in words)\n", "anagrams[\"aeilnrt\"]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "defaultdict(int, {'NOUN': 5, 'ADJ': 11})" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Manejo de claves complejas y valores\n", "##\n", "pos = defaultdict(lambda: defaultdict(int))\n", "brown_news_tagged = brown.tagged_words(categories=\"news\", tagset=\"universal\")\n", "for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):\n", " pos[(t1, w2)][t2] += 1\n", "\n", "pos[(\"DET\", \"right\")]" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['mortal',\n", " 'Against',\n", " 'Him',\n", " 'There',\n", " 'brought',\n", " 'King',\n", " 'virtue',\n", " 'every',\n", " 'been',\n", " 'thine']" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Inversión de un diccionario\n", "##\n", "counts = defaultdict(int)\n", "for word in nltk.corpus.gutenberg.words(\"milton-paradise.txt\"):\n", " counts[word] += 1\n", "\n", "[key for (key, value) in counts.items() if value == 32]" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'ideas'" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Inversión directa de un dicionario\n", "##\n", "pos = {\"colorless\": \"ADJ\", \"ideas\": \"N\", \"sleep\": \"V\", \"furiously\": \"ADV\"}\n", "pos2 = dict((value, key) for (key, value) in pos.items())\n", "pos2[\"N\"]" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['furiously', 'peacefully']" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], 
"source": [ "pos.update({\"cats\": \"N\", \"scratch\": \"V\", \"peacefully\": \"ADV\", \"old\": \"ADJ\"})\n", "pos2 = defaultdict(list)\n", "for key, value in pos.items():\n", " pos2[value].append(key)\n", "\n", "pos2[\"ADV\"]" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['furiously', 'peacefully']" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Código equivalente en NLTK\n", "##\n", "pos2 = nltk.Index((value, key) for (key, value) in pos.items())\n", "pos2[\"ADV\"]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }