Part-of-Speech (POS) Tagging / Categorización Léxica

  • 30 min | Última modificación: Diciembre 1, 2020

http://www.nltk.org/book/

Text Analytics with Python

TAG   Descripción      Ejemplo
------------------------------------------------------------
CC    Coordinating conjunction               and, or
CD    Cardinal number                        one, two, 3
DT    Determiner                             a, the
EX    Existential there                      there were two cars
FW    Foreign word                           hola mundo cruel
IN    Preposition/subordinating conjunction  of, in, on, that
JJ    Adjective                              quick, lazy
JJR   Adjective, comparative                 quicker, lazier
JJS   Adjective, superlative                 quickest, laziest
NN    Noun, singular or mass                 fox, dog
NNS   Noun, plural                           foxes, dogs
NNP   Noun, proper singular                  John, Alice
NNPS  Noun, proper plural                    Vikings, Indians, Germans
[1]:
import nltk

# Fetch the models and corpora used throughout this notebook:
# the perceptron POS tagger, the Brown/Treebank/Gutenberg corpora,
# the universal-tagset mapping, and the English word list.
nltk.download("averaged_perceptron_tagger")
nltk.download("brown")
nltk.download("universal_tagset")
nltk.download("treebank")
nltk.download("gutenberg")
nltk.download("words")
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[1]:
True
[2]:
##
## Para ver todos los posibles tags ejecute el siguiente código
##
#  nltk.download('tagsets')
#  nltk.help.upenn_tagset()
[3]:
##
## Obtaining the POS tags for a sample sentence
##
from nltk.tokenize import word_tokenize

# Tokenize first, then tag each token.
text = word_tokenize("And now for something completely different")
nltk.pos_tag(text)
[3]:
[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]
[4]:
##
## POS tags for a sentence where "refuse" and "permit" occur both
## as verbs and as nouns; the tagger disambiguates by context.
##
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
tagged = nltk.pos_tag(text)
tagged
[4]:
[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]
[5]:
##
## Words that appear in contexts similar to a given word
## (distributional similarity over the lower-cased Brown corpus)
##
lowered = (word.lower() for word in nltk.corpus.brown.words())
text = nltk.Text(lowered)
text.similar("woman")
man time day year car moment world house family child country boy
state job place way war girl work word
[6]:
##
## Words appearing in contexts similar to "bought"
##
text.similar("bought")
made said done put had seen found given left heard was been brought
set got that took in told felt
[7]:
##
## Words appearing in contexts similar to "over"
##
text.similar("over")
in on to of and for with from at by that into as up out down through
is all about
[8]:
##
## Words appearing in contexts similar to "the"
##
text.similar("the")
a his this their its her an that our any all one these my in your no
some other and
[9]:
##
## Example --- POS tagging with spaCy (left commented out: it needs
## the spaCy package and the en_core_web_sm model installed)
##
sentence = "US unveils world's most powerful supercomputer, beats China."

#  nlp = spacy.load("en_core_web_sm", parse=True, tag=True, entity=True)

#  sentence_nlp = nlp(sentence)

# spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
#  pd.DataFrame(spacy_pos_tagged, columns=["Word", "POS tag", "Tag type"])
[10]:
##
## Example --- POS tagging with NLTK, shown as a pandas DataFrame
##
import pandas as pd

sentence = "US unveils world's most powerful supercomputer, beats China."
tokens = nltk.word_tokenize(sentence)
nltk_pos_tagged = nltk.pos_tag(tokens)
pd.DataFrame(nltk_pos_tagged, columns=["Word", "POS tag"])
[10]:
Word POS tag
0 US NNP
1 unveils JJ
2 world NN
3 's POS
4 most RBS
5 powerful JJ
6 supercomputer NN
7 , ,
8 beats VBZ
9 China NNP
10 . .
[11]:
##
## Representation of tagged tokens: str2tuple parses the
## "word/TAG" notation into a (word, tag) tuple
##
tagged_token = nltk.tag.str2tuple("fly/NN")
tagged_token
[11]:
('fly', 'NN')
[12]:
##
## Accessing the word part of the tagged token
##
tagged_token[0]
[12]:
'fly'
[13]:
##
## Accessing the tag part of the tagged token
##
tagged_token[1]
[13]:
'NN'
[14]:
##
## Representing a tagged text: every "word/TAG" token in the string
## below is parsed into a (word, tag) tuple
##
sent = """
The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC
Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS
said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB
accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT
interest/NN of/IN both/ABX governments/NNS ''/'' ./.
"""
list(map(nltk.tag.str2tuple, sent.split()))
[14]:
[('The', 'AT'),
 ('grand', 'JJ'),
 ('jury', 'NN'),
 ('commented', 'VBD'),
 ('on', 'IN'),
 ('a', 'AT'),
 ('number', 'NN'),
 ('of', 'IN'),
 ('other', 'AP'),
 ('topics', 'NNS'),
 (',', ','),
 ('AMONG', 'IN'),
 ('them', 'PPO'),
 ('the', 'AT'),
 ('Atlanta', 'NP'),
 ('and', 'CC'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('purchasing', 'VBG'),
 ('departments', 'NNS'),
 ('which', 'WDT'),
 ('it', 'PPS'),
 ('said', 'VBD'),
 ('``', '``'),
 ('ARE', 'BER'),
 ('well', 'QL'),
 ('operated', 'VBN'),
 ('and', 'CC'),
 ('follow', 'VB'),
 ('generally', 'RB'),
 ('accepted', 'VBN'),
 ('practices', 'NNS'),
 ('which', 'WDT'),
 ('inure', 'VB'),
 ('to', 'IN'),
 ('the', 'AT'),
 ('best', 'JJT'),
 ('interest', 'NN'),
 ('of', 'IN'),
 ('both', 'ABX'),
 ('governments', 'NNS'),
 ("''", "''"),
 ('.', '.')]

Tagset simplificado

ADJ    adjective             new, good, high, special, big, local
ADP    adposition            on, of, at, with, by, into, under
ADV    adverb                really, already, still, early, now
CONJ   conjunction           and, or, but, if, while, although
DET    determiner, article   the, a, some, most, every, no, which
NOUN   noun                  year, home, costs, time, Africa
NUM    numeral               twenty-four, fourth, 1991, 14:24
PRT    particle              at, on, out, over, per, that, up, with
PRON   pronoun               he, their, her, its, my, I, us
VERB   verb                  is, say, told, given, playing, would
.     punctuation marks      . , ; !
X     other                  ersatz, esprit, dunno, gr8, univeristy
[15]:
##
## Most frequent universal tags in the "news" category of Brown
##
from nltk.corpus import brown

brown_news_tagged = brown.tagged_words(categories="news", tagset="universal")
tag_fd = nltk.FreqDist(pair[1] for pair in brown_news_tagged)
tag_fd.most_common()
[15]:
[('NOUN', 30654),
 ('VERB', 14399),
 ('ADP', 12355),
 ('.', 11928),
 ('DET', 11389),
 ('ADJ', 6706),
 ('ADV', 3349),
 ('CONJ', 2717),
 ('PRON', 2535),
 ('PRT', 2264),
 ('NUM', 2166),
 ('X', 92)]
[16]:
##
## Nouns
##   Nouns refer to people, things or concepts. They can appear after
##   determiners and adjectives, and can be the subject or object of
##   a verb.
##

## Tags that immediately precede a noun, collected from corpus bigrams
noun_preceders = [
    left[1]
    for (left, right) in nltk.bigrams(brown_news_tagged)
    if right[1] == "NOUN"
]

## The preceding tags, ordered by frequency
[tag for (tag, _count) in nltk.FreqDist(noun_preceders).most_common()]
[16]:
['NOUN',
 'DET',
 'ADJ',
 'ADP',
 '.',
 'VERB',
 'CONJ',
 'NUM',
 'ADV',
 'PRT',
 'PRON',
 'X']
[17]:
##
## Verbs
##   Verbs describe actions or events. Within a sentence, verbs
##   typically express a relation involving the referents of one or
##   more noun phrases.
##

## Tagged words of the Penn Treebank sample, mapped to the universal tagset
wsj = nltk.corpus.treebank.tagged_words(tagset="universal")

## The 20 most frequent words tagged as VERB
verbs = [
    pair[0]
    for (pair, _count) in nltk.FreqDist(wsj).most_common()
    if pair[1] == "VERB"
]
verbs[:20]
[17]:
['is',
 'said',
 'was',
 'are',
 'be',
 'has',
 'have',
 'will',
 'says',
 'would',
 'were',
 'had',
 'been',
 'could',
 "'s",
 'can',
 'do',
 'say',
 'make',
 'may']
[18]:
##
## Tags no simplificados
##

## For each tag starting with the given prefix, find its five most common words
def findtags(tag_prefix, tagged_text, top_n=5):
    """Find the most common words for each tag starting with a prefix.

    Parameters
    ----------
    tag_prefix : str
        Only tags starting with this prefix (e.g. ``"NN"``) are kept.
    tagged_text : iterable of (word, tag) pairs
        The tagged corpus to analyze.
    top_n : int, optional
        How many most-frequent words to keep per tag. Defaults to 5,
        matching the previously hard-coded value.

    Returns
    -------
    dict
        Maps each matching tag to its ``top_n`` most common
        ``(word, count)`` pairs, ordered by descending count.
    """
    # Local import keeps this notebook cell self-contained.
    from collections import Counter, defaultdict

    # Count word occurrences per matching tag. This mirrors
    # nltk.ConditionalFreqDist: FreqDist subclasses Counter, so
    # most_common() ordering (including ties) is identical.
    counts = defaultdict(Counter)
    for word, tag in tagged_text:
        if tag.startswith(tag_prefix):
            counts[tag][word] += 1

    return {tag: counter.most_common(top_n) for tag, counter in counts.items()}


## Collect the tags starting with "NN" from the news category of Brown
tagdict = findtags("NN", nltk.corpus.brown.tagged_words(categories="news"))

# Show the first ten tags (alphabetical) with their most common words
for tag in sorted(tagdict)[0:10]:
    print(tag, tagdict[tag])
NN [('year', 137), ('time', 97), ('state', 88), ('week', 85), ('man', 72)]
NN$ [("year's", 13), ("world's", 8), ("state's", 7), ("nation's", 6), ("city's", 6)]
NN$-HL [("Golf's", 1), ("Navy's", 1)]
NN$-TL [("President's", 11), ("Administration's", 3), ("Army's", 3), ("League's", 3), ("University's", 3)]
NN-HL [('sp.', 2), ('problem', 2), ('Question', 2), ('cut', 2), ('party', 2)]
NN-NC [('ova', 1), ('eva', 1), ('aya', 1)]
NN-TL [('President', 88), ('House', 68), ('State', 59), ('University', 42), ('City', 41)]
NN-TL-HL [('Fort', 2), ('Mayor', 1), ('Commissioner', 1), ('City', 1), ('Oak', 1)]
NNS [('years', 101), ('members', 69), ('people', 52), ('sales', 51), ('men', 46)]
NNS$ [("children's", 7), ("women's", 5), ("men's", 3), ("janitors'", 3), ("taxpayers'", 2)]
[19]:
##
## Exploring tagged corpora
##   Analysis of the word 'often'
##

## Get the raw text of the "learned" category
brown_learned_text = brown.words(categories="learned")

## Distinct words that immediately follow "often", alphabetically
followers = {second for (first, second) in nltk.bigrams(brown_learned_text) if first == "often"}
sorted(followers)
[19]:
[',',
 '.',
 'accomplished',
 'analytically',
 'appear',
 'apt',
 'associated',
 'assuming',
 'became',
 'become',
 'been',
 'began',
 'call',
 'called',
 'carefully',
 'chose',
 'classified',
 'colorful',
 'composed',
 'contain',
 'differed',
 'difficult',
 'encountered',
 'enough',
 'equate',
 'extremely',
 'found',
 'happens',
 'have',
 'ignored',
 'in',
 'involved',
 'more',
 'needed',
 'nightly',
 'observed',
 'of',
 'on',
 'out',
 'quite',
 'represent',
 'responsible',
 'revamped',
 'seclude',
 'set',
 'shortened',
 'sing',
 'sounded',
 'stated',
 'still',
 'sung',
 'supported',
 'than',
 'to',
 'when',
 'work']
[20]:
##
## Tags used right after the word "often"
##
brown_lrnd_tagged = brown.tagged_words(categories="learned", tagset="universal")
tags = [
    second[1]
    for (first, second) in nltk.bigrams(brown_lrnd_tagged)
    if first[0] == "often"
]
nltk.FreqDist(tags).tabulate()
VERB  ADV  ADP  ADJ    .  PRT
  37    8    7    6    4    2
[21]:
##
## Análisis del caso "<Verb> to <Verb>"
##
from nltk.corpus import brown


def process(sentence):
    """Print and return every "<verb> to <verb>" pattern in a sentence.

    Parameters
    ----------
    sentence : iterable of (word, tag) pairs
        One tagged sentence. Verb tags start with "V" and the
        infinitival "to" is tagged "TO" (Penn Treebank conventions).

    Returns
    -------
    list of (str, str, str)
        The matching (verb, "to", verb) word triples in order of
        appearance. They are also printed, preserving the original
        behavior; the return value (previously None) makes the
        results usable programmatically.
    """
    tokens = list(sentence)
    matches = []
    # Slide a window of three consecutive tagged tokens — the stdlib
    # equivalent of nltk.trigrams for a concrete sequence.
    for (w1, t1), (w2, t2), (w3, t3) in zip(tokens, tokens[1:], tokens[2:]):
        if t1.startswith("V") and t2 == "TO" and t3.startswith("V"):
            print(w1, w2, w3)
            matches.append((w1, w2, w3))
    return matches


# Scan the first 100 tagged sentences of the Brown corpus
for tagged_sent in brown.tagged_sents()[:100]:
    process(tagged_sent)
combined to achieve
continue to place
serve to protect
wanted to wait
allowed to place
expected to become
expected to approve
expected to make
intends to make
seek to set
like to see
[22]:
##
## Words whose classification changes with context: list every word
## in the news category that occurs with more than three distinct tags
##
brown_news_tagged = brown.tagged_words(categories="news", tagset="universal")
data = nltk.ConditionalFreqDist(
    (word.lower(), tag) for (word, tag) in brown_news_tagged
)
for word in sorted(data.conditions()):
    if len(data[word]) > 3:
        word_tags = [tag for (tag, _count) in data[word].most_common()]
        print(word, " ".join(word_tags))
best ADJ ADV VERB NOUN
close ADV ADJ VERB NOUN
open ADJ VERB NOUN ADV
present ADJ ADV NOUN VERB
that ADP DET PRON ADV
[23]:
##
## Using defaultdict: map every word outside the 1000 most frequent
## words of "Alice in Wonderland" to the placeholder token "UNK"
##
from collections import defaultdict

alice = nltk.corpus.gutenberg.words("carroll-alice.txt")
vocab = nltk.FreqDist(alice)

# Identity mapping for the 1000 most common words; anything else -> "UNK"
mapping = defaultdict(lambda: "UNK")
mapping.update((word, word) for (word, _count) in vocab.most_common(1000))

alice2 = [mapping[v] for v in alice]
alice2[:50]
[23]:
['[',
 'Alice',
 "'",
 's',
 'Adventures',
 'in',
 'Wonderland',
 'by',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 'CHAPTER',
 'I',
 '.',
 'Down',
 'the',
 'Rabbit',
 '-',
 'UNK',
 'Alice',
 'was',
 'beginning',
 'to',
 'get',
 'very',
 'tired',
 'of',
 'sitting',
 'by',
 'her',
 'sister',
 'on',
 'the',
 'bank',
 ',',
 'and',
 'of',
 'having',
 'nothing',
 'to',
 'do',
 ':',
 'once',
 'or',
 'twice',
 'she',
 'had',
 'peeped',
 'into']
[24]:
# Size of the reduced vocabulary: the 1000 kept words plus "UNK"
len(set(alice2))
[24]:
1001
[25]:
##
## Counting occurrences per tag type
##
from collections import defaultdict
from nltk.corpus import brown

counts = defaultdict(int)
for _word, tag in brown.tagged_words(categories="news", tagset="universal"):
    counts[tag] += 1

counts["NOUN"]
[25]:
30654
[26]:
# The tag types seen, alphabetically (sorting a dict iterates its keys)
sorted(counts)
[26]:
['.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X']
[27]:
##
## Counts sorted by frequency, most common first
##
from operator import itemgetter

tag_counts = sorted(counts.items(), key=itemgetter(1), reverse=True)
tag_counts
[27]:
[('NOUN', 30654),
 ('VERB', 14399),
 ('ADP', 12355),
 ('.', 11928),
 ('DET', 11389),
 ('ADJ', 6706),
 ('ADV', 3349),
 ('CONJ', 2717),
 ('PRON', 2535),
 ('PRT', 2264),
 ('NUM', 2166),
 ('X', 92)]
[28]:
##
## Extracting only the tags, ordered by descending count
##
list(map(itemgetter(0), sorted(counts.items(), key=itemgetter(1), reverse=True)))
[28]:
['NOUN',
 'VERB',
 'ADP',
 '.',
 'DET',
 'ADJ',
 'ADV',
 'CONJ',
 'PRON',
 'PRT',
 'NUM',
 'X']
[29]:
##
## Example: indexing words by their last two letters
##
last_letters = defaultdict(list)
words = nltk.corpus.words.words("en")
for word in words:
    last_letters[word[-2:]].append(word)

last_letters["ly"][:15]
[29]:
['abactinally',
 'abandonedly',
 'abasedly',
 'abashedly',
 'abashlessly',
 'abbreviately',
 'abdominally',
 'abhorrently',
 'abidingly',
 'abiogenetically',
 'abiologically',
 'abjectly',
 'ableptically',
 'ably',
 'abnormally']
[30]:
last_letters["zy"][:15]
[30]:
['blazy',
 'bleezy',
 'blowzy',
 'boozy',
 'breezy',
 'bronzy',
 'buzzy',
 'Chazy',
 'cozy',
 'crazy',
 'dazy',
 'dizzy',
 'dozy',
 'enfrenzy',
 'fezzy']
[31]:
##
## Building an anagram dictionary: words sharing the same sorted
## letters are anagrams of each other
##
anagrams = defaultdict(list)
for word in words:
    signature = "".join(sorted(word))
    anagrams[signature].append(word)

anagrams["aeilnrt"]
[31]:
['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']
[32]:
##
## NLTK's equivalent of defaultdict(list) is nltk.Index(), which
## consumes (key, value) pairs
##
anagrams = nltk.Index([("".join(sorted(w)), w) for w in words])
anagrams["aeilnrt"]
[32]:
['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']
[33]:
##
## Handling complex keys and values: for each (previous tag, word)
## pair, count which tag the word receives
##
pos = defaultdict(lambda: defaultdict(int))
brown_news_tagged = brown.tagged_words(categories="news", tagset="universal")
for prev, cur in nltk.bigrams(brown_news_tagged):
    pos[(prev[1], cur[0])][cur[1]] += 1

pos[("DET", "right")]
[33]:
defaultdict(int, {'NOUN': 5, 'ADJ': 11})
[34]:
##
## Inverting a dictionary: find all words that occur exactly 32 times
##
counts = defaultdict(int)
for word in nltk.corpus.gutenberg.words("milton-paradise.txt"):
    counts[word] += 1

[word for (word, count) in counts.items() if count == 32]
[34]:
['mortal',
 'Against',
 'Him',
 'There',
 'brought',
 'King',
 'virtue',
 'every',
 'been',
 'thine']
[35]:
##
## Direct inversion of a dictionary (only safe when values are unique)
##
pos = {"colorless": "ADJ", "ideas": "N", "sleep": "V", "furiously": "ADV"}
pos2 = {value: key for (key, value) in pos.items()}
pos2["N"]
[35]:
'ideas'
[36]:
# With duplicate values, invert into a dict of lists instead
pos.update({"cats": "N", "scratch": "V", "peacefully": "ADV", "old": "ADJ"})
pos2 = defaultdict(list)
for original_key, original_value in pos.items():
    pos2[original_value].append(original_key)

pos2["ADV"]
[36]:
['furiously', 'peacefully']
[37]:
##
## NLTK equivalent: nltk.Index groups (value, key) pairs into a
## dict-of-lists
##
pos2 = nltk.Index([(value, key) for (key, value) in pos.items()])
pos2["ADV"]
[37]:
['furiously', 'peacefully']