Clasificación POS-tagging

  • 30 min | Última modificación: Diciembre 9, 2020

http://www.nltk.org/book/

Text Analytics with Python

[1]:
import nltk

# Download the Brown corpus (POS-tagged) used throughout this notebook.
# No-op if the corpus is already present in the local nltk_data folder.
nltk.download("brown")
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[1]:
True
[2]:
##
## This example builds a classifier that predicts the
## POS tag of a word from its final letters.
##
from nltk.corpus import brown

##
## Empty frequency distribution
##
suffix_fdist = nltk.FreqDist()

##
## Count the frequency of the last one, two and three
## letters of every (lowercased) word in the corpus
##
for token in brown.words():
    token = token.lower()
    for n in (1, 2, 3):
        suffix_fdist[token[-n:]] += 1

##
## The 100 most common suffixes
##
common_suffixes = [s for s, _ in suffix_fdist.most_common(100)]
common_suffixes[:10]
[2]:
['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of']
[3]:
##
## Bag-of-words style representation: one boolean
## feature per common suffix of the word
##
def pos_features(word):
    lowered = word.lower()
    return {
        "endswith({})".format(suffix): lowered.endswith(suffix)
        for suffix in common_suffixes
    }


##
## Tagged words from the "news" category
##
tagged_words = brown.tagged_words(categories="news")

##
## Data preparation: one (features, tag) pair per word
##
featuresets = [(pos_features(word, ), tag) for (word, tag) in tagged_words]

##
## Train / validation split: the first 10% of the
## examples is held out for testing
##
size = int(len(featuresets) * 0.1)
train_set = featuresets[size:]
test_set = featuresets[:size]

##
## Train the classifier and report its accuracy
##
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
[3]:
0.6270512182993535
[4]:
##
## Classify a single word (predict its POS tag)
##
classifier.classify(pos_features("cats"))
[4]:
'NNS'
[5]:
##
## Equivalent decision rules: print the first 4 levels
## of the learned decision tree as if/else pseudocode
##
print(classifier.pseudocode(depth=4))
if endswith(the) == False:
  if endswith(,) == False:
    if endswith(s) == False:
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True:
      if endswith(is) == False: return 'PP$'
      if endswith(is) == True: return 'BEZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'

[6]:
##
## Extended feature set: besides the word suffixes it
## also takes the previous word in the sentence into
## account ("<START>" for the first word)
##
def pos_features(sentence, i):
    word = sentence[i]
    features = {
        "suffix(1)": word[-1:],
        "suffix(2)": word[-2:],
        "suffix(3)": word[-3:],
        "prev-word": "<START>" if i == 0 else sentence[i - 1],
    }
    return features


## Example: features of the 9th word of the first Brown sentence
pos_features(brown.sents()[0], 8)
[6]:
{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}
[7]:
##
## Load the tagged sentences
##
tagged_sents = brown.tagged_sents(categories="news")

##
## Feature extraction: the untagged sentence provides
## the word context, the tagged one the target labels
##
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (_, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent, i), tag))

##
## Train / validation split (first 10% held out)
##
size = int(len(featuresets) * 0.1)
train_set = featuresets[size:]
test_set = featuresets[:size]

##
## Train the classifier and report its accuracy
##
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
[7]:
0.7891596220785678
[8]:
##
## Sequence classification: the tag of the previous
## word (already predicted, taken from `history`) is
## also used as a feature
##
def pos_features(sentence, i, history):
    word = sentence[i]
    if i == 0:
        prev_word = prev_tag = "<START>"
    else:
        prev_word = sentence[i - 1]
        prev_tag = history[i - 1]
    return {
        "suffix(1)": word[-1:],
        "suffix(2)": word[-2:],
        "suffix(3)": word[-3:],
        "prev-word": prev_word,
        "prev-tag": prev_tag,
    }


class ConsecutivePosTagger(nltk.TaggerI):
    """POS tagger that classifies words left-to-right, feeding each
    predicted tag back (via ``history``) as a feature for the next word.
    """

    def __init__(self, train_sents):
        """Build the feature set from tagged sentences and train a
        Naive Bayes classifier on it.

        train_sents: iterable of tagged sentences ([(word, tag), ...]).
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                # During training the *gold* tag (not a prediction)
                # is appended to the history.
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        """Tag one sentence (a list of words); return [(word, tag), ...].

        Fix: return a list instead of a bare ``zip`` object — a zip
        iterator is exhausted after a single pass, so callers that
        iterate the result more than once (e.g. print it and then
        score it) would silently see an empty sequence.
        """
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return list(zip(sentence, history))


##
## Load the data
##
tagged_sents = brown.tagged_sents(categories="news")

##
## Train / validation split: the first 10% of the
## sentences is held out for evaluation
##
size = int(len(tagged_sents) * 0.1)
train_sents = tagged_sents[size:]
test_sents = tagged_sents[:size]

##
## Train the tagger and report its accuracy
##
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))
0.7980528511821975