Clasificación POS-tagging¶
30 min | Última modificación: Diciembre 9, 2020
Text Analytics with Python
[1]:
import nltk
nltk.download("brown")
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data] Package brown is already up-to-date!
[1]:
True
[2]:
##
## This example builds a classifier that predicts a word's
## POS tag from its final letters.
##
from nltk.corpus import brown

##
## Empty frequency distribution
##
suffix_fdist = nltk.FreqDist()

##
## Count the frequency of the last one, two and three
## letters of every word in the corpus
##
for token in brown.words():
    token = token.lower()
    for length in (1, 2, 3):
        suffix_fdist[token[-length:]] += 1

##
## Most common suffixes
##
common_suffixes = [suffix for suffix, _ in suffix_fdist.most_common(100)]
common_suffixes[:10]
[2]:
['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of']
[3]:
##
## Representación usando bag-of-words
##
def pos_features(word):
    """Bag-of-words style features: one boolean per common suffix."""
    lowered = word.lower()
    return {
        "endswith({})".format(suffix): lowered.endswith(suffix)
        for suffix in common_suffixes
    }
##
## Tagged word set
##
tagged_words = brown.tagged_words(categories="news")

##
## Data preparation: one (features, tag) pair per word
##
featuresets = [(pos_features(word), tag) for (word, tag) in tagged_words]

##
## Training and validation sets (first 10% held out)
##
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

##
## Train and evaluate the classifier
##
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
[3]:
0.6270512182993535
[4]:
##
## Classify a single word with the trained classifier
##
classifier.classify(pos_features("cats"))
[4]:
'NNS'
[5]:
##
## Reglas equivalentes
##
print(classifier.pseudocode(depth=4))
if endswith(the) == False:
if endswith(,) == False:
if endswith(s) == False:
if endswith(.) == False: return '.'
if endswith(.) == True: return '.'
if endswith(s) == True:
if endswith(is) == False: return 'PP$'
if endswith(is) == True: return 'BEZ'
if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'
[6]:
##
## Richer feature set: also look at the previous word
##
def pos_features(sentence, i):
    """Suffix features for sentence[i], plus the preceding word.

    The first word of a sentence gets the sentinel "<START>" as
    its previous word.
    """
    word = sentence[i]
    return {
        "suffix(1)": word[-1:],
        "suffix(2)": word[-2:],
        "suffix(3)": word[-3:],
        "prev-word": "<START>" if i == 0 else sentence[i - 1],
    }
# Example: feature dict for the 9th word of the first Brown sentence
pos_features(brown.sents()[0], 8)
[6]:
{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}
[7]:
##
## Load the tagged sentences
##
tagged_sents = brown.tagged_sents(categories="news")

##
## Feature extraction: one (features, tag) pair per token,
## computed over the untagged form of each sentence
##
featuresets = []
for tagged_sent in tagged_sents:
    words = nltk.tag.untag(tagged_sent)
    for position, (_, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(words, position), tag))

##
## Training and validation sets (first 10% held out)
##
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

##
## Train and evaluate the classifier
##
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
[7]:
0.7891596220785678
[8]:
##
## Sequence classification: features now include the tag
## assigned to the previous word
##
def pos_features(sentence, i, history):
    """Suffix features for sentence[i] plus previous word and tag.

    `history` holds the tags already assigned to sentence[:i];
    position 0 uses the sentinel "<START>" for both.
    """
    word = sentence[i]
    features = {
        "suffix(1)": word[-1:],
        "suffix(2)": word[-2:],
        "suffix(3)": word[-3:],
    }
    at_start = i == 0
    features["prev-word"] = "<START>" if at_start else sentence[i - 1]
    features["prev-tag"] = "<START>" if at_start else history[i - 1]
    return features
class ConsecutivePosTagger(nltk.TaggerI):
    """Greedy left-to-right POS tagger.

    Each word is classified with suffix features plus the previous
    word and the tag already assigned to it (see pos_features).
    """

    ##
    ## Constructor: builds the training set and fits the classifier
    ##
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                # During training, history is the gold tags seen so far.
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    ##
    ## Tags a sentence (list of words) left to right
    ##
    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        # Return a list, not a bare zip iterator: zip objects are
        # single-use in Python 3, and TaggerI callers expect a list
        # of (token, tag) pairs they can consume more than once.
        return list(zip(sentence, history))
##
## Load the data
##
tagged_sents = brown.tagged_sents(categories="news")

##
## Hold out the first 10% of the sentences for validation
##
size = int(len(tagged_sents) * 0.1)
test_sents = tagged_sents[:size]
train_sents = tagged_sents[size:]

##
## Train the sequence tagger and report its accuracy
##
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))
0.7980528511821975