# Clasificación del genero de nombres

* *30 min* | Última modificación: Diciembre 9, 2020

http://www.nltk.org/book/

Text Analytics with Python

In [1]:
import nltk

nltk.download("names")

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data] Package names is already up-to-date!


True

In [2]:
##
## Clasificación de nombres por género
##
def gender_features(word):
 return {"last_letter": word[-1]}


gender_features("Shrek")

{'last_letter': 'k'}

In [3]:
##
## Se lee una base de datos de nombres masculinos y femeninos
##
from nltk.corpus import names

labeled_names = [(name, "male") for name in names.words("male.txt")] + [
 (name, "female") for name in names.words("female.txt")
]

##
## Se mezclan los nombres aleatoriamente
##
import random

random.shuffle(labeled_names)

##
## Se asignan las letras por genero
##
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]

##
## Conjuntos de entrenamiento y validación
##
train_set, test_set = featuresets[500:], featuresets[:500]

##
## Clasificador bayesiano
##
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [4]:
##
## Uso del clasificador
##
classifier.classify(gender_features("Neo"))

'male'

In [5]:
##
## Uso del clasificador
##
classifier.classify(gender_features("Trinity"))

'female'

In [6]:
##
## Evaluación de la precisión
##
nltk.classify.accuracy(classifier, test_set)

0.758

In [7]:
##
## características más importantes
##
classifier.show_most_informative_features(10)

Most Informative Features
 last_letter = 'k' male : female = 43.2 : 1.0
 last_letter = 'a' female : male = 33.3 : 1.0
 last_letter = 'f' male : female = 14.6 : 1.0
 last_letter = 'p' male : female = 12.6 : 1.0
 last_letter = 'd' male : female = 10.2 : 1.0
 last_letter = 'v' male : female = 9.9 : 1.0
 last_letter = 'o' male : female = 8.8 : 1.0
 last_letter = 'm' male : female = 8.6 : 1.0
 last_letter = 'r' male : female = 6.7 : 1.0
 last_letter = 'w' male : female = 6.6 : 1.0


In [8]:
##
## Selección de las caracterísitcas correctas
## Retorna un conteo de las letras existentes entre
## la primera y ultima letra del nombre como un
## diccionario
##
def gender_features2(name):
 features = {}
 features["first_letter"] = name[0].lower()
 features["last_letter"] = name[-1].lower()
 for letter in "abcdefghijklmnopqrstuvwxyz":
 features["count({})".format(letter)] = name.lower().count(letter)
 features["has({})".format(letter)] = letter in name.lower()
 return features


gender_features2("John")

{'first_letter': 'j',
 'last_letter': 'n',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 0,
 'has(e)': False,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 1,
 'has(j)': True,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 1,
 'has(n)': True,
 'count(o)': 1,
 'has(o)': True,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [9]:
##
## Construye las caracteristicas usando una
## estructura bag-of-words
##
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]

##
## Separa los conjuntos de entrenamiento y validación
##
train_set, test_set = featuresets[500:], featuresets[:500]

##
## Entrenamiento y evaluación del clasificador
##
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.758

In [10]:
##
## Se usan tres sets para realizar el análisis de errores
##
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

##
## Conjuntos de entrenamiento
##
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]

##
## Entrenamiento y evaluación del clasificador
##
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, devtest_set)

0.759

In [11]:
##
## Se construye una función de análisis para determinar
## (si es posible) por que se equivoca el clasificador.
## Por ejemplo, las dos o tres últimas letras podrían
## ser un indicativo del genero
##
errors = []
for (name, tag) in devtest_names:
 guess = classifier.classify(gender_features(name))
 if guess != tag:
 errors.append((tag, guess, name))

for (tag, guess, name) in sorted(errors):
 print("correct={:<8} guess={:<8s} name={:<30}".format(tag, guess, name))

correct=female guess=male name=Abigael 
correct=female guess=male name=Aidan 
correct=female guess=male name=Aileen 
correct=female guess=male name=Alexis 
correct=female guess=male name=Amabel 
correct=female guess=male name=Angil 
correct=female guess=male name=Ann 
correct=female guess=male name=Avis 
correct=female guess=male name=Bliss 
correct=female guess=male name=Bo 
correct=female guess=male name=Bren 
correct=female guess=male name=Brittan 
correct=female guess=male name=Caitrin 
correct=female guess=male name=Camel 
correct=female guess=male name=Caril 
correct=female guess=male name=Carilyn 
correct=female guess=male name=Carleen 
correct=female guess=male name=Carlin 
correct=female guess=male name=Carmel 
correct=female guess=male name=Carmen 
correct=female guess=male name=Carolann 
correct=female guess=male name=Caryn 
correct=female guess=male name=Catherin 
correct=female guess=male name=Cathrin 
correct=female guess=male name=Ceil 
correct=female guess=male name=Cel

In [12]:
##
## Implementación de los sufijos
##
def gender_features(word):
 return {"suffix1": word[-1:], "suffix2": word[-2:]}


train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, devtest_set)

0.775