Clasificación del género de nombres
30 min | Última modificación: Diciembre 9, 2020
Text Analytics with Python
[1]:
##
## Download the NLTK "names" corpus that the cells below read
##
import nltk
nltk.download("names")
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data] Package names is already up-to-date!
[1]:
True
[2]:
##
## Gender classification of names
##
def gender_features(word):
    """Return the feature dictionary used to classify *word* by gender.

    The only feature is the final character of the name, stored under
    the ``"last_letter"`` key.

    Args:
        word: The name to extract features from.

    Returns:
        A dict with the single key ``"last_letter"``. For an empty
        string the value is ``""``.
    """
    # Slice instead of index so an empty name yields "" rather than
    # raising IndexError.
    return {"last_letter": word[-1:]}


gender_features("Shrek")
[2]:
{'last_letter': 'k'}
[3]:
##
## Load the male and female name lists from the NLTK corpus and
## pair every name with its gender label (male names first)
##
import random

from nltk.corpus import names

labeled_names = [
    (name, label)
    for (label, fileid) in (("male", "male.txt"), ("female", "female.txt"))
    for name in names.words(fileid)
]

##
## Shuffle the labelled names in place
##
random.shuffle(labeled_names)

##
## Turn every name into a (features, gender) pair
##
featuresets = [(gender_features(name), label) for (name, label) in labeled_names]

##
## The first 500 pairs are held out for testing; the remainder is
## used for training
##
test_set = featuresets[:500]
train_set = featuresets[500:]

##
## Naive Bayes classifier
##
classifier = nltk.NaiveBayesClassifier.train(train_set)
[4]:
##
## Use the trained classifier on a single name
##
classifier.classify(gender_features("Neo"))
[4]:
'male'
[5]:
##
## Use the trained classifier on another name
##
classifier.classify(gender_features("Trinity"))
[5]:
'female'
[6]:
##
## Accuracy of the classifier on the held-out test set
##
nltk.classify.accuracy(classifier, test_set)
[6]:
0.758
[7]:
##
## Show the 10 most informative features of the classifier
##
classifier.show_most_informative_features(10)
Most Informative Features
last_letter = 'k' male : female = 43.2 : 1.0
last_letter = 'a' female : male = 33.3 : 1.0
last_letter = 'f' male : female = 14.6 : 1.0
last_letter = 'p' male : female = 12.6 : 1.0
last_letter = 'd' male : female = 10.2 : 1.0
last_letter = 'v' male : female = 9.9 : 1.0
last_letter = 'o' male : female = 8.8 : 1.0
last_letter = 'm' male : female = 8.6 : 1.0
last_letter = 'r' male : female = 6.7 : 1.0
last_letter = 'w' male : female = 6.6 : 1.0
[8]:
##
## Choosing richer features
## Returns a dictionary with the first and last letter of the name
## plus, for every letter a-z, how many times it occurs in the name
## and whether it occurs at all
##
def gender_features2(name):
    """Return a per-letter feature dictionary for *name*.

    All features are computed on the lower-cased name.

    Args:
        name: The name to extract features from.

    Returns:
        A dict containing, in this order:

        * ``"first_letter"`` / ``"last_letter"``: the lower-cased first
          and last character (``""`` for an empty name);
        * for each letter ``x`` in a-z, ``"count(x)"`` (number of
          occurrences, an int) and ``"has(x)"`` (a bool).
    """
    # Lower-case once instead of on every loop iteration.
    lowered = name.lower()
    features = {
        # Slicing keeps an empty name from raising IndexError.
        "first_letter": name[:1].lower(),
        "last_letter": name[-1:].lower(),
    }
    for letter in "abcdefghijklmnopqrstuvwxyz":
        features["count({})".format(letter)] = lowered.count(letter)
        features["has({})".format(letter)] = letter in lowered
    return features


gender_features2("John")
[8]:
{'first_letter': 'j',
'last_letter': 'n',
'count(a)': 0,
'has(a)': False,
'count(b)': 0,
'has(b)': False,
'count(c)': 0,
'has(c)': False,
'count(d)': 0,
'has(d)': False,
'count(e)': 0,
'has(e)': False,
'count(f)': 0,
'has(f)': False,
'count(g)': 0,
'has(g)': False,
'count(h)': 1,
'has(h)': True,
'count(i)': 0,
'has(i)': False,
'count(j)': 1,
'has(j)': True,
'count(k)': 0,
'has(k)': False,
'count(l)': 0,
'has(l)': False,
'count(m)': 0,
'has(m)': False,
'count(n)': 1,
'has(n)': True,
'count(o)': 1,
'has(o)': True,
'count(p)': 0,
'has(p)': False,
'count(q)': 0,
'has(q)': False,
'count(r)': 0,
'has(r)': False,
'count(s)': 0,
'has(s)': False,
'count(t)': 0,
'has(t)': False,
'count(u)': 0,
'has(u)': False,
'count(v)': 0,
'has(v)': False,
'count(w)': 0,
'has(w)': False,
'count(x)': 0,
'has(x)': False,
'count(y)': 0,
'has(y)': False,
'count(z)': 0,
'has(z)': False}
[9]:
##
## Build the feature sets with the richer per-letter features
## (first/last letter plus count/has for every letter).
## gender_features2 must be used here: calling gender_features
## would simply re-evaluate the last-letter baseline.
##
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
##
## Split into training and test sets (first 500 pairs held out)
##
train_set, test_set = featuresets[500:], featuresets[:500]
##
## Train the classifier and evaluate it on the test set
##
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
[9]:
0.758
[10]:
##
## Three subsets are used so errors can be analysed on a
## development set without touching the final test set:
## test = first 500 names, devtest = next 1000, train = the rest
##
test_names, devtest_names, train_names = (
    labeled_names[:500],
    labeled_names[500:1500],
    labeled_names[1500:],
)

##
## Feature sets for each of the three subsets
##
train_set, devtest_set, test_set = (
    [(gender_features(name), label) for (name, label) in subset]
    for subset in (train_names, devtest_names, test_names)
)

##
## Train on the training set and evaluate on the development set
##
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, devtest_set)
[10]:
0.759
[11]:
##
## Error analysis: collect every development-set name the
## classifier labels incorrectly, to work out (if possible) why
## it fails. For example, the last two or three letters of a
## name might be a better indicator of gender.
##
errors = [
    (tag, guess, name)
    for (tag, guess, name) in (
        (label, classifier.classify(gender_features(candidate)), candidate)
        for (candidate, label) in devtest_names
    )
    if guess != tag
]

# Print one line per mistake, sorted by (correct label, guess, name).
for mistake in sorted(errors):
    print("correct={:<8} guess={:<8s} name={:<30}".format(*mistake))
correct=female guess=male name=Abigael
correct=female guess=male name=Aidan
correct=female guess=male name=Aileen
correct=female guess=male name=Alexis
correct=female guess=male name=Amabel
correct=female guess=male name=Angil
correct=female guess=male name=Ann
correct=female guess=male name=Avis
correct=female guess=male name=Bliss
correct=female guess=male name=Bo
correct=female guess=male name=Bren
correct=female guess=male name=Brittan
correct=female guess=male name=Caitrin
correct=female guess=male name=Camel
correct=female guess=male name=Caril
correct=female guess=male name=Carilyn
correct=female guess=male name=Carleen
correct=female guess=male name=Carlin
correct=female guess=male name=Carmel
correct=female guess=male name=Carmen
correct=female guess=male name=Carolann
correct=female guess=male name=Caryn
correct=female guess=male name=Catherin
correct=female guess=male name=Cathrin
correct=female guess=male name=Ceil
correct=female guess=male name=Celestyn
correct=female guess=male name=Clovis
correct=female guess=male name=Colleen
correct=female guess=male name=Dagmar
correct=female guess=male name=Del
correct=female guess=male name=Devon
correct=female guess=male name=Dido
correct=female guess=male name=Donnajean
correct=female guess=male name=Doris
correct=female guess=male name=Dyann
correct=female guess=male name=Eden
correct=female guess=male name=Elizabet
correct=female guess=male name=Em
correct=female guess=male name=Estell
correct=female guess=male name=Ethyl
correct=female guess=male name=Fan
correct=female guess=male name=Farand
correct=female guess=male name=Fawn
correct=female guess=male name=Flower
correct=female guess=male name=Garnet
correct=female guess=male name=Gaynor
correct=female guess=male name=Gillian
correct=female guess=male name=Ginger
correct=female guess=male name=Grethel
correct=female guess=male name=Grissel
correct=female guess=male name=Hannis
correct=female guess=male name=Harriett
correct=female guess=male name=Imojean
correct=female guess=male name=Ingaborg
correct=female guess=male name=Isador
correct=female guess=male name=Jackelyn
correct=female guess=male name=Janel
correct=female guess=male name=Jaynell
correct=female guess=male name=Jazmin
correct=female guess=male name=Jerrilyn
correct=female guess=male name=Jewell
correct=female guess=male name=Joan
correct=female guess=male name=Joannes
correct=female guess=male name=Jojo
correct=female guess=male name=Jolyn
correct=female guess=male name=Jolynn
correct=female guess=male name=Jordan
correct=female guess=male name=Kaitlyn
correct=female guess=male name=Kaitlynn
correct=female guess=male name=Karon
correct=female guess=male name=Karylin
correct=female guess=male name=Katheryn
correct=female guess=male name=Kathlin
correct=female guess=male name=Keriann
correct=female guess=male name=Kerstin
correct=female guess=male name=Kimberlyn
correct=female guess=male name=Kristien
correct=female guess=male name=Kristyn
correct=female guess=male name=Laureen
correct=female guess=male name=Leanor
correct=female guess=male name=Lillian
correct=female guess=male name=Lou
correct=female guess=male name=Lurleen
correct=female guess=male name=Lyndell
correct=female guess=male name=Lynnet
correct=female guess=male name=Madel
correct=female guess=male name=Mair
correct=female guess=male name=Maribel
correct=female guess=male name=Marlo
correct=female guess=male name=Maryl
correct=female guess=male name=Meridel
correct=female guess=male name=Michell
correct=female guess=male name=Mildred
correct=female guess=male name=Millisent
correct=female guess=male name=Mirabel
correct=female guess=male name=Morgen
correct=female guess=male name=Muffin
correct=female guess=male name=Nitin
correct=female guess=male name=Noel
correct=female guess=male name=Noelyn
correct=female guess=male name=Olwen
correct=female guess=male name=Peg
correct=female guess=male name=Perl
correct=female guess=male name=Phil
correct=female guess=male name=Renell
correct=female guess=male name=Rhiamon
correct=female guess=male name=Robbyn
correct=female guess=male name=Rosamund
correct=female guess=male name=Shannen
correct=female guess=male name=Sharleen
correct=female guess=male name=Sharron
correct=female guess=male name=Sherilyn
correct=female guess=male name=Shirleen
correct=female guess=male name=Sigrid
correct=female guess=male name=Sybil
correct=female guess=male name=Venus
correct=female guess=male name=Veradis
correct=female guess=male name=Vin
correct=female guess=male name=Yoshiko
correct=male guess=female name=Alfie
correct=male guess=female name=Alphonse
correct=male guess=female name=Andy
correct=male guess=female name=Archie
correct=male guess=female name=Ari
correct=male guess=female name=Aubrey
correct=male guess=female name=Augie
correct=male guess=female name=Bailey
correct=male guess=female name=Bartolomei
correct=male guess=female name=Bary
correct=male guess=female name=Bradly
correct=male guess=female name=Brady
correct=male guess=female name=Brice
correct=male guess=female name=Bruce
correct=male guess=female name=Burnaby
correct=male guess=female name=Christie
correct=male guess=female name=Christophe
correct=male guess=female name=Cobbie
correct=male guess=female name=Cobby
correct=male guess=female name=Dana
correct=male guess=female name=Darcy
correct=male guess=female name=Dimitri
correct=male guess=female name=Dudley
correct=male guess=female name=Eli
correct=male guess=female name=Elmore
correct=male guess=female name=Erich
correct=male guess=female name=Ernie
correct=male guess=female name=Felix
correct=male guess=female name=Filipe
correct=male guess=female name=Frankie
correct=male guess=female name=Freddie
correct=male guess=female name=Frederich
correct=male guess=female name=Garcia
correct=male guess=female name=Garey
correct=male guess=female name=Garry
correct=male guess=female name=Garth
correct=male guess=female name=Gayle
correct=male guess=female name=Geri
correct=male guess=female name=Giffie
correct=male guess=female name=Giffy
correct=male guess=female name=Godfrey
correct=male guess=female name=Goose
correct=male guess=female name=Guthrie
correct=male guess=female name=Hilary
correct=male guess=female name=Humphrey
correct=male guess=female name=Huntley
correct=male guess=female name=Jeffery
correct=male guess=female name=Jermaine
correct=male guess=female name=Jonah
correct=male guess=female name=Jordy
correct=male guess=female name=Jory
correct=male guess=female name=Judah
correct=male guess=female name=Julie
correct=male guess=female name=Kennedy
correct=male guess=female name=Klee
correct=male guess=female name=Leroy
correct=male guess=female name=Lindsay
correct=male guess=female name=Lindy
correct=male guess=female name=Mace
correct=male guess=female name=Maurice
correct=male guess=female name=Mendie
correct=male guess=female name=Micah
correct=male guess=female name=Mika
correct=male guess=female name=Montgomery
correct=male guess=female name=Morlee
correct=male guess=female name=Murphy
correct=male guess=female name=Nealy
correct=male guess=female name=Nichole
correct=male guess=female name=Nikki
correct=male guess=female name=Obie
correct=male guess=female name=Olle
correct=male guess=female name=Orbadiah
correct=male guess=female name=Orville
correct=male guess=female name=Ozzy
correct=male guess=female name=Parry
correct=male guess=female name=Pasquale
correct=male guess=female name=Pearce
correct=male guess=female name=Pembroke
correct=male guess=female name=Pierce
correct=male guess=female name=Ramsey
correct=male guess=female name=Rawley
correct=male guess=female name=Ricky
correct=male guess=female name=Rocky
correct=male guess=female name=Roddie
correct=male guess=female name=Rodolphe
correct=male guess=female name=Rory
correct=male guess=female name=Rudie
correct=male guess=female name=Sascha
correct=male guess=female name=Sax
correct=male guess=female name=Sherlocke
correct=male guess=female name=Sidney
correct=male guess=female name=Sparky
correct=male guess=female name=Sully
correct=male guess=female name=Sunny
correct=male guess=female name=Tammie
correct=male guess=female name=Tammy
correct=male guess=female name=Tanny
correct=male guess=female name=Thayne
correct=male guess=female name=Timmie
correct=male guess=female name=Toby
correct=male guess=female name=Tommie
correct=male guess=female name=Tommy
correct=male guess=female name=Torey
correct=male guess=female name=Torre
correct=male guess=female name=Trace
correct=male guess=female name=Tray
correct=male guess=female name=Trey
correct=male guess=female name=Uriah
correct=male guess=female name=Vasili
correct=male guess=female name=Verge
correct=male guess=female name=Vijay
correct=male guess=female name=Vinny
correct=male guess=female name=Virgie
correct=male guess=female name=Wally
correct=male guess=female name=Willey
correct=male guess=female name=Worthy
correct=male guess=female name=Wye
correct=male guess=female name=Yance
correct=male guess=female name=Zacharie
correct=male guess=female name=Zackariah
correct=male guess=female name=Zippy
correct=male guess=female name=Zollie
[12]:
##
## Suffix-based features
##
def gender_features(word):
    """Return the one- and two-character suffixes of *word* as features.

    Args:
        word: The name to extract features from.

    Returns:
        A dict with ``"suffix1"`` (last character) and ``"suffix2"``
        (last two characters). Slicing means names shorter than the
        suffix length simply yield the whole name.
    """
    features = {}
    features["suffix1"] = word[-1:]
    features["suffix2"] = word[-2:]
    return features
# Rebuild the training and development feature sets with the current
# gender_features, then retrain and score on the development set.
train_set = [(gender_features(name), label) for (name, label) in train_names]
devtest_set = [(gender_features(name), label) for (name, label) in devtest_names]

classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, devtest_set)
[12]:
0.775