Clasificación del género de nombres

  • 30 min | Última modificación: Diciembre 9, 2020

http://www.nltk.org/book/

Text Analytics with Python

[1]:
import nltk

# Make sure the NLTK "names" corpus is available locally; it is read
# later through nltk.corpus.names.
nltk.download("names")
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!
[1]:
True
[2]:
##
## Name classification by gender
##
def gender_features(word):
    """Build the feature dictionary for a name from its final letter.

    Args:
        word: The name to describe. An empty string is accepted.

    Returns:
        A dict with the single key "last_letter" mapped to the last
        character of ``word``, or to "" when ``word`` is empty.
    """
    # Slicing (rather than indexing with -1) keeps an empty name from
    # raising IndexError.
    return {"last_letter": word[-1:]}


gender_features("Shrek")
[2]:
{'last_letter': 'k'}
[3]:
##
## Read the corpus of male and female names and label each one
##
from nltk.corpus import names

labeled_names = [(name, "male") for name in names.words("male.txt")] + [
    (name, "female") for name in names.words("female.txt")
]

##
## Shuffle the names. The generator is seeded first so that the split
## below, and every accuracy computed from it, is the same on each
## Restart & Run All.
##
import random

random.seed(0)
random.shuffle(labeled_names)

##
## Turn every name into a (features, gender) pair
##
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]

##
## Training and validation sets: the first 500 shuffled examples are
## held out for testing, the remaining ones are used for training
##
train_set, test_set = featuresets[500:], featuresets[:500]

##
## Naive Bayes classifier
##
classifier = nltk.NaiveBayesClassifier.train(train_set)
[4]:
##
## Using the classifier: predict the gender label for the name "Neo"
##
classifier.classify(gender_features("Neo"))
[4]:
'male'
[5]:
##
## Using the classifier: predict the gender label for the name "Trinity"
##
classifier.classify(gender_features("Trinity"))
[5]:
'female'
[6]:
##
## Accuracy of the classifier on the held-out test set
##
nltk.classify.accuracy(classifier, test_set)
[6]:
0.758
[7]:
##
## Most informative features (the 10 highest-ranked ones are printed)
##
classifier.show_most_informative_features(10)
Most Informative Features
             last_letter = 'k'              male : female =     43.2 : 1.0
             last_letter = 'a'            female : male   =     33.3 : 1.0
             last_letter = 'f'              male : female =     14.6 : 1.0
             last_letter = 'p'              male : female =     12.6 : 1.0
             last_letter = 'd'              male : female =     10.2 : 1.0
             last_letter = 'v'              male : female =      9.9 : 1.0
             last_letter = 'o'              male : female =      8.8 : 1.0
             last_letter = 'm'              male : female =      8.6 : 1.0
             last_letter = 'r'              male : female =      6.7 : 1.0
             last_letter = 'w'              male : female =      6.6 : 1.0
[8]:
##
## Choosing the right features
##   Returns a dictionary with the first and last letter of the
##   name plus, for every letter a-z, how many times it occurs
##   in the name and whether it occurs at all
##
def gender_features2(name):
    """Describe a name by its boundary letters and per-letter statistics.

    All features are computed on the lower-cased name.

    Args:
        name: The name to describe. An empty string is accepted.

    Returns:
        A dict holding "first_letter" and "last_letter" (both "" for an
        empty name) followed, for each letter of
        "abcdefghijklmnopqrstuvwxyz", by "count(<letter>)" (number of
        occurrences) and "has(<letter>)" (True when the letter occurs).
    """
    # Lower-case once instead of twice on every loop iteration.
    lowered = name.lower()
    features = {
        # Slices rather than indexes so an empty name does not raise
        # IndexError.
        "first_letter": name[:1].lower(),
        "last_letter": name[-1:].lower(),
    }
    for letter in "abcdefghijklmnopqrstuvwxyz":
        occurrences = lowered.count(letter)
        features["count({})".format(letter)] = occurrences
        # A single letter is contained in the name exactly when its count
        # is positive.
        features["has({})".format(letter)] = occurrences > 0
    return features


gender_features2("John")
[8]:
{'first_letter': 'j',
 'last_letter': 'n',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 0,
 'has(e)': False,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 1,
 'has(j)': True,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 1,
 'has(n)': True,
 'count(o)': 1,
 'has(o)': True,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}
[9]:
##
## Build the feature sets with gender_features2, i.e. the
## bag-of-letters representation (first/last letter plus per-letter
## counts and presence flags). Calling gender_features here would
## just repeat the single-feature experiment from the earlier cell.
##
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]

##
## Split into training and validation sets
##
train_set, test_set = featuresets[500:], featuresets[:500]

##
## Train and evaluate the classifier
##
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
[9]:
0.758
[10]:
##
## Three sets are used so that errors can be analysed on a dev-test
## set without touching the final test set:
##   [0, 500)      -> test
##   [500, 1500)   -> dev-test
##   [1500, end)   -> training
##
test_names = labeled_names[:500]
devtest_names = labeled_names[500:1500]
train_names = labeled_names[1500:]

##
## Feature sets for each of the three partitions
##
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
test_set = [(gender_features(name), label) for name, label in test_names]

##
## Train on the training set and score on the dev-test set
##
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, devtest_set)
[10]:
0.759
[11]:
##
## Error analysis: collect the dev-test names the classifier labels
## incorrectly, to look (where possible) for the reason it fails.
## For instance, the last two or three letters could be indicative
## of gender.
##
errors = []
for name, tag in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        # Correct prediction: nothing to record.
        continue
    errors.append((tag, guess, name))

# Sorting the (tag, guess, name) tuples groups the output by true label.
for tag, guess, name in sorted(errors):
    print("correct={:<8} guess={:<8s} name={:<30}".format(tag, guess, name))
correct=female   guess=male     name=Abigael
correct=female   guess=male     name=Aidan
correct=female   guess=male     name=Aileen
correct=female   guess=male     name=Alexis
correct=female   guess=male     name=Amabel
correct=female   guess=male     name=Angil
correct=female   guess=male     name=Ann
correct=female   guess=male     name=Avis
correct=female   guess=male     name=Bliss
correct=female   guess=male     name=Bo
correct=female   guess=male     name=Bren
correct=female   guess=male     name=Brittan
correct=female   guess=male     name=Caitrin
correct=female   guess=male     name=Camel
correct=female   guess=male     name=Caril
correct=female   guess=male     name=Carilyn
correct=female   guess=male     name=Carleen
correct=female   guess=male     name=Carlin
correct=female   guess=male     name=Carmel
correct=female   guess=male     name=Carmen
correct=female   guess=male     name=Carolann
correct=female   guess=male     name=Caryn
correct=female   guess=male     name=Catherin
correct=female   guess=male     name=Cathrin
correct=female   guess=male     name=Ceil
correct=female   guess=male     name=Celestyn
correct=female   guess=male     name=Clovis
correct=female   guess=male     name=Colleen
correct=female   guess=male     name=Dagmar
correct=female   guess=male     name=Del
correct=female   guess=male     name=Devon
correct=female   guess=male     name=Dido
correct=female   guess=male     name=Donnajean
correct=female   guess=male     name=Doris
correct=female   guess=male     name=Dyann
correct=female   guess=male     name=Eden
correct=female   guess=male     name=Elizabet
correct=female   guess=male     name=Em
correct=female   guess=male     name=Estell
correct=female   guess=male     name=Ethyl
correct=female   guess=male     name=Fan
correct=female   guess=male     name=Farand
correct=female   guess=male     name=Fawn
correct=female   guess=male     name=Flower
correct=female   guess=male     name=Garnet
correct=female   guess=male     name=Gaynor
correct=female   guess=male     name=Gillian
correct=female   guess=male     name=Ginger
correct=female   guess=male     name=Grethel
correct=female   guess=male     name=Grissel
correct=female   guess=male     name=Hannis
correct=female   guess=male     name=Harriett
correct=female   guess=male     name=Imojean
correct=female   guess=male     name=Ingaborg
correct=female   guess=male     name=Isador
correct=female   guess=male     name=Jackelyn
correct=female   guess=male     name=Janel
correct=female   guess=male     name=Jaynell
correct=female   guess=male     name=Jazmin
correct=female   guess=male     name=Jerrilyn
correct=female   guess=male     name=Jewell
correct=female   guess=male     name=Joan
correct=female   guess=male     name=Joannes
correct=female   guess=male     name=Jojo
correct=female   guess=male     name=Jolyn
correct=female   guess=male     name=Jolynn
correct=female   guess=male     name=Jordan
correct=female   guess=male     name=Kaitlyn
correct=female   guess=male     name=Kaitlynn
correct=female   guess=male     name=Karon
correct=female   guess=male     name=Karylin
correct=female   guess=male     name=Katheryn
correct=female   guess=male     name=Kathlin
correct=female   guess=male     name=Keriann
correct=female   guess=male     name=Kerstin
correct=female   guess=male     name=Kimberlyn
correct=female   guess=male     name=Kristien
correct=female   guess=male     name=Kristyn
correct=female   guess=male     name=Laureen
correct=female   guess=male     name=Leanor
correct=female   guess=male     name=Lillian
correct=female   guess=male     name=Lou
correct=female   guess=male     name=Lurleen
correct=female   guess=male     name=Lyndell
correct=female   guess=male     name=Lynnet
correct=female   guess=male     name=Madel
correct=female   guess=male     name=Mair
correct=female   guess=male     name=Maribel
correct=female   guess=male     name=Marlo
correct=female   guess=male     name=Maryl
correct=female   guess=male     name=Meridel
correct=female   guess=male     name=Michell
correct=female   guess=male     name=Mildred
correct=female   guess=male     name=Millisent
correct=female   guess=male     name=Mirabel
correct=female   guess=male     name=Morgen
correct=female   guess=male     name=Muffin
correct=female   guess=male     name=Nitin
correct=female   guess=male     name=Noel
correct=female   guess=male     name=Noelyn
correct=female   guess=male     name=Olwen
correct=female   guess=male     name=Peg
correct=female   guess=male     name=Perl
correct=female   guess=male     name=Phil
correct=female   guess=male     name=Renell
correct=female   guess=male     name=Rhiamon
correct=female   guess=male     name=Robbyn
correct=female   guess=male     name=Rosamund
correct=female   guess=male     name=Shannen
correct=female   guess=male     name=Sharleen
correct=female   guess=male     name=Sharron
correct=female   guess=male     name=Sherilyn
correct=female   guess=male     name=Shirleen
correct=female   guess=male     name=Sigrid
correct=female   guess=male     name=Sybil
correct=female   guess=male     name=Venus
correct=female   guess=male     name=Veradis
correct=female   guess=male     name=Vin
correct=female   guess=male     name=Yoshiko
correct=male     guess=female   name=Alfie
correct=male     guess=female   name=Alphonse
correct=male     guess=female   name=Andy
correct=male     guess=female   name=Archie
correct=male     guess=female   name=Ari
correct=male     guess=female   name=Aubrey
correct=male     guess=female   name=Augie
correct=male     guess=female   name=Bailey
correct=male     guess=female   name=Bartolomei
correct=male     guess=female   name=Bary
correct=male     guess=female   name=Bradly
correct=male     guess=female   name=Brady
correct=male     guess=female   name=Brice
correct=male     guess=female   name=Bruce
correct=male     guess=female   name=Burnaby
correct=male     guess=female   name=Christie
correct=male     guess=female   name=Christophe
correct=male     guess=female   name=Cobbie
correct=male     guess=female   name=Cobby
correct=male     guess=female   name=Dana
correct=male     guess=female   name=Darcy
correct=male     guess=female   name=Dimitri
correct=male     guess=female   name=Dudley
correct=male     guess=female   name=Eli
correct=male     guess=female   name=Elmore
correct=male     guess=female   name=Erich
correct=male     guess=female   name=Ernie
correct=male     guess=female   name=Felix
correct=male     guess=female   name=Filipe
correct=male     guess=female   name=Frankie
correct=male     guess=female   name=Freddie
correct=male     guess=female   name=Frederich
correct=male     guess=female   name=Garcia
correct=male     guess=female   name=Garey
correct=male     guess=female   name=Garry
correct=male     guess=female   name=Garth
correct=male     guess=female   name=Gayle
correct=male     guess=female   name=Geri
correct=male     guess=female   name=Giffie
correct=male     guess=female   name=Giffy
correct=male     guess=female   name=Godfrey
correct=male     guess=female   name=Goose
correct=male     guess=female   name=Guthrie
correct=male     guess=female   name=Hilary
correct=male     guess=female   name=Humphrey
correct=male     guess=female   name=Huntley
correct=male     guess=female   name=Jeffery
correct=male     guess=female   name=Jermaine
correct=male     guess=female   name=Jonah
correct=male     guess=female   name=Jordy
correct=male     guess=female   name=Jory
correct=male     guess=female   name=Judah
correct=male     guess=female   name=Julie
correct=male     guess=female   name=Kennedy
correct=male     guess=female   name=Klee
correct=male     guess=female   name=Leroy
correct=male     guess=female   name=Lindsay
correct=male     guess=female   name=Lindy
correct=male     guess=female   name=Mace
correct=male     guess=female   name=Maurice
correct=male     guess=female   name=Mendie
correct=male     guess=female   name=Micah
correct=male     guess=female   name=Mika
correct=male     guess=female   name=Montgomery
correct=male     guess=female   name=Morlee
correct=male     guess=female   name=Murphy
correct=male     guess=female   name=Nealy
correct=male     guess=female   name=Nichole
correct=male     guess=female   name=Nikki
correct=male     guess=female   name=Obie
correct=male     guess=female   name=Olle
correct=male     guess=female   name=Orbadiah
correct=male     guess=female   name=Orville
correct=male     guess=female   name=Ozzy
correct=male     guess=female   name=Parry
correct=male     guess=female   name=Pasquale
correct=male     guess=female   name=Pearce
correct=male     guess=female   name=Pembroke
correct=male     guess=female   name=Pierce
correct=male     guess=female   name=Ramsey
correct=male     guess=female   name=Rawley
correct=male     guess=female   name=Ricky
correct=male     guess=female   name=Rocky
correct=male     guess=female   name=Roddie
correct=male     guess=female   name=Rodolphe
correct=male     guess=female   name=Rory
correct=male     guess=female   name=Rudie
correct=male     guess=female   name=Sascha
correct=male     guess=female   name=Sax
correct=male     guess=female   name=Sherlocke
correct=male     guess=female   name=Sidney
correct=male     guess=female   name=Sparky
correct=male     guess=female   name=Sully
correct=male     guess=female   name=Sunny
correct=male     guess=female   name=Tammie
correct=male     guess=female   name=Tammy
correct=male     guess=female   name=Tanny
correct=male     guess=female   name=Thayne
correct=male     guess=female   name=Timmie
correct=male     guess=female   name=Toby
correct=male     guess=female   name=Tommie
correct=male     guess=female   name=Tommy
correct=male     guess=female   name=Torey
correct=male     guess=female   name=Torre
correct=male     guess=female   name=Trace
correct=male     guess=female   name=Tray
correct=male     guess=female   name=Trey
correct=male     guess=female   name=Uriah
correct=male     guess=female   name=Vasili
correct=male     guess=female   name=Verge
correct=male     guess=female   name=Vijay
correct=male     guess=female   name=Vinny
correct=male     guess=female   name=Virgie
correct=male     guess=female   name=Wally
correct=male     guess=female   name=Willey
correct=male     guess=female   name=Worthy
correct=male     guess=female   name=Wye
correct=male     guess=female   name=Yance
correct=male     guess=female   name=Zacharie
correct=male     guess=female   name=Zackariah
correct=male     guess=female   name=Zippy
correct=male     guess=female   name=Zollie
[12]:
##
## Suffix-based features
##   NOTE: this redefines gender_features from the earlier cell; every
##   cell executed after this one uses the suffix version.
##
def gender_features(word):
    """Build the feature dictionary for a name from its trailing letters.

    Returns:
        A dict with "suffix1" (the last character of ``word``) and
        "suffix2" (its last two characters); slices are used, so shorter
        names yield shorter values.
    """
    return dict(suffix1=word[-1:], suffix2=word[-2:])


##
## Rebuild the training and dev-test sets with the suffix features,
## then retrain the classifier and score it on the dev-test set
##
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, devtest_set)
[12]:
0.775