TF-IDF Representation#
In a document-term matrix, the entries represent the number of occurrences of a token in each of the documents.
Words such as articles, forms of the verb to be, connectors, and other function words are very common in texts (they have high frequencies in a document-term matrix) but are of little use for extracting valuable information from a document. Moreover, they distort and obscure interesting terms that would be far more useful.
The TF-IDF (term-frequency inverse-document-frequency) representation recomputes the values of the matrix as:
$$\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$$
where t represents the term and d represents the document.
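The factor idf(t) is the inverse document frequency, which down-weights terms that occur in many documents. In scikit-learn, with smooth_idf=False (used in the first cases below), it is computed with the natural logarithm as:

$$\text{idf}(t) = \ln\left(\frac{n}{\text{df}(t)}\right) + 1$$

where n is the number of documents and df(t) is the number of documents that contain the term t. Case 4 uses the smoothed variant, log((1 + n) / (1 + df(t))) + 1.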
The transformation can also normalize each row, so that a token's weight takes into account the total number of tokens in its document.
To illustrate this representation, consider the following document-term matrix:
[1]:
import numpy as np
#
# tf: term-frequency
#
tf = np.array(
[
[3, 0, 1],
[2, 0, 0],
[3, 0, 0],
[4, 0, 0],
[3, 2, 0],
[3, 0, 2],
]
)
tf
[1]:
array([[3, 0, 1],
[2, 0, 0],
[3, 0, 0],
[4, 0, 0],
[3, 2, 0],
[3, 0, 2]])
The transformer is now created:
[2]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(
# -------------------------------------------------------------
# Each output row will have unit norm.
# "l1", "l2"
norm="l1",
# -------------------------------------------------------------
# Enable inverse-document-frequency reweighting.
use_idf=True,
# -------------------------------------------------------------
# Smooth idf weights by adding one to document frequencies, as
# if an extra document was seen containing every term in the
# collection exactly once. Prevents zero divisions. Disabled here.
smooth_idf=False,
# -------------------------------------------------------------
# Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
sublinear_tf=False,
)
transformer.fit_transform(tf).toarray()
[2]:
array([[0.5883954 , 0. , 0.4116046 ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[0.34950701, 0.65049299, 0. ],
[0.41682734, 0. , 0.58317266]])
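As a quick check, the fitted transformer exposes the idf weights it learned through its idf_ attribute; with smooth_idf=False each value is log(n / df) + 1, and they should match the values derived manually in Case 3 below:

#
# idf weights learned by the transformer fitted above
#
transformer.idf_
# expected: array([1.        , 2.79175947, 2.09861229])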
Case 1#
* norm="l1"
* use_idf=False
[3]:
#
# term-frequency
#
tf
[3]:
array([[3, 0, 1],
[2, 0, 0],
[3, 0, 0],
[4, 0, 0],
[3, 2, 0],
[3, 0, 2]])
[4]:
#
# "l1" norm of each row
#
row_norm = np.tile(tf.sum(axis=1).reshape(-1, 1), (1, 3))
row_norm
[4]:
array([[4, 4, 4],
[2, 2, 2],
[3, 3, 3],
[4, 4, 4],
[5, 5, 5],
[5, 5, 5]])
[5]:
tf / row_norm
[5]:
array([[0.75, 0. , 0.25],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[0.6 , 0.4 , 0. ],
[0.6 , 0. , 0.4 ]])
[6]:
#
# Verification
#
TfidfTransformer(
norm="l1",
use_idf=False,
smooth_idf=False,
sublinear_tf=False,
).fit_transform(tf).toarray()
[6]:
array([[0.75, 0. , 0.25],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[0.6 , 0.4 , 0. ],
[0.6 , 0. , 0.4 ]])
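The explicit tiling above is only for illustration; with NumPy broadcasting, the same "l1" normalization can be written in a single expression:

#
# Equivalent "l1" normalization using broadcasting
#
tf / tf.sum(axis=1, keepdims=True)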
Case 2#
* norm="l2"
* use_idf=False
[7]:
#
# term-frequency
#
tf
[7]:
array([[3, 0, 1],
[2, 0, 0],
[3, 0, 0],
[4, 0, 0],
[3, 2, 0],
[3, 0, 2]])
[8]:
#
# "l2" norm of each row
#
row_norm = np.tile(np.sqrt(np.power(tf, 2).sum(axis=1).reshape(-1, 1)), (1, 3))
row_norm
[8]:
array([[3.16227766, 3.16227766, 3.16227766],
[2. , 2. , 2. ],
[3. , 3. , 3. ],
[4. , 4. , 4. ],
[3.60555128, 3.60555128, 3.60555128],
[3.60555128, 3.60555128, 3.60555128]])
[9]:
tf / row_norm
[9]:
array([[0.9486833 , 0. , 0.31622777],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[0.83205029, 0.5547002 , 0. ],
[0.83205029, 0. , 0.5547002 ]])
[10]:
#
# Verification
#
TfidfTransformer(
norm="l2",
use_idf=False,
smooth_idf=False,
sublinear_tf=False,
).fit_transform(tf).toarray()
[10]:
array([[0.9486833 , 0. , 0.31622777],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[0.83205029, 0.5547002 , 0. ],
[0.83205029, 0. , 0.5547002 ]])
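As in Case 1, the tiling can be avoided; np.linalg.norm computes the "l2" row norms directly:

#
# Equivalent "l2" normalization using np.linalg.norm
#
tf / np.linalg.norm(tf, axis=1, keepdims=True)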
Case 3#
* norm="l2"
* use_idf=True
* smooth_idf=False
* sublinear_tf=False
[11]:
#
# number of documents = 6
#
n = tf.shape[0]
[12]:
#
# Count the number of documents in which each term appears
#
df = np.where(tf > 0, 1, 0)
df = df.sum(axis=0)
df
[12]:
array([6, 1, 2])
[13]:
#
# Compute idf(t). For smooth_idf=False:
#
#   idf(t) = log(n / df) + 1
#
idf = np.log(n / df) + 1
idf = np.tile(idf, (6, 1))
idf
[13]:
array([[1. , 2.79175947, 2.09861229],
[1. , 2.79175947, 2.09861229],
[1. , 2.79175947, 2.09861229],
[1. , 2.79175947, 2.09861229],
[1. , 2.79175947, 2.09861229],
[1. , 2.79175947, 2.09861229]])
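These values follow directly from the formula, with n = 6 and df = (6, 1, 2):

$$\text{idf} = \left( \ln\tfrac{6}{6} + 1,\ \ln\tfrac{6}{1} + 1,\ \ln\tfrac{6}{2} + 1 \right) \approx (1,\ 2.7918,\ 2.0986)$$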
[14]:
tf_idf_raw = tf * idf
tf_idf_raw
[14]:
array([[3. , 0. , 2.09861229],
[2. , 0. , 0. ],
[3. , 0. , 0. ],
[4. , 0. , 0. ],
[3. , 5.58351894, 0. ],
[3. , 0. , 4.19722458]])
[15]:
row_norm = np.tile(np.sqrt(np.power(tf_idf_raw, 2).sum(axis=1).reshape(-1, 1)), (1, 3))
row_norm
[15]:
array([[3.66117106, 3.66117106, 3.66117106],
[2. , 2. , 2. ],
[3. , 3. , 3. ],
[4. , 4. , 4. ],
[6.33842912, 6.33842912, 6.33842912],
[5.15913696, 5.15913696, 5.15913696]])
[16]:
tf_idf = tf_idf_raw / row_norm
tf_idf
[16]:
array([[0.81940995, 0. , 0.57320793],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[0.47330339, 0.88089948, 0. ],
[0.58149261, 0. , 0.81355169]])
[17]:
#
# Verification
#
TfidfTransformer(
norm="l2",
use_idf=True,
smooth_idf=False,
sublinear_tf=False,
).fit_transform(tf).toarray()
[17]:
array([[0.81940995, 0. , 0.57320793],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[0.47330339, 0.88089948, 0. ],
[0.58149261, 0. , 0.81355169]])
Case 4#
* norm="l2"
* use_idf=True
* smooth_idf=True
* sublinear_tf=False
[18]:
#
# Compute idf(t). For smooth_idf=True. This is equivalent to adding a
# document that contains every term exactly once:
#
#   idf(t) = log((1 + n) / (1 + df)) + 1
#
idf = np.log((1 + n) / (1 + df)) + 1
idf = np.tile(idf, (6, 1))
idf
[18]:
array([[1. , 2.25276297, 1.84729786],
[1. , 2.25276297, 1.84729786],
[1. , 2.25276297, 1.84729786],
[1. , 2.25276297, 1.84729786],
[1. , 2.25276297, 1.84729786],
[1. , 2.25276297, 1.84729786]])
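With the smoothing, the counts become 1 + n = 7 and 1 + df = (7, 2, 3), so:

$$\text{idf} = \left( \ln\tfrac{7}{7} + 1,\ \ln\tfrac{7}{2} + 1,\ \ln\tfrac{7}{3} + 1 \right) \approx (1,\ 2.2528,\ 1.8473)$$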
[19]:
tf_idf_raw = tf * idf
tf_idf_raw
[19]:
array([[3. , 0. , 1.84729786],
[2. , 0. , 0. ],
[3. , 0. , 0. ],
[4. , 0. , 0. ],
[3. , 4.50552594, 0. ],
[3. , 0. , 3.69459572]])
[20]:
row_norm = np.tile(np.sqrt(np.power(tf_idf_raw, 2).sum(axis=1).reshape(-1, 1)), (1, 3))
row_norm
[20]:
array([[3.52313914, 3.52313914, 3.52313914],
[2. , 2. , 2. ],
[3. , 3. , 3. ],
[4. , 4. , 4. ],
[5.41292564, 5.41292564, 5.41292564],
[4.75920556, 4.75920556, 4.75920556]])
[21]:
tf_idf = tf_idf_raw / row_norm
tf_idf
[21]:
array([[0.85151335, 0. , 0.52433293],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[0.55422893, 0.83236428, 0. ],
[0.63035731, 0. , 0.77630514]])
[22]:
#
# Verification
#
TfidfTransformer(
norm="l2",
use_idf=True,
smooth_idf=True,
sublinear_tf=False,
).fit_transform(tf).toarray()
[22]:
array([[0.85151335, 0. , 0.52433293],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[0.55422893, 0.83236428, 0. ],
[0.63035731, 0. , 0.77630514]])
Case 5#
* norm="l2"
* use_idf=True
* smooth_idf=True
* sublinear_tf=True
[23]:
#
# Compute idf(t). For smooth_idf=True. This is equivalent to adding a
# document that contains every term exactly once:
#
#   idf(t) = log((1 + n) / (1 + df)) + 1
#
idf = np.log((1 + n) / (1 + df)) + 1
idf = np.tile(idf, (6, 1))
idf
[23]:
array([[1. , 2.25276297, 1.84729786],
[1. , 2.25276297, 1.84729786],
[1. , 2.25276297, 1.84729786],
[1. , 2.25276297, 1.84729786],
[1. , 2.25276297, 1.84729786],
[1. , 2.25276297, 1.84729786]])
[24]:
#
# When sublinear_tf=True, tf is replaced by 1 + log(tf)
#
mylog = lambda x: 1 + np.log(x) if x > 0 else 0
tf_idf_raw = np.vectorize(mylog)(tf) * idf
tf_idf_raw
[24]:
array([[2.09861229, 0. , 1.84729786],
[1.69314718, 0. , 0. ],
[2.09861229, 0. , 0. ],
[2.38629436, 0. , 0. ],
[2.09861229, 3.81425927, 0. ],
[2.09861229, 0. , 3.12774716]])
[25]:
row_norm = np.tile(np.sqrt(np.power(tf_idf_raw, 2).sum(axis=1).reshape(-1, 1)), (1, 3))
row_norm
[25]:
array([[2.79583314, 2.79583314, 2.79583314],
[1.69314718, 1.69314718, 1.69314718],
[2.09861229, 2.09861229, 2.09861229],
[2.38629436, 2.38629436, 2.38629436],
[4.35347531, 4.35347531, 4.35347531],
[3.76656022, 3.76656022, 3.76656022]])
[26]:
tf_idf = tf_idf_raw / row_norm
tf_idf
[26]:
array([[0.75062144, 0. , 0.66073252],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[0.48205448, 0.87614124, 0. ],
[0.55716945, 0. , 0.83039882]])
[27]:
#
# Verification
#
TfidfTransformer(
norm="l2",
use_idf=True,
smooth_idf=True,
sublinear_tf=True,
).fit_transform(tf).toarray()
[27]:
array([[0.75062144, 0. , 0.66073252],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[1. , 0. , 0. ],
[0.48205448, 0.87614124, 0. ],
[0.55716945, 0. , 0.83039882]])
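The five cases can be collected into a single helper that replays the computations above. The following is a minimal sketch for dense count matrices; the name tfidf_manual and its structure are not part of scikit-learn, they simply mirror the steps of this notebook:

#
# Minimal re-implementation of the cases above for a dense count matrix
#
def tfidf_manual(tf, norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=False):
    X = tf.astype(float)
    if sublinear_tf:
        # replace tf by 1 + log(tf) for the nonzero entries
        mask = X > 0
        X[mask] = 1.0 + np.log(X[mask])
    if use_idf:
        n = tf.shape[0]
        df = np.count_nonzero(tf, axis=0)
        if smooth_idf:
            idf = np.log((1 + n) / (1 + df)) + 1
        else:
            idf = np.log(n / df) + 1
        X = X * idf
    if norm == "l1":
        X = X / np.abs(X).sum(axis=1, keepdims=True)
    elif norm == "l2":
        X = X / np.sqrt((X ** 2).sum(axis=1, keepdims=True))
    return X

#
# Should reproduce the result of Case 5
#
tfidf_manual(tf, norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=True)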