Representación TF-IDF#

  • En una matriz documento-término, las cantidades representan la ocurrencia de un token en cada uno de los documentos.

  • Las palabras como artículos, verbos ser/estar, conectores, etc. son palabras muy comunes en los textos (tienen frecuencias altas en una matriz documento-término) y tienen poca utilidad para extraer información valiosa de un documento. Adicionalmente, distorsionan y obscurecen términos interesantes que serían de mucha más utilidad.

  • La representación TF-IDF (term-frequency inverse document-frequency) recomputa los valores de la matriz como:

    \text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)

    • t representa el término.

    • d representa el documento.

  • La transformación también pondera el peso de cada token respecto al peso total de los tokens del mismo documento: cada fila de la matriz se normaliza (norma "l1" o "l2"), de modo que documentos con distinta cantidad de tokens sean comparables.

Para explicar esta representación considere la siguiente matriz documento-término:

[1]:
import numpy as np

#
# tf: term-frequency counts, one row per document and one column per term
#
tf = np.array(
    (
        (3, 0, 1),
        (2, 0, 0),
        (3, 0, 0),
        (4, 0, 0),
        (3, 2, 0),
        (3, 0, 2),
    )
)

tf
[1]:
array([[3, 0, 1],
       [2, 0, 0],
       [3, 0, 0],
       [4, 0, 0],
       [3, 2, 0],
       [3, 0, 2]])

Ahora se crea el transformador:

[2]:
from sklearn.feature_extraction.text import TfidfTransformer

# Build the TF-IDF transformer. Each option is described inline; the cases
# that follow reproduce by hand what the options do.
transformer = TfidfTransformer(
    # -------------------------------------------------------------
    # Row normalization: with "l1" or "l2" each output row will have
    # unit norm under the chosen norm.
    norm="l1",
    # -------------------------------------------------------------
    # Enable inverse-document-frequency reweighting.
    use_idf=True,
    # -------------------------------------------------------------
    # When True, smooth idf weights by adding one to document
    # frequencies, as if an extra document was seen containing every
    # term in the collection exactly once (prevents zero divisions).
    # Disabled here, so idf(t) = log(n / df(t)) + 1.
    smooth_idf=False,
    # -------------------------------------------------------------
    # When True, apply sublinear tf scaling, i.e. replace tf with
    # 1 + log(tf). Disabled here, so the raw counts are used.
    sublinear_tf=False,
)

# Fit on the count matrix and display the transformed result as an array.
transformer.fit_transform(tf).toarray()
[2]:
array([[0.5883954 , 0.        , 0.4116046 ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.34950701, 0.65049299, 0.        ],
       [0.41682734, 0.        , 0.58317266]])

Caso 1#

* norm="l1"

* use_idf=False
[3]:
#
# Term-frequency matrix (raw counts) used as the starting point
#
tf
[3]:
array([[3, 0, 1],
       [2, 0, 0],
       [3, 0, 0],
       [4, 0, 0],
       [3, 2, 0],
       [3, 0, 2]])
[4]:
#
# "l1" norm of each row: the sum of the absolute values of its entries.
# The column of norms is repeated once per column of tf (tf.shape[1]
# instead of a hard-coded 3) so it can be divided element-wise.
#
row_norm = np.tile(np.abs(tf).sum(axis=1).reshape(-1, 1), (1, tf.shape[1]))
row_norm
[4]:
array([[4, 4, 4],
       [2, 2, 2],
       [3, 3, 3],
       [4, 4, 4],
       [5, 5, 5],
       [5, 5, 5]])
[5]:
# Divide every count by the "l1" norm of its row.
tf / row_norm
[5]:
array([[0.75, 0.  , 0.25],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.6 , 0.4 , 0.  ],
       [0.6 , 0.  , 0.4 ]])
[6]:
#
# Check: scikit-learn with the same settings (l1 norm, no idf)
#
TfidfTransformer(
    norm="l1",
    use_idf=False,
    smooth_idf=False,
    sublinear_tf=False,
).fit_transform(tf).toarray()
[6]:
array([[0.75, 0.  , 0.25],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.6 , 0.4 , 0.  ],
       [0.6 , 0.  , 0.4 ]])

Caso 2#

* norm="l2"

* use_idf=False
[7]:
#
# Term-frequency matrix (raw counts) used as the starting point
#
tf
[7]:
array([[3, 0, 1],
       [2, 0, 0],
       [3, 0, 0],
       [4, 0, 0],
       [3, 2, 0],
       [3, 0, 2]])
[8]:
#
# "l2" (Euclidean) norm of each row: square root of the sum of squares.
# The column of norms is repeated once per column of tf (tf.shape[1]
# instead of a hard-coded 3) so it can be divided element-wise.
#
row_norm = np.tile(np.linalg.norm(tf, axis=1, keepdims=True), (1, tf.shape[1]))
row_norm
[8]:
array([[3.16227766, 3.16227766, 3.16227766],
       [2.        , 2.        , 2.        ],
       [3.        , 3.        , 3.        ],
       [4.        , 4.        , 4.        ],
       [3.60555128, 3.60555128, 3.60555128],
       [3.60555128, 3.60555128, 3.60555128]])
[9]:
# Divide every count by the "l2" norm of its row.
tf / row_norm
[9]:
array([[0.9486833 , 0.        , 0.31622777],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.83205029, 0.5547002 , 0.        ],
       [0.83205029, 0.        , 0.5547002 ]])
[10]:
#
# Check: scikit-learn with the same settings (l2 norm, no idf)
#
TfidfTransformer(
    norm="l2",
    use_idf=False,
    smooth_idf=False,
    sublinear_tf=False,
).fit_transform(tf).toarray()
[10]:
array([[0.9486833 , 0.        , 0.31622777],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.83205029, 0.5547002 , 0.        ],
       [0.83205029, 0.        , 0.5547002 ]])

Caso 3#

* norm="l2"

* use_idf=True

* smooth_idf=False

* sublinear_tf=False
[11]:
#
# Number of documents (one per row of tf); n = 6 in this example
#
n = len(tf)
[12]:
#
# Document frequency: for each term (column), count the documents (rows)
# in which it appears at least once
#
df = (tf > 0).sum(axis=0)
df
[12]:
array([6, 1, 2])
[13]:
#
# Compute idf(t) for smooth_idf=False:
#
#                  n
#   idf(t) = log ---- + 1
#                 df
#
# The idf row is then repeated once per document (n rows, rather than a
# hard-coded 6) so it can be multiplied element-wise with tf.
#
idf = np.log(n / df) + 1
idf = np.tile(idf, (n, 1))
idf
[13]:
array([[1.        , 2.79175947, 2.09861229],
       [1.        , 2.79175947, 2.09861229],
       [1.        , 2.79175947, 2.09861229],
       [1.        , 2.79175947, 2.09861229],
       [1.        , 2.79175947, 2.09861229],
       [1.        , 2.79175947, 2.09861229]])
[14]:
# Raw (not yet normalized) tf-idf weights: element-wise product of the
# counts and the idf weights.
tf_idf_raw = tf * idf
tf_idf_raw
[14]:
array([[3.        , 0.        , 2.09861229],
       [2.        , 0.        , 0.        ],
       [3.        , 0.        , 0.        ],
       [4.        , 0.        , 0.        ],
       [3.        , 5.58351894, 0.        ],
       [3.        , 0.        , 4.19722458]])
[15]:
# "l2" norm of each row of the raw tf-idf weights, repeated once per column
# (taken from the array shape rather than a hard-coded 3) so it can be
# divided element-wise.
row_norm = np.tile(
    np.linalg.norm(tf_idf_raw, axis=1, keepdims=True),
    (1, tf_idf_raw.shape[1]),
)
row_norm
[15]:
array([[3.66117106, 3.66117106, 3.66117106],
       [2.        , 2.        , 2.        ],
       [3.        , 3.        , 3.        ],
       [4.        , 4.        , 4.        ],
       [6.33842912, 6.33842912, 6.33842912],
       [5.15913696, 5.15913696, 5.15913696]])
[16]:
# Normalize each row of raw weights by its "l2" norm.
tf_idf = tf_idf_raw / row_norm
tf_idf
[16]:
array([[0.81940995, 0.        , 0.57320793],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.47330339, 0.88089948, 0.        ],
       [0.58149261, 0.        , 0.81355169]])
[17]:
#
# Check: scikit-learn with the same settings (l2 norm, unsmoothed idf)
#
TfidfTransformer(
    norm="l2",
    use_idf=True,
    smooth_idf=False,
    sublinear_tf=False,
).fit_transform(tf).toarray()
[17]:
array([[0.81940995, 0.        , 0.57320793],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.47330339, 0.88089948, 0.        ],
       [0.58149261, 0.        , 0.81355169]])

Caso 4#

* norm="l2"

* use_idf=True

* smooth_idf=True

* sublinear_tf=False
[18]:
#
# Compute idf(t) for smooth_idf=True. This is equivalent to adding one
# extra document that contains every term:
#
#                  1+n
#   idf(t) = log ------ + 1
#                 1+df
#
# The idf row is then repeated once per document (n rows, rather than a
# hard-coded 6) so it can be multiplied element-wise with tf.
#
idf = np.log((1 + n) / (1 + df)) + 1
idf = np.tile(idf, (n, 1))
idf
[18]:
array([[1.        , 2.25276297, 1.84729786],
       [1.        , 2.25276297, 1.84729786],
       [1.        , 2.25276297, 1.84729786],
       [1.        , 2.25276297, 1.84729786],
       [1.        , 2.25276297, 1.84729786],
       [1.        , 2.25276297, 1.84729786]])
[19]:
# Raw (not yet normalized) tf-idf weights: element-wise product of the
# counts and the idf weights.
tf_idf_raw = tf * idf
tf_idf_raw
[19]:
array([[3.        , 0.        , 1.84729786],
       [2.        , 0.        , 0.        ],
       [3.        , 0.        , 0.        ],
       [4.        , 0.        , 0.        ],
       [3.        , 4.50552594, 0.        ],
       [3.        , 0.        , 3.69459572]])
[20]:
# "l2" norm of each row of the raw tf-idf weights, repeated once per column
# (taken from the array shape rather than a hard-coded 3) so it can be
# divided element-wise.
row_norm = np.tile(
    np.linalg.norm(tf_idf_raw, axis=1, keepdims=True),
    (1, tf_idf_raw.shape[1]),
)
row_norm
[20]:
array([[3.52313914, 3.52313914, 3.52313914],
       [2.        , 2.        , 2.        ],
       [3.        , 3.        , 3.        ],
       [4.        , 4.        , 4.        ],
       [5.41292564, 5.41292564, 5.41292564],
       [4.75920556, 4.75920556, 4.75920556]])
[21]:
# Normalize each row of raw weights by its "l2" norm.
tf_idf = tf_idf_raw / row_norm
tf_idf
[21]:
array([[0.85151335, 0.        , 0.52433293],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.55422893, 0.83236428, 0.        ],
       [0.63035731, 0.        , 0.77630514]])
[22]:
#
# Check: scikit-learn with the same settings (l2 norm, smoothed idf)
#
TfidfTransformer(
    norm="l2",
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
).fit_transform(tf).toarray()
[22]:
array([[0.85151335, 0.        , 0.52433293],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.55422893, 0.83236428, 0.        ],
       [0.63035731, 0.        , 0.77630514]])

Caso 5#

* norm="l2"

* use_idf=True

* smooth_idf=True

* sublinear_tf=True
[23]:
#
# Compute idf(t) for smooth_idf=True. This is equivalent to adding one
# extra document that contains every term:
#
#                  1+n
#   idf(t) = log ------ + 1
#                 1+df
#
# The idf row is then repeated once per document (n rows, rather than a
# hard-coded 6) so it can be multiplied element-wise with tf.
#
idf = np.log((1 + n) / (1 + df)) + 1
idf = np.tile(idf, (n, 1))
idf
[23]:
array([[1.        , 2.25276297, 1.84729786],
       [1.        , 2.25276297, 1.84729786],
       [1.        , 2.25276297, 1.84729786],
       [1.        , 2.25276297, 1.84729786],
       [1.        , 2.25276297, 1.84729786],
       [1.        , 2.25276297, 1.84729786]])
[24]:
#
# When sublinear_tf=True, tf is replaced by 1 + log(tf)
#
def mylog(x):
    """Return 1 + log(x) for a positive scalar count x, and 0.0 otherwise.

    The zero branch returns a float on purpose: np.vectorize infers its
    output dtype from the first element it evaluates, so an integer 0 in
    that position would make every other result be truncated to an integer.
    """
    if x > 0:
        return 1.0 + np.log(x)
    return 0.0

# Apply the sublinear scaling element-wise and weight the result by idf.
# otypes is given explicitly because np.vectorize otherwise infers the
# output dtype from the first element alone.
tf_idf_raw = np.vectorize(mylog, otypes=[float])(tf) * idf
tf_idf_raw
[24]:
array([[2.09861229, 0.        , 1.84729786],
       [1.69314718, 0.        , 0.        ],
       [2.09861229, 0.        , 0.        ],
       [2.38629436, 0.        , 0.        ],
       [2.09861229, 3.81425927, 0.        ],
       [2.09861229, 0.        , 3.12774716]])
[25]:
# "l2" norm of each row of the raw tf-idf weights, repeated once per column
# (taken from the array shape rather than a hard-coded 3) so it can be
# divided element-wise.
row_norm = np.tile(
    np.linalg.norm(tf_idf_raw, axis=1, keepdims=True),
    (1, tf_idf_raw.shape[1]),
)
row_norm
[25]:
array([[2.79583314, 2.79583314, 2.79583314],
       [1.69314718, 1.69314718, 1.69314718],
       [2.09861229, 2.09861229, 2.09861229],
       [2.38629436, 2.38629436, 2.38629436],
       [4.35347531, 4.35347531, 4.35347531],
       [3.76656022, 3.76656022, 3.76656022]])
[26]:
# Normalize each row of raw weights by its "l2" norm.
tf_idf = tf_idf_raw / row_norm
tf_idf
[26]:
array([[0.75062144, 0.        , 0.66073252],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.48205448, 0.87614124, 0.        ],
       [0.55716945, 0.        , 0.83039882]])
[27]:
#
# Check: scikit-learn with the same settings (l2 norm, smoothed idf,
# sublinear tf)
#
TfidfTransformer(
    norm="l2",
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
).fit_transform(tf).toarray()
[27]:
array([[0.75062144, 0.        , 0.66073252],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.48205448, 0.87614124, 0.        ],
       [0.55716945, 0.        , 0.83039882]])