Sentence segmentation

  • 30 min | Last modified: December 9, 2020

http://www.nltk.org/book/

Text Analytics with Python

[1]:
import nltk
nltk.download('treebank')
nltk.download('punkt')
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[1]:
True
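
The punkt model downloaded above powers NLTK's off-the-shelf sentence segmenter. As a point of reference for the classifier built below, it can be called directly; this is a minimal sketch and the sample sentence is made up:

import nltk

## Off-the-shelf punkt-based segmentation, for comparison with the
## classifier trained in the following cells.
nltk.sent_tokenize("Mr. Vinken is chairman of Elsevier N.V. He lives in New York.")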
[2]:
##
## Load the example sentences from the Penn Treebank sample
##
sents = nltk.corpus.treebank_raw.sents()

##
## tokens is the flat list of lexemes; boundaries is the set of
## token indexes at which each sentence ends (see the sanity
## check after this cell's output).
##
tokens = []
boundaries = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

##
## boundaries is an (unordered) set of indexes, e.g.:
##
## {1,
##  90116,
##  16389,
##  40968,
##  81929,
##  24587,
##  16396,
##  65548,
##  ...}
##
tokens[:40]
[2]:
['.',
 'START',
 'Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov',
 '.',
 '29',
 '.',
 'Mr',
 '.',
 'Vinken',
 'is',
 'chairman',
 'of',
 'Elsevier',
 'N',
 '.',
 'V',
 '.,',
 'the',
 'Dutch',
 'publishing',
 'group',
 '.',
 '.',
 'START',
 'Rudolph']
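
As a quick sanity check (assuming the previous cell has been run), the boundary indexes can be verified against the token list; the number of boundaries should match the number of sentences unless the corpus contains empty sentences:

## Consistency checks on the variables built above.
print(len(tokens), len(sents), len(boundaries))
print(all(0 <= b < len(tokens) for b in boundaries))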
[3]:
##
## Define the features and compute them over the corpus
##
def punct_features(tokens, i):
    return {
        "next-word-capitalized": tokens[i + 1][0].isupper(),
        "prev-word": tokens[i - 1].lower(),
        "punct": tokens[i],
        "prev-word-is-one-char": len(tokens[i - 1]) == 1,
    }

##
## Apply the feature extractor only to tokens that are ".", "?" or "!"
##
featuresets = [
    (punct_features(tokens, i), (i in boundaries))
    for i in range(1, len(tokens) - 1)
    if tokens[i] in ".?!"
]

featuresets[0:5]
[3]:
[({'next-word-capitalized': False,
   'prev-word': 'nov',
   'punct': '.',
   'prev-word-is-one-char': False},
  False),
 ({'next-word-capitalized': True,
   'prev-word': '29',
   'punct': '.',
   'prev-word-is-one-char': False},
  True),
 ({'next-word-capitalized': True,
   'prev-word': 'mr',
   'punct': '.',
   'prev-word-is-one-char': False},
  False),
 ({'next-word-capitalized': True,
   'prev-word': 'n',
   'punct': '.',
   'prev-word-is-one-char': True},
  False),
 ({'next-word-capitalized': False,
   'prev-word': 'group',
   'punct': '.',
   'prev-word-is-one-char': False},
  True)]
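
Before training, it can be useful to see how the labels are distributed, i.e. how many candidate punctuation marks are true sentence boundaries versus internal punctuation such as abbreviations. A small sketch using the featuresets built above:

from collections import Counter

## Count True (sentence boundary) vs. False (internal punctuation) labels.
Counter(label for features, label in featuresets)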
[4]:
##
## Train and evaluate the classifier
##
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
[4]:
0.936026936026936
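
To see which features drive the decisions, NLTK's Naive Bayes classifier can list its most informative features (output omitted here):

## Inspect the most informative features learned by the classifier.
classifier.show_most_informative_features(5)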
[5]:
##
## Sentence segmenter that uses the trained classifier: split the
## token list wherever a candidate punctuation mark is classified
## as a sentence boundary.
##
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        ## Skip the last token: punct_features needs to look at the
        ## following word.
        if (
            word in ".?!"
            and i < len(words) - 1
            and classifier.classify(punct_features(words, i))
        ):
            sents.append(words[start:i + 1])
            start = i + 1
    if start < len(words):
        sents.append(words[start:])
    return sents
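
A minimal usage sketch with a made-up word list; note that the input must follow the same convention as the training data, with punctuation marks as separate tokens:

## Example input tokenized like the treebank_raw corpus
## (punctuation as separate tokens).
words = ['Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', '.',
         'He', 'lives', 'in', 'New', 'York', '.']
segment_sentences(words)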