Sentence segmentation
30 min | Last modified: December 9, 2020
Text Analytics with Python
[1]:
import nltk
nltk.download('treebank')
nltk.download('punkt')
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data] Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[1]:
True
[2]:
##
## Load the test sentences
##
sents = nltk.corpus.treebank_raw.sents()
##
## tokens holds the word forms (lexemes);
## boundaries holds the index of the last token of each sentence.
##
tokens = []
boundaries = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset - 1)
##
## Boundaries:
##
## {1,
## 90116,
## 16389,
## 40968,
## 81929,
## 24587,
## 16396,
## 65548,
## ...}
##
tokens[:40]
[2]:
['.',
'START',
'Pierre',
'Vinken',
',',
'61',
'years',
'old',
',',
'will',
'join',
'the',
'board',
'as',
'a',
'nonexecutive',
'director',
'Nov',
'.',
'29',
'.',
'Mr',
'.',
'Vinken',
'is',
'chairman',
'of',
'Elsevier',
'N',
'.',
'V',
'.,',
'the',
'Dutch',
'publishing',
'group',
'.',
'.',
'START',
'Rudolph']
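Since boundaries stores the index of the last token of each sentence, a quick spot check (a sketch, not part of the original notebook) is to look at the tokens sitting at the first few boundary positions; most of them should be sentence-final punctuation:

##
## Tokens at the first boundary positions (sketch):
## most should be '.' or another sentence-final mark.
##
[tokens[b] for b in sorted(boundaries)[:10]]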
[3]:
##
## Define the features and compute them
##
def punct_features(tokens, i):
    return {
        "next-word-capitalized": tokens[i + 1][0].isupper(),
        "prev-word": tokens[i - 1].lower(),
        "punct": tokens[i],
        "prev-word-is-one-char": len(tokens[i - 1]) == 1,
    }
##
## Call the function only at the tokens ".?!"
##
featuresets = [
    (punct_features(tokens, i), (i in boundaries))
    for i in range(1, len(tokens) - 1)
    if tokens[i] in ".?!"
]
featuresets[0:5]
[3]:
[({'next-word-capitalized': False,
   'prev-word': 'nov',
   'punct': '.',
   'prev-word-is-one-char': False},
  False),
 ({'next-word-capitalized': True,
   'prev-word': '29',
   'punct': '.',
   'prev-word-is-one-char': False},
  True),
 ({'next-word-capitalized': True,
   'prev-word': 'mr',
   'punct': '.',
   'prev-word-is-one-char': False},
  False),
 ({'next-word-capitalized': True,
   'prev-word': 'n',
   'punct': '.',
   'prev-word-is-one-char': True},
  False),
 ({'next-word-capitalized': False,
   'prev-word': 'group',
   'punct': '.',
   'prev-word-is-one-char': False},
  True)]
[4]:
##
## Train and evaluate the classifier
##
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
[4]:
0.936026936026936
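As an optional follow-up (not in the original notebook), NLTK's NaiveBayesClassifier exposes show_most_informative_features, which prints the feature values that most strongly separate boundary from non-boundary punctuation:

##
## Optional: inspect the most informative features
##
classifier.show_most_informative_features(5)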
[5]:
##
## Function that applies the classifier
##
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        # The guard i < len(words) - 1 avoids an IndexError in
        # punct_features (which reads tokens[i + 1]) when the text
        # ends with '.?!'; the final sentence is emitted below anyway.
        if (
            word in ".?!"
            and i < len(words) - 1
            and classifier.classify(punct_features(words, i))
        ):
            sents.append(words[start : i + 1])
            start = i + 1
    if start < len(words):
        sents.append(words[start:])
    return sents
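A minimal usage sketch (the sample token list is invented for illustration): with a well-trained model, the '.' after the abbreviation 'Mr' should not be treated as a boundary, while the sentence-final periods should.

##
## Usage sketch with a hand-built token list (hypothetical input)
##
words = ['Mr', '.', 'Smith', 'arrived', '.', 'He', 'sat', 'down', '.']
segment_sentences(words)
# Expected split: ['Mr', '.', 'Smith', 'arrived', '.'] and
# ['He', 'sat', 'down', '.']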