{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Segmentación de sentencias\n",
    "\n",
    "* *30 min* | Última modificación: Diciembre 9, 2020"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "http://www.nltk.org/book/\n",
    "\n",
    "Text Analytics with Python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package treebank to /root/nltk_data...\n",
      "[nltk_data]   Package treebank is already up-to-date!\n",
      "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "nltk.download('treebank')\n",
    "nltk.download('punkt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['.',\n",
       " 'START',\n",
       " 'Pierre',\n",
       " 'Vinken',\n",
       " ',',\n",
       " '61',\n",
       " 'years',\n",
       " 'old',\n",
       " ',',\n",
       " 'will',\n",
       " 'join',\n",
       " 'the',\n",
       " 'board',\n",
       " 'as',\n",
       " 'a',\n",
       " 'nonexecutive',\n",
       " 'director',\n",
       " 'Nov',\n",
       " '.',\n",
       " '29',\n",
       " '.',\n",
       " 'Mr',\n",
       " '.',\n",
       " 'Vinken',\n",
       " 'is',\n",
       " 'chairman',\n",
       " 'of',\n",
       " 'Elsevier',\n",
       " 'N',\n",
       " '.',\n",
       " 'V',\n",
       " '.,',\n",
       " 'the',\n",
       " 'Dutch',\n",
       " 'publishing',\n",
       " 'group',\n",
       " '.',\n",
       " '.',\n",
       " 'START',\n",
       " 'Rudolph']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "##\n",
    "## Carga las sentencias de prueba\n",
    "##\n",
    "sents = nltk.corpus.treebank_raw.sents()\n",
    "\n",
    "##\n",
    "## Los tokens son los textos (lexemas).\n",
    "## Boundaries define el vector donde se ubican\n",
    "##\n",
    "tokens = []\n",
    "boundaries = set()\n",
    "offset = 0\n",
    "for sent in sents:\n",
    "    tokens.extend(sent)\n",
    "    offset += len(sent)\n",
    "    boundaries.add(offset-1)\n",
    "    \n",
    "##\n",
    "## Boundaries:\n",
    "##\n",
    "## {1,\n",
    "##  90116,\n",
    "##  16389,\n",
    "##  40968,\n",
    "##  81929,\n",
    "##  24587,\n",
    "##  16396,\n",
    "##  65548,\n",
    "##  ...}\n",
    "##\n",
    "tokens[:40]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[({'next-word-capitalized': False,\n",
       "   'prev-word': 'nov',\n",
       "   'punct': '.',\n",
       "   'prev-word-is-one-char': False},\n",
       "  False),\n",
       " ({'next-word-capitalized': True,\n",
       "   'prev-word': '29',\n",
       "   'punct': '.',\n",
       "   'prev-word-is-one-char': False},\n",
       "  True),\n",
       " ({'next-word-capitalized': True,\n",
       "   'prev-word': 'mr',\n",
       "   'punct': '.',\n",
       "   'prev-word-is-one-char': False},\n",
       "  False),\n",
       " ({'next-word-capitalized': True,\n",
       "   'prev-word': 'n',\n",
       "   'punct': '.',\n",
       "   'prev-word-is-one-char': True},\n",
       "  False),\n",
       " ({'next-word-capitalized': False,\n",
       "   'prev-word': 'group',\n",
       "   'punct': '.',\n",
       "   'prev-word-is-one-char': False},\n",
       "  True)]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "##\n",
    "## Define las características y las computa\n",
    "##\n",
    "def punct_features(tokens, i):\n",
    "    return {\n",
    "        \"next-word-capitalized\": tokens[i + 1][0].isupper(),\n",
    "        \"prev-word\": tokens[i - 1].lower(),\n",
    "        \"punct\": tokens[i],\n",
    "        \"prev-word-is-one-char\": len(tokens[i - 1]) == 1,\n",
    "    }\n",
    "\n",
    "##\n",
    "## Llama la función únicamente cuando encuenetra \".?!\"\n",
    "##\n",
    "featuresets = [\n",
    "    (punct_features(tokens, i), (i in boundaries))\n",
    "    for i in range(1, len(tokens) - 1)\n",
    "    if tokens[i] in \".?!\"\n",
    "]\n",
    "\n",
    "featuresets[0:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.936026936026936"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "##\n",
    "## Entrenamiento y evaluación del clasificador\n",
    "##\n",
    "size = int(len(featuresets) * 0.1)\n",
    "train_set, test_set = featuresets[size:], featuresets[:size]\n",
    "classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
    "nltk.classify.accuracy(classifier, test_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "##\n",
    "## Función para usar el clasificador\n",
    "##\n",
    "def segment_sentences(words):\n",
    "    start = 0\n",
    "    sents = []\n",
    "    for i, word in enumerate(words):\n",
    "        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:\n",
    "            sents.append(words[start:i+1])\n",
    "            start = i+1\n",
    "    if start < len(words):\n",
    "        sents.append(words[start:])\n",
    "    return sents"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}