{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Creating a concordance with TokenHandler\n",
    "\n",
    "The `type2toks` attribute of the `nephosem.TokenHandler` class is a dictionary with type names as keys and `nephosem.TypeNode` objects as values.\n",
    "The `TypeNode` objects have a `tokens` attribute, which is a list of `nephosem.TokenNode` objects with information on each collected token. From them, we can create a concordance with a function like `tokenConcordance()` below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "nephosemdir = \"../../nephosem/\"\n",
    "sys.path.append(nephosemdir)\n",
    "mydir = \"./\"\n",
    "from nephosem import ConfigLoader, Vocab, TokenHandler\n",
    "from nephosem.utils import save_concordance\n",
    "conf = ConfigLoader()\n",
    "settings = conf.update_config('config.ini')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Collect tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = Vocab({'girl/N' : 0}) # dummy query just for illustration\n",
    "# alternatively, if you already have a vocabulary, vocab.subvocab(['girl/N'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING: Not provide the temporary path!\n",
      "WARNING: Use the default tmp directory: '~/tmp'!\n",
      "Scanning tokens of queries in corpus...\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[21, 39]                   be/V  what/W  that/I  a/D  ,/,  ask/V  and/C  ...\n",
       "girl/N/StanfDepSents.1/3   NaN   NaN     NaN     NaN  NaN  NaN    NaN    ...\n",
       "girl/N/StanfDepSents.1/13  NaN   NaN     NaN     NaN  NaN  NaN    NaN    ...\n",
       "girl/N/StanfDepSents.1/20  NaN   NaN     NaN     NaN  NaN  NaN    NaN    ...\n",
       "girl/N/StanfDepSents.2/29  -4    NaN     NaN     NaN  NaN  NaN    NaN    ...\n",
       "girl/N/StanfDepSents.8/3   NaN   NaN     NaN     NaN  NaN  NaN    NaN    ...\n",
       "girl/N/StanfDepSents.8/15  NaN   NaN     NaN     NaN  NaN  NaN    NaN    ...\n",
       "girl/N/StanfDepSents.8/25  NaN   NaN     NaN     NaN  NaN  NaN    -2     ...\n",
       "...                        ...   ...     ...     ...  ...  ...    ...    ..."
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokhan = TokenHandler(query, settings=settings)\n",
    "tokens = tokhan.retrieve_tokens()\n",
    "tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "outputfile = 'output/concordance.tsv'\n",
    "save_concordance(outputfile, tokhan.type2toks, colloc_fmt='word')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read concordance\n",
    "\n",
    "`nephosem.utils.save_concordance()` directly stores the concordance as a tab-separated dataframe in `outputfile`, without headers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>token_id</th>\n",
       "      <th>left</th>\n",
       "      <th>target</th>\n",
       "      <th>right</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>girl/N/StanfDepSents.1/3</td>\n",
       "      <td>The</td>\n",
       "      <td>girl</td>\n",
       "      <td>looks healthy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>girl/N/StanfDepSents.1/13</td>\n",
       "      <td>boy looks at the</td>\n",
       "      <td>girl</td>\n",
       "      <td>as she eats</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>girl/N/StanfDepSents.1/20</td>\n",
       "      <td>The</td>\n",
       "      <td>girl</td>\n",
       "      <td>eats less healthy food</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>girl/N/StanfDepSents.2/29</td>\n",
       "      <td>are eaten by the</td>\n",
       "      <td>girl</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>girl/N/StanfDepSents.8/3</td>\n",
       "      <td>The</td>\n",
       "      <td>girl</td>\n",
       "      <td>sat on the apple</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>girl/N/StanfDepSents.8/15</td>\n",
       "      <td>boy looked at the</td>\n",
       "      <td>girl</td>\n",
       "      <td>'s apple</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>girl/N/StanfDepSents.8/25</td>\n",
       "      <td>the boys and the</td>\n",
       "      <td>girls</td>\n",
       "      <td>eat apples</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>girl/N/StanfDepSents.4/7</td>\n",
       "      <td>boy says that the</td>\n",
       "      <td>girl</td>\n",
       "      <td>should eat the apple</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>girl/N/StanfDepSents.4/15</td>\n",
       "      <td>The</td>\n",
       "      <td>girl</td>\n",
       "      <td>eats the apple that</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>girl/N/StanfDepSents.9/14</td>\n",
       "      <td>The older</td>\n",
       "      <td>girl</td>\n",
       "      <td>looks at a boy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>girl/N/StanfDepSents.5/19</td>\n",
       "      <td>What the</td>\n",
       "      <td>girl</td>\n",
       "      <td>eats was given by</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>girl/N/StanfDepSents.11/3</td>\n",
       "      <td>The</td>\n",
       "      <td>girl</td>\n",
       "      <td>looks at the boy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>girl/N/StanfDepSents.11/19</td>\n",
       "      <td>the apple which the</td>\n",
       "      <td>girl</td>\n",
       "      <td>gave him</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>girl/N/StanfDepSents.11/28</td>\n",
       "      <td>This year , the</td>\n",
       "      <td>girl</td>\n",
       "      <td>looked at a boy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>girl/N/StanfDepSents.3/21</td>\n",
       "      <td>The boy and the</td>\n",
       "      <td>girl</td>\n",
       "      <td>eat a healthy and</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>girl/N/StanfDepSents.6/6</td>\n",
       "      <td>The boy gives the</td>\n",
       "      <td>girl</td>\n",
       "      <td>a tasty healthy apple</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>girl/N/StanfDepSents.6/21</td>\n",
       "      <td>The</td>\n",
       "      <td>girl</td>\n",
       "      <td>does n't eat</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>girl/N/StanfDepSents.10/13</td>\n",
       "      <td>The</td>\n",
       "      <td>girl</td>\n",
       "      <td>sits down</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>girl/N/StanfDepSents.10/19</td>\n",
       "      <td>The</td>\n",
       "      <td>girl</td>\n",
       "      <td>eats about ten apples</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>girl/N/StanfDepSents.7/7</td>\n",
       "      <td>old boy gives the</td>\n",
       "      <td>girl</td>\n",
       "      <td>a baby apple</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>girl/N/StanfDepSents.7/25</td>\n",
       "      <td>The boy asked the</td>\n",
       "      <td>girl</td>\n",
       "      <td>about eating apples</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      token_id                 left target  \\\n",
       "0     girl/N/StanfDepSents.1/3                  The   girl   \n",
       "1    girl/N/StanfDepSents.1/13     boy looks at the   girl   \n",
       "2    girl/N/StanfDepSents.1/20                  The   girl   \n",
       "3    girl/N/StanfDepSents.2/29     are eaten by the   girl   \n",
       "4     girl/N/StanfDepSents.8/3                  The   girl   \n",
       "5    girl/N/StanfDepSents.8/15    boy looked at the   girl   \n",
       "6    girl/N/StanfDepSents.8/25     the boys and the  girls   \n",
       "7     girl/N/StanfDepSents.4/7    boy says that the   girl   \n",
       "8    girl/N/StanfDepSents.4/15                  The   girl   \n",
       "9    girl/N/StanfDepSents.9/14            The older   girl   \n",
       "10   girl/N/StanfDepSents.5/19             What the   girl   \n",
       "11   girl/N/StanfDepSents.11/3                  The   girl   \n",
       "12  girl/N/StanfDepSents.11/19  the apple which the   girl   \n",
       "13  girl/N/StanfDepSents.11/28      This year , the   girl   \n",
       "14   girl/N/StanfDepSents.3/21      The boy and the   girl   \n",
       "15    girl/N/StanfDepSents.6/6    The boy gives the   girl   \n",
       "16   girl/N/StanfDepSents.6/21                  The   girl   \n",
       "17  girl/N/StanfDepSents.10/13                  The   girl   \n",
       "18  girl/N/StanfDepSents.10/19                  The   girl   \n",
       "19    girl/N/StanfDepSents.7/7    old boy gives the   girl   \n",
       "20   girl/N/StanfDepSents.7/25    The boy asked the   girl   \n",
       "\n",
       "                     right  \n",
       "0            looks healthy  \n",
       "1              as she eats  \n",
       "2   eats less healthy food  \n",
       "3                      NaN  \n",
       "4         sat on the apple  \n",
       "5                 's apple  \n",
       "6               eat apples  \n",
       "7     should eat the apple  \n",
       "8      eats the apple that  \n",
       "9           looks at a boy  \n",
       "10       eats was given by  \n",
       "11        looks at the boy  \n",
       "12                gave him  \n",
       "13         looked at a boy  \n",
       "14       eat a healthy and  \n",
       "15   a tasty healthy apple  \n",
       "16            does n't eat  \n",
       "17               sits down  \n",
       "18   eats about ten apples  \n",
       "19            a baby apple  \n",
       "20     about eating apples  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "pd.read_csv(outputfile, sep = '\\t', names = ['token_id', 'left', 'target', 'right'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}