diff --git a/.gitignore b/.gitignore
index d32d3f8..b34512a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@
 **/.ipynb_checkpoints
 /target
 /test-data/generator/build/
+/wordlist/venv
diff --git a/docs/requirements.txt b/docs/requirements.txt
deleted file mode 100644
index 63c7116..0000000
--- a/docs/requirements.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-click==8.1.3
-defusedxml==0.7.1
-joblib==1.2.0
-nltk==3.8.1
-numpy==1.24.2
-odfpy==1.4.1
-pandas==1.5.3
-python-dateutil==2.8.2
-pytz==2022.7.1
-regex==2022.10.31
-six==1.16.0
-tqdm==4.64.1
diff --git a/docs/wordlist-new.ipynb b/docs/wordlist-new.ipynb
index e2b5ba9..2928327 100644
--- a/docs/wordlist-new.ipynb
+++ b/docs/wordlist-new.ipynb
@@ -12,21 +12,34 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
-      "Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
-      "Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
+      "Collecting nltk\n",
+      "  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hCollecting odfpy\n",
+      "  Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hCollecting regex>=2021.8.3\n",
+      "  Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m29.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
       "Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
-      "Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
       "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
-      "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
+      "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
+      "Building wheels for collected packages: odfpy\n",
+      "  Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=5bfe9fcd7c590666411d404ea3e4ef0f704c9e62ff6621deb4ab09c84bec082a\n",
+      "  Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
+      "Successfully built odfpy\n",
+      "Installing collected packages: regex, odfpy, nltk\n",
+      "Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
-      "[nltk_data]   Package wordnet is already up-to-date!\n"
+      "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
      ]
     }
    ],
@@ -63,6 +76,8 @@
     "                return ret\n",
     "            ret.append(str(l).lower())\n",
     "        return ret\n",
+    "\n",
+    "\n",
     "    \n",
     "WORDLIST_SIZE = 8192 + 3\n",
     "lemmatizer = WordNetLemmatizer()\n",
diff --git a/docs/wordlist-new2.ipynb b/docs/wordlist-new2.ipynb
new file mode 100644
index 0000000..5a05a82
--- /dev/null
+++ b/docs/wordlist-new2.ipynb
@@ -0,0 +1,112 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "991a711f-be98-4aae-a657-84b065449916",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: spacy in /opt/conda/lib/python3.10/site-packages (3.5.0)\n",
+      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.4)\n",
+      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.3.0)\n",
+      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.12)\n",
+      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.10.5)\n",
+      "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.1.2)\n",
+      "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.7.0)\n",
+      "Requirement already satisfied: numpy>=1.15.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.23.5)\n",
+      "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (6.3.0)\n",
+      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.8)\n",
+      "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from spacy) (67.3.2)\n",
+      "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (23.0)\n",
+      "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.1.1)\n",
+      "Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (8.1.7)\n",
+      "Requirement already satisfied: pathy>=0.10.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.10.1)\n",
+      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.8)\n",
+      "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.4.6)\n",
+      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (4.64.1)\n",
+      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.7)\n",
+      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.9)\n",
+      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.28.2)\n",
+      "Requirement already satisfied: typing-extensions>=4.2.0 in /opt/conda/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy) (4.4.0)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2022.12.7)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.1.1)\n",
+      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.14)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)\n",
+      "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.7.9)\n",
+      "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.0.4)\n",
+      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/conda/lib/python3.10/site-packages (from typer<0.8.0,>=0.3.0->spacy) (8.1.3)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->spacy) (2.1.2)\n",
+      "Collecting en-core-web-trf==3.5.0\n",
+      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl (460.3 MB)\n",
+      "\u001b[2K     \u001b[91m━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.1/460.3 MB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:14\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "try:\n",
+    "    _initialized\n",
+    "except NameError:\n",
+    "    !pip install spacy\n",
+    "    !python -m spacy download en_core_web_trf\n",
+    "    import spacy\n",
+    "    \n",
+    "    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
+    "    \n",
+    "    _initialized=True\n",
+    "    \n",
+    "import pandas as pd\n",
+    "import gzip\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6b93818f-c54a-4c88-9968-df4244b7c6f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import spacy\n",
+    "\n",
+    "# Load the en_core_web_trf pipeline by its full name ('en' shortcut names were removed in spaCy 3)\n",
+    "nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
+    "\n",
+    "sentence = \"The striped bats are hanging on their feet for best\"\n",
+    "\n",
+    "# Parse the sentence using the loaded pipeline object `nlp`\n",
+    "doc = nlp(sentence)\n",
+    "\n",
+    "# Extract the lemma for each token and join\n",
+    "\" \".join([token.lemma_ for token in doc])\n",
+    "#> spaCy 2 sample output: 'the strip bat be hang on -PRON- foot for good' (spaCy 3 models may differ)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/wordlist/01-lemmatized-words.csv b/wordlist/01-lemmatized-words.csv
new file mode 100644
index 0000000..d95343b
--- /dev/null
+++ b/wordlist/01-lemmatized-words.csv
@@ -0,0 +1 @@
+word,lemmatized_word
diff --git a/wordlist/01-lemmatized-words.py b/wordlist/01-lemmatized-words.py
new file mode 100755
index 0000000..fffa46b
--- /dev/null
+++ b/wordlist/01-lemmatized-words.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+print("Step 1")
+
+
+try:
+    _initialized
+except:
+    # !pip install spacy
+    # !python -m spacy download en_core_web_trf
+    import spacy
+    from tqdm import tqdm
+
+    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
+
+    _initialized=True
+
+import pandas as pd
+import gzip
+import re
+
+
+print("Step 2")
+
+
+def get_lines(filename):
+    with gzip.open(filename, 'r') as f:
+        ret = []
+        for l in f:
+            if len(ret) > 30_000:
+                return ret
+            ret.append(str(l).lower())
+        return ret
+
+
+
+WORDLIST_SIZE = 8192 + 3  # not referenced again in this script
+word_re = re.compile(r"^[A-Za-z]+$")  # ASCII letters only; used below to filter the word list
+
+
+print("Step 3")
+
+
+annotated_words=pd.read_excel("annotated_words.ods")
+
+excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
+excluded_words[0:10]
+
+custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))
+
+custom_maps = [
+    (m[1]["word"].lower(), mapping.lower())
+    for m in custom_maps.iterrows()
+    for mapping in m[1]["maps_to"]
+]
+custom_maps
+
+
+print("Step 4")
+
+
+# Start parsing the wordlist
+all_words = get_lines("00-frequency-all.txt.gz")
+
+# Delete header line
+all_words = all_words[1:]
+
+# Get only the word (fixed width)
+all_words = [w[13:36].strip() for w in all_words]
+
+# Remove special characters
+all_words = [w for w in all_words if word_re.search(w)]
+
+# Remove all removed words
+all_words = [w for w in all_words if w not in excluded_words]
+
+# Add all custom mappings
+for m in list(sum(custom_maps, ())):
+    if m[0] not in all_words:
+        all_words.append(m[0])
+    if m[1] not in all_words:
+        all_words.append(m[1])
+
+
+print("Step 5")
+
+# Lemmatize all words (plural -> singular)
+lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
+
+with open("01-lemmatized-words.csv", "w") as f:
+    f.write("word,lemmatized_word\n")
+
+    for w in lemmatize_mappings:
+        f.write(f"{w[0]},{w[1]}")
+        f.write("\n")
diff --git a/docs/annotated_words.ods b/wordlist/annotated_words.ods
similarity index 100%
rename from docs/annotated_words.ods
rename to wordlist/annotated_words.ods
diff --git a/wordlist/requirements.txt b/wordlist/requirements.txt
new file mode 100644
index 0000000..f99d8bb
--- /dev/null
+++ b/wordlist/requirements.txt
@@ -0,0 +1,124 @@
+anyio==3.6.2
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+arrow==1.2.3
+asttokens==2.2.1
+attrs==22.2.0
+backcall==0.2.0
+beautifulsoup4==4.11.2
+bleach==6.0.0
+blis==0.7.9
+catalogue==2.0.8
+certifi==2022.12.7
+cffi==1.15.1
+charset-normalizer==3.0.1
+click==8.1.3
+comm==0.1.2
+confection==0.0.4
+cymem==2.0.7
+debugpy==1.6.6
+decorator==5.1.1
+defusedxml==0.7.1
+en-core-web-trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl
+executing==1.2.0
+fastjsonschema==2.16.3
+filelock==3.9.0
+fqdn==1.5.1
+huggingface-hub==0.12.1
+idna==3.4
+ipykernel==6.21.2
+ipython==8.11.0
+ipython-genutils==0.2.0
+ipywidgets==8.0.4
+isoduration==20.11.0
+jedi==0.18.2
+Jinja2==3.1.2
+jsonpointer==2.3
+jsonschema==4.17.3
+jupyter==1.0.0
+jupyter-console==6.6.2
+jupyter-events==0.6.3
+jupyter_client==8.0.3
+jupyter_core==5.2.0
+jupyter_server==2.3.0
+jupyter_server_terminals==0.4.4
+jupyterlab-pygments==0.2.2
+jupyterlab-widgets==3.0.5
+langcodes==3.3.0
+MarkupSafe==2.1.2
+matplotlib-inline==0.1.6
+mistune==2.0.5
+murmurhash==1.0.9
+nbclassic==0.5.2
+nbclient==0.7.2
+nbconvert==7.2.9
+nbformat==5.7.3
+nest-asyncio==1.5.6
+notebook==6.5.2
+notebook_shim==0.2.2
+numpy==1.24.2
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cudnn-cu11==8.5.0.96
+odfpy==1.4.1
+packaging==23.0
+pandas==1.5.3
+pandocfilters==1.5.0
+parso==0.8.3
+pathy==0.10.1
+pexpect==4.8.0
+pickleshare==0.7.5
+platformdirs==3.0.0
+preshed==3.0.8
+prometheus-client==0.16.0
+prompt-toolkit==3.0.38
+psutil==5.9.4
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pycparser==2.21
+pydantic==1.10.5
+Pygments==2.14.0
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+python-json-logger==2.0.7
+pytz==2022.7.1
+PyYAML==6.0
+pyzmq==25.0.0
+qtconsole==5.4.0
+QtPy==2.3.0
+regex==2022.10.31
+requests==2.28.2
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+Send2Trash==1.8.0
+six==1.16.0
+smart-open==6.3.0
+sniffio==1.3.0
+soupsieve==2.4
+spacy==3.5.0
+spacy-alignments==0.9.0
+spacy-legacy==3.0.12
+spacy-loggers==1.0.4
+spacy-transformers==1.2.2
+srsly==2.4.6
+stack-data==0.6.2
+terminado==0.17.1
+thinc==8.1.7
+tinycss2==1.2.1
+tokenizers==0.13.2
+torch==1.13.1
+tornado==6.2
+tqdm==4.64.1
+traitlets==5.9.0
+transformers==4.26.1
+typer==0.7.0
+typing_extensions==4.5.0
+uri-template==1.2.0
+urllib3==1.26.14
+wasabi==1.1.1
+wcwidth==0.2.6
+webcolors==1.12
+webencodings==0.5.1
+websocket-client==1.5.1
+widgetsnbextension==4.0.5
diff --git a/wordlist/wordlist-new.ipynb b/wordlist/wordlist-new.ipynb
new file mode 100644
index 0000000..9978270
--- /dev/null
+++ b/wordlist/wordlist-new.ipynb
@@ -0,0 +1,2657 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "0b00342f-7b19-49cc-bc6c-21019f8cc7dc",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting nltk\n",
+      "  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hCollecting odfpy\n",
+      "  Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hCollecting regex>=2021.8.3\n",
+      "  Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m29.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
+      "Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
+      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
+      "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
+      "Building wheels for collected packages: odfpy\n",
+      "  Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=5bfe9fcd7c590666411d404ea3e4ef0f704c9e62ff6621deb4ab09c84bec082a\n",
+      "  Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
+      "Successfully built odfpy\n",
+      "Installing collected packages: regex, odfpy, nltk\n",
+      "Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
+     ]
+    }
+   ],
+   "source": [
+    "try:\n",
+    "    _initialized\n",
+    "except:\n",
+    "    !pip install nltk odfpy\n",
+    "    import nltk\n",
+    "    \n",
+    "    nltk.download(\"wordnet\")\n",
+    "    _initialized=True\n",
+    "    \n",
+    "from nltk.stem.wordnet import WordNetLemmatizer\n",
+    "import pandas as pd\n",
+    "import gzip\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "985883de-8049-4f81-acd9-34e1abcd4070",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def get_lines(filename):\n",
+    "    with gzip.open(filename, 'r') as f:\n",
+    "        ret = []\n",
+    "        for l in f:\n",
+    "            if len(ret) > 30_000:\n",
+    "                return ret\n",
+    "            ret.append(str(l).lower())\n",
+    "        return ret\n",
+    "\n",
+    "\n",
+    "    \n",
+    "WORDLIST_SIZE = 8192 + 3\n",
+    "lemmatizer = WordNetLemmatizer()\n",
+    "word_re = re.compile(r\"^[A-Za-z]+$\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "926d0d84-0d7e-4939-b87f-1a170f870a8f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "annotated_words=pd.read_excel(\"annotated_words.ods\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "8b0d26e4-051c-4669-b566-bbd5ddbbe02b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['a', 'as', 'it', 'was', 'i', 'has', 'so', 'its', 's', 'p']"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n",
+    "excluded_words[0:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "2eea14b2-82bf-4353-8982-76a6c7f46d22",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('be', 'bee'),\n",
+       " ('by', 'bye'),\n",
+       " ('per', 'purr'),\n",
+       " ('sense', 'cent'),\n",
+       " ('died', 'dyed'),\n",
+       " ('cents', 'sense'),\n",
+       " ('yellow', 'hello'),\n",
+       " ('corps', 'core'),\n",
+       " ('ore', 'oar'),\n",
+       " ('ore', ' or'),\n",
+       " ('vary', 'very'),\n",
+       " ('com', 'calm'),\n",
+       " ('filing', 'filling'),\n",
+       " ('fax', 'facts'),\n",
+       " ('favour', 'favor'),\n",
+       " ('theatre', 'theater'),\n",
+       " ('par', 'parse'),\n",
+       " ('honour', 'honor'),\n",
+       " ('harry', 'hairy'),\n",
+       " ('brings', 'bring'),\n",
+       " ('organisation', 'organization'),\n",
+       " ('simultaneously', 'simultaneous'),\n",
+       " ('aluminum', 'aluminium'),\n",
+       " ('knight', 'night'),\n",
+       " ('electronics', 'electronic'),\n",
+       " ('senses', 'cent'),\n",
+       " ('organisations', 'organization'),\n",
+       " ('fortunately', 'fortunate'),\n",
+       " ('corp', 'core'),\n",
+       " ('chile', 'chilly'),\n",
+       " ('chile', ' chili'),\n",
+       " ('owe', 'oh'),\n",
+       " ('capitol', 'capital'),\n",
+       " ('weary', 'wary'),\n",
+       " ('berry', 'barry'),\n",
+       " ('lecturer', 'lecture'),\n",
+       " ('weigh', 'way'),\n",
+       " ('aluminium', 'aluminum'),\n",
+       " ('isle', 'aisle'),\n",
+       " ('boulder', 'bolder'),\n",
+       " ('blew', 'blue'),\n",
+       " ('reformed', 'reform'),\n",
+       " ('scent', 'cent'),\n",
+       " ('ads', 'adds'),\n",
+       " ('honours', 'honors'),\n",
+       " ('bot', 'bought'),\n",
+       " ('dew', 'do'),\n",
+       " ('dew', ' due'),\n",
+       " ('theatres', 'theater'),\n",
+       " ('thru', 'through'),\n",
+       " ('sensed', 'cent'),\n",
+       " ('monies', 'moneys'),\n",
+       " ('cue', 'queue'),\n",
+       " ('hairy', 'harry'),\n",
+       " ('weighs', 'way'),\n",
+       " ('hem', 'him'),\n",
+       " ('nun', 'none'),\n",
+       " ('organisational', 'organizational'),\n",
+       " ('grate', 'great'),\n",
+       " ('dessert', 'desert'),\n",
+       " ('aux', 'ox'),\n",
+       " ('rap', 'wrap'),\n",
+       " ('filings', 'filling'),\n",
+       " ('pars', 'parse'),\n",
+       " ('dazed', 'day'),\n",
+       " ('scents', 'cent'),\n",
+       " ('daze', 'day'),\n",
+       " ('four', 'for')]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n",
+    "\n",
+    "custom_maps = [\n",
+    "    (m[1][\"word\"].lower(), mapping.lower())\n",
+    "    for m in custom_maps.iterrows()\n",
+    "    for mapping in m[1][\"maps_to\"]\n",
+    "]\n",
+    "custom_maps"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "8bdfd108-bf43-4c0f-bc5c-f91925da753f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Start parsing the wordlist\n",
+    "all_words = get_lines(\"frequency-all.txt.gz\")\n",
+    "\n",
+    "# Delete header line\n",
+    "all_words = all_words[1:]\n",
+    "\n",
+    "# Get only the word (fixed width)\n",
+    "all_words = [w[13:36].strip() for w in all_words]\n",
+    "\n",
+    "# Remove special characters\n",
+    "all_words = [w for w in all_words if word_re.search(w)]\n",
+    "\n",
+    "# Remove all removed words\n",
+    "all_words = [w for w in all_words if w not in excluded_words]\n",
+    "\n",
+    "# Add both words of every custom mapping. Iterate the (word, mapping) pairs directly:\n",
+    "# flattening them with sum(..., ()) yields plain strings, so m[0]/m[1] were single characters.\n",
+    "for pair in custom_maps:\n",
+    "    for word in pair:\n",
+    "        if word not in all_words:\n",
+    "            all_words.append(word)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "e42f2b56-98b3-4465-95be-812d8584b511",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['the',\n",
+       " 'of',\n",
+       " 'and',\n",
+       " 'to',\n",
+       " 'in',\n",
+       " 'is',\n",
+       " 'that',\n",
+       " 'for',\n",
+       " 'be',\n",
+       " 'by',\n",
+       " 'with',\n",
+       " 'on',\n",
+       " 'not',\n",
+       " 'or',\n",
+       " 'this',\n",
+       " 'are',\n",
+       " 'at',\n",
+       " 'from',\n",
+       " 'he',\n",
+       " 'which',\n",
+       " 'his',\n",
+       " 'have',\n",
+       " 'an',\n",
+       " 'but',\n",
+       " 'you',\n",
+       " 'they',\n",
+       " 'were',\n",
+       " 'had',\n",
+       " 'we',\n",
+       " 'all',\n",
+       " 'one',\n",
+       " 'their',\n",
+       " 'been',\n",
+       " 'will',\n",
+       " 'there',\n",
+       " 'can',\n",
+       " 'if',\n",
+       " 'other',\n",
+       " 'would',\n",
+       " 'no',\n",
+       " 'her',\n",
+       " 'may',\n",
+       " 'more',\n",
+       " 'when',\n",
+       " 'who',\n",
+       " 'such',\n",
+       " 'these',\n",
+       " 'any',\n",
+       " 'she',\n",
+       " 'new',\n",
+       " 'time',\n",
+       " 'than',\n",
+       " 'do',\n",
+       " 'some',\n",
+       " 'what',\n",
+       " 'only',\n",
+       " 'into',\n",
+       " 'them',\n",
+       " 'two',\n",
+       " 'also',\n",
+       " 'about',\n",
+       " 'out',\n",
+       " 'him',\n",
+       " 'my',\n",
+       " 'said',\n",
+       " 'up',\n",
+       " 'our',\n",
+       " 'first',\n",
+       " 'should',\n",
+       " 'under',\n",
+       " 'made',\n",
+       " 'state',\n",
+       " 'see',\n",
+       " 'after',\n",
+       " 'could',\n",
+       " 'then',\n",
+       " 'me',\n",
+       " 'most',\n",
+       " 'over',\n",
+       " 'very',\n",
+       " 'your',\n",
+       " 'between',\n",
+       " 'where',\n",
+       " 'now',\n",
+       " 'shall',\n",
+       " 'work',\n",
+       " 'those',\n",
+       " 'same',\n",
+       " 'well',\n",
+       " 'each',\n",
+       " 'many',\n",
+       " 'being',\n",
+       " 'years',\n",
+       " 'did',\n",
+       " 'year',\n",
+       " 'through',\n",
+       " 'must',\n",
+       " 'upon',\n",
+       " 'before',\n",
+       " 'like',\n",
+       " 'use',\n",
+       " 'part',\n",
+       " 'general',\n",
+       " 'people',\n",
+       " 'because',\n",
+       " 'used',\n",
+       " 'how',\n",
+       " 'even',\n",
+       " 'much',\n",
+       " 'states',\n",
+       " 'during',\n",
+       " 'both',\n",
+       " 'case',\n",
+       " 'three',\n",
+       " 'number',\n",
+       " 'make',\n",
+       " 'per',\n",
+       " 'great',\n",
+       " 'act',\n",
+       " 'way',\n",
+       " 'life',\n",
+       " 'good',\n",
+       " 'day',\n",
+       " 'public',\n",
+       " 'man',\n",
+       " 'however',\n",
+       " 'system',\n",
+       " 'water',\n",
+       " 'without',\n",
+       " 'government',\n",
+       " 'while',\n",
+       " 'long',\n",
+       " 'order',\n",
+       " 'law',\n",
+       " 'section',\n",
+       " 'court',\n",
+       " 'high',\n",
+       " 'right',\n",
+       " 'own',\n",
+       " 'found',\n",
+       " 'united',\n",
+       " 'just',\n",
+       " 'here',\n",
+       " 'against',\n",
+       " 'world',\n",
+       " 'does',\n",
+       " 'company',\n",
+       " 'within',\n",
+       " 'given',\n",
+       " 'service',\n",
+       " 'house',\n",
+       " 'another',\n",
+       " 'power',\n",
+       " 'place',\n",
+       " 'know',\n",
+       " 'little',\n",
+       " 'down',\n",
+       " 'present',\n",
+       " 'every',\n",
+       " 'national',\n",
+       " 'back',\n",
+       " 'take',\n",
+       " 'information',\n",
+       " 'men',\n",
+       " 'since',\n",
+       " 'might',\n",
+       " 'small',\n",
+       " 'large',\n",
+       " 'school',\n",
+       " 'following',\n",
+       " 'still',\n",
+       " 'less',\n",
+       " 'last',\n",
+       " 'city',\n",
+       " 'second',\n",
+       " 'development',\n",
+       " 'different',\n",
+       " 'university',\n",
+       " 'old',\n",
+       " 'form',\n",
+       " 'point',\n",
+       " 'total',\n",
+       " 'data',\n",
+       " 'too',\n",
+       " 'committee',\n",
+       " 'report',\n",
+       " 'business',\n",
+       " 'think',\n",
+       " 'end',\n",
+       " 'get',\n",
+       " 'set',\n",
+       " 'research',\n",
+       " 'say',\n",
+       " 'come',\n",
+       " 'country',\n",
+       " 'never',\n",
+       " 'fact',\n",
+       " 'go',\n",
+       " 'control',\n",
+       " 'thus',\n",
+       " 'having',\n",
+       " 'value',\n",
+       " 'social',\n",
+       " 'department',\n",
+       " 'few',\n",
+       " 'above',\n",
+       " 'important',\n",
+       " 'interest',\n",
+       " 'study',\n",
+       " 'off',\n",
+       " 'area',\n",
+       " 'means',\n",
+       " 'office',\n",
+       " 'group',\n",
+       " 'give',\n",
+       " 'again',\n",
+       " 'war',\n",
+       " 'whether',\n",
+       " 'question',\n",
+       " 'called',\n",
+       " 'period',\n",
+       " 'line',\n",
+       " 'land',\n",
+       " 'four',\n",
+       " 'among',\n",
+       " 'table',\n",
+       " 'board',\n",
+       " 'until',\n",
+       " 'hand',\n",
+       " 'taken',\n",
+       " 'need',\n",
+       " 'education',\n",
+       " 'certain',\n",
+       " 'county',\n",
+       " 'action',\n",
+       " 'several',\n",
+       " 'am',\n",
+       " 'course',\n",
+       " 'cases',\n",
+       " 'far',\n",
+       " 'effect',\n",
+       " 'possible',\n",
+       " 'though',\n",
+       " 'left',\n",
+       " 'further',\n",
+       " 'home',\n",
+       " 'days',\n",
+       " 'person',\n",
+       " 'health',\n",
+       " 'amount',\n",
+       " 'members',\n",
+       " 'subject',\n",
+       " 'yet',\n",
+       " 'program',\n",
+       " 'therefore',\n",
+       " 'process',\n",
+       " 'services',\n",
+       " 'rate',\n",
+       " 'local',\n",
+       " 'name',\n",
+       " 'find',\n",
+       " 'necessary',\n",
+       " 'often',\n",
+       " 'others',\n",
+       " 'whole',\n",
+       " 'change',\n",
+       " 'example',\n",
+       " 'president',\n",
+       " 'history',\n",
+       " 'best',\n",
+       " 'although',\n",
+       " 'family',\n",
+       " 'side',\n",
+       " 'women',\n",
+       " 'held',\n",
+       " 'based',\n",
+       " 'south',\n",
+       " 'special',\n",
+       " 'required',\n",
+       " 'came',\n",
+       " 'thought',\n",
+       " 'five',\n",
+       " 'always',\n",
+       " 'himself',\n",
+       " 'air',\n",
+       " 'known',\n",
+       " 'head',\n",
+       " 'either',\n",
+       " 'property',\n",
+       " 'cost',\n",
+       " 'rather',\n",
+       " 'bill',\n",
+       " 'put',\n",
+       " 'human',\n",
+       " 'figure',\n",
+       " 'results',\n",
+       " 'level',\n",
+       " 'conditions',\n",
+       " 'full',\n",
+       " 'times',\n",
+       " 'book',\n",
+       " 'available',\n",
+       " 'early',\n",
+       " 'matter',\n",
+       " 'common',\n",
+       " 'light',\n",
+       " 'let',\n",
+       " 'society',\n",
+       " 'body',\n",
+       " 'international',\n",
+       " 'including',\n",
+       " 'free',\n",
+       " 'evidence',\n",
+       " 'better',\n",
+       " 'type',\n",
+       " 'provided',\n",
+       " 'due',\n",
+       " 'next',\n",
+       " 'production',\n",
+       " 'once',\n",
+       " 'done',\n",
+       " 'making',\n",
+       " 'least',\n",
+       " 'support',\n",
+       " 'north',\n",
+       " 'later',\n",
+       " 'using',\n",
+       " 'things',\n",
+       " 'economic',\n",
+       " 'chapter',\n",
+       " 'various',\n",
+       " 'why',\n",
+       " 'white',\n",
+       " 'going',\n",
+       " 'commission',\n",
+       " 'federal',\n",
+       " 'away',\n",
+       " 'field',\n",
+       " 'result',\n",
+       " 'nature',\n",
+       " 'policy',\n",
+       " 'become',\n",
+       " 'political',\n",
+       " 'increase',\n",
+       " 'around',\n",
+       " 'age',\n",
+       " 'want',\n",
+       " 'low',\n",
+       " 'trade',\n",
+       " 'half',\n",
+       " 'position',\n",
+       " 'young',\n",
+       " 'money',\n",
+       " 'percent',\n",
+       " 'cent',\n",
+       " 'class',\n",
+       " 'words',\n",
+       " 'view',\n",
+       " 'provide',\n",
+       " 'seen',\n",
+       " 'show',\n",
+       " 'district',\n",
+       " 'party',\n",
+       " 'analysis',\n",
+       " 'care',\n",
+       " 'june',\n",
+       " 'foreign',\n",
+       " 'shown',\n",
+       " 'received',\n",
+       " 'management',\n",
+       " 'third',\n",
+       " 'took',\n",
+       " 'something',\n",
+       " 'tax',\n",
+       " 'account',\n",
+       " 'problem',\n",
+       " 'almost',\n",
+       " 'west',\n",
+       " 'nothing',\n",
+       " 'together',\n",
+       " 'individual',\n",
+       " 'open',\n",
+       " 'material',\n",
+       " 'paper',\n",
+       " 'feet',\n",
+       " 'force',\n",
+       " 'association',\n",
+       " 'purpose',\n",
+       " 'terms',\n",
+       " 'method',\n",
+       " 'help',\n",
+       " 'real',\n",
+       " 'ever',\n",
+       " 'already',\n",
+       " 'along',\n",
+       " 'went',\n",
+       " 'term',\n",
+       " 'systems',\n",
+       " 'member',\n",
+       " 'particular',\n",
+       " 'problems',\n",
+       " 'energy',\n",
+       " 'secretary',\n",
+       " 'date',\n",
+       " 'price',\n",
+       " 'short',\n",
+       " 'true',\n",
+       " 'street',\n",
+       " 'building',\n",
+       " 'room',\n",
+       " 'market',\n",
+       " 'look',\n",
+       " 'similar',\n",
+       " 'industry',\n",
+       " 'areas',\n",
+       " 'bank',\n",
+       " 'according',\n",
+       " 'studies',\n",
+       " 'itself',\n",
+       " 'application',\n",
+       " 'current',\n",
+       " 'read',\n",
+       " 'press',\n",
+       " 'community',\n",
+       " 'plan',\n",
+       " 'whose',\n",
+       " 'major',\n",
+       " 'considered',\n",
+       " 'mind',\n",
+       " 'union',\n",
+       " 'cause',\n",
+       " 'able',\n",
+       " 'surface',\n",
+       " 'face',\n",
+       " 'river',\n",
+       " 'council',\n",
+       " 'income',\n",
+       " 'july',\n",
+       " 'near',\n",
+       " 'experience',\n",
+       " 'non',\n",
+       " 'paid',\n",
+       " 'pay',\n",
+       " 'reason',\n",
+       " 'themselves',\n",
+       " 'asked',\n",
+       " 'march',\n",
+       " 'king',\n",
+       " 'higher',\n",
+       " 'single',\n",
+       " 'rights',\n",
+       " 'average',\n",
+       " 'father',\n",
+       " 'note',\n",
+       " 'treatment',\n",
+       " 'love',\n",
+       " 'changes',\n",
+       " 'black',\n",
+       " 'knowledge',\n",
+       " 'enough',\n",
+       " 'future',\n",
+       " 'kind',\n",
+       " 'lower',\n",
+       " 'authority',\n",
+       " 'past',\n",
+       " 'natural',\n",
+       " 'six',\n",
+       " 'persons',\n",
+       " 'food',\n",
+       " 'working',\n",
+       " 'central',\n",
+       " 'college',\n",
+       " 'self',\n",
+       " 'products',\n",
+       " 'model',\n",
+       " 'brought',\n",
+       " 'greater',\n",
+       " 'countries',\n",
+       " 'test',\n",
+       " 'nor',\n",
+       " 'students',\n",
+       " 'private',\n",
+       " 'construction',\n",
+       " 'perhaps',\n",
+       " 'ground',\n",
+       " 'sir',\n",
+       " 'basis',\n",
+       " 'months',\n",
+       " 'growth',\n",
+       " 'increased',\n",
+       " 'word',\n",
+       " 'east',\n",
+       " 'language',\n",
+       " 'rule',\n",
+       " 'continued',\n",
+       " 'quite',\n",
+       " 'except',\n",
+       " 'series',\n",
+       " 'practice',\n",
+       " 'thing',\n",
+       " 'night',\n",
+       " 'works',\n",
+       " 'eyes',\n",
+       " 'oil',\n",
+       " 'art',\n",
+       " 'told',\n",
+       " 'especially',\n",
+       " 'population',\n",
+       " 'science',\n",
+       " 'whom',\n",
+       " 'obtained',\n",
+       " 'parts',\n",
+       " 'capital',\n",
+       " 'include',\n",
+       " 'generally',\n",
+       " 'meeting',\n",
+       " 'specific',\n",
+       " 'described',\n",
+       " 'believe',\n",
+       " 'review',\n",
+       " 'issue',\n",
+       " 'respect',\n",
+       " 'contract',\n",
+       " 'became',\n",
+       " 'effects',\n",
+       " 'medical',\n",
+       " 'road',\n",
+       " 'got',\n",
+       " 'clear',\n",
+       " 'main',\n",
+       " 'labor',\n",
+       " 'operation',\n",
+       " 'size',\n",
+       " 'below',\n",
+       " 'hours',\n",
+       " 'sense',\n",
+       " 'addition',\n",
+       " 'probably',\n",
+       " 'mean',\n",
+       " 'groups',\n",
+       " 'century',\n",
+       " 'personal',\n",
+       " 'plant',\n",
+       " 'training',\n",
+       " 'design',\n",
+       " 'statement',\n",
+       " 'structure',\n",
+       " 'project',\n",
+       " 'million',\n",
+       " 'usually',\n",
+       " 'range',\n",
+       " 'call',\n",
+       " 'mother',\n",
+       " 'seems',\n",
+       " 'standard',\n",
+       " 'return',\n",
+       " 'title',\n",
+       " 'established',\n",
+       " 'keep',\n",
+       " 'space',\n",
+       " 'annual',\n",
+       " 'record',\n",
+       " 'close',\n",
+       " 'april',\n",
+       " 'complete',\n",
+       " 'page',\n",
+       " 'heart',\n",
+       " 'says',\n",
+       " 'fig',\n",
+       " 'quality',\n",
+       " 'gas',\n",
+       " 'methods',\n",
+       " 'letter',\n",
+       " 'stock',\n",
+       " 'costs',\n",
+       " 'gave',\n",
+       " 'related',\n",
+       " 'administration',\n",
+       " 'activities',\n",
+       " 'condition',\n",
+       " 'theory',\n",
+       " 'town',\n",
+       " 'equipment',\n",
+       " 'rates',\n",
+       " 'soon',\n",
+       " 'decision',\n",
+       " 'pressure',\n",
+       " 'written',\n",
+       " 'lines',\n",
+       " 'corporation',\n",
+       " 'tell',\n",
+       " 'schools',\n",
+       " 'agreement',\n",
+       " 'reported',\n",
+       " 'attention',\n",
+       " 'materials',\n",
+       " 'fire',\n",
+       " 'direct',\n",
+       " 'saw',\n",
+       " 'published',\n",
+       " 'temperature',\n",
+       " 'species',\n",
+       " 'really',\n",
+       " 'laws',\n",
+       " 'woman',\n",
+       " 'function',\n",
+       " 'military',\n",
+       " 'proposed',\n",
+       " 'january',\n",
+       " 'additional',\n",
+       " 'late',\n",
+       " 'books',\n",
+       " 'opinion',\n",
+       " 'loss',\n",
+       " 'limited',\n",
+       " 'source',\n",
+       " 'article',\n",
+       " 'notice',\n",
+       " 'security',\n",
+       " 'organization',\n",
+       " 'hands',\n",
+       " 'financial',\n",
+       " 'rules',\n",
+       " 'follows',\n",
+       " 'miles',\n",
+       " 'values',\n",
+       " 'points',\n",
+       " 'chief',\n",
+       " 'distribution',\n",
+       " 'sometimes',\n",
+       " 'insurance',\n",
+       " 'son',\n",
+       " 'strong',\n",
+       " 'length',\n",
+       " 'activity',\n",
+       " 'original',\n",
+       " 'forms',\n",
+       " 'yes',\n",
+       " 'effective',\n",
+       " 'defendant',\n",
+       " 'living',\n",
+       " 'december',\n",
+       " 'character',\n",
+       " 'began',\n",
+       " 'carried',\n",
+       " 'supply',\n",
+       " 'blood',\n",
+       " 'taking',\n",
+       " 'manner',\n",
+       " 'journal',\n",
+       " 'hundred',\n",
+       " 'red',\n",
+       " 'shows',\n",
+       " 'developed',\n",
+       " 'performance',\n",
+       " 'situation',\n",
+       " 'programs',\n",
+       " 'felt',\n",
+       " 'workers',\n",
+       " 'volume',\n",
+       " 'presented',\n",
+       " 'knew',\n",
+       " 'answer',\n",
+       " 'resources',\n",
+       " 'questions',\n",
+       " 'industrial',\n",
+       " 'needs',\n",
+       " 'twenty',\n",
+       " 'sent',\n",
+       " 'looked',\n",
+       " 'purposes',\n",
+       " 'library',\n",
+       " 'added',\n",
+       " 'passed',\n",
+       " 'ten',\n",
+       " 'sea',\n",
+       " 'applied',\n",
+       " 'included',\n",
+       " 'physical',\n",
+       " 'across',\n",
+       " 'army',\n",
+       " 'toward',\n",
+       " 'produced',\n",
+       " 'makes',\n",
+       " 'placed',\n",
+       " 'role',\n",
+       " 'october',\n",
+       " 'final',\n",
+       " 'approach',\n",
+       " 'provisions',\n",
+       " 'leave',\n",
+       " 'director',\n",
+       " 'employment',\n",
+       " 'anything',\n",
+       " 'particularly',\n",
+       " 'hard',\n",
+       " 'outside',\n",
+       " 'week',\n",
+       " 'feel',\n",
+       " 'charge',\n",
+       " 'indeed',\n",
+       " 'degree',\n",
+       " 'reference',\n",
+       " 'requirements',\n",
+       " 'september',\n",
+       " 'today',\n",
+       " 'western',\n",
+       " 'influence',\n",
+       " 'unit',\n",
+       " 'solution',\n",
+       " 'chairman',\n",
+       " 'legal',\n",
+       " 'motion',\n",
+       " 'region',\n",
+       " 'idea',\n",
+       " 'list',\n",
+       " 'judgment',\n",
+       " 'determined',\n",
+       " 'poor',\n",
+       " 'disease',\n",
+       " 'civil',\n",
+       " 'turn',\n",
+       " 'modern',\n",
+       " 'normal',\n",
+       " 'appear',\n",
+       " 'employees',\n",
+       " 'latter',\n",
+       " 'heard',\n",
+       " 'top',\n",
+       " 'sure',\n",
+       " 'moment',\n",
+       " 'code',\n",
+       " 'reports',\n",
+       " 'wife',\n",
+       " 'post',\n",
+       " 'difficult',\n",
+       " 'recent',\n",
+       " 'extent',\n",
+       " 'longer',\n",
+       " 'story',\n",
+       " 'meet',\n",
+       " 'officers',\n",
+       " 'companies',\n",
+       " 'patients',\n",
+       " 'front',\n",
+       " 'doing',\n",
+       " 'staff',\n",
+       " 'product',\n",
+       " 'august',\n",
+       " 'needed',\n",
+       " 'involved',\n",
+       " 'likely',\n",
+       " 'former',\n",
+       " 'run',\n",
+       " 'author',\n",
+       " 'middle',\n",
+       " 'turned',\n",
+       " 'agency',\n",
+       " 'reading',\n",
+       " 'beginning',\n",
+       " 'duty',\n",
+       " 'movement',\n",
+       " 'month',\n",
+       " 'alone',\n",
+       " 'issues',\n",
+       " 'beyond',\n",
+       " 'fine',\n",
+       " 'base',\n",
+       " 'parties',\n",
+       " 'relations',\n",
+       " 'simple',\n",
+       " 'consider',\n",
+       " 'proper',\n",
+       " 'instead',\n",
+       " 'significant',\n",
+       " 'appears',\n",
+       " 'equal',\n",
+       " 'lost',\n",
+       " 'followed',\n",
+       " 'hope',\n",
+       " 'cut',\n",
+       " 'unless',\n",
+       " 'nearly',\n",
+       " 'claim',\n",
+       " 'associated',\n",
+       " 'expected',\n",
+       " 'operations',\n",
+       " 'difference',\n",
+       " 'funds',\n",
+       " 'direction',\n",
+       " 'cross',\n",
+       " 'live',\n",
+       " 'finally',\n",
+       " 'weight',\n",
+       " 'lead',\n",
+       " 'trial',\n",
+       " 'justice',\n",
+       " 'officer',\n",
+       " 'factors',\n",
+       " 'response',\n",
+       " 'cells',\n",
+       " 'earth',\n",
+       " 'rest',\n",
+       " 'fund',\n",
+       " 'bring',\n",
+       " 'trust',\n",
+       " 'goods',\n",
+       " 'observed',\n",
+       " 'behind',\n",
+       " 'job',\n",
+       " 'door',\n",
+       " 'types',\n",
+       " 'understand',\n",
+       " 'acid',\n",
+       " 'hold',\n",
+       " 'technology',\n",
+       " 'wide',\n",
+       " 'protection',\n",
+       " 'basic',\n",
+       " 'november',\n",
+       " 'seemed',\n",
+       " 'throughout',\n",
+       " 'levels',\n",
+       " 'importance',\n",
+       " 'sales',\n",
+       " 'sale',\n",
+       " 'stated',\n",
+       " 'address',\n",
+       " 'potential',\n",
+       " 'payment',\n",
+       " 'prior',\n",
+       " 'discussion',\n",
+       " 'conference',\n",
+       " 'writing',\n",
+       " 'stage',\n",
+       " 'fall',\n",
+       " 'notes',\n",
+       " 'iron',\n",
+       " 'play',\n",
+       " 'ask',\n",
+       " 'plants',\n",
+       " 'relationship',\n",
+       " 'towards',\n",
+       " 'regard',\n",
+       " 'referred',\n",
+       " 'patient',\n",
+       " 'flow',\n",
+       " 'consideration',\n",
+       " 'hospital',\n",
+       " 'seem',\n",
+       " 'february',\n",
+       " 'soil',\n",
+       " 'morning',\n",
+       " 'commercial',\n",
+       " 'planning',\n",
+       " 'provides',\n",
+       " 'appropriate',\n",
+       " 'technical',\n",
+       " 'demand',\n",
+       " 'sufficient',\n",
+       " 'principal',\n",
+       " 'credit',\n",
+       " 'peace',\n",
+       " 'previous',\n",
+       " 'object',\n",
+       " 'prices',\n",
+       " 'kept',\n",
+       " 'sound',\n",
+       " 'wanted',\n",
+       " 'looking',\n",
+       " 'entire',\n",
+       " 'plaintiff',\n",
+       " 'heat',\n",
+       " 'ways',\n",
+       " 'otherwise',\n",
+       " 'judge',\n",
+       " 'hour',\n",
+       " 'capacity',\n",
+       " 'brown',\n",
+       " 'music',\n",
+       " 'risk',\n",
+       " 'box',\n",
+       " 'exchange',\n",
+       " 'produce',\n",
+       " 'station',\n",
+       " 'big',\n",
+       " 'primary',\n",
+       " 'institute',\n",
+       " 'mentioned',\n",
+       " 'prepared',\n",
+       " 'cell',\n",
+       " 'spirit',\n",
+       " 'allowed',\n",
+       " 'claims',\n",
+       " 'site',\n",
+       " 'green',\n",
+       " 'directly',\n",
+       " 'text',\n",
+       " 'friends',\n",
+       " 'presence',\n",
+       " 'survey',\n",
+       " 'determine',\n",
+       " 'car',\n",
+       " 'larger',\n",
+       " 'gives',\n",
+       " 'deep',\n",
+       " 'simply',\n",
+       " 'immediately',\n",
+       " 'distance',\n",
+       " 'coming',\n",
+       " 'seven',\n",
+       " 'steel',\n",
+       " 'friend',\n",
+       " 'records',\n",
+       " 'existing',\n",
+       " 'clearly',\n",
+       " 'actual',\n",
+       " 'relation',\n",
+       " 'born',\n",
+       " 'learning',\n",
+       " 'forces',\n",
+       " 'voice',\n",
+       " 'earlier',\n",
+       " 'circumstances',\n",
+       " 'safety',\n",
+       " 'ago',\n",
+       " 'issued',\n",
+       " 'upper',\n",
+       " 'require',\n",
+       " 'scale',\n",
+       " 'island',\n",
+       " 'culture',\n",
+       " 'employed',\n",
+       " 'eight',\n",
+       " 'estate',\n",
+       " 'facts',\n",
+       " 'portion',\n",
+       " 'deal',\n",
+       " 'share',\n",
+       " 'actually',\n",
+       " 'aid',\n",
+       " 'engineering',\n",
+       " 'continue',\n",
+       " 'formed',\n",
+       " 'agricultural',\n",
+       " 'entitled',\n",
+       " 'mass',\n",
+       " 'truth',\n",
+       " 'giving',\n",
+       " 'numbers',\n",
+       " 'places',\n",
+       " 'met',\n",
+       " 'built',\n",
+       " 'content',\n",
+       " 'connection',\n",
+       " 'assistance',\n",
+       " 'coal',\n",
+       " 'progress',\n",
+       " 'receive',\n",
+       " 'active',\n",
+       " 'nation',\n",
+       " 'contact',\n",
+       " 'amendment',\n",
+       " 'interests',\n",
+       " 'net',\n",
+       " 'wall',\n",
+       " 'standards',\n",
+       " 'farm',\n",
+       " 'understanding',\n",
+       " 'strength',\n",
+       " 'minutes',\n",
+       " 'figures',\n",
+       " 'move',\n",
+       " 'elements',\n",
+       " 'concerned',\n",
+       " 'regulations',\n",
+       " 'step',\n",
+       " 'literature',\n",
+       " 'units',\n",
+       " 'opportunity',\n",
+       " 'investment',\n",
+       " 'led',\n",
+       " 'reduced',\n",
+       " 'follow',\n",
+       " 'facilities',\n",
+       " 'benefit',\n",
+       " 'compared',\n",
+       " 'reached',\n",
+       " 'student',\n",
+       " 'religious',\n",
+       " 'measure',\n",
+       " 'individuals',\n",
+       " 'meaning',\n",
+       " 'considerable',\n",
+       " 'relative',\n",
+       " 'electric',\n",
+       " 'joint',\n",
+       " 'certainly',\n",
+       " 'failure',\n",
+       " 'apply',\n",
+       " ...]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_words"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dd9e939e-7827-42f9-89be-bcfbb8bd3f52",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "64b6fcd3-acf7-45da-a335-79c538963bdd",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "discus\n",
+      "physic\n",
+      "posse\n",
+      "serf\n",
+      "sens\n",
+      "caput\n",
+      "bos\n",
+      "graf\n",
+      "pant\n",
+      "barrack\n",
+      "auspex\n",
+      "footstep\n",
+      "colonist\n",
+      "villager\n",
+      "kilometer\n",
+      "granule\n",
+      "credential\n",
+      "petal\n",
+      "trouser\n",
+      "shortcoming\n",
+      "microorganism\n",
+      "italic\n",
+      "grandchild\n",
+      "munition\n",
+      "parenthesis\n",
+      "foodstuff\n",
+      "attache\n",
+      "grandparent\n",
+      "tropic\n",
+      "kilometre\n",
+      "congratulation\n",
+      "fume\n",
+      "convulsion\n",
+      "nostril\n",
+      "utensil\n",
+      "cooky\n",
+      "amenity\n",
+      "reptile\n",
+      "pretension\n",
+      "sock\n",
+      "peso\n",
+      "mitochondrion\n",
+      "reminiscence\n",
+      "invader\n",
+      "macrophage\n",
+      "eyelid\n",
+      "dweller\n",
+      "bristle\n",
+      "tenet\n",
+      "taxon\n",
+      "outskirt\n",
+      "policyholder\n",
+      "stamen\n",
+      "horseman\n",
+      "striker\n",
+      "ramification\n",
+      "tuft\n",
+      "cultivar\n",
+      "interrogatory\n",
+      "bylaw\n",
+      "bellow\n",
+      "neoplasm\n",
+      "insurgent\n",
+      "chore\n",
+      "pensioner\n",
+      "exigency\n",
+      "forefather\n",
+      "atrocity\n",
+      "dissenter\n",
+      "corpuscle\n",
+      "islander\n",
+      "numeral\n",
+      "bureaucrat\n",
+      "classmate\n",
+      "crossroad\n",
+      "pitfall\n",
+      "firework\n",
+      "ravage\n",
+      "broadcaster\n",
+      "heretic\n",
+      "appurtenance\n",
+      "potentiality\n",
+      "louse\n",
+      "conspirator\n",
+      "revers\n",
+      "combatant\n",
+      "conferee\n",
+      "serviceman\n",
+      "repercussion\n",
+      "grader\n",
+      "exhibitor\n",
+      "alkaloid\n",
+      "collaborator\n",
+      "slipper\n",
+      "foothill\n",
+      "homeowner\n",
+      "hallucination\n",
+      "ailment\n",
+      "crumb\n",
+      "milligram\n",
+      "turnip\n",
+      "fingertip\n",
+      "tradesman\n",
+      "archaeologist\n",
+      "bondholder\n",
+      "lira\n",
+      "emolument\n",
+      "tailing\n",
+      "enthusiast\n",
+      "tubule\n",
+      "warship\n",
+      "speculator\n",
+      "jobber\n",
+      "raisin\n",
+      "vicissitude\n",
+      "courtier\n",
+      "clove\n",
+      "entrant\n",
+      "festivity\n",
+      "bough\n",
+      "imago\n",
+      "fibroblast\n",
+      "bruise\n",
+      "misgiving\n",
+      "parishioner\n",
+      "bract\n",
+      "microbe\n",
+      "industrialist\n",
+      "sprout\n",
+      "wrinkle\n",
+      "worshipper\n",
+      "retiree\n",
+      "cracker\n",
+      "negotiator\n",
+      "pronouncement\n",
+      "devotee\n",
+      "sandal\n",
+      "sepal\n",
+      "interrelationship\n",
+      "corticosteroid\n",
+      "sou\n",
+      "framer\n",
+      "knuckle\n",
+      "leukocyte\n",
+      "malformation\n",
+      "geographer\n",
+      "fastener\n",
+      "ruble\n",
+      "whisker\n",
+      "tentacle\n",
+      "footprint\n",
+      "ratepayer\n",
+      "marketer\n",
+      "refiner\n",
+      "cilium\n",
+      "inroad\n",
+      "dragoon\n",
+      "litigant\n",
+      "kilo\n",
+      "shipowner\n",
+      "rudiment\n",
+      "appointee\n",
+      "fingerprint\n",
+      "anther\n",
+      "depredation\n",
+      "stave\n",
+      "rancher\n",
+      "cytokine\n",
+      "artefact\n",
+      "freeholder\n",
+      "churchman\n",
+      "fungicide\n",
+      "inequity\n",
+      "contraindication\n",
+      "arrhythmia\n",
+      "functionary\n",
+      "bandit\n",
+      "horde\n",
+      "spermatozoon\n",
+      "selectman\n",
+      "blocker\n",
+      "inaccuracy\n",
+      "gramme\n",
+      "billet\n",
+      "radiograph\n",
+      "demonstrator\n",
+      "amphibian\n",
+      "mussel\n",
+      "rafter\n",
+      "headlight\n",
+      "vestige\n",
+      "loin\n",
+      "raider\n",
+      "crevice\n",
+      "suitor\n",
+      "technologist\n",
+      "trooper\n",
+      "globule\n",
+      "firefighter\n",
+      "woodcut\n",
+      "purr\n",
+      " or\n",
+      "parse\n",
+      " chili\n",
+      "bolder\n",
+      " due\n",
+      "scents\n",
+      "daze\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Lemmatize all words (plural -> singular)\n",
+    "lemmatize_mappings = [\n",
+    "    (w, lemmatizer.lemmatize(w)) \n",
+    "    for w in all_words\n",
+    "    # if w != lemmatizer.lemmatize(w)\n",
+    "]\n",
+    "\n",
+    "# Remove all words that lemmatize to another word\n",
+    "#all_words = [w for w in all_words if w not in ]\n",
+    "\n",
+    "# Add custom lemmatizations\n",
+    "for l in custom_maps:\n",
+    "    if l in lemmatize_mappings:\n",
+    "        print(f\"Warning: {l} is already lemmatized\")\n",
+    "    else:\n",
+    "        lemmatize_mappings.append(l)\n",
+    "        \n",
+    "lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]\n",
+    "\n",
+    "# Now, re-add all lemmatized words to the list of every word\n",
+    "for w in sum(lemmatize_mappings, ()):\n",
+    "    if w not in all_words:\n",
+    "        print(w)\n",
+    "        all_words.append(w)\n",
+    "        \n",
+    "lemmatize_mappings = {k: v for k, v in lemmatize_mappings}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "8bdff9d0-f3ff-498f-952d-13f1a91bfbd5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "final_wordlist = []\n",
+    "seen_lemmatizations = set()\n",
+    "for w in all_words:\n",
+    "    lemmatized = lemmatize_mappings.get(w) or w\n",
+    "    if lemmatized in seen_lemmatizations:\n",
+    "        # The lemmatized version of this word was already seen\n",
+    "        continue\n",
+    "    else:\n",
+    "        # The lemmatized version hasn't been seen. We're good to add it\n",
+    "        final_wordlist.append([\n",
+    "            k\n",
+    "            for k\n",
+    "            in lemmatize_mappings.keys()\n",
+    "            if lemmatize_mappings[k] == lemmatized\n",
+    "        ])\n",
+    "        seen_lemmatizations.add(lemmatized)\n",
+    "\n",
+    "    if len(final_wordlist) >= WORDLIST_SIZE:\n",
+    "        break\n",
+    "\n",
+    "# Now, convert it to the format (number, word)\n",
+    "final_wordlist = [\n",
+    "    (idx, w)\n",
+    "    for idx, words in enumerate(final_wordlist)\n",
+    "    for w in words\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "65bd6887-613e-45ae-ac45-6ed5967b3a43",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[(0, 'the'),\n",
+       " (1, 'of'),\n",
+       " (2, 'and'),\n",
+       " (3, 'to'),\n",
+       " (4, 'in'),\n",
+       " (5, 'is'),\n",
+       " (6, 'that'),\n",
+       " (7, 'for'),\n",
+       " (7, 'four'),\n",
+       " (8, 'be'),\n",
+       " (8, 'bee'),\n",
+       " (8, 'bees'),\n",
+       " (9, 'by'),\n",
+       " (9, 'bye'),\n",
+       " (10, 'with'),\n",
+       " (11, 'on'),\n",
+       " (12, 'not'),\n",
+       " (13, 'or'),\n",
+       " (14, 'this'),\n",
+       " (15, 'are'),\n",
+       " (16, 'at'),\n",
+       " (17, 'from'),\n",
+       " (18, 'he'),\n",
+       " (19, 'which'),\n",
+       " (20, 'his'),\n",
+       " (21, 'have'),\n",
+       " (22, 'an'),\n",
+       " (23, 'but'),\n",
+       " (24, 'you'),\n",
+       " (25, 'they'),\n",
+       " (26, 'were'),\n",
+       " (27, 'had'),\n",
+       " (28, 'we'),\n",
+       " (29, 'all'),\n",
+       " (30, 'one'),\n",
+       " (30, 'ones'),\n",
+       " (31, 'their'),\n",
+       " (32, 'been'),\n",
+       " (33, 'will'),\n",
+       " (33, 'wills'),\n",
+       " (34, 'there'),\n",
+       " (35, 'can'),\n",
+       " (35, 'cans'),\n",
+       " (36, 'if'),\n",
+       " (37, 'other'),\n",
+       " (38, 'would'),\n",
+       " (39, 'no'),\n",
+       " (39, 'nos'),\n",
+       " (40, 'her'),\n",
+       " (41, 'may'),\n",
+       " (42, 'more'),\n",
+       " (42, 'mores'),\n",
+       " (43, 'when'),\n",
+       " (44, 'who'),\n",
+       " (45, 'such'),\n",
+       " (46, 'these'),\n",
+       " (47, 'any'),\n",
+       " (48, 'she'),\n",
+       " (49, 'new'),\n",
+       " (50, 'time'),\n",
+       " (50, 'times'),\n",
+       " (51, 'than'),\n",
+       " (52, 'do'),\n",
+       " (53, 'some'),\n",
+       " (54, 'what'),\n",
+       " (55, 'only'),\n",
+       " (56, 'into'),\n",
+       " (57, 'them'),\n",
+       " (58, 'two'),\n",
+       " (59, 'also'),\n",
+       " (60, 'about'),\n",
+       " (61, 'out'),\n",
+       " (61, 'outs'),\n",
+       " (62, 'him'),\n",
+       " (62, 'hem'),\n",
+       " (63, 'my'),\n",
+       " (64, 'said'),\n",
+       " (65, 'up'),\n",
+       " (66, 'our'),\n",
+       " (67, 'first'),\n",
+       " (68, 'should'),\n",
+       " (69, 'under'),\n",
+       " (70, 'made'),\n",
+       " (71, 'state'),\n",
+       " (71, 'states'),\n",
+       " (72, 'see'),\n",
+       " (72, 'sees'),\n",
+       " (73, 'after'),\n",
+       " (74, 'could'),\n",
+       " (75, 'then'),\n",
+       " (76, 'me'),\n",
+       " (77, 'most'),\n",
+       " (78, 'over'),\n",
+       " (79, 'very'),\n",
+       " (79, 'vary'),\n",
+       " (80, 'your'),\n",
+       " (81, 'between'),\n",
+       " (82, 'where'),\n",
+       " (83, 'now'),\n",
+       " (84, 'shall'),\n",
+       " (85, 'work'),\n",
+       " (85, 'works'),\n",
+       " (86, 'those'),\n",
+       " (87, 'same'),\n",
+       " (88, 'well'),\n",
+       " (88, 'wells'),\n",
+       " (89, 'each'),\n",
+       " (90, 'many'),\n",
+       " (91, 'being'),\n",
+       " (91, 'beings'),\n",
+       " (92, 'years'),\n",
+       " (92, 'year'),\n",
+       " (93, 'did'),\n",
+       " (94, 'through'),\n",
+       " (94, 'thru'),\n",
+       " (95, 'must'),\n",
+       " (96, 'upon'),\n",
+       " (97, 'before'),\n",
+       " (98, 'like'),\n",
+       " (98, 'likes'),\n",
+       " (99, 'use'),\n",
+       " (100, 'part'),\n",
+       " (100, 'parts'),\n",
+       " (101, 'general'),\n",
+       " (101, 'generals'),\n",
+       " (102, 'people'),\n",
+       " (102, 'peoples'),\n",
+       " (103, 'because'),\n",
+       " (104, 'used'),\n",
+       " (105, 'how'),\n",
+       " (106, 'even'),\n",
+       " (107, 'much'),\n",
+       " (108, 'during'),\n",
+       " (109, 'both'),\n",
+       " (110, 'case'),\n",
+       " (110, 'cases'),\n",
+       " (111, 'three'),\n",
+       " (112, 'number'),\n",
+       " (112, 'numbers'),\n",
+       " (113, 'make'),\n",
+       " (113, 'makes'),\n",
+       " (114, 'per'),\n",
+       " (115, 'great'),\n",
+       " (115, 'grate'),\n",
+       " (116, 'act'),\n",
+       " (116, 'acts'),\n",
+       " (117, 'way'),\n",
+       " (117, 'ways'),\n",
+       " (117, 'weigh'),\n",
+       " (117, 'weighs'),\n",
+       " (118, 'life'),\n",
+       " (118, 'lives'),\n",
+       " (119, 'good'),\n",
+       " (119, 'goods'),\n",
+       " (120, 'day'),\n",
+       " (120, 'days'),\n",
+       " (120, 'dazed'),\n",
+       " (120, 'daze'),\n",
+       " (121, 'public'),\n",
+       " (122, 'man'),\n",
+       " (122, 'mans'),\n",
+       " (123, 'however'),\n",
+       " (124, 'system'),\n",
+       " (124, 'systems'),\n",
+       " (125, 'water'),\n",
+       " (125, 'waters'),\n",
+       " (126, 'without'),\n",
+       " (127, 'government'),\n",
+       " (127, 'governments'),\n",
+       " (128, 'while'),\n",
+       " (129, 'long'),\n",
+       " (130, 'order'),\n",
+       " (130, 'orders'),\n",
+       " (131, 'law'),\n",
+       " (131, 'laws'),\n",
+       " (132, 'section'),\n",
+       " (132, 'sections'),\n",
+       " (133, 'court'),\n",
+       " (133, 'courts'),\n",
+       " (134, 'high'),\n",
+       " (135, 'right'),\n",
+       " (135, 'rights'),\n",
+       " (136, 'own'),\n",
+       " (137, 'found'),\n",
+       " (138, 'united'),\n",
+       " (139, 'just'),\n",
+       " (140, 'here'),\n",
+       " (141, 'against'),\n",
+       " (142, 'world'),\n",
+       " (142, 'worlds'),\n",
+       " (144, 'company'),\n",
+       " (144, 'companies'),\n",
+       " (145, 'within'),\n",
+       " (146, 'given'),\n",
+       " (147, 'service'),\n",
+       " (147, 'services'),\n",
+       " (148, 'house'),\n",
+       " (148, 'houses'),\n",
+       " (149, 'another'),\n",
+       " (150, 'power'),\n",
+       " (150, 'powers'),\n",
+       " (151, 'place'),\n",
+       " (151, 'places'),\n",
+       " (152, 'know'),\n",
+       " (152, 'knows'),\n",
+       " (153, 'little'),\n",
+       " (154, 'down'),\n",
+       " (155, 'present'),\n",
+       " (155, 'presents'),\n",
+       " (156, 'every'),\n",
+       " (157, 'national'),\n",
+       " (157, 'nationals'),\n",
+       " (158, 'back'),\n",
+       " (158, 'backs'),\n",
+       " (159, 'take'),\n",
+       " (159, 'takes'),\n",
+       " (160, 'information'),\n",
+       " (161, 'men'),\n",
+       " (162, 'since'),\n",
+       " (163, 'might'),\n",
+       " (164, 'small'),\n",
+       " (165, 'large'),\n",
+       " (166, 'school'),\n",
+       " (166, 'schools'),\n",
+       " (167, 'following'),\n",
+       " (168, 'still'),\n",
+       " (170, 'last'),\n",
+       " (170, 'lasts'),\n",
+       " (171, 'city'),\n",
+       " (171, 'cities'),\n",
+       " (172, 'second'),\n",
+       " (172, 'seconds'),\n",
+       " (173, 'development'),\n",
+       " (173, 'developments'),\n",
+       " (174, 'different'),\n",
+       " (175, 'university'),\n",
+       " (175, 'universities'),\n",
+       " (176, 'old'),\n",
+       " (177, 'form'),\n",
+       " (177, 'forms'),\n",
+       " (178, 'point'),\n",
+       " (178, 'points'),\n",
+       " (179, 'total'),\n",
+       " (179, 'totals'),\n",
+       " (180, 'data'),\n",
+       " (181, 'too'),\n",
+       " (182, 'committee'),\n",
+       " (182, 'committees'),\n",
+       " (183, 'report'),\n",
+       " (183, 'reports'),\n",
+       " (184, 'business'),\n",
+       " (184, 'businesses'),\n",
+       " (185, 'think'),\n",
+       " (185, 'thinks'),\n",
+       " (186, 'end'),\n",
+       " (186, 'ends'),\n",
+       " (187, 'get'),\n",
+       " (187, 'gets'),\n",
+       " (188, 'set'),\n",
+       " (188, 'sets'),\n",
+       " (189, 'research'),\n",
+       " (189, 'researches'),\n",
+       " (190, 'say'),\n",
+       " (190, 'says'),\n",
+       " (191, 'come'),\n",
+       " (191, 'comes'),\n",
+       " (192, 'country'),\n",
+       " (192, 'countries'),\n",
+       " (193, 'never'),\n",
+       " (194, 'fact'),\n",
+       " (194, 'facts'),\n",
+       " (195, 'go'),\n",
+       " (195, 'goes'),\n",
+       " (196, 'control'),\n",
+       " (196, 'controls'),\n",
+       " (197, 'thus'),\n",
+       " (198, 'having'),\n",
+       " (199, 'value'),\n",
+       " (199, 'values'),\n",
+       " (200, 'social'),\n",
+       " (201, 'department'),\n",
+       " (201, 'departments'),\n",
+       " (202, 'few'),\n",
+       " (203, 'above'),\n",
+       " (204, 'important'),\n",
+       " (205, 'interest'),\n",
+       " (205, 'interests'),\n",
+       " (206, 'study'),\n",
+       " (206, 'studies'),\n",
+       " (207, 'off'),\n",
+       " (208, 'area'),\n",
+       " (208, 'areas'),\n",
+       " (209, 'means'),\n",
+       " (209, 'mean'),\n",
+       " (210, 'office'),\n",
+       " (210, 'offices'),\n",
+       " (211, 'group'),\n",
+       " (211, 'groups'),\n",
+       " (212, 'give'),\n",
+       " (212, 'gives'),\n",
+       " (213, 'again'),\n",
+       " (214, 'war'),\n",
+       " (214, 'wars'),\n",
+       " (215, 'whether'),\n",
+       " (216, 'question'),\n",
+       " (216, 'questions'),\n",
+       " (217, 'called'),\n",
+       " (218, 'period'),\n",
+       " (218, 'periods'),\n",
+       " (219, 'line'),\n",
+       " (219, 'lines'),\n",
+       " (220, 'land'),\n",
+       " (220, 'lands'),\n",
+       " (221, 'among'),\n",
+       " (222, 'table'),\n",
+       " (222, 'tables'),\n",
+       " (223, 'board'),\n",
+       " (223, 'boards'),\n",
+       " (224, 'until'),\n",
+       " (225, 'hand'),\n",
+       " (225, 'hands'),\n",
+       " (226, 'taken'),\n",
+       " (227, 'need'),\n",
+       " (227, 'needs'),\n",
+       " (228, 'education'),\n",
+       " (229, 'certain'),\n",
+       " (230, 'county'),\n",
+       " (230, 'counties'),\n",
+       " (231, 'action'),\n",
+       " (231, 'actions'),\n",
+       " (232, 'several'),\n",
+       " (233, 'am'),\n",
+       " (234, 'course'),\n",
+       " (234, 'courses'),\n",
+       " (235, 'far'),\n",
+       " (236, 'effect'),\n",
+       " (236, 'effects'),\n",
+       " (237, 'possible'),\n",
+       " (238, 'though'),\n",
+       " (239, 'left'),\n",
+       " (240, 'further'),\n",
+       " (241, 'home'),\n",
+       " (241, 'homes'),\n",
+       " (242, 'person'),\n",
+       " (242, 'persons'),\n",
+       " (243, 'health'),\n",
+       " (244, 'amount'),\n",
+       " (244, 'amounts'),\n",
+       " (245, 'members'),\n",
+       " (245, 'member'),\n",
+       " (246, 'subject'),\n",
+       " (246, 'subjects'),\n",
+       " (247, 'yet'),\n",
+       " (248, 'program'),\n",
+       " (248, 'programs'),\n",
+       " (249, 'therefore'),\n",
+       " (250, 'process'),\n",
+       " (250, 'processes'),\n",
+       " (251, 'rate'),\n",
+       " (251, 'rates'),\n",
+       " (252, 'local'),\n",
+       " (252, 'locals'),\n",
+       " (253, 'name'),\n",
+       " (253, 'names'),\n",
+       " (254, 'find'),\n",
+       " (254, 'finds'),\n",
+       " (255, 'necessary'),\n",
+       " (255, 'necessaries'),\n",
+       " (256, 'often'),\n",
+       " (257, 'others'),\n",
+       " (258, 'whole'),\n",
+       " (259, 'change'),\n",
+       " (259, 'changes'),\n",
+       " (260, 'example'),\n",
+       " (260, 'examples'),\n",
+       " (261, 'president'),\n",
+       " (262, 'history'),\n",
+       " (262, 'histories'),\n",
+       " (263, 'best'),\n",
+       " (264, 'although'),\n",
+       " (265, 'family'),\n",
+       " (265, 'families'),\n",
+       " (266, 'side'),\n",
+       " (266, 'sides'),\n",
+       " (267, 'women'),\n",
+       " (267, 'woman'),\n",
+       " (268, 'held'),\n",
+       " (269, 'based'),\n",
+       " (270, 'south'),\n",
+       " (271, 'special'),\n",
+       " (272, 'required'),\n",
+       " (273, 'came'),\n",
+       " (274, 'thought'),\n",
+       " (274, 'thoughts'),\n",
+       " (275, 'five'),\n",
+       " (276, 'always'),\n",
+       " (277, 'himself'),\n",
+       " (278, 'air'),\n",
+       " (278, 'airs'),\n",
+       " (279, 'known'),\n",
+       " (280, 'head'),\n",
+       " (280, 'heads'),\n",
+       " (281, 'either'),\n",
+       " (282, 'property'),\n",
+       " (282, 'properties'),\n",
+       " (283, 'cost'),\n",
+       " (283, 'costs'),\n",
+       " (284, 'rather'),\n",
+       " (285, 'bill'),\n",
+       " (285, 'bills'),\n",
+       " (286, 'put'),\n",
+       " (286, 'puts'),\n",
+       " (287, 'human'),\n",
+       " (287, 'humans'),\n",
+       " (288, 'figure'),\n",
+       " (288, 'figures'),\n",
+       " (289, 'results'),\n",
+       " (289, 'result'),\n",
+       " (290, 'level'),\n",
+       " (290, 'levels'),\n",
+       " (291, 'conditions'),\n",
+       " (291, 'condition'),\n",
+       " (292, 'full'),\n",
+       " (293, 'book'),\n",
+       " (293, 'books'),\n",
+       " (294, 'available'),\n",
+       " (295, 'early'),\n",
+       " (296, 'matter'),\n",
+       " (296, 'matters'),\n",
+       " (297, 'common'),\n",
+       " (298, 'light'),\n",
+       " (298, 'lights'),\n",
+       " (299, 'let'),\n",
+       " (299, 'lets'),\n",
+       " (300, 'society'),\n",
+       " (300, 'societies'),\n",
+       " (301, 'body'),\n",
+       " (301, 'bodies'),\n",
+       " (302, 'international'),\n",
+       " (303, 'including'),\n",
+       " (304, 'free'),\n",
+       " (305, 'evidence'),\n",
+       " (305, 'evidences'),\n",
+       " (306, 'better'),\n",
+       " (307, 'type'),\n",
+       " (307, 'types'),\n",
+       " (308, 'provided'),\n",
+       " (309, 'due'),\n",
+       " (309, 'dues'),\n",
+       " (310, 'next'),\n",
+       " (311, 'production'),\n",
+       " (311, 'productions'),\n",
+       " (312, 'once'),\n",
+       " (313, 'done'),\n",
+       " (314, 'making'),\n",
+       " (315, 'least'),\n",
+       " (316, 'support'),\n",
+       " (316, 'supports'),\n",
+       " (317, 'north'),\n",
+       " (318, 'later'),\n",
+       " (319, 'using'),\n",
+       " (320, 'things'),\n",
+       " (320, 'thing'),\n",
+       " (321, 'economic'),\n",
+       " (322, 'chapter'),\n",
+       " (322, 'chapters'),\n",
+       " (323, 'various'),\n",
+       " (324, 'why'),\n",
+       " (325, 'white'),\n",
+       " (325, 'whites'),\n",
+       " (326, 'going'),\n",
+       " (327, 'commission'),\n",
+       " (327, 'commissions'),\n",
+       " (328, 'federal'),\n",
+       " (329, 'away'),\n",
+       " (330, 'field'),\n",
+       " (330, 'fields'),\n",
+       " (331, 'nature'),\n",
+       " (331, 'natures'),\n",
+       " (332, 'policy'),\n",
+       " (332, 'policies'),\n",
+       " (333, 'become'),\n",
+       " (334, 'political'),\n",
+       " (335, 'increase'),\n",
+       " (335, 'increases'),\n",
+       " (336, 'around'),\n",
+       " (337, 'age'),\n",
+       " (337, 'ages'),\n",
+       " (338, 'want'),\n",
+       " (338, 'wants'),\n",
+       " (339, 'low'),\n",
+       " (339, 'lows'),\n",
+       " (340, 'trade'),\n",
+       " (340, 'trades'),\n",
+       " (341, 'half'),\n",
+       " (341, 'halves'),\n",
+       " (342, 'position'),\n",
+       " (342, 'positions'),\n",
+       " (343, 'young'),\n",
+       " (344, 'money'),\n",
+       " (344, 'moneys'),\n",
+       " (345, 'percent'),\n",
+       " (346, 'cent'),\n",
+       " (346, 'sense'),\n",
+       " (346, 'senses'),\n",
+       " (346, 'scent'),\n",
+       " (346, 'sensed'),\n",
+       " (346, 'scents'),\n",
+       " (347, 'class'),\n",
+       " (347, 'classes'),\n",
+       " (348, 'words'),\n",
+       " (348, 'word'),\n",
+       " (349, 'view'),\n",
+       " (349, 'views'),\n",
+       " (350, 'provide'),\n",
+       " (351, 'seen'),\n",
+       " (352, 'show'),\n",
+       " (352, 'shows'),\n",
+       " (353, 'district'),\n",
+       " (353, 'districts'),\n",
+       " (354, 'party'),\n",
+       " (354, 'parties'),\n",
+       " (355, 'analysis'),\n",
+       " (355, 'analyses'),\n",
+       " (356, 'care'),\n",
+       " (356, 'cares'),\n",
+       " (357, 'june'),\n",
+       " (358, 'foreign'),\n",
+       " (359, 'shown'),\n",
+       " (360, 'received'),\n",
+       " (361, 'management'),\n",
+       " (362, 'third'),\n",
+       " (362, 'thirds'),\n",
+       " (363, 'took'),\n",
+       " (364, 'something'),\n",
+       " (365, 'tax'),\n",
+       " (365, 'taxes'),\n",
+       " (366, 'account'),\n",
+       " (366, 'accounts'),\n",
+       " (367, 'problem'),\n",
+       " (367, 'problems'),\n",
+       " (368, 'almost'),\n",
+       " (369, 'west'),\n",
+       " (370, 'nothing'),\n",
+       " (371, 'together'),\n",
+       " (372, 'individual'),\n",
+       " (372, 'individuals'),\n",
+       " (373, 'open'),\n",
+       " (373, 'opens'),\n",
+       " (374, 'material'),\n",
+       " (374, 'materials'),\n",
+       " (375, 'paper'),\n",
+       " (375, 'papers'),\n",
+       " (376, 'feet'),\n",
+       " (376, 'foot'),\n",
+       " (377, 'force'),\n",
+       " (377, 'forces'),\n",
+       " (378, 'association'),\n",
+       " (378, 'associations'),\n",
+       " (379, 'purpose'),\n",
+       " (379, 'purposes'),\n",
+       " (380, 'terms'),\n",
+       " (380, 'term'),\n",
+       " (381, 'method'),\n",
+       " (381, 'methods'),\n",
+       " (382, 'help'),\n",
+       " (382, 'helps'),\n",
+       " (383, 'real'),\n",
+       " (384, 'ever'),\n",
+       " (385, 'already'),\n",
+       " (386, 'along'),\n",
+       " (387, 'went'),\n",
+       " (388, 'particular'),\n",
+       " (388, 'particulars'),\n",
+       " (389, 'energy'),\n",
+       " (389, 'energies'),\n",
+       " (390, 'secretary'),\n",
+       " (391, 'date'),\n",
+       " (391, 'dates'),\n",
+       " (392, 'price'),\n",
+       " (392, 'prices'),\n",
+       " (393, 'short'),\n",
+       " (393, 'shorts'),\n",
+       " (394, 'true'),\n",
+       " (395, 'street'),\n",
+       " (395, 'streets'),\n",
+       " (396, 'building'),\n",
+       " (396, 'buildings'),\n",
+       " (397, 'room'),\n",
+       " (397, 'rooms'),\n",
+       " (398, 'market'),\n",
+       " (398, 'markets'),\n",
+       " (399, 'look'),\n",
+       " (399, 'looks'),\n",
+       " (400, 'similar'),\n",
+       " (401, 'industry'),\n",
+       " (401, 'industries'),\n",
+       " (402, 'bank'),\n",
+       " (402, 'banks'),\n",
+       " (403, 'according'),\n",
+       " (404, 'itself'),\n",
+       " (405, 'application'),\n",
+       " (405, 'applications'),\n",
+       " (406, 'current'),\n",
+       " (406, 'currents'),\n",
+       " (407, 'read'),\n",
+       " (407, 'reads'),\n",
+       " (408, 'press'),\n",
+       " (408, 'presses'),\n",
+       " (409, 'community'),\n",
+       " (409, 'communities'),\n",
+       " (410, 'plan'),\n",
+       " (410, 'plans'),\n",
+       " (411, 'whose'),\n",
+       " (412, 'major'),\n",
+       " (412, 'majors'),\n",
+       " (413, 'considered'),\n",
+       " (414, 'mind'),\n",
+       " (414, 'minds'),\n",
+       " (415, 'union'),\n",
+       " (415, 'unions'),\n",
+       " (416, 'cause'),\n",
+       " (416, 'causes'),\n",
+       " (417, 'able'),\n",
+       " (418, 'surface'),\n",
+       " (418, 'surfaces'),\n",
+       " (419, 'face'),\n",
+       " (419, 'faces'),\n",
+       " (420, 'river'),\n",
+       " (420, 'rivers'),\n",
+       " (421, 'council'),\n",
+       " (421, 'councils'),\n",
+       " (422, 'income'),\n",
+       " (422, 'incomes'),\n",
+       " (423, 'july'),\n",
+       " (424, 'near'),\n",
+       " (425, 'experience'),\n",
+       " (425, 'experiences'),\n",
+       " (426, 'non'),\n",
+       " (427, 'paid'),\n",
+       " (428, 'pay'),\n",
+       " (428, 'pays'),\n",
+       " (429, 'reason'),\n",
+       " (429, 'reasons'),\n",
+       " (430, 'themselves'),\n",
+       " (431, 'asked'),\n",
+       " (432, 'march'),\n",
+       " (432, 'marches'),\n",
+       " (433, 'king'),\n",
+       " (433, 'kings'),\n",
+       " (434, 'higher'),\n",
+       " (435, 'single'),\n",
+       " (435, 'singles'),\n",
+       " (436, 'average'),\n",
+       " (436, 'averages'),\n",
+       " (437, 'father'),\n",
+       " (437, 'fathers'),\n",
+       " (438, 'note'),\n",
+       " (438, 'notes'),\n",
+       " (439, 'treatment'),\n",
+       " (439, 'treatments'),\n",
+       " (440, 'love'),\n",
+       " (440, 'loves'),\n",
+       " (441, 'black'),\n",
+       " (441, 'blacks'),\n",
+       " (442, 'knowledge'),\n",
+       " (443, 'enough'),\n",
+       " (444, 'future'),\n",
+       " (444, 'futures'),\n",
+       " (445, 'kind'),\n",
+       " (445, 'kinds'),\n",
+       " (446, 'lower'),\n",
+       " (446, 'lowers'),\n",
+       " (447, 'authority'),\n",
+       " (447, 'authorities'),\n",
+       " (448, 'past'),\n",
+       " (449, 'natural'),\n",
+       " (450, 'six'),\n",
+       " (451, 'food'),\n",
+       " (451, 'foods'),\n",
+       " (452, 'working'),\n",
+       " (452, 'workings'),\n",
+       " (453, 'central'),\n",
+       " (454, 'college'),\n",
+       " (454, 'colleges'),\n",
+       " (455, 'self'),\n",
+       " (455, 'selves'),\n",
+       " (456, 'products'),\n",
+       " (456, 'product'),\n",
+       " (457, 'model'),\n",
+       " (457, 'models'),\n",
+       " (458, 'brought'),\n",
+       " (459, 'greater'),\n",
+       " (460, 'test'),\n",
+       " (460, 'tests'),\n",
+       " (461, 'nor'),\n",
+       " (462, 'students'),\n",
+       " (462, 'student'),\n",
+       " (463, 'private'),\n",
+       " (464, 'construction'),\n",
+       " (464, 'constructions'),\n",
+       " (465, 'perhaps'),\n",
+       " (466, 'ground'),\n",
+       " (466, 'grounds'),\n",
+       " (467, 'sir'),\n",
+       " (468, 'basis'),\n",
+       " (469, 'months'),\n",
+       " (469, 'month'),\n",
+       " (470, 'growth'),\n",
+       " (470, 'growths'),\n",
+       " (471, 'increased'),\n",
+       " (472, 'east'),\n",
+       " (473, 'language'),\n",
+       " (473, 'languages'),\n",
+       " (474, 'rule'),\n",
+       " (474, 'rules'),\n",
+       " (475, 'continued'),\n",
+       " (476, 'quite'),\n",
+       " (477, 'except'),\n",
+       " (478, 'series'),\n",
+       " (479, 'practice'),\n",
+       " (479, 'practices'),\n",
+       " (480, 'night'),\n",
+       " (480, 'knight'),\n",
+       " (480, 'nights'),\n",
+       " (481, 'eyes'),\n",
+       " (481, 'eye'),\n",
+       " (482, 'oil'),\n",
+       " (482, 'oils'),\n",
+       " (483, 'art'),\n",
+       " (483, 'arts'),\n",
+       " (484, 'told'),\n",
+       " (485, 'especially'),\n",
+       " (486, 'population'),\n",
+       " (486, 'populations'),\n",
+       " (487, 'science'),\n",
+       " (487, 'sciences'),\n",
+       " (488, 'whom'),\n",
+       " (489, 'obtained'),\n",
+       " (490, 'capital'),\n",
+       " (490, 'capitol'),\n",
+       " (490, 'capitals'),\n",
+       " (491, 'include'),\n",
+       " (492, 'generally'),\n",
+       " (493, 'meeting'),\n",
+       " (493, 'meetings'),\n",
+       " (494, 'specific'),\n",
+       " (494, 'specifics'),\n",
+       " (495, 'described'),\n",
+       " (496, 'believe'),\n",
+       " (497, 'review'),\n",
+       " (497, 'reviews'),\n",
+       " (498, 'issue'),\n",
+       " (498, 'issues'),\n",
+       " (499, 'respect'),\n",
+       " (499, 'respects'),\n",
+       " (500, 'contract'),\n",
+       " (500, 'contracts'),\n",
+       " (501, 'became'),\n",
+       " (502, 'medical'),\n",
+       " (503, 'road'),\n",
+       " (503, 'roads'),\n",
+       " (504, 'got'),\n",
+       " (505, 'clear'),\n",
+       " (505, 'clears'),\n",
+       " (506, 'main'),\n",
+       " (506, 'mains'),\n",
+       " (507, 'labor'),\n",
+       " (507, 'labors'),\n",
+       " (508, 'operation'),\n",
+       " (508, 'operations'),\n",
+       " (509, 'size'),\n",
+       " (509, 'sizes'),\n",
+       " (510, 'below'),\n",
+       " (511, 'hours'),\n",
+       " (511, 'hour'),\n",
+       " (512, 'addition'),\n",
+       " (512, 'additions'),\n",
+       " (513, 'probably'),\n",
+       " (514, 'century'),\n",
+       " (514, 'centuries'),\n",
+       " (515, 'personal'),\n",
+       " (516, 'plant'),\n",
+       " (516, 'plants'),\n",
+       " (517, 'training'),\n",
+       " (518, 'design'),\n",
+       " (518, 'designs'),\n",
+       " (519, 'statement'),\n",
+       " (519, 'statements'),\n",
+       " (520, 'structure'),\n",
+       " (520, 'structures'),\n",
+       " (521, 'project'),\n",
+       " (521, 'projects'),\n",
+       " (522, 'million'),\n",
+       " (522, 'millions'),\n",
+       " (523, 'usually'),\n",
+       " (524, 'range'),\n",
+       " (524, 'ranges'),\n",
+       " (525, 'call'),\n",
+       " (525, 'calls'),\n",
+       " (526, 'mother'),\n",
+       " (526, 'mothers'),\n",
+       " (527, 'seems'),\n",
+       " (528, 'standard'),\n",
+       " (528, 'standards'),\n",
+       " (529, 'return'),\n",
+       " (529, 'returns'),\n",
+       " (530, 'title'),\n",
+       " (530, 'titles'),\n",
+       " (531, 'established'),\n",
+       " (532, 'keep'),\n",
+       " (532, 'keeps'),\n",
+       " (533, 'space'),\n",
+       " (533, 'spaces'),\n",
+       " (534, 'annual'),\n",
+       " (535, 'record'),\n",
+       " (535, 'records'),\n",
+       " (536, 'close'),\n",
+       " (536, 'closes'),\n",
+       " (537, 'april'),\n",
+       " (538, 'complete'),\n",
+       " (539, 'page'),\n",
+       " (539, 'pages'),\n",
+       " (540, 'heart'),\n",
+       " (540, 'hearts'),\n",
+       " (541, 'fig'),\n",
+       " (541, 'figs'),\n",
+       " (542, 'quality'),\n",
+       " (542, 'qualities'),\n",
+       " (543, 'gas'),\n",
+       " (543, 'gases'),\n",
+       " (544, 'letter'),\n",
+       " (544, 'letters'),\n",
+       " (545, 'stock'),\n",
+       " (545, 'stocks'),\n",
+       " (546, 'gave'),\n",
+       " (547, 'related'),\n",
+       " (548, 'administration'),\n",
+       " (548, 'administrations'),\n",
+       " (549, 'activities'),\n",
+       " (549, 'activity'),\n",
+       " (550, 'theory'),\n",
+       " (550, 'theories'),\n",
+       " (551, 'town'),\n",
+       " (551, 'towns'),\n",
+       " (552, 'equipment'),\n",
+       " (552, 'equipments'),\n",
+       " (553, 'soon'),\n",
+       " (554, 'decision'),\n",
+       " (554, 'decisions'),\n",
+       " (555, 'pressure'),\n",
+       " (555, 'pressures'),\n",
+       " (556, 'written'),\n",
+       " (557, 'corporation'),\n",
+       " (557, 'corporations'),\n",
+       " (558, 'tell'),\n",
+       " (558, 'tells'),\n",
+       " (559, 'agreement'),\n",
+       " (559, 'agreements'),\n",
+       " (560, 'reported'),\n",
+       " (561, 'attention'),\n",
+       " (561, 'attentions'),\n",
+       " (562, 'fire'),\n",
+       " (562, 'fires'),\n",
+       " (563, 'direct'),\n",
+       " (564, 'saw'),\n",
+       " (564, 'saws'),\n",
+       " (565, 'published'),\n",
+       " (566, 'temperature'),\n",
+       " (566, 'temperatures'),\n",
+       " (567, 'species'),\n",
+       " (567, 'specie'),\n",
+       " (568, 'really'),\n",
+       " (569, 'function'),\n",
+       " (569, 'functions'),\n",
+       " (570, 'military'),\n",
+       " (571, 'proposed'),\n",
+       " (572, 'january'),\n",
+       " (573, 'additional'),\n",
+       " (574, 'late'),\n",
+       " (575, 'opinion'),\n",
+       " (575, 'opinions'),\n",
+       " (576, 'loss'),\n",
+       " (576, 'losses'),\n",
+       " (577, 'limited'),\n",
+       " (578, 'source'),\n",
+       " (578, 'sources'),\n",
+       " (579, 'article'),\n",
+       " (579, 'articles'),\n",
+       " (580, 'notice'),\n",
+       " (580, 'notices'),\n",
+       " (581, 'security'),\n",
+       " (581, 'securities'),\n",
+       " (582, 'organization'),\n",
+       " (582, 'organizations'),\n",
+       " (582, 'organisation'),\n",
+       " (582, 'organisations'),\n",
+       " (583, 'financial'),\n",
+       " (584, 'follows'),\n",
+       " (585, 'miles'),\n",
+       " (585, 'mile'),\n",
+       " (586, 'chief'),\n",
+       " (586, 'chiefs'),\n",
+       " (587, 'distribution'),\n",
+       " (587, 'distributions'),\n",
+       " (588, 'sometimes'),\n",
+       " (589, 'insurance'),\n",
+       " (590, 'son'),\n",
+       " (590, 'sons'),\n",
+       " (591, 'strong'),\n",
+       " (592, 'length'),\n",
+       " (592, 'lengths'),\n",
+       " (593, 'original'),\n",
+       " (593, 'originals'),\n",
+       " (594, 'yes'),\n",
+       " (595, 'effective'),\n",
+       " (596, 'defendant'),\n",
+       " (596, 'defendants'),\n",
+       " (597, 'living'),\n",
+       " (598, 'december'),\n",
+       " (599, 'character'),\n",
+       " (599, 'characters'),\n",
+       " (600, 'began'),\n",
+       " (601, 'carried'),\n",
+       " (602, 'supply'),\n",
+       " (602, 'supplies'),\n",
+       " (603, 'blood'),\n",
+       " (604, 'taking'),\n",
+       " (605, 'manner'),\n",
+       " (605, 'manners'),\n",
+       " (606, 'journal'),\n",
+       " (606, 'journals'),\n",
+       " (607, 'hundred'),\n",
+       " (607, 'hundreds'),\n",
+       " (608, 'red'),\n",
+       " (609, 'developed'),\n",
+       " (610, 'performance'),\n",
+       " (610, 'performances'),\n",
+       " (611, 'situation'),\n",
+       " (611, 'situations'),\n",
+       " (612, 'felt'),\n",
+       " (613, 'workers'),\n",
+       " (613, 'worker'),\n",
+       " (614, 'volume'),\n",
+       " (614, 'volumes'),\n",
+       " (615, 'presented'),\n",
+       " (616, 'knew'),\n",
+       " (617, 'answer'),\n",
+       " (617, 'answers'),\n",
+       " (618, 'resources'),\n",
+       " (618, 'resource'),\n",
+       " (619, 'industrial'),\n",
+       " (620, 'twenty'),\n",
+       " (620, 'twenties'),\n",
+       " (621, 'sent'),\n",
+       " (622, 'looked'),\n",
+       " (623, 'library'),\n",
+       " (623, 'libraries'),\n",
+       " (624, 'added'),\n",
+       " (625, 'passed'),\n",
+       " (626, 'ten'),\n",
+       " (626, 'tens'),\n",
+       " (627, 'sea'),\n",
+       " (627, 'seas'),\n",
+       " (628, 'applied'),\n",
+       " (629, 'included'),\n",
+       " (630, 'physical'),\n",
+       " (631, 'across'),\n",
+       " (632, 'army'),\n",
+       " (632, 'armies'),\n",
+       " (633, 'toward'),\n",
+       " (634, 'produced'),\n",
+       " (635, 'placed'),\n",
+       " (636, 'role'),\n",
+       " (636, 'roles'),\n",
+       " (637, 'october'),\n",
+       " (638, 'final'),\n",
+       " (639, 'approach'),\n",
+       " (639, 'approaches'),\n",
+       " (640, 'provisions'),\n",
+       " (640, 'provision'),\n",
+       " (641, 'leave'),\n",
+       " (642, 'director'),\n",
+       " (642, 'directors'),\n",
+       " (643, 'employment'),\n",
+       " (643, 'employments'),\n",
+       " (644, 'anything'),\n",
+       " (645, 'particularly'),\n",
+       " (646, 'hard'),\n",
+       " (647, 'outside'),\n",
+       " (648, 'week'),\n",
+       " (648, 'weeks'),\n",
+       " (649, 'feel'),\n",
+       " (649, 'feels'),\n",
+       " (650, 'charge'),\n",
+       " (650, 'charges'),\n",
+       " (651, 'indeed'),\n",
+       " (652, 'degree'),\n",
+       " (652, 'degrees'),\n",
+       " (653, 'reference'),\n",
+       " ...]"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "final_wordlist"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "with open(\"final_wordlist.csv\", \"w\") as f:\n",
+    "    f.write(\"word,number\\n\")\n",
+    "    \n",
+    "    for w in final_wordlist:\n",
+    "        # w is a (number, word) tuple; a missing word is written as an empty field\n",
+    "        word = w[1] or \"\"\n",
+    "        f.write(f\"{word.upper()},{w[0]}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c88fe193-11cc-4a06-a3cf-d1ad85f44d14",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "final_wordlist"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a0d177b-3499-42fb-8091-29547567d69a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/wordlist/wordlist-new2.ipynb b/wordlist/wordlist-new2.ipynb
new file mode 100644
index 0000000..52a0d2f
--- /dev/null
+++ b/wordlist/wordlist-new2.ipynb
@@ -0,0 +1,220 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "991a711f-be98-4aae-a657-84b065449916",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    _initialized  # undefined until this cell has run once\n",
+    "except NameError:\n",
+    "    # !pip install spacy\n",
+    "    # !python -m spacy download en_core_web_trf\n",
+    "    import spacy\n",
+    "    \n",
+    "    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
+    "    \n",
+    "    _initialized=True\n",
+    "    \n",
+    "import pandas as pd\n",
+    "import gzip\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "d130bb84",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_lines(filename):\n",
+    "    with gzip.open(filename, 'r') as f:\n",
+    "        ret = []\n",
+    "        for l in f:\n",
+    "            if len(ret) > 30_000:\n",
+    "                return ret\n",
+    "            ret.append(str(l).lower())\n",
+    "        return ret\n",
+    "\n",
+    "\n",
+    "    \n",
+    "WORDLIST_SIZE = 8192 + 3\n",
+    "word_re = re.compile(r\"^[A-Za-z]+$\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "de2d1731",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pwd\n",
+    "!ls"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "90665714",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "annotated_words=pd.read_excel(\"annotated_words.ods\")\n",
+    "\n",
+    "excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n",
+    "excluded_words[0:10]\n",
+    "\n",
+    "custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n",
+    "\n",
+    "custom_maps = [\n",
+    "    (m[1][\"word\"].lower(), mapping.lower())\n",
+    "    for m in custom_maps.iterrows()\n",
+    "    for mapping in m[1][\"maps_to\"]\n",
+    "]\n",
+    "custom_maps"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "fb50c69e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Start parsing the wordlist\n",
+    "all_words = get_lines(\"00-frequency-all.txt.gz\")\n",
+    "\n",
+    "# Delete header line\n",
+    "all_words = all_words[1:]\n",
+    "\n",
+    "# Get only the word (fixed width)\n",
+    "all_words = [w[13:36].strip() for w in all_words]\n",
+    "\n",
+    "# Keep only entries made up purely of ASCII letters (see word_re)\n",
+    "all_words = [w for w in all_words if word_re.search(w)]\n",
+    "\n",
+    "# Drop every word listed in excluded_words\n",
+    "all_words = [w for w in all_words if w not in excluded_words]\n",
+    "\n",
+    "# Add both sides of every custom (word, maps_to) pair that is still missing\n",
+    "for m in custom_maps:\n",
+    "    if m[0] not in all_words:\n",
+    "        all_words.append(m[0])\n",
+    "    if m[1] not in all_words:\n",
+    "        all_words.append(m[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "cd21bff5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Lemmatize all words (plural -> singular)\n",
+    "lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in all_words[:100]]\n",
+    "print(lemmatize_mappings[:100])\n",
+    "\n",
+    "# Add custom lemmatizations\n",
+    "for l in custom_maps:\n",
+    "    if l in lemmatize_mappings:\n",
+    "        print(f\"Warning: {l} is already lemmatized\")\n",
+    "    else:\n",
+    "        lemmatize_mappings.append(l)\n",
+    "        \n",
+    "print(lemmatize_mappings[:100])\n",
+    "\n",
+    "lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]\n",
+    "print(lemmatize_mappings[:100])\n",
+    "\n",
+    "# Now, re-add all lemmatized words to the list of every word\n",
+    "for w in sum(lemmatize_mappings, ()):\n",
+    "    if w not in all_words:\n",
+    "        print(w)\n",
+    "        all_words.append(w)\n",
+    "        \n",
+    "lemmatize_mappings = {k: v for k, v in lemmatize_mappings}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "0ee9af7d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_wordlist = []\n",
+    "seen_lemmatizations = set()\n",
+    "for w in all_words:\n",
+    "    lemmatized = lemmatize_mappings.get(w) or w\n",
+    "    if lemmatized in seen_lemmatizations:\n",
+    "        # The lemmatized version of this word was already seen\n",
+    "        continue\n",
+    "    else:\n",
+    "        # The lemmatized version hasn't been seen. We're good to add it\n",
+    "        final_wordlist.append([\n",
+    "            k\n",
+    "            for k\n",
+    "            in lemmatize_mappings.keys()\n",
+    "            if lemmatize_mappings[k] == lemmatized\n",
+    "        ])\n",
+    "        seen_lemmatizations.add(lemmatized)\n",
+    "\n",
+    "    if len(final_wordlist) >= WORDLIST_SIZE:\n",
+    "        break\n",
+    "\n",
+    "# Now, convert it to the format (number, word)\n",
+    "final_wordlist = [\n",
+    "    (idx, w)\n",
+    "    for idx, words in enumerate(final_wordlist)\n",
+    "    for w in words\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "07c1293c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(len(lemmatize_mappings))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19c255d0",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/wordlist/wordlist-new2.py b/wordlist/wordlist-new2.py
new file mode 100755
index 0000000..ca84229
--- /dev/null
+++ b/wordlist/wordlist-new2.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+print("Step 1")
+
+
+# One-time, expensive setup. The lookup of `_initialized` only succeeds when
+# this code is re-run in an interpreter session that already executed it, in
+# which case the model is not loaded a second time.
+# NOTE(review): presumably a leftover from the notebook version of this
+# script; on a plain command-line run the guard always triggers.
+try:
+    _initialized
+except NameError:
+    # A bare name lookup can only fail with NameError, so nothing else is
+    # caught here.
+    # !pip install spacy
+    # !python -m spacy download en_core_web_trf
+    import spacy
+    from tqdm import tqdm
+
+    # Only the lemma of each token is read later in this script, so the
+    # parser and NER components are switched off.
+    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
+
+    _initialized = True
+
+import pandas as pd
+import gzip
+import re
+
+
+print("Step 2")
+
+
+def get_lines(filename):
+    """Return the leading lines of a gzip-compressed file, lower-cased.
+
+    The file is opened in binary mode, so every line is a ``bytes`` object
+    and ``str(line)`` produces its printable representation (the content
+    wrapped in ``b'...'`` with escape sequences spelled out), not decoded
+    text. That representation is what gets lower-cased and returned.
+
+    Collection stops once more than 30,000 lines have been gathered, so at
+    most 30,001 entries are returned.
+    """
+    collected = []
+    with gzip.open(filename, 'r') as handle:
+        for raw in handle:
+            if len(collected) > 30_000:
+                break
+            collected.append(str(raw).lower())
+    return collected
+
+
+
+# Number of word groups to collect before the selection loop stops.
+# NOTE(review): 8192 is 2**13; why 3 extra groups are requested is not
+# explained anywhere in this script -- confirm before changing.
+WORDLIST_SIZE = 8192 + 3
+# Matches strings made up solely of ASCII letters.
+word_re = re.compile(r"^[A-Za-z]+$")
+
+
+print("Step 3")
+
+
+# Spreadsheet of manually reviewed words; the columns read here are "word",
+# "keep" and "maps_to".
+annotated_words = pd.read_excel("annotated_words.ods")
+
+# Every word whose "keep" cell is anything other than "Yes" is excluded.
+# Stored as a set because it is only used for membership tests.
+excluded_words = set(
+    annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower()
+)
+
+# "maps_to" holds a comma-separated list of words. Expand it into one
+# lower-cased (word, mapped_word) pair per entry. Whitespace around each
+# entry is dropped and empty entries are skipped, so a cell such as "a, b"
+# yields "b" rather than " b".
+mapped_rows = annotated_words[annotated_words["maps_to"].notna()]
+custom_maps = [
+    (word.lower(), target.strip().lower())
+    for word, targets in zip(mapped_rows["word"], mapped_rows["maps_to"])
+    for target in targets.split(",")
+    if target.strip()
+]
+
+
+print("Step 4")
+
+
+# Start parsing the wordlist
+all_words = get_lines("00-frequency-all.txt.gz")
+
+# Delete header line
+all_words = all_words[1:]
+
+# Get only the word (fixed width)
+# NOTE(review): get_lines returns the printable form of each bytes line, so
+# these offsets count the leading b' as two characters -- confirm against
+# the layout of the frequency file before touching either place.
+all_words = [w[13:36].strip() for w in all_words]
+
+# Remove special characters
+all_words = [w for w in all_words if word_re.search(w)]
+
+# Remove all excluded words
+all_words = [w for w in all_words if w not in excluded_words]
+
+# Add all custom mappings: both members of every (word, mapped_word) pair
+# must be present in the word list. The pairs are walked directly; flattening
+# them first and then indexing each element would pick out single characters
+# of the words instead of the words themselves.
+known_words = set(all_words)
+for pair in custom_maps:
+    for w in pair:
+        if w not in known_words:
+            known_words.add(w)
+            all_words.append(w)
+
+
+print("Step 5")
+
+
+# Lemmatize all words (plural -> singular). Each word is run through the
+# pipeline on its own and the lemma of its first token is kept.
+lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
+print(lemmatize_mappings[:100])
+
+# Add custom lemmatizations. A set mirrors the list so the duplicate check
+# does not rescan every pair for each custom mapping.
+known_pairs = set(lemmatize_mappings)
+for pair in custom_maps:
+    if pair in known_pairs:
+        print(f"Warning: {pair} is already lemmatized")
+    else:
+        known_pairs.add(pair)
+        lemmatize_mappings.append(pair)
+
+print(lemmatize_mappings[:100])
+
+# Drop every mapping whose target (lemma) is an excluded word.
+lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]
+print(lemmatize_mappings[:100])
+
+# Now, re-add all lemmatized words to the list of every word. Newly added
+# words are printed; the order is word first, then lemma, pair by pair.
+known_words = set(all_words)
+for pair in lemmatize_mappings:
+    for w in pair:
+        if w not in known_words:
+            print(w)
+            known_words.add(w)
+            all_words.append(w)
+
+# Later pairs win when a word occurs more than once, so a custom mapping
+# (appended above) overrides the automatically produced lemma for that word.
+lemmatize_mappings = dict(lemmatize_mappings)
+
+
+print("Step 6")
+
+
+# Index the mapping by lemma once, so that fetching a group below is a dict
+# access instead of a scan over every mapping. Words inside a group keep the
+# insertion order of lemmatize_mappings.
+words_by_lemma = {}
+for word, lemma in lemmatize_mappings.items():
+    words_by_lemma.setdefault(lemma, []).append(word)
+
+final_wordlist = []
+seen_lemmatizations = set()
+for w in all_words:
+    # A word without a mapping (or with an empty lemma) stands for itself.
+    lemmatized = lemmatize_mappings.get(w) or w
+    if lemmatized in seen_lemmatizations:
+        # The lemmatized version of this word was already seen
+        continue
+
+    # The lemmatized version hasn't been seen: its group is every word that
+    # maps to it.
+    # NOTE(review): the group is empty when no word maps to `lemmatized`; it
+    # still counts towards WORDLIST_SIZE and uses up a number -- confirm
+    # whether that is intended.
+    final_wordlist.append(list(words_by_lemma.get(lemmatized, ())))
+    seen_lemmatizations.add(lemmatized)
+
+    if len(final_wordlist) >= WORDLIST_SIZE:
+        break
+
+# Now, convert it to the format (number, word): every word of a group shares
+# the group's position as its number.
+final_wordlist = [
+    (idx, w)
+    for idx, words in enumerate(final_wordlist)
+    for w in words
+]
+
+
+print("Step 7")
+
+print(len(lemmatize_mappings))
+
+print("Step 8")
+
+# Write one "WORD,number" row per (number, word) pair, words upper-cased,
+# below a fixed header line. The encoding is set explicitly so the output
+# does not depend on the platform default.
+with open("01-generated-wordlist.csv", "w", encoding="utf-8") as f:
+    f.write("word,number\n")
+    f.writelines(
+        f"{word.upper()},{number}\n"
+        for number, word in final_wordlist
+    )
+
+print("Done")