Start working on new wordlist generation

2023-03-01 22:26:06 -05:00 · 2023-03-01 22:26:06 -05:00 · 6ebe8cd489
commit 6ebe8cd489
parent c034652d86
11 changed files with 3392 additions and 19 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,3 +8,4 @@
 **/.ipynb_checkpoints
 /target
 /test-data/generator/build/
+/wordlist/venv
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@ -1,12 +0,0 @@
-click==8.1.3
-defusedxml==0.7.1
-joblib==1.2.0
-nltk==3.8.1
-numpy==1.24.2
-odfpy==1.4.1
-pandas==1.5.3
-python-dateutil==2.8.2
-pytz==2022.7.1
-regex==2022.10.31
-six==1.16.0
-tqdm==4.64.1
--- a/docs/wordlist-new.ipynb
+++ b/docs/wordlist-new.ipynb
@ -12,21 +12,34 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
-      "Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
-      "Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
+      "Collecting nltk\n",
+      "  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hCollecting odfpy\n",
+      "  Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hCollecting regex>=2021.8.3\n",
+      "  Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m29.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
      "Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
-      "Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
-      "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
+      "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
+      "Building wheels for collected packages: odfpy\n",
+      "  Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=5bfe9fcd7c590666411d404ea3e4ef0f704c9e62ff6621deb4ab09c84bec082a\n",
+      "  Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
+      "Successfully built odfpy\n",
+      "Installing collected packages: regex, odfpy, nltk\n",
+      "Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
-      "[nltk_data]   Package wordnet is already up-to-date!\n"
+      "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
     ]
    }
   ],
@ -64,6 +77,8 @@
    "            ret.append(str(l).lower())\n",
    "        return ret\n",
    "\n",
+    "\n",
+    "    \n",
    "WORDLIST_SIZE = 8192 + 3\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "word_re = re.compile(r\"^[A-Za-z]+$\")"
--- a/docs/wordlist-new2.ipynb
+++ b/docs/wordlist-new2.ipynb
@ -0,0 +1,112 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "991a711f-be98-4aae-a657-84b065449916",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: spacy in /opt/conda/lib/python3.10/site-packages (3.5.0)\n",
+      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.4)\n",
+      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.3.0)\n",
+      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.12)\n",
+      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.10.5)\n",
+      "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.1.2)\n",
+      "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.7.0)\n",
+      "Requirement already satisfied: numpy>=1.15.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.23.5)\n",
+      "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (6.3.0)\n",
+      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.8)\n",
+      "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from spacy) (67.3.2)\n",
+      "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (23.0)\n",
+      "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.1.1)\n",
+      "Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (8.1.7)\n",
+      "Requirement already satisfied: pathy>=0.10.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.10.1)\n",
+      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.8)\n",
+      "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.4.6)\n",
+      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (4.64.1)\n",
+      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.7)\n",
+      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.9)\n",
+      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.28.2)\n",
+      "Requirement already satisfied: typing-extensions>=4.2.0 in /opt/conda/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy) (4.4.0)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2022.12.7)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.1.1)\n",
+      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.14)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)\n",
+      "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.7.9)\n",
+      "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.0.4)\n",
+      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/conda/lib/python3.10/site-packages (from typer<0.8.0,>=0.3.0->spacy) (8.1.3)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->spacy) (2.1.2)\n",
+      "Collecting en-core-web-trf==3.5.0\n",
+      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl (460.3 MB)\n",
+      "\u001b[2K     \u001b[91m━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.1/460.3 MB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:14\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "# One-time setup: `_initialized` is undefined on a fresh kernel, so the\n",
+    "# NameError branch installs spaCy, fetches the model and loads it once.\n",
+    "try:\n",
+    "    _initialized\n",
+    "except NameError:\n",
+    "    !pip install spacy\n",
+    "    !python -m spacy download en_core_web_trf\n",
+    "    import spacy\n",
+    "    \n",
+    "    # Keep a reference to the pipeline so the expensive load is not thrown away\n",
+    "    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
+    "    \n",
+    "    _initialized=True\n",
+    "    \n",
+    "import pandas as pd\n",
+    "import gzip\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6b93818f-c54a-4c88-9968-df4244b7c6f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import spacy\n",
+    "\n",
+    "# Load the transformer pipeline by its full package name (spaCy 3 removed the\n",
+    "# 'en' shortcut); parser and NER are disabled as they are not needed here\n",
+    "nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
+    "\n",
+    "sentence = \"The striped bats are hanging on their feet for best\"\n",
+    "\n",
+    "# Parse the sentence using the loaded pipeline object `nlp`\n",
+    "doc = nlp(sentence)\n",
+    "\n",
+    "# Extract the lemma for each token and join\n",
+    "\" \".join([token.lemma_ for token in doc])\n",
+    "# spaCy 2 printed: 'the strip bat be hang on -PRON- foot for good'\n",
+    "# NOTE(review): spaCy 3 dropped the -PRON- placeholder, so expect different output"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/wordlist/01-lemmatized-words.csv
+++ b/wordlist/01-lemmatized-words.csv
@ -0,0 +1 @@
+word,lemmatized_word
--- a/wordlist/01-lemmatized-words.py
+++ b/wordlist/01-lemmatized-words.py
@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+print("Step 1")
+
+
+try:
+    _initialized
+except:
+    # !pip install spacy
+    # !python -m spacy download en_core_web_trf
+    import spacy
+    from tqdm import tqdm
+
+    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
+
+    _initialized=True
+
+import pandas as pd
+import gzip
+import re
+
+
+print("Step 2")
+
+
+def get_lines(filename):
+    with gzip.open(filename, 'r') as f:
+        ret = []
+        for l in f:
+            if len(ret) > 30_000:
+                return ret
+            ret.append(str(l).lower())
+        return ret
+
+
+
+WORDLIST_SIZE = 8192 + 3
+word_re = re.compile(r"^[A-Za-z]+$")
+
+
+print("Step 3")
+
+
+annotated_words=pd.read_excel("annotated_words.ods")
+
+excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
+excluded_words[0:10]
+
+custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))
+
+custom_maps = [
+    (m[1]["word"].lower(), mapping.lower())
+    for m in custom_maps.iterrows()
+    for mapping in m[1]["maps_to"]
+]
+custom_maps
+
+
+print("Step 4")
+
+
+# Start parsing the wordlist
+all_words = get_lines("00-frequency-all.txt.gz")
+
+# Delete header line
+all_words = all_words[1:]
+
+# Get only the word (fixed width)
+all_words = [w[13:36].strip() for w in all_words]
+
+# Remove special characters
+all_words = [w for w in all_words if word_re.search(w)]
+
+# Remove all removed words
+all_words = [w for w in all_words if w not in excluded_words]
+
+# Add all custom mappings
+for m in list(sum(custom_maps, ())):
+    if m[0] not in all_words:
+        all_words.append(m[0])
+    if m[1] not in all_words:
+        all_words.append(m[1])
+
+
+print("Step 5")
+
+# Lemmatize all words (plural -> singular)
+lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
+
+with open("01-lemmatized-words.csv", "w") as f:
+    f.write("word,lemmatized_word\n")
+
+    for w in lemmatize_mappings:
+        f.write(f"{w[0]},{w[1]}")
+        f.write("\n")
--- a/wordlist/annotated_words.ods
+++ b/wordlist/annotated_words.ods
--- a/wordlist/requirements.txt
+++ b/wordlist/requirements.txt
@ -0,0 +1,124 @@
+anyio==3.6.2
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+arrow==1.2.3
+asttokens==2.2.1
+attrs==22.2.0
+backcall==0.2.0
+beautifulsoup4==4.11.2
+bleach==6.0.0
+blis==0.7.9
+catalogue==2.0.8
+certifi==2022.12.7
+cffi==1.15.1
+charset-normalizer==3.0.1
+click==8.1.3
+comm==0.1.2
+confection==0.0.4
+cymem==2.0.7
+debugpy==1.6.6
+decorator==5.1.1
+defusedxml==0.7.1
+en-core-web-trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl
+executing==1.2.0
+fastjsonschema==2.16.3
+filelock==3.9.0
+fqdn==1.5.1
+huggingface-hub==0.12.1
+idna==3.4
+ipykernel==6.21.2
+ipython==8.11.0
+ipython-genutils==0.2.0
+ipywidgets==8.0.4
+isoduration==20.11.0
+jedi==0.18.2
+Jinja2==3.1.2
+jsonpointer==2.3
+jsonschema==4.17.3
+jupyter==1.0.0
+jupyter-console==6.6.2
+jupyter-events==0.6.3
+jupyter_client==8.0.3
+jupyter_core==5.2.0
+jupyter_server==2.3.0
+jupyter_server_terminals==0.4.4
+jupyterlab-pygments==0.2.2
+jupyterlab-widgets==3.0.5
+langcodes==3.3.0
+MarkupSafe==2.1.2
+matplotlib-inline==0.1.6
+mistune==2.0.5
+murmurhash==1.0.9
+nbclassic==0.5.2
+nbclient==0.7.2
+nbconvert==7.2.9
+nbformat==5.7.3
+nest-asyncio==1.5.6
+notebook==6.5.2
+notebook_shim==0.2.2
+numpy==1.24.2
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cudnn-cu11==8.5.0.96
+odfpy==1.4.1
+packaging==23.0
+pandas==1.5.3
+pandocfilters==1.5.0
+parso==0.8.3
+pathy==0.10.1
+pexpect==4.8.0
+pickleshare==0.7.5
+platformdirs==3.0.0
+preshed==3.0.8
+prometheus-client==0.16.0
+prompt-toolkit==3.0.38
+psutil==5.9.4
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pycparser==2.21
+pydantic==1.10.5
+Pygments==2.14.0
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+python-json-logger==2.0.7
+pytz==2022.7.1
+PyYAML==6.0
+pyzmq==25.0.0
+qtconsole==5.4.0
+QtPy==2.3.0
+regex==2022.10.31
+requests==2.28.2
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+Send2Trash==1.8.0
+six==1.16.0
+smart-open==6.3.0
+sniffio==1.3.0
+soupsieve==2.4
+spacy==3.5.0
+spacy-alignments==0.9.0
+spacy-legacy==3.0.12
+spacy-loggers==1.0.4
+spacy-transformers==1.2.2
+srsly==2.4.6
+stack-data==0.6.2
+terminado==0.17.1
+thinc==8.1.7
+tinycss2==1.2.1
+tokenizers==0.13.2
+torch==1.13.1
+tornado==6.2
+tqdm==4.64.1
+traitlets==5.9.0
+transformers==4.26.1
+typer==0.7.0
+typing_extensions==4.5.0
+uri-template==1.2.0
+urllib3==1.26.14
+wasabi==1.1.1
+wcwidth==0.2.6
+webcolors==1.12
+webencodings==0.5.1
+websocket-client==1.5.1
+widgetsnbextension==4.0.5
--- a/wordlist/wordlist-new.ipynb
+++ b/wordlist/wordlist-new.ipynb
--- a/wordlist/wordlist-new2.ipynb
+++ b/wordlist/wordlist-new2.ipynb
@ -0,0 +1,220 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "991a711f-be98-4aae-a657-84b065449916",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# One-time setup: `_initialized` is undefined on a fresh kernel, so the\n",
+    "# NameError branch loads the spaCy pipeline exactly once per session.\n",
+    "try:\n",
+    "    _initialized\n",
+    "except NameError:\n",
+    "    # !pip install spacy\n",
+    "    # !python -m spacy download en_core_web_trf\n",
+    "    import spacy\n",
+    "    \n",
+    "    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
+    "    \n",
+    "    _initialized=True\n",
+    "    \n",
+    "import pandas as pd\n",
+    "import gzip\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "d130bb84",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_lines(filename):\n",
+    "    with gzip.open(filename, 'r') as f:\n",
+    "        ret = []\n",
+    "        for l in f:\n",
+    "            if len(ret) > 30_000:\n",
+    "                return ret\n",
+    "            ret.append(str(l).lower())\n",
+    "        return ret\n",
+    "\n",
+    "\n",
+    "    \n",
+    "WORDLIST_SIZE = 8192 + 3\n",
+    "word_re = re.compile(r\"^[A-Za-z]+$\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "de2d1731",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pwd\n",
+    "!ls"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "90665714",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "annotated_words=pd.read_excel(\"annotated_words.ods\")\n",
+    "\n",
+    "excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n",
+    "excluded_words[0:10]\n",
+    "\n",
+    "custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n",
+    "\n",
+    "custom_maps = [\n",
+    "    (m[1][\"word\"].lower(), mapping.lower())\n",
+    "    for m in custom_maps.iterrows()\n",
+    "    for mapping in m[1][\"maps_to\"]\n",
+    "]\n",
+    "custom_maps"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "fb50c69e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Start parsing the wordlist\n",
+    "all_words = get_lines(\"00-frequency-all.txt.gz\")\n",
+    "\n",
+    "# Delete header line\n",
+    "all_words = all_words[1:]\n",
+    "\n",
+    "# Get only the word (fixed width)\n",
+    "all_words = [w[13:36].strip() for w in all_words]\n",
+    "\n",
+    "# Remove special characters\n",
+    "all_words = [w for w in all_words if word_re.search(w)]\n",
+    "\n",
+    "# Remove all removed words\n",
+    "all_words = [w for w in all_words if w not in excluded_words]\n",
+    "\n",
+    "# Add all custom mappings. Iterate over the (word, target) pairs directly:\n",
+    "# flattening with sum(custom_maps, ()) yields bare strings, so m[0] and m[1]\n",
+    "# would be single characters instead of the two words.\n",
+    "for m in custom_maps:\n",
+    "    if m[0] not in all_words:\n",
+    "        all_words.append(m[0])\n",
+    "    if m[1] not in all_words:\n",
+    "        all_words.append(m[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "cd21bff5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Lemmatize all words (plural -> singular)\n",
+    "lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in all_words[:100]]\n",
+    "print(lemmatize_mappings[:100])\n",
+    "\n",
+    "# Add custom lemmatizations\n",
+    "for l in custom_maps:\n",
+    "    if l in lemmatize_mappings:\n",
+    "        print(f\"Warning: {l} is already lemmatized\")\n",
+    "    else:\n",
+    "        lemmatize_mappings.append(l)\n",
+    "        \n",
+    "print(lemmatize_mappings[:100])\n",
+    "\n",
+    "lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]\n",
+    "print(lemmatize_mappings[:100])\n",
+    "\n",
+    "# Now, re-add all lemmatized words to the list of every word\n",
+    "for w in sum(lemmatize_mappings, ()):\n",
+    "    if w not in all_words:\n",
+    "        print(w)\n",
+    "        all_words.append(w)\n",
+    "        \n",
+    "lemmatize_mappings = {k: v for k, v in lemmatize_mappings}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "0ee9af7d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_wordlist = []\n",
+    "seen_lemmatizations = set()\n",
+    "for w in all_words:\n",
+    "    lemmatized = lemmatize_mappings.get(w) or w\n",
+    "    if lemmatized in seen_lemmatizations:\n",
+    "        # The lemmatized version of this word was already seen\n",
+    "        continue\n",
+    "    else:\n",
+    "        # The lemmatized version hasn't been seen. We're good to add it\n",
+    "        final_wordlist.append([\n",
+    "            k\n",
+    "            for k\n",
+    "            in lemmatize_mappings.keys()\n",
+    "            if lemmatize_mappings[k] == lemmatized\n",
+    "        ])\n",
+    "        seen_lemmatizations.add(lemmatized)\n",
+    "\n",
+    "    if len(final_wordlist) >= WORDLIST_SIZE:\n",
+    "        break\n",
+    "\n",
+    "# Now, convert it to the format (number, word)\n",
+    "final_wordlist = [\n",
+    "    (idx, w)\n",
+    "    for idx, words in enumerate(final_wordlist)\n",
+    "    for w in words\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "07c1293c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(len(lemmatize_mappings))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19c255d0",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/wordlist/wordlist-new2.py
+++ b/wordlist/wordlist-new2.py
@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+print("Step 1")
+
+
+try:
+    _initialized
+except:
+    # !pip install spacy
+    # !python -m spacy download en_core_web_trf
+    import spacy
+    from tqdm import tqdm
+
+    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
+
+    _initialized=True
+
+import pandas as pd
+import gzip
+import re
+
+
+print("Step 2")
+
+
+def get_lines(filename):
+    with gzip.open(filename, 'r') as f:
+        ret = []
+        for l in f:
+            if len(ret) > 30_000:
+                return ret
+            ret.append(str(l).lower())
+        return ret
+
+
+
+WORDLIST_SIZE = 8192 + 3
+word_re = re.compile(r"^[A-Za-z]+$")
+
+
+print("Step 3")
+
+
+annotated_words=pd.read_excel("annotated_words.ods")
+
+excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
+excluded_words[0:10]
+
+custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))
+
+custom_maps = [
+    (m[1]["word"].lower(), mapping.lower())
+    for m in custom_maps.iterrows()
+    for mapping in m[1]["maps_to"]
+]
+custom_maps
+
+
+print("Step 4")
+
+
+# Start parsing the wordlist
+all_words = get_lines("00-frequency-all.txt.gz")
+
+# Delete header line
+all_words = all_words[1:]
+
+# Get only the word (fixed width)
+all_words = [w[13:36].strip() for w in all_words]
+
+# Remove special characters
+all_words = [w for w in all_words if word_re.search(w)]
+
+# Remove all removed words
+all_words = [w for w in all_words if w not in excluded_words]
+
+# Add all custom mappings
+for m in list(sum(custom_maps, ())):
+    if m[0] not in all_words:
+        all_words.append(m[0])
+    if m[1] not in all_words:
+        all_words.append(m[1])
+
+
+print("Step 5")
+
+
+# Lemmatize all words (plural -> singular)
+lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
+print(lemmatize_mappings[:100])
+
+# Add custom lemmatizations
+for l in custom_maps:
+    if l in lemmatize_mappings:
+        print(f"Warning: {l} is already lemmatized")
+    else:
+        lemmatize_mappings.append(l)
+
+print(lemmatize_mappings[:100])
+
+lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]
+print(lemmatize_mappings[:100])
+
+# Now, re-add all lematized words to the list of every word
+for w in sum(lemmatize_mappings, ()):
+    if w not in all_words:
+        print(w)
+        all_words.append(w)
+
+lemmatize_mappings = {k: v for k, v in lemmatize_mappings}
+
+
+print("Step 6")
+
+
+final_wordlist = []
+seen_lemmatizations = set()
+for w in all_words:
+    lemmatized = lemmatize_mappings.get(w) or w
+    if lemmatized in seen_lemmatizations:
+        # The lemmatized version of this word was already seen
+        continue
+    else:
+        # The lemmatized version hasn't been seen. We're good to add it
+        final_wordlist.append([
+            k
+            for k
+            in lemmatize_mappings.keys()
+            if lemmatize_mappings[k] == lemmatized
+        ])
+        seen_lemmatizations.add(lemmatized)
+
+    if len(final_wordlist) >= WORDLIST_SIZE:
+        break
+
+# Now, convert it to the format (number, word)
+final_wordlist = [
+    (idx, w)
+    for idx, words in enumerate(final_wordlist)
+    for w in words
+]
+
+
+print("Step 7")
+
+print(len(lemmatize_mappings))
+
+print("Step 8")
+
+with open("01-generated-wordlist.csv", "w") as f:
+    f.write("word,number\n")
+
+    for w in final_wordlist:
+        lemmatized = "" if not w[1] else w[1]
+        f.write(f"{w[1].upper()},{w[0]}")
+        f.write("\n")
+
+print("Done")