Start working on new wordlist generation

2023-03-01 22:26:06 -05:00 · 2023-03-01 22:26:06 -05:00 · 6ebe8cd489
commit 6ebe8cd489
parent c034652d86
11 changed files with 3392 additions and 19 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,3 +8,4 @@
 **/.ipynb_checkpoints
 /target
 /test-data/generator/build/
 /wordlist/venv
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@ -1,12 +0,0 @@
 click==8.1.3
 defusedxml==0.7.1
 joblib==1.2.0
 nltk==3.8.1
 numpy==1.24.2
 odfpy==1.4.1
 pandas==1.5.3
 python-dateutil==2.8.2
 pytz==2022.7.1
 regex==2022.10.31
 six==1.16.0
 tqdm==4.64.1
--- a/docs/wordlist-new.ipynb
+++ b/docs/wordlist-new.ipynb
@ -12,21 +12,34 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
+      "Collecting nltk\n",
-      "Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
+      "  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
-      "Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hCollecting odfpy\n",
      "  Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
      "\u001b[?25hCollecting regex>=2021.8.3\n",
      "  Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m29.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
      "Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
      "Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
-      "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
+      "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
      "Building wheels for collected packages: odfpy\n",
      "  Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
      "\u001b[?25h  Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=5bfe9fcd7c590666411d404ea3e4ef0f704c9e62ff6621deb4ab09c84bec082a\n",
      "  Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
      "Successfully built odfpy\n",
      "Installing collected packages: regex, odfpy, nltk\n",
      "Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
+      "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    }
   ],
@ -64,6 +77,8 @@
    "            ret.append(str(l).lower())\n",
    "        return ret\n",
    "\n",
    "\n",
    "    \n",
    "WORDLIST_SIZE = 8192 + 3\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "word_re = re.compile(r\"^[A-Za-z]+$\")"
--- a/docs/wordlist-new2.ipynb
+++ b/docs/wordlist-new2.ipynb
@ -0,0 +1,112 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "991a711f-be98-4aae-a657-84b065449916",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: spacy in /opt/conda/lib/python3.10/site-packages (3.5.0)\n",
      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.4)\n",
      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.3.0)\n",
      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.12)\n",
      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.10.5)\n",
      "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.1.2)\n",
      "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.7.0)\n",
      "Requirement already satisfied: numpy>=1.15.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.23.5)\n",
      "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (6.3.0)\n",
      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.8)\n",
      "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from spacy) (67.3.2)\n",
      "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (23.0)\n",
      "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.1.1)\n",
      "Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (8.1.7)\n",
      "Requirement already satisfied: pathy>=0.10.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.10.1)\n",
      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.8)\n",
      "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.4.6)\n",
      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (4.64.1)\n",
      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.7)\n",
      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.9)\n",
      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.28.2)\n",
      "Requirement already satisfied: typing-extensions>=4.2.0 in /opt/conda/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy) (4.4.0)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2022.12.7)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.1.1)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.14)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)\n",
      "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.7.9)\n",
      "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.0.4)\n",
      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/conda/lib/python3.10/site-packages (from typer<0.8.0,>=0.3.0->spacy) (8.1.3)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->spacy) (2.1.2)\n",
      "Collecting en-core-web-trf==3.5.0\n",
      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl (460.3 MB)\n",
      "\u001b[2K     \u001b[91m━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.1/460.3 MB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:14\u001b[0m"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    _initialized\n",
    "except:\n",
    "    !pip install spacy\n",
    "    !python -m spacy download en_core_web_trf\n",
    "    import spacy\n",
    "    \n",
    "    spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
    "    \n",
    "    _initialized=True\n",
    "    \n",
    "import pandas as pd\n",
    "import gzip\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b93818f-c54a-4c88-9968-df4244b7c6f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "\n",
    "# Load the 'en_core_web_trf' pipeline (the 'en' shortcut name was removed in spaCy v3),\n",
    "# disabling the parser and NER components, which are not needed for lemmatization\n",
    "nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
    "\n",
    "sentence = \"The striped bats are hanging on their feet for best\"\n",
    "\n",
    "# Parse the sentence using the loaded pipeline object `nlp`\n",
    "doc = nlp(sentence)\n",
    "\n",
    "# Extract the lemma for each token and join\n",
    "\" \".join([token.lemma_ for token in doc])\n",
    "#> spaCy v2 printed 'the strip bat be hang on -PRON- foot for good';\n",
    "#> v3 no longer emits -PRON-, so expect the pronoun itself there (TODO: confirm exact output)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/wordlist/01-lemmatized-words.csv
+++ b/wordlist/01-lemmatized-words.csv
@ -0,0 +1 @@
 word,lemmatized_word
--- a/wordlist/01-lemmatized-words.py
+++ b/wordlist/01-lemmatized-words.py
@ -0,0 +1,96 @@
 #!/usr/bin/env python3
 # coding: utf-8
 print("Step 1")
 # One-time setup guard carried over from the notebook: the bare name lookup
 # raises NameError until the sentinel is assigned at the end of the handler.
 try:
    _initialized
 except NameError:
    # Only NameError means "not initialised yet"; a bare except would also
    # swallow KeyboardInterrupt raised while evaluating the guard.
    # Install steps, kept for reference (run manually):
    # !pip install spacy
    # !python -m spacy download en_core_web_trf
    import spacy
    from tqdm import tqdm
    # Load the pipeline without the parser and NER components.
    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
    _initialized = True
 import pandas as pd
 import gzip
 import re
 print("Step 2")
 def get_lines(filename):
    """Return up to 30,001 lines of a gzip file as lower-cased strings.

    The file is opened in gzip mode ``'r'`` (binary), so each line is a
    ``bytes`` object. It is converted with ``str()``, which means every
    returned entry is the lower-cased repr of the bytes (``b'...\\n'``),
    not the decoded text.
    """
    collected = []
    with gzip.open(filename, 'r') as handle:
        for raw_line in handle:
            # Stop once more than 30,000 entries have been gathered.
            if len(collected) > 30_000:
                break
            collected.append(str(raw_line).lower())
    return collected
 WORDLIST_SIZE = 8192 + 3
 word_re = re.compile(r"^[A-Za-z]+$")
 print("Step 3")
 # Load the hand-annotated spreadsheet; the columns read below are
 # "word", "keep" and "maps_to".
 annotated_words = pd.read_excel("annotated_words.ods")
 # Every row whose "keep" cell is anything other than "Yes" is excluded.
 is_excluded = annotated_words["keep"] != "Yes"
 excluded_words = annotated_words.loc[is_excluded, "word"].str.lower().tolist()
 excluded_words[0:10]
 # Rows with a "maps_to" value hold a comma-separated list of targets;
 # emit one lower-cased (word, target) pair per target.
 has_mapping = annotated_words["maps_to"].notna()
 custom_maps = [
    (source.lower(), target.lower())
    for source, raw_targets in zip(
        annotated_words.loc[has_mapping, "word"],
        annotated_words.loc[has_mapping, "maps_to"],
    )
    for target in raw_targets.split(",")
 ]
 custom_maps
 print("Step 4")
 # Start parsing the wordlist
 all_words = get_lines("00-frequency-all.txt.gz")
 # Delete header line
 all_words = all_words[1:]
 # Get only the word (fixed width); the offsets index into the strings
 # returned by get_lines
 all_words = [w[13:36].strip() for w in all_words]
 # Remove special characters
 all_words = [w for w in all_words if word_re.search(w)]
 # Remove all removed words (set lookup instead of scanning the list per word)
 excluded_lookup = set(excluded_words)
 all_words = [w for w in all_words if w not in excluded_lookup]
 # Add all custom mappings. custom_maps holds (word, maps_to) pairs, so both
 # members are added as whole words. The previous sum(custom_maps, ())
 # flattened the pairs into single strings, which made m[0] / m[1] pick the
 # first two *characters* of each word instead of the words themselves.
 known_words = set(all_words)
 for mapping in custom_maps:
    for word in mapping:
        if word not in known_words:
            known_words.add(word)
            all_words.append(word)
 print("Step 5")
 # Lemmatize all words (plural -> singular)
 lemmatize_mappings = []
 for word in tqdm(all_words):
    # Pair each word with the lemma of the first token the pipeline produces.
    lemmatize_mappings.append((word, nlp(word)[0].lemma_))
 # Dump the (word, lemma) pairs as a two-column CSV with a header row.
 with open("01-lemmatized-words.csv", "w") as f:
    f.write("word,lemmatized_word\n")
    for original, lemma in lemmatize_mappings:
        f.write(f"{original},{lemma}\n")
--- a/wordlist/annotated_words.ods
+++ b/wordlist/annotated_words.ods
--- a/wordlist/requirements.txt
+++ b/wordlist/requirements.txt
@ -0,0 +1,124 @@
 anyio==3.6.2
 argon2-cffi==21.3.0
 argon2-cffi-bindings==21.2.0
 arrow==1.2.3
 asttokens==2.2.1
 attrs==22.2.0
 backcall==0.2.0
 beautifulsoup4==4.11.2
 bleach==6.0.0
 blis==0.7.9
 catalogue==2.0.8
 certifi==2022.12.7
 cffi==1.15.1
 charset-normalizer==3.0.1
 click==8.1.3
 comm==0.1.2
 confection==0.0.4
 cymem==2.0.7
 debugpy==1.6.6
 decorator==5.1.1
 defusedxml==0.7.1
 en-core-web-trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl
 executing==1.2.0
 fastjsonschema==2.16.3
 filelock==3.9.0
 fqdn==1.5.1
 huggingface-hub==0.12.1
 idna==3.4
 ipykernel==6.21.2
 ipython==8.11.0
 ipython-genutils==0.2.0
 ipywidgets==8.0.4
 isoduration==20.11.0
 jedi==0.18.2
 Jinja2==3.1.2
 jsonpointer==2.3
 jsonschema==4.17.3
 jupyter==1.0.0
 jupyter-console==6.6.2
 jupyter-events==0.6.3
 jupyter_client==8.0.3
 jupyter_core==5.2.0
 jupyter_server==2.3.0
 jupyter_server_terminals==0.4.4
 jupyterlab-pygments==0.2.2
 jupyterlab-widgets==3.0.5
 langcodes==3.3.0
 MarkupSafe==2.1.2
 matplotlib-inline==0.1.6
 mistune==2.0.5
 murmurhash==1.0.9
 nbclassic==0.5.2
 nbclient==0.7.2
 nbconvert==7.2.9
 nbformat==5.7.3
 nest-asyncio==1.5.6
 notebook==6.5.2
 notebook_shim==0.2.2
 numpy==1.24.2
 nvidia-cublas-cu11==11.10.3.66
 nvidia-cuda-nvrtc-cu11==11.7.99
 nvidia-cuda-runtime-cu11==11.7.99
 nvidia-cudnn-cu11==8.5.0.96
 odfpy==1.4.1
 packaging==23.0
 pandas==1.5.3
 pandocfilters==1.5.0
 parso==0.8.3
 pathy==0.10.1
 pexpect==4.8.0
 pickleshare==0.7.5
 platformdirs==3.0.0
 preshed==3.0.8
 prometheus-client==0.16.0
 prompt-toolkit==3.0.38
 psutil==5.9.4
 ptyprocess==0.7.0
 pure-eval==0.2.2
 pycparser==2.21
 pydantic==1.10.5
 Pygments==2.14.0
 pyrsistent==0.19.3
 python-dateutil==2.8.2
 python-json-logger==2.0.7
 pytz==2022.7.1
 PyYAML==6.0
 pyzmq==25.0.0
 qtconsole==5.4.0
 QtPy==2.3.0
 regex==2022.10.31
 requests==2.28.2
 rfc3339-validator==0.1.4
 rfc3986-validator==0.1.1
 Send2Trash==1.8.0
 six==1.16.0
 smart-open==6.3.0
 sniffio==1.3.0
 soupsieve==2.4
 spacy==3.5.0
 spacy-alignments==0.9.0
 spacy-legacy==3.0.12
 spacy-loggers==1.0.4
 spacy-transformers==1.2.2
 srsly==2.4.6
 stack-data==0.6.2
 terminado==0.17.1
 thinc==8.1.7
 tinycss2==1.2.1
 tokenizers==0.13.2
 torch==1.13.1
 tornado==6.2
 tqdm==4.64.1
 traitlets==5.9.0
 transformers==4.26.1
 typer==0.7.0
 typing_extensions==4.5.0
 uri-template==1.2.0
 urllib3==1.26.14
 wasabi==1.1.1
 wcwidth==0.2.6
 webcolors==1.12
 webencodings==0.5.1
 websocket-client==1.5.1
 widgetsnbextension==4.0.5
--- a/wordlist/wordlist-new.ipynb
+++ b/wordlist/wordlist-new.ipynb
--- a/wordlist/wordlist-new2.ipynb
+++ b/wordlist/wordlist-new2.ipynb
@ -0,0 +1,220 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "991a711f-be98-4aae-a657-84b065449916",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "try:\n",
    "    _initialized\n",
    "except:\n",
    "    # !pip install spacy\n",
    "    # !python -m spacy download en_core_web_trf\n",
    "    import spacy\n",
    "    \n",
    "    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
    "    \n",
    "    _initialized=True\n",
    "    \n",
    "import pandas as pd\n",
    "import gzip\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d130bb84",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_lines(filename):\n",
    "    with gzip.open(filename, 'r') as f:\n",
    "        ret = []\n",
    "        for l in f:\n",
    "            if len(ret) > 30_000:\n",
    "                return ret\n",
    "            ret.append(str(l).lower())\n",
    "        return ret\n",
    "\n",
    "\n",
    "    \n",
    "WORDLIST_SIZE = 8192 + 3\n",
    "word_re = re.compile(r\"^[A-Za-z]+$\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "de2d1731",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pwd\n",
    "!ls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "90665714",
   "metadata": {},
   "outputs": [],
   "source": [
    "annotated_words=pd.read_excel(\"annotated_words.ods\")\n",
    "\n",
    "excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n",
    "excluded_words[0:10]\n",
    "\n",
    "custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n",
    "\n",
    "custom_maps = [\n",
    "    (m[1][\"word\"].lower(), mapping.lower())\n",
    "    for m in custom_maps.iterrows()\n",
    "    for mapping in m[1][\"maps_to\"]\n",
    "]\n",
    "custom_maps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fb50c69e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start parsing the wordlist\n",
    "all_words = get_lines(\"00-frequency-all.txt.gz\")\n",
    "\n",
    "# Delete header line\n",
    "all_words = all_words[1:]\n",
    "\n",
    "# Get only the word (fixed width)\n",
    "all_words = [w[13:36].strip() for w in all_words]\n",
    "\n",
    "# Remove special characters\n",
    "all_words = [w for w in all_words if word_re.search(w)]\n",
    "\n",
    "# Remove all removed words\n",
    "all_words = [w for w in all_words if w not in excluded_words]\n",
    "\n",
    "# Add all custom mappings: each entry is a (word, maps_to) pair, so add both\n",
    "# members as whole words rather than indexing into the flattened strings\n",
    "for mapping in custom_maps:\n",
    "    for word in mapping:\n",
    "        if word not in all_words:\n",
    "            all_words.append(word)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "cd21bff5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lemmatize all words (plural -> singular)\n",
    "lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in all_words[:100]]\n",
    "print(lemmatize_mappings[:100])\n",
    "\n",
    "# Add custom lemmatizations\n",
    "for l in custom_maps:\n",
    "    if l in lemmatize_mappings:\n",
    "        print(f\"Warning: {l} is already lemmatized\")\n",
    "    else:\n",
    "        lemmatize_mappings.append(l)\n",
    "        \n",
    "print(lemmatize_mappings[:100])\n",
    "\n",
    "lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]\n",
    "print(lemmatize_mappings[:100])\n",
    "\n",
    "# Now, re-add all lematized words to the list of every word\n",
    "for w in sum(lemmatize_mappings, ()):\n",
    "    if w not in all_words:\n",
    "        print(w)\n",
    "        all_words.append(w)\n",
    "        \n",
    "lemmatize_mappings = {k: v for k, v in lemmatize_mappings}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "0ee9af7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "final_wordlist = []\n",
    "seen_lemmatizations = set()\n",
    "for w in all_words:\n",
    "    lemmatized = lemmatize_mappings.get(w) or w\n",
    "    if lemmatized in seen_lemmatizations:\n",
    "        # The lemmatized version of this word was already seen\n",
    "        continue\n",
    "    else:\n",
    "        # The lemmatized version hasn't been seen. We're good to add it\n",
    "        final_wordlist.append([\n",
    "            k\n",
    "            for k\n",
    "            in lemmatize_mappings.keys()\n",
    "            if lemmatize_mappings[k] == lemmatized\n",
    "        ])\n",
    "        seen_lemmatizations.add(lemmatized)\n",
    "\n",
    "    if len(final_wordlist) >= WORDLIST_SIZE:\n",
    "        break\n",
    "\n",
    "# Now, convert it to the format (number, word)\n",
    "final_wordlist = [\n",
    "    (idx, w)\n",
    "    for idx, words in enumerate(final_wordlist)\n",
    "    for w in words\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "07c1293c",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(len(lemmatize_mappings))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19c255d0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/wordlist/wordlist-new2.py
+++ b/wordlist/wordlist-new2.py
@ -0,0 +1,159 @@
 #!/usr/bin/env python3
 # coding: utf-8
 print("Step 1")
 # One-time setup guard carried over from the notebook: the bare name lookup
 # raises NameError until the sentinel is assigned at the end of the handler.
 try:
    _initialized
 except NameError:
    # Only NameError means "not initialised yet"; a bare except would also
    # swallow KeyboardInterrupt raised while evaluating the guard.
    # Install steps, kept for reference (run manually):
    # !pip install spacy
    # !python -m spacy download en_core_web_trf
    import spacy
    from tqdm import tqdm
    # Load the pipeline without the parser and NER components.
    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
    _initialized = True
 import pandas as pd
 import gzip
 import re
 print("Step 2")
 def get_lines(filename):
    """Return up to 30,001 lines of a gzip file as lower-cased strings.

    The file is opened in gzip mode ``'r'`` (binary), so each line is a
    ``bytes`` object. It is converted with ``str()``, which means every
    returned entry is the lower-cased repr of the bytes (``b'...\\n'``),
    not the decoded text.
    """
    collected = []
    with gzip.open(filename, 'r') as handle:
        for raw_line in handle:
            # Stop once more than 30,000 entries have been gathered.
            if len(collected) > 30_000:
                break
            collected.append(str(raw_line).lower())
    return collected
 WORDLIST_SIZE = 8192 + 3
 word_re = re.compile(r"^[A-Za-z]+$")
 print("Step 3")
 # Load the hand-annotated spreadsheet; the columns read below are
 # "word", "keep" and "maps_to".
 annotated_words = pd.read_excel("annotated_words.ods")
 # Every row whose "keep" cell is anything other than "Yes" is excluded.
 is_excluded = annotated_words["keep"] != "Yes"
 excluded_words = annotated_words.loc[is_excluded, "word"].str.lower().tolist()
 excluded_words[0:10]
 # Rows with a "maps_to" value hold a comma-separated list of targets;
 # emit one lower-cased (word, target) pair per target.
 has_mapping = annotated_words["maps_to"].notna()
 custom_maps = [
    (source.lower(), target.lower())
    for source, raw_targets in zip(
        annotated_words.loc[has_mapping, "word"],
        annotated_words.loc[has_mapping, "maps_to"],
    )
    for target in raw_targets.split(",")
 ]
 custom_maps
 print("Step 4")
 # Start parsing the wordlist
 all_words = get_lines("00-frequency-all.txt.gz")
 # Delete header line
 all_words = all_words[1:]
 # Get only the word (fixed width); the offsets index into the strings
 # returned by get_lines
 all_words = [w[13:36].strip() for w in all_words]
 # Remove special characters
 all_words = [w for w in all_words if word_re.search(w)]
 # Remove all removed words (set lookup instead of scanning the list per word)
 excluded_lookup = set(excluded_words)
 all_words = [w for w in all_words if w not in excluded_lookup]
 # Add all custom mappings. custom_maps holds (word, maps_to) pairs, so both
 # members are added as whole words. The previous sum(custom_maps, ())
 # flattened the pairs into single strings, which made m[0] / m[1] pick the
 # first two *characters* of each word instead of the words themselves.
 known_words = set(all_words)
 for mapping in custom_maps:
    for word in mapping:
        if word not in known_words:
            known_words.add(word)
            all_words.append(word)
 print("Step 5")
 # Lemmatize all words (plural -> singular): pair each word with the lemma
 # of the first token the pipeline produces for it.
 lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
 print(lemmatize_mappings[:100])
 # Add custom lemmatizations. A set mirrors the list so the duplicate check
 # does not rescan every pair for each custom mapping.
 known_pairs = set(lemmatize_mappings)
 for custom_pair in custom_maps:
    if custom_pair in known_pairs:
        print(f"Warning: {custom_pair} is already lemmatized")
    else:
        known_pairs.add(custom_pair)
        lemmatize_mappings.append(custom_pair)
 print(lemmatize_mappings[:100])
 # Drop every pair whose lemma is an excluded word.
 excluded_lookup = set(excluded_words)
 lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_lookup]
 print(lemmatize_mappings[:100])
 # Now, re-add all lemmatized words to the list of every word. Walking the
 # pairs directly visits the same strings in the same order as flattening
 # them with sum(..., ()), without the repeated tuple copies.
 known_words = set(all_words)
 for pair in lemmatize_mappings:
    for w in pair:
        if w not in known_words:
            print(w)
            known_words.add(w)
            all_words.append(w)
 # Collapse to word -> lemma; for a repeated word the last pair wins.
 lemmatize_mappings = dict(lemmatize_mappings)
 print("Step 6")
 final_wordlist = []
 seen_lemmatizations = set()
 # Reverse index built once: lemma -> every word that maps to it, in the
 # mapping's insertion order. This replaces a full scan of all mapping keys
 # for each emitted group.
 words_by_lemma = {}
 for k, v in lemmatize_mappings.items():
    words_by_lemma.setdefault(v, []).append(k)
 for w in all_words:
    # Words without a (non-empty) lemma entry stand for themselves.
    lemmatized = lemmatize_mappings.get(w) or w
    if lemmatized in seen_lemmatizations:
        # The lemmatized version of this word was already seen
        continue
    # The lemmatized version hasn't been seen. We're good to add it.
    # NOTE(review): when no mapping entry points at this lemma the group is
    # empty but still consumes a number -- confirm this is intended.
    final_wordlist.append(words_by_lemma.get(lemmatized, []))
    seen_lemmatizations.add(lemmatized)
    if len(final_wordlist) >= WORDLIST_SIZE:
        break
 # Now, convert it to the format (number, word): every word in a group
 # shares the group's index.
 final_wordlist = [
    (idx, w)
    for idx, words in enumerate(final_wordlist)
    for w in words
 ]
 print("Step 7")
 print(len(lemmatize_mappings))
 print("Step 8")
 # Write one "WORD,number" row per (number, word) entry of final_wordlist.
 # The previous version also computed an unused `lemmatized` fallback that
 # never reached the output; it has been removed.
 with open("01-generated-wordlist.csv", "w") as f:
    f.write("word,number\n")
    for number, word in final_wordlist:
        f.write(f"{word.upper()},{number}\n")
 print("Done")