diff --git a/.gitignore b/.gitignore index d32d3f8..b34512a 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ **/.ipynb_checkpoints /target /test-data/generator/build/ +/wordlist/venv diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 63c7116..0000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -click==8.1.3 -defusedxml==0.7.1 -joblib==1.2.0 -nltk==3.8.1 -numpy==1.24.2 -odfpy==1.4.1 -pandas==1.5.3 -python-dateutil==2.8.2 -pytz==2022.7.1 -regex==2022.10.31 -six==1.16.0 -tqdm==4.64.1 diff --git a/docs/wordlist-new.ipynb b/docs/wordlist-new.ipynb index e2b5ba9..2928327 100644 --- a/docs/wordlist-new.ipynb +++ b/docs/wordlist-new.ipynb @@ -12,21 +12,34 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n", - "Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n", - "Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n", + "Collecting nltk\n", + " Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hCollecting odfpy\n", + " Downloading odfpy-1.4.1.tar.gz (717 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25hCollecting regex>=2021.8.3\n", + " Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m29.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n", "Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n", - "Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n", "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n", - "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n" + "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n", + "Building wheels for collected packages: odfpy\n", + " Building wheel for odfpy (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=5bfe9fcd7c590666411d404ea3e4ef0f704c9e62ff6621deb4ab09c84bec082a\n", + " Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n", + "Successfully built odfpy\n", + "Installing collected packages: regex, odfpy, nltk\n", + "Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n", - "[nltk_data] Package wordnet is already up-to-date!\n" + "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n" ] } ], @@ -63,6 +76,8 @@ " return ret\n", " ret.append(str(l).lower())\n", " return ret\n", + "\n", + "\n", " \n", "WORDLIST_SIZE = 8192 + 3\n", "lemmatizer = WordNetLemmatizer()\n", diff --git a/docs/wordlist-new2.ipynb b/docs/wordlist-new2.ipynb new file mode 100644 index 0000000..5a05a82 --- /dev/null +++ b/docs/wordlist-new2.ipynb @@ -0,0 +1,112 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "991a711f-be98-4aae-a657-84b065449916", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: spacy in /opt/conda/lib/python3.10/site-packages (3.5.0)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.4)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.3.0)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.12)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.10.5)\n", + "Requirement already satisfied: jinja2 in 
/opt/conda/lib/python3.10/site-packages (from spacy) (3.1.2)\n", + "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.7.0)\n", + "Requirement already satisfied: numpy>=1.15.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.23.5)\n", + "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (6.3.0)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.8)\n", + "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from spacy) (67.3.2)\n", + "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (23.0)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.1.1)\n", + "Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (8.1.7)\n", + "Requirement already satisfied: pathy>=0.10.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.10.1)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.8)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.4.6)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (4.64.1)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.7)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.9)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.28.2)\n", + "Requirement already satisfied: typing-extensions>=4.2.0 in /opt/conda/lib/python3.10/site-packages 
(from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy) (4.4.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2022.12.7)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.1.1)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.14)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.7.9)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.0.4)\n", + "Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/conda/lib/python3.10/site-packages (from typer<0.8.0,>=0.3.0->spacy) (8.1.3)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->spacy) (2.1.2)\n", + "Collecting en-core-web-trf==3.5.0\n", + " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl (460.3 MB)\n", + "\u001b[2K \u001b[91m━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.1/460.3 MB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:14\u001b[0m" + ] + } + ], + "source": [ + "try:\n", + " _initialized\n", + "except:\n", + " !pip install spacy\n", + " !python -m spacy download en_core_web_trf\n", + " import spacy\n", + " \n", + " spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n", + " \n", + " _initialized=True\n", + " \n", + "import pandas as pd\n", + "import gzip\n", + "import re" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "6b93818f-c54a-4c88-9968-df4244b7c6f6", + "metadata": {}, + "outputs": [], + "source": [ + "import spacy\n", + "\n", + "# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization\n", + "nlp = spacy.load('en', disable=['parser', 'ner'])\n", + "\n", + "sentence = \"The striped bats are hanging on their feet for best\"\n", + "\n", + "# Parse the sentence using the loaded 'en' model object `nlp`\n", + "doc = nlp(sentence)\n", + "\n", + "# Extract the lemma for each token and join\n", + "\" \".join([token.lemma_ for token in doc])\n", + "#> 'the strip bat be hang on -PRON- foot for good'" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/wordlist/01-lemmatized-words.csv b/wordlist/01-lemmatized-words.csv new file mode 100644 index 0000000..d95343b --- /dev/null +++ b/wordlist/01-lemmatized-words.csv @@ -0,0 +1 @@ +word,lemmatized_word diff --git a/wordlist/01-lemmatized-words.py b/wordlist/01-lemmatized-words.py new file mode 100755 index 0000000..fffa46b --- /dev/null +++ b/wordlist/01-lemmatized-words.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +print("Step 1") + + +try: + _initialized +except: + # !pip install spacy + # !python -m spacy download en_core_web_trf + import spacy + from tqdm import tqdm + + nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner']) + + _initialized=True + +import pandas as pd +import gzip +import re + + +print("Step 2") + + +def get_lines(filename): + with gzip.open(filename, 'r') as f: + ret = [] + for l in f: + if len(ret) > 30_000: + return ret + 
ret.append(str(l).lower())
+        return ret
+
+
+WORDLIST_SIZE = 8192 + 3
+word_re = re.compile(r"^[A-Za-z]+$")
+
+
+print("Step 3")
+
+
+annotated_words = pd.read_excel("annotated_words.ods")
+
+# Every word whose "keep" column is not "Yes" is excluded (set: cheap lookups).
+excluded_words = set(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
+
+# Expand the comma-separated "maps_to" cells into (word, target) pairs; targets
+# are stripped (cells written as "a, b" gave " b") and empty ones are skipped.
+custom_maps = [
+    (row["word"].lower(), target.strip().lower())
+    for _, row in annotated_words[annotated_words["maps_to"].notna()].iterrows()
+    for target in row["maps_to"].split(",")
+    if target.strip()
+]
+
+
+print("Step 4")
+
+
+# Start parsing the wordlist
+all_words = get_lines("00-frequency-all.txt.gz")
+
+# Delete header line
+all_words = all_words[1:]
+
+# Get only the word (fixed width)
+all_words = [w[13:36].strip() for w in all_words]
+
+# Remove special characters
+all_words = [w for w in all_words if word_re.search(w)]
+
+# Remove all removed words
+all_words = [w for w in all_words if w not in excluded_words]
+
+# Add both sides of every custom mapping that is not in the list yet.  Iterate
+# the pairs themselves: sum(custom_maps, ()) flattens them into plain strings,
+# so m[0] / m[1] were the first two characters of each word.
+known_words = set(all_words)
+for pair in custom_maps:
+    for w in pair:
+        if w not in known_words:
+            known_words.add(w)
+            all_words.append(w)
+
+
+print("Step 5")
+
+# Lemmatize all words (plural -> singular)
+lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
+
+with open("01-lemmatized-words.csv", "w") as f:
+    f.write("word,lemmatized_word\n")
+    for word, lemma in lemmatize_mappings:
+        f.write(f"{word},{lemma}\n")
diff --git a/docs/annotated_words.ods b/wordlist/annotated_words.ods
similarity index 100%
rename from docs/annotated_words.ods
rename to wordlist/annotated_words.ods
diff --git a/wordlist/requirements.txt b/wordlist/requirements.txt
new file mode 100644
index 0000000..f99d8bb
--- /dev/null
+++ b/wordlist/requirements.txt
@@ -0,0 +1,124 @@
+anyio==3.6.2
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+arrow==1.2.3
+asttokens==2.2.1
+attrs==22.2.0
+backcall==0.2.0
+beautifulsoup4==4.11.2 +bleach==6.0.0 +blis==0.7.9 +catalogue==2.0.8 +certifi==2022.12.7 +cffi==1.15.1 +charset-normalizer==3.0.1 +click==8.1.3 +comm==0.1.2 +confection==0.0.4 +cymem==2.0.7 +debugpy==1.6.6 +decorator==5.1.1 +defusedxml==0.7.1 +en-core-web-trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl +executing==1.2.0 +fastjsonschema==2.16.3 +filelock==3.9.0 +fqdn==1.5.1 +huggingface-hub==0.12.1 +idna==3.4 +ipykernel==6.21.2 +ipython==8.11.0 +ipython-genutils==0.2.0 +ipywidgets==8.0.4 +isoduration==20.11.0 +jedi==0.18.2 +Jinja2==3.1.2 +jsonpointer==2.3 +jsonschema==4.17.3 +jupyter==1.0.0 +jupyter-console==6.6.2 +jupyter-events==0.6.3 +jupyter_client==8.0.3 +jupyter_core==5.2.0 +jupyter_server==2.3.0 +jupyter_server_terminals==0.4.4 +jupyterlab-pygments==0.2.2 +jupyterlab-widgets==3.0.5 +langcodes==3.3.0 +MarkupSafe==2.1.2 +matplotlib-inline==0.1.6 +mistune==2.0.5 +murmurhash==1.0.9 +nbclassic==0.5.2 +nbclient==0.7.2 +nbconvert==7.2.9 +nbformat==5.7.3 +nest-asyncio==1.5.6 +notebook==6.5.2 +notebook_shim==0.2.2 +numpy==1.24.2 +nvidia-cublas-cu11==11.10.3.66 +nvidia-cuda-nvrtc-cu11==11.7.99 +nvidia-cuda-runtime-cu11==11.7.99 +nvidia-cudnn-cu11==8.5.0.96 +odfpy==1.4.1 +packaging==23.0 +pandas==1.5.3 +pandocfilters==1.5.0 +parso==0.8.3 +pathy==0.10.1 +pexpect==4.8.0 +pickleshare==0.7.5 +platformdirs==3.0.0 +preshed==3.0.8 +prometheus-client==0.16.0 +prompt-toolkit==3.0.38 +psutil==5.9.4 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pycparser==2.21 +pydantic==1.10.5 +Pygments==2.14.0 +pyrsistent==0.19.3 +python-dateutil==2.8.2 +python-json-logger==2.0.7 +pytz==2022.7.1 +PyYAML==6.0 +pyzmq==25.0.0 +qtconsole==5.4.0 +QtPy==2.3.0 +regex==2022.10.31 +requests==2.28.2 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +Send2Trash==1.8.0 +six==1.16.0 +smart-open==6.3.0 +sniffio==1.3.0 +soupsieve==2.4 +spacy==3.5.0 +spacy-alignments==0.9.0 +spacy-legacy==3.0.12 +spacy-loggers==1.0.4 
+spacy-transformers==1.2.2 +srsly==2.4.6 +stack-data==0.6.2 +terminado==0.17.1 +thinc==8.1.7 +tinycss2==1.2.1 +tokenizers==0.13.2 +torch==1.13.1 +tornado==6.2 +tqdm==4.64.1 +traitlets==5.9.0 +transformers==4.26.1 +typer==0.7.0 +typing_extensions==4.5.0 +uri-template==1.2.0 +urllib3==1.26.14 +wasabi==1.1.1 +wcwidth==0.2.6 +webcolors==1.12 +webencodings==0.5.1 +websocket-client==1.5.1 +widgetsnbextension==4.0.5 diff --git a/wordlist/wordlist-new.ipynb b/wordlist/wordlist-new.ipynb new file mode 100644 index 0000000..9978270 --- /dev/null +++ b/wordlist/wordlist-new.ipynb @@ -0,0 +1,2657 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "0b00342f-7b19-49cc-bc6c-21019f8cc7dc", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting nltk\n", + " Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hCollecting odfpy\n", + " Downloading odfpy-1.4.1.tar.gz (717 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25hCollecting regex>=2021.8.3\n", + " Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m29.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n", + "Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n", + "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n", + "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n", + "Building wheels for collected packages: odfpy\n", + " Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n", + "\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=5bfe9fcd7c590666411d404ea3e4ef0f704c9e62ff6621deb4ab09c84bec082a\n", + " Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n", + "Successfully built odfpy\n", + "Installing collected packages: regex, odfpy, nltk\n", + "Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n" + ] + } + ], + "source": [ + "try:\n", + " _initialized\n", + "except:\n", + " !pip install nltk odfpy\n", + " import nltk\n", + " \n", + " nltk.download(\"wordnet\")\n", + " _initialized=True\n", + " \n", + "from nltk.stem.wordnet import WordNetLemmatizer\n", + "import pandas as pd\n", + "import gzip\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "985883de-8049-4f81-acd9-34e1abcd4070", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def 
get_lines(filename):\n", + " with gzip.open(filename, 'r') as f:\n", + " ret = []\n", + " for l in f:\n", + " if len(ret) > 30_000:\n", + " return ret\n", + " ret.append(str(l).lower())\n", + " return ret\n", + "\n", + "\n", + " \n", + "WORDLIST_SIZE = 8192 + 3\n", + "lemmatizer = WordNetLemmatizer()\n", + "word_re = re.compile(r\"^[A-Za-z]+$\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "926d0d84-0d7e-4939-b87f-1a170f870a8f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "annotated_words=pd.read_excel(\"annotated_words.ods\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8b0d26e4-051c-4669-b566-bbd5ddbbe02b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['a', 'as', 'it', 'was', 'i', 'has', 'so', 'its', 's', 'p']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n", + "excluded_words[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2eea14b2-82bf-4353-8982-76a6c7f46d22", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('be', 'bee'),\n", + " ('by', 'bye'),\n", + " ('per', 'purr'),\n", + " ('sense', 'cent'),\n", + " ('died', 'dyed'),\n", + " ('cents', 'sense'),\n", + " ('yellow', 'hello'),\n", + " ('corps', 'core'),\n", + " ('ore', 'oar'),\n", + " ('ore', ' or'),\n", + " ('vary', 'very'),\n", + " ('com', 'calm'),\n", + " ('filing', 'filling'),\n", + " ('fax', 'facts'),\n", + " ('favour', 'favor'),\n", + " ('theatre', 'theater'),\n", + " ('par', 'parse'),\n", + " ('honour', 'honor'),\n", + " ('harry', 'hairy'),\n", + " ('brings', 'bring'),\n", + " ('organisation', 'organization'),\n", + " ('simultaneously', 'simultaneous'),\n", + " ('aluminum', 'aluminium'),\n", + " ('knight', 'night'),\n", + " ('electronics', 'electronic'),\n", + " 
('senses', 'cent'),\n", + " ('organisations', 'organization'),\n", + " ('fortunately', 'fortunate'),\n", + " ('corp', 'core'),\n", + " ('chile', 'chilly'),\n", + " ('chile', ' chili'),\n", + " ('owe', 'oh'),\n", + " ('capitol', 'capital'),\n", + " ('weary', 'wary'),\n", + " ('berry', 'barry'),\n", + " ('lecturer', 'lecture'),\n", + " ('weigh', 'way'),\n", + " ('aluminium', 'aluminum'),\n", + " ('isle', 'aisle'),\n", + " ('boulder', 'bolder'),\n", + " ('blew', 'blue'),\n", + " ('reformed', 'reform'),\n", + " ('scent', 'cent'),\n", + " ('ads', 'adds'),\n", + " ('honours', 'honors'),\n", + " ('bot', 'bought'),\n", + " ('dew', 'do'),\n", + " ('dew', ' due'),\n", + " ('theatres', 'theater'),\n", + " ('thru', 'through'),\n", + " ('sensed', 'cent'),\n", + " ('monies', 'moneys'),\n", + " ('cue', 'queue'),\n", + " ('hairy', 'harry'),\n", + " ('weighs', 'way'),\n", + " ('hem', 'him'),\n", + " ('nun', 'none'),\n", + " ('organisational', 'organizational'),\n", + " ('grate', 'great'),\n", + " ('dessert', 'desert'),\n", + " ('aux', 'ox'),\n", + " ('rap', 'wrap'),\n", + " ('filings', 'filling'),\n", + " ('pars', 'parse'),\n", + " ('dazed', 'day'),\n", + " ('scents', 'cent'),\n", + " ('daze', 'day'),\n", + " ('four', 'for')]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n", + "\n", + "custom_maps = [\n", + " (m[1][\"word\"].lower(), mapping.lower())\n", + " for m in custom_maps.iterrows()\n", + " for mapping in m[1][\"maps_to\"]\n", + "]\n", + "custom_maps" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8bdfd108-bf43-4c0f-bc5c-f91925da753f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Start parsing the wordlist\n", + "all_words = get_lines(\"frequency-all.txt.gz\")\n", + "\n", + "# Delete header line\n", + 
"all_words = all_words[1:]\n", + "\n", + "# Get only the word (fixed width)\n", + "all_words = [w[13:36].strip() for w in all_words]\n", + "\n", + "# Remove special characters\n", + "all_words = [w for w in all_words if word_re.search(w)]\n", + "\n", + "# Remove all removed words\n", + "all_words = [w for w in all_words if w not in excluded_words]\n", + "\n", + "# Add all custom mappings\n", + "for m in list(sum(custom_maps, ())):\n", + " if m[0] not in all_words:\n", + " all_words.append(m[0])\n", + " if m[1] not in all_words:\n", + " all_words.append(m[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e42f2b56-98b3-4465-95be-812d8584b511", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['the',\n", + " 'of',\n", + " 'and',\n", + " 'to',\n", + " 'in',\n", + " 'is',\n", + " 'that',\n", + " 'for',\n", + " 'be',\n", + " 'by',\n", + " 'with',\n", + " 'on',\n", + " 'not',\n", + " 'or',\n", + " 'this',\n", + " 'are',\n", + " 'at',\n", + " 'from',\n", + " 'he',\n", + " 'which',\n", + " 'his',\n", + " 'have',\n", + " 'an',\n", + " 'but',\n", + " 'you',\n", + " 'they',\n", + " 'were',\n", + " 'had',\n", + " 'we',\n", + " 'all',\n", + " 'one',\n", + " 'their',\n", + " 'been',\n", + " 'will',\n", + " 'there',\n", + " 'can',\n", + " 'if',\n", + " 'other',\n", + " 'would',\n", + " 'no',\n", + " 'her',\n", + " 'may',\n", + " 'more',\n", + " 'when',\n", + " 'who',\n", + " 'such',\n", + " 'these',\n", + " 'any',\n", + " 'she',\n", + " 'new',\n", + " 'time',\n", + " 'than',\n", + " 'do',\n", + " 'some',\n", + " 'what',\n", + " 'only',\n", + " 'into',\n", + " 'them',\n", + " 'two',\n", + " 'also',\n", + " 'about',\n", + " 'out',\n", + " 'him',\n", + " 'my',\n", + " 'said',\n", + " 'up',\n", + " 'our',\n", + " 'first',\n", + " 'should',\n", + " 'under',\n", + " 'made',\n", + " 'state',\n", + " 'see',\n", + " 'after',\n", + " 'could',\n", + " 'then',\n", + " 'me',\n", + " 'most',\n", + " 'over',\n", + " 'very',\n", + " 
'your',\n", + " 'between',\n", + " 'where',\n", + " 'now',\n", + " 'shall',\n", + " 'work',\n", + " 'those',\n", + " 'same',\n", + " 'well',\n", + " 'each',\n", + " 'many',\n", + " 'being',\n", + " 'years',\n", + " 'did',\n", + " 'year',\n", + " 'through',\n", + " 'must',\n", + " 'upon',\n", + " 'before',\n", + " 'like',\n", + " 'use',\n", + " 'part',\n", + " 'general',\n", + " 'people',\n", + " 'because',\n", + " 'used',\n", + " 'how',\n", + " 'even',\n", + " 'much',\n", + " 'states',\n", + " 'during',\n", + " 'both',\n", + " 'case',\n", + " 'three',\n", + " 'number',\n", + " 'make',\n", + " 'per',\n", + " 'great',\n", + " 'act',\n", + " 'way',\n", + " 'life',\n", + " 'good',\n", + " 'day',\n", + " 'public',\n", + " 'man',\n", + " 'however',\n", + " 'system',\n", + " 'water',\n", + " 'without',\n", + " 'government',\n", + " 'while',\n", + " 'long',\n", + " 'order',\n", + " 'law',\n", + " 'section',\n", + " 'court',\n", + " 'high',\n", + " 'right',\n", + " 'own',\n", + " 'found',\n", + " 'united',\n", + " 'just',\n", + " 'here',\n", + " 'against',\n", + " 'world',\n", + " 'does',\n", + " 'company',\n", + " 'within',\n", + " 'given',\n", + " 'service',\n", + " 'house',\n", + " 'another',\n", + " 'power',\n", + " 'place',\n", + " 'know',\n", + " 'little',\n", + " 'down',\n", + " 'present',\n", + " 'every',\n", + " 'national',\n", + " 'back',\n", + " 'take',\n", + " 'information',\n", + " 'men',\n", + " 'since',\n", + " 'might',\n", + " 'small',\n", + " 'large',\n", + " 'school',\n", + " 'following',\n", + " 'still',\n", + " 'less',\n", + " 'last',\n", + " 'city',\n", + " 'second',\n", + " 'development',\n", + " 'different',\n", + " 'university',\n", + " 'old',\n", + " 'form',\n", + " 'point',\n", + " 'total',\n", + " 'data',\n", + " 'too',\n", + " 'committee',\n", + " 'report',\n", + " 'business',\n", + " 'think',\n", + " 'end',\n", + " 'get',\n", + " 'set',\n", + " 'research',\n", + " 'say',\n", + " 'come',\n", + " 'country',\n", + " 'never',\n", + " 'fact',\n", + " 
'go',\n", + " 'control',\n", + " 'thus',\n", + " 'having',\n", + " 'value',\n", + " 'social',\n", + " 'department',\n", + " 'few',\n", + " 'above',\n", + " 'important',\n", + " 'interest',\n", + " 'study',\n", + " 'off',\n", + " 'area',\n", + " 'means',\n", + " 'office',\n", + " 'group',\n", + " 'give',\n", + " 'again',\n", + " 'war',\n", + " 'whether',\n", + " 'question',\n", + " 'called',\n", + " 'period',\n", + " 'line',\n", + " 'land',\n", + " 'four',\n", + " 'among',\n", + " 'table',\n", + " 'board',\n", + " 'until',\n", + " 'hand',\n", + " 'taken',\n", + " 'need',\n", + " 'education',\n", + " 'certain',\n", + " 'county',\n", + " 'action',\n", + " 'several',\n", + " 'am',\n", + " 'course',\n", + " 'cases',\n", + " 'far',\n", + " 'effect',\n", + " 'possible',\n", + " 'though',\n", + " 'left',\n", + " 'further',\n", + " 'home',\n", + " 'days',\n", + " 'person',\n", + " 'health',\n", + " 'amount',\n", + " 'members',\n", + " 'subject',\n", + " 'yet',\n", + " 'program',\n", + " 'therefore',\n", + " 'process',\n", + " 'services',\n", + " 'rate',\n", + " 'local',\n", + " 'name',\n", + " 'find',\n", + " 'necessary',\n", + " 'often',\n", + " 'others',\n", + " 'whole',\n", + " 'change',\n", + " 'example',\n", + " 'president',\n", + " 'history',\n", + " 'best',\n", + " 'although',\n", + " 'family',\n", + " 'side',\n", + " 'women',\n", + " 'held',\n", + " 'based',\n", + " 'south',\n", + " 'special',\n", + " 'required',\n", + " 'came',\n", + " 'thought',\n", + " 'five',\n", + " 'always',\n", + " 'himself',\n", + " 'air',\n", + " 'known',\n", + " 'head',\n", + " 'either',\n", + " 'property',\n", + " 'cost',\n", + " 'rather',\n", + " 'bill',\n", + " 'put',\n", + " 'human',\n", + " 'figure',\n", + " 'results',\n", + " 'level',\n", + " 'conditions',\n", + " 'full',\n", + " 'times',\n", + " 'book',\n", + " 'available',\n", + " 'early',\n", + " 'matter',\n", + " 'common',\n", + " 'light',\n", + " 'let',\n", + " 'society',\n", + " 'body',\n", + " 'international',\n", + " 
'including',\n", + " 'free',\n", + " 'evidence',\n", + " 'better',\n", + " 'type',\n", + " 'provided',\n", + " 'due',\n", + " 'next',\n", + " 'production',\n", + " 'once',\n", + " 'done',\n", + " 'making',\n", + " 'least',\n", + " 'support',\n", + " 'north',\n", + " 'later',\n", + " 'using',\n", + " 'things',\n", + " 'economic',\n", + " 'chapter',\n", + " 'various',\n", + " 'why',\n", + " 'white',\n", + " 'going',\n", + " 'commission',\n", + " 'federal',\n", + " 'away',\n", + " 'field',\n", + " 'result',\n", + " 'nature',\n", + " 'policy',\n", + " 'become',\n", + " 'political',\n", + " 'increase',\n", + " 'around',\n", + " 'age',\n", + " 'want',\n", + " 'low',\n", + " 'trade',\n", + " 'half',\n", + " 'position',\n", + " 'young',\n", + " 'money',\n", + " 'percent',\n", + " 'cent',\n", + " 'class',\n", + " 'words',\n", + " 'view',\n", + " 'provide',\n", + " 'seen',\n", + " 'show',\n", + " 'district',\n", + " 'party',\n", + " 'analysis',\n", + " 'care',\n", + " 'june',\n", + " 'foreign',\n", + " 'shown',\n", + " 'received',\n", + " 'management',\n", + " 'third',\n", + " 'took',\n", + " 'something',\n", + " 'tax',\n", + " 'account',\n", + " 'problem',\n", + " 'almost',\n", + " 'west',\n", + " 'nothing',\n", + " 'together',\n", + " 'individual',\n", + " 'open',\n", + " 'material',\n", + " 'paper',\n", + " 'feet',\n", + " 'force',\n", + " 'association',\n", + " 'purpose',\n", + " 'terms',\n", + " 'method',\n", + " 'help',\n", + " 'real',\n", + " 'ever',\n", + " 'already',\n", + " 'along',\n", + " 'went',\n", + " 'term',\n", + " 'systems',\n", + " 'member',\n", + " 'particular',\n", + " 'problems',\n", + " 'energy',\n", + " 'secretary',\n", + " 'date',\n", + " 'price',\n", + " 'short',\n", + " 'true',\n", + " 'street',\n", + " 'building',\n", + " 'room',\n", + " 'market',\n", + " 'look',\n", + " 'similar',\n", + " 'industry',\n", + " 'areas',\n", + " 'bank',\n", + " 'according',\n", + " 'studies',\n", + " 'itself',\n", + " 'application',\n", + " 'current',\n", + " 
'read',\n", + " 'press',\n", + " 'community',\n", + " 'plan',\n", + " 'whose',\n", + " 'major',\n", + " 'considered',\n", + " 'mind',\n", + " 'union',\n", + " 'cause',\n", + " 'able',\n", + " 'surface',\n", + " 'face',\n", + " 'river',\n", + " 'council',\n", + " 'income',\n", + " 'july',\n", + " 'near',\n", + " 'experience',\n", + " 'non',\n", + " 'paid',\n", + " 'pay',\n", + " 'reason',\n", + " 'themselves',\n", + " 'asked',\n", + " 'march',\n", + " 'king',\n", + " 'higher',\n", + " 'single',\n", + " 'rights',\n", + " 'average',\n", + " 'father',\n", + " 'note',\n", + " 'treatment',\n", + " 'love',\n", + " 'changes',\n", + " 'black',\n", + " 'knowledge',\n", + " 'enough',\n", + " 'future',\n", + " 'kind',\n", + " 'lower',\n", + " 'authority',\n", + " 'past',\n", + " 'natural',\n", + " 'six',\n", + " 'persons',\n", + " 'food',\n", + " 'working',\n", + " 'central',\n", + " 'college',\n", + " 'self',\n", + " 'products',\n", + " 'model',\n", + " 'brought',\n", + " 'greater',\n", + " 'countries',\n", + " 'test',\n", + " 'nor',\n", + " 'students',\n", + " 'private',\n", + " 'construction',\n", + " 'perhaps',\n", + " 'ground',\n", + " 'sir',\n", + " 'basis',\n", + " 'months',\n", + " 'growth',\n", + " 'increased',\n", + " 'word',\n", + " 'east',\n", + " 'language',\n", + " 'rule',\n", + " 'continued',\n", + " 'quite',\n", + " 'except',\n", + " 'series',\n", + " 'practice',\n", + " 'thing',\n", + " 'night',\n", + " 'works',\n", + " 'eyes',\n", + " 'oil',\n", + " 'art',\n", + " 'told',\n", + " 'especially',\n", + " 'population',\n", + " 'science',\n", + " 'whom',\n", + " 'obtained',\n", + " 'parts',\n", + " 'capital',\n", + " 'include',\n", + " 'generally',\n", + " 'meeting',\n", + " 'specific',\n", + " 'described',\n", + " 'believe',\n", + " 'review',\n", + " 'issue',\n", + " 'respect',\n", + " 'contract',\n", + " 'became',\n", + " 'effects',\n", + " 'medical',\n", + " 'road',\n", + " 'got',\n", + " 'clear',\n", + " 'main',\n", + " 'labor',\n", + " 'operation',\n", + " 
'size',\n", + " 'below',\n", + " 'hours',\n", + " 'sense',\n", + " 'addition',\n", + " 'probably',\n", + " 'mean',\n", + " 'groups',\n", + " 'century',\n", + " 'personal',\n", + " 'plant',\n", + " 'training',\n", + " 'design',\n", + " 'statement',\n", + " 'structure',\n", + " 'project',\n", + " 'million',\n", + " 'usually',\n", + " 'range',\n", + " 'call',\n", + " 'mother',\n", + " 'seems',\n", + " 'standard',\n", + " 'return',\n", + " 'title',\n", + " 'established',\n", + " 'keep',\n", + " 'space',\n", + " 'annual',\n", + " 'record',\n", + " 'close',\n", + " 'april',\n", + " 'complete',\n", + " 'page',\n", + " 'heart',\n", + " 'says',\n", + " 'fig',\n", + " 'quality',\n", + " 'gas',\n", + " 'methods',\n", + " 'letter',\n", + " 'stock',\n", + " 'costs',\n", + " 'gave',\n", + " 'related',\n", + " 'administration',\n", + " 'activities',\n", + " 'condition',\n", + " 'theory',\n", + " 'town',\n", + " 'equipment',\n", + " 'rates',\n", + " 'soon',\n", + " 'decision',\n", + " 'pressure',\n", + " 'written',\n", + " 'lines',\n", + " 'corporation',\n", + " 'tell',\n", + " 'schools',\n", + " 'agreement',\n", + " 'reported',\n", + " 'attention',\n", + " 'materials',\n", + " 'fire',\n", + " 'direct',\n", + " 'saw',\n", + " 'published',\n", + " 'temperature',\n", + " 'species',\n", + " 'really',\n", + " 'laws',\n", + " 'woman',\n", + " 'function',\n", + " 'military',\n", + " 'proposed',\n", + " 'january',\n", + " 'additional',\n", + " 'late',\n", + " 'books',\n", + " 'opinion',\n", + " 'loss',\n", + " 'limited',\n", + " 'source',\n", + " 'article',\n", + " 'notice',\n", + " 'security',\n", + " 'organization',\n", + " 'hands',\n", + " 'financial',\n", + " 'rules',\n", + " 'follows',\n", + " 'miles',\n", + " 'values',\n", + " 'points',\n", + " 'chief',\n", + " 'distribution',\n", + " 'sometimes',\n", + " 'insurance',\n", + " 'son',\n", + " 'strong',\n", + " 'length',\n", + " 'activity',\n", + " 'original',\n", + " 'forms',\n", + " 'yes',\n", + " 'effective',\n", + " 
'defendant',\n", + " 'living',\n", + " 'december',\n", + " 'character',\n", + " 'began',\n", + " 'carried',\n", + " 'supply',\n", + " 'blood',\n", + " 'taking',\n", + " 'manner',\n", + " 'journal',\n", + " 'hundred',\n", + " 'red',\n", + " 'shows',\n", + " 'developed',\n", + " 'performance',\n", + " 'situation',\n", + " 'programs',\n", + " 'felt',\n", + " 'workers',\n", + " 'volume',\n", + " 'presented',\n", + " 'knew',\n", + " 'answer',\n", + " 'resources',\n", + " 'questions',\n", + " 'industrial',\n", + " 'needs',\n", + " 'twenty',\n", + " 'sent',\n", + " 'looked',\n", + " 'purposes',\n", + " 'library',\n", + " 'added',\n", + " 'passed',\n", + " 'ten',\n", + " 'sea',\n", + " 'applied',\n", + " 'included',\n", + " 'physical',\n", + " 'across',\n", + " 'army',\n", + " 'toward',\n", + " 'produced',\n", + " 'makes',\n", + " 'placed',\n", + " 'role',\n", + " 'october',\n", + " 'final',\n", + " 'approach',\n", + " 'provisions',\n", + " 'leave',\n", + " 'director',\n", + " 'employment',\n", + " 'anything',\n", + " 'particularly',\n", + " 'hard',\n", + " 'outside',\n", + " 'week',\n", + " 'feel',\n", + " 'charge',\n", + " 'indeed',\n", + " 'degree',\n", + " 'reference',\n", + " 'requirements',\n", + " 'september',\n", + " 'today',\n", + " 'western',\n", + " 'influence',\n", + " 'unit',\n", + " 'solution',\n", + " 'chairman',\n", + " 'legal',\n", + " 'motion',\n", + " 'region',\n", + " 'idea',\n", + " 'list',\n", + " 'judgment',\n", + " 'determined',\n", + " 'poor',\n", + " 'disease',\n", + " 'civil',\n", + " 'turn',\n", + " 'modern',\n", + " 'normal',\n", + " 'appear',\n", + " 'employees',\n", + " 'latter',\n", + " 'heard',\n", + " 'top',\n", + " 'sure',\n", + " 'moment',\n", + " 'code',\n", + " 'reports',\n", + " 'wife',\n", + " 'post',\n", + " 'difficult',\n", + " 'recent',\n", + " 'extent',\n", + " 'longer',\n", + " 'story',\n", + " 'meet',\n", + " 'officers',\n", + " 'companies',\n", + " 'patients',\n", + " 'front',\n", + " 'doing',\n", + " 'staff',\n", + " 
'product',\n", + " 'august',\n", + " 'needed',\n", + " 'involved',\n", + " 'likely',\n", + " 'former',\n", + " 'run',\n", + " 'author',\n", + " 'middle',\n", + " 'turned',\n", + " 'agency',\n", + " 'reading',\n", + " 'beginning',\n", + " 'duty',\n", + " 'movement',\n", + " 'month',\n", + " 'alone',\n", + " 'issues',\n", + " 'beyond',\n", + " 'fine',\n", + " 'base',\n", + " 'parties',\n", + " 'relations',\n", + " 'simple',\n", + " 'consider',\n", + " 'proper',\n", + " 'instead',\n", + " 'significant',\n", + " 'appears',\n", + " 'equal',\n", + " 'lost',\n", + " 'followed',\n", + " 'hope',\n", + " 'cut',\n", + " 'unless',\n", + " 'nearly',\n", + " 'claim',\n", + " 'associated',\n", + " 'expected',\n", + " 'operations',\n", + " 'difference',\n", + " 'funds',\n", + " 'direction',\n", + " 'cross',\n", + " 'live',\n", + " 'finally',\n", + " 'weight',\n", + " 'lead',\n", + " 'trial',\n", + " 'justice',\n", + " 'officer',\n", + " 'factors',\n", + " 'response',\n", + " 'cells',\n", + " 'earth',\n", + " 'rest',\n", + " 'fund',\n", + " 'bring',\n", + " 'trust',\n", + " 'goods',\n", + " 'observed',\n", + " 'behind',\n", + " 'job',\n", + " 'door',\n", + " 'types',\n", + " 'understand',\n", + " 'acid',\n", + " 'hold',\n", + " 'technology',\n", + " 'wide',\n", + " 'protection',\n", + " 'basic',\n", + " 'november',\n", + " 'seemed',\n", + " 'throughout',\n", + " 'levels',\n", + " 'importance',\n", + " 'sales',\n", + " 'sale',\n", + " 'stated',\n", + " 'address',\n", + " 'potential',\n", + " 'payment',\n", + " 'prior',\n", + " 'discussion',\n", + " 'conference',\n", + " 'writing',\n", + " 'stage',\n", + " 'fall',\n", + " 'notes',\n", + " 'iron',\n", + " 'play',\n", + " 'ask',\n", + " 'plants',\n", + " 'relationship',\n", + " 'towards',\n", + " 'regard',\n", + " 'referred',\n", + " 'patient',\n", + " 'flow',\n", + " 'consideration',\n", + " 'hospital',\n", + " 'seem',\n", + " 'february',\n", + " 'soil',\n", + " 'morning',\n", + " 'commercial',\n", + " 'planning',\n", + " 
'provides',\n", + " 'appropriate',\n", + " 'technical',\n", + " 'demand',\n", + " 'sufficient',\n", + " 'principal',\n", + " 'credit',\n", + " 'peace',\n", + " 'previous',\n", + " 'object',\n", + " 'prices',\n", + " 'kept',\n", + " 'sound',\n", + " 'wanted',\n", + " 'looking',\n", + " 'entire',\n", + " 'plaintiff',\n", + " 'heat',\n", + " 'ways',\n", + " 'otherwise',\n", + " 'judge',\n", + " 'hour',\n", + " 'capacity',\n", + " 'brown',\n", + " 'music',\n", + " 'risk',\n", + " 'box',\n", + " 'exchange',\n", + " 'produce',\n", + " 'station',\n", + " 'big',\n", + " 'primary',\n", + " 'institute',\n", + " 'mentioned',\n", + " 'prepared',\n", + " 'cell',\n", + " 'spirit',\n", + " 'allowed',\n", + " 'claims',\n", + " 'site',\n", + " 'green',\n", + " 'directly',\n", + " 'text',\n", + " 'friends',\n", + " 'presence',\n", + " 'survey',\n", + " 'determine',\n", + " 'car',\n", + " 'larger',\n", + " 'gives',\n", + " 'deep',\n", + " 'simply',\n", + " 'immediately',\n", + " 'distance',\n", + " 'coming',\n", + " 'seven',\n", + " 'steel',\n", + " 'friend',\n", + " 'records',\n", + " 'existing',\n", + " 'clearly',\n", + " 'actual',\n", + " 'relation',\n", + " 'born',\n", + " 'learning',\n", + " 'forces',\n", + " 'voice',\n", + " 'earlier',\n", + " 'circumstances',\n", + " 'safety',\n", + " 'ago',\n", + " 'issued',\n", + " 'upper',\n", + " 'require',\n", + " 'scale',\n", + " 'island',\n", + " 'culture',\n", + " 'employed',\n", + " 'eight',\n", + " 'estate',\n", + " 'facts',\n", + " 'portion',\n", + " 'deal',\n", + " 'share',\n", + " 'actually',\n", + " 'aid',\n", + " 'engineering',\n", + " 'continue',\n", + " 'formed',\n", + " 'agricultural',\n", + " 'entitled',\n", + " 'mass',\n", + " 'truth',\n", + " 'giving',\n", + " 'numbers',\n", + " 'places',\n", + " 'met',\n", + " 'built',\n", + " 'content',\n", + " 'connection',\n", + " 'assistance',\n", + " 'coal',\n", + " 'progress',\n", + " 'receive',\n", + " 'active',\n", + " 'nation',\n", + " 'contact',\n", + " 'amendment',\n", + " 
'interests',\n", + " 'net',\n", + " 'wall',\n", + " 'standards',\n", + " 'farm',\n", + " 'understanding',\n", + " 'strength',\n", + " 'minutes',\n", + " 'figures',\n", + " 'move',\n", + " 'elements',\n", + " 'concerned',\n", + " 'regulations',\n", + " 'step',\n", + " 'literature',\n", + " 'units',\n", + " 'opportunity',\n", + " 'investment',\n", + " 'led',\n", + " 'reduced',\n", + " 'follow',\n", + " 'facilities',\n", + " 'benefit',\n", + " 'compared',\n", + " 'reached',\n", + " 'student',\n", + " 'religious',\n", + " 'measure',\n", + " 'individuals',\n", + " 'meaning',\n", + " 'considerable',\n", + " 'relative',\n", + " 'electric',\n", + " 'joint',\n", + " 'certainly',\n", + " 'failure',\n", + " 'apply',\n", + " ...]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_words" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd9e939e-7827-42f9-89be-bcfbb8bd3f52", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "64b6fcd3-acf7-45da-a335-79c538963bdd", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "discus\n", + "physic\n", + "posse\n", + "serf\n", + "sens\n", + "caput\n", + "bos\n", + "graf\n", + "pant\n", + "barrack\n", + "auspex\n", + "footstep\n", + "colonist\n", + "villager\n", + "kilometer\n", + "granule\n", + "credential\n", + "petal\n", + "trouser\n", + "shortcoming\n", + "microorganism\n", + "italic\n", + "grandchild\n", + "munition\n", + "parenthesis\n", + "foodstuff\n", + "attache\n", + "grandparent\n", + "tropic\n", + "kilometre\n", + "congratulation\n", + "fume\n", + "convulsion\n", + "nostril\n", + "utensil\n", + "cooky\n", + "amenity\n", + "reptile\n", + "pretension\n", + "sock\n", + "peso\n", + "mitochondrion\n", + "reminiscence\n", + "invader\n", + "macrophage\n", + "eyelid\n", + "dweller\n", + "bristle\n", + "tenet\n", 
+ "taxon\n", + "outskirt\n", + "policyholder\n", + "stamen\n", + "horseman\n", + "striker\n", + "ramification\n", + "tuft\n", + "cultivar\n", + "interrogatory\n", + "bylaw\n", + "bellow\n", + "neoplasm\n", + "insurgent\n", + "chore\n", + "pensioner\n", + "exigency\n", + "forefather\n", + "atrocity\n", + "dissenter\n", + "corpuscle\n", + "islander\n", + "numeral\n", + "bureaucrat\n", + "classmate\n", + "crossroad\n", + "pitfall\n", + "firework\n", + "ravage\n", + "broadcaster\n", + "heretic\n", + "appurtenance\n", + "potentiality\n", + "louse\n", + "conspirator\n", + "revers\n", + "combatant\n", + "conferee\n", + "serviceman\n", + "repercussion\n", + "grader\n", + "exhibitor\n", + "alkaloid\n", + "collaborator\n", + "slipper\n", + "foothill\n", + "homeowner\n", + "hallucination\n", + "ailment\n", + "crumb\n", + "milligram\n", + "turnip\n", + "fingertip\n", + "tradesman\n", + "archaeologist\n", + "bondholder\n", + "lira\n", + "emolument\n", + "tailing\n", + "enthusiast\n", + "tubule\n", + "warship\n", + "speculator\n", + "jobber\n", + "raisin\n", + "vicissitude\n", + "courtier\n", + "clove\n", + "entrant\n", + "festivity\n", + "bough\n", + "imago\n", + "fibroblast\n", + "bruise\n", + "misgiving\n", + "parishioner\n", + "bract\n", + "microbe\n", + "industrialist\n", + "sprout\n", + "wrinkle\n", + "worshipper\n", + "retiree\n", + "cracker\n", + "negotiator\n", + "pronouncement\n", + "devotee\n", + "sandal\n", + "sepal\n", + "interrelationship\n", + "corticosteroid\n", + "sou\n", + "framer\n", + "knuckle\n", + "leukocyte\n", + "malformation\n", + "geographer\n", + "fastener\n", + "ruble\n", + "whisker\n", + "tentacle\n", + "footprint\n", + "ratepayer\n", + "marketer\n", + "refiner\n", + "cilium\n", + "inroad\n", + "dragoon\n", + "litigant\n", + "kilo\n", + "shipowner\n", + "rudiment\n", + "appointee\n", + "fingerprint\n", + "anther\n", + "depredation\n", + "stave\n", + "rancher\n", + "cytokine\n", + "artefact\n", + "freeholder\n", + "churchman\n", + "fungicide\n", + 
"inequity\n", + "contraindication\n", + "arrhythmia\n", + "functionary\n", + "bandit\n", + "horde\n", + "spermatozoon\n", + "selectman\n", + "blocker\n", + "inaccuracy\n", + "gramme\n", + "billet\n", + "radiograph\n", + "demonstrator\n", + "amphibian\n", + "mussel\n", + "rafter\n", + "headlight\n", + "vestige\n", + "loin\n", + "raider\n", + "crevice\n", + "suitor\n", + "technologist\n", + "trooper\n", + "globule\n", + "firefighter\n", + "woodcut\n", + "purr\n", + " or\n", + "parse\n", + " chili\n", + "bolder\n", + " due\n", + "scents\n", + "daze\n" + ] + } + ], + "source": [ + "# Lemmatize all words (plural -> singular)\n", + "lemmatize_mappings = [\n", + " (w, lemmatizer.lemmatize(w)) \n", + " for w in all_words\n", + " # if w != lemmatizer.lemmatize(w)\n", + "]\n", + "\n", + "# Remove all words that lemmatize to another word\n", + "#all_words = [w for w in all_words if w not in ]\n", + "\n", + "# Add custom lemmatizations\n", + "for l in custom_maps:\n", + " if l in lemmatize_mappings:\n", + " print(f\"Warning: {l} is already lemmatized\")\n", + " else:\n", + " lemmatize_mappings.append(l)\n", + " \n", + "lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]\n", + "\n", + "# Now, re-add all lematized words to the list of every word\n", + "for w in sum(lemmatize_mappings, ()):\n", + " if w not in all_words:\n", + " print(w)\n", + " all_words.append(w)\n", + " \n", + "lemmatize_mappings = {k: v for k, v in lemmatize_mappings}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8bdff9d0-f3ff-498f-952d-13f1a91bfbd5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "final_wordlist = []\n", + "seen_lemmatizations = set()\n", + "for w in all_words:\n", + " lemmatized = lemmatize_mappings.get(w) or w\n", + " if lemmatized in seen_lemmatizations:\n", + " # The lemmatized version of this word was already seen\n", + " continue\n", + " else:\n", + " # The lemmatized version hasn't been seen. 
We're good to add it\n", + " final_wordlist.append([\n", + " k\n", + " for k\n", + " in lemmatize_mappings.keys()\n", + " if lemmatize_mappings[k] == lemmatized\n", + " ])\n", + " seen_lemmatizations.add(lemmatized)\n", + "\n", + " if len(final_wordlist) >= WORDLIST_SIZE:\n", + " break\n", + "\n", + "# Now, convert it to the format (number, word)\n", + "final_wordlist = [\n", + " (idx, w)\n", + " for idx, words in enumerate(final_wordlist)\n", + " for w in words\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "65bd6887-613e-45ae-ac45-6ed5967b3a43", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(0, 'the'),\n", + " (1, 'of'),\n", + " (2, 'and'),\n", + " (3, 'to'),\n", + " (4, 'in'),\n", + " (5, 'is'),\n", + " (6, 'that'),\n", + " (7, 'for'),\n", + " (7, 'four'),\n", + " (8, 'be'),\n", + " (8, 'bee'),\n", + " (8, 'bees'),\n", + " (9, 'by'),\n", + " (9, 'bye'),\n", + " (10, 'with'),\n", + " (11, 'on'),\n", + " (12, 'not'),\n", + " (13, 'or'),\n", + " (14, 'this'),\n", + " (15, 'are'),\n", + " (16, 'at'),\n", + " (17, 'from'),\n", + " (18, 'he'),\n", + " (19, 'which'),\n", + " (20, 'his'),\n", + " (21, 'have'),\n", + " (22, 'an'),\n", + " (23, 'but'),\n", + " (24, 'you'),\n", + " (25, 'they'),\n", + " (26, 'were'),\n", + " (27, 'had'),\n", + " (28, 'we'),\n", + " (29, 'all'),\n", + " (30, 'one'),\n", + " (30, 'ones'),\n", + " (31, 'their'),\n", + " (32, 'been'),\n", + " (33, 'will'),\n", + " (33, 'wills'),\n", + " (34, 'there'),\n", + " (35, 'can'),\n", + " (35, 'cans'),\n", + " (36, 'if'),\n", + " (37, 'other'),\n", + " (38, 'would'),\n", + " (39, 'no'),\n", + " (39, 'nos'),\n", + " (40, 'her'),\n", + " (41, 'may'),\n", + " (42, 'more'),\n", + " (42, 'mores'),\n", + " (43, 'when'),\n", + " (44, 'who'),\n", + " (45, 'such'),\n", + " (46, 'these'),\n", + " (47, 'any'),\n", + " (48, 'she'),\n", + " (49, 'new'),\n", + " (50, 'time'),\n", + " (50, 'times'),\n", + " (51, 'than'),\n", + " (52, 'do'),\n", 
+ " (53, 'some'),\n", + " (54, 'what'),\n", + " (55, 'only'),\n", + " (56, 'into'),\n", + " (57, 'them'),\n", + " (58, 'two'),\n", + " (59, 'also'),\n", + " (60, 'about'),\n", + " (61, 'out'),\n", + " (61, 'outs'),\n", + " (62, 'him'),\n", + " (62, 'hem'),\n", + " (63, 'my'),\n", + " (64, 'said'),\n", + " (65, 'up'),\n", + " (66, 'our'),\n", + " (67, 'first'),\n", + " (68, 'should'),\n", + " (69, 'under'),\n", + " (70, 'made'),\n", + " (71, 'state'),\n", + " (71, 'states'),\n", + " (72, 'see'),\n", + " (72, 'sees'),\n", + " (73, 'after'),\n", + " (74, 'could'),\n", + " (75, 'then'),\n", + " (76, 'me'),\n", + " (77, 'most'),\n", + " (78, 'over'),\n", + " (79, 'very'),\n", + " (79, 'vary'),\n", + " (80, 'your'),\n", + " (81, 'between'),\n", + " (82, 'where'),\n", + " (83, 'now'),\n", + " (84, 'shall'),\n", + " (85, 'work'),\n", + " (85, 'works'),\n", + " (86, 'those'),\n", + " (87, 'same'),\n", + " (88, 'well'),\n", + " (88, 'wells'),\n", + " (89, 'each'),\n", + " (90, 'many'),\n", + " (91, 'being'),\n", + " (91, 'beings'),\n", + " (92, 'years'),\n", + " (92, 'year'),\n", + " (93, 'did'),\n", + " (94, 'through'),\n", + " (94, 'thru'),\n", + " (95, 'must'),\n", + " (96, 'upon'),\n", + " (97, 'before'),\n", + " (98, 'like'),\n", + " (98, 'likes'),\n", + " (99, 'use'),\n", + " (100, 'part'),\n", + " (100, 'parts'),\n", + " (101, 'general'),\n", + " (101, 'generals'),\n", + " (102, 'people'),\n", + " (102, 'peoples'),\n", + " (103, 'because'),\n", + " (104, 'used'),\n", + " (105, 'how'),\n", + " (106, 'even'),\n", + " (107, 'much'),\n", + " (108, 'during'),\n", + " (109, 'both'),\n", + " (110, 'case'),\n", + " (110, 'cases'),\n", + " (111, 'three'),\n", + " (112, 'number'),\n", + " (112, 'numbers'),\n", + " (113, 'make'),\n", + " (113, 'makes'),\n", + " (114, 'per'),\n", + " (115, 'great'),\n", + " (115, 'grate'),\n", + " (116, 'act'),\n", + " (116, 'acts'),\n", + " (117, 'way'),\n", + " (117, 'ways'),\n", + " (117, 'weigh'),\n", + " (117, 'weighs'),\n", + " (118, 
'life'),\n", + " (118, 'lives'),\n", + " (119, 'good'),\n", + " (119, 'goods'),\n", + " (120, 'day'),\n", + " (120, 'days'),\n", + " (120, 'dazed'),\n", + " (120, 'daze'),\n", + " (121, 'public'),\n", + " (122, 'man'),\n", + " (122, 'mans'),\n", + " (123, 'however'),\n", + " (124, 'system'),\n", + " (124, 'systems'),\n", + " (125, 'water'),\n", + " (125, 'waters'),\n", + " (126, 'without'),\n", + " (127, 'government'),\n", + " (127, 'governments'),\n", + " (128, 'while'),\n", + " (129, 'long'),\n", + " (130, 'order'),\n", + " (130, 'orders'),\n", + " (131, 'law'),\n", + " (131, 'laws'),\n", + " (132, 'section'),\n", + " (132, 'sections'),\n", + " (133, 'court'),\n", + " (133, 'courts'),\n", + " (134, 'high'),\n", + " (135, 'right'),\n", + " (135, 'rights'),\n", + " (136, 'own'),\n", + " (137, 'found'),\n", + " (138, 'united'),\n", + " (139, 'just'),\n", + " (140, 'here'),\n", + " (141, 'against'),\n", + " (142, 'world'),\n", + " (142, 'worlds'),\n", + " (144, 'company'),\n", + " (144, 'companies'),\n", + " (145, 'within'),\n", + " (146, 'given'),\n", + " (147, 'service'),\n", + " (147, 'services'),\n", + " (148, 'house'),\n", + " (148, 'houses'),\n", + " (149, 'another'),\n", + " (150, 'power'),\n", + " (150, 'powers'),\n", + " (151, 'place'),\n", + " (151, 'places'),\n", + " (152, 'know'),\n", + " (152, 'knows'),\n", + " (153, 'little'),\n", + " (154, 'down'),\n", + " (155, 'present'),\n", + " (155, 'presents'),\n", + " (156, 'every'),\n", + " (157, 'national'),\n", + " (157, 'nationals'),\n", + " (158, 'back'),\n", + " (158, 'backs'),\n", + " (159, 'take'),\n", + " (159, 'takes'),\n", + " (160, 'information'),\n", + " (161, 'men'),\n", + " (162, 'since'),\n", + " (163, 'might'),\n", + " (164, 'small'),\n", + " (165, 'large'),\n", + " (166, 'school'),\n", + " (166, 'schools'),\n", + " (167, 'following'),\n", + " (168, 'still'),\n", + " (170, 'last'),\n", + " (170, 'lasts'),\n", + " (171, 'city'),\n", + " (171, 'cities'),\n", + " (172, 'second'),\n", + " (172, 
'seconds'),\n", + " (173, 'development'),\n", + " (173, 'developments'),\n", + " (174, 'different'),\n", + " (175, 'university'),\n", + " (175, 'universities'),\n", + " (176, 'old'),\n", + " (177, 'form'),\n", + " (177, 'forms'),\n", + " (178, 'point'),\n", + " (178, 'points'),\n", + " (179, 'total'),\n", + " (179, 'totals'),\n", + " (180, 'data'),\n", + " (181, 'too'),\n", + " (182, 'committee'),\n", + " (182, 'committees'),\n", + " (183, 'report'),\n", + " (183, 'reports'),\n", + " (184, 'business'),\n", + " (184, 'businesses'),\n", + " (185, 'think'),\n", + " (185, 'thinks'),\n", + " (186, 'end'),\n", + " (186, 'ends'),\n", + " (187, 'get'),\n", + " (187, 'gets'),\n", + " (188, 'set'),\n", + " (188, 'sets'),\n", + " (189, 'research'),\n", + " (189, 'researches'),\n", + " (190, 'say'),\n", + " (190, 'says'),\n", + " (191, 'come'),\n", + " (191, 'comes'),\n", + " (192, 'country'),\n", + " (192, 'countries'),\n", + " (193, 'never'),\n", + " (194, 'fact'),\n", + " (194, 'facts'),\n", + " (195, 'go'),\n", + " (195, 'goes'),\n", + " (196, 'control'),\n", + " (196, 'controls'),\n", + " (197, 'thus'),\n", + " (198, 'having'),\n", + " (199, 'value'),\n", + " (199, 'values'),\n", + " (200, 'social'),\n", + " (201, 'department'),\n", + " (201, 'departments'),\n", + " (202, 'few'),\n", + " (203, 'above'),\n", + " (204, 'important'),\n", + " (205, 'interest'),\n", + " (205, 'interests'),\n", + " (206, 'study'),\n", + " (206, 'studies'),\n", + " (207, 'off'),\n", + " (208, 'area'),\n", + " (208, 'areas'),\n", + " (209, 'means'),\n", + " (209, 'mean'),\n", + " (210, 'office'),\n", + " (210, 'offices'),\n", + " (211, 'group'),\n", + " (211, 'groups'),\n", + " (212, 'give'),\n", + " (212, 'gives'),\n", + " (213, 'again'),\n", + " (214, 'war'),\n", + " (214, 'wars'),\n", + " (215, 'whether'),\n", + " (216, 'question'),\n", + " (216, 'questions'),\n", + " (217, 'called'),\n", + " (218, 'period'),\n", + " (218, 'periods'),\n", + " (219, 'line'),\n", + " (219, 'lines'),\n", + " 
(220, 'land'),\n", + " (220, 'lands'),\n", + " (221, 'among'),\n", + " (222, 'table'),\n", + " (222, 'tables'),\n", + " (223, 'board'),\n", + " (223, 'boards'),\n", + " (224, 'until'),\n", + " (225, 'hand'),\n", + " (225, 'hands'),\n", + " (226, 'taken'),\n", + " (227, 'need'),\n", + " (227, 'needs'),\n", + " (228, 'education'),\n", + " (229, 'certain'),\n", + " (230, 'county'),\n", + " (230, 'counties'),\n", + " (231, 'action'),\n", + " (231, 'actions'),\n", + " (232, 'several'),\n", + " (233, 'am'),\n", + " (234, 'course'),\n", + " (234, 'courses'),\n", + " (235, 'far'),\n", + " (236, 'effect'),\n", + " (236, 'effects'),\n", + " (237, 'possible'),\n", + " (238, 'though'),\n", + " (239, 'left'),\n", + " (240, 'further'),\n", + " (241, 'home'),\n", + " (241, 'homes'),\n", + " (242, 'person'),\n", + " (242, 'persons'),\n", + " (243, 'health'),\n", + " (244, 'amount'),\n", + " (244, 'amounts'),\n", + " (245, 'members'),\n", + " (245, 'member'),\n", + " (246, 'subject'),\n", + " (246, 'subjects'),\n", + " (247, 'yet'),\n", + " (248, 'program'),\n", + " (248, 'programs'),\n", + " (249, 'therefore'),\n", + " (250, 'process'),\n", + " (250, 'processes'),\n", + " (251, 'rate'),\n", + " (251, 'rates'),\n", + " (252, 'local'),\n", + " (252, 'locals'),\n", + " (253, 'name'),\n", + " (253, 'names'),\n", + " (254, 'find'),\n", + " (254, 'finds'),\n", + " (255, 'necessary'),\n", + " (255, 'necessaries'),\n", + " (256, 'often'),\n", + " (257, 'others'),\n", + " (258, 'whole'),\n", + " (259, 'change'),\n", + " (259, 'changes'),\n", + " (260, 'example'),\n", + " (260, 'examples'),\n", + " (261, 'president'),\n", + " (262, 'history'),\n", + " (262, 'histories'),\n", + " (263, 'best'),\n", + " (264, 'although'),\n", + " (265, 'family'),\n", + " (265, 'families'),\n", + " (266, 'side'),\n", + " (266, 'sides'),\n", + " (267, 'women'),\n", + " (267, 'woman'),\n", + " (268, 'held'),\n", + " (269, 'based'),\n", + " (270, 'south'),\n", + " (271, 'special'),\n", + " (272, 'required'),\n", 
+ " (273, 'came'),\n", + " (274, 'thought'),\n", + " (274, 'thoughts'),\n", + " (275, 'five'),\n", + " (276, 'always'),\n", + " (277, 'himself'),\n", + " (278, 'air'),\n", + " (278, 'airs'),\n", + " (279, 'known'),\n", + " (280, 'head'),\n", + " (280, 'heads'),\n", + " (281, 'either'),\n", + " (282, 'property'),\n", + " (282, 'properties'),\n", + " (283, 'cost'),\n", + " (283, 'costs'),\n", + " (284, 'rather'),\n", + " (285, 'bill'),\n", + " (285, 'bills'),\n", + " (286, 'put'),\n", + " (286, 'puts'),\n", + " (287, 'human'),\n", + " (287, 'humans'),\n", + " (288, 'figure'),\n", + " (288, 'figures'),\n", + " (289, 'results'),\n", + " (289, 'result'),\n", + " (290, 'level'),\n", + " (290, 'levels'),\n", + " (291, 'conditions'),\n", + " (291, 'condition'),\n", + " (292, 'full'),\n", + " (293, 'book'),\n", + " (293, 'books'),\n", + " (294, 'available'),\n", + " (295, 'early'),\n", + " (296, 'matter'),\n", + " (296, 'matters'),\n", + " (297, 'common'),\n", + " (298, 'light'),\n", + " (298, 'lights'),\n", + " (299, 'let'),\n", + " (299, 'lets'),\n", + " (300, 'society'),\n", + " (300, 'societies'),\n", + " (301, 'body'),\n", + " (301, 'bodies'),\n", + " (302, 'international'),\n", + " (303, 'including'),\n", + " (304, 'free'),\n", + " (305, 'evidence'),\n", + " (305, 'evidences'),\n", + " (306, 'better'),\n", + " (307, 'type'),\n", + " (307, 'types'),\n", + " (308, 'provided'),\n", + " (309, 'due'),\n", + " (309, 'dues'),\n", + " (310, 'next'),\n", + " (311, 'production'),\n", + " (311, 'productions'),\n", + " (312, 'once'),\n", + " (313, 'done'),\n", + " (314, 'making'),\n", + " (315, 'least'),\n", + " (316, 'support'),\n", + " (316, 'supports'),\n", + " (317, 'north'),\n", + " (318, 'later'),\n", + " (319, 'using'),\n", + " (320, 'things'),\n", + " (320, 'thing'),\n", + " (321, 'economic'),\n", + " (322, 'chapter'),\n", + " (322, 'chapters'),\n", + " (323, 'various'),\n", + " (324, 'why'),\n", + " (325, 'white'),\n", + " (325, 'whites'),\n", + " (326, 'going'),\n", + " 
(327, 'commission'),\n", + " (327, 'commissions'),\n", + " (328, 'federal'),\n", + " (329, 'away'),\n", + " (330, 'field'),\n", + " (330, 'fields'),\n", + " (331, 'nature'),\n", + " (331, 'natures'),\n", + " (332, 'policy'),\n", + " (332, 'policies'),\n", + " (333, 'become'),\n", + " (334, 'political'),\n", + " (335, 'increase'),\n", + " (335, 'increases'),\n", + " (336, 'around'),\n", + " (337, 'age'),\n", + " (337, 'ages'),\n", + " (338, 'want'),\n", + " (338, 'wants'),\n", + " (339, 'low'),\n", + " (339, 'lows'),\n", + " (340, 'trade'),\n", + " (340, 'trades'),\n", + " (341, 'half'),\n", + " (341, 'halves'),\n", + " (342, 'position'),\n", + " (342, 'positions'),\n", + " (343, 'young'),\n", + " (344, 'money'),\n", + " (344, 'moneys'),\n", + " (345, 'percent'),\n", + " (346, 'cent'),\n", + " (346, 'sense'),\n", + " (346, 'senses'),\n", + " (346, 'scent'),\n", + " (346, 'sensed'),\n", + " (346, 'scents'),\n", + " (347, 'class'),\n", + " (347, 'classes'),\n", + " (348, 'words'),\n", + " (348, 'word'),\n", + " (349, 'view'),\n", + " (349, 'views'),\n", + " (350, 'provide'),\n", + " (351, 'seen'),\n", + " (352, 'show'),\n", + " (352, 'shows'),\n", + " (353, 'district'),\n", + " (353, 'districts'),\n", + " (354, 'party'),\n", + " (354, 'parties'),\n", + " (355, 'analysis'),\n", + " (355, 'analyses'),\n", + " (356, 'care'),\n", + " (356, 'cares'),\n", + " (357, 'june'),\n", + " (358, 'foreign'),\n", + " (359, 'shown'),\n", + " (360, 'received'),\n", + " (361, 'management'),\n", + " (362, 'third'),\n", + " (362, 'thirds'),\n", + " (363, 'took'),\n", + " (364, 'something'),\n", + " (365, 'tax'),\n", + " (365, 'taxes'),\n", + " (366, 'account'),\n", + " (366, 'accounts'),\n", + " (367, 'problem'),\n", + " (367, 'problems'),\n", + " (368, 'almost'),\n", + " (369, 'west'),\n", + " (370, 'nothing'),\n", + " (371, 'together'),\n", + " (372, 'individual'),\n", + " (372, 'individuals'),\n", + " (373, 'open'),\n", + " (373, 'opens'),\n", + " (374, 'material'),\n", + " (374, 
'materials'),\n", + " (375, 'paper'),\n", + " (375, 'papers'),\n", + " (376, 'feet'),\n", + " (376, 'foot'),\n", + " (377, 'force'),\n", + " (377, 'forces'),\n", + " (378, 'association'),\n", + " (378, 'associations'),\n", + " (379, 'purpose'),\n", + " (379, 'purposes'),\n", + " (380, 'terms'),\n", + " (380, 'term'),\n", + " (381, 'method'),\n", + " (381, 'methods'),\n", + " (382, 'help'),\n", + " (382, 'helps'),\n", + " (383, 'real'),\n", + " (384, 'ever'),\n", + " (385, 'already'),\n", + " (386, 'along'),\n", + " (387, 'went'),\n", + " (388, 'particular'),\n", + " (388, 'particulars'),\n", + " (389, 'energy'),\n", + " (389, 'energies'),\n", + " (390, 'secretary'),\n", + " (391, 'date'),\n", + " (391, 'dates'),\n", + " (392, 'price'),\n", + " (392, 'prices'),\n", + " (393, 'short'),\n", + " (393, 'shorts'),\n", + " (394, 'true'),\n", + " (395, 'street'),\n", + " (395, 'streets'),\n", + " (396, 'building'),\n", + " (396, 'buildings'),\n", + " (397, 'room'),\n", + " (397, 'rooms'),\n", + " (398, 'market'),\n", + " (398, 'markets'),\n", + " (399, 'look'),\n", + " (399, 'looks'),\n", + " (400, 'similar'),\n", + " (401, 'industry'),\n", + " (401, 'industries'),\n", + " (402, 'bank'),\n", + " (402, 'banks'),\n", + " (403, 'according'),\n", + " (404, 'itself'),\n", + " (405, 'application'),\n", + " (405, 'applications'),\n", + " (406, 'current'),\n", + " (406, 'currents'),\n", + " (407, 'read'),\n", + " (407, 'reads'),\n", + " (408, 'press'),\n", + " (408, 'presses'),\n", + " (409, 'community'),\n", + " (409, 'communities'),\n", + " (410, 'plan'),\n", + " (410, 'plans'),\n", + " (411, 'whose'),\n", + " (412, 'major'),\n", + " (412, 'majors'),\n", + " (413, 'considered'),\n", + " (414, 'mind'),\n", + " (414, 'minds'),\n", + " (415, 'union'),\n", + " (415, 'unions'),\n", + " (416, 'cause'),\n", + " (416, 'causes'),\n", + " (417, 'able'),\n", + " (418, 'surface'),\n", + " (418, 'surfaces'),\n", + " (419, 'face'),\n", + " (419, 'faces'),\n", + " (420, 'river'),\n", + " (420, 
'rivers'),\n", + " (421, 'council'),\n", + " (421, 'councils'),\n", + " (422, 'income'),\n", + " (422, 'incomes'),\n", + " (423, 'july'),\n", + " (424, 'near'),\n", + " (425, 'experience'),\n", + " (425, 'experiences'),\n", + " (426, 'non'),\n", + " (427, 'paid'),\n", + " (428, 'pay'),\n", + " (428, 'pays'),\n", + " (429, 'reason'),\n", + " (429, 'reasons'),\n", + " (430, 'themselves'),\n", + " (431, 'asked'),\n", + " (432, 'march'),\n", + " (432, 'marches'),\n", + " (433, 'king'),\n", + " (433, 'kings'),\n", + " (434, 'higher'),\n", + " (435, 'single'),\n", + " (435, 'singles'),\n", + " (436, 'average'),\n", + " (436, 'averages'),\n", + " (437, 'father'),\n", + " (437, 'fathers'),\n", + " (438, 'note'),\n", + " (438, 'notes'),\n", + " (439, 'treatment'),\n", + " (439, 'treatments'),\n", + " (440, 'love'),\n", + " (440, 'loves'),\n", + " (441, 'black'),\n", + " (441, 'blacks'),\n", + " (442, 'knowledge'),\n", + " (443, 'enough'),\n", + " (444, 'future'),\n", + " (444, 'futures'),\n", + " (445, 'kind'),\n", + " (445, 'kinds'),\n", + " (446, 'lower'),\n", + " (446, 'lowers'),\n", + " (447, 'authority'),\n", + " (447, 'authorities'),\n", + " (448, 'past'),\n", + " (449, 'natural'),\n", + " (450, 'six'),\n", + " (451, 'food'),\n", + " (451, 'foods'),\n", + " (452, 'working'),\n", + " (452, 'workings'),\n", + " (453, 'central'),\n", + " (454, 'college'),\n", + " (454, 'colleges'),\n", + " (455, 'self'),\n", + " (455, 'selves'),\n", + " (456, 'products'),\n", + " (456, 'product'),\n", + " (457, 'model'),\n", + " (457, 'models'),\n", + " (458, 'brought'),\n", + " (459, 'greater'),\n", + " (460, 'test'),\n", + " (460, 'tests'),\n", + " (461, 'nor'),\n", + " (462, 'students'),\n", + " (462, 'student'),\n", + " (463, 'private'),\n", + " (464, 'construction'),\n", + " (464, 'constructions'),\n", + " (465, 'perhaps'),\n", + " (466, 'ground'),\n", + " (466, 'grounds'),\n", + " (467, 'sir'),\n", + " (468, 'basis'),\n", + " (469, 'months'),\n", + " (469, 'month'),\n", + " (470, 
'growth'),\n", + " (470, 'growths'),\n", + " (471, 'increased'),\n", + " (472, 'east'),\n", + " (473, 'language'),\n", + " (473, 'languages'),\n", + " (474, 'rule'),\n", + " (474, 'rules'),\n", + " (475, 'continued'),\n", + " (476, 'quite'),\n", + " (477, 'except'),\n", + " (478, 'series'),\n", + " (479, 'practice'),\n", + " (479, 'practices'),\n", + " (480, 'night'),\n", + " (480, 'knight'),\n", + " (480, 'nights'),\n", + " (481, 'eyes'),\n", + " (481, 'eye'),\n", + " (482, 'oil'),\n", + " (482, 'oils'),\n", + " (483, 'art'),\n", + " (483, 'arts'),\n", + " (484, 'told'),\n", + " (485, 'especially'),\n", + " (486, 'population'),\n", + " (486, 'populations'),\n", + " (487, 'science'),\n", + " (487, 'sciences'),\n", + " (488, 'whom'),\n", + " (489, 'obtained'),\n", + " (490, 'capital'),\n", + " (490, 'capitol'),\n", + " (490, 'capitals'),\n", + " (491, 'include'),\n", + " (492, 'generally'),\n", + " (493, 'meeting'),\n", + " (493, 'meetings'),\n", + " (494, 'specific'),\n", + " (494, 'specifics'),\n", + " (495, 'described'),\n", + " (496, 'believe'),\n", + " (497, 'review'),\n", + " (497, 'reviews'),\n", + " (498, 'issue'),\n", + " (498, 'issues'),\n", + " (499, 'respect'),\n", + " (499, 'respects'),\n", + " (500, 'contract'),\n", + " (500, 'contracts'),\n", + " (501, 'became'),\n", + " (502, 'medical'),\n", + " (503, 'road'),\n", + " (503, 'roads'),\n", + " (504, 'got'),\n", + " (505, 'clear'),\n", + " (505, 'clears'),\n", + " (506, 'main'),\n", + " (506, 'mains'),\n", + " (507, 'labor'),\n", + " (507, 'labors'),\n", + " (508, 'operation'),\n", + " (508, 'operations'),\n", + " (509, 'size'),\n", + " (509, 'sizes'),\n", + " (510, 'below'),\n", + " (511, 'hours'),\n", + " (511, 'hour'),\n", + " (512, 'addition'),\n", + " (512, 'additions'),\n", + " (513, 'probably'),\n", + " (514, 'century'),\n", + " (514, 'centuries'),\n", + " (515, 'personal'),\n", + " (516, 'plant'),\n", + " (516, 'plants'),\n", + " (517, 'training'),\n", + " (518, 'design'),\n", + " (518, 
'designs'),\n", + " (519, 'statement'),\n", + " (519, 'statements'),\n", + " (520, 'structure'),\n", + " (520, 'structures'),\n", + " (521, 'project'),\n", + " (521, 'projects'),\n", + " (522, 'million'),\n", + " (522, 'millions'),\n", + " (523, 'usually'),\n", + " (524, 'range'),\n", + " (524, 'ranges'),\n", + " (525, 'call'),\n", + " (525, 'calls'),\n", + " (526, 'mother'),\n", + " (526, 'mothers'),\n", + " (527, 'seems'),\n", + " (528, 'standard'),\n", + " (528, 'standards'),\n", + " (529, 'return'),\n", + " (529, 'returns'),\n", + " (530, 'title'),\n", + " (530, 'titles'),\n", + " (531, 'established'),\n", + " (532, 'keep'),\n", + " (532, 'keeps'),\n", + " (533, 'space'),\n", + " (533, 'spaces'),\n", + " (534, 'annual'),\n", + " (535, 'record'),\n", + " (535, 'records'),\n", + " (536, 'close'),\n", + " (536, 'closes'),\n", + " (537, 'april'),\n", + " (538, 'complete'),\n", + " (539, 'page'),\n", + " (539, 'pages'),\n", + " (540, 'heart'),\n", + " (540, 'hearts'),\n", + " (541, 'fig'),\n", + " (541, 'figs'),\n", + " (542, 'quality'),\n", + " (542, 'qualities'),\n", + " (543, 'gas'),\n", + " (543, 'gases'),\n", + " (544, 'letter'),\n", + " (544, 'letters'),\n", + " (545, 'stock'),\n", + " (545, 'stocks'),\n", + " (546, 'gave'),\n", + " (547, 'related'),\n", + " (548, 'administration'),\n", + " (548, 'administrations'),\n", + " (549, 'activities'),\n", + " (549, 'activity'),\n", + " (550, 'theory'),\n", + " (550, 'theories'),\n", + " (551, 'town'),\n", + " (551, 'towns'),\n", + " (552, 'equipment'),\n", + " (552, 'equipments'),\n", + " (553, 'soon'),\n", + " (554, 'decision'),\n", + " (554, 'decisions'),\n", + " (555, 'pressure'),\n", + " (555, 'pressures'),\n", + " (556, 'written'),\n", + " (557, 'corporation'),\n", + " (557, 'corporations'),\n", + " (558, 'tell'),\n", + " (558, 'tells'),\n", + " (559, 'agreement'),\n", + " (559, 'agreements'),\n", + " (560, 'reported'),\n", + " (561, 'attention'),\n", + " (561, 'attentions'),\n", + " (562, 'fire'),\n", + " (562, 
'fires'),\n", + " (563, 'direct'),\n", + " (564, 'saw'),\n", + " (564, 'saws'),\n", + " (565, 'published'),\n", + " (566, 'temperature'),\n", + " (566, 'temperatures'),\n", + " (567, 'species'),\n", + " (567, 'specie'),\n", + " (568, 'really'),\n", + " (569, 'function'),\n", + " (569, 'functions'),\n", + " (570, 'military'),\n", + " (571, 'proposed'),\n", + " (572, 'january'),\n", + " (573, 'additional'),\n", + " (574, 'late'),\n", + " (575, 'opinion'),\n", + " (575, 'opinions'),\n", + " (576, 'loss'),\n", + " (576, 'losses'),\n", + " (577, 'limited'),\n", + " (578, 'source'),\n", + " (578, 'sources'),\n", + " (579, 'article'),\n", + " (579, 'articles'),\n", + " (580, 'notice'),\n", + " (580, 'notices'),\n", + " (581, 'security'),\n", + " (581, 'securities'),\n", + " (582, 'organization'),\n", + " (582, 'organizations'),\n", + " (582, 'organisation'),\n", + " (582, 'organisations'),\n", + " (583, 'financial'),\n", + " (584, 'follows'),\n", + " (585, 'miles'),\n", + " (585, 'mile'),\n", + " (586, 'chief'),\n", + " (586, 'chiefs'),\n", + " (587, 'distribution'),\n", + " (587, 'distributions'),\n", + " (588, 'sometimes'),\n", + " (589, 'insurance'),\n", + " (590, 'son'),\n", + " (590, 'sons'),\n", + " (591, 'strong'),\n", + " (592, 'length'),\n", + " (592, 'lengths'),\n", + " (593, 'original'),\n", + " (593, 'originals'),\n", + " (594, 'yes'),\n", + " (595, 'effective'),\n", + " (596, 'defendant'),\n", + " (596, 'defendants'),\n", + " (597, 'living'),\n", + " (598, 'december'),\n", + " (599, 'character'),\n", + " (599, 'characters'),\n", + " (600, 'began'),\n", + " (601, 'carried'),\n", + " (602, 'supply'),\n", + " (602, 'supplies'),\n", + " (603, 'blood'),\n", + " (604, 'taking'),\n", + " (605, 'manner'),\n", + " (605, 'manners'),\n", + " (606, 'journal'),\n", + " (606, 'journals'),\n", + " (607, 'hundred'),\n", + " (607, 'hundreds'),\n", + " (608, 'red'),\n", + " (609, 'developed'),\n", + " (610, 'performance'),\n", + " (610, 'performances'),\n", + " (611, 
'situation'),\n", + " (611, 'situations'),\n", + " (612, 'felt'),\n", + " (613, 'workers'),\n", + " (613, 'worker'),\n", + " (614, 'volume'),\n", + " (614, 'volumes'),\n", + " (615, 'presented'),\n", + " (616, 'knew'),\n", + " (617, 'answer'),\n", + " (617, 'answers'),\n", + " (618, 'resources'),\n", + " (618, 'resource'),\n", + " (619, 'industrial'),\n", + " (620, 'twenty'),\n", + " (620, 'twenties'),\n", + " (621, 'sent'),\n", + " (622, 'looked'),\n", + " (623, 'library'),\n", + " (623, 'libraries'),\n", + " (624, 'added'),\n", + " (625, 'passed'),\n", + " (626, 'ten'),\n", + " (626, 'tens'),\n", + " (627, 'sea'),\n", + " (627, 'seas'),\n", + " (628, 'applied'),\n", + " (629, 'included'),\n", + " (630, 'physical'),\n", + " (631, 'across'),\n", + " (632, 'army'),\n", + " (632, 'armies'),\n", + " (633, 'toward'),\n", + " (634, 'produced'),\n", + " (635, 'placed'),\n", + " (636, 'role'),\n", + " (636, 'roles'),\n", + " (637, 'october'),\n", + " (638, 'final'),\n", + " (639, 'approach'),\n", + " (639, 'approaches'),\n", + " (640, 'provisions'),\n", + " (640, 'provision'),\n", + " (641, 'leave'),\n", + " (642, 'director'),\n", + " (642, 'directors'),\n", + " (643, 'employment'),\n", + " (643, 'employments'),\n", + " (644, 'anything'),\n", + " (645, 'particularly'),\n", + " (646, 'hard'),\n", + " (647, 'outside'),\n", + " (648, 'week'),\n", + " (648, 'weeks'),\n", + " (649, 'feel'),\n", + " (649, 'feels'),\n", + " (650, 'charge'),\n", + " (650, 'charges'),\n", + " (651, 'indeed'),\n", + " (652, 'degree'),\n", + " (652, 'degrees'),\n", + " (653, 'reference'),\n", + " ...]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_wordlist" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d1a06597-4ad5-4566-a716-8bbad416b7ab", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open(\"final_wordlist.csv\", \"w\") as f:\n", + " f.write(\"word,number\\n\")\n", + " \n", + " 
for w in final_wordlist:\n", + " lemmatized = \"\" if not w[1] else w[1]\n", + " f.write(f\"{w[1].upper()},{w[0]}\")\n", + " f.write(\"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c88fe193-11cc-4a06-a3cf-d1ad85f44d14", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "final_wordlist" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a0d177b-3499-42fb-8091-29547567d69a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/wordlist/wordlist-new2.ipynb b/wordlist/wordlist-new2.ipynb new file mode 100644 index 0000000..52a0d2f --- /dev/null +++ b/wordlist/wordlist-new2.ipynb @@ -0,0 +1,220 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "991a711f-be98-4aae-a657-84b065449916", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "try:\n", + " _initialized\n", + "except:\n", + " # !pip install spacy\n", + " # !python -m spacy download en_core_web_trf\n", + " import spacy\n", + " \n", + " nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n", + " \n", + " _initialized=True\n", + " \n", + "import pandas as pd\n", + "import gzip\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d130bb84", + "metadata": {}, + "outputs": [], + "source": [ + "def get_lines(filename):\n", + " with gzip.open(filename, 'r') as f:\n", + " ret = []\n", + " for l in f:\n", + " if len(ret) > 30_000:\n", + " return ret\n", + " ret.append(str(l).lower())\n", + " return ret\n", + "\n", + "\n", + " 
\n", + "WORDLIST_SIZE = 8192 + 3\n", + "word_re = re.compile(r\"^[A-Za-z]+$\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "de2d1731", + "metadata": {}, + "outputs": [], + "source": [ + "!pwd\n", + "!ls" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "90665714", + "metadata": {}, + "outputs": [], + "source": [ + "annotated_words=pd.read_excel(\"annotated_words.ods\")\n", + "\n", + "excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n", + "excluded_words[0:10]\n", + "\n", + "custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n", + "\n", + "custom_maps = [\n", + " (m[1][\"word\"].lower(), mapping.lower())\n", + " for m in custom_maps.iterrows()\n", + " for mapping in m[1][\"maps_to\"]\n", + "]\n", + "custom_maps" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "fb50c69e", + "metadata": {}, + "outputs": [], + "source": [ + "# Start parsing the wordlist\n", + "all_words = get_lines(\"00-frequency-all.txt.gz\")\n", + "\n", + "# Delete header line\n", + "all_words = all_words[1:]\n", + "\n", + "# Get only the word (fixed width)\n", + "all_words = [w[13:36].strip() for w in all_words]\n", + "\n", + "# Remove special characters\n", + "all_words = [w for w in all_words if word_re.search(w)]\n", + "\n", + "# Remove all removed words\n", + "all_words = [w for w in all_words if w not in excluded_words]\n", + "\n", + "# Add all custom mappings\n", + "for m in list(sum(custom_maps, ())):\n", + " if m[0] not in all_words:\n", + " all_words.append(m[0])\n", + " if m[1] not in all_words:\n", + " all_words.append(m[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "cd21bff5", + "metadata": {}, + "outputs": [], + "source": [ + "# Lemmatize all words (plural -> singular)\n", + "lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in 
all_words[:100]]\n", + "print(lemmatize_mappings[:100])\n", + "\n", + "# Add custom lemmatizations\n", + "for l in custom_maps:\n", + " if l in lemmatize_mappings:\n", + " print(f\"Warning: {l} is already lemmatized\")\n", + " else:\n", + " lemmatize_mappings.append(l)\n", + " \n", + "print(lemmatize_mappings[:100])\n", + "\n", + "lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]\n", + "print(lemmatize_mappings[:100])\n", + "\n", + "# Now, re-add all lematized words to the list of every word\n", + "for w in sum(lemmatize_mappings, ()):\n", + " if w not in all_words:\n", + " print(w)\n", + " all_words.append(w)\n", + " \n", + "lemmatize_mappings = {k: v for k, v in lemmatize_mappings}" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0ee9af7d", + "metadata": {}, + "outputs": [], + "source": [ + "final_wordlist = []\n", + "seen_lemmatizations = set()\n", + "for w in all_words:\n", + " lemmatized = lemmatize_mappings.get(w) or w\n", + " if lemmatized in seen_lemmatizations:\n", + " # The lemmatized version of this word was already seen\n", + " continue\n", + " else:\n", + " # The lemmatized version hasn't been seen. 
We're good to add it\n", + " final_wordlist.append([\n", + " k\n", + " for k\n", + " in lemmatize_mappings.keys()\n", + " if lemmatize_mappings[k] == lemmatized\n", + " ])\n", + " seen_lemmatizations.add(lemmatized)\n", + "\n", + " if len(final_wordlist) >= WORDLIST_SIZE:\n", + " break\n", + "\n", + "# Now, convert it to the format (number, word)\n", + "final_wordlist = [\n", + " (idx, w)\n", + " for idx, words in enumerate(final_wordlist)\n", + " for w in words\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "07c1293c", + "metadata": {}, + "outputs": [], + "source": [ + "print(len(lemmatize_mappings))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19c255d0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/wordlist/wordlist-new2.py b/wordlist/wordlist-new2.py new file mode 100755 index 0000000..ca84229 --- /dev/null +++ b/wordlist/wordlist-new2.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +print("Step 1") + + +try: + _initialized +except: + # !pip install spacy + # !python -m spacy download en_core_web_trf + import spacy + from tqdm import tqdm + + nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner']) + + _initialized=True + +import pandas as pd +import gzip +import re + + +print("Step 2") + + +def get_lines(filename): + with gzip.open(filename, 'r') as f: + ret = [] + for l in f: + if len(ret) > 30_000: + return ret + ret.append(str(l).lower()) + return ret + + + +WORDLIST_SIZE = 8192 + 3 +word_re = re.compile(r"^[A-Za-z]+$") + + 
print("Step 3")


# Load the hand-annotated spreadsheet. The code below reads its "word",
# "keep" and "maps_to" columns.
annotated_words = pd.read_excel("annotated_words.ods")

# Every word whose "keep" cell is anything other than "Yes" is excluded.
# Kept as a set because it is probed once per candidate word further down.
excluded_words = set(
    annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower()
)

# Manual (word, maps_to) pairs. A "maps_to" cell may hold several
# comma-separated targets; each target yields its own pair. Fragments are
# stripped so "a, b" does not produce " b", and empty fragments are skipped.
custom_maps = [
    (row["word"].lower(), target.strip().lower())
    for _, row in annotated_words[annotated_words["maps_to"].notna()].iterrows()
    for target in row["maps_to"].split(",")
    if target.strip()
]


print("Step 4")


# Start parsing the wordlist
all_words = get_lines("00-frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width)
# NOTE(review): get_lines stringifies each raw line, so the 13:36 offsets
# presumably account for the leading b' of the bytes repr -- confirm before
# changing either side.
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters (anything that is not purely A-Z / a-z)
all_words = [w for w in all_words if word_re.search(w)]

# Remove all excluded words
all_words = [w for w in all_words if w not in excluded_words]

# Add all custom mappings: both sides of every pair must be present.
# The words themselves are appended (the previous code flattened the pairs
# and then indexed into each string, which appended single characters).
# `known_words` mirrors `all_words` so membership tests stay O(1).
known_words = set(all_words)
for pair in custom_maps:
    for word in pair:
        if word not in known_words:
            known_words.add(word)
            all_words.append(word)


print("Step 5")


# Lemmatize all words (plural -> singular); each entry is (word, lemma)
lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
print(lemmatize_mappings[:100])

# Add custom lemmatizations, warning when the exact pair already exists
known_pairs = set(lemmatize_mappings)
for pair in custom_maps:
    if pair in known_pairs:
        print(f"Warning: {pair} is already lemmatized")
    else:
        known_pairs.add(pair)
        lemmatize_mappings.append(pair)

print(lemmatize_mappings[:100])

# Drop every mapping whose target (lemma) is an excluded word
lemmatize_mappings = [m for m in lemmatize_mappings if m[1] not in excluded_words]
print(lemmatize_mappings[:100])

# Now, re-add all lemmatized words to the list of every word.
# Pairs are walked in order (word first, then lemma), matching the order a
# flattened tuple would give, without building that tuple quadratically.
for pair in lemmatize_mappings:
    for word in pair:
        if word not in known_words:
            print(word)
            known_words.add(word)
            all_words.append(word)

# word -> lemma; for a word listed twice, the later pair wins
lemmatize_mappings = dict(lemmatize_mappings)


print("Step 6")


# Group words by lemma once, in dict insertion order, instead of rescanning
# every key of lemmatize_mappings for each candidate word.
words_by_lemma = {}
for word, lemma in lemmatize_mappings.items():
    words_by_lemma.setdefault(lemma, []).append(word)

final_wordlist = []
seen_lemmatizations = set()
for w in all_words:
    # Words without a (non-empty) mapping stand in for their own lemma
    lemmatized = lemmatize_mappings.get(w) or w
    if lemmatized in seen_lemmatizations:
        # The lemmatized version of this word was already seen
        continue

    # The lemmatized version hasn't been seen. We're good to add every word
    # that maps to it as one group.
    # NOTE(review): a lemma that no word maps to yields an empty group, which
    # still consumes an index but writes no row -- confirm this is intended.
    final_wordlist.append(words_by_lemma.get(lemmatized, []))
    seen_lemmatizations.add(lemmatized)

    if len(final_wordlist) >= WORDLIST_SIZE:
        break

# Now, convert it to the format (number, word)
final_wordlist = [
    (idx, w)
    for idx, words in enumerate(final_wordlist)
    for w in words
]


print("Step 7")

print(len(lemmatize_mappings))

print("Step 8")

# One CSV row per word: upper-cased word followed by its group number
with open("01-generated-wordlist.csv", "w", encoding="utf-8") as f:
    f.write("word,number\n")
    f.writelines(f"{word.upper()},{idx}\n" for idx, word in final_wordlist)

print("Done")