Start working on new wordlist generation

This commit is contained in:
Austen Adler 2023-03-01 22:26:06 -05:00
parent c034652d86
commit 6ebe8cd489
11 changed files with 3392 additions and 19 deletions

1
.gitignore vendored
View File

@ -8,3 +8,4 @@
**/.ipynb_checkpoints **/.ipynb_checkpoints
/target /target
/test-data/generator/build/ /test-data/generator/build/
/wordlist/venv

View File

@ -1,12 +0,0 @@
click==8.1.3
defusedxml==0.7.1
joblib==1.2.0
nltk==3.8.1
numpy==1.24.2
odfpy==1.4.1
pandas==1.5.3
python-dateutil==2.8.2
pytz==2022.7.1
regex==2022.10.31
six==1.16.0
tqdm==4.64.1

View File

@ -12,21 +12,34 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n", "Collecting nltk\n",
"Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n", " Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
"Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hCollecting odfpy\n",
" Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hCollecting regex>=2021.8.3\n",
" Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m29.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
"Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n", "Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
"Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n", "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n" "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
"Building wheels for collected packages: odfpy\n",
" Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=5bfe9fcd7c590666411d404ea3e4ef0f704c9e62ff6621deb4ab09c84bec082a\n",
" Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
"Successfully built odfpy\n",
"Installing collected packages: regex, odfpy, nltk\n",
"Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
] ]
}, },
{ {
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n", "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
"[nltk_data] Package wordnet is already up-to-date!\n"
] ]
} }
], ],
@ -63,6 +76,8 @@
" return ret\n", " return ret\n",
" ret.append(str(l).lower())\n", " ret.append(str(l).lower())\n",
" return ret\n", " return ret\n",
"\n",
"\n",
" \n", " \n",
"WORDLIST_SIZE = 8192 + 3\n", "WORDLIST_SIZE = 8192 + 3\n",
"lemmatizer = WordNetLemmatizer()\n", "lemmatizer = WordNetLemmatizer()\n",

112
docs/wordlist-new2.ipynb Normal file
View File

@ -0,0 +1,112 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "991a711f-be98-4aae-a657-84b065449916",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: spacy in /opt/conda/lib/python3.10/site-packages (3.5.0)\n",
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.4)\n",
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.3.0)\n",
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.12)\n",
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.10.5)\n",
"Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.1.2)\n",
"Requirement already satisfied: typer<0.8.0,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.7.0)\n",
"Requirement already satisfied: numpy>=1.15.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.23.5)\n",
"Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (6.3.0)\n",
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.8)\n",
"Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from spacy) (67.3.2)\n",
"Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (23.0)\n",
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.1.1)\n",
"Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (8.1.7)\n",
"Requirement already satisfied: pathy>=0.10.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.10.1)\n",
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.8)\n",
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.4.6)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (4.64.1)\n",
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.7)\n",
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.9)\n",
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.28.2)\n",
"Requirement already satisfied: typing-extensions>=4.2.0 in /opt/conda/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy) (4.4.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2022.12.7)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.1.1)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.14)\n",
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)\n",
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.7.9)\n",
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.0.4)\n",
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/conda/lib/python3.10/site-packages (from typer<0.8.0,>=0.3.0->spacy) (8.1.3)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->spacy) (2.1.2)\n",
"Collecting en-core-web-trf==3.5.0\n",
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl (460.3 MB)\n",
"\u001b[2K \u001b[91m━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.1/460.3 MB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:14\u001b[0m"
]
}
],
"source": [
"try:\n",
" _initialized\n",
"except:\n",
" !pip install spacy\n",
" !python -m spacy download en_core_web_trf\n",
" import spacy\n",
" \n",
" spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
" \n",
" _initialized=True\n",
" \n",
"import pandas as pd\n",
"import gzip\n",
"import re"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "6b93818f-c54a-4c88-9968-df4244b7c6f6",
 "metadata": {},
 "outputs": [],
 "source": [
  "import spacy\n",
  "\n",
  "# Load the en_core_web_trf pipeline that the setup cell downloads, keeping only the\n",
  "# components needed for lemmatization. spaCy v3 removed shortcut names such as 'en',\n",
  "# so the full package name has to be used here.\n",
  "nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
  "\n",
  "sentence = \"The striped bats are hanging on their feet for best\"\n",
  "\n",
  "# Parse the sentence using the loaded pipeline object `nlp`\n",
  "doc = nlp(sentence)\n",
  "\n",
  "# Extract the lemma for each token and join\n",
  "\" \".join([token.lemma_ for token in doc])\n",
  "# Sample output from the original spaCy v2 example (v3 no longer uses the -PRON-\n",
  "# placeholder, so the pronoun lemma will differ; exact lemmas depend on the model):\n",
  "#> 'the strip bat be hang on -PRON- foot for good'"
 ]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1 @@
word,lemmatized_word
1 word lemmatized_word

96
wordlist/01-lemmatized-words.py Executable file
View File

@ -0,0 +1,96 @@
#!/usr/bin/env python3
# coding: utf-8
print("Step 1")
# Notebook-style run-once guard: `_initialized` is only ever assigned inside the
# handler below, so on a fresh interpreter the lookup raises NameError and the
# one-time setup runs. Catching NameError specifically (instead of a bare
# `except:`) keeps KeyboardInterrupt and unrelated errors from being swallowed.
try:
    _initialized
except NameError:
    # One-time environment setup, to be run manually if the packages are missing:
    # !pip install spacy
    # !python -m spacy download en_core_web_trf
    import spacy
    from tqdm import tqdm

    # Parser and NER are disabled; this script only reads token lemmas.
    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
    _initialized = True
import pandas as pd
import gzip
import re
print("Step 2")
def get_lines(filename, max_lines=30_001):
    """Return the leading lines of a gzip file as lower-cased byte reprs.

    The file is read in binary mode and each line goes through ``str()``, so
    every item is the textual repr of a ``bytes`` object (it starts with the
    two characters ``b'``), lower-cased. This is kept as-is because the
    fixed-width column slice applied to these strings later in the script
    operates on that representation.

    Args:
        filename: Path of the gzip-compressed file to read.
        max_lines: Maximum number of lines to return, or ``None`` for no
            limit. The default of 30,001 reproduces the previous hard-coded
            cut-off (the caller drops the first line as a header, leaving
            30,000 entries).

    Returns:
        A list of at most ``max_lines`` strings, in file order.
    """
    with gzip.open(filename, 'r') as f:
        ret = []
        for l in f:
            if max_lines is not None and len(ret) >= max_lines:
                return ret
            ret.append(str(l).lower())
        return ret
WORDLIST_SIZE = 8192 + 3
# Accepts only tokens made purely of ASCII letters.
word_re = re.compile(r"^[A-Za-z]+$")

print("Step 3")
# Annotation spreadsheet; the columns read below are "word", "keep" and "maps_to".
annotated_words = pd.read_excel("annotated_words.ods")

# Every word whose "keep" column is not exactly "Yes" is excluded (lower-cased).
excluded_words = list(
    annotated_words.loc[annotated_words["keep"] != "Yes", "word"].str.lower()
)

# Build (word, mapping) pairs, one per comma-separated entry of "maps_to".
# Entries are stripped so that "a, b" yields "b" rather than " b", and empty
# pieces (e.g. from a trailing comma) are dropped.
mapped_rows = annotated_words.loc[
    annotated_words["maps_to"].notna(), ["word", "maps_to"]
]
custom_maps = [
    (word.lower(), mapping.strip().lower())
    for word, maps_to in zip(mapped_rows["word"], mapped_rows["maps_to"])
    for mapping in maps_to.split(",")
    if mapping.strip()
]
print("Step 4")
# Start parsing the wordlist
all_words = get_lines("00-frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width). The offsets apply to the str(bytes) repr
# that get_lines returns, not to the decoded text.
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all removed words (set lookup instead of scanning the list per word)
excluded_lookup = set(excluded_words)
all_words = [w for w in all_words if w not in excluded_lookup]

# Add all custom mappings.
# custom_maps holds (word, mapping) tuples. Flattening them with sum(..., ())
# and then indexing m[0]/m[1] picked out single *characters* of each string,
# so iterate the pairs directly and add both members as whole words.
known_words = set(all_words)
for pair in custom_maps:
    for word in pair:
        if word not in known_words:
            known_words.add(word)
            all_words.append(word)
print("Step 5")
# Lemmatize all words (plural -> singular)
lemmatize_mappings = []
for word in tqdm(all_words):
    # Only the lemma of the first token of the parsed word is kept.
    lemmatize_mappings.append((word, nlp(word)[0].lemma_))

# Persist the (word, lemma) pairs as a two-column CSV with a header row.
with open("01-lemmatized-words.csv", "w") as out:
    out.write("word,lemmatized_word\n")
    for word, lemma in lemmatize_mappings:
        out.write(f"{word},{lemma}\n")

124
wordlist/requirements.txt Normal file
View File

@ -0,0 +1,124 @@
anyio==3.6.2
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
asttokens==2.2.1
attrs==22.2.0
backcall==0.2.0
beautifulsoup4==4.11.2
bleach==6.0.0
blis==0.7.9
catalogue==2.0.8
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==3.0.1
click==8.1.3
comm==0.1.2
confection==0.0.4
cymem==2.0.7
debugpy==1.6.6
decorator==5.1.1
defusedxml==0.7.1
en-core-web-trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl
executing==1.2.0
fastjsonschema==2.16.3
filelock==3.9.0
fqdn==1.5.1
huggingface-hub==0.12.1
idna==3.4
ipykernel==6.21.2
ipython==8.11.0
ipython-genutils==0.2.0
ipywidgets==8.0.4
isoduration==20.11.0
jedi==0.18.2
Jinja2==3.1.2
jsonpointer==2.3
jsonschema==4.17.3
jupyter==1.0.0
jupyter-console==6.6.2
jupyter-events==0.6.3
jupyter_client==8.0.3
jupyter_core==5.2.0
jupyter_server==2.3.0
jupyter_server_terminals==0.4.4
jupyterlab-pygments==0.2.2
jupyterlab-widgets==3.0.5
langcodes==3.3.0
MarkupSafe==2.1.2
matplotlib-inline==0.1.6
mistune==2.0.5
murmurhash==1.0.9
nbclassic==0.5.2
nbclient==0.7.2
nbconvert==7.2.9
nbformat==5.7.3
nest-asyncio==1.5.6
notebook==6.5.2
notebook_shim==0.2.2
numpy==1.24.2
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
odfpy==1.4.1
packaging==23.0
pandas==1.5.3
pandocfilters==1.5.0
parso==0.8.3
pathy==0.10.1
pexpect==4.8.0
pickleshare==0.7.5
platformdirs==3.0.0
preshed==3.0.8
prometheus-client==0.16.0
prompt-toolkit==3.0.38
psutil==5.9.4
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.21
pydantic==1.10.5
Pygments==2.14.0
pyrsistent==0.19.3
python-dateutil==2.8.2
python-json-logger==2.0.7
pytz==2022.7.1
PyYAML==6.0
pyzmq==25.0.0
qtconsole==5.4.0
QtPy==2.3.0
regex==2022.10.31
requests==2.28.2
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
Send2Trash==1.8.0
six==1.16.0
smart-open==6.3.0
sniffio==1.3.0
soupsieve==2.4
spacy==3.5.0
spacy-alignments==0.9.0
spacy-legacy==3.0.12
spacy-loggers==1.0.4
spacy-transformers==1.2.2
srsly==2.4.6
stack-data==0.6.2
terminado==0.17.1
thinc==8.1.7
tinycss2==1.2.1
tokenizers==0.13.2
torch==1.13.1
tornado==6.2
tqdm==4.64.1
traitlets==5.9.0
transformers==4.26.1
typer==0.7.0
typing_extensions==4.5.0
uri-template==1.2.0
urllib3==1.26.14
wasabi==1.1.1
wcwidth==0.2.6
webcolors==1.12
webencodings==0.5.1
websocket-client==1.5.1
widgetsnbextension==4.0.5

2657
wordlist/wordlist-new.ipynb Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,220 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "991a711f-be98-4aae-a657-84b065449916",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"try:\n",
" _initialized\n",
"except:\n",
" # !pip install spacy\n",
" # !python -m spacy download en_core_web_trf\n",
" import spacy\n",
" \n",
" nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
" \n",
" _initialized=True\n",
" \n",
"import pandas as pd\n",
"import gzip\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d130bb84",
"metadata": {},
"outputs": [],
"source": [
"def get_lines(filename):\n",
" with gzip.open(filename, 'r') as f:\n",
" ret = []\n",
" for l in f:\n",
" if len(ret) > 30_000:\n",
" return ret\n",
" ret.append(str(l).lower())\n",
" return ret\n",
"\n",
"\n",
" \n",
"WORDLIST_SIZE = 8192 + 3\n",
"word_re = re.compile(r\"^[A-Za-z]+$\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "de2d1731",
"metadata": {},
"outputs": [],
"source": [
"!pwd\n",
"!ls"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "90665714",
"metadata": {},
"outputs": [],
"source": [
"annotated_words=pd.read_excel(\"annotated_words.ods\")\n",
"\n",
"excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n",
"excluded_words[0:10]\n",
"\n",
"custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n",
"\n",
"custom_maps = [\n",
" (m[1][\"word\"].lower(), mapping.lower())\n",
" for m in custom_maps.iterrows()\n",
" for mapping in m[1][\"maps_to\"]\n",
"]\n",
"custom_maps"
]
},
{
 "cell_type": "code",
 "execution_count": 5,
 "id": "fb50c69e",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Start parsing the wordlist\n",
  "all_words = get_lines(\"00-frequency-all.txt.gz\")\n",
  "\n",
  "# Delete header line\n",
  "all_words = all_words[1:]\n",
  "\n",
  "# Get only the word (fixed width)\n",
  "all_words = [w[13:36].strip() for w in all_words]\n",
  "\n",
  "# Remove special characters\n",
  "all_words = [w for w in all_words if word_re.search(w)]\n",
  "\n",
  "# Remove all removed words\n",
  "all_words = [w for w in all_words if w not in excluded_words]\n",
  "\n",
  "# Add all custom mappings\n",
  "# custom_maps holds (word, mapping) tuples; flattening them and indexing m[0]/m[1]\n",
  "# would pick single characters, so add both members of each pair as whole words.\n",
  "for pair in custom_maps:\n",
  "    for word in pair:\n",
  "        if word not in all_words:\n",
  "            all_words.append(word)"
 ]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "cd21bff5",
"metadata": {},
"outputs": [],
"source": [
"# Lemmatize all words (plural -> singular)\n",
"lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in all_words[:100]]\n",
"print(lemmatize_mappings[:100])\n",
"\n",
"# Add custom lemmatizations\n",
"for l in custom_maps:\n",
" if l in lemmatize_mappings:\n",
" print(f\"Warning: {l} is already lemmatized\")\n",
" else:\n",
" lemmatize_mappings.append(l)\n",
" \n",
"print(lemmatize_mappings[:100])\n",
"\n",
"lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]\n",
"print(lemmatize_mappings[:100])\n",
"\n",
"# Now, re-add all lematized words to the list of every word\n",
"for w in sum(lemmatize_mappings, ()):\n",
" if w not in all_words:\n",
" print(w)\n",
" all_words.append(w)\n",
" \n",
"lemmatize_mappings = {k: v for k, v in lemmatize_mappings}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0ee9af7d",
"metadata": {},
"outputs": [],
"source": [
"final_wordlist = []\n",
"seen_lemmatizations = set()\n",
"for w in all_words:\n",
" lemmatized = lemmatize_mappings.get(w) or w\n",
" if lemmatized in seen_lemmatizations:\n",
" # The lemmatized version of this word was already seen\n",
" continue\n",
" else:\n",
" # The lemmatized version hasn't been seen. We're good to add it\n",
" final_wordlist.append([\n",
" k\n",
" for k\n",
" in lemmatize_mappings.keys()\n",
" if lemmatize_mappings[k] == lemmatized\n",
" ])\n",
" seen_lemmatizations.add(lemmatized)\n",
"\n",
" if len(final_wordlist) >= WORDLIST_SIZE:\n",
" break\n",
"\n",
"# Now, convert it to the format (number, word)\n",
"final_wordlist = [\n",
" (idx, w)\n",
" for idx, words in enumerate(final_wordlist)\n",
" for w in words\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "07c1293c",
"metadata": {},
"outputs": [],
"source": [
"print(len(lemmatize_mappings))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19c255d0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

159
wordlist/wordlist-new2.py Executable file
View File

@ -0,0 +1,159 @@
#!/usr/bin/env python3
# coding: utf-8
print("Step 1")
# Notebook-style run-once guard: `_initialized` is only ever assigned inside the
# handler below, so on a fresh interpreter the lookup raises NameError and the
# one-time setup runs. Catching NameError specifically (instead of a bare
# `except:`) keeps KeyboardInterrupt and unrelated errors from being swallowed.
try:
    _initialized
except NameError:
    # One-time environment setup, to be run manually if the packages are missing:
    # !pip install spacy
    # !python -m spacy download en_core_web_trf
    import spacy
    from tqdm import tqdm

    # Parser and NER are disabled; this script only reads token lemmas.
    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
    _initialized = True
import pandas as pd
import gzip
import re
print("Step 2")
def get_lines(filename, max_lines=30_001):
    """Return the leading lines of a gzip file as lower-cased byte reprs.

    The file is read in binary mode and each line goes through ``str()``, so
    every item is the textual repr of a ``bytes`` object (it starts with the
    two characters ``b'``), lower-cased. This is kept as-is because the
    fixed-width column slice applied to these strings later in the script
    operates on that representation.

    Args:
        filename: Path of the gzip-compressed file to read.
        max_lines: Maximum number of lines to return, or ``None`` for no
            limit. The default of 30,001 reproduces the previous hard-coded
            cut-off (the caller drops the first line as a header, leaving
            30,000 entries).

    Returns:
        A list of at most ``max_lines`` strings, in file order.
    """
    with gzip.open(filename, 'r') as f:
        ret = []
        for l in f:
            if max_lines is not None and len(ret) >= max_lines:
                return ret
            ret.append(str(l).lower())
        return ret
# Number of entries at which the final wordlist stops growing (see Step 6).
WORDLIST_SIZE = 8192 + 3
# Accepts only tokens made purely of ASCII letters.
word_re = re.compile(r"^[A-Za-z]+$")

print("Step 3")
# Annotation spreadsheet; the columns read below are "word", "keep" and "maps_to".
annotated_words = pd.read_excel("annotated_words.ods")

# Every word whose "keep" column is not exactly "Yes" is excluded (lower-cased).
excluded_words = list(
    annotated_words.loc[annotated_words["keep"] != "Yes", "word"].str.lower()
)

# Build (word, mapping) pairs, one per comma-separated entry of "maps_to".
# Entries are stripped so that "a, b" yields "b" rather than " b", and empty
# pieces (e.g. from a trailing comma) are dropped.
mapped_rows = annotated_words.loc[
    annotated_words["maps_to"].notna(), ["word", "maps_to"]
]
custom_maps = [
    (word.lower(), mapping.strip().lower())
    for word, maps_to in zip(mapped_rows["word"], mapped_rows["maps_to"])
    for mapping in maps_to.split(",")
    if mapping.strip()
]
print("Step 4")
# Start parsing the wordlist
all_words = get_lines("00-frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width). The offsets apply to the str(bytes) repr
# that get_lines returns, not to the decoded text.
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all removed words (set lookup instead of scanning the list per word)
excluded_lookup = set(excluded_words)
all_words = [w for w in all_words if w not in excluded_lookup]

# Add all custom mappings.
# custom_maps holds (word, mapping) tuples. Flattening them with sum(..., ())
# and then indexing m[0]/m[1] picked out single *characters* of each string,
# so iterate the pairs directly and add both members as whole words.
known_words = set(all_words)
for pair in custom_maps:
    for word in pair:
        if word not in known_words:
            known_words.add(word)
            all_words.append(word)
print("Step 5")
# Lemmatize all words (plural -> singular); only the first token's lemma is kept.
lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
print(lemmatize_mappings[:100])

# Add custom lemmatizations. A set mirrors the list so the duplicate check
# does not rescan the whole list for every custom mapping.
known_pairs = set(lemmatize_mappings)
for l in custom_maps:
    if l in known_pairs:
        print(f"Warning: {l} is already lemmatized")
    else:
        known_pairs.add(l)
        lemmatize_mappings.append(l)
print(lemmatize_mappings[:100])

# Drop every pair whose lemma is an excluded word.
excluded_lookup = set(excluded_words)
lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_lookup]
print(lemmatize_mappings[:100])

# Now, re-add all lemmatized words to the list of every word.
# The pairs are walked directly instead of being flattened with sum(..., ()),
# which copies the growing tuple on every addition (quadratic in list length).
known_words = set(all_words)
for pair in lemmatize_mappings:
    for w in pair:
        if w not in known_words:
            print(w)
            known_words.add(w)
            all_words.append(w)

# From here on the name refers to a word -> lemma dict (later pairs win).
lemmatize_mappings = dict(lemmatize_mappings)
print("Step 6")
# Index the words by their lemma once, in mapping order, so each group can be
# looked up directly instead of rescanning every key for every new lemma.
words_by_lemma = {}
for word, lemma in lemmatize_mappings.items():
    words_by_lemma.setdefault(lemma, []).append(word)

final_wordlist = []
seen_lemmatizations = set()
for w in all_words:
    # Words without a mapping (or with an empty lemma) stand for themselves.
    lemmatized = lemmatize_mappings.get(w) or w
    if lemmatized in seen_lemmatizations:
        # The lemmatized version of this word was already seen
        continue
    seen_lemmatizations.add(lemmatized)

    group = words_by_lemma.get(lemmatized)
    if not group:
        # No word maps to this lemma (its own mapping was filtered out), so
        # there is nothing to emit. Skipping it keeps an empty group from
        # consuming a number and leaving a hole in the generated list.
        continue

    # The lemmatized version hasn't been seen. We're good to add it
    final_wordlist.append(group)
    if len(final_wordlist) >= WORDLIST_SIZE:
        break

# Now, convert it to the format (number, word)
final_wordlist = [
    (idx, w)
    for idx, words in enumerate(final_wordlist)
    for w in words
]
print("Step 7")
print(len(lemmatize_mappings))

print("Step 8")
# Write one "WORD,number" row per (number, word) entry, with the word upper-cased.
with open("01-generated-wordlist.csv", "w") as f:
    f.write("word,number\n")
    for number, word in final_wordlist:
        f.write(f"{word.upper()},{number}\n")
print("Done")