Start working on new wordlist generation
This commit is contained in:
parent
c034652d86
commit
6ebe8cd489
1
.gitignore
vendored
1
.gitignore
vendored
@ -8,3 +8,4 @@
|
||||
**/.ipynb_checkpoints
|
||||
/target
|
||||
/test-data/generator/build/
|
||||
/wordlist/venv
|
||||
|
@ -1,12 +0,0 @@
|
||||
click==8.1.3
|
||||
defusedxml==0.7.1
|
||||
joblib==1.2.0
|
||||
nltk==3.8.1
|
||||
numpy==1.24.2
|
||||
odfpy==1.4.1
|
||||
pandas==1.5.3
|
||||
python-dateutil==2.8.2
|
||||
pytz==2022.7.1
|
||||
regex==2022.10.31
|
||||
six==1.16.0
|
||||
tqdm==4.64.1
|
@ -12,21 +12,34 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
|
||||
"Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
|
||||
"Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
|
||||
"Collecting nltk\n",
|
||||
" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hCollecting odfpy\n",
|
||||
" Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hCollecting regex>=2021.8.3\n",
|
||||
" Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m29.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
|
||||
"Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
|
||||
"Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
|
||||
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
|
||||
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
|
||||
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
|
||||
"Building wheels for collected packages: odfpy\n",
|
||||
" Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=5bfe9fcd7c590666411d404ea3e4ef0f704c9e62ff6621deb4ab09c84bec082a\n",
|
||||
" Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
|
||||
"Successfully built odfpy\n",
|
||||
"Installing collected packages: regex, odfpy, nltk\n",
|
||||
"Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
|
||||
"[nltk_data] Package wordnet is already up-to-date!\n"
|
||||
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -64,6 +77,8 @@
|
||||
" ret.append(str(l).lower())\n",
|
||||
" return ret\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" \n",
|
||||
"WORDLIST_SIZE = 8192 + 3\n",
|
||||
"lemmatizer = WordNetLemmatizer()\n",
|
||||
"word_re = re.compile(r\"^[A-Za-z]+$\")"
|
||||
|
112
docs/wordlist-new2.ipynb
Normal file
112
docs/wordlist-new2.ipynb
Normal file
@ -0,0 +1,112 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "991a711f-be98-4aae-a657-84b065449916",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Requirement already satisfied: spacy in /opt/conda/lib/python3.10/site-packages (3.5.0)\n",
|
||||
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.4)\n",
|
||||
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.3.0)\n",
|
||||
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.12)\n",
|
||||
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.10.5)\n",
|
||||
"Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.1.2)\n",
|
||||
"Requirement already satisfied: typer<0.8.0,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.7.0)\n",
|
||||
"Requirement already satisfied: numpy>=1.15.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.23.5)\n",
|
||||
"Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (6.3.0)\n",
|
||||
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.8)\n",
|
||||
"Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from spacy) (67.3.2)\n",
|
||||
"Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (23.0)\n",
|
||||
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.1.1)\n",
|
||||
"Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (8.1.7)\n",
|
||||
"Requirement already satisfied: pathy>=0.10.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.10.1)\n",
|
||||
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.8)\n",
|
||||
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.4.6)\n",
|
||||
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (4.64.1)\n",
|
||||
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.7)\n",
|
||||
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.9)\n",
|
||||
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.28.2)\n",
|
||||
"Requirement already satisfied: typing-extensions>=4.2.0 in /opt/conda/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy) (4.4.0)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2022.12.7)\n",
|
||||
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.1.1)\n",
|
||||
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.14)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)\n",
|
||||
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.7.9)\n",
|
||||
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.0.4)\n",
|
||||
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/conda/lib/python3.10/site-packages (from typer<0.8.0,>=0.3.0->spacy) (8.1.3)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->spacy) (2.1.2)\n",
|
||||
"Collecting en-core-web-trf==3.5.0\n",
|
||||
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl (460.3 MB)\n",
|
||||
"\u001b[2K \u001b[91m━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.1/460.3 MB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:14\u001b[0m"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" _initialized\n",
|
||||
"except:\n",
|
||||
" !pip install spacy\n",
|
||||
" !python -m spacy download en_core_web_trf\n",
|
||||
" import spacy\n",
|
||||
" \n",
|
||||
" spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
|
||||
" \n",
|
||||
" _initialized=True\n",
|
||||
" \n",
|
||||
"import pandas as pd\n",
|
||||
"import gzip\n",
|
||||
"import re"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6b93818f-c54a-4c88-9968-df4244b7c6f6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import spacy\n",
|
||||
"\n",
|
||||
"# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization\n",
|
||||
"nlp = spacy.load('en', disable=['parser', 'ner'])\n",
|
||||
"\n",
|
||||
"sentence = \"The striped bats are hanging on their feet for best\"\n",
|
||||
"\n",
|
||||
"# Parse the sentence using the loaded 'en' model object `nlp`\n",
|
||||
"doc = nlp(sentence)\n",
|
||||
"\n",
|
||||
"# Extract the lemma for each token and join\n",
|
||||
"\" \".join([token.lemma_ for token in doc])\n",
|
||||
"#> 'the strip bat be hang on -PRON- foot for good'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
1
wordlist/01-lemmatized-words.csv
Normal file
1
wordlist/01-lemmatized-words.csv
Normal file
@ -0,0 +1 @@
|
||||
word,lemmatized_word
|
|
96
wordlist/01-lemmatized-words.py
Executable file
96
wordlist/01-lemmatized-words.py
Executable file
@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding: utf-8
|
||||
|
||||
print("Step 1")
|
||||
|
||||
|
||||
# One-time setup (run manually before first use):
#   pip install spacy
#   python -m spacy download en_core_web_trf
#
# `_initialized` is a guard kept from the notebook version of this code: the
# model is only loaded when the name does not exist yet. Looking up an
# undefined name raises NameError, so that is the only exception caught.
try:
    _initialized
except NameError:
    import spacy
    from tqdm import tqdm

    # The parser and NER pipeline components are disabled at load time.
    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])

    _initialized = True
|
||||
|
||||
import pandas as pd
|
||||
import gzip
|
||||
import re
|
||||
|
||||
|
||||
print("Step 2")
|
||||
|
||||
|
||||
def get_lines(filename, max_lines=30_001):
    """Return the first lines of a gzip-compressed file, lower-cased.

    The file is opened in binary mode, so every line is a ``bytes`` object
    and ``str(line)`` produces its repr (a leading ``b'``, escaped newline
    and closing quote included), not the decoded text. That behaviour is
    kept as-is because callers slice the returned strings by fixed column
    offsets.

    Args:
        filename: Path of the gzip file to read.
        max_lines: Maximum number of lines to return. The default of 30 001
            is exactly what the former hard-coded ``len(ret) > 30_000``
            check let through. Pass ``None`` to read the whole file.

    Returns:
        A list with one lower-cased string per line read, in file order.
    """
    from itertools import islice

    with gzip.open(filename, 'r') as f:
        return [str(line).lower() for line in islice(f, max_lines)]
|
||||
|
||||
|
||||
|
||||
WORDLIST_SIZE = 8192 + 3
|
||||
word_re = re.compile(r"^[A-Za-z]+$")
|
||||
|
||||
|
||||
print("Step 3")
|
||||
|
||||
|
||||
# Load the hand-annotated spreadsheet. The columns read below are "word",
# "keep" and "maps_to".
annotated_words = pd.read_excel("annotated_words.ods")

# Every word whose "keep" value is anything other than "Yes" is excluded.
excluded_words = list(
    annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower()
)

# Rows with a "maps_to" value carry a comma-separated list of targets.
# Expand them into one lower-cased (word, target) pair per target.
mapped_rows = annotated_words[annotated_words["maps_to"].notna()][["word", "maps_to"]]
custom_maps = [
    (row["word"].lower(), target.lower())
    for _, row in mapped_rows.iterrows()
    for target in row["maps_to"].split(",")
]
|
||||
|
||||
|
||||
print("Step 4")
|
||||
|
||||
|
||||
# Start parsing the wordlist
all_words = get_lines("00-frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width).
# NOTE(review): get_lines() returns str() of each bytes line, so these
# offsets index into the bytes repr (leading "b'" included) — confirm they
# match the layout of the frequency file.
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all removed words (set lookup instead of scanning the list per word)
excluded_lookup = set(excluded_words)
all_words = [w for w in all_words if w not in excluded_lookup]

# Add all custom mappings. Each entry of custom_maps is a (word, target)
# pair; both sides are added when missing. The pairs are walked directly:
# flattening them first and then indexing [0]/[1] would pick out single
# characters of each string instead of the words themselves.
known_words = set(all_words)
for pair in custom_maps:
    for w in pair:
        if w not in known_words:
            known_words.add(w)
            all_words.append(w)
|
||||
|
||||
|
||||
print("Step 5")
|
||||
|
||||
# Lemmatize all words (plural -> singular). Each word is run through the
# spaCy pipeline on its own and the lemma of its first token is kept.
lemmatize_mappings = []
for word in tqdm(all_words):
    parsed = nlp(word)
    lemmatize_mappings.append((word, parsed[0].lemma_))

# Dump the (word, lemma) pairs as a two-column CSV with a header row.
with open("01-lemmatized-words.csv", "w") as out:
    out.write("word,lemmatized_word\n")
    out.writelines(f"{word},{lemma}\n" for word, lemma in lemmatize_mappings)
|
124
wordlist/requirements.txt
Normal file
124
wordlist/requirements.txt
Normal file
@ -0,0 +1,124 @@
|
||||
anyio==3.6.2
|
||||
argon2-cffi==21.3.0
|
||||
argon2-cffi-bindings==21.2.0
|
||||
arrow==1.2.3
|
||||
asttokens==2.2.1
|
||||
attrs==22.2.0
|
||||
backcall==0.2.0
|
||||
beautifulsoup4==4.11.2
|
||||
bleach==6.0.0
|
||||
blis==0.7.9
|
||||
catalogue==2.0.8
|
||||
certifi==2022.12.7
|
||||
cffi==1.15.1
|
||||
charset-normalizer==3.0.1
|
||||
click==8.1.3
|
||||
comm==0.1.2
|
||||
confection==0.0.4
|
||||
cymem==2.0.7
|
||||
debugpy==1.6.6
|
||||
decorator==5.1.1
|
||||
defusedxml==0.7.1
|
||||
en-core-web-trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl
|
||||
executing==1.2.0
|
||||
fastjsonschema==2.16.3
|
||||
filelock==3.9.0
|
||||
fqdn==1.5.1
|
||||
huggingface-hub==0.12.1
|
||||
idna==3.4
|
||||
ipykernel==6.21.2
|
||||
ipython==8.11.0
|
||||
ipython-genutils==0.2.0
|
||||
ipywidgets==8.0.4
|
||||
isoduration==20.11.0
|
||||
jedi==0.18.2
|
||||
Jinja2==3.1.2
|
||||
jsonpointer==2.3
|
||||
jsonschema==4.17.3
|
||||
jupyter==1.0.0
|
||||
jupyter-console==6.6.2
|
||||
jupyter-events==0.6.3
|
||||
jupyter_client==8.0.3
|
||||
jupyter_core==5.2.0
|
||||
jupyter_server==2.3.0
|
||||
jupyter_server_terminals==0.4.4
|
||||
jupyterlab-pygments==0.2.2
|
||||
jupyterlab-widgets==3.0.5
|
||||
langcodes==3.3.0
|
||||
MarkupSafe==2.1.2
|
||||
matplotlib-inline==0.1.6
|
||||
mistune==2.0.5
|
||||
murmurhash==1.0.9
|
||||
nbclassic==0.5.2
|
||||
nbclient==0.7.2
|
||||
nbconvert==7.2.9
|
||||
nbformat==5.7.3
|
||||
nest-asyncio==1.5.6
|
||||
notebook==6.5.2
|
||||
notebook_shim==0.2.2
|
||||
numpy==1.24.2
|
||||
nvidia-cublas-cu11==11.10.3.66
|
||||
nvidia-cuda-nvrtc-cu11==11.7.99
|
||||
nvidia-cuda-runtime-cu11==11.7.99
|
||||
nvidia-cudnn-cu11==8.5.0.96
|
||||
odfpy==1.4.1
|
||||
packaging==23.0
|
||||
pandas==1.5.3
|
||||
pandocfilters==1.5.0
|
||||
parso==0.8.3
|
||||
pathy==0.10.1
|
||||
pexpect==4.8.0
|
||||
pickleshare==0.7.5
|
||||
platformdirs==3.0.0
|
||||
preshed==3.0.8
|
||||
prometheus-client==0.16.0
|
||||
prompt-toolkit==3.0.38
|
||||
psutil==5.9.4
|
||||
ptyprocess==0.7.0
|
||||
pure-eval==0.2.2
|
||||
pycparser==2.21
|
||||
pydantic==1.10.5
|
||||
Pygments==2.14.0
|
||||
pyrsistent==0.19.3
|
||||
python-dateutil==2.8.2
|
||||
python-json-logger==2.0.7
|
||||
pytz==2022.7.1
|
||||
PyYAML==6.0
|
||||
pyzmq==25.0.0
|
||||
qtconsole==5.4.0
|
||||
QtPy==2.3.0
|
||||
regex==2022.10.31
|
||||
requests==2.28.2
|
||||
rfc3339-validator==0.1.4
|
||||
rfc3986-validator==0.1.1
|
||||
Send2Trash==1.8.0
|
||||
six==1.16.0
|
||||
smart-open==6.3.0
|
||||
sniffio==1.3.0
|
||||
soupsieve==2.4
|
||||
spacy==3.5.0
|
||||
spacy-alignments==0.9.0
|
||||
spacy-legacy==3.0.12
|
||||
spacy-loggers==1.0.4
|
||||
spacy-transformers==1.2.2
|
||||
srsly==2.4.6
|
||||
stack-data==0.6.2
|
||||
terminado==0.17.1
|
||||
thinc==8.1.7
|
||||
tinycss2==1.2.1
|
||||
tokenizers==0.13.2
|
||||
torch==1.13.1
|
||||
tornado==6.2
|
||||
tqdm==4.64.1
|
||||
traitlets==5.9.0
|
||||
transformers==4.26.1
|
||||
typer==0.7.0
|
||||
typing_extensions==4.5.0
|
||||
uri-template==1.2.0
|
||||
urllib3==1.26.14
|
||||
wasabi==1.1.1
|
||||
wcwidth==0.2.6
|
||||
webcolors==1.12
|
||||
webencodings==0.5.1
|
||||
websocket-client==1.5.1
|
||||
widgetsnbextension==4.0.5
|
2657
wordlist/wordlist-new.ipynb
Normal file
2657
wordlist/wordlist-new.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
220
wordlist/wordlist-new2.ipynb
Normal file
220
wordlist/wordlist-new2.ipynb
Normal file
@ -0,0 +1,220 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "991a711f-be98-4aae-a657-84b065449916",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" _initialized\n",
|
||||
"except:\n",
|
||||
" # !pip install spacy\n",
|
||||
" # !python -m spacy download en_core_web_trf\n",
|
||||
" import spacy\n",
|
||||
" \n",
|
||||
" nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
|
||||
" \n",
|
||||
" _initialized=True\n",
|
||||
" \n",
|
||||
"import pandas as pd\n",
|
||||
"import gzip\n",
|
||||
"import re"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "d130bb84",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_lines(filename):\n",
|
||||
" with gzip.open(filename, 'r') as f:\n",
|
||||
" ret = []\n",
|
||||
" for l in f:\n",
|
||||
" if len(ret) > 30_000:\n",
|
||||
" return ret\n",
|
||||
" ret.append(str(l).lower())\n",
|
||||
" return ret\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" \n",
|
||||
"WORDLIST_SIZE = 8192 + 3\n",
|
||||
"word_re = re.compile(r\"^[A-Za-z]+$\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "de2d1731",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pwd\n",
|
||||
"!ls"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "90665714",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"annotated_words=pd.read_excel(\"annotated_words.ods\")\n",
|
||||
"\n",
|
||||
"excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n",
|
||||
"excluded_words[0:10]\n",
|
||||
"\n",
|
||||
"custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n",
|
||||
"\n",
|
||||
"custom_maps = [\n",
|
||||
" (m[1][\"word\"].lower(), mapping.lower())\n",
|
||||
" for m in custom_maps.iterrows()\n",
|
||||
" for mapping in m[1][\"maps_to\"]\n",
|
||||
"]\n",
|
||||
"custom_maps"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "fb50c69e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Start parsing the wordlist\n",
|
||||
"all_words = get_lines(\"00-frequency-all.txt.gz\")\n",
|
||||
"\n",
|
||||
"# Delete header line\n",
|
||||
"all_words = all_words[1:]\n",
|
||||
"\n",
|
||||
"# Get only the word (fixed width)\n",
|
||||
"all_words = [w[13:36].strip() for w in all_words]\n",
|
||||
"\n",
|
||||
"# Remove special characters\n",
|
||||
"all_words = [w for w in all_words if word_re.search(w)]\n",
|
||||
"\n",
|
||||
"# Remove all removed words\n",
|
||||
"all_words = [w for w in all_words if w not in excluded_words]\n",
|
||||
"\n",
|
||||
"# Add all custom mappings\n",
|
||||
"for m in list(sum(custom_maps, ())):\n",
|
||||
" if m[0] not in all_words:\n",
|
||||
" all_words.append(m[0])\n",
|
||||
" if m[1] not in all_words:\n",
|
||||
" all_words.append(m[1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "cd21bff5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Lemmatize all words (plural -> singular)\n",
|
||||
"lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in all_words[:100]]\n",
|
||||
"print(lemmatize_mappings[:100])\n",
|
||||
"\n",
|
||||
"# Add custom lemmatizations\n",
|
||||
"for l in custom_maps:\n",
|
||||
" if l in lemmatize_mappings:\n",
|
||||
" print(f\"Warning: {l} is already lemmatized\")\n",
|
||||
" else:\n",
|
||||
" lemmatize_mappings.append(l)\n",
|
||||
" \n",
|
||||
"print(lemmatize_mappings[:100])\n",
|
||||
"\n",
|
||||
"lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]\n",
|
||||
"print(lemmatize_mappings[:100])\n",
|
||||
"\n",
|
||||
"# Now, re-add all lematized words to the list of every word\n",
|
||||
"for w in sum(lemmatize_mappings, ()):\n",
|
||||
" if w not in all_words:\n",
|
||||
" print(w)\n",
|
||||
" all_words.append(w)\n",
|
||||
" \n",
|
||||
"lemmatize_mappings = {k: v for k, v in lemmatize_mappings}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "0ee9af7d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"final_wordlist = []\n",
|
||||
"seen_lemmatizations = set()\n",
|
||||
"for w in all_words:\n",
|
||||
" lemmatized = lemmatize_mappings.get(w) or w\n",
|
||||
" if lemmatized in seen_lemmatizations:\n",
|
||||
" # The lemmatized version of this word was already seen\n",
|
||||
" continue\n",
|
||||
" else:\n",
|
||||
" # The lemmatized version hasn't been seen. We're good to add it\n",
|
||||
" final_wordlist.append([\n",
|
||||
" k\n",
|
||||
" for k\n",
|
||||
" in lemmatize_mappings.keys()\n",
|
||||
" if lemmatize_mappings[k] == lemmatized\n",
|
||||
" ])\n",
|
||||
" seen_lemmatizations.add(lemmatized)\n",
|
||||
"\n",
|
||||
" if len(final_wordlist) >= WORDLIST_SIZE:\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
"# Now, convert it to the format (number, word)\n",
|
||||
"final_wordlist = [\n",
|
||||
" (idx, w)\n",
|
||||
" for idx, words in enumerate(final_wordlist)\n",
|
||||
" for w in words\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "07c1293c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(len(lemmatize_mappings))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "19c255d0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
159
wordlist/wordlist-new2.py
Executable file
159
wordlist/wordlist-new2.py
Executable file
@ -0,0 +1,159 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding: utf-8
|
||||
|
||||
print("Step 1")
|
||||
|
||||
|
||||
# One-time setup (run manually before first use):
#   pip install spacy
#   python -m spacy download en_core_web_trf
#
# `_initialized` is a guard kept from the notebook version of this code: the
# model is only loaded when the name does not exist yet. Looking up an
# undefined name raises NameError, so that is the only exception caught.
try:
    _initialized
except NameError:
    import spacy
    from tqdm import tqdm

    # The parser and NER pipeline components are disabled at load time.
    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])

    _initialized = True
|
||||
|
||||
import pandas as pd
|
||||
import gzip
|
||||
import re
|
||||
|
||||
|
||||
print("Step 2")
|
||||
|
||||
|
||||
def get_lines(filename, max_lines=30_001):
    """Return the first lines of a gzip-compressed file, lower-cased.

    The file is opened in binary mode, so every line is a ``bytes`` object
    and ``str(line)`` produces its repr (a leading ``b'``, escaped newline
    and closing quote included), not the decoded text. That behaviour is
    kept as-is because callers slice the returned strings by fixed column
    offsets.

    Args:
        filename: Path of the gzip file to read.
        max_lines: Maximum number of lines to return. The default of 30 001
            is exactly what the former hard-coded ``len(ret) > 30_000``
            check let through. Pass ``None`` to read the whole file.

    Returns:
        A list with one lower-cased string per line read, in file order.
    """
    from itertools import islice

    with gzip.open(filename, 'r') as f:
        return [str(line).lower() for line in islice(f, max_lines)]
|
||||
|
||||
|
||||
|
||||
WORDLIST_SIZE = 8192 + 3
|
||||
word_re = re.compile(r"^[A-Za-z]+$")
|
||||
|
||||
|
||||
print("Step 3")
|
||||
|
||||
|
||||
# Load the hand-annotated spreadsheet. The columns read below are "word",
# "keep" and "maps_to".
annotated_words = pd.read_excel("annotated_words.ods")

# Every word whose "keep" value is anything other than "Yes" is excluded.
excluded_words = list(
    annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower()
)

# Rows with a "maps_to" value carry a comma-separated list of targets.
# Expand them into one lower-cased (word, target) pair per target.
mapped_rows = annotated_words[annotated_words["maps_to"].notna()][["word", "maps_to"]]
custom_maps = [
    (row["word"].lower(), target.lower())
    for _, row in mapped_rows.iterrows()
    for target in row["maps_to"].split(",")
]
|
||||
|
||||
|
||||
print("Step 4")
|
||||
|
||||
|
||||
# Start parsing the wordlist
all_words = get_lines("00-frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width).
# NOTE(review): get_lines() returns str() of each bytes line, so these
# offsets index into the bytes repr (leading "b'" included) — confirm they
# match the layout of the frequency file.
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all removed words (set lookup instead of scanning the list per word)
excluded_lookup = set(excluded_words)
all_words = [w for w in all_words if w not in excluded_lookup]

# Add all custom mappings. Each entry of custom_maps is a (word, target)
# pair; both sides are added when missing. The pairs are walked directly:
# flattening them first and then indexing [0]/[1] would pick out single
# characters of each string instead of the words themselves.
known_words = set(all_words)
for pair in custom_maps:
    for w in pair:
        if w not in known_words:
            known_words.add(w)
            all_words.append(w)
|
||||
|
||||
|
||||
print("Step 5")
|
||||
|
||||
|
||||
# Lemmatize all words (plural -> singular). Each word is parsed on its own
# and the lemma of its first token is kept.
lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
print(lemmatize_mappings[:100])

# Add custom lemmatizations. A set mirrors the list so the duplicate check
# is a constant-time lookup instead of a scan over every pair.
known_pairs = set(lemmatize_mappings)
for pair in custom_maps:
    if pair in known_pairs:
        print(f"Warning: {pair} is already lemmatized")
    else:
        known_pairs.add(pair)
        lemmatize_mappings.append(pair)

print(lemmatize_mappings[:100])

# Drop every pair whose lemma (second element) is an excluded word.
excluded_lookup = set(excluded_words)
lemmatize_mappings = [
    pair for pair in lemmatize_mappings if pair[1] not in excluded_lookup
]
print(lemmatize_mappings[:100])

# Now, re-add all lemmatized words to the list of every word. Both sides of
# each pair are considered, in pair order; newly added words are printed.
known_words = set(all_words)
for pair in lemmatize_mappings:
    for w in pair:
        if w not in known_words:
            print(w)
            known_words.add(w)
            all_words.append(w)

# Turn the pairs into a word -> lemma lookup. Later pairs win when a word
# occurs more than once, so a custom mapping appended above overrides the
# spaCy lemma for the same word.
lemmatize_mappings = dict(lemmatize_mappings)
|
||||
|
||||
|
||||
print("Step 6")
|
||||
|
||||
|
||||
# Group the mapped words by lemma once, preserving mapping order, so the loop
# below does not have to scan the whole mapping for every new lemma.
words_by_lemma = {}
for word, lemma in lemmatize_mappings.items():
    words_by_lemma.setdefault(lemma, []).append(word)

final_wordlist = []
seen_lemmatizations = set()
for w in all_words:
    # A word without a (non-empty) mapping stands in as its own lemma.
    lemmatized = lemmatize_mappings.get(w) or w
    if lemmatized in seen_lemmatizations:
        # The lemmatized version of this word was already seen
        continue

    # The lemmatized version hasn't been seen. We're good to add it: one
    # group holding every mapped word that shares this lemma.
    # NOTE(review): a lemma that no mapping points at yields an empty group
    # that still consumes a number and counts toward WORDLIST_SIZE — confirm
    # this is intended.
    final_wordlist.append(list(words_by_lemma.get(lemmatized, ())))
    seen_lemmatizations.add(lemmatized)

    if len(final_wordlist) >= WORDLIST_SIZE:
        break

# Now, convert it to the format (number, word): every word of a group gets
# the group's position as its number.
final_wordlist = [
    (idx, w)
    for idx, words in enumerate(final_wordlist)
    for w in words
]
|
||||
|
||||
|
||||
print("Step 7")
|
||||
|
||||
print(len(lemmatize_mappings))
|
||||
|
||||
print("Step 8")
|
||||
|
||||
# Write the numbered wordlist as CSV: the upper-cased word first, then the
# number it was assigned. Entries of final_wordlist are (number, word) pairs.
with open("01-generated-wordlist.csv", "w") as f:
    f.write("word,number\n")

    for number, word in final_wordlist:
        f.write(f"{word.upper()},{number}\n")
|
||||
|
||||
print("Done")
|
Loading…
Reference in New Issue
Block a user