Start working on new wordlist generation

parent c034652d86
commit 6ebe8cd489
.gitignore (vendored) | 1
@@ -8,3 +8,4 @@
 **/.ipynb_checkpoints
 /target
 /test-data/generator/build/
+/wordlist/venv
@@ -1,12 +0,0 @@
-click==8.1.3
-defusedxml==0.7.1
-joblib==1.2.0
-nltk==3.8.1
-numpy==1.24.2
-odfpy==1.4.1
-pandas==1.5.3
-python-dateutil==2.8.2
-pytz==2022.7.1
-regex==2022.10.31
-six==1.16.0
-tqdm==4.64.1
@@ -12,21 +12,34 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
-"Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
-"Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
+"Collecting nltk\n",
+" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
+"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+"\u001b[?25hCollecting odfpy\n",
+" Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
+"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+"\u001b[?25hCollecting regex>=2021.8.3\n",
+" Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
+"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m29.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+"\u001b[?25hRequirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
 "Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
-"Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
 "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
-"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
+"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
+"Building wheels for collected packages: odfpy\n",
+" Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
+"\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=5bfe9fcd7c590666411d404ea3e4ef0f704c9e62ff6621deb4ab09c84bec082a\n",
+" Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
+"Successfully built odfpy\n",
+"Installing collected packages: regex, odfpy, nltk\n",
+"Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
 ]
 },
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
-"[nltk_data] Package wordnet is already up-to-date!\n"
+"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
 ]
 }
 ],
@@ -64,6 +77,8 @@
 " ret.append(str(l).lower())\n",
 " return ret\n",
 "\n",
+"\n",
+" \n",
 "WORDLIST_SIZE = 8192 + 3\n",
 "lemmatizer = WordNetLemmatizer()\n",
 "word_re = re.compile(r\"^[A-Za-z]+$\")"
docs/wordlist-new2.ipynb (new file) | 112
@@ -0,0 +1,112 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "991a711f-be98-4aae-a657-84b065449916",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: spacy in /opt/conda/lib/python3.10/site-packages (3.5.0)\n",
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.4)\n",
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.3.0)\n",
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.12)\n",
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.10.5)\n",
"Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.1.2)\n",
"Requirement already satisfied: typer<0.8.0,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.7.0)\n",
"Requirement already satisfied: numpy>=1.15.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.23.5)\n",
"Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (6.3.0)\n",
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.8)\n",
"Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from spacy) (67.3.2)\n",
"Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (23.0)\n",
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.1.1)\n",
"Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (8.1.7)\n",
"Requirement already satisfied: pathy>=0.10.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (0.10.1)\n",
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (3.0.8)\n",
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.4.6)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (4.64.1)\n",
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.0.7)\n",
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (1.0.9)\n",
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from spacy) (2.28.2)\n",
"Requirement already satisfied: typing-extensions>=4.2.0 in /opt/conda/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy) (4.4.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2022.12.7)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.1.1)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.14)\n",
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)\n",
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.7.9)\n",
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/conda/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.0.4)\n",
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/conda/lib/python3.10/site-packages (from typer<0.8.0,>=0.3.0->spacy) (8.1.3)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->spacy) (2.1.2)\n",
"Collecting en-core-web-trf==3.5.0\n",
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl (460.3 MB)\n",
"\u001b[2K \u001b[91m━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.1/460.3 MB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:14\u001b[0m"
]
}
],
"source": [
"try:\n",
" _initialized\n",
"except:\n",
" !pip install spacy\n",
" !python -m spacy download en_core_web_trf\n",
" import spacy\n",
" \n",
" spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
" \n",
" _initialized=True\n",
" \n",
"import pandas as pd\n",
"import gzip\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b93818f-c54a-4c88-9968-df4244b7c6f6",
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"\n",
"# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization\n",
"nlp = spacy.load('en', disable=['parser', 'ner'])\n",
"\n",
"sentence = \"The striped bats are hanging on their feet for best\"\n",
"\n",
"# Parse the sentence using the loaded 'en' model object `nlp`\n",
"doc = nlp(sentence)\n",
"\n",
"# Extract the lemma for each token and join\n",
"\" \".join([token.lemma_ for token in doc])\n",
"#> 'the strip bat be hang on -PRON- foot for good'"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
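
The per-word call pattern that the scripts below rely on is simply the lemma_ attribute of the first (and only) token of a one-word doc. A minimal standalone sketch, assuming spaCy and the en_core_web_trf model listed in this commit's requirements are installed:

import spacy

# Keep only the components needed for lemmatization (same flags as the scripts in this commit).
nlp = spacy.load("en_core_web_trf", disable=["parser", "ner"])

def lemma_of(word: str) -> str:
    # A one-word doc has a single token; its .lemma_ is the lemmatized form.
    return nlp(word)[0].lemma_

print(lemma_of("bats"))  # expected: bat
print(lemma_of("feet"))  # expected: foot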
wordlist/01-lemmatized-words.csv (new file) | 1
@@ -0,0 +1 @@
word,lemmatized_word
wordlist/01-lemmatized-words.py (new executable file) | 96
@@ -0,0 +1,96 @@
#!/usr/bin/env python3
# coding: utf-8

print("Step 1")


try:
    _initialized
except:
    # !pip install spacy
    # !python -m spacy download en_core_web_trf
    import spacy
    from tqdm import tqdm

    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])

    _initialized=True

import pandas as pd
import gzip
import re


print("Step 2")


def get_lines(filename):
    with gzip.open(filename, 'r') as f:
        ret = []
        for l in f:
            if len(ret) > 30_000:
                return ret
            ret.append(str(l).lower())
        return ret



WORDLIST_SIZE = 8192 + 3
word_re = re.compile(r"^[A-Za-z]+$")


print("Step 3")


annotated_words=pd.read_excel("annotated_words.ods")

excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
excluded_words[0:10]

custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))

custom_maps = [
    (m[1]["word"].lower(), mapping.lower())
    for m in custom_maps.iterrows()
    for mapping in m[1]["maps_to"]
]
custom_maps


print("Step 4")


# Start parsing the wordlist
all_words = get_lines("00-frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all removed words
all_words = [w for w in all_words if w not in excluded_words]

# Add all custom mappings
for m in list(sum(custom_maps, ())):
    if m[0] not in all_words:
        all_words.append(m[0])
    if m[1] not in all_words:
        all_words.append(m[1])


print("Step 5")

# Lemmatize all words (plural -> singular)
lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]

with open("01-lemmatized-words.csv", "w") as f:
    f.write("word,lemmatized_word\n")

    for w in lemmatize_mappings:
        f.write(f"{w[0]},{w[1]}")
        f.write("\n")
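
The custom_maps construction in the script above is its densest expression: it keeps the rows whose maps_to column is filled in, splits the comma-separated targets, and flattens the result into (word, mapping) pairs. A self-contained toy run, using the script's column names but invented rows, showing the shape of the output:

import pandas as pd

# Toy stand-in for annotated_words.ods; the rows are invented for illustration.
annotated_words = pd.DataFrame({
    "word": ["Mice", "Geese", "Cat"],
    "maps_to": ["mouse", "goose,gosling", None],
})

custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word", "maps_to"]] \
    .assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))

custom_maps = [
    (m[1]["word"].lower(), mapping.lower())
    for m in custom_maps.iterrows()
    for mapping in m[1]["maps_to"]
]
print(custom_maps)  # [('mice', 'mouse'), ('geese', 'goose'), ('geese', 'gosling')]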
wordlist/requirements.txt (new file) | 124
@@ -0,0 +1,124 @@
anyio==3.6.2
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
asttokens==2.2.1
attrs==22.2.0
backcall==0.2.0
beautifulsoup4==4.11.2
bleach==6.0.0
blis==0.7.9
catalogue==2.0.8
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==3.0.1
click==8.1.3
comm==0.1.2
confection==0.0.4
cymem==2.0.7
debugpy==1.6.6
decorator==5.1.1
defusedxml==0.7.1
en-core-web-trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl
executing==1.2.0
fastjsonschema==2.16.3
filelock==3.9.0
fqdn==1.5.1
huggingface-hub==0.12.1
idna==3.4
ipykernel==6.21.2
ipython==8.11.0
ipython-genutils==0.2.0
ipywidgets==8.0.4
isoduration==20.11.0
jedi==0.18.2
Jinja2==3.1.2
jsonpointer==2.3
jsonschema==4.17.3
jupyter==1.0.0
jupyter-console==6.6.2
jupyter-events==0.6.3
jupyter_client==8.0.3
jupyter_core==5.2.0
jupyter_server==2.3.0
jupyter_server_terminals==0.4.4
jupyterlab-pygments==0.2.2
jupyterlab-widgets==3.0.5
langcodes==3.3.0
MarkupSafe==2.1.2
matplotlib-inline==0.1.6
mistune==2.0.5
murmurhash==1.0.9
nbclassic==0.5.2
nbclient==0.7.2
nbconvert==7.2.9
nbformat==5.7.3
nest-asyncio==1.5.6
notebook==6.5.2
notebook_shim==0.2.2
numpy==1.24.2
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
odfpy==1.4.1
packaging==23.0
pandas==1.5.3
pandocfilters==1.5.0
parso==0.8.3
pathy==0.10.1
pexpect==4.8.0
pickleshare==0.7.5
platformdirs==3.0.0
preshed==3.0.8
prometheus-client==0.16.0
prompt-toolkit==3.0.38
psutil==5.9.4
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.21
pydantic==1.10.5
Pygments==2.14.0
pyrsistent==0.19.3
python-dateutil==2.8.2
python-json-logger==2.0.7
pytz==2022.7.1
PyYAML==6.0
pyzmq==25.0.0
qtconsole==5.4.0
QtPy==2.3.0
regex==2022.10.31
requests==2.28.2
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
Send2Trash==1.8.0
six==1.16.0
smart-open==6.3.0
sniffio==1.3.0
soupsieve==2.4
spacy==3.5.0
spacy-alignments==0.9.0
spacy-legacy==3.0.12
spacy-loggers==1.0.4
spacy-transformers==1.2.2
srsly==2.4.6
stack-data==0.6.2
terminado==0.17.1
thinc==8.1.7
tinycss2==1.2.1
tokenizers==0.13.2
torch==1.13.1
tornado==6.2
tqdm==4.64.1
traitlets==5.9.0
transformers==4.26.1
typer==0.7.0
typing_extensions==4.5.0
uri-template==1.2.0
urllib3==1.26.14
wasabi==1.1.1
wcwidth==0.2.6
webcolors==1.12
webencodings==0.5.1
websocket-client==1.5.1
widgetsnbextension==4.0.5
wordlist/wordlist-new.ipynb (new file) | 2657
(File diff suppressed because it is too large)
wordlist/wordlist-new2.ipynb (new file) | 220
@@ -0,0 +1,220 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "991a711f-be98-4aae-a657-84b065449916",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"try:\n",
" _initialized\n",
"except:\n",
" # !pip install spacy\n",
" # !python -m spacy download en_core_web_trf\n",
" import spacy\n",
" \n",
" nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
" \n",
" _initialized=True\n",
" \n",
"import pandas as pd\n",
"import gzip\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d130bb84",
"metadata": {},
"outputs": [],
"source": [
"def get_lines(filename):\n",
" with gzip.open(filename, 'r') as f:\n",
" ret = []\n",
" for l in f:\n",
" if len(ret) > 30_000:\n",
" return ret\n",
" ret.append(str(l).lower())\n",
" return ret\n",
"\n",
"\n",
" \n",
"WORDLIST_SIZE = 8192 + 3\n",
"word_re = re.compile(r\"^[A-Za-z]+$\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "de2d1731",
"metadata": {},
"outputs": [],
"source": [
"!pwd\n",
"!ls"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "90665714",
"metadata": {},
"outputs": [],
"source": [
"annotated_words=pd.read_excel(\"annotated_words.ods\")\n",
"\n",
"excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n",
"excluded_words[0:10]\n",
"\n",
"custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n",
"\n",
"custom_maps = [\n",
" (m[1][\"word\"].lower(), mapping.lower())\n",
" for m in custom_maps.iterrows()\n",
" for mapping in m[1][\"maps_to\"]\n",
"]\n",
"custom_maps"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "fb50c69e",
"metadata": {},
"outputs": [],
"source": [
"# Start parsing the wordlist\n",
"all_words = get_lines(\"00-frequency-all.txt.gz\")\n",
"\n",
"# Delete header line\n",
"all_words = all_words[1:]\n",
"\n",
"# Get only the word (fixed width)\n",
"all_words = [w[13:36].strip() for w in all_words]\n",
"\n",
"# Remove special characters\n",
"all_words = [w for w in all_words if word_re.search(w)]\n",
"\n",
"# Remove all removed words\n",
"all_words = [w for w in all_words if w not in excluded_words]\n",
"\n",
"# Add all custom mappings\n",
"for m in list(sum(custom_maps, ())):\n",
" if m[0] not in all_words:\n",
" all_words.append(m[0])\n",
" if m[1] not in all_words:\n",
" all_words.append(m[1])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "cd21bff5",
"metadata": {},
"outputs": [],
"source": [
"# Lemmatize all words (plural -> singular)\n",
"lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in all_words[:100]]\n",
"print(lemmatize_mappings[:100])\n",
"\n",
"# Add custom lemmatizations\n",
"for l in custom_maps:\n",
" if l in lemmatize_mappings:\n",
" print(f\"Warning: {l} is already lemmatized\")\n",
" else:\n",
" lemmatize_mappings.append(l)\n",
" \n",
"print(lemmatize_mappings[:100])\n",
"\n",
"lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]\n",
"print(lemmatize_mappings[:100])\n",
"\n",
"# Now, re-add all lematized words to the list of every word\n",
"for w in sum(lemmatize_mappings, ()):\n",
" if w not in all_words:\n",
" print(w)\n",
" all_words.append(w)\n",
" \n",
"lemmatize_mappings = {k: v for k, v in lemmatize_mappings}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0ee9af7d",
"metadata": {},
"outputs": [],
"source": [
"final_wordlist = []\n",
"seen_lemmatizations = set()\n",
"for w in all_words:\n",
" lemmatized = lemmatize_mappings.get(w) or w\n",
" if lemmatized in seen_lemmatizations:\n",
" # The lemmatized version of this word was already seen\n",
" continue\n",
" else:\n",
" # The lemmatized version hasn't been seen. We're good to add it\n",
" final_wordlist.append([\n",
" k\n",
" for k\n",
" in lemmatize_mappings.keys()\n",
" if lemmatize_mappings[k] == lemmatized\n",
" ])\n",
" seen_lemmatizations.add(lemmatized)\n",
"\n",
" if len(final_wordlist) >= WORDLIST_SIZE:\n",
" break\n",
"\n",
"# Now, convert it to the format (number, word)\n",
"final_wordlist = [\n",
" (idx, w)\n",
" for idx, words in enumerate(final_wordlist)\n",
" for w in words\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "07c1293c",
"metadata": {},
"outputs": [],
"source": [
"print(len(lemmatize_mappings))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19c255d0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
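
The lemmatization cell above calls nlp(w) once per word, which means one transformer forward pass per word. Not what the notebook does, but under the same assumptions (nlp and all_words defined as in the notebook) the same mapping could be built in batches with spaCy's nlp.pipe; a hedged sketch:

# Sketch only: batch the per-word lemmatization instead of calling nlp(w) in a loop.
lemmatize_mappings = [
    (word, doc[0].lemma_)
    for word, doc in zip(all_words, nlp.pipe(all_words, batch_size=256))
]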
wordlist/wordlist-new2.py (new executable file) | 159
@@ -0,0 +1,159 @@
#!/usr/bin/env python3
# coding: utf-8

print("Step 1")


try:
    _initialized
except:
    # !pip install spacy
    # !python -m spacy download en_core_web_trf
    import spacy
    from tqdm import tqdm

    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])

    _initialized=True

import pandas as pd
import gzip
import re


print("Step 2")


def get_lines(filename):
    with gzip.open(filename, 'r') as f:
        ret = []
        for l in f:
            if len(ret) > 30_000:
                return ret
            ret.append(str(l).lower())
        return ret



WORDLIST_SIZE = 8192 + 3
word_re = re.compile(r"^[A-Za-z]+$")


print("Step 3")


annotated_words=pd.read_excel("annotated_words.ods")

excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
excluded_words[0:10]

custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))

custom_maps = [
    (m[1]["word"].lower(), mapping.lower())
    for m in custom_maps.iterrows()
    for mapping in m[1]["maps_to"]
]
custom_maps


print("Step 4")


# Start parsing the wordlist
all_words = get_lines("00-frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all removed words
all_words = [w for w in all_words if w not in excluded_words]

# Add all custom mappings
for m in list(sum(custom_maps, ())):
    if m[0] not in all_words:
        all_words.append(m[0])
    if m[1] not in all_words:
        all_words.append(m[1])


print("Step 5")


# Lemmatize all words (plural -> singular)
lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
print(lemmatize_mappings[:100])

# Add custom lemmatizations
for l in custom_maps:
    if l in lemmatize_mappings:
        print(f"Warning: {l} is already lemmatized")
    else:
        lemmatize_mappings.append(l)

print(lemmatize_mappings[:100])

lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]
print(lemmatize_mappings[:100])

# Now, re-add all lematized words to the list of every word
for w in sum(lemmatize_mappings, ()):
    if w not in all_words:
        print(w)
        all_words.append(w)

lemmatize_mappings = {k: v for k, v in lemmatize_mappings}


print("Step 6")


final_wordlist = []
seen_lemmatizations = set()
for w in all_words:
    lemmatized = lemmatize_mappings.get(w) or w
    if lemmatized in seen_lemmatizations:
        # The lemmatized version of this word was already seen
        continue
    else:
        # The lemmatized version hasn't been seen. We're good to add it
        final_wordlist.append([
            k
            for k
            in lemmatize_mappings.keys()
            if lemmatize_mappings[k] == lemmatized
        ])
        seen_lemmatizations.add(lemmatized)

    if len(final_wordlist) >= WORDLIST_SIZE:
        break

# Now, convert it to the format (number, word)
final_wordlist = [
    (idx, w)
    for idx, words in enumerate(final_wordlist)
    for w in words
]


print("Step 7")

print(len(lemmatize_mappings))

print("Step 8")

with open("01-generated-wordlist.csv", "w") as f:
    f.write("word,number\n")

    for w in final_wordlist:
        lemmatized = "" if not w[1] else w[1]
        f.write(f"{w[1].upper()},{w[0]}")
        f.write("\n")

print("Done")
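
To make the Step 6 grouping concrete: every surface form that shares a lemma ends up under the same number in the final list. A toy run with invented data, mirroring the grouping and numbering above:

# Invented data for illustration only.
lemmatize_mappings = {"cat": "cat", "cats": "cat", "foot": "foot", "feet": "foot"}
all_words = ["cat", "cats", "foot", "feet"]

final_wordlist = []
seen = set()
for w in all_words:
    lemma = lemmatize_mappings.get(w) or w
    if lemma in seen:
        continue
    # Group every surface form whose lemma matches this one.
    final_wordlist.append([k for k in lemmatize_mappings if lemmatize_mappings[k] == lemma])
    seen.add(lemma)

numbered = [(idx, w) for idx, words in enumerate(final_wordlist) for w in words]
print(numbered)  # [(0, 'cat'), (0, 'cats'), (1, 'foot'), (1, 'feet')]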