Add initial wordlist and code
This commit is contained in:
parent
aaae60a95c
commit
e74b5055ee
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,2 +1,5 @@
|
||||
/target
|
||||
/design-workflow/*.html
|
||||
/design-workflow/*.svg
|
||||
/design-workflow/frequency-all.txt.gz
|
||||
**/.ipynb_checkpoints
|
||||
|
@ -1,4 +1,5 @@
|
||||
// echo DESIGN.adoc | entr sh -c "asciidoctor DESIGN.adoc; printf 'Done\n'"
|
||||
// echo DESIGN_WORKFLOW.adoc | entr sh -c "podman run --rm -it --network none -v "${PWD}:/documents/" asciidoctor/docker-asciidoctor asciidoctor -r asciidoctor-mathematical -a mathematical-format=svg DESIGN_WORKFLOW.adoc; printf 'Done ($(date -Isecond))\n'"
|
||||
|
||||
:toc:
|
||||
:nofooter:
|
||||
:!webfonts:
|
||||
|
BIN
design-workflow/annotated_words.ods
Normal file
BIN
design-workflow/annotated_words.ods
Normal file
Binary file not shown.
385
design-workflow/wordlist.ipynb
Normal file
385
design-workflow/wordlist.ipynb
Normal file
@ -0,0 +1,385 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "0b00342f-7b19-49cc-bc6c-21019f8cc7dc",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
|
||||
"Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
|
||||
"Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
|
||||
"Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
|
||||
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
|
||||
"Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
|
||||
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
|
||||
"[nltk_data] Package wordnet is already up-to-date!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# One-time setup guard: `_initialized` only exists after this cell has run once in the kernel\n",
|
||||
"try:\n",
|
||||
"    _initialized\n",
|
||||
"except NameError:\n",
|
||||
"    # Only a missing name means \"not set up yet\"; any other error should surface\n",
|
||||
"    %pip install nltk==3.8.1 odfpy==1.4.1\n",
|
||||
"    import nltk\n",
|
||||
|
||||
"\n",
|
||||
"    nltk.download(\"wordnet\")\n",
|
||||
"    _initialized = True\n",
|
||||
|
||||
"\n",
|
||||
"from nltk.stem.wordnet import WordNetLemmatizer\n",
|
||||
"import pandas as pd\n",
|
||||
"import gzip\n",
|
||||
"import re\n",
|
||||
"\n",
|
||||
"# Number of entries taken for the final wordlist\n",
|
||||
"# NOTE(review): the reason for the extra 3 entries is not documented here -- confirm\n",
|
||||
"WORDLIST_SIZE = 8192 + 3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d5bf26fa-0aab-403a-9a6f-b2a37dc4892e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## First, get the list of excluded words"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "926d0d84-0d7e-4939-b87f-1a170f870a8f",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the hand-annotated spreadsheet; later cells read its \"word\", \"keep\" and \"maps_to\" columns\n",
|
||||
"annotated_words = pd.read_excel(\"annotated_words.ods\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "8b0d26e4-051c-4669-b566-bbd5ddbbe02b",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'd', 'john']"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Every word whose \"keep\" annotation is anything other than \"Yes\" is excluded\n",
|
||||
"not_kept = annotated_words[\"keep\"] != \"Yes\"\n",
|
||||
"excluded_words = annotated_words.loc[not_kept, \"word\"].str.lower().tolist()\n",
|
||||
"excluded_words[:10]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5f855ec9-eea5-4e15-bc07-96cdd414f36a",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"## Next, get the list of custom mappings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "2eea14b2-82bf-4353-8982-76a6c7f46d22",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('be', 'bee'),\n",
|
||||
" ('by', 'bye'),\n",
|
||||
" ('corps', 'core'),\n",
|
||||
" ('ore', 'oar'),\n",
|
||||
" ('ore', ' or'),\n",
|
||||
" ('vary', 'very'),\n",
|
||||
" ('com', 'calm'),\n",
|
||||
" ('filing', 'filling'),\n",
|
||||
" ('fax', 'facts'),\n",
|
||||
" ('theatre', 'theater'),\n",
|
||||
" ('par', 'parse'),\n",
|
||||
" ('honour', 'honor'),\n",
|
||||
" ('harry', 'hairy'),\n",
|
||||
" ('brings', 'bring'),\n",
|
||||
" ('organisation', 'organization'),\n",
|
||||
" ('simultaneously', 'simultaneous'),\n",
|
||||
" ('aluminum', 'aluminium'),\n",
|
||||
" ('knight', 'night'),\n",
|
||||
" ('electronics', 'electronic'),\n",
|
||||
" ('organisations', 'organizations'),\n",
|
||||
" ('fortunately', 'fortunate'),\n",
|
||||
" ('corp', 'core'),\n",
|
||||
" ('chile', 'chilly'),\n",
|
||||
" ('chile', ' chili'),\n",
|
||||
" ('owe', 'oh'),\n",
|
||||
" ('capitol', 'capital'),\n",
|
||||
" ('weary', 'wary'),\n",
|
||||
" ('berry', 'barry'),\n",
|
||||
" ('lecturer', 'lecture'),\n",
|
||||
" ('aluminium', 'aluminum'),\n",
|
||||
" ('isle', 'aisle'),\n",
|
||||
" ('boulder', 'bolder'),\n",
|
||||
" ('ads', 'adds'),\n",
|
||||
" ('honours', 'honors'),\n",
|
||||
" ('bot', 'bought'),\n",
|
||||
" ('dew', 'do'),\n",
|
||||
" ('dew', ' due'),\n",
|
||||
" ('theatres', 'theaters'),\n",
|
||||
" ('thru', 'through'),\n",
|
||||
" ('monies', 'moneys'),\n",
|
||||
" ('cue', 'queue'),\n",
|
||||
" ('hairy', 'harry'),\n",
|
||||
" ('hem', 'him'),\n",
|
||||
" ('nun', 'none'),\n",
|
||||
" ('organisational', 'organizational'),\n",
|
||||
" ('aux', 'ox'),\n",
|
||||
" ('rap', 'wrap'),\n",
|
||||
" ('filings', 'filling'),\n",
|
||||
" ('sew', 'so'),\n",
|
||||
" ('pars', 'parse'),\n",
|
||||
" ('fillings', 'filling')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Rows that carry a custom mapping; \"maps_to\" holds one or more comma-separated targets\n",
|
||||
"custom_map_rows = annotated_words.loc[annotated_words[\"maps_to\"].notna(), [\"word\", \"maps_to\"]]\n",
|
||||
"\n",
|
||||
"# Build (word, target) pairs, one per target.\n",
|
||||
"# Each target is stripped: entries written as \"a, b\" would otherwise keep the space after\n",
|
||||
"# the comma and never match a real word. Empty targets (e.g. a trailing comma) are skipped.\n",
|
||||
"custom_maps = [\n",
|
||||
"    (word.lower(), target.strip().lower())\n",
|
||||
"    for word, targets in zip(custom_map_rows[\"word\"], custom_map_rows[\"maps_to\"])\n",
|
||||
"    for target in targets.split(\",\")\n",
|
||||
"    if target.strip()\n",
|
||||
"]\n",
|
||||
"custom_maps"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "dc52697b-2a30-4e6c-ab74-b77edce3607c",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"# all_words: 21323\n",
|
||||
"sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n",
|
||||
"\n",
|
||||
"# lemmatize_mappings: 21374\n",
|
||||
"sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n",
|
||||
"\n",
|
||||
"# distinct_words: 17585\n",
|
||||
"sample:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def get_lines(filename, max_lines=30_001):\n",
|
||||
"    \"\"\"Return the first lines of a gzip-compressed file, lower-cased.\n",
|
||||
|
||||
"\n",
|
||||
"    The file is opened in binary mode and each line is converted with str(), so every\n",
|
||||
"    returned entry is the lower-cased repr of the bytes (including the leading b' marker),\n",
|
||||
"    not the decoded text.\n",
|
||||
|
||||
"\n",
|
||||
"    Args:\n",
|
||||
"        filename: Path of the gzip file to read.\n",
|
||||
"        max_lines: Maximum number of lines to return (defaults to the previous fixed cap).\n",
|
||||
|
||||
"\n",
|
||||
"    Returns:\n",
|
||||
"        A list of at most max_lines strings, in file order.\n",
|
||||
"    \"\"\"\n",
|
||||
"    ret = []\n",
|
||||
"    with gzip.open(filename, 'r') as f:\n",
|
||||
"        for raw_line in f:\n",
|
||||
"            if len(ret) >= max_lines:\n",
|
||||
"                break\n",
|
||||
"            # NOTE(review): str() of bytes keeps the b'...' wrapper; code that slices these\n",
|
||||
"            # strings by column presumably relies on that offset -- confirm before decoding\n",
|
||||
"            ret.append(str(raw_line).lower())\n",
|
||||
"    return ret\n",
|
||||
" \n",
|
||||
"lemmatizer = WordNetLemmatizer()\n",
|
||||
"word_re = re.compile(r\"^[A-Za-z]+$\")\n",
|
||||
"\n",
|
||||
"# Start parsing the wordlist, skipping the header line\n",
|
||||
"all_words = get_lines(\"frequency-all.txt.gz\")[1:]\n",
|
||||
"\n",
|
||||
"# Get only the word (fixed width)\n",
|
||||
"all_words = [w[13:36].strip() for w in all_words]\n",
|
||||
"\n",
|
||||
"# Keep only words made up entirely of ASCII letters\n",
|
||||
"all_words = [w for w in all_words if word_re.search(w)]\n",
|
||||
"\n",
|
||||
"# Remove all excluded words (set lookup instead of scanning the list for every word)\n",
|
||||
"excluded_word_set = set(excluded_words)\n",
|
||||
"all_words = [w for w in all_words if w not in excluded_word_set]\n",
|
||||
"\n",
|
||||
"# Lemmatize all words (plural -> singular); each entry is (word, lemma)\n",
|
||||
"lemmatize_mappings = [(w, lemmatizer.lemmatize(w)) for w in all_words]\n",
|
||||
"\n",
|
||||
"# Add custom lemmatizations\n",
|
||||
"for custom_map in custom_maps:\n",
|
||||
"    if custom_map in lemmatize_mappings:\n",
|
||||
"        print(f\"Warning: {custom_map} is already lemmatized\")\n",
|
||||
"    else:\n",
|
||||
"        lemmatize_mappings.append(custom_map)\n",
|
||||
"\n",
|
||||
"# Keep the first word seen for each distinct lemma, preserving the original order\n",
|
||||
"seen_lemmas = set()\n",
|
||||
"distinct_words = []\n",
|
||||
"for word, lemma in lemmatize_mappings:\n",
|
||||
"    if lemma not in seen_lemmas:\n",
|
||||
"        seen_lemmas.add(lemma)\n",
|
||||
"        distinct_words.append(word)\n",
|
||||
"del seen_lemmas\n",
|
||||
"\n",
|
||||
"print(f\"# all_words: {len(all_words)}\")\n",
|
||||
"print(f\"sample: {all_words[0:10]}\")\n",
|
||||
"print()\n",
|
||||
"print(f\"# lemmatize_mappings: {len(lemmatize_mappings)}\")\n",
|
||||
"print(f\"sample: {lemmatize_mappings[0:10]}\")\n",
|
||||
"print()\n",
|
||||
"print(f\"# distinct_words: {len(distinct_words)}\")\n",
|
||||
"print(\"sample:\")\n",
|
||||
"distinct_words[0:10]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6acea424-d538-4981-a4b9-0d9224f8efb3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Generate the final wordlist"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "52d0573e-aefd-4c4e-b682-47d1ff8c676b",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Final wordlist size: 11212\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# The final wordlist map. Maps a word to its numeric value\n",
|
||||
"# Starting at 1\n",
|
||||
"final_wordlist = {\n",
|
||||
"    w: idx\n",
|
||||
"    for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE], start=1)\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Lemma -> the wordlist word that represents it\n",
|
||||
"reverse_lemmatize_idx = {\n",
|
||||
"    lemmatizer.lemmatize(w): w\n",
|
||||
"    for w in final_wordlist\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Give every mapped word the number of the wordlist word its lemma resolves to\n",
|
||||
"for w, lem_w in lemmatize_mappings:\n",
|
||||
"    if lem_w not in reverse_lemmatize_idx:\n",
|
||||
"        # This word is not in the reverse list\n",
|
||||
"        # This happens when the index of the lemmatized word we're working with is too large\n",
|
||||
"        continue\n",
|
||||
|
||||
"\n",
|
||||
"    final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]\n",
|
||||
"\n",
|
||||
"# Sanity checks: a lemmatized variant shares the number of its base word\n",
|
||||
"# (assumes the lemmatizer maps \"its\" to \"it\" -- TODO confirm), and numbering starts at 1\n",
|
||||
"assert final_wordlist[\"its\"] == final_wordlist[\"it\"]\n",
|
||||
"assert final_wordlist[\"its\"] >= 1\n",
|
||||
"\n",
|
||||
"print(f\"Final wordlist size: {len(final_wordlist)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(\"final_wordlist.csv\", \"w\") as f:\n",
|
||||
"    # Emit one \"word,number\" row per entry, ordered by number;\n",
|
||||
"    # words sharing a number keep their insertion order because sorted() is stable\n",
|
||||
"    for word, number in sorted(final_wordlist.items(), key=lambda entry: entry[1]):\n",
|
||||
"        # A falsy number is written as an empty field\n",
|
||||
"        number_field = number if number else \"\"\n",
|
||||
"        f.write(f\"{word},{number_field}\\n\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user