{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "0b00342f-7b19-49cc-bc6c-21019f8cc7dc",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting nltk\n",
" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hCollecting odfpy\n",
" Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hRequirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
"Collecting regex>=2021.8.3\n",
" Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
"Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
"Building wheels for collected packages: odfpy\n",
" Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=3ee9aaac0134706d6ef72a359cd2466813f37bd8f080150b008d6d6e247d710c\n",
" Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
"Successfully built odfpy\n",
"Installing collected packages: regex, odfpy, nltk\n",
"Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
]
}
],
"source": [
|
|
"try:\n",
|
|
" _initialized\n",
|
|
"except:\n",
|
|
" !pip install nltk odfpy\n",
|
|
" import nltk\n",
|
|
" \n",
|
|
" nltk.download(\"wordnet\")\n",
|
|
" _initialized=True\n",
|
|
" \n",
|
|
"from nltk.stem.wordnet import WordNetLemmatizer\n",
|
|
"import pandas as pd\n",
|
|
"import gzip\n",
|
|
"import re\n",
|
|
"\n",
|
|
"WORDLIST_SIZE=8192 +3"
|
|
]
|
|
},
{
"cell_type": "markdown",
"id": "d5bf26fa-0aab-403a-9a6f-b2a37dc4892e",
"metadata": {},
"source": [
"## First, get the list of excluded words"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "926d0d84-0d7e-4939-b87f-1a170f870a8f",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
|
|
"annotated_words=pd.read_excel(\"annotated_words.ods\")"
|
|
]
|
|
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8b0d26e4-051c-4669-b566-bbd5ddbbe02b",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'children', 'd']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n",
|
|
"excluded_words[0:10]"
|
|
]
|
|
},
|
|
{
"cell_type": "markdown",
"id": "5f855ec9-eea5-4e15-bc07-96cdd414f36a",
"metadata": {
"tags": []
},
"source": [
"## Next, get the list of custom mappings"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2eea14b2-82bf-4353-8982-76a6c7f46d22",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[('be', 'bee'),\n",
|
|
" ('by', 'bye'),\n",
|
|
" ('died', 'dyed'),\n",
|
|
" ('corps', 'core'),\n",
|
|
" ('ore', 'oar'),\n",
|
|
" ('ore', ' or'),\n",
|
|
" ('vary', 'very'),\n",
|
|
" ('com', 'calm'),\n",
|
|
" ('filing', 'filling'),\n",
|
|
" ('fax', 'facts'),\n",
|
|
" ('favour', 'favor'),\n",
|
|
" ('theatre', 'theater'),\n",
|
|
" ('par', 'parse'),\n",
|
|
" ('honour', 'honor'),\n",
|
|
" ('harry', 'hairy'),\n",
|
|
" ('brings', 'bring'),\n",
|
|
" ('organisation', 'organization'),\n",
|
|
" ('simultaneously', 'simultaneous'),\n",
|
|
" ('aluminum', 'aluminium'),\n",
|
|
" ('knight', 'night'),\n",
|
|
" ('electronics', 'electronic'),\n",
|
|
" ('organisations', 'organizations'),\n",
|
|
" ('fortunately', 'fortunate'),\n",
|
|
" ('corp', 'core'),\n",
|
|
" ('chile', 'chilly'),\n",
|
|
" ('chile', ' chili'),\n",
|
|
" ('owe', 'oh'),\n",
|
|
" ('capitol', 'capital'),\n",
|
|
" ('weary', 'wary'),\n",
|
|
" ('berry', 'barry'),\n",
|
|
" ('lecturer', 'lecture'),\n",
|
|
" ('aluminium', 'aluminum'),\n",
|
|
" ('isle', 'aisle'),\n",
|
|
" ('boulder', 'bolder'),\n",
|
|
" ('ads', 'adds'),\n",
|
|
" ('honours', 'honors'),\n",
|
|
" ('bot', 'bought'),\n",
|
|
" ('dew', 'do'),\n",
|
|
" ('dew', ' due'),\n",
|
|
" ('theatres', 'theaters'),\n",
|
|
" ('thru', 'through'),\n",
|
|
" ('monies', 'moneys'),\n",
|
|
" ('cue', 'queue'),\n",
|
|
" ('hairy', 'harry'),\n",
|
|
" ('hem', 'him'),\n",
|
|
" ('nun', 'none'),\n",
|
|
" ('organisational', 'organizational'),\n",
|
|
" ('dessert', 'desert'),\n",
|
|
" ('aux', 'ox'),\n",
|
|
" ('rap', 'wrap'),\n",
|
|
" ('filings', 'filling'),\n",
|
|
" ('sew', 'so'),\n",
|
|
" ('pars', 'parse'),\n",
|
|
" ('fillings', 'filling')]"
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n",
|
|
"\n",
|
|
"custom_maps = [\n",
|
|
" (m[1][\"word\"].lower(), mapping.lower())\n",
|
|
" for m in custom_maps.iterrows()\n",
|
|
" for mapping in m[1][\"maps_to\"]\n",
|
|
"]\n",
|
|
"custom_maps"
|
|
]
|
|
},
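{
"cell_type": "markdown",
"id": "3f6d0c11-5a2e-4f7d-9b1a-0c2e4d6f8a10",
"metadata": {},
"source": [
"A quick sanity check, added here as an illustrative sketch (it assumes the `strip()` call in the cell above): no word or mapping target should carry leftover whitespace from the comma-separated spreadsheet cells."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a8b9c0d-1e2f-4a3b-8c4d-5e6f7a8b9c0d",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check (not part of the original notebook):\n",
"# every string in custom_maps should already be fully stripped\n",
"assert all(s == s.strip() for pair in custom_maps for s in pair)"
]
},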
{
"cell_type": "code",
"execution_count": 5,
"id": "dc52697b-2a30-4e6c-ab74-b77edce3607c",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# all_words: 21287\n",
|
|
"sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n",
|
|
"\n",
|
|
"# lemmatize_mappings: 21341\n",
|
|
"sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n",
|
|
"\n",
|
|
"# distinct_words: 17557\n",
|
|
"sample:\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"def get_lines(filename):\n",
|
|
" with gzip.open(filename, 'r') as f:\n",
|
|
" ret = []\n",
|
|
" for l in f:\n",
|
|
" if len(ret) > 30_000:\n",
|
|
" return ret\n",
|
|
" ret.append(str(l).lower())\n",
|
|
" return ret\n",
|
|
" \n",
|
|
"lemmatizer = WordNetLemmatizer()\n",
|
|
"word_re = re.compile(r\"^[A-Za-z]+$\")\n",
|
|
"\n",
|
|
"# Start parsing the wordlist\n",
|
|
"all_words = get_lines(\"frequency-all.txt.gz\")\n",
|
|
"\n",
|
|
"# Delete header line\n",
|
|
"all_words = all_words[1:]\n",
|
|
"\n",
|
|
"# Get only the word (fixed width)\n",
|
|
"all_words = [w[13:36].strip() for w in all_words]\n",
|
|
"\n",
|
|
"# Remove special characters\n",
|
|
"all_words = [w for w in all_words if word_re.search(w)]\n",
|
|
"\n",
|
|
"# Remove all removed words\n",
|
|
"all_words = [w for w in all_words if w not in excluded_words]\n",
|
|
"\n",
|
|
"# Lemmatize all words (plural -> singular)\n",
|
|
"lemmatize_mappings = [\n",
|
|
" (w, lemmatizer.lemmatize(w)) \n",
|
|
" for w in all_words\n",
|
|
" # if w != lemmatizer.lemmatize(w)\n",
|
|
"]\n",
|
|
"\n",
|
|
"# Remove all words that lemmatize to another word\n",
|
|
"#all_words = [w for w in all_words if w not in ]\n",
|
|
"\n",
|
|
"# Add custom lemmatizations\n",
|
|
"for l in custom_maps:\n",
|
|
" if l in lemmatize_mappings:\n",
|
|
" print(f\"Warning: {l} is already lemmatized\")\n",
|
|
" else:\n",
|
|
" lemmatize_mappings.append(l)\n",
|
|
"\n",
|
|
"distinct_words_lemmatized = set()\n",
|
|
"distinct_words = []\n",
|
|
"for w in lemmatize_mappings:\n",
|
|
" if w[1] not in distinct_words_lemmatized:\n",
|
|
" distinct_words_lemmatized.add(w[1])\n",
|
|
" distinct_words.append(w[0])\n",
|
|
"del distinct_words_lemmatized\n",
|
|
"\n",
|
|
"# Generate a wordlist of word[0] being the word, and w[1] being what that word maps to, or None if it is a distinct word\n",
|
|
"#wordlist = [(w[0], None if w[0] == w[1] else w[1]) if w[0] == w[1] else w for w in wl]\n",
|
|
"\n",
|
|
"# Get a list of words that map to other words\n",
|
|
"# A word was lemmatized if wordnet mapped it to another word (not None) that was different\n",
|
|
"#only_lemmatized_words = [w for w in wordlist if w[1] is not None and w[0] != w[1]]\n",
|
|
"\n",
|
|
"# Get a list of distinct lemmatized words\n",
|
|
"#distinct_lemmatized_words = [w[1] for w in wordlist if w[1] is not None]\n",
|
|
"#distinct_lemmatized_words = [w for w in pd.unique(distinct_lemmatized_words)]\n",
|
|
"\n",
|
|
"print(f\"# all_words: {len(all_words)}\")\n",
|
|
"print(f\"sample: {all_words[0:10]}\")\n",
|
|
"print()\n",
|
|
"print(f\"# lemmatize_mappings: {len(lemmatize_mappings)}\")\n",
|
|
"print(f\"sample: {lemmatize_mappings[0:10]}\")\n",
|
|
"print()\n",
|
|
"print(f\"# distinct_words: {len(distinct_words)}\")\n",
|
|
"print(f\"sample:\")\n",
|
|
"distinct_words[0:10]\n"
|
|
]
},
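{
"cell_type": "markdown",
"id": "1b2c3d4e-5f60-4a71-8b82-93a4b5c6d7e8",
"metadata": {},
"source": [
"For illustration (an added sketch, not part of the original run): the WordNet lemmatizer mostly collapses plural nouns onto their singular form, which is what the mapping above relies on. As the sample output shows, it can be aggressive, e.g. mapping `as` to `a`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e8d7c6b-5a49-4382-b716-0f1e2d3c4b5a",
"metadata": {},
"outputs": [],
"source": [
"# Show a few (word, lemma) pairs produced by the WordNet lemmatizer\n",
"[(w, lemmatizer.lemmatize(w)) for w in [\"cats\", \"children\", \"theatres\", \"as\"]]"
]
},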
{
"cell_type": "markdown",
"id": "6acea424-d538-4981-a4b9-0d9224f8efb3",
"metadata": {},
"source": [
"## Generate the final wordlist"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "52d0573e-aefd-4c4e-b682-47d1ff8c676b",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Final wordlist size: 11210\n"
]
}
],
"source": [
"# The final wordlist map. Maps a word to its numeric value\n",
|
|
"# Starting at 1\n",
|
|
"final_wordlist = {\n",
|
|
" w: idx + 1\n",
|
|
" for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])\n",
|
|
"}\n",
|
|
"\n",
|
|
"reverse_lemmatize_idx = {\n",
|
|
" lemmatizer.lemmatize(w): w\n",
|
|
" for w in final_wordlist.keys()\n",
|
|
"}\n",
|
|
"\n",
|
|
"# Add the lemmatized numbers\n",
|
|
"for w, lem_w in lemmatize_mappings:\n",
|
|
" if lem_w not in reverse_lemmatize_idx:\n",
|
|
" # This word is not in the reverse list\n",
|
|
" # This happens when the index of the lemmatized word we're working with is too large\n",
|
|
" continue\n",
|
|
" \n",
|
|
" final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]\n",
|
|
"\n",
|
|
"assert final_wordlist[\"its\"] == final_wordlist[\"its\"]\n",
|
|
"assert final_wordlist[\"its\"] >= 0\n",
|
|
"\n",
|
|
"print(f\"Final wordlist size: {len(final_wordlist.keys())}\")"
|
|
]
|
|
},
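{
"cell_type": "markdown",
"id": "4c5d6e7f-8091-4a2b-b3c4-d5e6f7a8b9c0",
"metadata": {},
"source": [
"A usage sketch, added for illustration: a plural and its singular should resolve to the same number. `.get()` is used because no particular word is guaranteed to have made the frequency cutoff."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a1b2c3d-4e5f-4607-8192-a3b4c5d6e7f8",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative lookups (not part of the original notebook);\n",
"# both should print the same number when both words are present\n",
"print(final_wordlist.get(\"word\"), final_wordlist.get(\"words\"))"
]
},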
{
"cell_type": "code",
"execution_count": 12,
"id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Write the wordlist as CSV, ordered by number (zero-based in the file)\n",
"with open(\"final_wordlist.csv\", \"w\") as f:\n",
"    f.write(\"word,number\\n\")\n",
"\n",
"    for word, number in sorted(final_wordlist.items(), key=lambda kv: kv[1]):\n",
"        f.write(f\"{word.upper()},{number - 1}\\n\")"
]
},
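{
"cell_type": "markdown",
"id": "5e6f7a8b-9c0d-4e1f-a2b3-c4d5e6f7a8b9",
"metadata": {},
"source": [
"A possible read-back check (an added sketch, not in the original notebook): reload the CSV and confirm the numbers are zero-based and stay below `WORDLIST_SIZE`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f7a8b9c-0d1e-4f20-b314-d5e6f7a8b9c0",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical verification of the file just written\n",
"check = pd.read_csv(\"final_wordlist.csv\")\n",
"assert check[\"number\"].min() == 0\n",
"assert check[\"number\"].max() < WORDLIST_SIZE\n",
"check.head()"
]
},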
{
"cell_type": "code",
"execution_count": null,
"id": "2a0d177b-3499-42fb-8091-29547567d69a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}