{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "0b00342f-7b19-49cc-bc6c-21019f8cc7dc", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n", "Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n", "Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n", "Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n", "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n", "Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n", "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] } ], "source": [ "try:\n", " _initialized\n", "except:\n", " !pip install nltk odfpy\n", " import nltk\n", " \n", " nltk.download(\"wordnet\")\n", " _initialized=True\n", " \n", "from nltk.stem.wordnet import WordNetLemmatizer\n", "import pandas as pd\n", "import gzip\n", "import re\n", "\n", "WORDLIST_SIZE=8192 +3" ] }, { "cell_type": "markdown", "id": "d5bf26fa-0aab-403a-9a6f-b2a37dc4892e", "metadata": {}, "source": [ "## First, get the list of excluded words" ] }, { "cell_type": "code", "execution_count": 2, "id": "926d0d84-0d7e-4939-b87f-1a170f870a8f", "metadata": { "tags": [] }, "outputs": [], "source": [ "annotated_words=pd.read_excel(\"annotated_words.ods\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "8b0d26e4-051c-4669-b566-bbd5ddbbe02b", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'd', 'john']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n", "excluded_words[0:10]" ] }, { "cell_type": "markdown", "id": "5f855ec9-eea5-4e15-bc07-96cdd414f36a", "metadata": { "tags": [] }, "source": [ "## Next, get the list of custom mappings" ] }, { "cell_type": "code", "execution_count": 4, "id": "2eea14b2-82bf-4353-8982-76a6c7f46d22", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "[('be', 'bee'),\n", " ('by', 'bye'),\n", " ('corps', 'core'),\n", " ('ore', 'oar'),\n", " ('ore', ' or'),\n", " ('vary', 'very'),\n", " ('com', 'calm'),\n", " ('filing', 'filling'),\n", " ('fax', 'facts'),\n", " ('theatre', 'theater'),\n", " ('par', 'parse'),\n", " ('honour', 'honor'),\n", " ('harry', 'hairy'),\n", " ('brings', 'bring'),\n", " ('organisation', 'organization'),\n", " ('simultaneously', 'simultaneous'),\n", " ('aluminum', 'aluminium'),\n", " ('knight', 'night'),\n", " ('electronics', 'electronic'),\n", " ('organisations', 'organizations'),\n", " ('fortunately', 'fortunate'),\n", " ('corp', 'core'),\n", " ('chile', 'chilly'),\n", " ('chile', ' chili'),\n", " ('owe', 'oh'),\n", " ('capitol', 'capital'),\n", " ('weary', 'wary'),\n", " ('berry', 'barry'),\n", " ('lecturer', 'lecture'),\n", " ('aluminium', 'aluminum'),\n", " ('isle', 'aisle'),\n", " ('boulder', 'bolder'),\n", " ('ads', 'adds'),\n", " ('honours', 'honors'),\n", " ('bot', 'bought'),\n", " ('dew', 'do'),\n", " ('dew', ' due'),\n", " ('theatres', 'theaters'),\n", " ('thru', 'through'),\n", " ('monies', 'moneys'),\n", " ('cue', 'queue'),\n", " ('hairy', 'harry'),\n", " ('hem', 'him'),\n", " ('nun', 'none'),\n", " ('organisational', 'organizational'),\n", " ('aux', 'ox'),\n", " ('rap', 'wrap'),\n", " ('filings', 'filling'),\n", " ('sew', 'so'),\n", " ('pars', 'parse'),\n", " ('fillings', 'filling')]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n", "\n", "custom_maps = [\n", " (m[1][\"word\"].lower(), mapping.lower())\n", " for m in custom_maps.iterrows()\n", " for mapping in m[1][\"maps_to\"]\n", "]\n", "custom_maps" ] }, { "cell_type": "code", "execution_count": 5, "id": "dc52697b-2a30-4e6c-ab74-b77edce3607c", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "# all_words: 21323\n", "sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n", "\n", "# lemmatize_mappings: 21374\n", "sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n", "\n", "# distinct_words: 17585\n", "sample:\n" ] }, { "data": { "text/plain": [ "['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_lines(filename):\n", " with gzip.open(filename, 'r') as f:\n", " ret = []\n", " for l in f:\n", " if len(ret) > 30_000:\n", " return ret\n", " ret.append(str(l).lower())\n", " return ret\n", " \n", "lemmatizer = WordNetLemmatizer()\n", "word_re = re.compile(r\"^[A-Za-z]+$\")\n", "\n", "# Start parsing the wordlist\n", "all_words = get_lines(\"frequency-all.txt.gz\")\n", "\n", "# Delete header line\n", "all_words = all_words[1:]\n", "\n", "# Get only the word (fixed width)\n", "all_words = [w[13:36].strip() for w in all_words]\n", "\n", "# Remove special characters\n", "all_words = [w for w in all_words if word_re.search(w)]\n", "\n", "# Remove all removed words\n", "all_words = [w for w in all_words if w not in excluded_words]\n", "\n", "# Lemmatize all words (plural -> singular)\n", "lemmatize_mappings = [\n", " (w, lemmatizer.lemmatize(w)) \n", " for w in all_words\n", " # if w != lemmatizer.lemmatize(w)\n", "]\n", "\n", "# Remove all words that lemmatize to another word\n", "#all_words = [w for w in all_words if w not in ]\n", "\n", "# Add custom lemmatizations\n", "for l in custom_maps:\n", " if l in lemmatize_mappings:\n", " print(f\"Warning: {l} is already lemmatized\")\n", " else:\n", " lemmatize_mappings.append(l)\n", "\n", "distinct_words_lemmatized = set()\n", "distinct_words = []\n", "for w in lemmatize_mappings:\n", " if w[1] not in distinct_words_lemmatized:\n", " distinct_words_lemmatized.add(w[1])\n", " distinct_words.append(w[0])\n", "del distinct_words_lemmatized\n", "\n", "# Generate a wordlist of word[0] being the word, and w[1] being what that word maps to, or None if it is a distinct word\n", "#wordlist = [(w[0], None if w[0] == w[1] else w[1]) if w[0] == w[1] else w for w in wl]\n", "\n", "# Get a list of words that map to other words\n", "# A word was lemmatized if wordnet mapped it to another word (not None) that was different\n", "#only_lemmatized_words = [w for w in wordlist if w[1] is not None and w[0] != w[1]]\n", "\n", "# Get a list of distinct lemmatized words\n", "#distinct_lemmatized_words = [w[1] for w in wordlist if w[1] is not None]\n", "#distinct_lemmatized_words = [w for w in pd.unique(distinct_lemmatized_words)]\n", "\n", "print(f\"# all_words: {len(all_words)}\")\n", "print(f\"sample: {all_words[0:10]}\")\n", "print()\n", "print(f\"# lemmatize_mappings: {len(lemmatize_mappings)}\")\n", "print(f\"sample: {lemmatize_mappings[0:10]}\")\n", "print()\n", "print(f\"# distinct_words: {len(distinct_words)}\")\n", "print(f\"sample:\")\n", "distinct_words[0:10]\n" ] }, { "cell_type": "markdown", "id": "6acea424-d538-4981-a4b9-0d9224f8efb3", "metadata": {}, "source": [ "## Generate the final wordlist" ] }, { "cell_type": "code", "execution_count": 6, "id": "52d0573e-aefd-4c4e-b682-47d1ff8c676b", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Final wordlist size: 11212\n" ] } ], "source": [ "# The final wordlist map. Maps a word to its numeric value\n", "# Starting at 1\n", "final_wordlist = {\n", " w: idx + 1\n", " for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])\n", "}\n", "\n", "reverse_lemmatize_idx = {\n", " lemmatizer.lemmatize(w): w\n", " for w in final_wordlist.keys()\n", "}\n", "\n", "# Add the lemmatized numbers\n", "for w, lem_w in lemmatize_mappings:\n", " if lem_w not in reverse_lemmatize_idx:\n", " # This word is not in the reverse list\n", " # This happens when the index of the lemmatized word we're working with is too large\n", " continue\n", " \n", " final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]\n", "\n", "assert final_wordlist[\"its\"] == final_wordlist[\"its\"]\n", "assert final_wordlist[\"its\"] >= 0\n", "\n", "print(f\"Final wordlist size: {len(final_wordlist.keys())}\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "d1a06597-4ad5-4566-a716-8bbad416b7ab", "metadata": { "tags": [] }, "outputs": [], "source": [ "with open(\"final_wordlist.csv\", \"w\") as f:\n", " sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]\n", " \n", " for w in sorted(sorted_final_wordlist, key=lambda w: w[1]):\n", " lemmatized = \"\" if not w[1] else w[1]\n", " f.write(f\"{w[0]},{lemmatized}\")\n", " f.write(\"\\n\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 }