386 lines
11 KiB
Plaintext
Raw Normal View History

2023-02-09 00:08:47 -05:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "0b00342f-7b19-49cc-bc6c-21019f8cc7dc",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
"Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
"Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
"Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
"Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
}
],
"source": [
"# One-time environment setup; skipped when this cell is re-run in a live kernel.\n",
"try:\n",
"    _initialized\n",
"except NameError:\n",
"    # Only a missing `_initialized` (fresh kernel) should trigger the setup.\n",
"    # A bare `except:` would also swallow KeyboardInterrupt and real errors.\n",
"    # `%pip` installs into the environment of the running kernel.\n",
"    %pip install nltk odfpy\n",
"    import nltk\n",
"\n",
"    nltk.download(\"wordnet\")\n",
"    _initialized = True\n",
"\n",
"from nltk.stem.wordnet import WordNetLemmatizer\n",
"import pandas as pd\n",
"import gzip\n",
"import re\n",
"\n",
"# Upper bound on how many distinct words receive their own number.\n",
"# NOTE(review): the reason for the extra 3 on top of 8192 is not recorded here.\n",
"WORDLIST_SIZE = 8192 + 3"
]
},
{
"cell_type": "markdown",
"id": "d5bf26fa-0aab-403a-9a6f-b2a37dc4892e",
"metadata": {},
"source": [
"## First, get the list of excluded words"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "926d0d84-0d7e-4939-b87f-1a170f870a8f",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Load the hand-annotated spreadsheet; later cells read its\n",
"# \"word\", \"keep\" and \"maps_to\" columns.\n",
"annotated_words = pd.read_excel(\"annotated_words.ods\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8b0d26e4-051c-4669-b566-bbd5ddbbe02b",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'd', 'john']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Every row whose \"keep\" value is anything other than \"Yes\" is excluded.\n",
"excluded_words = (\n",
"    annotated_words\n",
"    .loc[annotated_words[\"keep\"] != \"Yes\", \"word\"]\n",
"    .str.lower()\n",
"    .tolist()\n",
")\n",
"excluded_words[:10]"
]
},
{
"cell_type": "markdown",
"id": "5f855ec9-eea5-4e15-bc07-96cdd414f36a",
"metadata": {
"tags": []
},
"source": [
"## Next, get the list of custom mappings"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2eea14b2-82bf-4353-8982-76a6c7f46d22",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[('be', 'bee'),\n",
" ('by', 'bye'),\n",
" ('corps', 'core'),\n",
" ('ore', 'oar'),\n",
" ('ore', ' or'),\n",
" ('vary', 'very'),\n",
" ('com', 'calm'),\n",
" ('filing', 'filling'),\n",
" ('fax', 'facts'),\n",
" ('theatre', 'theater'),\n",
" ('par', 'parse'),\n",
" ('honour', 'honor'),\n",
" ('harry', 'hairy'),\n",
" ('brings', 'bring'),\n",
" ('organisation', 'organization'),\n",
" ('simultaneously', 'simultaneous'),\n",
" ('aluminum', 'aluminium'),\n",
" ('knight', 'night'),\n",
" ('electronics', 'electronic'),\n",
" ('organisations', 'organizations'),\n",
" ('fortunately', 'fortunate'),\n",
" ('corp', 'core'),\n",
" ('chile', 'chilly'),\n",
" ('chile', ' chili'),\n",
" ('owe', 'oh'),\n",
" ('capitol', 'capital'),\n",
" ('weary', 'wary'),\n",
" ('berry', 'barry'),\n",
" ('lecturer', 'lecture'),\n",
" ('aluminium', 'aluminum'),\n",
" ('isle', 'aisle'),\n",
" ('boulder', 'bolder'),\n",
" ('ads', 'adds'),\n",
" ('honours', 'honors'),\n",
" ('bot', 'bought'),\n",
" ('dew', 'do'),\n",
" ('dew', ' due'),\n",
" ('theatres', 'theaters'),\n",
" ('thru', 'through'),\n",
" ('monies', 'moneys'),\n",
" ('cue', 'queue'),\n",
" ('hairy', 'harry'),\n",
" ('hem', 'him'),\n",
" ('nun', 'none'),\n",
" ('organisational', 'organizational'),\n",
" ('aux', 'ox'),\n",
" ('rap', 'wrap'),\n",
" ('filings', 'filling'),\n",
" ('sew', 'so'),\n",
" ('pars', 'parse'),\n",
" ('fillings', 'filling')]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows that carry a \"maps_to\" annotation; the value holds one or more\n",
"# comma-separated targets.\n",
"mapped_rows = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\", \"maps_to\"]]\n",
"\n",
"# Flatten to lower-cased (word, target) pairs. Each target is stripped:\n",
"# splitting on \",\" alone leaves the space that follows the comma, producing\n",
"# targets such as ' or' that can never match a real word. Empty targets\n",
"# (e.g. from a trailing comma) are skipped.\n",
"custom_maps = [\n",
"    (word.lower(), target.strip().lower())\n",
"    for word, targets in zip(mapped_rows[\"word\"], mapped_rows[\"maps_to\"])\n",
"    for target in targets.split(\",\")\n",
"    if target.strip()\n",
"]\n",
"custom_maps"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "dc52697b-2a30-4e6c-ab74-b77edce3607c",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# all_words: 21323\n",
"sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n",
"\n",
"# lemmatize_mappings: 21374\n",
"sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n",
"\n",
"# distinct_words: 17585\n",
"sample:\n"
]
},
{
"data": {
"text/plain": [
"['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def get_lines(filename):\n",
"    \"\"\"Return up to the first 30,001 lines of a gzip file, lower-cased.\n",
"\n",
"    The file is opened in binary mode, so every line is a bytes object and\n",
"    str() yields its repr (b'...' with the newline escaped) rather than\n",
"    decoded text. Callers that slice by column see that two-character prefix.\n",
"    \"\"\"\n",
"    line_cap = 30_000\n",
"    collected = []\n",
"    with gzip.open(filename, 'r') as handle:\n",
"        for raw_line in handle:\n",
"            # Stop once more than `line_cap` lines have been gathered.\n",
"            if len(collected) > line_cap:\n",
"                break\n",
"            collected.append(str(raw_line).lower())\n",
"    return collected\n",
" \n",
"lemmatizer = WordNetLemmatizer()\n",
"word_re = re.compile(r\"^[A-Za-z]+$\")\n",
"\n",
"# Read the frequency table and drop its header line.\n",
"frequency_lines = get_lines(\"frequency-all.txt.gz\")[1:]\n",
"\n",
"# Set lookup: the exclusion test runs once per candidate word, so a list scan\n",
"# here would be quadratic.\n",
"excluded_lookup = set(excluded_words)\n",
"\n",
"# Take only the word column (fixed width), then keep words that are purely\n",
"# alphabetic and were not excluded by annotation.\n",
"all_words = [\n",
"    word\n",
"    for word in (line[13:36].strip() for line in frequency_lines)\n",
"    if word_re.search(word) and word not in excluded_lookup\n",
"]\n",
"\n",
"# Lemmatize all words (plural -> singular). Identity pairs are kept because\n",
"# the de-duplication below needs an entry for every word.\n",
"lemmatize_mappings = [(word, lemmatizer.lemmatize(word)) for word in all_words]\n",
"\n",
"# Add custom lemmatizations, warning about pairs that already exist.\n",
"known_mappings = set(lemmatize_mappings)\n",
"for custom_map in custom_maps:\n",
"    if custom_map in known_mappings:\n",
"        print(f\"Warning: {custom_map} is already lemmatized\")\n",
"    else:\n",
"        lemmatize_mappings.append(custom_map)\n",
"        known_mappings.add(custom_map)\n",
"\n",
"# Keep the first word seen for each mapping target, preserving input order.\n",
"seen_targets = set()\n",
"distinct_words = []\n",
"for word, target in lemmatize_mappings:\n",
"    if target not in seen_targets:\n",
"        seen_targets.add(target)\n",
"        distinct_words.append(word)\n",
"del seen_targets\n",
"\n",
"print(f\"# all_words: {len(all_words)}\")\n",
"print(f\"sample: {all_words[0:10]}\")\n",
"print()\n",
"print(f\"# lemmatize_mappings: {len(lemmatize_mappings)}\")\n",
"print(f\"sample: {lemmatize_mappings[0:10]}\")\n",
"print()\n",
"print(f\"# distinct_words: {len(distinct_words)}\")\n",
"print(f\"sample:\")\n",
"distinct_words[0:10]\n"
]
},
{
"cell_type": "markdown",
"id": "6acea424-d538-4981-a4b9-0d9224f8efb3",
"metadata": {},
"source": [
"## Generate the final wordlist"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "52d0573e-aefd-4c4e-b682-47d1ff8c676b",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Final wordlist size: 11212\n"
]
}
],
"source": [
"# The final wordlist map. Maps a word to its numeric value, starting at 1.\n",
"# Only the first WORDLIST_SIZE distinct words get a number of their own.\n",
"final_wordlist = {\n",
"    word: number\n",
"    for number, word in enumerate(distinct_words[0:WORDLIST_SIZE], start=1)\n",
"}\n",
"\n",
"# Lemma -> numbered word, so that other forms can borrow that word's number.\n",
"reverse_lemmatize_idx = {\n",
"    lemmatizer.lemmatize(word): word\n",
"    for word in final_wordlist\n",
"}\n",
"\n",
"# Give every mapped word the number of the word its target resolves to.\n",
"for word, target in lemmatize_mappings:\n",
"    numbered_word = reverse_lemmatize_idx.get(target)\n",
"    if numbered_word is None:\n",
"        # The target is not in the reverse index. This happens when its\n",
"        # position falls beyond the first WORDLIST_SIZE distinct words.\n",
"        continue\n",
"\n",
"    final_wordlist[word] = final_wordlist[numbered_word]\n",
"\n",
"# Sanity check: WordNet is expected to lemmatize \"its\" to \"it\", so both must\n",
"# share one number. Comparing \"its\" against itself could never fail.\n",
"assert final_wordlist[\"its\"] == final_wordlist[\"it\"]\n",
"# Numbering starts at 1, so 0 is never a valid value.\n",
"assert final_wordlist[\"its\"] >= 1\n",
"\n",
"print(f\"Final wordlist size: {len(final_wordlist)}\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Write one \"word,number\" row per entry, ordered by number.\n",
"with open(\"final_wordlist.csv\", \"w\") as out_file:\n",
"    rows_by_number = sorted(final_wordlist.items(), key=lambda item: item[1])\n",
"\n",
"    for word, number in rows_by_number:\n",
"        # A falsy number is written as an empty field.\n",
"        number_field = number if number else \"\"\n",
"        out_file.write(f\"{word},{number_field}\\n\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}