{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0b00342f-7b19-49cc-bc6c-21019f8cc7dc",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
      "Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
      "Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
      "Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
      "Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
      "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
      "[nltk_data] Package wordnet is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "# One-time setup: runs only when the _initialized sentinel is not yet defined\n",
    "try:\n",
    "    _initialized\n",
    "except NameError:\n",
    "    !pip install nltk odfpy\n",
    "    import nltk\n",
    "\n",
    "    nltk.download(\"wordnet\")\n",
    "    _initialized = True\n",
    "\n",
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "import pandas as pd\n",
    "import gzip\n",
    "import re\n",
    "\n",
    "WORDLIST_SIZE = 8192 + 3"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d5bf26fa-0aab-403a-9a6f-b2a37dc4892e",
   "metadata": {},
   "source": [
    "## First, get the list of excluded words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "926d0d84-0d7e-4939-b87f-1a170f870a8f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Load the hand-annotated wordlist (odfpy provides the .ods reader)\n",
    "annotated_words = pd.read_excel(\"annotated_words.ods\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8b0d26e4-051c-4669-b566-bbd5ddbbe02b",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'children', 'd']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Exclude every word not explicitly marked \"Yes\"\n",
    "excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n",
    "excluded_words[0:10]"
   ]
  },
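  {
   "cell_type": "markdown",
   "id": "f3d1a2b4-9c8e-4e1a-b5d6-0a1b2c3d4e01",
   "metadata": {},
   "source": [
    "A quick aside on the filter above: rows with no `keep` annotation are excluded as well, since `NaN != \"Yes\"` evaluates to `True`. A minimal sketch on a made-up frame (illustration only, not part of the pipeline):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3d1a2b4-9c8e-4e1a-b5d6-0a1b2c3d4e02",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustration only: a made-up stand-in for annotated_words\n",
    "toy = pd.DataFrame({\"word\": [\"The\", \"B\", \"York\"], \"keep\": [\"Yes\", None, \"No\"]})\n",
    "\n",
    "# Anything not explicitly marked \"Yes\" is excluded, including the NaN row\n",
    "list(toy[toy[\"keep\"] != \"Yes\"][\"word\"].str.lower())  # ['b', 'york']"
   ]
  },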
  {
   "cell_type": "markdown",
   "id": "5f855ec9-eea5-4e15-bc07-96cdd414f36a",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Next, get the list of custom mappings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2eea14b2-82bf-4353-8982-76a6c7f46d22",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('be', 'bee'),\n",
       " ('by', 'bye'),\n",
       " ('died', 'dyed'),\n",
       " ('cents', 'sense'),\n",
       " ('yellow', 'hello'),\n",
       " ('corps', 'core'),\n",
       " ('ore', 'oar'),\n",
       " ('ore', ' or'),\n",
       " ('vary', 'very'),\n",
       " ('com', 'calm'),\n",
       " ('filing', 'filling'),\n",
       " ('fax', 'facts'),\n",
       " ('favour', 'favor'),\n",
       " ('theatre', 'theater'),\n",
       " ('par', 'parse'),\n",
       " ('honour', 'honor'),\n",
       " ('harry', 'hairy'),\n",
       " ('brings', 'bring'),\n",
       " ('organisation', 'organization'),\n",
       " ('simultaneously', 'simultaneous'),\n",
       " ('aluminum', 'aluminium'),\n",
       " ('knight', 'night'),\n",
       " ('electronics', 'electronic'),\n",
       " ('organisations', 'organizations'),\n",
       " ('fortunately', 'fortunate'),\n",
       " ('corp', 'core'),\n",
       " ('chile', 'chilly'),\n",
       " ('chile', ' chili'),\n",
       " ('owe', 'oh'),\n",
       " ('capitol', 'capital'),\n",
       " ('weary', 'wary'),\n",
       " ('berry', 'barry'),\n",
       " ('lecturer', 'lecture'),\n",
       " ('aluminium', 'aluminum'),\n",
       " ('isle', 'aisle'),\n",
       " ('boulder', 'bolder'),\n",
       " ('blew', 'blue'),\n",
       " ('reformed', 'reform'),\n",
       " ('scent', 'sense'),\n",
       " ('ads', 'adds'),\n",
       " ('honours', 'honors'),\n",
       " ('bot', 'bought'),\n",
       " ('dew', 'do'),\n",
       " ('dew', ' due'),\n",
       " ('theatres', 'theaters'),\n",
       " ('thru', 'through'),\n",
       " ('monies', 'moneys'),\n",
       " ('cue', 'queue'),\n",
       " ('hairy', 'harry'),\n",
       " ('hem', 'him'),\n",
       " ('nun', 'none'),\n",
       " ('organisational', 'organizational'),\n",
       " ('dessert', 'desert'),\n",
       " ('aux', 'ox'),\n",
       " ('rap', 'wrap'),\n",
       " ('filings', 'filling'),\n",
       " ('sew', 'so'),\n",
       " ('pars', 'parse'),\n",
       " ('fillings', 'filling')]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows with a maps_to annotation define custom word -> word mappings;\n",
    "# maps_to may hold several comma-separated targets\n",
    "custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\", \"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n",
    "\n",
    "# Flatten into (word, target) pairs, one per target\n",
    "custom_maps = [\n",
    "    (m[1][\"word\"].lower(), mapping.lower())\n",
    "    for m in custom_maps.iterrows()\n",
    "    for mapping in m[1][\"maps_to\"]\n",
    "]\n",
    "custom_maps"
   ]
  },
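  {
   "cell_type": "markdown",
   "id": "f3d1a2b4-9c8e-4e1a-b5d6-0a1b2c3d4e03",
   "metadata": {},
   "source": [
    "Note that `split(\",\")` does not strip whitespace, which is where entries like `('ore', ' or')` above come from. A minimal sketch of the fan-out on a made-up two-row frame (illustration only):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3d1a2b4-9c8e-4e1a-b5d6-0a1b2c3d4e04",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustration only: one comma-separated maps_to entry fans out into several pairs\n",
    "toy = pd.DataFrame({\"word\": [\"Ore\", \"Vary\"], \"maps_to\": [\"oar, or\", \"very\"]})\n",
    "\n",
    "[\n",
    "    (row[\"word\"].lower(), target.lower())\n",
    "    for _, row in toy[toy[\"maps_to\"].notna()].iterrows()\n",
    "    for target in row[\"maps_to\"].split(\",\")\n",
    "]  # [('ore', 'oar'), ('ore', ' or'), ('vary', 'very')]"
   ]
  },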
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "dc52697b-2a30-4e6c-ab74-b77edce3607c",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# all_words: 21285\n",
      "sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n",
      "\n",
      "# lemmatize_mappings: 21344\n",
      "sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n",
      "\n",
      "# distinct_words: 17555\n",
      "sample:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def get_lines(filename):\n",
    "    # gzip.open('r') yields bytes, so str(l) produces strings like \"b'...'\";\n",
    "    # the fixed-width slice below accounts for that prefix\n",
    "    with gzip.open(filename, 'r') as f:\n",
    "        ret = []\n",
    "        for l in f:\n",
    "            if len(ret) > 30_000:\n",
    "                return ret\n",
    "            ret.append(str(l).lower())\n",
    "        return ret\n",
    "\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "word_re = re.compile(r\"^[A-Za-z]+$\")\n",
    "\n",
    "# Start parsing the wordlist\n",
    "all_words = get_lines(\"frequency-all.txt.gz\")\n",
    "\n",
    "# Delete the header line\n",
    "all_words = all_words[1:]\n",
    "\n",
    "# Get only the word column (fixed width)\n",
    "all_words = [w[13:36].strip() for w in all_words]\n",
    "\n",
    "# Keep only purely alphabetic words\n",
    "all_words = [w for w in all_words if word_re.search(w)]\n",
    "\n",
    "# Drop the manually excluded words\n",
    "all_words = [w for w in all_words if w not in excluded_words]\n",
    "\n",
    "# Lemmatize all words (plural -> singular)\n",
    "lemmatize_mappings = [(w, lemmatizer.lemmatize(w)) for w in all_words]\n",
    "\n",
    "# Add the custom lemmatizations, skipping any that are already present\n",
    "for l in custom_maps:\n",
    "    if l in lemmatize_mappings:\n",
    "        print(f\"Warning: {l} is already lemmatized\")\n",
    "    else:\n",
    "        lemmatize_mappings.append(l)\n",
    "\n",
    "# Keep the first word seen for each distinct lemmatized form\n",
    "distinct_words_lemmatized = set()\n",
    "distinct_words = []\n",
    "for w in lemmatize_mappings:\n",
    "    if w[1] not in distinct_words_lemmatized:\n",
    "        distinct_words_lemmatized.add(w[1])\n",
    "        distinct_words.append(w[0])\n",
    "del distinct_words_lemmatized\n",
    "\n",
    "print(f\"# all_words: {len(all_words)}\")\n",
    "print(f\"sample: {all_words[0:10]}\")\n",
    "print()\n",
    "print(f\"# lemmatize_mappings: {len(lemmatize_mappings)}\")\n",
    "print(f\"sample: {lemmatize_mappings[0:10]}\")\n",
    "print()\n",
    "print(f\"# distinct_words: {len(distinct_words)}\")\n",
    "print(\"sample:\")\n",
    "distinct_words[0:10]\n"
   ]
  },
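  {
   "cell_type": "markdown",
   "id": "f3d1a2b4-9c8e-4e1a-b5d6-0a1b2c3d4e05",
   "metadata": {},
   "source": [
    "For reference, `WordNetLemmatizer.lemmatize` defaults to treating its input as a noun, so the pass above mostly collapses plurals into singulars; anything WordNet cannot reduce passes through unchanged. A small sketch (expected results shown as a comment, not captured output):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3d1a2b4-9c8e-4e1a-b5d6-0a1b2c3d4e06",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustration only: the default (noun) lemmatization used by this pipeline\n",
    "[(w, lemmatizer.lemmatize(w)) for w in [\"cats\", \"geese\", \"words\", \"running\"]]\n",
    "# roughly: [('cats', 'cat'), ('geese', 'goose'), ('words', 'word'), ('running', 'running')]"
   ]
  },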
  {
   "cell_type": "markdown",
   "id": "6acea424-d538-4981-a4b9-0d9224f8efb3",
   "metadata": {},
   "source": [
    "## Generate the final wordlist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "52d0573e-aefd-4c4e-b682-47d1ff8c676b",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Final wordlist size: 11210\n"
     ]
    }
   ],
   "source": [
    "# The final wordlist map. Maps a word to its numeric value, starting at 1\n",
    "final_wordlist = {\n",
    "    w: idx + 1\n",
    "    for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])\n",
    "}\n",
    "\n",
    "# Map each lemmatized form back to the wordlist entry that represents it\n",
    "reverse_lemmatize_idx = {\n",
    "    lemmatizer.lemmatize(w): w\n",
    "    for w in final_wordlist.keys()\n",
    "}\n",
    "\n",
    "# Give every variant the same number as the word it lemmatizes to\n",
    "for w, lem_w in lemmatize_mappings:\n",
    "    if lem_w not in reverse_lemmatize_idx:\n",
    "        # The lemmatized form did not make the WORDLIST_SIZE cutoff above\n",
    "        continue\n",
    "\n",
    "    final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]\n",
    "\n",
    "# \"its\" should share the number of its lemmatized form \"it\"\n",
    "assert final_wordlist[\"it\"] == final_wordlist[\"its\"]\n",
    "assert final_wordlist[\"its\"] >= 1\n",
    "\n",
    "print(f\"Final wordlist size: {len(final_wordlist.keys())}\")"
   ]
  },
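  {
   "cell_type": "markdown",
   "id": "f3d1a2b4-9c8e-4e1a-b5d6-0a1b2c3d4e07",
   "metadata": {},
   "source": [
    "A property worth spot-checking: a variant and the word it lemmatizes to should resolve to the same number. Using `.get()` keeps the check hedged, since membership depends on the `WORDLIST_SIZE` cutoff:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3d1a2b4-9c8e-4e1a-b5d6-0a1b2c3d4e08",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustration only: both lookups should return the same number (or None if cut off)\n",
    "final_wordlist.get(\"it\"), final_wordlist.get(\"its\")"
   ]
  },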
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "sorted_final_wordlist = sorted(final_wordlist.items(), key=lambda kv: kv[1])\n",
    "\n",
    "with open(\"final_wordlist.csv\", \"w\") as f:\n",
    "    f.write(\"word,number\\n\")\n",
    "\n",
    "    for word, number in sorted_final_wordlist:\n",
    "        # Numbers are 1-based internally; the CSV is written 0-based\n",
    "        f.write(f\"{word.upper()},{number - 1}\\n\")"
   ]
  },
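  {
   "cell_type": "markdown",
   "id": "f3d1a2b4-9c8e-4e1a-b5d6-0a1b2c3d4e09",
   "metadata": {},
   "source": [
    "A sanity-check sketch (illustration only): read the CSV back, confirm the numbers are 0-based, and confirm that several words can share one number where variants were folded together."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3d1a2b4-9c8e-4e1a-b5d6-0a1b2c3d4e10",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustration only: verify the CSV written above\n",
    "check = pd.read_csv(\"final_wordlist.csv\")\n",
    "print(f\"rows: {len(check)}, distinct numbers: {check['number'].nunique()}\")\n",
    "print(f\"min number: {check['number'].min()}\")  # expect 0 (numbers are 0-based)"
   ]
  },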
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a0d177b-3499-42fb-8091-29547567d69a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}