{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a0c0f0e1-0000-4aaa-9999-123456789abc",
   "metadata": {},
   "source": [
    "# Wordlist builder\n",
    "\n",
    "Builds a deduplicated, lemmatized wordlist of `WORDLIST_SIZE` entries from a\n",
    "frequency-sorted word dump (`00-frequency-all.txt.gz`), applying the manual\n",
    "keep/exclude/mapping annotations recorded in `annotated_words.ods`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "991a711f-be98-4aae-a657-84b065449916",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# One-time initialisation: loading the transformer pipeline is expensive, so\n",
    "# guard it so that re-running this cell is cheap.\n",
    "try:\n",
    "    _initialized\n",
    "except NameError:\n",
    "    # %pip install spacy\n",
    "    # !python -m spacy download en_core_web_trf\n",
    "    import spacy\n",
    "\n",
    "    # Only the lemmatizer output is needed; disabling parser/NER speeds this up.\n",
    "    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
    "\n",
    "    _initialized = True\n",
    "\n",
    "import gzip\n",
    "import re\n",
    "from collections import defaultdict\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d130bb84",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_lines(filename):\n",
    "    \"\"\"Return up to 30,001 lower-cased lines from a gzipped text file.\n",
    "\n",
    "    The file is opened in text mode ('rt') so each line is a real string;\n",
    "    the previous `str(bytes_line)` produced byte reprs like \"b'word\\\\n'\".\n",
    "    \"\"\"\n",
    "    ret = []\n",
    "    with gzip.open(filename, 'rt') as f:\n",
    "        for line in f:\n",
    "            if len(ret) > 30_000:\n",
    "                break\n",
    "            ret.append(line.lower())\n",
    "    return ret\n",
    "\n",
    "\n",
    "# 2**13 words (13 bits of entropy per word) plus 3 reserved slots.\n",
    "WORDLIST_SIZE = 8192 + 3\n",
    "\n",
    "# Matches purely alphabetic words only.\n",
    "word_re = re.compile(r\"^[A-Za-z]+$\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de2d1731",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: confirm the working directory contains the input files.\n",
    "!pwd\n",
    "!ls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90665714",
   "metadata": {},
   "outputs": [],
   "source": [
    "annotated_words = pd.read_excel(\"annotated_words.ods\")\n",
    "\n",
    "# Words explicitly marked as not kept.  A set makes the membership tests in\n",
    "# later cells O(1) instead of O(n) per word.\n",
    "excluded_words = set(\n",
    "    annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower()\n",
    ")\n",
    "\n",
    "# Manual (word -> replacement) pairs; \"maps_to\" may hold a comma-separated\n",
    "# list of replacements.\n",
    "custom_maps = [\n",
    "    (row[\"word\"].lower(), mapping.strip().lower())\n",
    "    for _, row in annotated_words[annotated_words[\"maps_to\"].notna()].iterrows()\n",
    "    for mapping in row[\"maps_to\"].split(\",\")\n",
    "]\n",
    "custom_maps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb50c69e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the frequency-sorted wordlist.\n",
    "all_words = get_lines(\"00-frequency-all.txt.gz\")\n",
    "\n",
    "# Drop the header line.\n",
    "all_words = all_words[1:]\n",
    "\n",
    "# The word lives in a fixed-width column.  NOTE: the original sliced [13:36]\n",
    "# because it operated on bytes reprs (\"b'...\"), which prepend two characters;\n",
    "# on the decoded text the same column is [11:34].\n",
    "all_words = [w[11:34].strip() for w in all_words]\n",
    "\n",
    "# Keep purely alphabetic words only.\n",
    "all_words = [w for w in all_words if word_re.search(w)]\n",
    "\n",
    "# Drop every excluded word.\n",
    "all_words = [w for w in all_words if w not in excluded_words]\n",
    "\n",
    "# Ensure both sides of every custom mapping are present.  (The original\n",
    "# iterated over sum(custom_maps, ()) -- a flat tuple of *strings* -- so\n",
    "# m[0]/m[1] were single characters rather than words.)\n",
    "present = set(all_words)\n",
    "for source, target in custom_maps:\n",
    "    for w in (source, target):\n",
    "        if w not in present:\n",
    "            all_words.append(w)\n",
    "            present.add(w)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd21bff5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lemmatize every word (plural -> singular etc.).  nlp.pipe() batches the\n",
    "# documents, which is much faster than one nlp() call per word; the original\n",
    "# also only lemmatized all_words[:100], contradicting its stated intent.\n",
    "lemmatize_mappings = [\n",
    "    (w, doc[0].lemma_) for w, doc in zip(all_words, nlp.pipe(all_words))\n",
    "]\n",
    "\n",
    "# Add the custom lemmatizations.\n",
    "for pair in custom_maps:\n",
    "    if pair in lemmatize_mappings:\n",
    "        print(f\"Warning: {pair} is already lemmatized\")\n",
    "    else:\n",
    "        lemmatize_mappings.append(pair)\n",
    "\n",
    "# Drop mappings whose target is an excluded word.\n",
    "lemmatize_mappings = [\n",
    "    (w, lemma) for w, lemma in lemmatize_mappings if lemma not in excluded_words\n",
    "]\n",
    "\n",
    "# Re-add every word and lemma to the master word list.\n",
    "known = set(all_words)\n",
    "for w in sum(lemmatize_mappings, ()):\n",
    "    if w not in known:\n",
    "        all_words.append(w)\n",
    "        known.add(w)\n",
    "\n",
    "lemmatize_mappings = dict(lemmatize_mappings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ee9af7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a reverse index lemma -> [words] once, instead of re-scanning the\n",
    "# whole mapping for every word (previously O(n^2)).\n",
    "words_by_lemma = defaultdict(list)\n",
    "for word, lemma in lemmatize_mappings.items():\n",
    "    words_by_lemma[lemma].append(word)\n",
    "\n",
    "final_wordlist = []\n",
    "seen_lemmatizations = set()\n",
    "for w in all_words:\n",
    "    lemmatized = lemmatize_mappings.get(w) or w\n",
    "    if lemmatized in seen_lemmatizations:\n",
    "        # Another spelling of this lemma was already accepted.\n",
    "        continue\n",
    "    seen_lemmatizations.add(lemmatized)\n",
    "    group = words_by_lemma.get(lemmatized, [])\n",
    "    if not group:\n",
    "        # Word whose mapping was dropped (excluded lemma): nothing to emit.\n",
    "        # The original appended an empty list here, silently wasting a slot.\n",
    "        continue\n",
    "    final_wordlist.append(group)\n",
    "    if len(final_wordlist) >= WORDLIST_SIZE:\n",
    "        break\n",
    "\n",
    "# Convert to (number, word) pairs; words sharing a lemma share a number.\n",
    "final_wordlist = [\n",
    "    (idx, w) for idx, words in enumerate(final_wordlist) for w in words\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07c1293c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rich display of the mapping size (no print() needed).\n",
    "len(lemmatize_mappings)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}