2023-02-09 00:08:47 -05:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "0b00342f-7b19-49cc-bc6c-21019f8cc7dc",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2023-02-16 20:06:19 -05:00
"Collecting nltk\n",
" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m28.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hCollecting odfpy\n",
" Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m68.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hRequirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
"Collecting regex>=2021.8.3\n",
" Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m140.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
2023-02-15 22:57:30 -05:00
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
2023-02-16 20:06:19 -05:00
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
"Building wheels for collected packages: odfpy\n",
" Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=07cd1c76f3eab402c874a8f4e7d32754528bfb4ba43ad4da49f7cd9986a2b7f4\n",
" Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
"Successfully built odfpy\n",
"Installing collected packages: regex, odfpy, nltk\n",
"Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
2023-02-09 00:08:47 -05:00
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
2023-02-16 20:06:19 -05:00
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
2023-02-09 00:08:47 -05:00
]
}
],
"source": [
"try:\n",
" _initialized\n",
"except:\n",
" !pip install nltk odfpy\n",
" import nltk\n",
" \n",
" nltk.download(\"wordnet\")\n",
" _initialized=True\n",
" \n",
"from nltk.stem.wordnet import WordNetLemmatizer\n",
"import pandas as pd\n",
"import gzip\n",
"import re\n",
"\n",
"WORDLIST_SIZE=8192 +3"
]
},
{
"cell_type": "markdown",
"id": "d5bf26fa-0aab-403a-9a6f-b2a37dc4892e",
"metadata": {},
"source": [
"## First, get the list of excluded words"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "926d0d84-0d7e-4939-b87f-1a170f870a8f",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"annotated_words=pd.read_excel(\"annotated_words.ods\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8b0d26e4-051c-4669-b566-bbd5ddbbe02b",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
2023-02-11 17:04:16 -05:00
"['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'children', 'd']"
2023-02-09 00:08:47 -05:00
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n",
"excluded_words[0:10]"
]
},
{
"cell_type": "markdown",
"id": "5f855ec9-eea5-4e15-bc07-96cdd414f36a",
"metadata": {
"tags": []
},
"source": [
"## Next, get the list of custom mappings"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2eea14b2-82bf-4353-8982-76a6c7f46d22",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[('be', 'bee'),\n",
" ('by', 'bye'),\n",
2023-02-11 17:04:16 -05:00
" ('died', 'dyed'),\n",
2023-02-15 22:57:30 -05:00
" ('cents', 'sense'),\n",
" ('yellow', 'hello'),\n",
2023-02-09 00:08:47 -05:00
" ('corps', 'core'),\n",
" ('ore', 'oar'),\n",
" ('ore', ' or'),\n",
" ('vary', 'very'),\n",
" ('com', 'calm'),\n",
" ('filing', 'filling'),\n",
" ('fax', 'facts'),\n",
2023-02-11 17:04:16 -05:00
" ('favour', 'favor'),\n",
2023-02-09 00:08:47 -05:00
" ('theatre', 'theater'),\n",
" ('par', 'parse'),\n",
" ('honour', 'honor'),\n",
" ('harry', 'hairy'),\n",
" ('brings', 'bring'),\n",
" ('organisation', 'organization'),\n",
" ('simultaneously', 'simultaneous'),\n",
" ('aluminum', 'aluminium'),\n",
" ('knight', 'night'),\n",
" ('electronics', 'electronic'),\n",
" ('organisations', 'organizations'),\n",
" ('fortunately', 'fortunate'),\n",
" ('corp', 'core'),\n",
" ('chile', 'chilly'),\n",
" ('chile', ' chili'),\n",
" ('owe', 'oh'),\n",
" ('capitol', 'capital'),\n",
" ('weary', 'wary'),\n",
" ('berry', 'barry'),\n",
" ('lecturer', 'lecture'),\n",
" ('aluminium', 'aluminum'),\n",
" ('isle', 'aisle'),\n",
" ('boulder', 'bolder'),\n",
2023-02-15 22:57:30 -05:00
" ('blew', 'blue'),\n",
" ('reformed', 'reform'),\n",
" ('scent', 'sense'),\n",
2023-02-09 00:08:47 -05:00
" ('ads', 'adds'),\n",
" ('honours', 'honors'),\n",
" ('bot', 'bought'),\n",
" ('dew', 'do'),\n",
" ('dew', ' due'),\n",
" ('theatres', 'theaters'),\n",
" ('thru', 'through'),\n",
" ('monies', 'moneys'),\n",
" ('cue', 'queue'),\n",
" ('hairy', 'harry'),\n",
" ('hem', 'him'),\n",
" ('nun', 'none'),\n",
" ('organisational', 'organizational'),\n",
2023-02-11 17:04:16 -05:00
" ('dessert', 'desert'),\n",
2023-02-09 00:08:47 -05:00
" ('aux', 'ox'),\n",
" ('rap', 'wrap'),\n",
" ('filings', 'filling'),\n",
" ('sew', 'so'),\n",
" ('pars', 'parse'),\n",
2023-02-16 20:06:19 -05:00
" ('fillings', 'filling'),\n",
" ('scents', 'scent')]"
2023-02-09 00:08:47 -05:00
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n",
"\n",
"custom_maps = [\n",
" (m[1][\"word\"].lower(), mapping.lower())\n",
" for m in custom_maps.iterrows()\n",
" for mapping in m[1][\"maps_to\"]\n",
"]\n",
"custom_maps"
]
},
{
"cell_type": "code",
2023-02-16 20:06:19 -05:00
"execution_count": 23,
2023-02-09 00:08:47 -05:00
"id": "dc52697b-2a30-4e6c-ab74-b77edce3607c",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2023-02-15 22:57:30 -05:00
"# all_words: 21285\n",
2023-02-09 00:08:47 -05:00
"sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n",
"\n",
2023-02-16 20:06:19 -05:00
"# lemmatize_mappings: 4150\n",
"sample: [('as', 'a'), ('was', 'wa'), ('has', 'ha'), ('its', 'it'), ('years', 'year'), ('states', 'state'), ('us', 'u'), ('does', 'doe'), ('less', 'le'), ('means', 'mean')]\n",
2023-02-09 00:08:47 -05:00
"\n",
2023-02-16 20:06:19 -05:00
"# distinct_words: 4114\n",
2023-02-09 00:08:47 -05:00
"sample:\n"
]
},
{
"data": {
"text/plain": [
2023-02-16 20:06:19 -05:00
"['as', 'was', 'has', 'its', 'years', 'states', 'us', 'does', 'less', 'means']"
2023-02-09 00:08:47 -05:00
]
},
2023-02-16 20:06:19 -05:00
"execution_count": 23,
2023-02-09 00:08:47 -05:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def get_lines(filename):\n",
" with gzip.open(filename, 'r') as f:\n",
" ret = []\n",
" for l in f:\n",
" if len(ret) > 30_000:\n",
" return ret\n",
" ret.append(str(l).lower())\n",
" return ret\n",
" \n",
"lemmatizer = WordNetLemmatizer()\n",
"word_re = re.compile(r\"^[A-Za-z]+$\")\n",
"\n",
"# Start parsing the wordlist\n",
"all_words = get_lines(\"frequency-all.txt.gz\")\n",
"\n",
"# Delete header line\n",
"all_words = all_words[1:]\n",
"\n",
"# Get only the word (fixed width)\n",
"all_words = [w[13:36].strip() for w in all_words]\n",
"\n",
"# Remove special characters\n",
"all_words = [w for w in all_words if word_re.search(w)]\n",
"\n",
"# Remove all removed words\n",
"all_words = [w for w in all_words if w not in excluded_words]\n",
"\n",
"# Lemmatize all words (plural -> singular)\n",
"lemmatize_mappings = [\n",
" (w, lemmatizer.lemmatize(w)) \n",
" for w in all_words\n",
" # if w != lemmatizer.lemmatize(w)\n",
"]\n",
"\n",
"# Remove all words that lemmatize to another word\n",
"#all_words = [w for w in all_words if w not in ]\n",
"\n",
"# Add custom lemmatizations\n",
"for l in custom_maps:\n",
" if l in lemmatize_mappings:\n",
" print(f\"Warning: {l} is already lemmatized\")\n",
" else:\n",
" lemmatize_mappings.append(l)\n",
"\n",
"distinct_words_lemmatized = set()\n",
"distinct_words = []\n",
"for w in lemmatize_mappings:\n",
" if w[1] not in distinct_words_lemmatized:\n",
" distinct_words_lemmatized.add(w[1])\n",
" distinct_words.append(w[0])\n",
"del distinct_words_lemmatized\n",
"\n",
"# Generate a wordlist of word[0] being the word, and w[1] being what that word maps to, or None if it is a distinct word\n",
"#wordlist = [(w[0], None if w[0] == w[1] else w[1]) if w[0] == w[1] else w for w in wl]\n",
"\n",
"# Get a list of words that map to other words\n",
"# A word was lemmatized if wordnet mapped it to another word (not None) that was different\n",
"#only_lemmatized_words = [w for w in wordlist if w[1] is not None and w[0] != w[1]]\n",
"\n",
"# Get a list of distinct lemmatized words\n",
"#distinct_lemmatized_words = [w[1] for w in wordlist if w[1] is not None]\n",
"#distinct_lemmatized_words = [w for w in pd.unique(distinct_lemmatized_words)]\n",
"\n",
"print(f\"# all_words: {len(all_words)}\")\n",
"print(f\"sample: {all_words[0:10]}\")\n",
"print()\n",
"print(f\"# lemmatize_mappings: {len(lemmatize_mappings)}\")\n",
"print(f\"sample: {lemmatize_mappings[0:10]}\")\n",
"print()\n",
"print(f\"# distinct_words: {len(distinct_words)}\")\n",
"print(f\"sample:\")\n",
"distinct_words[0:10]\n"
]
},
2023-02-16 20:06:19 -05:00
{
"cell_type": "code",
"execution_count": 14,
"id": "4edc3e7b-3131-498b-acea-7edb7d5ad405",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"['the',\n",
" 'of',\n",
" 'and',\n",
" 'to',\n",
" 'in',\n",
" 'is',\n",
" 'that',\n",
" 'for',\n",
" 'as',\n",
" 'it',\n",
" 'be',\n",
" 'by',\n",
" 'with',\n",
" 'was',\n",
" 'on',\n",
" 'not',\n",
" 'or',\n",
" 'this',\n",
" 'are',\n",
" 'at',\n",
" 'from',\n",
" 'he',\n",
" 'which',\n",
" 'his',\n",
" 'have',\n",
" 'an',\n",
" 'but',\n",
" 'you',\n",
" 'they',\n",
" 'were',\n",
" 'had',\n",
" 'we',\n",
" 'all',\n",
" 'one',\n",
" 'has',\n",
" 'their',\n",
" 'been',\n",
" 'will',\n",
" 'there',\n",
" 'can',\n",
" 'if',\n",
" 'other',\n",
" 'would',\n",
" 'no',\n",
" 'her',\n",
" 'may',\n",
" 'more',\n",
" 'when',\n",
" 'so',\n",
" 'who',\n",
" 'such',\n",
" 'these',\n",
" 'any',\n",
" 'she',\n",
" 'new',\n",
" 'time',\n",
" 'than',\n",
" 'do',\n",
" 'some',\n",
" 'what',\n",
" 'only',\n",
" 'into',\n",
" 'them',\n",
" 'two',\n",
" 'also',\n",
" 'about',\n",
" 'out',\n",
" 'him',\n",
" 'my',\n",
" 'said',\n",
" 'up',\n",
" 'our',\n",
" 'first',\n",
" 'should',\n",
" 'under',\n",
" 'made',\n",
" 'state',\n",
" 'see',\n",
" 'after',\n",
" 'could',\n",
" 'then',\n",
" 'me',\n",
" 'most',\n",
" 'over',\n",
" 'very',\n",
" 'your',\n",
" 'between',\n",
" 'where',\n",
" 'now',\n",
" 'shall',\n",
" 'work',\n",
" 'those',\n",
" 'same',\n",
" 'well',\n",
" 'each',\n",
" 'many',\n",
" 'being',\n",
" 'years',\n",
" 'did',\n",
" 'through',\n",
" 'must',\n",
" 'upon',\n",
" 'before',\n",
" 'like',\n",
" 'use',\n",
" 'part',\n",
" 'general',\n",
" 'people',\n",
" 'because',\n",
" 'used',\n",
" 'how',\n",
" 'even',\n",
" 'much',\n",
" 'during',\n",
" 'both',\n",
" 'case',\n",
" 'three',\n",
" 'number',\n",
" 'make',\n",
" 'per',\n",
" 'great',\n",
" 'act',\n",
" 'way',\n",
" 'life',\n",
" 'good',\n",
" 'day',\n",
" 'public',\n",
" 'man',\n",
" 'however',\n",
" 'system',\n",
" 'water',\n",
" 'without',\n",
" 'us',\n",
" 'government',\n",
" 'while',\n",
" 'long',\n",
" 'order',\n",
" 'law',\n",
" 'section',\n",
" 'court',\n",
" 'high',\n",
" 'right',\n",
" 'own',\n",
" 'found',\n",
" 'united',\n",
" 'just',\n",
" 'here',\n",
" 'against',\n",
" 'world',\n",
" 'does',\n",
" 'company',\n",
" 'within',\n",
" 'given',\n",
" 'service',\n",
" 'house',\n",
" 'another',\n",
" 'power',\n",
" 'place',\n",
" 'know',\n",
" 'little',\n",
" 'down',\n",
" 'present',\n",
" 'every',\n",
" 'national',\n",
" 'back',\n",
" 'take',\n",
" 'information',\n",
" 'men',\n",
" 'since',\n",
" 'might',\n",
" 'small',\n",
" 'large',\n",
" 'school',\n",
" 'following',\n",
" 'still',\n",
" 'less',\n",
" 'last',\n",
" 'city',\n",
" 'second',\n",
" 'development',\n",
" 'different',\n",
" 'university',\n",
" 'old',\n",
" 'form',\n",
" 'point',\n",
" 'total',\n",
" 'data',\n",
" 'too',\n",
" 'committee',\n",
" 'report',\n",
" 'business',\n",
" 'think',\n",
" 'end',\n",
" 'get',\n",
" 'set',\n",
" 'research',\n",
" 'say',\n",
" 'come',\n",
" 'country',\n",
" 'never',\n",
" 'fact',\n",
" 'go',\n",
" 'control',\n",
" 'thus',\n",
" 'having',\n",
" 'value',\n",
" 'social',\n",
" 'department',\n",
" 'few',\n",
" 'above',\n",
" 'important',\n",
" 'interest',\n",
" 'study',\n",
" 'off',\n",
" 'area',\n",
" 'means',\n",
" 'office',\n",
" 'group',\n",
" 'give',\n",
" 'again',\n",
" 'war',\n",
" 'whether',\n",
" 'question',\n",
" 'called',\n",
" 'period',\n",
" 'line',\n",
" 'land',\n",
" 'four',\n",
" 'among',\n",
" 'table',\n",
" 'board',\n",
" 'until',\n",
" 'hand',\n",
" 'taken',\n",
" 'need',\n",
" 'education',\n",
" 'certain',\n",
" 'county',\n",
" 'action',\n",
" 'several',\n",
" 'am',\n",
" 'course',\n",
" 'far',\n",
" 'effect',\n",
" 'possible',\n",
" 'though',\n",
" 'left',\n",
" 'further',\n",
" 'home',\n",
" 'person',\n",
" 'health',\n",
" 'amount',\n",
" 'members',\n",
" 'subject',\n",
" 'yet',\n",
" 'program',\n",
" 'therefore',\n",
" 'process',\n",
" 'rate',\n",
" 'local',\n",
" 'name',\n",
" 'find',\n",
" 'necessary',\n",
" 'often',\n",
" 'others',\n",
" 'whole',\n",
" 'change',\n",
" 'example',\n",
" 'president',\n",
" 'history',\n",
" 'best',\n",
" 'although',\n",
" 'family',\n",
" 'side',\n",
" 'women',\n",
" 'held',\n",
" 'based',\n",
" 'south',\n",
" 'special',\n",
" 'required',\n",
" 'came',\n",
" 'thought',\n",
" 'five',\n",
" 'always',\n",
" 'himself',\n",
" 'air',\n",
" 'known',\n",
" 'head',\n",
" 'either',\n",
" 'property',\n",
" 'cost',\n",
" 'rather',\n",
" 'bill',\n",
" 'put',\n",
" 'human',\n",
" 'figure',\n",
" 'results',\n",
" 'level',\n",
" 'conditions',\n",
" 'full',\n",
" 'book',\n",
" 'available',\n",
" 'early',\n",
" 'matter',\n",
" 'common',\n",
" 'light',\n",
" 'let',\n",
" 'society',\n",
" 'body',\n",
" 'international',\n",
" 'including',\n",
" 'free',\n",
" 'evidence',\n",
" 'better',\n",
" 'type',\n",
" 'provided',\n",
" 'due',\n",
" 'next',\n",
" 'production',\n",
" 'once',\n",
" 'done',\n",
" 'making',\n",
" 'least',\n",
" 'support',\n",
" 'north',\n",
" 'later',\n",
" 'using',\n",
" 'things',\n",
" 'economic',\n",
" 'chapter',\n",
" 'various',\n",
" 'why',\n",
" 'white',\n",
" 'going',\n",
" 'commission',\n",
" 'federal',\n",
" 'away',\n",
" 'field',\n",
" 'nature',\n",
" 'policy',\n",
" 'become',\n",
" 'political',\n",
" 'increase',\n",
" 'around',\n",
" 'age',\n",
" 'want',\n",
" 'low',\n",
" 'trade',\n",
" 'half',\n",
" 'position',\n",
" 'young',\n",
" 'money',\n",
" 'percent',\n",
" 'cent',\n",
" 'class',\n",
" 'words',\n",
" 'view',\n",
" 'provide',\n",
" 'seen',\n",
" 'show',\n",
" 'district',\n",
" 'party',\n",
" 'analysis',\n",
" 'care',\n",
" 'june',\n",
" 'foreign',\n",
" 'shown',\n",
" 'received',\n",
" 'management',\n",
" 'third',\n",
" 'took',\n",
" 'something',\n",
" 'tax',\n",
" 'account',\n",
" 'problem',\n",
" 'almost',\n",
" 'west',\n",
" 'nothing',\n",
" 'together',\n",
" 'individual',\n",
" 'open',\n",
" 'material',\n",
" 'paper',\n",
" 'feet',\n",
" 'force',\n",
" 'association',\n",
" 'purpose',\n",
" 'terms',\n",
" 'method',\n",
" 'help',\n",
" 'real',\n",
" 'ever',\n",
" 'already',\n",
" 'along',\n",
" 'went',\n",
" 'particular',\n",
" 'energy',\n",
" 'secretary',\n",
" 'date',\n",
" 'price',\n",
" 'short',\n",
" 'true',\n",
" 'street',\n",
" 'building',\n",
" 'room',\n",
" 'market',\n",
" 'look',\n",
" 'similar',\n",
" 'industry',\n",
" 'bank',\n",
" 'according',\n",
" 'itself',\n",
" 'application',\n",
" 'current',\n",
" 'read',\n",
" 'press',\n",
" 'community',\n",
" 'plan',\n",
" 'whose',\n",
" 'major',\n",
" 'considered',\n",
" 'mind',\n",
" 'union',\n",
" 'cause',\n",
" 'able',\n",
" 'surface',\n",
" 'face',\n",
" 'river',\n",
" 'council',\n",
" 'income',\n",
" 'july',\n",
" 'near',\n",
" 'experience',\n",
" 'non',\n",
" 'paid',\n",
" 'pay',\n",
" 'reason',\n",
" 'themselves',\n",
" 'asked',\n",
" 'march',\n",
" 'king',\n",
" 'higher',\n",
" 'single',\n",
" 'average',\n",
" 'father',\n",
" 'note',\n",
" 'treatment',\n",
" 'love',\n",
" 'black',\n",
" 'knowledge',\n",
" 'enough',\n",
" 'future',\n",
" 'kind',\n",
" 'lower',\n",
" 'authority',\n",
" 'past',\n",
" 'natural',\n",
" 'six',\n",
" 'food',\n",
" 'working',\n",
" 'central',\n",
" 'college',\n",
" 'self',\n",
" 'products',\n",
" 'model',\n",
" 'brought',\n",
" 'greater',\n",
" 'test',\n",
" 'nor',\n",
" 'students',\n",
" 'private',\n",
" 'construction',\n",
" 'perhaps',\n",
" 'ground',\n",
" 'sir',\n",
" 'basis',\n",
" 'months',\n",
" 'growth',\n",
" 'increased',\n",
" 'east',\n",
" 'language',\n",
" 'rule',\n",
" 'continued',\n",
" 'quite',\n",
" 'except',\n",
" 'series',\n",
" 'practice',\n",
" 'night',\n",
" 'eyes',\n",
" 'oil',\n",
" 'art',\n",
" 'told',\n",
" 'especially',\n",
" 'population',\n",
" 'science',\n",
" 'whom',\n",
" 'obtained',\n",
" 'capital',\n",
" 'include',\n",
" 'generally',\n",
" 'meeting',\n",
" 'specific',\n",
" 'described',\n",
" 'believe',\n",
" 'review',\n",
" 'issue',\n",
" 'respect',\n",
" 'contract',\n",
" 'became',\n",
" 'medical',\n",
" 'road',\n",
" 'got',\n",
" 'clear',\n",
" 'main',\n",
" 'labor',\n",
" 'operation',\n",
" 'size',\n",
" 'below',\n",
" 'hours',\n",
" 'sense',\n",
" 'addition',\n",
" 'probably',\n",
" 'century',\n",
" 'personal',\n",
" 'plant',\n",
" 'training',\n",
" 'design',\n",
" 'statement',\n",
" 'structure',\n",
" 'project',\n",
" 'million',\n",
" 'usually',\n",
" 'range',\n",
" 'call',\n",
" 'mother',\n",
" 'seems',\n",
" 'standard',\n",
" 'return',\n",
" 'title',\n",
" 'established',\n",
" 'keep',\n",
" 'space',\n",
" 'annual',\n",
" 'record',\n",
" 'close',\n",
" 'april',\n",
" 'complete',\n",
" 'page',\n",
" 'heart',\n",
" 'fig',\n",
" 'quality',\n",
" 'gas',\n",
" 'letter',\n",
" 'stock',\n",
" 'gave',\n",
" 'related',\n",
" 'administration',\n",
" 'activities',\n",
" 'theory',\n",
" 'town',\n",
" 'equipment',\n",
" 'soon',\n",
" 'decision',\n",
" 'pressure',\n",
" 'written',\n",
" 'corporation',\n",
" 'tell',\n",
" 'agreement',\n",
" 'reported',\n",
" 'attention',\n",
" 'fire',\n",
" 'direct',\n",
" 'saw',\n",
" 'published',\n",
" 'temperature',\n",
" 'species',\n",
" 'really',\n",
" 'function',\n",
" 'military',\n",
" 'proposed',\n",
" 'january',\n",
" 'additional',\n",
" 'late',\n",
" 'opinion',\n",
" 'loss',\n",
" 'limited',\n",
" 'source',\n",
" 'article',\n",
" 'notice',\n",
" 'security',\n",
" 'organization',\n",
" 'financial',\n",
" 'follows',\n",
" 'miles',\n",
" 'chief',\n",
" 'distribution',\n",
" 'sometimes',\n",
" 'insurance',\n",
" 'son',\n",
" 'strong',\n",
" 'length',\n",
" 'original',\n",
" 'yes',\n",
" 'effective',\n",
" 'defendant',\n",
" 'living',\n",
" 'december',\n",
" 'character',\n",
" 'began',\n",
" 'carried',\n",
" 'supply',\n",
" 'blood',\n",
" 'taking',\n",
" 'manner',\n",
" 'journal',\n",
" 'hundred',\n",
" 'red',\n",
" 'developed',\n",
" 'performance',\n",
" 'situation',\n",
" 'felt',\n",
" 'workers',\n",
" 'volume',\n",
" 'presented',\n",
" 'knew',\n",
" 'answer',\n",
" 'resources',\n",
" 'industrial',\n",
" 'twenty',\n",
" 'sent',\n",
" 'looked',\n",
" 'library',\n",
" 'added',\n",
" 'passed',\n",
" 'ten',\n",
" 'sea',\n",
" 'applied',\n",
" 'included',\n",
" 'physical',\n",
" 'across',\n",
" 'army',\n",
" 'toward',\n",
" 'produced',\n",
" 'placed',\n",
" 'role',\n",
" 'october',\n",
" 'final',\n",
" 'approach',\n",
" 'provisions',\n",
" 'leave',\n",
" 'director',\n",
" 'employment',\n",
" 'anything',\n",
" 'particularly',\n",
" 'hard',\n",
" 'outside',\n",
" 'week',\n",
" 'feel',\n",
" 'charge',\n",
" 'indeed',\n",
" 'degree',\n",
" 'reference',\n",
" 'requirements',\n",
" 'september',\n",
" 'today',\n",
" 'western',\n",
" 'influence',\n",
" 'unit',\n",
" 'solution',\n",
" 'chairman',\n",
" 'legal',\n",
" 'motion',\n",
" 'region',\n",
" 'idea',\n",
" 'list',\n",
" 'judgment',\n",
" 'determined',\n",
" 'poor',\n",
" 'disease',\n",
" 'civil',\n",
" 'turn',\n",
" 'modern',\n",
" 'normal',\n",
" 'appear',\n",
" 'employees',\n",
" 'latter',\n",
" 'heard',\n",
" 'top',\n",
" 'sure',\n",
" 'moment',\n",
" 'code',\n",
" 'wife',\n",
" 'post',\n",
" 'difficult',\n",
" 'recent',\n",
" 'extent',\n",
" 'longer',\n",
" 'story',\n",
" 'meet',\n",
" 'officers',\n",
" 'patients',\n",
" 'front',\n",
" 'doing',\n",
" 'staff',\n",
" 'august',\n",
" 'needed',\n",
" 'involved',\n",
" 'likely',\n",
" 'former',\n",
" 'run',\n",
" 'author',\n",
" 'middle',\n",
" 'turned',\n",
" 'agency',\n",
" 'reading',\n",
" 'beginning',\n",
" 'duty',\n",
" 'movement',\n",
" 'alone',\n",
" 'beyond',\n",
" 'fine',\n",
" 'base',\n",
" 'relations',\n",
" 'simple',\n",
" 'consider',\n",
" 'proper',\n",
" 'instead',\n",
" 'significant',\n",
" 'appears',\n",
" 'equal',\n",
" 'lost',\n",
" 'followed',\n",
" 'hope',\n",
" 'cut',\n",
" 'unless',\n",
" 'nearly',\n",
" 'claim',\n",
" 'associated',\n",
" 'expected',\n",
" 'difference',\n",
" 'funds',\n",
" 'direction',\n",
" 'cross',\n",
" 'live',\n",
" 'finally',\n",
" 'weight',\n",
" 'lead',\n",
" 'trial',\n",
" 'justice',\n",
" 'factors',\n",
" 'response',\n",
" 'cells',\n",
" 'earth',\n",
" 'rest',\n",
" 'bring',\n",
" 'trust',\n",
" 'observed',\n",
" 'behind',\n",
" 'job',\n",
" 'door',\n",
" 'understand',\n",
" 'acid',\n",
" 'hold',\n",
" 'technology',\n",
" 'wide',\n",
" 'protection',\n",
" 'basic',\n",
" 'november',\n",
" 'seemed',\n",
" 'throughout',\n",
" 'importance',\n",
" 'sales',\n",
" 'stated',\n",
" 'address',\n",
" 'potential',\n",
" 'payment',\n",
" 'prior',\n",
" 'discussion',\n",
" 'conference',\n",
" 'writing',\n",
" 'stage',\n",
" 'fall',\n",
" 'iron',\n",
" 'play',\n",
" 'ask',\n",
" 'relationship',\n",
" 'towards',\n",
" 'regard',\n",
" 'referred',\n",
" 'flow',\n",
" 'consideration',\n",
" 'hospital',\n",
" 'seem',\n",
" 'february',\n",
" 'soil',\n",
" 'morning',\n",
" 'commercial',\n",
" 'planning',\n",
" 'provides',\n",
" 'appropriate',\n",
" 'technical',\n",
" 'demand',\n",
" 'sufficient',\n",
" 'principal',\n",
" 'credit',\n",
" 'peace',\n",
" 'previous',\n",
" 'object',\n",
" 'kept',\n",
" 'sound',\n",
" 'wanted',\n",
" 'looking',\n",
" 'entire',\n",
" 'plaintiff',\n",
" 'heat',\n",
" 'otherwise',\n",
" 'judge',\n",
" 'capacity',\n",
" 'brown',\n",
" 'music',\n",
" 'risk',\n",
" 'box',\n",
" 'exchange',\n",
" 'produce',\n",
" 'station',\n",
" 'big',\n",
" 'primary',\n",
" 'institute',\n",
" 'mentioned',\n",
" 'prepared',\n",
" 'spirit',\n",
" 'allowed',\n",
" 'site',\n",
" 'green',\n",
" 'directly',\n",
" 'text',\n",
" 'friends',\n",
" 'presence',\n",
" 'survey',\n",
" 'determine',\n",
" 'car',\n",
" 'larger',\n",
" 'deep',\n",
" 'simply',\n",
" 'immediately',\n",
" 'distance',\n",
" 'coming',\n",
" 'seven',\n",
" 'steel',\n",
" 'existing',\n",
" 'clearly',\n",
" 'actual',\n",
" 'born',\n",
" 'learning',\n",
" 'voice',\n",
" 'earlier',\n",
" 'circumstances',\n",
" 'safety',\n",
" 'ago',\n",
" 'issued',\n",
" 'upper',\n",
" 'require',\n",
" 'scale',\n",
" 'island',\n",
" 'culture',\n",
" 'employed',\n",
" 'eight',\n",
" 'estate',\n",
" 'portion',\n",
" 'deal',\n",
" 'share',\n",
" 'actually',\n",
" 'aid',\n",
" 'engineering',\n",
" 'continue',\n",
" 'formed',\n",
" 'agricultural',\n",
" 'entitled',\n",
" 'mass',\n",
" 'truth',\n",
" 'giving',\n",
" 'met',\n",
" 'built',\n",
" 'content',\n",
" 'connection',\n",
" 'assistance',\n",
" 'coal',\n",
" 'progress',\n",
" 'receive',\n",
" 'active',\n",
" 'nation',\n",
" 'contact',\n",
" 'amendment',\n",
" 'net',\n",
" 'wall',\n",
" 'farm',\n",
" 'understanding',\n",
" 'strength',\n",
" 'minutes',\n",
" 'move',\n",
" 'elements',\n",
" 'concerned',\n",
" 'regulations',\n",
" 'step',\n",
" 'literature',\n",
" 'opportunity',\n",
" 'investment',\n",
" 'led',\n",
" 'reduced',\n",
" 'follow',\n",
" 'facilities',\n",
" 'benefit',\n",
" 'compared',\n",
" 'reached',\n",
" 'religious',\n",
" 'measure',\n",
" 'meaning',\n",
" 'considerable',\n",
" 'relative',\n",
" 'electric',\n",
" 'joint',\n",
" 'certainly',\n",
" 'failure',\n",
" 'apply',\n",
" 'appeal',\n",
" 'separate',\n",
" 'balance',\n",
" 'died',\n",
" 'operating',\n",
" 'includes',\n",
" 'independent',\n",
" 'defined',\n",
" 'forward',\n",
" 'doubt',\n",
" 'none',\n",
" 'master',\n",
" 'chemical',\n",
" 'success',\n",
" 'environment',\n",
" 'everything',\n",
" 'transfer',\n",
" 'news',\n",
" 'gold',\n",
" 'thousand',\n",
" 'key',\n",
" 'examination',\n",
" 'fully',\n",
" 'description',\n",
" 'teachers',\n",
" 'lake',\n",
" 'status',\n",
" 'fair',\n",
" 'affairs',\n",
" 'round',\n",
" 'procedure',\n",
" 'covered',\n",
" 'daily',\n",
" 'collection',\n",
" 'maximum',\n",
" 'pass',\n",
" 'lot',\n",
" 'resolution',\n",
" 'adopted',\n",
" 'principles',\n",
" 'fixed',\n",
" 'police',\n",
" 'machine',\n",
" 'appeared',\n",
" 'becomes',\n",
" 'moved',\n",
" 'phase',\n",
" 'caused',\n",
" 'request',\n",
" 'stand',\n",
" 'else',\n",
" 'executive',\n",
" 'institutions',\n",
" 'neither',\n",
" 'heavy',\n",
" 'computer',\n",
" 'senator',\n",
" 'wood',\n",
" 'environmental',\n",
" 'dark',\n",
" 'professor',\n",
" 'access',\n",
" 'official',\n",
" 'hear',\n",
" 'spring',\n",
" 'complex',\n",
" 'allow',\n",
" 'sum',\n",
" 'cover',\n",
" 'proceedings',\n",
" 'write',\n",
" 'discussed',\n",
" 'events',\n",
" 'started',\n",
" 'internal',\n",
" 'ability',\n",
" ...]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct_words"
]
},
2023-02-09 00:08:47 -05:00
{
"cell_type": "markdown",
"id": "6acea424-d538-4981-a4b9-0d9224f8efb3",
"metadata": {},
"source": [
"## Generate the final wordlist"
]
},
{
"cell_type": "code",
2023-02-16 20:06:19 -05:00
"execution_count": 22,
2023-02-09 00:08:47 -05:00
"id": "52d0573e-aefd-4c4e-b682-47d1ff8c676b",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2023-02-16 20:06:19 -05:00
"[('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it'), ('be', 'be'), ('by', 'by'), ('with', 'with'), ('was', 'wa'), ('on', 'on'), ('not', 'not'), ('or', 'or'), ('this', 'this'), ('are', 'are'), ('at', 'at'), ('from', 'from'), ('he', 'he'), ('which', 'which'), ('his', 'his'), ('have', 'have'), ('an', 'an'), ('but', 'but'), ('you', 'you'), ('they', 'they'), ('were', 'were'), ('had', 'had'), ('we', 'we'), ('all', 'all'), ('one', 'one'), ('has', 'ha'), ('their', 'their'), ('been', 'been'), ('will', 'will'), ('there', 'there'), ('can', 'can'), ('if', 'if'), ('other', 'other'), ('would', 'would'), ('no', 'no'), ('her', 'her'), ('may', 'may'), ('more', 'more'), ('when', 'when'), ('so', 'so'), ('its', 'it'), ('who', 'who'), ('such', 'such'), ('these', 'these'), ('any', 'any'), ('she', 'she'), ('new', 'new'), ('time', 'time'), ('than', 'than'), ('do', 'do'), ('some', 'some'), ('what', 'what'), ('only', 'only'), ('into', 'into'), ('them', 'them'), ('two', 'two'), ('also', 'also'), ('about', 'about'), ('out', 'out'), ('him', 'him'), ('my', 'my'), ('said', 'said'), ('up', 'up'), ('our', 'our'), ('first', 'first'), ('should', 'should'), ('under', 'under'), ('made', 'made'), ('state', 'state'), ('see', 'see'), ('after', 'after'), ('could', 'could'), ('then', 'then'), ('me', 'me'), ('most', 'most'), ('over', 'over'), ('very', 'very'), ('your', 'your'), ('between', 'between'), ('where', 'where'), ('now', 'now'), ('shall', 'shall'), ('work', 'work'), ('those', 'those'), ('same', 'same'), ('well', 'well'), ('each', 'each'), ('many', 'many'), ('being', 'being'), ('years', 'year'), ('did', 'did'), ('year', 'year'), ('through', 'through'), ('must', 'must'), ('upon', 'upon'), ('before', 'before'), ('like', 'like'), ('use', 'use'), ('part', 'part'), ('general', 'general'), ('people', 'people'), ('because', 'because'), ('used', 'used'), ('how', 'how'), ('even', 'even'), ('much', 'much'), ('states', 'state'), ('during', 'during'), ('both', 'both'), ('case', 'case'), ('three', 'three'), ('number', 'number'), ('make', 'make'), ('per', 'per'), ('great', 'great'), ('act', 'act'), ('way', 'way'), ('life', 'life'), ('good', 'good'), ('day', 'day'), ('public', 'public'), ('man', 'man'), ('however', 'however'), ('system', 'system'), ('water', 'water'), ('without', 'without'), ('us', 'u'), ('government', 'government'), ('while', 'while'), ('long', 'long'), ('order', 'order'), ('law', 'law'), ('section', 'section'), ('court', 'court'), ('high', 'high'), ('right', 'right'), ('own', 'own'), ('found', 'found'), ('united', 'united'), ('just', 'just'), ('here', 'here'), ('against', 'against'), ('world', 'world'), ('does', 'doe'), ('company', 'company'), ('within', 'within'), ('given', 'given'), ('service', 'service'), ('house', 'house'), ('another', 'another'), ('power', 'power'), ('place', 'place'), ('know', 'know'), ('little', 'little'), ('down', 'down'), ('present', 'present'), ('every', 'every'), ('national', 'national'), ('back', 'back'), ('take', 'take'), ('information', 'information'), ('men', 'men'), ('since', 'since'), ('might', 'might'), ('small', 'small'), ('large', 'large'), ('school', 'school'), ('following', 'following'), ('still', 'still'), ('less', 'le'), ('last', 'last'), ('city', 'city'), ('second', 'second'), ('development', 'development'), ('different', 'different'), ('university', 'university'), ('old', 'old'), ('form', 'form'), ('point', 'point'), ('total', 'total'), ('data', 'data'), ('too', 'too'), ('committee', 'committee'), ('report', 'report'), ('business', 'business'), ('think', 'think'), ('end', 'end'), ('get', 'get'), ('set', 'set'), ('research', 'research'), ('say', 'say'), ('come', 'come'), ('country', 'country'), ('never', 'never'), ('fact', 'fact'), ('go', 'go'), ('control', 'control'), ('thus', 'thus'), ('having', 'having'), ('value', 'value'), ('social', 'social'), ('department', 'department'), ('few', 'few'), ('above', 'above'), ('important', 'important'), ('interest', 'interest'), ('study', 'study'), ('off', 'off'), ('are
"{'the': 1, 'of': 2, 'and': 3, 'to': 4, 'in': 5, 'is': 6, 'that': 7, 'for': 8, 'as': 9, 'it': 10, 'be': 11, 'by': 12, 'with': 13, 'was': 14, 'on': 15, 'not': 16, 'or': 17, 'this': 18, 'are': 19, 'at': 20, 'from': 21, 'he': 22, 'which': 23, 'his': 24, 'have': 25, 'an': 26, 'but': 27, 'you': 28, 'they': 29, 'were': 30, 'had': 31, 'we': 32, 'all': 33, 'one': 34, 'has': 35, 'their': 36, 'been': 37, 'will': 38, 'there': 39, 'can': 40, 'if': 41, 'other': 42, 'would': 43, 'no': 44, 'her': 45, 'may': 46, 'more': 47, 'when': 48, 'so': 49, 'who': 50, 'such': 51, 'these': 52, 'any': 53, 'she': 54, 'new': 55, 'time': 56, 'than': 57, 'do': 58, 'some': 59, 'what': 60, 'only': 61, 'into': 62, 'them': 63, 'two': 64, 'also': 65, 'about': 66, 'out': 67, 'him': 68, 'my': 69, 'said': 70, 'up': 71, 'our': 72, 'first': 73, 'should': 74, 'under': 75, 'made': 76, 'state': 77, 'see': 78, 'after': 79, 'could': 80, 'then': 81, 'me': 82, 'most': 83, 'over': 84, 'very': 85, 'your': 86, 'between': 87, 'where': 88, 'now': 89, 'shall': 90, 'work': 91, 'those': 92, 'same': 93, 'well': 94, 'each': 95, 'many': 96, 'being': 97, 'years': 98, 'did': 99, 'through': 100, 'must': 101, 'upon': 102, 'before': 103, 'like': 104, 'use': 105, 'part': 106, 'general': 107, 'people': 108, 'because': 109, 'used': 110, 'how': 111, 'even': 112, 'much': 113, 'during': 114, 'both': 115, 'case': 116, 'three': 117, 'number': 118, 'make': 119, 'per': 120, 'great': 121, 'act': 122, 'way': 123, 'life': 124, 'good': 125, 'day': 126, 'public': 127, 'man': 128, 'however': 129, 'system': 130, 'water': 131, 'without': 132, 'us': 133, 'government': 134, 'while': 135, 'long': 136, 'order': 137, 'law': 138, 'section': 139, 'court': 140, 'high': 141, 'right': 142, 'own': 143, 'found': 144, 'united': 145, 'just': 146, 'here': 147, 'against': 148, 'world': 149, 'does': 150, 'company': 151, 'within': 152, 'given': 153, 'service': 154, 'house': 155, 'another': 156, 'power': 157, 'place': 158, 'know': 159, 'little': 160, 'down': 161, 'present': 162, 'every': 163, 'national': 164, 'back': 165, 'take': 166, 'information': 167, 'men': 168, 'since': 169, 'might': 170, 'small': 171, 'large': 172, 'school': 173, 'following': 174, 'still': 175, 'less': 176, 'last': 177, 'city': 178, 'second': 179, 'development': 180, 'different': 181, 'university': 182, 'old': 183, 'form': 184, 'point': 185, 'total': 186, 'data': 187, 'too': 188, 'committee': 189, 'report': 190, 'business': 191, 'think': 192, 'end': 193, 'get': 194, 'set': 195, 'research': 196, 'say': 197, 'come': 198, 'country': 199, 'never': 200, 'fact': 201, 'go': 202, 'control': 203, 'thus': 204, 'having': 205, 'value': 206, 'social': 207, 'department': 208, 'few': 209, 'above': 210, 'important': 211, 'interest': 212, 'study': 213, 'off': 214, 'area': 215, 'means': 216, 'office': 217, 'group': 218, 'give': 219, 'again': 220, 'war': 221, 'whether': 222, 'question': 223, 'called': 224, 'period': 225, 'line': 226, 'land': 227, 'four': 228, 'among': 229, 'table': 230, 'board': 231, 'until': 232, 'hand': 233, 'taken': 234, 'need': 235, 'education': 236, 'certain': 237, 'county': 238, 'action': 239, 'several': 240, 'am': 241, 'course': 242, 'far': 243, 'effect': 244, 'possible': 245, 'though': 246, 'left': 247, 'further': 248, 'home': 249, 'person': 250, 'health': 251, 'amount': 252, 'members': 253, 'subject': 254, 'yet': 255, 'program': 256, 'therefore': 257, 'process': 258, 'rate': 259, 'local': 260, 'name': 261, 'find': 262, 'necessary': 263, 'often': 264, 'others': 265, 'whole': 266, 'change': 267, 'example': 268, 'president': 269, 'history': 270, 'best': 271, 'although': 272, 'family': 273, 'side': 274, 'women': 275, 'held': 276, 'based': 277, 'south': 278, 'special': 279, 'required': 280, 'came': 281, 'thought': 282, 'five': 283, 'always': 284, 'himself': 285, 'air': 286, 'known': 287, 'head': 288, 'either': 289, 'property': 290, 'cost': 291, 'rather': 292, 'bill': 293, 'put': 294, 'human': 295, 'figure': 296, 'results': 297, 'level': 298, 'conditions': 299, 'full': 300, 'book': 301, 'available': 302, 'early': 303, 'matter': 304, 'common':
]
},
{
"ename": "KeyError",
"evalue": "'its'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[22], line 28\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m#final_wordlist[reverse_lemmatize_idx[lem_w]] = final_wordlist[w]\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28mprint\u001b[39m(final_wordlist)\n\u001b[0;32m---> 28\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[43mfinal_wordlist\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mits\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m==\u001b[39m final_wordlist[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mits\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m final_wordlist[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mits\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFinal wordlist size: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(final_wordlist\u001b[38;5;241m.\u001b[39mkeys())\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[0;31mKeyError\u001b[0m: 'its'"
2023-02-09 00:08:47 -05:00
]
}
],
"source": [
"# The final wordlist map. Maps a word to its numeric value\n",
"# Starting at 1\n",
"final_wordlist = {\n",
" w: idx + 1\n",
" for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])\n",
"}\n",
"\n",
2023-02-16 20:06:19 -05:00
"\n",
2023-02-09 00:08:47 -05:00
"reverse_lemmatize_idx = {\n",
" lemmatizer.lemmatize(w): w\n",
" for w in final_wordlist.keys()\n",
"}\n",
"\n",
2023-02-16 20:06:19 -05:00
"print(lemmatize_mappings)\n",
"\n",
2023-02-09 00:08:47 -05:00
"# Add the lemmatized numbers\n",
"for w, lem_w in lemmatize_mappings:\n",
" if lem_w not in reverse_lemmatize_idx:\n",
" # This word is not in the reverse list\n",
" # This happens when the index of the lemmatized word we're working with is too large\n",
" continue\n",
2023-02-16 20:06:19 -05:00
" if lem_w in final_wordlist.keys():\n",
" continue\n",
2023-02-09 00:08:47 -05:00
" final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]\n",
2023-02-16 20:06:19 -05:00
" #final_wordlist[reverse_lemmatize_idx[lem_w]] = final_wordlist[w]\n",
2023-02-09 00:08:47 -05:00
"\n",
2023-02-16 20:06:19 -05:00
"print(final_wordlist)\n",
2023-02-09 00:08:47 -05:00
"assert final_wordlist[\"its\"] == final_wordlist[\"its\"]\n",
"assert final_wordlist[\"its\"] >= 0\n",
"\n",
"print(f\"Final wordlist size: {len(final_wordlist.keys())}\")"
]
},
{
"cell_type": "code",
2023-02-15 22:57:30 -05:00
"execution_count": 7,
2023-02-09 00:08:47 -05:00
"id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
2023-02-11 17:04:16 -05:00
"sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]\n",
"\n",
2023-02-09 00:08:47 -05:00
"with open(\"final_wordlist.csv\", \"w\") as f:\n",
2023-02-11 17:04:16 -05:00
" f.write(\"word,number\\n\")\n",
2023-02-09 00:08:47 -05:00
" \n",
" for w in sorted(sorted_final_wordlist, key=lambda w: w[1]):\n",
" lemmatized = \"\" if not w[1] else w[1]\n",
2023-02-11 17:04:16 -05:00
" f.write(f\"{w[0].upper()},{lemmatized - 1}\")\n",
2023-02-09 00:08:47 -05:00
" f.write(\"\\n\")"
]
2023-02-11 17:04:16 -05:00
},
{
"cell_type": "code",
2023-02-16 20:06:19 -05:00
"execution_count": 9,
2023-02-11 17:04:16 -05:00
"id": "c88fe193-11cc-4a06-a3cf-d1ad85f44d14",
2023-02-16 20:06:19 -05:00
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'the': 1,\n",
" 'of': 2,\n",
" 'and': 3,\n",
" 'to': 4,\n",
" 'in': 5,\n",
" 'is': 6,\n",
" 'that': 7,\n",
" 'for': 8,\n",
" 'as': 9,\n",
" 'it': 10,\n",
" 'be': 5378,\n",
" 'by': 7272,\n",
" 'with': 13,\n",
" 'was': 14,\n",
" 'on': 15,\n",
" 'not': 16,\n",
" 'or': 17,\n",
" 'this': 18,\n",
" 'are': 19,\n",
" 'at': 20,\n",
" 'from': 21,\n",
" 'he': 22,\n",
" 'which': 23,\n",
" 'his': 24,\n",
" 'have': 25,\n",
" 'an': 26,\n",
" 'but': 27,\n",
" 'you': 28,\n",
" 'they': 29,\n",
" 'were': 30,\n",
" 'had': 31,\n",
" 'we': 32,\n",
" 'all': 33,\n",
" 'one': 34,\n",
" 'has': 35,\n",
" 'their': 36,\n",
" 'been': 37,\n",
" 'will': 38,\n",
" 'there': 39,\n",
" 'can': 40,\n",
" 'if': 41,\n",
" 'other': 42,\n",
" 'would': 43,\n",
" 'no': 44,\n",
" 'her': 45,\n",
" 'may': 46,\n",
" 'more': 47,\n",
" 'when': 48,\n",
" 'so': 49,\n",
" 'who': 50,\n",
" 'such': 51,\n",
" 'these': 52,\n",
" 'any': 53,\n",
" 'she': 54,\n",
" 'new': 55,\n",
" 'time': 56,\n",
" 'than': 57,\n",
" 'do': 58,\n",
" 'some': 59,\n",
" 'what': 60,\n",
" 'only': 61,\n",
" 'into': 62,\n",
" 'them': 63,\n",
" 'two': 64,\n",
" 'also': 65,\n",
" 'about': 66,\n",
" 'out': 67,\n",
" 'him': 68,\n",
" 'my': 69,\n",
" 'said': 70,\n",
" 'up': 71,\n",
" 'our': 72,\n",
" 'first': 73,\n",
" 'should': 74,\n",
" 'under': 75,\n",
" 'made': 76,\n",
" 'state': 77,\n",
" 'see': 78,\n",
" 'after': 79,\n",
" 'could': 80,\n",
" 'then': 81,\n",
" 'me': 82,\n",
" 'most': 83,\n",
" 'over': 84,\n",
" 'very': 85,\n",
" 'your': 86,\n",
" 'between': 87,\n",
" 'where': 88,\n",
" 'now': 89,\n",
" 'shall': 90,\n",
" 'work': 91,\n",
" 'those': 92,\n",
" 'same': 93,\n",
" 'well': 94,\n",
" 'each': 95,\n",
" 'many': 96,\n",
" 'being': 97,\n",
" 'years': 98,\n",
" 'did': 99,\n",
" 'through': 100,\n",
" 'must': 101,\n",
" 'upon': 102,\n",
" 'before': 103,\n",
" 'like': 104,\n",
" 'use': 105,\n",
" 'part': 106,\n",
" 'general': 107,\n",
" 'people': 108,\n",
" 'because': 109,\n",
" 'used': 110,\n",
" 'how': 111,\n",
" 'even': 112,\n",
" 'much': 113,\n",
" 'during': 114,\n",
" 'both': 115,\n",
" 'case': 116,\n",
" 'three': 117,\n",
" 'number': 118,\n",
" 'make': 119,\n",
" 'per': 120,\n",
" 'great': 121,\n",
" 'act': 122,\n",
" 'way': 123,\n",
" 'life': 124,\n",
" 'good': 125,\n",
" 'day': 126,\n",
" 'public': 127,\n",
" 'man': 128,\n",
" 'however': 129,\n",
" 'system': 130,\n",
" 'water': 131,\n",
" 'without': 132,\n",
" 'us': 133,\n",
" 'government': 134,\n",
" 'while': 135,\n",
" 'long': 136,\n",
" 'order': 137,\n",
" 'law': 138,\n",
" 'section': 139,\n",
" 'court': 140,\n",
" 'high': 141,\n",
" 'right': 142,\n",
" 'own': 143,\n",
" 'found': 144,\n",
" 'united': 145,\n",
" 'just': 146,\n",
" 'here': 147,\n",
" 'against': 148,\n",
" 'world': 149,\n",
" 'does': 150,\n",
" 'company': 151,\n",
" 'within': 152,\n",
" 'given': 153,\n",
" 'service': 154,\n",
" 'house': 155,\n",
" 'another': 156,\n",
" 'power': 157,\n",
" 'place': 158,\n",
" 'know': 159,\n",
" 'little': 160,\n",
" 'down': 161,\n",
" 'present': 162,\n",
" 'every': 163,\n",
" 'national': 164,\n",
" 'back': 165,\n",
" 'take': 166,\n",
" 'information': 167,\n",
" 'men': 168,\n",
" 'since': 169,\n",
" 'might': 170,\n",
" 'small': 171,\n",
" 'large': 172,\n",
" 'school': 173,\n",
" 'following': 174,\n",
" 'still': 175,\n",
" 'less': 176,\n",
" 'last': 177,\n",
" 'city': 178,\n",
" 'second': 179,\n",
" 'development': 180,\n",
" 'different': 181,\n",
" 'university': 182,\n",
" 'old': 183,\n",
" 'form': 184,\n",
" 'point': 185,\n",
" 'total': 186,\n",
" 'data': 187,\n",
" 'too': 188,\n",
" 'committee': 189,\n",
" 'report': 190,\n",
" 'business': 191,\n",
" 'think': 192,\n",
" 'end': 193,\n",
" 'get': 194,\n",
" 'set': 195,\n",
" 'research': 196,\n",
" 'say': 197,\n",
" 'come': 198,\n",
" 'country': 199,\n",
" 'never': 200,\n",
" 'fact': 201,\n",
" 'go': 202,\n",
" 'control': 203,\n",
" 'thus': 204,\n",
" 'having': 205,\n",
" 'value': 206,\n",
" 'social': 207,\n",
" 'department': 208,\n",
" 'few': 209,\n",
" 'above': 210,\n",
" 'important': 211,\n",
" 'interest': 212,\n",
" 'study': 213,\n",
" 'off': 214,\n",
" 'area': 215,\n",
" 'means': 216,\n",
" 'office': 217,\n",
" 'group': 218,\n",
" 'give': 219,\n",
" 'again': 220,\n",
" 'war': 221,\n",
" 'whether': 222,\n",
" 'question': 223,\n",
" 'called': 224,\n",
" 'period': 225,\n",
" 'line': 226,\n",
" 'land': 227,\n",
" 'four': 228,\n",
" 'among': 229,\n",
" 'table': 230,\n",
" 'board': 231,\n",
" 'until': 232,\n",
" 'hand': 233,\n",
" 'taken': 234,\n",
" 'need': 235,\n",
" 'education': 236,\n",
" 'certain': 237,\n",
" 'county': 238,\n",
" 'action': 239,\n",
" 'several': 240,\n",
" 'am': 241,\n",
" 'course': 242,\n",
" 'far': 243,\n",
" 'effect': 244,\n",
" 'possible': 245,\n",
" 'though': 246,\n",
" 'left': 247,\n",
" 'further': 248,\n",
" 'home': 249,\n",
" 'person': 250,\n",
" 'health': 251,\n",
" 'amount': 252,\n",
" 'members': 253,\n",
" 'subject': 254,\n",
" 'yet': 255,\n",
" 'program': 256,\n",
" 'therefore': 257,\n",
" 'process': 258,\n",
" 'rate': 259,\n",
" 'local': 260,\n",
" 'name': 261,\n",
" 'find': 262,\n",
" 'necessary': 263,\n",
" 'often': 264,\n",
" 'others': 265,\n",
" 'whole': 266,\n",
" 'change': 267,\n",
" 'example': 268,\n",
" 'president': 269,\n",
" 'history': 270,\n",
" 'best': 271,\n",
" 'although': 272,\n",
" 'family': 273,\n",
" 'side': 274,\n",
" 'women': 275,\n",
" 'held': 276,\n",
" 'based': 277,\n",
" 'south': 278,\n",
" 'special': 279,\n",
" 'required': 280,\n",
" 'came': 281,\n",
" 'thought': 282,\n",
" 'five': 283,\n",
" 'always': 284,\n",
" 'himself': 285,\n",
" 'air': 286,\n",
" 'known': 287,\n",
" 'head': 288,\n",
" 'either': 289,\n",
" 'property': 290,\n",
" 'cost': 291,\n",
" 'rather': 292,\n",
" 'bill': 293,\n",
" 'put': 294,\n",
" 'human': 295,\n",
" 'figure': 296,\n",
" 'results': 297,\n",
" 'level': 298,\n",
" 'conditions': 299,\n",
" 'full': 300,\n",
" 'book': 301,\n",
" 'available': 302,\n",
" 'early': 303,\n",
" 'matter': 304,\n",
" 'common': 305,\n",
" 'light': 306,\n",
" 'let': 307,\n",
" 'society': 308,\n",
" 'body': 309,\n",
" 'international': 310,\n",
" 'including': 311,\n",
" 'free': 312,\n",
" 'evidence': 313,\n",
" 'better': 314,\n",
" 'type': 315,\n",
" 'provided': 316,\n",
" 'due': 317,\n",
" 'next': 318,\n",
" 'production': 319,\n",
" 'once': 320,\n",
" 'done': 321,\n",
" 'making': 322,\n",
" 'least': 323,\n",
" 'support': 324,\n",
" 'north': 325,\n",
" 'later': 326,\n",
" 'using': 327,\n",
" 'things': 328,\n",
" 'economic': 329,\n",
" 'chapter': 330,\n",
" 'various': 331,\n",
" 'why': 332,\n",
" 'white': 333,\n",
" 'going': 334,\n",
" 'commission': 335,\n",
" 'federal': 336,\n",
" 'away': 337,\n",
" 'field': 338,\n",
" 'nature': 339,\n",
" 'policy': 340,\n",
" 'become': 341,\n",
" 'political': 342,\n",
" 'increase': 343,\n",
" 'around': 344,\n",
" 'age': 345,\n",
" 'want': 346,\n",
" 'low': 347,\n",
" 'trade': 348,\n",
" 'half': 349,\n",
" 'position': 350,\n",
" 'young': 351,\n",
" 'money': 352,\n",
" 'percent': 353,\n",
" 'cent': 354,\n",
" 'class': 355,\n",
" 'words': 356,\n",
" 'view': 357,\n",
" 'provide': 358,\n",
" 'seen': 359,\n",
" 'show': 360,\n",
" 'district': 361,\n",
" 'party': 362,\n",
" 'analysis': 363,\n",
" 'care': 364,\n",
" 'june': 365,\n",
" 'foreign': 366,\n",
" 'shown': 367,\n",
" 'received': 368,\n",
" 'management': 369,\n",
" 'third': 370,\n",
" 'took': 371,\n",
" 'something': 372,\n",
" 'tax': 373,\n",
" 'account': 374,\n",
" 'problem': 375,\n",
" 'almost': 376,\n",
" 'west': 377,\n",
" 'nothing': 378,\n",
" 'together': 379,\n",
" 'individual': 380,\n",
" 'open': 381,\n",
" 'material': 382,\n",
" 'paper': 383,\n",
" 'feet': 384,\n",
" 'force': 385,\n",
" 'association': 386,\n",
" 'purpose': 387,\n",
" 'terms': 388,\n",
" 'method': 389,\n",
" 'help': 390,\n",
" 'real': 391,\n",
" 'ever': 392,\n",
" 'already': 393,\n",
" 'along': 394,\n",
" 'went': 395,\n",
" 'particular': 396,\n",
" 'energy': 397,\n",
" 'secretary': 398,\n",
" 'date': 399,\n",
" 'price': 400,\n",
" 'short': 401,\n",
" 'true': 402,\n",
" 'street': 403,\n",
" 'building': 404,\n",
" 'room': 405,\n",
" 'market': 406,\n",
" 'look': 407,\n",
" 'similar': 408,\n",
" 'industry': 409,\n",
" 'bank': 410,\n",
" 'according': 411,\n",
" 'itself': 412,\n",
" 'application': 413,\n",
" 'current': 414,\n",
" 'read': 415,\n",
" 'press': 416,\n",
" 'community': 417,\n",
" 'plan': 418,\n",
" 'whose': 419,\n",
" 'major': 420,\n",
" 'considered': 421,\n",
" 'mind': 422,\n",
" 'union': 423,\n",
" 'cause': 424,\n",
" 'able': 425,\n",
" 'surface': 426,\n",
" 'face': 427,\n",
" 'river': 428,\n",
" 'council': 429,\n",
" 'income': 430,\n",
" 'july': 431,\n",
" 'near': 432,\n",
" 'experience': 433,\n",
" 'non': 434,\n",
" 'paid': 435,\n",
" 'pay': 436,\n",
" 'reason': 437,\n",
" 'themselves': 438,\n",
" 'asked': 439,\n",
" 'march': 440,\n",
" 'king': 441,\n",
" 'higher': 442,\n",
" 'single': 443,\n",
" 'average': 444,\n",
" 'father': 445,\n",
" 'note': 446,\n",
" 'treatment': 447,\n",
" 'love': 448,\n",
" 'black': 449,\n",
" 'knowledge': 450,\n",
" 'enough': 451,\n",
" 'future': 452,\n",
" 'kind': 453,\n",
" 'lower': 454,\n",
" 'authority': 455,\n",
" 'past': 456,\n",
" 'natural': 457,\n",
" 'six': 458,\n",
" 'food': 459,\n",
" 'working': 460,\n",
" 'central': 461,\n",
" 'college': 462,\n",
" 'self': 463,\n",
" 'products': 464,\n",
" 'model': 465,\n",
" 'brought': 466,\n",
" 'greater': 467,\n",
" 'test': 468,\n",
" 'nor': 469,\n",
" 'students': 470,\n",
" 'private': 471,\n",
" 'construction': 472,\n",
" 'perhaps': 473,\n",
" 'ground': 474,\n",
" 'sir': 475,\n",
" 'basis': 476,\n",
" 'months': 477,\n",
" 'growth': 478,\n",
" 'increased': 479,\n",
" 'east': 480,\n",
" 'language': 481,\n",
" 'rule': 482,\n",
" 'continued': 483,\n",
" 'quite': 484,\n",
" 'except': 485,\n",
" 'series': 486,\n",
" 'practice': 487,\n",
" 'night': 488,\n",
" 'eyes': 489,\n",
" 'oil': 490,\n",
" 'art': 491,\n",
" 'told': 492,\n",
" 'especially': 493,\n",
" 'population': 494,\n",
" 'science': 495,\n",
" 'whom': 496,\n",
" 'obtained': 497,\n",
" 'capital': 498,\n",
" 'include': 499,\n",
" 'generally': 500,\n",
" 'meeting': 501,\n",
" 'specific': 502,\n",
" 'described': 503,\n",
" 'believe': 504,\n",
" 'review': 505,\n",
" 'issue': 506,\n",
" 'respect': 507,\n",
" 'contract': 508,\n",
" 'became': 509,\n",
" 'medical': 510,\n",
" 'road': 511,\n",
" 'got': 512,\n",
" 'clear': 513,\n",
" 'main': 514,\n",
" 'labor': 515,\n",
" 'operation': 516,\n",
" 'size': 517,\n",
" 'below': 518,\n",
" 'hours': 519,\n",
" 'sense': 520,\n",
" 'addition': 521,\n",
" 'probably': 522,\n",
" 'century': 523,\n",
" 'personal': 524,\n",
" 'plant': 525,\n",
" 'training': 526,\n",
" 'design': 527,\n",
" 'statement': 528,\n",
" 'structure': 529,\n",
" 'project': 530,\n",
" 'million': 531,\n",
" 'usually': 532,\n",
" 'range': 533,\n",
" 'call': 534,\n",
" 'mother': 535,\n",
" 'seems': 536,\n",
" 'standard': 537,\n",
" 'return': 538,\n",
" 'title': 539,\n",
" 'established': 540,\n",
" 'keep': 541,\n",
" 'space': 542,\n",
" 'annual': 543,\n",
" 'record': 544,\n",
" 'close': 545,\n",
" 'april': 546,\n",
" 'complete': 547,\n",
" 'page': 548,\n",
" 'heart': 549,\n",
" 'fig': 550,\n",
" 'quality': 551,\n",
" 'gas': 552,\n",
" 'letter': 553,\n",
" 'stock': 554,\n",
" 'gave': 555,\n",
" 'related': 556,\n",
" 'administration': 557,\n",
" 'activities': 558,\n",
" 'theory': 559,\n",
" 'town': 560,\n",
" 'equipment': 561,\n",
" 'soon': 562,\n",
" 'decision': 563,\n",
" 'pressure': 564,\n",
" 'written': 565,\n",
" 'corporation': 566,\n",
" 'tell': 567,\n",
" 'agreement': 568,\n",
" 'reported': 569,\n",
" 'attention': 570,\n",
" 'fire': 571,\n",
" 'direct': 572,\n",
" 'saw': 573,\n",
" 'published': 574,\n",
" 'temperature': 575,\n",
" 'species': 576,\n",
" 'really': 577,\n",
" 'function': 578,\n",
" 'military': 579,\n",
" 'proposed': 580,\n",
" 'january': 581,\n",
" 'additional': 582,\n",
" 'late': 583,\n",
" 'opinion': 584,\n",
" 'loss': 585,\n",
" 'limited': 586,\n",
" 'source': 587,\n",
" 'article': 588,\n",
" 'notice': 589,\n",
" 'security': 590,\n",
" 'organization': 591,\n",
" 'financial': 592,\n",
" 'follows': 593,\n",
" 'miles': 594,\n",
" 'chief': 595,\n",
" 'distribution': 596,\n",
" 'sometimes': 597,\n",
" 'insurance': 598,\n",
" 'son': 599,\n",
" 'strong': 600,\n",
" 'length': 601,\n",
" 'original': 602,\n",
" 'yes': 603,\n",
" 'effective': 604,\n",
" 'defendant': 605,\n",
" 'living': 606,\n",
" 'december': 607,\n",
" 'character': 608,\n",
" 'began': 609,\n",
" 'carried': 610,\n",
" 'supply': 611,\n",
" 'blood': 612,\n",
" 'taking': 613,\n",
" 'manner': 614,\n",
" 'journal': 615,\n",
" 'hundred': 616,\n",
" 'red': 617,\n",
" 'developed': 618,\n",
" 'performance': 619,\n",
" 'situation': 620,\n",
" 'felt': 621,\n",
" 'workers': 622,\n",
" 'volume': 623,\n",
" 'presented': 624,\n",
" 'knew': 625,\n",
" 'answer': 626,\n",
" 'resources': 627,\n",
" 'industrial': 628,\n",
" 'twenty': 629,\n",
" 'sent': 630,\n",
" 'looked': 631,\n",
" 'library': 632,\n",
" 'added': 633,\n",
" 'passed': 634,\n",
" 'ten': 635,\n",
" 'sea': 636,\n",
" 'applied': 637,\n",
" 'included': 638,\n",
" 'physical': 639,\n",
" 'across': 640,\n",
" 'army': 641,\n",
" 'toward': 642,\n",
" 'produced': 643,\n",
" 'placed': 644,\n",
" 'role': 645,\n",
" 'october': 646,\n",
" 'final': 647,\n",
" 'approach': 648,\n",
" 'provisions': 649,\n",
" 'leave': 650,\n",
" 'director': 651,\n",
" 'employment': 652,\n",
" 'anything': 653,\n",
" 'particularly': 654,\n",
" 'hard': 655,\n",
" 'outside': 656,\n",
" 'week': 657,\n",
" 'feel': 658,\n",
" 'charge': 659,\n",
" 'indeed': 660,\n",
" 'degree': 661,\n",
" 'reference': 662,\n",
" 'requirements': 663,\n",
" 'september': 664,\n",
" 'today': 665,\n",
" 'western': 666,\n",
" 'influence': 667,\n",
" 'unit': 668,\n",
" 'solution': 669,\n",
" 'chairman': 670,\n",
" 'legal': 671,\n",
" 'motion': 672,\n",
" 'region': 673,\n",
" 'idea': 674,\n",
" 'list': 675,\n",
" 'judgment': 676,\n",
" 'determined': 677,\n",
" 'poor': 678,\n",
" 'disease': 679,\n",
" 'civil': 680,\n",
" 'turn': 681,\n",
" 'modern': 682,\n",
" 'normal': 683,\n",
" 'appear': 684,\n",
" 'employees': 685,\n",
" 'latter': 686,\n",
" 'heard': 687,\n",
" 'top': 688,\n",
" 'sure': 689,\n",
" 'moment': 690,\n",
" 'code': 691,\n",
" 'wife': 692,\n",
" 'post': 693,\n",
" 'difficult': 694,\n",
" 'recent': 695,\n",
" 'extent': 696,\n",
" 'longer': 697,\n",
" 'story': 698,\n",
" 'meet': 699,\n",
" 'officers': 700,\n",
" 'patients': 701,\n",
" 'front': 702,\n",
" 'doing': 703,\n",
" 'staff': 704,\n",
" 'august': 705,\n",
" 'needed': 706,\n",
" 'involved': 707,\n",
" 'likely': 708,\n",
" 'former': 709,\n",
" 'run': 710,\n",
" 'author': 711,\n",
" 'middle': 712,\n",
" 'turned': 713,\n",
" 'agency': 714,\n",
" 'reading': 715,\n",
" 'beginning': 716,\n",
" 'duty': 717,\n",
" 'movement': 718,\n",
" 'alone': 719,\n",
" 'beyond': 720,\n",
" 'fine': 721,\n",
" 'base': 722,\n",
" 'relations': 723,\n",
" 'simple': 724,\n",
" 'consider': 725,\n",
" 'proper': 726,\n",
" 'instead': 727,\n",
" 'significant': 728,\n",
" 'appears': 729,\n",
" 'equal': 730,\n",
" 'lost': 731,\n",
" 'followed': 732,\n",
" 'hope': 733,\n",
" 'cut': 734,\n",
" 'unless': 735,\n",
" 'nearly': 736,\n",
" 'claim': 737,\n",
" 'associated': 738,\n",
" 'expected': 739,\n",
" 'difference': 740,\n",
" 'funds': 741,\n",
" 'direction': 742,\n",
" 'cross': 743,\n",
" 'live': 744,\n",
" 'finally': 745,\n",
" 'weight': 746,\n",
" 'lead': 747,\n",
" 'trial': 748,\n",
" 'justice': 749,\n",
" 'factors': 750,\n",
" 'response': 751,\n",
" 'cells': 752,\n",
" 'earth': 753,\n",
" 'rest': 754,\n",
" 'bring': 755,\n",
" 'trust': 756,\n",
" 'observed': 757,\n",
" 'behind': 758,\n",
" 'job': 759,\n",
" 'door': 760,\n",
" 'understand': 761,\n",
" 'acid': 762,\n",
" 'hold': 763,\n",
" 'technology': 764,\n",
" 'wide': 765,\n",
" 'protection': 766,\n",
" 'basic': 767,\n",
" 'november': 768,\n",
" 'seemed': 769,\n",
" 'throughout': 770,\n",
" 'importance': 771,\n",
" 'sales': 772,\n",
" 'stated': 773,\n",
" 'address': 774,\n",
" 'potential': 775,\n",
" 'payment': 776,\n",
" 'prior': 777,\n",
" 'discussion': 778,\n",
" 'conference': 779,\n",
" 'writing': 780,\n",
" 'stage': 781,\n",
" 'fall': 782,\n",
" 'iron': 783,\n",
" 'play': 784,\n",
" 'ask': 785,\n",
" 'relationship': 786,\n",
" 'towards': 787,\n",
" 'regard': 788,\n",
" 'referred': 789,\n",
" 'flow': 790,\n",
" 'consideration': 791,\n",
" 'hospital': 792,\n",
" 'seem': 793,\n",
" 'february': 794,\n",
" 'soil': 795,\n",
" 'morning': 796,\n",
" 'commercial': 797,\n",
" 'planning': 798,\n",
" 'provides': 799,\n",
" 'appropriate': 800,\n",
" 'technical': 801,\n",
" 'demand': 802,\n",
" 'sufficient': 803,\n",
" 'principal': 804,\n",
" 'credit': 805,\n",
" 'peace': 806,\n",
" 'previous': 807,\n",
" 'object': 808,\n",
" 'kept': 809,\n",
" 'sound': 810,\n",
" 'wanted': 811,\n",
" 'looking': 812,\n",
" 'entire': 813,\n",
" 'plaintiff': 814,\n",
" 'heat': 815,\n",
" 'otherwise': 816,\n",
" 'judge': 817,\n",
" 'capacity': 818,\n",
" 'brown': 819,\n",
" 'music': 820,\n",
" 'risk': 821,\n",
" 'box': 822,\n",
" 'exchange': 823,\n",
" 'produce': 824,\n",
" 'station': 825,\n",
" 'big': 826,\n",
" 'primary': 827,\n",
" 'institute': 828,\n",
" 'mentioned': 829,\n",
" 'prepared': 830,\n",
" 'spirit': 831,\n",
" 'allowed': 832,\n",
" 'site': 833,\n",
" 'green': 834,\n",
" 'directly': 835,\n",
" 'text': 836,\n",
" 'friends': 837,\n",
" 'presence': 838,\n",
" 'survey': 839,\n",
" 'determine': 840,\n",
" 'car': 841,\n",
" 'larger': 842,\n",
" 'deep': 843,\n",
" 'simply': 844,\n",
" 'immediately': 845,\n",
" 'distance': 846,\n",
" 'coming': 847,\n",
" 'seven': 848,\n",
" 'steel': 849,\n",
" 'existing': 850,\n",
" 'clearly': 851,\n",
" 'actual': 852,\n",
" 'born': 853,\n",
" 'learning': 854,\n",
" 'voice': 855,\n",
" 'earlier': 856,\n",
" 'circumstances': 857,\n",
" 'safety': 858,\n",
" 'ago': 859,\n",
" 'issued': 860,\n",
" 'upper': 861,\n",
" 'require': 862,\n",
" 'scale': 863,\n",
" 'island': 864,\n",
" 'culture': 865,\n",
" 'employed': 866,\n",
" 'eight': 867,\n",
" 'estate': 868,\n",
" 'portion': 869,\n",
" 'deal': 870,\n",
" 'share': 871,\n",
" 'actually': 872,\n",
" 'aid': 873,\n",
" 'engineering': 874,\n",
" 'continue': 875,\n",
" 'formed': 876,\n",
" 'agricultural': 877,\n",
" 'entitled': 878,\n",
" 'mass': 879,\n",
" 'truth': 880,\n",
" 'giving': 881,\n",
" 'met': 882,\n",
" 'built': 883,\n",
" 'content': 884,\n",
" 'connection': 885,\n",
" 'assistance': 886,\n",
" 'coal': 887,\n",
" 'progress': 888,\n",
" 'receive': 889,\n",
" 'active': 890,\n",
" 'nation': 891,\n",
" 'contact': 892,\n",
" 'amendment': 893,\n",
" 'net': 894,\n",
" 'wall': 895,\n",
" 'farm': 896,\n",
" 'understanding': 897,\n",
" 'strength': 898,\n",
" 'minutes': 899,\n",
" 'move': 900,\n",
" 'elements': 901,\n",
" 'concerned': 902,\n",
" 'regulations': 903,\n",
" 'step': 904,\n",
" 'literature': 905,\n",
" 'opportunity': 906,\n",
" 'investment': 907,\n",
" 'led': 908,\n",
" 'reduced': 909,\n",
" 'follow': 910,\n",
" 'facilities': 911,\n",
" 'benefit': 912,\n",
" 'compared': 913,\n",
" 'reached': 914,\n",
" 'religious': 915,\n",
" 'measure': 916,\n",
" 'meaning': 917,\n",
" 'considerable': 918,\n",
" 'relative': 919,\n",
" 'electric': 920,\n",
" 'joint': 921,\n",
" 'certainly': 922,\n",
" 'failure': 923,\n",
" 'apply': 924,\n",
" 'appeal': 925,\n",
" 'separate': 926,\n",
" 'balance': 927,\n",
" 'died': 928,\n",
" 'operating': 929,\n",
" 'includes': 930,\n",
" 'independent': 931,\n",
" 'defined': 932,\n",
" 'forward': 933,\n",
" 'doubt': 934,\n",
" 'none': 935,\n",
" 'master': 936,\n",
" 'chemical': 937,\n",
" 'success': 938,\n",
" 'environment': 939,\n",
" 'everything': 940,\n",
" 'transfer': 941,\n",
" 'news': 942,\n",
" 'gold': 943,\n",
" 'thousand': 944,\n",
" 'key': 945,\n",
" 'examination': 946,\n",
" 'fully': 947,\n",
" 'description': 948,\n",
" 'teachers': 949,\n",
" 'lake': 950,\n",
" 'status': 951,\n",
" 'fair': 952,\n",
" 'affairs': 953,\n",
" 'round': 954,\n",
" 'procedure': 955,\n",
" 'covered': 956,\n",
" 'daily': 957,\n",
" 'collection': 958,\n",
" 'maximum': 959,\n",
" 'pass': 960,\n",
" 'lot': 961,\n",
" 'resolution': 962,\n",
" 'adopted': 963,\n",
" 'principles': 964,\n",
" 'fixed': 965,\n",
" 'police': 966,\n",
" 'machine': 967,\n",
" 'appeared': 968,\n",
" 'becomes': 969,\n",
" 'moved': 970,\n",
" 'phase': 971,\n",
" 'caused': 972,\n",
" 'request': 973,\n",
" 'stand': 974,\n",
" 'else': 975,\n",
" 'executive': 976,\n",
" 'institutions': 977,\n",
" 'neither': 978,\n",
" 'heavy': 979,\n",
" 'computer': 980,\n",
" 'senator': 981,\n",
" 'wood': 982,\n",
" 'environmental': 983,\n",
" 'dark': 984,\n",
" 'professor': 985,\n",
" 'access': 986,\n",
" 'official': 987,\n",
" 'hear': 988,\n",
" 'spring': 989,\n",
" 'complex': 990,\n",
" 'allow': 991,\n",
" 'sum': 992,\n",
" 'cover': 993,\n",
" 'proceedings': 994,\n",
" 'write': 995,\n",
" 'discussed': 996,\n",
" 'events': 997,\n",
" 'started': 998,\n",
" 'internal': 999,\n",
" 'ability': 1000,\n",
" ...}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_wordlist"
]
2023-02-11 17:04:16 -05:00
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a0d177b-3499-42fb-8091-29547567d69a",
"metadata": {},
"outputs": [],
"source": []
2023-02-09 00:08:47 -05:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}