{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "0b00342f-7b19-49cc-bc6c-21019f8cc7dc", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n", "Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n", "Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n", "Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n", "Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n", "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n", "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] } ], "source": [ "try:\n", " _initialized\n", "except:\n", " !pip install nltk odfpy\n", " import nltk\n", " \n", " nltk.download(\"wordnet\")\n", " _initialized=True\n", " \n", "from nltk.stem.wordnet import WordNetLemmatizer\n", "import pandas as pd\n", "import gzip\n", "import re" ] }, { "cell_type": "code", "execution_count": 2, "id": "985883de-8049-4f81-acd9-34e1abcd4070", "metadata": { "tags": [] }, "outputs": [], "source": [ "def get_lines(filename):\n", " with gzip.open(filename, 'r') as f:\n", " ret = []\n", " for l in f:\n", " if len(ret) > 30_000:\n", " return ret\n", " ret.append(str(l).lower())\n", " return ret\n", " \n", "WORDLIST_SIZE = 8192 + 3\n", "lemmatizer = WordNetLemmatizer()\n", "word_re = re.compile(r\"^[A-Za-z]+$\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "926d0d84-0d7e-4939-b87f-1a170f870a8f", "metadata": { "tags": [] }, "outputs": [], "source": [ "annotated_words=pd.read_excel(\"annotated_words.ods\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "8b0d26e4-051c-4669-b566-bbd5ddbbe02b", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "['a', 'as', 'it', 'was', 'i', 'has', 'so', 'its', 's', 'p']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n", "excluded_words[0:10]" ] }, { "cell_type": "code", "execution_count": 5, "id": "2eea14b2-82bf-4353-8982-76a6c7f46d22", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "[('be', 'bee'),\n", " ('by', 'bye'),\n", " ('per', 'purr'),\n", " ('sense', 'cent'),\n", " ('died', 'dyed'),\n", " ('cents', 'sense'),\n", " ('yellow', 'hello'),\n", " ('corps', 'core'),\n", " ('ore', 'oar'),\n", " ('ore', ' or'),\n", " ('vary', 'very'),\n", " ('com', 'calm'),\n", " ('filing', 'filling'),\n", " ('fax', 'facts'),\n", " ('favour', 'favor'),\n", " ('theatre', 'theater'),\n", " ('par', 'parse'),\n", " ('honour', 'honor'),\n", " ('harry', 'hairy'),\n", " ('brings', 'bring'),\n", " ('organisation', 'organization'),\n", " ('simultaneously', 'simultaneous'),\n", " ('aluminum', 'aluminium'),\n", " ('knight', 'night'),\n", " ('electronics', 'electronic'),\n", " ('senses', 'cent'),\n", " ('organisations', 'organization'),\n", " ('fortunately', 'fortunate'),\n", " ('corp', 'core'),\n", " ('chile', 'chilly'),\n", " ('chile', ' chili'),\n", " ('owe', 'oh'),\n", " ('capitol', 'capital'),\n", " ('weary', 'wary'),\n", " ('berry', 'barry'),\n", " ('lecturer', 'lecture'),\n", " ('weigh', 'way'),\n", " ('aluminium', 'aluminum'),\n", " ('isle', 'aisle'),\n", " ('boulder', 'bolder'),\n", " ('blew', 'blue'),\n", " ('reformed', 'reform'),\n", " ('scent', 'cent'),\n", " ('ads', 'adds'),\n", " ('honours', 'honors'),\n", " ('bot', 'bought'),\n", " ('dew', 'do'),\n", " ('dew', ' due'),\n", " ('theatres', 'theater'),\n", " ('thru', 'through'),\n", " ('sensed', 'cent'),\n", " ('monies', 'moneys'),\n", " ('cue', 'queue'),\n", " ('hairy', 'harry'),\n", " ('weighs', 'way'),\n", " ('hem', 'him'),\n", " ('nun', 'none'),\n", " ('organisational', 'organizational'),\n", " ('grate', 'great'),\n", " ('dessert', 'desert'),\n", " ('aux', 'ox'),\n", " ('rap', 'wrap'),\n", " ('filings', 'filling'),\n", " ('pars', 'parse'),\n", " ('dazed', 'day'),\n", " ('scents', 'cent'),\n", " ('daze', 'day'),\n", " ('four', 'for')]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n", "\n", "custom_maps = [\n", " (m[1][\"word\"].lower(), mapping.lower())\n", " for m in custom_maps.iterrows()\n", " for mapping in m[1][\"maps_to\"]\n", "]\n", "custom_maps" ] }, { "cell_type": "code", "execution_count": 6, "id": "8bdfd108-bf43-4c0f-bc5c-f91925da753f", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Start parsing the wordlist\n", "all_words = get_lines(\"frequency-all.txt.gz\")\n", "\n", "# Delete header line\n", "all_words = all_words[1:]\n", "\n", "# Get only the word (fixed width)\n", "all_words = [w[13:36].strip() for w in all_words]\n", "\n", "# Remove special characters\n", "all_words = [w for w in all_words if word_re.search(w)]\n", "\n", "# Remove all removed words\n", "all_words = [w for w in all_words if w not in excluded_words]\n", "\n", "# Add all custom mappings\n", "for m in list(sum(custom_maps, ())):\n", " if m[0] not in all_words:\n", " all_words.append(m[0])\n", " if m[1] not in all_words:\n", " all_words.append(m[1])" ] }, { "cell_type": "code", "execution_count": 7, "id": "e42f2b56-98b3-4465-95be-812d8584b511", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "['the',\n", " 'of',\n", " 'and',\n", " 'to',\n", " 'in',\n", " 'is',\n", " 'that',\n", " 'for',\n", " 'be',\n", " 'by',\n", " 'with',\n", " 'on',\n", " 'not',\n", " 'or',\n", " 'this',\n", " 'are',\n", " 'at',\n", " 'from',\n", " 'he',\n", " 'which',\n", " 'his',\n", " 'have',\n", " 'an',\n", " 'but',\n", " 'you',\n", " 'they',\n", " 'were',\n", " 'had',\n", " 'we',\n", " 'all',\n", " 'one',\n", " 'their',\n", " 'been',\n", " 'will',\n", " 'there',\n", " 'can',\n", " 'if',\n", " 'other',\n", " 'would',\n", " 'no',\n", " 'her',\n", " 'may',\n", " 'more',\n", " 'when',\n", " 'who',\n", " 'such',\n", " 'these',\n", " 'any',\n", " 'she',\n", " 'new',\n", " 'time',\n", " 'than',\n", " 'do',\n", " 'some',\n", " 'what',\n", " 'only',\n", " 'into',\n", " 'them',\n", " 'two',\n", " 'also',\n", " 'about',\n", " 'out',\n", " 'him',\n", " 'my',\n", " 'said',\n", " 'up',\n", " 'our',\n", " 'first',\n", " 'should',\n", " 'under',\n", " 'made',\n", " 'state',\n", " 'see',\n", " 'after',\n", " 'could',\n", " 'then',\n", " 'me',\n", " 'most',\n", " 'over',\n", " 'very',\n", " 'your',\n", " 'between',\n", " 'where',\n", " 'now',\n", " 'shall',\n", " 'work',\n", " 'those',\n", " 'same',\n", " 'well',\n", " 'each',\n", " 'many',\n", " 'being',\n", " 'years',\n", " 'did',\n", " 'year',\n", " 'through',\n", " 'must',\n", " 'upon',\n", " 'before',\n", " 'like',\n", " 'use',\n", " 'part',\n", " 'general',\n", " 'people',\n", " 'because',\n", " 'used',\n", " 'how',\n", " 'even',\n", " 'much',\n", " 'states',\n", " 'during',\n", " 'both',\n", " 'case',\n", " 'three',\n", " 'number',\n", " 'make',\n", " 'per',\n", " 'great',\n", " 'act',\n", " 'way',\n", " 'life',\n", " 'good',\n", " 'day',\n", " 'public',\n", " 'man',\n", " 'however',\n", " 'system',\n", " 'water',\n", " 'without',\n", " 'government',\n", " 'while',\n", " 'long',\n", " 'order',\n", " 'law',\n", " 'section',\n", " 'court',\n", " 'high',\n", " 'right',\n", " 'own',\n", " 'found',\n", " 'united',\n", " 'just',\n", " 'here',\n", " 'against',\n", " 'world',\n", " 'does',\n", " 'company',\n", " 'within',\n", " 'given',\n", " 'service',\n", " 'house',\n", " 'another',\n", " 'power',\n", " 'place',\n", " 'know',\n", " 'little',\n", " 'down',\n", " 'present',\n", " 'every',\n", " 'national',\n", " 'back',\n", " 'take',\n", " 'information',\n", " 'men',\n", " 'since',\n", " 'might',\n", " 'small',\n", " 'large',\n", " 'school',\n", " 'following',\n", " 'still',\n", " 'less',\n", " 'last',\n", " 'city',\n", " 'second',\n", " 'development',\n", " 'different',\n", " 'university',\n", " 'old',\n", " 'form',\n", " 'point',\n", " 'total',\n", " 'data',\n", " 'too',\n", " 'committee',\n", " 'report',\n", " 'business',\n", " 'think',\n", " 'end',\n", " 'get',\n", " 'set',\n", " 'research',\n", " 'say',\n", " 'come',\n", " 'country',\n", " 'never',\n", " 'fact',\n", " 'go',\n", " 'control',\n", " 'thus',\n", " 'having',\n", " 'value',\n", " 'social',\n", " 'department',\n", " 'few',\n", " 'above',\n", " 'important',\n", " 'interest',\n", " 'study',\n", " 'off',\n", " 'area',\n", " 'means',\n", " 'office',\n", " 'group',\n", " 'give',\n", " 'again',\n", " 'war',\n", " 'whether',\n", " 'question',\n", " 'called',\n", " 'period',\n", " 'line',\n", " 'land',\n", " 'four',\n", " 'among',\n", " 'table',\n", " 'board',\n", " 'until',\n", " 'hand',\n", " 'taken',\n", " 'need',\n", " 'education',\n", " 'certain',\n", " 'county',\n", " 'action',\n", " 'several',\n", " 'am',\n", " 'course',\n", " 'cases',\n", " 'far',\n", " 'effect',\n", " 'possible',\n", " 'though',\n", " 'left',\n", " 'further',\n", " 'home',\n", " 'days',\n", " 'person',\n", " 'health',\n", " 'amount',\n", " 'members',\n", " 'subject',\n", " 'yet',\n", " 'program',\n", " 'therefore',\n", " 'process',\n", " 'services',\n", " 'rate',\n", " 'local',\n", " 'name',\n", " 'find',\n", " 'necessary',\n", " 'often',\n", " 'others',\n", " 'whole',\n", " 'change',\n", " 'example',\n", " 'president',\n", " 'history',\n", " 'best',\n", " 'although',\n", " 'family',\n", " 'side',\n", " 'women',\n", " 'held',\n", " 'based',\n", " 'south',\n", " 'special',\n", " 'required',\n", " 'came',\n", " 'thought',\n", " 'five',\n", " 'always',\n", " 'himself',\n", " 'air',\n", " 'known',\n", " 'head',\n", " 'either',\n", " 'property',\n", " 'cost',\n", " 'rather',\n", " 'bill',\n", " 'put',\n", " 'human',\n", " 'figure',\n", " 'results',\n", " 'level',\n", " 'conditions',\n", " 'full',\n", " 'times',\n", " 'book',\n", " 'available',\n", " 'early',\n", " 'matter',\n", " 'common',\n", " 'light',\n", " 'let',\n", " 'society',\n", " 'body',\n", " 'international',\n", " 'including',\n", " 'free',\n", " 'evidence',\n", " 'better',\n", " 'type',\n", " 'provided',\n", " 'due',\n", " 'next',\n", " 'production',\n", " 'once',\n", " 'done',\n", " 'making',\n", " 'least',\n", " 'support',\n", " 'north',\n", " 'later',\n", " 'using',\n", " 'things',\n", " 'economic',\n", " 'chapter',\n", " 'various',\n", " 'why',\n", " 'white',\n", " 'going',\n", " 'commission',\n", " 'federal',\n", " 'away',\n", " 'field',\n", " 'result',\n", " 'nature',\n", " 'policy',\n", " 'become',\n", " 'political',\n", " 'increase',\n", " 'around',\n", " 'age',\n", " 'want',\n", " 'low',\n", " 'trade',\n", " 'half',\n", " 'position',\n", " 'young',\n", " 'money',\n", " 'percent',\n", " 'cent',\n", " 'class',\n", " 'words',\n", " 'view',\n", " 'provide',\n", " 'seen',\n", " 'show',\n", " 'district',\n", " 'party',\n", " 'analysis',\n", " 'care',\n", " 'june',\n", " 'foreign',\n", " 'shown',\n", " 'received',\n", " 'management',\n", " 'third',\n", " 'took',\n", " 'something',\n", " 'tax',\n", " 'account',\n", " 'problem',\n", " 'almost',\n", " 'west',\n", " 'nothing',\n", " 'together',\n", " 'individual',\n", " 'open',\n", " 'material',\n", " 'paper',\n", " 'feet',\n", " 'force',\n", " 'association',\n", " 'purpose',\n", " 'terms',\n", " 'method',\n", " 'help',\n", " 'real',\n", " 'ever',\n", " 'already',\n", " 'along',\n", " 'went',\n", " 'term',\n", " 'systems',\n", " 'member',\n", " 'particular',\n", " 'problems',\n", " 'energy',\n", " 'secretary',\n", " 'date',\n", " 'price',\n", " 'short',\n", " 'true',\n", " 'street',\n", " 'building',\n", " 'room',\n", " 'market',\n", " 'look',\n", " 'similar',\n", " 'industry',\n", " 'areas',\n", " 'bank',\n", " 'according',\n", " 'studies',\n", " 'itself',\n", " 'application',\n", " 'current',\n", " 'read',\n", " 'press',\n", " 'community',\n", " 'plan',\n", " 'whose',\n", " 'major',\n", " 'considered',\n", " 'mind',\n", " 'union',\n", " 'cause',\n", " 'able',\n", " 'surface',\n", " 'face',\n", " 'river',\n", " 'council',\n", " 'income',\n", " 'july',\n", " 'near',\n", " 'experience',\n", " 'non',\n", " 'paid',\n", " 'pay',\n", " 'reason',\n", " 'themselves',\n", " 'asked',\n", " 'march',\n", " 'king',\n", " 'higher',\n", " 'single',\n", " 'rights',\n", " 'average',\n", " 'father',\n", " 'note',\n", " 'treatment',\n", " 'love',\n", " 'changes',\n", " 'black',\n", " 'knowledge',\n", " 'enough',\n", " 'future',\n", " 'kind',\n", " 'lower',\n", " 'authority',\n", " 'past',\n", " 'natural',\n", " 'six',\n", " 'persons',\n", " 'food',\n", " 'working',\n", " 'central',\n", " 'college',\n", " 'self',\n", " 'products',\n", " 'model',\n", " 'brought',\n", " 'greater',\n", " 'countries',\n", " 'test',\n", " 'nor',\n", " 'students',\n", " 'private',\n", " 'construction',\n", " 'perhaps',\n", " 'ground',\n", " 'sir',\n", " 'basis',\n", " 'months',\n", " 'growth',\n", " 'increased',\n", " 'word',\n", " 'east',\n", " 'language',\n", " 'rule',\n", " 'continued',\n", " 'quite',\n", " 'except',\n", " 'series',\n", " 'practice',\n", " 'thing',\n", " 'night',\n", " 'works',\n", " 'eyes',\n", " 'oil',\n", " 'art',\n", " 'told',\n", " 'especially',\n", " 'population',\n", " 'science',\n", " 'whom',\n", " 'obtained',\n", " 'parts',\n", " 'capital',\n", " 'include',\n", " 'generally',\n", " 'meeting',\n", " 'specific',\n", " 'described',\n", " 'believe',\n", " 'review',\n", " 'issue',\n", " 'respect',\n", " 'contract',\n", " 'became',\n", " 'effects',\n", " 'medical',\n", " 'road',\n", " 'got',\n", " 'clear',\n", " 'main',\n", " 'labor',\n", " 'operation',\n", " 'size',\n", " 'below',\n", " 'hours',\n", " 'sense',\n", " 'addition',\n", " 'probably',\n", " 'mean',\n", " 'groups',\n", " 'century',\n", " 'personal',\n", " 'plant',\n", " 'training',\n", " 'design',\n", " 'statement',\n", " 'structure',\n", " 'project',\n", " 'million',\n", " 'usually',\n", " 'range',\n", " 'call',\n", " 'mother',\n", " 'seems',\n", " 'standard',\n", " 'return',\n", " 'title',\n", " 'established',\n", " 'keep',\n", " 'space',\n", " 'annual',\n", " 'record',\n", " 'close',\n", " 'april',\n", " 'complete',\n", " 'page',\n", " 'heart',\n", " 'says',\n", " 'fig',\n", " 'quality',\n", " 'gas',\n", " 'methods',\n", " 'letter',\n", " 'stock',\n", " 'costs',\n", " 'gave',\n", " 'related',\n", " 'administration',\n", " 'activities',\n", " 'condition',\n", " 'theory',\n", " 'town',\n", " 'equipment',\n", " 'rates',\n", " 'soon',\n", " 'decision',\n", " 'pressure',\n", " 'written',\n", " 'lines',\n", " 'corporation',\n", " 'tell',\n", " 'schools',\n", " 'agreement',\n", " 'reported',\n", " 'attention',\n", " 'materials',\n", " 'fire',\n", " 'direct',\n", " 'saw',\n", " 'published',\n", " 'temperature',\n", " 'species',\n", " 'really',\n", " 'laws',\n", " 'woman',\n", " 'function',\n", " 'military',\n", " 'proposed',\n", " 'january',\n", " 'additional',\n", " 'late',\n", " 'books',\n", " 'opinion',\n", " 'loss',\n", " 'limited',\n", " 'source',\n", " 'article',\n", " 'notice',\n", " 'security',\n", " 'organization',\n", " 'hands',\n", " 'financial',\n", " 'rules',\n", " 'follows',\n", " 'miles',\n", " 'values',\n", " 'points',\n", " 'chief',\n", " 'distribution',\n", " 'sometimes',\n", " 'insurance',\n", " 'son',\n", " 'strong',\n", " 'length',\n", " 'activity',\n", " 'original',\n", " 'forms',\n", " 'yes',\n", " 'effective',\n", " 'defendant',\n", " 'living',\n", " 'december',\n", " 'character',\n", " 'began',\n", " 'carried',\n", " 'supply',\n", " 'blood',\n", " 'taking',\n", " 'manner',\n", " 'journal',\n", " 'hundred',\n", " 'red',\n", " 'shows',\n", " 'developed',\n", " 'performance',\n", " 'situation',\n", " 'programs',\n", " 'felt',\n", " 'workers',\n", " 'volume',\n", " 'presented',\n", " 'knew',\n", " 'answer',\n", " 'resources',\n", " 'questions',\n", " 'industrial',\n", " 'needs',\n", " 'twenty',\n", " 'sent',\n", " 'looked',\n", " 'purposes',\n", " 'library',\n", " 'added',\n", " 'passed',\n", " 'ten',\n", " 'sea',\n", " 'applied',\n", " 'included',\n", " 'physical',\n", " 'across',\n", " 'army',\n", " 'toward',\n", " 'produced',\n", " 'makes',\n", " 'placed',\n", " 'role',\n", " 'october',\n", " 'final',\n", " 'approach',\n", " 'provisions',\n", " 'leave',\n", " 'director',\n", " 'employment',\n", " 'anything',\n", " 'particularly',\n", " 'hard',\n", " 'outside',\n", " 'week',\n", " 'feel',\n", " 'charge',\n", " 'indeed',\n", " 'degree',\n", " 'reference',\n", " 'requirements',\n", " 'september',\n", " 'today',\n", " 'western',\n", " 'influence',\n", " 'unit',\n", " 'solution',\n", " 'chairman',\n", " 'legal',\n", " 'motion',\n", " 'region',\n", " 'idea',\n", " 'list',\n", " 'judgment',\n", " 'determined',\n", " 'poor',\n", " 'disease',\n", " 'civil',\n", " 'turn',\n", " 'modern',\n", " 'normal',\n", " 'appear',\n", " 'employees',\n", " 'latter',\n", " 'heard',\n", " 'top',\n", " 'sure',\n", " 'moment',\n", " 'code',\n", " 'reports',\n", " 'wife',\n", " 'post',\n", " 'difficult',\n", " 'recent',\n", " 'extent',\n", " 'longer',\n", " 'story',\n", " 'meet',\n", " 'officers',\n", " 'companies',\n", " 'patients',\n", " 'front',\n", " 'doing',\n", " 'staff',\n", " 'product',\n", " 'august',\n", " 'needed',\n", " 'involved',\n", " 'likely',\n", " 'former',\n", " 'run',\n", " 'author',\n", " 'middle',\n", " 'turned',\n", " 'agency',\n", " 'reading',\n", " 'beginning',\n", " 'duty',\n", " 'movement',\n", " 'month',\n", " 'alone',\n", " 'issues',\n", " 'beyond',\n", " 'fine',\n", " 'base',\n", " 'parties',\n", " 'relations',\n", " 'simple',\n", " 'consider',\n", " 'proper',\n", " 'instead',\n", " 'significant',\n", " 'appears',\n", " 'equal',\n", " 'lost',\n", " 'followed',\n", " 'hope',\n", " 'cut',\n", " 'unless',\n", " 'nearly',\n", " 'claim',\n", " 'associated',\n", " 'expected',\n", " 'operations',\n", " 'difference',\n", " 'funds',\n", " 'direction',\n", " 'cross',\n", " 'live',\n", " 'finally',\n", " 'weight',\n", " 'lead',\n", " 'trial',\n", " 'justice',\n", " 'officer',\n", " 'factors',\n", " 'response',\n", " 'cells',\n", " 'earth',\n", " 'rest',\n", " 'fund',\n", " 'bring',\n", " 'trust',\n", " 'goods',\n", " 'observed',\n", " 'behind',\n", " 'job',\n", " 'door',\n", " 'types',\n", " 'understand',\n", " 'acid',\n", " 'hold',\n", " 'technology',\n", " 'wide',\n", " 'protection',\n", " 'basic',\n", " 'november',\n", " 'seemed',\n", " 'throughout',\n", " 'levels',\n", " 'importance',\n", " 'sales',\n", " 'sale',\n", " 'stated',\n", " 'address',\n", " 'potential',\n", " 'payment',\n", " 'prior',\n", " 'discussion',\n", " 'conference',\n", " 'writing',\n", " 'stage',\n", " 'fall',\n", " 'notes',\n", " 'iron',\n", " 'play',\n", " 'ask',\n", " 'plants',\n", " 'relationship',\n", " 'towards',\n", " 'regard',\n", " 'referred',\n", " 'patient',\n", " 'flow',\n", " 'consideration',\n", " 'hospital',\n", " 'seem',\n", " 'february',\n", " 'soil',\n", " 'morning',\n", " 'commercial',\n", " 'planning',\n", " 'provides',\n", " 'appropriate',\n", " 'technical',\n", " 'demand',\n", " 'sufficient',\n", " 'principal',\n", " 'credit',\n", " 'peace',\n", " 'previous',\n", " 'object',\n", " 'prices',\n", " 'kept',\n", " 'sound',\n", " 'wanted',\n", " 'looking',\n", " 'entire',\n", " 'plaintiff',\n", " 'heat',\n", " 'ways',\n", " 'otherwise',\n", " 'judge',\n", " 'hour',\n", " 'capacity',\n", " 'brown',\n", " 'music',\n", " 'risk',\n", " 'box',\n", " 'exchange',\n", " 'produce',\n", " 'station',\n", " 'big',\n", " 'primary',\n", " 'institute',\n", " 'mentioned',\n", " 'prepared',\n", " 'cell',\n", " 'spirit',\n", " 'allowed',\n", " 'claims',\n", " 'site',\n", " 'green',\n", " 'directly',\n", " 'text',\n", " 'friends',\n", " 'presence',\n", " 'survey',\n", " 'determine',\n", " 'car',\n", " 'larger',\n", " 'gives',\n", " 'deep',\n", " 'simply',\n", " 'immediately',\n", " 'distance',\n", " 'coming',\n", " 'seven',\n", " 'steel',\n", " 'friend',\n", " 'records',\n", " 'existing',\n", " 'clearly',\n", " 'actual',\n", " 'relation',\n", " 'born',\n", " 'learning',\n", " 'forces',\n", " 'voice',\n", " 'earlier',\n", " 'circumstances',\n", " 'safety',\n", " 'ago',\n", " 'issued',\n", " 'upper',\n", " 'require',\n", " 'scale',\n", " 'island',\n", " 'culture',\n", " 'employed',\n", " 'eight',\n", " 'estate',\n", " 'facts',\n", " 'portion',\n", " 'deal',\n", " 'share',\n", " 'actually',\n", " 'aid',\n", " 'engineering',\n", " 'continue',\n", " 'formed',\n", " 'agricultural',\n", " 'entitled',\n", " 'mass',\n", " 'truth',\n", " 'giving',\n", " 'numbers',\n", " 'places',\n", " 'met',\n", " 'built',\n", " 'content',\n", " 'connection',\n", " 'assistance',\n", " 'coal',\n", " 'progress',\n", " 'receive',\n", " 'active',\n", " 'nation',\n", " 'contact',\n", " 'amendment',\n", " 'interests',\n", " 'net',\n", " 'wall',\n", " 'standards',\n", " 'farm',\n", " 'understanding',\n", " 'strength',\n", " 'minutes',\n", " 'figures',\n", " 'move',\n", " 'elements',\n", " 'concerned',\n", " 'regulations',\n", " 'step',\n", " 'literature',\n", " 'units',\n", " 'opportunity',\n", " 'investment',\n", " 'led',\n", " 'reduced',\n", " 'follow',\n", " 'facilities',\n", " 'benefit',\n", " 'compared',\n", " 'reached',\n", " 'student',\n", " 'religious',\n", " 'measure',\n", " 'individuals',\n", " 'meaning',\n", " 'considerable',\n", " 'relative',\n", " 'electric',\n", " 'joint',\n", " 'certainly',\n", " 'failure',\n", " 'apply',\n", " ...]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_words" ] }, { "cell_type": "code", "execution_count": null, "id": "dd9e939e-7827-42f9-89be-bcfbb8bd3f52", "metadata": { "tags": [] }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 8, "id": "64b6fcd3-acf7-45da-a335-79c538963bdd", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "discus\n", "physic\n", "posse\n", "serf\n", "sens\n", "caput\n", "bos\n", "graf\n", "pant\n", "barrack\n", "auspex\n", "footstep\n", "colonist\n", "villager\n", "kilometer\n", "granule\n", "credential\n", "petal\n", "trouser\n", "shortcoming\n", "microorganism\n", "italic\n", "grandchild\n", "munition\n", "parenthesis\n", "foodstuff\n", "attache\n", "grandparent\n", "tropic\n", "kilometre\n", "congratulation\n", "fume\n", "convulsion\n", "nostril\n", "utensil\n", "cooky\n", "amenity\n", "reptile\n", "pretension\n", "sock\n", "peso\n", "mitochondrion\n", "reminiscence\n", "invader\n", "macrophage\n", "eyelid\n", "dweller\n", "bristle\n", "tenet\n", "taxon\n", "outskirt\n", "policyholder\n", "stamen\n", "horseman\n", "striker\n", "ramification\n", "tuft\n", "cultivar\n", "interrogatory\n", "bylaw\n", "bellow\n", "neoplasm\n", "insurgent\n", "chore\n", "pensioner\n", "exigency\n", "forefather\n", "atrocity\n", "dissenter\n", "corpuscle\n", "islander\n", "numeral\n", "bureaucrat\n", "classmate\n", "crossroad\n", "pitfall\n", "firework\n", "ravage\n", "broadcaster\n", "heretic\n", "appurtenance\n", "potentiality\n", "louse\n", "conspirator\n", "revers\n", "combatant\n", "conferee\n", "serviceman\n", "repercussion\n", "grader\n", "exhibitor\n", "alkaloid\n", "collaborator\n", "slipper\n", "foothill\n", "homeowner\n", "hallucination\n", "ailment\n", "crumb\n", "milligram\n", "turnip\n", "fingertip\n", "tradesman\n", "archaeologist\n", "bondholder\n", "lira\n", "emolument\n", "tailing\n", "enthusiast\n", "tubule\n", "warship\n", "speculator\n", "jobber\n", "raisin\n", "vicissitude\n", "courtier\n", "clove\n", "entrant\n", "festivity\n", "bough\n", "imago\n", "fibroblast\n", "bruise\n", "misgiving\n", "parishioner\n", "bract\n", "microbe\n", "industrialist\n", "sprout\n", "wrinkle\n", "worshipper\n", "retiree\n", "cracker\n", "negotiator\n", "pronouncement\n", "devotee\n", "sandal\n", "sepal\n", "interrelationship\n", "corticosteroid\n", "sou\n", "framer\n", "knuckle\n", "leukocyte\n", "malformation\n", "geographer\n", "fastener\n", "ruble\n", "whisker\n", "tentacle\n", "footprint\n", "ratepayer\n", "marketer\n", "refiner\n", "cilium\n", "inroad\n", "dragoon\n", "litigant\n", "kilo\n", "shipowner\n", "rudiment\n", "appointee\n", "fingerprint\n", "anther\n", "depredation\n", "stave\n", "rancher\n", "cytokine\n", "artefact\n", "freeholder\n", "churchman\n", "fungicide\n", "inequity\n", "contraindication\n", "arrhythmia\n", "functionary\n", "bandit\n", "horde\n", "spermatozoon\n", "selectman\n", "blocker\n", "inaccuracy\n", "gramme\n", "billet\n", "radiograph\n", "demonstrator\n", "amphibian\n", "mussel\n", "rafter\n", "headlight\n", "vestige\n", "loin\n", "raider\n", "crevice\n", "suitor\n", "technologist\n", "trooper\n", "globule\n", "firefighter\n", "woodcut\n", "purr\n", " or\n", "parse\n", " chili\n", "bolder\n", " due\n", "scents\n", "daze\n" ] } ], "source": [ "# Lemmatize all words (plural -> singular)\n", "lemmatize_mappings = [\n", " (w, lemmatizer.lemmatize(w)) \n", " for w in all_words\n", " # if w != lemmatizer.lemmatize(w)\n", "]\n", "\n", "# Remove all words that lemmatize to another word\n", "#all_words = [w for w in all_words if w not in ]\n", "\n", "# Add custom lemmatizations\n", "for l in custom_maps:\n", " if l in lemmatize_mappings:\n", " print(f\"Warning: {l} is already lemmatized\")\n", " else:\n", " lemmatize_mappings.append(l)\n", " \n", "lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]\n", "\n", "# Now, re-add all lematized words to the list of every word\n", "for w in sum(lemmatize_mappings, ()):\n", " if w not in all_words:\n", " print(w)\n", " all_words.append(w)\n", " \n", "lemmatize_mappings = {k: v for k, v in lemmatize_mappings}\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "8bdff9d0-f3ff-498f-952d-13f1a91bfbd5", "metadata": { "tags": [] }, "outputs": [], "source": [ "final_wordlist = []\n", "seen_lemmatizations = set()\n", "for w in all_words:\n", " lemmatized = lemmatize_mappings.get(w) or w\n", " if lemmatized in seen_lemmatizations:\n", " # The lemmatized version of this word was already seen\n", " continue\n", " else:\n", " # The lemmatized version hasn't been seen. We're good to add it\n", " final_wordlist.append([\n", " k\n", " for k\n", " in lemmatize_mappings.keys()\n", " if lemmatize_mappings[k] == lemmatized\n", " ])\n", " seen_lemmatizations.add(lemmatized)\n", "\n", " if len(final_wordlist) >= WORDLIST_SIZE:\n", " break\n", "\n", "# Now, convert it to the format (number, word)\n", "final_wordlist = [\n", " (idx, w)\n", " for idx, words in enumerate(final_wordlist)\n", " for w in words\n", "]" ] }, { "cell_type": "code", "execution_count": 11, "id": "65bd6887-613e-45ae-ac45-6ed5967b3a43", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "[(0, 'the'),\n", " (1, 'of'),\n", " (2, 'and'),\n", " (3, 'to'),\n", " (4, 'in'),\n", " (5, 'is'),\n", " (6, 'that'),\n", " (7, 'for'),\n", " (7, 'four'),\n", " (8, 'be'),\n", " (8, 'bee'),\n", " (8, 'bees'),\n", " (9, 'by'),\n", " (9, 'bye'),\n", " (10, 'with'),\n", " (11, 'on'),\n", " (12, 'not'),\n", " (13, 'or'),\n", " (14, 'this'),\n", " (15, 'are'),\n", " (16, 'at'),\n", " (17, 'from'),\n", " (18, 'he'),\n", " (19, 'which'),\n", " (20, 'his'),\n", " (21, 'have'),\n", " (22, 'an'),\n", " (23, 'but'),\n", " (24, 'you'),\n", " (25, 'they'),\n", " (26, 'were'),\n", " (27, 'had'),\n", " (28, 'we'),\n", " (29, 'all'),\n", " (30, 'one'),\n", " (30, 'ones'),\n", " (31, 'their'),\n", " (32, 'been'),\n", " (33, 'will'),\n", " (33, 'wills'),\n", " (34, 'there'),\n", " (35, 'can'),\n", " (35, 'cans'),\n", " (36, 'if'),\n", " (37, 'other'),\n", " (38, 'would'),\n", " (39, 'no'),\n", " (39, 'nos'),\n", " (40, 'her'),\n", " (41, 'may'),\n", " (42, 'more'),\n", " (42, 'mores'),\n", " (43, 'when'),\n", " (44, 'who'),\n", " (45, 'such'),\n", " (46, 'these'),\n", " (47, 'any'),\n", " (48, 'she'),\n", " (49, 'new'),\n", " (50, 'time'),\n", " (50, 'times'),\n", " (51, 'than'),\n", " (52, 'do'),\n", " (53, 'some'),\n", " (54, 'what'),\n", " (55, 'only'),\n", " (56, 'into'),\n", " (57, 'them'),\n", " (58, 'two'),\n", " (59, 'also'),\n", " (60, 'about'),\n", " (61, 'out'),\n", " (61, 'outs'),\n", " (62, 'him'),\n", " (62, 'hem'),\n", " (63, 'my'),\n", " (64, 'said'),\n", " (65, 'up'),\n", " (66, 'our'),\n", " (67, 'first'),\n", " (68, 'should'),\n", " (69, 'under'),\n", " (70, 'made'),\n", " (71, 'state'),\n", " (71, 'states'),\n", " (72, 'see'),\n", " (72, 'sees'),\n", " (73, 'after'),\n", " (74, 'could'),\n", " (75, 'then'),\n", " (76, 'me'),\n", " (77, 'most'),\n", " (78, 'over'),\n", " (79, 'very'),\n", " (79, 'vary'),\n", " (80, 'your'),\n", " (81, 'between'),\n", " (82, 'where'),\n", " (83, 'now'),\n", " (84, 'shall'),\n", " (85, 'work'),\n", " (85, 'works'),\n", " (86, 'those'),\n", " (87, 'same'),\n", " (88, 'well'),\n", " (88, 'wells'),\n", " (89, 'each'),\n", " (90, 'many'),\n", " (91, 'being'),\n", " (91, 'beings'),\n", " (92, 'years'),\n", " (92, 'year'),\n", " (93, 'did'),\n", " (94, 'through'),\n", " (94, 'thru'),\n", " (95, 'must'),\n", " (96, 'upon'),\n", " (97, 'before'),\n", " (98, 'like'),\n", " (98, 'likes'),\n", " (99, 'use'),\n", " (100, 'part'),\n", " (100, 'parts'),\n", " (101, 'general'),\n", " (101, 'generals'),\n", " (102, 'people'),\n", " (102, 'peoples'),\n", " (103, 'because'),\n", " (104, 'used'),\n", " (105, 'how'),\n", " (106, 'even'),\n", " (107, 'much'),\n", " (108, 'during'),\n", " (109, 'both'),\n", " (110, 'case'),\n", " (110, 'cases'),\n", " (111, 'three'),\n", " (112, 'number'),\n", " (112, 'numbers'),\n", " (113, 'make'),\n", " (113, 'makes'),\n", " (114, 'per'),\n", " (115, 'great'),\n", " (115, 'grate'),\n", " (116, 'act'),\n", " (116, 'acts'),\n", " (117, 'way'),\n", " (117, 'ways'),\n", " (117, 'weigh'),\n", " (117, 'weighs'),\n", " (118, 'life'),\n", " (118, 'lives'),\n", " (119, 'good'),\n", " (119, 'goods'),\n", " (120, 'day'),\n", " (120, 'days'),\n", " (120, 'dazed'),\n", " (120, 'daze'),\n", " (121, 'public'),\n", " (122, 'man'),\n", " (122, 'mans'),\n", " (123, 'however'),\n", " (124, 'system'),\n", " (124, 'systems'),\n", " (125, 'water'),\n", " (125, 'waters'),\n", " (126, 'without'),\n", " (127, 'government'),\n", " (127, 'governments'),\n", " (128, 'while'),\n", " (129, 'long'),\n", " (130, 'order'),\n", " (130, 'orders'),\n", " (131, 'law'),\n", " (131, 'laws'),\n", " (132, 'section'),\n", " (132, 'sections'),\n", " (133, 'court'),\n", " (133, 'courts'),\n", " (134, 'high'),\n", " (135, 'right'),\n", " (135, 'rights'),\n", " (136, 'own'),\n", " (137, 'found'),\n", " (138, 'united'),\n", " (139, 'just'),\n", " (140, 'here'),\n", " (141, 'against'),\n", " (142, 'world'),\n", " (142, 'worlds'),\n", " (144, 'company'),\n", " (144, 'companies'),\n", " (145, 'within'),\n", " (146, 'given'),\n", " (147, 'service'),\n", " (147, 'services'),\n", " (148, 'house'),\n", " (148, 'houses'),\n", " (149, 'another'),\n", " (150, 'power'),\n", " (150, 'powers'),\n", " (151, 'place'),\n", " (151, 'places'),\n", " (152, 'know'),\n", " (152, 'knows'),\n", " (153, 'little'),\n", " (154, 'down'),\n", " (155, 'present'),\n", " (155, 'presents'),\n", " (156, 'every'),\n", " (157, 'national'),\n", " (157, 'nationals'),\n", " (158, 'back'),\n", " (158, 'backs'),\n", " (159, 'take'),\n", " (159, 'takes'),\n", " (160, 'information'),\n", " (161, 'men'),\n", " (162, 'since'),\n", " (163, 'might'),\n", " (164, 'small'),\n", " (165, 'large'),\n", " (166, 'school'),\n", " (166, 'schools'),\n", " (167, 'following'),\n", " (168, 'still'),\n", " (170, 'last'),\n", " (170, 'lasts'),\n", " (171, 'city'),\n", " (171, 'cities'),\n", " (172, 'second'),\n", " (172, 'seconds'),\n", " (173, 'development'),\n", " (173, 'developments'),\n", " (174, 'different'),\n", " (175, 'university'),\n", " (175, 'universities'),\n", " (176, 'old'),\n", " (177, 'form'),\n", " (177, 'forms'),\n", " (178, 'point'),\n", " (178, 'points'),\n", " (179, 'total'),\n", " (179, 'totals'),\n", " (180, 'data'),\n", " (181, 'too'),\n", " (182, 'committee'),\n", " (182, 'committees'),\n", " (183, 'report'),\n", " (183, 'reports'),\n", " (184, 'business'),\n", " (184, 'businesses'),\n", " (185, 'think'),\n", " (185, 'thinks'),\n", " (186, 'end'),\n", " (186, 'ends'),\n", " (187, 'get'),\n", " (187, 'gets'),\n", " (188, 'set'),\n", " (188, 'sets'),\n", " (189, 'research'),\n", " (189, 'researches'),\n", " (190, 'say'),\n", " (190, 'says'),\n", " (191, 'come'),\n", " (191, 'comes'),\n", " (192, 'country'),\n", " (192, 'countries'),\n", " (193, 'never'),\n", " (194, 'fact'),\n", " (194, 'facts'),\n", " (195, 'go'),\n", " (195, 'goes'),\n", " (196, 'control'),\n", " (196, 'controls'),\n", " (197, 'thus'),\n", " (198, 'having'),\n", " (199, 'value'),\n", " (199, 'values'),\n", " (200, 'social'),\n", " (201, 'department'),\n", " (201, 'departments'),\n", " (202, 'few'),\n", " (203, 'above'),\n", " (204, 'important'),\n", " (205, 'interest'),\n", " (205, 'interests'),\n", " (206, 'study'),\n", " (206, 'studies'),\n", " (207, 'off'),\n", " (208, 'area'),\n", " (208, 'areas'),\n", " (209, 'means'),\n", " (209, 'mean'),\n", " (210, 'office'),\n", " (210, 'offices'),\n", " (211, 'group'),\n", " (211, 'groups'),\n", " (212, 'give'),\n", " (212, 'gives'),\n", " (213, 'again'),\n", " (214, 'war'),\n", " (214, 'wars'),\n", " (215, 'whether'),\n", " (216, 'question'),\n", " (216, 'questions'),\n", " (217, 'called'),\n", " (218, 'period'),\n", " (218, 'periods'),\n", " (219, 'line'),\n", " (219, 'lines'),\n", " (220, 'land'),\n", " (220, 'lands'),\n", " (221, 'among'),\n", " (222, 'table'),\n", " (222, 'tables'),\n", " (223, 'board'),\n", " (223, 'boards'),\n", " (224, 'until'),\n", " (225, 'hand'),\n", " (225, 'hands'),\n", " (226, 'taken'),\n", " (227, 'need'),\n", " (227, 'needs'),\n", " (228, 'education'),\n", " (229, 'certain'),\n", " (230, 'county'),\n", " (230, 'counties'),\n", " (231, 'action'),\n", " (231, 'actions'),\n", " (232, 'several'),\n", " (233, 'am'),\n", " (234, 'course'),\n", " (234, 'courses'),\n", " (235, 'far'),\n", " (236, 'effect'),\n", " (236, 'effects'),\n", " (237, 'possible'),\n", " (238, 'though'),\n", " (239, 'left'),\n", " (240, 'further'),\n", " (241, 'home'),\n", " (241, 'homes'),\n", " (242, 'person'),\n", " (242, 'persons'),\n", " (243, 'health'),\n", " (244, 'amount'),\n", " (244, 'amounts'),\n", " (245, 'members'),\n", " (245, 'member'),\n", " (246, 'subject'),\n", " (246, 'subjects'),\n", " (247, 'yet'),\n", " (248, 'program'),\n", " (248, 'programs'),\n", " (249, 'therefore'),\n", " (250, 'process'),\n", " (250, 'processes'),\n", " (251, 'rate'),\n", " (251, 'rates'),\n", " (252, 'local'),\n", " (252, 'locals'),\n", " (253, 'name'),\n", " (253, 'names'),\n", " (254, 'find'),\n", " (254, 'finds'),\n", " (255, 'necessary'),\n", " (255, 'necessaries'),\n", " (256, 'often'),\n", " (257, 'others'),\n", " (258, 'whole'),\n", " (259, 'change'),\n", " (259, 'changes'),\n", " (260, 'example'),\n", " (260, 'examples'),\n", " (261, 'president'),\n", " (262, 'history'),\n", " (262, 'histories'),\n", " (263, 'best'),\n", " (264, 'although'),\n", " (265, 'family'),\n", " (265, 'families'),\n", " (266, 'side'),\n", " (266, 'sides'),\n", " (267, 'women'),\n", " (267, 'woman'),\n", " (268, 'held'),\n", " (269, 'based'),\n", " (270, 'south'),\n", " (271, 'special'),\n", " (272, 'required'),\n", " (273, 'came'),\n", " (274, 'thought'),\n", " (274, 'thoughts'),\n", " (275, 'five'),\n", " (276, 'always'),\n", " (277, 'himself'),\n", " (278, 'air'),\n", " (278, 'airs'),\n", " (279, 'known'),\n", " (280, 'head'),\n", " (280, 'heads'),\n", " (281, 'either'),\n", " (282, 'property'),\n", " (282, 'properties'),\n", " (283, 'cost'),\n", " (283, 'costs'),\n", " (284, 'rather'),\n", " (285, 'bill'),\n", " (285, 'bills'),\n", " (286, 'put'),\n", " (286, 'puts'),\n", " (287, 'human'),\n", " (287, 'humans'),\n", " (288, 'figure'),\n", " (288, 'figures'),\n", " (289, 'results'),\n", " (289, 'result'),\n", " (290, 'level'),\n", " (290, 'levels'),\n", " (291, 'conditions'),\n", " (291, 'condition'),\n", " (292, 'full'),\n", " (293, 'book'),\n", " (293, 'books'),\n", " (294, 'available'),\n", " (295, 'early'),\n", " (296, 'matter'),\n", " (296, 'matters'),\n", " (297, 'common'),\n", " (298, 'light'),\n", " (298, 'lights'),\n", " (299, 'let'),\n", " (299, 'lets'),\n", " (300, 'society'),\n", " (300, 'societies'),\n", " (301, 'body'),\n", " (301, 'bodies'),\n", " (302, 'international'),\n", " (303, 'including'),\n", " (304, 'free'),\n", " (305, 'evidence'),\n", " (305, 'evidences'),\n", " (306, 'better'),\n", " (307, 'type'),\n", " (307, 'types'),\n", " (308, 'provided'),\n", " (309, 'due'),\n", " (309, 'dues'),\n", " (310, 'next'),\n", " (311, 'production'),\n", " (311, 'productions'),\n", " (312, 'once'),\n", " (313, 'done'),\n", " (314, 'making'),\n", " (315, 'least'),\n", " (316, 'support'),\n", " (316, 'supports'),\n", " (317, 'north'),\n", " (318, 'later'),\n", " (319, 'using'),\n", " (320, 'things'),\n", " (320, 'thing'),\n", " (321, 'economic'),\n", " (322, 'chapter'),\n", " (322, 'chapters'),\n", " (323, 'various'),\n", " (324, 'why'),\n", " (325, 'white'),\n", " (325, 'whites'),\n", " (326, 'going'),\n", " (327, 'commission'),\n", " (327, 'commissions'),\n", " (328, 'federal'),\n", " (329, 'away'),\n", " (330, 'field'),\n", " (330, 'fields'),\n", " (331, 'nature'),\n", " (331, 'natures'),\n", " (332, 'policy'),\n", " (332, 'policies'),\n", " (333, 'become'),\n", " (334, 'political'),\n", " (335, 'increase'),\n", " (335, 'increases'),\n", " (336, 'around'),\n", " (337, 'age'),\n", " (337, 'ages'),\n", " (338, 'want'),\n", " (338, 'wants'),\n", " (339, 'low'),\n", " (339, 'lows'),\n", " (340, 'trade'),\n", " (340, 'trades'),\n", " (341, 'half'),\n", " (341, 'halves'),\n", " (342, 'position'),\n", " (342, 'positions'),\n", " (343, 'young'),\n", " (344, 'money'),\n", " (344, 'moneys'),\n", " (345, 'percent'),\n", " (346, 'cent'),\n", " (346, 'sense'),\n", " (346, 'senses'),\n", " (346, 'scent'),\n", " (346, 'sensed'),\n", " (346, 'scents'),\n", " (347, 'class'),\n", " (347, 'classes'),\n", " (348, 'words'),\n", " (348, 'word'),\n", " (349, 'view'),\n", " (349, 'views'),\n", " (350, 'provide'),\n", " (351, 'seen'),\n", " (352, 'show'),\n", " (352, 'shows'),\n", " (353, 'district'),\n", " (353, 'districts'),\n", " (354, 'party'),\n", " (354, 'parties'),\n", " (355, 'analysis'),\n", " (355, 'analyses'),\n", " (356, 'care'),\n", " (356, 'cares'),\n", " (357, 'june'),\n", " (358, 'foreign'),\n", " (359, 'shown'),\n", " (360, 'received'),\n", " (361, 'management'),\n", " (362, 'third'),\n", " (362, 'thirds'),\n", " (363, 'took'),\n", " (364, 'something'),\n", " (365, 'tax'),\n", " (365, 'taxes'),\n", " (366, 'account'),\n", " (366, 'accounts'),\n", " (367, 'problem'),\n", " (367, 'problems'),\n", " (368, 'almost'),\n", " (369, 'west'),\n", " (370, 'nothing'),\n", " (371, 'together'),\n", " (372, 'individual'),\n", " (372, 'individuals'),\n", " (373, 'open'),\n", " (373, 'opens'),\n", " (374, 'material'),\n", " (374, 'materials'),\n", " (375, 'paper'),\n", " (375, 'papers'),\n", " (376, 'feet'),\n", " (376, 'foot'),\n", " (377, 'force'),\n", " (377, 'forces'),\n", " (378, 'association'),\n", " (378, 'associations'),\n", " (379, 'purpose'),\n", " (379, 'purposes'),\n", " (380, 'terms'),\n", " (380, 'term'),\n", " (381, 'method'),\n", " (381, 'methods'),\n", " (382, 'help'),\n", " (382, 'helps'),\n", " (383, 'real'),\n", " (384, 'ever'),\n", " (385, 'already'),\n", " (386, 'along'),\n", " (387, 'went'),\n", " (388, 'particular'),\n", " (388, 'particulars'),\n", " (389, 'energy'),\n", " (389, 'energies'),\n", " (390, 'secretary'),\n", " (391, 'date'),\n", " (391, 'dates'),\n", " (392, 'price'),\n", " (392, 'prices'),\n", " (393, 'short'),\n", " (393, 'shorts'),\n", " (394, 'true'),\n", " (395, 'street'),\n", " (395, 'streets'),\n", " (396, 'building'),\n", " (396, 'buildings'),\n", " (397, 'room'),\n", " (397, 'rooms'),\n", " (398, 'market'),\n", " (398, 'markets'),\n", " (399, 'look'),\n", " (399, 'looks'),\n", " (400, 'similar'),\n", " (401, 'industry'),\n", " (401, 'industries'),\n", " (402, 'bank'),\n", " (402, 'banks'),\n", " (403, 'according'),\n", " (404, 'itself'),\n", " (405, 'application'),\n", " (405, 'applications'),\n", " (406, 'current'),\n", " (406, 'currents'),\n", " (407, 'read'),\n", " (407, 'reads'),\n", " (408, 'press'),\n", " (408, 'presses'),\n", " (409, 'community'),\n", " (409, 'communities'),\n", " (410, 'plan'),\n", " (410, 'plans'),\n", " (411, 'whose'),\n", " (412, 'major'),\n", " (412, 'majors'),\n", " (413, 'considered'),\n", " (414, 'mind'),\n", " (414, 'minds'),\n", " (415, 'union'),\n", " (415, 'unions'),\n", " (416, 'cause'),\n", " (416, 'causes'),\n", " (417, 'able'),\n", " (418, 'surface'),\n", " (418, 'surfaces'),\n", " (419, 'face'),\n", " (419, 'faces'),\n", " (420, 'river'),\n", " (420, 'rivers'),\n", " (421, 'council'),\n", " (421, 'councils'),\n", " (422, 'income'),\n", " (422, 'incomes'),\n", " (423, 'july'),\n", " (424, 'near'),\n", " (425, 'experience'),\n", " (425, 'experiences'),\n", " (426, 'non'),\n", " (427, 'paid'),\n", " (428, 'pay'),\n", " (428, 'pays'),\n", " (429, 'reason'),\n", " (429, 'reasons'),\n", " (430, 'themselves'),\n", " (431, 'asked'),\n", " (432, 'march'),\n", " (432, 'marches'),\n", " (433, 'king'),\n", " (433, 'kings'),\n", " (434, 'higher'),\n", " (435, 'single'),\n", " (435, 'singles'),\n", " (436, 'average'),\n", " (436, 'averages'),\n", " (437, 'father'),\n", " (437, 'fathers'),\n", " (438, 'note'),\n", " (438, 'notes'),\n", " (439, 'treatment'),\n", " (439, 'treatments'),\n", " (440, 'love'),\n", " (440, 'loves'),\n", " (441, 'black'),\n", " (441, 'blacks'),\n", " (442, 'knowledge'),\n", " (443, 'enough'),\n", " (444, 'future'),\n", " (444, 'futures'),\n", " (445, 'kind'),\n", " (445, 'kinds'),\n", " (446, 'lower'),\n", " (446, 'lowers'),\n", " (447, 'authority'),\n", " (447, 'authorities'),\n", " (448, 'past'),\n", " (449, 'natural'),\n", " (450, 'six'),\n", " (451, 'food'),\n", " (451, 'foods'),\n", " (452, 'working'),\n", " (452, 'workings'),\n", " (453, 'central'),\n", " (454, 'college'),\n", " (454, 'colleges'),\n", " (455, 'self'),\n", " (455, 'selves'),\n", " (456, 'products'),\n", " (456, 'product'),\n", " (457, 'model'),\n", " (457, 'models'),\n", " (458, 'brought'),\n", " (459, 'greater'),\n", " (460, 'test'),\n", " (460, 'tests'),\n", " (461, 'nor'),\n", " (462, 'students'),\n", " (462, 'student'),\n", " (463, 'private'),\n", " (464, 'construction'),\n", " (464, 'constructions'),\n", " (465, 'perhaps'),\n", " (466, 'ground'),\n", " (466, 'grounds'),\n", " (467, 'sir'),\n", " (468, 'basis'),\n", " (469, 'months'),\n", " (469, 'month'),\n", " (470, 'growth'),\n", " (470, 'growths'),\n", " (471, 'increased'),\n", " (472, 'east'),\n", " (473, 'language'),\n", " (473, 'languages'),\n", " (474, 'rule'),\n", " (474, 'rules'),\n", " (475, 'continued'),\n", " (476, 'quite'),\n", " (477, 'except'),\n", " (478, 'series'),\n", " (479, 'practice'),\n", " (479, 'practices'),\n", " (480, 'night'),\n", " (480, 'knight'),\n", " (480, 'nights'),\n", " (481, 'eyes'),\n", " (481, 'eye'),\n", " (482, 'oil'),\n", " (482, 'oils'),\n", " (483, 'art'),\n", " (483, 'arts'),\n", " (484, 'told'),\n", " (485, 'especially'),\n", " (486, 'population'),\n", " (486, 'populations'),\n", " (487, 'science'),\n", " (487, 'sciences'),\n", " (488, 'whom'),\n", " (489, 'obtained'),\n", " (490, 'capital'),\n", " (490, 'capitol'),\n", " (490, 'capitals'),\n", " (491, 'include'),\n", " (492, 'generally'),\n", " (493, 'meeting'),\n", " (493, 'meetings'),\n", " (494, 'specific'),\n", " (494, 'specifics'),\n", " (495, 'described'),\n", " (496, 'believe'),\n", " (497, 'review'),\n", " (497, 'reviews'),\n", " (498, 'issue'),\n", " (498, 'issues'),\n", " (499, 'respect'),\n", " (499, 'respects'),\n", " (500, 'contract'),\n", " (500, 'contracts'),\n", " (501, 'became'),\n", " (502, 'medical'),\n", " (503, 'road'),\n", " (503, 'roads'),\n", " (504, 'got'),\n", " (505, 'clear'),\n", " (505, 'clears'),\n", " (506, 'main'),\n", " (506, 'mains'),\n", " (507, 'labor'),\n", " (507, 'labors'),\n", " (508, 'operation'),\n", " (508, 'operations'),\n", " (509, 'size'),\n", " (509, 'sizes'),\n", " (510, 'below'),\n", " (511, 'hours'),\n", " (511, 'hour'),\n", " (512, 'addition'),\n", " (512, 'additions'),\n", " (513, 'probably'),\n", " (514, 'century'),\n", " (514, 'centuries'),\n", " (515, 'personal'),\n", " (516, 'plant'),\n", " (516, 'plants'),\n", " (517, 'training'),\n", " (518, 'design'),\n", " (518, 'designs'),\n", " (519, 'statement'),\n", " (519, 'statements'),\n", " (520, 'structure'),\n", " (520, 'structures'),\n", " (521, 'project'),\n", " (521, 'projects'),\n", " (522, 'million'),\n", " (522, 'millions'),\n", " (523, 'usually'),\n", " (524, 'range'),\n", " (524, 'ranges'),\n", " (525, 'call'),\n", " (525, 'calls'),\n", " (526, 'mother'),\n", " (526, 'mothers'),\n", " (527, 'seems'),\n", " (528, 'standard'),\n", " (528, 'standards'),\n", " (529, 'return'),\n", " (529, 'returns'),\n", " (530, 'title'),\n", " (530, 'titles'),\n", " (531, 'established'),\n", " (532, 'keep'),\n", " (532, 'keeps'),\n", " (533, 'space'),\n", " (533, 'spaces'),\n", " (534, 'annual'),\n", " (535, 'record'),\n", " (535, 'records'),\n", " (536, 'close'),\n", " (536, 'closes'),\n", " (537, 'april'),\n", " (538, 'complete'),\n", " (539, 'page'),\n", " (539, 'pages'),\n", " (540, 'heart'),\n", " (540, 'hearts'),\n", " (541, 'fig'),\n", " (541, 'figs'),\n", " (542, 'quality'),\n", " (542, 'qualities'),\n", " (543, 'gas'),\n", " (543, 'gases'),\n", " (544, 'letter'),\n", " (544, 'letters'),\n", " (545, 'stock'),\n", " (545, 'stocks'),\n", " (546, 'gave'),\n", " (547, 'related'),\n", " (548, 'administration'),\n", " (548, 'administrations'),\n", " (549, 'activities'),\n", " (549, 'activity'),\n", " (550, 'theory'),\n", " (550, 'theories'),\n", " (551, 'town'),\n", " (551, 'towns'),\n", " (552, 'equipment'),\n", " (552, 'equipments'),\n", " (553, 'soon'),\n", " (554, 'decision'),\n", " (554, 'decisions'),\n", " (555, 'pressure'),\n", " (555, 'pressures'),\n", " (556, 'written'),\n", " (557, 'corporation'),\n", " (557, 'corporations'),\n", " (558, 'tell'),\n", " (558, 'tells'),\n", " (559, 'agreement'),\n", " (559, 'agreements'),\n", " (560, 'reported'),\n", " (561, 'attention'),\n", " (561, 'attentions'),\n", " (562, 'fire'),\n", " (562, 'fires'),\n", " (563, 'direct'),\n", " (564, 'saw'),\n", " (564, 'saws'),\n", " (565, 'published'),\n", " (566, 'temperature'),\n", " (566, 'temperatures'),\n", " (567, 'species'),\n", " (567, 'specie'),\n", " (568, 'really'),\n", " (569, 'function'),\n", " (569, 'functions'),\n", " (570, 'military'),\n", " (571, 'proposed'),\n", " (572, 'january'),\n", " (573, 'additional'),\n", " (574, 'late'),\n", " (575, 'opinion'),\n", " (575, 'opinions'),\n", " (576, 'loss'),\n", " (576, 'losses'),\n", " (577, 'limited'),\n", " (578, 'source'),\n", " (578, 'sources'),\n", " (579, 'article'),\n", " (579, 'articles'),\n", " (580, 'notice'),\n", " (580, 'notices'),\n", " (581, 'security'),\n", " (581, 'securities'),\n", " (582, 'organization'),\n", " (582, 'organizations'),\n", " (582, 'organisation'),\n", " (582, 'organisations'),\n", " (583, 'financial'),\n", " (584, 'follows'),\n", " (585, 'miles'),\n", " (585, 'mile'),\n", " (586, 'chief'),\n", " (586, 'chiefs'),\n", " (587, 'distribution'),\n", " (587, 'distributions'),\n", " (588, 'sometimes'),\n", " (589, 'insurance'),\n", " (590, 'son'),\n", " (590, 'sons'),\n", " (591, 'strong'),\n", " (592, 'length'),\n", " (592, 'lengths'),\n", " (593, 'original'),\n", " (593, 'originals'),\n", " (594, 'yes'),\n", " (595, 'effective'),\n", " (596, 'defendant'),\n", " (596, 'defendants'),\n", " (597, 'living'),\n", " (598, 'december'),\n", " (599, 'character'),\n", " (599, 'characters'),\n", " (600, 'began'),\n", " (601, 'carried'),\n", " (602, 'supply'),\n", " (602, 'supplies'),\n", " (603, 'blood'),\n", " (604, 'taking'),\n", " (605, 'manner'),\n", " (605, 'manners'),\n", " (606, 'journal'),\n", " (606, 'journals'),\n", " (607, 'hundred'),\n", " (607, 'hundreds'),\n", " (608, 'red'),\n", " (609, 'developed'),\n", " (610, 'performance'),\n", " (610, 'performances'),\n", " (611, 'situation'),\n", " (611, 'situations'),\n", " (612, 'felt'),\n", " (613, 'workers'),\n", " (613, 'worker'),\n", " (614, 'volume'),\n", " (614, 'volumes'),\n", " (615, 'presented'),\n", " (616, 'knew'),\n", " (617, 'answer'),\n", " (617, 'answers'),\n", " (618, 'resources'),\n", " (618, 'resource'),\n", " (619, 'industrial'),\n", " (620, 'twenty'),\n", " (620, 'twenties'),\n", " (621, 'sent'),\n", " (622, 'looked'),\n", " (623, 'library'),\n", " (623, 'libraries'),\n", " (624, 'added'),\n", " (625, 'passed'),\n", " (626, 'ten'),\n", " (626, 'tens'),\n", " (627, 'sea'),\n", " (627, 'seas'),\n", " (628, 'applied'),\n", " (629, 'included'),\n", " (630, 'physical'),\n", " (631, 'across'),\n", " (632, 'army'),\n", " (632, 'armies'),\n", " (633, 'toward'),\n", " (634, 'produced'),\n", " (635, 'placed'),\n", " (636, 'role'),\n", " (636, 'roles'),\n", " (637, 'october'),\n", " (638, 'final'),\n", " (639, 'approach'),\n", " (639, 'approaches'),\n", " (640, 'provisions'),\n", " (640, 'provision'),\n", " (641, 'leave'),\n", " (642, 'director'),\n", " (642, 'directors'),\n", " (643, 'employment'),\n", " (643, 'employments'),\n", " (644, 'anything'),\n", " (645, 'particularly'),\n", " (646, 'hard'),\n", " (647, 'outside'),\n", " (648, 'week'),\n", " (648, 'weeks'),\n", " (649, 'feel'),\n", " (649, 'feels'),\n", " (650, 'charge'),\n", " (650, 'charges'),\n", " (651, 'indeed'),\n", " (652, 'degree'),\n", " (652, 'degrees'),\n", " (653, 'reference'),\n", " ...]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_wordlist" ] }, { "cell_type": "code", "execution_count": 12, "id": "d1a06597-4ad5-4566-a716-8bbad416b7ab", "metadata": { "tags": [] }, "outputs": [], "source": [ "with open(\"final_wordlist.csv\", \"w\") as f:\n", " f.write(\"word,number\\n\")\n", " \n", " for w in final_wordlist:\n", " lemmatized = \"\" if not w[1] else w[1]\n", " f.write(f\"{w[1].upper()},{w[0]}\")\n", " f.write(\"\\n\")" ] }, { "cell_type": "code", "execution_count": null, "id": "c88fe193-11cc-4a06-a3cf-d1ad85f44d14", "metadata": { "tags": [] }, "outputs": [], "source": [ "final_wordlist" ] }, { "cell_type": "code", "execution_count": null, "id": "2a0d177b-3499-42fb-8091-29547567d69a", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 }