In [1]:
try:
    _initialized
except:
    !pip install nltk odfpy
    import nltk
    
    nltk.download("wordnet")
    _initialized=True
    
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import gzip
import re



[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def get_lines(filename):
    with gzip.open(filename, 'r') as f:
        ret = []
        for l in f:
            if len(ret) > 30_000:
                return ret
            ret.append(str(l).lower())
        return ret
    
WORDLIST_SIZE = 8192 + 3
lemmatizer = WordNetLemmatizer()
word_re = re.compile(r"^[A-Za-z]+$")

In [3]:
annotated_words=pd.read_excel("annotated_words.ods")

In [4]:
excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
excluded_words[0:10]

['a', 'as', 'it', 'was', 'i', 'has', 'so', 'its', 's', 'p']

In [5]:
custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))

custom_maps = [
    (m[1]["word"].lower(), mapping.lower())
    for m in custom_maps.iterrows()
    for mapping in m[1]["maps_to"]
]
custom_maps

[('be', 'bee'),
 ('by', 'bye'),
 ('per', 'purr'),
 ('sense', 'cent'),
 ('died', 'dyed'),
 ('cents', 'sense'),
 ('yellow', 'hello'),
 ('corps', 'core'),
 ('ore', 'oar'),
 ('ore', ' or'),
 ('vary', 'very'),
 ('com', 'calm'),
 ('filing', 'filling'),
 ('fax', 'facts'),
 ('favour', 'favor'),
 ('theatre', 'theater'),
 ('par', 'parse'),
 ('honour', 'honor'),
 ('harry', 'hairy'),
 ('brings', 'bring'),
 ('organisation', 'organization'),
 ('simultaneously', 'simultaneous'),
 ('aluminum', 'aluminium'),
 ('knight', 'night'),
 ('electronics', 'electronic'),
 ('senses', 'cent'),
 ('organisations', 'organization'),
 ('fortunately', 'fortunate'),
 ('corp', 'core'),
 ('chile', 'chilly'),
 ('chile', ' chili'),
 ('owe', 'oh'),
 ('capitol', 'capital'),
 ('weary', 'wary'),
 ('berry', 'barry'),
 ('lecturer', 'lecture'),
 ('weigh', 'way'),
 ('aluminium', 'aluminum'),
 ('isle', 'aisle'),
 ('boulder', 'bolder'),
 ('blew', 'blue'),
 ('reformed', 'reform'),
 ('scent', 'cent'),
 ('ads', 'adds'),
 ('honours', 'hon

In [6]:
# Start parsing the wordlist
all_words = get_lines("frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all removed words
all_words = [w for w in all_words if w not in excluded_words]

# Add all custom mappings
for m in list(sum(custom_maps, ())):
    if m[0] not in all_words:
        all_words.append(m[0])
    if m[1] not in all_words:
        all_words.append(m[1])

In [7]:
all_words

['the',
 'of',
 'and',
 'to',
 'in',
 'is',
 'that',
 'for',
 'be',
 'by',
 'with',
 'on',
 'not',
 'or',
 'this',
 'are',
 'at',
 'from',
 'he',
 'which',
 'his',
 'have',
 'an',
 'but',
 'you',
 'they',
 'were',
 'had',
 'we',
 'all',
 'one',
 'their',
 'been',
 'will',
 'there',
 'can',
 'if',
 'other',
 'would',
 'no',
 'her',
 'may',
 'more',
 'when',
 'who',
 'such',
 'these',
 'any',
 'she',
 'new',
 'time',
 'than',
 'do',
 'some',
 'what',
 'only',
 'into',
 'them',
 'two',
 'also',
 'about',
 'out',
 'him',
 'my',
 'said',
 'up',
 'our',
 'first',
 'should',
 'under',
 'made',
 'state',
 'see',
 'after',
 'could',
 'then',
 'me',
 'most',
 'over',
 'very',
 'your',
 'between',
 'where',
 'now',
 'shall',
 'work',
 'those',
 'same',
 'well',
 'each',
 'many',
 'being',
 'years',
 'did',
 'year',
 'through',
 'must',
 'upon',
 'before',
 'like',
 'use',
 'part',
 'general',
 'people',
 'because',
 'used',
 'how',
 'even',
 'much',
 'states',
 'during',
 'both',
 'case',
 'three

In [8]:
# Lemmatize all words (plural -> singular)
lemmatize_mappings = [
    (w, lemmatizer.lemmatize(w)) 
    for w in all_words
    # if w != lemmatizer.lemmatize(w)
]

# Remove all words that lemmatize to another word
#all_words = [w for w in all_words if w not in ]

# Add custom lemmatizations
for l in custom_maps:
    if l in lemmatize_mappings:
        print(f"Warning: {l} is already lemmatized")
    else:
        lemmatize_mappings.append(l)
        
lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]

# Now, re-add all lematized words to the list of every word
for w in sum(lemmatize_mappings, ()):
    if w not in all_words:
        print(w)
        all_words.append(w)
        
lemmatize_mappings = {k: v for k, v in lemmatize_mappings}


discus
physic
posse
serf
sens
caput
bos
graf
pant
barrack
auspex
footstep
colonist
villager
kilometer
granule
credential
petal
trouser
shortcoming
microorganism
italic
grandchild
munition
parenthesis
foodstuff
attache
grandparent
tropic
kilometre
congratulation
fume
convulsion
nostril
utensil
cooky
amenity
reptile
pretension
sock
peso
mitochondrion
reminiscence
invader
macrophage
eyelid
dweller
bristle
tenet
taxon
outskirt
policyholder
stamen
horseman
striker
ramification
tuft
cultivar
interrogatory
bylaw
bellow
neoplasm
insurgent
chore
pensioner
exigency
forefather
atrocity
dissenter
corpuscle
islander
numeral
bureaucrat
classmate
crossroad
pitfall
firework
ravage
broadcaster
heretic
appurtenance
potentiality
louse
conspirator
revers
combatant
conferee
serviceman
repercussion
grader
exhibitor
alkaloid
collaborator
slipper
foothill
homeowner
hallucination
ailment
crumb
milligram
turnip
fingertip
tradesman
archaeologist
bondholder
lira
emolument
tailing
enthusiast
tubule
warship
specula

In [10]:
final_wordlist = []
seen_lemmatizations = set()
for w in all_words:
    lemmatized = lemmatize_mappings.get(w) or w
    if lemmatized in seen_lemmatizations:
        # The lemmatized version of this word was already seen
        continue
    else:
        # The lemmatized version hasn't been seen. We're good to add it
        final_wordlist.append([
            k
            for k
            in lemmatize_mappings.keys()
            if lemmatize_mappings[k] == lemmatized
        ])
        seen_lemmatizations.add(lemmatized)

    if len(final_wordlist) >= WORDLIST_SIZE:
        break

# Now, convert it to the format (number, word)
final_wordlist = [
    (idx, w)
    for idx, words in enumerate(final_wordlist)
    for w in words
]

In [11]:
final_wordlist

[(0, 'the'),
 (1, 'of'),
 (2, 'and'),
 (3, 'to'),
 (4, 'in'),
 (5, 'is'),
 (6, 'that'),
 (7, 'for'),
 (7, 'four'),
 (8, 'be'),
 (8, 'bee'),
 (8, 'bees'),
 (9, 'by'),
 (9, 'bye'),
 (10, 'with'),
 (11, 'on'),
 (12, 'not'),
 (13, 'or'),
 (14, 'this'),
 (15, 'are'),
 (16, 'at'),
 (17, 'from'),
 (18, 'he'),
 (19, 'which'),
 (20, 'his'),
 (21, 'have'),
 (22, 'an'),
 (23, 'but'),
 (24, 'you'),
 (25, 'they'),
 (26, 'were'),
 (27, 'had'),
 (28, 'we'),
 (29, 'all'),
 (30, 'one'),
 (30, 'ones'),
 (31, 'their'),
 (32, 'been'),
 (33, 'will'),
 (33, 'wills'),
 (34, 'there'),
 (35, 'can'),
 (35, 'cans'),
 (36, 'if'),
 (37, 'other'),
 (38, 'would'),
 (39, 'no'),
 (39, 'nos'),
 (40, 'her'),
 (41, 'may'),
 (42, 'more'),
 (42, 'mores'),
 (43, 'when'),
 (44, 'who'),
 (45, 'such'),
 (46, 'these'),
 (47, 'any'),
 (48, 'she'),
 (49, 'new'),
 (50, 'time'),
 (50, 'times'),
 (51, 'than'),
 (52, 'do'),
 (53, 'some'),
 (54, 'what'),
 (55, 'only'),
 (56, 'into'),
 (57, 'them'),
 (58, 'two'),
 (59, 'also'),
 (60, '

In [12]:
with open("final_wordlist.csv", "w") as f:
    f.write("word,number\n")
    
    for w in final_wordlist:
        lemmatized = "" if not w[1] else w[1]
        f.write(f"{w[1].upper()},{w[0]}")
        f.write("\n")

In [None]:
final_wordlist