In [1]:
try:
    _initialized
except:
    !pip install nltk odfpy
    import nltk
    
    nltk.download("wordnet")
    _initialized=True
    
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import gzip
import re

WORDLIST_SIZE=8192 +3

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting odfpy
  Downloading odfpy-1.4.1.tar.gz (717 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m717.0/717.0 kB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m770.5/770.5 kB[0m [31m140.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: odfpy
  Building wheel for odfpy (setup.py) ... [?25ldone
[?25h  Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=07cd1c76f3eab402c874a8f4e7d32754528bfb4ba43ad4da49f7cd9986a2b7f4
  Stored in directory: /home/jovyan/.cache/pip/wheel

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...


## First, get the list of excluded words

In [2]:
annotated_words=pd.read_excel("annotated_words.ods")

In [3]:
excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
excluded_words[0:10]

['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'children', 'd']

## Next, get the list of custom mappings

In [4]:
custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))

custom_maps = [
    (m[1]["word"].lower(), mapping.lower())
    for m in custom_maps.iterrows()
    for mapping in m[1]["maps_to"]
]
custom_maps

[('be', 'bee'),
 ('by', 'bye'),
 ('died', 'dyed'),
 ('cents', 'sense'),
 ('yellow', 'hello'),
 ('corps', 'core'),
 ('ore', 'oar'),
 ('ore', ' or'),
 ('vary', 'very'),
 ('com', 'calm'),
 ('filing', 'filling'),
 ('fax', 'facts'),
 ('favour', 'favor'),
 ('theatre', 'theater'),
 ('par', 'parse'),
 ('honour', 'honor'),
 ('harry', 'hairy'),
 ('brings', 'bring'),
 ('organisation', 'organization'),
 ('simultaneously', 'simultaneous'),
 ('aluminum', 'aluminium'),
 ('knight', 'night'),
 ('electronics', 'electronic'),
 ('organisations', 'organizations'),
 ('fortunately', 'fortunate'),
 ('corp', 'core'),
 ('chile', 'chilly'),
 ('chile', ' chili'),
 ('owe', 'oh'),
 ('capitol', 'capital'),
 ('weary', 'wary'),
 ('berry', 'barry'),
 ('lecturer', 'lecture'),
 ('aluminium', 'aluminum'),
 ('isle', 'aisle'),
 ('boulder', 'bolder'),
 ('blew', 'blue'),
 ('reformed', 'reform'),
 ('scent', 'sense'),
 ('ads', 'adds'),
 ('honours', 'honors'),
 ('bot', 'bought'),
 ('dew', 'do'),
 ('dew', ' due'),
 ('theatres', '

In [23]:
def get_lines(filename):
    with gzip.open(filename, 'r') as f:
        ret = []
        for l in f:
            if len(ret) > 30_000:
                return ret
            ret.append(str(l).lower())
        return ret
    
lemmatizer = WordNetLemmatizer()
word_re = re.compile(r"^[A-Za-z]+$")

# Start parsing the wordlist
all_words = get_lines("frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all removed words
all_words = [w for w in all_words if w not in excluded_words]

# Lemmatize all words (plural -> singular)
lemmatize_mappings = [
    (w, lemmatizer.lemmatize(w)) 
    for w in all_words
    # if w != lemmatizer.lemmatize(w)
]

# Remove all words that lemmatize to another word
#all_words = [w for w in all_words if w not in ]

# Add custom lemmatizations
for l in custom_maps:
    if l in lemmatize_mappings:
        print(f"Warning: {l} is already lemmatized")
    else:
        lemmatize_mappings.append(l)

distinct_words_lemmatized = set()
distinct_words = []
for w in lemmatize_mappings:
    if w[1] not in distinct_words_lemmatized:
        distinct_words_lemmatized.add(w[1])
        distinct_words.append(w[0])
del distinct_words_lemmatized

# Generate a wordlist of word[0] being the word, and w[1] being what that word maps to, or None if it is a distinct word
#wordlist = [(w[0], None if w[0] == w[1] else w[1]) if w[0] == w[1] else w for w in wl]

# Get a list of words that map to other words
# A word was lemmatized if wordnet mapped it to another word (not None) that was different
#only_lemmatized_words = [w for w in wordlist if w[1] is not None and w[0] != w[1]]

# Get a list of distinct lemmatized words
#distinct_lemmatized_words = [w[1] for w in wordlist if w[1] is not None]
#distinct_lemmatized_words = [w for w in pd.unique(distinct_lemmatized_words)]

print(f"# all_words: {len(all_words)}")
print(f"sample: {all_words[0:10]}")
print()
print(f"# lemmatize_mappings: {len(lemmatize_mappings)}")
print(f"sample: {lemmatize_mappings[0:10]}")
print()
print(f"# distinct_words: {len(distinct_words)}")
print(f"sample:")
distinct_words[0:10]


# all_words: 21285
sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']

# lemmatize_mappings: 4150
sample: [('as', 'a'), ('was', 'wa'), ('has', 'ha'), ('its', 'it'), ('years', 'year'), ('states', 'state'), ('us', 'u'), ('does', 'doe'), ('less', 'le'), ('means', 'mean')]

# distinct_words: 4114
sample:


['as', 'was', 'has', 'its', 'years', 'states', 'us', 'does', 'less', 'means']

In [14]:
distinct_words

['the',
 'of',
 'and',
 'to',
 'in',
 'is',
 'that',
 'for',
 'as',
 'it',
 'be',
 'by',
 'with',
 'was',
 'on',
 'not',
 'or',
 'this',
 'are',
 'at',
 'from',
 'he',
 'which',
 'his',
 'have',
 'an',
 'but',
 'you',
 'they',
 'were',
 'had',
 'we',
 'all',
 'one',
 'has',
 'their',
 'been',
 'will',
 'there',
 'can',
 'if',
 'other',
 'would',
 'no',
 'her',
 'may',
 'more',
 'when',
 'so',
 'who',
 'such',
 'these',
 'any',
 'she',
 'new',
 'time',
 'than',
 'do',
 'some',
 'what',
 'only',
 'into',
 'them',
 'two',
 'also',
 'about',
 'out',
 'him',
 'my',
 'said',
 'up',
 'our',
 'first',
 'should',
 'under',
 'made',
 'state',
 'see',
 'after',
 'could',
 'then',
 'me',
 'most',
 'over',
 'very',
 'your',
 'between',
 'where',
 'now',
 'shall',
 'work',
 'those',
 'same',
 'well',
 'each',
 'many',
 'being',
 'years',
 'did',
 'through',
 'must',
 'upon',
 'before',
 'like',
 'use',
 'part',
 'general',
 'people',
 'because',
 'used',
 'how',
 'even',
 'much',
 'during',
 'both',

## Generate the final wordlist

In [22]:
# The final wordlist map. Maps a word to its numeric value
# Starting at 1
final_wordlist = {
    w: idx + 1
    for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])
}


reverse_lemmatize_idx = {
    lemmatizer.lemmatize(w): w
    for w in final_wordlist.keys()
}

print(lemmatize_mappings)

# Add the lemmatized numbers
for w, lem_w in lemmatize_mappings:
    if lem_w not in reverse_lemmatize_idx:
        # This word is not in the reverse list
        # This happens when the index of the lemmatized word we're working with is too large
        continue
    if lem_w in final_wordlist.keys():
        continue
    final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]
    #final_wordlist[reverse_lemmatize_idx[lem_w]] = final_wordlist[w]

print(final_wordlist)
assert final_wordlist["its"] == final_wordlist["its"]
assert final_wordlist["its"] >= 0

print(f"Final wordlist size: {len(final_wordlist.keys())}")



KeyError: 'its'

In [7]:
sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]

with open("final_wordlist.csv", "w") as f:
    f.write("word,number\n")
    
    for w in sorted(sorted_final_wordlist, key=lambda w: w[1]):
        lemmatized = "" if not w[1] else w[1]
        f.write(f"{w[0].upper()},{lemmatized - 1}")
        f.write("\n")

In [9]:
final_wordlist

{'the': 1,
 'of': 2,
 'and': 3,
 'to': 4,
 'in': 5,
 'is': 6,
 'that': 7,
 'for': 8,
 'as': 9,
 'it': 10,
 'be': 5378,
 'by': 7272,
 'with': 13,
 'was': 14,
 'on': 15,
 'not': 16,
 'or': 17,
 'this': 18,
 'are': 19,
 'at': 20,
 'from': 21,
 'he': 22,
 'which': 23,
 'his': 24,
 'have': 25,
 'an': 26,
 'but': 27,
 'you': 28,
 'they': 29,
 'were': 30,
 'had': 31,
 'we': 32,
 'all': 33,
 'one': 34,
 'has': 35,
 'their': 36,
 'been': 37,
 'will': 38,
 'there': 39,
 'can': 40,
 'if': 41,
 'other': 42,
 'would': 43,
 'no': 44,
 'her': 45,
 'may': 46,
 'more': 47,
 'when': 48,
 'so': 49,
 'who': 50,
 'such': 51,
 'these': 52,
 'any': 53,
 'she': 54,
 'new': 55,
 'time': 56,
 'than': 57,
 'do': 58,
 'some': 59,
 'what': 60,
 'only': 61,
 'into': 62,
 'them': 63,
 'two': 64,
 'also': 65,
 'about': 66,
 'out': 67,
 'him': 68,
 'my': 69,
 'said': 70,
 'up': 71,
 'our': 72,
 'first': 73,
 'should': 74,
 'under': 75,
 'made': 76,
 'state': 77,
 'see': 78,
 'after': 79,
 'could': 80,
 'then': 81,
 'm