this_algorithm/docs/wordlist.py
#!/usr/bin/env python
# coding: utf-8
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import gzip
import re
nltk.download("wordnet")
WORDLIST_SIZE = 8192 + 3
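# 8192 = 2**13 base words, plus 3 extra slots (the reason for the extra slots is not stated here)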
# ## First, get the list of excluded words
annotated_words=pd.read_excel("annotated_words.ods")
excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
excluded_words[0:10]
# ## Next, get the list of custom mappings
custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))
custom_maps = [
    (m[1]["word"].lower(), mapping.lower())
    for m in custom_maps.iterrows()
    for mapping in m[1]["maps_to"]
]
custom_maps
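# custom_maps is now a flat list of lowercased (word, replacement) pairs, one pair per
# comma-separated value in the "maps_to" column. For example, a hypothetical row
# mapping "Mice" to "mouse" would yield ("mice", "mouse").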
def get_lines(filename):
    with gzip.open(filename, 'r') as f:
        ret = []
        for l in f:
            if len(ret) > 30_000:
                return ret
            # The file is opened in binary mode, so str(l) yields a "b'...'" representation;
            # the fixed-width slicing further down appears to be relative to that string
            ret.append(str(l).lower())
        return ret
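# get_lines keeps only the first ~30,000 entries of the gzipped frequency list, which
# (assuming the file is sorted by descending frequency) is more than enough to fill
# an 8195-word list after filtering.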
lemmatizer = WordNetLemmatizer()
word_re = re.compile(r"^[A-Za-z]+$")
# Start parsing the wordlist
all_words = get_lines("frequency-all.txt.gz")
# Delete header line
all_words = all_words[1:]
# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]
# Keep only purely alphabetic words (drop entries containing special characters)
all_words = [w for w in all_words if word_re.search(w)]
# Drop the excluded words
all_words = [w for w in all_words if w not in excluded_words]
# Lemmatize all words (plural -> singular)
lemmatize_mappings = [
    (w, lemmatizer.lemmatize(w))
    for w in all_words
    # if w != lemmatizer.lemmatize(w)
]
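# Each entry pairs a surface form with its WordNet lemma, e.g. ("cats", "cat");
# identity pairs such as ("cat", "cat") are also kept, because the filter above
# is commented out.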
# Remove all words that lemmatize to another word
#all_words = [w for w in all_words if w not in ]
# Add custom lemmatizations
for l in custom_maps:
    if l in lemmatize_mappings:
        print(f"Warning: {l} is already lemmatized")
    else:
        lemmatize_mappings.append(l)
distinct_words_lemmatized = set()
distinct_words = []
for w in lemmatize_mappings:
    if w[1] not in distinct_words_lemmatized:
        distinct_words_lemmatized.add(w[1])
        distinct_words.append(w[0])
del distinct_words_lemmatized
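# distinct_words now holds, in order of first appearance, the first surface form seen
# for each lemma; later forms of the same lemma (e.g. a plural) do not add new entries.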
# Generate a wordlist where w[0] is the word and w[1] is what that word maps to, or None if it is a distinct word
#wordlist = [(w[0], None if w[0] == w[1] else w[1]) if w[0] == w[1] else w for w in wl]
# Get a list of words that map to other words
# A word was lemmatized if wordnet mapped it to another word (not None) that was different
#only_lemmatized_words = [w for w in wordlist if w[1] is not None and w[0] != w[1]]
# Get a list of distinct lemmatized words
#distinct_lemmatized_words = [w[1] for w in wordlist if w[1] is not None]
#distinct_lemmatized_words = [w for w in pd.unique(distinct_lemmatized_words)]
print(f"# all_words: {len(all_words)}")
print(f"sample: {all_words[0:10]}")
print()
print(f"# lemmatize_mappings: {len(lemmatize_mappings)}")
print(f"sample: {lemmatize_mappings[0:10]}")
print()
print(f"# distinct_words: {len(distinct_words)}")
print(f"sample:")
distinct_words[0:10]
# ## Generate the final wordlist
# The final wordlist map. Maps a word to its numeric value
# Starting at 1
final_wordlist = {
    w: idx + 1
    for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])
}
reverse_lemmatize_idx = {
    lemmatizer.lemmatize(w): w
    for w in final_wordlist.keys()
}
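# reverse_lemmatize_idx maps a lemma back to the surface form that actually received
# a number, so that other inflections of that lemma can reuse the same number below.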
# Add the lemmatized numbers
for w, lem_w in lemmatize_mappings:
    if lem_w not in reverse_lemmatize_idx:
        # This word is not in the reverse list
        # This happens when the index of the lemmatized word we're working with is too large
        continue
    final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]
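# After this pass, every surface form whose lemma made the cut shares that lemma's
# number: e.g. if "cat" had been assigned 7, "cats" would also map to 7
# (illustrative numbers only).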
assert final_wordlist["its"] == final_wordlist["its"]
assert final_wordlist["its"] >= 0
print(f"Final wordlist size: {len(final_wordlist.keys())}")
sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]
with open("final_wordlist.csv", "w") as f:
f.write("word,number\n")
for w in sorted(sorted_final_wordlist, key=lambda w: w[1]):
lemmatized = "" if not w[1] else w[1]
f.write(f"{w[0].upper()},{lemmatized - 1}")
f.write("\n")