this_algorithm/wordlist/04-deduplicated-words.py

#!/usr/bin/env python3
# coding: utf-8

import gzip
from pprint import pprint
from tqdm import tqdm

# 2**13 + 2 since two can be skipped
WORDLIST_SIZE = 2**13 + 2

print("Loading full wordlist")

# Keep only the first comma-separated field of every row after the header.
# The archive is read as bytes, so each row is decoded before splitting.
with gzip.open("./00-frequency-list.csv.gz", 'r') as infile:
    next(infile, None)  # discard the header row
    all_words = [
        raw_line.decode('ascii').split(",")[0]
        for raw_line in infile
    ]

print("Building lemmatization graph")

# Each element is a set of words that are treated as forms of one another.
# add_lemmatization() keeps these sets pairwise disjoint, so a word belongs
# to at most one group.
lemmatization_graph = list()
def add_lemmatization(word1, word2):
    """Record that ``word1`` and ``word2`` belong to the same group.

    If neither word is known, a new group holding both is appended to
    ``lemmatization_graph``. If one or both are already known, both words
    are added to the first group that contains either of them. When the two
    words were previously in *different* groups, those groups are merged
    into that first group so that no word is left in two groups at once.

    A warning is printed when the pair is already fully contained in the
    first matching group.

    Args:
        word1: One word of the pair.
        word2: The other word of the pair.

    Returns:
        None. ``lemmatization_graph`` is modified in place.
    """
    # Every existing group that already holds either word is affected.
    touched = [
        group for group in lemmatization_graph
        if word1 in group or word2 in group
    ]

    if not touched:
        # No known lemmatization involves these two words. This is the
        # common case: start a new group.
        lemmatization_graph.append({word1, word2})
        return

    primary = touched[0]
    if word1 in primary and word2 in primary:
        print(f"Warning: lemmatization {word1}<=>{word2} already in set: {primary}")

    primary.add(word1)
    primary.add(word2)

    extras = touched[1:]
    if extras:
        # The new pair links groups that used to be separate: fold them
        # into the first one and drop the now-redundant sets.
        for group in extras:
            primary.update(group)
        # Filter by identity (not equality) and assign through a slice so
        # the list object itself is preserved for anyone holding it.
        lemmatization_graph[:] = [
            group for group in lemmatization_graph
            if not any(group is extra for extra in extras)
        ]

def get_lemmatization(word):
    """Return the first group in ``lemmatization_graph`` that contains ``word``.

    Returns ``None`` when no group contains the word.
    """
    return next(
        (group for group in lemmatization_graph if word in group),
        None,
    )

print("\tAdding automatic lemmatizations")
# Pass 1: automated lemmatizations, read as bytes from the gzip archive
with gzip.open("./01-lemmatized-words.csv.gz") as infile:
    next(infile, None)  # discard the header row
    for raw_line in infile:
        fields = raw_line.decode('ascii').strip().split(",")
        add_lemmatization(fields[0], fields[1])

print("\tAdding custom lemmatizations")
# Pass 2: manual lemmatizations, read as text
with open("./02-custom-lemmatizations.csv") as infile:
    next(infile, None)  # discard the header row
    for text_line in infile:
        fields = text_line.strip().split(",")
        add_lemmatization(fields[0], fields[1])

print("Lemmatization graph constructed:")
pprint(lemmatization_graph)

print("Loading exclude wordlist")
with open("./03-exclude.csv") as infile:
    next(infile, None)  # discard the header row
    # Every remaining line, stripped of surrounding whitespace, is one entry.
    exclude_words = {row.strip() for row in infile}

# Walk the words in file order and collect the first WORDLIST_SIZE distinct,
# non-excluded lemmatization groups.
seen_word_lemmatizations = set()
final_wordlist = []
ending_word_index = 0  # number of words examined; stays 0 if there are none
for ending_word_index, word in enumerate(all_words, start=1):
    # A word with no known lemmatization forms a group of its own.
    word_lemmatizations = get_lemmatization(word) or {word}

    # Drop the whole group if any one of its members is excluded.
    if not word_lemmatizations.isdisjoint(exclude_words):
        print(f"Note: {word_lemmatizations} is excluded")
        continue

    # Sets are unhashable, so remember a frozen snapshot of the group.
    # Recording it here is what makes the duplicate check effective: without
    # it, the same group would be appended once per member word.
    group_key = frozenset(word_lemmatizations)
    if group_key in seen_word_lemmatizations:
        # We already added this one
        continue
    seen_word_lemmatizations.add(group_key)

    final_wordlist.append(word_lemmatizations)

    if len(final_wordlist) >= WORDLIST_SIZE:
        # We've added all the words we need
        break

assert len(final_wordlist) == WORDLIST_SIZE, (
    f"only {len(final_wordlist)} of {WORDLIST_SIZE} word groups found"
)
pprint(list(enumerate(final_wordlist)))
print(f"Ending index: {ending_word_index}")

# Flatten to (number, word) rows; numbers start at 1 and every word of a
# group shares its group's number. Words are sorted within a group so the
# output is reproducible (set iteration order can differ between runs).
final_wordlist = [
    (idx + 1, word)
    for idx, words in enumerate(final_wordlist)
    for word in sorted(words)
]

with open("./04-deduplicated-words.csv", 'w') as outfile:
    outfile.write("WORD,NUMBER\n")
    for (idx, word) in final_wordlist:
        outfile.write(f"{word},{idx}\n")