#!/usr/bin/env python3
# coding: utf-8
"""Select the first WORDLIST_SIZE lemmatization groups from the full wordlist.

Inputs (all read relative to the current working directory):

* ``00-frequency-list.csv.gz``     - the first column of each row is a word
* ``01-lemmatized-words.csv.gz``   - the first two columns are a word pair
* ``02-custom-lemmatizations.csv`` - the first two columns are a word pair
* ``03-exclude.csv``               - one excluded word per line

Every input has a header row, which is skipped.  The result is left in the
module-level ``final_wordlist`` as a list of ``(index, word)`` pairs in which
all words of one lemmatization group share the same index.
"""
import gzip
from pprint import pprint

# NOTE(review): tqdm is not referenced in the visible code; the import is
# kept in case another part of the project relies on it.
from tqdm import tqdm

# 2**13 + 2 since two can be skipped
WORDLIST_SIZE = 8192 + 2

print("Loading full wordlist")
all_words = []
with gzip.open("./00-frequency-list.csv.gz", "rt", encoding="ascii") as infile:
    next(infile, None)  # skip the header row
    all_words.extend(line.split(",")[0] for line in infile)

print("Building lemmatization graph")
# Each element is a set of words that are considered the same word.
lemmatization_graph = list()


def add_lemmatization(word1, word2):
    """Record that ``word1`` and ``word2`` belong to the same group.

    If neither word is known yet, a new two-word group is appended to
    ``lemmatization_graph``.  Otherwise both words are added to the first
    group that already contains one of them, and any *other* group that
    contains one of the words is merged into it, so that a word never ends
    up in two different groups.

    A warning is printed when both words were already in the same group.
    """
    touching = [
        group for group in lemmatization_graph
        if word1 in group or word2 in group
    ]

    if not touching:
        # There is no known lemmatization for either word. Add it
        lemmatization_graph.append({word1, word2})
        return

    target = touching[0]
    if word1 in target and word2 in target:
        print(f"Warning: lemmatization {word1}<=>{word2} already in set: {target}")
    target.update((word1, word2))

    absorbed = touching[1:]
    if absorbed:
        # The pair bridges previously separate groups: fold them into the
        # first one and drop the now-redundant groups (compared by identity,
        # so an unrelated group with equal contents is never removed).
        for group in absorbed:
            target.update(group)
        lemmatization_graph[:] = [
            group for group in lemmatization_graph
            if not any(group is gone for gone in absorbed)
        ]


def get_lemmatization(word):
    """Return the group (a set) containing ``word``, or ``None`` if there is none."""
    for lemmatization in lemmatization_graph:
        if word in lemmatization:
            return lemmatization
    return None


print("\tAdding automatic lemmatizations")
# First, iterate over automated lemmatizations
with gzip.open("./01-lemmatized-words.csv.gz", "rt", encoding="ascii") as infile:
    next(infile, None)  # skip the header row
    for line in infile:
        fields = line.strip().split(",")
        add_lemmatization(fields[0], fields[1])

print("\tAdding custom lemmatizations")
# Next, iterate over manual lemmatizations
with open("./02-custom-lemmatizations.csv") as infile:
    next(infile, None)  # skip the header row
    for line in infile:
        fields = line.strip().split(",")
        add_lemmatization(fields[0], fields[1])

print("Lemmatization graph constructed:")
pprint(lemmatization_graph)

print("Loading exclude wordlist")
with open("./03-exclude.csv") as infile:
    next(infile, None)  # skip the header row
    exclude_words = {line.strip() for line in infile}

# Now, start collecting the first WORDLIST_SIZE groups
seen_word_lemmatizations = set()
final_wordlist = []
ending_word_index = 0  # stays 0 when all_words is empty
for ending_word_index, word in enumerate(all_words, start=1):
    word_lemmatizations = get_lemmatization(word)
    if not word_lemmatizations:
        # A word without any known lemmatization forms its own group
        word_lemmatizations = {word}

    # A group is dropped entirely as soon as one of its words is excluded
    if not word_lemmatizations.isdisjoint(exclude_words):
        print(f"Note: {word_lemmatizations} is excluded")
        continue

    # Sets are unhashable, so groups are remembered as frozensets
    group_key = frozenset(word_lemmatizations)
    if group_key in seen_word_lemmatizations:
        # We already added this one
        continue
    seen_word_lemmatizations.add(group_key)

    final_wordlist.append(word_lemmatizations)

    if len(final_wordlist) >= WORDLIST_SIZE:
        # We've added all the words we need
        break

assert len(final_wordlist) == WORDLIST_SIZE
pprint(list(enumerate(final_wordlist)))
print(f"Ending index: {ending_word_index}")

final_wordlist = [
    # The idx here starts at 0, which is fine
    # It indicates that a *word* component can map to 0 (not that the numeric component can)
    (idx, word)
    for idx, words in enumerate(final_wordlist)
    for word in words
]
# Emit the result as CSV: a header row, then one "word,index" row for every
# (index, word) pair held in final_wordlist.
with open("./04-deduplicated-words.csv", 'w') as outfile:
    outfile.write("WORD,NUMBER\n")
    outfile.writelines(f"{word},{idx}\n" for idx, word in final_wordlist)