# this_algorithm/wordlist/04-deduplicated-words.py
#!/usr/bin/env python3
# coding: utf-8
import gzip
from pprint import pprint
from tqdm import tqdm
# 2**12 + 2 words: two extra entries since two words can be skipped.
# (Comment previously said 2**13, but 4096 == 2**12 — the size was halved
# at some point and the comment went stale.)
WORDLIST_SIZE = 4096 + 2
print("Loading full wordlist")
# Read the gzipped frequency list; keep only the first CSV column (the word),
# in descending-frequency order. The header row is skipped.
all_words = []
with gzip.open("./00-frequency-list.csv.gz", "r") as infile:
    next(infile, None)  # header row
    all_words = [raw_line.decode("ascii").split(",")[0] for raw_line in infile]

print("Building lemmatization graph")
# A list of disjoint word-sets; each set groups words that lemmatize together.
lemmatization_graph = list()
def add_lemmatization(word1, word2, graph=None):
    """Record that word1 and word2 are forms of the same word.

    The graph is a list of disjoint sets; each set holds words that all
    lemmatize to one another.  If either word is already known, both words
    join that set.  If the two words currently live in *different* sets,
    those sets are merged so lemmatization stays transitive.  (Bug fix:
    the previous implementation stopped at the first matching set and left
    the other set behind, so a word could end up in two sets.)

    graph: the list of word-sets to update; defaults to the module-level
    lemmatization_graph.
    """
    if graph is None:
        graph = lemmatization_graph
    # Find every set that already contains one of the two words.
    matches = [lemmatization for lemmatization in graph
               if word1 in lemmatization or word2 in lemmatization]
    if not matches:
        # No known lemmatization between these two. Add a new group.
        graph.append(set((word1, word2)))
        return
    target = matches[0]
    if word1 in target and word2 in target:
        print(
            f"Warning: lemmatization {word1}<=>{word2} already in set: {target}"
        )
    target.add(word1)
    target.add(word2)
    # Merge any other set that contained one of the words into the first,
    # keeping the groups disjoint.
    for other in matches[1:]:
        target.update(other)
        graph.remove(other)
def get_lemmatization(word, graph=None):
    """Return the set of words that lemmatize together with *word*.

    Returns None when the word appears in no known lemmatization group.

    graph: the list of word-sets to search; defaults to the module-level
    lemmatization_graph.
    """
    if graph is None:
        graph = lemmatization_graph
    for lemmatization in graph:
        if word in lemmatization:
            return lemmatization
    return None
print("\tAdding automatic lemmatizations")
# First, iterate over automated lemmatizations (gzipped CSV, bytes).
with gzip.open("./01-lemmatized-words.csv.gz") as infile:
    next(infile, None)  # header row
    for raw_line in infile:
        fields = raw_line.decode("ascii").strip().split(",")
        add_lemmatization(fields[0], fields[1])

print("\tAdding custom lemmatizations")
# Next, iterate over manual lemmatizations (plain-text CSV).
with open("./02-custom-lemmatizations.csv") as infile:
    next(infile, None)  # header row
    for text_line in infile:
        fields = text_line.strip().split(",")
        add_lemmatization(fields[0], fields[1])

print("Lemmatization graph constructed:")
pprint(lemmatization_graph)
print("Loading exclude wordlist")
# Words that disqualify their whole lemma group from the final list.
with open("./03-exclude.csv") as infile:
    next(infile, None)  # header row
    exclude_words = {line.strip() for line in infile}
# Now, select the first WORDLIST_SIZE lemma groups, walking the frequency
# list in order.  (The previous version also kept a `seen_word_lemmatizations`
# set, but nothing was ever added to it, so its membership test was always
# False — the `in final_wordlist` check below is what actually deduplicates.)
final_wordlist = []
ending_word_index = 0  # 1-based index of the last frequency-list word examined
for word in all_words:
    ending_word_index += 1
    word_lemmatizations = get_lemmatization(word)
    if not word_lemmatizations:
        # Word has no known lemmatizations; it forms a group of its own.
        word_lemmatizations = set([word])
    if word_lemmatizations & exclude_words:
        # Any excluded member disqualifies the entire lemma group.
        print(f"Note: {word_lemmatizations} is excluded")
        continue
    if word_lemmatizations in final_wordlist:
        # A more frequent member of this group was already selected.
        continue
    final_wordlist.append(word_lemmatizations)
    if len(final_wordlist) >= WORDLIST_SIZE:
        # We've added all the words we need
        break
assert len(final_wordlist) == WORDLIST_SIZE
pprint(list(enumerate(final_wordlist)))
print(f"Ending index: {ending_word_index}")
# Flatten the groups into (index, word) pairs: every word in group idx maps
# to the number idx.  The idx here starts at 0, which is fine — it indicates
# that a *word* component can map to 0 (not that the numeric component can).
numbered_words = []
for idx, words in enumerate(final_wordlist):
    for word in words:
        numbered_words.append((idx, word))
final_wordlist = numbered_words
# Write the final WORD,NUMBER mapping.  Several words (lemma variants of the
# same group) may map to the same number.
with open("./04-deduplicated-words.csv", "w") as outfile:
    outfile.write("WORD,NUMBER\n")
    for idx, word in final_wordlist:
        outfile.write(f"{word},{idx}\n")