#!/usr/bin/env python3
# coding: utf-8
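
# Pipeline overview: read the raw frequency list (00), merge words that share
# a lemma into groups using the automated (01) and hand-maintained (02)
# lemmatization files, drop anything on the exclude list (03), and write the
# first WORDLIST_SIZE lemma groups to 04-deduplicated-words.csv.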

import gzip
from pprint import pprint

from tqdm import tqdm

# 2**13 + 2 since two can be skipped
WORDLIST_SIZE = 8192 + 2
print("Loading full wordlist")
|
||
|
|
||
|
all_words = []
|
||
|
with gzip.open("./00-frequency-list.csv.gz", 'r') as infile:
|
||
|
first = True
|
||
|
for line in infile:
|
||
|
if first:
|
||
|
first = False
|
||
|
continue
|
||
|
all_words.append(line.decode('ascii').split(",")[0])
|
||
|
|
||
|
print("Building lemmatization graph")
|
||
|
|
||
|
lemmatization_graph = list()
|
||
|

def add_lemmatization(word1, word2):
    for lemmatization in lemmatization_graph:
        word1_contained = word1 in lemmatization
        word2_contained = word2 in lemmatization

        if word1_contained or word2_contained:
            if word1_contained and word2_contained:
                print(f"Warning: lemmatization {word1}<=>{word2} already in set: {lemmatization}")

            lemmatization.add(word1)
            lemmatization.add(word2)

            # Success. We added the words
            return

        # This lemmatization contains neither word; keep scanning
        # (the common case)

    # If we get here, there is no known lemmatization between these two. Add it
    lemmatization_graph.append({word1, word2})

def get_lemmatization(word):
    for lemmatization in lemmatization_graph:
        if word in lemmatization:
            return lemmatization
    return None
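
# For example (hypothetical words): after add_lemmatization("run", "running")
# and add_lemmatization("run", "ran"), the graph holds the single group
# {"run", "running", "ran"}, and get_lemmatization("ran") returns that same
# set object. Both helpers scan the whole graph, so each call is O(number of
# groups), which is fine at this scale.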
print("\tAdding automatic lemmatizations")
|
||
|
# First, iterate over automated lemmatizations
|
||
|
with gzip.open("./01-lemmatized-words.csv.gz") as infile:
|
||
|
first = True
|
||
|
for line in infile:
|
||
|
if first:
|
||
|
first = False
|
||
|
continue
|
||
|
split = line.decode('ascii').strip().split(",")
|
||
|
add_lemmatization(split[0], split[1])
|
||
|
|
||
|
print("\tAdding custom lemmatizations")
|
||
|
# Next, iterate over manual lemmatizations
|
||
|
with open("./02-custom-lemmatizations.csv") as infile:
|
||
|
first = True
|
||
|
for line in infile:
|
||
|
if first:
|
||
|
first = False
|
||
|
continue
|
||
|
split = line.strip().split(",")
|
||
|
add_lemmatization(split[0], split[1])
|
||
|
|
||
|
print("Lemmatization graph constructed:")
|
||
|
pprint(lemmatization_graph)
|
||
|
|
||
|
print("Loading exclude wordlist")
|
||
|
with open("./03-exclude.csv") as infile:
|
||
|
first = True
|
||
|
exclude_words = set()
|
||
|
for line in infile:
|
||
|
if first:
|
||
|
first = False
|
||
|
continue
|
||
|
exclude_words.add(line.strip())
|
||
|
|
||
|
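
# The scan below assumes 00-frequency-list.csv.gz is sorted most-frequent-first,
# so earlier entries are the more common words.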
# Now, collect the first WORDLIST_SIZE lemma groups
seen_word_lemmatizations = set()
final_wordlist = []
ending_word_index = 0
for word in tqdm(all_words):
    ending_word_index += 1

    word_lemmatizations = get_lemmatization(word)

    if not word_lemmatizations:
        # Words with no recorded lemmatization form a singleton group
        word_lemmatizations = {word}

    if word_lemmatizations & exclude_words:
        print(f"Note: {word_lemmatizations} is excluded")
        continue

    if frozenset(word_lemmatizations) in seen_word_lemmatizations:
        # We already added this group via an earlier, more frequent member
        continue
    seen_word_lemmatizations.add(frozenset(word_lemmatizations))

    final_wordlist.append(word_lemmatizations)

    if len(final_wordlist) >= WORDLIST_SIZE:
        # We've added all the words we need
        break
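
# If this assert fires, the frequency list ran out before WORDLIST_SIZE
# non-excluded groups were found. ending_word_index is the 1-based position
# of the last frequency-list entry consumed.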
assert len(final_wordlist) == WORDLIST_SIZE
pprint(list(enumerate(final_wordlist)))
print(f"Ending index: {ending_word_index}")

# Flatten the groups: number each group 1..WORDLIST_SIZE and pair every word
# with its group's number
final_wordlist = [
    (idx + 1, word)
    for idx, words in enumerate(final_wordlist)
    for word in words
]
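
# Every member of a lemma group shares the group's number, so the CSV has one
# row per word, not one per group: a hypothetical group {"run", "running"}
# numbered 5 produces the rows "run,5" and "running,5".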
with open("./04-deduplicated-words.csv", 'w') as outfile:
|
||
|
outfile.write("WORD,NUMBER\n")
|
||
|
for (idx, word) in final_wordlist:
|
||
|
outfile.write(f"{word},{idx}\n")
|
||