#!/usr/bin/env python
# coding: utf-8
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import gzip
import re
# The WordNet corpus is required by WordNetLemmatizer below
nltk.download("wordnet")
WORDLIST_SIZE = 8192 + 3  # 2**13 words, plus three extra
# ## First, get the list of excluded words
annotated_words = pd.read_excel("annotated_words.ods")
excluded_words = list(
    annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower()
)
excluded_words[0:10]
# ## Next, get the list of custom mappings
custom_maps = (
    annotated_words[annotated_words["maps_to"].notna()][["word", "maps_to"]]
    .assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))
)
custom_maps = [
    (row["word"].lower(), mapping.lower())
    for _, row in custom_maps.iterrows()
    for mapping in row["maps_to"]
]
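# A sketch of the resulting shape, with hypothetical entries (the real
# pairs come from the "maps_to" column of the spreadsheet):
#   [("colours", "color"), ("mice", "mouse")]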
custom_maps
def get_lines(filename):
    """Return the first ~30,000 lines of a gzipped file, lowercased."""
    with gzip.open(filename, "r") as f:
        ret = []
        for l in f:
            # Only the head of the frequency list is needed for an
            # ~8K-word list, so stop after 30,000 lines
            if len(ret) > 30_000:
                return ret
            # str() on a bytes line yields its "b'...'" repr; the
            # fixed-width slice below operates on that string
            ret.append(str(l).lower())
    return ret
lemmatizer = WordNetLemmatizer()
word_re = re.compile(r"^[A-Za-z]+$")
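# This accepts purely alphabetic ASCII tokens ("word") and rejects anything
# containing digits, apostrophes, or accented letters ("1st", "it's", "café")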
# Start parsing the wordlist
all_words = get_lines("frequency-all.txt.gz")
# Delete header line
all_words = all_words[1:]
# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]
# Keep only purely alphabetic words (drops hyphenated and apostrophized forms)
all_words = [w for w in all_words if word_re.search(w)]
# Remove all excluded words (a set makes the membership test O(1))
excluded_set = set(excluded_words)
all_words = [w for w in all_words if w not in excluded_set]
# Lemmatize all words (plural -> singular)
lemmatize_mappings = [
    (w, lemmatizer.lemmatize(w))
    for w in all_words
    # if w != lemmatizer.lemmatize(w)
]
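# Each entry pairs a surface form with its lemma, e.g. ("dogs", "dog");
# words WordNet cannot reduce map to themselves, e.g. ("the", "the")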
# Remove all words that lemmatize to another word
#all_words = [w for w in all_words if w not in ]
# Add custom lemmatizations
for l in custom_maps:
    if l in lemmatize_mappings:
        print(f"Warning: {l} is already lemmatized")
    else:
        lemmatize_mappings.append(l)
# Keep only the first surface form seen for each lemma
distinct_words_lemmatized = set()
distinct_words = []
for w in lemmatize_mappings:
    if w[1] not in distinct_words_lemmatized:
        distinct_words_lemmatized.add(w[1])
        distinct_words.append(w[0])
del distinct_words_lemmatized
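# Illustration (hypothetical input): if lemmatize_mappings were
# [("dog", "dog"), ("dogs", "dog"), ("cat", "cat")], then distinct_words
# would be ["dog", "cat"] -- "dogs" is skipped because its lemma "dog"
# was already seen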
# Generate a wordlist where w[0] is the word and w[1] is what it maps to,
# or None if it is a distinct word
#wordlist = [(w[0], None if w[0] == w[1] else w[1]) if w[0] == w[1] else w for w in wl]
# Get a list of words that map to other words
# A word was lemmatized if wordnet mapped it to another word (not None) that was different
#only_lemmatized_words = [w for w in wordlist if w[1] is not None and w[0] != w[1]]
# Get a list of distinct lemmatized words
#distinct_lemmatized_words = [w[1] for w in wordlist if w[1] is not None]
#distinct_lemmatized_words = [w for w in pd.unique(distinct_lemmatized_words)]
print(f"# all_words: {len(all_words)}")
print(f"sample: {all_words[0:10]}")
print()
print(f"# lemmatize_mappings: {len(lemmatize_mappings)}")
print(f"sample: {lemmatize_mappings[0:10]}")
print()
print(f"# distinct_words: {len(distinct_words)}")
print(f"sample:")
distinct_words[0:10]
# ## Generate the final wordlist
# The final wordlist map: maps a word to its numeric value, starting at 1
final_wordlist = {
    w: idx + 1
    for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])
}
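# Hypothetical shape: {"the": 1, "of": 2, ...} -- the actual words and
# their order depend on the frequency corpus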
reverse_lemmatize_idx = {
    lemmatizer.lemmatize(w): w
    for w in final_wordlist.keys()
}
# Give each lemmatized variant the same number as its base form
for w, lem_w in lemmatize_mappings:
    if lem_w not in reverse_lemmatize_idx:
        # The base form did not make the WORDLIST_SIZE cutoff, so there
        # is no number to share; skip this variant
        continue
    final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]
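# Continuing the illustration: if "dog" were number 42, "dogs" would now
# also map to 42 (hypothetical numbers)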
# Sanity check: a lemmatized variant shares its number with its base form
assert final_wordlist["its"] == final_wordlist["it"]
assert final_wordlist["its"] >= 0
print(f"Final wordlist size: {len(final_wordlist.keys())}")
sorted_final_wordlist = list(final_wordlist.items())
with open("final_wordlist.csv", "w") as f:
    f.write("word,number\n")
    for word, number in sorted(sorted_final_wordlist, key=lambda w: w[1]):
        # Numbers are written zero-based
        f.write(f"{word.upper()},{number - 1}\n")
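# A minimal round-trip sketch (an assumption about intended use, not part
# of the original script): rebuild a number -> word decoder from the CSV.
# Variants of a word share its number, and the canonical (distinct) word
# is written first for each number, so setdefault keeps the canonical form.
decoder = {}
with open("final_wordlist.csv") as f:
    next(f)  # skip the "word,number" header
    for line in f:
        word, number = line.rstrip("\n").split(",")
        decoder.setdefault(int(number), word)
print(f"word for number 0: {decoder.get(0)}")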