#!/usr/bin/env python
# coding: utf-8

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import gzip
import re

nltk.download("wordnet")
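# Depending on the NLTK version, the lemmatizer may also need the
# "omw-1.4" resource; uncomment this if lemmatization raises a LookupError:
# nltk.download("omw-1.4")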

WORDLIST_SIZE = 8192 + 3
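# 8192 is 2**13, so a full list encodes 13 bits per word; the purpose of
# the 3 extra slots isn't stated here.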


# ## First, get the list of excluded words

annotated_words = pd.read_excel("annotated_words.ods")

# Drop every word not explicitly marked "Yes" in the "keep" column
excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
print(excluded_words[0:10])


# ## Next, get the list of custom mappings

# Rows with a "maps_to" value define manual word -> word mappings; the
# column holds comma-separated targets, so split each into a list
custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word", "maps_to"]].assign(
    maps_to=lambda x: x["maps_to"].map(lambda y: y.split(","))
)

# Flatten into lowercased (word, target) pairs
custom_maps = [
    (m[1]["word"].lower(), mapping.lower())
    for m in custom_maps.iterrows()
    for mapping in m[1]["maps_to"]
]
print(custom_maps)
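# Illustrative values only (not from the real spreadsheet): a row with
# word="Mice" and maps_to="mouse,mousey" flattens to
# [("mice", "mouse"), ("mice", "mousey")]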


def get_lines(filename):
    """Return up to ~30k lowercased lines from a gzipped file."""
    with gzip.open(filename, "r") as f:
        ret = []
        for l in f:
            # Only the most frequent words are needed, so stop early
            if len(ret) > 30_000:
                return ret
            # The file is opened in binary mode, so str(l) yields the bytes
            # repr "b'...'"; the fixed-width slicing below works on that repr
            ret.append(str(l).lower())
        return ret


lemmatizer = WordNetLemmatizer()
word_re = re.compile(r"^[A-Za-z]+$")


# Start parsing the wordlist
all_words = get_lines("frequency-all.txt.gz")

# Delete the header line
all_words = all_words[1:]

# Get only the word (fixed-width field)
all_words = [w[13:36].strip() for w in all_words]
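# Illustrative only: if a raw line were b"        1  the  23135851162\n",
# str() renders it as "b'        1  the  23135851162\\n'" and the 13:36
# slice targets the word column; the exact offsets depend on the real
# layout of frequency-all.txt.gz.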

# Remove anything that isn't purely alphabetic
all_words = [w for w in all_words if word_re.search(w)]

# Remove the manually excluded words (set lookup keeps this fast)
excluded = set(excluded_words)
all_words = [w for w in all_words if w not in excluded]

# Lemmatize all words (plural -> singular)
lemmatize_mappings = [
    (w, lemmatizer.lemmatize(w))
    for w in all_words
    # if w != lemmatizer.lemmatize(w)
]
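# Note that identity pairs survive because the filter above is commented
# out: every remaining word gets an entry, e.g. ("cat", "cat") alongside
# the plural mapping ("cats", "cat").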

# Remove all words that lemmatize to another word
#all_words = [w for w in all_words if w not in ]

# Add the custom lemmatizations, skipping pairs wordnet already produced
for l in custom_maps:
    if l in lemmatize_mappings:
        print(f"Warning: {l} is already lemmatized")
    else:
        lemmatize_mappings.append(l)

# Keep only the first surface form seen for each distinct lemma
distinct_words_lemmatized = set()
distinct_words = []
for word, lemma in lemmatize_mappings:
    if lemma not in distinct_words_lemmatized:
        distinct_words_lemmatized.add(lemma)
        distinct_words.append(word)
del distinct_words_lemmatized
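# Illustrative only: with mappings [("cat", "cat"), ("cats", "cat")] the
# pair for "cat" is seen first, so distinct_words keeps "cat" and skips
# "cats"; frequency order from all_words is preserved.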


# Generate a wordlist of word[0] being the word, and w[1] being what that
# word maps to, or None if it is a distinct word
#wordlist = [(w[0], None if w[0] == w[1] else w[1]) if w[0] == w[1] else w for w in wl]

# Get a list of words that map to other words
# A word was lemmatized if wordnet mapped it to another word (not None)
# that was different
#only_lemmatized_words = [w for w in wordlist if w[1] is not None and w[0] != w[1]]

# Get a list of distinct lemmatized words
#distinct_lemmatized_words = [w[1] for w in wordlist if w[1] is not None]
#distinct_lemmatized_words = [w for w in pd.unique(distinct_lemmatized_words)]

print(f"# all_words: {len(all_words)}")
print(f"sample: {all_words[0:10]}")
print()
print(f"# lemmatize_mappings: {len(lemmatize_mappings)}")
print(f"sample: {lemmatize_mappings[0:10]}")
print()
print(f"# distinct_words: {len(distinct_words)}")
print("sample:")
print(distinct_words[0:10])


# ## Generate the final wordlist

# The final wordlist map. Maps a word to its numeric value, starting at 1
final_wordlist = {
    w: idx + 1
    for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])
}
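# Illustrative only: if the frequency list begins "the", "of", "and", ...
# this yields {"the": 1, "of": 2, "and": 3, ...}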

# Map each kept word's WordNet lemma back to the surface form that owns
# its number
reverse_lemmatize_idx = {
    lemmatizer.lemmatize(w): w
    for w in final_wordlist.keys()
}
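# Observation (not from the original): a dict comprehension keeps the last
# entry on key collisions, so two kept words sharing a WordNet lemma would
# silently overwrite each other here; the dedup pass above makes that
# unlikely for wordnet-derived pairs.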

# Add the lemmatized numbers
for w, lem_w in lemmatize_mappings:
    if lem_w not in reverse_lemmatize_idx:
        # This word is not in the reverse list; this happens when the
        # lemmatized word's rank fell beyond WORDLIST_SIZE
        continue

    final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]

# Sanity checks: a lemmatized form shares its headword's number, and
# numbering starts at 1
assert final_wordlist["its"] == final_wordlist["it"]
assert final_wordlist["its"] >= 1

print(f"Final wordlist size: {len(final_wordlist.keys())}")


sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]

with open("final_wordlist.csv", "w") as f:
    f.write("word,number\n")

    for word, number in sorted(sorted_final_wordlist, key=lambda w: w[1]):
        # Numbers are 1-based internally; the CSV is written 0-based
        f.write(f"{word.upper()},{number - 1}\n")
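# Quick read-back check (a minimal sketch, not part of the original
# pipeline; assumes the CSV written above). keep_default_na=False stops
# pandas from parsing words like "NA" or "NULL" as missing values.
check = pd.read_csv("final_wordlist.csv", keep_default_na=False)
print(check.head())
print(f"distinct numbers: {check['number'].nunique()}")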