#!/usr/bin/env python3
# coding: utf-8

print("Loading dependencies")

import spacy
import nltk
from tqdm import tqdm
import gzip
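
# Assumes the third-party dependencies above are installed, e.g.:
#   pip install spacy nltk tqdm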

# Wordnet
try:
    from nltk.stem.wordnet import WordNetLemmatizer
except ImportError:
    nltk.download("wordnet")
    from nltk.stem.wordnet import WordNetLemmatizer

wordnet = WordNetLemmatizer()
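
# Note: even when the import above succeeds, NLTK raises LookupError at the
# first lemmatize() call if the "wordnet" corpus data itself is missing;
# running nltk.download("wordnet") once also fixes that.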

# Spacy: parser and NER are not needed for lemmatization, so disable them
# to speed up loading and processing
nlp = spacy.load("en_core_web_trf", disable=["parser", "ner"])
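
# The transformer model must be available locally, e.g. via:
#   python -m spacy download en_core_web_trf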

print("Loading initial wordlist")

words = []

for file in [
    "./00-oxford-5000.txt",
    "./00-desiquintans-nounlist.txt",
    "./00-frequency-list.csv.gz",
]:
    if file.endswith(".gz"):
        # gzip.open defaults to binary mode, so decode each line
        with gzip.open(file, "r") as infile:
            for line in infile:
                words.append(line.decode("ascii").split(",")[0])
    else:
        with open(file, "r") as infile:
            for line in infile:
                words.append(line.split(",")[0].strip())
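
# Only the first comma-separated field of each line is kept, so plain
# one-word-per-line lists and the CSV frequency list both reduce to a bare
# word list.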

# Remove the header row (the first line of the combined list)
words = words[1:]

print(words[0:5])

print("Lemmatizing words")

# seen_lemmatizations = set()
seen_words = set()

with open("./01-errored-lemmatized-words.csv", "w") as erroutfile:
    erroutfile.write("WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER\n")

    with gzip.open("./01-lemmatized-words.csv.gz", "w") as outfile:
        outfile.write("WORD,LEMMATIZED_WORD,LEMMATIZER\n".encode("ascii"))

        # Make a progress bar; tqdm's write() lets log lines be printed
        # without breaking the bar ("progress" avoids shadowing built-in iter)
        progress = tqdm(words)

        # Zip the progress-barred word list with nlp.pipe so spaCy can
        # process the words in batches
        for word, spacy_word in zip(progress, nlp.pipe(words)):
            lemmatized_words = [
                (wordnet.lemmatize(word.lower()).upper(), "WORDNET"),
                (spacy_word[0].lemma_.upper(), "SPACY"),
            ]
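
            # Each spacy_word is the Doc for one input word (assumed to
            # tokenize to a single token), so spacy_word[0] is that token.
            # Lemmas are uppercased, presumably to match the casing of the
            # source lists.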

            for lemmatized_word, lemmatizer in lemmatized_words:
                # if word == lemmatized_word:
                #     # This word is its own lemmatization
                #     continue

                # Skip words if we've already lemmatized them
                # if (word, lemmatized_word) in seen_lemmatizations: continue
                # seen_lemmatizations.add((word, lemmatized_word))

                # Skip words if they've already been added
                if lemmatized_word in seen_words:
                    progress.write(f"{lemmatized_word} ({lemmatizer})\talready in seen_words")
                    continue
                seen_words.add(lemmatized_word)
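
                # The "not in words" test below scans the whole list for
                # every candidate; building a set once up front (e.g.
                # words_set = set(words)) would make this check O(1).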
                if lemmatized_word not in words:
                    progress.write(f"{lemmatized_word} ({lemmatizer})\tnot in wordlist")
                    erroutfile.write(f"{word},{lemmatized_word},{lemmatizer}\n")
                    continue

                progress.write(f"{word} => {lemmatized_word} ({lemmatizer}) added")

                outfile.write(
                    f"{word},{lemmatized_word},{lemmatizer}\n".encode("ascii")
                )
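
# The gzipped output can be spot-checked afterwards, e.g.:
#   zcat 01-lemmatized-words.csv.gz | head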