this_algorithm/wordlist/01-lemmatized-words.py

71 lines
1.9 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# coding: utf-8
# Set up the two lemmatizers (NLTK WordNet and spaCy) used by the main loop.

print("Loading dependencies")

import spacy
import nltk
from tqdm import tqdm
import gzip

# Wordnet: the corpus may not be present on first run; download it and retry.
# Narrowed from a bare `except:` — only import/corpus-lookup failures should
# trigger the download path, anything else should surface.
try:
    from nltk.stem.wordnet import WordNetLemmatizer
except (ImportError, LookupError):
    nltk.download("wordnet")
    from nltk.stem.wordnet import WordNetLemmatizer
wordnet = WordNetLemmatizer()

# Spacy: only the lemmatizer is needed, so parser/NER pipes are disabled
# for speed.
nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
2023-03-02 00:20:25 -05:00
print("Loading initial wordlist")

# Read the frequency list; the word is the first CSV column. Opening in text
# mode ('rt') lets gzip decode once, instead of the original per-line
# bytes.decode('ascii') inside a manual append loop.
with gzip.open("./00-frequency-list.csv.gz", 'rt', encoding='ascii') as infile:
    words = [line.split(",")[0] for line in infile]

# Remove header
words = words[1:]

print(words[0:5])
2023-03-02 00:20:25 -05:00
print("Lemmatizing words")

# The membership test in the loop below is hot (once per candidate lemma);
# a set makes it O(1) instead of O(n) against the word list.
all_words = set(words)

# (word, lemma) pairs already emitted, so the two lemmatizers don't
# produce duplicate rows.
seen_lemmatizations = set()

with open("./01-errored-lemmatized-words.csv", 'w') as erroutfile, \
        gzip.open("./01-lemmatized-words.csv.gz", 'w') as outfile:
    erroutfile.write("WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER\n")
    outfile.write("WORD,LEMMATIZED_WORD,LEMMATIZER\n".encode("ascii"))

    # Renamed from `iter` (shadowed the builtin); tqdm wrapper doubles as
    # progress bar and log sink (`.write` prints without breaking the bar).
    progress = tqdm(words)
    for word in progress:
        lemmatized_words = [
            # Wordnet
            (wordnet.lemmatize(word).upper(), 'WORDNET'),
            # Spacy (original had a redundant double .upper(); upper() is
            # idempotent, so behavior is unchanged)
            (nlp(word)[0].lemma_.upper(), 'SPACY'),
        ]

        for (lemmatized_word, lemmatizer) in lemmatized_words:
            # Identity lemmatizations carry no information.
            if word == lemmatized_word:
                continue

            # Skip pairs already recorded by the other lemmatizer.
            if (word, lemmatized_word) in seen_lemmatizations:
                continue

            seen_lemmatizations.add((word, lemmatized_word))

            # A lemma that isn't itself in the wordlist can't be used
            # downstream; log it to the error CSV instead.
            if lemmatized_word not in all_words:
                progress.write(f"{lemmatized_word} ({lemmatizer}) not in all_words")
                erroutfile.write(f"{word},{lemmatized_word},{lemmatizer}\n")
                continue

            progress.write(f"{word} => {lemmatized_word} ({lemmatizer}) added")

            outfile.write(f"{word},{lemmatized_word},{lemmatizer}\n".encode("ascii"))