this_algorithm/wordlist/01-lemmatized-words.py

#!/usr/bin/env python3
# coding: utf-8

print("Loading dependencies")

import spacy
import nltk
from tqdm import tqdm
import gzip

# Wordnet
try:
    from nltk.stem.wordnet import WordNetLemmatizer
except:
    nltk.download("wordnet")
    from nltk.stem.wordnet import WordNetLemmatizer
wordnet = WordNetLemmatizer()

# Spacy
nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])

print("Loading initial wordlist")

words = []
with gzip.open("./00-frequency-list.csv.gz", 'r') as infile:
    for line in infile:
        words.append(line.decode('ascii').split(",")[0])

# Remove header
words = words[1:]

print(words[0:5])

print("Lemmatizing words")

seen_lemmatizations = set()

with open("./01-errored-lemmatized-words.csv", 'w') as erroutfile:
    erroutfile.write("WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER\n")

    with gzip.open("./01-lemmatized-words.csv.gz", 'w') as outfile:
        outfile.write("WORD,LEMMATIZED_WORD,LEMMATIZER\n".encode("ascii"))

        iter = tqdm(words)

        for word in iter:
            lemmatized_words = [
                # Wordnet
                (wordnet.lemmatize(word).upper(), 'WORDNET'),
                # Spacy
                (nlp(word)[0].lemma_.upper().upper(), 'SPACY'),
            ]

            for (lemmatized_word, lemmatizer) in lemmatized_words:
                if word == lemmatized_word:
                    continue

                if (word, lemmatized_word) in seen_lemmatizations:
                    continue

                seen_lemmatizations.add((word, lemmatized_word))

                if lemmatized_word not in words:
                    iter.write(f"{lemmatized_word} ({lemmatizer}) not in all_words")
                    erroutfile.write(f"{word},{lemmatized_word},{lemmatizer}\n")
                    continue

                iter.write(f"{word} => {lemmatized_word} ({lemmatizer}) added")

                outfile.write(f"{word},{lemmatized_word},{lemmatizer}\n".encode("ascii"))