#!/usr/bin/env python3
# coding: utf-8

print("Loading dependencies")

import gzip

import nltk
import spacy
from tqdm import tqdm

# WordNet: importing WordNetLemmatizer succeeds even when the corpus data is
# missing, so check for the data itself and download it if it isn't there.
try:
    nltk.data.find("corpora/wordnet")
except LookupError:
    nltk.download("wordnet")
from nltk.stem.wordnet import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

# spaCy: only lemmatization is needed, so skip the parser and NER components.
nlp = spacy.load("en_core_web_trf", disable=["parser", "ner"])

print("Loading initial wordlist")
words = []
for path in [
    "./00-oxford-5000.txt",
    "./00-desiquintans-nounlist.txt",
    "./00-frequency-list.csv.gz",
]:
    if path.endswith(".gz"):
        with gzip.open(path, "r") as infile:
            next(infile, None)  # skip the CSV header row
            for line in infile:
                word = line.decode("ascii").split(",")[0].strip()
                if word:
                    words.append(word)
    else:
        with open(path, "r") as infile:
            for line in infile:
                word = line.split(",")[0].strip()
                if word:
                    words.append(word)

print("Sample:", words[:5])

print("Lemmatizing words")

# Uppercased set for O(1) membership tests; the lemmas below are uppercased
# too, so the comparison is case-insensitive.
all_words = {word.upper() for word in words}

# seen_lemmatizations = set()
seen_words = set()
with open("./01-errored-lemmatized-words.csv", "w") as erroutfile:
    erroutfile.write("WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER\n")
    with gzip.open("./01-lemmatized-words.csv.gz", "w") as outfile:
        outfile.write("WORD,LEMMATIZED_WORD,LEMMATIZER\n".encode("ascii"))

        # Make a progress bar so logs can be printed without mangling it
        # (named "progress" rather than "iter", which shadows the builtin)
        progress = tqdm(words)

        # Zip the (progress bar-ed) word list with nlp.pipe so spaCy can
        # process words in batches
        for word, spacy_word in zip(progress, nlp.pipe(words)):
            lemmatized_words = [
                (wordnet_lemmatizer.lemmatize(word.lower()).upper(), "WORDNET"),
                (spacy_word[0].lemma_.upper(), "SPACY"),
            ]
            for lemmatized_word, lemmatizer in lemmatized_words:
                # if word == lemmatized_word:
                #     # This word is its own lemmatization
                #     continue

                # Skip words if we've already lemmatized them
                # if (word, lemmatized_word) in seen_lemmatizations:
                #     continue
                # seen_lemmatizations.add((word, lemmatized_word))

                # Skip lemmas that have already been added
                if lemmatized_word in seen_words:
                    progress.write(
                        f"{lemmatized_word} ({lemmatizer})\talready in seen_words"
                    )
                    continue
                seen_words.add(lemmatized_word)

                # Log lemmas that aren't themselves in the wordlist
                if lemmatized_word not in all_words:
                    progress.write(
                        f"{lemmatized_word} ({lemmatizer})\tnot in all_words"
                    )
                    erroutfile.write(f"{word},{lemmatized_word},{lemmatizer}\n")
                    continue

                progress.write(f"{word} => {lemmatized_word} ({lemmatizer}) added")
                outfile.write(
                    f"{word},{lemmatized_word},{lemmatizer}\n".encode("ascii")
                )
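
# --- Optional sanity check: a minimal sketch, safe to delete ----------------
# Re-open the gzipped output written above and echo the header plus the first
# few data rows, to confirm the file round-trips as ASCII CSV. Assumes the
# main loop completed and ./01-lemmatized-words.csv.gz exists.
print("Sanity-checking output")
with gzip.open("./01-lemmatized-words.csv.gz", "rt", encoding="ascii") as check:
    for i, row in enumerate(check):
        print(row.rstrip("\n"))
        if i >= 5:
            break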