#!/usr/bin/env python
# coding: utf-8

import gzip
import re

import nltk
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer

nltk.download("wordnet")

WORDLIST_SIZE = 8192 + 3

# ## First, get the list of excluded words

annotated_words = pd.read_excel("annotated_words.ods")

excluded_words = list(
    annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower()
)
excluded_words[0:10]

# ## Next, get the list of custom mappings

custom_maps = annotated_words[annotated_words["maps_to"].notna()][
    ["word", "maps_to"]
].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))

custom_maps = [
    (row["word"].lower(), mapping.lower())
    for _, row in custom_maps.iterrows()
    for mapping in row["maps_to"]
]
custom_maps


def get_lines(filename):
    """Read up to 30,000 lines from a gzipped text file, lowercased."""
    with gzip.open(filename, "rt") as f:
        ret = []
        for line in f:
            if len(ret) > 30_000:
                return ret
            ret.append(line.lower())
        return ret


lemmatizer = WordNetLemmatizer()
word_re = re.compile(r"^[A-Za-z]+$")

# Start parsing the wordlist
all_words = get_lines("frequency-all.txt.gz")

# Drop the header line
all_words = all_words[1:]

# Extract only the word from its fixed-width column
# (offsets assume the decoded text layout of the frequency file)
all_words = [w[11:34].strip() for w in all_words]

# Remove words containing special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all excluded words
excluded = set(excluded_words)
all_words = [w for w in all_words if w not in excluded]

# Lemmatize all words (plural -> singular)
lemmatize_mappings = [(w, lemmatizer.lemmatize(w)) for w in all_words]

# Add the custom lemmatizations
for m in custom_maps:
    if m in lemmatize_mappings:
        print(f"Warning: mapping {m} is already present")
    else:
        lemmatize_mappings.append(m)

# Keep only the first surface form seen for each distinct lemma
distinct_words_lemmatized = set()
distinct_words = []
for word, lemma in lemmatize_mappings:
    if lemma not in distinct_words_lemmatized:
        distinct_words_lemmatized.add(lemma)
        distinct_words.append(word)
del distinct_words_lemmatized

print(f"# all_words: {len(all_words)}")
print(f"sample: {all_words[0:10]}")
print()
print(f"# lemmatize_mappings: {len(lemmatize_mappings)}")
print(f"sample: {lemmatize_mappings[0:10]}")
print()
print(f"# distinct_words: {len(distinct_words)}")
print("sample:")
distinct_words[0:10]
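# ## Quick sanity check: how lemmatization collapses variants
#
# A minimal, illustrative check (the sample words are chosen here, not taken
# from the frequency list): WordNetLemmatizer maps regular and irregular
# plurals to their singular form, which is why only the first surface form
# per lemma survives in distinct_words above.
for sample in ["cats", "cat", "geese", "goose"]:
    print(f"{sample} -> {lemmatizer.lemmatize(sample)}")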
# ## Generate the final wordlist

# The final wordlist map. Maps a word to its numeric value, starting at 1.
final_wordlist = {
    w: idx + 1 for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])
}

# Map each lemma back to the surface form that carries its number
reverse_lemmatize_idx = {
    lemmatizer.lemmatize(w): w for w in final_wordlist.keys()
}

# Assign each lemmatized variant the same number as its lemma's surface form
for w, lem_w in lemmatize_mappings:
    if lem_w not in reverse_lemmatize_idx:
        # The lemma is not in the reverse index; this happens when its
        # index fell outside the first WORDLIST_SIZE distinct words
        continue
    final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]

# Sanity checks: a variant shares its lemma's number, and numbers start at 1
assert final_wordlist["its"] == final_wordlist[
    reverse_lemmatize_idx[lemmatizer.lemmatize("its")]
]
assert final_wordlist["its"] >= 1

print(f"Final wordlist size: {len(final_wordlist.keys())}")

sorted_final_wordlist = list(final_wordlist.items())

with open("final_wordlist.csv", "w") as f:
    f.write("word,number\n")
    for word, number in sorted(sorted_final_wordlist, key=lambda w: w[1]):
        # Numbers are written zero-based in the CSV
        f.write(f"{word.upper()},{number - 1}\n")
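# ## Example: reading the CSV back (illustrative)
#
# A minimal sketch of consuming the generated file; the column names come
# from the header written above. Variants of the same lemma share a number,
# so the number column is not unique.
check = pd.read_csv("final_wordlist.csv")
print(check.head(10))
print(f"distinct numbers: {check['number'].nunique()}")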