#!/usr/bin/env python
# coding: utf-8
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import gzip
import re
# The WordNet corpus is required by WordNetLemmatizer below
nltk.download("wordnet")
WORDLIST_SIZE = 8192 + 3  # 2**13 words, plus three extra
# ## First, get the list of excluded words
annotated_words = pd.read_excel("annotated_words.ods")
excluded_words = list(
    annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower()
)
excluded_words[0:10]
# ## Next, get the list of custom mappings
custom_maps = (
    annotated_words[annotated_words["maps_to"].notna()][["word", "maps_to"]]
    .assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))
)
custom_maps = [
    (row["word"].lower(), mapping.lower())
    for _, row in custom_maps.iterrows()
    for mapping in row["maps_to"]
]
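# A sketch of the resulting shape, with hypothetical entries (the real
# pairs come from the "maps_to" column of the spreadsheet):
#   [("colours", "color"), ("mice", "mouse")]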
custom_maps
def get_lines(filename):
    """Return the first ~30,000 lines of a gzipped file, lowercased."""
    with gzip.open(filename, "r") as f:
        ret = []
        for l in f:
            # Only the head of the frequency list is needed for an
            # ~8K-word list, so stop after 30,000 lines
            if len(ret) > 30_000:
                return ret
            # str() on a bytes line yields its "b'...'" repr; the
            # fixed-width slice below operates on that string
            ret.append(str(l).lower())
    return ret
lemmatizer = WordNetLemmatizer()
word_re = re.compile(r"^[A-Za-z]+$")
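# This accepts purely alphabetic ASCII tokens ("word") and rejects anything
# containing digits, apostrophes, or accented letters ("1st", "it's", "café")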
# Start parsing the wordlist
all_words = get_lines("frequency-all.txt.gz")
# Delete header line
all_words = all_words[1:]
# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]
# Keep only purely alphabetic words (drops hyphenated and apostrophized forms)
all_words = [w for w in all_words if word_re.search(w)]
# Remove all excluded words (a set makes the membership test O(1))
excluded_set = set(excluded_words)
all_words = [w for w in all_words if w not in excluded_set]
# Lemmatize all words (plural -> singular)
lemmatize_mappings = [
    (w, lemmatizer.lemmatize(w))
    for w in all_words
    # if w != lemmatizer.lemmatize(w)
]
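# Each entry pairs a surface form with its lemma, e.g. ("dogs", "dog");
# words WordNet cannot reduce map to themselves, e.g. ("the", "the")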
# Remove all words that lemmatize to another word
#all_words = [w for w in all_words if w not in ]
# Add custom lemmatizations
for l in custom_maps:
    if l in lemmatize_mappings:
        print(f"Warning: {l} is already lemmatized")
    else:
        lemmatize_mappings.append(l)
# Keep only the first surface form seen for each lemma
distinct_words_lemmatized = set()
distinct_words = []
for w in lemmatize_mappings:
    if w[1] not in distinct_words_lemmatized:
        distinct_words_lemmatized.add(w[1])
        distinct_words.append(w[0])
del distinct_words_lemmatized
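# Illustration (hypothetical input): if lemmatize_mappings were
# [("dog", "dog"), ("dogs", "dog"), ("cat", "cat")], then distinct_words
# would be ["dog", "cat"] -- "dogs" is skipped because its lemma "dog"
# was already seen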
# Generate a wordlist where w[0] is the word and w[1] is what it maps to,
# or None if it is a distinct word
#wordlist = [(w[0], None if w[0] == w[1] else w[1]) if w[0] == w[1] else w for w in wl]
# Get a list of words that map to other words
# A word was lemmatized if wordnet mapped it to another word (not None) that was different
#only_lemmatized_words = [w for w in wordlist if w[1] is not None and w[0] != w[1]]
# Get a list of distinct lemmatized words
#distinct_lemmatized_words = [w[1] for w in wordlist if w[1] is not None]
#distinct_lemmatized_words = [w for w in pd.unique(distinct_lemmatized_words)]
print(f"# all_words: {len(all_words)}")
print(f"sample: {all_words[0:10]}")
print()
print(f"# lemmatize_mappings: {len(lemmatize_mappings)}")
print(f"sample: {lemmatize_mappings[0:10]}")
print()
print(f"# distinct_words: {len(distinct_words)}")
print(f"sample:")
distinct_words[0:10]
# ## Generate the final wordlist
# The final wordlist map: maps a word to its numeric value, starting at 1
final_wordlist = {
    w: idx + 1
    for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])
}
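# Hypothetical shape: {"the": 1, "of": 2, ...} -- the actual words and
# their order depend on the frequency corpus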
reverse_lemmatize_idx = {
    lemmatizer.lemmatize(w): w
    for w in final_wordlist.keys()
}
# Give each lemmatized variant the same number as its base form
for w, lem_w in lemmatize_mappings:
    if lem_w not in reverse_lemmatize_idx:
        # The base form did not make the WORDLIST_SIZE cutoff, so there
        # is no number to share; skip this variant
        continue
    final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]
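# Continuing the illustration: if "dog" were number 42, "dogs" would now
# also map to 42 (hypothetical numbers)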
# Sanity check: a lemmatized variant shares its number with its base form
assert final_wordlist["its"] == final_wordlist["it"]
assert final_wordlist["its"] >= 0
print(f"Final wordlist size: {len(final_wordlist.keys())}")
sorted_final_wordlist = list(final_wordlist.items())
with open("final_wordlist.csv", "w") as f:
    f.write("word,number\n")
    for word, number in sorted(sorted_final_wordlist, key=lambda w: w[1]):
        # Numbers are written zero-based
        f.write(f"{word.upper()},{number - 1}\n")
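# A minimal round-trip sketch (an assumption about intended use, not part
# of the original script): rebuild a number -> word decoder from the CSV.
# Variants of a word share its number, and the canonical (distinct) word
# is written first for each number, so setdefault keeps the canonical form.
decoder = {}
with open("final_wordlist.csv") as f:
    next(f)  # skip the "word,number" header
    for line in f:
        word, number = line.rstrip("\n").split(",")
        decoder.setdefault(int(number), word)
print(f"word for number 0: {decoder.get(0)}")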