this_algorithm/wordlist/01-lemmatized-words.py

#!/usr/bin/env python3
# coding: utf-8

print("Step 1")


try:
    _initialized
except:
    # !pip install spacy
    # !python -m spacy download en_core_web_trf
    import spacy
    from tqdm import tqdm

    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])

    _initialized=True

import pandas as pd
import gzip
import re


print("Step 2")


def get_lines(filename):
    with gzip.open(filename, 'r') as f:
        ret = []
        for l in f:
            if len(ret) > 30_000:
                return ret
            ret.append(str(l).lower())
        return ret


WORDLIST_SIZE = 8192 + 3
word_re = re.compile(r"^[A-Za-z]+$")


print("Step 3")


annotated_words=pd.read_excel("annotated_words.ods")

excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
excluded_words[0:10]

custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))

custom_maps = [
    (m[1]["word"].lower(), mapping.lower())
    for m in custom_maps.iterrows()
    for mapping in m[1]["maps_to"]
]
custom_maps


print("Step 4")


# Start parsing the wordlist
all_words = get_lines("00-frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all removed words
all_words = [w for w in all_words if w not in excluded_words]

# Add all custom mappings
for m in list(sum(custom_maps, ())):
    if m[0] not in all_words:
        all_words.append(m[0])
    if m[1] not in all_words:
        all_words.append(m[1])


print("Step 5")

# Lemmatize all words (plural -> singular)
# lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]

with open("01-lemmatized-words.csv", "w") as f:
    f.write("word,lemmatized_word\n")

    iter = tqdm(all_words[:1000])

    for w in iter:
        lemmatized_word = nlp(w)[0].lemma_.upper()
        if lemmatized_word == w:
            continue
        if lemmatized_word not in all_words:
            iter.write(f"{lemmatized_word} not in all_words")

        f.write(f"{w},{lemmatized_word}\n")