this_algorithm/wordlist/wordlist-new2.py

#!/usr/bin/env python3
# coding: utf-8

print("Step 1")


try:
    _initialized
except:
    # !pip install spacy
    # !python -m spacy download en_core_web_trf
    import spacy
    from tqdm import tqdm

    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])

    _initialized=True

import pandas as pd
import gzip
import re


print("Step 2")


def get_lines(filename):
    with gzip.open(filename, 'r') as f:
        ret = []
        for l in f:
            if len(ret) > 30_000:
                return ret
            ret.append(str(l).lower())
        return ret


WORDLIST_SIZE = 8192 + 3
word_re = re.compile(r"^[A-Za-z]+$")


print("Step 3")


annotated_words=pd.read_excel("annotated_words.ods")

excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
excluded_words[0:10]

custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))

custom_maps = [
    (m[1]["word"].lower(), mapping.lower())
    for m in custom_maps.iterrows()
    for mapping in m[1]["maps_to"]
]
custom_maps


print("Step 4")


# Start parsing the wordlist
all_words = get_lines("00-frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all removed words
all_words = [w for w in all_words if w not in excluded_words]

# Add all custom mappings
for m in list(sum(custom_maps, ())):
    if m[0] not in all_words:
        all_words.append(m[0])
    if m[1] not in all_words:
        all_words.append(m[1])


print("Step 5")


# Lemmatize all words (plural -> singular)
lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
print(lemmatize_mappings[:100])

# Add custom lemmatizations
for l in custom_maps:
    if l in lemmatize_mappings:
        print(f"Warning: {l} is already lemmatized")
    else:
        lemmatize_mappings.append(l)

print(lemmatize_mappings[:100])

lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]
print(lemmatize_mappings[:100])

# Now, re-add all lematized words to the list of every word
for w in sum(lemmatize_mappings, ()):
    if w not in all_words:
        print(w)
        all_words.append(w)

lemmatize_mappings = {k: v for k, v in lemmatize_mappings}


print("Step 6")


final_wordlist = []
seen_lemmatizations = set()
for w in all_words:
    lemmatized = lemmatize_mappings.get(w) or w
    if lemmatized in seen_lemmatizations:
        # The lemmatized version of this word was already seen
        continue
    else:
        # The lemmatized version hasn't been seen. We're good to add it
        final_wordlist.append([
            k
            for k
            in lemmatize_mappings.keys()
            if lemmatize_mappings[k] == lemmatized
        ])
        seen_lemmatizations.add(lemmatized)

    if len(final_wordlist) >= WORDLIST_SIZE:
        break

# Now, convert it to the format (number, word)
final_wordlist = [
    (idx, w)
    for idx, words in enumerate(final_wordlist)
    for w in words
]


print("Step 7")

print(len(lemmatize_mappings))

print("Step 8")

with open("01-generated-wordlist.csv", "w") as f:
    f.write("word,number\n")

    for w in final_wordlist:
        lemmatized = "" if not w[1] else w[1]
        f.write(f"{w[1].upper()},{w[0]}")
        f.write("\n")

print("Done")
Start working on new wordlist generation 2023-03-01 22:26:06 -05:00			`#!/usr/bin/env python3`
			`# coding: utf-8`

			`print("Step 1")`


			`try:`
			`_initialized`
			`except:`
			`# !pip install spacy`
			`# !python -m spacy download en_core_web_trf`
			`import spacy`
			`from tqdm import tqdm`

			`nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])`

			`_initialized=True`

			`import pandas as pd`
			`import gzip`
			`import re`


			`print("Step 2")`


			`def get_lines(filename):`
			`with gzip.open(filename, 'r') as f:`
			`ret = []`
			`for l in f:`
			`if len(ret) > 30_000:`
			`return ret`
			`ret.append(str(l).lower())`
			`return ret`



			`WORDLIST_SIZE = 8192 + 3`
			`word_re = re.compile(r"^[A-Za-z]+$")`


			`print("Step 3")`


			`annotated_words=pd.read_excel("annotated_words.ods")`

			`excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())`
			`excluded_words[0:10]`

			`custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))`

			`custom_maps = [`
			`(m[1]["word"].lower(), mapping.lower())`
			`for m in custom_maps.iterrows()`
			`for mapping in m[1]["maps_to"]`
			`]`
			`custom_maps`


			`print("Step 4")`


			`# Start parsing the wordlist`
			`all_words = get_lines("00-frequency-all.txt.gz")`

			`# Delete header line`
			`all_words = all_words[1:]`

			`# Get only the word (fixed width)`
			`all_words = [w[13:36].strip() for w in all_words]`

			`# Remove special characters`
			`all_words = [w for w in all_words if word_re.search(w)]`

			`# Remove all removed words`
			`all_words = [w for w in all_words if w not in excluded_words]`

			`# Add all custom mappings`
			`for m in list(sum(custom_maps, ())):`
			`if m[0] not in all_words:`
			`all_words.append(m[0])`
			`if m[1] not in all_words:`
			`all_words.append(m[1])`


			`print("Step 5")`


			`# Lemmatize all words (plural -> singular)`
			`lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]`
			`print(lemmatize_mappings[:100])`

			`# Add custom lemmatizations`
			`for l in custom_maps:`
			`if l in lemmatize_mappings:`
			`print(f"Warning: {l} is already lemmatized")`
			`else:`
			`lemmatize_mappings.append(l)`

			`print(lemmatize_mappings[:100])`

			`lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]`
			`print(lemmatize_mappings[:100])`

			`# Now, re-add all lematized words to the list of every word`
			`for w in sum(lemmatize_mappings, ()):`
			`if w not in all_words:`
			`print(w)`
			`all_words.append(w)`

			`lemmatize_mappings = {k: v for k, v in lemmatize_mappings}`


			`print("Step 6")`


			`final_wordlist = []`
			`seen_lemmatizations = set()`
			`for w in all_words:`
			`lemmatized = lemmatize_mappings.get(w) or w`
			`if lemmatized in seen_lemmatizations:`
			`# The lemmatized version of this word was already seen`
			`continue`
			`else:`
			`# The lemmatized version hasn't been seen. We're good to add it`
			`final_wordlist.append([`
			`k`
			`for k`
			`in lemmatize_mappings.keys()`
			`if lemmatize_mappings[k] == lemmatized`
			`])`
			`seen_lemmatizations.add(lemmatized)`

			`if len(final_wordlist) >= WORDLIST_SIZE:`
			`break`

			`# Now, convert it to the format (number, word)`
			`final_wordlist = [`
			`(idx, w)`
			`for idx, words in enumerate(final_wordlist)`
			`for w in words`
			`]`


			`print("Step 7")`

			`print(len(lemmatize_mappings))`

			`print("Step 8")`

			`with open("01-generated-wordlist.csv", "w") as f:`
			`f.write("word,number\n")`

			`for w in final_wordlist:`
			`lemmatized = "" if not w[1] else w[1]`
			`f.write(f"{w[1].upper()},{w[0]}")`
			`f.write("\n")`

			`print("Done")`