#!/usr/bin/env python3
# coding: utf-8

print("Step 1")

try:
    _initialized
except NameError:
    # !pip install spacy
    # !python -m spacy download en_core_web_trf
    import spacy
    from tqdm import tqdm

    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])

    _initialized = True
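
# The try/except above is a one-time-initialization guard: _initialized is
# unbound on the first run, so the imports and the slow model load happen
# exactly once, and re-runs in the same interactive session skip them. The
# parser and NER pipeline components are not needed for lemmatization, so
# they are disabled for speed.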

import pandas as pd
import gzip
import re


print("Step 2")


def get_lines(filename):
    # Read the gzipped word list as text, lowercased, capped at ~30,000 lines
    with gzip.open(filename, 'rt') as f:
        ret = []
        for l in f:
            if len(ret) > 30_000:
                return ret
            ret.append(l.lower())
        return ret
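
# The frequency file is assumed to be sorted by descending frequency, so
# capping the read at 30,000 lines keeps only the most common entries.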


WORDLIST_SIZE = 8192 + 3
word_re = re.compile(r"^[A-Za-z]+$")


print("Step 3")


# pandas reads .ods spreadsheets via the odfpy engine
annotated_words = pd.read_excel("annotated_words.ods")

excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
print(excluded_words[0:10])

custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word", "maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))

custom_maps = [
    (row["word"].lower(), mapping.lower())
    for _, row in custom_maps.iterrows()
    for mapping in row["maps_to"]
]
print(custom_maps)
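
# custom_maps pairs each annotated word with one of its comma-separated
# "maps_to" targets; a hypothetical row with word="colour" and
# maps_to="color" would contribute the pair ("colour", "color").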


print("Step 4")


# Start parsing the wordlist
all_words = get_lines("00-frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed-width column in the frequency list)
all_words = [w[11:34].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all excluded words
all_words = [w for w in all_words if w not in excluded_words]

# Make sure both halves of every custom mapping are in the word list
for word, mapping in custom_maps:
    if word not in all_words:
        all_words.append(word)
    if mapping not in all_words:
        all_words.append(mapping)
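
# all_words is still in frequency order: the most common words come first,
# with the custom-mapping words appended at the end.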


print("Step 5")


# Lemmatize all words (plural -> singular)
lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
print(lemmatize_mappings[:100])
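
# Hypothetical sanity check of the kind of pair this step produces (assumes
# the model lemmatizes regular forms as expected):
#   assert nlp("geese")[0].lemma_ == "goose"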

# Add custom lemmatizations
for pair in custom_maps:
    if pair in lemmatize_mappings:
        print(f"Warning: {pair} is already lemmatized")
    else:
        lemmatize_mappings.append(pair)

print(lemmatize_mappings[:100])

# Drop mappings whose target is an excluded word
lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]
print(lemmatize_mappings[:100])

# Now, re-add all lemmatized words to the list of every word
for w in sum(lemmatize_mappings, ()):
    if w not in all_words:
        print(w)
        all_words.append(w)

# Collapse the pair list into a dict; when the same word appears in several
# pairs, the later pair wins, so custom mappings override the spaCy lemma
lemmatize_mappings = {k: v for k, v in lemmatize_mappings}


print("Step 6")


final_wordlist = []
seen_lemmatizations = set()
for w in all_words:
    lemmatized = lemmatize_mappings.get(w) or w
    if lemmatized in seen_lemmatizations:
        # The lemmatized version of this word was already seen
        continue
    else:
        # The lemmatized version hasn't been seen. We're good to add it:
        # collect every surface form that shares this lemma into one group
        final_wordlist.append([
            k
            for k in lemmatize_mappings.keys()
            if lemmatize_mappings[k] == lemmatized
        ])
        seen_lemmatizations.add(lemmatized)

    if len(final_wordlist) >= WORDLIST_SIZE:
        break
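
# final_wordlist is now a list of groups, one group per accepted lemma, with
# each group holding every surface form that maps to that lemma.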

# Now, convert it to the format (number, word); all words in a group share
# the same number
final_wordlist = [
    (idx, w)
    for idx, words in enumerate(final_wordlist)
    for w in words
]


print("Step 7")

print(len(lemmatize_mappings))

print("Step 8")

with open("01-generated-wordlist.csv", "w") as f:
    f.write("word,number\n")

    for idx, word in final_wordlist:
        f.write(f"{word.upper()},{idx}\n")
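
# Hypothetical output rows; surface forms that share a lemma share a number:
#   GOOSE,1234
#   GEESE,1234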

print("Done")