Update tests and wordlist

parent c57b0f120c
commit 9a6a68d2a7
.gitignore (vendored): 3 changes

@@ -1,6 +1,9 @@
 /build
+/docs/final_wordlist.csv
 /docs/frequency-all.txt.gz
 /docs/*.html
+/docs/.~lock.*.ods#
 /docs/*.svg
+/docs/venv
 **/.ipynb_checkpoints
 /target
data/wordlist-tmp.csv: 17441 changes (file diff suppressed because it is too large)

Binary file not shown.
docs/requirements.txt (new file): 12 changes

@@ -0,0 +1,12 @@
+click==8.1.3
+defusedxml==0.7.1
+joblib==1.2.0
+nltk==3.8.1
+numpy==1.24.2
+odfpy==1.4.1
+pandas==1.5.3
+python-dateutil==2.8.2
+pytz==2022.7.1
+regex==2022.10.31
+six==1.16.0
+tqdm==4.64.1
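
These pins mirror the versions the notebook below reports as already installed. As a quick sanity check, a minimal sketch (not part of this commit; it assumes the interpreter is the environment the docs use, for example the ignored /docs/venv, and that it runs from the repository root) comparing the pins against what is actually installed:

    # Sketch: compare installed package versions against docs/requirements.txt pins.
    from importlib.metadata import version, PackageNotFoundError

    def check_pins(path="docs/requirements.txt"):
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                name, _, pinned = line.partition("==")
                try:
                    installed = version(name)
                except PackageNotFoundError:
                    print(f"{name}: not installed (pinned {pinned})")
                    continue
                status = "ok" if installed == pinned else f"mismatch (installed {installed})"
                print(f"{name}=={pinned}: {status}")

    check_pins()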
@@ -12,34 +12,21 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Collecting nltk\n",
-" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
-"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
-"\u001b[?25hCollecting odfpy\n",
-" Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
-"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
-"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
-"\u001b[?25hRequirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
-"Collecting regex>=2021.8.3\n",
-" Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
-"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
-"\u001b[?25hRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
+"Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
+"Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
 "Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
-"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
-"Building wheels for collected packages: odfpy\n",
-" Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
-"\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=3ee9aaac0134706d6ef72a359cd2466813f37bd8f080150b008d6d6e247d710c\n",
-" Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
-"Successfully built odfpy\n",
-"Installing collected packages: regex, odfpy, nltk\n",
-"Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
+"Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
+"Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
+"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
+"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
 ]
 },
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
+"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
+"[nltk_data] Package wordnet is already up-to-date!\n"
 ]
 }
 ],
@@ -129,6 +116,8 @@
 "[('be', 'bee'),\n",
 " ('by', 'bye'),\n",
 " ('died', 'dyed'),\n",
+" ('cents', 'sense'),\n",
+" ('yellow', 'hello'),\n",
 " ('corps', 'core'),\n",
 " ('ore', 'oar'),\n",
 " ('ore', ' or'),\n",
@@ -160,6 +149,9 @@
 " ('aluminium', 'aluminum'),\n",
 " ('isle', 'aisle'),\n",
 " ('boulder', 'bolder'),\n",
+" ('blew', 'blue'),\n",
+" ('reformed', 'reform'),\n",
+" ('scent', 'sense'),\n",
 " ('ads', 'adds'),\n",
 " ('honours', 'honors'),\n",
 " ('bot', 'bought'),\n",
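
Pairs like ('yellow', 'hello') and ('blew', 'blue') look like the custom maps_to mappings from annotated_words.ods rather than WordNet lemmatizations. A small illustrative sketch (hypothetical data, mirroring the custom_maps comprehension in docs/wordlist.py further down) of how one spreadsheet row expands into (word, maps_to) pairs:

    import pandas as pd

    # Hypothetical stand-in for one row of annotated_words.ods
    annotated = pd.DataFrame({"word": ["Yellow"], "maps_to": ["hello"]})

    custom_maps = [
        (row["word"].lower(), target.lower())
        for _, row in annotated[annotated["maps_to"].notna()].iterrows()
        # comma-separated maps_to cells expand into several pairs
        for target in row["maps_to"].split(",")
    ]
    print(custom_maps)  # [('yellow', 'hello')]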
@@ -210,13 +202,13 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"# all_words: 21287\n",
+"# all_words: 21285\n",
 "sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n",
 "\n",
-"# lemmatize_mappings: 21341\n",
+"# lemmatize_mappings: 21344\n",
 "sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n",
 "\n",
-"# distinct_words: 17557\n",
+"# distinct_words: 17555\n",
 "sample:\n"
 ]
 },
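
The small shifts in these counts are consistent with the wordlist changes above: all_words shrinks as words are excluded, lemmatize_mappings grows with the extra custom pairs, and distinct_words drops because more words now collapse onto an existing entry. A minimal sketch of the collapsing step itself (illustrative only, using the same WordNetLemmatizer the notebook imports):

    import nltk
    from nltk.stem.wordnet import WordNetLemmatizer

    nltk.download("wordnet", quiet=True)
    lemmatizer = WordNetLemmatizer()

    # Inflected forms lemmatize to a shared base form, so they end up sharing one number
    for word in ["cats", "words", "bees"]:
        print(word, "->", lemmatizer.lemmatize(word))  # e.g. cats -> cat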
@@ -360,7 +352,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 12,
+"execution_count": 7,
 "id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
 "metadata": {
 "tags": []
@@ -380,21 +372,10 @@
 },
 {
 "cell_type": "code",
-"execution_count": 11,
+"execution_count": null,
 "id": "c88fe193-11cc-4a06-a3cf-d1ad85f44d14",
 "metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"'H'"
-]
-},
-"execution_count": 11,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
+"outputs": [],
 "source": []
 },
 {
docs/wordlist.py (new file): 147 changes

@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import nltk
+from nltk.stem.wordnet import WordNetLemmatizer
+import pandas as pd
+import gzip
+import re
+
+nltk.download("wordnet")
+
+WORDLIST_SIZE=8192 + 3
+
+
+# ## First, get the list of excluded words
+
+annotated_words=pd.read_excel("annotated_words.ods")
+
+
+excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
+excluded_words[0:10]
+
+
+# ## Next, get the list of custom mappings
+
+custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))
+
+custom_maps = [
+    (m[1]["word"].lower(), mapping.lower())
+    for m in custom_maps.iterrows()
+    for mapping in m[1]["maps_to"]
+]
+custom_maps
+
+
+def get_lines(filename):
+    with gzip.open(filename, 'r') as f:
+        ret = []
+        for l in f:
+            if len(ret) > 30_000:
+                return ret
+            ret.append(str(l).lower())
+        return ret
+
+lemmatizer = WordNetLemmatizer()
+word_re = re.compile(r"^[A-Za-z]+$")
+
+# Start parsing the wordlist
+all_words = get_lines("frequency-all.txt.gz")
+
+# Delete header line
+all_words = all_words[1:]
+
+# Get only the word (fixed width)
+all_words = [w[13:36].strip() for w in all_words]
+
+# Remove special characters
+all_words = [w for w in all_words if word_re.search(w)]
+
+# Remove all removed words
+all_words = [w for w in all_words if w not in excluded_words]
+
+# Lemmatize all words (plural -> singular)
+lemmatize_mappings = [
+    (w, lemmatizer.lemmatize(w))
+    for w in all_words
+    # if w != lemmatizer.lemmatize(w)
+]
+
+# Remove all words that lemmatize to another word
+#all_words = [w for w in all_words if w not in ]
+
+# Add custom lemmatizations
+for l in custom_maps:
+    if l in lemmatize_mappings:
+        print(f"Warning: {l} is already lemmatized")
+    else:
+        lemmatize_mappings.append(l)
+
+distinct_words_lemmatized = set()
+distinct_words = []
+for w in lemmatize_mappings:
+    if w[1] not in distinct_words_lemmatized:
+        distinct_words_lemmatized.add(w[1])
+        distinct_words.append(w[0])
+del distinct_words_lemmatized
+
+# Generate a wordlist of word[0] being the word, and w[1] being what that word maps to, or None if it is a distinct word
+#wordlist = [(w[0], None if w[0] == w[1] else w[1]) if w[0] == w[1] else w for w in wl]
+
+# Get a list of words that map to other words
+# A word was lemmatized if wordnet mapped it to another word (not None) that was different
+#only_lemmatized_words = [w for w in wordlist if w[1] is not None and w[0] != w[1]]
+
+# Get a list of distinct lemmatized words
+#distinct_lemmatized_words = [w[1] for w in wordlist if w[1] is not None]
+#distinct_lemmatized_words = [w for w in pd.unique(distinct_lemmatized_words)]
+
+print(f"# all_words: {len(all_words)}")
+print(f"sample: {all_words[0:10]}")
+print()
+print(f"# lemmatize_mappings: {len(lemmatize_mappings)}")
+print(f"sample: {lemmatize_mappings[0:10]}")
+print()
+print(f"# distinct_words: {len(distinct_words)}")
+print(f"sample:")
+distinct_words[0:10]
+
+
+# ## Generate the final wordlist
+
+# The final wordlist map. Maps a word to its numeric value
+# Starting at 1
+final_wordlist = {
+    w: idx + 1
+    for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])
+}
+
+reverse_lemmatize_idx = {
+    lemmatizer.lemmatize(w): w
+    for w in final_wordlist.keys()
+}
+
+# Add the lemmatized numbers
+for w, lem_w in lemmatize_mappings:
+    if lem_w not in reverse_lemmatize_idx:
+        # This word is not in the reverse list
+        # This happens when the index of the lemmatized word we're working with is too large
+        continue
+
+    final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]
+
+assert final_wordlist["its"] == final_wordlist["its"]
+assert final_wordlist["its"] >= 0
+
+print(f"Final wordlist size: {len(final_wordlist.keys())}")
+
+
+sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]
+
+with open("final_wordlist.csv", "w") as f:
+    f.write("word,number\n")
+
+    for w in sorted(sorted_final_wordlist, key=lambda w: w[1]):
+        lemmatized = "" if not w[1] else w[1]
+        f.write(f"{w[0].upper()},{lemmatized - 1}")
+        f.write("\n")
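
The CSV this script writes has an upper-cased word column and a number column (the stored value is the map value minus one), with lemmatized and custom-mapped variants sharing a number. A minimal read-back sketch (assumed usage, not part of this commit; file name as in the script above):

    import pandas as pd

    wordlist = pd.read_csv("final_wordlist.csv")
    numbers = dict(zip(wordlist["word"], wordlist["number"]))

    def same_number(a, b):
        """Check that two spellings resolve to the same wordlist number."""
        return numbers[a.upper()] == numbers[b.upper()]

    # Mirrors the Rust equivalence test added further down
    print(same_number("blue", "blew"))  # expected True if both words made the list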
@@ -66,6 +66,7 @@ impl FromStr for Address<'_> {
         }
 
         // Check if either the beginning or end is a number
+        // These unwraps are okay because we checked that components is not empty above
         let (reverse, number) = if let Ok(number) = components.first().unwrap().parse::<Number>() {
             // The number is the first component
             (false, number)
@@ -133,13 +134,13 @@ mod tests {
     use super::*;
 
     macro_rules! w {
-        ($word:tt) => {
-            words::get_word($word).unwrap()
+        ($word:ident) => {
+            words::get_word(stringify!($word)).unwrap()
         };
     }
 
     macro_rules! addr {
-        ($number:tt, $word0:tt, $word1:tt, $word2:tt) => {
+        ($number:tt, $word0:ident, $word1:ident, $word2:ident) => {
             Address {
                 number: $number,
                 words: [w!($word0), w!($word1), w!($word2)],
@@ -159,29 +160,68 @@ mod tests {
         assert_eq!(extract_version(0b01 << 10), 0b01);
     }
 
+    #[test]
+    fn test_address_from_str() {
+        for i in &[
+            // Regular
+            "grape orange apple 1000",
+            // Reverse
+            "1000 apple orange grape",
+            // Whitespace everywhere
+            "\t\tgrape\n\t orange apple 1000 \t ",
+            // Mixed case
+            "\n1000 APPlE oRAnGE GrAPe\n",
+        ] {
+            eprintln!("Testing {i:?}");
+            assert_eq!(Address::from_str(i), Ok(addr![1000, apple, orange, grape]));
+        }
+
+        for i in &[
+            // Too small
+            "",
+            " ",
+            "1000",
+            "1000 orange blue",
+            // Not a word
+            "1000 ajlkdsf alskdjasldkj fas",
+            // Number too large
+            "grape orange apple 10000",
+            // Number too small
+            "0 apple orange grape",
+            // No number
+            "grape orange apple mix",
+            "grape orange apple 1e4",
+            "grape orange apple 1025",
+        ] {
+            eprintln!("Testing {i:?}");
+            assert!(Address::from_str(i).is_err());
+        }
+    }
+
     #[test]
     fn test_parse_v0() {
+        // Regular case
         assert_eq!(
             Address::parse_v0(1000, vec!["apple", "orange", "grape"]),
-            Ok(addr![1000, "apple", "orange", "grape"])
+            Ok(addr![1000, apple, orange, grape])
         );
 
         // Number is on the edge
         assert_eq!(
             Address::parse_v0(1, vec!["apple", "orange", "grape"]),
-            Ok(addr![1, "apple", "orange", "grape"])
+            Ok(addr![1, apple, orange, grape])
         );
         assert_eq!(
             Address::parse_v0(1024, vec!["apple", "orange", "grape"]),
-            Ok(addr![1024, "apple", "orange", "grape"])
+            Ok(addr![1024, apple, orange, grape])
         );
         assert_eq!(
             Address::parse_v0(V0_MAX_NUMBER, vec!["apple", "orange", "grape"]),
-            Ok(addr![V0_MAX_NUMBER, "apple", "orange", "grape"])
+            Ok(addr![V0_MAX_NUMBER, apple, orange, grape])
         );
         assert_eq!(
             Address::parse_v0(V0_MIN_NUMBER, vec!["apple", "orange", "grape"]),
-            Ok(addr![V0_MIN_NUMBER, "apple", "orange", "grape"])
+            Ok(addr![V0_MIN_NUMBER, apple, orange, grape])
         );
 
         // Word not found
@@ -11,12 +11,15 @@ use std::path::Path;
 fn main() {
     let path = Path::new(&env::var("OUT_DIR").unwrap()).join("codegen.rs");
     let mut file = BufWriter::new(File::create(path).unwrap());
+    let wordlist_path = "../data/wordlist-tmp.csv";
+
+    println!("cargo:rerun-if-changed={}", wordlist_path);
 
     let rdr_builder = ReaderBuilder::new();
 
     // First get the actual wordlist
     let words: Vec<Word> = rdr_builder
-        .from_reader(File::open("../data/wordlist-tmp.csv").unwrap())
+        .from_reader(File::open(wordlist_path).unwrap())
         .deserialize()
         .collect::<Result<Vec<Word>, _>>()
         .unwrap();
@@ -43,7 +46,7 @@ pub const WORDS: &[Word] = &["#
         writeln!(&mut file, "\t{result:?},").unwrap();
     }
     writeln!(&mut file, "];\n").unwrap();
 }
 
 fn write_word_map(mut file: impl Write, words: &[Word]) {
     let mut word_map = phf_codegen::Map::new();
@@ -59,7 +62,7 @@ fn write_word_map(mut file: impl Write, words: &[Word]) {
         word_map.build()
     )
     .unwrap();
 }
 
 fn write_number_to_words(mut file: impl Write, words: &[Word]) {
     let word_number_to_idx = words
@@ -88,7 +91,7 @@ fn write_number_to_words(mut file: impl Write, words: &[Word]) {
         writeln!(&mut file, "],").unwrap();
     }
     writeln!(&mut file, "];\n").unwrap();
 }
 
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Word {
@@ -53,3 +53,35 @@ where
         .get(&maybe_word.as_ref().trim().to_ascii_uppercase())
         .copied()
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_equivalence() {
+        // Test equivalence
+        macro_rules! te {
+            ($word1:ident, $word2:ident) => {
+                eprintln!("Checking if {:?} is a word", stringify!($word1));
+                assert!(get_word(stringify!($word1)).is_some());
+                eprintln!("Checking if {:?} is a word", stringify!($word2));
+                assert!(get_word(stringify!($word2)).is_some());
+                eprintln!("Checking equivalence");
+                assert_eq!(
+                    get_word(stringify!($word1)).unwrap().number,
+                    get_word(stringify!($word2)).unwrap().number
+                );
+            };
+        }
+
+        // Homonyms
+        te!(blue, blew);
+        te!(yellow, hello);
+
+        // Plurals
+        te!(sent, sense);
+        te!(sent, scents);
+        te!(sent, cents);
+    }
+}