Update tests and wordlist

Austen Adler 2023-02-15 22:57:30 -05:00
parent c57b0f120c
commit 9a6a68d2a7
9 changed files with 9029 additions and 8810 deletions

.gitignore (vendored, 3 lines changed)

@@ -1,6 +1,9 @@
 /build
+/docs/final_wordlist.csv
 /docs/frequency-all.txt.gz
 /docs/*.html
+/docs/.~lock.*.ods#
 /docs/*.svg
+/docs/venv
 **/.ipynb_checkpoints
 /target

File diff suppressed because it is too large.

Binary file not shown.

docs/requirements.txt (new file, 12 lines)

@@ -0,0 +1,12 @@
click==8.1.3
defusedxml==0.7.1
joblib==1.2.0
nltk==3.8.1
numpy==1.24.2
odfpy==1.4.1
pandas==1.5.3
python-dateutil==2.8.2
pytz==2022.7.1
regex==2022.10.31
six==1.16.0
tqdm==4.64.1


@@ -12,34 +12,21 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Collecting nltk\n",
-" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
-"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
-"\u001b[?25hCollecting odfpy\n",
-" Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
-"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
-"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
-"\u001b[?25hRequirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
-"Collecting regex>=2021.8.3\n",
-" Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
-"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
-"\u001b[?25hRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
+"Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
+"Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
 "Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
-"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
-"Building wheels for collected packages: odfpy\n",
-" Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
-"\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=3ee9aaac0134706d6ef72a359cd2466813f37bd8f080150b008d6d6e247d710c\n",
-" Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
-"Successfully built odfpy\n",
-"Installing collected packages: regex, odfpy, nltk\n",
-"Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
+"Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
+"Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
+"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
+"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
 ]
 },
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
+"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
+"[nltk_data] Package wordnet is already up-to-date!\n"
 ]
 }
 ],
@@ -129,6 +116,8 @@
 "[('be', 'bee'),\n",
 " ('by', 'bye'),\n",
 " ('died', 'dyed'),\n",
+" ('cents', 'sense'),\n",
+" ('yellow', 'hello'),\n",
 " ('corps', 'core'),\n",
 " ('ore', 'oar'),\n",
 " ('ore', ' or'),\n",
@@ -160,6 +149,9 @@
 " ('aluminium', 'aluminum'),\n",
 " ('isle', 'aisle'),\n",
 " ('boulder', 'bolder'),\n",
+" ('blew', 'blue'),\n",
+" ('reformed', 'reform'),\n",
+" ('scent', 'sense'),\n",
 " ('ads', 'adds'),\n",
 " ('honours', 'honors'),\n",
 " ('bot', 'bought'),\n",
@@ -210,13 +202,13 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"# all_words: 21287\n",
+"# all_words: 21285\n",
 "sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n",
 "\n",
-"# lemmatize_mappings: 21341\n",
+"# lemmatize_mappings: 21344\n",
 "sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n",
 "\n",
-"# distinct_words: 17557\n",
+"# distinct_words: 17555\n",
 "sample:\n"
 ]
 },
@@ -360,7 +352,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 12,
+"execution_count": 7,
 "id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
 "metadata": {
 "tags": []
@@ -380,21 +372,10 @@
 },
 {
 "cell_type": "code",
-"execution_count": 11,
+"execution_count": null,
 "id": "c88fe193-11cc-4a06-a3cf-d1ad85f44d14",
 "metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"'H'"
-]
-},
-"execution_count": 11,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
+"outputs": [],
 "source": []
 },
 {

docs/wordlist.py (new file, 147 lines)

@@ -0,0 +1,147 @@
#!/usr/bin/env python
# coding: utf-8
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import gzip
import re
nltk.download("wordnet")
WORDLIST_SIZE=8192 + 3
# ## First, get the list of excluded words
annotated_words=pd.read_excel("annotated_words.ods")
excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
excluded_words[0:10]
# ## Next, get the list of custom mappings
custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))
custom_maps = [
    (m[1]["word"].lower(), mapping.lower())
    for m in custom_maps.iterrows()
    for mapping in m[1]["maps_to"]
]
custom_maps
def get_lines(filename):
    with gzip.open(filename, 'r') as f:
        ret = []
        for l in f:
            if len(ret) > 30_000:
                return ret
            ret.append(str(l).lower())
        return ret
lemmatizer = WordNetLemmatizer()
word_re = re.compile(r"^[A-Za-z]+$")
# Start parsing the wordlist
all_words = get_lines("frequency-all.txt.gz")
# Delete header line
all_words = all_words[1:]
# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]
# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]
# Remove all excluded words
all_words = [w for w in all_words if w not in excluded_words]
# Lemmatize all words (plural -> singular)
lemmatize_mappings = [
    (w, lemmatizer.lemmatize(w))
    for w in all_words
    # if w != lemmatizer.lemmatize(w)
]
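# Illustration (assumed WordNet behaviour, not output from this run): plural nouns map
# to their singular, e.g. ("cars", "car"), while base forms map to themselves, e.g. ("car", "car").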
# Remove all words that lemmatize to another word
#all_words = [w for w in all_words if w not in ]
# Add custom lemmatizations
for l in custom_maps:
    if l in lemmatize_mappings:
        print(f"Warning: {l} is already lemmatized")
    else:
        lemmatize_mappings.append(l)
distinct_words_lemmatized = set()
distinct_words = []
for w in lemmatize_mappings:
    if w[1] not in distinct_words_lemmatized:
        distinct_words_lemmatized.add(w[1])
        distinct_words.append(w[0])
del distinct_words_lemmatized
# Generate a wordlist of word[0] being the word, and w[1] being what that word maps to, or None if it is a distinct word
#wordlist = [(w[0], None if w[0] == w[1] else w[1]) if w[0] == w[1] else w for w in wl]
# Get a list of words that map to other words
# A word was lemmatized if wordnet mapped it to another word (not None) that was different
#only_lemmatized_words = [w for w in wordlist if w[1] is not None and w[0] != w[1]]
# Get a list of distinct lemmatized words
#distinct_lemmatized_words = [w[1] for w in wordlist if w[1] is not None]
#distinct_lemmatized_words = [w for w in pd.unique(distinct_lemmatized_words)]
print(f"# all_words: {len(all_words)}")
print(f"sample: {all_words[0:10]}")
print()
print(f"# lemmatize_mappings: {len(lemmatize_mappings)}")
print(f"sample: {lemmatize_mappings[0:10]}")
print()
print(f"# distinct_words: {len(distinct_words)}")
print(f"sample:")
distinct_words[0:10]
# ## Generate the final wordlist
# The final wordlist map. Maps a word to its numeric value
# Starting at 1
final_wordlist = {
    w: idx + 1
    for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])
}
reverse_lemmatize_idx = {
    lemmatizer.lemmatize(w): w
    for w in final_wordlist.keys()
}
# Add the lemmatized numbers
for w, lem_w in lemmatize_mappings:
    if lem_w not in reverse_lemmatize_idx:
        # This word is not in the reverse list
        # This happens when the index of the lemmatized word we're working with is too large
        continue
    final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]
assert final_wordlist["it"] == final_wordlist["its"]
assert final_wordlist["its"] >= 0
print(f"Final wordlist size: {len(final_wordlist.keys())}")
sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]
with open("final_wordlist.csv", "w") as f:
    f.write("word,number\n")
    for word, number in sorted(sorted_final_wordlist, key=lambda w: w[1]):
        # The CSV stores zero-based numbers
        f.write(f"{word.upper()},{number - 1}\n")
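A minimal sanity-check sketch for the CSV this script emits (assumptions: final_wordlist.csv is in the working directory and, as in the loop above, its numbers are zero-based; the helper below is illustrative only and not part of this commit):

#!/usr/bin/env python
# Hypothetical sanity check for final_wordlist.csv; not part of the repository.
import csv
from collections import defaultdict

number_to_words = defaultdict(list)
with open("final_wordlist.csv", newline="") as f:
    for row in csv.DictReader(f):
        # The generator writes zero-based numbers (number - 1 above)
        number_to_words[int(row["number"])].append(row["word"])

# Words that share a number are interchangeable forms (lemmatized or custom-mapped)
groups = [words for words in number_to_words.values() if len(words) > 1]
print(f"distinct numbers: {len(number_to_words)}")
print(f"numbers with more than one word: {len(groups)}")
print(f"example group: {groups[0] if groups else None}")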


@@ -66,6 +66,7 @@ impl FromStr for Address<'_> {
         }
 
         // Check if either the beginning or end is a number
+        // These unwraps are okay because we checked that components is not empty above
         let (reverse, number) = if let Ok(number) = components.first().unwrap().parse::<Number>() {
             // The number is the first component
             (false, number)
@@ -133,13 +134,13 @@ mod tests {
     use super::*;
 
     macro_rules! w {
-        ($word:tt) => {
-            words::get_word($word).unwrap()
+        ($word:ident) => {
+            words::get_word(stringify!($word)).unwrap()
         };
     }
 
     macro_rules! addr {
-        ($number:tt, $word0:tt, $word1:tt, $word2:tt) => {
+        ($number:tt, $word0:ident, $word1:ident, $word2:ident) => {
             Address {
                 number: $number,
                 words: [w!($word0), w!($word1), w!($word2)],
@@ -159,29 +160,68 @@ mod tests {
         assert_eq!(extract_version(0b01 << 10), 0b01);
     }
 
+    #[test]
+    fn test_address_from_str() {
+        for i in &[
+            // Regular
+            "grape orange apple 1000",
+            // Reverse
+            "1000 apple orange grape",
+            // Whitespace everywhere
+            "\t\tgrape\n\t orange apple 1000 \t ",
+            // Mixed case
+            "\n1000 APPlE oRAnGE GrAPe\n",
+        ] {
+            eprintln!("Testing {i:?}");
+            assert_eq!(Address::from_str(i), Ok(addr![1000, apple, orange, grape]));
+        }
+
+        for i in &[
+            // Too small
+            "",
+            " ",
+            "1000",
+            "1000 orange blue",
+            // Not a word
+            "1000 ajlkdsf alskdjasldkj fas",
+            // Number too large
+            "grape orange apple 10000",
+            // Number too small
+            "0 apple orange grape",
+            // No number
+            "grape orange apple mix",
+            "grape orange apple 1e4",
+            "grape orange apple 1025",
+        ] {
+            eprintln!("Testing {i:?}");
+            assert!(Address::from_str(i).is_err());
+        }
+    }
+
     #[test]
     fn test_parse_v0() {
+        // Regular case
         assert_eq!(
             Address::parse_v0(1000, vec!["apple", "orange", "grape"]),
-            Ok(addr![1000, "apple", "orange", "grape"])
+            Ok(addr![1000, apple, orange, grape])
         );
 
         // Number is on the edge
         assert_eq!(
             Address::parse_v0(1, vec!["apple", "orange", "grape"]),
-            Ok(addr![1, "apple", "orange", "grape"])
+            Ok(addr![1, apple, orange, grape])
         );
         assert_eq!(
             Address::parse_v0(1024, vec!["apple", "orange", "grape"]),
-            Ok(addr![1024, "apple", "orange", "grape"])
+            Ok(addr![1024, apple, orange, grape])
         );
         assert_eq!(
             Address::parse_v0(V0_MAX_NUMBER, vec!["apple", "orange", "grape"]),
-            Ok(addr![V0_MAX_NUMBER, "apple", "orange", "grape"])
+            Ok(addr![V0_MAX_NUMBER, apple, orange, grape])
         );
         assert_eq!(
             Address::parse_v0(V0_MIN_NUMBER, vec!["apple", "orange", "grape"]),
-            Ok(addr![V0_MIN_NUMBER, "apple", "orange", "grape"])
+            Ok(addr![V0_MIN_NUMBER, apple, orange, grape])
         );
 
         // Word not found


@@ -11,12 +11,15 @@ use std::path::Path;
 fn main() {
     let path = Path::new(&env::var("OUT_DIR").unwrap()).join("codegen.rs");
     let mut file = BufWriter::new(File::create(path).unwrap());
 
+    let wordlist_path = "../data/wordlist-tmp.csv";
+    println!("cargo:rerun-if-changed={}", wordlist_path);
+
     let rdr_builder = ReaderBuilder::new();
 
     // First get the actual wordlist
     let words: Vec<Word> = rdr_builder
-        .from_reader(File::open("../data/wordlist-tmp.csv").unwrap())
+        .from_reader(File::open(wordlist_path).unwrap())
         .deserialize()
         .collect::<Result<Vec<Word>, _>>()
         .unwrap();
@@ -43,7 +46,7 @@ pub const WORDS: &[Word] = &["#
         writeln!(&mut file, "\t{result:?},").unwrap();
     }
     writeln!(&mut file, "];\n").unwrap();
 }
 
 fn write_word_map(mut file: impl Write, words: &[Word]) {
     let mut word_map = phf_codegen::Map::new();
@@ -59,7 +62,7 @@ fn write_word_map(mut file: impl Write, words: &[Word]) {
         word_map.build()
     )
     .unwrap();
 }
 
 fn write_number_to_words(mut file: impl Write, words: &[Word]) {
     let word_number_to_idx = words
@@ -88,7 +91,7 @@ fn write_number_to_words(mut file: impl Write, words: &[Word]) {
         writeln!(&mut file, "],").unwrap();
     }
     writeln!(&mut file, "];\n").unwrap();
 }
 
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Word {


@@ -53,3 +53,35 @@ where
         .get(&maybe_word.as_ref().trim().to_ascii_uppercase())
         .copied()
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_equivalence() {
+        // Test equivalence
+        macro_rules! te {
+            ($word1:ident, $word2:ident) => {
+                eprintln!("Checking if {:?} is a word", stringify!($word1));
+                assert!(get_word(stringify!($word1)).is_some());
+                eprintln!("Checking if {:?} is a word", stringify!($word2));
+                assert!(get_word(stringify!($word2)).is_some());
+                eprintln!("Checking equivalence");
+                assert_eq!(
+                    get_word(stringify!($word1)).unwrap().number,
+                    get_word(stringify!($word2)).unwrap().number
+                );
+            };
+        }
+
+        // Homonyms
+        te!(blue, blew);
+        te!(yellow, hello);
+
+        // Plurals
+        te!(sent, sense);
+        te!(sent, scents);
+        te!(sent, cents);
+    }
+}
}