Update tests and wordlist
This commit is contained in:
parent c57b0f120c
commit 9a6a68d2a7
3 .gitignore vendored
@@ -1,6 +1,9 @@
/build
/docs/final_wordlist.csv
/docs/frequency-all.txt.gz
/docs/*.html
/docs/.~lock.*.ods#
/docs/*.svg
/docs/venv
**/.ipynb_checkpoints
/target
17441 data/wordlist-tmp.csv
File diff suppressed because it is too large
Binary file not shown.
12 docs/requirements.txt Normal file
@@ -0,0 +1,12 @@
click==8.1.3
defusedxml==0.7.1
joblib==1.2.0
nltk==3.8.1
numpy==1.24.2
odfpy==1.4.1
pandas==1.5.3
python-dateutil==2.8.2
pytz==2022.7.1
regex==2022.10.31
six==1.16.0
tqdm==4.64.1
@@ -12,34 +12,21 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting nltk\n",
" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hCollecting odfpy\n",
" Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hRequirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
"Collecting regex>=2021.8.3\n",
" Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
"Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
"Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
"Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
"Building wheels for collected packages: odfpy\n",
" Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=3ee9aaac0134706d6ef72a359cd2466813f37bd8f080150b008d6d6e247d710c\n",
" Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
"Successfully built odfpy\n",
"Installing collected packages: regex, odfpy, nltk\n",
"Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
"Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
"Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
}
],
@@ -129,6 +116,8 @@
"[('be', 'bee'),\n",
" ('by', 'bye'),\n",
" ('died', 'dyed'),\n",
" ('cents', 'sense'),\n",
" ('yellow', 'hello'),\n",
" ('corps', 'core'),\n",
" ('ore', 'oar'),\n",
" ('ore', ' or'),\n",
@@ -160,6 +149,9 @@
" ('aluminium', 'aluminum'),\n",
" ('isle', 'aisle'),\n",
" ('boulder', 'bolder'),\n",
" ('blew', 'blue'),\n",
" ('reformed', 'reform'),\n",
" ('scent', 'sense'),\n",
" ('ads', 'adds'),\n",
" ('honours', 'honors'),\n",
" ('bot', 'bought'),\n",
@@ -210,13 +202,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
"# all_words: 21287\n",
"# all_words: 21285\n",
"sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n",
"\n",
"# lemmatize_mappings: 21341\n",
"# lemmatize_mappings: 21344\n",
"sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n",
"\n",
"# distinct_words: 17557\n",
"# distinct_words: 17555\n",
"sample:\n"
]
},
@@ -360,7 +352,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 7,
"id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
"metadata": {
"tags": []
@@ -380,21 +372,10 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "c88fe193-11cc-4a06-a3cf-d1ad85f44d14",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'H'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": []
},
{
147 docs/wordlist.py Normal file
@@ -0,0 +1,147 @@
#!/usr/bin/env python
# coding: utf-8

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import gzip
import re

nltk.download("wordnet")

WORDLIST_SIZE=8192 + 3


# ## First, get the list of excluded words

annotated_words=pd.read_excel("annotated_words.ods")


excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
excluded_words[0:10]


# ## Next, get the list of custom mappings

custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))

custom_maps = [
    (m[1]["word"].lower(), mapping.lower())
    for m in custom_maps.iterrows()
    for mapping in m[1]["maps_to"]
]
custom_maps


def get_lines(filename):
    with gzip.open(filename, 'r') as f:
        ret = []
        for l in f:
            if len(ret) > 30_000:
                return ret
            ret.append(str(l).lower())
        return ret

lemmatizer = WordNetLemmatizer()
word_re = re.compile(r"^[A-Za-z]+$")

# Start parsing the wordlist
all_words = get_lines("frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all removed words
all_words = [w for w in all_words if w not in excluded_words]

# Lemmatize all words (plural -> singular)
lemmatize_mappings = [
    (w, lemmatizer.lemmatize(w))
    for w in all_words
    # if w != lemmatizer.lemmatize(w)
]

# Remove all words that lemmatize to another word
#all_words = [w for w in all_words if w not in ]

# Add custom lemmatizations
for l in custom_maps:
    if l in lemmatize_mappings:
        print(f"Warning: {l} is already lemmatized")
    else:
        lemmatize_mappings.append(l)

distinct_words_lemmatized = set()
distinct_words = []
for w in lemmatize_mappings:
    if w[1] not in distinct_words_lemmatized:
        distinct_words_lemmatized.add(w[1])
        distinct_words.append(w[0])
del distinct_words_lemmatized

# Generate a wordlist of word[0] being the word, and w[1] being what that word maps to, or None if it is a distinct word
#wordlist = [(w[0], None if w[0] == w[1] else w[1]) if w[0] == w[1] else w for w in wl]

# Get a list of words that map to other words
# A word was lemmatized if wordnet mapped it to another word (not None) that was different
#only_lemmatized_words = [w for w in wordlist if w[1] is not None and w[0] != w[1]]

# Get a list of distinct lemmatized words
#distinct_lemmatized_words = [w[1] for w in wordlist if w[1] is not None]
#distinct_lemmatized_words = [w for w in pd.unique(distinct_lemmatized_words)]

print(f"# all_words: {len(all_words)}")
print(f"sample: {all_words[0:10]}")
print()
print(f"# lemmatize_mappings: {len(lemmatize_mappings)}")
print(f"sample: {lemmatize_mappings[0:10]}")
print()
print(f"# distinct_words: {len(distinct_words)}")
print(f"sample:")
distinct_words[0:10]

# ## Generate the final wordlist

# The final wordlist map. Maps a word to its numeric value
# Starting at 1
final_wordlist = {
    w: idx + 1
    for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])
}

reverse_lemmatize_idx = {
    lemmatizer.lemmatize(w): w
    for w in final_wordlist.keys()
}

# Add the lemmatized numbers
for w, lem_w in lemmatize_mappings:
    if lem_w not in reverse_lemmatize_idx:
        # This word is not in the reverse list
        # This happens when the index of the lemmatized word we're working with is too large
        continue

    final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]

assert final_wordlist["its"] == final_wordlist["its"]
assert final_wordlist["its"] >= 0

print(f"Final wordlist size: {len(final_wordlist.keys())}")


sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]

with open("final_wordlist.csv", "w") as f:
    f.write("word,number\n")

    for w in sorted(sorted_final_wordlist, key=lambda w: w[1]):
        lemmatized = "" if not w[1] else w[1]
        f.write(f"{w[0].upper()},{lemmatized - 1}")
        f.write("\n")
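Note on the output format: wordlist.py writes final_wordlist.csv with a "word,number" header, upper-cased words, and zero-based numbers (the "lemmatized - 1" above), and a word shares its number with its lemmatized or homophone forms. The following is a minimal sketch, not part of this commit, of how that CSV could be read back in Python; only the format produced by the script above is assumed.

import csv

def load_wordlist(path="final_wordlist.csv"):
    # Read the CSV written by wordlist.py into a word -> number dict.
    # Assumption: header "word,number", upper-cased words, zero-based numbers.
    mapping = {}
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            # Several words (a word plus its lemmatized or homophone forms)
            # may share the same number.
            mapping[row["word"]] = int(row["number"])
    return mapping

if __name__ == "__main__":
    words = load_wordlist()
    print(words.get("THE"), words.get("OF"))  # sample lookups; keys are upper-cased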
@@ -66,6 +66,7 @@ impl FromStr for Address<'_> {
        }

        // Check if either the beginning or end is a number
        // These unwraps are okay because we checked that components is not empty above
        let (reverse, number) = if let Ok(number) = components.first().unwrap().parse::<Number>() {
            // The number is the first component
            (false, number)
@@ -133,13 +134,13 @@ mod tests {
    use super::*;

    macro_rules! w {
        ($word:tt) => {
            words::get_word($word).unwrap()
        ($word:ident) => {
            words::get_word(stringify!($word)).unwrap()
        };
    }

    macro_rules! addr {
        ($number:tt, $word0:tt, $word1:tt, $word2:tt) => {
        ($number:tt, $word0:ident, $word1:ident, $word2:ident) => {
            Address {
                number: $number,
                words: [w!($word0), w!($word1), w!($word2)],
@@ -159,29 +160,68 @@ mod tests {
        assert_eq!(extract_version(0b01 << 10), 0b01);
    }

    #[test]
    fn test_address_from_str() {
        for i in &[
            // Regular
            "grape orange apple 1000",
            // Reverse
            "1000 apple orange grape",
            // Whitespace everywhere
            "\t\tgrape\n\t orange apple 1000 \t ",
            // Mixed case
            "\n1000 APPlE oRAnGE GrAPe\n",
        ] {
            eprintln!("Testing {i:?}");
            assert_eq!(Address::from_str(i), Ok(addr![1000, apple, orange, grape]));
        }

        for i in &[
            // Too small
            "",
            " ",
            "1000",
            "1000 orange blue",
            // Not a word
            "1000 ajlkdsf alskdjasldkj fas",
            // Number too large
            "grape orange apple 10000",
            // Number too small
            "0 apple orange grape",
            // No number
            "grape orange apple mix",
            "grape orange apple 1e4",
            "grape orange apple 1025",
        ] {
            eprintln!("Testing {i:?}");
            assert!(Address::from_str(i).is_err());
        }
    }

    #[test]
    fn test_parse_v0() {
        // Regular case
        assert_eq!(
            Address::parse_v0(1000, vec!["apple", "orange", "grape"]),
            Ok(addr![1000, "apple", "orange", "grape"])
            Ok(addr![1000, apple, orange, grape])
        );

        // Number is on the edge
        assert_eq!(
            Address::parse_v0(1, vec!["apple", "orange", "grape"]),
            Ok(addr![1, "apple", "orange", "grape"])
            Ok(addr![1, apple, orange, grape])
        );
        assert_eq!(
            Address::parse_v0(1024, vec!["apple", "orange", "grape"]),
            Ok(addr![1024, "apple", "orange", "grape"])
            Ok(addr![1024, apple, orange, grape])
        );
        assert_eq!(
            Address::parse_v0(V0_MAX_NUMBER, vec!["apple", "orange", "grape"]),
            Ok(addr![V0_MAX_NUMBER, "apple", "orange", "grape"])
            Ok(addr![V0_MAX_NUMBER, apple, orange, grape])
        );
        assert_eq!(
            Address::parse_v0(V0_MIN_NUMBER, vec!["apple", "orange", "grape"]),
            Ok(addr![V0_MIN_NUMBER, "apple", "orange", "grape"])
            Ok(addr![V0_MIN_NUMBER, apple, orange, grape])
        );

        // Word not found

@@ -11,12 +11,15 @@ use std::path::Path;
fn main() {
    let path = Path::new(&env::var("OUT_DIR").unwrap()).join("codegen.rs");
    let mut file = BufWriter::new(File::create(path).unwrap());
    let wordlist_path = "../data/wordlist-tmp.csv";

    println!("cargo:rerun-if-changed={}", wordlist_path);

    let rdr_builder = ReaderBuilder::new();

    // First get the actual wordlist
    let words: Vec<Word> = rdr_builder
        .from_reader(File::open("../data/wordlist-tmp.csv").unwrap())
        .from_reader(File::open(wordlist_path).unwrap())
        .deserialize()
        .collect::<Result<Vec<Word>, _>>()
        .unwrap();
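The build script deserializes each CSV row into a Word record, and the tests below read its number field; the struct definition itself is outside this diff. A minimal sketch of a compatible shape, assuming the "word,number" columns generated above and serde-based deserialization through the csv crate's deserialize():

use serde::Deserialize;

// Sketch only; the crate's real definition may differ.
#[derive(Debug, Deserialize)]
struct Word {
    // Upper-cased word as written by the generator.
    word: String,
    // Number shared by a word and its lemmatized/homophone forms.
    number: u64,
}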
@@ -53,3 +53,35 @@ where
        .get(&maybe_word.as_ref().trim().to_ascii_uppercase())
        .copied()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_equivalence() {
        // Test equivalence
        macro_rules! te {
            ($word1:ident, $word2: ident) => {
                eprintln!("Checking if {:?} is a word", stringify!($word1));
                assert!(get_word(stringify!($word1)).is_some());
                eprintln!("Checking if {:?} is a word", stringify!($word2));
                assert!(get_word(stringify!($word2)).is_some());
                eprintln!("Checking equivalence");
                assert_eq!(
                    get_word(stringify!($word1)).unwrap().number,
                    get_word(stringify!($word2)).unwrap().number
                );
            };
        }

        // Homonyms
        te!(blue, blew);
        te!(yellow, hello);

        // Plurals
        te!(sent, sense);
        te!(sent, scents);
        te!(sent, cents);
    }
}