Update tests and wordlist

Austen Adler 2023-02-15 22:57:30 -05:00
parent c57b0f120c
commit 9a6a68d2a7
9 changed files with 9029 additions and 8810 deletions

3
.gitignore vendored

@@ -1,6 +1,9 @@
/build
/docs/final_wordlist.csv
/docs/frequency-all.txt.gz
/docs/*.html
/docs/.~lock.*.ods#
/docs/*.svg
/docs/venv
**/.ipynb_checkpoints
/target

File diff suppressed because it is too large

Binary file not shown.

12
docs/requirements.txt Normal file

@@ -0,0 +1,12 @@
click==8.1.3
defusedxml==0.7.1
joblib==1.2.0
nltk==3.8.1
numpy==1.24.2
odfpy==1.4.1
pandas==1.5.3
python-dateutil==2.8.2
pytz==2022.7.1
regex==2022.10.31
six==1.16.0
tqdm==4.64.1


@@ -12,34 +12,21 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting nltk\n",
" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hCollecting odfpy\n",
" Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hRequirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
"Collecting regex>=2021.8.3\n",
" Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
"Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
"Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
"Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
"Building wheels for collected packages: odfpy\n",
" Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=3ee9aaac0134706d6ef72a359cd2466813f37bd8f080150b008d6d6e247d710c\n",
" Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
"Successfully built odfpy\n",
"Installing collected packages: regex, odfpy, nltk\n",
"Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
"Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
"Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
}
],
@@ -129,6 +116,8 @@
"[('be', 'bee'),\n",
" ('by', 'bye'),\n",
" ('died', 'dyed'),\n",
" ('cents', 'sense'),\n",
" ('yellow', 'hello'),\n",
" ('corps', 'core'),\n",
" ('ore', 'oar'),\n",
" ('ore', ' or'),\n",
@@ -160,6 +149,9 @@
" ('aluminium', 'aluminum'),\n",
" ('isle', 'aisle'),\n",
" ('boulder', 'bolder'),\n",
" ('blew', 'blue'),\n",
" ('reformed', 'reform'),\n",
" ('scent', 'sense'),\n",
" ('ads', 'adds'),\n",
" ('honours', 'honors'),\n",
" ('bot', 'bought'),\n",
@@ -210,13 +202,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
"# all_words: 21287\n",
"# all_words: 21285\n",
"sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n",
"\n",
"# lemmatize_mappings: 21341\n",
"# lemmatize_mappings: 21344\n",
"sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n",
"\n",
"# distinct_words: 17557\n",
"# distinct_words: 17555\n",
"sample:\n"
]
},
@@ -360,7 +352,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 7,
"id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
"metadata": {
"tags": []
@@ -380,21 +372,10 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "c88fe193-11cc-4a06-a3cf-d1ad85f44d14",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'H'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": []
},
{

147
docs/wordlist.py Normal file

@@ -0,0 +1,147 @@
#!/usr/bin/env python
# coding: utf-8
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import gzip
import re

nltk.download("wordnet")

WORDLIST_SIZE = 8192 + 3

# ## First, get the list of excluded words
annotated_words = pd.read_excel("annotated_words.ods")
excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
excluded_words[0:10]

# ## Next, get the list of custom mappings
custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word", "maps_to"]].assign(
    maps_to=lambda x: x["maps_to"].map(lambda y: y.split(","))
)
custom_maps = [
    (m[1]["word"].lower(), mapping.lower())
    for m in custom_maps.iterrows()
    for mapping in m[1]["maps_to"]
]
custom_maps


def get_lines(filename):
    with gzip.open(filename, "r") as f:
        ret = []
        for l in f:
            if len(ret) > 30_000:
                return ret
            ret.append(str(l).lower())
        return ret


lemmatizer = WordNetLemmatizer()
word_re = re.compile(r"^[A-Za-z]+$")

# Start parsing the wordlist
all_words = get_lines("frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all excluded words
all_words = [w for w in all_words if w not in excluded_words]

# Lemmatize all words (plural -> singular)
lemmatize_mappings = [
    (w, lemmatizer.lemmatize(w))
    for w in all_words
    # if w != lemmatizer.lemmatize(w)
]

# Remove all words that lemmatize to another word
# all_words = [w for w in all_words if w not in ]

# Add custom lemmatizations
for l in custom_maps:
    if l in lemmatize_mappings:
        print(f"Warning: {l} is already lemmatized")
    else:
        lemmatize_mappings.append(l)

distinct_words_lemmatized = set()
distinct_words = []
for w in lemmatize_mappings:
    if w[1] not in distinct_words_lemmatized:
        distinct_words_lemmatized.add(w[1])
        distinct_words.append(w[0])
del distinct_words_lemmatized

# Generate a wordlist of word[0] being the word, and w[1] being what that word maps to, or None if it is a distinct word
# wordlist = [(w[0], None if w[0] == w[1] else w[1]) if w[0] == w[1] else w for w in wl]

# Get a list of words that map to other words
# A word was lemmatized if wordnet mapped it to another word (not None) that was different
# only_lemmatized_words = [w for w in wordlist if w[1] is not None and w[0] != w[1]]

# Get a list of distinct lemmatized words
# distinct_lemmatized_words = [w[1] for w in wordlist if w[1] is not None]
# distinct_lemmatized_words = [w for w in pd.unique(distinct_lemmatized_words)]

print(f"# all_words: {len(all_words)}")
print(f"sample: {all_words[0:10]}")
print()
print(f"# lemmatize_mappings: {len(lemmatize_mappings)}")
print(f"sample: {lemmatize_mappings[0:10]}")
print()
print(f"# distinct_words: {len(distinct_words)}")
print("sample:")
distinct_words[0:10]

# ## Generate the final wordlist

# The final wordlist map. Maps a word to its numeric value
# Starting at 1
final_wordlist = {
    w: idx + 1
    for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])
}

reverse_lemmatize_idx = {
    lemmatizer.lemmatize(w): w
    for w in final_wordlist.keys()
}

# Add the lemmatized numbers
for w, lem_w in lemmatize_mappings:
    if lem_w not in reverse_lemmatize_idx:
        # This word is not in the reverse list
        # This happens when the index of the lemmatized word we're working with is too large
        continue
    final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]

assert final_wordlist["its"] == final_wordlist["its"]
assert final_wordlist["its"] >= 0

print(f"Final wordlist size: {len(final_wordlist.keys())}")

sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]

# The CSV stores zero-based numbers; final_wordlist values start at 1
with open("final_wordlist.csv", "w") as f:
    f.write("word,number\n")
    for w in sorted(sorted_final_wordlist, key=lambda w: w[1]):
        lemmatized = "" if not w[1] else w[1]
        f.write(f"{w[0].upper()},{lemmatized - 1}")
        f.write("\n")

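For intuition, here is a small sketch (not part of the diff) of what the lemmatization and numbering steps above produce. The words and the helper names are made up for illustration, but the flow mirrors the distinct_words, reverse_lemmatize_idx, and final_wordlist steps in docs/wordlist.py.

# Toy (word, lemma) pairs, standing in for lemmatize_mappings plus the custom maps.
lemmatize_mappings = [("car", "car"), ("cars", "car"), ("blue", "blue"), ("blew", "blue")]

# Keep the first word seen for each distinct lemma (the distinct_words loop above).
lemma_to_word = {}
for word, lemma in lemmatize_mappings:
    lemma_to_word.setdefault(lemma, word)

# Number the distinct words starting at 1 (the final_wordlist comprehension above).
final = {w: i + 1 for i, w in enumerate(lemma_to_word.values())}

# Every mapped word then reuses the number of the word that owns its lemma.
for word, lemma in lemmatize_mappings:
    final[word] = final[lemma_to_word[lemma]]

assert final["cars"] == final["car"]
assert final["blew"] == final["blue"]
print(final)  # {'car': 1, 'blue': 2, 'cars': 1, 'blew': 2}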

@@ -66,6 +66,7 @@ impl FromStr for Address<'_> {
}
// Check if either the beginning or end is a number
// These unwraps are okay because we checked that components is not empty above
let (reverse, number) = if let Ok(number) = components.first().unwrap().parse::<Number>() {
// The number is the first component
(false, number)
@@ -133,13 +134,13 @@ mod tests {
use super::*;
macro_rules! w {
($word:tt) => {
words::get_word($word).unwrap()
($word:ident) => {
words::get_word(stringify!($word)).unwrap()
};
}
macro_rules! addr {
($number:tt, $word0:tt, $word1:tt, $word2:tt) => {
($number:tt, $word0:ident, $word1:ident, $word2:ident) => {
Address {
number: $number,
words: [w!($word0), w!($word1), w!($word2)],
@@ -159,29 +160,68 @@ mod tests {
assert_eq!(extract_version(0b01 << 10), 0b01);
}
#[test]
fn test_address_from_str() {
for i in &[
// Regular
"grape orange apple 1000",
// Reverse
"1000 apple orange grape",
// Whitespace everywhere
"\t\tgrape\n\t orange apple 1000 \t ",
// Mixed case
"\n1000 APPlE oRAnGE GrAPe\n",
] {
eprintln!("Testing {i:?}");
assert_eq!(Address::from_str(i), Ok(addr![1000, apple, orange, grape]));
}
for i in &[
// Too small
"",
" ",
"1000",
"1000 orange blue",
// Not a word
"1000 ajlkdsf alskdjasldkj fas",
// Number too large
"grape orange apple 10000",
// Number too small
"0 apple orange grape",
// No number
"grape orange apple mix",
"grape orange apple 1e4",
"grape orange apple 1025",
] {
eprintln!("Testing {i:?}");
assert!(Address::from_str(i).is_err());
}
}
#[test]
fn test_parse_v0() {
// Regular case
assert_eq!(
Address::parse_v0(1000, vec!["apple", "orange", "grape"]),
Ok(addr![1000, "apple", "orange", "grape"])
Ok(addr![1000, apple, orange, grape])
);
// Number is on the edge
assert_eq!(
Address::parse_v0(1, vec!["apple", "orange", "grape"]),
Ok(addr![1, "apple", "orange", "grape"])
Ok(addr![1, apple, orange, grape])
);
assert_eq!(
Address::parse_v0(1024, vec!["apple", "orange", "grape"]),
Ok(addr![1024, "apple", "orange", "grape"])
Ok(addr![1024, apple, orange, grape])
);
assert_eq!(
Address::parse_v0(V0_MAX_NUMBER, vec!["apple", "orange", "grape"]),
Ok(addr![V0_MAX_NUMBER, "apple", "orange", "grape"])
Ok(addr![V0_MAX_NUMBER, apple, orange, grape])
);
assert_eq!(
Address::parse_v0(V0_MIN_NUMBER, vec!["apple", "orange", "grape"]),
Ok(addr![V0_MIN_NUMBER, "apple", "orange", "grape"])
Ok(addr![V0_MIN_NUMBER, apple, orange, grape])
);
// Word not found

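As a rough companion to the new test_address_from_str cases, the Python sketch below (not taken from the crate) restates the parse rule the tests imply: exactly three words plus a number that may sit at either end, with the written word order reversed when the number comes last, and the 1..=1024 bounds suggested by the edge cases. The WORDS set and the function name are stand-ins.

WORDS = {"APPLE", "ORANGE", "GRAPE"}  # stand-in for the generated wordlist

def parse_address(s: str):
    parts = s.upper().split()
    if len(parts) != 4:
        raise ValueError("expected three words and a number")
    if parts[0].isdigit():
        # Number first: words are already in stored order
        number, words = int(parts[0]), parts[1:]
    elif parts[-1].isdigit():
        # Number last: the written words are reversed relative to stored order
        number, words = int(parts[-1]), parts[:-1][::-1]
    else:
        raise ValueError("no number found")
    if not 1 <= number <= 1024:
        raise ValueError("number out of range")
    if any(w not in WORDS for w in words):
        raise ValueError("unknown word")
    return number, words

# Both spellings from the test parse to the same address
assert parse_address("grape orange apple 1000") == (1000, ["APPLE", "ORANGE", "GRAPE"])
assert parse_address("1000 apple orange grape") == (1000, ["APPLE", "ORANGE", "GRAPE"])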

@@ -11,12 +11,15 @@ use std::path::Path;
fn main() {
let path = Path::new(&env::var("OUT_DIR").unwrap()).join("codegen.rs");
let mut file = BufWriter::new(File::create(path).unwrap());
let wordlist_path = "../data/wordlist-tmp.csv";
println!("cargo:rerun-if-changed={}", wordlist_path);
let rdr_builder = ReaderBuilder::new();
// First get the actual wordlist
let words: Vec<Word> = rdr_builder
.from_reader(File::open("../data/wordlist-tmp.csv").unwrap())
.from_reader(File::open(wordlist_path).unwrap())
.deserialize()
.collect::<Result<Vec<Word>, _>>()
.unwrap();
@@ -32,63 +35,63 @@ fn main() {
}
fn write_words(mut file: impl Write, words: &[Word]) {
writeln!(
&mut file,
r#"/// Static array of `Word`
writeln!(
&mut file,
r#"/// Static array of `Word`
pub const WORDS: &[Word] = &["#
)
.unwrap();
)
.unwrap();
for result in words.iter() {
writeln!(&mut file, "\t{result:?},").unwrap();
}
writeln!(&mut file, "];\n").unwrap();
for result in words.iter() {
writeln!(&mut file, "\t{result:?},").unwrap();
}
writeln!(&mut file, "];\n").unwrap();
}
fn write_word_map(mut file: impl Write, words: &[Word]) {
let mut word_map = phf_codegen::Map::new();
for (idx, word) in words.iter().enumerate() {
let idx_str = format!("&WORDS[{idx}]");
word_map.entry(word.word.to_uppercase(), &idx_str);
}
writeln!(
&mut file,
r#"/// Mapping from all caps `&str` to `&'static Word`
let mut word_map = phf_codegen::Map::new();
for (idx, word) in words.iter().enumerate() {
let idx_str = format!("&WORDS[{idx}]");
word_map.entry(word.word.to_uppercase(), &idx_str);
}
writeln!(
&mut file,
r#"/// Mapping from all caps `&str` to `&'static Word`
pub static WORD_MAP: phf::Map<&'static str, &'static Word> =
{};"#,
word_map.build()
)
.unwrap();
}
word_map.build()
)
.unwrap();
}
fn write_number_to_words(mut file: impl Write, words: &[Word]) {
let word_number_to_idx = words
.iter()
.enumerate()
.map(|(idx, w)| (w.number, idx))
.collect::<Vec<(u16, usize)>>();
let word_number_to_idx = words
.iter()
.enumerate()
.map(|(idx, w)| (w.number, idx))
.collect::<Vec<(u16, usize)>>();
writeln!(
&mut file,
// "pub const NUMBER_TO_WORDS: &[&[usize]] = &["
"pub const NUMBER_TO_WORDS: &[&[&'static Word]] = &["
)
.unwrap();
writeln!(
&mut file,
// "pub const NUMBER_TO_WORDS: &[&[usize]] = &["
"pub const NUMBER_TO_WORDS: &[&[&'static Word]] = &["
)
.unwrap();
for entry in word_number_to_idx
.as_slice()
.group_by(|(number1, _idx1), (number2, _idx2)| number1 == number2)
{
write!(&mut file, "\t&[",).unwrap();
for entry in word_number_to_idx
.as_slice()
.group_by(|(number1, _idx1), (number2, _idx2)| number1 == number2)
{
write!(&mut file, "\t&[",).unwrap();
for idx in entry.iter().map(|(_w, idx)| idx) {
// write!(&mut file, "{idx},").unwrap();
write!(&mut file, "&WORDS[{idx}],").unwrap();
}
writeln!(&mut file, "],").unwrap();
for idx in entry.iter().map(|(_w, idx)| idx) {
// write!(&mut file, "{idx},").unwrap();
write!(&mut file, "&WORDS[{idx}],").unwrap();
}
writeln!(&mut file, "];\n").unwrap();
writeln!(&mut file, "],").unwrap();
}
writeln!(&mut file, "];\n").unwrap();
}
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Word {

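For reference, a short sketch (not the build script itself) of the grouping idea behind NUMBER_TO_WORDS: the word list is assumed to be sorted by number, so grouping adjacent entries with equal numbers, which is what slice::group_by does above and itertools.groupby does here, yields one inner slice per number. The sample data is made up.

from itertools import groupby

words = [("APPLE", 1), ("APPLES", 1), ("ORANGE", 2), ("GRAPE", 3), ("GRAPES", 3)]

# Group consecutive entries that share a number, as the codegen loop does.
number_to_words = [
    [w for w, _ in group]
    for _, group in groupby(words, key=lambda pair: pair[1])
]
print(number_to_words)  # [['APPLE', 'APPLES'], ['ORANGE'], ['GRAPE', 'GRAPES']]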

@@ -53,3 +53,35 @@ where
.get(&maybe_word.as_ref().trim().to_ascii_uppercase())
.copied()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_equivalence() {
// Test equivalence
macro_rules! te {
($word1:ident, $word2: ident) => {
eprintln!("Checking if {:?} is a word", stringify!($word1));
assert!(get_word(stringify!($word1)).is_some());
eprintln!("Checking if {:?} is a word", stringify!($word2));
assert!(get_word(stringify!($word2)).is_some());
eprintln!("Checking equivalence");
assert_eq!(
get_word(stringify!($word1)).unwrap().number,
get_word(stringify!($word2)).unwrap().number
);
};
}
// Homonyms
te!(blue, blew);
te!(yellow, hello);
// Plurals
te!(sent, sense);
te!(sent, scents);
te!(sent, cents);
}
}
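
The same equivalence checks can also be run against the generated CSV outside of Rust. A quick sketch (not part of the commit), assuming docs/wordlist.py has produced final_wordlist.csv with its word,number columns and upper-cased words, using pairs taken from the custom mappings shown in the notebook diff above:

import csv

with open("final_wordlist.csv") as f:
    number = {row["word"]: int(row["number"]) for row in csv.DictReader(f)}

# Homophone pairs from the notebook's custom mappings should share a number.
for a, b in [("BLEW", "BLUE"), ("YELLOW", "HELLO"), ("CENTS", "SENSE")]:
    assert number[a] == number[b], f"{a} and {b} should share a number"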