Update tests and wordlist

parent c57b0f120c
commit 9a6a68d2a7
.gitignore (vendored): 3 changes

@@ -1,6 +1,9 @@
 /build
+/docs/final_wordlist.csv
 /docs/frequency-all.txt.gz
 /docs/*.html
+/docs/.~lock.*.ods#
 /docs/*.svg
+/docs/venv
 **/.ipynb_checkpoints
 /target
data/wordlist-tmp.csv: 17441 changes (file diff suppressed because it is too large)

Binary file not shown.
docs/requirements.txt (new file): 12 changes

@@ -0,0 +1,12 @@
+click==8.1.3
+defusedxml==0.7.1
+joblib==1.2.0
+nltk==3.8.1
+numpy==1.24.2
+odfpy==1.4.1
+pandas==1.5.3
+python-dateutil==2.8.2
+pytz==2022.7.1
+regex==2022.10.31
+six==1.16.0
+tqdm==4.64.1
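
These pins mirror the versions the notebook below reports as already installed. As a quick sanity check, a minimal sketch (not part of this commit; it assumes the interpreter is the environment the docs use, for example the ignored /docs/venv, and that it runs from the repository root) comparing the pins against what is actually installed:

    # Sketch: compare installed package versions against docs/requirements.txt pins.
    from importlib.metadata import version, PackageNotFoundError

    def check_pins(path="docs/requirements.txt"):
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                name, _, pinned = line.partition("==")
                try:
                    installed = version(name)
                except PackageNotFoundError:
                    print(f"{name}: not installed (pinned {pinned})")
                    continue
                status = "ok" if installed == pinned else f"mismatch (installed {installed})"
                print(f"{name}=={pinned}: {status}")

    check_pins()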
@@ -12,34 +12,21 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Collecting nltk\n",
-" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
-"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
-"\u001b[?25hCollecting odfpy\n",
-" Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
-"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
-"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
-"\u001b[?25hRequirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
-"Collecting regex>=2021.8.3\n",
-" Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
-"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
-"\u001b[?25hRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
+"Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
+"Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
 "Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
-"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
-"Building wheels for collected packages: odfpy\n",
-" Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
-"\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=3ee9aaac0134706d6ef72a359cd2466813f37bd8f080150b008d6d6e247d710c\n",
-" Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
-"Successfully built odfpy\n",
-"Installing collected packages: regex, odfpy, nltk\n",
-"Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
+"Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
+"Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
+"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
+"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
 ]
 },
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
+"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
+"[nltk_data] Package wordnet is already up-to-date!\n"
 ]
 }
 ],
@@ -129,6 +116,8 @@
 "[('be', 'bee'),\n",
 " ('by', 'bye'),\n",
 " ('died', 'dyed'),\n",
+" ('cents', 'sense'),\n",
+" ('yellow', 'hello'),\n",
 " ('corps', 'core'),\n",
 " ('ore', 'oar'),\n",
 " ('ore', ' or'),\n",
@@ -160,6 +149,9 @@
 " ('aluminium', 'aluminum'),\n",
 " ('isle', 'aisle'),\n",
 " ('boulder', 'bolder'),\n",
+" ('blew', 'blue'),\n",
+" ('reformed', 'reform'),\n",
+" ('scent', 'sense'),\n",
 " ('ads', 'adds'),\n",
 " ('honours', 'honors'),\n",
 " ('bot', 'bought'),\n",
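
Pairs like ('yellow', 'hello') and ('blew', 'blue') look like the custom maps_to mappings from annotated_words.ods rather than WordNet lemmatizations. A small illustrative sketch (hypothetical data, mirroring the custom_maps comprehension in docs/wordlist.py further down) of how one spreadsheet row expands into (word, maps_to) pairs:

    import pandas as pd

    # Hypothetical stand-in for one row of annotated_words.ods
    annotated = pd.DataFrame({"word": ["Yellow"], "maps_to": ["hello"]})

    custom_maps = [
        (row["word"].lower(), target.lower())
        for _, row in annotated[annotated["maps_to"].notna()].iterrows()
        # comma-separated maps_to cells expand into several pairs
        for target in row["maps_to"].split(",")
    ]
    print(custom_maps)  # [('yellow', 'hello')]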
@@ -210,13 +202,13 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"# all_words: 21287\n",
+"# all_words: 21285\n",
 "sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n",
 "\n",
-"# lemmatize_mappings: 21341\n",
+"# lemmatize_mappings: 21344\n",
 "sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n",
 "\n",
-"# distinct_words: 17557\n",
+"# distinct_words: 17555\n",
 "sample:\n"
 ]
 },
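
The small shifts in these counts are consistent with the wordlist changes above: all_words shrinks as words are excluded, lemmatize_mappings grows with the extra custom pairs, and distinct_words drops because more words now collapse onto an existing entry. A minimal sketch of the collapsing step itself (illustrative only, using the same WordNetLemmatizer the notebook imports):

    import nltk
    from nltk.stem.wordnet import WordNetLemmatizer

    nltk.download("wordnet", quiet=True)
    lemmatizer = WordNetLemmatizer()

    # Inflected forms lemmatize to a shared base form, so they end up sharing one number
    for word in ["cats", "words", "bees"]:
        print(word, "->", lemmatizer.lemmatize(word))  # e.g. cats -> cat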
@@ -360,7 +352,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 12,
+"execution_count": 7,
 "id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
 "metadata": {
 "tags": []
@@ -380,21 +372,10 @@
 },
 {
 "cell_type": "code",
-"execution_count": 11,
+"execution_count": null,
 "id": "c88fe193-11cc-4a06-a3cf-d1ad85f44d14",
 "metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"'H'"
-]
-},
-"execution_count": 11,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
+"outputs": [],
 "source": []
 },
 {
docs/wordlist.py (new file): 147 changes

@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import nltk
+from nltk.stem.wordnet import WordNetLemmatizer
+import pandas as pd
+import gzip
+import re
+
+nltk.download("wordnet")
+
+WORDLIST_SIZE=8192 + 3
+
+
+# ## First, get the list of excluded words
+
+annotated_words=pd.read_excel("annotated_words.ods")
+
+
+excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
+excluded_words[0:10]
+
+
+# ## Next, get the list of custom mappings
+
+custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))
+
+custom_maps = [
+    (m[1]["word"].lower(), mapping.lower())
+    for m in custom_maps.iterrows()
+    for mapping in m[1]["maps_to"]
+]
+custom_maps
+
+
+def get_lines(filename):
+    with gzip.open(filename, 'r') as f:
+        ret = []
+        for l in f:
+            if len(ret) > 30_000:
+                return ret
+            ret.append(str(l).lower())
+        return ret
+
+lemmatizer = WordNetLemmatizer()
+word_re = re.compile(r"^[A-Za-z]+$")
+
+# Start parsing the wordlist
+all_words = get_lines("frequency-all.txt.gz")
+
+# Delete header line
+all_words = all_words[1:]
+
+# Get only the word (fixed width)
+all_words = [w[13:36].strip() for w in all_words]
+
+# Remove special characters
+all_words = [w for w in all_words if word_re.search(w)]
+
+# Remove all removed words
+all_words = [w for w in all_words if w not in excluded_words]
+
+# Lemmatize all words (plural -> singular)
+lemmatize_mappings = [
+    (w, lemmatizer.lemmatize(w))
+    for w in all_words
+    # if w != lemmatizer.lemmatize(w)
+]
+
+# Remove all words that lemmatize to another word
+#all_words = [w for w in all_words if w not in ]
+
+# Add custom lemmatizations
+for l in custom_maps:
+    if l in lemmatize_mappings:
+        print(f"Warning: {l} is already lemmatized")
+    else:
+        lemmatize_mappings.append(l)
+
+distinct_words_lemmatized = set()
+distinct_words = []
+for w in lemmatize_mappings:
+    if w[1] not in distinct_words_lemmatized:
+        distinct_words_lemmatized.add(w[1])
+        distinct_words.append(w[0])
+del distinct_words_lemmatized
+
+# Generate a wordlist of word[0] being the word, and w[1] being what that word maps to, or None if it is a distinct word
+#wordlist = [(w[0], None if w[0] == w[1] else w[1]) if w[0] == w[1] else w for w in wl]
+
+# Get a list of words that map to other words
+# A word was lemmatized if wordnet mapped it to another word (not None) that was different
+#only_lemmatized_words = [w for w in wordlist if w[1] is not None and w[0] != w[1]]
+
+# Get a list of distinct lemmatized words
+#distinct_lemmatized_words = [w[1] for w in wordlist if w[1] is not None]
+#distinct_lemmatized_words = [w for w in pd.unique(distinct_lemmatized_words)]
+
+print(f"# all_words: {len(all_words)}")
+print(f"sample: {all_words[0:10]}")
+print()
+print(f"# lemmatize_mappings: {len(lemmatize_mappings)}")
+print(f"sample: {lemmatize_mappings[0:10]}")
+print()
+print(f"# distinct_words: {len(distinct_words)}")
+print(f"sample:")
+distinct_words[0:10]
+
+
+# ## Generate the final wordlist
+
+# The final wordlist map. Maps a word to its numeric value
+# Starting at 1
+final_wordlist = {
+    w: idx + 1
+    for idx, w in enumerate(distinct_words[0:WORDLIST_SIZE])
+}
+
+reverse_lemmatize_idx = {
+    lemmatizer.lemmatize(w): w
+    for w in final_wordlist.keys()
+}
+
+# Add the lemmatized numbers
+for w, lem_w in lemmatize_mappings:
+    if lem_w not in reverse_lemmatize_idx:
+        # This word is not in the reverse list
+        # This happens when the index of the lemmatized word we're working with is too large
+        continue
+
+    final_wordlist[w] = final_wordlist[reverse_lemmatize_idx[lem_w]]
+
+assert final_wordlist["its"] == final_wordlist["its"]
+assert final_wordlist["its"] >= 0
+
+print(f"Final wordlist size: {len(final_wordlist.keys())}")
+
+
+sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]
+
+with open("final_wordlist.csv", "w") as f:
+    f.write("word,number\n")
+
+    for w in sorted(sorted_final_wordlist, key=lambda w: w[1]):
+        lemmatized = "" if not w[1] else w[1]
+        f.write(f"{w[0].upper()},{lemmatized - 1}")
+        f.write("\n")
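
The CSV this script writes has an upper-cased word column and a number column (the stored value is the map value minus one), with lemmatized and custom-mapped variants sharing a number. A minimal read-back sketch (assumed usage, not part of this commit; file name as in the script above):

    import pandas as pd

    wordlist = pd.read_csv("final_wordlist.csv")
    numbers = dict(zip(wordlist["word"], wordlist["number"]))

    def same_number(a, b):
        """Check that two spellings resolve to the same wordlist number."""
        return numbers[a.upper()] == numbers[b.upper()]

    # Mirrors the Rust equivalence test added further down
    print(same_number("blue", "blew"))  # expected True if both words made the list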
@@ -66,6 +66,7 @@ impl FromStr for Address<'_> {
         }
 
         // Check if either the beginning or end is a number
+        // These unwraps are okay because we checked that components is not empty above
         let (reverse, number) = if let Ok(number) = components.first().unwrap().parse::<Number>() {
             // The number is the first component
             (false, number)
@@ -133,13 +134,13 @@ mod tests {
     use super::*;
 
     macro_rules! w {
-        ($word:tt) => {
-            words::get_word($word).unwrap()
+        ($word:ident) => {
+            words::get_word(stringify!($word)).unwrap()
         };
     }
 
     macro_rules! addr {
-        ($number:tt, $word0:tt, $word1:tt, $word2:tt) => {
+        ($number:tt, $word0:ident, $word1:ident, $word2:ident) => {
             Address {
                 number: $number,
                 words: [w!($word0), w!($word1), w!($word2)],
@@ -159,29 +160,68 @@ mod tests {
         assert_eq!(extract_version(0b01 << 10), 0b01);
     }
 
+    #[test]
+    fn test_address_from_str() {
+        for i in &[
+            // Regular
+            "grape orange apple 1000",
+            // Reverse
+            "1000 apple orange grape",
+            // Whitespace everywhere
+            "\t\tgrape\n\t orange apple 1000 \t ",
+            // Mixed case
+            "\n1000 APPlE oRAnGE GrAPe\n",
+        ] {
+            eprintln!("Testing {i:?}");
+            assert_eq!(Address::from_str(i), Ok(addr![1000, apple, orange, grape]));
+        }
+
+        for i in &[
+            // Too small
+            "",
+            " ",
+            "1000",
+            "1000 orange blue",
+            // Not a word
+            "1000 ajlkdsf alskdjasldkj fas",
+            // Number too large
+            "grape orange apple 10000",
+            // Number too small
+            "0 apple orange grape",
+            // No number
+            "grape orange apple mix",
+            "grape orange apple 1e4",
+            "grape orange apple 1025",
+        ] {
+            eprintln!("Testing {i:?}");
+            assert!(Address::from_str(i).is_err());
+        }
+    }
+
     #[test]
     fn test_parse_v0() {
+        // Regular case
         assert_eq!(
             Address::parse_v0(1000, vec!["apple", "orange", "grape"]),
-            Ok(addr![1000, "apple", "orange", "grape"])
+            Ok(addr![1000, apple, orange, grape])
         );
 
         // Number is on the edge
         assert_eq!(
             Address::parse_v0(1, vec!["apple", "orange", "grape"]),
-            Ok(addr![1, "apple", "orange", "grape"])
+            Ok(addr![1, apple, orange, grape])
         );
         assert_eq!(
             Address::parse_v0(1024, vec!["apple", "orange", "grape"]),
-            Ok(addr![1024, "apple", "orange", "grape"])
+            Ok(addr![1024, apple, orange, grape])
         );
         assert_eq!(
             Address::parse_v0(V0_MAX_NUMBER, vec!["apple", "orange", "grape"]),
-            Ok(addr![V0_MAX_NUMBER, "apple", "orange", "grape"])
+            Ok(addr![V0_MAX_NUMBER, apple, orange, grape])
         );
         assert_eq!(
             Address::parse_v0(V0_MIN_NUMBER, vec!["apple", "orange", "grape"]),
-            Ok(addr![V0_MIN_NUMBER, "apple", "orange", "grape"])
+            Ok(addr![V0_MIN_NUMBER, apple, orange, grape])
         );
 
         // Word not found
@@ -11,12 +11,15 @@ use std::path::Path;
 fn main() {
     let path = Path::new(&env::var("OUT_DIR").unwrap()).join("codegen.rs");
     let mut file = BufWriter::new(File::create(path).unwrap());
+    let wordlist_path = "../data/wordlist-tmp.csv";
+
+    println!("cargo:rerun-if-changed={}", wordlist_path);
 
     let rdr_builder = ReaderBuilder::new();
 
     // First get the actual wordlist
     let words: Vec<Word> = rdr_builder
-        .from_reader(File::open("../data/wordlist-tmp.csv").unwrap())
+        .from_reader(File::open(wordlist_path).unwrap())
         .deserialize()
         .collect::<Result<Vec<Word>, _>>()
         .unwrap();
@@ -43,7 +46,7 @@ pub const WORDS: &[Word] = &["#
         writeln!(&mut file, "\t{result:?},").unwrap();
     }
     writeln!(&mut file, "];\n").unwrap();
 }
 
 fn write_word_map(mut file: impl Write, words: &[Word]) {
     let mut word_map = phf_codegen::Map::new();
@@ -59,7 +62,7 @@ fn write_word_map(mut file: impl Write, words: &[Word]) {
         word_map.build()
     )
     .unwrap();
 }
 
 fn write_number_to_words(mut file: impl Write, words: &[Word]) {
     let word_number_to_idx = words
@@ -88,7 +91,7 @@ fn write_number_to_words(mut file: impl Write, words: &[Word]) {
         writeln!(&mut file, "],").unwrap();
     }
     writeln!(&mut file, "];\n").unwrap();
 }
 
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Word {
@@ -53,3 +53,35 @@ where
         .get(&maybe_word.as_ref().trim().to_ascii_uppercase())
         .copied()
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_equivalence() {
+        // Test equivalence
+        macro_rules! te {
+            ($word1:ident, $word2:ident) => {
+                eprintln!("Checking if {:?} is a word", stringify!($word1));
+                assert!(get_word(stringify!($word1)).is_some());
+                eprintln!("Checking if {:?} is a word", stringify!($word2));
+                assert!(get_word(stringify!($word2)).is_some());
+                eprintln!("Checking equivalence");
+                assert_eq!(
+                    get_word(stringify!($word1)).unwrap().number,
+                    get_word(stringify!($word2)).unwrap().number
+                );
+            };
+        }
+
+        // Homonyms
+        te!(blue, blew);
+        te!(yellow, hello);
+
+        // Plurals
+        te!(sent, sense);
+        te!(sent, scents);
+        te!(sent, cents);
+    }
+}