Start working on rust types

Austen Adler 2023-02-11 17:04:16 -05:00
parent f9804f0399
commit 2a7c660ba6
8 changed files with 11553 additions and 17 deletions

126 Cargo.lock generated

@@ -29,6 +29,18 @@ dependencies = [
"serde",
]

[[package]]
name = "bstr"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
dependencies = [
"lazy_static",
"memchr",
"regex-automata",
"serde",
]

[[package]]
name = "cgmath"
version = "0.18.0"
@@ -39,6 +51,28 @@ dependencies = [
"num-traits",
]

[[package]]
name = "csv"
version = "1.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
dependencies = [
"bstr",
"csv-core",
"itoa",
"ryu",
"serde",
]

[[package]]
name = "csv-core"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
dependencies = [
"memchr",
]

[[package]]
name = "float_extras"
version = "0.1.6"
@@ -48,6 +82,12 @@ dependencies = [
"libc",
]

[[package]]
name = "itoa"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"

[[package]]
name = "lazy_static"
version = "1.4.0"
@@ -66,6 +106,12 @@ version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb"

[[package]]
name = "memchr"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"

[[package]]
name = "num-bigint"
version = "0.4.3"
@@ -96,6 +142,44 @@ dependencies = [
"autocfg",
]

[[package]]
name = "phf"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c"
dependencies = [
"phf_shared",
]

[[package]]
name = "phf_codegen"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a56ac890c5e3ca598bbdeaa99964edb5b0258a583a9eb6ef4e89fc85d9224770"
dependencies = [
"phf_generator",
"phf_shared",
]

[[package]]
name = "phf_generator"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1181c94580fa345f50f19d738aaa39c0ed30a600d95cb2d3e23f94266f14fbf"
dependencies = [
"phf_shared",
"rand",
]

[[package]]
name = "phf_shared"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676"
dependencies = [
"siphasher",
]

[[package]]
name = "proc-macro2"
version = "1.0.50"
@@ -114,6 +198,33 @@ dependencies = [
"proc-macro2",
]

[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"rand_core",
]

[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"

[[package]]
name = "regex-automata"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"

[[package]]
name = "ryu"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde"

[[package]]
name = "s2"
version = "0.0.12"
@@ -148,6 +259,12 @@ dependencies = [
"syn",
]

[[package]]
name = "siphasher"
version = "0.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"

[[package]]
name = "syn"
version = "1.0.107"
@@ -164,6 +281,15 @@ name = "this_algoritm"
version = "0.1.0"
dependencies = [
"s2",
]

[[package]]
name = "types"
version = "0.1.0"
dependencies = [
"csv",
"phf",
"phf_codegen",
"serde",
]

Cargo.toml

@@ -3,8 +3,13 @@ name = "this_algoritm"
version = "0.1.0"
edition = "2021"

[workspace]
members = [
    ".",
    "./types",
]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
serde="1"
s2="0.0.12"

11211 data/wordlist-tmp.csv Normal file

File diff suppressed because it is too large.
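
The suppressed file is the word-to-number table that types/build.rs deserializes below; csv's default reader expects a header row, which matches the "word,number" header the notebook writes. Purely as an illustration of its shape (these rows are invented, and the real file runs to 11211 lines), it would look something like:

word,number
able,0
acid,1
acre,2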


@@ -12,21 +12,34 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
"Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
"Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
"Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
"Collecting nltk\n",
" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hCollecting odfpy\n",
" Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hRequirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
"Collecting regex>=2021.8.3\n",
" Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
"Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
"Building wheels for collected packages: odfpy\n",
" Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=3ee9aaac0134706d6ef72a359cd2466813f37bd8f080150b008d6d6e247d710c\n",
" Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
"Successfully built odfpy\n",
"Installing collected packages: regex, odfpy, nltk\n",
"Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
]
}
],
@@ -79,7 +92,7 @@
{
"data": {
"text/plain": [
"['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'd', 'john']"
"['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'children', 'd']"
]
},
"execution_count": 3,
@@ -115,6 +128,7 @@
"text/plain": [
"[('be', 'bee'),\n",
" ('by', 'bye'),\n",
" ('died', 'dyed'),\n",
" ('corps', 'core'),\n",
" ('ore', 'oar'),\n",
" ('ore', ' or'),\n",
@@ -122,6 +136,7 @@
" ('com', 'calm'),\n",
" ('filing', 'filling'),\n",
" ('fax', 'facts'),\n",
" ('favour', 'favor'),\n",
" ('theatre', 'theater'),\n",
" ('par', 'parse'),\n",
" ('honour', 'honor'),\n",
@@ -158,6 +173,7 @@
" ('hem', 'him'),\n",
" ('nun', 'none'),\n",
" ('organisational', 'organizational'),\n",
" ('dessert', 'desert'),\n",
" ('aux', 'ox'),\n",
" ('rap', 'wrap'),\n",
" ('filings', 'filling'),\n",
@@ -194,13 +210,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
"# all_words: 21323\n",
"# all_words: 21287\n",
"sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n",
"\n",
"# lemmatize_mappings: 21374\n",
"# lemmatize_mappings: 21341\n",
"sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n",
"\n",
"# distinct_words: 17585\n",
"# distinct_words: 17557\n",
"sample:\n"
]
},
@@ -310,7 +326,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Final wordlist size: 11212\n"
"Final wordlist size: 11210\n"
]
}
],
@@ -344,21 +360,50 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 12,
"id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]\n",
"\n",
"with open(\"final_wordlist.csv\", \"w\") as f:\n",
" sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]\n",
" f.write(\"word,number\\n\")\n",
" \n",
" for w in sorted(sorted_final_wordlist, key=lambda w: w[1]):\n",
" lemmatized = \"\" if not w[1] else w[1]\n",
" f.write(f\"{w[0]},{lemmatized}\")\n",
" f.write(f\"{w[0].upper()},{lemmatized - 1}\")\n",
" f.write(\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c88fe193-11cc-4a06-a3cf-d1ad85f44d14",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'H'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a0d177b-3499-42fb-8091-29547567d69a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

7 types/Cargo.lock generated Normal file

@@ -0,0 +1,7 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3

[[package]]
name = "types"
version = "0.1.0"

15 types/Cargo.toml Normal file

@@ -0,0 +1,15 @@
[package]
name = "types"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
serde={version="1", features=["derive"]}
phf="0.11"

[build-dependencies]
serde={version="1", features=["derive"]}
phf_codegen="0.11"
csv="1.1"

88 types/build.rs Normal file

@@ -0,0 +1,88 @@
#![feature(slice_group_by)]
#![allow(unused_imports)]

use csv::ReaderBuilder;
use serde::{Deserialize, Deserializer, Serialize};
use std::collections::HashMap;
use std::env;
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::Path;

fn main() {
    let path = Path::new(&env::var("OUT_DIR").unwrap()).join("codegen.rs");
    let mut file = BufWriter::new(File::create(path).unwrap());
    let rdr_builder = ReaderBuilder::new();

    // First get the actual wordlist
    let words: Vec<Word> = rdr_builder
        .from_reader(File::open("../data/wordlist-tmp.csv").unwrap())
        .deserialize()
        .collect::<Result<Vec<Word>, _>>()
        .unwrap();

    // Write it to an array containing all words
    writeln!(
        &mut file,
        r#"/// Static array of `Word`
pub const WORDS: &[Word] = &["#
    )
    .unwrap();
    for result in words.iter() {
        writeln!(&mut file, "{result:?},").unwrap();
    }
    writeln!(&mut file, "];\n").unwrap();

    // Make a mapping of all caps word to a reference to the `Word` entry
    let mut word_map = phf_codegen::Map::new();
    for (idx, word) in words.iter().enumerate() {
        let idx_str = format!("&WORDS[{idx}]");
        word_map.entry(word.word.to_uppercase(), &idx_str);
    }
    writeln!(
        &mut file,
        r#"/// Mapping from all caps `&str` to `&'static Word`
pub static WORD_MAP: phf::Map<&'static str, &'static Word> =
{};"#,
        word_map.build()
    )
    .unwrap();

    // Make a mapping of numbers to `Word`s
    let word_number_to_idx = words
        .iter()
        .enumerate()
        .map(|(idx, w)| (w.number, idx))
        .collect::<Vec<(u16, usize)>>();
    writeln!(
        &mut file,
        "pub const NUMBER_TO_WORD: &[&[&'static Word]] = &["
    )
    .unwrap();
    // `group_by` only merges adjacent runs of equal numbers, so this
    // assumes the CSV rows are already sorted by `number`
    for entry in word_number_to_idx
        .as_slice()
        .group_by(|(number1, _idx1), (number2, _idx2)| number1 == number2)
    {
        write!(&mut file, "\t&[").unwrap();
        for idx in entry.iter().map(|(_number, idx)| idx) {
            write!(&mut file, "&WORDS[{idx}],").unwrap();
        }
        writeln!(&mut file, "],").unwrap();
    }
    writeln!(&mut file, "];\n").unwrap();
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Word {
    /// The word itself
    pub word: String,
    /// The number this word encodes
    ///
    /// The words are responsible for 13 bits of data, so the value fits comfortably in a u16
    pub number: u16,
}
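
For orientation, here is a sketch of what the generated codegen.rs might contain for a hypothetical two-word input (the entries are invented; real values come from data/wordlist-tmp.csv, and the WORD_MAP body, a perfect-hash table emitted by phf_codegen, is elided). The build script's Word holds word: String, and String's Debug output re-parses as a &str literal, which is presumably why the same Debug dump also satisfies the borrowed Word<'a> declared in types/src/lib.rs:

/// Static array of `Word`
pub const WORDS: &[Word] = &[
Word { word: "able", number: 0 },
Word { word: "acid", number: 1 },
];

// `WORD_MAP` (elided in this sketch) is also emitted here as a
// `phf::Map<&'static str, &'static Word>` keyed by the all caps words.

pub const NUMBER_TO_WORD: &[&[&'static Word]] = &[
	&[&WORDS[0],],
	&[&WORDS[1],],
];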

39 types/src/lib.rs Normal file

@@ -0,0 +1,39 @@
include!(concat!(env!("OUT_DIR"), "/codegen.rs"));

/// A word struct
#[derive(Debug, Clone, PartialEq)]
pub struct Word<'a> {
    /// The word itself
    pub word: &'a str,
    /// The number this word encodes
    ///
    /// The words are responsible for 13 bits of data, so the value fits comfortably in a u16
    pub number: u16,
}

#[derive(Debug, Clone, PartialEq)]
pub struct Address<'a> {
    number: u32,
    words: [Word<'a>; 3],
}

// /// Find a word
// ///
// /// ```rust
// /// use static_data::find_country;
// ///
// /// assert!(find_country("US").is_some());
// /// assert!(find_country("UnItEd StAtEs").is_some());
// /// assert!(find_country("abcd").is_none());
// /// ```
// pub fn find_country<S>(s: S) -> Option<&'static Country<'static>>
// where
//     S: AsRef<str>,
// {
//     // TODO: Decide whether uppercasing is too slow
//     COUNTRY_MAP
//         .get(s.as_ref())
//         .or_else(|| COUNTRY_MAP.get(&s.as_ref().to_uppercase()))
//         .copied()
// }
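
A natural follow-up, sketched here only as a suggestion (this commit ships no lookup function), is the word analogue of the commented-out find_country; the name find_word and its fallback behavior are assumptions, not part of the commit:

// Hypothetical sketch, not part of this commit.
// `WORD_MAP` keys are all caps, so try the input as-is before paying
// for the uppercase allocation.
pub fn find_word<S>(s: S) -> Option<&'static Word<'static>>
where
    S: AsRef<str>,
{
    WORD_MAP
        .get(s.as_ref())
        .or_else(|| WORD_MAP.get(s.as_ref().to_uppercase().as_str()))
        .copied()
}

With the invented entries from the earlier sketch, find_word("able") and find_word("ABLE") would both return Some(&WORDS[0]).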