Start working on rust types
This commit is contained in:
parent f9804f0399
commit 2a7c660ba6
Cargo.lock (generated, 126 lines changed)
@@ -29,6 +29,18 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "bstr"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
+dependencies = [
+ "lazy_static",
+ "memchr",
+ "regex-automata",
+ "serde",
+]
+
 [[package]]
 name = "cgmath"
 version = "0.18.0"
@@ -39,6 +51,28 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "csv"
+version = "1.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
+dependencies = [
+ "bstr",
+ "csv-core",
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "csv-core"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "float_extras"
 version = "0.1.6"
@@ -48,6 +82,12 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "itoa"
+version = "0.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
+
 [[package]]
 name = "lazy_static"
 version = "1.4.0"
@@ -66,6 +106,12 @@ version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb"
 
+[[package]]
+name = "memchr"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+
 [[package]]
 name = "num-bigint"
 version = "0.4.3"
@@ -96,6 +142,44 @@ dependencies = [
  "autocfg",
 ]
 
+[[package]]
+name = "phf"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c"
+dependencies = [
+ "phf_shared",
+]
+
+[[package]]
+name = "phf_codegen"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a56ac890c5e3ca598bbdeaa99964edb5b0258a583a9eb6ef4e89fc85d9224770"
+dependencies = [
+ "phf_generator",
+ "phf_shared",
+]
+
+[[package]]
+name = "phf_generator"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1181c94580fa345f50f19d738aaa39c0ed30a600d95cb2d3e23f94266f14fbf"
+dependencies = [
+ "phf_shared",
+ "rand",
+]
+
+[[package]]
+name = "phf_shared"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676"
+dependencies = [
+ "siphasher",
+]
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.50"
@@ -114,6 +198,33 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+
+[[package]]
+name = "regex-automata"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
+
+[[package]]
+name = "ryu"
+version = "1.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde"
+
 [[package]]
 name = "s2"
 version = "0.0.12"
@@ -148,6 +259,12 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "siphasher"
+version = "0.3.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
+
 [[package]]
 name = "syn"
 version = "1.0.107"
@@ -164,6 +281,15 @@ name = "this_algoritm"
 version = "0.1.0"
 dependencies = [
  "s2",
+]
+
+[[package]]
+name = "types"
+version = "0.1.0"
+dependencies = [
+ "csv",
+ "phf",
+ "phf_codegen",
  "serde",
 ]
 
Cargo.toml
@@ -3,8 +3,13 @@ name = "this_algoritm"
 version = "0.1.0"
 edition = "2021"
 
+[workspace]
+members = [
+    ".",
+    "./types",
+]
+
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-serde="1"
 s2="0.0.12"
data/wordlist-tmp.csv (new file, 11211 lines)
File diff suppressed because it is too large
@@ -12,21 +12,34 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
-"Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
-"Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
-"Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
-"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
+"Collecting nltk\n",
+" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
+"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+"\u001b[?25hCollecting odfpy\n",
+" Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
+"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+"\u001b[?25hRequirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
+"Collecting regex>=2021.8.3\n",
+" Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
+"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+"\u001b[?25hRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
 "Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
-"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
+"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
+"Building wheels for collected packages: odfpy\n",
+" Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
+"\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=3ee9aaac0134706d6ef72a359cd2466813f37bd8f080150b008d6d6e247d710c\n",
+" Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
+"Successfully built odfpy\n",
+"Installing collected packages: regex, odfpy, nltk\n",
+"Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
 ]
 },
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
-"[nltk_data] Package wordnet is already up-to-date!\n"
+"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
 ]
 }
 ],
@@ -79,7 +92,7 @@
 {
 "data": {
 "text/plain": [
-"['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'd', 'john']"
+"['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'children', 'd']"
 ]
 },
 "execution_count": 3,
@@ -115,6 +128,7 @@
 "text/plain": [
 "[('be', 'bee'),\n",
 " ('by', 'bye'),\n",
+" ('died', 'dyed'),\n",
 " ('corps', 'core'),\n",
 " ('ore', 'oar'),\n",
 " ('ore', ' or'),\n",
@@ -122,6 +136,7 @@
 " ('com', 'calm'),\n",
 " ('filing', 'filling'),\n",
 " ('fax', 'facts'),\n",
+" ('favour', 'favor'),\n",
 " ('theatre', 'theater'),\n",
 " ('par', 'parse'),\n",
 " ('honour', 'honor'),\n",
@@ -158,6 +173,7 @@
 " ('hem', 'him'),\n",
 " ('nun', 'none'),\n",
 " ('organisational', 'organizational'),\n",
+" ('dessert', 'desert'),\n",
 " ('aux', 'ox'),\n",
 " ('rap', 'wrap'),\n",
 " ('filings', 'filling'),\n",
@@ -194,13 +210,13 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"# all_words: 21323\n",
+"# all_words: 21287\n",
 "sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n",
 "\n",
-"# lemmatize_mappings: 21374\n",
+"# lemmatize_mappings: 21341\n",
 "sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n",
 "\n",
-"# distinct_words: 17585\n",
+"# distinct_words: 17557\n",
 "sample:\n"
 ]
 },
@@ -310,7 +326,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Final wordlist size: 11212\n"
+"Final wordlist size: 11210\n"
 ]
 }
 ],
@@ -344,21 +360,50 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 12,
 "id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
 "metadata": {
 "tags": []
 },
 "outputs": [],
 "source": [
-"with open(\"final_wordlist.csv\", \"w\") as f:\n",
 "sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]\n",
 "\n",
+"with open(\"final_wordlist.csv\", \"w\") as f:\n",
+"    f.write(\"word,number\\n\")\n",
+"    \n",
 "    for w in sorted(sorted_final_wordlist, key=lambda w: w[1]):\n",
 "        lemmatized = \"\" if not w[1] else w[1]\n",
-"        f.write(f\"{w[0]},{lemmatized}\")\n",
+"        f.write(f\"{w[0].upper()},{lemmatized - 1}\")\n",
 "        f.write(\"\\n\")"
 ]
+},
+{
+"cell_type": "code",
+"execution_count": 11,
+"id": "c88fe193-11cc-4a06-a3cf-d1ad85f44d14",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"'H'"
+]
+},
+"execution_count": 11,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": []
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "2a0d177b-3499-42fb-8091-29547567d69a",
+"metadata": {},
+"outputs": [],
+"source": []
 }
 ],
 "metadata": {
types/Cargo.lock (new file, generated, 7 lines)
@@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "types"
+version = "0.1.0"
types/Cargo.toml (new file, 15 lines)
@@ -0,0 +1,15 @@
+[package]
+name = "types"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+serde={version="1", features=["derive"]}
+phf="0.11"
+
+[build-dependencies]
+serde={version="1", features=["derive"]}
+phf_codegen="0.11"
+csv="1.1"
types/build.rs (new file, 88 lines)
@@ -0,0 +1,88 @@
+#![feature(slice_group_by)]
+#![allow(unused_imports)]
+use csv::ReaderBuilder;
+use serde::{Deserialize, Deserializer, Serialize};
+use std::collections::HashMap;
+use std::env;
+use std::fs::File;
+use std::io::{BufRead, BufReader, BufWriter, Write};
+use std::path::Path;
+
+fn main() {
+    let path = Path::new(&env::var("OUT_DIR").unwrap()).join("codegen.rs");
+    let mut file = BufWriter::new(File::create(path).unwrap());
+
+    let rdr_builder = ReaderBuilder::new();
+
+    // First get the actual wordlist
+    let words: Vec<Word> = rdr_builder
+        .from_reader(File::open("../data/wordlist-tmp.csv").unwrap())
+        .deserialize()
+        .collect::<Result<Vec<Word>, _>>()
+        .unwrap();
+
+    // Write it to an array containing all words
+    writeln!(
+        &mut file,
+        r#"/// Static array of `Word`
+pub const WORDS: &[Word] = &["#
+    )
+    .unwrap();
+
+    for result in words.iter() {
+        writeln!(&mut file, "{result:?},").unwrap();
+    }
+
+    writeln!(&mut file, "];\n").unwrap();
+
+    // Make a mapping of all caps word to a reference to the `Word` entry
+    let mut word_map = phf_codegen::Map::new();
+    for (idx, word) in words.iter().enumerate() {
+        let idx_str = format!("&WORDS[{idx}]");
+        word_map.entry(word.word.to_uppercase(), &idx_str);
+    }
+    writeln!(
+        &mut file,
+        r#"/// Mapping from all caps `&str` to `&'static Word`
+pub static WORD_MAP: phf::Map<&'static str, &'static Word> =
+{};"#,
+        word_map.build()
+    )
+    .unwrap();
+
+    // Make a mapping of numbers to `Word`s
+    let word_number_to_idx = words
+        .iter()
+        .enumerate()
+        .map(|(idx, w)| (w.number, idx))
+        .collect::<Vec<(u16, usize)>>();
+
+    writeln!(
+        &mut file,
+        "pub const NUMBER_TO_WORD: &[&[&'static Word]] = &["
+    )
+    .unwrap();
+    for entry in word_number_to_idx
+        .as_slice()
+        .group_by(|(number1, _idx1), (number2, _idx2)| number1 == number2)
+    {
+        write!(&mut file, "\t&[",).unwrap();
+
+        for idx in entry.iter().map(|(_w, idx)| idx) {
+            write!(&mut file, "&WORDS[{idx}],").unwrap();
+        }
+        writeln!(&mut file, "],").unwrap();
+    }
+    writeln!(&mut file, "];\n").unwrap();
+}
+
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct Word {
+    /// The word itself
+    pub word: String,
+
+    /// The binary representation of this number
+    ///
+    /// The words are responsible for 13 bits of data, so this is fine to fit in a u16
+    pub number: u16,
+}
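A note on the build script above: `slice::group_by` was a nightly-only API at the time of this commit (hence the `#![feature(slice_group_by)]` attribute), and it only merges adjacent elements that satisfy the predicate, so `NUMBER_TO_WORD` is grouped correctly only because the notebook writes the CSV already sorted by `number`. A minimal, illustrative sketch of that adjacency behaviour, using invented `(number, index)` pairs rather than the real word list:

// Illustrative only (nightly toolchain, same feature gate as types/build.rs):
// `group_by` merges *adjacent* equal keys, so unsorted input splits a key into
// several groups.
#![feature(slice_group_by)]

fn main() {
    let pairs: [(u16, usize); 4] = [(0, 0), (0, 1), (1, 2), (0, 3)];
    let groups: Vec<&[(u16, usize)]> = pairs
        .as_slice()
        .group_by(|(a, _), (b, _)| a == b)
        .collect();
    // The trailing (0, 3) lands in its own group because it is not adjacent to
    // the other 0 entries; sorted input avoids this.
    assert_eq!(groups.len(), 3);
    println!("{groups:?}");
}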
types/src/lib.rs (new file, 39 lines)
@@ -0,0 +1,39 @@
+include!(concat!(env!("OUT_DIR"), "/codegen.rs"));
+
+#[derive(Debug, Clone, PartialEq)]
+/// A word struct
+pub struct Word<'a> {
+    /// The word itself
+    pub word: &'a str,
+
+    /// The binary representation of this number
+    ///
+    /// The words are responsible for 13 bits of data, so this is fine to fit in a u16
+    pub number: u16,
+}
+
+#[derive(Debug, Clone, PartialEq)]
+pub struct Address<'a> {
+    number: u32,
+    words: [Word<'a>; 3],
+}
+
+// /// Find a word
+// ///
+// /// ```rust
+// /// use static_data::find_country;
+// ///
+// /// assert!(find_country("US").is_some());
+// /// assert!(find_country("UnItEd StAtEs").is_some());
+// /// assert!(find_country("abcd").is_none());
+// /// ```
+// pub fn find_country<S>(s: S) -> Option<&'static Country<'static>>
+// where
+//     S: AsRef<str>,
+// {
+//     // TODO: Decide weather uppercasing is too slow
+//     COUNTRY_MAP
+//         .get(s.as_ref())
+//         .or_else(|| COUNTRY_MAP.get(&s.as_ref().to_uppercase()))
+//         .copied()
+// }
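The commented-out `find_country` block reads like a template carried over from another crate: it references `COUNTRY_MAP` and `Country`, which do not exist here. A lookup helper for this crate would presumably go through the generated `WORD_MAP` instead. A minimal sketch, assuming the `WORD_MAP` emitted by types/build.rs (keys stored upper-cased) and the `Word<'a>` struct above; the `find_word` name is an invention for illustration, not part of the commit:

// Hypothetical helper for types/src/lib.rs; WORD_MAP comes from the included
// codegen.rs and Word is the struct defined above.
pub fn find_word<S>(s: S) -> Option<&'static Word<'static>>
where
    S: AsRef<str>,
{
    // build.rs upper-cases every key, so normalise the query the same way.
    WORD_MAP.get(s.as_ref().to_uppercase().as_str()).copied()
}

With the generated map in place, `find_word("the")` and `find_word("THE")` would resolve to the same `&'static Word` entry.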