Start working on rust types

This commit is contained in:
Austen Adler 2023-02-11 17:04:16 -05:00
parent f9804f0399
commit 2a7c660ba6
8 changed files with 11553 additions and 17 deletions

Cargo.lock (generated, 126 lines)

@@ -29,6 +29,18 @@ dependencies = [
"serde",
]
[[package]]
name = "bstr"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
dependencies = [
"lazy_static",
"memchr",
"regex-automata",
"serde",
]
[[package]]
name = "cgmath"
version = "0.18.0"
@@ -39,6 +51,28 @@ dependencies = [
"num-traits",
]
[[package]]
name = "csv"
version = "1.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
dependencies = [
"bstr",
"csv-core",
"itoa",
"ryu",
"serde",
]
[[package]]
name = "csv-core"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
dependencies = [
"memchr",
]
[[package]]
name = "float_extras"
version = "0.1.6"
@@ -48,6 +82,12 @@ dependencies = [
"libc",
]
[[package]]
name = "itoa"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
[[package]]
name = "lazy_static"
version = "1.4.0"
@@ -66,6 +106,12 @@ version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb"
[[package]]
name = "memchr"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "num-bigint"
version = "0.4.3"
@@ -96,6 +142,44 @@ dependencies = [
"autocfg",
]
[[package]]
name = "phf"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c"
dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a56ac890c5e3ca598bbdeaa99964edb5b0258a583a9eb6ef4e89fc85d9224770"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1181c94580fa345f50f19d738aaa39c0ed30a600d95cb2d3e23f94266f14fbf"
dependencies = [
"phf_shared",
"rand",
]
[[package]]
name = "phf_shared"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676"
dependencies = [
"siphasher",
]
[[package]]
name = "proc-macro2"
version = "1.0.50"
@@ -114,6 +198,33 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
[[package]]
name = "regex-automata"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
[[package]]
name = "ryu"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde"
[[package]]
name = "s2"
version = "0.0.12"
@@ -148,6 +259,12 @@ dependencies = [
"syn",
]
[[package]]
name = "siphasher"
version = "0.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
[[package]]
name = "syn"
version = "1.0.107"
@@ -164,6 +281,15 @@ name = "this_algoritm"
version = "0.1.0"
dependencies = [
"s2",
]
[[package]]
name = "types"
version = "0.1.0"
dependencies = [
"csv",
"phf",
"phf_codegen",
"serde", "serde",
] ]

Cargo.toml

@@ -3,8 +3,13 @@ name = "this_algoritm"
version = "0.1.0"
edition = "2021"
[workspace]
members = [
".",
"./types",
]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
serde="1"
s2="0.0.12" s2="0.0.12"

data/wordlist-tmp.csv (new file, 11211 lines)

File diff suppressed because it is too large

(Jupyter notebook diff; file name not shown in the capture)

@@ -12,21 +12,34 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
"Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
"Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
"Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
"Collecting nltk\n",
" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hCollecting odfpy\n",
" Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hRequirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
"Collecting regex>=2021.8.3\n",
" Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
"Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n", "Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n" "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
"Building wheels for collected packages: odfpy\n",
" Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=3ee9aaac0134706d6ef72a359cd2466813f37bd8f080150b008d6d6e247d710c\n",
" Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
"Successfully built odfpy\n",
"Installing collected packages: regex, odfpy, nltk\n",
"Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
}
],
@@ -79,7 +92,7 @@
{
"data": {
"text/plain": [
"['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'd', 'john']"
"['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'children', 'd']"
]
},
"execution_count": 3,
@@ -115,6 +128,7 @@
"text/plain": [
"[('be', 'bee'),\n",
" ('by', 'bye'),\n",
" ('died', 'dyed'),\n",
" ('corps', 'core'),\n", " ('corps', 'core'),\n",
" ('ore', 'oar'),\n", " ('ore', 'oar'),\n",
" ('ore', ' or'),\n", " ('ore', ' or'),\n",
@ -122,6 +136,7 @@
" ('com', 'calm'),\n", " ('com', 'calm'),\n",
" ('filing', 'filling'),\n", " ('filing', 'filling'),\n",
" ('fax', 'facts'),\n", " ('fax', 'facts'),\n",
" ('favour', 'favor'),\n",
" ('theatre', 'theater'),\n", " ('theatre', 'theater'),\n",
" ('par', 'parse'),\n", " ('par', 'parse'),\n",
" ('honour', 'honor'),\n", " ('honour', 'honor'),\n",
@ -158,6 +173,7 @@
" ('hem', 'him'),\n", " ('hem', 'him'),\n",
" ('nun', 'none'),\n", " ('nun', 'none'),\n",
" ('organisational', 'organizational'),\n", " ('organisational', 'organizational'),\n",
" ('dessert', 'desert'),\n",
" ('aux', 'ox'),\n", " ('aux', 'ox'),\n",
" ('rap', 'wrap'),\n", " ('rap', 'wrap'),\n",
" ('filings', 'filling'),\n", " ('filings', 'filling'),\n",
@ -194,13 +210,13 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"# all_words: 21323\n", "# all_words: 21287\n",
"sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n", "sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n",
"\n", "\n",
"# lemmatize_mappings: 21374\n", "# lemmatize_mappings: 21341\n",
"sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n", "sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n",
"\n", "\n",
"# distinct_words: 17585\n", "# distinct_words: 17557\n",
"sample:\n" "sample:\n"
] ]
}, },
@@ -310,7 +326,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Final wordlist size: 11212\n"
"Final wordlist size: 11210\n"
]
}
],
@@ -344,21 +360,50 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 12,
"id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"with open(\"final_wordlist.csv\", \"w\") as f:\n",
"sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]\n", "sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]\n",
"\n", "\n",
"with open(\"final_wordlist.csv\", \"w\") as f:\n",
" f.write(\"word,number\\n\")\n",
" \n",
" for w in sorted(sorted_final_wordlist, key=lambda w: w[1]):\n", " for w in sorted(sorted_final_wordlist, key=lambda w: w[1]):\n",
" lemmatized = \"\" if not w[1] else w[1]\n", " lemmatized = \"\" if not w[1] else w[1]\n",
" f.write(f\"{w[0]},{lemmatized}\")\n", " f.write(f\"{w[0].upper()},{lemmatized - 1}\")\n",
" f.write(\"\\n\")" " f.write(\"\\n\")"
] ]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c88fe193-11cc-4a06-a3cf-d1ad85f44d14",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'H'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a0d177b-3499-42fb-8091-29547567d69a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

types/Cargo.lock (generated, new file, 7 lines)

@@ -0,0 +1,7 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "types"
version = "0.1.0"

types/Cargo.toml (new file, 15 lines)

@@ -0,0 +1,15 @@
[package]
name = "types"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
serde={version="1", features=["derive"]}
phf="0.11"
[build-dependencies]
serde={version="1", features=["derive"]}
phf_codegen="0.11"
csv="1.1"

types/build.rs (new file, 88 lines)

@@ -0,0 +1,88 @@
#![feature(slice_group_by)]
#![allow(unused_imports)]
use csv::ReaderBuilder;
use serde::{Deserialize, Deserializer, Serialize};
use std::collections::HashMap;
use std::env;
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::Path;
fn main() {
let path = Path::new(&env::var("OUT_DIR").unwrap()).join("codegen.rs");
let mut file = BufWriter::new(File::create(path).unwrap());
let rdr_builder = ReaderBuilder::new();
// First get the actual wordlist
let words: Vec<Word> = rdr_builder
.from_reader(File::open("../data/wordlist-tmp.csv").unwrap())
.deserialize()
.collect::<Result<Vec<Word>, _>>()
.unwrap();
// Write it to an array containing all words
writeln!(
&mut file,
r#"/// Static array of `Word`
pub const WORDS: &[Word] = &["#
)
.unwrap();
for result in words.iter() {
writeln!(&mut file, "{result:?},").unwrap();
}
writeln!(&mut file, "];\n").unwrap();
// Make a mapping of all caps word to a reference to the `Word` entry
let mut word_map = phf_codegen::Map::new();
for (idx, word) in words.iter().enumerate() {
let idx_str = format!("&WORDS[{idx}]");
word_map.entry(word.word.to_uppercase(), &idx_str);
}
writeln!(
&mut file,
r#"/// Mapping from all caps `&str` to `&'static Word`
pub static WORD_MAP: phf::Map<&'static str, &'static Word> =
{};"#,
word_map.build()
)
.unwrap();
// Make a mapping of numbers to `Word`s
let word_number_to_idx = words
.iter()
.enumerate()
.map(|(idx, w)| (w.number, idx))
.collect::<Vec<(u16, usize)>>();
writeln!(
&mut file,
"pub const NUMBER_TO_WORD: &[&[&'static Word]] = &["
)
.unwrap();
for entry in word_number_to_idx
.as_slice()
.group_by(|(number1, _idx1), (number2, _idx2)| number1 == number2)
{
write!(&mut file, "\t&[",).unwrap();
for idx in entry.iter().map(|(_w, idx)| idx) {
write!(&mut file, "&WORDS[{idx}],").unwrap();
}
writeln!(&mut file, "],").unwrap();
}
writeln!(&mut file, "];\n").unwrap();
}
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Word {
/// The word itself
pub word: String,
/// The binary representation of this number
///
/// The words are responsible for 13 bits of data, so this fits in a u16
pub number: u16,
}
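
For orientation, the codegen.rs that this build script writes into OUT_DIR would look roughly like the sketch below. The two entries are made-up sample data and the phf map body is abbreviated; note that the Debug output of the build-side Word (whose word field is a String) is also valid source text for the library-side Word<'a> (whose word field is a &str), which is why writing "{result:?}," works.

// Illustrative sketch of a generated codegen.rs (sample data, abbreviated)
/// Static array of `Word`
pub const WORDS: &[Word] = &[
Word { word: "able", number: 0 },
Word { word: "acid", number: 1 },
];

/// Mapping from all caps `&str` to `&'static Word`
pub static WORD_MAP: phf::Map<&'static str, &'static Word> =
::phf::Map { /* perfect-hash tables built by phf_codegen, mapping "ABLE" => &WORDS[0], "ACID" => &WORDS[1] */ };

pub const NUMBER_TO_WORD: &[&[&'static Word]] = &[
    &[&WORDS[0],],
    &[&WORDS[1],],
];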

types/src/lib.rs (new file, 39 lines)

@@ -0,0 +1,39 @@
include!(concat!(env!("OUT_DIR"), "/codegen.rs"));
#[derive(Debug, Clone, PartialEq)]
/// A word struct
pub struct Word<'a> {
/// The word itself
pub word: &'a str,
/// The binary representation of this number
///
/// The words are responsible for 13 bits of data, so this fits in a u16
pub number: u16,
}
#[derive(Debug, Clone, PartialEq)]
pub struct Address<'a> {
number: u32,
words: [Word<'a>; 3],
}
// /// Find a word
// ///
// /// ```rust
// /// use static_data::find_country;
// ///
// /// assert!(find_country("US").is_some());
// /// assert!(find_country("UnItEd StAtEs").is_some());
// /// assert!(find_country("abcd").is_none());
// /// ```
// pub fn find_country<S>(s: S) -> Option<&'static Country<'static>>
// where
// S: AsRef<str>,
// {
// // TODO: Decide whether uppercasing is too slow
// COUNTRY_MAP
// .get(s.as_ref())
// .or_else(|| COUNTRY_MAP.get(&s.as_ref().to_uppercase()))
// .copied()
// }
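
Once the generated tables exist, a lookup helper analogous to the commented-out find_country could live here. The sketch below is hypothetical and not part of this commit: find_word and words_for_number are invented names, and words_for_number assumes the word numbers are contiguous and start at 0 so a number can be used directly as an index into NUMBER_TO_WORD.

// Hypothetical sketch, not part of this commit: look a word up in the generated
// WORD_MAP, falling back to an uppercased copy since the map is keyed on all-caps words.
pub fn find_word<S>(s: S) -> Option<&'static Word<'static>>
where
    S: AsRef<str>,
{
    WORD_MAP
        .get(s.as_ref())
        .or_else(|| WORD_MAP.get(s.as_ref().to_uppercase().as_str()))
        .copied()
}

// Hypothetical sketch: all words that encode a given 13-bit number, assuming the
// numbers in the wordlist are contiguous and start at 0.
pub fn words_for_number(number: u16) -> Option<&'static [&'static Word<'static>]> {
    NUMBER_TO_WORD.get(usize::from(number)).copied()
}

Whether to uppercase at lookup time (as in the fallback above) or to require callers to pass all-caps keys is the same trade-off flagged in the TODO comment.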