Start working on rust types
This commit is contained in:
parent
f9804f0399
commit
2a7c660ba6
126
Cargo.lock
generated
126
Cargo.lock
generated
@ -29,6 +29,18 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bstr"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cgmath"
|
||||
version = "0.18.0"
|
||||
@ -39,6 +51,28 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv"
|
||||
version = "1.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
|
||||
dependencies = [
|
||||
"bstr",
|
||||
"csv-core",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv-core"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "float_extras"
|
||||
version = "0.1.6"
|
||||
@ -48,6 +82,12 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "0.4.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.4.0"
|
||||
@ -66,6 +106,12 @@ version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
||||
|
||||
[[package]]
|
||||
name = "num-bigint"
|
||||
version = "0.4.3"
|
||||
@ -96,6 +142,44 @@ dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c"
|
||||
dependencies = [
|
||||
"phf_shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_codegen"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a56ac890c5e3ca598bbdeaa99964edb5b0258a583a9eb6ef4e89fc85d9224770"
|
||||
dependencies = [
|
||||
"phf_generator",
|
||||
"phf_shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b1181c94580fa345f50f19d738aaa39c0ed30a600d95cb2d3e23f94266f14fbf"
|
||||
dependencies = [
|
||||
"phf_shared",
|
||||
"rand",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676"
|
||||
dependencies = [
|
||||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.50"
|
||||
@ -114,6 +198,33 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde"
|
||||
|
||||
[[package]]
|
||||
name = "s2"
|
||||
version = "0.0.12"
|
||||
@ -148,6 +259,12 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "siphasher"
|
||||
version = "0.3.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.107"
|
||||
@ -164,6 +281,15 @@ name = "this_algoritm"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"s2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "types"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"csv",
|
||||
"phf",
|
||||
"phf_codegen",
|
||||
"serde",
|
||||
]
|
||||
|
||||
|
@ -3,8 +3,13 @@ name = "this_algoritm"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[workspace]
|
||||
members = [
|
||||
".",
|
||||
"./types",
|
||||
]
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
serde="1"
|
||||
s2="0.0.12"
|
||||
|
11211
data/wordlist-tmp.csv
Normal file
11211
data/wordlist-tmp.csv
Normal file
File diff suppressed because it is too large
Load Diff
@ -12,21 +12,34 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.8.1)\n",
|
||||
"Requirement already satisfied: odfpy in /opt/conda/lib/python3.10/site-packages (1.4.1)\n",
|
||||
"Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
|
||||
"Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.10.31)\n",
|
||||
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
|
||||
"Collecting nltk\n",
|
||||
" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hCollecting odfpy\n",
|
||||
" Downloading odfpy-1.4.1.tar.gz (717 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m717.0/717.0 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hRequirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.3)\n",
|
||||
"Collecting regex>=2021.8.3\n",
|
||||
" Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)\n",
|
||||
"Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.2.0)\n",
|
||||
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n"
|
||||
"Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from odfpy) (0.7.1)\n",
|
||||
"Building wheels for collected packages: odfpy\n",
|
||||
" Building wheel for odfpy (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=3ee9aaac0134706d6ef72a359cd2466813f37bd8f080150b008d6d6e247d710c\n",
|
||||
" Stored in directory: /home/jovyan/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469\n",
|
||||
"Successfully built odfpy\n",
|
||||
"Installing collected packages: regex, odfpy, nltk\n",
|
||||
"Successfully installed nltk-3.8.1 odfpy-1.4.1 regex-2022.10.31\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n",
|
||||
"[nltk_data] Package wordnet is already up-to-date!\n"
|
||||
"[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -79,7 +92,7 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'd', 'john']"
|
||||
"['a', 'i', 's', 'p', 'c', 'b', 'american', 'york', 'children', 'd']"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
@ -115,6 +128,7 @@
|
||||
"text/plain": [
|
||||
"[('be', 'bee'),\n",
|
||||
" ('by', 'bye'),\n",
|
||||
" ('died', 'dyed'),\n",
|
||||
" ('corps', 'core'),\n",
|
||||
" ('ore', 'oar'),\n",
|
||||
" ('ore', ' or'),\n",
|
||||
@ -122,6 +136,7 @@
|
||||
" ('com', 'calm'),\n",
|
||||
" ('filing', 'filling'),\n",
|
||||
" ('fax', 'facts'),\n",
|
||||
" ('favour', 'favor'),\n",
|
||||
" ('theatre', 'theater'),\n",
|
||||
" ('par', 'parse'),\n",
|
||||
" ('honour', 'honor'),\n",
|
||||
@ -158,6 +173,7 @@
|
||||
" ('hem', 'him'),\n",
|
||||
" ('nun', 'none'),\n",
|
||||
" ('organisational', 'organizational'),\n",
|
||||
" ('dessert', 'desert'),\n",
|
||||
" ('aux', 'ox'),\n",
|
||||
" ('rap', 'wrap'),\n",
|
||||
" ('filings', 'filling'),\n",
|
||||
@ -194,13 +210,13 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"# all_words: 21323\n",
|
||||
"# all_words: 21287\n",
|
||||
"sample: ['the', 'of', 'and', 'to', 'in', 'is', 'that', 'for', 'as', 'it']\n",
|
||||
"\n",
|
||||
"# lemmatize_mappings: 21374\n",
|
||||
"# lemmatize_mappings: 21341\n",
|
||||
"sample: [('the', 'the'), ('of', 'of'), ('and', 'and'), ('to', 'to'), ('in', 'in'), ('is', 'is'), ('that', 'that'), ('for', 'for'), ('as', 'a'), ('it', 'it')]\n",
|
||||
"\n",
|
||||
"# distinct_words: 17585\n",
|
||||
"# distinct_words: 17557\n",
|
||||
"sample:\n"
|
||||
]
|
||||
},
|
||||
@ -310,7 +326,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Final wordlist size: 11212\n"
|
||||
"Final wordlist size: 11210\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -344,21 +360,50 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 12,
|
||||
"id": "d1a06597-4ad5-4566-a716-8bbad416b7ab",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]\n",
|
||||
"\n",
|
||||
"with open(\"final_wordlist.csv\", \"w\") as f:\n",
|
||||
" sorted_final_wordlist = [(k, final_wordlist[k]) for k in final_wordlist.keys()]\n",
|
||||
" f.write(\"word,number\\n\")\n",
|
||||
" \n",
|
||||
" for w in sorted(sorted_final_wordlist, key=lambda w: w[1]):\n",
|
||||
" lemmatized = \"\" if not w[1] else w[1]\n",
|
||||
" f.write(f\"{w[0]},{lemmatized}\")\n",
|
||||
" f.write(f\"{w[0].upper()},{lemmatized - 1}\")\n",
|
||||
" f.write(\"\\n\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "c88fe193-11cc-4a06-a3cf-d1ad85f44d14",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'H'"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2a0d177b-3499-42fb-8091-29547567d69a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
7
types/Cargo.lock
generated
Normal file
7
types/Cargo.lock
generated
Normal file
@ -0,0 +1,7 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "types"
|
||||
version = "0.1.0"
|
15
types/Cargo.toml
Normal file
15
types/Cargo.toml
Normal file
@ -0,0 +1,15 @@
|
||||
[package]
|
||||
name = "types"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
serde={version="1", features=["derive"]}
|
||||
phf="0.11"
|
||||
|
||||
[build-dependencies]
|
||||
serde={version="1", features=["derive"]}
|
||||
phf_codegen="0.11"
|
||||
csv="1.1"
|
88
types/build.rs
Normal file
88
types/build.rs
Normal file
@ -0,0 +1,88 @@
|
||||
#![feature(slice_group_by)]
|
||||
#![allow(unused_imports)]
|
||||
use csv::ReaderBuilder;
|
||||
use serde::{Deserialize, Deserializer, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, BufWriter, Write};
|
||||
use std::path::Path;
|
||||
|
||||
fn main() {
|
||||
let path = Path::new(&env::var("OUT_DIR").unwrap()).join("codegen.rs");
|
||||
let mut file = BufWriter::new(File::create(path).unwrap());
|
||||
|
||||
let rdr_builder = ReaderBuilder::new();
|
||||
|
||||
// First get the actual wordlist
|
||||
let words: Vec<Word> = rdr_builder
|
||||
.from_reader(File::open("../data/wordlist-tmp.csv").unwrap())
|
||||
.deserialize()
|
||||
.collect::<Result<Vec<Word>, _>>()
|
||||
.unwrap();
|
||||
|
||||
// Write it to an array containing all words
|
||||
writeln!(
|
||||
&mut file,
|
||||
r#"/// Static array of `Word`
|
||||
pub const WORDS: &[Word] = &["#
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
for result in words.iter() {
|
||||
writeln!(&mut file, "{result:?},").unwrap();
|
||||
}
|
||||
|
||||
writeln!(&mut file, "];\n").unwrap();
|
||||
|
||||
// Make a mapping of all caps word to a reference to the `Word` entry
|
||||
let mut word_map = phf_codegen::Map::new();
|
||||
for (idx, word) in words.iter().enumerate() {
|
||||
let idx_str = format!("&WORDS[{idx}]");
|
||||
word_map.entry(word.word.to_uppercase(), &idx_str);
|
||||
}
|
||||
writeln!(
|
||||
&mut file,
|
||||
r#"/// Mapping from all caps `&str` to `&'static Word`
|
||||
pub static WORD_MAP: phf::Map<&'static str, &'static Word> =
|
||||
{};"#,
|
||||
word_map.build()
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// Make a mapping of numbers to `Word`s
|
||||
let word_number_to_idx = words
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, w)| (w.number, idx))
|
||||
.collect::<Vec<(u16, usize)>>();
|
||||
|
||||
writeln!(
|
||||
&mut file,
|
||||
"pub const NUMBER_TO_WORD: &[&[&'static Word]] = &["
|
||||
)
|
||||
.unwrap();
|
||||
for entry in word_number_to_idx
|
||||
.as_slice()
|
||||
.group_by(|(number1, _idx1), (number2, _idx2)| number1 == number2)
|
||||
{
|
||||
write!(&mut file, "\t&[",).unwrap();
|
||||
|
||||
for idx in entry.iter().map(|(_w, idx)| idx) {
|
||||
write!(&mut file, "&WORDS[{idx}],").unwrap();
|
||||
}
|
||||
writeln!(&mut file, "],").unwrap();
|
||||
}
|
||||
writeln!(&mut file, "];\n").unwrap();
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Word {
|
||||
/// The word itself
|
||||
pub word: String,
|
||||
|
||||
/// The binary representation of this number
|
||||
///
|
||||
/// The words are responsible for 13 bits of data, so this is fine to fit in a u16
|
||||
pub number: u16,
|
||||
}
|
39
types/src/lib.rs
Normal file
39
types/src/lib.rs
Normal file
@ -0,0 +1,39 @@
|
||||
include!(concat!(env!("OUT_DIR"), "/codegen.rs"));
|
||||
|
||||
/// A single entry of the word list.
///
/// Instances are normally the `'static` values from the generated `WORDS`
/// table. The type holds only a string slice and a `u16`, so it is `Copy`
/// and can be compared, hashed and passed by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Word<'a> {
    /// The word itself
    pub word: &'a str,

    /// The number assigned to this word
    ///
    /// The words are responsible for 13 bits of data, so this is fine to fit in a u16
    pub number: u16,
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct Address<'a> {
|
||||
number: u32,
|
||||
words: [Word<'a>; 3],
|
||||
}
|
||||
|
||||
// /// Find a word
|
||||
// ///
|
||||
// /// ```rust
|
||||
// /// use static_data::find_country;
|
||||
// ///
|
||||
// /// assert!(find_country("US").is_some());
|
||||
// /// assert!(find_country("UnItEd StAtEs").is_some());
|
||||
// /// assert!(find_country("abcd").is_none());
|
||||
// /// ```
|
||||
// pub fn find_country<S>(s: S) -> Option<&'static Country<'static>>
|
||||
// where
|
||||
// S: AsRef<str>,
|
||||
// {
|
||||
// // TODO: Decide whether uppercasing is too slow
|
||||
// COUNTRY_MAP
|
||||
// .get(s.as_ref())
|
||||
// .or_else(|| COUNTRY_MAP.get(&s.as_ref().to_uppercase()))
|
||||
// .copied()
|
||||
// }
|
Loading…
Reference in New Issue
Block a user