From ea1d77a20889f17e502a6f525e722a74d322fc20 Mon Sep 17 00:00:00 2001 From: Austen Adler Date: Mon, 9 Sep 2024 15:50:53 -0400 Subject: [PATCH] Release next version --- Cargo.lock | 76 +++- Cargo.toml | 10 +- src/lib.rs | 2 + src/main.rs | 247 +++++++---- src/parser.rs | 1154 +++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 1392 insertions(+), 97 deletions(-) create mode 100644 src/lib.rs create mode 100644 src/parser.rs diff --git a/Cargo.lock b/Cargo.lock index e43ac06..4c6d58c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -57,12 +57,6 @@ version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" -[[package]] -name = "arrayvec" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" - [[package]] name = "atomicwrites" version = "0.4.3" @@ -123,7 +117,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.75", ] [[package]] @@ -153,6 +147,17 @@ version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +[[package]] +name = "derivative" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "errno" version = "0.3.9" @@ -181,15 +186,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "fjson" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9749dc2b27a3c20c7a10a40dff21369bcc7ed67e52b9e3d5858a1b6cd44cb5d" -dependencies = [ - "arrayvec", -] - [[package]] name = "fsevent-sys" version = "4.1.0" @@ -239,8 +235,11 @@ dependencies = [ "atomicwrites", "clap", 
"crossbeam-channel", - "fjson", + "derivative", + "memchr", "notify-debouncer-mini", + "oxidized-json-checker", + "thiserror", ] [[package]] @@ -292,6 +291,12 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + [[package]] name = "mio" version = "0.8.11" @@ -340,6 +345,12 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "oxidized-json-checker" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "938464aebf563f48ab86d1cfc0e2df952985c0b814d3108f41d1b85e7f5b0dac" + [[package]] name = "proc-macro2" version = "1.0.86" @@ -395,6 +406,17 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.75" @@ -419,6 +441,26 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "thiserror" +version = "1.0.63" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.63" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.75", +] + [[package]] name = "unicode-ident" version = "1.0.12" diff --git a/Cargo.toml b/Cargo.toml index 66a37ed..f334bea 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,5 +8,13 @@ anyhow = "1.0.86" atomicwrites = "0.4.3" clap = { version = "4.5.16", features = ["derive"] } crossbeam-channel = "0.5.13" -fjson = "0.3.1" +derivative = "2.2.0" +# fjson = {path="./fjson/"} +# fjson = "0.3.1" +memchr = "2.7.4" notify-debouncer-mini = "0.4.1" +oxidized-json-checker = "0.3.2" +thiserror = "1.0.63" + +# [profile.release] +# debug = 2 diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..3262ac3 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,2 @@ +pub mod parser; +pub mod indentor; diff --git a/src/main.rs b/src/main.rs index 4c320b6..fb4d7e9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,167 +1,257 @@ +mod indentor; +mod parser; + use anyhow::{bail, Context, Error, Result}; use atomicwrites::{AtomicFile, OverwriteBehavior::AllowOverwrite}; use clap::{Args, Parser, Subcommand}; use crossbeam_channel::{Receiver, Sender}; use notify_debouncer_mini::{new_debouncer, notify::*, DebounceEventResult}; +use parser::Mode; use std::{ collections::HashSet, + convert::Infallible, ffi::OsString, - fs, - io::Write, + fs::File, + io::{BufRead, BufReader, BufWriter}, path::{Path, PathBuf}, + str::FromStr, time::Duration, }; #[derive(Parser, Debug)] struct Cli { + #[clap(help = "Input file, or `-` for stdin", default_value = "-")] + input: IoArg, + #[command(subcommand)] - command: Command, + command: Option, + + #[command(flatten)] + fmt_args: FmtArgs, + + #[clap( + short = 'o', + long = "jsoncc-output", + help = "Output file; will be stdout if no output is specified" + )] + output: Option, + + #[clap(short = 'O', long = "json-output", help = "Output file for json")] + json_output: Option, } #[derive(Subcommand, Debug)] enum Command { - #[clap(about = 
"Format a single file or stdin")] - Fmt(FmtArgs), + // #[clap(about = "Format a single file or stdin")] + // Fmt(FmtArgs), #[clap(about = "Watch a file or directory for changes")] Watch(WatchArgs), } #[derive(Args, Debug)] struct WatchArgs { - path: PathBuf, - + // path: PathBuf, #[clap(short = 'e', long = "extension", default_values = ["jsonc", "jsoncc"], help = "File extensions to track")] extensions: Vec, #[clap(short = 'r', long = "recursive", help = "Recursively search files")] recursive: bool, - - #[clap(short = 'I', long = "inplace", help = "Replace each file inplace")] - inplace: bool, + // #[clap(short = 'I', long = "inplace", help = "Replace each file inplace")] + // inplace: bool, } -#[derive(Args, Debug)] +#[derive(Args, Debug, Clone)] struct FmtArgs { - // #[clap(short = 'i', long = "input")] - #[clap(help = "Input file, or `-` for stdin")] - input: Option, - #[clap( - short = 'o', - long = "output", - help = "Output file; will be stdout if no output is specified" + short = 'c', + long = "compact", + help = "Compact json format", + conflicts_with = "output" )] - output: Option, - - #[clap(short = 'O', long = "json-output", help = "Output file for json")] - json_output: Option, - - #[clap(short = 'c', long = "compact", help = "Compact json format")] compact: bool, - #[clap(short = 'I', long = "inplace", help = "Replace file contents inplace")] + #[clap( + short = 'I', + long = "inplace", + help = "Replace file contents inplace", + requires = "input" + )] inplace: bool, + + #[clap(short = 'V', long = "validate", help = "Validate input is valid")] + validate: bool, } -impl FmtArgs { +impl Cli { /// Where should we format Jsonc output to? 
- fn jsonc_output(&self) -> Option { - if self.inplace { - Some(JsoncOutput::File(self.input.as_ref().expect( - "Argument parsing error -- input was empty, but --inplace was specified", - ))) - } else if let Some(ref output_file) = &self.output { - Some(if output_file.as_os_str() == "-" { - JsoncOutput::Stdout - } else { - JsoncOutput::File(output_file) - }) + fn jsonc_output(&self) -> Option> { + if self.fmt_args.inplace { + if !matches!(self.input, IoArg::File(_)) { + panic!("--inplace was specified, but input is not a file"); + } + + Some(self.input.as_output()) + } else if self.output.is_some() { + self.output.as_ref().map(IoArg::as_output) } else if self.json_output.is_some() { // We don't want to output jsonc anywhere if they don't specify -o and they do specify -O None } else { // If they don't have any output specified, default to stdout - Some(JsoncOutput::Stdout) + Some(IoArgRef::Stdio) } } } -enum JsoncOutput<'a> { - Stdout, +/// An argument that represents a file or stdin/stdout +#[derive(Debug, Clone)] +enum IoArg { + Stdio, + File(PathBuf), +} + +impl Default for IoArg { + fn default() -> Self { + Self::Stdio + } +} + +impl FromStr for IoArg { + type Err = Infallible; + + fn from_str(s: &str) -> std::result::Result { + if s == "-" { + Ok(Self::Stdio) + } else { + PathBuf::from_str(s).map(Self::File) + } + } +} + +impl IoArg { + fn as_output(&self) -> IoArgRef { + match self { + Self::Stdio => IoArgRef::Stdio, + Self::File(f) => IoArgRef::File(f), + } + } +} + +// impl<'a> AsRef> for IoArg { +// fn as_ref<'b>(&'b self) -> &'b Output<'a> { +// match self { +// IoArg::Stdio => &Output::Stdio, +// IoArg::File(f) => &Output::File(f), +// } +// } +// } + +#[derive(Debug)] +enum IoArgRef<'a> { + Stdio, File(&'a Path), } fn main() -> Result<()> { - let options = Cli::parse(); + let cli = Cli::parse(); - match options.command { - Command::Fmt(a) => { + match &cli.command { + None => { // TODO: Figure out how to validate this in clap Parser - if a.compact && 
a.json_output.is_none() { + if cli.fmt_args.compact && cli.json_output.is_none() { bail!("Cannot compact format jsonc. Specify --json-output if you want to use --compact"); } - if a.inplace && a.output.is_some() { + if cli.fmt_args.inplace && cli.output.is_some() { bail!("Cannot format --inplace when --output is specified"); } format_single_file( - a.input.as_ref(), - a.jsonc_output().as_ref(), - a.json_output.as_ref(), - a.compact, + &cli.input.as_output(), + cli.jsonc_output().as_ref(), + cli.json_output.as_ref(), + &cli.fmt_args, )?; } - Command::Watch(a) => watch(&a)?, + Some(Command::Watch(a)) => watch(&cli, a)?, } Ok(()) } +// TODO: Accept a [`FmtArgs`] fn format_single_file( - input: Option>, - jsonc_output: Option<&JsoncOutput>, + input: &IoArgRef, + jsonc_output: Option<&IoArgRef>, json_output: Option>, - json_compact: bool, + fmt_args: &FmtArgs, ) -> Result<()> { - let input_str = if let Some(input_filename) = input { - fs::read_to_string(&input_filename).context("Reading input")? + let mut input: Box = if let IoArgRef::File(input_filename) = input { + Box::new(BufReader::new( + File::open(input_filename).context("Reading input")?, + )) } else { - std::io::read_to_string(std::io::stdin()).context("Reading stdin")? 
+ Box::new(BufReader::new(std::io::stdin().lock())) }; // First, format jsonc if let Some(jsonc_output) = jsonc_output { - let output = fjson::to_jsonc(&input_str).context("Parsing jsonc")?; - match jsonc_output { - JsoncOutput::Stdout => print!("{output}"), - JsoncOutput::File(output_file) => AtomicFile::new(output_file, AllowOverwrite) - .write(|f| f.write_all(output.as_bytes())) - .context("Writing jsonc output")?, + IoArgRef::Stdio => parser::Parser::new( + parser::Mode::Jsoncc, + &mut input, + BufWriter::new(std::io::stdout()), + ) + .format_buf() + .context("Formatting file")?, + IoArgRef::File(output_file) => AtomicFile::new(output_file, AllowOverwrite) + .write(|f| { + parser::Parser::new(parser::Mode::CompactJson, &mut input, BufWriter::new(f)) + .format_buf() + }) + .context("Formatting file")?, } } // Format json next if let Some(ref json_output_file) = json_output { - let output = if json_compact { - fjson::to_json_compact(&input_str).context("Formatting to json") + let mode = if fmt_args.compact { + Mode::CompactJson } else { - fjson::to_json(&input_str).context("Formatting to json") - }?; + Mode::Json + }; if json_output_file.as_ref().as_os_str() == "-" { - print!("{output}"); + parser::Parser::new(mode, &mut input, BufWriter::new(std::io::stdout())) + .format_buf() + .context("Formatting file")? } else { AtomicFile::new(json_output_file, AllowOverwrite) - .write(|f| f.write_all(output.as_bytes())) - .context("Writing jsonc output")?; + .write(|f| parser::Parser::new(mode, &mut input, BufWriter::new(f)).format_buf()) + .context("Writing json output")?; } } + // Validate - Just duplicate code here. 
If they want to validate, it adds a little extra cost anyway + // Reformatting is probably not a big cost + if fmt_args.validate { + // TODO: Not + let mut buf = vec![]; + parser::Parser::new(Mode::CompactJson, &mut input, BufWriter::new(&mut buf)) + .format_buf() + .context("Formatting file")?; + + oxidized_json_checker::validate(&buf[..])?; + } + Ok(()) } -fn watch(args: &WatchArgs) -> Result<()> { - let is_watching_file = args.path.is_file(); +fn watch(cli: &Cli, args: &WatchArgs) -> Result<()> { + // The path to watch + let IoArg::File(ref watch_path) = cli.input else { + panic!("Input must be specified") + }; + // True if we are watching only a single file + let is_watching_file = watch_path.is_file(); let (terminate_tx, terminate_rx): (Sender>, Receiver>) = crossbeam_channel::bounded(100); @@ -186,7 +276,7 @@ fn watch(args: &WatchArgs) -> Result<()> { .watcher() // TODO: Make this recursive or not .watch( - &args.path, + watch_path, if args.recursive { RecursiveMode::Recursive } else { @@ -198,7 +288,7 @@ fn watch(args: &WatchArgs) -> Result<()> { // Keep track of files that have just been formatted let mut just_formatted = HashSet::new(); - eprintln!("Watching {:?}", args.path); + eprintln!("Watching {:?}", watch_path); while let Ok(evt) = terminate_rx.recv() { match evt { @@ -221,10 +311,10 @@ fn watch(args: &WatchArgs) -> Result<()> { eprintln!("Got result: {path:#?}"); match format_single_file( - Some(&path), - Some(&JsoncOutput::File(&path)), + &IoArgRef::File(&path), + Some(&IoArgRef::File(&path)), None::<&Path>, - false, + &cli.fmt_args, ) { Ok(()) => { eprintln!("Formatted file {:?}", path); @@ -234,12 +324,11 @@ fn watch(args: &WatchArgs) -> Result<()> { // This is because on formatting, the file is unlinked, so we lose our watch debouncer .watcher() - // TODO: Make this recursive or not - .watch(&args.path, RecursiveMode::NonRecursive) + .watch(&path, RecursiveMode::NonRecursive) .context("Adding watch to debouncer")?; } - // Otherwise, we don't 
want to trigger anything for this file, so we ignore it next time + // We don't want to trigger anything for this file, so we ignore it next time just_formatted.insert(path); } Err(e) => { diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..55e2de7 --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,1154 @@ +// use anyhow::Result; +use std::collections::VecDeque; +use std::io::BufRead; +use std::io::ErrorKind; +use std::io::Write; + +use derivative::Derivative; + +use crate::indentor::Indentor; + +const INDENT: &[u8] = b" "; +const RECORD_SEPARATOR: &[u8] = b"\n"; +const NEWLINE: &[u8] = b"\n"; + +const C_CR: u8 = b'\r'; +const C_LF: u8 = b'\n'; +const C_TAB: u8 = b'\t'; +const C_SPACE: u8 = b' '; + +const C_COMMA: u8 = b','; +const C_COLON: u8 = b':'; +const C_QUOTE: u8 = b'"'; +const C_BACKSLASH: u8 = b'\\'; + +const C_LEFT_BRACE: u8 = b'{'; +const C_LEFT_BRACKET: u8 = b'['; +const C_RIGHT_BRACE: u8 = b'}'; +const C_RIGHT_BRACKET: u8 = b']'; + +const C_SLASH: u8 = b'/'; +const C_STAR: u8 = b'*'; + +const C_PLUS: u8 = b'+'; +const C_DOT: u8 = b'.'; +const C_MINUS: u8 = b'-'; +const C_E: u8 = b'-'; + +/// Mode of operation of ouptut of the parser +#[derive(Debug, PartialEq, Eq)] +pub enum Mode { + /// Add trailing commas, and do not strip comments + Jsoncc, + /// Strip comments, and add whitespace and newlines + Json, + /// Strip comments, and strip all optional whitespace + CompactJson, +} + +impl Mode { + /// Check if the mode wants to keep comments or strip them + fn keep_comments(&self) -> bool { + match self { + Mode::Jsoncc => true, + Mode::Json | Mode::CompactJson => false, + } + } +} + +impl Default for Mode { + fn default() -> Self { + Self::Jsoncc + } +} + +pub type Result = std::result::Result; + +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// The input buffer is empty, but we need a token + #[error("Buffer unexpectedly empty")] + BufferEmpty, + /// Bytes that look like a value (true, false, null, a number, or a string) 
was found in the wrong position + #[error("Unexpected value type")] + UnexpectedValue, + /// A byte was found in an unexpected position + #[error("Unexpected char {0:?}")] + UnexpectedChar(char), + /// A collection end token was found in an unexpected position + #[error("Unexpected collection ending")] + UnexpectedCollectionEnd, + /// An IO error occured when reading or writing + #[error("IO Error: {0}")] + Io(#[from] std::io::Error), +} + +impl Error { + pub fn is_eof(&self) -> bool { + matches!(self, Self::Io(e) if e.kind() == ErrorKind::UnexpectedEof) + } +} + +/// A token found in the input stream +/// +/// This does not track `:` or `,` for two reasons: +/// +/// 1. All input is jsoncc, which has optional `,`. `,` provides no extra information as the next token would need to be checked to decide if the current value is the last value +/// 1. `:` state is derived by the [`CollectionState::Object`] `awaiting_key` field +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Token { + /// We have reached an EOF at a position that is not in a value + Eof, + /// The root of the input + /// + /// Note that there can be multiple root tokens. For example, for input `{}{}`, root tokens are sent at these positions: `^{}^{}` + Root, + /// The start of an object or array + CollectionStart { ty: CollectionType }, + /// The end of an object or array + CollectionEnd { ty: CollectionType }, + /// A block or line comment + Comment { + ty: CommentType, + /// Should this comment be on its own line? + /// + /// This is derived from the input. 
+ /// If the comment is read on a line with only whitespace tokens, this is set to true + own_line: bool, + }, + /// A value that is not a collection + Value { ty: ValueType, first_char: u8 }, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CollectionType { + Object, + Array, +} + +#[derive(Debug, Clone, Copy)] +enum CollectionState { + Object { awaiting_key: bool }, + Array, +} + +impl CollectionState { + fn ty(&self) -> CollectionType { + match self { + Self::Object { awaiting_key: _ } => CollectionType::Object, + Self::Array => CollectionType::Array, + } + } +} + +impl CollectionType { + fn as_state(&self) -> CollectionState { + match self { + Self::Object => CollectionState::Object { awaiting_key: true }, + Self::Array => CollectionState::Array, + } + } + + fn start_str(&self) -> &'static str { + match self { + Self::Object => "{", + Self::Array => "[", + } + } + + fn end_str(&self) -> &'static str { + match self { + Self::Object => "}", + Self::Array => "]", + } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum CommentType { + Line, + Block, +} + +impl CommentType { + fn start_str(&self) -> &'static str { + match self { + Self::Line => "//", + Self::Block => "/*", + } + } +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum ValueType { + // TODO: Do we want to allow the possibility of unquoted object keys? + // Unquoted values would be a bad idea. 
For example, there would be ambiguity for {x: true} (is it {"x": "true"} or {"x": true}) + // You could force `true`/`false`/`null`/numbers to be non-strings, but then you end up with the yaml `yes`/`no` problem + // Also, if someone types a number like `-1.4e4.`, we don't want that converted to a string, we should keep it as an (invalid) number + // UnquotedString, + String, + Number, + Boolean, + Null, +} + +#[derive(Derivative)] +#[derivative(Debug)] +pub struct Parser +where + R: BufRead, + W: Write, +{ + /// Input reader + #[derivative(Debug = "ignore")] + input: R, + /// Output writer + #[derivative(Debug = "ignore")] + write: W, + /// Stack tracking the state of the parser + /// + /// When descending into an array or object, push a [`CollectionState`] here + state_stack: VecDeque, + /// The current token the parser has received + current_token: Token, + // TODO: This can be used to add whitespace (if [`num_empty_lines`] > 1) + /// The number of empty lines read from [`input`] in a row + /// + /// Empty lines are lines that only contain whitespace + num_empty_lines: u8, + /// The mode of operation of the parser + mode: Mode, + /// Buffered indent strings so repeated calls do not have to repeated call [`std::io::repeat`] + indentor: Indentor, +} + +impl Parser +where + R: BufRead, + W: Write, +{ + pub fn new(mode: Mode, read: R, write: W) -> Self { + Self { + input: read, + write, + state_stack: VecDeque::new(), + current_token: Token::Root, + num_empty_lines: 0, + mode, + indentor: Indentor::new(INDENT), + } + } + + /// Send the rest of the input to the writer until the end of the comment is reached + fn drain_comment(&mut self, ty: &CommentType) -> Result<()> { + let mut maybe_block_end = false; + + loop { + let buf = self.input.fill_buf()?; + + if buf.is_empty() { + return Err(Error::BufferEmpty); + } + + match ty { + CommentType::Line => match line_comment_end(buf) { + Some(idx) => { + if self.mode.keep_comments() { + self.write.write_all(&buf[0..idx])?; + 
} + + self.input.consume(idx); + break; + } + None => { + if self.mode.keep_comments() { + self.write.write_all(buf)?; + } + let len = buf.len(); + self.input.consume(len); + } + }, + CommentType::Block => { + if maybe_block_end && buf[0] == b'/' { + // We ended the block comment + if self.mode.keep_comments() { + self.write.write_all(b"/")?; + } + break; + } + + maybe_block_end = false; + + match block_comment_end(buf) { + BlockCommentEnd::Position(idx) => { + if self.mode.keep_comments() { + self.write.write_all(&buf[0..idx])?; + } + + self.input.consume(idx); + break; + } + BlockCommentEnd::MaybeEnd => { + if self.mode.keep_comments() { + self.write.write_all(buf)?; + } + let len = buf.len(); + self.input.consume(len); + maybe_block_end = true; + } + BlockCommentEnd::None => { + if self.mode.keep_comments() { + self.write.write_all(buf)?; + } + let len = buf.len(); + self.input.consume(len); + } + } + } + } + } + + Ok(()) + } + + /// Send the rest of the input to the writer until the end of the value is reached + fn drain_value(&mut self, ty: &ValueType, first_char: u8) -> Result<()> { + match (ty, first_char) { + (ValueType::String, C_QUOTE) => { + let mut next_char_escaped = false; + self.write([C_QUOTE])?; + + // Loop until we are done with the string + loop { + if next_char_escaped { + // The previous buffer ended in `\` + // Send this character out + let next_char = self.next_char()?; + self.write([next_char])?; + } + next_char_escaped = false; + + let buf = self.input.fill_buf()?; + + if buf.is_empty() { + return Err(Error::BufferEmpty); + } + + match string_end(buf) { + StringEnd::Position(idx) => { + self.write.write_all(&buf[0..idx])?; + self.input.consume(idx); + break; + } + StringEnd::MaybeEnd => { + self.write.write_all(buf)?; + let len = buf.len(); + self.input.consume(len); + next_char_escaped = true; + } + StringEnd::None => { + self.write.write_all(buf)?; + let len = buf.len(); + self.input.consume(len); + } + } + } + + let next_char = 
self.next_char()?; + self.write([next_char])?; + + Ok(()) + } + (ValueType::Number, mut c) => { + loop { + self.write([c])?; + c = self.peek_next_char()?; + // Any of the json numerical characters + if c == C_PLUS + || c == C_MINUS + || c == C_DOT + || (c as char).is_ascii_digit() + || c == C_E + { + self.next_char()?; + } else { + break; + } + } + Ok(()) + } + (ValueType::Boolean, b't') => { + let mut chr = [0_u8; 3]; + self.input.read_exact(&mut chr)?; + + if chr == *b"rue" { + self.write("true")?; + Ok(()) + } else { + Err(Error::UnexpectedValue) + } + } + (ValueType::Boolean, b'f') => { + let mut chr = [0_u8; 4]; + self.input.read_exact(&mut chr)?; + + if chr == *b"alse" { + self.write("false")?; + Ok(()) + } else { + Err(Error::UnexpectedValue) + } + } + (ValueType::Null, b'n') => { + let mut chr = [0_u8; 3]; + self.input.read_exact(&mut chr)?; + + if chr == *b"ull" { + self.write("null")?; + Ok(()) + } else { + Err(Error::UnexpectedValue) + } + } + _ => { + eprintln!("Value type: {ty:?}, with first char {first_char:?}"); + Err(Error::UnexpectedValue) + } + } + } + + /// Write some bytes to the writer + fn write(&mut self, buf: impl AsRef<[u8]>) -> Result<()> { + // eprintln!("### Writing {:?}", String::from_utf8_lossy(buf.as_ref())); + self.write.write_all(buf.as_ref())?; + Ok(()) + } + + /// Write the record separator to the writer + fn record_separator(&mut self) -> Result<()> { + self.write(RECORD_SEPARATOR)?; + + Ok(()) + } + + /// Add extra padding after `:` or before the `//`/`/*` in a comment, if the format requests it + fn extra_spacing(&mut self) -> Result<()> { + match self.mode { + Mode::Jsoncc | Mode::Json => self.write(" ")?, + Mode::CompactJson => {} + } + + Ok(()) + } + + /// Add a comma only if we are not at the root level + fn comma(&mut self) -> Result<()> { + // We don't want a comma if this is a root element + if self.state_stack.is_empty() { + return Ok(()); + } + + self.write(",")?; + + Ok(()) + } + + /// Add a trailing comma only if we 
are not at the root level and we are in [`Mode::Jsoncc`] + fn trailing_comma(&mut self) -> Result<()> { + match self.mode { + Mode::Jsoncc => self.comma()?, + Mode::Json | Mode::CompactJson => {} + } + + Ok(()) + } + + /// Write a newline and add indentation + fn newline(&mut self) -> Result<()> { + match self.mode { + Mode::Jsoncc | Mode::Json => { + self.write(NEWLINE)?; + self.write + .write_all(self.indentor.get_indent(self.state_stack.len()))?; + } + Mode::CompactJson => {} + } + + Ok(()) + } + + /// Leave a collection + /// + /// Call this after you see a `]` or `}` token, and you want the parser to ensure that we were in the right kind of collection before leaving it + fn exit_collection(&mut self, ty: &CollectionType) -> Result<()> { + if Some(*ty) + != self + .state_stack + .pop_back() + .as_ref() + .map(CollectionState::ty) + { + return Err(Error::UnexpectedCollectionEnd); + } + + Ok(()) + } + + /// Format the reader into the writer and consume the [`Parser`] by reading tokens and sending formatted output + /// + /// Generally, the writer state ends with each token written with the ending `:` as required by the next token + /// A `,` is decided if the `current_token` is a value and the next token is something that warrants a `,` (either another value, a collection, or a collection end in jsoncc mode) + /// + /// For example: + /// ```text + /// ["a", "b"] + /// ^ + /// ``` + /// + /// At this position, `self::current_token` is a `Value` (representing `"a"`) and `next_token` represents `"b"`, so we know a `,` has been written + /// In Jsoncc/Json mode, write a newline, indent, and flush the `"b"` Value + pub fn format_buf(mut self) -> Result<()> { + loop { + // eprintln!("========================================================"); + // eprintln!("{:?}", self); + + let mut next_token = self.get_next_token()?; + + // eprintln!("{:#?}\n{:#?}", self.current_token, next_token); + // eprintln!(); + + match (self.current_token, &next_token) { + (Token::Root, 
Token::CollectionStart { ty }) => { + self.state_stack.push_back(ty.as_state()); + self.write(ty.start_str())?; + } + (Token::Root, Token::CollectionEnd { ty }) => { + self.exit_collection(ty)?; + self.write(ty.end_str())?; + self.write(ty.end_str())?; + } + (Token::Root, Token::Comment { ty, own_line: _ }) => { + self.write(ty.start_str())?; + self.drain_comment(ty)?; + } + (Token::Root, Token::Value { ty, first_char }) => { + self.drain_value(ty, *first_char)?; + next_token = Token::Root; + } + (Token::CollectionStart { ty: _ }, Token::CollectionStart { ty }) => { + self.newline()?; + self.write(ty.start_str())?; + self.state_stack.push_back(ty.as_state()); + } + (Token::CollectionStart { ty: _ }, Token::CollectionEnd { ty }) => { + // `{}` or `[]` + self.exit_collection(ty)?; + self.write(ty.end_str())?; + } + (Token::CollectionStart { ty: _ }, Token::Comment { ty, own_line: _ }) => { + // Force own_line to be true + self.newline()?; + self.write(ty.start_str())?; + self.drain_comment(ty)?; + } + (Token::CollectionStart { ty: _ }, Token::Value { ty, first_char }) => { + self.newline()?; + self.drain_value(ty, *first_char)?; + if self.is_awaiting_key()? { + self.write(":")?; + } + self.toggle_awaiting_key()?; + } + (Token::CollectionEnd { ty: _ }, Token::CollectionStart { ty }) => { + self.comma()?; + self.newline()?; + self.write(ty.start_str())?; + self.state_stack.push_back(ty.as_state()); + } + (Token::CollectionEnd { ty: _ }, Token::CollectionEnd { ty }) => { + self.trailing_comma()?; + self.exit_collection(ty)?; + self.newline()?; + self.write(ty.end_str())?; + } + (Token::CollectionEnd { ty: _ }, Token::Comment { ty, own_line: _ }) => { + // Force own_line to be true + self.trailing_comma()?; + self.newline()?; + self.write(ty.start_str())?; + self.drain_comment(ty)?; + } + (Token::CollectionEnd { ty: _ }, Token::Value { ty, first_char }) => { + self.comma()?; + self.newline()?; + self.drain_value(ty, *first_char)?; + if self.is_awaiting_key()? 
{ + self.write(":")?; + } + self.toggle_awaiting_key()?; + } + (Token::Comment { ty: _, own_line: _ }, Token::CollectionStart { ty }) => { + self.newline()?; + self.write(ty.start_str())?; + self.state_stack.push_back(ty.as_state()); + } + (Token::Comment { ty: _, own_line: _ }, Token::CollectionEnd { ty }) => { + self.exit_collection(ty)?; + self.newline()?; + self.write(ty.end_str())?; + // self.trailing_comma()?; + } + (Token::Comment { ty: _, own_line: _ }, Token::Comment { ty, own_line: _ }) => { + // Force own_line to be true + self.newline()?; + self.write(ty.start_str())?; + self.drain_comment(ty)?; + } + (Token::Comment { ty: _, own_line: _ }, Token::Value { ty, first_char }) => { + self.newline()?; + self.drain_value(ty, *first_char)?; + + if self.is_awaiting_key()? { + self.write(":")?; + } + self.toggle_awaiting_key()?; + } + ( + Token::Value { + ty: _, + first_char: _, + }, + Token::CollectionStart { ty }, + ) => { + if self.is_awaiting_key()? { + self.comma()?; + self.newline()?; + } else { + self.extra_spacing()?; + } + + self.write(ty.start_str())?; + self.toggle_awaiting_key()?; + self.state_stack.push_back(ty.as_state()); + } + ( + Token::Value { + ty: _, + first_char: _, + }, + Token::CollectionEnd { ty }, + ) => { + self.trailing_comma()?; + self.exit_collection(ty)?; + self.newline()?; + self.write(ty.end_str())?; + } + ( + Token::Value { + ty: _, + first_char: _, + }, + Token::Comment { ty, own_line }, + ) => { + if self.is_awaiting_key()? { + self.comma()?; + } + + if *own_line { + self.newline()?; + } else { + self.extra_spacing()?; + } + self.write(ty.start_str())?; + self.drain_comment(ty)?; + } + ( + Token::Value { + ty: _, + first_char: _, + }, + Token::Value { ty, first_char }, + ) => { + if self.is_awaiting_key()? { + self.comma()?; + self.newline()?; + } else { + // The previous value was an object key, so put a space after the `:` + self.extra_spacing()?; + } + self.drain_value(ty, *first_char)?; + if self.is_awaiting_key()? 
{ + self.write(":")?; + } + self.toggle_awaiting_key()?; + } + + (Token::Root, Token::Eof) if self.state_stack.is_empty() => { + // We read the whole file successfully! + return Ok(()); + } + + (a, b) => { + panic!("Invalid state transition: {a:?} => {b:?}") + } + } + + if (matches!(next_token, Token::CollectionEnd { .. }) || next_token == Token::Root) + && self.state_stack.is_empty() + { + self.record_separator()?; + next_token = Token::Root; + } + + self.current_token = next_token; + } + } + + /// Search for a token while in [`ParserMode::Normal`] + fn get_next_token(&mut self) -> Result { + let ret = loop { + let chr = self.next_char(); + + if Err(true) == chr.as_ref().map_err(Error::is_eof) { + // TODO: If our nested depth is 0, this is just a Root token?? + break Ok(Token::Eof); + } + let chr = chr?; + + // eprintln!("Got next char: {:?}", chr as char); + + break Ok(match chr { + C_CR | C_LF => { + self.num_empty_lines = self.num_empty_lines.saturating_add(1); + continue; + } + C_TAB | C_SPACE => continue, + // C_COMMA => Token::Comma, + C_COLON => continue, + // TODO: Allow unquoted strings? + C_QUOTE => Token::Value { + ty: ValueType::String, + first_char: b'"', + }, + // C_BACKSLASH => {} + C_LEFT_BRACE => Token::CollectionStart { + ty: CollectionType::Object, + }, + C_LEFT_BRACKET => Token::CollectionStart { + ty: CollectionType::Array, + }, + C_RIGHT_BRACE => Token::CollectionEnd { + ty: CollectionType::Object, + }, + C_RIGHT_BRACKET => Token::CollectionEnd { + ty: CollectionType::Array, + }, + C_SLASH => { + // We can't send comment tokens if using json + let maybe_next_token_ty = match self.next_char()? 
{ + C_SLASH => CommentType::Line, + + C_STAR => CommentType::Block, + + c => { + eprintln!("{:#?}", self); + eprintln!("X {:?}", (c as char)); + break Err(Error::UnexpectedChar(c as char)); + } + }; + + if self.mode.keep_comments() { + Token::Comment { + ty: maybe_next_token_ty, + own_line: self.num_empty_lines > 0, + } + } else { + // We need to drain this comment by reading the buffer + // This function won't write anything in json modes + self.drain_comment(&maybe_next_token_ty)?; + + self.num_empty_lines = 0; + continue; + } + } + C_COMMA => continue, + + c @ b't' | c @ b'f' => Token::Value { + ty: ValueType::Boolean, + first_char: c, + }, + c @ b'n' => Token::Value { + ty: ValueType::Null, + first_char: c, + }, + // Signs and digits are matched by pattern; a guard on this arm would apply to every `|` alternative and reject `+`/`-` + c @ (C_PLUS | C_MINUS | b'0'..=b'9') => Token::Value { + ty: ValueType::Number, + first_char: c, + }, + + c => { + eprintln!("Unexpected char?? {self:#?}"); + break Err(Error::UnexpectedChar(c as char)); + } + }); + }; + + self.num_empty_lines = 0; + ret + } + + /// Check the next char without consuming it + fn peek_next_char(&mut self) -> Result { + self.input + .fill_buf()? + .first() + .ok_or(Error::BufferEmpty) + .copied() + } + + /// Consume the next character from the reader + fn next_char(&mut self) -> Result { + let mut chr = [0_u8]; + self.input.read_exact(&mut chr)?; + Ok(chr[0]) + } + + /// Returns `true` if we are in an object and the next value is actually an object key + fn is_awaiting_key(&self) -> Result { + Ok( + match self.state_stack.back().ok_or(Error::UnexpectedValue)? { + CollectionState::Object { awaiting_key } => *awaiting_key, + CollectionState::Array => false, + }, + ) + } + + /// Toggles the `awaiting_key` value. Called after reading a value + /// + /// Has no effect if the current collection is an array, so this is safe to call after reading any value or CollectionEnd token + fn toggle_awaiting_key(&mut self) -> Result<()> { + match self.state_stack.back_mut().ok_or(Error::UnexpectedValue)?
{ + CollectionState::Object { awaiting_key } => *awaiting_key = !*awaiting_key, + CollectionState::Array => {} + } + + Ok(()) + } +} + +/// Gets the position in a buf just past the `*/` that ends a block comment +/// ```text +/// /* abc */ def +/// ^ +/// ``` +fn block_comment_end(buf: &[u8]) -> BlockCommentEnd { + for star_idx in memchr::memchr_iter(C_STAR, buf) { + match buf.get(star_idx + 1) { + Some(&C_SLASH) => { + // We found `*/` starting at `star_idx`, so the comment ends just past the `/` + return BlockCommentEnd::Position(star_idx + 2); + } + Some(_) => {} + None => { + // We found `*` at the end of the buffer + return BlockCommentEnd::MaybeEnd; + } + } + } + BlockCommentEnd::None +} + +/// Gets the position in a buf of the unescaped `"` that ends the string +/// ```text +/// xyzabc": 123, +/// ^ +/// ``` +/// Note that the `xyzabc` is part of a string, but the start of the string must have come from a previous buffer +fn string_end(buf: &[u8]) -> StringEnd { + let mut n = 0; + + loop { + match memchr::memchr2(C_QUOTE, C_BACKSLASH, &buf[n..]) + .and_then(|idx| Some((idx, buf.get(idx + n)?))) + { + Some((idx, &C_QUOTE)) => { + n += idx; + return StringEnd::Position(n); + } + Some((idx, &C_BACKSLASH)) => { + n += idx; + + // We found a `\`; check whether it is the last byte of the buf + if buf.len() == n + 1 { + // The `\` is the last byte of `buf`, so the byte it escapes is not in this buf + return StringEnd::MaybeEnd; + } else { + // Neither the `\` nor the byte it escapes can end the string, so skip both + n += 2; + } + } + Some((idx, chr)) => { + eprintln!("Buf: {:?}", String::from_utf8(buf.to_vec())); + panic!( + "memchr2 returned unexpected result ({} @ {})", + *chr as char, + idx + n + ); + } + None => { + // There are no unescaped `"` in the rest of the buf, so we know the rest of the buf is just part of the string + return StringEnd::None; + } + } + } +} + +/// Gets the position in a buf of the first `C_CR` or `C_LF`, which is where a line comment ends +fn line_comment_end(buf: &[u8]) -> Option { + memchr::memchr2(C_CR, C_LF, buf) +} + +/// A case that a buf ends in a block comment ending `*/` +enum BlockCommentEnd { + /// The block comment ended at this position
Position(usize), + /// The buffer did not have any `*/`, but it ended in a `*` + MaybeEnd, + /// The block comment does not end in this buf + None, +} + +/// A case that a buf ends in a string ending `"` that was not escaped by `\` +#[derive(PartialEq, Eq, Debug)] +enum StringEnd { + /// The string ended at this position + Position(usize), + /// The buffer did not have any unescaped `"`, but it ended in a `\` that may escape the first byte of the next buf + MaybeEnd, + /// The string does not end in this buf + None, +} + +#[cfg(test)] +mod tests { + use std::io::{BufReader, BufWriter}; + + use super::*; + + fn format_to_string(input: &[u8], mode: Mode) -> String { + let mut output = vec![]; + Parser::new( + mode, + BufReader::new(input), + &mut BufWriter::new(&mut output), + ) + .format_buf() + .unwrap(); + String::from_utf8(output).unwrap() + } + + #[test] + fn test_string_end() { + assert_eq!(string_end(br#"ABC"#), StringEnd::None); + assert_eq!(string_end(br#"ABC\"#), StringEnd::MaybeEnd); + assert_eq!(string_end(br#"ABC""#), StringEnd::Position(3)); + } + + #[test] + fn test_formatting() { + let x = r#"[] +{} +[] +{ + "a": "b" +} +{"a":"b"} +{ + "a": "b", +} +{"a":"b",} +[] + "#; + + eprintln!("{}", format_to_string(x.as_bytes(), Mode::Jsoncc)); + assert_eq!( + format_to_string(x.as_bytes(), Mode::Jsoncc), + r#"[] +{} +[] +{ + "a": "b", +} +{ + "a": "b", +} +{ + "a": "b", +} +{ + "a": "b", +} +[] +"# + ); + + assert_eq!( + format_to_string(x.as_bytes(), Mode::Json), + r#"[] +{} +[] +{ + "a": "b" +} +{ + "a": "b" +} +{ + "a": "b" +} +{ + "a": "b" +} +[] +"# + ); + + assert_eq!( + format_to_string(x.as_bytes(), Mode::CompactJson), + r#"[] +{} +[] +{"a":"b"} +{"a":"b"} +{"a":"b"} +{"a":"b"} +[] +"# + ); + } + + #[test] + fn test_formatting_comments() { + let x = r#"[] +{ + /*1*/ +} +[ + /*2*/ +] +{ + // + "a": "b", +} +{ + // + "a": "b", +} +{ + /*1*/ + "a": "b", /*2*/ + /*3*/ + "c":"d", + /*4*/ "e":"f"/*5*/, /*6*/ +} +{/*w*/ + /*x*/ + "a"/*y*/:/*z*/"b",/*a*/ +} +[]"#; + + eprintln!("{}",
format_to_string(x.as_bytes(), Mode::Json)); + + assert_eq!( + format_to_string(x.as_bytes(), Mode::Jsoncc), + r#"[] +{ + /*1*/ +} +[ + /*2*/ +] +{ + // + "a": "b", +} +{ + // + "a": "b", +} +{ + /*1*/ + "a": "b", /*2*/ + /*3*/ + "c": "d", + /*4*/ + "e": "f", /*5*/ + /*6*/ +} +{ + /*w*/ + /*x*/ + "a": /*y*/ + /*z*/ + "b", /*a*/ +} +[] +"# + ); + + assert_eq!( + format_to_string(x.as_bytes(), Mode::CompactJson), + r#"[] +{} +[] +{"a":"b"} +{"a":"b"} +{"a":"b","c":"d","e":"f"} +{"a":"b"} +[] +"# + ); + assert_eq!( + format_to_string(x.as_bytes(), Mode::Json), + r#"[] +{} +[] +{ + "a": "b" +} +{ + "a": "b" +} +{ + "a": "b", + "c": "d", + "e": "f" +} +{ + "a": "b" +} +[] +"# + ); + } + + // static G: AtomicUsize = AtomicUsize::new(0); + + // fn fork(i: &str) -> Vec { + // let a = i.replacen( + // "_", + // &format!("/*{}*/", G.fetch_add(1, Ordering::Relaxed)), + // 1, + // ); + // let b = i.replacen("_", "", 1); + + // if a.contains("_") { + // let mut ret = fork(&a); + // ret.append(&mut fork(&b)); + // ret + // } else { + // vec![a, b] + // } + // } +}