1155 lines
34 KiB
Rust
1155 lines
34 KiB
Rust
// use anyhow::Result;
|
|
use std::collections::VecDeque;
|
|
use std::io::BufRead;
|
|
use std::io::ErrorKind;
|
|
use std::io::Write;
|
|
|
|
use derivative::Derivative;
|
|
|
|
use crate::indentor::Indentor;
|
|
|
|
/// Indentation unit handed to [`Indentor::new`]
const INDENT: &[u8] = b" ";
/// Bytes written after every completed root-level value
const RECORD_SEPARATOR: &[u8] = b"\n";
/// Line break written in front of the indentation of a new output line
const NEWLINE: &[u8] = b"\n";

// Whitespace bytes
const C_CR: u8 = b'\r';
const C_LF: u8 = b'\n';
const C_TAB: u8 = b'\t';
const C_SPACE: u8 = b' ';

// Separators and string syntax
const C_COMMA: u8 = b',';
const C_COLON: u8 = b':';
const C_QUOTE: u8 = b'"';
const C_BACKSLASH: u8 = b'\\';

// Collection delimiters
const C_LEFT_BRACE: u8 = b'{';
const C_LEFT_BRACKET: u8 = b'[';
const C_RIGHT_BRACE: u8 = b'}';
const C_RIGHT_BRACKET: u8 = b']';

// Comment syntax
const C_SLASH: u8 = b'/';
const C_STAR: u8 = b'*';

// Number syntax
const C_PLUS: u8 = b'+';
const C_DOT: u8 = b'.';
const C_MINUS: u8 = b'-';
/// Exponent marker of a number (`1e5`); this must not be the same byte as [`C_MINUS`]
const C_E: u8 = b'e';
|
|
|
|
/// Output mode of the parser
#[derive(Debug, PartialEq, Eq)]
pub enum Mode {
    /// Add trailing commas, and do not strip comments
    Jsoncc,
    /// Strip comments, and add whitespace and newlines
    Json,
    /// Strip comments, and strip all optional whitespace
    CompactJson,
}

impl Mode {
    /// Whether comments read from the input are carried over to the output
    fn keep_comments(&self) -> bool {
        match self {
            Self::Json | Self::CompactJson => false,
            Self::Jsoncc => true,
        }
    }
}

impl Default for Mode {
    /// The default mode is [`Mode::Jsoncc`]
    fn default() -> Self {
        Mode::Jsoncc
    }
}
|
|
|
|
/// Result alias whose error type is this module's [`Error`]
pub type Result<T> = std::result::Result<T, Error>;
|
|
|
|
#[derive(Debug, thiserror::Error)]
|
|
pub enum Error {
|
|
/// The input buffer is empty, but we need a token
|
|
#[error("Buffer unexpectedly empty")]
|
|
BufferEmpty,
|
|
/// Bytes that look like a value (true, false, null, a number, or a string) was found in the wrong position
|
|
#[error("Unexpected value type")]
|
|
UnexpectedValue,
|
|
/// A byte was found in an unexpected position
|
|
#[error("Unexpected char {0:?}")]
|
|
UnexpectedChar(char),
|
|
/// A collection end token was found in an unexpected position
|
|
#[error("Unexpected collection ending")]
|
|
UnexpectedCollectionEnd,
|
|
/// An IO error occured when reading or writing
|
|
#[error("IO Error: {0}")]
|
|
Io(#[from] std::io::Error),
|
|
}
|
|
|
|
impl Error {
|
|
pub fn is_eof(&self) -> bool {
|
|
matches!(self, Self::Io(e) if e.kind() == ErrorKind::UnexpectedEof)
|
|
}
|
|
}
|
|
|
|
/// A token found in the input stream
|
|
///
|
|
/// This does not track `:` or `,` for two reasons:
|
|
///
|
|
/// 1. All input is jsoncc, which has optional `,`. `,` provides no extra information as the next token would need to be checked to decide if the current value is the last value
|
|
/// 1. `:` state is derived by the [`CollectionState::Object`] `awaiting_key` field
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
pub enum Token {
|
|
/// We have reached an EOF at a position that is not in a value
|
|
Eof,
|
|
/// The root of the input
|
|
///
|
|
/// Note that there can be multiple root tokens. For example, for input `{}{}`, root tokens are sent at these positions: `^{}^{}`
|
|
Root,
|
|
/// The start of an object or array
|
|
CollectionStart { ty: CollectionType },
|
|
/// The end of an object or array
|
|
CollectionEnd { ty: CollectionType },
|
|
/// A block or line comment
|
|
Comment {
|
|
ty: CommentType,
|
|
/// Should this comment be on its own line?
|
|
///
|
|
/// This is derived from the input.
|
|
/// If the comment is read on a line with only whitespace tokens, this is set to true
|
|
own_line: bool,
|
|
},
|
|
/// A value that is not a collection
|
|
Value { ty: ValueType, first_char: u8 },
|
|
}
|
|
|
|
/// The two kinds of collection a document can contain
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CollectionType {
    Object,
    Array,
}

/// Per-collection bookkeeping kept on the parser's state stack
#[derive(Debug, Clone, Copy)]
enum CollectionState {
    /// Inside an object; `awaiting_key` is `true` while the next value is an object key
    Object { awaiting_key: bool },
    /// Inside an array
    Array,
}

impl CollectionState {
    /// The kind of collection this state belongs to
    fn ty(&self) -> CollectionType {
        if matches!(self, Self::Array) {
            CollectionType::Array
        } else {
            CollectionType::Object
        }
    }
}

impl CollectionType {
    /// Initial state for a freshly opened collection of this kind (objects start out awaiting a key)
    fn as_state(&self) -> CollectionState {
        match self {
            Self::Array => CollectionState::Array,
            Self::Object => CollectionState::Object { awaiting_key: true },
        }
    }

    /// The opening delimiter of this kind of collection
    fn start_str(&self) -> &'static str {
        match self {
            Self::Array => "[",
            Self::Object => "{",
        }
    }

    /// The closing delimiter of this kind of collection
    fn end_str(&self) -> &'static str {
        match self {
            Self::Array => "]",
            Self::Object => "}",
        }
    }
}
|
|
|
|
/// The two comment syntaxes recognised in the input
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum CommentType {
    /// `// ...` up to the end of the line
    Line,
    /// `/* ... */`
    Block,
}

impl CommentType {
    /// The two bytes that open this kind of comment
    fn start_str(&self) -> &'static str {
        match self {
            Self::Block => "/*",
            Self::Line => "//",
        }
    }
}
|
|
|
|
/// The kinds of non-collection value the tokenizer distinguishes
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum ValueType {
    // TODO: Do we want to allow the possibility of unquoted object keys?
    // Unquoted values would be a bad idea. For example, there would be ambiguity for {x: true} (is it {"x": "true"} or {"x": true})
    // You could force `true`/`false`/`null`/numbers to be non-strings, but then you end up with the yaml `yes`/`no` problem
    // Also, if someone types a number like `-1.4e4.`, we don't want that converted to a string, we should keep it as an (invalid) number
    // UnquotedString,
    /// A double-quoted string
    String,
    /// A number
    Number,
    /// `true` or `false`
    Boolean,
    /// `null`
    Null,
}
|
|
|
|
#[derive(Derivative)]
|
|
#[derivative(Debug)]
|
|
pub struct Parser<R, W>
|
|
where
|
|
R: BufRead,
|
|
W: Write,
|
|
{
|
|
/// Input reader
|
|
#[derivative(Debug = "ignore")]
|
|
input: R,
|
|
/// Output writer
|
|
#[derivative(Debug = "ignore")]
|
|
write: W,
|
|
/// Stack tracking the state of the parser
|
|
///
|
|
/// When descending into an array or object, push a [`CollectionState`] here
|
|
state_stack: VecDeque<CollectionState>,
|
|
/// The current token the parser has received
|
|
current_token: Token,
|
|
// TODO: This can be used to add whitespace (if [`num_empty_lines`] > 1)
|
|
/// The number of empty lines read from [`input`] in a row
|
|
///
|
|
/// Empty lines are lines that only contain whitespace
|
|
num_empty_lines: u8,
|
|
/// The mode of operation of the parser
|
|
mode: Mode,
|
|
/// Buffered indent strings so repeated calls do not have to repeated call [`std::io::repeat`]
|
|
indentor: Indentor,
|
|
}
|
|
|
|
impl<R, W> Parser<R, W>
|
|
where
|
|
R: BufRead,
|
|
W: Write,
|
|
{
|
|
pub fn new(mode: Mode, read: R, write: W) -> Self {
|
|
Self {
|
|
input: read,
|
|
write,
|
|
state_stack: VecDeque::new(),
|
|
current_token: Token::Root,
|
|
num_empty_lines: 0,
|
|
mode,
|
|
indentor: Indentor::new(INDENT),
|
|
}
|
|
}
|
|
|
|
/// Send the rest of the input to the writer until the end of the comment is reached
|
|
fn drain_comment(&mut self, ty: &CommentType) -> Result<()> {
|
|
let mut maybe_block_end = false;
|
|
|
|
loop {
|
|
let buf = self.input.fill_buf()?;
|
|
|
|
if buf.is_empty() {
|
|
return Err(Error::BufferEmpty);
|
|
}
|
|
|
|
match ty {
|
|
CommentType::Line => match line_comment_end(buf) {
|
|
Some(idx) => {
|
|
if self.mode.keep_comments() {
|
|
self.write.write_all(&buf[0..idx])?;
|
|
}
|
|
|
|
self.input.consume(idx);
|
|
break;
|
|
}
|
|
None => {
|
|
if self.mode.keep_comments() {
|
|
self.write.write_all(buf)?;
|
|
}
|
|
let len = buf.len();
|
|
self.input.consume(len);
|
|
}
|
|
},
|
|
CommentType::Block => {
|
|
if maybe_block_end && buf[0] == b'/' {
|
|
// We ended the block comment
|
|
if self.mode.keep_comments() {
|
|
self.write.write_all(b"/")?;
|
|
}
|
|
break;
|
|
}
|
|
|
|
maybe_block_end = false;
|
|
|
|
match block_comment_end(buf) {
|
|
BlockCommentEnd::Position(idx) => {
|
|
if self.mode.keep_comments() {
|
|
self.write.write_all(&buf[0..idx])?;
|
|
}
|
|
|
|
self.input.consume(idx);
|
|
break;
|
|
}
|
|
BlockCommentEnd::MaybeEnd => {
|
|
if self.mode.keep_comments() {
|
|
self.write.write_all(buf)?;
|
|
}
|
|
let len = buf.len();
|
|
self.input.consume(len);
|
|
maybe_block_end = true;
|
|
}
|
|
BlockCommentEnd::None => {
|
|
if self.mode.keep_comments() {
|
|
self.write.write_all(buf)?;
|
|
}
|
|
let len = buf.len();
|
|
self.input.consume(len);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Send the rest of the input to the writer until the end of the value is reached
|
|
fn drain_value(&mut self, ty: &ValueType, first_char: u8) -> Result<()> {
|
|
match (ty, first_char) {
|
|
(ValueType::String, C_QUOTE) => {
|
|
let mut next_char_escaped = false;
|
|
self.write([C_QUOTE])?;
|
|
|
|
// Loop until we are done with the string
|
|
loop {
|
|
if next_char_escaped {
|
|
// The previous buffer ended in `\`
|
|
// Send this character out
|
|
let next_char = self.next_char()?;
|
|
self.write([next_char])?;
|
|
}
|
|
next_char_escaped = false;
|
|
|
|
let buf = self.input.fill_buf()?;
|
|
|
|
if buf.is_empty() {
|
|
return Err(Error::BufferEmpty);
|
|
}
|
|
|
|
match string_end(buf) {
|
|
StringEnd::Position(idx) => {
|
|
self.write.write_all(&buf[0..idx])?;
|
|
self.input.consume(idx);
|
|
break;
|
|
}
|
|
StringEnd::MaybeEnd => {
|
|
self.write.write_all(buf)?;
|
|
let len = buf.len();
|
|
self.input.consume(len);
|
|
next_char_escaped = true;
|
|
}
|
|
StringEnd::None => {
|
|
self.write.write_all(buf)?;
|
|
let len = buf.len();
|
|
self.input.consume(len);
|
|
}
|
|
}
|
|
}
|
|
|
|
let next_char = self.next_char()?;
|
|
self.write([next_char])?;
|
|
|
|
Ok(())
|
|
}
|
|
(ValueType::Number, mut c) => {
|
|
loop {
|
|
self.write([c])?;
|
|
c = self.peek_next_char()?;
|
|
// Any of the json numerical characters
|
|
if c == C_PLUS
|
|
|| c == C_MINUS
|
|
|| c == C_DOT
|
|
|| (c as char).is_ascii_digit()
|
|
|| c == C_E
|
|
{
|
|
self.next_char()?;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|
|
(ValueType::Boolean, b't') => {
|
|
let mut chr = [0_u8; 3];
|
|
self.input.read_exact(&mut chr)?;
|
|
|
|
if chr == *b"rue" {
|
|
self.write("true")?;
|
|
Ok(())
|
|
} else {
|
|
Err(Error::UnexpectedValue)
|
|
}
|
|
}
|
|
(ValueType::Boolean, b'f') => {
|
|
let mut chr = [0_u8; 4];
|
|
self.input.read_exact(&mut chr)?;
|
|
|
|
if chr == *b"alse" {
|
|
self.write("false")?;
|
|
Ok(())
|
|
} else {
|
|
Err(Error::UnexpectedValue)
|
|
}
|
|
}
|
|
(ValueType::Null, b'n') => {
|
|
let mut chr = [0_u8; 3];
|
|
self.input.read_exact(&mut chr)?;
|
|
|
|
if chr == *b"ull" {
|
|
self.write("null")?;
|
|
Ok(())
|
|
} else {
|
|
Err(Error::UnexpectedValue)
|
|
}
|
|
}
|
|
_ => {
|
|
eprintln!("Value type: {ty:?}, with first char {first_char:?}");
|
|
Err(Error::UnexpectedValue)
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Write some bytes to the writer
|
|
fn write(&mut self, buf: impl AsRef<[u8]>) -> Result<()> {
|
|
// eprintln!("### Writing {:?}", String::from_utf8_lossy(buf.as_ref()));
|
|
self.write.write_all(buf.as_ref())?;
|
|
Ok(())
|
|
}
|
|
|
|
/// Write the record separator to the writer
|
|
fn record_separator(&mut self) -> Result<()> {
|
|
self.write(RECORD_SEPARATOR)?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Add extra padding after `:` or before the `//`/`/*` in a comment, if the format requests it
|
|
fn extra_spacing(&mut self) -> Result<()> {
|
|
match self.mode {
|
|
Mode::Jsoncc | Mode::Json => self.write(" ")?,
|
|
Mode::CompactJson => {}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Add a comma only if we are not at the root level
|
|
fn comma(&mut self) -> Result<()> {
|
|
// We don't want a comma if this is a root element
|
|
if self.state_stack.is_empty() {
|
|
return Ok(());
|
|
}
|
|
|
|
self.write(",")?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Add a trailing comma only if we are not at the root level and we are in [`Mode::Jsoncc`]
|
|
fn trailing_comma(&mut self) -> Result<()> {
|
|
match self.mode {
|
|
Mode::Jsoncc => self.comma()?,
|
|
Mode::Json | Mode::CompactJson => {}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Write a newline and add indentation
|
|
fn newline(&mut self) -> Result<()> {
|
|
match self.mode {
|
|
Mode::Jsoncc | Mode::Json => {
|
|
self.write(NEWLINE)?;
|
|
self.write
|
|
.write_all(self.indentor.get_indent(self.state_stack.len()))?;
|
|
}
|
|
Mode::CompactJson => {}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Leave a collection
|
|
///
|
|
/// Call this after you see a `]` or `}` token, and you want the parser to ensure that we were in the right kind of collection before leaving it
|
|
fn exit_collection(&mut self, ty: &CollectionType) -> Result<()> {
|
|
if Some(*ty)
|
|
!= self
|
|
.state_stack
|
|
.pop_back()
|
|
.as_ref()
|
|
.map(CollectionState::ty)
|
|
{
|
|
return Err(Error::UnexpectedCollectionEnd);
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Format the reader into the writer and consume the [`Parser`] by reading tokens and sending formatted output
|
|
///
|
|
/// Generally, the writer state ends with each token written with the ending `:` as required by the next token
|
|
/// A `,` is decided if the `current_token` is a value and the next token is something that warrants a `,` (either another value, a collection, or a collection end in jsoncc mode)
|
|
///
|
|
/// For example:
|
|
/// ```text
|
|
/// ["a", "b"]
|
|
/// ^
|
|
/// ```
|
|
///
|
|
/// At this position, `self::current_token` is a `Value` (representing `"a"`) and `next_token` represents `"b"`, so we know a `,` has been written
|
|
/// In Jsoncc/Json mode, write a newline, indent, and flush the `"b"` Value
|
|
pub fn format_buf(mut self) -> Result<()> {
|
|
loop {
|
|
// eprintln!("========================================================");
|
|
// eprintln!("{:?}", self);
|
|
|
|
let mut next_token = self.get_next_token()?;
|
|
|
|
// eprintln!("{:#?}\n{:#?}", self.current_token, next_token);
|
|
// eprintln!();
|
|
|
|
match (self.current_token, &next_token) {
|
|
(Token::Root, Token::CollectionStart { ty }) => {
|
|
self.state_stack.push_back(ty.as_state());
|
|
self.write(ty.start_str())?;
|
|
}
|
|
(Token::Root, Token::CollectionEnd { ty }) => {
|
|
self.exit_collection(ty)?;
|
|
self.write(ty.end_str())?;
|
|
self.write(ty.end_str())?;
|
|
}
|
|
(Token::Root, Token::Comment { ty, own_line: _ }) => {
|
|
self.write(ty.start_str())?;
|
|
self.drain_comment(ty)?;
|
|
}
|
|
(Token::Root, Token::Value { ty, first_char }) => {
|
|
self.drain_value(ty, *first_char)?;
|
|
next_token = Token::Root;
|
|
}
|
|
(Token::CollectionStart { ty: _ }, Token::CollectionStart { ty }) => {
|
|
self.newline()?;
|
|
self.write(ty.start_str())?;
|
|
self.state_stack.push_back(ty.as_state());
|
|
}
|
|
(Token::CollectionStart { ty: _ }, Token::CollectionEnd { ty }) => {
|
|
// `{}` or `[]`
|
|
self.exit_collection(ty)?;
|
|
self.write(ty.end_str())?;
|
|
}
|
|
(Token::CollectionStart { ty: _ }, Token::Comment { ty, own_line: _ }) => {
|
|
// Force own_line to be true
|
|
self.newline()?;
|
|
self.write(ty.start_str())?;
|
|
self.drain_comment(ty)?;
|
|
}
|
|
(Token::CollectionStart { ty: _ }, Token::Value { ty, first_char }) => {
|
|
self.newline()?;
|
|
self.drain_value(ty, *first_char)?;
|
|
if self.is_awaiting_key()? {
|
|
self.write(":")?;
|
|
}
|
|
self.toggle_awaiting_key()?;
|
|
}
|
|
(Token::CollectionEnd { ty: _ }, Token::CollectionStart { ty }) => {
|
|
self.comma()?;
|
|
self.newline()?;
|
|
self.write(ty.start_str())?;
|
|
self.state_stack.push_back(ty.as_state());
|
|
}
|
|
(Token::CollectionEnd { ty: _ }, Token::CollectionEnd { ty }) => {
|
|
self.trailing_comma()?;
|
|
self.exit_collection(ty)?;
|
|
self.newline()?;
|
|
self.write(ty.end_str())?;
|
|
}
|
|
(Token::CollectionEnd { ty: _ }, Token::Comment { ty, own_line: _ }) => {
|
|
// Force own_line to be true
|
|
self.trailing_comma()?;
|
|
self.newline()?;
|
|
self.write(ty.start_str())?;
|
|
self.drain_comment(ty)?;
|
|
}
|
|
(Token::CollectionEnd { ty: _ }, Token::Value { ty, first_char }) => {
|
|
self.comma()?;
|
|
self.newline()?;
|
|
self.drain_value(ty, *first_char)?;
|
|
if self.is_awaiting_key()? {
|
|
self.write(":")?;
|
|
}
|
|
self.toggle_awaiting_key()?;
|
|
}
|
|
(Token::Comment { ty: _, own_line: _ }, Token::CollectionStart { ty }) => {
|
|
self.newline()?;
|
|
self.write(ty.start_str())?;
|
|
self.state_stack.push_back(ty.as_state());
|
|
}
|
|
(Token::Comment { ty: _, own_line: _ }, Token::CollectionEnd { ty }) => {
|
|
self.exit_collection(ty)?;
|
|
self.newline()?;
|
|
self.write(ty.end_str())?;
|
|
// self.trailing_comma()?;
|
|
}
|
|
(Token::Comment { ty: _, own_line: _ }, Token::Comment { ty, own_line: _ }) => {
|
|
// Force own_line to be true
|
|
self.newline()?;
|
|
self.write(ty.start_str())?;
|
|
self.drain_comment(ty)?;
|
|
}
|
|
(Token::Comment { ty: _, own_line: _ }, Token::Value { ty, first_char }) => {
|
|
self.newline()?;
|
|
self.drain_value(ty, *first_char)?;
|
|
|
|
if self.is_awaiting_key()? {
|
|
self.write(":")?;
|
|
}
|
|
self.toggle_awaiting_key()?;
|
|
}
|
|
(
|
|
Token::Value {
|
|
ty: _,
|
|
first_char: _,
|
|
},
|
|
Token::CollectionStart { ty },
|
|
) => {
|
|
if self.is_awaiting_key()? {
|
|
self.comma()?;
|
|
self.newline()?;
|
|
} else {
|
|
self.extra_spacing()?;
|
|
}
|
|
|
|
self.write(ty.start_str())?;
|
|
self.toggle_awaiting_key()?;
|
|
self.state_stack.push_back(ty.as_state());
|
|
}
|
|
(
|
|
Token::Value {
|
|
ty: _,
|
|
first_char: _,
|
|
},
|
|
Token::CollectionEnd { ty },
|
|
) => {
|
|
self.trailing_comma()?;
|
|
self.exit_collection(ty)?;
|
|
self.newline()?;
|
|
self.write(ty.end_str())?;
|
|
}
|
|
(
|
|
Token::Value {
|
|
ty: _,
|
|
first_char: _,
|
|
},
|
|
Token::Comment { ty, own_line },
|
|
) => {
|
|
if self.is_awaiting_key()? {
|
|
self.comma()?;
|
|
}
|
|
|
|
if *own_line {
|
|
self.newline()?;
|
|
} else {
|
|
self.extra_spacing()?;
|
|
}
|
|
self.write(ty.start_str())?;
|
|
self.drain_comment(ty)?;
|
|
}
|
|
(
|
|
Token::Value {
|
|
ty: _,
|
|
first_char: _,
|
|
},
|
|
Token::Value { ty, first_char },
|
|
) => {
|
|
if self.is_awaiting_key()? {
|
|
self.comma()?;
|
|
self.newline()?;
|
|
} else {
|
|
// The previous value was an object key, so put a space after the `:`
|
|
self.extra_spacing()?;
|
|
}
|
|
self.drain_value(ty, *first_char)?;
|
|
if self.is_awaiting_key()? {
|
|
self.write(":")?;
|
|
}
|
|
self.toggle_awaiting_key()?;
|
|
}
|
|
|
|
(Token::Root, Token::Eof) if self.state_stack.is_empty() => {
|
|
// We read the whole file successfully!
|
|
return Ok(());
|
|
}
|
|
|
|
(a, b) => {
|
|
panic!("Invalid state transition: {a:?} => {b:?}")
|
|
}
|
|
}
|
|
|
|
if (matches!(next_token, Token::CollectionEnd { .. }) || next_token == Token::Root)
|
|
&& self.state_stack.is_empty()
|
|
{
|
|
self.record_separator()?;
|
|
next_token = Token::Root;
|
|
}
|
|
|
|
self.current_token = next_token;
|
|
}
|
|
}
|
|
|
|
/// Search for a token while in [`ParserMode::Normal`]
|
|
fn get_next_token(&mut self) -> Result<Token> {
|
|
let ret = loop {
|
|
let chr = self.next_char();
|
|
|
|
if Err(true) == chr.as_ref().map_err(Error::is_eof) {
|
|
// TODO: If our nested depth is 0, this is just a Root token??
|
|
break Ok(Token::Eof);
|
|
}
|
|
let chr = chr?;
|
|
|
|
// eprintln!("Got next char: {:?}", chr as char);
|
|
|
|
break Ok(match chr {
|
|
C_CR | C_LF => {
|
|
self.num_empty_lines = self.num_empty_lines.saturating_add(1);
|
|
continue;
|
|
}
|
|
C_TAB | C_SPACE => continue,
|
|
// C_COMMA => Token::Comma,
|
|
C_COLON => continue,
|
|
// TODO: Allow unquoted strings?
|
|
C_QUOTE => Token::Value {
|
|
ty: ValueType::String,
|
|
first_char: b'"',
|
|
},
|
|
// C_BACKSLASH => {}
|
|
C_LEFT_BRACE => Token::CollectionStart {
|
|
ty: CollectionType::Object,
|
|
},
|
|
C_LEFT_BRACKET => Token::CollectionStart {
|
|
ty: CollectionType::Array,
|
|
},
|
|
C_RIGHT_BRACE => Token::CollectionEnd {
|
|
ty: CollectionType::Object,
|
|
},
|
|
C_RIGHT_BRACKET => Token::CollectionEnd {
|
|
ty: CollectionType::Array,
|
|
},
|
|
C_SLASH => {
|
|
// We can't send comment tokens if using json
|
|
let maybe_next_token_ty = match self.next_char()? {
|
|
C_SLASH => CommentType::Line,
|
|
|
|
C_STAR => CommentType::Block,
|
|
|
|
c => {
|
|
eprintln!("{:#?}", self);
|
|
eprintln!("X {:?}", (c as char));
|
|
break Err(Error::UnexpectedChar(c as char));
|
|
}
|
|
};
|
|
|
|
if self.mode.keep_comments() {
|
|
Token::Comment {
|
|
ty: maybe_next_token_ty,
|
|
own_line: self.num_empty_lines > 0,
|
|
}
|
|
} else {
|
|
// We need to drain this comment by reading the buffer
|
|
// This function won't write anything in json modes
|
|
self.drain_comment(&maybe_next_token_ty)?;
|
|
|
|
self.num_empty_lines = 0;
|
|
continue;
|
|
}
|
|
}
|
|
C_COMMA => continue,
|
|
|
|
c @ b't' | c @ b'f' => Token::Value {
|
|
ty: ValueType::Boolean,
|
|
first_char: c,
|
|
},
|
|
c @ b'n' => Token::Value {
|
|
ty: ValueType::Null,
|
|
first_char: c,
|
|
},
|
|
|
|
c @ C_PLUS | c @ C_MINUS | c if (c as char).is_ascii_digit() => Token::Value {
|
|
ty: ValueType::Number,
|
|
first_char: c,
|
|
},
|
|
|
|
c => {
|
|
eprintln!("Unexpected char?? {self:#?}");
|
|
break Err(Error::UnexpectedChar(c as char));
|
|
}
|
|
});
|
|
};
|
|
|
|
self.num_empty_lines = 0;
|
|
ret
|
|
}
|
|
|
|
/// Check the next char without consuming it
|
|
fn peek_next_char(&mut self) -> Result<u8> {
|
|
self.input
|
|
.fill_buf()?
|
|
.first()
|
|
.ok_or(Error::BufferEmpty)
|
|
.copied()
|
|
}
|
|
|
|
/// Consume the next character from the reader
|
|
fn next_char(&mut self) -> Result<u8> {
|
|
let mut chr = [0_u8];
|
|
self.input.read_exact(&mut chr)?;
|
|
Ok(chr[0])
|
|
}
|
|
|
|
/// Returns `true` if we are in an object and the next value is actually an object key
|
|
fn is_awaiting_key(&self) -> Result<bool> {
|
|
Ok(
|
|
match self.state_stack.back().ok_or(Error::UnexpectedValue)? {
|
|
CollectionState::Object { awaiting_key } => *awaiting_key,
|
|
CollectionState::Array => false,
|
|
},
|
|
)
|
|
}
|
|
|
|
/// Toggles the `awaiting_key` value. Called after reading a value
|
|
///
|
|
/// Has no affect if the current collection is an array, so this is safe to call after reading any value or CollectionEnd token
|
|
fn toggle_awaiting_key(&mut self) -> Result<()> {
|
|
match self.state_stack.back_mut().ok_or(Error::UnexpectedValue)? {
|
|
CollectionState::Object { awaiting_key } => *awaiting_key = !*awaiting_key,
|
|
CollectionState::Array => {}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
/// Gets the position in a buf that a block comment ends
|
|
/// ```text
|
|
/// /* abc */ def
|
|
/// ^
|
|
/// ```
|
|
fn block_comment_end(buf: &[u8]) -> BlockCommentEnd {
|
|
for star_idx in memchr::memchr_iter(C_STAR, buf) {
|
|
match buf.get(star_idx + 1) {
|
|
Some(&C_SLASH) => {
|
|
// We found `*/` at position `star_idx`
|
|
return BlockCommentEnd::Position(star_idx + 2);
|
|
}
|
|
Some(_) => {}
|
|
None => {
|
|
// We found `*` at the end of the buffer
|
|
return BlockCommentEnd::MaybeEnd;
|
|
}
|
|
}
|
|
}
|
|
BlockCommentEnd::None
|
|
}
|
|
|
|
/// Gets the position in a buf that the string ends
|
|
/// ```text
|
|
/// xyzabc": 123,
|
|
/// ^
|
|
/// ```
|
|
/// Note that the `xyzabc` is part of a string, but the start of the string must have come from a previous buffer
|
|
fn string_end(buf: &[u8]) -> StringEnd {
|
|
let mut n = 0;
|
|
|
|
loop {
|
|
match memchr::memchr2(C_QUOTE, C_BACKSLASH, &buf[n..])
|
|
.and_then(|idx| Some((idx, buf.get(idx + n)?)))
|
|
{
|
|
Some((idx, &C_QUOTE)) => {
|
|
n += idx;
|
|
return StringEnd::Position(n);
|
|
}
|
|
Some((idx, &C_BACKSLASH)) => {
|
|
n += idx;
|
|
|
|
// We found a `\` at the end of the buf
|
|
if buf.len() == n + 1 {
|
|
// The `/` is at the end of `buf`
|
|
return StringEnd::MaybeEnd;
|
|
} else {
|
|
// The end of the string won't be the `\` and the next byte
|
|
n += 2;
|
|
}
|
|
}
|
|
Some((idx, chr)) => {
|
|
eprintln!("Buf: {:?}", String::from_utf8(buf.to_vec()));
|
|
panic!(
|
|
"memchr2 returned unexpected result ({} @ {})",
|
|
*chr as char,
|
|
idx + n
|
|
);
|
|
}
|
|
None => {
|
|
// There are no `"` in the string, so we know the rest of the buf is just part of the string
|
|
return StringEnd::None;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Gets the position in a buf that a line comment ends
|
|
fn line_comment_end(buf: &[u8]) -> Option<usize> {
|
|
memchr::memchr2(C_CR, C_LF, buf)
|
|
}
|
|
|
|
/// Outcome of searching a buffer for the `*/` that closes a block comment
enum BlockCommentEnd {
    /// The block comment ends in this buffer; the position is the index right after the `*/`
    Position(usize),
    /// The buffer does not contain `*/`, but its last byte is a `*`
    MaybeEnd,
    /// The block comment does not end in this buffer
    None,
}
|
|
|
|
/// Outcome of searching a buffer for the unescaped `"` that closes a string
#[derive(PartialEq, Eq, Debug)]
enum StringEnd {
    /// The string ends in this buffer; the position is the index of the closing `"`
    Position(usize),
    /// The buffer does not contain an unescaped `"`, but its last byte is a `\`
    MaybeEnd,
    /// The string does not end in this buffer
    None,
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use std::io::{BufReader, BufWriter};
|
|
|
|
use super::*;
|
|
|
|
fn format_to_string(input: &[u8], mode: Mode) -> String {
|
|
let mut output = vec![];
|
|
Parser::new(
|
|
mode,
|
|
BufReader::new(input),
|
|
&mut BufWriter::new(&mut output),
|
|
)
|
|
.format_buf()
|
|
.unwrap();
|
|
String::from_utf8(output).unwrap()
|
|
}
|
|
|
|
/// `string_end` reports no end, a possible end after a trailing `\`, and the index of a closing `"`
#[test]
fn test_string_end() {
    let cases = [
        (&br#"ABC"#[..], StringEnd::None),
        (&br#"ABC\"#[..], StringEnd::MaybeEnd),
        (&br#"ABC""#[..], StringEnd::Position(3)),
    ];

    for (input, expected) in &cases {
        assert_eq!(&string_end(input), expected);
    }
}
|
|
|
|
// Formats one input made of several root-level collections (empty ones, and the same
// single-member object written with and without whitespace and trailing comma) in all three
// modes, and compares the result with the expected text for that mode.
#[test]
|
|
fn test_formatting() {
|
|
let x = r#"[]
|
|
{}
|
|
[]
|
|
{
|
|
"a": "b"
|
|
}
|
|
{"a":"b"}
|
|
{
|
|
"a": "b",
|
|
}
|
|
{"a":"b",}
|
|
[]
|
|
"#;
|
|
|
|
// Print the Jsoncc output so it shows up in the output of a failing test run
eprintln!("{}", format_to_string(x.as_bytes(), Mode::Jsoncc));
|
|
// Jsoncc: every object is expanded over several lines and its member gets a trailing comma
assert_eq!(
|
|
format_to_string(x.as_bytes(), Mode::Jsoncc),
|
|
r#"[]
|
|
{}
|
|
[]
|
|
{
|
|
"a": "b",
|
|
}
|
|
{
|
|
"a": "b",
|
|
}
|
|
{
|
|
"a": "b",
|
|
}
|
|
{
|
|
"a": "b",
|
|
}
|
|
[]
|
|
"#
|
|
);
|
|
|
|
// Json: same layout as Jsoncc, but without the trailing comma
assert_eq!(
|
|
format_to_string(x.as_bytes(), Mode::Json),
|
|
r#"[]
|
|
{}
|
|
[]
|
|
{
|
|
"a": "b"
|
|
}
|
|
{
|
|
"a": "b"
|
|
}
|
|
{
|
|
"a": "b"
|
|
}
|
|
{
|
|
"a": "b"
|
|
}
|
|
[]
|
|
"#
|
|
);
|
|
|
|
// CompactJson: every root-level collection on a single line without optional whitespace
assert_eq!(
|
|
format_to_string(x.as_bytes(), Mode::CompactJson),
|
|
r#"[]
|
|
{}
|
|
[]
|
|
{"a":"b"}
|
|
{"a":"b"}
|
|
{"a":"b"}
|
|
{"a":"b"}
|
|
[]
|
|
"#
|
|
);
|
|
}
|
|
|
|
// Formats an input that has line and block comments in many positions (inside otherwise empty
// collections, before and after members, between a key and its value) in all three modes.
// The expected Jsoncc text keeps the comments; the expected Json and CompactJson texts contain none.
#[test]
|
|
fn test_formatting_comments() {
|
|
let x = r#"[]
|
|
{
|
|
/*1*/
|
|
}
|
|
[
|
|
/*2*/
|
|
]
|
|
{
|
|
//
|
|
"a": "b",
|
|
}
|
|
{
|
|
//
|
|
"a": "b",
|
|
}
|
|
{
|
|
/*1*/
|
|
"a": "b", /*2*/
|
|
/*3*/
|
|
"c":"d",
|
|
/*4*/ "e":"f"/*5*/, /*6*/
|
|
}
|
|
{/*w*/
|
|
/*x*/
|
|
"a"/*y*/:/*z*/"b",/*a*/
|
|
}
|
|
[]"#;
|
|
|
|
|
|
// Print the Json output so it shows up in the output of a failing test run
eprintln!("{}", format_to_string(x.as_bytes(), Mode::Json));
|
|
|
|
|
|
// Jsoncc: comments are kept
assert_eq!(
|
|
format_to_string(x.as_bytes(), Mode::Jsoncc),
|
|
r#"[]
|
|
{
|
|
/*1*/
|
|
}
|
|
[
|
|
/*2*/
|
|
]
|
|
{
|
|
//
|
|
"a": "b",
|
|
}
|
|
{
|
|
//
|
|
"a": "b",
|
|
}
|
|
{
|
|
/*1*/
|
|
"a": "b", /*2*/
|
|
/*3*/
|
|
"c": "d",
|
|
/*4*/
|
|
"e": "f", /*5*/
|
|
/*6*/
|
|
}
|
|
{
|
|
/*w*/
|
|
/*x*/
|
|
"a": /*y*/
|
|
/*z*/
|
|
"b", /*a*/
|
|
}
|
|
[]
|
|
"#
|
|
);
|
|
|
|
|
|
// CompactJson: comments are stripped, collections that only held comments become empty
assert_eq!(
|
|
format_to_string(x.as_bytes(), Mode::CompactJson),
|
|
r#"[]
|
|
{}
|
|
[]
|
|
{"a":"b"}
|
|
{"a":"b"}
|
|
{"a":"b","c":"d","e":"f"}
|
|
{"a":"b"}
|
|
[]
|
|
"#
|
|
);
|
|
// Json: comments are stripped, members are written one per line
assert_eq!(
|
|
format_to_string(x.as_bytes(), Mode::Json),
|
|
r#"[]
|
|
{}
|
|
[]
|
|
{
|
|
"a": "b"
|
|
}
|
|
{
|
|
"a": "b"
|
|
}
|
|
{
|
|
"a": "b",
|
|
"c": "d",
|
|
"e": "f"
|
|
}
|
|
{
|
|
"a": "b"
|
|
}
|
|
[]
|
|
"#
|
|
);
|
|
}
|
|
|
|
// static G: AtomicUsize = AtomicUsize::new(0);
|
|
|
|
// fn fork(i: &str) -> Vec<String> {
|
|
// let a = i.replacen(
|
|
// "_",
|
|
// &format!("/*{}*/", G.fetch_add(1, Ordering::Relaxed)),
|
|
// 1,
|
|
// );
|
|
// let b = i.replacen("_", "", 1);
|
|
|
|
// if a.contains("_") {
|
|
// let mut ret = fork(&a);
|
|
// ret.append(&mut fork(&b));
|
|
// ret
|
|
// } else {
|
|
// vec![a, b]
|
|
// }
|
|
// }
|
|
}
|