From ba4b935cc0bcf16474b1c66539f7624307b8c780 Mon Sep 17 00:00:00 2001 From: Myles Wirth Date: Wed, 19 Nov 2025 11:55:10 -0500 Subject: [PATCH 01/11] Initial version --- Cargo.lock | 327 ++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 2 + src/lib.rs | 284 ++++++++++++++++++++++++++++++++++++++++++++- src/main.rs | 31 +++++ 4 files changed, 639 insertions(+), 5 deletions(-) create mode 100644 Cargo.lock create mode 100644 src/main.rs diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..b040ed5 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,327 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "ar_archive_writer" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c269894b6fe5e9d7ada0cf69b5bf847ff35bc25fc271f08e1d080fce80339a" +dependencies = [ + "object", +] + +[[package]] +name = "ariadne" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8454c8a44ce2cb9cc7e7fae67fc6128465b343b92c6631e94beca3c8d1524ea5" +dependencies = [ + "unicode-width", + "yansi", +] + +[[package]] +name = "cc" +version = "1.2.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97463e1064cb1b1c1384ad0a0b9c8abd0988e2a91f52606c80ef14aadb63e36" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "chumsky" +version = "1.0.0-alpha.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e82d74e6c83060ec269fe9e0d408d6de4a1645d525f9a0bbbb841ba4efd91ac" +dependencies = [ + "hashbrown", + "regex-automata", + "serde", + "stacker", + "unicode-ident", + "unicode-segmentation", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "find-msvc-tools" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] + +[[package]] +name = "libc" +version = "0.2.177" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" + +[[package]] +name = "maml" +version = "0.0.0" +dependencies = [ + "ariadne", + "chumsky", +] + +[[package]] +name = "memchr" +version = "2.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + +[[package]] +name = "object" +version = "0.32.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +dependencies = [ + "memchr", +] + +[[package]] +name = "proc-macro2" +version = "1.0.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "psm" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d11f2fedc3b7dafdc2851bc52f277377c5473d378859be234bc7ebb593144d01" +dependencies = [ + "ar_archive_writer", + "cc", +] + +[[package]] +name = "quote" +version = "1.0.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex-automata" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "stacker" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1f8b29fb42aafcea4edeeb6b2f2d7ecd0d969c48b4cf0d2e64aafc471dd6e59" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys", +] + +[[package]] +name = "syn" +version = "2.0.110" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" + +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "yansi" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" diff --git a/Cargo.toml b/Cargo.toml index 0cc4829..773f06d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,3 +8,5 @@ license-file = "LICENSE" readme = "README.md" [dependencies] +ariadne = "0.6.0" +chumsky = "1.0.0-alpha.7" diff --git a/src/lib.rs b/src/lib.rs index b93cf3f..6c9f197 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,234 @@ -pub fn add(left: u64, right: u64) -> u64 { - left + right +use ariadne::{Color, Label, Report, ReportKind, Source}; +use chumsky::prelude::*; +use std::collections::HashMap; + +/// MAML AST +#[derive(Debug, Clone, PartialEq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +#[cfg_attr(feature = "serde", serde(untagged))] +pub enum MamlValue { + Null, + Bool(bool), + Int(i64), + Float(f64), + String(String), + Array(Vec), + Object(HashMap), +} + +/// Parser definition +fn parser<'a>() -> impl Parser<'a, &'a str, MamlValue, extra::Err>> { + recursive(|value| { + // Comments + let comment = just('#') + .then(any().and_is(text::newline().not()).repeated()) + .ignored(); + + // Whitespace (not including newlines) + let ws = one_of(" \t").ignored().repeated(); + + // Separator: comma, newline, or comment+newline + // All branches must return () so we use .ignored() + let separator = choice(( + just(',').ignored(), + text::newline().ignored(), + comment.then(text::newline().or_not()).ignored(), + )) + .padded_by(ws); + + // --- Numbers --- + let digits = text::digits(10); + let integer_part = choice(( + just('0').to_slice(), + one_of("123456789").then(digits.or_not()).to_slice(), + )); + + let fraction = just('.').then(digits); + let exponent = one_of("eE").then(one_of("+-").or_not()).then(digits); + + let number = just('-') + .or_not() + .then(integer_part) + .then(fraction.or_not()) + .then(exponent.or_not()) + .to_slice() + .try_map(|s: &str, span| { + if s.contains(['.', 'e', 'E']) { + s.parse::() + .map(MamlValue::Float) + .map_err(|_| Rich::custom(span, "Invalid float")) + } else { + s.parse::() + .map(MamlValue::Int) + .map_err(|_| Rich::custom(span, "Integer overflow (must fit in 64-bit)")) + } + }) + .labelled("number"); + + // --- Strings --- + let escape = just('\\').ignore_then(choice(( + just('\\'), + just('/'), + just('"'), + just('b').to('\x08'), + just('f').to('\x0C'), + just('n').to('\n'), + just('r').to('\r'), + just('t').to('\t'), + just('u').ignore_then( + text::digits(16) + .at_least(1) + .at_most(6) + .to_slice() + .delimited_by(just('{'), just('}')) + .try_map(|digits: &str, span| { + let code = u32::from_str_radix(digits, 16).unwrap(); + char::from_u32(code).ok_or_else(|| { + Rich::custom(span, format!("Invalid unicode codepoint: U+{:X}", code)) + }) + }), + ), + ))); + + let string_content = none_of("\\\"").or(escape).repeated().collect::(); + let simple_string = string_content + .delimited_by(just('"'), just('"')) + .labelled("string"); + + // Raw string: """...""" + let raw_string = just("\"\"\"") + .ignore_then( + any() + .and_is(just("\"\"\"").not()) + .repeated() + .collect::(), + ) + .then_ignore(just("\"\"\"")) + .map(|s: String| { + // Strip leading newline if present + s.strip_prefix('\n') + .or_else(|| s.strip_prefix("\r\n")) + .unwrap_or(&s) + .to_string() + }) + .labelled("raw string"); + + let string_val = choice((raw_string, simple_string)).map(MamlValue::String); + + // --- Keys --- + let identifier = one_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-") + .repeated() + .at_least(1) + .collect::(); + + let key = choice((simple_string.clone(), identifier)).padded_by(ws); + + // --- Array --- + let array = value + .clone() + .separated_by(separator.clone().repeated().at_least(1)) + .allow_trailing() + .collect() + .padded() + .delimited_by(just('['), just(']')) + .map(MamlValue::Array) + .labelled("array") + .recover_with(via_parser(nested_delimiters( + '[', + ']', + [('[', ']'), ('{', '}')], + |_| MamlValue::Array(vec![]), + ))); + + // --- Object --- + let member = key.then_ignore(just(':').padded_by(ws)).then(value.clone()); + + let object = member + .separated_by(separator.repeated().at_least(1)) + .allow_trailing() + .collect() + .padded() + .delimited_by(just('{'), just('}')) + .map(MamlValue::Object) + .labelled("object") + .recover_with(via_parser(nested_delimiters( + '{', + '}', + [('[', ']'), ('{', '}')], + |_| MamlValue::Object(HashMap::new()), + ))); + + // --- Top-level choice --- + choice(( + just("null").to(MamlValue::Null), + just("true").to(MamlValue::Bool(true)), + just("false").to(MamlValue::Bool(false)), + number, + string_val, + array, + object, + )) + .padded() + }) +} + +/// Parse from string (like `serde_json::from_str`) +pub fn from_str(input: &str) -> Result { + let (val, errs) = parser().parse(input).into_output_errors(); + + if !errs.is_empty() { + let mut buffer = Vec::new(); + for e in errs { + Report::build(ReportKind::Error, ("", e.span().into_range())) + .with_message(e.to_string()) + .with_label( + Label::new(("", e.span().into_range())) + .with_message(e.reason().to_string()) + .with_color(Color::Red), + ) + .finish() + .write(("", Source::from(input)), &mut buffer) + .unwrap(); + } + return Err(String::from_utf8_lossy(&buffer).to_string()); + } + + val.ok_or_else(|| "Unexpected parsing failure".to_string()) +} + +/// Parse with detailed error reporting to stderr +pub fn parse_with_report(filename: &str, input: &str) -> Option { + let (val, errs) = parser().parse(input).into_output_errors(); + + if errs.is_empty() { + return val; + } + + for e in errs { + let span = e.span().into_range(); + Report::build(ReportKind::Error, (filename, span.clone())) + .with_message(format!("{}", e)) + .with_label( + Label::new((filename, span.clone())) + .with_message(e.reason().to_string()) + .with_color(Color::Red), + ) + .with_note(format!( + "Error at line {} column {}", + input[..span.start].lines().count(), + input[..span.start] + .lines() + .last() + .map(|l| l.len()) + .unwrap_or(0) + + 1 + )) + .finish() + .eprint((filename, Source::from(input))) + .unwrap(); + } + + None } #[cfg(test)] @@ -7,8 +236,53 @@ mod tests { use super::*; #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); + fn test_basic_values() { + assert!(matches!(from_str("null").unwrap(), MamlValue::Null)); + assert!(matches!(from_str("true").unwrap(), MamlValue::Bool(true))); + assert!(matches!(from_str("42").unwrap(), MamlValue::Int(42))); + assert!(matches!(from_str("3.14").unwrap(), MamlValue::Float(_))); + } + + #[test] + fn test_string() { + let val = from_str(r#""hello world""#).unwrap(); + assert!(matches!(val, MamlValue::String(s) if s == "hello world")); + } + + #[test] + fn test_array() { + let val = from_str("[1, 2, 3]").unwrap(); + if let MamlValue::Array(arr) = val { + assert_eq!(arr.len(), 3); + } else { + panic!("Expected array"); + } + } + + #[test] + fn test_object() { + let val = from_str(r#"{ name: "test", value: 42 }"#).unwrap(); + if let MamlValue::Object(obj) = val { + assert_eq!(obj.len(), 2); + assert!(obj.contains_key("name")); + } else { + panic!("Expected object"); + } + } + + #[test] + fn test_raw_string() { + let val = from_str( + r#"""" +hello +world +""""#, + ) + .unwrap(); + if let MamlValue::String(s) = val { + assert_eq!(s, "hello\nworld\n"); + } else { + panic!("Expected string"); + } } } diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..2b74578 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,31 @@ +use std::env; +use std::fs; + +fn main() { + let args: Vec = env::args().collect(); + if args.len() != 2 { + eprintln!("Usage: {} ", args[0]); + std::process::exit(1); + } + + let filename = &args[1]; + let src = match fs::read_to_string(filename) { + Ok(s) => s, + Err(e) => { + eprintln!("Error reading {}: {}", filename, e); + std::process::exit(1); + } + }; + + match maml::parse_with_report(filename, &src) { + Some(value) => { + println!("{:#?}", value); + + #[cfg(feature = "serde")] + if let Ok(json) = serde_json::to_string_pretty(&value) { + println!("\nAs JSON:\n{}", json); + } + } + None => std::process::exit(1), + } +} From 8fe0bf5935e23e7c933f94606a6708aa2649cbcb Mon Sep 17 00:00:00 2001 From: Myles Wirth Date: Mon, 1 Dec 2025 08:35:25 -0500 Subject: [PATCH 02/11] Fixed accidental newline consumption --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 6c9f197..b81859e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -168,7 +168,7 @@ fn parser<'a>() -> impl Parser<'a, &'a str, MamlValue, extra::Err array, object, )) - .padded() + .padded_by(ws) }) } From 65b16a6ade6c69ba84e935a668117e8e86a77c4d Mon Sep 17 00:00:00 2001 From: Myles Wirth Date: Mon, 1 Dec 2025 08:35:40 -0500 Subject: [PATCH 03/11] Added band-aid padded() for trailing newline support --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b81859e..39f8787 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -174,7 +174,7 @@ fn parser<'a>() -> impl Parser<'a, &'a str, MamlValue, extra::Err /// Parse from string (like `serde_json::from_str`) pub fn from_str(input: &str) -> Result { - let (val, errs) = parser().parse(input).into_output_errors(); + let (val, errs) = parser().padded().parse(input).into_output_errors(); if !errs.is_empty() { let mut buffer = Vec::new(); @@ -198,7 +198,7 @@ pub fn from_str(input: &str) -> Result { /// Parse with detailed error reporting to stderr pub fn parse_with_report(filename: &str, input: &str) -> Option { - let (val, errs) = parser().parse(input).into_output_errors(); + let (val, errs) = parser().padded().parse(input).into_output_errors(); if errs.is_empty() { return val; From b6e1d3f013bfa24a35d33f316045cad70a572a8f Mon Sep 17 00:00:00 2001 From: Myles Wirth Date: Mon, 1 Dec 2025 09:50:01 -0500 Subject: [PATCH 04/11] Switched to using logos Drastically simplifies the parsing, and handles a lot of the optimizations that actually matter here --- src/lib.rs | 354 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 221 insertions(+), 133 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 39f8787..63bb16e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ use ariadne::{Color, Label, Report, ReportKind, Source}; use chumsky::prelude::*; +use logos::Logos; use std::collections::HashMap; /// MAML AST @@ -16,174 +17,253 @@ pub enum MamlValue { Object(HashMap), } +/// Tokens for MAML +#[derive(Logos, Debug, Clone, PartialEq)] +#[logos(skip r"[ \t]+")] +enum Token { + #[token("null")] + Null, + + #[token("true")] + True, + + #[token("false")] + False, + + // Float MUST come before Int to ensure proper decimal matching, so priority three + #[regex(r"-?(?:0|[1-9][0-9]*)\.[0-9]+(?:[eE][+-]?[0-9]+)?", |lex| lex.slice().parse::().ok(), priority = 3)] + #[regex(r"-?(?:0|[1-9][0-9]*)[eE][+-]?[0-9]+", |lex| lex.slice().parse::().ok(), priority = 3)] + Float(f64), + + #[regex(r"-?(?:0|[1-9][0-9]*)", |lex| lex.slice().parse::().ok(), priority = 2)] + Int(i64), + + #[regex(r#""(?:[^"\\]|\\["\\/bfnrt]|\\u\{[0-9a-fA-F]{1,6}\})*""#, |lex| { + let s = lex.slice(); + parse_string(&s[1..s.len()-1]) + })] + String(String), + + // Surrounded by triple quotes + #[regex(r#""""([^"]|"[^"]|""[^"])*""""#, |lex| { + let s = lex.slice(); + let content = &s[3..s.len()-3]; + + // Make sure triple quotes are checked + if content.contains(r#"""""#) { + return None; + } + + Some(content.strip_prefix('\n') + .or_else(|| content.strip_prefix("\r\n")) + .unwrap_or(content) + .to_string()) + })] + RawString(String), + + // An object key + #[regex(r"[a-zA-Z_-][a-zA-Z0-9_-]*", |lex| lex.slice().to_string(), priority = 1)] + #[regex(r"[0-9]+", |lex| lex.slice().to_string(), priority = 1)] + Key(String), + + #[token("[")] + LBracket, + + #[token("]")] + RBracket, + + #[token("{")] + LBrace, + + #[token("}")] + RBrace, + + #[token(":")] + Colon, + + #[token(",")] + Comma, + + #[token("\n")] + Newline, + + // Anything that comes after a # + #[regex(r"#[^\n]*", logos::skip)] + Comment, +} + +impl std::fmt::Display for Token { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Token::Null => write!(f, "null"), + Token::True => write!(f, "true"), + Token::False => write!(f, "false"), + Token::Float(n) => write!(f, "{}", n), + Token::Int(n) => write!(f, "{}", n), + Token::String(s) => write!(f, "\"{}\"", s), + Token::RawString(s) => write!(f, "\"\"\"{}\"\"\"", s), + Token::Key(s) => write!(f, "{}", s), + Token::LBracket => write!(f, "["), + Token::RBracket => write!(f, "]"), + Token::LBrace => write!(f, "{{"), + Token::RBrace => write!(f, "}}"), + Token::Colon => write!(f, ":"), + Token::Comma => write!(f, ","), + Token::Newline => write!(f, "\\n"), + Token::Comment => write!(f, "#comment"), + } + } +} + +// Helper function to parse escape sequences in strings +fn parse_string(s: &str) -> Option { + let mut result = String::new(); + let mut chars = s.chars(); + + while let Some(ch) = chars.next() { + if ch == '\\' { + match chars.next()? { + '\\' => result.push('\\'), + '/' => result.push('/'), + '"' => result.push('"'), + 'b' => result.push('\x08'), + 'f' => result.push('\x0C'), + 'n' => result.push('\n'), + 'r' => result.push('\r'), + 't' => result.push('\t'), + 'u' => { + // Expect {XXXXXX} + if chars.next()? != '{' { + return None; + } + let mut hex = String::new(); + loop { + match chars.next()? { + '}' => break, + c if c.is_ascii_hexdigit() && hex.len() < 6 => hex.push(c), + _ => return None, + } + } + let code = u32::from_str_radix(&hex, 16).ok()?; + result.push(char::from_u32(code)?); + } + _ => return None, + } + } else { + result.push(ch); + } + } + + Some(result) +} + /// Parser definition -fn parser<'a>() -> impl Parser<'a, &'a str, MamlValue, extra::Err>> { +fn parser<'src>() -> impl Parser<'src, &'src [Token], MamlValue, extra::Err>> { recursive(|value| { - // Comments - let comment = just('#') - .then(any().and_is(text::newline().not()).repeated()) - .ignored(); - - // Whitespace (not including newlines) - let ws = one_of(" \t").ignored().repeated(); - - // Separator: comma, newline, or comment+newline - // All branches must return () so we use .ignored() - let separator = choice(( - just(',').ignored(), - text::newline().ignored(), - comment.then(text::newline().or_not()).ignored(), + // Separator: comma or newline + let separator = choice((just(Token::Comma).ignored(), just(Token::Newline).ignored())); + + // The number types + let number = choice(( + select! { Token::Float(f) => MamlValue::Float(f) }, + select! { Token::Int(i) => MamlValue::Int(i) }, )) - .padded_by(ws); + .labelled("number"); - // --- Numbers --- - let digits = text::digits(10); - let integer_part = choice(( - just('0').to_slice(), - one_of("123456789").then(digits.or_not()).to_slice(), + // Strings, raw or typical + let string_val = choice(( + select! { Token::RawString(s) => MamlValue::String(s) }, + select! { Token::String(s) => MamlValue::String(s) }, + )); + + // Handling object keys + let key = choice(( + select! { Token::String(s) => s }, + select! { Token::Key(s) => s }, )); - let fraction = just('.').then(digits); - let exponent = one_of("eE").then(one_of("+-").or_not()).then(digits); - - let number = just('-') - .or_not() - .then(integer_part) - .then(fraction.or_not()) - .then(exponent.or_not()) - .to_slice() - .try_map(|s: &str, span| { - if s.contains(['.', 'e', 'E']) { - s.parse::() - .map(MamlValue::Float) - .map_err(|_| Rich::custom(span, "Invalid float")) - } else { - s.parse::() - .map(MamlValue::Int) - .map_err(|_| Rich::custom(span, "Integer overflow (must fit in 64-bit)")) - } - }) - .labelled("number"); - - // --- Strings --- - let escape = just('\\').ignore_then(choice(( - just('\\'), - just('/'), - just('"'), - just('b').to('\x08'), - just('f').to('\x0C'), - just('n').to('\n'), - just('r').to('\r'), - just('t').to('\t'), - just('u').ignore_then( - text::digits(16) - .at_least(1) - .at_most(6) - .to_slice() - .delimited_by(just('{'), just('}')) - .try_map(|digits: &str, span| { - let code = u32::from_str_radix(digits, 16).unwrap(); - char::from_u32(code).ok_or_else(|| { - Rich::custom(span, format!("Invalid unicode codepoint: U+{:X}", code)) - }) - }), - ), - ))); - - let string_content = none_of("\\\"").or(escape).repeated().collect::(); - let simple_string = string_content - .delimited_by(just('"'), just('"')) - .labelled("string"); - - // Raw string: """...""" - let raw_string = just("\"\"\"") - .ignore_then( - any() - .and_is(just("\"\"\"").not()) - .repeated() - .collect::(), - ) - .then_ignore(just("\"\"\"")) - .map(|s: String| { - // Strip leading newline if present - s.strip_prefix('\n') - .or_else(|| s.strip_prefix("\r\n")) - .unwrap_or(&s) - .to_string() - }) - .labelled("raw string"); - - let string_val = choice((raw_string, simple_string)).map(MamlValue::String); - - // --- Keys --- - let identifier = one_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-") - .repeated() - .at_least(1) - .collect::(); - - let key = choice((simple_string.clone(), identifier)).padded_by(ws); - - // --- Array --- let array = value .clone() .separated_by(separator.clone().repeated().at_least(1)) .allow_trailing() .collect() - .padded() - .delimited_by(just('['), just(']')) + .padded_by(just(Token::Newline).repeated()) + .delimited_by(just(Token::LBracket), just(Token::RBracket)) .map(MamlValue::Array) .labelled("array") .recover_with(via_parser(nested_delimiters( - '[', - ']', - [('[', ']'), ('{', '}')], + Token::LBracket, + Token::RBracket, + [ + (Token::LBracket, Token::RBracket), + (Token::LBrace, Token::RBrace), + ], |_| MamlValue::Array(vec![]), ))); - // --- Object --- - let member = key.then_ignore(just(':').padded_by(ws)).then(value.clone()); + // Object parsing + let member = key.then_ignore(just(Token::Colon)).then(value.clone()); let object = member .separated_by(separator.repeated().at_least(1)) .allow_trailing() .collect() - .padded() - .delimited_by(just('{'), just('}')) + .padded_by(just(Token::Newline).repeated()) + .delimited_by(just(Token::LBrace), just(Token::RBrace)) .map(MamlValue::Object) .labelled("object") .recover_with(via_parser(nested_delimiters( - '{', - '}', - [('[', ']'), ('{', '}')], + Token::LBrace, + Token::RBrace, + [ + (Token::LBracket, Token::RBracket), + (Token::LBrace, Token::RBrace), + ], |_| MamlValue::Object(HashMap::new()), ))); - // --- Top-level choice --- + // Entry point/top-level choice choice(( - just("null").to(MamlValue::Null), - just("true").to(MamlValue::Bool(true)), - just("false").to(MamlValue::Bool(false)), + just(Token::Null).to(MamlValue::Null), + just(Token::True).to(MamlValue::Bool(true)), + just(Token::False).to(MamlValue::Bool(false)), number, string_val, array, object, )) - .padded_by(ws) }) } /// Parse from string (like `serde_json::from_str`) pub fn from_str(input: &str) -> Result { - let (val, errs) = parser().padded().parse(input).into_output_errors(); + // Tokenize + let lexer = Token::lexer(input); + let mut tokens = vec![]; + + for (token_result, span) in lexer.spanned() { + match token_result { + Ok(token) => tokens.push(token), + Err(_) => { + return Err(format!("Lexer error at {:?}", span)); + } + } + } + + // Parse + let (val, errs) = parser() + .padded_by(just(Token::Newline).repeated()) + .parse(&tokens) + .into_output_errors(); if !errs.is_empty() { let mut buffer = Vec::new(); for e in errs { Report::build(ReportKind::Error, ("", e.span().into_range())) - .with_message(e.to_string()) + .with_message(format!("{:?}", e)) .with_label( Label::new(("", e.span().into_range())) - .with_message(e.reason().to_string()) + .with_message(format!("{:?}", e.reason())) .with_color(Color::Red), ) .finish() @@ -198,7 +278,25 @@ pub fn from_str(input: &str) -> Result { /// Parse with detailed error reporting to stderr pub fn parse_with_report(filename: &str, input: &str) -> Option { - let (val, errs) = parser().padded().parse(input).into_output_errors(); + // Tokenize + let lexer = Token::lexer(input); + let mut tokens = vec![]; + + for (token_result, span) in lexer.spanned() { + match token_result { + Ok(token) => tokens.push(token), + Err(_) => { + eprintln!("Lexer error at {:?}", span); + return None; + } + } + } + + // Parse + let (val, errs) = parser() + .padded_by(just(Token::Newline).repeated()) + .parse(&tokens) + .into_output_errors(); if errs.is_empty() { return val; @@ -213,16 +311,6 @@ pub fn parse_with_report(filename: &str, input: &str) -> Option { .with_message(e.reason().to_string()) .with_color(Color::Red), ) - .with_note(format!( - "Error at line {} column {}", - input[..span.start].lines().count(), - input[..span.start] - .lines() - .last() - .map(|l| l.len()) - .unwrap_or(0) - + 1 - )) .finish() .eprint((filename, Source::from(input))) .unwrap(); From c7c6b9416d5a626d83d03f85b602917f613ef04b Mon Sep 17 00:00:00 2001 From: Myles Wirth Date: Mon, 1 Dec 2025 09:50:09 -0500 Subject: [PATCH 05/11] Added logos --- Cargo.lock | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++- Cargo.toml | 7 ++++ 2 files changed, 109 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index b040ed5..6df5370 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -36,6 +36,12 @@ dependencies = [ "yansi", ] +[[package]] +name = "beef" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" + [[package]] name = "cc" version = "1.2.46" @@ -78,6 +84,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "foldhash" version = "0.1.5" @@ -95,18 +107,67 @@ dependencies = [ "foldhash", ] +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "libc" version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" +[[package]] +name = "logos" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff472f899b4ec2d99161c51f60ff7075eeb3097069a36050d8037a6325eb8154" +dependencies = [ + "logos-derive", +] + +[[package]] +name = "logos-codegen" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "192a3a2b90b0c05b27a0b2c43eecdb7c415e29243acc3f89cc8247a5b693045c" +dependencies = [ + "beef", + "fnv", + "lazy_static", + "proc-macro2", + "quote", + "regex-syntax 0.8.8", + "rustc_version", + "syn", +] + +[[package]] +name = "logos-derive" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "605d9697bcd5ef3a42d38efc51541aa3d6a4a25f7ab6d1ed0da5ac632a26b470" +dependencies = [ + "logos-codegen", +] + [[package]] name = "maml" version = "0.0.0" dependencies = [ "ariadne", "chumsky", + "logos", + "serde", + "serde_json", ] [[package]] @@ -160,7 +221,7 @@ checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.7.5", ] [[package]] @@ -169,6 +230,33 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +[[package]] +name = "regex-syntax" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + [[package]] name = "serde" version = "1.0.228" @@ -199,6 +287,19 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_json" +version = "1.0.145" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", + "serde_core", +] + [[package]] name = "shlex" version = "1.3.0" diff --git a/Cargo.toml b/Cargo.toml index 773f06d..5a66729 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,3 +10,10 @@ readme = "README.md" [dependencies] ariadne = "0.6.0" chumsky = "1.0.0-alpha.7" +logos = "0.15.1" +serde = { version = "1.0", optional = true, features = ["derive"]} +serde_json = { version = "1.0"} + +[features] +default = [] +serde = ["dep:serde"] From d84f0e38df54cefa3cd84087d523abf8920fbacf Mon Sep 17 00:00:00 2001 From: Myles Wirth Date: Mon, 1 Dec 2025 09:57:30 -0500 Subject: [PATCH 06/11] Broke things out into sepearate files --- maml.js | 1 + src/lib.rs | 311 +---------------------------------------------- src/main.rs | 4 +- src/parser.rs | 170 ++++++++++++++++++++++++++ src/tokenizer.rs | 101 +++++++++++++++ src/utils.rs | 41 +++++++ 6 files changed, 322 insertions(+), 306 deletions(-) create mode 160000 maml.js create mode 100644 src/parser.rs create mode 100644 src/tokenizer.rs create mode 100644 src/utils.rs diff --git a/maml.js b/maml.js new file mode 160000 index 0000000..5d1521a --- /dev/null +++ b/maml.js @@ -0,0 +1 @@ +Subproject commit 5d1521ac22378ed82d1c4eac10123de4f36e89ff diff --git a/src/lib.rs b/src/lib.rs index 63bb16e..ea73dc2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,8 +1,9 @@ -use ariadne::{Color, Label, Report, ReportKind, Source}; -use chumsky::prelude::*; -use logos::Logos; use std::collections::HashMap; +pub mod parser; +pub mod tokenizer; +mod utils; + /// MAML AST #[derive(Debug, Clone, PartialEq)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] @@ -17,310 +18,10 @@ pub enum MamlValue { Object(HashMap), } -/// Tokens for MAML -#[derive(Logos, Debug, Clone, PartialEq)] -#[logos(skip r"[ \t]+")] -enum Token { - #[token("null")] - Null, - - #[token("true")] - True, - - #[token("false")] - False, - - // Float MUST come before Int to ensure proper decimal matching, so priority three - #[regex(r"-?(?:0|[1-9][0-9]*)\.[0-9]+(?:[eE][+-]?[0-9]+)?", |lex| lex.slice().parse::().ok(), priority = 3)] - #[regex(r"-?(?:0|[1-9][0-9]*)[eE][+-]?[0-9]+", |lex| lex.slice().parse::().ok(), priority = 3)] - Float(f64), - - #[regex(r"-?(?:0|[1-9][0-9]*)", |lex| lex.slice().parse::().ok(), priority = 2)] - Int(i64), - - #[regex(r#""(?:[^"\\]|\\["\\/bfnrt]|\\u\{[0-9a-fA-F]{1,6}\})*""#, |lex| { - let s = lex.slice(); - parse_string(&s[1..s.len()-1]) - })] - String(String), - - // Surrounded by triple quotes - #[regex(r#""""([^"]|"[^"]|""[^"])*""""#, |lex| { - let s = lex.slice(); - let content = &s[3..s.len()-3]; - - // Make sure triple quotes are checked - if content.contains(r#"""""#) { - return None; - } - - Some(content.strip_prefix('\n') - .or_else(|| content.strip_prefix("\r\n")) - .unwrap_or(content) - .to_string()) - })] - RawString(String), - - // An object key - #[regex(r"[a-zA-Z_-][a-zA-Z0-9_-]*", |lex| lex.slice().to_string(), priority = 1)] - #[regex(r"[0-9]+", |lex| lex.slice().to_string(), priority = 1)] - Key(String), - - #[token("[")] - LBracket, - - #[token("]")] - RBracket, - - #[token("{")] - LBrace, - - #[token("}")] - RBrace, - - #[token(":")] - Colon, - - #[token(",")] - Comma, - - #[token("\n")] - Newline, - - // Anything that comes after a # - #[regex(r"#[^\n]*", logos::skip)] - Comment, -} - -impl std::fmt::Display for Token { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Token::Null => write!(f, "null"), - Token::True => write!(f, "true"), - Token::False => write!(f, "false"), - Token::Float(n) => write!(f, "{}", n), - Token::Int(n) => write!(f, "{}", n), - Token::String(s) => write!(f, "\"{}\"", s), - Token::RawString(s) => write!(f, "\"\"\"{}\"\"\"", s), - Token::Key(s) => write!(f, "{}", s), - Token::LBracket => write!(f, "["), - Token::RBracket => write!(f, "]"), - Token::LBrace => write!(f, "{{"), - Token::RBrace => write!(f, "}}"), - Token::Colon => write!(f, ":"), - Token::Comma => write!(f, ","), - Token::Newline => write!(f, "\\n"), - Token::Comment => write!(f, "#comment"), - } - } -} - -// Helper function to parse escape sequences in strings -fn parse_string(s: &str) -> Option { - let mut result = String::new(); - let mut chars = s.chars(); - - while let Some(ch) = chars.next() { - if ch == '\\' { - match chars.next()? { - '\\' => result.push('\\'), - '/' => result.push('/'), - '"' => result.push('"'), - 'b' => result.push('\x08'), - 'f' => result.push('\x0C'), - 'n' => result.push('\n'), - 'r' => result.push('\r'), - 't' => result.push('\t'), - 'u' => { - // Expect {XXXXXX} - if chars.next()? != '{' { - return None; - } - let mut hex = String::new(); - loop { - match chars.next()? { - '}' => break, - c if c.is_ascii_hexdigit() && hex.len() < 6 => hex.push(c), - _ => return None, - } - } - let code = u32::from_str_radix(&hex, 16).ok()?; - result.push(char::from_u32(code)?); - } - _ => return None, - } - } else { - result.push(ch); - } - } - - Some(result) -} - -/// Parser definition -fn parser<'src>() -> impl Parser<'src, &'src [Token], MamlValue, extra::Err>> { - recursive(|value| { - // Separator: comma or newline - let separator = choice((just(Token::Comma).ignored(), just(Token::Newline).ignored())); - - // The number types - let number = choice(( - select! { Token::Float(f) => MamlValue::Float(f) }, - select! { Token::Int(i) => MamlValue::Int(i) }, - )) - .labelled("number"); - - // Strings, raw or typical - let string_val = choice(( - select! { Token::RawString(s) => MamlValue::String(s) }, - select! { Token::String(s) => MamlValue::String(s) }, - )); - - // Handling object keys - let key = choice(( - select! { Token::String(s) => s }, - select! { Token::Key(s) => s }, - )); - - let array = value - .clone() - .separated_by(separator.clone().repeated().at_least(1)) - .allow_trailing() - .collect() - .padded_by(just(Token::Newline).repeated()) - .delimited_by(just(Token::LBracket), just(Token::RBracket)) - .map(MamlValue::Array) - .labelled("array") - .recover_with(via_parser(nested_delimiters( - Token::LBracket, - Token::RBracket, - [ - (Token::LBracket, Token::RBracket), - (Token::LBrace, Token::RBrace), - ], - |_| MamlValue::Array(vec![]), - ))); - - // Object parsing - let member = key.then_ignore(just(Token::Colon)).then(value.clone()); - - let object = member - .separated_by(separator.repeated().at_least(1)) - .allow_trailing() - .collect() - .padded_by(just(Token::Newline).repeated()) - .delimited_by(just(Token::LBrace), just(Token::RBrace)) - .map(MamlValue::Object) - .labelled("object") - .recover_with(via_parser(nested_delimiters( - Token::LBrace, - Token::RBrace, - [ - (Token::LBracket, Token::RBracket), - (Token::LBrace, Token::RBrace), - ], - |_| MamlValue::Object(HashMap::new()), - ))); - - // Entry point/top-level choice - choice(( - just(Token::Null).to(MamlValue::Null), - just(Token::True).to(MamlValue::Bool(true)), - just(Token::False).to(MamlValue::Bool(false)), - number, - string_val, - array, - object, - )) - }) -} - -/// Parse from string (like `serde_json::from_str`) -pub fn from_str(input: &str) -> Result { - // Tokenize - let lexer = Token::lexer(input); - let mut tokens = vec![]; - - for (token_result, span) in lexer.spanned() { - match token_result { - Ok(token) => tokens.push(token), - Err(_) => { - return Err(format!("Lexer error at {:?}", span)); - } - } - } - - // Parse - let (val, errs) = parser() - .padded_by(just(Token::Newline).repeated()) - .parse(&tokens) - .into_output_errors(); - - if !errs.is_empty() { - let mut buffer = Vec::new(); - for e in errs { - Report::build(ReportKind::Error, ("", e.span().into_range())) - .with_message(format!("{:?}", e)) - .with_label( - Label::new(("", e.span().into_range())) - .with_message(format!("{:?}", e.reason())) - .with_color(Color::Red), - ) - .finish() - .write(("", Source::from(input)), &mut buffer) - .unwrap(); - } - return Err(String::from_utf8_lossy(&buffer).to_string()); - } - - val.ok_or_else(|| "Unexpected parsing failure".to_string()) -} - -/// Parse with detailed error reporting to stderr -pub fn parse_with_report(filename: &str, input: &str) -> Option { - // Tokenize - let lexer = Token::lexer(input); - let mut tokens = vec![]; - - for (token_result, span) in lexer.spanned() { - match token_result { - Ok(token) => tokens.push(token), - Err(_) => { - eprintln!("Lexer error at {:?}", span); - return None; - } - } - } - - // Parse - let (val, errs) = parser() - .padded_by(just(Token::Newline).repeated()) - .parse(&tokens) - .into_output_errors(); - - if errs.is_empty() { - return val; - } - - for e in errs { - let span = e.span().into_range(); - Report::build(ReportKind::Error, (filename, span.clone())) - .with_message(format!("{}", e)) - .with_label( - Label::new((filename, span.clone())) - .with_message(e.reason().to_string()) - .with_color(Color::Red), - ) - .finish() - .eprint((filename, Source::from(input))) - .unwrap(); - } - - None -} - #[cfg(test)] mod tests { + use crate::parser::from_str; + use super::*; #[test] diff --git a/src/main.rs b/src/main.rs index 2b74578..9ae5a33 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,8 @@ use std::env; use std::fs; +use maml::parser::parse_with_report; + fn main() { let args: Vec = env::args().collect(); if args.len() != 2 { @@ -17,7 +19,7 @@ fn main() { } }; - match maml::parse_with_report(filename, &src) { + match parse_with_report(filename, &src) { Some(value) => { println!("{:#?}", value); diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..8786797 --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,170 @@ +use std::collections::HashMap; + +use ariadne::{Color, Label, Report, ReportKind, Source}; +use chumsky::prelude::*; +use logos::Logos; + +use crate::{MamlValue, tokenizer::Token}; + +/// Parser definition +pub fn parser<'src>() -> impl Parser<'src, &'src [Token], MamlValue, extra::Err>> +{ + recursive(|value| { + // Separator: comma or newline + let separator = choice((just(Token::Comma).ignored(), just(Token::Newline).ignored())); + + // The number types + let number = choice(( + select! { Token::Float(f) => MamlValue::Float(f) }, + select! { Token::Int(i) => MamlValue::Int(i) }, + )) + .labelled("number"); + + // Strings, raw or typical + let string_val = choice(( + select! { Token::RawString(s) => MamlValue::String(s) }, + select! { Token::String(s) => MamlValue::String(s) }, + )); + + // Handling object keys + let key = choice(( + select! { Token::String(s) => s }, + select! { Token::Key(s) => s }, + )); + + let array = value + .clone() + .separated_by(separator.clone().repeated().at_least(1)) + .allow_trailing() + .collect() + .padded_by(just(Token::Newline).repeated()) + .delimited_by(just(Token::LBracket), just(Token::RBracket)) + .map(MamlValue::Array) + .labelled("array") + .recover_with(via_parser(nested_delimiters( + Token::LBracket, + Token::RBracket, + [ + (Token::LBracket, Token::RBracket), + (Token::LBrace, Token::RBrace), + ], + |_| MamlValue::Array(vec![]), + ))); + + // Object parsing + let member = key.then_ignore(just(Token::Colon)).then(value.clone()); + + let object = member + .separated_by(separator.repeated().at_least(1)) + .allow_trailing() + .collect() + .padded_by(just(Token::Newline).repeated()) + .delimited_by(just(Token::LBrace), just(Token::RBrace)) + .map(MamlValue::Object) + .labelled("object") + .recover_with(via_parser(nested_delimiters( + Token::LBrace, + Token::RBrace, + [ + (Token::LBracket, Token::RBracket), + (Token::LBrace, Token::RBrace), + ], + |_| MamlValue::Object(HashMap::new()), + ))); + + // Entry point/top-level choice + choice(( + just(Token::Null).to(MamlValue::Null), + just(Token::True).to(MamlValue::Bool(true)), + just(Token::False).to(MamlValue::Bool(false)), + number, + string_val, + array, + object, + )) + }) +} + +/// Parse from string (like `serde_json::from_str`) +pub fn from_str(input: &str) -> Result { + // Tokenize + let lexer = Token::lexer(input); + let mut tokens = vec![]; + + for (token_result, span) in lexer.spanned() { + match token_result { + Ok(token) => tokens.push(token), + Err(_) => { + return Err(format!("Lexer error at {:?}", span)); + } + } + } + + // Parse + let (val, errs) = parser() + .padded_by(just(Token::Newline).repeated()) + .parse(&tokens) + .into_output_errors(); + + if !errs.is_empty() { + let mut buffer = Vec::new(); + for e in errs { + Report::build(ReportKind::Error, ("", e.span().into_range())) + .with_message(format!("{:?}", e)) + .with_label( + Label::new(("", e.span().into_range())) + .with_message(format!("{:?}", e.reason())) + .with_color(Color::Red), + ) + .finish() + .write(("", Source::from(input)), &mut buffer) + .unwrap(); + } + return Err(String::from_utf8_lossy(&buffer).to_string()); + } + + val.ok_or_else(|| "Unexpected parsing failure".to_string()) +} + +/// Parse with detailed error reporting to stderr +pub fn parse_with_report(filename: &str, input: &str) -> Option { + // Tokenize + let lexer = Token::lexer(input); + let mut tokens = vec![]; + + for (token_result, span) in lexer.spanned() { + match token_result { + Ok(token) => tokens.push(token), + Err(_) => { + eprintln!("Lexer error at {:?}", span); + return None; + } + } + } + + // Parse + let (val, errs) = parser() + .padded_by(just(Token::Newline).repeated()) + .parse(&tokens) + .into_output_errors(); + + if errs.is_empty() { + return val; + } + + for e in errs { + let span = e.span().into_range(); + Report::build(ReportKind::Error, (filename, span.clone())) + .with_message(format!("{}", e)) + .with_label( + Label::new((filename, span.clone())) + .with_message(e.reason().to_string()) + .with_color(Color::Red), + ) + .finish() + .eprint((filename, Source::from(input))) + .unwrap(); + } + + None +} diff --git a/src/tokenizer.rs b/src/tokenizer.rs new file mode 100644 index 0000000..c709b02 --- /dev/null +++ b/src/tokenizer.rs @@ -0,0 +1,101 @@ +use logos::Logos; + +use crate::utils::parse_string; + +/// Token definitions for MAML +#[derive(Logos, Debug, Clone, PartialEq)] +#[logos(skip r"[ \t]+")] +pub enum Token { + #[token("null")] + Null, + + #[token("true")] + True, + + #[token("false")] + False, + + // Float MUST come before Int to ensure proper decimal matching, so priority three + #[regex(r"-?(?:0|[1-9][0-9]*)\.[0-9]+(?:[eE][+-]?[0-9]+)?", |lex| lex.slice().parse::().ok(), priority = 3)] + #[regex(r"-?(?:0|[1-9][0-9]*)[eE][+-]?[0-9]+", |lex| lex.slice().parse::().ok(), priority = 3)] + Float(f64), + + #[regex(r"-?(?:0|[1-9][0-9]*)", |lex| lex.slice().parse::().ok(), priority = 2)] + Int(i64), + + #[regex(r#""(?:[^"\\]|\\["\\/bfnrt]|\\u\{[0-9a-fA-F]{1,6}\})*""#, |lex| { + let s = lex.slice(); + parse_string(&s[1..s.len()-1]) + })] + String(String), + + // Surrounded by triple quotes + #[regex(r#""""([^"]|"[^"]|""[^"])*""""#, |lex| { + let s = lex.slice(); + let content = &s[3..s.len()-3]; + + // Make sure triple quotes are checked + if content.contains(r#"""""#) { + return None; + } + + Some(content.strip_prefix('\n') + .or_else(|| content.strip_prefix("\r\n")) + .unwrap_or(content) + .to_string()) + })] + RawString(String), + + // An object key + #[regex(r"[a-zA-Z_-][a-zA-Z0-9_-]*", |lex| lex.slice().to_string(), priority = 1)] + #[regex(r"[0-9]+", |lex| lex.slice().to_string(), priority = 1)] + Key(String), + + #[token("[")] + LBracket, + + #[token("]")] + RBracket, + + #[token("{")] + LBrace, + + #[token("}")] + RBrace, + + #[token(":")] + Colon, + + #[token(",")] + Comma, + + #[token("\n")] + Newline, + + // Anything that comes after a # + #[regex(r"#[^\n]*", logos::skip)] + Comment, +} + +impl std::fmt::Display for Token { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Token::Null => write!(f, "null"), + Token::True => write!(f, "true"), + Token::False => write!(f, "false"), + Token::Float(n) => write!(f, "{}", n), + Token::Int(n) => write!(f, "{}", n), + Token::String(s) => write!(f, "\"{}\"", s), + Token::RawString(s) => write!(f, "\"\"\"{}\"\"\"", s), + Token::Key(s) => write!(f, "{}", s), + Token::LBracket => write!(f, "["), + Token::RBracket => write!(f, "]"), + Token::LBrace => write!(f, "{{"), + Token::RBrace => write!(f, "}}"), + Token::Colon => write!(f, ":"), + Token::Comma => write!(f, ","), + Token::Newline => write!(f, "\\n"), + Token::Comment => write!(f, "#comment"), + } + } +} diff --git a/src/utils.rs b/src/utils.rs new file mode 100644 index 0000000..c839f5b --- /dev/null +++ b/src/utils.rs @@ -0,0 +1,41 @@ +/// Parse escape sequences in strings +pub(crate) fn parse_string(s: &str) -> Option { + let mut result = String::new(); + let mut chars = s.chars(); + + while let Some(ch) = chars.next() { + if ch == '\\' { + match chars.next()? { + '\\' => result.push('\\'), + '/' => result.push('/'), + '"' => result.push('"'), + 'b' => result.push('\x08'), + 'f' => result.push('\x0C'), + 'n' => result.push('\n'), + 'r' => result.push('\r'), + 't' => result.push('\t'), + 'u' => { + // Expect {XXXXXX} + if chars.next()? != '{' { + return None; + } + let mut hex = String::new(); + loop { + match chars.next()? { + '}' => break, + c if c.is_ascii_hexdigit() && hex.len() < 6 => hex.push(c), + _ => return None, + } + } + let code = u32::from_str_radix(&hex, 16).ok()?; + result.push(char::from_u32(code)?); + } + _ => return None, + } + } else { + result.push(ch); + } + } + + Some(result) +} From f1386122e54ccb02712d3d48af60c7038e090732 Mon Sep 17 00:00:00 2001 From: Myles Wirth Date: Mon, 1 Dec 2025 12:22:46 -0500 Subject: [PATCH 07/11] Removed main entrypoint -- solely lib --- src/main.rs | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 src/main.rs diff --git a/src/main.rs b/src/main.rs deleted file mode 100644 index 9ae5a33..0000000 --- a/src/main.rs +++ /dev/null @@ -1,33 +0,0 @@ -use std::env; -use std::fs; - -use maml::parser::parse_with_report; - -fn main() { - let args: Vec = env::args().collect(); - if args.len() != 2 { - eprintln!("Usage: {} ", args[0]); - std::process::exit(1); - } - - let filename = &args[1]; - let src = match fs::read_to_string(filename) { - Ok(s) => s, - Err(e) => { - eprintln!("Error reading {}: {}", filename, e); - std::process::exit(1); - } - }; - - match parse_with_report(filename, &src) { - Some(value) => { - println!("{:#?}", value); - - #[cfg(feature = "serde")] - if let Ok(json) = serde_json::to_string_pretty(&value) { - println!("\nAs JSON:\n{}", json); - } - } - None => std::process::exit(1), - } -} From 30c2a26e556f19347216bb2341239a21b74dfa2f Mon Sep 17 00:00:00 2001 From: Myles Wirth Date: Mon, 1 Dec 2025 12:22:49 -0500 Subject: [PATCH 08/11] Documentation --- src/parser.rs | 2 +- src/tokenizer.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 8786797..94fa9db 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -6,7 +6,7 @@ use logos::Logos; use crate::{MamlValue, tokenizer::Token}; -/// Parser definition +/// Raw parser entrypoint pub fn parser<'src>() -> impl Parser<'src, &'src [Token], MamlValue, extra::Err>> { recursive(|value| { diff --git a/src/tokenizer.rs b/src/tokenizer.rs index c709b02..f3a612b 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -32,7 +32,7 @@ pub enum Token { // Surrounded by triple quotes #[regex(r#""""([^"]|"[^"]|""[^"])*""""#, |lex| { let s = lex.slice(); - let content = &s[3..s.len()-3]; + let content = &s[3..s.len()-3]; // The content within the triple quotes (which take up three chars each) // Make sure triple quotes are checked if content.contains(r#"""""#) { From 54c1f5bdb59a0fba4f735d15d74f87c15550690c Mon Sep 17 00:00:00 2001 From: Myles Wirth Date: Mon, 1 Dec 2025 12:23:44 -0500 Subject: [PATCH 09/11] Removed serde_json mentions --- Cargo.toml | 1 - src/parser.rs | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5a66729..76a58b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,6 @@ ariadne = "0.6.0" chumsky = "1.0.0-alpha.7" logos = "0.15.1" serde = { version = "1.0", optional = true, features = ["derive"]} -serde_json = { version = "1.0"} [features] default = [] diff --git a/src/parser.rs b/src/parser.rs index 94fa9db..f20abd6 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -85,7 +85,7 @@ pub fn parser<'src>() -> impl Parser<'src, &'src [Token], MamlValue, extra::Err< }) } -/// Parse from string (like `serde_json::from_str`) +/// Parse from string pub fn from_str(input: &str) -> Result { // Tokenize let lexer = Token::lexer(input); From 5f95a517114666d1794adf786630b9d514a079ec Mon Sep 17 00:00:00 2001 From: Myles Wirth Date: Mon, 1 Dec 2025 12:24:28 -0500 Subject: [PATCH 10/11] OR \r --- Cargo.lock | 26 -------------------------- src/tokenizer.rs | 2 +- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6df5370..657b839 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -107,12 +107,6 @@ dependencies = [ "foldhash", ] -[[package]] -name = "itoa" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" - [[package]] name = "lazy_static" version = "1.5.0" @@ -167,7 +161,6 @@ dependencies = [ "chumsky", "logos", "serde", - "serde_json", ] [[package]] @@ -245,12 +238,6 @@ dependencies = [ "semver", ] -[[package]] -name = "ryu" -version = "1.0.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" - [[package]] name = "semver" version = "1.0.27" @@ -287,19 +274,6 @@ dependencies = [ "syn", ] -[[package]] -name = "serde_json" -version = "1.0.145" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" -dependencies = [ - "itoa", - "memchr", - "ryu", - "serde", - "serde_core", -] - [[package]] name = "shlex" version = "1.3.0" diff --git a/src/tokenizer.rs b/src/tokenizer.rs index f3a612b..e746a9e 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -69,7 +69,7 @@ pub enum Token { #[token(",")] Comma, - #[token("\n")] + #[regex(r"\r?\n")] Newline, // Anything that comes after a # From e24e64a508d7134ab05b6b9a0c9f9caff510faa6 Mon Sep 17 00:00:00 2001 From: Myles Wirth Date: Mon, 1 Dec 2025 12:26:47 -0500 Subject: [PATCH 11/11] Added one super pretty test --- src/parser.rs | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/parser.rs b/src/parser.rs index f20abd6..dc2fddd 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -168,3 +168,59 @@ pub fn parse_with_report(filename: &str, input: &str) -> Option { None } + +#[cfg(test)] +mod tests { + use crate::parser::from_str; + + use super::*; + + #[test] + fn test_object() { + let val = from_str( + r#" + { + project: "MAML" + tags: [ + "minimal" + "readable" + ] + + # A simple nested object + spec: { + version: 1 + author: "Anton Medvedev" + } + + # Array of objects with nested objects + examples: [ + { + json: { + name: "JSON" + born: 2001 + } + } + { + maml: { + name: "MAML" + born: 2025 + } + } + ] + + notes: """ + This is a multiline strings. + Keeps formatting as-is. + """ + } + "#, + ) + .unwrap(); + if let MamlValue::Object(obj) = val { + assert_eq!(obj.len(), 5); + assert!(obj.contains_key("notes")); + } else { + panic!("Expected object"); + } + } +}