diff --git a/Cargo.lock b/Cargo.lock index bc9cf04..672997a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -420,12 +420,13 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.7.1" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" +checksum = "3954d50fe15b02142bf25d3b8bdadb634ec3948f103d04ffe3031bc8fe9d7058" dependencies = [ "equivalent", "hashbrown", + "serde", ] [[package]] @@ -569,6 +570,16 @@ dependencies = [ "serde", ] +[[package]] +name = "ordermap" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e98f974336ceffd5b1b1f4fcbb89a23c8dcd334adc4b8612f11b7fa99944535" +dependencies = [ + "indexmap", + "serde", +] + [[package]] name = "percent-encoding" version = "2.3.1" @@ -735,6 +746,7 @@ dependencies = [ "maplit", "once_cell", "ordered-float", + "ordermap", "quick-xml", "regex", "schemars", diff --git a/schema_analysis/Cargo.toml b/schema_analysis/Cargo.toml index a7ac196..7986f9b 100644 --- a/schema_analysis/Cargo.toml +++ b/schema_analysis/Cargo.toml @@ -23,6 +23,7 @@ serde = { version = "1.0", features = [ "serde_derive" ] } regex = "1.5" # Used to detect interesting strings once_cell = "1.8" # For global constants that require allocation ordered-float = { version = "3.4", features = [ "serde" ] } # To save sets of floats +ordermap = { version = "0.5", features = [ "serde" ] } # To preserve the order of fields in the schema # These are used to allow the users of the library to run # custom analysis on the nodes. Check src/context/aggregators.rs diff --git a/schema_analysis/src/analysis/schema.rs b/schema_analysis/src/analysis/schema.rs index 741760b..d1f42a6 100644 --- a/schema_analysis/src/analysis/schema.rs +++ b/schema_analysis/src/analysis/schema.rs @@ -1,5 +1,4 @@ -use std::collections::BTreeMap; - +use ordermap::OrderMap; use serde::de::{Error, Visitor}; use crate::{Aggregate, Field, Schema}; @@ -173,7 +172,7 @@ impl<'de> Visitor<'de> for SchemaVisitor<'_> { A: serde::de::MapAccess<'de>, { let mut keys = Vec::new(); - let mut fields: BTreeMap = BTreeMap::new(); + let mut fields: OrderMap = OrderMap::new(); while let Some(key) = map.next_key::()? { match fields.get_mut(&key) { diff --git a/schema_analysis/src/schema.rs b/schema_analysis/src/schema.rs index e8540c3..47a6aa5 100644 --- a/schema_analysis/src/schema.rs +++ b/schema_analysis/src/schema.rs @@ -1,5 +1,4 @@ -use std::collections::BTreeMap; - +use ordermap::OrderMap; use serde::{Deserialize, Serialize}; use crate::{ @@ -47,7 +46,7 @@ pub enum Schema { Struct { /// Each [String] key gets assigned a [Field]. /// Currently we are using a [BTreeMap], but that might change in the future. - fields: BTreeMap, + fields: OrderMap, /// The context aggregates information about the struct. /// It is passed a vector of the key names. context: MapStructContext, @@ -101,6 +100,59 @@ pub struct FieldStatus { // // Schema implementations // +impl Schema { + /// Sorts the fields of the schema by their name (using [String::cmp]). + pub fn sort_fields(&mut self) { + match self { + Schema::Null(_) + | Schema::Boolean(_) + | Schema::Integer(_) + | Schema::Float(_) + | Schema::String(_) + | Schema::Bytes(_) => {} + Schema::Sequence { field, context: _ } => { + field.sort_fields(); + } + Schema::Struct { fields, context: _ } => { + fields.sort_keys(); + for field in fields.values_mut() { + field.sort_fields(); + } + } + Schema::Union { variants } => { + variants.sort_by(schema_cmp); + for variant in variants { + variant.sort_fields(); + } + } + } + } + /// Sorts/normalises the order of [Schema::Union] variants. + pub fn sort_variants(&mut self) { + match self { + Schema::Null(_) + | Schema::Boolean(_) + | Schema::Integer(_) + | Schema::Float(_) + | Schema::String(_) + | Schema::Bytes(_) => {} + Schema::Sequence { field, context: _ } => { + field.sort_variants(); + } + Schema::Struct { fields, context: _ } => { + for field in fields.values_mut() { + field.sort_variants(); + } + } + Schema::Union { variants } => { + variants.sort_by(schema_cmp); + for variant in variants { + variant.sort_variants(); + } + } + } + } +} impl StructuralEq for Schema { fn structural_eq(&self, other: &Self) -> bool { use Schema::*; @@ -123,7 +175,15 @@ impl StructuralEq for Schema { Struct { fields: fields_2, .. }, - ) => fields_1.structural_eq(fields_2), + ) => { + fields_1.len() == fields_2.len() + && fields_1.iter().all(|(sk, sv)| { + let Some(ov) = fields_2.get(sk) else { + return false; + }; + sv.structural_eq(ov) + }) + } (Union { variants: s }, Union { variants: o }) => { let mut s = s.clone(); @@ -378,6 +438,17 @@ impl Field { schema: Some(schema), } } + + fn sort_fields(&mut self) { + if let Some(schema) = &mut self.schema { + schema.sort_fields(); + } + } + fn sort_variants(&mut self) { + if let Some(schema) = &mut self.schema { + schema.sort_variants(); + } + } } impl Coalesce for Field { fn coalesce(&mut self, other: Self) @@ -437,63 +508,20 @@ impl Coalesce for FieldStatus { /// Since a [Schema::Union] should never hold two schemas of the same type, it is enough to /// just compare the top level without recursion. fn schema_cmp(first: &Schema, second: &Schema) -> std::cmp::Ordering { - use std::cmp::Ordering::*; - use Schema::*; - match first { - Null(_) => match second { - Null(_) => Equal, - _ => Less, - }, - Boolean(_) => match second { - Null(_) | Boolean(_) => Equal, - _ => Less, - }, - Integer(_) => match second { - Null(_) | Boolean(_) => Greater, - Integer(_) => Equal, - _ => Less, - }, - Float(_) => match second { - Null(_) | Boolean(_) | Integer(_) => Greater, - Float(_) => Equal, - _ => Less, - }, - String(_) => match second { - Null(_) | Boolean(_) | Integer(_) | Float(_) => Greater, - String(_) => Equal, - _ => Less, - }, - Bytes(_) => match second { - Null(_) | Boolean(_) | Integer(_) | Float(_) | String(_) => Greater, - Bytes(_) => Equal, - _ => Less, - }, - Sequence { .. } => match second { - Null(_) | Boolean(_) | Integer(_) | Float(_) | String(_) | Bytes(_) => Greater, - Sequence { .. } => Equal, - _ => Less, - }, - Struct { .. } => match second { - Null(_) - | Boolean(_) - | Integer(_) - | Float(_) - | String(_) - | Bytes(_) - | Sequence { .. } => Greater, - Struct { .. } => Equal, - _ => Less, - }, - Union { .. } => match second { - Null(_) - | Boolean(_) - | Integer(_) - | Float(_) - | String(_) - | Bytes(_) - | Sequence { .. } - | Struct { .. } => Greater, - Union { .. } => Equal, - }, + fn ordering(v: &Schema) -> u8 { + use Schema::*; + + match v { + Null(_) => 0, + Boolean(_) => 1, + Integer(_) => 2, + Float(_) => 3, + String(_) => 4, + Bytes(_) => 5, + Sequence { .. } => 6, + Struct { .. } => 7, + Union { .. } => 8, + } } + Ord::cmp(&ordering(first), &ordering(second)) } diff --git a/schema_analysis/src/traits.rs b/schema_analysis/src/traits.rs index 28ea87f..7719529 100644 --- a/schema_analysis/src/traits.rs +++ b/schema_analysis/src/traits.rs @@ -1,6 +1,6 @@ //! A module holding the crate's public traits. -use std::{any::Any, collections::BTreeMap, fmt::Debug}; +use std::{any::Any, fmt::Debug}; use downcast_rs::Downcast; @@ -120,12 +120,3 @@ impl StructuralEq for Option { } } } -impl StructuralEq for BTreeMap { - fn structural_eq(&self, other: &Self) -> bool { - self.len() == other.len() - && self - .iter() - .zip(other) - .all(|((sk, sv), (ok, ov))| sk.structural_eq(ok) && sv.structural_eq(ov)) - } -} diff --git a/schema_analysis/tests/shared/mod.rs b/schema_analysis/tests/shared/mod.rs index bbee9ce..71b454c 100644 --- a/schema_analysis/tests/shared/mod.rs +++ b/schema_analysis/tests/shared/mod.rs @@ -185,10 +185,7 @@ macro_rules! test_format { } mod targets { - use std::collections::BTreeMap; - - use maplit::btreemap; - + use ordermap::OrderMap; use schema_analysis::{Field, FieldStatus, Schema}; pub fn null() -> Schema { @@ -272,7 +269,7 @@ mod targets { } pub fn empty_map_struct() -> Schema { - let field_schemas: BTreeMap = BTreeMap::new(); + let field_schemas: OrderMap = OrderMap::new(); Schema::Struct { fields: field_schemas, context: Default::default(), @@ -285,9 +282,7 @@ mod targets { schema: Some(Schema::Integer(Default::default())), }; hello_field.status.may_be_normal = true; - btreemap! { - "hello".into() => hello_field - } + [("hello".into(), hello_field)].into() }; Schema::Struct { fields, @@ -306,10 +301,7 @@ mod targets { schema: Some(Schema::String(Default::default())), }; world_field.status.may_be_normal = true; - btreemap! { - "hello".into() => hello_field, - "world".into() => world_field, - } + [("hello".into(), hello_field), ("world".into(), world_field)].into() }; Schema::Struct { fields, @@ -332,11 +324,12 @@ mod targets { }); mixed_field.status.may_be_normal = true; - btreemap! { - "hello".into() => hello_field, - "world".into() => world_field, - "mixed".into() => mixed_field, - } + [ + ("hello".into(), hello_field), + ("world".into(), world_field), + ("mixed".into(), mixed_field), + ] + .into() }; let mut element_field = Field::with_schema(Schema::Struct { @@ -367,12 +360,13 @@ mod targets { null_or_missing_field.status.may_be_null = true; null_or_missing_field.status.may_be_missing = true; - btreemap! { - "hello".into() => hello_field, - "possibly_null".into() => possibly_null_field, - "possibly_missing".into() => possibly_missing_field, - "null_or_missing".into() => null_or_missing_field, - } + [ + ("hello".into(), hello_field), + ("possibly_null".into(), possibly_null_field), + ("possibly_missing".into(), possibly_missing_field), + ("null_or_missing".into(), null_or_missing_field), + ] + .into() }; let mut element_field = Field::with_schema(Schema::Struct { @@ -406,11 +400,12 @@ mod targets { }; sequence_field.status.may_be_normal = true; - btreemap! { - "hello".into() => hello_field, - "world".into() => world_field, - "sequence".into() => sequence_field, - } + [ + ("hello".into(), hello_field), + ("world".into(), world_field), + ("sequence".into(), sequence_field), + ] + .into() }; Schema::Struct { fields, @@ -441,12 +436,13 @@ mod targets { }; sequence_field.status.may_be_normal = true; - btreemap! { - "hello".into() => hello_field, - "world".into() => world_field, - "optional".into() => optional_field, - "sequence".into() => sequence_field, - } + [ + ("hello".into(), hello_field), + ("world".into(), world_field), + ("optional".into(), optional_field), + ("sequence".into(), sequence_field), + ] + .into() }; Schema::Struct { fields, diff --git a/schema_analysis/tests/source_xml.rs b/schema_analysis/tests/source_xml.rs index e92d758..a9b2cc7 100644 --- a/schema_analysis/tests/source_xml.rs +++ b/schema_analysis/tests/source_xml.rs @@ -132,22 +132,18 @@ impl FormatTests for Xml { /// We need to redefine some targets because, for example, xml doesn't have integer values. mod targets { - use std::collections::BTreeMap; - - use maplit::btreemap; + use ordermap::OrderMap; use schema_analysis::{Field, FieldStatus, Schema}; pub fn map_struct_single() -> Schema { - let fields: BTreeMap = { + let fields: OrderMap = { let mut hello_field = Field { status: FieldStatus::default(), schema: Some(Schema::String(Default::default())), }; hello_field.status.may_be_normal = true; - btreemap! { - "hello".into() => hello_field - } + [("hello".into(), hello_field)].into() }; Schema::Struct { fields, @@ -155,7 +151,7 @@ mod targets { } } pub fn map_struct_double() -> Schema { - let fields: BTreeMap = { + let fields: OrderMap = { let mut hello_field = Field { status: FieldStatus::default(), schema: Some(Schema::String(Default::default())), @@ -166,10 +162,7 @@ mod targets { schema: Some(Schema::String(Default::default())), }; world_field.status.may_be_normal = true; - btreemap! { - "hello".into() => hello_field, - "world".into() => world_field, - } + [("hello".into(), hello_field), ("world".into(), world_field)].into() }; Schema::Struct { fields, @@ -200,12 +193,13 @@ mod targets { null_or_missing_field.status.may_be_normal = true; // No built-in null null_or_missing_field.status.may_be_missing = true; - btreemap! { - "hello".into() => hello_field, - "possibly_null".into() => possibly_null_field, - "possibly_missing".into() => possibly_missing_field, - "null_or_missing".into() => null_or_missing_field, - } + [ + ("hello".into(), hello_field), + ("possibly_null".into(), possibly_null_field), + ("possibly_missing".into(), possibly_missing_field), + ("null_or_missing".into(), null_or_missing_field), + ] + .into() }; let mut element_field = Field::with_schema(Schema::Struct { @@ -224,15 +218,13 @@ mod targets { sequence_field.status.may_be_normal = true; Schema::Struct { - fields: btreemap! { - "element".into() => sequence_field, - }, + fields: [("element".into(), sequence_field)].into(), context: Default::default(), } } pub fn map_struct_mixed_sequence() -> Schema { - let fields: BTreeMap = { + let fields: OrderMap = { let mut hello_field = Field::with_schema(Schema::String(Default::default())); // hello_field.status.may_be_normal = true; @@ -252,11 +244,12 @@ mod targets { }; sequence_field.status.may_be_normal = true; - btreemap! { - "hello".into() => hello_field, - "world".into() => world_field, - "sequence".into() => sequence_field, - } + [ + ("hello".into(), hello_field), + ("world".into(), world_field), + ("sequence".into(), sequence_field), + ] + .into() }; Schema::Struct { fields, diff --git a/schema_analysis/tests/target_json_schema_json_typegen.rs b/schema_analysis/tests/target_json_schema_json_typegen.rs index b1f1d96..30bf644 100644 --- a/schema_analysis/tests/target_json_schema_json_typegen.rs +++ b/schema_analysis/tests/target_json_schema_json_typegen.rs @@ -170,11 +170,11 @@ impl FormatTests for JSchema { "type": "object", "properties": { "hello": { "type": INTEGER }, + "world": { "type": "string" }, // json_typegen does not have a concept of union types. "mixed": {}, - "world": { "type": "string" }, }, - "required": [ "hello", "mixed", "world" ], + "required": [ "hello", "world", "mixed" ], } }) } @@ -187,12 +187,12 @@ impl FormatTests for JSchema { "type": "object", "properties": { "hello": { "type": INTEGER }, - // We don't know what it is when it's not null, so it might be anything. - "null_or_missing": {}, - "possibly_missing": { "type": "number" }, // json_typegen considers being of type "null" and missing equivalent, // so it's simply not required instead of required and both "string" and "null". - "possibly_null": { "type": "string" } + "possibly_null": { "type": "string" }, + "possibly_missing": { "type": "number" }, + // We don't know what it is when it's not null, so it might be anything. + "null_or_missing": {}, }, // FIXME: "null_or_missing" is included because json_typegen collapses // null/missing + no inference info into it, but checks only Shape::Optional @@ -207,14 +207,14 @@ impl FormatTests for JSchema { "title": SCHEMA_TITLE, "type": "object", "properties": { - "hello": { "type": INTEGER}, + "hello": { "type": INTEGER }, + "world": { "type": "string" }, "sequence": { "type": "array", "items": { "type": "string" }, }, - "world": { "type": "string" }, }, - "required": [ "hello", "sequence", "world" ], + "required": [ "hello", "world", "sequence" ], }) } fn map_struct_mixed_sequence_optional() -> Self::Value { @@ -224,6 +224,7 @@ impl FormatTests for JSchema { "type": "object", "properties": { "hello": { "type": INTEGER }, + "world": { "type": "string" }, "optional": {}, "sequence": { "type": "array", @@ -234,9 +235,8 @@ impl FormatTests for JSchema { // "items": { "type": [ "string", "null" ] }, "items": { "type": "string" }, }, - "world": { "type": "string" }, }, - "required": [ "hello", "optional", "sequence", "world" ], + "required": [ "hello", "world", "optional", "sequence" ], }) } } diff --git a/schema_analysis/tests/target_json_typegen_shape.rs b/schema_analysis/tests/target_json_typegen_shape.rs index f215284..8fe53b6 100644 --- a/schema_analysis/tests/target_json_typegen_shape.rs +++ b/schema_analysis/tests/target_json_typegen_shape.rs @@ -97,8 +97,8 @@ impl FormatTests for JsonTypegen { let mut fields = LinkedHashMap::new(); fields.insert("hello".to_string(), Shape::Integer); - fields.insert("mixed".to_string(), Shape::Any); fields.insert("world".to_string(), Shape::StringT); + fields.insert("mixed".to_string(), Shape::Any); Shape::VecT { elem_type: Box::new(Shape::Struct { fields }), @@ -109,15 +109,15 @@ impl FormatTests for JsonTypegen { let mut fields = LinkedHashMap::new(); fields.insert("hello".to_string(), Shape::Integer); - fields.insert("null_or_missing".to_string(), Shape::Null); - fields.insert( - "possibly_missing".to_string(), - Shape::Optional(Box::new(Shape::Floating)), - ); fields.insert( "possibly_null".to_string(), Shape::Optional(Box::new(Shape::StringT)), ); + fields.insert( + "possibly_missing".to_string(), + Shape::Optional(Box::new(Shape::Floating)), + ); + fields.insert("null_or_missing".to_string(), Shape::Null); Shape::VecT { elem_type: Box::new(Shape::Struct { fields }), @@ -128,13 +128,13 @@ impl FormatTests for JsonTypegen { let mut fields = LinkedHashMap::new(); fields.insert("hello".to_string(), Shape::Integer); + fields.insert("world".to_string(), Shape::StringT); fields.insert( "sequence".to_string(), Shape::VecT { elem_type: Box::new(Shape::StringT), }, ); - fields.insert("world".to_string(), Shape::StringT); Shape::Struct { fields } } @@ -143,6 +143,7 @@ impl FormatTests for JsonTypegen { let mut fields = LinkedHashMap::new(); fields.insert("hello".to_string(), Shape::Integer); + fields.insert("world".to_string(), Shape::StringT); fields.insert("optional".to_string(), Shape::Null); fields.insert( "sequence".to_string(), @@ -150,7 +151,6 @@ impl FormatTests for JsonTypegen { elem_type: Box::new(Shape::Optional(Box::new(Shape::StringT))), }, ); - fields.insert("world".to_string(), Shape::StringT); Shape::Struct { fields } }