diff --git a/schema_analysis/src/analysis/field.rs b/schema_analysis/src/analysis/field.rs index 2f6f5b6..107d7bc 100644 --- a/schema_analysis/src/analysis/field.rs +++ b/schema_analysis/src/analysis/field.rs @@ -1,36 +1,39 @@ +use std::marker::PhantomData; + use serde::de::{DeserializeSeed, Error, Visitor}; use crate::Field; use super::{schema::SchemaVisitor, schema_seed::SchemaVisitorSeed, Context}; -pub struct FieldVisitor<'s> { - pub context: &'s Context, +pub(super) struct InferredField { + _marker: PhantomData, } - -impl<'de> DeserializeSeed<'de> for FieldVisitor<'_> { - type Value = Field; +impl InferredField { + pub fn new() -> Self { + Self { + _marker: PhantomData, + } + } +} +impl<'de, C: Context> DeserializeSeed<'de> for InferredField { + type Value = Field; fn deserialize(self, deserializer: D) -> Result where D: serde::Deserializer<'de>, { let mut field = Field::default(); - deserializer.deserialize_any(FieldVisitorSeed { - context: self.context, - field: &mut field, - })?; - + deserializer.deserialize_any(InferredFieldSeed { field: &mut field })?; Ok(field) } } -pub struct FieldVisitorSeed<'s> { - pub context: &'s Context, - pub field: &'s mut Field, +// NOTE: this is also the [Visitor] for convenience. +pub(super) struct InferredFieldSeed<'s, C: Context> { + pub(super) field: &'s mut Field, } - -impl<'de> DeserializeSeed<'de> for FieldVisitorSeed<'_> { +impl<'de, C: Context> DeserializeSeed<'de> for InferredFieldSeed<'_, C> { type Value = (); fn deserialize(self, deserializer: D) -> Result @@ -40,7 +43,6 @@ impl<'de> DeserializeSeed<'de> for FieldVisitorSeed<'_> { deserializer.deserialize_any(self) } } - macro_rules! method_impl { ($method_name:ident, $type:ty) => { fn $method_name(self, value: $type) -> Result { @@ -48,18 +50,11 @@ macro_rules! method_impl { // If a schema is already present, then we can use it as seed and let // the schema side of things take care of the rest. Some(schema) => { - let () = SchemaVisitorSeed { - context: self.context, - schema, - } - .$method_name(value)?; + let () = SchemaVisitorSeed { schema }.$method_name(value)?; } // Otherwise we need to generate a new schema. None => { - let schema = SchemaVisitor { - context: self.context, - } - .$method_name(value)?; + let schema = SchemaVisitor::new().$method_name(value)?; self.field.schema = Some(schema); } } @@ -70,8 +65,7 @@ macro_rules! method_impl { } }; } - -impl<'de> Visitor<'de> for FieldVisitorSeed<'_> { +impl<'de, C: Context> Visitor<'de> for InferredFieldSeed<'_, C> { type Value = (); fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { @@ -162,17 +156,10 @@ impl<'de> Visitor<'de> for FieldVisitorSeed<'_> { { match &mut self.field.schema { Some(schema) => { - SchemaVisitorSeed { - context: self.context, - schema, - } - .visit_seq(seq)?; + SchemaVisitorSeed { schema }.visit_seq(seq)?; } None => { - let schema = SchemaVisitor { - context: self.context, - } - .visit_seq(seq)?; + let schema = SchemaVisitor::new().visit_seq(seq)?; self.field.schema = Some(schema); } } @@ -186,17 +173,10 @@ impl<'de> Visitor<'de> for FieldVisitorSeed<'_> { { match &mut self.field.schema { Some(schema) => { - SchemaVisitorSeed { - context: self.context, - schema, - } - .visit_map(map)?; + SchemaVisitorSeed { schema }.visit_map(map)?; } None => { - let schema = SchemaVisitor { - context: self.context, - } - .visit_map(map)?; + let schema = SchemaVisitor::new().visit_map(map)?; self.field.schema = Some(schema); } } diff --git a/schema_analysis/src/analysis/mod.rs b/schema_analysis/src/analysis/mod.rs index cb1186d..35ae692 100644 --- a/schema_analysis/src/analysis/mod.rs +++ b/schema_analysis/src/analysis/mod.rs @@ -170,13 +170,15 @@ let _: Visitor::Value = deserializer.deserialize_str(visitor); `[...]` */ -use once_cell::sync::Lazy; -use serde::{de::DeserializeSeed, Deserialize, Deserializer}; +use serde::{de::DeserializeSeed, Deserialize}; #[allow(unused_imports)] use serde::de::Visitor; // For docs above. -use crate::{Coalesce, Context, Schema}; +use crate::{ + context::{Context, DefaultContext}, + Coalesce, Schema, +}; mod field; mod schema; @@ -185,44 +187,37 @@ mod schema_seed; use schema::SchemaVisitor; use schema_seed::SchemaVisitorSeed; -/// Since the context is never modified, we can store a default to avoid creating a new one -/// each time. -static DEFAULT_CONTEXT: Lazy = Lazy::new(Context::default); - /** [InferredSchema] is at the heart of this crate, it is a wrapper around [Schema] that interfaces with the analysis code. It implements both [Deserialize] and [DeserializeSeed] to allow for analysis both when no schema is yet available and when we wish to expand an existing schema (for data across files, for example). */ -#[derive(Debug, Clone, PartialEq)] -pub struct InferredSchema { +pub struct InferredSchema { /// Where the juicy info lays. - pub schema: Schema, + pub schema: Schema, } -impl Coalesce for InferredSchema { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { +impl Coalesce for InferredSchema +where + Schema: Coalesce, +{ + fn coalesce(&mut self, other: Self) { self.schema.coalesce(other.schema) } } -// (no schema + no context) -> (schema + no context) -impl<'de> Deserialize<'de> for InferredSchema { +impl<'de, C: Context + Default> Deserialize<'de> for InferredSchema { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { - let visitor = SchemaVisitor { - context: &DEFAULT_CONTEXT, - }; - let schema = deserializer.deserialize_any(visitor)?; + let schema = deserializer.deserialize_any(SchemaVisitor::new())?; Ok(InferredSchema { schema }) } } -// (schema + no context) -> (schema + no context) -impl<'de> DeserializeSeed<'de> for &mut InferredSchema { +impl<'de, C: Context> DeserializeSeed<'de> for &mut InferredSchema +where + Schema: Coalesce, +{ type Value = (); fn deserialize(self, deserializer: D) -> Result @@ -230,7 +225,6 @@ impl<'de> DeserializeSeed<'de> for &mut InferredSchema { D: serde::Deserializer<'de>, { let visitor = SchemaVisitorSeed { - context: &DEFAULT_CONTEXT, schema: &mut self.schema, }; deserializer.deserialize_any(visitor)?; @@ -238,63 +232,42 @@ impl<'de> DeserializeSeed<'de> for &mut InferredSchema { } } -/** -[InferredSchemaWithContext] is an experimental feature that allows the user to provide a custom -context. - -It is meant to be used along with [Aggregators](crate::context::Aggregators) holding -custom aggregators as trait objects. -To use it, construct a [Default] [Context] and push custom aggregators to the `other_aggregators` -fields present on some sub-contexts like [StringContext](crate::context::StringContext). The -custom aggregator will need to implement [CoalescingAggregator](crate::traits::CoalescingAggregator). - */ -#[derive(Debug, Clone, PartialEq)] -pub struct InferredSchemaWithContext { - /// The schema holds the actual description of the data. - pub schema: Schema, - /// The context may be user-provided with additional aggregators. - pub context: Context, -} -impl Coalesce for InferredSchemaWithContext { - fn coalesce(&mut self, other: Self) +mod boilerplate { + use std::fmt; + + use crate::{context::Context, Schema}; + + use super::InferredSchema; + + // Auto-generated, with bounds changed. (TODO: use perfect derive.) + impl fmt::Debug for InferredSchema where - Self: Sized, + Schema: fmt::Debug, { - self.schema.coalesce(other.schema); + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("InferredSchema") + .field("schema", &self.schema) + .finish() + } } -} -// (schema + context) -> (schema + context) -impl<'de> DeserializeSeed<'de> for &mut InferredSchemaWithContext { - type Value = (); - - fn deserialize(self, deserializer: D) -> Result + // Auto-generated, with bounds changed. (TODO: use perfect derive.) + impl Clone for InferredSchema where - D: serde::Deserializer<'de>, + Schema: Clone, { - let visitor = SchemaVisitorSeed { - context: &self.context, - schema: &mut self.schema, - }; - deserializer.deserialize_any(visitor)?; - Ok(()) + fn clone(&self) -> Self { + Self { + schema: self.schema.clone(), + } + } } -} -// (no schema + context) -> (schema + context) -impl Context { - /// Deserialization of a new schema using a context, returns a [InferredSchemaWithContext] that - /// can be used to deserialize further files and reuse the context. - pub fn deserialize_schema<'de, D>( - self, - deserializer: D, - ) -> Result + // Auto-generated, with bounds changed. (TODO: use perfect derive.) + impl PartialEq for InferredSchema where - D: Deserializer<'de>, + Schema: PartialEq, { - let visitor = SchemaVisitor { context: &self }; - let schema = deserializer.deserialize_any(visitor)?; - Ok(InferredSchemaWithContext { - context: self, - schema, - }) + fn eq(&self, other: &Self) -> bool { + self.schema == other.schema + } } } diff --git a/schema_analysis/src/analysis/schema.rs b/schema_analysis/src/analysis/schema.rs index d1f42a6..9f1742e 100644 --- a/schema_analysis/src/analysis/schema.rs +++ b/schema_analysis/src/analysis/schema.rs @@ -1,50 +1,58 @@ +use std::marker::PhantomData; + use ordermap::OrderMap; use serde::de::{Error, Visitor}; -use crate::{Aggregate, Field, Schema}; +use crate::{traits::Aggregate, Field, Schema}; use super::{ - field::{FieldVisitor, FieldVisitorSeed}, + field::{InferredField, InferredFieldSeed}, Context, }; -pub struct SchemaVisitor<'s> { - pub context: &'s Context, +pub(super) struct SchemaVisitor { + _marker: PhantomData, } - -impl<'de> Visitor<'de> for SchemaVisitor<'_> { - type Value = Schema; +impl SchemaVisitor { + pub fn new() -> Self { + Self { + _marker: PhantomData, + } + } +} +impl<'de, C: Context> Visitor<'de> for SchemaVisitor { + type Value = Schema; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { formatter.write_str("anything") } fn visit_bool(self, value: bool) -> Result { - let mut aggregators = self.context.for_boolean(); + let mut aggregators = C::Boolean::default(); aggregators.aggregate(&value); Ok(Schema::Boolean(aggregators)) } fn visit_i128(self, value: i128) -> Result { - let mut aggregators = self.context.for_integer(); + let mut aggregators = C::Integer::default(); aggregators.aggregate(&value); Ok(Schema::Integer(aggregators)) } fn visit_f64(self, value: f64) -> Result { - let mut aggregators = self.context.for_float(); + let mut aggregators = C::Float::default(); aggregators.aggregate(&value); Ok(Schema::Float(aggregators)) } fn visit_borrowed_str(self, value: &'de str) -> Result { - let mut aggregators = self.context.for_string(); + let mut aggregators = C::String::default(); aggregators.aggregate(value); Ok(Schema::String(aggregators)) } fn visit_borrowed_bytes(self, value: &'de [u8]) -> Result { - let mut aggregators = self.context.for_bytes(); + let mut aggregators = C::Bytes::default(); aggregators.aggregate(value); Ok(Schema::Bytes(aggregators)) @@ -104,7 +112,7 @@ impl<'de> Visitor<'de> for SchemaVisitor<'_> { /// This method should only be called if the Null value is at the root of the document, /// because otherwise null values are handled at the field level. fn visit_none(self) -> Result { - let mut aggregators = self.context.for_null(); + let mut aggregators = C::Null::default(); aggregators.aggregate(&()); Ok(Schema::Null(aggregators)) @@ -133,18 +141,15 @@ impl<'de> Visitor<'de> for SchemaVisitor<'_> { { let mut count = 0; - let initial_seed = FieldVisitor { - context: self.context, - }; + let initial_seed = InferredField::new(); let mut field = match seq.next_element_seed(initial_seed)? { Some(mut field) => { count += 1; - while let Some(()) = seq.next_element_seed(FieldVisitorSeed { - context: self.context, - field: &mut field, - })? { + while let Some(()) = + seq.next_element_seed(InferredFieldSeed { field: &mut field })? + { count += 1; } @@ -158,7 +163,7 @@ impl<'de> Visitor<'de> for SchemaVisitor<'_> { field.status.may_be_missing = true; } - let mut aggregators = self.context.for_sequence(); + let mut aggregators = C::Sequence::default(); aggregators.aggregate(&count); Ok(Schema::Sequence { @@ -172,22 +177,17 @@ impl<'de> Visitor<'de> for SchemaVisitor<'_> { A: serde::de::MapAccess<'de>, { let mut keys = Vec::new(); - let mut fields: OrderMap = OrderMap::new(); + let mut fields: OrderMap> = OrderMap::new(); while let Some(key) = map.next_key::()? { match fields.get_mut(&key) { Some(old_field) => { - map.next_value_seed(FieldVisitorSeed { - context: self.context, - field: old_field, - })?; + map.next_value_seed(InferredFieldSeed { field: old_field })?; old_field.status.allow_duplicates(true); } None => { - let new_field = map.next_value_seed(FieldVisitor { - context: self.context, - })?; + let new_field = map.next_value_seed(InferredField::new())?; fields.insert(key.clone(), new_field); } } @@ -195,7 +195,7 @@ impl<'de> Visitor<'de> for SchemaVisitor<'_> { keys.push(key.clone()); } - let mut aggregators = self.context.for_map_struct(); + let mut aggregators = C::Struct::default(); aggregators.aggregate(&keys); Ok(Schema::Struct { diff --git a/schema_analysis/src/analysis/schema_seed.rs b/schema_analysis/src/analysis/schema_seed.rs index 13f03ca..1448aca 100644 --- a/schema_analysis/src/analysis/schema_seed.rs +++ b/schema_analysis/src/analysis/schema_seed.rs @@ -1,19 +1,20 @@ use serde::de::{Error, Visitor}; -use crate::{traits::Coalesce, Aggregate, Schema}; +use crate::{traits::Aggregate, traits::Coalesce, Schema}; use super::{ - field::{FieldVisitor, FieldVisitorSeed}, + field::{InferredField, InferredFieldSeed}, schema::SchemaVisitor, Context, }; -pub struct SchemaVisitorSeed<'s> { - pub context: &'s Context, - pub schema: &'s mut Schema, +pub(super) struct SchemaVisitorSeed<'s, C: Context> { + pub(super) schema: &'s mut Schema, } - -impl<'de> Visitor<'de> for SchemaVisitorSeed<'_> { +impl<'de, C: Context> Visitor<'de> for SchemaVisitorSeed<'_, C> +where + Schema: Coalesce, +{ type Value = (); fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { @@ -26,10 +27,7 @@ impl<'de> Visitor<'de> for SchemaVisitorSeed<'_> { Schema::Boolean(aggregators) => aggregators.aggregate(&value), // Extend a different schema schema => { - let new_schema = SchemaVisitor { - context: self.context, - } - .visit_bool(value)?; + let new_schema = SchemaVisitor::new().visit_bool(value)?; schema.coalesce(new_schema); } @@ -42,10 +40,7 @@ impl<'de> Visitor<'de> for SchemaVisitorSeed<'_> { Schema::Integer(aggregators) => aggregators.aggregate(&value), // Extend a different schema schema => { - let new_schema = SchemaVisitor { - context: self.context, - } - .visit_i128(value)?; + let new_schema = SchemaVisitor::new().visit_i128(value)?; schema.coalesce(new_schema); } @@ -58,10 +53,7 @@ impl<'de> Visitor<'de> for SchemaVisitorSeed<'_> { Schema::Float(aggregators) => aggregators.aggregate(&value), // Extend a different schema schema => { - let new_schema = SchemaVisitor { - context: self.context, - } - .visit_f64(value)?; + let new_schema = SchemaVisitor::new().visit_f64(value)?; schema.coalesce(new_schema); } @@ -74,10 +66,7 @@ impl<'de> Visitor<'de> for SchemaVisitorSeed<'_> { Schema::String(aggregators) => aggregators.aggregate(value), // Extend a different schema schema => { - let new_schema = SchemaVisitor { - context: self.context, - } - .visit_borrowed_str(value)?; + let new_schema = SchemaVisitor::new().visit_borrowed_str(value)?; schema.coalesce(new_schema); } @@ -90,10 +79,7 @@ impl<'de> Visitor<'de> for SchemaVisitorSeed<'_> { Schema::Bytes(aggregators) => aggregators.aggregate(value), // Extend a different schema schema => { - let new_schema = SchemaVisitor { - context: self.context, - } - .visit_borrowed_bytes(value)?; + let new_schema = SchemaVisitor::new().visit_borrowed_bytes(value)?; schema.coalesce(new_schema); } @@ -162,10 +148,7 @@ impl<'de> Visitor<'de> for SchemaVisitorSeed<'_> { } // Extend a different schema schema => { - let new_schema = SchemaVisitor { - context: self.context, - } - .visit_none()?; + let new_schema = SchemaVisitor::new().visit_none()?; schema.coalesce(new_schema); } @@ -204,10 +187,7 @@ impl<'de> Visitor<'de> for SchemaVisitorSeed<'_> { } => { let field = boxed_field.as_mut(); - while let Some(()) = seq.next_element_seed(FieldVisitorSeed { - context: self.context, - field, - })? { + while let Some(()) = seq.next_element_seed(InferredFieldSeed { field })? { count += 1; } @@ -219,10 +199,7 @@ impl<'de> Visitor<'de> for SchemaVisitorSeed<'_> { } // Extend a different schema schema => { - let sequence_schema = SchemaVisitor { - context: self.context, - } - .visit_seq(seq)?; + let sequence_schema = SchemaVisitor::new().visit_seq(seq)?; schema.coalesce(sequence_schema); } }; @@ -243,16 +220,11 @@ impl<'de> Visitor<'de> for SchemaVisitorSeed<'_> { match fields.get_mut(&key) { Some(old_field) => { old_field.status.allow_duplicates(keys.contains(&key)); - map.next_value_seed(FieldVisitorSeed { - context: self.context, - field: old_field, - })?; + map.next_value_seed(InferredFieldSeed { field: old_field })?; } None => { - let mut new_field = map.next_value_seed(FieldVisitor { - context: self.context, - })?; + let mut new_field = map.next_value_seed(InferredField::new())?; // If we are adding it to an existing schema it means that it was // missing when this schema was created. new_field.status.may_be_missing = true; @@ -273,10 +245,7 @@ impl<'de> Visitor<'de> for SchemaVisitorSeed<'_> { aggregators.aggregate(&keys); } schema => { - let sequence_schema = SchemaVisitor { - context: self.context, - } - .visit_map(map)?; + let sequence_schema = SchemaVisitor::new().visit_map(map)?; schema.coalesce(sequence_schema); } } diff --git a/schema_analysis/src/context/aggregators.rs b/schema_analysis/src/context/aggregators.rs deleted file mode 100644 index 81c15f7..0000000 --- a/schema_analysis/src/context/aggregators.rs +++ /dev/null @@ -1,52 +0,0 @@ -use std::{any::Any, fmt::Debug}; - -use crate::{Aggregate, Coalesce, CoalescingAggregator}; - -/// A collection of aggregators that should allow the user of the library to run arbitrary -/// aggregation code on the data as it is being analyzed. -/// -/// This is an experimental feature. -#[derive(Debug)] -pub struct Aggregators(pub Vec>>); - -impl Aggregate for Aggregators { - fn aggregate(&mut self, value: &'_ V) { - for a in &mut self.0 { - a.aggregate(value) - } - } -} -impl Coalesce for Aggregators { - fn coalesce(&mut self, other: Aggregators) - where - Self: Sized, - { - 'outer: for o in other.0 { - let mut o: Box = o.into_any(); - for s in &mut self.0 { - // coalesce_any returns the value if it doesn't manage to coalesce it. - o = match s.coalesce_any(o) { - Some(o) => o, - None => continue 'outer, - } - } - let o = *o.downcast::>>().unwrap(); - self.0.push(o); - } - } -} -impl Clone for Aggregators { - fn clone(&self) -> Self { - Aggregators(self.0.clone()) - } -} -impl Default for Aggregators { - fn default() -> Self { - Self(Default::default()) - } -} -impl From>>> for Aggregators { - fn from(value: Vec>>) -> Self { - Self(value) - } -} diff --git a/schema_analysis/src/context/boolean.rs b/schema_analysis/src/context/boolean.rs index 2aacb30..68d5a7b 100644 --- a/schema_analysis/src/context/boolean.rs +++ b/schema_analysis/src/context/boolean.rs @@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize}; -use crate::{traits::Coalesce, Aggregate}; +use crate::{traits::Aggregate, traits::Coalesce}; use super::Counter; @@ -22,10 +22,7 @@ impl Aggregate for BooleanContext { } } impl Coalesce for BooleanContext { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { + fn coalesce(&mut self, other: Self) { self.count.coalesce(other.count); self.trues.coalesce(other.trues); self.falses.coalesce(other.falses); diff --git a/schema_analysis/src/context/bytes.rs b/schema_analysis/src/context/bytes.rs index 3c01055..2e82a96 100644 --- a/schema_analysis/src/context/bytes.rs +++ b/schema_analysis/src/context/bytes.rs @@ -2,37 +2,28 @@ use serde::{Deserialize, Serialize}; -use crate::{traits::Coalesce, Aggregate}; +use crate::{traits::Aggregate, traits::Coalesce}; -use super::{shared::Counter, shared::MinMax, Aggregators}; +use super::{shared::Counter, shared::MinMax}; #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct BytesContext { pub count: Counter, pub min_max_length: MinMax, - #[serde(skip)] - pub other_aggregators: Aggregators<[u8]>, } impl Aggregate<[u8]> for BytesContext { fn aggregate(&mut self, value: &'_ [u8]) { self.count.aggregate(value); self.min_max_length.aggregate(&value.len()); - self.other_aggregators.aggregate(value); } } impl Coalesce for BytesContext { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { + fn coalesce(&mut self, other: Self) { self.count.coalesce(other.count); self.min_max_length.coalesce(other.min_max_length); - self.other_aggregators.coalesce(other.other_aggregators); } } impl PartialEq for BytesContext { - /// NOTE: [BytesContext]'s [PartialEq] implementation ignores the `other_aggregators` - /// provided by the user of the library. fn eq(&self, other: &Self) -> bool { self.count == other.count && self.min_max_length == other.min_max_length } diff --git a/schema_analysis/src/context/map_struct.rs b/schema_analysis/src/context/map_struct.rs index f825e89..d072e7c 100644 --- a/schema_analysis/src/context/map_struct.rs +++ b/schema_analysis/src/context/map_struct.rs @@ -2,15 +2,13 @@ use serde::{Deserialize, Serialize}; -use crate::{traits::Coalesce, Aggregate}; +use crate::{traits::Aggregate, traits::Coalesce}; -use super::{Aggregators, Counter}; +use super::Counter; #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct MapStructContext { pub count: Counter, - #[serde(skip)] - pub other_aggregators: Aggregators<[String]>, } impl Aggregate<[String]> for MapStructContext { fn aggregate(&mut self, value: &[String]) { @@ -18,16 +16,11 @@ impl Aggregate<[String]> for MapStructContext { } } impl Coalesce for MapStructContext { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { + fn coalesce(&mut self, other: Self) { self.count.coalesce(other.count); } } impl PartialEq for MapStructContext { - /// NOTE: [MapStructContext]'s [PartialEq] implementation ignores the `other_aggregators` - /// provided by the user of the library. fn eq(&self, other: &Self) -> bool { self.count == other.count } diff --git a/schema_analysis/src/context/mod.rs b/schema_analysis/src/context/mod.rs index d51be27..c678046 100644 --- a/schema_analysis/src/context/mod.rs +++ b/schema_analysis/src/context/mod.rs @@ -1,6 +1,7 @@ -//! The [Context] provides a way to store information about the types found during analysis. +//! A [Context] provides a way to store information about the types found during analysis. +//! +//! [DefaultContext] is the one used by default. `()` can be used to skip any additional analysis. -mod aggregators; mod boolean; mod bytes; mod map_struct; @@ -10,7 +11,6 @@ mod sequence; mod shared; mod string; -pub use aggregators::Aggregators; pub use boolean::BooleanContext; pub use bytes::BytesContext; pub use map_struct::MapStructContext; @@ -21,64 +21,60 @@ pub use shared::{Counter, CountingSet}; pub use string::{SemanticExtractor, StringContext, SuspiciousStrings}; use serde::{Deserialize, Serialize}; +use std::fmt::Debug; -/// The Context holds a fresh copy of the context that each [Schema](crate::Schema) -/// copies when it's first created and then fills as the analysis proceeds. +use crate::{traits::Aggregate, Coalesce}; + +/// Interface describing the custom analysis that will be run on each type +/// alongside the schema shape. /// -/// All default context should respect a constant memory bound on each node. -/// This will allow analysis of arbitraryly large amounts of data as long as the schema does not -/// grow out of proportion. -#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)] -pub struct Context { - /// The context for null values. - pub null: NullContext, - /// The context for boolean values. - pub boolean: BooleanContext, - /// The context for integer values. - pub integer: NumberContext, - /// The context for floating point values. - pub float: NumberContext, - /// The context for string values. - pub string: StringContext, - /// The context for bytes values. - pub bytes: BytesContext, - /// The context for sequence values. - pub sequence: SequenceContext, - /// The context for struct values. - pub map_struct: MapStructContext, +/// For no analysis, you can use `()`. [DefaultContext] is the one used by default. +pub trait Context { + /// The state for the analysis run on null values. + type Null: Aggregate<()> + Coalesce + Default; + /// The state for the analysis run on boolean values. + type Boolean: Aggregate + Coalesce + Default; + /// The state for the analysis run on integer values. + type Integer: Aggregate + Coalesce + Default; + /// The state for the analysis run on floating point values. + type Float: Aggregate + Coalesce + Default; + /// The state for the analysis run on strings. + type String: Aggregate + Coalesce + Default; + /// The state for the analysis run on binary data. + type Bytes: Aggregate<[u8]> + Coalesce + Default; + /// The state for the analysis run on sequence values. + type Sequence: Aggregate + Coalesce + Default; + /// The state for the analysis run on struct values. + type Struct: Aggregate<[String]> + Coalesce + Default; +} + +impl Context for () { + type Null = (); + type Boolean = (); + type Integer = (); + type Float = (); + type String = (); + type Bytes = (); + type Sequence = (); + type Struct = (); } -impl Context { - /// Returns a fresh context for null schemas. - pub fn for_null(&self) -> NullContext { - self.null.clone() - } - /// Returns a fresh context for boolean schemas. - pub fn for_boolean(&self) -> BooleanContext { - self.boolean.clone() - } - /// Returns a fresh context for integer schemas. - pub fn for_integer(&self) -> NumberContext { - self.integer.clone() - } - /// Returns a fresh context for floating point schemas. - pub fn for_float(&self) -> NumberContext { - self.float.clone() - } - /// Returns a fresh context for string schemas. - pub fn for_string(&self) -> StringContext { - self.string.clone() - } - /// Returns a fresh context for bytes schemas. - pub fn for_bytes(&self) -> BytesContext { - self.bytes.clone() - } - /// Returns a fresh context for sequence schemas. - pub fn for_sequence(&self) -> SequenceContext { - self.sequence.clone() - } - /// Returns a fresh context for struct schemas. - pub fn for_map_struct(&self) -> MapStructContext { - self.map_struct.clone() - } +/// This is the default [Context]. +/// It performs some basic analysis like counting and sampling. +/// +/// This context has a memory bound for each node. +/// This allows the analysis of arbitraryly large amounts of data as long +/// as the schema itself does not grow out of proportion. +/// (Do note that sampling might still be very large if individual leaves are large.) +#[derive(Debug, Clone, Copy, PartialEq, Default, Serialize, Deserialize)] +pub struct DefaultContext; +impl Context for DefaultContext { + type Null = NullContext; + type Boolean = BooleanContext; + type Integer = NumberContext; + type Float = NumberContext; + type String = StringContext; + type Bytes = BytesContext; + type Sequence = SequenceContext; + type Struct = MapStructContext; } diff --git a/schema_analysis/src/context/null.rs b/schema_analysis/src/context/null.rs index 6e57c9a..eb95e57 100644 --- a/schema_analysis/src/context/null.rs +++ b/schema_analysis/src/context/null.rs @@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize}; -use crate::{traits::Coalesce, Aggregate}; +use crate::{traits::Aggregate, traits::Coalesce}; use super::Counter; @@ -16,10 +16,7 @@ impl Aggregate<()> for NullContext { } } impl Coalesce for NullContext { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { + fn coalesce(&mut self, other: Self) { self.count.coalesce(other.count); } } diff --git a/schema_analysis/src/context/number.rs b/schema_analysis/src/context/number.rs index 9e3723c..016e460 100644 --- a/schema_analysis/src/context/number.rs +++ b/schema_analysis/src/context/number.rs @@ -1,8 +1,10 @@ #![allow(missing_docs)] +use std::fmt::Debug; + use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use crate::{traits::Coalesce, Aggregate, Aggregators}; +use crate::{traits::Aggregate, traits::Coalesce}; use super::{ shared::{MinMax, Sampler}, @@ -20,15 +22,12 @@ pub struct NumberContext { pub samples: Sampler, #[serde(flatten)] pub min_max: MinMax, - #[serde(skip)] - pub other_aggregators: Aggregators, } impl Aggregate for NumberContext { fn aggregate(&mut self, value: &i128) { self.count.aggregate(value); self.samples.aggregate(value); self.min_max.aggregate(value); - self.other_aggregators.aggregate(value); } } impl Aggregate for NumberContext { @@ -38,23 +37,16 @@ impl Aggregate for NumberContext { if !value.is_nan() { self.min_max.aggregate(value); } - self.other_aggregators.aggregate(value); } } -impl Coalesce for NumberContext { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { +impl Coalesce for NumberContext { + fn coalesce(&mut self, other: Self) { self.count.coalesce(other.count); self.samples.coalesce(other.samples); self.min_max.coalesce(other.min_max); - self.other_aggregators.coalesce(other.other_aggregators); } } impl PartialEq for NumberContext { - /// NOTE: [NumberContext]'s [PartialEq] implementation ignores the `other_aggregators` - /// provided by the user of the library. fn eq(&self, other: &Self) -> bool { self.count == other.count && self.min_max == other.min_max } diff --git a/schema_analysis/src/context/sequence.rs b/schema_analysis/src/context/sequence.rs index 6bd0f05..7354375 100644 --- a/schema_analysis/src/context/sequence.rs +++ b/schema_analysis/src/context/sequence.rs @@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize}; -use crate::{traits::Coalesce, Aggregate, Aggregators}; +use crate::{traits::Aggregate, traits::Coalesce}; use super::{shared::MinMax, Counter}; @@ -10,29 +10,20 @@ use super::{shared::MinMax, Counter}; pub struct SequenceContext { pub count: Counter, pub length: MinMax, - #[serde(skip)] - pub other_aggregators: Aggregators, } impl Aggregate for SequenceContext { fn aggregate(&mut self, value: &usize) { self.count.aggregate(value); self.length.aggregate(value); - self.other_aggregators.aggregate(value); } } impl Coalesce for SequenceContext { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { + fn coalesce(&mut self, other: Self) { self.count.coalesce(other.count); self.length.coalesce(other.length); - self.other_aggregators.coalesce(other.other_aggregators); } } impl PartialEq for SequenceContext { - /// NOTE: [SequenceContext]'s [PartialEq] implementation ignores the `other_aggregators` - /// provided by the user of the library. fn eq(&self, other: &Self) -> bool { self.count == other.count && self.length == other.length } diff --git a/schema_analysis/src/context/shared.rs b/schema_analysis/src/context/shared.rs index 34fc355..0a219a5 100644 --- a/schema_analysis/src/context/shared.rs +++ b/schema_analysis/src/context/shared.rs @@ -5,7 +5,7 @@ use std::{ use serde::{Deserialize, Serialize}; -use crate::{traits::Coalesce, Aggregate}; +use crate::{traits::Aggregate, traits::Coalesce}; // // Counter @@ -20,10 +20,7 @@ impl Aggregate for Counter { } } impl Coalesce for Counter { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { + fn coalesce(&mut self, other: Self) { self.0 += other.0; } } @@ -67,10 +64,7 @@ impl CountingSet { } } impl Coalesce for CountingSet { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { + fn coalesce(&mut self, other: Self) { for (k, v) in other.0 { let s: &mut usize = self.0.entry(k).or_insert(0); *s += v; @@ -111,10 +105,7 @@ impl Aggregate for MinMax { } } impl Coalesce for MinMax { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { + fn coalesce(&mut self, other: Self) { if let Some(other_min) = other.min { self.aggregate(&other_min); } @@ -150,10 +141,7 @@ where } } impl Coalesce for Sampler { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { + fn coalesce(&mut self, other: Self) { self.values.extend(other.values); if self.values.len() > MAX_SAMPLE_COUNT { self.is_exaustive = false; diff --git a/schema_analysis/src/context/string.rs b/schema_analysis/src/context/string.rs index 8f2f86f..98df8db 100644 --- a/schema_analysis/src/context/string.rs +++ b/schema_analysis/src/context/string.rs @@ -6,12 +6,9 @@ use once_cell::sync::Lazy; use regex::Regex; use serde::{Deserialize, Serialize}; -use crate::{traits::Coalesce, Aggregate}; +use crate::{traits::Aggregate, traits::Coalesce}; -use super::{ - shared::{Counter, CountingSet, MinMax, Sampler}, - Aggregators, -}; +use super::shared::{Counter, CountingSet, MinMax, Sampler}; #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct StringContext { @@ -24,8 +21,6 @@ pub struct StringContext { #[serde(default, skip_serializing_if = "SemanticExtractor::is_empty")] pub semantic_extractor: SemanticExtractor, pub min_max_length: MinMax, - #[serde(skip)] - pub other_aggregators: Aggregators, } impl Aggregate for StringContext { fn aggregate(&mut self, value: &'_ str) { @@ -34,25 +29,18 @@ impl Aggregate for StringContext { self.suspicious_strings.aggregate(value); self.semantic_extractor.aggregate(value); self.min_max_length.aggregate(&value.len()); - self.other_aggregators.aggregate(value); } } impl Coalesce for StringContext { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { + fn coalesce(&mut self, other: Self) { self.count.coalesce(other.count); self.samples.coalesce(other.samples); self.suspicious_strings.coalesce(other.suspicious_strings); self.semantic_extractor.coalesce(other.semantic_extractor); self.min_max_length.coalesce(other.min_max_length); - self.other_aggregators.coalesce(other.other_aggregators); } } impl PartialEq for StringContext { - /// NOTE: [StringContext]'s [PartialEq] implementation ignores the `other_aggregators` - /// provided by the user of the library. fn eq(&self, other: &Self) -> bool { self.count == other.count && self.samples == other.samples @@ -87,10 +75,7 @@ impl Aggregate for SuspiciousStrings { } } impl Coalesce for SuspiciousStrings { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { + fn coalesce(&mut self, other: Self) { self.0.coalesce(other.0); } } @@ -137,10 +122,7 @@ impl Aggregate for SemanticExtractor { } } impl Coalesce for SemanticExtractor { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { + fn coalesce(&mut self, other: Self) { self.0.coalesce(other.0); } } diff --git a/schema_analysis/src/helpers.rs b/schema_analysis/src/helpers.rs index c144228..839845a 100644 --- a/schema_analysis/src/helpers.rs +++ b/schema_analysis/src/helpers.rs @@ -4,14 +4,16 @@ pub mod xml { //! A module for xml cleaning helper functions. //! Check individual functions for details. - use crate::{Field, Schema}; + use std::mem; + + use crate::{context::Context, Field, Schema}; /// A wrapper function that applies all XML cleaning transformations. /// /// [clean_solitary_nested_values] /// + [turn_duplicates_into_sequence_field] /// + [clean_empty_structs_in_field] - pub fn cleanup_xml_schema(schema: &mut Schema) { + pub fn cleanup_xml_schema(schema: &mut Schema) { clean_solitary_nested_values(schema); turn_duplicates_into_sequence_field(schema); clean_empty_structs_in_field(schema); @@ -22,7 +24,7 @@ pub mod xml { /// /// This function simply finds [Schema::Struct]s with a single field named `$value` and /// replaces them with the schema inside the `$value` field. - pub fn clean_solitary_nested_values(schema: &mut Schema) { + pub fn clean_solitary_nested_values(schema: &mut Schema) { use Schema::*; match schema { Null(_) | Boolean(_) | Integer(_) | Float(_) | String(_) | Bytes(_) => {} @@ -62,10 +64,10 @@ pub mod xml { /// /// To help with this the inference software annotates duplicate fields, and this function /// takes the schema in that field and places it into a [Schema::Sequence]. - pub fn turn_duplicates_into_sequence_field(schema: &mut Schema) { + pub fn turn_duplicates_into_sequence_field(schema: &mut Schema) { clean_field_recursively(schema, _inner_field_cleaning); - fn _inner_field_cleaning(field: &mut Field) { + fn _inner_field_cleaning(field: &mut Field) { if let Some(schema) = &mut field.schema { clean_field_recursively(schema, _inner_field_cleaning) } @@ -75,8 +77,8 @@ pub mod xml { *field = Field { status: field.status.clone(), schema: Some(Schema::Sequence { - field: Box::new(field.clone()), - context: Default::default(), + field: Box::new(mem::take(field)), + context: C::Sequence::default(), }), }; field.status.may_be_duplicate = false; @@ -88,10 +90,10 @@ pub mod xml { /// /// This function replaces those fields with empty [Schema::Struct] with fields of /// unknown schema. - pub fn clean_empty_structs_in_field(schema: &mut Schema) { + pub fn clean_empty_structs_in_field(schema: &mut Schema) { clean_field_recursively(schema, _inner_field_cleaning); - fn _inner_field_cleaning(field: &mut Field) { + fn _inner_field_cleaning(field: &mut Field) { match &mut field.schema { Some(Schema::Struct { fields, .. }) if fields.is_empty() => { field.schema = None; @@ -102,7 +104,7 @@ pub mod xml { } } - fn clean_field_recursively(schema: &mut Schema, clean_field: fn(&mut Field)) { + fn clean_field_recursively(schema: &mut Schema, clean_field: fn(&mut Field)) { use Schema::*; match schema { Null(_) | Boolean(_) | Integer(_) | Float(_) | String(_) | Bytes(_) => {} diff --git a/schema_analysis/src/lib.rs b/schema_analysis/src/lib.rs index 26d6ae1..b71b2ab 100644 --- a/schema_analysis/src/lib.rs +++ b/schema_analysis/src/lib.rs @@ -72,7 +72,7 @@ It's everything you love about Serde, but with runtime state. ``` # use serde::de::DeserializeSeed; -# use schema_analysis::{Schema, InferredSchema, context::NumberContext, Aggregate}; +# use schema_analysis::{Schema, InferredSchema, context::NumberContext, traits::Aggregate}; # fn main() -> Result<(), Box> { let a_lot_of_json_files: &[&str] = &[ "1", "2", "1000" ]; let mut iter = a_lot_of_json_files.iter(); @@ -142,17 +142,14 @@ from that info. [~13.3GB]: https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/ */ +mod analysis; mod schema; -use traits::CoalescingAggregator; - -pub mod analysis; pub mod context; pub mod helpers; pub mod targets; pub mod traits; -pub use analysis::{InferredSchema, InferredSchemaWithContext}; -pub use context::{Aggregators, Context}; +pub use analysis::InferredSchema; pub use schema::{Field, FieldStatus, Schema}; -pub use traits::{Aggregate, Coalesce, StructuralEq}; +pub use traits::{Coalesce, StructuralEq}; diff --git a/schema_analysis/src/schema.rs b/schema_analysis/src/schema.rs index 47a6aa5..af654a5 100644 --- a/schema_analysis/src/schema.rs +++ b/schema_analysis/src/schema.rs @@ -1,44 +1,40 @@ -use ordermap::OrderMap; +use std::mem; + +use ordermap::{map::Entry, OrderMap}; use serde::{Deserialize, Serialize}; -use crate::{ - context::{ - BooleanContext, BytesContext, MapStructContext, NullContext, NumberContext, - SequenceContext, StringContext, - }, - Coalesce, StructuralEq, -}; +use crate::{context::Context, context::DefaultContext, Coalesce, StructuralEq}; /// This enum is the core output of the analysis, it describes the structure of a document. /// /// Each variant also contains [context](crate::context) data that allows it to store information /// about the values it has encountered. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Serialize, Deserialize)] #[serde(tag = "type")] -pub enum Schema { +pub enum Schema { /// The Null variant is a special one that is only ever found when a document has a single /// null value at the root of the document. /// Null values in [Struct](Schema::Struct)s or [Sequence](Schema::Sequence)s are instead /// handled at the [Field] level, where it is more ergonomic. - Null(NullContext), + Null(C::Null), /// Represents a boolean value. - Boolean(BooleanContext), + Boolean(C::Boolean), /// Represents an integer value. - Integer(NumberContext), + Integer(C::Integer), /// Represents a floating point value. - Float(NumberContext), + Float(C::Float), /// Represents a textual value. - String(StringContext), + String(C::String), /// Represents a value of raw bytes. - Bytes(BytesContext), + Bytes(C::Bytes), /// Represents a sequence of values described by a [Field]. /// It assumes all values share the same schema. Sequence { /// The field is the structure shared by all the elements of the sequence. - field: Box, + field: Box>, /// The context aggregates information about the sequence. /// It is passed the length of the sequence. - context: SequenceContext, + context: C::Sequence, }, /// Represents a [String]->[Field] mapping. /// @@ -46,10 +42,10 @@ pub enum Schema { Struct { /// Each [String] key gets assigned a [Field]. /// Currently we are using a [BTreeMap], but that might change in the future. - fields: OrderMap, + fields: OrderMap>, /// The context aggregates information about the struct. /// It is passed a vector of the key names. - context: MapStructContext, + context: C::Struct, }, /// Simply a vector of [Schema]s, it should never contain an Union or multiple instances of the /// same variant inside. @@ -57,7 +53,7 @@ pub enum Schema { /// Note: content needs to be a struct variant to work with `#[serde(tag = "type")]`. Union { /// A list of the possible schemas that were found. - variants: Vec, + variants: Vec>, }, // Tuple(..), // Map(..), @@ -66,8 +62,12 @@ pub enum Schema { /// A [Field] is a useful abstraction to record metadata that does not belong or would be unyieldy /// to place into the [Schema] and to account for cases in which the existence of a [Field] might be /// known, but nothing is known about its shape. -#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)] -pub struct Field { +#[derive(Serialize, Deserialize)] +#[serde(bound( + serialize = "Schema: Serialize", + deserialize = "Schema: Deserialize<'de>" +))] +pub struct Field { /// The status holds information on the the field, like whether it might be null or /// missing altogether. Duplicate fields are also recorded. #[serde(flatten)] @@ -75,7 +75,15 @@ pub struct Field { /// The inner Schema is optional because we might have no information on the shape of the field /// (like for an empty array). #[serde(flatten)] - pub schema: Option, + pub schema: Option>, +} +impl Default for Field { + fn default() -> Self { + Self { + status: FieldStatus::default(), + schema: None, + } + } } /// The FieldStatus keeps track of what kind of values a [Field] has been found to have. @@ -100,7 +108,7 @@ pub struct FieldStatus { // // Schema implementations // -impl Schema { +impl Schema { /// Sorts the fields of the schema by their name (using [String::cmp]). pub fn sort_fields(&mut self) { match self { @@ -152,8 +160,14 @@ impl Schema { } } } + fn take(&mut self) -> Self { + mem::replace(self, Self::Null(C::Null::default())) + } } -impl StructuralEq for Schema { +impl StructuralEq for Schema +where + Self: Clone, +{ fn structural_eq(&self, other: &Self) -> bool { use Schema::*; match (self, other) { @@ -206,7 +220,7 @@ impl StructuralEq for Schema { } } } -impl Coalesce for Schema { +impl Coalesce for Schema { fn coalesce(&mut self, other: Self) { use Schema::*; match (self, other) { @@ -242,10 +256,14 @@ impl Coalesce for Schema { ) => { self_agg.coalesce(other_agg); for (name, other_schema) in other_fields { - self_fields - .entry(name) - .and_modify(|schema| schema.coalesce(other_schema.clone())) - .or_insert_with(|| other_schema); + match self_fields.entry(name) { + Entry::Occupied(mut schema) => { + schema.get_mut().coalesce(other_schema); + } + Entry::Vacant(entry) => { + entry.insert(other_schema); + } + } } } ( @@ -268,23 +286,21 @@ impl Coalesce for Schema { variants: mut other_alternatives, }, ) => { - let self_original = std::mem::replace(any_self, Schema::Null(Default::default())); - coalesce_to_alternatives(&mut other_alternatives, self_original); + coalesce_to_alternatives(&mut other_alternatives, any_self.take()); *any_self = Schema::Union { variants: other_alternatives, }; } (any_self, any_other) => { - let self_original = std::mem::replace(any_self, Schema::Null(Default::default())); *any_self = Union { - variants: vec![self_original, any_other], + variants: vec![any_self.take(), any_other], }; } }; return; - fn coalesce_unions(selfs: &mut Vec, others: Vec) { + fn coalesce_unions(selfs: &mut Vec>, others: Vec>) { for o in others { coalesce_to_alternatives(selfs, o); } @@ -293,7 +309,10 @@ impl Coalesce for Schema { /// This function attempts to match the incomming schema against all the /// alternatives already present, and if it fails it pushes it to the vector as a /// new alternative. - fn coalesce_to_alternatives(alternatives: &mut Vec, mut other: Schema) { + fn coalesce_to_alternatives( + alternatives: &mut Vec>, + mut other: Schema, + ) { use Schema::*; for s in alternatives.iter_mut() { match (s, other) { @@ -352,10 +371,14 @@ impl Coalesce for Schema { ) => { self_agg.coalesce(other_agg); for (name, other_schema) in other_fields { - self_fields - .entry(name) - .and_modify(|schema| schema.coalesce(other_schema.clone())) - .or_insert_with(|| other_schema); + match self_fields.entry(name) { + Entry::Occupied(mut schema) => { + schema.get_mut().coalesce(other_schema); + } + Entry::Vacant(entry) => { + entry.insert(other_schema); + } + } } return; } @@ -372,67 +395,13 @@ impl Coalesce for Schema { } } } -impl PartialEq for Schema { - fn eq(&self, other: &Self) -> bool { - use Schema::*; - match (self, other) { - (Null(s), Null(o)) => s == o, - (Boolean(s), Boolean(o)) => s == o, - (Integer(s), Integer(o)) => s == o, - (Float(s), Float(o)) => s == o, - (String(s), String(o)) => s == o, - (Bytes(s), Bytes(o)) => s == o, - - ( - Sequence { - field: field_1, - context: context_1, - }, - Sequence { - field: field_2, - context: context_2, - }, - ) => field_1 == field_2 && context_1 == context_2, - - ( - Struct { - fields: fields_1, - context: context_1, - }, - Struct { - fields: fields_2, - context: context_2, - }, - ) => fields_1 == fields_2 && context_1 == context_2, - - (Union { variants: s }, Union { variants: o }) => { - let mut s = s.clone(); - let mut o = o.clone(); - s.sort_by(schema_cmp); - o.sort_by(schema_cmp); - s == o - } - - // Listing these out makes sure it fails if new variants are added. - (Null(_), _) - | (Boolean(_), _) - | (Integer(_), _) - | (Float(_), _) - | (String(_), _) - | (Bytes(_), _) - | (Sequence { .. }, _) - | (Struct { .. }, _) - | (Union { .. }, _) => false, - } - } -} // // Field implementations // -impl Field { +impl Field { /// Returns a [Field] with the given [Schema] and default [FieldStatus]. - pub fn with_schema(schema: Schema) -> Self { + pub fn with_schema(schema: Schema) -> Self { Self { status: FieldStatus::default(), schema: Some(schema), @@ -450,11 +419,11 @@ impl Field { } } } -impl Coalesce for Field { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { +impl Coalesce for Field +where + Schema: Coalesce, +{ + fn coalesce(&mut self, other: Self) { self.status.coalesce(other.status); self.schema = match (self.schema.take(), other.schema) { (Some(mut s), Some(o)) => { @@ -467,7 +436,10 @@ impl Coalesce for Field { } } } -impl StructuralEq for Field { +impl StructuralEq for Field +where + Schema: StructuralEq, +{ fn structural_eq(&self, other: &Self) -> bool { self.status == other.status && self.schema.structural_eq(&other.schema) } @@ -488,10 +460,7 @@ impl FieldStatus { } } impl Coalesce for FieldStatus { - fn coalesce(&mut self, other: Self) - where - Self: Sized, - { + fn coalesce(&mut self, other: Self) { self.may_be_null |= other.may_be_null; self.may_be_normal |= other.may_be_normal; self.may_be_missing |= other.may_be_missing; @@ -507,8 +476,8 @@ impl Coalesce for FieldStatus { /// to help in comparing two [Schema::Union]. /// Since a [Schema::Union] should never hold two schemas of the same type, it is enough to /// just compare the top level without recursion. -fn schema_cmp(first: &Schema, second: &Schema) -> std::cmp::Ordering { - fn ordering(v: &Schema) -> u8 { +fn schema_cmp(first: &Schema, second: &Schema) -> std::cmp::Ordering { + fn ordering(v: &Schema) -> u8 { use Schema::*; match v { @@ -525,3 +494,168 @@ fn schema_cmp(first: &Schema, second: &Schema) -> std::cmp::Ordering { } Ord::cmp(&ordering(first), &ordering(second)) } + +mod boilerplate { + use std::fmt; + + use crate::context::Context; + + use super::{Field, Schema}; + + // Auto-generated, with bounds changed. (TODO: use perfect derive.) + impl fmt::Debug for Schema + where + C::Null: fmt::Debug, + C::Boolean: fmt::Debug, + C::Integer: fmt::Debug, + C::Float: fmt::Debug, + C::String: fmt::Debug, + C::Bytes: fmt::Debug, + C::Sequence: fmt::Debug, + C::Struct: fmt::Debug, + { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Null(arg0) => f.debug_tuple("Null").field(arg0).finish(), + Self::Boolean(arg0) => f.debug_tuple("Boolean").field(arg0).finish(), + Self::Integer(arg0) => f.debug_tuple("Integer").field(arg0).finish(), + Self::Float(arg0) => f.debug_tuple("Float").field(arg0).finish(), + Self::String(arg0) => f.debug_tuple("String").field(arg0).finish(), + Self::Bytes(arg0) => f.debug_tuple("Bytes").field(arg0).finish(), + Self::Sequence { field, context } => f + .debug_struct("Sequence") + .field("field", field) + .field("context", context) + .finish(), + Self::Struct { fields, context } => f + .debug_struct("Struct") + .field("fields", fields) + .field("context", context) + .finish(), + Self::Union { variants } => { + f.debug_struct("Union").field("variants", variants).finish() + } + } + } + } + // Auto-generated, with bounds changed. (TODO: use perfect derive.) + impl Clone for Schema + where + C::Null: Clone, + C::Boolean: Clone, + C::Integer: Clone, + C::Float: Clone, + C::String: Clone, + C::Bytes: Clone, + C::Sequence: Clone, + C::Struct: Clone, + { + fn clone(&self) -> Self { + match self { + Self::Null(arg0) => Self::Null(arg0.clone()), + Self::Boolean(arg0) => Self::Boolean(arg0.clone()), + Self::Integer(arg0) => Self::Integer(arg0.clone()), + Self::Float(arg0) => Self::Float(arg0.clone()), + Self::String(arg0) => Self::String(arg0.clone()), + Self::Bytes(arg0) => Self::Bytes(arg0.clone()), + Self::Sequence { field, context } => Self::Sequence { + field: field.clone(), + context: context.clone(), + }, + Self::Struct { fields, context } => Self::Struct { + fields: fields.clone(), + context: context.clone(), + }, + Self::Union { variants } => Self::Union { + variants: variants.clone(), + }, + } + } + } + // Auto-generated, with bounds changed. (TODO: use perfect derive.) + impl PartialEq for Schema + where + C::Null: PartialEq, + C::Boolean: PartialEq, + C::Integer: PartialEq, + C::Float: PartialEq, + C::String: PartialEq, + C::Bytes: PartialEq, + C::Sequence: PartialEq, + C::Struct: PartialEq, + { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::Null(l0), Self::Null(r0)) => l0 == r0, + (Self::Boolean(l0), Self::Boolean(r0)) => l0 == r0, + (Self::Integer(l0), Self::Integer(r0)) => l0 == r0, + (Self::Float(l0), Self::Float(r0)) => l0 == r0, + (Self::String(l0), Self::String(r0)) => l0 == r0, + (Self::Bytes(l0), Self::Bytes(r0)) => l0 == r0, + ( + Self::Sequence { + field: l_field, + context: l_context, + }, + Self::Sequence { + field: r_field, + context: r_context, + }, + ) => l_field == r_field && l_context == r_context, + ( + Self::Struct { + fields: l_fields, + context: l_context, + }, + Self::Struct { + fields: r_fields, + context: r_context, + }, + ) => l_fields == r_fields && l_context == r_context, + ( + Self::Union { + variants: l_variants, + }, + Self::Union { + variants: r_variants, + }, + ) => l_variants == r_variants, + _ => false, + } + } + } + + // Auto-generated, with bounds changed. (TODO: use perfect derive.) + impl fmt::Debug for Field + where + Schema: fmt::Debug, + { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Field") + .field("status", &self.status) + .field("schema", &self.schema) + .finish() + } + } + // Auto-generated, with bounds changed. (TODO: use perfect derive.) + impl Clone for Field + where + Schema: Clone, + { + fn clone(&self) -> Self { + Self { + status: self.status.clone(), + schema: self.schema.clone(), + } + } + } + // Auto-generated, with bounds changed. (TODO: use perfect derive.) + impl PartialEq for Field + where + Schema: PartialEq, + { + fn eq(&self, other: &Self) -> bool { + self.status == other.status && self.schema == other.schema + } + } +} diff --git a/schema_analysis/src/targets/json_typegen.rs b/schema_analysis/src/targets/json_typegen.rs index e18ab23..cc5ee89 100644 --- a/schema_analysis/src/targets/json_typegen.rs +++ b/schema_analysis/src/targets/json_typegen.rs @@ -21,9 +21,9 @@ let output: String = json_typegen_shared::codegen_from_shape("Root", &Shape::Boo pub use json_typegen_shared::{codegen_from_shape, ErrorKind, JTError, Options, OutputMode, Shape}; -use crate::{Field, Schema}; +use crate::{context::Context, Field, Schema}; -impl Schema { +impl Schema { /// Convert a [Schema] to a json_typegen [Shape]. pub fn to_json_typegen_shape(&self) -> Shape { schema_to_shape(self) @@ -47,13 +47,13 @@ impl Schema { } } -impl From for Shape { - fn from(schema: Schema) -> Self { +impl From> for Shape { + fn from(schema: Schema) -> Self { schema_to_shape(&schema) } } -fn schema_to_shape(schema: &Schema) -> Shape { +fn schema_to_shape(schema: &Schema) -> Shape { match schema { Schema::Null(_) => Shape::Null, Schema::Boolean(_) => Shape::Bool, @@ -81,7 +81,7 @@ fn schema_to_shape(schema: &Schema) -> Shape { /// if they are missing, while sequences whose fields may be missing are merely empty. /// /// In both cases the field is optional if it may have a value of null/none. -fn convert_field(field: &Field, is_option: bool) -> Shape { +fn convert_field(field: &Field, is_option: bool) -> Shape { // From Shape docs: // `Bottom` represents the absence of any inference information // `Optional(T)` represents that a value is nullable, or not always present diff --git a/schema_analysis/src/targets/schemars.rs b/schema_analysis/src/targets/schemars.rs index 771a394..68f2108 100644 --- a/schema_analysis/src/targets/schemars.rs +++ b/schema_analysis/src/targets/schemars.rs @@ -4,9 +4,9 @@ use std::error::Error; use schemars::schema as schemars_types; -use crate::Schema; +use crate::{context::Context, Schema}; -impl Schema { +impl Schema { /// Convert into a json_schema using the default settings. pub fn to_json_schema_with_schemars(&self) -> Result { self.to_json_schema_with_schemars_version(&Default::default()) @@ -67,7 +67,7 @@ mod helpers { use schemars::schema as schemars_types; - use crate::{Field, Schema}; + use crate::{context::Context, Field, Schema}; /// Wraps a [Schema](schemars_types::Schema) in a [RootSchema](schemars_types::RootSchema). pub fn wrap_in_root( @@ -82,9 +82,9 @@ mod helpers { } /// Converts an inferred [Schema] to a schemars [Schema](schemars_types::Schema). - pub fn inferred_to_schemars( + pub fn inferred_to_schemars( generator: &mut schemars::gen::SchemaGenerator, - inferred: &Schema, + inferred: &Schema, ) -> schemars_types::Schema { // Note: we can use the generator even if we don't generate the final root schema // using it because simple values will not be referrenced. @@ -165,9 +165,9 @@ mod helpers { } /// Converts a [Field] into a [Schema](schemars_types::Schema). - fn internal_field_to_schemars_schema( + fn internal_field_to_schemars_schema( generator: &mut schemars::gen::SchemaGenerator, - field: &Field, + field: &Field, ) -> schemars_types::Schema { // Note: we can use the generator even if we don't generate the final root schema // using it because simple values will not be referrenced. diff --git a/schema_analysis/src/traits.rs b/schema_analysis/src/traits.rs index 7719529..2ae5775 100644 --- a/schema_analysis/src/traits.rs +++ b/schema_analysis/src/traits.rs @@ -1,24 +1,20 @@ //! A module holding the crate's public traits. -use std::{any::Any, fmt::Debug}; - -use downcast_rs::Downcast; - /** This trait defines a way to merge two instances of the same type. ``` -# use schema_analysis::{Schema, Coalesce, Aggregate, context::BooleanContext}; +# use schema_analysis::{Schema, context::{BooleanContext, DefaultContext}, traits::{Coalesce, Aggregate}}; # # fn main() -> Result<(), Box> { let mut context_1: BooleanContext = Default::default(); context_1.aggregate(&true); context_1.aggregate(&true); -let mut schema_1 = Schema::Boolean(context_1); +let mut schema_1 = Schema::::Boolean(context_1); let mut context_2: BooleanContext = Default::default(); context_2.aggregate(&false); -let mut schema_2 = Schema::Boolean(context_2); +let mut schema_2 = Schema::::Boolean(context_2); schema_1.coalesce(schema_2); // schema_2 is gone. @@ -26,7 +22,7 @@ let mut context_merged: BooleanContext = Default::default(); context_merged.aggregate(&true); context_merged.aggregate(&true); context_merged.aggregate(&false); -let schema_merged = Schema::Boolean(context_merged); +let schema_merged = Schema::::Boolean(context_merged); assert_eq!(schema_1, schema_merged); # @@ -34,30 +30,12 @@ assert_eq!(schema_1, schema_merged); # } ``` */ -pub trait Coalesce { +pub trait Coalesce: Sized { /// Merge `other` into `self`. - fn coalesce(&mut self, other: Self) - where - Self: Sized; -} -/// This trait allows the merging of a type with an arbitrary trait object. -/// -/// If the merger is unsuccessful (they are not of the same type) the trait object is returned. -/// -/// This trait has a blanket implementation on any [Sized] type implementing [Coalesce]. -pub trait CoalesceAny: Coalesce { - /// Merge `other` into `self`. Trait object is returned if merging was unsuccessful. - fn coalesce_any(&mut self, other: Box) -> Option>; + fn coalesce(&mut self, other: Self); } -impl CoalesceAny for T { - fn coalesce_any(&mut self, other: Box) -> Option> { - let other: Self = match other.downcast() { - Ok(downcasted) => *downcasted, - Err(not_downcasted) => return Some(not_downcasted), - }; - self.coalesce(other); - None - } +impl Coalesce for () { + fn coalesce(&mut self, _other: Self) {} } /// This trait defines an interface used for types that need to receive values one at a time to @@ -69,13 +47,8 @@ pub trait Aggregate { /// Run the internal logic on value fn aggregate(&mut self, value: &'_ V); } -/// A trait used by [crate::context::Aggregators]. -/// It's an experimental feature meant to allow library users to run arbitrary aggregation logic on -/// the input data. -#[dyn_clonable::clonable] -pub trait CoalescingAggregator: - Aggregate + CoalesceAny + Downcast + Debug + Clone + Send + Sync -{ +impl Aggregate for () { + fn aggregate(&mut self, _value: &'_ T) {} } /// This trait checks whether the shape of two objects is the same.