10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,13 @@
## v0.3.0 (2025-11-19)

### Feat

- new domain type formattedtime for time-only data

### Refactor

- small tweak to allow use of dynamic fields in select rules

## v0.2.0 (2025-11-12)

### Refactor
28 changes: 14 additions & 14 deletions docs/detailed_guidance/domain_types.md
@@ -4,24 +4,24 @@ Domain types are custom defined pydantic types that solve common problems with u
This might include Postcodes, NHS Numbers, dates with specific formats etc.

Below is a list of defined types, their output type and any constraints. Nested beneath them are any constraints that are allowed and their default values, if there are any.
| Defined Type | Output Type | Contraints & Defaults |
| ------------ | ----------- | --------------------- |
| NHSNumber | str |
| permissive_nhs_number | str | <li> warn_on_test_numbers = False </li> |
| Postcode | str |
| OrgId | str |
| conformatteddate | date | <li>date_format: str</li><li>ge: date</li><li>le: date</li><li>gt: date</li><li>lt: date</li> |
| formatteddatetime | datetime | <li>date_format: str </li><li>timezone_treatment: one_of ["forbid", "permit", "require] = "permit"</li> |
| reportingperiod | date | <li>reporting_period_type: one_of ["start", "end"]</li><li>date_format: str = "%Y-%m-%d"</li> |
| alphanumeric | str | <li>min_digits : NonNegativeInt = 1</li><li>max_digits: PositiveInt = 1</li> |
| identifier | str | <li>min_digits : NonNegativeInt = 1</li><li>max_digits: PositiveInt = 1</li>
| Defined Type | Output Type | Constraints & Defaults | Supported Implementations |
| ------------ | ----------- | ---------------------- | ------------------------- |
| NHSNumber | str | | Spark, DuckDB |
| permissive_nhs_number | str | <li>warn_on_test_numbers = False</li> | Spark, DuckDB |
| Postcode | str | | Spark, DuckDB |
| OrgId | str | | Spark, DuckDB |
| conformatteddate | date | <li>date_format: str</li><li>ge: date</li><li>le: date</li><li>gt: date</li><li>lt: date</li> | Spark, DuckDB |
| formatteddatetime | datetime | <li>date_format: str</li><li>timezone_treatment: one_of ["forbid", "permit", "require"] = "permit"</li> | Spark, DuckDB |
| formattedtime | time | <li>time_format: str</li><li>timezone_treatment: one_of ["forbid", "permit", "require"] = "permit"</li> | DuckDB |
| reportingperiod | date | <li>reporting_period_type: one_of ["start", "end"]</li><li>date_format: str = "%Y-%m-%d"</li> | Spark, DuckDB |
| alphanumeric | str | <li>min_digits: NonNegativeInt = 1</li><li>max_digits: PositiveInt = 1</li> | Spark, DuckDB |
| identifier | str | <li>min_digits: NonNegativeInt = 1</li><li>max_digits: PositiveInt = 1</li> | Spark, DuckDB |

Other types that are allowed include:
**Other types that are allowed include:**
- str
- int
- date
- datetime
- Decimal
- float

And any types that are included in [pydantic version 1.10](https://docs.pydantic.dev/1.10/usage/types/#pydantic-types)
- Any types that are included in [pydantic version 1.10](https://docs.pydantic.dev/1.10/usage/types/#pydantic-types)
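
For reference, a minimal sketch of how the new `formattedtime` type documented above can be exercised directly, based on the factory added in `src/dve/metadata_parser/domain_types.py` later in this diff. The import path, format string, and input values are illustrative assumptions rather than documented API:

```python
# Illustrative sketch only: build a time-only domain type and validate a few values.
import datetime as dt

from dve.metadata_parser import domain_types  # assumed import path

# A time that must match HH:MM:SS and must not carry timezone information.
StrictTime = domain_types.formattedtime(
    time_format="%H:%M:%S",
    timezone_treatment="forbid",
)

print(StrictTime.validate("09:30:15"))          # -> 09:30:15
print(StrictTime.validate(dt.time(9, 30, 15)))  # existing time objects pass straight through
# StrictTime.validate("09:30:15+01:00")         # raises ValueError: does not match the format
```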
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "nhs_dve"
version = "0.2.0"
version = "0.3.0"
description = "`nhs data validation engine` is a framework used to validate data"
authors = ["NHS England <england.contactus@nhs.net>"]
readme = "README.md"
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Implementation of duckdb backend"""

from dve.core_engine.backends.implementations.duckdb.readers.json import DuckDBJSONReader
from dve.core_engine.backends.readers import register_reader

Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""Helper objects for duckdb data contract implementation"""
from collections.abc import Generator, Iterator
from dataclasses import is_dataclass
from datetime import date, datetime
from datetime import date, datetime, time
from decimal import Decimal
from pathlib import Path
from typing import Any, ClassVar, Union
@@ -87,6 +87,7 @@ def __call__(self):
date: ddbtyp.DATE,
datetime: ddbtyp.TIMESTAMP,
Decimal: DDBDecimal()(),
time: ddbtyp.TIME,
}
"""A mapping of Python types to the equivalent DuckDB types."""

Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Utility objects for use with duckdb backend"""

import itertools

from dve.core_engine.backends.base.utilities import _split_multiexpr_string


@@ -24,7 +26,11 @@ def expr_mapping_to_columns(expressions: dict) -> list[str]:

def expr_array_to_columns(expressions: list[str]) -> list[str]:
"""Create list of duckdb expressions from list of expressions"""
return [f"{expression}" for expression in expressions]
return list(
itertools.chain.from_iterable(
_split_multiexpr_string(expression) for expression in expressions
)
)


def multiexpr_string_to_columns(expressions: str) -> list[str]:
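
For reviewers, a rough sketch of the behaviour this change appears to enable: each entry in the expression array may now itself contain several expressions, which are split out before being returned. `_split_multiexpr_string` is not shown in this diff, so the naive comma-based splitter below is an assumption used purely for illustration:

```python
# Sketch only: a stand-in for _split_multiexpr_string, which is not part of this diff.
# A naive comma split is assumed here; the real helper is likely more careful
# (e.g. around commas inside function calls).
import itertools


def _split_multiexpr_string(expression: str) -> list[str]:
    return [part.strip() for part in expression.split(",")]


def expr_array_to_columns(expressions: list[str]) -> list[str]:
    """Flatten each (possibly multi-expression) string into individual expressions."""
    return list(
        itertools.chain.from_iterable(
            _split_multiexpr_string(expression) for expression in expressions
        )
    )


print(expr_array_to_columns(["a AS x, b AS y", "c"]))  # ['a AS x', 'b AS y', 'c']
```

The Spark implementation later in this diff receives the same treatment, mapping the flattened list through `sf.expr` to produce `Column` objects.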
Original file line number Diff line number Diff line change
@@ -12,14 +12,7 @@
from dataclasses import dataclass, is_dataclass
from decimal import Decimal
from functools import wraps
from typing import (
Any,
ClassVar,
Optional,
TypeVar,
Union,
overload,
)
from typing import Any, ClassVar, Optional, TypeVar, Union, overload

from delta.exceptions import ConcurrentAppendException, DeltaConcurrentModificationException
from pydantic import BaseModel
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Some utilities which are useful for implementing Spark transformations."""

import datetime as dt
import itertools
from collections.abc import Callable
from json import JSONEncoder
from operator import and_, or_
@@ -70,7 +71,13 @@ def expr_mapping_to_columns(expressions: ExpressionMapping) -> list[Column]:

def expr_array_to_columns(expressions: ExpressionArray) -> list[Column]:
"""Convert an array of expressions to a list of columns."""
return list(map(sf.expr, expressions))

_expr_list = list(
itertools.chain.from_iterable(
_split_multiexpr_string(expression) for expression in expressions
)
)
return list(map(sf.expr, _expr_list))


def multiexpr_string_to_columns(expressions: MultiExpression) -> list[Column]:
9 changes: 1 addition & 8 deletions src/dve/core_engine/backends/readers/xml.py
@@ -3,14 +3,7 @@

import re
from collections.abc import Collection, Iterator
from typing import (
IO,
Any,
GenericAlias, # type: ignore
Optional,
Union,
overload
)
from typing import IO, Any, GenericAlias, Optional, Union, overload # type: ignore

import polars as pl
from lxml import etree # type: ignore
9 changes: 6 additions & 3 deletions src/dve/core_engine/backends/utilities.py
@@ -2,10 +2,10 @@

import sys
from dataclasses import is_dataclass
from datetime import date, datetime
from datetime import date, datetime, time
from decimal import Decimal
from typing import Any, ClassVar, Union
from typing import GenericAlias # type: ignore
from typing import Any, ClassVar, Union

import polars as pl # type: ignore
from polars.datatypes.classes import DataTypeClass as PolarsType
@@ -33,13 +33,16 @@
date: pl.Date, # type: ignore
datetime: pl.Datetime, # type: ignore
Decimal: pl.Utf8, # type: ignore
time: pl.Time, # type: ignore
}
"""A mapping of Python types to the equivalent Polars types."""


def stringify_type(type_: Union[type, GenericAlias]) -> type:
"""Stringify an individual type."""
if isinstance(type_, type) and not isinstance(type_, GenericAlias): # A model, return the contents. # pylint: disable=C0301
if isinstance(type_, type) and not isinstance(
type_, GenericAlias
): # A model, return the contents. # pylint: disable=C0301
if issubclass(type_, BaseModel):
return stringify_model(type_)

2 changes: 1 addition & 1 deletion src/dve/core_engine/message.py
@@ -2,8 +2,8 @@

import copy
import datetime as dt
import operator
import json
import operator
from collections.abc import Callable
from decimal import Decimal
from functools import reduce
2 changes: 1 addition & 1 deletion src/dve/core_engine/type_hints.py
@@ -6,12 +6,12 @@
from pathlib import Path
from queue import Queue as ThreadQueue
from typing import TYPE_CHECKING, Any, List, Optional, TypeVar, Union # pylint: disable=W1901
# TODO - cannot remove List from Typing. See L60 for details.

from pyspark.sql import DataFrame
from pyspark.sql.types import StructType
from typing_extensions import Literal, ParamSpec, get_args

# TODO - cannot remove List from Typing. See L60 for details.

if TYPE_CHECKING: # pragma: no cover
from dve.core_engine.message import FeedbackMessage
108 changes: 108 additions & 0 deletions src/dve/metadata_parser/domain_types.py
@@ -392,6 +392,97 @@ def __get_validators__(cls) -> Iterator[classmethod]:
yield cls.validate # type: ignore


class FormattedTime(dt.time):
    """A time, provided as a datetime or a string in a specific format."""

    TIME_FORMAT: ClassVar[Optional[str]] = None
    """The specific format of the time."""
    TIMEZONE_TREATMENT: ClassVar[Literal["forbid", "permit", "require"]] = "permit"
    """How to treat the presence of timezone-related information."""
    DEFAULT_PATTERNS: Sequence[str] = list(
        # 24 hour time pattern combinations
        map(
            "".join,
            itertools.product(
                ("%H:%M:%S", "%H%M%S"),
                ("", ".%f"),
                ("%p", "%P", ""),
                ("%z", ""),
            ),
        )
    ) + list(
        # 12 hour time pattern combinations
        map(
            "".join,
            itertools.product(
                ("%I:%M:%S", "%I%M%S"),
                ("", ".%f"),
                ("%z", ""),
                (" %p", "%p", "%P", " %P", ""),
            ),
        )
    )
    """A sequence of time format patterns to try if `TIME_FORMAT` is unset."""

    @classmethod
    def convert_to_time(cls, value: dt.datetime) -> dt.time:
        """
        Convert `datetime.datetime` to `datetime.time`. If datetime contains timezone info, that
        will be retained.
        """
        if value.tzinfo:
            return value.timetz()

        return value.time()

    @classmethod
    def parse_time(cls, string: str) -> dt.time:
        """Attempt to parse a datetime using various formats in sequence."""
        string = string.strip()
        if string.endswith("Z"):  # Convert 'zulu' time to UTC.
            string = string[:-1] + "+00:00"

        for pattern in cls.DEFAULT_PATTERNS:
            try:
                datetime = dt.datetime.strptime(string, pattern)
            except ValueError:
                continue

            time = cls.convert_to_time(datetime)

            return time  # pragma: no cover
        raise ValueError("Unable to parse provided time")

    @classmethod
    def validate(cls, value: Union[dt.time, dt.datetime, str]) -> dt.time | None:
        """Validate a passed time, datetime or string."""
        if value is None:
            return value

        if isinstance(value, dt.time):
            new_time = value
        elif isinstance(value, dt.datetime):
            new_time = cls.convert_to_time(value)
        else:
            if cls.TIME_FORMAT is not None:
                try:
                    new_time = dt.datetime.strptime(value, cls.TIME_FORMAT)  # type: ignore
                    new_time = cls.convert_to_time(new_time)  # type: ignore
                except ValueError as err:
                    raise ValueError(
                        f"Unable to parse provided time in format {cls.TIME_FORMAT}"
                    ) from err
            else:
                new_time = cls.parse_time(value)

        if cls.TIMEZONE_TREATMENT == "forbid" and new_time.tzinfo:
            raise ValueError("Provided time has timezone, but this is forbidden for this field")
        if cls.TIMEZONE_TREATMENT == "require" and not new_time.tzinfo:
            raise ValueError("Provided time missing timezone, but this is required for this field")

        return new_time


@lru_cache()
@validate_arguments
def formatteddatetime(
@@ -412,6 +503,23 @@ def formatteddatetime(
return type("FormattedDatetime", (FormattedDatetime, *FormattedDatetime.__bases__), dict_)


@lru_cache()
@validate_arguments
def formattedtime(
    time_format: Optional[str] = None,
    timezone_treatment: Literal["forbid", "permit", "require"] = "permit",
) -> type[FormattedTime]:
    """Return a formatted time class with a set time format and timezone treatment."""
    if time_format is None and timezone_treatment == "permit":
        return FormattedTime

    dict_ = FormattedTime.__dict__.copy()
    dict_["TIME_FORMAT"] = time_format
    dict_["TIMEZONE_TREATMENT"] = timezone_treatment

    return type("FormattedTime", (FormattedTime, *FormattedTime.__bases__), dict_)


class ReportingPeriod(dt.date):
"""A reporting period field, with the type of reporting period supplied"""

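To complement the class above, a hedged sketch exercising `FormattedTime`'s default-pattern parsing and timezone treatment; the import path and input strings are illustrative only:

```python
# Sketch only: default-pattern parsing and timezone treatment of the new type.
from dve.metadata_parser.domain_types import formattedtime  # assumed import path

# With no explicit time_format, validate() falls back to DEFAULT_PATTERNS, and a
# trailing 'Z' is rewritten to a +00:00 offset before parsing.
TzRequired = formattedtime(timezone_treatment="require")

parsed = TzRequired.validate("23:59:59Z")
print(parsed)         # 23:59:59+00:00
print(parsed.tzinfo)  # UTC

# A naive time is rejected when a timezone is required.
try:
    TzRequired.validate("23:59:59")
except ValueError as err:
    print(err)  # Provided time missing timezone, but this is required for this field
```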
1 change: 1 addition & 0 deletions src/dve/metadata_parser/model_generator.py
@@ -72,6 +72,7 @@ def constr(
"identifier": domain_types.identifier,
"orgid": domain_types.OrgID,
"formatteddatetime": domain_types.formatteddatetime,
"formattedtime": domain_types.formattedtime,
"conformatteddate": domain_types.conformatteddate,
"reportingperiodstart": domain_types.reportingperiod(reporting_period_type="start"),
"reportingperiodend": domain_types.reportingperiod(reporting_period_type="end"),
9 changes: 5 additions & 4 deletions tests/test_core_engine/test_backends/fixtures.py
@@ -2,7 +2,7 @@

# pylint: disable=redefined-outer-name
import json
from datetime import date, datetime
from datetime import date, datetime, time
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Dict, Iterator, List, Tuple
@@ -83,10 +83,10 @@ def temp_duckdb_dir():

@pytest.fixture
def temp_csv_file(temp_duckdb_dir: Path):
header: str = "ID,varchar_field,bigint_field,date_field,timestamp_field"
header: str = "ID,varchar_field,bigint_field,date_field,timestamp_field,time_field"
typed_data = [
[1, "hi", 3, date(2023, 1, 3), datetime(2023, 1, 3, 12, 0, 3)],
[2, "bye", 4, date(2023, 3, 7), datetime(2023, 5, 9, 15, 21, 53)],
[1, "hi", 3, date(2023, 1, 3), datetime(2023, 1, 3, 12, 0, 3), time(12, 0, 0)],
[2, "bye", 4, date(2023, 3, 7), datetime(2023, 5, 9, 15, 21, 53), time(13, 0 ,0)],
]

class SimpleModel(BaseModel):
@@ -95,6 +95,7 @@ class SimpleModel(BaseModel):
bigint_field: int
date_field: date
timestamp_field: datetime
time_field: time

with open(temp_duckdb_dir.joinpath("dummy.csv"), mode="w") as csv_file:
csv_file.write(header + "\n")
Original file line number Diff line number Diff line change
@@ -41,6 +41,14 @@ def test_duckdb_data_contract_csv(temp_csv_file):
"bigint_field": "NonNegativeInt",
"date_field": "date",
"timestamp_field": "datetime",
"time_field": {
"description": "test",
"callable": "formattedtime",
"constraints": {
"time_format": "%Y-%m-%d",
"timezone_treatment": "forbid"
}
}
},
"reader_config": {
".csv": {