From 963a377f26a41484a50cc2799632bcdf26efdf54 Mon Sep 17 00:00:00 2001 From: georgeRobertson <50412379+georgeRobertson@users.noreply.github.com> Date: Tue, 18 Nov 2025 17:51:14 +0000 Subject: [PATCH 1/3] feat: new domain type formattedtime for time only data --- .../implementations/duckdb/duckdb_helpers.py | 3 +- src/dve/core_engine/backends/utilities.py | 3 +- src/dve/metadata_parser/domain_types.py | 110 ++++++++++++++++++ src/dve/metadata_parser/model_generator.py | 1 + .../test_backends/fixtures.py | 9 +- .../test_duckdb/test_data_contract.py | 8 ++ .../test_domain_types.py | 53 +++++++++ 7 files changed, 181 insertions(+), 6 deletions(-) diff --git a/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py b/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py index ea1901e..a261f7b 100644 --- a/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py +++ b/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py @@ -4,7 +4,7 @@ """Helper objects for duckdb data contract implementation""" from collections.abc import Generator, Iterator from dataclasses import is_dataclass -from datetime import date, datetime +from datetime import date, datetime, time from decimal import Decimal from pathlib import Path from typing import Any, ClassVar, Union @@ -87,6 +87,7 @@ def __call__(self): date: ddbtyp.DATE, datetime: ddbtyp.TIMESTAMP, Decimal: DDBDecimal()(), + time: ddbtyp.TIME, } """A mapping of Python types to the equivalent DuckDB types.""" diff --git a/src/dve/core_engine/backends/utilities.py b/src/dve/core_engine/backends/utilities.py index 9319780..5d79533 100644 --- a/src/dve/core_engine/backends/utilities.py +++ b/src/dve/core_engine/backends/utilities.py @@ -2,7 +2,7 @@ import sys from dataclasses import is_dataclass -from datetime import date, datetime +from datetime import date, datetime, time from decimal import Decimal from typing import Any, ClassVar, Union from typing import GenericAlias # type: ignore @@ -33,6 +33,7 @@ date: pl.Date, # type: ignore datetime: pl.Datetime, # type: ignore Decimal: pl.Utf8, # type: ignore + time: pl.Time, # type: ignore } """A mapping of Python types to the equivalent Polars types.""" diff --git a/src/dve/metadata_parser/domain_types.py b/src/dve/metadata_parser/domain_types.py index c944278..0489fb2 100644 --- a/src/dve/metadata_parser/domain_types.py +++ b/src/dve/metadata_parser/domain_types.py @@ -392,6 +392,99 @@ def __get_validators__(cls) -> Iterator[classmethod]: yield cls.validate # type: ignore +class FormattedTime(dt.time): + """A time, provided as a datetime or a string in a specific format.""" + + TIME_FORMAT: ClassVar[Optional[str]] = None + """The specific format of the time.""" + TIMEZONE_TREATMENT: ClassVar[Literal["forbid", "permit", "require"]] = "permit" + """How to treat the presence of timezone-related information.""" + DEFAULT_PATTERNS: Sequence[str] = list( + # 24 hour time pattern combinations + map( + "".join, + itertools.product( + ("%H:%M:%S", "%H%M%S"), + ("", ".%f"), + ("%p", "%P", ""), + ("%z", ""), + ) + ) + ) + list( + # 12 hour time pattern combinations + map( + "".join, + itertools.product( + ("%I:%M:%S", "%I%M%S"), + ("", ".%f"), + ("%z", ""), + (" %p", "%p", "%P", " %P", ""), + ) + ) + ) + """A sequence of time format patterns to try if `TIME_FORMAT` is unset.""" + + @classmethod + def convert_to_time(cls, value: dt.datetime) -> dt.time: + """ + Convert `datetime.datetime` to `datetime.time`. If datetime contains timezone info, that + will be retained. + """ + if value.tzinfo: + return value.timetz() + + return value.time() + + @classmethod + def parse_time(cls, string: str) -> dt.time: + """Attempt to parse a datetime using various formats in sequence.""" + string = string.strip() + if string.endswith("Z"): # Convert 'zulu' time to UTC. + string = string[:-1] + "+00:00" + + for pattern in cls.DEFAULT_PATTERNS: + try: + datetime = dt.datetime.strptime(string, pattern) + except ValueError: + continue + + time = cls.convert_to_time(datetime) + + return time # pragma: no cover + raise ValueError("Unable to parse provided time") + + @classmethod + def validate(cls, value: Optional[Union[dt.time, dt.datetime, str]]) -> dt.time | None: + """Validate a passed time, datetime or string.""" + if not value: + return value + + if isinstance(value, dt.time): + new_time = value + elif isinstance(value, dt.datetime): + new_time = cls.convert_to_time(value) + else: + if cls.TIME_FORMAT is not None: + try: + new_time = dt.datetime.strptime(value, cls.TIME_FORMAT) + new_time = cls.convert_to_time(new_time) + except ValueError as err: + raise ValueError( + f"Unable to parse provided time in format {cls.TIME_FORMAT}" + ) from err + else: + new_time = cls.parse_time(value) + + if cls.TIMEZONE_TREATMENT == "forbid" and new_time.tzinfo: + raise ValueError("Provided time has timezone, but this is forbidden for this field") + if cls.TIMEZONE_TREATMENT == "require" and not new_time.tzinfo: + raise ValueError( + "Provided time missing timezone, but this is required for this field" + ) + + return new_time + + @lru_cache() @validate_arguments def formatteddatetime( @@ -412,6 +505,23 @@ def formatteddatetime( return type("FormattedDatetime", (FormattedDatetime, *FormattedDatetime.__bases__), dict_) +@lru_cache() +@validate_arguments +def formattedtime( + time_format: Optional[str] = None, + timezone_treatment: Literal["forbid", "permit", "require"] = "permit", +) -> type[FormattedTime]: + """Return a formatted time class with a set time format and timezone treatment.""" + if time_format is None and timezone_treatment == "permit": + return FormattedTime + + dict_ = FormattedTime.__dict__.copy() + dict_["TIME_FORMAT"] = time_format + dict_["TIMEZONE_TREATMENT"] = timezone_treatment + + return type("FormattedTime", (FormattedTime, *FormattedTime.__bases__), dict_) + + class ReportingPeriod(dt.date): """A reporting period field, with the type of reporting period supplied""" diff --git a/src/dve/metadata_parser/model_generator.py b/src/dve/metadata_parser/model_generator.py index 53a82d8..7681b7f 100644 --- a/src/dve/metadata_parser/model_generator.py +++ b/src/dve/metadata_parser/model_generator.py @@ -72,6 +72,7 @@ def constr( "identifier": domain_types.identifier, "orgid": domain_types.OrgID, "formatteddatetime": domain_types.formatteddatetime, + "formattedtime": domain_types.formattedtime, "conformatteddate": domain_types.conformatteddate, "reportingperiodstart": domain_types.reportingperiod(reporting_period_type="start"), "reportingperiodend": domain_types.reportingperiod(reporting_period_type="end"), diff --git a/tests/test_core_engine/test_backends/fixtures.py b/tests/test_core_engine/test_backends/fixtures.py index 1f9ac23..14369b9 100644 --- a/tests/test_core_engine/test_backends/fixtures.py +++ b/tests/test_core_engine/test_backends/fixtures.py @@ -2,7 +2,7 @@ # pylint: disable=redefined-outer-name import json -from datetime import date, datetime +from datetime import date, datetime, time from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Dict, Iterator, List, Tuple @@ -83,10 +83,10 @@ def temp_duckdb_dir(): @pytest.fixture def temp_csv_file(temp_duckdb_dir: Path): - header: str = "ID,varchar_field,bigint_field,date_field,timestamp_field" + header: str = "ID,varchar_field,bigint_field,date_field,timestamp_field,time_field" typed_data = [ - [1, "hi", 3, date(2023, 1, 3), datetime(2023, 1, 3, 12, 0, 3)], - [2, "bye", 4, date(2023, 3, 7), datetime(2023, 5, 9, 15, 21, 53)], + [1, "hi", 3, date(2023, 1, 3), datetime(2023, 1, 3, 12, 0, 3), time(12, 0, 0)], + [2, "bye", 4, date(2023, 3, 7), datetime(2023, 5, 9, 15, 21, 53), time(13, 0 ,0)], ] class SimpleModel(BaseModel): @@ -95,6 +95,7 @@ class SimpleModel(BaseModel): bigint_field: int date_field: date timestamp_field: datetime + time_field: time with open(temp_duckdb_dir.joinpath("dummy.csv"), mode="w") as csv_file: csv_file.write(header + "\n") diff --git a/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py b/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py index 23f1534..5093150 100644 --- a/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py +++ b/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py @@ -41,6 +41,14 @@ def test_duckdb_data_contract_csv(temp_csv_file): "bigint_field": "NonNegativeInt", "date_field": "date", "timestamp_field": "datetime", + "time_field": { + "description": "test", + "callable": "formattedtime", + "constraints": { + "time_format": "%Y-%m-%d", + "timezone_treatment": "forbid" + } + } }, "reader_config": { ".csv": { diff --git a/tests/test_model_generation/test_domain_types.py b/tests/test_model_generation/test_domain_types.py index 9db587a..6ceee74 100644 --- a/tests/test_model_generation/test_domain_types.py +++ b/tests/test_model_generation/test_domain_types.py @@ -307,3 +307,56 @@ def test_reportingperiod_raises(field, value): data = {field: value} with pytest.raises(ValueError): model = ReportingPeriodModel(**data) + + +@pytest.mark.parametrize( + ["time_to_validate", "time_format", "timezone_treatment", "expected"], + [ + ["23:00:00", "%H:%M:%S", "forbid", dt.time(23, 0, 0)], + ["11:00:00", "%I:%M:%S", "forbid", dt.time(11, 0, 0)], + ["23:00:00Z", None, "require", dt.time(23, 0, 0, tzinfo=UTC)], + ["12:00:00Zam", None, "permit", dt.time(0, 0, 0, tzinfo=UTC)], + ["12:00:00pm", None, "forbid", dt.time(12, 0, 0)], + ["1970-01-01", "%Y-%m-%d", "forbid", dt.time(0, 0)], + # not great that it effectively returns incorrect time object here. However, this would be + # down to user error in setting up the dischema. + [dt.datetime(2025, 12, 1, 13, 0, 5), "%H:%M:%S", "forbid", dt.time(13, 0, 5)], + [dt.datetime(2025, 12, 1, 13, 0, 5, tzinfo=UTC), "%H:%M:%S", "require", dt.time(13, 0, 5, tzinfo=UTC)], + [dt.time(13, 0, 0), "%H:%M:%S", "forbid", dt.time(13, 0, 0)], + [dt.time(13, 0, 0, tzinfo=UTC), "%H:%M:%S", "permit", dt.time(13, 0, 0, tzinfo=UTC)], + [dt.time(13, 0, 0, tzinfo=UTC), "%H:%M:%S", "require", dt.time(13, 0, 0, tzinfo=UTC)], + ] +) +def test_formattedtime( + time_to_validate: str | dt.datetime | dt.time, + time_format: str, + timezone_treatment: str, + expected: dt.time +): + """Test serialised time objects can be parsed correctly when valid.""" + time_type = hct.formattedtime(time_format, timezone_treatment) + assert time_type.validate(time_to_validate) == expected + + +@pytest.mark.parametrize( + ["time_to_validate", "time_format", "timezone_treatment"], + [ + ["1970-01-01", "%H:%M:%S", "forbid",], + ["1970-01-01", "%H:%M:%S", "forbid",], + ["23:00:00", "%I:%M:%S", "permit",], + ["23:00:00", "%H:%M:%S", "require",], + ["23:00:00Z", "%I:%M:%S", "forbid",], + [dt.datetime(2025, 12, 1, 13, 0, 5, tzinfo=UTC), "%H:%M:%S", "forbid",], + [dt.time(13, 0, 5, tzinfo=UTC), "%H:%M:%S", "forbid",] + ] +) +def test_formattedtime_raises( + time_to_validate: str | dt.datetime | dt.time, time_format: str, timezone_treatment: str +): + """ + Test incorrect serialised objects can be handled correctly when attempting to parse into time + objects. + """ + time_type = hct.formattedtime(time_format, timezone_treatment) + with pytest.raises(ValueError): + time_type.validate(time_to_validate) # pylint: disable=W0106 From 8a853f50089e94a0a88f2082ea465faa1e091971 Mon Sep 17 00:00:00 2001 From: georgeRobertson <50412379+georgeRobertson@users.noreply.github.com> Date: Tue, 18 Nov 2025 17:52:40 +0000 Subject: [PATCH 2/3] docs: update domain types to include new time only type also added context around which implementation support given backend implementations as time only works on duckdb solution atm --- docs/detailed_guidance/domain_types.md | 28 +++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/detailed_guidance/domain_types.md b/docs/detailed_guidance/domain_types.md index df382de..62279b1 100644 --- a/docs/detailed_guidance/domain_types.md +++ b/docs/detailed_guidance/domain_types.md @@ -4,24 +4,24 @@ Domain types are custom defined pydantic types that solve common problems with u This might include Postcodes, NHS Numbers, dates with specific formats etc. Below is a list of defined types, their output type and any contraints. Nested beneath them are any constraints that area allowed and their default values if there are any. -| Defined Type | Output Type | Contraints & Defaults | -| ------------ | ----------- | --------------------- | -| NHSNumber | str | -| permissive_nhs_number | str |