diff --git a/docs/detailed_guidance/domain_types.md b/docs/detailed_guidance/domain_types.md
index df382de..62279b1 100644
--- a/docs/detailed_guidance/domain_types.md
+++ b/docs/detailed_guidance/domain_types.md
@@ -4,24 +4,24 @@ Domain types are custom defined pydantic types that solve common problems with u
 This might include Postcodes, NHS Numbers, dates with specific formats etc.
 Below is a list of defined types, their output type and any contraints. Nested beneath them are any constraints that area allowed and their default values if there are any.
 
-| Defined Type | Output Type | Contraints & Defaults |
-| ------------ | ----------- | --------------------- |
-| NHSNumber | str |
-| permissive_nhs_number | str | • warn_on_test_numbers = False |
-| Postcode | str |
-| OrgId | str |
-| conformatteddate | date | • date_format: str<br>• ge: date<br>• le: date<br>• gt: date<br>• lt: date |
-| formatteddatetime | datetime | • date_format: str<br>• timezone_treatment: one_of ["forbid", "permit", "require] = "permit" |
-| reportingperiod | date | • reporting_period_type: one_of ["start", "end"]<br>• date_format: str = "%Y-%m-%d" |
-| alphanumeric | str | • min_digits : NonNegativeInt = 1<br>• max_digits: PositiveInt = 1 |
-| identifier | str | • min_digits : NonNegativeInt = 1<br>• max_digits: PositiveInt = 1 |
+| Defined Type | Output Type | Constraints & Defaults | Supported Implementations |
+| ------------ | ----------- | ---------------------- | ------------------------- |
+| NHSNumber | str | | Spark, DuckDB |
+| permissive_nhs_number | str | • warn_on_test_numbers = False | Spark, DuckDB |
+| Postcode | str | | Spark, DuckDB |
+| OrgId | str | | Spark, DuckDB |
+| conformatteddate | date | • date_format: str<br>• ge: date<br>• le: date<br>• gt: date<br>• lt: date | Spark, DuckDB |
+| formatteddatetime | datetime | • date_format: str<br>• timezone_treatment: one_of ["forbid", "permit", "require"] = "permit" | Spark, DuckDB |
+| formattedtime | time | • time_format: str<br>• timezone_treatment: one_of ["forbid", "permit", "require"] = "permit" | DuckDB |
+| reportingperiod | date | • reporting_period_type: one_of ["start", "end"]<br>• date_format: str = "%Y-%m-%d" | Spark, DuckDB |
+| alphanumeric | str | • min_digits: NonNegativeInt = 1<br>• max_digits: PositiveInt = 1 | Spark, DuckDB |
+| identifier | str | • min_digits: NonNegativeInt = 1<br>• max_digits: PositiveInt = 1 | Spark, DuckDB |
 
-Other types that are allowed include:
+**Other types that are allowed include:**
 
 - str
 - int
 - date
 - datetime
 - Decimal
 - float
-
-And any types that are included in [pydantic version 1.10](https://docs.pydantic.dev/1.10/usage/types/#pydantic-types)
+- Any types that are included in [pydantic version 1.10](https://docs.pydantic.dev/1.10/usage/types/#pydantic-types)
diff --git a/src/dve/core_engine/backends/implementations/duckdb/__init__.py b/src/dve/core_engine/backends/implementations/duckdb/__init__.py
index d731064..996ec80 100644
--- a/src/dve/core_engine/backends/implementations/duckdb/__init__.py
+++ b/src/dve/core_engine/backends/implementations/duckdb/__init__.py
@@ -1,4 +1,5 @@
 """Implementation of duckdb backend"""
+
 from dve.core_engine.backends.implementations.duckdb.readers.json import DuckDBJSONReader
 from dve.core_engine.backends.readers import register_reader
 
diff --git a/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py b/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py
index ea1901e..a261f7b 100644
--- a/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py
+++ b/src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py
@@ -4,7 +4,7 @@
 """Helper objects for duckdb data contract implementation"""
 from collections.abc import Generator, Iterator
 from dataclasses import is_dataclass
-from datetime import date, datetime
+from datetime import date, datetime, time
 from decimal import Decimal
 from pathlib import Path
 from typing import Any, ClassVar, Union
@@ -87,6 +87,7 @@ def __call__(self):
     date: ddbtyp.DATE,
     datetime: ddbtyp.TIMESTAMP,
     Decimal: DDBDecimal()(),
+    time: ddbtyp.TIME,
 }
 """A mapping of Python types to the equivalent DuckDB types."""
 
diff --git a/src/dve/core_engine/backends/implementations/spark/spark_helpers.py b/src/dve/core_engine/backends/implementations/spark/spark_helpers.py
index 921b04e..7cb7b17 100644
--- a/src/dve/core_engine/backends/implementations/spark/spark_helpers.py
+++ b/src/dve/core_engine/backends/implementations/spark/spark_helpers.py
@@ -12,14 +12,7 @@
 from dataclasses import dataclass, is_dataclass
 from decimal import Decimal
 from functools import wraps
-from typing import (
-    Any,
-    ClassVar,
-    Optional,
-    TypeVar,
-    Union,
-    overload,
-)
+from typing import Any, ClassVar, Optional, TypeVar, Union, overload
 
 from delta.exceptions import ConcurrentAppendException, DeltaConcurrentModificationException
 from pydantic import BaseModel
diff --git a/src/dve/core_engine/backends/readers/xml.py b/src/dve/core_engine/backends/readers/xml.py
index bd7b8e4..5de23c4 100644
--- a/src/dve/core_engine/backends/readers/xml.py
+++ b/src/dve/core_engine/backends/readers/xml.py
@@ -3,14 +3,7 @@
 
 import re
 from collections.abc import Collection, Iterator
-from typing import (
-    IO,
-    Any,
-    GenericAlias,  # type: ignore
-    Optional,
-    Union,
-    overload
-)
+from typing import IO, Any, GenericAlias, Optional, Union, overload  # type: ignore
 
 import polars as pl
 from lxml import etree  # type: ignore
diff --git a/src/dve/core_engine/backends/utilities.py b/src/dve/core_engine/backends/utilities.py
index 9319780..bfa6f90 100644
--- a/src/dve/core_engine/backends/utilities.py
+++ b/src/dve/core_engine/backends/utilities.py
@@ -2,10 +2,10 @@
 
 import sys
 from dataclasses import is_dataclass
-from datetime import date, datetime
+from datetime import date, datetime, time
 from decimal import Decimal
-from typing import Any, ClassVar, Union
 from typing import GenericAlias  # type: ignore
+from typing import Any, ClassVar, Union
 
 import polars as pl  # type: ignore
 from polars.datatypes.classes import DataTypeClass as PolarsType
@@ -33,13 +33,16 @@
     date: pl.Date,  # type: ignore
     datetime: pl.Datetime,  # type: ignore
     Decimal: pl.Utf8,  # type: ignore
+    time: pl.Time,  # type: ignore
 }
 """A mapping of Python types to the equivalent Polars types."""
 
 
 def stringify_type(type_: Union[type, GenericAlias]) -> type:
     """Stringify an individual type."""
-    if isinstance(type_, type) and not isinstance(type_, GenericAlias):  # A model, return the contents. # pylint: disable=C0301
+    if isinstance(type_, type) and not isinstance(
+        type_, GenericAlias
+    ):  # A model, return the contents. # pylint: disable=C0301
         if issubclass(type_, BaseModel):
             return stringify_model(type_)
 
diff --git a/src/dve/core_engine/message.py b/src/dve/core_engine/message.py
index d81acde..7dd4f02 100644
--- a/src/dve/core_engine/message.py
+++ b/src/dve/core_engine/message.py
@@ -2,8 +2,8 @@
 
 import copy
 import datetime as dt
-import operator
 import json
+import operator
 from collections.abc import Callable
 from decimal import Decimal
 from functools import reduce
diff --git a/src/dve/core_engine/type_hints.py b/src/dve/core_engine/type_hints.py
index a6c0c44..ac6cf2a 100644
--- a/src/dve/core_engine/type_hints.py
+++ b/src/dve/core_engine/type_hints.py
@@ -6,12 +6,14 @@
 from pathlib import Path
 from queue import Queue as ThreadQueue
 from typing import TYPE_CHECKING, Any, List, Optional, TypeVar, Union  # pylint: disable=W1901
-# TODO - cannot remove List from Typing. See L60 for details.
 
 from pyspark.sql import DataFrame
 from pyspark.sql.types import StructType
 from typing_extensions import Literal, ParamSpec, get_args
 
+# TODO - cannot remove List from Typing. See L60 for details.
+
+
 if TYPE_CHECKING:  # pragma: no cover
     from dve.core_engine.message import FeedbackMessage
 
diff --git a/src/dve/metadata_parser/domain_types.py b/src/dve/metadata_parser/domain_types.py
index c944278..3153d26 100644
--- a/src/dve/metadata_parser/domain_types.py
+++ b/src/dve/metadata_parser/domain_types.py
@@ -392,6 +392,97 @@ def __get_validators__(cls) -> Iterator[classmethod]:
         yield cls.validate  # type: ignore
 
 
+class FormattedTime(dt.time):
+    """A time, provided as a datetime or a string in a specific format."""
+
+    TIME_FORMAT: ClassVar[Optional[str]] = None
+    """The specific format of the time."""
+    TIMEZONE_TREATMENT: ClassVar[Literal["forbid", "permit", "require"]] = "permit"
+    """How to treat the presence of timezone-related information."""
+    DEFAULT_PATTERNS: Sequence[str] = list(
+        # 24 hour time pattern combinations
+        map(
+            "".join,
+            itertools.product(
+                ("%H:%M:%S", "%H%M%S"),
+                ("", ".%f"),
+                ("%p", "%P", ""),
+                ("%z", ""),
+            ),
+        )
+    ) + list(
+        # 12 hour time pattern combinations
+        map(
+            "".join,
+            itertools.product(
+                ("%I:%M:%S", "%I%M%S"),
+                ("", ".%f"),
+                ("%z", ""),
+                (" %p", "%p", "%P", " %P", ""),
+            ),
+        )
+    )
+    """A sequence of time format patterns to try if `TIME_FORMAT` is unset."""
+
+    @classmethod
+    def convert_to_time(cls, value: dt.datetime) -> dt.time:
+        """
+        Convert `datetime.datetime` to `datetime.time`. If datetime contains timezone info, that
+        will be retained.
+        """
+        if value.tzinfo:
+            return value.timetz()
+
+        return value.time()
+
+    @classmethod
+    def parse_time(cls, string: str) -> dt.time:
+        """Attempt to parse a time using various formats in sequence."""
+        string = string.strip()
+        if string.endswith("Z"):  # Convert 'zulu' time to UTC.
+            string = string[:-1] + "+00:00"
+
+        for pattern in cls.DEFAULT_PATTERNS:
+            try:
+                datetime = dt.datetime.strptime(string, pattern)
+            except ValueError:
+                continue
+
+            time = cls.convert_to_time(datetime)
+
+            return time  # pragma: no cover
+        raise ValueError("Unable to parse provided time")
+
+    @classmethod
+    def validate(cls, value: Union[dt.time, dt.datetime, str]) -> dt.time | None:
+        """Validate a passed time, datetime or string."""
+        if value is None:
+            return value
+
+        if isinstance(value, dt.time):
+            new_time = value
+        elif isinstance(value, dt.datetime):
+            new_time = cls.convert_to_time(value)
+        else:
+            if cls.TIME_FORMAT is not None:
+                try:
+                    new_time = dt.datetime.strptime(value, cls.TIME_FORMAT)  # type: ignore
+                    new_time = cls.convert_to_time(new_time)  # type: ignore
+                except ValueError as err:
+                    raise ValueError(
+                        f"Unable to parse provided time in format {cls.TIME_FORMAT}"
+                    ) from err
+            else:
+                new_time = cls.parse_time(value)
+
+        if cls.TIMEZONE_TREATMENT == "forbid" and new_time.tzinfo:
+            raise ValueError("Provided time has timezone, but this is forbidden for this field")
+        if cls.TIMEZONE_TREATMENT == "require" and not new_time.tzinfo:
+            raise ValueError("Provided time missing timezone, but this is required for this field")
+
+        return new_time
+
+
 @lru_cache()
 @validate_arguments
 def formatteddatetime(
@@ -412,6 +503,23 @@ def formatteddatetime(
     return type("FormattedDatetime", (FormattedDatetime, *FormattedDatetime.__bases__), dict_)
 
 
+@lru_cache()
+@validate_arguments
+def formattedtime(
+    time_format: Optional[str] = None,
+    timezone_treatment: Literal["forbid", "permit", "require"] = "permit",
+) -> type[FormattedTime]:
+    """Return a formatted time class with a set time format and timezone treatment."""
+    if time_format is None and timezone_treatment == "permit":
+        return FormattedTime
+
+    dict_ = FormattedTime.__dict__.copy()
+    dict_["TIME_FORMAT"] = time_format
+    dict_["TIMEZONE_TREATMENT"] = timezone_treatment
+
+    return type("FormattedTime", (FormattedTime, *FormattedTime.__bases__), dict_)
+
+
 class ReportingPeriod(dt.date):
     """A reporting period field, with the type of reporting period supplied"""
 
diff --git a/src/dve/metadata_parser/model_generator.py b/src/dve/metadata_parser/model_generator.py
index 53a82d8..7681b7f 100644
--- a/src/dve/metadata_parser/model_generator.py
+++ b/src/dve/metadata_parser/model_generator.py
@@ -72,6 +72,7 @@ def constr(
     "identifier": domain_types.identifier,
     "orgid": domain_types.OrgID,
     "formatteddatetime": domain_types.formatteddatetime,
+    "formattedtime": domain_types.formattedtime,
     "conformatteddate": domain_types.conformatteddate,
     "reportingperiodstart": domain_types.reportingperiod(reporting_period_type="start"),
     "reportingperiodend": domain_types.reportingperiod(reporting_period_type="end"),
diff --git a/tests/test_core_engine/test_backends/fixtures.py b/tests/test_core_engine/test_backends/fixtures.py
index 1f9ac23..14369b9 100644
--- a/tests/test_core_engine/test_backends/fixtures.py
+++ b/tests/test_core_engine/test_backends/fixtures.py
@@ -2,7 +2,7 @@
 # pylint: disable=redefined-outer-name
 
 import json
-from datetime import date, datetime
+from datetime import date, datetime, time
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, Iterator, List, Tuple
@@ -83,10 +83,10 @@ def temp_duckdb_dir():
 @pytest.fixture
 def temp_csv_file(temp_duckdb_dir: Path):
-    header: str = "ID,varchar_field,bigint_field,date_field,timestamp_field"
+    header: str = "ID,varchar_field,bigint_field,date_field,timestamp_field,time_field"
     typed_data = [
-        [1, "hi", 3, date(2023, 1, 3), datetime(2023, 1, 3, 12, 0, 3)],
-        [2, "bye", 4, date(2023, 3, 7), datetime(2023, 5, 9, 15, 21, 53)],
+        [1, "hi", 3, date(2023, 1, 3), datetime(2023, 1, 3, 12, 0, 3), time(12, 0, 0)],
+        [2, "bye", 4, date(2023, 3, 7), datetime(2023, 5, 9, 15, 21, 53), time(13, 0, 0)],
     ]
 
     class SimpleModel(BaseModel):
@@ -95,6 +95,7 @@ class SimpleModel(BaseModel):
         ID: int
         varchar_field: str
         bigint_field: int
         date_field: date
         timestamp_field: datetime
+        time_field: time
 
     with open(temp_duckdb_dir.joinpath("dummy.csv"), mode="w") as csv_file:
         csv_file.write(header + "\n")
diff --git a/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py b/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py
index 23f1534..5093150 100644
--- a/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py
+++ b/tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py
@@ -41,6 +41,14 @@ def test_duckdb_data_contract_csv(temp_csv_file):
             "bigint_field": "NonNegativeInt",
             "date_field": "date",
             "timestamp_field": "datetime",
+            "time_field": {
+                "description": "test",
+                "callable": "formattedtime",
+                "constraints": {
+                    "time_format": "%Y-%m-%d",
+                    "timezone_treatment": "forbid"
+                }
+            }
         },
         "reader_config": {
             ".csv": {
diff --git a/tests/test_model_generation/test_domain_types.py b/tests/test_model_generation/test_domain_types.py
index 9db587a..6ceee74 100644
--- a/tests/test_model_generation/test_domain_types.py
+++ b/tests/test_model_generation/test_domain_types.py
@@ -307,3 +307,56 @@ def test_reportingperiod_raises(field, value):
     data = {field: value}
     with pytest.raises(ValueError):
         model = ReportingPeriodModel(**data)
+
+
+@pytest.mark.parametrize(
+    ["time_to_validate", "time_format", "timezone_treatment", "expected"],
+    [
+        ["23:00:00", "%H:%M:%S", "forbid", dt.time(23, 0, 0)],
+        ["11:00:00", "%I:%M:%S", "forbid", dt.time(11, 0, 0)],
+        ["23:00:00Z", None, "require", dt.time(23, 0, 0, tzinfo=UTC)],
+        ["12:00:00Zam", None, "permit", dt.time(0, 0, 0, tzinfo=UTC)],
+        ["12:00:00pm", None, "forbid", dt.time(12, 0, 0)],
+        ["1970-01-01", "%Y-%m-%d", "forbid", dt.time(0, 0)],
+        # not great that it effectively returns incorrect time object here. However, this would be
+        # down to user error in setting up the dischema.
+        [dt.datetime(2025, 12, 1, 13, 0, 5), "%H:%M:%S", "forbid", dt.time(13, 0, 5)],
+        [dt.datetime(2025, 12, 1, 13, 0, 5, tzinfo=UTC), "%H:%M:%S", "require", dt.time(13, 0, 5, tzinfo=UTC)],
+        [dt.time(13, 0, 0), "%H:%M:%S", "forbid", dt.time(13, 0, 0)],
+        [dt.time(13, 0, 0, tzinfo=UTC), "%H:%M:%S", "permit", dt.time(13, 0, 0, tzinfo=UTC)],
+        [dt.time(13, 0, 0, tzinfo=UTC), "%H:%M:%S", "require", dt.time(13, 0, 0, tzinfo=UTC)],
+    ]
+)
+def test_formattedtime(
+    time_to_validate: str | dt.datetime | dt.time,
+    time_format: str,
+    timezone_treatment: str,
+    expected: dt.time
+):
+    """Test serialised time objects can be parsed correctly when valid."""
+    time_type = hct.formattedtime(time_format, timezone_treatment)
+    assert time_type.validate(time_to_validate) == expected
+
+
+@pytest.mark.parametrize(
+    ["time_to_validate", "time_format", "timezone_treatment"],
+    [
+        ["1970-01-01", "%H:%M:%S", "forbid",],
+        ["1970-01-01", "%H:%M:%S", "forbid",],
+        ["23:00:00", "%I:%M:%S", "permit",],
+        ["23:00:00", "%H:%M:%S", "require",],
+        ["23:00:00Z", "%I:%M:%S", "forbid",],
+        [dt.datetime(2025, 12, 1, 13, 0, 5, tzinfo=UTC), "%H:%M:%S", "forbid",],
+        [dt.time(13, 0, 5, tzinfo=UTC), "%H:%M:%S", "forbid",]
+    ]
+)
+def test_formattedtime_raises(
+    time_to_validate: str | dt.datetime | dt.time, time_format: str, timezone_treatment: str
+):
+    """
+    Test incorrect serialised objects can be handled correctly when attempting to parse into time
+    objects.
+    """
+    time_type = hct.formattedtime(time_format, timezone_treatment)
+    with pytest.raises(ValueError):
+        time_type.validate(time_to_validate)  # pylint: disable=W0106
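
A minimal usage sketch of the new formattedtime factory added above in src/dve/metadata_parser/domain_types.py, mirroring how the new tests in tests/test_model_generation/test_domain_types.py drive it through validate(); the StrictTime and LenientTime names below are illustrative only and not part of the codebase.

# Sketch only: exercises the formattedtime factory directly, as the added tests do.
import datetime as dt

from dve.metadata_parser import domain_types

# A time type that expects "HH:MM:SS" strings and rejects timezone information.
StrictTime = domain_types.formattedtime(time_format="%H:%M:%S", timezone_treatment="forbid")

assert StrictTime.validate("23:00:00") == dt.time(23, 0)
assert StrictTime.validate(dt.time(13, 0)) == dt.time(13, 0)

# With no time_format set, DEFAULT_PATTERNS are tried in turn; a trailing "Z" is read as UTC.
LenientTime = domain_types.formattedtime()
assert LenientTime.validate("23:00:00Z") == dt.time(23, 0, tzinfo=dt.timezone.utc)

# A timezone-aware value is rejected when timezone_treatment="forbid".
try:
    StrictTime.validate(dt.time(13, 0, tzinfo=dt.timezone.utc))
except ValueError as err:
    print(f"rejected as expected: {err}")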