28 changes: 14 additions & 14 deletions docs/detailed_guidance/domain_types.md
@@ -4,24 +4,24 @@ Domain types are custom defined pydantic types that solve common problems with u
This might include Postcodes, NHS Numbers, dates with specific formats etc.

Below is a list of defined types, their output type and any constraints. Nested beneath them are any constraints that are allowed and their default values if there are any.
| Defined Type | Output Type | Constraints & Defaults |
| ------------ | ----------- | --------------------- |
| NHSNumber | str |
| permissive_nhs_number | str | <li> warn_on_test_numbers = False </li> |
| Postcode | str |
| OrgId | str |
| conformatteddate | date | <li>date_format: str</li><li>ge: date</li><li>le: date</li><li>gt: date</li><li>lt: date</li> |
| formatteddatetime | datetime | <li>date_format: str </li><li>timezone_treatment: one_of ["forbid", "permit", "require"] = "permit"</li> |
| reportingperiod | date | <li>reporting_period_type: one_of ["start", "end"]</li><li>date_format: str = "%Y-%m-%d"</li> |
| alphanumeric | str | <li>min_digits : NonNegativeInt = 1</li><li>max_digits: PositiveInt = 1</li> |
| identifier | str | <li>min_digits : NonNegativeInt = 1</li><li>max_digits: PositiveInt = 1</li>
| Defined Type | Output Type | Constraints & Defaults | Supported Implementations |
| ------------ | ----------- | --------------------- | ------------------------- |
| NHSNumber | str | | Spark, DuckDB |
| permissive_nhs_number | str | <li> warn_on_test_numbers = False </li> | Spark, DuckDB |
| Postcode | str | | Spark, DuckDB |
| OrgId | str | | Spark, DuckDB |
| conformatteddate | date | <li>date_format: str</li><li>ge: date</li><li>le: date</li><li>gt: date</li><li>lt: date</li> | Spark, DuckDB |
| formatteddatetime | datetime | <li>date_format: str </li><li>timezone_treatment: one_of ["forbid", "permit", "require"] = "permit"</li> | Spark, DuckDB |
| formattedtime | time | <li>time_format: str </li><li>timezone_treatment: one_of ["forbid", "permit", "require"] = "permit"</li> | DuckDB |
| reportingperiod | date | <li>reporting_period_type: one_of ["start", "end"]</li><li>date_format: str = "%Y-%m-%d"</li> | Spark, DuckDB |
| alphanumeric | str | <li>min_digits : NonNegativeInt = 1</li><li>max_digits: PositiveInt = 1</li> | Spark, DuckDB |
| identifier | str | <li>min_digits : NonNegativeInt = 1</li><li>max_digits: PositiveInt = 1</li> | Spark, DuckDB |

Other types that are allowed include:
**Other types that are allowed include:**
- str
- int
- date
- datetime
- Decimal
- float

And any types that are included in [pydantic version 1.10](https://docs.pydantic.dev/1.10/usage/types/#pydantic-types)
- Any types that are included in [pydantic version 1.10](https://docs.pydantic.dev/1.10/usage/types/#pydantic-types)
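
As a quick illustration of how these constraints combine in practice, here is a minimal sketch that exercises the `formattedtime` type directly via its `validate` hook, assuming the factory from `src/dve/metadata_parser/domain_types.py` is importable as `dve.metadata_parser.domain_types` (adjust the import to your packaging):

```python
# Minimal sketch only: exercises formattedtime via its validate classmethod.
# Import path assumed from src/dve/metadata_parser/domain_types.py.
import datetime as dt

from dve.metadata_parser.domain_types import formattedtime

# A time type that expects HH:MM:SS strings and forbids timezone information.
StrictTime = formattedtime(time_format="%H:%M:%S", timezone_treatment="forbid")

assert StrictTime.validate("09:30:00") == dt.time(9, 30)

try:
    # Timezone info is present, but timezone_treatment is "forbid", so this raises.
    StrictTime.validate(dt.time(9, 30, tzinfo=dt.timezone.utc))
except ValueError as err:
    print(err)  # "Provided time has timezone, but this is forbidden for this field"
```

In a data contract these constraints would normally be supplied through the schema (see the `formattedtime` entry in the test configuration further down) rather than by calling the factory directly.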
@@ -1,4 +1,5 @@
"""Implementation of duckdb backend"""

from dve.core_engine.backends.implementations.duckdb.readers.json import DuckDBJSONReader
from dve.core_engine.backends.readers import register_reader

@@ -4,7 +4,7 @@
"""Helper objects for duckdb data contract implementation"""
from collections.abc import Generator, Iterator
from dataclasses import is_dataclass
from datetime import date, datetime
from datetime import date, datetime, time
from decimal import Decimal
from pathlib import Path
from typing import Any, ClassVar, Union
@@ -87,6 +87,7 @@ def __call__(self):
    date: ddbtyp.DATE,
    datetime: ddbtyp.TIMESTAMP,
    Decimal: DDBDecimal()(),
    time: ddbtyp.TIME,
}
"""A mapping of Python types to the equivalent DuckDB types."""

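For orientation, a rough sketch of how a Python-to-DuckDB type mapping like the one extended above might be consulted when emitting column definitions. The helper below is illustrative only and not part of the project's API:

```python
# Illustrative only: a stand-in for the real mapping, which holds duckdb type objects.
from datetime import date, datetime, time

PYTHON_TO_DUCKDB = {date: "DATE", datetime: "TIMESTAMP", time: "TIME"}


def column_clause(name: str, python_type: type) -> str:
    """Render a single column clause from a Python annotation via the mapping."""
    return f"{name} {PYTHON_TO_DUCKDB[python_type]}"


# With the new time entry, datetime.time annotations resolve to DuckDB TIME columns.
assert column_clause("time_field", time) == "time_field TIME"
```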
@@ -12,14 +12,7 @@
from dataclasses import dataclass, is_dataclass
from decimal import Decimal
from functools import wraps
from typing import (
    Any,
    ClassVar,
    Optional,
    TypeVar,
    Union,
    overload,
)
from typing import Any, ClassVar, Optional, TypeVar, Union, overload

from delta.exceptions import ConcurrentAppendException, DeltaConcurrentModificationException
from pydantic import BaseModel
9 changes: 1 addition & 8 deletions src/dve/core_engine/backends/readers/xml.py
@@ -3,14 +3,7 @@

import re
from collections.abc import Collection, Iterator
from typing import (
    IO,
    Any,
    GenericAlias,  # type: ignore
    Optional,
    Union,
    overload
)
from typing import IO, Any, GenericAlias, Optional, Union, overload # type: ignore

import polars as pl
from lxml import etree # type: ignore
9 changes: 6 additions & 3 deletions src/dve/core_engine/backends/utilities.py
@@ -2,10 +2,10 @@

import sys
from dataclasses import is_dataclass
from datetime import date, datetime
from datetime import date, datetime, time
from decimal import Decimal
from typing import Any, ClassVar, Union
from typing import GenericAlias # type: ignore
from typing import Any, ClassVar, Union

import polars as pl # type: ignore
from polars.datatypes.classes import DataTypeClass as PolarsType
@@ -33,13 +33,16 @@
    date: pl.Date,  # type: ignore
    datetime: pl.Datetime,  # type: ignore
    Decimal: pl.Utf8,  # type: ignore
    time: pl.Time,  # type: ignore
}
"""A mapping of Python types to the equivalent Polars types."""


def stringify_type(type_: Union[type, GenericAlias]) -> type:
"""Stringify an individual type."""
if isinstance(type_, type) and not isinstance(type_, GenericAlias): # A model, return the contents. # pylint: disable=C0301
if isinstance(type_, type) and not isinstance(
type_, GenericAlias
): # A model, return the contents. # pylint: disable=C0301
if issubclass(type_, BaseModel):
return stringify_model(type_)

2 changes: 1 addition & 1 deletion src/dve/core_engine/message.py
@@ -2,8 +2,8 @@

import copy
import datetime as dt
import operator
import json
import operator
from collections.abc import Callable
from decimal import Decimal
from functools import reduce
4 changes: 3 additions & 1 deletion src/dve/core_engine/type_hints.py
@@ -6,12 +6,14 @@
from pathlib import Path
from queue import Queue as ThreadQueue
from typing import TYPE_CHECKING, Any, List, Optional, TypeVar, Union # pylint: disable=W1901
# TODO - cannot remove List from Typing. See L60 for details.

from pyspark.sql import DataFrame
from pyspark.sql.types import StructType
from typing_extensions import Literal, ParamSpec, get_args

# TODO - cannot remove List from Typing. See L60 for details.



if TYPE_CHECKING: # pragma: no cover
from dve.core_engine.message import FeedbackMessage
108 changes: 108 additions & 0 deletions src/dve/metadata_parser/domain_types.py
@@ -392,6 +392,97 @@ def __get_validators__(cls) -> Iterator[classmethod]:
        yield cls.validate  # type: ignore


class FormattedTime(dt.time):
    """A time, provided as a datetime or a string in a specific format."""

    TIME_FORMAT: ClassVar[Optional[str]] = None
    """The specific format of the time."""
    TIMEZONE_TREATMENT: ClassVar[Literal["forbid", "permit", "require"]] = "permit"
    """How to treat the presence of timezone-related information."""
    DEFAULT_PATTERNS: Sequence[str] = list(
        # 24 hour time pattern combinations
        map(
            "".join,
            itertools.product(
                ("%H:%M:%S", "%H%M%S"),
                ("", ".%f"),
                ("%p", "%P", ""),
                ("%z", ""),
            ),
        )
    ) + list(
        # 12 hour time pattern combinations
        map(
            "".join,
            itertools.product(
                ("%I:%M:%S", "%I%M%S"),
                ("", ".%f"),
                ("%z", ""),
                (" %p", "%p", "%P", " %P", ""),
            ),
        )
    )
    """A sequence of time format patterns to try if `TIME_FORMAT` is unset."""

    @classmethod
    def convert_to_time(cls, value: dt.datetime) -> dt.time:
        """
        Convert `datetime.datetime` to `datetime.time`. If the datetime contains timezone info,
        that will be retained.
        """
        if value.tzinfo:
            return value.timetz()

        return value.time()

    @classmethod
    def parse_time(cls, string: str) -> dt.time:
        """Attempt to parse a time using various formats in sequence."""
        string = string.strip()
        if string.endswith("Z"):  # Convert 'zulu' time to UTC.
            string = string[:-1] + "+00:00"

        for pattern in cls.DEFAULT_PATTERNS:
            try:
                datetime = dt.datetime.strptime(string, pattern)
            except ValueError:
                continue

            time = cls.convert_to_time(datetime)

            return time  # pragma: no cover
        raise ValueError("Unable to parse provided time")

    @classmethod
    def validate(cls, value: Union[dt.time, dt.datetime, str]) -> dt.time | None:
        """Validate a passed time, datetime or string."""
        if value is None:
            return value

        if isinstance(value, dt.time):
            new_time = value
        elif isinstance(value, dt.datetime):
            new_time = cls.convert_to_time(value)
        else:
            if cls.TIME_FORMAT is not None:
                try:
                    new_time = dt.datetime.strptime(value, cls.TIME_FORMAT)  # type: ignore
                    new_time = cls.convert_to_time(new_time)  # type: ignore
                except ValueError as err:
                    raise ValueError(
                        f"Unable to parse provided time in format {cls.TIME_FORMAT}"
                    ) from err
            else:
                new_time = cls.parse_time(value)

        if cls.TIMEZONE_TREATMENT == "forbid" and new_time.tzinfo:
            raise ValueError("Provided time has timezone, but this is forbidden for this field")
        if cls.TIMEZONE_TREATMENT == "require" and not new_time.tzinfo:
            raise ValueError("Provided time missing timezone, but this is required for this field")

        return new_time


@lru_cache()
@validate_arguments
def formatteddatetime(
@@ -412,6 +503,23 @@ def formatteddatetime(
return type("FormattedDatetime", (FormattedDatetime, *FormattedDatetime.__bases__), dict_)


@lru_cache()
@validate_arguments
def formattedtime(
    time_format: Optional[str] = None,
    timezone_treatment: Literal["forbid", "permit", "require"] = "permit",
) -> type[FormattedTime]:
    """Return a formatted time class with a set time format and timezone treatment."""
    if time_format is None and timezone_treatment == "permit":
        return FormattedTime

    dict_ = FormattedTime.__dict__.copy()
    dict_["TIME_FORMAT"] = time_format
    dict_["TIMEZONE_TREATMENT"] = timezone_treatment

    return type("FormattedTime", (FormattedTime, *FormattedTime.__bases__), dict_)


class ReportingPeriod(dt.date):
"""A reporting period field, with the type of reporting period supplied"""

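A brief note on the factory pattern added above: because `formattedtime` is wrapped in `lru_cache`, repeated calls with the same constraints hand back the same generated class, and the unconstrained call returns `FormattedTime` itself. A minimal sketch, assuming the module is importable as `dve.metadata_parser.domain_types`:

```python
# Sketch of the caching behaviour of the formattedtime factory shown above.
from dve.metadata_parser.domain_types import FormattedTime, formattedtime

# Default constraints: the base class is returned unchanged.
assert formattedtime() is FormattedTime

# Identical constraint sets hit the lru_cache, so models generated from the same
# schema entry share one concrete FormattedTime subclass.
first = formattedtime(time_format="%H:%M:%S", timezone_treatment="require")
second = formattedtime(time_format="%H:%M:%S", timezone_treatment="require")
assert first is second
assert issubclass(first, FormattedTime)
```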
1 change: 1 addition & 0 deletions src/dve/metadata_parser/model_generator.py
@@ -72,6 +72,7 @@ def constr(
"identifier": domain_types.identifier,
"orgid": domain_types.OrgID,
"formatteddatetime": domain_types.formatteddatetime,
"formattedtime": domain_types.formattedtime,
"conformatteddate": domain_types.conformatteddate,
"reportingperiodstart": domain_types.reportingperiod(reporting_period_type="start"),
"reportingperiodend": domain_types.reportingperiod(reporting_period_type="end"),
9 changes: 5 additions & 4 deletions tests/test_core_engine/test_backends/fixtures.py
@@ -2,7 +2,7 @@

# pylint: disable=redefined-outer-name
import json
from datetime import date, datetime
from datetime import date, datetime, time
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Dict, Iterator, List, Tuple
@@ -83,10 +83,10 @@ def temp_duckdb_dir():

@pytest.fixture
def temp_csv_file(temp_duckdb_dir: Path):
    header: str = "ID,varchar_field,bigint_field,date_field,timestamp_field"
    header: str = "ID,varchar_field,bigint_field,date_field,timestamp_field,time_field"
    typed_data = [
        [1, "hi", 3, date(2023, 1, 3), datetime(2023, 1, 3, 12, 0, 3)],
        [2, "bye", 4, date(2023, 3, 7), datetime(2023, 5, 9, 15, 21, 53)],
        [1, "hi", 3, date(2023, 1, 3), datetime(2023, 1, 3, 12, 0, 3), time(12, 0, 0)],
        [2, "bye", 4, date(2023, 3, 7), datetime(2023, 5, 9, 15, 21, 53), time(13, 0, 0)],
    ]

    class SimpleModel(BaseModel):
@@ -95,6 +95,7 @@ class SimpleModel(BaseModel):
        bigint_field: int
        date_field: date
        timestamp_field: datetime
        time_field: time

    with open(temp_duckdb_dir.joinpath("dummy.csv"), mode="w") as csv_file:
        csv_file.write(header + "\n")
@@ -41,6 +41,14 @@ def test_duckdb_data_contract_csv(temp_csv_file):
"bigint_field": "NonNegativeInt",
"date_field": "date",
"timestamp_field": "datetime",
"time_field": {
"description": "test",
"callable": "formattedtime",
"constraints": {
"time_format": "%Y-%m-%d",
"timezone_treatment": "forbid"
}
}
},
"reader_config": {
".csv": {
53 changes: 53 additions & 0 deletions tests/test_model_generation/test_domain_types.py
@@ -307,3 +307,56 @@ def test_reportingperiod_raises(field, value):
    data = {field: value}
    with pytest.raises(ValueError):
        model = ReportingPeriodModel(**data)


@pytest.mark.parametrize(
    ["time_to_validate", "time_format", "timezone_treatment", "expected"],
    [
        ["23:00:00", "%H:%M:%S", "forbid", dt.time(23, 0, 0)],
        ["11:00:00", "%I:%M:%S", "forbid", dt.time(11, 0, 0)],
        ["23:00:00Z", None, "require", dt.time(23, 0, 0, tzinfo=UTC)],
        ["12:00:00Zam", None, "permit", dt.time(0, 0, 0, tzinfo=UTC)],
        ["12:00:00pm", None, "forbid", dt.time(12, 0, 0)],
        ["1970-01-01", "%Y-%m-%d", "forbid", dt.time(0, 0)],
        # Not great that it effectively returns an incorrect time object here. However, this
        # would be down to user error in setting up the dischema.
        [dt.datetime(2025, 12, 1, 13, 0, 5), "%H:%M:%S", "forbid", dt.time(13, 0, 5)],
        [dt.datetime(2025, 12, 1, 13, 0, 5, tzinfo=UTC), "%H:%M:%S", "require", dt.time(13, 0, 5, tzinfo=UTC)],
        [dt.time(13, 0, 0), "%H:%M:%S", "forbid", dt.time(13, 0, 0)],
        [dt.time(13, 0, 0, tzinfo=UTC), "%H:%M:%S", "permit", dt.time(13, 0, 0, tzinfo=UTC)],
        [dt.time(13, 0, 0, tzinfo=UTC), "%H:%M:%S", "require", dt.time(13, 0, 0, tzinfo=UTC)],
    ]
)
def test_formattedtime(
    time_to_validate: str | dt.datetime | dt.time,
    time_format: str,
    timezone_treatment: str,
    expected: dt.time
):
    """Test serialised time objects can be parsed correctly when valid."""
    time_type = hct.formattedtime(time_format, timezone_treatment)
    assert time_type.validate(time_to_validate) == expected


@pytest.mark.parametrize(
    ["time_to_validate", "time_format", "timezone_treatment"],
    [
        ["1970-01-01", "%H:%M:%S", "forbid"],
        ["1970-01-01", "%H:%M:%S", "forbid"],
        ["23:00:00", "%I:%M:%S", "permit"],
        ["23:00:00", "%H:%M:%S", "require"],
        ["23:00:00Z", "%I:%M:%S", "forbid"],
        [dt.datetime(2025, 12, 1, 13, 0, 5, tzinfo=UTC), "%H:%M:%S", "forbid"],
        [dt.time(13, 0, 5, tzinfo=UTC), "%H:%M:%S", "forbid"]
    ]
)
def test_formattedtime_raises(
    time_to_validate: str | dt.datetime | dt.time, time_format: str, timezone_treatment: str
):
    """
    Test incorrect serialised objects can be handled correctly when attempting to parse into time
    objects.
    """
    time_type = hct.formattedtime(time_format, timezone_treatment)
    with pytest.raises(ValueError):
        time_type.validate(time_to_validate)  # pylint: disable=W0106