From 64a01d08ca06cd7d42a8818260e016b64aa6a2f0 Mon Sep 17 00:00:00 2001 From: Benjamin Gutzmann Date: Tue, 20 Jan 2026 21:52:49 +0100 Subject: [PATCH 1/4] Treat identifier filters with missing/null `value` as a configurable placeholder for logging and naming --- CHANGELOG.md | 1 + README.md | 4 ++++ docs/configuration.md | 2 ++ src/koality/checks.py | 8 +++++++- src/koality/executor.py | 1 + src/koality/models.py | 2 ++ tests/unit/test_filters.py | 23 +++++++++++++++++++++++ 7 files changed, 40 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 116d21b..1c95900 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ Types of changes: - Allow setting `monitor_only` on check bundles and on individual checks - Map provider/table-not-found errors to a unified `table_exists` metric - Support identifier-type filters without explicit column/value for naming; add `identifier_format` option (`identifier`, `filter_name`, `column_name`) to control the result identifier column +- Treat identifier filters with missing/null `value` as a configurable placeholder for logging and naming (defaults to `ALL`) ## [0.9.0] - 2026-01-16 diff --git a/README.md b/README.md index 515199f..c5a66d8 100644 --- a/README.md +++ b/README.md @@ -193,6 +193,10 @@ Koality supports an `identifier` filter type which can be used to mark the field If an identifier-type filter is defined without a concrete `column` or `value` (for example in global `defaults`), it is treated as a naming-only hint and will not be turned into a WHERE clause; this is useful when you only want to control the result identifier column name (e.g., `SHOP_ID`) across checks. +Behavior for missing identifier values + +When an identifier-type filter is present but its `value` is missing or explicitly `null`, Koality substitutes a configurable placeholder for logging and naming (`defaults.identifier_placeholder`, default: `ALL`) to avoid `None` appearing in metric messages. 
You can override the placeholder at bundle or check level by setting `identifier_placeholder` in the corresponding defaults. + ### Filter Properties | Property | Description | diff --git a/docs/configuration.md b/docs/configuration.md index a93987e..70dc87e 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -406,6 +406,8 @@ filters: The identifier value appears in check results and failure messages. How it's formatted depends on the `identifier_format` global setting. +For more details about naming-only identifier filters and the `identifier_placeholder` option, see the guide: [Identifier filters and naming](../identifier_filters.md). + ### Date Filters When `type: date` is set, the value is automatically parsed as a date. Supported formats: diff --git a/src/koality/checks.py b/src/koality/checks.py index c7ec8c7..a6de543 100644 --- a/src/koality/checks.py +++ b/src/koality/checks.py @@ -41,6 +41,7 @@ def __init__( *, filters: dict[str, Any] | None = None, identifier_format: str = "identifier", + identifier_placeholder: str = "ALL", date_info: str | None = None, extra_info: str | None = None, monitor_only: bool = False, @@ -62,6 +63,7 @@ def __init__( # Identifier format configuration self.identifier_format = identifier_format + self.identifier_placeholder = identifier_placeholder # for where filter handling self.filters = self.get_filters(filters or {}) @@ -70,7 +72,11 @@ def __init__( identifier_filter_result = self.get_identifier_filter(self.filters) if identifier_filter_result: filter_name, filter_config = identifier_filter_result - value = filter_config.get("value", "ALL") + # If value key is missing or explicitly None, treat as identifier_placeholder (meaning "no specific value") + if "value" in filter_config and filter_config["value"] is not None: + value = filter_config["value"] + else: + value = self.identifier_placeholder column = filter_config.get("column", "") if self.identifier_format == "identifier": diff --git a/src/koality/executor.py 
b/src/koality/executor.py index fc07013..70dc613 100644 --- a/src/koality/executor.py +++ b/src/koality/executor.py @@ -318,6 +318,7 @@ def execute_checks(self) -> None: check_kwargs["database_accessor"] = self.config.database_accessor check_kwargs["database_provider"] = self.database_provider check_kwargs["identifier_format"] = self.config.defaults.identifier_format + check_kwargs["identifier_placeholder"] = self.config.defaults.identifier_placeholder check_instance = check_factory(**check_kwargs) self.checks.append(check_instance) diff --git a/src/koality/models.py b/src/koality/models.py index 3efdfde..56e0f08 100644 --- a/src/koality/models.py +++ b/src/koality/models.py @@ -179,6 +179,8 @@ class _GlobalDefaults(_Defaults): monitor_only: bool = False result_table: str | None = None identifier_format: IdentifierFormat = "identifier" + # Placeholder used when an identifier filter has no concrete value (e.g., naming-only filters) + identifier_placeholder: str = "ALL" @computed_field def persist_results(self) -> bool: diff --git a/tests/unit/test_filters.py b/tests/unit/test_filters.py index ed93ad2..2ad3585 100644 --- a/tests/unit/test_filters.py +++ b/tests/unit/test_filters.py @@ -249,6 +249,29 @@ def test_assemble_where_skips_naming_only_identifier(self) -> None: assert "shop_id" not in where_sql assert "DATE" in where_sql + def test_identifier_placeholder_used(self) -> None: + """When identifier filter has no value, identifier_placeholder is used in check identifier and column naming.""" + class DummyCheck(DataQualityCheck): + def assemble_query(self) -> str: + return "SELECT 1" + + def assemble_data_exists_query(self) -> str: + return "SELECT '' AS empty_table" + + def assemble_name(self) -> str: + return "dummy" + + chk = DummyCheck( + database_accessor="", + database_provider=None, + table="t", + filters={"shop_id": {"type": "identifier"}}, + identifier_format="filter_name", + identifier_placeholder="PLACEHOLDER", + ) + assert chk.identifier == 
"PLACEHOLDER" + assert chk.identifier_column == "SHOP_ID" + class TestOperatorFilters: """Tests for filter operator functionality.""" From fa73164e54bc385269a70d8f0af2f6e31778bf5d Mon Sep 17 00:00:00 2001 From: Benjamin Gutzmann Date: Tue, 20 Jan 2026 22:19:38 +0100 Subject: [PATCH 2/4] Add `identifier_placeholder` option to configure the placeholder value used when identifier filters lack a value --- CHANGELOG.md | 1 + README.md | 2 + docs/identifier_placeholder.md | 66 ++++++++++++++++++++++++++++++++ src/koality/checks.py | 18 +++++++++ tests/integration/test_checks.py | 7 ++++ tests/unit/test_filters.py | 1 + 6 files changed, 95 insertions(+) create mode 100644 docs/identifier_placeholder.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c95900..3bdea9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ Types of changes: - Map provider/table-not-found errors to a unified `table_exists` metric - Support identifier-type filters without explicit column/value for naming; add `identifier_format` option (`identifier`, `filter_name`, `column_name`) to control the result identifier column - Treat identifier filters with missing/null `value` as a configurable placeholder for logging and naming (defaults to `ALL`) +- Add `identifier_placeholder` option to configure the placeholder value used when identifier filters lack a value; defaults to `ALL` and is applied to the result IDENTIFIER column and logging for clearer partition naming. ## [0.9.0] - 2026-01-16 diff --git a/README.md b/README.md index c5a66d8..fa22444 100644 --- a/README.md +++ b/README.md @@ -197,6 +197,8 @@ Behavior for missing identifier values When an identifier-type filter is present but its `value` is missing or explicitly `null`, Koality substitutes a configurable placeholder for logging and naming (`defaults.identifier_placeholder`, default: `ALL`) to avoid `None` appearing in metric messages. 
You can override the placeholder at bundle or check level by setting `identifier_placeholder` in the corresponding defaults. +Additional docs: see docs/identifier_placeholder.md for usage examples and configuration details. + ### Filter Properties | Property | Description | diff --git a/docs/identifier_placeholder.md b/docs/identifier_placeholder.md new file mode 100644 index 0000000..b545108 --- /dev/null +++ b/docs/identifier_placeholder.md @@ -0,0 +1,66 @@ +Identifier Placeholder + +When an identifier-type filter (type: identifier) is defined but the filter's value is missing or explicitly `null`, Koality uses a configurable placeholder string to fill the result IDENTIFIER column and logging messages. This avoids `None` or empty identifiers in persisted results and monitoring UIs. + +Configuration + +You can set the placeholder in the following locations (more specific levels override less specific ones): + +- `defaults.identifier_placeholder` (global default) +- `check_bundles.<bundle_name>.defaults.identifier_placeholder` (bundle-level) +- `check_bundles.<bundle_name>.checks.<check>.identifier_placeholder` (check-level) + +If not set, the placeholder defaults to `ALL`. + +Examples + +1) Global default placeholder: + +```yaml +defaults: + identifier_placeholder: UNKNOWN + filters: + shop_id: + column: shop_code + type: identifier + value: null +``` + +Result: IDENTIFIER uses `UNKNOWN` for checks that rely on the `shop_id` identifier when no concrete value is provided. 
+ +2) Bundle-level override: + +```yaml +check_bundles: + - name: my_bundle + defaults: + identifier_placeholder: ALL_SHOPS + filters: + shop_id: + column: shop_code + type: identifier + value: null +``` + +3) Check-level override: + +```yaml +check_bundles: + - name: my_bundle + defaults: + filters: + shop_id: + column: shop_code + type: identifier + checks: + - check_type: CountCheck + identifier_placeholder: SHOP_UNKNOWN + filters: + shop_id: + value: null +``` + +Notes + +- The placeholder is only applied for naming/logging and does not produce a WHERE clause when the identifier filter lacks a `value` and no column is provided. +- Use a descriptive placeholder (e.g., `ALL`, `UNKNOWN`, `ALL_SHOPS`) to make results easier to interpret in dashboards and logs. diff --git a/src/koality/checks.py b/src/koality/checks.py index a6de543..7bba9f4 100644 --- a/src/koality/checks.py +++ b/src/koality/checks.py @@ -471,6 +471,7 @@ def __init__( *, filters: dict[str, Any] | None = None, identifier_format: str = "identifier", + identifier_placeholder: str = "ALL", date_info: str | None = None, extra_info: str | None = None, monitor_only: bool = False, @@ -487,6 +488,7 @@ def __init__( upper_threshold=upper_threshold, filters=filters, identifier_format=identifier_format, + identifier_placeholder=identifier_placeholder, date_info=date_info, extra_info=extra_info, monitor_only=monitor_only, @@ -569,6 +571,7 @@ def __init__( *, filters: dict[str, Any] | None = None, identifier_format: str = "identifier", + identifier_placeholder: str = "ALL", date_info: str | None = None, extra_info: str | None = None, monitor_only: bool = False, @@ -584,6 +587,7 @@ def __init__( upper_threshold=upper_threshold, filters=filters, identifier_format=identifier_format, + identifier_placeholder=identifier_placeholder, date_info=date_info, extra_info=extra_info, monitor_only=monitor_only, @@ -634,6 +638,7 @@ def __init__( *, filters: dict[str, Any] | None = None, identifier_format: str = "identifier", 
+ identifier_placeholder: str = "ALL", date_info: str | None = None, extra_info: str | None = None, monitor_only: bool = False, @@ -651,6 +656,7 @@ def __init__( upper_threshold=upper_threshold, filters=filters, identifier_format=identifier_format, + identifier_placeholder=identifier_placeholder, date_info=date_info, extra_info=extra_info, monitor_only=monitor_only, @@ -705,6 +711,7 @@ def __init__( extra_info: str | None = None, filters: dict[str, Any] | None = None, identifier_format: str = "identifier", + identifier_placeholder: str = "ALL", date_info: str | None = None, ) -> None: """Initialize the values in set check.""" @@ -724,6 +731,7 @@ def __init__( upper_threshold=upper_threshold, filters=filters, identifier_format=identifier_format, + identifier_placeholder=identifier_placeholder, date_info=date_info, extra_info=extra_info, monitor_only=monitor_only, @@ -861,6 +869,7 @@ def __init__( *, filters: dict[str, Any] | None = None, identifier_format: str = "identifier", + identifier_placeholder: str = "ALL", date_info: str | None = None, extra_info: str | None = None, monitor_only: bool = False, @@ -876,6 +885,7 @@ def __init__( upper_threshold=upper_threshold, filters=filters, identifier_format=identifier_format, + identifier_placeholder=identifier_placeholder, date_info=date_info, extra_info=extra_info, monitor_only=monitor_only, @@ -927,6 +937,7 @@ def __init__( distinct: bool = False, filters: dict[str, Any] | None = None, identifier_format: str = "identifier", + identifier_placeholder: str = "ALL", date_info: str | None = None, extra_info: str | None = None, monitor_only: bool = False, @@ -948,6 +959,7 @@ def __init__( upper_threshold=upper_threshold, filters=filters, identifier_format=identifier_format, + identifier_placeholder=identifier_placeholder, date_info=date_info, extra_info=extra_info, monitor_only=monitor_only, @@ -1241,6 +1253,7 @@ def __init__( filters_left: dict[str, Any] | None = None, filters_right: dict[str, Any] | None = None, 
identifier_format: str = "identifier", + identifier_placeholder: str = "ALL", date_info: str | None = None, ) -> None: """Initialize the match rate check.""" @@ -1276,6 +1289,7 @@ def __init__( upper_threshold=upper_threshold, filters=filters, identifier_format=identifier_format, + identifier_placeholder=identifier_placeholder, date_info=date_info, extra_info=extra_info, monitor_only=monitor_only, @@ -1411,6 +1425,7 @@ def __init__( *, filters: dict[str, Any] | None = None, identifier_format: str = "identifier", + identifier_placeholder: str = "ALL", date_info: str | None = None, extra_info: str | None = None, monitor_only: bool = False, @@ -1440,6 +1455,7 @@ def __init__( upper_threshold=upper_threshold, filters=filters, identifier_format=identifier_format, + identifier_placeholder=identifier_placeholder, date_info=date_info, extra_info=extra_info, monitor_only=monitor_only, @@ -1575,6 +1591,7 @@ def __init__( *, filters: dict[str, Any] | None = None, identifier_format: str = "identifier", + identifier_placeholder: str = "ALL", date_info: str | None = None, extra_info: str | None = None, monitor_only: bool = False, @@ -1617,6 +1634,7 @@ def __init__( upper_threshold=math.inf, filters=filters, identifier_format=identifier_format, + identifier_placeholder=identifier_placeholder, date_info=date_info, extra_info=extra_info, monitor_only=monitor_only, diff --git a/tests/integration/test_checks.py b/tests/integration/test_checks.py index 36001b8..7a8dd9c 100644 --- a/tests/integration/test_checks.py +++ b/tests/integration/test_checks.py @@ -42,6 +42,7 @@ def test_message_no_extra_info(duckdb_client: duckdb.DuckDBPyConnection) -> None }, lower_threshold=1000, upper_threshold=9999, + identifier_placeholder="ALL", ) check(duckdb_client) @@ -65,6 +66,7 @@ def test_message_date_info(duckdb_client: duckdb.DuckDBPyConnection) -> None: lower_threshold=1000, upper_threshold=9999, date_info="PREDICTION_DATE = real date + 1", + identifier_placeholder="ALL", ) check(duckdb_client) 
@@ -88,6 +90,7 @@ def test_message_extra_info(duckdb_client: duckdb.DuckDBPyConnection) -> None: lower_threshold=1000, upper_threshold=9999, extra_info="Note: This is an awesome check.", + identifier_placeholder="ALL", ) check(duckdb_client) @@ -124,6 +127,7 @@ def test_message_correct_formatting() -> None: }, lower_threshold=1, upper_threshold=1, + identifier_placeholder="ALL", ) check(conn) @@ -146,6 +150,7 @@ def test_identifier_format_filter_name(duckdb_client: duckdb.DuckDBPyConnection) "date": {"column": "DATE", "value": "2023-01-01", "type": "date"}, }, identifier_format="filter_name", + identifier_placeholder="ALL", ) result = check(duckdb_client) @@ -168,6 +173,7 @@ def test_identifier_format_column_name(duckdb_client: duckdb.DuckDBPyConnection) "date": {"column": "DATE", "value": "2023-01-01", "type": "date"}, }, identifier_format="column_name", + identifier_placeholder="ALL", ) result = check(duckdb_client) @@ -213,6 +219,7 @@ def test_missing_table_maps_to_table_exists() -> None: "shop_id": {"column": "shop_code", "value": "SHOP001", "type": "identifier"}, "date": {"column": "DATE", "value": "2023-01-01", "type": "date"}, }, + identifier_placeholder="ALL", ) result = check.check(conn) diff --git a/tests/unit/test_filters.py b/tests/unit/test_filters.py index 2ad3585..79e3298 100644 --- a/tests/unit/test_filters.py +++ b/tests/unit/test_filters.py @@ -251,6 +251,7 @@ def test_assemble_where_skips_naming_only_identifier(self) -> None: def test_identifier_placeholder_used(self) -> None: """When identifier filter has no value, identifier_placeholder is used in check identifier and column naming.""" + class DummyCheck(DataQualityCheck): def assemble_query(self) -> str: return "SELECT 1" From 7e869735f17992f6a27c30372f4429d784cfd8bc Mon Sep 17 00:00:00 2001 From: Benjamin Gutzmann Date: Tue, 20 Jan 2026 22:26:04 +0100 Subject: [PATCH 3/4] Quote table identifiers in bulk SELECTs when loading data into DuckDB memory --- CHANGELOG.md | 4 ++ 
src/koality/executor.py | 2 +- .../test_executor_table_quoting.py | 61 +++++++++++++++++++ 3 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 tests/integration/test_executor_table_quoting.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bdea9b..6df60f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,10 @@ Types of changes: - Treat identifier filters with missing/null `value` as a configurable placeholder for logging and naming (defaults to `ALL`) - Add `identifier_placeholder` option to configure the placeholder value used when identifier filters lack a value; defaults to `ALL` and is applied to the result IDENTIFIER column and logging for clearer partition naming. +### Fixed + +- Quote table identifiers in bulk SELECTs when loading data into DuckDB memory to avoid BigQuery binder errors for identifiers that look like project IDs (e.g., `EC0601`). Added an integration test covering the quoting behavior. + ## [0.9.0] - 2026-01-16 ### Changed diff --git a/src/koality/executor.py b/src/koality/executor.py index 70dc613..06fbe34 100644 --- a/src/koality/executor.py +++ b/src/koality/executor.py @@ -274,7 +274,7 @@ def fetch_data_into_memory(self, data_requirements: defaultdict[str, defaultdict # Construct the bulk SELECT query select_query = f""" SELECT {columns} - FROM {table} + FROM "{table}" {final_where_clause} """ # noqa: S608 diff --git a/tests/integration/test_executor_table_quoting.py b/tests/integration/test_executor_table_quoting.py new file mode 100644 index 0000000..be00546 --- /dev/null +++ b/tests/integration/test_executor_table_quoting.py @@ -0,0 +1,61 @@ +"""Integration test for CheckExecutor to verify table name quoting when fetching data into memory.""" + +import duckdb +import pytest + +from koality.executor import CheckExecutor +from koality.models import Config, DatabaseProvider + + +@pytest.mark.integration +def test_fetch_data_into_memory_quotes_table(monkeypatch: pytest.MonkeyPatch) -> None: + """Test that table 
names are properly quoted when fetching data into memory.""" + # Minimal config without an accessor to avoid identify_database_provider during init + cfg = Config.model_validate( + { + "name": "test", + "database_setup": "", + "database_accessor": "", + "defaults": {"filters": {}}, + "check_bundles": [], + }, + ) + + executor = CheckExecutor(cfg) + + # Simulate that we have an accessor and a bigquery provider after initialization + executor.config.database_accessor = "bq" + executor.database_provider = DatabaseProvider( + database_name="bq", + database_oid=1, + path="", + comment=None, + tags={}, + internal=False, + type="bigquery", + readonly=False, + encrypted=False, + cipher=None, + ) + + table_name = "EC0601.view_skufeed" + data_requirements = {table_name: {"columns": {"*"}, "filters": set()}} + + captured = {"query": None} + + def fake_execute_query( + query: str, + duckdb_client: duckdb.DuckDBPyConnection, + database_accessor: str, # noqa: ARG001 + database_provider: DatabaseProvider, # noqa: ARG001 + ) -> duckdb.DuckDBPyRelation: + captured["query"] = query + # Return empty relation by executing a query that yields no rows + return duckdb_client.query("SELECT 1 WHERE FALSE") + + monkeypatch.setattr("koality.executor.execute_query", fake_execute_query) + + executor.fetch_data_into_memory(data_requirements) + + assert captured["query"] is not None + assert f'FROM "{table_name}"' in captured["query"] From a9bb9e52f8ac6b917c7f5c40641180ec4a540863 Mon Sep 17 00:00:00 2001 From: Benjamin Gutzmann Date: Tue, 20 Jan 2026 23:20:54 +0100 Subject: [PATCH 4/4] Ensure MatchRateCheck only requires the check column from the left table --- CHANGELOG.md | 1 + docs/checks/matchrate.md | 33 ++++++++++++ src/koality/executor.py | 51 +++++++++++++++---- .../test_executor_table_quoting.py | 3 +- 4 files changed, 76 insertions(+), 12 deletions(-) create mode 100644 docs/checks/matchrate.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 6df60f1..5f2ab72 100644 --- a/CHANGELOG.md 
+++ b/CHANGELOG.md @@ -27,6 +27,7 @@ Types of changes: ### Fixed - Quote table identifiers in bulk SELECTs when loading data into DuckDB memory to avoid BigQuery binder errors for identifiers that look like project IDs (e.g., `EC0601`). Added an integration test covering the quoting behavior. +- Ensure MatchRateCheck only requires the check column from the left table; the right table now only contributes join and filter columns to avoid unnecessary column selection and errors. ## [0.9.0] - 2026-01-16 diff --git a/docs/checks/matchrate.md b/docs/checks/matchrate.md new file mode 100644 index 0000000..20a05f9 --- /dev/null +++ b/docs/checks/matchrate.md @@ -0,0 +1,33 @@ +# MatchRateCheck — check_column placement + +Guidance: + +- The `check_column` for `MatchRateCheck` must be a column present in the left-hand table only. +- The right-hand table is only expected to provide join columns and filter columns; it must not be required to contain `check_column`. + +Configuration example: + +```yaml +- defaults: + check_type: MatchRateCheck + check_column: product_number # must exist on the left table + join_columns_left: + - BQ_PARTITIONTIME + - shopId + - product_number + join_columns_right: + - BQ_PARTITIONTIME + - value.shopId + - product_number + checks: + - left_table: project.dataset.left_table + right_table: project.dataset.right_table + filters: + shop_id: + value: SHOP01 +``` + +Notes: + +- The executor only requests the `check_column` from the left table during bulk loading; the right table will only be queried for its join and filter columns. +- This avoids errors when the right table does not contain the `check_column` or when its identifier resembles a BigQuery project ID. 
diff --git a/src/koality/executor.py b/src/koality/executor.py index 06fbe34..5e60f1e 100644 --- a/src/koality/executor.py +++ b/src/koality/executor.py @@ -215,6 +215,33 @@ def get_data_requirements(self) -> defaultdict[str, defaultdict[str, set]]: # n """ data_requirements = defaultdict(lambda: defaultdict(set)) for check in self.checks: + # Skip synthetic JOIN table entries created for MatchRateCheck; handle left/right tables explicitly + if isinstance(check, MatchRateCheck): + # Add columns and filter columns for left table + if check.check_column and check.check_column != "*": + data_requirements[check.left_table]["columns"].add(check.check_column) + for _filter in check.filters_left.values(): + if "column" in _filter: + data_requirements[check.left_table]["columns"].add(_filter["column"]) + data_requirements[check.left_table]["columns"].update(check.join_columns_left) + + # Add only filter and join columns for right table (check_column is only from left table) + for _filter in check.filters_right.values(): + if "column" in _filter: + data_requirements[check.right_table]["columns"].add(_filter["column"]) + data_requirements[check.right_table]["columns"].update(check.join_columns_right) + + # Store unique filter configurations for both tables + filter_key_left = frozenset( + (name, frozenset(config.items())) for name, config in check.filters_left.items() + ) + filter_key_right = frozenset( + (name, frozenset(config.items())) for name, config in check.filters_right.items() + ) + data_requirements[check.left_table]["filters"].add(filter_key_left) + data_requirements[check.right_table]["filters"].add(filter_key_right) + continue + table_name = check.table check_filters = check.filters # Add check-specific columns and filter columns to the requirements @@ -224,16 +251,12 @@ def get_data_requirements(self) -> defaultdict[str, defaultdict[str, set]]: # n if "column" in _filter: data_requirements[table_name]["columns"].add(_filter["column"]) - # For MatchRateCheck, 
add columns from both left and right tables - if isinstance(check, MatchRateCheck): - data_requirements[check.left_table]["columns"].update(check.join_columns_left) - data_requirements[check.right_table]["columns"].update(check.join_columns_right) - for _filter in check.filters_left.values(): - if "column" in _filter: - data_requirements[check.left_table]["columns"].add(_filter["column"]) - for _filter in check.filters_right.values(): - if "column" in _filter: - data_requirements[check.right_table]["columns"].add(_filter["column"]) + if isinstance(check, IqrOutlierCheck): + check_filters = {k: v for k, v in check.filters.items() if v.get("type") != "date"} + + # Store unique filter configurations for each table + filter_key = frozenset((name, frozenset(config.items())) for name, config in check_filters.items()) + data_requirements[table_name]["filters"].add(filter_key) if isinstance(check, IqrOutlierCheck): check_filters = {k: v for k, v in check.filters.items() if v.get("type") != "date"} @@ -271,10 +294,16 @@ def fetch_data_into_memory(self, data_requirements: defaultdict[str, defaultdict if all_filters_sql: final_where_clause = "WHERE " + " OR ".join(all_filters_sql) + # Determine appropriate table quoting depending on database provider + if self.database_provider and getattr(self.database_provider, "type", "").lower() == "bigquery": + table_ref = f"`{table}`" + else: + table_ref = f'"{table}"' + # Construct the bulk SELECT query select_query = f""" SELECT {columns} - FROM "{table}" + FROM {table_ref} {final_where_clause} """ # noqa: S608 diff --git a/tests/integration/test_executor_table_quoting.py b/tests/integration/test_executor_table_quoting.py index be00546..152c028 100644 --- a/tests/integration/test_executor_table_quoting.py +++ b/tests/integration/test_executor_table_quoting.py @@ -58,4 +58,5 @@ def fake_execute_query( executor.fetch_data_into_memory(data_requirements) assert captured["query"] is not None - assert f'FROM "{table_name}"' in 
captured["query"] + # Expect backticks for BigQuery provider + assert f"FROM `{table_name}`" in captured["query"]