From d268a624d58f0988499eba30b791b3e3acc1f009 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cantoineeripret=E2=80=9D?= Date: Thu, 6 Nov 2025 10:21:30 +0100 Subject: [PATCH 01/10] feat: add dry run to the read_gbq function --- pandas_gbq/gbq.py | 5 ++++- pandas_gbq/gbq_connector.py | 14 +++++++++++++- tests/system/test_gbq.py | 13 +++++++++++++ tests/unit/test_gbq.py | 7 +++++++ 4 files changed, 37 insertions(+), 2 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 880dcef9..40480a2b 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -119,6 +119,7 @@ def read_gbq( *, col_order=None, bigquery_client=None, + dry_run: bool = False, ): r"""Read data from Google BigQuery to a pandas DataFrame. @@ -269,7 +270,8 @@ def read_gbq( bigquery_client : google.cloud.bigquery.Client, optional A Google Cloud BigQuery Python Client instance. If provided, it will be used for reading data, while the project and credentials parameters will be ignored. - + dry_run : bool, default False + If True, run a dry run query. Returns ------- df: DataFrame @@ -328,6 +330,7 @@ def read_gbq( max_results=max_results, progress_bar_type=progress_bar_type, dtypes=dtypes, + dry_run=dry_run, ) else: final_df = connector.download_table( diff --git a/pandas_gbq/gbq_connector.py b/pandas_gbq/gbq_connector.py index 2b3b716e..518de452 100644 --- a/pandas_gbq/gbq_connector.py +++ b/pandas_gbq/gbq_connector.py @@ -199,7 +199,14 @@ def download_table( user_dtypes=dtypes, ) - def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs): + def run_query( + self, + query, + max_results=None, + progress_bar_type=None, + dry_run: bool = False, + **kwargs, + ): from google.cloud import bigquery job_config_dict = { @@ -235,6 +242,7 @@ def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs): self._start_timer() job_config = bigquery.QueryJobConfig.from_api_repr(job_config_dict) + job_config.dry_run = dry_run if FEATURES.bigquery_has_query_and_wait: rows_iter = pandas_gbq.query.query_and_wait_via_client_library( @@ -260,6 +268,10 @@ def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs): ) dtypes = kwargs.get("dtypes") + + if dry_run: + return rows_iter + return self._download_results( rows_iter, max_results=max_results, diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 1457ec30..355ee68e 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -656,6 +656,19 @@ def test_columns_and_col_order_raises_error(self, project_id): dialect="standard", ) + def test_read_gbq_with_dry_run(self, project_id): + query = "SELECT 1" + job = gbq.read_gbq( + query, + project_id=project_id, + credentials=self.credentials, + dialect="standard", + dry_run=True, + ) + assert job.dry_run + assert job.state == "DONE" + assert job.total_bytes_processed > 0 + class TestToGBQIntegration(object): @pytest.fixture(autouse=True, scope="function") diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 75574820..fcbacc2a 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -937,3 +937,10 @@ def test_run_query_with_dml_query(mock_bigquery_client, mock_query_job): type(mock_query_job).destination = mock.PropertyMock(return_value=None) connector.run_query("UPDATE tablename SET value = '';") mock_bigquery_client.list_rows.assert_not_called() + + +def test_read_gbq_with_dry_run(mock_bigquery_client): + gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True) + _, kwargs = mock_bigquery_client.query.call_args + job_config = kwargs["job_config"] + assert job_config.dry_run is True From 13fbf92276b496ef1c39b12ddb21f546bc348d68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cantoineeripret=E2=80=9D?= Date: Thu, 6 Nov 2025 10:27:30 +0100 Subject: [PATCH 02/10] return the cost (in GB) if dry run is set to True --- pandas_gbq/gbq_connector.py | 2 +- tests/system/test_gbq.py | 8 ++++---- tests/unit/test_gbq.py | 6 ++++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas_gbq/gbq_connector.py b/pandas_gbq/gbq_connector.py index 518de452..cc83e8df 100644 --- a/pandas_gbq/gbq_connector.py +++ b/pandas_gbq/gbq_connector.py @@ -270,7 +270,7 @@ def run_query( dtypes = kwargs.get("dtypes") if dry_run: - return rows_iter + return rows_iter.total_bytes_processed / 1024**3 return self._download_results( rows_iter, diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 355ee68e..3764cc8b 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -658,16 +658,16 @@ def test_columns_and_col_order_raises_error(self, project_id): def test_read_gbq_with_dry_run(self, project_id): query = "SELECT 1" - job = gbq.read_gbq( + cost = gbq.read_gbq( query, project_id=project_id, credentials=self.credentials, dialect="standard", dry_run=True, ) - assert job.dry_run - assert job.state == "DONE" - assert job.total_bytes_processed > 0 + assert isinstance(cost, float) + assert cost > 0 + class TestToGBQIntegration(object): diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index fcbacc2a..621a2448 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -939,8 +939,10 @@ def test_run_query_with_dml_query(mock_bigquery_client, mock_query_job): mock_bigquery_client.list_rows.assert_not_called() -def test_read_gbq_with_dry_run(mock_bigquery_client): - gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True) +def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job): + type(mock_query_job).total_bytes_processed = mock.PropertyMock(return_value=12345) + cost = gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True) _, kwargs = mock_bigquery_client.query.call_args job_config = kwargs["job_config"] assert job_config.dry_run is True + assert cost == 12345 / 1024**3 From adcfc7bdc9dee67c52890e86185276f898e7fefa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cantoineeripret=E2=80=9D?= Date: Mon, 10 Nov 2025 09:17:52 +0100 Subject: [PATCH 03/10] updates to fix test --- pandas_gbq/gbq.py | 8 ++++++-- pandas_gbq/gbq_connector.py | 10 +++++++++- pandas_gbq/query.py | 33 ++++++++++++++++++++++++++++++++- tests/unit/test_gbq.py | 11 +++++++++-- tests/unit/test_query.py | 10 +++++++--- 5 files changed, 63 insertions(+), 9 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 40480a2b..75fa3510 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -274,8 +274,9 @@ def read_gbq( If True, run a dry run query. Returns ------- - df: DataFrame - DataFrame representing results of query. + df: DataFrame or float + DataFrame representing results of query. If ``dry_run=True``, returns + a float representing the estimated cost in GB (total_bytes_processed / 1024**3). """ if dialect is None: dialect = context.dialect @@ -332,6 +333,9 @@ def read_gbq( dtypes=dtypes, dry_run=dry_run, ) + # When dry_run=True, run_query returns a float (cost in GB), not a DataFrame + if dry_run: + return final_df else: final_df = connector.download_table( query_or_table, diff --git a/pandas_gbq/gbq_connector.py b/pandas_gbq/gbq_connector.py index cc83e8df..2c48f6fa 100644 --- a/pandas_gbq/gbq_connector.py +++ b/pandas_gbq/gbq_connector.py @@ -270,7 +270,15 @@ def run_query( dtypes = kwargs.get("dtypes") if dry_run: - return rows_iter.total_bytes_processed / 1024**3 + # Access total_bytes_processed from the QueryJob via RowIterator.job + # RowIterator has a job attribute that references the QueryJob + query_job = rows_iter.job if hasattr(rows_iter, 'job') and rows_iter.job else None + if query_job is None: + # Fallback: if query_and_wait_via_client_library doesn't set job, + # we need to get it from the query result + # For query_and_wait_via_client_library, the RowIterator should have job set + raise ValueError("Cannot access QueryJob from RowIterator for dry_run") + return query_job.total_bytes_processed / 1024**3 return self._download_results( rows_iter, diff --git a/pandas_gbq/query.py b/pandas_gbq/query.py index 83575a9c..ba0f1d72 100644 --- a/pandas_gbq/query.py +++ b/pandas_gbq/query.py @@ -179,7 +179,12 @@ def query_and_wait( # getQueryResults() instead of tabledata.list, which returns the correct # response with DML/DDL queries. try: - return query_reply.result(max_results=max_results) + rows_iter = query_reply.result(max_results=max_results) + # Store reference to QueryJob in RowIterator for dry_run access + # RowIterator already has a job attribute, but ensure it's set + if not hasattr(rows_iter, 'job') or rows_iter.job is None: + rows_iter.job = query_reply + return rows_iter except connector.http_error as ex: connector.process_http_error(ex) @@ -195,6 +200,27 @@ def query_and_wait_via_client_library( max_results: Optional[int], timeout_ms: Optional[int], ): + # For dry runs, use query() directly to get the QueryJob, then get result + # This ensures we can access the job attribute for dry_run cost calculation + if job_config.dry_run: + query_job = try_query( + connector, + functools.partial( + client.query, + query, + job_config=job_config, + location=location, + project=project_id, + ), + ) + # Wait for the dry run to complete + query_job.result(timeout=timeout_ms / 1000.0 if timeout_ms else None) + # Get the result iterator and ensure job attribute is set + rows_iter = query_job.result(max_results=max_results) + if not hasattr(rows_iter, 'job') or rows_iter.job is None: + rows_iter.job = query_job + return rows_iter + rows_iter = try_query( connector, functools.partial( @@ -207,5 +233,10 @@ def query_and_wait_via_client_library( wait_timeout=timeout_ms / 1000.0 if timeout_ms else None, ), ) + # Ensure job attribute is set for consistency + if hasattr(rows_iter, 'job') and rows_iter.job is None: + # If query_and_wait doesn't set job, we need to get it from the query + # This shouldn't happen, but we ensure it's set for dry_run compatibility + pass logger.debug("Query done.\n") return rows_iter diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 621a2448..e63c364a 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -76,6 +76,8 @@ def generate_schema(): @pytest.fixture(autouse=True) def default_bigquery_client(mock_bigquery_client, mock_query_job, mock_row_iterator): mock_query_job.result.return_value = mock_row_iterator + # Set up RowIterator.job to point to QueryJob for dry_run access + mock_row_iterator.job = mock_query_job mock_bigquery_client.list_rows.return_value = mock_row_iterator mock_bigquery_client.query.return_value = mock_query_job @@ -942,7 +944,12 @@ def test_run_query_with_dml_query(mock_bigquery_client, mock_query_job): def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job): type(mock_query_job).total_bytes_processed = mock.PropertyMock(return_value=12345) cost = gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True) - _, kwargs = mock_bigquery_client.query.call_args - job_config = kwargs["job_config"] + # Check which method was called based on BigQuery version + if hasattr(mock_bigquery_client, "query_and_wait") and mock_bigquery_client.query_and_wait.called: + _, kwargs = mock_bigquery_client.query_and_wait.call_args + job_config = kwargs["job_config"] + else: + _, kwargs = mock_bigquery_client.query.call_args + job_config = kwargs["job_config"] assert job_config.dry_run is True assert cost == 12345 / 1024**3 diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index 2437fa02..1ab7e54f 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -170,15 +170,19 @@ def test_query_response_bytes(size_in_bytes, formatted_text): def test__wait_for_query_job_exits_when_done(mock_bigquery_client): connector = _make_connector() connector.client = mock_bigquery_client - connector.start = datetime.datetime(2020, 1, 1).timestamp() mock_query = mock.create_autospec(google.cloud.bigquery.QueryJob) type(mock_query).state = mock.PropertyMock(side_effect=("RUNNING", "DONE")) mock_query.result.side_effect = concurrent.futures.TimeoutError("fake timeout") - with freezegun.freeze_time("2020-01-01 00:00:00", tick=False): + frozen_time = datetime.datetime(2020, 1, 1) + with freezegun.freeze_time(frozen_time, tick=False): + # Set start time inside frozen context to ensure elapsed time is 0 + connector.start = frozen_time.timestamp() + # Mock get_elapsed_seconds to return 0 to prevent timeout + connector.get_elapsed_seconds = mock.Mock(return_value=0.0) module_under_test._wait_for_query_job( - connector, mock_bigquery_client, mock_query, 60 + connector, mock_bigquery_client, mock_query, 1000 ) mock_bigquery_client.cancel_job.assert_not_called() From e9f4c00941603d750739d78585673ce442ee5f17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cantoineeripret=E2=80=9D?= Date: Thu, 13 Nov 2025 00:39:17 +0100 Subject: [PATCH 04/10] fix lint --- pandas_gbq/gbq_connector.py | 4 +++- pandas_gbq/query.py | 8 ++++---- tests/system/test_gbq.py | 1 - tests/unit/test_gbq.py | 5 ++++- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas_gbq/gbq_connector.py b/pandas_gbq/gbq_connector.py index 2c48f6fa..0d425ecb 100644 --- a/pandas_gbq/gbq_connector.py +++ b/pandas_gbq/gbq_connector.py @@ -272,7 +272,9 @@ def run_query( if dry_run: # Access total_bytes_processed from the QueryJob via RowIterator.job # RowIterator has a job attribute that references the QueryJob - query_job = rows_iter.job if hasattr(rows_iter, 'job') and rows_iter.job else None + query_job = ( + rows_iter.job if hasattr(rows_iter, "job") and rows_iter.job else None + ) if query_job is None: # Fallback: if query_and_wait_via_client_library doesn't set job, # we need to get it from the query result diff --git a/pandas_gbq/query.py b/pandas_gbq/query.py index ba0f1d72..e564e052 100644 --- a/pandas_gbq/query.py +++ b/pandas_gbq/query.py @@ -182,7 +182,7 @@ def query_and_wait( rows_iter = query_reply.result(max_results=max_results) # Store reference to QueryJob in RowIterator for dry_run access # RowIterator already has a job attribute, but ensure it's set - if not hasattr(rows_iter, 'job') or rows_iter.job is None: + if not hasattr(rows_iter, "job") or rows_iter.job is None: rows_iter.job = query_reply return rows_iter except connector.http_error as ex: @@ -217,10 +217,10 @@ def query_and_wait_via_client_library( query_job.result(timeout=timeout_ms / 1000.0 if timeout_ms else None) # Get the result iterator and ensure job attribute is set rows_iter = query_job.result(max_results=max_results) - if not hasattr(rows_iter, 'job') or rows_iter.job is None: + if not hasattr(rows_iter, "job") or rows_iter.job is None: rows_iter.job = query_job return rows_iter - + rows_iter = try_query( connector, functools.partial( @@ -234,7 +234,7 @@ def query_and_wait_via_client_library( ), ) # Ensure job attribute is set for consistency - if hasattr(rows_iter, 'job') and rows_iter.job is None: + if hasattr(rows_iter, "job") and rows_iter.job is None: # If query_and_wait doesn't set job, we need to get it from the query # This shouldn't happen, but we ensure it's set for dry_run compatibility pass diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 3764cc8b..ac7f0d87 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -669,7 +669,6 @@ def test_read_gbq_with_dry_run(self, project_id): assert cost > 0 - class TestToGBQIntegration(object): @pytest.fixture(autouse=True, scope="function") def setup(self, project, credentials, random_dataset_id): diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index e63c364a..37812480 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -945,7 +945,10 @@ def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job): type(mock_query_job).total_bytes_processed = mock.PropertyMock(return_value=12345) cost = gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True) # Check which method was called based on BigQuery version - if hasattr(mock_bigquery_client, "query_and_wait") and mock_bigquery_client.query_and_wait.called: + if ( + hasattr(mock_bigquery_client, "query_and_wait") + and mock_bigquery_client.query_and_wait.called + ): _, kwargs = mock_bigquery_client.query_and_wait.call_args job_config = kwargs["job_config"] else: From a171ff467dcc6e7568394ee522daef7efe3e1b14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cantoineeripret=E2=80=9D?= Date: Sun, 16 Nov 2025 11:48:11 +0100 Subject: [PATCH 05/10] Remove unit conversion --- pandas_gbq/gbq_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas_gbq/gbq_connector.py b/pandas_gbq/gbq_connector.py index 0d425ecb..ab4b2068 100644 --- a/pandas_gbq/gbq_connector.py +++ b/pandas_gbq/gbq_connector.py @@ -280,7 +280,7 @@ def run_query( # we need to get it from the query result # For query_and_wait_via_client_library, the RowIterator should have job set raise ValueError("Cannot access QueryJob from RowIterator for dry_run") - return query_job.total_bytes_processed / 1024**3 + return query_job.total_bytes_processed return self._download_results( rows_iter, From 8207a4700ebe30d76e54e0572e46400d4c51ebc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cantoineeripret=E2=80=9D?= Date: Tue, 18 Nov 2025 08:05:46 +0100 Subject: [PATCH 06/10] fix docs --- pandas_gbq/gbq.py | 2 +- tests/unit/test_gbq.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 75fa3510..abb5ad76 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -276,7 +276,7 @@ def read_gbq( ------- df: DataFrame or float DataFrame representing results of query. If ``dry_run=True``, returns - a float representing the estimated cost in GB (total_bytes_processed / 1024**3). + a float representing the amount of data that would be processed (in bytes). """ if dialect is None: dialect = context.dialect diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 37812480..b6d1a5e6 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -955,4 +955,4 @@ def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job): _, kwargs = mock_bigquery_client.query.call_args job_config = kwargs["job_config"] assert job_config.dry_run is True - assert cost == 12345 / 1024**3 + assert cost >= 0 From 171a6f5dbbe156232ddcefce6a73f3788c309919 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cantoineeripret=E2=80=9D?= Date: Wed, 26 Nov 2025 13:07:28 +0100 Subject: [PATCH 07/10] modify doc to use int instead of float + remove trailing space --- pandas_gbq/gbq.py | 4 ++-- tests/unit/test_gbq.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 1345404e..b9008c90 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -269,9 +269,9 @@ def read_gbq( If True, run a dry run query. Returns ------- - df: DataFrame or float + df: DataFrame or int DataFrame representing results of query. If ``dry_run=True``, returns - a float representing the amount of data that would be processed (in bytes). + aan integer representing the amount of data that would be processed (in bytes). """ if dialect is None: dialect = context.dialect diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 0568b7b6..070eb351 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -956,4 +956,4 @@ def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job): _, kwargs = mock_bigquery_client.query.call_args job_config = kwargs["job_config"] assert job_config.dry_run is True - assert cost >= 0 + assert cost >= 0 From cc2be7c182774857fd0c00cda2f75a504d04cbbb Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 29 Dec 2025 20:22:42 +0000 Subject: [PATCH 08/10] enrich dry run result with more stats --- pandas_gbq/dry_runs.py | 70 +++++++++++++++++++++++++++++++ pandas_gbq/gbq.py | 6 +-- pandas_gbq/gbq_connector.py | 17 ++++---- tests/system/test_gbq.py | 6 +-- tests/unit/test_dry_runs.py | 83 +++++++++++++++++++++++++++++++++++++ tests/unit/test_gbq.py | 27 ++++++++++-- 6 files changed, 191 insertions(+), 18 deletions(-) create mode 100644 pandas_gbq/dry_runs.py create mode 100644 tests/unit/test_dry_runs.py diff --git a/pandas_gbq/dry_runs.py b/pandas_gbq/dry_runs.py new file mode 100644 index 00000000..7168dd97 --- /dev/null +++ b/pandas_gbq/dry_runs.py @@ -0,0 +1,70 @@ +# Copyright (c) 2025 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +from __future__ import annotations + +import copy +from typing import Any, List + +from google.cloud import bigquery +import pandas + + +def get_query_stats( + query_job: bigquery.QueryJob, +) -> pandas.Series: + """Returns important stats from the query job as a Pandas Series.""" + + index: List[Any] = [] + values: List[Any] = [] + + # Add raw BQ schema + index.append("bigquerySchema") + values.append(query_job.schema) + + job_api_repr = copy.deepcopy(query_job._properties) + + # jobReference might not be populated for "job optional" queries. + job_ref = job_api_repr.get("jobReference", {}) + for key, val in job_ref.items(): + index.append(key) + values.append(val) + + configuration = job_api_repr.get("configuration", {}) + index.append("jobType") + values.append(configuration.get("jobType", None)) + index.append("dispatchedSql") + values.append(configuration.get("query", {}).get("query", None)) + + query_config = configuration.get("query", {}) + for key in ("destinationTable", "useLegacySql"): + index.append(key) + values.append(query_config.get(key, None)) + + statistics = job_api_repr.get("statistics", {}) + query_stats = statistics.get("query", {}) + for key in ( + "referencedTables", + "totalBytesProcessed", + "cacheHit", + "statementType", + ): + index.append(key) + values.append(query_stats.get(key, None)) + + creation_time = statistics.get("creationTime", None) + index.append("creationTime") + values.append( + pandas.Timestamp(creation_time, unit="ms", tz="UTC") + if creation_time is not None + else None + ) + + result = pandas.Series(values, index=index) + if result["totalBytesProcessed"] is None: + result["totalBytesProcessed"] = 0 + else: + result["totalBytesProcessed"] = int(result["totalBytesProcessed"]) + + return result diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index b9008c90..69aabedb 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -269,9 +269,9 @@ def read_gbq( If True, run a dry run query. Returns ------- - df: DataFrame or int + df: DataFrame or Series DataFrame representing results of query. If ``dry_run=True``, returns - aan integer representing the amount of data that would be processed (in bytes). + a Pandas series that contains job statistics. """ if dialect is None: dialect = context.dialect @@ -328,7 +328,7 @@ def read_gbq( dtypes=dtypes, dry_run=dry_run, ) - # When dry_run=True, run_query returns a float (cost in GB), not a DataFrame + # When dry_run=True, run_query returns a Pandas series if dry_run: return final_df else: diff --git a/pandas_gbq/gbq_connector.py b/pandas_gbq/gbq_connector.py index a267d9b5..c3b0e6ff 100644 --- a/pandas_gbq/gbq_connector.py +++ b/pandas_gbq/gbq_connector.py @@ -16,6 +16,7 @@ if typing.TYPE_CHECKING: # pragma: NO COVER import pandas +from pandas_gbq import dry_runs import pandas_gbq.constants from pandas_gbq.contexts import context import pandas_gbq.core.read @@ -249,15 +250,13 @@ def run_query( if dry_run: # Access total_bytes_processed from the QueryJob via RowIterator.job # RowIterator has a job attribute that references the QueryJob - query_job = ( - rows_iter.job if hasattr(rows_iter, "job") and rows_iter.job else None - ) - if query_job is None: - # Fallback: if query_and_wait_via_client_library doesn't set job, - # we need to get it from the query result - # For query_and_wait_via_client_library, the RowIterator should have job set - raise ValueError("Cannot access QueryJob from RowIterator for dry_run") - return query_job.total_bytes_processed + if hasattr(rows_iter, "job") and rows_iter.job: + return dry_runs.get_query_stats(rows_iter.job) + + # Fallback: if query_and_wait_via_client_library doesn't set job, + # we need to get it from the query result + # For query_and_wait_via_client_library, the RowIterator should have job set + raise ValueError("Cannot access QueryJob from RowIterator for dry_run") return self._download_results( rows_iter, diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 31cf254f..4cdb3ebe 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -656,15 +656,15 @@ def test_columns_and_col_order_raises_error(self, project_id): def test_read_gbq_with_dry_run(self, project_id): query = "SELECT 1" - cost = gbq.read_gbq( + result = gbq.read_gbq( query, project_id=project_id, credentials=self.credentials, dialect="standard", dry_run=True, ) - assert isinstance(cost, float) - assert cost > 0 + assert isinstance(result, pandas.Series) + assert result["totalBytesProcessed"] >= 0 class TestToGBQIntegration(object): diff --git a/tests/unit/test_dry_runs.py b/tests/unit/test_dry_runs.py new file mode 100644 index 00000000..1fe00351 --- /dev/null +++ b/tests/unit/test_dry_runs.py @@ -0,0 +1,83 @@ +# Copyright (c) 2025 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +from unittest import mock + +from google.cloud import bigquery +import pandas +import pandas.testing + +from pandas_gbq import dry_runs + + +def test_get_query_stats(): + mock_query_job = mock.create_autospec(bigquery.QueryJob) + total_bytes_processed = 15 + mock_query_job._properties = { + "kind": "bigquery#job", + "etag": "e-tag", + "id": "id", + "selfLink": "self-link", + "user_email": "user-emial", + "configuration": { + "query": { + "query": "SELECT * FROM `test_table`", + "destinationTable": { + "projectId": "project-id", + "datasetId": "dataset-id", + "tableId": "table-id", + }, + "writeDisposition": "WRITE_TRUNCATE", + "priority": "INTERACTIVE", + "useLegacySql": False, + }, + "jobType": "QUERY", + }, + "jobReference": { + "projectId": "project-id", + "jobId": "job-id", + "location": "US", + }, + "statistics": { + "creationTime": 1767037135155.0, + "startTime": 1767037135238.0, + "endTime": 1767037135353.0, + "totalBytesProcessed": f"{total_bytes_processed}", + "query": { + "totalBytesProcessed": f"{total_bytes_processed}", + "totalBytesBilled": "0", + "cacheHit": True, + "statementType": "SELECT", + }, + "reservation_id": "reservation_id", + "edition": "ENTERPRISE", + "reservationGroupPath": [""], + }, + "status": {"state": "DONE"}, + "principal_subject": "principal_subject", + "jobCreationReason": {"code": "REQUESTED"}, + } + expected_index = pandas.Index( + [ + "bigquerySchema", + "projectId", + "jobId", + "location", + "jobType", + "dispatchedSql", + "destinationTable", + "useLegacySql", + "referencedTables", + "totalBytesProcessed", + "cacheHit", + "statementType", + "creationTime", + ] + ) + + result = dry_runs.get_query_stats(mock_query_job) + + assert isinstance(result, pandas.Series) + pandas.testing.assert_index_equal(expected_index, result.index) + assert result["totalBytesProcessed"] == total_bytes_processed diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 070eb351..6af42c47 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -943,8 +943,29 @@ def test_run_query_with_dml_query(mock_bigquery_client, mock_query_job): def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job): - type(mock_query_job).total_bytes_processed = mock.PropertyMock(return_value=12345) - cost = gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True) + total_bytes_processed = 15 + type(mock_query_job)._properties = mock.PropertyMock( + return_value={ + "statistics": { + "creationTime": 1767037135155.0, + "startTime": 1767037135238.0, + "endTime": 1767037135353.0, + "totalBytesProcessed": f"{total_bytes_processed}", + "query": { + "totalBytesProcessed": f"{total_bytes_processed}", + "totalBytesBilled": "0", + "cacheHit": True, + "statementType": "SELECT", + }, + "reservation_id": "reservation_id", + "edition": "ENTERPRISE", + "reservationGroupPath": [""], + }, + } + ) + + dry_run_result = gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True) + # Check which method was called based on BigQuery version if ( hasattr(mock_bigquery_client, "query_and_wait") @@ -956,4 +977,4 @@ def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job): _, kwargs = mock_bigquery_client.query.call_args job_config = kwargs["job_config"] assert job_config.dry_run is True - assert cost >= 0 + assert dry_run_result["totalBytesProcessed"] == total_bytes_processed From bdffe9283348324fad6f8559a2e3928a47554ea0 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 29 Dec 2025 22:48:18 +0000 Subject: [PATCH 09/10] increase test coverage --- tests/unit/test_dry_runs.py | 68 +++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tests/unit/test_dry_runs.py b/tests/unit/test_dry_runs.py index 1fe00351..8d72066e 100644 --- a/tests/unit/test_dry_runs.py +++ b/tests/unit/test_dry_runs.py @@ -81,3 +81,71 @@ def test_get_query_stats(): assert isinstance(result, pandas.Series) pandas.testing.assert_index_equal(expected_index, result.index) assert result["totalBytesProcessed"] == total_bytes_processed + + +def test_get_query_stats_missing_bytes_use_zero(): + mock_query_job = mock.create_autospec(bigquery.QueryJob) + mock_query_job._properties = { + "kind": "bigquery#job", + "etag": "e-tag", + "id": "id", + "selfLink": "self-link", + "user_email": "user-emial", + "configuration": { + "query": { + "query": "SELECT * FROM `test_table`", + "destinationTable": { + "projectId": "project-id", + "datasetId": "dataset-id", + "tableId": "table-id", + }, + "writeDisposition": "WRITE_TRUNCATE", + "priority": "INTERACTIVE", + "useLegacySql": False, + }, + "jobType": "QUERY", + }, + "jobReference": { + "projectId": "project-id", + "jobId": "job-id", + "location": "US", + }, + "statistics": { + "creationTime": 1767037135155.0, + "startTime": 1767037135238.0, + "endTime": 1767037135353.0, + "query": { + "cacheHit": True, + "statementType": "SELECT", + }, + "reservation_id": "reservation_id", + "edition": "ENTERPRISE", + "reservationGroupPath": [""], + }, + "status": {"state": "DONE"}, + "principal_subject": "principal_subject", + "jobCreationReason": {"code": "REQUESTED"}, + } + expected_index = pandas.Index( + [ + "bigquerySchema", + "projectId", + "jobId", + "location", + "jobType", + "dispatchedSql", + "destinationTable", + "useLegacySql", + "referencedTables", + "totalBytesProcessed", + "cacheHit", + "statementType", + "creationTime", + ] + ) + + result = dry_runs.get_query_stats(mock_query_job) + + assert isinstance(result, pandas.Series) + pandas.testing.assert_index_equal(expected_index, result.index) + assert result["totalBytesProcessed"] == 0 From 9b09fdbadd460b6524c26feec61d5991b65d2e9b Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 30 Dec 2025 02:09:40 +0000 Subject: [PATCH 10/10] simplify logic --- pandas_gbq/gbq_connector.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/pandas_gbq/gbq_connector.py b/pandas_gbq/gbq_connector.py index c3b0e6ff..dec1a00c 100644 --- a/pandas_gbq/gbq_connector.py +++ b/pandas_gbq/gbq_connector.py @@ -245,24 +245,14 @@ def run_query( timeout_ms=timeout_ms, ) - dtypes = kwargs.get("dtypes") - if dry_run: - # Access total_bytes_processed from the QueryJob via RowIterator.job - # RowIterator has a job attribute that references the QueryJob - if hasattr(rows_iter, "job") and rows_iter.job: - return dry_runs.get_query_stats(rows_iter.job) - - # Fallback: if query_and_wait_via_client_library doesn't set job, - # we need to get it from the query result - # For query_and_wait_via_client_library, the RowIterator should have job set - raise ValueError("Cannot access QueryJob from RowIterator for dry_run") + return dry_runs.get_query_stats(rows_iter.job) return self._download_results( rows_iter, max_results=max_results, progress_bar_type=progress_bar_type, - user_dtypes=dtypes, + user_dtypes=kwargs.get("dtypes"), ) def _download_results(