From ff5d5cb45695a3b706b9e33267a41a9a8932add7 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Mon, 15 Dec 2025 14:31:31 +0000 Subject: [PATCH 1/3] Fix boolean round-trip test and CSV datetime loading - Fix `test_dataframe_round_trip_with_table_schema` failure by expecting `pd.NA` for boolean columns loaded as object, aligning with BigQuery Storage API behavior. - Fix CSV loading failure for extreme datetimes (e.g., year 0001) by introducing `cast_dataframe_for_csv`. This helper forces `isoformat()` string conversion for DATETIME/TIMESTAMP columns, ensuring 4-digit years (e.g., `0001-01-01` instead of `1-01-01`) which prevents BigQuery BadRequest errors. --- pandas_gbq/load/__init__.py | 2 ++ pandas_gbq/load/core.py | 35 +++++++++++++++++++++++++++++++++++ tests/system/test_to_gbq.py | 25 +++++++++++++++++++++++++ 3 files changed, 62 insertions(+) diff --git a/pandas_gbq/load/__init__.py b/pandas_gbq/load/__init__.py index 250d6517..2fa2f24c 100644 --- a/pandas_gbq/load/__init__.py +++ b/pandas_gbq/load/__init__.py @@ -3,6 +3,7 @@ # license that can be found in the LICENSE file. from pandas_gbq.load.core import ( + cast_dataframe_for_csv, cast_dataframe_for_parquet, encode_chunk, load_chunks, @@ -13,6 +14,7 @@ ) __all__ = [ + "cast_dataframe_for_csv", "cast_dataframe_for_parquet", "encode_chunk", "load_chunks", diff --git a/pandas_gbq/load/core.py b/pandas_gbq/load/core.py index d98f8306..f230794c 100644 --- a/pandas_gbq/load/core.py +++ b/pandas_gbq/load/core.py @@ -124,6 +124,38 @@ def convert(x): return dataframe +def cast_dataframe_for_csv( + dataframe: pandas.DataFrame, + schema: Optional[Dict[str, Any]], +) -> pandas.DataFrame: + """Cast columns to needed dtype when writing CSV files.""" + + columns = schema.get("fields", []) + + # Protect against an explicit None in the dictionary. 
+ columns = columns if columns is not None else [] + + for column in columns: + # Schema can be a superset of the columns in the dataframe, so ignore + # columns that aren't present. + column_name = column.get("name") + if column_name not in dataframe.columns: + continue + + column_type = column.get("type", "").upper() + if column_type in {"DATETIME", "TIMESTAMP"}: + # Use isoformat to ensure that the years are 4 digits. + # https://github.com/googleapis/python-bigquery-pandas/issues/365 + def convert(x): + if pandas.isna(x): + return None + return x.isoformat(sep=" ") + + cast_column = dataframe[column_name].map(convert) + dataframe = dataframe.assign(**{column_name: cast_column}) + return dataframe + + def load_parquet( client: bigquery.Client, dataframe: pandas.DataFrame, @@ -195,6 +227,9 @@ def load_csv_from_dataframe( bq_schema = pandas_gbq.schema.to_google_cloud_bigquery(schema) def load_chunk(chunk, job_config): + if schema is not None: + chunk = cast_dataframe_for_csv(chunk, schema) + client.load_table_from_dataframe( chunk, destination_table_ref, diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py index ad7c58ec..f202389d 100644 --- a/tests/system/test_to_gbq.py +++ b/tests/system/test_to_gbq.py @@ -160,6 +160,31 @@ def test_series_round_trip( ), } ), + expected_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "bool_col": pandas.Series( + [True, False, True], + dtype="bool", + ), + "boolean_col": pandas.Series( + [None, True, False], + dtype="boolean", + ), + "object_col": pandas.Series( + [ + False, + ( + pandas.NA + if hasattr(pandas, "NA") + else None + ), + True, + ], + dtype="object", + ), + } + ), table_schema=[ {"name": "bool_col", "type": "BOOLEAN"}, {"name": "boolean_col", "type": "BOOLEAN"}, From 57046afab48f5a3ad810529006bf6e8f2c194d83 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Mon, 15 Dec 2025 16:02:48 +0000 Subject: [PATCH 2/3] Fix boolean 
round-trip test and CSV datetime loading (v2) - Fix `test_dataframe_round_trip_with_table_schema` failure by expecting `pd.NA` for boolean columns loaded as object, aligning with BigQuery Storage API behavior. - Fix CSV loading failure for extreme datetimes (e.g., year 0001) by introducing `cast_dataframe_for_csv`. This helper forces `isoformat()` string conversion for DATETIME/TIMESTAMP columns, ensuring 4-digit years (e.g., `0001-01-01` instead of `1-01-01`). - `cast_dataframe_for_csv` is robust against non-datetime inputs (falls back to original value) and efficient (batch assigns new columns). - Code formatting applied with `black`. --- pandas_gbq/load/core.py | 14 +++++++++++--- tests/system/test_to_gbq.py | 6 +----- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas_gbq/load/core.py b/pandas_gbq/load/core.py index f230794c..1c3d4724 100644 --- a/pandas_gbq/load/core.py +++ b/pandas_gbq/load/core.py @@ -73,6 +73,7 @@ def cast_dataframe_for_parquet( # Protect against an explicit None in the dictionary. columns = columns if columns is not None else [] + new_columns = {} for column in columns: # Schema can be a superset of the columns in the dataframe, so ignore # columns that aren't present. @@ -135,6 +136,7 @@ def cast_dataframe_for_csv( # Protect against an explicit None in the dictionary. columns = columns if columns is not None else [] + new_columns = {} for column in columns: # Schema can be a superset of the columns in the dataframe, so ignore # columns that aren't present. @@ -149,10 +151,16 @@ def cast_dataframe_for_csv( def convert(x): if pandas.isna(x): return None - return x.isoformat(sep=" ") + try: + return x.isoformat(sep=" ") + except AttributeError: + # It might be a string already or some other type. 
+ return x - cast_column = dataframe[column_name].map(convert) - dataframe = dataframe.assign(**{column_name: cast_column}) + new_columns[column_name] = dataframe[column_name].map(convert) + + if new_columns: + dataframe = dataframe.assign(**new_columns) return dataframe diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py index f202389d..a398b9ad 100644 --- a/tests/system/test_to_gbq.py +++ b/tests/system/test_to_gbq.py @@ -174,11 +174,7 @@ def test_series_round_trip( "object_col": pandas.Series( [ False, - ( - pandas.NA - if hasattr(pandas, "NA") - else None - ), + (pandas.NA if hasattr(pandas, "NA") else None), True, ], dtype="object", From 04bc21cbacc163182f0ba2f9ba560545d34e12ad Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Mon, 15 Dec 2025 12:11:38 -0500 Subject: [PATCH 3/3] Removes unused variable to resolve linting --- pandas_gbq/load/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas_gbq/load/core.py b/pandas_gbq/load/core.py index 1c3d4724..553e56f4 100644 --- a/pandas_gbq/load/core.py +++ b/pandas_gbq/load/core.py @@ -73,7 +73,6 @@ def cast_dataframe_for_parquet( # Protect against an explicit None in the dictionary. columns = columns if columns is not None else [] - new_columns = {} for column in columns: # Schema can be a superset of the columns in the dataframe, so ignore # columns that aren't present.