diff --git a/pandas_gbq/load/__init__.py b/pandas_gbq/load/__init__.py
index 250d6517..2fa2f24c 100644
--- a/pandas_gbq/load/__init__.py
+++ b/pandas_gbq/load/__init__.py
@@ -3,6 +3,7 @@
 # license that can be found in the LICENSE file.
 
 from pandas_gbq.load.core import (
+    cast_dataframe_for_csv,
     cast_dataframe_for_parquet,
     encode_chunk,
     load_chunks,
@@ -13,6 +14,7 @@
 )
 
 __all__ = [
+    "cast_dataframe_for_csv",
     "cast_dataframe_for_parquet",
     "encode_chunk",
     "load_chunks",
diff --git a/pandas_gbq/load/core.py b/pandas_gbq/load/core.py
index d98f8306..553e56f4 100644
--- a/pandas_gbq/load/core.py
+++ b/pandas_gbq/load/core.py
@@ -124,6 +124,59 @@ def convert(x):
     return dataframe
 
 
+def cast_dataframe_for_csv(
+    dataframe: pandas.DataFrame,
+    schema: Optional[Dict[str, Any]],
+) -> pandas.DataFrame:
+    """Cast columns to needed dtype when writing CSV files.
+
+    DATETIME and TIMESTAMP columns are mapped through ``isoformat`` so that
+    years are always written with 4 digits. Other columns are left as-is.
+
+    Args:
+        dataframe: Data about to be serialized as CSV.
+        schema: Table schema as a dictionary with a "fields" list of
+            column dictionaries ("name", "type"). May be None, may omit
+            "fields", or may hold an explicit None for "fields".
+
+    Returns:
+        The input dataframe when nothing needs casting, otherwise the
+        result of ``DataFrame.assign`` with the converted columns replaced.
+    """
+
+    def convert(x):
+        if pandas.isna(x):
+            return None
+        try:
+            return x.isoformat(sep=" ")
+        except AttributeError:
+            # It might be a string already or some other type.
+            return x
+
+    # The annotation allows None, and "fields" may be missing or an explicit
+    # None in the dictionary; treat every such case as "no columns".
+    columns = (schema or {}).get("fields") or []
+
+    new_columns = {}
+    for column in columns:
+        # Schema can be a superset of the columns in the dataframe, so ignore
+        # columns that aren't present.
+        column_name = column.get("name")
+        if column_name not in dataframe.columns:
+            continue
+
+        # Protect against an explicit None for "type" as well.
+        column_type = (column.get("type") or "").upper()
+        if column_type in {"DATETIME", "TIMESTAMP"}:
+            # Use isoformat to ensure that the years are 4 digits.
+            # https://github.com/googleapis/python-bigquery-pandas/issues/365
+            new_columns[column_name] = dataframe[column_name].map(convert)
+
+    if new_columns:
+        dataframe = dataframe.assign(**new_columns)
+    return dataframe
+
+
 def load_parquet(
     client: bigquery.Client,
     dataframe: pandas.DataFrame,
@@ -195,6 +248,9 @@ def load_csv_from_dataframe(
         bq_schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)
 
     def load_chunk(chunk, job_config):
+        if schema is not None:
+            chunk = cast_dataframe_for_csv(chunk, schema)
+
         client.load_table_from_dataframe(
             chunk,
             destination_table_ref,
diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
index ad7c58ec..a398b9ad 100644
--- a/tests/system/test_to_gbq.py
+++ b/tests/system/test_to_gbq.py
@@ -160,6 +160,27 @@ def test_series_round_trip(
                     ),
                 }
             ),
+            expected_df=pandas.DataFrame(
+                {
+                    "row_num": [0, 1, 2],
+                    "bool_col": pandas.Series(
+                        [True, False, True],
+                        dtype="bool",
+                    ),
+                    "boolean_col": pandas.Series(
+                        [None, True, False],
+                        dtype="boolean",
+                    ),
+                    "object_col": pandas.Series(
+                        [
+                            False,
+                            (pandas.NA if hasattr(pandas, "NA") else None),
+                            True,
+                        ],
+                        dtype="object",
+                    ),
+                }
+            ),
             table_schema=[
                 {"name": "bool_col", "type": "BOOLEAN"},
                 {"name": "boolean_col", "type": "BOOLEAN"},