From d3e064e21f22b13a2fbfdee8ce4d7a1f2d4bd381 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Wed, 3 Dec 2025 16:08:04 -0800 Subject: [PATCH 1/2] Adding back in lazy xr.Dataset and metadata parsing fix. --- xarray_sql/df.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/xarray_sql/df.py b/xarray_sql/df.py index a249d10..ba75d7b 100644 --- a/xarray_sql/df.py +++ b/xarray_sql/df.py @@ -171,6 +171,23 @@ def _parse_schema(ds) -> pa.Schema: return pa.schema(columns) +def _parse_schema(ds) -> pa.Schema: + """Extracts a `pa.Schema` from the Dataset, treating dims and data_vars as columns.""" + columns = [] + + for coord_name, coord_var in ds.coords.items(): + # Only include dimension coordinates + if coord_name in ds.dims: + pa_type = pa.from_numpy_dtype(coord_var.dtype) + columns.append(pa.field(coord_name, pa_type)) + + for var_name, var in ds.data_vars.items(): + pa_type = pa.from_numpy_dtype(var.dtype) + columns.append(pa.field(var_name, pa_type)) + + return pa.schema(columns) + + def read_xarray(ds: xr.Dataset, chunks: Chunks = None) -> pa.RecordBatchReader: """Pivots an Xarray Dataset into a PyArrow Table, partitioned by chunks. @@ -183,18 +200,15 @@ def read_xarray(ds: xr.Dataset, chunks: Chunks = None) -> pa.RecordBatchReader: Returns: A PyArrow Table, which is a table representation of the input Dataset. """ - fst = next(iter(ds.values())).dims - assert all( - da.dims == fst for da in ds.values() - ), "All dimensions must be equal. Please filter data_vars in the Dataset." - - blocks = list(block_slices(ds, chunks)) def pivot_block(b: Block): return pivot(ds.isel(b)) - schema = pa.Schema.from_pandas(pivot_block(blocks[0])) - last_schema = pa.Schema.from_pandas(pivot_block(blocks[-1])) - assert schema == last_schema, "Schemas must be consistent across blocks!" + fst = next(iter(ds.values())).dims + assert all( + da.dims == fst for da in ds.values() + ), "All dimensions must be equal. Please filter data_vars in the Dataset." + schema = _parse_schema(ds) + blocks = block_slices(ds, chunks) return from_map_batched(pivot_block, blocks, schema=schema) From 836181ab58b2bcf0310e73ec137b6e6c08ab75aa Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Thu, 11 Dec 2025 15:17:35 -0800 Subject: [PATCH 2/2] Duplicate code block! --- xarray_sql/df.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/xarray_sql/df.py b/xarray_sql/df.py index ba75d7b..ef9e4c6 100644 --- a/xarray_sql/df.py +++ b/xarray_sql/df.py @@ -171,23 +171,6 @@ def _parse_schema(ds) -> pa.Schema: return pa.schema(columns) -def _parse_schema(ds) -> pa.Schema: - """Extracts a `pa.Schema` from the Dataset, treating dims and data_vars as columns.""" - columns = [] - - for coord_name, coord_var in ds.coords.items(): - # Only include dimension coordinates - if coord_name in ds.dims: - pa_type = pa.from_numpy_dtype(coord_var.dtype) - columns.append(pa.field(coord_name, pa_type)) - - for var_name, var in ds.data_vars.items(): - pa_type = pa.from_numpy_dtype(var.dtype) - columns.append(pa.field(var_name, pa_type)) - - return pa.schema(columns) - - def read_xarray(ds: xr.Dataset, chunks: Chunks = None) -> pa.RecordBatchReader: """Pivots an Xarray Dataset into a PyArrow Table, partitioned by chunks.