From d29f72709fdadf097dda5d9e35da2e7c3d9cda7e Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Tue, 25 Nov 2025 11:21:53 +0000 Subject: [PATCH 1/4] bulk remove hydrostats and extraction code --- docs/hat_extract-timeseries.md | 30 --- docs/hat_hydrostats.md | 42 ---- docs/index.md | 6 +- hat/cli.py | 4 - hat/compute_hydrostats/stat_calc.py | 42 ---- hat/compute_hydrostats/stats.py | 83 -------- hat/core.py | 11 -- hat/extract_timeseries/extractor.py | 211 -------------------- pyproject.toml | 2 - tests/test_extractor.py | 288 ---------------------------- tests/test_imports.py | 8 - 11 files changed, 1 insertion(+), 726 deletions(-) delete mode 100644 docs/hat_extract-timeseries.md delete mode 100644 docs/hat_hydrostats.md delete mode 100644 hat/compute_hydrostats/stat_calc.py delete mode 100644 hat/compute_hydrostats/stats.py delete mode 100644 hat/core.py delete mode 100644 hat/extract_timeseries/extractor.py delete mode 100644 tests/test_extractor.py diff --git a/docs/hat_extract-timeseries.md b/docs/hat_extract-timeseries.md deleted file mode 100644 index b89024e..0000000 --- a/docs/hat_extract-timeseries.md +++ /dev/null @@ -1,30 +0,0 @@ -`hat-extract-timeseries` documentation -=============================== - -Extract timeseries from a collection of simulation raster files. - -How to use ------ -Timeseries extraction requires a json configuration file, which can be provided on the command line using `--config` - - $ hat-extract-timeseries --config timeseries.json - -You can show an example of config file using `--show-default-config` - - $ extract_simulation_timeseries --show-default-config - -To create your own configuration json file you might want to start with the default configuration as a template. Default values will be used where possible if not defined in the custom figuration file. Here is an example custom configuration file. - - # example custom configuration .json file - { - "station_metadata_filepath": "/path/to/stations.csv", - "simulation": { - "type": "file", - "files": "*.nc" - }, - "simulation_output_filepath": "./simulation_timeseries.nc", - "station_epsg": 4326, - "station_id_column_name": "obsid", - "station_filters":"", - "station_coordinates": ["Lon", "Lat"] - } diff --git a/docs/hat_hydrostats.md b/docs/hat_hydrostats.md deleted file mode 100644 index 94f64c9..0000000 --- a/docs/hat_hydrostats.md +++ /dev/null @@ -1,42 +0,0 @@ -`hat-hydrostats` documentation -============================== - -Command line tool to calculate hydrological statistics on timeseries. 
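For reference, the headline `kge` score computed by this tool follows the 2012 Kling-Gupta formulation implemented in `hat/compute_hydrostats/stats.py` (removed later in this patch): it combines the correlation, a bias ratio, and a variability ratio of simulated versus observed discharge.

```latex
% KGE as implemented by stats.kge, stats.br and stats.vr:
%   r      -- Pearson correlation between simulation and observation
%   \beta  -- bias ratio:        \overline{sim} / \overline{obs}
%   \gamma -- variability ratio: (\sigma_{sim}/\overline{sim}) / (\sigma_{obs}/\overline{obs})
\mathrm{KGE} = 1 - \sqrt{(r - 1)^2 + (\beta - 1)^2 + (\gamma - 1)^2}
```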
- -How to use ------ -To run this analysis the following are required: - -- `--functions` = names of statistical function(s) -- `--sims` = filepath to simulation file -- `--obs` = filepath to observation file - -For example - -`hydrostats --functions kge --sims $SIMS --obs $OBS` - -These are the currently supported functions: - -- apb -- apb2 -- bias -- br -- correlation -- kge -- index_agreement -- mae -- ns -- nslog -- pc_bias -- pc_bias2 -- rmse -- rsr -- vr - -You can calculate more than one function at once using commas with the `--functions` option - -`hat-hydrostats --functions kge, rmse, mae, correlation --sims $SIMS --obs $OBS` - -(Optionally) define the minimum percentage of observations required for timeseries to be valid using the `--obs_threshold` option (default is 80%) - -`hat-hydrostats --functions kge --sims $SIMS --obs $OBS --obs_threshold 70` diff --git a/docs/index.md b/docs/index.md index 8a03899..3c3c736 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,9 +1,5 @@ # Welcome to HAT -The Hydrological Analysis Toolkit (HAT) is a software suite for hydrologists working with simulated and observed river discharge. HAT performs data analysis on hydrological datasets, with its main features being: +The Hydrological Analysis Toolkit (HAT) is a software suite for hydrologists working with simulated and observed river discharge. HAT performs data analysis on hydrological datasets, with its main feature being: - mapping station locations into hydrological model grids - -- extraction of timeseries - -- statistical analysis of hydrological timeseries diff --git a/hat/cli.py b/hat/cli.py index 972b151..587e8b8 100644 --- a/hat/cli.py +++ b/hat/cli.py @@ -3,8 +3,6 @@ import sys from hat import _LOGGER as logger -from hat.compute_hydrostats.stat_calc import stat_calc -from hat.extract_timeseries.extractor import extractor from hat.station_mapping.mapper import mapper @@ -24,8 +22,6 @@ def wrapper(args=None): mapper_cli = commandlineify(mapper) -extractor_cli = commandlineify(extractor) -stat_calc_cli = commandlineify(stat_calc) if __name__ == "__main__": diff --git a/hat/compute_hydrostats/stat_calc.py b/hat/compute_hydrostats/stat_calc.py deleted file mode 100644 index 4da6a15..0000000 --- a/hat/compute_hydrostats/stat_calc.py +++ /dev/null @@ -1,42 +0,0 @@ -from hat.core import load_da -import numpy as np -import xarray as xr -from hat.compute_hydrostats import stats - - -def find_valid_subset(sim_da, obs_da, sim_coords, obs_coords, new_coords): - sim_station_colname = sim_coords.get("s", "station") - obs_station_colname = obs_coords.get("s", "station") - matching_stations = np.intersect1d(sim_da[sim_station_colname].values, obs_da[obs_station_colname].values) - sim_time_colname = sim_coords.get("t", "time") - obs_time_colname = obs_coords.get("t", "time") - matching_times = np.intersect1d(sim_da[sim_time_colname].values, obs_da[obs_time_colname].values) - - sim_da = sim_da.sel({sim_time_colname: matching_times, sim_station_colname: matching_stations}) - obs_da = obs_da.sel({obs_time_colname: matching_times, obs_station_colname: matching_stations}) - - sim_da = sim_da.rename( - {sim_time_colname: new_coords.get("t", "time"), sim_station_colname: new_coords.get("s", "station")} - ) - obs_da = obs_da.rename( - {obs_time_colname: new_coords.get("t", "time"), obs_station_colname: new_coords.get("s", "station")} - ) - - return sim_da, obs_da - - -def stat_calc(config): - sim_config = config["sim"] - sim_da, _ = load_da(sim_config, 2) - obs_config = config["obs"] - obs_da, _ = 
load_da(obs_config, 2) - new_coords = config["output"]["coords"] - sim_da, obs_da = find_valid_subset(sim_da, obs_da, sim_config["coords"], obs_config["coords"], new_coords) - stat_dict = {} - for stat in config["stats"]: - func = getattr(stats, stat) - stat_dict[stat] = func(sim_da, obs_da, new_coords.get("t", "time")) - ds = xr.Dataset(stat_dict) - if config["output"].get("file", None) is not None: - ds.to_netcdf(config["output"]["file"]) - return ds diff --git a/hat/compute_hydrostats/stats.py b/hat/compute_hydrostats/stats.py deleted file mode 100644 index 8c8aff9..0000000 --- a/hat/compute_hydrostats/stats.py +++ /dev/null @@ -1,83 +0,0 @@ -import numpy as np -import xarray as xr - - -def bias(sim_da, obs_da, time_name): - return (sim_da - obs_da).mean(dim=time_name, skipna=True) - - -def mae(sim_da, obs_da, time_name): - return np.abs(sim_da - obs_da).mean(dim=time_name, skipna=True) - - -def mape(sim_da, obs_da, time_name): - return mae(sim_da, obs_da, time_name) / np.abs(obs_da).sum(dim=time_name, skipna=True) - - -def mse(sim_da, obs_da, time_name): - return ((sim_da - obs_da) ** 2).mean(dim=time_name, skipna=True) - - -def rmse(sim_da, obs_da, time_name): - return np.sqrt(mse(sim_da, obs_da, time_name)) - - -def br(sim_da, obs_da, time_name): - # as defined in: - # Kling, H., Fuchs, M., & Paulin, M. (2012). Runoff conditions in the upper Danube basin under an ensemble of climate change scenarios. Journal of hydrology, 424, 264-277. - return sim_da.mean(dim=time_name, skipna=True) / obs_da.mean(dim=time_name, skipna=True) - - -def vr(sim_da, obs_da, time_name): - # as defined in: - # Kling, H., Fuchs, M., & Paulin, M. (2012). Runoff conditions in the upper Danube basin under an ensemble of climate change scenarios. Journal of hydrology, 424, 264-277. - return (sim_da.std(dim=time_name, skipna=True) * obs_da.mean(dim=time_name, skipna=True)) / ( - obs_da.std(dim=time_name, skipna=True) * sim_da.mean(dim=time_name, skipna=True) - ) - - -def pc_bias(sim_da, obs_da, time_name): - return (sim_da - obs_da).mean(dim=time_name, skipna=True) / obs_da.mean(dim=time_name, skipna=True) - - -def correlation(sim_da, obs_da, time_name): - def _corr(a, b): - mask = ~np.isnan(a) & ~np.isnan(b) - if np.sum(mask) < 2: - return np.nan - return np.corrcoef(a[mask], b[mask])[0, 1] - - return xr.apply_ufunc( - _corr, - sim_da, - obs_da, - input_core_dims=[[time_name], [time_name]], - vectorize=True, - dask="parallelized", - output_dtypes=[float], - ) - - -def kge(sim_da, obs_da, time_name): - # as defined in: - # Kling, H., Fuchs, M., & Paulin, M. (2012). Runoff conditions in the upper Danube basin under an ensemble of climate change scenarios. Journal of hydrology, 424, 264-277. 
- B = br(sim_da, obs_da, time_name) - y = vr(sim_da, obs_da, time_name) - r = correlation(sim_da, obs_da, time_name) - - return 1 - np.sqrt((r - 1) ** 2 + (B - 1) ** 2 + (y - 1) ** 2) - - -def index_agreement(sim_da, obs_da, time_name): - numerator = ((obs_da - sim_da) ** 2).sum(dim=time_name, skipna=True) - mean_obs = obs_da.mean(dim=time_name, skipna=True) - denominator = ((np.abs(sim_da - mean_obs) + np.abs(obs_da - mean_obs)) ** 2).sum(dim=time_name, skipna=True) - - return 1 - (numerator / denominator) - - -def nse(sim_da, obs_da, time_name): - numerator = ((sim_da - obs_da) ** 2).sum(dim=time_name, skipna=True) - denominator = ((obs_da - obs_da.mean(dim=time_name, skipna=True)) ** 2).sum(dim=time_name, skipna=True) - - return 1 - (numerator / denominator) diff --git a/hat/core.py b/hat/core.py deleted file mode 100644 index 12213a5..0000000 --- a/hat/core.py +++ /dev/null @@ -1,11 +0,0 @@ -import earthkit.data as ekd -from earthkit.hydro._readers import find_main_var - - -def load_da(ds_config, n_dims): - src_name = list(ds_config["source"].keys())[0] - source = ekd.from_source(src_name, **ds_config["source"][src_name]) - ds = source.to_xarray(**ds_config.get("to_xarray_options", {})) - var_name = find_main_var(ds, n_dims) - da = ds[var_name] - return da, var_name diff --git a/hat/extract_timeseries/extractor.py b/hat/extract_timeseries/extractor.py deleted file mode 100644 index 7b76a2b..0000000 --- a/hat/extract_timeseries/extractor.py +++ /dev/null @@ -1,211 +0,0 @@ -from dask.diagnostics import ProgressBar -import pandas as pd -import xarray as xr -import numpy as np -from typing import Any -from hat.core import load_da - -from hat import _LOGGER as logger - - -def process_grid_inputs(grid_config): - da, var_name = load_da(grid_config, 3) - logger.info(f"Xarray created from source:\n{da}\n") - coord_config = grid_config.get("coords", {}) - x_dim = coord_config.get("x", "lat") - y_dim = coord_config.get("y", "lon") - da = da.sortby([x_dim, y_dim]) - shape = da[x_dim].shape[0], da[y_dim].shape[0] - return da, var_name, x_dim, y_dim, shape - - -def construct_mask(x_indices, y_indices, shape): - mask = np.zeros(shape, dtype=bool) - mask[x_indices, y_indices] = True - - flat_indices = np.ravel_multi_index((x_indices, y_indices), shape) - _, duplication_indexes = np.unique(flat_indices, return_inverse=True) - return mask, duplication_indexes - - -def create_mask_from_index(df, shape): - logger.info(f"Creating mask {shape} from index") - logger.debug(f"DataFrame columns: {df.columns.tolist()}") - x_indices = df["x_index"].values - y_indices = df["y_index"].values - if np.any(x_indices < 0) or np.any(x_indices >= shape[0]) or np.any(y_indices < 0) or np.any(y_indices >= shape[1]): - raise ValueError( - f"Station indices out of grid bounds. 
Grid shape={shape}, " - f"x_index range=({int(x_indices.min())},{int(x_indices.max())}), " - f"y_index range=({int(y_indices.min())},{int(y_indices.max())})" - ) - mask, duplication_indexes = construct_mask(x_indices, y_indices, shape) - return mask, duplication_indexes - - -def create_mask_from_coords(df, gridx, gridy, shape): - logger.info(f"Creating mask {shape} from coordinates") - logger.debug(f"DataFrame columns: {df.columns.tolist()}") - station_x = df["x_coord"].values - station_y = df["y_coord"].values - - x_distances = np.abs(station_x[:, np.newaxis] - gridx) - x_indices = np.argmin(x_distances, axis=1) - y_distances = np.abs(station_y[:, np.newaxis] - gridy) - y_indices = np.argmin(y_distances, axis=1) - - mask, duplication_indexes = construct_mask(x_indices, y_indices, shape) - return mask, duplication_indexes - - -def parse_stations(station_config: dict[str, Any]) -> pd.DataFrame: - """Read, filter, and normalize station DataFrame to canonical column names.""" - logger.debug(f"Reading station file, {station_config}") - if "name" not in station_config: - raise ValueError("Station config must include a 'name' key mapping to the station column") - df = pd.read_csv(station_config["file"]) - filters = station_config.get("filter") - if filters is not None: - logger.debug(f"Applying filters: {filters} to station DataFrame") - df = df.query(filters) - - if len(df) == 0: - raise ValueError("No stations found. Check station file or filter.") - - has_index = "index" in station_config - has_coords = "coords" in station_config - has_index_1d = "index_1d" in station_config - - if not has_index_1d: - if has_index and has_coords: - raise ValueError("Station config must use either 'index' or 'coords', not both.") - if not has_index and not has_coords: - raise ValueError("Station config must provide either 'index' or 'coords' for station mapping.") - - renames = {} - renames[station_config["name"]] = "station_name" - - if has_index: - index_config = station_config["index"] - x_col = index_config.get("x", "opt_x_index") - y_col = index_config.get("y", "opt_y_index") - renames[x_col] = "x_index" - renames[y_col] = "y_index" - - if has_coords: - coords_config = station_config["coords"] - x_col = coords_config.get("x", "opt_x_coord") - y_col = coords_config.get("y", "opt_y_coord") - renames[x_col] = "x_coord" - renames[y_col] = "y_coord" - - if has_index_1d: - renames[station_config["index_1d"]] = "index_1d" - - df_renamed = df.rename(columns=renames) - - if has_index and ("x_index" not in df_renamed.columns or "y_index" not in df_renamed.columns): - raise ValueError( - "Station file missing required index columns. Expected columns to map to 'x_index' and 'y_index'." - ) - if has_coords and ("x_coord" not in df_renamed.columns or "y_coord" not in df_renamed.columns): - raise ValueError( - "Station file missing required coordinate columns. Expected columns to map to 'x_coord' and 'y_coord'." - ) - if has_index_1d and "index_1d" not in df_renamed.columns: - raise ValueError("Station file missing required 'index_1d' column.") - - return df_renamed - - -def _process_gribjump(grid_config: dict[str, Any], df: pd.DataFrame) -> xr.Dataset: - if "index_1d" not in df.columns: - raise ValueError("Gribjump source requires 'index_1d' in station config.") - - station_names = df["station_name"].values - unique_indices, duplication_indexes = np.unique(df["index_1d"].values, return_inverse=True) # type: ignore[call-overload] - - # Converting indices to ranges is currently faster than using indices - # directly. 
This is a problem in the earthkit-data gribjump source and will - # be fixed there. - ranges = [(i, i + 1) for i in unique_indices] - - gribjump_config = { - "source": { - "gribjump": { - **grid_config["source"]["gribjump"], - "ranges": ranges, - # fetch_coords_from_fdb is currently very slow. Needs fix in - # earthkit-data gribjump source. - # "fetch_coords_from_fdb": True, - } - }, - "to_xarray_options": grid_config.get("to_xarray_options", {}), - } - - masked_da, var_name = load_da(gribjump_config, 2) - - ds = xr.Dataset({var_name: masked_da}) - ds = ds.isel(index=duplication_indexes) - ds = ds.rename({"index": "station"}) - ds["station"] = station_names - return ds - - -def _process_regular(grid_config: dict[str, Any], df: pd.DataFrame) -> xr.Dataset: - station_names = df["station_name"].values - da, var_name, x_dim, y_dim, shape = process_grid_inputs(grid_config) - - use_index = "x_index" in df.columns and "y_index" in df.columns - - if use_index: - mask, duplication_indexes = create_mask_from_index(df, shape) - else: - mask, duplication_indexes = create_mask_from_coords(df, da[x_dim].values, da[y_dim].values, shape) - - logger.info("Extracting timeseries at selected stations") - masked_da = apply_mask(da, mask, x_dim, y_dim) - - ds = xr.Dataset({var_name: masked_da}) - ds = ds.isel(index=duplication_indexes) - ds = ds.rename({"index": "station"}) - ds["station"] = station_names - return ds - - -def process_inputs(station_config: dict[str, Any], grid_config: dict[str, Any]) -> xr.Dataset: - df = parse_stations(station_config) - if "gribjump" in grid_config.get("source", {}): - return _process_gribjump(grid_config, df) - return _process_regular(grid_config, df) - - -def mask_array_np(arr: np.ndarray, mask: np.ndarray) -> np.ndarray: - return arr[..., mask] - - -def apply_mask(da: xr.DataArray, mask: np.ndarray, coordx: str, coordy: str) -> xr.DataArray: - task = xr.apply_ufunc( - mask_array_np, - da, - mask, - input_core_dims=[(coordx, coordy), (coordx, coordy)], - output_core_dims=[["index"]], - output_dtypes=[da.dtype], - exclude_dims={coordx, coordy}, - dask="parallelized", - dask_gufunc_kwargs={ - "output_sizes": {"index": int(mask.sum())}, - "allow_rechunk": True, - }, - ) - with ProgressBar(dt=15): - return task.compute() - - -def extractor(config: dict[str, Any]) -> xr.Dataset: - ds = process_inputs(config["station"], config["grid"]) - if config.get("output", None) is not None: - logger.info(f"Saving output to {config['output']['file']}") - ds.to_netcdf(config["output"]["file"]) - return ds diff --git a/pyproject.toml b/pyproject.toml index 5094c94..e1155cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,8 +75,6 @@ dependencies = [ ] [project.scripts] - hat-extract-timeseries = "hat.cli:extractor_cli" - hat-hydrostats = "hat.cli:stat_calc_cli" hat-station-mapping = "hat.cli:mapper_cli" # Linting settings diff --git a/tests/test_extractor.py b/tests/test_extractor.py deleted file mode 100644 index 41ed39d..0000000 --- a/tests/test_extractor.py +++ /dev/null @@ -1,288 +0,0 @@ -"""Unit tests for the extractor function.""" - -import pytest -import pandas as pd -import numpy as np -import xarray as xr -from unittest.mock import Mock, patch - -from hat.extract_timeseries.extractor import extractor - - -@pytest.fixture -def dummy_grid_data(): - """4x5 grid, 2 timesteps, temperature variable.""" - lats = np.array([40.0, 41.0, 42.0, 43.0]) - lons = np.array([10.0, 11.0, 12.0, 13.0, 14.0]) - - temperature_values = np.array( - [ - # t=0: 2024-01-01 - [ - [10.0, 11.0, 12.0, 13.0, 
14.0], # lat=40 - [15.0, 16.0, 17.0, 18.0, 19.0], # lat=41 - [20.0, 21.0, 22.0, 23.0, 24.0], # lat=42 - [25.0, 26.0, 27.0, 28.0, 29.0], # lat=43 - ], - # t=1: 2024-01-02 - [ - [30.0, 31.0, 32.0, 33.0, 34.0], # lat=40 - [35.0, 36.0, 37.0, 38.0, 39.0], # lat=41 - [40.0, 41.0, 42.0, 43.0, 44.0], # lat=42 - [45.0, 46.0, 47.0, 48.0, 49.0], # lat=43 - ], - ] - ) - - list_of_dicts = [] - - list_of_dicts.append( - { - "values": temperature_values[0].flatten(), - "param": "temperature", - "date": 20240101, - "time": 0, - "distinctLatitudes": lats.tolist(), - "distinctLongitudes": lons.tolist(), - } - ) - - list_of_dicts.append( - { - "values": temperature_values[1].flatten(), - "param": "temperature", - "date": 20240102, - "time": 0, - "distinctLatitudes": lats.tolist(), - "distinctLongitudes": lons.tolist(), - } - ) - - return list_of_dicts - - -@pytest.fixture -def station_dataframe(): - """2 stations with both index and coordinate columns.""" - return pd.DataFrame( - { - "station_id": ["STATION_A", "STATION_B"], - "opt_x_index": [1, 2], - "opt_y_index": [2, 3], - "opt_x_coord": [41.1, 41.9], # offset to test nearest-neighbor - "opt_y_coord": [12.2, 13.1], - } - ) - - -@pytest.fixture -def station_csv_file(station_dataframe, tmp_path): - """Write station DataFrame to temporary CSV.""" - csv_path = tmp_path / "stations.csv" - station_dataframe.to_csv(csv_path, index=False) - return str(csv_path) - - -@pytest.mark.parametrize( - "mapping_config", - [ - {"index": {"x": "opt_x_index", "y": "opt_y_index"}}, - {"coords": {"x": "opt_x_coord", "y": "opt_y_coord"}}, - ], - ids=["index", "coords"], -) -def test_extractor_with_temperature(dummy_grid_data, station_csv_file, mapping_config): - """Test extraction with both index and coords mapping.""" - config = { - "station": { - "file": station_csv_file, - "name": "station_id", - **mapping_config, - }, - "grid": { - "source": { - "list-of-dicts": { - "list_of_dicts": dummy_grid_data, - } - }, - "coords": { - "x": "latitude", - "y": "longitude", - }, - }, - } - - result_ds = extractor(config) - - assert isinstance(result_ds, xr.Dataset) - assert "temperature" in result_ds.data_vars - assert "station" in result_ds.dims - assert len(result_ds.station) == 2 - assert list(result_ds.station.values) == ["STATION_A", "STATION_B"] - - time_dim = "time" if "time" in result_ds.dims else "forecast_reference_time" - assert len(result_ds[time_dim]) == 2 - - # Station A: indices [1,2] -> values [17.0, 37.0] - # Station B: indices [2,3] -> values [23.0, 43.0] - np.testing.assert_allclose(result_ds["temperature"].sel(station="STATION_A").values, [17.0, 37.0]) - np.testing.assert_allclose(result_ds["temperature"].sel(station="STATION_B").values, [23.0, 43.0]) - - -def test_extractor_with_station_filter(dummy_grid_data, tmp_path): - """Test station filtering.""" - df = pd.DataFrame( - { - "station_id": ["S1", "S2", "S3"], - "opt_x_index": [1, 2, 1], - "opt_y_index": [2, 3, 3], - "network": ["primary", "secondary", "primary"], - } - ) - csv_path = tmp_path / "stations.csv" - df.to_csv(csv_path, index=False) - - config = { - "station": { - "file": str(csv_path), - "name": "station_id", - "filter": "network == 'primary'", - "index": {"x": "opt_x_index", "y": "opt_y_index"}, - }, - "grid": { - "source": { - "list-of-dicts": { - "list_of_dicts": dummy_grid_data, - } - }, - "coords": { - "x": "latitude", - "y": "longitude", - }, - }, - } - - result_ds = extractor(config) - - assert len(result_ds.station) == 2 - assert list(result_ds.station.values) == ["S1", "S3"] - assert 
result_ds["temperature"].sel(station="S1").values[0] == 17.0 - assert result_ds["temperature"].sel(station="S3").values[0] == 18.0 - - -def test_extractor_rejects_both_index_and_coords(dummy_grid_data, station_csv_file): - """Test that providing both index and coords raises ValueError.""" - config = { - "station": { - "file": station_csv_file, - "name": "station_id", - "index": {"x": "opt_x_index", "y": "opt_y_index"}, - "coords": {"x": "opt_x_coord", "y": "opt_y_coord"}, # Both provided - }, - "grid": { - "source": {"list-of-dicts": {"list_of_dicts": dummy_grid_data}}, - "coords": {"x": "latitude", "y": "longitude"}, - }, - } - - with pytest.raises(ValueError, match="must use either 'index' or 'coords', not both"): - extractor(config) - - -def test_extractor_with_output_file(dummy_grid_data, station_csv_file, tmp_path): - """Test output file writing.""" - output_file = tmp_path / "output.nc" - - config = { - "station": { - "file": station_csv_file, - "name": "station_id", - "index": {"x": "opt_x_index", "y": "opt_y_index"}, - }, - "grid": { - "source": { - "list-of-dicts": { - "list_of_dicts": dummy_grid_data, - } - }, - "coords": { - "x": "latitude", - "y": "longitude", - }, - }, - "output": { - "file": str(output_file), - }, - } - - result_ds = extractor(config) - - assert output_file.exists() - - loaded_ds = xr.open_dataset(output_file) - assert "temperature" in loaded_ds.data_vars - assert "station" in loaded_ds.dims - assert len(loaded_ds.station) == 2 - - xr.testing.assert_allclose(result_ds["temperature"], loaded_ds["temperature"]) - xr.testing.assert_equal(result_ds.station, loaded_ds.station) - - loaded_ds.close() - - -def test_extractor_with_empty_stations(dummy_grid_data, tmp_path): - """Test that extractor raises clear error for empty station list.""" - empty_csv = tmp_path / "empty_stations.csv" - pd.DataFrame(columns=["station_id", "opt_x_index", "opt_y_index"]).to_csv(empty_csv, index=False) - - config = { - "station": { - "file": str(empty_csv), - "name": "station_id", - "index": {"x": "opt_x_index", "y": "opt_y_index"}, - }, - "grid": { - "source": {"list-of-dicts": {"list_of_dicts": dummy_grid_data}}, - "coords": {"x": "latitude", "y": "longitude"}, - }, - } - - with pytest.raises(ValueError, match="No stations found"): - extractor(config) - - -@patch("earthkit.data.from_source") -def test_extractor_gribjump(mock_from_source, tmp_path): - """Test gribjump path: verifies ranges computation and earthkit call.""" - - # Mock returns object with to_xarray() that returns minimal dataset - mock_source = Mock() - mock_source.to_xarray.return_value = xr.Dataset( - {"temperature": xr.DataArray([[15.0, 25.0], [35.0, 45.0]], dims=["index", "time"])} - ) - mock_from_source.return_value = mock_source - - # Station CSV with index_1d (includes duplicate to test deduplication) - csv_file = tmp_path / "stations.csv" - pd.DataFrame( - { - "name": ["S1", "S2", "S3"], - "idx": [100, 200, 100], # S1 and S3 share index 100 - } - ).to_csv(csv_file, index=False) - - config = { - "station": {"file": str(csv_file), "name": "name", "index_1d": "idx"}, - "grid": {"source": {"gribjump": {"request": {"class": "od", "expver": "0001", "stream": "oper"}}}}, - } - - result = extractor(config) - - # Verify earthkit.data.from_source was called correctly - mock_from_source.assert_called_once_with( - "gribjump", request={"class": "od", "expver": "0001", "stream": "oper"}, ranges=[(100, 101), (200, 201)] - ) - - # Verify output - assert len(result.station) == 3 - assert list(result.station.values) == ["S1", 
"S2", "S3"] diff --git a/tests/test_imports.py b/tests/test_imports.py index eadc903..bdd65cd 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -1,10 +1,2 @@ def test_mapper_import(): from hat.station_mapping.mapper import mapper - - -def test_extractor_import(): - from hat.extract_timeseries.extractor import extractor - - -def test_stat_calc_import(): - from hat.compute_hydrostats.stat_calc import stat_calc From 9c0bead84cbf296b1ea9bad0c07d8cf9009a3aa3 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Tue, 25 Nov 2025 11:32:14 +0000 Subject: [PATCH 2/4] remove more files and udpate README --- README.md | 18 +---- .../workflow/hydrostats_computation.ipynb | 71 ----------------- .../workflow/timeseries_extraction.ipynb | 76 ------------------- pyproject.toml | 8 -- 4 files changed, 4 insertions(+), 169 deletions(-) delete mode 100644 notebooks/workflow/hydrostats_computation.ipynb delete mode 100644 notebooks/workflow/timeseries_extraction.ipynb diff --git a/README.md b/README.md index 6a420fa..c3324f4 100644 --- a/README.md +++ b/README.md @@ -28,8 +28,10 @@ The Hydrological Analysis Toolkit (HAT) is a software suite for hydrologists working with simulated and observed river discharge. HAT performs data analysis on hydrological datasets, with its main features being: - mapping station locations into hydrological model grids -- extraction of timeseries at station locations from gridded model outputs -- statistical analysis of hydrological timeseries +- interactive visualizations + +> [!NOTE] +> The station extraction and hydrostats functionality formerly in HAT now live in [ecmwf/hyve](https://github.com/ecmwf/hyve). ### Installation @@ -50,18 +52,6 @@ pip install -e .[dev] pre-commit install ``` -HAT provides **experimental** support for earthkit-data's [gribjump source](https://earthkit-data.readthedocs.io/en/latest/guide/sources.html#gribjump). -To install the gribjump extras for testing and experimentation, run: -```bash -pip install hydro-analysis-toolkit[gribjump] -``` - -> [!NOTE] -> The gribjump feature is experimental. It is not recommended for production use and may change or break in future releases. -> Information on how to build gribjump can be found in [GribJump's source code](https://github.com/ecmwf/gribjump/). Experimental -> wheels of `gribjumplib` can also be found [on PyPI](https://pypi.org/project/gribjumplib/). 
- - ## Licence ``` diff --git a/notebooks/workflow/hydrostats_computation.ipynb b/notebooks/workflow/hydrostats_computation.ipynb deleted file mode 100644 index bb82958..0000000 --- a/notebooks/workflow/hydrostats_computation.ipynb +++ /dev/null @@ -1,71 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "6908e7ba", - "metadata": {}, - "outputs": [], - "source": [ - "from hat.compute_hydrostats import stat_calc" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c6c9405", - "metadata": {}, - "outputs": [], - "source": [ - "config = {\n", - " \"sim\": {\n", - " \"source\": {\"file\": \"extracted_timeseries.nc\"},\n", - " \"coords\": {\n", - " \"s\": \"station\",\n", - " \"t\": \"time\"\n", - " }\n", - " },\n", - " \"obs\": {\n", - " \"source\": {\"file\": \"observations.nc\"},\n", - " \"coords\": {\n", - " \"s\": \"station\",\n", - " \"t\": \"time\"\n", - " }\n", - " },\n", - " \"stats\" : [\"bias\", \"apb\", \"apb2\"],\n", - " \"output\": {\n", - " \"file\": \"statistics.nc\",\n", - " \"coords\": {\n", - " \"s\": \"station\",\n", - " \"t\": \"time\"\n", - " }\n", - " },\n", - "}\n", - "\n", - "ds = stat_calc.stat_calc(config)\n", - "ds" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "hat", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/workflow/timeseries_extraction.ipynb b/notebooks/workflow/timeseries_extraction.ipynb deleted file mode 100644 index d5de09e..0000000 --- a/notebooks/workflow/timeseries_extraction.ipynb +++ /dev/null @@ -1,76 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "6908e7ba", - "metadata": {}, - "outputs": [], - "source": [ - "from hat.extract_timeseries import extractor" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c6c9405", - "metadata": {}, - "outputs": [], - "source": [ - "config = {\n", - " \"station\": {\n", - " \"file\": \"mapped_stations.csv\",\n", - " \"filter\": \"(StationLon >= 0) and (drainage_area_provided >= 0)\",\n", - " # EITHER USE INDEX OR COORDS, NOT BOTH\n", - " # INDEX is preferred but requires you to have the exact same grid\n", - " # as the upstream area used in station mapping\n", - " # \"index\": {\n", - " # \"x\": \"opt_x_index\",\n", - " # \"y\": \"opt_y_index\"\n", - " # },\n", - " \"coords\": {\n", - " \"x\": \"opt_x_coord\",\n", - " \"y\": \"opt_y_coord\"\n", - " },\n", - " \"name\": \"station_id\"\n", - " },\n", - " \"grid\": {\n", - " \"source\": {\"file\": \"./sim.nc\"},\n", - " \"coords\": {\n", - " \"x\": \"lat\",\n", - " \"y\": \"lon\",\n", - " # TODO: allow custom \"t\": \"time\" specification\n", - " }\n", - " },\n", - " \"output\": {\n", - " \"file\": \"extracted_timeseries.nc\"\n", - " },\n", - "}\n", - "\n", - "ds = extractor.extractor(config)\n", - "ds" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, - "nbformat": 4, - 
"nbformat_minor": 5 -} diff --git a/pyproject.toml b/pyproject.toml index e1155cd..b1cba3b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,9 +70,6 @@ dependencies = [ "ruff", "pre-commit" ] - gribjump = [ - "earthkit-data[gribjump]" - ] [project.scripts] hat-station-mapping = "hat.cli:mapper_cli" @@ -100,11 +97,6 @@ addopts = "--pdbcls=IPython.terminal.debugger:Pdb" testpaths = [ "tests", ] -filterwarnings = [ - # Probably harmless numpy 2.x ABI warning from netCDF4's Cython extension - # See: https://github.com/Unidata/netcdf4-python/issues/1354 - "ignore:numpy.ndarray size changed:RuntimeWarning", -] # Packaging/setuptools options [tool.setuptools] From be5679917778e07fba62637556475ca214be3ec0 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Tue, 25 Nov 2025 11:38:43 +0000 Subject: [PATCH 3/4] remove missing lines in docs --- mkdocs.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index 5ec673a..11d64c5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -7,7 +7,5 @@ nav: - Usage: usage.md - Command Line Interface: - station_mapping: station_mapping.md - - hat-extract-timeseries: hat_extract-timeseries.md - - hat-hydrostats: hat_hydrostats.md theme: readthedocs From 9e1784cab225b2bc2b6c322ca7a4a5d2fbde917e Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:49:30 +0000 Subject: [PATCH 4/4] remove missed hat-extract-timeseries occurence in docs --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index f0d998a..00a5f5c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -10,7 +10,7 @@ If you have already [installed](installation.md) hat then Run a command line tool, for example - $ hat-extract-timeseries --help + $ hat-station-mapping --help For more information on individual command line tools, use the `--help` option at the command line or read the documentation, for instance for the [station mapping](station_mapping.md) tool.