From d29f72709fdadf097dda5d9e35da2e7c3d9cda7e Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Tue, 25 Nov 2025 11:21:53 +0000 Subject: [PATCH 1/4] bulk remove hydrostats and extraction code --- docs/hat_extract-timeseries.md | 30 --- docs/hat_hydrostats.md | 42 ---- docs/index.md | 6 +- hat/cli.py | 4 - hat/compute_hydrostats/stat_calc.py | 42 ---- hat/compute_hydrostats/stats.py | 83 -------- hat/core.py | 11 -- hat/extract_timeseries/extractor.py | 211 -------------------- pyproject.toml | 2 - tests/test_extractor.py | 288 ---------------------------- tests/test_imports.py | 8 - 11 files changed, 1 insertion(+), 726 deletions(-) delete mode 100644 docs/hat_extract-timeseries.md delete mode 100644 docs/hat_hydrostats.md delete mode 100644 hat/compute_hydrostats/stat_calc.py delete mode 100644 hat/compute_hydrostats/stats.py delete mode 100644 hat/core.py delete mode 100644 hat/extract_timeseries/extractor.py delete mode 100644 tests/test_extractor.py diff --git a/docs/hat_extract-timeseries.md b/docs/hat_extract-timeseries.md deleted file mode 100644 index b89024e..0000000 --- a/docs/hat_extract-timeseries.md +++ /dev/null @@ -1,30 +0,0 @@ -`hat-extract-timeseries` documentation -=============================== - -Extract timeseries from a collection of simulation raster files. - -How to use ------ -Timeseries extraction requires a json configuration file, which can be provided on the command line using `--config` - - $ hat-extract-timeseries --config timeseries.json - -You can show an example of config file using `--show-default-config` - - $ extract_simulation_timeseries --show-default-config - -To create your own configuration json file you might want to start with the default configuration as a template. Default values will be used where possible if not defined in the custom figuration file. Here is an example custom configuration file. - - # example custom configuration .json file - { - "station_metadata_filepath": "/path/to/stations.csv", - "simulation": { - "type": "file", - "files": "*.nc" - }, - "simulation_output_filepath": "./simulation_timeseries.nc", - "station_epsg": 4326, - "station_id_column_name": "obsid", - "station_filters":"", - "station_coordinates": ["Lon", "Lat"] - } diff --git a/docs/hat_hydrostats.md b/docs/hat_hydrostats.md deleted file mode 100644 index 94f64c9..0000000 --- a/docs/hat_hydrostats.md +++ /dev/null @@ -1,42 +0,0 @@ -`hat-hydrostats` documentation -============================== - -Command line tool to calculate hydrological statistics on timeseries. 
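For reference, the headline `kge` score computed by this tool follows the 2012 Kling-Gupta formulation implemented in `hat/compute_hydrostats/stats.py` (removed later in this patch): it combines the correlation, a bias ratio, and a variability ratio of simulated versus observed discharge.

```latex
% KGE as implemented by stats.kge, stats.br and stats.vr:
%   r      -- Pearson correlation between simulation and observation
%   \beta  -- bias ratio:        \overline{sim} / \overline{obs}
%   \gamma -- variability ratio: (\sigma_{sim}/\overline{sim}) / (\sigma_{obs}/\overline{obs})
\mathrm{KGE} = 1 - \sqrt{(r - 1)^2 + (\beta - 1)^2 + (\gamma - 1)^2}
```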
- -How to use ------ -To run this analysis the following are required: - -- `--functions` = names of statistical function(s) -- `--sims` = filepath to simulation file -- `--obs` = filepath to observation file - -For example - -`hydrostats --functions kge --sims $SIMS --obs $OBS` - -These are the currently supported functions: - -- apb -- apb2 -- bias -- br -- correlation -- kge -- index_agreement -- mae -- ns -- nslog -- pc_bias -- pc_bias2 -- rmse -- rsr -- vr - -You can calculate more than one function at once using commas with the `--functions` option - -`hat-hydrostats --functions kge, rmse, mae, correlation --sims $SIMS --obs $OBS` - -(Optionally) define the minimum percentage of observations required for timeseries to be valid using the `--obs_threshold` option (default is 80%) - -`hat-hydrostats --functions kge --sims $SIMS --obs $OBS --obs_threshold 70` diff --git a/docs/index.md b/docs/index.md index 8a03899..3c3c736 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,9 +1,5 @@ # Welcome to HAT -The Hydrological Analysis Toolkit (HAT) is a software suite for hydrologists working with simulated and observed river discharge. HAT performs data analysis on hydrological datasets, with its main features being: +The Hydrological Analysis Toolkit (HAT) is a software suite for hydrologists working with simulated and observed river discharge. HAT performs data analysis on hydrological datasets, with its main feature being: - mapping station locations into hydrological model grids - -- extraction of timeseries - -- statistical analysis of hydrological timeseries diff --git a/hat/cli.py b/hat/cli.py index 972b151..587e8b8 100644 --- a/hat/cli.py +++ b/hat/cli.py @@ -3,8 +3,6 @@ import sys from hat import _LOGGER as logger -from hat.compute_hydrostats.stat_calc import stat_calc -from hat.extract_timeseries.extractor import extractor from hat.station_mapping.mapper import mapper @@ -24,8 +22,6 @@ def wrapper(args=None): mapper_cli = commandlineify(mapper) -extractor_cli = commandlineify(extractor) -stat_calc_cli = commandlineify(stat_calc) if __name__ == "__main__": diff --git a/hat/compute_hydrostats/stat_calc.py b/hat/compute_hydrostats/stat_calc.py deleted file mode 100644 index 4da6a15..0000000 --- a/hat/compute_hydrostats/stat_calc.py +++ /dev/null @@ -1,42 +0,0 @@ -from hat.core import load_da -import numpy as np -import xarray as xr -from hat.compute_hydrostats import stats - - -def find_valid_subset(sim_da, obs_da, sim_coords, obs_coords, new_coords): - sim_station_colname = sim_coords.get("s", "station") - obs_station_colname = obs_coords.get("s", "station") - matching_stations = np.intersect1d(sim_da[sim_station_colname].values, obs_da[obs_station_colname].values) - sim_time_colname = sim_coords.get("t", "time") - obs_time_colname = obs_coords.get("t", "time") - matching_times = np.intersect1d(sim_da[sim_time_colname].values, obs_da[obs_time_colname].values) - - sim_da = sim_da.sel({sim_time_colname: matching_times, sim_station_colname: matching_stations}) - obs_da = obs_da.sel({obs_time_colname: matching_times, obs_station_colname: matching_stations}) - - sim_da = sim_da.rename( - {sim_time_colname: new_coords.get("t", "time"), sim_station_colname: new_coords.get("s", "station")} - ) - obs_da = obs_da.rename( - {obs_time_colname: new_coords.get("t", "time"), obs_station_colname: new_coords.get("s", "station")} - ) - - return sim_da, obs_da - - -def stat_calc(config): - sim_config = config["sim"] - sim_da, _ = load_da(sim_config, 2) - obs_config = config["obs"] - obs_da, _ = 
load_da(obs_config, 2) - new_coords = config["output"]["coords"] - sim_da, obs_da = find_valid_subset(sim_da, obs_da, sim_config["coords"], obs_config["coords"], new_coords) - stat_dict = {} - for stat in config["stats"]: - func = getattr(stats, stat) - stat_dict[stat] = func(sim_da, obs_da, new_coords.get("t", "time")) - ds = xr.Dataset(stat_dict) - if config["output"].get("file", None) is not None: - ds.to_netcdf(config["output"]["file"]) - return ds diff --git a/hat/compute_hydrostats/stats.py b/hat/compute_hydrostats/stats.py deleted file mode 100644 index 8c8aff9..0000000 --- a/hat/compute_hydrostats/stats.py +++ /dev/null @@ -1,83 +0,0 @@ -import numpy as np -import xarray as xr - - -def bias(sim_da, obs_da, time_name): - return (sim_da - obs_da).mean(dim=time_name, skipna=True) - - -def mae(sim_da, obs_da, time_name): - return np.abs(sim_da - obs_da).mean(dim=time_name, skipna=True) - - -def mape(sim_da, obs_da, time_name): - return mae(sim_da, obs_da, time_name) / np.abs(obs_da).sum(dim=time_name, skipna=True) - - -def mse(sim_da, obs_da, time_name): - return ((sim_da - obs_da) ** 2).mean(dim=time_name, skipna=True) - - -def rmse(sim_da, obs_da, time_name): - return np.sqrt(mse(sim_da, obs_da, time_name)) - - -def br(sim_da, obs_da, time_name): - # as defined in: - # Kling, H., Fuchs, M., & Paulin, M. (2012). Runoff conditions in the upper Danube basin under an ensemble of climate change scenarios. Journal of hydrology, 424, 264-277. - return sim_da.mean(dim=time_name, skipna=True) / obs_da.mean(dim=time_name, skipna=True) - - -def vr(sim_da, obs_da, time_name): - # as defined in: - # Kling, H., Fuchs, M., & Paulin, M. (2012). Runoff conditions in the upper Danube basin under an ensemble of climate change scenarios. Journal of hydrology, 424, 264-277. - return (sim_da.std(dim=time_name, skipna=True) * obs_da.mean(dim=time_name, skipna=True)) / ( - obs_da.std(dim=time_name, skipna=True) * sim_da.mean(dim=time_name, skipna=True) - ) - - -def pc_bias(sim_da, obs_da, time_name): - return (sim_da - obs_da).mean(dim=time_name, skipna=True) / obs_da.mean(dim=time_name, skipna=True) - - -def correlation(sim_da, obs_da, time_name): - def _corr(a, b): - mask = ~np.isnan(a) & ~np.isnan(b) - if np.sum(mask) < 2: - return np.nan - return np.corrcoef(a[mask], b[mask])[0, 1] - - return xr.apply_ufunc( - _corr, - sim_da, - obs_da, - input_core_dims=[[time_name], [time_name]], - vectorize=True, - dask="parallelized", - output_dtypes=[float], - ) - - -def kge(sim_da, obs_da, time_name): - # as defined in: - # Kling, H., Fuchs, M., & Paulin, M. (2012). Runoff conditions in the upper Danube basin under an ensemble of climate change scenarios. Journal of hydrology, 424, 264-277. 
- B = br(sim_da, obs_da, time_name) - y = vr(sim_da, obs_da, time_name) - r = correlation(sim_da, obs_da, time_name) - - return 1 - np.sqrt((r - 1) ** 2 + (B - 1) ** 2 + (y - 1) ** 2) - - -def index_agreement(sim_da, obs_da, time_name): - numerator = ((obs_da - sim_da) ** 2).sum(dim=time_name, skipna=True) - mean_obs = obs_da.mean(dim=time_name, skipna=True) - denominator = ((np.abs(sim_da - mean_obs) + np.abs(obs_da - mean_obs)) ** 2).sum(dim=time_name, skipna=True) - - return 1 - (numerator / denominator) - - -def nse(sim_da, obs_da, time_name): - numerator = ((sim_da - obs_da) ** 2).sum(dim=time_name, skipna=True) - denominator = ((obs_da - obs_da.mean(dim=time_name, skipna=True)) ** 2).sum(dim=time_name, skipna=True) - - return 1 - (numerator / denominator) diff --git a/hat/core.py b/hat/core.py deleted file mode 100644 index 12213a5..0000000 --- a/hat/core.py +++ /dev/null @@ -1,11 +0,0 @@ -import earthkit.data as ekd -from earthkit.hydro._readers import find_main_var - - -def load_da(ds_config, n_dims): - src_name = list(ds_config["source"].keys())[0] - source = ekd.from_source(src_name, **ds_config["source"][src_name]) - ds = source.to_xarray(**ds_config.get("to_xarray_options", {})) - var_name = find_main_var(ds, n_dims) - da = ds[var_name] - return da, var_name diff --git a/hat/extract_timeseries/extractor.py b/hat/extract_timeseries/extractor.py deleted file mode 100644 index 7b76a2b..0000000 --- a/hat/extract_timeseries/extractor.py +++ /dev/null @@ -1,211 +0,0 @@ -from dask.diagnostics import ProgressBar -import pandas as pd -import xarray as xr -import numpy as np -from typing import Any -from hat.core import load_da - -from hat import _LOGGER as logger - - -def process_grid_inputs(grid_config): - da, var_name = load_da(grid_config, 3) - logger.info(f"Xarray created from source:\n{da}\n") - coord_config = grid_config.get("coords", {}) - x_dim = coord_config.get("x", "lat") - y_dim = coord_config.get("y", "lon") - da = da.sortby([x_dim, y_dim]) - shape = da[x_dim].shape[0], da[y_dim].shape[0] - return da, var_name, x_dim, y_dim, shape - - -def construct_mask(x_indices, y_indices, shape): - mask = np.zeros(shape, dtype=bool) - mask[x_indices, y_indices] = True - - flat_indices = np.ravel_multi_index((x_indices, y_indices), shape) - _, duplication_indexes = np.unique(flat_indices, return_inverse=True) - return mask, duplication_indexes - - -def create_mask_from_index(df, shape): - logger.info(f"Creating mask {shape} from index") - logger.debug(f"DataFrame columns: {df.columns.tolist()}") - x_indices = df["x_index"].values - y_indices = df["y_index"].values - if np.any(x_indices < 0) or np.any(x_indices >= shape[0]) or np.any(y_indices < 0) or np.any(y_indices >= shape[1]): - raise ValueError( - f"Station indices out of grid bounds. 
Grid shape={shape}, " - f"x_index range=({int(x_indices.min())},{int(x_indices.max())}), " - f"y_index range=({int(y_indices.min())},{int(y_indices.max())})" - ) - mask, duplication_indexes = construct_mask(x_indices, y_indices, shape) - return mask, duplication_indexes - - -def create_mask_from_coords(df, gridx, gridy, shape): - logger.info(f"Creating mask {shape} from coordinates") - logger.debug(f"DataFrame columns: {df.columns.tolist()}") - station_x = df["x_coord"].values - station_y = df["y_coord"].values - - x_distances = np.abs(station_x[:, np.newaxis] - gridx) - x_indices = np.argmin(x_distances, axis=1) - y_distances = np.abs(station_y[:, np.newaxis] - gridy) - y_indices = np.argmin(y_distances, axis=1) - - mask, duplication_indexes = construct_mask(x_indices, y_indices, shape) - return mask, duplication_indexes - - -def parse_stations(station_config: dict[str, Any]) -> pd.DataFrame: - """Read, filter, and normalize station DataFrame to canonical column names.""" - logger.debug(f"Reading station file, {station_config}") - if "name" not in station_config: - raise ValueError("Station config must include a 'name' key mapping to the station column") - df = pd.read_csv(station_config["file"]) - filters = station_config.get("filter") - if filters is not None: - logger.debug(f"Applying filters: {filters} to station DataFrame") - df = df.query(filters) - - if len(df) == 0: - raise ValueError("No stations found. Check station file or filter.") - - has_index = "index" in station_config - has_coords = "coords" in station_config - has_index_1d = "index_1d" in station_config - - if not has_index_1d: - if has_index and has_coords: - raise ValueError("Station config must use either 'index' or 'coords', not both.") - if not has_index and not has_coords: - raise ValueError("Station config must provide either 'index' or 'coords' for station mapping.") - - renames = {} - renames[station_config["name"]] = "station_name" - - if has_index: - index_config = station_config["index"] - x_col = index_config.get("x", "opt_x_index") - y_col = index_config.get("y", "opt_y_index") - renames[x_col] = "x_index" - renames[y_col] = "y_index" - - if has_coords: - coords_config = station_config["coords"] - x_col = coords_config.get("x", "opt_x_coord") - y_col = coords_config.get("y", "opt_y_coord") - renames[x_col] = "x_coord" - renames[y_col] = "y_coord" - - if has_index_1d: - renames[station_config["index_1d"]] = "index_1d" - - df_renamed = df.rename(columns=renames) - - if has_index and ("x_index" not in df_renamed.columns or "y_index" not in df_renamed.columns): - raise ValueError( - "Station file missing required index columns. Expected columns to map to 'x_index' and 'y_index'." - ) - if has_coords and ("x_coord" not in df_renamed.columns or "y_coord" not in df_renamed.columns): - raise ValueError( - "Station file missing required coordinate columns. Expected columns to map to 'x_coord' and 'y_coord'." - ) - if has_index_1d and "index_1d" not in df_renamed.columns: - raise ValueError("Station file missing required 'index_1d' column.") - - return df_renamed - - -def _process_gribjump(grid_config: dict[str, Any], df: pd.DataFrame) -> xr.Dataset: - if "index_1d" not in df.columns: - raise ValueError("Gribjump source requires 'index_1d' in station config.") - - station_names = df["station_name"].values - unique_indices, duplication_indexes = np.unique(df["index_1d"].values, return_inverse=True) # type: ignore[call-overload] - - # Converting indices to ranges is currently faster than using indices - # directly. 
This is a problem in the earthkit-data gribjump source and will - # be fixed there. - ranges = [(i, i + 1) for i in unique_indices] - - gribjump_config = { - "source": { - "gribjump": { - **grid_config["source"]["gribjump"], - "ranges": ranges, - # fetch_coords_from_fdb is currently very slow. Needs fix in - # earthkit-data gribjump source. - # "fetch_coords_from_fdb": True, - } - }, - "to_xarray_options": grid_config.get("to_xarray_options", {}), - } - - masked_da, var_name = load_da(gribjump_config, 2) - - ds = xr.Dataset({var_name: masked_da}) - ds = ds.isel(index=duplication_indexes) - ds = ds.rename({"index": "station"}) - ds["station"] = station_names - return ds - - -def _process_regular(grid_config: dict[str, Any], df: pd.DataFrame) -> xr.Dataset: - station_names = df["station_name"].values - da, var_name, x_dim, y_dim, shape = process_grid_inputs(grid_config) - - use_index = "x_index" in df.columns and "y_index" in df.columns - - if use_index: - mask, duplication_indexes = create_mask_from_index(df, shape) - else: - mask, duplication_indexes = create_mask_from_coords(df, da[x_dim].values, da[y_dim].values, shape) - - logger.info("Extracting timeseries at selected stations") - masked_da = apply_mask(da, mask, x_dim, y_dim) - - ds = xr.Dataset({var_name: masked_da}) - ds = ds.isel(index=duplication_indexes) - ds = ds.rename({"index": "station"}) - ds["station"] = station_names - return ds - - -def process_inputs(station_config: dict[str, Any], grid_config: dict[str, Any]) -> xr.Dataset: - df = parse_stations(station_config) - if "gribjump" in grid_config.get("source", {}): - return _process_gribjump(grid_config, df) - return _process_regular(grid_config, df) - - -def mask_array_np(arr: np.ndarray, mask: np.ndarray) -> np.ndarray: - return arr[..., mask] - - -def apply_mask(da: xr.DataArray, mask: np.ndarray, coordx: str, coordy: str) -> xr.DataArray: - task = xr.apply_ufunc( - mask_array_np, - da, - mask, - input_core_dims=[(coordx, coordy), (coordx, coordy)], - output_core_dims=[["index"]], - output_dtypes=[da.dtype], - exclude_dims={coordx, coordy}, - dask="parallelized", - dask_gufunc_kwargs={ - "output_sizes": {"index": int(mask.sum())}, - "allow_rechunk": True, - }, - ) - with ProgressBar(dt=15): - return task.compute() - - -def extractor(config: dict[str, Any]) -> xr.Dataset: - ds = process_inputs(config["station"], config["grid"]) - if config.get("output", None) is not None: - logger.info(f"Saving output to {config['output']['file']}") - ds.to_netcdf(config["output"]["file"]) - return ds diff --git a/pyproject.toml b/pyproject.toml index 5094c94..e1155cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,8 +75,6 @@ dependencies = [ ] [project.scripts] - hat-extract-timeseries = "hat.cli:extractor_cli" - hat-hydrostats = "hat.cli:stat_calc_cli" hat-station-mapping = "hat.cli:mapper_cli" # Linting settings diff --git a/tests/test_extractor.py b/tests/test_extractor.py deleted file mode 100644 index 41ed39d..0000000 --- a/tests/test_extractor.py +++ /dev/null @@ -1,288 +0,0 @@ -"""Unit tests for the extractor function.""" - -import pytest -import pandas as pd -import numpy as np -import xarray as xr -from unittest.mock import Mock, patch - -from hat.extract_timeseries.extractor import extractor - - -@pytest.fixture -def dummy_grid_data(): - """4x5 grid, 2 timesteps, temperature variable.""" - lats = np.array([40.0, 41.0, 42.0, 43.0]) - lons = np.array([10.0, 11.0, 12.0, 13.0, 14.0]) - - temperature_values = np.array( - [ - # t=0: 2024-01-01 - [ - [10.0, 11.0, 12.0, 13.0, 
14.0], # lat=40 - [15.0, 16.0, 17.0, 18.0, 19.0], # lat=41 - [20.0, 21.0, 22.0, 23.0, 24.0], # lat=42 - [25.0, 26.0, 27.0, 28.0, 29.0], # lat=43 - ], - # t=1: 2024-01-02 - [ - [30.0, 31.0, 32.0, 33.0, 34.0], # lat=40 - [35.0, 36.0, 37.0, 38.0, 39.0], # lat=41 - [40.0, 41.0, 42.0, 43.0, 44.0], # lat=42 - [45.0, 46.0, 47.0, 48.0, 49.0], # lat=43 - ], - ] - ) - - list_of_dicts = [] - - list_of_dicts.append( - { - "values": temperature_values[0].flatten(), - "param": "temperature", - "date": 20240101, - "time": 0, - "distinctLatitudes": lats.tolist(), - "distinctLongitudes": lons.tolist(), - } - ) - - list_of_dicts.append( - { - "values": temperature_values[1].flatten(), - "param": "temperature", - "date": 20240102, - "time": 0, - "distinctLatitudes": lats.tolist(), - "distinctLongitudes": lons.tolist(), - } - ) - - return list_of_dicts - - -@pytest.fixture -def station_dataframe(): - """2 stations with both index and coordinate columns.""" - return pd.DataFrame( - { - "station_id": ["STATION_A", "STATION_B"], - "opt_x_index": [1, 2], - "opt_y_index": [2, 3], - "opt_x_coord": [41.1, 41.9], # offset to test nearest-neighbor - "opt_y_coord": [12.2, 13.1], - } - ) - - -@pytest.fixture -def station_csv_file(station_dataframe, tmp_path): - """Write station DataFrame to temporary CSV.""" - csv_path = tmp_path / "stations.csv" - station_dataframe.to_csv(csv_path, index=False) - return str(csv_path) - - -@pytest.mark.parametrize( - "mapping_config", - [ - {"index": {"x": "opt_x_index", "y": "opt_y_index"}}, - {"coords": {"x": "opt_x_coord", "y": "opt_y_coord"}}, - ], - ids=["index", "coords"], -) -def test_extractor_with_temperature(dummy_grid_data, station_csv_file, mapping_config): - """Test extraction with both index and coords mapping.""" - config = { - "station": { - "file": station_csv_file, - "name": "station_id", - **mapping_config, - }, - "grid": { - "source": { - "list-of-dicts": { - "list_of_dicts": dummy_grid_data, - } - }, - "coords": { - "x": "latitude", - "y": "longitude", - }, - }, - } - - result_ds = extractor(config) - - assert isinstance(result_ds, xr.Dataset) - assert "temperature" in result_ds.data_vars - assert "station" in result_ds.dims - assert len(result_ds.station) == 2 - assert list(result_ds.station.values) == ["STATION_A", "STATION_B"] - - time_dim = "time" if "time" in result_ds.dims else "forecast_reference_time" - assert len(result_ds[time_dim]) == 2 - - # Station A: indices [1,2] -> values [17.0, 37.0] - # Station B: indices [2,3] -> values [23.0, 43.0] - np.testing.assert_allclose(result_ds["temperature"].sel(station="STATION_A").values, [17.0, 37.0]) - np.testing.assert_allclose(result_ds["temperature"].sel(station="STATION_B").values, [23.0, 43.0]) - - -def test_extractor_with_station_filter(dummy_grid_data, tmp_path): - """Test station filtering.""" - df = pd.DataFrame( - { - "station_id": ["S1", "S2", "S3"], - "opt_x_index": [1, 2, 1], - "opt_y_index": [2, 3, 3], - "network": ["primary", "secondary", "primary"], - } - ) - csv_path = tmp_path / "stations.csv" - df.to_csv(csv_path, index=False) - - config = { - "station": { - "file": str(csv_path), - "name": "station_id", - "filter": "network == 'primary'", - "index": {"x": "opt_x_index", "y": "opt_y_index"}, - }, - "grid": { - "source": { - "list-of-dicts": { - "list_of_dicts": dummy_grid_data, - } - }, - "coords": { - "x": "latitude", - "y": "longitude", - }, - }, - } - - result_ds = extractor(config) - - assert len(result_ds.station) == 2 - assert list(result_ds.station.values) == ["S1", "S3"] - assert 
result_ds["temperature"].sel(station="S1").values[0] == 17.0 - assert result_ds["temperature"].sel(station="S3").values[0] == 18.0 - - -def test_extractor_rejects_both_index_and_coords(dummy_grid_data, station_csv_file): - """Test that providing both index and coords raises ValueError.""" - config = { - "station": { - "file": station_csv_file, - "name": "station_id", - "index": {"x": "opt_x_index", "y": "opt_y_index"}, - "coords": {"x": "opt_x_coord", "y": "opt_y_coord"}, # Both provided - }, - "grid": { - "source": {"list-of-dicts": {"list_of_dicts": dummy_grid_data}}, - "coords": {"x": "latitude", "y": "longitude"}, - }, - } - - with pytest.raises(ValueError, match="must use either 'index' or 'coords', not both"): - extractor(config) - - -def test_extractor_with_output_file(dummy_grid_data, station_csv_file, tmp_path): - """Test output file writing.""" - output_file = tmp_path / "output.nc" - - config = { - "station": { - "file": station_csv_file, - "name": "station_id", - "index": {"x": "opt_x_index", "y": "opt_y_index"}, - }, - "grid": { - "source": { - "list-of-dicts": { - "list_of_dicts": dummy_grid_data, - } - }, - "coords": { - "x": "latitude", - "y": "longitude", - }, - }, - "output": { - "file": str(output_file), - }, - } - - result_ds = extractor(config) - - assert output_file.exists() - - loaded_ds = xr.open_dataset(output_file) - assert "temperature" in loaded_ds.data_vars - assert "station" in loaded_ds.dims - assert len(loaded_ds.station) == 2 - - xr.testing.assert_allclose(result_ds["temperature"], loaded_ds["temperature"]) - xr.testing.assert_equal(result_ds.station, loaded_ds.station) - - loaded_ds.close() - - -def test_extractor_with_empty_stations(dummy_grid_data, tmp_path): - """Test that extractor raises clear error for empty station list.""" - empty_csv = tmp_path / "empty_stations.csv" - pd.DataFrame(columns=["station_id", "opt_x_index", "opt_y_index"]).to_csv(empty_csv, index=False) - - config = { - "station": { - "file": str(empty_csv), - "name": "station_id", - "index": {"x": "opt_x_index", "y": "opt_y_index"}, - }, - "grid": { - "source": {"list-of-dicts": {"list_of_dicts": dummy_grid_data}}, - "coords": {"x": "latitude", "y": "longitude"}, - }, - } - - with pytest.raises(ValueError, match="No stations found"): - extractor(config) - - -@patch("earthkit.data.from_source") -def test_extractor_gribjump(mock_from_source, tmp_path): - """Test gribjump path: verifies ranges computation and earthkit call.""" - - # Mock returns object with to_xarray() that returns minimal dataset - mock_source = Mock() - mock_source.to_xarray.return_value = xr.Dataset( - {"temperature": xr.DataArray([[15.0, 25.0], [35.0, 45.0]], dims=["index", "time"])} - ) - mock_from_source.return_value = mock_source - - # Station CSV with index_1d (includes duplicate to test deduplication) - csv_file = tmp_path / "stations.csv" - pd.DataFrame( - { - "name": ["S1", "S2", "S3"], - "idx": [100, 200, 100], # S1 and S3 share index 100 - } - ).to_csv(csv_file, index=False) - - config = { - "station": {"file": str(csv_file), "name": "name", "index_1d": "idx"}, - "grid": {"source": {"gribjump": {"request": {"class": "od", "expver": "0001", "stream": "oper"}}}}, - } - - result = extractor(config) - - # Verify earthkit.data.from_source was called correctly - mock_from_source.assert_called_once_with( - "gribjump", request={"class": "od", "expver": "0001", "stream": "oper"}, ranges=[(100, 101), (200, 201)] - ) - - # Verify output - assert len(result.station) == 3 - assert list(result.station.values) == ["S1", 
"S2", "S3"] diff --git a/tests/test_imports.py b/tests/test_imports.py index eadc903..bdd65cd 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -1,10 +1,2 @@ def test_mapper_import(): from hat.station_mapping.mapper import mapper - - -def test_extractor_import(): - from hat.extract_timeseries.extractor import extractor - - -def test_stat_calc_import(): - from hat.compute_hydrostats.stat_calc import stat_calc From 9c0bead84cbf296b1ea9bad0c07d8cf9009a3aa3 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Tue, 25 Nov 2025 11:32:14 +0000 Subject: [PATCH 2/4] remove more files and udpate README --- README.md | 18 +---- .../workflow/hydrostats_computation.ipynb | 71 ----------------- .../workflow/timeseries_extraction.ipynb | 76 ------------------- pyproject.toml | 8 -- 4 files changed, 4 insertions(+), 169 deletions(-) delete mode 100644 notebooks/workflow/hydrostats_computation.ipynb delete mode 100644 notebooks/workflow/timeseries_extraction.ipynb diff --git a/README.md b/README.md index 6a420fa..c3324f4 100644 --- a/README.md +++ b/README.md @@ -28,8 +28,10 @@ The Hydrological Analysis Toolkit (HAT) is a software suite for hydrologists working with simulated and observed river discharge. HAT performs data analysis on hydrological datasets, with its main features being: - mapping station locations into hydrological model grids -- extraction of timeseries at station locations from gridded model outputs -- statistical analysis of hydrological timeseries +- interactive visualizations + +> [!NOTE] +> The station extraction and hydrostats functionality formerly in HAT now live in [ecmwf/hyve](https://github.com/ecmwf/hyve). ### Installation @@ -50,18 +52,6 @@ pip install -e .[dev] pre-commit install ``` -HAT provides **experimental** support for earthkit-data's [gribjump source](https://earthkit-data.readthedocs.io/en/latest/guide/sources.html#gribjump). -To install the gribjump extras for testing and experimentation, run: -```bash -pip install hydro-analysis-toolkit[gribjump] -``` - -> [!NOTE] -> The gribjump feature is experimental. It is not recommended for production use and may change or break in future releases. -> Information on how to build gribjump can be found in [GribJump's source code](https://github.com/ecmwf/gribjump/). Experimental -> wheels of `gribjumplib` can also be found [on PyPI](https://pypi.org/project/gribjumplib/). 
- - ## Licence ``` diff --git a/notebooks/workflow/hydrostats_computation.ipynb b/notebooks/workflow/hydrostats_computation.ipynb deleted file mode 100644 index bb82958..0000000 --- a/notebooks/workflow/hydrostats_computation.ipynb +++ /dev/null @@ -1,71 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "6908e7ba", - "metadata": {}, - "outputs": [], - "source": [ - "from hat.compute_hydrostats import stat_calc" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c6c9405", - "metadata": {}, - "outputs": [], - "source": [ - "config = {\n", - " \"sim\": {\n", - " \"source\": {\"file\": \"extracted_timeseries.nc\"},\n", - " \"coords\": {\n", - " \"s\": \"station\",\n", - " \"t\": \"time\"\n", - " }\n", - " },\n", - " \"obs\": {\n", - " \"source\": {\"file\": \"observations.nc\"},\n", - " \"coords\": {\n", - " \"s\": \"station\",\n", - " \"t\": \"time\"\n", - " }\n", - " },\n", - " \"stats\" : [\"bias\", \"apb\", \"apb2\"],\n", - " \"output\": {\n", - " \"file\": \"statistics.nc\",\n", - " \"coords\": {\n", - " \"s\": \"station\",\n", - " \"t\": \"time\"\n", - " }\n", - " },\n", - "}\n", - "\n", - "ds = stat_calc.stat_calc(config)\n", - "ds" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "hat", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/workflow/timeseries_extraction.ipynb b/notebooks/workflow/timeseries_extraction.ipynb deleted file mode 100644 index d5de09e..0000000 --- a/notebooks/workflow/timeseries_extraction.ipynb +++ /dev/null @@ -1,76 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "6908e7ba", - "metadata": {}, - "outputs": [], - "source": [ - "from hat.extract_timeseries import extractor" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c6c9405", - "metadata": {}, - "outputs": [], - "source": [ - "config = {\n", - " \"station\": {\n", - " \"file\": \"mapped_stations.csv\",\n", - " \"filter\": \"(StationLon >= 0) and (drainage_area_provided >= 0)\",\n", - " # EITHER USE INDEX OR COORDS, NOT BOTH\n", - " # INDEX is preferred but requires you to have the exact same grid\n", - " # as the upstream area used in station mapping\n", - " # \"index\": {\n", - " # \"x\": \"opt_x_index\",\n", - " # \"y\": \"opt_y_index\"\n", - " # },\n", - " \"coords\": {\n", - " \"x\": \"opt_x_coord\",\n", - " \"y\": \"opt_y_coord\"\n", - " },\n", - " \"name\": \"station_id\"\n", - " },\n", - " \"grid\": {\n", - " \"source\": {\"file\": \"./sim.nc\"},\n", - " \"coords\": {\n", - " \"x\": \"lat\",\n", - " \"y\": \"lon\",\n", - " # TODO: allow custom \"t\": \"time\" specification\n", - " }\n", - " },\n", - " \"output\": {\n", - " \"file\": \"extracted_timeseries.nc\"\n", - " },\n", - "}\n", - "\n", - "ds = extractor.extractor(config)\n", - "ds" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, - "nbformat": 4, - 
"nbformat_minor": 5 -} diff --git a/pyproject.toml b/pyproject.toml index e1155cd..b1cba3b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,9 +70,6 @@ dependencies = [ "ruff", "pre-commit" ] - gribjump = [ - "earthkit-data[gribjump]" - ] [project.scripts] hat-station-mapping = "hat.cli:mapper_cli" @@ -100,11 +97,6 @@ addopts = "--pdbcls=IPython.terminal.debugger:Pdb" testpaths = [ "tests", ] -filterwarnings = [ - # Probably harmless numpy 2.x ABI warning from netCDF4's Cython extension - # See: https://github.com/Unidata/netcdf4-python/issues/1354 - "ignore:numpy.ndarray size changed:RuntimeWarning", -] # Packaging/setuptools options [tool.setuptools] From be5679917778e07fba62637556475ca214be3ec0 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Tue, 25 Nov 2025 11:38:43 +0000 Subject: [PATCH 3/4] remove missing lines in docs --- mkdocs.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index 5ec673a..11d64c5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -7,7 +7,5 @@ nav: - Usage: usage.md - Command Line Interface: - station_mapping: station_mapping.md - - hat-extract-timeseries: hat_extract-timeseries.md - - hat-hydrostats: hat_hydrostats.md theme: readthedocs From 9e1784cab225b2bc2b6c322ca7a4a5d2fbde917e Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:49:30 +0000 Subject: [PATCH 4/4] remove missed hat-extract-timeseries occurence in docs --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index f0d998a..00a5f5c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -10,7 +10,7 @@ If you have already [installed](installation.md) hat then Run a command line tool, for example - $ hat-extract-timeseries --help + $ hat-station-mapping --help For more information on individual command line tools, use the `--help` option at the command line or read the documentation, for instance for the [station mapping](station_mapping.md) tool.