diff --git a/pyproject.toml b/pyproject.toml index 4520c45d..c5c10917 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -139,6 +139,15 @@ files = ["src"] target-version = "py310" extend-include = ["*.ipynb"] +# Exclude shapefile data (directory and component files) +exclude = [ + "shapefiles", + "*.shp", + "*.shx", + "*.dbf", + "*.prj" +] + [tool.ruff.lint] select = [ "A", diff --git a/src/ewatercycle/_forcings/caravan.py b/src/ewatercycle/_forcings/caravan.py index 8a535f1c..141ef83e 100644 --- a/src/ewatercycle/_forcings/caravan.py +++ b/src/ewatercycle/_forcings/caravan.py @@ -1,3 +1,4 @@ +import os import shutil import zipfile from pathlib import Path @@ -12,7 +13,7 @@ from ewatercycle.util import get_time COMMON_URL = "ca13056c-c347-4a27-b320-930c2a4dd207" -OPENDAP_URL = f"https://opendap.4tu.nl/thredds/dodsC/data2/djht/{COMMON_URL}/1/" +OPENDAP_URL = f"https://opendap.4tu.nl/thredds/dodsC/data2/djht/{COMMON_URL}/2/" SHAPEFILE_URL = ( f"https://data.4tu.nl/file/{COMMON_URL}/bbe94526-cf1a-4b96-8155-244f20094719" ) @@ -106,7 +107,12 @@ class CaravanForcing(DefaultForcing): @classmethod def get_dataset(cls: type["CaravanForcing"], dataset: str) -> xr.Dataset: - """Opens specified dataset from data.4tu.nl OPeNDAP server. + """Opens dataset from data.4tu.nl OPeNDAP server, or cache if available. + + By default, it will open the dataset from the data.4tu.nl OPeNDAP server. + This can be overridden by setting the environment variable CARAVAN_CACHE. + Set this variable to the directory containing the netCDF files. 
+ Args: dataset (str): name of dataset, choose from: @@ -118,6 +124,10 @@ def get_dataset(cls: type["CaravanForcing"], dataset: str) -> xr.Dataset: 'hysets', 'lamah' """ + cache_dir = os.environ.get("CARAVAN_CACHE") + # Check if we want to load from 4TU or dCache + if cache_dir: + return xr.open_dataset(Path(cache_dir) / f"{dataset}.nc") return xr.open_dataset(f"{OPENDAP_URL}{dataset}.nc") @classmethod @@ -246,7 +256,18 @@ def generate( # type: ignore[override] def get_shapefiles(directory: Path, basin_id: str) -> Path: - """Retrieve shapefiles from data 4TU.nl .""" + """Retrieve shapefiles from data 4TU.nl or cache.""" + cache_dir = os.environ.get("CARAVAN_CACHE") + # Check if we want to load from 4TU or dCache + if cache_dir: + shape_path = directory / f"{basin_id}.shp" + combined_shapefile_path = Path(cache_dir) / "shapefiles" / "combined.shp" + + if not shape_path.is_file(): + extract_basin_shapefile(basin_id, combined_shapefile_path, shape_path) + + return shape_path + zip_path = directory / "shapefiles.zip" output_path = directory / "shapefiles" shape_path = directory / f"{basin_id}.shp" diff --git a/tests/src/base/forcing_files/README.md b/tests/src/base/forcing_files/README.md index 749fd46f..c563bf64 100644 --- a/tests/src/base/forcing_files/README.md +++ b/tests/src/base/forcing_files/README.md @@ -4,6 +4,6 @@ The data only includes a year of forcing for one catchment. For own use, please download from the original source and cite correctly. The Caravan dataset itself is also a combination of data from seperate sources. -The Carvan dataset is originanly obtained from https://zenodo.org/records/7944025 and is explained in a paper by Kratzert, F. :'Caravan - A global community dataset for large-sample hydrology' found here: https://doi-org.tudelft.idm.oclc.org/10.1038/s41597-023-01975-w +The Caravan dataset is originally obtained from https://zenodo.org/records/7944025 and is explained in a paper by Kratzert, F. 
:'Caravan - A global community dataset for large-sample hydrology' found here: https://doi.org/10.1038/s41597-023-01975-w Distributed under Creative Commons Attribution 4.0 International. diff --git a/tests/src/base/forcing_files/camels_03439000.cpg b/tests/src/base/forcing_files/camels_03439000.cpg index cd89cb97..57decb48 100644 --- a/tests/src/base/forcing_files/camels_03439000.cpg +++ b/tests/src/base/forcing_files/camels_03439000.cpg @@ -1 +1 @@ -ISO-8859-1 \ No newline at end of file +ISO-8859-1 diff --git a/tests/src/base/forcing_files/camels_03439000.prj b/tests/src/base/forcing_files/camels_03439000.prj index f45cbadf..0ae685b4 100644 --- a/tests/src/base/forcing_files/camels_03439000.prj +++ b/tests/src/base/forcing_files/camels_03439000.prj @@ -1 +1 @@ -GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]] \ No newline at end of file +GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]] diff --git a/tests/src/base/forcing_files/camels_03439000.shx b/tests/src/base/forcing_files/camels_03439000.shx index 548b65f7..124438b6 100644 Binary files a/tests/src/base/forcing_files/camels_03439000.shx and b/tests/src/base/forcing_files/camels_03439000.shx differ diff --git a/tests/src/base/forcing_files/shapefiles/combined.cpg b/tests/src/base/forcing_files/shapefiles/combined.cpg new file mode 100644 index 00000000..7edc66b0 --- /dev/null +++ b/tests/src/base/forcing_files/shapefiles/combined.cpg @@ -0,0 +1 @@ +UTF-8 diff --git a/tests/src/base/forcing_files/shapefiles/combined.dbf b/tests/src/base/forcing_files/shapefiles/combined.dbf new file mode 100644 index 00000000..d4c35dd5 Binary files /dev/null and b/tests/src/base/forcing_files/shapefiles/combined.dbf differ diff --git a/tests/src/base/forcing_files/shapefiles/combined.prj 
b/tests/src/base/forcing_files/shapefiles/combined.prj new file mode 100644 index 00000000..0ae685b4 --- /dev/null +++ b/tests/src/base/forcing_files/shapefiles/combined.prj @@ -0,0 +1 @@ +GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]] diff --git a/tests/src/base/forcing_files/shapefiles/combined.shp b/tests/src/base/forcing_files/shapefiles/combined.shp new file mode 100644 index 00000000..217dd6d9 Binary files /dev/null and b/tests/src/base/forcing_files/shapefiles/combined.shp differ diff --git a/tests/src/base/forcing_files/shapefiles/combined.shx b/tests/src/base/forcing_files/shapefiles/combined.shx new file mode 100644 index 00000000..9ee15835 Binary files /dev/null and b/tests/src/base/forcing_files/shapefiles/combined.shx differ diff --git a/tests/src/base/test_forcing.py b/tests/src/base/test_forcing.py index 2966821f..dc1ab228 100644 --- a/tests/src/base/test_forcing.py +++ b/tests/src/base/test_forcing.py @@ -427,3 +427,41 @@ def test_extract_basin_shapefile(tmp_path: Path): assert len(records) == 1 assert records[0].attributes["gauge_id"] == basin_id + + +def test_get_dataset_using_cache(tmp_path, monkeypatch): + # Prepare cache directory + cache_dir = tmp_path / "cache" + cache_dir.mkdir() + + basin_id = "camels_01022500" + # Use the existing fake Caravan dataset + test_files_dir = Path(__file__).parent / "forcing_files" + test_file = test_files_dir / "test_caravan_file.nc" + cache_target = cache_dir / "camels.nc" + cache_target.write_bytes(test_file.read_bytes()) + + # Copy shapefiles into the cache so Fiona can find them + shapefiles_dir = test_files_dir / "shapefiles" + cache_shapefiles_dir = cache_dir / "shapefiles" + copytree(shapefiles_dir, cache_shapefiles_dir) + + # Point CARAVAN_CACHE to this directory + monkeypatch.setenv("CARAVAN_CACHE", str(cache_dir)) + + # Copy other forcing files to tmp_camels_dir + tmp_camels_dir = tmp_path / "camels" + 
copytree(test_files_dir, tmp_camels_dir) + + # Generate the forcing while CARAVAN_CACHE points at the prepared cache + ds = CaravanForcing.generate( + start_time="1981-01-01T00:00:00Z", + end_time="1981-03-01T00:00:00Z", + directory=str(tmp_camels_dir), + basin_id=basin_id, + ).to_xarray() + + # Check that the generated forcing contains the expected variables + content = list(ds.data_vars.keys()) + expected = ["Q", "evspsblpot", "pr", "tas", "tasmax", "tasmin"] + assert content == expected