From 6608794e8174c0a3883adeded1ef5989ac6b2e48 Mon Sep 17 00:00:00 2001
From: Lars Pastewka
Date: Sun, 7 Dec 2025 21:32:47 +0100
Subject: [PATCH 1/6] ENH: Added utility functions to create frozen datasets

---
 dtoolcore/__init__.py               |  97 +++++
 tests/test_create_frozen_dataset.py | 580 ++++++++++++++++++++++++++++
 2 files changed, 677 insertions(+)
 create mode 100644 tests/test_create_frozen_dataset.py

diff --git a/dtoolcore/__init__.py b/dtoolcore/__init__.py
index 211aa30..dbfb38d 100644
--- a/dtoolcore/__init__.py
+++ b/dtoolcore/__init__.py
@@ -224,6 +224,103 @@ def create_derived_proto_dataset(
     return proto_dataset
 
 
+def create_frozen_dataset(
+    base_uri,
+    uuid,
+    name,
+    creator_username,
+    frozen_at,
+    manifest,
+    readme_content="",
+    tags=None,
+    annotations=None,
+    config_path=None
+):
+    """Create a frozen dataset directly from metadata.
+
+    This function creates a frozen dataset without going through the
+    proto-dataset stage. It is useful for server-side operations where
+    the dataset structure is known upfront (e.g., signed URL uploads).
+
+    The function writes the admin metadata, structure, manifest, README,
+    tags and annotations directly to storage. Item payloads are not
+    handled here and must be uploaded separately.
+
+    :param base_uri: base URI where the dataset will be created
+    :param uuid: dataset UUID
+    :param name: dataset name
+    :param creator_username: username of the dataset creator
+    :param frozen_at: timestamp when the dataset was frozen
+    :param manifest: manifest dictionary with items metadata
+    :param readme_content: optional README content string
+    :param tags: optional list of tags
+    :param annotations: optional dictionary of annotations
+    :param config_path: path to dtool configuration file
+    :returns: DataSet instance
+    :raises: DtoolCoreInvalidNameError if name or any tag/annotation name is invalid
+    """
+    logger.debug("In create_frozen_dataset...")
+
+    # Validate name
+    if not dtoolcore.utils.name_is_valid(name):
+        raise DtoolCoreInvalidNameError(f"Invalid dataset name: {name}")
+
+    # Validate tags
+    if tags:
+        for tag in tags:
+            if not isinstance(tag, str):
+                raise DtoolCoreValueError(f"Tag must be a string: {tag}")
+            if not dtoolcore.utils.name_is_valid(tag):
+                raise DtoolCoreInvalidNameError(f"Invalid tag name: {tag}")
+
+    # Validate annotations
+    if annotations:
+        for annotation_name in annotations.keys():
+            if not dtoolcore.utils.name_is_valid(annotation_name):
+                raise DtoolCoreInvalidNameError(
+                    f"Invalid annotation name: {annotation_name}"
+                )
+
+    # Build admin metadata for a frozen dataset
+    admin_metadata = {
+        "uuid": uuid,
+        "dtoolcore_version": __version__,
+        "name": name,
+        "type": "dataset",  # Frozen dataset, not protodataset
+        "creator_username": creator_username,
+        "frozen_at": frozen_at,
+    }
+
+    # Get storage broker
+    uri = _generate_uri(admin_metadata, base_uri)
+    storage_broker = _get_storage_broker(uri, config_path)
+
+    # Create the dataset structure
+    storage_broker.create_structure()
+
+    # Write admin metadata
+    storage_broker.put_admin_metadata(admin_metadata)
+
+    # Write manifest
+    storage_broker.put_manifest(manifest)
+
+    # Write README
+    storage_broker.put_readme(readme_content)
+
+    # Write tags
+    if tags:
+        for tag in tags:
+            storage_broker.put_tag(tag)
+
+    # Write annotations
+    if annotations:
+        for annotation_name, annotation_value in annotations.items():
+            storage_broker.put_annotation(annotation_name, annotation_value)
+
+    # Return a DataSet instance
+    return DataSet(uri, admin_metadata, config_path)
+
+
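As an aside, a minimal sketch of how a server-side handler might drive this helper once a client has finished uploading via signed URLs; the payload shape, field names and base URI are assumptions for illustration, not part of the patch:

    # Hypothetical server-side registration step (illustrative only).
    # Assumes the client has already uploaded the item payloads and
    # reported their pre-computed properties; the `payload` field names
    # are made up for this sketch.
    import uuid

    import dtoolcore


    def register_uploaded_dataset(payload, base_uri="file:///srv/dtool"):
        manifest = {
            "dtoolcore_version": dtoolcore.__version__,
            "hash_function": "md5sum_hexdigest",
            "items": payload["items"],  # identifier -> item properties
        }
        return dtoolcore.create_frozen_dataset(
            base_uri=base_uri,
            uuid=str(uuid.uuid4()),
            name=payload["name"],
            creator_username=payload["creator"],
            frozen_at=payload["frozen_at"],
            manifest=manifest,
            readme_content=payload.get("readme", ""),
            tags=payload.get("tags"),
            annotations=payload.get("annotations"),
        )

 def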
_copy_create_proto_dataset( src_dataset, dest_base_uri, diff --git a/tests/test_create_frozen_dataset.py b/tests/test_create_frozen_dataset.py new file mode 100644 index 0000000..2b48f2f --- /dev/null +++ b/tests/test_create_frozen_dataset.py @@ -0,0 +1,580 @@ +"""Test the create_frozen_dataset function.""" + +import os +import uuid as uuid_module + +import pytest + +from . import tmp_dir_fixture # NOQA + +from dtoolcore.utils import ( + IS_WINDOWS, + generous_parse_uri, + windows_to_unix_path, + generate_identifier, +) + + +def _sanitise_base_uri(tmp_dir): + base_uri = tmp_dir + if IS_WINDOWS: + parsed_base_uri = generous_parse_uri(tmp_dir) + unix_path = windows_to_unix_path(parsed_base_uri.path) + base_uri = "file://{}".format(unix_path) + return base_uri + + +def test_create_frozen_dataset_basic(tmp_dir_fixture): # NOQA + """Test basic creation of a frozen dataset.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + dataset_uuid = str(uuid_module.uuid4()) + name = "test-frozen-dataset" + creator_username = "tester" + frozen_at = 1234567890.123 + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + dataset = dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=dataset_uuid, + name=name, + creator_username=creator_username, + frozen_at=frozen_at, + manifest=manifest, + ) + + # Verify it's a DataSet instance + assert isinstance(dataset, dtoolcore.DataSet) + + # Verify admin metadata + assert dataset.uuid == dataset_uuid + assert dataset.name == name + assert dataset.admin_metadata["creator_username"] == creator_username + assert dataset.admin_metadata["frozen_at"] == frozen_at + assert dataset.admin_metadata["type"] == "dataset" + assert dataset.admin_metadata["dtoolcore_version"] == dtoolcore.__version__ + + # Verify we can load it from URI + loaded_dataset = dtoolcore.DataSet.from_uri(dataset.uri) + assert loaded_dataset.uuid == dataset_uuid + assert loaded_dataset.name == name + + +def test_create_frozen_dataset_with_readme(tmp_dir_fixture): # NOQA + """Test creation with README content.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + dataset_uuid = str(uuid_module.uuid4()) + readme_content = "---\ndescription: Test dataset\nproject: Testing" + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + dataset = dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=dataset_uuid, + name="readme-test", + creator_username="tester", + frozen_at=1234567890.0, + manifest=manifest, + readme_content=readme_content, + ) + + assert dataset.get_readme_content() == readme_content + + # Verify it persists after reloading + loaded_dataset = dtoolcore.DataSet.from_uri(dataset.uri) + assert loaded_dataset.get_readme_content() == readme_content + + +def test_create_frozen_dataset_with_tags(tmp_dir_fixture): # NOQA + """Test creation with tags.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + dataset_uuid = str(uuid_module.uuid4()) + tags = ["production", "validated", "public"] + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + dataset = dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=dataset_uuid, + name="tags-test", + creator_username="tester", + frozen_at=1234567890.0, + manifest=manifest, + tags=tags, + ) + + # Tags may be returned in different order, so compare as sets + assert 
set(dataset.list_tags()) == set(tags) + + # Verify it persists after reloading + loaded_dataset = dtoolcore.DataSet.from_uri(dataset.uri) + assert set(loaded_dataset.list_tags()) == set(tags) + + +def test_create_frozen_dataset_with_annotations(tmp_dir_fixture): # NOQA + """Test creation with annotations.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + dataset_uuid = str(uuid_module.uuid4()) + annotations = { + "project": "test-project", + "version": 42, + "metadata": {"nested": "value", "list": [1, 2, 3]}, + "flag": True, + } + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + dataset = dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=dataset_uuid, + name="annotations-test", + creator_username="tester", + frozen_at=1234567890.0, + manifest=manifest, + annotations=annotations, + ) + + # Verify all annotations + assert set(dataset.list_annotation_names()) == set(annotations.keys()) + for name, value in annotations.items(): + assert dataset.get_annotation(name) == value + + # Verify it persists after reloading + loaded_dataset = dtoolcore.DataSet.from_uri(dataset.uri) + assert set(loaded_dataset.list_annotation_names()) == set(annotations.keys()) + for name, value in annotations.items(): + assert loaded_dataset.get_annotation(name) == value + + +def test_create_frozen_dataset_with_items(tmp_dir_fixture): # NOQA + """Test creation with manifest items.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + dataset_uuid = str(uuid_module.uuid4()) + + # Create manifest with items + items = { + generate_identifier("data/file1.txt"): { + "relpath": "data/file1.txt", + "size_in_bytes": 100, + "hash": "abc123", + "utc_timestamp": 1234567890.0, + }, + generate_identifier("data/file2.csv"): { + "relpath": "data/file2.csv", + "size_in_bytes": 500, + "hash": "def456", + "utc_timestamp": 1234567891.0, + }, + } + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": items, + } + + dataset = dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=dataset_uuid, + name="items-test", + creator_username="tester", + frozen_at=1234567890.0, + manifest=manifest, + ) + + # Verify manifest items + assert set(dataset.identifiers) == set(items.keys()) + for identifier, props in items.items(): + item_props = dataset.item_properties(identifier) + assert item_props["relpath"] == props["relpath"] + assert item_props["size_in_bytes"] == props["size_in_bytes"] + assert item_props["hash"] == props["hash"] + + +def test_create_frozen_dataset_full(tmp_dir_fixture): # NOQA + """Test creation with all optional parameters.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + dataset_uuid = str(uuid_module.uuid4()) + name = "full-test-dataset" + creator_username = "scientist" + frozen_at = 1609459200.0 # 2021-01-01 00:00:00 UTC + readme_content = "---\nproject: Full Test\ndescription: Complete test" + tags = ["experiment", "simulation"] + annotations = { + "experiment_id": "EXP-001", + "parameters": {"temp": 300, "pressure": 1.0}, + } + + items = { + generate_identifier("results.json"): { + "relpath": "results.json", + "size_in_bytes": 1024, + "hash": "result_hash", + "utc_timestamp": frozen_at, + }, + } + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": items, + } + + dataset = dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=dataset_uuid, + 
name=name, + creator_username=creator_username, + frozen_at=frozen_at, + manifest=manifest, + readme_content=readme_content, + tags=tags, + annotations=annotations, + ) + + # Verify everything + assert dataset.uuid == dataset_uuid + assert dataset.name == name + assert dataset.admin_metadata["creator_username"] == creator_username + assert dataset.admin_metadata["frozen_at"] == frozen_at + assert dataset.get_readme_content() == readme_content + assert set(dataset.list_tags()) == set(tags) + assert set(dataset.list_annotation_names()) == set(annotations.keys()) + assert set(dataset.identifiers) == set(items.keys()) + + +def test_create_frozen_dataset_invalid_name(tmp_dir_fixture): # NOQA + """Test that invalid dataset name raises DtoolCoreInvalidNameError.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Names with spaces are invalid + with pytest.raises(dtoolcore.DtoolCoreInvalidNameError): + dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=str(uuid_module.uuid4()), + name="invalid name with spaces", + creator_username="tester", + frozen_at=1234567890.0, + manifest=manifest, + ) + + # Names with special characters are invalid + with pytest.raises(dtoolcore.DtoolCoreInvalidNameError): + dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=str(uuid_module.uuid4()), + name="invalid@name!", + creator_username="tester", + frozen_at=1234567890.0, + manifest=manifest, + ) + + +def test_create_frozen_dataset_invalid_tag(tmp_dir_fixture): # NOQA + """Test that invalid tag raises DtoolCoreInvalidNameError.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Tag with spaces is invalid + with pytest.raises(dtoolcore.DtoolCoreInvalidNameError): + dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=str(uuid_module.uuid4()), + name="valid-name", + creator_username="tester", + frozen_at=1234567890.0, + manifest=manifest, + tags=["valid-tag", "invalid tag"], + ) + + +def test_create_frozen_dataset_invalid_tag_type(tmp_dir_fixture): # NOQA + """Test that non-string tag raises DtoolCoreValueError.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Tag must be a string + with pytest.raises(dtoolcore.DtoolCoreValueError): + dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=str(uuid_module.uuid4()), + name="valid-name", + creator_username="tester", + frozen_at=1234567890.0, + manifest=manifest, + tags=["valid-tag", 123], # 123 is not a string + ) + + +def test_create_frozen_dataset_invalid_annotation_name(tmp_dir_fixture): # NOQA + """Test that invalid annotation name raises DtoolCoreInvalidNameError.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Annotation name with spaces is invalid + with pytest.raises(dtoolcore.DtoolCoreInvalidNameError): + dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=str(uuid_module.uuid4()), + name="valid-name", + creator_username="tester", + frozen_at=1234567890.0, + manifest=manifest, + annotations={"valid_name": "value", "invalid name": 
"value"}, + ) + + +def test_create_frozen_dataset_empty_tags_and_annotations(tmp_dir_fixture): # NOQA + """Test that empty lists/dicts for tags and annotations work.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + dataset_uuid = str(uuid_module.uuid4()) + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Empty list for tags, empty dict for annotations + dataset = dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=dataset_uuid, + name="empty-test", + creator_username="tester", + frozen_at=1234567890.0, + manifest=manifest, + tags=[], + annotations={}, + ) + + assert dataset.list_tags() == [] + assert dataset.list_annotation_names() == [] + + +def test_create_frozen_dataset_none_tags_and_annotations(tmp_dir_fixture): # NOQA + """Test that None for tags and annotations work (default behavior).""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + dataset_uuid = str(uuid_module.uuid4()) + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Explicitly pass None + dataset = dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=dataset_uuid, + name="none-test", + creator_username="tester", + frozen_at=1234567890.0, + manifest=manifest, + tags=None, + annotations=None, + ) + + assert dataset.list_tags() == [] + assert dataset.list_annotation_names() == [] + + +def test_dataset_put_readme(tmp_dir_fixture): # NOQA + """Test updating the README of a frozen dataset.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + dataset_uuid = str(uuid_module.uuid4()) + original_readme = "---\ndescription: Original README content" + updated_readme = "---\ndescription: Updated README content\nversion: 2" + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create dataset with original README + dataset = dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=dataset_uuid, + name="readme-update-test", + creator_username="tester", + frozen_at=1234567890.0, + manifest=manifest, + readme_content=original_readme, + ) + + assert dataset.get_readme_content() == original_readme + + # Update the README + dataset.put_readme(updated_readme) + + # Verify the update + assert dataset.get_readme_content() == updated_readme + + # Reload dataset and verify persistence + reloaded_dataset = dtoolcore.DataSet.from_uri(dataset.uri) + assert reloaded_dataset.get_readme_content() == updated_readme + + +def test_dataset_put_readme_creates_backup(tmp_dir_fixture): # NOQA + """Test that put_readme creates a backup of the original README.""" + import os + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + dataset_uuid = str(uuid_module.uuid4()) + original_readme = "---\ndescription: Original content" + updated_readme = "---\ndescription: New content" + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create dataset + dataset = dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=dataset_uuid, + name="backup-test", + creator_username="tester", + frozen_at=1234567890.0, + manifest=manifest, + readme_content=original_readme, + ) + + # Get the dataset directory path + from dtoolcore.utils import generous_parse_uri + parsed = generous_parse_uri(dataset.uri) + dataset_path = parsed.path + + # Count README files before update + 
readme_files_before = [f for f in os.listdir(dataset_path) + if f.startswith("README.yml")] + assert len(readme_files_before) == 1 + + # Update the README + dataset.put_readme(updated_readme) + + # Count README files after update - should have backup + readme_files_after = [f for f in os.listdir(dataset_path) + if f.startswith("README.yml")] + assert len(readme_files_after) == 2 + + # Verify one is the current README and one is a backup + assert "README.yml" in readme_files_after + backup_files = [f for f in readme_files_after if f != "README.yml"] + assert len(backup_files) == 1 + assert backup_files[0].startswith("README.yml-") + + # Verify the backup contains the original content + backup_path = os.path.join(dataset_path, backup_files[0]) + with open(backup_path, "r") as f: + backup_content = f.read() + assert backup_content == original_readme + + +def test_dataset_put_readme_multiple_updates(tmp_dir_fixture): # NOQA + """Test multiple README updates create multiple backups.""" + import os + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + dataset_uuid = str(uuid_module.uuid4()) + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create dataset + dataset = dtoolcore.create_frozen_dataset( + base_uri=base_uri, + uuid=dataset_uuid, + name="multi-update-test", + creator_username="tester", + frozen_at=1234567890.0, + manifest=manifest, + readme_content="Version 1", + ) + + # Get the dataset directory path + from dtoolcore.utils import generous_parse_uri + parsed = generous_parse_uri(dataset.uri) + dataset_path = parsed.path + + # Perform multiple updates + dataset.put_readme("Version 2") + dataset.put_readme("Version 3") + dataset.put_readme("Version 4") + + # Should have original + 3 backups = 4 README files + readme_files = [f for f in os.listdir(dataset_path) + if f.startswith("README.yml")] + assert len(readme_files) == 4 + + # Current README should be the latest version + assert dataset.get_readme_content() == "Version 4" From 479b383f3bd53715d78eac83d7b2e3e94ccd6d4d Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Sun, 7 Dec 2025 23:25:46 +0100 Subject: [PATCH 2/6] BUILD: Switched build system to flit --- pyproject.toml | 24 ++++++++++++++++-------- setup.cfg | 10 ---------- 2 files changed, 16 insertions(+), 18 deletions(-) delete mode 100644 setup.cfg diff --git a/pyproject.toml b/pyproject.toml index 9dbfb00..cb56204 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,17 +1,18 @@ [build-system] -requires = ["setuptools>=42", "setuptools_scm[toml]>=6.3"] -build-backend = "setuptools.build_meta" +requires = ["flit_scm"] +build-backend = "flit_scm:buildapi" [project] name = "dtoolcore" description = "Core API for managing (scientific) data" readme = "README.rst" -license = {file = "LICENSE"} +license = {text = "MIT"} authors = [ {name = "Tjelvar Olsson", email = "tjelvar.olsson@gmail.com"} ] dynamic = ["version"] -dependencies = ["setuptools"] +requires-python = ">=3.8" +dependencies = [] [project.optional-dependencies] test = [ @@ -29,13 +30,20 @@ Documentation = "https://dtoolcore.readthedocs.io" Repository = "https://github.com/jic-dtool/dtoolcore" Changelog = "https://github.com/jic-dtool/dtoolcore/blob/master/CHANGELOG.rst" +[project.entry-points."dtool.storage_brokers"] +DiskStorageBroker = "dtoolcore.storagebroker:DiskStorageBroker" + +[tool.flit.module] +name = "dtoolcore" + [tool.setuptools_scm] version_scheme = "guess-next-dev" local_scheme = "no-local-version" write_to = 
"dtoolcore/version.py" -[tool.setuptools] -packages = ["dtoolcore"] +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "--cov=dtoolcore --cov-report=term-missing" -[project.entry-points."dtool.storage_brokers"] -"DiskStorageBroker" = "dtoolcore.storagebroker:DiskStorageBroker" +[tool.flake8] +exclude = ["env*", ".tox", ".git", "*.egg", "build", "docs", "venv"] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 95f3f57..0000000 --- a/setup.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[flake8] -exclude=env*,.tox,.git,*.egg,build,docs,venv - -[tool:pytest] -testpaths = tests -addopts = --cov=dtoolcore --cov-report=term-missing -#addopts = -x --pdb - -[cov:run] -source = dtoolcore From 14d38b7300a3a146471c0bc24fce5888bfc78707 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Mon, 8 Dec 2025 08:22:59 +0100 Subject: [PATCH 3/6] MAINT: New upload workflow that creates a frozen dataset first and then freezes upon confirming upload --- dtoolcore/__init__.py | 157 +++----- tests/test_create_frozen_dataset.py | 580 ---------------------------- tests/test_freeze_with_manifest.py | 362 +++++++++++++++++ 3 files changed, 422 insertions(+), 677 deletions(-) delete mode 100644 tests/test_create_frozen_dataset.py create mode 100644 tests/test_freeze_with_manifest.py diff --git a/dtoolcore/__init__.py b/dtoolcore/__init__.py index dbfb38d..bc116c5 100644 --- a/dtoolcore/__init__.py +++ b/dtoolcore/__init__.py @@ -224,103 +224,6 @@ def create_derived_proto_dataset( return proto_dataset -def create_frozen_dataset( - base_uri, - uuid, - name, - creator_username, - frozen_at, - manifest, - readme_content="", - tags=None, - annotations=None, - config_path=None -): - """Create a frozen dataset directly from metadata. - - This function creates a frozen dataset without going through the - proto-dataset stage. It is useful for server-side operations where - the dataset structure is known upfront (e.g., signed URL uploads). - - The function writes admin_metadata, manifest, structure, tags, and - annotations directly to storage. Items and README content must be - uploaded separately if they are not provided. 
-
-    :param base_uri: base URI where the dataset will be created
-    :param uuid: dataset UUID
-    :param name: dataset name
-    :param creator_username: username of the dataset creator
-    :param frozen_at: timestamp when the dataset was frozen
-    :param manifest: manifest dictionary with items metadata
-    :param readme_content: optional README content string
-    :param tags: optional list of tags
-    :param annotations: optional dictionary of annotations
-    :param config_path: path to dtool configuration file
-    :returns: DataSet instance
-    :raises: DtoolCoreInvalidNameError if name or any tag/annotation name is invalid
-    """
-    logger.debug("In create_frozen_dataset...")
-
-    # Validate name
-    if not dtoolcore.utils.name_is_valid(name):
-        raise DtoolCoreInvalidNameError(f"Invalid dataset name: {name}")
-
-    # Validate tags
-    if tags:
-        for tag in tags:
-            if not isinstance(tag, str):
-                raise DtoolCoreValueError(f"Tag must be a string: {tag}")
-            if not dtoolcore.utils.name_is_valid(tag):
-                raise DtoolCoreInvalidNameError(f"Invalid tag name: {tag}")
-
-    # Validate annotations
-    if annotations:
-        for annotation_name in annotations.keys():
-            if not dtoolcore.utils.name_is_valid(annotation_name):
-                raise DtoolCoreInvalidNameError(
-                    f"Invalid annotation name: {annotation_name}"
-                )
-
-    # Build admin metadata for a frozen dataset
-    admin_metadata = {
-        "uuid": uuid,
-        "dtoolcore_version": __version__,
-        "name": name,
-        "type": "dataset",  # Frozen dataset, not protodataset
-        "creator_username": creator_username,
-        "frozen_at": frozen_at,
-    }
-
-    # Get storage broker
-    uri = _generate_uri(admin_metadata, base_uri)
-    storage_broker = _get_storage_broker(uri, config_path)
-
-    # Create the dataset structure
-    storage_broker.create_structure()
-
-    # Write admin metadata
-    storage_broker.put_admin_metadata(admin_metadata)
-
-    # Write manifest
-    storage_broker.put_manifest(manifest)
-
-    # Write README
-    storage_broker.put_readme(readme_content)
-
-    # Write tags
-    if tags:
-        for tag in tags:
-            storage_broker.put_tag(tag)
-
-    # Write annotations
-    if annotations:
-        for annotation_name, annotation_value in annotations.items():
-            storage_broker.put_annotation(annotation_name, annotation_value)
-
-    # Return a DataSet instance
-    return DataSet(uri, admin_metadata, config_path)
-
-
 def _copy_create_proto_dataset(
     src_dataset,
     dest_base_uri,
@@ -955,6 +858,66 @@ def freeze(self, progressbar=None):
         # Clean up using the storage broker's post freeze hook.
         self._storage_broker.post_freeze_hook()
 
+    def freeze_with_manifest(self, manifest, frozen_at=None):
+        """
+        Convert :class:`dtoolcore.ProtoDataSet` to :class:`dtoolcore.DataSet`
+        using a pre-computed manifest.
+
+        This method freezes the dataset without computing hashes server-side.
+        The caller provides a manifest with pre-computed item properties
+        (hash, size, timestamp). This is useful for server-side operations
+        where the client has already computed hashes during upload.
+
+        :param manifest: dict with structure::
+
+            {
+                "dtoolcore_version": <version>,
+                "hash_function": <hash function name>,
+                "items": {
+                    <identifier>: {
+                        "relpath": <relpath>,
+                        "size_in_bytes": <size in bytes>,
+                        "hash": <hash>,
+                        "utc_timestamp": <timestamp>
+                    }
+                }
+            }
+
+        :param frozen_at: optional timestamp for when the dataset was frozen.
+            If not provided, the current UTC time is used (an existing
+            ``frozen_at`` entry in the admin metadata is kept, however).
+        """
+        logger.debug("Freeze dataset with manifest {}".format(self))
+
+        # Call the storage broker pre_freeze hook.
+ self._storage_broker.pre_freeze_hook() + + # Use provided manifest instead of computing + self._storage_broker.put_manifest(manifest) + + # Generate and persist overlays from any item metadata that has been + # added. + overlays = self._generate_overlays() + for overlay_name, overlay in overlays.items(): + self._put_overlay(overlay_name, overlay) + + # Change the type of the dataset from "protodataset" to "dataset" + # in the administrative metadata. + metadata_update = {"type": "dataset"} + + # Use provided frozen_at or generate one + if frozen_at is not None: + metadata_update["frozen_at"] = frozen_at + elif "frozen_at" not in self._admin_metadata: + datetime_obj = datetime.datetime.utcnow() + metadata_update["frozen_at"] = dtoolcore.utils.timestamp(datetime_obj) + + # Apply the change(s) to the administrative metadata. + self._admin_metadata.update(metadata_update) + self._storage_broker.put_admin_metadata(self._admin_metadata) + + # Clean up using the storage broker's post freeze hook. + self._storage_broker.post_freeze_hook() + class DataSetCreator(object): """Context manager for creating a dataset. diff --git a/tests/test_create_frozen_dataset.py b/tests/test_create_frozen_dataset.py deleted file mode 100644 index 2b48f2f..0000000 --- a/tests/test_create_frozen_dataset.py +++ /dev/null @@ -1,580 +0,0 @@ -"""Test the create_frozen_dataset function.""" - -import os -import uuid as uuid_module - -import pytest - -from . import tmp_dir_fixture # NOQA - -from dtoolcore.utils import ( - IS_WINDOWS, - generous_parse_uri, - windows_to_unix_path, - generate_identifier, -) - - -def _sanitise_base_uri(tmp_dir): - base_uri = tmp_dir - if IS_WINDOWS: - parsed_base_uri = generous_parse_uri(tmp_dir) - unix_path = windows_to_unix_path(parsed_base_uri.path) - base_uri = "file://{}".format(unix_path) - return base_uri - - -def test_create_frozen_dataset_basic(tmp_dir_fixture): # NOQA - """Test basic creation of a frozen dataset.""" - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - dataset_uuid = str(uuid_module.uuid4()) - name = "test-frozen-dataset" - creator_username = "tester" - frozen_at = 1234567890.123 - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - "hash_function": "md5sum_hexdigest", - "items": {}, - } - - dataset = dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=dataset_uuid, - name=name, - creator_username=creator_username, - frozen_at=frozen_at, - manifest=manifest, - ) - - # Verify it's a DataSet instance - assert isinstance(dataset, dtoolcore.DataSet) - - # Verify admin metadata - assert dataset.uuid == dataset_uuid - assert dataset.name == name - assert dataset.admin_metadata["creator_username"] == creator_username - assert dataset.admin_metadata["frozen_at"] == frozen_at - assert dataset.admin_metadata["type"] == "dataset" - assert dataset.admin_metadata["dtoolcore_version"] == dtoolcore.__version__ - - # Verify we can load it from URI - loaded_dataset = dtoolcore.DataSet.from_uri(dataset.uri) - assert loaded_dataset.uuid == dataset_uuid - assert loaded_dataset.name == name - - -def test_create_frozen_dataset_with_readme(tmp_dir_fixture): # NOQA - """Test creation with README content.""" - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - dataset_uuid = str(uuid_module.uuid4()) - readme_content = "---\ndescription: Test dataset\nproject: Testing" - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - "hash_function": "md5sum_hexdigest", - "items": {}, - } - - dataset = dtoolcore.create_frozen_dataset( - 
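For orientation, the two-step workflow this patch series moves to, as a minimal sketch (create_proto_dataset, put_item, freeze_with_manifest and DataSet.from_uri are the dtoolcore calls involved; the paths and item properties below are placeholder values):

    import dtoolcore
    from dtoolcore.utils import generate_identifier

    # Step 1: create a proto dataset for the client to upload into.
    proto = dtoolcore.create_proto_dataset(
        name="upload-demo",
        base_uri="file:///tmp",
        creator_username="uploader",
    )

    # Step 2: transfer the payload (a local put_item stands in here for a
    # signed-URL upload) while the client records the item properties.
    proto.put_item("/path/to/local/data.csv", "data.csv")
    manifest = {
        "dtoolcore_version": dtoolcore.__version__,
        "hash_function": "md5sum_hexdigest",
        "items": {
            generate_identifier("data.csv"): {
                "relpath": "data.csv",
                "size_in_bytes": 1024,      # as reported by the client
                "hash": "client-computed",  # trusted, not re-verified
                "utc_timestamp": 1234567890.0,
            },
        },
    }

    # Step 3: upon upload confirmation, freeze without server-side hashing.
    proto.freeze_with_manifest(manifest)
    dataset = dtoolcore.DataSet.from_uri(proto.uri)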
base_uri=base_uri, - uuid=dataset_uuid, - name="readme-test", - creator_username="tester", - frozen_at=1234567890.0, - manifest=manifest, - readme_content=readme_content, - ) - - assert dataset.get_readme_content() == readme_content - - # Verify it persists after reloading - loaded_dataset = dtoolcore.DataSet.from_uri(dataset.uri) - assert loaded_dataset.get_readme_content() == readme_content - - -def test_create_frozen_dataset_with_tags(tmp_dir_fixture): # NOQA - """Test creation with tags.""" - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - dataset_uuid = str(uuid_module.uuid4()) - tags = ["production", "validated", "public"] - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - "hash_function": "md5sum_hexdigest", - "items": {}, - } - - dataset = dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=dataset_uuid, - name="tags-test", - creator_username="tester", - frozen_at=1234567890.0, - manifest=manifest, - tags=tags, - ) - - # Tags may be returned in different order, so compare as sets - assert set(dataset.list_tags()) == set(tags) - - # Verify it persists after reloading - loaded_dataset = dtoolcore.DataSet.from_uri(dataset.uri) - assert set(loaded_dataset.list_tags()) == set(tags) - - -def test_create_frozen_dataset_with_annotations(tmp_dir_fixture): # NOQA - """Test creation with annotations.""" - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - dataset_uuid = str(uuid_module.uuid4()) - annotations = { - "project": "test-project", - "version": 42, - "metadata": {"nested": "value", "list": [1, 2, 3]}, - "flag": True, - } - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - "hash_function": "md5sum_hexdigest", - "items": {}, - } - - dataset = dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=dataset_uuid, - name="annotations-test", - creator_username="tester", - frozen_at=1234567890.0, - manifest=manifest, - annotations=annotations, - ) - - # Verify all annotations - assert set(dataset.list_annotation_names()) == set(annotations.keys()) - for name, value in annotations.items(): - assert dataset.get_annotation(name) == value - - # Verify it persists after reloading - loaded_dataset = dtoolcore.DataSet.from_uri(dataset.uri) - assert set(loaded_dataset.list_annotation_names()) == set(annotations.keys()) - for name, value in annotations.items(): - assert loaded_dataset.get_annotation(name) == value - - -def test_create_frozen_dataset_with_items(tmp_dir_fixture): # NOQA - """Test creation with manifest items.""" - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - dataset_uuid = str(uuid_module.uuid4()) - - # Create manifest with items - items = { - generate_identifier("data/file1.txt"): { - "relpath": "data/file1.txt", - "size_in_bytes": 100, - "hash": "abc123", - "utc_timestamp": 1234567890.0, - }, - generate_identifier("data/file2.csv"): { - "relpath": "data/file2.csv", - "size_in_bytes": 500, - "hash": "def456", - "utc_timestamp": 1234567891.0, - }, - } - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - "hash_function": "md5sum_hexdigest", - "items": items, - } - - dataset = dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=dataset_uuid, - name="items-test", - creator_username="tester", - frozen_at=1234567890.0, - manifest=manifest, - ) - - # Verify manifest items - assert set(dataset.identifiers) == set(items.keys()) - for identifier, props in items.items(): - item_props = dataset.item_properties(identifier) - assert item_props["relpath"] == 
props["relpath"] - assert item_props["size_in_bytes"] == props["size_in_bytes"] - assert item_props["hash"] == props["hash"] - - -def test_create_frozen_dataset_full(tmp_dir_fixture): # NOQA - """Test creation with all optional parameters.""" - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - dataset_uuid = str(uuid_module.uuid4()) - name = "full-test-dataset" - creator_username = "scientist" - frozen_at = 1609459200.0 # 2021-01-01 00:00:00 UTC - readme_content = "---\nproject: Full Test\ndescription: Complete test" - tags = ["experiment", "simulation"] - annotations = { - "experiment_id": "EXP-001", - "parameters": {"temp": 300, "pressure": 1.0}, - } - - items = { - generate_identifier("results.json"): { - "relpath": "results.json", - "size_in_bytes": 1024, - "hash": "result_hash", - "utc_timestamp": frozen_at, - }, - } - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - "hash_function": "md5sum_hexdigest", - "items": items, - } - - dataset = dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=dataset_uuid, - name=name, - creator_username=creator_username, - frozen_at=frozen_at, - manifest=manifest, - readme_content=readme_content, - tags=tags, - annotations=annotations, - ) - - # Verify everything - assert dataset.uuid == dataset_uuid - assert dataset.name == name - assert dataset.admin_metadata["creator_username"] == creator_username - assert dataset.admin_metadata["frozen_at"] == frozen_at - assert dataset.get_readme_content() == readme_content - assert set(dataset.list_tags()) == set(tags) - assert set(dataset.list_annotation_names()) == set(annotations.keys()) - assert set(dataset.identifiers) == set(items.keys()) - - -def test_create_frozen_dataset_invalid_name(tmp_dir_fixture): # NOQA - """Test that invalid dataset name raises DtoolCoreInvalidNameError.""" - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - "hash_function": "md5sum_hexdigest", - "items": {}, - } - - # Names with spaces are invalid - with pytest.raises(dtoolcore.DtoolCoreInvalidNameError): - dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=str(uuid_module.uuid4()), - name="invalid name with spaces", - creator_username="tester", - frozen_at=1234567890.0, - manifest=manifest, - ) - - # Names with special characters are invalid - with pytest.raises(dtoolcore.DtoolCoreInvalidNameError): - dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=str(uuid_module.uuid4()), - name="invalid@name!", - creator_username="tester", - frozen_at=1234567890.0, - manifest=manifest, - ) - - -def test_create_frozen_dataset_invalid_tag(tmp_dir_fixture): # NOQA - """Test that invalid tag raises DtoolCoreInvalidNameError.""" - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - "hash_function": "md5sum_hexdigest", - "items": {}, - } - - # Tag with spaces is invalid - with pytest.raises(dtoolcore.DtoolCoreInvalidNameError): - dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=str(uuid_module.uuid4()), - name="valid-name", - creator_username="tester", - frozen_at=1234567890.0, - manifest=manifest, - tags=["valid-tag", "invalid tag"], - ) - - -def test_create_frozen_dataset_invalid_tag_type(tmp_dir_fixture): # NOQA - """Test that non-string tag raises DtoolCoreValueError.""" - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - 
"hash_function": "md5sum_hexdigest", - "items": {}, - } - - # Tag must be a string - with pytest.raises(dtoolcore.DtoolCoreValueError): - dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=str(uuid_module.uuid4()), - name="valid-name", - creator_username="tester", - frozen_at=1234567890.0, - manifest=manifest, - tags=["valid-tag", 123], # 123 is not a string - ) - - -def test_create_frozen_dataset_invalid_annotation_name(tmp_dir_fixture): # NOQA - """Test that invalid annotation name raises DtoolCoreInvalidNameError.""" - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - "hash_function": "md5sum_hexdigest", - "items": {}, - } - - # Annotation name with spaces is invalid - with pytest.raises(dtoolcore.DtoolCoreInvalidNameError): - dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=str(uuid_module.uuid4()), - name="valid-name", - creator_username="tester", - frozen_at=1234567890.0, - manifest=manifest, - annotations={"valid_name": "value", "invalid name": "value"}, - ) - - -def test_create_frozen_dataset_empty_tags_and_annotations(tmp_dir_fixture): # NOQA - """Test that empty lists/dicts for tags and annotations work.""" - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - dataset_uuid = str(uuid_module.uuid4()) - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - "hash_function": "md5sum_hexdigest", - "items": {}, - } - - # Empty list for tags, empty dict for annotations - dataset = dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=dataset_uuid, - name="empty-test", - creator_username="tester", - frozen_at=1234567890.0, - manifest=manifest, - tags=[], - annotations={}, - ) - - assert dataset.list_tags() == [] - assert dataset.list_annotation_names() == [] - - -def test_create_frozen_dataset_none_tags_and_annotations(tmp_dir_fixture): # NOQA - """Test that None for tags and annotations work (default behavior).""" - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - dataset_uuid = str(uuid_module.uuid4()) - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - "hash_function": "md5sum_hexdigest", - "items": {}, - } - - # Explicitly pass None - dataset = dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=dataset_uuid, - name="none-test", - creator_username="tester", - frozen_at=1234567890.0, - manifest=manifest, - tags=None, - annotations=None, - ) - - assert dataset.list_tags() == [] - assert dataset.list_annotation_names() == [] - - -def test_dataset_put_readme(tmp_dir_fixture): # NOQA - """Test updating the README of a frozen dataset.""" - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - dataset_uuid = str(uuid_module.uuid4()) - original_readme = "---\ndescription: Original README content" - updated_readme = "---\ndescription: Updated README content\nversion: 2" - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - "hash_function": "md5sum_hexdigest", - "items": {}, - } - - # Create dataset with original README - dataset = dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=dataset_uuid, - name="readme-update-test", - creator_username="tester", - frozen_at=1234567890.0, - manifest=manifest, - readme_content=original_readme, - ) - - assert dataset.get_readme_content() == original_readme - - # Update the README - dataset.put_readme(updated_readme) - - # Verify the update - assert dataset.get_readme_content() == updated_readme - - # Reload dataset and verify persistence - 
reloaded_dataset = dtoolcore.DataSet.from_uri(dataset.uri) - assert reloaded_dataset.get_readme_content() == updated_readme - - -def test_dataset_put_readme_creates_backup(tmp_dir_fixture): # NOQA - """Test that put_readme creates a backup of the original README.""" - import os - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - dataset_uuid = str(uuid_module.uuid4()) - original_readme = "---\ndescription: Original content" - updated_readme = "---\ndescription: New content" - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - "hash_function": "md5sum_hexdigest", - "items": {}, - } - - # Create dataset - dataset = dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=dataset_uuid, - name="backup-test", - creator_username="tester", - frozen_at=1234567890.0, - manifest=manifest, - readme_content=original_readme, - ) - - # Get the dataset directory path - from dtoolcore.utils import generous_parse_uri - parsed = generous_parse_uri(dataset.uri) - dataset_path = parsed.path - - # Count README files before update - readme_files_before = [f for f in os.listdir(dataset_path) - if f.startswith("README.yml")] - assert len(readme_files_before) == 1 - - # Update the README - dataset.put_readme(updated_readme) - - # Count README files after update - should have backup - readme_files_after = [f for f in os.listdir(dataset_path) - if f.startswith("README.yml")] - assert len(readme_files_after) == 2 - - # Verify one is the current README and one is a backup - assert "README.yml" in readme_files_after - backup_files = [f for f in readme_files_after if f != "README.yml"] - assert len(backup_files) == 1 - assert backup_files[0].startswith("README.yml-") - - # Verify the backup contains the original content - backup_path = os.path.join(dataset_path, backup_files[0]) - with open(backup_path, "r") as f: - backup_content = f.read() - assert backup_content == original_readme - - -def test_dataset_put_readme_multiple_updates(tmp_dir_fixture): # NOQA - """Test multiple README updates create multiple backups.""" - import os - import dtoolcore - - base_uri = _sanitise_base_uri(tmp_dir_fixture) - dataset_uuid = str(uuid_module.uuid4()) - - manifest = { - "dtoolcore_version": dtoolcore.__version__, - "hash_function": "md5sum_hexdigest", - "items": {}, - } - - # Create dataset - dataset = dtoolcore.create_frozen_dataset( - base_uri=base_uri, - uuid=dataset_uuid, - name="multi-update-test", - creator_username="tester", - frozen_at=1234567890.0, - manifest=manifest, - readme_content="Version 1", - ) - - # Get the dataset directory path - from dtoolcore.utils import generous_parse_uri - parsed = generous_parse_uri(dataset.uri) - dataset_path = parsed.path - - # Perform multiple updates - dataset.put_readme("Version 2") - dataset.put_readme("Version 3") - dataset.put_readme("Version 4") - - # Should have original + 3 backups = 4 README files - readme_files = [f for f in os.listdir(dataset_path) - if f.startswith("README.yml")] - assert len(readme_files) == 4 - - # Current README should be the latest version - assert dataset.get_readme_content() == "Version 4" diff --git a/tests/test_freeze_with_manifest.py b/tests/test_freeze_with_manifest.py new file mode 100644 index 0000000..c2e9dd6 --- /dev/null +++ b/tests/test_freeze_with_manifest.py @@ -0,0 +1,362 @@ +"""Test the freeze_with_manifest method of ProtoDataSet.""" + +import os +import uuid as uuid_module + +import pytest + +from . 
import tmp_dir_fixture # NOQA + +from dtoolcore.utils import ( + IS_WINDOWS, + generous_parse_uri, + windows_to_unix_path, + generate_identifier, +) + + +def _sanitise_base_uri(tmp_dir): + base_uri = tmp_dir + if IS_WINDOWS: + parsed_base_uri = generous_parse_uri(tmp_dir) + unix_path = windows_to_unix_path(parsed_base_uri.path) + base_uri = "file://{}".format(unix_path) + return base_uri + + +def test_freeze_with_manifest_basic(tmp_dir_fixture): # NOQA + """Test basic freezing of a proto dataset with provided manifest.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-freeze-manifest" + creator_username = "tester" + frozen_at = 1234567890.123 + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create a proto dataset + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + creator_username=creator_username, + ) + + # Freeze with the provided manifest + proto_dataset.freeze_with_manifest(manifest, frozen_at=frozen_at) + + # Load the dataset and verify it's frozen + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + + assert isinstance(dataset, dtoolcore.DataSet) + assert dataset.name == name + assert dataset.admin_metadata["creator_username"] == creator_username + assert dataset.admin_metadata["frozen_at"] == frozen_at + assert dataset.admin_metadata["type"] == "dataset" + + +def test_freeze_with_manifest_with_items(tmp_dir_fixture): # NOQA + """Test freezing with manifest containing items.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-items" + frozen_at = 1234567890.0 + + # Create manifest with items + items = { + generate_identifier("data/file1.txt"): { + "relpath": "data/file1.txt", + "size_in_bytes": 100, + "hash": "abc123", + "utc_timestamp": 1234567890.0, + }, + generate_identifier("data/file2.csv"): { + "relpath": "data/file2.csv", + "size_in_bytes": 500, + "hash": "def456", + "utc_timestamp": 1234567891.0, + }, + } + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": items, + } + + # Create a proto dataset + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + # Freeze with the provided manifest + proto_dataset.freeze_with_manifest(manifest, frozen_at=frozen_at) + + # Load and verify + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + + assert set(dataset.identifiers) == set(items.keys()) + for identifier, props in items.items(): + item_props = dataset.item_properties(identifier) + assert item_props["relpath"] == props["relpath"] + assert item_props["size_in_bytes"] == props["size_in_bytes"] + assert item_props["hash"] == props["hash"] + + +def test_freeze_with_manifest_auto_frozen_at(tmp_dir_fixture): # NOQA + """Test that frozen_at is auto-generated if not provided.""" + import dtoolcore + import time + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-auto-frozen-at" + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create a proto dataset + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + before_freeze = time.time() + proto_dataset.freeze_with_manifest(manifest) + after_freeze = time.time() + + # Load and verify frozen_at was auto-generated + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + + assert "frozen_at" in dataset.admin_metadata + frozen_at = 
dataset.admin_metadata["frozen_at"] + assert before_freeze <= frozen_at <= after_freeze + + +def test_freeze_with_manifest_with_readme(tmp_dir_fixture): # NOQA + """Test freezing with README content.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-readme" + readme_content = "---\ndescription: Test dataset\nproject: Testing" + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create a proto dataset with README + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + readme_content=readme_content, + ) + + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + # Load and verify README persisted + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + assert dataset.get_readme_content() == readme_content + + +def test_freeze_with_manifest_with_tags(tmp_dir_fixture): # NOQA + """Test freezing with tags added to proto dataset.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-tags" + tags = ["production", "validated"] + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create a proto dataset and add tags + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + for tag in tags: + proto_dataset.put_tag(tag) + + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + # Load and verify tags persisted + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + assert set(dataset.list_tags()) == set(tags) + + +def test_freeze_with_manifest_with_annotations(tmp_dir_fixture): # NOQA + """Test freezing with annotations added to proto dataset.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-annotations" + annotations = { + "project": "test-project", + "version": 42, + } + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create a proto dataset and add annotations + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + for ann_name, ann_value in annotations.items(): + proto_dataset.put_annotation(ann_name, ann_value) + + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + # Load and verify annotations persisted + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + assert set(dataset.list_annotation_names()) == set(annotations.keys()) + for ann_name, ann_value in annotations.items(): + assert dataset.get_annotation(ann_name) == ann_value + + +def test_freeze_with_manifest_full(tmp_dir_fixture): # NOQA + """Test freezing with all features combined.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "full-test-dataset" + creator_username = "scientist" + frozen_at = 1609459200.0 # 2021-01-01 00:00:00 UTC + readme_content = "---\nproject: Full Test\ndescription: Complete test" + tags = ["experiment", "simulation"] + annotations = { + "experiment_id": "EXP-001", + "parameters": {"temp": 300, "pressure": 1.0}, + } + + items = { + generate_identifier("results.json"): { + "relpath": "results.json", + "size_in_bytes": 1024, + "hash": "result_hash", + "utc_timestamp": frozen_at, + }, + } + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": items, + } + + # Create proto dataset with all features + proto_dataset = 
dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + readme_content=readme_content, + creator_username=creator_username, + ) + for tag in tags: + proto_dataset.put_tag(tag) + for ann_name, ann_value in annotations.items(): + proto_dataset.put_annotation(ann_name, ann_value) + + proto_dataset.freeze_with_manifest(manifest, frozen_at=frozen_at) + + # Load and verify everything + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + assert dataset.name == name + assert dataset.admin_metadata["creator_username"] == creator_username + assert dataset.admin_metadata["frozen_at"] == frozen_at + assert dataset.get_readme_content() == readme_content + assert set(dataset.list_tags()) == set(tags) + assert set(dataset.list_annotation_names()) == set(annotations.keys()) + assert set(dataset.identifiers) == set(items.keys()) + + +def test_freeze_with_manifest_different_hash_function(tmp_dir_fixture): # NOQA + """Test that hash_function in manifest is preserved.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-hash-function" + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "sha256sum_hexdigest", + "items": {}, + } + + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + # Load and verify hash function is preserved + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + loaded_manifest = dataset._storage_broker.get_manifest() + assert loaded_manifest["hash_function"] == "sha256sum_hexdigest" + + +def test_proto_dataset_type_before_freeze(tmp_dir_fixture): # NOQA + """Test that proto dataset has type 'protodataset' before freezing.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-proto-type" + + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + # Before freeze, should be a protodataset + assert proto_dataset.admin_metadata["type"] == "protodataset" + + # Can load as ProtoDataSet + loaded_proto = dtoolcore.ProtoDataSet.from_uri(proto_dataset.uri) + assert loaded_proto.admin_metadata["type"] == "protodataset" + + +def test_dataset_type_after_freeze(tmp_dir_fixture): # NOQA + """Test that dataset has type 'dataset' after freezing.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-dataset-type" + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + # After freeze, should be a dataset + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + assert dataset.admin_metadata["type"] == "dataset" + + # Cannot load as ProtoDataSet anymore + with pytest.raises(dtoolcore.DtoolCoreTypeError): + dtoolcore.ProtoDataSet.from_uri(proto_dataset.uri) From 3c6bdac5f289d4e7eb2196deade0871526337616 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Mon, 8 Dec 2025 08:38:14 +0100 Subject: [PATCH 4/6] MAINT: Validate that README and manifest items actually exist in storage when running `freeze_with_manifest` --- dtoolcore/__init__.py | 44 ++++++ tests/test_freeze_with_manifest.py | 221 +++++++++++++++++++++++++++++ 2 files changed, 265 insertions(+) diff --git a/dtoolcore/__init__.py b/dtoolcore/__init__.py index bc116c5..208ea9f 100644 --- a/dtoolcore/__init__.py +++ 
b/dtoolcore/__init__.py @@ -868,6 +868,13 @@ def freeze_with_manifest(self, manifest, frozen_at=None): (hash, size, timestamp). This is useful for server-side operations where the client has already computed hashes during upload. + Before freezing, this method validates that: + - The README file exists in storage + - All items listed in the manifest exist in storage + + Note: This method does NOT verify that the hashes match - it trusts + the client-provided hashes in the manifest. + :param manifest: dict with structure:: { @@ -885,9 +892,46 @@ def freeze_with_manifest(self, manifest, frozen_at=None): :param frozen_at: optional timestamp for when the dataset was frozen. If not provided, uses the current UTC time. + :raises: DtoolCoreValueError if README or any manifest item is missing """ logger.debug("Freeze dataset with manifest {}".format(self)) + # Validate that README exists + try: + self._storage_broker.get_readme_content() + except Exception as e: + raise DtoolCoreValueError( + f"README file is missing or cannot be read: {e}" + ) + + # Validate that all items in the manifest exist in storage + manifest_items = manifest.get("items", {}) + if manifest_items: + # Get identifiers of items that actually exist in storage + existing_handles = set(self._storage_broker.iter_item_handles()) + existing_identifiers = set( + dtoolcore.utils.generate_identifier(h) for h in existing_handles + ) + + # Check for missing items + expected_identifiers = set(manifest_items.keys()) + missing_identifiers = expected_identifiers - existing_identifiers + + if missing_identifiers: + # Get relpaths of missing items for better error message + missing_relpaths = [ + manifest_items[ident].get("relpath", ident) + for ident in list(missing_identifiers)[:5] # Limit to 5 + ] + if len(missing_identifiers) > 5: + missing_relpaths.append( + f"... and {len(missing_identifiers) - 5} more" + ) + raise DtoolCoreValueError( + f"Missing {len(missing_identifiers)} item(s) in storage: " + f"{missing_relpaths}" + ) + # Call the storage broker pre_freeze hook. 
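As a standalone illustration of the set arithmetic used in the validation above (made-up values; iter_item_handles() yields relpath-like handles whose identifiers can be recomputed):

    from dtoolcore.utils import generate_identifier

    manifest_items = {
        generate_identifier("data/file1.txt"): {"relpath": "data/file1.txt"},
        generate_identifier("data/missing.txt"): {"relpath": "data/missing.txt"},
    }

    # Identifiers of stored items are recomputed from their handles, so
    # membership can be checked without touching the item payloads.
    existing_identifiers = {generate_identifier(h) for h in ["data/file1.txt"]}
    missing = set(manifest_items) - existing_identifiers
    assert [manifest_items[i]["relpath"] for i in missing] == ["data/missing.txt"]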
self._storage_broker.pre_freeze_hook() diff --git a/tests/test_freeze_with_manifest.py b/tests/test_freeze_with_manifest.py index c2e9dd6..db35550 100644 --- a/tests/test_freeze_with_manifest.py +++ b/tests/test_freeze_with_manifest.py @@ -62,6 +62,8 @@ def test_freeze_with_manifest_basic(tmp_dir_fixture): # NOQA def test_freeze_with_manifest_with_items(tmp_dir_fixture): # NOQA """Test freezing with manifest containing items.""" import dtoolcore + import tempfile + import os base_uri = _sanitise_base_uri(tmp_dir_fixture) name = "test-items" @@ -95,6 +97,20 @@ def test_freeze_with_manifest_with_items(tmp_dir_fixture): # NOQA base_uri=base_uri, ) + # Add items to storage + temp_files = [] + for relpath in ["data/file1.txt", "data/file2.csv"]: + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f: + f.write(f"content for {relpath}") + temp_files.append((f.name, relpath)) + + try: + for temp_path, relpath in temp_files: + proto_dataset.put_item(temp_path, relpath) + finally: + for temp_path, _ in temp_files: + os.unlink(temp_path) + # Freeze with the provided manifest proto_dataset.freeze_with_manifest(manifest, frozen_at=frozen_at) @@ -235,6 +251,8 @@ def test_freeze_with_manifest_with_annotations(tmp_dir_fixture): # NOQA def test_freeze_with_manifest_full(tmp_dir_fixture): # NOQA """Test freezing with all features combined.""" import dtoolcore + import tempfile + import os base_uri = _sanitise_base_uri(tmp_dir_fixture) name = "full-test-dataset" @@ -274,6 +292,16 @@ def test_freeze_with_manifest_full(tmp_dir_fixture): # NOQA for ann_name, ann_value in annotations.items(): proto_dataset.put_annotation(ann_name, ann_value) + # Add item to storage + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as f: + f.write('{"results": "test data"}') + temp_path = f.name + + try: + proto_dataset.put_item(temp_path, "results.json") + finally: + os.unlink(temp_path) + proto_dataset.freeze_with_manifest(manifest, frozen_at=frozen_at) # Load and verify everything @@ -360,3 +388,196 @@ def test_dataset_type_after_freeze(tmp_dir_fixture): # NOQA # Cannot load as ProtoDataSet anymore with pytest.raises(dtoolcore.DtoolCoreTypeError): dtoolcore.ProtoDataSet.from_uri(proto_dataset.uri) + + +def test_freeze_with_manifest_missing_readme(tmp_dir_fixture): # NOQA + """Test that freezing fails if README is missing.""" + import dtoolcore + import os + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-missing-readme" + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create proto dataset + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + # Delete the README file + from dtoolcore.utils import generous_parse_uri + parsed = generous_parse_uri(proto_dataset.uri) + readme_path = os.path.join(parsed.path, "README.yml") + os.remove(readme_path) + + # Freezing should fail because README is missing + with pytest.raises(dtoolcore.DtoolCoreValueError) as excinfo: + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + assert "README" in str(excinfo.value) + + +def test_freeze_with_manifest_missing_items(tmp_dir_fixture): # NOQA + """Test that freezing fails if manifest items are missing from storage.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-missing-items" + + # Create manifest with items that don't exist in storage + items = { + generate_identifier("data/nonexistent1.txt"): { + "relpath": 
"data/nonexistent1.txt", + "size_in_bytes": 100, + "hash": "abc123", + "utc_timestamp": 1234567890.0, + }, + generate_identifier("data/nonexistent2.txt"): { + "relpath": "data/nonexistent2.txt", + "size_in_bytes": 200, + "hash": "def456", + "utc_timestamp": 1234567890.0, + }, + } + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": items, + } + + # Create proto dataset (no items uploaded) + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + # Freezing should fail because items are missing + with pytest.raises(dtoolcore.DtoolCoreValueError) as excinfo: + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + assert "Missing" in str(excinfo.value) + assert "2" in str(excinfo.value) # Should mention 2 missing items + + +def test_freeze_with_manifest_partial_items(tmp_dir_fixture): # NOQA + """Test that freezing fails if some manifest items are missing.""" + import dtoolcore + import tempfile + import os + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-partial-items" + + # Create manifest with 2 items + items = { + generate_identifier("data/exists.txt"): { + "relpath": "data/exists.txt", + "size_in_bytes": 100, + "hash": "abc123", + "utc_timestamp": 1234567890.0, + }, + generate_identifier("data/missing.txt"): { + "relpath": "data/missing.txt", + "size_in_bytes": 200, + "hash": "def456", + "utc_timestamp": 1234567890.0, + }, + } + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": items, + } + + # Create proto dataset + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + # Create a temporary file to add to the dataset + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f: + f.write("test content") + temp_path = f.name + + try: + # Add only one item to storage + proto_dataset.put_item(temp_path, "data/exists.txt") + finally: + os.unlink(temp_path) + + # Freezing should fail because one item is missing + with pytest.raises(dtoolcore.DtoolCoreValueError) as excinfo: + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + assert "Missing" in str(excinfo.value) + assert "1" in str(excinfo.value) # Should mention 1 missing item + assert "missing.txt" in str(excinfo.value) + + +def test_freeze_with_manifest_items_exist(tmp_dir_fixture): # NOQA + """Test that freezing succeeds when all items exist.""" + import dtoolcore + import tempfile + import os + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-items-exist" + + # Create manifest with items + items = { + generate_identifier("data/file1.txt"): { + "relpath": "data/file1.txt", + "size_in_bytes": 100, + "hash": "abc123", + "utc_timestamp": 1234567890.0, + }, + generate_identifier("data/file2.txt"): { + "relpath": "data/file2.txt", + "size_in_bytes": 200, + "hash": "def456", + "utc_timestamp": 1234567890.0, + }, + } + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": items, + } + + # Create proto dataset + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + # Create temporary files and add them to the dataset + temp_files = [] + for relpath in ["data/file1.txt", "data/file2.txt"]: + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f: + f.write(f"content for {relpath}") + temp_files.append((f.name, relpath)) + + try: + for temp_path, relpath in 
temp_files: + proto_dataset.put_item(temp_path, relpath) + finally: + for temp_path, _ in temp_files: + os.unlink(temp_path) + + # Freezing should succeed because all items exist + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + # Verify the dataset was frozen correctly + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + assert dataset.admin_metadata["type"] == "dataset" + assert set(dataset.identifiers) == set(items.keys()) From d71011175831f815aba90b51cc592a1bfe922e24 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Mon, 8 Dec 2025 08:40:11 +0100 Subject: [PATCH 5/6] DOC: Updated CHANGELOG.rst --- CHANGELOG.rst | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 23604d7..6a3de27 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,35 @@ CHANGELOG This project uses `semantic versioning <https://semver.org>`_. This change log uses principles from `keep a changelog <https://keepachangelog.com>`_. +[3.20.0] - 2025-12-08 +--------------------- + + +Added +^^^^^ + +- ``freeze_with_manifest`` method on ``ProtoDataSet`` class + +Changed +^^^^^^^ + +- Changed build system to ``flit`` + +Deprecated +^^^^^^^^^^ + + +Removed +^^^^^^^ + + +Fixed +^^^^^ + + +Security +^^^^^^^^ + [3.19.0] - 2024-12-16 --------------------- From 497c6de5b79e5b57b6c1607e2d8b49fc2c61e831 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Mon, 8 Dec 2025 09:08:58 +0100 Subject: [PATCH 6/6] DOC: Updated CHANGELOG.rst --- CHANGELOG.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6a3de27..bf3f764 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -11,7 +11,11 @@ This change log uses principles from `keep a changelog
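
For orientation, the snippet below is a minimal sketch of the end-to-end workflow these patches enable. It is not part of the patches themselves: the base URI, dataset name, relpath, size, timestamp, and hash values are illustrative placeholders, and the hash is deliberately fake to underline that ``freeze_with_manifest`` trusts client-supplied values rather than re-verifying them.

import os
import tempfile

import dtoolcore
from dtoolcore.utils import generate_identifier

# Placeholder base URI; a throwaway directory keeps the sketch runnable.
base_uri = "file://" + tempfile.mkdtemp()

proto_dataset = dtoolcore.create_proto_dataset(
    name="signed-url-upload-demo",  # placeholder name
    base_uri=base_uri,
    readme_content="---\ndescription: demo",
    creator_username="uploader",
)

# Stand-in for the out-of-band upload (e.g. via a signed URL).
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
    f.write("payload")
    temp_path = f.name
try:
    proto_dataset.put_item(temp_path, "data/payload.txt")
finally:
    os.unlink(temp_path)

# Manifest as a client would report it after computing hashes during
# upload; freeze_with_manifest trusts these values as-is.
manifest = {
    "dtoolcore_version": dtoolcore.__version__,
    "hash_function": "md5sum_hexdigest",
    "items": {
        generate_identifier("data/payload.txt"): {
            "relpath": "data/payload.txt",
            "size_in_bytes": 7,
            "hash": "0" * 32,  # fake MD5 hex digest, not re-verified
            "utc_timestamp": 1234567890.0,
        },
    },
}

# Validates that the README and every listed item exist in storage,
# then freezes; raises DtoolCoreValueError otherwise.
proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0)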