diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 23604d7..bf3f764 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,39 @@ CHANGELOG This project uses `semantic versioning `_. This change log uses principles from `keep a changelog `_. +[3.20.0] - 2025-12-08 +--------------------- + + +Added +^^^^^ + +- ``freeze_with_manifest`` method on ``ProtoDataSet`` class for converting a + proto-dataset to a frozen dataset using a pre-computed manifest with + client-provided hashes. Validates that README and all manifest items exist + in storage before freezing. Useful for server applications that trust + client-computed hashes. + +Changed +^^^^^^^ + +- changed build system to ``flit`` + +Deprecated +^^^^^^^^^^ + + +Removed +^^^^^^^ + + +Fixed +^^^^^ + + +Security +^^^^^^^^ + [3.19.0] - 2024-12-16 --------------------- diff --git a/dtoolcore/__init__.py b/dtoolcore/__init__.py index 211aa30..208ea9f 100644 --- a/dtoolcore/__init__.py +++ b/dtoolcore/__init__.py @@ -858,6 +858,110 @@ def freeze(self, progressbar=None): # Clean up using the storage broker's post freeze hook. self._storage_broker.post_freeze_hook() + def freeze_with_manifest(self, manifest, frozen_at=None): + """ + Convert :class:`dtoolcore.ProtoDataSet` to :class:`dtoolcore.DataSet` + using a pre-computed manifest. + + This method freezes the dataset without computing hashes server-side. + The caller provides a manifest with pre-computed item properties + (hash, size, timestamp). This is useful for server-side operations + where the client has already computed hashes during upload. + + Before freezing, this method validates that: + - The README file exists in storage + - All items listed in the manifest exist in storage + + Note: This method does NOT verify that the hashes match - it trusts + the client-provided hashes in the manifest. 
+
+        :param manifest: dict with structure::
+
+            {
+                "dtoolcore_version": <version string>,
+                "hash_function": <name of hash function>,
+                "items": {
+                    <identifier>: {
+                        "relpath": <relative path>,
+                        "size_in_bytes": <size in bytes>,
+                        "hash": <hash value>,
+                        "utc_timestamp": <timestamp>
+                    }
+                }
+            }
+
+        :param frozen_at: optional timestamp for when the dataset was frozen.
+            If not provided, uses the current UTC time.
+        :raises: DtoolCoreValueError if README or any manifest item is missing
+        """
+        logger.debug("Freeze dataset with manifest {}".format(self))
+
+        # Validate that README exists
+        try:
+            self._storage_broker.get_readme_content()
+        except Exception as e:
+            raise DtoolCoreValueError(
+                f"README file is missing or cannot be read: {e}"
+            )
+
+        # Validate that all items in the manifest exist in storage
+        manifest_items = manifest.get("items", {})
+        if manifest_items:
+            # Get identifiers of items that actually exist in storage
+            existing_handles = set(self._storage_broker.iter_item_handles())
+            existing_identifiers = set(
+                dtoolcore.utils.generate_identifier(h) for h in existing_handles
+            )
+
+            # Check for missing items
+            expected_identifiers = set(manifest_items.keys())
+            missing_identifiers = expected_identifiers - existing_identifiers
+
+            if missing_identifiers:
+                # Get relpaths of missing items for better error message
+                missing_relpaths = [
+                    manifest_items[ident].get("relpath", ident)
+                    for ident in list(missing_identifiers)[:5]  # Limit to 5
+                ]
+                if len(missing_identifiers) > 5:
+                    missing_relpaths.append(
+                        f"... and {len(missing_identifiers) - 5} more"
+                    )
+                raise DtoolCoreValueError(
+                    f"Missing {len(missing_identifiers)} item(s) in storage: "
+                    f"{missing_relpaths}"
+                )
+
+        # Call the storage broker pre_freeze hook.
+        self._storage_broker.pre_freeze_hook()
+
+        # Use provided manifest instead of computing
+        self._storage_broker.put_manifest(manifest)
+
+        # Generate and persist overlays from any item metadata that has been
+        # added.
+ overlays = self._generate_overlays() + for overlay_name, overlay in overlays.items(): + self._put_overlay(overlay_name, overlay) + + # Change the type of the dataset from "protodataset" to "dataset" + # in the administrative metadata. + metadata_update = {"type": "dataset"} + + # Use provided frozen_at or generate one + if frozen_at is not None: + metadata_update["frozen_at"] = frozen_at + elif "frozen_at" not in self._admin_metadata: + datetime_obj = datetime.datetime.utcnow() + metadata_update["frozen_at"] = dtoolcore.utils.timestamp(datetime_obj) + + # Apply the change(s) to the administrative metadata. + self._admin_metadata.update(metadata_update) + self._storage_broker.put_admin_metadata(self._admin_metadata) + + # Clean up using the storage broker's post freeze hook. + self._storage_broker.post_freeze_hook() + class DataSetCreator(object): """Context manager for creating a dataset. diff --git a/pyproject.toml b/pyproject.toml index 9dbfb00..cb56204 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,17 +1,18 @@ [build-system] -requires = ["setuptools>=42", "setuptools_scm[toml]>=6.3"] -build-backend = "setuptools.build_meta" +requires = ["flit_scm"] +build-backend = "flit_scm:buildapi" [project] name = "dtoolcore" description = "Core API for managing (scientific) data" readme = "README.rst" -license = {file = "LICENSE"} +license = {text = "MIT"} authors = [ {name = "Tjelvar Olsson", email = "tjelvar.olsson@gmail.com"} ] dynamic = ["version"] -dependencies = ["setuptools"] +requires-python = ">=3.8" +dependencies = [] [project.optional-dependencies] test = [ @@ -29,13 +30,20 @@ Documentation = "https://dtoolcore.readthedocs.io" Repository = "https://github.com/jic-dtool/dtoolcore" Changelog = "https://github.com/jic-dtool/dtoolcore/blob/master/CHANGELOG.rst" +[project.entry-points."dtool.storage_brokers"] +DiskStorageBroker = "dtoolcore.storagebroker:DiskStorageBroker" + +[tool.flit.module] +name = "dtoolcore" + [tool.setuptools_scm] version_scheme = 
"guess-next-dev" local_scheme = "no-local-version" write_to = "dtoolcore/version.py" -[tool.setuptools] -packages = ["dtoolcore"] +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "--cov=dtoolcore --cov-report=term-missing" -[project.entry-points."dtool.storage_brokers"] -"DiskStorageBroker" = "dtoolcore.storagebroker:DiskStorageBroker" +[tool.flake8] +exclude = ["env*", ".tox", ".git", "*.egg", "build", "docs", "venv"] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 95f3f57..0000000 --- a/setup.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[flake8] -exclude=env*,.tox,.git,*.egg,build,docs,venv - -[tool:pytest] -testpaths = tests -addopts = --cov=dtoolcore --cov-report=term-missing -#addopts = -x --pdb - -[cov:run] -source = dtoolcore diff --git a/tests/test_freeze_with_manifest.py b/tests/test_freeze_with_manifest.py new file mode 100644 index 0000000..db35550 --- /dev/null +++ b/tests/test_freeze_with_manifest.py @@ -0,0 +1,583 @@ +"""Test the freeze_with_manifest method of ProtoDataSet.""" + +import os +import uuid as uuid_module + +import pytest + +from . 
import tmp_dir_fixture # NOQA + +from dtoolcore.utils import ( + IS_WINDOWS, + generous_parse_uri, + windows_to_unix_path, + generate_identifier, +) + + +def _sanitise_base_uri(tmp_dir): + base_uri = tmp_dir + if IS_WINDOWS: + parsed_base_uri = generous_parse_uri(tmp_dir) + unix_path = windows_to_unix_path(parsed_base_uri.path) + base_uri = "file://{}".format(unix_path) + return base_uri + + +def test_freeze_with_manifest_basic(tmp_dir_fixture): # NOQA + """Test basic freezing of a proto dataset with provided manifest.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-freeze-manifest" + creator_username = "tester" + frozen_at = 1234567890.123 + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create a proto dataset + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + creator_username=creator_username, + ) + + # Freeze with the provided manifest + proto_dataset.freeze_with_manifest(manifest, frozen_at=frozen_at) + + # Load the dataset and verify it's frozen + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + + assert isinstance(dataset, dtoolcore.DataSet) + assert dataset.name == name + assert dataset.admin_metadata["creator_username"] == creator_username + assert dataset.admin_metadata["frozen_at"] == frozen_at + assert dataset.admin_metadata["type"] == "dataset" + + +def test_freeze_with_manifest_with_items(tmp_dir_fixture): # NOQA + """Test freezing with manifest containing items.""" + import dtoolcore + import tempfile + import os + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-items" + frozen_at = 1234567890.0 + + # Create manifest with items + items = { + generate_identifier("data/file1.txt"): { + "relpath": "data/file1.txt", + "size_in_bytes": 100, + "hash": "abc123", + "utc_timestamp": 1234567890.0, + }, + generate_identifier("data/file2.csv"): { + "relpath": "data/file2.csv", + 
"size_in_bytes": 500, + "hash": "def456", + "utc_timestamp": 1234567891.0, + }, + } + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": items, + } + + # Create a proto dataset + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + # Add items to storage + temp_files = [] + for relpath in ["data/file1.txt", "data/file2.csv"]: + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f: + f.write(f"content for {relpath}") + temp_files.append((f.name, relpath)) + + try: + for temp_path, relpath in temp_files: + proto_dataset.put_item(temp_path, relpath) + finally: + for temp_path, _ in temp_files: + os.unlink(temp_path) + + # Freeze with the provided manifest + proto_dataset.freeze_with_manifest(manifest, frozen_at=frozen_at) + + # Load and verify + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + + assert set(dataset.identifiers) == set(items.keys()) + for identifier, props in items.items(): + item_props = dataset.item_properties(identifier) + assert item_props["relpath"] == props["relpath"] + assert item_props["size_in_bytes"] == props["size_in_bytes"] + assert item_props["hash"] == props["hash"] + + +def test_freeze_with_manifest_auto_frozen_at(tmp_dir_fixture): # NOQA + """Test that frozen_at is auto-generated if not provided.""" + import dtoolcore + import time + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-auto-frozen-at" + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create a proto dataset + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + before_freeze = time.time() + proto_dataset.freeze_with_manifest(manifest) + after_freeze = time.time() + + # Load and verify frozen_at was auto-generated + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + + assert "frozen_at" in dataset.admin_metadata + 
frozen_at = dataset.admin_metadata["frozen_at"] + assert before_freeze <= frozen_at <= after_freeze + + +def test_freeze_with_manifest_with_readme(tmp_dir_fixture): # NOQA + """Test freezing with README content.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-readme" + readme_content = "---\ndescription: Test dataset\nproject: Testing" + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create a proto dataset with README + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + readme_content=readme_content, + ) + + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + # Load and verify README persisted + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + assert dataset.get_readme_content() == readme_content + + +def test_freeze_with_manifest_with_tags(tmp_dir_fixture): # NOQA + """Test freezing with tags added to proto dataset.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-tags" + tags = ["production", "validated"] + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create a proto dataset and add tags + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + for tag in tags: + proto_dataset.put_tag(tag) + + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + # Load and verify tags persisted + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + assert set(dataset.list_tags()) == set(tags) + + +def test_freeze_with_manifest_with_annotations(tmp_dir_fixture): # NOQA + """Test freezing with annotations added to proto dataset.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-annotations" + annotations = { + "project": "test-project", + "version": 42, + } + + manifest = { + 
"dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create a proto dataset and add annotations + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + for ann_name, ann_value in annotations.items(): + proto_dataset.put_annotation(ann_name, ann_value) + + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + # Load and verify annotations persisted + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + assert set(dataset.list_annotation_names()) == set(annotations.keys()) + for ann_name, ann_value in annotations.items(): + assert dataset.get_annotation(ann_name) == ann_value + + +def test_freeze_with_manifest_full(tmp_dir_fixture): # NOQA + """Test freezing with all features combined.""" + import dtoolcore + import tempfile + import os + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "full-test-dataset" + creator_username = "scientist" + frozen_at = 1609459200.0 # 2021-01-01 00:00:00 UTC + readme_content = "---\nproject: Full Test\ndescription: Complete test" + tags = ["experiment", "simulation"] + annotations = { + "experiment_id": "EXP-001", + "parameters": {"temp": 300, "pressure": 1.0}, + } + + items = { + generate_identifier("results.json"): { + "relpath": "results.json", + "size_in_bytes": 1024, + "hash": "result_hash", + "utc_timestamp": frozen_at, + }, + } + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": items, + } + + # Create proto dataset with all features + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + readme_content=readme_content, + creator_username=creator_username, + ) + for tag in tags: + proto_dataset.put_tag(tag) + for ann_name, ann_value in annotations.items(): + proto_dataset.put_annotation(ann_name, ann_value) + + # Add item to storage + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as f: + 
f.write('{"results": "test data"}') + temp_path = f.name + + try: + proto_dataset.put_item(temp_path, "results.json") + finally: + os.unlink(temp_path) + + proto_dataset.freeze_with_manifest(manifest, frozen_at=frozen_at) + + # Load and verify everything + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + assert dataset.name == name + assert dataset.admin_metadata["creator_username"] == creator_username + assert dataset.admin_metadata["frozen_at"] == frozen_at + assert dataset.get_readme_content() == readme_content + assert set(dataset.list_tags()) == set(tags) + assert set(dataset.list_annotation_names()) == set(annotations.keys()) + assert set(dataset.identifiers) == set(items.keys()) + + +def test_freeze_with_manifest_different_hash_function(tmp_dir_fixture): # NOQA + """Test that hash_function in manifest is preserved.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-hash-function" + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "sha256sum_hexdigest", + "items": {}, + } + + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + # Load and verify hash function is preserved + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + loaded_manifest = dataset._storage_broker.get_manifest() + assert loaded_manifest["hash_function"] == "sha256sum_hexdigest" + + +def test_proto_dataset_type_before_freeze(tmp_dir_fixture): # NOQA + """Test that proto dataset has type 'protodataset' before freezing.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-proto-type" + + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + # Before freeze, should be a protodataset + assert proto_dataset.admin_metadata["type"] == "protodataset" + + # Can load as ProtoDataSet + loaded_proto = 
dtoolcore.ProtoDataSet.from_uri(proto_dataset.uri) + assert loaded_proto.admin_metadata["type"] == "protodataset" + + +def test_dataset_type_after_freeze(tmp_dir_fixture): # NOQA + """Test that dataset has type 'dataset' after freezing.""" + import dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-dataset-type" + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + # After freeze, should be a dataset + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + assert dataset.admin_metadata["type"] == "dataset" + + # Cannot load as ProtoDataSet anymore + with pytest.raises(dtoolcore.DtoolCoreTypeError): + dtoolcore.ProtoDataSet.from_uri(proto_dataset.uri) + + +def test_freeze_with_manifest_missing_readme(tmp_dir_fixture): # NOQA + """Test that freezing fails if README is missing.""" + import dtoolcore + import os + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-missing-readme" + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": {}, + } + + # Create proto dataset + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + # Delete the README file + from dtoolcore.utils import generous_parse_uri + parsed = generous_parse_uri(proto_dataset.uri) + readme_path = os.path.join(parsed.path, "README.yml") + os.remove(readme_path) + + # Freezing should fail because README is missing + with pytest.raises(dtoolcore.DtoolCoreValueError) as excinfo: + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + assert "README" in str(excinfo.value) + + +def test_freeze_with_manifest_missing_items(tmp_dir_fixture): # NOQA + """Test that freezing fails if manifest items are missing from storage.""" + import 
dtoolcore + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-missing-items" + + # Create manifest with items that don't exist in storage + items = { + generate_identifier("data/nonexistent1.txt"): { + "relpath": "data/nonexistent1.txt", + "size_in_bytes": 100, + "hash": "abc123", + "utc_timestamp": 1234567890.0, + }, + generate_identifier("data/nonexistent2.txt"): { + "relpath": "data/nonexistent2.txt", + "size_in_bytes": 200, + "hash": "def456", + "utc_timestamp": 1234567890.0, + }, + } + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": items, + } + + # Create proto dataset (no items uploaded) + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + # Freezing should fail because items are missing + with pytest.raises(dtoolcore.DtoolCoreValueError) as excinfo: + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + assert "Missing" in str(excinfo.value) + assert "2" in str(excinfo.value) # Should mention 2 missing items + + +def test_freeze_with_manifest_partial_items(tmp_dir_fixture): # NOQA + """Test that freezing fails if some manifest items are missing.""" + import dtoolcore + import tempfile + import os + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-partial-items" + + # Create manifest with 2 items + items = { + generate_identifier("data/exists.txt"): { + "relpath": "data/exists.txt", + "size_in_bytes": 100, + "hash": "abc123", + "utc_timestamp": 1234567890.0, + }, + generate_identifier("data/missing.txt"): { + "relpath": "data/missing.txt", + "size_in_bytes": 200, + "hash": "def456", + "utc_timestamp": 1234567890.0, + }, + } + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": items, + } + + # Create proto dataset + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + # Create a temporary file to add to the dataset 
+ with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f: + f.write("test content") + temp_path = f.name + + try: + # Add only one item to storage + proto_dataset.put_item(temp_path, "data/exists.txt") + finally: + os.unlink(temp_path) + + # Freezing should fail because one item is missing + with pytest.raises(dtoolcore.DtoolCoreValueError) as excinfo: + proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + assert "Missing" in str(excinfo.value) + assert "1" in str(excinfo.value) # Should mention 1 missing item + assert "missing.txt" in str(excinfo.value) + + +def test_freeze_with_manifest_items_exist(tmp_dir_fixture): # NOQA + """Test that freezing succeeds when all items exist.""" + import dtoolcore + import tempfile + import os + + base_uri = _sanitise_base_uri(tmp_dir_fixture) + name = "test-items-exist" + + # Create manifest with items + items = { + generate_identifier("data/file1.txt"): { + "relpath": "data/file1.txt", + "size_in_bytes": 100, + "hash": "abc123", + "utc_timestamp": 1234567890.0, + }, + generate_identifier("data/file2.txt"): { + "relpath": "data/file2.txt", + "size_in_bytes": 200, + "hash": "def456", + "utc_timestamp": 1234567890.0, + }, + } + + manifest = { + "dtoolcore_version": dtoolcore.__version__, + "hash_function": "md5sum_hexdigest", + "items": items, + } + + # Create proto dataset + proto_dataset = dtoolcore.create_proto_dataset( + name=name, + base_uri=base_uri, + ) + + # Create temporary files and add them to the dataset + temp_files = [] + for relpath in ["data/file1.txt", "data/file2.txt"]: + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f: + f.write(f"content for {relpath}") + temp_files.append((f.name, relpath)) + + try: + for temp_path, relpath in temp_files: + proto_dataset.put_item(temp_path, relpath) + finally: + for temp_path, _ in temp_files: + os.unlink(temp_path) + + # Freezing should succeed because all items exist + 
proto_dataset.freeze_with_manifest(manifest, frozen_at=1234567890.0) + + # Verify the dataset was frozen correctly + dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri) + assert dataset.admin_metadata["type"] == "dataset" + assert set(dataset.identifiers) == set(items.keys())