From 457adbea52a7b141600b2020b4d41a62127d2aa7 Mon Sep 17 00:00:00 2001 From: Alexander Belikov Date: Mon, 16 Feb 2026 21:01:52 +0100 Subject: [PATCH] solved: resources with disconnected vertices removed --- .github/workflows/build-docs.yml | 2 +- .github/workflows/pre-commit.yml | 2 +- .github/workflows/pypi-publish.yml | 2 +- graflo/architecture/actor.py | 9 +- graflo/architecture/schema.py | 28 ++---- test/architecture/test_schema.py | 155 ++++++++++++++++++++++++++++- 6 files changed, 173 insertions(+), 25 deletions(-) diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml index bb458f15..ec97488c 100644 --- a/.github/workflows/build-docs.yml +++ b/.github/workflows/build-docs.yml @@ -30,7 +30,7 @@ jobs: - name: Install uv uses: astral-sh/setup-uv@v5 with: - version: "0.9.5" + version: "0.9.28" - name: Install dependencies run: | uv sync --group docs --no-group dev diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 782465be..4dba3386 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -13,7 +13,7 @@ jobs: - name: Install uv uses: astral-sh/setup-uv@v5 with: - version: "0.9.5" + version: "0.9.28" - name: Install dependencies (including dev) run: uv sync --group dev - name: Run pre-commit diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index 3e71c8f6..a7b5180a 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -26,7 +26,7 @@ jobs: - name: Install uv uses: astral-sh/setup-uv@v5 with: - version: "0.8.14" + version: "0.9.28" - name: Build and publish run: | uv build diff --git a/graflo/architecture/actor.py b/graflo/architecture/actor.py index 7e2a401f..43b33a06 100644 --- a/graflo/architecture/actor.py +++ b/graflo/architecture/actor.py @@ -1297,8 +1297,8 @@ def remove_descendants_if(self, predicate: Callable[[ActorWrapper], bool]) -> No Mutates the tree in place: for each DescendActor, filters its descendants to exclude wrappers matching the predicate, after - recursing into each descendant. Use with find_descendants to - remove actors that reference disconnected vertices. + recursing into each descendant. Intermediate DescendActor + wrappers that become empty after pruning are also removed. Args: predicate: Callable(ActorWrapper) -> bool. Descendants for @@ -1308,5 +1308,8 @@ def remove_descendants_if(self, predicate: Callable[[ActorWrapper], bool]) -> No for d in list(self.actor.descendants): d.remove_descendants_if(predicate=predicate) self.actor._descendants[:] = [ - d for d in self.actor.descendants if not predicate(d) + d + for d in self.actor.descendants + if not predicate(d) + and not (isinstance(d.actor, DescendActor) and d.count() == 0) ] diff --git a/graflo/architecture/schema.py b/graflo/architecture/schema.py index 729b200c..e4e92c6f 100644 --- a/graflo/architecture/schema.py +++ b/graflo/architecture/schema.py @@ -175,10 +175,11 @@ def remove_disconnected_vertices(self) -> None: """Remove vertices that do not take part in any relation (disconnected). Builds the set of vertex names that appear as source or target of any - edge, then removes from VertexConfig all other vertices. For each - resource, finds actors that reference disconnected vertices (via - find_descendants) and removes them from the actor tree. Resources - whose root actor references only disconnected vertices are removed. + edge, then removes from VertexConfig all other vertices. For each + resource, removes actors that reference disconnected vertices from the + actor tree. If a resource's root directly references a disconnected + vertex (single-step pipeline) or becomes empty after pruning, the + entire resource is removed. Mutates this schema in place. """ @@ -189,7 +190,7 @@ def remove_disconnected_vertices(self) -> None: self.vertex_config.remove_vertices(disconnected) - def mentions_disconnected(wrapper): + def _mentions_disconnected(wrapper) -> bool: actor = wrapper.actor if isinstance(actor, VertexActor): return actor.name in disconnected @@ -205,21 +206,12 @@ def mentions_disconnected(wrapper): to_drop: list[Resource] = [] for resource in self.resources: root = resource.root - to_remove = set( - root.find_descendants(actor_type=VertexActor, name=disconnected) - + root.find_descendants(actor_type=TransformActor, vertex=disconnected) - + root.find_descendants( - predicate=lambda w: isinstance(w.actor, EdgeActor) - and ( - w.actor.edge.source in disconnected - or w.actor.edge.target in disconnected - ), - ) - ) - if mentions_disconnected(root): + if _mentions_disconnected(root): to_drop.append(resource) continue - root.remove_descendants_if(lambda w: w in to_remove) + root.remove_descendants_if(_mentions_disconnected) + if not any(isinstance(a, VertexActor) for a in root.collect_actors()): + to_drop.append(resource) for r in to_drop: self.resources.remove(r) diff --git a/test/architecture/test_schema.py b/test/architecture/test_schema.py index f6f5c394..8b6f08c6 100644 --- a/test/architecture/test_schema.py +++ b/test/architecture/test_schema.py @@ -1,6 +1,6 @@ import logging -from graflo.architecture.actor import VertexActor +from graflo.architecture.actor import EdgeActor, VertexActor from graflo.architecture.resource import Resource from graflo.architecture.schema import Schema @@ -70,3 +70,156 @@ def test_remove_disconnected_vertices(vertex_config_kg, edge_config_kg): # Resource r1 should still exist but without the VertexActor(publication) assert len(sch.resources) == 1 assert len(root.find_descendants(actor_type=VertexActor, name={"publication"})) == 0 + + +def test_remove_disconnected_vertices_drops_resource(vertex_config_kg, edge_config_kg): + """A resource that only references a disconnected vertex should be removed entirely.""" + # publication is disconnected (not in any edge) + # r_only_pub pipeline has a single step targeting only the disconnected vertex + schema_dict = { + "vertex_config": vertex_config_kg, + "edge_config": edge_config_kg, + "resources": [ + { + "resource_name": "r_connected", + "apply": [ + {"vertex": "entity"}, + {"source": "mention", "target": "entity"}, + ], + }, + { + "resource_name": "r_only_pub", + "apply": [ + {"vertex": "publication"}, + ], + }, + ], + "general": {"name": "kg"}, + } + sch = Schema.from_dict(schema_dict) + assert len(sch.resources) == 2 + + sch.remove_disconnected_vertices() + + assert sch.vertex_config.vertex_set == {"entity", "mention"} + # r_only_pub should be gone — its only actor referenced a disconnected vertex + assert len(sch.resources) == 1 + assert sch.resources[0].name == "r_connected" + + +def test_remove_disconnected_vertices_nested_resource(vertex_config_kg, edge_config_kg): + """Nested descend blocks are pruned correctly. + + Covers three scenarios: + 1. Mixed nested descend — disconnected actors inside are removed, + connected actors and the descend wrapper survive. + 2. Nested descend becomes empty — the descend wrapper itself is + dropped, but the resource survives because of other actors. + 3. Resource whose only content is a nested descend with disconnected + actors — the resource is removed entirely. + """ + # vertex_config_kg: publication, entity, mention + # edge_config_kg edges: entity→entity, entity→entity(aux), mention→entity + # connected = {entity, mention}; publication is disconnected + schema_dict = { + "vertex_config": vertex_config_kg, + "edge_config": edge_config_kg, + "resources": [ + { + "resource_name": "r_mixed", + "apply": [ + {"vertex": "entity"}, + { + "key": "items", + "apply": [ + {"vertex": "publication"}, + {"vertex": "mention"}, + ], + }, + {"source": "mention", "target": "entity"}, + ], + }, + { + "resource_name": "r_nested_empty", + "apply": [ + {"vertex": "entity"}, + { + "key": "items", + "apply": [ + {"vertex": "publication"}, + ], + }, + ], + }, + { + "resource_name": "r_all_disconnected", + "apply": [ + { + "key": "items", + "apply": [ + {"vertex": "publication"}, + ], + }, + ], + }, + ], + "general": {"name": "kg"}, + } + sch = Schema.from_dict(schema_dict) + + # -- preconditions -- + assert len(sch.resources) == 3 + r_mixed = sch.fetch_resource("r_mixed") + r_nested_empty = sch.fetch_resource("r_nested_empty") + assert ( + len(r_mixed.root.find_descendants(actor_type=VertexActor, name={"publication"})) + == 1 + ) + assert ( + len( + r_nested_empty.root.find_descendants( + actor_type=VertexActor, name={"publication"} + ) + ) + == 1 + ) + + sch.remove_disconnected_vertices() + + assert sch.vertex_config.vertex_set == {"entity", "mention"} + + # r_mixed: publication removed from nested descend; mention and edge survive + assert ( + len(r_mixed.root.find_descendants(actor_type=VertexActor, name={"publication"})) + == 0 + ) + assert ( + len(r_mixed.root.find_descendants(actor_type=VertexActor, name={"mention"})) + == 1 + ) + assert len(r_mixed.root.find_descendants(actor_type=EdgeActor)) == 1 + + # r_nested_empty: the nested descend was emptied and dropped; + # only vertex: entity remains + assert ( + len( + r_nested_empty.root.find_descendants( + actor_type=VertexActor, name={"publication"} + ) + ) + == 0 + ) + assert ( + len( + r_nested_empty.root.find_descendants( + actor_type=VertexActor, name={"entity"} + ) + ) + == 1 + ) + assert r_nested_empty.count() == 1 + + # r_all_disconnected: removed entirely — its only content was disconnected + assert len(sch.resources) == 2 + resource_names = {r.name for r in sch.resources} + assert "r_all_disconnected" not in resource_names