From cb61d74c74dcf8c700fc0cf5bd4181201c4e7d26 Mon Sep 17 00:00:00 2001
From: Conscht <constantin.auga@gmx.de>
Date: Tue, 17 Feb 2026 15:01:38 +0100
Subject: [PATCH 1/5] Automatically evaluate partly, fully labeled, and combined
 data

---
 README.md                        |  25 ++--
 assets/sample_list_per_split.txt | 117 +++++++++++++++
 evalinstseg/evaluate.py          | 236 ++++++++++++++++++-------------
 evalinstseg/metrics.py           |  17 ++-
 evalinstseg/summarize.py         |   4 +-
 evalinstseg/util.py              |  27 ++++
 6 files changed, 310 insertions(+), 116 deletions(-)
 create mode 100644 assets/sample_list_per_split.txt

diff --git a/README.md b/README.md
index 6cf2540..af2784f 100644
--- a/README.md
+++ b/README.md
@@ -55,12 +55,13 @@ The `evalinstseg` command is automatically available after installation.
 
 ### 1. Evaluate a Single File
 ```bash
-evalinstseg \
-  --res_file tests/pred/R14A02-20180905.hdf \
-  --res_key volumes/labels \
-  --gt_file tests/gt/R14A02-20180905.zarr \
-  --gt_key volumes/gt_instances \
-  --out_dir tests/results \
+evalinstseg `
+  --res_file tests/pred/R14A02-20180905_65_A6.hdf `
+  --res_key volumes/gmm_label_cleaned `
+  --gt_file tests/gt/R14A02-20180905_65_A6.zarr `
+  --gt_key volumes/gt_instances `
+  --split_file assets/sample_list_per_split.txt `
+  --out_dir tests/results `
   --app flylight
 ```
 
@@ -68,12 +69,12 @@ evalinstseg \
 If you provide a directory path to `--res_file`, the tool will look for matching Ground Truth files in the `--gt_file` folder. Files are matched by name.
 
 ```bash
-evalinstseg \
-  --res_file /path/to/predictions_folder \
-  --res_key volumes/labels \
-  --gt_file /path/to/ground_truth_folder \
-  --gt_key volumes/gt_instances \
-  --out_dir /path/to/output_folder \
+evalinstseg `
+  --res_file /path/to/predictions_folder `
+  --res_key volumes/gmm_label_cleaned `
+  --gt_file /path/to/ground_truth_folder `
+  --gt_key volumes/gt_instances `
+  --out_dir /path/to/output_folder `
   --app flylight
 ```
 
diff --git a/assets/sample_list_per_split.txt b/assets/sample_list_per_split.txt
new file mode 100644
index 0000000..c6ca31b
--- /dev/null
+++ b/assets/sample_list_per_split.txt
@@ -0,0 +1,117 @@
+(1) samples for FlyLight completely:
+
+train:
+R38F04-20181005_63_G3
+R38F04-20181005_63_G5
+R38F04-20181005_63_H1
+R53A10-20181019_64_A4
+R75E01-20181030_64_D1
+VT008647-20171222_63_D2
+VT008647-20171222_63_D1
+VT008647-20171222_63_E1
+VT019303-20171013_65_B6
+VT019307-20171013_65_F1
+VT033051-20171128_61_E4
+VT033051-20171128_61_E2
+VT040433-20170919_63_D6
+VT047848-20171020_66_I3
+VT047848-20171020_66_I2
+VT047848-20171020_66_J2
+VT047848-20171020_66_I5
+VT061467-20180911_62_E5
+
+val:
+R22C03-20180918_66_J2
+VT012403-20171128_61_B2
+VT033614-20171124_64_H1
+VT033614-20171124_64_H5
+VT041298-20171114_63_C3
+
+test:
+JRC_SS04989-20160318_24_A2
+R14A02-20180905_65_A6
+R54A09-20181019_64_H1
+VT011145-20171222_63_I1
+VT027175-20171031_62_H3
+VT027175-20171031_62_H4
+VT050157-20171110_61_C1
+
+(2) samples for FlyLight partly:
+
+train:
+R14B11-20180905_65_D2
+R14B11-20180905_65_D6
+R24D12-20180921_65_J6
+R38F04-20181005_63_G2
+R38F04-20181005_63_G4
+VT003236-20170602_62_G4
+VT003236-20170602_62_G5
+VT007080-20170517_61_A2
+VT007080-20170517_61_A4
+VT007080-20170517_61_A5
+VT008135-20171122_61_C2
+VT008647-20171222_63_D5
+VT008647-20171222_63_D6
+VT010264-20171222_63_H2
+VT010264-20171222_63_H5
+VT011049-20180918_66_I1
+VT024641-20170615_62_D2
+VT024641-20170615_62_D3
+VT024641-20170615_62_D5
+VT024641-20170615_62_D6
+VT024641-20170615_62_E1
+VT025523-20170915_64_I1
+VT026776-20171017_62_J1
+VT033051-20171128_61_E3
+VT033296-20171010_62_B4
+VT034391-20171128_61_G2
+VT038149-20171103_62_F1
+VT039484-20171020_64_C1
+VT039484-20171020_64_C2
+VT040430-20170919_63_C4
+VT040433-20170919_63_E1
+VT045568-20171020_66_C5
+VT045568-20171020_66_D2
+VT047848-20171020_66_I1
+VT047848-20171020_66_I4
+VT047848-20171020_66_J1
+VT050217-20171110_61_D6
+VT050217-20171110_61_E1
+VT058568-20170926_64_E1
+VT060731-20170517_63_F1
+VT060731-20170517_63_F2
+VT061467-20180911_62_E4
+VT062059-20170727_61_D4
+
+val:
+JRC_SS05008-20160318_24_B1
+JRC_SS05008-20160318_24_B2
+R22C03-20180918_66_J1
+R9F03-20181030_62_B5
+VT008194-20171222_63_A3
+VT008194-20171222_63_A5
+VT012403-20171128_61_B1
+VT033614-20171124_64_H4
+VT039350-20171020_64_A1
+VT039350-20171020_64_A3
+VT039350-20171020_64_A6
+VT059775-20170630_63_D5
+
+test:
+R54A09-20181019_64_H4
+R54A09-20181019_64_H6
+R73H08-20181030_62_G5
+VT006202-20170511_63_C4
+VT011145-20171222_63_I2
+VT021537-20171003_61_C3
+VT023747-20171017_61_F1
+VT027175-20171031_62_H6
+VT028606-20170721_65_A2
+VT028606-20170721_65_A3
+VT033453-20170721_65_D2
+VT033453-20170721_65_D4
+VT033453-20170721_65_D5
+VT046838-20170922_62_A2
+VT050157-20171110_61_C5
+VT058571-20170926_64_G6
+
diff --git a/evalinstseg/evaluate.py b/evalinstseg/evaluate.py
index 46b0fc7..6c42ad6 100644
--- a/evalinstseg/evaluate.py
+++ b/evalinstseg/evaluate.py
@@ -14,6 +14,7 @@
     check_fix_and_unify_ids,
     get_output_name,
     read_file,
+    load_partly_config
 )
 from .localize import compute_localization_criterion
 from .match import assign_labels, get_false_labels
@@ -489,6 +490,10 @@ def main():
     parser.add_argument(
         "--run_dirs", nargs="+", type=str, help="List of 3 experiment directories"
     )
+    parser.add_argument(
+        "--split_file", type=str, default="sample_list_per_split.txt", 
+        help="Path to sample split file"
+    )
     parser.add_argument(
         "--res_file", nargs="+", type=str, help="path to result file"
     )
@@ -630,26 +635,36 @@ def main():
     logger.debug("arguments %s", tuple(sys.argv))
     args = parser.parse_args()
 
+    # --- 1. LOAD CONFIG for PARTLY LABELED DATA ---
+    partly_set = set()
+    if args.split_file and os.path.exists(args.split_file):
+        partly_set = load_partly_config(args.split_file)
+        logger.info(f"Loaded {len(partly_set)} partly labeled samples from config.")
+
+    # --- HELPER: FILE MATCHING ---
     def get_gt_file(in_fn, gt_folder):
         """Helper to get gt file corresponding to input result file."""
-        out_fn = os.path.join(
-            gt_folder, os.path.basename(in_fn).split(".")[0] + ".zarr"
-        )
-        return out_fn
+        return os.path.join(gt_folder, os.path.basename(in_fn).split(".")[0] + ".zarr")
 
-    def _run_loop(res_files, gt_files, out_dirs, partly_list_loc):
+    # --- HELPER: EVALUATION LOOP ---
+    def _run_loop(res_files, gt_files, out_dirs):
         """Core evaluation loop used in normal and stability mode."""
-        
         loop_samples = []
         loop_metrics = []
-        for res_file, gt_file, partly, out_dir in zip(
-            res_files, gt_files, partly_list_loc, out_dirs
-        ):
-            if not os.path.exists(out_dir): os.makedirs(out_dir, exist_ok=True)
-            
+        actual_partly_flags = []
+
+        for res_file, gt_file, out_dir in zip(res_files, gt_files, out_dirs):
+            if not os.path.exists(out_dir):
+                os.makedirs(out_dir, exist_ok=True)
             sample_name = os.path.basename(res_file).split(".")[0]
             logger.info("sample_name: %s", sample_name)
             
+            # AUTO-DETECT PARTLY MODE
+            if partly_set:
+                is_partly = sample_name in partly_set
+            else:
+                is_partly = args.partly
+    
             metric_dict = evaluate_file(
                 res_file,
                 gt_file,
@@ -663,7 +678,7 @@ def _run_loop(res_files, gt_files, out_dirs, partly_list_loc):
                 add_general_metrics=args.add_general_metrics,
                 visualize=args.visualize,
                 visualize_type=args.visualize_type,
-                partly=partly,
+                partly=is_partly,
                 foreground_only=args.foreground_only,
                 remove_small_components=args.remove_small_components,
                 evaluate_false_labels=args.evaluate_false_labels,
@@ -675,18 +690,76 @@ def _run_loop(res_files, gt_files, out_dirs, partly_list_loc):
             )
             loop_metrics.append(metric_dict)
             loop_samples.append(sample_name)
-            print(f"Evaluated {sample_name}: {metric_dict}")
+            actual_partly_flags.append(is_partly)
+            print(f"Evaluated {sample_name}: {metric_dict}, Partly={is_partly}")
             
-        return loop_metrics, loop_samples
+        return loop_metrics, loop_samples, np.array(actual_partly_flags, dtype=bool)
+
+    # --- PRESETS ---
+    if args.app == "flylight":
+        print("Using FlyLight Presets...")
+        args.ndim = 3
+        args.localization_criterion = "cldice"
+        args.assignment_strategy = "greedy"
+        args.remove_small_components = 800
+        args.metric = "general.avg_f1_cov_score"
+        args.add_general_metrics = [
+            "avg_gt_skel_coverage",
+            "avg_f1_cov_score",
+            "false_merge",
+            "false_split",
+            "avg_gt_cov_dim",
+            "avg_gt_cov_overlap",
+        ]
+        args.summary = [
+            "general.Num GT",
+            "general.Num Pred",
+            "general.avg_f1_cov_score",
+            "confusion_matrix.avFscore",
+            "general.avg_gt_skel_coverage",
+            "confusion_matrix.th_0_5.AP_TP",
+            "confusion_matrix.th_0_5.AP_FP",
+            "confusion_matrix.th_0_5.AP_FN",
+            "general.FM",
+            "general.FS",
+            "general.TP_05",
+            "general.TP_05_rel",
+            "general.avg_TP_05_cldice",
+            "general.GT_dim",
+            "general.TP_05_dim",
+            "general.TP_05_rel_dim",
+            "general.avg_gt_cov_dim",
+            "general.GT_overlap",
+            "general.TP_05_overlap",
+            "general.TP_05_rel_overlap",
+            "general.avg_gt_cov_overlap",
+            "confusion_matrix.th_0_1.fscore",
+            "confusion_matrix.th_0_2.fscore",
+            "confusion_matrix.th_0_3.fscore",
+            "confusion_matrix.th_0_4.fscore",
+            "confusion_matrix.th_0_5.fscore",
+            "confusion_matrix.th_0_6.fscore",
+            "confusion_matrix.th_0_7.fscore",
+            "confusion_matrix.th_0_8.fscore",
+            "confusion_matrix.th_0_9.fscore",
+        ]
+        args.visualize_type = "neuron"
+        args.fm_thresh = 0.1
+        args.fs_thresh = 0.05
+        args.eval_dim = True
 
     # Stability Mode (Wraps logic 3 times)
     if args.stability_mode:
         if not args.run_dirs or len(args.run_dirs) != 3:
             raise ValueError("Stability mode requires exactly 3 directories passed to --run_dirs")
         
-        stability_scores = []
-        print("--- EVALUTE USING STABILITY MODE ---")
-
+        # Lists to store the 3 run results for each mode
+        scores_complete = []
+        scores_partly = []
+        scores_combined = []
+        
+        print("--- 3x STABILITY MODE ---")
+        
         for run_idx, run_dir in enumerate(args.run_dirs):
             print(f"Processing Run {run_idx+1}: {run_dir}")
             
@@ -700,24 +773,48 @@ def _run_loop(res_files, gt_files, out_dirs, partly_list_loc):
             run_out_dirs = [os.path.join(args.out_dir[0], f"seed_{run_idx+1}")] * len(run_res_files)
             
             # Run the inner loop
-            m_dicts, s_names = _run_loop(run_res_files, run_gt_files, run_out_dirs, [args.partly]*len(run_res_files))
+            metric_dicts, samples, actual_partly_flags = _run_loop(run_res_files, run_gt_files, run_out_dirs)
             
-            # Aggregate just this run
-            metrics_full = {s: m for m, s in zip(m_dicts, s_names) if m is not None}
-            acc, _ = average_flylight_score_over_instances(s_names, metrics_full)
-            stability_scores.append(acc)
+            metrics_full = {s: m for m, s in zip(metric_dicts, samples) if m is not None}
+            s_arr = np.array(samples)
+
+            # CALCULATE SEPARATE MODES
+            if len(np.unique(actual_partly_flags)) > 1:
+                # Mode 1: Complete Only
+                acc_cpt, acc_inst_cpt = average_flylight_score_over_instances(s_arr[~actual_partly_flags], metrics_full)
+                scores_complete.append(acc_cpt)
+
+                # Mode 2: Partly Only
+                acc_prt, acc_inst_prt = average_flylight_score_over_instances(s_arr[actual_partly_flags], metrics_full)
+                scores_partly.append(acc_prt)
+
+                # Mode 3: Combined
+                acc_comb, _ = average_sets(acc_cpt, acc_inst_cpt, acc_prt, acc_inst_prt)
+                scores_combined.append(acc_comb)
+            else:
+                # Uniform case
+                acc, _ = average_flylight_score_over_instances(samples, metrics_full)
+                scores_combined.append(acc)
 
         # Print Average and Std Dev across runs
         print("\n=== FISBe BENCHMARK RESULTS (Mean ± Std) ===")
-        if stability_scores:
-            for key in stability_scores[0].keys():
-                values = [s[key] for s in stability_scores if key in s]
-                if len(values) == 3:
-                    print(f"{key:<30}: {np.mean(values):.4f} ± {np.std(values):.4f}")
+        
+        def print_stats(title, score_list):
+            if not score_list: return
+            print(f"\n--- {title} ---")
+            for key in score_list[0].keys():
+                vals = [s[key] for s in score_list if key in s]
+                if len(vals) == 3:
+                    print(f"{key:<30}: {np.mean(vals):.4f} ± {np.std(vals):.4f}")
+        
+        # Report all 3
+        print_stats("MODE 1: COMPLETELY LABELED", scores_complete)
+        print_stats("MODE 2: PARTLY LABELED", scores_partly)
+        print_stats("MODE 3: COMBINED (MAIN BENCHMARK)", scores_combined)
 
     # Normal Mode 
     else:
-        print("--- EVALUTE USING SINGLE DIR ---")
+        print("--- EVALUATE USING SINGLE DIR ---")
         # shortcut if res_file and gt_file contain folders
         if len(args.res_file) == 1 and len(args.gt_file) == 1:
             res_file = args.res_file[0]
@@ -732,18 +829,11 @@ def _run_loop(res_files, gt_files, out_dirs, partly_list_loc):
         assert len(args.res_file) == len(args.gt_file), (
             "Please check, not the same number of result and gt files"
         )
-        # set partly parameter for all samples if not done already
-        if len(args.res_file) > 1:
-            if args.partly_list is not None:
-                assert len(args.partly_list) == len(args.res_file), (
-                    "Please check, not the same number of result files "
-                    "and partly_list values"
-                )
-                partly_list = np.array(args.partly_list, dtype=bool)
-            else:
-                partly_list = [args.partly] * len(args.res_file)
-        else:
-            partly_list = [args.partly]
+        
+        # We don't need partly_list prep here anymore as we auto-detect
+        # But we still check validity if user provided it
+        if args.partly_list is not None:
+             assert len(args.partly_list) == len(args.res_file)
 
         # check out_dir
         if len(args.res_file) > 1:
@@ -761,64 +851,7 @@ def _run_loop(res_files, gt_files, out_dirs, partly_list_loc):
         if args.summary_out_dir is None:
             args.summary_out_dir = args.out_dir[0]
 
-        if args.app is not None:
-            if args.app == "flylight":
-                print(
-                    "Warning: parameter app is set and will overwrite parameters. "
-                    "This might not be what you want."
-                )
-                args.ndim = 3
-                args.localization_criterion = "cldice"
-                args.assignment_strategy = "greedy"
-                args.remove_small_components = 800
-                # args.evaluate_false_labels = True
-                args.metric = "general.avg_f1_cov_score"
-                args.add_general_metrics = [
-                    "avg_gt_skel_coverage",
-                    "avg_f1_cov_score",
-                    "false_merge",
-                    "false_split",
-                    "avg_gt_cov_dim",
-                    "avg_gt_cov_overlap",
-                ]
-                args.summary = [
-                    "general.Num GT",
-                    "general.Num Pred",
-                    "general.avg_f1_cov_score",
-                    "confusion_matrix.avFscore",
-                    "general.avg_gt_skel_coverage",
-                    "confusion_matrix.th_0_5.AP_TP",
-                    "confusion_matrix.th_0_5.AP_FP",
-                    "confusion_matrix.th_0_5.AP_FN",
-                    "general.FM",
-                    "general.FS",
-                    "general.TP_05",
-                    "general.TP_05_rel",
-                    "general.avg_TP_05_cldice",
-                    "general.GT_dim",
-                    "general.TP_05_dim",
-                    "general.TP_05_rel_dim",
-                    "general.avg_gt_cov_dim",
-                    "general.GT_overlap",
-                    "general.TP_05_overlap",
-                    "general.TP_05_rel_overlap",
-                    "general.avg_gt_cov_overlap",
-                    "confusion_matrix.th_0_1.fscore",
-                    "confusion_matrix.th_0_2.fscore",
-                    "confusion_matrix.th_0_3.fscore",
-                    "confusion_matrix.th_0_4.fscore",
-                    "confusion_matrix.th_0_5.fscore",
-                    "confusion_matrix.th_0_6.fscore",
-                    "confusion_matrix.th_0_7.fscore",
-                    "confusion_matrix.th_0_8.fscore",
-                    "confusion_matrix.th_0_9.fscore",
-                ]
-                args.visualize_type = "neuron"
-                args.fm_thresh = 0.1
-                args.fs_thresh = 0.05
-                args.eval_dim = True
-
-        metric_dicts, samples = _run_loop(args.res_file, args.gt_file, outdir_list, partly_list)
+        metric_dicts, samples, actual_partly_flags = _run_loop(args.res_file, args.gt_file, outdir_list)
 
         # aggregate over instances
         metrics_full = {}
@@ -827,15 +860,16 @@ def _run_loop(res_files, gt_files, out_dirs, partly_list_loc):
             if metric_dict is None:
                 continue
             metrics_full[sample] = metric_dict
-        if len(np.unique(partly_list)) > 1:
+            
+        if len(np.unique(actual_partly_flags)) > 1:
             print("averaging for combined")
             # get average over instances for completely
             samples = np.array(samples)
             acc_cpt, acc_inst_cpt = average_flylight_score_over_instances(
-                samples[partly_list == False], metrics_full
+                samples[actual_partly_flags == False], metrics_full
             )
             acc_prt, acc_inst_prt = average_flylight_score_over_instances(
-                samples[partly_list == True], metrics_full
+                samples[actual_partly_flags == True], metrics_full
             )
             acc, acc_all_instances = average_sets(
                 acc_cpt, acc_inst_cpt, acc_prt, acc_inst_prt
diff --git a/evalinstseg/metrics.py b/evalinstseg/metrics.py
index 6c1a836..f4c15fb 100644
--- a/evalinstseg/metrics.py
+++ b/evalinstseg/metrics.py
@@ -1,5 +1,6 @@
 import logging
 import toml
+import numpy as np
 
 logger = logging.getLogger(__name__)
 
@@ -23,8 +24,22 @@ def __init__(self, fn):
     def save(self):
         """dump results to toml file."""
         logger.info("saving %s", self.fn)
+
+        def convert_numpy(obj):
+            if isinstance(obj, np.integer):
+                return int(obj)
+            elif isinstance(obj, np.floating):
+                return float(obj)
+            elif isinstance(obj, np.ndarray):
+                return obj.tolist()
+            elif isinstance(obj, dict):
+                return {k: convert_numpy(v) for k, v in obj.items()}
+            elif isinstance(obj, list):
+                return [convert_numpy(i) for i in obj]
+            return obj
+
         with open(self.fn + ".toml", "w") as tomlFl:
-            toml.dump(self.metricsDict, tomlFl)
+            toml.dump(convert_numpy(self.metricsDict), tomlFl)
 
     def addTable(self, name, dct=None):
         """add new sub-table to result dict
diff --git a/evalinstseg/summarize.py b/evalinstseg/summarize.py
index 2ed4f16..d7ee27f 100644
--- a/evalinstseg/summarize.py
+++ b/evalinstseg/summarize.py
@@ -205,8 +205,8 @@ def average_sets(acc_a, dict_a, acc_b, dict_b):
     gt_covs = list(dict_a["gt_covs"]) + list(dict_b["gt_covs"])
     num_gt = dict_a["general"]["Num GT"] + dict_b["general"]["Num GT"]
     tp_05 = dict_a["general"]["TP_05"] + dict_b["general"]["TP_05"]
-    tp_05_cldice = list(dict_a["general"]["tp_05_cldice"]) + \
-            list(dict_b["general"]["tp_05_cldice"])
+    tp_05_cldice = list(dict_a["general"]["TP_05_cldice"]) + \
+            list(dict_b["general"]["TP_05_cldice"])
 
     per_instance_counts = {}
     per_instance_counts["general"] = {
diff --git a/evalinstseg/util.py b/evalinstseg/util.py
index 8b4d8dc..6821767 100644
--- a/evalinstseg/util.py
+++ b/evalinstseg/util.py
@@ -38,6 +38,33 @@ def crop(arr, target_shape):
     return arr[leading + trailing]
 
 
+def load_partly_config(list_file_path):
+    """
+    Parses the sample_list_per_split.txt to identify which files 
+    are 'partly' labeled.
+    """
+    partly_samples = set()
+    is_partly_section = False
+    
+    with open(list_file_path, 'r') as f:
+        for line in f:
+            line = line.strip()
+            if not line: continue
+            
+            # Detect section headers
+            if "(1) samples for FlyLight completely:" in line:
+                is_partly_section = False
+            elif "(2) samples for FlyLight partly:" in line:
+                is_partly_section = True
+            elif line.endswith(":") and line[:-1] in ["train", "val", "test"]:
+                continue # Skip split headers
+            else:
+                # It's a filename
+                if is_partly_section:
+                    partly_samples.add(line)
+                    
+    return partly_samples
+
 def check_and_fix_sizes(gt_labels, pred_labels, ndim):
     """check if prediction and gt have same size, otherwise crop the bigger one
     add channel dimension if missing (channel_first)

From b7768e4fe4d5e53f0ed9e7e34381a5213086b11c Mon Sep 17 00:00:00 2001
From: Conscht <constantin.auga@gmx.de>
Date: Thu, 19 Feb 2026 12:42:03 +0100
Subject: [PATCH 2/5] Also return single outputs for each data type (partly,
 completely, combined) + Adjusted README and added detailed metric README in
 docs

---
 README.md                | 133 ++++++++++-----------
 docs/METRICS.md          | 142 ++++++++++++++++++++++
 evalinstseg/evaluate.py  | 248 ++++++++++++++++++---------------------
 evalinstseg/summarize.py |  38 ++++++
 4 files changed, 357 insertions(+), 204 deletions(-)
 create mode 100644 docs/METRICS.md

diff --git a/README.md b/README.md
index af2784f..57f39e3 100644
--- a/README.md
+++ b/README.md
@@ -13,12 +13,13 @@ This is the official implementation of the **FISBe (FlyLight Instance Segmentati
 
 The benchmark supports 2D and 3D segmentations and computes a wide range of commonly used evaluation metrics (e.g., AP, F1, coverage). Crucially, it provides specialized error attribution for topological errors (False Merges, False Splits) relevant to filamentous structures.
 
-### Features
-- **Standard Metrics:** AP, F1, Precision, Recall.
-- **FISBe Metrics:** Greedy many-to-many matching for False Merges (FM) and False Splits (FS).
+### Key Features
+- **Official Protocol:** Implements the exact ranking score ($S$) and matching logic defined in the FISBe paper.
+- **Topology-Aware:** Uses skeleton-based localization (`clDice`) to handle thin structures robustly.
+- **Error Attribution:** Explicitly quantifies False Merges (FM) and False Splits (FS) via many-to-many matching.
 - **Flexibility:** Supports HDF5 (`.hdf`, `.h5`) and Zarr (`.zarr`) files.
-- **Modes:** Run on single files, entire folders, or in stability analysis mode.
-- **Partly Labeled Data:** Robust evaluation ignoring background conflicts for sparse Ground Truth.
+- **Modes:** Single file, folder evaluation, or 3x stability analysis.
+- **Partly Labeled Support:** Robust evaluation that ignores background conflicts for sparse Ground Truth.
 
 ---
 
@@ -29,17 +30,14 @@ The recommended way to install is using `uv` (fastest) or `micromamba`.
 ### Option 1: Using `uv` (Fastest)
 
 ```bash
-# 1. Install uv (if not installed)
 pip install uv
-
-# 2. Clone and install
 git clone https://github.com/Kainmueller-Lab/evaluate-instance-segmentation.git
 cd evaluate-instance-segmentation
 uv venv
 uv pip install -e .
 ```
 
-### Option 2: Using `micromamba` or `conda`
+### Option 2: Using micromamba or conda
 
 ```bash
 micromamba create -n evalinstseg python=3.10
@@ -55,13 +53,13 @@ The `evalinstseg` command is automatically available after installation.
 
 ### 1. Evaluate a Single File
 ```bash
-evalinstseg `
-  --res_file tests/pred/R14A02-20180905_65_A6.hdf `
-  --res_key volumes/gmm_label_cleaned `
-  --gt_file tests/gt/R14A02-20180905_65_A6.zarr `
-  --gt_key volumes/gt_instances `
-  --split_file assets/sample_list_per_split.txt `
-  --out_dir tests/results `
+evalinstseg \
+  --res_file tests/pred/sample_01.hdf \
+  --res_key volumes/gmm_label_cleaned \
+  --gt_file tests/gt/sample_01.zarr \
+  --gt_key volumes/gt_instances \
+  --split_file assets/sample_list_per_split.txt \
+  --out_dir tests/results \
   --app flylight
 ```
 
@@ -69,12 +67,12 @@ evalinstseg `
 If you provide a directory path to `--res_file`, the tool will look for matching Ground Truth files in the `--gt_file` folder. Files are matched by name.
 
 ```bash
-evalinstseg `
-  --res_file /path/to/predictions_folder `
-  --res_key volumes/gmm_label_cleaned `
-  --gt_file /path/to/ground_truth_folder `
-  --gt_key volumes/gt_instances `
-  --out_dir /path/to/output_folder `
+evalinstseg \
+  --res_file /path/to/predictions_folder \
+  --res_key volumes/gmm_label_cleaned \
+  --gt_file /path/to/ground_truth_folder \
+  --gt_key volumes/gt_instances \
+  --out_dir /path/to/output_folder \
   --app flylight
 ```
 
@@ -91,11 +89,12 @@ evalinstseg \
 ```
 
 **Requirements:**
+
 - `--run_dirs`: Provide exactly 3 folders.
 - `--gt_file`: The folder containing Ground Truth files (filenames must match predictions).
 
 ### 4. Partly Labeled Data
-If your ground truth is sparse (not fully dense), use the `--partly` flag. T
+If your ground truth is sparse (not fully dense), use the `--partly` flag. See the **Partly Labeled Data Mode** section for details on how False Positives are handled.
 
 ## Usage: Python Package
 You can integrate the benchmark directly into your Python scripts or notebooks.
@@ -141,56 +140,46 @@ metrics = evaluate_volume(
     add_general_metrics=["false_merge", "false_split"]
 )
 ```
-### 4. Partly Labeled Data (`--partly`)
-Some samples contain sparse / incomplete GT annotations. In this setting, counting all unmatched predictions as false positives is not meaningful.
 
-When `--partly` is enabled, we approximate FP by counting only **unmatched predictions whose best match is a foreground GT instance** (based on the localization matrix used for evaluation, e.g. clPrecision for `cldice`).  
-Unmatched predictions whose best match is **background** are ignored.
+## FISBe Benchmark Protocol
+For a complete reference of all calculated metrics, see [docs/METRICS.md](docs/METRICS.md).
+> **Note:** Some output keys use internal names; see the documentation for the exact mapping to website/leaderboard columns.
 
-Concretely, we compute for each unmatched prediction the index of the GT label with maximal overlap score; it is counted as FP only if that index is > 0 (foreground), not 0 (background).
+### Official FlyLight Configuration (`--app flylight`)
+The `flylight` preset implements the specific metrics described in the FISBe paper for evaluating long-range thin filamentous neuronal structures.
 
----
+**Primary Ranking Score ($S$)**
+The single scalar used to rank methods on the leaderboard:
+$$S = 0.5 \cdot \text{avF1} + 0.5 \cdot C$$
 
-## Metrics Explanation
-
-### 1. Standard Instance Metrics (TP/FP/FN, F-score, AP proxy)
-These metrics are computed from a **one-to-one matching** between GT and prediction instances (Hungarian or greedy), using a chosen localization criterion (default for FlyLight is `cldice`).
-
-- **TP**: matched pairs above threshold  
-- **FP**: unmatched predictions (or, in `--partly`, only those whose best match is foreground)  
-- **FN**: unmatched GT instances  
-- **precision** = TP / (TP + FP)  
-- **recall** = TP / (TP + FN)  
-- **fscore** = 2 * precision * recall / (precision + recall)  
-- **AP**: we report a simple AP proxy `precision × recall` at each threshold and average it across thresholds (this is not COCO-style AP).
-
-### 2. FISBe Error Attribution (False Splits / False Merges)
-False splits (FS) and false merges (FM) aim to quantify **instance topology errors** for long-range thin filamentous structures.
-
-We compute FS/FM using **greedy many-to-many matching with consumption**:
-- Candidate GT–Pred pairs above threshold are processed in descending score order.
-- After selecting a match, we update “available” pixels so that already explained structure is not matched again.
-- FS counts when one GT is explained by multiple preds (excess preds per GT).
-- FM counts when one pred explains multiple GTs (excess GTs per pred).
-
-This produces an explicit attribution of split/merge errors rather than only TP/FP/FN.
-
-### Metric Definitions
-
-#### Instance-Level (per threshold)
-| Metric | Description |
-| :--- | :--- |
-| **AP_TP** | True Positives (1-to-1 match) |
-| **AP_FP** | False Positives (unmatched preds; in `--partly`: only unmatched preds whose best match is foreground) |
-| **AP_FN** | False Negatives (unmatched GT) |
-| **precision** | TP / (TP + FP) |
-| **recall** | TP / (TP + FN) |
-| **fscore** | Harmonic mean of precision and recall |
-
-#### Global / FISBe
-| Metric | Description |
-| :--- | :--- |
-| **avAP** | Mean AP proxy across thresholds ≥ 0.5 |
-| **FM** | False Merges (many-to-many matching with consumption) |
-| **FS** | False Splits (many-to-many matching with consumption) |
-| **avg_gt_skel_coverage** | Mean skeleton coverage of GT instances by associated predictions (association via best-match mapping) |
+**Key Metrics**
+- **avF1**: Average F1 score across clDice thresholds.
+- **C (Coverage)**: Average ground truth skeleton coverage (assigned via max clPrecision; score via clRecall on union of matches).  
+- **clDiceTP**: Average clDice score of matched True Positives (at threshold 0.5).
+- **tp**: Relative number of True Positives at threshold 0.5 ($TP_{0.5} / N_{GT}$).
+- **FS (False Splits)**: $\sum_{gt} \max(0, N_{\text{assigned\_pred}} - 1)$
+- **FM (False Merges)**: $\sum_{pred} \max(0, N_{\text{assigned\_gt}} - 1)$
+
+### Partly Labeled Data Mode (`--partly`)
+FISBe includes 71 partly labeled images where only a subset of neurons is annotated.
+- **Logic**: Unmatched predictions are only counted as False Positives if they match a **Foreground GT instance**.
+- **Background Exclusion**: Predictions matching background (unlabeled regions) are ignored.
+
+## Output Structure
+Metrics returned by the API or saved to disk are grouped into category-specific dictionaries:
+
+```python
+metrics["confusion_matrix"]
+├── TP / FP / FN         # Counts across all images
+├── precision / recall   # Standard detection metrics
+└── avAP                 # Mean precision × recall proxy
+
+metrics["general"]
+├── aggregate_score      # S (Official Ranking Score)
+├── avg_gt_skel_coverage # C (Coverage)
+├── FM                   # Global False Merge count
+└── FS                   # Global False Split count
+
+metrics["curves"]
+└── F1_0.1 … F1_0.9     # Per-threshold performance
+```
\ No newline at end of file
diff --git a/docs/METRICS.md b/docs/METRICS.md
new file mode 100644
index 0000000..9b73935
--- /dev/null
+++ b/docs/METRICS.md
@@ -0,0 +1,142 @@
+# FISBe Metrics Reference
+
+This document details all evaluation metrics computed by the pipeline.
+
+## 1. Official FlyLight Benchmark Metrics (`--app flylight`)
+These metrics determine the **FISBe Leaderboard** ranking. They are designed for long-range, thin filamentous structures.
+
+### Primary Ranking Score ($S$)
+The single scalar used to rank methods.
+$$S = 0.5 \cdot \text{avF1} + 0.5 \cdot C$$
+
+### Website leaderboard column mapping
+| Website column | Meaning | Pipeline key |
+|---|---|---|
+| S | $0.5 \cdot \text{avF1} + 0.5 \cdot C$ | `general.aggregate_score` |
+| avF1 | mean F1 over th=0.1..0.9 | `confusion_matrix.avFscore19` |
+| C | avg GT coverage (union-of-preds, clRecall) | `general.avg_gt_skel_coverage` |
+| clDiceTP | mean clDice of TP matches at th=0.5 | `general.avg_TP_05_cldice` |
+| tp | (#TP matches at th=0.5) / (#GT) | `general.TP_05_rel` |
+| FS | false splits count | `general.FS` |
+| FM | false merges count | `general.FM` |
+
+---
+
+### Official Ranking Metrics Definition
+| Metric Key | Definition | Matching Strategy |
+| :--- | :--- | :--- |
+| **`avFscore19`** (avF1) | Mean F1 score averaged over clDice thresholds 0.1 to 0.9. | Greedy 1-to-1 |
+| **`avg_gt_skel_coverage`** (C) | Average GT centerline coverage computed per GT using the union of matched predictions (see below). | One-to-Many |
+
+#### Coverage (C) definition (official protocol)
+**Localization:** compute clPrecision for all (pred, gt) pairs.  
+**Matching (one-to-many):** assign each predicted instance to the GT instance with the highest clPrecision score (a GT can receive multiple predictions).  
+**Computation:** for each GT, compute clRecall using the union of all predictions assigned to that GT (to avoid double-counting in overlaps), then average over all GT instances.
+
+#### avF1 computation details (official protocol)
+For each threshold th ∈ {0.1, 0.2, …, 0.9} after greedy 1-to-1 matching by clDice:
+- **TP**: matched (pred, gt) with clDice > th
+- **FP**: unmatched predicted instances
+- **FN**: unmatched GT instances
+
+Compute F1 = 2TP / (2TP + FP + FN) aggregated across all images, then average across thresholds.
+
+### Topology Error Attribution
+Explicit counts of topological errors specific to filamentous structures.
+
+| Metric Key | Definition | Matching Strategy |
+| :--- | :--- | :--- |
+| **`FS`** (False Splits) | $\sum_{gt} \max(0, N_{\text{assigned\_pred}} - 1)$ | Many-to-Many (Consumption) |
+| **`FM`** (False Merges) | $\sum_{pred} \max(0, N_{\text{assigned\_gt}} - 1)$ | Many-to-Many (Consumption) |
+
+### Localization Criteria
+**clDice (Benchmark Default)**
+Centerline Dice evaluates the agreement between skeletonized structures. It is the robust choice for thin filamentous objects, where standard pixel-level IoU is highly sensitive to small boundary variations.
+- **clPrecision**: Fraction of the predicted skeleton lying within the ground truth mask.
+- **clRecall**: Fraction of the ground truth skeleton covered by the predicted mask.
+- **clDice**: Harmonic mean of centerline precision and recall.
+
+### Matching Strategies
+**1. Greedy One-to-One Matching**
+Pairs are sorted by localization score and matched if both remain unassigned.
+*Used for: TP/FP/FN counts, F1-scores, and Precision/Recall curves.*
+
+**2. One-to-Many Matching**
+Assigns each predicted instance to the single ground truth instance with which it shares the highest clPrecision.
+*Used for: Average GT Coverage (C).*
+
+**3. Greedy Many-to-Many Matching with Consumption**
+Iteratively matches GT and predictions while "consuming" pixels—removing them from availability once matched. This prevents double-counting of overlapping structures.
+*Used for: False Splits (FS) and False Merges (FM).*
+
+---
+
+## 2. Standard Instance Metrics (Reported per Threshold)
+These metrics are computed for every threshold (e.g., `th_0_5`) using **Greedy 1-to-1 matching**.
+
+| Metric Key | Description |
+| :--- | :--- |
+| **`AP_TP`** | True Positives count. |
+| **`AP_FP`** | False Positives count (filtered in `--partly` mode). |
+| **`AP_FN`** | False Negatives count. |
+| **`precision`** | $TP / (TP + FP)$ |
+| **`recall`** | $TP / (TP + FN)$ |
+| **`fscore`** | Harmonic mean of precision and recall. |
+| **`AP`** | Precision $\times$ Recall (at specific threshold). |
+
+> **Note on Averages:** The code also aggregates these across thresholds:
+> * **`avAP`** (or `avAP59`): Mean AP over thresholds **0.5 to 0.9**.
+> * **`avFscore`** (or `avFscore19`): Mean F1 over thresholds **0.1 to 0.9**.
+
+---
+
+## 3. Diagnostic & Sub-group Metrics
+The code calculates these additional metrics for deep-dive analysis. They do not affect the primary ranking score.
+
+### General Stats
+| Metric Key | Description |
+| :--- | :--- |
+| **`Num GT`** | Total number of ground truth instances. |
+| **`Num Pred`** | Total number of predicted instances. |
+| **`TP_05`** | Count of True Positives at threshold 0.5. |
+| **`TP_05_rel`** | Fraction of GT instances detected at threshold 0.5 ($TP_{0.5} / N_{\mathrm{GT}}$). |
+
+### Quality of Matches
+> **Note:** `avg_TP_05_cldice` corresponds to the leaderboard metric **clDiceTP**, and `TP_05_rel` corresponds to **tp**.
+
+| Metric Key | Description |
+| :--- | :--- |
+| **`avg_TP_05_cldice`** | Average `clDice` score of matched True Positives (at $th=0.5$). Indicates segmentation quality of detected objects. |
+| **`TP_05_cldice`** | List of the individual `clDice` scores of all TP pairs at $th=0.5$. |
+
+### Challenging Subsets (Dim & Overlapping)
+Metrics evaluated only on specific subsets of Ground Truth neurons.
+
+| Metric Key | Description |
+| :--- | :--- |
+| **`GT_dim`** | Total number of dim (low contrast) GT neurons. |
+| **`TP_05_dim`** | Number of dim neurons correctly detected ($\text{clDice} > 0.5$). |
+| **`TP_05_rel_dim`** | Fraction of dim neurons detected. |
+| **`avg_gt_cov_dim`** | Coverage ($C$) score computed only on dim neurons. |
+| **`GT_overlap`** | Total number of GT neurons involved in overlaps. |
+| **`TP_05_overlap`** | Number of overlapping neurons correctly detected ($\text{clDice} > 0.5$). |
+| **`TP_05_rel_overlap`** | Fraction of overlapping neurons detected. |
+| **`avg_gt_cov_overlap`** | Coverage ($C$) score computed only on overlapping neurons. |
+
+---
+
+## 4. Evaluation Logic
+
+### Localization Criterion
+* **Default:** `clDice` (Centerline Dice). Robust to boundary variations in thin structures.
+* **Options:** `iou` (Intersection over Union) is available via API but not used for the benchmark.
+
+### Matching Strategies
+* **Greedy 1-to-1:** Used for all standard detection metrics (TP, FP, F1).
+* **One-to-Many:** Used for Coverage ($C$). Assigns each prediction to the GT instance with the highest clPrecision.
+* **Many-to-Many (Consumption):** Used for FS/FM. "Consumes" pixels to prevent double-counting in overlaps.
+
+### Partly Labeled Data (`--partly`)
+Handles sparse Ground Truth where not all neurons are annotated.
+* **Logic:** Unmatched predictions are counted as False Positives (**FP**) *only* if their best match is a **Foreground** GT instance.
+* **Background:** Predictions matching background (unlabeled regions) are ignored.
diff --git a/evalinstseg/evaluate.py b/evalinstseg/evaluate.py
index 6c42ad6..fd082e2 100644
--- a/evalinstseg/evaluate.py
+++ b/evalinstseg/evaluate.py
@@ -27,8 +27,7 @@
 from .visualize import visualize_neurons, visualize_nuclei
 from .summarize import (
     summarize_metric_dict,
-    average_flylight_score_over_instances,
-    average_sets,
+    aggregate_and_report,
 )
 
 logger = logging.getLogger(__name__)
@@ -473,6 +472,31 @@ def evaluate_volume(
     return metrics
 
 
+def print_and_collect_stats(mode_name, score_list, num_expected_runs):
+    """Print mean ± std per metric across runs and return the rows for export.
+
+    Metrics are taken from the keys of the first entry of ``score_list``; a
+    metric is reported only if exactly ``num_expected_runs`` runs provide it.
+    Returns a list of dicts with keys "Mode", "Metric", "Mean" and "StdDev"
+    (empty if ``score_list`` is empty).
+    """
+    rows = []
+    if not score_list:
+        return rows
+
+    print(f"\n--- {mode_name} ---")
+    for metric in score_list[0].keys():
+        per_run = [run[metric] for run in score_list if metric in run]
+        if len(per_run) != num_expected_runs:
+            continue  # incomplete across runs: skip rather than average a subset
+        mean_val, std_val = np.mean(per_run), np.std(per_run)
+        print(f"{metric:<30}: {mean_val:.4f} ± {std_val:.4f}")
+        rows.append(
+            {"Mode": mode_name, "Metric": metric, "Mean": mean_val, "StdDev": std_val}
+        )
+    return rows
+
+
 # TODO: option to just pass config (toml) file instead of flags
 def main():
     """main entry point if called from command line
@@ -691,7 +715,8 @@ def _run_loop(res_files, gt_files, out_dirs):
             loop_metrics.append(metric_dict)
             loop_samples.append(sample_name)
             actual_partly_flags.append(is_partly)
-            print(f"Evaluated {sample_name}: {metric_dict}, Partly={is_partly}")
+            # print(f"Evaluated {sample_name}: {metric_dict}, Partly={is_partly}")
+            print(f"Evaluated {sample_name}", "Partly:", is_partly)
             
         return loop_metrics, loop_samples, np.array(actual_partly_flags, dtype=bool)
 
@@ -748,145 +773,104 @@ def _run_loop(res_files, gt_files, out_dirs):
         args.fs_thresh = 0.05
         args.eval_dim = True
 
-    # Stability Mode (Wraps logic 3 times)
+    # Determine runs: Stability (3 runs) or Normal (1 run)
+    run_configs = []
     if args.stability_mode:
         if not args.run_dirs or len(args.run_dirs) != 3:
-            raise ValueError("Stability mode requires exactly 3 directories passed to --run_dirs")
+            raise ValueError("Stability mode requires exactly 3 directories.")
+        for i, rdir in enumerate(args.run_dirs):
+            run_configs.append({
+                "res_dir": rdir, 
+                "out_dir": os.path.join(args.out_dir[0], f"seed_{i+1}")
+            })
+    else:
+        # Detect if res_file is a directory or a list of specific files
+        if len(args.res_file) == 1 and os.path.isdir(args.res_file[0]) and not args.res_file[0].endswith(".zarr"):
+            run_configs.append({"res_dir": args.res_file[0], "out_dir": args.out_dir[0]})
+        else:
+            run_configs.append({"files": args.res_file, "out_dir": args.out_dir[0]})
+
+    all_run_data = []
+
+    # --- 2. UNIFIED EVALUATION LOOP ---
+    for run in run_configs:
+        # File discovery for this run
+        if "res_dir" in run:
+            res_files = natsorted(glob.glob(run["res_dir"] + "/*.hdf")) or \
+                        natsorted(glob.glob(run["res_dir"] + "/*.zarr"))
+            gt_files = [get_gt_file(fn, args.gt_file[0]) for fn in res_files]
+        else:
+            res_files = run["files"]
+            gt_files = args.gt_file # Assumes lists are already aligned
         
-        # Lists to store the 3 run results for each mode
-        scores_complete = []
-        scores_partly = []
-        scores_combined = []
+        # Prepare out_dirs list for _run_loop (it expects a list corresponding to files)
+        # Note: In stability mode logic of original code: run_out_dirs = [os.path.join(args.out_dir[0], f"seed_{run_idx+1}")] * len(run_res_files)
+        # In normal mode logic of original code: outdir_list = args.out_dir * len(args.res_file) (if 1 out dir)
         
-        print("--- 3x STABILITY MODE ---")
+        # Here run["out_dir"] is a single string for the run.
+        out_dirs = [run["out_dir"]] * len(res_files)
         
-        for run_idx, run_dir in enumerate(args.run_dirs):
-            print(f"Processing Run {run_idx+1}: {run_dir}")
-            
-            # Auto-detect files for this run
-            run_res_files = natsorted(glob.glob(run_dir + "/*.hdf")) 
-            if not run_res_files: 
-                run_res_files = natsorted(glob.glob(run_dir + "/*.zarr"))
-            
-            # Assume gt_file is the PARENT folder
-            run_gt_files = [get_gt_file(fn, args.gt_file[0]) for fn in run_res_files]
-            run_out_dirs = [os.path.join(args.out_dir[0], f"seed_{run_idx+1}")] * len(run_res_files)
-            
-            # Run the inner loop
-            metric_dicts, samples, actual_partly_flags = _run_loop(run_res_files, run_gt_files, run_out_dirs)
-            
-            metrics_full = {s: m for m, s in zip(metric_dicts, samples) if m is not None}
-            s_arr = np.array(samples)
-
-            # CALCULATE SEPARATE MODES
-            if len(np.unique(actual_partly_flags)) > 1:
-                # Mode 1: Complete Only
-                acc_cpt, acc_inst_cpt = average_flylight_score_over_instances(s_arr[~actual_partly_flags], metrics_full)
-                scores_complete.append(acc_cpt)
-
-                # Mode 2: Partly Only
-                acc_prt, acc_inst_prt = average_flylight_score_over_instances(s_arr[actual_partly_flags], metrics_full)
-                scores_partly.append(acc_prt)
-
-                # Mode 3: Combined
-                acc_comb, _ = average_sets(acc_cpt, acc_inst_cpt, acc_prt, acc_inst_prt)
-                scores_combined.append(acc_comb)
-            else:
-                # Uniform case
-                acc, _ = average_flylight_score_over_instances(samples, metrics_full)
-                scores_combined.append(acc)
-
-        # Print Average and Std Dev across runs
-        print("\n=== FISBe BENCHMARK RESULTS (Mean ± Std) ===")
+        # Evaluate all files in the run
+        metric_dicts, samples, flags = _run_loop(res_files, gt_files, out_dirs)
         
-        def print_stats(title, score_list):
-            if not score_list: return
-            print(f"\n--- {title} ---")
-            for key in score_list[0].keys():
-                vals = [s[key] for s in score_list if key in s]
-                if len(vals) == 3:
-                    print(f"{key:<30}: {np.mean(vals):.4f} ± {np.std(vals):.4f}")
+        # Aggregate scores (Combined, Partly, Complete)
+        scores = aggregate_and_report(metric_dicts, samples, flags)
         
-        # Report all 3
-        print_stats("MODE 1: COMPLETELY LABELED", scores_complete)
-        print_stats("MODE 2: PARTLY LABELED", scores_partly)
-        print_stats("MODE 3: COMBINED (MAIN BENCHMARK)", scores_combined)
-
-    # Normal Mode 
-    else:
-        print("--- EVALUATE USING SINGLE DIR ---")
-        # shortcut if res_file and gt_file contain folders
-        if len(args.res_file) == 1 and len(args.gt_file) == 1:
-            res_file = args.res_file[0]
-            gt_file = args.gt_file[0]
-            if (os.path.isdir(res_file) and not res_file.endswith(".zarr")) and (
-                os.path.isdir(gt_file) and not gt_file.endswith(".zarr")
-            ):
-                args.res_file = natsorted(glob.glob(res_file + "/*.hdf"))
-                args.gt_file = [get_gt_file(fn, gt_file) for fn in args.res_file]
-
-        # check same length for result and gt files
-        assert len(args.res_file) == len(args.gt_file), (
-            "Please check, not the same number of result and gt files"
-        )
+        all_run_data.append({
+            "metrics": metric_dicts,
+            "samples": samples,
+            "flags": flags,
+            "scores": scores # Tuple: (res_cpt, res_prt, res_comb)
+        })
+
+    # --- 3. FINAL REPORTING ---
+    if not args.stability_mode:
+        # MODE: Normal (Single Run)
+        run = all_run_data[0]
+        res_cpt, res_prt, res_comb = run["scores"]
+        summary_path = args.summary_out_dir or args.out_dir[0]
         
-        # We don't need partly_list prep here anymore as we auto-detect
-        # But we still check validity if user provided it
-        if args.partly_list is not None:
-             assert len(args.partly_list) == len(args.res_file)
-
-        # check out_dir
-        if len(args.res_file) > 1:
-            if len(args.out_dir) > 1:
-                assert len(args.res_file) == len(args.out_dir), (
-                    "Please check, number of input files and output folders should correspond"
-                )
-                outdir_list = args.out_dir
-            else:
-                outdir_list = args.out_dir * len(args.res_file)
-        else:
-            assert len(args.out_dir) == 1, "Please check number of output directories"
-            outdir_list = args.out_dir
-        # check output dir for summary
-        if args.summary_out_dir is None:
-            args.summary_out_dir = args.out_dir[0]
-
-        metric_dicts, samples, actual_partly_flags = _run_loop(args.res_file, args.gt_file, outdir_list)
-
-        # aggregate over instances
-        metrics_full = {}
-        acc_all_instances = None
-        for metric_dict, sample in zip(metric_dicts, samples):
-            if metric_dict is None:
-                continue
-            metrics_full[sample] = metric_dict
+        if args.summary:
+            # Save Main Summary
+            summarize_metric_dict(run["metrics"], run["samples"], args.summary, 
+                                 os.path.join(summary_path, "summary.csv"), 
+                                 agg_inst_dict=res_comb[1])
             
-        if len(np.unique(actual_partly_flags)) > 1:
-            print("averaging for combined")
-            # get average over instances for completely
-            samples = np.array(samples)
-            acc_cpt, acc_inst_cpt = average_flylight_score_over_instances(
-                samples[actual_partly_flags == False], metrics_full
-            )
-            acc_prt, acc_inst_prt = average_flylight_score_over_instances(
-                samples[actual_partly_flags == True], metrics_full
-            )
-            acc, acc_all_instances = average_sets(
-                acc_cpt, acc_inst_cpt, acc_prt, acc_inst_prt
-            )
+            # Save Split Summaries if mixed data exists
+            if res_cpt is not None:
+                s_arr = np.array(run["samples"])
+                summarize_metric_dict([m for m, f in zip(run["metrics"], run["flags"]) if not f], 
+                                     s_arr[~run["flags"]], args.summary, 
+                                     os.path.join(summary_path, "summary_complete.csv"), 
+                                     agg_inst_dict=res_cpt[1])
+                summarize_metric_dict([m for m, f in zip(run["metrics"], run["flags"]) if f], 
+                                     s_arr[run["flags"]], args.summary, 
+                                     os.path.join(summary_path, "summary_partly.csv"), 
+                                     agg_inst_dict=res_prt[1])
+    else:
+        # MODE: Stability (Mean ± Std)
+        print("\n=== FISBe BENCHMARK RESULTS (Mean ± Std) ===")
+        # Extract mean dicts from each run
+        cpt_scores = [r["scores"][0][0] for r in all_run_data if r["scores"][0]]
+        prt_scores = [r["scores"][1][0] for r in all_run_data if r["scores"][1]]
+        comb_scores = [r["scores"][2][0] for r in all_run_data if r["scores"][2]]
+        
+        stability_rows = []
+        stability_rows += print_and_collect_stats("MODE 1: COMPLETE", cpt_scores, 3)
+        stability_rows += print_and_collect_stats("MODE 2: PARTLY", prt_scores, 3)
+        stability_rows += print_and_collect_stats("MODE 3: COMBINED", comb_scores, 3)
+        
+        # Export Stability CSV
+        if stability_rows:
+            import pandas as pd
+            out_csv = os.path.join(args.out_dir[0], "stability_report.csv")
+            try:
+                df = pd.DataFrame(stability_rows)
+                df.to_csv(out_csv, index=False)
+                print(f"\nStability report saved to: {out_csv}")
+            except ImportError:
+                 print("\nWarning: pandas not found, skipping CSV export of stability report.")
 
-        else:
-            acc, acc_all_instances = average_flylight_score_over_instances(
-                samples, metrics_full
-            )
-        if args.summary:
-            summarize_metric_dict(
-                metric_dicts,
-                samples,
-                args.summary,
-                os.path.join(args.summary_out_dir, "summary.csv"),
-                agg_inst_dict=acc_all_instances,
-            )
 
 
 if __name__ == "__main__":
diff --git a/evalinstseg/summarize.py b/evalinstseg/summarize.py
index d7ee27f..24744fb 100644
--- a/evalinstseg/summarize.py
+++ b/evalinstseg/summarize.py
@@ -240,3 +240,41 @@ def average_sets(acc_a, dict_a, acc_b, dict_b):
                     cm_a["AP_FN"] + cm_b["AP_FN"]
     return acc, per_instance_counts
 
+
+def aggregate_and_report(metric_dicts, samples, partly_flags):
+    """Aggregate per-sample metrics into complete / partly / combined scores.
+
+    Args:
+        metric_dicts: per-sample metric dicts; ``None`` entries are skipped.
+        samples: sample names, aligned with ``metric_dicts``.
+        partly_flags: booleans aligned with ``samples`` (True = partly labeled).
+
+    Returns:
+        ``(complete, partly, combined)``, each an ``(acc, per_instance)`` pair.
+        ``complete`` and ``partly`` are ``None`` unless both kinds of samples
+        are present. If aggregation fails the error is printed and
+        ``combined`` is ``({}, {})``.
+    """
+    try:
+        # Accept plain lists too: ``list == False`` is a scalar, not a mask.
+        flags = np.asarray(partly_flags, dtype=bool)
+        s_arr = np.asarray(samples)
+        metrics_full = {s: m for m, s in zip(metric_dicts, samples) if m is not None}
+
+        if flags.any() and not flags.all():
+            if flags.shape != s_arr.shape:
+                raise ValueError(f"{flags.size} partly flags for {s_arr.size} samples")
+            # 1. Complete, 2. Partly, 3. Combined
+            acc_cpt, inst_cpt = average_flylight_score_over_instances(s_arr[~flags], metrics_full)
+            acc_prt, inst_prt = average_flylight_score_over_instances(s_arr[flags], metrics_full)
+            acc_comb, inst_comb = average_sets(acc_cpt, inst_cpt, acc_prt, inst_prt)
+            return (acc_cpt, inst_cpt), (acc_prt, inst_prt), (acc_comb, inst_comb)
+
+        # Uniform data: only the combined score exists.
+        acc, inst = average_flylight_score_over_instances(samples, metrics_full)
+        return None, None, (acc, inst)
+    except Exception as e:  # best effort: report and let the caller continue
+        print(f"Error in aggregation: {e}")
+        # ``acc`` must be a dict: the stability report calls ``.keys()`` on it.
+        return None, None, ({}, {})
+

From 40d6e647b29b356238866c23d13f7037143f7663 Mon Sep 17 00:00:00 2001
From: Conscht <constantin.auga@gmx.de>
Date: Thu, 19 Feb 2026 12:46:56 +0100
Subject: [PATCH 3/5] fixed math display in readme

---
 README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 57f39e3..d413537 100644
--- a/README.md
+++ b/README.md
@@ -152,13 +152,14 @@ The `flylight` preset implements the specific metrics described in the FISBe pap
 The single scalar used to rank methods on the leaderboard:
 $$S = 0.5 \cdot \text{avF1} + 0.5 \cdot C$$
 
-**Key Metrics**
+### Key Metrics
+
 - **avF1**: Average F1 score across clDice thresholds.
-- **C (Coverage)**: Average ground truth skeleton coverage (assigned via max clPrecision; score via clRecall on union of matches).  
+- **C (Coverage)**: Average ground truth skeleton coverage (assigned via max clPrecision; score via clRecall on union of matches).
 - **clDiceTP**: Average clDice score of matched True Positives (at threshold 0.5).
-- **tp**: Relative number of True Positives at threshold 0.5 ($TP_{0.5} / N_{GT}$).
-- **FS (False Splits)**: $\sum_{gt} \max(0, N_{\text{assigned\_pred}} - 1)$
-- **FM (False Merges)**: $\sum_{pred} \max(0, N_{\text{assigned\_gt}} - 1)$
+- **tp**: Relative number of True Positives at threshold 0.5 ($TP_{0.5} / N_{\mathrm{GT}}$).
+- **FS (False Splits)**: $\sum_{\mathrm{gt}} \max(0, N_{\text{assigned\_pred}} - 1)$
+- **FM (False Merges)**: $\sum_{\mathrm{pred}} \max(0, N_{\text{assigned\_gt}} - 1)$
 
 ### Partly Labeled Data Mode (`--partly`)
 FISBe includes 71 partly labeled images where only a subset of neurons is annotated.

From 07e7f7bd1f4b0ad06644be96e0933e35d29b6470 Mon Sep 17 00:00:00 2001
From: Conscht <constantin.auga@gmx.de>
Date: Thu, 19 Feb 2026 12:48:49 +0100
Subject: [PATCH 4/5] removed latex code to fix readme

---
 README.md | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index d413537..6a17133 100644
--- a/README.md
+++ b/README.md
@@ -153,13 +153,12 @@ The single scalar used to rank methods on the leaderboard:
 $$S = 0.5 \cdot \text{avF1} + 0.5 \cdot C$$
 
 ### Key Metrics
-
 - **avF1**: Average F1 score across clDice thresholds.
-- **C (Coverage)**: Average ground truth skeleton coverage (assigned via max clPrecision; score via clRecall on union of matches).
-- **clDiceTP**: Average clDice score of matched True Positives (at threshold 0.5).
-- **tp**: Relative number of True Positives at threshold 0.5 ($TP_{0.5} / N_{\mathrm{GT}}$).
-- **FS (False Splits)**: $\sum_{\mathrm{gt}} \max(0, N_{\text{assigned\_pred}} - 1)$
-- **FM (False Merges)**: $\sum_{\mathrm{pred}} \max(0, N_{\text{assigned\_gt}} - 1)$
+- **C (Coverage)**: Average GT skeleton coverage (assignment via max clPrecision; scoring via clRecall on union of matches).
+- **clDiceTP**: Average clDice score of matched TPs at threshold 0.5.
+- **tp**: Relative number of TPs at threshold 0.5 (`TP_0.5 / N_GT`).
+- **FS (False Splits)**: Sum over GT of `max(0, N_assigned_pred - 1)`.
+- **FM (False Merges)**: Sum over predictions of `max(0, N_assigned_gt - 1)`.
 
 ### Partly Labeled Data Mode (`--partly`)
 FISBe includes 71 partly labeled images where only a subset of neurons is annotated.

From 84c9c6de0d7b8e541f8308b48cd667154a38f920 Mon Sep 17 00:00:00 2001
From: Conscht <constantin.auga@gmx.de>
Date: Thu, 19 Feb 2026 12:54:15 +0100
Subject: [PATCH 5/5] fixed latex code in METRICS.md

---
 docs/METRICS.md | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/docs/METRICS.md b/docs/METRICS.md
index 9b73935..3b950ed 100644
--- a/docs/METRICS.md
+++ b/docs/METRICS.md
@@ -16,17 +16,17 @@ $$S = 0.5 \cdot \text{avF1} + 0.5 \cdot C$$
 | avF1 | mean F1 over th=0.1..0.9 | `confusion_matrix.avFscore19` |
 | C | avg GT coverage (union-of-preds, clRecall) | `general.avg_gt_skel_coverage` |
 | clDiceTP | mean clDice of TP matches at th=0.5 | `general.avg_TP_05_cldice` |
-| tp | (#TP matches at th=0.5) / (#GT) | `general.TP_05_rel` |
+| tp | $(\#TP \text{ at } th=0.5) / (\#GT)$ | `general.TP_05_rel` |
 | FS | false splits count | `general.FS` |
 | FM | false merges count | `general.FM` |
 
 ---
 
-### Official Ranking Metrics Definition
+### Official ranking metrics definition
 | Metric Key | Definition | Matching Strategy |
 | :--- | :--- | :--- |
-| **`avFscore19`** (avF1) | Mean F1 score averaged over clDice thresholds 0.1 to 0.9. | Greedy 1-to-1 |
-| **`avg_gt_skel_coverage`** (C) | Average GT centerline coverage computed per GT using the union of matched predictions (see below). | One-to-Many |
+| **`confusion_matrix.avFscore19`** (avF1) | Mean F1 score averaged over clDice thresholds 0.1 to 0.9. | Greedy 1-to-1 |
+| **`general.avg_gt_skel_coverage`** (C) | Average GT centerline coverage computed per GT using the union of matched predictions (see below). | One-to-Many |
 
 #### Coverage (C) definition (official protocol)
 **Localization:** compute clPrecision for all (pred, gt) pairs.  
@@ -41,13 +41,14 @@ For each threshold th ∈ {0.1, 0.2, …, 0.9} after greedy 1-to-1 matching by c
 
 Compute F1 = 2TP / (2TP + FP + FN) aggregated across all images, then average across thresholds.
 
-### Topology Error Attribution
+### Topology error attribution
 Explicit counts of topological errors specific to filamentous structures.
 
 | Metric Key | Definition | Matching Strategy |
 | :--- | :--- | :--- |
-| **`FS`** (False Splits) | $\sum_{gt} \max(0, N_{\text{assigned\_pred}} - 1)$ | Many-to-Many (Consumption) |
-| **`FM`** (False Merges) | $\sum_{pred} \max(0, N_{\text{assigned\_gt}} - 1)$ | Many-to-Many (Consumption) |
+| **`general.FS`** (False Splits) | $\sum_{\mathrm{gt}} \max(0, N_{\text{assigned pred}} - 1)$ | Many-to-Many (Consumption) |
+| **`general.FM`** (False Merges) | $\sum_{\mathrm{pred}} \max(0, N_{\text{assigned gt}} - 1)$ | Many-to-Many (Consumption) |
+
 
 ### Localization Criteria
 **clDice (Benchmark Default)**