From cb61d74c74dcf8c700fc0cf5bd4181201c4e7d26 Mon Sep 17 00:00:00 2001 From: Conscht Date: Tue, 17 Feb 2026 15:01:38 +0100 Subject: [PATCH 1/5] Automatically evaluate partly, fully labeled, and combined data --- README.md | 25 ++-- assets/sample_list_per_split.txt | 117 +++++++++++++++ evalinstseg/evaluate.py | 236 ++++++++++++++++++------------- evalinstseg/metrics.py | 17 ++- evalinstseg/summarize.py | 4 +- evalinstseg/util.py | 27 ++++ 6 files changed, 310 insertions(+), 116 deletions(-) create mode 100644 assets/sample_list_per_split.txt diff --git a/README.md b/README.md index 6cf2540..af2784f 100644 --- a/README.md +++ b/README.md @@ -55,12 +55,13 @@ The `evalinstseg` command is automatically available after installation. ### 1. Evaluate a Single File ```bash -evalinstseg \ - --res_file tests/pred/R14A02-20180905.hdf \ - --res_key volumes/labels \ - --gt_file tests/gt/R14A02-20180905.zarr \ - --gt_key volumes/gt_instances \ - --out_dir tests/results \ +evalinstseg ` + --res_file tests/pred/R14A02-20180905_65_A6.hdf ` + --res_key volumes/gmm_label_cleaned ` + --gt_file tests/gt/R14A02-20180905_65_A6.zarr ` + --gt_key volumes/gt_instances ` + --split_file assets/sample_list_per_split.txt ` + --out_dir tests/results ` --app flylight ``` @@ -68,12 +69,12 @@ evalinstseg \ If you provide a directory path to `--res_file`, the tool will look for matching Ground Truth files in the `--gt_file` folder. Files are matched by name. 
```bash -evalinstseg \ - --res_file /path/to/predictions_folder \ - --res_key volumes/labels \ - --gt_file /path/to/ground_truth_folder \ - --gt_key volumes/gt_instances \ - --out_dir /path/to/output_folder \ +evalinstseg ` + --res_file /path/to/predictions_folder ` + --res_key volumes/gmm_label_cleaned ` + --gt_file /path/to/ground_truth_folder ` + --gt_key volumes/gt_instances ` + --out_dir /path/to/output_folder ` --app flylight ``` diff --git a/assets/sample_list_per_split.txt b/assets/sample_list_per_split.txt new file mode 100644 index 0000000..c6ca31b --- /dev/null +++ b/assets/sample_list_per_split.txt @@ -0,0 +1,117 @@ +(1) samples for FlyLight completely: + +train: +R38F04-20181005_63_G3 +R38F04-20181005_63_G5 +R38F04-20181005_63_H1 +R53A10-20181019_64_A4 +R75E01-20181030_64_D1 +VT008647-20171222_63_D2 +VT008647-20171222_63_D1 +VT008647-20171222_63_E1 +VT019303-20171013_65_B6 +VT019307-20171013_65_F1 +VT033051-20171128_61_E4 +VT033051-20171128_61_E2 +VT040433-20170919_63_D6 +VT047848-20171020_66_I3 +VT047848-20171020_66_I2 +VT047848-20171020_66_J2 +VT047848-20171020_66_I5 +VT061467-20180911_62_E5 + +val: +R22C03-20180918_66_J2 +VT012403-20171128_61_B2 +VT033614-20171124_64_H1 +VT033614-20171124_64_H5 +VT041298-20171114_63_C3 + +test: +JRC_SS04989-20160318_24_A2 +R14A02-20180905_65_A6 +R54A09-20181019_64_H1 +VT011145-20171222_63_I1 +VT027175-20171031_62_H3 +VT027175-20171031_62_H4 +VT050157-20171110_61_C1 + +(2) samples for FlyLight partly: + +train: +R14B11-20180905_65_D2 +R14B11-20180905_65_D6 +R24D12-20180921_65_J6 +R38F04-20181005_63_G2 +R38F04-20181005_63_G4 +VT003236-20170602_62_G4 +VT003236-20170602_62_G5 +VT007080-20170517_61_A2 +VT007080-20170517_61_A4 +VT007080-20170517_61_A5 +VT008135-20171122_61_C2 +VT008647-20171222_63_D5 +VT008647-20171222_63_D6 +VT010264-20171222_63_H2 +VT010264-20171222_63_H5 +VT011049-20180918_66_I1 +VT024641-20170615_62_D2 +VT024641-20170615_62_D3 +VT024641-20170615_62_D5 +VT024641-20170615_62_D6 +VT024641-20170615_62_E1 
+VT025523-20170915_64_I1 +VT026776-20171017_62_J1 +VT033051-20171128_61_E3 +VT033296-20171010_62_B4 +VT034391-20171128_61_G2 +VT038149-20171103_62_F1 +VT039484-20171020_64_C1 +VT039484-20171020_64_C2 +VT040430-20170919_63_C4 +VT040433-20170919_63_E1 +VT045568-20171020_66_C5 +VT045568-20171020_66_D2 +VT047848-20171020_66_I1 +VT047848-20171020_66_I4 +VT047848-20171020_66_J1 +VT050217-20171110_61_D6 +VT050217-20171110_61_E1 +VT058568-20170926_64_E1 +VT060731-20170517_63_F1 +VT060731-20170517_63_F2 +VT061467-20180911_62_E4 +VT062059-20170727_61_D4 + +val: +JRC_SS05008-20160318_24_B1 +JRC_SS05008-20160318_24_B2 +R22C03-20180918_66_J1 +R9F03-20181030_62_B5 +VT008194-20171222_63_A3 +VT008194-20171222_63_A5 +VT012403-20171128_61_B1 +VT033614-20171124_64_H4 +VT039350-20171020_64_A1 +VT039350-20171020_64_A3 +VT039350-20171020_64_A6 +VT059775-20170630_63_D5 + +test: +R54A09-20181019_64_H4 +R54A09-20181019_64_H6 +R73H08-20181030_62_G5 +VT006202-20170511_63_C4 +VT011145-20171222_63_I2 +VT021537-20171003_61_C3 +VT023747-20171017_61_F1 +VT027175-20171031_62_H6 +VT028606-20170721_65_A2 +VT028606-20170721_65_A3 +VT033453-20170721_65_D2 +VT033453-20170721_65_D4 +VT033453-20170721_65_D5 +VT046838-20170922_62_A2 +VT050157-20171110_61_C5 +VT058571-20170926_64_G6 + diff --git a/evalinstseg/evaluate.py b/evalinstseg/evaluate.py index 46b0fc7..6c42ad6 100644 --- a/evalinstseg/evaluate.py +++ b/evalinstseg/evaluate.py @@ -14,6 +14,7 @@ check_fix_and_unify_ids, get_output_name, read_file, + load_partly_config ) from .localize import compute_localization_criterion from .match import assign_labels, get_false_labels @@ -489,6 +490,10 @@ def main(): parser.add_argument( "--run_dirs", nargs="+", type=str, help="List of 3 experiment directories" ) + parser.add_argument( + "--split_file", type=str, default="sample_list_per_split.txt", + help="Path to sample split file" + ) parser.add_argument( "--res_file", nargs="+", type=str, help="path to result file" ) @@ -630,26 +635,36 @@ def main(): 
logger.debug("arguments %s", tuple(sys.argv)) args = parser.parse_args() + # --- 1. LOAD CONFIG for PARTLY LABELED DATA --- + partly_set = set() + if args.split_file and os.path.exists(args.split_file): + partly_set = load_partly_config(args.split_file) + logger.info(f"Loaded {len(partly_set)} partly labeled samples from config.") + + # --- HELPER: FILE MATCHING --- def get_gt_file(in_fn, gt_folder): """Helper to get gt file corresponding to input result file.""" - out_fn = os.path.join( - gt_folder, os.path.basename(in_fn).split(".")[0] + ".zarr" - ) - return out_fn + return os.path.join(gt_folder, os.path.basename(in_fn).split(".")[0] + ".zarr") - def _run_loop(res_files, gt_files, out_dirs, partly_list_loc): + # --- HELPER: EVALUATION LOOP --- + def _run_loop(res_files, gt_files, out_dirs): """Core evaluation loop used in normal and stability mode.""" - loop_samples = [] loop_metrics = [] - for res_file, gt_file, partly, out_dir in zip( - res_files, gt_files, partly_list_loc, out_dirs - ): - if not os.path.exists(out_dir): os.makedirs(out_dir, exist_ok=True) - + actual_partly_flags = [] + + for res_file, gt_file, out_dir in zip(res_files, gt_files, out_dirs): + if not os.path.exists(out_dir): + os.makedirs(out_dir, exist_ok=True) sample_name = os.path.basename(res_file).split(".")[0] logger.info("sample_name: %s", sample_name) + # AUTO-DETECT PARTLY MODE + if partly_set: + is_partly = sample_name in partly_set + else: + is_partly = args.partly + metric_dict = evaluate_file( res_file, gt_file, @@ -663,7 +678,7 @@ def _run_loop(res_files, gt_files, out_dirs, partly_list_loc): add_general_metrics=args.add_general_metrics, visualize=args.visualize, visualize_type=args.visualize_type, - partly=partly, + partly=is_partly, foreground_only=args.foreground_only, remove_small_components=args.remove_small_components, evaluate_false_labels=args.evaluate_false_labels, @@ -675,18 +690,76 @@ def _run_loop(res_files, gt_files, out_dirs, partly_list_loc): ) 
loop_metrics.append(metric_dict) loop_samples.append(sample_name) - print(f"Evaluated {sample_name}: {metric_dict}") + actual_partly_flags.append(is_partly) + print(f"Evaluated {sample_name}: {metric_dict}, Partly={is_partly}") - return loop_metrics, loop_samples + return loop_metrics, loop_samples, np.array(actual_partly_flags, dtype=bool) + + # --- PRESETS --- + if args.app == "flylight": + print("Using FlyLight Presets...") + args.ndim = 3 + args.localization_criterion = "cldice" + args.assignment_strategy = "greedy" + args.remove_small_components = 800 + args.metric = "general.avg_f1_cov_score" + args.add_general_metrics = [ + "avg_gt_skel_coverage", + "avg_f1_cov_score", + "false_merge", + "false_split", + "avg_gt_cov_dim", + "avg_gt_cov_overlap", + ] + args.summary = [ + "general.Num GT", + "general.Num Pred", + "general.avg_f1_cov_score", + "confusion_matrix.avFscore", + "general.avg_gt_skel_coverage", + "confusion_matrix.th_0_5.AP_TP", + "confusion_matrix.th_0_5.AP_FP", + "confusion_matrix.th_0_5.AP_FN", + "general.FM", + "general.FS", + "general.TP_05", + "general.TP_05_rel", + "general.avg_TP_05_cldice", + "general.GT_dim", + "general.TP_05_dim", + "general.TP_05_rel_dim", + "general.avg_gt_cov_dim", + "general.GT_overlap", + "general.TP_05_overlap", + "general.TP_05_rel_overlap", + "general.avg_gt_cov_overlap", + "confusion_matrix.th_0_1.fscore", + "confusion_matrix.th_0_2.fscore", + "confusion_matrix.th_0_3.fscore", + "confusion_matrix.th_0_4.fscore", + "confusion_matrix.th_0_5.fscore", + "confusion_matrix.th_0_6.fscore", + "confusion_matrix.th_0_7.fscore", + "confusion_matrix.th_0_8.fscore", + "confusion_matrix.th_0_9.fscore", + ] + args.visualize_type = "neuron" + args.fm_thresh = 0.1 + args.fs_thresh = 0.05 + args.eval_dim = True # Stability Mode (Wraps logic 3 times) if args.stability_mode: if not args.run_dirs or len(args.run_dirs) != 3: raise ValueError("Stability mode requires exactly 3 directories passed to --run_dirs") - stability_scores = [] - 
print("--- EVALUTE USING STABILITY MODE ---") - + # Lists to store the 3 run results for each mode + scores_complete = [] + scores_partly = [] + scores_combined = [] + + print("--- 3x STABILITY MODE ---") + for run_idx, run_dir in enumerate(args.run_dirs): print(f"Processing Run {run_idx+1}: {run_dir}") @@ -700,24 +773,48 @@ def _run_loop(res_files, gt_files, out_dirs, partly_list_loc): run_out_dirs = [os.path.join(args.out_dir[0], f"seed_{run_idx+1}")] * len(run_res_files) # Run the inner loop - m_dicts, s_names = _run_loop(run_res_files, run_gt_files, run_out_dirs, [args.partly]*len(run_res_files)) + metric_dicts, samples, actual_partly_flags = _run_loop(run_res_files, run_gt_files, run_out_dirs) - # Aggregate just this run - metrics_full = {s: m for m, s in zip(m_dicts, s_names) if m is not None} - acc, _ = average_flylight_score_over_instances(s_names, metrics_full) - stability_scores.append(acc) + metrics_full = {s: m for m, s in zip(metric_dicts, samples) if m is not None} + s_arr = np.array(samples) + + # CALCULATE SEPARATE MODES + if len(np.unique(actual_partly_flags)) > 1: + # Mode 1: Complete Only + acc_cpt, acc_inst_cpt = average_flylight_score_over_instances(s_arr[~actual_partly_flags], metrics_full) + scores_complete.append(acc_cpt) + + # Mode 2: Partly Only + acc_prt, acc_inst_prt = average_flylight_score_over_instances(s_arr[actual_partly_flags], metrics_full) + scores_partly.append(acc_prt) + + # Mode 3: Combined + acc_comb, _ = average_sets(acc_cpt, acc_inst_cpt, acc_prt, acc_inst_prt) + scores_combined.append(acc_comb) + else: + # Uniform case + acc, _ = average_flylight_score_over_instances(samples, metrics_full) + scores_combined.append(acc) # Print Average and Std Dev across runs print("\n=== FISBe BENCHMARK RESULTS (Mean ± Std) ===") - if stability_scores: - for key in stability_scores[0].keys(): - values = [s[key] for s in stability_scores if key in s] - if len(values) == 3: - print(f"{key:<30}: {np.mean(values):.4f} ± {np.std(values):.4f}") 
+ + def print_stats(title, score_list): + if not score_list: return + print(f"\n--- {title} ---") + for key in score_list[0].keys(): + vals = [s[key] for s in score_list if key in s] + if len(vals) == 3: + print(f"{key:<30}: {np.mean(vals):.4f} ± {np.std(vals):.4f}") + + # Report all 3 + print_stats("MODE 1: COMPLETELY LABELED", scores_complete) + print_stats("MODE 2: PARTLY LABELED", scores_partly) + print_stats("MODE 3: COMBINED (MAIN BENCHMARK)", scores_combined) # Normal Mode else: - print("--- EVALUTE USING SINGLE DIR ---") + print("--- EVALUATE USING SINGLE DIR ---") # shortcut if res_file and gt_file contain folders if len(args.res_file) == 1 and len(args.gt_file) == 1: res_file = args.res_file[0] @@ -732,18 +829,11 @@ def _run_loop(res_files, gt_files, out_dirs, partly_list_loc): assert len(args.res_file) == len(args.gt_file), ( "Please check, not the same number of result and gt files" ) - # set partly parameter for all samples if not done already - if len(args.res_file) > 1: - if args.partly_list is not None: - assert len(args.partly_list) == len(args.res_file), ( - "Please check, not the same number of result files " - "and partly_list values" - ) - partly_list = np.array(args.partly_list, dtype=bool) - else: - partly_list = [args.partly] * len(args.res_file) - else: - partly_list = [args.partly] + + # We don't need partly_list prep here anymore as we auto-detect + # But we still check validity if user provided it + if args.partly_list is not None: + assert len(args.partly_list) == len(args.res_file) # check out_dir if len(args.res_file) > 1: @@ -761,64 +851,7 @@ def _run_loop(res_files, gt_files, out_dirs, partly_list_loc): if args.summary_out_dir is None: args.summary_out_dir = args.out_dir[0] - if args.app is not None: - if args.app == "flylight": - print( - "Warning: parameter app is set and will overwrite parameters. " - "This might not be what you want." 
- ) - args.ndim = 3 - args.localization_criterion = "cldice" - args.assignment_strategy = "greedy" - args.remove_small_components = 800 - # args.evaluate_false_labels = True - args.metric = "general.avg_f1_cov_score" - args.add_general_metrics = [ - "avg_gt_skel_coverage", - "avg_f1_cov_score", - "false_merge", - "false_split", - "avg_gt_cov_dim", - "avg_gt_cov_overlap", - ] - args.summary = [ - "general.Num GT", - "general.Num Pred", - "general.avg_f1_cov_score", - "confusion_matrix.avFscore", - "general.avg_gt_skel_coverage", - "confusion_matrix.th_0_5.AP_TP", - "confusion_matrix.th_0_5.AP_FP", - "confusion_matrix.th_0_5.AP_FN", - "general.FM", - "general.FS", - "general.TP_05", - "general.TP_05_rel", - "general.avg_TP_05_cldice", - "general.GT_dim", - "general.TP_05_dim", - "general.TP_05_rel_dim", - "general.avg_gt_cov_dim", - "general.GT_overlap", - "general.TP_05_overlap", - "general.TP_05_rel_overlap", - "general.avg_gt_cov_overlap", - "confusion_matrix.th_0_1.fscore", - "confusion_matrix.th_0_2.fscore", - "confusion_matrix.th_0_3.fscore", - "confusion_matrix.th_0_4.fscore", - "confusion_matrix.th_0_5.fscore", - "confusion_matrix.th_0_6.fscore", - "confusion_matrix.th_0_7.fscore", - "confusion_matrix.th_0_8.fscore", - "confusion_matrix.th_0_9.fscore", - ] - args.visualize_type = "neuron" - args.fm_thresh = 0.1 - args.fs_thresh = 0.05 - args.eval_dim = True - - metric_dicts, samples = _run_loop(args.res_file, args.gt_file, outdir_list, partly_list) + metric_dicts, samples, actual_partly_flags = _run_loop(args.res_file, args.gt_file, outdir_list) # aggregate over instances metrics_full = {} @@ -827,15 +860,16 @@ def _run_loop(res_files, gt_files, out_dirs, partly_list_loc): if metric_dict is None: continue metrics_full[sample] = metric_dict - if len(np.unique(partly_list)) > 1: + + if len(np.unique(actual_partly_flags)) > 1: print("averaging for combined") # get average over instances for completely samples = np.array(samples) acc_cpt, acc_inst_cpt = 
average_flylight_score_over_instances( - samples[partly_list == False], metrics_full + samples[actual_partly_flags == False], metrics_full ) acc_prt, acc_inst_prt = average_flylight_score_over_instances( - samples[partly_list == True], metrics_full + samples[actual_partly_flags == True], metrics_full ) acc, acc_all_instances = average_sets( acc_cpt, acc_inst_cpt, acc_prt, acc_inst_prt diff --git a/evalinstseg/metrics.py b/evalinstseg/metrics.py index 6c1a836..f4c15fb 100644 --- a/evalinstseg/metrics.py +++ b/evalinstseg/metrics.py @@ -1,5 +1,6 @@ import logging import toml +import numpy as np logger = logging.getLogger(__name__) @@ -23,8 +24,22 @@ def __init__(self, fn): def save(self): """dump results to toml file.""" logger.info("saving %s", self.fn) + + def convert_numpy(obj): + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, dict): + return {k: convert_numpy(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [convert_numpy(i) for i in obj] + return obj + with open(self.fn + ".toml", "w") as tomlFl: - toml.dump(self.metricsDict, tomlFl) + toml.dump(convert_numpy(self.metricsDict), tomlFl) def addTable(self, name, dct=None): """add new sub-table to result dict diff --git a/evalinstseg/summarize.py b/evalinstseg/summarize.py index 2ed4f16..d7ee27f 100644 --- a/evalinstseg/summarize.py +++ b/evalinstseg/summarize.py @@ -205,8 +205,8 @@ def average_sets(acc_a, dict_a, acc_b, dict_b): gt_covs = list(dict_a["gt_covs"]) + list(dict_b["gt_covs"]) num_gt = dict_a["general"]["Num GT"] + dict_b["general"]["Num GT"] tp_05 = dict_a["general"]["TP_05"] + dict_b["general"]["TP_05"] - tp_05_cldice = list(dict_a["general"]["tp_05_cldice"]) + \ - list(dict_b["general"]["tp_05_cldice"]) + tp_05_cldice = list(dict_a["general"]["TP_05_cldice"]) + \ + list(dict_b["general"]["TP_05_cldice"]) per_instance_counts = {} 
per_instance_counts["general"] = { diff --git a/evalinstseg/util.py b/evalinstseg/util.py index 8b4d8dc..6821767 100644 --- a/evalinstseg/util.py +++ b/evalinstseg/util.py @@ -38,6 +38,33 @@ def crop(arr, target_shape): return arr[leading + trailing] +def load_partly_config(list_file_path): + """ + Parses the sample_list_per_split.txt to identify which files + are 'partly' labeled. + """ + partly_samples = set() + is_partly_section = False + + with open(list_file_path, 'r') as f: + for line in f: + line = line.strip() + if not line: continue + + # Detect section headers + if "(1) samples for FlyLight completely:" in line: + is_partly_section = False + elif "(2) samples for FlyLight partly:" in line: + is_partly_section = True + elif line.endswith(":") and line[:-1] in ["train", "val", "test"]: + continue # Skip split headers + else: + # It's a filename + if is_partly_section: + partly_samples.add(line) + + return partly_samples + def check_and_fix_sizes(gt_labels, pred_labels, ndim): """check if prediction and gt have same size, otherwise crop the bigger one add channel dimension if missing (channel_first) From b7768e4fe4d5e53f0ed9e7e34381a5213086b11c Mon Sep 17 00:00:00 2001 From: Conscht Date: Thu, 19 Feb 2026 12:42:03 +0100 Subject: [PATCH 2/5] Return also single outputs for each data type (partly, completly, combined) + Adjusted readme and added detailed metric readme in docs --- README.md | 133 ++++++++++----------- docs/METRICS.md | 142 ++++++++++++++++++++++ evalinstseg/evaluate.py | 248 ++++++++++++++++++--------------------- evalinstseg/summarize.py | 38 ++++++ 4 files changed, 357 insertions(+), 204 deletions(-) create mode 100644 docs/METRICS.md diff --git a/README.md b/README.md index af2784f..57f39e3 100644 --- a/README.md +++ b/README.md @@ -13,12 +13,13 @@ This is the official implementation of the **FISBe (FlyLight Instance Segmentati The benchmark supports 2D and 3D segmentations and computes a wide range of commonly used evaluation metrics (e.g., 
AP, F1, coverage). Crucially, it provides specialized error attribution for topological errors (False Merges, False Splits) relevant to filamentous structures. -### Features -- **Standard Metrics:** AP, F1, Precision, Recall. -- **FISBe Metrics:** Greedy many-to-many matching for False Merges (FM) and False Splits (FS). +### Key Features +- **Official Protocol:** Implements the exact ranking score ($S$) and matching logic defined in the FISBe paper. +- **Topology-Aware:** Uses skeleton-based localization (`clDice`) to handle thin structures robustly. +- **Error Attribution:** Explicitly quantifies False Merges (FM) and False Splits (FS) via many-to-many matching. - **Flexibility:** Supports HDF5 (`.hdf`, `.h5`) and Zarr (`.zarr`) files. -- **Modes:** Run on single files, entire folders, or in stability analysis mode. -- **Partly Labeled Data:** Robust evaluation ignoring background conflicts for sparse Ground Truth. +- **Modes:** Single file, folder evaluation, or 3x stability analysis. +- **Partly Labeled Support:** Robust evaluation that ignores background conflicts for sparse Ground Truth. --- @@ -29,17 +30,14 @@ The recommended way to install is using `uv` (fastest) or `micromamba`. ### Option 1: Using `uv` (Fastest) ```bash -# 1. Install uv (if not installed) pip install uv - -# 2. Clone and install git clone https://github.com/Kainmueller-Lab/evaluate-instance-segmentation.git cd evaluate-instance-segmentation uv venv uv pip install -e . ``` -### Option 2: Using `micromamba` or `conda` +### Option 2: Using micromamba or conda ```bash micromamba create -n evalinstseg python=3.10 @@ -55,13 +53,13 @@ The `evalinstseg` command is automatically available after installation. ### 1. 
Evaluate a Single File ```bash -evalinstseg ` - --res_file tests/pred/R14A02-20180905_65_A6.hdf ` - --res_key volumes/gmm_label_cleaned ` - --gt_file tests/gt/R14A02-20180905_65_A6.zarr ` - --gt_key volumes/gt_instances ` - --split_file assets/sample_list_per_split.txt ` - --out_dir tests/results ` +evalinstseg \ + --res_file tests/pred/sample_01.hdf \ + --res_key volumes/gmm_label_cleaned \ + --gt_file tests/gt/sample_01.zarr \ + --gt_key volumes/gt_instances \ + --split_file assets/sample_list_per_split.txt \ + --out_dir tests/results \ --app flylight ``` @@ -69,12 +67,12 @@ evalinstseg ` If you provide a directory path to `--res_file`, the tool will look for matching Ground Truth files in the `--gt_file` folder. Files are matched by name. ```bash -evalinstseg ` - --res_file /path/to/predictions_folder ` - --res_key volumes/gmm_label_cleaned ` - --gt_file /path/to/ground_truth_folder ` - --gt_key volumes/gt_instances ` - --out_dir /path/to/output_folder ` +evalinstseg \ + --res_file /path/to/predictions_folder \ + --res_key volumes/gmm_label_cleaned \ + --gt_file /path/to/ground_truth_folder \ + --gt_key volumes/gt_instances \ + --out_dir /path/to/output_folder \ --app flylight ``` @@ -91,11 +89,12 @@ evalinstseg \ ``` **Requirements:** + - `--run_dirs`: Provide exactly 3 folders. - `--gt_file`: The folder containing Ground Truth files (filenames must match predictions). ### 4. Partly Labeled Data -If your ground truth is sparse (not fully dense), use the `--partly` flag. T +If your ground truth is sparse (not fully dense), use the `--partly` flag. See the **Partly Labeled Data Mode** section for details on how False Positives are handled. ## Usage: Python Package You can integrate the benchmark directly into your Python scripts or notebooks. @@ -141,56 +140,46 @@ metrics = evaluate_volume( add_general_metrics=["false_merge", "false_split"] ) ``` -### 4. Partly Labeled Data (`--partly`) -Some samples contain sparse / incomplete GT annotations. 
In this setting, counting all unmatched predictions as false positives is not meaningful. -When `--partly` is enabled, we approximate FP by counting only **unmatched predictions whose best match is a foreground GT instance** (based on the localization matrix used for evaluation, e.g. clPrecision for `cldice`). -Unmatched predictions whose best match is **background** are ignored. +## FISBe Benchmark Protocol +For a complete reference of all calculated metrics, see [docs/METRICS.md](docs/METRICS.md). +> **Note:** Some output keys use internal names; see the documentation for the exact mapping to website/leaderboard columns. -Concretely, we compute for each unmatched prediction the index of the GT label with maximal overlap score; it is counted as FP only if that index is > 0 (foreground), not 0 (background). +### Official FlyLight Configuration (`--app flylight`) +The `flylight` preset implements the specific metrics described in the FISBe paper for evaluating long-range thin filamentous neuronal structures. ---- +**Primary Ranking Score ($S$)** +The single scalar used to rank methods on the leaderboard: +$$S = 0.5 \cdot \text{avF1} + 0.5 \cdot C$$ -## Metrics Explanation - -### 1. Standard Instance Metrics (TP/FP/FN, F-score, AP proxy) -These metrics are computed from a **one-to-one matching** between GT and prediction instances (Hungarian or greedy), using a chosen localization criterion (default for FlyLight is `cldice`). - -- **TP**: matched pairs above threshold -- **FP**: unmatched predictions (or, in `--partly`, only those whose best match is foreground) -- **FN**: unmatched GT instances -- **precision** = TP / (TP + FP) -- **recall** = TP / (TP + FN) -- **fscore** = 2 * precision * recall / (precision + recall) -- **AP**: we report a simple AP proxy `precision × recall` at each threshold and average it across thresholds (this is not COCO-style AP). - -### 2. 
FISBe Error Attribution (False Splits / False Merges) -False splits (FS) and false merges (FM) aim to quantify **instance topology errors** for long-range thin filamentous structures. - -We compute FS/FM using **greedy many-to-many matching with consumption**: -- Candidate GT–Pred pairs above threshold are processed in descending score order. -- After selecting a match, we update “available” pixels so that already explained structure is not matched again. -- FS counts when one GT is explained by multiple preds (excess preds per GT). -- FM counts when one pred explains multiple GTs (excess GTs per pred). - -This produces an explicit attribution of split/merge errors rather than only TP/FP/FN. - -### Metric Definitions - -#### Instance-Level (per threshold) -| Metric | Description | -| :--- | :--- | -| **AP_TP** | True Positives (1-to-1 match) | -| **AP_FP** | False Positives (unmatched preds; in `--partly`: only unmatched preds whose best match is foreground) | -| **AP_FN** | False Negatives (unmatched GT) | -| **precision** | TP / (TP + FP) | -| **recall** | TP / (TP + FN) | -| **fscore** | Harmonic mean of precision and recall | - -#### Global / FISBe -| Metric | Description | -| :--- | :--- | -| **avAP** | Mean AP proxy across thresholds ≥ 0.5 | -| **FM** | False Merges (many-to-many matching with consumption) | -| **FS** | False Splits (many-to-many matching with consumption) | -| **avg_gt_skel_coverage** | Mean skeleton coverage of GT instances by associated predictions (association via best-match mapping) | +**Key Metrics** +- **avF1**: Average F1 score across clDice thresholds. +- **C (Coverage)**: Average ground truth skeleton coverage (assigned via max clPrecision; score via clRecall on union of matches). +- **clDiceTP**: Average clDice score of matched True Positives (at threshold 0.5). +- **tp**: Relative number of True Positives at threshold 0.5 ($TP_{0.5} / N_{GT}$). 
+- **FS (False Splits)**: $\sum_{gt} \max(0, N_{\text{assigned\_pred}} - 1)$ +- **FM (False Merges)**: $\sum_{pred} \max(0, N_{\text{assigned\_gt}} - 1)$ + +### Partly Labeled Data Mode (`--partly`) +FISBe includes 71 partly labeled images where only a subset of neurons is annotated. +- **Logic**: Unmatched predictions are only counted as False Positives if they match a **Foreground GT instance**. +- **Background Exclusion**: Predictions matching background (unlabeled regions) are ignored. + +## Output Structure +Metrics returned by the API or saved to disk are grouped into category-specific dictionaries: + +```python +metrics["confusion_matrix"] +├── TP / FP / FN # Counts across all images +├── precision / recall # Standard detection metrics +└── avAP # Mean precision × recall proxy + +metrics["general"] +├── aggregate_score # S (Official Ranking Score) +├── avg_gt_skel_coverage # C (Coverage) +├── FM # Global False Merge count +└── FS # Global False Split count + +metrics["curves"] +└── F1_0.1 … F1_0.9 # Per-threshold performance +``` \ No newline at end of file diff --git a/docs/METRICS.md b/docs/METRICS.md new file mode 100644 index 0000000..9b73935 --- /dev/null +++ b/docs/METRICS.md @@ -0,0 +1,142 @@ +# FISBe Metrics Reference + +This document details all evaluation metrics computed by the pipeline. + +## 1. Official FlyLight Benchmark Metrics (`--app flylight`) +These metrics determine the **FISBe Leaderboard** ranking. They are designed for long-range, thin filamentous structures. + +### Primary Ranking Score ($S$) +The single scalar used to rank methods. 
+$$S = 0.5 \cdot \text{avF1} + 0.5 \cdot C$$ + +### Website leaderboard column mapping +| Website column | Meaning | Pipeline key | +|---|---|---| +| S | $0.5 \cdot \text{avF1} + 0.5 \cdot C$ | `general.aggregate_score` | +| avF1 | mean F1 over th=0.1..0.9 | `confusion_matrix.avFscore19` | +| C | avg GT coverage (union-of-preds, clRecall) | `general.avg_gt_skel_coverage` | +| clDiceTP | mean clDice of TP matches at th=0.5 | `general.avg_TP_05_cldice` | +| tp | (#TP matches at th=0.5) / (#GT) | `general.TP_05_rel` | +| FS | false splits count | `general.FS` | +| FM | false merges count | `general.FM` | + +--- + +### Official Ranking Metrics Definition +| Metric Key | Definition | Matching Strategy | +| :--- | :--- | :--- | +| **`avFscore19`** (avF1) | Mean F1 score averaged over clDice thresholds 0.1 to 0.9. | Greedy 1-to-1 | +| **`avg_gt_skel_coverage`** (C) | Average GT centerline coverage computed per GT using the union of matched predictions (see below). | One-to-Many | + +#### Coverage (C) definition (official protocol) +**Localization:** compute clPrecision for all (pred, gt) pairs. +**Matching (one-to-many):** assign each predicted instance to the GT instance with the highest clPrecision score (a GT can receive multiple predictions). +**Computation:** for each GT, compute clRecall using the union of all predictions assigned to that GT (to avoid double-counting in overlaps), then average over all GT instances. + +#### avF1 computation details (official protocol) +For each threshold th ∈ {0.1, 0.2, …, 0.9} after greedy 1-to-1 matching by clDice: +- **TP**: matched (pred, gt) with clDice > th +- **FP**: unmatched predicted instances +- **FN**: unmatched GT instances + +Compute F1 = 2TP / (2TP + FP + FN) aggregated across all images, then average across thresholds. + +### Topology Error Attribution +Explicit counts of topological errors specific to filamentous structures. 
+ +| Metric Key | Definition | Matching Strategy | +| :--- | :--- | :--- | +| **`FS`** (False Splits) | $\sum_{gt} \max(0, N_{\text{assigned\_pred}} - 1)$ | Many-to-Many (Consumption) | +| **`FM`** (False Merges) | $\sum_{pred} \max(0, N_{\text{assigned\_gt}} - 1)$ | Many-to-Many (Consumption) | + +### Localization Criteria +**clDice (Benchmark Default)** +Centerline Dice evaluates the agreement between skeletonized structures. It is the robust choice for thin filamentous objects where standard pixel-level IoU is mathematically unstable due to small boundary variations. +- **clPrecision**: Fraction of the predicted skeleton lying within the ground truth mask. +- **clRecall**: Fraction of the ground truth skeleton covered by the predicted mask. +- **clDice**: Harmonic mean of centerline precision and recall. + +### Matching Strategies +**1. Greedy One-to-One Matching** +Pairs are sorted by localization score and matched if both remain unassigned. +*Used for: TP/FP/FN counts, F1-scores, and Precision/Recall curves.* + +**2. One-to-Many Matching** +Assigns each predicted instance to the single ground truth instance with which it shares the highest clPrecision. +*Used for: Average GT Coverage (C).* + +**3. Greedy Many-to-Many Matching with Consumption** +Iteratively matches GT and predictions while "consuming" pixels—removing them from availability once matched. This prevents double-counting of overlapping structures. +*Used for: False Splits (FS) and False Merges (FM).* + +--- + +## 2. Standard Instance Metrics (Reported per Threshold) +These metrics are computed for every threshold (e.g., `th_0_5`) using **Greedy 1-to-1 matching**. + +| Metric Key | Description | +| :--- | :--- | +| **`AP_TP`** | True Positives count. | +| **`AP_FP`** | False Positives count (filtered in `--partly` mode). | +| **`AP_FN`** | False Negatives count. | +| **`precision`** | $TP / (TP + FP)$ | +| **`recall`** | $TP / (TP + FN)$ | +| **`fscore`** | Harmonic mean of precision and recall. 
| +| **`AP`** | Precision $\times$ Recall (at specific threshold). | + +> **Note on Averages:** The code also aggregates these across thresholds: +> * **`avAP`** (or `avAP59`): Mean AP over thresholds **0.5 to 0.9**. +> * **`avFscore`** (or `avFscore19`): Mean F1 over thresholds **0.1 to 0.9**. + +--- + +## 3. Diagnostic & Sub-group Metrics +The code calculates these additional metrics for deep-dive analysis. They do not affect the primary ranking score. + +### General Stats +| Metric Key | Description | +| :--- | :--- | +| **`Num GT`** | Total number of ground truth instances. | +| **`Num Pred`** | Total number of predicted instances. | +| **`TP_05`** | Count of True Positives at threshold 0.5. | +| **`TP_05_rel`** | Fraction of GT instances detected at threshold 0.5 ($TP_{0.5} / N_{GT}$). | + +### Quality of Matches +> **Note:** `avg_TP_05_cldice` corresponds to the leaderboard metric **clDiceTP**, and `TP_05_rel` corresponds to **tp**. + +| Metric Key | Description | +| :--- | :--- | +| **`avg_TP_05_cldice`** | Average `clDice` score of matched True Positives (at $th=0.5$). Indicates segmentation quality of detected objects. | +| **`TP_05_cldice`** | List of individual `clDice` scores for all TP pairs. | + +### Challenging Subsets (Dim & Overlapping) +Metrics evaluated only on specific subsets of Ground Truth neurons. + +| Metric Key | Description | +| :--- | :--- | +| **`GT_dim`** | Total number of dim (low contrast) GT neurons. | +| **`TP_05_dim`** | Number of dim neurons correctly detected ($clDice > 0.5$). | +| **`TP_05_rel_dim`** | Fraction of dim neurons detected. | +| **`avg_gt_cov_dim`** | Coverage ($C$) score computed only on dim neurons. | +| **`GT_overlap`** | Total number of GT neurons involved in overlaps. | +| **`TP_05_overlap`** | Number of overlapping neurons correctly detected ($clDice > 0.5$). | +| **`TP_05_rel_overlap`** | Fraction of overlapping neurons detected. 
|
+| **`avg_gt_cov_overlap`** | Coverage ($C$) score computed only on overlapping neurons. |
+
+---
+
+## 4. Evaluation Logic
+
+### Localization Criterion
+* **Default:** `clDice` (Centerline Dice). Robust to boundary variations in thin structures.
+* **Options:** `iou` (Intersection over Union) is available via API but not used for the benchmark.
+
+### Matching Strategies
+* **Greedy 1-to-1:** Used for all standard detection metrics (TP, FP, F1).
+* **One-to-Many:** Used for Coverage ($C$). Assigns a prediction to the GT it covers best.
+* **Many-to-Many (Consumption):** Used for FS/FM. "Consumes" pixels to prevent double-counting in overlaps.
+
+### Partly Labeled Data (`--partly`)
+Handles sparse Ground Truth where not all neurons are annotated.
+* **Logic:** Unmatched predictions are counted as False Positives (**FP**) *only* if their best match is a **Foreground** GT instance.
+* **Background:** Predictions matching background (unlabeled regions) are ignored.
diff --git a/evalinstseg/evaluate.py b/evalinstseg/evaluate.py
index 6c42ad6..fd082e2 100644
--- a/evalinstseg/evaluate.py
+++ b/evalinstseg/evaluate.py
@@ -27,8 +27,7 @@
 from .visualize import visualize_neurons, visualize_nuclei
 from .summarize import (
     summarize_metric_dict,
-    average_flylight_score_over_instances,
-    average_sets,
+    aggregate_and_report,
 )
 
 logger = logging.getLogger(__name__)
@@ -473,6 +472,44 @@ def evaluate_volume(
     return metrics
 
 
+def print_and_collect_stats(mode_name, score_list, num_expected_runs):
+    """Print mean ± std of each metric over stability runs and collect report rows.
+
+    Args:
+        mode_name: Section title to print; also stored under "Mode" in each row.
+        score_list: One dict per run mapping metric name to a numeric score.
+            Entries that are not dicts (e.g. a scalar placeholder left by a
+            failed aggregation) are ignored instead of raising.
+        num_expected_runs: A metric is reported only if exactly this many
+            runs provide a value for it.
+
+    Returns:
+        List of dicts with keys "Mode", "Metric", "Mean" and "StdDev";
+        empty if there are no usable run scores.
+    """
+    run_scores = [s for s in (score_list or []) if isinstance(s, dict)]
+    if not run_scores:
+        return []
+
+    print(f"\n--- {mode_name} ---")
+    report_rows = []
+    # Metric names come from the first usable run.
+    for key in run_scores[0]:
+        vals = [s[key] for s in run_scores if key in s]
+        if len(vals) != num_expected_runs:
+            continue
+        mean_val = np.mean(vals)
+        std_val = np.std(vals)
+        print(f"{key:<30}: {mean_val:.4f} ± {std_val:.4f}")
+        report_rows.append({
+            "Mode": mode_name,
+            "Metric": key,
+            "Mean": mean_val,
+            "StdDev": std_val,
+        })
+    return report_rows
+
+
 # TODO: option to just pass config (toml) file instead of flags
 def main():
     """main entry point if called from command line
@@ -691,7 +728,8 @@ def _run_loop(res_files, gt_files, out_dirs):
             loop_metrics.append(metric_dict)
             loop_samples.append(sample_name)
             actual_partly_flags.append(is_partly)
-            print(f"Evaluated {sample_name}: {metric_dict}, Partly={is_partly}")
+            # print(f"Evaluated {sample_name}: {metric_dict}, Partly={is_partly}")
+            print(f"Evaluated {sample_name}", "Partly:", is_partly)
 
         return loop_metrics, loop_samples, np.array(actual_partly_flags, dtype=bool)
 
@@ -748,145 +786,104 @@ def _run_loop(res_files, gt_files, out_dirs):
         args.fs_thresh = 0.05
         args.eval_dim = True
 
-    # Stability Mode (Wraps logic 3 times)
+    # Determine runs: Stability (3 runs) or Normal (1 run)
+    run_configs = []
     if args.stability_mode:
         if not args.run_dirs or len(args.run_dirs) != 3:
-            raise ValueError("Stability mode requires exactly 3 directories passed to --run_dirs")
+            raise ValueError("Stability mode requires exactly 3 directories.")
+        for i, rdir in enumerate(args.run_dirs):
+            run_configs.append({
+                "res_dir": rdir,
+                "out_dir": os.path.join(args.out_dir[0], f"seed_{i+1}")
+            })
+    else:
+        # Detect if res_file is a directory or a list of specific files
+        if len(args.res_file) == 1 and os.path.isdir(args.res_file[0]) and not args.res_file[0].endswith(".zarr"):
+            run_configs.append({"res_dir": args.res_file[0], "out_dir": args.out_dir[0]})
+        else:
+            run_configs.append({"files": args.res_file, "out_dir": args.out_dir[0]})
+
+    all_run_data = []
+
+    # --- 2.
UNIFIED EVALUATION LOOP --- + for run in run_configs: + # File discovery for this run + if "res_dir" in run: + res_files = natsorted(glob.glob(run["res_dir"] + "/*.hdf")) or \ + natsorted(glob.glob(run["res_dir"] + "/*.zarr")) + gt_files = [get_gt_file(fn, args.gt_file[0]) for fn in res_files] + else: + res_files = run["files"] + gt_files = args.gt_file # Assumes lists are already aligned - # Lists to store the 3 run results for each mode - scores_complete = [] - scores_partly = [] - scores_combined = [] + # Prepare out_dirs list for _run_loop (it expects a list corresponding to files) + # Note: In stability mode logic of original code: run_out_dirs = [os.path.join(args.out_dir[0], f"seed_{run_idx+1}")] * len(run_res_files) + # In normal mode logic of original code: outdir_list = args.out_dir * len(args.res_file) (if 1 out dir) - print("--- 3x STABILITY MODE ---") + # Here run["out_dir"] is a single string for the run. + out_dirs = [run["out_dir"]] * len(res_files) - for run_idx, run_dir in enumerate(args.run_dirs): - print(f"Processing Run {run_idx+1}: {run_dir}") - - # Auto-detect files for this run - run_res_files = natsorted(glob.glob(run_dir + "/*.hdf")) - if not run_res_files: - run_res_files = natsorted(glob.glob(run_dir + "/*.zarr")) - - # Assume gt_file is the PARENT folder - run_gt_files = [get_gt_file(fn, args.gt_file[0]) for fn in run_res_files] - run_out_dirs = [os.path.join(args.out_dir[0], f"seed_{run_idx+1}")] * len(run_res_files) - - # Run the inner loop - metric_dicts, samples, actual_partly_flags = _run_loop(run_res_files, run_gt_files, run_out_dirs) - - metrics_full = {s: m for m, s in zip(metric_dicts, samples) if m is not None} - s_arr = np.array(samples) - - # CALCULATE SEPARATE MODES - if len(np.unique(actual_partly_flags)) > 1: - # Mode 1: Complete Only - acc_cpt, acc_inst_cpt = average_flylight_score_over_instances(s_arr[~actual_partly_flags], metrics_full) - scores_complete.append(acc_cpt) - - # Mode 2: Partly Only - acc_prt, acc_inst_prt 
= average_flylight_score_over_instances(s_arr[actual_partly_flags], metrics_full) - scores_partly.append(acc_prt) - - # Mode 3: Combined - acc_comb, _ = average_sets(acc_cpt, acc_inst_cpt, acc_prt, acc_inst_prt) - scores_combined.append(acc_comb) - else: - # Uniform case - acc, _ = average_flylight_score_over_instances(samples, metrics_full) - scores_combined.append(acc) - - # Print Average and Std Dev across runs - print("\n=== FISBe BENCHMARK RESULTS (Mean ± Std) ===") + # Evaluate all files in the run + metric_dicts, samples, flags = _run_loop(res_files, gt_files, out_dirs) - def print_stats(title, score_list): - if not score_list: return - print(f"\n--- {title} ---") - for key in score_list[0].keys(): - vals = [s[key] for s in score_list if key in s] - if len(vals) == 3: - print(f"{key:<30}: {np.mean(vals):.4f} ± {np.std(vals):.4f}") + # Aggregate scores (Combined, Partly, Complete) + scores = aggregate_and_report(metric_dicts, samples, flags) - # Report all 3 - print_stats("MODE 1: COMPLETELY LABELED", scores_complete) - print_stats("MODE 2: PARTLY LABELED", scores_partly) - print_stats("MODE 3: COMBINED (MAIN BENCHMARK)", scores_combined) - - # Normal Mode - else: - print("--- EVALUATE USING SINGLE DIR ---") - # shortcut if res_file and gt_file contain folders - if len(args.res_file) == 1 and len(args.gt_file) == 1: - res_file = args.res_file[0] - gt_file = args.gt_file[0] - if (os.path.isdir(res_file) and not res_file.endswith(".zarr")) and ( - os.path.isdir(gt_file) and not gt_file.endswith(".zarr") - ): - args.res_file = natsorted(glob.glob(res_file + "/*.hdf")) - args.gt_file = [get_gt_file(fn, gt_file) for fn in args.res_file] - - # check same length for result and gt files - assert len(args.res_file) == len(args.gt_file), ( - "Please check, not the same number of result and gt files" - ) + all_run_data.append({ + "metrics": metric_dicts, + "samples": samples, + "flags": flags, + "scores": scores # Tuple: (res_cpt, res_prt, res_comb) + }) + + # --- 3. 
FINAL REPORTING --- + if not args.stability_mode: + # MODE: Normal (Single Run) + run = all_run_data[0] + res_cpt, res_prt, res_comb = run["scores"] + summary_path = args.summary_out_dir or args.out_dir[0] - # We don't need partly_list prep here anymore as we auto-detect - # But we still check validity if user provided it - if args.partly_list is not None: - assert len(args.partly_list) == len(args.res_file) - - # check out_dir - if len(args.res_file) > 1: - if len(args.out_dir) > 1: - assert len(args.res_file) == len(args.out_dir), ( - "Please check, number of input files and output folders should correspond" - ) - outdir_list = args.out_dir - else: - outdir_list = args.out_dir * len(args.res_file) - else: - assert len(args.out_dir) == 1, "Please check number of output directories" - outdir_list = args.out_dir - # check output dir for summary - if args.summary_out_dir is None: - args.summary_out_dir = args.out_dir[0] - - metric_dicts, samples, actual_partly_flags = _run_loop(args.res_file, args.gt_file, outdir_list) - - # aggregate over instances - metrics_full = {} - acc_all_instances = None - for metric_dict, sample in zip(metric_dicts, samples): - if metric_dict is None: - continue - metrics_full[sample] = metric_dict + if args.summary: + # Save Main Summary + summarize_metric_dict(run["metrics"], run["samples"], args.summary, + os.path.join(summary_path, "summary.csv"), + agg_inst_dict=res_comb[1]) - if len(np.unique(actual_partly_flags)) > 1: - print("averaging for combined") - # get average over instances for completely - samples = np.array(samples) - acc_cpt, acc_inst_cpt = average_flylight_score_over_instances( - samples[actual_partly_flags == False], metrics_full - ) - acc_prt, acc_inst_prt = average_flylight_score_over_instances( - samples[actual_partly_flags == True], metrics_full - ) - acc, acc_all_instances = average_sets( - acc_cpt, acc_inst_cpt, acc_prt, acc_inst_prt - ) + # Save Split Summaries if mixed data exists + if res_cpt is not None: + s_arr 
= np.array(run["samples"]) + summarize_metric_dict([m for m, f in zip(run["metrics"], run["flags"]) if not f], + s_arr[~run["flags"]], args.summary, + os.path.join(summary_path, "summary_complete.csv"), + agg_inst_dict=res_cpt[1]) + summarize_metric_dict([m for m, f in zip(run["metrics"], run["flags"]) if f], + s_arr[run["flags"]], args.summary, + os.path.join(summary_path, "summary_partly.csv"), + agg_inst_dict=res_prt[1]) + else: + # MODE: Stability (Mean ± Std) + print("\n=== FISBe BENCHMARK RESULTS (Mean ± Std) ===") + # Extract mean dicts from each run + cpt_scores = [r["scores"][0][0] for r in all_run_data if r["scores"][0]] + prt_scores = [r["scores"][1][0] for r in all_run_data if r["scores"][1]] + comb_scores = [r["scores"][2][0] for r in all_run_data if r["scores"][2]] + + stability_rows = [] + stability_rows += print_and_collect_stats("MODE 1: COMPLETE", cpt_scores, 3) + stability_rows += print_and_collect_stats("MODE 2: PARTLY", prt_scores, 3) + stability_rows += print_and_collect_stats("MODE 3: COMBINED", comb_scores, 3) + + # Export Stability CSV + if stability_rows: + import pandas as pd + out_csv = os.path.join(args.out_dir[0], "stability_report.csv") + try: + df = pd.DataFrame(stability_rows) + df.to_csv(out_csv, index=False) + print(f"\nStability report saved to: {out_csv}") + except ImportError: + print("\nWarning: pandas not found, skipping CSV export of stability report.") - else: - acc, acc_all_instances = average_flylight_score_over_instances( - samples, metrics_full - ) - if args.summary: - summarize_metric_dict( - metric_dicts, - samples, - args.summary, - os.path.join(args.summary_out_dir, "summary.csv"), - agg_inst_dict=acc_all_instances, - ) if __name__ == "__main__": diff --git a/evalinstseg/summarize.py b/evalinstseg/summarize.py index d7ee27f..24744fb 100644 --- a/evalinstseg/summarize.py +++ b/evalinstseg/summarize.py @@ -240,3 +240,41 @@ def average_sets(acc_a, dict_a, acc_b, dict_b): cm_a["AP_FN"] + cm_b["AP_FN"] return acc, 
per_instance_counts + +def aggregate_and_report(metric_dicts, samples, partly_flags): + """ + A single source of truth for splitting 'Complete' vs 'Partly' + and calculating the 'Combined' score. + """ + try: + metrics_full = {s: m for m, s in zip(metric_dicts, samples) if m is not None} + s_arr = np.array(samples) + + # Check if we have consistent partly flags (length matches samples) + if len(partly_flags) != len(samples): + # Try to subset if metric_dicts has fewer items (e.g. None filtered out earlier)? + # The caller usually provides matching lists. + # But if metric_dicts has Nones, metrics_full will be smaller. + # We should rely on s_arr being aligned with partly_flags. + pass + + if len(np.unique(partly_flags)) > 1: + # 1. Complete + cpt_mask = partly_flags == False + acc_cpt, inst_cpt = average_flylight_score_over_instances(s_arr[cpt_mask], metrics_full) + + # 2. Partly + prt_mask = partly_flags == True + acc_prt, inst_prt = average_flylight_score_over_instances(s_arr[prt_mask], metrics_full) + + # 3. 
Combined + acc_comb, inst_comb = average_sets(acc_cpt, inst_cpt, acc_prt, inst_prt) + + return (acc_cpt, inst_cpt), (acc_prt, inst_prt), (acc_comb, inst_comb) + else: + acc, inst = average_flylight_score_over_instances(samples, metrics_full) + return None, None, (acc, inst) + except Exception as e: + print(f"Error in aggregation: {e}") + return None, None, (0.0, {}) + From 40d6e647b29b356238866c23d13f7037143f7663 Mon Sep 17 00:00:00 2001 From: Conscht Date: Thu, 19 Feb 2026 12:46:56 +0100 Subject: [PATCH 3/5] fixed math display in readme --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 57f39e3..d413537 100644 --- a/README.md +++ b/README.md @@ -152,13 +152,14 @@ The `flylight` preset implements the specific metrics described in the FISBe pap The single scalar used to rank methods on the leaderboard: $$S = 0.5 \cdot \text{avF1} + 0.5 \cdot C$$ -**Key Metrics** +### Key Metrics + - **avF1**: Average F1 score across clDice thresholds. -- **C (Coverage)**: Average ground truth skeleton coverage (assigned via max clPrecision; score via clRecall on union of matches). +- **C (Coverage)**: Average ground truth skeleton coverage (assigned via max clPrecision; score via clRecall on union of matches). - **clDiceTP**: Average clDice score of matched True Positives (at threshold 0.5). -- **tp**: Relative number of True Positives at threshold 0.5 ($TP_{0.5} / N_{GT}$). -- **FS (False Splits)**: $\sum_{gt} \max(0, N_{\text{assigned\_pred}} - 1)$ -- **FM (False Merges)**: $\sum_{pred} \max(0, N_{\text{assigned\_gt}} - 1)$ +- **tp**: Relative number of True Positives at threshold 0.5 ($TP_{0.5} / N_{\mathrm{GT}}$). +- **FS (False Splits)**: $\sum_{\mathrm{gt}} \max(0, N_{\text{assigned\_pred}} - 1)$ +- **FM (False Merges)**: $\sum_{\mathrm{pred}} \max(0, N_{\text{assigned\_gt}} - 1)$ ### Partly Labeled Data Mode (`--partly`) FISBe includes 71 partly labeled images where only a subset of neurons is annotated. 
From 07e7f7bd1f4b0ad06644be96e0933e35d29b6470 Mon Sep 17 00:00:00 2001 From: Conscht Date: Thu, 19 Feb 2026 12:48:49 +0100 Subject: [PATCH 4/5] removed latex code to fix readme --- README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index d413537..6a17133 100644 --- a/README.md +++ b/README.md @@ -153,13 +153,12 @@ The single scalar used to rank methods on the leaderboard: $$S = 0.5 \cdot \text{avF1} + 0.5 \cdot C$$ ### Key Metrics - - **avF1**: Average F1 score across clDice thresholds. -- **C (Coverage)**: Average ground truth skeleton coverage (assigned via max clPrecision; score via clRecall on union of matches). -- **clDiceTP**: Average clDice score of matched True Positives (at threshold 0.5). -- **tp**: Relative number of True Positives at threshold 0.5 ($TP_{0.5} / N_{\mathrm{GT}}$). -- **FS (False Splits)**: $\sum_{\mathrm{gt}} \max(0, N_{\text{assigned\_pred}} - 1)$ -- **FM (False Merges)**: $\sum_{\mathrm{pred}} \max(0, N_{\text{assigned\_gt}} - 1)$ +- **C (Coverage)**: Average GT skeleton coverage (assignment via max clPrecision; scoring via clRecall on union of matches). +- **clDiceTP**: Average clDice score of matched TPs at threshold 0.5. +- **tp**: Relative number of TPs at threshold 0.5 (`TP_0.5 / N_GT`). +- **FS (False Splits)**: Sum over GT of `max(0, N_assigned_pred - 1)`. +- **FM (False Merges)**: Sum over predictions of `max(0, N_assigned_gt - 1)`. ### Partly Labeled Data Mode (`--partly`) FISBe includes 71 partly labeled images where only a subset of neurons is annotated. 
From 84c9c6de0d7b8e541f8308b48cd667154a38f920 Mon Sep 17 00:00:00 2001 From: Conscht Date: Thu, 19 Feb 2026 12:54:15 +0100 Subject: [PATCH 5/5] fixed latex code in METRICS.md --- docs/METRICS.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/METRICS.md b/docs/METRICS.md index 9b73935..3b950ed 100644 --- a/docs/METRICS.md +++ b/docs/METRICS.md @@ -16,17 +16,17 @@ $$S = 0.5 \cdot \text{avF1} + 0.5 \cdot C$$ | avF1 | mean F1 over th=0.1..0.9 | `confusion_matrix.avFscore19` | | C | avg GT coverage (union-of-preds, clRecall) | `general.avg_gt_skel_coverage` | | clDiceTP | mean clDice of TP matches at th=0.5 | `general.avg_TP_05_cldice` | -| tp | (#TP matches at th=0.5) / (#GT) | `general.TP_05_rel` | +| tp | $(\#TP \text{ at } th=0.5) / (\#GT)$ | `general.TP_05_rel` | | FS | false splits count | `general.FS` | | FM | false merges count | `general.FM` | --- -### Official Ranking Metrics Definition +### Official ranking metrics definition | Metric Key | Definition | Matching Strategy | | :--- | :--- | :--- | -| **`avFscore19`** (avF1) | Mean F1 score averaged over clDice thresholds 0.1 to 0.9. | Greedy 1-to-1 | -| **`avg_gt_skel_coverage`** (C) | Average GT centerline coverage computed per GT using the union of matched predictions (see below). | One-to-Many | +| **`confusion_matrix.avFscore19`** (avF1) | Mean F1 score averaged over clDice thresholds 0.1 to 0.9. | Greedy 1-to-1 | +| **`general.avg_gt_skel_coverage`** (C) | Average GT centerline coverage computed per GT using the union of matched predictions (see below). | One-to-Many | #### Coverage (C) definition (official protocol) **Localization:** compute clPrecision for all (pred, gt) pairs. @@ -41,13 +41,14 @@ For each threshold th ∈ {0.1, 0.2, …, 0.9} after greedy 1-to-1 matching by c Compute F1 = 2TP / (2TP + FP + FN) aggregated across all images, then average across thresholds. 
-### Topology Error Attribution +### Topology error attribution Explicit counts of topological errors specific to filamentous structures. | Metric Key | Definition | Matching Strategy | | :--- | :--- | :--- | -| **`FS`** (False Splits) | $\sum_{gt} \max(0, N_{\text{assigned\_pred}} - 1)$ | Many-to-Many (Consumption) | -| **`FM`** (False Merges) | $\sum_{pred} \max(0, N_{\text{assigned\_gt}} - 1)$ | Many-to-Many (Consumption) | +| **`general.FS`** (False Splits) | $\sum_{\mathrm{gt}} \max(0, N_{\text{assigned pred}} - 1)$ | Many-to-Many (Consumption) | +| **`general.FM`** (False Merges) | $\sum_{\mathrm{pred}} \max(0, N_{\text{assigned gt}} - 1)$ | Many-to-Many (Consumption) | + ### Localization Criteria **clDice (Benchmark Default)**