From fc6e1db9ad0681206ff96e7f1f5075528c5fa399 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Wed, 24 Dec 2025 20:29:40 +0000 Subject: [PATCH 1/3] Add timeout logic to Study._monitor_progress for stalled logs Detects stalled parallel simulations by checking if log files have not been updated for 8 hours. Marks them as errors to prevent the progress monitoring loop from hanging indefinitely. Implements robust ID matching to handle varying zero-padding in case IDs. --- src/VITools/study.py | 63 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 52 insertions(+), 11 deletions(-) diff --git a/src/VITools/study.py b/src/VITools/study.py index fb7fa6f..07943d3 100644 --- a/src/VITools/study.py +++ b/src/VITools/study.py @@ -16,7 +16,7 @@ from pathlib import Path from tqdm import tqdm import shutil -from time import sleep +from time import sleep, time import ast from subprocess import run from argparse import ArgumentParser @@ -394,17 +394,58 @@ def _monitor_progress(self, log_dir, scans_queued=None): temp_errors = scan_logs_for_errors(log_dir, verbose=False) output_df = self.get_scans_completed() - # Count successful scans - successful_scans = len(np.unique(output_df.get('case_id', []))) - - # Check for errors - new_errors_found = False - if len(temp_errors) > len(errors): - errors = temp_errors - new_errors_found = True - for task_id in errors: + # Update errors from logs + for task_id, msg in temp_errors.items(): + if task_id not in errors: + errors[task_id] = msg print(f"--- ERROR FOUND IN: {task_id} ---") - print(f"Error Message: {errors[task_id]}\n") + print(f"Error Message: {msg}\n") + + # Count successful scans + successful_cases = set(output_df.get('case_id', [])) + successful_scans = len(successful_cases) + + # Get set of completed IDs as integers to handle varying string formats (case_0001 vs case_1) + successful_ids = set() + for cid in successful_cases: + try: + successful_ids.add(int(str(cid).split('_')[-1])) + except (ValueError, IndexError): + pass + + # Check for stalled logs (timeout) + # Iterate over log files in log_dir + try: + for entry in os.scandir(log_dir): + if entry.is_file() and entry.name.startswith('task_') and entry.name.endswith('.log'): + # Extract task id from filename "task_0.log" + try: + # Regex match to be safe + match = re.search(r'task_(\d+)\.log', entry.name) + if match: + task_id_str = match.group(1) + task_id = int(task_id_str) + + # Check if already completed + if task_id in successful_ids: + continue + + # Check if already errored + if entry.name in errors: + continue + + # Check modification time + mtime = entry.stat().st_mtime + if time() - mtime > 8 * 3600: + msg = "Timeout: Exceeded limit of 8 hours" + errors[entry.name] = msg + print(f"--- ERROR FOUND IN: {entry.name} ---") + print(f"Error Message: {msg}\n") + + except Exception: + pass + except FileNotFoundError: + pass # Total completed = successes + failures current_total_completed = successful_scans + len(errors) From c448fd582ee66aae0949726eac86f86a73c9180e Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Thu, 25 Dec 2025 18:45:42 +0000 Subject: [PATCH 2/3] Add timeout logic and error logging to Study._monitor_progress Implements an 8-hour inactivity timeout for parallel simulation logs to detect stalled jobs. All simulation errors (including timeouts) are now logged to 'study_errors.log' in the study root directory to facilitate debugging. Prevents infinite stalling of the progress monitor loop. --- src/VITools/study.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/VITools/study.py b/src/VITools/study.py index 07943d3..92e738b 100644 --- a/src/VITools/study.py +++ b/src/VITools/study.py @@ -13,6 +13,8 @@ import re import sys from datetime import datetime +import os +import re from pathlib import Path from tqdm import tqdm import shutil @@ -388,6 +390,14 @@ def _monitor_progress(self, log_dir, scans_queued=None): output_df = self.get_scans_completed() scans_completed = len(np.unique(output_df.get('case_id', []))) errors = {} + + # Define error log path + try: + study_root = Path(self.metadata.iloc[0]['output_directory']).parent + error_log_path = study_root / 'study_errors.log' + except (IndexError, KeyError): + error_log_path = None + with tqdm(total=scans_queued, initial=scans_completed, desc='Scans completed in parallel') as pbar: while scans_completed < scans_queued: sleep(1) @@ -398,8 +408,11 @@ def _monitor_progress(self, log_dir, scans_queued=None): for task_id, msg in temp_errors.items(): if task_id not in errors: errors[task_id] = msg - print(f"--- ERROR FOUND IN: {task_id} ---") - print(f"Error Message: {msg}\n") + error_msg = f"--- ERROR FOUND IN: {task_id} ---\nError Message: {msg}\n\n" + print(error_msg) + if error_log_path: + with open(error_log_path, 'a') as f: + f.write(error_msg) # Count successful scans successful_cases = set(output_df.get('case_id', [])) @@ -439,8 +452,11 @@ def _monitor_progress(self, log_dir, scans_queued=None): if time() - mtime > 8 * 3600: msg = "Timeout: Exceeded limit of 8 hours" errors[entry.name] = msg - print(f"--- ERROR FOUND IN: {entry.name} ---") - print(f"Error Message: {msg}\n") + error_msg = f"--- ERROR FOUND IN: {entry.name} ---\nError Message: {msg}\n\n" + print(error_msg) + if error_log_path: + with open(error_log_path, 'a') as f: + f.write(error_msg) except Exception: pass From d56b72bc1281282368da576e3fc7b18bf1c95844 Mon Sep 17 00:00:00 2001 From: "Brandon J. Nelson" Date: Thu, 25 Dec 2025 18:56:49 -0500 Subject: [PATCH 3/3] increase version number with recent progress monitoring updates --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5999706..4ec7d12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "VITools" -version = "0.4.0" +version = "0.4.1" authors = [ {name="Brandon Nelson", email="brandon.nelson@fda.hhs.gov"}, ]