From fc6e1db9ad0681206ff96e7f1f5075528c5fa399 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Wed, 24 Dec 2025 20:29:40 +0000
Subject: [PATCH 1/3] Add timeout logic to Study._monitor_progress for stalled
 logs

Detects stalled parallel simulations by checking whether a task's log
file has gone more than 8 hours without being modified. Stalled tasks
are recorded as errors to prevent the progress-monitoring loop from
hanging indefinitely. Case IDs and task-log IDs are compared as
integers, so differing zero-padding (e.g. case_0001 vs. case_1) still
matches.
---
 src/VITools/study.py | 63 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 52 insertions(+), 11 deletions(-)

diff --git a/src/VITools/study.py b/src/VITools/study.py
index fb7fa6f..07943d3 100644
--- a/src/VITools/study.py
+++ b/src/VITools/study.py
@@ -16,7 +16,7 @@
 from pathlib import Path
 from tqdm import tqdm
 import shutil
-from time import sleep
+from time import sleep, time
 import ast
 from subprocess import run
 from argparse import ArgumentParser
@@ -394,17 +394,58 @@ def _monitor_progress(self, log_dir, scans_queued=None):
                 temp_errors = scan_logs_for_errors(log_dir, verbose=False)
                 output_df = self.get_scans_completed()
 
-                # Count successful scans
-                successful_scans = len(np.unique(output_df.get('case_id', [])))
-
-                # Check for errors
-                new_errors_found = False
-                if len(temp_errors) > len(errors):
-                    errors = temp_errors
-                    new_errors_found = True
-                    for task_id in errors:
+                # Update errors from logs
+                for task_id, msg in temp_errors.items():
+                    if task_id not in errors:
+                        errors[task_id] = msg
                         print(f"--- ERROR FOUND IN: {task_id} ---")
-                        print(f"Error Message: {errors[task_id]}\n")
+                        print(f"Error Message: {msg}\n")
+
+                # Count successful scans
+                successful_cases = set(output_df.get('case_id', []))
+                successful_scans = len(successful_cases)
+
+                # Get set of completed IDs as integers to handle varying string formats (case_0001 vs case_1)
+                successful_ids = set()
+                for cid in successful_cases:
+                    try:
+                        successful_ids.add(int(str(cid).split('_')[-1]))
+                    except (ValueError, IndexError):
+                        pass
+
+                # Check for stalled logs (timeout)
+                # Iterate over log files in log_dir
+                try:
+                    for entry in os.scandir(log_dir):
+                        if entry.is_file() and entry.name.startswith('task_') and entry.name.endswith('.log'):
+                             # Extract task id from filename "task_0.log"
+                             try:
+                                 # Regex match to be safe
+                                 match = re.search(r'task_(\d+)\.log', entry.name)
+                                 if match:
+                                     task_id_str = match.group(1)
+                                     task_id = int(task_id_str)
+
+                                     # Check if already completed
+                                     if task_id in successful_ids:
+                                         continue
+
+                                     # Check if already errored
+                                     if entry.name in errors:
+                                         continue
+
+                                     # Check modification time
+                                     mtime = entry.stat().st_mtime
+                                     if time() - mtime > 8 * 3600:
+                                         msg = "Timeout: Exceeded limit of 8 hours"
+                                         errors[entry.name] = msg
+                                         print(f"--- ERROR FOUND IN: {entry.name} ---")
+                                         print(f"Error Message: {msg}\n")
+
+                             except Exception:
+                                 pass
+                except FileNotFoundError:
+                    pass
 
                 # Total completed = successes + failures
                 current_total_completed = successful_scans + len(errors)

From c448fd582ee66aae0949726eac86f86a73c9180e Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Thu, 25 Dec 2025 18:45:42 +0000
Subject: [PATCH 2/3] Add timeout logic and error logging to
 Study._monitor_progress

Stalled jobs are detected through an 8-hour inactivity timeout on the
parallel simulation logs. All simulation errors (including timeouts) are
now also appended to 'study_errors.log' in the study root directory to
facilitate debugging; if the study root cannot be determined from the
study metadata, errors are only printed. The timeout prevents the
progress monitor loop from stalling indefinitely.
---
 src/VITools/study.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/VITools/study.py b/src/VITools/study.py
index 07943d3..92e738b 100644
--- a/src/VITools/study.py
+++ b/src/VITools/study.py
@@ -13,6 +13,8 @@
 import re
 import sys
 from datetime import datetime
+import os
+import re
 from pathlib import Path
 from tqdm import tqdm
 import shutil
@@ -388,6 +390,14 @@ def _monitor_progress(self, log_dir, scans_queued=None):
         output_df = self.get_scans_completed()
         scans_completed = len(np.unique(output_df.get('case_id', [])))
         errors = {}
+
+        # Define error log path
+        try:
+            study_root = Path(self.metadata.iloc[0]['output_directory']).parent
+            error_log_path = study_root / 'study_errors.log'
+        except (IndexError, KeyError):
+            error_log_path = None
+
         with tqdm(total=scans_queued, initial=scans_completed, desc='Scans completed in parallel') as pbar:
             while scans_completed < scans_queued:
                 sleep(1)
@@ -398,8 +408,11 @@ def _monitor_progress(self, log_dir, scans_queued=None):
                 for task_id, msg in temp_errors.items():
                     if task_id not in errors:
                         errors[task_id] = msg
-                        print(f"--- ERROR FOUND IN: {task_id} ---")
-                        print(f"Error Message: {msg}\n")
+                        error_msg = f"--- ERROR FOUND IN: {task_id} ---\nError Message: {msg}\n\n"
+                        print(error_msg)
+                        if error_log_path:
+                            with open(error_log_path, 'a') as f:
+                                f.write(error_msg)
 
                 # Count successful scans
                 successful_cases = set(output_df.get('case_id', []))
@@ -439,8 +452,11 @@ def _monitor_progress(self, log_dir, scans_queued=None):
                                      if time() - mtime > 8 * 3600:
                                          msg = "Timeout: Exceeded limit of 8 hours"
                                          errors[entry.name] = msg
-                                         print(f"--- ERROR FOUND IN: {entry.name} ---")
-                                         print(f"Error Message: {msg}\n")
+                                         error_msg = f"--- ERROR FOUND IN: {entry.name} ---\nError Message: {msg}\n\n"
+                                         print(error_msg)
+                                         if error_log_path:
+                                             with open(error_log_path, 'a') as f:
+                                                 f.write(error_msg)
 
                              except Exception:
                                  pass

From d56b72bc1281282368da576e3fc7b18bf1c95844 Mon Sep 17 00:00:00 2001
From: "Brandon J. Nelson" <brandon.nelson@fda.hhs.gov>
Date: Thu, 25 Dec 2025 18:56:49 -0500
Subject: [PATCH 3/3] Bump version to 0.4.1 for recent progress-monitoring
 updates

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 5999706..4ec7d12 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "VITools"
-version = "0.4.0"
+version = "0.4.1"
 authors = [
   {name="Brandon Nelson", email="brandon.nelson@fda.hhs.gov"},
 ]