From ad652a786cffcdedd333d7d6167c8c20dae0e28f Mon Sep 17 00:00:00 2001 From: Piotr Konopka Date: Fri, 12 Dec 2025 12:01:58 +0100 Subject: [PATCH 1/5] do not attempt to kill a process which never started it's difficult to trigger such an error, but if taskCmd.Start() actually fails, it never sets taskCmd.Process, so trying to use the PID there is a nil access and causes executor to crash. --- executor/executable/controllabletask.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/executor/executable/controllabletask.go b/executor/executable/controllabletask.go index 72bbf737..d0642e87 100644 --- a/executor/executable/controllabletask.go +++ b/executor/executable/controllabletask.go @@ -147,8 +147,6 @@ func (t *ControllableTask) doLaunchTask(taskCmd *exec.Cmd, launchStartTime time. Error("failed to run task") t.sendStatus(t.knownEnvironmentId, mesos.TASK_FAILED, err.Error()) - // fixme: i confirmed on staging, that's a nil access! taskCmd.Process is not set if Start() fails - _ = t.doTermIntKill(-taskCmd.Process.Pid) // fixme: shouldn't we also close pipes, as we do in some other error cases later? return } From 3a2ff6f0f86d50dc5262ce6b6efeb46edee48bb7 Mon Sep 17 00:00:00 2001 From: Piotr Konopka Date: Fri, 12 Dec 2025 12:22:26 +0100 Subject: [PATCH 2/5] document that we don't need to close pipes if command fails to start --- executor/executable/controllabletask.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/executor/executable/controllabletask.go b/executor/executable/controllabletask.go index d0642e87..ea3f16c3 100644 --- a/executor/executable/controllabletask.go +++ b/executor/executable/controllabletask.go @@ -147,7 +147,7 @@ func (t *ControllableTask) doLaunchTask(taskCmd *exec.Cmd, launchStartTime time. Error("failed to run task") t.sendStatus(t.knownEnvironmentId, mesos.TASK_FAILED, err.Error()) - // fixme: shouldn't we also close pipes, as we do in some other error cases later? + // no need to close IO pipes, Cmd.Start does it on failure return } log.WithFields(defaultLogFields).Debug("task launched") From 0ed2a5afc3d670d0e17b6de83779e6eade5fd3d1 Mon Sep 17 00:00:00 2001 From: Piotr Konopka Date: Fri, 12 Dec 2025 12:33:31 +0100 Subject: [PATCH 3/5] be gentler with terminating tasks which fail to reach STANDBY If a task fails to report STANDBY state upon startup, we now perform the standard TERM, INT, KILL sequence instead of just KILL. We also rely on Wait() to close relevant pipes and provide us with exit code. All above is moved into a dedicated method which will be reused in next fixes. --- executor/executable/controllabletask.go | 64 ++++++++++++++++++------- 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/executor/executable/controllabletask.go b/executor/executable/controllabletask.go index ea3f16c3..557e4f37 100644 --- a/executor/executable/controllabletask.go +++ b/executor/executable/controllabletask.go @@ -211,25 +211,14 @@ func (t *ControllableTask) doLaunchTask(taskCmd *exec.Cmd, launchStartTime time. err = t.pollTaskForStandbyState() if err != nil { - t.sendStatus(t.knownEnvironmentId, mesos.TASK_FAILED, err.Error()) - - _ = t.rpc.Close() - t.rpc = nil - - pid := t.knownPid - if pid == 0 { - // The pid was never known through a successful `GetState` in the lifetime - // of this process, so we must rely on the PGID of the containing shell - pid = -taskCmd.Process.Pid - } log.WithFields(defaultLogFields). - Debug("sending SIGKILL (9) to task") - _ = syscall.Kill(pid, syscall.SIGKILL) // fixme: not sure why we do it differently than elsewhere (doTermIntKill) - _ = stdoutIn.Close() - _ = stderrIn.Close() + WithField(infologger.Level, infologger.IL_Support). + WithError(err). + Error("failed to poll task for standby state") - log.WithFields(defaultLogFields). - Debug("task killed") + t.sendStatus(t.knownEnvironmentId, mesos.TASK_FAILED, err.Error()) + + t.cleanupFailedTask(taskCmd) return } @@ -314,6 +303,47 @@ func (t *ControllableTask) doLaunchTask(taskCmd *exec.Cmd, launchStartTime time. return } +func (t *ControllableTask) cleanupFailedTask(taskCmd *exec.Cmd) { + + defaultLogFields := logrus.Fields{ + "taskId": t.ti.TaskID.GetValue(), + "taskName": t.ti.Name, + "partition": t.knownEnvironmentId.String(), + "detector": t.knownDetector, + } + + if taskCmd.Process == nil { + // task never started or was already terminated + return + } + + if t.rpc != nil { + _ = t.rpc.Close() + t.rpc = nil + } + + pid := t.knownPid + if pid == 0 { + // The pid was never known through a successful `GetState` in the lifetime + // of this process, so we must rely on the PGID of the containing shell + pid = -taskCmd.Process.Pid + } + + _ = t.doTermIntKill(-taskCmd.Process.Pid) + + err := taskCmd.Wait() + if err != nil { + log.WithFields(defaultLogFields). + WithField(infologger.Level, infologger.IL_Support). + WithError(err). + Warning("task terminated and exited with error") + } else { + log.WithFields(defaultLogFields). + WithField(infologger.Level, infologger.IL_Support). + Debug("task terminated") + } +} + func (t *ControllableTask) initTaskStdLogging(stdoutIn io.ReadCloser, stderrIn io.ReadCloser) { defaultLogFields := logrus.Fields{ "taskId": t.ti.TaskID.GetValue(), From 3b282ad0c05f5807a0de068744b7012cb2cc6b67 Mon Sep 17 00:00:00 2001 From: Piotr Konopka Date: Fri, 12 Dec 2025 14:52:35 +0100 Subject: [PATCH 4/5] be consistent in cleaning up failed controllable task --- executor/executable/controllabletask.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/executor/executable/controllabletask.go b/executor/executable/controllabletask.go index 557e4f37..e17bd167 100644 --- a/executor/executable/controllabletask.go +++ b/executor/executable/controllabletask.go @@ -194,7 +194,8 @@ func (t *ControllableTask) doLaunchTask(taskCmd *exec.Cmd, launchStartTime time. Error("could not start gRPC client") t.sendStatus(t.knownEnvironmentId, mesos.TASK_FAILED, err.Error()) - _ = t.doTermIntKill(-taskCmd.Process.Pid) + + t.cleanupFailedTask(taskCmd) return } t.rpc.TaskCmd = taskCmd @@ -234,9 +235,7 @@ func (t *ControllableTask) doLaunchTask(taskCmd *exec.Cmd, launchStartTime time. WithError(err). Error("cannot set up event stream from task") t.sendStatus(t.knownEnvironmentId, mesos.TASK_FAILED, err.Error()) - _ = t.rpc.Close() - t.rpc = nil - // fixme: why don't we kill the task in this error case, but we do in others? + t.cleanupFailedTask(taskCmd) return } From 87c9710f1c62174ae9c39404f6acbe1eef01e855 Mon Sep 17 00:00:00 2001 From: Piotr Konopka Date: Fri, 12 Dec 2025 15:06:18 +0100 Subject: [PATCH 5/5] fix nil access while polling controllable task for STANDBY state --- executor/executable/controllabletask.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/executor/executable/controllabletask.go b/executor/executable/controllabletask.go index e17bd167..efbd0df9 100644 --- a/executor/executable/controllabletask.go +++ b/executor/executable/controllabletask.go @@ -447,6 +447,7 @@ func (t *ControllableTask) pollTaskForStandbyState() error { Debug("polling task for STANDBY state reached") response, err := t.rpc.GetState(context.TODO(), &pb.GetStateRequest{}, grpc.EmptyCallOption{}) + reachedState := "UNKNOWN" if err != nil { log.WithError(err). WithFields(defaultLogFields). @@ -458,12 +459,11 @@ func (t *ControllableTask) pollTaskForStandbyState() error { WithField(infologger.Level, infologger.IL_Devel). Debug("task status queried") t.knownPid = int(response.GetPid()) + // NOTE: we acquire the transitioner-dependent STANDBY equivalent state + reachedState = t.rpc.FromDeviceState(response.GetState()) } - // NOTE: we acquire the transitioner-dependent STANDBY equivalent state - // fixme: that's a possible nil access there, because we do not "continue" on error - reachedState := t.rpc.FromDeviceState(response.GetState()) - if reachedState == "STANDBY" && err == nil { + if reachedState == "STANDBY" { log.WithFields(defaultLogFields). WithField(infologger.Level, infologger.IL_Devel). Debug("task running and ready for control input")