Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions cmd/wait-for-github/pr.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ func (pr *prCheck) Check(ctx context.Context) error {
pr.logger.InfoContext(ctx, "CI failed, attempting to retry failed GitHub Actions",
"retries_done", pr.retriesDone, "retries_allowed", pr.actionRetries)

rerunCount, err := pr.githubClient.RerunFailedWorkflowsForCommit(ctx, pr.owner, pr.repo, sha)
rerunCount, hasRunsInProgress, err := pr.githubClient.RerunFailedWorkflowsForCommit(ctx, pr.owner, pr.repo, sha)
if err != nil {
pr.logger.WarnContext(ctx, "failed to rerun workflows, will retry", "error", err)
return nil // Continue waiting and try again
Expand All @@ -213,7 +213,16 @@ func (pr *prCheck) Check(ctx context.Context) error {
return nil // Continue waiting
}

// No workflows were rerun (maybe non-Action failures), fail immediately
if hasRunsInProgress {
// No concluded workflow runs to retry yet, but some are still running.
// This happens when a job fails but other jobs in the same workflow
// run are still in progress — the run won't have a "failure" conclusion
// until all jobs complete. Keep waiting so we can retry on the next poll.
pr.logger.InfoContext(ctx, "CI failed but workflow runs still in progress, will keep waiting")
return nil
}

// No workflows were rerun and none are in progress — fail immediately
pr.logger.InfoContext(ctx, "CI failed with no GitHub Actions to retry, exiting")
return cli.Exit("CI failed", 1)
}
Expand Down
55 changes: 50 additions & 5 deletions cmd/wait-for-github/pr_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,10 @@ type fakeGithubClientPRCheck struct {
getCIStatusError error
rerunFailedWorkflowsError error

CIStatus github.CIStatus
RerunCount int
RerunCalledCount int
CIStatus github.CIStatus
RerunCount int
HasRunsInProgress bool
RerunCalledCount int
}

func (fg *fakeGithubClientPRCheck) IsPRMergedOrClosed(ctx context.Context, owner, repo string, pr int) (string, bool, int64, error) {
Expand All @@ -56,9 +57,9 @@ func (fg *fakeGithubClientPRCheck) GetCIStatus(ctx context.Context, owner, repo
return fg.CIStatus, fg.getCIStatusError
}

func (fg *fakeGithubClientPRCheck) RerunFailedWorkflowsForCommit(ctx context.Context, owner, repo, commitHash string) (int, error) {
func (fg *fakeGithubClientPRCheck) RerunFailedWorkflowsForCommit(ctx context.Context, owner, repo, commitHash string) (int, bool, error) {
fg.RerunCalledCount++
return fg.RerunCount, fg.rerunFailedWorkflowsError
return fg.RerunCount, fg.HasRunsInProgress, fg.rerunFailedWorkflowsError
}

func TestPRCheck(t *testing.T) {
Expand Down Expand Up @@ -176,6 +177,50 @@ func TestPRCheck(t *testing.T) {
}
}

// TestPRCheckRetryWithInProgressWorkflow tests the race condition where GetCIStatus
// reports failure (a job has failed) but the workflow run hasn't concluded yet because
// other jobs are still running. RerunFailedWorkflowsForCommit returns 0 on the first
// call (no concluded runs to retry), and on the next poll the workflow has completed
// and can be retried.
func TestPRCheckRetryWithInProgressWorkflow(t *testing.T) {
t.Parallel()

fakeClient := &fakeGithubClientPRCheck{
CIStatus: github.CIStatusFailed,
RerunCount: 0, // Workflow run hasn't concluded yet, no runs match "failure" conclusion
HasRunsInProgress: true, // Other jobs in the workflow are still running
}

pr := &prCheck{
prConfig: prConfig{
owner: "owner",
repo: "repo",
pr: 1,
actionRetries: 2,
},
githubClient: fakeClient,
logger: testLogger,
}

// First check: CI reports failure (from a failed job), but the workflow run
// is still in progress so RerunFailedWorkflowsForCommit finds nothing to retry.
// Should continue waiting because runs are still in progress.
err := pr.Check(context.Background())
require.NoError(t, err, "should continue waiting when workflow runs are still in progress")
require.Equal(t, 1, fakeClient.RerunCalledCount)
require.Equal(t, 0, pr.retriesDone, "retriesDone should not increment when no workflows were rerun")

// Simulate the workflow run completing — it now has conclusion "failure" and is retryable.
fakeClient.RerunCount = 1
fakeClient.HasRunsInProgress = false

// Second check: CI still failed, workflow run now concluded and gets retried.
err = pr.Check(context.Background())
require.NoError(t, err, "should continue waiting after successful rerun")
require.Equal(t, 2, fakeClient.RerunCalledCount)
require.Equal(t, 1, pr.retriesDone, "retriesDone should increment after successful rerun")
}

type fakeFileWriter struct {
filename string
data []byte
Expand Down
105 changes: 72 additions & 33 deletions internal/github/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ type GetDetailedCIStatus interface {
}

type RerunFailedWorkflows interface {
RerunFailedWorkflowsForCommit(ctx context.Context, owner, repo, commitHash string) (int, error)
RerunFailedWorkflowsForCommit(ctx context.Context, owner, repo, commitHash string) (int, bool, error)
}

type CheckCIStatus interface {
Expand Down Expand Up @@ -99,6 +99,41 @@ const (
CIStatusSkipped
)

// Workflow run and check run status values.
// See: https://docs.github.com/en/rest/actions/workflow-runs
const (
RunStatusCompleted = "completed"
RunStatusActionRequired = "action_required"
RunStatusInProgress = "in_progress"
RunStatusQueued = "queued"
RunStatusRequested = "requested"
RunStatusWaiting = "waiting"
RunStatusPending = "pending"
)

// Workflow run and check run conclusion values.
// See: https://docs.github.com/en/rest/actions/workflow-runs
const (
RunConclusionSuccess = "success"
RunConclusionFailure = "failure"
RunConclusionCancelled = "cancelled"
RunConclusionTimedOut = "timed_out"
RunConclusionSkipped = "skipped"
RunConclusionNeutral = "neutral"
RunConclusionStale = "stale"
RunConclusionActionRequired = "action_required"
RunConclusionStartupFailure = "startup_failure"
)

// Commit status context state values.
// See: https://docs.github.com/en/rest/commits/statuses
const (
StatusStateSuccess = "success"
StatusStateFailure = "failure"
StatusStatePending = "pending"
StatusStateError = "error"
)

func (c CIStatus) String() string {
switch c {
case CIStatusPassed:
Expand Down Expand Up @@ -160,15 +195,15 @@ func (c CheckRun) String() string {

func (c CheckRun) Outcome() CIStatus {
switch strings.ToLower(c.Status) {
case "completed":
case RunStatusCompleted:
switch strings.ToLower(c.Conclusion) {
case "success":
case RunConclusionSuccess:
return CIStatusPassed
case "startup_failure":
case RunConclusionStartupFailure:
return CIStatusFailed
case "failure":
case RunConclusionFailure:
return CIStatusFailed
case "skipped":
case RunConclusionSkipped:
return CIStatusSkipped
default:
return CIStatusUnknown
Expand Down Expand Up @@ -197,11 +232,11 @@ func (s StatusContext) String() string {

func (s StatusContext) Outcome() CIStatus {
switch strings.ToLower(s.State) {
case "success":
case StatusStateSuccess:
return CIStatusPassed
case "failure":
case StatusStateFailure:
return CIStatusFailed
case "error":
case StatusStateError:
return CIStatusFailed
default:
return CIStatusUnknown
Expand Down Expand Up @@ -456,9 +491,9 @@ func (c GHClient) GetCIStatus(ctx context.Context, owner, repoName, ref string,
return CIStatusPassed, nil
}

isSuccess := strings.ToLower(rollup.State) == "success"
isFailure := strings.ToLower(rollup.State) == "failure"
isPending := strings.ToLower(rollup.State) == "pending"
isSuccess := strings.ToLower(rollup.State) == StatusStateSuccess
isFailure := strings.ToLower(rollup.State) == StatusStateFailure
isPending := strings.ToLower(rollup.State) == StatusStatePending

// return early if all checks and statuses are successful. no need to evaluate individual nodes in the response.
if isSuccess {
Expand All @@ -482,8 +517,8 @@ func (c GHClient) GetCIStatus(ctx context.Context, owner, repoName, ref string,
}

for _, node := range nodes {
isCheckFailure := strings.ToLower(node.CheckRun.Conclusion) == "failure"
isStatusFailure := strings.ToLower(node.StatusContext.State) == "failure"
isCheckFailure := strings.ToLower(node.CheckRun.Conclusion) == RunConclusionFailure
isStatusFailure := strings.ToLower(node.StatusContext.State) == StatusStateFailure

checkRunName := node.CheckRun.Name
statusContextName := node.StatusContext.Context
Expand Down Expand Up @@ -558,18 +593,18 @@ func (c GHClient) getOneStatus(ctx context.Context, owner, repoName, ref, check

for _, checkRun := range checkRuns {
switch checkRun.GetStatus() {
case "completed":
case RunStatusCompleted:
switch checkRun.GetConclusion() {
case "success":
case RunConclusionSuccess:
return CIStatusPassed, nil
case "skipped":
case RunConclusionSkipped:
return CIStatusPassed, nil
default:
return CIStatusFailed, nil
}
case "queued":
case RunStatusQueued:
return CIStatusPending, nil
case "in_progress":
case RunStatusInProgress:
return CIStatusPending, nil
}
}
Expand Down Expand Up @@ -610,13 +645,13 @@ func (c GHClient) getOneStatus(ctx context.Context, owner, repoName, ref, check
}

switch status.GetState() {
case "success":
case StatusStateSuccess:
return CIStatusPassed, nil
case "failure":
case StatusStateFailure:
return CIStatusFailed, nil
case "pending":
case StatusStatePending:
return CIStatusPending, nil
case "error":
case StatusStateError:
return CIStatusFailed, nil
}
}
Expand Down Expand Up @@ -687,8 +722,8 @@ func (c GHClient) GetDetailedCIStatus(ctx context.Context, owner, repoName, ref
}

// RerunFailedWorkflowsForCommit finds all failed GitHub Actions workflow runs for a commit and re-runs them.
// Returns the number of workflows that were re-run.
func (c GHClient) RerunFailedWorkflowsForCommit(ctx context.Context, owner, repoName, commitHash string) (int, error) {
// Returns the number of workflows that were re-run and whether any runs are still in progress.
func (c GHClient) RerunFailedWorkflowsForCommit(ctx context.Context, owner, repoName, commitHash string) (int, bool, error) {
listOptions := github.ListOptions{
PerPage: 100,
}
Expand All @@ -700,16 +735,17 @@ func (c GHClient) RerunFailedWorkflowsForCommit(ctx context.Context, owner, repo

var failedRunIDs []int64
seenRuns := make(map[int64]bool)
hasRunsInProgress := false

for {
runs, resp, err := c.client.Actions.ListRepositoryWorkflowRuns(ctx, owner, repoName, opts)
if err != nil {
return 0, fmt.Errorf("failed to list workflow runs: %w", err)
return 0, false, fmt.Errorf("failed to list workflow runs: %w", err)
}

respErr := c.handleResponseError(resp, "ListRepositoryWorkflowRuns", owner, repoName)
if respErr != nil {
return 0, respErr
return 0, false, respErr
}

for _, run := range runs.WorkflowRuns {
Expand All @@ -718,10 +754,13 @@ func (c GHClient) RerunFailedWorkflowsForCommit(ctx context.Context, owner, repo
}
seenRuns[run.GetID()] = true

// See https://docs.github.com/en/rest/actions/workflow-runs?apiVersion=2022-11-28#list-workflow-runs-for-a-workflow
// Can be one of: completed, action_required, cancelled, failure, neutral, skipped, stale, success, timed_out, in_progress, queued, requested, waiting, pending
status := strings.ToLower(run.GetStatus())
if status != RunStatusCompleted {
hasRunsInProgress = true
}

conclusion := strings.ToLower(run.GetConclusion())
retryableConclusions := []string{"failure", "timed_out"}
retryableConclusions := []string{RunConclusionFailure, RunConclusionTimedOut}
if slices.Contains(retryableConclusions, conclusion) {
failedRunIDs = append(failedRunIDs, run.GetID())
}
Expand All @@ -738,15 +777,15 @@ func (c GHClient) RerunFailedWorkflowsForCommit(ctx context.Context, owner, repo
c.logger.InfoContext(ctx, "re-running failed workflow", "run_id", runID)
resp, err := c.client.Actions.RerunFailedJobsByID(ctx, owner, repoName, runID)
if err != nil {
return rerunCount, fmt.Errorf("failed to rerun workflow %d: %w", runID, err)
return rerunCount, hasRunsInProgress, fmt.Errorf("failed to rerun workflow %d: %w", runID, err)
}

respErr := c.handleResponseError(resp, "RerunFailedJobsByID", owner, repoName)
if respErr != nil {
return rerunCount, respErr
return rerunCount, hasRunsInProgress, respErr
}
rerunCount++
}

return rerunCount, nil
return rerunCount, hasRunsInProgress, nil
}
Loading
Loading