diff --git a/README.md b/README.md index 33c0f03..9f875e0 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,7 @@ USAGE: wait-for-github ci [command options] [ ] OPTIONS: + --action-retries value Number of times to retry failed GitHub Actions before failing. Set to 0 to disable retries. (default: 0) [$GITHUB_ACTION_RETRIES] --check value, -c value [ --check value, -c value ] Check the status of a specific CI check. By default, the status of all required checks is checked. [$GITHUB_CI_CHECKS] --exclude value, -x value [ --exclude value, -x value ] Exclude the status of a specific CI check. Argument ignored if checks are specified individually. By default, the status of all checks is checked. [$GITHUB_CI_EXCLUDE] --help, -h show help (default: false) @@ -93,6 +94,10 @@ OPTIONS: This command will wait for CI checks to finish for a ref or PR URL. If they finish successfully it will exit `0` and otherwise it will exit `1`. +To automatically retry failed GitHub Actions workflows, use the `--action-retries` +flag. For example, `--action-retries 2` will retry failed workflows up to 2 times +before actually failing. + To wait for a specific check to finish, use the `--check` flag. To exclude specific checks from failing the status, use the `--exclude` flag. See below for details of the `ci list` subcommand, which can help determine valid values for @@ -159,8 +164,7 @@ which are not added immediately. #### `action-retries` -Number of times to retry failed GitHub Actions before failing. Only used when -`wait-for` is set to `"pr"`. Optional. Default is `0` (no retries). +Number of times to retry failed GitHub Actions before failing. Optional. Default is `0` (no retries). #### `interval` diff --git a/action.yml b/action.yml index 8ab3342..b8ea8b6 100644 --- a/action.yml +++ b/action.yml @@ -43,7 +43,7 @@ inputs: description: 'The comma-separated names of the checks to wait for. Only used when wait-for is "ci"' required: false action-retries: - description: 'Number of times to retry failed GitHub Actions before failing. Only used when wait-for is "pr"' + description: 'Number of times to retry failed GitHub Actions before failing' required: false default: "0" diff --git a/cmd/wait-for-github/ci.go b/cmd/wait-for-github/ci.go index be52e0c..292d17d 100644 --- a/cmd/wait-for-github/ci.go +++ b/cmd/wait-for-github/ci.go @@ -36,8 +36,9 @@ type ciConfig struct { ref string // options - checks []string - excludes []string + checks []string + excludes []string + actionRetries int } var ( @@ -119,14 +120,20 @@ func parseCIArguments(ctx context.Context, cmd *cli.Command, logger *slog.Logger } return ciConfig{ - owner: owner, - repo: repo, - ref: ref, - checks: cmd.StringSlice("check"), - excludes: cmd.StringSlice("exclude"), + owner: owner, + repo: repo, + ref: ref, + checks: cmd.StringSlice("check"), + excludes: cmd.StringSlice("exclude"), + actionRetries: cmd.Int("action-retries"), }, nil } +type checkCIStatusWithRerun interface { + github.CheckCIStatus + github.RerunFailedWorkflows +} + func handleCIStatus(logger *slog.Logger, status github.CIStatus, url string) cli.ExitCoder { switch status { case github.CIStatusUnknown: @@ -143,32 +150,40 @@ func handleCIStatus(logger *slog.Logger, status github.CIStatus, url string) cli } type checkAllCI struct { - githubClient github.CheckCIStatus - owner string - repo string - ref string - excludes []string - logger *slog.Logger + githubClient checkCIStatusWithRerun + owner string + repo string + ref string + excludes []string + logger *slog.Logger + actionRetries int + retriesDone int } -func (ci checkAllCI) Check(ctx context.Context) error { +func (ci *checkAllCI) Check(ctx context.Context) error { status, err := ci.githubClient.GetCIStatus(ctx, ci.owner, ci.repo, ci.ref, ci.excludes) if err != nil { return err } + if status == github.CIStatusFailed { + var shouldContinue bool + shouldContinue, ci.retriesDone = utils.TryRerunFailedWorkflows(ctx, ci.githubClient, ci.logger, ci.owner, ci.repo, ci.ref, ci.actionRetries, ci.retriesDone) + if shouldContinue { + return nil + } + } + return handleCIStatus(ci.logger, status, urlFor(ci.owner, ci.repo, ci.ref)) } type checkSpecificCI struct { - // same field as checkAllCI, and also a list of checks to wait for - checkAllCI + // same fields as checkAllCI, and also a list of checks to wait for + *checkAllCI checks []string } -func (ci checkSpecificCI) Check(ctx context.Context) error { - var status github.CIStatus - +func (ci *checkSpecificCI) Check(ctx context.Context) error { status, interestingChecks, err := ci.githubClient.GetCIStatusForChecks(ctx, ci.owner, ci.repo, ci.ref, ci.checks) if err != nil { return err @@ -176,6 +191,11 @@ func (ci checkSpecificCI) Check(ctx context.Context) error { if status == github.CIStatusFailed { ci.logger.InfoContext(ctx, "CI check failed, not waiting for other checks", "failed_checks", strings.Join(interestingChecks, ", ")) + var shouldContinue bool + shouldContinue, ci.retriesDone = utils.TryRerunFailedWorkflows(ctx, ci.githubClient, ci.logger, ci.owner, ci.repo, ci.ref, ci.actionRetries, ci.retriesDone) + if shouldContinue { + return nil + } } // we didn't find any failed checks, and not all checks are finished, so @@ -187,20 +207,21 @@ func (ci checkSpecificCI) Check(ctx context.Context) error { return handleCIStatus(ci.logger, status, urlFor(ci.owner, ci.repo, ci.ref)) } -func checkCIStatus(timeoutCtx context.Context, githubClient github.CheckCIStatus, cfg *config, ciConf *ciConfig) error { +func checkCIStatus(timeoutCtx context.Context, githubClient checkCIStatusWithRerun, cfg *config, ciConf *ciConfig) error { logger := cfg.logger.With(logging.OwnerAttr(ciConf.owner), logging.RepoAttr(ciConf.repo), logging.RefAttr(ciConf.ref)) logger.InfoContext(timeoutCtx, "checking CI status") - all := checkAllCI{ - githubClient: githubClient, - owner: ciConf.owner, - repo: ciConf.repo, - ref: ciConf.ref, - excludes: ciConf.excludes, - logger: cfg.logger, + all := &checkAllCI{ + githubClient: githubClient, + owner: ciConf.owner, + repo: ciConf.repo, + ref: ciConf.ref, + excludes: ciConf.excludes, + logger: cfg.logger, + actionRetries: ciConf.actionRetries, } - specific := checkSpecificCI{ + specific := &checkSpecificCI{ checkAllCI: all, checks: ciConf.checks, } @@ -258,6 +279,13 @@ func ciCommand(cfg *config) *cli.Command { cli.EnvVar("GITHUB_CI_EXCLUDE"), ), }, + &cli.IntFlag{ + Name: "action-retries", + Usage: "Number of times to retry failed GitHub Actions before failing. Set to 0 to disable retries.", + Sources: cli.NewValueSourceChain( + cli.EnvVar("GITHUB_ACTION_RETRIES"), + ), + }, }, } } diff --git a/cmd/wait-for-github/ci_test.go b/cmd/wait-for-github/ci_test.go index 59ecaaf..12a213a 100644 --- a/cmd/wait-for-github/ci_test.go +++ b/cmd/wait-for-github/ci_test.go @@ -26,10 +26,13 @@ import ( "github.com/urfave/cli/v3" ) -// FakeCIStatusChecker implements the CheckCIStatus interface. +// FakeCIStatusChecker implements the checkCIStatusWithRerun interface. type FakeCIStatusChecker struct { - status github.CIStatus - err error + status github.CIStatus + err error + RerunCount int + RerunCalledCount int + RerunError error } func (c *FakeCIStatusChecker) GetCIStatus(ctx context.Context, owner, repo string, commitHash string, excludes []string) (github.CIStatus, error) { @@ -44,6 +47,11 @@ func (c *FakeCIStatusChecker) GetDetailedCIStatus(ctx context.Context, owner, re return nil, c.err } +func (c *FakeCIStatusChecker) RerunFailedWorkflowsForCommit(ctx context.Context, owner, repo, commitHash string) (int, error) { + c.RerunCalledCount++ + return c.RerunCount, c.RerunError +} + func TestHandleCIStatus(t *testing.T) { tests := []struct { name string @@ -293,6 +301,10 @@ func (c *UnknownCIStatusChecker) GetDetailedCIStatus(ctx context.Context, owner, return nil, nil } +func (c *UnknownCIStatusChecker) RerunFailedWorkflowsForCommit(ctx context.Context, owner, repo, commitHash string) (int, error) { + return 0, nil +} + func TestUnknownCIStatusRetries(t *testing.T) { t.Parallel() @@ -336,3 +348,146 @@ func TestUrlFor(t *testing.T) { require.Equal(t, "https://github.com/owner/repo/commit/abc123", url) } + +func TestCheckAllCIActionRetries(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + actionRetries int + rerunCount int + rerunError error + expectedExitCode *int + }{ + { + name: "CI failed with action-retries, workflows rerun", + actionRetries: 2, + rerunCount: 1, + rerunError: nil, + expectedExitCode: nil, // Should continue waiting after rerun + }, + { + name: "CI failed with action-retries, no workflows to rerun", + actionRetries: 2, + rerunCount: 0, + rerunError: nil, + expectedExitCode: &one, // Should fail immediately + }, + { + name: "CI failed with action-retries, rerun error continues waiting", + actionRetries: 2, + rerunCount: 0, + rerunError: cli.Exit("rerun failed", 1), + expectedExitCode: nil, // Should continue waiting and retry later + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + fakeCIStatusChecker := &FakeCIStatusChecker{ + status: github.CIStatusFailed, + RerunCount: tt.rerunCount, + RerunError: tt.rerunError, + } + cfg := &config{ + recheckInterval: 1, + logger: testLogger, + } + ciConf := &ciConfig{ + owner: "owner", + repo: "repo", + ref: "ref", + actionRetries: tt.actionRetries, + } + + ctx, cancel := context.WithTimeout(context.Background(), 1) + cancel() + + err := checkCIStatus(ctx, fakeCIStatusChecker, cfg, ciConf) + + if tt.expectedExitCode != nil { + var exitErr cli.ExitCoder + require.ErrorAs(t, err, &exitErr) + require.Equal(t, *tt.expectedExitCode, exitErr.ExitCode()) + } else { + // Context expired before CI could complete, which is expected + // since we cancelled the context immediately + require.Error(t, err) + } + }) + } +} + +func TestCheckSpecificCIActionRetries(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + actionRetries int + rerunCount int + rerunError error + expectedExitCode *int + }{ + { + name: "CI failed with action-retries, workflows rerun", + actionRetries: 2, + rerunCount: 1, + rerunError: nil, + expectedExitCode: nil, // Should continue waiting after rerun + }, + { + name: "CI failed with action-retries, no workflows to rerun", + actionRetries: 2, + rerunCount: 0, + rerunError: nil, + expectedExitCode: &one, // Should fail immediately + }, + { + name: "CI failed with action-retries, rerun error continues waiting", + actionRetries: 2, + rerunCount: 0, + rerunError: cli.Exit("rerun failed", 1), + expectedExitCode: nil, // Should continue waiting and retry later + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + fakeCIStatusChecker := &FakeCIStatusChecker{ + status: github.CIStatusFailed, + RerunCount: tt.rerunCount, + RerunError: tt.rerunError, + } + cfg := &config{ + recheckInterval: 1, + logger: testLogger, + } + ciConf := &ciConfig{ + owner: "owner", + repo: "repo", + ref: "ref", + checks: []string{"check1", "check2"}, + actionRetries: tt.actionRetries, + } + + ctx, cancel := context.WithTimeout(context.Background(), 1) + cancel() + + err := checkCIStatus(ctx, fakeCIStatusChecker, cfg, ciConf) + + if tt.expectedExitCode != nil { + var exitErr cli.ExitCoder + require.ErrorAs(t, err, &exitErr) + require.Equal(t, *tt.expectedExitCode, exitErr.ExitCode()) + } else { + // Context expired before CI could complete, which is expected + // since we cancelled the context immediately + require.Error(t, err) + } + }) + } +} diff --git a/cmd/wait-for-github/pr.go b/cmd/wait-for-github/pr.go index 31cea76..e9ad914 100644 --- a/cmd/wait-for-github/pr.go +++ b/cmd/wait-for-github/pr.go @@ -195,27 +195,10 @@ func (pr *prCheck) Check(ctx context.Context) error { } if status == github.CIStatusFailed { - // Check if we have retries left - if pr.actionRetries > 0 && pr.retriesDone < pr.actionRetries { - pr.logger.InfoContext(ctx, "CI failed, attempting to retry failed GitHub Actions", - "retries_done", pr.retriesDone, "retries_allowed", pr.actionRetries) - - rerunCount, err := pr.githubClient.RerunFailedWorkflowsForCommit(ctx, pr.owner, pr.repo, sha) - if err != nil { - pr.logger.WarnContext(ctx, "failed to rerun workflows, will retry", "error", err) - return nil // Continue waiting and try again - } - - if rerunCount > 0 { - pr.retriesDone++ - pr.logger.InfoContext(ctx, "re-ran failed workflows, continuing to wait", - "workflows_rerun", rerunCount, "retries_done", pr.retriesDone) - return nil // Continue waiting - } - - // No workflows were rerun (maybe non-Action failures), fail immediately - pr.logger.InfoContext(ctx, "CI failed with no GitHub Actions to retry, exiting") - return cli.Exit("CI failed", 1) + var shouldContinue bool + shouldContinue, pr.retriesDone = utils.TryRerunFailedWorkflows(ctx, pr.githubClient, pr.logger, pr.owner, pr.repo, sha, pr.actionRetries, pr.retriesDone) + if shouldContinue { + return nil // Continue waiting } pr.logger.InfoContext(ctx, "CI failed, exiting") diff --git a/internal/utils/utils.go b/internal/utils/utils.go index 3242e49..d7cce1f 100644 --- a/internal/utils/utils.go +++ b/internal/utils/utils.go @@ -24,6 +24,7 @@ import ( "syscall" "time" + "github.com/grafana/wait-for-github/internal/github" "github.com/urfave/cli/v3" ) @@ -31,6 +32,34 @@ type Check interface { Check(ctx context.Context) error } +// TryRerunFailedWorkflows attempts to rerun failed workflows if retries are available. +// Returns whether to continue waiting, and the updated retriesDone count. +func TryRerunFailedWorkflows(ctx context.Context, client github.RerunFailedWorkflows, logger *slog.Logger, owner, repo, ref string, actionRetries, retriesDone int) (bool, int) { + if actionRetries == 0 || retriesDone >= actionRetries { + return false, retriesDone + } + + logger.InfoContext(ctx, "CI failed, attempting to retry failed GitHub Actions", + "retries_done", retriesDone, "retries_allowed", actionRetries) + + rerunCount, err := client.RerunFailedWorkflowsForCommit(ctx, owner, repo, ref) + if err != nil { + logger.WarnContext(ctx, "failed to rerun workflows, will retry", "error", err) + return true, retriesDone + } + + if rerunCount > 0 { + retriesDone++ + logger.InfoContext(ctx, "re-ran failed workflows, continuing to wait", + "workflows_rerun", rerunCount, "retries_done", retriesDone) + return true, retriesDone + } + + // No workflows were rerun (maybe non-Action failures), fail immediately + logger.InfoContext(ctx, "CI failed with no GitHub Actions to retry, exiting") + return false, retriesDone +} + func RunUntilCancelledOrTimeout(ctx context.Context, logger *slog.Logger, check Check, interval time.Duration) error { ticker := time.NewTicker(interval) defer ticker.Stop()