Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/e2e_manual.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ on:
- gpu
- imagepuller-auth
- imagestore
- kds-pcs-downtime
- memdump
- multiple-cpus
- openssl
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/e2e_nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ jobs:
- gpu
- imagepuller-auth
- imagestore
- kds-pcs-downtime
- memdump
- openssl
- peerrecovery
Expand Down
38 changes: 37 additions & 1 deletion dev-docs/endorsement-caching.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,22 @@ The following gives an overview over the request and caching structure we implem
If the CRL can't be obtained from KDS, the cache is checked for an unexpired CRL.
The validator cache is on-disk.

### KDS Unavailability

If the KDS is unreachable, the following table shows the validation outcome depending on the cache state.
An empty cell indicates that the value isn't present.
An asterisk (*) indicates that any value is acceptable (present or not present).
CRL + VCEK indicates that *both* CRL and VCEK are present in the cache.

| Issuer can't reach KDS | Validator can't reach KDS | Issuer cache | Validator cache | Validation |
| :--------------------: | :-----------------------: | :----------: | :-------------: | :--------: |
| | | * | * | Success |
| X | | | * | Success |
| | X | * | | Success |
| X | X | | | Failure |
| X | X | CRL + VCEK | * | Success |
| X | X | * | CRL + VCEK | Success |

## Intel PCS

For successful verification, the client needs to receive the TCBInfo, QeIdentity, and the Root CRL and PCK CRL.
Expand Down Expand Up @@ -81,5 +97,25 @@ The expiration date of the CRLS as well as the expiration date included in the T
4. On the validator side, the go-tdx-guest library will retrieve the collateral from the PCS which is needed to verify the quote.
This includes the TCBInfo, the QeIdentity, as well as the Root CRL and the PCK CRL.
5. On the validator side, if the collateral or CRLs can't be retrieved from the PCS, the go-tdx-guest library will use the collateral from the local cache if present.
If the CRLs can't be retrieved from the PCS, the go-tdx-guest library the cache is checked for an unexpired CRL.
If the CRLs can't be retrieved from the PCS, the cache is checked for an unexpired CRL.
The validator cache is on-disk.

### PCS Unavailability

If the PCS is unreachable, the following table shows the validation outcome depending on the cache state.
An empty cell indicates that the value isn't present.
An asterisk (*) indicates that any value is acceptable (present or not present).
The issuer must be able to obtain the PCK Certificate Chain from PCCS to generate a valid quote.
The validator must be able to obtain the collateral (from PCS or cache) to validate the quote.

| Issuer can't reach PCS | Validator can't reach PCS | Issuer cache | Validator cache | Validation |
| :--------------------: | :-----------------------: | :--------------------: | :-------------: | :--------: |
| | | * | * | Success |
| X | | | * | Failure |
| X | | *PCCS: PCK Cert Chain* | * | Success |
| | X | * | | Failure |
| | X | * | Collateral | Success |
| X | X | | | Failure |
| X | X | *PCCS: PCK Cert Chain* | | Failure |
| X | X | | Collateral | Failure |
| X | X | *PCCS: PCK Cert Chain* | Collateral | Success |
16 changes: 9 additions & 7 deletions e2e/internal/contrasttest/contrasttest.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ func (ct *ContrastTest) RunGenerate(ctx context.Context) error {
errBuf := &bytes.Buffer{}
generate.SetErr(errBuf)

if err := generate.Execute(); err != nil {
if err := generate.ExecuteContext(ctx); err != nil {
return errors.Join(fmt.Errorf("%s", errBuf), err)
}
patchRefValsFunc, err := PatchReferenceValues(ctx, ct.Kubeclient, ct.Platform)
Expand Down Expand Up @@ -421,14 +421,16 @@ func (ct *ContrastTest) Verify(t *testing.T) {
require.NoError(t, ct.RunVerify(t.Context()))
}

// Recover runs the contrast recover subcommand.
// Recover runs the contrast recover subcommand and fails the test if it is not successful.
func (ct *ContrastTest) Recover(t *testing.T) {
require := require.New(t)
require.NoError(t, ct.runAgainstCoordinator(t.Context(), cmd.NewRecoverCmd()))
}

ctx, cancel := context.WithTimeout(t.Context(), 3*time.Minute)
// RunRecover runs the contrast recover subcommand.
func (ct *ContrastTest) RunRecover(ctx context.Context) error {
ctx, cancel := context.WithTimeout(ctx, 3*time.Minute)
defer cancel()

require.NoError(ct.runAgainstCoordinator(ctx, cmd.NewRecoverCmd()))
return ct.runAgainstCoordinator(ctx, cmd.NewRecoverCmd())
}

// MeshCACert returns a CertPool that contains the coordinator mesh CA cert.
Expand Down Expand Up @@ -522,7 +524,7 @@ func (ct *ContrastTest) runAgainstCoordinator(ctx context.Context, cmd *cobra.Co
errBuf := &bytes.Buffer{}
cmd.SetErr(errBuf)

if err := cmd.Execute(); err != nil {
if err := cmd.ExecuteContext(ctx); err != nil {
return fmt.Errorf("running %q: %s", cmd.Use, errBuf)
}
return nil
Expand Down
214 changes: 214 additions & 0 deletions e2e/kds-pcs-downtime/kds-pcs-downtime_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
// Copyright 2025 Edgeless Systems GmbH
// SPDX-License-Identifier: BUSL-1.1

//go:build e2e

package kdspcsdowntime

import (
"context"
"flag"
"fmt"
"net"
"net/http"
"os"
"sync/atomic"
"testing"
"time"

"github.com/edgelesssys/contrast/e2e/internal/contrasttest"
"github.com/edgelesssys/contrast/e2e/internal/kubeclient"
"github.com/edgelesssys/contrast/internal/kuberesource"
"github.com/edgelesssys/contrast/internal/manifest"
"github.com/edgelesssys/contrast/internal/platforms"
"github.com/elazarl/goproxy"
"github.com/stretchr/testify/require"
)

// Host:port endpoints of the external attestation collateral services that the
// test proxy matches against to simulate downtime.
const (
	// kdsAddr is the AMD Key Distribution Service endpoint (SEV-SNP certificates/CRLs).
	kdsAddr = "kdsintf.amd.com:443"
	// pcsAddr is the Intel Provisioning Certification Service endpoint (TDX collateral).
	pcsAddr = "api.trustedservices.intel.com:443"
)

// TestKDSPCSDowntime exercises the endorsement-caching behavior documented in
// dev-docs/endorsement-caching.md when the collateral services are unreachable:
// the AMD KDS for SEV-SNP and the Intel PCS for TDX.
//
// CLI-side downtime is simulated by routing the CLI's HTTPS traffic through a
// local CONNECT proxy (via the https_proxy env var) that can selectively refuse
// connections to the KDS/PCS addresses. Coordinator-side downtime (SNP only) is
// simulated by pointing kdsintf.amd.com at 127.0.0.1 in the coordinator pod's
// /etc/hosts.
func TestKDSPCSDowntime(t *testing.T) {
	platform, err := platforms.FromString(contrasttest.Flags.PlatformStr)
	require.NoError(t, err)
	ct := contrasttest.New(t)

	runtimeHandler, err := manifest.RuntimeHandler(platform)
	require.NoError(t, err)
	resources := kuberesource.CoordinatorBundle()
	resources = kuberesource.PatchRuntimeHandlers(resources, runtimeHandler)
	resources = kuberesource.AddPortForwarders(resources)
	ct.Init(t, resources)

	proxy := goproxy.NewProxyHttpServer()
	server := http.Server{Handler: proxy}
	errCh := make(chan error)

	// If set to true, connections to KDS and PCS will be blocked by the proxy.
	var blockKDSPCS atomic.Bool
	// connectionProxied will be set to true if the proxy performs an HTTP CONNECT to the address of KDS or PCS.
	var connectionProxied atomic.Bool
	proxy.ConnectDial = func(network string, addr string) (net.Conn, error) {
		t.Logf("Proxying connection: %q", addr)
		if (addr == kdsAddr || addr == pcsAddr) && blockKDSPCS.Load() {
			t.Logf("Blocking connection to KDS/PCS %q", addr)
			connectionProxied.Store(true)
			return nil, fmt.Errorf("connection to KDS/PCS %q blocked by test proxy", addr)
		}
		// NOTE(review): t.Context() is canceled at the end of the test, so any
		// in-flight proxied dial is cut off at teardown — presumably intentional.
		return (&net.Dialer{}).DialContext(t.Context(), network, addr)
	}

	// Listen on an ephemeral localhost port so concurrent test runs don't collide.
	proxyListener, err := (&net.ListenConfig{}).Listen(t.Context(), "tcp", "127.0.0.1:")
	require.NoError(t, err)

	t.Cleanup(func() {
		require.NoError(t, server.Close())
		// Serve returns http.ErrServerClosed after Close; anything else is a real failure.
		err := <-errCh
		require.ErrorIs(t, err, http.ErrServerClosed)
	})

	go func() {
		errCh <- server.Serve(proxyListener)
	}()

	// Route all HTTPS traffic of CLI commands started by this test through the proxy.
	t.Setenv("https_proxy", proxyListener.Addr().String())

	require.True(t, t.Run("generate", ct.Generate), "contrast generate needs to succeed for subsequent tests")
	require.True(t, t.Run("apply", ct.Apply), "Kubernetes resources need to be applied for subsequent tests")

	t.Run("kds downtime", func(t *testing.T) {
		if !platforms.IsSNP(platform) {
			t.Skip("KDS downtime test is only applicable to SEV-SNP workloads")
		}

		require := require.New(t)

		ctx, cancel := context.WithTimeout(t.Context(), ct.FactorPlatformTimeout(3*time.Minute))
		defer cancel()

		require.NoError(ct.Kubeclient.WaitForCoordinator(ctx, ct.Namespace))

		//
		// Look at dev-docs/endorsement-caching.md for table of different cases.
		//

		// Coordinator and CLI cache are empty at the beginning.

		coordinatorPods, err := ct.Kubeclient.PodsFromOwner(ctx, ct.Namespace, "StatefulSet", "coordinator")
		require.NoError(err)
		require.NotEmpty(coordinatorPods, "pod not found: %s/%s", ct.Namespace, "coordinator")

		// Block coordinator access to KDS.
		// Save the original /etc/hosts first so it can be restored below.
		etcHosts, stderr, err := ct.Kubeclient.Exec(ctx, ct.Namespace, coordinatorPods[0].Name, []string{"/bin/sh", "-c", "cat /etc/hosts"})
		require.NoError(err, "stderr: %q", stderr)
		_, stderr, err = ct.Kubeclient.Exec(ctx, ct.Namespace, coordinatorPods[0].Name, []string{"/bin/sh", "-c", "echo 127.0.0.1 kdsintf.amd.com >> /etc/hosts"})
		require.NoError(err, "stderr: %q", stderr)

		// Block CLI access to KDS.
		blockKDSPCS.Store(true)

		// Set should fail because neither coordinator nor CLI can reach KDS and there is no cached data.
		// Set loop considers context deadline exceeded from KDS as a retriable error.
		// Lower the timeout so the set loop doesn't exceed the test timeout.
		setCtx, setCancel := context.WithTimeout(ctx, ct.FactorPlatformTimeout(1*time.Minute))
		defer setCancel()
		err = ct.RunSet(setCtx)
		t.Logf("Set error: %v", err)
		require.ErrorContains(err, "transport: authentication handshake failed: context deadline exceeded")
		require.True(connectionProxied.Load(), "expected connection to KDS to be proxied")
		connectionProxied.Store(false)

		// Unblock coordinator access to KDS.
		// NOTE(review): the single-quoted echo breaks if /etc/hosts ever contains
		// a single quote — acceptable for this test environment.
		_, stderr, err = ct.Kubeclient.Exec(ctx, ct.Namespace, coordinatorPods[0].Name, []string{"/bin/sh", "-c", fmt.Sprintf("echo '%s' > /etc/hosts", etcHosts)})
		require.NoError(err, "updating /etc/hosts: stderr: %q", stderr)

		// Set should succeed because coordinator can reach KDS.
		require.NoError(ct.RunSet(ctx))

		// Block coordinator access to KDS again.
		_, stderr, err = ct.Kubeclient.Exec(ctx, ct.Namespace, coordinatorPods[0].Name, []string{"/bin/sh", "-c", "echo 127.0.0.1 kdsintf.amd.com >> /etc/hosts"})
		require.NoError(err, "updating /etc/hosts: stderr: %q", stderr)

		// Verify should succeed because certs are now cached by coordinator.
		require.NoError(ct.RunVerify(ctx))

		// Clear coordinator cache by restarting it.
		require.NoError(ct.Kubeclient.Restart(ctx, kubeclient.StatefulSet{}, ct.Namespace, "coordinator"))
		require.NoError(ct.Kubeclient.WaitForCoordinator(ctx, ct.Namespace))

		// The restart created a fresh pod (with a pristine /etc/hosts); re-resolve it.
		coordinatorPods, err = ct.Kubeclient.PodsFromOwner(ctx, ct.Namespace, "StatefulSet", "coordinator")
		require.NoError(err)
		require.NotEmpty(coordinatorPods, "pod not found: %s/%s", ct.Namespace, "coordinator")

		// Block coordinator access to KDS.
		_, stderr, err = ct.Kubeclient.Exec(ctx, ct.Namespace, coordinatorPods[0].Name, []string{"/bin/sh", "-c", "echo 127.0.0.1 kdsintf.amd.com >> /etc/hosts"})
		require.NoError(err, "updating /etc/hosts: stderr: %q", stderr)

		// Unblock CLI access to KDS.
		blockKDSPCS.Store(false)

		// Recover should succeed because CLI can reach KDS.
		require.NoError(ct.RunRecover(ctx))

		// Block CLI access to KDS again.
		blockKDSPCS.Store(true)

		// Verify should succeed because CLI has now cached the certs.
		require.NoError(ct.RunVerify(ctx))
	})

	t.Run("pcs downtime", func(t *testing.T) {
		if !platforms.IsTDX(platform) {
			t.Skip("PCS downtime test is only applicable to TDX workloads")
		}

		require := require.New(t)

		ctx, cancel := context.WithTimeout(t.Context(), ct.FactorPlatformTimeout(2*time.Minute))
		defer cancel()

		// NOTE(review): this subtest uses a fresh kubeclient instead of
		// ct.Kubeclient as the KDS subtest does — confirm whether intentional.
		c := kubeclient.NewForTest(t)

		require.NoError(c.WaitForCoordinator(ctx, ct.Namespace))

		//
		// We can't test PCS downtime on the issuer side, since PCS/PCCS are accessed from the host.
		// Look at dev-docs/endorsement-caching.md for table of different cases.
		//

		// CLI cache is empty at the beginning. Block CLI access to PCS.
		blockKDSPCS.Store(true)

		// Set should fail because the CLI can't reach the PCS and there is no cached data.
		// Set loop considers context deadline exceeded from PCS as a retriable error.
		// Lower the timeout so the set loop doesn't exceed the test timeout.
		setCtx, setCancel := context.WithTimeout(ctx, ct.FactorPlatformTimeout(1*time.Minute))
		defer setCancel()
		err = ct.RunSet(setCtx)
		t.Logf("Set error: %v", err)
		require.ErrorContains(err, "transport: authentication handshake failed: context deadline exceeded")
		require.True(connectionProxied.Load(), "expected connection to PCS to be proxied")
		connectionProxied.Store(false)

		// Unblock CLI access to PCS.
		blockKDSPCS.Store(false)

		// Set should succeed because the CLI can reach PCS.
		require.NoError(ct.RunSet(ctx))

		// Block CLI access to PCS again.
		blockKDSPCS.Store(true)

		// Verify should succeed because collateral is now cached by CLI.
		require.NoError(ct.RunVerify(ctx))
	})
}

// TestMain registers the shared e2e flags and parses the command line before
// running the test binary. The order is load-bearing: RegisterFlags must run
// before flag.Parse, which must run before m.Run so tests see parsed values.
func TestMain(m *testing.M) {
	contrasttest.RegisterFlags()
	flag.Parse()

	os.Exit(m.Run())
}
1 change: 1 addition & 0 deletions packages/by-name/contrast/e2e/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ buildGoModule {
"e2e/gpu"
"e2e/imagepuller-auth"
"e2e/imagestore"
"e2e/kds-pcs-downtime"
"e2e/memdump"
"e2e/multiple-cpus"
"e2e/openssl"
Expand Down
Loading