From 140877a7f700af01b233014c7e39891ca593f5ea Mon Sep 17 00:00:00 2001 From: Andrei Smirnov Date: Wed, 28 Jan 2026 12:37:12 +0300 Subject: [PATCH 1/2] *: better locking --- app/privkeylock/privkeylock.go | 20 ++++++--------- app/privkeylock/privkeylock_internal_test.go | 18 ++++++------- cmd/createcluster.go | 2 +- cmd/dkg.go | 6 ++++- cmd/edit_addoperators.go | 6 +---- cmd/edit_addvalidators.go | 2 +- cmd/edit_recreateprivatekeys.go | 2 +- cmd/edit_removeoperators.go | 2 +- cmd/edit_replaceoperator.go | 2 +- cmd/testall.go | 2 +- cmd/testpeers.go | 2 +- dkg/disk.go | 8 +++--- dkg/disk_internal_test.go | 2 +- dkg/dkg.go | 27 ++++++++------------ 14 files changed, 46 insertions(+), 55 deletions(-) diff --git a/app/privkeylock/privkeylock.go b/app/privkeylock/privkeylock.go index c2d06ed6d..785d89fc5 100644 --- a/app/privkeylock/privkeylock.go +++ b/app/privkeylock/privkeylock.go @@ -64,7 +64,7 @@ func New(privKeyFilePath, clusterLockFilePath, command string) (Service, error) content, err := os.ReadFile(privKeyFileLockPath) if errors.Is(err, os.ErrNotExist) { //nolint:revive // Empty block is fine. - // No file, we will create it in run + // No file, we will create it in Run } else if err != nil { return Service{}, errors.Wrap(err, "read private key lock file", z.Str("path", privKeyFileLockPath)) } else { @@ -75,7 +75,7 @@ func New(privKeyFilePath, clusterLockFilePath, command string) (Service, error) if time.Since(meta.Timestamp) <= staleDuration { return Service{}, errors.New( - "existing private key lock file found, another charon instance may be running on your machine", + "private key lock file is recently updated, another charon instance may be running on your machine", z.Str("path", privKeyFileLockPath), z.Str("command", meta.Command), z.Str("cluster_lock_hash", meta.ClusterLockHash), @@ -100,7 +100,7 @@ func New(privKeyFilePath, clusterLockFilePath, command string) (Service, error) } } - if err := writePrivateKeyLockFile(privKeyFileLockPath, clusterLockHash, command, time.Now()); err != nil { + if err := overwritePrivateKeyLockFile(privKeyFileLockPath, clusterLockHash, command, time.Now()); err != nil { return Service{}, err } @@ -124,7 +124,8 @@ type Service struct { done chan struct{} // Done is closed when Run exits, which exits the Close goroutine. } -// Run runs the service, updating the lock file every second and deleting it on context cancellation. +// Run runs the service, periodically updating the lock file. +// It does NOT delete the lock file on exit; callers are responsible for any cleanup. func (s Service) Run() error { defer close(s.done) @@ -134,14 +135,9 @@ func (s Service) Run() error { for { select { case <-s.quit: - if err := os.Remove(s.lockFilePath); err != nil { - return errors.Wrap(err, "delete private key lock file") - } - return nil case <-tick.C: - // Overwrite lockfile with new metadata - if err := writePrivateKeyLockFile(s.lockFilePath, s.clusterLockHash, s.command, time.Now()); err != nil { + if err := overwritePrivateKeyLockFile(s.lockFilePath, s.clusterLockHash, s.command, time.Now()); err != nil { return err } } @@ -162,8 +158,8 @@ type metadata struct { ClusterLockHash string `json:"cluster_lock_hash,omitempty"` } -// writePrivateKeyLockFile creates or updates the lock file with the latest metadata. -func writePrivateKeyLockFile(path, clusterLockHash, command string, now time.Time) error { +// overwritePrivateKeyLockFile creates or updates the lock file with the latest metadata. +func overwritePrivateKeyLockFile(path, clusterLockHash, command string, now time.Time) error { b, err := json.Marshal(metadata{Command: command, Timestamp: now, ClusterLockHash: clusterLockHash}) if err != nil { return errors.Wrap(err, "marshal private key lock file") diff --git a/app/privkeylock/privkeylock_internal_test.go b/app/privkeylock/privkeylock_internal_test.go index ebb519e3a..52962c474 100644 --- a/app/privkeylock/privkeylock_internal_test.go +++ b/app/privkeylock/privkeylock_internal_test.go @@ -25,7 +25,7 @@ func TestService(t *testing.T) { writePrivateKeyFile(t, privKeyPath) // Create a stale file that is ignored. - err := writePrivateKeyLockFile(lockPath, "hash123", "test", time.Now().Add(-staleDuration)) + err := overwritePrivateKeyLockFile(lockPath, "hash123", "test", time.Now().Add(-staleDuration)) require.NoError(t, err) // Create a new service. @@ -38,7 +38,7 @@ func TestService(t *testing.T) { // Assert a new service can't be created. _, err = New(privKeyPath, lockFilePath, "test") - require.ErrorContains(t, err, "existing private key lock file found") + require.ErrorContains(t, err, "private key lock file is recently updated") // Delete the file so Run will create it again. require.NoError(t, os.Remove(lockPath)) @@ -55,9 +55,9 @@ func TestService(t *testing.T) { require.NoError(t, eg.Wait()) - // Assert the file is deleted. + // Assert the file is not deleted. _, openErr := os.Open(lockPath) - require.ErrorIs(t, openErr, os.ErrNotExist) + require.NoError(t, openErr) } func TestClusterHashMismatchWithinGracePeriod(t *testing.T) { @@ -71,7 +71,7 @@ func TestClusterHashMismatchWithinGracePeriod(t *testing.T) { writePrivateKeyFile(t, privKeyPath) // Create a stale but within grace period lock file with hash1. - err := writePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-staleDuration-time.Second)) + err := overwritePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-staleDuration-time.Second)) require.NoError(t, err) // Update cluster lock file to hash2. @@ -95,7 +95,7 @@ func TestClusterHashMismatchAfterGracePeriod(t *testing.T) { writePrivateKeyFile(t, privKeyPath) // Create an old lock file with hash1 (beyond grace period). - err := writePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-gracePeriod-time.Second)) + err := overwritePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-gracePeriod-time.Second)) require.NoError(t, err) // Update cluster lock file to hash2. @@ -127,7 +127,7 @@ func TestClusterHashMatchWithinGracePeriod(t *testing.T) { writePrivateKeyFile(t, privKeyPath) // Create a recent lock file with hash1 (within stale duration). - err := writePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-time.Second)) + err := overwritePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-time.Second)) require.NoError(t, err) // Try to create service with same hash - should fail due to staleness check. @@ -136,7 +136,7 @@ func TestClusterHashMatchWithinGracePeriod(t *testing.T) { require.ErrorContains(t, err, "another charon instance may be running") // Now create a stale lock file with same hash (beyond stale duration but within grace period). - err = writePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-staleDuration-time.Second)) + err = overwritePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-staleDuration-time.Second)) require.NoError(t, err) // Should succeed since hash matches and file is stale. @@ -211,7 +211,7 @@ func TestEmptyHashToHashMigration(t *testing.T) { writePrivateKeyFile(t, privKeyPath) // Create a stale lock file with empty cluster hash (migration scenario). - err := writePrivateKeyLockFile(lockPath, "", "test", time.Now().Add(-staleDuration*2)) + err := overwritePrivateKeyLockFile(lockPath, "", "test", time.Now().Add(-staleDuration*2)) require.NoError(t, err) // Should succeed - empty hash shouldn't trigger grace period. diff --git a/cmd/createcluster.go b/cmd/createcluster.go index 68c18b27d..86adabd01 100644 --- a/cmd/createcluster.go +++ b/cmd/createcluster.go @@ -443,7 +443,7 @@ func detectNodeDirs(clusterDir string, nodeAmount int) error { return errors.Wrap(err, "absolute path retrieval") } - if _, err := os.Stat(filepath.Join(absPath, "cluster-lock.json")); err == nil { + if _, err := os.Stat(filepath.Join(absPath, clusterLockFile)); err == nil { return errors.New("existing node directory found, please delete it before running this command", z.Str("node_path", absPath)) } } diff --git a/cmd/dkg.go b/cmd/dkg.go index fc5dc9662..a9898df7b 100644 --- a/cmd/dkg.go +++ b/cmd/dkg.go @@ -15,6 +15,10 @@ import ( "github.com/obolnetwork/charon/dkg" ) +var ( + defaultDKGRelays = []string{"https://4.relay.obol.dev"} +) + func newDKGCmd(runFunc func(context.Context, dkg.Config) error) *cobra.Command { var config dkg.Config @@ -50,7 +54,7 @@ this command at the same time.`, bindKeymanagerFlags(cmd.Flags(), &config.KeymanagerAddr, &config.KeymanagerAuthToken) bindDefDirFlag(cmd.Flags(), &config.DefFile) bindNoVerifyFlag(cmd.Flags(), &config.NoVerify) - bindP2PFlags(cmd, &config.P2P) + bindP2PFlags(cmd, &config.P2P, defaultDKGRelays...) bindLogFlags(cmd.Flags(), &config.Log) bindPublishFlags(cmd.Flags(), &config) bindShutdownDelayFlag(cmd.Flags(), &config.ShutdownDelay) diff --git a/cmd/edit_addoperators.go b/cmd/edit_addoperators.go index f3c672a1b..7a732f265 100644 --- a/cmd/edit_addoperators.go +++ b/cmd/edit_addoperators.go @@ -18,10 +18,6 @@ import ( "github.com/obolnetwork/charon/eth2util/enr" ) -const ( - defaultAlphaRelay = "https://4.relay.obol.dev" -) - func newAddOperatorsCmd(runFunc func(context.Context, dkg.AddOperatorsConfig, dkg.Config) error) *cobra.Command { var ( config dkg.AddOperatorsConfig @@ -52,7 +48,7 @@ func newAddOperatorsCmd(runFunc func(context.Context, dkg.AddOperatorsConfig, dk cmd.Flags().DurationVar(&dkgConfig.Timeout, "timeout", time.Minute, "Timeout for the protocol, should be increased if protocol times out.") bindNoVerifyFlag(cmd.Flags(), &dkgConfig.NoVerify) - bindP2PFlags(cmd, &dkgConfig.P2P, defaultAlphaRelay) + bindP2PFlags(cmd, &dkgConfig.P2P, defaultDKGRelays...) bindLogFlags(cmd.Flags(), &dkgConfig.Log) bindEth1Flag(cmd.Flags(), &dkgConfig.ExecutionEngineAddr) bindShutdownDelayFlag(cmd.Flags(), &dkgConfig.ShutdownDelay) diff --git a/cmd/edit_addvalidators.go b/cmd/edit_addvalidators.go index 0f5516b45..4c8f27ff4 100644 --- a/cmd/edit_addvalidators.go +++ b/cmd/edit_addvalidators.go @@ -73,7 +73,7 @@ func newAddValidatorsCmd(runFunc func(context.Context, addValidatorsConfig) erro // Bind `dkg` flags. bindKeymanagerFlags(cmd.Flags(), &config.DKG.KeymanagerAddr, &config.DKG.KeymanagerAuthToken) bindNoVerifyFlag(cmd.Flags(), &config.DKG.NoVerify) - bindP2PFlags(cmd, &config.DKG.P2P, defaultAlphaRelay) + bindP2PFlags(cmd, &config.DKG.P2P, defaultDKGRelays...) bindLogFlags(cmd.Flags(), &config.DKG.Log) bindShutdownDelayFlag(cmd.Flags(), &config.DKG.ShutdownDelay) bindEth1Flag(cmd.Flags(), &config.DKG.ExecutionEngineAddr) diff --git a/cmd/edit_recreateprivatekeys.go b/cmd/edit_recreateprivatekeys.go index f652ecf50..877db0e00 100644 --- a/cmd/edit_recreateprivatekeys.go +++ b/cmd/edit_recreateprivatekeys.go @@ -43,7 +43,7 @@ func newRecreatePrivateKeysCmd(runFunc func(context.Context, dkg.ReshareConfig) cmd.Flags().DurationVar(&config.DKGConfig.Timeout, "timeout", time.Minute, "Timeout for the protocol, should be increased if protocol times out.") bindNoVerifyFlag(cmd.Flags(), &config.DKGConfig.NoVerify) - bindP2PFlags(cmd, &config.DKGConfig.P2P, defaultAlphaRelay) + bindP2PFlags(cmd, &config.DKGConfig.P2P, defaultDKGRelays...) bindLogFlags(cmd.Flags(), &config.DKGConfig.Log) bindEth1Flag(cmd.Flags(), &config.DKGConfig.ExecutionEngineAddr) bindShutdownDelayFlag(cmd.Flags(), &config.DKGConfig.ShutdownDelay) diff --git a/cmd/edit_removeoperators.go b/cmd/edit_removeoperators.go index d6eb056cb..c2d402f9a 100644 --- a/cmd/edit_removeoperators.go +++ b/cmd/edit_removeoperators.go @@ -50,7 +50,7 @@ func newRemoveOperatorsCmd(runFunc func(context.Context, dkg.RemoveOperatorsConf cmd.Flags().StringSliceVar(&config.ParticipatingENRs, "participating-operator-enrs", nil, "Comma-separated list of operator ENRs participating in the ceremony. Required if --operator-enrs-to-remove specifies more operators to remove than the fault tolerance of the current cluster.") bindNoVerifyFlag(cmd.Flags(), &dkgConfig.NoVerify) - bindP2PFlags(cmd, &dkgConfig.P2P, defaultAlphaRelay) + bindP2PFlags(cmd, &dkgConfig.P2P, defaultDKGRelays...) bindLogFlags(cmd.Flags(), &dkgConfig.Log) bindEth1Flag(cmd.Flags(), &dkgConfig.ExecutionEngineAddr) bindShutdownDelayFlag(cmd.Flags(), &dkgConfig.ShutdownDelay) diff --git a/cmd/edit_replaceoperator.go b/cmd/edit_replaceoperator.go index a17b83782..0ddc599f0 100644 --- a/cmd/edit_replaceoperator.go +++ b/cmd/edit_replaceoperator.go @@ -51,7 +51,7 @@ func newReplaceOperatorCmd(runFunc func(context.Context, dkg.ReplaceOperatorConf cmd.Flags().DurationVar(&dkgConfig.Timeout, "timeout", time.Minute, "Timeout for the protocol, should be increased if protocol times out.") bindNoVerifyFlag(cmd.Flags(), &dkgConfig.NoVerify) - bindP2PFlags(cmd, &dkgConfig.P2P, defaultAlphaRelay) + bindP2PFlags(cmd, &dkgConfig.P2P, defaultDKGRelays...) bindLogFlags(cmd.Flags(), &dkgConfig.Log) bindEth1Flag(cmd.Flags(), &dkgConfig.ExecutionEngineAddr) bindShutdownDelayFlag(cmd.Flags(), &dkgConfig.ShutdownDelay) diff --git a/cmd/testall.go b/cmd/testall.go index 49aea7d3e..c657244fc 100644 --- a/cmd/testall.go +++ b/cmd/testall.go @@ -45,7 +45,7 @@ func newTestAllCmd(runFunc func(context.Context, io.Writer, testAllConfig) error bindTestMEVFlags(cmd, &config.MEV, "mev-") bindTestInfraFlags(cmd, &config.Infra, "infra-") - bindP2PFlags(cmd, &config.Peers.P2P, defaultAlphaRelay) + bindP2PFlags(cmd, &config.Peers.P2P, defaultDKGRelays...) bindTestLogFlags(cmd.Flags(), &config.Peers.Log) wrapPreRunE(cmd, func(cmd *cobra.Command, _ []string) error { diff --git a/cmd/testpeers.go b/cmd/testpeers.go index 5d17952af..1b72892ed 100644 --- a/cmd/testpeers.go +++ b/cmd/testpeers.go @@ -86,7 +86,7 @@ func newTestPeersCmd(runFunc func(context.Context, io.Writer, testPeersConfig) ( bindTestFlags(cmd, &config.testConfig) bindTestPeersFlags(cmd, &config, "") - bindP2PFlags(cmd, &config.P2P, defaultAlphaRelay) + bindP2PFlags(cmd, &config.P2P, defaultDKGRelays...) bindTestLogFlags(cmd.Flags(), &config.Log) wrapPreRunE(cmd, func(cmd *cobra.Command, _ []string) error { diff --git a/dkg/disk.go b/dkg/disk.go index 4d278549a..1ae0dbc30 100644 --- a/dkg/disk.go +++ b/dkg/disk.go @@ -153,7 +153,7 @@ func writeLock(datadir string, lock cluster.Lock) error { } //nolint:gosec // File needs to be read-only for everybody - err = os.WriteFile(path.Join(datadir, "cluster-lock.json"), b, 0o444) // Read-only + err = os.WriteFile(path.Join(datadir, clusterLockFile), b, 0o444) // Read-only if err != nil { return errors.Wrap(err, "write lock") } @@ -179,12 +179,12 @@ func checkClearDataDir(dataDir string) error { } disallowedEntities := map[string]struct{}{ - "validator_keys": {}, - "cluster-lock.json": {}, + validatorKeysSubDir: {}, + clusterLockFile: {}, } necessaryEntities := map[string]bool{ - "charon-enr-private-key": false, + enrPrivateKeyFile: false, } for _, entity := range dirContent { diff --git a/dkg/disk_internal_test.go b/dkg/disk_internal_test.go index a5b4a4334..374af4f53 100644 --- a/dkg/disk_internal_test.go +++ b/dkg/disk_internal_test.go @@ -189,7 +189,7 @@ func TestCheckClearDataDir(t *testing.T) { require.NoError(t, os.WriteFile( - filepath.Join(rootDir, dataDir, "cluster-lock.json"), + filepath.Join(rootDir, dataDir, clusterLockFile), []byte{1, 2, 3}, 0o755, ), diff --git a/dkg/dkg.go b/dkg/dkg.go index 671413d6c..6a232850e 100644 --- a/dkg/dkg.go +++ b/dkg/dkg.go @@ -114,23 +114,18 @@ func Run(ctx context.Context, conf Config) (err error) { ctx = log.WithTopic(ctx, "dkg") - { - // Setup private key locking. - lockSvc, err := privkeylock.New(p2p.KeyPath(conf.DataDir), path.Join(conf.DataDir, "cluster-lock.json"), "charon dkg") - if err != nil { - return err - } - - // Start it async - go func() { - if err := lockSvc.Run(); err != nil { - log.Error(ctx, "Failed to acquire lock on private key file. Another process may be using it or file permissions may be incorrect", err) - } - }() - - // Stop it on exit. - defer lockSvc.Close() + // Setup private key locking and starting async + lockSvc, err := privkeylock.New(p2p.KeyPath(conf.DataDir), path.Join(conf.DataDir, clusterLockFile), "charon dkg") + if err != nil { + return err } + defer lockSvc.Close() // safe to defer Close here, because lockSvc.Run() will be called eventually + + go func() { + if err := lockSvc.Run(); err != nil { + log.Error(ctx, "Failed to update the private key lock file, another process may be using the file or file permissions may be incorrect.", err) + } + }() version.LogInfo(ctx, "Charon DKG starting") From cd357506dd5e617973d80f779046b180f8c2b1b0 Mon Sep 17 00:00:00 2001 From: Andrei Smirnov Date: Wed, 28 Jan 2026 13:40:21 +0300 Subject: [PATCH 2/2] Fixed test --- cmd/dkg.go | 4 +--- dkg/dkg_test.go | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cmd/dkg.go b/cmd/dkg.go index a9898df7b..9b4773fe4 100644 --- a/cmd/dkg.go +++ b/cmd/dkg.go @@ -15,9 +15,7 @@ import ( "github.com/obolnetwork/charon/dkg" ) -var ( - defaultDKGRelays = []string{"https://4.relay.obol.dev"} -) +var defaultDKGRelays = []string{"https://4.relay.obol.dev"} func newDKGCmd(runFunc func(context.Context, dkg.Config) error) *cobra.Command { var config dkg.Config diff --git a/dkg/dkg_test.go b/dkg/dkg_test.go index 602034bdb..d8d69e7e4 100644 --- a/dkg/dkg_test.go +++ b/dkg/dkg_test.go @@ -362,12 +362,12 @@ func testDKG(t *testing.T, def cluster.Definition, dir string, p2pKeys []*k1.Pri testutil.SkipIfBindErr(t, err) testutil.RequireNoError(t, err) - // check that the privkey lock file has been deleted in all nodes at the end of dkg + // check that the privkey lock file has NOT been deleted in all nodes at the end of dkg for i := range len(def.Operators) { lockPath := path.Join(dir, fmt.Sprintf("node%d", i), "charon-enr-private-key.lock") _, openErr := os.Open(lockPath) - require.ErrorIs(t, openErr, os.ErrNotExist) + require.NoError(t, openErr) } if keymanager {