From 1988e9e9d04af5e5ae4e36a46cc64997ad27ddc4 Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 4 Dec 2025 17:22:54 +0800 Subject: [PATCH 01/51] alert when blob missing from cl --- cmd/es-node/config.go | 10 +++++++++- ethstorage/archiver/service.go | 2 +- ethstorage/downloader/config.go | 3 +++ ethstorage/downloader/downloader.go | 25 +++++++++++++++++++------ ethstorage/node/node.go | 1 + 5 files changed, 33 insertions(+), 8 deletions(-) diff --git a/cmd/es-node/config.go b/cmd/es-node/config.go index 9c778939..4eb7f2c9 100644 --- a/cmd/es-node/config.go +++ b/cmd/es-node/config.go @@ -255,9 +255,17 @@ func NewL1EndpointConfig(ctx *cli.Context, lg log.Logger) (*eth.L1EndpointConfig } func NewDownloaderConfig(ctx *cli.Context) *downloader.Config { - return &downloader.Config{ + dlCfg := &downloader.Config{ DownloadStart: ctx.GlobalInt64(flags.DownloadStart.Name), DownloadDump: ctx.GlobalString(flags.DownloadDump.Name), DownloadThreadNum: ctx.GlobalInt(flags.DownloadThreadNum.Name), } + + emailConfig, err := email.GetEmailConfig(ctx) + if err != nil { + // email is nice to have but not required by downloader + return dlCfg + } + dlCfg.EmailConfig = *emailConfig + return dlCfg } diff --git a/ethstorage/archiver/service.go b/ethstorage/archiver/service.go index 6d702ef5..3462d71c 100644 --- a/ethstorage/archiver/service.go +++ b/ethstorage/archiver/service.go @@ -119,7 +119,7 @@ func (a *APIService) Start(ctx context.Context) error { return err } r := mux.NewRouter() - // Deprecated + // Deprecated by Fusaka but still used by OP Stack r.HandleFunc("/eth/v1/beacon/blob_sidecars/{id}", a.blobSidecarHandler) // Fusaka r.HandleFunc("/eth/v1/beacon/blobs/{id}", a.blobsHandler) diff --git a/ethstorage/downloader/config.go b/ethstorage/downloader/config.go index b094f199..8d8f1d95 100644 --- a/ethstorage/downloader/config.go +++ b/ethstorage/downloader/config.go @@ -3,8 +3,11 @@ package downloader +import "github.com/ethstorage/go-ethstorage/ethstorage/email" + type Config struct { DownloadStart int64 // which block should we download the blobs from DownloadDump string // where to dump the download blobs DownloadThreadNum int // how many threads that will be used to download the blobs into storage file + EmailConfig email.EmailConfig } diff --git a/ethstorage/downloader/downloader.go b/ethstorage/downloader/downloader.go index f93ff1ee..054f62bf 100644 --- a/ethstorage/downloader/downloader.go +++ b/ethstorage/downloader/downloader.go @@ -22,6 +22,7 @@ import ( "github.com/ethereum/go-ethereum/rpc" "github.com/ethstorage/go-ethstorage/ethstorage" + "github.com/ethstorage/go-ethstorage/ethstorage/email" "github.com/ethstorage/go-ethstorage/ethstorage/eth" ) @@ -70,10 +71,11 @@ type Downloader struct { dlLatestReq chan struct{} dlFinalizedReq chan struct{} - lg log.Logger - done chan struct{} - wg sync.WaitGroup - mu sync.Mutex + emailConfig *email.EmailConfig + lg log.Logger + done chan struct{} + wg sync.WaitGroup + mu sync.Mutex } type blob struct { @@ -109,6 +111,7 @@ func NewDownloader( downloadDump string, minDurationForBlobsRequest uint64, downloadThreadNum int, + emailConfig email.EmailConfig, lg log.Logger, ) *Downloader { sm.DownloadThreadNum = downloadThreadNum @@ -127,6 +130,7 @@ func NewDownloader( done: make(chan struct{}), lastDownloadBlock: downloadStart, downloadedBlobs: 0, + emailConfig: &emailConfig, } } @@ -411,8 +415,17 @@ func (s *Downloader) downloadRange(start int64, end int64, toCache bool) ([]blob for _, elBlob := range elBlock.blobs { clBlob, exists := clBlobs[elBlob.hash] 
if !exists { - s.lg.Error("Did not find the event specified blob in the CL") - + if s.emailConfig != nil { + msg := fmt.Sprintf("Did not find the event specified blob in the CL, blockNumber: %d, kvIndex: %d\n", elBlock.number, elBlob.kvIndex) + msg += "This may indicate that the blob has not been published to the consensus layer yet, or there is an issue with the consensus layer blob availability.\n" + email.SendEmail( + "Blob missing in CL for downloader", + msg, + *s.emailConfig, + s.lg, + ) + } + s.lg.Crit("Did not find the event specified blob in the CL", "blockNumber", elBlock.number, "kvIndex", elBlob.kvIndex) } // encode blobs so that miner can do sampling directly from cache elBlob.data = s.sm.EncodeBlob(clBlob.Data, elBlob.hash, elBlob.kvIndex.Uint64(), s.sm.MaxKvSize()) diff --git a/ethstorage/node/node.go b/ethstorage/node/node.go index 4f20bd49..c8200a0c 100644 --- a/ethstorage/node/node.go +++ b/ethstorage/node/node.go @@ -143,6 +143,7 @@ func (n *EsNode) initL2(ctx context.Context, cfg *Config) error { cfg.Downloader.DownloadDump, cfg.L1.L1MinDurationForBlobsRequest, cfg.Downloader.DownloadThreadNum, + cfg.Downloader.EmailConfig, n.lg, ) return nil From 8303198e62c87ad8736321a2c814af012c706aff Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 4 Dec 2025 17:47:34 +0800 Subject: [PATCH 02/51] update content --- ethstorage/downloader/downloader.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ethstorage/downloader/downloader.go b/ethstorage/downloader/downloader.go index 054f62bf..429e9dcf 100644 --- a/ethstorage/downloader/downloader.go +++ b/ethstorage/downloader/downloader.go @@ -416,10 +416,11 @@ func (s *Downloader) downloadRange(start int64, end int64, toCache bool) ([]blob clBlob, exists := clBlobs[elBlob.hash] if !exists { if s.emailConfig != nil { - msg := fmt.Sprintf("Did not find the event specified blob in the CL, blockNumber: %d, kvIndex: %d\n", elBlock.number, elBlob.kvIndex) - msg += "This may indicate that the blob has not been published to the consensus layer yet, or there is an issue with the consensus layer blob availability.\n" + msg := fmt.Sprintf("From EL event: blockNumber=%d, kvIndex=%d, hash=%s\n", elBlock.number, elBlob.kvIndex, elBlob.hash.Hex()) + msg += "Downloader did not find the blob in CL. \n" + msg += "This may indicate that there is an issue with the consensus layer blob availability.\n" email.SendEmail( - "Blob missing in CL for downloader", + "🛑 Fatal error from es-node: blob missing in CL for downloader", msg, *s.emailConfig, s.lg, From c75fe84318fc2c98b0f74f56f4b7afe9755903aa Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 4 Dec 2025 17:53:29 +0800 Subject: [PATCH 03/51] update content --- ethstorage/downloader/downloader.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ethstorage/downloader/downloader.go b/ethstorage/downloader/downloader.go index 429e9dcf..8c8feeda 100644 --- a/ethstorage/downloader/downloader.go +++ b/ethstorage/downloader/downloader.go @@ -416,8 +416,11 @@ func (s *Downloader) downloadRange(start int64, end int64, toCache bool) ([]blob clBlob, exists := clBlobs[elBlob.hash] if !exists { if s.emailConfig != nil { - msg := fmt.Sprintf("From EL event: blockNumber=%d, kvIndex=%d, hash=%s\n", elBlock.number, elBlob.kvIndex, elBlob.hash.Hex()) - msg += "Downloader did not find the blob in CL. 
\n" + msg := "Downloader did not find the specified blob in CL: \n" + msg += "From the EL event: \n" + msg += fmt.Sprintf(" blockNumber=%d\n", elBlock.number) + msg += fmt.Sprintf(" kvIndex=%d\n", elBlob.kvIndex) + msg += fmt.Sprintf(" hash=%s\n", elBlob.hash.Hex()) msg += "This may indicate that there is an issue with the consensus layer blob availability.\n" email.SendEmail( "🛑 Fatal error from es-node: blob missing in CL for downloader", From aeb3dad0db20f96cc65cac1221441795b7486628 Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 4 Dec 2025 18:11:25 +0800 Subject: [PATCH 04/51] update content --- ethstorage/downloader/downloader.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ethstorage/downloader/downloader.go b/ethstorage/downloader/downloader.go index 8c8feeda..65c30692 100644 --- a/ethstorage/downloader/downloader.go +++ b/ethstorage/downloader/downloader.go @@ -416,14 +416,14 @@ func (s *Downloader) downloadRange(start int64, end int64, toCache bool) ([]blob clBlob, exists := clBlobs[elBlob.hash] if !exists { if s.emailConfig != nil { - msg := "Downloader did not find the specified blob in CL: \n" - msg += "From the EL event: \n" - msg += fmt.Sprintf(" blockNumber=%d\n", elBlock.number) - msg += fmt.Sprintf(" kvIndex=%d\n", elBlob.kvIndex) - msg += fmt.Sprintf(" hash=%s\n", elBlob.hash.Hex()) - msg += "This may indicate that there is an issue with the consensus layer blob availability.\n" + msg := "The downloader couldn't locate the specified blob in the consensus layer. The node is stopped pending resolution. \n" + msg += "Details from the EL event: \n" + msg += fmt.Sprintf(" - blockNumber: %d\n", elBlock.number) + msg += fmt.Sprintf(" - kvIndex: %d\n", elBlob.kvIndex) + msg += fmt.Sprintf(" - hash: %s\n", elBlob.hash.Hex()) + msg += "This may indicate a potential issue with blob availability on the consensus layer. \n" email.SendEmail( - "🛑 Fatal error from es-node: blob missing in CL for downloader", + "🛑 Fatal Error from es-node: Downloader Failed to Locate Blob in CL", msg, *s.emailConfig, s.lg, From f5d4f7de3ba6da25ad0ab12ef041748e976c8257 Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 4 Dec 2025 18:25:40 +0800 Subject: [PATCH 05/51] update content --- ethstorage/downloader/downloader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ethstorage/downloader/downloader.go b/ethstorage/downloader/downloader.go index 65c30692..e3331bce 100644 --- a/ethstorage/downloader/downloader.go +++ b/ethstorage/downloader/downloader.go @@ -416,7 +416,7 @@ func (s *Downloader) downloadRange(start int64, end int64, toCache bool) ([]blob clBlob, exists := clBlobs[elBlob.hash] if !exists { if s.emailConfig != nil { - msg := "The downloader couldn't locate the specified blob in the consensus layer. The node is stopped pending resolution. \n" + msg := "The downloader couldn't locate the specified blob in the consensus layer. The node is stopped pending resolution. 
" msg += "Details from the EL event: \n" msg += fmt.Sprintf(" - blockNumber: %d\n", elBlock.number) msg += fmt.Sprintf(" - kvIndex: %d\n", elBlob.kvIndex) From d5fdad86a53d779ec7232eb48a9edf4533a3c9db Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 4 Dec 2025 18:37:36 +0800 Subject: [PATCH 06/51] check email config --- ethstorage/downloader/downloader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ethstorage/downloader/downloader.go b/ethstorage/downloader/downloader.go index e3331bce..92bad7b8 100644 --- a/ethstorage/downloader/downloader.go +++ b/ethstorage/downloader/downloader.go @@ -415,7 +415,7 @@ func (s *Downloader) downloadRange(start int64, end int64, toCache bool) ([]blob for _, elBlob := range elBlock.blobs { clBlob, exists := clBlobs[elBlob.hash] if !exists { - if s.emailConfig != nil { + if s.emailConfig != nil && s.emailConfig.Check() == nil { msg := "The downloader couldn't locate the specified blob in the consensus layer. The node is stopped pending resolution. " msg += "Details from the EL event: \n" msg += fmt.Sprintf(" - blockNumber: %d\n", elBlock.number) From c4dc8b44ab7177faf91ddb9b291c35e770381919 Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 4 Dec 2025 18:55:35 +0800 Subject: [PATCH 07/51] check email config --- ethstorage/email/email.go | 1 + 1 file changed, 1 insertion(+) diff --git a/ethstorage/email/email.go b/ethstorage/email/email.go index a647dd37..dea744c7 100644 --- a/ethstorage/email/email.go +++ b/ethstorage/email/email.go @@ -22,6 +22,7 @@ type EmailConfig struct { } func (c EmailConfig) Check() error { + fmt.Println("Checking email config:", c.String()) if c.Username == "" { return fmt.Errorf("email username is empty") } From 8be4488401b4239510d6ad960da8cf31af5531ee Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 4 Dec 2025 18:59:39 +0800 Subject: [PATCH 08/51] check email config --- cmd/es-node/config.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/es-node/config.go b/cmd/es-node/config.go index 4eb7f2c9..37c624a5 100644 --- a/cmd/es-node/config.go +++ b/cmd/es-node/config.go @@ -55,6 +55,7 @@ func NewConfig(ctx *cli.Context, lg log.Logger) (*node.Config, error) { } dlConfig := NewDownloaderConfig(ctx) + fmt.Println("Downloader config. EmailConfig:", dlConfig.EmailConfig) minerConfig, err := NewMinerConfig(ctx, client, storageConfig.L1Contract, storageConfig.Miner, lg) if err != nil { return nil, fmt.Errorf("failed to load miner config: %w", err) From a1b0be6ec43fe909e5e4b053fe623177d0159a37 Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 4 Dec 2025 19:08:24 +0800 Subject: [PATCH 09/51] check email config --- cmd/es-node/config.go | 3 +-- ethstorage/downloader/config.go | 2 +- ethstorage/downloader/downloader.go | 2 +- ethstorage/email/email.go | 1 - ethstorage/node/node.go | 2 +- 5 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cmd/es-node/config.go b/cmd/es-node/config.go index 37c624a5..1dc40302 100644 --- a/cmd/es-node/config.go +++ b/cmd/es-node/config.go @@ -55,7 +55,6 @@ func NewConfig(ctx *cli.Context, lg log.Logger) (*node.Config, error) { } dlConfig := NewDownloaderConfig(ctx) - fmt.Println("Downloader config. 
EmailConfig:", dlConfig.EmailConfig) minerConfig, err := NewMinerConfig(ctx, client, storageConfig.L1Contract, storageConfig.Miner, lg) if err != nil { return nil, fmt.Errorf("failed to load miner config: %w", err) @@ -267,6 +266,6 @@ func NewDownloaderConfig(ctx *cli.Context) *downloader.Config { // email is nice to have but not required by downloader return dlCfg } - dlCfg.EmailConfig = *emailConfig + dlCfg.EmailConfig = emailConfig return dlCfg } diff --git a/ethstorage/downloader/config.go b/ethstorage/downloader/config.go index 8d8f1d95..4ce62f44 100644 --- a/ethstorage/downloader/config.go +++ b/ethstorage/downloader/config.go @@ -9,5 +9,5 @@ type Config struct { DownloadStart int64 // which block should we download the blobs from DownloadDump string // where to dump the download blobs DownloadThreadNum int // how many threads that will be used to download the blobs into storage file - EmailConfig email.EmailConfig + EmailConfig *email.EmailConfig } diff --git a/ethstorage/downloader/downloader.go b/ethstorage/downloader/downloader.go index 92bad7b8..e3331bce 100644 --- a/ethstorage/downloader/downloader.go +++ b/ethstorage/downloader/downloader.go @@ -415,7 +415,7 @@ func (s *Downloader) downloadRange(start int64, end int64, toCache bool) ([]blob for _, elBlob := range elBlock.blobs { clBlob, exists := clBlobs[elBlob.hash] if !exists { - if s.emailConfig != nil && s.emailConfig.Check() == nil { + if s.emailConfig != nil { msg := "The downloader couldn't locate the specified blob in the consensus layer. The node is stopped pending resolution. " msg += "Details from the EL event: \n" msg += fmt.Sprintf(" - blockNumber: %d\n", elBlock.number) diff --git a/ethstorage/email/email.go b/ethstorage/email/email.go index dea744c7..a647dd37 100644 --- a/ethstorage/email/email.go +++ b/ethstorage/email/email.go @@ -22,7 +22,6 @@ type EmailConfig struct { } func (c EmailConfig) Check() error { - fmt.Println("Checking email config:", c.String()) if c.Username == "" { return fmt.Errorf("email username is empty") } diff --git a/ethstorage/node/node.go b/ethstorage/node/node.go index c8200a0c..3b77df23 100644 --- a/ethstorage/node/node.go +++ b/ethstorage/node/node.go @@ -143,7 +143,7 @@ func (n *EsNode) initL2(ctx context.Context, cfg *Config) error { cfg.Downloader.DownloadDump, cfg.L1.L1MinDurationForBlobsRequest, cfg.Downloader.DownloadThreadNum, - cfg.Downloader.EmailConfig, + *cfg.Downloader.EmailConfig, n.lg, ) return nil From 38395124c9e5bfa61c5803174eaccdfb7df4b31f Mon Sep 17 00:00:00 2001 From: syntrust Date: Fri, 5 Dec 2025 11:12:25 +0800 Subject: [PATCH 10/51] fix config --- cmd/es-node/config.go | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/cmd/es-node/config.go b/cmd/es-node/config.go index 1dc40302..1aaa1efc 100644 --- a/cmd/es-node/config.go +++ b/cmd/es-node/config.go @@ -54,11 +54,24 @@ func NewConfig(ctx *cli.Context, lg log.Logger) (*node.Config, error) { return nil, fmt.Errorf("failed to load storage config: %w", err) } + emailConfig, err := email.GetEmailConfig(ctx) + if err != nil { + lg.Warn("Failed to load email config, email notifications will be disabled", "error", err) + } dlConfig := NewDownloaderConfig(ctx) + if emailConfig != nil { + dlConfig.EmailConfig = emailConfig + } minerConfig, err := NewMinerConfig(ctx, client, storageConfig.L1Contract, storageConfig.Miner, lg) if err != nil { return nil, fmt.Errorf("failed to load miner config: %w", err) } + if minerConfig.EmailEnabled { + if emailConfig == nil { + return nil, 
fmt.Errorf("email config is required by miner but not loaded") + } + minerConfig.EmailConfig = *emailConfig + } chainId := new(big.Int).SetUint64(ctx.GlobalUint64(flags.ChainId.Name)) lg.Info("Read chain ID of EthStorage network", "chainID", chainId) if minerConfig != nil { @@ -134,13 +147,6 @@ func NewMinerConfig(ctx *cli.Context, client *ethclient.Client, l1Contract, mine if err != nil { return nil, err } - if minerConfig.EmailEnabled { - emailConfig, err := email.GetEmailConfig(ctx) - if err != nil { - return nil, fmt.Errorf("failed to get email config: %w", err) - } - minerConfig.EmailConfig = *emailConfig - } cctx := context.Background() cr := newContractReader(cctx, client, l1Contract, lg) From efc5dc9d694f0d45a817c358e8ed853306545754 Mon Sep 17 00:00:00 2001 From: syntrust Date: Fri, 5 Dec 2025 11:15:18 +0800 Subject: [PATCH 11/51] fix config --- cmd/es-node/config.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/es-node/config.go b/cmd/es-node/config.go index 1aaa1efc..935ae938 100644 --- a/cmd/es-node/config.go +++ b/cmd/es-node/config.go @@ -56,7 +56,7 @@ func NewConfig(ctx *cli.Context, lg log.Logger) (*node.Config, error) { emailConfig, err := email.GetEmailConfig(ctx) if err != nil { - lg.Warn("Failed to load email config, email notifications will be disabled", "error", err) + lg.Warn("Failed to load email config, email notifications will be disabled.", "error", err) } dlConfig := NewDownloaderConfig(ctx) if emailConfig != nil { @@ -66,7 +66,7 @@ func NewConfig(ctx *cli.Context, lg log.Logger) (*node.Config, error) { if err != nil { return nil, fmt.Errorf("failed to load miner config: %w", err) } - if minerConfig.EmailEnabled { + if minerConfig != nil && minerConfig.EmailEnabled { if emailConfig == nil { return nil, fmt.Errorf("email config is required by miner but not loaded") } From b593a435a20527266c9db0bb3492c807a1c281e2 Mon Sep 17 00:00:00 2001 From: syntrust Date: Fri, 5 Dec 2025 11:26:00 +0800 Subject: [PATCH 12/51] fix config --- ethstorage/downloader/downloader.go | 13 +++++-------- ethstorage/node/node.go | 5 +---- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/ethstorage/downloader/downloader.go b/ethstorage/downloader/downloader.go index e3331bce..33eac3fd 100644 --- a/ethstorage/downloader/downloader.go +++ b/ethstorage/downloader/downloader.go @@ -107,14 +107,11 @@ func NewDownloader( db ethdb.Database, sm *ethstorage.StorageManager, cache BlobCache, - downloadStart int64, - downloadDump string, minDurationForBlobsRequest uint64, - downloadThreadNum int, - emailConfig email.EmailConfig, + downloadConfig Config, lg log.Logger, ) *Downloader { - sm.DownloadThreadNum = downloadThreadNum + sm.DownloadThreadNum = downloadConfig.DownloadThreadNum return &Downloader{ Cache: cache, l1Source: l1Source, @@ -122,15 +119,15 @@ func NewDownloader( daClient: daClient, db: db, sm: sm, - dumpDir: downloadDump, + dumpDir: downloadConfig.DownloadDump, minDurationForBlobsRequest: minDurationForBlobsRequest, dlLatestReq: make(chan struct{}, 1), dlFinalizedReq: make(chan struct{}, 1), lg: lg, done: make(chan struct{}), - lastDownloadBlock: downloadStart, + lastDownloadBlock: downloadConfig.DownloadStart, downloadedBlobs: 0, - emailConfig: &emailConfig, + emailConfig: downloadConfig.EmailConfig, } } diff --git a/ethstorage/node/node.go b/ethstorage/node/node.go index 3b77df23..180ba628 100644 --- a/ethstorage/node/node.go +++ b/ethstorage/node/node.go @@ -139,11 +139,8 @@ func (n *EsNode) initL2(ctx context.Context, cfg *Config) error { 
n.db, n.storageManager, n.blobCache, - cfg.Downloader.DownloadStart, - cfg.Downloader.DownloadDump, cfg.L1.L1MinDurationForBlobsRequest, - cfg.Downloader.DownloadThreadNum, - *cfg.Downloader.EmailConfig, + cfg.Downloader, n.lg, ) return nil From 7b75b7f7c4286bd931c3b9af68156f1c427c911f Mon Sep 17 00:00:00 2001 From: syntrust Date: Fri, 5 Dec 2025 11:33:27 +0800 Subject: [PATCH 13/51] fix config --- cmd/es-node/config.go | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/cmd/es-node/config.go b/cmd/es-node/config.go index 935ae938..09ecfc55 100644 --- a/cmd/es-node/config.go +++ b/cmd/es-node/config.go @@ -261,17 +261,9 @@ func NewL1EndpointConfig(ctx *cli.Context, lg log.Logger) (*eth.L1EndpointConfig } func NewDownloaderConfig(ctx *cli.Context) *downloader.Config { - dlCfg := &downloader.Config{ + return &downloader.Config{ DownloadStart: ctx.GlobalInt64(flags.DownloadStart.Name), DownloadDump: ctx.GlobalString(flags.DownloadDump.Name), DownloadThreadNum: ctx.GlobalInt(flags.DownloadThreadNum.Name), } - - emailConfig, err := email.GetEmailConfig(ctx) - if err != nil { - // email is nice to have but not required by downloader - return dlCfg - } - dlCfg.EmailConfig = emailConfig - return dlCfg } From e7ce75fa4b5203a97dfcded15f30c606ad9ac693 Mon Sep 17 00:00:00 2001 From: syntrust Date: Fri, 5 Dec 2025 11:48:35 +0800 Subject: [PATCH 14/51] refactor --- ethstorage/downloader/downloader.go | 34 +++++++++++++++++------------ 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/ethstorage/downloader/downloader.go b/ethstorage/downloader/downloader.go index 33eac3fd..322a9b36 100644 --- a/ethstorage/downloader/downloader.go +++ b/ethstorage/downloader/downloader.go @@ -412,20 +412,7 @@ func (s *Downloader) downloadRange(start int64, end int64, toCache bool) ([]blob for _, elBlob := range elBlock.blobs { clBlob, exists := clBlobs[elBlob.hash] if !exists { - if s.emailConfig != nil { - msg := "The downloader couldn't locate the specified blob in the consensus layer. The node is stopped pending resolution. " - msg += "Details from the EL event: \n" - msg += fmt.Sprintf(" - blockNumber: %d\n", elBlock.number) - msg += fmt.Sprintf(" - kvIndex: %d\n", elBlob.kvIndex) - msg += fmt.Sprintf(" - hash: %s\n", elBlob.hash.Hex()) - msg += "This may indicate a potential issue with blob availability on the consensus layer. \n" - email.SendEmail( - "🛑 Fatal Error from es-node: Downloader Failed to Locate Blob in CL", - msg, - *s.emailConfig, - s.lg, - ) - } + s.notifyBlobMissing(elBlock.number, elBlob.kvIndex.Uint64(), elBlob.hash) s.lg.Crit("Did not find the event specified blob in the CL", "blockNumber", elBlock.number, "kvIndex", elBlob.kvIndex) } // encode blobs so that miner can do sampling directly from cache @@ -498,3 +485,22 @@ func (s *Downloader) eventsToBlocks(events []types.Log) ([]*blockBlobs, error) { return blocks, nil } + +func (s *Downloader) notifyBlobMissing(blockNumber uint64, kvIndex uint64, hash common.Hash) { + if s.emailConfig == nil { + return + } + + msg := "The downloader couldn't locate the specified blob in the consensus layer. The node is stopped pending resolution. " + msg += "Details from the EL event: \n" + msg += fmt.Sprintf(" - blockNumber: %d\n", blockNumber) + msg += fmt.Sprintf(" - kvIndex: %d\n", kvIndex) + msg += fmt.Sprintf(" - hash: %s\n", hash.Hex()) + msg += "This may indicate a potential issue with blob availability on the consensus layer. 
\n" + email.SendEmail( + "🛑 Fatal Error from es-node: Downloader Failed to Locate Blob in CL", + msg, + *s.emailConfig, + s.lg, + ) +} From 5b0fd98b1eb7f8c35b8438f8d389f67fa0f3c059 Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 9 Dec 2025 16:56:33 +0800 Subject: [PATCH 15/51] refactor --- ethstorage/scanner/worker.go | 153 ++++++++++++++++++----------------- 1 file changed, 81 insertions(+), 72 deletions(-) diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index d754f491..274c629f 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -88,94 +88,103 @@ func (s *Worker) ScanBatch(ctx context.Context, mismatched mismatchTracker) (*st default: } - var found bool var commit common.Hash copy(commit[:], meta[32-es.HashSizeInContract:32]) kvIndex := kvsInBatch[i] - if s.cfg.Mode == modeCheckMeta { - // Check meta only - var metaLocal []byte - metaLocal, found, err = s.sm.TryReadMeta(kvIndex) - if err != nil { - s.lg.Error("Scanner: failed to read meta", "kvIndex", kvIndex, "error", err) - scanErrors.add(kvIndex, fmt.Errorf("failed to read meta: %w", err)) - continue - } - err = es.CompareCommits(commit.Bytes(), metaLocal) - } else if s.cfg.Mode == modeCheckBlob { - // Query blob and check meta from storage - _, found, err = s.sm.TryRead(kvIndex, int(s.sm.MaxKvSize()), commit) - } else { - s.lg.Error("Scanner: invalid scanner mode", "mode", s.cfg.Mode) - return sts, scanErrors, fmt.Errorf("invalid scanner mode: %d", s.cfg.Mode) - } - - if found && err == nil { + s.processScanKv(kvIndex, commit, &mismatched, scanErrors) + } - // Update status for previously mismatched entries that are now valid - if status, exists := mismatched[kvIndex]; exists { - switch status { - case failed: - mismatched.markRecovered(kvIndex) - // Clear the error state - scanErrors.nil(kvIndex) - s.lg.Info("Scanner: previously failed KV recovered", "kvIndex", kvIndex) - case pending: - delete(mismatched, kvIndex) - s.lg.Info("Scanner: previously pending KV recovered", "kvIndex", kvIndex) - } - } + s.nextIndexOfKvIdx = batchEndExclusive + if len(kvsInBatch) > 0 { + s.lg.Info("Scanner: scan batch done", "scanned", shortPrt(kvsInBatch), "count", len(kvsInBatch), "nextIndexOfKvIdx", s.nextIndexOfKvIdx) + } - // Happy path - s.lg.Debug("Scanner: KV check completed successfully", "kvIndex", kvIndex, "commit", commit) - continue - } + sts.mismatched = mismatched - if !found { - // The shard is not stored locally - scanErrors.add(kvIndex, fmt.Errorf("shard not found locally: commit=%x", commit)) - s.lg.Error("Scanner: blob not found locally", "kvIndex", kvIndex, "commit", commit) - continue - } + return sts, scanErrors, nil +} +func (s *Worker) processScanKv( + kvIndex uint64, + commit common.Hash, + mismatched *mismatchTracker, + scanErrors scanErrors, +) { + var err error + var found bool + if s.cfg.Mode == modeCheckMeta { + // Check meta only + var metaLocal []byte + metaLocal, found, err = s.sm.TryReadMeta(kvIndex) if err != nil { - var commitErr *es.CommitMismatchError - if errors.As(err, &commitErr) { - s.lg.Warn("Scanner: commit mismatch detected", "kvIndex", kvIndex, "error", err) - - // Only fix repeated mismatches - if mismatched.shouldFix(kvIndex) { - s.lg.Info("Scanner: mismatch again, attempting to fix blob", "kvIndex", kvIndex, "commit", commit) - if fixErr := s.fixKv(kvIndex, commit); fixErr != nil { - mismatched.markFailed(kvIndex) - s.lg.Error("Scanner: failed to fix blob", "kvIndex", kvIndex, "error", fixErr) - scanErrors.add(kvIndex, fmt.Errorf("failed to fix blob: 
%w", fixErr)) - } else { - s.lg.Info("Scanner: blob fixed successfully", "kvIndex", kvIndex) - mismatched.markFixed(kvIndex) - scanErrors.nil(kvIndex) - } - } else { + s.lg.Error("Scanner: failed to read meta", "kvIndex", kvIndex, "error", err) + scanErrors.add(kvIndex, fmt.Errorf("failed to read meta: %w", err)) + return + } + err = es.CompareCommits(commit.Bytes(), metaLocal) + } else if s.cfg.Mode == modeCheckBlob { + // Query blob and check meta from storage + _, found, err = s.sm.TryRead(kvIndex, int(s.sm.MaxKvSize()), commit) + } else { + s.lg.Crit("Scanner: invalid scanner mode", "mode", s.cfg.Mode) + } - // Mark but skip on the first occurrence as it may be caused by KV update and delayed download - mismatched.markPending(kvIndex) - s.lg.Info("Scanner: first-time mismatch, skipping fix attempt", "kvIndex", kvIndex) - } - } else { - s.lg.Error("Scanner: unexpected error occurred", "kvIndex", kvIndex, "error", err) - scanErrors.add(kvIndex, fmt.Errorf("unexpected error: %w", err)) + if found && err == nil { + + // Update status for previously mismatched entries that are now valid + if status, exists := (*mismatched)[kvIndex]; exists { + switch status { + case failed: + mismatched.markRecovered(kvIndex) + // Clear the error state + scanErrors.nil(kvIndex) + s.lg.Info("Scanner: previously failed KV recovered", "kvIndex", kvIndex) + case pending: + delete(*mismatched, kvIndex) + s.lg.Info("Scanner: previously pending KV recovered", "kvIndex", kvIndex) } } + + // Happy path + s.lg.Debug("Scanner: KV check completed successfully", "kvIndex", kvIndex, "commit", commit) + return } - s.nextIndexOfKvIdx = batchEndExclusive - if len(kvsInBatch) > 0 { - s.lg.Info("Scanner: scan batch done", "scanned", shortPrt(kvsInBatch), "count", len(kvsInBatch), "nextIndexOfKvIdx", s.nextIndexOfKvIdx) + if !found { + // The shard is not stored locally + scanErrors.add(kvIndex, fmt.Errorf("shard not found locally: commit=%x", commit)) + s.lg.Error("Scanner: blob not found locally", "kvIndex", kvIndex, "commit", commit) + return } - sts.mismatched = mismatched + if err != nil { + var commitErr *es.CommitMismatchError + if errors.As(err, &commitErr) { + s.lg.Warn("Scanner: commit mismatch detected", "kvIndex", kvIndex, "error", err) + + // Only fix repeated mismatches + if mismatched.shouldFix(kvIndex) { + s.lg.Info("Scanner: mismatch again, attempting to fix blob", "kvIndex", kvIndex, "commit", commit) + if fixErr := s.fixKv(kvIndex, commit); fixErr != nil { + mismatched.markFailed(kvIndex) + s.lg.Error("Scanner: failed to fix blob", "kvIndex", kvIndex, "error", fixErr) + scanErrors.add(kvIndex, fmt.Errorf("failed to fix blob: %w", fixErr)) + } else { + s.lg.Info("Scanner: blob fixed successfully", "kvIndex", kvIndex) + mismatched.markFixed(kvIndex) + scanErrors.nil(kvIndex) + } + } else { - return sts, scanErrors, nil + // Mark but skip on the first occurrence as it may be caused by KV update and delayed download + mismatched.markPending(kvIndex) + s.lg.Info("Scanner: first-time mismatch, skipping fix attempt", "kvIndex", kvIndex) + } + } else { + s.lg.Error("Scanner: unexpected error occurred", "kvIndex", kvIndex, "error", err) + scanErrors.add(kvIndex, fmt.Errorf("unexpected error: %w", err)) + } + } } func (s *Worker) fixKv(kvIndex uint64, commit common.Hash) error { From f8d0371d96d1de23cbfd76392fd1bceb0b6d2ce1 Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 9 Dec 2025 17:57:13 +0800 Subject: [PATCH 16/51] check shard exist --- ethstorage/downloader/downloader.go | 6 ++++++ 1 file changed, 6 
insertions(+)

diff --git a/ethstorage/downloader/downloader.go b/ethstorage/downloader/downloader.go
index 322a9b36..36249460 100644
--- a/ethstorage/downloader/downloader.go
+++ b/ethstorage/downloader/downloader.go
@@ -12,6 +12,7 @@ import (
 	"math/big"
 	"os"
 	"path/filepath"
+	"slices"
 	"sync"
 	"time"
 
@@ -410,6 +411,11 @@ func (s *Downloader) downloadRange(start int64, end int64, toCache bool) ([]blob
 		}
 
 		for _, elBlob := range elBlock.blobs {
+			shard := elBlob.kvIndex.Uint64() >> s.sm.KvEntriesBits()
+			if !slices.Contains(s.sm.Shards(), shard) {
+				s.lg.Warn("Shard not initialized locally for the kvIndex, skip this blob", "kvIndex", elBlob.kvIndex.Uint64(), "shard", shard)
+				continue
+			}
 			clBlob, exists := clBlobs[elBlob.hash]
 			if !exists {
 				s.notifyBlobMissing(elBlock.number, elBlob.kvIndex.Uint64(), elBlob.hash)

From d5b852423ea13aab9eecb85fc54baf86d0f81fae Mon Sep 17 00:00:00 2001
From: syntrust
Date: Tue, 9 Dec 2025 19:09:28 +0800
Subject: [PATCH 17/51] skip empty shards

---
 ethstorage/scanner/worker.go      |  5 +++++
 ethstorage/scanner/worker_test.go | 11 +++++++++++
 2 files changed, 16 insertions(+)

diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go
index 274c629f..aaf69fe5 100644
--- a/ethstorage/scanner/worker.go
+++ b/ethstorage/scanner/worker.go
@@ -199,6 +199,11 @@ func getKvsInBatch(shards []uint64, kvEntries, lastKvIdx, batchSize, batchStartI
 	var totalEntries uint64
 	// Shard indices are sorted but may not be continuous: e.g. [0, 1, 3, 4] indicates shard 2 is missing
 	for _, shardIndex := range shards {
+		shardOfLastFinalizedKv := lastKvIdx / kvEntries
+		if shardIndex > shardOfLastFinalizedKv {
+			// Skip empty shards
+			break
+		}
 		// The last shard may contain fewer than the full kvEntries
 		if shardIndex == lastKvIdx/kvEntries {
 			totalEntries += lastKvIdx%kvEntries + 1
diff --git a/ethstorage/scanner/worker_test.go b/ethstorage/scanner/worker_test.go
index e8fe6c66..dfd659d1 100644
--- a/ethstorage/scanner/worker_test.go
+++ b/ethstorage/scanner/worker_test.go
@@ -19,6 +19,17 @@ func TestGetKvsInBatch(t *testing.T) {
 		expectedTotal    uint64
 		expectedBatchEnd uint64
 	}{
+		{
+			name:             "skip empty shards",
+			shards:           []uint64{0, 2},
+			kvEntries:        8,
+			lastKvIdx:        12,
+			batchSize:        100,
+			batchStartIndex:  0,
+			expectedKvs:      []uint64{0, 1, 2, 3, 4, 5, 6, 7},
+			expectedTotal:    8,
+			expectedBatchEnd: 8,
+		},
 		{
 			name:             "1 shard batch 1",
 			shards:           []uint64{0},

From b4d7e75d7bea1aa63dfe7c9012213040038ee6e6 Mon Sep 17 00:00:00 2001
From: syntrust
Date: Wed, 10 Dec 2025 18:59:39 +0800
Subject: [PATCH 18/51] support mode 3

---
 ethstorage/scanner/config.go  |   6 +-
 ethstorage/scanner/scanner.go | 119 +++++++++++++++++++++---------
 ethstorage/scanner/worker.go  |  47 ++++++------
 3 files changed, 117 insertions(+), 55 deletions(-)

diff --git a/ethstorage/scanner/config.go b/ethstorage/scanner/config.go
index 14057afb..dc05bf3e 100644
--- a/ethstorage/scanner/config.go
+++ b/ethstorage/scanner/config.go
@@ -38,7 +38,7 @@ func CLIFlags() []cli.Flag {
 	flags := []cli.Flag{
 		cli.IntFlag{
 			Name:   ModeFlagName,
-			Usage:  "Data scan mode, 0: disabled, 1: check meta, 2: check blob",
+			Usage:  "Data scan mode, 0: disabled, 1: check meta, 2: check blob, 3: hybrid",
 			EnvVar: scannerEnv("MODE"),
 			Value:  1,
 		},
@@ -50,7 +50,7 @@ func CLIFlags() []cli.Flag {
 		},
 		cli.IntFlag{
 			Name:   IntervalFlagName,
-			Usage:  fmt.Sprintf("Data scan interval in minutes, minimum %d (default)", defaultInterval),
+			Usage:  fmt.Sprintf("Data scan interval in minutes, minimum %d (default). 
In hybrid mode, the interval applies to meta mode in minutes, blob mode in days", defaultInterval), EnvVar: scannerEnv("INTERVAL"), Value: defaultInterval, }, @@ -63,7 +63,7 @@ func NewConfig(ctx *cli.Context) *Config { if mode == modeDisabled { return nil } - if mode != modeCheckMeta && mode != modeCheckBlob { + if mode != modeCheckMeta && mode != modeCheckBlob && mode != modeCheckBlob+modeCheckMeta { panic(fmt.Sprintf("invalid scanner mode: %d", mode)) } if interval := ctx.GlobalInt(IntervalFlagName); interval < defaultInterval { diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index de3b7142..1aa8efbf 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -15,16 +15,23 @@ import ( ) type Scanner struct { - worker *Worker - feed *event.Feed - interval time.Duration - ctx context.Context - cancel context.CancelFunc - wg sync.WaitGroup - running bool - mu sync.Mutex - lg log.Logger - scanStats ScanStats + worker *Worker + feed *event.Feed + interval time.Duration + cfg Config + ctx context.Context + cancel context.CancelFunc + wg sync.WaitGroup + running bool + mu sync.Mutex + lg log.Logger + scanStats ScanStats + scanPermit chan struct{} // to ensure only one scan at a time +} + +type scanLoopState struct { + mode int + nextIndex uint64 } func New( @@ -38,14 +45,17 @@ func New( ) *Scanner { cctx, cancel := context.WithCancel(ctx) scanner := &Scanner{ - worker: NewWorker(sm, fetchBlob, l1, cfg, lg), - feed: feed, - interval: time.Minute * time.Duration(cfg.Interval), - ctx: cctx, - cancel: cancel, - lg: lg, - scanStats: ScanStats{0, 0}, + worker: NewWorker(sm, fetchBlob, l1, lg), + feed: feed, + interval: time.Minute * time.Duration(cfg.Interval), + cfg: cfg, + ctx: cctx, + cancel: cancel, + lg: lg, + scanStats: ScanStats{0, 0}, + scanPermit: make(chan struct{}, 1), } + scanner.scanPermit <- struct{}{} scanner.wg.Add(1) go scanner.update() return scanner @@ -89,35 +99,63 @@ func (s *Scanner) start() { s.running = true s.mu.Unlock() - s.wg.Add(1) + if s.cfg.Mode == modeCheckBlob+modeCheckMeta { + // TODO: blobInterval := time.Hour * 24 * time.Duration(s.cfg.Interval) + // test only + blobInterval := time.Minute * 9 * time.Duration(s.cfg.Interval) + s.lg.Info("Scanner running in hybrid mode", "metaInterval", s.interval, "blobInterval", blobInterval) + s.launchScanLoop(&scanLoopState{mode: modeCheckBlob}, blobInterval) + s.launchScanLoop(&scanLoopState{mode: modeCheckMeta}, s.interval) + return + } + s.launchScanLoop(&scanLoopState{mode: s.cfg.Mode}, s.interval) +} + +func (s *Scanner) launchScanLoop(state *scanLoopState, interval time.Duration) { + s.wg.Add(1) go func() { defer s.wg.Done() - s.lg.Info("Scanner started", "mode", s.worker.cfg.Mode, "interval", s.interval.String(), "batchSize", s.worker.cfg.BatchSize) + s.lg.Info("Scanner started", "mode", state.mode, "interval", interval.String(), "batchSize", s.cfg.BatchSize) - mainTicker := time.NewTicker(s.interval) - reportTicker := time.NewTicker(1 * time.Minute) + mainTicker := time.NewTicker(interval) defer mainTicker.Stop() + + reportTicker := time.NewTicker(time.Minute) defer reportTicker.Stop() - sts, errCache, err := s.doWork(mismatchTracker{}) + + sts := newStats() + errCache := scanErrors{} + if !s.acquireScanPermit(state.mode) { + return + } + initSts, initErrs, err := s.doWork(state, mismatchTracker{}) + s.releaseScanPermit() if err != nil { - s.lg.Error("Initial scan failed", "error", err) + s.lg.Error("Scanner: initial scan failed", "mode", state.mode, "error", err) + } else 
{ + sts = initSts + errCache = initErrs } for { select { case <-mainTicker.C: - newSts, scanErrs, err := s.doWork(sts.mismatched.clone()) + if !s.acquireScanPermit(state.mode) { + return + } + newSts, scanErrs, err := s.doWork(state, sts.mismatched.clone()) + s.releaseScanPermit() if err != nil { - s.lg.Error("Scanner: scan batch failed", "error", err) + s.lg.Error("Scanner: scan batch failed", "mode", state.mode, "error", err) continue } sts = newSts errCache.merge(scanErrs) case <-reportTicker.C: - s.logStats(sts) + s.logStats(state.mode, sts) for i, e := range errCache { s.lg.Info("Scanner error happened earlier", "kvIndex", i, "error", e) } @@ -129,8 +167,27 @@ func (s *Scanner) start() { }() } -func (s *Scanner) logStats(sts *stats) { +func (s *Scanner) acquireScanPermit(mode int) bool { + s.lg.Info("Scanner acquiring scan permit for mode", "mode", mode) + select { + case <-s.ctx.Done(): + return false + case <-s.scanPermit: + s.lg.Info("Scanner acquired scan permit for mode", "mode", mode) + return true + } +} + +func (s *Scanner) releaseScanPermit() { + select { + case s.scanPermit <- struct{}{}: + default: + } +} + +func (s *Scanner) logStats(mode int, sts *stats) { logFields := []any{ + "mode", mode, "localKvs", sts.localKvs, "localKvsCount", sts.total, } @@ -168,15 +225,15 @@ func (s *Scanner) Close() { s.wg.Wait() } -func (s *Scanner) doWork(tracker mismatchTracker) (*stats, scanErrors, error) { - s.lg.Debug("Scan batch started") +func (s *Scanner) doWork(state *scanLoopState, tracker mismatchTracker) (*stats, scanErrors, error) { start := time.Now() defer func(stt time.Time) { - s.lg.Info("Scan batch done", "duration", time.Since(stt).String()) + s.lg.Info("Scanner: scan batch done", "mode", state.mode, "duration", time.Since(stt).String()) }(start) - sts, scanErrs, err := s.worker.ScanBatch(s.ctx, tracker) + sts, scanErrs, nextIndex, err := s.worker.ScanBatch(s.ctx, state.mode, s.cfg.BatchSize, state.nextIndex, tracker) if err == nil { + state.nextIndex = nextIndex s.setScanState(sts) } return sts, scanErrs, err diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index aaf69fe5..1fcb2d18 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -26,31 +26,34 @@ type IStorageManager interface { } type Worker struct { - sm IStorageManager - fetchBlob es.FetchBlobFunc - l1 es.Il1Source - cfg Config - nextIndexOfKvIdx uint64 - lg log.Logger + sm IStorageManager + fetchBlob es.FetchBlobFunc + l1 es.Il1Source + lg log.Logger } func NewWorker( sm IStorageManager, fetch es.FetchBlobFunc, l1 es.Il1Source, - cfg Config, lg log.Logger, ) *Worker { return &Worker{ sm: sm, fetchBlob: fetch, l1: l1, - cfg: cfg, lg: lg, } } -func (s *Worker) ScanBatch(ctx context.Context, mismatched mismatchTracker) (*stats, scanErrors, error) { +func (s *Worker) ScanBatch( + ctx context.Context, + mode int, + batchSize int, + startIndex uint64, + mismatched mismatchTracker, +) (*stats, scanErrors, uint64, error) { + s.lg.Info("Scanner: scan batch started", "mode", mode) // Never return nil stats and nil scanErrors sts := newStats() scanErrors := make(scanErrors) @@ -61,13 +64,13 @@ func (s *Worker) ScanBatch(ctx context.Context, mismatched mismatchTracker) (*st entryCount := s.sm.KvEntryCount() if entryCount == 0 { s.lg.Info("Scanner: no KV entries found in local storage") - return sts, scanErrors, nil + return sts, scanErrors, startIndex, nil } lastKvIdx := entryCount - 1 s.lg.Info("Scanner: local storage info", "lastKvIdx", lastKvIdx, "shards", shards, 
"kvEntriesPerShard", kvEntries) // Determine the batch of KV indices to scan - kvsInBatch, totalEntries, batchEndExclusive := getKvsInBatch(shards, kvEntries, lastKvIdx, uint64(s.cfg.BatchSize), s.nextIndexOfKvIdx, s.lg) + kvsInBatch, totalEntries, batchEndExclusive := getKvsInBatch(shards, kvEntries, lastKvIdx, uint64(batchSize), startIndex, s.lg) sts.localKvs = summaryLocalKvs(shards, kvEntries, lastKvIdx) sts.total = int(totalEntries) @@ -76,7 +79,7 @@ func (s *Worker) ScanBatch(ctx context.Context, mismatched mismatchTracker) (*st metas, err := s.l1.GetKvMetas(kvsInBatch, rpc.FinalizedBlockNumber.Int64()) if err != nil { s.lg.Error("Scanner: failed to query KV metas", "error", err) - return sts, scanErrors, fmt.Errorf("failed to query KV metas: %w", err) + return sts, scanErrors, startIndex, fmt.Errorf("failed to query KV metas: %w", err) } s.lg.Debug("Scanner: query KV meta done", "kvsInBatch", shortPrt(kvsInBatch)) @@ -84,27 +87,27 @@ func (s *Worker) ScanBatch(ctx context.Context, mismatched mismatchTracker) (*st select { case <-ctx.Done(): s.lg.Warn("Scanner canceled, stopping scan", "ctx.Err", ctx.Err()) - return sts, scanErrors, ctx.Err() + return sts, scanErrors, startIndex, ctx.Err() default: } var commit common.Hash copy(commit[:], meta[32-es.HashSizeInContract:32]) kvIndex := kvsInBatch[i] - s.processScanKv(kvIndex, commit, &mismatched, scanErrors) + s.processScanKv(mode, kvIndex, commit, &mismatched, scanErrors) } - s.nextIndexOfKvIdx = batchEndExclusive if len(kvsInBatch) > 0 { - s.lg.Info("Scanner: scan batch done", "scanned", shortPrt(kvsInBatch), "count", len(kvsInBatch), "nextIndexOfKvIdx", s.nextIndexOfKvIdx) + s.lg.Info("Scanner: scan batch done", "mode", mode, "scanned", shortPrt(kvsInBatch), "count", len(kvsInBatch), "nextIndexOfKvIdx", batchEndExclusive) } sts.mismatched = mismatched - return sts, scanErrors, nil + return sts, scanErrors, batchEndExclusive, nil } func (s *Worker) processScanKv( + mode int, kvIndex uint64, commit common.Hash, mismatched *mismatchTracker, @@ -112,7 +115,8 @@ func (s *Worker) processScanKv( ) { var err error var found bool - if s.cfg.Mode == modeCheckMeta { + switch mode { + case modeCheckMeta: // Check meta only var metaLocal []byte metaLocal, found, err = s.sm.TryReadMeta(kvIndex) @@ -122,11 +126,12 @@ func (s *Worker) processScanKv( return } err = es.CompareCommits(commit.Bytes(), metaLocal) - } else if s.cfg.Mode == modeCheckBlob { + case modeCheckBlob: // Query blob and check meta from storage _, found, err = s.sm.TryRead(kvIndex, int(s.sm.MaxKvSize()), commit) - } else { - s.lg.Crit("Scanner: invalid scanner mode", "mode", s.cfg.Mode) + default: + // Other modes are handled outside + s.lg.Crit("Scanner: invalid scanner mode", "mode", mode) } if found && err == nil { From cbc567e2a0c2fc1cedc05675708fa725d153b476 Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 11 Dec 2025 11:51:59 +0800 Subject: [PATCH 19/51] refactor --- ethstorage/scanner/scanner.go | 195 ++++++++++++++++++++-------------- ethstorage/scanner/worker.go | 28 +++-- 2 files changed, 129 insertions(+), 94 deletions(-) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index 1aa8efbf..f8bd92ca 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -5,6 +5,7 @@ package scanner import ( "context" + "maps" "sync" "time" @@ -15,18 +16,20 @@ import ( ) type Scanner struct { - worker *Worker - feed *event.Feed - interval time.Duration - cfg Config - ctx context.Context - cancel context.CancelFunc - wg sync.WaitGroup - 
running bool - mu sync.Mutex - lg log.Logger - scanStats ScanStats - scanPermit chan struct{} // to ensure only one scan at a time + worker *Worker + feed *event.Feed + interval time.Duration + cfg Config + ctx context.Context + cancel context.CancelFunc + wg sync.WaitGroup + running bool + mu sync.Mutex // protects running + lg log.Logger + scanPermit chan struct{} // to ensure only one scan at a time + statsMu sync.Mutex // protects sharedStats and sharedErrCache + sharedStats stats + sharedErrCache scanErrors } type scanLoopState struct { @@ -45,15 +48,16 @@ func New( ) *Scanner { cctx, cancel := context.WithCancel(ctx) scanner := &Scanner{ - worker: NewWorker(sm, fetchBlob, l1, lg), - feed: feed, - interval: time.Minute * time.Duration(cfg.Interval), - cfg: cfg, - ctx: cctx, - cancel: cancel, - lg: lg, - scanStats: ScanStats{0, 0}, - scanPermit: make(chan struct{}, 1), + worker: NewWorker(sm, fetchBlob, l1, uint64(cfg.BatchSize), lg), + feed: feed, + interval: time.Minute * time.Duration(cfg.Interval), + cfg: cfg, + ctx: cctx, + cancel: cancel, + lg: lg, + scanPermit: make(chan struct{}, 1), + sharedStats: *newStats(), + sharedErrCache: scanErrors{}, } scanner.scanPermit <- struct{}{} scanner.wg.Add(1) @@ -99,11 +103,12 @@ func (s *Scanner) start() { s.running = true s.mu.Unlock() + s.startReporter() + if s.cfg.Mode == modeCheckBlob+modeCheckMeta { // TODO: blobInterval := time.Hour * 24 * time.Duration(s.cfg.Interval) - // test only - blobInterval := time.Minute * 9 * time.Duration(s.cfg.Interval) - s.lg.Info("Scanner running in hybrid mode", "metaInterval", s.interval, "blobInterval", blobInterval) + blobInterval := time.Minute * 9 * time.Duration(s.cfg.Interval) // test only + s.lg.Info("Scanner running in hybrid mode", "mode", s.cfg.Mode, "metaInterval", s.interval, "blobInterval", blobInterval) s.launchScanLoop(&scanLoopState{mode: modeCheckBlob}, blobInterval) s.launchScanLoop(&scanLoopState{mode: modeCheckMeta}, s.interval) return @@ -122,43 +127,11 @@ func (s *Scanner) launchScanLoop(state *scanLoopState, interval time.Duration) { mainTicker := time.NewTicker(interval) defer mainTicker.Stop() - reportTicker := time.NewTicker(time.Minute) - defer reportTicker.Stop() - - sts := newStats() - errCache := scanErrors{} - if !s.acquireScanPermit(state.mode) { - return - } - initSts, initErrs, err := s.doWork(state, mismatchTracker{}) - s.releaseScanPermit() - if err != nil { - s.lg.Error("Scanner: initial scan failed", "mode", state.mode, "error", err) - } else { - sts = initSts - errCache = initErrs - } - + s.doScan(state) for { select { case <-mainTicker.C: - if !s.acquireScanPermit(state.mode) { - return - } - newSts, scanErrs, err := s.doWork(state, sts.mismatched.clone()) - s.releaseScanPermit() - if err != nil { - s.lg.Error("Scanner: scan batch failed", "mode", state.mode, "error", err) - continue - } - sts = newSts - errCache.merge(scanErrs) - - case <-reportTicker.C: - s.logStats(state.mode, sts) - for i, e := range errCache { - s.lg.Info("Scanner error happened earlier", "kvIndex", i, "error", e) - } + s.doScan(state) case <-s.ctx.Done(): return @@ -167,13 +140,25 @@ func (s *Scanner) launchScanLoop(state *scanLoopState, interval time.Duration) { }() } -func (s *Scanner) acquireScanPermit(mode int) bool { - s.lg.Info("Scanner acquiring scan permit for mode", "mode", mode) +func (s *Scanner) doScan(state *scanLoopState) { + if !s.acquireScanPermit() { + return + } + tracker := s.cloneSharedMismatches() + initSts, initErrs, err := s.doWork(state, tracker) + 
s.releaseScanPermit() + if err != nil { + s.lg.Error("Scanner: initial scan failed", "mode", state.mode, "error", err) + } else { + s.updateSharedStats(initSts, initErrs) + } +} + +func (s *Scanner) acquireScanPermit() bool { select { case <-s.ctx.Done(): return false case <-s.scanPermit: - s.lg.Info("Scanner acquired scan permit for mode", "mode", mode) return true } } @@ -185,9 +170,30 @@ func (s *Scanner) releaseScanPermit() { } } -func (s *Scanner) logStats(mode int, sts *stats) { +func (s *Scanner) startReporter() { + s.wg.Add(1) + go func() { + defer s.wg.Done() + ticker := time.NewTicker(time.Minute) + defer ticker.Stop() + for { + select { + case <-ticker.C: + statsSnapshot, errSnapshot := s.snapshotSharedState() + s.logStats(statsSnapshot) + for i, e := range errSnapshot { + s.lg.Info("Scanner error happened earlier", "kvIndex", i, "error", e) + } + case <-s.ctx.Done(): + return + } + } + }() +} + +func (s *Scanner) logStats(sts *stats) { logFields := []any{ - "mode", mode, + "mode", s.cfg.Mode, "localKvs", sts.localKvs, "localKvsCount", sts.total, } @@ -197,18 +203,54 @@ func (s *Scanner) logStats(mode int, sts *stats) { s.lg.Info("Scanner stats", logFields...) } -func (s *Scanner) GetScanState() *ScanStats { - s.mu.Lock() - defer s.mu.Unlock() - snapshot := s.scanStats // Make a copy - return &snapshot // Return a pointer to the copy +func (s *Scanner) cloneSharedMismatches() mismatchTracker { + s.statsMu.Lock() + defer s.statsMu.Unlock() + return s.sharedStats.mismatched.clone() } -func (s *Scanner) setScanState(sts *stats) { - s.mu.Lock() - defer s.mu.Unlock() - s.scanStats.MismatchedCount = len(sts.mismatched) - s.scanStats.UnfixedCount = len(sts.mismatched.failed()) +func (s *Scanner) updateSharedStats(sts *stats, errs scanErrors) { + if sts == nil { + return + } + s.statsMu.Lock() + defer s.statsMu.Unlock() + s.sharedStats.localKvs = sts.localKvs + s.sharedStats.total = sts.total + if sts.mismatched != nil { + s.sharedStats.mismatched = sts.mismatched.clone() + } else { + s.sharedStats.mismatched = mismatchTracker{} + } + if errs != nil { + if s.sharedErrCache == nil { + s.sharedErrCache = scanErrors{} + } + s.sharedErrCache.merge(errs) + } +} + +func (s *Scanner) snapshotSharedState() (*stats, scanErrors) { + s.statsMu.Lock() + defer s.statsMu.Unlock() + statsCopy := &stats{ + localKvs: s.sharedStats.localKvs, + total: s.sharedStats.total, + mismatched: s.sharedStats.mismatched.clone(), + } + errCopy := scanErrors{} + maps.Copy(errCopy, s.sharedErrCache) + return statsCopy, errCopy +} + +func (s *Scanner) GetScanState() *ScanStats { + s.statsMu.Lock() + defer s.statsMu.Unlock() + + return &ScanStats{ + MismatchedCount: len(s.sharedStats.mismatched), + UnfixedCount: len(s.sharedStats.mismatched.failed()), + } } func (s *Scanner) Close() { @@ -231,10 +273,5 @@ func (s *Scanner) doWork(state *scanLoopState, tracker mismatchTracker) (*stats, s.lg.Info("Scanner: scan batch done", "mode", state.mode, "duration", time.Since(stt).String()) }(start) - sts, scanErrs, nextIndex, err := s.worker.ScanBatch(s.ctx, state.mode, s.cfg.BatchSize, state.nextIndex, tracker) - if err == nil { - state.nextIndex = nextIndex - s.setScanState(sts) - } - return sts, scanErrs, err + return s.worker.ScanBatch(s.ctx, state, tracker) } diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index 1fcb2d18..964a7f9b 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -29,6 +29,7 @@ type Worker struct { sm IStorageManager fetchBlob es.FetchBlobFunc l1 
es.Il1Source + batchSize uint64 lg log.Logger } @@ -36,24 +37,20 @@ func NewWorker( sm IStorageManager, fetch es.FetchBlobFunc, l1 es.Il1Source, + batchSize uint64, lg log.Logger, ) *Worker { return &Worker{ sm: sm, fetchBlob: fetch, l1: l1, + batchSize: batchSize, lg: lg, } } -func (s *Worker) ScanBatch( - ctx context.Context, - mode int, - batchSize int, - startIndex uint64, - mismatched mismatchTracker, -) (*stats, scanErrors, uint64, error) { - s.lg.Info("Scanner: scan batch started", "mode", mode) +func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched mismatchTracker) (*stats, scanErrors, error) { + s.lg.Info("Scanner: scan batch started", "mode", state.mode) // Never return nil stats and nil scanErrors sts := newStats() scanErrors := make(scanErrors) @@ -64,13 +61,13 @@ func (s *Worker) ScanBatch( entryCount := s.sm.KvEntryCount() if entryCount == 0 { s.lg.Info("Scanner: no KV entries found in local storage") - return sts, scanErrors, startIndex, nil + return sts, scanErrors, nil } lastKvIdx := entryCount - 1 s.lg.Info("Scanner: local storage info", "lastKvIdx", lastKvIdx, "shards", shards, "kvEntriesPerShard", kvEntries) // Determine the batch of KV indices to scan - kvsInBatch, totalEntries, batchEndExclusive := getKvsInBatch(shards, kvEntries, lastKvIdx, uint64(batchSize), startIndex, s.lg) + kvsInBatch, totalEntries, batchEndExclusive := getKvsInBatch(shards, kvEntries, lastKvIdx, s.batchSize, state.nextIndex, s.lg) sts.localKvs = summaryLocalKvs(shards, kvEntries, lastKvIdx) sts.total = int(totalEntries) @@ -79,7 +76,7 @@ func (s *Worker) ScanBatch( metas, err := s.l1.GetKvMetas(kvsInBatch, rpc.FinalizedBlockNumber.Int64()) if err != nil { s.lg.Error("Scanner: failed to query KV metas", "error", err) - return sts, scanErrors, startIndex, fmt.Errorf("failed to query KV metas: %w", err) + return sts, scanErrors, fmt.Errorf("failed to query KV metas: %w", err) } s.lg.Debug("Scanner: query KV meta done", "kvsInBatch", shortPrt(kvsInBatch)) @@ -87,23 +84,24 @@ func (s *Worker) ScanBatch( select { case <-ctx.Done(): s.lg.Warn("Scanner canceled, stopping scan", "ctx.Err", ctx.Err()) - return sts, scanErrors, startIndex, ctx.Err() + return sts, scanErrors, ctx.Err() default: } var commit common.Hash copy(commit[:], meta[32-es.HashSizeInContract:32]) kvIndex := kvsInBatch[i] - s.processScanKv(mode, kvIndex, commit, &mismatched, scanErrors) + s.processScanKv(state.mode, kvIndex, commit, &mismatched, scanErrors) } if len(kvsInBatch) > 0 { - s.lg.Info("Scanner: scan batch done", "mode", mode, "scanned", shortPrt(kvsInBatch), "count", len(kvsInBatch), "nextIndexOfKvIdx", batchEndExclusive) + s.lg.Info("Scanner: scan batch done", "mode", state.mode, "scanned", shortPrt(kvsInBatch), "count", len(kvsInBatch), "nextIndexOfKvIdx", batchEndExclusive) } sts.mismatched = mismatched + state.nextIndex = batchEndExclusive - return sts, scanErrors, batchEndExclusive, nil + return sts, scanErrors, nil } func (s *Worker) processScanKv( From 2a69b557ac924d439c94762cd8663ee8094f044c Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 11 Dec 2025 15:44:30 +0800 Subject: [PATCH 20/51] refactor --- ethstorage/scanner/config.go | 21 +++++- ethstorage/scanner/scanner.go | 117 +++++++++++++--------------------- ethstorage/scanner/utils.go | 7 ++ ethstorage/scanner/worker.go | 37 ++++++----- 4 files changed, 92 insertions(+), 90 deletions(-) diff --git a/ethstorage/scanner/config.go b/ethstorage/scanner/config.go index dc05bf3e..6ba13595 100644 --- a/ethstorage/scanner/config.go +++ 
b/ethstorage/scanner/config.go @@ -28,8 +28,25 @@ func scannerEnv(name string) string { return utils.PrefixEnvVar("SCANNER_" + name) } +type scanMode int + +func (m scanMode) String() string { + switch m { + case modeDisabled: + return "disabled" + case modeCheckMeta: + return "check-meta" + case modeCheckBlob: + return "check-blob" + case modeCheckBlob + modeCheckMeta: + return "hybrid" + default: + panic(fmt.Sprintf("invalid scanner mode: %d", m)) + } +} + type Config struct { - Mode int + Mode scanMode BatchSize int Interval int } @@ -70,7 +87,7 @@ func NewConfig(ctx *cli.Context) *Config { panic(fmt.Sprintf("scanner interval must be at least %d minutes", defaultInterval)) } return &Config{ - Mode: mode, + Mode: scanMode(mode), BatchSize: ctx.GlobalInt(BatchSizeFlagName), Interval: ctx.GlobalInt(IntervalFlagName), } diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index f8bd92ca..999f1377 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -5,7 +5,6 @@ package scanner import ( "context" - "maps" "sync" "time" @@ -16,25 +15,19 @@ import ( ) type Scanner struct { - worker *Worker - feed *event.Feed - interval time.Duration - cfg Config - ctx context.Context - cancel context.CancelFunc - wg sync.WaitGroup - running bool - mu sync.Mutex // protects running - lg log.Logger - scanPermit chan struct{} // to ensure only one scan at a time - statsMu sync.Mutex // protects sharedStats and sharedErrCache - sharedStats stats - sharedErrCache scanErrors -} - -type scanLoopState struct { - mode int - nextIndex uint64 + worker *Worker + feed *event.Feed + interval time.Duration + cfg Config + ctx context.Context + cancel context.CancelFunc + wg sync.WaitGroup + running bool + mu sync.Mutex // protects running + lg log.Logger + scanPermit chan struct{} // to ensure only one scan at a time + statsMu sync.Mutex // protects sharedStats + sharedStats stats } func New( @@ -48,16 +41,15 @@ func New( ) *Scanner { cctx, cancel := context.WithCancel(ctx) scanner := &Scanner{ - worker: NewWorker(sm, fetchBlob, l1, uint64(cfg.BatchSize), lg), - feed: feed, - interval: time.Minute * time.Duration(cfg.Interval), - cfg: cfg, - ctx: cctx, - cancel: cancel, - lg: lg, - scanPermit: make(chan struct{}, 1), - sharedStats: *newStats(), - sharedErrCache: scanErrors{}, + worker: NewWorker(sm, fetchBlob, l1, uint64(cfg.BatchSize), lg), + feed: feed, + interval: time.Minute * time.Duration(cfg.Interval), + cfg: cfg, + ctx: cctx, + cancel: cancel, + lg: lg, + scanPermit: make(chan struct{}, 1), + sharedStats: *newStats(), } scanner.scanPermit <- struct{}{} scanner.wg.Add(1) @@ -127,11 +119,11 @@ func (s *Scanner) launchScanLoop(state *scanLoopState, interval time.Duration) { mainTicker := time.NewTicker(interval) defer mainTicker.Stop() - s.doScan(state) + s.doWork(state) for { select { case <-mainTicker.C: - s.doScan(state) + s.doWork(state) case <-s.ctx.Done(): return @@ -140,17 +132,17 @@ func (s *Scanner) launchScanLoop(state *scanLoopState, interval time.Duration) { }() } -func (s *Scanner) doScan(state *scanLoopState) { +func (s *Scanner) doWork(state *scanLoopState) { if !s.acquireScanPermit() { return } tracker := s.cloneSharedMismatches() - initSts, initErrs, err := s.doWork(state, tracker) + stats, err := s.worker.ScanBatch(s.ctx, state, tracker) s.releaseScanPermit() if err != nil { s.lg.Error("Scanner: initial scan failed", "mode", state.mode, "error", err) } else { - s.updateSharedStats(initSts, initErrs) + s.updateSharedStats(stats) } } @@ -179,11 +171,7 @@ 
func (s *Scanner) startReporter() { for { select { case <-ticker.C: - statsSnapshot, errSnapshot := s.snapshotSharedState() - s.logStats(statsSnapshot) - for i, e := range errSnapshot { - s.lg.Info("Scanner error happened earlier", "kvIndex", i, "error", e) - } + s.logStats() case <-s.ctx.Done(): return } @@ -191,16 +179,23 @@ func (s *Scanner) startReporter() { }() } -func (s *Scanner) logStats(sts *stats) { +func (s *Scanner) logStats() { + s.statsMu.Lock() + defer s.statsMu.Unlock() + logFields := []any{ "mode", s.cfg.Mode, - "localKvs", sts.localKvs, - "localKvsCount", sts.total, + "localKvs", s.sharedStats.localKvs, + "localKvsCount", s.sharedStats.total, } - if len(sts.mismatched) > 0 { - logFields = append(logFields, "mismatched", sts.mismatched.String()) + if len(s.sharedStats.mismatched) > 0 { + logFields = append(logFields, "mismatched", s.sharedStats.mismatched.String()) } s.lg.Info("Scanner stats", logFields...) + + for i, e := range s.sharedStats.errs { + s.lg.Info("Scanner error happened earlier", "kvIndex", i, "error", e) + } } func (s *Scanner) cloneSharedMismatches() mismatchTracker { @@ -209,7 +204,7 @@ func (s *Scanner) cloneSharedMismatches() mismatchTracker { return s.sharedStats.mismatched.clone() } -func (s *Scanner) updateSharedStats(sts *stats, errs scanErrors) { +func (s *Scanner) updateSharedStats(sts *stats) { if sts == nil { return } @@ -222,27 +217,14 @@ func (s *Scanner) updateSharedStats(sts *stats, errs scanErrors) { } else { s.sharedStats.mismatched = mismatchTracker{} } - if errs != nil { - if s.sharedErrCache == nil { - s.sharedErrCache = scanErrors{} + if sts.errs != nil { + if s.sharedStats.errs == nil { + s.sharedStats.errs = scanErrors{} } - s.sharedErrCache.merge(errs) + s.sharedStats.errs.merge(sts.errs) } } -func (s *Scanner) snapshotSharedState() (*stats, scanErrors) { - s.statsMu.Lock() - defer s.statsMu.Unlock() - statsCopy := &stats{ - localKvs: s.sharedStats.localKvs, - total: s.sharedStats.total, - mismatched: s.sharedStats.mismatched.clone(), - } - errCopy := scanErrors{} - maps.Copy(errCopy, s.sharedErrCache) - return statsCopy, errCopy -} - func (s *Scanner) GetScanState() *ScanStats { s.statsMu.Lock() defer s.statsMu.Unlock() @@ -266,12 +248,3 @@ func (s *Scanner) Close() { s.lg.Info("Scanner closed") s.wg.Wait() } - -func (s *Scanner) doWork(state *scanLoopState, tracker mismatchTracker) (*stats, scanErrors, error) { - start := time.Now() - defer func(stt time.Time) { - s.lg.Info("Scanner: scan batch done", "mode", state.mode, "duration", time.Since(stt).String()) - }(start) - - return s.worker.ScanBatch(s.ctx, state, tracker) -} diff --git a/ethstorage/scanner/utils.go b/ethstorage/scanner/utils.go index ae6f6d40..13d9bf09 100644 --- a/ethstorage/scanner/utils.go +++ b/ethstorage/scanner/utils.go @@ -128,10 +128,16 @@ func (m mismatchTracker) clone() mismatchTracker { return clone } +type scanLoopState struct { + mode scanMode + nextIndex uint64 +} + type stats struct { localKvs string // kv entries stored in local total int // total number of kv entries stored in local mismatched mismatchTracker // tracks all mismatched indices and their status + errs scanErrors // latest scan errors keyed by kv index } func newStats() *stats { @@ -139,6 +145,7 @@ func newStats() *stats { localKvs: "", total: 0, mismatched: mismatchTracker{}, + errs: scanErrors{}, } } diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index 964a7f9b..d78dce4f 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -7,6 
+7,7 @@ import ( "context" "errors" "fmt" + "time" "github.com/ethereum/go-ethereum/common" "github.com/ethereum/go-ethereum/log" @@ -49,11 +50,18 @@ func NewWorker( } } -func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched mismatchTracker) (*stats, scanErrors, error) { - s.lg.Info("Scanner: scan batch started", "mode", state.mode) +func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched mismatchTracker) (*stats, error) { + start := time.Now() + var kvsInBatch []uint64 + defer func(stt time.Time) { + if len(kvsInBatch) > 0 { + s.lg.Info("Scanner: scan batch done", "mode", state.mode, "scanned", shortPrt(kvsInBatch), "count", len(kvsInBatch), "nextIndexOfKvIdx", state.nextIndex, "duration", time.Since(stt).String()) + } + }(start) + // Never return nil stats and nil scanErrors sts := newStats() - scanErrors := make(scanErrors) + errs := sts.errs // Query local storage info shards := s.sm.Shards() @@ -61,13 +69,14 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched entryCount := s.sm.KvEntryCount() if entryCount == 0 { s.lg.Info("Scanner: no KV entries found in local storage") - return sts, scanErrors, nil + return sts, nil } lastKvIdx := entryCount - 1 - s.lg.Info("Scanner: local storage info", "lastKvIdx", lastKvIdx, "shards", shards, "kvEntriesPerShard", kvEntries) + startIndexOfKvIdx := state.nextIndex + s.lg.Info("Scanner: scan batch started", "mode", state.mode, "startIndexOfKvIdx", startIndexOfKvIdx, "lastKvIdxOnChain", lastKvIdx, "shardsInLocal", shards) // Determine the batch of KV indices to scan - kvsInBatch, totalEntries, batchEndExclusive := getKvsInBatch(shards, kvEntries, lastKvIdx, s.batchSize, state.nextIndex, s.lg) + kvsInBatch, totalEntries, batchEndExclusive := getKvsInBatch(shards, kvEntries, lastKvIdx, s.batchSize, startIndexOfKvIdx, s.lg) sts.localKvs = summaryLocalKvs(shards, kvEntries, lastKvIdx) sts.total = int(totalEntries) @@ -76,7 +85,7 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched metas, err := s.l1.GetKvMetas(kvsInBatch, rpc.FinalizedBlockNumber.Int64()) if err != nil { s.lg.Error("Scanner: failed to query KV metas", "error", err) - return sts, scanErrors, fmt.Errorf("failed to query KV metas: %w", err) + return sts, fmt.Errorf("failed to query KV metas: %w", err) } s.lg.Debug("Scanner: query KV meta done", "kvsInBatch", shortPrt(kvsInBatch)) @@ -84,28 +93,24 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched select { case <-ctx.Done(): s.lg.Warn("Scanner canceled, stopping scan", "ctx.Err", ctx.Err()) - return sts, scanErrors, ctx.Err() + return sts, ctx.Err() default: } var commit common.Hash copy(commit[:], meta[32-es.HashSizeInContract:32]) kvIndex := kvsInBatch[i] - s.processScanKv(state.mode, kvIndex, commit, &mismatched, scanErrors) - } - - if len(kvsInBatch) > 0 { - s.lg.Info("Scanner: scan batch done", "mode", state.mode, "scanned", shortPrt(kvsInBatch), "count", len(kvsInBatch), "nextIndexOfKvIdx", batchEndExclusive) + s.processScannedKv(state.mode, kvIndex, commit, &mismatched, errs) } sts.mismatched = mismatched state.nextIndex = batchEndExclusive - return sts, scanErrors, nil + return sts, nil } -func (s *Worker) processScanKv( - mode int, +func (s *Worker) processScannedKv( + mode scanMode, kvIndex uint64, commit common.Hash, mismatched *mismatchTracker, From 433ab9f23d1858e70aa2ffe413250903d45b95ba Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 11 Dec 2025 19:07:02 +0800 Subject: [PATCH 
21/51] minor --- ethstorage/scanner/scanner.go | 4 ++-- ethstorage/scanner/worker.go | 22 ++++++++-------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index 999f1377..059b4e9e 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -98,8 +98,8 @@ func (s *Scanner) start() { s.startReporter() if s.cfg.Mode == modeCheckBlob+modeCheckMeta { - // TODO: blobInterval := time.Hour * 24 * time.Duration(s.cfg.Interval) - blobInterval := time.Minute * 9 * time.Duration(s.cfg.Interval) // test only + // Always keep blob interval 24 * 60 times of meta interval for hybrid mode + blobInterval := time.Hour * 24 * time.Duration(s.cfg.Interval) s.lg.Info("Scanner running in hybrid mode", "mode", s.cfg.Mode, "metaInterval", s.interval, "blobInterval", blobInterval) s.launchScanLoop(&scanLoopState{mode: modeCheckBlob}, blobInterval) s.launchScanLoop(&scanLoopState{mode: modeCheckMeta}, s.interval) diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index d78dce4f..7489b838 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -100,7 +100,7 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched var commit common.Hash copy(commit[:], meta[32-es.HashSizeInContract:32]) kvIndex := kvsInBatch[i] - s.processScannedKv(state.mode, kvIndex, commit, &mismatched, errs) + s.scanKv(state.mode, kvIndex, commit, &mismatched, errs) } sts.mismatched = mismatched @@ -109,13 +109,7 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched return sts, nil } -func (s *Worker) processScannedKv( - mode scanMode, - kvIndex uint64, - commit common.Hash, - mismatched *mismatchTracker, - scanErrors scanErrors, -) { +func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, mismatched *mismatchTracker, errs scanErrors) { var err error var found bool switch mode { @@ -125,7 +119,7 @@ func (s *Worker) processScannedKv( metaLocal, found, err = s.sm.TryReadMeta(kvIndex) if err != nil { s.lg.Error("Scanner: failed to read meta", "kvIndex", kvIndex, "error", err) - scanErrors.add(kvIndex, fmt.Errorf("failed to read meta: %w", err)) + errs.add(kvIndex, fmt.Errorf("failed to read meta: %w", err)) return } err = es.CompareCommits(commit.Bytes(), metaLocal) @@ -145,7 +139,7 @@ func (s *Worker) processScannedKv( case failed: mismatched.markRecovered(kvIndex) // Clear the error state - scanErrors.nil(kvIndex) + errs.nil(kvIndex) s.lg.Info("Scanner: previously failed KV recovered", "kvIndex", kvIndex) case pending: delete(*mismatched, kvIndex) @@ -160,7 +154,7 @@ func (s *Worker) processScannedKv( if !found { // The shard is not stored locally - scanErrors.add(kvIndex, fmt.Errorf("shard not found locally: commit=%x", commit)) + errs.add(kvIndex, fmt.Errorf("shard not found locally: commit=%x", commit)) s.lg.Error("Scanner: blob not found locally", "kvIndex", kvIndex, "commit", commit) return } @@ -176,11 +170,11 @@ func (s *Worker) processScannedKv( if fixErr := s.fixKv(kvIndex, commit); fixErr != nil { mismatched.markFailed(kvIndex) s.lg.Error("Scanner: failed to fix blob", "kvIndex", kvIndex, "error", fixErr) - scanErrors.add(kvIndex, fmt.Errorf("failed to fix blob: %w", fixErr)) + errs.add(kvIndex, fmt.Errorf("failed to fix blob: %w", fixErr)) } else { s.lg.Info("Scanner: blob fixed successfully", "kvIndex", kvIndex) mismatched.markFixed(kvIndex) - scanErrors.nil(kvIndex) + errs.nil(kvIndex) } } else { @@ -190,7 
+184,7 @@ func (s *Worker) processScannedKv( } } else { s.lg.Error("Scanner: unexpected error occurred", "kvIndex", kvIndex, "error", err) - scanErrors.add(kvIndex, fmt.Errorf("unexpected error: %w", err)) + errs.add(kvIndex, fmt.Errorf("unexpected error: %w", err)) } } } From aa2cfcbd76fd6490f465ce5f595e443507d0c1c0 Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 16 Dec 2025 15:01:40 +0800 Subject: [PATCH 22/51] refactor --- ethstorage/scanner/scanner.go | 3 +-- ethstorage/scanner/worker.go | 42 +++++++++++++++++++---------------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index 059b4e9e..26dbde82 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -136,8 +136,7 @@ func (s *Scanner) doWork(state *scanLoopState) { if !s.acquireScanPermit() { return } - tracker := s.cloneSharedMismatches() - stats, err := s.worker.ScanBatch(s.ctx, state, tracker) + stats, err := s.worker.ScanBatch(s.ctx, state, s.cloneSharedMismatches()) s.releaseScanPermit() if err != nil { s.lg.Error("Scanner: initial scan failed", "mode", state.mode, "error", err) diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index 7489b838..6e7590db 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -196,28 +196,11 @@ func (s *Worker) fixKv(kvIndex uint64, commit common.Hash) error { return nil } -func getKvsInBatch(shards []uint64, kvEntries, lastKvIdx, batchSize, batchStartIndex uint64, lg log.Logger) ([]uint64, uint64, uint64) { - // Calculate the total number of KV entries stored locally - var totalEntries uint64 - // Shard indices are sorted but may not be continuous: e.g. [0, 1, 3, 4] indicates shard 2 is missing - for _, shardIndex := range shards { - shardOfLastFinalizedKv := lastKvIdx / kvEntries - if shardIndex > shardOfLastFinalizedKv { - // Skip empty shards - break - } - // The last shard may contain fewer than the full kvEntries - if shardIndex == lastKvIdx/kvEntries { - totalEntries += lastKvIdx%kvEntries + 1 - break - } - // Complete shards - totalEntries += kvEntries - } +func getKvsInBatch(shards []uint64, kvEntries, lastKvIdx, batchSize, startKvIndex uint64, lg log.Logger) ([]uint64, uint64, uint64) { + totalEntries := getTotalEntries(shards, kvEntries, lastKvIdx) lg.Debug("Scanner: KV entries stored locally", "totalKvStored", totalEntries) // Determine batch start and end KV indices - startKvIndex := batchStartIndex if startKvIndex >= totalEntries { startKvIndex = 0 lg.Debug("Scanner: restarting scan from the beginning") @@ -254,3 +237,24 @@ func getKvsInBatch(shards []uint64, kvEntries, lastKvIdx, batchSize, batchStartI lg.Debug("Scanner: batch index range determined", "batchStart", startKvIndex, "batchEnd(exclusive)", endKvIndexExclusive, "kvsInBatch", shortPrt(kvsInBatch)) return kvsInBatch, totalEntries, endKvIndexExclusive } + +// Calculate the total number of KV entries stored locally +func getTotalEntries(shards []uint64, kvEntries, lastKvIdx uint64) uint64 { + var totalEntries uint64 + // Shard indices are sorted but may not be continuous: e.g. 
[0, 1, 3, 4] indicates shard 2 is missing + for _, shardIndex := range shards { + shardOfLastFinalizedKv := lastKvIdx / kvEntries + if shardIndex > shardOfLastFinalizedKv { + // Skip empty shards + break + } + // The last shard may contain fewer than the full kvEntries + if shardIndex == lastKvIdx/kvEntries { + totalEntries += lastKvIdx%kvEntries + 1 + break + } + // Complete shards + totalEntries += kvEntries + } + return totalEntries +} From bb167af7566fd6a0846ad884df1f3953ed5cfef2 Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 16 Dec 2025 18:21:54 +0800 Subject: [PATCH 23/51] report stats using timely local kv summary --- ethstorage/scanner/scanner.go | 74 ++++++++++++++++++------------- ethstorage/scanner/utils.go | 23 ---------- ethstorage/scanner/worker.go | 62 +++++++++++++++----------- ethstorage/scanner/worker_test.go | 25 ++++++----- 4 files changed, 92 insertions(+), 92 deletions(-) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index 26dbde82..107a2e23 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -5,6 +5,7 @@ package scanner import ( "context" + "maps" "sync" "time" @@ -15,19 +16,20 @@ import ( ) type Scanner struct { - worker *Worker - feed *event.Feed - interval time.Duration - cfg Config - ctx context.Context - cancel context.CancelFunc - wg sync.WaitGroup - running bool - mu sync.Mutex // protects running - lg log.Logger - scanPermit chan struct{} // to ensure only one scan at a time - statsMu sync.Mutex // protects sharedStats - sharedStats stats + worker *Worker + feed *event.Feed + interval time.Duration + cfg Config + ctx context.Context + cancel context.CancelFunc + wg sync.WaitGroup + running bool + mu sync.Mutex // protects running + lg log.Logger + scanPermit chan struct{} // to ensure only one scan at a time + statsMu sync.Mutex // protects sharedStats + sharedStats stats + localKvCount uint64 // total number of kv entries stored in local } func New( @@ -136,7 +138,11 @@ func (s *Scanner) doWork(state *scanLoopState) { if !s.acquireScanPermit() { return } - stats, err := s.worker.ScanBatch(s.ctx, state, s.cloneSharedMismatches()) + s.statsMu.Lock() + localKvCount := s.localKvCount + tracker := s.sharedStats.mismatched.clone() + s.statsMu.Unlock() + stats, err := s.worker.ScanBatch(s.ctx, state, localKvCount, tracker) s.releaseScanPermit() if err != nil { s.lg.Error("Scanner: initial scan failed", "mode", state.mode, "error", err) @@ -170,7 +176,13 @@ func (s *Scanner) startReporter() { for { select { case <-ticker.C: - s.logStats() + // update local entries info + localKvs, sum := s.worker.summaryLocalKvs() + s.statsMu.Lock() + s.localKvCount = localKvs + s.statsMu.Unlock() + + s.logStats(sum) case <-s.ctx.Done(): return } @@ -178,39 +190,41 @@ func (s *Scanner) startReporter() { }() } -func (s *Scanner) logStats() { +func (s *Scanner) logStats(sum string) { s.statsMu.Lock() - defer s.statsMu.Unlock() + localKvCount := s.localKvCount + var mismatched string + if len(s.sharedStats.mismatched) > 0 { + mismatched = s.sharedStats.mismatched.String() + } + errSnapshot := scanErrors{} + if s.sharedStats.errs != nil { + maps.Copy(errSnapshot, s.sharedStats.errs) + } + s.statsMu.Unlock() logFields := []any{ "mode", s.cfg.Mode, - "localKvs", s.sharedStats.localKvs, - "localKvsCount", s.sharedStats.total, + "localKvs", sum, + "localKvCount", localKvCount, } - if len(s.sharedStats.mismatched) > 0 { - logFields = append(logFields, "mismatched", s.sharedStats.mismatched.String()) + if mismatched != "" { + 
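+		// the mismatched summary is appended to the log fields only when at least one mismatch is tracked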
logFields = append(logFields, "mismatched", mismatched) } s.lg.Info("Scanner stats", logFields...) - for i, e := range s.sharedStats.errs { + for i, e := range errSnapshot { s.lg.Info("Scanner error happened earlier", "kvIndex", i, "error", e) } } -func (s *Scanner) cloneSharedMismatches() mismatchTracker { - s.statsMu.Lock() - defer s.statsMu.Unlock() - return s.sharedStats.mismatched.clone() -} - func (s *Scanner) updateSharedStats(sts *stats) { if sts == nil { return } s.statsMu.Lock() defer s.statsMu.Unlock() - s.sharedStats.localKvs = sts.localKvs - s.sharedStats.total = sts.total + if sts.mismatched != nil { s.sharedStats.mismatched = sts.mismatched.clone() } else { diff --git a/ethstorage/scanner/utils.go b/ethstorage/scanner/utils.go index 13d9bf09..a5a19caa 100644 --- a/ethstorage/scanner/utils.go +++ b/ethstorage/scanner/utils.go @@ -134,16 +134,12 @@ type scanLoopState struct { } type stats struct { - localKvs string // kv entries stored in local - total int // total number of kv entries stored in local mismatched mismatchTracker // tracks all mismatched indices and their status errs scanErrors // latest scan errors keyed by kv index } func newStats() *stats { return &stats{ - localKvs: "", - total: 0, mismatched: mismatchTracker{}, errs: scanErrors{}, } @@ -169,25 +165,6 @@ func shortPrt(nums []uint64) string { return strings.Join(res, ",") } -func summaryLocalKvs(shards []uint64, kvEntries, lastKvIdx uint64) string { - var res []string - for _, shard := range shards { - if shard*kvEntries > lastKvIdx { - // skip empty shards - break - } - var lastEntry uint64 - if shard == lastKvIdx/kvEntries { - lastEntry = lastKvIdx - } else { - lastEntry = (shard+1)*kvEntries - 1 - } - shardView := fmt.Sprintf("shard%d%s", shard, formatRange(shard*kvEntries, lastEntry)) - res = append(res, shardView) - } - return strings.Join(res, ",") -} - func formatRange(start, end uint64) string { if start == end { return fmt.Sprintf("[%d]", start) diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index 6e7590db..7c3c71d9 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -7,6 +7,7 @@ import ( "context" "errors" "fmt" + "strings" "time" "github.com/ethereum/go-ethereum/common" @@ -50,7 +51,7 @@ func NewWorker( } } -func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched mismatchTracker) (*stats, error) { +func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, localKvCount uint64, mismatched mismatchTracker) (*stats, error) { start := time.Now() var kvsInBatch []uint64 defer func(stt time.Time) { @@ -63,23 +64,18 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched sts := newStats() errs := sts.errs - // Query local storage info - shards := s.sm.Shards() - kvEntries := s.sm.KvEntries() - entryCount := s.sm.KvEntryCount() - if entryCount == 0 { + if localKvCount == 0 { s.lg.Info("Scanner: no KV entries found in local storage") return sts, nil } - lastKvIdx := entryCount - 1 + // Query local storage info + shards := s.sm.Shards() + kvEntries := s.sm.KvEntries() + lastKvIdx := s.sm.KvEntryCount() - 1 startIndexOfKvIdx := state.nextIndex s.lg.Info("Scanner: scan batch started", "mode", state.mode, "startIndexOfKvIdx", startIndexOfKvIdx, "lastKvIdxOnChain", lastKvIdx, "shardsInLocal", shards) - // Determine the batch of KV indices to scan - kvsInBatch, totalEntries, batchEndExclusive := getKvsInBatch(shards, kvEntries, lastKvIdx, s.batchSize, startIndexOfKvIdx, s.lg) - - sts.localKvs = 
summaryLocalKvs(shards, kvEntries, lastKvIdx) - sts.total = int(totalEntries) + kvsInBatch, batchEndExclusive := getKvsInBatch(shards, kvEntries, localKvCount, s.batchSize, startIndexOfKvIdx, s.lg) // Query the metas from the L1 contract metas, err := s.l1.GetKvMetas(kvsInBatch, rpc.FinalizedBlockNumber.Int64()) @@ -196,16 +192,22 @@ func (s *Worker) fixKv(kvIndex uint64, commit common.Hash) error { return nil } -func getKvsInBatch(shards []uint64, kvEntries, lastKvIdx, batchSize, startKvIndex uint64, lg log.Logger) ([]uint64, uint64, uint64) { - totalEntries := getTotalEntries(shards, kvEntries, lastKvIdx) - lg.Debug("Scanner: KV entries stored locally", "totalKvStored", totalEntries) +func (s *Worker) summaryLocalKvs() (uint64, string) { + kvEntryCountOnChain := s.sm.KvEntryCount() + if kvEntryCountOnChain == 0 { + s.lg.Info("Scanner: no KV entries found in local storage") + return 0, "(none)" + } + return summaryLocalKvs(s.sm.Shards(), s.sm.KvEntries(), kvEntryCountOnChain-1) +} +func getKvsInBatch(shards []uint64, kvEntries, localKvCount, batchSize, startKvIndex uint64, lg log.Logger) ([]uint64, uint64) { // Determine batch start and end KV indices - if startKvIndex >= totalEntries { + if startKvIndex >= localKvCount { startKvIndex = 0 lg.Debug("Scanner: restarting scan from the beginning") } - endKvIndexExclusive := min(startKvIndex+batchSize, totalEntries) + endKvIndexExclusive := min(startKvIndex+batchSize, localKvCount) // The actual batch range is [startKvIndex, endKvIndexExclusive) or [startKvIndex, endIndex] endIndex := endKvIndexExclusive - 1 @@ -235,26 +237,32 @@ func getKvsInBatch(shards []uint64, kvEntries, lastKvIdx, batchSize, startKvInde } } lg.Debug("Scanner: batch index range determined", "batchStart", startKvIndex, "batchEnd(exclusive)", endKvIndexExclusive, "kvsInBatch", shortPrt(kvsInBatch)) - return kvsInBatch, totalEntries, endKvIndexExclusive + return kvsInBatch, endKvIndexExclusive } // Calculate the total number of KV entries stored locally -func getTotalEntries(shards []uint64, kvEntries, lastKvIdx uint64) uint64 { +func summaryLocalKvs(shards []uint64, kvEntries, lastKvIdx uint64) (uint64, string) { var totalEntries uint64 + var res []string // Shard indices are sorted but may not be continuous: e.g. 
[0, 1, 3, 4] indicates shard 2 is missing - for _, shardIndex := range shards { - shardOfLastFinalizedKv := lastKvIdx / kvEntries - if shardIndex > shardOfLastFinalizedKv { + for _, shard := range shards { + shardOfLastKv := lastKvIdx / kvEntries + if shard > shardOfLastKv { // Skip empty shards break } + var lastEntry uint64 // The last shard may contain fewer than the full kvEntries - if shardIndex == lastKvIdx/kvEntries { + if shard == shardOfLastKv { totalEntries += lastKvIdx%kvEntries + 1 - break + lastEntry = lastKvIdx + } else { + // Complete shards + totalEntries += kvEntries + lastEntry = (shard+1)*kvEntries - 1 } - // Complete shards - totalEntries += kvEntries + shardView := fmt.Sprintf("shard%d%s", shard, formatRange(shard*kvEntries, lastEntry)) + res = append(res, shardView) } - return totalEntries + return totalEntries, strings.Join(res, ",") } diff --git a/ethstorage/scanner/worker_test.go b/ethstorage/scanner/worker_test.go index dfd659d1..b6191e84 100644 --- a/ethstorage/scanner/worker_test.go +++ b/ethstorage/scanner/worker_test.go @@ -19,17 +19,6 @@ func TestGetKvsInBatch(t *testing.T) { expectedTotal uint64 expectedBatchEnd uint64 }{ - { - name: "skip empty shards", - shards: []uint64{0, 2}, - kvEntries: 8, - lastKvIdx: 12, - batchSize: 100, - batchStartIndex: 0, - expectedKvs: []uint64{0, 1, 2, 3, 4, 5, 6, 7}, - expectedTotal: 8, - expectedBatchEnd: 8, - }, { name: "1 shard batch 1", shards: []uint64{0}, @@ -162,6 +151,17 @@ func TestGetKvsInBatch(t *testing.T) { expectedTotal: 14, expectedBatchEnd: 14, }, + { + name: "Discontinuous shards missing current", + shards: []uint64{0, 2}, + kvEntries: 8, + lastKvIdx: 12, + batchSize: 100, + batchStartIndex: 0, + expectedKvs: []uint64{0, 1, 2, 3, 4, 5, 6, 7}, + expectedTotal: 8, + expectedBatchEnd: 8, + }, { name: "Boundary conditions 1 kv", shards: []uint64{0}, @@ -235,7 +235,8 @@ func TestGetKvsInBatch(t *testing.T) { t.Run(tt.name, func(t *testing.T) { lg := log.New() - kvs, total, batchEnd := getKvsInBatch(tt.shards, tt.kvEntries, tt.lastKvIdx, tt.batchSize, tt.batchStartIndex, lg) + total, _ := summaryLocalKvs(tt.shards, tt.kvEntries, tt.lastKvIdx) + kvs, batchEnd := getKvsInBatch(tt.shards, tt.kvEntries, total, tt.batchSize, tt.batchStartIndex, lg) assert.Equal(t, tt.expectedKvs, kvs, "KV indices do not match") assert.Equal(t, tt.expectedTotal, total, "Total entries do not match") From 71c023fea63e4454b453134fcaacae30874e5f27 Mon Sep 17 00:00:00 2001 From: syntrust Date: Wed, 17 Dec 2025 17:35:53 +0800 Subject: [PATCH 24/51] count local kvs on demand --- ethstorage/scanner/worker.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index 7c3c71d9..f55002a1 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -51,7 +51,7 @@ func NewWorker( } } -func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, localKvCount uint64, mismatched mismatchTracker) (*stats, error) { +func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched mismatchTracker) (*stats, error) { start := time.Now() var kvsInBatch []uint64 defer func(stt time.Time) { @@ -63,7 +63,7 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, localKvCou // Never return nil stats and nil scanErrors sts := newStats() errs := sts.errs - + localKvCount, _ := s.summaryLocalKvs() if localKvCount == 0 { s.lg.Info("Scanner: no KV entries found in local storage") return sts, nil From 
6621274ba606380181afa8827a4d634032846b56 Mon Sep 17 00:00:00 2001 From: syntrust Date: Wed, 17 Dec 2025 17:36:21 +0800 Subject: [PATCH 25/51] support 2 intevals --- ethstorage/scanner/config.go | 50 +++++++++++++++++---------- ethstorage/scanner/scanner.go | 65 +++++++++++++++++------------------ 2 files changed, 63 insertions(+), 52 deletions(-) diff --git a/ethstorage/scanner/config.go b/ethstorage/scanner/config.go index 6ba13595..f39bdc21 100644 --- a/ethstorage/scanner/config.go +++ b/ethstorage/scanner/config.go @@ -5,6 +5,7 @@ package scanner import ( "fmt" + "time" "github.com/ethstorage/go-ethstorage/ethstorage/flags/utils" "github.com/urfave/cli" @@ -17,12 +18,17 @@ const ( ) const ( - ModeFlagName = "scanner.mode" - BatchSizeFlagName = "scanner.batch-size" - IntervalFlagName = "scanner.interval" + ModeFlagName = "scanner.mode" + BatchSizeFlagName = "scanner.batch-size" + IntervalMetaFlagName = "scanner.interval.meta" + IntervalBlobFlagName = "scanner.interval.blob" ) -const defaultInterval = 3 // in minutes +// intervals in minutes +const defaultIntervalMeta = 3 +const defaultIntervalBlob = 24 * 60 +const minIntervalMeta = 1 +const minIntervalBlob = 5 func scannerEnv(name string) string { return utils.PrefixEnvVar("SCANNER_" + name) @@ -46,9 +52,10 @@ func (m scanMode) String() string { } type Config struct { - Mode scanMode - BatchSize int - Interval int + Mode scanMode + BatchSize int + IntervalMeta time.Duration + IntervalBlob time.Duration } func CLIFlags() []cli.Flag { @@ -66,10 +73,16 @@ func CLIFlags() []cli.Flag { Value: 8192, }, cli.IntFlag{ - Name: IntervalFlagName, - Usage: fmt.Sprintf("Data scan interval in minutes, minimum %d (default). In hybrid mode, the interval applies to meta mode in minutes, blob mode in days", defaultInterval), - EnvVar: scannerEnv("INTERVAL"), - Value: defaultInterval, + Name: IntervalMetaFlagName, + Usage: fmt.Sprintf("Data scan interval of check-meta mode in minutes, minimum %d (default %d)", minIntervalMeta, defaultIntervalMeta), + EnvVar: scannerEnv("INTERVAL_META"), + Value: defaultIntervalMeta, + }, + cli.IntFlag{ + Name: IntervalBlobFlagName, + Usage: fmt.Sprintf("Data scan interval of check-blob mode in minutes, minimum %d (default %d)", minIntervalBlob, defaultIntervalBlob), + EnvVar: scannerEnv("INTERVAL_BLOB"), + Value: defaultIntervalBlob, }, } return flags @@ -80,15 +93,16 @@ func NewConfig(ctx *cli.Context) *Config { if mode == modeDisabled { return nil } - if mode != modeCheckMeta && mode != modeCheckBlob && mode != modeCheckBlob+modeCheckMeta { - panic(fmt.Sprintf("invalid scanner mode: %d", mode)) + if interval := ctx.GlobalInt(IntervalMetaFlagName); interval < minIntervalMeta { + panic(fmt.Sprintf("scanner interval of check-meta mode must be at least %d minutes", minIntervalMeta)) } - if interval := ctx.GlobalInt(IntervalFlagName); interval < defaultInterval { - panic(fmt.Sprintf("scanner interval must be at least %d minutes", defaultInterval)) + if interval := ctx.GlobalInt(IntervalBlobFlagName); interval < minIntervalBlob { + panic(fmt.Sprintf("scanner interval of check-blob mode must be at least %d minutes", minIntervalBlob)) } return &Config{ - Mode: scanMode(mode), - BatchSize: ctx.GlobalInt(BatchSizeFlagName), - Interval: ctx.GlobalInt(IntervalFlagName), + Mode: scanMode(mode), + BatchSize: ctx.GlobalInt(BatchSizeFlagName), + IntervalMeta: time.Minute * time.Duration(ctx.GlobalInt(IntervalMetaFlagName)), + IntervalBlob: time.Minute * time.Duration(ctx.GlobalInt(IntervalBlobFlagName)), } } diff --git 
a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index 107a2e23..cba293d8 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -16,20 +16,18 @@ import ( ) type Scanner struct { - worker *Worker - feed *event.Feed - interval time.Duration - cfg Config - ctx context.Context - cancel context.CancelFunc - wg sync.WaitGroup - running bool - mu sync.Mutex // protects running - lg log.Logger - scanPermit chan struct{} // to ensure only one scan at a time - statsMu sync.Mutex // protects sharedStats - sharedStats stats - localKvCount uint64 // total number of kv entries stored in local + worker *Worker + feed *event.Feed + cfg Config + ctx context.Context + cancel context.CancelFunc + wg sync.WaitGroup + running bool + mu sync.Mutex // protects running + lg log.Logger + scanPermit chan struct{} // to ensure only one scan at a time + statsMu sync.Mutex // protects sharedStats + sharedStats stats } func New( @@ -45,7 +43,6 @@ func New( scanner := &Scanner{ worker: NewWorker(sm, fetchBlob, l1, uint64(cfg.BatchSize), lg), feed: feed, - interval: time.Minute * time.Duration(cfg.Interval), cfg: cfg, ctx: cctx, cancel: cancel, @@ -99,16 +96,21 @@ func (s *Scanner) start() { s.startReporter() + if s.cfg.Mode == modeDisabled { + s.lg.Info("Scanner is disabled") + return + } if s.cfg.Mode == modeCheckBlob+modeCheckMeta { - // Always keep blob interval 24 * 60 times of meta interval for hybrid mode - blobInterval := time.Hour * 24 * time.Duration(s.cfg.Interval) - s.lg.Info("Scanner running in hybrid mode", "mode", s.cfg.Mode, "metaInterval", s.interval, "blobInterval", blobInterval) - s.launchScanLoop(&scanLoopState{mode: modeCheckBlob}, blobInterval) - s.launchScanLoop(&scanLoopState{mode: modeCheckMeta}, s.interval) + s.lg.Info("Scanner running in hybrid mode", "mode", s.cfg.Mode, "metaInterval", s.cfg.IntervalMeta, "blobInterval", s.cfg.IntervalBlob) + s.launchScanLoop(&scanLoopState{mode: modeCheckBlob}, s.cfg.IntervalBlob) + s.launchScanLoop(&scanLoopState{mode: modeCheckMeta}, s.cfg.IntervalMeta) return } - - s.launchScanLoop(&scanLoopState{mode: s.cfg.Mode}, s.interval) + interval := s.cfg.IntervalMeta + if s.cfg.Mode == modeCheckBlob { + interval = s.cfg.IntervalBlob + } + s.launchScanLoop(&scanLoopState{mode: s.cfg.Mode}, interval) } func (s *Scanner) launchScanLoop(state *scanLoopState, interval time.Duration) { @@ -116,7 +118,7 @@ func (s *Scanner) launchScanLoop(state *scanLoopState, interval time.Duration) { go func() { defer s.wg.Done() - s.lg.Info("Scanner started", "mode", state.mode, "interval", interval.String(), "batchSize", s.cfg.BatchSize) + s.lg.Info("Scanner configured", "mode", state.mode, "interval", interval.String(), "batchSize", s.cfg.BatchSize) mainTicker := time.NewTicker(interval) defer mainTicker.Stop() @@ -139,10 +141,9 @@ func (s *Scanner) doWork(state *scanLoopState) { return } s.statsMu.Lock() - localKvCount := s.localKvCount tracker := s.sharedStats.mismatched.clone() s.statsMu.Unlock() - stats, err := s.worker.ScanBatch(s.ctx, state, localKvCount, tracker) + stats, err := s.worker.ScanBatch(s.ctx, state, tracker) s.releaseScanPermit() if err != nil { s.lg.Error("Scanner: initial scan failed", "mode", state.mode, "error", err) @@ -171,18 +172,14 @@ func (s *Scanner) startReporter() { s.wg.Add(1) go func() { defer s.wg.Done() + + s.logStats() ticker := time.NewTicker(time.Minute) defer ticker.Stop() for { select { case <-ticker.C: - // update local entries info - localKvs, sum := s.worker.summaryLocalKvs() - s.statsMu.Lock() - 
s.localKvCount = localKvs - s.statsMu.Unlock() - - s.logStats(sum) + s.logStats() case <-s.ctx.Done(): return } @@ -190,9 +187,8 @@ func (s *Scanner) startReporter() { }() } -func (s *Scanner) logStats(sum string) { +func (s *Scanner) logStats() { s.statsMu.Lock() - localKvCount := s.localKvCount var mismatched string if len(s.sharedStats.mismatched) > 0 { mismatched = s.sharedStats.mismatched.String() @@ -203,6 +199,7 @@ func (s *Scanner) logStats(sum string) { } s.statsMu.Unlock() + localKvCount, sum := s.worker.summaryLocalKvs() logFields := []any{ "mode", s.cfg.Mode, "localKvs", sum, From 3ffb9f02668f1c26ab85e9c7d8fa3a46e6ca2e16 Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 18 Dec 2025 15:13:09 +0800 Subject: [PATCH 26/51] update check stats using callback --- ethstorage/scanner/scanner.go | 53 ++++++++++++++++++++--------------- ethstorage/scanner/utils.go | 8 ++++++ ethstorage/scanner/worker.go | 46 +++++++++++++++++------------- 3 files changed, 65 insertions(+), 42 deletions(-) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index cba293d8..b795eb2f 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -143,12 +143,39 @@ func (s *Scanner) doWork(state *scanLoopState) { s.statsMu.Lock() tracker := s.sharedStats.mismatched.clone() s.statsMu.Unlock() - stats, err := s.worker.ScanBatch(s.ctx, state, tracker) + onUpdate := func(u scanUpdate) { + s.applyUpdate(u) + } + err := s.worker.ScanBatch(s.ctx, state, tracker, onUpdate) s.releaseScanPermit() if err != nil { s.lg.Error("Scanner: initial scan failed", "mode", state.mode, "error", err) - } else { - s.updateSharedStats(stats) + } +} + +func (s *Scanner) applyUpdate(u scanUpdate) { + s.statsMu.Lock() + defer s.statsMu.Unlock() + + if u.status != nil { + if s.sharedStats.mismatched == nil { + s.sharedStats.mismatched = mismatchTracker{} + } + s.sharedStats.mismatched[u.kvIndex] = *u.status + } else if u.status == nil { + delete(s.sharedStats.mismatched, u.kvIndex) + } + + if u.err != nil { + if s.sharedStats.errs == nil { + s.sharedStats.errs = scanErrors{} + } + s.sharedStats.errs[u.kvIndex] = u.err + return + } + // nil err means clear error state for this kv index + if s.sharedStats.errs != nil { + delete(s.sharedStats.errs, u.kvIndex) } } @@ -215,26 +242,6 @@ func (s *Scanner) logStats() { } } -func (s *Scanner) updateSharedStats(sts *stats) { - if sts == nil { - return - } - s.statsMu.Lock() - defer s.statsMu.Unlock() - - if sts.mismatched != nil { - s.sharedStats.mismatched = sts.mismatched.clone() - } else { - s.sharedStats.mismatched = mismatchTracker{} - } - if sts.errs != nil { - if s.sharedStats.errs == nil { - s.sharedStats.errs = scanErrors{} - } - s.sharedStats.errs.merge(sts.errs) - } -} - func (s *Scanner) GetScanState() *ScanStats { s.statsMu.Lock() defer s.statsMu.Unlock() diff --git a/ethstorage/scanner/utils.go b/ethstorage/scanner/utils.go index a5a19caa..69889653 100644 --- a/ethstorage/scanner/utils.go +++ b/ethstorage/scanner/utils.go @@ -138,6 +138,14 @@ type stats struct { errs scanErrors // latest scan errors keyed by kv index } +type scanUpdate struct { + kvIndex uint64 + status *status + err error +} + +type scanUpdateFn func(scanUpdate) + func newStats() *stats { return &stats{ mismatched: mismatchTracker{}, diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index f55002a1..b1506eb9 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -51,7 +51,11 @@ func NewWorker( } } -func (s *Worker) 
ScanBatch(ctx context.Context, state *scanLoopState, mismatched mismatchTracker) (*stats, error) { +func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched mismatchTracker, onUpdate scanUpdateFn) error { + if onUpdate == nil { + onUpdate = func(scanUpdate) {} + } + start := time.Now() var kvsInBatch []uint64 defer func(stt time.Time) { @@ -60,13 +64,10 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched } }(start) - // Never return nil stats and nil scanErrors - sts := newStats() - errs := sts.errs localKvCount, _ := s.summaryLocalKvs() if localKvCount == 0 { s.lg.Info("Scanner: no KV entries found in local storage") - return sts, nil + return nil } // Query local storage info shards := s.sm.Shards() @@ -81,7 +82,7 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched metas, err := s.l1.GetKvMetas(kvsInBatch, rpc.FinalizedBlockNumber.Int64()) if err != nil { s.lg.Error("Scanner: failed to query KV metas", "error", err) - return sts, fmt.Errorf("failed to query KV metas: %w", err) + return fmt.Errorf("failed to query KV metas: %w", err) } s.lg.Debug("Scanner: query KV meta done", "kvsInBatch", shortPrt(kvsInBatch)) @@ -89,23 +90,21 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched select { case <-ctx.Done(): s.lg.Warn("Scanner canceled, stopping scan", "ctx.Err", ctx.Err()) - return sts, ctx.Err() + return ctx.Err() default: } var commit common.Hash copy(commit[:], meta[32-es.HashSizeInContract:32]) kvIndex := kvsInBatch[i] - s.scanKv(state.mode, kvIndex, commit, &mismatched, errs) + s.scanKv(state.mode, kvIndex, commit, &mismatched, onUpdate) } - sts.mismatched = mismatched state.nextIndex = batchEndExclusive - - return sts, nil + return nil } -func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, mismatched *mismatchTracker, errs scanErrors) { +func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, mismatched *mismatchTracker, onUpdate scanUpdateFn) { var err error var found bool switch mode { @@ -115,7 +114,8 @@ func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, misma metaLocal, found, err = s.sm.TryReadMeta(kvIndex) if err != nil { s.lg.Error("Scanner: failed to read meta", "kvIndex", kvIndex, "error", err) - errs.add(kvIndex, fmt.Errorf("failed to read meta: %w", err)) + errWrapped := fmt.Errorf("failed to read meta: %w", err) + onUpdate(scanUpdate{kvIndex: kvIndex, err: errWrapped}) return } err = es.CompareCommits(commit.Bytes(), metaLocal) @@ -134,11 +134,12 @@ func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, misma switch status { case failed: mismatched.markRecovered(kvIndex) - // Clear the error state - errs.nil(kvIndex) + newStatus := recovered + onUpdate(scanUpdate{kvIndex: kvIndex, status: &newStatus, err: nil}) s.lg.Info("Scanner: previously failed KV recovered", "kvIndex", kvIndex) case pending: delete(*mismatched, kvIndex) + onUpdate(scanUpdate{kvIndex: kvIndex, status: nil, err: nil}) s.lg.Info("Scanner: previously pending KV recovered", "kvIndex", kvIndex) } } @@ -150,7 +151,8 @@ func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, misma if !found { // The shard is not stored locally - errs.add(kvIndex, fmt.Errorf("shard not found locally: commit=%x", commit)) + errWrapped := fmt.Errorf("shard not found locally: commit=%x", commit) + onUpdate(scanUpdate{kvIndex: kvIndex, err: errWrapped}) s.lg.Error("Scanner: blob not found locally", 
"kvIndex", kvIndex, "commit", commit) return } @@ -166,21 +168,27 @@ func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, misma if fixErr := s.fixKv(kvIndex, commit); fixErr != nil { mismatched.markFailed(kvIndex) s.lg.Error("Scanner: failed to fix blob", "kvIndex", kvIndex, "error", fixErr) - errs.add(kvIndex, fmt.Errorf("failed to fix blob: %w", fixErr)) + errWrapped := fmt.Errorf("failed to fix blob: %w", fixErr) + newStatus := failed + onUpdate(scanUpdate{kvIndex: kvIndex, status: &newStatus, err: errWrapped}) } else { s.lg.Info("Scanner: blob fixed successfully", "kvIndex", kvIndex) mismatched.markFixed(kvIndex) - errs.nil(kvIndex) + newStatus := fixed + onUpdate(scanUpdate{kvIndex: kvIndex, status: &newStatus, err: nil}) } } else { // Mark but skip on the first occurrence as it may be caused by KV update and delayed download mismatched.markPending(kvIndex) + newStatus := pending + onUpdate(scanUpdate{kvIndex: kvIndex, status: &newStatus}) s.lg.Info("Scanner: first-time mismatch, skipping fix attempt", "kvIndex", kvIndex) } } else { s.lg.Error("Scanner: unexpected error occurred", "kvIndex", kvIndex, "error", err) - errs.add(kvIndex, fmt.Errorf("unexpected error: %w", err)) + errWrapped := fmt.Errorf("unexpected error: %w", err) + onUpdate(scanUpdate{kvIndex: kvIndex, err: errWrapped}) } } } From 7aa6373a7ef124f8fd2cdd956bb90fd74932d492 Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 18 Dec 2025 17:46:04 +0800 Subject: [PATCH 27/51] retry download blob --- ethstorage/downloader/downloader.go | 67 ++++++++++++++++++++--------- ethstorage/eth/beacon_client.go | 8 ++-- 2 files changed, 51 insertions(+), 24 deletions(-) diff --git a/ethstorage/downloader/downloader.go b/ethstorage/downloader/downloader.go index 36249460..39e37808 100644 --- a/ethstorage/downloader/downloader.go +++ b/ethstorage/downloader/downloader.go @@ -388,26 +388,10 @@ func (s *Downloader) downloadRange(start int64, end int64, toCache bool) ([]blob ) } - var clBlobs map[common.Hash]eth.Blob - if s.l1Beacon != nil { - clBlobs, err = s.l1Beacon.DownloadBlobs(s.l1Beacon.Timestamp2Slot(elBlock.timestamp)) - if err != nil { - s.lg.Error("L1 beacon download blob error", "err", err) - return nil, err - } - } else if s.daClient != nil { - var hashes []common.Hash - for _, blob := range elBlock.blobs { - hashes = append(hashes, blob.hash) - } - - clBlobs, err = s.daClient.DownloadBlobs(hashes) - if err != nil { - s.lg.Error("DA client download blob error", "err", err) - return nil, err - } - } else { - return nil, fmt.Errorf("no beacon client or DA client is available") + clBlobs, err := s.downloadBlobsWithRetry(elBlock, 3) + if err != nil { + s.lg.Error("Failed to download blobs for the block after 3 attempts", "block", elBlock.number, "err", err) + // Empty CL blob will be handled later in the EL blob loop } for _, elBlob := range elBlock.blobs { @@ -441,6 +425,49 @@ func (s *Downloader) downloadRange(start int64, end int64, toCache bool) ([]blob return blobs, nil } +func (s *Downloader) downloadBlobsWithRetry(elBlock *blockBlobs, maxAttempts int) (map[common.Hash]eth.Blob, error) { + var lastErr error + for attempt := 1; attempt <= maxAttempts; attempt++ { + clBlobs, err := s.downloadBlobs(elBlock) + if err == nil { + return clBlobs, nil + } + lastErr = err + if attempt < maxAttempts { + time.Sleep(3 * time.Second) + } + } + return nil, lastErr +} + +func (s *Downloader) downloadBlobs(elBlock *blockBlobs) (map[common.Hash]eth.Blob, error) { + if s.l1Beacon != nil { + slot := 
s.l1Beacon.Timestamp2Slot(elBlock.timestamp) + clBlobs, err := s.l1Beacon.DownloadBlobs(slot) + if err != nil { + s.lg.Error("L1 beacon download blob error", "block", elBlock.number, "slot", slot, "err", err) + return nil, err + } + return clBlobs, nil + } + + if s.daClient != nil { + hashes := make([]common.Hash, 0, len(elBlock.blobs)) + for _, b := range elBlock.blobs { + hashes = append(hashes, b.hash) + } + + clBlobs, err := s.daClient.DownloadBlobs(hashes) + if err != nil { + s.lg.Error("DA client download blob error", "err", err) + return nil, err + } + return clBlobs, nil + } + + return nil, fmt.Errorf("no beacon client or DA client is available") +} + func (s *Downloader) dumpBlobsIfNeeded(blobs []blob) { if s.dumpDir != "" { for _, blob := range blobs { diff --git a/ethstorage/eth/beacon_client.go b/ethstorage/eth/beacon_client.go index 880be075..46f74445 100644 --- a/ethstorage/eth/beacon_client.go +++ b/ethstorage/eth/beacon_client.go @@ -77,13 +77,13 @@ func (c *BeaconClient) DownloadBlobs(slot uint64) (map[common.Hash]Blob, error) } resp, err := http.Get(beaconUrl) if err != nil { - return nil, err + return nil, fmt.Errorf("failed to query beacon blobs with url %s: %w", beaconUrl, err) } defer resp.Body.Close() var blobsResp blobs.BeaconBlobs if err := json.NewDecoder(resp.Body).Decode(&blobsResp); err != nil { - return nil, err + return nil, fmt.Errorf("failed to decode beacon blobs response from url %s: %w", beaconUrl, err) } res := map[common.Hash]Blob{} @@ -91,11 +91,11 @@ func (c *BeaconClient) DownloadBlobs(slot uint64) (map[common.Hash]Blob, error) // decode hex string to bytes asciiBytes, err := hex.DecodeString(beaconBlob[2:]) if err != nil { - return nil, err + return nil, fmt.Errorf("failed to decode beacon blob hex string %s: %w", beaconBlob, err) } hash, err := blobs.BlobToVersionedHash(asciiBytes) if err != nil { - return nil, err + return nil, fmt.Errorf("failed to compute versioned hash for blob: %w", err) } res[hash] = Blob{VersionedHash: hash, Data: asciiBytes} } From 51fc8c6e4610c4d205fc3b919eed16297fbc39f2 Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 18 Dec 2025 19:09:38 +0800 Subject: [PATCH 28/51] clean up --- ethstorage/scanner/scanner.go | 5 +-- ethstorage/scanner/utils.go | 21 --------- ethstorage/scanner/worker.go | 80 ++++++++--------------------------- 3 files changed, 18 insertions(+), 88 deletions(-) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index b795eb2f..31001fe2 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -140,13 +140,10 @@ func (s *Scanner) doWork(state *scanLoopState) { if !s.acquireScanPermit() { return } - s.statsMu.Lock() - tracker := s.sharedStats.mismatched.clone() - s.statsMu.Unlock() onUpdate := func(u scanUpdate) { s.applyUpdate(u) } - err := s.worker.ScanBatch(s.ctx, state, tracker, onUpdate) + err := s.worker.ScanBatch(s.ctx, state, onUpdate) s.releaseScanPermit() if err != nil { s.lg.Error("Scanner: initial scan failed", "mode", state.mode, "error", err) diff --git a/ethstorage/scanner/utils.go b/ethstorage/scanner/utils.go index 69889653..4b92ec6e 100644 --- a/ethstorage/scanner/utils.go +++ b/ethstorage/scanner/utils.go @@ -71,27 +71,6 @@ func (m mismatchTracker) String() string { return "[" + strings.Join(items, ",") + "]" } -func (m mismatchTracker) markPending(kvIndex uint64) { - m[kvIndex] = pending -} - -func (m mismatchTracker) markRecovered(kvIndex uint64) { - m[kvIndex] = recovered -} - -func (m mismatchTracker) markFixed(kvIndex uint64) { - 
m[kvIndex] = fixed -} - -func (m mismatchTracker) markFailed(kvIndex uint64) { - m[kvIndex] = failed -} - -func (m mismatchTracker) shouldFix(kvIndex uint64) bool { - status, exists := m[kvIndex] - return exists && (status == pending || status == failed) -} - // failed() returns all indices that are still mismatched // since the first-time do not count as mismatched and the // second-time will be fixed immediately if possible diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index b1506eb9..b9774f4b 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -51,7 +51,7 @@ func NewWorker( } } -func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched mismatchTracker, onUpdate scanUpdateFn) error { +func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, onUpdate scanUpdateFn) error { if onUpdate == nil { onUpdate = func(scanUpdate) {} } @@ -97,14 +97,14 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, mismatched var commit common.Hash copy(commit[:], meta[32-es.HashSizeInContract:32]) kvIndex := kvsInBatch[i] - s.scanKv(state.mode, kvIndex, commit, &mismatched, onUpdate) + s.scanKv(state.mode, kvIndex, commit, onUpdate) } state.nextIndex = batchEndExclusive return nil } -func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, mismatched *mismatchTracker, onUpdate scanUpdateFn) { +func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, onUpdate scanUpdateFn) { var err error var found bool switch mode { @@ -127,28 +127,6 @@ func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, misma s.lg.Crit("Scanner: invalid scanner mode", "mode", mode) } - if found && err == nil { - - // Update status for previously mismatched entries that are now valid - if status, exists := (*mismatched)[kvIndex]; exists { - switch status { - case failed: - mismatched.markRecovered(kvIndex) - newStatus := recovered - onUpdate(scanUpdate{kvIndex: kvIndex, status: &newStatus, err: nil}) - s.lg.Info("Scanner: previously failed KV recovered", "kvIndex", kvIndex) - case pending: - delete(*mismatched, kvIndex) - onUpdate(scanUpdate{kvIndex: kvIndex, status: nil, err: nil}) - s.lg.Info("Scanner: previously pending KV recovered", "kvIndex", kvIndex) - } - } - - // Happy path - s.lg.Debug("Scanner: KV check completed successfully", "kvIndex", kvIndex, "commit", commit) - return - } - if !found { // The shard is not stored locally errWrapped := fmt.Errorf("shard not found locally: commit=%x", commit) @@ -157,47 +135,23 @@ func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, misma return } - if err != nil { - var commitErr *es.CommitMismatchError - if errors.As(err, &commitErr) { - s.lg.Warn("Scanner: commit mismatch detected", "kvIndex", kvIndex, "error", err) - - // Only fix repeated mismatches - if mismatched.shouldFix(kvIndex) { - s.lg.Info("Scanner: mismatch again, attempting to fix blob", "kvIndex", kvIndex, "commit", commit) - if fixErr := s.fixKv(kvIndex, commit); fixErr != nil { - mismatched.markFailed(kvIndex) - s.lg.Error("Scanner: failed to fix blob", "kvIndex", kvIndex, "error", fixErr) - errWrapped := fmt.Errorf("failed to fix blob: %w", fixErr) - newStatus := failed - onUpdate(scanUpdate{kvIndex: kvIndex, status: &newStatus, err: errWrapped}) - } else { - s.lg.Info("Scanner: blob fixed successfully", "kvIndex", kvIndex) - mismatched.markFixed(kvIndex) - newStatus := fixed - onUpdate(scanUpdate{kvIndex: kvIndex, status: &newStatus, 
err: nil}) - } - } else { - - // Mark but skip on the first occurrence as it may be caused by KV update and delayed download - mismatched.markPending(kvIndex) - newStatus := pending - onUpdate(scanUpdate{kvIndex: kvIndex, status: &newStatus}) - s.lg.Info("Scanner: first-time mismatch, skipping fix attempt", "kvIndex", kvIndex) - } - } else { - s.lg.Error("Scanner: unexpected error occurred", "kvIndex", kvIndex, "error", err) - errWrapped := fmt.Errorf("unexpected error: %w", err) - onUpdate(scanUpdate{kvIndex: kvIndex, err: errWrapped}) - } + if err == nil { + // Happy path + s.lg.Debug("Scanner: KV check completed successfully", "kvIndex", kvIndex, "commit", commit) + return } -} -func (s *Worker) fixKv(kvIndex uint64, commit common.Hash) error { - if err := s.sm.TryWriteWithMetaCheck(kvIndex, commit, s.fetchBlob); err != nil { - return fmt.Errorf("failed to write KV: kvIndex=%d, commit=%x, %w", kvIndex, commit, err) + var commitErr *es.CommitMismatchError + if errors.As(err, &commitErr) { + s.lg.Warn("Scanner: commit mismatch detected", "kvIndex", kvIndex, "error", err) + newStatus := pending + onUpdate(scanUpdate{kvIndex: kvIndex, status: &newStatus, err: nil}) + return } - return nil + + s.lg.Error("Scanner: unexpected error occurred", "kvIndex", kvIndex, "error", err) + errWrapped := fmt.Errorf("unexpected error: %w", err) + onUpdate(scanUpdate{kvIndex: kvIndex, err: errWrapped}) } func (s *Worker) summaryLocalKvs() (uint64, string) { From 070f0e25b70165b420541ca597d82c1c4e629806 Mon Sep 17 00:00:00 2001 From: syntrust Date: Fri, 19 Dec 2025 19:20:22 +0800 Subject: [PATCH 29/51] refactor --- ethstorage/scanner/scanner.go | 50 ++++++--------- ethstorage/scanner/utils.go | 112 +++++++++++++++------------------- ethstorage/scanner/worker.go | 102 +++++++++++++++++-------------- 3 files changed, 123 insertions(+), 141 deletions(-) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index 31001fe2..e22a345c 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -27,7 +27,7 @@ type Scanner struct { lg log.Logger scanPermit chan struct{} // to ensure only one scan at a time statsMu sync.Mutex // protects sharedStats - sharedStats stats + sharedStats scannedKVs } func New( @@ -48,7 +48,7 @@ func New( cancel: cancel, lg: lg, scanPermit: make(chan struct{}, 1), - sharedStats: *newStats(), + sharedStats: scannedKVs{}, } scanner.scanPermit <- struct{}{} scanner.wg.Add(1) @@ -140,39 +140,23 @@ func (s *Scanner) doWork(state *scanLoopState) { if !s.acquireScanPermit() { return } - onUpdate := func(u scanUpdate) { - s.applyUpdate(u) - } - err := s.worker.ScanBatch(s.ctx, state, onUpdate) + err := s.worker.ScanBatch(s.ctx, state, func(kvi uint64, m *scanned) { + s.applyUpdate(kvi, m) + }) s.releaseScanPermit() if err != nil { s.lg.Error("Scanner: initial scan failed", "mode", state.mode, "error", err) } } -func (s *Scanner) applyUpdate(u scanUpdate) { +func (s *Scanner) applyUpdate(kvi uint64, m *scanned) { s.statsMu.Lock() defer s.statsMu.Unlock() - if u.status != nil { - if s.sharedStats.mismatched == nil { - s.sharedStats.mismatched = mismatchTracker{} - } - s.sharedStats.mismatched[u.kvIndex] = *u.status - } else if u.status == nil { - delete(s.sharedStats.mismatched, u.kvIndex) - } - - if u.err != nil { - if s.sharedStats.errs == nil { - s.sharedStats.errs = scanErrors{} - } - s.sharedStats.errs[u.kvIndex] = u.err - return - } - // nil err means clear error state for this kv index - if s.sharedStats.errs != nil { - delete(s.sharedStats.errs, 
u.kvIndex) + if m != nil { + s.sharedStats[kvi] = *m + } else { + delete(s.sharedStats, kvi) } } @@ -214,12 +198,12 @@ func (s *Scanner) startReporter() { func (s *Scanner) logStats() { s.statsMu.Lock() var mismatched string - if len(s.sharedStats.mismatched) > 0 { - mismatched = s.sharedStats.mismatched.String() + if len(s.sharedStats) > 0 { + mismatched = s.sharedStats.String() } - errSnapshot := scanErrors{} - if s.sharedStats.errs != nil { - maps.Copy(errSnapshot, s.sharedStats.errs) + errSnapshot := make(map[uint64]error) + if s.sharedStats.hasError() { + maps.Copy(errSnapshot, s.sharedStats.withErrors()) } s.statsMu.Unlock() @@ -244,8 +228,8 @@ func (s *Scanner) GetScanState() *ScanStats { defer s.statsMu.Unlock() return &ScanStats{ - MismatchedCount: len(s.sharedStats.mismatched), - UnfixedCount: len(s.sharedStats.mismatched.failed()), + MismatchedCount: len(s.sharedStats), + UnfixedCount: len(s.sharedStats.failed()), } } diff --git a/ethstorage/scanner/utils.go b/ethstorage/scanner/utils.go index 4b92ec6e..dd37bd80 100644 --- a/ethstorage/scanner/utils.go +++ b/ethstorage/scanner/utils.go @@ -5,42 +5,34 @@ package scanner import ( "fmt" - "maps" "slices" "strings" -) - -type scanErrors map[uint64]error -func (s scanErrors) add(kvIndex uint64, err error) { - s[kvIndex] = err -} + "github.com/ethereum/go-ethereum/common" +) -func (s scanErrors) nil(kvIndex uint64) { - s[kvIndex] = nil -} - -func (s scanErrors) merge(errs scanErrors) { - for k, v := range errs { - if v != nil { - s[k] = v - } else { - delete(s, k) - } - } +type scanned struct { + status + err error } type status int const ( - pending status = iota // first-time detected - fixed // by scanner - recovered // by downloader - failed // failed to fix + ok status = iota + notFound // not found + pending // first-time detected + fixed // by scanner + recovered // by downloader + failed // failed to fix ) func (s status) String() string { switch s { + case ok: + return "ok" + case notFound: + return "not_found" case pending: return "pending" case recovered: @@ -54,9 +46,9 @@ func (s status) String() string { } } -type mismatchTracker map[uint64]status +type scannedKVs map[uint64]scanned -func (m mismatchTracker) String() string { +func (m scannedKVs) String() string { var items []string keys := make([]uint64, 0, len(m)) for k := range m { @@ -71,29 +63,32 @@ func (m mismatchTracker) String() string { return "[" + strings.Join(items, ",") + "]" } -// failed() returns all indices that are still mismatched -// since the first-time do not count as mismatched and the -// second-time will be fixed immediately if possible -func (m mismatchTracker) failed() []uint64 { - return m.filterByStatus(failed) -} - -// fixed() returns only indices that have been fixed by the scanner -// add recovered() to get those fixed by downloader -func (m mismatchTracker) fixed() []uint64 { - return m.filterByStatus(fixed) +func (m scannedKVs) hasError() bool { + for _, scanned := range m { + if scanned.err != nil { + return true + } + } + return false } -// recovered() returns indices fixed by downloader from failed status -// those recovered from pending status are no longer tracked -func (m mismatchTracker) recovered() []uint64 { - return m.filterByStatus(recovered) +func (m scannedKVs) withErrors() map[uint64]error { + res := make(map[uint64]error) + for kvIndex, scanned := range m { + if scanned.err != nil { + res[kvIndex] = scanned.err + } + } + return res } -func (m mismatchTracker) filterByStatus(s status) []uint64 { +// failed() returns all 
indices that are still mismatched +// since the first-time do not count as mismatched and the +// second-time will be fixed immediately if possible +func (m scannedKVs) failed() []uint64 { var res []uint64 - for kvIndex, status := range m { - if status == s { + for kvIndex, scanned := range m { + if scanned.status == failed { res = append(res, kvIndex) } } @@ -101,35 +96,28 @@ func (m mismatchTracker) filterByStatus(s status) []uint64 { return res } -func (m mismatchTracker) clone() mismatchTracker { - clone := make(mismatchTracker) - maps.Copy(clone, m) - return clone -} - type scanLoopState struct { mode scanMode nextIndex uint64 } -type stats struct { - mismatched mismatchTracker // tracks all mismatched indices and their status - errs scanErrors // latest scan errors keyed by kv index -} +type scanUpdateFn func(kvi uint64, m *scanned) -type scanUpdate struct { +type scanMarker struct { kvIndex uint64 - status *status - err error + mark scanUpdateFn } -type scanUpdateFn func(scanUpdate) +func newScanMarker(kvIndex uint64, fn scanUpdateFn) *scanMarker { + return &scanMarker{kvIndex: kvIndex, mark: fn} +} -func newStats() *stats { - return &stats{ - mismatched: mismatchTracker{}, - errs: scanErrors{}, - } +func (m *scanMarker) markError(commit common.Hash, err error) { + m.mark(m.kvIndex, &scanned{status: notFound, err: err}) +} + +func (m *scanMarker) markPending() { + m.mark(m.kvIndex, &scanned{status: pending, err: nil}) } func shortPrt(nums []uint64) string { diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index b9774f4b..b4810ac7 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -52,31 +52,32 @@ func NewWorker( } func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, onUpdate scanUpdateFn) error { + // Noop if onUpdate == nil { - onUpdate = func(scanUpdate) {} + onUpdate = func(kvi uint64, m *scanned) {} } start := time.Now() var kvsInBatch []uint64 defer func(stt time.Time) { if len(kvsInBatch) > 0 { - s.lg.Info("Scanner: scan batch done", "mode", state.mode, "scanned", shortPrt(kvsInBatch), "count", len(kvsInBatch), "nextIndexOfKvIdx", state.nextIndex, "duration", time.Since(stt).String()) + s.lg.Info("Scanner: scan batch done", + "mode", state.mode, + "scanned", shortPrt(kvsInBatch), + "count", len(kvsInBatch), + "nextIndexOfKvIdx", state.nextIndex, + "duration", time.Since(stt).String(), + ) } }(start) - localKvCount, _ := s.summaryLocalKvs() - if localKvCount == 0 { - s.lg.Info("Scanner: no KV entries found in local storage") + // Determine the batch of KV indices to scan + kvsInBatch, batchEndExclusive := s.getKvsInBatch(state.nextIndex) + if len(kvsInBatch) == 0 { + s.lg.Info("Scanner: no KV entries to scan in this batch") return nil } - // Query local storage info - shards := s.sm.Shards() - kvEntries := s.sm.KvEntries() - lastKvIdx := s.sm.KvEntryCount() - 1 - startIndexOfKvIdx := state.nextIndex - s.lg.Info("Scanner: scan batch started", "mode", state.mode, "startIndexOfKvIdx", startIndexOfKvIdx, "lastKvIdxOnChain", lastKvIdx, "shardsInLocal", shards) - // Determine the batch of KV indices to scan - kvsInBatch, batchEndExclusive := getKvsInBatch(shards, kvEntries, localKvCount, s.batchSize, startIndexOfKvIdx, s.lg) + s.lg.Info("Scanner: scan batch started", "mode", state.mode, "startIndexOfKvIdx", state.nextIndex) // Query the metas from the L1 contract metas, err := s.l1.GetKvMetas(kvsInBatch, rpc.FinalizedBlockNumber.Int64()) @@ -104,54 +105,63 @@ func (s *Worker) ScanBatch(ctx context.Context, state 
*scanLoopState, onUpdate s return nil } +func (s *Worker) getKvsInBatch(startIndexOfKvIdx uint64) ([]uint64, uint64) { + localKvCount, _ := s.summaryLocalKvs() + if localKvCount == 0 { + s.lg.Info("Scanner: no KV entries found in local storage") + return []uint64{}, 0 + } + shards := s.sm.Shards() + kvEntries := s.sm.KvEntries() + return getKvsInBatch(shards, kvEntries, localKvCount, s.batchSize, startIndexOfKvIdx, s.lg) +} + func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, onUpdate scanUpdateFn) { var err error - var found bool switch mode { case modeCheckMeta: // Check meta only - var metaLocal []byte - metaLocal, found, err = s.sm.TryReadMeta(kvIndex) - if err != nil { - s.lg.Error("Scanner: failed to read meta", "kvIndex", kvIndex, "error", err) - errWrapped := fmt.Errorf("failed to read meta: %w", err) - onUpdate(scanUpdate{kvIndex: kvIndex, err: errWrapped}) - return + metaLocal, found, readErr := s.sm.TryReadMeta(kvIndex) + if metaLocal != nil { + err = es.CompareCommits(commit.Bytes(), metaLocal) + } else { + if readErr != nil { + err = fmt.Errorf("failed to read meta: %w", readErr) + } else if !found { + err = fmt.Errorf("meta not found locally: %x", commit) + } else { + err = fmt.Errorf("meta is nil") + } } - err = es.CompareCommits(commit.Bytes(), metaLocal) + s.lg.Error("Scanner: failed to read meta", "kvIndex", kvIndex, "error", err) + case modeCheckBlob: // Query blob and check meta from storage - _, found, err = s.sm.TryRead(kvIndex, int(s.sm.MaxKvSize()), commit) + _, found, readErr := s.sm.TryRead(kvIndex, int(s.sm.MaxKvSize()), commit) + if readErr != nil { + s.lg.Error("Scanner: failed to read blob", "kvIndex", kvIndex, "error", readErr) + err = readErr + } else if !found { + err = fmt.Errorf("blob not found locally: %x", commit) + } default: // Other modes are handled outside s.lg.Crit("Scanner: invalid scanner mode", "mode", mode) } - - if !found { - // The shard is not stored locally - errWrapped := fmt.Errorf("shard not found locally: commit=%x", commit) - onUpdate(scanUpdate{kvIndex: kvIndex, err: errWrapped}) - s.lg.Error("Scanner: blob not found locally", "kvIndex", kvIndex, "commit", commit) - return - } - - if err == nil { - // Happy path - s.lg.Debug("Scanner: KV check completed successfully", "kvIndex", kvIndex, "commit", commit) - return - } - - var commitErr *es.CommitMismatchError - if errors.As(err, &commitErr) { - s.lg.Warn("Scanner: commit mismatch detected", "kvIndex", kvIndex, "error", err) - newStatus := pending - onUpdate(scanUpdate{kvIndex: kvIndex, status: &newStatus, err: nil}) + if err != nil { + marker := newScanMarker(kvIndex, onUpdate) + var commitErr *es.CommitMismatchError + if errors.As(err, &commitErr) { + s.lg.Warn("Scanner: commit mismatch detected", "kvIndex", kvIndex, "error", err) + marker.markPending() + return + } + marker.markError(commit, fmt.Errorf("unexpected error: %w", err)) return } - s.lg.Error("Scanner: unexpected error occurred", "kvIndex", kvIndex, "error", err) - errWrapped := fmt.Errorf("unexpected error: %w", err) - onUpdate(scanUpdate{kvIndex: kvIndex, err: errWrapped}) + // Happy path + s.lg.Debug("Scanner: KV check completed successfully", "kvIndex", kvIndex, "commit", commit) } func (s *Worker) summaryLocalKvs() (uint64, string) { From dfd6d407765c06ff856971ed3d8fab1fc5de10d3 Mon Sep 17 00:00:00 2001 From: syntrust Date: Sat, 20 Dec 2025 12:16:40 +0800 Subject: [PATCH 30/51] minor --- ethstorage/scanner/worker.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git 
a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index b4810ac7..a8c24030 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -129,17 +129,13 @@ func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, onUpd err = fmt.Errorf("failed to read meta: %w", readErr) } else if !found { err = fmt.Errorf("meta not found locally: %x", commit) - } else { - err = fmt.Errorf("meta is nil") } } - s.lg.Error("Scanner: failed to read meta", "kvIndex", kvIndex, "error", err) - case modeCheckBlob: // Query blob and check meta from storage _, found, readErr := s.sm.TryRead(kvIndex, int(s.sm.MaxKvSize()), commit) if readErr != nil { - s.lg.Error("Scanner: failed to read blob", "kvIndex", kvIndex, "error", readErr) + // Could be CommitMismatchError err = readErr } else if !found { err = fmt.Errorf("blob not found locally: %x", commit) @@ -156,6 +152,7 @@ func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, onUpd marker.markPending() return } + s.lg.Error("Scanner: failed to scan KV", "mode", mode, "kvIndex", kvIndex, "error", err) marker.markError(commit, fmt.Errorf("unexpected error: %w", err)) return } From d0f3f853c4f7191e8eec02acf29bfb1d90682cca Mon Sep 17 00:00:00 2001 From: syntrust Date: Mon, 22 Dec 2025 10:57:22 +0800 Subject: [PATCH 31/51] fix kv --- ethstorage/scanner/scanner.go | 46 +++++++++++++++++++- ethstorage/scanner/utils.go | 54 ++++++++++++++++------- ethstorage/scanner/worker.go | 80 +++++++++++++++++++++++++++-------- 3 files changed, 146 insertions(+), 34 deletions(-) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index e22a345c..12cea031 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -100,6 +100,8 @@ func (s *Scanner) start() { s.lg.Info("Scanner is disabled") return } + + // Launch scan loops for hybrid mode if s.cfg.Mode == modeCheckBlob+modeCheckMeta { s.lg.Info("Scanner running in hybrid mode", "mode", s.cfg.Mode, "metaInterval", s.cfg.IntervalMeta, "blobInterval", s.cfg.IntervalBlob) s.launchScanLoop(&scanLoopState{mode: modeCheckBlob}, s.cfg.IntervalBlob) @@ -111,6 +113,9 @@ func (s *Scanner) start() { interval = s.cfg.IntervalBlob } s.launchScanLoop(&scanLoopState{mode: s.cfg.Mode}, interval) + + // Launch the fix loop to fix mismatched KVs every 10 minutes + s.launchFixLoop(time.Minute * 10) } func (s *Scanner) launchScanLoop(state *scanLoopState, interval time.Duration) { @@ -135,6 +140,45 @@ func (s *Scanner) launchScanLoop(state *scanLoopState, interval time.Duration) { } }() } +func (s *Scanner) launchFixLoop(interval time.Duration) { + s.wg.Add(1) + go func() { + defer s.wg.Done() + + s.lg.Info("Scanner fix loop started", "interval", interval.String()) + + fixTicker := time.NewTicker(interval) + defer fixTicker.Stop() + + for { + select { + case <-fixTicker.C: + s.lg.Info("Scanner fix loop triggered") + + if !s.acquireScanPermit() { + return + } + // hold for 3 minutes before fixing to allow possible ongoing kv downloading to finish + time.Sleep(time.Minute * 3) + + s.statsMu.Lock() + kvIndices := s.sharedStats.needFix() + s.statsMu.Unlock() + + err := s.worker.fixBatch(s.ctx, kvIndices, func(kvi uint64, m *scanned) { + s.applyUpdate(kvi, m) + }) + s.releaseScanPermit() + if err != nil { + s.lg.Error("Fix scan batch failed", "error", err) + } + + case <-s.ctx.Done(): + return + } + } + }() +} func (s *Scanner) doWork(state *scanLoopState) { if !s.acquireScanPermit() { @@ -145,7 +189,7 @@ func (s *Scanner) doWork(state *scanLoopState) 
{ }) s.releaseScanPermit() if err != nil { - s.lg.Error("Scanner: initial scan failed", "mode", state.mode, "error", err) + s.lg.Error("Scan batch failed", "mode", state.mode, "error", err) } } diff --git a/ethstorage/scanner/utils.go b/ethstorage/scanner/utils.go index dd37bd80..cfe264d2 100644 --- a/ethstorage/scanner/utils.go +++ b/ethstorage/scanner/utils.go @@ -19,22 +19,22 @@ type scanned struct { type status int const ( - ok status = iota - notFound // not found - pending // first-time detected - fixed // by scanner - recovered // by downloader - failed // failed to fix + ok status = iota + err_read // read meta or blob error / not found + mismatched // mismatch detected + fixed // by scanner + recovered // by downloader + failed // failed to fix ) func (s status) String() string { switch s { case ok: return "ok" - case notFound: - return "not_found" - case pending: - return "pending" + case err_read: + return "err_read" + case mismatched: + return "mismatched" case recovered: return "recovered" case fixed: @@ -82,9 +82,7 @@ func (m scannedKVs) withErrors() map[uint64]error { return res } -// failed() returns all indices that are still mismatched -// since the first-time do not count as mismatched and the -// second-time will be fixed immediately if possible +// failed() returns all kvIndices that are failed to be fixed func (m scannedKVs) failed() []uint64 { var res []uint64 for kvIndex, scanned := range m { @@ -96,6 +94,18 @@ func (m scannedKVs) failed() []uint64 { return res } +// needFix() returns all kvIndices that need to be fixed or at least check again +func (m scannedKVs) needFix() []uint64 { + var res []uint64 + for kvIndex, scanned := range m { + if scanned.status == mismatched || scanned.status == failed || scanned.err != nil { + res = append(res, kvIndex) + } + } + slices.Sort(res) + return res +} + type scanLoopState struct { mode scanMode nextIndex uint64 @@ -113,11 +123,23 @@ func newScanMarker(kvIndex uint64, fn scanUpdateFn) *scanMarker { } func (m *scanMarker) markError(commit common.Hash, err error) { - m.mark(m.kvIndex, &scanned{status: notFound, err: err}) + m.mark(m.kvIndex, &scanned{status: err_read, err: fmt.Errorf("commit: %x, error reading kv: %w", commit, err)}) +} + +func (m *scanMarker) markFailed(commit common.Hash, err error) { + m.mark(m.kvIndex, &scanned{status: failed, err: fmt.Errorf("commit: %x, error fixing kv: %w", commit, err)}) +} + +func (m *scanMarker) markMismatched() { + m.mark(m.kvIndex, &scanned{status: mismatched, err: nil}) +} + +func (m *scanMarker) markFixed() { + m.mark(m.kvIndex, &scanned{status: fixed, err: nil}) } -func (m *scanMarker) markPending() { - m.mark(m.kvIndex, &scanned{status: pending, err: nil}) +func (m *scanMarker) markRecovered() { + m.mark(m.kvIndex, &scanned{status: recovered, err: nil}) } func shortPrt(nums []uint64) string { diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index a8c24030..2a2367d9 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -61,7 +61,7 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, onUpdate s var kvsInBatch []uint64 defer func(stt time.Time) { if len(kvsInBatch) > 0 { - s.lg.Info("Scanner: scan batch done", + s.lg.Info("Scan batch done", "mode", state.mode, "scanned", shortPrt(kvsInBatch), "count", len(kvsInBatch), @@ -74,18 +74,18 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, onUpdate s // Determine the batch of KV indices to scan kvsInBatch, batchEndExclusive := 
s.getKvsInBatch(state.nextIndex) if len(kvsInBatch) == 0 { - s.lg.Info("Scanner: no KV entries to scan in this batch") + s.lg.Info("No KV entries to scan in this batch") return nil } - s.lg.Info("Scanner: scan batch started", "mode", state.mode, "startIndexOfKvIdx", state.nextIndex) + s.lg.Info("Scan batch started", "mode", state.mode, "startIndexOfKvIdx", state.nextIndex) // Query the metas from the L1 contract metas, err := s.l1.GetKvMetas(kvsInBatch, rpc.FinalizedBlockNumber.Int64()) if err != nil { - s.lg.Error("Scanner: failed to query KV metas", "error", err) + s.lg.Error("Failed to query KV metas for scan batch", "error", err) return fmt.Errorf("failed to query KV metas: %w", err) } - s.lg.Debug("Scanner: query KV meta done", "kvsInBatch", shortPrt(kvsInBatch)) + s.lg.Debug("Query KV meta done", "kvsInBatch", shortPrt(kvsInBatch)) for i, meta := range metas { select { @@ -97,8 +97,7 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, onUpdate s var commit common.Hash copy(commit[:], meta[32-es.HashSizeInContract:32]) - kvIndex := kvsInBatch[i] - s.scanKv(state.mode, kvIndex, commit, onUpdate) + s.scanKv(state.mode, kvsInBatch[i], commit, onUpdate) } state.nextIndex = batchEndExclusive @@ -108,7 +107,6 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, onUpdate s func (s *Worker) getKvsInBatch(startIndexOfKvIdx uint64) ([]uint64, uint64) { localKvCount, _ := s.summaryLocalKvs() if localKvCount == 0 { - s.lg.Info("Scanner: no KV entries found in local storage") return []uint64{}, 0 } shards := s.sm.Shards() @@ -142,29 +140,77 @@ func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, onUpd } default: // Other modes are handled outside - s.lg.Crit("Scanner: invalid scanner mode", "mode", mode) + s.lg.Crit("Invalid scanner mode", "mode", mode) } if err != nil { marker := newScanMarker(kvIndex, onUpdate) var commitErr *es.CommitMismatchError if errors.As(err, &commitErr) { - s.lg.Warn("Scanner: commit mismatch detected", "kvIndex", kvIndex, "error", err) - marker.markPending() + s.lg.Warn("Commit mismatch detected", "kvIndex", kvIndex, "error", err) + marker.markMismatched() return } - s.lg.Error("Scanner: failed to scan KV", "mode", mode, "kvIndex", kvIndex, "error", err) - marker.markError(commit, fmt.Errorf("unexpected error: %w", err)) + s.lg.Error("Failed to scan KV", "mode", mode, "kvIndex", kvIndex, "error", err) + marker.markError(commit, err) return } // Happy path - s.lg.Debug("Scanner: KV check completed successfully", "kvIndex", kvIndex, "commit", commit) + s.lg.Debug("KV check completed successfully", "kvIndex", kvIndex, "commit", commit) +} + +func (s *Worker) fixBatch(ctx context.Context, kvIndices []uint64, onUpdate scanUpdateFn) error { + metas, err := s.l1.GetKvMetas(kvIndices, rpc.FinalizedBlockNumber.Int64()) + if err != nil { + s.lg.Error("Failed to query KV metas for scan batch", "error", err) + return fmt.Errorf("failed to query KV metas: %w", err) + } + s.lg.Debug("Query KV meta done", "kvsInBatch", shortPrt(kvIndices)) + + for i, meta := range metas { + select { + case <-ctx.Done(): + s.lg.Warn("Scanner canceled, stopping fix batch", "ctx.Err", ctx.Err()) + return ctx.Err() + default: + } + var commit common.Hash + copy(commit[:], meta[32-es.HashSizeInContract:32]) + s.scanAndFixKv(kvIndices[i], commit, onUpdate) + } + return nil +} + +func (s *Worker) scanAndFixKv(kvIndex uint64, commit common.Hash, onUpdate scanUpdateFn) { + marker := newScanMarker(kvIndex, onUpdate) + _, found, err := 
s.sm.TryRead(kvIndex, int(s.sm.MaxKvSize()), commit) + if !found && err == nil { + err = fmt.Errorf("blob not found locally: %x", commit) + } + if err != nil { + var commitErr *es.CommitMismatchError + if errors.As(err, &commitErr) { + s.lg.Info("Fixing mismatched KV", "kvIndex", kvIndex) + if err := s.sm.TryWriteWithMetaCheck(kvIndex, commit, s.fetchBlob); err != nil { + marker.markFailed(commit, fmt.Errorf("failed to fix KV: kvIndex=%d, commit=%x, %w", kvIndex, commit, err)) + return + } + marker.markFixed() + s.lg.Info("KV fixed successfully", "kvIndex", kvIndex) + return + } + s.lg.Error("Failed to scan KV to fix", "kvIndex", kvIndex, "error", err) + marker.markError(commit, err) + return + } + marker.markRecovered() + s.lg.Info("KV recovered", "kvIndex", kvIndex, "commit", commit) } func (s *Worker) summaryLocalKvs() (uint64, string) { kvEntryCountOnChain := s.sm.KvEntryCount() if kvEntryCountOnChain == 0 { - s.lg.Info("Scanner: no KV entries found in local storage") + s.lg.Info("No KV entries found in local storage") return 0, "(none)" } return summaryLocalKvs(s.sm.Shards(), s.sm.KvEntries(), kvEntryCountOnChain-1) @@ -174,7 +220,7 @@ func getKvsInBatch(shards []uint64, kvEntries, localKvCount, batchSize, startKvI // Determine batch start and end KV indices if startKvIndex >= localKvCount { startKvIndex = 0 - lg.Debug("Scanner: restarting scan from the beginning") + lg.Debug("Restarting scan from the beginning") } endKvIndexExclusive := min(startKvIndex+batchSize, localKvCount) // The actual batch range is [startKvIndex, endKvIndexExclusive) or [startKvIndex, endIndex] @@ -205,7 +251,7 @@ func getKvsInBatch(shards []uint64, kvEntries, localKvCount, batchSize, startKvI kvsInBatch = append(kvsInBatch, shards[i]*kvEntries+k) } } - lg.Debug("Scanner: batch index range determined", "batchStart", startKvIndex, "batchEnd(exclusive)", endKvIndexExclusive, "kvsInBatch", shortPrt(kvsInBatch)) + lg.Debug("Scan batch index range determined", "batchStart", startKvIndex, "batchEnd(exclusive)", endKvIndexExclusive, "kvsInBatch", shortPrt(kvsInBatch)) return kvsInBatch, endKvIndexExclusive } From 232108848b99db514e5f7343fdf6de546e1e1eee Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 23 Dec 2025 19:15:11 +0800 Subject: [PATCH 32/51] scan latest updated --- ethstorage/eth/polling_client.go | 14 +++++ ethstorage/scanner/scanner.go | 88 ++++++++++++++++++++++---------- ethstorage/scanner/utils.go | 9 +++- ethstorage/scanner/worker.go | 52 ++++++++++++++----- 4 files changed, 121 insertions(+), 42 deletions(-) diff --git a/ethstorage/eth/polling_client.go b/ethstorage/eth/polling_client.go index 95169cb7..618247ee 100644 --- a/ethstorage/eth/polling_client.go +++ b/ethstorage/eth/polling_client.go @@ -228,6 +228,20 @@ func (w *PollingClient) FilterLogsByBlockRange(start *big.Int, end *big.Int, eve return w.FilterLogs(context.Background(), query) } +func (w *PollingClient) GetUpdatedKvIndices(startBlock, endBlock *big.Int) ([]uint64, error) { + events, err := w.FilterLogsByBlockRange(startBlock, endBlock, PutBlobEvent) + if err != nil { + return nil, err + } + var kvIndices []uint64 + for _, event := range events { + kvIndices = append(kvIndices, new(big.Int).SetBytes(event.Topics[1][:]).Uint64()) + var hash common.Hash + copy(hash[:], event.Topics[3][:]) + } + return kvIndices, nil +} + func (w *PollingClient) GetStorageKvEntryCount(blockNumber int64) (uint64, error) { h := crypto.Keccak256Hash([]byte(`kvEntryCount()`)) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index 
12cea031..e2d50e0c 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -26,8 +26,8 @@ type Scanner struct { mu sync.Mutex // protects running lg log.Logger scanPermit chan struct{} // to ensure only one scan at a time - statsMu sync.Mutex // protects sharedStats sharedStats scannedKVs + statsMu sync.Mutex // protects sharedStats } func New( @@ -35,7 +35,7 @@ func New( cfg Config, sm *es.StorageManager, fetchBlob es.FetchBlobFunc, - l1 es.Il1Source, + l1 IL1, feed *event.Feed, lg log.Logger, ) *Scanner { @@ -101,38 +101,71 @@ func (s *Scanner) start() { return } - // Launch scan loops for hybrid mode if s.cfg.Mode == modeCheckBlob+modeCheckMeta { s.lg.Info("Scanner running in hybrid mode", "mode", s.cfg.Mode, "metaInterval", s.cfg.IntervalMeta, "blobInterval", s.cfg.IntervalBlob) - s.launchScanLoop(&scanLoopState{mode: modeCheckBlob}, s.cfg.IntervalBlob) - s.launchScanLoop(&scanLoopState{mode: modeCheckMeta}, s.cfg.IntervalMeta) - return + s.launchScanLoop(s.blobScanLoopRuntime()) + s.launchScanLoop(s.metaScanLoopRuntime()) + } else { + s.lg.Info("Scanner running in single mode", "mode", s.cfg.Mode, "interval", s.cfg.IntervalMeta) + s.launchScanLoop(s.defaultScanLoopRuntime()) } + + // Launch the scan loop to fix mismatched KVs every 12 minutes + s.launchFixLoop(time.Minute * 12) + + // Launch the scan loop for the updated KVs within the last scan interval using blob mode + s.launchScanLoop(&scanLoopRuntime{mode: modeCheckBlob, nextBatch: s.worker.latestUpdated, interval: s.cfg.IntervalBlob, batchSize: 7200}) +} + +func (s *Scanner) defaultScanLoopRuntime() *scanLoopRuntime { interval := s.cfg.IntervalMeta if s.cfg.Mode == modeCheckBlob { interval = s.cfg.IntervalBlob } - s.launchScanLoop(&scanLoopState{mode: s.cfg.Mode}, interval) + return &scanLoopRuntime{ + mode: s.cfg.Mode, + nextBatch: s.worker.getKvsInBatch, + interval: interval, + batchSize: uint64(s.cfg.BatchSize), + nextIndex: 0, + } +} + +func (s *Scanner) blobScanLoopRuntime() *scanLoopRuntime { + return &scanLoopRuntime{ + mode: modeCheckBlob, + nextBatch: s.worker.getKvsInBatch, + interval: s.cfg.IntervalBlob, + batchSize: uint64(s.cfg.BatchSize), + nextIndex: 0, + } +} - // Launch the fix loop to fix mismatched KVs every 10 minutes - s.launchFixLoop(time.Minute * 10) +func (s *Scanner) metaScanLoopRuntime() *scanLoopRuntime { + return &scanLoopRuntime{ + mode: modeCheckMeta, + nextBatch: s.worker.getKvsInBatch, + interval: s.cfg.IntervalMeta, + batchSize: uint64(s.cfg.BatchSize), + nextIndex: 0, + } } -func (s *Scanner) launchScanLoop(state *scanLoopState, interval time.Duration) { +func (s *Scanner) launchScanLoop(state *scanLoopRuntime) { s.wg.Add(1) go func() { defer s.wg.Done() - s.lg.Info("Scanner configured", "mode", state.mode, "interval", interval.String(), "batchSize", s.cfg.BatchSize) + s.lg.Info("Scanner configured", "mode", state.mode, "interval", state.interval.String(), "batchSize", state.batchSize) - mainTicker := time.NewTicker(interval) + mainTicker := time.NewTicker(state.interval) defer mainTicker.Stop() - s.doWork(state) + s.doScan(state) for { select { case <-mainTicker.C: - s.doWork(state) + s.doScan(state) case <-s.ctx.Done(): return @@ -140,6 +173,20 @@ func (s *Scanner) launchScanLoop(state *scanLoopState, interval time.Duration) { } }() } + +func (s *Scanner) doScan(state *scanLoopRuntime) { + if !s.acquireScanPermit() { + return + } + err := s.worker.scanBatch(s.ctx, state, func(kvi uint64, m *scanned) { + s.applyUpdate(kvi, m) + }) + s.releaseScanPermit() + if err != 
nil { + s.lg.Error("Scan batch failed", "mode", state.mode, "error", err) + } +} + func (s *Scanner) launchFixLoop(interval time.Duration) { s.wg.Add(1) go func() { @@ -180,19 +227,6 @@ func (s *Scanner) launchFixLoop(interval time.Duration) { }() } -func (s *Scanner) doWork(state *scanLoopState) { - if !s.acquireScanPermit() { - return - } - err := s.worker.ScanBatch(s.ctx, state, func(kvi uint64, m *scanned) { - s.applyUpdate(kvi, m) - }) - s.releaseScanPermit() - if err != nil { - s.lg.Error("Scan batch failed", "mode", state.mode, "error", err) - } -} - func (s *Scanner) applyUpdate(kvi uint64, m *scanned) { s.statsMu.Lock() defer s.statsMu.Unlock() diff --git a/ethstorage/scanner/utils.go b/ethstorage/scanner/utils.go index cfe264d2..fdc34dfe 100644 --- a/ethstorage/scanner/utils.go +++ b/ethstorage/scanner/utils.go @@ -7,6 +7,7 @@ import ( "fmt" "slices" "strings" + "time" "github.com/ethereum/go-ethereum/common" ) @@ -106,12 +107,16 @@ func (m scannedKVs) needFix() []uint64 { return res } -type scanLoopState struct { +type scanLoopRuntime struct { mode scanMode + nextBatch nextBatchFn + interval time.Duration + batchSize uint64 nextIndex uint64 } -type scanUpdateFn func(kvi uint64, m *scanned) +type scanUpdateFn func(uint64, *scanned) +type nextBatchFn func(uint64, uint64) ([]uint64, uint64) type scanMarker struct { kvIndex uint64 diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index 2a2367d9..c32bbdf6 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -7,6 +7,7 @@ import ( "context" "errors" "fmt" + "math/big" "strings" "time" @@ -27,18 +28,23 @@ type IStorageManager interface { Shards() []uint64 } +type IL1 interface { + GetKvMetas(kvIndices []uint64, blockNumber int64) ([][32]byte, error) + GetUpdatedKvIndices(startBlock, endBlock *big.Int) ([]uint64, error) + BlockNumber(context.Context) (uint64, error) +} + type Worker struct { sm IStorageManager fetchBlob es.FetchBlobFunc - l1 es.Il1Source - batchSize uint64 + l1 IL1 lg log.Logger } func NewWorker( sm IStorageManager, fetch es.FetchBlobFunc, - l1 es.Il1Source, + l1 IL1, batchSize uint64, lg log.Logger, ) *Worker { @@ -46,12 +52,11 @@ func NewWorker( sm: sm, fetchBlob: fetch, l1: l1, - batchSize: batchSize, lg: lg, } } -func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, onUpdate scanUpdateFn) error { +func (s *Worker) scanBatch(ctx context.Context, runtime *scanLoopRuntime, onUpdate scanUpdateFn) error { // Noop if onUpdate == nil { onUpdate = func(kvi uint64, m *scanned) {} @@ -62,22 +67,22 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, onUpdate s defer func(stt time.Time) { if len(kvsInBatch) > 0 { s.lg.Info("Scan batch done", - "mode", state.mode, + "mode", runtime.mode, "scanned", shortPrt(kvsInBatch), "count", len(kvsInBatch), - "nextIndexOfKvIdx", state.nextIndex, + "nextIndexOfKvIdx", runtime.nextIndex, "duration", time.Since(stt).String(), ) } }(start) // Determine the batch of KV indices to scan - kvsInBatch, batchEndExclusive := s.getKvsInBatch(state.nextIndex) + kvsInBatch, batchEndExclusive := runtime.nextBatch(runtime.batchSize, runtime.nextIndex) if len(kvsInBatch) == 0 { s.lg.Info("No KV entries to scan in this batch") return nil } - s.lg.Info("Scan batch started", "mode", state.mode, "startIndexOfKvIdx", state.nextIndex) + s.lg.Info("Scan batch started", "mode", runtime.mode, "startIndexOfKvIdx", runtime.nextIndex) // Query the metas from the L1 contract metas, err := s.l1.GetKvMetas(kvsInBatch, 
rpc.FinalizedBlockNumber.Int64()) @@ -97,21 +102,42 @@ func (s *Worker) ScanBatch(ctx context.Context, state *scanLoopState, onUpdate s var commit common.Hash copy(commit[:], meta[32-es.HashSizeInContract:32]) - s.scanKv(state.mode, kvsInBatch[i], commit, onUpdate) + s.scanKv(runtime.mode, kvsInBatch[i], commit, onUpdate) } - state.nextIndex = batchEndExclusive + runtime.nextIndex = batchEndExclusive return nil } -func (s *Worker) getKvsInBatch(startIndexOfKvIdx uint64) ([]uint64, uint64) { +func (s *Worker) getKvsInBatch(batchSize uint64, startIndexOfKvIdx uint64) ([]uint64, uint64) { localKvCount, _ := s.summaryLocalKvs() if localKvCount == 0 { return []uint64{}, 0 } shards := s.sm.Shards() kvEntries := s.sm.KvEntries() - return getKvsInBatch(shards, kvEntries, localKvCount, s.batchSize, startIndexOfKvIdx, s.lg) + return getKvsInBatch(shards, kvEntries, localKvCount, batchSize, startIndexOfKvIdx, s.lg) +} + +func (s *Worker) latestUpdated(blocksToScan uint64, lastScannedBlock uint64) ([]uint64, uint64) { + var endBlock uint64 + startBlock := lastScannedBlock + 1 + latestBlock, err := s.l1.BlockNumber(context.Background()) + if err != nil { + s.lg.Error("Failed to get latest block number", "error", err) + return []uint64{}, 0 + } + if startBlock == 1 { + s.lg.Info(fmt.Sprintf("No last scanned block recorded, starting from %d blocks ago", blocksToScan)) + startBlock = latestBlock - blocksToScan + } + endBlock = latestBlock + kvsIndices, err := s.l1.GetUpdatedKvIndices(big.NewInt(int64(startBlock)), big.NewInt(int64(endBlock))) + if err != nil { + s.lg.Error("Failed to get updated KV indices", "error", err) + return []uint64{}, 0 + } + return kvsIndices, endBlock } func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, onUpdate scanUpdateFn) { From ddb2c7aeda9c303c6b2fa86a7291d4ee6345cf35 Mon Sep 17 00:00:00 2001 From: syntrust Date: Thu, 25 Dec 2025 11:31:35 +0800 Subject: [PATCH 33/51] minor --- cmd/es-utils/utils/utils.go | 1 + ethstorage/scanner/worker.go | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cmd/es-utils/utils/utils.go b/cmd/es-utils/utils/utils.go index d4e9562d..13141840 100644 --- a/cmd/es-utils/utils/utils.go +++ b/cmd/es-utils/utils/utils.go @@ -186,6 +186,7 @@ func SendBlobTx( for i := 0; i <= maxRetries; i++ { errRetry = client.SendTransaction(context.Background(), tx) if errRetry == nil { + lg.Info("SendTransaction succeeded", "txHash", tx.Hash()) break } lg.Warn("SendTransaction failed", "retriesLeft", maxRetries-i, "error", errRetry) diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index c32bbdf6..9a7b968a 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -73,13 +73,14 @@ func (s *Worker) scanBatch(ctx context.Context, runtime *scanLoopRuntime, onUpda "nextIndexOfKvIdx", runtime.nextIndex, "duration", time.Since(stt).String(), ) + } else { + s.lg.Info("Scan batch done", "mode", runtime.mode, "scanned", "(none)") } }(start) // Determine the batch of KV indices to scan kvsInBatch, batchEndExclusive := runtime.nextBatch(runtime.batchSize, runtime.nextIndex) if len(kvsInBatch) == 0 { - s.lg.Info("No KV entries to scan in this batch") return nil } s.lg.Info("Scan batch started", "mode", runtime.mode, "startIndexOfKvIdx", runtime.nextIndex) From a11236aa52c78e7edb226c87081e5ebc06b0138b Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 30 Dec 2025 18:08:16 +0800 Subject: [PATCH 34/51] fix nil --- ethstorage/scanner/scanner.go | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index e2d50e0c..80938d22 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -302,6 +302,9 @@ func (s *Scanner) logStats() { } func (s *Scanner) GetScanState() *ScanStats { + if s == nil { + return &ScanStats{} + } s.statsMu.Lock() defer s.statsMu.Unlock() From c65e607950a5b15044aadedaba327cb31edb19c0 Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 30 Dec 2025 18:11:24 +0800 Subject: [PATCH 35/51] check blob exist before put cache --- ethstorage/downloader/blob_disk_cache.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ethstorage/downloader/blob_disk_cache.go b/ethstorage/downloader/blob_disk_cache.go index 37a10427..93017a9e 100644 --- a/ethstorage/downloader/blob_disk_cache.go +++ b/ethstorage/downloader/blob_disk_cache.go @@ -68,6 +68,9 @@ func (c *BlobDiskCache) SetBlockBlobs(block *blockBlobs) error { } var blbs []*blob for _, b := range block.blobs { + if b.data == nil { + continue + } kvi := b.kvIndex.Uint64() id, err := c.store.Put(b.data) if err != nil { From 0eade27e7fe661a3c61873b1a5d27012c0e3ec22 Mon Sep 17 00:00:00 2001 From: syntrust Date: Wed, 31 Dec 2025 10:06:04 +0800 Subject: [PATCH 36/51] filter updates --- ethstorage/scanner/scanner.go | 2 +- ethstorage/scanner/worker.go | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index 80938d22..2bd506e1 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -113,7 +113,7 @@ func (s *Scanner) start() { // Launch the scan loop to fix mismatched KVs every 12 minutes s.launchFixLoop(time.Minute * 12) - // Launch the scan loop for the updated KVs within the last scan interval using blob mode + // Launch the scan loop for the updated KVs within the last scan interval using blob mode TODO: make interval configurable? 
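// The worker.go hunk later in this patch filters the updated KV indices down to the ones
// stored locally: a KV index belongs to shard kvIndex/kvEntries, so the check is a simple
// shard-membership lookup. A minimal standalone sketch of that check, with the function
// and parameter names assumed here for illustration:
//
//	func storedLocally(kvIndex, kvEntries uint64, localShards []uint64) bool {
//		shardIdx := kvIndex / kvEntries // each shard covers kvEntries consecutive KV entries
//		for _, shard := range localShards {
//			if shard == shardIdx {
//				return true
//			}
//		}
//		return false
//	}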
s.launchScanLoop(&scanLoopRuntime{mode: modeCheckBlob, nextBatch: s.worker.latestUpdated, interval: s.cfg.IntervalBlob, batchSize: 7200}) } diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index 9a7b968a..d648ac89 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -138,7 +138,21 @@ func (s *Worker) latestUpdated(blocksToScan uint64, lastScannedBlock uint64) ([] s.lg.Error("Failed to get updated KV indices", "error", err) return []uint64{}, 0 } - return kvsIndices, endBlock + // filter out kv indices that are not stored in local storage + shardSet := make(map[uint64]struct{}) + for _, shard := range s.sm.Shards() { + shardSet[shard] = struct{}{} + } + kvEntries := s.sm.KvEntries() + var filteredKvs []uint64 + for _, kvi := range kvsIndices { + shardIdx := kvi / kvEntries + if _, ok := shardSet[shardIdx]; ok { + filteredKvs = append(filteredKvs, kvi) + } + } + s.lg.Info("Latest updated KV indices fetched", "startBlock", startBlock, "endBlock", endBlock, "totalUpdatedKvs", len(kvsIndices), "filteredKvs", len(filteredKvs)) + return filteredKvs, endBlock } func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, onUpdate scanUpdateFn) { From b43b0086ca893d2bd164d4c7536488951309bd45 Mon Sep 17 00:00:00 2001 From: syntrust Date: Wed, 31 Dec 2025 17:55:04 +0800 Subject: [PATCH 37/51] Fix comments --- ethstorage/downloader/downloader.go | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/ethstorage/downloader/downloader.go b/ethstorage/downloader/downloader.go index 39e37808..199caadb 100644 --- a/ethstorage/downloader/downloader.go +++ b/ethstorage/downloader/downloader.go @@ -403,7 +403,7 @@ func (s *Downloader) downloadRange(start int64, end int64, toCache bool) ([]blob clBlob, exists := clBlobs[elBlob.hash] if !exists { s.notifyBlobMissing(elBlock.number, elBlob.kvIndex.Uint64(), elBlob.hash) - s.lg.Crit("Did not find the event specified blob in the CL", "blockNumber", elBlock.number, "kvIndex", elBlob.kvIndex) + continue } // encode blobs so that miner can do sampling directly from cache elBlob.data = s.sm.EncodeBlob(clBlob.Data, elBlob.hash, elBlob.kvIndex.Uint64(), s.sm.MaxKvSize()) @@ -520,20 +520,23 @@ func (s *Downloader) eventsToBlocks(events []types.Log) ([]*blockBlobs, error) { } func (s *Downloader) notifyBlobMissing(blockNumber uint64, kvIndex uint64, hash common.Hash) { - if s.emailConfig == nil { - return - } - + title := "🛑 Fatal Error from es-node: Downloader Failed to Locate Blob in CL" msg := "The downloader couldn't locate the specified blob in the consensus layer. The node is stopped pending resolution. " msg += "Details from the EL event: \n" msg += fmt.Sprintf(" - blockNumber: %d\n", blockNumber) msg += fmt.Sprintf(" - kvIndex: %d\n", kvIndex) msg += fmt.Sprintf(" - hash: %s\n", hash.Hex()) msg += "This may indicate a potential issue with blob availability on the consensus layer. 
\n" - email.SendEmail( - "🛑 Fatal Error from es-node: Downloader Failed to Locate Blob in CL", - msg, - *s.emailConfig, - s.lg, - ) + + if s.emailConfig != nil { + email.SendEmail( + title, + msg, + *s.emailConfig, + s.lg, + ) + } else { + s.lg.Error(title) + s.lg.Crit(msg) + } } From 13979c37db8cdf37414cc7aefbfdc9a44ba68b32 Mon Sep 17 00:00:00 2001 From: syntrust Date: Sun, 4 Jan 2026 11:17:44 +0800 Subject: [PATCH 38/51] complete err msg --- ethstorage/blobs/blob.go | 4 +++- ethstorage/eth/beacon_client.go | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ethstorage/blobs/blob.go b/ethstorage/blobs/blob.go index 0dad6d78..c6e1e070 100644 --- a/ethstorage/blobs/blob.go +++ b/ethstorage/blobs/blob.go @@ -12,7 +12,9 @@ import ( ) type BeaconBlobs struct { - Data []string `json:"data"` + Data []string `json:"data"` + Message string `json:"message"` + Code int `json:"code"` } func BlobToVersionedHash(blobBytes []byte) (common.Hash, error) { diff --git a/ethstorage/eth/beacon_client.go b/ethstorage/eth/beacon_client.go index 46f74445..6bb09dc0 100644 --- a/ethstorage/eth/beacon_client.go +++ b/ethstorage/eth/beacon_client.go @@ -85,7 +85,9 @@ func (c *BeaconClient) DownloadBlobs(slot uint64) (map[common.Hash]Blob, error) if err := json.NewDecoder(resp.Body).Decode(&blobsResp); err != nil { return nil, fmt.Errorf("failed to decode beacon blobs response from url %s: %w", beaconUrl, err) } - + if len(blobsResp.Data) == 0 { + return nil, fmt.Errorf("no blobs found for slot %d: %d %s", slot, blobsResp.Code, blobsResp.Message) + } res := map[common.Hash]Blob{} for _, beaconBlob := range blobsResp.Data { // decode hex string to bytes From cd334db4503996d8f468b36f04691ad1a3f1c8aa Mon Sep 17 00:00:00 2001 From: syntrust Date: Sun, 4 Jan 2026 14:28:27 +0800 Subject: [PATCH 39/51] better err msg --- ethstorage/eth/beacon_client.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ethstorage/eth/beacon_client.go b/ethstorage/eth/beacon_client.go index 6bb09dc0..7eef8fa2 100644 --- a/ethstorage/eth/beacon_client.go +++ b/ethstorage/eth/beacon_client.go @@ -86,7 +86,11 @@ func (c *BeaconClient) DownloadBlobs(slot uint64) (map[common.Hash]Blob, error) return nil, fmt.Errorf("failed to decode beacon blobs response from url %s: %w", beaconUrl, err) } if len(blobsResp.Data) == 0 { - return nil, fmt.Errorf("no blobs found for slot %d: %d %s", slot, blobsResp.Code, blobsResp.Message) + err := fmt.Sprintf("no blobs found for slot %d", slot) + if blobsResp.Code != 0 || blobsResp.Message != "" { + err = fmt.Sprintf("%s: %d %s", err, blobsResp.Code, blobsResp.Message) + } + return nil, fmt.Errorf("%s", err) } res := map[common.Hash]Blob{} for _, beaconBlob := range blobsResp.Data { From 3dc133ef469ce626ad0516d4acb561595055ccbb Mon Sep 17 00:00:00 2001 From: syntrust Date: Sun, 4 Jan 2026 15:44:33 +0800 Subject: [PATCH 40/51] handle send email error --- ethstorage/downloader/downloader.go | 14 +++++--------- ethstorage/email/email.go | 3 ++- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/ethstorage/downloader/downloader.go b/ethstorage/downloader/downloader.go index 199caadb..b8b084d5 100644 --- a/ethstorage/downloader/downloader.go +++ b/ethstorage/downloader/downloader.go @@ -529,14 +529,10 @@ func (s *Downloader) notifyBlobMissing(blockNumber uint64, kvIndex uint64, hash msg += "This may indicate a potential issue with blob availability on the consensus layer. 
\n" if s.emailConfig != nil { - email.SendEmail( - title, - msg, - *s.emailConfig, - s.lg, - ) - } else { - s.lg.Error(title) - s.lg.Crit(msg) + if err := email.SendEmail(title, msg, *s.emailConfig, s.lg); err == nil { + return + } } + s.lg.Error(title) + s.lg.Crit(msg) } diff --git a/ethstorage/email/email.go b/ethstorage/email/email.go index a647dd37..fbabe5dc 100644 --- a/ethstorage/email/email.go +++ b/ethstorage/email/email.go @@ -54,7 +54,7 @@ func (c EmailConfig) String() string { ) } -func SendEmail(emailSubject, msg string, config EmailConfig, lg log.Logger) { +func SendEmail(emailSubject, msg string, config EmailConfig, lg log.Logger) error { lg.Info("Sending email notification", "subject", emailSubject) emailBody := fmt.Sprintf("Subject: %s\r\n", emailSubject) @@ -80,4 +80,5 @@ func SendEmail(emailSubject, msg string, config EmailConfig, lg log.Logger) { } else { lg.Info("Email notification sent successfully!") } + return err } From d40c01c220d85b55d04a9417b998a07c6e1bbab6 Mon Sep 17 00:00:00 2001 From: syntrust Date: Mon, 5 Jan 2026 18:49:23 +0800 Subject: [PATCH 41/51] test and fix --- ethstorage/scanner/config.go | 56 ++++++++++++---------- ethstorage/scanner/scanner.go | 89 +++++++++++++++++------------------ ethstorage/scanner/utils.go | 20 ++++---- ethstorage/scanner/worker.go | 51 ++++++++++---------- 4 files changed, 110 insertions(+), 106 deletions(-) diff --git a/ethstorage/scanner/config.go b/ethstorage/scanner/config.go index f39bdc21..4e5eff62 100644 --- a/ethstorage/scanner/config.go +++ b/ethstorage/scanner/config.go @@ -15,20 +15,22 @@ const ( modeDisabled = iota modeCheckMeta modeCheckBlob + modeCheckBlock + modeHybrid ) const ( - ModeFlagName = "scanner.mode" - BatchSizeFlagName = "scanner.batch-size" - IntervalMetaFlagName = "scanner.interval.meta" - IntervalBlobFlagName = "scanner.interval.blob" + ModeFlagName = "scanner.mode" + BatchSizeFlagName = "scanner.batch-size" + IntervalMetaFlagName = "scanner.interval.meta" + IntervalBlobFlagName = "scanner.interval.blob" + IntervalBlockFlagName = "scanner.interval.block" ) // intervals in minutes const defaultIntervalMeta = 3 -const defaultIntervalBlob = 24 * 60 -const minIntervalMeta = 1 -const minIntervalBlob = 5 +const defaultIntervalBlob = 60 +const defaultIntervalBlock = 24 * 60 func scannerEnv(name string) string { return utils.PrefixEnvVar("SCANNER_" + name) @@ -44,7 +46,9 @@ func (m scanMode) String() string { return "check-meta" case modeCheckBlob: return "check-blob" - case modeCheckBlob + modeCheckMeta: + case modeCheckBlock: + return "check-block" + case modeHybrid: return "hybrid" default: panic(fmt.Sprintf("invalid scanner mode: %d", m)) @@ -52,17 +56,18 @@ func (m scanMode) String() string { } type Config struct { - Mode scanMode - BatchSize int - IntervalMeta time.Duration - IntervalBlob time.Duration + Mode scanMode + BatchSize int + IntervalMeta time.Duration + IntervalBlob time.Duration + IntervalBlock time.Duration } func CLIFlags() []cli.Flag { flags := []cli.Flag{ cli.IntFlag{ Name: ModeFlagName, - Usage: "Data scan mode, 0: disabled, 1: check meta, 2: check blob, 3: hybrid", + Usage: "Data scan mode, 0: disabled, 1: check meta, 2: check blob, 3: check block, 4: hybrid", EnvVar: scannerEnv("MODE"), Value: 1, }, @@ -74,16 +79,22 @@ func CLIFlags() []cli.Flag { }, cli.IntFlag{ Name: IntervalMetaFlagName, - Usage: fmt.Sprintf("Data scan interval of check-meta mode in minutes, minimum %d (default %d)", minIntervalMeta, defaultIntervalMeta), + Usage: fmt.Sprintf("Data scan interval of check-meta 
mode in minutes (default %d)", defaultIntervalMeta), EnvVar: scannerEnv("INTERVAL_META"), Value: defaultIntervalMeta, }, cli.IntFlag{ Name: IntervalBlobFlagName, - Usage: fmt.Sprintf("Data scan interval of check-blob mode in minutes, minimum %d (default %d)", minIntervalBlob, defaultIntervalBlob), + Usage: fmt.Sprintf("Data scan interval of check-blob mode in minutes (default %d)", defaultIntervalBlob), EnvVar: scannerEnv("INTERVAL_BLOB"), Value: defaultIntervalBlob, }, + cli.IntFlag{ + Name: IntervalBlockFlagName, + Usage: fmt.Sprintf("Data scan interval of check-block mode in minutes (default %d)", defaultIntervalBlock), + EnvVar: scannerEnv("INTERVAL_BLOCK"), + Value: defaultIntervalBlock, + }, } return flags } @@ -93,16 +104,11 @@ func NewConfig(ctx *cli.Context) *Config { if mode == modeDisabled { return nil } - if interval := ctx.GlobalInt(IntervalMetaFlagName); interval < minIntervalMeta { - panic(fmt.Sprintf("scanner interval of check-meta mode must be at least %d minutes", minIntervalMeta)) - } - if interval := ctx.GlobalInt(IntervalBlobFlagName); interval < minIntervalBlob { - panic(fmt.Sprintf("scanner interval of check-blob mode must be at least %d minutes", minIntervalBlob)) - } return &Config{ - Mode: scanMode(mode), - BatchSize: ctx.GlobalInt(BatchSizeFlagName), - IntervalMeta: time.Minute * time.Duration(ctx.GlobalInt(IntervalMetaFlagName)), - IntervalBlob: time.Minute * time.Duration(ctx.GlobalInt(IntervalBlobFlagName)), + Mode: scanMode(mode), + BatchSize: ctx.GlobalInt(BatchSizeFlagName), + IntervalMeta: time.Minute * time.Duration(ctx.GlobalInt(IntervalMetaFlagName)), + IntervalBlob: time.Minute * time.Duration(ctx.GlobalInt(IntervalBlobFlagName)), + IntervalBlock: time.Minute * time.Duration(ctx.GlobalInt(IntervalBlockFlagName)), } } diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index 2bd506e1..019169c3 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -94,39 +94,41 @@ func (s *Scanner) start() { s.running = true s.mu.Unlock() - s.startReporter() - - if s.cfg.Mode == modeDisabled { + switch s.cfg.Mode { + case modeDisabled: s.lg.Info("Scanner is disabled") return - } - - if s.cfg.Mode == modeCheckBlob+modeCheckMeta { - s.lg.Info("Scanner running in hybrid mode", "mode", s.cfg.Mode, "metaInterval", s.cfg.IntervalMeta, "blobInterval", s.cfg.IntervalBlob) + case modeCheckMeta: + s.launchScanLoop(s.metaScanLoopRuntime()) + s.lg.Info("Scanner started in meta check mode") + case modeCheckBlob: s.launchScanLoop(s.blobScanLoopRuntime()) + s.lg.Info("Scanner started in blob check mode") + case modeCheckBlock: + // Launch the scan loop for the updated KVs in the latest blocks every 24 hours + s.launchScanLoop(s.latestScanLoopRuntime()) + s.lg.Info("Scanner started in block check mode") + case modeHybrid: + // hybrid mode s.launchScanLoop(s.metaScanLoopRuntime()) - } else { - s.lg.Info("Scanner running in single mode", "mode", s.cfg.Mode, "interval", s.cfg.IntervalMeta) - s.launchScanLoop(s.defaultScanLoopRuntime()) + s.launchScanLoop(s.latestScanLoopRuntime()) + s.lg.Info("Scanner started in hybrid mode") + default: + s.lg.Error("Invalid scanner mode", "mode", s.cfg.Mode) + return } - // Launch the scan loop to fix mismatched KVs every 12 minutes + s.startReporter() + // Launch the scan loop to fix mismatched KVs every 12 minutes FIXME: adjust interval? s.launchFixLoop(time.Minute * 12) - - // Launch the scan loop for the updated KVs within the last scan interval using blob mode TODO: make interval configurable? 
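// The scan loops and the fix loop launched above all funnel through scanPermit, a buffered
// channel of capacity 1 that is pre-filled with a single token, so only one scan runs at a
// time. A minimal sketch of that pattern, assuming a non-blocking acquire (the actual
// acquireScanPermit/releaseScanPermit bodies are defined outside this hunk):
//
//	permit := make(chan struct{}, 1)
//	permit <- struct{}{} // one token available up front
//
//	acquire := func() bool {
//		select {
//		case <-permit: // take the token if it is free
//			return true
//		default: // another scan holds it
//			return false
//		}
//	}
//	release := func() { permit <- struct{}{} } // hand the token back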
- s.launchScanLoop(&scanLoopRuntime{mode: modeCheckBlob, nextBatch: s.worker.latestUpdated, interval: s.cfg.IntervalBlob, batchSize: 7200}) } -func (s *Scanner) defaultScanLoopRuntime() *scanLoopRuntime { - interval := s.cfg.IntervalMeta - if s.cfg.Mode == modeCheckBlob { - interval = s.cfg.IntervalBlob - } +func (s *Scanner) latestScanLoopRuntime() *scanLoopRuntime { return &scanLoopRuntime{ - mode: s.cfg.Mode, - nextBatch: s.worker.getKvsInBatch, - interval: interval, - batchSize: uint64(s.cfg.BatchSize), + mode: modeCheckBlock, + nextBatch: s.worker.latestUpdated, + interval: s.cfg.IntervalBlock, + batchSize: 7200, // start back from 7200 blocks (1 day for Ethereum L1) ago nextIndex: 0, } } @@ -156,7 +158,7 @@ func (s *Scanner) launchScanLoop(state *scanLoopRuntime) { go func() { defer s.wg.Done() - s.lg.Info("Scanner configured", "mode", state.mode, "interval", state.interval.String(), "batchSize", state.batchSize) + s.lg.Info("Launching scanner loop", "mode", state.mode, "interval", state.interval.String(), "batchSize", state.batchSize) mainTicker := time.NewTicker(state.interval) defer mainTicker.Stop() @@ -179,7 +181,7 @@ func (s *Scanner) doScan(state *scanLoopRuntime) { return } err := s.worker.scanBatch(s.ctx, state, func(kvi uint64, m *scanned) { - s.applyUpdate(kvi, m) + s.updateStats(kvi, m) }) s.releaseScanPermit() if err != nil { @@ -192,7 +194,7 @@ func (s *Scanner) launchFixLoop(interval time.Duration) { go func() { defer s.wg.Done() - s.lg.Info("Scanner fix loop started", "interval", interval.String()) + s.lg.Info("Launching scan fix loop", "interval", interval.String()) fixTicker := time.NewTicker(interval) defer fixTicker.Stop() @@ -200,20 +202,19 @@ func (s *Scanner) launchFixLoop(interval time.Duration) { for { select { case <-fixTicker.C: - s.lg.Info("Scanner fix loop triggered") - - if !s.acquireScanPermit() { - return - } + s.lg.Info("Scanner fix batch triggered") // hold for 3 minutes before fixing to allow possible ongoing kv downloading to finish time.Sleep(time.Minute * 3) - + if !s.acquireScanPermit() { + s.lg.Warn("Skipping fix scan batch since another scan is ongoing") + continue + } s.statsMu.Lock() kvIndices := s.sharedStats.needFix() s.statsMu.Unlock() err := s.worker.fixBatch(s.ctx, kvIndices, func(kvi uint64, m *scanned) { - s.applyUpdate(kvi, m) + s.updateStats(kvi, m) }) s.releaseScanPermit() if err != nil { @@ -227,7 +228,7 @@ func (s *Scanner) launchFixLoop(interval time.Duration) { }() } -func (s *Scanner) applyUpdate(kvi uint64, m *scanned) { +func (s *Scanner) updateStats(kvi uint64, m *scanned) { s.statsMu.Lock() defer s.statsMu.Unlock() @@ -259,7 +260,8 @@ func (s *Scanner) startReporter() { go func() { defer s.wg.Done() - s.logStats() + localKvCount, sum := s.worker.summaryLocalKvs() + s.lg.Info("Local storage summary", "localKvs", sum, "localKvCount", localKvCount) ticker := time.NewTicker(time.Minute) defer ticker.Stop() for { @@ -274,8 +276,11 @@ func (s *Scanner) startReporter() { } func (s *Scanner) logStats() { + localKvCount, sum := s.worker.summaryLocalKvs() + s.lg.Info("Local storage summary", "localKvs", sum, "localKvCount", localKvCount) + s.statsMu.Lock() - var mismatched string + mismatched := "(none)" if len(s.sharedStats) > 0 { mismatched = s.sharedStats.String() } @@ -285,17 +290,7 @@ func (s *Scanner) logStats() { } s.statsMu.Unlock() - localKvCount, sum := s.worker.summaryLocalKvs() - logFields := []any{ - "mode", s.cfg.Mode, - "localKvs", sum, - "localKvCount", localKvCount, - } - if mismatched != "" { - logFields = 
append(logFields, "mismatched", mismatched) - } - s.lg.Info("Scanner stats", logFields...) - + s.lg.Info("Scanner stats", "mode", s.cfg.Mode, "mismatched", mismatched) for i, e := range errSnapshot { s.lg.Info("Scanner error happened earlier", "kvIndex", i, "error", e) } diff --git a/ethstorage/scanner/utils.go b/ethstorage/scanner/utils.go index fdc34dfe..00fa812b 100644 --- a/ethstorage/scanner/utils.go +++ b/ethstorage/scanner/utils.go @@ -20,12 +20,12 @@ type scanned struct { type status int const ( - ok status = iota - err_read // read meta or blob error / not found - mismatched // mismatch detected - fixed // by scanner - recovered // by downloader - failed // failed to fix + ok status = iota + err_read // read meta or blob error / not found + pending // mismatch detected + fixed // by scanner + recovered // by downloader + failed // failed to fix ) func (s status) String() string { @@ -34,8 +34,8 @@ func (s status) String() string { return "ok" case err_read: return "err_read" - case mismatched: - return "mismatched" + case pending: + return "pending" case recovered: return "recovered" case fixed: @@ -99,7 +99,7 @@ func (m scannedKVs) failed() []uint64 { func (m scannedKVs) needFix() []uint64 { var res []uint64 for kvIndex, scanned := range m { - if scanned.status == mismatched || scanned.status == failed || scanned.err != nil { + if scanned.status == pending || scanned.status == failed || scanned.err != nil { res = append(res, kvIndex) } } @@ -136,7 +136,7 @@ func (m *scanMarker) markFailed(commit common.Hash, err error) { } func (m *scanMarker) markMismatched() { - m.mark(m.kvIndex, &scanned{status: mismatched, err: nil}) + m.mark(m.kvIndex, &scanned{status: pending, err: nil}) } func (m *scanMarker) markFixed() { diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index d648ac89..db334a77 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -12,6 +12,7 @@ import ( "time" "github.com/ethereum/go-ethereum/common" + "github.com/ethereum/go-ethereum/core/types" "github.com/ethereum/go-ethereum/log" "github.com/ethereum/go-ethereum/rpc" @@ -31,7 +32,7 @@ type IStorageManager interface { type IL1 interface { GetKvMetas(kvIndices []uint64, blockNumber int64) ([][32]byte, error) GetUpdatedKvIndices(startBlock, endBlock *big.Int) ([]uint64, error) - BlockNumber(context.Context) (uint64, error) + HeaderByNumber(context.Context, *big.Int) (*types.Header, error) } type Worker struct { @@ -57,11 +58,6 @@ func NewWorker( } func (s *Worker) scanBatch(ctx context.Context, runtime *scanLoopRuntime, onUpdate scanUpdateFn) error { - // Noop - if onUpdate == nil { - onUpdate = func(kvi uint64, m *scanned) {} - } - start := time.Now() var kvsInBatch []uint64 defer func(stt time.Time) { @@ -73,17 +69,16 @@ func (s *Worker) scanBatch(ctx context.Context, runtime *scanLoopRuntime, onUpda "nextIndexOfKvIdx", runtime.nextIndex, "duration", time.Since(stt).String(), ) - } else { - s.lg.Info("Scan batch done", "mode", runtime.mode, "scanned", "(none)") } }(start) // Determine the batch of KV indices to scan kvsInBatch, batchEndExclusive := runtime.nextBatch(runtime.batchSize, runtime.nextIndex) if len(kvsInBatch) == 0 { + s.lg.Info("No KV entries to scan in this batch", "mode", runtime.mode) return nil } - s.lg.Info("Scan batch started", "mode", runtime.mode, "startIndexOfKvIdx", runtime.nextIndex) + s.lg.Info("Scan batch started", "mode", runtime.mode, "startIndexOfKvIdx", runtime.nextIndex, "kvsInBatch", shortPrt(kvsInBatch)) // Query the metas from the L1 
contract metas, err := s.l1.GetKvMetas(kvsInBatch, rpc.FinalizedBlockNumber.Int64()) @@ -101,9 +96,14 @@ func (s *Worker) scanBatch(ctx context.Context, runtime *scanLoopRuntime, onUpda default: } + mode := runtime.mode + if mode == modeCheckBlock { + // since we done parsing blob info from block + mode = modeCheckBlob + } var commit common.Hash copy(commit[:], meta[32-es.HashSizeInContract:32]) - s.scanKv(runtime.mode, kvsInBatch[i], commit, onUpdate) + s.scanKv(mode, kvsInBatch[i], commit, onUpdate) } runtime.nextIndex = batchEndExclusive @@ -121,22 +121,25 @@ func (s *Worker) getKvsInBatch(batchSize uint64, startIndexOfKvIdx uint64) ([]ui } func (s *Worker) latestUpdated(blocksToScan uint64, lastScannedBlock uint64) ([]uint64, uint64) { - var endBlock uint64 - startBlock := lastScannedBlock + 1 - latestBlock, err := s.l1.BlockNumber(context.Background()) + latestFinalized, err := s.l1.HeaderByNumber(context.Background(), big.NewInt(int64(rpc.FinalizedBlockNumber))) if err != nil { - s.lg.Error("Failed to get latest block number", "error", err) - return []uint64{}, 0 + s.lg.Error("Failed to get latest finalized block header", "error", err) + return []uint64{}, lastScannedBlock } - if startBlock == 1 { + startBlock := lastScannedBlock + 1 + endBlock := latestFinalized.Number.Uint64() + if lastScannedBlock == 0 { s.lg.Info(fmt.Sprintf("No last scanned block recorded, starting from %d blocks ago", blocksToScan)) - startBlock = latestBlock - blocksToScan + startBlock = endBlock - blocksToScan + } + if startBlock > endBlock { + s.lg.Info("No new finalized blocks to scan", "lastScannedBlock", lastScannedBlock, "latestFinalized", endBlock) + return []uint64{}, lastScannedBlock } - endBlock = latestBlock kvsIndices, err := s.l1.GetUpdatedKvIndices(big.NewInt(int64(startBlock)), big.NewInt(int64(endBlock))) if err != nil { - s.lg.Error("Failed to get updated KV indices", "error", err) - return []uint64{}, 0 + s.lg.Error("Failed to get updated KV indices", "startBlock", startBlock, "endBlock", endBlock, "error", err) + return []uint64{}, lastScannedBlock } // filter out kv indices that are not stored in local storage shardSet := make(map[uint64]struct{}) @@ -144,15 +147,15 @@ func (s *Worker) latestUpdated(blocksToScan uint64, lastScannedBlock uint64) ([] shardSet[shard] = struct{}{} } kvEntries := s.sm.KvEntries() - var filteredKvs []uint64 + var locallyStored []uint64 for _, kvi := range kvsIndices { shardIdx := kvi / kvEntries if _, ok := shardSet[shardIdx]; ok { - filteredKvs = append(filteredKvs, kvi) + locallyStored = append(locallyStored, kvi) } } - s.lg.Info("Latest updated KV indices fetched", "startBlock", startBlock, "endBlock", endBlock, "totalUpdatedKvs", len(kvsIndices), "filteredKvs", len(filteredKvs)) - return filteredKvs, endBlock + s.lg.Info("Latest updated KV indices fetched", "startBlock", startBlock, "endBlock", endBlock, "totalUpdatedKvs", len(kvsIndices), "locallyStored", len(locallyStored)) + return locallyStored, endBlock } func (s *Worker) scanKv(mode scanMode, kvIndex uint64, commit common.Hash, onUpdate scanUpdateFn) { From ddfa86cdefc234485b5b99de5760a54150d59016 Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 6 Jan 2026 15:50:47 +0800 Subject: [PATCH 42/51] refactor --- cmd/es-node/utils.go | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/cmd/es-node/utils.go b/cmd/es-node/utils.go index 443fcd88..e89b96a2 100644 --- a/cmd/es-node/utils.go +++ b/cmd/es-node/utils.go @@ -6,7 +6,6 @@ package main import ( "bytes" "context" - 
"crypto/sha256" "fmt" "math/big" "net/http" @@ -19,11 +18,11 @@ import ( "github.com/ethereum/go-ethereum/common" "github.com/ethereum/go-ethereum/common/hexutil" "github.com/ethereum/go-ethereum/crypto" - "github.com/ethereum/go-ethereum/crypto/kzg4844" "github.com/ethereum/go-ethereum/ethclient" "github.com/ethereum/go-ethereum/log" "github.com/ethereum/go-ethereum/rpc" es "github.com/ethstorage/go-ethstorage/ethstorage" + "github.com/ethstorage/go-ethstorage/ethstorage/blobs" "github.com/ethstorage/go-ethstorage/ethstorage/flags" "github.com/ethstorage/go-ethstorage/ethstorage/storage" "github.com/urfave/cli" @@ -261,15 +260,11 @@ func downloadBlobFromRPC(endpoint string, kvIndex uint64, hash common.Hash) ([]b return nil, err } - var blob kzg4844.Blob - copy(blob[:], result) - commitment, err := kzg4844.BlobToCommitment(&blob) + blobhash, err := blobs.BlobToVersionedHash(result) if err != nil { return nil, fmt.Errorf("blobToCommitment failed: %w", err) } - blobhash := common.Hash(kzg4844.CalcBlobHashV1(sha256.New(), &commitment)) - fmt.Printf("blobhash from blob: %x\n", blobhash) - if bytes.Compare(blobhash[:es.HashSizeInContract], hash[:es.HashSizeInContract]) != 0 { + if !bytes.Equal(blobhash[:es.HashSizeInContract], hash[:es.HashSizeInContract]) { return nil, fmt.Errorf("invalid blobhash for %d want: %x, got: %x", kvIndex, hash, blobhash) } From 02e49caf2c186e71505c504bff139709d2b5d11f Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 6 Jan 2026 15:52:44 +0800 Subject: [PATCH 43/51] config slot --- cmd/es-node/config.go | 2 +- ethstorage/scanner/config.go | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cmd/es-node/config.go b/cmd/es-node/config.go index 09ecfc55..b1d9f6e6 100644 --- a/cmd/es-node/config.go +++ b/cmd/es-node/config.go @@ -121,7 +121,7 @@ func NewConfig(ctx *cli.Context, lg log.Logger) (*node.Config, error) { Storage: *storageConfig, Mining: minerConfig, Archiver: archiverConfig, - Scanner: scanner.NewConfig(ctx), + Scanner: scanner.NewConfig(ctx, l1Endpoint.L1BeaconSlotTime), } if err := cfg.Check(); err != nil { return nil, err diff --git a/ethstorage/scanner/config.go b/ethstorage/scanner/config.go index 4e5eff62..2d6a48e5 100644 --- a/ethstorage/scanner/config.go +++ b/ethstorage/scanner/config.go @@ -58,6 +58,7 @@ func (m scanMode) String() string { type Config struct { Mode scanMode BatchSize int + L1SlotTime time.Duration IntervalMeta time.Duration IntervalBlob time.Duration IntervalBlock time.Duration @@ -99,7 +100,7 @@ func CLIFlags() []cli.Flag { return flags } -func NewConfig(ctx *cli.Context) *Config { +func NewConfig(ctx *cli.Context, slot uint64) *Config { mode := ctx.GlobalInt(ModeFlagName) if mode == modeDisabled { return nil @@ -107,6 +108,7 @@ func NewConfig(ctx *cli.Context) *Config { return &Config{ Mode: scanMode(mode), BatchSize: ctx.GlobalInt(BatchSizeFlagName), + L1SlotTime: time.Second * time.Duration(slot), IntervalMeta: time.Minute * time.Duration(ctx.GlobalInt(IntervalMetaFlagName)), IntervalBlob: time.Minute * time.Duration(ctx.GlobalInt(IntervalBlobFlagName)), IntervalBlock: time.Minute * time.Duration(ctx.GlobalInt(IntervalBlockFlagName)), From 06ab8e7d5d677e0364eafd8861acafc66c9568db Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 6 Jan 2026 15:54:45 +0800 Subject: [PATCH 44/51] log err --- ethstorage/scanner/worker.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index db334a77..2802ce97 100644 --- a/ethstorage/scanner/worker.go 
+++ b/ethstorage/scanner/worker.go @@ -236,7 +236,9 @@ func (s *Worker) scanAndFixKv(kvIndex uint64, commit common.Hash, onUpdate scanU if errors.As(err, &commitErr) { s.lg.Info("Fixing mismatched KV", "kvIndex", kvIndex) if err := s.sm.TryWriteWithMetaCheck(kvIndex, commit, s.fetchBlob); err != nil { - marker.markFailed(commit, fmt.Errorf("failed to fix KV: kvIndex=%d, commit=%x, %w", kvIndex, commit, err)) + fixErr := fmt.Errorf("failed to fix KV: kvIndex=%d, commit=%x, %w", kvIndex, commit, err) + marker.markFailed(commit, fixErr) + s.lg.Error("Failed to fix KV", "error", fixErr) return } marker.markFixed() From 4d1fb902a54fe1193b5c65e7a6d699f953057ca5 Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 6 Jan 2026 15:55:15 +0800 Subject: [PATCH 45/51] fix batch size for block check --- ethstorage/scanner/scanner.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index 019169c3..b835c4ef 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -128,7 +128,7 @@ func (s *Scanner) latestScanLoopRuntime() *scanLoopRuntime { mode: modeCheckBlock, nextBatch: s.worker.latestUpdated, interval: s.cfg.IntervalBlock, - batchSize: 7200, // start back from 7200 blocks (1 day for Ethereum L1) ago + batchSize: uint64(s.cfg.IntervalBlock / s.cfg.L1SlotTime), // number of slots in the interval nextIndex: 0, } } From e675b229290bcbfd4a74fbce5c6fe5e2526b5ed6 Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 6 Jan 2026 15:56:03 +0800 Subject: [PATCH 46/51] rm fixed/recovered --- ethstorage/scanner/scanner.go | 5 +++++ ethstorage/scanner/utils.go | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index b835c4ef..9868a358 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -233,8 +233,13 @@ func (s *Scanner) updateStats(kvi uint64, m *scanned) { defer s.statsMu.Unlock() if m != nil { + if m.status == pending && s.sharedStats[kvi].status == failed { + // keep failed status until fixed + return + } s.sharedStats[kvi] = *m } else { + // fixed or recovered delete(s.sharedStats, kvi) } } diff --git a/ethstorage/scanner/utils.go b/ethstorage/scanner/utils.go index 00fa812b..4e515af9 100644 --- a/ethstorage/scanner/utils.go +++ b/ethstorage/scanner/utils.go @@ -140,11 +140,11 @@ func (m *scanMarker) markMismatched() { } func (m *scanMarker) markFixed() { - m.mark(m.kvIndex, &scanned{status: fixed, err: nil}) + m.mark(m.kvIndex, nil) } func (m *scanMarker) markRecovered() { - m.mark(m.kvIndex, &scanned{status: recovered, err: nil}) + m.mark(m.kvIndex, nil) } func shortPrt(nums []uint64) string { From e1c9d773889c294044d47c57c88d0564a6423b0a Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 6 Jan 2026 17:03:46 +0800 Subject: [PATCH 47/51] fix scan permit --- ethstorage/scanner/scanner.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index 9868a358..11f7bb80 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -205,9 +205,9 @@ func (s *Scanner) launchFixLoop(interval time.Duration) { s.lg.Info("Scanner fix batch triggered") // hold for 3 minutes before fixing to allow possible ongoing kv downloading to finish time.Sleep(time.Minute * 3) + // hold until other possible ongoing scans finish if !s.acquireScanPermit() { - s.lg.Warn("Skipping fix scan batch since another scan is ongoing") - 
continue + return } s.statsMu.Lock() kvIndices := s.sharedStats.needFix() From f5b3ecc58fb139053c6d70a937e2f5cef50073de Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 6 Jan 2026 19:06:31 +0800 Subject: [PATCH 48/51] refactor --- ethstorage/scanner/config.go | 69 +++++++++++++++++++++++++++-------- ethstorage/scanner/scanner.go | 27 +++++--------- ethstorage/scanner/worker.go | 2 +- 3 files changed, 64 insertions(+), 34 deletions(-) diff --git a/ethstorage/scanner/config.go b/ethstorage/scanner/config.go index 2d6a48e5..49a4a8c4 100644 --- a/ethstorage/scanner/config.go +++ b/ethstorage/scanner/config.go @@ -11,14 +11,6 @@ import ( "github.com/urfave/cli" ) -const ( - modeDisabled = iota - modeCheckMeta - modeCheckBlob - modeCheckBlock - modeHybrid -) - const ( ModeFlagName = "scanner.mode" BatchSizeFlagName = "scanner.batch-size" @@ -36,6 +28,17 @@ func scannerEnv(name string) string { return utils.PrefixEnvVar("SCANNER_" + name) } +const ( + modeDisabled = iota + // Compare local meta hashes with those in L1 contract + modeCheckMeta + // Compute meta hashes from local blobs and compare with those in L1 contract + modeCheckBlob + // Scan updated KVs from recent blocks and run "check-blob" on them + modeCheckBlock +) + +// scanMode is an internal per-loop mode used by scan workers (meta/blob/block). type scanMode int func (m scanMode) String() string { @@ -48,15 +51,48 @@ func (m scanMode) String() string { return "check-blob" case modeCheckBlock: return "check-block" - case modeHybrid: - return "hybrid" default: - panic(fmt.Sprintf("invalid scanner mode: %d", m)) + return fmt.Sprintf("unknown(%d)", int(m)) } } +const ( + modeSetMeta scanModeSet = 1 << iota // 1 + modeSetBlob // 2 + modeSetBlock // 4 +) + +const scanModeSetMask = modeSetMeta | modeSetBlob | modeSetBlock // 7 + +// scanModeSet is a combination of scanMode values used for configuration purposes. 
+type scanModeSet uint8 + +func (m scanModeSet) String() string { + if m == 0 { + return "disabled" + } + + out := "" + if m&modeSetMeta != 0 { + out = "check-meta" + } + if m&modeSetBlob != 0 { + if out != "" { + out += "+" + } + out += "check-blob" + } + if m&modeSetBlock != 0 { + if out != "" { + out += "+" + } + out += "check-block" + } + return out +} + type Config struct { - Mode scanMode + Mode scanModeSet BatchSize int L1SlotTime time.Duration IntervalMeta time.Duration @@ -68,7 +104,7 @@ func CLIFlags() []cli.Flag { flags := []cli.Flag{ cli.IntFlag{ Name: ModeFlagName, - Usage: "Data scan mode, 0: disabled, 1: check meta, 2: check blob, 3: check block, 4: hybrid", + Usage: "Data scan mode (bitmask) : 0=disabled, 1=meta, 2=blob, 4=block; combinations via sum/OR: 3=meta+blob, 5=meta+block, 6=blob+block, 7=all", EnvVar: scannerEnv("MODE"), Value: 1, }, @@ -101,12 +137,13 @@ func CLIFlags() []cli.Flag { } func NewConfig(ctx *cli.Context, slot uint64) *Config { - mode := ctx.GlobalInt(ModeFlagName) - if mode == modeDisabled { + mode := scanModeSet(ctx.GlobalInt(ModeFlagName)) & scanModeSetMask + + if mode == 0 { return nil } return &Config{ - Mode: scanMode(mode), + Mode: mode, BatchSize: ctx.GlobalInt(BatchSizeFlagName), L1SlotTime: time.Second * time.Duration(slot), IntervalMeta: time.Minute * time.Duration(ctx.GlobalInt(IntervalMetaFlagName)), diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index 11f7bb80..ed22758b 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -94,30 +94,23 @@ func (s *Scanner) start() { s.running = true s.mu.Unlock() - switch s.cfg.Mode { - case modeDisabled: + if s.cfg.Mode == 0 { s.lg.Info("Scanner is disabled") return - case modeCheckMeta: + } + + if s.cfg.Mode&modeSetMeta != 0 { s.launchScanLoop(s.metaScanLoopRuntime()) - s.lg.Info("Scanner started in meta check mode") - case modeCheckBlob: + } + if s.cfg.Mode&modeSetBlob != 0 { s.launchScanLoop(s.blobScanLoopRuntime()) - s.lg.Info("Scanner started in blob check mode") - case modeCheckBlock: - // Launch the scan loop for the updated KVs in the latest blocks every 24 hours - s.launchScanLoop(s.latestScanLoopRuntime()) - s.lg.Info("Scanner started in block check mode") - case modeHybrid: - // hybrid mode - s.launchScanLoop(s.metaScanLoopRuntime()) + } + if s.cfg.Mode&modeSetBlock != 0 { s.launchScanLoop(s.latestScanLoopRuntime()) - s.lg.Info("Scanner started in hybrid mode") - default: - s.lg.Error("Invalid scanner mode", "mode", s.cfg.Mode) - return } + s.lg.Info("Scanner started", "mode", s.cfg.Mode.String()) + s.startReporter() // Launch the scan loop to fix mismatched KVs every 12 minutes FIXME: adjust interval? 
s.launchFixLoop(time.Minute * 12) diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index 2802ce97..1d298b78 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -129,7 +129,7 @@ func (s *Worker) latestUpdated(blocksToScan uint64, lastScannedBlock uint64) ([] startBlock := lastScannedBlock + 1 endBlock := latestFinalized.Number.Uint64() if lastScannedBlock == 0 { - s.lg.Info(fmt.Sprintf("No last scanned block recorded, starting from %d blocks ago", blocksToScan)) + s.lg.Info(fmt.Sprintf("No last scanned block recorded, starting from %d slots ago", blocksToScan)) startBlock = endBlock - blocksToScan } if startBlock > endBlock { From d7b54ed97fe164584a7e7c1dc964429d342b4c1a Mon Sep 17 00:00:00 2001 From: syntrust Date: Fri, 9 Jan 2026 18:15:19 +0800 Subject: [PATCH 49/51] update doc --- ethstorage/scanner/README.md | 59 +++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 8 deletions(-) diff --git a/ethstorage/scanner/README.md b/ethstorage/scanner/README.md index 970f73ee..c2496e0e 100644 --- a/ethstorage/scanner/README.md +++ b/ethstorage/scanner/README.md @@ -1,14 +1,57 @@ # EthStorage Scanner -A data verification service periodically checks if the data hashes of the blobs in local storage files align with the key-value hashes in the storage contract. If any mismatch found, the service looks for the correct blob in the p2p network, and update the data in the local storage. +A data verification service that periodically checks whether locally stored KV blobs match the on-chain KV meta hash (commit) from the storage contract. -This service offers a lightweight yet effective way to maintain network-wide data consistency. +If a mismatch is detected, the scanner can attempt to repair the local data by re-fetching the blob from the network and rewriting it with meta validation. -### Usage +## Options -The scanner service is enabled with `check meta` mode by default: -- `--scanner.mode` Data scan mode, 0: disabled, 1: check meta, 2: check blob (default: 1)[`ES_NODE_SCANNER_MODE`]. +| Flag | Default | Env var | Description | +| --- | --- | --- | --- | +| `--scanner.mode` | `1` | `ES_NODE_SCANNER_MODE` | Data scan mode (bitmask): `0`=disabled, `1`=meta, `2`=blob, `4`=block. Combine via sum/OR (e.g. `3`=`1+2`, `5`=`1+4`, `7`=`1+2+4`). | +| `--scanner.batch-size` | `8192` | `ES_NODE_SCANNER_BATCH_SIZE` | Data scan batch size. | +| `--scanner.interval.meta` | `3` (minutes) | `ES_NODE_SCANNER_INTERVAL_META` | Scan interval for `check-meta`. | +| `--scanner.interval.blob` | `60` (minutes) | `ES_NODE_SCANNER_INTERVAL_BLOB` | Scan interval for `check-blob`. | +| `--scanner.interval.block` | `1440` (minutes) | `ES_NODE_SCANNER_INTERVAL_BLOCK` | Scan interval for `check-block`. 
| -The following settings are required if the service is not disabled manually: -- `--scanner.batch-size` Data scan batch size (default: 8192) [`$ES_NODE_SCANNER_BATCH_SIZE`] -- `--scanner.interval` Data scan interval in minutes (default: 3) [`$ES_NODE_SCANNER_INTERVAL`] +## Scan modes explained + +The flag `--scanner.mode` (env: `ES_NODE_SCANNER_MODE`, default: `1`) is a bitmask: + +- `0`: disabled +- `1`: check-meta (compare local meta with on-chain meta) +- `2`: check-blob (read the local blob and validate its commit against on-chain meta) +- `4`: check-block (scan recently finalized blocks for updated KVs, then run check-blob on them) + +### Quick comparison + +| Name | `--scanner.mode` | What it does | Performance impact | Notes | +| --- | ---: | --- | --- | --- | +| check-meta | `1` | Read local meta and compare it with on-chain meta | Low | Minimal impact; may miss some mismatches | +| check-blob | `2` | Compute the commit from a local blob and validate it against on-chain meta | High | Best precision; highest I/O and CPU cost when there are many blobs | +| check-block | `4` | Scan recently finalized blocks for updated KVs, then run `check-blob` on them | High | Ensures newly updated blobs are fetched and verified within the Beacon node's retention window | + +### More choices + +You can combine modes by summing (OR-ing) their values to balance precision and coverage against performance: + +- `3` = `1 + 2` = meta + blob +- `5` = `1 + 4` = meta + block +- `6` = `2 + 4` = blob + block +- `7` = meta + blob + block + +`--scanner.batch-size` and `--scanner.interval.*` control the batch size and frequency of each scan mode, so you can further tune the performance impact based on the amount of data and the available hardware resources. + +## Status tracking + +The scanner starts only after es-node finishes syncing all shards from the P2P network. + +Once running, the scanner periodically logs summary statistics (mismatched/unfixed counts). These counts are also exposed in the node state as `scan_stats`. + +## Repair behavior + +A background repair loop periodically fixes mismatched KVs by fetching the correct blobs from the p2p network and rewriting them locally. + +- When a mismatch is first detected, the KV is marked as `pending`, meaning it is scheduled for repair. Some mismatches are transient (e.g., caused by download latency) and are recovered automatically by the downloader. +- If the KV is repaired successfully or recovered, it is removed from the mismatch list. +- If the repair fails, the KV stays in the mismatch list and is marked as `failed` for future retries.
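For readers of the README change above, here is a minimal, self-contained Go sketch of how such a bitmask value decomposes into the individual checks. It mirrors the `scanModeSet` constants introduced in PATCH 48 (meta=1, blob=2, block=4) but uses purely local names, so treat it as an illustration of the flag semantics rather than the node's actual code.

```go
package main

import "fmt"

// Local illustration of the scanner mode bitmask described in the README:
// bit 0 = check-meta, bit 1 = check-blob, bit 2 = check-block.
const (
	checkMeta  uint8 = 1 << iota // 1
	checkBlob                    // 2
	checkBlock                   // 4
)

// enabledChecks lists the checks a given --scanner.mode value turns on.
func enabledChecks(mode uint8) []string {
	var checks []string
	if mode&checkMeta != 0 {
		checks = append(checks, "check-meta")
	}
	if mode&checkBlob != 0 {
		checks = append(checks, "check-blob")
	}
	if mode&checkBlock != 0 {
		checks = append(checks, "check-block")
	}
	if len(checks) == 0 {
		return []string{"disabled"}
	}
	return checks
}

func main() {
	fmt.Println(enabledChecks(1)) // [check-meta]             (the default)
	fmt.Println(enabledChecks(5)) // [check-meta check-block]
	fmt.Println(enabledChecks(7)) // [check-meta check-blob check-block]
	fmt.Println(enabledChecks(0)) // [disabled]
}
```

Running the sketch shows, for example, that mode `5` enables check-meta and check-block but not check-blob, which is the same decomposition the scanner's start-up logic performs per loop.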
From a530f38e21daf324f3323cc1a661902da6721948 Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 13 Jan 2026 18:17:21 +0800 Subject: [PATCH 50/51] updates on fix --- ethstorage/scanner/scanner.go | 4 +--- ethstorage/scanner/worker.go | 11 +++-------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index ed22758b..7f7409c1 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -196,8 +196,6 @@ func (s *Scanner) launchFixLoop(interval time.Duration) { select { case <-fixTicker.C: s.lg.Info("Scanner fix batch triggered") - // hold for 3 minutes before fixing to allow possible ongoing kv downloading to finish - time.Sleep(time.Minute * 3) // hold until other possible ongoing scans finish if !s.acquireScanPermit() { return @@ -205,7 +203,7 @@ func (s *Scanner) launchFixLoop(interval time.Duration) { s.statsMu.Lock() kvIndices := s.sharedStats.needFix() s.statsMu.Unlock() - + s.lg.Info("Scanner fixing batch", "mismatches", kvIndices) err := s.worker.fixBatch(s.ctx, kvIndices, func(kvi uint64, m *scanned) { s.updateStats(kvi, m) }) diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index 1d298b78..d158f27b 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -212,21 +212,16 @@ func (s *Worker) fixBatch(ctx context.Context, kvIndices []uint64, onUpdate scan s.lg.Debug("Query KV meta done", "kvsInBatch", shortPrt(kvIndices)) for i, meta := range metas { - select { - case <-ctx.Done(): - s.lg.Warn("Scanner canceled, stopping fix batch", "ctx.Err", ctx.Err()) - return ctx.Err() - default: - } var commit common.Hash copy(commit[:], meta[32-es.HashSizeInContract:32]) - s.scanAndFixKv(kvIndices[i], commit, onUpdate) + s.fixKv(kvIndices[i], commit, onUpdate) } return nil } -func (s *Worker) scanAndFixKv(kvIndex uint64, commit common.Hash, onUpdate scanUpdateFn) { +func (s *Worker) fixKv(kvIndex uint64, commit common.Hash, onUpdate scanUpdateFn) { marker := newScanMarker(kvIndex, onUpdate) + // check blob again before fix _, found, err := s.sm.TryRead(kvIndex, int(s.sm.MaxKvSize()), commit) if !found && err == nil { err = fmt.Errorf("blob not found locally: %x", commit) From 9099214e264413068d9c8864ffede48d4f3df2c9 Mon Sep 17 00:00:00 2001 From: syntrust Date: Tue, 13 Jan 2026 18:38:40 +0800 Subject: [PATCH 51/51] minor --- ethstorage/scanner/scanner.go | 2 +- ethstorage/scanner/worker.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ethstorage/scanner/scanner.go b/ethstorage/scanner/scanner.go index 7f7409c1..fa128312 100644 --- a/ethstorage/scanner/scanner.go +++ b/ethstorage/scanner/scanner.go @@ -276,7 +276,7 @@ func (s *Scanner) logStats() { s.lg.Info("Local storage summary", "localKvs", sum, "localKvCount", localKvCount) s.statsMu.Lock() - mismatched := "(none)" + mismatched := "[]" if len(s.sharedStats) > 0 { mismatched = s.sharedStats.String() } diff --git a/ethstorage/scanner/worker.go b/ethstorage/scanner/worker.go index d158f27b..dc6d371a 100644 --- a/ethstorage/scanner/worker.go +++ b/ethstorage/scanner/worker.go @@ -252,7 +252,7 @@ func (s *Worker) summaryLocalKvs() (uint64, string) { kvEntryCountOnChain := s.sm.KvEntryCount() if kvEntryCountOnChain == 0 { s.lg.Info("No KV entries found in local storage") - return 0, "(none)" + return 0, "[]" } return summaryLocalKvs(s.sm.Shards(), s.sm.KvEntries(), kvEntryCountOnChain-1) }
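PATCH 45 above replaces the hard-coded batch of 7200 blocks with `uint64(IntervalBlock / L1SlotTime)`, i.e. the number of L1 slots that elapse between two check-block runs. The sketch below works through that arithmetic using the 1440-minute default interval from the README and a 12-second slot time; both values are assumptions of the example and are not read from any configuration.

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Assumed defaults: check-block runs every 1440 minutes and the L1 slot
	// time is 12 seconds (Ethereum mainnet); neither is read from config here.
	intervalBlock := 1440 * time.Minute
	l1SlotTime := 12 * time.Second

	// Mirrors PATCH 45: batchSize = uint64(IntervalBlock / L1SlotTime),
	// i.e. how many slots elapse between two consecutive check-block runs.
	batchSize := uint64(intervalBlock / l1SlotTime)

	fmt.Println(batchSize) // 7200, the same "one day of L1 blocks" the old constant encoded
}
```

With these defaults the computed batch size reproduces the previous constant, while shorter intervals or different slot times now scale the scanned block range automatically.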