From cde638b11dd90a07790339607be6134faf832980 Mon Sep 17 00:00:00 2001 From: Felipe Andrade Date: Tue, 9 May 2023 17:21:25 -0700 Subject: [PATCH 1/5] moar consensus metrics --- proxyd/proxyd/consensus_poller.go | 22 +++++++--- proxyd/proxyd/metrics.go | 73 ++++++++++++++++++++++++++++++- 2 files changed, 88 insertions(+), 7 deletions(-) diff --git a/proxyd/proxyd/consensus_poller.go b/proxyd/proxyd/consensus_poller.go index 34fe708..a7b1fc2 100644 --- a/proxyd/proxyd/consensus_poller.go +++ b/proxyd/proxyd/consensus_poller.go @@ -203,7 +203,10 @@ func NewConsensusPoller(bg *BackendGroup, opts ...ConsensusOpt) *ConsensusPoller // UpdateBackend refreshes the consensus state of a single backend func (cp *ConsensusPoller) UpdateBackend(ctx context.Context, be *Backend) { - if cp.IsBanned(be) { + banned := cp.IsBanned(be) + RecordConsensusBackendBanned(be, banned) + + if banned { log.Debug("skipping backend banned", "backend", be.Name) return } @@ -212,6 +215,7 @@ func (cp *ConsensusPoller) UpdateBackend(ctx context.Context, be *Backend) { if !be.Online() || !be.IsHealthy() { log.Warn("backend banned - not online or not healthy", "backend", be.Name) cp.Ban(be) + return } // if backend it not in sync we'll check again after ban @@ -219,7 +223,9 @@ func (cp *ConsensusPoller) UpdateBackend(ctx context.Context, be *Backend) { if err != nil || !inSync { log.Warn("backend banned - not in sync", "backend", be.Name) cp.Ban(be) + return } + RecordConsensusBackendInSync(be, inSync) // if backend exhausted rate limit we'll skip it for now if be.IsRateLimited() { @@ -234,6 +240,7 @@ func (cp *ConsensusPoller) UpdateBackend(ctx context.Context, be *Backend) { return } } + RecordConsensusBackendPeerCount(be, peerCount) latestBlockNumber, latestBlockHash, err := cp.fetchBlock(ctx, be, "latest") if err != nil { @@ -241,15 +248,17 @@ func (cp *ConsensusPoller) UpdateBackend(ctx context.Context, be *Backend) { return } - changed := cp.setBackendState(be, peerCount, latestBlockNumber, latestBlockHash) + changed, updateDelay := cp.setBackendState(be, peerCount, latestBlockNumber, latestBlockHash) if changed { RecordBackendLatestBlock(be, latestBlockNumber) + RecordConsensusBackendUpdateDelay(be, updateDelay) log.Debug("backend state updated", "name", be.Name, "peerCount", peerCount, "latestBlockNumber", latestBlockNumber, - "latestBlockHash", latestBlockHash) + "latestBlockHash", latestBlockHash, + "updateDelay", updateDelay) } } @@ -354,11 +363,13 @@ func (cp *ConsensusPoller) UpdateBackendGroupConsensus(ctx context.Context) { } cp.tracker.SetConsensusBlockNumber(proposedBlock) - RecordGroupConsensusLatestBlock(cp.backendGroup, proposedBlock) cp.consensusGroupMux.Lock() cp.consensusGroup = consensusBackends cp.consensusGroupMux.Unlock() + RecordGroupConsensusLatestBlock(cp.backendGroup, proposedBlock) + RecordGroupConsensusCount(cp.backendGroup, len(consensusBackends)) + log.Debug("group state", "proposedBlock", proposedBlock, "consensusBackends", strings.Join(consensusBackendsNames, ", "), "filteredBackends", strings.Join(filteredBackendsNames, ", ")) } @@ -463,13 +474,14 @@ func (cp *ConsensusPoller) getBackendState(be *Backend) (peerCount uint64, block return } -func (cp *ConsensusPoller) setBackendState(be *Backend, peerCount uint64, blockNumber hexutil.Uint64, blockHash string) (changed bool) { +func (cp *ConsensusPoller) setBackendState(be *Backend, peerCount uint64, blockNumber hexutil.Uint64, blockHash string) (changed bool, updateDelay time.Duration) { bs := cp.backendState[be] bs.backendStateMux.Lock() changed = bs.latestBlockHash != blockHash bs.peerCount = peerCount bs.latestBlockNumber = blockNumber bs.latestBlockHash = blockHash + updateDelay = time.Now().Sub(bs.lastUpdate) bs.lastUpdate = time.Now() bs.backendStateMux.Unlock() return diff --git a/proxyd/proxyd/metrics.go b/proxyd/proxyd/metrics.go index 2aef49d..ab5d284 100644 --- a/proxyd/proxyd/metrics.go +++ b/proxyd/proxyd/metrics.go @@ -4,6 +4,7 @@ import ( "context" "strconv" "strings" + "time" "github.com/ethereum/go-ethereum/common/hexutil" @@ -260,6 +261,46 @@ var ( }, []string{ "backend_name", }) + + consensusGroupCount = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: MetricsNamespace, + Name: "group_consensus_count", + Help: "Consensus group count", + }, []string{ + "backend_group_name", + }) + + consensusBannedBackends = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: MetricsNamespace, + Name: "consensus_backend_banned", + Help: "Bool gauge for banned backends", + }, []string{ + "backend_name", + }) + + consensusPeerCountBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: MetricsNamespace, + Name: "consensus_backend_peer_count", + Help: "Peer count", + }, []string{ + "backend_name", + }) + + consensusInSyncBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: MetricsNamespace, + Name: "consensus_backend_in_sync", + Help: "Bool gauge for backends in sync", + }, []string{ + "backend_name", + }) + + consensusUpdateDelayBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: MetricsNamespace, + Name: "consensus_backend_update_delay", + Help: "Delay (ms) for backend update", + }, []string{ + "backend_name", + }) ) func RecordRedisError(source string) { @@ -321,10 +362,38 @@ func RecordBatchSize(size int) { batchSizeHistogram.Observe(float64(size)) } +func RecordGroupConsensusLatestBlock(group *BackendGroup, blockNumber hexutil.Uint64) { + consensusLatestBlock.WithLabelValues(group.Name).Set(float64(blockNumber)) +} + +func RecordGroupConsensusCount(group *BackendGroup, count int) { + consensusGroupCount.WithLabelValues(group.Name).Set(float64(count)) +} + func RecordBackendLatestBlock(be *Backend, blockNumber hexutil.Uint64) { backendLatestBlockBackend.WithLabelValues(be.Name).Set(float64(blockNumber)) } -func RecordGroupConsensusLatestBlock(group *BackendGroup, blockNumber hexutil.Uint64) { - consensusLatestBlock.WithLabelValues(group.Name).Set(float64(blockNumber)) +func RecordConsensusBackendBanned(be *Backend, banned bool) { + v := float64(0) + if banned { + v = float64(1) + } + consensusBannedBackends.WithLabelValues(be.Name).Set(v) +} + +func RecordConsensusBackendPeerCount(be *Backend, peerCount uint64) { + consensusPeerCountBackend.WithLabelValues(be.Name).Set(float64(peerCount)) +} + +func RecordConsensusBackendInSync(be *Backend, inSync bool) { + v := float64(0) + if inSync { + v = float64(1) + } + consensusInSyncBackend.WithLabelValues(be.Name).Set(v) +} + +func RecordConsensusBackendUpdateDelay(be *Backend, delay time.Duration) { + consensusUpdateDelayBackend.WithLabelValues(be.Name).Set(float64(delay.Round(time.Millisecond))) } From cfb26e6a8ae74e6d26bc03578af47ae7eb857a04 Mon Sep 17 00:00:00 2001 From: Felipe Andrade Date: Tue, 9 May 2023 17:50:25 -0700 Subject: [PATCH 2/5] convert update delay to ms --- proxyd/proxyd/metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxyd/proxyd/metrics.go b/proxyd/proxyd/metrics.go index ab5d284..6c76fba 100644 --- a/proxyd/proxyd/metrics.go +++ b/proxyd/proxyd/metrics.go @@ -395,5 +395,5 @@ func RecordConsensusBackendInSync(be *Backend, inSync bool) { } func RecordConsensusBackendUpdateDelay(be *Backend, delay time.Duration) { - consensusUpdateDelayBackend.WithLabelValues(be.Name).Set(float64(delay.Round(time.Millisecond))) + consensusUpdateDelayBackend.WithLabelValues(be.Name).Set(float64(delay.Milliseconds())) } From 651b526c506d0a9c213ab4a492cc33ce4a86407d Mon Sep 17 00:00:00 2001 From: Felipe Andrade Date: Tue, 9 May 2023 19:11:29 -0700 Subject: [PATCH 3/5] lint --- proxyd/proxyd/consensus_poller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxyd/proxyd/consensus_poller.go b/proxyd/proxyd/consensus_poller.go index a7b1fc2..d52130c 100644 --- a/proxyd/proxyd/consensus_poller.go +++ b/proxyd/proxyd/consensus_poller.go @@ -481,7 +481,7 @@ func (cp *ConsensusPoller) setBackendState(be *Backend, peerCount uint64, blockN bs.peerCount = peerCount bs.latestBlockNumber = blockNumber bs.latestBlockHash = blockHash - updateDelay = time.Now().Sub(bs.lastUpdate) + updateDelay = time.Since(bs.lastUpdate) bs.lastUpdate = time.Now() bs.backendStateMux.Unlock() return From 5f61935bc400dbf2f5e4ce4e33438dc4ca1f5a10 Mon Sep 17 00:00:00 2001 From: Felipe Andrade Date: Tue, 9 May 2023 19:17:25 -0700 Subject: [PATCH 4/5] add filtered and total counts --- proxyd/proxyd/consensus_poller.go | 2 ++ proxyd/proxyd/metrics.go | 26 +++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/proxyd/proxyd/consensus_poller.go b/proxyd/proxyd/consensus_poller.go index d52130c..62f7bdf 100644 --- a/proxyd/proxyd/consensus_poller.go +++ b/proxyd/proxyd/consensus_poller.go @@ -369,6 +369,8 @@ func (cp *ConsensusPoller) UpdateBackendGroupConsensus(ctx context.Context) { RecordGroupConsensusLatestBlock(cp.backendGroup, proposedBlock) RecordGroupConsensusCount(cp.backendGroup, len(consensusBackends)) + RecordGroupConsensusFilteredCount(cp.backendGroup, len(filteredBackendsNames)) + RecordGroupTotalCount(cp.backendGroup, len(cp.backendGroup.Backends)) log.Debug("group state", "proposedBlock", proposedBlock, "consensusBackends", strings.Join(consensusBackendsNames, ", "), "filteredBackends", strings.Join(filteredBackendsNames, ", ")) } diff --git a/proxyd/proxyd/metrics.go b/proxyd/proxyd/metrics.go index 6c76fba..efc36ac 100644 --- a/proxyd/proxyd/metrics.go +++ b/proxyd/proxyd/metrics.go @@ -265,7 +265,23 @@ var ( consensusGroupCount = promauto.NewGaugeVec(prometheus.GaugeOpts{ Namespace: MetricsNamespace, Name: "group_consensus_count", - Help: "Consensus group count", + Help: "Consensus group serving traffic count", + }, []string{ + "backend_group_name", + }) + + consensusGroupFilteredCount = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: MetricsNamespace, + Name: "group_consensus_filtered_count", + Help: "Consensus group filtered out from serving traffic count", + }, []string{ + "backend_group_name", + }) + + consensusGroupTotalCount = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: MetricsNamespace, + Name: "group_consensus_total_count", + Help: "Total count of candidates to be part of consensus group", }, []string{ "backend_group_name", }) @@ -370,6 +386,14 @@ func RecordGroupConsensusCount(group *BackendGroup, count int) { consensusGroupCount.WithLabelValues(group.Name).Set(float64(count)) } +func RecordGroupConsensusFilteredCount(group *BackendGroup, count int) { + consensusGroupFilteredCount.WithLabelValues(group.Name).Set(float64(count)) +} + +func RecordGroupTotalCount(group *BackendGroup, count int) { + consensusGroupTotalCount.WithLabelValues(group.Name).Set(float64(count)) +} + func RecordBackendLatestBlock(be *Backend, blockNumber hexutil.Uint64) { backendLatestBlockBackend.WithLabelValues(be.Name).Set(float64(blockNumber)) } From 88a172f7399c0d2631640d5f0534b687ef3ba855 Mon Sep 17 00:00:00 2001 From: Felipe Andrade Date: Tue, 9 May 2023 19:31:49 -0700 Subject: [PATCH 5/5] skip reporting peer count according to config --- proxyd/proxyd/consensus_poller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxyd/proxyd/consensus_poller.go b/proxyd/proxyd/consensus_poller.go index 62f7bdf..c486d1c 100644 --- a/proxyd/proxyd/consensus_poller.go +++ b/proxyd/proxyd/consensus_poller.go @@ -239,8 +239,8 @@ func (cp *ConsensusPoller) UpdateBackend(ctx context.Context, be *Backend) { log.Warn("error updating backend", "name", be.Name, "err", err) return } + RecordConsensusBackendPeerCount(be, peerCount) } - RecordConsensusBackendPeerCount(be, peerCount) latestBlockNumber, latestBlockHash, err := cp.fetchBlock(ctx, be, "latest") if err != nil {