From a6c9489376d38cdaecece2e470313f6052824570 Mon Sep 17 00:00:00 2001 From: felipe <130432649+felipe-op@users.noreply.github.com> Date: Fri, 23 Feb 2024 13:17:07 -0800 Subject: [PATCH] feat(proxyd): improved consensus HA error reporting (#9647) * feat(proxyd): improved consensus HA error reporting * error metric should be a counter --- proxyd/proxyd/consensus_tracker.go | 25 +++++++++++++++++++++++-- proxyd/proxyd/metrics.go | 20 ++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/proxyd/proxyd/consensus_tracker.go b/proxyd/proxyd/consensus_tracker.go index 158c31b..77e0fdb 100644 --- a/proxyd/proxyd/consensus_tracker.go +++ b/proxyd/proxyd/consensus_tracker.go @@ -194,10 +194,12 @@ func (ct *RedisConsensusTracker) stateHeartbeat() { val, err := ct.client.Get(ct.ctx, key).Result() if err != nil && err != redis.Nil { log.Error("failed to read the lock", "err", err) + RecordGroupConsensusError(ct.backendGroup, "read_lock", err) if ct.leader { ok, err := ct.redlock.Unlock() if err != nil || !ok { log.Error("failed to release the lock after error", "err", err) + RecordGroupConsensusError(ct.backendGroup, "leader_release_lock", err) return } ct.leader = false @@ -210,9 +212,11 @@ func (ct *RedisConsensusTracker) stateHeartbeat() { ok, err := ct.redlock.Extend() if err != nil || !ok { log.Error("failed to extend lock", "err", err, "mutex", ct.redlock.Name(), "val", ct.redlock.Value()) + RecordGroupConsensusError(ct.backendGroup, "leader_extend_lock", err) ok, err := ct.redlock.Unlock() if err != nil || !ok { log.Error("failed to release the lock after error", "err", err) + RecordGroupConsensusError(ct.backendGroup, "leader_release_lock", err) return } ct.leader = false @@ -224,6 +228,7 @@ func (ct *RedisConsensusTracker) stateHeartbeat() { leaderName, err := ct.client.Get(ct.ctx, ct.key(fmt.Sprintf("leader:%s", val))).Result() if err != nil && err != redis.Nil { log.Error("failed to read the remote leader", "err", err) + 
RecordGroupConsensusError(ct.backendGroup, "read_leader", err) return } ct.leaderName = leaderName @@ -232,16 +237,19 @@ func (ct *RedisConsensusTracker) stateHeartbeat() { val, err := ct.client.Get(ct.ctx, ct.key(fmt.Sprintf("state:%s", val))).Result() if err != nil && err != redis.Nil { log.Error("failed to read the remote state", "err", err) + RecordGroupConsensusError(ct.backendGroup, "read_state", err) return } if val == "" { log.Error("remote state is missing (recent leader election maybe?)") + RecordGroupConsensusError(ct.backendGroup, "read_state_missing", err) return } state := &ConsensusTrackerState{} err = json.Unmarshal([]byte(val), state) if err != nil { log.Error("failed to unmarshal the remote state", "err", err) + RecordGroupConsensusError(ct.backendGroup, "read_unmarshal_state", err) return } @@ -316,13 +324,26 @@ func (ct *RedisConsensusTracker) postPayload(mutexVal string) { jsonState, err := json.Marshal(ct.local.state) if err != nil { log.Error("failed to marshal local", "err", err) + RecordGroupConsensusError(ct.backendGroup, "leader_marshal_local_state", err) + ct.leader = false + return + } + err = ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("state:%s", mutexVal)), jsonState, ct.lockPeriod).Err() + if err != nil { + log.Error("failed to post the state", "err", err) + RecordGroupConsensusError(ct.backendGroup, "leader_post_state", err) ct.leader = false return } - ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("state:%s", mutexVal)), jsonState, ct.lockPeriod) leader, _ := os.LookupEnv("HOSTNAME") - ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("leader:%s", mutexVal)), leader, ct.lockPeriod) + err = ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("leader:%s", mutexVal)), leader, ct.lockPeriod).Err() + if err != nil { + log.Error("failed to post the leader", "err", err) + RecordGroupConsensusError(ct.backendGroup, "leader_post_leader", err) + ct.leader = false + return + } log.Debug("posted state", "state", string(jsonState), "leader", leader) diff --git 
a/proxyd/proxyd/metrics.go b/proxyd/proxyd/metrics.go
index 68ca4e8..90a79ab 100644
--- a/proxyd/proxyd/metrics.go
+++ b/proxyd/proxyd/metrics.go
@@ -2,6 +2,8 @@ package proxyd
 
 import (
 	"context"
+	"fmt"
+	"regexp"
 	"strconv"
 	"strings"
 	"time"
@@ -262,6 +264,14 @@ var (
 		"backend_group_name",
 	})
 
+	consensusHAError = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: MetricsNamespace,
+		Name:      "group_consensus_ha_error",
+		Help:      "Consensus HA error count",
+	}, []string{
+		"error",
+	})
+
 	consensusHALatestBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
 		Namespace: MetricsNamespace,
 		Name:      "group_consensus_ha_latest_block",
@@ -465,6 +475,27 @@ func RecordBatchSize(size int) {
 	batchSizeHistogram.Observe(float64(size))
 }
 
+// nonAlphanumericRegex matches runs of characters other than ASCII letters and
+// spaces; despite its name it strips digits too.
+var nonAlphanumericRegex = regexp.MustCompile(`[^a-zA-Z ]+`)
+
+// RecordGroupConsensusError increments the consensus HA error counter. The
+// "error" label value is label followed by "." and a sanitized form of
+// err.Error() (non-letters removed, spaces turned into underscores). When err
+// is nil the bare label is used: callers also report failures that carry no
+// error value (for example a lock operation returning ok == false), and
+// calling err.Error() on a nil error would panic. The group argument is
+// currently unused.
+func RecordGroupConsensusError(group *BackendGroup, label string, err error) {
+	if err != nil {
+		errClean := nonAlphanumericRegex.ReplaceAllString(err.Error(), "")
+		errClean = strings.ReplaceAll(errClean, " ", "_")
+		errClean = strings.ReplaceAll(errClean, "__", "_")
+		label = fmt.Sprintf("%s.%s", label, errClean)
+	}
+	consensusHAError.WithLabelValues(label).Inc()
+}
+
 func RecordGroupConsensusHALatestBlock(group *BackendGroup, leader string, blockNumber hexutil.Uint64) {
 	consensusHALatestBlock.WithLabelValues(group.Name, leader).Set(float64(blockNumber))
 }