feat(proxyd): improved consensus HA error reporting (#9647)

* feat(proxyd): improved consensus HA error reporting

* error metric should be a counter
This commit is contained in:
felipe 2024-02-23 13:17:07 -08:00 committed by GitHub
parent 97767bcfa3
commit a6c9489376
2 changed files with 43 additions and 2 deletions

@ -194,10 +194,12 @@ func (ct *RedisConsensusTracker) stateHeartbeat() {
val, err := ct.client.Get(ct.ctx, key).Result() val, err := ct.client.Get(ct.ctx, key).Result()
if err != nil && err != redis.Nil { if err != nil && err != redis.Nil {
log.Error("failed to read the lock", "err", err) log.Error("failed to read the lock", "err", err)
RecordGroupConsensusError(ct.backendGroup, "read_lock", err)
if ct.leader { if ct.leader {
ok, err := ct.redlock.Unlock() ok, err := ct.redlock.Unlock()
if err != nil || !ok { if err != nil || !ok {
log.Error("failed to release the lock after error", "err", err) log.Error("failed to release the lock after error", "err", err)
RecordGroupConsensusError(ct.backendGroup, "leader_release_lock", err)
return return
} }
ct.leader = false ct.leader = false
@ -210,9 +212,11 @@ func (ct *RedisConsensusTracker) stateHeartbeat() {
ok, err := ct.redlock.Extend() ok, err := ct.redlock.Extend()
if err != nil || !ok { if err != nil || !ok {
log.Error("failed to extend lock", "err", err, "mutex", ct.redlock.Name(), "val", ct.redlock.Value()) log.Error("failed to extend lock", "err", err, "mutex", ct.redlock.Name(), "val", ct.redlock.Value())
RecordGroupConsensusError(ct.backendGroup, "leader_extend_lock", err)
ok, err := ct.redlock.Unlock() ok, err := ct.redlock.Unlock()
if err != nil || !ok { if err != nil || !ok {
log.Error("failed to release the lock after error", "err", err) log.Error("failed to release the lock after error", "err", err)
RecordGroupConsensusError(ct.backendGroup, "leader_release_lock", err)
return return
} }
ct.leader = false ct.leader = false
@ -224,6 +228,7 @@ func (ct *RedisConsensusTracker) stateHeartbeat() {
leaderName, err := ct.client.Get(ct.ctx, ct.key(fmt.Sprintf("leader:%s", val))).Result() leaderName, err := ct.client.Get(ct.ctx, ct.key(fmt.Sprintf("leader:%s", val))).Result()
if err != nil && err != redis.Nil { if err != nil && err != redis.Nil {
log.Error("failed to read the remote leader", "err", err) log.Error("failed to read the remote leader", "err", err)
RecordGroupConsensusError(ct.backendGroup, "read_leader", err)
return return
} }
ct.leaderName = leaderName ct.leaderName = leaderName
@ -232,16 +237,19 @@ func (ct *RedisConsensusTracker) stateHeartbeat() {
val, err := ct.client.Get(ct.ctx, ct.key(fmt.Sprintf("state:%s", val))).Result() val, err := ct.client.Get(ct.ctx, ct.key(fmt.Sprintf("state:%s", val))).Result()
if err != nil && err != redis.Nil { if err != nil && err != redis.Nil {
log.Error("failed to read the remote state", "err", err) log.Error("failed to read the remote state", "err", err)
RecordGroupConsensusError(ct.backendGroup, "read_state", err)
return return
} }
if val == "" { if val == "" {
log.Error("remote state is missing (recent leader election maybe?)") log.Error("remote state is missing (recent leader election maybe?)")
RecordGroupConsensusError(ct.backendGroup, "read_state_missing", err)
return return
} }
state := &ConsensusTrackerState{} state := &ConsensusTrackerState{}
err = json.Unmarshal([]byte(val), state) err = json.Unmarshal([]byte(val), state)
if err != nil { if err != nil {
log.Error("failed to unmarshal the remote state", "err", err) log.Error("failed to unmarshal the remote state", "err", err)
RecordGroupConsensusError(ct.backendGroup, "read_unmarshal_state", err)
return return
} }
@ -316,13 +324,26 @@ func (ct *RedisConsensusTracker) postPayload(mutexVal string) {
jsonState, err := json.Marshal(ct.local.state) jsonState, err := json.Marshal(ct.local.state)
if err != nil { if err != nil {
log.Error("failed to marshal local", "err", err) log.Error("failed to marshal local", "err", err)
RecordGroupConsensusError(ct.backendGroup, "leader_marshal_local_state", err)
ct.leader = false
return
}
err = ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("state:%s", mutexVal)), jsonState, ct.lockPeriod).Err()
if err != nil {
log.Error("failed to post the state", "err", err)
RecordGroupConsensusError(ct.backendGroup, "leader_post_state", err)
ct.leader = false ct.leader = false
return return
} }
ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("state:%s", mutexVal)), jsonState, ct.lockPeriod)
leader, _ := os.LookupEnv("HOSTNAME") leader, _ := os.LookupEnv("HOSTNAME")
ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("leader:%s", mutexVal)), leader, ct.lockPeriod) err = ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("leader:%s", mutexVal)), leader, ct.lockPeriod).Err()
if err != nil {
log.Error("failed to post the leader", "err", err)
RecordGroupConsensusError(ct.backendGroup, "leader_post_leader", err)
ct.leader = false
return
}
log.Debug("posted state", "state", string(jsonState), "leader", leader) log.Debug("posted state", "state", string(jsonState), "leader", leader)

@ -2,6 +2,8 @@ package proxyd
import ( import (
"context" "context"
"fmt"
"regexp"
"strconv" "strconv"
"strings" "strings"
"time" "time"
@ -262,6 +264,14 @@ var (
"backend_group_name", "backend_group_name",
}) })
// consensusHAError counts consensus HA errors. It carries a single "error"
// label and is incremented by RecordGroupConsensusError.
consensusHAError = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_ha_error",
Help: "Consensus HA error count",
}, []string{
"error",
})
consensusHALatestBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{ consensusHALatestBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace, Namespace: MetricsNamespace,
Name: "group_consensus_ha_latest_block", Name: "group_consensus_ha_latest_block",
@ -465,6 +475,16 @@ func RecordBatchSize(size int) {
batchSizeHistogram.Observe(float64(size)) batchSizeHistogram.Observe(float64(size))
} }
// nonAlphanumericRegex matches runs of characters that are neither ASCII
// letters nor spaces. Despite its name it also matches digits, so numbers
// are stripped from error text along with punctuation.
var nonAlphanumericRegex = regexp.MustCompile(`[^a-zA-Z ]+`)

// RecordGroupConsensusError increments the consensus HA error counter.
//
// The value of the counter's "error" label is "<label>.<cleaned error>",
// where the cleaned error is err.Error() with every character other than
// ASCII letters and spaces removed, spaces turned into underscores, and
// double underscores replaced by single ones (one pass only, so longer runs
// are shortened rather than fully collapsed).
//
// err may be nil: callers report conditions that are signalled without an
// error value (for example a lock operation returning ok == false, or a
// missing remote state). In that case the counter is incremented under the
// bare label instead of dereferencing a nil error.
//
// group is accepted for symmetry with the other Record* helpers but is not
// used; the metric has no backend-group label.
func RecordGroupConsensusError(group *BackendGroup, label string, err error) {
	if err != nil {
		errClean := nonAlphanumericRegex.ReplaceAllString(err.Error(), "")
		errClean = strings.ReplaceAll(errClean, " ", "_")
		errClean = strings.ReplaceAll(errClean, "__", "_")
		label = fmt.Sprintf("%s.%s", label, errClean)
	}
	consensusHAError.WithLabelValues(label).Inc()
}
func RecordGroupConsensusHALatestBlock(group *BackendGroup, leader string, blockNumber hexutil.Uint64) { func RecordGroupConsensusHALatestBlock(group *BackendGroup, leader string, blockNumber hexutil.Uint64) {
consensusHALatestBlock.WithLabelValues(group.Name, leader).Set(float64(blockNumber)) consensusHALatestBlock.WithLabelValues(group.Name, leader).Set(float64(blockNumber))
} }