feat(proxyd): improved consensus HA error reporting (#9647)

* feat(proxyd): improved consensus HA error reporting

* error metric should be a counter
This commit is contained in:
felipe 2024-02-23 13:17:07 -08:00 committed by GitHub
parent 97767bcfa3
commit a6c9489376
2 changed files with 43 additions and 2 deletions

@ -194,10 +194,12 @@ func (ct *RedisConsensusTracker) stateHeartbeat() {
val, err := ct.client.Get(ct.ctx, key).Result()
if err != nil && err != redis.Nil {
log.Error("failed to read the lock", "err", err)
RecordGroupConsensusError(ct.backendGroup, "read_lock", err)
if ct.leader {
ok, err := ct.redlock.Unlock()
if err != nil || !ok {
log.Error("failed to release the lock after error", "err", err)
RecordGroupConsensusError(ct.backendGroup, "leader_release_lock", err)
return
}
ct.leader = false
@ -210,9 +212,11 @@ func (ct *RedisConsensusTracker) stateHeartbeat() {
ok, err := ct.redlock.Extend()
if err != nil || !ok {
log.Error("failed to extend lock", "err", err, "mutex", ct.redlock.Name(), "val", ct.redlock.Value())
RecordGroupConsensusError(ct.backendGroup, "leader_extend_lock", err)
ok, err := ct.redlock.Unlock()
if err != nil || !ok {
log.Error("failed to release the lock after error", "err", err)
RecordGroupConsensusError(ct.backendGroup, "leader_release_lock", err)
return
}
ct.leader = false
@ -224,6 +228,7 @@ func (ct *RedisConsensusTracker) stateHeartbeat() {
leaderName, err := ct.client.Get(ct.ctx, ct.key(fmt.Sprintf("leader:%s", val))).Result()
if err != nil && err != redis.Nil {
log.Error("failed to read the remote leader", "err", err)
RecordGroupConsensusError(ct.backendGroup, "read_leader", err)
return
}
ct.leaderName = leaderName
@ -232,16 +237,19 @@ func (ct *RedisConsensusTracker) stateHeartbeat() {
val, err := ct.client.Get(ct.ctx, ct.key(fmt.Sprintf("state:%s", val))).Result()
if err != nil && err != redis.Nil {
log.Error("failed to read the remote state", "err", err)
RecordGroupConsensusError(ct.backendGroup, "read_state", err)
return
}
if val == "" {
log.Error("remote state is missing (recent leader election maybe?)")
RecordGroupConsensusError(ct.backendGroup, "read_state_missing", err)
return
}
state := &ConsensusTrackerState{}
err = json.Unmarshal([]byte(val), state)
if err != nil {
log.Error("failed to unmarshal the remote state", "err", err)
RecordGroupConsensusError(ct.backendGroup, "read_unmarshal_state", err)
return
}
@ -316,13 +324,26 @@ func (ct *RedisConsensusTracker) postPayload(mutexVal string) {
jsonState, err := json.Marshal(ct.local.state)
if err != nil {
log.Error("failed to marshal local", "err", err)
RecordGroupConsensusError(ct.backendGroup, "leader_marshal_local_state", err)
ct.leader = false
return
}
err = ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("state:%s", mutexVal)), jsonState, ct.lockPeriod).Err()
if err != nil {
log.Error("failed to post the state", "err", err)
RecordGroupConsensusError(ct.backendGroup, "leader_post_state", err)
ct.leader = false
return
}
ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("state:%s", mutexVal)), jsonState, ct.lockPeriod)
leader, _ := os.LookupEnv("HOSTNAME")
ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("leader:%s", mutexVal)), leader, ct.lockPeriod)
err = ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("leader:%s", mutexVal)), leader, ct.lockPeriod).Err()
if err != nil {
log.Error("failed to post the leader", "err", err)
RecordGroupConsensusError(ct.backendGroup, "leader_post_leader", err)
ct.leader = false
return
}
log.Debug("posted state", "state", string(jsonState), "leader", leader)

@ -2,6 +2,8 @@ package proxyd
import (
"context"
"fmt"
"regexp"
"strconv"
"strings"
"time"
@ -262,6 +264,14 @@ var (
"backend_group_name",
})
consensusHAError = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_ha_error",
Help: "Consensus HA error count",
}, []string{
"error",
})
consensusHALatestBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_ha_latest_block",
@ -465,6 +475,16 @@ func RecordBatchSize(size int) {
batchSizeHistogram.Observe(float64(size))
}
var nonAlphanumericRegex = regexp.MustCompile(`[^a-zA-Z ]+`)
func RecordGroupConsensusError(group *BackendGroup, label string, err error) {
errClean := nonAlphanumericRegex.ReplaceAllString(err.Error(), "")
errClean = strings.ReplaceAll(errClean, " ", "_")
errClean = strings.ReplaceAll(errClean, "__", "_")
label = fmt.Sprintf("%s.%s", label, errClean)
consensusHAError.WithLabelValues(label).Inc()
}
func RecordGroupConsensusHALatestBlock(group *BackendGroup, leader string, blockNumber hexutil.Uint64) {
consensusHALatestBlock.WithLabelValues(group.Name, leader).Set(float64(blockNumber))
}