feat(proxyd): improved consensus HA error reporting (#9647)
* feat(proxyd): improved consensus HA error reporting
* error metric should be a counter
parent 97767bcfa3
commit a6c9489376
@@ -194,10 +194,12 @@ func (ct *RedisConsensusTracker) stateHeartbeat() {
 	val, err := ct.client.Get(ct.ctx, key).Result()
 	if err != nil && err != redis.Nil {
 		log.Error("failed to read the lock", "err", err)
+		RecordGroupConsensusError(ct.backendGroup, "read_lock", err)
 		if ct.leader {
 			ok, err := ct.redlock.Unlock()
 			if err != nil || !ok {
 				log.Error("failed to release the lock after error", "err", err)
+				RecordGroupConsensusError(ct.backendGroup, "leader_release_lock", err)
 				return
 			}
 			ct.leader = false
@@ -210,9 +212,11 @@ func (ct *RedisConsensusTracker) stateHeartbeat() {
 		ok, err := ct.redlock.Extend()
 		if err != nil || !ok {
 			log.Error("failed to extend lock", "err", err, "mutex", ct.redlock.Name(), "val", ct.redlock.Value())
+			RecordGroupConsensusError(ct.backendGroup, "leader_extend_lock", err)
 			ok, err := ct.redlock.Unlock()
 			if err != nil || !ok {
 				log.Error("failed to release the lock after error", "err", err)
+				RecordGroupConsensusError(ct.backendGroup, "leader_release_lock", err)
 				return
 			}
 			ct.leader = false
@@ -224,6 +228,7 @@ func (ct *RedisConsensusTracker) stateHeartbeat() {
 		leaderName, err := ct.client.Get(ct.ctx, ct.key(fmt.Sprintf("leader:%s", val))).Result()
 		if err != nil && err != redis.Nil {
 			log.Error("failed to read the remote leader", "err", err)
+			RecordGroupConsensusError(ct.backendGroup, "read_leader", err)
 			return
 		}
 		ct.leaderName = leaderName
@@ -232,16 +237,19 @@ func (ct *RedisConsensusTracker) stateHeartbeat() {
 		val, err := ct.client.Get(ct.ctx, ct.key(fmt.Sprintf("state:%s", val))).Result()
 		if err != nil && err != redis.Nil {
 			log.Error("failed to read the remote state", "err", err)
+			RecordGroupConsensusError(ct.backendGroup, "read_state", err)
 			return
 		}
 		if val == "" {
 			log.Error("remote state is missing (recent leader election maybe?)")
+			RecordGroupConsensusError(ct.backendGroup, "read_state_missing", err)
 			return
 		}
 		state := &ConsensusTrackerState{}
 		err = json.Unmarshal([]byte(val), state)
 		if err != nil {
 			log.Error("failed to unmarshal the remote state", "err", err)
+			RecordGroupConsensusError(ct.backendGroup, "read_unmarshal_state", err)
 			return
 		}
 
@@ -316,13 +324,26 @@ func (ct *RedisConsensusTracker) postPayload(mutexVal string) {
 	jsonState, err := json.Marshal(ct.local.state)
 	if err != nil {
 		log.Error("failed to marshal local", "err", err)
+		RecordGroupConsensusError(ct.backendGroup, "leader_marshal_local_state", err)
+		ct.leader = false
+		return
+	}
+	err = ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("state:%s", mutexVal)), jsonState, ct.lockPeriod).Err()
+	if err != nil {
+		log.Error("failed to post the state", "err", err)
+		RecordGroupConsensusError(ct.backendGroup, "leader_post_state", err)
 		ct.leader = false
 		return
 	}
-	ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("state:%s", mutexVal)), jsonState, ct.lockPeriod)
 
 	leader, _ := os.LookupEnv("HOSTNAME")
-	ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("leader:%s", mutexVal)), leader, ct.lockPeriod)
+	err = ct.client.Set(ct.ctx, ct.key(fmt.Sprintf("leader:%s", mutexVal)), leader, ct.lockPeriod).Err()
+	if err != nil {
+		log.Error("failed to post the leader", "err", err)
+		RecordGroupConsensusError(ct.backendGroup, "leader_post_leader", err)
+		ct.leader = false
+		return
+	}
 
 	log.Debug("posted state", "state", string(jsonState), "leader", leader)
 
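The postPayload hunk above also stops dropping write failures silently: the old ct.client.Set calls discarded their results, while the new code checks .Err() on every write and, on failure, records the error and relinquishes leadership so another instance can take over on the next heartbeat. Below is a minimal standalone sketch of that write-or-step-down pattern, assuming go-redis v9; the tracker type, field names, key, and TTL are stand-ins, not proxyd's.

package main

import (
	"context"
	"log"
	"time"

	"github.com/redis/go-redis/v9"
)

// tracker is a stand-in for proxyd's RedisConsensusTracker.
type tracker struct {
	client *redis.Client
	leader bool
}

// postState mirrors the write-or-step-down pattern: a failed write is not
// ignored; the instance logs it and gives up leadership so another replica
// can win the lock on the next heartbeat.
func (t *tracker) postState(ctx context.Context, key string, payload []byte, ttl time.Duration) {
	if err := t.client.Set(ctx, key, payload, ttl).Err(); err != nil {
		log.Println("failed to post the state:", err)
		// proxyd additionally calls RecordGroupConsensusError here.
		t.leader = false
		return
	}
}

func main() {
	t := &tracker{client: redis.NewClient(&redis.Options{Addr: "localhost:6379"}), leader: true}
	t.postState(context.Background(), "state:example", []byte(`{}`), 30*time.Second)
}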
@@ -2,6 +2,8 @@ package proxyd
 
 import (
 	"context"
+	"fmt"
+	"regexp"
 	"strconv"
 	"strings"
 	"time"
@@ -262,6 +264,14 @@ var (
 		"backend_group_name",
 	})
 
+	consensusHAError = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: MetricsNamespace,
+		Name:      "group_consensus_ha_error",
+		Help:      "Consensus HA error count",
+	}, []string{
+		"error",
+	})
+
 	consensusHALatestBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
 		Namespace: MetricsNamespace,
 		Name:      "group_consensus_ha_latest_block",
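Per the commit message, the error metric is a counter: it only ever increases, and dashboards consume it through rate() or increase() rather than reading the raw value. The following is a self-contained sketch of registering and exposing an equivalent CounterVec; the "proxyd" namespace is an assumption about MetricsNamespace, and the label value and listen address are illustrative only.

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// consensusHAError mirrors the CounterVec added above; "proxyd" is an
// assumed value for MetricsNamespace.
var consensusHAError = promauto.NewCounterVec(prometheus.CounterOpts{
	Namespace: "proxyd",
	Name:      "group_consensus_ha_error",
	Help:      "Consensus HA error count",
}, []string{"error"})

func main() {
	// Each failure path bumps the counter; Prometheus then derives an error
	// rate with rate(proxyd_group_consensus_ha_error[5m]).
	consensusHAError.WithLabelValues("read_lock.dial_tcp_io_timeout").Inc()

	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":9090", nil))
}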
@@ -465,6 +475,16 @@ func RecordBatchSize(size int) {
	batchSizeHistogram.Observe(float64(size))
 }
 
+var nonAlphanumericRegex = regexp.MustCompile(`[^a-zA-Z ]+`)
+
+func RecordGroupConsensusError(group *BackendGroup, label string, err error) {
+	errClean := nonAlphanumericRegex.ReplaceAllString(err.Error(), "")
+	errClean = strings.ReplaceAll(errClean, " ", "_")
+	errClean = strings.ReplaceAll(errClean, "__", "_")
+	label = fmt.Sprintf("%s.%s", label, errClean)
+	consensusHAError.WithLabelValues(label).Inc()
+}
+
 func RecordGroupConsensusHALatestBlock(group *BackendGroup, leader string, blockNumber hexutil.Uint64) {
 	consensusHALatestBlock.WithLabelValues(group.Name, leader).Set(float64(blockNumber))
 }
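RecordGroupConsensusError derives the metric label from the failure site plus a sanitized error string: the regex keeps only letters and spaces, the remaining spaces become underscores, and the doubled underscores left where a stripped run sat between two spaces are collapsed; the group argument is accepted but not used as a label dimension in this version. A standalone sketch of that sanitization, using a hypothetical errorLabel helper and an illustrative error string:

package main

import (
	"fmt"
	"regexp"
	"strings"
)

var nonAlphanumericRegex = regexp.MustCompile(`[^a-zA-Z ]+`)

// errorLabel replicates the sanitization done inside RecordGroupConsensusError:
// strip non-letters, turn spaces into underscores, collapse doubled underscores,
// then join the failure site and the cleaned error text with a dot.
func errorLabel(site string, err error) string {
	errClean := nonAlphanumericRegex.ReplaceAllString(err.Error(), "")
	errClean = strings.ReplaceAll(errClean, " ", "_")
	errClean = strings.ReplaceAll(errClean, "__", "_")
	return fmt.Sprintf("%s.%s", site, errClean)
}

func main() {
	err := fmt.Errorf("dial tcp 10.0.0.1:6379: i/o timeout")
	// Prints: read_lock.dial_tcp_io_timeout
	fmt.Println(errorLabel("read_lock", err))
}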