2024-05-24 14:34:07 -07:00
|
|
|
package monitor
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/ethereum-optimism/optimism/op-conductor-mon/pkg/config"
|
|
|
|
"github.com/ethereum-optimism/optimism/op-conductor-mon/pkg/metrics"
|
|
|
|
"github.com/ethereum-optimism/optimism/op-conductor-mon/pkg/metrics/opconductor_client"
|
|
|
|
"github.com/ethereum-optimism/optimism/op-conductor/consensus"
|
|
|
|
"github.com/ethereum/go-ethereum/log"
|
|
|
|
)
|
|
|
|
|
|
|
|
func (p *Poller) cleanup(ctx context.Context) {
|
|
|
|
defer p.mutex.Unlock()
|
|
|
|
p.mutex.Lock()
|
|
|
|
for nodeName, nodeState := range p.state {
|
|
|
|
if time.Since(nodeState.updatedAt) > p.config.NodeStateExpiration {
|
|
|
|
log.Warn("node state expired",
|
|
|
|
"node", nodeName,
|
|
|
|
"updated_at", nodeState.updatedAt)
|
|
|
|
delete(p.state, nodeName)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (p *Poller) poll(ctx context.Context) {
|
|
|
|
for nodeName, nodeConfig := range p.nodesConfig {
|
|
|
|
p.pollNode(ctx, nodeName, nodeConfig)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (p *Poller) pollNode(ctx context.Context, nodeName string, nodeConfig *config.NodeConfig) {
|
|
|
|
log.Debug("polling node",
|
|
|
|
"name", nodeName,
|
|
|
|
"rpc", nodeConfig.RPCAddress)
|
|
|
|
|
|
|
|
client, err := opconductor_client.New(ctx, p.config, nodeName, nodeConfig.RPCAddress)
|
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// conductor status
|
|
|
|
paused, err := client.Paused(ctx)
|
|
|
|
if err != nil {
|
|
|
|
log.Error("cant get paused",
|
|
|
|
"node", nodeName,
|
|
|
|
"err", err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
log.Debug("got paused", "node", nodeName, "paused", paused)
|
|
|
|
|
|
|
|
stopped, err := client.Stopped(ctx)
|
|
|
|
if err != nil {
|
|
|
|
log.Error("cant get stopped",
|
|
|
|
"node", nodeName,
|
|
|
|
"err", err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
log.Debug("got stopped", "node", nodeName, "stopped", stopped)
|
|
|
|
|
|
|
|
active, err := client.Active(ctx)
|
|
|
|
if err != nil {
|
|
|
|
log.Error("cant get active",
|
|
|
|
"node", nodeName,
|
|
|
|
"err", err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
log.Debug("got active", "node", nodeName, "active", active)
|
|
|
|
|
|
|
|
// sequencer status
|
|
|
|
healthy, err := client.SequencerHealthy(ctx)
|
|
|
|
if err != nil {
|
|
|
|
log.Error("cant get sequencer healthy",
|
|
|
|
"node", nodeName,
|
|
|
|
"err", err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
log.Debug("got sequencer healthy", "node", nodeName, "healthy", healthy)
|
|
|
|
|
|
|
|
leader, err := client.Leader(ctx)
|
|
|
|
if err != nil {
|
|
|
|
log.Error("cant get leader",
|
|
|
|
"node", nodeName,
|
|
|
|
"err", err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
log.Debug("got leader", "node", nodeName, "leader", leader)
|
|
|
|
|
|
|
|
// raft status
|
|
|
|
leaderWithID, err := client.LeaderWithID(ctx)
|
|
|
|
if err != nil {
|
|
|
|
log.Error("cant get leader with id",
|
|
|
|
"node", nodeName,
|
|
|
|
"err", err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
log.Debug("got leader with id", "node", nodeName, "leader_with_id", leaderWithID)
|
|
|
|
|
|
|
|
clusterMembership, err := client.ClusterMembership(ctx)
|
|
|
|
if err != nil {
|
|
|
|
log.Error("cant get cluster membership",
|
|
|
|
"node", nodeName,
|
|
|
|
"err", err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
log.Debug("got cluster membership", "node", nodeName, "cluster_membership", clusterMembership)
|
|
|
|
|
|
|
|
// update node state
|
|
|
|
nodeState := &NodeState{
|
|
|
|
paused: paused,
|
|
|
|
stopped: stopped,
|
|
|
|
active: active,
|
|
|
|
|
|
|
|
healthy: healthy,
|
|
|
|
leader: leader,
|
|
|
|
|
|
|
|
leaderWithID: leaderWithID,
|
|
|
|
clusterMembership: clusterMembership,
|
|
|
|
|
|
|
|
updatedAt: time.Now(),
|
|
|
|
}
|
|
|
|
|
|
|
|
p.mutex.Lock()
|
|
|
|
defer p.mutex.Unlock()
|
|
|
|
|
|
|
|
p.state[nodeName] = nodeState
|
|
|
|
}
|
|
|
|
|
|
|
|
func (p *Poller) reportMetrics(ctx context.Context) {
|
|
|
|
log.Debug("report metrics",
|
|
|
|
"state_len", len(p.state))
|
|
|
|
|
|
|
|
leaderCount := 0
|
|
|
|
|
|
|
|
for nodeName, nodeState := range p.state {
|
|
|
|
// reset previous leaders from local config
|
|
|
|
for other := range p.nodesConfig {
|
|
|
|
metrics.ReportNodeLeader(nodeName, other, false)
|
|
|
|
}
|
|
|
|
// reset previous leaders from reported state
|
2024-06-20 17:09:07 -04:00
|
|
|
for _, other := range nodeState.clusterMembership.Servers {
|
2024-05-24 14:34:07 -07:00
|
|
|
metrics.ReportNodeLeader(nodeName, other.ID, false)
|
|
|
|
}
|
|
|
|
|
|
|
|
if nodeState.leader {
|
|
|
|
leaderCount++
|
|
|
|
}
|
|
|
|
|
|
|
|
p.reportNodeMetrics(ctx, nodeName, nodeState)
|
|
|
|
}
|
|
|
|
|
|
|
|
metrics.ReportLeaderCount(leaderCount)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (p *Poller) reportNodeMetrics(ctx context.Context, name string, state *NodeState) {
|
|
|
|
log.Debug("report node metrics",
|
|
|
|
"node", name)
|
|
|
|
|
|
|
|
// conductor status
|
|
|
|
metrics.RecordNodeState(name, "paused", state.paused)
|
|
|
|
metrics.RecordNodeState(name, "stopped", state.stopped)
|
|
|
|
metrics.RecordNodeState(name, "active", state.active)
|
|
|
|
|
|
|
|
// sequencer status
|
|
|
|
metrics.RecordNodeState(name, "healthy", state.healthy)
|
|
|
|
metrics.RecordNodeState(name, "leader", state.leader)
|
|
|
|
|
|
|
|
// raft status
|
|
|
|
metrics.ReportNodeLeader(name, state.leaderWithID.ID, true)
|
2024-06-20 17:09:07 -04:00
|
|
|
metrics.ReportClusterMembershipCount(name, len(state.clusterMembership.Servers))
|
2024-05-24 14:34:07 -07:00
|
|
|
|
|
|
|
voters := 0
|
2024-06-20 17:09:07 -04:00
|
|
|
for _, member := range state.clusterMembership.Servers {
|
2024-05-24 14:34:07 -07:00
|
|
|
if member.Suffrage == consensus.Voter {
|
|
|
|
voters++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
metrics.ReportClusterVotersCount(name, voters)
|
|
|
|
}
|