infra/proxyd/metrics.go

602 lines
18 KiB
Go
Raw Normal View History

package proxyd
import (
"context"
"fmt"
"regexp"
"strconv"
"strings"
2023-05-10 03:21:25 +03:00
"time"
2023-04-21 20:36:42 +03:00
"github.com/ethereum/go-ethereum/common/hexutil"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
const (
MetricsNamespace = "proxyd"
RPCRequestSourceHTTP = "http"
RPCRequestSourceWS = "ws"
BackendProxyd = "proxyd"
SourceClient = "client"
SourceBackend = "backend"
MethodUnknown = "unknown"
)
var PayloadSizeBuckets = []float64{10, 50, 100, 500, 1000, 5000, 10000, 100000, 1000000}
var MillisecondDurationBuckets = []float64{1, 10, 50, 100, 500, 1000, 5000, 10000, 100000}
var (
rpcRequestsTotal = promauto.NewCounter(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "rpc_requests_total",
Help: "Count of total client RPC requests.",
})
rpcForwardsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "rpc_forwards_total",
Help: "Count of total RPC requests forwarded to each backend.",
}, []string{
"auth",
"backend_name",
"method_name",
"source",
})
rpcBackendHTTPResponseCodesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "rpc_backend_http_response_codes_total",
Help: "Count of total backend responses by HTTP status code.",
}, []string{
"auth",
"backend_name",
"method_name",
"status_code",
"batched",
})
rpcErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "rpc_errors_total",
Help: "Count of total RPC errors.",
}, []string{
"auth",
"backend_name",
"method_name",
"error_code",
})
rpcSpecialErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "rpc_special_errors_total",
Help: "Count of total special RPC errors.",
}, []string{
"auth",
"backend_name",
"method_name",
"error_type",
})
rpcBackendRequestDurationSumm = promauto.NewSummaryVec(prometheus.SummaryOpts{
Namespace: MetricsNamespace,
Name: "rpc_backend_request_duration_seconds",
Help: "Summary of backend response times broken down by backend and method name.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
}, []string{
"backend_name",
"method_name",
"batched",
})
activeClientWsConnsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "active_client_ws_conns",
Help: "Gauge of active client WS connections.",
}, []string{
"auth",
})
activeBackendWsConnsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "active_backend_ws_conns",
Help: "Gauge of active backend WS connections.",
}, []string{
"backend_name",
})
unserviceableRequestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "unserviceable_requests_total",
Help: "Count of total requests that were rejected due to no backends being available.",
}, []string{
"auth",
"request_source",
})
httpResponseCodesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "http_response_codes_total",
Help: "Count of total HTTP response codes.",
}, []string{
"status_code",
})
httpRequestDurationSumm = promauto.NewSummary(prometheus.SummaryOpts{
Namespace: MetricsNamespace,
Name: "http_request_duration_seconds",
Help: "Summary of HTTP request durations, in seconds.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.95: 0.005, 0.99: 0.001},
})
wsMessagesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "ws_messages_total",
Help: "Count of total websocket messages including protocol control.",
}, []string{
"auth",
"backend_name",
"source",
})
redisErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "redis_errors_total",
Help: "Count of total Redis errors.",
}, []string{
"source",
})
requestPayloadSizesGauge = promauto.NewHistogramVec(prometheus.HistogramOpts{
Namespace: MetricsNamespace,
Name: "request_payload_sizes",
Help: "Histogram of client request payload sizes.",
Buckets: PayloadSizeBuckets,
}, []string{
"auth",
})
responsePayloadSizesGauge = promauto.NewHistogramVec(prometheus.HistogramOpts{
Namespace: MetricsNamespace,
Name: "response_payload_sizes",
Help: "Histogram of client response payload sizes.",
Buckets: PayloadSizeBuckets,
}, []string{
"auth",
})
cacheHitsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "cache_hits_total",
Help: "Number of cache hits.",
}, []string{
"method",
})
cacheMissesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "cache_misses_total",
Help: "Number of cache misses.",
}, []string{
"method",
})
cacheErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "cache_errors_total",
Help: "Number of cache errors.",
}, []string{
"method",
})
batchRPCShortCircuitsTotal = promauto.NewCounter(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "batch_rpc_short_circuits_total",
Help: "Count of total batch RPC short-circuits.",
})
rpcSpecialErrors = []string{
"nonce too low",
"gas price too high",
"gas price too low",
"invalid parameters",
}
redisCacheDurationSumm = promauto.NewHistogramVec(prometheus.HistogramOpts{
Namespace: MetricsNamespace,
Name: "redis_cache_duration_milliseconds",
Help: "Histogram of Redis command durations, in milliseconds.",
Buckets: MillisecondDurationBuckets,
}, []string{"command"})
tooManyRequestErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "too_many_request_errors_total",
Help: "Count of request timeouts due to too many concurrent RPCs.",
}, []string{
"backend_name",
})
batchSizeHistogram = promauto.NewHistogram(prometheus.HistogramOpts{
Namespace: MetricsNamespace,
Name: "batch_size_summary",
Help: "Summary of batch sizes",
Buckets: []float64{
1,
5,
10,
25,
50,
100,
},
})
frontendRateLimitTakeErrors = promauto.NewCounter(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "rate_limit_take_errors",
Help: "Count of errors taking frontend rate limits",
})
2023-04-18 21:57:55 +03:00
2023-04-21 20:36:42 +03:00
consensusLatestBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
2023-04-18 21:57:55 +03:00
Namespace: MetricsNamespace,
2023-04-21 20:36:42 +03:00
Name: "group_consensus_latest_block",
2023-04-18 21:57:55 +03:00
Help: "Consensus latest block",
2023-04-21 20:36:42 +03:00
}, []string{
"backend_group_name",
2023-04-18 21:57:55 +03:00
})
2023-05-27 00:22:50 +03:00
consensusSafeBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_safe_block",
Help: "Consensus safe block",
}, []string{
"backend_group_name",
})
consensusFinalizedBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_finalized_block",
Help: "Consensus finalized block",
}, []string{
"backend_group_name",
})
consensusHAError = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_ha_error",
Help: "Consensus HA error count",
}, []string{
"error",
})
2023-09-14 22:36:24 +03:00
consensusHALatestBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_ha_latest_block",
Help: "Consensus HA latest block",
}, []string{
"backend_group_name",
"leader",
})
consensusHASafeBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_ha_safe_block",
Help: "Consensus HA safe block",
}, []string{
"backend_group_name",
"leader",
})
consensusHAFinalizedBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_ha_finalized_block",
Help: "Consensus HA finalized block",
}, []string{
"backend_group_name",
"leader",
})
2023-04-18 21:57:55 +03:00
backendLatestBlockBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "backend_latest_block",
Help: "Current latest block observed per backend",
}, []string{
"backend_name",
})
2023-05-10 03:21:25 +03:00
2023-05-27 00:22:50 +03:00
backendSafeBlockBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "backend_safe_block",
Help: "Current safe block observed per backend",
}, []string{
"backend_name",
})
backendFinalizedBlockBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "backend_finalized_block",
Help: "Current finalized block observed per backend",
}, []string{
"backend_name",
})
backendUnexpectedBlockTagsBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "backend_unexpected_block_tags",
Help: "Bool gauge for unexpected block tags",
}, []string{
"backend_name",
})
2023-05-10 03:21:25 +03:00
consensusGroupCount = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_count",
2023-05-10 05:17:25 +03:00
Help: "Consensus group serving traffic count",
}, []string{
"backend_group_name",
})
consensusGroupFilteredCount = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_filtered_count",
2023-09-14 22:42:33 +03:00
Help: "Consensus group filtered out from serving traffic count",
2023-05-10 05:17:25 +03:00
}, []string{
"backend_group_name",
})
consensusGroupTotalCount = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_total_count",
Help: "Total count of candidates to be part of consensus group",
2023-05-10 03:21:25 +03:00
}, []string{
"backend_group_name",
})
consensusBannedBackends = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "consensus_backend_banned",
Help: "Bool gauge for banned backends",
}, []string{
"backend_name",
})
consensusPeerCountBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "consensus_backend_peer_count",
Help: "Peer count",
}, []string{
"backend_name",
})
consensusInSyncBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "consensus_backend_in_sync",
Help: "Bool gauge for backends in sync",
}, []string{
"backend_name",
})
consensusUpdateDelayBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "consensus_backend_update_delay",
Help: "Delay (ms) for backend update",
}, []string{
"backend_name",
})
2023-05-25 19:16:16 +03:00
avgLatencyBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "backend_avg_latency",
Help: "Average latency per backend",
}, []string{
"backend_name",
})
2023-05-31 22:41:20 +03:00
degradedBackends = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "backend_degraded",
Help: "Bool gauge for degraded backends",
}, []string{
"backend_name",
})
2023-05-27 00:22:50 +03:00
networkErrorRateBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
2023-05-25 19:16:16 +03:00
Namespace: MetricsNamespace,
2023-05-27 00:22:50 +03:00
Name: "backend_error_rate",
Help: "Request error rate per backend",
2023-05-25 19:16:16 +03:00
}, []string{
"backend_name",
})
healthyPrimaryCandidates = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "healthy_candidates",
Help: "Record the number of healthy primary candidates",
}, []string{
"backend_group_name",
})
backendGroupFallbackBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "backend_group_fallback_backenend",
Help: "Bool gauge for if a backend is a fallback for a backend group",
}, []string{
"backend_group",
"backend_name",
"fallback",
})
)
func RecordRedisError(source string) {
redisErrorsTotal.WithLabelValues(source).Inc()
}
func RecordRPCError(ctx context.Context, backendName, method string, err error) {
rpcErr, ok := err.(*RPCErr)
var code int
if ok {
MaybeRecordSpecialRPCError(ctx, backendName, method, rpcErr)
code = rpcErr.Code
} else {
code = -1
}
rpcErrorsTotal.WithLabelValues(GetAuthCtx(ctx), backendName, method, strconv.Itoa(code)).Inc()
}
func RecordWSMessage(ctx context.Context, backendName, source string) {
wsMessagesTotal.WithLabelValues(GetAuthCtx(ctx), backendName, source).Inc()
}
func RecordUnserviceableRequest(ctx context.Context, source string) {
unserviceableRequestsTotal.WithLabelValues(GetAuthCtx(ctx), source).Inc()
}
func RecordRPCForward(ctx context.Context, backendName, method, source string) {
rpcForwardsTotal.WithLabelValues(GetAuthCtx(ctx), backendName, method, source).Inc()
}
func MaybeRecordSpecialRPCError(ctx context.Context, backendName, method string, rpcErr *RPCErr) {
errMsg := strings.ToLower(rpcErr.Message)
for _, errStr := range rpcSpecialErrors {
if strings.Contains(errMsg, errStr) {
rpcSpecialErrorsTotal.WithLabelValues(GetAuthCtx(ctx), backendName, method, errStr).Inc()
return
}
}
}
func RecordRequestPayloadSize(ctx context.Context, payloadSize int) {
requestPayloadSizesGauge.WithLabelValues(GetAuthCtx(ctx)).Observe(float64(payloadSize))
}
func RecordResponsePayloadSize(ctx context.Context, payloadSize int) {
responsePayloadSizesGauge.WithLabelValues(GetAuthCtx(ctx)).Observe(float64(payloadSize))
}
func RecordCacheHit(method string) {
cacheHitsTotal.WithLabelValues(method).Inc()
}
func RecordCacheMiss(method string) {
cacheMissesTotal.WithLabelValues(method).Inc()
}
func RecordCacheError(method string) {
2023-05-16 05:38:09 +03:00
cacheErrorsTotal.WithLabelValues(method).Inc()
}
func RecordBatchSize(size int) {
batchSizeHistogram.Observe(float64(size))
}
2023-04-21 20:36:42 +03:00
var nonAlphanumericRegex = regexp.MustCompile(`[^a-zA-Z ]+`)
func RecordGroupConsensusError(group *BackendGroup, label string, err error) {
errClean := nonAlphanumericRegex.ReplaceAllString(err.Error(), "")
errClean = strings.ReplaceAll(errClean, " ", "_")
errClean = strings.ReplaceAll(errClean, "__", "_")
label = fmt.Sprintf("%s.%s", label, errClean)
consensusHAError.WithLabelValues(label).Inc()
}
2023-09-14 22:36:24 +03:00
func RecordGroupConsensusHALatestBlock(group *BackendGroup, leader string, blockNumber hexutil.Uint64) {
consensusHALatestBlock.WithLabelValues(group.Name, leader).Set(float64(blockNumber))
}
func RecordGroupConsensusHASafeBlock(group *BackendGroup, leader string, blockNumber hexutil.Uint64) {
consensusHASafeBlock.WithLabelValues(group.Name, leader).Set(float64(blockNumber))
}
func RecordGroupConsensusHAFinalizedBlock(group *BackendGroup, leader string, blockNumber hexutil.Uint64) {
consensusHAFinalizedBlock.WithLabelValues(group.Name, leader).Set(float64(blockNumber))
}
2023-05-10 03:21:25 +03:00
func RecordGroupConsensusLatestBlock(group *BackendGroup, blockNumber hexutil.Uint64) {
consensusLatestBlock.WithLabelValues(group.Name).Set(float64(blockNumber))
}
2023-05-27 00:22:50 +03:00
func RecordGroupConsensusSafeBlock(group *BackendGroup, blockNumber hexutil.Uint64) {
consensusSafeBlock.WithLabelValues(group.Name).Set(float64(blockNumber))
}
func RecordGroupConsensusFinalizedBlock(group *BackendGroup, blockNumber hexutil.Uint64) {
consensusFinalizedBlock.WithLabelValues(group.Name).Set(float64(blockNumber))
}
2023-05-10 03:21:25 +03:00
func RecordGroupConsensusCount(group *BackendGroup, count int) {
consensusGroupCount.WithLabelValues(group.Name).Set(float64(count))
}
2023-05-10 05:17:25 +03:00
func RecordGroupConsensusFilteredCount(group *BackendGroup, count int) {
consensusGroupFilteredCount.WithLabelValues(group.Name).Set(float64(count))
}
func RecordGroupTotalCount(group *BackendGroup, count int) {
consensusGroupTotalCount.WithLabelValues(group.Name).Set(float64(count))
}
2023-05-25 19:16:16 +03:00
func RecordBackendLatestBlock(b *Backend, blockNumber hexutil.Uint64) {
backendLatestBlockBackend.WithLabelValues(b.Name).Set(float64(blockNumber))
2023-04-21 20:36:42 +03:00
}
2023-05-27 00:22:50 +03:00
func RecordBackendSafeBlock(b *Backend, blockNumber hexutil.Uint64) {
backendSafeBlockBackend.WithLabelValues(b.Name).Set(float64(blockNumber))
}
func RecordBackendFinalizedBlock(b *Backend, blockNumber hexutil.Uint64) {
backendFinalizedBlockBackend.WithLabelValues(b.Name).Set(float64(blockNumber))
}
func RecordBackendUnexpectedBlockTags(b *Backend, unexpected bool) {
2023-05-27 00:25:56 +03:00
backendUnexpectedBlockTagsBackend.WithLabelValues(b.Name).Set(boolToFloat64(unexpected))
2023-05-27 00:22:50 +03:00
}
2023-05-25 19:16:16 +03:00
func RecordConsensusBackendBanned(b *Backend, banned bool) {
2023-05-27 00:22:50 +03:00
consensusBannedBackends.WithLabelValues(b.Name).Set(boolToFloat64(banned))
2023-05-10 03:21:25 +03:00
}
func RecordHealthyCandidates(b *BackendGroup, candidates int) {
healthyPrimaryCandidates.WithLabelValues(b.Name).Set(float64(candidates))
}
2023-05-25 19:16:16 +03:00
func RecordConsensusBackendPeerCount(b *Backend, peerCount uint64) {
consensusPeerCountBackend.WithLabelValues(b.Name).Set(float64(peerCount))
2023-05-10 03:21:25 +03:00
}
2023-05-25 19:16:16 +03:00
func RecordConsensusBackendInSync(b *Backend, inSync bool) {
2023-05-27 00:22:50 +03:00
consensusInSyncBackend.WithLabelValues(b.Name).Set(boolToFloat64(inSync))
2023-05-25 19:16:16 +03:00
}
2023-05-27 20:18:34 +03:00
func RecordConsensusBackendUpdateDelay(b *Backend, lastUpdate time.Time) {
// avoid recording the delay for the first update
if lastUpdate.IsZero() {
return
}
delay := time.Since(lastUpdate)
2023-05-25 19:16:16 +03:00
consensusUpdateDelayBackend.WithLabelValues(b.Name).Set(float64(delay.Milliseconds()))
}
2023-05-25 21:04:14 +03:00
func RecordBackendNetworkLatencyAverageSlidingWindow(b *Backend, avgLatency time.Duration) {
avgLatencyBackend.WithLabelValues(b.Name).Set(float64(avgLatency.Milliseconds()))
2023-05-31 22:41:20 +03:00
degradedBackends.WithLabelValues(b.Name).Set(boolToFloat64(b.IsDegraded()))
2023-05-25 19:16:16 +03:00
}
2023-05-27 00:22:50 +03:00
func RecordBackendNetworkErrorRateSlidingWindow(b *Backend, rate float64) {
networkErrorRateBackend.WithLabelValues(b.Name).Set(rate)
2023-05-10 03:21:25 +03:00
}
func RecordBackendGroupFallbacks(bg *BackendGroup, name string, fallback bool) {
backendGroupFallbackBackend.WithLabelValues(bg.Name, name, strconv.FormatBool(fallback)).Set(boolToFloat64(fallback))
}
2023-05-27 00:22:50 +03:00
func boolToFloat64(b bool) float64 {
if b {
return 1
}
return 0
2023-04-21 20:36:42 +03:00
}