From 1d9406daa6b7feaacb48601bc21079c43236b178 Mon Sep 17 00:00:00 2001 From: felipe andrade <130432649+felipe-op@users.noreply.github.com> Date: Thu, 25 May 2023 09:16:16 -0700 Subject: [PATCH] record sliding window metrics (#5782) --- proxyd/proxyd/backend.go | 9 +++++++ proxyd/proxyd/metrics.go | 56 +++++++++++++++++++++++++++++++++------- 2 files changed, 55 insertions(+), 10 deletions(-) diff --git a/proxyd/proxyd/backend.go b/proxyd/proxyd/backend.go index fdbea53..b57c938 100644 --- a/proxyd/proxyd/backend.go +++ b/proxyd/proxyd/backend.go @@ -374,6 +374,7 @@ func (b *Backend) ForwardRPC(ctx context.Context, res *RPCRes, id string, method func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool) ([]*RPCRes, error) { // we are concerned about network error rates, so we record 1 request independently of how many are in the batch b.networkRequestsSlidingWindow.Incr() + RecordBackendNetworkRequestCountSlidingWindow(b, b.networkRequestsSlidingWindow.Count()) isSingleElementBatch := len(rpcReqs) == 1 @@ -390,6 +391,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool httpReq, err := http.NewRequestWithContext(ctx, "POST", b.rpcURL, bytes.NewReader(body)) if err != nil { b.networkErrorsSlidingWindow.Incr() + RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count()) return nil, wrapErr(err, "error creating backend request") } @@ -411,6 +413,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool httpRes, err := b.client.DoLimited(httpReq) if err != nil { b.networkErrorsSlidingWindow.Incr() + RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count()) return nil, wrapErr(err, "error in backend request") } @@ -429,6 +432,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool // Alchemy returns a 400 on bad JSONs, so handle that case if httpRes.StatusCode != 200 && httpRes.StatusCode != 400 { b.networkErrorsSlidingWindow.Incr() + RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count()) return nil, fmt.Errorf("response code %d", httpRes.StatusCode) } @@ -436,6 +440,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool resB, err := io.ReadAll(io.LimitReader(httpRes.Body, b.maxResponseSize)) if err != nil { b.networkErrorsSlidingWindow.Incr() + RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count()) return nil, wrapErr(err, "error reading response body") } @@ -453,15 +458,18 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool // Infura may return a single JSON-RPC response if, for example, the batch contains a request for an unsupported method if responseIsNotBatched(resB) { b.networkErrorsSlidingWindow.Incr() + RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count()) return nil, ErrBackendUnexpectedJSONRPC } b.networkErrorsSlidingWindow.Incr() + RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count()) return nil, ErrBackendBadResponse } } if len(rpcReqs) != len(res) { b.networkErrorsSlidingWindow.Incr() + RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count()) return nil, ErrBackendUnexpectedJSONRPC } @@ -474,6 +482,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool } duration := time.Since(start) b.latencySlidingWindow.Add(float64(duration)) + RecordBackendNetworkLatencyAverageSlidingWindow(b, b.latencySlidingWindow.Avg()) sortBatchRPCResponse(rpcReqs, res) return res, nil diff --git a/proxyd/proxyd/metrics.go b/proxyd/proxyd/metrics.go index 3420b71..3f462cf 100644 --- a/proxyd/proxyd/metrics.go +++ b/proxyd/proxyd/metrics.go @@ -309,6 +309,30 @@ var ( }, []string{ "backend_name", }) + + avgLatencyBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: MetricsNamespace, + Name: "backend_avg_latency", + Help: "Average latency per backend", + }, []string{ + "backend_name", + }) + + networkErrorCountBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: MetricsNamespace, + Name: "backend_net_error_count", + Help: "Network error count per backend", + }, []string{ + "backend_name", + }) + + requestCountBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: MetricsNamespace, + Name: "backend_request_count", + Help: "Request count per backend", + }, []string{ + "backend_name", + }) ) func RecordRedisError(source string) { @@ -390,30 +414,42 @@ func RecordGroupTotalCount(group *BackendGroup, count int) { consensusGroupTotalCount.WithLabelValues(group.Name).Set(float64(count)) } -func RecordBackendLatestBlock(be *Backend, blockNumber hexutil.Uint64) { - backendLatestBlockBackend.WithLabelValues(be.Name).Set(float64(blockNumber)) +func RecordBackendLatestBlock(b *Backend, blockNumber hexutil.Uint64) { + backendLatestBlockBackend.WithLabelValues(b.Name).Set(float64(blockNumber)) } -func RecordConsensusBackendBanned(be *Backend, banned bool) { +func RecordConsensusBackendBanned(b *Backend, banned bool) { v := float64(0) if banned { v = float64(1) } - consensusBannedBackends.WithLabelValues(be.Name).Set(v) + consensusBannedBackends.WithLabelValues(b.Name).Set(v) } -func RecordConsensusBackendPeerCount(be *Backend, peerCount uint64) { - consensusPeerCountBackend.WithLabelValues(be.Name).Set(float64(peerCount)) +func RecordConsensusBackendPeerCount(b *Backend, peerCount uint64) { + consensusPeerCountBackend.WithLabelValues(b.Name).Set(float64(peerCount)) } -func RecordConsensusBackendInSync(be *Backend, inSync bool) { +func RecordConsensusBackendInSync(b *Backend, inSync bool) { v := float64(0) if inSync { v = float64(1) } - consensusInSyncBackend.WithLabelValues(be.Name).Set(v) + consensusInSyncBackend.WithLabelValues(b.Name).Set(v) } -func RecordConsensusBackendUpdateDelay(be *Backend, delay time.Duration) { - consensusUpdateDelayBackend.WithLabelValues(be.Name).Set(float64(delay.Milliseconds())) +func RecordConsensusBackendUpdateDelay(b *Backend, delay time.Duration) { + consensusUpdateDelayBackend.WithLabelValues(b.Name).Set(float64(delay.Milliseconds())) +} + +func RecordBackendNetworkLatencyAverageSlidingWindow(b *Backend, avgLatency float64) { + avgLatencyBackend.WithLabelValues(b.Name).Set(avgLatency) +} + +func RecordBackendNetworkRequestCountSlidingWindow(b *Backend, count uint) { + requestCountBackend.WithLabelValues(b.Name).Set(float64(count)) +} + +func RecordBackendNetworkErrorCountSlidingWindow(b *Backend, count uint) { + networkErrorCountBackend.WithLabelValues(b.Name).Set(float64(count)) }