From 1d9406daa6b7feaacb48601bc21079c43236b178 Mon Sep 17 00:00:00 2001
From: felipe andrade <130432649+felipe-op@users.noreply.github.com>
Date: Thu, 25 May 2023 09:16:16 -0700
Subject: [PATCH] record sliding window metrics (#5782)

---
 proxyd/proxyd/backend.go |  9 +++++++
 proxyd/proxyd/metrics.go | 56 +++++++++++++++++++++++++++++++++-------
 2 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/proxyd/proxyd/backend.go b/proxyd/proxyd/backend.go
index fdbea53..b57c938 100644
--- a/proxyd/proxyd/backend.go
+++ b/proxyd/proxyd/backend.go
@@ -374,6 +374,7 @@ func (b *Backend) ForwardRPC(ctx context.Context, res *RPCRes, id string, method
 func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool) ([]*RPCRes, error) {
 	// we are concerned about network error rates, so we record 1 request independently of how many are in the batch
 	b.networkRequestsSlidingWindow.Incr()
+	RecordBackendNetworkRequestCountSlidingWindow(b, b.networkRequestsSlidingWindow.Count())
 
 	isSingleElementBatch := len(rpcReqs) == 1
 
@@ -390,6 +391,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool
 	httpReq, err := http.NewRequestWithContext(ctx, "POST", b.rpcURL, bytes.NewReader(body))
 	if err != nil {
 		b.networkErrorsSlidingWindow.Incr()
+		RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count())
 		return nil, wrapErr(err, "error creating backend request")
 	}
 
@@ -411,6 +413,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool
 	httpRes, err := b.client.DoLimited(httpReq)
 	if err != nil {
 		b.networkErrorsSlidingWindow.Incr()
+		RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count())
 		return nil, wrapErr(err, "error in backend request")
 	}
 
@@ -429,6 +432,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool
 	// Alchemy returns a 400 on bad JSONs, so handle that case
 	if httpRes.StatusCode != 200 && httpRes.StatusCode != 400 {
 		b.networkErrorsSlidingWindow.Incr()
+		RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count())
 		return nil, fmt.Errorf("response code %d", httpRes.StatusCode)
 	}
 
@@ -436,6 +440,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool
 	resB, err := io.ReadAll(io.LimitReader(httpRes.Body, b.maxResponseSize))
 	if err != nil {
 		b.networkErrorsSlidingWindow.Incr()
+		RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count())
 		return nil, wrapErr(err, "error reading response body")
 	}
 
@@ -453,15 +458,18 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool
 			// Infura may return a single JSON-RPC response if, for example, the batch contains a request for an unsupported method
 			if responseIsNotBatched(resB) {
 				b.networkErrorsSlidingWindow.Incr()
+				RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count())
 				return nil, ErrBackendUnexpectedJSONRPC
 			}
 			b.networkErrorsSlidingWindow.Incr()
+			RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count())
 			return nil, ErrBackendBadResponse
 		}
 	}
 
 	if len(rpcReqs) != len(res) {
 		b.networkErrorsSlidingWindow.Incr()
+		RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count())
 		return nil, ErrBackendUnexpectedJSONRPC
 	}
 
@@ -474,6 +482,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool
 	}
 	duration := time.Since(start)
 	b.latencySlidingWindow.Add(float64(duration))
+	RecordBackendNetworkLatencyAverageSlidingWindow(b, b.latencySlidingWindow.Avg())
 
 	sortBatchRPCResponse(rpcReqs, res)
 	return res, nil
diff --git a/proxyd/proxyd/metrics.go b/proxyd/proxyd/metrics.go
index 3420b71..3f462cf 100644
--- a/proxyd/proxyd/metrics.go
+++ b/proxyd/proxyd/metrics.go
@@ -309,6 +309,30 @@ var (
 	}, []string{
 		"backend_name",
 	})
+
+	avgLatencyBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: MetricsNamespace,
+		Name:      "backend_avg_latency",
+		Help:      "Average latency per backend over the sliding window, in nanoseconds",
+	}, []string{
+		"backend_name",
+	})
+
+	networkErrorCountBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: MetricsNamespace,
+		Name:      "backend_net_error_count",
+		Help:      "Network error count per backend in the current sliding window",
+	}, []string{
+		"backend_name",
+	})
+
+	requestCountBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: MetricsNamespace,
+		Name:      "backend_request_count",
+		Help:      "Network request count per backend in the current sliding window (a batch counts as one request)",
+	}, []string{
+		"backend_name",
+	})
 )
 
 func RecordRedisError(source string) {
@@ -390,30 +414,42 @@ func RecordGroupTotalCount(group *BackendGroup, count int) {
 	consensusGroupTotalCount.WithLabelValues(group.Name).Set(float64(count))
 }
 
-func RecordBackendLatestBlock(be *Backend, blockNumber hexutil.Uint64) {
-	backendLatestBlockBackend.WithLabelValues(be.Name).Set(float64(blockNumber))
+func RecordBackendLatestBlock(b *Backend, blockNumber hexutil.Uint64) {
+	backendLatestBlockBackend.WithLabelValues(b.Name).Set(float64(blockNumber))
 }
 
-func RecordConsensusBackendBanned(be *Backend, banned bool) {
+func RecordConsensusBackendBanned(b *Backend, banned bool) {
 	v := float64(0)
 	if banned {
 		v = float64(1)
 	}
-	consensusBannedBackends.WithLabelValues(be.Name).Set(v)
+	consensusBannedBackends.WithLabelValues(b.Name).Set(v)
 }
 
-func RecordConsensusBackendPeerCount(be *Backend, peerCount uint64) {
-	consensusPeerCountBackend.WithLabelValues(be.Name).Set(float64(peerCount))
+func RecordConsensusBackendPeerCount(b *Backend, peerCount uint64) {
+	consensusPeerCountBackend.WithLabelValues(b.Name).Set(float64(peerCount))
 }
 
-func RecordConsensusBackendInSync(be *Backend, inSync bool) {
+func RecordConsensusBackendInSync(b *Backend, inSync bool) {
 	v := float64(0)
 	if inSync {
 		v = float64(1)
 	}
-	consensusInSyncBackend.WithLabelValues(be.Name).Set(v)
+	consensusInSyncBackend.WithLabelValues(b.Name).Set(v)
 }
 
-func RecordConsensusBackendUpdateDelay(be *Backend, delay time.Duration) {
-	consensusUpdateDelayBackend.WithLabelValues(be.Name).Set(float64(delay.Milliseconds()))
+func RecordConsensusBackendUpdateDelay(b *Backend, delay time.Duration) {
+	consensusUpdateDelayBackend.WithLabelValues(b.Name).Set(float64(delay.Milliseconds()))
+}
+
+func RecordBackendNetworkLatencyAverageSlidingWindow(b *Backend, avgLatency float64) {
+	avgLatencyBackend.WithLabelValues(b.Name).Set(avgLatency)
+}
+
+func RecordBackendNetworkRequestCountSlidingWindow(b *Backend, count uint) {
+	requestCountBackend.WithLabelValues(b.Name).Set(float64(count))
+}
+
+func RecordBackendNetworkErrorCountSlidingWindow(b *Backend, count uint) {
+	networkErrorCountBackend.WithLabelValues(b.Name).Set(float64(count))
 }