better moar tests

This commit is contained in:
Felipe Andrade 2023-05-26 14:22:50 -07:00
parent af863d39de
commit 2d9259ee20
6 changed files with 533 additions and 726 deletions

@ -374,7 +374,6 @@ func (b *Backend) ForwardRPC(ctx context.Context, res *RPCRes, id string, method
func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool) ([]*RPCRes, error) {
// we are concerned about network error rates, so we record 1 request independently of how many are in the batch
b.networkRequestsSlidingWindow.Incr()
RecordBackendNetworkRequestCountSlidingWindow(b, b.networkRequestsSlidingWindow.Count())
isSingleElementBatch := len(rpcReqs) == 1
@ -391,7 +390,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool
httpReq, err := http.NewRequestWithContext(ctx, "POST", b.rpcURL, bytes.NewReader(body))
if err != nil {
b.networkErrorsSlidingWindow.Incr()
RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count())
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
return nil, wrapErr(err, "error creating backend request")
}
@ -413,7 +412,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool
httpRes, err := b.client.DoLimited(httpReq)
if err != nil {
b.networkErrorsSlidingWindow.Incr()
RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count())
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
return nil, wrapErr(err, "error in backend request")
}
@ -432,7 +431,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool
// Alchemy returns a 400 on bad JSONs, so handle that case
if httpRes.StatusCode != 200 && httpRes.StatusCode != 400 {
b.networkErrorsSlidingWindow.Incr()
RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count())
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
return nil, fmt.Errorf("response code %d", httpRes.StatusCode)
}
@ -440,7 +439,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool
resB, err := io.ReadAll(io.LimitReader(httpRes.Body, b.maxResponseSize))
if err != nil {
b.networkErrorsSlidingWindow.Incr()
RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count())
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
return nil, wrapErr(err, "error reading response body")
}
@ -458,18 +457,18 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool
// Infura may return a single JSON-RPC response if, for example, the batch contains a request for an unsupported method
if responseIsNotBatched(resB) {
b.networkErrorsSlidingWindow.Incr()
RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count())
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
return nil, ErrBackendUnexpectedJSONRPC
}
b.networkErrorsSlidingWindow.Incr()
RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count())
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
return nil, ErrBackendBadResponse
}
}
if len(rpcReqs) != len(res) {
b.networkErrorsSlidingWindow.Incr()
RecordBackendNetworkErrorCountSlidingWindow(b, b.networkErrorsSlidingWindow.Count())
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
return nil, ErrBackendUnexpectedJSONRPC
}
@ -483,6 +482,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool
duration := time.Since(start)
b.latencySlidingWindow.Add(float64(duration))
RecordBackendNetworkLatencyAverageSlidingWindow(b, time.Duration(b.latencySlidingWindow.Avg()))
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
sortBatchRPCResponse(rpcReqs, res)
return res, nil
@ -490,11 +490,7 @@ func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool
// IsHealthy checks if the backend is able to serve traffic, based on dynamic parameters
func (b *Backend) IsHealthy() bool {
errorRate := float64(0)
// avoid division-by-zero when the window is empty
if b.networkRequestsSlidingWindow.Sum() >= 10 {
errorRate = b.networkErrorsSlidingWindow.Sum() / b.networkRequestsSlidingWindow.Sum()
}
errorRate := b.ErrorRate()
avgLatency := time.Duration(b.latencySlidingWindow.Avg())
if errorRate >= b.maxErrorRateThreshold {
return false
@ -505,6 +501,16 @@ func (b *Backend) IsHealthy() bool {
return true
}
// ErrorRate returns the instant error rate of the backend: the sum of the
// network-errors sliding window divided by the sum of the network-requests
// sliding window.
//
// The rate is reported as 0 until the request window sums to at least 10.
// This avoids false positives when the backend is just starting up, and it
// also keeps the divisor away from zero.
func (b *Backend) ErrorRate() (errorRate float64) {
	// Read the request total exactly once so the threshold check and the
	// division use the same value. With two separate Sum() calls the divisor
	// could differ from the value that was just checked if the window changes
	// in between.
	requests := b.networkRequestsSlidingWindow.Sum()
	if requests >= 10 {
		errorRate = b.networkErrorsSlidingWindow.Sum() / requests
	}
	return errorRate
}
// IsDegraded checks if the backend is serving traffic in a degraded state (i.e. used as a last resort)
func (b *Backend) IsDegraded() bool {
avgLatency := time.Duration(b.latencySlidingWindow.Avg())

@ -275,23 +275,42 @@ func (cp *ConsensusPoller) UpdateBackend(ctx context.Context, be *Backend) {
log.Warn("error updating backend", "name", be.Name, "err", err)
}
finalizedBlockNumber, _, err := cp.fetchBlock(ctx, be, "finalized")
if err != nil {
log.Warn("error updating backend", "name", be.Name, "err", err)
}
safeBlockNumber, _, err := cp.fetchBlock(ctx, be, "safe")
if err != nil {
log.Warn("error updating backend", "name", be.Name, "err", err)
}
finalizedBlockNumber, _, err := cp.fetchBlock(ctx, be, "finalized")
if err != nil {
log.Warn("error updating backend", "name", be.Name, "err", err)
}
_, _, _, _, oldFinalized, oldSafe, _, _ := cp.getBackendState(be)
expectedBlockTags := cp.checkExpectedBlockTags(finalizedBlockNumber, oldFinalized, safeBlockNumber, oldSafe, latestBlockNumber)
changed, updateDelay := cp.setBackendState(be, peerCount, inSync,
latestBlockNumber, latestBlockHash,
finalizedBlockNumber, safeBlockNumber)
if changed {
RecordBackendLatestBlock(be, latestBlockNumber)
RecordBackendSafeBlock(be, safeBlockNumber)
RecordBackendFinalizedBlock(be, finalizedBlockNumber)
RecordBackendUnexpectedBlockTags(be, !expectedBlockTags)
RecordConsensusBackendUpdateDelay(be, updateDelay)
if !expectedBlockTags {
log.Warn("backend banned - unexpected block tags",
"backend", be.Name,
"oldFinalized", oldFinalized,
"finalizedBlockNumber", finalizedBlockNumber,
"oldSafe", oldSafe,
"safeBlockNumber", safeBlockNumber,
"latestBlockNumber", latestBlockNumber,
)
cp.Ban(be)
}
if changed {
log.Debug("backend state updated",
"name", be.Name,
"peerCount", peerCount,
@ -304,6 +323,19 @@ func (cp *ConsensusPoller) UpdateBackend(ctx context.Context, be *Backend) {
}
}
// checkExpectedBlockTags reports whether the block tags observed for a backend
// satisfy every expected condition:
//   - the finalized block number has not decreased (currentFinalized >= oldFinalized)
//   - the safe block number has not decreased (currentSafe >= oldSafe)
//   - finalized <= safe <= latest (equal values are accepted)
//
// It returns false as soon as any one of these conditions is violated.
func (cp *ConsensusPoller) checkExpectedBlockTags(currentFinalized hexutil.Uint64, oldFinalized hexutil.Uint64,
	currentSafe hexutil.Uint64, oldSafe hexutil.Uint64,
	currentLatest hexutil.Uint64) bool {
	// Neither tag may move backwards relative to the previously stored value.
	if currentFinalized < oldFinalized {
		return false
	}
	if currentSafe < oldSafe {
		return false
	}
	// The tags must be ordered finalized <= safe <= latest.
	if currentFinalized > currentSafe {
		return false
	}
	return currentSafe <= currentLatest
}
// UpdateBackendGroupConsensus resolves the current group consensus based on the state of the backends
func (cp *ConsensusPoller) UpdateBackendGroupConsensus(ctx context.Context) {
var highestLatestBlock hexutil.Uint64
@ -320,6 +352,9 @@ func (cp *ConsensusPoller) UpdateBackendGroupConsensus(ctx context.Context) {
for _, be := range cp.backendGroup.Backends {
peerCount, inSync, backendLatestBlockNumber, _, _, _, lastUpdate, _ := cp.getBackendState(be)
if cp.IsBanned(be) {
continue
}
if !be.skipPeerCountCheck && peerCount < cp.minPeerCount {
continue
}
@ -339,6 +374,9 @@ func (cp *ConsensusPoller) UpdateBackendGroupConsensus(ctx context.Context) {
for _, be := range cp.backendGroup.Backends {
peerCount, inSync, backendLatestBlockNumber, backendLatestBlockHash, backendFinalizedBlockNumber, backendSafeBlockNumber, lastUpdate, _ := cp.getBackendState(be)
if cp.IsBanned(be) {
continue
}
if !be.skipPeerCountCheck && peerCount < cp.minPeerCount {
continue
}
@ -451,13 +489,17 @@ func (cp *ConsensusPoller) UpdateBackendGroupConsensus(ctx context.Context) {
}
cp.tracker.SetLatestBlockNumber(proposedBlock)
cp.tracker.SetFinalizedBlockNumber(lowestFinalizedBlock)
cp.tracker.SetSafeBlockNumber(lowestSafeBlock)
cp.tracker.SetFinalizedBlockNumber(lowestFinalizedBlock)
cp.consensusGroupMux.Lock()
cp.consensusGroup = consensusBackends
cp.consensusGroupMux.Unlock()
RecordGroupConsensusLatestBlock(cp.backendGroup, proposedBlock)
RecordGroupConsensusSafeBlock(cp.backendGroup, lowestSafeBlock)
RecordGroupConsensusFinalizedBlock(cp.backendGroup, lowestFinalizedBlock)
RecordGroupConsensusCount(cp.backendGroup, len(consensusBackends))
RecordGroupConsensusFilteredCount(cp.backendGroup, len(filteredBackendsNames))
RecordGroupTotalCount(cp.backendGroup, len(cp.backendGroup.Backends))
@ -481,13 +523,10 @@ func (cp *ConsensusPoller) Ban(be *Backend) {
bs.bannedUntil = time.Now().Add(cp.banPeriod)
}
// Unban remove any bans from the backends
func (cp *ConsensusPoller) Unban() {
// Reset removes any bans from the backends and resets their states
func (cp *ConsensusPoller) Reset() {
for _, be := range cp.backendGroup.Backends {
bs := cp.backendState[be]
bs.backendStateMux.Lock()
bs.bannedUntil = time.Now().Add(-10 * time.Hour)
bs.backendStateMux.Unlock()
cp.backendState[be] = &backendState{}
}
}

File diff suppressed because it is too large Load Diff

@ -26,63 +26,85 @@
"jsonrpc": "2.0",
"id": 67,
"result": {
"hash": "hash1",
"number": "0x1"
"hash": "hash_0x101",
"number": "0x101"
}
}
- method: eth_getBlockByNumber
block: 0x1
block: 0x101
response: >
{
"jsonrpc": "2.0",
"id": 67,
"result": {
"hash": "hash1",
"number": "0x1"
"hash": "hash_0x101",
"number": "0x101"
}
}
- method: eth_getBlockByNumber
block: 0x2
block: 0x102
response: >
{
"jsonrpc": "2.0",
"id": 67,
"result": {
"hash": "hash2",
"number": "0x2"
"hash": "hash_0x102",
"number": "0x102"
}
}
- method: eth_getBlockByNumber
block: 0x3
block: 0x103
response: >
{
"jsonrpc": "2.0",
"id": 67,
"result": {
"hash": "hash3",
"number": "0x3"
"hash": "hash_0x103",
"number": "0x103"
}
}
- method: eth_getBlockByNumber
block: finalized
block: 0x132
response: >
{
"jsonrpc": "2.0",
"id": 67,
"result": {
"hash": "hash_finalized",
"number": "0x555"
"hash": "hash_0x132",
"number": "0x132"
}
}
- method: eth_getBlockByNumber
block: 0x555
block: 0x133
response: >
{
"jsonrpc": "2.0",
"id": 67,
"result": {
"hash": "hash_finalized",
"number": "0x555"
"hash": "hash_0x133",
"number": "0x133"
}
}
- method: eth_getBlockByNumber
block: 0x134
response: >
{
"jsonrpc": "2.0",
"id": 67,
"result": {
"hash": "hash_0x134",
"number": "0x134"
}
}
- method: eth_getBlockByNumber
block: 0x200
response: >
{
"jsonrpc": "2.0",
"id": 67,
"result": {
"hash": "hash_0x200",
"number": "0x200"
}
}
- method: eth_getBlockByNumber
@ -92,40 +114,40 @@
"jsonrpc": "2.0",
"id": 67,
"result": {
"hash": "hash_safe",
"number": "0x551"
"hash": "hash_0xe1",
"number": "0xe1"
}
}
- method: eth_getBlockByNumber
block: 0x555
block: 0xe1
response: >
{
"jsonrpc": "2.0",
"id": 67,
"result": {
"hash": "hash_safe",
"number": "0x551"
"hash": "hash_0xe1",
"number": "0xe1"
}
}
- method: eth_getBlockByNumber
block: 0x5
block: finalized
response: >
{
"jsonrpc": "2.0",
"id": 67,
"result": {
"hash": "hash5",
"number": "0x5"
"hash": "hash_0xc1",
"number": "0xc1"
}
}
- method: eth_getBlockByNumber
block: 0x20
block: 0xc1
response: >
{
"jsonrpc": "2.0",
"id": 67,
"result": {
"hash": "hash20",
"number": "0x20"
"hash": "hash_0xc1",
"number": "0xc1"
}
}

@ -246,6 +246,22 @@ var (
"backend_group_name",
})
consensusSafeBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_safe_block",
Help: "Consensus safe block",
}, []string{
"backend_group_name",
})
consensusFinalizedBlock = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_finalized_block",
Help: "Consensus finalized block",
}, []string{
"backend_group_name",
})
backendLatestBlockBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "backend_latest_block",
@ -254,6 +270,30 @@ var (
"backend_name",
})
backendSafeBlockBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "backend_safe_block",
Help: "Current safe block observed per backend",
}, []string{
"backend_name",
})
backendFinalizedBlockBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "backend_finalized_block",
Help: "Current finalized block observed per backend",
}, []string{
"backend_name",
})
backendUnexpectedBlockTagsBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "backend_unexpected_block_tags",
Help: "Bool gauge for unexpected block tags",
}, []string{
"backend_name",
})
consensusGroupCount = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "group_consensus_count",
@ -318,18 +358,10 @@ var (
"backend_name",
})
networkErrorCountBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
networkErrorRateBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "backend_net_error_count",
Help: "Network error count per backend",
}, []string{
"backend_name",
})
requestCountBackend = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: MetricsNamespace,
Name: "backend_request_count",
Help: "Request count per backend",
Name: "backend_error_rate",
Help: "Request error rate per backend",
}, []string{
"backend_name",
})
@ -402,6 +434,14 @@ func RecordGroupConsensusLatestBlock(group *BackendGroup, blockNumber hexutil.Ui
consensusLatestBlock.WithLabelValues(group.Name).Set(float64(blockNumber))
}
// RecordGroupConsensusSafeBlock sets the consensusSafeBlock gauge
// ("group_consensus_safe_block"), labelled with the group's name, to blockNumber.
func RecordGroupConsensusSafeBlock(group *BackendGroup, blockNumber hexutil.Uint64) {
	gauge := consensusSafeBlock.WithLabelValues(group.Name)
	gauge.Set(float64(blockNumber))
}
// RecordGroupConsensusFinalizedBlock sets the consensusFinalizedBlock gauge
// ("group_consensus_finalized_block"), labelled with the group's name, to
// blockNumber.
func RecordGroupConsensusFinalizedBlock(group *BackendGroup, blockNumber hexutil.Uint64) {
	gauge := consensusFinalizedBlock.WithLabelValues(group.Name)
	gauge.Set(float64(blockNumber))
}
// RecordGroupConsensusCount sets the consensusGroupCount gauge
// ("group_consensus_count"), labelled with the group's name, to count.
func RecordGroupConsensusCount(group *BackendGroup, count int) {
	gauge := consensusGroupCount.WithLabelValues(group.Name)
	gauge.Set(float64(count))
}
@ -418,12 +458,20 @@ func RecordBackendLatestBlock(b *Backend, blockNumber hexutil.Uint64) {
backendLatestBlockBackend.WithLabelValues(b.Name).Set(float64(blockNumber))
}
// RecordBackendSafeBlock sets the backendSafeBlockBackend gauge
// ("backend_safe_block"), labelled with the backend's name, to blockNumber.
func RecordBackendSafeBlock(b *Backend, blockNumber hexutil.Uint64) {
	gauge := backendSafeBlockBackend.WithLabelValues(b.Name)
	gauge.Set(float64(blockNumber))
}
// RecordBackendFinalizedBlock sets the backendFinalizedBlockBackend gauge
// ("backend_finalized_block"), labelled with the backend's name, to
// blockNumber.
func RecordBackendFinalizedBlock(b *Backend, blockNumber hexutil.Uint64) {
	gauge := backendFinalizedBlockBackend.WithLabelValues(b.Name)
	gauge.Set(float64(blockNumber))
}
// RecordBackendUnexpectedBlockTags sets the backendUnexpectedBlockTagsBackend
// gauge ("backend_unexpected_block_tags"), labelled with the backend's name,
// to 1 when unexpected is true and to 0 otherwise.
func RecordBackendUnexpectedBlockTags(b *Backend, unexpected bool) {
	// This must target the unexpected-block-tags gauge. Writing the flag to
	// backendFinalizedBlockBackend would overwrite the finalized block number
	// with 0/1 and leave this metric never updated.
	backendUnexpectedBlockTagsBackend.WithLabelValues(b.Name).Set(boolToFloat64(unexpected))
}
// RecordConsensusBackendBanned sets the consensusBannedBackends gauge,
// labelled with the backend's name, to 1 when banned is true and to 0
// otherwise.
func RecordConsensusBackendBanned(b *Backend, banned bool) {
	// A single Set via boolToFloat64 replaces the hand-rolled 0/1 conversion
	// and the duplicate write of the same value.
	consensusBannedBackends.WithLabelValues(b.Name).Set(boolToFloat64(banned))
}
func RecordConsensusBackendPeerCount(b *Backend, peerCount uint64) {
@ -431,11 +479,7 @@ func RecordConsensusBackendPeerCount(b *Backend, peerCount uint64) {
}
// RecordConsensusBackendInSync sets the consensusInSyncBackend gauge, labelled
// with the backend's name, to 1 when inSync is true and to 0 otherwise.
func RecordConsensusBackendInSync(b *Backend, inSync bool) {
	// A single Set via boolToFloat64 replaces the hand-rolled 0/1 conversion
	// and the duplicate write of the same value.
	consensusInSyncBackend.WithLabelValues(b.Name).Set(boolToFloat64(inSync))
}
func RecordConsensusBackendUpdateDelay(b *Backend, delay time.Duration) {
@ -446,10 +490,13 @@ func RecordBackendNetworkLatencyAverageSlidingWindow(b *Backend, avgLatency time
avgLatencyBackend.WithLabelValues(b.Name).Set(float64(avgLatency.Milliseconds()))
}
func RecordBackendNetworkRequestCountSlidingWindow(b *Backend, count uint) {
requestCountBackend.WithLabelValues(b.Name).Set(float64(count))
// RecordBackendNetworkErrorRateSlidingWindow sets the networkErrorRateBackend
// gauge ("backend_error_rate"), labelled with the backend's name, to rate.
func RecordBackendNetworkErrorRateSlidingWindow(b *Backend, rate float64) {
	gauge := networkErrorRateBackend.WithLabelValues(b.Name)
	gauge.Set(rate)
}
func RecordBackendNetworkErrorCountSlidingWindow(b *Backend, count uint) {
networkErrorCountBackend.WithLabelValues(b.Name).Set(float64(count))
// boolToFloat64 converts a boolean into the numeric form used for bool-style
// gauges: true becomes 1 and false becomes 0.
func boolToFloat64(b bool) float64 {
	var v float64 // zero value covers the false case
	if b {
		v = 1
	}
	return v
}

@ -10,8 +10,8 @@ import (
// RewriteContext holds the latest, safe and finalized block numbers.
// NOTE(review): presumably these are the consensus values consulted when
// rewriting block tags in requests — confirm against the callers.
type RewriteContext struct {
	latest    hexutil.Uint64
	safe      hexutil.Uint64
	finalized hexutil.Uint64 // declared once; a duplicate field name does not compile
}
type RewriteResult uint8