infra/proxyd/backend.go


package proxyd
import (
"bytes"
"context"
"crypto/tls"
"encoding/json"
"errors"
"fmt"
"io"
"math"
"math/rand"
"net/http"
"sort"
"strconv"
"strings"
"sync"
"time"
sw "github.com/ethereum-optimism/optimism/proxyd/pkg/avg-sliding-window"
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/rpc"
"github.com/gorilla/websocket"
"github.com/prometheus/client_golang/prometheus"
"github.com/xaionaro-go/weightedshuffle"
"golang.org/x/sync/semaphore"
)
const (
JSONRPCVersion = "2.0"
JSONRPCErrorInternal = -32000
notFoundRpcError = -32601
)
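// The proxyd-specific errors below are defined at offsets from JSONRPCErrorInternal (-32000)
// and carry the HTTP status to use when the error is returned over HTTP. As an
// illustrative sketch (the id value is assumed), a rate-limited caller gets a 429
// response whose JSON-RPC error looks roughly like:
//
//	{"jsonrpc":"2.0","error":{"code":-32016,"message":"over rate limit"},"id":1}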
var (
ErrParseErr = &RPCErr{
Code: -32700,
Message: "parse error",
HTTPErrorCode: 400,
}
ErrInternal = &RPCErr{
Code: JSONRPCErrorInternal,
Message: "internal error",
HTTPErrorCode: 500,
}
ErrMethodNotWhitelisted = &RPCErr{
Code: notFoundRpcError,
Message: "rpc method is not whitelisted",
HTTPErrorCode: 403,
}
ErrBackendOffline = &RPCErr{
Code: JSONRPCErrorInternal - 10,
Message: "backend offline",
HTTPErrorCode: 503,
}
ErrNoBackends = &RPCErr{
Code: JSONRPCErrorInternal - 11,
Message: "no backends available for method",
HTTPErrorCode: 503,
}
ErrBackendOverCapacity = &RPCErr{
Code: JSONRPCErrorInternal - 12,
Message: "backend is over capacity",
HTTPErrorCode: 429,
}
ErrBackendBadResponse = &RPCErr{
Code: JSONRPCErrorInternal - 13,
Message: "backend returned an invalid response",
HTTPErrorCode: 500,
}
ErrTooManyBatchRequests = &RPCErr{
Code: JSONRPCErrorInternal - 14,
Message: "too many RPC calls in batch request",
}
ErrGatewayTimeout = &RPCErr{
Code: JSONRPCErrorInternal - 15,
Message: "gateway timeout",
HTTPErrorCode: 504,
}
ErrOverRateLimit = &RPCErr{
Code: JSONRPCErrorInternal - 16,
Message: "over rate limit",
HTTPErrorCode: 429,
}
ErrOverSenderRateLimit = &RPCErr{
Code: JSONRPCErrorInternal - 17,
Message: "sender is over rate limit",
HTTPErrorCode: 429,
}
ErrNotHealthy = &RPCErr{
Code: JSONRPCErrorInternal - 18,
Message: "backend is currently not healthy to serve traffic",
HTTPErrorCode: 503,
}
ErrBlockOutOfRange = &RPCErr{
Code: JSONRPCErrorInternal - 19,
Message: "block is out of range",
HTTPErrorCode: 400,
}
ErrRequestBodyTooLarge = &RPCErr{
Code: JSONRPCErrorInternal - 21,
Message: "request body too large",
HTTPErrorCode: 413,
}
ErrBackendResponseTooLarge = &RPCErr{
Code: JSONRPCErrorInternal - 20,
Message: "backend response too large",
HTTPErrorCode: 500,
}
ErrBackendUnexpectedJSONRPC = errors.New("backend returned an unexpected JSON-RPC response")
ErrConsensusGetReceiptsCantBeBatched = errors.New("consensus_getReceipts cannot be batched")
ErrConsensusGetReceiptsInvalidTarget = errors.New("unsupported consensus_receipts_target")
)
func ErrInvalidRequest(msg string) *RPCErr {
return &RPCErr{
Code: -32600,
Message: msg,
HTTPErrorCode: 400,
}
}
func ErrInvalidParams(msg string) *RPCErr {
return &RPCErr{
Code: -32602,
Message: msg,
HTTPErrorCode: 400,
}
}
type Backend struct {
Name string
rpcURL string
receiptsTarget string
wsURL string
authUsername string
authPassword string
headers map[string]string
client *LimitedHTTPClient
dialer *websocket.Dialer
maxRetries int
maxResponseSize int64
maxRPS int
maxWSConns int
outOfServiceInterval time.Duration
stripTrailingXFF bool
proxydIP string
skipPeerCountCheck bool
forcedCandidate bool
maxDegradedLatencyThreshold time.Duration
maxLatencyThreshold time.Duration
maxErrorRateThreshold float64
latencySlidingWindow *sw.AvgSlidingWindow
networkRequestsSlidingWindow *sw.AvgSlidingWindow
intermittentErrorsSlidingWindow *sw.AvgSlidingWindow
weight int
}
type BackendOpt func(b *Backend)
func WithBasicAuth(username, password string) BackendOpt {
return func(b *Backend) {
b.authUsername = username
b.authPassword = password
}
}
func WithHeaders(headers map[string]string) BackendOpt {
return func(b *Backend) {
b.headers = headers
}
}
func WithTimeout(timeout time.Duration) BackendOpt {
return func(b *Backend) {
b.client.Timeout = timeout
}
}
func WithMaxRetries(retries int) BackendOpt {
return func(b *Backend) {
b.maxRetries = retries
}
}
func WithMaxResponseSize(size int64) BackendOpt {
return func(b *Backend) {
b.maxResponseSize = size
}
}
func WithOutOfServiceDuration(interval time.Duration) BackendOpt {
return func(b *Backend) {
b.outOfServiceInterval = interval
}
}
func WithMaxRPS(maxRPS int) BackendOpt {
return func(b *Backend) {
b.maxRPS = maxRPS
}
}
func WithMaxWSConns(maxConns int) BackendOpt {
return func(b *Backend) {
b.maxWSConns = maxConns
}
}
func WithTLSConfig(tlsConfig *tls.Config) BackendOpt {
return func(b *Backend) {
if b.client.Transport == nil {
b.client.Transport = &http.Transport{}
}
b.client.Transport.(*http.Transport).TLSClientConfig = tlsConfig
}
}
func WithStrippedTrailingXFF() BackendOpt {
return func(b *Backend) {
b.stripTrailingXFF = true
}
}
func WithProxydIP(ip string) BackendOpt {
return func(b *Backend) {
b.proxydIP = ip
}
}
func WithConsensusSkipPeerCountCheck(skipPeerCountCheck bool) BackendOpt {
return func(b *Backend) {
b.skipPeerCountCheck = skipPeerCountCheck
}
}
func WithConsensusForcedCandidate(forcedCandidate bool) BackendOpt {
return func(b *Backend) {
b.forcedCandidate = forcedCandidate
}
}
func WithWeight(weight int) BackendOpt {
return func(b *Backend) {
b.weight = weight
}
}
func WithMaxDegradedLatencyThreshold(maxDegradedLatencyThreshold time.Duration) BackendOpt {
return func(b *Backend) {
b.maxDegradedLatencyThreshold = maxDegradedLatencyThreshold
}
}
func WithMaxLatencyThreshold(maxLatencyThreshold time.Duration) BackendOpt {
return func(b *Backend) {
b.maxLatencyThreshold = maxLatencyThreshold
}
}
func WithMaxErrorRateThreshold(maxErrorRateThreshold float64) BackendOpt {
return func(b *Backend) {
b.maxErrorRateThreshold = maxErrorRateThreshold
}
}
func WithConsensusReceiptTarget(receiptsTarget string) BackendOpt {
return func(b *Backend) {
b.receiptsTarget = receiptsTarget
}
}
func WithIntermittentNetworkErrorSlidingWindow(sw *sw.AvgSlidingWindow) BackendOpt {
return func(b *Backend) {
b.intermittentErrorsSlidingWindow = sw
}
}
type indexedReqRes struct {
index int
req *RPCReq
res *RPCRes
}
const proxydHealthzMethod = "proxyd_healthz"
const ConsensusGetReceiptsMethod = "consensus_getReceipts"
const ReceiptsTargetDebugGetRawReceipts = "debug_getRawReceipts"
const ReceiptsTargetAlchemyGetTransactionReceipts = "alchemy_getTransactionReceipts"
const ReceiptsTargetParityGetTransactionReceipts = "parity_getBlockReceipts"
const ReceiptsTargetEthGetTransactionReceipts = "eth_getBlockReceipts"
type ConsensusGetReceiptsResult struct {
Method string `json:"method"`
Result interface{} `json:"result"`
}
// BlockHashOrNumberParameter is a non-conventional wrapper used by alchemy_getTransactionReceipts
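// For illustration (hash value borrowed from the example further below), the params
// array carries a single object with either field set:
//
//	[{"blockHash":"0xc6ef2fc5426d6ad6fd9e2a26abeab0aa2411b7ab17f30a99d3cb96aed1d1055b"}]
//	[{"blockNumber":"0x1b4"}]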
type BlockHashOrNumberParameter struct {
BlockHash *common.Hash `json:"blockHash"`
BlockNumber *rpc.BlockNumber `json:"blockNumber"`
}
func NewBackend(
name string,
rpcURL string,
wsURL string,
rpcSemaphore *semaphore.Weighted,
opts ...BackendOpt,
) *Backend {
backend := &Backend{
Name: name,
rpcURL: rpcURL,
wsURL: wsURL,
maxResponseSize: math.MaxInt64,
client: &LimitedHTTPClient{
Client: http.Client{Timeout: 5 * time.Second},
sem: rpcSemaphore,
backendName: name,
},
dialer: &websocket.Dialer{},
maxLatencyThreshold: 10 * time.Second,
maxDegradedLatencyThreshold: 5 * time.Second,
maxErrorRateThreshold: 0.5,
latencySlidingWindow: sw.NewSlidingWindow(),
networkRequestsSlidingWindow: sw.NewSlidingWindow(),
intermittentErrorsSlidingWindow: sw.NewSlidingWindow(),
}
backend.Override(opts...)
if !backend.stripTrailingXFF && backend.proxydIP == "" {
log.Warn("proxied requests' XFF header will not contain the proxyd ip address")
}
return backend
}
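// A Backend is typically built from config through the functional options above;
// a minimal sketch (names, URLs and limits here are illustrative assumptions):
//
//	sem := semaphore.NewWeighted(100)
//	be := NewBackend("example", "http://localhost:8545", "ws://localhost:8546", sem,
//		WithTimeout(10*time.Second),
//		WithMaxRetries(2),
//	)
//
// Override re-applies options on an existing backend, with later options winning.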
func (b *Backend) Override(opts ...BackendOpt) {
for _, opt := range opts {
opt(b)
}
}
func (b *Backend) Forward(ctx context.Context, reqs []*RPCReq, isBatch bool) ([]*RPCRes, error) {
var lastError error
// <= to account for the first attempt not technically being
// a retry
for i := 0; i <= b.maxRetries; i++ {
RecordBatchRPCForward(ctx, b.Name, reqs, RPCRequestSourceHTTP)
metricLabelMethod := reqs[0].Method
if isBatch {
metricLabelMethod = "<batch>"
}
timer := prometheus.NewTimer(
rpcBackendRequestDurationSumm.WithLabelValues(
b.Name,
metricLabelMethod,
strconv.FormatBool(isBatch),
),
)
log.Trace(
"forwarding request to backend",
"name", b.Name,
"req_id", GetReqID(ctx),
"attempt_count", i+1,
"max_attempts", b.maxRetries+1,
"method", metricLabelMethod,
)
res, err := b.doForward(ctx, reqs, isBatch)
switch err {
case nil: // do nothing
case ErrBackendResponseTooLarge:
log.Warn(
"backend response too large",
"name", b.Name,
"req_id", GetReqID(ctx),
"max", b.maxResponseSize,
"method", metricLabelMethod,
)
RecordBatchRPCError(ctx, b.Name, reqs, err)
case ErrConsensusGetReceiptsCantBeBatched:
log.Warn(
"Received unsupported batch request for consensus_getReceipts",
"name", b.Name,
"req_id", GetReqID(ctx),
"err", err,
)
case ErrConsensusGetReceiptsInvalidTarget:
log.Error(
"Unsupported consensus_receipts_target for consensus_getReceipts",
"name", b.Name,
"req_id", GetReqID(ctx),
"err", err,
)
// ErrBackendUnexpectedJSONRPC occurs because infura responds with a single JSON-RPC object
// to a batch request whenever any Request Object in the batch would induce a partial error.
// We don't label the backend offline in this case. But the error is still returned to
// callers so failover can occur if needed.
case ErrBackendUnexpectedJSONRPC:
log.Debug(
"Received unexpected JSON-RPC response",
"name", b.Name,
"req_id", GetReqID(ctx),
"err", err,
)
default:
lastError = err
log.Warn(
"backend request failed, trying again",
"name", b.Name,
"req_id", GetReqID(ctx),
"err", err,
"method", metricLabelMethod,
"attempt_count", i+1,
"max_retries", b.maxRetries+1,
)
timer.ObserveDuration()
RecordBatchRPCError(ctx, b.Name, reqs, err)
sleepContext(ctx, calcBackoff(i))
continue
}
timer.ObserveDuration()
MaybeRecordErrorsInRPCRes(ctx, b.Name, reqs, res)
return res, err
}
return nil, wrapErr(lastError, "permanent error forwarding request")
}
func (b *Backend) ProxyWS(clientConn *websocket.Conn, methodWhitelist *StringSet) (*WSProxier, error) {
backendConn, _, err := b.dialer.Dial(b.wsURL, nil) // nolint:bodyclose
if err != nil {
return nil, wrapErr(err, "error dialing backend")
}
activeBackendWsConnsGauge.WithLabelValues(b.Name).Inc()
return NewWSProxier(b, clientConn, backendConn, methodWhitelist), nil
}
// ForwardRPC makes a call directly to a backend and populates the response into `res`
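// A minimal usage sketch (the request id "1" and the params here are assumptions,
// not taken from any actual caller):
//
//	var res RPCRes
//	err := backend.ForwardRPC(ctx, &res, "1", "eth_getBlockByNumber", "latest", false)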
func (b *Backend) ForwardRPC(ctx context.Context, res *RPCRes, id string, method string, params ...any) error {
jsonParams, err := json.Marshal(params)
if err != nil {
return err
}
rpcReq := RPCReq{
JSONRPC: JSONRPCVersion,
Method: method,
Params: jsonParams,
ID: []byte(id),
}
slicedRes, err := b.doForward(ctx, []*RPCReq{&rpcReq}, false)
if err != nil {
return err
}
if len(slicedRes) != 1 {
return fmt.Errorf("unexpected response len for non-batched request (len != 1)")
}
if slicedRes[0].IsError() {
return fmt.Errorf("%s", slicedRes[0].Error.Error())
}
*res = *(slicedRes[0])
return nil
}
func (b *Backend) doForward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool) ([]*RPCRes, error) {
// we are concerned about network error rates, so we record 1 request independently of how many are in the batch
b.networkRequestsSlidingWindow.Incr()
translatedReqs := make(map[string]*RPCReq, len(rpcReqs))
// translate consensus_getReceipts to receipts target
// right now we only support non-batched
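// As an illustrative sketch (values assumed): with receiptsTarget set to
// eth_getBlockReceipts, an incoming
//	{"method":"consensus_getReceipts","params":[{"blockHash":"0xc6ef..."}]}
// is rewritten below to
//	{"method":"eth_getBlockReceipts","params":["0xc6ef..."]}
// and remembered in translatedReqs so its response can later be wrapped in
// ConsensusGetReceiptsResult.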
if isBatch {
for _, rpcReq := range rpcReqs {
if rpcReq.Method == ConsensusGetReceiptsMethod {
return nil, ErrConsensusGetReceiptsCantBeBatched
}
}
} else {
for _, rpcReq := range rpcReqs {
if rpcReq.Method == ConsensusGetReceiptsMethod {
translatedReqs[string(rpcReq.ID)] = rpcReq
rpcReq.Method = b.receiptsTarget
var reqParams []rpc.BlockNumberOrHash
err := json.Unmarshal(rpcReq.Params, &reqParams)
if err != nil {
return nil, ErrInvalidRequest("invalid request")
}
var translatedParams []byte
switch rpcReq.Method {
case ReceiptsTargetDebugGetRawReceipts,
ReceiptsTargetEthGetTransactionReceipts,
ReceiptsTargetParityGetTransactionReceipts:
// conventional methods use an array of strings having either block number or block hash
// i.e. ["0xc6ef2fc5426d6ad6fd9e2a26abeab0aa2411b7ab17f30a99d3cb96aed1d1055b"]
params := make([]string, 1)
if reqParams[0].BlockNumber != nil {
params[0] = reqParams[0].BlockNumber.String()
} else {
params[0] = reqParams[0].BlockHash.Hex()
}
translatedParams = mustMarshalJSON(params)
case ReceiptsTargetAlchemyGetTransactionReceipts:
// alchemy uses an array of object with either block number or block hash
// i.e. [{ blockHash: "0xc6ef2fc5426d6ad6fd9e2a26abeab0aa2411b7ab17f30a99d3cb96aed1d1055b" }]
params := make([]BlockHashOrNumberParameter, 1)
if reqParams[0].BlockNumber != nil {
params[0].BlockNumber = reqParams[0].BlockNumber
} else {
params[0].BlockHash = reqParams[0].BlockHash
}
translatedParams = mustMarshalJSON(params)
default:
return nil, ErrConsensusGetReceiptsInvalidTarget
}
rpcReq.Params = translatedParams
}
}
}
isSingleElementBatch := len(rpcReqs) == 1
// Single element batches are unwrapped before being sent
// since Alchemy handles single requests better than batches.
var body []byte
if isSingleElementBatch {
body = mustMarshalJSON(rpcReqs[0])
} else {
body = mustMarshalJSON(rpcReqs)
}
httpReq, err := http.NewRequestWithContext(ctx, "POST", b.rpcURL, bytes.NewReader(body))
if err != nil {
b.intermittentErrorsSlidingWindow.Incr()
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
return nil, wrapErr(err, "error creating backend request")
}
if b.authPassword != "" {
httpReq.SetBasicAuth(b.authUsername, b.authPassword)
}
xForwardedFor := GetXForwardedFor(ctx)
if b.stripTrailingXFF {
xForwardedFor = stripXFF(xForwardedFor)
} else if b.proxydIP != "" {
xForwardedFor = fmt.Sprintf("%s, %s", xForwardedFor, b.proxydIP)
}
httpReq.Header.Set("content-type", "application/json")
httpReq.Header.Set("X-Forwarded-For", xForwardedFor)
for name, value := range b.headers {
httpReq.Header.Set(name, value)
}
start := time.Now()
httpRes, err := b.client.DoLimited(httpReq)
if err != nil {
b.intermittentErrorsSlidingWindow.Incr()
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
return nil, wrapErr(err, "error in backend request")
}
metricLabelMethod := rpcReqs[0].Method
if isBatch {
metricLabelMethod = "<batch>"
}
rpcBackendHTTPResponseCodesTotal.WithLabelValues(
GetAuthCtx(ctx),
b.Name,
metricLabelMethod,
strconv.Itoa(httpRes.StatusCode),
strconv.FormatBool(isBatch),
).Inc()
// Alchemy returns a 400 on bad JSONs, so handle that case
if httpRes.StatusCode != 200 && httpRes.StatusCode != 400 {
b.intermittentErrorsSlidingWindow.Incr()
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
return nil, fmt.Errorf("response code %d", httpRes.StatusCode)
}
defer httpRes.Body.Close()
resB, err := io.ReadAll(LimitReader(httpRes.Body, b.maxResponseSize))
if errors.Is(err, ErrLimitReaderOverLimit) {
return nil, ErrBackendResponseTooLarge
}
if err != nil {
b.intermittentErrorsSlidingWindow.Incr()
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
return nil, wrapErr(err, "error reading response body")
}
var rpcRes []*RPCRes
if isSingleElementBatch {
var singleRes RPCRes
if err := json.Unmarshal(resB, &singleRes); err != nil {
return nil, ErrBackendBadResponse
}
rpcRes = []*RPCRes{
&singleRes,
}
} else {
if err := json.Unmarshal(resB, &rpcRes); err != nil {
// Infura may return a single JSON-RPC response if, for example, the batch contains a request for an unsupported method
if responseIsNotBatched(resB) {
b.intermittentErrorsSlidingWindow.Incr()
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
return nil, ErrBackendUnexpectedJSONRPC
}
b.intermittentErrorsSlidingWindow.Incr()
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
return nil, ErrBackendBadResponse
}
}
if len(rpcReqs) != len(rpcRes) {
b.intermittentErrorsSlidingWindow.Incr()
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
return nil, ErrBackendUnexpectedJSONRPC
}
// capture the HTTP status code in the response. this will only
// ever be 400 given the status check above.
if httpRes.StatusCode != 200 {
for _, res := range rpcRes {
res.Error.HTTPErrorCode = httpRes.StatusCode
}
}
duration := time.Since(start)
b.latencySlidingWindow.Add(float64(duration))
RecordBackendNetworkLatencyAverageSlidingWindow(b, time.Duration(b.latencySlidingWindow.Avg()))
RecordBackendNetworkErrorRateSlidingWindow(b, b.ErrorRate())
// enrich the response with the actual request method
for _, res := range rpcRes {
translatedReq, exist := translatedReqs[string(res.ID)]
if exist {
res.Result = ConsensusGetReceiptsResult{
Method: translatedReq.Method,
Result: res.Result,
}
}
}
sortBatchRPCResponse(rpcReqs, rpcRes)
return rpcRes, nil
}
// IsHealthy checks if the backend is able to serve traffic, based on dynamic parameters
func (b *Backend) IsHealthy() bool {
errorRate := b.ErrorRate()
avgLatency := time.Duration(b.latencySlidingWindow.Avg())
if errorRate >= b.maxErrorRateThreshold {
return false
}
if avgLatency >= b.maxLatencyThreshold {
return false
}
return true
}
// ErrorRate returns the instant error rate of the backend
func (b *Backend) ErrorRate() (errorRate float64) {
// we only really start counting the error rate after a minimum of 10 requests
// this is to avoid false positives when the backend is just starting up
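// e.g. 11 intermittent errors over 20 recorded requests yields 0.55, which exceeds
// the default 0.5 maxErrorRateThreshold used by IsHealthy; with only 9 requests the
// rate stays at 0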
if b.networkRequestsSlidingWindow.Sum() >= 10 {
errorRate = b.intermittentErrorsSlidingWindow.Sum() / b.networkRequestsSlidingWindow.Sum()
}
return errorRate
}
// IsDegraded checks if the backend is serving traffic in a degraded state (i.e. used as a last resort)
func (b *Backend) IsDegraded() bool {
avgLatency := time.Duration(b.latencySlidingWindow.Avg())
return avgLatency >= b.maxDegradedLatencyThreshold
}
func responseIsNotBatched(b []byte) bool {
var r RPCRes
return json.Unmarshal(b, &r) == nil
}
// sortBatchRPCResponse sorts the RPCRes slice according to the position of its corresponding ID in the RPCReq slice
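// For example, if the batch was sent with IDs ["2","1"] and the backend answered
// in the order ["1","2"], the responses are reordered to ["2","1"] so each response
// lines up with its request.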
func sortBatchRPCResponse(req []*RPCReq, res []*RPCRes) {
pos := make(map[string]int, len(req))
for i, r := range req {
key := string(r.ID)
if _, ok := pos[key]; ok {
panic("bug! detected requests with duplicate IDs")
}
pos[key] = i
}
sort.Slice(res, func(i, j int) bool {
l := res[i].ID
r := res[j].ID
return pos[string(l)] < pos[string(r)]
})
}
type BackendGroup struct {
Name string
Backends []*Backend
WeightedRouting bool
Consensus *ConsensusPoller
FallbackBackends map[string]bool
routingStrategy RoutingStrategy
}
func (bg *BackendGroup) GetRoutingStrategy() RoutingStrategy {
return bg.routingStrategy
}
func (bg *BackendGroup) Fallbacks() []*Backend {
fallbacks := []*Backend{}
for _, a := range bg.Backends {
if fallback, ok := bg.FallbackBackends[a.Name]; ok && fallback {
fallbacks = append(fallbacks, a)
}
}
return fallbacks
}
func (bg *BackendGroup) Primaries() []*Backend {
primaries := []*Backend{}
for _, a := range bg.Backends {
fallback, ok := bg.FallbackBackends[a.Name]
if ok && !fallback {
primaries = append(primaries, a)
}
}
return primaries
}
// NOTE: BackendGroup.Forward contains the logic for load balancing with consensus awareness
func (bg *BackendGroup) Forward(ctx context.Context, rpcReqs []*RPCReq, isBatch bool) ([]*RPCRes, string, error) {
if len(rpcReqs) == 0 {
return nil, "", nil
}
backends := bg.orderedBackendsForRequest()
overriddenResponses := make([]*indexedReqRes, 0)
rewrittenReqs := make([]*RPCReq, 0, len(rpcReqs))
// When routing_strategy is set to `consensus_aware`, the backend group acts as a load balancer
// serving traffic from any backend that agrees with the consensus group.
// We also rewrite block tags to enforce compliance with consensus.
if bg.Consensus != nil {
rpcReqs, overriddenResponses = bg.OverwriteConsensusResponses(rpcReqs, overriddenResponses, rewrittenReqs)
}
// When routing_strategy is set to 'multicall', the request is forwarded to all backends
// and the first successful response is returned
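// e.g. a lone {"method":"eth_sendRawTransaction","params":["0x02f8..."]} request
// qualifies (illustrative raw tx); batches and other methods fall through to the
// normal path below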
if bg.GetRoutingStrategy() == MulticallRoutingStrategy && isValidMulticallTx(rpcReqs) && !isBatch {
backendResp := bg.ExecuteMulticall(ctx, rpcReqs)
return backendResp.RPCRes, backendResp.ServedBy, backendResp.error
}
rpcRequestsTotal.Inc()
ch := make(chan BackendGroupRPCResponse)
go func() {
defer close(ch)
backendResp := bg.ForwardRequestToBackendGroup(rpcReqs, backends, ctx, isBatch)
ch <- *backendResp
}()
backendResp := <-ch
if backendResp.error != nil {
log.Error("error serving requests",
"req_id", GetReqID(ctx),
"auth", GetAuthCtx(ctx),
"err", backendResp.error,
)
return backendResp.RPCRes, backendResp.ServedBy, backendResp.error
}
// re-apply overridden responses
log.Trace("successfully served request overriding responses",
"req_id", GetReqID(ctx),
"auth", GetAuthCtx(ctx),
)
res := OverrideResponses(backendResp.RPCRes, overriddenResponses)
return res, backendResp.ServedBy, backendResp.error
}
func isValidMulticallTx(rpcReqs []*RPCReq) bool {
if len(rpcReqs) == 1 {
if rpcReqs[0].Method == "eth_sendRawTransaction" {
return true
}
}
return false
}
// A dedicated struct is used since ServedBy may not be populated if an error occurs
type multicallTuple struct {
response *BackendGroupRPCResponse
backendName string
}
func (bg *BackendGroup) ExecuteMulticall(ctx context.Context, rpcReqs []*RPCReq) *BackendGroupRPCResponse {
// Create ctx without cancel so background tasks process
// after original request returns
bgCtx := context.WithoutCancel(ctx)
log.Info("executing multicall routing strategy",
"req_id", GetReqID(bgCtx),
"auth", GetAuthCtx(bgCtx),
)
var wg sync.WaitGroup
ch := make(chan *multicallTuple, len(bg.Backends))
for _, backend := range bg.Backends {
wg.Add(1)
go bg.MulticallRequest(backend, rpcReqs, &wg, bgCtx, ch)
}
go func() {
wg.Wait()
log.Debug("closing multicall channel",
"req_id", GetReqID(bgCtx),
"auth", GetAuthCtx(bgCtx),
)
close(ch)
}()
return bg.ProcessMulticallResponses(ch, bgCtx)
}
func (bg *BackendGroup) MulticallRequest(backend *Backend, rpcReqs []*RPCReq, wg *sync.WaitGroup, ctx context.Context, ch chan *multicallTuple) {
defer wg.Done()
log.Debug("forwarding multicall request to backend",
"req_id", GetReqID(ctx),
"auth", GetAuthCtx(ctx),
"backend", backend.Name,
)
RecordBackendGroupMulticallRequest(bg, backend.Name)
backendResp := bg.ForwardRequestToBackendGroup(rpcReqs, []*Backend{backend}, ctx, false)
multicallResp := &multicallTuple{
response: backendResp,
backendName: backend.Name,
}
log.Debug("placing multicall response into channel",
"req_id", GetReqID(ctx),
"auth", GetAuthCtx(ctx),
"backend", backend.Name,
)
ch <- multicallResp
log.Trace("placed multicall response into channel",
"req_id", GetReqID(ctx),
"auth", GetAuthCtx(ctx),
"backend", backend.Name,
)
if backendResp.error != nil {
log.Error("received multicall error response from backend",
"req_id", GetReqID(ctx),
"auth", GetAuthCtx(ctx),
"backend", backend.Name,
"error", backendResp.error.Error(),
)
RecordBackendGroupMulticallCompletion(bg, backend.Name, backendResp.error.Error())
} else {
RecordBackendGroupMulticallCompletion(bg, backend.Name, "nil")
}
}
func (bg *BackendGroup) ProcessMulticallResponses(ch chan *multicallTuple, ctx context.Context) *BackendGroupRPCResponse {
var finalResp *BackendGroupRPCResponse
i := 0
for {
multicallResp, ok := <-ch
if !ok {
log.Trace("multicall response channel closed",
"req_id", GetReqID(ctx),
"auth", GetAuthCtx(ctx),
"response_count", i,
)
if i > 0 {
return finalResp
}
return &BackendGroupRPCResponse{
RPCRes: nil,
ServedBy: "",
error: errors.New("no multicall response received"),
}
}
i++
resp := multicallResp.response
backendName := multicallResp.backendName
if resp.error != nil {
log.Error("received error response from multicall channel",
"req_id", GetReqID(ctx),
"auth", GetAuthCtx(ctx),
"err", resp.error,
"backend", backendName,
)
finalResp = resp
continue
}
log.Info("received successful response from multicall channel",
"req_id", GetReqID(ctx),
"auth", GetAuthCtx(ctx),
"served_by", resp.ServedBy,
"backend", backendName,
)
return resp
}
}
func (bg *BackendGroup) ProxyWS(ctx context.Context, clientConn *websocket.Conn, methodWhitelist *StringSet) (*WSProxier, error) {
for _, back := range bg.Backends {
proxier, err := back.ProxyWS(clientConn, methodWhitelist)
if errors.Is(err, ErrBackendOffline) {
log.Warn(
"skipping offline backend",
"name", back.Name,
"req_id", GetReqID(ctx),
"auth", GetAuthCtx(ctx),
)
continue
}
if errors.Is(err, ErrBackendOverCapacity) {
log.Warn(
"skipping over-capacity backend",
"name", back.Name,
"req_id", GetReqID(ctx),
"auth", GetAuthCtx(ctx),
)
continue
}
if err != nil {
log.Warn(
"error dialing ws backend",
"name", back.Name,
"req_id", GetReqID(ctx),
"auth", GetAuthCtx(ctx),
"err", err,
)
continue
}
return proxier, nil
}
return nil, ErrNoBackends
}
func weightedShuffle(backends []*Backend) {
weight := func(i int) float64 {
return float64(backends[i].weight)
}
weightedshuffle.ShuffleInplace(backends, weight, nil)
}
func (bg *BackendGroup) orderedBackendsForRequest() []*Backend {
if bg.Consensus != nil {
return bg.loadBalancedConsensusGroup()
} else {
healthy := make([]*Backend, 0, len(bg.Backends))
unhealthy := make([]*Backend, 0, len(bg.Backends))
for _, be := range bg.Backends {
if be.IsHealthy() {
healthy = append(healthy, be)
} else {
unhealthy = append(unhealthy, be)
}
}
if bg.WeightedRouting {
weightedShuffle(healthy)
weightedShuffle(unhealthy)
}
return append(healthy, unhealthy...)
}
}
func (bg *BackendGroup) loadBalancedConsensusGroup() []*Backend {
cg := bg.Consensus.GetConsensusGroup()
backendsHealthy := make([]*Backend, 0, len(cg))
backendsDegraded := make([]*Backend, 0, len(cg))
// separate into healthy, degraded and unhealthy backends
for _, be := range cg {
// unhealthy are filtered out and not attempted
if !be.IsHealthy() {
continue
}
if be.IsDegraded() {
backendsDegraded = append(backendsDegraded, be)
continue
}
backendsHealthy = append(backendsHealthy, be)
}
// shuffle both slices
r := rand.New(rand.NewSource(time.Now().UnixNano()))
r.Shuffle(len(backendsHealthy), func(i, j int) {
backendsHealthy[i], backendsHealthy[j] = backendsHealthy[j], backendsHealthy[i]
})
r.Shuffle(len(backendsDegraded), func(i, j int) {
backendsDegraded[i], backendsDegraded[j] = backendsDegraded[j], backendsDegraded[i]
})
if bg.WeightedRouting {
weightedShuffle(backendsHealthy)
}
// healthy are put into a priority position
// degraded backends are used as fallback
backendsHealthy = append(backendsHealthy, backendsDegraded...)
return backendsHealthy
}
func (bg *BackendGroup) Shutdown() {
if bg.Consensus != nil {
bg.Consensus.Shutdown()
}
}
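// calcBackoff returns the wait before retry attempt i: exponential backoff with up
// to 250ms of jitter, capped at 3s (roughly 1s for attempt 0, 2s for attempt 1, and
// 3s for every attempt after that).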
func calcBackoff(i int) time.Duration {
jitter := float64(rand.Int63n(250))
ms := math.Min(math.Pow(2, float64(i))*1000+jitter, 3000)
return time.Duration(ms) * time.Millisecond
}
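// WSProxier pumps websocket frames in both directions between a client connection
// and a backend connection. Each side has its own write mutex and write deadline,
// and text/binary frames are parsed so RPC metrics and method whitelisting can be
// applied; control frames are passed through untouched.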
type WSProxier struct {
backend *Backend
clientConn *websocket.Conn
clientConnMu sync.Mutex
backendConn *websocket.Conn
backendConnMu sync.Mutex
methodWhitelist *StringSet
readTimeout time.Duration
writeTimeout time.Duration
}
func NewWSProxier(backend *Backend, clientConn, backendConn *websocket.Conn, methodWhitelist *StringSet) *WSProxier {
return &WSProxier{
backend: backend,
clientConn: clientConn,
backendConn: backendConn,
methodWhitelist: methodWhitelist,
readTimeout: defaultWSReadTimeout,
writeTimeout: defaultWSWriteTimeout,
}
}
func (w *WSProxier) Proxy(ctx context.Context) error {
errC := make(chan error, 2)
go w.clientPump(ctx, errC)
go w.backendPump(ctx, errC)
err := <-errC
w.close()
return err
}
func (w *WSProxier) clientPump(ctx context.Context, errC chan error) {
for {
// Block until we get a message.
msgType, msg, err := w.clientConn.ReadMessage()
if err != nil {
if err := w.writeBackendConn(websocket.CloseMessage, formatWSError(err)); err != nil {
log.Error("error writing backendConn message", "err", err)
errC <- err
return
}
}
RecordWSMessage(ctx, w.backend.Name, SourceClient)
// Route control messages to the backend. These don't
// count towards the total RPC requests count.
if msgType != websocket.TextMessage && msgType != websocket.BinaryMessage {
err := w.writeBackendConn(msgType, msg)
if err != nil {
errC <- err
return
}
continue
}
rpcRequestsTotal.Inc()
// Don't bother sending invalid requests to the backend,
// just handle them here.
req, err := w.prepareClientMsg(msg)
if err != nil {
var id json.RawMessage
method := MethodUnknown
if req != nil {
id = req.ID
method = req.Method
}
log.Info(
"error preparing client message",
"auth", GetAuthCtx(ctx),
"req_id", GetReqID(ctx),
"err", err,
)
msg = mustMarshalJSON(NewRPCErrorRes(id, err))
RecordRPCError(ctx, BackendProxyd, method, err)
// Send error response to client
err = w.writeClientConn(msgType, msg)
if err != nil {
errC <- err
return
}
continue
}
// Send eth_accounts requests directly to the client
if req.Method == "eth_accounts" {
msg = mustMarshalJSON(NewRPCRes(req.ID, emptyArrayResponse))
RecordRPCForward(ctx, BackendProxyd, "eth_accounts", RPCRequestSourceWS)
err = w.writeClientConn(msgType, msg)
if err != nil {
errC <- err
return
}
continue
}
RecordRPCForward(ctx, w.backend.Name, req.Method, RPCRequestSourceWS)
log.Info(
"forwarded WS message to backend",
"method", req.Method,
"auth", GetAuthCtx(ctx),
"req_id", GetReqID(ctx),
)
err = w.writeBackendConn(msgType, msg)
if err != nil {
errC <- err
return
}
}
}
func (w *WSProxier) backendPump(ctx context.Context, errC chan error) {
for {
// Block until we get a message.
msgType, msg, err := w.backendConn.ReadMessage()
if err != nil {
if err := w.writeClientConn(websocket.CloseMessage, formatWSError(err)); err != nil {
log.Error("error writing clientConn message", "err", err)
errC <- err
return
}
}
RecordWSMessage(ctx, w.backend.Name, SourceBackend)
// Route control messages directly to the client.
if msgType != websocket.TextMessage && msgType != websocket.BinaryMessage {
err := w.writeClientConn(msgType, msg)
if err != nil {
errC <- err
return
}
continue
}
res, err := w.parseBackendMsg(msg)
if err != nil {
var id json.RawMessage
if res != nil {
id = res.ID
}
msg = mustMarshalJSON(NewRPCErrorRes(id, err))
log.Info("backend responded with error", "err", err)
} else {
if res.IsError() {
log.Info(
"backend responded with RPC error",
"code", res.Error.Code,
"msg", res.Error.Message,
"source", "ws",
"auth", GetAuthCtx(ctx),
"req_id", GetReqID(ctx),
)
RecordRPCError(ctx, w.backend.Name, MethodUnknown, res.Error)
} else {
log.Info(
"forwarded WS message to client",
"auth", GetAuthCtx(ctx),
"req_id", GetReqID(ctx),
)
}
}
err = w.writeClientConn(msgType, msg)
if err != nil {
errC <- err
return
}
}
}
func (w *WSProxier) close() {
w.clientConn.Close()
w.backendConn.Close()
activeBackendWsConnsGauge.WithLabelValues(w.backend.Name).Dec()
}
func (w *WSProxier) prepareClientMsg(msg []byte) (*RPCReq, error) {
req, err := ParseRPCReq(msg)
if err != nil {
return nil, err
}
if !w.methodWhitelist.Has(req.Method) {
return req, ErrMethodNotWhitelisted
}
return req, nil
}
func (w *WSProxier) parseBackendMsg(msg []byte) (*RPCRes, error) {
res, err := ParseRPCRes(bytes.NewReader(msg))
if err != nil {
log.Warn("error parsing RPC response", "source", "ws", "err", err)
return res, ErrBackendBadResponse
}
return res, nil
}
func (w *WSProxier) writeClientConn(msgType int, msg []byte) error {
w.clientConnMu.Lock()
defer w.clientConnMu.Unlock()
if err := w.clientConn.SetWriteDeadline(time.Now().Add(w.writeTimeout)); err != nil {
log.Error("ws client write timeout", "err", err)
return err
}
err := w.clientConn.WriteMessage(msgType, msg)
return err
}
func (w *WSProxier) writeBackendConn(msgType int, msg []byte) error {
w.backendConnMu.Lock()
defer w.backendConnMu.Unlock()
if err := w.backendConn.SetWriteDeadline(time.Now().Add(w.writeTimeout)); err != nil {
log.Error("ws backend write timeout", "err", err)
return err
}
err := w.backendConn.WriteMessage(msgType, msg)
return err
}
func mustMarshalJSON(in interface{}) []byte {
out, err := json.Marshal(in)
if err != nil {
panic(err)
}
return out
}
func formatWSError(err error) []byte {
m := websocket.FormatCloseMessage(websocket.CloseNormalClosure, fmt.Sprintf("%v", err))
if e, ok := err.(*websocket.CloseError); ok {
if e.Code != websocket.CloseNoStatusReceived {
m = websocket.FormatCloseMessage(e.Code, e.Text)
}
}
return m
}
func sleepContext(ctx context.Context, duration time.Duration) {
select {
case <-ctx.Done():
case <-time.After(duration):
}
}
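// LimitedHTTPClient wraps http.Client with a weighted semaphore supplied by the
// caller: DoLimited blocks until a slot is acquired (respecting the request
// context) and records a too-many-requests metric when acquisition fails.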
type LimitedHTTPClient struct {
http.Client
sem *semaphore.Weighted
backendName string
}
func (c *LimitedHTTPClient) DoLimited(req *http.Request) (*http.Response, error) {
if err := c.sem.Acquire(req.Context(), 1); err != nil {
tooManyRequestErrorsTotal.WithLabelValues(c.backendName).Inc()
return nil, wrapErr(err, "too many requests")
}
defer c.sem.Release(1)
return c.Do(req)
}
func RecordBatchRPCError(ctx context.Context, backendName string, reqs []*RPCReq, err error) {
for _, req := range reqs {
RecordRPCError(ctx, backendName, req.Method, err)
}
}
func MaybeRecordErrorsInRPCRes(ctx context.Context, backendName string, reqs []*RPCReq, resBatch []*RPCRes) {
log.Debug("forwarded RPC request",
"backend", backendName,
"auth", GetAuthCtx(ctx),
"req_id", GetReqID(ctx),
"batch_size", len(reqs),
)
var lastError *RPCErr
for i, res := range resBatch {
if res.IsError() {
lastError = res.Error
RecordRPCError(ctx, backendName, reqs[i].Method, res.Error)
}
}
if lastError != nil {
log.Info(
"backend responded with RPC error",
"backend", backendName,
"last_error_code", lastError.Code,
"last_error_msg", lastError.Message,
"req_id", GetReqID(ctx),
"source", "rpc",
"auth", GetAuthCtx(ctx),
)
}
}
func RecordBatchRPCForward(ctx context.Context, backendName string, reqs []*RPCReq, source string) {
for _, req := range reqs {
RecordRPCForward(ctx, backendName, req.Method, source)
}
}
func (b *Backend) ClearSlidingWindows() {
b.intermittentErrorsSlidingWindow.Clear()
b.networkRequestsSlidingWindow.Clear()
}
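// stripXFF keeps only the first (client) entry of an X-Forwarded-For header,
// e.g. "1.2.3.4, 10.0.0.1, 10.0.0.2" becomes "1.2.3.4".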
func stripXFF(xff string) string {
ipList := strings.Split(xff, ",")
return strings.TrimSpace(ipList[0])
}
type BackendGroupRPCResponse struct {
RPCRes []*RPCRes
ServedBy string
error error
}
func (bg *BackendGroup) ForwardRequestToBackendGroup(
rpcReqs []*RPCReq,
backends []*Backend,
ctx context.Context,
isBatch bool,
) *BackendGroupRPCResponse {
for _, back := range backends {
res := make([]*RPCRes, 0)
var err error
servedBy := fmt.Sprintf("%s/%s", bg.Name, back.Name)
if len(rpcReqs) > 0 {
res, err = back.Forward(ctx, rpcReqs, isBatch)
if errors.Is(err, ErrConsensusGetReceiptsCantBeBatched) ||
errors.Is(err, ErrConsensusGetReceiptsInvalidTarget) ||
errors.Is(err, ErrMethodNotWhitelisted) {
return &BackendGroupRPCResponse{
RPCRes: nil,
ServedBy: "",
error: err,
}
}
if errors.Is(err, ErrBackendResponseTooLarge) {
return &BackendGroupRPCResponse{
RPCRes: nil,
ServedBy: "",
error: err,
}
}
if errors.Is(err, ErrBackendOffline) {
log.Warn(
"skipping offline backend",
"name", back.Name,
"auth", GetAuthCtx(ctx),
"req_id", GetReqID(ctx),
)
continue
}
if errors.Is(err, ErrBackendOverCapacity) {
log.Warn(
"skipping over-capacity backend",
"name", back.Name,
"auth", GetAuthCtx(ctx),
"req_id", GetReqID(ctx),
)
continue
}
if err != nil {
log.Error(
"error forwarding request to backend",
"name", back.Name,
"req_id", GetReqID(ctx),
"auth", GetAuthCtx(ctx),
"err", err,
)
continue
}
}
return &BackendGroupRPCResponse{
RPCRes: res,
ServedBy: servedBy,
error: nil,
}
}
RecordUnserviceableRequest(ctx, RPCRequestSourceHTTP)
return &BackendGroupRPCResponse{
RPCRes: nil,
ServedBy: "",
error: ErrNoBackends,
}
}
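// OverrideResponses splices locally produced responses back into the backend
// results at their original positions. For example, with backend results [r0, r2]
// and an overridden response o1 at index 1, the final slice is [r0, o1, r2].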
func OverrideResponses(res []*RPCRes, overriddenResponses []*indexedReqRes) []*RPCRes {
for _, ov := range overriddenResponses {
if len(res) > 0 {
// insert ov.res at position ov.index
res = append(res[:ov.index], append([]*RPCRes{ov.res}, res[ov.index:]...)...)
} else {
res = append(res, ov.res)
}
}
return res
}
func (bg *BackendGroup) OverwriteConsensusResponses(rpcReqs []*RPCReq, overriddenResponses []*indexedReqRes, rewrittenReqs []*RPCReq) ([]*RPCReq, []*indexedReqRes) {
rctx := RewriteContext{
latest: bg.Consensus.GetLatestBlockNumber(),
safe: bg.Consensus.GetSafeBlockNumber(),
finalized: bg.Consensus.GetFinalizedBlockNumber(),
maxBlockRange: bg.Consensus.maxBlockRange,
}
for i, req := range rpcReqs {
res := RPCRes{JSONRPC: JSONRPCVersion, ID: req.ID}
result, err := RewriteTags(rctx, req, &res)
switch result {
case RewriteOverrideError:
overriddenResponses = append(overriddenResponses, &indexedReqRes{
index: i,
req: req,
res: &res,
})
if errors.Is(err, ErrRewriteBlockOutOfRange) {
res.Error = ErrBlockOutOfRange
} else if errors.Is(err, ErrRewriteRangeTooLarge) {
res.Error = ErrInvalidParams(
fmt.Sprintf("block range greater than %d max", rctx.maxBlockRange),
)
} else {
res.Error = ErrParseErr
}
case RewriteOverrideResponse:
overriddenResponses = append(overriddenResponses, &indexedReqRes{
index: i,
req: req,
res: &res,
})
case RewriteOverrideRequest, RewriteNone:
rewrittenReqs = append(rewrittenReqs, req)
}
}
return rewrittenReqs, overriddenResponses
}