diff --git a/Cargo.lock b/Cargo.lock index feec357a..f2b417c0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -282,13 +282,13 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "axum" -version = "0.6.4" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5694b64066a2459918d8074c2ce0d5a88f409431994c2356617c8ae0c4721fc" +checksum = "4e246206a63c9830e118d12c894f56a82033da1a2361f5544deeee3df85c99d9" dependencies = [ "async-trait", "axum-core", - "base64 0.20.0", + "base64 0.21.0", "bitflags", "bytes", "futures-util", @@ -347,9 +347,9 @@ dependencies = [ [[package]] name = "axum-macros" -version = "0.3.2" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dbcf61bed07d554bd5c225cd07bc41b793eab63e79c6f0ceac7e1aed2f1c670" +checksum = "5fbf955307ff8addb48d2399393c9e2740dd491537ec562b66ab364fc4a38841" dependencies = [ "heck 0.4.0", "proc-macro2", @@ -419,12 +419,6 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" -[[package]] -name = "base64" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5" - [[package]] name = "base64" version = "0.21.0" @@ -1809,6 +1803,12 @@ version = "2.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" +[[package]] +name = "ewma" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f20267f3a8b678b7151c0c508002e79126144a5d47badddec7f31ddc1f4c754" + [[package]] name = "eyre" version = "0.6.8" @@ -2891,9 +2891,9 @@ dependencies = [ [[package]] name = "moka" -version = "0.9.7" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b9268097a2cf211ac9955b1cc95e80fa84fff5c2d13ba292916445dc8a311f" +checksum = "2b6446f16d504e3d575df79cabb11bfbe9f24b17e9562d964a815db7b28ae3ec" dependencies = [ "async-io", "async-lock", @@ -3093,9 +3093,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.17.0" +version = "1.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" +checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] name = "opaque-debug" @@ -3134,6 +3134,15 @@ dependencies = [ "syn", ] +[[package]] +name = "ordered-float" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d84eb1409416d254e4a9c8fa56cc24701755025b458f0fcd8e59e1f5f40c23bf" +dependencies = [ + "num-traits", +] + [[package]] name = "os_info" version = "3.6.0" @@ -4520,9 +4529,9 @@ dependencies = [ [[package]] name = "serde_prometheus" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb6048d9e4ebc41f7d1a42c79b04c5b460633be307620a0e34a8f81970ea47" +checksum = "9c1a4ca38f4e746460d1dbd3711b8ca8ae314d1b21247edeff61dd20325b5a6f" dependencies = [ "heapless", "nom", @@ -5761,7 +5770,7 @@ dependencies = [ [[package]] name = "web3_proxy" -version = "0.13.0" +version = "0.13.1" dependencies = [ "anyhow", "argh", @@ -5776,6 +5785,7 @@ dependencies = [ "entities", "env_logger", "ethers", + "ewma", "fdlimit", "flume", "futures", @@ 
-5793,6 +5803,8 @@ dependencies = [ "notify", "num", "num-traits", + "once_cell", + "ordered-float", "pagerduty-rs", "parking_lot 0.12.1", "prettytable", diff --git a/Jenkinsfile b/Jenkinsfile index 47cb0c2b..17085022 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,19 +1,20 @@ def buildAndPush() { + // env.ARCH is the system architecture. some apps can be generic (amd64, arm64), + // but apps that compile for specific hardware (like web3-proxy) will need more specific tags (amd64_epyc2, arm64_graviton2, intel_xeon3, etc.) // env.BRANCH_NAME is set to the git branch name by default // env.REGISTRY is the repository url for this pipeline // env.GIT_SHORT is the git short hash of the currently checked out repo // env.LATEST_BRANCH is the branch name that gets tagged latest - // env.ARCH is the system architecture. some apps can be generic (amd64, arm64), - // but apps that compile for specific hardware (like web3-proxy) will need more specific tags (amd64_epyc2, arm64_graviton2, intel_xeon3, etc.) // TODO: check that this system actually matches the given arch sh '''#!/bin/bash set -eux -o pipefail - [ -n "$GIT_SHORT" ] - [ -n "$GIT_SHORT" ] - [ -n "$REGISTRY" ] [ -n "$ARCH" ] + [ -n "$BRANCH_NAME" ] + [ -n "$REGISTRY" ] + [ -n "$GIT_SHORT" ] + [ -n "$LATEST_BRANCH" ] # deterministic mtime on .git keeps Dockerfiles that do 'ADD . .' or similar # without this, the build process always thinks the directory has changes diff --git a/TODO.md b/TODO.md index 207567c4..5f3e18f3 100644 --- a/TODO.md +++ b/TODO.md @@ -330,6 +330,11 @@ These are not yet ordered. There might be duplicates. We might not actually need - [x] block all admin_ rpc commands - [x] remove the "metered" crate now that we save aggregate queries? - [x] add archive depth to app config +- [x] use from_block and to_block so that eth_getLogs is routed correctly +- [x] improve eth_sendRawTransaction server selection +- [x] don't cache methods that are usually very large +- [x] use http provider when available +- [ ] don't use new_head_provider anywhere except new head subscription - [-] proxy mode for benchmarking all backends - [-] proxy mode for sending to multiple backends - [-] let users choose a % of reverts to log (or maybe x/second). someone like curve logging all reverts will be a BIG database very quickly @@ -339,6 +344,12 @@ These are not yet ordered. There might be duplicates. We might not actually need - [-] add configurable size limits to all the Caches - instead of configuring each cache with MB sizes, have one value for total memory footprint and then percentages for each cache - https://github.com/moka-rs/moka/issues/201 +- [ ] have multiple providers on each backend rpc. one websocket for newHeads. and then http providers for handling requests + - erigon only streams the JSON over HTTP. that code isn't enabled for websockets. so this should save memory on the erigon servers + - i think this also means we don't need to worry about changing the id that the user gives us. + - have the healthcheck get the block over http. if it errors, or doesn't match what the websocket says, something is wrong (likely a deadlock in the websocket code) +- [ ] maybe we shouldn't route eth_getLogs to syncing nodes. 
serving queries slows down sync significantly + - change the send_best function to only include servers that are at least close to fully synced - [ ] have private transactions be enabled by a url setting rather than a setting on the key - [ ] cli for adding rpc keys to an existing user - [ ] rate limiting/throttling on query_user_stats @@ -349,6 +360,7 @@ These are not yet ordered. There might be duplicates. We might not actually need - if total difficulty is not on the block and we aren't on ETH, fetch the full block instead of just the header - if total difficulty is set and non-zero, use it for consensus instead of just the number - [ ] query_user_stats cache hit rate +- [ ] need debounce on reconnect. websockets are closing on us and then we reconnect twice. locks on ProviderState need more thought - [ ] having the whole block in status is very verbose. trim it down - [ ] `cost estimate` script - sum bytes and number of requests. prompt hosting costs. divide diff --git a/config/example.toml b/config/example.toml index e2c9d8b7..8227635f 100644 --- a/config/example.toml +++ b/config/example.toml @@ -52,50 +52,50 @@ response_cache_max_bytes = 10_000_000_000 [balanced_rpcs.ankr] display_name = "Ankr" - url = "https://rpc.ankr.com/eth" + http_url = "https://rpc.ankr.com/eth" soft_limit = 1_000 tier = 0 [balanced_rpcs.cloudflare] display_name = "Cloudflare" - url = "https://cloudflare-eth.com" + http_url = "https://cloudflare-eth.com" soft_limit = 1_000 tier = 1 [balanced_rpcs.blastapi] display_name = "Blast" - url = "https://eth-mainnet.public.blastapi.io" + http_url = "https://eth-mainnet.public.blastapi.io" soft_limit = 1_000 tier = 1 [balanced_rpcs.mycryptoapi] display_name = "MyCrypto" disabled = true - url = "https://api.mycryptoapi.com/eth" + http_url = "https://api.mycryptoapi.com/eth" soft_limit = 1_000 tier = 2 [balanced_rpcs.pokt-v1] display_name = "Pokt #1" - url = "https://eth-mainnet.gateway.pokt.network/v1/5f3453978e354ab992c4da79" + http_url = "https://eth-mainnet.gateway.pokt.network/v1/5f3453978e354ab992c4da79" soft_limit = 500 tier = 2 [balanced_rpcs.pokt] display_name = "Pokt #2" - url = "https://eth-rpc.gateway.pokt.network" + http_url = "https://eth-rpc.gateway.pokt.network" soft_limit = 500 tier = 3 [balanced_rpcs.linkpool] display_name = "Linkpool" - url = "https://main-rpc.linkpool.io" + http_url = "https://main-rpc.linkpool.io" soft_limit = 500 tier = 4 [balanced_rpcs.runonflux] display_name = "Run on Flux (light)" - url = "https://ethereumnodelight.app.runonflux.io" + http_url = "https://ethereumnodelight.app.runonflux.io" soft_limit = 1_000 tier = 5 @@ -103,7 +103,7 @@ response_cache_max_bytes = 10_000_000_000 [balanced_rpcs.linkpool-light] display_name = "Linkpool (light)" disabled = true - url = "https://main-light.eth.linkpool.io" + http_url = "https://main-light.eth.linkpool.io" soft_limit = 100 tier = 5 @@ -114,34 +114,34 @@ response_cache_max_bytes = 10_000_000_000 [private_rpcs.eden] disabled = true display_name = "Eden network" - url = "https://api.edennetwork.io/v1/" + http_url = "https://api.edennetwork.io/v1/" soft_limit = 1_805 tier = 0 [private_rpcs.eden_beta] disabled = true display_name = "Eden network beta" - url = "https://api.edennetwork.io/v1/beta" + http_url = "https://api.edennetwork.io/v1/beta" soft_limit = 5_861 tier = 0 [private_rpcs.ethermine] disabled = true display_name = "Ethermine" - url = "https://rpc.ethermine.org" + http_url = "https://rpc.ethermine.org" soft_limit = 5_861 tier = 0 [private_rpcs.flashbots] disabled = true 
display_name = "Flashbots Fast" - url = "https://rpc.flashbots.net/fast" + http_url = "https://rpc.flashbots.net/fast" soft_limit = 7_074 tier = 0 [private_rpcs.securerpc] disabled = true display_name = "SecureRPC" - url = "https://gibson.securerpc.com/v1" + http_url = "https://gibson.securerpc.com/v1" soft_limit = 4_560 tier = 0 diff --git a/config/minimal.toml b/config/minimal.toml index 2225c9d1..770e3484 100644 --- a/config/minimal.toml +++ b/config/minimal.toml @@ -16,17 +16,26 @@ response_cache_max_bytes = 1_000_000_000 [balanced_rpcs] - [balanced_rpcs.llama_public_wss] + [balanced_rpcs.llama_public_both] # TODO: what should we do if all rpcs are disabled? warn and wait for a config change? disabled = false - display_name = "LlamaNodes WSS" - url = "wss://eth.llamarpc.com/" + display_name = "LlamaNodes Both" + ws_url = "wss://eth.llamarpc.com/" + http_url = "https://eth.llamarpc.com/" soft_limit = 1_000 tier = 0 [balanced_rpcs.llama_public_https] disabled = false display_name = "LlamaNodes HTTPS" - url = "https://eth.llamarpc.com/" + http_url = "https://eth.llamarpc.com/" + soft_limit = 1_000 + tier = 0 + + [balanced_rpcs.llama_public_wss] + # TODO: what should we do if all rpcs are disabled? warn and wait for a config change? + disabled = false + display_name = "LlamaNodes WSS" + ws_url = "wss://eth.llamarpc.com/" soft_limit = 1_000 tier = 0 diff --git a/deferred-rate-limiter/Cargo.toml b/deferred-rate-limiter/Cargo.toml index 9b7c4ad8..04cb8488 100644 --- a/deferred-rate-limiter/Cargo.toml +++ b/deferred-rate-limiter/Cargo.toml @@ -10,5 +10,5 @@ redis-rate-limiter = { path = "../redis-rate-limiter" } anyhow = "1.0.69" hashbrown = "0.13.2" log = "0.4.17" -moka = { version = "0.9.7", default-features = false, features = ["future"] } +moka = { version = "0.10.0", default-features = false, features = ["future"] } tokio = "1.25.0" diff --git a/docs/curl login.md b/docs/curl login.md new file mode 100644 index 00000000..16ec43b7 --- /dev/null +++ b/docs/curl login.md @@ -0,0 +1,10 @@ +# log in with curl + +1. curl http://127.0.0.1:8544/user/login/$ADDRESS +2. Sign the text with a site like https://www.myetherwallet.com/wallet/sign +3. POST the signed data: + + curl -X POST http://127.0.0.1:8544/user/login -H 'Content-Type: application/json' -d + '{ "address": "0x9eb9e3dc2543dc9ff4058e2a2da43a855403f1fd", "msg": "0x6c6c616d616e6f6465732e636f6d2077616e747320796f7520746f207369676e20696e207769746820796f757220457468657265756d206163636f756e743a0a3078396562396533646332353433646339464634303538653241324441343341383535343033463166440a0af09fa699f09fa699f09fa699f09fa699f09fa6990a0a5552493a2068747470733a2f2f6c6c616d616e6f6465732e636f6d2f0a56657273696f6e3a20310a436861696e2049443a20310a4e6f6e63653a203031474d37373330375344324448333854454d3957545156454a0a4973737565642041743a20323032322d31322d31345430323a32333a31372e3735333736335a0a45787069726174696f6e2054696d653a20323032322d31322d31345430323a34333a31372e3735333736335a", "sig": "16bac055345279723193737c6c67cf995e821fd7c038d31fd6f671102088c7b85ab4b13069fd2ed02da186cf549530e315d8d042d721bf81289b3ffdbe8cf9ce1c", "version": "3", "signer": "MEW" }' + +4. The response will include a bearer token. Use it with curl ... 
-H 'Authorization: Bearer $TOKEN' diff --git a/web3_proxy/Cargo.toml b/web3_proxy/Cargo.toml index d60d162d..04da5704 100644 --- a/web3_proxy/Cargo.toml +++ b/web3_proxy/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "web3_proxy" -version = "0.13.0" +version = "0.13.1" edition = "2021" default-run = "web3_proxy_cli" @@ -27,9 +27,9 @@ thread-fast-rng = { path = "../thread-fast-rng" } anyhow = { version = "1.0.69", features = ["backtrace"] } argh = "0.1.10" -axum = { version = "0.6.4", features = ["headers", "ws"] } +axum = { version = "0.6.6", features = ["headers", "ws"] } axum-client-ip = "0.4.0" -axum-macros = "0.3.2" +axum-macros = "0.3.4" chrono = "0.4.23" counter = "0.5.7" derive_more = "0.99.17" @@ -48,10 +48,11 @@ http = "0.2.8" ipnet = "2.7.1" itertools = "0.10.5" log = "0.4.17" -moka = { version = "0.9.7", default-features = false, features = ["future"] } +moka = { version = "0.10.0", default-features = false, features = ["future"] } notify = "5.1.0" num = "0.4.0" num-traits = "0.2.15" +once_cell = { version = "1.17.1" } pagerduty-rs = { version = "0.1.6", default-features = false, features = ["async", "rustls", "sync"] } parking_lot = { version = "0.12.1", features = ["arc_lock"] } prettytable = "*" @@ -62,7 +63,7 @@ rustc-hash = "1.1.0" sentry = { version = "0.29.3", default-features = false, features = ["backtrace", "contexts", "panic", "anyhow", "reqwest", "rustls", "log", "sentry-log"] } serde = { version = "1.0.152", features = [] } serde_json = { version = "1.0.93", default-features = false, features = ["alloc", "raw_value"] } -serde_prometheus = "0.2.0" +serde_prometheus = "0.2.1" siwe = "0.5.0" time = "0.3.17" tokio = { version = "1.25.0", features = ["full"] } @@ -73,3 +74,5 @@ tower-http = { version = "0.3.5", features = ["cors", "sensitive-headers"] } ulid = { version = "1.0.0", features = ["serde"] } url = "2.3.1" uuid = "1.3.0" +ewma = "0.1.1" +ordered-float = "3.4.0" diff --git a/web3_proxy/src/app/mod.rs b/web3_proxy/src/app/mod.rs index 75610ca2..06531264 100644 --- a/web3_proxy/src/app/mod.rs +++ b/web3_proxy/src/app/mod.rs @@ -10,7 +10,7 @@ use crate::frontend::rpc_proxy_ws::ProxyMode; use crate::jsonrpc::{ JsonRpcForwardedResponse, JsonRpcForwardedResponseEnum, JsonRpcRequest, JsonRpcRequestEnum, }; -use crate::rpcs::blockchain::{ArcBlock, SavedBlock}; +use crate::rpcs::blockchain::{BlockHashesCache, Web3ProxyBlock}; use crate::rpcs::many::Web3Rpcs; use crate::rpcs::one::Web3Rpc; use crate::rpcs::transactions::TxStatus; @@ -23,7 +23,7 @@ use derive_more::From; use entities::sea_orm_active_enums::LogLevel; use entities::user; use ethers::core::utils::keccak256; -use ethers::prelude::{Address, Block, Bytes, Transaction, TxHash, H256, U64}; +use ethers::prelude::{Address, Bytes, Transaction, TxHash, H256, U64}; use ethers::types::U256; use ethers::utils::rlp::{Decodable, Rlp}; use futures::future::join_all; @@ -69,9 +69,9 @@ pub static REQUEST_PERIOD: u64 = 60; #[derive(From)] struct ResponseCacheKey { // if none, this is cached until evicted - from_block: Option, + from_block: Option, // to_block is only set when ranges of blocks are requested (like with eth_getLogs) - to_block: Option, + to_block: Option, method: String, // TODO: better type for this params: Option, @@ -204,7 +204,7 @@ pub struct Web3ProxyApp { response_cache: ResponseCache, // don't drop this or the sender will stop working // TODO: broadcast channel instead? 
- watch_consensus_head_receiver: watch::Receiver, + watch_consensus_head_receiver: watch::Receiver>, pending_tx_sender: broadcast::Sender, pub config: AppConfig, pub db_conn: Option, @@ -482,7 +482,7 @@ impl Web3ProxyApp { let http_client = Some( reqwest::ClientBuilder::new() .connect_timeout(Duration::from_secs(5)) - .timeout(Duration::from_secs(60)) + .timeout(Duration::from_secs(5 * 60)) .user_agent(APP_USER_AGENT) .build()?, ); @@ -541,8 +541,7 @@ impl Web3ProxyApp { }; // TODO: i don't like doing Block::default here! Change this to "None"? - let (watch_consensus_head_sender, watch_consensus_head_receiver) = - watch::channel(Arc::new(Block::default())); + let (watch_consensus_head_sender, watch_consensus_head_receiver) = watch::channel(None); // TODO: will one receiver lagging be okay? how big should this be? let (pending_tx_sender, pending_tx_receiver) = broadcast::channel(256); @@ -557,33 +556,40 @@ impl Web3ProxyApp { // TODO: ttl on this? or is max_capacity fine? let pending_transactions = Cache::builder() .max_capacity(10_000) + // TODO: different chains might handle this differently + // TODO: what should we set? 5 minutes is arbitrary. the nodes themselves hold onto transactions for much longer + .time_to_idle(Duration::from_secs(300)) .build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default()); - // keep 1GB of blocks in the cache + // keep 1GB/5 minutes of blocks in the cache // TODO: limits from config // these blocks don't have full transactions, but they do have rather variable amounts of transaction hashes // TODO: how can we do the weigher better? - let block_map = Cache::builder() + let block_map: BlockHashesCache = Cache::builder() .max_capacity(1024 * 1024 * 1024) - .weigher(|_k, v: &ArcBlock| { + .weigher(|_k, v: &Web3ProxyBlock| { // TODO: is this good enough? - 1 + v.transactions.len().try_into().unwrap_or(u32::MAX) + 1 + v.block.transactions.len().try_into().unwrap_or(u32::MAX) }) + // TODO: what should we set? 5 minutes is arbitrary. the nodes themselves hold onto transactions for much longer + .time_to_idle(Duration::from_secs(300)) .build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default()); // connect to the load balanced rpcs let (balanced_rpcs, balanced_handle) = Web3Rpcs::spawn( + block_map.clone(), top_config.app.chain_id, db_conn.clone(), - balanced_rpcs, http_client.clone(), - vredis_pool.clone(), - block_map.clone(), - Some(watch_consensus_head_sender), - top_config.app.min_sum_soft_limit, + top_config.app.max_block_age, + top_config.app.max_block_lag, top_config.app.min_synced_rpcs, - Some(pending_tx_sender.clone()), + top_config.app.min_sum_soft_limit, pending_transactions.clone(), + Some(pending_tx_sender.clone()), + vredis_pool.clone(), + balanced_rpcs, + Some(watch_consensus_head_sender), ) .await .context("spawning balanced rpcs")?; @@ -599,26 +605,30 @@ impl Web3ProxyApp { None } else { let (private_rpcs, private_handle) = Web3Rpcs::spawn( + block_map, top_config.app.chain_id, db_conn.clone(), - private_rpcs, http_client.clone(), + // private rpcs don't get subscriptions, so no need for max_block_age or max_block_lag + None, + None, + 0, + 0, + pending_transactions.clone(), + // TODO: subscribe to pending transactions on the private rpcs? they seem to have low rate limits, but they should have + None, vredis_pool.clone(), - block_map, + private_rpcs, // subscribing to new heads here won't work well. 
if they are fast, they might be ahead of balanced_rpcs // they also often have low rate limits // however, they are well connected to miners/validators. so maybe using them as a safety check would be good + // TODO: but maybe we could include privates in the "backup" tier None, - 0, - 0, - // TODO: subscribe to pending transactions on the private rpcs? they seem to have low rate limits - None, - pending_transactions.clone(), ) .await .context("spawning private_rpcs")?; - if private_rpcs.conns.is_empty() { + if private_rpcs.by_name.is_empty() { None } else { // save the handle to catch any errors @@ -685,6 +695,8 @@ impl Web3ProxyApp { u32::MAX } }) + // TODO: what should we set? 10 minutes is arbitrary. the nodes themselves hold onto transactions for much longer + .time_to_idle(Duration::from_secs(600)) .build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default()); // all the users are the same size, so no need for a weigher @@ -734,7 +746,7 @@ impl Web3ProxyApp { Ok((app, cancellable_handles, important_background_handles).into()) } - pub fn head_block_receiver(&self) -> watch::Receiver { + pub fn head_block_receiver(&self) -> watch::Receiver> { self.watch_consensus_head_receiver.clone() } @@ -932,7 +944,7 @@ impl Web3ProxyApp { JsonRpcRequestEnum::Single(request) => { let (response, rpcs) = timeout( max_time, - self.proxy_cached_request(&authorization, request, proxy_mode), + self.proxy_cached_request(&authorization, request, proxy_mode, None), ) .await??; @@ -965,10 +977,26 @@ impl Web3ProxyApp { // TODO: spawn so the requests go in parallel? need to think about rate limiting more if we do that // TODO: improve flattening + + // get the head block now so that any requests that need it all use the same block + // TODO: FrontendErrorResponse that handles "no servers synced" in a consistent way + // TODO: this still has an edge condition if there is a reorg in the middle of the request!!! 
+ let head_block_num = self + .balanced_rpcs + .head_block_num() + .context(anyhow::anyhow!("no servers synced"))?; + let responses = join_all( requests .into_iter() - .map(|request| self.proxy_cached_request(authorization, request, proxy_mode)) + .map(|request| { + self.proxy_cached_request( + authorization, + request, + proxy_mode, + Some(head_block_num), + ) + }) .collect::>(), ) .await; @@ -1017,6 +1045,7 @@ impl Web3ProxyApp { authorization: &Arc, mut request: JsonRpcRequest, proxy_mode: ProxyMode, + head_block_num: Option, ) -> Result<(JsonRpcForwardedResponse, Vec>), FrontendErrorResponse> { // trace!("Received request: {:?}", request); @@ -1035,9 +1064,17 @@ impl Web3ProxyApp { | "db_getString" | "db_putHex" | "db_putString" + | "debug_accountRange" + | "debug_backtraceAt" + | "debug_blockProfile" | "debug_chaindbCompact" + | "debug_chaindbProperty" + | "debug_cpuProfile" + | "debug_freeOSMemory" | "debug_freezeClient" + | "debug_gcStats" | "debug_goTrace" + | "debug_memStats" | "debug_mutexProfile" | "debug_setBlockProfileRate" | "debug_setGCPercent" @@ -1125,7 +1162,7 @@ impl Web3ProxyApp { serde_json::Value::Array(vec![]) } "eth_blockNumber" => { - match self.balanced_rpcs.head_block_num() { + match head_block_num.or(self.balanced_rpcs.head_block_num()) { Some(head_block_num) => { json!(head_block_num) } @@ -1138,9 +1175,7 @@ impl Web3ProxyApp { } } } - "eth_chainId" => { - json!(U64::from(self.config.chain_id)) - } + "eth_chainId" => json!(U64::from(self.config.chain_id)), // TODO: eth_callBundle (https://docs.flashbots.net/flashbots-auction/searchers/advanced/rpc-endpoint#eth_callbundle) // TODO: eth_cancelPrivateTransaction (https://docs.flashbots.net/flashbots-auction/searchers/advanced/rpc-endpoint#eth_cancelprivatetransaction, but maybe just reject) // TODO: eth_sendPrivateTransaction (https://docs.flashbots.net/flashbots-auction/searchers/advanced/rpc-endpoint#eth_sendprivatetransaction) @@ -1158,6 +1193,7 @@ impl Web3ProxyApp { request, Some(&request_metadata), None, + None, ) .await?; @@ -1193,7 +1229,7 @@ impl Web3ProxyApp { } "eth_mining" => { // no stats on this. its cheap - json!(false) + serde_json::Value::Bool(false) } // TODO: eth_sendBundle (flashbots command) // broadcast transactions to all private rpcs at once @@ -1222,12 +1258,19 @@ impl Web3ProxyApp { (&self.balanced_rpcs, default_num) }; + let head_block_num = head_block_num + .or(self.balanced_rpcs.head_block_num()) + .ok_or_else(|| anyhow::anyhow!("no servers synced"))?; + + // TODO: error/wait if no head block! + // try_send_all_upstream_servers puts the request id into the response. no need to do that ourselves here. let mut response = private_rpcs .try_send_all_synced_connections( authorization, &request, Some(request_metadata.clone()), + Some(&head_block_num), None, Level::Trace, num, @@ -1318,7 +1361,7 @@ impl Web3ProxyApp { "eth_syncing" => { // no stats on this. its cheap // TODO: return a real response if all backends are syncing or if no servers in sync - json!(false) + serde_json::Value::Bool(false) } "eth_subscribe" => { return Ok(( @@ -1343,12 +1386,12 @@ impl Web3ProxyApp { "net_listening" => { // no stats on this. its cheap // TODO: only if there are some backends on balanced_rpcs? - json!(true) + serde_json::Value::Bool(true) } "net_peerCount" => { // no stats on this. its cheap // TODO: do something with proxy_mode here? - self.balanced_rpcs.num_synced_rpcs().into() + json!(U64::from(self.balanced_rpcs.num_synced_rpcs())) } "web3_clientVersion" => { // no stats on this. 
its cheap @@ -1422,9 +1465,8 @@ impl Web3ProxyApp { // emit stats // TODO: if no servers synced, wait for them to be synced? probably better to error and let haproxy retry another server - let head_block_num = self - .balanced_rpcs - .head_block_num() + let head_block_num = head_block_num + .or(self.balanced_rpcs.head_block_num()) .context("no servers synced")?; // we do this check before checking caches because it might modify the request params @@ -1468,7 +1510,7 @@ impl Web3ProxyApp { .await?; Some(ResponseCacheKey { - from_block: Some(SavedBlock::new(request_block)), + from_block: Some(request_block), to_block: None, method: method.to_string(), // TODO: hash here? @@ -1508,8 +1550,8 @@ impl Web3ProxyApp { .await?; Some(ResponseCacheKey { - from_block: Some(SavedBlock::new(from_block)), - to_block: Some(SavedBlock::new(to_block)), + from_block: Some(from_block), + to_block: Some(to_block), method: method.to_string(), // TODO: hash here? params: request.params.clone(), @@ -1524,7 +1566,8 @@ impl Web3ProxyApp { let authorization = authorization.clone(); if let Some(cache_key) = cache_key { - let from_block_num = cache_key.from_block.as_ref().map(|x| x.number()); + let from_block_num = cache_key.from_block.as_ref().map(|x| *x.number()); + let to_block_num = cache_key.to_block.as_ref().map(|x| *x.number()); self.response_cache .try_get_with(cache_key, async move { @@ -1537,6 +1580,7 @@ impl Web3ProxyApp { request, Some(&request_metadata), from_block_num.as_ref(), + to_block_num.as_ref(), ) .await?; @@ -1545,7 +1589,7 @@ impl Web3ProxyApp { // TODO: only cache the inner response // TODO: how are we going to stream this? - // TODO: check response size. if its very large, return it in a custom Error type that bypasses caching + // TODO: check response size. if its very large, return it in a custom Error type that bypasses caching? or will moka do that for us? Ok::<_, anyhow::Error>(response) }) .await @@ -1565,6 +1609,7 @@ impl Web3ProxyApp { request, Some(&request_metadata), None, + None, ) .await? } diff --git a/web3_proxy/src/app/ws.rs b/web3_proxy/src/app/ws.rs index 582ea814..b125a5fa 100644 --- a/web3_proxy/src/app/ws.rs +++ b/web3_proxy/src/app/ws.rs @@ -61,6 +61,12 @@ impl Web3ProxyApp { ); while let Some(new_head) = head_block_receiver.next().await { + let new_head = if let Some(new_head) = new_head { + new_head + } else { + continue; + }; + // TODO: what should the payload for RequestMetadata be? let request_metadata = Arc::new(RequestMetadata::new(REQUEST_PERIOD, 0).unwrap()); @@ -72,7 +78,7 @@ impl Web3ProxyApp { "params": { "subscription": subscription_id, // TODO: option to include full transaction objects instead of just the hashes? 
- "result": new_head.as_ref(), + "result": new_head.block, }, }); diff --git a/web3_proxy/src/atomics.rs b/web3_proxy/src/atomics.rs new file mode 100644 index 00000000..8b0e8e5e --- /dev/null +++ b/web3_proxy/src/atomics.rs @@ -0,0 +1,22 @@ +use std::sync::atomic::{AtomicU64, Ordering}; + +pub struct AtomicF64 { + storage: AtomicU64, +} + +impl AtomicF64 { + pub fn new(value: f64) -> Self { + let as_u64 = value.to_bits(); + Self { + storage: AtomicU64::new(as_u64), + } + } + pub fn store(&self, value: f64, ordering: Ordering) { + let as_u64 = value.to_bits(); + self.storage.store(as_u64, ordering) + } + pub fn load(&self, ordering: Ordering) -> f64 { + let as_u64 = self.storage.load(ordering); + f64::from_bits(as_u64) + } +} diff --git a/web3_proxy/src/bin/web3_proxy_cli/daemon.rs b/web3_proxy/src/bin/web3_proxy_cli/daemon.rs index 62d742e5..465e545e 100644 --- a/web3_proxy/src/bin/web3_proxy_cli/daemon.rs +++ b/web3_proxy/src/bin/web3_proxy_cli/daemon.rs @@ -64,7 +64,7 @@ async fn run( )); // wait until the app has seen its first consensus head block - // TODO: if backups were included, wait a little longer + // TODO: if backups were included, wait a little longer? let _ = spawned_app.app.head_block_receiver().changed().await; // start the frontend port @@ -205,31 +205,27 @@ mod tests { ( "anvil".to_string(), Web3RpcConfig { - disabled: false, - display_name: None, - url: anvil.endpoint(), - backup: Some(false), - block_data_limit: None, + http_url: Some(anvil.endpoint()), soft_limit: 100, - hard_limit: None, tier: 0, - subscribe_txs: Some(false), - extra: Default::default(), + ..Default::default() }, ), ( "anvil_ws".to_string(), Web3RpcConfig { - disabled: false, - display_name: None, - url: anvil.ws_endpoint(), - backup: Some(false), - block_data_limit: None, + ws_url: Some(anvil.ws_endpoint()), soft_limit: 100, - hard_limit: None, tier: 0, - subscribe_txs: Some(false), - extra: Default::default(), + ..Default::default() + }, + ), + ( + "anvil_both".to_string(), + Web3RpcConfig { + http_url: Some(anvil.endpoint()), + ws_url: Some(anvil.ws_endpoint()), + ..Default::default() }, ), ]), diff --git a/web3_proxy/src/block_number.rs b/web3_proxy/src/block_number.rs index 33ef7f54..4b92d1e7 100644 --- a/web3_proxy/src/block_number.rs +++ b/web3_proxy/src/block_number.rs @@ -80,12 +80,7 @@ pub async fn clean_block_number( .context("fetching block number from hash")?; // TODO: set change to true? i think not we should probably use hashes for everything. 
- ( - block - .number - .expect("blocks here should always have numbers"), - false, - ) + (*block.number(), false) } else { return Err(anyhow::anyhow!("blockHash missing")); } @@ -132,6 +127,12 @@ pub async fn block_needed( head_block_num: U64, rpcs: &Web3Rpcs, ) -> anyhow::Result { + // some requests have potentially very large responses + // TODO: only skip caching if the response actually is large + if method.starts_with("trace_") || method == "debug_traceTransaction" { + return Ok(BlockNeeded::CacheNever); + } + let params = if let Some(params) = params { // grab the params so we can inspect and potentially modify them params @@ -215,8 +216,8 @@ pub async fn block_needed( }; return Ok(BlockNeeded::CacheRange { - from_block_num: from_block_num, - to_block_num: to_block_num, + from_block_num, + to_block_num, cache_errors: true, }); } diff --git a/web3_proxy/src/config.rs b/web3_proxy/src/config.rs index 942632e7..54456bb4 100644 --- a/web3_proxy/src/config.rs +++ b/web3_proxy/src/config.rs @@ -1,9 +1,9 @@ -use crate::rpcs::blockchain::BlockHashesCache; +use crate::app::AnyhowJoinHandle; +use crate::rpcs::blockchain::{BlockHashesCache, Web3ProxyBlock}; use crate::rpcs::one::Web3Rpc; -use crate::{app::AnyhowJoinHandle, rpcs::blockchain::ArcBlock}; use argh::FromArgs; use ethers::prelude::TxHash; -use ethers::types::U256; +use ethers::types::{U256, U64}; use hashbrown::HashMap; use log::warn; use migration::sea_orm::DatabaseConnection; @@ -11,7 +11,7 @@ use serde::Deserialize; use std::sync::Arc; use tokio::sync::broadcast; -pub type BlockAndRpc = (Option, Arc); +pub type BlockAndRpc = (Option, Arc); pub type TxHashAndRpc = (TxHash, Arc); #[derive(Debug, FromArgs)] @@ -105,6 +105,12 @@ pub struct AppConfig { pub invite_code: Option, pub login_domain: Option, + /// do not serve any requests if the best known block is older than this many seconds. + pub max_block_age: Option, + + /// do not serve any requests if the best known block is behind the best known block by more than this many blocks. + pub max_block_lag: Option, + /// Rate limit for bearer token authenticated entrypoints. /// This is separate from the rpc limits. #[serde(default = "default_bearer_token_max_concurrent_requests")] @@ -197,15 +203,19 @@ fn default_response_cache_max_bytes() -> u64 { } /// Configuration for a backend web3 RPC server -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Default, Deserialize)] pub struct Web3RpcConfig { /// simple way to disable a connection without deleting the row #[serde(default)] pub disabled: bool, /// a name used in /status and other user facing messages pub display_name: Option, - /// websocket (or http if no websocket) - pub url: String, + /// (deprecated) rpc url + pub url: Option, + /// while not absolutely required, a ws:// or wss:// connection will be able to subscribe to head blocks + pub ws_url: Option, + /// while not absolutely required, a http:// or https:// connection will allow erigon to stream JSON + pub http_url: Option, /// block data limit. If None, will be queried pub block_data_limit: Option, /// the requests per second at which the server starts slowing down @@ -213,14 +223,15 @@ pub struct Web3RpcConfig { /// the requests per second at which the server throws errors (rate limit or otherwise) pub hard_limit: Option, /// only use this rpc if everything else is lagging too far. 
this allows us to ignore fast but very low limit rpcs - pub backup: Option, + #[serde(default)] + pub backup: bool, /// All else equal, a server with a lower tier receives all requests #[serde(default = "default_tier")] pub tier: u64, /// Subscribe to the firehose of pending transactions /// Don't do this with free rpcs #[serde(default)] - pub subscribe_txs: Option, + pub subscribe_txs: bool, /// unknown config options get put here #[serde(flatten, default = "HashMap::default")] pub extra: HashMap, @@ -245,47 +256,24 @@ impl Web3RpcConfig { block_map: BlockHashesCache, block_sender: Option>, tx_id_sender: Option>, + reconnect: bool, ) -> anyhow::Result<(Arc, AnyhowJoinHandle<()>)> { if !self.extra.is_empty() { warn!("unknown Web3RpcConfig fields!: {:?}", self.extra.keys()); } - let hard_limit = match (self.hard_limit, redis_pool) { - (None, None) => None, - (Some(hard_limit), Some(redis_client_pool)) => Some((hard_limit, redis_client_pool)), - (None, Some(_)) => None, - (Some(_hard_limit), None) => { - return Err(anyhow::anyhow!( - "no redis client pool! needed for hard limit" - )) - } - }; - - let tx_id_sender = if self.subscribe_txs.unwrap_or(false) { - tx_id_sender - } else { - None - }; - - let backup = self.backup.unwrap_or(false); - Web3Rpc::spawn( + self, name, - self.display_name, chain_id, db_conn, - self.url, http_client, http_interval_sender, - hard_limit, - self.soft_limit, - backup, - self.block_data_limit, + redis_pool, block_map, block_sender, tx_id_sender, - true, - self.tier, + reconnect, ) .await } diff --git a/web3_proxy/src/lib.rs b/web3_proxy/src/lib.rs index aab98d57..e31d0972 100644 --- a/web3_proxy/src/lib.rs +++ b/web3_proxy/src/lib.rs @@ -1,6 +1,7 @@ pub mod app; pub mod app_stats; pub mod admin_queries; +pub mod atomics; pub mod block_number; pub mod config; pub mod frontend; diff --git a/web3_proxy/src/peak_ewma.rs b/web3_proxy/src/peak_ewma.rs new file mode 100644 index 00000000..9adb34d9 --- /dev/null +++ b/web3_proxy/src/peak_ewma.rs @@ -0,0 +1,397 @@ +//! Code from [tower](https://github.com/tower-rs/tower/blob/3f31ffd2cf15f1e905142e5f43ab39ac995c22ed/tower/src/load/peak_ewma.rs) +//! Measures load using the PeakEWMA response latency. +//! TODO: refactor to work with our code + +use std::task::{Context, Poll}; +use std::{ + sync::{Arc, Mutex}, + time::Duration, +}; +use tokio::time::Instant; +use tower_service::Service; +use tracing::trace; + +/// Measures the load of the underlying service using Peak-EWMA load measurement. +/// +/// [`PeakEwma`] implements [`Load`] with the [`Cost`] metric that estimates the amount of +/// pending work to an endpoint. Work is calculated by multiplying the +/// exponentially-weighted moving average (EWMA) of response latencies by the number of +/// pending requests. The Peak-EWMA algorithm is designed to be especially sensitive to +/// worst-case latencies. Over time, the peak latency value decays towards the moving +/// average of latencies to the endpoint. +/// +/// When no latency information has been measured for an endpoint, an arbitrary default +/// RTT of 1 second is used to prevent the endpoint from being overloaded before a +/// meaningful baseline can be established.. +/// +/// ## Note +/// +/// This is derived from [Finagle][finagle], which is distributed under the Apache V2 +/// license. Copyright 2017, Twitter Inc. 
+/// +/// [finagle]: +/// https://github.com/twitter/finagle/blob/9cc08d15216497bb03a1cafda96b7266cfbbcff1/finagle-core/src/main/scala/com/twitter/finagle/loadbalancer/PeakEwma.scala +#[derive(Debug)] +pub struct PeakEwma { + service: S, + decay_ns: f64, + rtt_estimate: Arc>, + completion: C, +} + +#[cfg(feature = "discover")] +pin_project! { + /// Wraps a `D`-typed stream of discovered services with `PeakEwma`. + #[cfg_attr(docsrs, doc(cfg(feature = "discover")))] + #[derive(Debug)] + pub struct PeakEwmaDiscover { + #[pin] + discover: D, + decay_ns: f64, + default_rtt: Duration, + completion: C, + } +} + +/// Represents the relative cost of communicating with a service. +/// +/// The underlying value estimates the amount of pending work to a service: the Peak-EWMA +/// latency estimate multiplied by the number of pending requests. +#[derive(Copy, Clone, Debug, PartialEq, PartialOrd)] +pub struct Cost(f64); + +/// Tracks an in-flight request and updates the RTT-estimate on Drop. +#[derive(Debug)] +pub struct Handle { + sent_at: Instant, + decay_ns: f64, + rtt_estimate: Arc>, +} + +/// Holds the current RTT estimate and the last time this value was updated. +#[derive(Debug)] +struct RttEstimate { + update_at: Instant, + rtt_ns: f64, +} + +const NANOS_PER_MILLI: f64 = 1_000_000.0; + +// ===== impl PeakEwma ===== + +impl PeakEwma { + /// Wraps an `S`-typed service so that its load is tracked by the EWMA of its peak latency. + pub fn new(service: S, default_rtt: Duration, decay_ns: f64, completion: C) -> Self { + debug_assert!(decay_ns > 0.0, "decay_ns must be positive"); + Self { + service, + decay_ns, + rtt_estimate: Arc::new(Mutex::new(RttEstimate::new(nanos(default_rtt)))), + completion, + } + } + + fn handle(&self) -> Handle { + Handle { + decay_ns: self.decay_ns, + sent_at: Instant::now(), + rtt_estimate: self.rtt_estimate.clone(), + } + } +} + +impl Service for PeakEwma +where + S: Service, + C: TrackCompletion, +{ + type Response = C::Output; + type Error = S::Error; + type Future = TrackCompletionFuture; + + fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { + self.service.poll_ready(cx) + } + + fn call(&mut self, req: Request) -> Self::Future { + TrackCompletionFuture::new( + self.completion.clone(), + self.handle(), + self.service.call(req), + ) + } +} + +impl Load for PeakEwma { + type Metric = Cost; + + fn load(&self) -> Self::Metric { + let pending = Arc::strong_count(&self.rtt_estimate) as u32 - 1; + + // Update the RTT estimate to account for decay since the last update. + // If an estimate has not been established, a default is provided + let estimate = self.update_estimate(); + + let cost = Cost(estimate * f64::from(pending + 1)); + trace!( + "load estimate={:.0}ms pending={} cost={:?}", + estimate / NANOS_PER_MILLI, + pending, + cost, + ); + cost + } +} + +impl PeakEwma { + fn update_estimate(&self) -> f64 { + let mut rtt = self.rtt_estimate.lock().expect("peak ewma prior_estimate"); + rtt.decay(self.decay_ns) + } +} + +// ===== impl PeakEwmaDiscover ===== + +#[cfg(feature = "discover")] +impl PeakEwmaDiscover { + /// Wraps a `D`-typed [`Discover`] so that services have a [`PeakEwma`] load metric. + /// + /// The provided `default_rtt` is used as the default RTT estimate for newly + /// added services. + /// + /// They `decay` value determines over what time period a RTT estimate should + /// decay. 
+ pub fn new(discover: D, default_rtt: Duration, decay: Duration, completion: C) -> Self + where + D: Discover, + D::Service: Service, + C: TrackCompletion>::Response>, + { + PeakEwmaDiscover { + discover, + decay_ns: nanos(decay), + default_rtt, + completion, + } + } +} + +#[cfg(feature = "discover")] +impl Stream for PeakEwmaDiscover +where + D: Discover, + C: Clone, +{ + type Item = Result>, D::Error>; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.project(); + let change = match ready!(this.discover.poll_discover(cx)).transpose()? { + None => return Poll::Ready(None), + Some(Change::Remove(k)) => Change::Remove(k), + Some(Change::Insert(k, svc)) => { + let peak_ewma = PeakEwma::new( + svc, + *this.default_rtt, + *this.decay_ns, + this.completion.clone(), + ); + Change::Insert(k, peak_ewma) + } + }; + + Poll::Ready(Some(Ok(change))) + } +} + +// ===== impl RttEstimate ===== + +impl RttEstimate { + fn new(rtt_ns: f64) -> Self { + debug_assert!(0.0 < rtt_ns, "rtt must be positive"); + Self { + rtt_ns, + update_at: Instant::now(), + } + } + + /// Decays the RTT estimate with a decay period of `decay_ns`. + fn decay(&mut self, decay_ns: f64) -> f64 { + // Updates with a 0 duration so that the estimate decays towards 0. + let now = Instant::now(); + self.update(now, now, decay_ns) + } + + /// Updates the Peak-EWMA RTT estimate. + /// + /// The elapsed time from `sent_at` to `recv_at` is added + fn update(&mut self, sent_at: Instant, recv_at: Instant, decay_ns: f64) -> f64 { + debug_assert!( + sent_at <= recv_at, + "recv_at={:?} after sent_at={:?}", + recv_at, + sent_at + ); + let rtt = nanos(recv_at.saturating_duration_since(sent_at)); + + let now = Instant::now(); + debug_assert!( + self.update_at <= now, + "update_at={:?} in the future", + self.update_at + ); + + self.rtt_ns = if self.rtt_ns < rtt { + // For Peak-EWMA, always use the worst-case (peak) value as the estimate for + // subsequent requests. + trace!( + "update peak rtt={}ms prior={}ms", + rtt / NANOS_PER_MILLI, + self.rtt_ns / NANOS_PER_MILLI, + ); + rtt + } else { + // When an RTT is observed that is less than the estimated RTT, we decay the + // prior estimate according to how much time has elapsed since the last + // update. The inverse of the decay is used to scale the estimate towards the + // observed RTT value. + let elapsed = nanos(now.saturating_duration_since(self.update_at)); + let decay = (-elapsed / decay_ns).exp(); + let recency = 1.0 - decay; + let next_estimate = (self.rtt_ns * decay) + (rtt * recency); + trace!( + "update rtt={:03.0}ms decay={:06.0}ns; next={:03.0}ms", + rtt / NANOS_PER_MILLI, + self.rtt_ns - next_estimate, + next_estimate / NANOS_PER_MILLI, + ); + next_estimate + }; + self.update_at = now; + + self.rtt_ns + } +} + +// ===== impl Handle ===== + +impl Drop for Handle { + fn drop(&mut self) { + let recv_at = Instant::now(); + + if let Ok(mut rtt) = self.rtt_estimate.lock() { + rtt.update(self.sent_at, recv_at, self.decay_ns); + } + } +} + +// ===== impl Cost ===== + +// Utility that converts durations to nanos in f64. +// +// Due to a lossy transformation, the maximum value that can be represented is ~585 years, +// which, I hope, is more than enough to represent request latencies. 
+fn nanos(d: Duration) -> f64 { + const NANOS_PER_SEC: u64 = 1_000_000_000; + let n = f64::from(d.subsec_nanos()); + let s = d.as_secs().saturating_mul(NANOS_PER_SEC) as f64; + n + s +} + +#[cfg(test)] +mod tests { + use futures_util::future; + use std::time::Duration; + use tokio::time; + use tokio_test::{assert_ready, assert_ready_ok, task}; + + use super::*; + + struct Svc; + impl Service<()> for Svc { + type Response = (); + type Error = (); + type Future = future::Ready>; + + fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, (): ()) -> Self::Future { + future::ok(()) + } + } + + /// The default RTT estimate decays, so that new nodes are considered if the + /// default RTT is too high. + #[tokio::test] + async fn default_decay() { + time::pause(); + + let svc = PeakEwma::new( + Svc, + Duration::from_millis(10), + NANOS_PER_MILLI * 1_000.0, + CompleteOnResponse, + ); + let Cost(load) = svc.load(); + assert_eq!(load, 10.0 * NANOS_PER_MILLI); + + time::advance(Duration::from_millis(100)).await; + let Cost(load) = svc.load(); + assert!(9.0 * NANOS_PER_MILLI < load && load < 10.0 * NANOS_PER_MILLI); + + time::advance(Duration::from_millis(100)).await; + let Cost(load) = svc.load(); + assert!(8.0 * NANOS_PER_MILLI < load && load < 9.0 * NANOS_PER_MILLI); + } + + // The default RTT estimate decays, so that new nodes are considered if the default RTT is too + // high. + #[tokio::test] + async fn compound_decay() { + time::pause(); + + let mut svc = PeakEwma::new( + Svc, + Duration::from_millis(20), + NANOS_PER_MILLI * 1_000.0, + CompleteOnResponse, + ); + assert_eq!(svc.load(), Cost(20.0 * NANOS_PER_MILLI)); + + time::advance(Duration::from_millis(100)).await; + let mut rsp0 = task::spawn(svc.call(())); + assert!(svc.load() > Cost(20.0 * NANOS_PER_MILLI)); + + time::advance(Duration::from_millis(100)).await; + let mut rsp1 = task::spawn(svc.call(())); + assert!(svc.load() > Cost(40.0 * NANOS_PER_MILLI)); + + time::advance(Duration::from_millis(100)).await; + let () = assert_ready_ok!(rsp0.poll()); + assert_eq!(svc.load(), Cost(400_000_000.0)); + + time::advance(Duration::from_millis(100)).await; + let () = assert_ready_ok!(rsp1.poll()); + assert_eq!(svc.load(), Cost(200_000_000.0)); + + // Check that values decay as time elapses + time::advance(Duration::from_secs(1)).await; + assert!(svc.load() < Cost(100_000_000.0)); + + time::advance(Duration::from_secs(10)).await; + assert!(svc.load() < Cost(100_000.0)); + } + + #[test] + fn nanos() { + assert_eq!(super::nanos(Duration::new(0, 0)), 0.0); + assert_eq!(super::nanos(Duration::new(0, 123)), 123.0); + assert_eq!(super::nanos(Duration::new(1, 23)), 1_000_000_023.0); + assert_eq!( + super::nanos(Duration::new(::std::u64::MAX, 999_999_999)), + 18446744074709553000.0 + ); + } +} diff --git a/web3_proxy/src/rpcs/blockchain.rs b/web3_proxy/src/rpcs/blockchain.rs index ce79d76a..cd8957f5 100644 --- a/web3_proxy/src/rpcs/blockchain.rs +++ b/web3_proxy/src/rpcs/blockchain.rs @@ -1,16 +1,15 @@ +use super::consensus::ConsensusFinder; use super::many::Web3Rpcs; ///! Keep track of the blockchain as seen by a Web3Rpcs. 
use super::one::Web3Rpc; use super::transactions::TxStatus; use crate::frontend::authorization::Authorization; -use crate::{ - config::BlockAndRpc, jsonrpc::JsonRpcRequest, rpcs::synced_connections::ConsensusWeb3Rpcs, -}; -use anyhow::Context; +use crate::{config::BlockAndRpc, jsonrpc::JsonRpcRequest}; +use anyhow::{anyhow, Context}; use derive_more::From; use ethers::prelude::{Block, TxHash, H256, U64}; -use hashbrown::{HashMap, HashSet}; -use log::{debug, error, warn, Level}; +use hashbrown::HashSet; +use log::{debug, error, trace, warn, Level}; use moka::future::Cache; use serde::Serialize; use serde_json::json; @@ -22,17 +21,18 @@ use tokio::time::Duration; // TODO: type for Hydrated Blocks with their full transactions? pub type ArcBlock = Arc>; -pub type BlockHashesCache = Cache; +pub type BlockHashesCache = Cache; /// A block and its age. #[derive(Clone, Debug, Default, From, Serialize)] -pub struct SavedBlock { +pub struct Web3ProxyBlock { pub block: ArcBlock, /// number of seconds this block was behind the current time when received - pub age: u64, + /// this is only set if the block is from a subscription + pub received_age: Option, } -impl PartialEq for SavedBlock { +impl PartialEq for Web3ProxyBlock { fn eq(&self, other: &Self) -> bool { match (self.block.hash, other.block.hash) { (None, None) => true, @@ -43,18 +43,27 @@ impl PartialEq for SavedBlock { } } -impl SavedBlock { - pub fn new(block: ArcBlock) -> Self { - let mut x = Self { block, age: 0 }; +impl Web3ProxyBlock { + /// A new block has arrived over a subscription + pub fn try_new(block: ArcBlock) -> Option { + if block.number.is_none() || block.hash.is_none() { + return None; + } + + let mut x = Self { + block, + received_age: None, + }; // no need to recalulate lag every time // if the head block gets too old, a health check restarts this connection - x.age = x.lag(); + // TODO: emit a stat for received_age + x.received_age = Some(x.age()); - x + Some(x) } - pub fn lag(&self) -> u64 { + pub fn age(&self) -> u64 { let now = SystemTime::now() .duration_since(UNIX_EPOCH) .expect("there should always be time"); @@ -70,37 +79,66 @@ impl SavedBlock { } } - pub fn hash(&self) -> H256 { - self.block.hash.expect("saved blocks must have a hash") + #[inline(always)] + pub fn parent_hash(&self) -> &H256 { + &self.block.parent_hash } - // TODO: return as U64 or u64? 
- pub fn number(&self) -> U64 { - self.block.number.expect("saved blocks must have a number") + #[inline(always)] + pub fn hash(&self) -> &H256 { + self.block + .hash + .as_ref() + .expect("saved blocks must have a hash") + } + + #[inline(always)] + pub fn number(&self) -> &U64 { + self.block + .number + .as_ref() + .expect("saved blocks must have a number") } } -impl From for SavedBlock { - fn from(x: ArcBlock) -> Self { - SavedBlock::new(x) +impl TryFrom for Web3ProxyBlock { + type Error = anyhow::Error; + + fn try_from(x: ArcBlock) -> Result { + if x.number.is_none() || x.hash.is_none() { + return Err(anyhow!("Blocks here must have a number of hash")); + } + + let b = Web3ProxyBlock { + block: x, + received_age: None, + }; + + Ok(b) } } -impl Display for SavedBlock { +impl Display for Web3ProxyBlock { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{} ({}, {}s old)", self.number(), self.hash(), self.age) + write!( + f, + "{} ({}, {}s old)", + self.number(), + self.hash(), + self.age() + ) } } impl Web3Rpcs { /// add a block to our mappings and track the heaviest chain - pub async fn save_block( + pub async fn try_cache_block( &self, - block: ArcBlock, + block: Web3ProxyBlock, heaviest_chain: bool, - ) -> anyhow::Result { + ) -> anyhow::Result { // TODO: i think we can rearrange this function to make it faster on the hot path - let block_hash = block.hash.as_ref().context("no block hash")?; + let block_hash = block.hash(); // skip Block::default() if block_hash.is_zero() { @@ -108,7 +146,7 @@ impl Web3Rpcs { return Ok(block); } - let block_num = block.number.as_ref().context("no block num")?; + let block_num = block.number(); // TODO: think more about heaviest_chain. would be better to do the check inside this function if heaviest_chain { @@ -136,7 +174,7 @@ impl Web3Rpcs { authorization: &Arc, hash: &H256, rpc: Option<&Arc>, - ) -> anyhow::Result { + ) -> anyhow::Result { // first, try to get the hash from our cache // the cache is set last, so if its here, its everywhere // TODO: use try_get_with @@ -147,16 +185,24 @@ impl Web3Rpcs { // block not in cache. we need to ask an rpc for it let get_block_params = (*hash, false); // TODO: if error, retry? - let block: ArcBlock = match rpc { + let block: Web3ProxyBlock = match rpc { Some(rpc) => rpc - .wait_for_request_handle(authorization, Some(Duration::from_secs(30)), false) + .wait_for_request_handle(authorization, Some(Duration::from_secs(30)), None) .await? - .request::<_, Option<_>>( + .request::<_, Option>( "eth_getBlockByHash", &json!(get_block_params), Level::Error.into(), + None, ) .await? + .and_then(|x| { + if x.number.is_none() { + None + } else { + x.try_into().ok() + } + }) .context("no block!")?, None => { // TODO: helper for method+params => JsonRpcRequest @@ -167,20 +213,29 @@ impl Web3Rpcs { // TODO: request_metadata? maybe we should put it in the authorization? // TODO: think more about this wait_for_sync let response = self - .try_send_best_consensus_head_connection(authorization, request, None, None) + .try_send_best_consensus_head_connection( + authorization, + request, + None, + None, + None, + ) .await?; let block = response.result.context("failed fetching block")?; let block: Option = serde_json::from_str(block.get())?; - block.context("no block!")? + let block: ArcBlock = block.context("no block in the response")?; + + // TODO: received time is going to be weird + Web3ProxyBlock::try_from(block)? 
} }; // the block was fetched using eth_getBlockByHash, so it should have all fields // TODO: fill in heaviest_chain! if the block is old enough, is this definitely true? - let block = self.save_block(block, false).await?; + let block = self.try_cache_block(block, false).await?; Ok(block) } @@ -193,7 +248,7 @@ impl Web3Rpcs { ) -> anyhow::Result<(H256, u64)> { let (block, block_depth) = self.cannonical_block(authorization, num).await?; - let hash = block.hash.expect("Saved blocks should always have hashes"); + let hash = *block.hash(); Ok((hash, block_depth)) } @@ -204,7 +259,7 @@ impl Web3Rpcs { &self, authorization: &Arc, num: &U64, - ) -> anyhow::Result<(ArcBlock, u64)> { + ) -> anyhow::Result<(Web3ProxyBlock, u64)> { // we only have blocks by hash now // maybe save them during save_block in a blocks_by_number Cache> // if theres multiple, use petgraph to find the one on the main chain (and remove the others if they have enough confirmations) @@ -216,28 +271,27 @@ impl Web3Rpcs { .clone(); // be sure the requested block num exists - let mut head_block_num = consensus_head_receiver.borrow_and_update().number; + // TODO: is this okay? what if we aren't synced?! + let mut head_block_num = *consensus_head_receiver + .borrow_and_update() + .as_ref() + .context("no consensus head block")? + .number(); loop { - if let Some(head_block_num) = head_block_num { - if num <= &head_block_num { - break; - } + if num <= &head_block_num { + break; } + trace!("waiting for future block {} > {}", num, head_block_num); consensus_head_receiver.changed().await?; - head_block_num = consensus_head_receiver.borrow_and_update().number; + if let Some(head) = consensus_head_receiver.borrow_and_update().as_ref() { + head_block_num = *head.number(); + } } - let head_block_num = - head_block_num.expect("we should only get here if we have a head block"); - - let block_depth = if num >= &head_block_num { - 0 - } else { - (head_block_num - num).as_u64() - }; + let block_depth = (head_block_num - num).as_u64(); // try to get the hash from our cache // deref to not keep the lock open @@ -258,7 +312,7 @@ impl Web3Rpcs { // TODO: request_metadata or authorization? // we don't actually set min_block_needed here because all nodes have all blocks let response = self - .try_send_best_consensus_head_connection(authorization, request, None, None) + .try_send_best_consensus_head_connection(authorization, request, None, None, None) .await?; if let Some(err) = response.error { @@ -269,8 +323,10 @@ impl Web3Rpcs { let block: ArcBlock = serde_json::from_str(raw_block.get())?; + let block = Web3ProxyBlock::try_from(block)?; + // the block was fetched using eth_getBlockByNumber, so it should have all fields and be on the heaviest chain - let block = self.save_block(block, true).await?; + let block = self.try_cache_block(block, true).await?; Ok((block, block_depth)) } @@ -281,18 +337,25 @@ impl Web3Rpcs { block_receiver: flume::Receiver, // TODO: document that this is a watch sender and not a broadcast! if things get busy, blocks might get missed // Geth's subscriptions have the same potential for skipping blocks. - head_block_sender: watch::Sender, + head_block_sender: watch::Sender>, pending_tx_sender: Option>, ) -> anyhow::Result<()> { // TODO: indexmap or hashmap? what hasher? with_capacity? // TODO: this will grow unbounded. prune old heads on this at the same time we prune the graph? 
- let mut connection_heads = ConsensusFinder::default(); + let configured_tiers: Vec = self + .by_name + .values() + .map(|x| x.tier) + .collect::>() + .into_iter() + .collect(); + + let mut connection_heads = + ConsensusFinder::new(&configured_tiers, self.max_block_age, self.max_block_lag); loop { match block_receiver.recv_async().await { Ok((new_block, rpc)) => { - let new_block = new_block.map(Into::into); - let rpc_name = rpc.name.clone(); if let Err(err) = self @@ -306,7 +369,7 @@ impl Web3Rpcs { ) .await { - warn!("unable to process block from rpc {}: {:?}", rpc_name, err); + warn!("unable to process block from rpc {}: {:#?}", rpc_name, err); } } Err(err) => { @@ -324,63 +387,77 @@ impl Web3Rpcs { &self, authorization: &Arc, consensus_finder: &mut ConsensusFinder, - rpc_head_block: Option, + rpc_head_block: Option, rpc: Arc, - head_block_sender: &watch::Sender, + head_block_sender: &watch::Sender>, pending_tx_sender: &Option>, ) -> anyhow::Result<()> { // TODO: how should we handle an error here? if !consensus_finder .update_rpc(rpc_head_block.clone(), rpc.clone(), self) - .await? + .await + .context("failed to update rpc")? { - // nothing changed. no need + // nothing changed. no need to scan for a new consensus head return Ok(()); } let new_synced_connections = consensus_finder .best_consensus_connections(authorization, self) - .await; + .await + .context("no consensus head block!") + .map_err(|err| { + self.watch_consensus_rpcs_sender + .send_replace(Arc::new(Default::default())); + + err + })?; // TODO: what should we do if the block number of new_synced_connections is < old_synced_connections? wait? - let includes_backups = new_synced_connections.includes_backups; + let consensus_tier = new_synced_connections.tier; + let total_tiers = consensus_finder.len(); + let backups_needed = new_synced_connections.backups_needed; let consensus_head_block = new_synced_connections.head_block.clone(); let num_consensus_rpcs = new_synced_connections.num_conns(); - let num_checked_rpcs = new_synced_connections.num_checked_conns; - let num_active_rpcs = consensus_finder.all.rpc_name_to_hash.len(); - let total_rpcs = self.conns.len(); + let num_active_rpcs = consensus_finder + .all_rpcs_group() + .map(|x| x.len()) + .unwrap_or_default(); + let total_rpcs = self.by_name.len(); let old_consensus_head_connections = self - .watch_consensus_connections_sender + .watch_consensus_rpcs_sender .send_replace(Arc::new(new_synced_connections)); - let includes_backups_str = if includes_backups { "B " } else { "" }; + let backups_voted_str = if backups_needed { "B " } else { "" }; - if let Some(consensus_saved_block) = consensus_head_block { + if let Some(consensus_head_block) = consensus_head_block { match &old_consensus_head_connections.head_block { None => { debug!( - "first {}{}/{}/{}/{} block={}, rpc={}", - includes_backups_str, + "first {}/{} {}{}/{}/{} block={}, rpc={}", + consensus_tier, + total_tiers, + backups_voted_str, num_consensus_rpcs, - num_checked_rpcs, num_active_rpcs, total_rpcs, - consensus_saved_block, + consensus_head_block, rpc, ); - if includes_backups { + if backups_needed { // TODO: what else should be in this error? 
warn!("Backup RPCs are in use!"); } + // this should already be cached let consensus_head_block = - self.save_block(consensus_saved_block.block, true).await?; + self.try_cache_block(consensus_head_block, true).await?; head_block_sender - .send(consensus_head_block) + .send(Some(consensus_head_block)) .context("head_block_sender sending consensus_head_block")?; } Some(old_head_block) => { @@ -389,51 +466,52 @@ impl Web3Rpcs { .map(|x| x.to_string()) .unwrap_or_else(|| "None".to_string()); - match consensus_saved_block.number().cmp(&old_head_block.number()) { + match consensus_head_block.number().cmp(&old_head_block.number()) { Ordering::Equal => { // multiple blocks with the same fork! - if consensus_saved_block.hash() == old_head_block.hash() { + if consensus_head_block.hash() == old_head_block.hash() { // no change in hash. no need to use head_block_sender // TODO: trace level if rpc is backup debug!( - "con {}{}/{}/{}/{} con={} rpc={}@{}", - includes_backups_str, + "con {}/{} {}{}/{}/{} con={} rpc={}@{}", + consensus_tier, + total_tiers, + backups_voted_str, num_consensus_rpcs, - num_checked_rpcs, num_active_rpcs, total_rpcs, - consensus_saved_block, + consensus_head_block, rpc, rpc_head_str, ) } else { // hash changed - - if includes_backups { + if backups_needed { // TODO: what else should be in this error? warn!("Backup RPCs are in use!"); } debug!( - "unc {}{}/{}/{}/{} con_head={} old={} rpc={}@{}", - includes_backups_str, + "unc {}/{} {}{}/{}/{} con_head={} old={} rpc={}@{}", + consensus_tier, + total_tiers, + backups_voted_str, num_consensus_rpcs, - num_checked_rpcs, num_active_rpcs, total_rpcs, - consensus_saved_block, + consensus_head_block, old_head_block, rpc, rpc_head_str, ); let consensus_head_block = self - .save_block(consensus_saved_block.block, true) + .try_cache_block(consensus_head_block, true) .await .context("save consensus_head_block as heaviest chain")?; head_block_sender - .send(consensus_head_block) + .send(Some(consensus_head_block)) .context("head_block_sender sending consensus_head_block")?; } } @@ -441,57 +519,59 @@ impl Web3Rpcs { // this is unlikely but possible // TODO: better log warn!( - "chain rolled back {}{}/{}/{}/{} con={} old={} rpc={}@{}", - includes_backups_str, + "chain rolled back {}/{} {}{}/{}/{} con={} old={} rpc={}@{}", + consensus_tier, + total_tiers, + backups_voted_str, num_consensus_rpcs, - num_checked_rpcs, num_active_rpcs, total_rpcs, - consensus_saved_block, + consensus_head_block, old_head_block, rpc, rpc_head_str, ); - if includes_backups { + if backups_needed { // TODO: what else should be in this error? warn!("Backup RPCs are in use!"); } // TODO: tell save_block to remove any higher block numbers from the cache. 
not needed because we have other checks on requested blocks being > head, but still seems like a good idea let consensus_head_block = self - .save_block(consensus_saved_block.block, true) + .try_cache_block(consensus_head_block, true) .await .context( "save_block sending consensus_head_block as heaviest chain", )?; head_block_sender - .send(consensus_head_block) + .send(Some(consensus_head_block)) .context("head_block_sender sending consensus_head_block")?; } Ordering::Greater => { debug!( - "new {}{}/{}/{}/{} con={} rpc={}@{}", - includes_backups_str, + "new {}/{} {}{}/{}/{} con={} rpc={}@{}", + consensus_tier, + total_tiers, + backups_voted_str, num_consensus_rpcs, - num_checked_rpcs, num_active_rpcs, total_rpcs, - consensus_saved_block, + consensus_head_block, rpc, rpc_head_str, ); - if includes_backups { + if backups_needed { // TODO: what else should be in this error? warn!("Backup RPCs are in use!"); } let consensus_head_block = - self.save_block(consensus_saved_block.block, true).await?; + self.try_cache_block(consensus_head_block, true).await?; - head_block_sender.send(consensus_head_block)?; + head_block_sender.send(Some(consensus_head_block))?; } } } @@ -502,23 +582,27 @@ impl Web3Rpcs { .map(|x| x.to_string()) .unwrap_or_else(|| "None".to_string()); - if num_checked_rpcs >= self.min_head_rpcs { + if num_active_rpcs >= self.min_head_rpcs { + // no consensus!!! error!( - "non {}{}/{}/{}/{} rpc={}@{}", - includes_backups_str, + "non {}/{} {}{}/{}/{} rpc={}@{}", + consensus_tier, + total_tiers, + backups_voted_str, num_consensus_rpcs, - num_checked_rpcs, num_active_rpcs, total_rpcs, rpc, rpc_head_str, ); } else { + // no consensus, but we do not have enough rpcs connected yet to panic debug!( - "non {}{}/{}/{}/{} rpc={}@{}", - includes_backups_str, + "non {}/{} {}{}/{}/{} rpc={}@{}", + consensus_tier, + total_tiers, + backups_voted_str, num_consensus_rpcs, - num_checked_rpcs, num_active_rpcs, total_rpcs, rpc, @@ -530,403 +614,3 @@ impl Web3Rpcs { Ok(()) } } - -struct ConnectionsGroup { - /// TODO: this group might not actually include backups, but they were at leastchecked - includes_backups: bool, - rpc_name_to_hash: HashMap, -} - -impl ConnectionsGroup { - fn new(with_backups: bool) -> Self { - Self { - includes_backups: with_backups, - rpc_name_to_hash: Default::default(), - } - } - - fn without_backups() -> Self { - Self::new(false) - } - - fn with_backups() -> Self { - Self::new(true) - } - - fn remove(&mut self, rpc: &Web3Rpc) -> Option { - self.rpc_name_to_hash.remove(rpc.name.as_str()) - } - - fn insert(&mut self, rpc: &Web3Rpc, block_hash: H256) -> Option { - self.rpc_name_to_hash.insert(rpc.name.clone(), block_hash) - } - - // TODO: i don't love having this here. move to web3_connections? - async fn get_block_from_rpc( - &self, - rpc_name: &str, - hash: &H256, - authorization: &Arc, - web3_rpcs: &Web3Rpcs, - ) -> anyhow::Result { - // // TODO: why does this happen?!?! seems to only happen with uncled blocks - // // TODO: maybe we should do try_get_with? - // // TODO: maybe we should just continue. this only seems to happen when an older block is received - // warn!( - // "Missing connection_head_block in block_hashes. Fetching now. hash={}. other={}", - // connection_head_hash, conn_name - // ); - - // this option should almost always be populated. if the connection reconnects at a bad time it might not be available though - // TODO: if this is None, I think we should error. 
- let rpc = web3_rpcs.conns.get(rpc_name); - - web3_rpcs.block(authorization, hash, rpc).await - } - - // TODO: do this during insert/remove? - pub(self) async fn highest_block( - &self, - authorization: &Arc, - web3_rpcs: &Web3Rpcs, - ) -> Option { - let mut checked_heads = HashSet::with_capacity(self.rpc_name_to_hash.len()); - let mut highest_block = None::; - - for (rpc_name, rpc_head_hash) in self.rpc_name_to_hash.iter() { - // don't waste time checking the same hash multiple times - if checked_heads.contains(rpc_head_hash) { - continue; - } - - let rpc_block = match self - .get_block_from_rpc(rpc_name, rpc_head_hash, authorization, web3_rpcs) - .await - { - Ok(x) => x, - Err(err) => { - warn!( - "failed getting block {} from {} while finding highest block number: {:?}", - rpc_head_hash, rpc_name, err, - ); - continue; - } - }; - - checked_heads.insert(rpc_head_hash); - - // if this is the first block we've tried - // or if this rpc's newest block has a higher number - // we used to check total difficulty, but that isn't a thing anymore on ETH - // TODO: we still need total difficulty on some other PoW chains. whats annoying is it isn't considered part of the "block header" just the block. so websockets don't return it - let highest_num = highest_block - .as_ref() - .map(|x| x.number.expect("blocks here should always have a number")); - let rpc_num = rpc_block.as_ref().number; - - if rpc_num > highest_num { - highest_block = Some(rpc_block); - } - } - - highest_block - } - - pub(self) async fn consensus_head_connections( - &self, - authorization: &Arc, - web3_rpcs: &Web3Rpcs, - ) -> anyhow::Result { - let mut maybe_head_block = match self.highest_block(authorization, web3_rpcs).await { - None => return Err(anyhow::anyhow!("No blocks known")), - Some(x) => x, - }; - - let num_known = self.rpc_name_to_hash.len(); - - // track rpcs on this heaviest chain so we can build a new ConsensusConnections - let mut highest_rpcs = HashSet::<&str>::new(); - // a running total of the soft limits covered by the rpcs that agree on the head block - let mut highest_rpcs_sum_soft_limit: u32 = 0; - // TODO: also track highest_rpcs_sum_hard_limit? llama doesn't need this, so it can wait - - // check the highest work block for a set of rpcs that can serve our request load - // if it doesn't have enough rpcs for our request load, check the parent block - // TODO: loop for how many parent blocks? we don't want to serve blocks that are too far behind. probably different per chain - // TODO: this loop is pretty long. any way to clean up this code? - for _ in 0..6 { - let maybe_head_hash = maybe_head_block - .hash - .as_ref() - .expect("blocks here always need hashes"); - - // find all rpcs with maybe_head_block as their current head - for (rpc_name, rpc_head_hash) in self.rpc_name_to_hash.iter() { - if rpc_head_hash != maybe_head_hash { - // connection is not on the desired block - continue; - } - if highest_rpcs.contains(rpc_name.as_str()) { - // connection is on a child block - continue; - } - - if let Some(rpc) = web3_rpcs.conns.get(rpc_name.as_str()) { - highest_rpcs.insert(rpc_name); - highest_rpcs_sum_soft_limit += rpc.soft_limit; - } else { - // i don't think this is an error. 
i think its just if a reconnect is currently happening - warn!("connection missing: {}", rpc_name); - debug!("web3_rpcs.conns: {:#?}", web3_rpcs.conns); - } - } - - if highest_rpcs_sum_soft_limit >= web3_rpcs.min_sum_soft_limit - && highest_rpcs.len() >= web3_rpcs.min_head_rpcs - { - // we have enough servers with enough requests - break; - } - - // not enough rpcs yet. check the parent block - if let Some(parent_block) = web3_rpcs.block_hashes.get(&maybe_head_block.parent_hash) { - // trace!( - // child=%maybe_head_hash, parent=%parent_block.hash.unwrap(), "avoiding thundering herd", - // ); - - maybe_head_block = parent_block; - continue; - } else { - if num_known < web3_rpcs.min_head_rpcs { - return Err(anyhow::anyhow!( - "not enough rpcs connected: {}/{}/{}", - highest_rpcs.len(), - num_known, - web3_rpcs.min_head_rpcs, - )); - } else { - let soft_limit_percent = (highest_rpcs_sum_soft_limit as f32 - / web3_rpcs.min_sum_soft_limit as f32) - * 100.0; - - return Err(anyhow::anyhow!( - "ran out of parents to check. rpcs {}/{}/{}. soft limit: {:.2}% ({}/{})", - highest_rpcs.len(), - num_known, - web3_rpcs.min_head_rpcs, - highest_rpcs_sum_soft_limit, - web3_rpcs.min_sum_soft_limit, - soft_limit_percent, - )); - } - } - } - - // TODO: if consensus_head_rpcs.is_empty, try another method of finding the head block. will need to change the return Err above into breaks. - - // we've done all the searching for the heaviest block that we can - if highest_rpcs.len() < web3_rpcs.min_head_rpcs - || highest_rpcs_sum_soft_limit < web3_rpcs.min_sum_soft_limit - { - // if we get here, not enough servers are synced. return an error - let soft_limit_percent = - (highest_rpcs_sum_soft_limit as f32 / web3_rpcs.min_sum_soft_limit as f32) * 100.0; - - return Err(anyhow::anyhow!( - "Not enough resources. rpcs {}/{}/{}. soft limit: {:.2}% ({}/{})", - highest_rpcs.len(), - num_known, - web3_rpcs.min_head_rpcs, - highest_rpcs_sum_soft_limit, - web3_rpcs.min_sum_soft_limit, - soft_limit_percent, - )); - } - - // success! this block has enough soft limit and nodes on it (or on later blocks) - let conns: Vec> = highest_rpcs - .into_iter() - .filter_map(|conn_name| web3_rpcs.conns.get(conn_name).cloned()) - .collect(); - - // TODO: DEBUG only check - let _ = maybe_head_block - .hash - .expect("head blocks always have hashes"); - let _ = maybe_head_block - .number - .expect("head blocks always have numbers"); - - let consensus_head_block: SavedBlock = maybe_head_block.into(); - - Ok(ConsensusWeb3Rpcs { - head_block: Some(consensus_head_block), - conns, - num_checked_conns: self.rpc_name_to_hash.len(), - includes_backups: self.includes_backups, - }) - } -} - -/// A ConsensusConnections builder that tracks all connection heads across multiple groups of servers -pub struct ConsensusFinder { - /// only main servers - main: ConnectionsGroup, - /// main and backup servers - all: ConnectionsGroup, -} - -impl Default for ConsensusFinder { - fn default() -> Self { - Self { - main: ConnectionsGroup::without_backups(), - all: ConnectionsGroup::with_backups(), - } - } -} - -impl ConsensusFinder { - fn remove(&mut self, rpc: &Web3Rpc) -> Option { - // TODO: should we have multiple backup tiers? (remote datacenters vs third party) - if !rpc.backup { - self.main.remove(rpc); - } - self.all.remove(rpc) - } - - fn insert(&mut self, rpc: &Web3Rpc, new_hash: H256) -> Option { - // TODO: should we have multiple backup tiers? 
(remote datacenters vs third party) - if !rpc.backup { - self.main.insert(rpc, new_hash); - } - self.all.insert(rpc, new_hash) - } - - /// Update our tracking of the rpc and return true if something changed - async fn update_rpc( - &mut self, - rpc_head_block: Option, - rpc: Arc, - // we need this so we can save the block to caches. i don't like it though. maybe we should use a lazy_static Cache wrapper that has a "save_block" method?. i generally dislike globals but i also dislike all the types having to pass eachother around - web3_connections: &Web3Rpcs, - ) -> anyhow::Result { - // add the rpc's block to connection_heads, or remove the rpc from connection_heads - let changed = match rpc_head_block { - Some(mut rpc_head_block) => { - // we don't know if its on the heaviest chain yet - rpc_head_block.block = web3_connections - .save_block(rpc_head_block.block, false) - .await?; - - // we used to remove here if the block was too far behind. but it just made things more complicated - - let rpc_head_hash = rpc_head_block.hash(); - - if let Some(prev_hash) = self.insert(&rpc, rpc_head_hash) { - if prev_hash == rpc_head_hash { - // this block was already sent by this rpc. return early - false - } else { - // new block for this rpc - true - } - } else { - // first block for this rpc - true - } - } - None => { - if self.remove(&rpc).is_none() { - // this rpc was already removed - false - } else { - // rpc head changed from being synced to not - true - } - } - }; - - Ok(changed) - } - - // TODO: this could definitely be cleaner. i don't like the error handling/unwrapping - async fn best_consensus_connections( - &mut self, - authorization: &Arc, - web3_connections: &Web3Rpcs, - ) -> ConsensusWeb3Rpcs { - let highest_block_num = match self - .all - .highest_block(authorization, web3_connections) - .await - { - None => { - return ConsensusWeb3Rpcs::default(); - } - Some(x) => x.number.expect("blocks here should always have a number"), - }; - - // TODO: also needs to be not less than our current head - let mut min_block_num = highest_block_num.saturating_sub(U64::from(5)); - - // we also want to be sure we don't ever go backwards! - if let Some(current_consensus_head_num) = web3_connections.head_block_num() { - min_block_num = min_block_num.max(current_consensus_head_num); - } - - // TODO: pass `min_block_num` to consensus_head_connections? - let consensus_head_for_main = self - .main - .consensus_head_connections(authorization, web3_connections) - .await - .map_err(|err| err.context("cannot use main group")); - - let consensus_num_for_main = consensus_head_for_main - .as_ref() - .ok() - .map(|x| x.head_block.as_ref().unwrap().number()); - - if let Some(consensus_num_for_main) = consensus_num_for_main { - if consensus_num_for_main >= min_block_num { - return consensus_head_for_main.unwrap(); - } - } - - // TODO: pass `min_block_num` to consensus_head_connections? 
- let consensus_connections_for_all = match self - .all - .consensus_head_connections(authorization, web3_connections) - .await - { - Err(err) => { - if self.all.rpc_name_to_hash.len() < web3_connections.min_head_rpcs { - debug!("No consensus head yet: {}", err); - } - return ConsensusWeb3Rpcs::default(); - } - Ok(x) => x, - }; - - let consensus_num_for_all = consensus_connections_for_all - .head_block - .as_ref() - .map(|x| x.number()); - - if consensus_num_for_all > consensus_num_for_main { - if consensus_num_for_all < Some(min_block_num) { - // TODO: this should have an alarm in sentry - error!("CONSENSUS HEAD w/ BACKUP NODES IS VERY OLD!"); - } - consensus_connections_for_all - } else { - if let Ok(x) = consensus_head_for_main { - error!("CONSENSUS HEAD IS VERY OLD! Backup RPCs did not improve this situation"); - x - } else { - // TODO: i don't think we need this error. and i doublt we'll ever even get here - error!("NO CONSENSUS HEAD!"); - ConsensusWeb3Rpcs::default() - } - } - } -} diff --git a/web3_proxy/src/rpcs/consensus.rs b/web3_proxy/src/rpcs/consensus.rs new file mode 100644 index 00000000..a348b9d6 --- /dev/null +++ b/web3_proxy/src/rpcs/consensus.rs @@ -0,0 +1,588 @@ +use crate::frontend::authorization::Authorization; + +use super::blockchain::Web3ProxyBlock; +use super::many::Web3Rpcs; +use super::one::Web3Rpc; +use anyhow::Context; +use ethers::prelude::{H256, U64}; +use hashbrown::{HashMap, HashSet}; +use log::{debug, trace, warn}; +use moka::future::Cache; +use serde::Serialize; +use std::collections::BTreeMap; +use std::fmt; +use std::sync::Arc; +use tokio::time::Instant; + +/// A collection of Web3Rpcs that are on the same block. +/// Serialize is so we can print it on our debug endpoint +#[derive(Clone, Default, Serialize)] +pub struct ConsensusWeb3Rpcs { + pub(super) tier: u64, + pub(super) head_block: Option, + // TODO: this should be able to serialize, but it isn't + #[serde(skip_serializing)] + pub(super) rpcs: Vec>, + pub(super) backups_voted: Option, + pub(super) backups_needed: bool, +} + +impl ConsensusWeb3Rpcs { + pub fn num_conns(&self) -> usize { + self.rpcs.len() + } + + pub fn sum_soft_limit(&self) -> u32 { + self.rpcs.iter().fold(0, |sum, rpc| sum + rpc.soft_limit) + } + + // TODO: sum_hard_limit? +} + +impl fmt::Debug for ConsensusWeb3Rpcs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // TODO: the default formatter takes forever to write. this is too quiet though + // TODO: print the actual conns? + f.debug_struct("ConsensusConnections") + .field("head_block", &self.head_block) + .field("num_conns", &self.rpcs.len()) + .finish_non_exhaustive() + } +} + +impl Web3Rpcs { + // TODO: return a ref? + pub fn head_block(&self) -> Option { + self.watch_consensus_head_receiver + .as_ref() + .and_then(|x| x.borrow().clone()) + } + + // TODO: return a ref? + pub fn head_block_hash(&self) -> Option { + self.head_block().map(|x| *x.hash()) + } + + // TODO: return a ref? + pub fn head_block_num(&self) -> Option { + self.head_block().map(|x| *x.number()) + } + + pub fn synced(&self) -> bool { + !self.watch_consensus_rpcs_sender.borrow().rpcs.is_empty() + } + + pub fn num_synced_rpcs(&self) -> usize { + self.watch_consensus_rpcs_sender.borrow().rpcs.len() + } +} + +type FirstSeenCache = Cache; + +pub struct ConnectionsGroup { + rpc_name_to_block: HashMap, + // TODO: what if there are two blocks with the same number? + highest_block: Option, + /// used to track rpc.head_latency. 
The same cache should be shared between all ConnectionsGroups + first_seen: FirstSeenCache, +} + +impl ConnectionsGroup { + pub fn new(first_seen: FirstSeenCache) -> Self { + Self { + rpc_name_to_block: Default::default(), + highest_block: Default::default(), + first_seen, + } + } + + pub fn len(&self) -> usize { + self.rpc_name_to_block.len() + } + + fn remove(&mut self, rpc_name: &str) -> Option { + if let Some(removed_block) = self.rpc_name_to_block.remove(rpc_name) { + match self.highest_block.as_mut() { + None => {} + Some(current_highest_block) => { + if removed_block.hash() == current_highest_block.hash() { + for maybe_highest_block in self.rpc_name_to_block.values() { + if maybe_highest_block.number() > current_highest_block.number() { + *current_highest_block = maybe_highest_block.clone(); + }; + } + } + } + } + + Some(removed_block) + } else { + None + } + } + + async fn insert(&mut self, rpc: &Web3Rpc, block: Web3ProxyBlock) -> Option { + let first_seen = self + .first_seen + .get_with(*block.hash(), async move { Instant::now() }) + .await; + + // TODO: this should be 0 if we are first seen, but i think it will be slightly non-zero. + // calculate elapsed time before trying to lock. + let latency = first_seen.elapsed(); + + rpc.head_latency.write().record(latency); + + // TODO: what about a reorg to the same height? + if Some(block.number()) > self.highest_block.as_ref().map(|x| x.number()) { + self.highest_block = Some(block.clone()); + } + + self.rpc_name_to_block.insert(rpc.name.clone(), block) + } + + // // TODO: do this during insert/remove? + // pub(self) async fn highest_block( + // &self, + // authorization: &Arc, + // web3_rpcs: &Web3Rpcs, + // ) -> Option { + // let mut checked_heads = HashSet::with_capacity(self.rpc_name_to_hash.len()); + // let mut highest_block = None::; + + // for (rpc_name, rpc_head_hash) in self.rpc_name_to_hash.iter() { + // // don't waste time checking the same hash multiple times + // if checked_heads.contains(rpc_head_hash) { + // continue; + // } + + // let rpc_block = match web3_rpcs + // .get_block_from_rpc(rpc_name, rpc_head_hash, authorization) + // .await + // { + // Ok(x) => x, + // Err(err) => { + // warn!( + // "failed getting block {} from {} while finding highest block number: {:?}", + // rpc_head_hash, rpc_name, err, + // ); + // continue; + // } + // }; + + // checked_heads.insert(rpc_head_hash); + + // // if this is the first block we've tried + // // or if this rpc's newest block has a higher number + // // we used to check total difficulty, but that isn't a thing anymore on ETH + // // TODO: we still need total difficulty on some other PoW chains. whats annoying is it isn't considered part of the "block header" just the block. so websockets don't return it + // let highest_num = highest_block + // .as_ref() + // .map(|x| x.number.expect("blocks here should always have a number")); + // let rpc_num = rpc_block.as_ref().number; + + // if rpc_num > highest_num { + // highest_block = Some(rpc_block); + // } + // } + + // highest_block + // } + + /// min_consensus_block_num keeps us from ever going backwards. + /// TODO: think about min_consensus_block_num more. i think this might cause an outage if the chain is doing weird things. but 503s is probably better than broken data. 
+ pub(self) async fn consensus_head_connections( + &self, + authorization: &Arc, + web3_rpcs: &Web3Rpcs, + min_consensus_block_num: Option, + tier: &u64, + ) -> anyhow::Result { + let mut maybe_head_block = match self.highest_block.clone() { + None => return Err(anyhow::anyhow!("no blocks known")), + Some(x) => x, + }; + + // TODO: take max_distance_consensus_to_highest as an argument? + // TODO: what if someone's backup node is misconfigured and goes on a really fast forked chain? + let max_lag_consensus_to_highest = + if let Some(min_consensus_block_num) = min_consensus_block_num { + maybe_head_block + .number() + .saturating_add(1.into()) + .saturating_sub(min_consensus_block_num) + .as_u64() + } else { + 10 + }; + + trace!( + "max_lag_consensus_to_highest: {}", + max_lag_consensus_to_highest + ); + + let num_known = self.rpc_name_to_block.len(); + + if num_known < web3_rpcs.min_head_rpcs { + return Err(anyhow::anyhow!( + "not enough rpcs connected: {}/{}", + num_known, + web3_rpcs.min_head_rpcs, + )); + } + + let mut primary_rpcs_voted: Option = None; + let mut backup_rpcs_voted: Option = None; + + // track rpcs on this heaviest chain so we can build a new ConsensusConnections + let mut primary_consensus_rpcs = HashSet::<&str>::new(); + let mut backup_consensus_rpcs = HashSet::<&str>::new(); + + // a running total of the soft limits covered by the rpcs that agree on the head block + let mut primary_sum_soft_limit: u32 = 0; + let mut backup_sum_soft_limit: u32 = 0; + + // TODO: also track the sum of *available* hard_limits. if any servers have no hard limits, use their soft limit or no limit? + + // check the highest work block for a set of rpcs that can serve our request load + // if it doesn't have enough rpcs for our request load, check the parent block + // TODO: loop for how many parent blocks? we don't want to serve blocks that are too far behind. probably different per chain + // TODO: this loop is pretty long. any way to clean up this code? + for _ in 0..max_lag_consensus_to_highest { + let maybe_head_hash = maybe_head_block.hash(); + + // find all rpcs with maybe_head_hash as their current head + for (rpc_name, rpc_head) in self.rpc_name_to_block.iter() { + if rpc_head.hash() != maybe_head_hash { + // connection is not on the desired block + continue; + } + if backup_consensus_rpcs.contains(rpc_name.as_str()) { + // connection is on a later block in this same chain + continue; + } + if primary_consensus_rpcs.contains(rpc_name.as_str()) { + // connection is on a later block in this same chain + continue; + } + + if let Some(rpc) = web3_rpcs.by_name.get(rpc_name.as_str()) { + if backup_rpcs_voted.is_some() { + // backups already voted for a head block. don't change it + } else { + backup_consensus_rpcs.insert(rpc_name); + backup_sum_soft_limit += rpc.soft_limit; + } + if !rpc.backup { + primary_consensus_rpcs.insert(rpc_name); + primary_sum_soft_limit += rpc.soft_limit; + } + } else { + // i don't think this is an error. i think its just if a reconnect is currently happening + warn!("connection missing: {}", rpc_name); + debug!("web3_rpcs.conns: {:#?}", web3_rpcs.by_name); + } + } + + if primary_sum_soft_limit >= web3_rpcs.min_sum_soft_limit + && primary_consensus_rpcs.len() >= web3_rpcs.min_head_rpcs + { + // we have enough servers with enough requests! yey! 
+ primary_rpcs_voted = Some(maybe_head_block.clone()); + break; + } + + if backup_rpcs_voted.is_none() + && backup_consensus_rpcs != primary_consensus_rpcs + && backup_sum_soft_limit >= web3_rpcs.min_sum_soft_limit + && backup_consensus_rpcs.len() >= web3_rpcs.min_head_rpcs + { + // if we include backup servers, we have enough servers with high enough limits + backup_rpcs_voted = Some(maybe_head_block.clone()); + } + + // not enough rpcs on this block. check the parent block + match web3_rpcs + .block(authorization, &maybe_head_block.parent_hash(), None) + .await + { + Ok(parent_block) => { + // trace!( + // child=%maybe_head_hash, parent=%parent_block.hash.unwrap(), "avoiding thundering herd. checking consensus on parent block", + // ); + maybe_head_block = parent_block.into(); + continue; + } + Err(err) => { + let soft_limit_percent = (primary_sum_soft_limit as f32 + / web3_rpcs.min_sum_soft_limit as f32) + * 100.0; + + let err_msg = format!("ran out of parents to check. rpcs {}/{}/{}. soft limit: {:.2}% ({}/{}). err: {:#?}", + primary_consensus_rpcs.len(), + num_known, + web3_rpcs.min_head_rpcs, + primary_sum_soft_limit, + web3_rpcs.min_sum_soft_limit, + soft_limit_percent, + err, + ); + + if backup_rpcs_voted.is_some() { + warn!("{}", err_msg); + break; + } else { + return Err(anyhow::anyhow!(err_msg)); + } + } + } + } + + // TODO: if consensus_head_rpcs.is_empty, try another method of finding the head block. will need to change the return Err above into breaks. + + // we've done all the searching for the heaviest block that we can + if (primary_consensus_rpcs.len() < web3_rpcs.min_head_rpcs + || primary_sum_soft_limit < web3_rpcs.min_sum_soft_limit) + && backup_rpcs_voted.is_none() + { + // if we get here, not enough servers are synced. return an error + let soft_limit_percent = + (primary_sum_soft_limit as f32 / web3_rpcs.min_sum_soft_limit as f32) * 100.0; + + return Err(anyhow::anyhow!( + "Not enough resources. rpcs {}/{}/{}. soft limit: {:.2}% ({}/{})", + primary_consensus_rpcs.len(), + num_known, + web3_rpcs.min_head_rpcs, + primary_sum_soft_limit, + web3_rpcs.min_sum_soft_limit, + soft_limit_percent, + )); + } + + // success! this block has enough soft limit and nodes on it (or on later blocks) + let rpcs: Vec> = primary_consensus_rpcs + .into_iter() + .filter_map(|conn_name| web3_rpcs.by_name.get(conn_name).cloned()) + .collect(); + + #[cfg(debug_assertions)] + let _ = maybe_head_block.hash(); + #[cfg(debug_assertions)] + let _ = maybe_head_block.number(); + + Ok(ConsensusWeb3Rpcs { + tier: *tier, + head_block: Some(maybe_head_block), + rpcs, + backups_voted: backup_rpcs_voted, + backups_needed: primary_rpcs_voted.is_none(), + }) + } +} + +/// A ConsensusConnections builder that tracks all connection heads across multiple groups of servers +pub struct ConsensusFinder { + /// backups for all tiers are only used if necessary + /// tiers[0] = only tier 0. + /// tiers[1] = tier 0 and tier 1 + /// tiers[n] = tier 0..=n + /// This is a BTreeMap and not a Vec because sometimes a tier is empty + tiers: BTreeMap, + /// never serve blocks that are too old + max_block_age: Option, + /// tier 0 will be prefered as long as the distance between it and the other tiers is <= max_tier_lag + max_block_lag: Option, +} + +impl ConsensusFinder { + pub fn new( + configured_tiers: &[u64], + max_block_age: Option, + max_block_lag: Option, + ) -> Self { + // TODO: what's a good capacity for this? 
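// --- illustrative sketch (not part of the diff): the "first seen" latency idea
// behind the shared `first_seen` cache. The first rpc to announce a block hash
// defines time zero for that hash; every later announcement of the same hash
// records its distance from that instant, which is what feeds rpc.head_latency.
// A plain HashMap stands in for the moka Cache and the EWMA update is omitted.
use std::collections::HashMap;
use std::time::{Duration, Instant};

#[derive(Default)]
struct FirstSeen {
    by_hash: HashMap<[u8; 32], Instant>,
}

impl FirstSeen {
    /// how far behind the first announcement of this hash was this report?
    /// (close to zero for the rpc that announced the block first)
    fn record(&mut self, block_hash: [u8; 32]) -> Duration {
        let first = *self.by_hash.entry(block_hash).or_insert_with(Instant::now);
        first.elapsed()
    }
}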
+ let first_seen = Cache::builder() + .max_capacity(16) + .build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default()); + + // TODO: this will need some thought when config reloading is written + let tiers = configured_tiers + .iter() + .map(|x| (*x, ConnectionsGroup::new(first_seen.clone()))) + .collect(); + + Self { + tiers, + max_block_age, + max_block_lag, + } + } + + pub fn len(&self) -> usize { + self.tiers.len() + } + + /// get the ConnectionsGroup that contains all rpcs + /// panics if there are no tiers + pub fn all_rpcs_group(&self) -> Option<&ConnectionsGroup> { + self.tiers.values().last() + } + + /// get the mutable ConnectionsGroup that contains all rpcs + pub fn all_mut(&mut self) -> Option<&mut ConnectionsGroup> { + self.tiers.values_mut().last() + } + + pub fn remove(&mut self, rpc: &Web3Rpc) -> Option { + let mut removed = None; + + for (i, tier_group) in self.tiers.iter_mut().rev() { + if i < &rpc.tier { + break; + } + let x = tier_group.remove(rpc.name.as_str()); + + if removed.is_none() && x.is_some() { + removed = x; + } + } + + removed + } + + /// returns the block that the rpc was on before updating to the new_block + pub async fn insert( + &mut self, + rpc: &Web3Rpc, + new_block: Web3ProxyBlock, + ) -> Option { + let mut old = None; + + // TODO: error if rpc.tier is not in self.tiers + + for (i, tier_group) in self.tiers.iter_mut().rev() { + if i < &rpc.tier { + break; + } + + // TODO: should new_block be a ref? + let x = tier_group.insert(rpc, new_block.clone()).await; + + if old.is_none() && x.is_some() { + old = x; + } + } + + old + } + + /// Update our tracking of the rpc and return true if something changed + pub(crate) async fn update_rpc( + &mut self, + rpc_head_block: Option, + rpc: Arc, + // we need this so we can save the block to caches. i don't like it though. maybe we should use a lazy_static Cache wrapper that has a "save_block" method?. i generally dislike globals but i also dislike all the types having to pass eachother around + web3_connections: &Web3Rpcs, + ) -> anyhow::Result { + // add the rpc's block to connection_heads, or remove the rpc from connection_heads + let changed = match rpc_head_block { + Some(mut rpc_head_block) => { + // we don't know if its on the heaviest chain yet + rpc_head_block = web3_connections + .try_cache_block(rpc_head_block, false) + .await + .context("failed caching block")?; + + // if let Some(max_block_lag) = max_block_lag { + // if rpc_head_block.number() < ??? { + // trace!("rpc_head_block from {} is too far behind! {}", rpc, rpc_head_block); + // return Ok(self.remove(&rpc).is_some()); + // } + // } + + if let Some(max_age) = self.max_block_age { + if rpc_head_block.age() > max_age { + trace!("rpc_head_block from {} is too old! {}", rpc, rpc_head_block); + return Ok(self.remove(&rpc).is_some()); + } + } + + if let Some(prev_block) = self.insert(&rpc, rpc_head_block.clone()).await { + if prev_block.hash() == rpc_head_block.hash() { + // this block was already sent by this rpc. return early + false + } else { + // new block for this rpc + true + } + } else { + // first block for this rpc + true + } + } + None => { + if self.remove(&rpc).is_none() { + // this rpc was already removed + false + } else { + // rpc head changed from being synced to not + true + } + } + }; + + Ok(changed) + } + + pub async fn best_consensus_connections( + &mut self, + authorization: &Arc, + web3_connections: &Web3Rpcs, + ) -> anyhow::Result { + // TODO: attach context to these? 
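// --- illustrative sketch (not part of the diff): how the cumulative tier groups
// behave. A tier-t rpc is tracked by every ConnectionsGroup whose key is >= t,
// so tiers[n] always holds tier 0..=n. A BTreeMap of name sets stands in for the
// real map from tier number to ConnectionsGroup.
use std::collections::{BTreeMap, HashSet};

fn insert_into_tiers(
    tiers: &mut BTreeMap<u64, HashSet<String>>,
    rpc_name: &str,
    rpc_tier: u64,
) {
    // walk from the highest tier group down, stopping below the rpc's own tier
    for (tier, group) in tiers.iter_mut().rev() {
        if *tier < rpc_tier {
            break;
        }
        group.insert(rpc_name.to_string());
    }
}

// e.g. with groups for tiers 0, 1 and 2, a tier-1 rpc lands in groups 1 and 2
// but never in group 0.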
+ let highest_known_block = self + .all_rpcs_group() + .context("no rpcs")? + .highest_block + .as_ref() + .context("no highest block")?; + + trace!("highest_known_block: {}", highest_known_block); + + let min_block_num = self + .max_block_lag + .map(|x| highest_known_block.number().saturating_sub(x)) + // we also want to be sure we don't ever go backwards! + .max(web3_connections.head_block_num()); + + trace!("min_block_num: {:#?}", min_block_num); + + // TODO Should this be a Vec>>? + // TODO: how should errors be handled? + // TODO: find the best tier with a connectionsgroup. best case, this only queries the first tier + // TODO: do we need to calculate all of them? I think having highest_known_block included as part of min_block_num should make that unnecessary + for (tier, x) in self.tiers.iter() { + trace!("checking tier {}: {:#?}", tier, x.rpc_name_to_block); + if let Ok(consensus_head_connections) = x + .consensus_head_connections(authorization, web3_connections, min_block_num, tier) + .await + { + trace!("success on tier {}", tier); + // we got one! hopefully it didn't need to use any backups. + // but even if it did need backup servers, that is better than going to a worse tier + return Ok(consensus_head_connections); + } + } + + return Err(anyhow::anyhow!("failed finding consensus on all tiers")); + } +} + +#[cfg(test)] +mod test { + // #[test] + // fn test_simplest_case_consensus_head_connections() { + // todo!(); + // } +} diff --git a/web3_proxy/src/rpcs/grpc_erigon.rs b/web3_proxy/src/rpcs/grpc_erigon.rs new file mode 100644 index 00000000..e69de29b diff --git a/web3_proxy/src/rpcs/http.rs b/web3_proxy/src/rpcs/http.rs new file mode 100644 index 00000000..e69de29b diff --git a/web3_proxy/src/rpcs/many.rs b/web3_proxy/src/rpcs/many.rs index a46e66f6..d53f0531 100644 --- a/web3_proxy/src/rpcs/many.rs +++ b/web3_proxy/src/rpcs/many.rs @@ -1,8 +1,8 @@ ///! Load balanced communication with a group of web3 rpc providers -use super::blockchain::{ArcBlock, BlockHashesCache}; +use super::blockchain::{BlockHashesCache, Web3ProxyBlock}; +use super::consensus::ConsensusWeb3Rpcs; use super::one::Web3Rpc; use super::request::{OpenRequestHandle, OpenRequestResult, RequestRevertHandler}; -use super::synced_connections::ConsensusWeb3Rpcs; use crate::app::{flatten_handle, AnyhowJoinHandle}; use crate::config::{BlockAndRpc, TxHashAndRpc, Web3RpcConfig}; use crate::frontend::authorization::{Authorization, RequestMetadata}; @@ -16,15 +16,18 @@ use futures::future::try_join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; use hashbrown::{HashMap, HashSet}; +use itertools::Itertools; use log::{debug, error, info, trace, warn, Level}; use migration::sea_orm::DatabaseConnection; use moka::future::{Cache, ConcurrentCacheExt}; +use ordered_float::OrderedFloat; use serde::ser::{SerializeStruct, Serializer}; use serde::Serialize; use serde_json::json; use serde_json::value::RawValue; +use std::cmp::min_by_key; use std::collections::BTreeMap; -use std::sync::atomic::Ordering; +use std::sync::atomic::{self, Ordering}; use std::sync::Arc; use std::{cmp, fmt}; use thread_fast_rng::rand::seq::SliceRandom; @@ -36,11 +39,11 @@ use tokio::time::{interval, sleep, sleep_until, Duration, Instant, MissedTickBeh #[derive(From)] pub struct Web3Rpcs { /// any requests will be forwarded to one (or more) of these connections - pub(crate) conns: HashMap>, + pub(crate) by_name: HashMap>, /// all providers with the same consensus head block. 
won't update if there is no `self.watch_consensus_head_sender` - pub(super) watch_consensus_connections_sender: watch::Sender>, + pub(super) watch_consensus_rpcs_sender: watch::Sender>, /// this head receiver makes it easy to wait until there is a new block - pub(super) watch_consensus_head_receiver: Option>, + pub(super) watch_consensus_head_receiver: Option>>, pub(super) pending_transactions: Cache, /// TODO: this map is going to grow forever unless we do some sort of pruning. maybe store pruned in redis? @@ -48,25 +51,33 @@ pub struct Web3Rpcs { pub(super) block_hashes: BlockHashesCache, /// blocks on the heaviest chain pub(super) block_numbers: Cache, + /// the number of rpcs required to agree on consensus for the head block (thundering herd protection) pub(super) min_head_rpcs: usize, + /// the soft limit required to agree on consensus for the head block. (thundering herd protection) pub(super) min_sum_soft_limit: u32, + /// how far behind the highest known block height we can be before we stop serving requests + pub(super) max_block_lag: Option, + /// how old our consensus head block we can be before we stop serving requests + pub(super) max_block_age: Option, } impl Web3Rpcs { /// Spawn durable connections to multiple Web3 providers. #[allow(clippy::too_many_arguments)] pub async fn spawn( + block_map: BlockHashesCache, chain_id: u64, db_conn: Option, - server_configs: HashMap, http_client: Option, - redis_pool: Option, - block_map: BlockHashesCache, - watch_consensus_head_sender: Option>, - min_sum_soft_limit: u32, + max_block_age: Option, + max_block_lag: Option, min_head_rpcs: usize, - pending_tx_sender: Option>, + min_sum_soft_limit: u32, pending_transactions: Cache, + pending_tx_sender: Option>, + redis_pool: Option, + server_configs: HashMap, + watch_consensus_head_sender: Option>>, ) -> anyhow::Result<(Arc, AnyhowJoinHandle<()>)> { let (pending_tx_id_sender, pending_tx_id_receiver) = flume::unbounded(); let (block_sender, block_receiver) = flume::unbounded::(); @@ -160,6 +171,7 @@ impl Web3Rpcs { block_map, block_sender, pending_tx_id_sender, + true, ) .await }); @@ -210,14 +222,16 @@ impl Web3Rpcs { watch_consensus_head_sender.as_ref().map(|x| x.subscribe()); let connections = Arc::new(Self { - conns: connections, - watch_consensus_connections_sender, + by_name: connections, + watch_consensus_rpcs_sender: watch_consensus_connections_sender, watch_consensus_head_receiver, pending_transactions, block_hashes, block_numbers, min_sum_soft_limit, min_head_rpcs, + max_block_age, + max_block_lag, }); let authorization = Arc::new(Authorization::internal(db_conn.clone())?); @@ -242,7 +256,7 @@ impl Web3Rpcs { } pub fn get(&self, conn_name: &str) -> Option<&Arc> { - self.conns.get(conn_name) + self.by_name.get(conn_name) } /// subscribe to blocks and transactions from all the backend rpcs. 
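// --- illustrative sketch (not part of the diff): how max_block_lag turns into
// the min_block_num floor used by best_consensus_connections above. The optional
// lag is subtracted from the highest known block, then clamped so consensus can
// never move backwards past the current head.
use ethers::prelude::U64;

fn min_acceptable_block(
    highest_known: U64,
    max_block_lag: Option<U64>,
    current_head: Option<U64>,
) -> Option<U64> {
    max_block_lag
        .map(|lag| highest_known.saturating_sub(lag))
        // Option's Ord treats None as smaller than Some, so an existing head
        // wins whenever it is the only candidate or the higher one
        .max(current_head)
}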
@@ -253,7 +267,7 @@ impl Web3Rpcs { authorization: Arc, pending_tx_id_receiver: flume::Receiver, block_receiver: flume::Receiver, - head_block_sender: Option>, + head_block_sender: Option>>, pending_tx_sender: Option>, ) -> anyhow::Result<()> { let mut futures = vec![]; @@ -343,7 +357,7 @@ impl Web3Rpcs { .into_iter() .map(|active_request_handle| async move { let result: Result, _> = active_request_handle - .request(method, &json!(¶ms), error_level.into()) + .request(method, &json!(¶ms), error_level.into(), None) .await; result }) @@ -406,183 +420,184 @@ impl Web3Rpcs { unimplemented!("this shouldn't be possible") } - pub async fn best_consensus_head_connection( + pub async fn best_available_rpc( &self, authorization: &Arc, request_metadata: Option<&Arc>, skip: &[Arc], + // TODO: if we are checking for the consensus head, i don' think we need min_block_needed/max_block_needed min_block_needed: Option<&U64>, + max_block_needed: Option<&U64>, ) -> anyhow::Result { - if let Ok(without_backups) = self - ._best_consensus_head_connection( - false, - authorization, - request_metadata, - skip, - min_block_needed, - ) - .await - { - // TODO: this might use backups too eagerly. but even when we allow backups, we still prioritize our own - if matches!(without_backups, OpenRequestResult::Handle(_)) { - return Ok(without_backups); - } - } + let usable_rpcs_by_tier_and_head_number: BTreeMap<(u64, Option), Vec>> = { + let synced_connections = self.watch_consensus_rpcs_sender.borrow().clone(); - self._best_consensus_head_connection( - true, - authorization, - request_metadata, - skip, - min_block_needed, - ) - .await - } + let (head_block_num, head_block_age) = + if let Some(head_block) = synced_connections.head_block.as_ref() { + (head_block.number(), head_block.age()) + } else { + return Ok(OpenRequestResult::NotReady); + }; - /// get the best available rpc server with the consensus head block. it might have blocks after the consensus head - async fn _best_consensus_head_connection( - &self, - allow_backups: bool, - authorization: &Arc, - request_metadata: Option<&Arc>, - skip: &[Arc], - min_block_needed: Option<&U64>, - ) -> anyhow::Result { - let usable_rpcs_by_head_num_and_weight: BTreeMap<(Option, u64), Vec>> = { - let synced_connections = self.watch_consensus_connections_sender.borrow().clone(); - - let head_block_num = if let Some(head_block) = synced_connections.head_block.as_ref() { - head_block.number() - } else { - // TODO: optionally wait for a head block >= min_block_needed - return Ok(OpenRequestResult::NotReady(allow_backups)); + let needed_blocks_comparison = match (min_block_needed, max_block_needed) { + (None, None) => { + // no required block given. treat this like the requested the consensus head block + cmp::Ordering::Equal + } + (None, Some(max_block_needed)) => max_block_needed.cmp(head_block_num), + (Some(min_block_needed), None) => min_block_needed.cmp(head_block_num), + (Some(min_block_needed), Some(max_block_needed)) => { + match min_block_needed.cmp(max_block_needed) { + cmp::Ordering::Equal => min_block_needed.cmp(head_block_num), + cmp::Ordering::Greater => { + return Err(anyhow::anyhow!( + "Invalid blocks bounds requested. 
min ({}) > max ({})", + min_block_needed, + max_block_needed + )) + } + cmp::Ordering::Less => min_block_needed.cmp(head_block_num), + } + } }; - let min_block_needed = min_block_needed.unwrap_or(&head_block_num); + trace!("needed_blocks_comparison: {:?}", needed_blocks_comparison); + // collect "usable_rpcs_by_head_num_and_weight" + // TODO: MAKE SURE None SORTS LAST? let mut m = BTreeMap::new(); - match min_block_needed.cmp(&head_block_num) { + match needed_blocks_comparison { cmp::Ordering::Less => { - // need an old block. check all the rpcs. prefer the most synced + // need an old block. check all the rpcs. ignore rpcs that are still syncing + trace!("old block needed"); + + let min_block_age = + self.max_block_age.map(|x| head_block_age.saturating_sub(x)); + let min_sync_num = self.max_block_lag.map(|x| head_block_num.saturating_sub(x)); + + // TODO: cache this somehow? + // TODO: maybe have a helper on synced_connections? that way sum_soft_limits/min_synced_rpcs will be DRY for x in self - .conns + .by_name .values() - .filter(|x| if allow_backups { true } else { !x.backup }) - .filter(|x| !skip.contains(x)) - .filter(|x| x.has_block_data(min_block_needed)) + .filter(|x| { + // TODO: move a bunch of this onto a rpc.is_synced function + if skip.contains(x) { + // we've already tried this server or have some other reason to skip it + false + } else if max_block_needed + .and_then(|max_block_needed| { + Some(!x.has_block_data(max_block_needed)) + }) + .unwrap_or(false) + { + // server does not have the max block + false + } else if min_block_needed + .and_then(|min_block_needed| { + Some(!x.has_block_data(min_block_needed)) + }) + .unwrap_or(false) + { + // server does not have the min block + false + } else { + // server has the block we need! + true + } + }) .cloned() { let x_head_block = x.head_block.read().clone(); - match x_head_block { - None => continue, - Some(x_head) => { - let key = (Some(x_head.number()), u64::MAX - x.tier); + if let Some(x_head) = x_head_block { + // TODO: should nodes that are ahead of the consensus block have priority? seems better to spread the load + let x_head_num = x_head.number().min(head_block_num); - m.entry(key).or_insert_with(Vec::new).push(x); + // TODO: do we really need to check head_num and age? + if let Some(min_sync_num) = min_sync_num.as_ref() { + if x_head_num < min_sync_num { + trace!("rpc is still syncing"); + continue; + } } + if let Some(min_block_age) = min_block_age { + if x_head.age() > min_block_age { + // rpc is still syncing + trace!("block is too old"); + continue; + } + } + + let key = (x.tier, Some(*x_head_num)); + + m.entry(key).or_insert_with(Vec::new).push(x); } } + + // TODO: check min_synced_rpcs and min_sum_soft_limits? or maybe better to just try to serve the request? } cmp::Ordering::Equal => { - // need the consensus head block. filter the synced rpcs - for x in synced_connections - .conns - .iter() - .filter(|x| !skip.contains(x)) - { - let key = (None, u64::MAX - x.tier); + // using the consensus head block. filter the synced rpcs - m.entry(key).or_insert_with(Vec::new).push(x.clone()); + // the key doesn't matter if we are checking synced connections + // they are all at the same block and it is already sized to what we need + let key = (0, None); + + for x in synced_connections.rpcs.iter() { + if skip.contains(x) { + trace!("skipping: {}", x); + continue; + } + trace!("not skipped!"); + + m.entry(key.clone()) + .or_insert_with(Vec::new) + .push(x.clone()); } } cmp::Ordering::Greater => { - // TODO? 
if the blocks is close and wait_for_sync and allow_backups, wait for change on a watch_consensus_connections_receiver().subscribe() - return Ok(OpenRequestResult::NotReady(allow_backups)); + // TODO? if the blocks is close, maybe we could wait for change on a watch_consensus_connections_receiver().subscribe() + return Ok(OpenRequestResult::NotReady); } } m }; + trace!( + "usable_rpcs_by_tier_and_head_number: {:#?}", + usable_rpcs_by_tier_and_head_number + ); + let mut earliest_retry_at = None; - for usable_rpcs in usable_rpcs_by_head_num_and_weight.into_values().rev() { - // under heavy load, it is possible for even our best server to be negative - let mut minimum = f64::MAX; - let mut maximum = f64::MIN; - - // we sort on a combination of values. cache them here so that we don't do this math multiple times. - let mut available_request_map: HashMap<_, f64> = usable_rpcs - .iter() - .map(|rpc| { - // TODO: are active requests what we want? do we want a counter for requests in the last second + any actives longer than that? - // TODO: get active requests out of redis (that's definitely too slow) - // TODO: do something with hard limit instead? (but that is hitting redis too much) - let active_requests = rpc.active_requests() as f64; - let soft_limit = rpc.soft_limit as f64; - - let available_requests = soft_limit - active_requests; - - // trace!("available requests on {}: {}", rpc, available_requests); - - minimum = minimum.min(available_requests); - maximum = maximum.max(available_requests); - - (rpc, available_requests) - }) - .collect(); - - // trace!("minimum available requests: {}", minimum); - // trace!("maximum available requests: {}", maximum); - - if maximum < 0.0 { - // TODO: if maximum < 0 and there are other tiers on the same block, we should include them now - warn!("soft limits overloaded: {} to {}", minimum, maximum) - } - - // choose_multiple_weighted can't have negative numbers. shift up if any are negative - // TODO: is this a correct way to shift? - if minimum < 0.0 { - available_request_map = available_request_map - .into_iter() - .map(|(rpc, available_requests)| { - // TODO: is simple addition the right way to shift everyone? - // TODO: probably want something non-linear - // minimum is negative, so we subtract to make available requests bigger - let x = available_requests - minimum; - - (rpc, x) - }) - .collect() - } - - let sorted_rpcs = { - if usable_rpcs.len() == 1 { - // TODO: return now instead? we shouldn't need another alloc - vec![usable_rpcs.get(0).expect("there should be 1")] - } else { - let mut rng = thread_fast_rng::thread_fast_rng(); - - usable_rpcs - .choose_multiple_weighted(&mut rng, usable_rpcs.len(), |rpc| { - *available_request_map - .get(rpc) - .expect("rpc should always be in available_request_map") - }) - .unwrap() - .collect::>() - } + for mut usable_rpcs in usable_rpcs_by_tier_and_head_number.into_values() { + // sort the tier randomly + if usable_rpcs.len() == 1 { + // TODO: include an rpc from the next tier? 
+ } else { + // we can't get the rng outside of this loop because it is not Send + // this function should be pretty fast anyway, so it shouldn't matter too much + let mut rng = thread_fast_rng::thread_fast_rng(); + usable_rpcs.shuffle(&mut rng); }; - // now that the rpcs are sorted, try to get an active request handle for one of them - for best_rpc in sorted_rpcs.into_iter() { - // increment our connection counter - match best_rpc - .try_request_handle(authorization, min_block_needed.is_none()) - .await - { + // now that the rpcs are shuffled, try to get an active request handle for one of them + // pick the first two and try the one with the lower rpc.latency.ewma + // TODO: chunks or tuple windows? + for (rpc_a, rpc_b) in usable_rpcs.into_iter().circular_tuple_windows() { + trace!("{} vs {}", rpc_a, rpc_b); + // TODO: cached key to save a read lock + // TODO: ties to the server with the smallest block_data_limit + let best_rpc = min_by_key(rpc_a, rpc_b, |x| { + OrderedFloat(x.head_latency.read().value()) + }); + trace!("winner: {}", best_rpc); + + // just because it has lower latency doesn't mean we are sure to get a connection + match best_rpc.try_request_handle(authorization, None).await { Ok(OpenRequestResult::Handle(handle)) => { // trace!("opened handle: {}", best_rpc); return Ok(OpenRequestResult::Handle(handle)); @@ -590,8 +605,9 @@ impl Web3Rpcs { Ok(OpenRequestResult::RetryAt(retry_at)) => { earliest_retry_at = earliest_retry_at.min(Some(retry_at)); } - Ok(OpenRequestResult::NotReady(_)) => { + Ok(OpenRequestResult::NotReady) => { // TODO: log a warning? emit a stat? + trace!("best_rpc not ready"); } Err(err) => { warn!("No request handle for {}. err={:?}", best_rpc, err) @@ -620,7 +636,7 @@ impl Web3Rpcs { // TODO: should we log here? - Ok(OpenRequestResult::NotReady(allow_backups)) + Ok(OpenRequestResult::NotReady) } Some(earliest_retry_at) => { warn!("no servers on {:?}! {:?}", self, earliest_retry_at); @@ -637,28 +653,42 @@ impl Web3Rpcs { pub async fn all_connections( &self, authorization: &Arc, - block_needed: Option<&U64>, + min_block_needed: Option<&U64>, + max_block_needed: Option<&U64>, max_count: Option, always_include_backups: bool, ) -> Result, Option> { if !always_include_backups { if let Ok(without_backups) = self - ._all_connections(false, authorization, block_needed, max_count) + ._all_connections( + false, + authorization, + min_block_needed, + max_block_needed, + max_count, + ) .await { return Ok(without_backups); } } - self._all_connections(true, authorization, block_needed, max_count) - .await + self._all_connections( + true, + authorization, + min_block_needed, + max_block_needed, + max_count, + ) + .await } async fn _all_connections( &self, allow_backups: bool, authorization: &Arc, - block_needed: Option<&U64>, + min_block_needed: Option<&U64>, + max_block_needed: Option<&U64>, max_count: Option, ) -> Result, Option> { let mut earliest_retry_at = None; @@ -668,24 +698,20 @@ impl Web3Rpcs { let mut max_count = if let Some(max_count) = max_count { max_count } else { - self.conns.len() + self.by_name.len() }; let mut tried = HashSet::new(); - let mut synced_conns = self - .watch_consensus_connections_sender - .borrow() - .conns - .clone(); + let mut synced_conns = self.watch_consensus_rpcs_sender.borrow().rpcs.clone(); // synced connections are all on the same block. 
sort them by tier with higher soft limits first - synced_conns.sort_by_cached_key(|x| (x.tier, u32::MAX - x.soft_limit)); + synced_conns.sort_by_cached_key(rpc_sync_status_sort_key); // if there aren't enough synced connections, include more connections - let mut all_conns: Vec<_> = self.conns.values().cloned().collect(); - - sort_connections_by_sync_status(&mut all_conns); + // TODO: only do this sorting if the synced_conns isn't enough + let mut all_conns: Vec<_> = self.by_name.values().cloned().collect(); + all_conns.sort_by_cached_key(rpc_sync_status_sort_key); for connection in itertools::chain(synced_conns, all_conns) { if max_count == 0 { @@ -702,17 +728,20 @@ impl Web3Rpcs { continue; } - if let Some(block_needed) = block_needed { + if let Some(block_needed) = min_block_needed { + if !connection.has_block_data(block_needed) { + continue; + } + } + + if let Some(block_needed) = max_block_needed { if !connection.has_block_data(block_needed) { continue; } } // check rate limits and increment our connection counter - match connection - .try_request_handle(authorization, block_needed.is_none()) - .await - { + match connection.try_request_handle(authorization, None).await { Ok(OpenRequestResult::RetryAt(retry_at)) => { // this rpc is not available. skip it earliest_retry_at = earliest_retry_at.min(Some(retry_at)); @@ -721,7 +750,7 @@ impl Web3Rpcs { max_count -= 1; selected_rpcs.push(handle) } - Ok(OpenRequestResult::NotReady(_)) => { + Ok(OpenRequestResult::NotReady) => { warn!("no request handle for {}", connection) } Err(err) => { @@ -749,26 +778,28 @@ impl Web3Rpcs { request: JsonRpcRequest, request_metadata: Option<&Arc>, min_block_needed: Option<&U64>, + max_block_needed: Option<&U64>, ) -> anyhow::Result { let mut skip_rpcs = vec![]; let mut method_not_available_response = None; - let mut watch_consensus_connections = self.watch_consensus_connections_sender.subscribe(); + let mut watch_consensus_connections = self.watch_consensus_rpcs_sender.subscribe(); // TODO: maximum retries? right now its the total number of servers loop { let num_skipped = skip_rpcs.len(); - if num_skipped == self.conns.len() { + if num_skipped == self.by_name.len() { break; } match self - .best_consensus_head_connection( + .best_available_rpc( authorization, request_metadata, &skip_rpcs, min_block_needed, + max_block_needed, ) .await? { @@ -793,6 +824,7 @@ impl Web3Rpcs { &request.method, &json!(request.params), RequestRevertHandler::Save, + None, ) .await; @@ -908,83 +940,39 @@ impl Web3Rpcs { } } } - OpenRequestResult::NotReady(backups_included) => { + OpenRequestResult::NotReady => { if let Some(request_metadata) = request_metadata { request_metadata.no_servers.fetch_add(1, Ordering::Release); } - // todo!( - // "check if we are requesting an old block and no archive servers are synced" - // ); - - if let Some(min_block_needed) = min_block_needed { - let mut theres_a_chance = false; - - for potential_conn in self.conns.values() { - if skip_rpcs.contains(potential_conn) { - continue; - } - - // TODO: should we instead check if has_block_data but with the current head block? 
- if potential_conn.has_block_data(min_block_needed) { - trace!("chance for {} on {}", min_block_needed, potential_conn); - theres_a_chance = true; - break; - } - - skip_rpcs.push(potential_conn.clone()); - } - - if !theres_a_chance { - debug!("no chance of finding data in block #{}", min_block_needed); - break; - } - } - - if backups_included { - // if NotReady and we tried backups, there's no chance - warn!("No servers ready even after checking backups"); - break; - } - - debug!("No servers ready. Waiting up to 1 second for change in synced servers"); - - // TODO: exponential backoff? - tokio::select! { - _ = sleep(Duration::from_secs(1)) => { - // do NOT pop the last rpc off skip here - } - _ = watch_consensus_connections.changed() => { - watch_consensus_connections.borrow_and_update(); - } - } + break; } } } - if let Some(r) = method_not_available_response { - // TODO: emit a stat for unsupported methods? - return Ok(r); - } - - // TODO: do we need this here, or do we do it somewhere else? + // TODO: do we need this here, or do we do it somewhere else? like, the code could change and a try operator in here would skip this increment if let Some(request_metadata) = request_metadata { request_metadata .error_response .store(true, Ordering::Release); } - let num_conns = self.conns.len(); + if let Some(r) = method_not_available_response { + // TODO: emit a stat for unsupported methods? it would be best to block them at the proxy instead of at the backend + return Ok(r); + } + + let num_conns = self.by_name.len(); let num_skipped = skip_rpcs.len(); if num_skipped == 0 { - error!("No servers synced ({} known)", num_conns); + error!("No servers synced ({} known). None skipped", num_conns); - return Ok(JsonRpcForwardedResponse::from_str( + Ok(JsonRpcForwardedResponse::from_str( "No servers synced", Some(-32000), Some(request.id), - )); + )) } else { // TODO: warn? debug? trace? warn!( @@ -994,11 +982,11 @@ impl Web3Rpcs { // TODO: what error code? // cloudflare gives {"jsonrpc":"2.0","error":{"code":-32043,"message":"Requested data cannot be older than 128 blocks."},"id":1} - return Ok(JsonRpcForwardedResponse::from_str( + Ok(JsonRpcForwardedResponse::from_str( "Requested data is not available", Some(-32043), Some(request.id), - )); + )) } } @@ -1008,7 +996,8 @@ impl Web3Rpcs { authorization: &Arc, request: &JsonRpcRequest, request_metadata: Option>, - block_needed: Option<&U64>, + min_block_needed: Option<&U64>, + max_block_needed: Option<&U64>, error_level: Level, max_count: Option, always_include_backups: bool, @@ -1017,7 +1006,8 @@ impl Web3Rpcs { match self .all_connections( authorization, - block_needed, + min_block_needed, + max_block_needed, max_count, always_include_backups, ) @@ -1100,6 +1090,7 @@ impl Web3Rpcs { request: JsonRpcRequest, request_metadata: Option<&Arc>, min_block_needed: Option<&U64>, + max_block_needed: Option<&U64>, ) -> anyhow::Result { match proxy_mode { ProxyMode::Best => { @@ -1108,6 +1099,7 @@ impl Web3Rpcs { request, request_metadata, min_block_needed, + max_block_needed, ) .await } @@ -1121,7 +1113,7 @@ impl fmt::Debug for Web3Rpcs { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { // TODO: the default formatter takes forever to write. 
this is too quiet though f.debug_struct("Web3Rpcs") - .field("conns", &self.conns) + .field("rpcs", &self.by_name) .finish_non_exhaustive() } } @@ -1133,11 +1125,12 @@ impl Serialize for Web3Rpcs { { let mut state = serializer.serialize_struct("Web3Rpcs", 6)?; - let conns: Vec<&Web3Rpc> = self.conns.values().map(|x| x.as_ref()).collect(); - state.serialize_field("conns", &conns)?; + let rpcs: Vec<&Web3Rpc> = self.by_name.values().map(|x| x.as_ref()).collect(); + // TODO: coordinate with frontend team to rename "conns" to "rpcs" + state.serialize_field("conns", &rpcs)?; { - let consensus_connections = self.watch_consensus_connections_sender.borrow().clone(); + let consensus_connections = self.watch_consensus_rpcs_sender.borrow().clone(); // TODO: rename synced_connections to consensus_connections? state.serialize_field("synced_connections", &consensus_connections)?; } @@ -1153,30 +1146,37 @@ impl Serialize for Web3Rpcs { } /// sort by block number (descending) and tier (ascending) -fn sort_connections_by_sync_status(rpcs: &mut Vec>) { - rpcs.sort_by_cached_key(|x| { - let reversed_head_block = u64::MAX - - x.head_block - .read() - .as_ref() - .map(|x| x.number().as_u64()) - .unwrap_or(0); +/// TODO: should this be moved into a `impl Web3Rpc`? +/// TODO: i think we still have sorts scattered around the code that should use this +/// TODO: take AsRef or something like that? We don't need an Arc here +fn rpc_sync_status_sort_key(x: &Arc) -> (U64, u64, OrderedFloat) { + let reversed_head_block = U64::MAX + - x.head_block + .read() + .as_ref() + .map(|x| *x.number()) + .unwrap_or_default(); - let tier = x.tier; + let tier = x.tier; - (reversed_head_block, tier) - }); + // TODO: use request instead of head latency + let head_ewma = x.head_latency.read().value(); + + let active_requests = x.active_requests.load(atomic::Ordering::Relaxed) as f64; + + // TODO: i'm not sure head * active is exactly right. but we'll see + // TODO: i don't think this actually counts as peak. investigate with atomics.rs and peak_ewma.rs + let peak_ewma = OrderedFloat(head_ewma * active_requests); + + (reversed_head_block, tier, peak_ewma) } mod tests { // TODO: why is this allow needed? does tokio::test get in the way somehow? #![allow(unused_imports)] use super::*; - use crate::rpcs::{ - blockchain::{ConsensusFinder, SavedBlock}, - one::ProviderState, - provider::Web3Provider, - }; + use crate::rpcs::consensus::ConsensusFinder; + use crate::rpcs::{blockchain::Web3ProxyBlock, provider::Web3Provider}; use ethers::types::{Block, U256}; use log::{trace, LevelFilter}; use parking_lot::RwLock; @@ -1205,10 +1205,10 @@ mod tests { let blocks: Vec<_> = [block_0, block_1, block_2] .into_iter() - .map(|x| SavedBlock::new(Arc::new(x))) + .map(|x| Web3ProxyBlock::try_new(Arc::new(x)).unwrap()) .collect(); - let mut rpcs = [ + let mut rpcs: Vec<_> = [ Web3Rpc { name: "a".to_string(), tier: 0, @@ -1250,7 +1250,7 @@ mod tests { .map(Arc::new) .collect(); - sort_connections_by_sync_status(&mut rpcs); + rpcs.sort_by_cached_key(rpc_sync_status_sort_key); let names_in_sort_order: Vec<_> = rpcs.iter().map(|x| x.name.as_str()).collect(); @@ -1290,37 +1290,32 @@ mod tests { let lagged_block = Arc::new(lagged_block); let head_block = Arc::new(head_block); - // TODO: write a impl From for Block -> BlockId? 
- let mut lagged_block: SavedBlock = lagged_block.into(); - let mut head_block: SavedBlock = head_block.into(); + let mut lagged_block: Web3ProxyBlock = lagged_block.try_into().unwrap(); + let mut head_block: Web3ProxyBlock = head_block.try_into().unwrap(); let block_data_limit = u64::MAX; let head_rpc = Web3Rpc { name: "synced".to_string(), - provider_state: AsyncRwLock::new(ProviderState::Connected(Arc::new( - Web3Provider::Mock, - ))), soft_limit: 1_000, automatic_block_limit: false, backup: false, block_data_limit: block_data_limit.into(), tier: 0, head_block: RwLock::new(Some(head_block.clone())), + provider: AsyncRwLock::new(Some(Arc::new(Web3Provider::Mock))), ..Default::default() }; let lagged_rpc = Web3Rpc { name: "lagged".to_string(), - provider_state: AsyncRwLock::new(ProviderState::Connected(Arc::new( - Web3Provider::Mock, - ))), soft_limit: 1_000, automatic_block_limit: false, backup: false, block_data_limit: block_data_limit.into(), tier: 0, head_block: RwLock::new(Some(lagged_block.clone())), + provider: AsyncRwLock::new(Some(Arc::new(Web3Provider::Mock))), ..Default::default() }; @@ -1333,68 +1328,65 @@ mod tests { let head_rpc = Arc::new(head_rpc); let lagged_rpc = Arc::new(lagged_rpc); - let conns = HashMap::from([ + let rpcs_by_name = HashMap::from([ (head_rpc.name.clone(), head_rpc.clone()), (lagged_rpc.name.clone(), lagged_rpc.clone()), ]); - let (watch_consensus_connections_sender, _) = watch::channel(Default::default()); + let (watch_consensus_rpcs_sender, _) = watch::channel(Default::default()); // TODO: make a Web3Rpcs::new - let conns = Web3Rpcs { - conns, + let rpcs = Web3Rpcs { + by_name: rpcs_by_name, watch_consensus_head_receiver: None, - watch_consensus_connections_sender, + watch_consensus_rpcs_sender, pending_transactions: Cache::builder() - .max_capacity(10_000) .build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default()), block_hashes: Cache::builder() - .max_capacity(10_000) .build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default()), block_numbers: Cache::builder() - .max_capacity(10_000) .build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default()), + // TODO: test max_block_age? + max_block_age: None, + // TODO: test max_block_lag? 
+ max_block_lag: None, min_head_rpcs: 1, min_sum_soft_limit: 1, }; let authorization = Arc::new(Authorization::internal(None).unwrap()); - let (head_block_sender, _head_block_receiver) = - watch::channel::(Default::default()); - let mut connection_heads = ConsensusFinder::default(); + let (head_block_sender, _head_block_receiver) = watch::channel(Default::default()); + let mut consensus_finder = ConsensusFinder::new(&[0, 1, 2, 3], None, None); // process None so that - conns - .process_block_from_rpc( - &authorization, - &mut connection_heads, - None, - lagged_rpc.clone(), - &head_block_sender, - &None, - ) - .await - .unwrap(); - conns - .process_block_from_rpc( - &authorization, - &mut connection_heads, - None, - head_rpc.clone(), - &head_block_sender, - &None, - ) - .await - .unwrap(); + rpcs.process_block_from_rpc( + &authorization, + &mut consensus_finder, + None, + lagged_rpc.clone(), + &head_block_sender, + &None, + ) + .await + .expect("its lagged, but it should still be seen as consensus if its the first to report"); + rpcs.process_block_from_rpc( + &authorization, + &mut consensus_finder, + None, + head_rpc.clone(), + &head_block_sender, + &None, + ) + .await + .unwrap(); // no head block because the rpcs haven't communicated through their channels - assert!(conns.head_block_hash().is_none()); + assert!(rpcs.head_block_hash().is_none()); // all_backend_connections gives all non-backup servers regardless of sync status assert_eq!( - conns - .all_connections(&authorization, None, None, false) + rpcs.all_connections(&authorization, None, None, None, false) .await .unwrap() .len(), @@ -1402,88 +1394,80 @@ mod tests { ); // best_synced_backend_connection requires servers to be synced with the head block - let x = conns - .best_consensus_head_connection(&authorization, None, &[], None) + let x = rpcs + .best_available_rpc(&authorization, None, &[], None, None) .await .unwrap(); dbg!(&x); - assert!(matches!(x, OpenRequestResult::NotReady(true))); + assert!(matches!(x, OpenRequestResult::NotReady)); - // add lagged blocks to the conns. both servers should be allowed - lagged_block.block = conns.save_block(lagged_block.block, true).await.unwrap(); + // add lagged blocks to the rpcs. both servers should be allowed + lagged_block = rpcs.try_cache_block(lagged_block, true).await.unwrap(); - conns - .process_block_from_rpc( - &authorization, - &mut connection_heads, - Some(lagged_block.clone()), - lagged_rpc, - &head_block_sender, - &None, - ) - .await - .unwrap(); - conns - .process_block_from_rpc( - &authorization, - &mut connection_heads, - Some(lagged_block.clone()), - head_rpc.clone(), - &head_block_sender, - &None, - ) - .await - .unwrap(); + rpcs.process_block_from_rpc( + &authorization, + &mut consensus_finder, + Some(lagged_block.clone()), + lagged_rpc, + &head_block_sender, + &None, + ) + .await + .unwrap(); + rpcs.process_block_from_rpc( + &authorization, + &mut consensus_finder, + Some(lagged_block.clone()), + head_rpc.clone(), + &head_block_sender, + &None, + ) + .await + .unwrap(); - assert_eq!(conns.num_synced_rpcs(), 2); + assert_eq!(rpcs.num_synced_rpcs(), 2); - // add head block to the conns. lagged_rpc should not be available - head_block.block = conns.save_block(head_block.block, true).await.unwrap(); + // add head block to the rpcs. 
lagged_rpc should not be available + head_block = rpcs.try_cache_block(head_block, true).await.unwrap(); - conns - .process_block_from_rpc( - &authorization, - &mut connection_heads, - Some(head_block.clone()), - head_rpc, - &head_block_sender, - &None, - ) - .await - .unwrap(); + rpcs.process_block_from_rpc( + &authorization, + &mut consensus_finder, + Some(head_block.clone()), + head_rpc, + &head_block_sender, + &None, + ) + .await + .unwrap(); - assert_eq!(conns.num_synced_rpcs(), 1); + assert_eq!(rpcs.num_synced_rpcs(), 1); assert!(matches!( - conns - .best_consensus_head_connection(&authorization, None, &[], None) + rpcs.best_available_rpc(&authorization, None, &[], None, None) .await, Ok(OpenRequestResult::Handle(_)) )); assert!(matches!( - conns - .best_consensus_head_connection(&authorization, None, &[], Some(&0.into())) + rpcs.best_available_rpc(&authorization, None, &[], Some(&0.into()), None) .await, Ok(OpenRequestResult::Handle(_)) )); assert!(matches!( - conns - .best_consensus_head_connection(&authorization, None, &[], Some(&1.into())) + rpcs.best_available_rpc(&authorization, None, &[], Some(&1.into()), None) .await, Ok(OpenRequestResult::Handle(_)) )); // future block should not get a handle - assert!(matches!( - conns - .best_consensus_head_connection(&authorization, None, &[], Some(&2.into())) - .await, - Ok(OpenRequestResult::NotReady(true)) - )); + let future_rpc = rpcs + .best_available_rpc(&authorization, None, &[], Some(&2.into()), None) + .await; + assert!(matches!(future_rpc, Ok(OpenRequestResult::NotReady))); } #[tokio::test] @@ -1509,33 +1493,29 @@ mod tests { ..Default::default() }; - let head_block: SavedBlock = Arc::new(head_block).into(); + let head_block: Web3ProxyBlock = Arc::new(head_block).try_into().unwrap(); let pruned_rpc = Web3Rpc { name: "pruned".to_string(), - provider_state: AsyncRwLock::new(ProviderState::Connected(Arc::new( - Web3Provider::Mock, - ))), soft_limit: 3_000, automatic_block_limit: false, backup: false, block_data_limit: 64.into(), tier: 1, head_block: RwLock::new(Some(head_block.clone())), + provider: AsyncRwLock::new(Some(Arc::new(Web3Provider::Mock))), ..Default::default() }; let archive_rpc = Web3Rpc { name: "archive".to_string(), - provider_state: AsyncRwLock::new(ProviderState::Connected(Arc::new( - Web3Provider::Mock, - ))), soft_limit: 1_000, automatic_block_limit: false, backup: false, block_data_limit: u64::MAX.into(), tier: 2, head_block: RwLock::new(Some(head_block.clone())), + provider: AsyncRwLock::new(Some(Arc::new(Web3Provider::Mock))), ..Default::default() }; @@ -1547,74 +1527,81 @@ mod tests { let pruned_rpc = Arc::new(pruned_rpc); let archive_rpc = Arc::new(archive_rpc); - let conns = HashMap::from([ + let rpcs_by_name = HashMap::from([ (pruned_rpc.name.clone(), pruned_rpc.clone()), (archive_rpc.name.clone(), archive_rpc.clone()), ]); - let (watch_consensus_connections_sender, _) = watch::channel(Default::default()); + let (watch_consensus_rpcs_sender, _) = watch::channel(Default::default()); // TODO: make a Web3Rpcs::new - let conns = Web3Rpcs { - conns, + let rpcs = Web3Rpcs { + by_name: rpcs_by_name, watch_consensus_head_receiver: None, - watch_consensus_connections_sender, + watch_consensus_rpcs_sender, pending_transactions: Cache::builder() - .max_capacity(10) .build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default()), block_hashes: Cache::builder() - .max_capacity(10) .build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default()), block_numbers: Cache::builder() - .max_capacity(10) 
.build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default()), min_head_rpcs: 1, - min_sum_soft_limit: 3_000, + min_sum_soft_limit: 4_000, + max_block_age: None, + max_block_lag: None, }; let authorization = Arc::new(Authorization::internal(None).unwrap()); - let (head_block_sender, _head_block_receiver) = - watch::channel::(Default::default()); - let mut connection_heads = ConsensusFinder::default(); + let (head_block_sender, _head_block_receiver) = watch::channel(Default::default()); + let mut connection_heads = ConsensusFinder::new(&[0, 1, 2, 3], None, None); - conns - .process_block_from_rpc( - &authorization, - &mut connection_heads, - Some(head_block.clone()), - pruned_rpc.clone(), - &head_block_sender, - &None, - ) - .await - .unwrap(); - conns - .process_block_from_rpc( - &authorization, - &mut connection_heads, - Some(head_block.clone()), - archive_rpc.clone(), - &head_block_sender, - &None, - ) - .await - .unwrap(); + // min sum soft limit will require tier 2 + rpcs.process_block_from_rpc( + &authorization, + &mut connection_heads, + Some(head_block.clone()), + pruned_rpc.clone(), + &head_block_sender, + &None, + ) + .await + .unwrap_err(); - assert_eq!(conns.num_synced_rpcs(), 2); + rpcs.process_block_from_rpc( + &authorization, + &mut connection_heads, + Some(head_block.clone()), + archive_rpc.clone(), + &head_block_sender, + &None, + ) + .await + .unwrap(); + + assert_eq!(rpcs.num_synced_rpcs(), 2); // best_synced_backend_connection requires servers to be synced with the head block - let best_head_server = conns - .best_consensus_head_connection(&authorization, None, &[], Some(&head_block.number())) + // TODO: test with and without passing the head_block.number? + let best_available_server = rpcs + .best_available_rpc(&authorization, None, &[], Some(&head_block.number()), None) .await; + debug!("best_available_server: {:#?}", best_available_server); + assert!(matches!( - best_head_server.unwrap(), + best_available_server.unwrap(), OpenRequestResult::Handle(_) )); - let best_archive_server = conns - .best_consensus_head_connection(&authorization, None, &[], Some(&1.into())) + let best_available_server_from_none = rpcs + .best_available_rpc(&authorization, None, &[], None, None) + .await; + + // assert_eq!(best_available_server, best_available_server_from_none); + + let best_archive_server = rpcs + .best_available_rpc(&authorization, None, &[], Some(&1.into()), None) .await; match best_archive_server { diff --git a/web3_proxy/src/rpcs/mod.rs b/web3_proxy/src/rpcs/mod.rs index 44ea5afe..41b7a6ea 100644 --- a/web3_proxy/src/rpcs/mod.rs +++ b/web3_proxy/src/rpcs/mod.rs @@ -1,8 +1,8 @@ // TODO: all pub, or export useful things here instead? pub mod blockchain; +pub mod consensus; pub mod many; pub mod one; pub mod provider; pub mod request; -pub mod synced_connections; pub mod transactions; diff --git a/web3_proxy/src/rpcs/one.rs b/web3_proxy/src/rpcs/one.rs index 05bc0e54..1fc80ea4 100644 --- a/web3_proxy/src/rpcs/one.rs +++ b/web3_proxy/src/rpcs/one.rs @@ -1,18 +1,19 @@ ///! Rate-limited communication with a web3 provider. 
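// A minimal sketch, not the real has_block_data implementation: it only mirrors the intent of
// block_data_limit as exercised by the pruned/archive test above, where a pruned node keeps 64
// recent blocks and an archive node uses u64::MAX.
fn block_within_data_limit(needed: u64, head: u64, limit: u64) -> bool {
    // a node can serve `needed` if it is not in the future and not older than its pruning window
    needed <= head && needed >= head.saturating_sub(limit)
}

fn block_data_limit_demo() {
    let head = 1_000_000u64;
    assert!(block_within_data_limit(head - 10, head, 64)); // recent block: a pruned node is fine
    assert!(!block_within_data_limit(1, head, 64)); // old block: needs an archive node
    assert!(block_within_data_limit(1, head, u64::MAX)); // an archive node serves everything
}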
-use super::blockchain::{ArcBlock, BlockHashesCache, SavedBlock}; +use super::blockchain::{ArcBlock, BlockHashesCache, Web3ProxyBlock}; use super::provider::Web3Provider; use super::request::{OpenRequestHandle, OpenRequestResult}; use crate::app::{flatten_handle, AnyhowJoinHandle}; -use crate::config::BlockAndRpc; +use crate::config::{BlockAndRpc, Web3RpcConfig}; use crate::frontend::authorization::Authorization; -use anyhow::Context; +use crate::rpcs::request::RequestRevertHandler; +use anyhow::{anyhow, Context}; use ethers::prelude::{Bytes, Middleware, ProviderError, TxHash, H256, U64}; -use ethers::types::U256; +use ethers::types::{Address, Transaction, U256}; use futures::future::try_join_all; use futures::StreamExt; -use hdrhistogram::Histogram; use log::{debug, error, info, trace, warn, Level}; use migration::sea_orm::DatabaseConnection; +use ordered_float::OrderedFloat; use parking_lot::RwLock; use redis_rate_limiter::{RedisPool, RedisRateLimitResult, RedisRateLimiter}; use serde::ser::{SerializeStruct, Serializer}; @@ -21,69 +22,73 @@ use serde_json::json; use std::cmp::min; use std::fmt; use std::hash::{Hash, Hasher}; -use std::sync::atomic::{self, AtomicU32, AtomicU64}; +use std::sync::atomic::{self, AtomicU64, AtomicUsize}; use std::{cmp::Ordering, sync::Arc}; use thread_fast_rng::rand::Rng; use thread_fast_rng::thread_fast_rng; use tokio::sync::{broadcast, oneshot, watch, RwLock as AsyncRwLock}; use tokio::time::{sleep, sleep_until, timeout, Duration, Instant}; -// TODO: maybe provider state should have the block data limit in it. but it is inside an async lock and we can't Serialize then -#[derive(Clone, Debug)] -pub enum ProviderState { - None, - Connecting(Arc), - Connected(Arc), +pub struct Latency { + /// exponentially weighted moving average of how many milliseconds behind the fastest node we are + ewma: ewma::EWMA, } -impl Default for ProviderState { - fn default() -> Self { - Self::None +impl Serialize for Latency { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_f64(self.ewma.value()) } } -impl ProviderState { - pub async fn provider(&self, allow_not_ready: bool) -> Option<&Arc> { - match self { - ProviderState::None => None, - ProviderState::Connecting(x) => { - if allow_not_ready { - Some(x) - } else { - // TODO: do a ready check here? - None - } - } - ProviderState::Connected(x) => { - if x.ready() { - Some(x) - } else { - None - } - } - } +impl Latency { + #[inline(always)] + pub fn record(&mut self, duration: Duration) { + self.record_ms(duration.as_secs_f64() * 1000.0); + } + + #[inline(always)] + pub fn record_ms(&mut self, milliseconds: f64) { + self.ewma.add(milliseconds); + } + + #[inline(always)] + pub fn value(&self) -> f64 { + self.ewma.value() } } -pub struct Web3RpcLatencies { - /// Traack how far behind the fastest node we are - new_head: Histogram, - /// exponentially weighted moving average of how far behind the fastest node we are - new_head_ewma: u32, - /// Track how long an rpc call takes on average - request: Histogram, - /// exponentially weighted moving average of how far behind the fastest node we are - request_ewma: u32, +impl Default for Latency { + fn default() -> Self { + // TODO: what should the default span be? 25 requests? 
have a "new" + let span = 25.0; + + let start = 1000.0; + + Self::new(span, start) + } } -impl Default for Web3RpcLatencies { - fn default() -> Self { - Self { - new_head: Histogram::new(3).unwrap(), - new_head_ewma: 0, - request: Histogram::new(3).unwrap(), - request_ewma: 0, +impl Latency { + // depending on the span, start might not be perfect + pub fn new(span: f64, start: f64) -> Self { + let alpha = Self::span_to_alpha(span); + + let mut ewma = ewma::EWMA::new(alpha); + + if start > 0.0 { + for _ in 0..(span as u64) { + ewma.add(start); + } } + + Self { ewma } + } + + fn span_to_alpha(span: f64) -> f64 { + 2.0 / (span + 1.0) } } @@ -93,19 +98,15 @@ pub struct Web3Rpc { pub name: String, pub display_name: Option, pub db_conn: Option, - /// TODO: can we get this from the provider? do we even need it? - pub(super) url: String, + pub(super) ws_url: Option, + pub(super) http_url: Option, /// Some connections use an http_client. we keep a clone for reconnecting pub(super) http_client: Option, - /// keep track of currently open requests. We sort on this - pub(super) active_requests: AtomicU32, - /// keep track of total requests from the frontend - pub(super) frontend_requests: AtomicU64, - /// keep track of total requests from web3-proxy itself - pub(super) internal_requests: AtomicU64, /// provider is in a RwLock so that we can replace it if re-connecting /// it is an async lock because we hold it open across awaits - pub(super) provider_state: AsyncRwLock, + /// this provider is only used for new heads subscriptions + /// TODO: put the provider inside an arc? + pub(super) provider: AsyncRwLock>>, /// keep track of hard limits pub(super) hard_limit_until: Option>, /// rate limits are stored in a central redis so that multiple proxies can share their rate limits @@ -121,10 +122,17 @@ pub struct Web3Rpc { pub(super) block_data_limit: AtomicU64, /// Lower tiers are higher priority when sending requests pub(super) tier: u64, - /// TODO: change this to a watch channel so that http providers can subscribe and take action on change - pub(super) head_block: RwLock>, - /// Track how fast this RPC is - pub(super) latency: Web3RpcLatencies, + /// TODO: change this to a watch channel so that http providers can subscribe and take action on change. + pub(super) head_block: RwLock>, + /// Track head block latency + pub(super) head_latency: RwLock, + // /// Track request latency + // /// TODO: refactor this. this lock kills perf. for now just use head_latency + // pub(super) request_latency: RwLock, + /// Track total requests served + /// TODO: maybe move this to graphana + pub(super) total_requests: AtomicUsize, + pub(super) active_requests: AtomicUsize, } impl Web3Rpc { @@ -132,39 +140,54 @@ impl Web3Rpc { // TODO: have this take a builder (which will have channels attached). or maybe just take the config and give the config public fields #[allow(clippy::too_many_arguments)] pub async fn spawn( + mut config: Web3RpcConfig, name: String, - display_name: Option, chain_id: u64, db_conn: Option, - url_str: String, // optional because this is only used for http providers. websocket providers don't use it http_client: Option, + // TODO: rename to http_new_head_interval_sender? http_interval_sender: Option>>, - // TODO: have a builder struct for this. - hard_limit: Option<(u64, RedisPool)>, - // TODO: think more about this type - soft_limit: u32, - backup: bool, - block_data_limit: Option, + redis_pool: Option, + // TODO: think more about soft limit. watching ewma of requests is probably better. 
but what should the random sort be on? maybe group on tier is enough + // soft_limit: u32, block_map: BlockHashesCache, block_sender: Option>, tx_id_sender: Option)>>, reconnect: bool, - tier: u64, ) -> anyhow::Result<(Arc, AnyhowJoinHandle<()>)> { - let hard_limit = hard_limit.map(|(hard_rate_limit, redis_pool)| { - // TODO: is cache size 1 okay? i think we need - RedisRateLimiter::new( - "web3_proxy", - &format!("{}:{}", chain_id, name), - hard_rate_limit, - 60.0, - redis_pool, - ) - }); + let hard_limit = match (config.hard_limit, redis_pool) { + (None, None) => None, + (Some(hard_limit), Some(redis_pool)) => { + // TODO: in process rate limiter instead? or is deffered good enough? + let rrl = RedisRateLimiter::new( + "web3_proxy", + &format!("{}:{}", chain_id, name), + hard_limit, + 60.0, + redis_pool, + ); - // TODO: should we do this even if block_sender is None? then we would know limits on private relays - let block_data_limit: AtomicU64 = block_data_limit.unwrap_or_default().into(); + Some(rrl) + } + (None, Some(_)) => None, + (Some(_hard_limit), None) => { + return Err(anyhow::anyhow!( + "no redis client pool! needed for hard limit" + )) + } + }; + + let tx_id_sender = if config.subscribe_txs { + // TODO: warn if tx_id_sender is None? + tx_id_sender + } else { + None + }; + + let backup = config.backup; + + let block_data_limit: AtomicU64 = config.block_data_limit.unwrap_or_default().into(); let automatic_block_limit = (block_data_limit.load(atomic::Ordering::Acquire) == 0) && block_sender.is_some(); @@ -178,19 +201,36 @@ impl Web3Rpc { None }; + if config.ws_url.is_none() && config.http_url.is_none() { + if let Some(url) = config.url { + if url.starts_with("ws") { + config.ws_url = Some(url); + } else if url.starts_with("http") { + config.http_url = Some(url); + } else { + return Err(anyhow!("only ws or http urls are supported")); + } + } else { + return Err(anyhow!( + "either ws_url or http_url are required. it is best to set both" + )); + } + } + let new_connection = Self { name, db_conn: db_conn.clone(), - display_name, + display_name: config.display_name, http_client, - url: url_str, + ws_url: config.ws_url, + http_url: config.http_url, hard_limit, hard_limit_until, - soft_limit, + soft_limit: config.soft_limit, automatic_block_limit, backup, block_data_limit, - tier, + tier: config.tier, ..Default::default() }; @@ -220,10 +260,23 @@ impl Web3Rpc { Ok((new_connection, handle)) } + pub async fn peak_ewma(&self) -> OrderedFloat { + // TODO: use request instead of head latency? that was killing perf though + let head_ewma = self.head_latency.read().value(); + + // TODO: what ordering? + let active_requests = self.active_requests.load(atomic::Ordering::Relaxed) as f64; + + // TODO: i'm not sure head * active is exactly right. but we'll see + // TODO: i don't think this actually counts as peak. investigate with atomics.rs and peak_ewma.rs + OrderedFloat(head_ewma * active_requests) + } + // TODO: would be great if rpcs exposed this. see https://github.com/ledgerwatch/erigon/issues/6391 async fn check_block_data_limit( self: &Arc, authorization: &Arc, + unlocked_provider: Option>, ) -> anyhow::Result> { if !self.automatic_block_limit { // TODO: is this a good thing to return? @@ -238,7 +291,7 @@ impl Web3Rpc { // TODO: start at 0 or 1? 
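// A small sketch in plain f64 math, deliberately not calling the ewma crate, so initialization
// details may differ from Latency::new above. It shows the span-to-alpha relation and why the
// warm-up loop only gets close to `start`: alpha = 2 / (span + 1), and applying the recurrence
// v = alpha * x + (1 - alpha) * v `span` times covers most, but not all, of the gap, which is
// the "start might not be perfect" caveat noted above.
fn ewma_warmup_math_demo() {
    let span = 25.0_f64;
    let start = 1000.0_f64;

    let alpha = 2.0 / (span + 1.0); // ~0.077: each new sample carries ~7.7% of the weight

    let mut value = 0.0_f64;
    for _ in 0..(span as u64) {
        value = alpha * start + (1.0 - alpha) * value;
    }

    // 1 - (1 - alpha)^25 is roughly 0.86, so the warmed-up value lands near 865, not exactly 1000
    assert!(value > 800.0 && value < start);
}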
for block_data_limit in [0, 32, 64, 128, 256, 512, 1024, 90_000, u64::MAX] { let handle = self - .wait_for_request_handle(authorization, None, true) + .wait_for_request_handle(authorization, None, unlocked_provider.clone()) .await?; let head_block_num_future = handle.request::, U256>( @@ -246,6 +299,7 @@ impl Web3Rpc { &None, // error here are expected, so keep the level low Level::Debug.into(), + unlocked_provider.clone(), ); let head_block_num = timeout(Duration::from_secs(5), head_block_num_future) @@ -264,7 +318,7 @@ impl Web3Rpc { // TODO: wait for the handle BEFORE we check the current block number. it might be delayed too! // TODO: what should the request be? let handle = self - .wait_for_request_handle(authorization, None, true) + .wait_for_request_handle(authorization, None, unlocked_provider.clone()) .await?; let archive_result: Result = handle @@ -276,6 +330,7 @@ impl Web3Rpc { )), // error here are expected, so keep the level low Level::Trace.into(), + unlocked_provider.clone(), ) .await; @@ -314,9 +369,9 @@ impl Web3Rpc { } pub fn has_block_data(&self, needed_block_num: &U64) -> bool { - let head_block_num = match self.head_block.read().clone() { + let head_block_num = match self.head_block.read().as_ref() { None => return false, - Some(x) => x.number(), + Some(x) => *x.number(), }; // this rpc doesn't have that block yet. still syncing @@ -403,119 +458,111 @@ impl Web3Rpc { chain_id: u64, db_conn: Option<&DatabaseConnection>, ) -> anyhow::Result<()> { - // trace!("provider_state {} locking...", self); - let mut provider_state = self - .provider_state - .try_write() - .context("locking provider for write")?; - // trace!("provider_state {} locked: {:?}", self, provider_state); - - match &*provider_state { - ProviderState::None => { - info!("connecting to {}", self); + if let Ok(mut unlocked_provider) = self.provider.try_write() { + #[cfg(test)] + if let Some(Web3Provider::Mock) = unlocked_provider.as_deref() { + return Ok(()); } - ProviderState::Connecting(provider) | ProviderState::Connected(provider) => { - // disconnect the current provider - if let Web3Provider::Mock = provider.as_ref() { - return Ok(()); + + *unlocked_provider = if let Some(ws_url) = self.ws_url.as_ref() { + // set up ws client + match &*unlocked_provider { + None => { + info!("connecting to {}", self); + } + Some(_) => { + debug!("reconnecting to {}", self); + + // tell the block subscriber that this rpc doesn't have any blocks + if let Some(block_sender) = block_sender { + block_sender + .send_async((None, self.clone())) + .await + .context("block_sender during connect")?; + } + + // reset sync status + let mut head_block = self.head_block.write(); + *head_block = None; + + // disconnect the current provider + // TODO: what until the block_sender's receiver finishes updating this item? 
+ *unlocked_provider = None; + } } - debug!("reconnecting to {}", self); + let p = Web3Provider::from_str(ws_url.as_str(), None) + .await + .context(format!("failed connecting to {}", ws_url))?; - // disconnect the current provider - *provider_state = ProviderState::None; + assert!(p.ws().is_some()); - // reset sync status - // trace!("locking head block on {}", self); - { - let mut head_block = self.head_block.write(); - *head_block = None; - } - // trace!("done with head block on {}", self); - - // tell the block subscriber that we don't have any blocks - if let Some(block_sender) = block_sender { - block_sender - .send_async((None, self.clone())) + Some(Arc::new(p)) + } else { + // http client + if let Some(url) = &self.http_url { + let p = Web3Provider::from_str(url, self.http_client.clone()) .await - .context("block_sender during connect")?; + .context(format!("failed connecting to {}", url))?; + + assert!(p.http().is_some()); + + Some(Arc::new(p)) + } else { + None + } + }; + + let authorization = Arc::new(Authorization::internal(db_conn.cloned())?); + + // check the server's chain_id here + // TODO: some public rpcs (on bsc and fantom) do not return an id and so this ends up being an error + // TODO: what should the timeout be? should there be a request timeout? + // trace!("waiting on chain id for {}", self); + let found_chain_id: Result = self + .wait_for_request_handle(&authorization, None, unlocked_provider.clone()) + .await? + .request( + "eth_chainId", + &json!(Option::None::<()>), + Level::Trace.into(), + unlocked_provider.clone(), + ) + .await; + // trace!("found_chain_id: {:?}", found_chain_id); + + match found_chain_id { + Ok(found_chain_id) => { + // TODO: there has to be a cleaner way to do this + if chain_id != found_chain_id.as_u64() { + return Err(anyhow::anyhow!( + "incorrect chain id! Config has {}, but RPC has {}", + chain_id, + found_chain_id + ) + .context(format!("failed @ {}", self))); + } + } + Err(e) => { + return Err(anyhow::Error::from(e)); } } - } - // trace!("Creating new Web3Provider on {}", self); - // TODO: if this fails, keep retrying! otherwise it crashes and doesn't try again! - let new_provider = Web3Provider::from_str(&self.url, self.http_client.clone()).await?; + self.check_block_data_limit(&authorization, unlocked_provider.clone()) + .await?; - // trace!("saving provider state as NotReady on {}", self); - *provider_state = ProviderState::Connecting(Arc::new(new_provider)); + drop(unlocked_provider); - // drop the lock so that we can get a request handle - // trace!("provider_state {} unlocked", self); - drop(provider_state); - - let authorization = Arc::new(Authorization::internal(db_conn.cloned())?); - - // check the server's chain_id here - // TODO: some public rpcs (on bsc and fantom) do not return an id and so this ends up being an error - // TODO: what should the timeout be? should there be a request timeout? - // trace!("waiting on chain id for {}", self); - let found_chain_id: Result = self - .wait_for_request_handle(&authorization, None, true) - .await? - .request( - "eth_chainId", - &json!(Option::None::<()>), - Level::Trace.into(), - ) - .await; - // trace!("found_chain_id: {:?}", found_chain_id); - - match found_chain_id { - Ok(found_chain_id) => { - // TODO: there has to be a cleaner way to do this - if chain_id != found_chain_id.as_u64() { - return Err(anyhow::anyhow!( - "incorrect chain id! 
Config has {}, but RPC has {}", - chain_id, - found_chain_id - ) - .context(format!("failed @ {}", self))); - } + info!("successfully connected to {}", self); + } else { + if self.provider.read().await.is_none() { + return Err(anyhow!("failed waiting for client")); } - Err(e) => { - return Err(anyhow::Error::from(e)); - } - } - - self.check_block_data_limit(&authorization).await?; - - { - // trace!("locking for ready..."); - let mut provider_state = self.provider_state.write().await; - // trace!("locked for ready..."); - - // TODO: do this without a clone - let ready_provider = provider_state - .provider(true) - .await - .context("provider missing")? - .clone(); - - *provider_state = ProviderState::Connected(ready_provider); - // trace!("unlocked for ready..."); - } - - info!("successfully connected to {}", self); + }; Ok(()) } - #[inline] - pub fn active_requests(&self) -> u32 { - self.active_requests.load(atomic::Ordering::Acquire) - } - async fn send_head_block_result( self: &Arc, new_head_block: Result, ProviderError>, @@ -539,9 +586,9 @@ impl Web3Rpc { None } Ok(Some(new_head_block)) => { - let new_hash = new_head_block - .hash - .context("sending block to connections")?; + let new_head_block = Web3ProxyBlock::try_new(new_head_block).unwrap(); + + let new_hash = *new_head_block.hash(); // if we already have this block saved, set new_head_block to that arc. otherwise store this copy let new_head_block = block_map @@ -558,7 +605,7 @@ impl Web3Rpc { if self.block_data_limit() == U64::zero() { let authorization = Arc::new(Authorization::internal(self.db_conn.clone())?); - if let Err(err) = self.check_block_data_limit(&authorization).await { + if let Err(err) = self.check_block_data_limit(&authorization, None).await { warn!( "failed checking block limit after {} finished syncing. {:?}", self, err @@ -604,6 +651,12 @@ impl Web3Rpc { reconnect: bool, tx_id_sender: Option)>>, ) -> anyhow::Result<()> { + let revert_handler = if self.backup { + RequestRevertHandler::DebugLevel + } else { + RequestRevertHandler::ErrorLevel + }; + loop { let http_interval_receiver = http_interval_sender.as_ref().map(|x| x.subscribe()); @@ -629,31 +682,104 @@ impl Web3Rpc { // provider is ready ready_tx.send(()).unwrap(); - // wait before doing the initial health check - // TODO: how often? - // TODO: subscribe to self.head_block + // TODO: how often? different depending on the chain? + // TODO: reset this timeout when a new block is seen? we need to keep request_latency updated though let health_sleep_seconds = 10; - sleep(Duration::from_secs(health_sleep_seconds)).await; + + // TODO: benchmark this and lock contention + let mut old_total_requests = 0; + let mut new_total_requests; loop { + sleep(Duration::from_secs(health_sleep_seconds)).await; + // TODO: what if we just happened to have this check line up with another restart? // TODO: think more about this - // trace!("health check on {}. locking...", conn); - if conn - .provider_state - .read() - .await - .provider(false) - .await - .is_none() - { - // trace!("health check unlocked with error on {}", conn); - // returning error will trigger a reconnect - return Err(anyhow::anyhow!("{} is not ready", conn)); - } - // trace!("health check on {}. unlocked", conn); + if let Some(client) = conn.provider.read().await.clone() { + // health check as a way of keeping this rpc's request_ewma accurate + // TODO: do something different if this is a backup server? 
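// A hedged sketch of the idle-server probe implemented below, assuming ethers' Middleware API;
// error handling is simplified and the function name is made up. The idea: read the head block,
// pick its last transaction, then fetch code at that transaction's `to` address so that both
// transaction and state lookups get exercised by the health check.
async fn idle_health_probe<M: ethers::providers::Middleware>(provider: &M) -> anyhow::Result<()> {
    use ethers::types::BlockNumber;

    let block = provider
        .get_block_with_txs(BlockNumber::Latest)
        .await
        .map_err(|err| anyhow::anyhow!("health probe block fetch failed: {}", err))?
        .ok_or_else(|| anyhow::anyhow!("no latest block"))?;

    if let Some(tx) = block.transactions.last() {
        // value transfers have a `to`; contract creations do not, so fall back to the zero address
        let to = tx.to.unwrap_or_default();

        provider
            .get_code(to, block.hash.map(Into::into))
            .await
            .map_err(|err| anyhow::anyhow!("health probe code fetch failed: {}", err))?;
    }

    Ok(())
}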
- sleep(Duration::from_secs(health_sleep_seconds)).await; + new_total_requests = + conn.total_requests.load(atomic::Ordering::Relaxed); + + if new_total_requests - old_total_requests < 10 { + // TODO: if this fails too many times, reset the connection + // TODO: move this into a function and the chaining should be easier + let head_block = conn.head_block.read().clone(); + + if let Some((block_hash, txid)) = head_block.and_then(|x| { + let block = x.block.clone(); + + let block_hash = block.hash?; + let txid = block.transactions.last().cloned()?; + + Some((block_hash, txid)) + }) { + let to = conn + .wait_for_query::<_, Option>( + "eth_getTransactionByHash", + &(txid,), + revert_handler, + authorization.clone(), + Some(client.clone()), + ) + .await + .and_then(|tx| { + let tx = tx.context("no transaction found")?; + + // TODO: what default? something real? + let to = tx.to.unwrap_or_else(|| { + "0xdead00000000000000000000000000000000beef" + .parse::
() + .expect("deafbeef") + }); + + Ok(to) + }); + + let code = match to { + Err(err) => { + if conn.backup { + debug!( + "{} failed health check query! {:#?}", + conn, err + ); + } else { + warn!( + "{} failed health check query! {:#?}", + conn, err + ); + } + continue; + } + Ok(to) => { + conn.wait_for_query::<_, Option>( + "eth_getCode", + &(to, block_hash), + revert_handler, + authorization.clone(), + Some(client), + ) + .await + } + }; + + if let Err(err) = code { + if conn.backup { + debug!( + "{} failed health check query! {:#?}", + conn, err + ); + } else { + warn!("{} failed health check query! {:#?}", conn, err); + } + continue; + } + } + } + + old_total_requests = new_total_requests; + } } }; @@ -712,7 +838,7 @@ impl Web3Rpc { Ok(()) } - /// Subscribe to new blocks. If `reconnect` is true, this runs forever. + /// Subscribe to new blocks. async fn subscribe_new_heads( self: Arc, authorization: Arc, @@ -722,233 +848,222 @@ impl Web3Rpc { ) -> anyhow::Result<()> { trace!("watching new heads on {}", self); - // trace!("locking on new heads"); - let provider_state = self - .provider_state - .try_read() - .context("subscribe_new_heads")? - .clone(); - // trace!("unlocked on new heads"); + let unlocked_provider = self.provider.read().await; - // TODO: need a timeout - if let ProviderState::Connected(provider) = provider_state { - match provider.as_ref() { - Web3Provider::Mock => unimplemented!(), - Web3Provider::Http(_provider) => { - // there is a "watch_blocks" function, but a lot of public nodes do not support the necessary rpc endpoints - // TODO: try watch_blocks and fall back to this? + match unlocked_provider.as_deref() { + Some(Web3Provider::Http(_client)) => { + // there is a "watch_blocks" function, but a lot of public nodes do not support the necessary rpc endpoints + // TODO: try watch_blocks and fall back to this? - let mut http_interval_receiver = http_interval_receiver.unwrap(); + let mut http_interval_receiver = http_interval_receiver.unwrap(); - let mut last_hash = H256::zero(); + let mut last_hash = H256::zero(); - loop { - // TODO: what should the max_wait be? - match self - .wait_for_request_handle(&authorization, None, false) - .await - { - Ok(active_request_handle) => { - let block: Result, _> = active_request_handle - .request( - "eth_getBlockByNumber", - &json!(("latest", false)), - Level::Warn.into(), + loop { + // TODO: what should the max_wait be? + match self + .wait_for_request_handle(&authorization, None, unlocked_provider.clone()) + .await + { + Ok(active_request_handle) => { + let block: Result, _> = active_request_handle + .request( + "eth_getBlockByNumber", + &json!(("latest", false)), + Level::Warn.into(), + None, + ) + .await; + + match block { + Ok(None) => { + warn!("no head block on {}", self); + + self.send_head_block_result( + Ok(None), + &block_sender, + block_map.clone(), ) - .await; + .await?; + } + Ok(Some(block)) => { + // don't send repeat blocks + let new_hash = + block.hash.expect("blocks here should always have hashes"); - match block { - Ok(None) => { - warn!("no head block on {}", self); + if new_hash != last_hash { + // new hash! + last_hash = new_hash; self.send_head_block_result( - Ok(None), - &block_sender, - block_map.clone(), - ) - .await?; - } - Ok(Some(block)) => { - // don't send repeat blocks - let new_hash = block - .hash - .expect("blocks here should always have hashes"); - - if new_hash != last_hash { - // new hash! 
- last_hash = new_hash; - - self.send_head_block_result( - Ok(Some(block)), - &block_sender, - block_map.clone(), - ) - .await?; - } - } - Err(err) => { - // we did not get a block back. something is up with the server. take it out of rotation - self.send_head_block_result( - Err(err), + Ok(Some(block)), &block_sender, block_map.clone(), ) .await?; } } - } - Err(err) => { - warn!("Internal error on latest block from {}. {:?}", self, err); - - self.send_head_block_result( - Ok(None), - &block_sender, - block_map.clone(), - ) - .await?; - - // TODO: what should we do? sleep? extra time? + Err(err) => { + // we did not get a block back. something is up with the server. take it out of rotation + self.send_head_block_result( + Err(err), + &block_sender, + block_map.clone(), + ) + .await?; + } } } + Err(err) => { + warn!("Internal error on latest block from {}. {:?}", self, err); - // wait for the next interval - // TODO: if error or rate limit, increase interval? - while let Err(err) = http_interval_receiver.recv().await { - match err { - broadcast::error::RecvError::Closed => { - // channel is closed! that's not good. bubble the error up - return Err(err.into()); - } - broadcast::error::RecvError::Lagged(lagged) => { - // querying the block was delayed - // this can happen if tokio is very busy or waiting for requests limits took too long + self.send_head_block_result(Ok(None), &block_sender, block_map.clone()) + .await?; + + // TODO: what should we do? sleep? extra time? + } + } + + // wait for the next interval + // TODO: if error or rate limit, increase interval? + while let Err(err) = http_interval_receiver.recv().await { + match err { + broadcast::error::RecvError::Closed => { + // channel is closed! that's not good. bubble the error up + return Err(err.into()); + } + broadcast::error::RecvError::Lagged(lagged) => { + // querying the block was delayed + // this can happen if tokio is very busy or waiting for requests limits took too long + if self.backup { + debug!("http interval on {} lagging by {}!", self, lagged); + } else { warn!("http interval on {} lagging by {}!", self, lagged); } } } } } - Web3Provider::Ws(provider) => { - // todo: move subscribe_blocks onto the request handle? - let active_request_handle = self - .wait_for_request_handle(&authorization, None, false) - .await; - let mut stream = provider.subscribe_blocks().await?; - drop(active_request_handle); + } + Some(Web3Provider::Both(_, client)) | Some(Web3Provider::Ws(client)) => { + // todo: move subscribe_blocks onto the request handle? + let active_request_handle = self + .wait_for_request_handle(&authorization, None, unlocked_provider.clone()) + .await; + let mut stream = client.subscribe_blocks().await?; + drop(active_request_handle); - // query the block once since the subscription doesn't send the current block - // there is a very small race condition here where the stream could send us a new block right now - // all it does is print "new block" for the same block as current block - // TODO: how does this get wrapped in an arc? does ethers handle that? - let block: Result, _> = self - .wait_for_request_handle(&authorization, None, false) - .await? 
- .request( - "eth_getBlockByNumber", - &json!(("latest", false)), - Level::Warn.into(), - ) - .await; + // query the block once since the subscription doesn't send the current block + // there is a very small race condition here where the stream could send us a new block right now + // but all that does is print "new block" for the same block as current block + // TODO: how does this get wrapped in an arc? does ethers handle that? + // TODO: do this part over http? + let block: Result, _> = self + .wait_for_request_handle(&authorization, None, unlocked_provider.clone()) + .await? + .request( + "eth_getBlockByNumber", + &json!(("latest", false)), + Level::Warn.into(), + unlocked_provider.clone(), + ) + .await; - let mut last_hash = match &block { - Ok(Some(new_block)) => new_block - .hash - .expect("blocks should always have a hash here"), - _ => H256::zero(), - }; + let mut last_hash = match &block { + Ok(Some(new_block)) => new_block + .hash + .expect("blocks should always have a hash here"), + _ => H256::zero(), + }; - self.send_head_block_result(block, &block_sender, block_map.clone()) - .await?; + self.send_head_block_result(block, &block_sender, block_map.clone()) + .await?; - while let Some(new_block) = stream.next().await { - // TODO: check the new block's hash to be sure we don't send dupes - let new_hash = new_block - .hash - .expect("blocks should always have a hash here"); + while let Some(new_block) = stream.next().await { + // TODO: check the new block's hash to be sure we don't send dupes + let new_hash = new_block + .hash + .expect("blocks should always have a hash here"); - if new_hash == last_hash { - // some rpcs like to give us duplicates. don't waste our time on them - continue; - } else { - last_hash = new_hash; - } - - self.send_head_block_result( - Ok(Some(Arc::new(new_block))), - &block_sender, - block_map.clone(), - ) - .await?; + if new_hash == last_hash { + // some rpcs like to give us duplicates. don't waste our time on them + continue; + } else { + last_hash = new_hash; } - // clear the head block. this might not be needed, but it won't hurt - self.send_head_block_result(Ok(None), &block_sender, block_map) - .await?; - - // TODO: is this always an error? - // TODO: we probably don't want a warn and to return error - warn!("new_heads subscription to {} ended", self); - Err(anyhow::anyhow!("new_heads subscription ended")) + self.send_head_block_result( + Ok(Some(Arc::new(new_block))), + &block_sender, + block_map.clone(), + ) + .await?; } + + // clear the head block. this might not be needed, but it won't hurt + self.send_head_block_result(Ok(None), &block_sender, block_map) + .await?; + + // TODO: is this always an error? + // TODO: we probably don't want a warn and to return error + warn!("new_heads subscription to {} ended", self); + Err(anyhow::anyhow!("new_heads subscription ended")) } - } else { - Err(anyhow::anyhow!( - "Provider not ready! Unable to subscribe to heads" - )) + None => todo!("what should happen now? wait for a connection?"), + #[cfg(test)] + Some(Web3Provider::Mock) => unimplemented!(), } } + /// Turn on the firehose of pending transactions async fn subscribe_pending_transactions( self: Arc, authorization: Arc, tx_id_sender: flume::Sender<(TxHash, Arc)>, ) -> anyhow::Result<()> { - if let ProviderState::Connected(provider) = self - .provider_state - .try_read() - .context("subscribe_pending_transactions")? - .clone() - { - trace!("watching pending transactions on {}", self); - // TODO: does this keep the lock open for too long? 
- match provider.as_ref() { - Web3Provider::Mock => unimplemented!(), - Web3Provider::Http(provider) => { - // there is a "watch_pending_transactions" function, but a lot of public nodes do not support the necessary rpc endpoints - // TODO: maybe subscribe to self.head_block? - // TODO: this keeps a read lock guard open on provider_state forever. is that okay for an http client? - futures::future::pending::<()>().await; - } - Web3Provider::Ws(provider) => { - // TODO: maybe the subscribe_pending_txs function should be on the active_request_handle - let active_request_handle = self - .wait_for_request_handle(&authorization, None, false) - .await?; + // TODO: give this a separate client. don't use new_head_client for everything. especially a firehose this big + // TODO: timeout + let provider = self.provider.read().await; - let mut stream = provider.subscribe_pending_txs().await?; - - drop(active_request_handle); - - while let Some(pending_tx_id) = stream.next().await { - tx_id_sender - .send_async((pending_tx_id, self.clone())) - .await - .context("tx_id_sender")?; - - // TODO: periodically check for listeners. if no one is subscribed, unsubscribe and wait for a subscription - } - - // TODO: is this always an error? - // TODO: we probably don't want a warn and to return error - warn!("pending_transactions subscription ended on {}", self); - return Err(anyhow::anyhow!("pending_transactions subscription ended")); - } + trace!("watching pending transactions on {}", self); + // TODO: does this keep the lock open for too long? + match provider.as_deref() { + None => { + // TODO: wait for a provider + return Err(anyhow!("no provider")); } - } else { - warn!( - "Provider not ready! Unable to watch pending transactions on {}", - self - ); + Some(Web3Provider::Http(provider)) => { + // there is a "watch_pending_transactions" function, but a lot of public nodes do not support the necessary rpc endpoints + // TODO: maybe subscribe to self.head_block? + // TODO: this keeps a read lock guard open on provider_state forever. is that okay for an http client? + futures::future::pending::<()>().await; + } + Some(Web3Provider::Both(_, client)) | Some(Web3Provider::Ws(client)) => { + // TODO: maybe the subscribe_pending_txs function should be on the active_request_handle + let active_request_handle = self + .wait_for_request_handle(&authorization, None, provider.clone()) + .await?; + + let mut stream = client.subscribe_pending_txs().await?; + + drop(active_request_handle); + + while let Some(pending_tx_id) = stream.next().await { + tx_id_sender + .send_async((pending_tx_id, self.clone())) + .await + .context("tx_id_sender")?; + + // TODO: periodically check for listeners. if no one is subscribed, unsubscribe and wait for a subscription + } + + // TODO: is this always an error? + // TODO: we probably don't want a warn and to return error + warn!("pending_transactions subscription ended on {}", self); + return Err(anyhow::anyhow!("pending_transactions subscription ended")); + } + #[cfg(test)] + Some(Web3Provider::Mock) => futures::future::pending::<()>().await, } Ok(()) @@ -957,17 +1072,17 @@ impl Web3Rpc { /// be careful with this; it might wait forever! /// `allow_not_ready` is only for use by health checks while starting the provider /// TODO: don't use anyhow. 
use specific error type - pub async fn wait_for_request_handle( - self: &Arc, - authorization: &Arc, + pub async fn wait_for_request_handle<'a>( + self: &'a Arc, + authorization: &'a Arc, max_wait: Option, - allow_not_ready: bool, + unlocked_provider: Option>, ) -> anyhow::Result { let max_wait = max_wait.map(|x| Instant::now() + x); loop { match self - .try_request_handle(authorization, allow_not_ready) + .try_request_handle(authorization, unlocked_provider.clone()) .await { Ok(OpenRequestResult::Handle(handle)) => return Ok(handle), @@ -991,7 +1106,7 @@ impl Web3Rpc { sleep_until(retry_at).await; } - Ok(OpenRequestResult::NotReady(_)) => { + Ok(OpenRequestResult::NotReady) => { // TODO: when can this happen? log? emit a stat? trace!("{} has no handle ready", self); @@ -1015,21 +1130,15 @@ impl Web3Rpc { pub async fn try_request_handle( self: &Arc, authorization: &Arc, - // TODO? ready_provider: Option<&Arc>, - allow_not_ready: bool, + // TODO: borrow on this instead of needing to clone the Arc? + unlocked_provider: Option>, ) -> anyhow::Result { // TODO: think more about this read block - if !allow_not_ready - && self - .provider_state - .read() - .await - .provider(allow_not_ready) - .await - .is_none() - { - trace!("{} is not ready", self); - return Ok(OpenRequestResult::NotReady(self.backup)); + // TODO: this should *not* be new_head_client. this should be a separate object + if unlocked_provider.is_some() || self.provider.read().await.is_some() { + // we already have an unlocked provider. no need to lock + } else { + return Ok(OpenRequestResult::NotReady); } if let Some(hard_limit_until) = self.hard_limit_until.as_ref() { @@ -1071,7 +1180,7 @@ impl Web3Rpc { return Ok(OpenRequestResult::RetryAt(retry_at)); } RedisRateLimitResult::RetryNever => { - return Ok(OpenRequestResult::NotReady(self.backup)); + return Ok(OpenRequestResult::NotReady); } } }; @@ -1080,6 +1189,26 @@ impl Web3Rpc { Ok(OpenRequestResult::Handle(handle)) } + + pub async fn wait_for_query( + self: &Arc, + method: &str, + params: &P, + revert_handler: RequestRevertHandler, + authorization: Arc, + unlocked_provider: Option>, + ) -> anyhow::Result + where + // TODO: not sure about this type. would be better to not need clones, but measure and spawns combine to need it + P: Clone + fmt::Debug + serde::Serialize + Send + Sync + 'static, + R: serde::Serialize + serde::de::DeserializeOwned + fmt::Debug, + { + self.wait_for_request_handle(&authorization, None, None) + .await? + .request::(method, params, revert_handler, unlocked_provider) + .await + .context("ProviderError from the backend") + } } impl fmt::Debug for Web3Provider { @@ -1122,7 +1251,7 @@ impl Serialize for Web3Rpc { S: Serializer, { // 3 is the number of fields in the struct. - let mut state = serializer.serialize_struct("Web3Rpc", 9)?; + let mut state = serializer.serialize_struct("Web3Rpc", 10)?; // the url is excluded because it likely includes private information. just show the name that we use in keys state.serialize_field("name", &self.name)?; @@ -1144,22 +1273,16 @@ impl Serialize for Web3Rpc { state.serialize_field("soft_limit", &self.soft_limit)?; - state.serialize_field( - "active_requests", - &self.active_requests.load(atomic::Ordering::Relaxed), - )?; + // TODO: maybe this is too much data. serialize less? 
+ state.serialize_field("head_block", &*self.head_block.read())?; + + state.serialize_field("head_latency", &self.head_latency.read().value())?; state.serialize_field( "total_requests", - &self.frontend_requests.load(atomic::Ordering::Relaxed), + &self.total_requests.load(atomic::Ordering::Relaxed), )?; - { - // TODO: maybe this is too much data. serialize less? - let head_block = &*self.head_block.read(); - state.serialize_field("head_block", head_block)?; - } - state.end() } } @@ -1211,12 +1334,12 @@ mod tests { let random_block = Arc::new(random_block); - let head_block = SavedBlock::new(random_block); + let head_block = Web3ProxyBlock::try_new(random_block).unwrap(); let block_data_limit = u64::MAX; let x = Web3Rpc { name: "name".to_string(), - url: "ws://example.com".to_string(), + ws_url: Some("ws://example.com".to_string()), soft_limit: 1_000, automatic_block_limit: false, backup: false, @@ -1241,17 +1364,17 @@ mod tests { .as_secs() .into(); - let head_block: SavedBlock = Arc::new(Block { + let head_block: Web3ProxyBlock = Arc::new(Block { hash: Some(H256::random()), number: Some(1_000_000.into()), timestamp: now, ..Default::default() }) - .into(); + .try_into() + .unwrap(); let block_data_limit = 64; - // TODO: this is getting long. have a `impl Default` let x = Web3Rpc { name: "name".to_string(), soft_limit: 1_000, diff --git a/web3_proxy/src/rpcs/provider.rs b/web3_proxy/src/rpcs/provider.rs index add17a43..a65c7cea 100644 --- a/web3_proxy/src/rpcs/provider.rs +++ b/web3_proxy/src/rpcs/provider.rs @@ -2,22 +2,45 @@ use anyhow::Context; use derive_more::From; use std::time::Duration; +// TODO: our own structs for these that handle streaming large responses +type EthersHttpProvider = ethers::providers::Provider; +type EthersWsProvider = ethers::providers::Provider; + /// Use HTTP and WS providers. // TODO: instead of an enum, I tried to use Box, but hit +// TODO: custom types that let us stream JSON responses #[derive(From)] pub enum Web3Provider { - Http(ethers::providers::Provider), - Ws(ethers::providers::Provider), - // TODO: only include this for tests. + Both(EthersHttpProvider, EthersWsProvider), + Http(EthersHttpProvider), + // TODO: deadpool? 
custom tokio-tungstenite + Ws(EthersWsProvider), + #[cfg(test)] Mock, } impl Web3Provider { pub fn ready(&self) -> bool { match self { - Self::Mock => true, + Self::Both(_, ws) => ws.as_ref().ready(), Self::Http(_) => true, - Self::Ws(provider) => provider.as_ref().ready(), + Self::Ws(ws) => ws.as_ref().ready(), + #[cfg(test)] + Self::Mock => true, + } + } + + pub fn http(&self) -> Option<&EthersHttpProvider> { + match self { + Self::Http(x) => Some(x), + _ => None, + } + } + + pub fn ws(&self) -> Option<&EthersWsProvider> { + match self { + Self::Both(_, x) | Self::Ws(x) => Some(x), + _ => None, } } diff --git a/web3_proxy/src/rpcs/request.rs b/web3_proxy/src/rpcs/request.rs index da204992..139e3bba 100644 --- a/web3_proxy/src/rpcs/request.rs +++ b/web3_proxy/src/rpcs/request.rs @@ -1,6 +1,6 @@ use super::one::Web3Rpc; use super::provider::Web3Provider; -use crate::frontend::authorization::{Authorization, AuthorizationType}; +use crate::frontend::authorization::Authorization; use anyhow::Context; use chrono::Utc; use entities::revert_log; @@ -11,7 +11,6 @@ use log::{debug, error, trace, warn, Level}; use migration::sea_orm::{self, ActiveEnum, ActiveModelTrait}; use serde_json::json; use std::fmt; -use std::sync::atomic; use std::sync::Arc; use thread_fast_rng::rand::Rng; use tokio::time::{sleep, Duration, Instant}; @@ -21,20 +20,20 @@ pub enum OpenRequestResult { Handle(OpenRequestHandle), /// Unable to start a request. Retry at the given time. RetryAt(Instant), - /// Unable to start a request because the server is not synced - /// contains "true" if backup servers were attempted - NotReady(bool), + /// Unable to start a request because no servers are synced + NotReady, } /// Make RPC requests through this handle and drop it when you are done. +/// Opening this handle checks rate limits. Developers, try to keep opening a handle and using it as close together as possible #[derive(Debug)] pub struct OpenRequestHandle { authorization: Arc, - conn: Arc, - provider: Arc, + rpc: Arc, } /// Depending on the context, RPC errors can require different handling. +#[derive(Copy, Clone)] pub enum RequestRevertHandler { /// Log at the trace level. Use when errors are expected. TraceLevel, @@ -123,79 +122,30 @@ impl Authorization { impl OpenRequestHandle { pub async fn new(authorization: Arc, conn: Arc) -> Self { - // TODO: take request_id as an argument? - // TODO: attach a unique id to this? customer requests have one, but not internal queries - // TODO: what ordering?! - conn.active_requests.fetch_add(1, atomic::Ordering::Relaxed); - - let mut provider = None; - let mut logged = false; - while provider.is_none() { - // trace!("waiting on provider: locking..."); - - let ready_provider = conn - .provider_state - .read() - .await - // TODO: hard code true, or take a bool in the `new` function? - .provider(true) - .await - .cloned(); - // trace!("waiting on provider: unlocked!"); - - match ready_provider { - None => { - if !logged { - logged = true; - warn!("no provider for {}!", conn); - } - - // TODO: how should this work? a reconnect should be in progress. but maybe force one now? - // TODO: sleep how long? subscribe to something instead? maybe use a watch handle? - // TODO: this is going to be way too verbose! - sleep(Duration::from_millis(100)).await - } - Some(x) => provider = Some(x), - } - } - let provider = provider.expect("provider was checked already"); - - // TODO: handle overflows? - // TODO: what ordering? 
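// A tiny sketch, illustrative only, of the relaxed-atomics counter pattern used below for
// total_requests and active_requests: the counts only feed metrics and the peak_ewma load
// estimate, which is why Ordering::Relaxed is enough for a plain counter.
struct DemoRequestCounters {
    total_requests: std::sync::atomic::AtomicUsize,
    active_requests: std::sync::atomic::AtomicUsize,
}

impl DemoRequestCounters {
    fn start_request(&self) {
        self.total_requests
            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        self.active_requests
            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    }

    fn finish_request(&self) {
        self.active_requests
            .fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
    }
}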
- match authorization.as_ref().authorization_type { - AuthorizationType::Frontend => { - conn.frontend_requests - .fetch_add(1, atomic::Ordering::Relaxed); - } - AuthorizationType::Internal => { - conn.internal_requests - .fetch_add(1, atomic::Ordering::Relaxed); - } - } - Self { authorization, - conn, - provider, + rpc: conn, } } pub fn connection_name(&self) -> String { - self.conn.name.clone() + self.rpc.name.clone() } #[inline] pub fn clone_connection(&self) -> Arc { - self.conn.clone() + self.rpc.clone() } /// Send a web3 request /// By having the request method here, we ensure that the rate limiter was called and connection counts were properly incremented + /// depending on how things are locked, you might need to pass the provider in pub async fn request( self, method: &str, params: &P, revert_handler: RequestRevertHandler, + unlocked_provider: Option>, ) -> Result where // TODO: not sure about this type. would be better to not need clones, but measure and spawns combine to need it @@ -205,14 +155,57 @@ impl OpenRequestHandle { // TODO: use tracing spans // TODO: including params in this log is way too verbose // trace!(rpc=%self.conn, %method, "request"); + trace!("requesting from {}", self.rpc); + + let mut provider = if unlocked_provider.is_some() { + unlocked_provider + } else { + self.rpc.provider.read().await.clone() + }; + + let mut logged = false; + while provider.is_none() { + // trace!("waiting on provider: locking..."); + sleep(Duration::from_millis(100)).await; + + if !logged { + debug!("no provider for open handle on {}", self.rpc); + logged = true; + } + + provider = self.rpc.provider.read().await.clone(); + } + + let provider = provider.expect("provider was checked already"); + + self.rpc + .total_requests + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + + self.rpc + .active_requests + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + + // let latency = Instant::now(); // TODO: replace ethers-rs providers with our own that supports streaming the responses - let response = match &*self.provider { + let response = match provider.as_ref() { + #[cfg(test)] Web3Provider::Mock => unimplemented!(), - Web3Provider::Http(provider) => provider.request(method, params).await, - Web3Provider::Ws(provider) => provider.request(method, params).await, + Web3Provider::Ws(p) => p.request(method, params).await, + Web3Provider::Http(p) | Web3Provider::Both(p, _) => { + // TODO: i keep hearing that http is faster. but ws has always been better for me. investigate more with actual benchmarks + p.request(method, params).await + } }; + // note. we intentionally do not record this latency now. 
we do NOT want to measure errors + // let latency = latency.elapsed(); + + self.rpc + .active_requests + .fetch_sub(1, std::sync::atomic::Ordering::Relaxed); + // // TODO: i think ethers already has trace logging (and does it much more fancy) // trace!( // "response from {} for {} {:?}: {:?}", @@ -266,8 +259,22 @@ impl OpenRequestHandle { // check for "execution reverted" here let response_type = if let ProviderError::JsonRpcClientError(err) = err { // Http and Ws errors are very similar, but different types - let msg = match &*self.provider { + let msg = match &*provider { + #[cfg(test)] Web3Provider::Mock => unimplemented!(), + Web3Provider::Both(_, _) => { + if let Some(HttpClientError::JsonRpcError(err)) = + err.downcast_ref::() + { + Some(&err.message) + } else if let Some(WsClientError::JsonRpcError(err)) = + err.downcast_ref::() + { + Some(&err.message) + } else { + None + } + } Web3Provider::Http(_) => { if let Some(HttpClientError::JsonRpcError(err)) = err.downcast_ref::() @@ -290,10 +297,10 @@ impl OpenRequestHandle { if let Some(msg) = msg { if msg.starts_with("execution reverted") { - trace!("revert from {}", self.conn); + trace!("revert from {}", self.rpc); ResponseTypes::Revert } else if msg.contains("limit") || msg.contains("request") { - trace!("rate limit from {}", self.conn); + trace!("rate limit from {}", self.rpc); ResponseTypes::RateLimit } else { ResponseTypes::Ok @@ -306,10 +313,10 @@ impl OpenRequestHandle { }; if matches!(response_type, ResponseTypes::RateLimit) { - if let Some(hard_limit_until) = self.conn.hard_limit_until.as_ref() { + if let Some(hard_limit_until) = self.rpc.hard_limit_until.as_ref() { let retry_at = Instant::now() + Duration::from_secs(1); - trace!("retry {} at: {:?}", self.conn, retry_at); + trace!("retry {} at: {:?}", self.rpc, retry_at); hard_limit_until.send_replace(retry_at); } @@ -322,14 +329,14 @@ impl OpenRequestHandle { if matches!(response_type, ResponseTypes::Revert) { debug!( "bad response from {}! method={} params={:?} err={:?}", - self.conn, method, params, err + self.rpc, method, params, err ); } } RequestRevertHandler::TraceLevel => { trace!( "bad response from {}! method={} params={:?} err={:?}", - self.conn, + self.rpc, method, params, err @@ -339,20 +346,20 @@ impl OpenRequestHandle { // TODO: include params if not running in release mode error!( "bad response from {}! method={} err={:?}", - self.conn, method, err + self.rpc, method, err ); } RequestRevertHandler::WarnLevel => { // TODO: include params if not running in release mode warn!( "bad response from {}! method={} err={:?}", - self.conn, method, err + self.rpc, method, err ); } RequestRevertHandler::Save => { trace!( "bad response from {}! method={} params={:?} err={:?}", - self.conn, + self.rpc, method, params, err @@ -372,16 +379,16 @@ impl OpenRequestHandle { tokio::spawn(f); } } + } else { + // TODO: record request latency + // let latency_ms = start.elapsed().as_secs_f64() * 1000.0; + + // TODO: is this lock here a problem? should this be done through a channel? 
i started to code it, but it didn't seem to matter + // let mut latency_recording = self.rpc.request_latency.write(); + + // latency_recording.record(latency_ms); } response } } - -impl Drop for OpenRequestHandle { - fn drop(&mut self) { - self.conn - .active_requests - .fetch_sub(1, atomic::Ordering::AcqRel); - } -} diff --git a/web3_proxy/src/rpcs/synced_connections.rs b/web3_proxy/src/rpcs/synced_connections.rs deleted file mode 100644 index e285c307..00000000 --- a/web3_proxy/src/rpcs/synced_connections.rs +++ /dev/null @@ -1,71 +0,0 @@ -use super::blockchain::{ArcBlock, SavedBlock}; -use super::many::Web3Rpcs; -use super::one::Web3Rpc; -use ethers::prelude::{H256, U64}; -use serde::Serialize; -use std::fmt; -use std::sync::Arc; - -/// A collection of Web3Rpcs that are on the same block. -/// Serialize is so we can print it on our debug endpoint -#[derive(Clone, Default, Serialize)] -pub struct ConsensusWeb3Rpcs { - // TODO: store ArcBlock instead? - pub(super) head_block: Option, - // TODO: this should be able to serialize, but it isn't - #[serde(skip_serializing)] - pub(super) conns: Vec>, - pub(super) num_checked_conns: usize, - pub(super) includes_backups: bool, -} - -impl ConsensusWeb3Rpcs { - pub fn num_conns(&self) -> usize { - self.conns.len() - } - - pub fn sum_soft_limit(&self) -> u32 { - self.conns.iter().fold(0, |sum, rpc| sum + rpc.soft_limit) - } - - // TODO: sum_hard_limit? -} - -impl fmt::Debug for ConsensusWeb3Rpcs { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // TODO: the default formatter takes forever to write. this is too quiet though - // TODO: print the actual conns? - f.debug_struct("ConsensusConnections") - .field("head_block", &self.head_block) - .field("num_conns", &self.conns.len()) - .finish_non_exhaustive() - } -} - -impl Web3Rpcs { - pub fn head_block(&self) -> Option { - self.watch_consensus_head_receiver - .as_ref() - .map(|x| x.borrow().clone()) - } - - pub fn head_block_hash(&self) -> Option { - self.head_block().and_then(|x| x.hash) - } - - pub fn head_block_num(&self) -> Option { - self.head_block().and_then(|x| x.number) - } - - pub fn synced(&self) -> bool { - !self - .watch_consensus_connections_sender - .borrow() - .conns - .is_empty() - } - - pub fn num_synced_rpcs(&self) -> usize { - self.watch_consensus_connections_sender.borrow().conns.len() - } -} diff --git a/web3_proxy/src/rpcs/transactions.rs b/web3_proxy/src/rpcs/transactions.rs index dc5710d1..466a92be 100644 --- a/web3_proxy/src/rpcs/transactions.rs +++ b/web3_proxy/src/rpcs/transactions.rs @@ -28,13 +28,15 @@ impl Web3Rpcs { // TODO: might not be a race. might be a nonce thats higher than the current account nonce. geth discards chains // TODO: yearn devs have had better luck with batching these, but i think that's likely just adding a delay itself // TODO: if one rpc fails, try another? - let tx: Transaction = match rpc.try_request_handle(authorization, false).await { + // TODO: try_request_handle, or wait_for_request_handle? I think we want wait here + let tx: Transaction = match rpc.try_request_handle(authorization, None).await { Ok(OpenRequestResult::Handle(handle)) => { handle .request( "eth_getTransactionByHash", &(pending_tx_id,), Level::Error.into(), + None, ) .await? } diff --git a/web3_proxy/src/rpcs/ws.rs b/web3_proxy/src/rpcs/ws.rs new file mode 100644 index 00000000..e69de29b
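
The new `ewma` dependency and the serialized `head_latency` field give each backend RPC a smoothed latency figure (recording per-request latency is still a TODO inside `request()`). The sketch below is a hand-rolled illustration of the recurrence such a tracker applies; it is not the `ewma` crate's actual API, and the `record`/`value` names and the 0.25 smoothing factor are assumptions for the example.

```rust
// Hand-rolled EWMA sketch: v = alpha * sample + (1 - alpha) * v.
// Illustrative only; the real field is read with
// `self.head_latency.read().value()` in the Serialize impl above.

struct EwmaLatency {
    /// weight given to the newest sample, in (0, 1]
    alpha: f64,
    /// current smoothed latency in milliseconds (None until the first sample)
    current: Option<f64>,
}

impl EwmaLatency {
    fn new(alpha: f64) -> Self {
        assert!(alpha > 0.0 && alpha <= 1.0);
        Self { alpha, current: None }
    }

    /// fold one latency measurement (in milliseconds) into the average
    fn record(&mut self, latency_ms: f64) {
        self.current = Some(match self.current {
            Some(v) => self.alpha * latency_ms + (1.0 - self.alpha) * v,
            None => latency_ms,
        });
    }

    /// the smoothed latency, or 0.0 before any samples arrive
    fn value(&self) -> f64 {
        self.current.unwrap_or(0.0)
    }
}

fn main() {
    let mut head_latency = EwmaLatency::new(0.25);
    for sample_ms in [120.0, 80.0, 95.0, 400.0] {
        head_latency.record(sample_ms);
    }
    // one slow sample nudges the average up without letting it dominate
    println!("smoothed head latency: {:.1} ms", head_latency.value());
}
```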
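
In the tests, `SavedBlock::new(...)` becomes `Web3ProxyBlock::try_new(...).unwrap()` and `.into()` becomes `.try_into().unwrap()`, so wrapping a block is now fallible. A minimal sketch of that shape with stand-in types; the validation rule shown here (require both a hash and a number) is an assumption for illustration, not necessarily the crate's exact check.

```rust
// Sketch of a fallible block wrapper in the spirit of Web3ProxyBlock::try_new.
// `Block` and the error type are simplified stand-ins for the ethers types.

use std::convert::TryFrom;

#[derive(Debug, Default, Clone)]
struct Block {
    hash: Option<[u8; 32]>, // stand-in for H256
    number: Option<u64>,    // stand-in for U64
}

#[derive(Debug)]
struct Web3ProxyBlock {
    block: Block,
}

impl TryFrom<Block> for Web3ProxyBlock {
    type Error = &'static str;

    fn try_from(block: Block) -> Result<Self, Self::Error> {
        // assumed rule: a block without a hash or number (e.g. still pending)
        // should never be treated as a head block
        if block.hash.is_none() || block.number.is_none() {
            return Err("block is missing a hash or number");
        }
        Ok(Self { block })
    }
}

fn main() {
    // a pending block must not become a head block
    assert!(Web3ProxyBlock::try_from(Block::default()).is_err());

    let mined = Block {
        hash: Some([0u8; 32]),
        number: Some(1_000_000),
    };
    println!("{:?}", Web3ProxyBlock::try_from(mined).unwrap());
}
```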
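
`OpenRequestHandle::request()` now takes an `unlocked_provider` argument, falls back to polling `rpc.provider` until a (re)connect has populated it, and increments `total_requests`/`active_requests` around the call itself, replacing the old `Drop` impl. A simplified sketch of that flow, assuming the tokio runtime and stand-in types for `Web3Rpc`/`Web3Provider`:

```rust
// Sketch of the provider-selection and accounting flow now inside request().
// Types are simplified stand-ins; the real code works with Arc<Web3Rpc> and
// Arc<Web3Provider>, but the shape is the same.

use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use tokio::sync::RwLock;
use tokio::time::{sleep, Duration};

struct Provider; // stand-in for Web3Provider

struct Rpc {
    provider: RwLock<Option<Arc<Provider>>>,
    total_requests: AtomicUsize,
    active_requests: AtomicUsize,
}

impl Rpc {
    async fn request(&self, unlocked_provider: Option<Arc<Provider>>) {
        // prefer the provider the caller already cloned out of the lock
        let mut provider = match unlocked_provider {
            Some(p) => Some(p),
            None => self.provider.read().await.clone(),
        };

        // a reconnect may be in progress; wait for it instead of erroring
        while provider.is_none() {
            sleep(Duration::from_millis(100)).await;
            provider = self.provider.read().await.clone();
        }
        let _provider = provider.expect("provider was checked already");

        // the counters are advisory, so Relaxed ordering is enough
        self.total_requests.fetch_add(1, Ordering::Relaxed);
        self.active_requests.fetch_add(1, Ordering::Relaxed);

        // ... send the JSON-RPC call through `_provider` here ...

        // decrement after the call instead of in a Drop impl
        self.active_requests.fetch_sub(1, Ordering::Relaxed);
    }
}

#[tokio::main]
async fn main() {
    let rpc = Arc::new(Rpc {
        provider: RwLock::new(Some(Arc::new(Provider))),
        total_requests: AtomicUsize::new(0),
        active_requests: AtomicUsize::new(0),
    });

    rpc.request(None).await;
    assert_eq!(rpc.total_requests.load(Ordering::Relaxed), 1);
    assert_eq!(rpc.active_requests.load(Ordering::Relaxed), 0);
}
```

Keeping the increment and the decrement on the same code path inside `request()` is what makes removing the `Drop` impl safe: a handle that is dropped without ever sending a request no longer touches `active_requests` at all.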