web3-proxy/web3_proxy/src/rpcs/connections.rs

1033 lines
39 KiB
Rust
Raw Normal View History

2022-05-12 02:50:52 +03:00
///! Load balanced communication with a group of web3 providers
use super::blockchain::{ArcBlock, BlockHashesCache};
2022-08-24 03:11:49 +03:00
use super::connection::Web3Connection;
use super::request::{
OpenRequestHandle, OpenRequestHandleMetrics, OpenRequestResult, RequestErrorHandler,
};
2022-08-24 03:11:49 +03:00
use super::synced_connections::SyncedConnections;
2022-08-24 03:59:05 +03:00
use crate::app::{flatten_handle, AnyhowJoinHandle};
2022-08-30 23:01:42 +03:00
use crate::config::{BlockAndRpc, TxHashAndRpc, Web3ConnectionConfig};
use crate::frontend::authorization::{Authorization, RequestMetadata};
use crate::jsonrpc::{JsonRpcForwardedResponse, JsonRpcRequest};
2022-08-24 03:59:05 +03:00
use crate::rpcs::transactions::TxStatus;
2022-11-07 00:05:03 +03:00
use anyhow::Context;
2022-05-18 23:18:01 +03:00
use arc_swap::ArcSwap;
2022-05-28 07:26:24 +03:00
use counter::Counter;
2022-05-05 22:07:09 +03:00
use derive_more::From;
use ethers::prelude::{ProviderError, TxHash, H256, U64};
use futures::future::{join_all, try_join_all};
2022-05-05 22:07:09 +03:00
use futures::stream::FuturesUnordered;
use futures::StreamExt;
2022-05-20 04:26:02 +03:00
use hashbrown::HashMap;
2022-11-12 11:24:32 +03:00
use log::{error, info, warn, Level};
2022-11-14 21:24:52 +03:00
use migration::sea_orm::DatabaseConnection;
2022-09-22 02:50:55 +03:00
use moka::future::{Cache, ConcurrentCacheExt};
2022-08-26 20:26:17 +03:00
use petgraph::graphmap::DiGraphMap;
2022-05-21 01:16:15 +03:00
use serde::ser::{SerializeStruct, Serializer};
use serde::Serialize;
use serde_json::json;
2022-05-05 22:07:09 +03:00
use serde_json::value::RawValue;
use std::fmt;
use std::sync::atomic::Ordering;
2022-05-05 22:07:09 +03:00
use std::sync::Arc;
2022-11-12 09:11:58 +03:00
use thread_fast_rng::rand::seq::SliceRandom;
2022-09-05 08:53:58 +03:00
use tokio::sync::RwLock as AsyncRwLock;
2022-06-16 05:53:37 +03:00
use tokio::sync::{broadcast, watch};
2022-05-17 19:23:27 +03:00
use tokio::task;
2022-09-05 08:53:58 +03:00
use tokio::time::{interval, sleep, sleep_until, Duration, Instant, MissedTickBehavior};
2022-05-05 22:07:09 +03:00
/// A collection of web3 connections. Sends requests either the current best server or all servers.
#[derive(From)]
pub struct Web3Connections {
pub(crate) conns: HashMap<String, Arc<Web3Connection>>,
2022-08-27 02:44:25 +03:00
/// any requests will be forwarded to one (or more) of these connections
pub(super) synced_connections: ArcSwap<SyncedConnections>,
pub(super) pending_transactions:
Cache<TxHash, TxStatus, hashbrown::hash_map::DefaultHashBuilder>,
/// TODO: this map is going to grow forever unless we do some sort of pruning. maybe store pruned in redis?
/// all blocks, including orphans
pub(super) block_hashes: BlockHashesCache,
2022-09-01 08:58:55 +03:00
/// blocks on the heaviest chain
pub(super) block_numbers: Cache<U64, H256, hashbrown::hash_map::DefaultHashBuilder>,
/// TODO: this map is going to grow forever unless we do some sort of pruning. maybe store pruned in redis?
2022-08-26 20:26:17 +03:00
/// TODO: what should we use for edges?
2022-09-05 08:53:58 +03:00
pub(super) blockchain_graphmap: AsyncRwLock<DiGraphMap<H256, u32>>,
pub(super) min_head_rpcs: usize,
2022-08-27 06:11:58 +03:00
pub(super) min_sum_soft_limit: u32,
2022-05-05 22:07:09 +03:00
}
impl Web3Connections {
/// Spawn durable connections to multiple Web3 providers.
2022-08-26 20:26:17 +03:00
#[allow(clippy::too_many_arguments)]
pub async fn spawn(
chain_id: u64,
db_conn: Option<DatabaseConnection>,
2022-08-10 08:56:09 +03:00
server_configs: HashMap<String, Web3ConnectionConfig>,
http_client: Option<reqwest::Client>,
2022-09-15 20:57:24 +03:00
redis_pool: Option<redis_rate_limiter::RedisPool>,
block_map: BlockHashesCache,
2022-08-30 23:01:42 +03:00
head_block_sender: Option<watch::Sender<ArcBlock>>,
2022-08-27 06:11:58 +03:00
min_sum_soft_limit: u32,
min_head_rpcs: usize,
2022-08-24 03:59:05 +03:00
pending_tx_sender: Option<broadcast::Sender<TxStatus>>,
pending_transactions: Cache<TxHash, TxStatus, hashbrown::hash_map::DefaultHashBuilder>,
2022-09-09 06:53:16 +03:00
open_request_handle_metrics: Arc<OpenRequestHandleMetrics>,
2022-06-14 08:43:28 +03:00
) -> anyhow::Result<(Arc<Self>, AnyhowJoinHandle<()>)> {
let (pending_tx_id_sender, pending_tx_id_receiver) = flume::unbounded();
let (block_sender, block_receiver) = flume::unbounded::<BlockAndRpc>();
2022-05-05 22:07:09 +03:00
2022-06-29 22:15:05 +03:00
let http_interval_sender = if http_client.is_some() {
let (sender, receiver) = broadcast::channel(1);
drop(receiver);
// TODO: what interval? follow a websocket also? maybe by watching synced connections with a timeout. will need debounce
2022-06-29 22:15:05 +03:00
let mut interval = interval(Duration::from_secs(13));
interval.set_missed_tick_behavior(MissedTickBehavior::Delay);
let sender = Arc::new(sender);
let f = {
let sender = sender.clone();
async move {
loop {
// TODO: every time a head_block arrives (with a small delay for known slow servers), or on the interval.
2022-06-29 22:15:05 +03:00
interval.tick().await;
2022-11-12 11:24:32 +03:00
// // trace!("http interval ready");
2022-07-16 08:21:08 +03:00
2022-06-29 22:15:05 +03:00
// errors are okay. they mean that all receivers have been dropped
let _ = sender.send(());
}
}
};
// TODO: do something with this handle?
tokio::spawn(f);
Some(sender)
} else {
None
};
// turn configs into connections (in parallel)
2022-08-12 22:07:14 +03:00
// TODO: move this into a helper function. then we can use it when configs change (will need a remove function too)
let spawn_handles: Vec<_> = server_configs
.into_iter()
2022-10-19 02:27:33 +03:00
.filter_map(|(server_name, server_config)| {
if server_config.disabled {
return None;
}
let db_conn = db_conn.clone();
let http_client = http_client.clone();
let redis_pool = redis_pool.clone();
let http_interval_sender = http_interval_sender.clone();
let block_sender = if head_block_sender.is_some() {
Some(block_sender.clone())
} else {
None
};
let pending_tx_id_sender = Some(pending_tx_id_sender.clone());
2022-08-26 20:26:17 +03:00
let block_map = block_map.clone();
2022-09-09 06:53:16 +03:00
let open_request_handle_metrics = open_request_handle_metrics.clone();
2022-10-19 02:27:33 +03:00
let handle = tokio::spawn(async move {
server_config
.spawn(
2022-08-10 08:56:09 +03:00
server_name,
db_conn,
redis_pool,
chain_id,
http_client,
http_interval_sender,
2022-08-26 20:26:17 +03:00
block_map,
block_sender,
pending_tx_id_sender,
2022-09-09 06:53:16 +03:00
open_request_handle_metrics,
)
.await
2022-10-19 02:27:33 +03:00
});
Some(handle)
})
.collect();
// map of connection names to their connection
2022-08-26 20:26:17 +03:00
let mut connections = HashMap::new();
let mut handles = vec![];
// TODO: futures unordered?
for x in join_all(spawn_handles).await {
// TODO: how should we handle errors here? one rpc being down shouldn't cause the program to exit
match x {
Ok(Ok((connection, handle))) => {
connections.insert(connection.name.clone(), connection);
handles.push(handle);
}
Ok(Err(err)) => {
2022-09-14 06:00:35 +03:00
// if we got an error here, it is not retryable
// TODO: include context about which connection failed
2022-11-12 11:24:32 +03:00
error!("Unable to create connection. err={:?}", err);
}
Err(err) => {
return Err(err.into());
2022-06-14 08:43:28 +03:00
}
2022-05-06 00:38:15 +03:00
}
2022-05-05 22:07:09 +03:00
}
2022-10-19 02:27:33 +03:00
// TODO: now this errors for private rpcs when we disable all!
if connections.len() < min_head_rpcs {
return Err(anyhow::anyhow!(
"Only {}/{} connections! Add more connections or reduce min_head_rpcs.",
connections.len(),
min_head_rpcs
));
2022-05-16 01:02:14 +03:00
}
2022-05-15 22:28:22 +03:00
// TODO: safety check on sum soft limit
2022-05-20 05:01:02 +03:00
let synced_connections = SyncedConnections::default();
2022-05-16 01:02:14 +03:00
2022-09-17 05:30:06 +03:00
// TODO: max_capacity and time_to_idle from config
// all block hashes are the same size, so no need for weigher
let block_hashes = Cache::builder()
.time_to_idle(Duration::from_secs(600))
.max_capacity(10_000)
2022-11-11 21:40:52 +03:00
.build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default());
2022-09-17 05:30:06 +03:00
// all block numbers are the same size, so no need for weigher
let block_numbers = Cache::builder()
.time_to_idle(Duration::from_secs(600))
.max_capacity(10_000)
2022-11-11 21:40:52 +03:00
.build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default());
2022-09-05 08:53:58 +03:00
2022-05-16 01:02:14 +03:00
let connections = Arc::new(Self {
conns: connections,
2022-05-18 23:18:01 +03:00
synced_connections: ArcSwap::new(Arc::new(synced_connections)),
2022-06-16 20:51:49 +03:00
pending_transactions,
2022-09-05 08:53:58 +03:00
block_hashes,
block_numbers,
2022-08-27 02:44:25 +03:00
blockchain_graphmap: Default::default(),
2022-08-27 06:11:58 +03:00
min_sum_soft_limit,
min_head_rpcs,
2022-05-16 01:02:14 +03:00
});
let authorization = Arc::new(Authorization::internal(db_conn.clone())?);
let handle = {
let connections = connections.clone();
tokio::spawn(async move {
2022-06-16 20:51:49 +03:00
// TODO: try_join_all with the other handles here
connections
2022-06-14 08:43:28 +03:00
.subscribe(
authorization,
2022-06-14 08:43:28 +03:00
pending_tx_id_receiver,
block_receiver,
head_block_sender,
pending_tx_sender,
)
.await
})
};
2022-06-14 08:43:28 +03:00
Ok((connections, handle))
2022-05-18 19:35:06 +03:00
}
2022-08-26 20:26:17 +03:00
pub fn get(&self, conn_name: &str) -> Option<&Arc<Web3Connection>> {
self.conns.get(conn_name)
2022-08-26 20:26:17 +03:00
}
2022-07-22 22:30:39 +03:00
/// subscribe to blocks and transactions from all the backend rpcs.
/// blocks are processed by all the `Web3Connection`s and then sent to the `block_receiver`
/// transaction ids from all the `Web3Connection`s are deduplicated and forwarded to `pending_tx_sender`
async fn subscribe(
self: Arc<Self>,
authorization: Arc<Authorization>,
2022-08-30 23:01:42 +03:00
pending_tx_id_receiver: flume::Receiver<TxHashAndRpc>,
2022-08-26 20:26:17 +03:00
block_receiver: flume::Receiver<BlockAndRpc>,
2022-08-30 23:01:42 +03:00
head_block_sender: Option<watch::Sender<ArcBlock>>,
2022-08-24 03:59:05 +03:00
pending_tx_sender: Option<broadcast::Sender<TxStatus>>,
) -> anyhow::Result<()> {
2022-06-16 05:53:37 +03:00
let mut futures = vec![];
2022-05-18 19:35:06 +03:00
2022-06-14 08:43:28 +03:00
// setup the transaction funnel
// it skips any duplicates (unless they are being orphaned)
// fetches new transactions from the notifying rpc
// forwards new transacitons to pending_tx_receipt_sender
if let Some(pending_tx_sender) = pending_tx_sender.clone() {
2022-06-14 09:42:52 +03:00
let clone = self.clone();
let authorization = authorization.clone();
let handle = task::spawn(async move {
2022-08-26 20:26:17 +03:00
// TODO: set up this future the same as the block funnel
2022-06-16 05:53:37 +03:00
while let Ok((pending_tx_id, rpc)) = pending_tx_id_receiver.recv_async().await {
2022-08-26 20:26:17 +03:00
let f = clone.clone().process_incoming_tx_id(
authorization.clone(),
2022-06-16 05:53:37 +03:00
rpc,
pending_tx_id,
pending_tx_sender.clone(),
);
tokio::spawn(f);
}
Ok(())
});
2022-06-16 05:53:37 +03:00
futures.push(flatten_handle(handle));
}
2022-06-14 08:43:28 +03:00
// setup the block funnel
if let Some(head_block_sender) = head_block_sender {
let connections = Arc::clone(&self);
let pending_tx_sender = pending_tx_sender.clone();
2022-05-18 19:35:06 +03:00
let handle = task::Builder::default()
2022-08-26 20:26:17 +03:00
.name("process_incoming_blocks")
2022-05-17 19:23:27 +03:00
.spawn(async move {
connections
2022-08-26 20:26:17 +03:00
.process_incoming_blocks(
&authorization,
2022-08-26 20:26:17 +03:00
block_receiver,
head_block_sender,
pending_tx_sender,
)
2022-05-17 19:23:27 +03:00
.await
2022-09-02 23:16:20 +03:00
})?;
2022-05-18 19:35:06 +03:00
2022-06-16 05:53:37 +03:00
futures.push(flatten_handle(handle));
2022-05-16 01:02:14 +03:00
}
2022-06-14 08:43:28 +03:00
if futures.is_empty() {
// no transaction or block subscriptions.
2022-08-11 00:29:50 +03:00
let handle = task::Builder::default().name("noop").spawn(async move {
loop {
sleep(Duration::from_secs(600)).await;
2022-08-26 20:26:17 +03:00
// TODO: "every interval, check that the provider is still connected"
2022-08-11 00:29:50 +03:00
}
2022-09-02 23:16:20 +03:00
})?;
2022-08-11 00:29:50 +03:00
futures.push(flatten_handle(handle));
2022-06-14 08:43:28 +03:00
}
2022-06-16 05:53:37 +03:00
if let Err(e) = try_join_all(futures).await {
2022-06-16 20:51:49 +03:00
error!("subscriptions over: {:?}", self);
2022-06-16 05:53:37 +03:00
return Err(e);
}
2022-05-18 19:35:06 +03:00
2022-06-14 08:43:28 +03:00
info!("subscriptions over: {:?}", self);
Ok(())
2022-05-05 22:07:09 +03:00
}
2022-05-28 07:26:24 +03:00
/// Send the same request to all the handles. Returning the most common success or most common error.
2022-05-12 02:50:52 +03:00
pub async fn try_send_parallel_requests(
2022-05-28 07:26:24 +03:00
&self,
2022-08-24 03:14:49 +03:00
active_request_handles: Vec<OpenRequestHandle>,
2022-05-28 07:26:24 +03:00
method: &str,
params: Option<&serde_json::Value>,
2022-09-23 01:14:24 +03:00
// TODO: remove this box once i figure out how to do the options
2022-05-28 07:26:24 +03:00
) -> Result<Box<RawValue>, ProviderError> {
2022-08-26 20:26:17 +03:00
// TODO: if only 1 active_request_handles, do self.try_send_request?
2022-05-05 22:07:09 +03:00
2022-05-28 07:26:24 +03:00
let responses = active_request_handles
.into_iter()
.map(|active_request_handle| async move {
let result: Result<Box<RawValue>, _> = active_request_handle
2022-11-12 11:24:32 +03:00
.request(method, &json!(&params), Level::Error.into())
.await;
2022-05-28 07:26:24 +03:00
result
})
.collect::<FuturesUnordered<_>>()
.collect::<Vec<Result<Box<RawValue>, ProviderError>>>()
.await;
2022-05-28 21:45:45 +03:00
// TODO: Strings are not great keys, but we can't use RawValue or ProviderError as keys because they don't implement Hash or Eq
2022-05-28 07:26:24 +03:00
let mut count_map: HashMap<String, Result<Box<RawValue>, ProviderError>> = HashMap::new();
let mut counts: Counter<String> = Counter::new();
let mut any_ok = false;
for response in responses {
2022-08-26 20:26:17 +03:00
// TODO: i think we need to do something smarter with provider error. we at least need to wrap it up as JSON
// TODO: emit stats errors?
2022-05-28 07:26:24 +03:00
let s = format!("{:?}", response);
if count_map.get(&s).is_none() {
if response.is_ok() {
any_ok = true;
}
2022-05-17 19:23:27 +03:00
2022-05-28 07:26:24 +03:00
count_map.insert(s.clone(), response);
}
2022-05-05 22:07:09 +03:00
2022-05-28 07:26:24 +03:00
counts.update([s].into_iter());
2022-05-05 22:07:09 +03:00
}
2022-05-28 07:26:24 +03:00
for (most_common, _) in counts.most_common_ordered() {
let most_common = count_map.remove(&most_common).unwrap();
if any_ok && most_common.is_err() {
// errors were more common, but we are going to skip them because we got an okay
continue;
} else {
// return the most common
return most_common;
2022-05-05 22:07:09 +03:00
}
}
2022-05-28 07:26:24 +03:00
// TODO: what should we do if we get here? i don't think we will
panic!("i don't think this is possible")
2022-05-05 22:07:09 +03:00
}
/// get the best available rpc server
pub async fn best_synced_backend_connection(
2022-07-02 04:20:28 +03:00
&self,
authorization: &Arc<Authorization>,
2022-10-12 00:31:34 +03:00
request_metadata: Option<&Arc<RequestMetadata>>,
2022-07-02 04:20:28 +03:00
skip: &[Arc<Web3Connection>],
2022-07-22 22:30:39 +03:00
min_block_needed: Option<&U64>,
2022-08-24 03:14:49 +03:00
) -> anyhow::Result<OpenRequestResult> {
2022-08-07 09:48:57 +03:00
let mut earliest_retry_at = None;
2022-05-05 22:07:09 +03:00
2022-11-07 00:05:03 +03:00
let min_block_needed = if let Some(min_block_needed) = min_block_needed {
*min_block_needed
} else {
// TODO: error or OpenRequestResult::NotSynced? and then we turn that into a 502?
self.head_block_num().context("no servers are synced")?
2022-11-07 00:05:03 +03:00
};
// filter the synced rpcs
2022-11-04 01:16:27 +03:00
// TODO: we are going to be checking "has_block_data" a lot now
let head_rpcs: Vec<Arc<Web3Connection>> = self
.synced_connections
.load()
2022-11-07 00:05:03 +03:00
.conns
.iter()
2022-11-07 00:05:03 +03:00
.filter(|x| !skip.contains(x))
.filter(|x| x.has_block_data(&min_block_needed))
.cloned()
.collect();
2022-05-05 22:07:09 +03:00
if head_rpcs.is_empty() {
2022-08-26 20:26:17 +03:00
// TODO: what should happen here? automatic retry?
// TODO: more detailed error
2022-11-07 00:05:03 +03:00
return Err(anyhow::anyhow!("no servers are synced"));
2022-07-02 04:20:28 +03:00
}
2022-11-12 09:11:58 +03:00
let mut minimum = 0.0;
2022-08-24 03:59:05 +03:00
// we sort on a bunch of values. cache them here so that we don't do this math multiple times.
let weight_map: HashMap<_, f64> = head_rpcs
2022-05-05 22:07:09 +03:00
.iter()
.map(|rpc| {
2022-11-12 09:11:58 +03:00
// TODO: put this on the rpc object instead?
2022-08-10 08:56:09 +03:00
let weight = rpc.weight;
2022-11-12 09:11:58 +03:00
// TODO: are active requests what we want? do we want a counter for requests in the last second + any actives longer than that?
// TODO: get active requests out of redis?
// TODO: do something with hard limit instead?
2022-05-18 23:18:01 +03:00
let active_requests = rpc.active_requests();
2022-08-10 08:56:09 +03:00
let soft_limit = rpc.soft_limit;
2022-05-06 23:44:12 +03:00
2022-11-12 09:11:58 +03:00
// TODO: maybe store weight as the percentile
let available_requests = soft_limit as f64 * weight - active_requests as f64;
2022-05-06 23:44:12 +03:00
2022-11-12 09:11:58 +03:00
if available_requests < 0.0 {
minimum = available_requests.min(minimum);
}
2022-11-12 09:11:58 +03:00
(rpc.clone(), available_requests)
2022-05-05 22:07:09 +03:00
})
.collect();
2022-11-12 09:11:58 +03:00
// we can't have negative numbers. shift up if any are negative
let weight_map: HashMap<_, f64> = if minimum < 0.0 {
weight_map
} else {
weight_map
.into_iter()
.map(|(rpc, weight)| {
// TODO: is simple addition the right way to shift everyone?
let x = weight + minimum;
(rpc, x)
})
.collect()
};
2022-05-06 23:44:12 +03:00
2022-11-12 09:11:58 +03:00
let sorted_rpcs = {
let mut rng = thread_fast_rng::thread_fast_rng();
head_rpcs
.choose_multiple_weighted(&mut rng, head_rpcs.len(), |rpc| {
2022-11-12 09:11:58 +03:00
*weight_map
.get(rpc)
.expect("rpc should always be in the weight map")
})
.unwrap()
.collect::<Vec<_>>()
};
2022-05-05 22:07:09 +03:00
2022-05-19 06:00:54 +03:00
// now that the rpcs are sorted, try to get an active request handle for one of them
2022-11-12 09:11:58 +03:00
for rpc in sorted_rpcs.into_iter() {
2022-05-05 22:07:09 +03:00
// increment our connection counter
match rpc.try_request_handle(authorization).await {
2022-08-24 03:59:05 +03:00
Ok(OpenRequestResult::Handle(handle)) => {
2022-11-12 11:24:32 +03:00
// // trace!("next server on {:?}: {:?}", self, rpc);
2022-08-24 03:59:05 +03:00
return Ok(OpenRequestResult::Handle(handle));
2022-08-07 09:48:57 +03:00
}
2022-08-24 03:14:49 +03:00
Ok(OpenRequestResult::RetryAt(retry_at)) => {
2022-08-07 09:48:57 +03:00
earliest_retry_at = earliest_retry_at.min(Some(retry_at));
}
2022-08-30 23:01:42 +03:00
Ok(OpenRequestResult::RetryNever) => {
2022-08-07 09:48:57 +03:00
// TODO: log a warning?
}
Err(err) => {
// TODO: log a warning?
2022-11-12 11:24:32 +03:00
warn!("No request handle for {}. err={:?}", rpc, err)
2022-05-06 23:44:12 +03:00
}
2022-05-05 22:07:09 +03:00
}
}
2022-08-07 09:48:57 +03:00
match earliest_retry_at {
2022-09-28 20:01:11 +03:00
None => {
2022-10-12 00:31:34 +03:00
if let Some(request_metadata) = request_metadata {
request_metadata.no_servers.fetch_add(1, Ordering::Release);
}
2022-09-28 20:01:11 +03:00
// TODO: error works, but maybe we should just wait a second?
Err(anyhow::anyhow!("no servers synced"))
}
Some(earliest_retry_at) => {
warn!("no servers on {:?}! {:?}", self, earliest_retry_at);
2022-10-12 00:31:34 +03:00
if let Some(request_metadata) = request_metadata {
request_metadata.no_servers.fetch_add(1, Ordering::Release);
}
2022-09-28 20:01:11 +03:00
Ok(OpenRequestResult::RetryAt(earliest_retry_at))
}
2022-08-07 09:48:57 +03:00
}
2022-05-05 22:07:09 +03:00
}
/// get all rpc servers that are not rate limited
2022-05-13 23:50:11 +03:00
/// returns servers even if they aren't in sync. This is useful for broadcasting signed transactions
2022-08-07 09:48:57 +03:00
// TODO: better type on this that can return an anyhow::Result
pub async fn all_backend_connections(
2022-07-09 06:34:39 +03:00
&self,
authorization: &Arc<Authorization>,
2022-09-07 06:54:16 +03:00
block_needed: Option<&U64>,
2022-08-24 03:14:49 +03:00
) -> Result<Vec<OpenRequestHandle>, Option<Instant>> {
2022-08-07 09:48:57 +03:00
let mut earliest_retry_at = None;
2022-05-05 22:07:09 +03:00
// TODO: with capacity?
let mut selected_rpcs = vec![];
for connection in self.conns.values() {
2022-09-07 06:54:16 +03:00
if let Some(block_needed) = block_needed {
if !connection.has_block_data(block_needed) {
2022-07-19 04:31:12 +03:00
continue;
}
2022-07-09 07:25:59 +03:00
}
2022-05-05 22:07:09 +03:00
// check rate limits and increment our connection counter
match connection.try_request_handle(authorization).await {
2022-08-24 03:14:49 +03:00
Ok(OpenRequestResult::RetryAt(retry_at)) => {
// this rpc is not available. skip it
2022-08-07 09:48:57 +03:00
earliest_retry_at = earliest_retry_at.min(Some(retry_at));
}
2022-08-24 03:59:05 +03:00
Ok(OpenRequestResult::Handle(handle)) => selected_rpcs.push(handle),
2022-08-30 23:01:42 +03:00
Ok(OpenRequestResult::RetryNever) => {
2022-08-07 09:48:57 +03:00
warn!("no request handle for {}", connection)
}
Err(err) => {
2022-11-12 11:24:32 +03:00
warn!(
"error getting request handle for {}. err={:?}",
connection, err
)
}
2022-05-05 22:07:09 +03:00
}
}
if !selected_rpcs.is_empty() {
return Ok(selected_rpcs);
}
2022-05-22 02:34:05 +03:00
// return the earliest retry_after (if no rpcs are synced, this will be None)
2022-08-07 09:48:57 +03:00
Err(earliest_retry_at)
2022-05-05 22:07:09 +03:00
}
2022-05-28 07:26:24 +03:00
2022-05-29 04:23:58 +03:00
/// be sure there is a timeout on this or it might loop forever
pub async fn try_send_best_upstream_server(
&self,
authorization: &Arc<Authorization>,
2022-05-29 04:23:58 +03:00
request: JsonRpcRequest,
request_metadata: Option<&Arc<RequestMetadata>>,
2022-07-22 22:30:39 +03:00
min_block_needed: Option<&U64>,
2022-05-29 04:23:58 +03:00
) -> anyhow::Result<JsonRpcForwardedResponse> {
2022-07-02 04:20:28 +03:00
let mut skip_rpcs = vec![];
// TODO: maximum retries? right now its the total number of servers
2022-05-29 04:23:58 +03:00
loop {
if skip_rpcs.len() == self.conns.len() {
2022-11-04 01:16:27 +03:00
// no servers to try
2022-07-02 04:20:28 +03:00
break;
}
2022-07-19 04:31:12 +03:00
match self
.best_synced_backend_connection(
authorization,
2022-10-12 00:31:34 +03:00
request_metadata,
&skip_rpcs,
min_block_needed,
)
2022-09-10 03:58:33 +03:00
.await?
2022-07-19 04:31:12 +03:00
{
2022-09-10 03:58:33 +03:00
OpenRequestResult::Handle(active_request_handle) => {
2022-07-02 04:20:28 +03:00
// save the rpc in case we get an error and want to retry on another server
skip_rpcs.push(active_request_handle.clone_connection());
if let Some(request_metadata) = request_metadata {
request_metadata
.backend_requests
.fetch_add(1, Ordering::Acquire);
}
// TODO: get the log percent from the user data
2022-05-29 04:23:58 +03:00
let response_result = active_request_handle
.request(
&request.method,
&json!(request.params),
2022-09-24 10:04:11 +03:00
RequestErrorHandler::SaveReverts,
)
2022-05-29 04:23:58 +03:00
.await;
2022-07-02 04:20:28 +03:00
match JsonRpcForwardedResponse::try_from_response_result(
2022-06-04 00:45:44 +03:00
response_result,
request.id.clone(),
) {
Ok(response) => {
2022-07-02 04:20:28 +03:00
if let Some(error) = &response.error {
2022-11-12 11:24:32 +03:00
// // trace!(?response, "rpc error");
2022-07-02 04:20:28 +03:00
if let Some(request_metadata) = request_metadata {
request_metadata
.error_response
.store(true, Ordering::Release);
}
2022-07-09 07:25:59 +03:00
// some errors should be retried on other nodes
if error.code == -32000 {
let error_msg = error.message.as_str();
// TODO: regex?
let retry_prefixes = [
2022-07-02 04:20:28 +03:00
"header not found",
"header for hash not found",
2022-07-09 07:25:59 +03:00
"missing trie node",
2022-07-02 04:20:28 +03:00
"node not started",
"RPC timeout",
2022-07-09 07:25:59 +03:00
];
for retry_prefix in retry_prefixes {
if error_msg.starts_with(retry_prefix) {
continue;
}
}
2022-07-02 04:20:28 +03:00
}
2022-06-04 00:45:44 +03:00
} else {
2022-11-12 11:24:32 +03:00
// // trace!(?response, "rpc success");
2022-06-04 00:45:44 +03:00
}
2022-05-29 04:23:58 +03:00
2022-06-04 00:45:44 +03:00
return Ok(response);
}
2022-11-12 11:24:32 +03:00
Err(err) => {
2022-09-10 03:58:33 +03:00
let rpc = skip_rpcs
.last()
.expect("there must have been a provider if we got an error");
2022-11-12 11:24:32 +03:00
warn!(
"Backend server error on {}! Retrying on another. err={:?}",
rpc, err
);
2022-05-29 04:23:58 +03:00
2022-06-04 00:45:44 +03:00
// TODO: sleep how long? until synced_connections changes or rate limits are available
2022-09-10 03:58:33 +03:00
// sleep(Duration::from_millis(100)).await;
2022-06-04 00:45:44 +03:00
continue;
}
}
2022-05-29 04:23:58 +03:00
}
2022-09-10 03:58:33 +03:00
OpenRequestResult::RetryAt(retry_at) => {
2022-08-07 09:48:57 +03:00
// TODO: move this to a helper function
// sleep (TODO: with a lock?) until our rate limits should be available
// TODO: if a server catches up sync while we are waiting, we could stop waiting
2022-11-12 11:24:32 +03:00
warn!("All rate limits exceeded. Sleeping untill {:?}", retry_at);
2022-05-29 04:23:58 +03:00
2022-10-12 00:31:34 +03:00
// TODO: have a separate column for rate limited?
if let Some(request_metadata) = request_metadata {
request_metadata.no_servers.fetch_add(1, Ordering::Release);
}
2022-08-07 09:48:57 +03:00
sleep_until(retry_at).await;
2022-06-03 00:47:43 +03:00
continue;
2022-05-29 04:23:58 +03:00
}
2022-09-10 03:58:33 +03:00
OpenRequestResult::RetryNever => {
2022-11-12 11:24:32 +03:00
warn!("No server handles! {:?}", self);
2022-05-29 04:23:58 +03:00
2022-10-12 00:31:34 +03:00
if let Some(request_metadata) = request_metadata {
request_metadata.no_servers.fetch_add(1, Ordering::Release);
}
2022-08-07 09:48:57 +03:00
// TODO: subscribe to something on synced connections. maybe it should just be a watch channel
sleep(Duration::from_millis(200)).await;
2022-05-29 04:23:58 +03:00
continue;
}
}
}
2022-07-02 04:20:28 +03:00
2022-10-12 00:31:34 +03:00
// TODO: do we need this here, or do we do it somewhere else?
if let Some(request_metadata) = request_metadata {
request_metadata
.error_response
.store(true, Ordering::Release);
}
2022-11-04 01:16:27 +03:00
// TODO: what error code? 502?
2022-09-10 03:58:33 +03:00
Err(anyhow::anyhow!("all {} tries exhausted", skip_rpcs.len()))
2022-05-29 04:23:58 +03:00
}
2022-07-02 04:20:28 +03:00
/// be sure there is a timeout on this or it might loop forever
2022-11-12 11:24:32 +03:00
2022-05-28 07:26:24 +03:00
pub async fn try_send_all_upstream_servers(
&self,
authorization: &Arc<Authorization>,
2022-05-28 07:26:24 +03:00
request: JsonRpcRequest,
request_metadata: Option<Arc<RequestMetadata>>,
2022-09-07 06:54:16 +03:00
block_needed: Option<&U64>,
2022-05-28 07:26:24 +03:00
) -> anyhow::Result<JsonRpcForwardedResponse> {
loop {
match self
.all_backend_connections(authorization, block_needed)
.await
{
2022-05-28 07:26:24 +03:00
Ok(active_request_handles) => {
// TODO: benchmark this compared to waiting on unbounded futures
// TODO: do something with this handle?
// TODO: this is not working right. simplify
2022-10-12 00:31:34 +03:00
if let Some(request_metadata) = request_metadata {
request_metadata
.backend_requests
2022-10-25 06:41:59 +03:00
.fetch_add(active_request_handles.len() as u64, Ordering::Release);
2022-10-12 00:31:34 +03:00
}
2022-05-28 07:26:24 +03:00
let quorum_response = self
.try_send_parallel_requests(
active_request_handles,
request.method.as_ref(),
request.params.as_ref(),
2022-05-28 07:26:24 +03:00
)
.await?;
let response = JsonRpcForwardedResponse {
jsonrpc: "2.0".to_string(),
id: request.id,
result: Some(quorum_response),
error: None,
};
return Ok(response);
}
Err(None) => {
2022-11-12 11:24:32 +03:00
warn!("No servers in sync on {:?}! Retrying", self);
2022-05-29 04:23:58 +03:00
2022-10-12 00:31:34 +03:00
if let Some(request_metadata) = &request_metadata {
request_metadata.no_servers.fetch_add(1, Ordering::Release);
}
2022-05-28 07:26:24 +03:00
// TODO: i don't think this will ever happen
2022-06-03 00:47:43 +03:00
// TODO: return a 502? if it does?
// return Err(anyhow::anyhow!("no available rpcs!"));
// TODO: sleep how long?
2022-07-16 03:08:22 +03:00
// TODO: subscribe to something in SyncedConnections instead
2022-06-03 00:47:43 +03:00
sleep(Duration::from_millis(200)).await;
continue;
2022-05-28 07:26:24 +03:00
}
2022-08-07 09:48:57 +03:00
Err(Some(retry_at)) => {
2022-05-28 07:26:24 +03:00
// TODO: move this to a helper function
// sleep (TODO: with a lock?) until our rate limits should be available
// TODO: if a server catches up sync while we are waiting, we could stop waiting
2022-06-03 00:47:43 +03:00
warn!("All rate limits exceeded. Sleeping");
2022-10-12 00:31:34 +03:00
if let Some(request_metadata) = &request_metadata {
request_metadata.no_servers.fetch_add(1, Ordering::Release);
}
2022-08-07 09:48:57 +03:00
sleep_until(retry_at).await;
2022-05-28 07:26:24 +03:00
2022-06-03 00:47:43 +03:00
continue;
2022-05-28 07:26:24 +03:00
}
}
}
}
2022-05-05 22:07:09 +03:00
}
2022-07-16 07:13:02 +03:00
impl fmt::Debug for Web3Connections {
2022-08-24 02:13:56 +03:00
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// TODO: the default formatter takes forever to write. this is too quiet though
f.debug_struct("Web3Connections")
.field("conns", &self.conns)
2022-08-24 02:13:56 +03:00
.finish_non_exhaustive()
}
}
impl Serialize for Web3Connections {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let mut state = serializer.serialize_struct("Web3Connections", 6)?;
2022-09-22 02:50:55 +03:00
let conns: Vec<&Web3Connection> = self.conns.values().map(|x| x.as_ref()).collect();
state.serialize_field("conns", &conns)?;
2022-09-22 02:50:55 +03:00
let synced_connections = &**self.synced_connections.load();
state.serialize_field("synced_connections", synced_connections)?;
self.block_hashes.sync();
self.block_numbers.sync();
state.serialize_field("block_hashes_count", &self.block_hashes.entry_count())?;
state.serialize_field("block_hashes_size", &self.block_hashes.weighted_size())?;
state.serialize_field("block_numbers_count", &self.block_numbers.entry_count())?;
state.serialize_field("block_numbers_size", &self.block_numbers.weighted_size())?;
state.end()
}
}
mod tests {
use super::*;
use crate::rpcs::{blockchain::BlockId, provider::Web3Provider};
use ethers::types::Block;
use log::LevelFilter;
use parking_lot::RwLock;
#[tokio::test]
async fn test_server_selection() {
// TODO: do this better. can test_env_logger and tokio test be stacked?
let _ = env_logger::builder()
.filter_level(LevelFilter::Error)
.filter_module("web3_proxy", LevelFilter::Trace)
.is_test(true)
.try_init();
let lagged_block = Block {
hash: Some(H256::random()),
number: Some(0.into()),
..Default::default()
};
let head_block = Block {
hash: Some(H256::random()),
number: Some(1.into()),
parent_hash: lagged_block.hash.unwrap(),
..Default::default()
};
// TODO: write a impl From for Block -> BlockId?
let lagged_block_id = BlockId {
hash: lagged_block.hash.unwrap(),
num: lagged_block.number.unwrap(),
};
let head_block_id = BlockId {
hash: head_block.hash.unwrap(),
num: head_block.number.unwrap(),
};
let lagged_block = Arc::new(lagged_block);
let head_block = Arc::new(head_block);
let block_data_limit = u64::MAX;
let head_rpc = Web3Connection {
name: "synced".to_string(),
display_name: None,
url: "ws://example.com/synced".to_string(),
http_client: None,
active_requests: 0.into(),
frontend_requests: 0.into(),
internal_requests: 0.into(),
provider: AsyncRwLock::new(Some(Arc::new(Web3Provider::Mock))),
hard_limit: None,
soft_limit: 1_000,
block_data_limit: block_data_limit.into(),
weight: 100.0,
head_block_id: RwLock::new(Some(head_block_id)),
open_request_handle_metrics: Arc::new(Default::default()),
};
let lagged_rpc = Web3Connection {
name: "lagged".to_string(),
display_name: None,
url: "ws://example.com/lagged".to_string(),
http_client: None,
active_requests: 0.into(),
frontend_requests: 0.into(),
internal_requests: 0.into(),
provider: AsyncRwLock::new(Some(Arc::new(Web3Provider::Mock))),
hard_limit: None,
soft_limit: 1_000,
block_data_limit: block_data_limit.into(),
weight: 100.0,
head_block_id: RwLock::new(Some(lagged_block_id)),
open_request_handle_metrics: Arc::new(Default::default()),
};
assert!(head_rpc.has_block_data(&lagged_block.number.unwrap()));
assert!(head_rpc.has_block_data(&head_block.number.unwrap()));
assert!(lagged_rpc.has_block_data(&lagged_block.number.unwrap()));
assert!(!lagged_rpc.has_block_data(&head_block.number.unwrap()));
let head_rpc = Arc::new(head_rpc);
let lagged_rpc = Arc::new(lagged_rpc);
let conns = HashMap::from([
(head_rpc.name.clone(), head_rpc.clone()),
(lagged_rpc.name.clone(), lagged_rpc.clone()),
]);
let conns = Web3Connections {
conns,
synced_connections: Default::default(),
pending_transactions: Cache::builder()
.max_capacity(10_000)
.build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default()),
block_hashes: Cache::builder()
.max_capacity(10_000)
.build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default()),
block_numbers: Cache::builder()
.max_capacity(10_000)
.build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default()),
blockchain_graphmap: Default::default(),
min_head_rpcs: 1,
min_sum_soft_limit: 1,
};
let authorization = Arc::new(Authorization::internal(None).unwrap());
let (head_block_sender, _head_block_receiver) =
watch::channel::<ArcBlock>(Default::default());
let mut connection_heads = HashMap::new();
// process None so that
conns
.process_block_from_rpc(
&authorization,
&mut connection_heads,
None,
lagged_rpc.clone(),
&head_block_sender,
&None,
)
.await
.unwrap();
conns
.process_block_from_rpc(
&authorization,
&mut connection_heads,
None,
head_rpc.clone(),
&head_block_sender,
&None,
)
.await
.unwrap();
// no head block because the rpcs haven't communicated through their channels
assert!(conns.head_block_hash().is_none());
// all_backend_connections gives everything regardless of sync status
assert_eq!(
conns
.all_backend_connections(&authorization, None)
.await
.unwrap()
.len(),
2
);
// best_synced_backend_connection requires servers to be synced with the head block
// TODO: should this be an error, or a OpenRequestResult::NotSynced?
assert!(conns
.best_synced_backend_connection(&authorization, None, &[], lagged_block.number.as_ref())
.await
.is_err());
// add lagged blocks to the conns. both servers should be allowed
conns.save_block(&lagged_block, true).await.unwrap();
conns
.process_block_from_rpc(
&authorization,
&mut connection_heads,
Some(lagged_block.clone()),
lagged_rpc,
&head_block_sender,
&None,
)
.await
.unwrap();
conns
.process_block_from_rpc(
&authorization,
&mut connection_heads,
Some(lagged_block.clone()),
head_rpc.clone(),
&head_block_sender,
&None,
)
.await
.unwrap();
assert_eq!(conns.num_synced_rpcs(), 2);
// add head block to the conns. lagged_rpc should not be available
conns.save_block(&head_block, true).await.unwrap();
conns
.process_block_from_rpc(
&authorization,
&mut connection_heads,
Some(head_block.clone()),
head_rpc,
&head_block_sender,
&None,
)
.await
.unwrap();
assert_eq!(conns.num_synced_rpcs(), 1);
// TODO: is_ok is too simple. make sure its head_rpc
assert!(conns
.best_synced_backend_connection(&authorization, None, &[], None)
.await
.is_ok());
assert!(conns
.best_synced_backend_connection(&authorization, None, &[], Some(&0.into()))
.await
.is_ok());
assert!(conns
.best_synced_backend_connection(&authorization, None, &[], Some(&1.into()))
.await
.is_ok());
assert!(conns
.best_synced_backend_connection(&authorization, None, &[], Some(&2.into()))
.await
.is_err());
}
}