web3-proxy/web3-proxy/src/connections.rs

482 lines
18 KiB
Rust
Raw Normal View History

2022-05-12 02:50:52 +03:00
//! Load balanced communication with a group of web3 providers
2022-05-18 23:18:01 +03:00
use arc_swap::ArcSwap;
2022-05-28 07:26:24 +03:00
use counter::Counter;
2022-05-05 22:07:09 +03:00
use derive_more::From;
2022-05-28 07:26:24 +03:00
use ethers::prelude::{ProviderError, H256};
2022-05-18 19:35:06 +03:00
use futures::future::join_all;
2022-05-05 22:07:09 +03:00
use futures::stream::FuturesUnordered;
use futures::StreamExt;
2022-05-20 04:26:02 +03:00
use hashbrown::HashMap;
2022-05-21 01:16:15 +03:00
use serde::ser::{SerializeStruct, Serializer};
use serde::Serialize;
2022-05-05 22:07:09 +03:00
use serde_json::value::RawValue;
use std::cmp;
2022-05-20 04:26:02 +03:00
use std::collections::{BTreeMap, BTreeSet};
2022-05-05 22:07:09 +03:00
use std::fmt;
use std::sync::Arc;
2022-05-22 02:34:05 +03:00
use std::time::Duration;
2022-05-17 19:23:27 +03:00
use tokio::task;
2022-05-28 07:26:24 +03:00
use tokio::time::sleep;
2022-05-18 19:35:06 +03:00
use tracing::Instrument;
use tracing::{debug, info, info_span, instrument, trace, warn};
2022-05-05 22:07:09 +03:00
use crate::config::Web3ConnectionConfig;
2022-05-06 08:44:30 +03:00
use crate::connection::{ActiveRequestHandle, Web3Connection};
2022-05-28 07:26:24 +03:00
use crate::jsonrpc::{JsonRpcForwardedResponse, JsonRpcRequest};
2022-05-05 22:07:09 +03:00
2022-05-21 01:16:15 +03:00
// Serialize so we can print it on our debug endpoint
#[derive(Clone, Default, Serialize)]
2022-05-18 23:18:01 +03:00
struct SyncedConnections {
head_block_num: u64,
head_block_hash: H256,
2022-05-20 04:26:02 +03:00
inner: BTreeSet<usize>,
2022-05-05 22:07:09 +03:00
}
impl fmt::Debug for SyncedConnections {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// TODO: the default formatter takes forever to write. this is too quiet though
f.debug_struct("SyncedConnections").finish_non_exhaustive()
}
}
impl SyncedConnections {
2022-05-18 23:18:01 +03:00
pub fn get_head_block_hash(&self) -> &H256 {
&self.head_block_hash
2022-05-15 22:28:22 +03:00
}
2022-05-05 22:07:09 +03:00
}
/// A collection of web3 connections. Sends requests either the current best server or all servers.
#[derive(From)]
pub struct Web3Connections {
inner: Vec<Arc<Web3Connection>>,
2022-05-18 23:18:01 +03:00
synced_connections: ArcSwap<SyncedConnections>,
2022-05-05 22:07:09 +03:00
}
2022-05-21 01:16:15 +03:00
impl Serialize for Web3Connections {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let inner: Vec<&Web3Connection> = self.inner.iter().map(|x| x.as_ref()).collect();
// 3 is the number of fields in the struct.
let mut state = serializer.serialize_struct("Web3Connections", 2)?;
state.serialize_field("rpcs", &inner)?;
state.serialize_field("synced_connections", &**self.synced_connections.load())?;
state.end()
}
}
2022-05-05 22:07:09 +03:00
impl fmt::Debug for Web3Connections {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// TODO: the default formatter takes forever to write. this is too quiet though
f.debug_struct("Web3Connections")
.field("inner", &self.inner)
.finish_non_exhaustive()
}
}
impl Web3Connections {
2022-05-17 20:15:18 +03:00
// #[instrument(name = "try_new_Web3Connections", skip_all)]
2022-05-05 22:07:09 +03:00
pub async fn try_new(
2022-05-12 21:49:57 +03:00
chain_id: usize,
2022-05-05 22:07:09 +03:00
servers: Vec<Web3ConnectionConfig>,
2022-05-22 02:34:05 +03:00
http_client: Option<&reqwest::Client>,
2022-05-22 21:39:06 +03:00
rate_limiter: Option<&redis_cell_client::MultiplexedConnection>,
2022-05-05 22:07:09 +03:00
) -> anyhow::Result<Arc<Self>> {
let num_connections = servers.len();
2022-05-16 01:02:14 +03:00
// turn configs into connections
let mut connections = Vec::with_capacity(num_connections);
2022-05-05 22:07:09 +03:00
for server_config in servers.into_iter() {
2022-05-12 21:49:57 +03:00
match server_config
2022-05-22 02:34:05 +03:00
.try_build(rate_limiter, chain_id, http_client)
2022-05-12 21:49:57 +03:00
.await
{
2022-05-06 00:38:15 +03:00
Ok(connection) => connections.push(connection),
2022-05-13 00:29:33 +03:00
Err(e) => warn!("Unable to connect to a server! {:?}", e),
2022-05-06 00:38:15 +03:00
}
2022-05-05 22:07:09 +03:00
}
2022-05-22 02:49:23 +03:00
// TODO: less than 3? what should we do here?
2022-05-16 01:02:14 +03:00
if connections.len() < 2 {
2022-05-22 03:34:33 +03:00
warn!("Only {} connection(s)!", connections.len());
2022-05-16 01:02:14 +03:00
}
2022-05-15 22:28:22 +03:00
2022-05-20 05:01:02 +03:00
let synced_connections = SyncedConnections::default();
2022-05-16 01:02:14 +03:00
let connections = Arc::new(Self {
inner: connections,
2022-05-18 23:18:01 +03:00
synced_connections: ArcSwap::new(Arc::new(synced_connections)),
2022-05-16 01:02:14 +03:00
});
2022-05-18 19:35:06 +03:00
Ok(connections)
}
pub async fn subscribe_heads(self: &Arc<Self>) {
// TODO: i don't think this needs to be very big
let (block_sender, block_receiver) = flume::bounded(16);
2022-05-18 19:35:06 +03:00
let mut handles = vec![];
2022-05-18 23:18:01 +03:00
for (rpc_id, connection) in self.inner.iter().enumerate() {
2022-05-18 19:35:06 +03:00
// subscribe to new heads in a spawned future
// TODO: channel instead. then we can have one future with write access to a left-right?
let connection = Arc::clone(connection);
let block_sender = block_sender.clone();
// let url = connection.url().to_string();
let handle = task::Builder::default()
.name("subscribe_new_heads")
2022-05-17 19:23:27 +03:00
.spawn(async move {
2022-05-18 19:35:06 +03:00
// loop to automatically reconnect
// TODO: make this cancellable?
// TODO: instead of passing Some(connections), pass Some(channel_sender). Then listen on the receiver below to keep local heads up-to-date
// TODO: proper spann
connection
2022-05-18 23:18:01 +03:00
.subscribe_new_heads(rpc_id, block_sender.clone(), true)
2022-05-18 19:35:06 +03:00
.instrument(tracing::info_span!("url"))
2022-05-17 19:23:27 +03:00
.await
});
2022-05-18 19:35:06 +03:00
handles.push(handle);
2022-05-16 01:02:14 +03:00
}
2022-05-18 19:35:06 +03:00
let connections = Arc::clone(self);
let handle = task::Builder::default()
.name("update_synced_rpcs")
.spawn(async move { connections.update_synced_rpcs(block_receiver).await });
handles.push(handle);
2022-05-20 05:01:02 +03:00
// TODO: do something with join_all's result
2022-05-18 19:35:06 +03:00
join_all(handles).await;
2022-05-05 22:07:09 +03:00
}
2022-05-18 23:18:01 +03:00
/// Copy out the hash of the head block from the current synced snapshot.
pub fn get_head_block_hash(&self) -> H256 {
    let synced = self.synced_connections.load();
    *synced.get_head_block_hash()
}
2022-05-05 22:07:09 +03:00
2022-05-28 07:26:24 +03:00
/// Send the same request to all the handles. Returning the most common success or most common error.
2022-05-17 03:56:56 +03:00
#[instrument(skip_all)]
2022-05-12 02:50:52 +03:00
pub async fn try_send_parallel_requests(
2022-05-28 07:26:24 +03:00
&self,
2022-05-12 02:50:52 +03:00
active_request_handles: Vec<ActiveRequestHandle>,
2022-05-28 07:26:24 +03:00
method: &str,
// TODO: remove this box once i figure out how to do the options
params: Option<&RawValue>,
) -> Result<Box<RawValue>, ProviderError> {
2022-05-12 06:54:42 +03:00
// TODO: if only 1 active_request_handles, do self.try_send_request
2022-05-05 22:07:09 +03:00
2022-05-28 07:26:24 +03:00
let responses = active_request_handles
.into_iter()
.map(|active_request_handle| async move {
let result: Result<Box<RawValue>, _> =
active_request_handle.request(method, params).await;
result
})
.collect::<FuturesUnordered<_>>()
.collect::<Vec<Result<Box<RawValue>, ProviderError>>>()
.await;
2022-05-28 21:45:45 +03:00
// TODO: Strings are not great keys, but we can't use RawValue or ProviderError as keys because they don't implement Hash or Eq
2022-05-28 07:26:24 +03:00
let mut count_map: HashMap<String, Result<Box<RawValue>, ProviderError>> = HashMap::new();
let mut counts: Counter<String> = Counter::new();
let mut any_ok = false;
for response in responses {
let s = format!("{:?}", response);
if count_map.get(&s).is_none() {
if response.is_ok() {
any_ok = true;
}
2022-05-17 19:23:27 +03:00
2022-05-28 07:26:24 +03:00
count_map.insert(s.clone(), response);
}
2022-05-05 22:07:09 +03:00
2022-05-28 07:26:24 +03:00
counts.update([s].into_iter());
2022-05-05 22:07:09 +03:00
}
2022-05-28 07:26:24 +03:00
for (most_common, _) in counts.most_common_ordered() {
let most_common = count_map.remove(&most_common).unwrap();
if any_ok && most_common.is_err() {
// errors were more common, but we are going to skip them because we got an okay
continue;
} else {
// return the most common
return most_common;
2022-05-05 22:07:09 +03:00
}
}
2022-05-28 07:26:24 +03:00
// TODO: what should we do if we get here? i don't think we will
panic!("i don't think this is possible")
2022-05-05 22:07:09 +03:00
}
2022-05-13 20:43:37 +03:00
/// TODO: possible dead lock here. investigate more. probably refactor
2022-05-18 23:18:01 +03:00
/// TODO: move parts of this onto SyncedConnections?
2022-05-20 05:16:48 +03:00
// we don't instrument here because we put a span inside the while loop
2022-05-16 01:02:14 +03:00
async fn update_synced_rpcs(
&self,
2022-05-18 23:18:01 +03:00
block_receiver: flume::Receiver<(u64, H256, usize)>,
) -> anyhow::Result<()> {
2022-05-18 23:18:01 +03:00
let max_connections = self.inner.len();
2022-05-18 23:28:00 +03:00
let mut connection_states: HashMap<usize, (u64, H256)> =
HashMap::with_capacity(max_connections);
2022-05-18 19:35:06 +03:00
2022-05-20 05:01:02 +03:00
let mut pending_synced_connections = SyncedConnections::default();
2022-05-18 23:18:01 +03:00
while let Ok((new_block_num, new_block_hash, rpc_id)) = block_receiver.recv_async().await {
2022-05-20 00:58:21 +03:00
// TODO: span with more in it?
2022-05-19 06:00:54 +03:00
// TODO: make sure i'm doing this span right
2022-05-20 00:58:21 +03:00
// TODO: show the actual rpc url?
let span = info_span!("block_receiver", rpc_id, new_block_num);
2022-05-18 19:35:06 +03:00
let _enter = span.enter();
2022-05-20 00:58:21 +03:00
if new_block_num == 0 {
warn!("rpc is still syncing");
}
2022-05-18 23:18:01 +03:00
connection_states.insert(rpc_id, (new_block_num, new_block_hash));
// TODO: do something to update the synced blocks
2022-05-18 23:28:00 +03:00
match new_block_num.cmp(&pending_synced_connections.head_block_num) {
2022-05-18 23:18:01 +03:00
cmp::Ordering::Greater => {
// the rpc's newest block is the new overall best block
2022-05-20 05:19:35 +03:00
// TODO: if trace, do the full block hash?
// TODO: only accept this block if it is a child of the current head_block
2022-05-20 05:19:35 +03:00
info!("new head: {}", new_block_hash);
2022-05-18 23:18:01 +03:00
2022-05-18 23:28:00 +03:00
pending_synced_connections.inner.clear();
2022-05-19 06:00:54 +03:00
pending_synced_connections.inner.insert(rpc_id);
2022-05-18 23:18:01 +03:00
2022-05-18 23:28:00 +03:00
pending_synced_connections.head_block_num = new_block_num;
2022-05-19 06:00:54 +03:00
// TODO: if the parent hash isn't our previous best block, ignore it
2022-05-18 23:28:00 +03:00
pending_synced_connections.head_block_hash = new_block_hash;
2022-05-18 23:18:01 +03:00
}
cmp::Ordering::Equal => {
2022-05-19 06:00:54 +03:00
if new_block_hash == pending_synced_connections.head_block_hash {
// this rpc has caught up with the best known head
// do not clear synced_connections.
// we just want to add this rpc to the end
// TODO: HashSet here? i think we get dupes if we don't
pending_synced_connections.inner.insert(rpc_id);
} else {
2022-05-18 23:18:01 +03:00
// same height, but different chain
2022-05-19 06:00:54 +03:00
// check connection_states to see which head block is more popular!
let mut rpc_ids_by_block: BTreeMap<H256, Vec<usize>> = BTreeMap::new();
let mut synced_rpcs = 0;
for (rpc_id, (block_num, block_hash)) in connection_states.iter() {
if *block_num != new_block_num {
// this connection isn't synced. we don't care what hash it has
continue;
}
synced_rpcs += 1;
let count = rpc_ids_by_block
.entry(*block_hash)
.or_insert_with(|| Vec::with_capacity(max_connections - 1));
count.push(*rpc_id);
}
let most_common_head_hash = rpc_ids_by_block
.iter()
.max_by(|a, b| a.1.len().cmp(&b.1.len()))
.map(|(k, _v)| k)
.unwrap();
2022-05-18 23:18:01 +03:00
warn!(
2022-05-19 06:00:54 +03:00
"chain is forked! {} possible heads. {}/{}/{} rpcs have {}",
rpc_ids_by_block.len(),
rpc_ids_by_block.get(most_common_head_hash).unwrap().len(),
synced_rpcs,
max_connections,
most_common_head_hash
2022-05-18 23:18:01 +03:00
);
2022-05-05 22:07:09 +03:00
2022-05-19 06:00:54 +03:00
// this isn't the best block in the tier. don't do anything
if !pending_synced_connections.inner.remove(&rpc_id) {
// we didn't remove anything. nothing more to do
continue;
}
// we removed. don't continue so that we update self.synced_connections
}
2022-05-18 23:18:01 +03:00
}
cmp::Ordering::Less => {
// this isn't the best block in the tier. don't do anything
2022-05-19 06:00:54 +03:00
if !pending_synced_connections.inner.remove(&rpc_id) {
// we didn't remove anything. nothing more to do
continue;
}
// we removed. don't continue so that we update self.synced_connections
2022-05-18 23:18:01 +03:00
}
}
// the synced connections have changed
2022-05-18 23:28:00 +03:00
let synced_connections = Arc::new(pending_synced_connections.clone());
2022-05-18 23:18:01 +03:00
if synced_connections.inner.len() == max_connections {
// TODO: more metrics
debug!("all head: {}", new_block_hash);
}
2022-05-20 05:11:50 +03:00
trace!(
"rpcs at {}: {:?}",
synced_connections.head_block_hash,
synced_connections.inner
);
// TODO: only publish if there are x (default 2) nodes synced to this block?
2022-05-18 23:18:01 +03:00
// do the arcswap
2022-05-18 23:28:00 +03:00
self.synced_connections.swap(synced_connections);
2022-05-06 08:44:30 +03:00
}
2022-05-05 22:07:09 +03:00
2022-05-18 19:35:06 +03:00
// TODO: if there was an error, we should return it
warn!("block_receiver exited!");
2022-05-05 22:07:09 +03:00
Ok(())
}
/// get the best available rpc server
2022-05-17 03:56:56 +03:00
#[instrument(skip_all)]
2022-05-22 02:34:05 +03:00
pub async fn next_upstream_server(&self) -> Result<ActiveRequestHandle, Option<Duration>> {
let mut earliest_retry_after = None;
2022-05-05 22:07:09 +03:00
2022-05-19 06:00:54 +03:00
let mut synced_rpc_ids: Vec<usize> = self
.synced_connections
.load()
.inner
.iter()
.cloned()
.collect();
2022-05-05 22:07:09 +03:00
2022-05-19 06:00:54 +03:00
let sort_cache: HashMap<usize, (f32, u32)> = synced_rpc_ids
2022-05-05 22:07:09 +03:00
.iter()
2022-05-18 23:18:01 +03:00
.map(|rpc_id| {
let rpc = self.inner.get(*rpc_id).unwrap();
let active_requests = rpc.active_requests();
let soft_limit = rpc.soft_limit();
2022-05-06 23:44:12 +03:00
let utilization = active_requests as f32 / soft_limit as f32;
2022-05-18 23:38:56 +03:00
(*rpc_id, (utilization, soft_limit))
2022-05-05 22:07:09 +03:00
})
.collect();
2022-05-19 06:00:54 +03:00
synced_rpc_ids.sort_unstable_by(|a, b| {
2022-05-18 23:38:56 +03:00
let (a_utilization, a_soft_limit) = sort_cache.get(a).unwrap();
let (b_utilization, b_soft_limit) = sort_cache.get(b).unwrap();
2022-05-06 23:44:12 +03:00
// TODO: i'm comparing floats. crap
match a_utilization
.partial_cmp(b_utilization)
.unwrap_or(cmp::Ordering::Equal)
{
cmp::Ordering::Equal => a_soft_limit.cmp(b_soft_limit),
x => x,
}
2022-05-05 22:07:09 +03:00
});
2022-05-19 06:00:54 +03:00
// now that the rpcs are sorted, try to get an active request handle for one of them
for rpc_id in synced_rpc_ids.into_iter() {
2022-05-18 23:18:01 +03:00
let rpc = self.inner.get(rpc_id).unwrap();
2022-05-05 22:07:09 +03:00
// increment our connection counter
2022-05-22 02:34:05 +03:00
match rpc.try_request_handle().await {
Err(retry_after) => {
earliest_retry_after = earliest_retry_after.min(Some(retry_after));
}
2022-05-06 23:44:12 +03:00
Ok(handle) => {
2022-05-18 23:18:01 +03:00
trace!("next server on {:?}: {:?}", self, rpc_id);
2022-05-06 23:44:12 +03:00
return Ok(handle);
}
2022-05-05 22:07:09 +03:00
}
}
2022-05-22 02:34:05 +03:00
warn!("no servers on {:?}! {:?}", self, earliest_retry_after);
2022-05-06 23:44:12 +03:00
2022-05-05 22:07:09 +03:00
// this might be None
2022-05-22 02:34:05 +03:00
Err(earliest_retry_after)
2022-05-05 22:07:09 +03:00
}
/// get all rpc servers that are not rate limited
2022-05-13 23:50:11 +03:00
/// returns servers even if they aren't in sync. This is useful for broadcasting signed transactions
2022-05-22 02:34:05 +03:00
pub async fn get_upstream_servers(&self) -> Result<Vec<ActiveRequestHandle>, Option<Duration>> {
let mut earliest_retry_after = None;
2022-05-05 22:07:09 +03:00
// TODO: with capacity?
let mut selected_rpcs = vec![];
for connection in self.inner.iter() {
// check rate limits and increment our connection counter
2022-05-22 02:34:05 +03:00
match connection.try_request_handle().await {
Err(retry_after) => {
earliest_retry_after = earliest_retry_after.min(Some(retry_after));
// this rpc is not available. skip it
}
Ok(handle) => selected_rpcs.push(handle),
2022-05-05 22:07:09 +03:00
}
}
if !selected_rpcs.is_empty() {
return Ok(selected_rpcs);
}
2022-05-22 02:34:05 +03:00
// return the earliest retry_after (if no rpcs are synced, this will be None)
Err(earliest_retry_after)
2022-05-05 22:07:09 +03:00
}
2022-05-28 07:26:24 +03:00
/// Send `request` to every rpc that will grant a request handle and wrap the response
/// picked by `try_send_parallel_requests` in a `JsonRpcForwardedResponse`.
///
/// When every rpc reports a retry-after, sleeps for the reported duration and tries again.
///
/// # Errors
///
/// Returns an error when `get_upstream_servers` reports nothing to wait for (`Err(None)`),
/// or when `try_send_parallel_requests` fails.
pub async fn try_send_all_upstream_servers(
    &self,
    request: JsonRpcRequest,
) -> anyhow::Result<JsonRpcForwardedResponse> {
    // TODO: timeout on this loop
    loop {
        match self.get_upstream_servers().await {
            Ok(active_request_handles) => {
                // TODO: benchmark this compared to waiting on unbounded futures
                // TODO: do something with this handle?
                // TODO: this is not working right. simplify
                let quorum_response = self
                    .try_send_parallel_requests(
                        active_request_handles,
                        request.method.as_ref(),
                        request.params.as_deref(),
                    )
                    .await?;

                return Ok(JsonRpcForwardedResponse {
                    jsonrpc: "2.0".to_string(),
                    id: request.id,
                    result: Some(quorum_response),
                    error: None,
                });
            }
            Err(None) => {
                // TODO: return a 502?
                // TODO: i don't think this will ever happen
                return Err(anyhow::anyhow!("no available rpcs!"));
            }
            Err(Some(retry_after)) => {
                // log before sleeping so the message appears when the wait starts
                warn!("All rate limits exceeded. Sleeping");

                // TODO: move this to a helper function
                // sleep (TODO: with a lock?) until our rate limits should be available
                // TODO: if a server catches up sync while we are waiting, we could stop waiting
                sleep(retry_after).await;
            }
        }
    }
}
2022-05-05 22:07:09 +03:00
}