another pass at server selection

Bryan Stitt 2022-12-07 22:54:38 -08:00
parent 78a2119c07
commit 86f6b16761
5 changed files with 151 additions and 136 deletions

View File

@ -254,6 +254,9 @@ These are roughly in order of completion
- need to do all the connections in parallel with spawns
- [x] add block timestamp to the /status page
- [x] be sure to save the timestamp in a way that our request routing logic can make use of it
- [x] node selection still needs improvement. we still send requests to syncing nodes if they are close to the head
- try the consensus heads first! only if that set is empty should we try other nodes, sorted by block height and then chosen randomly within each height
- [ ] having the whole block in status is very verbose. trim it down
- [ ] `cost estimate` script
- sum bytes and number of requests. prompt for hosting costs. divide to estimate a cost per request
- [ ] `stat delay` script

View File

@ -189,6 +189,7 @@ pub async fn get_migrated_db(
min_connections: u32,
max_connections: u32,
) -> anyhow::Result<DatabaseConnection> {
// TODO: this seems to fail silently
let db_conn = get_db(db_url, min_connections, max_connections).await?;
let db_backend = db_conn.get_database_backend();
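The new TODO above flags that `get_db` seems to fail silently. Purely as an illustration of how such a failure is usually surfaced (this commit only adds the comment; `fake_get_db` and the error text below are made up), attaching context with `anyhow::Context` makes the error that propagates through `?` name the step that failed:

```rust
use anyhow::Context;

// stand-in for the project's get_db; always fails so there is something to report
async fn fake_get_db(_db_url: &str) -> anyhow::Result<()> {
    anyhow::bail!("connection refused")
}

async fn connect(db_url: &str) -> anyhow::Result<()> {
    // with_context turns a bare `?` failure into an error that names the failing step
    fake_get_db(db_url)
        .await
        .with_context(|| format!("connecting to {}", db_url))
}

#[tokio::main]
async fn main() {
    if let Err(err) = connect("postgres://localhost/dev").await {
        // the alternate formatter prints the whole context chain
        eprintln!("{:#}", err);
    }
}
```

Run as-is, this prints `connecting to postgres://localhost/dev: connection refused` instead of nothing.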

View File

@ -310,7 +310,7 @@ impl Web3Connection {
let oldest_block_num = head_block_num.saturating_sub(block_data_limit);
- needed_block_num >= &oldest_block_num
+ *needed_block_num >= oldest_block_num
}
/// reconnect to the provider. errors are retried forever with exponential backoff with jitter.
@ -1017,6 +1017,19 @@ impl Web3Connection {
// TODO? ready_provider: Option<&Arc<Web3Provider>>,
allow_not_ready: bool,
) -> anyhow::Result<OpenRequestResult> {
// TODO: think more about this read block
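// if the caller requires a ready provider and this connection does not currently have one, return NotReady early instead of queueing a request against it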
if !allow_not_ready
&& self
.provider_state
.read()
.await
.provider(allow_not_ready)
.await
.is_none()
{
return Ok(OpenRequestResult::NotReady);
}
// check rate limits
if let Some(ratelimiter) = self.hard_limit.as_ref() {
// TODO: how should we know if we should set expire or not?

View File

@ -10,6 +10,7 @@ use crate::config::{BlockAndRpc, TxHashAndRpc, Web3ConnectionConfig};
use crate::frontend::authorization::{Authorization, RequestMetadata};
use crate::jsonrpc::{JsonRpcForwardedResponse, JsonRpcRequest};
use crate::rpcs::transactions::TxStatus;
use anyhow::Context;
use arc_swap::ArcSwap;
use counter::Counter;
use derive_more::From;
@ -25,6 +26,7 @@ use serde::ser::{SerializeStruct, Serializer};
use serde::Serialize;
use serde_json::json;
use serde_json::value::RawValue;
use std::collections::BTreeMap;
use std::fmt;
use std::sync::atomic::Ordering;
use std::sync::Arc;
@ -371,163 +373,159 @@ impl Web3Connections {
skip: &[Arc<Web3Connection>],
min_block_needed: Option<&U64>,
) -> anyhow::Result<OpenRequestResult> {
let usable_rpcs_by_head_num: BTreeMap<U64, Vec<Arc<Web3Connection>>> =
if let Some(min_block_needed) = min_block_needed {
// need a potentially old block. check all the rpcs
// TODO: we are going to be checking "has_block_data" a lot now
let mut m = BTreeMap::new();
for x in self
.conns
.values()
.filter(|x| !skip.contains(x))
.filter(|x| x.has_block_data(min_block_needed))
.cloned()
{
let x_head_block = x.head_block.read().clone();
match x_head_block {
None => continue,
Some(x_head) => {
m.entry(x_head.number()).or_insert_with(Vec::new).push(x);
}
}
}
m
} else {
// need latest. filter the synced rpcs
// TODO: double check has_block_data?
let synced_connections = self.synced_connections.load();
let head_num = match synced_connections.head_block.as_ref() {
None => return Ok(OpenRequestResult::NotReady),
Some(x) => x.number(),
};
let c: Vec<_> = synced_connections
.conns
.iter()
.filter(|x| !skip.contains(x))
.cloned()
.collect();
BTreeMap::from([(head_num, c)])
};
let mut earliest_retry_at = None;
let usable_rpcs: Vec<Arc<Web3Connection>> = if let Some(min_block_needed) = min_block_needed
{
// need a potentially old block. check all the rpcs
// TODO: we are going to be checking "has_block_data" a lot now
self.conns
.values()
.filter(|x| !skip.contains(x))
.filter(|x| x.has_block_data(min_block_needed))
.cloned()
.collect()
} else {
// need latest. filter the synced rpcs
// TODO: double check has_block_data?
self.synced_connections
.load()
.conns
for usable_rpcs in usable_rpcs_by_head_num.into_values().rev() {
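// BTreeMap iterates in ascending key order, so .rev() tries the servers at the highest head block first and only falls back to older heads if no handle is returned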
let mut minimum = f64::MAX;
// we sort on a combination of values. cache them here so that we don't do this math multiple times.
let mut available_request_map: HashMap<_, f64> = usable_rpcs
.iter()
.filter(|x| !skip.contains(x))
.cloned()
.collect()
};
.map(|rpc| {
// TODO: are active requests what we want? do we want a counter for requests in the last second + any actives longer than that?
// TODO: get active requests out of redis (that's definitely too slow)
// TODO: do something with hard limit instead? (but that is hitting redis too much)
let active_requests = rpc.active_requests() as f64;
let soft_limit = rpc.soft_limit as f64 * rpc.weight;
match usable_rpcs.len() {
0 => {
warn!(
"no rpcs @ {:?}: {:?} (skipped {:?})",
min_block_needed,
self.synced_connections.load(),
skip.iter().map(|x| &x.name).collect::<Vec<_>>()
);
// TODO: what should happen here? automatic retry?
// TODO: more detailed error
return Ok(OpenRequestResult::NotReady);
// TODO: maybe store weight as the percentile
let available_requests = soft_limit - active_requests;
trace!("available requests on {}: {}", rpc, available_requests);
// under heavy load, it is possible for even our best server to be negative
minimum = available_requests.min(minimum);
// TODO: clone needed?
(rpc, available_requests)
})
.collect();
trace!("minimum available requests: {}", minimum);
// weights can't have negative numbers. shift up if any are negative
if minimum < 0.0 {
available_request_map = available_request_map
.into_iter()
.map(|(rpc, weight)| {
// TODO: is simple addition the right way to shift everyone?
// TODO: probably want something non-linear
// minimum is negative, so we subtract
let x = weight - minimum;
(rpc, x)
})
.collect()
}
1 => {
let rpc = usable_rpcs.get(0).expect("len is 1");
// TODO: try or wait for a request handle?
let handle = rpc
.wait_for_request_handle(authorization, Duration::from_secs(60), false)
.await?;
let sorted_rpcs = {
if usable_rpcs.len() == 1 {
// TODO: return now instead?
vec![usable_rpcs.get(0).expect("there should be 1")]
} else {
let mut rng = thread_fast_rng::thread_fast_rng();
return Ok(OpenRequestResult::Handle(handle));
}
_ => {
// anything else and we need to pick with a weighted random chooser
// TODO: sort or weight the non-archive nodes to be first
usable_rpcs
.choose_multiple_weighted(&mut rng, usable_rpcs.len(), |rpc| {
*available_request_map
.get(rpc)
.expect("rpc should always be in the weight map")
})
.unwrap()
.collect::<Vec<_>>()
}
};
// now that the rpcs are sorted, try to get an active request handle for one of them
for best_rpc in sorted_rpcs.into_iter() {
// increment our connection counter
match best_rpc.try_request_handle(authorization, false).await {
Ok(OpenRequestResult::Handle(handle)) => {
// // trace!("next server on {:?}: {:?}", self, best_rpc);
return Ok(OpenRequestResult::Handle(handle));
}
Ok(OpenRequestResult::RetryAt(retry_at)) => {
earliest_retry_at = earliest_retry_at.min(Some(retry_at));
}
Ok(OpenRequestResult::NotReady) => {
// TODO: log a warning?
}
Err(err) => {
// TODO: log a warning?
warn!("No request handle for {}. err={:?}", best_rpc, err)
}
}
}
}
let mut minimum = f64::MAX;
// we sort on a bunch of values. cache them here so that we don't do this math multiple times.
let available_request_map: HashMap<_, f64> = usable_rpcs
.iter()
.map(|rpc| {
// TODO: are active requests what we want? do we want a counter for requests in the last second + any actives longer than that?
// TODO: get active requests out of redis (that's definitely too slow)
// TODO: do something with hard limit instead? (but that is hitting redis too much)
let active_requests = rpc.active_requests() as f64;
let soft_limit = rpc.soft_limit as f64 * rpc.weight;
// TODO: maybe store weight as the percentile
let available_requests = soft_limit - active_requests;
trace!("available requests on {}: {}", rpc, available_requests);
// under heavy load, it is possible for even our best server to be negative
minimum = available_requests.min(minimum);
(rpc.clone(), available_requests)
})
.collect();
trace!("minimum available requests: {}", minimum);
// weights can't have negative numbers. shift up if any are negative
let available_request_map: HashMap<_, f64> = if minimum < 0.0 {
available_request_map
.into_iter()
.map(|(rpc, weight)| {
// TODO: is simple addition the right way to shift everyone?
// TODO: probably want something non-linear
// minimum is negative, so we subtract
let x = weight - minimum;
(rpc, x)
})
.collect()
} else {
available_request_map
};
let sorted_rpcs = {
if usable_rpcs.len() == 1 {
vec![usable_rpcs.get(0).expect("there should be 1")]
} else {
let mut rng = thread_fast_rng::thread_fast_rng();
// TODO: sort or weight the non-archive nodes to be first
usable_rpcs
.choose_multiple_weighted(&mut rng, usable_rpcs.len(), |rpc| {
*available_request_map
.get(rpc)
.expect("rpc should always be in the weight map")
})
.unwrap()
.collect::<Vec<_>>()
}
};
// now that the rpcs are sorted, try to get an active request handle for one of them
for rpc in sorted_rpcs.iter() {
// increment our connection counter
match rpc.try_request_handle(authorization, false).await {
Ok(OpenRequestResult::Handle(handle)) => {
// // trace!("next server on {:?}: {:?}", self, rpc);
return Ok(OpenRequestResult::Handle(handle));
}
Ok(OpenRequestResult::RetryAt(retry_at)) => {
earliest_retry_at = earliest_retry_at.min(Some(retry_at));
}
Ok(OpenRequestResult::NotReady) => {
// TODO: log a warning?
}
Err(err) => {
// TODO: log a warning?
warn!("No request handle for {}. err={:?}", rpc, err)
}
}
if let Some(request_metadata) = request_metadata {
request_metadata.no_servers.fetch_add(1, Ordering::Release);
}
match earliest_retry_at {
None => {
// none of the servers gave us a time to retry at
if let Some(request_metadata) = request_metadata {
request_metadata.no_servers.fetch_add(1, Ordering::Release);
}
// TODO: bring this back?
// we could return an error here, but maybe waiting a second will fix the problem
// TODO: configurable max wait? the whole max request time, or just some portion?
let handle = sorted_rpcs
.get(0)
.expect("at least 1 is available")
.wait_for_request_handle(authorization, Duration::from_secs(3), false)
.await?;
// let handle = sorted_rpcs
// .get(0)
// .expect("at least 1 is available")
// .wait_for_request_handle(authorization, Duration::from_secs(3), false)
// .await?;
// Ok(OpenRequestResult::Handle(handle))
Ok(OpenRequestResult::Handle(handle))
Ok(OpenRequestResult::NotReady)
}
Some(earliest_retry_at) => {
warn!("no servers on {:?}! {:?}", self, earliest_retry_at);
if let Some(request_metadata) = request_metadata {
request_metadata.no_servers.fetch_add(1, Ordering::Release);
}
Ok(OpenRequestResult::RetryAt(earliest_retry_at))
}
}
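Taken together, the rewritten selection works in two stages: group the usable servers by head block number, then within the best group order them by remaining soft-limit capacity, shifting every weight up whenever load pushes even the best server negative. The sketch below shows just that ordering logic and is a simplified stand-in, not the crate's API: `Rpc` and `selection_order` are illustrative names, rate limits and request handles are omitted, and it sorts deterministically where the real code uses `choose_multiple_weighted` for a weighted random shuffle.

```rust
use std::collections::BTreeMap;

// simplified stand-in for Web3Connection: only the fields the selection math reads
struct Rpc {
    name: &'static str,
    head_block: u64,
    active_requests: u64,
    soft_limit: u32,
    weight: f64,
}

/// order rpcs for selection: highest head block first, most spare capacity first within a block
fn selection_order(rpcs: Vec<Rpc>) -> Vec<Rpc> {
    // group candidates by the head block they are currently at
    let mut by_head: BTreeMap<u64, Vec<Rpc>> = BTreeMap::new();
    for rpc in rpcs {
        by_head.entry(rpc.head_block).or_default().push(rpc);
    }

    let mut ordered = Vec::new();

    // BTreeMap yields keys in ascending order, so .rev() visits the highest head block first
    for (_head, group) in by_head.into_iter().rev() {
        // spare capacity = weighted soft limit minus requests already in flight
        let mut weighted: Vec<(f64, Rpc)> = group
            .into_iter()
            .map(|rpc| {
                let available = rpc.soft_limit as f64 * rpc.weight - rpc.active_requests as f64;
                (available, rpc)
            })
            .collect();

        // under heavy load even the best server can go negative; shift so no weight is negative
        let minimum = weighted.iter().map(|(a, _)| *a).fold(f64::MAX, f64::min);
        if minimum < 0.0 {
            for (available, _) in weighted.iter_mut() {
                *available -= minimum;
            }
        }

        // most spare capacity first (the real code samples randomly, weighted by this value)
        weighted.sort_by(|a, b| b.0.total_cmp(&a.0));

        ordered.extend(weighted.into_iter().map(|(_, rpc)| rpc));
    }

    ordered
}

fn main() {
    let rpcs = vec![
        Rpc { name: "a", head_block: 100, active_requests: 10, soft_limit: 200, weight: 1.0 },
        Rpc { name: "b", head_block: 101, active_requests: 190, soft_limit: 200, weight: 1.0 },
        Rpc { name: "c", head_block: 101, active_requests: 20, soft_limit: 100, weight: 2.0 },
    ];

    // prints c and b (both at block 101, c has more spare capacity) before a (one block behind)
    for rpc in selection_order(rpcs) {
        println!("{} @ {}", rpc.name, rpc.head_block);
    }
}
```

In the real loop above, the outer iteration only falls through to a lower head block when no server in the current group handed out a request handle, so lagging servers are consulted only after every server at the consensus head has been tried, which is exactly the "try consensus heads first" behavior called out in the TODO list.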

View File

@ -8,7 +8,7 @@ use entities::revert_log;
use entities::sea_orm_active_enums::Method;
use ethers::providers::{HttpClientError, ProviderError, WsClientError};
use ethers::types::{Address, Bytes};
- use log::{debug, error, info, trace, warn, Level};
+ use log::{debug, error, trace, warn, Level};
use metered::metered;
use metered::HitCount;
use metered::ResponseTime;