improve rate limiting and request counters

This commit is contained in:
Bryan Stitt 2022-09-06 20:12:45 +00:00
parent 98265424bb
commit c34e8ef383
12 changed files with 93 additions and 50 deletions

@ -7,7 +7,7 @@ min_sum_soft_limit = 2000
min_synced_rpcs = 2
redis_url = "redis://dev-redis:6379/"
# TODO: how do we find the optimal redis_max_connections? too high actually ends up being slower
redis_max_connections = 99
redis_max_connections = 300
redirect_public_url = "https://llamanodes.com/free-rpc-stats"
redirect_user_url = "https://llamanodes.com/user-rpc-stats/{{user_id}}"
public_rate_limit_per_minute = 0
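
A setting like redis_max_connections normally becomes the bb8 pool's max_size. How exactly this repo wires it up is not shown in the hunk, so the sketch below is an assumption; build_pool and its parameters are illustrative names.

use bb8_redis::{bb8, RedisConnectionManager};

async fn build_pool(
    redis_url: &str,
    max_connections: u32,
) -> anyhow::Result<bb8::Pool<RedisConnectionManager>> {
    let manager = RedisConnectionManager::new(redis_url)?;

    // max_size caps the connections the pool hands out; raising it from 99 to 300
    // buys headroom under load at the cost of more open sockets on the redis server
    let pool = bb8::Pool::builder()
        .max_size(max_connections)
        .build(manager)
        .await?;

    Ok(pool)
}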

@ -1,14 +1,14 @@
pub use bb8_redis::bb8::ErrorSink as Bb8ErrorSync;
pub use bb8_redis::redis::RedisError;
use tracing::warn;
use tracing::error;
#[derive(Debug, Clone)]
pub struct RedisErrorSink;
impl Bb8ErrorSync<RedisError> for RedisErrorSink {
fn sink(&self, err: RedisError) {
warn!(?err, "redis error");
error!(?err, "redis error");
}
fn boxed_clone(&self) -> Box<dyn Bb8ErrorSync<RedisError>> {
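
The sink above only changes the log level from warn to error; it reaches the pool through bb8's error_sink hook on the builder. That wiring is not part of this hunk, so the sketch below is an assumption about where it lives; pool_with_sink is an illustrative name.

use bb8_redis::{bb8, redis::RedisError, RedisConnectionManager};
use crate::errors::RedisErrorSink; // the sink defined in this errors module

async fn pool_with_sink(
    manager: RedisConnectionManager,
) -> Result<bb8::Pool<RedisConnectionManager>, RedisError> {
    // every connection error the pool swallows internally is forwarded to
    // RedisErrorSink::sink, which now logs it at error level
    bb8::Pool::builder()
        .error_sink(Box::new(RedisErrorSink))
        .build(manager)
        .await
}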

@ -5,7 +5,7 @@ use anyhow::Context;
use bb8_redis::redis::pipe;
use std::ops::Add;
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use tracing::trace;
use tracing::{debug, trace};
pub use crate::errors::{RedisError, RedisErrorSink};
pub use bb8_redis::{bb8, redis, RedisConnectionManager};
@ -72,12 +72,15 @@ impl RedisRateLimit {
let mut conn = self.pool.get().await?;
// TODO: at high concurrency, i think this is giving errors
// TODO: i'm starting to think that bb8 has a bug
let x: Vec<u64> = pipe()
// we could get the key first, but that means an extra redis call for every check. this seems better
.incr(&throttle_key, count)
// set expiration the first time we set the key. ignore the result
// set expiration each time we set the key. ignore the result
.expire(&throttle_key, self.period as usize)
// .arg("NX") // TODO: this works in redis, but not elasticache
// TODO: NX will make it only set the expiration the first time. works in redis, but not elasticache
// .arg("NX")
.ignore()
// do the query
.query_async(&mut *conn)
@ -91,12 +94,13 @@ impl RedisRateLimit {
let retry_at = Instant::now().add(Duration::from_secs_f32(seconds_left_in_period));
trace!(%label, ?retry_at, "rate limited");
debug!(%label, ?retry_at, "rate limited: {}/{}", new_count, max_per_period);
return Ok(ThrottleResult::RetryAt(retry_at));
Ok(ThrottleResult::RetryAt(retry_at))
} else {
trace!(%label, "NOT rate limited: {}/{}", new_count, max_per_period);
Ok(ThrottleResult::Allowed)
}
Ok(ThrottleResult::Allowed)
}
#[inline]
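
Stripped of the struct around it, the throttle path above is a fixed-window counter: one pipelined INCR plus EXPIRE per check, then a comparison against max_per_period by the caller. A minimal sketch under the same bb8_redis re-exports; fixed_window_incr and its parameters are illustrative, not this crate's API.

use anyhow::Context;
use bb8_redis::{bb8, redis::pipe, RedisConnectionManager};

/// INCR the per-label key and refresh its TTL in one round trip.
/// Returns the count after this increment.
async fn fixed_window_incr(
    pool: &bb8::Pool<RedisConnectionManager>,
    throttle_key: &str,
    count: u64,
    period_secs: usize,
) -> anyhow::Result<u64> {
    let mut conn = pool.get().await.context("getting redis connection")?;

    let x: Vec<u64> = pipe()
        .incr(throttle_key, count)
        // without NX the TTL is refreshed on every call (NX works in redis but not
        // elasticache), so a busy key's window drifts slightly instead of staying fixed
        .expire(throttle_key, period_secs)
        .ignore()
        .query_async(&mut *conn)
        .await
        .context("incrementing rate limit key")?;

    // only the INCR reply survives; .ignore() dropped the EXPIRE reply
    x.first().copied().context("empty pipeline response")
}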

@ -39,7 +39,7 @@ use tokio::sync::{broadcast, watch};
use tokio::task::JoinHandle;
use tokio::time::{timeout, Instant};
use tokio_stream::wrappers::{BroadcastStream, WatchStream};
use tracing::{debug, info, info_span, instrument, trace, warn, Instrument};
use tracing::{info, info_span, instrument, trace, warn, Instrument};
use uuid::Uuid;
// TODO: make this customizable?
@ -78,8 +78,6 @@ pub struct Web3ProxyApp {
// TODO: broadcast channel instead?
head_block_receiver: watch::Receiver<ArcBlock>,
pending_tx_sender: broadcast::Sender<TxStatus>,
/// TODO: this doesn't ever get incremented!
pub active_requests: AtomicUsize,
pub config: AppConfig,
pub db_conn: Option<sea_orm::DatabaseConnection>,
/// store pending transactions that we've seen so that we don't send duplicates to subscribers
@ -265,6 +263,7 @@ impl Web3ProxyApp {
handles.push(balanced_handle);
let private_rpcs = if private_rpcs.is_empty() {
// TODO: do None instead of clone?
warn!("No private relays configured. Any transactions will be broadcast to the public mempool!");
balanced_rpcs.clone()
} else {
@ -310,7 +309,6 @@ impl Web3ProxyApp {
config: top_config.app,
balanced_rpcs,
private_rpcs,
active_requests: Default::default(),
response_cache,
head_block_receiver,
pending_tx_sender,
@ -527,8 +525,7 @@ impl Web3ProxyApp {
request: JsonRpcRequestEnum,
) -> anyhow::Result<JsonRpcForwardedResponseEnum> {
// TODO: this should probably be trace level
// trace!(?request, "proxy_web3_rpc");
debug!(?request, "proxying request");
trace!(?request, "proxy_web3_rpc");
// even though we have timeouts on the requests to our backend providers,
// we need a timeout for the incoming request so that retries don't run forever
@ -545,8 +542,7 @@ impl Web3ProxyApp {
};
// TODO: this should probably be trace level
// trace!(?response, "Forwarding");
debug!(?response.ids(), "forwarding response");
trace!(?response, "Forwarding");
Ok(response)
}
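
The timeout comment above carries the real constraint of this handler: each backend call is already bounded, but the incoming request needs its own outer bound so retry loops cannot keep a client request alive forever. A sketch of that outer bound with tokio::time::timeout (already imported in this file); the 120-second figure and the names here are illustrative.

use std::time::Duration;
use tokio::time::timeout;

async fn with_request_timeout<T>(
    handle: impl std::future::Future<Output = anyhow::Result<T>>,
) -> anyhow::Result<T> {
    // the outer bound fires even if every individual backend retry stays under its own limit
    timeout(Duration::from_secs(120), handle)
        .await
        .map_err(|_elapsed| anyhow::anyhow!("request timed out"))?
}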

@ -127,7 +127,7 @@ fn main() -> anyhow::Result<()> {
// if RUST_LOG isn't set, configure a default
// TODO: is there a better way to do this?
if std::env::var("RUST_LOG").is_err() {
std::env::set_var("RUST_LOG", "info,web3_proxy=debug");
std::env::set_var("RUST_LOG", "info,redis_rate_limit=debug,web3_proxy=debug");
}
// install global collector configured based on RUST_LOG env var.
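
The "global collector" in the comment above is usually a tracing_subscriber registry driven by an EnvFilter, which is what makes the RUST_LOG default set here (now including redis_rate_limit=debug) take effect. The repo's actual init call is outside this hunk, so the sketch below is an assumption about that wiring; init_tracing is an illustrative name.

use tracing_subscriber::EnvFilter;

fn init_tracing() {
    // reads RUST_LOG, which main() pre-populated with
    // "info,redis_rate_limit=debug,web3_proxy=debug" when unset
    tracing_subscriber::fmt()
        .with_env_filter(EnvFilter::from_default_env())
        .init();
}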

@ -1,7 +1,7 @@
use crate::app::Web3ProxyApp;
use axum::{http::StatusCode, response::IntoResponse, Extension, Json};
use serde_json::json;
use std::sync::{atomic::Ordering, Arc};
use std::sync::Arc;
/// Health check page for load balancers to use
pub async fn health(Extension(app): Extension<Arc<Web3ProxyApp>>) -> impl IntoResponse {
@ -15,15 +15,13 @@ pub async fn health(Extension(app): Extension<Arc<Web3ProxyApp>>) -> impl IntoRe
/// Very basic status page
/// TODO: replace this with proper stats and monitoring
pub async fn status(Extension(app): Extension<Arc<Web3ProxyApp>>) -> impl IntoResponse {
// TODO: what else should we include? uptime? prometheus?
// TODO: what else should we include? uptime?
let body = json!({
"balanced_rpcs": app.balanced_rpcs,
"private_rpcs": app.private_rpcs,
"num_active_requests": app.active_requests.load(Ordering::Acquire),
// TODO: include number of items?
"pending_transactions_count": app.pending_transactions.entry_count(),
"pending_transactions_size": app.pending_transactions.weighted_size(),
});
(StatusCode::OK, Json(body))
Json(body)
}
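
Dropping the explicit StatusCode works because axum's Json<T> already implements IntoResponse and answers with 200 OK on its own; the tuple form only matters when a non-default status is needed. A minimal standalone illustration:

use axum::{response::IntoResponse, Json};
use serde_json::json;

async fn status_sketch() -> impl IntoResponse {
    // Json sets the content type and a 200 status by itself,
    // so wrapping it as (StatusCode::OK, Json(body)) was redundant
    Json(json!({ "ok": true }))
}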

@ -11,7 +11,7 @@ use sea_orm::{
};
use std::{net::IpAddr, time::Duration};
use tokio::time::Instant;
use tracing::debug;
use tracing::{debug, error};
use uuid::Uuid;
pub enum RateLimitResult {
@ -119,6 +119,7 @@ impl TryFrom<RateLimitResult> for RequestFrom {
impl Web3ProxyApp {
pub async fn rate_limit_by_ip(&self, ip: IpAddr) -> anyhow::Result<RateLimitResult> {
// TODO: dry this up with rate_limit_by_key
// TODO: have a local cache because if we hit redis too hard we get errors
if let Some(rate_limiter) = &self.rate_limiter {
let rate_limiter_label = format!("ip-{}", ip);
@ -136,12 +137,13 @@ impl Web3ProxyApp {
}
Ok(ThrottleResult::RetryNever) => {
// TODO: prettier error for the user
return Err(anyhow::anyhow!("blocked by rate limiter"));
return Err(anyhow::anyhow!("ip blocked by rate limiter"));
}
Err(err) => {
// internal error, not rate limit being hit
// TODO: i really want axum to do this for us in a single place.
return Err(err);
error!(?err, "redis is unhappy. allowing ip");
return Ok(RateLimitResult::AllowedIp(ip));
}
}
} else {
@ -194,7 +196,7 @@ impl Web3ProxyApp {
}
};
// save for the next run
// save for the next run
self.user_cache.insert(user_key, user_data).await;
Ok(user_data)
@ -234,20 +236,39 @@ impl Web3ProxyApp {
// user key is valid. now check rate limits
if let Some(rate_limiter) = &self.rate_limiter {
// TODO: query redis in the background so that users don't have to wait on this network request
if rate_limiter
// TODO: better key? have a prefix so it's easy to delete all of these
let rate_limiter_label = user_key.to_string();
match rate_limiter
.throttle_label(
&user_key.to_string(),
&rate_limiter_label,
Some(user_data.user_count_per_period),
1,
)
.await
.is_err()
{
// TODO: set headers so they know when they can retry
// warn!(?ip, "public rate limit exceeded"); // this is too verbose, but a stat might be good
// TODO: use their id if possible
// TODO: StatusCode::TOO_MANY_REQUESTS
return Err(anyhow::anyhow!("too many requests from this key"));
Ok(ThrottleResult::Allowed) => {}
Ok(ThrottleResult::RetryAt(_retry_at)) => {
// TODO: set headers so they know when they can retry
debug!(?rate_limiter_label, "user rate limit exceeded"); // this is too verbose, but a stat might be good
// TODO: use their id if possible
return Ok(RateLimitResult::UserRateLimitExceeded(user_data.user_id));
}
Ok(ThrottleResult::RetryNever) => {
// TODO: prettier error for the user
return Err(anyhow::anyhow!("user blocked by rate limiter"));
}
Err(err) => {
// internal error, not rate limit being hit
// rather than have downtime, i think it's better to just use in-process rate limiting
// TODO: in-process rate limits that pipe into redis
error!(?err, "redis is unhappy. allowing user");
return Ok(RateLimitResult::AllowedUser(user_data.user_id));
} // // TODO: set headers so they know when they can retry
// // warn!(?ip, "public rate limit exceeded"); // this is too verbose, but a stat might be good
// // TODO: use their id if possible
// // TODO: StatusCode::TOO_MANY_REQUESTS
// return Err(anyhow::anyhow!("too many requests from this key"));
}
} else {
// TODO: if no redis, rate limit with a local cache?
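
Both the ip path and the user-key path now fail open: a Redis error is logged and the request is allowed, instead of bubbling the error up and turning a Redis outage into proxy downtime. Distilled, the decision looks like the helper below; it assumes throttle_label returns anyhow::Result<ThrottleResult> and that ThrottleResult is exported by the redis_rate_limit crate, and fail_open itself is illustrative, not code in the repo.

use redis_rate_limit::ThrottleResult;
use tracing::error;

/// Collapse a throttle call into a decision, allowing the request when Redis itself errors.
fn fail_open(result: anyhow::Result<ThrottleResult>, what: &str) -> ThrottleResult {
    match result {
        // Allowed, RetryAt and RetryNever are real rate-limit decisions and pass through
        Ok(decision) => decision,
        Err(err) => {
            // infrastructure failure, not a limit being hit:
            // degrade rate limiting rather than reject traffic
            error!(?err, "redis is unhappy. allowing {}", what);
            ThrottleResult::Allowed
        }
    }
}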

@ -458,7 +458,7 @@ impl Web3Connections {
debug!(con_head=%heavy_block_id, rpc_head=%rpc_head_str, %rpc, "con {}/{}/{}", num_consensus_rpcs, num_connection_heads, total_conns)
} else {
// hash changed
info!(con_head=%heavy_block_id, rpc_head=%rpc_head_str, old=%old_block_id, %rpc, "unc {}/{}/{}", num_consensus_rpcs, num_connection_heads, total_conns);
info!(con_head=%heavy_block_id, old=%old_block_id, rpc_head=%rpc_head_str, %rpc, "unc {}/{}/{}", num_consensus_rpcs, num_connection_heads, total_conns);
// todo!("handle equal by updating the cannonical chain");
self.save_block(&heavy_block, true).await?;

@ -28,6 +28,9 @@ pub struct Web3Connection {
url: String,
/// keep track of currently open requests. We sort on this
pub(super) active_requests: AtomicU32,
/// keep track of total requests
/// TODO: is this type okay?
pub(super) total_requests: AtomicU64,
/// provider is in a RwLock so that we can replace it if re-connecting
/// it is an async lock because we hold it open across awaits
pub(super) provider: AsyncRwLock<Option<Arc<Web3Provider>>>,
@ -35,7 +38,7 @@ pub struct Web3Connection {
hard_limit: Option<RedisRateLimit>,
/// used for load balancing to the least loaded server
pub(super) soft_limit: u32,
/// TODO: have an enum for this so that "no limit" prints pretty
/// TODO: have an enum for this so that "no limit" prints pretty?
block_data_limit: AtomicU64,
/// Lower weights are higher priority when sending requests
pub(super) weight: u32,
@ -82,6 +85,7 @@ impl Web3Connection {
name,
url: url_str.clone(),
active_requests: 0.into(),
total_requests: 0.into(),
provider: AsyncRwLock::new(Some(Arc::new(provider))),
hard_limit,
soft_limit,
@ -777,13 +781,18 @@ impl Serialize for Web3Connection {
S: Serializer,
{
// 6 is the number of fields in the struct.
let mut state = serializer.serialize_struct("Web3Connection", 5)?;
let mut state = serializer.serialize_struct("Web3Connection", 6)?;
// the url is excluded because it likely includes private information. just show the name
state.serialize_field("name", &self.name)?;
let block_data_limit = self.block_data_limit.load(atomic::Ordering::Relaxed);
state.serialize_field("block_data_limit", &block_data_limit)?;
if block_data_limit == u64::MAX {
state.serialize_field("block_data_limit", "None")?;
} else {
state.serialize_field("block_data_limit", &block_data_limit)?;
}
state.serialize_field("soft_limit", &self.soft_limit)?;
@ -792,6 +801,11 @@ impl Serialize for Web3Connection {
&self.active_requests.load(atomic::Ordering::Relaxed),
)?;
state.serialize_field(
"total_requests",
&self.total_requests.load(atomic::Ordering::Relaxed),
)?;
let head_block_id = &*self.head_block_id.read();
state.serialize_field("head_block_id", head_block_id)?;

@ -56,7 +56,7 @@ impl Web3Connections {
chain_id: u64,
server_configs: HashMap<String, Web3ConnectionConfig>,
http_client: Option<reqwest::Client>,
redis_client_pool: Option<redis_rate_limit::RedisPool>,
redis_pool: Option<redis_rate_limit::RedisPool>,
block_map: BlockHashesMap,
head_block_sender: Option<watch::Sender<ArcBlock>>,
min_sum_soft_limit: u32,
@ -83,7 +83,7 @@ impl Web3Connections {
async move {
loop {
// TODO: every time a head_block arrives (maybe with a small delay), or on the interval.
// TODO: every time a head_block arrives (with a small delay for known slow servers), or on the interval.
interval.tick().await;
trace!("http interval ready");
@ -108,7 +108,7 @@ impl Web3Connections {
.into_iter()
.map(|(server_name, server_config)| {
let http_client = http_client.clone();
let redis_client_pool = redis_client_pool.clone();
let redis_pool = redis_pool.clone();
let http_interval_sender = http_interval_sender.clone();
let block_sender = if head_block_sender.is_some() {
@ -124,7 +124,7 @@ impl Web3Connections {
server_config
.spawn(
server_name,
redis_client_pool,
redis_pool,
chain_id,
http_client,
http_interval_sender,
@ -159,11 +159,16 @@ impl Web3Connections {
}
}
// TODO: less than 3? what should we do here?
if connections.len() < 2 {
warn!("Only {} connection(s)!", connections.len());
if connections.len() < min_synced_rpcs {
return Err(anyhow::anyhow!(
"Only {}/{} connections!",
connections.len(),
min_synced_rpcs
));
}
// TODO: safety check on sum soft limit
let synced_connections = SyncedConnections::default();
// TODO: sizing and expiration on these caches!

@ -1,8 +1,7 @@
use std::time::Duration;
use derive_more::From;
use ethers::providers::Middleware;
use tracing::{error_span, info_span, instrument, Instrument};
use tracing::{info_span, instrument, Instrument};
/// Use HTTP and WS providers.
// TODO: instead of an enum, I tried to use Box<dyn Provider>, but hit <https://github.com/gakonst/ethers-rs/issues/592>

@ -22,12 +22,18 @@ pub struct OpenRequestHandle(Arc<Web3Connection>);
impl OpenRequestHandle {
pub fn new(connection: Arc<Web3Connection>) -> Self {
// TODO: attach a unique id to this?
// TODO: attach a unique id to this? customer requests have one, but not internal queries
// TODO: what ordering?!
connection
.active_requests
.fetch_add(1, atomic::Ordering::AcqRel);
// TODO: handle overflows?
// TODO: what ordering?
connection
.total_requests
.fetch_add(1, atomic::Ordering::Relaxed);
Self(connection)
}
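
total_requests only ever counts up and never guards other memory, so Relaxed is sufficient there; active_requests also needs the matching decrement once the request finishes. A sketch of how a handle can pair the two; the Drop impl and the type names below are illustrative, not necessarily what this repo does.

use std::sync::{
    atomic::{AtomicU32, AtomicU64, Ordering},
    Arc,
};

struct RequestCounters {
    active_requests: AtomicU32,
    total_requests: AtomicU64,
}

struct CountedRequest(Arc<RequestCounters>);

impl CountedRequest {
    fn new(counters: Arc<RequestCounters>) -> Self {
        counters.active_requests.fetch_add(1, Ordering::AcqRel);
        // a lifetime total never synchronizes anything else, so Relaxed is enough,
        // and u64 makes overflow a non-issue in practice
        counters.total_requests.fetch_add(1, Ordering::Relaxed);
        Self(counters)
    }
}

impl Drop for CountedRequest {
    fn drop(&mut self) {
        // however the request ends, the active count comes back down
        self.0.active_requests.fetch_sub(1, Ordering::AcqRel);
    }
}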