web3-proxy/deferred-rate-limiter/src/lib.rs

//#![warn(missing_docs)]
use log::error;
use moka::future::Cache;
use redis_rate_limiter::{RedisRateLimitResult, RedisRateLimiter};
use std::cmp::Eq;
use std::fmt::{Debug, Display};
use std::hash::Hash;
use std::sync::atomic::Ordering;
use std::sync::{atomic::AtomicU64, Arc};
use tokio::sync::Mutex;
use tokio::time::{Duration, Instant};

/// A local cache that sits in front of a RedisRateLimiter
/// Generic across the key so it is simple to use with IPs or user keys
pub struct DeferredRateLimiter<K>
where
    K: Send + Sync,
{
    /// per-key request counters kept in memory. entries expire after the cache's time-to-live
    local_cache: Cache<K, Arc<AtomicU64>, hashbrown::hash_map::DefaultHashBuilder>,
    /// prepended to the key (as `"{prefix}:{key}"`) to build the label sent to redis. also used as the cache's name
    prefix: String,
    /// the redis-backed limiter that the local counters are synced against
    rrl: RedisRateLimiter,
    /// if None, defers to the max on rrl
    default_max_requests_per_period: Option<u64>,
}

/// Outcome of a [`DeferredRateLimiter::throttle`] call.
///
/// Derives `Debug` so results can be logged, and `Clone`/`Copy`/`PartialEq`/`Eq`
/// so callers and tests can compare and pass them around freely.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeferredRateLimitResult {
    /// The request is within the rate limit (or the limit could not be checked) and may proceed.
    Allowed,
    /// The request is over the limit. The caller may retry at the given instant.
    RetryAt(Instant),
    /// The effective limit is zero, so retrying will never succeed.
    RetryNever,
}

impl<K> DeferredRateLimiter<K>
where
    K: Copy + Debug + Display + Hash + Eq + Send + Sync + 'static,
{
    pub fn new(
        // TODO: change this to cache_size in bytes
        cache_size: u64,
        prefix: &str,
        rrl: RedisRateLimiter,
        default_max_requests_per_second: Option<u64>,
    ) -> Self {
        let ttl = rrl.period as u64;

        // TODO: time to live is not exactly right. we want this ttl counter to start only after redis is down. this works for now
        // TODO: what do these weigh?
        // TODO: allow skipping max_capacity
        let local_cache = Cache::builder()
            .time_to_live(Duration::from_secs(ttl))
            .max_capacity(cache_size)
            .name(prefix)
            .build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default());

        Self {
            local_cache,
            prefix: prefix.to_string(),
            rrl,
            default_max_requests_per_period: default_max_requests_per_second,
        }
    }

    /// if setting max_per_period, be sure to keep the period the same for all requests to this label
    /// TODO: max_per_period being None means two things. some places it means unlimited, but here it means to use the default. make an enum
    pub async fn throttle(
        &self,
        key: K,
        max_requests_per_period: Option<u64>,
        count: u64,
    ) -> anyhow::Result<DeferredRateLimitResult> {
        let max_requests_per_period = max_requests_per_period.unwrap_or_else(|| {
            self.default_max_requests_per_period
                .unwrap_or(self.rrl.max_requests_per_period)
        });

        if max_requests_per_period == 0 {
            return Ok(DeferredRateLimitResult::RetryNever);
        }

        let deferred_rate_limit_result = Arc::new(Mutex::new(None));

        let redis_key = format!("{}:{}", self.prefix, key);

        // TODO: i'm sure this could be a lot better. but race conditions make this hard to think through. brain needs sleep
        let local_key_count: Arc<AtomicU64> = {
            // clone things outside of the `async move`
            let deferred_rate_limit_result = deferred_rate_limit_result.clone();
            let redis_key = redis_key.clone();
            let rrl = Arc::new(self.rrl.clone());

            // set arc_deferred_rate_limit_result and return the coun
            self.local_cache
                .get_with_by_ref(&key, async move {
                    // we do not use the try operator here because we want to be okay with redis errors
                    let redis_count = match rrl
                        .throttle_label(&redis_key, Some(max_requests_per_period), count)
                        .await
                    {
                        Ok(RedisRateLimitResult::Allowed(count)) => {
                            let _ = deferred_rate_limit_result
                                .lock()
                                .await
                                .insert(DeferredRateLimitResult::Allowed);
                            count
                        }
                        Ok(RedisRateLimitResult::RetryAt(retry_at, count)) => {
                            let _ = deferred_rate_limit_result
                                .lock()
                                .await
                                .insert(DeferredRateLimitResult::RetryAt(retry_at));
                            count
                        }
                        Ok(RedisRateLimitResult::RetryNever) => {
                            unreachable!();
                        }
                        Err(err) => {
                            let _ = deferred_rate_limit_result
                                .lock()
                                .await
                                .insert(DeferredRateLimitResult::Allowed);

                            // if we get a redis error, just let the user through.
                            // if users are sticky on a server, local caches will work well enough
                            // though now that we do this, we need to reset rate limits every minute! cache must have ttl!
                            error!("unable to rate limit! creating empty cache. err={:?}", err);
                            0
                        }
                    };

                    Arc::new(AtomicU64::new(redis_count))
                })
                .await
        };

        let mut locked = deferred_rate_limit_result.lock().await;

        if let Some(deferred_rate_limit_result) = locked.take() {
            // new entry. redis was already incremented
            // return the retry_at that we got from
            Ok(deferred_rate_limit_result)
        } else {
            // we have a cached amount here
            let cached_key_count = local_key_count.fetch_add(count, Ordering::Acquire);

            // assuming no other parallel futures incremented this key, this is the count that redis has
            let expected_key_count = cached_key_count + count;

            if expected_key_count > max_requests_per_period {
                // rate limit overshot!
                let now = self.rrl.now_as_secs();

                // do not fetch_sub
                // another row might have queued a redis throttle_label to keep our count accurate

                // show that we are rate limited without even querying redis
                let retry_at = self.rrl.next_period(now);
                Ok(DeferredRateLimitResult::RetryAt(retry_at))
            } else {
                // local caches think rate limit should be okay

                // prepare a future to update redis
                let rate_limit_f = {
                    let rrl = self.rrl.clone();
                    async move {
                        match rrl
                            .throttle_label(&redis_key, Some(max_requests_per_period), count)
                            .await
                        {
                            Ok(RedisRateLimitResult::Allowed(count)) => {
                                local_key_count.store(count, Ordering::Release);
                                DeferredRateLimitResult::Allowed
                            }
                            Ok(RedisRateLimitResult::RetryAt(retry_at, count)) => {
                                local_key_count.store(count, Ordering::Release);
                                DeferredRateLimitResult::RetryAt(retry_at)
                            }
                            Ok(RedisRateLimitResult::RetryNever) => {
                                // TODO: what should we do to arc_key_count?
                                DeferredRateLimitResult::RetryNever
                            }
                            Err(err) => {
                                // don't let redis errors block our users!
                                error!(
                                    "unable to query rate limits, but local cache is available. key={:?} err={:?}",
                                    key,
                                    err,
                                );
                                // TODO: we need to start a timer that resets this count every minute
                                DeferredRateLimitResult::Allowed
                            }
                        }
                    }
                };

                // if close to max_per_period, wait for redis
                // TODO: how close should we allow? depends on max expected concurent requests from one user
                let limit: f64 = (max_requests_per_period as f64 * 0.99)
                    .min(max_requests_per_period as f64 - 1.0);
                if expected_key_count > limit as u64 {
                    // close to period. don't risk it. wait on redis
                    Ok(rate_limit_f.await)
                } else {
                    // rate limit has enough headroom that it should be safe to do this in the background
                    // TODO: send an error here somewhere
                    tokio::spawn(rate_limit_f);

                    Ok(DeferredRateLimitResult::Allowed)
                }
            }
        }
    }
}
work in progress 2022-09-15 20:57:24 +03:00			`//#![warn(missing_docs)]`
cut out tracing for now 2022-11-12 11:24:32 +03:00			`use log::error;`
work in progress 2022-09-15 20:57:24 +03:00			`use moka::future::Cache;`
			`use redis_rate_limiter::{RedisRateLimitResult, RedisRateLimiter};`
			`use std::cmp::Eq;`
it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00			`use std::fmt::{Debug, Display};`
work in progress 2022-09-15 20:57:24 +03:00			`use std::hash::Hash;`
no need for an atomic bool 2022-09-20 01:17:24 +03:00			`use std::sync::atomic::Ordering;`
work in progress 2022-09-15 20:57:24 +03:00			`use std::sync::{atomic::AtomicU64, Arc};`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`use tokio::sync::Mutex;`
use ahash. allow no redis 2022-09-17 04:19:11 +03:00			`use tokio::time::{Duration, Instant};`
work in progress 2022-09-15 20:57:24 +03:00
			`/// A local cache that sits in front of a RedisRateLimiter`
			`/// Generic accross the key so it is simple to use with IPs or user keys`
			`pub struct DeferredRateLimiter<K>`
			`where`
			`K: Send + Sync,`
			`{`
use cache's expiration and race-free get_with when this was a dashmap, we needed our own expiration and parallel requests would do the same query. with moka, we can use their expiration code and get_with 2022-09-20 04:33:39 +03:00			`local_cache: Cache<K, Arc<AtomicU64>, hashbrown::hash_map::DefaultHashBuilder>,`
work in progress 2022-09-15 20:57:24 +03:00			`prefix: String,`
			`rrl: RedisRateLimiter,`
login needs its own rate limiter 2022-09-24 06:59:21 +03:00			`/// if None, defers to the max on rrl`
			`default_max_requests_per_period: Option<u64>,`
work in progress 2022-09-15 20:57:24 +03:00			`}`

			`pub enum DeferredRateLimitResult {`
			`Allowed,`
			`RetryAt(Instant),`
			`RetryNever,`
			`}`

			`impl<K> DeferredRateLimiter<K>`
			`where`
it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00			`K: Copy + Debug + Display + Hash + Eq + Send + Sync + 'static,`
work in progress 2022-09-15 20:57:24 +03:00			`{`
login needs its own rate limiter 2022-09-24 06:59:21 +03:00			`pub fn new(`
comments and todos 2022-11-16 23:18:37 +03:00			`// TODO: change this to cache_size in bytes`
login needs its own rate limiter 2022-09-24 06:59:21 +03:00			`cache_size: u64,`
			`prefix: &str,`
			`rrl: RedisRateLimiter,`
			`default_max_requests_per_second: Option<u64>,`
			`) -> Self {`
use ahash. allow no redis 2022-09-17 04:19:11 +03:00			`let ttl = rrl.period as u64;`

larger max_capacity now that there is a weigher 2022-09-20 01:24:56 +03:00			`// TODO: time to live is not exactly right. we want this ttl counter to start only after redis is down. this works for now`
comments and todos 2022-11-16 23:18:37 +03:00			`// TODO: what do these weigh?`
comment 2022-12-29 00:53:36 +03:00			`// TODO: allow skipping max_capacity`
use ahash. allow no redis 2022-09-17 04:19:11 +03:00			`let local_cache = Cache::builder()`
			`.time_to_live(Duration::from_secs(ttl))`
			`.max_capacity(cache_size)`
			`.name(prefix)`
cargo upgrade 2022-11-11 21:40:52 +03:00			`.build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default());`
use ahash. allow no redis 2022-09-17 04:19:11 +03:00
work in progress 2022-09-15 20:57:24 +03:00			`Self {`
use ahash. allow no redis 2022-09-17 04:19:11 +03:00			`local_cache,`
work in progress 2022-09-15 20:57:24 +03:00			`prefix: prefix.to_string(),`
			`rrl,`
login needs its own rate limiter 2022-09-24 06:59:21 +03:00			`default_max_requests_per_period: default_max_requests_per_second,`
work in progress 2022-09-15 20:57:24 +03:00			`}`
			`}`

			`/// if setting max_per_period, be sure to keep the period the same for all requests to this label`
no need for an atomic bool 2022-09-20 01:17:24 +03:00			`/// TODO: max_per_period being None means two things. some places it means unlimited, but here it means to use the default. make an enum`
work in progress 2022-09-15 20:57:24 +03:00			`pub async fn throttle(`
			`&self,`
no timeouts here, we already have a timeout on requests 2022-09-20 06:26:12 +03:00			`key: K,`
login needs its own rate limiter 2022-09-24 06:59:21 +03:00			`max_requests_per_period: Option<u64>,`
work in progress 2022-09-15 20:57:24 +03:00			`count: u64,`
			`) -> anyhow::Result<DeferredRateLimitResult> {`
login needs its own rate limiter 2022-09-24 06:59:21 +03:00			`let max_requests_per_period = max_requests_per_period.unwrap_or_else(\|\| {`
			`self.default_max_requests_per_period`
			`.unwrap_or(self.rrl.max_requests_per_period)`
			`});`
work in progress 2022-09-15 20:57:24 +03:00
login needs its own rate limiter 2022-09-24 06:59:21 +03:00			`if max_requests_per_period == 0 {`
work in progress 2022-09-15 20:57:24 +03:00			`return Ok(DeferredRateLimitResult::RetryNever);`
			`}`

instrument more. add max_wait to wait_for_request_handle 2022-09-20 09:00:27 +03:00			`let deferred_rate_limit_result = Arc::new(Mutex::new(None));`
work in progress 2022-09-15 20:57:24 +03:00
it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00			`let redis_key = format!("{}:{}", self.prefix, key);`

work in progress 2022-09-15 20:57:24 +03:00			`// TODO: i'm sure this could be a lot better. but race conditions make this hard to think through. brain needs sleep`
no idle timeout or max lifetime 2022-09-20 01:41:53 +03:00			`let local_key_count: Arc<AtomicU64> = {`
no need for an atomic bool 2022-09-20 01:17:24 +03:00			// clone things outside of the `async move`
instrument more. add max_wait to wait_for_request_handle 2022-09-20 09:00:27 +03:00			`let deferred_rate_limit_result = deferred_rate_limit_result.clone();`
it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00			`let redis_key = redis_key.clone();`
			`let rrl = Arc::new(self.rrl.clone());`

no need for an atomic bool 2022-09-20 01:17:24 +03:00			`// set arc_deferred_rate_limit_result and return the coun`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`self.local_cache`
Bryan devel 2023-05-12 (#67) * add minor todo * BadRequest instead of web3_context * more bad request error codes * use tokio-uring for the tcp listener * clear block instead of panic * clone earlier * more watch channels instead of rwlocks * drop uring for now (its single threaded) and combine get/post/put routes * clean up iter vs into_iter and unnecessary collect * arcswap instead of rwlock for Web3Rpcs.by_name * cargo upgrade * uuid fast-rng and alphabetize * if protected rpcs, only use protected rpcs * listenfd * make connectinfo optional * try_get_with_by_ref instead of try_get_with * anyhow ensure. and try_get_with_as_ref isn't actually needed * fix feature flags * more refs and less clone * automatic retry for eth_getTransactionReceipt and eth_getTransactionByHash thanks for the report Lefteris @ Rotki * ArcSwap for provider * set archive_request to true on transaction retrying * merge durable stats * Revert "ArcSwap for provider" This reverts commit 166d77f204cde9fa7722c0cefecbb27008749d47. * comments * less clones * more refs * fix test * add optional mimalloc feature * remove stale dependency * sort * cargo upgrade * lint constants * add todo * another todo * lint * anyhow::ensure instead of panic * allow rpc_accounting_v2 entries for requests without an rpc key 2023-05-13 01:15:32 +03:00			`.get_with_by_ref(&key, async move {`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`// we do not use the try operator here because we want to be okay with redis errors`
			`let redis_count = match rrl`
login needs its own rate limiter 2022-09-24 06:59:21 +03:00			`.throttle_label(&redis_key, Some(max_requests_per_period), count)`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`.await`
			`{`
no need for an atomic bool 2022-09-20 01:17:24 +03:00			`Ok(RedisRateLimitResult::Allowed(count)) => {`
no idle timeout or max lifetime 2022-09-20 01:41:53 +03:00			`let _ = deferred_rate_limit_result`
no need for an atomic bool 2022-09-20 01:17:24 +03:00			`.lock()`
			`.await`
			`.insert(DeferredRateLimitResult::Allowed);`
			`count`
			`}`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`Ok(RedisRateLimitResult::RetryAt(retry_at, count)) => {`
no idle timeout or max lifetime 2022-09-20 01:41:53 +03:00			`let _ = deferred_rate_limit_result`
no need for an atomic bool 2022-09-20 01:17:24 +03:00			`.lock()`
			`.await`
			`.insert(DeferredRateLimitResult::RetryAt(retry_at));`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`count`
			`}`
no need for an atomic bool 2022-09-20 01:17:24 +03:00			`Ok(RedisRateLimitResult::RetryNever) => {`
Bryan devel 2023-05-12 (#67) * add minor todo * BadRequest instead of web3_context * more bad request error codes * use tokio-uring for the tcp listener * clear block instead of panic * clone earlier * more watch channels instead of rwlocks * drop uring for now (its single threaded) and combine get/post/put routes * clean up iter vs into_iter and unnecessary collect * arcswap instead of rwlock for Web3Rpcs.by_name * cargo upgrade * uuid fast-rng and alphabetize * if protected rpcs, only use protected rpcs * listenfd * make connectinfo optional * try_get_with_by_ref instead of try_get_with * anyhow ensure. and try_get_with_as_ref isn't actually needed * fix feature flags * more refs and less clone * automatic retry for eth_getTransactionReceipt and eth_getTransactionByHash thanks for the report Lefteris @ Rotki * ArcSwap for provider * set archive_request to true on transaction retrying * merge durable stats * Revert "ArcSwap for provider" This reverts commit 166d77f204cde9fa7722c0cefecbb27008749d47. * comments * less clones * more refs * fix test * add optional mimalloc feature * remove stale dependency * sort * cargo upgrade * lint constants * add todo * another todo * lint * anyhow::ensure instead of panic * allow rpc_accounting_v2 entries for requests without an rpc key 2023-05-13 01:15:32 +03:00			`unreachable!();`
no need for an atomic bool 2022-09-20 01:17:24 +03:00			`}`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`Err(err) => {`
no idle timeout or max lifetime 2022-09-20 01:41:53 +03:00			`let _ = deferred_rate_limit_result`
no need for an atomic bool 2022-09-20 01:17:24 +03:00			`.lock()`
			`.await`
			`.insert(DeferredRateLimitResult::Allowed);`

			`// if we get a redis error, just let the user through.`
			`// if users are sticky on a server, local caches will work well enough`
			`// though now that we do this, we need to reset rate limits every minute! cache must have ttl!`
cut out tracing for now 2022-11-12 11:24:32 +03:00			`error!("unable to rate limit! creating empty cache. err={:?}", err);`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`0`
			`}`
			`};`

			`Arc::new(AtomicU64::new(redis_count))`
work in progress 2022-09-15 20:57:24 +03:00			`})`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`.await`
work in progress 2022-09-15 20:57:24 +03:00			`};`

instrument more. add max_wait to wait_for_request_handle 2022-09-20 09:00:27 +03:00			`let mut locked = deferred_rate_limit_result.lock().await;`
no need for an atomic bool 2022-09-20 01:17:24 +03:00
			`if let Some(deferred_rate_limit_result) = locked.take() {`
it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00			`// new entry. redis was already incremented`
			`// return the retry_at that we got from`
no need for an atomic bool 2022-09-20 01:17:24 +03:00			`Ok(deferred_rate_limit_result)`
work in progress 2022-09-15 20:57:24 +03:00			`} else {`
			`// we have a cached amount here`
no idle timeout or max lifetime 2022-09-20 01:41:53 +03:00			`let cached_key_count = local_key_count.fetch_add(count, Ordering::Acquire);`
it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00
			`// assuming no other parallel futures incremented this key, this is the count that redis has`
			`let expected_key_count = cached_key_count + count;`

login needs its own rate limiter 2022-09-24 06:59:21 +03:00			`if expected_key_count > max_requests_per_period {`
it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00			`// rate limit overshot!`
			`let now = self.rrl.now_as_secs();`

			`// do not fetch_sub`
			`// another row might have queued a redis throttle_label to keep our count accurate`
work in progress 2022-09-15 20:57:24 +03:00
it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00			`// show that we are rate limited without even querying redis`
			`let retry_at = self.rrl.next_period(now);`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`Ok(DeferredRateLimitResult::RetryAt(retry_at))`
it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00			`} else {`
			`// local caches think rate limit should be okay`

need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`// prepare a future to update redis`
			`let rate_limit_f = {`
it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00			`let rrl = self.rrl.clone();`
			`async move {`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`match rrl`
login needs its own rate limiter 2022-09-24 06:59:21 +03:00			`.throttle_label(&redis_key, Some(max_requests_per_period), count)`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`.await`
			`{`
			`Ok(RedisRateLimitResult::Allowed(count)) => {`
no idle timeout or max lifetime 2022-09-20 01:41:53 +03:00			`local_key_count.store(count, Ordering::Release);`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`DeferredRateLimitResult::Allowed`
			`}`
			`Ok(RedisRateLimitResult::RetryAt(retry_at, count)) => {`
no idle timeout or max lifetime 2022-09-20 01:41:53 +03:00			`local_key_count.store(count, Ordering::Release);`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`DeferredRateLimitResult::RetryAt(retry_at)`
			`}`
			`Ok(RedisRateLimitResult::RetryNever) => {`
			`// TODO: what should we do to arc_key_count?`
			`DeferredRateLimitResult::RetryNever`
			`}`
			`Err(err) => {`
			`// don't let redis errors block our users!`
			`error!(`
cut out tracing for now 2022-11-12 11:24:32 +03:00			`"unable to query rate limits, but local cache is available. key={:?} err={:?}",`
			`key,`
			`err,`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`);`
			`// TODO: we need to start a timer that resets this count every minute`
			`DeferredRateLimitResult::Allowed`
			`}`
			`}`
it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00			`}`
work in progress 2022-09-15 20:57:24 +03:00			`};`

it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00			`// if close to max_per_period, wait for redis`
			`// TODO: how close should we allow? depends on max expected concurent requests from one user`
fix overflow 2022-12-24 03:14:10 +03:00			`let limit: f64 = (max_requests_per_period as f64 * 0.99)`
			`.min(max_requests_per_period as f64 - 1.0);`
			`if expected_key_count > limit as u64 {`
it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00			`// close to period. don't risk it. wait on redis`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00			`Ok(rate_limit_f.await)`
it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00			`} else {`
			`// rate limit has enough headroom that it should be safe to do this in the background`
salted recent ip tracking 2022-12-28 09:11:18 +03:00			`// TODO: send an error here somewhere`
cut out tracing for now 2022-11-12 11:24:32 +03:00			`tokio::spawn(rate_limit_f);`
need a mutex, not a cell 2022-09-17 04:06:10 +03:00
			`Ok(DeferredRateLimitResult::Allowed)`
it compiles, but theres something wrong with moves 2022-09-17 02:02:55 +03:00			`}`
			`}`
work in progress 2022-09-15 20:57:24 +03:00			`}`
			`}`
			`}`