atomic and fewer expires

Bryan Stitt 2022-09-20 06:56:24 +00:00
parent 739947792a
commit abe5001792
9 changed files with 42 additions and 86 deletions

View File

@@ -79,7 +79,7 @@ where
.get_with(key, async move {
// we do not use the try operator here because we want to be okay with redis errors
let redis_count = match rrl
.throttle_label(&redis_key, Some(max_per_period), count)
.throttle_label(&redis_key, Some(max_per_period), count, true)
.await
{
Ok(RedisRateLimitResult::Allowed(count)) => {
@@ -149,7 +149,7 @@ where
let rrl = self.rrl.clone();
async move {
match rrl
.throttle_label(&redis_key, Some(max_per_period), count)
.throttle_label(&redis_key, Some(max_per_period), count, false)
.await
{
Ok(RedisRateLimitResult::Allowed(count)) => {
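
Both call sites above now pass the new final flag to throttle_label: the first, inside the get_with cache-population closure, passes true so the throttle key also gets its TTL refreshed, while the second passes false and skips the extra EXPIRE. A minimal sketch of the updated call shape, assuming the RedisRateLimitResult::Allowed variant shown above; check_rate_limit and the wildcard arm are illustrative, since the enum's other variants are not visible in this hunk.

// illustrative sketch, not code from this commit
async fn check_rate_limit(
    rrl: &RedisRateLimiter,
    redis_key: &str,
    max_per_period: u64,
    count: u64,
    set_expire: bool,
) -> anyhow::Result<bool> {
    match rrl
        .throttle_label(redis_key, Some(max_per_period), count, set_expire)
        .await
    {
        Ok(RedisRateLimitResult::Allowed(_count)) => Ok(true),
        // any other rate limit result means the caller should back off
        Ok(_) => Ok(false),
        // as noted above, some callers tolerate redis errors instead of propagating them
        Err(err) => Err(err),
    }
}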

View File

@@ -1,5 +1,6 @@
//#![warn(missing_docs)]
use anyhow::Context;
use deadpool_redis::redis::AsyncCommands;
use std::ops::Add;
use std::time::{SystemTime, UNIX_EPOCH};
use tokio::time::{Duration, Instant};
@@ -71,6 +72,7 @@ impl RedisRateLimiter {
label: &str,
max_per_period: Option<u64>,
count: u64,
expire: bool,
) -> anyhow::Result<RedisRateLimitResult> {
let max_per_period = max_per_period.unwrap_or(self.max_requests_per_period);
@@ -92,23 +94,32 @@ impl RedisRateLimiter {
.await
.context("get redis connection for rate limits")?;
// TODO: at high concurrency, i think this is giving errors
// TODO: i'm starting to think that bb8 has a bug
let x: Vec<u64> = redis::pipe()
// we could get the key first, but that means an extra redis call for every check. this seems better
.incr(&throttle_key, count)
// set expiration each time we set the key. ignore the result
.expire(&throttle_key, self.period as usize)
// TODO: NX will make it only set the expiration the first time. works in redis, but not elasticache
// .arg("NX")
.ignore()
// do the query
.query_async(&mut *conn)
.await
.context("increment rate limit")?;
// TODO: at high concurrency, this gives "connection reset by peer" errors. at least they are off the hot path
// TODO: only set expire if this is a new key
let new_count: u64 = if expire {
trace!("redis incr+expire");
// TODO: automatic retry
let x: Vec<_> = redis::pipe()
.atomic()
// we could get the key first, but that means an extra redis call for every check. this seems better
.incr(&throttle_key, count)
// set expiration each time we set the key. ignore the result
.expire(&throttle_key, 1 + self.period as usize)
// TODO: NX will make it only set the expiration the first time. works in redis, but not elasticache
// .arg("NX")
.ignore()
// do the query
.query_async(&mut *conn)
.await
.context("increment rate limit and set expiration")?;
// TODO: is there a better way to do this?
let new_count = *x.first().context("check rate limit result")?;
*x.first().expect("check redis")
} else {
trace!("redis incr only");
conn.incr(&throttle_key, count)
.await
.context("increment rate limit")?
};
if new_count > max_per_period {
// TODO: this might actually be early if we are way over the count
@@ -124,7 +135,7 @@ impl RedisRateLimiter {
}
#[inline]
pub async fn throttle(&self) -> anyhow::Result<RedisRateLimitResult> {
self.throttle_label("", None, 1).await
pub async fn throttle(&self, expire: bool) -> anyhow::Result<RedisRateLimitResult> {
self.throttle_label("", None, 1, expire).await
}
}
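
The hunk above is the core of the commit: when expire is true, .atomic() wraps the INCR and EXPIRE in a MULTI/EXEC transaction so the counter key can never be left without a TTL, and when it is false a bare INCR avoids the second command entirely. A self-contained sketch of the same two paths, assuming the 2022-era redis-rs async API used here (pipe/atomic, with expire taking seconds as usize); the function name and connection type are illustrative.

// illustrative sketch, not code from this commit
use redis::AsyncCommands;

async fn incr_with_optional_expire(
    conn: &mut redis::aio::MultiplexedConnection,
    key: &str,
    count: u64,
    period_secs: usize,
    expire: bool,
) -> redis::RedisResult<u64> {
    if expire {
        // MULTI/EXEC: the INCR and EXPIRE apply together, so a failure between the two
        // commands cannot leave an incremented key with no TTL
        let (new_count,): (u64,) = redis::pipe()
            .atomic()
            .incr(key, count)
            // pad the TTL by one second so the final period is not cut short
            .expire(key, 1 + period_secs)
            .ignore()
            .query_async(conn)
            .await?;
        Ok(new_count)
    } else {
        // hot path: a single INCR and no extra EXPIRE round trip
        conn.incr(key, count).await
    }
}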

View File

@@ -220,7 +220,6 @@ impl Web3ProxyApp {
.unwrap_or(num_workers * 2);
// TODO: what are reasonable timeouts?
// TODO: set a wait timeout? maybe somehow just emit a warning if this is long
let redis_pool = RedisConfig::from_url(redis_url)
.builder()?
.max_size(redis_max_connections)
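
This hunk only removes a line next to the pool construction, but for context, here is a hedged sketch of the builder chain it touches, assuming deadpool-redis's Config/builder API of this era; the wait-timeout value is illustrative and shows one way to address the TODO above, not something this commit does.

// illustrative sketch, not code from this commit
use deadpool_redis::{Config as RedisConfig, Runtime};
use std::time::Duration;

fn build_redis_pool(
    redis_url: &str,
    redis_max_connections: usize,
) -> anyhow::Result<deadpool_redis::Pool> {
    let redis_pool = RedisConfig::from_url(redis_url)
        .builder()?
        .max_size(redis_max_connections)
        // a wait timeout keeps callers from blocking forever on a saturated pool
        .wait_timeout(Some(Duration::from_secs(3)))
        // builder timeouts need an async runtime to be set
        .runtime(Runtime::Tokio1)
        .build()?;

    Ok(redis_pool)
}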

View File

@@ -18,7 +18,7 @@ use tracing::{debug, info};
use tracing_subscriber::EnvFilter;
use web3_proxy::app::{flatten_handle, Web3ProxyApp};
use web3_proxy::config::{CliConfig, TopConfig};
use web3_proxy::{frontend, metrics};
use web3_proxy::{frontend, metrics_frontend};
fn run(
shutdown_receiver: flume::Receiver<()>,
@@ -75,7 +75,7 @@ fn run(
let frontend_handle = tokio::spawn(frontend::serve(app_frontend_port, app.clone()));
let prometheus_handle = tokio::spawn(metrics::serve(app, app_prometheus_port));
let prometheus_handle = tokio::spawn(metrics_frontend::serve(app, app_prometheus_port));
// if everything is working, these should both run forever
// TODO: try_join these instead? use signal_shutdown here?
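
The TODO above asks about try_join. A minimal, self-contained sketch of that idea, assuming both handles are JoinHandle<anyhow::Result<()>>; the local flatten helper is illustrative, and the repo's own web3_proxy::app::flatten_handle (whose body is not shown in this diff) presumably plays the same role.

// illustrative sketch, not code from this commit
use tokio::task::JoinHandle;

async fn flatten(handle: JoinHandle<anyhow::Result<()>>) -> anyhow::Result<()> {
    match handle.await {
        // the task ran; propagate its own result
        Ok(res) => res,
        // the task panicked or was cancelled
        Err(join_err) => Err(join_err.into()),
    }
}

async fn wait_for_servers(
    frontend: JoinHandle<anyhow::Result<()>>,
    prometheus: JoinHandle<anyhow::Result<()>>,
) -> anyhow::Result<()> {
    // returns as soon as either task errors; otherwise waits for both
    tokio::try_join!(flatten(frontend), flatten(prometheus))?;
    Ok(())
}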

View File

@@ -83,12 +83,15 @@ impl Web3ProxyApp {
}
}
// check the local cache for user data, or query the database
pub(crate) async fn user_data(&self, user_key: Uuid) -> anyhow::Result<UserCacheValue> {
let db = self.db_conn.as_ref().context("no database")?;
let user_data: Result<_, Arc<anyhow::Error>> = self
.user_cache
.try_get_with(user_key, async move {
trace!(?user_key, "user_cache miss");
/// helper enum for querying just a few columns instead of the entire table
#[derive(Copy, Clone, Debug, EnumIter, DeriveColumn)]
enum QueryAs {
@@ -130,7 +133,6 @@ impl Web3ProxyApp {
}
pub async fn rate_limit_by_key(&self, user_key: Uuid) -> anyhow::Result<RateLimitResult> {
// check the local cache for user data to save a database query
let user_data = self.user_data(user_key).await?;
if user_data.user_id == 0 {
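
The new user_data path above populates a per-key cache and only queries the database on a miss, surfacing load errors as Arc<anyhow::Error>. A hedged sketch of that cache-or-load pattern, assuming a moka-style async cache (whose try_get_with matches the Arc-wrapped error type in the diff) and the uuid crate's Uuid for the key; UserCacheValue's fields and load_from_db are illustrative stand-ins.

// illustrative sketch, not code from this commit
use moka::future::Cache;
use std::sync::Arc;
use uuid::Uuid;

#[derive(Clone)]
struct UserCacheValue {
    user_id: u64,
}

// stand-in for the real database query
async fn load_from_db(_user_key: Uuid) -> anyhow::Result<UserCacheValue> {
    Ok(UserCacheValue { user_id: 0 })
}

async fn cached_user_data(
    cache: &Cache<Uuid, UserCacheValue>,
    user_key: Uuid,
) -> anyhow::Result<UserCacheValue> {
    // try_get_with runs the loader only on a miss and deduplicates concurrent
    // lookups for the same key; the loader's error comes back wrapped in an Arc
    cache
        .try_get_with(user_key, load_from_db(user_key))
        .await
        .map_err(|err: Arc<anyhow::Error>| anyhow::anyhow!("user_cache error: {}", err))
}

Building the cache itself (capacity, time-to-live) happens during app setup and is not part of this hunk.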

View File

@@ -3,6 +3,7 @@ pub mod block_number;
pub mod config;
pub mod frontend;
pub mod jsonrpc;
pub mod metrics;
pub mod metered;
pub mod metrics_frontend;
pub mod rpcs;
pub mod users;

View File

@@ -1,58 +0,0 @@
use axum::headers::HeaderName;
use axum::http::HeaderValue;
use axum::response::{IntoResponse, Response};
use axum::{routing::get, Extension, Router};
use std::net::SocketAddr;
use std::sync::Arc;
use tracing::{info, instrument};
use crate::app::Web3ProxyApp;
/// Run a prometheus metrics server on the given port.
#[instrument(skip_all)]
pub async fn serve(app: Arc<Web3ProxyApp>, port: u16) -> anyhow::Result<()> {
// build our application with a route
// order most to least common
// TODO: 404 any unhandled routes?
let app = Router::new().route("/", get(root)).layer(Extension(app));
// run our app with hyper
// TODO: allow only listening on localhost?
let addr = SocketAddr::from(([0, 0, 0, 0], port));
info!("prometheus listening on port {}", port);
// TODO: into_make_service is enough if we always run behind a proxy. make into_make_service_with_connect_info optional?
/*
It sequentially looks for an IP in:
- x-forwarded-for header (de-facto standard)
- x-real-ip header
- forwarded header (new standard)
- axum::extract::ConnectInfo (if not behind proxy)
So we probably won't need into_make_service_with_connect_info, but it shouldn't hurt
*/
let service = app.into_make_service_with_connect_info::<SocketAddr>();
// let service = app.into_make_service();
// `axum::Server` is a re-export of `hyper::Server`
axum::Server::bind(&addr)
// TODO: option to use with_connect_info. we want it in dev, but not when running behind a proxy
.serve(service)
.await
.map_err(Into::into)
}
#[instrument(skip_all)]
async fn root(Extension(app): Extension<Arc<Web3ProxyApp>>) -> Response {
let serialized = app.prometheus_metrics();
let mut r = serialized.into_response();
// TODO: is there an easier way to do this?
r.headers_mut().insert(
HeaderName::from_static("content-type"),
HeaderValue::from_static("application/openmetrics-text; version=1.0.0; charset=utf-8"),
);
r
}

View File

@@ -766,7 +766,8 @@ impl Web3Connection {
// check rate limits
if let Some(ratelimiter) = self.hard_limit.as_ref() {
match ratelimiter.throttle().await? {
// TODO: how should we know if we should set expire or not?
match ratelimiter.throttle(true).await? {
RedisRateLimitResult::Allowed(_) => {
trace!("rate limit succeeded")
}

View File

@@ -1,8 +1,8 @@
use super::connection::Web3Connection;
use super::provider::Web3Provider;
use crate::metered::{JsonRpcErrorCount, ProviderErrorCount};
use ethers::providers::ProviderError;
use metered::metered;
use metered::ErrorCount;
use metered::HitCount;
use metered::ResponseTime;
use metered::Throughput;
@@ -65,7 +65,7 @@ impl OpenRequestHandle {
/// TODO: we no longer take self because metered doesn't like that
/// TODO: ErrorCount includes too many types of errors, such as transaction reverts
#[instrument(skip_all)]
#[measure([ErrorCount, HitCount, ResponseTime, Throughput])]
#[measure([JsonRpcErrorCount, HitCount, ProviderErrorCount, ResponseTime, Throughput])]
pub async fn request<T, R>(
&self,
method: &str,
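
The measure list above swaps the stock ErrorCount for the crate-local JsonRpcErrorCount and ProviderErrorCount so that, per the doc comment, errors like transaction reverts no longer inflate the error metrics. For orientation, a hedged sketch of the metered crate's documented registry/measure pattern that this builds on; Example and ExampleMetrics are illustrative names, and the custom counters themselves live in crate::metered and are not shown in this diff.

// illustrative sketch, not code from this commit
use metered::{metered, HitCount, ResponseTime};

#[derive(Default)]
pub struct Example {
    // the registry type is generated by the #[metered] macro below
    metrics: ExampleMetrics,
}

#[metered(registry = ExampleMetrics)]
impl Example {
    // every metric listed here is updated automatically around each call
    #[measure([HitCount, ResponseTime])]
    pub async fn do_work(&self) {
        // measured body
    }
}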