ingested changes from upstream main. added another cli admin endpoint

yenicelik 2023-01-29 23:27:34 +01:00
commit 04687b3392
45 changed files with 4625 additions and 2142 deletions

Cargo.lock (generated): 1446 lines changed. Diff suppressed because it is too large.


@ -9,9 +9,7 @@ members = [
]
[profile.release]
# we leave debug = true on so that sentry can give us line numbers
# `debug = true` so that sentry can give us line numbers
debug = true
# TODO: enable lto (and maybe other things proven with benchmarks) once rapid development is done
#lto = true
# TODO: we can't do panic = abort because the websockets disconnect by panicking sometimes
# spend longer compiling for a slightly faster binary
codegen-units = 1


@ -8,12 +8,14 @@ COPY . .
RUN --mount=type=cache,target=/usr/local/cargo/registry \
--mount=type=cache,target=/usr/src/web3_proxy/target \
cargo test &&\
cargo install --locked --root /opt/bin --path ./web3_proxy
cargo install --locked --no-default-features --root /opt/bin --path ./web3_proxy
FROM debian:bullseye-slim
COPY --from=builder /opt/bin/* /usr/local/bin/
ENTRYPOINT ["web3_proxy"]
ENTRYPOINT ["web3_proxy_cli"]
CMD [ "--config", "/web3-proxy.toml", "proxyd" ]
# TODO: lower log level when done with prototyping
ENV RUST_LOG "web3_proxy=debug"
ENV RUST_LOG "warn,web3_proxy=debug,web3_proxy_cli=debug"


@ -37,7 +37,7 @@ Options:
Start the server with the defaults (listen on `http://localhost:8544` and use `./config/development.toml`, which uses the database and cache running under docker and proxies to a bunch of public nodes):
```
cargo run --release
cargo run --release -- daemon
```
## Common commands
@ -45,7 +45,7 @@ cargo run --release
Create a user:
```
cargo run --bin web3_proxy_cli -- --db-url "$YOUR_DB_URL" create_user --address "$USER_ADDRESS_0x"
cargo run -- --db-url "$YOUR_DB_URL" create_user --address "$USER_ADDRESS_0x"
```
Check that the proxy is working:
@ -104,7 +104,7 @@ web3_proxy_cli --config ... change_user_tier_by_key "$RPC_ULID_KEY_FROM_PREV_COM
Health check 3 servers and error if the first one doesn't match the others.
```
web3_proxy_cli https://eth.llamarpc.com/ https://rpc.ankr.com/eth https://cloudflare-eth.com
web3_proxy_cli health_compass https://eth.llamarpc.com/ https://rpc.ankr.com/eth https://cloudflare-eth.com
```
## Adding new database tables

TODO.md (32 lines changed)

@ -300,6 +300,32 @@ These are not yet ordered. There might be duplicates. We might not actually need
- [x] if private txs are disabled, only send transactions to some of our servers. we were DOSing ourselves with transactions and slowing down sync
- [x] retry if we get "the method X is not available"
- [x] remove weight. we don't use it anymore. tiers are what we use now
- [x] make deadlock feature optional
- [x] standalone healthcheck daemon (sentryd)
- [x] status page should show version
- [x] combine the proxy and cli into one bin
- [x] improve rate limiting on websockets
- [x] retry another server if we get a jsonrpc response error about rate limits
- [x] major refactor to only use backup servers when absolutely necessary
- [x] remove allowed lag
- [x] configurable gas buffer. default to the larger of 25k or 25% on polygon to work around erigon bug
- [x] public is 3900, but free is 360. free should be at least 3900 but probably more
- [x] add --max-wait to wait_for_sync
- [x] add automatic compare urls to wait_for_sync
- [x] send panics to pagerduty
- [x] enable lto on release builds
- [x] less logs for backup servers
- [x] use channels instead of arcswap
- this will let us easily wait for a new head or a new synced connection (see the watch-channel sketch after this list)
- [x] broadcast transactions to more servers
- [x] send sentryd errors to pagerduty
- [x] improve handling of unknown methods
- [x] don't send pagerduty alerts for websocket panics
- [x] improve waiting for sync when rate limited
- [x] improve pager duty errors for smarter deduping
- [x] add create_key cli command
- [-] proxy mode for benchmarking all backends
- [-] proxy mode for sending to multiple backends
- [-] let users choose a % of reverts to log (or maybe x/second). someone like curve logging all reverts will be a BIG database very quickly
- this must be opt-in and spawned in the background since it will slow things down and will make their calls less private
- [ ] automatic pruning of old revert logs once too many are collected
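A minimal sketch of the watch-channel pattern the arcswap item above describes, using tokio's `watch` API. The names and values are illustrative only; the proxy itself watches an `ArcBlock` (see `watch_consensus_head_receiver` later in this diff), not a bare number.

```rust
use std::time::Duration;
use tokio::sync::watch;

#[tokio::main]
async fn main() {
    // the sender stores the latest head; receivers are cheap to clone
    let (head_tx, mut head_rx) = watch::channel(0u64);

    // producer: pretend a new consensus head arrives once a second
    tokio::spawn(async move {
        for block_num in 1u64..=3 {
            tokio::time::sleep(Duration::from_secs(1)).await;
            // send() replaces the stored value and wakes every waiting receiver
            let _ = head_tx.send(block_num);
        }
        // head_tx drops here, which ends the consumer loop below
    });

    // consumer: await the next change instead of polling an ArcSwap
    while head_rx.changed().await.is_ok() {
        println!("new head: {}", *head_rx.borrow());
    }
}
```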
@ -323,7 +349,7 @@ These are not yet ordered. There might be duplicates. We might not actually need
- [ ] `stat delay` script
- query database for newest stat
- [ ] period_datetime should always be :00. right now it depends on start time
- [ ] two servers running will confuse rpc_accounting!
- [ ] we have our hard rate limiter set up with a period of 60. but most providers have a period of 1
- [ ] two servers running will confuse rpc_accounting!
- it won't happen with users often because they should be sticky to one proxy, but unauthenticated users will definitely hit this
- one option: we need the insert to be an upsert, but how do we merge histograms?
- [ ] don't use systemtime. use chrono
@ -508,7 +534,8 @@ in another repo: event subscriber
- [ ] if the call is something simple like "symbol" or "decimals", cache that too. though i think this could bite us.
- [ ] add a subscription that returns the head block number and hash but nothing else
- [ ] if chain split detected, what should we do? don't send transactions?
- [ ] archive check works well for local servers, but public nodes (especially on other chains) seem to give unreliable results. likely because of load balancers. maybe have a "max block data limit"
- [ ] archive check works well for local servers, but public nodes (especially on other chains) seem to give unreliable results. likely because of load balancers.
- [x] configurable block data limit until better checks
- [ ] https://docs.rs/derive_builder/latest/derive_builder/
- [ ] Detect orphaned transactions
- [ ] https://crates.io/crates/reqwest-middleware easy retry with exponential back off
@ -578,7 +605,6 @@ in another repo: event subscriber
- [ ] sentry profiling
- [ ] support alchemy_minedTransactions
- [ ] debug print of user::Model's address is a big vec of numbers. make that hex somehow
- [ ] should we combine the proxy and cli into one bin?
- [ ] make it so you can put a string like "LN arbitrum" into the create_user script, and have it automatically turn it into 0x4c4e20617262697472756d000000000000000000.
- [ ] if --address not given, use the --description
- [ ] if it is too long (the last 4 bytes must be zero), give an error so descriptions like this stand out
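As a rough illustration of the conversion described in the item above (plain ASCII input, std only; the function name is made up for this sketch, and this is not the `create_key` code shown later in this diff, which left-pads instead):

```rust
/// Hex-encode a short ASCII label and right-pad with zero bytes to 20 bytes.
fn label_to_address_bytes(label: &str) -> [u8; 20] {
    let mut out = [0u8; 20];
    let bytes = label.as_bytes();
    let n = bytes.len().min(20); // truncate anything longer than 20 bytes
    out[..n].copy_from_slice(&bytes[..n]);
    out
}

fn main() {
    let addr = label_to_address_bytes("LN arbitrum");
    let hex: String = addr.iter().map(|b| format!("{:02x}", b)).collect();
    // prints 0x4c4e20617262697472756d000000000000000000
    println!("0x{}", hex);
}
```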


@ -11,4 +11,4 @@ anyhow = "1.0.68"
hashbrown = "0.13.2"
log = "0.4.17"
moka = { version = "0.9.6", default-features = false, features = ["future"] }
tokio = "1.24.1"
tokio = "1.24.2"


@ -4,7 +4,7 @@ services:
build: .
init: true
restart: unless-stopped
command: --config /config.toml --workers 16
command: --config /config.toml --workers 16 proxyd
# rust's tokio crate expects a SIGINT https://tokio.rs/tokio/topics/shutdown
stop_signal: SIGINT
environment:


@ -68,7 +68,7 @@ services:
extends:
file: docker-compose.common.yml
service: web3-proxy
command: --config /config.toml --workers 48
command: --config /config.toml --workers 48 proxyd
volumes:
- ./config/production-eth.toml:/config.toml
- ./data/scratch:/scratch


@ -23,6 +23,22 @@ services:
volumes:
- ./data/dev_mysql:/var/lib/mysql
# influxdb for stats
dev-influxdb:
image: influxdb:2.6.1-alpine
environment:
DOCKER_INFLUXDB_INIT_MODE: setup
DOCKER_INFLUXDB_INIT_USERNAME: dev_web3_proxy
DOCKER_INFLUXDB_INIT_PASSWORD: dev_web3_proxy
DOCKER_INFLUXDB_INIT_ORG: dev_org
DOCKER_INFLUXDB_INIT_BUCKET: dev_web3_proxy
DOCKER_INFLUXDB_INIT_ADMIN_TOKEN: dev_web3_proxy_auth_token
ports:
- 127.0.0.1:8086:8086
volumes:
- ./data/dev_influxdb/data:/var/lib/influxdb2
- ./data/dev_influxdb/config:/etc/influxdb2
# volatile redis for storing rate limits
dev-vredis:
extends:


@ -1,6 +1,6 @@
[package]
name = "entities"
version = "0.12.0"
version = "0.13.0"
edition = "2021"
[lib]
@ -10,7 +10,7 @@ path = "src/mod.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
sea-orm = "0.10.6"
sea-orm = "0.10.7"
serde = "1.0.152"
uuid = "1.2.2"
ethers = "1.0.2"


@ -1,6 +1,6 @@
[package]
name = "migration"
version = "0.12.0"
version = "0.13.0"
edition = "2021"
publish = false
@ -9,10 +9,10 @@ name = "migration"
path = "src/lib.rs"
[dependencies]
tokio = { version = "1.24.1", features = ["full", "tracing"] }
tokio = { version = "1.24.2", features = ["full", "tracing"] }
[dependencies.sea-orm-migration]
version = "0.10.6"
version = "0.10.7"
features = [
# Enable at least one `ASYNC_RUNTIME` and `DATABASE_DRIVER` feature if you want to run migration via CLI.
# View the list of supported features at https://www.sea-ql.org/SeaORM/docs/install-and-config/database-and-async-runtime.


@ -2,7 +2,7 @@
- Generate a new migration file
```sh
cargo run -- migrate generate MIGRATION_NAME
cargo run -- generate MIGRATION_NAME
```
- Apply all pending migrations
```sh


@ -13,6 +13,7 @@ mod m20221108_200345_save_anon_stats;
mod m20221211_124002_request_method_privacy;
mod m20221213_134158_move_login_into_database;
mod m20230117_191358_admin_table;
mod m20230119_204135_better_free_tier;
pub struct Migrator;
@ -33,6 +34,7 @@ impl MigratorTrait for Migrator {
Box::new(m20221211_124002_request_method_privacy::Migration),
Box::new(m20221213_134158_move_login_into_database::Migration),
Box::new(m20230117_191358_admin_table::Migration),
Box::new(m20230119_204135_better_free_tier::Migration),
]
}
}


@ -0,0 +1,39 @@
//! Increase requests per minute for the free tier to be better than our public tier (which has 3900/min)
use sea_orm_migration::{prelude::*, sea_orm::ConnectionTrait};
#[derive(DeriveMigrationName)]
pub struct Migration;
#[async_trait::async_trait]
impl MigrationTrait for Migration {
async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> {
let db_conn = manager.get_connection();
let db_backend = manager.get_database_backend();
let update_free = Query::update()
.table(UserTier::Table)
.value(UserTier::MaxRequestsPerPeriod, 6000)
.and_where(Expr::col(UserTier::Title).eq("Free"))
.limit(1)
.to_owned();
let x = db_backend.build(&update_free);
let rows_affected = db_conn.execute(x).await?.rows_affected();
assert_eq!(rows_affected, 1, "unable to update free tier");
Ok(())
}
async fn down(&self, _manager: &SchemaManager) -> Result<(), DbErr> {
todo!();
}
}
#[derive(Iden)]
enum UserTier {
Table,
Title,
MaxRequestsPerPeriod,
}


@ -7,4 +7,4 @@ edition = "2021"
[dependencies]
anyhow = "1.0.68"
deadpool-redis = { version = "0.11.1", features = ["rt_tokio_1", "serde"] }
tokio = "1.24.1"
tokio = "1.24.2"


@ -1,8 +1,8 @@
[package]
name = "web3_proxy"
version = "0.12.0"
version = "0.13.0"
edition = "2021"
default-run = "web3_proxy"
default-run = "web3_proxy_cli"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@ -19,55 +19,57 @@ migration = { path = "../migration" }
redis-rate-limiter = { path = "../redis-rate-limiter" }
thread-fast-rng = { path = "../thread-fast-rng" }
anyhow = { version = "1.0.68", features = ["backtrace"] }
arc-swap = "1.6.0"
argh = "0.1.10"
axum = { version = "0.6.2", features = ["headers", "ws"] }
axum-client-ip = "0.3.1"
axum-macros = "0.3.1"
# TODO: regex has several "perf" features that we might want to use
# TODO: make sure this uuid version matches sea-orm. PR to put this in their prelude
# TODO: import num_traits from sea-orm so we always have the same version
# TODO: import chrono from sea-orm so we always have the same version
# TODO: make sure this time version matches siwe. PR to put this in their prelude
anyhow = { version = "1.0.68", features = ["backtrace"] }
argh = "0.1.10"
axum = { version = "0.6.4", features = ["headers", "ws"] }
axum-client-ip = "0.3.1"
axum-macros = "0.3.2"
chrono = "0.4.23"
counter = "0.5.7"
derive_more = "0.99.17"
dotenv = "0.15.0"
ethers = { version = "1.0.2", default-features = false, features = ["rustls", "ws"] }
env_logger = "0.10.0"
ethers = { version = "1.0.2", default-features = false, features = ["rustls", "ws"] }
fdlimit = "0.2.1"
flume = "0.10.14"
futures = { version = "0.3.25", features = ["thread-pool"] }
gethostname = "0.4.1"
glob = "0.3.1"
handlebars = "4.3.6"
hashbrown = { version = "0.13.2", features = ["serde"] }
hdrhistogram = "7.5.2"
http = "0.2.8"
ipnet = "2.7.1"
itertools = "0.10.5"
log = "0.4.17"
metered = { version = "0.9.0", features = ["serialize"] }
moka = { version = "0.9.6", default-features = false, features = ["future"] }
notify = "5.0.0"
num = "0.4.0"
# TODO: import num_traits from sea-orm so we always have the same version
num-traits = "0.2.15"
pagerduty-rs = { version = "0.1.6", default-features = false, features = ["async", "rustls", "sync"] }
parking_lot = { version = "0.12.1", features = ["arc_lock"] }
proctitle = "0.1.1"
# TODO: regex has several "perf" features that we might want to use
regex = "1.7.1"
reqwest = { version = "0.11.13", default-features = false, features = ["json", "tokio-rustls"] }
handlebars = "4.3.6"
reqwest = { version = "0.11.14", default-features = false, features = ["json", "tokio-rustls"] }
rustc-hash = "1.1.0"
siwe = "0.5.0"
sentry = { version = "0.29.1", default-features = false, features = ["backtrace", "contexts", "panic", "anyhow", "reqwest", "rustls", "log", "sentry-log"] }
sentry = { version = "0.29.2", default-features = false, features = ["backtrace", "contexts", "panic", "anyhow", "reqwest", "rustls", "log", "sentry-log"] }
serde = { version = "1.0.152", features = [] }
serde_json = { version = "1.0.91", default-features = false, features = ["alloc", "raw_value"] }
serde_prometheus = "0.1.6"
# TODO: make sure this time version matches siwe. PR to put this in their prelude
siwe = "0.5.0"
time = "0.3.17"
tokio = { version = "1.24.1", features = ["full"] }
# TODO: make sure this uuid version matches sea-orm. PR to put this in their prelude
tokio = { version = "1.24.2", features = ["full"] }
tokio-stream = { version = "0.1.11", features = ["sync"] }
toml = "0.5.10"
toml = "0.6.0"
tower = "0.4.13"
tower-http = { version = "0.3.5", features = ["cors", "sensitive-headers"] }
ulid = { version = "1.0.0", features = ["serde"] }
url = "2.3.1"
uuid = "1.2.2"
itertools = "0.10.5"
glob = "0.3.1"


@ -4,8 +4,9 @@ mod ws;
use crate::app_stats::{ProxyResponseStat, StatEmitter, Web3ProxyStat};
use crate::block_number::{block_needed, BlockNeeded};
use crate::config::{AppConfig, TopConfig};
use crate::frontend::authorization::{Authorization, RequestMetadata};
use crate::frontend::authorization::{Authorization, RequestMetadata, RpcSecretKey};
use crate::frontend::errors::FrontendErrorResponse;
use crate::frontend::rpc_proxy_ws::ProxyMode;
use crate::jsonrpc::{
JsonRpcForwardedResponse, JsonRpcForwardedResponseEnum, JsonRpcRequest, JsonRpcRequestEnum,
};
@ -24,6 +25,7 @@ use entities::sea_orm_active_enums::LogLevel;
use entities::user;
use ethers::core::utils::keccak256;
use ethers::prelude::{Address, Block, Bytes, Transaction, TxHash, H256, U64};
use ethers::types::U256;
use ethers::utils::rlp::{Decodable, Rlp};
use futures::future::join_all;
use futures::stream::{FuturesUnordered, StreamExt};
@ -55,11 +57,12 @@ use tokio::time::{sleep, timeout};
use ulid::Ulid;
// TODO: make this customizable?
// TODO: include GIT_REF in here. i had trouble getting https://docs.rs/vergen/latest/vergen/ to work with a workspace. also .git is in .dockerignore
pub static APP_USER_AGENT: &str = concat!(
"satoshiandkin/",
"llamanodes_",
env!("CARGO_PKG_NAME"),
"/",
env!("CARGO_PKG_VERSION"),
"/v",
env!("CARGO_PKG_VERSION")
);
/// TODO: allow customizing the request period?
@ -134,12 +137,14 @@ pub type AnyhowJoinHandle<T> = JoinHandle<anyhow::Result<T>>;
#[derive(Clone, Debug, Default, From)]
pub struct AuthorizationChecks {
/// database id of the primary user.
/// database id of the primary user. 0 if anon
/// TODO: do we need this? its on the authorization so probably not
pub user_id: u64,
/// the key used (if any)
pub rpc_secret_key: Option<RpcSecretKey>,
/// database id of the rpc key
/// if this is None, then this request is being rate limited by ip
pub rpc_key_id: Option<NonZeroU64>,
pub rpc_secret_key_id: Option<NonZeroU64>,
/// if None, allow unlimited queries. inherited from the user_tier
pub max_requests_per_period: Option<u64>,
// if None, allow unlimited concurrent requests. inherited from the user_tier
@ -183,10 +188,9 @@ pub struct Web3ProxyApp {
response_cache: ResponseCache,
// don't drop this or the sender will stop working
// TODO: broadcast channel instead?
head_block_receiver: watch::Receiver<ArcBlock>,
watch_consensus_head_receiver: watch::Receiver<ArcBlock>,
pending_tx_sender: broadcast::Sender<TxStatus>,
pub config: AppConfig,
pub allowed_lag: u64,
pub db_conn: Option<sea_orm::DatabaseConnection>,
pub db_replica: Option<DatabaseReplica>,
/// prometheus metrics
@ -269,18 +273,14 @@ pub async fn drop_migration_lock(db_conn: &DatabaseConnection) -> Result<(), DbE
Ok(())
}
/// Connect to the database and run migrations
pub async fn get_migrated_db(
db_url: String,
min_connections: u32,
max_connections: u32,
) -> anyhow::Result<DatabaseConnection> {
// TODO: this seems to fail silently
let db_conn = get_db(db_url, min_connections, max_connections).await?;
/// Be super careful with override_existing_lock! It is very important that only one process is running the migrations at a time!
pub async fn migrate_db(
db_conn: &DatabaseConnection,
override_existing_lock: bool,
) -> Result<(), DbErr> {
let db_backend = db_conn.get_database_backend();
// TODO: put the timestamp into this?
// TODO: put the timestamp and hostname into this as columns?
let create_lock_statment = db_backend.build(
Table::create()
.table(Alias::new("migration_lock"))
@ -290,18 +290,24 @@ pub async fn get_migrated_db(
loop {
if Migrator::get_pending_migrations(&db_conn).await?.is_empty() {
info!("no migrations to apply");
return Ok(db_conn);
return Ok(());
}
// there are migrations to apply
// acquire a lock
if let Err(err) = db_conn.execute(create_lock_statment.clone()).await {
debug!("Unable to acquire lock. err={:?}", err);
if override_existing_lock {
warn!("OVERRIDING EXISTING LOCK in 10 seconds! ctrl+c now if other migrations are actually running!");
// TODO: exponential backoff with jitter
sleep(Duration::from_secs(1)).await;
sleep(Duration::from_secs(10)).await
} else {
debug!("Unable to acquire lock. if you are positive no migration is running, run \"web3_proxy_cli drop_migration_lock\". err={:?}", err);
continue;
// TODO: exponential backoff with jitter?
sleep(Duration::from_secs(1)).await;
continue;
}
}
debug!("migration lock acquired");
@ -314,7 +320,19 @@ pub async fn get_migrated_db(
drop_migration_lock(&db_conn).await?;
// return if migrations erred
migration_result?;
migration_result
}
/// Connect to the database and run migrations
pub async fn get_migrated_db(
db_url: String,
min_connections: u32,
max_connections: u32,
) -> Result<DatabaseConnection, DbErr> {
// TODO: this seems to fail silently
let db_conn = get_db(db_url, min_connections, max_connections).await?;
migrate_db(&db_conn, false).await?;
Ok(db_conn)
}
@ -515,7 +533,8 @@ impl Web3ProxyApp {
};
// TODO: i don't like doing Block::default here! Change this to "None"?
let (head_block_sender, head_block_receiver) = watch::channel(Arc::new(Block::default()));
let (watch_consensus_head_sender, watch_consensus_head_receiver) =
watch::channel(Arc::new(Block::default()));
// TODO: will one receiver lagging be okay? how big should this be?
let (pending_tx_sender, pending_tx_receiver) = broadcast::channel(256);
@ -552,7 +571,7 @@ impl Web3ProxyApp {
http_client.clone(),
vredis_pool.clone(),
block_map.clone(),
Some(head_block_sender),
Some(watch_consensus_head_sender),
top_config.app.min_sum_soft_limit,
top_config.app.min_synced_rpcs,
Some(pending_tx_sender.clone()),
@ -580,6 +599,8 @@ impl Web3ProxyApp {
vredis_pool.clone(),
block_map,
// subscribing to new heads here won't work well. if they are fast, they might be ahead of balanced_rpcs
// they also often have low rate limits
// however, they are well connected to miners/validators. so maybe using them as a safety check would be good
None,
0,
0,
@ -683,24 +704,12 @@ impl Web3ProxyApp {
.time_to_idle(Duration::from_secs(120))
.build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default());
// TODO: get this out of the toml instead
let allowed_lag = match top_config.app.chain_id {
1 => 60,
137 => 10,
250 => 10,
_ => {
warn!("defaulting allowed lag to 60");
60
}
};
let app = Self {
config: top_config.app,
allowed_lag,
balanced_rpcs,
private_rpcs,
response_cache,
head_block_receiver,
watch_consensus_head_receiver,
pending_tx_sender,
pending_transactions,
frontend_ip_rate_limiter,
@ -723,6 +732,10 @@ impl Web3ProxyApp {
Ok((app, cancellable_handles, important_background_handles).into())
}
pub fn head_block_receiver(&self) -> watch::Receiver<ArcBlock> {
self.watch_consensus_head_receiver.clone()
}
pub async fn prometheus_metrics(&self) -> String {
let globals = HashMap::new();
// TODO: what globals? should this be the hostname or what?
@ -907,10 +920,10 @@ impl Web3ProxyApp {
self: &Arc<Self>,
authorization: Arc<Authorization>,
request: JsonRpcRequestEnum,
proxy_mode: ProxyMode,
) -> Result<(JsonRpcForwardedResponseEnum, Vec<Arc<Web3Connection>>), FrontendErrorResponse>
{
// TODO: this should probably be trace level
// // trace!(?request, "proxy_web3_rpc");
// trace!(?request, "proxy_web3_rpc");
// even though we have timeouts on the requests to our backend providers,
// we need a timeout for the incoming request so that retries don't run forever
@ -921,7 +934,7 @@ impl Web3ProxyApp {
JsonRpcRequestEnum::Single(request) => {
let (response, rpcs) = timeout(
max_time,
self.proxy_web3_rpc_request(&authorization, request),
self.proxy_cached_request(&authorization, request, proxy_mode),
)
.await??;
@ -930,7 +943,7 @@ impl Web3ProxyApp {
JsonRpcRequestEnum::Batch(requests) => {
let (responses, rpcs) = timeout(
max_time,
self.proxy_web3_rpc_requests(&authorization, requests),
self.proxy_web3_rpc_requests(&authorization, requests, proxy_mode),
)
.await??;
@ -947,6 +960,7 @@ impl Web3ProxyApp {
self: &Arc<Self>,
authorization: &Arc<Authorization>,
requests: Vec<JsonRpcRequest>,
proxy_mode: ProxyMode,
) -> anyhow::Result<(Vec<JsonRpcForwardedResponse>, Vec<Arc<Web3Connection>>)> {
// TODO: we should probably change ethers-rs to support this directly. they pushed this off to v2 though
let num_requests = requests.len();
@ -956,7 +970,7 @@ impl Web3ProxyApp {
let responses = join_all(
requests
.into_iter()
.map(|request| self.proxy_web3_rpc_request(authorization, request))
.map(|request| self.proxy_cached_request(authorization, request, proxy_mode))
.collect::<Vec<_>>(),
)
.await;
@ -1000,10 +1014,11 @@ impl Web3ProxyApp {
}
#[measure([ErrorCount, HitCount, ResponseTime, Throughput])]
async fn proxy_web3_rpc_request(
async fn proxy_cached_request(
self: &Arc<Self>,
authorization: &Arc<Authorization>,
mut request: JsonRpcRequest,
proxy_mode: ProxyMode,
) -> anyhow::Result<(JsonRpcForwardedResponse, Vec<Arc<Web3Connection>>)> {
// trace!("Received request: {:?}", request);
@ -1083,8 +1098,15 @@ impl Web3ProxyApp {
| "shh_uninstallFilter"
| "shh_version") => {
// TODO: client error stat
// TODO: proper error code
return Err(anyhow::anyhow!("method unsupported: {}", method));
// TODO: what error code?
return Ok((
JsonRpcForwardedResponse::from_string(
format!("method unsupported: {}", method),
None,
Some(request_id),
),
vec![],
));
}
// TODO: implement these commands
method @ ("eth_getFilterChanges"
@ -1094,7 +1116,15 @@ impl Web3ProxyApp {
| "eth_newPendingTransactionFilter"
| "eth_uninstallFilter") => {
// TODO: unsupported command stat
return Err(anyhow::anyhow!("not yet implemented: {}", method));
// TODO: what error code?
return Ok((
JsonRpcForwardedResponse::from_string(
format!("not yet implemented: {}", method),
None,
Some(request_id),
),
vec![],
));
}
// some commands can use local data or caches
"eth_accounts" => {
@ -1122,18 +1152,14 @@ impl Web3ProxyApp {
// TODO: eth_sendPrivateTransaction (https://docs.flashbots.net/flashbots-auction/searchers/advanced/rpc-endpoint#eth_sendprivatetransaction)
"eth_coinbase" => {
// no need for serving coinbase
// we could return a per-user payment address here, but then we might leak that to dapps
// no stats on this. its cheap
json!(Address::zero())
}
/*
// erigon was giving bad estimates. but now it doesn't need it
"eth_estimateGas" => {
// TODO: eth_estimateGas using anvil?
// TODO: modify the block requested?
let mut response = self
.balanced_rpcs
.try_send_best_upstream_server(
.try_proxy_connection(
proxy_mode,
authorization,
request,
Some(&request_metadata),
@ -1141,11 +1167,9 @@ impl Web3ProxyApp {
)
.await?;
let parsed_gas_estimate = if let Some(gas_estimate) = response.result.take() {
let parsed_gas_estimate: U256 = serde_json::from_str(gas_estimate.get())
.context("gas estimate result is not an U256")?;
parsed_gas_estimate
let mut gas_estimate: U256 = if let Some(gas_estimate) = response.result.take() {
serde_json::from_str(gas_estimate.get())
.context("gas estimate result is not an U256")?
} else {
// i think this is always an error response
let rpcs = request_metadata.backend_requests.lock().clone();
@ -1153,13 +1177,21 @@ impl Web3ProxyApp {
return Ok((response, rpcs));
};
// increase by 1.01%
let parsed_gas_estimate =
parsed_gas_estimate * U256::from(101_010) / U256::from(100_000);
let gas_increase =
if let Some(gas_increase_percent) = self.config.gas_increase_percent {
let gas_increase = gas_estimate * gas_increase_percent / U256::from(100);
json!(parsed_gas_estimate)
let min_gas_increase = self.config.gas_increase_min.unwrap_or_default();
gas_increase.max(min_gas_increase)
} else {
self.config.gas_increase_min.unwrap_or_default()
};
gas_estimate += gas_increase;
json!(gas_estimate)
}
*/
// TODO: eth_gasPrice that does awesome magic to predict the future
"eth_hashrate" => {
// no stats on this. its cheap
@ -1172,22 +1204,32 @@ impl Web3ProxyApp {
// TODO: eth_sendBundle (flashbots command)
// broadcast transactions to all private rpcs at once
"eth_sendRawTransaction" => {
// TODO: how should we handle private_mode here?
let default_num = match proxy_mode {
// TODO: how many balanced rpcs should we send to? configurable? percentage of total?
ProxyMode::Best => Some(4),
ProxyMode::Fastest(0) => None,
// TODO: how many balanced rpcs should we send to? configurable? percentage of total?
// TODO: what if we do 2 per tier? we want to blast the third party rpcs
// TODO: maybe having the third party rpcs in their own Web3Connections would be good for this
ProxyMode::Fastest(x) => Some(x * 4),
ProxyMode::Versus => None,
};
let (private_rpcs, num) = if let Some(private_rpcs) = self.private_rpcs.as_ref() {
if authorization.checks.private_txs {
// if we are sending the transaction privately, no matter the proxy_mode, we send to ALL private rpcs
(private_rpcs, None)
} else {
// TODO: how many balanced rpcs should we send to? configurable? percentage of total?
// TODO: what if we do 2 per tier? we want to blast the third party rpcs
// TODO: maybe having the third party rpcs would be good for this
(&self.balanced_rpcs, Some(2))
(&self.balanced_rpcs, default_num)
}
} else {
(&self.balanced_rpcs, Some(2))
(&self.balanced_rpcs, default_num)
};
// try_send_all_upstream_servers puts the request id into the response. no need to do that ourselves here.
let mut response = private_rpcs
.try_send_all_upstream_servers(
.try_send_all_synced_connections(
authorization,
&request,
Some(request_metadata.clone()),
@ -1283,13 +1325,23 @@ impl Web3ProxyApp {
json!(false)
}
"eth_subscribe" => {
return Err(anyhow::anyhow!(
"notifications not supported. eth_subscribe is only available over a websocket"
return Ok((
JsonRpcForwardedResponse::from_str(
"notifications not supported. eth_subscribe is only available over a websocket",
Some(-32601),
Some(request_id),
),
vec![],
));
}
"eth_unsubscribe" => {
return Err(anyhow::anyhow!(
"notifications not supported. eth_unsubscribe is only available over a websocket"
return Ok((
JsonRpcForwardedResponse::from_str(
"notifications not supported. eth_unsubscribe is only available over a websocket",
Some(-32601),
Some(request_id),
),
vec![],
));
}
"net_listening" => {
@ -1298,7 +1350,8 @@ impl Web3ProxyApp {
json!(true)
}
"net_peerCount" => {
// emit stats
// no stats on this. its cheap
// TODO: do something with proxy_mode here?
self.balanced_rpcs.num_synced_rpcs().into()
}
"web3_clientVersion" => {
@ -1312,10 +1365,18 @@ impl Web3ProxyApp {
Some(serde_json::Value::Array(params)) => {
// TODO: make a struct and use serde conversion to clean this up
if params.len() != 1 || !params[0].is_string() {
// TODO: this needs the correct error code in the response
return Err(anyhow::anyhow!("invalid request"));
// TODO: what error code?
return Ok((
JsonRpcForwardedResponse::from_str(
"Invalid request",
Some(-32600),
Some(request_id),
),
vec![],
));
}
// TODO: don't return with ? here. send a jsonrpc invalid request
let param = Bytes::from_str(
params[0]
.as_str()
@ -1329,18 +1390,35 @@ impl Web3ProxyApp {
_ => {
// TODO: this needs the correct error code in the response
// TODO: emit stat?
return Err(anyhow::anyhow!("invalid request"));
return Ok((
JsonRpcForwardedResponse::from_str(
"invalid request",
None,
Some(request_id),
),
vec![],
));
}
}
}
"test" => {
return Ok((
JsonRpcForwardedResponse::from_str(
"The method test does not exist/is not available.",
Some(-32601),
Some(request_id),
),
vec![],
));
}
// anything else gets sent to backend rpcs and cached
method => {
// emit stats
// TODO: if no servers synced, wait for them to be synced?
let head_block = self
// TODO: if no servers synced, wait for them to be synced? probably better to error and let haproxy retry another server
let head_block_num = self
.balanced_rpcs
.head_block()
.head_block_num()
.context("no servers synced")?;
// we do this check before checking caches because it might modify the request params
@ -1350,7 +1428,7 @@ impl Web3ProxyApp {
authorization,
method,
request.params.as_mut(),
head_block.number(),
head_block_num,
&self.balanced_rpcs,
)
.await?
@ -1404,11 +1482,12 @@ impl Web3ProxyApp {
.try_get_with(cache_key, async move {
// TODO: retry some failures automatically!
// TODO: try private_rpcs if all the balanced_rpcs fail!
// TODO: put the hash here instead?
// TODO: put the hash here instead of the block number? its in the request already.
let mut response = self
.balanced_rpcs
.try_send_best_upstream_server(
self.allowed_lag,
.try_proxy_connection(
proxy_mode,
&authorization,
request,
Some(&request_metadata),
@ -1433,18 +1512,14 @@ impl Web3ProxyApp {
})?
} else {
self.balanced_rpcs
.try_send_best_upstream_server(
self.allowed_lag,
.try_proxy_connection(
proxy_mode,
&authorization,
request,
Some(&request_metadata),
None,
)
.await
.map_err(|err| {
// TODO: emit a stat for an error
anyhow::anyhow!("error while forwarding response: {}", err)
})?
.await?
}
};


@ -50,7 +50,7 @@ impl Web3ProxyApp {
match request_json.params.as_ref() {
Some(x) if x == &json!(["newHeads"]) => {
let authorization = authorization.clone();
let head_block_receiver = self.head_block_receiver.clone();
let head_block_receiver = self.watch_consensus_head_receiver.clone();
let stat_sender = self.stat_sender.clone();
trace!("newHeads subscription {:?}", subscription_id);


@ -36,7 +36,7 @@ impl ProxyResponseStat {
fn key(&self) -> ProxyResponseAggregateKey {
// include either the rpc_key_id or the origin
let (mut rpc_key_id, origin) = match (
self.authorization.checks.rpc_key_id,
self.authorization.checks.rpc_secret_key_id,
&self.authorization.origin,
) {
(Some(rpc_key_id), _) => {

View File

@ -1,27 +1,39 @@
// TODO: websockets instead of http
// TODO: support websockets
use anyhow::Context;
use argh::FromArgs;
use chrono::Utc;
use ethers::types::U64;
use ethers::types::{Block, TxHash};
use log::info;
use log::warn;
use reqwest::Client;
use serde::Deserialize;
use serde_json::json;
use std::sync::atomic::{AtomicU32, Ordering};
use tokio::time::sleep;
use tokio::time::Duration;
#[derive(Debug, FromArgs)]
/// Command line interface for admins to interact with web3_proxy
pub struct CliConfig {
/// the RPC to check
/// the HTTP RPC to check
#[argh(option, default = "\"http://localhost:8545\".to_string()")]
pub check_url: String,
/// the RPC to compare to
#[argh(option, default = "\"https://eth.llamarpc.com\".to_string()")]
pub compare_url: String,
/// the HTTP RPC to compare against. defaults to LlamaNodes public RPC
#[argh(option)]
pub compare_url: Option<String>,
/// how many seconds to wait for sync.
/// Defaults to waiting forever.
/// if the wait is exceeded, will exit with code 2
#[argh(option)]
pub max_wait: Option<u64>,
/// require a specific chain id (for extra safety)
#[argh(option)]
pub chain_id: Option<u64>,
}
#[tokio::main]
@ -38,26 +50,73 @@ async fn main() -> anyhow::Result<()> {
let cli_config: CliConfig = argh::from_env();
let json_request = json!({
"id": "1",
"jsonrpc": "2.0",
"method": "eth_getBlockByNumber",
"params": [
"latest",
false,
],
});
let client = reqwest::Client::new();
// TODO: make sure the chain ids match
// TODO: automatic compare_url based on the chain id
let check_url = cli_config.check_url;
// make sure the chain ids match
let check_id = get_chain_id(&check_url, &client)
.await
.context("unknown chain id for check_url")?;
if let Some(chain_id) = cli_config.chain_id {
if chain_id != check_id {
return Err(anyhow::anyhow!(
"chain_id of check_url is wrong! Need {}. Found {}",
chain_id,
check_id,
));
}
}
let compare_url: String = match cli_config.compare_url {
Some(x) => x,
None => match check_id {
1 => "https://eth.llamarpc.com",
137 => "https://polygon.llamarpc.com",
_ => {
return Err(anyhow::anyhow!(
"--compare-url required for chain {}",
check_id
))
}
}
.to_string(),
};
info!(
"comparing {} to {} (chain {})",
check_url, compare_url, check_id
);
let compare_id = get_chain_id(&compare_url, &client)
.await
.context("unknown chain id for compare_url")?;
if check_id != compare_id {
return Err(anyhow::anyhow!(
"chain_id does not match! Need {}. Found {}",
check_id,
compare_id,
));
}
// start ids at 2 because id 1 was checking the chain id
let counter = AtomicU32::new(2);
let start = tokio::time::Instant::now();
loop {
match main_loop(&cli_config, &client, &json_request).await {
match main_loop(&check_url, &compare_url, &client, &counter).await {
Ok(()) => break,
Err(err) => {
warn!("{:?}", err);
if let Some(max_wait) = cli_config.max_wait {
if max_wait == 0 || start.elapsed().as_secs() > max_wait {
std::process::exit(2);
}
}
sleep(Duration::from_secs(10)).await;
}
}
@ -66,38 +125,77 @@ async fn main() -> anyhow::Result<()> {
Ok(())
}
#[derive(Deserialize)]
struct JsonRpcChainIdResult {
result: U64,
}
async fn get_chain_id(rpc: &str, client: &reqwest::Client) -> anyhow::Result<u64> {
let get_chain_id_request = json!({
"id": "1",
"jsonrpc": "2.0",
"method": "eth_chainId",
});
// TODO: loop until chain id is found?
let check_result = client
.post(rpc)
.json(&get_chain_id_request)
.send()
.await
.context("failed querying chain id")?
.json::<JsonRpcChainIdResult>()
.await
.context("failed parsing chain id")?
.result
.as_u64();
Ok(check_result)
}
#[derive(Deserialize)]
struct JsonRpcBlockResult {
result: Block<TxHash>,
}
async fn main_loop(
cli_config: &CliConfig,
check_url: &str,
compare_url: &str,
client: &Client,
json_request: &serde_json::Value,
counter: &AtomicU32,
) -> anyhow::Result<()> {
let check_result = client
.post(&cli_config.check_url)
.json(json_request)
// TODO: have a real id here that increments every call?
let get_block_number_request = json!({
"id": counter.fetch_add(1, Ordering::SeqCst),
"jsonrpc": "2.0",
"method": "eth_getBlockByNumber",
"params": [
"latest",
false,
],
});
let check_block = client
.post(check_url)
.json(&get_block_number_request)
.send()
.await
.context("querying check block")?
.json::<JsonRpcBlockResult>()
.await
.context("parsing check block")?;
.context("parsing check block")?
.result;
let compare_result = client
.post(&cli_config.compare_url)
.json(json_request)
let compare_block = client
.post(compare_url)
.json(&get_block_number_request)
.send()
.await
.context("querying compare block")?
.json::<JsonRpcBlockResult>()
.await
.context("parsing compare block")?;
let check_block = check_result.result;
let compare_block = compare_result.result;
.context("parsing compare block")?
.result;
let check_number = check_block.number.context("no check block number")?;
let compare_number = compare_block.number.context("no compare block number")?;


@ -1,399 +0,0 @@
//! Web3_proxy is a fast caching and load balancing proxy for web3 (Ethereum or similar) JsonRPC servers.
//!
//! Signed transactions (eth_sendRawTransaction) are sent in parallel to the configured private RPCs (eden, ethermine, flashbots, etc.).
//!
//! All other requests are sent to an RPC server on the latest block (alchemy, moralis, rivet, your own node, or one of many other providers).
//! If multiple servers are in sync, the fastest server is prioritized. Since the fastest server is most likely to serve requests, slow servers are unlikely to ever get any requests.
//#![warn(missing_docs)]
#![forbid(unsafe_code)]
use anyhow::Context;
use futures::StreamExt;
use log::{debug, error, info, warn};
use num::Zero;
use parking_lot::deadlock;
use std::fs;
use std::path::Path;
use std::sync::atomic::{self, AtomicUsize};
use std::thread;
use tokio::runtime;
use tokio::sync::broadcast;
use tokio::time::Duration;
use web3_proxy::app::{flatten_handle, flatten_handles, Web3ProxyApp};
use web3_proxy::config::{CliConfig, TopConfig};
use web3_proxy::{frontend, metrics_frontend};
fn run(
shutdown_sender: broadcast::Sender<()>,
cli_config: CliConfig,
top_config: TopConfig,
) -> anyhow::Result<()> {
debug!("{:?}", cli_config);
debug!("{:?}", top_config);
let mut shutdown_receiver = shutdown_sender.subscribe();
// spawn a thread for deadlock detection
// TODO: disable this feature during release mode and things should go faster
thread::spawn(move || loop {
thread::sleep(Duration::from_secs(10));
let deadlocks = deadlock::check_deadlock();
if deadlocks.is_empty() {
continue;
}
println!("{} deadlocks detected", deadlocks.len());
for (i, threads) in deadlocks.iter().enumerate() {
println!("Deadlock #{}", i);
for t in threads {
println!("Thread Id {:#?}", t.thread_id());
println!("{:#?}", t.backtrace());
}
}
});
// set up tokio's async runtime
let mut rt_builder = runtime::Builder::new_multi_thread();
let chain_id = top_config.app.chain_id;
rt_builder.enable_all().thread_name_fn(move || {
static ATOMIC_ID: AtomicUsize = AtomicUsize::new(0);
// TODO: what ordering? i think we want seqcst so that these all happen in order, but that might be stricter than we really need
let worker_id = ATOMIC_ID.fetch_add(1, atomic::Ordering::SeqCst);
// TODO: i think these max at 15 characters
format!("web3-{}-{}", chain_id, worker_id)
});
if cli_config.workers > 0 {
rt_builder.worker_threads(cli_config.workers);
}
// start tokio's async runtime
let rt = rt_builder.build()?;
let num_workers = rt.metrics().num_workers();
info!("num_workers: {}", num_workers);
rt.block_on(async {
let app_frontend_port = cli_config.port;
let app_prometheus_port = cli_config.prometheus_port;
// start the main app
let mut spawned_app =
Web3ProxyApp::spawn(top_config, num_workers, shutdown_sender.subscribe()).await?;
let frontend_handle =
tokio::spawn(frontend::serve(app_frontend_port, spawned_app.app.clone()));
let prometheus_handle = tokio::spawn(metrics_frontend::serve(
spawned_app.app,
app_prometheus_port,
));
// if everything is working, these should both run forever
tokio::select! {
x = flatten_handles(spawned_app.app_handles) => {
match x {
Ok(_) => info!("app_handle exited"),
Err(e) => {
return Err(e);
}
}
}
x = flatten_handle(frontend_handle) => {
match x {
Ok(_) => info!("frontend exited"),
Err(e) => {
return Err(e);
}
}
}
x = flatten_handle(prometheus_handle) => {
match x {
Ok(_) => info!("prometheus exited"),
Err(e) => {
return Err(e);
}
}
}
x = tokio::signal::ctrl_c() => {
match x {
Ok(_) => info!("quiting from ctrl-c"),
Err(e) => {
return Err(e.into());
}
}
}
x = shutdown_receiver.recv() => {
match x {
Ok(_) => info!("quiting from shutdown receiver"),
Err(e) => {
return Err(e.into());
}
}
}
};
// one of the handles stopped. send a value so the others know to shut down
if let Err(err) = shutdown_sender.send(()) {
warn!("shutdown sender err={:?}", err);
};
// wait for things like saving stats to the database to complete
info!("waiting on important background tasks");
let mut background_errors = 0;
while let Some(x) = spawned_app.background_handles.next().await {
match x {
Err(e) => {
error!("{:?}", e);
background_errors += 1;
}
Ok(Err(e)) => {
error!("{:?}", e);
background_errors += 1;
}
Ok(Ok(_)) => continue,
}
}
if background_errors.is_zero() {
info!("finished");
} else {
// TODO: collect instead?
error!("finished with errors!")
}
Ok(())
})
}
fn main() -> anyhow::Result<()> {
// if RUST_LOG isn't set, configure a default
let rust_log = match std::env::var("RUST_LOG") {
Ok(x) => x,
Err(_) => "info,ethers=debug,redis_rate_limit=debug,web3_proxy=debug".to_string(),
};
// this probably won't matter for us in docker, but better safe than sorry
fdlimit::raise_fd_limit();
// initial configuration from flags
let cli_config: CliConfig = argh::from_env();
// convert to absolute path so error logging is most helpful
let config_path = Path::new(&cli_config.config)
.canonicalize()
.context(format!(
"checking full path of {} and {}",
".", // TODO: get cwd somehow
cli_config.config
))?;
// advanced configuration is on disk
let top_config: String = fs::read_to_string(config_path.clone())
.context(format!("reading config at {}", config_path.display()))?;
let top_config: TopConfig = toml::from_str(&top_config)
.context(format!("parsing config at {}", config_path.display()))?;
// TODO: this doesn't seem to do anything
proctitle::set_title(format!("web3_proxy-{}", top_config.app.chain_id));
let logger = env_logger::builder().parse_filters(&rust_log).build();
let max_level = logger.filter();
// connect to sentry for error reporting
// if no sentry, only log to stdout
let _sentry_guard = if let Some(sentry_url) = top_config.app.sentry_url.clone() {
let logger = sentry::integrations::log::SentryLogger::with_dest(logger);
log::set_boxed_logger(Box::new(logger)).unwrap();
let guard = sentry::init((
sentry_url,
sentry::ClientOptions {
release: sentry::release_name!(),
// TODO: Set this a to lower value (from config) in production
traces_sample_rate: 1.0,
..Default::default()
},
));
Some(guard)
} else {
log::set_boxed_logger(Box::new(logger)).unwrap();
None
};
log::set_max_level(max_level);
// we used to do this earlier, but now we attach sentry
debug!("CLI config @ {:#?}", cli_config.config);
// tokio has code for catching ctrl+c so we use that
// this shutdown sender is currently only used in tests, but we might make a /shutdown endpoint or something
// we do not need this receiver. new receivers are made by `shutdown_sender.subscribe()`
let (shutdown_sender, _) = broadcast::channel(1);
run(shutdown_sender, cli_config, top_config)
}
#[cfg(test)]
mod tests {
use ethers::{
prelude::{Http, Provider, U256},
utils::Anvil,
};
use hashbrown::HashMap;
use std::env;
use web3_proxy::{
config::{AppConfig, Web3ConnectionConfig},
rpcs::blockchain::ArcBlock,
};
use super::*;
#[tokio::test]
async fn it_works() {
// TODO: move basic setup into a test fixture
let path = env::var("PATH").unwrap();
println!("path: {}", path);
// TODO: how should we handle logs in this?
// TODO: option for super verbose logs
std::env::set_var("RUST_LOG", "info,web3_proxy=debug");
let _ = env_logger::builder().is_test(true).try_init();
let anvil = Anvil::new().spawn();
println!("Anvil running at `{}`", anvil.endpoint());
let anvil_provider = Provider::<Http>::try_from(anvil.endpoint()).unwrap();
// mine a block because my code doesn't like being on block 0
// TODO: make block 0 okay? is it okay now?
let _: U256 = anvil_provider
.request("evm_mine", None::<()>)
.await
.unwrap();
// make a test CliConfig
let cli_config = CliConfig {
port: 0,
prometheus_port: 0,
workers: 4,
config: "./does/not/exist/test.toml".to_string(),
cookie_key_filename: "./does/not/exist/development_cookie_key".to_string(),
};
// make a test TopConfig
// TODO: load TopConfig from a file? CliConfig could have `cli_config.load_top_config`. would need to inject our endpoint ports
let top_config = TopConfig {
app: AppConfig {
chain_id: 31337,
default_user_max_requests_per_period: Some(6_000_000),
min_sum_soft_limit: 1,
min_synced_rpcs: 1,
public_requests_per_period: Some(1_000_000),
response_cache_max_bytes: 10_usize.pow(7),
redirect_public_url: Some("example.com/".to_string()),
redirect_rpc_key_url: Some("example.com/{{rpc_key_id}}".to_string()),
..Default::default()
},
balanced_rpcs: HashMap::from([
(
"anvil".to_string(),
Web3ConnectionConfig {
disabled: false,
display_name: None,
url: anvil.endpoint(),
block_data_limit: None,
soft_limit: 100,
hard_limit: None,
tier: 0,
subscribe_txs: Some(false),
extra: Default::default(),
},
),
(
"anvil_ws".to_string(),
Web3ConnectionConfig {
disabled: false,
display_name: None,
url: anvil.ws_endpoint(),
block_data_limit: None,
soft_limit: 100,
hard_limit: None,
tier: 0,
subscribe_txs: Some(false),
extra: Default::default(),
},
),
]),
private_rpcs: None,
extra: Default::default(),
};
let (shutdown_sender, _) = broadcast::channel(1);
// spawn another thread for running the app
// TODO: allow launching into the local tokio runtime instead of creating a new one?
let handle = {
let shutdown_sender = shutdown_sender.clone();
thread::spawn(move || run(shutdown_sender, cli_config, top_config))
};
// TODO: do something to the node. query latest block, mine another block, query again
let proxy_provider = Provider::<Http>::try_from(anvil.endpoint()).unwrap();
let anvil_result = anvil_provider
.request::<_, Option<ArcBlock>>("eth_getBlockByNumber", ("latest", true))
.await
.unwrap()
.unwrap();
let proxy_result = proxy_provider
.request::<_, Option<ArcBlock>>("eth_getBlockByNumber", ("latest", true))
.await
.unwrap()
.unwrap();
assert_eq!(anvil_result, proxy_result);
let first_block_num = anvil_result.number.unwrap();
let _: U256 = anvil_provider
.request("evm_mine", None::<()>)
.await
.unwrap();
let anvil_result = anvil_provider
.request::<_, Option<ArcBlock>>("eth_getBlockByNumber", ("latest", true))
.await
.unwrap()
.unwrap();
let proxy_result = proxy_provider
.request::<_, Option<ArcBlock>>("eth_getBlockByNumber", ("latest", true))
.await
.unwrap()
.unwrap();
assert_eq!(anvil_result, proxy_result);
let second_block_num = anvil_result.number.unwrap();
assert_eq!(first_block_num, second_block_num - 1);
// tell the test app to shut down
shutdown_sender.send(()).unwrap();
println!("waiting for shutdown...");
// TODO: panic if a timeout is reached
handle.join().unwrap().unwrap();
}
}


@ -0,0 +1,77 @@
use anyhow::Context;
use argh::FromArgs;
use entities::{rpc_key, user};
use ethers::prelude::Address;
use log::info;
use migration::sea_orm::{self, ActiveModelTrait, ColumnTrait, EntityTrait, QueryFilter};
use ulid::Ulid;
use uuid::Uuid;
use web3_proxy::frontend::authorization::RpcSecretKey;
#[derive(FromArgs, PartialEq, Debug, Eq)]
/// Create a new user and api key
#[argh(subcommand, name = "create_key")]
pub struct CreateKeySubCommand {
/// the user's ethereum address or descriptive string.
/// If a string is given, it will be converted to hex and potentially truncated.
/// Users from strings are only for testing since they won't be able to log in.
#[argh(positional)]
address: String,
/// the user's api ULID or UUID key.
/// If none given, one will be created.
#[argh(option)]
rpc_secret_key: Option<RpcSecretKey>,
/// an optional short description of the key's purpose.
#[argh(option)]
description: Option<String>,
}
impl CreateKeySubCommand {
pub async fn main(self, db: &sea_orm::DatabaseConnection) -> anyhow::Result<()> {
// TODO: would be nice to use the fixed array instead of a Vec in the entities
// take a simple String. If it starts with 0x, parse as address. otherwise convert ascii to hex
let address: Vec<u8> = if self.address.starts_with("0x") {
let address = self.address.parse::<Address>()?;
address.to_fixed_bytes().into()
} else {
// TODO: allow ENS
// left pad and truncate the string
let address = &format!("{:\x00>20}", self.address)[0..20];
// convert the string to bytes
let bytes = address.as_bytes();
// convert the slice to a Vec
bytes.try_into().expect("Bytes can always be a Vec<u8>")
};
// TODO: get existing or create a new one
let u = user::Entity::find()
.filter(user::Column::Address.eq(address))
.one(db)
.await?
.context("No user found with that address")?;
info!("user #{}", u.id);
let rpc_secret_key = self.rpc_secret_key.unwrap_or_else(RpcSecretKey::new);
// create a key for the new user
let uk = rpc_key::ActiveModel {
user_id: sea_orm::Set(u.id),
secret_key: sea_orm::Set(rpc_secret_key.into()),
description: sea_orm::Set(self.description),
..Default::default()
};
let _uk = uk.save(db).await.context("Failed saving new user key")?;
info!("user key as ULID: {}", Ulid::from(rpc_secret_key));
info!("user key as UUID: {}", Uuid::from(rpc_secret_key));
Ok(())
}
}


@ -0,0 +1,309 @@
#![forbid(unsafe_code)]
use argh::FromArgs;
use futures::StreamExt;
use log::{error, info, warn};
use num::Zero;
use tokio::sync::broadcast;
use web3_proxy::app::{flatten_handle, flatten_handles, Web3ProxyApp};
use web3_proxy::config::TopConfig;
use web3_proxy::{frontend, metrics_frontend};
/// start the main proxy daemon
#[derive(FromArgs, PartialEq, Debug, Eq)]
#[argh(subcommand, name = "proxyd")]
pub struct ProxydSubCommand {
/// path to a toml of rpc servers
/// what port the proxy should listen on
#[argh(option, default = "8544")]
pub port: u16,
/// what port the proxy should expose prometheus stats on
#[argh(option, default = "8543")]
pub prometheus_port: u16,
}
impl ProxydSubCommand {
pub async fn main(self, top_config: TopConfig, num_workers: usize) -> anyhow::Result<()> {
let (shutdown_sender, _) = broadcast::channel(1);
run(
top_config,
self.port,
self.prometheus_port,
num_workers,
shutdown_sender,
)
.await
}
}
async fn run(
top_config: TopConfig,
frontend_port: u16,
prometheus_port: u16,
num_workers: usize,
shutdown_sender: broadcast::Sender<()>,
) -> anyhow::Result<()> {
// tokio has code for catching ctrl+c so we use that
// this shutdown sender is currently only used in tests, but we might make a /shutdown endpoint or something
// we do not need this receiver. new receivers are made by `shutdown_sender.subscribe()`
let app_frontend_port = frontend_port;
let app_prometheus_port = prometheus_port;
let mut shutdown_receiver = shutdown_sender.subscribe();
// start the main app
let mut spawned_app =
Web3ProxyApp::spawn(top_config, num_workers, shutdown_sender.subscribe()).await?;
// start the prometheus metrics port
let prometheus_handle = tokio::spawn(metrics_frontend::serve(
spawned_app.app.clone(),
app_prometheus_port,
));
// wait until the app has seen its first consensus head block
// TODO: if backups were included, wait a little longer
let _ = spawned_app.app.head_block_receiver().changed().await;
// start the frontend port
let frontend_handle = tokio::spawn(frontend::serve(app_frontend_port, spawned_app.app.clone()));
// if everything is working, these should all run forever
tokio::select! {
x = flatten_handles(spawned_app.app_handles) => {
match x {
Ok(_) => info!("app_handle exited"),
Err(e) => {
return Err(e);
}
}
}
x = flatten_handle(frontend_handle) => {
match x {
Ok(_) => info!("frontend exited"),
Err(e) => {
return Err(e);
}
}
}
x = flatten_handle(prometheus_handle) => {
match x {
Ok(_) => info!("prometheus exited"),
Err(e) => {
return Err(e);
}
}
}
x = tokio::signal::ctrl_c() => {
match x {
Ok(_) => info!("quiting from ctrl-c"),
Err(e) => {
return Err(e.into());
}
}
}
x = shutdown_receiver.recv() => {
match x {
Ok(_) => info!("quiting from shutdown receiver"),
Err(e) => {
return Err(e.into());
}
}
}
};
// one of the handles stopped. send a value so the others know to shut down
if let Err(err) = shutdown_sender.send(()) {
warn!("shutdown sender err={:?}", err);
};
// wait for things like saving stats to the database to complete
info!("waiting on important background tasks");
let mut background_errors = 0;
while let Some(x) = spawned_app.background_handles.next().await {
match x {
Err(e) => {
error!("{:?}", e);
background_errors += 1;
}
Ok(Err(e)) => {
error!("{:?}", e);
background_errors += 1;
}
Ok(Ok(_)) => continue,
}
}
if background_errors.is_zero() {
info!("finished");
Ok(())
} else {
// TODO: collect instead?
Err(anyhow::anyhow!("finished with errors!"))
}
}
#[cfg(test)]
mod tests {
use ethers::{
prelude::{Http, Provider, U256},
utils::Anvil,
};
use hashbrown::HashMap;
use std::env;
use web3_proxy::{
config::{AppConfig, Web3ConnectionConfig},
rpcs::blockchain::ArcBlock,
};
use super::*;
#[tokio::test]
async fn it_works() {
// TODO: move basic setup into a test fixture
let path = env::var("PATH").unwrap();
println!("path: {}", path);
// TODO: how should we handle logs in this?
// TODO: option for super verbose logs
std::env::set_var("RUST_LOG", "info,web3_proxy=debug");
let _ = env_logger::builder().is_test(true).try_init();
let anvil = Anvil::new().spawn();
println!("Anvil running at `{}`", anvil.endpoint());
let anvil_provider = Provider::<Http>::try_from(anvil.endpoint()).unwrap();
// mine a block because my code doesn't like being on block 0
// TODO: make block 0 okay? is it okay now?
let _: U256 = anvil_provider
.request("evm_mine", None::<()>)
.await
.unwrap();
// make a test TopConfig
// TODO: load TopConfig from a file? CliConfig could have `cli_config.load_top_config`. would need to inject our endpoint ports
let top_config = TopConfig {
app: AppConfig {
chain_id: 31337,
default_user_max_requests_per_period: Some(6_000_000),
min_sum_soft_limit: 1,
min_synced_rpcs: 1,
public_requests_per_period: Some(1_000_000),
response_cache_max_bytes: 10_usize.pow(7),
redirect_public_url: Some("example.com/".to_string()),
redirect_rpc_key_url: Some("example.com/{{rpc_key_id}}".to_string()),
..Default::default()
},
balanced_rpcs: HashMap::from([
(
"anvil".to_string(),
Web3ConnectionConfig {
disabled: false,
display_name: None,
url: anvil.endpoint(),
backup: Some(false),
block_data_limit: None,
soft_limit: 100,
hard_limit: None,
tier: 0,
subscribe_txs: Some(false),
extra: Default::default(),
},
),
(
"anvil_ws".to_string(),
Web3ConnectionConfig {
disabled: false,
display_name: None,
url: anvil.ws_endpoint(),
backup: Some(false),
block_data_limit: None,
soft_limit: 100,
hard_limit: None,
tier: 0,
subscribe_txs: Some(false),
extra: Default::default(),
},
),
]),
private_rpcs: None,
extra: Default::default(),
};
let (shutdown_sender, _) = broadcast::channel(1);
// spawn another thread for running the app
// TODO: allow launching into the local tokio runtime instead of creating a new one?
let handle = {
let shutdown_sender = shutdown_sender.clone();
let frontend_port = 0;
let prometheus_port = 0;
tokio::spawn(async move {
run(
top_config,
frontend_port,
prometheus_port,
2,
shutdown_sender,
)
.await
})
};
// TODO: do something to the node. query latest block, mine another block, query again
let proxy_provider = Provider::<Http>::try_from(anvil.endpoint()).unwrap();
let anvil_result = anvil_provider
.request::<_, Option<ArcBlock>>("eth_getBlockByNumber", ("latest", true))
.await
.unwrap()
.unwrap();
let proxy_result = proxy_provider
.request::<_, Option<ArcBlock>>("eth_getBlockByNumber", ("latest", true))
.await
.unwrap()
.unwrap();
assert_eq!(anvil_result, proxy_result);
let first_block_num = anvil_result.number.unwrap();
let _: U256 = anvil_provider
.request("evm_mine", None::<()>)
.await
.unwrap();
let anvil_result = anvil_provider
.request::<_, Option<ArcBlock>>("eth_getBlockByNumber", ("latest", true))
.await
.unwrap()
.unwrap();
let proxy_result = proxy_provider
.request::<_, Option<ArcBlock>>("eth_getBlockByNumber", ("latest", true))
.await
.unwrap()
.unwrap();
assert_eq!(anvil_result, proxy_result);
let second_block_num = anvil_result.number.unwrap();
assert_eq!(first_block_num, second_block_num - 1);
// tell the test app to shut down
shutdown_sender.send(()).unwrap();
println!("waiting for shutdown...");
// TODO: panic if a timeout is reached
handle.await.unwrap().unwrap();
}
}


@ -1,15 +1,24 @@
use argh::FromArgs;
use migration::sea_orm::DatabaseConnection;
use web3_proxy::app::drop_migration_lock;
use web3_proxy::app::{drop_migration_lock, migrate_db};
#[derive(FromArgs, PartialEq, Debug, Eq)]
/// In case of emergency, break glass.
#[argh(subcommand, name = "drop_migration_lock")]
pub struct DropMigrationLockSubCommand {}
pub struct DropMigrationLockSubCommand {
#[argh(option)]
/// run migrations after dropping the lock
and_migrate: bool,
}
impl DropMigrationLockSubCommand {
pub async fn main(&self, db_conn: &DatabaseConnection) -> anyhow::Result<()> {
drop_migration_lock(db_conn).await?;
if self.and_migrate {
migrate_db(db_conn, true).await?;
} else {
// just drop the lock
drop_migration_lock(db_conn).await?;
}
Ok(())
}

View File

@ -1,137 +0,0 @@
use argh::FromArgs;
use ethers::types::{Block, TxHash, H256};
use log::{error, info, warn};
use serde::{Deserialize, Serialize};
use serde_json::json;
use web3_proxy::jsonrpc::JsonRpcErrorData;
#[derive(FromArgs, PartialEq, Debug, Eq)]
/// Never bring only 2 compasses to sea.
#[argh(subcommand, name = "health_compass")]
pub struct HealthCompassSubCommand {
#[argh(positional)]
/// first rpc
rpc_a: String,
#[argh(positional)]
/// second rpc
rpc_b: String,
#[argh(positional)]
/// third rpc
rpc_c: String,
}
#[derive(Debug, Deserialize, Serialize)]
struct JsonRpcResponse<V> {
// pub jsonrpc: String,
// pub id: Box<RawValue>,
#[serde(skip_serializing_if = "Option::is_none")]
pub result: Option<V>,
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<JsonRpcErrorData>,
}
impl HealthCompassSubCommand {
pub async fn main(self) -> anyhow::Result<()> {
let client = reqwest::Client::new();
let block_by_number_request = json!({
"jsonrpc": "2.0",
"id": "1",
"method": "eth_getBlockByNumber",
"params": ["latest", false],
});
let a = client
.post(&self.rpc_a)
.json(&block_by_number_request)
.send()
.await?
.json::<JsonRpcResponse<Block<TxHash>>>()
.await?
.result
.unwrap();
// check the parent because b and c might not be as fast as a
let parent_hash = a.parent_hash;
let a = check_rpc(&parent_hash, &client, &self.rpc_a).await;
let b = check_rpc(&parent_hash, &client, &self.rpc_b).await;
let c = check_rpc(&parent_hash, &client, &self.rpc_c).await;
match (a, b, c) {
(Ok(Ok(a)), Ok(Ok(b)), Ok(Ok(c))) => {
if a != b {
error!("A: {:?}\n\nB: {:?}\n\nC: {:?}", a, b, c);
return Err(anyhow::anyhow!("difference detected!"));
}
if b != c {
error!("\nA: {:?}\n\nB: {:?}\n\nC: {:?}", a, b, c);
return Err(anyhow::anyhow!("difference detected!"));
}
// all three rpcs agree
}
(Ok(Ok(a)), Ok(Ok(b)), c) => {
// not all successes! but still enough to compare
warn!("C failed: {:?}", c);
if a != b {
error!("\nA: {:?}\n\nB: {:?}", a, b);
return Err(anyhow::anyhow!("difference detected!"));
}
}
(Ok(Ok(a)), b, Ok(Ok(c))) => {
// not all successes! but still enough to compare
warn!("B failed: {:?}", b);
if a != c {
error!("\nA: {:?}\n\nC: {:?}", a, c);
return Err(anyhow::anyhow!("difference detected!"));
}
}
(a, b, c) => {
// not enough successes
error!("A: {:?}\n\nB: {:?}\n\nC: {:?}", a, b, c);
return Err(anyhow::anyhow!("All are failing!"));
}
}
info!("OK");
Ok(())
}
}
// i don't think we need a whole provider. a simple http request is easiest
async fn check_rpc(
block_hash: &H256,
client: &reqwest::Client,
rpc: &str,
) -> anyhow::Result<Result<Block<TxHash>, JsonRpcErrorData>> {
let block_by_hash_request = json!({
"jsonrpc": "2.0",
"id": "1",
"method": "eth_getBlockByHash",
"params": [block_hash, false],
});
// TODO: don't unwrap! don't use the try operator
let response: JsonRpcResponse<Block<TxHash>> = client
.post(rpc)
.json(&block_by_hash_request)
.send()
.await?
.json()
.await?;
if let Some(result) = response.result {
Ok(Ok(result))
} else if let Some(result) = response.error {
Ok(Err(result))
} else {
unimplemented!("{:?}", response)
}
}

View File

@ -5,35 +5,61 @@ mod change_user_tier_by_address;
mod change_user_tier_by_key;
mod check_config;
mod count_users;
mod create_key;
mod create_user;
mod daemon;
mod drop_migration_lock;
mod health_compass;
mod list_user_tier;
mod pagerduty;
mod rpc_accounting;
mod sentryd;
mod transfer_key;
mod user_export;
mod user_import;
use anyhow::Context;
use argh::FromArgs;
use std::fs;
use ethers::types::U256;
use log::{info, warn};
use pagerduty_rs::eventsv2async::EventsV2 as PagerdutyAsyncEventsV2;
use pagerduty_rs::eventsv2sync::EventsV2 as PagerdutySyncEventsV2;
use std::{
fs, panic,
path::Path,
sync::atomic::{self, AtomicUsize},
};
use tokio::runtime;
use web3_proxy::pagerduty::panic_handler;
use web3_proxy::{
app::{get_db, get_migrated_db},
app::{get_db, get_migrated_db, APP_USER_AGENT},
config::TopConfig,
};
#[cfg(feature = "deadlock")]
use parking_lot::deadlock;
#[cfg(feature = "deadlock")]
use std::thread;
#[cfg(feature = "deadlock")]
use tokio::time::Duration;
#[derive(Debug, FromArgs)]
/// Command line interface for admins to interact with web3_proxy
pub struct CliConfig {
/// path to the application config (optional).
pub struct Web3ProxyCli {
/// path to the application config (only required for some commands; defaults to dev config).
#[argh(option)]
pub config: Option<String>,
/// if no config, what database the client should connect to. Defaults to dev db.
#[argh(
option,
default = "\"mysql://root:dev_web3_proxy@127.0.0.1:13306/dev_web3_proxy\".to_string()"
)]
pub db_url: String,
/// number of worker threads. Defaults to the number of logical processors
#[argh(option, default = "0")]
pub workers: usize,
/// if no config, what database the client should connect to (only required for some commands; defaults to dev db)
#[argh(option)]
pub db_url: Option<String>,
/// if no config, what sentry url the client should connect to
#[argh(option)]
pub sentry_url: Option<String>,
/// this one cli can do multiple things
#[argh(subcommand)]
@ -50,10 +76,13 @@ enum SubCommand {
ChangeUserTierByKey(change_user_tier_by_key::ChangeUserTierByKeySubCommand),
CheckConfig(check_config::CheckConfigSubCommand),
CountUsers(count_users::CountUsersSubCommand),
CreateKey(create_key::CreateKeySubCommand),
CreateUser(create_user::CreateUserSubCommand),
DropMigrationLock(drop_migration_lock::DropMigrationLockSubCommand),
HealthCompass(health_compass::HealthCompassSubCommand),
Pagerduty(pagerduty::PagerdutySubCommand),
Proxyd(daemon::ProxydSubCommand),
RpcAccounting(rpc_accounting::RpcAccountingSubCommand),
Sentryd(sentryd::SentrydSubCommand),
TransferKey(transfer_key::TransferKeySubCommand),
UserExport(user_export::UserExportSubCommand),
UserImport(user_import::UserImportSubCommand),
@ -62,28 +91,97 @@ enum SubCommand {
// TODO: sub command to change a user's tier
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
// if RUST_LOG isn't set, configure a default
// TODO: is there a better way to do this?
if std::env::var("RUST_LOG").is_err() {
// std::env::set_var("RUST_LOG", "info,web3_proxy=debug,web3_proxy_cli=debug");
std::env::set_var("RUST_LOG", "info,web3_proxy=debug,web3_proxy_cli=debug");
fn main() -> anyhow::Result<()> {
#[cfg(feature = "deadlock")]
{
// spawn a thread for deadlock detection
thread::spawn(move || loop {
thread::sleep(Duration::from_secs(10));
let deadlocks = deadlock::check_deadlock();
if deadlocks.is_empty() {
continue;
}
println!("{} deadlocks detected", deadlocks.len());
for (i, threads) in deadlocks.iter().enumerate() {
println!("Deadlock #{}", i);
for t in threads {
println!("Thread Id {:#?}", t.thread_id());
println!("{:#?}", t.backtrace());
}
}
});
}
env_logger::init();
// if RUST_LOG isn't set, configure a default
// TODO: is there a better way to do this?
let rust_log = match std::env::var("RUST_LOG") {
Ok(x) => x,
Err(_) => match std::env::var("WEB3_PROXY_TRACE").map(|x| x == "true") {
Ok(true) => {
vec![
"info",
"ethers=debug",
"redis_rate_limit=debug",
"web3_proxy=trace",
"web3_proxy_cli=trace",
"web3_proxy::rpcs::blockchain=info",
"web3_proxy::rpcs::request=debug",
]
}
_ => {
vec![
"info",
"ethers=debug",
"redis_rate_limit=debug",
"web3_proxy=debug",
"web3_proxy_cli=debug",
]
}
}
.join(","),
};
// this probably won't matter for us in docker, but better safe than sorry
fdlimit::raise_fd_limit();
let mut cli_config: CliConfig = argh::from_env();
let mut cli_config: Web3ProxyCli = argh::from_env();
if cli_config.config.is_none() && cli_config.db_url.is_none() && cli_config.sentry_url.is_none()
{
// TODO: default to example.toml if development.toml doesn't exist
info!("defaulting to development config");
cli_config.config = Some("./config/development.toml".to_string());
}
let top_config = if let Some(top_config_path) = cli_config.config.clone() {
let top_config_path = Path::new(&top_config_path)
.canonicalize()
.context(format!("checking for config at {}", top_config_path))?;
let _top_config = if let Some(top_config_path) = cli_config.config.clone() {
let top_config: String = fs::read_to_string(top_config_path)?;
let top_config: TopConfig = toml::from_str(&top_config)?;
let mut top_config: TopConfig = toml::from_str(&top_config)?;
if let Some(top_config_db_url) = top_config.app.db_url.clone() {
cli_config.db_url = top_config_db_url;
// TODO: this doesn't seem to do anything
proctitle::set_title(format!("web3_proxy-{}", top_config.app.chain_id));
if cli_config.db_url.is_none() {
cli_config.db_url = top_config.app.db_url.clone();
}
if let Some(sentry_url) = top_config.app.sentry_url.clone() {
cli_config.sentry_url = Some(sentry_url);
}
if top_config.app.chain_id == 137 {
// TODO: these numbers are arbitrary. i think the maticnetwork/erigon fork has a bug
if top_config.app.gas_increase_min.is_none() {
top_config.app.gas_increase_min = Some(U256::from(40_000));
}
if top_config.app.gas_increase_percent.is_none() {
top_config.app.gas_increase_percent = Some(U256::from(40));
}
}
Some(top_config)
@ -91,69 +189,231 @@ async fn main() -> anyhow::Result<()> {
None
};
match cli_config.sub_command {
SubCommand::ChangeUserAddress(x) => {
let db_conn = get_db(cli_config.db_url, 1, 1).await?;
let logger = env_logger::builder().parse_filters(&rust_log).build();
x.main(&db_conn).await
}
SubCommand::ChangeUserTier(x) => {
let db_conn = get_db(cli_config.db_url, 1, 1).await?;
let max_level = logger.filter();
x.main(&db_conn).await
}
SubCommand::ChangeUserAdminStatus(x) => {
let db_conn = get_db(cli_config.db_url, 1, 1).await?;
// connect to sentry for error reporting
// if no sentry, only log to stdout
let _sentry_guard = if let Some(sentry_url) = cli_config.sentry_url.clone() {
let logger = sentry::integrations::log::SentryLogger::with_dest(logger);
x.main(&db_conn).await
}
SubCommand::ChangeUserTierByAddress(x) => {
let db_conn = get_db(cli_config.db_url, 1, 1).await?;
log::set_boxed_logger(Box::new(logger)).unwrap();
x.main(&db_conn).await
}
SubCommand::ChangeUserTierByKey(x) => {
let db_conn = get_db(cli_config.db_url, 1, 1).await?;
let guard = sentry::init((
sentry_url,
sentry::ClientOptions {
release: sentry::release_name!(),
// TODO: Set this a to lower value (from config) in production
traces_sample_rate: 1.0,
..Default::default()
},
));
x.main(&db_conn).await
}
SubCommand::CheckConfig(x) => x.main().await,
SubCommand::CreateUser(x) => {
let db_conn = get_migrated_db(cli_config.db_url, 1, 1).await?;
Some(guard)
} else {
log::set_boxed_logger(Box::new(logger)).unwrap();
x.main(&db_conn).await
}
SubCommand::CountUsers(x) => {
let db_conn = get_db(cli_config.db_url, 1, 1).await?;
None
};
x.main(&db_conn).await
}
SubCommand::DropMigrationLock(x) => {
// very intentionally, do NOT run migrations here
let db_conn = get_db(cli_config.db_url, 1, 1).await?;
log::set_max_level(max_level);
x.main(&db_conn).await
}
SubCommand::HealthCompass(x) => x.main().await,
SubCommand::RpcAccounting(x) => {
let db_conn = get_migrated_db(cli_config.db_url, 1, 1).await?;
info!("{}", APP_USER_AGENT);
x.main(&db_conn).await
}
SubCommand::TransferKey(x) => {
let db_conn = get_db(cli_config.db_url, 1, 1).await?;
// optionally connect to pagerduty
// TODO: fix this nested result
let (pagerduty_async, pagerduty_sync) = if let Ok(pagerduty_key) =
std::env::var("PAGERDUTY_INTEGRATION_KEY")
{
let pagerduty_async =
PagerdutyAsyncEventsV2::new(pagerduty_key.clone(), Some(APP_USER_AGENT.to_string()))?;
let pagerduty_sync =
PagerdutySyncEventsV2::new(pagerduty_key, Some(APP_USER_AGENT.to_string()))?;
x.main(&db_conn).await
}
SubCommand::UserExport(x) => {
let db_conn = get_migrated_db(cli_config.db_url, 1, 1).await?;
(Some(pagerduty_async), Some(pagerduty_sync))
} else {
info!("No PAGERDUTY_INTEGRATION_KEY");
x.main(&db_conn).await
}
SubCommand::UserImport(x) => {
let db_conn = get_migrated_db(cli_config.db_url, 1, 1).await?;
(None, None)
};
x.main(&db_conn).await
}
// panic handler that sends to pagerduty.
// TODO: use the sentry handler if no pager duty. use default if no sentry
if let Some(pagerduty_sync) = pagerduty_sync {
let top_config = top_config.clone();
panic::set_hook(Box::new(move |x| {
panic_handler(top_config.clone(), &pagerduty_sync, x);
}));
}
// set up tokio's async runtime
let mut rt_builder = runtime::Builder::new_multi_thread();
rt_builder.enable_all();
if cli_config.workers > 0 {
rt_builder.worker_threads(cli_config.workers);
}
if let Some(top_config) = top_config.as_ref() {
let chain_id = top_config.app.chain_id;
rt_builder.thread_name_fn(move || {
static ATOMIC_ID: AtomicUsize = AtomicUsize::new(0);
// TODO: what ordering? i think we want seqcst so that these all happen in order, but that might be stricter than we really need
let worker_id = ATOMIC_ID.fetch_add(1, atomic::Ordering::SeqCst);
// TODO: i think these max at 15 characters
format!("web3-{}-{}", chain_id, worker_id)
});
}
// start tokio's async runtime
let rt = rt_builder.build()?;
let num_workers = rt.metrics().num_workers();
info!("num_workers: {}", num_workers);
rt.block_on(async {
match cli_config.sub_command {
SubCommand::ChangeUserAddress(x) => {
let db_url = cli_config
.db_url
.expect("'--config' (with a db) or '--db-url' is required to run proxyd");
let db_conn = get_db(db_url, 1, 1).await?;
x.main(&db_conn).await
}
SubCommand::ChangeUserTier(x) => {
let db_url = cli_config
.db_url
.expect("'--config' (with a db) or '--db-url' is required to run proxyd");
let db_conn = get_db(db_url, 1, 1).await?;
x.main(&db_conn).await
}
SubCommand::ChangeUserAdminStatus(x) => {
let db_url = cli_config
.db_url
.expect("'--config' (with a db) or '--db-url' is required to run proxyd");
let db_conn = get_db(db_url, 1, 1).await?;
x.main(&db_conn).await
}
SubCommand::ChangeUserTierByAddress(x) => {
let db_url = cli_config
.db_url
.expect("'--config' (with a db) or '--db-url' is required to run proxyd");
let db_conn = get_db(db_url, 1, 1).await?;
x.main(&db_conn).await
}
SubCommand::ChangeUserTierByKey(x) => {
let db_url = cli_config
.db_url
.expect("'--config' (with a db) or '--db-url' is required to run proxyd");
let db_conn = get_db(db_url, 1, 1).await?;
x.main(&db_conn).await
}
SubCommand::CheckConfig(x) => x.main().await,
SubCommand::CreateKey(x) => {
let db_url = cli_config
.db_url
.expect("'--config' (with a db) or '--db-url' is required to run create a key");
let db_conn = get_migrated_db(db_url, 1, 1).await?;
x.main(&db_conn).await
}
SubCommand::CreateUser(x) => {
let db_url = cli_config
.db_url
.expect("'--config' (with a db) or '--db-url' is required to run proxyd");
let db_conn = get_migrated_db(db_url, 1, 1).await?;
x.main(&db_conn).await
}
SubCommand::CountUsers(x) => {
let db_url = cli_config
.db_url
.expect("'--config' (with a db) or '--db-url' is required to run proxyd");
let db_conn = get_db(db_url, 1, 1).await?;
x.main(&db_conn).await
}
SubCommand::Proxyd(x) => {
let top_config = top_config.expect("--config is required to run proxyd");
x.main(top_config, num_workers).await
}
SubCommand::DropMigrationLock(x) => {
let db_url = cli_config
.db_url
.expect("'--config' (with a db) or '--db-url' is required to run proxyd");
// very intentionally, do NOT run migrations here
let db_conn = get_db(db_url, 1, 1).await?;
x.main(&db_conn).await
}
SubCommand::Pagerduty(x) => {
if cli_config.sentry_url.is_none() {
warn!("sentry_url is not set! Logs will only show in this console");
}
x.main(pagerduty_async, top_config).await
}
SubCommand::Sentryd(x) => {
if cli_config.sentry_url.is_none() {
warn!("sentry_url is not set! Logs will only show in this console");
}
x.main(pagerduty_async, top_config).await
}
SubCommand::RpcAccounting(x) => {
let db_url = cli_config
.db_url
.expect("'--config' (with a db) or '--db-url' is required to run proxyd");
let db_conn = get_migrated_db(db_url, 1, 1).await?;
x.main(&db_conn).await
}
SubCommand::TransferKey(x) => {
let db_url = cli_config
.db_url
.expect("'--config' (with a db) or '--db-url' is required to run proxyd");
let db_conn = get_db(db_url, 1, 1).await?;
x.main(&db_conn).await
}
SubCommand::UserExport(x) => {
let db_url = cli_config
.db_url
.expect("'--config' (with a db) or '--db-url' is required to run proxyd");
let db_conn = get_migrated_db(db_url, 1, 1).await?;
x.main(&db_conn).await
}
SubCommand::UserImport(x) => {
let db_url = cli_config
.db_url
.expect("'--config' (with a db) or '--db-url' is required to run proxyd");
let db_conn = get_migrated_db(db_url, 1, 1).await?;
x.main(&db_conn).await
}
}
})
}
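
Because the diff above interleaves removed and added lines, the new logging setup is hard to follow in one pass. Below is a condensed sketch of the same flow, assuming the env_logger and sentry crates (with the log integration) used above; the helper name is illustrative, not part of the commit:

```
fn init_logging(rust_log: &str, sentry_url: Option<String>) -> Option<sentry::ClientInitGuard> {
    let logger = env_logger::builder().parse_filters(rust_log).build();
    let max_level = logger.filter();

    // if sentry is configured, wrap the env_logger so errors are also reported there
    let guard = if let Some(sentry_url) = sentry_url {
        let logger = sentry::integrations::log::SentryLogger::with_dest(logger);
        log::set_boxed_logger(Box::new(logger)).unwrap();

        let guard = sentry::init((
            sentry_url,
            sentry::ClientOptions {
                release: sentry::release_name!(),
                traces_sample_rate: 1.0,
                ..Default::default()
            },
        ));
        Some(guard)
    } else {
        log::set_boxed_logger(Box::new(logger)).unwrap();
        None
    };

    // set_boxed_logger leaves the global max level at Off, so raise it to match the filter
    log::set_max_level(max_level);

    guard
}
```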

View File

@ -0,0 +1,88 @@
use argh::FromArgs;
use log::{error, info};
use pagerduty_rs::{eventsv2async::EventsV2 as PagerdutyAsyncEventsV2, types::Event};
use web3_proxy::{
config::TopConfig,
pagerduty::{pagerduty_alert, pagerduty_alert_for_config},
};
#[derive(FromArgs, PartialEq, Debug, Eq)]
/// Quickly create a pagerduty alert
#[argh(subcommand, name = "pagerduty")]
pub struct PagerdutySubCommand {
#[argh(positional)]
/// short description of the alert
summary: String,
/// the chain id to require. Only used if not using --config.
#[argh(option)]
chain_id: Option<u64>,
#[argh(option)]
/// the class/type of the event
class: Option<String>,
#[argh(option)]
/// the component of the event
component: Option<String>,
#[argh(option)]
/// deduplicate alerts based on this key.
/// If there are no open incidents with this key, a new incident will be created.
/// If there is an open incident with a matching key, the new event will be appended to that incident's Alerts log as an additional Trigger log entry.
dedup_key: Option<String>,
}
impl PagerdutySubCommand {
pub async fn main(
self,
pagerduty_async: Option<PagerdutyAsyncEventsV2>,
top_config: Option<TopConfig>,
) -> anyhow::Result<()> {
// TODO: allow customizing severity
let event = top_config
.map(|top_config| {
pagerduty_alert_for_config(
self.class.clone(),
self.component.clone(),
None::<()>,
pagerduty_rs::types::Severity::Error,
self.summary.clone(),
None,
top_config,
)
})
.unwrap_or_else(|| {
pagerduty_alert(
self.chain_id,
self.class,
None,
None,
self.component,
None::<()>,
pagerduty_rs::types::Severity::Error,
None,
self.summary,
None,
)
});
if let Some(pagerduty_async) = pagerduty_async {
info!(
"sending to pagerduty: {}",
serde_json::to_string_pretty(&event)?
);
if let Err(err) = pagerduty_async.event(Event::AlertTrigger(event)).await {
error!("Failed sending to pagerduty: {}", err);
}
} else {
info!(
"would send to pagerduty if PAGERDUTY_INTEGRATION_KEY were set: {}",
serde_json::to_string_pretty(&event)?
);
}
Ok(())
}
}

View File

@ -12,6 +12,8 @@ use migration::{
},
Condition,
};
use serde::Serialize;
use serde_json::json;
/// count requests
#[derive(FromArgs, PartialEq, Debug, Eq)]
@ -37,7 +39,7 @@ pub struct RpcAccountingSubCommand {
impl RpcAccountingSubCommand {
pub async fn main(self, db_conn: &DatabaseConnection) -> anyhow::Result<()> {
#[derive(Debug, FromQueryResult)]
#[derive(Serialize, FromQueryResult)]
struct SelectResult {
total_frontend_requests: Decimal,
// pub total_backend_retries: Decimal,
@ -137,8 +139,9 @@ impl RpcAccountingSubCommand {
.context("no query result")?;
info!(
"query_response for chain {:?}: {:#?}",
self.chain_id, query_response
"query_response for chain {:?}: {:#}",
self.chain_id,
json!(query_response)
);
// let query_seconds: Decimal = query_response

View File

@ -0,0 +1,256 @@
use anyhow::{anyhow, Context};
use chrono::{DateTime, Utc};
use ethers::types::{Block, TxHash, H256};
use futures::{stream::FuturesUnordered, StreamExt};
use log::{debug, warn};
use serde::{Deserialize, Serialize};
use serde_json::json;
use web3_proxy::jsonrpc::JsonRpcErrorData;
use super::{SentrydErrorBuilder, SentrydResult};
#[derive(Debug, Deserialize, Serialize)]
struct JsonRpcResponse<V> {
// pub jsonrpc: String,
// pub id: Box<RawValue>,
#[serde(skip_serializing_if = "Option::is_none")]
pub result: Option<V>,
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<JsonRpcErrorData>,
}
#[derive(Serialize, Ord, PartialEq, PartialOrd, Eq)]
struct AbbreviatedBlock {
pub num: u64,
pub time: DateTime<Utc>,
pub hash: H256,
}
impl From<Block<TxHash>> for AbbreviatedBlock {
fn from(x: Block<TxHash>) -> Self {
Self {
num: x.number.unwrap().as_u64(),
hash: x.hash.unwrap(),
time: x.time().unwrap(),
}
}
}
pub async fn main(
error_builder: SentrydErrorBuilder,
rpc: String,
others: Vec<String>,
max_age: i64,
max_lag: i64,
) -> SentrydResult {
let client = reqwest::Client::new();
let block_by_number_request = json!({
"jsonrpc": "2.0",
"id": "1",
"method": "eth_getBlockByNumber",
"params": ["latest", false],
});
let a = client
.post(&rpc)
.json(&block_by_number_request)
.send()
.await
.context(format!("error querying block from {}", rpc))
.map_err(|x| error_builder.build(x))?;
if !a.status().is_success() {
return error_builder.result(anyhow!("bad response from {}: {}", rpc, a.status()));
}
// TODO: capture response headers now in case of error. store them in the extra data on the pager duty alert
let headers = format!("{:#?}", a.headers());
let body = a
.text()
.await
.context(format!("failed parsing body from {}", rpc))
.map_err(|x| error_builder.build(x))?;
let a: JsonRpcResponse<Block<TxHash>> = serde_json::from_str(&body)
.context(format!("body: {}", body))
.context(format!("failed parsing json from {}", rpc))
.map_err(|x| error_builder.build(x))?;
let a = if let Some(block) = a.result {
block
} else if let Some(err) = a.error {
return error_builder.result(
anyhow::anyhow!("headers: {:#?}. err: {:#?}", headers, err)
.context(format!("jsonrpc error from {}: code {}", rpc, err.code)),
);
} else {
return error_builder
.result(anyhow!("{:#?}", a).context(format!("empty response from {}", rpc)));
};
// check the parent because b and c might not be as fast as a
let parent_hash = a.parent_hash;
let rpc_block = check_rpc(parent_hash, client.clone(), rpc.to_string())
.await
.context(format!("Error while querying primary rpc: {}", rpc))
.map_err(|err| error_builder.build(err))?;
let fs = FuturesUnordered::new();
for other in others.iter() {
let f = check_rpc(parent_hash, client.clone(), other.to_string());
fs.push(tokio::spawn(f));
}
let other_check: Vec<_> = fs.collect().await;
if other_check.is_empty() {
return error_builder.result(anyhow::anyhow!("No other RPCs to check!"));
}
// TODO: collect into a counter instead?
let mut newest_other = None;
for oc in other_check.iter() {
match oc {
Ok(Ok(x)) => newest_other = newest_other.max(Some(x)),
Ok(Err(err)) => warn!("failed checking other rpc: {:?}", err),
Err(err) => warn!("internal error checking other rpc: {:?}", err),
}
}
if let Some(newest_other) = newest_other {
let duration_since = newest_other
.time
.signed_duration_since(rpc_block.time)
.num_seconds();
match duration_since.abs().cmp(&max_lag) {
std::cmp::Ordering::Less | std::cmp::Ordering::Equal => {}
std::cmp::Ordering::Greater => match duration_since.cmp(&0) {
std::cmp::Ordering::Equal => {
unimplemented!("we already checked that they are not equal")
}
std::cmp::Ordering::Less => {
return error_builder.result(anyhow::anyhow!(
"Our RPC is too far ahead ({} s)! Something might be wrong.\n{:#}\nvs\n{:#}",
duration_since.abs(),
json!(rpc_block),
json!(newest_other),
).context(format!("{} is too far ahead", rpc)));
}
std::cmp::Ordering::Greater => {
return error_builder.result(
anyhow::anyhow!(
"Behind {} s!\n{:#}\nvs\n{:#}",
duration_since,
json!(rpc_block),
json!(newest_other),
)
.context(format!("{} is too far behind", rpc)),
);
}
},
}
let now = Utc::now();
let block_age = now
.signed_duration_since(newest_other.max(&rpc_block).time)
.num_seconds();
match block_age.abs().cmp(&max_age) {
std::cmp::Ordering::Less | std::cmp::Ordering::Equal => {}
std::cmp::Ordering::Greater => match duration_since.cmp(&0) {
std::cmp::Ordering::Equal => unimplemented!(),
std::cmp::Ordering::Less => {
return error_builder.result(
anyhow::anyhow!(
"Clock is behind {}s! Something might be wrong.\n{:#}\nvs\n{:#}",
block_age.abs(),
json!(now),
json!(newest_other),
)
.context(format!("Clock is too far behind on {}!", rpc)),
);
}
std::cmp::Ordering::Greater => {
return error_builder.result(
anyhow::anyhow!(
"block is too old ({}s)!\n{:#}\nvs\n{:#}",
block_age,
json!(now),
json!(newest_other),
)
.context(format!("block is too old on {}!", rpc)),
);
}
},
}
} else {
return error_builder.result(anyhow::anyhow!("No other RPC times to check!"));
}
debug!("rpc comparison ok: {:#}", json!(rpc_block));
Ok(())
}
// i don't think we need a whole provider. a simple http request is easiest
async fn check_rpc(
block_hash: H256,
client: reqwest::Client,
rpc: String,
) -> anyhow::Result<AbbreviatedBlock> {
let block_by_hash_request = json!({
"jsonrpc": "2.0",
"id": "1",
"method": "eth_getBlockByHash",
"params": [block_hash, false],
});
let response = client
.post(&rpc)
.json(&block_by_hash_request)
.send()
.await
.context(format!("awaiting response from {}", rpc))?;
if !response.status().is_success() {
return Err(anyhow::anyhow!(
"bad response from {}: {}",
rpc,
response.status(),
));
}
let body = response
.text()
.await
.context(format!("failed parsing body from {}", rpc))?;
let response_json: JsonRpcResponse<Block<TxHash>> = serde_json::from_str(&body)
.context(format!("body: {}", body))
.context(format!("failed parsing json from {}", rpc))?;
if let Some(result) = response_json.result {
let abbreviated = AbbreviatedBlock::from(result);
debug!("{} has {:?}@{}", rpc, abbreviated.hash, abbreviated.num);
Ok(abbreviated)
} else if let Some(result) = response_json.error {
Err(anyhow!(
"jsonrpc error during check_rpc from {}: {:#}",
rpc,
json!(result),
))
} else {
Err(anyhow!(
"empty result during check_rpc from {}: {:#}",
rpc,
json!(response_json)
))
}
}
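
The ahead/behind handling above reduces to a signed-seconds comparison against `max_lag`. A distilled sketch of that check (function names are illustrative, not part of the commit):

```
use chrono::{DateTime, Utc};

/// positive means `other` is ahead of `ours`, negative means it is behind
fn seconds_ahead(ours: DateTime<Utc>, other: DateTime<Utc>) -> i64 {
    other.signed_duration_since(ours).num_seconds()
}

fn check_lag(ours: DateTime<Utc>, other: DateTime<Utc>, max_lag: i64) -> Result<(), String> {
    let lag = seconds_ahead(ours, other);
    if lag.abs() <= max_lag {
        // within tolerance either way
        Ok(())
    } else if lag > 0 {
        Err(format!("our rpc is too far behind ({} s)", lag))
    } else {
        Err(format!("our rpc is too far ahead ({} s)", lag.abs()))
    }
}
```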

View File

@ -0,0 +1,284 @@
mod compare;
mod simple;
use anyhow::Context;
use argh::FromArgs;
use futures::{
stream::{FuturesUnordered, StreamExt},
Future,
};
use log::{error, info};
use pagerduty_rs::{eventsv2async::EventsV2 as PagerdutyAsyncEventsV2, types::Event};
use serde_json::json;
use std::time::Duration;
use tokio::sync::mpsc;
use tokio::time::{interval, MissedTickBehavior};
use web3_proxy::{config::TopConfig, pagerduty::pagerduty_alert};
#[derive(FromArgs, PartialEq, Debug, Eq)]
/// Loop healthchecks and send pager duty alerts if any fail
#[argh(subcommand, name = "sentryd")]
pub struct SentrydSubCommand {
#[argh(positional)]
/// the main (HTTP only) web3-proxy being checked.
web3_proxy: String,
/// the chain id to require. Only used if not using --config.
#[argh(option)]
chain_id: Option<u64>,
#[argh(option)]
/// warning threshold for age of the best known head block
max_age: i64,
#[argh(option)]
/// warning threshold for seconds between the rpc and best other_rpc's head blocks
max_lag: i64,
#[argh(option)]
/// other (HTTP only) rpcs to compare the main rpc to
other_rpc: Vec<String>,
#[argh(option)]
/// other (HTTP only) web3-proxies to compare the main rpc to
other_proxy: Vec<String>,
#[argh(option)]
/// how many seconds between running checks
seconds: Option<u64>,
}
#[derive(Debug)]
pub struct SentrydError {
/// The class/type of the event, for example ping failure or cpu load
class: String,
/// Errors will send a pagerduty alert; others just give log messages
level: log::Level,
/// A short summary that should be mostly static
summary: String,
/// Lots of detail about the error
extra: Option<serde_json::Value>,
}
/// helper for creating SentrydErrors
#[derive(Clone)]
pub struct SentrydErrorBuilder {
class: String,
level: log::Level,
}
impl SentrydErrorBuilder {
fn build(&self, err: anyhow::Error) -> SentrydError {
SentrydError {
class: self.class.to_owned(),
level: self.level.to_owned(),
summary: format!("{}", err),
extra: Some(json!(format!("{:#?}", err))),
}
}
fn result(&self, err: anyhow::Error) -> SentrydResult {
Err(self.build(err))
}
}
type SentrydResult = Result<(), SentrydError>;
impl SentrydSubCommand {
pub async fn main(
self,
pagerduty_async: Option<PagerdutyAsyncEventsV2>,
top_config: Option<TopConfig>,
) -> anyhow::Result<()> {
// sentry logging should already be configured
let chain_id = self
.chain_id
.or_else(|| top_config.map(|x| x.app.chain_id))
.context("--config or --chain-id required")?;
let primary_proxy = self.web3_proxy.trim_end_matches("/").to_string();
let other_proxy: Vec<_> = self
.other_proxy
.into_iter()
.map(|x| x.trim_end_matches("/").to_string())
.collect();
let other_rpc: Vec<_> = self
.other_rpc
.into_iter()
.map(|x| x.trim_end_matches("/").to_string())
.collect();
let seconds = self.seconds.unwrap_or(60);
let mut handles = FuturesUnordered::new();
// channels and a task for sending errors to logs/pagerduty
let (error_sender, mut error_receiver) = mpsc::channel::<SentrydError>(10);
{
let error_handler_f = async move {
if pagerduty_async.is_none() {
info!("set PAGERDUTY_INTEGRATION_KEY to send create alerts for errors");
}
while let Some(err) = error_receiver.recv().await {
log::log!(err.level, "check failed: {:#?}", err);
if matches!(err.level, log::Level::Error) {
let alert = pagerduty_alert(
Some(chain_id),
Some(err.class),
Some("web3-proxy-sentry".to_string()),
None,
None,
err.extra,
pagerduty_rs::types::Severity::Error,
None,
err.summary,
None,
);
if let Some(pagerduty_async) = pagerduty_async.as_ref() {
info!(
"sending to pagerduty: {:#}",
serde_json::to_string_pretty(&alert)?
);
if let Err(err) =
pagerduty_async.event(Event::AlertTrigger(alert)).await
{
error!("Failed sending to pagerduty: {:#?}", err);
}
}
}
}
Ok(())
};
handles.push(tokio::spawn(error_handler_f));
}
// spawn a bunch of health check loops that do their checks on an interval
// check the main rpc's /health endpoint
{
let url = if primary_proxy.contains("/rpc/") {
let x = primary_proxy.split("/rpc/").next().unwrap();
format!("{}/health", x)
} else {
format!("{}/health", primary_proxy)
};
let error_sender = error_sender.clone();
// TODO: what timeout?
let timeout = Duration::from_secs(5);
let loop_f = a_loop(
"main /health",
seconds,
log::Level::Error,
error_sender,
move |error_builder| simple::main(error_builder, url.clone(), timeout),
);
handles.push(tokio::spawn(loop_f));
}
// check any other web3-proxy /health endpoints
for other_web3_proxy in other_proxy.iter() {
let url = if other_web3_proxy.contains("/rpc/") {
let x = other_web3_proxy.split("/rpc/").next().unwrap();
format!("{}/health", x)
} else {
format!("{}/health", other_web3_proxy)
};
let error_sender = error_sender.clone();
// TODO: what timeout?
let timeout = Duration::from_secs(5);
let loop_f = a_loop(
"other /health",
seconds,
log::Level::Warn,
error_sender,
move |error_builder| simple::main(error_builder, url.clone(), timeout),
);
handles.push(tokio::spawn(loop_f));
}
// compare the main web3-proxy head block to all web3-proxies and rpcs
{
let max_age = self.max_age;
let max_lag = self.max_lag;
let primary_proxy = primary_proxy.clone();
let error_sender = error_sender.clone();
let mut others = other_proxy.clone();
others.extend(other_rpc);
let loop_f = a_loop(
"head block comparison",
seconds,
log::Level::Error,
error_sender,
move |error_builder| {
compare::main(
error_builder,
primary_proxy.clone(),
others.clone(),
max_age,
max_lag,
)
},
);
handles.push(tokio::spawn(loop_f));
}
// wait for any returned values (if everything is working, they will all run forever)
while let Some(x) = handles.next().await {
// any errors that make it here will end the program
x??;
}
Ok(())
}
}
async fn a_loop<T>(
class: &str,
seconds: u64,
error_level: log::Level,
error_sender: mpsc::Sender<SentrydError>,
f: impl Fn(SentrydErrorBuilder) -> T,
) -> anyhow::Result<()>
where
T: Future<Output = SentrydResult> + Send + 'static,
{
let error_builder = SentrydErrorBuilder {
class: class.to_owned(),
level: error_level,
};
let mut interval = interval(Duration::from_secs(seconds));
// TODO: should we warn if there are delays?
interval.set_missed_tick_behavior(MissedTickBehavior::Delay);
loop {
interval.tick().await;
if let Err(err) = f(error_builder.clone()).await {
error_sender.send(err).await?;
};
}
}
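
The /health URL derivation appears twice above (once for the primary proxy, once per other proxy). Distilled into a hypothetical helper for clarity; the commit itself inlines this logic:

```
/// strip an "/rpc/<key>" suffix so the top-level /health endpoint is queried
fn health_url(proxy: &str) -> String {
    if proxy.contains("/rpc/") {
        let base = proxy.split("/rpc/").next().unwrap();
        format!("{}/health", base)
    } else {
        format!("{}/health", proxy)
    }
}

#[test]
fn health_url_strips_keys() {
    assert_eq!(
        health_url("https://example.com/rpc/SOME_KEY"),
        "https://example.com/health"
    );
    assert_eq!(health_url("https://example.com"), "https://example.com/health");
}
```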

View File

@ -0,0 +1,60 @@
use std::time::Duration;
use super::{SentrydErrorBuilder, SentrydResult};
use anyhow::Context;
use log::{debug, trace};
use tokio::time::Instant;
/// GET the url and return an error if it wasn't a success
pub async fn main(
error_builder: SentrydErrorBuilder,
url: String,
timeout: Duration,
) -> SentrydResult {
let start = Instant::now();
let r = reqwest::get(&url)
.await
.context(format!("Failed GET {}", &url))
.map_err(|x| error_builder.build(x))?;
let elapsed = start.elapsed();
if elapsed > timeout {
return error_builder.result(
anyhow::anyhow!(
"query took longer than {}ms ({}ms): {:#?}",
timeout.as_millis(),
elapsed.as_millis(),
r
)
.context(format!("fetching {} took too long", &url)),
);
}
// TODO: what should we do if we get rate limited here?
if r.status().is_success() {
debug!("{} is healthy", &url);
trace!("Successful {:#?}", r);
return Ok(());
}
// TODO: capture headers? or is that already part of r?
let detail = format!("{:#?}", r);
let summary = format!("{} is unhealthy: {}", &url, r.status());
let body = r
.text()
.await
.context(detail.clone())
.context(summary.clone())
.map_err(|x| error_builder.build(x))?;
error_builder.result(
anyhow::anyhow!("body: {}", body)
.context(detail)
.context(summary),
)
}

View File

@ -4,6 +4,7 @@ use crate::rpcs::request::OpenRequestHandleMetrics;
use crate::{app::AnyhowJoinHandle, rpcs::blockchain::ArcBlock};
use argh::FromArgs;
use ethers::prelude::TxHash;
use ethers::types::U256;
use hashbrown::HashMap;
use log::warn;
use migration::sea_orm::DatabaseConnection;
@ -38,7 +39,7 @@ pub struct CliConfig {
pub cookie_key_filename: String,
}
#[derive(Debug, Deserialize)]
#[derive(Clone, Debug, Deserialize)]
pub struct TopConfig {
pub app: AppConfig,
pub balanced_rpcs: HashMap<String, Web3ConnectionConfig>,
@ -51,7 +52,7 @@ pub struct TopConfig {
/// shared configuration between Web3Connections
// TODO: no String, only &str
#[derive(Debug, Default, Deserialize)]
#[derive(Clone, Debug, Default, Deserialize)]
pub struct AppConfig {
/// Request limit for allowed origins for anonymous users.
/// These requests get rate limited by IP.
@ -90,6 +91,12 @@ pub struct AppConfig {
/// None = allow all requests
pub default_user_max_requests_per_period: Option<u64>,
/// minimum amount to increase eth_estimateGas results
pub gas_increase_min: Option<U256>,
/// percentage to increase eth_estimateGas results. 100 == 100%
pub gas_increase_percent: Option<U256>,
/// Restrict user registration.
/// None = no code needed
pub invite_code: Option<String>,
@ -183,7 +190,7 @@ fn default_response_cache_max_bytes() -> usize {
}
/// Configuration for a backend web3 RPC server
#[derive(Debug, Deserialize)]
#[derive(Clone, Debug, Deserialize)]
pub struct Web3ConnectionConfig {
/// simple way to disable a connection without deleting the row
#[serde(default)]
@ -198,6 +205,8 @@ pub struct Web3ConnectionConfig {
pub soft_limit: u32,
/// the requests per second at which the server throws errors (rate limit or otherwise)
pub hard_limit: Option<u64>,
/// only use this rpc if everything else is lagging too far. this allows us to ignore fast but very low limit rpcs
pub backup: Option<bool>,
/// All else equal, a server with a lower tier receives all requests
#[serde(default = "default_tier")]
pub tier: u64,
@ -221,7 +230,6 @@ impl Web3ConnectionConfig {
pub async fn spawn(
self,
name: String,
allowed_lag: u64,
db_conn: Option<DatabaseConnection>,
redis_pool: Option<redis_rate_limiter::RedisPool>,
chain_id: u64,
@ -256,9 +264,10 @@ impl Web3ConnectionConfig {
None
};
let backup = self.backup.unwrap_or(false);
Web3Connection::spawn(
name,
allowed_lag,
self.display_name,
chain_id,
db_conn,
@ -267,6 +276,7 @@ impl Web3ConnectionConfig {
http_interval_sender,
hard_limit,
self.soft_limit,
backup,
self.block_data_limit,
block_map,
block_sender,
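
For reference, the new `backup` flag slots into the same struct built by the test near the top of this commit. A sketch of marking a connection as a low-priority backup (illustrative values; the field set mirrors that test):

```
use web3_proxy::config::Web3ConnectionConfig;

/// a hypothetical fallback rpc, only used when everything else lags too far
fn example_backup_rpc() -> Web3ConnectionConfig {
    Web3ConnectionConfig {
        disabled: false,
        display_name: Some("fallback".to_string()),
        url: "https://example.com/rpc".to_string(),
        backup: Some(true),
        block_data_limit: None,
        soft_limit: 10,
        hard_limit: None,
        tier: 2,
        subscribe_txs: Some(false),
        extra: Default::default(),
    }
}
```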

View File

@ -85,6 +85,7 @@ pub struct RequestMetadata {
pub error_response: AtomicBool,
pub response_bytes: AtomicU64,
pub response_millis: AtomicU64,
pub response_from_backup_rpc: AtomicBool,
}
impl RequestMetadata {
@ -103,6 +104,7 @@ impl RequestMetadata {
error_response: false.into(),
response_bytes: 0.into(),
response_millis: 0.into(),
response_from_backup_rpc: false.into(),
};
Ok(new)
@ -660,13 +662,11 @@ impl Web3ProxyApp {
let db_replica = self.db_replica().context("Getting database connection")?;
let rpc_secret_key: Uuid = rpc_secret_key.into();
// TODO: join the user table to this to return the User? we don't always need it
// TODO: join on secondary users
// TODO: join on user tier
match rpc_key::Entity::find()
.filter(rpc_key::Column::SecretKey.eq(rpc_secret_key))
.filter(rpc_key::Column::SecretKey.eq(<Uuid>::from(rpc_secret_key)))
.filter(rpc_key::Column::Active.eq(true))
.one(db_replica.conn())
.await?
@ -741,7 +741,8 @@ impl Web3ProxyApp {
Ok(AuthorizationChecks {
user_id: rpc_key_model.user_id,
rpc_key_id,
rpc_secret_key: Some(rpc_secret_key),
rpc_secret_key_id: rpc_key_id,
allowed_ips,
allowed_origins,
allowed_referers,
@ -774,7 +775,7 @@ impl Web3ProxyApp {
let authorization_checks = self.authorization_checks(rpc_key).await?;
// if no rpc_key_id matching the given rpc was found, then we can't rate limit by key
if authorization_checks.rpc_key_id.is_none() {
if authorization_checks.rpc_secret_key_id.is_none() {
return Ok(RateLimitResult::UnknownKey);
}
@ -845,3 +846,29 @@ impl Web3ProxyApp {
}
}
}
impl Authorization {
pub async fn check_again(
&self,
app: &Arc<Web3ProxyApp>,
) -> Result<(Arc<Self>, Option<OwnedSemaphorePermit>), FrontendErrorResponse> {
// TODO: we could probably do this without clones. but this is easy
let (a, s) = if let Some(rpc_secret_key) = self.checks.rpc_secret_key {
key_is_authorized(
app,
rpc_secret_key,
self.ip,
self.origin.clone(),
self.referer.clone(),
self.user_agent.clone(),
)
.await?
} else {
ip_is_authorized(app, self.ip, self.origin.clone()).await?
};
let a = Arc::new(a);
Ok((a, s))
}
}

View File

@ -35,7 +35,6 @@ pub enum FrontendErrorResponse {
NotFound,
RateLimited(Authorization, Option<Instant>),
Redis(RedisError),
Response(Response),
/// simple way to return an error message to the user and an anyhow to our logs
StatusCode(StatusCode, String, Option<anyhow::Error>),
/// TODO: what should be attached to the timout?
@ -44,11 +43,9 @@ pub enum FrontendErrorResponse {
UnknownKey,
}
impl IntoResponse for FrontendErrorResponse {
fn into_response(self) -> Response {
// TODO: include the request id in these so that users can give us something that will point to logs
// TODO: status code is in the jsonrpc response and is also the first item in the tuple. DRY
let (status_code, response) = match self {
impl FrontendErrorResponse {
pub fn into_response_parts(self) -> (StatusCode, JsonRpcForwardedResponse) {
match self {
Self::AccessDenied => {
// TODO: attach something to this trace. probably don't include much in the message though. don't want to leak creds by accident
trace!("access denied");
@ -174,12 +171,12 @@ impl IntoResponse for FrontendErrorResponse {
};
// create a string with either the IP or the rpc_key_id
let msg = if authorization.checks.rpc_key_id.is_none() {
let msg = if authorization.checks.rpc_secret_key_id.is_none() {
format!("too many requests from {}.{}", authorization.ip, retry_msg)
} else {
format!(
"too many requests from rpc key #{}.{}",
authorization.checks.rpc_key_id.unwrap(),
authorization.checks.rpc_secret_key_id.unwrap(),
retry_msg
)
};
@ -204,10 +201,6 @@ impl IntoResponse for FrontendErrorResponse {
),
)
}
Self::Response(r) => {
debug_assert_ne!(r.status(), StatusCode::OK);
return r;
}
Self::SemaphoreAcquireError(err) => {
warn!("semaphore acquire err={:?}", err);
(
@ -274,7 +267,15 @@ impl IntoResponse for FrontendErrorResponse {
None,
),
),
};
}
}
}
impl IntoResponse for FrontendErrorResponse {
fn into_response(self) -> Response {
// TODO: include the request id in these so that users can give us something that will point to logs
// TODO: status code is in the jsonrpc response and is also the first item in the tuple. DRY
let (status_code, response) = self.into_response_parts();
(status_code, Json(response)).into_response()
}

View File

@ -41,28 +41,102 @@ pub async fn serve(port: u16, proxy_app: Arc<Web3ProxyApp>) -> anyhow::Result<()
.time_to_live(Duration::from_secs(1))
.build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default());
// TODO: read config for if fastest/versus should be available publicly. default off
// build our axum Router
let app = Router::new()
// routes should be ordered most to least common
// TODO: i think these routes could be done a lot better
//
// HTTP RPC (POST)
//
// public
.route("/", post(rpc_proxy_http::proxy_web3_rpc))
// authenticated with and without trailing slash
.route(
"/rpc/:rpc_key/",
post(rpc_proxy_http::proxy_web3_rpc_with_key),
)
.route(
"/rpc/:rpc_key",
post(rpc_proxy_http::proxy_web3_rpc_with_key),
)
// public fastest with and without trailing slash
.route("/fastest/", post(rpc_proxy_http::fastest_proxy_web3_rpc))
.route("/fastest", post(rpc_proxy_http::fastest_proxy_web3_rpc))
// authenticated fastest with and without trailing slash
.route(
"/fastest/:rpc_key/",
post(rpc_proxy_http::fastest_proxy_web3_rpc_with_key),
)
.route(
"/fastest/:rpc_key",
post(rpc_proxy_http::fastest_proxy_web3_rpc_with_key),
)
// public versus
.route("/versus/", post(rpc_proxy_http::versus_proxy_web3_rpc))
.route("/versus", post(rpc_proxy_http::versus_proxy_web3_rpc))
// authenticated versus with and without trailing slash
.route(
"/versus/:rpc_key/",
post(rpc_proxy_http::versus_proxy_web3_rpc_with_key),
)
.route(
"/versus/:rpc_key",
post(rpc_proxy_http::versus_proxy_web3_rpc_with_key),
)
//
// Websocket RPC (GET)
// If not an RPC, this will redirect to configurable urls
//
// public
.route("/", get(rpc_proxy_ws::websocket_handler))
.route(
"/rpc/:rpc_key",
post(rpc_proxy_http::proxy_web3_rpc_with_key),
)
// authenticated with and without trailing slash
.route(
"/rpc/:rpc_key/",
post(rpc_proxy_http::proxy_web3_rpc_with_key),
get(rpc_proxy_ws::websocket_handler_with_key),
)
.route(
"/rpc/:rpc_key",
get(rpc_proxy_ws::websocket_handler_with_key),
)
// public fastest with and without trailing slash
.route("/fastest/", get(rpc_proxy_ws::fastest_websocket_handler))
.route("/fastest", get(rpc_proxy_ws::fastest_websocket_handler))
// authenticated fastest with and without trailing slash
.route(
"/rpc/:rpc_key/",
get(rpc_proxy_ws::websocket_handler_with_key),
"/fastest/:rpc_key/",
get(rpc_proxy_ws::fastest_websocket_handler_with_key),
)
.route(
"/fastest/:rpc_key",
get(rpc_proxy_ws::fastest_websocket_handler_with_key),
)
// public versus
.route(
"/versus/",
get(rpc_proxy_ws::versus_websocket_handler),
)
.route(
"/versus",
get(rpc_proxy_ws::versus_websocket_handler),
)
// authenticated versus with and without trailing slash
.route(
"/versus/:rpc_key/",
get(rpc_proxy_ws::versus_websocket_handler_with_key),
)
.route(
"/versus/:rpc_key",
get(rpc_proxy_ws::versus_websocket_handler_with_key),
)
//
// System things
//
.route("/health", get(status::health))
.route("/status", get(status::status))
//
// User stuff
//
.route("/user/login/:user_address", get(users::user_login_get))
.route(
"/user/login/:user_address/:message_eip",
@ -88,9 +162,11 @@ pub async fn serve(port: u16, proxy_app: Arc<Web3ProxyApp>) -> anyhow::Result<()
.route("/user/stats/detailed", get(users::user_stats_detailed_get))
.route("/admin/modify_role", get(admin::admin_change_user_roles))
.route("/user/logout", post(users::user_logout_post))
.route("/status", get(status::status))
//
// Axum layers
// layers are ordered bottom up
// the last layer is first for requests and last for responses
//
// Mark the `Authorization` request header as sensitive so it doesn't show in logs
.layer(SetSensitiveRequestHeadersLayer::new(once(AUTHORIZATION)))
// handle cors
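
Any JSON-RPC POST exercises the new public routes above. A hedged sketch using reqwest against a locally running proxy (the URL and port are assumptions; adjust for your deployment):

```
use serde_json::json;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let client = reqwest::Client::new();

    let request = json!({
        "jsonrpc": "2.0",
        "id": "1",
        "method": "eth_getBlockByNumber",
        "params": ["latest", false],
    });

    // "/fastest" races the request across synced backends; "/" uses the single best server
    let response: serde_json::Value = client
        .post("http://127.0.0.1:8544/fastest")
        .json(&request)
        .send()
        .await?
        .json()
        .await?;

    println!("{}", response);

    Ok(())
}
```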

View File

@ -2,6 +2,7 @@
use super::authorization::{ip_is_authorized, key_is_authorized};
use super::errors::FrontendResult;
use super::rpc_proxy_ws::ProxyMode;
use crate::{app::Web3ProxyApp, jsonrpc::JsonRpcRequestEnum};
use axum::extract::Path;
use axum::headers::{Origin, Referer, UserAgent};
@ -18,9 +19,41 @@ use std::sync::Arc;
#[debug_handler]
pub async fn proxy_web3_rpc(
Extension(app): Extension<Arc<Web3ProxyApp>>,
ClientIp(ip): ClientIp,
ip: ClientIp,
origin: Option<TypedHeader<Origin>>,
Json(payload): Json<JsonRpcRequestEnum>,
) -> FrontendResult {
_proxy_web3_rpc(app, ip, origin, payload, ProxyMode::Best).await
}
#[debug_handler]
pub async fn fastest_proxy_web3_rpc(
Extension(app): Extension<Arc<Web3ProxyApp>>,
ip: ClientIp,
origin: Option<TypedHeader<Origin>>,
Json(payload): Json<JsonRpcRequestEnum>,
) -> FrontendResult {
// TODO: read the fastest number from params
// TODO: check that the app allows this without authentication
_proxy_web3_rpc(app, ip, origin, payload, ProxyMode::Fastest(0)).await
}
#[debug_handler]
pub async fn versus_proxy_web3_rpc(
Extension(app): Extension<Arc<Web3ProxyApp>>,
ip: ClientIp,
origin: Option<TypedHeader<Origin>>,
Json(payload): Json<JsonRpcRequestEnum>,
) -> FrontendResult {
_proxy_web3_rpc(app, ip, origin, payload, ProxyMode::Versus).await
}
async fn _proxy_web3_rpc(
app: Arc<Web3ProxyApp>,
ClientIp(ip): ClientIp,
origin: Option<TypedHeader<Origin>>,
payload: JsonRpcRequestEnum,
proxy_mode: ProxyMode,
) -> FrontendResult {
// TODO: benchmark spawning this
// TODO: do we care about keeping the TypedHeader wrapper?
@ -31,7 +64,7 @@ pub async fn proxy_web3_rpc(
let authorization = Arc::new(authorization);
let (response, rpcs, _semaphore) = app
.proxy_web3_rpc(authorization, payload)
.proxy_web3_rpc(authorization, payload, proxy_mode)
.await
.map(|(x, y)| (x, y, semaphore))?;
@ -58,12 +91,82 @@ pub async fn proxy_web3_rpc(
#[debug_handler]
pub async fn proxy_web3_rpc_with_key(
Extension(app): Extension<Arc<Web3ProxyApp>>,
ClientIp(ip): ClientIp,
ip: ClientIp,
origin: Option<TypedHeader<Origin>>,
referer: Option<TypedHeader<Referer>>,
user_agent: Option<TypedHeader<UserAgent>>,
Path(rpc_key): Path<String>,
Json(payload): Json<JsonRpcRequestEnum>,
) -> FrontendResult {
_proxy_web3_rpc_with_key(
app,
ip,
origin,
referer,
user_agent,
rpc_key,
payload,
ProxyMode::Best,
)
.await
}
#[debug_handler]
pub async fn fastest_proxy_web3_rpc_with_key(
Extension(app): Extension<Arc<Web3ProxyApp>>,
ip: ClientIp,
origin: Option<TypedHeader<Origin>>,
referer: Option<TypedHeader<Referer>>,
user_agent: Option<TypedHeader<UserAgent>>,
Path(rpc_key): Path<String>,
Json(payload): Json<JsonRpcRequestEnum>,
) -> FrontendResult {
_proxy_web3_rpc_with_key(
app,
ip,
origin,
referer,
user_agent,
rpc_key,
payload,
ProxyMode::Fastest(0),
)
.await
}
#[debug_handler]
pub async fn versus_proxy_web3_rpc_with_key(
Extension(app): Extension<Arc<Web3ProxyApp>>,
ip: ClientIp,
origin: Option<TypedHeader<Origin>>,
referer: Option<TypedHeader<Referer>>,
user_agent: Option<TypedHeader<UserAgent>>,
Path(rpc_key): Path<String>,
Json(payload): Json<JsonRpcRequestEnum>,
) -> FrontendResult {
_proxy_web3_rpc_with_key(
app,
ip,
origin,
referer,
user_agent,
rpc_key,
payload,
ProxyMode::Versus,
)
.await
}
#[allow(clippy::too_many_arguments)]
async fn _proxy_web3_rpc_with_key(
app: Arc<Web3ProxyApp>,
ClientIp(ip): ClientIp,
origin: Option<TypedHeader<Origin>>,
referer: Option<TypedHeader<Referer>>,
user_agent: Option<TypedHeader<UserAgent>>,
rpc_key: String,
payload: JsonRpcRequestEnum,
proxy_mode: ProxyMode,
) -> FrontendResult {
// TODO: DRY w/ proxy_web3_rpc
// the request can take a while, so we spawn so that we can start serving another request
@ -82,7 +185,7 @@ pub async fn proxy_web3_rpc_with_key(
let authorization = Arc::new(authorization);
let (response, rpcs, _semaphore) = app
.proxy_web3_rpc(authorization, payload)
.proxy_web3_rpc(authorization, payload, proxy_mode)
.await
.map(|(x, y)| (x, y, semaphore))?;

View File

@ -32,11 +32,60 @@ use serde_json::json;
use serde_json::value::to_raw_value;
use std::sync::Arc;
use std::{str::from_utf8_mut, sync::atomic::AtomicUsize};
use tokio::sync::{broadcast, OwnedSemaphorePermit, RwLock};
#[derive(Copy, Clone)]
pub enum ProxyMode {
/// send to the "best" synced server
Best,
/// send to all synced servers and return the fastest non-error response (reverts do not count as errors here)
Fastest(usize),
/// send to all servers for benchmarking. return the fastest non-error response
Versus,
}
/// Public entrypoint for WebSocket JSON-RPC requests.
/// Queries a single server at a time
#[debug_handler]
pub async fn websocket_handler(
Extension(app): Extension<Arc<Web3ProxyApp>>,
ip: ClientIp,
origin: Option<TypedHeader<Origin>>,
ws_upgrade: Option<WebSocketUpgrade>,
) -> FrontendResult {
_websocket_handler(ProxyMode::Best, app, ip, origin, ws_upgrade).await
}
/// Public entrypoint for WebSocket JSON-RPC requests that uses all synced servers.
/// Queries all synced backends with every request! This might get expensive!
#[debug_handler]
pub async fn fastest_websocket_handler(
Extension(app): Extension<Arc<Web3ProxyApp>>,
ip: ClientIp,
origin: Option<TypedHeader<Origin>>,
ws_upgrade: Option<WebSocketUpgrade>,
) -> FrontendResult {
// TODO: get the fastest number from the url params (default to 0/all)
// TODO: config to disable this
_websocket_handler(ProxyMode::Fastest(0), app, ip, origin, ws_upgrade).await
}
/// Public entrypoint for WebSocket JSON-RPC requests that uses all servers.
/// Queries **all** backends with every request! This might get expensive!
#[debug_handler]
pub async fn versus_websocket_handler(
Extension(app): Extension<Arc<Web3ProxyApp>>,
ip: ClientIp,
origin: Option<TypedHeader<Origin>>,
ws_upgrade: Option<WebSocketUpgrade>,
) -> FrontendResult {
// TODO: config to disable this
_websocket_handler(ProxyMode::Versus, app, ip, origin, ws_upgrade).await
}
async fn _websocket_handler(
proxy_mode: ProxyMode,
app: Arc<Web3ProxyApp>,
ClientIp(ip): ClientIp,
origin: Option<TypedHeader<Origin>>,
ws_upgrade: Option<WebSocketUpgrade>,
@ -49,7 +98,7 @@ pub async fn websocket_handler(
match ws_upgrade {
Some(ws) => Ok(ws
.on_upgrade(|socket| proxy_web3_socket(app, authorization, socket))
.on_upgrade(move |socket| proxy_web3_socket(app, authorization, socket, proxy_mode))
.into_response()),
None => {
if let Some(redirect) = &app.config.redirect_public_url {
@ -72,12 +121,83 @@ pub async fn websocket_handler(
#[debug_handler]
pub async fn websocket_handler_with_key(
Extension(app): Extension<Arc<Web3ProxyApp>>,
ClientIp(ip): ClientIp,
ip: ClientIp,
Path(rpc_key): Path<String>,
origin: Option<TypedHeader<Origin>>,
referer: Option<TypedHeader<Referer>>,
user_agent: Option<TypedHeader<UserAgent>>,
ws_upgrade: Option<WebSocketUpgrade>,
) -> FrontendResult {
_websocket_handler_with_key(
ProxyMode::Best,
app,
ip,
rpc_key,
origin,
referer,
user_agent,
ws_upgrade,
)
.await
}
#[debug_handler]
pub async fn fastest_websocket_handler_with_key(
Extension(app): Extension<Arc<Web3ProxyApp>>,
ip: ClientIp,
Path(rpc_key): Path<String>,
origin: Option<TypedHeader<Origin>>,
referer: Option<TypedHeader<Referer>>,
user_agent: Option<TypedHeader<UserAgent>>,
ws_upgrade: Option<WebSocketUpgrade>,
) -> FrontendResult {
// TODO: get the fastest number from the url params (default to 0/all)
_websocket_handler_with_key(
ProxyMode::Fastest(0),
app,
ip,
rpc_key,
origin,
referer,
user_agent,
ws_upgrade,
)
.await
}
#[debug_handler]
pub async fn versus_websocket_handler_with_key(
Extension(app): Extension<Arc<Web3ProxyApp>>,
ip: ClientIp,
Path(rpc_key): Path<String>,
origin: Option<TypedHeader<Origin>>,
referer: Option<TypedHeader<Referer>>,
user_agent: Option<TypedHeader<UserAgent>>,
ws_upgrade: Option<WebSocketUpgrade>,
) -> FrontendResult {
_websocket_handler_with_key(
ProxyMode::Versus,
app,
ip,
rpc_key,
origin,
referer,
user_agent,
ws_upgrade,
)
.await
}
#[allow(clippy::too_many_arguments)]
async fn _websocket_handler_with_key(
proxy_mode: ProxyMode,
app: Arc<Web3ProxyApp>,
ClientIp(ip): ClientIp,
rpc_key: String,
origin: Option<TypedHeader<Origin>>,
referer: Option<TypedHeader<Referer>>,
user_agent: Option<TypedHeader<UserAgent>>,
ws_upgrade: Option<WebSocketUpgrade>,
) -> FrontendResult {
let rpc_key = rpc_key.parse()?;
@ -96,9 +216,8 @@ pub async fn websocket_handler_with_key(
let authorization = Arc::new(authorization);
match ws_upgrade {
Some(ws_upgrade) => {
Ok(ws_upgrade.on_upgrade(move |socket| proxy_web3_socket(app, authorization, socket)))
}
Some(ws_upgrade) => Ok(ws_upgrade
.on_upgrade(move |socket| proxy_web3_socket(app, authorization, socket, proxy_mode))),
None => {
// if no websocket upgrade, this is probably a user loading the url with their browser
@ -107,7 +226,7 @@ pub async fn websocket_handler_with_key(
match (
&app.config.redirect_public_url,
&app.config.redirect_rpc_key_url,
authorization.checks.rpc_key_id,
authorization.checks.rpc_secret_key_id,
) {
(None, None, _) => Err(FrontendErrorResponse::StatusCode(
StatusCode::BAD_REQUEST,
@ -120,7 +239,7 @@ pub async fn websocket_handler_with_key(
(_, Some(redirect_rpc_key_url), rpc_key_id) => {
let reg = Handlebars::new();
if authorization.checks.rpc_key_id.is_none() {
if authorization.checks.rpc_secret_key_id.is_none() {
// i don't think this is possible
Err(FrontendErrorResponse::StatusCode(
StatusCode::UNAUTHORIZED,
@ -154,6 +273,7 @@ async fn proxy_web3_socket(
app: Arc<Web3ProxyApp>,
authorization: Arc<Authorization>,
socket: WebSocket,
proxy_mode: ProxyMode,
) {
// split the websocket so we can read and write concurrently
let (ws_tx, ws_rx) = socket.split();
@ -162,7 +282,13 @@ async fn proxy_web3_socket(
let (response_sender, response_receiver) = flume::unbounded::<Message>();
tokio::spawn(write_web3_socket(response_receiver, ws_tx));
tokio::spawn(read_web3_socket(app, authorization, ws_rx, response_sender));
tokio::spawn(read_web3_socket(
app,
authorization,
ws_rx,
response_sender,
proxy_mode,
));
}
/// websockets support a few more methods than http clients
@ -172,8 +298,20 @@ async fn handle_socket_payload(
payload: &str,
response_sender: &flume::Sender<Message>,
subscription_count: &AtomicUsize,
subscriptions: &mut HashMap<String, AbortHandle>,
) -> Message {
subscriptions: Arc<RwLock<HashMap<String, AbortHandle>>>,
proxy_mode: ProxyMode,
) -> (Message, Option<OwnedSemaphorePermit>) {
let (authorization, semaphore) = match authorization.check_again(&app).await {
Ok((a, s)) => (a, s),
Err(err) => {
let (_, err) = err.into_response_parts();
let err = serde_json::to_string(&err).expect("to_string should always work here");
return (Message::Text(err), None);
}
};
// TODO: do any clients send batches over websockets?
let (id, response) = match serde_json::from_str::<JsonRpcRequest>(payload) {
Ok(json_request) => {
@ -183,6 +321,7 @@ async fn handle_socket_payload(
[..]
{
"eth_subscribe" => {
// TODO: how can we subscribe with proxy_mode?
match app
.eth_subscribe(
authorization.clone(),
@ -194,7 +333,9 @@ async fn handle_socket_payload(
{
Ok((handle, response)) => {
// TODO: better key
subscriptions.insert(
let mut x = subscriptions.write().await;
x.insert(
response
.result
.as_ref()
@ -218,8 +359,10 @@ async fn handle_socket_payload(
let subscription_id = json_request.params.unwrap().to_string();
let mut x = subscriptions.write().await;
// TODO: is this the right response?
let partial_response = match subscriptions.remove(&subscription_id) {
let partial_response = match x.remove(&subscription_id) {
None => false,
Some(handle) => {
handle.abort();
@ -227,6 +370,8 @@ async fn handle_socket_payload(
}
};
drop(x);
let response =
JsonRpcForwardedResponse::from_value(json!(partial_response), id.clone());
@ -247,7 +392,7 @@ async fn handle_socket_payload(
Ok(response.into())
}
_ => app
.proxy_web3_rpc(authorization.clone(), json_request.into())
.proxy_web3_rpc(authorization.clone(), json_request.into(), proxy_mode)
.await
.map_or_else(
|err| match err {
@ -281,9 +426,7 @@ async fn handle_socket_payload(
}
};
// TODO: what error should this be?
Message::Text(response_str)
(Message::Text(response_str), semaphore)
}
async fn read_web3_socket(
@ -291,60 +434,99 @@ async fn read_web3_socket(
authorization: Arc<Authorization>,
mut ws_rx: SplitStream<WebSocket>,
response_sender: flume::Sender<Message>,
proxy_mode: ProxyMode,
) {
let mut subscriptions = HashMap::new();
let subscription_count = AtomicUsize::new(1);
// TODO: need a concurrent hashmap
let subscriptions = Arc::new(RwLock::new(HashMap::new()));
let subscription_count = Arc::new(AtomicUsize::new(1));
while let Some(Ok(msg)) = ws_rx.next().await {
// TODO: spawn this?
// new message from our client. forward to a backend and then send it through response_tx
let response_msg = match msg {
Message::Text(payload) => {
handle_socket_payload(
app.clone(),
&authorization,
&payload,
&response_sender,
&subscription_count,
&mut subscriptions,
)
.await
let (close_sender, mut close_receiver) = broadcast::channel(1);
loop {
tokio::select! {
msg = ws_rx.next() => {
if let Some(Ok(msg)) = msg {
// spawn so that we can serve responses from this loop even faster
// TODO: only do these clones if the msg is text/binary?
let close_sender = close_sender.clone();
let app = app.clone();
let authorization = authorization.clone();
let response_sender = response_sender.clone();
let subscriptions = subscriptions.clone();
let subscription_count = subscription_count.clone();
let f = async move {
let mut _semaphore = None;
// new message from our client. forward to a backend and then send it through response_tx
let response_msg = match msg {
Message::Text(payload) => {
let (msg, s) = handle_socket_payload(
app.clone(),
&authorization,
&payload,
&response_sender,
&subscription_count,
subscriptions,
proxy_mode,
)
.await;
_semaphore = s;
msg
}
Message::Ping(x) => {
trace!("ping: {:?}", x);
Message::Pong(x)
}
Message::Pong(x) => {
trace!("pong: {:?}", x);
return;
}
Message::Close(_) => {
info!("closing websocket connection");
// TODO: do something to close subscriptions?
let _ = close_sender.send(true);
return;
}
Message::Binary(mut payload) => {
let payload = from_utf8_mut(&mut payload).unwrap();
let (msg, s) = handle_socket_payload(
app.clone(),
&authorization,
payload,
&response_sender,
&subscription_count,
subscriptions,
proxy_mode,
)
.await;
_semaphore = s;
msg
}
};
if response_sender.send_async(response_msg).await.is_err() {
let _ = close_sender.send(true);
return;
};
_semaphore = None;
};
tokio::spawn(f);
} else {
break;
}
}
Message::Ping(x) => {
trace!("ping: {:?}", x);
Message::Pong(x)
}
Message::Pong(x) => {
trace!("pong: {:?}", x);
continue;
}
Message::Close(_) => {
info!("closing websocket connection");
_ = close_receiver.recv() => {
break;
}
Message::Binary(mut payload) => {
// TODO: poke rate limit for the user/ip
let payload = from_utf8_mut(&mut payload).unwrap();
handle_socket_payload(
app.clone(),
&authorization,
payload,
&response_sender,
&subscription_count,
&mut subscriptions,
)
.await
}
};
match response_sender.send_async(response_msg).await {
Ok(_) => {}
Err(err) => {
error!("{}", err);
break;
}
};
}
}
}
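
The rewritten read loop spawns a task per incoming frame and uses a one-slot broadcast channel so that either a `Close` frame or a failed send can stop the loop from inside a spawned task. A hedged sketch of that shutdown pattern, with an mpsc receiver standing in for the websocket `SplitStream`:

```
// Sketch of the spawn-per-message read loop with a broadcast "close" signal.
// An mpsc receiver stands in for the websocket stream; real frames, app state,
// and the flume response channel are omitted.
use tokio::sync::{broadcast, mpsc};

#[tokio::main]
async fn main() {
    let (msg_tx, mut msg_rx) = mpsc::channel::<String>(16);
    let (close_tx, mut close_rx) = broadcast::channel::<bool>(1);

    // pretend client: send a couple of messages, then ask to close
    tokio::spawn(async move {
        msg_tx.send("eth_blockNumber".into()).await.ok();
        msg_tx.send("close".into()).await.ok();
    });

    loop {
        tokio::select! {
            maybe_msg = msg_rx.recv() => {
                match maybe_msg {
                    Some(msg) => {
                        // handle each message on its own task so a slow request
                        // doesn't block reading the next frame
                        let close_tx = close_tx.clone();
                        tokio::spawn(async move {
                            if msg == "close" {
                                let _ = close_tx.send(true);
                                return;
                            }
                            println!("handling {msg}");
                        });
                    }
                    None => break, // stream ended
                }
            }
            _ = close_rx.recv() => {
                println!("close requested; leaving read loop");
                break;
            }
        }
    }
}
```

Spawning per frame means a slow backend request no longer blocks reading (and answering) the next frame on the same socket, which is what the "serve responses from this loop even faster" comment above is after.
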

View File

@ -4,7 +4,7 @@
//! They will eventually move to another port.
use super::{FrontendResponseCache, FrontendResponseCaches};
use crate::app::Web3ProxyApp;
use crate::app::{Web3ProxyApp, APP_USER_AGENT};
use axum::{http::StatusCode, response::IntoResponse, Extension, Json};
use axum_macros::debug_handler;
use serde_json::json;
@ -33,6 +33,7 @@ pub async fn status(
.get_with(FrontendResponseCaches::Status, async {
// TODO: what else should we include? uptime, cache hit rates, cpu load, memory used
let body = json!({
"version": APP_USER_AGENT,
"chain_id": app.config.chain_id,
"balanced_rpcs": app.balanced_rpcs,
"private_rpcs": app.private_rpcs,

View File

@ -7,6 +7,7 @@ pub mod frontend;
pub mod jsonrpc;
pub mod metered;
pub mod metrics_frontend;
pub mod pagerduty;
pub mod rpcs;
pub mod user_queries;
pub mod user_token;

191
web3_proxy/src/pagerduty.rs Normal file
View File

@ -0,0 +1,191 @@
use crate::config::TopConfig;
use gethostname::gethostname;
use log::{debug, error};
use pagerduty_rs::eventsv2sync::EventsV2 as PagerdutySyncEventsV2;
use pagerduty_rs::types::{AlertTrigger, AlertTriggerPayload, Event};
use serde::Serialize;
use std::{
collections::hash_map::DefaultHasher,
hash::{Hash, Hasher},
panic::PanicInfo,
};
use time::OffsetDateTime;
/*
let client = top_config
.as_ref()
.map(|top_config| format!("web3-proxy chain #{}", top_config.app.chain_id))
.unwrap_or_else(|| format!("web3-proxy w/o chain"));
let client_url = top_config
.as_ref()
.and_then(|x| x.app.redirect_public_url.clone());
panic::set_hook(Box::new(move |x| {
let hostname = gethostname().into_string().unwrap_or("unknown".to_string());
let panic_msg = format!("{} {:?}", x, x);
if panic_msg.starts_with("panicked at 'WS Server panic") {
info!("Underlying library {}", panic_msg);
} else {
error!("sending panic to pagerduty: {}", panic_msg);
let mut s = DefaultHasher::new();
panic_msg.hash(&mut s);
panic_msg.hash(&mut s);
let dedup_key = s.finish().to_string();
let payload = AlertTriggerPayload {
severity: pagerduty_rs::types::Severity::Error,
summary: panic_msg,
source: hostname,
timestamp: None,
component: None,
group: Some("web3-proxy".to_string()),
class: Some("panic".to_string()),
custom_details: None::<()>,
};
let event = Event::AlertTrigger(AlertTrigger {
payload,
dedup_key: Some(dedup_key),
images: None,
links: None,
client: Some(client.clone()),
client_url: client_url.clone(),
});
if let Err(err) = pagerduty_sync.event(event) {
error!("Failed sending panic to pagerduty: {}", err);
}
}
}));
*/
pub fn panic_handler(
top_config: Option<TopConfig>,
pagerduty_sync: &PagerdutySyncEventsV2,
panic_info: &PanicInfo,
) {
let summary = format!("{}", panic_info);
let details = format!("{:#?}", panic_info);
if summary.starts_with("panicked at 'WS Server panic") {
// the ethers-rs library panics when websockets disconnect. this isn't a panic we care about reporting
debug!("Underlying library {}", details);
return;
}
let class = Some("panic".to_string());
let alert = if let Some(top_config) = top_config {
pagerduty_alert_for_config(
class,
None,
Some(details),
pagerduty_rs::types::Severity::Critical,
summary,
None,
top_config,
)
} else {
pagerduty_alert(
None,
class,
None,
None,
None,
Some(details),
pagerduty_rs::types::Severity::Critical,
None,
summary,
None,
)
};
let event = Event::AlertTrigger(alert);
if let Err(err) = pagerduty_sync.event(event) {
error!("Failed sending alert to pagerduty! {:#?}", err);
}
}
pub fn pagerduty_alert_for_config<T: Serialize>(
class: Option<String>,
component: Option<String>,
custom_details: Option<T>,
severity: pagerduty_rs::types::Severity,
summary: String,
timestamp: Option<OffsetDateTime>,
top_config: TopConfig,
) -> AlertTrigger<T> {
let chain_id = top_config.app.chain_id;
let client_url = top_config.app.redirect_public_url.clone();
pagerduty_alert(
Some(chain_id),
class,
None,
client_url,
component,
custom_details,
severity,
None,
summary,
timestamp,
)
}
pub fn pagerduty_alert<T: Serialize>(
chain_id: Option<u64>,
class: Option<String>,
client: Option<String>,
client_url: Option<String>,
component: Option<String>,
custom_details: Option<T>,
severity: pagerduty_rs::types::Severity,
source: Option<String>,
summary: String,
timestamp: Option<OffsetDateTime>,
) -> AlertTrigger<T> {
let client = client.unwrap_or_else(|| "web3-proxy".to_string());
let group = chain_id.map(|x| format!("chain #{}", x));
let source =
source.unwrap_or_else(|| gethostname().into_string().unwrap_or("unknown".to_string()));
let mut s = DefaultHasher::new();
// TODO: include severity here?
summary.hash(&mut s);
client.hash(&mut s);
client_url.hash(&mut s);
component.hash(&mut s);
group.hash(&mut s);
class.hash(&mut s);
let dedup_key = s.finish().to_string();
let payload = AlertTriggerPayload {
severity,
summary,
source,
timestamp,
component,
group,
class,
custom_details,
};
AlertTrigger {
payload,
dedup_key: Some(dedup_key),
images: None,
links: None,
client: Some(client),
client_url: client_url,
}
}
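
`pagerduty_alert` derives `dedup_key` by hashing the alert's identifying fields with the standard library's `DefaultHasher`, so repeated panics with the same summary collapse into one PagerDuty incident. A self-contained sketch of that keying scheme (note `DefaultHasher`'s algorithm is not guaranteed stable across Rust releases, which is acceptable for short-lived dedup):

```
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

/// Build a dedup key from the alert's identifying fields.
/// Two alerts with the same summary/client/component/group/class hash to the
/// same key, so PagerDuty groups them into one incident.
fn dedup_key(
    summary: &str,
    client: &str,
    client_url: &Option<String>,
    component: &Option<String>,
    group: &Option<String>,
    class: &Option<String>,
) -> String {
    let mut s = DefaultHasher::new();
    summary.hash(&mut s);
    client.hash(&mut s);
    client_url.hash(&mut s);
    component.hash(&mut s);
    group.hash(&mut s);
    class.hash(&mut s);
    s.finish().to_string()
}

fn main() {
    let a = dedup_key(
        "panicked at 'oops'",
        "web3-proxy",
        &None,
        &None,
        &Some("chain #1".into()),
        &Some("panic".into()),
    );
    let b = dedup_key(
        "panicked at 'oops'",
        "web3-proxy",
        &None,
        &None,
        &Some("chain #1".into()),
        &Some("panic".into()),
    );
    assert_eq!(a, b); // identical alerts share a dedup key
    println!("dedup_key = {a}");
}
```
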

File diff suppressed because it is too large Load Diff

View File

@ -24,22 +24,22 @@ use std::sync::atomic::{self, AtomicU32, AtomicU64};
use std::{cmp::Ordering, sync::Arc};
use thread_fast_rng::rand::Rng;
use thread_fast_rng::thread_fast_rng;
use tokio::sync::{broadcast, oneshot, RwLock as AsyncRwLock};
use tokio::sync::{broadcast, oneshot, watch, RwLock as AsyncRwLock};
use tokio::time::{interval, sleep, sleep_until, timeout, Duration, Instant, MissedTickBehavior};
// TODO: maybe provider state should have the block data limit in it. but it is inside an async lock and we can't Serialize then
#[derive(Clone, Debug)]
pub enum ProviderState {
None,
NotReady(Arc<Web3Provider>),
Ready(Arc<Web3Provider>),
Connecting(Arc<Web3Provider>),
Connected(Arc<Web3Provider>),
}
impl ProviderState {
pub async fn provider(&self, allow_not_ready: bool) -> Option<&Arc<Web3Provider>> {
match self {
ProviderState::None => None,
ProviderState::NotReady(x) => {
ProviderState::Connecting(x) => {
if allow_not_ready {
Some(x)
} else {
@ -47,7 +47,7 @@ impl ProviderState {
None
}
}
ProviderState::Ready(x) => {
ProviderState::Connected(x) => {
if x.ready() {
Some(x)
} else {
@ -63,7 +63,6 @@ pub struct Web3Connection {
pub name: String,
pub display_name: Option<String>,
pub db_conn: Option<DatabaseConnection>,
pub(super) allowed_lag: u64,
/// TODO: can we get this from the provider? do we even need it?
pub(super) url: String,
/// Some connections use an http_client. we keep a clone for reconnecting
@ -77,6 +76,8 @@ pub struct Web3Connection {
/// provider is in a RwLock so that we can replace it if re-connecting
/// it is an async lock because we hold it open across awaits
pub(super) provider_state: AsyncRwLock<ProviderState>,
/// keep track of hard limits
pub(super) hard_limit_until: Option<watch::Sender<Instant>>,
/// rate limits are stored in a central redis so that multiple proxies can share their rate limits
/// We do not use the deferred rate limiter because going over limits would cause errors
pub(super) hard_limit: Option<RedisRateLimiter>,
@ -84,6 +85,8 @@ pub struct Web3Connection {
pub(super) soft_limit: u32,
/// use web3 queries to find the block data limit for archive/pruned nodes
pub(super) automatic_block_limit: bool,
/// only use this rpc if everything else is lagging too far. this allows us to ignore fast but very low limit rpcs
pub(super) backup: bool,
/// TODO: have an enum for this so that "no limit" prints pretty?
pub(super) block_data_limit: AtomicU64,
/// Lower tiers are higher priority when sending requests
@ -99,7 +102,6 @@ impl Web3Connection {
#[allow(clippy::too_many_arguments)]
pub async fn spawn(
name: String,
allowed_lag: u64,
display_name: Option<String>,
chain_id: u64,
db_conn: Option<DatabaseConnection>,
@ -111,6 +113,7 @@ impl Web3Connection {
hard_limit: Option<(u64, RedisPool)>,
// TODO: think more about this type
soft_limit: u32,
backup: bool,
block_data_limit: Option<u64>,
block_map: BlockHashesCache,
block_sender: Option<flume::Sender<BlockAndRpc>>,
@ -135,9 +138,18 @@ impl Web3Connection {
let automatic_block_limit =
(block_data_limit.load(atomic::Ordering::Acquire) == 0) && block_sender.is_some();
        // track hard_limit_until on backup servers (which might surprise us with rate limit changes)
        // and on servers that have a configured hard limit
let hard_limit_until = if backup || hard_limit.is_some() {
let (sender, _) = watch::channel(Instant::now());
Some(sender)
} else {
None
};
let new_connection = Self {
name,
allowed_lag,
db_conn: db_conn.clone(),
display_name,
http_client,
@ -147,8 +159,10 @@ impl Web3Connection {
internal_requests: 0.into(),
provider_state: AsyncRwLock::new(ProviderState::None),
hard_limit,
hard_limit_until,
soft_limit,
automatic_block_limit,
backup,
block_data_limit,
head_block: RwLock::new(Default::default()),
tier,
@ -191,25 +205,7 @@ impl Web3Connection {
return Ok(None);
}
// check if we are synced
let head_block: ArcBlock = self
.wait_for_request_handle(authorization, Duration::from_secs(30), true)
.await?
.request::<_, Option<_>>(
"eth_getBlockByNumber",
&json!(("latest", false)),
                // errors here are expected, so keep the level low
Level::Warn.into(),
)
.await?
.context("no block during check_block_data_limit!")?;
if SavedBlock::from(head_block).syncing(60) {
// if the node is syncing, we can't check its block data limit
return Ok(None);
}
// TODO: add SavedBlock to self? probably best not to. we might not get marked Ready
// TODO: check eth_syncing. if it is not false, return Ok(None)
let mut limit = None;
@ -217,7 +213,7 @@ impl Web3Connection {
// TODO: start at 0 or 1?
for block_data_limit in [0, 32, 64, 128, 256, 512, 1024, 90_000, u64::MAX] {
let handle = self
.wait_for_request_handle(authorization, Duration::from_secs(30), true)
.wait_for_request_handle(authorization, None, true)
.await?;
let head_block_num_future = handle.request::<Option<()>, U256>(
@ -243,7 +239,7 @@ impl Web3Connection {
// TODO: wait for the handle BEFORE we check the current block number. it might be delayed too!
// TODO: what should the request be?
let handle = self
.wait_for_request_handle(authorization, Duration::from_secs(30), true)
.wait_for_request_handle(authorization, None, true)
.await?;
let archive_result: Result<Bytes, _> = handle
@ -292,26 +288,10 @@ impl Web3Connection {
self.block_data_limit.load(atomic::Ordering::Acquire).into()
}
pub fn syncing(&self, allowed_lag: u64) -> bool {
match self.head_block.read().clone() {
None => true,
Some(x) => x.syncing(allowed_lag),
}
}
pub fn has_block_data(&self, needed_block_num: &U64) -> bool {
let head_block_num = match self.head_block.read().clone() {
None => return false,
Some(x) => {
// TODO: this 60 second limit is causing our polygons to fall behind. change this to number of blocks?
if x.syncing(60) {
// skip syncing nodes. even though they might be able to serve a query,
// latency will be poor and it will get in the way of them syncing further
return false;
}
x.number()
}
Some(x) => x.number(),
};
// this rpc doesn't have that block yet. still syncing
@ -370,7 +350,15 @@ impl Web3Connection {
);
let retry_in = Duration::from_millis(sleep_ms);
info!(
let error_level = if self.backup {
log::Level::Debug
} else {
log::Level::Info
};
log::log!(
error_level,
"Failed reconnect to {}! Retry in {}ms. err={:?}",
self,
retry_in.as_millis(),
@ -401,7 +389,7 @@ impl Web3Connection {
ProviderState::None => {
info!("connecting to {}", self);
}
ProviderState::NotReady(provider) | ProviderState::Ready(provider) => {
ProviderState::Connecting(provider) | ProviderState::Connected(provider) => {
// disconnect the current provider
if let Web3Provider::Mock = provider.as_ref() {
return Ok(());
@ -435,7 +423,7 @@ impl Web3Connection {
let new_provider = Web3Provider::from_str(&self.url, self.http_client.clone()).await?;
// trace!("saving provider state as NotReady on {}", self);
*provider_state = ProviderState::NotReady(Arc::new(new_provider));
*provider_state = ProviderState::Connecting(Arc::new(new_provider));
// drop the lock so that we can get a request handle
// trace!("provider_state {} unlocked", self);
@ -448,7 +436,7 @@ impl Web3Connection {
// TODO: what should the timeout be? should there be a request timeout?
// trace!("waiting on chain id for {}", self);
let found_chain_id: Result<U64, _> = self
.wait_for_request_handle(&authorization, Duration::from_secs(30), true)
.wait_for_request_handle(&authorization, None, true)
.await?
.request(
"eth_chainId",
@ -489,7 +477,7 @@ impl Web3Connection {
.context("provider missing")?
.clone();
*provider_state = ProviderState::Ready(ready_provider);
*provider_state = ProviderState::Connected(ready_provider);
// trace!("unlocked for ready...");
}
@ -543,7 +531,7 @@ impl Web3Connection {
let _ = head_block.insert(new_head_block.clone().into());
}
if self.block_data_limit() == U64::zero() && !self.syncing(1) {
if self.block_data_limit() == U64::zero() {
let authorization = Arc::new(Authorization::internal(self.db_conn.clone())?);
if let Err(err) = self.check_block_data_limit(&authorization).await {
warn!(
@ -591,8 +579,6 @@ impl Web3Connection {
reconnect: bool,
tx_id_sender: Option<flume::Sender<(TxHash, Arc<Self>)>>,
) -> anyhow::Result<()> {
let allowed_lag = self.allowed_lag;
loop {
let http_interval_receiver = http_interval_sender.as_ref().map(|x| x.subscribe());
@ -624,8 +610,6 @@ impl Web3Connection {
let health_sleep_seconds = 10;
sleep(Duration::from_secs(health_sleep_seconds)).await;
let mut warned = 0;
loop {
// TODO: what if we just happened to have this check line up with another restart?
// TODO: think more about this
@ -644,34 +628,6 @@ impl Web3Connection {
}
// trace!("health check on {}. unlocked", conn);
if let Some(x) = &*conn.head_block.read() {
// if this block is too old, return an error so we reconnect
let current_lag = x.lag();
if current_lag > allowed_lag {
let level = if warned == 0 {
log::Level::Warn
} else if warned % 100 == 0 {
log::Level::Debug
} else {
log::Level::Trace
};
log::log!(
level,
"{} is lagged {} secs: {} {}",
conn,
current_lag,
x.number(),
x.hash(),
);
warned += 1;
} else {
// reset warnings now that we are connected
warned = 0;
}
}
sleep(Duration::from_secs(health_sleep_seconds)).await;
}
};
@ -750,7 +706,7 @@ impl Web3Connection {
// trace!("unlocked on new heads");
// TODO: need a timeout
if let ProviderState::Ready(provider) = provider_state {
if let ProviderState::Connected(provider) = provider_state {
match provider.as_ref() {
Web3Provider::Mock => unimplemented!(),
Web3Provider::Http(_provider) => {
@ -764,7 +720,7 @@ impl Web3Connection {
loop {
// TODO: what should the max_wait be?
match self
.wait_for_request_handle(&authorization, Duration::from_secs(30), false)
.wait_for_request_handle(&authorization, None, false)
.await
{
Ok(active_request_handle) => {
@ -850,7 +806,7 @@ impl Web3Connection {
Web3Provider::Ws(provider) => {
// todo: move subscribe_blocks onto the request handle?
let active_request_handle = self
.wait_for_request_handle(&authorization, Duration::from_secs(30), false)
.wait_for_request_handle(&authorization, None, false)
.await;
let mut stream = provider.subscribe_blocks().await?;
drop(active_request_handle);
@ -860,7 +816,7 @@ impl Web3Connection {
// all it does is print "new block" for the same block as current block
// TODO: how does this get wrapped in an arc? does ethers handle that?
let block: Result<Option<ArcBlock>, _> = self
.wait_for_request_handle(&authorization, Duration::from_secs(30), false)
.wait_for_request_handle(&authorization, None, false)
.await?
.request(
"eth_getBlockByNumber",
@ -922,7 +878,7 @@ impl Web3Connection {
authorization: Arc<Authorization>,
tx_id_sender: flume::Sender<(TxHash, Arc<Self>)>,
) -> anyhow::Result<()> {
if let ProviderState::Ready(provider) = self
if let ProviderState::Connected(provider) = self
.provider_state
.try_read()
.context("subscribe_pending_transactions")?
@ -961,8 +917,8 @@ impl Web3Connection {
Web3Provider::Ws(provider) => {
// TODO: maybe the subscribe_pending_txs function should be on the active_request_handle
let active_request_handle = self
.wait_for_request_handle(&authorization, Duration::from_secs(30), false)
.await;
.wait_for_request_handle(&authorization, None, false)
.await?;
let mut stream = provider.subscribe_pending_txs().await?;
@ -995,13 +951,14 @@ impl Web3Connection {
/// be careful with this; it might wait forever!
/// `allow_not_ready` is only for use by health checks while starting the provider
/// TODO: don't use anyhow. use specific error type
pub async fn wait_for_request_handle(
self: &Arc<Self>,
authorization: &Arc<Authorization>,
max_wait: Duration,
max_wait: Option<Duration>,
allow_not_ready: bool,
) -> anyhow::Result<OpenRequestHandle> {
let max_wait = Instant::now() + max_wait;
let max_wait = max_wait.map(|x| Instant::now() + x);
loop {
match self
@ -1011,21 +968,39 @@ impl Web3Connection {
Ok(OpenRequestResult::Handle(handle)) => return Ok(handle),
Ok(OpenRequestResult::RetryAt(retry_at)) => {
// TODO: emit a stat?
// // trace!(?retry_at);
let wait = retry_at.duration_since(Instant::now());
if retry_at > max_wait {
// break now since we will wait past our maximum wait time
// TODO: don't use anyhow. use specific error type
return Err(anyhow::anyhow!("timeout waiting for request handle"));
trace!(
"waiting {} millis for request handle on {}",
wait.as_millis(),
self
);
if let Some(max_wait) = max_wait {
if retry_at > max_wait {
// break now since we will wait past our maximum wait time
// TODO: don't use anyhow. use specific error type
return Err(anyhow::anyhow!("timeout waiting for request handle"));
}
}
sleep_until(retry_at).await;
}
Ok(OpenRequestResult::NotReady) => {
Ok(OpenRequestResult::NotReady(_)) => {
// TODO: when can this happen? log? emit a stat?
// TODO: subscribe to the head block on this
trace!("{} has no handle ready", self);
if let Some(max_wait) = max_wait {
let now = Instant::now();
if now > max_wait {
return Err(anyhow::anyhow!("unable to retry for request handle"));
}
}
// TODO: sleep how long? maybe just error?
// TODO: don't use anyhow. use specific error type
return Err(anyhow::anyhow!("unable to retry for request handle"));
// TODO: instead of an arbitrary sleep, subscribe to the head block on this
sleep(Duration::from_millis(10)).await;
}
Err(err) => return Err(err),
}
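
`wait_for_request_handle` now takes `max_wait: Option<Duration>` and converts it into an optional `Instant` deadline once, up front; `None` means "wait for as long as the rate limiter asks". A sketch of that loop shape, with a stubbed `try_open` standing in for `try_request_handle`:

```
// Sketch of the optional-deadline retry loop. `try_open` is a stand-in for
// try_request_handle; it either succeeds or reports when to retry.
use tokio::time::{sleep_until, Duration, Instant};

enum TryOpen {
    Ready,
    RetryAt(Instant),
}

async fn try_open(attempt: &mut u32) -> TryOpen {
    *attempt += 1;
    if *attempt < 3 {
        TryOpen::RetryAt(Instant::now() + Duration::from_millis(50))
    } else {
        TryOpen::Ready
    }
}

async fn wait_for_handle(max_wait: Option<Duration>) -> anyhow::Result<()> {
    // None means no deadline: keep retrying for as long as the limiter asks
    let max_wait = max_wait.map(|x| Instant::now() + x);
    let mut attempt = 0;

    loop {
        match try_open(&mut attempt).await {
            TryOpen::Ready => return Ok(()),
            TryOpen::RetryAt(retry_at) => {
                if let Some(max_wait) = max_wait {
                    if retry_at > max_wait {
                        // we would sleep past the deadline, so give up now
                        return Err(anyhow::anyhow!("timeout waiting for request handle"));
                    }
                }
                sleep_until(retry_at).await;
            }
        }
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    wait_for_handle(Some(Duration::from_secs(1))).await
}
```
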
@ -1048,27 +1023,50 @@ impl Web3Connection {
.await
.is_none()
{
return Ok(OpenRequestResult::NotReady);
trace!("{} is not ready", self);
return Ok(OpenRequestResult::NotReady(self.backup));
}
if let Some(hard_limit_until) = self.hard_limit_until.as_ref() {
let hard_limit_ready = hard_limit_until.borrow().clone();
let now = Instant::now();
if now < hard_limit_ready {
return Ok(OpenRequestResult::RetryAt(hard_limit_ready));
}
}
// check rate limits
if let Some(ratelimiter) = self.hard_limit.as_ref() {
// TODO: how should we know if we should set expire or not?
match ratelimiter.throttle().await? {
match ratelimiter
.throttle()
.await
.context(format!("attempting to throttle {}", self))?
{
RedisRateLimitResult::Allowed(_) => {
// // trace!("rate limit succeeded")
// trace!("rate limit succeeded")
}
RedisRateLimitResult::RetryAt(retry_at, _) => {
// rate limit failed
// save the smallest retry_after. if nothing succeeds, return an Err with retry_after in it
// TODO: use tracing better
// TODO: i'm seeing "Exhausted rate limit on moralis: 0ns". How is it getting 0?
warn!("Exhausted rate limit on {}. Retry at {:?}", self, retry_at);
// rate limit gave us a wait time
if !self.backup {
let when = retry_at.duration_since(Instant::now());
warn!(
"Exhausted rate limit on {}. Retry in {}ms",
self,
when.as_millis()
);
}
if let Some(hard_limit_until) = self.hard_limit_until.as_ref() {
hard_limit_until.send_replace(retry_at.clone());
}
return Ok(OpenRequestResult::RetryAt(retry_at));
}
RedisRateLimitResult::RetryNever => {
return Ok(OpenRequestResult::NotReady);
return Ok(OpenRequestResult::NotReady(self.backup));
}
}
};
@ -1213,7 +1211,6 @@ mod tests {
let x = Web3Connection {
name: "name".to_string(),
allowed_lag: 10,
db_conn: None,
display_name: None,
url: "ws://example.com".to_string(),
@ -1223,8 +1220,10 @@ mod tests {
internal_requests: 0.into(),
provider_state: AsyncRwLock::new(ProviderState::None),
hard_limit: None,
hard_limit_until: None,
soft_limit: 1_000,
automatic_block_limit: false,
backup: false,
block_data_limit: block_data_limit.into(),
tier: 0,
head_block: RwLock::new(Some(head_block.clone())),
@ -1261,7 +1260,6 @@ mod tests {
// TODO: this is getting long. have a `impl Default`
let x = Web3Connection {
name: "name".to_string(),
allowed_lag: 10,
db_conn: None,
display_name: None,
url: "ws://example.com".to_string(),
@ -1271,8 +1269,10 @@ mod tests {
internal_requests: 0.into(),
provider_state: AsyncRwLock::new(ProviderState::None),
hard_limit: None,
hard_limit_until: None,
soft_limit: 1_000,
automatic_block_limit: false,
backup: false,
block_data_limit: block_data_limit.into(),
tier: 0,
head_block: RwLock::new(Some(head_block.clone())),
@ -1288,6 +1288,8 @@ mod tests {
assert!(!x.has_block_data(&(head_block.number() + 1000)));
}
/*
// TODO: think about how to bring the concept of a "lagged" node back
#[test]
fn test_lagged_node_not_has_block_data() {
let now: U256 = SystemTime::now()
@ -1313,7 +1315,6 @@ mod tests {
let x = Web3Connection {
name: "name".to_string(),
allowed_lag: 10,
db_conn: None,
display_name: None,
url: "ws://example.com".to_string(),
@ -1325,6 +1326,7 @@ mod tests {
hard_limit: None,
soft_limit: 1_000,
automatic_block_limit: false,
backup: false,
block_data_limit: block_data_limit.into(),
tier: 0,
head_block: RwLock::new(Some(head_block.clone())),
@ -1337,4 +1339,5 @@ mod tests {
assert!(!x.has_block_data(&(head_block.number() + 1)));
assert!(!x.has_block_data(&(head_block.number() + 1000)));
}
*/
}
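
`hard_limit_until` is a tokio `watch` channel holding the earliest `Instant` at which this server may be tried again; both the Redis rate limiter and rate-limit error responses push a new value into it with `send_replace`, and `try_request_handle` checks it before doing any work. A minimal, self-contained sketch of that shared-backoff pattern (simplified types, not the real `Web3Connection`):

```
// Sketch of the shared hard-limit backoff. A watch channel broadcasts the
// earliest retry time; any code path that learns about a rate limit pushes it
// forward, and request attempts check it cheaply before doing real work.
use tokio::sync::watch;
use tokio::time::{Duration, Instant};

enum OpenRequest {
    Handle,
    RetryAt(Instant),
}

fn try_request(hard_limit_until: &watch::Sender<Instant>) -> OpenRequest {
    let ready_at = *hard_limit_until.borrow();
    if Instant::now() < ready_at {
        // still rate limited; tell the caller when to come back
        return OpenRequest::RetryAt(ready_at);
    }
    OpenRequest::Handle
}

fn on_rate_limited(hard_limit_until: &watch::Sender<Instant>, retry_at: Instant) {
    // send_replace updates the value even if nobody is currently waiting on it
    hard_limit_until.send_replace(retry_at);
}

#[tokio::main]
async fn main() {
    let (hard_limit_until, _rx) = watch::channel(Instant::now());

    // a backend just told us to back off for one second
    on_rate_limited(&hard_limit_until, Instant::now() + Duration::from_secs(1));

    match try_request(&hard_limit_until) {
        OpenRequest::Handle => println!("ok to send"),
        OpenRequest::RetryAt(at) => println!("rate limited until {:?}", at),
    }
}
```

The advantage over sleeping inline is that every caller sharing the connection sees the backoff immediately, instead of each one independently hitting the Redis limiter.
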

File diff suppressed because it is too large Load Diff

View File

@ -27,7 +27,8 @@ pub enum OpenRequestResult {
/// Unable to start a request. Retry at the given time.
RetryAt(Instant),
/// Unable to start a request because the server is not synced
NotReady,
/// contains "true" if backup servers were attempted
NotReady(bool),
}
/// Make RPC requests through this handle and drop it when you are done.
@ -42,7 +43,7 @@ pub struct OpenRequestHandle {
}
/// Depending on the context, RPC errors can require different handling.
pub enum RequestErrorHandler {
pub enum RequestRevertHandler {
/// Log at the trace level. Use when errors are expected.
TraceLevel,
/// Log at the debug level. Use when errors are expected.
@ -52,7 +53,7 @@ pub enum RequestErrorHandler {
/// Log at the warn level. Use when errors do not cause problems.
WarnLevel,
/// Potentially save the revert. Users can tune how often this happens
SaveReverts,
Save,
}
// TODO: second param could be skipped since we don't need it here
@ -65,13 +66,13 @@ struct EthCallFirstParams {
data: Option<Bytes>,
}
impl From<Level> for RequestErrorHandler {
impl From<Level> for RequestRevertHandler {
fn from(level: Level) -> Self {
match level {
Level::Trace => RequestErrorHandler::TraceLevel,
Level::Debug => RequestErrorHandler::DebugLevel,
Level::Error => RequestErrorHandler::ErrorLevel,
Level::Warn => RequestErrorHandler::WarnLevel,
Level::Trace => RequestRevertHandler::TraceLevel,
Level::Debug => RequestRevertHandler::DebugLevel,
Level::Error => RequestRevertHandler::ErrorLevel,
Level::Warn => RequestRevertHandler::WarnLevel,
_ => unimplemented!("unexpected tracing Level"),
}
}
@ -84,7 +85,7 @@ impl Authorization {
method: Method,
params: EthCallFirstParams,
) -> anyhow::Result<()> {
let rpc_key_id = match self.checks.rpc_key_id {
let rpc_key_id = match self.checks.rpc_secret_key_id {
Some(rpc_key_id) => rpc_key_id.into(),
None => {
// // trace!(?self, "cannot save revert without rpc_key_id");
@ -213,7 +214,7 @@ impl OpenRequestHandle {
&self,
method: &str,
params: &P,
error_handler: RequestErrorHandler,
revert_handler: RequestRevertHandler,
) -> Result<R, ProviderError>
where
// TODO: not sure about this type. would be better to not need clones, but measure and spawns combine to need it
@ -240,52 +241,58 @@ impl OpenRequestHandle {
Web3Provider::Ws(provider) => provider.request(method, params).await,
};
// TODO: i think ethers already has trace logging (and does it much more fancy)
trace!(
"response from {} for {} {:?}: {:?}",
self.conn,
method,
params,
response,
);
// // TODO: i think ethers already has trace logging (and does it much more fancy)
// trace!(
// "response from {} for {} {:?}: {:?}",
// self.conn,
// method,
// params,
// response,
// );
if let Err(err) = &response {
// only save reverts for some types of calls
// TODO: do something special for eth_sendRawTransaction too
let error_handler = if let RequestErrorHandler::SaveReverts = error_handler {
let revert_handler = if let RequestRevertHandler::Save = revert_handler {
// TODO: should all these be Trace or Debug or a mix?
if !["eth_call", "eth_estimateGas"].contains(&method) {
// trace!(%method, "skipping save on revert");
RequestErrorHandler::TraceLevel
RequestRevertHandler::TraceLevel
} else if self.authorization.db_conn.is_some() {
let log_revert_chance = self.authorization.checks.log_revert_chance;
if log_revert_chance == 0.0 {
// trace!(%method, "no chance. skipping save on revert");
RequestErrorHandler::TraceLevel
RequestRevertHandler::TraceLevel
} else if log_revert_chance == 1.0 {
// trace!(%method, "gaurenteed chance. SAVING on revert");
error_handler
revert_handler
} else if thread_fast_rng::thread_fast_rng().gen_range(0.0f64..=1.0)
< log_revert_chance
{
// trace!(%method, "missed chance. skipping save on revert");
RequestErrorHandler::TraceLevel
RequestRevertHandler::TraceLevel
} else {
// trace!("Saving on revert");
// TODO: is always logging at debug level fine?
error_handler
revert_handler
}
} else {
// trace!(%method, "no database. skipping save on revert");
RequestErrorHandler::TraceLevel
RequestRevertHandler::TraceLevel
}
} else {
error_handler
revert_handler
};
enum ResponseTypes {
Revert,
RateLimit,
Ok,
}
// check for "execution reverted" here
let is_revert = if let ProviderError::JsonRpcClientError(err) = err {
let response_type = if let ProviderError::JsonRpcClientError(err) = err {
// Http and Ws errors are very similar, but different types
let msg = match &*self.provider {
Web3Provider::Mock => unimplemented!(),
@ -310,30 +317,44 @@ impl OpenRequestHandle {
};
if let Some(msg) = msg {
msg.starts_with("execution reverted")
if msg.starts_with("execution reverted") {
trace!("revert from {}", self.conn);
ResponseTypes::Revert
} else if msg.contains("limit") || msg.contains("request") {
trace!("rate limit from {}", self.conn);
ResponseTypes::RateLimit
} else {
ResponseTypes::Ok
}
} else {
false
ResponseTypes::Ok
}
} else {
false
ResponseTypes::Ok
};
if is_revert {
trace!("revert from {}", self.conn);
if matches!(response_type, ResponseTypes::RateLimit) {
if let Some(hard_limit_until) = self.conn.hard_limit_until.as_ref() {
let retry_at = Instant::now() + Duration::from_secs(1);
trace!("retry {} at: {:?}", self.conn, retry_at);
hard_limit_until.send_replace(retry_at);
}
}
// TODO: think more about the method and param logs. those can be sensitive information
match error_handler {
RequestErrorHandler::DebugLevel => {
match revert_handler {
RequestRevertHandler::DebugLevel => {
// TODO: think about this revert check more. sometimes we might want reverts logged so this needs a flag
if !is_revert {
if matches!(response_type, ResponseTypes::Revert) {
debug!(
"bad response from {}! method={} params={:?} err={:?}",
self.conn, method, params, err
);
}
}
RequestErrorHandler::TraceLevel => {
RequestRevertHandler::TraceLevel => {
trace!(
"bad response from {}! method={} params={:?} err={:?}",
self.conn,
@ -342,21 +363,21 @@ impl OpenRequestHandle {
err
);
}
RequestErrorHandler::ErrorLevel => {
RequestRevertHandler::ErrorLevel => {
// TODO: include params if not running in release mode
error!(
"bad response from {}! method={} err={:?}",
self.conn, method, err
);
}
RequestErrorHandler::WarnLevel => {
RequestRevertHandler::WarnLevel => {
// TODO: include params if not running in release mode
warn!(
"bad response from {}! method={} err={:?}",
self.conn, method, err
);
}
RequestErrorHandler::SaveReverts => {
RequestRevertHandler::Save => {
trace!(
"bad response from {}! method={} params={:?} err={:?}",
self.conn,

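The error path above classifies JSON-RPC client errors into `Revert` / `RateLimit` / `Ok` by inspecting the message text, and a `RateLimit` pushes a short backoff into `hard_limit_until`. A sketch of just the message classification (the real code first digs the message out of ethers' `ProviderError`, which differs between the Http and Ws providers):

```
// Sketch of the response classification used on backend errors. Real code
// extracts the message from ethers' ProviderError; a plain &str stands in here.
#[derive(Debug, PartialEq)]
enum ResponseTypes {
    Revert,
    RateLimit,
    Ok,
}

fn classify(msg: Option<&str>) -> ResponseTypes {
    match msg {
        Some(msg) if msg.starts_with("execution reverted") => ResponseTypes::Revert,
        // crude, but catches "rate limit exceeded", "too many requests", etc.
        Some(msg) if msg.contains("limit") || msg.contains("request") => ResponseTypes::RateLimit,
        _ => ResponseTypes::Ok,
    }
}

fn main() {
    assert_eq!(
        classify(Some("execution reverted: Dai/insufficient-balance")),
        ResponseTypes::Revert
    );
    assert_eq!(
        classify(Some("daily request count exceeded, request rate limited")),
        ResponseTypes::RateLimit
    );
    assert_eq!(classify(None), ResponseTypes::Ok);
}
```
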
View File

@ -1,4 +1,4 @@
use super::blockchain::SavedBlock;
use super::blockchain::{ArcBlock, SavedBlock};
use super::connection::Web3Connection;
use super::connections::Web3Connections;
use ethers::prelude::{H256, U64};
@ -9,19 +9,33 @@ use std::sync::Arc;
/// A collection of Web3Connections that are on the same block.
/// Serialize is so we can print it on our debug endpoint
#[derive(Clone, Default, Serialize)]
pub struct SyncedConnections {
pub struct ConsensusConnections {
// TODO: store ArcBlock instead?
pub(super) head_block: Option<SavedBlock>,
// TODO: this should be able to serialize, but it isn't
#[serde(skip_serializing)]
pub(super) conns: Vec<Arc<Web3Connection>>,
pub(super) num_checked_conns: usize,
pub(super) includes_backups: bool,
}
impl fmt::Debug for SyncedConnections {
impl ConsensusConnections {
pub fn num_conns(&self) -> usize {
self.conns.len()
}
pub fn sum_soft_limit(&self) -> u32 {
self.conns.iter().fold(0, |sum, rpc| sum + rpc.soft_limit)
}
// TODO: sum_hard_limit?
}
impl fmt::Debug for ConsensusConnections {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// TODO: the default formatter takes forever to write. this is too quiet though
// TODO: print the actual conns?
f.debug_struct("SyncedConnections")
f.debug_struct("ConsensusConnections")
.field("head_block", &self.head_block)
.field("num_conns", &self.conns.len())
.finish_non_exhaustive()
@ -29,31 +43,29 @@ impl fmt::Debug for SyncedConnections {
}
impl Web3Connections {
pub fn head_block(&self) -> Option<SavedBlock> {
self.synced_connections.load().head_block.clone()
pub fn head_block(&self) -> Option<ArcBlock> {
self.watch_consensus_head_receiver
.as_ref()
.map(|x| x.borrow().clone())
}
pub fn head_block_hash(&self) -> Option<H256> {
self.synced_connections
.load()
.head_block
.as_ref()
.map(|head_block| head_block.hash())
self.head_block().and_then(|x| x.hash)
}
pub fn head_block_num(&self) -> Option<U64> {
self.synced_connections
.load()
.head_block
.as_ref()
.map(|head_block| head_block.number())
self.head_block().and_then(|x| x.number)
}
pub fn synced(&self) -> bool {
!self.synced_connections.load().conns.is_empty()
!self
.watch_consensus_connections_sender
.borrow()
.conns
.is_empty()
}
pub fn num_synced_rpcs(&self) -> usize {
self.synced_connections.load().conns.len()
self.watch_consensus_connections_sender.borrow().conns.len()
}
}
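
`head_block` and its helpers now read the consensus head from a `watch` receiver instead of loading an `ArcSwap`'d `SyncedConnections`. A simplified sketch of publishing and consuming an optional head block over a watch channel (a tiny `Block` struct stands in for ethers' `Block<TxHash>`):

```
// Sketch of reading the consensus head from a watch channel. The consensus-
// finding task publishes; readers call borrow() for the latest value without
// waiting for a change notification.
use std::sync::Arc;
use tokio::sync::watch;

#[derive(Debug)]
struct Block {
    hash: Option<[u8; 32]>,
    number: Option<u64>,
}

struct Rpcs {
    watch_consensus_head_receiver: Option<watch::Receiver<Option<Arc<Block>>>>,
}

impl Rpcs {
    fn head_block(&self) -> Option<Arc<Block>> {
        self.watch_consensus_head_receiver
            .as_ref()
            .and_then(|x| x.borrow().clone())
    }

    fn head_block_num(&self) -> Option<u64> {
        self.head_block().and_then(|x| x.number)
    }
}

fn main() {
    let (tx, rx) = watch::channel(None);
    let rpcs = Rpcs {
        watch_consensus_head_receiver: Some(rx),
    };

    assert_eq!(rpcs.head_block_num(), None);

    // the consensus-finding task publishes a new head
    tx.send_replace(Some(Arc::new(Block {
        hash: Some([0; 32]),
        number: Some(16_500_000),
    })));
    assert_eq!(rpcs.head_block_num(), Some(16_500_000));
    println!("head: {:?}", rpcs.head_block());
}
```
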