test: mock retry backoff and compression sleeps in slow tests

Cuts ~65s off shard 3's local runtime (108s \u2192 48s) by neutralizing real wall-clock waits in backoff/compression/retry paths. Tests assert behavior (retry count, final result, error handling), never timing. Changes: - tests/run_agent/conftest.py (NEW): autouse fixture mocks run_agent.jittered_backoff to 0.0 for all tests in the directory. Collapses the `while time.time() < sleep_end` busy-loop to a no-op. Does NOT mock time.sleep globally (breaks threading tests). - test_anthropic_error_handling.py: per-file fixture mocks time.sleep and asyncio.sleep for this test's retry paths (6 tests \u00d7 10s \u2192 ~2s each). - test_413_compression.py: mocks time.sleep for the 2s compression retry pauses (9 tests \u00d7 2s \u2192 millisecond range). - test_run_agent_codex_responses.py: mocks time.sleep for Codex retry path (6.8s \u2192 0.24s on the empty-output retry test). - test_fallback_model.py: mocks time.sleep for transport-recovery path. - test_retaindb_plugin.py: caps retaindb module's time.sleep to 0.05s so background writer-thread sleeps don't block tests. Replaces arbitrary time.sleep(N) waits with polling loops. Validation: - tests/run_agent/ + tests/plugins/test_retaindb_plugin.py: 827 passed, 0 failed, 22.9s (was ~75s before). - Matrix shard 3 local: 3098 passed, 48.2s (was 108s). - No test's timing-assertion contract is changed (tests still verify retry happens, just don't wait 5s for it).
fix(tests): make AIAgent constructor calls self-contained (#11755 )
2026-04-17 13:19:00 -07:00 · 2026-04-17 12:32:03 -07:00 · 2026-04-17 12:05:22 -07:00
20 changed files with 259 additions and 27 deletions
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -2472,10 +2472,10 @@ def _model_flow_kimi(config, current_model=""):

    # Step 3: Model selection — show appropriate models for the endpoint
    if is_coding_plan:
-        # Coding Plan models (kimi-for-coding first)
+        # Coding Plan models (kimi-k2.5 first)
        model_list = [
-            "kimi-for-coding",
            "kimi-k2.5",
+            "kimi-for-coding",
            "kimi-k2-thinking",
            "kimi-k2-thinking-turbo",
        ]
--- a/hermes_cli/models.py
+++ b/hermes_cli/models.py
@@ -26,7 +26,8 @@ COPILOT_REASONING_EFFORTS_O_SERIES = ["low", "medium", "high"]
 # Fallback OpenRouter snapshot used when the live catalog is unavailable.
 # (model_id, display description shown in menus)
 OPENROUTER_MODELS: list[tuple[str, str]] = [
-    ("anthropic/claude-opus-4.7",       "recommended"),
+    ("moonshotai/kimi-k2.5",            "recommended"),
+    ("anthropic/claude-opus-4.7",       ""),
    ("anthropic/claude-opus-4.6",       ""),
    ("anthropic/claude-sonnet-4.6",     ""),
    ("qwen/qwen3.6-plus",               ""),
@@ -49,7 +50,6 @@ OPENROUTER_MODELS: list[tuple[str, str]] = [
    ("z-ai/glm-5.1",                    ""),
    ("z-ai/glm-5v-turbo",               ""),
    ("z-ai/glm-5-turbo",                ""),
-    ("moonshotai/kimi-k2.5",            ""),
    ("x-ai/grok-4.20",                  ""),
    ("nvidia/nemotron-3-super-120b-a12b",      ""),
    ("nvidia/nemotron-3-super-120b-a12b:free", "free"),
@@ -75,6 +75,7 @@ def _codex_curated_models() -> list[str]:

 _PROVIDER_MODELS: dict[str, list[str]] = {
    "nous": [
+        "moonshotai/kimi-k2.5",
        "xiaomi/mimo-v2-pro",
        "anthropic/claude-opus-4.7",
        "anthropic/claude-opus-4.6",
@@ -96,7 +97,6 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "z-ai/glm-5.1",
        "z-ai/glm-5v-turbo",
        "z-ai/glm-5-turbo",
-        "moonshotai/kimi-k2.5",
        "x-ai/grok-4.20-beta",
        "nvidia/nemotron-3-super-120b-a12b",
        "nvidia/nemotron-3-super-120b-a12b:free",
@@ -156,8 +156,8 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "grok-4-1-fast-reasoning",
    ],
    "kimi-coding": [
-        "kimi-for-coding",
        "kimi-k2.5",
+        "kimi-for-coding",
        "kimi-k2-thinking",
        "kimi-k2-thinking-turbo",
        "kimi-k2-turbo-preview",
@@ -212,6 +212,7 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "trinity-mini",
    ],
    "opencode-zen": [
+        "kimi-k2.5",
        "gpt-5.4-pro",
        "gpt-5.4",
        "gpt-5.3-codex",
@@ -243,16 +244,15 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "glm-5",
        "glm-4.7",
        "glm-4.6",
-        "kimi-k2.5",
        "kimi-k2-thinking",
        "kimi-k2",
        "qwen3-coder",
        "big-pickle",
    ],
    "opencode-go": [
+        "kimi-k2.5",
        "glm-5.1",
        "glm-5",
-        "kimi-k2.5",
        "mimo-v2-pro",
        "mimo-v2-omni",
        "minimax-m2.7",
@@ -285,21 +285,21 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
    # to https://dashscope-intl.aliyuncs.com/compatible-mode/v1 (OpenAI-compat)
    # or https://dashscope-intl.aliyuncs.com/apps/anthropic (Anthropic-compat).
    "alibaba": [
+        "kimi-k2.5",
        "qwen3.5-plus",
        "qwen3-coder-plus",
        "qwen3-coder-next",
        # Third-party models available on coding-intl
        "glm-5",
        "glm-4.7",
-        "kimi-k2.5",
        "MiniMax-M2.5",
    ],
    # Curated HF model list — only agentic models that map to OpenRouter defaults.
    "huggingface": [
+        "moonshotai/Kimi-K2.5",
        "Qwen/Qwen3.5-397B-A17B",
        "Qwen/Qwen3.5-35B-A3B",
        "deepseek-ai/DeepSeek-V3.2",
-        "moonshotai/Kimi-K2.5",
        "MiniMaxAI/MiniMax-M2.5",
        "zai-org/GLM-5",
        "XiaomiMiMo/MiMo-V2-Flash",
--- a/tests/hermes_cli/test_opencode_go_in_model_list.py
+++ b/tests/hermes_cli/test_opencode_go_in_model_list.py
@@ -15,7 +15,7 @@ def test_opencode_go_appears_when_api_key_set():
    opencode_go = next((p for p in providers if p["slug"] == "opencode-go"), None)
    
    assert opencode_go is not None, "opencode-go should appear when OPENCODE_GO_API_KEY is set"
-    assert opencode_go["models"] == ["glm-5.1", "glm-5", "kimi-k2.5", "mimo-v2-pro", "mimo-v2-omni", "minimax-m2.7", "minimax-m2.5"]
+    assert opencode_go["models"] == ["kimi-k2.5", "glm-5.1", "glm-5", "mimo-v2-pro", "mimo-v2-omni", "minimax-m2.7", "minimax-m2.5"]
    # opencode-go can appear as "built-in" (from PROVIDER_TO_MODELS_DEV when
    # models.dev is reachable) or "hermes" (from HERMES_OVERLAYS fallback when
    # the API is unavailable, e.g. in CI).
--- a/tests/plugins/test_retaindb_plugin.py
+++ b/tests/plugins/test_retaindb_plugin.py
@@ -31,6 +31,31 @@ def _isolate_env(tmp_path, monkeypatch):
    monkeypatch.delenv("RETAINDB_PROJECT", raising=False)


+@pytest.fixture(autouse=True)
+def _cap_retaindb_sleeps(monkeypatch):
+    """Cap production-code sleeps so background-thread tests run fast.
+
+    The retaindb ``_WriteQueue._flush_row`` does ``time.sleep(2)`` after
+    errors. Across multiple tests that trigger the retry path, that adds
+    up. Cap the module's bound ``time.sleep`` to 0.05s — tests don't care
+    about the exact retry delay, only that it happens. The test file's
+    own ``time.sleep`` stays real since it uses a different reference.
+    """
+    try:
+        from plugins.memory import retaindb as _retaindb
+    except ImportError:
+        return
+
+    real_sleep = _retaindb.time.sleep
+
+    def _capped_sleep(seconds):
+        return real_sleep(min(float(seconds), 0.05))
+
+    import types as _types
+    fake_time = _types.SimpleNamespace(sleep=_capped_sleep, time=_retaindb.time.time)
+    monkeypatch.setattr(_retaindb, "time", fake_time)
+
+
 # We need the repo root on sys.path so the plugin can import agent.memory_provider
 import sys
 _repo_root = str(Path(__file__).resolve().parents[2])
@@ -130,16 +155,18 @@ class TestWriteQueue:
    def test_enqueue_creates_row(self, tmp_path):
        q, client, db_path = self._make_queue(tmp_path)
        q.enqueue("user1", "sess1", [{"role": "user", "content": "hi"}])
-        # Give the writer thread a moment to process
-        time.sleep(1)
+        # shutdown() blocks until the writer thread drains the queue — no need
+        # to pre-sleep (the old 1s sleep was a just-in-case wait, but shutdown
+        # does the right thing).
        q.shutdown()
        # If ingest succeeded, the row should be deleted
        client.ingest_session.assert_called_once()

    def test_enqueue_persists_to_sqlite(self, tmp_path):
        client = MagicMock()
-        # Make ingest hang so the row stays in SQLite
-        client.ingest_session = MagicMock(side_effect=lambda *a, **kw: time.sleep(5))
+        # Make ingest slow so the row is still in SQLite when we peek.
+        # 0.5s is plenty — the test just needs the flush to still be in-flight.
+        client.ingest_session = MagicMock(side_effect=lambda *a, **kw: time.sleep(0.5))
        db_path = tmp_path / "test_queue.db"
        q = _WriteQueue(client, db_path)
        q.enqueue("user1", "sess1", [{"role": "user", "content": "test"}])
@@ -154,8 +181,7 @@ class TestWriteQueue:
    def test_flush_deletes_row_on_success(self, tmp_path):
        q, client, db_path = self._make_queue(tmp_path)
        q.enqueue("user1", "sess1", [{"role": "user", "content": "hi"}])
-        time.sleep(1)
-        q.shutdown()
+        q.shutdown()  # blocks until drain
        # Row should be gone
        conn = sqlite3.connect(str(db_path))
        rows = conn.execute("SELECT COUNT(*) FROM pending").fetchone()[0]
@@ -168,14 +194,20 @@ class TestWriteQueue:
        db_path = tmp_path / "test_queue.db"
        q = _WriteQueue(client, db_path)
        q.enqueue("user1", "sess1", [{"role": "user", "content": "hi"}])
-        time.sleep(3)  # Allow retry + sleep(2) in _flush_row
+        # Poll for the error to be recorded (max 2s), instead of a fixed 3s wait.
+        deadline = time.time() + 2.0
+        last_error = None
+        while time.time() < deadline:
+            conn = sqlite3.connect(str(db_path))
+            row = conn.execute("SELECT last_error FROM pending").fetchone()
+            conn.close()
+            if row and row[0]:
+                last_error = row[0]
+                break
+            time.sleep(0.05)
        q.shutdown()
-        # Row should still exist with error recorded
-        conn = sqlite3.connect(str(db_path))
-        row = conn.execute("SELECT last_error FROM pending").fetchone()
-        conn.close()
-        assert row is not None
-        assert "API down" in row[0]
+        assert last_error is not None
+        assert "API down" in last_error

    def test_thread_local_connection_reuse(self, tmp_path):
        q, _, _ = self._make_queue(tmp_path)
@@ -193,14 +225,27 @@ class TestWriteQueue:
        client1.ingest_session = MagicMock(side_effect=RuntimeError("fail"))
        q1 = _WriteQueue(client1, db_path)
        q1.enqueue("user1", "sess1", [{"role": "user", "content": "lost turn"}])
-        time.sleep(3)
+        # Wait until the error is recorded (poll with short interval).
+        deadline = time.time() + 2.0
+        while time.time() < deadline:
+            conn = sqlite3.connect(str(db_path))
+            row = conn.execute("SELECT last_error FROM pending").fetchone()
+            conn.close()
+            if row and row[0]:
+                break
+            time.sleep(0.05)
        q1.shutdown()

        # Now create a new queue — it should replay the pending rows
        client2 = MagicMock()
        client2.ingest_session = MagicMock(return_value={"status": "ok"})
        q2 = _WriteQueue(client2, db_path)
-        time.sleep(2)
+        # Poll for the replay to happen.
+        deadline = time.time() + 2.0
+        while time.time() < deadline:
+            if client2.ingest_session.called:
+                break
+            time.sleep(0.05)
        q2.shutdown()

        # The replayed row should have been ingested via client2
--- a/tests/run_agent/conftest.py
+++ b/tests/run_agent/conftest.py
@@ -0,0 +1,34 @@
+"""Fast-path fixtures shared across tests/run_agent/.
+
+Many tests in this directory exercise the retry/backoff paths in the
+agent loop. Production code uses ``jittered_backoff(base_delay=5.0)``
+with a ``while time.time() < sleep_end`` loop — a single retry test
+spends 5+ seconds of real wall-clock time on backoff waits.
+
+Mocking ``jittered_backoff`` to return 0.0 collapses the while-loop
+to a no-op (``time.time() < time.time() + 0`` is false immediately),
+which handles the most common case without touching ``time.sleep``.
+
+We deliberately DO NOT mock ``time.sleep`` here — some tests
+(test_interrupt_propagation, test_primary_runtime_restore, etc.) use
+the real ``time.sleep`` for threading coordination or assert that it
+was called with specific values. Tests that want to additionally
+fast-path direct ``time.sleep(N)`` calls in production code should
+monkeypatch ``run_agent.time.sleep`` locally (see
+``test_anthropic_error_handling.py`` for the pattern).
+"""
+
+from __future__ import annotations
+
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def _fast_retry_backoff(monkeypatch):
+    """Short-circuit retry backoff for all tests in this directory."""
+    try:
+        import run_agent
+    except ImportError:
+        return
+
+    monkeypatch.setattr(run_agent, "jittered_backoff", lambda *a, **k: 0.0)
--- a/tests/run_agent/test_1630_context_overflow_loop.py
+++ b/tests/run_agent/test_1630_context_overflow_loop.py
@@ -32,6 +32,7 @@ class TestGeneric400Heuristic:
            from run_agent import AIAgent
            a = AIAgent(
                api_key="test-key-12345",
+                base_url="https://openrouter.ai/api/v1",
                quiet_mode=True,
                skip_context_files=True,
                skip_memory=True,
--- a/tests/run_agent/test_413_compression.py
+++ b/tests/run_agent/test_413_compression.py
@@ -19,6 +19,24 @@ import pytest

 from agent.context_compressor import SUMMARY_PREFIX
 from run_agent import AIAgent
+import run_agent
+
+
+# ---------------------------------------------------------------------------
+# Fast backoff for compression retry tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(autouse=True)
+def _no_compression_sleep(monkeypatch):
+    """Short-circuit the 2s time.sleep between compression retries.
+
+    Production code has ``time.sleep(2)`` in multiple places after a 413/context
+    compression, for rate-limit smoothing. Tests assert behavior, not timing.
+    """
+    import time as _time
+    monkeypatch.setattr(_time, "sleep", lambda *_a, **_k: None)
+    monkeypatch.setattr(run_agent, "jittered_backoff", lambda *a, **k: 0.0)


 # ---------------------------------------------------------------------------
@@ -69,6 +87,7 @@ def agent():
    ):
        a = AIAgent(
            api_key="test-key-1234567890",
+            base_url="https://openrouter.ai/api/v1",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
--- a/tests/run_agent/test_860_dedup.py
+++ b/tests/run_agent/test_860_dedup.py
@@ -29,6 +29,8 @@ class TestFlushDeduplication:
        with patch.dict(os.environ, {"OPENROUTER_API_KEY": "test-key"}):
            from run_agent import AIAgent
            agent = AIAgent(
+                api_key="test-key",
+                base_url="https://openrouter.ai/api/v1",
                model="test/model",
                quiet_mode=True,
                session_db=session_db,
@@ -271,6 +273,8 @@ class TestFlushIdxInit:
        with patch.dict(os.environ, {"OPENROUTER_API_KEY": "test-key"}):
            from run_agent import AIAgent
            agent = AIAgent(
+                api_key="test-key",
+                base_url="https://openrouter.ai/api/v1",
                model="test/model",
                quiet_mode=True,
                skip_context_files=True,
@@ -283,6 +287,8 @@ class TestFlushIdxInit:
        with patch.dict(os.environ, {"OPENROUTER_API_KEY": "test-key"}):
            from run_agent import AIAgent
            agent = AIAgent(
+                api_key="test-key",
+                base_url="https://openrouter.ai/api/v1",
                model="test/model",
                quiet_mode=True,
                skip_context_files=True,
--- a/tests/run_agent/test_anthropic_error_handling.py
+++ b/tests/run_agent/test_anthropic_error_handling.py
@@ -27,6 +27,39 @@ from gateway.config import Platform
 from gateway.session import SessionSource


+# ---------------------------------------------------------------------------
+# Fast backoff for tests that exercise the retry loop
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(autouse=True)
+def _no_backoff_wait(monkeypatch):
+    """Short-circuit retry backoff so tests don't block on real wall-clock waits.
+
+    The production code uses jittered_backoff() with a 5s base delay plus a
+    tight time.sleep(0.2) loop. Without this patch, each 429/500/529 retry
+    test burns ~10s of real time on CI — across six tests that's ~60s for
+    behavior we're not asserting against timing.
+
+    Tests assert retry counts and final results, never wait durations.
+    """
+    import asyncio as _asyncio
+    import time as _time
+
+    monkeypatch.setattr(run_agent, "jittered_backoff", lambda *a, **k: 0.0)
+    monkeypatch.setattr(_time, "sleep", lambda *_a, **_k: None)
+
+    # Also fast-path asyncio.sleep — the gateway's _run_agent path has
+    # several await asyncio.sleep(...) calls that add real wall-clock time.
+    _real_asyncio_sleep = _asyncio.sleep
+
+    async def _fast_sleep(delay=0, *args, **kwargs):
+        # Yield to the event loop but skip the actual delay.
+        await _real_asyncio_sleep(0)
+
+    monkeypatch.setattr(_asyncio, "sleep", _fast_sleep)
+
+
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
--- a/tests/run_agent/test_compression_persistence.py
+++ b/tests/run_agent/test_compression_persistence.py
@@ -37,6 +37,8 @@ class TestFlushAfterCompression:
        with patch.dict(os.environ, {"OPENROUTER_API_KEY": "test-key"}):
            from run_agent import AIAgent
            agent = AIAgent(
+                api_key="test-key",
+                base_url="https://openrouter.ai/api/v1",
                model="test/model",
                quiet_mode=True,
                session_db=session_db,
--- a/tests/run_agent/test_create_openai_client_kwargs_isolation.py
+++ b/tests/run_agent/test_create_openai_client_kwargs_isolation.py
@@ -19,6 +19,8 @@ from run_agent import AIAgent
 def test_create_openai_client_does_not_mutate_input_kwargs(mock_openai):
    mock_openai.return_value = MagicMock()
    agent = AIAgent(
+        api_key="test-key",
+        base_url="https://openrouter.ai/api/v1",
        model="test/model",
        quiet_mode=True,
        skip_context_files=True,
--- a/tests/run_agent/test_create_openai_client_reuse.py
+++ b/tests/run_agent/test_create_openai_client_reuse.py
@@ -23,6 +23,8 @@ from run_agent import AIAgent

 def _make_agent():
    return AIAgent(
+        api_key="test-key",
+        base_url="https://openrouter.ai/api/v1",
        model="test/model",
        quiet_mode=True,
        skip_context_files=True,
--- a/tests/run_agent/test_fallback_model.py
+++ b/tests/run_agent/test_fallback_model.py
@@ -11,6 +11,16 @@ from unittest.mock import MagicMock, patch
 import pytest

 from run_agent import AIAgent
+import run_agent
+
+
+@pytest.fixture(autouse=True)
+def _no_fallback_wait(monkeypatch):
+    """Short-circuit time.sleep in fallback/recovery paths so tests don't
+    block on the ``min(3 + retry_count, 8)`` wait before a primary retry."""
+    import time as _time
+    monkeypatch.setattr(_time, "sleep", lambda *_a, **_k: None)
+    monkeypatch.setattr(run_agent, "jittered_backoff", lambda *a, **k: 0.0)


 def _make_tool_defs(*names: str) -> list:
@@ -36,6 +46,7 @@ def _make_agent(fallback_model=None):
    ):
        agent = AIAgent(
            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
--- a/tests/run_agent/test_plugin_context_engine_init.py
+++ b/tests/run_agent/test_plugin_context_engine_init.py
@@ -45,6 +45,7 @@ def test_plugin_engine_gets_context_length_on_init():

        agent = AIAgent(
            api_key="test-key-1234567890",
+            base_url="https://openrouter.ai/api/v1",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
@@ -75,6 +76,7 @@ def test_plugin_engine_update_model_args():
        agent = AIAgent(
            model="openrouter/auto",
            api_key="test-key-1234567890",
+            base_url="https://openrouter.ai/api/v1",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
--- a/tests/run_agent/test_provider_fallback.py
+++ b/tests/run_agent/test_provider_fallback.py
@@ -19,6 +19,7 @@ def _make_agent(fallback_model=None):
    ):
        agent = AIAgent(
            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
--- a/tests/run_agent/test_provider_parity.py
+++ b/tests/run_agent/test_provider_parity.py
@@ -60,6 +60,9 @@ def _make_agent(monkeypatch, provider, api_mode="chat_completions", base_url="ht
    )
    if model:
        kwargs["model"] = model
+    base_url="https://openrouter.ai/api/v1",
+    api_key="test-key",
+    base_url="https://openrouter.ai/api/v1",
    return AIAgent(**kwargs)


--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@@ -55,6 +55,7 @@ def agent():
    ):
        a = AIAgent(
            api_key="test-key-1234567890",
+            base_url="https://openrouter.ai/api/v1",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
@@ -76,6 +77,7 @@ def agent_with_memory_tool():
    ):
        a = AIAgent(
            api_key="test-k...7890",
+            base_url="https://openrouter.ai/api/v1",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
@@ -112,12 +114,14 @@ def test_aiagent_reuses_existing_errors_log_handler():
        ):
            AIAgent(
                api_key="test-k...7890",
+                base_url="https://openrouter.ai/api/v1",
                quiet_mode=True,
                skip_context_files=True,
                skip_memory=True,
            )
            AIAgent(
                api_key="test-k...7890",
+                base_url="https://openrouter.ai/api/v1",
                quiet_mode=True,
                skip_context_files=True,
                skip_memory=True,
@@ -491,6 +495,7 @@ class TestInit:
        ):
            a = AIAgent(
                api_key="test-key-1234567890",
+                base_url="https://openrouter.ai/api/v1",
                model="openai/gpt-4o",
                quiet_mode=True,
                skip_context_files=True,
@@ -542,6 +547,7 @@ class TestInit:
        ):
            a = AIAgent(
                api_key="test-key-1234567890",
+                base_url="https://openrouter.ai/api/v1",
                quiet_mode=True,
                skip_context_files=True,
                skip_memory=True,
@@ -557,6 +563,7 @@ class TestInit:
        ):
            a = AIAgent(
                api_key="test-key-1234567890",
+                base_url="https://openrouter.ai/api/v1",
                quiet_mode=True,
                skip_context_files=True,
                skip_memory=True,
@@ -694,6 +701,7 @@ class TestBuildSystemPrompt:
        ):
            agent = AIAgent(
                api_key="test-k...7890",
+                base_url="https://openrouter.ai/api/v1",
                quiet_mode=True,
                skip_context_files=True,
                skip_memory=True,
@@ -726,6 +734,7 @@ class TestToolUseEnforcementConfig:
            a = AIAgent(
                model=model,
                api_key="test-key-1234567890",
+                base_url="https://openrouter.ai/api/v1",
                quiet_mode=True,
                skip_context_files=True,
                skip_memory=True,
@@ -822,6 +831,7 @@ class TestToolUseEnforcementConfig:
        ):
            a = AIAgent(
                api_key="test-key-1234567890",
+                base_url="https://openrouter.ai/api/v1",
                quiet_mode=True,
                skip_context_files=True,
                skip_memory=True,
@@ -3433,7 +3443,7 @@ class TestAnthropicBaseUrlPassthrough:
        ):
            mock_build.return_value = MagicMock()
            a = AIAgent(
-                api_key="sk-ant-api03-test1234567890",
+                api_key="sk-ant...7890",
                api_mode="anthropic_messages",
                quiet_mode=True,
                skip_context_files=True,
@@ -3457,6 +3467,7 @@ class TestAnthropicCredentialRefresh:
            mock_build.side_effect = [old_client, new_client]
            agent = AIAgent(
                api_key="sk-ant-oat01-stale-token",
+                base_url="https://openrouter.ai/api/v1",
                api_mode="anthropic_messages",
                quiet_mode=True,
                skip_context_files=True,
@@ -3487,6 +3498,7 @@ class TestAnthropicCredentialRefresh:
        ):
            agent = AIAgent(
                api_key="sk-ant-oat01-same-token",
+                base_url="https://openrouter.ai/api/v1",
                api_mode="anthropic_messages",
                quiet_mode=True,
                skip_context_files=True,
@@ -3514,6 +3526,7 @@ class TestAnthropicCredentialRefresh:
        ):
            agent = AIAgent(
                api_key="sk-ant-oat01-current-token",
+                base_url="https://openrouter.ai/api/v1",
                api_mode="anthropic_messages",
                quiet_mode=True,
                skip_context_files=True,
--- a/tests/run_agent/test_run_agent_codex_responses.py
+++ b/tests/run_agent/test_run_agent_codex_responses.py
@@ -12,6 +12,15 @@ sys.modules.setdefault("fal_client", types.SimpleNamespace())
 import run_agent


+@pytest.fixture(autouse=True)
+def _no_codex_backoff(monkeypatch):
+    """Short-circuit retry backoff so Codex retry tests don't block on real
+    wall-clock waits (5s jittered_backoff base delay + tight time.sleep loop)."""
+    import time as _time
+    monkeypatch.setattr(run_agent, "jittered_backoff", lambda *a, **k: 0.0)
+    monkeypatch.setattr(_time, "sleep", lambda *_a, **_k: None)
+
+
 def _patch_agent_bootstrap(monkeypatch):
    monkeypatch.setattr(
        run_agent,
--- a/tests/run_agent/test_streaming.py
+++ b/tests/run_agent/test_streaming.py
@@ -80,6 +80,8 @@ class TestStreamingAccumulator:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -120,6 +122,8 @@ class TestStreamingAccumulator:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -167,6 +171,8 @@ class TestStreamingAccumulator:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -205,6 +211,8 @@ class TestStreamingAccumulator:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -245,6 +253,8 @@ class TestStreamingCallbacks:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -277,6 +287,8 @@ class TestStreamingCallbacks:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -308,6 +320,8 @@ class TestStreamingCallbacks:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -346,6 +360,8 @@ class TestStreamingCallbacks:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -381,6 +397,8 @@ class TestStreamingCallbacks:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -428,6 +446,8 @@ class TestStreamingFallback:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -455,6 +475,8 @@ class TestStreamingFallback:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -477,6 +499,8 @@ class TestStreamingFallback:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -500,6 +524,8 @@ class TestStreamingFallback:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -542,6 +568,8 @@ class TestStreamingFallback:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -577,6 +605,8 @@ class TestStreamingFallback:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -619,6 +649,8 @@ class TestReasoningStreaming:
        mock_create.return_value = mock_client

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -646,6 +678,8 @@ class TestHasStreamConsumers:
    def test_no_consumers(self):
        from run_agent import AIAgent
        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -656,6 +690,8 @@ class TestHasStreamConsumers:
    def test_delta_callback_set(self):
        from run_agent import AIAgent
        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -667,6 +703,8 @@ class TestHasStreamConsumers:
    def test_stream_callback_set(self):
        from run_agent import AIAgent
        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -688,6 +726,8 @@ class TestCodexStreamCallbacks:
        deltas = []

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -729,6 +769,8 @@ class TestCodexStreamCallbacks:
        from run_agent import AIAgent

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -792,6 +834,8 @@ class TestCodexStreamCallbacks:
        )

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -810,6 +854,8 @@ class TestCodexStreamCallbacks:
        from run_agent import AIAgent

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
@@ -861,6 +907,8 @@ class TestAnthropicStreamCallbacks:
        from run_agent import AIAgent

        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            skip_context_files=True,
--- a/tests/run_agent/test_token_persistence_non_cli.py
+++ b/tests/run_agent/test_token_persistence_non_cli.py
@@ -22,6 +22,7 @@ def _make_agent(session_db, *, platform: str):
    ):
        agent = AIAgent(
            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,