chore: release v0.14.0 (2026.5.16) (#26862)

The Foundation Release — Hermes installs and runs anywhere now. Highlights: - Native Windows support (early beta) — PowerShell installer, native subprocess/PTY paths, ~40 follow-up Windows-only fixes - pip install hermes-agent — PyPI wheel - Cold-start wave — ~19s off hermes launch, 180x faster browser_console (CDP WS) - Supply-chain advisory checker + lazy-deps + tiered install fallback - OpenAI-compatible local proxy for OAuth providers (Claude Pro, ChatGPT Pro, SuperGrok) - Cross-session 1h Claude prompt cache (Anthropic / OpenRouter / Nous Portal) - 2 new platforms: LINE + SimpleX Chat (22 total) - Microsoft Graph foundation — Teams pipeline + webhook adapter - /handoff actually transfers sessions live - x_search first-class tool, vision_analyze pixel passthrough - LSP semantic diagnostics on every write - Unified video_generate with pluggable backends - computer_use cua-driver backend - 9 new optional skills, OpenRouter Pareto Code router, xAI Grok OAuth - 12 P0 + 50 P1 closures 808 commits · 633 PRs · 1393 files · 165k insertions · 545 issues closed · 215 contributors
test(security): regression guard for OAuth PKCE state/verifier separation
2026-05-16 02:58:57 -07:00 · 2026-05-16 02:38:02 -07:00 · 2026-05-16 02:38:02 -07:00 · 2026-05-16 02:38:02 -07:00 · 2026-05-16 02:25:41 -07:00 · 2026-05-16 02:24:48 -07:00
406 changed files with 31005 additions and 17073 deletions
@@ -281,6 +281,13 @@ BROWSER_SESSION_TIMEOUT=300
 # Browser sessions are automatically closed after this period of no activity
 BROWSER_INACTIVITY_TIMEOUT=120

+# Extra Chromium launch flags passed to agent-browser, comma- or newline-separated.
+# Hermes auto-injects "--no-sandbox,--disable-dev-shm-usage" when it detects root
+# or AppArmor-restricted unprivileged user namespaces (Ubuntu 23.10+, DGX Spark,
+# many container images), so leave this unset unless you need extra flags.
+# Setting this disables the auto-injection.
+# AGENT_BROWSER_ARGS=--no-sandbox
+
 # Camofox local anti-detection browser (Camoufox-based Firefox).
 # Set CAMOFOX_URL to route the browser tools through a local Camofox server
 # instead of agent-browser/Browserbase. See docs/user-guide/features/browser.md.
@@ -387,24 +394,6 @@ IMAGE_TOOLS_DEBUG=false
 # CONTEXT_COMPRESSION_THRESHOLD=0.85      # Compress at 85% of context limit
 # Model is set via compression.summary_model in config.yaml (default: google/gemini-3-flash-preview)

-# =============================================================================
-# RL TRAINING (Tinker + Atropos)
-# =============================================================================
-# Run reinforcement learning training on language models using the Tinker API.
-# Requires the rl-server to be running (from tinker-atropos package).
-
-# Tinker API Key - RL training service
-# Get at: https://tinker-console.thinkingmachines.ai/keys
-# TINKER_API_KEY=
-
-# Weights & Biases API Key - Experiment tracking and metrics
-# Get at: https://wandb.ai/authorize
-# WANDB_API_KEY=
-
-# RL API Server URL (default: http://localhost:8080)
-# Change if running the rl-server on a different host/port
-# RL_API_URL=http://localhost:8080
-
 # =============================================================================
 # SKILLS HUB (GitHub integration for skill search/install/publish)
 # =============================================================================
@@ -0,0 +1,58 @@
+name: History Check
+
+# Rejects PRs whose branch has no common ancestor with main.
+#
+# In May 2026 PR #25045 was merged from a branch that had been disconnected
+# from main's history (likely an accidental `git checkout --orphan` or
+# `.git/` re-init).  GitHub's merge UI does not refuse merges of unrelated
+# histories, so the PR landed cleanly with the intended one-file change —
+# but its parent-less root commit (413990c94) got grafted into main as a
+# second root, and ~1500 files' worth of `git blame` history collapsed
+# onto that single commit.
+#
+# This check catches the failure mode by requiring `git merge-base` between
+# the PR head and main to be non-empty.
+
+on:
+  pull_request:
+    branches: [main]
+
+permissions:
+  contents: read
+
+jobs:
+  check-common-ancestor:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
+        with:
+          # On pull_request events the default checkout is the synthetic
+          # refs/pull/N/merge commit, which has the base branch as a parent,
+          # so `git merge-base origin/main HEAD` could never come back empty.
+          # Check out the PR's real head commit so HEAD is the branch itself.
+          ref: ${{ github.event.pull_request.head.sha }}
+          fetch-depth: 0  # full history both sides for merge-base
+
+      - name: Reject PRs with no common ancestor on main
+        run: |
+          # `git merge-base` exits non-zero AND prints nothing when the two
+          # commits share no ancestor.  We check both conditions explicitly
+          # so the failure message is clear regardless of which signal fires
+          # first.
+          if ! BASE=$(git merge-base origin/main HEAD 2>/dev/null) || [ -z "$BASE" ]; then
+            echo ""
+            echo "::error::This PR has no common ancestor with main."
+            echo ""
+            echo "Your branch's history is disconnected from main.  Common causes:"
+            echo "  - the branch was created with 'git checkout --orphan'"
+            echo "  - '.git/' was re-initialized at some point during the work"
+            echo "  - the branch was force-pushed from an unrelated repository"
+            echo ""
+            echo "Merging an unrelated-history PR grafts a parent-less root commit"
+            echo "into main and collapses git blame for every file in that snapshot."
+            echo "Reference: PR #25045 caused this and re-rooted blame on ~1500"
+            echo "files to a single orphan commit."
+            echo ""
+            echo "To fix, rebase your changes onto current main:"
+            echo "  git fetch origin main"
+            echo "  git checkout -b fix-branch origin/main"
+            echo "  # re-apply your changes (cherry-pick, copy files, etc.)"
+            echo "  git push -f origin fix-branch"
+            exit 1
+          fi
+          echo "::notice::Common ancestor with main: $BASE"
@@ -11,6 +11,7 @@ on:
      - '**/sitecustomize.py'
      - '**/usercustomize.py'
      - '**/__init__.pth'
+      - 'pyproject.toml'

 permissions:
  pull-requests: write
@@ -137,3 +138,68 @@ jobs:
        run: |
          echo "::error::CRITICAL supply chain risk patterns detected in this PR. See the PR comment for details."
          exit 1
+
+  dep-bounds:
+    name: Check PyPI dependency upper bounds
+    runs-on: ubuntu-latest
+    # Always runs: the bounds step exits early (found=false) when pyproject.toml gains no lines.
+    steps:
+      - name: Checkout
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
+        with:
+          fetch-depth: 0
+
+      - name: Check for unbounded PyPI deps
+        id: bounds
+        run: |
+          set -euo pipefail
+
+          BASE="${{ github.event.pull_request.base.sha }}"
+          HEAD="${{ github.event.pull_request.head.sha }}"
+
+          # Only check added lines in pyproject.toml
+          ADDED=$(git diff "$BASE".."$HEAD" -- pyproject.toml | grep '^+' | grep -v '^+++' || true)
+
+          if [ -z "$ADDED" ]; then
+            echo "found=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # Match PyPI dep specs that have >= but no < ceiling.
+          # Pattern: "package>=version" without a following ",<" bound.
+          # Excludes git+ URLs (which use commit SHAs) and comments.
+          # Regex notes:
+          #   - the name class includes "." so dotted names (e.g. ruamel.yaml) match
+          #   - the extras group uses [^]]* because a backslash is literal inside
+          #     a POSIX bracket expression; [^\]]* would only match one-character
+          #     extras and let specs like "pkg[extra]>=1.0" slip through
+          UNBOUNDED=$(echo "$ADDED" | grep -oE '"[a-zA-Z0-9._-]+(\[[^]]*\])?>=[ 0-9.]+"' | grep -v ',<' || true)
+
+          if [ -n "$UNBOUNDED" ]; then
+            echo "found=true" >> "$GITHUB_OUTPUT"
+            echo "$UNBOUNDED" > /tmp/unbounded.txt
+          else
+            echo "found=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Post unbounded dep warning
+        if: steps.bounds.outputs.found == 'true'
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          BODY="## ⚠️ Unbounded PyPI Dependency Detected
+
+          This PR adds PyPI dependencies without a \`<next_major\` upper bound. Per our [supply chain policy](../blob/main/CONTRIBUTING.md#dependency-pinning-policy-supply-chain-hardening), all PyPI deps must be pinned as \`>=floor,<next_major\`.
+
+          **Unbounded specs found:**
+          \`\`\`
+          $(cat /tmp/unbounded.txt)
+          \`\`\`
+
+          **Fix:** Add an upper bound, e.g. \`\"package>=1.2.0,<2\"\`
+
+          ---
+          *See PR #2810 and CONTRIBUTING.md for the full policy rationale.*"
+
+          gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY" || echo "::warning::Could not post PR comment (expected for fork PRs)"
+
+      - name: Fail on unbounded deps
+        if: steps.bounds.outputs.found == 'true'
+        run: |
+          echo "::error::PyPI dependencies without upper bounds detected. Add <next_major ceiling per CONTRIBUTING.md policy."
+          exit 1
@@ -0,0 +1,163 @@
+name: Publish to PyPI
+
+# Triggered by CalVer tag pushes from scripts/release.py (e.g. v2026.5.15)
+# Can also be triggered manually from the Actions tab as an escape hatch.
+on:
+  push:
+    tags:
+      - 'v20*'  # CalVer tags: v2026.5.15, v2026.5.15.2, etc.
+  workflow_dispatch:
+    inputs:
+      confirm_tag:
+        description: 'Tag to publish (e.g. v2026.5.15). Must already exist.'
+        required: true
+        type: string
+
+# Restrict default token to read-only; each job escalates as needed.
+permissions:
+  contents: read
+
+# Prevent overlapping publishes (e.g. two same-day tags pushed quickly).
+concurrency:
+  group: pypi-publish
+  cancel-in-progress: false
+
+jobs:
+  build:
+    name: Build distribution 📦
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
+        with:
+          persist-credentials: false
+          # On workflow_dispatch, check out the confirmed tag.
+          ref: ${{ inputs.confirm_tag || github.ref }}
+          fetch-tags: true
+
+      - name: Validate tag exists
+        if: github.event_name == 'workflow_dispatch'
+        env:
+          # Pass the user-supplied input through the environment instead of
+          # interpolating it into the script, so it cannot inject shell syntax.
+          CONFIRM_TAG: ${{ inputs.confirm_tag }}
+        run: |
+          # Verify the exact tag ref; `git tag -l` would treat the input as a glob.
+          if ! git rev-parse --quiet --verify "refs/tags/${CONFIRM_TAG}" >/dev/null; then
+            echo "::error::Tag '${CONFIRM_TAG}' does not exist in the repo"
+            exit 1
+          fi
+
+      - name: Set up Python
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5
+        with:
+          python-version: '3.13'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e  # v6
+
+      - name: Set up Node.js
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
+        with:
+          node-version: '22'
+
+      - name: Build web dashboard
+        run: cd web && npm ci && npm run build
+
+      - name: Build TUI bundle
+        run: cd ui-tui && npm ci && npm run build
+
+      - name: Bundle TUI into hermes_cli
+        run: |
+          mkdir -p hermes_cli/tui_dist
+          cp ui-tui/dist/entry.js hermes_cli/tui_dist/entry.js
+
+      - name: Verify frontend assets exist
+        run: |
+          test -f hermes_cli/web_dist/index.html || { echo "ERROR: web_dist not built"; exit 1; }
+          test -f hermes_cli/tui_dist/entry.js || { echo "ERROR: tui_dist not built"; exit 1; }
+
+      - name: Bundle install.sh into wheel
+        run: |
+          mkdir -p hermes_cli/scripts
+          cp scripts/install.sh hermes_cli/scripts/install.sh
+
+      - name: Build wheel and sdist
+        run: uv build --sdist --wheel
+
+      - name: Upload distribution artifacts
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
+        with:
+          name: python-package-distributions
+          path: dist/
+
+  publish:
+    name: Publish to PyPI
+    needs: build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/hermes-agent
+    permissions:
+      id-token: write  # OIDC trusted publishing
+
+    steps:
+      - name: Download distribution artifacts
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
+        with:
+          name: python-package-distributions
+          path: dist/
+
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b  # v1.14.0
+        with:
+          skip-existing: true
+
+  sign:
+    name: Sign and attach to GitHub Release
+    # Only runs on tag pushes — release.py creates the GitHub Release,
+    # and workflow_dispatch won't have a matching release to attach to.
+    # github.ref is also a tag when the workflow is dispatched manually from
+    # a tag ref, so the event name is checked explicitly as well.
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+    needs: publish
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write   # attach assets to the existing release
+      id-token: write   # sigstore signing
+
+    steps:
+      - name: Download distribution artifacts
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
+        with:
+          name: python-package-distributions
+          path: dist/
+
+      - name: Wait for GitHub Release to exist
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        # release.py creates the GitHub Release after pushing the tag,
+        # but this workflow starts from the tag push — wait for it.
+        run: |
+          for i in $(seq 1 30); do
+            if gh release view "$GITHUB_REF_NAME" --repo "$GITHUB_REPOSITORY" >/dev/null 2>&1; then
+              echo "Release $GITHUB_REF_NAME found"
+              exit 0
+            fi
+            echo "Waiting for release... ($i/30)"
+            sleep 10
+          done
+          echo "::warning::Release $GITHUB_REF_NAME not found after 5 minutes — skipping signature upload"
+          echo "skip_sign=true" >> "$GITHUB_ENV"
+
+      - name: Sign with Sigstore
+        if: env.skip_sign != 'true'
+        uses: sigstore/gh-action-sigstore-python@f514d46b907ebcd5bedc05145c03b69c1edd8b46  # v3.0.0
+        with:
+          inputs: >-
+            ./dist/*.tar.gz
+            ./dist/*.whl
+
+      - name: Attach signed artifacts to GitHub Release
+        if: env.skip_sign != 'true'
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        # release.py already created the GitHub Release — just upload
+        # the Sigstore signatures alongside the existing assets.
+        run: >-
+          gh release upload
+          "$GITHUB_REF_NAME" dist/*.sigstore.json
+          --repo "$GITHUB_REPOSITORY"
+          --clobber
@@ -70,3 +70,6 @@ mini-swe-agent/
 result
 website/static/api/skills-index.json
 models-dev-upstream/
+hermes_cli/tui_dist/*
+hermes_cli/scripts/
+docs/superpowers/*
@@ -1,3 +0,0 @@
-[submodule "tinker-atropos"]
-	path = tinker-atropos
-	url = https://github.com/nousresearch/tinker-atropos
@@ -56,7 +56,6 @@ hermes-agent/
 ├── tui_gateway/          # Python JSON-RPC backend for the TUI
 ├── acp_adapter/          # ACP server (VS Code / Zed / JetBrains integration)
 ├── cron/                 # Scheduler — jobs.py, scheduler.py
-├── environments/         # RL training environments (Atropos)
 ├── scripts/              # run_tests.sh, release.py, auxiliary scripts
 ├── website/              # Docusaurus docs site
 └── tests/                # Pytest suite (~17k tests across ~900 files as of May 2026)
@@ -309,6 +308,29 @@ The registry handles schema collection, dispatch, availability checking, and err

 ---

+## Dependency Pinning Policy
+
+All dependencies must have upper bounds to limit supply-chain attack surface.
+This policy was established after the litellm compromise (PR #2796, #2810) and
+reinforced after the Mini Shai-Hulud worm campaign (May 2026).
+
+| Source type | Treatment | Example |
+|---|---|---|
+| PyPI package | `>=floor,<next_major` | `"httpx>=0.28.1,<1"` |
+| Git URL | Commit SHA | `git+https://...@<40-char-sha>` |
+| GitHub Actions | Commit SHA + comment | `uses: actions/checkout@<sha>  # v4` |
+| CI-only pip | `==exact` | `pyyaml==6.0.2` |
+
+**When adding a new dependency to `pyproject.toml`:**
+1. Pin to `>=current_version,<next_major` for post-1.0 (e.g. `>=1.5.0,<2`).
+2. For pre-1.0 packages, use `<0.(current_minor + 3)` (e.g. `>=0.29,<0.32`).
+3. Never commit a bare `>=X.Y.Z` without a ceiling — CI and reviewers will reject it.
+4. Run `uv lock` to regenerate `uv.lock` with hashes.
+
+Reference: #2810 (bounds pass), #9801 (SHA pinning + audit CI).
+
+---
+
 ## Adding Configuration

 ### config.yaml options:
@@ -91,9 +91,6 @@ export VIRTUAL_ENV="$(pwd)/venv"
 # Install with all extras (messaging, cron, CLI menus, dev tools)
 uv pip install -e ".[all,dev]"

-# Optional: RL training submodule
-# git submodule update --init tinker-atropos && uv pip install -e "./tinker-atropos"
-
 # Optional: browser tools
 npm install
 ```
@@ -196,7 +193,6 @@ hermes-agent/
 │
 ├── skills/                   # Bundled skills (copied to ~/.hermes/skills/ on install)
 ├── optional-skills/          # Official optional skills (discoverable via hub, not activated by default)
-├── environments/             # RL training environments (Atropos integration)
 ├── tests/                    # Test suite
 ├── website/                  # Documentation site (hermes-agent.nousresearch.com)
 │
@@ -804,6 +800,47 @@ Hermes has terminal access. Security matters.

 If your PR affects security, note it explicitly in the description.

+### Dependency pinning policy (supply chain hardening)
+
+After the [litellm supply chain compromise](https://github.com/BerriAI/litellm/issues/24512) in March 2026 and the [Mini Shai-Hulud worm campaign](https://socket.dev/blog/tanstack-npm-packages-compromised-mini-shai-hulud-supply-chain-attack) in May 2026, all dependencies must follow these rules:
+
+| Source type | Required treatment | Rationale |
+|---|---|---|
+| **PyPI package** | `>=floor,<next_major` | PyPI versions are immutable once published, but new versions can be pushed into your range. A `<next_major` ceiling stops a 1.x install from upgrading to a malicious 2.0.0. |
+| **Git URL** (atroposlib, tinker, yc-bench, Baileys) | Full commit SHA | Branches and tags are mutable refs; SHA is content-addressed. |
+| **GitHub Actions** | Full commit SHA + version comment | Action tags are mutable refs (e.g. tj-actions/changed-files March 2025). Pin as `uses: owner/action@<sha>  # vX.Y.Z` |
+| **CI-only pip installs** | `==exact` | Hermetic CI builds; churn is acceptable. |
+
+**Every new PyPI dependency in a PR must have a `<next_major` upper bound.** PRs adding unbounded `>=X.Y.Z` specs will be rejected by reviewers and by CI, which fails any PR that adds an unbounded spec to `pyproject.toml`. The `supply-chain-audit.yml` CI workflow also flags dependency manifest changes for manual review.
+
+**How to determine the ceiling:**
+- If the package is at version `1.x.y`, use `<2`.
+- If the package is at version `0.x.y` (pre-1.0), use `<0.(current_minor + 3)` — e.g. if current is `0.29.x`, use `<0.32`. This gives ~2 minor versions of headroom while keeping the window small enough that a hostile takeover version is unlikely to land inside it.
+- Exception: packages with very stable APIs (e.g. `aiohttp-socks`) can use `<1` at reviewer discretion.
+
+**Examples:**
+```toml
+# ✅ Correct — post-1.0
+"openai>=2.21.0,<3"
+"pydantic>=2.12.5,<3"
+
+# ✅ Correct — pre-1.0 (tight minor window)
+"asyncpg>=0.29,<0.32"
+"aiosqlite>=0.20,<0.23"
+"hindsight-client>=0.4.22,<0.5"
+
+# ❌ Rejected — no upper bound
+"some-package>=1.2.3"
+
+# ❌ Rejected — too tight (blocks legitimate patches)
+"some-package==1.2.3"
+
+# ❌ Rejected — too loose for pre-1.0 (allows 80 minor versions)
+"some-package>=0.20,<1"
+```
+
+**Reference PRs:** #2796 (litellm removal), #2810 (upper bounds pass), #9801 (SHA pinning + supply-chain-audit CI).
+
 ---

 ## Pull Request Process
@@ -23,7 +23,7 @@ Use any model you want — [Nous Portal](https://portal.nousresearch.com), [Open
 <tr><td><b>Scheduled automations</b></td><td>Built-in cron scheduler with delivery to any platform. Daily reports, nightly backups, weekly audits — all in natural language, running unattended.</td></tr>
 <tr><td><b>Delegates and parallelizes</b></td><td>Spawn isolated subagents for parallel workstreams. Write Python scripts that call tools via RPC, collapsing multi-step pipelines into zero-context-cost turns.</td></tr>
 <tr><td><b>Runs anywhere, not just your laptop</b></td><td>Seven terminal backends — local, Docker, SSH, Singularity, Modal, Daytona, and Vercel Sandbox. Daytona and Modal offer serverless persistence — your agent's environment hibernates when idle and wakes on demand, costing nearly nothing between sessions. Run it on a $5 VPS or a GPU cluster.</td></tr>
-<tr><td><b>Research-ready</b></td><td>Batch trajectory generation, Atropos RL environments, trajectory compression for training the next generation of tool-calling models.</td></tr>
+<tr><td><b>Research-ready</b></td><td>Batch trajectory generation, trajectory compression for training the next generation of tool-calling models.</td></tr>
 </table>

 ---
@@ -175,8 +175,6 @@ uv pip install -e ".[all,dev]"
 scripts/run_tests.sh
 ```

-> **RL Training (optional):** The RL/Atropos integration (`environments/`) — see [`CONTRIBUTING.md`](https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md#development-setup) for the full setup.
-
 ---

 ## Community
@@ -23,7 +23,7 @@
 <tr><td><b>定时自动化</b></td><td>内置 cron 调度器，支持向任何平台投递。日报、夜间备份、周审计——全部用自然语言描述，无人值守运行。</td></tr>
 <tr><td><b>委派与并行</b></td><td>生成隔离子代理处理并行工作流。编写 Python 脚本通过 RPC 调用工具，将多步管道压缩为零上下文开销的轮次。</td></tr>
 <tr><td><b>随处运行</b></td><td>六种终端后端——本地、Docker、SSH、Daytona、Singularity 和 Modal。Daytona 和 Modal 提供 Serverless 持久化——代理环境空闲时休眠、按需唤醒，空闲期间几乎零成本。$5 VPS 或 GPU 集群都能跑。</td></tr>
-<tr><td><b>研究就绪</b></td><td>批量轨迹生成、Atropos RL 环境、轨迹压缩——用于训练下一代工具调用模型。</td></tr>
+<tr><td><b>研究就绪</b></td><td>批量轨迹生成、轨迹压缩——用于训练下一代工具调用模型。</td></tr>
 </table>

 ---
@@ -161,12 +161,6 @@ uv pip install -e ".[all,dev]"
 python -m pytest tests/ -q
 ```

-> **RL 训练（可选）：** 如需参与 RL/Tinker-Atropos 集成开发：
-> ```bash
-> git submodule update --init tinker-atropos
-> uv pip install -e "./tinker-atropos"
-> ```
-
 ---

 ## 社区
@@ -0,0 +1,477 @@
+# Hermes Agent v0.14.0 (v2026.5.16)
+
+**Release Date:** May 16, 2026
+**Since v0.13.0:** 808 commits · 633 merged PRs · 1393 files changed · 165,061 insertions · 545 issues closed (12 P0, 50 P1) · 215 community contributors (including co-authors)
+
+> The Foundation Release — Hermes Agent installs and runs anywhere now. Native Windows ships in early beta with a full PowerShell installer story, a `pip install hermes-agent` wheel lands on PyPI, lazy-deps reshape what `pip install hermes-agent` actually pulls down, the supply-chain checker scans every install/upgrade for unsafe versions, and a new OpenAI-compatible local proxy lets Codex / Aider / Cline talk to OAuth-only providers (Claude Pro, ChatGPT Pro, SuperGrok). The cold-start wave shaves ~19 seconds off `hermes` launch, browser-tool CDP calls run 180x faster, and `hermes tools` All-Platforms drops from 14s to under 1.5s. Two new messaging platforms (LINE and SimpleX Chat) and a Microsoft Graph foundation (Teams pipeline + webhook adapter) land alongside `/handoff` that finally transfers sessions live, `vision_analyze` passing pixels through to vision-capable models, `x_search` as a first-class tool, LSP semantic diagnostics on every `write_file` / `patch`, a unified pluggable `video_generate`, a `computer_use` cua-driver backend, cross-session 1-hour Claude prompt caching, a per-turn file-mutation verifier, plus 9 new optional skills. 12 P0 and 50 P1 closures.
+
+---
+
+## ✨ Highlights
+
+- **Native Windows support (early beta)** — full PowerShell installer, native subprocess/PTY paths, taskkill-based process management, MinGit auto-install, Microsoft Store python stub detection, foreground Ctrl+C preservation, taskkill+ps2 fallback, npm prefix handling, and ~40 follow-up Windows-only fixes across CLI / gateway / TUI / curator / tools. Hermes finally runs natively on `cmd.exe` and PowerShell, no WSL required. ([#21561](https://github.com/NousResearch/hermes-agent/pull/21561), [#22130](https://github.com/NousResearch/hermes-agent/pull/22130), [#22752](https://github.com/NousResearch/hermes-agent/pull/22752), [#26618](https://github.com/NousResearch/hermes-agent/pull/26618), and many more)
+
+- **`pip install hermes-agent && hermes`** — Hermes Agent is now a real PyPI package. One command, no clone, no git, no shell installer. Wheel includes the Ink TUI bundle and shell launcher. (salvage of [#26350](https://github.com/NousResearch/hermes-agent/pull/26350)) ([#26593](https://github.com/NousResearch/hermes-agent/pull/26593))
+
+- **Cold-start performance wave — ~19s off `hermes` launch** — skills cache, lazy Feishu import, no Nous HTTP at startup, plus PEP-562 lazy adapter imports (QQ, Yuanbao, Teams, Google Chat), deferred `fal_client` / `google-cloud` / `httpx` loads, models.dev disk-cache-first lookup, parallel doctor API checks, eager-skip plugin discovery on built-in subcommands, `hermes tools` All-Platforms drops from 14s to <1.5s, welcome banner skipped on `chat -q`. ([#22138](https://github.com/NousResearch/hermes-agent/pull/22138), [#22120](https://github.com/NousResearch/hermes-agent/pull/22120), [#22681](https://github.com/NousResearch/hermes-agent/pull/22681), [#22790](https://github.com/NousResearch/hermes-agent/pull/22790), [#22808](https://github.com/NousResearch/hermes-agent/pull/22808), [#22831](https://github.com/NousResearch/hermes-agent/pull/22831), [#22859](https://github.com/NousResearch/hermes-agent/pull/22859), [#22904](https://github.com/NousResearch/hermes-agent/pull/22904), [#22766](https://github.com/NousResearch/hermes-agent/pull/22766), [#25341](https://github.com/NousResearch/hermes-agent/pull/25341))
+
+- **180x faster `browser_console` evaluations** — routed through the supervisor's persistent CDP WebSocket instead of spawning a fresh DevTools session per call. Real-world page interactions feel instant. ([#23226](https://github.com/NousResearch/hermes-agent/pull/23226))
+
+- **Supply-chain advisory checker + lazy-deps framework + tiered install fallback** — every `pip install` / `hermes update` scans dependencies against an advisory list, lazy-deps replace heavy import-time loads with first-use installs, and the installer falls back through extras tiers when a wheel is rejected on the target platform. ([#24220](https://github.com/NousResearch/hermes-agent/pull/24220))
+
+- **OpenAI-compatible local proxy** — `hermes proxy` exposes any OAuth-authed provider (Claude Pro, ChatGPT Pro, SuperGrok) as an OpenAI-compatible endpoint that Codex / Aider / Cline / VS Code Continue can hit. Your subscription, your tools. ([#25969](https://github.com/NousResearch/hermes-agent/pull/25969))
+
+- **Cross-session 1-hour Claude prompt cache** — Anthropic / OpenRouter / Nous Portal now share a 1h prefix cache across sessions for Claude models. Fast resume, fast `/new`, lower cost on repeat work. ([#23828](https://github.com/NousResearch/hermes-agent/pull/23828))
+
+- **Two new messaging platforms — LINE + SimpleX Chat** — LINE Messaging API lands as a first-class platform, SimpleX Chat salvages #2558 onto the modern adapter spec. Hermes is now on 22 platforms. ([#23197](https://github.com/NousResearch/hermes-agent/pull/23197), [#26232](https://github.com/NousResearch/hermes-agent/pull/26232))
+
+- **Microsoft Graph foundation — Teams pipeline + webhook adapter** — `msgraph` auth/client foundation, webhook listener platform, Teams pipeline plugin runtime, and Teams outbound delivery via the existing adapter — Hermes can now read and post to Teams. (salvages of #21408–#21411) ([#21922](https://github.com/NousResearch/hermes-agent/pull/21922), [#21969](https://github.com/NousResearch/hermes-agent/pull/21969), [#22007](https://github.com/NousResearch/hermes-agent/pull/22007), [#22024](https://github.com/NousResearch/hermes-agent/pull/22024))
+
+- **`/handoff` actually transfers the session live** — the agent's active session moves to a different model / persona / profile mid-conversation, with messages, tool history, and context preserved. ([#23395](https://github.com/NousResearch/hermes-agent/pull/23395))
+
+- **`x_search` — first-class X (Twitter) search tool** — gated tool with OAuth-or-API-key auth, no skill needed to query the timeline. ([#26763](https://github.com/NousResearch/hermes-agent/pull/26763))
+
+- **`vision_analyze` returns pixels to vision-capable models** — when the active model can see, `vision_analyze` now hands the image straight through instead of falling back to a text description. ([#22955](https://github.com/NousResearch/hermes-agent/pull/22955))
+
+- **LSP semantic diagnostics on every write** — `write_file` and `patch` now run real language-server diagnostics on the post-edit file (delta-only) and surface real errors before they ship downstream. ([#24168](https://github.com/NousResearch/hermes-agent/pull/24168), [#25978](https://github.com/NousResearch/hermes-agent/pull/25978))
+
+- **Per-turn file-mutation verifier footer** — after every turn that wrote files, the agent gets a verifier footer summarizing what actually changed on disk — catches silent overwrites and "wrote it but it didn't land" bugs. ([#24498](https://github.com/NousResearch/hermes-agent/pull/24498))
+
+- **Unified `video_generate` with pluggable provider backends** — single tool, any backend. Drop in a new video provider as a plugin, no core changes. ([#25126](https://github.com/NousResearch/hermes-agent/pull/25126))
+
+- **`computer_use` cua-driver backend** — proper focus-safe ops, non-Anthropic provider support, refresh on `hermes update`. Computer-use is no longer locked to a single SDK. (re-salvage of #16936) ([#21967](https://github.com/NousResearch/hermes-agent/pull/21967), [#24063](https://github.com/NousResearch/hermes-agent/pull/24063))
+
+- **xAI Grok OAuth provider — SuperGrok via subscription** — sign in with your xAI account, talk to Grok models from Hermes. ([#26534](https://github.com/NousResearch/hermes-agent/pull/26534))
+
+- **Clarify with buttons — native inline keyboards on Telegram + Discord** — the `clarify` tool renders multi-choice prompts as platform-native buttons instead of typed responses. ([#24199](https://github.com/NousResearch/hermes-agent/pull/24199), [#25485](https://github.com/NousResearch/hermes-agent/pull/25485))
+
+- **Discord channel history backfill (default on)** — Hermes reads recent channel history when joining a thread so it actually knows what's been said. ([#25984](https://github.com/NousResearch/hermes-agent/pull/25984))
+
+- **Watchers skill — RSS / HTTP JSON / GitHub polling via cron `no_agent` mode** — skill recipes that wire change-detection sources directly into cron's script-only watchdog mode. ([#21881](https://github.com/NousResearch/hermes-agent/pull/21881))
+
+- **Zed ACP Registry integration + uvx distribution** — Hermes is in the Zed registry, installable via `uvx` (no npm). Plus `hermes acp --setup-browser` bootstraps browser tools for registry installs. (salvage of [#25908](https://github.com/NousResearch/hermes-agent/pull/25908)) ([#26079](https://github.com/NousResearch/hermes-agent/pull/26079), [#26120](https://github.com/NousResearch/hermes-agent/pull/26120), [#26234](https://github.com/NousResearch/hermes-agent/pull/26234))
+
+- **OpenRouter Pareto Code router** — wires in a new OpenRouter router with a `min_coding_score` knob. Pick the cheapest model that meets your quality bar. ([#22838](https://github.com/NousResearch/hermes-agent/pull/22838))
+
+- **Optional codex app-server runtime for OpenAI/Codex models** — drives the OpenAI Codex CLI under the hood for OpenAI/Codex paths, with session reuse, wedge retirement, and OAuth refresh classification. ([#24182](https://github.com/NousResearch/hermes-agent/pull/24182), [#25769](https://github.com/NousResearch/hermes-agent/pull/25769))
+
+- **`hermes-skills/huggingface` as a trusted default tap** — community skills index from huggingface.co/skills is available by default in the Skills Hub. ([#26219](https://github.com/NousResearch/hermes-agent/pull/26219))
+
+- **9 new optional skills** — Hyperliquid (perp/spot trading via SDK + REST) (@kshitijk4poor & Hermes), Yahoo Finance market data, api-testing (REST/GraphQL debug), unified EVM multi-chain skill (folds #25291 + #2010 + base/), darwinian-evolver, osint-investigation (closes #355), pinggy-tunnel, watchers (RSS/HTTP/GitHub via cron), Notion overhaul for the Developer Platform (May 2026). ([#23582](https://github.com/NousResearch/hermes-agent/pull/23582), [#23583](https://github.com/NousResearch/hermes-agent/pull/23583), [#23590](https://github.com/NousResearch/hermes-agent/pull/23590), [#25299](https://github.com/NousResearch/hermes-agent/pull/25299), [#26760](https://github.com/NousResearch/hermes-agent/pull/26760), [#26729](https://github.com/NousResearch/hermes-agent/pull/26729), [#26765](https://github.com/NousResearch/hermes-agent/pull/26765), [#21881](https://github.com/NousResearch/hermes-agent/pull/21881), [#26612](https://github.com/NousResearch/hermes-agent/pull/26612))
+
+- **API server exposes run approval events** — long-running runs surface approval requests over the API stream, no more silent stalls. (salvage of [#20311](https://github.com/NousResearch/hermes-agent/pull/20311)) ([#21899](https://github.com/NousResearch/hermes-agent/pull/21899))
+
+- **`/subgoal` — user-added criteria appended to active `/goal`** — layer extra success criteria onto a running goal loop. The judge sees them in the prompt, no behavior change when subgoals are empty. ([#25449](https://github.com/NousResearch/hermes-agent/pull/25449))
+
+- **Plugins can run any LLM call via `ctx.llm`** — plugins get a first-class hook to make their own LLM requests through the active provider/credentials, no manual wiring. Plus `tool_override` flag for replacing built-in tools. ([#23194](https://github.com/NousResearch/hermes-agent/pull/23194), [#26759](https://github.com/NousResearch/hermes-agent/pull/26759))
+
+- **Brave Search (free tier) + DuckDuckGo (DDGS) as web-search providers** — two new free search backends alongside Tavily / SearXNG / Exa. ([#21337](https://github.com/NousResearch/hermes-agent/pull/21337))
+
+- **Sudo brute-force block + sudo-stdin/askpass DANGEROUS classification** — closes the `sudo -S` brute-force avenue; approval gates classify stdin-fed and askpass-stripped sudo invocations as dangerous. (salvages of #22194 + #21128) ([#23736](https://github.com/NousResearch/hermes-agent/pull/23736))
+
+- **Provider rename — Alibaba Cloud → Qwen Cloud, picker reorder** — matches what the world calls it. Existing config keys still work. ([#24835](https://github.com/NousResearch/hermes-agent/pull/24835))
+
+
+---
+
+## 🪟 Windows — Native Support (Early Beta)
+
+### Bootstrap & installer
+- **Native Windows support (early beta)** — first-class native Windows path across CLI / gateway / TUI / tools ([#21561](https://github.com/NousResearch/hermes-agent/pull/21561))
+- **PyPI wheel packaging — `pip install hermes-agent && hermes`** (salvage of #26350) ([#26593](https://github.com/NousResearch/hermes-agent/pull/26593))
+- **Recognise Shift+Enter as a newline key** + Windows docs (salvage #21545) ([#22130](https://github.com/NousResearch/hermes-agent/pull/22130))
+- **Preserve Ctrl+C for Windows foreground runs** (@helix4u) ([#22752](https://github.com/NousResearch/hermes-agent/pull/22752))
+- **Stop spamming cwd-missing + tirith-spawn warnings on every terminal call** ([#26618](https://github.com/NousResearch/hermes-agent/pull/26618))
+- **Use `--extra all` not `--all-extras`; drop lazy-covered extras from `[all]`** ([#24515](https://github.com/NousResearch/hermes-agent/pull/24515))
+
+### Windows-specific fixes (40+ across cli / tools / gateway / curator / TUI)
+A long tail of native-Windows fixes shipped alongside the beta — taskkill-based subprocess management, MinGit auto-install, Microsoft Store python stub detection, npm prefix handling, native PTY paths, signal handling differences, foreground process management, ANSI sequence handling, path normalization, file-locking semantics, and many more. Full list in commit log under `fix(windows)` / `feat(windows)` / `windows`.
+
+---
+
+## 🚀 Performance Wave
+
+### Cold start
+- **Cut ~19s from `hermes` cold start** — skills cache + lazy Feishu + no Nous HTTP at startup ([#22138](https://github.com/NousResearch/hermes-agent/pull/22138))
+- **Skip eager plugin discovery on known built-in subcommands** ([#22120](https://github.com/NousResearch/hermes-agent/pull/22120))
+- **Cache Nous auth + .env loads** — `hermes tools` All-Platforms drops from 14s to <1.5s ([#25341](https://github.com/NousResearch/hermes-agent/pull/25341))
+- **Skip welcome banner on `chat -q` single-query mode** ([#22904](https://github.com/NousResearch/hermes-agent/pull/22904))
+- **Defer heavy google-cloud imports in google_chat to first adapter use** ([#22681](https://github.com/NousResearch/hermes-agent/pull/22681))
+- **Defer QQAdapter and YuanbaoAdapter imports via PEP 562** ([#22790](https://github.com/NousResearch/hermes-agent/pull/22790))
+- **Defer httpx import in teams to first webhook call** ([#22831](https://github.com/NousResearch/hermes-agent/pull/22831))
+- **Defer fal_client import to first generation request** ([#22859](https://github.com/NousResearch/hermes-agent/pull/22859))
+- **models.dev cache-first lookup, skip network when disk cache is fresh** ([#22808](https://github.com/NousResearch/hermes-agent/pull/22808))
+- **Parallelize API connectivity checks in `hermes doctor` and disable IMDS** ([#22766](https://github.com/NousResearch/hermes-agent/pull/22766))
+
+### Runtime
+- **180x faster `browser_console` evaluations** — route through supervisor's persistent CDP WebSocket ([#23226](https://github.com/NousResearch/hermes-agent/pull/23226))
+- **Tune Telegram cadence + adaptive fast-path for short replies** (salvage of #10388) ([#23587](https://github.com/NousResearch/hermes-agent/pull/23587))
+- **Accumulate length-continuation prefix via list+join** ([#26237](https://github.com/NousResearch/hermes-agent/pull/26237))
+
+### Prompt caching
+- **Cross-session 1h prefix cache for Claude on Anthropic / OpenRouter / Nous Portal** ([#23828](https://github.com/NousResearch/hermes-agent/pull/23828))
+- **Hit prefix cache in background review fork** (salvage #17276 + #25427) ([#25434](https://github.com/NousResearch/hermes-agent/pull/25434))
+
+---
+
+## 📦 Installation & Distribution
+
+### PyPI + supply-chain
+- **PyPI wheel packaging — `pip install hermes-agent && hermes`** (salvage of #26350) ([#26593](https://github.com/NousResearch/hermes-agent/pull/26593))
+- **Supply-chain advisory checker + lazy-install framework + tiered install fallback** ([#24220](https://github.com/NousResearch/hermes-agent/pull/24220))
+- **Use `--extra all` not `--all-extras`; drop lazy-covered extras from `[all]`** ([#24515](https://github.com/NousResearch/hermes-agent/pull/24515))
+- **Skip browser download when system chromium exists** (@helix4u) ([#25317](https://github.com/NousResearch/hermes-agent/pull/25317))
+
+### Nix
+- **`extraDependencyGroups` for sealed venv extras** (@alt-glitch) ([#21817](https://github.com/NousResearch/hermes-agent/pull/21817))
+- **Refresh npm lockfile hashes** — keeps Nix flake builds reproducible
+
+### Docker
+- **Bootstrap auth.json from env on first boot** ([#21880](https://github.com/NousResearch/hermes-agent/pull/21880))
+- **Drop manual @hermes/ink build, rely on esbuild bundle** — slimmer image
+
+### ACP / Zed
+- **Zed ACP Registry integration** (salvage of #25908) ([#26079](https://github.com/NousResearch/hermes-agent/pull/26079))
+- **Switch to uvx distribution, drop npm launcher** ([#26120](https://github.com/NousResearch/hermes-agent/pull/26120))
+- **`hermes acp --setup-browser` bootstraps browser tools for registry installs** ([#26234](https://github.com/NousResearch/hermes-agent/pull/26234))
+
+---
+
+## 🏗️ Core Agent & Architecture
+
+### Sessions & handoff
+- **`/handoff` actually transfers the session live** ([#23395](https://github.com/NousResearch/hermes-agent/pull/23395))
+- **Expose `HERMES_SESSION_ID` env var to agent tools** (@alt-glitch) ([#23847](https://github.com/NousResearch/hermes-agent/pull/23847))
+
+### Goals (Ralph loop)
+- **`/subgoal` — user-added criteria appended to active `/goal`** ([#25449](https://github.com/NousResearch/hermes-agent/pull/25449))
+- **`/goal` checklist + `/subgoal` user controls** ([#23456](https://github.com/NousResearch/hermes-agent/pull/23456)) — rolled back within this release window ([#23813](https://github.com/NousResearch/hermes-agent/pull/23813)); `/subgoal` returned in a simpler form via [#25449](https://github.com/NousResearch/hermes-agent/pull/25449)
+
+### Compression
+- **Make `protect_first_n` configurable** ([#25447](https://github.com/NousResearch/hermes-agent/pull/25447))
+
+### Verification
+- **Per-turn file-mutation verifier footer** ([#24498](https://github.com/NousResearch/hermes-agent/pull/24498))
+
+### Stream retry
+- **Log inner cause, upstream headers, bytes/elapsed on every drop** ([#23005](https://github.com/NousResearch/hermes-agent/pull/23005))
+
+---
+
+## 🤖 Models & Providers
+
+### New providers
+- **xAI Grok OAuth (SuperGrok Subscription) provider** ([#26534](https://github.com/NousResearch/hermes-agent/pull/26534))
+- **NovitaAI provider** (salvage #7219) (@kshitijk4poor) ([#25507](https://github.com/NousResearch/hermes-agent/pull/25507))
+- **NVIDIA NIM billing origin header** (salvage #25211) ([#26585](https://github.com/NousResearch/hermes-agent/pull/26585))
+
+### Provider work
+- **OpenRouter Pareto Code router with `min_coding_score` knob** ([#22838](https://github.com/NousResearch/hermes-agent/pull/22838))
+- **Optional codex app-server runtime for OpenAI/Codex models** ([#24182](https://github.com/NousResearch/hermes-agent/pull/24182))
+- **Codex-runtime: retire wedged sessions + post-tool watchdog + OAuth refresh classify** ([#25769](https://github.com/NousResearch/hermes-agent/pull/25769))
+- **Codex-runtime: skip unavailable plugins during migration** ([#25437](https://github.com/NousResearch/hermes-agent/pull/25437))
+- **Codex-runtime: de-dup `[plugins.X]` tables and stop leaking HERMES_HOME into config.toml** (#26250) (@kshitijk4poor) ([#26260](https://github.com/NousResearch/hermes-agent/pull/26260))
+- **Pass `reasoning.effort` to xAI Responses API** ([#22807](https://github.com/NousResearch/hermes-agent/pull/22807))
+- **Custom provider: prompt and persist explicit `api_mode`** ([#25068](https://github.com/NousResearch/hermes-agent/pull/25068))
+- **Rename Alibaba Cloud → Qwen Cloud, reorder picker** ([#24835](https://github.com/NousResearch/hermes-agent/pull/24835))
+- **Restore gpt-5.3-codex-spark for ChatGPT Pro** (salvage #18286 + #19530, fixes #16172) (@kshitijk4poor) ([#22991](https://github.com/NousResearch/hermes-agent/pull/22991))
+- **Inject tool-use enforcement for GLM models** ([#24715](https://github.com/NousResearch/hermes-agent/pull/24715))
+- **Use Nous Portal as model metadata authority** (@rob-maron) ([#24502](https://github.com/NousResearch/hermes-agent/pull/24502))
+- **Unified `client=hermes-client-v<version>` tag on every Portal request** ([#24779](https://github.com/NousResearch/hermes-agent/pull/24779))
+- **Prevent stale Ollama credentials after provider switch** (@kshitijk4poor) ([#21703](https://github.com/NousResearch/hermes-agent/pull/21703))
+- **Auxiliary client: rotate pooled auth after quota failures** (salvage #22779) ([#22792](https://github.com/NousResearch/hermes-agent/pull/22792))
+- **Auxiliary client: skip providers without credentials immediately** (#25395) ([#25487](https://github.com/NousResearch/hermes-agent/pull/25487))
+- **Auth: send Nous refresh token via header** (@shannonsands) ([#21578](https://github.com/NousResearch/hermes-agent/pull/21578))
+- **MiniMax: harden OAuth dashboard and runtime** ([#24165](https://github.com/NousResearch/hermes-agent/pull/24165))
+
+### OpenAI-compatible proxy
+- **Local OpenAI-compatible proxy for OAuth providers** — Codex / Aider / Cline can hit Claude Pro, ChatGPT Pro, SuperGrok ([#25969](https://github.com/NousResearch/hermes-agent/pull/25969))
+
+---
+
+## 📱 Messaging Platforms (Gateway)
+
+### New platforms
+- **LINE Messaging API platform plugin** ([#23197](https://github.com/NousResearch/hermes-agent/pull/23197))
+- **SimpleX Chat platform plugin** (salvages #2558) ([#26232](https://github.com/NousResearch/hermes-agent/pull/26232))
+
+### Microsoft Graph foundation
+- **msgraph: add auth and client foundation** (salvage of #21408) ([#21922](https://github.com/NousResearch/hermes-agent/pull/21922))
+- **msgraph: add webhook listener platform** (salvage of #21409) ([#21969](https://github.com/NousResearch/hermes-agent/pull/21969))
+- **teams-pipeline: add plugin runtime and operator cli** (salvage of #21410) ([#22007](https://github.com/NousResearch/hermes-agent/pull/22007))
+- **teams: add pipeline outbound delivery via existing adapter** (salvage of #21411) ([#22024](https://github.com/NousResearch/hermes-agent/pull/22024))
+
+### Cross-platform
+- **Per-platform admin/user split for slash commands** (salvage of #4443) ([#23373](https://github.com/NousResearch/hermes-agent/pull/23373))
+- **Forensics on signal handling — non-blocking diag, per-phase timing, stale-unit warning** ([#23285](https://github.com/NousResearch/hermes-agent/pull/23285))
+- **Keep gateway running when platforms fail; add per-platform circuit breaker + `/platform`** ([#26600](https://github.com/NousResearch/hermes-agent/pull/26600))
+- **Wire `clarify` tool with inline keyboard buttons on Telegram** ([#24199](https://github.com/NousResearch/hermes-agent/pull/24199))
+- **Add `chat_id` to `hook_ctx` for message source tracking** ([#24710](https://github.com/NousResearch/hermes-agent/pull/24710))
+
+### Telegram
+- **Native draft streaming via `sendMessageDraft` (Bot API 9.5+)** (salvage of #3412) ([#23512](https://github.com/NousResearch/hermes-agent/pull/23512))
+- **Stream Telegram edits safely** — salvage of #22264 (@kshitijk4poor) ([#22518](https://github.com/NousResearch/hermes-agent/pull/22518))
+- **Telegram notification mode** (salvage #22772) ([#22793](https://github.com/NousResearch/hermes-agent/pull/22793))
+- **Telegram guest mention mode** (@kshitijk4poor) ([#22759](https://github.com/NousResearch/hermes-agent/pull/22759))
+- **Split-and-deliver oversized edits instead of silent truncation** (salvage of #19537) ([#23576](https://github.com/NousResearch/hermes-agent/pull/23576))
+- **Preserve DM topic routing via reply fallback** (salvage #22053) (@kshitijk4poor) ([#22410](https://github.com/NousResearch/hermes-agent/pull/22410))
+- **Pass `source.thread_id` explicitly on auto-reset notice** (carve-out of #7404) ([#23440](https://github.com/NousResearch/hermes-agent/pull/23440))
+
+### Discord
+- **Render clarify choices as buttons** ([#25485](https://github.com/NousResearch/hermes-agent/pull/25485))
+- **Channel history backfill — default on, broadened scope** ([#25984](https://github.com/NousResearch/hermes-agent/pull/25984))
+- **`thread_require_mention` for multi-bot threads** (salvage #25313) ([#25445](https://github.com/NousResearch/hermes-agent/pull/25445))
+
+### Slack
+- **Support `!cmd` as alternate prefix for slash commands in threads** ([#25355](https://github.com/NousResearch/hermes-agent/pull/25355))
+
+### WhatsApp
+- **Surface quoted reply metadata from Baileys** (#25398) ([#25489](https://github.com/NousResearch/hermes-agent/pull/25489))
+
+### Feishu / Google Chat / others
+- **Feishu: native update prompt cards** (@kshitijk4poor) ([#22448](https://github.com/NousResearch/hermes-agent/pull/22448))
+- **Google Chat: repair setup prompt imports** (@helix4u) ([#22038](https://github.com/NousResearch/hermes-agent/pull/22038))
+- **Google Chat: honor relay-declared sender_type** (salvage of #22107) (@kshitijk4poor) ([#22432](https://github.com/NousResearch/hermes-agent/pull/22432))
+- **LINE: use `build_source` instead of nonexistent `create_source`** ([#24717](https://github.com/NousResearch/hermes-agent/pull/24717))
+- **Add `weixin, and more` to gateway docs** (salvage of #21063 by @wuwuzhijing)
+
+---
+
+## 🖥️ CLI & TUI
+
+### CLI
+- **Show YOLO mode warning in banner and status bar** ([#26238](https://github.com/NousResearch/hermes-agent/pull/26238))
+- **Confirm prompt for destructive slash commands** (#4069) ([#22687](https://github.com/NousResearch/hermes-agent/pull/22687))
+- **`docker_extra_args` + `display.timestamps`** ([#23599](https://github.com/NousResearch/hermes-agent/pull/23599))
+- **Delegate tool: show user's actual concurrency / spawn-depth limits in description** ([#22694](https://github.com/NousResearch/hermes-agent/pull/22694))
+
+### TUI
+- **`/sessions` slash command for browsing and resuming previous sessions** (@austinpickett) ([#20805](https://github.com/NousResearch/hermes-agent/pull/20805))
+- **Segment turns with rule above non-first user msgs; trim ticker dead space** (@OutThisLife) ([#21846](https://github.com/NousResearch/hermes-agent/pull/21846))
+- **Support attaching to an existing gateway** (@OutThisLife) ([#21978](https://github.com/NousResearch/hermes-agent/pull/21978))
+- **Resolve markdown links to readable page titles** (@OutThisLife) ([#24013](https://github.com/NousResearch/hermes-agent/pull/24013))
+- **Width-aware markdown table rendering with vertical fallback** (@alt-glitch) ([#26195](https://github.com/NousResearch/hermes-agent/pull/26195))
+- **Keep Ink displayCursor in sync with fast-echo writes so cursor stops drifting** (@OutThisLife) ([#26717](https://github.com/NousResearch/hermes-agent/pull/26717))
+- **Allow transcript scroll + Esc during approval/clarify/confirm prompts** (@OutThisLife) ([#26414](https://github.com/NousResearch/hermes-agent/pull/26414))
+- **Preserve session when switching personality** (@austinpickett) ([#20942](https://github.com/NousResearch/hermes-agent/pull/20942))
+- **Skip native safety net on OSC52-capable terminals** (@benbarclay) ([#20954](https://github.com/NousResearch/hermes-agent/pull/20954))
+
+### Dashboard / GUI
+- **Route embedded TUI through dashboard gateway** (@OutThisLife) ([#21979](https://github.com/NousResearch/hermes-agent/pull/21979))
+- **Hide token/cost analytics behind config flag (default off)** ([#25438](https://github.com/NousResearch/hermes-agent/pull/25438))
+- **Fix Langfuse observability — trace I/O, tool outputs, placeholder credentials** (closes #22342, #22763) (@kshitijk4poor) ([#26320](https://github.com/NousResearch/hermes-agent/pull/26320))
+- **Fix MiniMax 'Login' button launching Claude OAuth** (salvage #22849) ([#24058](https://github.com/NousResearch/hermes-agent/pull/24058))
+- **Update cron modals** (@austinpickett) ([#25985](https://github.com/NousResearch/hermes-agent/pull/25985))
+- **Analytics: prevent silent token loss and add Claude 4.5–4.7 pricing** (@austinpickett) ([#21455](https://github.com/NousResearch/hermes-agent/pull/21455))
+
+---
+
+## 🔧 Tools & Capabilities
+
+### Vision & video
+- **`vision_analyze` returns pixels to vision-capable models** ([#22955](https://github.com/NousResearch/hermes-agent/pull/22955))
+- **Unified `video_generate` with pluggable provider backends** ([#25126](https://github.com/NousResearch/hermes-agent/pull/25126))
+- **`image_gen`: actionable setup message when no FAL backend is reachable** ([#26222](https://github.com/NousResearch/hermes-agent/pull/26222))
+
+### Computer use
+- **`computer_use` cua-driver backend + focus-safe ops + non-Anthropic provider fix** (re-salvage #16936) ([#21967](https://github.com/NousResearch/hermes-agent/pull/21967))
+- **Refresh cua-driver on `hermes update` + add `install --upgrade`** ([#24063](https://github.com/NousResearch/hermes-agent/pull/24063))
+
+### LSP & write-time diagnostics
+- **Semantic diagnostics from real language servers in `write_file`/`patch`** ([#24168](https://github.com/NousResearch/hermes-agent/pull/24168))
+- **Shift baseline diagnostics into post-edit coordinates** ([#25978](https://github.com/NousResearch/hermes-agent/pull/25978))
+
+### Search & web
+- **Brave Search (free tier) and DDGS search providers** ([#21337](https://github.com/NousResearch/hermes-agent/pull/21337))
+- **Bearer auth header for Tavily `/crawl` endpoint** ([#24658](https://github.com/NousResearch/hermes-agent/pull/24658))
+
+### X (Twitter)
+- **Gated `x_search` tool with OAuth-or-API-key auth** ([#26763](https://github.com/NousResearch/hermes-agent/pull/26763))
+
+### Browser
+- **Route `browser_console` eval through supervisor's persistent CDP WS (180x faster)** ([#23226](https://github.com/NousResearch/hermes-agent/pull/23226))
+- **Support externally managed Camofox sessions** ([#24499](https://github.com/NousResearch/hermes-agent/pull/24499))
+
+### MCP
+- **`supports_parallel_tool_calls` for MCP servers** (salvage of #9944) ([#26825](https://github.com/NousResearch/hermes-agent/pull/26825))
+- **Codex preset for Codex CLI MCP server** (salvage #22663) ([#22679](https://github.com/NousResearch/hermes-agent/pull/22679))
+- **Stop retrying initial MCP auth failures** (#25624) ([#25776](https://github.com/NousResearch/hermes-agent/pull/25776))
+
+### Google Workspace
+- **Drive write ops + Docs/Sheets create/append** ([#21895](https://github.com/NousResearch/hermes-agent/pull/21895))
+
+### Per-turn verifier
+- **Per-turn file-mutation verifier footer** ([#24498](https://github.com/NousResearch/hermes-agent/pull/24498))
+
+---
+
+## 🧩 Kanban (Multi-Agent)
+
+- **`specify` — auxiliary LLM fleshes out triage tasks** ([#21435](https://github.com/NousResearch/hermes-agent/pull/21435))
+- **Orchestrator board tools — `kanban_list` + `kanban_unblock`** (carve-out of #20568) ([#23012](https://github.com/NousResearch/hermes-agent/pull/23012))
+- **`stranded_in_ready` diagnostic for unclaimed tasks** ([#23578](https://github.com/NousResearch/hermes-agent/pull/23578))
+- **Dashboard batch QOL upgrade** (salvage of #23240) ([#23550](https://github.com/NousResearch/hermes-agent/pull/23550))
+- **Tooltips and docs link across dashboard** ([#21541](https://github.com/NousResearch/hermes-agent/pull/21541))
+- **Dedupe notifier delivery via atomic claim + rewind on failure** (salvage #22558) ([#23401](https://github.com/NousResearch/hermes-agent/pull/23401))
+- **Keep notifier subscriptions alive across retry cycles** (salvage #21398) ([#23423](https://github.com/NousResearch/hermes-agent/pull/23423))
+- **Drop caller-controlled author override in `kanban_comment`** (salvage of #22109) (@kshitijk4poor) ([#22435](https://github.com/NousResearch/hermes-agent/pull/22435))
+- **Sanitize comment author rendering in `build_worker_context`** ([#22769](https://github.com/NousResearch/hermes-agent/pull/22769))
+
+---
+
+## 🧠 Plugins & Extensions
+
+### Plugin surface
+- **Run any LLM call from inside a plugin via `ctx.llm`** ([#23194](https://github.com/NousResearch/hermes-agent/pull/23194))
+- **`tool_override` flag for replacing built-in tools** (closes #11049) ([#26759](https://github.com/NousResearch/hermes-agent/pull/26759))
+- **`standalone_sender_fn` for out-of-process cron delivery** (@kshitijk4poor) ([#22461](https://github.com/NousResearch/hermes-agent/pull/22461))
+- **`HERMES_PLUGINS_DEBUG=1` surfaces plugin discovery logs** ([#22684](https://github.com/NousResearch/hermes-agent/pull/22684))
+- **Hindsight-client as optional dependency** (@alt-glitch) ([#21818](https://github.com/NousResearch/hermes-agent/pull/21818))
+
+### Profile & distribution
+- **Shareable profile distributions via git** ([#20831](https://github.com/NousResearch/hermes-agent/pull/20831))
+
+---
+
+## ⏰ Cron
+
+- **Routing intent — `deliver=all` fans out to every connected channel** ([#21495](https://github.com/NousResearch/hermes-agent/pull/21495))
+- **Support name-based lookup for job operations** ([#26231](https://github.com/NousResearch/hermes-agent/pull/26231))
+- **Blank Cron dashboard tab + partial-record crashes** (salvage #21042 + #22330) (@kshitijk4poor) ([#22389](https://github.com/NousResearch/hermes-agent/pull/22389))
+- **Do not seed `HERMES_SESSION_*` contextvars from cron origin** (salvage of #22356) (@kshitijk4poor) ([#22382](https://github.com/NousResearch/hermes-agent/pull/22382))
+- **Scan assembled prompt including skill content for prompt injection** (#3968)
+
+---
+
+## 🧩 Skills Ecosystem
+
+### Skills Hub
+- **`hermes-skills/huggingface` as a trusted default tap** (closes #2549) ([#26219](https://github.com/NousResearch/hermes-agent/pull/26219))
+- **Show per-skill pages in the left sidebar** ([#26646](https://github.com/NousResearch/hermes-agent/pull/26646))
+- **Richer info panels on the Skills Hub** ([#22905](https://github.com/NousResearch/hermes-agent/pull/22905))
+- **Refuse `skill_view` name collisions instead of guessing** (closes #6136 @polkn)
+
+### Curator
+- **Show rename map in user-visible summary** ([#22910](https://github.com/NousResearch/hermes-agent/pull/22910))
+- **Hint at `hermes curator pin` in the rename block** ([#23212](https://github.com/NousResearch/hermes-agent/pull/23212))
+
+### New optional skills
+- **Hyperliquid** — perp/spot trading via SDK + REST (salvage of #1952) ([#23583](https://github.com/NousResearch/hermes-agent/pull/23583))
+- **Yahoo Finance** market data ([#23590](https://github.com/NousResearch/hermes-agent/pull/23590))
+- **api-testing** (REST/GraphQL debug, salvages #1800) ([#23582](https://github.com/NousResearch/hermes-agent/pull/23582))
+- **Unified EVM multi-chain skill** (salvages #25291 + #2010 + folds in base/) ([#25299](https://github.com/NousResearch/hermes-agent/pull/25299))
+- **darwinian-evolver** ([#26760](https://github.com/NousResearch/hermes-agent/pull/26760))
+- **osint-investigation** (closes #355) ([#26729](https://github.com/NousResearch/hermes-agent/pull/26729))
+- **pinggy-tunnel** ([#26765](https://github.com/NousResearch/hermes-agent/pull/26765))
+- **watchers** — RSS / HTTP JSON / GitHub polling via cron no-agent ([#21881](https://github.com/NousResearch/hermes-agent/pull/21881))
+- **Notion overhaul for the Developer Platform** (May 2026) ([#26612](https://github.com/NousResearch/hermes-agent/pull/26612))
+
+---
+
+## 🔒 Security & Reliability
+
+### Security hardening
+- **Sudo brute-force block + sudo-stdin/askpass DANGEROUS** (salvage of #22194 + #21128) (@kshitijk4poor) ([#23736](https://github.com/NousResearch/hermes-agent/pull/23736))
+- **Drop caller-controlled author override in `kanban_comment`** (salvage of #22109) (@kshitijk4poor) ([#22435](https://github.com/NousResearch/hermes-agent/pull/22435))
+- **Cover remaining SSRF fetch paths in skills-hub** (salvage #22804) ([#22843](https://github.com/NousResearch/hermes-agent/pull/22843))
+- **Use credential_pool for custom endpoint model listing probes** (salvage #22810) ([#22842](https://github.com/NousResearch/hermes-agent/pull/22842))
+- **Require dashboard auth for plugin API routes** (salvage #19541) ([#23220](https://github.com/NousResearch/hermes-agent/pull/23220))
+- **Sanitize env and redact output in quick commands + remove write-only `_pending_messages`** ([#23584](https://github.com/NousResearch/hermes-agent/pull/23584))
+- **Reduce unnecessary `shell=True` in subprocess calls** ([#25149](https://github.com/NousResearch/hermes-agent/pull/25149))
+- **Sanitize Google Chat sender_type from relay** (salvage of #22107) (@kshitijk4poor) ([#22432](https://github.com/NousResearch/hermes-agent/pull/22432))
+- **Supply-chain advisory checker** ([#24220](https://github.com/NousResearch/hermes-agent/pull/24220))
+- **Rewrite security policy around OS-level isolation as the boundary** (@jquesnelle) ([#20317](https://github.com/NousResearch/hermes-agent/pull/20317))
+- **Remove public security advisory page** ([#24253](https://github.com/NousResearch/hermes-agent/pull/24253))
+
+### Reliability — notable bug closures
+- **SQLite: fall back to `journal_mode=DELETE` on NFS/SMB/FUSE** (fixes `/resume` on network mounts) (@kshitijk4poor) ([#22043](https://github.com/NousResearch/hermes-agent/pull/22043))
+- **Codex-runtime: retire wedged sessions + post-tool watchdog + OAuth refresh classify** ([#25769](https://github.com/NousResearch/hermes-agent/pull/25769))
+- **Codex-runtime: de-dup `[plugins.X]` tables and stop leaking HERMES_HOME** (#26250) (@kshitijk4poor) ([#26260](https://github.com/NousResearch/hermes-agent/pull/26260))
+- **Daytona: migrate legacy-sandbox lookup to cursor-based `list()`** ([#24587](https://github.com/NousResearch/hermes-agent/pull/24587))
+- **MCP: stop retrying initial MCP auth failures** (#25624) ([#25776](https://github.com/NousResearch/hermes-agent/pull/25776))
+- **Gateway: enable text-intercept for multi-choice clarify fallback** (#25587) ([#25778](https://github.com/NousResearch/hermes-agent/pull/25778))
+- **Gateway: keep running when platforms fail; per-platform circuit breaker + `/platform`** ([#26600](https://github.com/NousResearch/hermes-agent/pull/26600))
+- **Delegate: salvage #21933 JSON-string batch + diagnostic logging** (@kshitijk4poor) ([#22436](https://github.com/NousResearch/hermes-agent/pull/22436))
+- **Profiles+banner: exclude infrastructure from `--clone-all` + fix stale update-check repo resolution** (@kshitijk4poor) ([#22475](https://github.com/NousResearch/hermes-agent/pull/22475))
+- **ACP: inline file attachment resources** (salvage #21400 + image support) ([#21407](https://github.com/NousResearch/hermes-agent/pull/21407))
+- **CI: unblock shared PR checks** (@stephenschoettler) ([#21012](https://github.com/NousResearch/hermes-agent/pull/21012), [#25957](https://github.com/NousResearch/hermes-agent/pull/25957))
+
+### Notable reverts in window
+- **`/goal` checklist + /subgoal feature stack** — rolled back ([#23813](https://github.com/NousResearch/hermes-agent/pull/23813)); `/subgoal` returned in simpler form via [#25449](https://github.com/NousResearch/hermes-agent/pull/25449)
+- **Scrollback box width clamp** (#25975) rolled back to restore full-width borders ([#26163](https://github.com/NousResearch/hermes-agent/pull/26163))
+- **`fix(cli): tolerate unreadable dirs when building systemd PATH`** rolled back
+
+---
+
+## 🌍 i18n
+
+- **Localize all gateway commands + web dashboard, add 8 new locales (16 total)** ([#22914](https://github.com/NousResearch/hermes-agent/pull/22914))
+
+---
+
+## 📚 Documentation
+
+- **Repair Voice & TTS provider table** (@nightcityblade, fixes #24101) ([#24138](https://github.com/NousResearch/hermes-agent/pull/24138))
+- **Show per-skill pages in the left sidebar** ([#26646](https://github.com/NousResearch/hermes-agent/pull/26646))
+- **Mention Weixin in gateway help and docstrings** (salvage of #21063 by @wuwuzhijing)
+- **Richer info panels on the Skills Hub** ([#22905](https://github.com/NousResearch/hermes-agent/pull/22905))
+- Many more doc updates across providers, platforms, skills, Windows install paths, and dashboard.
+
+---
+
+## 🧪 Testing & CI
+
+- **Unblock shared PR checks** (@stephenschoettler) ([#21012](https://github.com/NousResearch/hermes-agent/pull/21012))
+- **Stabilize shared test state after #21012** (@stephenschoettler) ([#25957](https://github.com/NousResearch/hermes-agent/pull/25957))
+- A long tail of test additions for platforms, providers, plugins, and edge cases — 8 explicit `test:` PRs plus ~250 fix PRs that also added regression coverage.
+
+---
+
+## 👥 Contributors
+
+### Core
+- @teknium1 — release lead, architecture, ~406 PRs merged in window
+
+### Top community contributors
+- **@kshitijk4poor** — 38 PRs · Telegram cadence/streaming/topic routing, security hardening (sudo, SSRF, kanban_comment, dashboard auth), codex-runtime hygiene, NovitaAI provider, profile/banner fixes, Feishu update cards, gateway QOL across the board
+- **@alt-glitch** — 13 PRs · Markdown-table TUI rendering, `HERMES_SESSION_ID` env var, hindsight-client optional dep, Nix `extraDependencyGroups`
+- **@OutThisLife** (Brooklyn Nicholson) — 12 PRs · TUI turn segmentation, attach-to-gateway, markdown link titles, embedded TUI via dashboard gateway, Ink cursor sync, scroll/Esc during prompts
+- **@austinpickett** — 8 PRs · `/sessions` slash command, personality switching preserves session, cron modals, dashboard analytics
+- **@helix4u** — 5 PRs · Google Chat setup, browser install skip on system chromium, Windows Ctrl+C preservation
+- **@rob-maron** — 4 PRs · Nous Portal as model metadata authority, provider polish
+- **@stephenschoettler** — 3 PRs · CI stabilization
+- **@ethernet8023** — 3 PRs · platform/gateway work
+
+### All contributors (alphabetical)
+
+@02356abc, @0xbyt4, @0xharryriddle, @1000Delta, @1RB, @29206394, @A-kamal, @aashizpoudel, @Abd0r,
+@adybag14-cyber, @AgentArcLab, @ahmedbadr3, @AhmetArif0, @alblez, @Alex-yang00, @ALIYILD, @AllynSheep,
+@alt-glitch, @am423, @amathxbt, @amethystani, @ArecaNon, @Arkmusn, @askclaw-vesper, @AsoTora, @austinpickett,
+@aydnOktay, @ayushere, @baocin, @Bartok9, @benbarclay, @BennetYrWang, @Bihruze, @binhnt92, @briandevans,
+@brooklynnicholson, @btorresgil, @buntingszn, @CalmProton, @chrisworksai, @CoinTheHat, @dandacompany, @Dangooy,
+@DanielLSM, @David-0x221Eight, @ddupont808, @dhruv-saxena, @diablozzc, @dlkakbs, @dmahan93, @dmnkhorvath,
+@domtriola, @donrhmexe, @Dusk1e, @eloklam, @emozilla, @ephron-ren, @erenkarakus, @EthanGuo-coder,
+@ethernet8023, @evgyur, @explainanalyze, @fahdad, @fr33d3m0n, @Freeman-Consulting, @freqyfreqy, @Frowtek,
+@fu576, @github-actions[bot], @gnanirahulnutakki, @GodsBoy, @guglielmofonda, @Gutslabs, @hanzckernel,
+@heathley, @hekaru-agent, @helix4u, @HenkDz, @HiddenPuppy, @hllqkb, @hrygo, @HuangYuChuh, @Hugo-SEQUIER, @HxT9,
+@iacker, @InB4DevOps, @isaachuangGMICLOUD, @iuyup, @Jaaneek, @jackey8616, @jackjin1997, @Jaggia, @jak983464779,
+@jelrod27, @jethac, @JithendraNara, @johnisag, @Julientalbot, @Jwd-gity, @kallidean, @keyuyuan, @kfa-ai,
+@kidonng, @KiraKatana, @kjames2001, @konsisumer, @Korkyzer, @kshitijk4poor, @KvnGz, @lars-hagen, @leehack,
+@leepoweii, @LeonSGP43, @li0near, @libo1106, @liquidchen, @littlewwwhite, @liuhao1024, @liyoungc, @luandiasrj,
+@luoyuctl, @luyao618, @magic524, @mbac, @McClean, @memosr, @Mibayy, @ming1523, @mizgyo, @mrshu, @ms-alan,
+@MustafaKara7, @nederev, @nicoechaniz, @nidhi-singh02, @nightcityblade, @nik1t7n, @Ninso112, @NivOO5,
+@novax635, @nv-kasikritc, @oferlaor, @oswaldb22, @outdoorsea, @oxngon, @PaTTeeL, @pearjelly, @pefontana,
+@perng, @PhilipAD, @phuongvm, @polkn, @Prasanna28Devadiga, @princepal9120, @pty819, @purzbeats, @Quarkex,
+@quocanh261997, @qWaitCrypto, @Qwinty, @rahimsais, @raymaylee, @ReqX, @rewbs, @RhombusMaximus, @rob-maron,
+@Ruzzgar, @ryptotalent, @Sanjays2402, @shannonsands, @shaun0927, @SiliconID, @silv-mt-holdings, @simpolism,
+@smwbev, @soichiyo, @sprmn24, @steezkelly, @stephenschoettler, @Sylw3ster, @szymonclawd, @teyrebaz33,
+@Tianyu199509, @Tranquil-Flow, @TreyDong, @TurgutKural, @tw2818, @tymrtn, @uzunkuyruk, @v1b3coder,
+@vanthinh6886, @VinceZcrikl, @vKongv, @vominh1919, @voteblake, @VTRiot, @wali-reheman, @wesleysimplicio,
+@wilsen0, @WorldWriter, @worlldz, @wuli666, @wuwuzhijing, @Wysie, @XiaoXiao0221, @xieNniu, @xxxigm, @yehuosi,
+@ygd58, @yifengingit, @yuga-hashimoto, @zccyman, @ZeterMordio, @Zhekinmaksim, @zhengyn0001
+
+Also: @Nagatha (Claude Opus 4.7).
+
+---
+
+**Full Changelog**: [v2026.5.7...v2026.5.16](https://github.com/NousResearch/hermes-agent/compare/v2026.5.7...v2026.5.16)
@@ -1,8 +1,11 @@
-"""ACP auth helpers — detect the currently configured Hermes provider."""
+"""ACP auth helpers — detect and advertise Hermes authentication methods."""

 from __future__ import annotations

-from typing import Optional
+from typing import Any, Optional
+
+
+TERMINAL_SETUP_AUTH_METHOD_ID = "hermes-setup"


 def detect_provider() -> Optional[str]:
@@ -22,3 +25,44 @@ def detect_provider() -> Optional[str]:
 def has_provider() -> bool:
     """Return True if Hermes can resolve any runtime provider credentials.
 
     Thin wrapper over ``detect_provider()``: any non-``None`` result counts
     as a configured provider.
     """
     return detect_provider() is not None
+
+
+def build_auth_methods() -> list[Any]:
+    """Return registry-compatible ACP auth methods for Hermes.
+
+    The official ACP registry validates that agents advertise at least one
+    usable auth method during the initial handshake. A fresh Zed install may
+    not have Hermes provider credentials configured yet, so Hermes always
+    advertises a terminal setup method. When credentials are already present,
+    it also advertises the resolved provider as the default agent-managed
+    runtime credential method.
+    """
+    from acp.schema import AuthMethodAgent, TerminalAuthMethod
+
+    methods: list[Any] = []
+    provider = detect_provider()
+    if provider:
+        methods.append(
+            AuthMethodAgent(
+                id=provider,
+                name=f"{provider} runtime credentials",
+                description=(
+                    "Authenticate Hermes using the currently configured "
+                    f"{provider} runtime credentials."
+                ),
+            )
+        )
+
+    methods.append(
+        TerminalAuthMethod(
+            id=TERMINAL_SETUP_AUTH_METHOD_ID,
+            name="Configure Hermes provider",
+            description=(
+                "Open Hermes' interactive model/provider setup in a terminal. "
+                "Use this when Hermes has not been configured on this machine yet."
+            ),
+            type="terminal",
+            args=["--setup"],
+        )
+    )
+    return methods
@@ -0,0 +1,288 @@
+# bootstrap_browser_tools.ps1 — install agent-browser + Playwright Chromium
+# into ~/.hermes/node/ for use by Hermes Agent's browser tools on Windows.
+#
+# Targets the registry-install path: users who got Hermes via
+# `uvx --from 'hermes-agent[acp]==X' hermes-acp` don't have a repo clone,
+# so the install.ps1 `npm install`-in-repo flow doesn't apply. This script
+# is a self-contained, idempotent slice of install.ps1's browser block.
+#
+# Usage:
+#   .\bootstrap_browser_tools.ps1                # use defaults
+#   .\bootstrap_browser_tools.ps1 -Yes           # accept Chromium download
+#   .\bootstrap_browser_tools.ps1 -SkipChromium  # Node + agent-browser only
+#
+# Idempotent: re-running this is safe and fast.
+
+[CmdletBinding()]
+param(
+    # Accept the Playwright Chromium download without prompting.
+    [switch]$Yes,
+    # Install Node + agent-browser only; skip the Chromium step entirely.
+    [switch]$SkipChromium
+)
+
+# Treat cmdlet errors as terminating so a failed step aborts the script.
+$ErrorActionPreference = "Stop"
+# Node.js major release line downloaded by Ensure-Node when no usable Node exists.
+$NodeVersion = "22"
+
+# ─────────────────────────────────────────────────────────────────────────
+# Logging
+# ─────────────────────────────────────────────────────────────────────────
+
+# Console log helpers: a bracketed severity tag followed by the message,
+# coloured per level.
+function Write-Info {
+    param([string]$msg)
+    Write-Host "[*] $msg" -ForegroundColor Cyan
+}
+
+function Write-Success {
+    param([string]$msg)
+    Write-Host "[+] $msg" -ForegroundColor Green
+}
+
+function Write-Warn {
+    param([string]$msg)
+    Write-Host "[!] $msg" -ForegroundColor Yellow
+}
+
+function Write-Err {
+    param([string]$msg)
+    Write-Host "[x] $msg" -ForegroundColor Red
+}
+
+# ─────────────────────────────────────────────────────────────────────────
+# Paths
+# ─────────────────────────────────────────────────────────────────────────
+
+# Honor HERMES_HOME when it is set and non-empty; otherwise fall back to
+# %USERPROFILE%\.hermes. Node and the npm shims live under <home>\node.
+$HermesHome = if ($env:HERMES_HOME) { $env:HERMES_HOME } else { Join-Path $env:USERPROFILE ".hermes" }
+$NodePrefix = Join-Path $HermesHome "node"
+
+# ─────────────────────────────────────────────────────────────────────────
+# Step 1: Node.js
+# ─────────────────────────────────────────────────────────────────────────
+
+function Resolve-NpmExe {
+    # Return the path of the npm launcher to invoke, or $null when npm is
+    # not on PATH. If PATH resolution lands on npm.ps1, the sibling npm.cmd
+    # is preferred (same gotcha as install.ps1) so the PowerShell execution
+    # policy cannot block the call.
+    $found = Get-Command npm -ErrorAction SilentlyContinue
+    if (-not $found) { return $null }
+
+    $launcher = $found.Source
+    if ($launcher -notlike "*.ps1") { return $launcher }
+
+    $cmdShim = Join-Path (Split-Path $launcher -Parent) "npm.cmd"
+    if (Test-Path $cmdShim) { return $cmdShim }
+    return $launcher
+}
+
+function Resolve-NpxExe {
+    # Return the path of the npx launcher to invoke, or $null when npx is
+    # not on PATH. A resolved npx.ps1 is swapped for the sibling npx.cmd
+    # when that file exists.
+    $found = Get-Command npx -ErrorAction SilentlyContinue
+    if (-not $found) { return $null }
+
+    $launcher = $found.Source
+    if ($launcher -notlike "*.ps1") { return $launcher }
+
+    $cmdShim = Join-Path (Split-Path $launcher -Parent) "npx.cmd"
+    if (Test-Path $cmdShim) { return $cmdShim }
+    return $launcher
+}
+
+function Ensure-Node {
+    # Make a Node.js runtime available for the rest of the script.
+    #
+    # Resolution order:
+    #   1. `node` already on PATH with major version >= 20  -> use it as is.
+    #   2. A Hermes-managed node.exe under $NodePrefix      -> prepend to PATH.
+    #   3. Otherwise download the latest $NodeVersion.x Windows zip from
+    #      nodejs.org and unpack it into $NodePrefix.
+    #
+    # Throws (after logging) when the download/extract/install step fails.
+
+    # System Node on PATH?
+    $sysNode = Get-Command node -ErrorAction SilentlyContinue
+    if ($sysNode) {
+        try {
+            $v = & $sysNode.Source --version
+            $major = [int]($v -replace '^v(\d+).*', '$1')
+            if ($major -ge 20) {
+                Write-Success "Node.js $v found on PATH"
+                return
+            }
+            Write-Warn "Node.js $v is older than v20 — installing managed Node."
+        } catch {
+            # Unparseable/failed version query: fall through to managed Node.
+            Write-Warn "Failed to query Node version: $_"
+        }
+    }
+
+    # Hermes-managed Node?
+    $managedNode = Join-Path $NodePrefix "node.exe"
+    if (Test-Path $managedNode) {
+        $v = & $managedNode --version
+        Write-Success "Node.js $v found (Hermes-managed at $NodePrefix)"
+        # Prepend to current-process PATH so subsequent npm/npx calls find it.
+        $env:PATH = "$NodePrefix;$env:PATH"
+        return
+    }
+
+    Write-Info "Installing Node.js $NodeVersion LTS into $NodePrefix ..."
+
+    $arch = if ([Environment]::Is64BitOperatingSystem) { "x64" } else { "x86" }
+    $indexUrl = "https://nodejs.org/dist/latest-v${NodeVersion}.x/"
+
+    # Declared outside the try so the finally block can clean it up on any exit path.
+    $tmpDir = $null
+
+    try {
+        $indexPage = Invoke-WebRequest -Uri $indexUrl -UseBasicParsing
+        # Named $zipMatches rather than $matches: $matches is a PowerShell
+        # automatic variable populated by the -match operator.
+        $zipMatches = [regex]::Matches($indexPage.Content, "node-v${NodeVersion}\.\d+\.\d+-win-${arch}\.zip")
+        if ($zipMatches.Count -eq 0) {
+            Write-Err "Could not locate Node.js $NodeVersion zip for win-$arch"
+            throw "no tarball"
+        }
+        $zipName = $zipMatches[0].Value
+        $zipUrl = "$indexUrl$zipName"
+
+        $tmpDir = Join-Path $env:TEMP "hermes-node-$([guid]::NewGuid().ToString('N'))"
+        New-Item -ItemType Directory -Force -Path $tmpDir | Out-Null
+        $zipPath = Join-Path $tmpDir $zipName
+
+        Write-Info "Downloading $zipName ..."
+        Invoke-WebRequest -Uri $zipUrl -OutFile $zipPath -UseBasicParsing
+
+        Expand-Archive -Path $zipPath -DestinationPath $tmpDir -Force
+        $extracted = Get-ChildItem -Path $tmpDir -Directory | Where-Object { $_.Name -like "node-v*" } | Select-Object -First 1
+
+        if (-not $extracted) { Write-Err "Node.js extraction failed"; throw "extract" }
+
+        # Replace any previous (possibly partial) managed install wholesale.
+        if (Test-Path $NodePrefix) { Remove-Item -Recurse -Force $NodePrefix }
+        New-Item -ItemType Directory -Force -Path $HermesHome | Out-Null
+        Move-Item -Path $extracted.FullName -Destination $NodePrefix
+
+        $env:PATH = "$NodePrefix;$env:PATH"
+        $v = & "$NodePrefix\node.exe" --version
+        Write-Success "Node.js $v installed to $NodePrefix"
+    } catch {
+        Write-Err "Node.js install failed: $_"
+        Write-Info "Install Node 20+ manually from https://nodejs.org/en/download/ and re-run."
+        throw
+    } finally {
+        # Remove the download/extract scratch directory on success AND failure
+        # so aborted runs do not leave zips behind in %TEMP%.
+        if ($tmpDir -and (Test-Path $tmpDir)) {
+            Remove-Item -Recurse -Force $tmpDir -ErrorAction SilentlyContinue
+        }
+    }
+}
+
+# ─────────────────────────────────────────────────────────────────────────
+# Step 2: agent-browser
+# ─────────────────────────────────────────────────────────────────────────
+
+function Ensure-AgentBrowser {
+    # Install the agent-browser and @askjo/camofox-browser npm packages into
+    # $NodePrefix unless agent-browser is already available.
+    #
+    # Throws (after logging) when npm is missing or `npm install` fails.
+    $npmExe = Resolve-NpmExe
+    if (-not $npmExe) {
+        Write-Err "npm not on PATH after Node install — aborting"
+        throw "npm missing"
+    }
+
+    # Already installed and reachable through PATH?
+    $existing = Get-Command agent-browser -ErrorAction SilentlyContinue
+    if ($existing) {
+        Write-Success "agent-browser already installed at $($existing.Source)"
+        return
+    }
+
+    # Already installed under the Hermes prefix by a previous run? When a
+    # system Node satisfied Ensure-Node, $NodePrefix is not on PATH, so the
+    # Get-Command probe above misses it and every re-run would reinstall.
+    # NOTE(review): assumes npm names the Windows shim agent-browser.cmd at
+    # the prefix root — confirm against the package's `bin` entry.
+    $managedShim = Join-Path $NodePrefix "agent-browser.cmd"
+    if (Test-Path $managedShim) {
+        # Same PATH state the fresh-install path below leaves behind.
+        $env:PATH = "$NodePrefix;$env:PATH"
+        Write-Success "agent-browser already installed at $managedShim"
+        return
+    }
+
+    # When the user has system Node (winget / installer-based), `npm install
+    # -g` writes to a directory that may require admin rights. Force the
+    # prefix to the user-writable Hermes-managed Node directory so we never
+    # need elevation and the agent can always find the result. Mirrors the
+    # bash bootstrap's `--prefix $NODE_PREFIX` strategy.
+    New-Item -ItemType Directory -Force -Path $NodePrefix | Out-Null
+
+    Write-Info "Installing agent-browser (npm, prefix=$NodePrefix)..."
+    & $npmExe install -g --prefix $NodePrefix --silent `
+        "agent-browser@^0.26.0" "@askjo/camofox-browser@^1.5.2"
+    if ($LASTEXITCODE -ne 0) {
+        Write-Err "npm install -g agent-browser failed (exit $LASTEXITCODE)"
+        throw "npm install"
+    }
+
+    # Windows npm global installs drop shims at $NodePrefix\ root (not bin/).
+    # Prepend to PATH so any subsequent npx call resolves them.
+    $env:PATH = "$NodePrefix;$env:PATH"
+
+    Write-Success "agent-browser installed to $NodePrefix"
+}
+
+# ─────────────────────────────────────────────────────────────────────────
+# Step 3: Playwright Chromium
+# ─────────────────────────────────────────────────────────────────────────
+
+function Find-SystemBrowser {
+    # Return the first existing browser executable from a fixed list of
+    # well-known install locations, or $null when none exists.
+    # Order matters: Chrome/Chromium locations are probed before Edge.
+    $chromePaths = @(
+        "C:\Program Files\Google\Chrome\Application\chrome.exe",
+        "C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
+        "C:\Program Files\Chromium\Application\chromium.exe",
+        "${env:LOCALAPPDATA}\Google\Chrome\Application\chrome.exe",
+        "${env:LOCALAPPDATA}\Chromium\Application\chromium.exe"
+    )
+    # Edge — Chromium-based, agent-browser can use it
+    $edgePaths = @(
+        "C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe",
+        "C:\Program Files\Microsoft\Edge\Application\msedge.exe"
+    )
+
+    foreach ($candidate in ($chromePaths + $edgePaths)) {
+        if (Test-Path $candidate) { return $candidate }
+    }
+    return $null
+}
+
+function Write-BrowserEnv {
+    # Append AGENT_BROWSER_EXECUTABLE_PATH=<BrowserPath> to $HermesHome\.env.
+    # Does nothing when the file already has a line starting with that key,
+    # so an existing setting is never overwritten or duplicated.
+    param([string]$BrowserPath)
+
+    New-Item -ItemType Directory -Force -Path $HermesHome | Out-Null
+    $dotEnv = Join-Path $HermesHome ".env"
+
+    if (Test-Path $dotEnv) {
+        $current = Get-Content $dotEnv -Raw -ErrorAction SilentlyContinue
+        $alreadySet = $current -and ($current -match "(?m)^AGENT_BROWSER_EXECUTABLE_PATH=")
+        if ($alreadySet) { return }
+    }
+
+    # Blank separator line, explanatory comment, then the setting itself.
+    $newLines = @(
+        "",
+        "# Hermes Agent browser tools — use the system Chrome/Chromium/Edge binary.",
+        "AGENT_BROWSER_EXECUTABLE_PATH=$BrowserPath"
+    )
+    foreach ($line in $newLines) {
+        Add-Content -Path $dotEnv -Value $line
+    }
+    Write-Success "Configured browser tools to use $BrowserPath"
+}
+
+function Confirm-ChromiumDownload {
+    # Decide whether the Playwright Chromium download should proceed.
+    # -Yes short-circuits to $true; a non-interactive process gets $false
+    # with a hint; otherwise the user is asked and only "y"/"yes" accepts.
+    if ($Yes) { return $true }
+
+    if ([Environment]::UserInteractive) {
+        $answer = Read-Host "Install Playwright Chromium (~400 MB download)? [y/N]"
+        return ($answer -match "^(y|yes)$")
+    }
+
+    Write-Warn "Non-interactive shell — skipping Chromium prompt."
+    Write-Info "Re-run with -Yes to install Chromium (~400 MB download)."
+    return $false
+}
+
+function Ensure-Chromium {
+    # Install Playwright's Chromium via npx, or — when the download is
+    # declined — point Hermes at an installed system browser instead.
+    # Throws (after logging) when npx is missing or the install fails.
+    if ($SkipChromium) {
+        Write-Info "Skipping Chromium install (-SkipChromium)"
+        return
+    }
+
+    # agent-browser on Windows expects a Playwright-managed Chromium under
+    # %LOCALAPPDATA%\ms-playwright. Unlike the Linux/macOS path, a stock
+    # Chrome install is not picked up by Playwright's default launch path
+    # unless AGENT_BROWSER_EXECUTABLE_PATH is set explicitly, so the system
+    # browser is only a fallback for users who decline the download.
+
+    if (Confirm-ChromiumDownload) {
+        $npxExe = Resolve-NpxExe
+        if (-not $npxExe) {
+            Write-Err "npx not on PATH — cannot install Playwright Chromium"
+            throw "npx missing"
+        }
+
+        Write-Info "Installing Playwright Chromium (~400 MB) ..."
+        & $npxExe --yes playwright install chromium
+        if ($LASTEXITCODE -ne 0) {
+            Write-Err "Playwright Chromium install failed (exit $LASTEXITCODE)"
+            Write-Info "Try again later: npx --yes playwright install chromium"
+            throw "playwright"
+        }
+        Write-Success "Playwright Chromium installed"
+        return
+    }
+
+    # Download declined: fall back to a system browser when one exists.
+    $sys = Find-SystemBrowser
+    if ($sys) {
+        Write-Info "Using system browser at $sys (Chromium download skipped)."
+        Write-BrowserEnv -BrowserPath $sys
+    } else {
+        Write-Info "Chromium install skipped. Browser tools won't launch until"
+        Write-Info "Chromium is installed or AGENT_BROWSER_EXECUTABLE_PATH is set."
+    }
+}
+
+# ─────────────────────────────────────────────────────────────────────────
+# Main
+# ─────────────────────────────────────────────────────────────────────────
+
+# Banner: show which Hermes home directory this run will modify.
+Write-Info "Hermes Agent: bootstrapping browser tools"
+Write-Info "  HERMES_HOME = $HermesHome"
+Write-Info "  OS          = Windows"
+
+# Steps run in dependency order: Node provides npm/npx, which the
+# agent-browser and Chromium steps invoke. A throw from any step stops the
+# script before the success message below.
+Ensure-Node
+Ensure-AgentBrowser
+Ensure-Chromium
+
+Write-Success "Browser tools setup complete."
+Write-Info "Hermes Agent will pick up agent-browser from $NodePrefix on next launch."
@@ -0,0 +1,399 @@
+#!/usr/bin/env bash
+#
+# bootstrap_browser_tools.sh — install agent-browser + Playwright Chromium
+# into ~/.hermes/node/ for use by Hermes Agent's browser tools.
+#
+# Targets the registry-install path: users who got Hermes via
+# `uvx --from 'hermes-agent[acp]==X' hermes-acp` don't have a repo clone,
+# so the install.sh `npm install`-in-repo flow doesn't apply. This script
+# is a self-contained, idempotent slice of install.sh's browser block —
+# safe to run from `hermes-acp --setup-browser`, from a fresh terminal,
+# or from install.sh itself (it's a no-op when everything is already in place).
+#
+# Usage:
+#   bootstrap_browser_tools.sh           # use defaults
+#   bootstrap_browser_tools.sh --yes     # accept the ~400MB Chromium download
+#   bootstrap_browser_tools.sh --skip-chromium    # only install Node + agent-browser
+#   HERMES_HOME=/custom/path bootstrap_browser_tools.sh
+#
+# Idempotent: re-running this is safe and fast. Each step checks whether
+# the work is already done.
+
+set -euo pipefail
+
+# ─────────────────────────────────────────────────────────────────────────
+# Config
+# ─────────────────────────────────────────────────────────────────────────
+
+NODE_VERSION="22"
+HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
+NODE_PREFIX="$HERMES_HOME/node"
+
+SKIP_CHROMIUM=false
+ASSUME_YES=false
+
+# ─────────────────────────────────────────────────────────────────────────
+# Logging
+# ─────────────────────────────────────────────────────────────────────────
+
+# Emit ANSI colours only when stdout is a terminal; piped or redirected
+# output gets plain text.
+if [ -t 1 ]; then
+    C_GREEN='\033[0;32m'
+    C_YELLOW='\033[0;33m'
+    C_BLUE='\033[0;34m'
+    C_RED='\033[0;31m'
+    C_RESET='\033[0m'
+else
+    C_GREEN='' ; C_YELLOW='' ; C_BLUE='' ; C_RED='' ; C_RESET=''
+fi
+
+# Log helpers. The colour variables sit in the printf FORMAT string so their
+# backslash escapes are interpreted; the message is passed as a %s argument
+# so it is printed verbatim. Warnings and errors go to stderr.
+log_info()    { printf "${C_BLUE}[*]${C_RESET} %s\n"  "$*"; }
+log_success() { printf "${C_GREEN}[✓]${C_RESET} %s\n" "$*"; }
+log_warn()    { printf "${C_YELLOW}[!]${C_RESET} %s\n" "$*" >&2; }
+log_error()   { printf "${C_RED}[✗]${C_RESET} %s\n"   "$*" >&2; }
+
+# ─────────────────────────────────────────────────────────────────────────
+# Arg parsing
+# ─────────────────────────────────────────────────────────────────────────
+
+# Consume positional arguments one at a time. Recognised flags set the
+# globals declared in the Config section; --help prints usage and exits 0;
+# anything else is rejected with exit status 2.
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --skip-chromium) SKIP_CHROMIUM=true ;;
+        --yes|-y)        ASSUME_YES=true ;;
+        -h|--help)
+            cat <<EOF
+Bootstrap Hermes Agent browser tools.
+
+Installs Node.js (into ~/.hermes/node/), the agent-browser npm package,
+and the Playwright Chromium browser engine.
+
+Options:
+  --skip-chromium   Install Node + agent-browser but skip Chromium download
+  --yes, -y         Accept the ~400 MB Chromium download without prompting
+  -h, --help        Show this help
+
+Environment:
+  HERMES_HOME       Override Hermes data dir (default: \$HOME/.hermes)
+EOF
+            exit 0
+            ;;
+        *)
+            log_error "Unknown option: $1"
+            exit 2
+            ;;
+    esac
+    # Drop the flag just handled and move on to the next argument.
+    shift
+done
+
+# ─────────────────────────────────────────────────────────────────────────
+# OS / arch detection
+# ─────────────────────────────────────────────────────────────────────────
+
+# Map the kernel name to the script's OS label; only Linux and macOS are
+# supported here (Windows users are pointed at the PowerShell script).
+OS="unknown"
+case "$(uname -s)" in
+    Linux*)  OS="linux"  ;;
+    Darwin*) OS="macos"  ;;
+    *)
+        log_error "Unsupported OS: $(uname -s)"
+        log_info "Windows users: run scripts/bootstrap_browser_tools.ps1 in PowerShell."
+        exit 1
+        ;;
+esac
+
+# Translate `uname -m` into the architecture token used in Node.js
+# release tarball names.
+NODE_ARCH=""
+case "$(uname -m)" in
+    x86_64)         NODE_ARCH="x64"    ;;
+    aarch64|arm64)  NODE_ARCH="arm64"  ;;
+    armv7l)         NODE_ARCH="armv7l" ;;
+    *)
+        log_error "Unsupported architecture: $(uname -m)"
+        exit 1
+        ;;
+esac
+
+# OS token used in Node.js release tarball names (macOS ships as "darwin").
+NODE_OS=""
+case "$OS" in
+    linux) NODE_OS="linux"  ;;
+    macos) NODE_OS="darwin" ;;
+esac
+
+# Distro id from os-release; stays empty when the file is absent or has no
+# ID key. Sourcing the file defines its other keys as shell variables in
+# this script as well.
+DISTRO=""
+if [ -f /etc/os-release ]; then
+    # shellcheck disable=SC1091
+    . /etc/os-release
+    DISTRO="${ID:-}"
+fi
+
+# ─────────────────────────────────────────────────────────────────────────
+# Step 1: Node.js
+# ─────────────────────────────────────────────────────────────────────────
+
+ensure_node() {
+    # Make a Node.js runtime available on PATH for the rest of the script.
+    #
+    # Resolution order:
+    #   1. `node` already on PATH with major version >= 20 -> use it as is.
+    #   2. Hermes-managed node under $NODE_PREFIX/bin      -> prepend to PATH.
+    #   3. Otherwise download the latest $NODE_VERSION.x tarball from
+    #      nodejs.org (.tar.xz preferred, .tar.gz fallback) into $NODE_PREFIX.
+    #
+    # Returns 0 on success, 1 (after logging) on any failure. Every fallible
+    # step is guarded explicitly so the messages below are reached even
+    # under the script's `set -euo pipefail`.
+    local found_ver major
+
+    # Already on PATH and recent enough?
+    if command -v node >/dev/null 2>&1; then
+        # `|| echo "?"` keeps a broken node binary from aborting the script;
+        # same fallback the managed-Node branch below uses.
+        found_ver=$(node --version 2>/dev/null || echo "?")
+        major=$(echo "$found_ver" | sed -E 's/^v([0-9]+).*/\1/')
+        # sed passes unmatched input through unchanged; treat anything
+        # non-numeric as "too old" instead of feeding it to `[ -ge ]`.
+        case "$major" in
+            ''|*[!0-9]*) major=0 ;;
+        esac
+        if [ "$major" -ge 20 ]; then
+            log_success "Node.js $found_ver found on PATH"
+            return 0
+        fi
+        log_warn "Node.js $found_ver is older than v20 — installing managed Node."
+    fi
+
+    if [ -x "$NODE_PREFIX/bin/node" ]; then
+        found_ver=$("$NODE_PREFIX/bin/node" --version 2>/dev/null || echo "?")
+        export PATH="$NODE_PREFIX/bin:$PATH"
+        log_success "Node.js $found_ver found (Hermes-managed at $NODE_PREFIX)"
+        return 0
+    fi
+
+    log_info "Installing Node.js $NODE_VERSION LTS into $NODE_PREFIX ..."
+
+    # Fetch the release index once and reuse it for both extension probes.
+    local index_url="https://nodejs.org/dist/latest-v${NODE_VERSION}.x/"
+    local index_html
+    if ! index_html=$(curl -fsSL "$index_url"); then
+        log_error "Could not fetch Node.js release index from $index_url"
+        log_info "Install Node 20+ manually: https://nodejs.org/en/download/"
+        return 1
+    fi
+
+    # Prefer .tar.xz, fall back to .tar.gz. `|| true` matters: with pipefail
+    # a no-match grep (or a SIGPIPE from `head`) makes the pipeline fail,
+    # which under `set -e` would kill the script before the fallback runs.
+    local tarball_name="" ext_re
+    for ext_re in 'tar\.xz' 'tar\.gz'; do
+        tarball_name=$(printf '%s\n' "$index_html" \
+            | grep -oE "node-v${NODE_VERSION}\.[0-9]+\.[0-9]+-${NODE_OS}-${NODE_ARCH}\.${ext_re}" \
+            | head -1 || true)
+        if [ -n "$tarball_name" ]; then
+            break
+        fi
+    done
+
+    if [ -z "$tarball_name" ]; then
+        log_error "Could not locate Node.js $NODE_VERSION tarball for $NODE_OS-$NODE_ARCH"
+        log_info "Install Node 20+ manually: https://nodejs.org/en/download/"
+        return 1
+    fi
+
+    local tmp_dir
+    tmp_dir=$(mktemp -d)
+    # Clean the scratch dir on every exit path of this function, then drop
+    # the trap so it cannot fire again once $tmp_dir is out of scope.
+    trap 'rm -rf "$tmp_dir"; trap - RETURN' RETURN
+
+    log_info "Downloading $tarball_name ..."
+    if ! curl -fsSL "${index_url}${tarball_name}" -o "$tmp_dir/$tarball_name"; then
+        log_error "Node.js download failed"
+        return 1
+    fi
+
+    local tar_flags="xzf"
+    if [[ "$tarball_name" == *.tar.xz ]]; then
+        tar_flags="xf"
+    fi
+    if ! tar "$tar_flags" "$tmp_dir/$tarball_name" -C "$tmp_dir"; then
+        log_error "Node.js extraction failed"
+        return 1
+    fi
+
+    # The trailing slash restricts the glob to directories, so the tarball
+    # itself (which also starts with node-v) can never be picked.
+    local extracted_dir="" candidate
+    for candidate in "$tmp_dir"/node-v*/; do
+        if [ -d "$candidate" ]; then
+            extracted_dir="${candidate%/}"
+            break
+        fi
+    done
+    if [ -z "$extracted_dir" ]; then
+        log_error "Node.js extraction failed"
+        return 1
+    fi
+
+    # Replace any previous (possibly partial) managed install wholesale.
+    mkdir -p "$HERMES_HOME"
+    rm -rf "$NODE_PREFIX"
+    mv "$extracted_dir" "$NODE_PREFIX"
+
+    export PATH="$NODE_PREFIX/bin:$PATH"
+
+    local installed_ver
+    installed_ver=$("$NODE_PREFIX/bin/node" --version 2>/dev/null || echo "?")
+    log_success "Node.js $installed_ver installed to $NODE_PREFIX"
+}
+
+# ─────────────────────────────────────────────────────────────────────────
+# Step 2: agent-browser + @askjo/camofox-browser via global npm install
+# ─────────────────────────────────────────────────────────────────────────
+
+# Install agent-browser and @askjo/camofox-browser into $NODE_PREFIX via npm,
+# unless an agent-browser executable is already available.
+# Returns 0 on success (or when already installed), 1 on failure.
+ensure_agent_browser() {
+    if ! command -v npm >/dev/null 2>&1; then
+        log_error "npm not on PATH after Node install — aborting"
+        return 1
+    fi
+
+    # Per the lookup in tools/browser_tool.py (_find_agent_browser), the agent
+    # searches ~/.hermes/node/bin and a few standard prefixes, so a copy in
+    # the managed prefix or anywhere on PATH is sufficient.
+    local already_present=false
+    if [ -x "$NODE_PREFIX/bin/agent-browser" ]; then
+        already_present=true
+    elif command -v agent-browser >/dev/null 2>&1; then
+        already_present=true
+    fi
+    if [ "$already_present" = true ]; then
+        log_success "agent-browser already installed"
+        return 0
+    fi
+
+    # A system `npm` may point at a root-owned global prefix (for example
+    # /usr/lib/node_modules), where `npm install -g` fails with EACCES unless
+    # run through sudo. Pinning --prefix to the user-writable $NODE_PREFIX
+    # avoids sudo entirely. With the Hermes-managed Node this changes nothing;
+    # with a system Node the shim still lands in $NODE_PREFIX/bin/, which is
+    # where _browser_candidate_path_dirs() looks first.
+    mkdir -p "$NODE_PREFIX"
+
+    local -a npm_args=(
+        install -g --prefix "$NODE_PREFIX" --silent
+        agent-browser@^0.26.0
+        "@askjo/camofox-browser@^1.5.2"
+    )
+
+    log_info "Installing agent-browser (npm, prefix=$NODE_PREFIX)..."
+    if ! npm "${npm_args[@]}"; then
+        log_error "npm install -g agent-browser failed"
+        return 1
+    fi
+
+    # Global installs on macOS/Linux put the shim in $NODE_PREFIX/bin/; expose
+    # it to the remaining steps (npx playwright).
+    export PATH="$NODE_PREFIX/bin:$PATH"
+
+    log_success "agent-browser installed to $NODE_PREFIX/bin/"
+}
+
+# ─────────────────────────────────────────────────────────────────────────
+# Step 3: Playwright Chromium
+# ─────────────────────────────────────────────────────────────────────────
+
+# Ask whether the Playwright Chromium download should proceed.
+# Returns 0 for "yes" (or when $ASSUME_YES is true), 1 otherwise — including
+# when stdin is not a terminal and no answer can be collected.
+confirm_chromium_download() {
+    if [ "$ASSUME_YES" = true ]; then
+        return 0
+    fi
+
+    if ! [ -t 0 ]; then
+        log_warn "Non-interactive shell — skipping Chromium prompt."
+        log_info "Re-run with --yes to install Chromium (~400 MB download)."
+        return 1
+    fi
+
+    local answer=""
+    printf "Install Playwright Chromium (~400 MB download)? [y/N] "
+    if ! read -r answer; then
+        # EOF on stdin counts as "no".
+        answer=""
+    fi
+
+    if [[ "$answer" == "y" || "$answer" == "Y" || "$answer" == "yes" || "$answer" == "YES" ]]; then
+        return 0
+    fi
+    return 1
+}
+
+# Detect a usable system Chrome/Chromium. agent-browser's Chrome engine can
+# use it instead of downloading Playwright's bundled Chromium, saving the
+# download cost. Returns the path or empty string.
+find_system_browser() {
+    # Well-known Chrome/Chromium command names; the first one that resolves
+    # is printed and wins.
+    local name resolved
+    for name in google-chrome google-chrome-stable chromium chromium-browser chrome; do
+        if resolved=$(command -v "$name" 2>/dev/null); then
+            printf '%s\n' "$resolved"
+            return 0
+        fi
+    done
+
+    # On macOS, also probe the standard app-bundle executables.
+    if [ "$OS" = "macos" ]; then
+        local -a bundles=(
+            "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
+            "/Applications/Chromium.app/Contents/MacOS/Chromium"
+        )
+        local bundle
+        for bundle in "${bundles[@]}"; do
+            if [ -x "$bundle" ]; then
+                printf '%s\n' "$bundle"
+                return 0
+            fi
+        done
+    fi
+
+    # Nothing found: print nothing and signal failure.
+    return 1
+}
+
+# Record the given browser binary ($1) as AGENT_BROWSER_EXECUTABLE_PATH in
+# $HERMES_HOME/.env. An existing AGENT_BROWSER_EXECUTABLE_PATH= line is left
+# untouched, so a value the user already set is never overridden.
+write_browser_env() {
+    local browser_path="$1"
+    local env_file="$HERMES_HOME/.env"
+
+    mkdir -p "$HERMES_HOME"
+
+    if [ -f "$env_file" ]; then
+        if grep -q "^AGENT_BROWSER_EXECUTABLE_PATH=" "$env_file"; then
+            return 0
+        fi
+    fi
+
+    # Blank separator line, explanatory comment, then the setting itself.
+    printf '\n%s\n%s\n' \
+        "# Hermes Agent browser tools — use the system Chrome/Chromium binary." \
+        "AGENT_BROWSER_EXECUTABLE_PATH=$browser_path" \
+        >> "$env_file"
+
+    log_success "Configured browser tools to use $browser_path"
+}
+
+# Make a Chromium-family browser available to agent-browser.
+#
+# Decision order:
+#   1. --skip-chromium ($SKIP_CHROMIUM=true): do nothing.
+#   2. A system Chrome/Chromium exists: record its path in $HERMES_HOME/.env
+#      and skip the download.
+#   3. Otherwise ask for confirmation (confirm_chromium_download) and install
+#      Playwright's Chromium through npx.
+#
+# Returns 0 when the step completed or was skipped, 1 when npx is missing or
+# the Playwright install failed.
+ensure_chromium() {
+    if [ "$SKIP_CHROMIUM" = true ]; then
+        log_info "Skipping Chromium install (--skip-chromium)"
+        return 0
+    fi
+
+    local system_browser
+    # `|| true` keeps a "not found" result from being treated as an error.
+    system_browser="$(find_system_browser 2>/dev/null || true)"
+    if [ -n "$system_browser" ]; then
+        log_success "Found system browser: $system_browser"
+        log_info "Skipping Playwright Chromium download; agent-browser will use it."
+        write_browser_env "$system_browser"
+        return 0
+    fi
+
+    # Declining the download is not an error: the rest of the setup is still
+    # valid, the browser just has to be supplied later.
+    if ! confirm_chromium_download; then
+        log_info "Chromium install skipped. Browser tools will only work if you"
+        log_info "set AGENT_BROWSER_EXECUTABLE_PATH or install Chromium later."
+        return 0
+    fi
+
+    if ! command -v npx >/dev/null 2>&1; then
+        log_error "npx not on PATH — cannot install Playwright Chromium"
+        return 1
+    fi
+
+    log_info "Installing Playwright Chromium (~400 MB) ..."
+
+    # On apt-based distros, --with-deps requires sudo. Try non-interactively
+    # only — never prompt — and fall back to the bare browser-only install.
+    local installed=false
+    if [ "$OS" = "linux" ]; then
+        case "$DISTRO" in
+            ubuntu|debian|raspbian|pop|linuxmint|elementary|zorin|kali|parrot)
+                # Root, or sudo that works without a password prompt (-n).
+                if [ "$(id -u)" -eq 0 ] || (command -v sudo >/dev/null 2>&1 && sudo -n true 2>/dev/null); then
+                    log_info "Installing system deps with --with-deps (sudo available)"
+                    if npx --yes playwright install --with-deps chromium; then
+                        installed=true
+                    fi
+                else
+                    log_warn "sudo not available non-interactively — installing Chromium without system deps."
+                    log_info "If browser tools fail to launch, an administrator should run:"
+                    log_info "  sudo npx playwright install-deps chromium"
+                fi
+                ;;
+            arch|manjaro|cachyos|endeavouros|garuda)
+                log_info "Arch-family system dependencies are not auto-installed."
+                log_info "If launch fails, run: sudo pacman -S nss atk at-spi2-core cups libdrm libxkbcommon mesa pango cairo alsa-lib"
+                ;;
+            fedora|rhel|centos|rocky|alma)
+                log_info "Fedora/RHEL system dependencies are not auto-installed."
+                log_info "If launch fails, run: sudo dnf install nss atk at-spi2-core cups-libs libdrm libxkbcommon mesa-libgbm pango cairo alsa-lib"
+                ;;
+            opensuse*|sles)
+                log_info "openSUSE system dependencies are not auto-installed."
+                ;;
+        esac
+    fi
+
+    # Reached when --with-deps was not attempted or did not succeed: install
+    # the browser alone, without system packages.
+    if [ "$installed" = false ]; then
+        if npx --yes playwright install chromium; then
+            installed=true
+        fi
+    fi
+
+    if [ "$installed" = true ]; then
+        log_success "Playwright Chromium installed"
+    else
+        log_error "Playwright Chromium install failed"
+        log_info "Try again later: npx --yes playwright install chromium"
+        return 1
+    fi
+}
+
+# ─────────────────────────────────────────────────────────────────────────
+# Main
+# ─────────────────────────────────────────────────────────────────────────
+
+# Entry point: print the detected environment, then run the three setup
+# steps in order (Node.js, agent-browser, Chromium).
+#
+# NOTE(review): the return codes of the ensure_* steps are not checked here.
+# Whether a failed step stops the script before the "setup complete" message
+# depends on shell options (e.g. `set -e`) set earlier in the script, which
+# are not visible in this section — confirm.
+main() {
+    log_info "Hermes Agent: bootstrapping browser tools"
+    log_info "  HERMES_HOME = $HERMES_HOME"
+    # ${DISTRO:+...} adds the "(distro)" suffix only when DISTRO is non-empty.
+    log_info "  OS / arch   = $NODE_OS-$NODE_ARCH ${DISTRO:+($DISTRO)}"
+
+    ensure_node
+    ensure_agent_browser
+    ensure_chromium
+
+    log_success "Browser tools setup complete."
+    log_info "Hermes Agent will pick up agent-browser from $NODE_PREFIX/bin/ on next launch."
+}
+
+main
@@ -24,6 +24,7 @@ except ModuleNotFoundError:
    # means UTF-8 stdio setup is skipped on Windows; POSIX is unaffected.
    pass

+import argparse
 import asyncio
 import logging
 import sys
@@ -107,8 +108,150 @@ def _load_env() -> None:
        )


-def main() -> None:
+def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    """Parse the ``hermes-acp`` command line.
+
+    Every option is a boolean switch.  ``argv`` defaults to ``None``, in which
+    case argparse reads ``sys.argv[1:]``.
+    """
+    cli = argparse.ArgumentParser(
+        prog="hermes-acp",
+        description="Run Hermes Agent as an ACP stdio server.",
+    )
+
+    # (option strings, extra add_argument keywords) for each switch.
+    switches = (
+        (("--version",), {"help": "Print Hermes version and exit"}),
+        (
+            ("--check",),
+            {"help": "Verify ACP dependencies and adapter imports, then exit"},
+        ),
+        (
+            ("--setup",),
+            {"help": "Run interactive Hermes provider/model setup for ACP terminal auth"},
+        ),
+        (
+            ("--setup-browser",),
+            {
+                "help": "Install agent-browser + Playwright Chromium into ~/.hermes/node/ "
+                        "for browser tool support. Idempotent.",
+            },
+        ),
+        (
+            ("--yes", "-y"),
+            {
+                "dest": "assume_yes",
+                "help": "Accept all prompts (currently used by --setup-browser to skip the "
+                        "~400 MB Chromium download confirmation).",
+            },
+        ),
+    )
+    for option_strings, extra in switches:
+        cli.add_argument(*option_strings, action="store_true", **extra)
+
+    return cli.parse_args(argv)
+
+
+def _print_version() -> None:
+    """Print the Hermes version string (``hermes_cli.__version__``) to stdout."""
+    # Imported lazily so the version lookup only happens when requested.
+    from hermes_cli import __version__ as hermes_version
+
+    print(hermes_version)
+
+
+def _run_check() -> None:
+    """Verify that the ACP package and the Hermes ACP adapter can be imported.
+
+    Prints a confirmation line on success.  A failed import is not caught
+    here, so it propagates to the caller as an exception.
+    """
+    # The imports themselves are the check; the names are intentionally unused.
+    import acp  # noqa: F401
+    from acp_adapter.server import HermesACPAgent  # noqa: F401
+
+    print("Hermes ACP check OK")
+
+
+def _run_setup() -> None:
+    """Run the interactive ``hermes model`` setup, then offer browser tools.
+
+    ``sys.argv`` is temporarily replaced so the Hermes CLI entry point sees
+    the ``model`` subcommand, and is restored afterwards even on error.
+    """
+    from hermes_cli.main import main as hermes_main
+
+    saved_argv = sys.argv[:]
+    try:
+        program = saved_argv[0] if saved_argv else "hermes"
+        sys.argv = [program, "model"]
+        hermes_main()
+    finally:
+        sys.argv = saved_argv
+
+    # Terminal auth is the supported first-run flow for registry installs,
+    # which makes this the natural point to offer the browser-tools install.
+    # Without a TTY the answer cannot be collected, so skip silently.
+    if sys.stdin.isatty():
+        question = (
+            "\nInstall browser tools? Downloads agent-browser (npm) and "
+            "optionally Playwright Chromium (~400 MB). [y/N] "
+        )
+        try:
+            answer = input(question).strip().lower()
+        except (EOFError, KeyboardInterrupt):
+            return
+        if answer in ("y", "yes"):
+            _run_setup_browser(assume_yes=False)
+
+
+def _run_setup_browser(assume_yes: bool = False) -> int:
+    """Bootstrap agent-browser + Playwright Chromium for the registry-install path.
+
+    Shells out to the bundled platform-specific bootstrap script
+    (acp_adapter/bootstrap/bootstrap_browser_tools.{sh,ps1}) so the install
+    logic lives in one place — readable, debuggable, and shareable with
+    install.sh / install.ps1 if we ever want to call it from there too.
+
+    Args:
+        assume_yes: Forward the "accept all prompts" switch to the script
+            (``-Yes`` for PowerShell, ``--yes`` for bash).
+
+    Returns:
+        The script's exit code (0 on success), or 1 when the script is
+        missing from the installation or the interpreter cannot be launched.
+    """
+    import platform
+    import subprocess
+
+    bootstrap_dir = Path(__file__).resolve().parent / "bootstrap"
+
+    if platform.system() == "Windows":
+        script = bootstrap_dir / "bootstrap_browser_tools.ps1"
+        cmd = [
+            "powershell.exe",
+            "-NoProfile",
+            "-ExecutionPolicy", "Bypass",
+            "-File", str(script),
+        ]
+        yes_flag = "-Yes"
+    else:
+        script = bootstrap_dir / "bootstrap_browser_tools.sh"
+        cmd = ["bash", str(script)]
+        yes_flag = "--yes"
+
+    if not script.is_file():
+        print(
+            f"Bootstrap script not found at {script} — wheel may be incomplete.",
+            file=sys.stderr,
+        )
+        return 1
+
+    if assume_yes:
+        cmd.append(yes_flag)
+
+    # stdio is inherited so the user sees the bootstrap's progress live.
+    try:
+        result = subprocess.run(cmd, check=False)
+    except OSError as exc:
+        # Covers a missing interpreter (FileNotFoundError: bash /
+        # powershell.exe not on PATH) as well as other launch failures such
+        # as PermissionError, so every launch problem maps to exit code 1
+        # instead of a traceback.
+        print(f"Could not launch browser bootstrap: {exc}", file=sys.stderr)
+        return 1
+    return result.returncode
+
+
+def main(argv: list[str] | None = None) -> None:
    """Entry point: load env, configure logging, run the ACP agent."""
+    args = _parse_args(argv)
+    if args.version:
+        _print_version()
+        return
+    if args.check:
+        _run_check()
+        return
+    if args.setup:
+        _run_setup()
+        return
+    if args.setup_browser:
+        rc = _run_setup_browser(assume_yes=args.assume_yes)
+        if rc != 0:
+            sys.exit(rc)
+        return
+
    _setup_logging()
    _load_env()

@@ -14,6 +14,7 @@ from collections import deque
 from typing import Any, Callable, Deque, Dict

 import acp
+from acp.schema import AgentPlanUpdate, PlanEntry

 from .tools import (
    build_tool_complete,
@@ -24,6 +25,65 @@ from .tools import (
 logger = logging.getLogger(__name__)


+def _json_loads_maybe_prefix(value: str) -> Any:
+    """Parse the JSON value at the start of ``value``, ignoring trailing text.
+
+    Hermes may append a human-readable hint after a tool's JSON payload.  The
+    whole string is tried first; if that fails, only the leading JSON value
+    is decoded and whatever follows it is discarded.
+
+    Raises:
+        ValueError: (``json.JSONDecodeError``) if the text does not start
+            with a valid JSON value.
+    """
+    text = value.strip()
+    try:
+        return json.loads(text)
+    except ValueError:
+        # json.JSONDecodeError subclasses ValueError.  Only a parse failure
+        # should trigger the lenient retry; anything else is a real error.
+        data, _ = json.JSONDecoder().raw_decode(text)
+        return data
+
+
+def _build_plan_update_from_todo_result(result: Any) -> AgentPlanUpdate | None:
+    """Convert a Hermes ``todo`` tool result into an ACP ``plan`` update.
+
+    Zed renders ``sessionUpdate: plan`` as its first-class task/todo panel.
+    Hermes already tracks task state through the ``todo`` tool, so the adapter
+    surfaces that state natively rather than only as a generic tool-call
+    transcript block.
+
+    Returns ``None`` when ``result`` is not a non-empty string, cannot be
+    parsed as JSON, or does not carry a ``todos`` list.
+    """
+    if not (isinstance(result, str) and result.strip()):
+        return None
+
+    try:
+        payload = _json_loads_maybe_prefix(result)
+    except Exception:
+        return None
+
+    raw_todos = payload.get("todos") if isinstance(payload, dict) else None
+    if not isinstance(raw_todos, list):
+        return None
+
+    # ACP plans only know pending/in_progress/completed.  Cancelled tasks are
+    # kept as terminal entries (and labelled below) rather than dropped, so
+    # the client's full-list replacement does not lose visible context.
+    acp_status_for = {
+        "pending": "pending",
+        "in_progress": "in_progress",
+        "completed": "completed",
+        "cancelled": "completed",
+    }
+
+    plan_entries: list[PlanEntry] = []
+    for todo in raw_todos:
+        if not isinstance(todo, dict):
+            continue
+        label = str(todo.get("content") or todo.get("id") or "").strip()
+        if not label:
+            continue
+        hermes_status = str(todo.get("status") or "pending").strip()
+        if hermes_status == "cancelled":
+            label = f"[cancelled] {label}"
+        plan_entries.append(
+            PlanEntry(
+                content=label,
+                priority="medium",
+                # Unknown statuses are shown as pending.
+                status=acp_status_for.get(hermes_status, "pending"),
+            )
+        )
+
+    # An empty todo list yields a plan update with no entries.
+    return AgentPlanUpdate(session_update="plan", entries=plan_entries)
+
+
 def _send_update(
    conn: acp.Client,
    session_id: str,
@@ -31,10 +91,17 @@ def _send_update(
    update: Any,
 ) -> None:
    """Fire-and-forget an ACP session update from a worker thread."""
+    from agent.async_utils import safe_schedule_threadsafe
+
+    future = safe_schedule_threadsafe(
+        conn.session_update(session_id, update),
+        loop,
+        logger=logger,
+        log_message="Failed to send ACP update",
+    )
+    if future is None:
+        return
    try:
-        future = asyncio.run_coroutine_threadsafe(
-            conn.session_update(session_id, update), loop
-        )
        future.result(timeout=5)
    except Exception:
        logger.debug("Failed to send ACP update", exc_info=True)
@@ -168,6 +235,10 @@ def make_step_cb(
                        snapshot=meta.get("snapshot"),
                    )
                    _send_update(conn, session_id, loop, update)
+                    if tool_name == "todo":
+                        plan_update = _build_plan_update_from_todo_result(result)
+                        if plan_update is not None:
+                            _send_update(conn, session_id, loop, plan_update)
                    if not queue:
                        tool_call_ids.pop(tool_name, None)

@@ -111,21 +111,28 @@ def make_approval_callback(
        allow_permanent: bool = True,
        **_: object,
    ) -> str:
+        from agent.async_utils import safe_schedule_threadsafe
+
        options = _build_permission_options(allow_permanent=allow_permanent)

-        future = None
+        tool_call = _build_permission_tool_call(command, description)
+        coro = request_permission_fn(
+            session_id=session_id,
+            tool_call=tool_call,
+            options=options,
+        )
+        future = safe_schedule_threadsafe(
+            coro, loop,
+            logger=logger,
+            log_message="Permission request: failed to schedule on loop",
+        )
+        if future is None:
+            return "deny"
+
        try:
-            tool_call = _build_permission_tool_call(command, description)
-            coro = request_permission_fn(
-                session_id=session_id,
-                tool_call=tool_call,
-                options=options,
-            )
-            future = asyncio.run_coroutine_threadsafe(coro, loop)
            response = future.result(timeout=timeout)
        except (FutureTimeout, Exception) as exc:
-            if future is not None:
-                future.cancel()
+            future.cancel()
            logger.warning("Permission request timed out or failed: %s", exc)
            return "deny"

@@ -57,14 +57,9 @@ from acp.schema import (
    UserMessageChunk,
 )

-# AuthMethodAgent was renamed from AuthMethod in agent-client-protocol 0.9.0
-try:
-    from acp.schema import AuthMethodAgent
-except ImportError:
-    from acp.schema import AuthMethod as AuthMethodAgent  # type: ignore[attr-defined]
-
-from acp_adapter.auth import detect_provider
+from acp_adapter.auth import TERMINAL_SETUP_AUTH_METHOD_ID, build_auth_methods, detect_provider
 from acp_adapter.events import (
+    _build_plan_update_from_todo_result,
    make_message_cb,
    make_step_cb,
    make_thinking_cb,
@@ -744,16 +739,7 @@ class HermesACPAgent(acp.Agent):
        resolved_protocol_version = (
            protocol_version if isinstance(protocol_version, int) else acp.PROTOCOL_VERSION
        )
-        provider = detect_provider()
-        auth_methods = None
-        if provider:
-            auth_methods = [
-                AuthMethodAgent(
-                    id=provider,
-                    name=f"{provider} runtime credentials",
-                    description=f"Authenticate Hermes using the currently configured {provider} runtime credentials.",
-                )
-            ]
+        auth_methods = build_auth_methods()

        client_name = client_info.name if client_info else "unknown"
        logger.info(
@@ -784,10 +770,18 @@ class HermesACPAgent(acp.Agent):
        # server has provider credentials configured — harmless under
        # Hermes' threat model (ACP is stdio-only, local-trust), but poor
        # API hygiene and confusing if ACP ever grows multi-method auth.
-        provider = detect_provider()
-        if not provider:
+        if not isinstance(method_id, str):
            return None
-        if not isinstance(method_id, str) or method_id.strip().lower() != provider:
+        normalized_method = method_id.strip().lower()
+        provider = detect_provider()
+
+        if normalized_method == TERMINAL_SETUP_AUTH_METHOD_ID:
+            # Terminal auth launches Hermes setup/model selection out-of-band.
+            # Only report success once that flow has produced usable runtime
+            # credentials for the normal ACP session.
+            return AuthenticateResponse() if provider else None
+
+        if not provider or normalized_method != provider:
            return None
        return AuthenticateResponse()

@@ -917,15 +911,20 @@ class HermesACPAgent(acp.Agent):
                if not tool_call_id or not tool_name:
                    continue
                result = message.get("content")
+                result_text = result if isinstance(result, str) else None
                if not await _send(
                    build_tool_complete(
                        tool_call_id,
                        tool_name,
-                        result=result if isinstance(result, str) else None,
+                        result=result_text,
                        function_args=function_args,
                    )
                ):
                    return
+                if tool_name == "todo":
+                    plan_update = _build_plan_update_from_todo_result(result_text)
+                    if plan_update is not None and not await _send(plan_update):
+                        return

    async def new_session(
        self,
@@ -1,12 +1,16 @@
 {
-  "schema_version": 1,
-  "name": "hermes-agent",
-  "display_name": "Hermes Agent",
-  "description": "AI agent by Nous Research with 90+ tools, persistent memory, and multi-platform support",
-  "icon": "icon.svg",
+  "id": "hermes-agent",
+  "name": "Hermes Agent",
+  "version": "0.13.0",
+  "description": "Self-improving open-source AI agent by Nous Research with ACP editor integration, persistent memory, skills, and rich tool support.",
+  "repository": "https://github.com/NousResearch/hermes-agent",
+  "website": "https://hermes-agent.nousresearch.com/docs/user-guide/features/acp",
+  "authors": ["Nous Research"],
+  "license": "MIT",
  "distribution": {
-    "type": "command",
-    "command": "hermes",
-    "args": ["acp"]
+    "uvx": {
+      "package": "hermes-agent[acp]==0.13.0",
+      "args": ["hermes-acp"]
+    }
  }
 }
@@ -1,25 +1,8 @@
-<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64" width="64" height="64">
-  <defs>
-    <linearGradient id="gold" x1="0%" y1="0%" x2="0%" y2="100%">
-      <stop offset="0%" style="stop-color:#F5C542;stop-opacity:1" />
-      <stop offset="100%" style="stop-color:#D4961C;stop-opacity:1" />
-    </linearGradient>
-  </defs>
-  <!-- Staff -->
-  <rect x="30" y="10" width="4" height="46" rx="2" fill="url(#gold)" />
-  <!-- Wings (left) -->
-  <path d="M30 18 C24 14, 14 14, 10 18 C14 16, 22 16, 28 20" fill="#F5C542" opacity="0.9" />
-  <path d="M30 22 C26 19, 18 19, 14 22 C18 20, 24 20, 28 24" fill="#D4961C" opacity="0.8" />
-  <!-- Wings (right) -->
-  <path d="M34 18 C40 14, 50 14, 54 18 C50 16, 42 16, 36 20" fill="#F5C542" opacity="0.9" />
-  <path d="M34 22 C38 19, 46 19, 50 22 C46 20, 40 20, 36 24" fill="#D4961C" opacity="0.8" />
-  <!-- Left serpent -->
-  <path d="M32 48 C22 44, 20 38, 26 34 C20 36, 18 42, 24 46 C18 40, 22 30, 30 28 C24 32, 22 38, 28 42"
-        fill="none" stroke="#F5C542" stroke-width="2.5" stroke-linecap="round" />
-  <!-- Right serpent -->
-  <path d="M32 48 C42 44, 44 38, 38 34 C44 36, 46 42, 40 46 C46 40, 42 30, 34 28 C40 32, 42 38, 36 42"
-        fill="none" stroke="#D4961C" stroke-width="2.5" stroke-linecap="round" />
-  <!-- Orb at top -->
-  <circle cx="32" cy="10" r="4" fill="#F5C542" />
-  <circle cx="32" cy="10" r="2" fill="#FFF8E1" opacity="0.7" />
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16" width="16" height="16" fill="none">
+  <path d="M8 1.5v13" stroke="currentColor" stroke-width="1.5" stroke-linecap="round"/>
+  <path d="M8 3.25c-2.35-1.4-4.7-.95-6.25.35 1.85-.2 3.8.2 5.55 1.55" stroke="currentColor" stroke-width="1.1" stroke-linecap="round" stroke-linejoin="round"/>
+  <path d="M8 3.25c2.35-1.4 4.7-.95 6.25.35-1.85-.2-3.8.2-5.55 1.55" stroke="currentColor" stroke-width="1.1" stroke-linecap="round" stroke-linejoin="round"/>
+  <path d="M8 13.25c-2.3-1-3.05-2.65-1.35-4.15-2 .8-2.35 2.95-.35 4" stroke="currentColor" stroke-width="1.1" stroke-linecap="round" stroke-linejoin="round"/>
+  <path d="M8 13.25c2.3-1 3.05-2.65 1.35-4.15 2 .8 2.35 2.95.35 4" stroke="currentColor" stroke-width="1.1" stroke-linecap="round" stroke-linejoin="round"/>
+  <circle cx="8" cy="1.8" r="1.1" fill="currentColor"/>
 </svg>
@@ -1060,10 +1060,12 @@ def _generate_pkce() -> tuple:

 def run_hermes_oauth_login_pure() -> Optional[Dict[str, Any]]:
    """Run Hermes-native OAuth PKCE flow and return credential state."""
+    import secrets
    import time
    import webbrowser

    verifier, challenge = _generate_pkce()
+    oauth_state = secrets.token_urlsafe(32)

    params = {
        "code": "true",
@@ -1073,7 +1075,7 @@ def run_hermes_oauth_login_pure() -> Optional[Dict[str, Any]]:
        "scope": _OAUTH_SCOPES,
        "code_challenge": challenge,
        "code_challenge_method": "S256",
-        "state": verifier,
+        "state": oauth_state,
    }
    from urllib.parse import urlencode

@@ -1110,7 +1112,12 @@ def run_hermes_oauth_login_pure() -> Optional[Dict[str, Any]]:

    splits = auth_code.split("#")
    code = splits[0]
-    state = splits[1] if len(splits) > 1 else ""
+    received_state = splits[1] if len(splits) > 1 else ""
+
+    # Validate state to prevent CSRF (RFC 6749 §10.12)
+    if received_state != oauth_state:
+        logger.warning("OAuth state mismatch — possible CSRF, aborting")
+        return None

    try:
        import urllib.request
@@ -1119,7 +1126,7 @@ def run_hermes_oauth_login_pure() -> Optional[Dict[str, Any]]:
            "grant_type": "authorization_code",
            "client_id": _OAUTH_CLIENT_ID,
            "code": code,
-            "state": state,
+            "state": received_state,
            "redirect_uri": _OAUTH_REDIRECT_URI,
            "code_verifier": verifier,
        }).encode()
@@ -0,0 +1,68 @@
+"""Async/sync bridging helpers.
+
+The codebase has ~30 sites that schedule a coroutine onto an event loop from a
+worker thread via :func:`asyncio.run_coroutine_threadsafe`.  That function can
+raise :class:`RuntimeError` (e.g. the loop was closed during a shutdown race),
+and when it does the coroutine object is never awaited and never closed —
+which triggers a ``"coroutine '<name>' was never awaited"`` RuntimeWarning and
+leaks the coroutine's frame until GC.
+
+:func:`safe_schedule_threadsafe` wraps the call, closes the coroutine on
+scheduling failure, and returns ``None`` (instead of a half-formed future) so
+callers can branch cleanly:
+
+    fut = safe_schedule_threadsafe(coro, loop)
+    if fut is None:
+        return  # or fallback behavior
+    fut.result(timeout=5)
+
+The helper deliberately does NOT also handle ``future.result()`` failures —
+that is a separate concern.  Once the loop has accepted the coroutine, its
+lifecycle belongs to the loop, not the scheduling thread.
+"""
+from __future__ import annotations
+
+import asyncio
+import logging
+from concurrent.futures import Future
+from typing import Any, Coroutine, Optional
+
+
+_DEFAULT_LOGGER = logging.getLogger(__name__)
+
+
+def safe_schedule_threadsafe(
+    coro: Coroutine[Any, Any, Any],
+    loop: Optional[asyncio.AbstractEventLoop],
+    *,
+    logger: Optional[logging.Logger] = None,
+    log_message: str = "Failed to schedule coroutine on loop",
+    log_level: int = logging.DEBUG,
+) -> Optional[Future]:
+    """Hand ``coro`` to ``loop`` from synchronous code without leaking it.
+
+    On success the :class:`concurrent.futures.Future` produced by
+    :func:`asyncio.run_coroutine_threadsafe` is returned, and the caller
+    decides what to do with it (block on ``.result(timeout=...)``, attach a
+    done-callback, or ignore it).
+
+    ``None`` is returned when ``loop`` is ``None`` or when scheduling raised
+    (e.g. the loop was closed during a shutdown race).  In both failure cases
+    the coroutine is closed, so it neither emits a
+    ``"coroutine was never awaited"`` warning nor keeps its frame alive, and
+    the failure is logged at ``log_level`` using ``log_message`` as prefix.
+    """
+    active_log = _DEFAULT_LOGGER if logger is None else logger
+
+    def _discard() -> None:
+        # Only real coroutine objects have (and need) close().
+        if asyncio.iscoroutine(coro):
+            coro.close()
+
+    if loop is None:
+        _discard()
+        active_log.log(log_level, "%s: loop is None", log_message)
+        return None
+
+    try:
+        scheduled = asyncio.run_coroutine_threadsafe(coro, loop)
+    except Exception as exc:
+        _discard()
+        active_log.log(log_level, "%s: %s", log_message, exc)
+        return None
+    return scheduled
@@ -369,6 +369,21 @@ def build_or_headers(or_config: dict | None = None) -> dict:

    return headers

+
+# NVIDIA NIM cloud billing attribution.  Keep this host-gated because the
+# nvidia provider also supports local/on-prem NIM endpoints via NVIDIA_BASE_URL.
+_NVIDIA_NIM_CLOUD_HEADERS = {
+    "X-BILLING-INVOKE-ORIGIN": "HermesAgent",
+}
+
+
+def build_nvidia_nim_headers(base_url: str | None) -> dict:
+    """Return NVIDIA NIM cloud attribution headers for build.nvidia.com traffic.
+
+    The headers are returned only when ``base_url`` points at the
+    ``integrate.api.nvidia.com`` host; any other endpoint (or a missing URL)
+    gets an empty dict.  Each call returns a fresh copy, so callers may
+    mutate the result.
+    """
+    candidate_url = str(base_url or "")
+    if not base_url_host_matches(candidate_url, "integrate.api.nvidia.com"):
+        return {}
+    return dict(_NVIDIA_NIM_CLOUD_HEADERS)
+
+
 # Vercel AI Gateway app attribution headers. HTTP-Referer maps to
 # referrerUrl and X-Title maps to appName in the gateway's analytics.
 from hermes_cli import __version__ as _HERMES_VERSION
@@ -1254,6 +1269,58 @@ def _resolve_nous_runtime_api(*, force_refresh: bool = False) -> Optional[tuple[
    return api_key, base_url


+def _resolve_xai_oauth_for_aux() -> Optional[Tuple[str, str]]:
+    """Resolve a fresh xAI OAuth (api_key, base_url) for auxiliary clients.
+
+    Prefer the credential pool, matching the main runtime/provider status
+    path.  Some xAI OAuth logins live only as pool entries; falling straight
+    to the singleton auth-store resolver would make auxiliary tasks such as
+    compression report "no provider configured" even though ``hermes auth
+    status`` shows xAI OAuth as logged in.
+
+    Falls back to ``hermes_cli.auth``'s singleton runtime resolver for older
+    auth-store-only logins. Returns ``None`` if neither path yields both a
+    key and a base URL (e.g. not authenticated with xAI Grok OAuth).
+    """
+    try:
+        from hermes_cli.auth import DEFAULT_XAI_OAUTH_BASE_URL
+
+        pool = load_pool("xai-oauth")
+        entry = pool.select() if pool and pool.has_credentials() else None
+        if entry is not None:
+            api_key = str(
+                getattr(entry, "runtime_api_key", None)
+                or getattr(entry, "access_token", "")
+                or ""
+            ).strip()
+            # Env overrides take precedence over the URL stored on the pool entry.
+            base_url = str(
+                os.getenv("HERMES_XAI_BASE_URL", "").strip().rstrip("/")
+                or os.getenv("XAI_BASE_URL", "").strip().rstrip("/")
+                or getattr(entry, "runtime_base_url", None)
+                or getattr(entry, "base_url", None)
+                or DEFAULT_XAI_OAUTH_BASE_URL
+            ).strip().rstrip("/")
+            if api_key and base_url:
+                return api_key, base_url
+    except Exception as exc:
+        logger.debug("Auxiliary xAI OAuth pool credential resolution failed: %s", exc)
+
+    try:
+        from hermes_cli.auth import resolve_xai_oauth_runtime_credentials
+
+        creds = resolve_xai_oauth_runtime_credentials()
+    except Exception as exc:
+        logger.debug("Auxiliary xAI OAuth runtime credential resolution failed: %s", exc)
+        return None
+    # A non-mapping result (e.g. None) must yield None, not an AttributeError.
+    if not isinstance(creds, dict):
+        return None
+    api_key = str(creds.get("api_key") or "").strip()
+    base_url = str(creds.get("base_url") or "").strip().rstrip("/")
+    return (api_key, base_url) if api_key and base_url else None
+
+
 def _read_codex_access_token() -> Optional[str]:
    """Read a valid, non-expired Codex OAuth access token from Hermes auth store.

@@ -1348,6 +1415,8 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
                from hermes_cli.models import copilot_default_headers

                extra["default_headers"] = copilot_default_headers()
+            elif base_url_host_matches(base_url, "integrate.api.nvidia.com"):
+                extra["default_headers"] = build_nvidia_nim_headers(base_url)
            else:
                try:
                    from providers import get_provider_profile as _gpf_aux
@@ -1383,6 +1452,8 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
            from hermes_cli.models import copilot_default_headers

            extra["default_headers"] = copilot_default_headers()
+        elif base_url_host_matches(base_url, "integrate.api.nvidia.com"):
+            extra["default_headers"] = build_nvidia_nim_headers(base_url)
        else:
            try:
                from providers import get_provider_profile as _gpf_aux2
@@ -1456,8 +1527,21 @@ def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]:
    nous = _read_nous_auth()
    runtime = _resolve_nous_runtime_api(force_refresh=False)
    if runtime is None and not nous:
+        logger.warning(
+            "Auxiliary Nous client unavailable: no Nous authentication found "
+            "(run: hermes auth)."
+        )
        _mark_provider_unhealthy("nous", ttl=60)
        return None, None
+    if runtime is None and nous:
+        # Runtime credential mint failed but stored Nous auth is still present.
+        # Falls back to the raw stored token below; surface a debug line so
+        # operators investigating expired/invalid sessions have a breadcrumb,
+        # without blocking the fallback path the rest of this function relies on.
+        logger.debug(
+            "Auxiliary Nous: runtime credential mint failed; falling back to "
+            "stored auth.json token."
+        )
    global auxiliary_is_nous
    auxiliary_is_nous = True
    logger.debug("Auxiliary client: Nous Portal")
@@ -1731,6 +1815,32 @@ def _try_custom_endpoint() -> Tuple[Optional[Any], Optional[str]]:
    return _fallback_client, model


+def _build_xai_oauth_aux_client(model: str) -> Tuple[Optional[Any], Optional[str]]:
+    """Create the auxiliary client for an xAI Grok OAuth session.
+
+    The resolved OAuth credentials go into a plain ``OpenAI`` client that is
+    wrapped in ``CodexAuxiliaryClient``, because xAI's ``/v1/responses``
+    endpoint speaks the OpenAI Responses API rather than chat completions.
+
+    ``model`` is required (no default is pinned, since xAI's allowlist can
+    drift); an empty value logs a warning.  Yields ``(None, None)`` for a
+    missing model or missing xAI OAuth credentials, else ``(client, model)``.
+    """
+    unavailable: Tuple[Optional[Any], Optional[str]] = (None, None)
+    if not model:
+        logger.warning(
+            "Auxiliary client: xai-oauth requested without a model; "
+            "pass model explicitly (auxiliary.<task>.model in config.yaml)."
+        )
+        return unavailable
+    credentials = _resolve_xai_oauth_for_aux()
+    if credentials is None:
+        return unavailable
+    token, endpoint = credentials
+    logger.debug("Auxiliary client: xAI OAuth (%s via Responses API)", model)
+    return CodexAuxiliaryClient(OpenAI(api_key=token, base_url=endpoint), model), model
+
+
 def _build_codex_client(model: str) -> Tuple[Optional[Any], Optional[str]]:
    """Build a CodexAuxiliaryClient for an explicitly-requested model.

@@ -2627,6 +2737,8 @@ def _to_async_client(sync_client, model: str, is_vision: bool = False):
        )
    elif base_url_host_matches(sync_base_url, "api.kimi.com"):
        async_kwargs["default_headers"] = {"User-Agent": "claude-code/0.1.0"}
+    elif base_url_host_matches(sync_base_url, "integrate.api.nvidia.com"):
+        async_kwargs["default_headers"] = build_nvidia_nim_headers(sync_base_url)
    else:
        # Fall back to profile.default_headers for providers that declare
        # client-level headers on their ProviderProfile (e.g. attribution
@@ -2838,6 +2950,26 @@ def resolve_provider_client(
        return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
                else (client, final_model))

+    # ── xAI Grok OAuth (loopback PKCE → Responses API) ───────────────
+    # Without this branch, an xai-oauth main provider falls through to the
+    # generic ``oauth_external`` arm below and returns ``(None, None)``,
+    # silently re-routing every auxiliary task (compression, web extract,
+    # session search, curator, etc.) to whatever Step-2 fallback the user
+    # has configured.  Users on xAI Grok OAuth would then see surprise
+    # OpenRouter / Nous bills for side tasks they thought were running on
+    # their xAI subscription.
+    if provider == "xai-oauth":
+        client, default = _build_xai_oauth_aux_client(model)
+        if client is None:
+            logger.warning(
+                "resolve_provider_client: xai-oauth requested but no xAI "
+                "OAuth token found (run: hermes model -> xAI Grok OAuth — SuperGrok Subscription)"
+            )
+            return None, None
+        final_model = _normalize_resolved_model(model or default, provider)
+        return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
+                else (client, final_model))
+
    # ── Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY) ───────────
    if provider == "custom":
        if explicit_base_url:
@@ -2868,6 +3000,8 @@ def resolve_provider_client(
                extra["default_headers"] = copilot_request_headers(
                    is_agent_turn=True, is_vision=is_vision
                )
+            elif base_url_host_matches(custom_base, "integrate.api.nvidia.com"):
+                extra["default_headers"] = build_nvidia_nim_headers(custom_base)
            else:
                # Fall back to profile.default_headers for providers that
                # declare client-level attribution headers on their profile.
@@ -3066,6 +3200,8 @@ def resolve_provider_client(
            headers.update(copilot_request_headers(
                is_agent_turn=True, is_vision=is_vision
            ))
+        elif base_url_host_matches(base_url, "integrate.api.nvidia.com"):
+            headers.update(build_nvidia_nim_headers(base_url))
        else:
            # Fall back to profile.default_headers for providers that declare
            # client-level attribution headers on their profile (e.g. GMI
@@ -3188,6 +3324,8 @@ def resolve_provider_client(
            return resolve_provider_client("nous", model, async_mode)
        if provider == "openai-codex":
            return resolve_provider_client("openai-codex", model, async_mode)
+        if provider == "xai-oauth":
+            return resolve_provider_client("xai-oauth", model, async_mode)
        # Other OAuth providers not directly supported
        logger.warning("resolve_provider_client: OAuth provider %s not "
                       "directly supported, try 'auto'", provider)
@@ -244,8 +244,21 @@ def _normalize_responses_message_status(value: Any, *, default: str = "completed
    return default


-def _chat_messages_to_responses_input(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """Convert internal chat-style messages to Responses input items."""
+def _chat_messages_to_responses_input(
+    messages: List[Dict[str, Any]],
+    *,
+    is_xai_responses: bool = False,
+) -> List[Dict[str, Any]]:
+    """Convert internal chat-style messages to Responses input items.
+
+    ``is_xai_responses=True`` strips ``encrypted_content`` from replayed
+    reasoning items.  xAI's OAuth/SuperGrok ``/v1/responses`` surface
+    rejects encrypted reasoning blobs minted by prior turns: the request
+    streams an ``error`` SSE frame before ``response.created`` and the
+    OpenAI SDK collapses it into a generic stream-ordering error.  Native
+    Codex (chatgpt.com backend-api) DOES accept replayed encrypted_content
+    — keep the default off.
+    """
    items: List[Dict[str, Any]] = []
    seen_item_ids: set = set()

@@ -271,9 +284,17 @@ def _chat_messages_to_responses_input(messages: List[Dict[str, Any]]) -> List[Di
            if role == "assistant":
                # Replay encrypted reasoning items from previous turns
                # so the API can maintain coherent reasoning chains.
+                #
+                # xAI OAuth (SuperGrok/Premium) rejects replayed
+                # ``encrypted_content`` reasoning items minted by prior
+                # turns — see _chat_messages_to_responses_input docstring.
+                # When ``is_xai_responses`` is set we drop the replay
+                # entirely; Grok still reasons on each turn server-side,
+                # we just don't try to thread the prior turn's encrypted
+                # blob back in.
                codex_reasoning = msg.get("codex_reasoning_items")
                has_codex_reasoning = False
-                if isinstance(codex_reasoning, list):
+                if isinstance(codex_reasoning, list) and not is_xai_responses:
                    for ri in codex_reasoning:
                        if isinstance(ri, dict) and ri.get("encrypted_content"):
                            item_id = ri.get("id")
@@ -726,7 +747,7 @@ def _preflight_codex_api_kwargs(
        "model", "instructions", "input", "tools", "store",
        "reasoning", "include", "max_output_tokens", "temperature",
        "tool_choice", "parallel_tool_calls", "prompt_cache_key", "service_tier",
-        "extra_headers",
+        "extra_headers", "extra_body",
    }
    normalized: Dict[str, Any] = {
        "model": model,
@@ -776,6 +797,19 @@ def _preflight_codex_api_kwargs(
        if normalized_headers:
            normalized["extra_headers"] = normalized_headers

+    extra_body = api_kwargs.get("extra_body")
+    if extra_body is not None:
+        if not isinstance(extra_body, dict):
+            raise ValueError("Codex Responses request 'extra_body' must be an object.")
+        # Pass extra_body through verbatim — used by xAI Responses to
+        # carry `prompt_cache_key` as a body-level field (the documented
+        # cache-routing surface on /v1/responses). The openai SDK
+        # serializes extra_body into the JSON body without per-field
+        # type checks, so it survives Responses.stream() kwarg-signature
+        # changes that would otherwise raise TypeError before the wire.
+        if extra_body:
+            normalized["extra_body"] = dict(extra_body)
+
    if allow_stream:
        stream = api_kwargs.get("stream")
        if stream is not None and stream is not True:
@@ -1429,15 +1429,23 @@ The user has requested that this compaction PRIORITISE preserving all informatio
            return messages

        turns_to_summarize = messages[compress_start:compress_end]
+        # A persisted handoff summary can sit in the protected head after a
+        # resume (commonly immediately after the system prompt). Search from
+        # the first non-system message through the compression window so we can
+        # rehydrate iterative-summary state without serializing that handoff as
+        # a new turn. Protected messages after the handoff remain live context,
+        # so only summarize messages that are both after the handoff and inside
+        # the current compression window.
+        summary_search_start = 1 if messages and messages[0].get("role") == "system" else 0
        summary_idx, summary_body = self._find_latest_context_summary(
            messages,
-            compress_start,
+            summary_search_start,
            compress_end,
        )
        if summary_idx is not None:
            if summary_body and not self._previous_summary:
                self._previous_summary = summary_body
-            turns_to_summarize = messages[summary_idx + 1:compress_end]
+            turns_to_summarize = messages[max(compress_start, summary_idx + 1):compress_end]

        if not self.quiet_mode:
            logger.info(
@@ -30,6 +30,28 @@ _DEFAULT_TIMEOUT_SECONDS = 900.0
 _TOOL_CALL_BLOCK_RE = re.compile(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", re.DOTALL)
 _TOOL_CALL_JSON_RE = re.compile(r"\{\s*\"id\"\s*:\s*\"[^\"]+\"\s*,\s*\"type\"\s*:\s*\"function\"\s*,\s*\"function\"\s*:\s*\{.*?\}\s*\}", re.DOTALL)

+# Stderr fingerprint of the deprecated `gh copilot` CLI extension
+# (https://github.blog/changelog/2025-09-25-upcoming-deprecation-of-gh-copilot-cli-extension).
+# We require BOTH the literal product name ("gh-copilot") AND a deprecation
+# marker, so generic stderr from the NEW `@github/copilot` CLI — whose repo
+# is github.com/github/copilot-cli and which legitimately mentions "copilot-cli"
+# in its own banners and error messages — doesn't get misclassified as the
+# deprecated extension.
+_DEPRECATION_REQUIRED = ("gh-copilot",)
+_DEPRECATION_MARKERS = (
+    "has been deprecated",
+    "no commands will be executed",
+)
+
+
+def _is_gh_copilot_deprecation_message(stderr_text: str) -> bool:
+    """Detect the retired gh-copilot extension's deprecation banner in stderr text."""
+    # Case-insensitive: a product name AND a deprecation marker must both appear.
+    haystack = stderr_text.lower()
+    names_product = any(name in haystack for name in _DEPRECATION_REQUIRED)
+    mentions_deprecation = any(phrase in haystack for phrase in _DEPRECATION_MARKERS)
+    return names_product and mentions_deprecation
+

 def _resolve_command() -> str:
    return (
@@ -506,6 +528,21 @@ class CopilotACPClient:

            stderr_text = "\n".join(stderr_tail).strip()
            if proc.poll() is not None and stderr_text:
+                if _is_gh_copilot_deprecation_message(stderr_text):
+                    raise RuntimeError(
+                        "Hermes ACP mode requires the NEW GitHub Copilot CLI "
+                        "(github.com/github/copilot-cli), but the binary it just "
+                        "spawned is the deprecated `gh copilot` extension.\n\n"
+                        "Install the new CLI:\n"
+                        "  npm install -g @github/copilot\n"
+                        "  # then verify with: copilot --help\n\n"
+                        "If `copilot` already resolves to the new CLI but you still see this,\n"
+                        "point Hermes at it explicitly:\n"
+                        "  export HERMES_COPILOT_ACP_COMMAND=/path/to/new/copilot\n\n"
+                        "Alternative: use the `copilot` provider (no ACP, hits the Copilot API\n"
+                        "directly with a Copilot subscription token) via `hermes setup`.\n\n"
+                        f"Original error:\n{stderr_text}"
+                    )
                raise RuntimeError(f"Copilot ACP process exited early: {stderr_text}")
            raise TimeoutError(f"Timed out waiting for Copilot ACP response to {method}.")

@@ -29,6 +29,7 @@ from hermes_cli.auth import (
    _resolve_zai_base_url,
    _save_auth_store,
    _save_provider_state,
+    _store_provider_state,
    read_credential_pool,
    write_credential_pool,
 )
@@ -539,6 +540,64 @@ class CredentialPool:
            logger.debug("Failed to sync Codex entry from auth.json: %s", exc)
        return entry

+    def _sync_xai_oauth_entry_from_auth_store(self, entry: PooledCredential) -> PooledCredential:
+        """Adopt xAI OAuth tokens from auth.json when they differ from ``entry``.
+
+        xAI OAuth refresh tokens are single-use.  If another Hermes process
+        (or another profile sharing the same auth.json) refreshes first, the
+        new pair lands in ``providers["xai-oauth"]["tokens"]`` and this pool
+        entry is left holding a consumed refresh_token; replaying it on the
+        next ``_refresh_entry`` call would earn a ``refresh_token_reused``-style
+        4xx.
+
+        Restricted to singleton-seeded entries (``loopback_pkce``); manually
+        added ones (``manual:xai_pkce``) keep their own refresh-token
+        lifecycle.  Returns the updated entry (handed to ``_replace_entry``
+        and persisted, status/error fields reset) or ``entry`` unchanged.
+        """
+        if not (self.provider == "xai-oauth" and entry.source == "loopback_pkce"):
+            return entry
+
+        try:
+            # The store lock covers only the read; comparing happens after release.
+            with _auth_store_lock():
+                provider_state = _load_provider_state(_load_auth_store(), "xai-oauth")
+            stored = provider_state.get("tokens") if isinstance(provider_state, dict) else None
+            if not isinstance(stored, dict):
+                return entry
+
+            fresh_access = stored.get("access_token", "")
+            fresh_refresh = stored.get("refresh_token", "")
+            access_changed = fresh_access != (entry.access_token or "")
+            refresh_changed = bool(fresh_refresh) and fresh_refresh != (entry.refresh_token or "")
+            # Nothing to adopt without a stored access token, or when the
+            # stored pair already matches what this entry holds.
+            if not fresh_access or not (access_changed or refresh_changed):
+                return entry
+
+            logger.debug(
+                "Pool entry %s: syncing xAI OAuth tokens from auth.json "
+                "(refreshed by another process)",
+                entry.id,
+            )
+            # Start from cleared status/error bookkeeping, then layer the tokens on.
+            changes: Dict[str, Any] = dict.fromkeys((
+                "last_status", "last_status_at", "last_error_code",
+                "last_error_reason", "last_error_message", "last_error_reset_at",
+            ))
+            changes["access_token"] = fresh_access
+            # An empty stored refresh token never overwrites the one we hold.
+            changes["refresh_token"] = fresh_refresh or entry.refresh_token
+            if provider_state.get("last_refresh"):
+                changes["last_refresh"] = provider_state["last_refresh"]
+            refreshed_entry = replace(entry, **changes)
+            self._replace_entry(entry, refreshed_entry)
+            self._persist()
+            return refreshed_entry
+        except Exception as exc:
+            logger.debug("Failed to sync xAI OAuth entry from auth.json: %s", exc)
+        return entry
+
    def _sync_nous_entry_from_auth_store(self, entry: PooledCredential) -> PooledCredential:
        """Sync a Nous pool entry from auth.json if tokens differ.

@@ -604,9 +663,22 @@ class CredentialPool:
        re-seeding a consumed single-use refresh token.

        Applies to any OAuth provider whose singleton lives in auth.json
-        (currently Nous and OpenAI Codex).
+        (currently Nous, OpenAI Codex, and xAI Grok OAuth).
+
+        ``set_active=False`` on every write: a pool sync-back is a
+        token-rotation side effect, not the user choosing a provider.
+        Using ``_save_provider_state`` (which sets ``active_provider``)
+        here would mean every Nous/Codex/xAI refresh in a multi-provider
+        setup silently flips the ``active_provider`` flag — the next
+        ``hermes`` invocation that defaults to the active provider
+        (e.g. setup wizard, ``hermes auth status``) would land on
+        whatever provider happened to refresh last, not whatever the
+        user actually chose.
        """
-        if entry.source != "device_code":
+        # Only sync entries that were seeded *from* a singleton.  Manually
+        # added pool entries (source="manual:*") are independent credentials
+        # and must not write back to the singleton.
+        if entry.source not in {"device_code", "loopback_pkce"}:
            return
        try:
            with _auth_store_lock():
@@ -632,7 +704,7 @@ class CredentialPool:
                            state[extra_key] = val
                    if entry.inference_base_url:
                        state["inference_base_url"] = entry.inference_base_url
-                    _save_provider_state(auth_store, "nous", state)
+                    _store_provider_state(auth_store, "nous", state, set_active=False)

                elif self.provider == "openai-codex":
                    state = _load_provider_state(auth_store, "openai-codex")
@@ -646,7 +718,21 @@ class CredentialPool:
                        tokens["refresh_token"] = entry.refresh_token
                    if entry.last_refresh:
                        state["last_refresh"] = entry.last_refresh
-                    _save_provider_state(auth_store, "openai-codex", state)
+                    _store_provider_state(auth_store, "openai-codex", state, set_active=False)
+
+                elif self.provider == "xai-oauth":
+                    state = _load_provider_state(auth_store, "xai-oauth")
+                    if not isinstance(state, dict):
+                        return
+                    tokens = state.get("tokens")
+                    if not isinstance(tokens, dict):
+                        return
+                    tokens["access_token"] = entry.access_token
+                    if entry.refresh_token:
+                        tokens["refresh_token"] = entry.refresh_token
+                    if entry.last_refresh:
+                        state["last_refresh"] = entry.last_refresh
+                    _store_provider_state(auth_store, "xai-oauth", state, set_active=False)

                else:
                    return
@@ -699,6 +785,25 @@ class CredentialPool:
                    refresh_token=refreshed["refresh_token"],
                    last_refresh=refreshed.get("last_refresh"),
                )
+            elif self.provider == "xai-oauth":
+                # Adopt fresher tokens from auth.json before spending the
+                # refresh_token — single-use tokens consumed by another
+                # process (or another profile sharing the singleton) would
+                # otherwise trigger ``refresh_token_reused`` on the next
+                # POST.  Only meaningful for singleton-seeded entries.
+                synced = self._sync_xai_oauth_entry_from_auth_store(entry)
+                if synced is not entry:
+                    entry = synced
+                refreshed = auth_mod.refresh_xai_oauth_pure(
+                    entry.access_token,
+                    entry.refresh_token,
+                )
+                updated = replace(
+                    entry,
+                    access_token=refreshed["access_token"],
+                    refresh_token=refreshed["refresh_token"],
+                    last_refresh=refreshed.get("last_refresh"),
+                )
            elif self.provider == "nous":
                synced = self._sync_nous_entry_from_auth_store(entry)
                if synced is not entry:
@@ -777,6 +882,30 @@ class CredentialPool:
                    # Credentials file had a valid (non-expired) token — use it directly
                    logger.debug("Credentials file has valid token, using without refresh")
                    return synced
+            # For xai-oauth: same race as nous — another process may have
+            # consumed the refresh token between our proactive sync and the
+            # HTTP call.  Re-check auth.json and adopt the fresh tokens if
+            # they have rotated since.  Only meaningful for singleton-seeded
+            # (loopback_pkce) entries; manual entries don't share state with
+            # the singleton.
+            if self.provider == "xai-oauth":
+                synced = self._sync_xai_oauth_entry_from_auth_store(entry)
+                if synced.refresh_token != entry.refresh_token:
+                    logger.debug(
+                        "xAI OAuth refresh failed but auth.json has newer tokens — adopting"
+                    )
+                    updated = replace(
+                        synced,
+                        last_status=STATUS_OK,
+                        last_status_at=None,
+                        last_error_code=None,
+                        last_error_reason=None,
+                        last_error_message=None,
+                        last_error_reset_at=None,
+                    )
+                    self._replace_entry(synced, updated)
+                    self._persist()
+                    return updated
            # For nous: another process may have consumed the refresh token
            # between our proactive sync and the HTTP call.  Re-sync from
            # auth.json and adopt the fresh tokens if available.
@@ -829,6 +958,11 @@ class CredentialPool:
                entry.access_token,
                CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS,
            )
+        if self.provider == "xai-oauth":
+            return auth_mod._xai_access_token_is_expiring(
+                entry.access_token,
+                auth_mod.XAI_ACCESS_TOKEN_REFRESH_SKEW_SECONDS,
+            )
        if self.provider == "nous":
            # Nous refresh/mint can require network access and should happen when
            # runtime credentials are actually resolved, not merely when the pool
@@ -883,6 +1017,17 @@ class CredentialPool:
                if synced is not entry:
                    entry = synced
                    cleared_any = True
+            # For xai-oauth singleton-seeded entries, identical pattern:
+            # an entry frozen as exhausted may simply be holding stale
+            # tokens that another process (or a fresh `hermes model` ->
+            # xAI Grok OAuth login) has since rotated in auth.json.
+            if (self.provider == "xai-oauth"
+                    and entry.source == "loopback_pkce"
+                    and entry.last_status == STATUS_EXHAUSTED):
+                synced = self._sync_xai_oauth_entry_from_auth_store(entry)
+                if synced is not entry:
+                    entry = synced
+                    cleared_any = True
            if entry.last_status == STATUS_EXHAUSTED:
                exhausted_until = _exhausted_until(entry)
                if exhausted_until is not None and now < exhausted_until:
@@ -1394,6 +1539,37 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
                },
            )

+    elif provider == "xai-oauth":
+        # When the user logs in via ``hermes model`` -> xAI Grok OAuth,
+        # tokens are written to the auth.json singleton
+        # (``providers["xai-oauth"]``).  Surface them in the pool too so
+        # ``hermes auth list`` reflects the logged-in state and so the pool
+        # is the single source of truth for refresh during runtime resolution.
+        if _is_suppressed(provider, "loopback_pkce"):
+            return changed, active_sources
+
+        state = _load_provider_state(auth_store, "xai-oauth")
+        tokens = state.get("tokens") if isinstance(state, dict) else None
+        if isinstance(tokens, dict) and tokens.get("access_token"):
+            active_sources.add("loopback_pkce")
+            from hermes_cli.auth import DEFAULT_XAI_OAUTH_BASE_URL
+
+            base_url = DEFAULT_XAI_OAUTH_BASE_URL
+            changed |= _upsert_entry(
+                entries,
+                provider,
+                "loopback_pkce",
+                {
+                    "source": "loopback_pkce",
+                    "auth_type": AUTH_TYPE_OAUTH,
+                    "access_token": tokens.get("access_token", ""),
+                    "refresh_token": tokens.get("refresh_token"),
+                    "base_url": base_url,
+                    "last_refresh": state.get("last_refresh"),
+                    "label": label_from_token(tokens.get("access_token", ""), "loopback_pkce"),
+                },
+            )
+
    return changed, active_sources


@@ -265,6 +265,31 @@ def _remove_minimax_oauth(provider: str, removed) -> RemovalResult:
    return result


+def _remove_xai_oauth_loopback_pkce(provider: str, removed) -> RemovalResult:
+    """Wipe the xAI OAuth singleton (auth.json ``providers.xai-oauth``).
+
+    The central dispatcher only drops the in-memory pool entry and records
+    a suppression.  If ``providers.xai-oauth`` stayed in auth.json, the next
+    ``load_pool("xai-oauth")`` would let ``_seed_from_singletons`` re-seed
+    the entry from it, so ``hermes auth remove xai-oauth <N>`` would quietly
+    undo itself.  Clearing the singleton here makes the removal stick.
+
+    Manual entries (``manual:xai_pkce`` from ``hermes auth add xai-oauth``)
+    are pool-only and need no cleanup here.  ``removed`` is not used by this
+    step.  The result always carries a re-authentication hint, plus a
+    "cleaned" note when ``_clear_auth_store_provider`` returns a truthy
+    value.
+    """
+    outcome = RemovalResult()
+    store_was_cleared = _clear_auth_store_provider(provider)
+    if store_was_cleared:
+        outcome.cleaned.append(f"Cleared {provider} OAuth tokens from auth store")
+    outcome.hints.append(
+        "Run `hermes model` → xAI Grok OAuth (SuperGrok Subscription) to re-authenticate if needed."
+    )
+    return outcome
+
+
 def _remove_codex_device_code(provider: str, removed) -> RemovalResult:
    """Codex tokens live in TWO places: our auth store AND ~/.codex/auth.json.

@@ -397,6 +422,11 @@ def _register_all_sources() -> None:
        remove_fn=_remove_codex_device_code,
        description="auth.json providers.openai-codex + ~/.codex/auth.json",
    ))
+    register(RemovalStep(
+        provider="xai-oauth", source_id="loopback_pkce",
+        remove_fn=_remove_xai_oauth_loopback_pkce,
+        description="auth.json providers.xai-oauth",
+    ))
    register(RemovalStep(
        provider="qwen-oauth", source_id="qwen-cli",
        remove_fn=_remove_qwen_cli,
@@ -240,21 +240,6 @@ def build_tool_preview(tool_name: str, args: dict, max_len: int | None = None) -
            msg = msg[:17] + "..."
        return f"to {target}: \"{msg}\""

-    if tool_name.startswith("rl_"):
-        rl_previews = {
-            "rl_list_environments": "listing envs",
-            "rl_select_environment": args.get("name", ""),
-            "rl_get_current_config": "reading config",
-            "rl_edit_config": f"{args.get('field', '')}={args.get('value', '')}",
-            "rl_start_training": "starting",
-            "rl_check_status": args.get("run_id", "")[:16],
-            "rl_stop_training": f"stopping {args.get('run_id', '')[:16]}",
-            "rl_get_results": args.get("run_id", "")[:16],
-            "rl_list_runs": "listing runs",
-            "rl_test_inference": f"{args.get('num_steps', 3)} steps",
-        }
-        return rl_previews.get(tool_name)
-
    key = primary_args.get(tool_name)
    if not key:
        for fallback_key in ("query", "text", "command", "path", "name", "prompt", "code", "goal"):
@@ -981,15 +966,6 @@ def get_cute_tool_message(
        if action == "list":
            return _wrap(f"┊ ⏰ cron      listing  {dur}")
        return _wrap(f"┊ ⏰ cron      {action} {args.get('job_id', '')}  {dur}")
-    if tool_name.startswith("rl_"):
-        rl = {
-            "rl_list_environments": "list envs", "rl_select_environment": f"select {args.get('name', '')}",
-            "rl_get_current_config": "get config", "rl_edit_config": f"set {args.get('field', '?')}",
-            "rl_start_training": "start training", "rl_check_status": f"status {args.get('run_id', '?')[:12]}",
-            "rl_stop_training": f"stop {args.get('run_id', '?')[:12]}", "rl_get_results": f"results {args.get('run_id', '?')[:12]}",
-            "rl_list_runs": "list runs", "rl_test_inference": "test inference",
-        }
-        return _wrap(f"┊ 🧪 rl        {rl.get(tool_name, tool_name.replace('rl_', ''))}  {dur}")
    if tool_name == "execute_code":
        code = args.get("code", "")
        first_line = code.strip().split("\n")[0] if code.strip() else ""
@@ -40,7 +40,7 @@ import os
 import threading
 import time
 from concurrent.futures import Future as ConcurrentFuture
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple

 from agent.lsp import eventlog
 from agent.lsp.client import (
@@ -107,9 +107,14 @@ class _BackgroundLoop:

        Returns the coroutine's result, or raises its exception.
        """
+        from agent.async_utils import safe_schedule_threadsafe
        if self._loop is None:
+            if asyncio.iscoroutine(coro):
+                coro.close()
            raise RuntimeError("background loop not started")
-        fut: ConcurrentFuture = asyncio.run_coroutine_threadsafe(coro, self._loop)
+        fut = safe_schedule_threadsafe(coro, self._loop)
+        if fut is None:
+            raise RuntimeError("background loop not running")
        try:
            return fut.result(timeout=timeout)
        except Exception:
@@ -305,6 +310,7 @@ class LSPService:
        *,
        delta: bool = True,
        timeout: Optional[float] = None,
+        line_shift: Optional[Callable[[int], Optional[int]]] = None,
    ) -> List[Dict[str, Any]]:
        """Synchronously open ``file_path`` in the right server, wait for
        diagnostics, return them.
@@ -314,6 +320,18 @@ class LSPService:
        Diagnostics present in the baseline are removed so the caller
        only sees errors introduced by the current edit.

+        When ``line_shift`` is provided, baseline diagnostics are
+        remapped through it before the set-difference.  This handles
+        the case where the edit deleted or inserted lines, causing
+        pre-existing diagnostics below the edit point to surface at
+        different line numbers in the post-edit snapshot — without
+        the shift, they'd all look "introduced by this edit".  Pass
+        a callable built by
+        :func:`agent.lsp.range_shift.build_line_shift` (pre_text,
+        post_text).  Omit when pre/post content isn't available;
+        the unshifted comparison still catches diagnostics that
+        didn't move.
+
        Returns an empty list when LSP is disabled, when no workspace
        can be detected, when no server matches, or when the server
        can't be spawned.  Never raises.
@@ -344,6 +362,14 @@ class LSPService:
        if delta:
            baseline = self._delta_baseline.get(abs_path) or []
            if baseline:
+                if line_shift is not None:
+                    # Remap baseline diagnostics into post-edit
+                    # coordinates so shifted-but-otherwise-identical
+                    # entries hash equal under _diag_key.  Entries
+                    # that mapped into a deleted region drop out
+                    # silently — they no longer apply.
+                    from agent.lsp.range_shift import shift_baseline
+                    baseline = shift_baseline(baseline, line_shift)
                seen = {_diag_key(d) for d in baseline}
                diags = [d for d in diags if _diag_key(d) not in seen]
            # Roll baseline forward — next call returns deltas relative
@@ -585,8 +611,19 @@ class LSPService:


 def _diag_key(d: Dict[str, Any]) -> str:
-    """Content equality key used for delta filtering.  Mirrors
-    :func:`agent.lsp.client._diagnostic_key`."""
+    """Content equality key used for cross-edit delta filtering.
+
+    Includes the diagnostic's position range — when used together
+    with :func:`agent.lsp.range_shift.shift_baseline`, the baseline
+    is line-shifted into post-edit coordinates BEFORE this key is
+    computed, so identical-but-shifted diagnostics hash equal.  Two
+    genuinely distinct diagnostics at different lines (e.g. the same
+    error class introduced at a second site) hash differently and
+    are surfaced as new.
+
+    Mirrors :func:`agent.lsp.client._diagnostic_key`; intentionally
+    identical so the two layers agree on diagnostic identity.
+    """
    rng = d.get("range") or {}
    start = rng.get("start") or {}
    end = rng.get("end") or {}
@@ -0,0 +1,149 @@
+"""Diff-aware line-shift map for cross-edit LSP delta filtering.
+
+When an edit deletes or inserts lines in the middle of a file, every
+diagnostic below the edit point shifts to a new line number.  The
+LSPService delta filter subtracts the pre-edit baseline from the
+post-edit diagnostics keyed on ``(severity, code, source, message,
+range)`` — without an adjustment, the shifted-but-otherwise-identical
+diagnostics look brand-new and the agent gets flooded with noise.
+
+The fix used here is the same trick git's blame and unified diff use:
+build a piecewise-linear map from pre-edit line numbers to post-edit
+line numbers, then apply that map to baseline diagnostics before the
+set-difference.  Diagnostics whose pre-edit line is in a region the
+edit deleted return ``None`` and are dropped from the baseline (they
+genuinely no longer apply).
+
+Trade-off vs. dropping range from the key entirely (the previous
+fix): preserves the "new instance of an identical error at a
+different line" signal — if the model introduces a second instance
+of the same error class at a different location, that one will be
+surfaced as new instead of swallowed by content-only dedup.
+
+The map is derived from ``difflib.SequenceMatcher.get_opcodes()`` and
+exposed as a single callable so callers don't have to reason about
+diff regions.
+"""
+from __future__ import annotations
+
+import difflib
+from typing import Any, Callable, Dict, List, Optional
+
+
+def _split_lsp_lines(text: str) -> List[str]:
+    """Split ``text`` into lines using only the LSP line terminators.
+
+    LSP positions count lines separated by LF, CRLF or CR.
+    ``str.splitlines()`` additionally breaks on form feed, vertical tab,
+    U+2028 and friends, so a file containing one of those characters
+    would yield line indexes that disagree with the server's diagnostics.
+
+    A terminator at the very end of the text closes the last line rather
+    than opening a new empty one (the same convention ``splitlines`` uses),
+    and empty or missing text yields an empty list.
+    """
+    if not text:
+        return []
+    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
+    if lines[-1] == "":
+        lines.pop()
+    return lines
+
+
+def build_line_shift(pre_text: str, post_text: str) -> Callable[[int], Optional[int]]:
+    """Build a function mapping pre-edit line numbers to post-edit line numbers.
+
+    Lines are 0-indexed to match the LSP wire format
+    (``range.start.line`` is 0-indexed) and are delimited by the LSP
+    terminators only (LF, CRLF, CR).
+
+    The returned callable takes a pre-edit 0-indexed line number and
+    returns:
+
+    * the corresponding post-edit line when the line sits in an unchanged
+      region;
+    * ``None`` when the line was deleted or replaced by the edit (no
+      post-edit counterpart exists);
+    * the last post-edit line index when the input lies outside the
+      pre-edit text (negative, or at/after its line count), or ``None``
+      in that case if the post-edit text is empty.
+
+    When both texts split into identical lines the identity function is
+    returned.
+
+    Cost: one ``SequenceMatcher.get_opcodes()`` call up front; each
+    lookup is a binary search over the opcode regions, O(log n).
+    """
+    import bisect  # function-scope: only this builder needs it
+
+    pre_lines = _split_lsp_lines(pre_text)
+    post_lines = _split_lsp_lines(post_text)
+
+    # Trivial case: identical content or no content — identity map.
+    if pre_lines == post_lines:
+        return lambda line: line
+
+    # get_opcodes() yields (tag, i1, i2, j1, j2) with tag in
+    # {'equal', 'replace', 'delete', 'insert'}; i1:i2 is the pre range and
+    # j1:j2 the post range.  'insert' regions cover no pre-edit line
+    # (i1 == i2) so they can never contain a lookup and are left out.
+    # The remaining regions are contiguous and sorted by i1.
+    sm = difflib.SequenceMatcher(a=pre_lines, b=post_lines, autojunk=False)
+    regions = [
+        (i1, i2, j1, tag)
+        for tag, i1, i2, j1, _j2 in sm.get_opcodes()
+        if i1 < i2
+    ]
+    region_starts = [region[0] for region in regions]
+
+    # Fallback for lines outside the pre-edit text: end of post, if any.
+    end_anchor: Optional[int] = len(post_lines) - 1 if post_lines else None
+
+    def shift(line: int) -> Optional[int]:
+        # Rightmost region whose start is <= line.
+        idx = bisect.bisect_right(region_starts, line) - 1
+        if idx >= 0:
+            i1, i2, j1, tag = regions[idx]
+            if line < i2:
+                if tag == "equal":
+                    # Pre-line N → post-line (N - i1 + j1).
+                    return line - i1 + j1
+                # 'delete' and 'replace': the pre-line has no meaningful
+                # post-edit counterpart.
+                return None
+        return end_anchor
+
+    return shift
+
+
+def shift_diagnostic_range(diag: Dict[str, Any],
+                           shift: Callable[[int], Optional[int]]) -> Optional[Dict[str, Any]]:
+    """Produce a copy of ``diag`` whose range lines are translated by ``shift``.
+
+    ``shift`` maps a pre-edit line to a post-edit line or ``None``.
+
+    * If the start line maps to ``None`` the whole diagnostic is
+      discarded and ``None`` is returned — the caller drops it from the
+      baseline.
+    * If only the end line maps to ``None`` (a multi-line diagnostic
+      straddling the edit), the end line falls back to the shifted start
+      line so the entry stays in the baseline.
+
+    Missing ``range`` / ``start`` / ``end`` pieces are treated as empty;
+    a missing start line defaults to 0, a missing end line defaults to
+    the start line, and missing characters default to 0.  All positions
+    in the result are coerced with ``int``.  The input ``diag`` is left
+    untouched.
+    """
+    span = diag.get("range") or {}
+    head = span.get("start") or {}
+    tail = span.get("end") or {}
+
+    old_first = int(head.get("line", 0))
+    old_last = int(tail.get("line", old_first))
+
+    first = shift(old_first)
+    if first is None:
+        return None
+
+    last = shift(old_last)
+    if last is None:
+        # End line vanished with the edit — pin it to the start line.
+        last = first
+
+    remapped_range = {
+        "start": {"line": first, "character": int(head.get("character", 0))},
+        "end": {"line": last, "character": int(tail.get("character", 0))},
+    }
+    # Shallow copy with the range swapped out; every other field is kept.
+    return {**diag, "range": remapped_range}
+
+
+def shift_baseline(baseline: List[Dict[str, Any]],
+                   shift: Callable[[int], Optional[int]]) -> List[Dict[str, Any]]:
+    """Remap every baseline diagnostic through ``shift``.
+
+    Non-dict entries are ignored, and diagnostics for which
+    ``shift_diagnostic_range`` returns ``None`` are left out.  Input
+    order is preserved.
+    """
+    remapped = (
+        shift_diagnostic_range(entry, shift)
+        for entry in baseline
+        if isinstance(entry, dict)
+    )
+    return [moved for moved in remapped if moved is not None]
+
+
+__all__ = ["build_line_shift", "shift_diagnostic_range", "shift_baseline"]
@@ -213,6 +213,7 @@ DEFAULT_CONTEXT_LENGTHS = {
    "grok-2-vision": 8192,      # grok-2-vision, -1212, -latest
    "grok-4-fast": 2000000,     # grok-4-fast-(non-)reasoning
    "grok-4.20": 2000000,       # grok-4.20-0309-(non-)reasoning, -multi-agent-0309
+    "grok-4.3": 1000000,        # grok-4.3, grok-4.3-latest — 1M context per docs.x.ai
    "grok-4": 256000,           # grok-4, grok-4-0709
    "grok-3": 131072,           # grok-3, grok-3-mini, grok-3-fast, grok-3-mini-fast
    "grok-2": 131072,           # grok-2, grok-2-1212, grok-2-latest
@@ -357,6 +358,12 @@ _URL_TO_PROVIDER: Dict[str, str] = {
    "api.deepseek.com": "deepseek",
    "api.githubcopilot.com": "copilot",
    "models.github.ai": "copilot",
+    # GitHub Models free tier (Azure-hosted prototyping endpoint) — same
+    # canonical provider as the Copilot API.  Hard per-request token cap
+    # (often 8K) makes it unusable for Hermes' system prompt, but mapping
+    # it here lets us recognize the endpoint and emit a targeted hint
+    # instead of falling through the unknown-custom-endpoint path.
+    "models.inference.ai.azure.com": "copilot",
    "api.fireworks.ai": "fireworks",
    "opencode.ai": "opencode-go",
    "api.x.ai": "xai",
@@ -24,7 +24,10 @@ class ResponsesApiTransport(ProviderTransport):
    def convert_messages(self, messages: List[Dict[str, Any]], **kwargs) -> Any:
        """Convert OpenAI chat messages to Responses API input items."""
        from agent.codex_responses_adapter import _chat_messages_to_responses_input
-        return _chat_messages_to_responses_input(messages)
+        return _chat_messages_to_responses_input(
+            messages,
+            is_xai_responses=bool(kwargs.get("is_xai_responses")),
+        )

    def convert_tools(self, tools: List[Dict[str, Any]]) -> Any:
        """Convert OpenAI tool schemas to Responses API function definitions."""
@@ -89,24 +92,38 @@ class ResponsesApiTransport(ProviderTransport):
        _effort_clamp = {"minimal": "low"}
        reasoning_effort = _effort_clamp.get(reasoning_effort, reasoning_effort)

+        response_tools = _responses_tools(tools)
        kwargs = {
            "model": model,
            "instructions": instructions,
-            "input": _chat_messages_to_responses_input(payload_messages),
-            "tools": _responses_tools(tools),
-            "tool_choice": "auto",
-            "parallel_tool_calls": True,
+            "input": _chat_messages_to_responses_input(
+                payload_messages,
+                is_xai_responses=is_xai_responses,
+            ),
+            "tools": response_tools,
            "store": False,
        }
+        if response_tools:
+            kwargs["tool_choice"] = "auto"
+            kwargs["parallel_tool_calls"] = True

        session_id = params.get("session_id")
-        if not is_github_responses and session_id:
+        # xAI Responses takes prompt_cache_key in extra_body (set further
+        # down); GitHub Models opts out of cache-key routing entirely.
+        if not is_github_responses and not is_xai_responses and session_id:
            kwargs["prompt_cache_key"] = session_id

        if reasoning_enabled and is_xai_responses:
            from agent.model_metadata import grok_supports_reasoning_effort

-            kwargs["include"] = ["reasoning.encrypted_content"]
+            # NOTE: Hermes does NOT ask xAI to return ``reasoning.encrypted_content``
+            # any more.  xAI's OAuth/SuperGrok ``/v1/responses`` surface rejects
+            # replayed encrypted reasoning items on turn 2+ — see
+            # _chat_messages_to_responses_input docstring.  Requesting the field
+            # back would just have us cache something we then must strip.  Grok
+            # still reasons natively each turn; coherence across turns rides on
+            # the visible message text alone.
+            kwargs["include"] = []
            # xAI rejects `reasoning.effort` on grok-4 / grok-4-fast / grok-3
            # / grok-code-fast / grok-4.20-0309-* with HTTP 400 even though
            # those models reason natively. Only send the effort dial when
@@ -165,6 +182,17 @@ class ResponsesApiTransport(ProviderTransport):
            merged_extra_headers["x-grok-conv-id"] = session_id
            kwargs["extra_headers"] = merged_extra_headers

+            # xAI Responses cache-routing — body-level field per
+            # https://docs.x.ai/developers/advanced-api-usage/prompt-caching/maximizing-cache-hits.
+            # Sent via extra_body (not the typed kwarg) so it survives openai
+            # SDK builds whose Responses.stream() signature has dropped the field.
+            existing_extra_body = kwargs.get("extra_body")
+            merged_extra_body: Dict[str, Any] = {}
+            if isinstance(existing_extra_body, dict):
+                merged_extra_body.update(existing_extra_body)
+            merged_extra_body.setdefault("prompt_cache_key", session_id)
+            kwargs["extra_body"] = merged_extra_body
+
        return kwargs

    def normalize_response(self, response: Any, **kwargs) -> NormalizedResponse:
@@ -14,20 +14,28 @@ the user gets full Hermes capability inside a Codex turn.
 Scope (what we expose):
  - web_search, web_extract              — Firecrawl, no codex equivalent
  - browser_navigate / _click / _type /  — Camofox/Browserbase automation
-    _snapshot / _screenshot / _scroll / _back / _press / _vision
-  - delegate_task                        — Hermes subagents
+    _snapshot / _scroll / _back / _press /
+    _get_images / _console / _vision
  - vision_analyze                       — image inspection by vision model
  - image_generate                       — image generation
-  - memory                               — Hermes' persistent memory store
  - skill_view, skills_list              — Hermes' skill library
-  - session_search                       — cross-session search
  - text_to_speech                       — TTS
+  - kanban_* (complete/block/comment/    — kanban worker + orchestrator
+    heartbeat/show/list/create/            handoff (stateless: read env var,
+    unblock/link)                          write ~/.hermes/kanban.db)

-What we DO NOT expose (codex has equivalents):
+What we DO NOT expose:
  - terminal / shell                     — codex's own shell tool
  - read_file / write_file / patch       — codex's apply_patch + shell
  - search_files / process               — codex's shell
-  - clarify, todo                        — codex's own UX
+  - clarify                              — codex's own UX
+  - delegate_task / memory /             — `_AGENT_LOOP_TOOLS` in Hermes
+    session_search / todo                  (model_tools.py). They require
+                                           the running AIAgent context to
+                                           dispatch (mid-loop state), so a
+                                           stateless MCP callback can't
+                                           drive them. See the inline
+                                           comment on EXPOSED_TOOLS below.

 Run with: python -m agent.transports.hermes_tools_mcp_server
 Spawned by: CodexAppServerSession.ensure_started() when the runtime is
@@ -457,7 +457,7 @@ prompt_caching:
 # Two stores: MEMORY.md (agent's notes) and USER.md (user profile).
 # Character limits keep the memory small and focused. The agent manages
 # pruning -- when at the limit, it must consolidate or replace entries.
-# Disabled by default in batch_runner and RL environments.
+# Disabled by default in batch_runner.
 #
 memory:
  # Agent's personal notes: environment facts, conventions, things learned
@@ -681,6 +681,16 @@ platform_toolsets:
 #     # allowed_chats: ["-1001234567890"]
 #     extra:
 #       disable_link_previews: false  # Set true to suppress Telegram URL previews in bot messages
+#
+# Discord-specific settings (config.yaml top-level, not under platforms:):
+#
+# discord:
+#   require_mention: true            # Require @mention in server channels (default: true)
+#   auto_thread: true                # Auto-create thread on @mention (default: true)
+#   free_response_channels: ""       # Channel IDs where no mention is needed
+#   reactions: true                  # Show processing reactions (default: true)
+#   history_backfill: true           # Recover missed channel messages on mention (default: true)
+#   history_backfill_limit: 50       # Max messages to scan backwards (default: 50)

 # ─────────────────────────────────────────────────────────────────────────────
 # Available toolsets (use these names in platform_toolsets or the toolsets list)
@@ -705,10 +715,9 @@ platform_toolsets:
 #   todo         - todo (in-memory task planning, no deps)
 #   tts          - text_to_speech  (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX/MISTRAL key)
 #   cronjob      - cronjob (create/list/update/pause/resume/run/remove scheduled tasks)
-#   rl           - rl_list_environments, rl_start_training, etc. (requires TINKER_API_KEY)
 #
 # PRESETS (curated bundles):
-#   hermes-cli       - All of the above except rl + send_message
+#   hermes-cli       - All of the above except send_message
 #   hermes-telegram  - terminal, file, web, vision, image_gen, tts, browser,
 #                      skills, todo, cronjob, send_message
 #   hermes-discord   - Same as hermes-telegram
@@ -734,7 +743,6 @@ platform_toolsets:
 #   session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization)
 #   tts          - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax, Mistral)
 #   cronjob      - Schedule and manage automated tasks (CLI-only)
-#   rl           - RL training tools (Tinker-Atropos)
 #
 # Composite toolsets:
 #   debugging    - terminal + web + file (for troubleshooting)
@@ -1965,43 +1965,7 @@ def _resolve_attachment_path(raw_path: str) -> Path | None:
    return resolved


-def _format_process_notification(evt: dict) -> "str | None":
-    """Format a process notification event into a [IMPORTANT: ...] message.

-    Handles both completion events (notify_on_complete) and watch pattern
-    match events from the unified completion_queue.
-    """
-    evt_type = evt.get("type", "completion")
-    _sid = evt.get("session_id", "unknown")
-    _cmd = evt.get("command", "unknown")
-
-    if evt_type == "watch_disabled":
-        return f"[IMPORTANT: {evt.get('message', '')}]"
-
-    if evt_type == "watch_match":
-        _pat = evt.get("pattern", "?")
-        _out = evt.get("output", "")
-        _sup = evt.get("suppressed", 0)
-        text = (
-            f"[IMPORTANT: Background process {_sid} matched "
-            f"watch pattern \"{_pat}\".\n"
-            f"Command: {_cmd}\n"
-            f"Matched output:\n{_out}"
-        )
-        if _sup:
-            text += f"\n({_sup} earlier matches were suppressed by rate limit)"
-        text += "]"
-        return text
-
-    # Default: completion event
-    _exit = evt.get("exit_code", "?")
-    _out = evt.get("output", "")
-    return (
-        f"[IMPORTANT: Background process {_sid} completed "
-        f"(exit code {_exit}).\n"
-        f"Command: {_cmd}\n"
-        f"Output:\n{_out}]"
-    )


 def _detect_file_drop(user_input: str) -> "dict | None":
@@ -3235,25 +3199,27 @@ class HermesCLI:

    @staticmethod
    def _scrollback_box_width(width: Optional[int] = None) -> int:
-        """Return a resize-safe width for printed scrollback box rules.
+        """Return the full viewport width for printed scrollback box rules.

-        Lines already printed to terminal scrollback are reflowed by the
-        terminal emulator when the column count shrinks. A full-width response
-        border drawn at, say, 200 columns will wrap into two or three rows of
-        dashes after the user resizes to 80 columns, looking like duplicated
-        separator lines (the family of bugs tracked by #18449, #19280, #22976).
+        Previously this clamped to ``max(32, min(width, 56))`` as a defense
+        against terminal-emulator reflow on column-shrink (#25975, salvaging
+        #24403).  That clamp made response/reasoning borders look stubby on
+        any modern wide terminal.  We now trust the prompt_toolkit
+        ``_output_screen_diff`` monkey-patch landed in #26137 (salvaging
+        #25981) to keep chrome out of scrollback in the first place, and
+        accept that an aggressive column-shrink may visually reflow already
+        printed Panel borders — that's a cosmetic artifact of stamped
+        scrollback history, not a live-render bug.

-        Keep decorative scrollback boxes intentionally narrower than the
-        viewport so a moderate resize never triggers reflow. The live TUI
-        footer (status bar, input rule) still uses the full width — only
-        content that is *stamped into scrollback* needs this clamp.
+        A small floor (32 cols) is kept so the box still renders on tiny
+        terminals without negative ``'─' * (w - 2)`` math.
        """
        if width is None:
            try:
                width = shutil.get_terminal_size((80, 24)).columns
            except Exception:
                width = 80
-        return max(32, min(int(width or 80), 56))
+        return max(32, int(width or 80))

    def _tui_input_rule_height(self, position: str, width: Optional[int] = None) -> int:
        """Return the visible height for the top/bottom input separator rules."""
@@ -3368,8 +3334,11 @@ class HermesCLI:
            percent_label = f"{percent}%" if percent is not None else "--"
            duration_label = snapshot["duration"]

+            yolo_active = bool(os.getenv("HERMES_YOLO_MODE"))
            if width < 52:
                text = f"⚕ {snapshot['model_short']} · {duration_label}"
+                if yolo_active:
+                    text += " · ⚠ YOLO"
                return self._trim_status_bar_text(text, width)
            if width < 76:
                parts = [f"⚕ {snapshot['model_short']}", percent_label]
@@ -3377,6 +3346,8 @@ class HermesCLI:
                if compressions:
                    parts.append(f"🗜️ {compressions}")
                parts.append(duration_label)
+                if yolo_active:
+                    parts.append("⚠ YOLO")
                return self._trim_status_bar_text(" · ".join(parts), width)

            if snapshot["context_length"]:
@@ -3394,6 +3365,8 @@ class HermesCLI:
            prompt_elapsed = snapshot.get("prompt_elapsed")
            if prompt_elapsed:
                parts.append(prompt_elapsed)
+            if yolo_active:
+                parts.append("⚠ YOLO")
            return self._trim_status_bar_text(" │ ".join(parts), width)
        except Exception:
            return f"⚕ {self.model if getattr(self, 'model', None) else 'Hermes'}"
@@ -3410,6 +3383,7 @@ class HermesCLI:
            # line and produce duplicated status bar rows over long sessions.
            width = self._get_tui_terminal_width()
            duration_label = snapshot["duration"]
+            yolo_active = bool(os.getenv("HERMES_YOLO_MODE"))

            if width < 52:
                frags = [
@@ -3417,8 +3391,11 @@ class HermesCLI:
                    ("class:status-bar-strong", snapshot["model_short"]),
                    ("class:status-bar-dim", " · "),
                    ("class:status-bar-dim", duration_label),
-                    ("class:status-bar", " "),
                ]
+                if yolo_active:
+                    frags.append(("class:status-bar-dim", " · "))
+                    frags.append(("class:status-bar-yolo", "⚠ YOLO"))
+                frags.append(("class:status-bar", " "))
            else:
                percent = snapshot["context_percent"]
                percent_label = f"{percent}%" if percent is not None else "--"
@@ -3436,8 +3413,11 @@ class HermesCLI:
                    frags.extend([
                        ("class:status-bar-dim", " · "),
                        ("class:status-bar-dim", duration_label),
-                        ("class:status-bar", " "),
                    ])
+                    if yolo_active:
+                        frags.append(("class:status-bar-dim", " · "))
+                        frags.append(("class:status-bar-yolo", "⚠ YOLO"))
+                    frags.append(("class:status-bar", " "))
                else:
                    if snapshot["context_length"]:
                        ctx_total = _format_context_length(snapshot["context_length"])
@@ -3470,6 +3450,9 @@ class HermesCLI:
                    if prompt_elapsed:
                        frags.append(("class:status-bar-dim", " │ "))
                        frags.append(("class:status-bar-dim", prompt_elapsed))
+                    if yolo_active:
+                        frags.append(("class:status-bar-dim", " │ "))
+                        frags.append(("class:status-bar-yolo", "⚠ YOLO"))
                    frags.append(("class:status-bar", " "))

            total_width = sum(self._status_bar_display_width(text) for _, text in frags)
@@ -6216,6 +6199,38 @@ class HermesCLI:
        else:
            _cprint(f"  ↻ Resumed session {target_id}{title_part} — no messages, starting fresh.")

+    def _handle_sessions_command(self, cmd_original: str) -> None:
+        """Handle /sessions [list|<id_or_title>] — list or resume earlier sessions.
+
+        * ``/sessions``, ``/sessions list``, ``/sessions ls`` and
+          ``/sessions browse`` print the recent-sessions table inline, or
+          a notice when the session DB is unavailable or there is nothing
+          to show.
+        * Any other argument is forwarded to the resume flow, so
+          ``/sessions <id>`` and ``/resume <id>`` behave identically.
+
+        The TUI has an interactive picker overlay for this command; the
+        classic CLI has no equivalent overlay primitive and prints an
+        inline list instead.  Without this handler the canonical name
+        ``sessions`` would fall through ``process_command``'s elif chain
+        and report ``Unknown command: sessions`` even though the command
+        is registered in the central COMMAND_REGISTRY.
+        """
+        tokens = cmd_original.split(None, 1)
+        target = tokens[1].strip() if len(tokens) == 2 else ""
+        wants_listing = target == "" or target.lower() in ("list", "ls", "browse")
+
+        if not wants_listing:
+            # An explicit id/title goes through the same path as /resume.
+            self._handle_resume_command(f"/resume {target}")
+            return
+
+        # Listing mode: show recent sessions inline.
+        if not self._session_db:
+            from hermes_state import format_session_db_unavailable
+            _cprint(f"  {format_session_db_unavailable()}")
+        elif not self._show_recent_sessions(reason="sessions"):
+            _cprint("  (._.) No previous sessions yet.")
+
+
    def _handle_branch_command(self, cmd_original: str) -> None:
        """Handle /branch [name] — fork the current session into a new independent copy.

@@ -7795,6 +7810,8 @@ class HermesCLI:
            self.new_session(title=title)
        elif canonical == "resume":
            self._handle_resume_command(cmd_original)
+        elif canonical == "sessions":
+            self._handle_sessions_command(cmd_original)
        elif canonical == "model":
            self._handle_model_switch(cmd_original)
        elif canonical == "codex-runtime":
@@ -11719,11 +11736,13 @@ class HermesCLI:

        # Ensure tirith security scanner is available (downloads if needed).
        # Warn the user if tirith is enabled in config but not available,
-        # so they know command security scanning is degraded.
+        # so they know command security scanning is degraded.  Suppressed
+        # on platforms where tirith ships no binary (Windows etc.) — the
+        # user can't act on it and pattern-matching guards still run.
        try:
-            from tools.tirith_security import ensure_installed
+            from tools.tirith_security import ensure_installed, is_platform_supported
            tirith_path = ensure_installed(log_failures=False)
-            if tirith_path is None:
+            if tirith_path is None and is_platform_supported():
                security_cfg = self.config.get("security", {}) or {}
                tirith_enabled = security_cfg.get("tirith_enabled", True)
                if tirith_enabled:
@@ -13308,6 +13327,7 @@ class HermesCLI:
            'status-bar-warn': 'bg:#1a1a2e #FFD700 bold',
            'status-bar-bad': 'bg:#1a1a2e #FF8C00 bold',
            'status-bar-critical': 'bg:#1a1a2e #FF6B6B bold',
+            'status-bar-yolo': 'bg:#1a1a2e #FF4444 bold',
            # Bronze horizontal rules around the input area
            'input-rule': '#CD7F32',
            # Clipboard image attachment badges
@@ -13464,16 +13484,8 @@ class HermesCLI:
                            # and watch pattern matches) while agent is idle.
                            try:
                                from tools.process_registry import process_registry
-                                if not process_registry.completion_queue.empty():
-                                    evt = process_registry.completion_queue.get_nowait()
-                                    # Skip if the agent already consumed this via wait/poll/log
-                                    _evt_sid = evt.get("session_id", "")
-                                    if evt.get("type") == "completion" and process_registry.is_completion_consumed(_evt_sid):
-                                        pass  # already delivered via tool result
-                                    else:
-                                        _synth = _format_process_notification(evt)
-                                        if _synth:
-                                            self._pending_input.put(_synth)
+                                for _evt, _synth in process_registry.drain_notifications():
+                                    self._pending_input.put(_synth)
                            except Exception:
                                pass
                        continue
@@ -13581,15 +13593,8 @@ class HermesCLI:
                        # that arrived while the agent was running.
                        try:
                            from tools.process_registry import process_registry
-                            while not process_registry.completion_queue.empty():
-                                evt = process_registry.completion_queue.get_nowait()
-                                # Skip if the agent already consumed this via wait/poll/log
-                                _evt_sid = evt.get("session_id", "")
-                                if evt.get("type") == "completion" and process_registry.is_completion_consumed(_evt_sid):
-                                    continue  # already delivered via tool result
-                                _synth = _format_process_notification(evt)
-                                if _synth:
-                                    self._pending_input.put(_synth)
+                            for _evt, _synth in process_registry.drain_notifications():
+                                self._pending_input.put(_synth)
                        except Exception:
                            pass  # Non-fatal — don't break the main loop

@@ -13721,6 +13726,30 @@ class HermesCLI:
            self._print_exit_summary()
            return

+        # On macOS with uv-managed Python, kqueue's selector cannot register
+        # fd 0, raising OSError(EINVAL) from kqueue.control() when prompt_toolkit
+        # calls loop.add_reader (#6393). Probe kqueue and, if it can't watch
+        # stdin, switch to a SelectSelector-backed event loop policy.
+        if sys.platform == "darwin":
+            try:
+                import selectors as _selectors
+                if hasattr(_selectors, "KqueueSelector"):
+                    _kq = _selectors.KqueueSelector()
+                    try:
+                        _kq.register(0, _selectors.EVENT_READ)
+                        _kq.unregister(0)
+                    finally:
+                        _kq.close()
+            except (OSError, ValueError, KeyError):
+                import asyncio as _aio_probe
+                import selectors as _selectors
+
+                class _SelectEventLoopPolicy(_aio_probe.DefaultEventLoopPolicy):
+                    def new_event_loop(self):
+                        return _aio_probe.SelectorEventLoop(_selectors.SelectSelector())
+
+                _aio_probe.set_event_loop_policy(_SelectEventLoopPolicy())
+
        # Run the application with patch_stdout for proper output handling
        try:
            with patch_stdout():
@@ -13741,12 +13770,20 @@ class HermesCLI:
        except (KeyError, OSError) as _stdin_err:
            # Catch selector registration failures from broken stdin (#6393)
            # and I/O errors from broken stdout during interrupt (#13710).
-            if isinstance(_stdin_err, OSError) and getattr(_stdin_err, "errno", None) == errno.EIO:
+            _errno = getattr(_stdin_err, "errno", None) if isinstance(_stdin_err, OSError) else None
+            _msg = str(_stdin_err)
+            if _errno == errno.EIO:
                pass  # suppress broken-stdout I/O errors on interrupt (#13710)
-            elif "is not registered" in str(_stdin_err) or "Bad file descriptor" in str(_stdin_err):
+            elif (
+                _errno in (errno.EINVAL, errno.EBADF)
+                or "is not registered" in _msg
+                or "Bad file descriptor" in _msg
+                or "Invalid argument" in _msg
+            ):
                print(
                    f"\nError: stdin is not usable ({_stdin_err}).\n"
-                    "This can happen with certain Python installations (e.g. uv-managed cPython on macOS).\n"
+                    "This can happen with certain Python installations (e.g. uv-managed cPython on macOS)\n"
+                    "where kqueue cannot register fd 0.\n"
                    "Try reinstalling Python via pyenv or Homebrew, then re-run: hermes setup"
                )
            else:
@@ -645,6 +645,44 @@ def get_job(job_id: str) -> Optional[Dict[str, Any]]:
    return None


+class AmbiguousJobReference(LookupError):
+    """Raised when a job name (``ref``) matches more than one job (``matches``)."""
+
+    def __init__(self, ref: str, matches: List[Dict[str, Any]]) -> None:
+        self.ref = ref  # the reference exactly as the caller supplied it
+        self.matches = matches  # the job records that share this name
+        ids = ", ".join(str(m["id"]) for m in matches)  # str(): never fail on a non-str ID
+        super().__init__(
+            f"Job name '{ref}' is ambiguous — matches {len(matches)} jobs: {ids}. "
+            "Use the job ID instead."
+        )
+
+
+def resolve_job_ref(ref: str) -> Optional[Dict[str, Any]]:
+    """Look up a job by ID or, failing that, by name.
+
+    An exact ID match always wins, even when another job's name equals that ID.
+    Names are compared case-insensitively.  Returns the normalized record, or
+    None when ``ref`` is empty or matches nothing.  A name shared by several
+    jobs raises AmbiguousJobReference (carrying every match) instead of guessing.
+    """
+    if not ref:
+        return None
+    all_jobs = load_jobs()
+    # Pass 1: IDs are authoritative, so check them before any name.
+    by_id = next((rec for rec in all_jobs if rec["id"] == ref), None)
+    if by_id is not None:
+        return _normalize_job_record(by_id)
+    # Pass 2: case-insensitive name comparison; a missing/None name reads as "".
+    wanted = ref.lower()
+    candidates = [rec for rec in all_jobs if (rec.get("name") or "").lower() == wanted]
+    if len(candidates) == 1:
+        return _normalize_job_record(candidates[0])
+    if candidates:
+        raise AmbiguousJobReference(ref, [_normalize_job_record(c) for c in candidates])
+    return None
+
+
 def list_jobs(include_disabled: bool = False) -> List[Dict[str, Any]]:
    """List all jobs, optionally including disabled ones."""
    jobs = [_normalize_job_record(j) for j in load_jobs()]
@@ -702,9 +740,12 @@ def update_job(job_id: str, updates: Dict[str, Any]) -> Optional[Dict[str, Any]]


 def pause_job(job_id: str, reason: Optional[str] = None) -> Optional[Dict[str, Any]]:
-    """Pause a job without deleting it."""
+    """Pause a job without deleting it. Accepts a job ID or name."""
+    job = resolve_job_ref(job_id)
+    if not job:
+        return None
    return update_job(
-        job_id,
+        job["id"],
        {
            "enabled": False,
            "state": "paused",
@@ -715,14 +756,14 @@ def pause_job(job_id: str, reason: Optional[str] = None) -> Optional[Dict[str, A


 def resume_job(job_id: str) -> Optional[Dict[str, Any]]:
-    """Resume a paused job and compute the next future run from now."""
-    job = get_job(job_id)
+    """Resume a paused job and compute the next future run from now. Accepts a job ID or name."""
+    job = resolve_job_ref(job_id)
    if not job:
        return None

    next_run_at = compute_next_run(job["schedule"])
    return update_job(
-        job_id,
+        job["id"],
        {
            "enabled": True,
            "state": "scheduled",
@@ -734,12 +775,12 @@ def resume_job(job_id: str) -> Optional[Dict[str, Any]]:


 def trigger_job(job_id: str) -> Optional[Dict[str, Any]]:
-    """Schedule a job to run on the next scheduler tick."""
-    job = get_job(job_id)
+    """Schedule a job to run on the next scheduler tick. Accepts a job ID or name."""
+    job = resolve_job_ref(job_id)
    if not job:
        return None
    return update_job(
-        job_id,
+        job["id"],
        {
            "enabled": True,
            "state": "scheduled",
@@ -751,14 +792,18 @@ def trigger_job(job_id: str) -> Optional[Dict[str, Any]]:


 def remove_job(job_id: str) -> bool:
-    """Remove a job by ID."""
+    """Remove a job by ID or name."""
+    job = resolve_job_ref(job_id)
+    if not job:
+        return False
+    canonical_id = job["id"]
    jobs = load_jobs()
    original_len = len(jobs)
-    jobs = [j for j in jobs if j["id"] != job_id]
+    jobs = [j for j in jobs if j["id"] != canonical_id]
    if len(jobs) < original_len:
        save_jobs(jobs)
        # Clean up output directory to prevent orphaned dirs accumulating
-        job_output_dir = OUTPUT_DIR / job_id
+        job_output_dir = OUTPUT_DIR / canonical_id
        if job_output_dir.exists():
            shutil.rmtree(job_output_dir)
        return True
@@ -464,7 +464,14 @@ def _send_media_via_adapter(
            else:
                coro = adapter.send_document(chat_id=chat_id, file_path=media_path, metadata=metadata)

-            future = asyncio.run_coroutine_threadsafe(coro, loop)
+            from agent.async_utils import safe_schedule_threadsafe
+            future = safe_schedule_threadsafe(coro, loop)
+            if future is None:
+                logger.warning(
+                    "Job '%s': cannot send media %s, gateway loop unavailable",
+                    job.get("id", "?"), media_path,
+                )
+                return
            try:
                result = future.result(timeout=30)
            except TimeoutError:
@@ -585,22 +592,26 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option
                text_to_send = cleaned_delivery_content.strip()
                adapter_ok = True
                if text_to_send:
-                    future = asyncio.run_coroutine_threadsafe(
+                    from agent.async_utils import safe_schedule_threadsafe
+                    future = safe_schedule_threadsafe(
                        runtime_adapter.send(chat_id, text_to_send, metadata=send_metadata),
                        loop,
                    )
-                    try:
-                        send_result = future.result(timeout=60)
-                    except TimeoutError:
-                        future.cancel()
-                        raise
-                    if send_result and not getattr(send_result, "success", True):
-                        err = getattr(send_result, "error", "unknown")
-                        logger.warning(
-                            "Job '%s': live adapter send to %s:%s failed (%s), falling back to standalone",
-                            job["id"], platform_name, chat_id, err,
-                        )
-                        adapter_ok = False  # fall through to standalone path
+                    if future is None:
+                        adapter_ok = False
+                    else:
+                        try:
+                            send_result = future.result(timeout=60)
+                        except TimeoutError:
+                            future.cancel()
+                            raise
+                        if send_result and not getattr(send_result, "success", True):
+                            err = getattr(send_result, "error", "unknown")
+                            logger.warning(
+                                "Job '%s': live adapter send to %s:%s failed (%s), falling back to standalone",
+                                job["id"], platform_name, chat_id, err,
+                            )
+                            adapter_ok = False  # fall through to standalone path

                # Send extracted media files as native attachments via the live adapter
                if adapter_ok and media_files:
@@ -1,324 +0,0 @@
-# Hermes-Agent Atropos Environments
-
-This directory contains the integration layer between **hermes-agent's** tool-calling capabilities and the **Atropos** RL training framework. It provides everything needed to run agentic LLMs through multi-turn tool-calling loops, score their output with arbitrary reward functions, and feed results into Atropos for training or evaluation.
-
-## Architecture Overview
-
-```
-                        Atropos Framework
-                    ┌───────────────────────┐
-                    │       BaseEnv          │  (atroposlib)
-                    │  - Server management   │
-                    │  - Worker scheduling   │
-                    │  - Wandb logging       │
-                    │  - CLI (serve/process/ │
-                    │    evaluate)           │
-                    └───────────┬───────────┘
-                                │ inherits
-                    ┌───────────┴───────────┐
-                    │  HermesAgentBaseEnv    │  hermes_base_env.py
-                    │  - Terminal backend    │
-                    │  - Tool resolution     │
-                    │  - Agent loop          │
-                    │  - ToolContext          │
-                    │  - Async patches       │
-                    └───────────┬───────────┘
-                                │ inherits
-              ┌─────────────────┼─────────────────┐
-              │                 │                  │
-     TerminalTestEnv     HermesSweEnv    TerminalBench2EvalEnv
-     (stack testing)     (SWE training)   (TB2 benchmark eval)
-```
-
-### Inheritance Chain
-
-**BaseEnv** (from `atroposlib`) is the Atropos base class. It provides:
- Server management (OpenAI-compatible API servers, VLLM, SGLang)
- Worker scheduling for parallel rollouts
- Wandb integration for metrics and rollout logging
- CLI interface with three subcommands: `serve`, `process`, `evaluate`
- `evaluate_log()` for saving eval results to JSON + samples.jsonl
-
-**HermesAgentBaseEnv** (`hermes_base_env.py`) extends BaseEnv with hermes-agent specifics:
- Sets `os.environ["TERMINAL_ENV"]` to configure the terminal backend (local, docker, ssh, singularity, modal, daytona, vercel_sandbox)
- Resolves hermes-agent toolsets via `_resolve_tools_for_group()` (calls `get_tool_definitions()` which queries `tools/registry.py`)
- Implements `collect_trajectory()` which runs the full agent loop and computes rewards
- Supports two-phase operation (Phase 1: OpenAI server, Phase 2: VLLM ManagedServer)
- Applies monkey patches for async-safe tool operation at import time
-
-Concrete environments inherit from `HermesAgentBaseEnv` and implement:
- `setup()` -- Load dataset, initialize state
- `get_next_item()` -- Return the next item for rollout
- `format_prompt()` -- Convert a dataset item into the user message
- `compute_reward()` -- Score the rollout using ToolContext
- `evaluate()` -- Periodic evaluation logic
-
-## Core Components
-
-### Agent Loop (`agent_loop.py`)
-
-`HermesAgentLoop` is the reusable multi-turn agent engine. It runs the same pattern as hermes-agent's `run_agent.py`:
-
-1. Send messages + tools to the API via `server.chat_completion()`
-2. If the response contains `tool_calls`, execute each one via `handle_function_call()` (which delegates to `tools/registry.py`'s `dispatch()`)
-3. Append tool results to the conversation and go back to step 1
-4. If the response has no tool_calls, the agent is done
-
-Tool calls are executed in a thread pool (`run_in_executor`) so backends that use `asyncio.run()` internally (Modal, Docker) don't deadlock inside Atropos's event loop.
-
-Returns an `AgentResult` containing the full conversation history, turn count, reasoning content per turn, tool errors, and optional ManagedServer state (for Phase 2).
-
-### Tool Context (`tool_context.py`)
-
-`ToolContext` is a per-rollout handle that gives reward/verification functions direct access to **all** hermes-agent tools, scoped to the rollout's `task_id`. The same `task_id` means the terminal/browser session is the SAME one the model used during its rollout -- all state (files, processes, browser tabs) is preserved.
-
-```python
-async def compute_reward(self, item, result, ctx: ToolContext):
-    # Run tests in the model's terminal sandbox
-    test = ctx.terminal("pytest -v")
-    if test["exit_code"] == 0:
-        return 1.0
-
-    # Check if a file was created
-    content = ctx.read_file("/workspace/solution.py")
-    if content.get("content"):
-        return 0.5
-
-    # Download files locally for verification (binary-safe)
-    ctx.download_file("/remote/output.bin", "/local/output.bin")
-
-    return 0.0
-```
-
-Available methods:
- **Terminal**: `terminal(command, timeout)` -- run shell commands
- **Files**: `read_file(path)`, `write_file(path, content)`, `search(query, path)`
- **Transfers**: `upload_file()`, `upload_dir()`, `download_file()`, `download_dir()` -- binary-safe file transfers between host and sandbox
- **Web**: `web_search(query)`, `web_extract(urls)`
- **Browser**: `browser_navigate(url)`, `browser_snapshot()`
- **Generic**: `call_tool(name, args)` -- call any hermes-agent tool by name
- **Cleanup**: `cleanup()` -- release all resources (called automatically after `compute_reward`)
-
-### Patches (`patches.py`)
-
-**Problem**: Some hermes-agent tools use `asyncio.run()` internally (e.g., the Modal backend). This crashes when called from inside Atropos's event loop because `asyncio.run()` cannot be nested.
-
-**Solution**: `ModalEnvironment` uses a dedicated `_AsyncWorker` background thread with its own event loop. The calling code sees a sync interface, but internally all async Modal SDK calls happen on the worker thread so they don't conflict with Atropos's loop. This is built directly into `tools/environments/modal.py` — no monkey-patching required.
-
-`patches.py` is now a no-op (kept for backward compatibility with imports).
-
-### Tool Call Parsers (`tool_call_parsers/`)
-
-Client-side parsers that extract structured `tool_calls` from raw model output text. Used in **Phase 2** (VLLM server type) where ManagedServer's `/generate` endpoint returns raw text without tool call parsing.
-
-Each parser is a standalone reimplementation of the corresponding VLLM parser's `extract_tool_calls()` logic. No VLLM dependency -- only standard library (`re`, `json`, `uuid`) and `openai` types.
-
-Available parsers:
- `hermes` -- Hermes/ChatML `<tool_call>` XML format
- `mistral` -- Mistral `[TOOL_CALLS]` format
- `llama3_json` -- Llama 3 JSON tool calling
- `qwen` -- Qwen tool calling format
- `qwen3_coder` -- Qwen3 Coder format
- `deepseek_v3` -- DeepSeek V3 format
- `deepseek_v3_1` -- DeepSeek V3.1 format
- `kimi_k2` -- Kimi K2 format
- `longcat` -- Longcat format
- `glm45` / `glm47` -- GLM model formats
-
-Usage:
-```python
-from environments.tool_call_parsers import get_parser
-
-parser = get_parser("hermes")
-content, tool_calls = parser.parse(raw_model_output)
-```
-
-In Phase 1 (OpenAI server type), these parsers are not needed -- the server handles tool call parsing natively.
-
-## Two-Phase Operation
-
-### Phase 1: OpenAI Server (Evaluation / SFT Data Generation)
-
-Uses `server.chat_completion()` with `tools=` parameter. The server (VLLM, SGLang, OpenRouter, OpenAI) handles tool call parsing natively. Returns `ChatCompletion` objects with structured `tool_calls`.
-
- Good for: evaluation, SFT data generation, testing
- Run with: `serve` (with `run-api`), `process`, or `evaluate` subcommands
- Placeholder tokens are created for the Atropos pipeline
-
-### Phase 2: VLLM ManagedServer (Full RL Training)
-
-Uses ManagedServer for exact token IDs + logprobs via `/generate`. Client-side tool call parser (from `tool_call_parsers/`) reconstructs structured `tool_calls` from raw output.
-
- Good for: full RL training with GRPO/PPO
- Run with: `serve` subcommand
- Real tokens, masks, and logprobs flow through the pipeline
-
-## Directory Structure
-
-```
-environments/
-├── README.md                     # This file
-├── __init__.py                   # Package exports
-├── hermes_base_env.py            # Abstract base (HermesAgentBaseEnv)
-├── agent_loop.py                 # Multi-turn agent engine (HermesAgentLoop)
-├── tool_context.py               # Per-rollout tool access for reward functions
-├── patches.py                    # Async-safety patches for Modal backend
-│
-├── tool_call_parsers/            # Phase 2 client-side parsers
-│   ├── __init__.py               # Registry + base class
-│   ├── hermes_parser.py
-│   ├── mistral_parser.py
-│   ├── llama_parser.py
-│   ├── qwen_parser.py
-│   ├── qwen3_coder_parser.py
-│   ├── deepseek_v3_parser.py
-│   ├── deepseek_v3_1_parser.py
-│   ├── kimi_k2_parser.py
-│   ├── longcat_parser.py
-│   ├── glm45_parser.py
-│   └── glm47_parser.py
-│
-├── terminal_test_env/            # Stack validation environment
-│   └── terminal_test_env.py
-│
-├── hermes_swe_env/               # SWE-bench style training environment
-│   └── hermes_swe_env.py
-│
-└── benchmarks/                   # Evaluation benchmarks
-    ├── terminalbench_2/          # 89 terminal tasks, Modal sandboxes
-    │   └── terminalbench2_env.py
-    ├── tblite/                   # 100 calibrated tasks (fast TB2 proxy)
-    │   └── tblite_env.py
-    └── yc_bench/                 # Long-horizon strategic benchmark
-        └── yc_bench_env.py
-```
-
-## Concrete Environments
-
-### TerminalTestEnv (`terminal_test_env/`)
-
-A self-contained environment with inline tasks (no external dataset needed) for validating the full stack end-to-end. Each task asks the model to create a file at a known path, and the verifier checks the content matches.
-
-```bash
-# Serve mode (needs run-api)
-run-api
-python environments/terminal_test_env/terminal_test_env.py serve
-
-# Process mode (no run-api, saves to JSONL)
-python environments/terminal_test_env/terminal_test_env.py process \
-    --env.data_path_to_save_groups terminal_test_output.jsonl
-```
-
-### HermesSweEnv (`hermes_swe_env/`)
-
-SWE-bench style training environment. The model gets a coding task, uses terminal + file + web tools to solve it, and the reward function runs tests in the same Modal sandbox.
-
-```bash
-python environments/hermes_swe_env/hermes_swe_env.py serve \
-    --openai.model_name YourModel \
-    --env.dataset_name bigcode/humanevalpack \
-    --env.terminal_backend modal
-```
-
-### TerminalBench2EvalEnv (`benchmarks/terminalbench_2/`)
-
-**Eval-only** environment for the Terminal-Bench 2.0 benchmark (89 tasks). Each task gets a pre-built Docker Hub image, a natural language instruction, and a test suite. The agent uses terminal + file tools to solve the task, then the test suite verifies correctness.
-
-Follows the standard Atropos eval pattern (like GPQA, MMLU, etc.):
- Run via `evaluate` subcommand (no `run-api` needed)
- `setup()` loads the dataset, `evaluate()` runs all tasks
- `rollout_and_score_eval()` handles per-task agent loop + test verification
- Downloads verifier output locally for reliable reward checking (Harbor pattern)
-
-```bash
-# Run full benchmark
-python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
-    --openai.model_name anthropic/claude-opus-4.6
-
-# Run subset of tasks
-python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
-    --openai.model_name anthropic/claude-opus-4.6 \
-    --env.task_filter fix-git,git-multibranch
-
-# Skip specific tasks
-python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
-    --openai.model_name anthropic/claude-opus-4.6 \
-    --env.skip_tasks heavy-task,slow-task
-```
-
-## Creating a New Environment
-
-### Training Environment
-
-1. Create a new directory under `environments/`
-2. Create your env file inheriting from `HermesAgentBaseEnv`
-3. Implement the four abstract methods + `evaluate()`
-
-```python
-from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
-
-class MyEnvConfig(HermesAgentEnvConfig):
-    pass  # Add custom fields as needed
-
-class MyEnv(HermesAgentBaseEnv):
-    name = "my-env"
-    env_config_cls = MyEnvConfig
-
-    @classmethod
-    def config_init(cls):
-        env_config = MyEnvConfig(
-            enabled_toolsets=["terminal", "file"],
-            terminal_backend="modal",
-            # ... other config
-        )
-        server_configs = [APIServerConfig(...)]
-        return env_config, server_configs
-
-    async def setup(self):
-        self.dataset = load_dataset(...)
-        self.iter = 0
-
-    async def get_next_item(self):
-        item = self.dataset[self.iter % len(self.dataset)]
-        self.iter += 1
-        return item
-
-    def format_prompt(self, item):
-        return item["instruction"]
-
-    async def compute_reward(self, item, result, ctx):
-        # ctx gives you full tool access to the rollout's sandbox
-        test = ctx.terminal("pytest -v")
-        return 1.0 if test["exit_code"] == 0 else 0.0
-
-    async def evaluate(self, *args, **kwargs):
-        # Periodic evaluation logic
-        ...
-
-if __name__ == "__main__":
-    MyEnv.cli()
-```
-
-### Eval-Only Environment (Benchmark)
-
-For eval benchmarks, follow the pattern in `terminalbench2_env.py`:
-1. Create under `environments/benchmarks/your-benchmark/`
-2. Inherit from `HermesAgentBaseEnv`
-3. Set eval-only config: `eval_handling=STOP_TRAIN`, `steps_per_eval=1`, `total_steps=1`
-4. Stub the training methods (`collect_trajectories`, `score`)
-5. Implement `rollout_and_score_eval()` and `evaluate()`
-6. Run with `evaluate` subcommand
-
-## Key Config Fields
-
-| Field | Description | Default |
-|-------|-------------|---------|
-| `enabled_toolsets` | Which hermes toolsets to enable | `None` (all) |
-| `disabled_toolsets` | Toolsets to disable | `None` |
-| `distribution` | Probabilistic toolset distribution name | `None` |
-| `max_agent_turns` | Max LLM calls per rollout | `30` |
-| `agent_temperature` | Sampling temperature | `1.0` |
-| `terminal_backend` | `local`, `docker`, `modal`, `daytona`, `ssh`, `singularity` | `local` |
-| `system_prompt` | System message for the agent | `None` |
-| `tool_call_parser` | Parser name for Phase 2 | `hermes` |
-| `eval_handling` | `STOP_TRAIN`, `LIMIT_TRAIN`, `NONE` | `STOP_TRAIN` |
@@ -1,36 +0,0 @@
-"""
-Hermes-Agent Atropos Environments
-
-Provides a layered integration between hermes-agent's tool-calling capabilities
-and the Atropos RL training framework.
-
-Core layers:
-    - agent_loop: Reusable multi-turn agent loop with standard OpenAI-spec tool calling
-    - tool_context: Per-rollout tool access handle for reward/verification functions
-    - hermes_base_env: Abstract base environment (BaseEnv subclass) for Atropos
-    - tool_call_parsers: Client-side tool call parser registry for Phase 2 (VLLM /generate)
-
-Concrete environments:
-    - terminal_test_env/: Simple file-creation tasks for testing the stack
-    - hermes_swe_env/: SWE-bench style tasks with Modal sandboxes
-
-Benchmarks (eval-only):
-    - benchmarks/terminalbench_2/: Terminal-Bench 2.0 evaluation
-"""
-
-try:
-    from environments.agent_loop import AgentResult, HermesAgentLoop
-    from environments.tool_context import ToolContext
-    from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
-except ImportError:
-    # atroposlib not installed — environments are unavailable but
-    # submodules like tool_call_parsers can still be imported directly.
-    pass
-
-__all__ = [
-    "AgentResult",
-    "HermesAgentLoop",
-    "ToolContext",
-    "HermesAgentBaseEnv",
-    "HermesAgentEnvConfig",
-]
@@ -1,534 +0,0 @@
-"""
-HermesAgentLoop -- Reusable Multi-Turn Agent Engine
-
-Runs the hermes-agent tool-calling loop using standard OpenAI-spec tool calling.
-Works with any server that returns ChatCompletion objects with tool_calls:
-    - Phase 1: OpenAI server type (VLLM, SGLang, OpenRouter, OpenAI API)
-    - Phase 2: ManagedServer with client-side tool call parser
-
-The loop passes tools= and checks response.choices[0].message.tool_calls,
-identical to hermes-agent's run_agent.py. Tool execution is dispatched via
-handle_function_call() from model_tools.py.
-"""
-
-import asyncio
-import concurrent.futures
-import json
-import logging
-import os
-import uuid
-from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Set
-
-from model_tools import handle_function_call
-from tools.terminal_tool import get_active_env
-from tools.tool_result_storage import maybe_persist_tool_result, enforce_turn_budget
-
-# Shared thread pool for sync tool calls that internally use asyncio.run()
-# (e.g., the Modal/Docker/Daytona terminal backends). Running them in a separate
-# thread gives them a clean event loop so they don't deadlock inside Atropos's loop.
-# The pool must be large enough for concurrent eval tasks (e.g., 89 TB2 tasks all
-# making tool calls): if it is too small the pool starves and tasks queue for minutes.
-# 128 is only the import-time default; the pool is resized at runtime by
-# HermesAgentBaseEnv.__init__ via resize_tool_pool().
-_tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=128)
-
-
-def resize_tool_pool(max_workers: int):
-    """
-    Replace the global tool executor with a new one of the given size.
-
-    Called by HermesAgentBaseEnv.__init__ based on config.tool_pool_size.
-    Safe to call before any tasks are submitted.
-    """
-    global _tool_executor
-    old_executor = _tool_executor
-    _tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
-    old_executor.shutdown(wait=False)
-    logger.info("Tool thread pool resized to %d workers", max_workers)
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class ToolError:
-    """Record of a tool execution error during the agent loop.
-
-    The loop appends one record per failed call: unknown tool name, arguments
-    that are not valid JSON, an exception raised while dispatching, or a tool
-    result that reports an error together with a negative exit code.
-    """
-
-    turn: int                  # 1-based turn on which the error occurred
-    tool_name: str             # Name of the tool the model tried to call
-    arguments: str             # Raw arguments as sent by the model (first 200 chars)
-    error: str                 # Short description of what went wrong
-    tool_result: str           # The result text that was returned to the model
-
-
-@dataclass
-class AgentResult:
-    """Result of running the agent loop.
-
-    Returned by HermesAgentLoop.run() on every exit path (natural finish,
-    API failure, empty response, or max_turns reached).
-    """
-
-    # Full conversation history in OpenAI message format (the same list object
-    # that was passed to run(), extended in place)
-    messages: List[Dict[str, Any]]
-    # ManagedServer.get_state() if available (Phase 2), None otherwise
-    managed_state: Optional[Dict[str, Any]] = None
-    # How many LLM calls were made (a failed or empty final call is counted)
-    turns_used: int = 0
-    # True if model stopped calling tools naturally (vs hitting max_turns
-    # or aborting on an API error / empty response)
-    finished_naturally: bool = False
-    # Extracted reasoning content, one entry per turn that produced a response;
-    # None for turns without reasoning (from PR #297 helpers)
-    reasoning_per_turn: List[Optional[str]] = field(default_factory=list)
-    # Tool errors encountered during the loop
-    tool_errors: List[ToolError] = field(default_factory=list)
-
-
-def _extract_reasoning_from_message(message) -> Optional[str]:
-    """
-    Extract reasoning content from a ChatCompletion message.
-
-    Handles multiple provider formats:
-    1. message.reasoning_content field (some providers)
-    2. message.reasoning field (some providers)
-    3. message.reasoning_details[].text (OpenRouter style)
-
-    Note: <think> block extraction from content is NOT done here -- that's
-    handled by the response already in Phase 1 (server does it) or by
-    ManagedServer's patch in Phase 2.
-
-    Args:
-        message: The assistant message from ChatCompletion response
-
-    Returns:
-        Extracted reasoning text, or None if not found
-    """
-    # Check reasoning_content field (common across providers)
-    if hasattr(message, "reasoning_content") and message.reasoning_content:
-        return message.reasoning_content
-
-    # Check reasoning field
-    if hasattr(message, "reasoning") and message.reasoning:
-        return message.reasoning
-
-    # Check reasoning_details (OpenRouter style)
-    if hasattr(message, "reasoning_details") and message.reasoning_details:
-        for detail in message.reasoning_details:
-            if hasattr(detail, "text") and detail.text:
-                return detail.text
-            if isinstance(detail, dict) and detail.get("text"):
-                return detail["text"]
-
-    return None
-
-
-class HermesAgentLoop:
-    """
-    Runs hermes-agent's tool-calling loop using standard OpenAI-spec tool calling.
-
-    Same pattern as run_agent.py:
-    - Pass tools= to the API
-    - Check response.choices[0].message.tool_calls
-    - Dispatch via handle_function_call()
-
-    Works identically with any server type -- OpenAI, VLLM, SGLang, OpenRouter,
-    or ManagedServer with a parser. The server determines how tool_calls get
-    populated on the response.
-    """
-
-    def __init__(
-        self,
-        server,
-        tool_schemas: List[Dict[str, Any]],
-        valid_tool_names: Set[str],
-        max_turns: int = 30,
-        task_id: Optional[str] = None,
-        temperature: float = 1.0,
-        max_tokens: Optional[int] = None,
-        extra_body: Optional[Dict[str, Any]] = None,
-        budget_config: Optional["BudgetConfig"] = None,
-    ):
-        """
-        Initialize the agent loop.
-
-        Args:
-            server: Server object with chat_completion() method (OpenAIServer,
-                    ManagedServer, ServerManager, etc.)
-            tool_schemas: OpenAI-format tool definitions from get_tool_definitions()
-            valid_tool_names: Set of tool names the model is allowed to call
-            max_turns: Maximum number of LLM calls before stopping
-            task_id: Unique ID for terminal/browser session isolation
-            temperature: Sampling temperature for generation
-            max_tokens: Max tokens per generation (None for server default)
-            extra_body: Extra parameters passed to the OpenAI client's create() call.
-                        Used for OpenRouter provider preferences, transforms, etc.
-                        e.g. {"provider": {"ignore": ["DeepInfra"]}}
-            budget_config: Tool result persistence budget. Controls per-tool
-                        thresholds, per-turn aggregate budget, and preview size.
-                        If None, uses DEFAULT_BUDGET (current hardcoded values).
-        """
-        from tools.budget_config import DEFAULT_BUDGET
-        self.server = server
-        self.tool_schemas = tool_schemas
-        self.valid_tool_names = valid_tool_names
-        self.max_turns = max_turns
-        self.task_id = task_id or str(uuid.uuid4())
-        self.temperature = temperature
-        self.max_tokens = max_tokens
-        self.extra_body = extra_body
-        self.budget_config = budget_config or DEFAULT_BUDGET
-
-    async def run(self, messages: List[Dict[str, Any]]) -> AgentResult:
-        """
-        Execute the full agent loop using standard OpenAI tool calling.
-
-        Args:
-            messages: Initial conversation messages (system + user).
-                      Modified in-place as the conversation progresses.
-
-        Returns:
-            AgentResult with full conversation history, managed state, and metadata
-        """
-        reasoning_per_turn = []
-        tool_errors: List[ToolError] = []
-
-        # Per-loop TodoStore for the todo tool (ephemeral, dies with the loop)
-        from tools.todo_tool import TodoStore, todo_tool as _todo_tool
-        _todo_store = TodoStore()
-
-        # Extract user task from first user message for browser_snapshot context
-        _user_task = None
-        for msg in messages:
-            if msg.get("role") == "user":
-                content = msg.get("content", "")
-                if isinstance(content, str) and content.strip():
-                    _user_task = content.strip()[:500]  # Cap to avoid huge strings
-                break
-
-        import time as _time
-
-        for turn in range(self.max_turns):
-            turn_start = _time.monotonic()
-
-            # Build the chat_completion kwargs
-            chat_kwargs = {
-                "messages": messages,
-                "n": 1,
-                "temperature": self.temperature,
-            }
-
-            # Only pass tools if we have them
-            if self.tool_schemas:
-                chat_kwargs["tools"] = self.tool_schemas
-
-            # Only pass max_tokens if explicitly set
-            if self.max_tokens is not None:
-                chat_kwargs["max_tokens"] = self.max_tokens
-
-            # Inject extra_body for provider-specific params (e.g., OpenRouter
-            # provider preferences like banned/preferred providers, transforms)
-            if self.extra_body:
-                chat_kwargs["extra_body"] = self.extra_body
-
-            # Make the API call -- standard OpenAI spec
-            api_start = _time.monotonic()
-            try:
-                response = await self.server.chat_completion(**chat_kwargs)
-            except Exception as e:
-                api_elapsed = _time.monotonic() - api_start
-                logger.error("API call failed on turn %d (%.1fs): %s", turn + 1, api_elapsed, e)
-                return AgentResult(
-                    messages=messages,
-                    managed_state=self._get_managed_state(),
-                    turns_used=turn + 1,
-                    finished_naturally=False,
-                    reasoning_per_turn=reasoning_per_turn,
-                    tool_errors=tool_errors,
-                )
-
-            api_elapsed = _time.monotonic() - api_start
-
-            if not response or not response.choices:
-                logger.warning("Empty response on turn %d (api=%.1fs)", turn + 1, api_elapsed)
-                return AgentResult(
-                    messages=messages,
-                    managed_state=self._get_managed_state(),
-                    turns_used=turn + 1,
-                    finished_naturally=False,
-                    reasoning_per_turn=reasoning_per_turn,
-                    tool_errors=tool_errors,
-                )
-
-            assistant_msg = response.choices[0].message
-
-            # Extract reasoning content from the response (all provider formats)
-            reasoning = _extract_reasoning_from_message(assistant_msg)
-            reasoning_per_turn.append(reasoning)
-
-            # Check for tool calls -- standard OpenAI spec.
-            # Fallback: if response has no structured tool_calls but content
-            # contains raw tool call tags (e.g. <tool_call>), parse them using
-            # hermes-agent's standalone parsers. This handles the case where
-            # ManagedServer's ToolCallTranslator couldn't parse because vLLM
-            # isn't installed.
-            if (
-                not assistant_msg.tool_calls
-                and assistant_msg.content
-                and self.tool_schemas
-                and "<tool_call>" in (assistant_msg.content or "")
-            ):
-                try:
-                    from environments.tool_call_parsers import get_parser
-                    fallback_parser = get_parser("hermes")
-                    parsed_content, parsed_calls = fallback_parser.parse(
-                        assistant_msg.content
-                    )
-                    if parsed_calls:
-                        assistant_msg.tool_calls = parsed_calls
-                        if parsed_content is not None:
-                            assistant_msg.content = parsed_content
-                        logger.debug(
-                            "Fallback parser extracted %d tool calls from raw content",
-                            len(parsed_calls),
-                        )
-                except Exception:
-                    pass  # Fall through to no tool calls
-
-            if assistant_msg.tool_calls:
-                # Normalize tool calls to dicts — they may come as objects
-                # (OpenAI API) or dicts (vLLM ToolCallTranslator).
-                def _tc_to_dict(tc):
-                    if isinstance(tc, dict):
-                        return {
-                            "id": tc.get("id", f"call_{uuid.uuid4().hex[:8]}"),
-                            "type": "function",
-                            "function": {
-                                "name": tc.get("function", {}).get("name", tc.get("name", "")),
-                                "arguments": tc.get("function", {}).get("arguments", tc.get("arguments", "{}")),
-                            },
-                        }
-                    return {
-                        "id": tc.id,
-                        "type": "function",
-                        "function": {
-                            "name": tc.function.name,
-                            "arguments": tc.function.arguments,
-                        },
-                    }
-
-                # Build the assistant message dict for conversation history
-                msg_dict: Dict[str, Any] = {
-                    "role": "assistant",
-                    "content": assistant_msg.content or "",
-                    "tool_calls": [_tc_to_dict(tc) for tc in assistant_msg.tool_calls],
-                }
-
-                # Preserve reasoning_content for multi-turn chat template handling
-                # (e.g., Kimi-K2's template renders <think> blocks differently
-                # for history vs. the latest turn based on this field)
-                if reasoning:
-                    msg_dict["reasoning_content"] = reasoning
-
-                messages.append(msg_dict)
-
-                # Execute each tool call via hermes-agent's dispatch
-                for tc in assistant_msg.tool_calls:
-                    # Handle both object (OpenAI) and dict (vLLM) formats
-                    if isinstance(tc, dict):
-                        tool_name = tc.get("function", {}).get("name", tc.get("name", ""))
-                        tool_args_raw = tc.get("function", {}).get("arguments", tc.get("arguments", "{}"))
-                    else:
-                        tool_name = tc.function.name
-                        tool_args_raw = tc.function.arguments
-
-                    # Validate tool name
-                    if tool_name not in self.valid_tool_names:
-                        tool_result = json.dumps(
-                            {
-                                "error": f"Unknown tool '{tool_name}'. "
-                                f"Available tools: {sorted(self.valid_tool_names)}"
-                            }
-                        )
-                        tool_errors.append(ToolError(
-                            turn=turn + 1, tool_name=tool_name,
-                            arguments=tool_args_raw[:200],
-                            error=f"Unknown tool '{tool_name}'",
-                            tool_result=tool_result,
-                        ))
-                        logger.warning(
-                            "Model called unknown tool '%s' on turn %d",
-                            tool_name, turn + 1,
-                        )
-                    else:
-                        # Parse arguments
-                        try:
-                            args = json.loads(tool_args_raw)
-                        except json.JSONDecodeError as e:
-                            args = None
-                            tool_result = json.dumps(
-                                {"error": f"Invalid JSON in tool arguments: {e}. Please retry with valid JSON."}
-                            )
-                            tool_errors.append(ToolError(
-                                turn=turn + 1, tool_name=tool_name,
-                                arguments=tool_args_raw[:200],
-                                error=f"Invalid JSON: {e}",
-                                tool_result=tool_result,
-                            ))
-                            logger.warning(
-                                "Invalid JSON in tool call arguments for '%s': %s",
-                                tool_name, tool_args_raw[:200],
-                            )
-
-                        # Dispatch tool only if arguments parsed successfully
-                        if args is not None:
-                            try:
-                                if tool_name == "terminal":
-                                    backend = os.getenv("TERMINAL_ENV", "local")
-                                    cmd_preview = args.get("command", "")[:80]
-                                    logger.info(
-                                        "[%s] $ %s", self.task_id[:8], cmd_preview,
-                                    )
-
-                                tool_submit_time = _time.monotonic()
-
-                                # Todo tool -- handle locally (needs per-loop TodoStore)
-                                if tool_name == "todo":
-                                    tool_result = _todo_tool(
-                                        todos=args.get("todos"),
-                                        merge=args.get("merge", False),
-                                        store=_todo_store,
-                                    )
-                                    tool_elapsed = _time.monotonic() - tool_submit_time
-                                elif tool_name == "memory":
-                                    tool_result = json.dumps({"error": "Memory is not available in RL environments."})
-                                    tool_elapsed = _time.monotonic() - tool_submit_time
-                                elif tool_name == "session_search":
-                                    tool_result = json.dumps({"error": "Session search is not available in RL environments."})
-                                    tool_elapsed = _time.monotonic() - tool_submit_time
-                                else:
-                                    # Run tool calls in a thread pool so backends that
-                                    # use asyncio.run() internally (modal, docker, daytona) get
-                                    # a clean event loop instead of deadlocking.
-                                    loop = asyncio.get_running_loop()
-                                    # Capture current tool_name/args for the lambda
-                                    _tn, _ta, _tid = tool_name, args, self.task_id
-                                    tool_result = await loop.run_in_executor(
-                                        _tool_executor,
-                                        lambda: handle_function_call(
-                                            _tn, _ta, task_id=_tid,
-                                            user_task=_user_task,
-                                        ),
-                                    )
-                                    tool_elapsed = _time.monotonic() - tool_submit_time
-
-                                # Log slow tools and thread pool stats for debugging
-                                pool_active = _tool_executor._work_queue.qsize()
-                                if tool_elapsed > 30:
-                                    logger.warning(
-                                        "[%s] turn %d: %s took %.1fs (pool queue=%d)",
-                                        self.task_id[:8], turn + 1, tool_name,
-                                        tool_elapsed, pool_active,
-                                    )
-                            except Exception as e:
-                                tool_result = json.dumps(
-                                    {"error": f"Tool execution failed: {type(e).__name__}: {str(e)}"}
-                                )
-                                tool_errors.append(ToolError(
-                                    turn=turn + 1, tool_name=tool_name,
-                                    arguments=tool_args_raw[:200],
-                                    error=f"{type(e).__name__}: {str(e)}",
-                                    tool_result=tool_result,
-                                ))
-                                logger.error(
-                                    "Tool '%s' execution failed on turn %d: %s",
-                                    tool_name, turn + 1, e,
-                                )
-
-                        # Also check if the tool returned an error in its JSON result
-                        try:
-                            result_data = json.loads(tool_result)
-                            if isinstance(result_data, dict):
-                                err = result_data.get("error")
-                                exit_code = result_data.get("exit_code")
-                                if err and exit_code and exit_code < 0:
-                                    tool_errors.append(ToolError(
-                                        turn=turn + 1, tool_name=tool_name,
-                                        arguments=tool_args_raw[:200],
-                                        error=str(err),
-                                        tool_result=tool_result[:500],
-                                    ))
-                        except (json.JSONDecodeError, TypeError):
-                            pass
-
-                    tc_id = tc.get("id", "") if isinstance(tc, dict) else tc.id
-                    tool_result = maybe_persist_tool_result(
-                        content=tool_result,
-                        tool_name=tool_name,
-                        tool_use_id=tc_id,
-                        env=get_active_env(self.task_id),
-                        config=self.budget_config,
-                    )
-
-                    messages.append(
-                        {
-                            "role": "tool",
-                            "tool_call_id": tc_id,
-                            "content": tool_result,
-                        }
-                    )
-
-                num_tcs = len(assistant_msg.tool_calls)
-                if num_tcs > 0:
-                    enforce_turn_budget(
-                        messages[-num_tcs:],
-                        env=get_active_env(self.task_id),
-                        config=self.budget_config,
-                    )
-
-                turn_elapsed = _time.monotonic() - turn_start
-                logger.info(
-                    "[%s] turn %d: api=%.1fs, %d tools, turn_total=%.1fs",
-                    self.task_id[:8], turn + 1, api_elapsed,
-                    len(assistant_msg.tool_calls), turn_elapsed,
-                )
-
-            else:
-                # No tool calls -- model is done
-                msg_dict = {
-                    "role": "assistant",
-                    "content": assistant_msg.content or "",
-                }
-                if reasoning:
-                    msg_dict["reasoning_content"] = reasoning
-                messages.append(msg_dict)
-
-                turn_elapsed = _time.monotonic() - turn_start
-                logger.info(
-                    "[%s] turn %d: api=%.1fs, no tools (finished), turn_total=%.1fs",
-                    self.task_id[:8], turn + 1, api_elapsed, turn_elapsed,
-                )
-
-                return AgentResult(
-                    messages=messages,
-                    managed_state=self._get_managed_state(),
-                    turns_used=turn + 1,
-                    finished_naturally=True,
-                    reasoning_per_turn=reasoning_per_turn,
-                    tool_errors=tool_errors,
-                )
-
-        # Hit max turns without the model stopping
-        logger.info("Agent hit max_turns (%d) without finishing", self.max_turns)
-        return AgentResult(
-            messages=messages,
-            managed_state=self._get_managed_state(),
-            turns_used=self.max_turns,
-            finished_naturally=False,
-            reasoning_per_turn=reasoning_per_turn,
-            tool_errors=tool_errors,
-        )
-
-    def _get_managed_state(self) -> Optional[Dict[str, Any]]:
-        """
-        Get ManagedServer state if the server supports it.
-
-        Returns state dict with SequenceNodes containing tokens/logprobs/masks,
-        or None if the server doesn't support get_state() (e.g., regular OpenAI server).
-        """
-        if hasattr(self.server, "get_state"):
-            return self.server.get_state()
-        return None
@@ -1,73 +0,0 @@
-# OpenThoughts-TBLite Evaluation Environment
-
-This environment evaluates terminal agents on the [OpenThoughts-TBLite](https://huggingface.co/datasets/open-thoughts/OpenThoughts-TBLite) benchmark, a difficulty-calibrated subset of [Terminal-Bench 2.0](https://www.tbench.ai/leaderboard/terminal-bench/2.0).
-
-## Source
-
-OpenThoughts-TBLite was created by the [OpenThoughts](https://www.openthoughts.ai/) Agent team in collaboration with [Snorkel AI](https://snorkel.ai/) and [Bespoke Labs](https://bespokelabs.ai/). The original dataset and documentation live at:
-
-- **Dataset (source):** [open-thoughts/OpenThoughts-TBLite](https://huggingface.co/datasets/open-thoughts/OpenThoughts-TBLite)
-- **GitHub:** [open-thoughts/OpenThoughts-TBLite](https://github.com/open-thoughts/OpenThoughts-TBLite)
-- **Blog post:** [openthoughts.ai/blog/openthoughts-tblite](https://www.openthoughts.ai/blog/openthoughts-tblite)
-
-## Our Dataset
-
-We converted the source into the same schema used by our Terminal-Bench 2.0 environment (pre-built Docker Hub images, base64-encoded test tarballs, etc.) and published it as:
-
-- **Dataset (ours):** [NousResearch/openthoughts-tblite](https://huggingface.co/datasets/NousResearch/openthoughts-tblite)
-- **Docker images:** `nousresearch/tblite-<task-name>:latest` on Docker Hub (100 images)
-
-The conversion script is at `scripts/prepare_tblite_dataset.py`.
-
-## Why TBLite?
-
-Terminal-Bench 2.0 is one of the strongest frontier evaluations for terminal agents, but when a model scores near the floor (e.g., Qwen 3 8B at <1%), many changes look identical in aggregate score. TBLite addresses this by calibrating task difficulty using Claude Haiku 4.5 as a reference:
-
-| Difficulty | Pass Rate Range | Tasks |
-|------------|----------------|-------|
-| Easy       | >= 70%         | 40    |
-| Medium     | 40-69%         | 26    |
-| Hard       | 10-39%         | 26    |
-| Extreme    | < 10%          | 8     |
-
-This gives enough solvable tasks to detect small improvements quickly, while preserving enough hard tasks to avoid saturation. The correlation between TBLite and TB2 scores is **r = 0.911**.
-
-TBLite also runs 2.6-8x faster than the full TB2, making it practical for iteration loops.
-
-## Usage
-
-```bash
-# Run the full benchmark
-python environments/benchmarks/tblite/tblite_env.py evaluate
-
-# Filter to specific tasks
-python environments/benchmarks/tblite/tblite_env.py evaluate \
-    --env.task_filter "broken-python,pandas-etl"
-
-# Use a different model
-python environments/benchmarks/tblite/tblite_env.py evaluate \
-    --openai.model_name "qwen/qwen3-30b"
-```
-
-## Architecture
-
-`TBLiteEvalEnv` is a thin subclass of `TerminalBench2EvalEnv`. All evaluation logic (agent loop, Docker sandbox management, test verification, metrics) is inherited. Only the defaults differ:
-
-| Setting        | TB2                              | TBLite                                  |
-|----------------|----------------------------------|-----------------------------------------|
-| Dataset        | `NousResearch/terminal-bench-2`  | `NousResearch/openthoughts-tblite`      |
-| Tasks          | 89                               | 100                                     |
-| Task timeout   | 1800s (30 min)                   | 1200s (20 min)                          |
-| Wandb name     | `terminal-bench-2`               | `openthoughts-tblite`                   |
-
-## Citation
-
-```bibtex
-@software{OpenThoughts-TBLite,
-  author = {OpenThoughts-Agent team, Snorkel AI, Bespoke Labs},
-  month = Feb,
-  title = {{OpenThoughts-TBLite: A High-Signal Benchmark for Iterating on Terminal Agents}},
-  howpublished = {https://www.openthoughts.ai/blog/openthoughts-tblite},
-  year = {2026}
-}
-```
@@ -1,39 +0,0 @@
-# OpenThoughts-TBLite Evaluation -- Default Configuration
-#
-# Eval-only environment for the TBLite benchmark (100 difficulty-calibrated
-# terminal tasks, a faster proxy for Terminal-Bench 2.0).
-# Uses Modal terminal backend for per-task cloud-isolated sandboxes
-# and OpenRouter for inference.
-#
-# Usage:
-#   python environments/benchmarks/tblite/tblite_env.py evaluate \
-#       --config environments/benchmarks/tblite/default.yaml
-#
-#   # Override model:
-#   python environments/benchmarks/tblite/tblite_env.py evaluate \
-#       --config environments/benchmarks/tblite/default.yaml \
-#       --openai.model_name anthropic/claude-sonnet-4
-
-env:
-  enabled_toolsets: ["terminal", "file"]
-  max_agent_turns: 60
-  max_token_length: 32000
-  agent_temperature: 0.8
-  terminal_backend: "modal"
-  terminal_timeout: 300        # 5 min per command (builds, pip install)
-  tool_pool_size: 128          # thread pool for 100 parallel tasks
-  dataset_name: "NousResearch/openthoughts-tblite"
-  test_timeout: 600
-  task_timeout: 1200           # 20 min wall-clock per task (TBLite tasks are faster)
-  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
-  use_wandb: true
-  wandb_name: "openthoughts-tblite"
-  ensure_scores_are_not_same: false
-  data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite"
-
-openai:
-  base_url: "https://openrouter.ai/api/v1"
-  model_name: "anthropic/claude-opus-4.6"
-  server_type: "openai"
-  health_check: false
-  # api_key loaded from OPENROUTER_API_KEY in .env
@@ -1,38 +0,0 @@
-# OpenThoughts-TBLite Evaluation -- Docker Backend (Local Compute)
-#
-# Runs tasks in Docker containers on the local machine.
-# Sandboxed like Modal but no cloud costs. Good for dev/testing.
-#
-# Usage:
-#   python environments/benchmarks/tblite/tblite_env.py evaluate \
-#       --config environments/benchmarks/tblite/local.yaml
-#
-#   # Override concurrency:
-#   python environments/benchmarks/tblite/tblite_env.py evaluate \
-#       --config environments/benchmarks/tblite/local.yaml \
-#       --env.eval_concurrency 4
-
-env:
-  enabled_toolsets: ["terminal", "file"]
-  max_agent_turns: 60
-  max_token_length: 32000
-  agent_temperature: 0.8
-  terminal_backend: "docker"
-  terminal_timeout: 300
-  tool_pool_size: 16
-  dataset_name: "NousResearch/openthoughts-tblite"
-  test_timeout: 600
-  task_timeout: 1200
-  eval_concurrency: 8          # max 8 tasks at once
-  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
-  use_wandb: false
-  wandb_name: "openthoughts-tblite-local"
-  ensure_scores_are_not_same: false
-  data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite-local"
-
-openai:
-  base_url: "https://openrouter.ai/api/v1"
-  model_name: "anthropic/claude-sonnet-4"
-  server_type: "openai"
-  health_check: false
-  # api_key loaded from OPENROUTER_API_KEY in .env
@@ -1,40 +0,0 @@
-# OpenThoughts-TBLite Evaluation -- Local vLLM Backend
-#
-# Runs against a local vLLM server with Docker sandboxes.
-#
-# Start the vLLM server from the atropos directory:
-#   python -m example_trainer.vllm_api_server \
-#       --model Qwen/Qwen3-4B-Instruct-2507 \
-#       --port 9001 \
-#       --gpu-memory-utilization 0.8 \
-#       --max-model-len=32000
-#
-# Then run:
-#   python environments/benchmarks/tblite/tblite_env.py evaluate \
-#       --config environments/benchmarks/tblite/local_vllm.yaml
-
-env:
-  enabled_toolsets: ["terminal", "file"]
-  max_agent_turns: 60
-  max_token_length: 16000
-  agent_temperature: 0.6
-  terminal_backend: "docker"
-  terminal_timeout: 300
-  tool_pool_size: 16
-  dataset_name: "NousResearch/openthoughts-tblite"
-  test_timeout: 600
-  task_timeout: 1200
-  eval_concurrency: 8
-  tool_call_parser: "hermes"
-  system_prompt: "You are an expert terminal agent. You MUST use the provided tools to complete tasks. Use the terminal tool to run shell commands, read_file to read files, write_file to write files, search_files to search, and patch to edit files. Do NOT write out solutions as text - execute them using the tools. Always start by exploring the environment with terminal commands."
-  tokenizer_name: "Qwen/Qwen3-4B-Instruct-2507"
-  use_wandb: false
-  wandb_name: "tblite-qwen3-4b-instruct"
-  ensure_scores_are_not_same: false
-  data_dir_to_save_evals: "environments/benchmarks/evals/tblite-qwen3-4b-local"
-
-openai:
-  base_url: "http://localhost:9001"
-  model_name: "Qwen/Qwen3-4B-Instruct-2507"
-  server_type: "vllm"
-  health_check: false
@@ -1,42 +0,0 @@
-#!/bin/bash
-
-# OpenThoughts-TBLite Evaluation
-#
-# Run from repo root:
-#   bash environments/benchmarks/tblite/run_eval.sh
-#
-# Override model:
-#   bash environments/benchmarks/tblite/run_eval.sh \
-#       --openai.model_name anthropic/claude-sonnet-4
-#
-# Run a subset:
-#   bash environments/benchmarks/tblite/run_eval.sh \
-#       --env.task_filter broken-python,pandas-etl
-#
-# All terminal settings (backend, timeout, lifetime, pool size) are
-# configured via env config fields -- no env vars needed.
-
-set -euo pipefail
-
-mkdir -p logs evals/openthoughts-tblite
-LOG_FILE="logs/tblite_$(date +%Y%m%d_%H%M%S).log"
-
-echo "OpenThoughts-TBLite Evaluation"
-echo "Log file: $LOG_FILE"
-echo ""
-
-# Unbuffered python output so logs are written in real-time
-export PYTHONUNBUFFERED=1
-
-# Show INFO-level agent loop timing (api/tool durations per turn)
-# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
-export LOGLEVEL=INFO
-
-python tblite_env.py evaluate \
-  --config default.yaml \
-  "$@" \
-  2>&1 | tee "$LOG_FILE"
-
-echo ""
-echo "Log saved to: $LOG_FILE"
-echo "Eval results: evals/openthoughts-tblite/"
@@ -1,119 +0,0 @@
-"""
-OpenThoughts-TBLite Evaluation Environment
-
-A lighter, faster alternative to Terminal-Bench 2.0 for iterating on terminal
-agents. Uses the same evaluation logic as TerminalBench2EvalEnv but defaults
-to the NousResearch/openthoughts-tblite dataset (100 difficulty-calibrated
-tasks vs TB2's 89 harder tasks).
-
-TBLite tasks are a curated subset of TB2 with a difficulty distribution
-designed to give meaningful signal even for smaller models:
-  - Easy (40 tasks):   >= 70% pass rate with Claude Haiku 4.5
-  - Medium (26 tasks): 40-69% pass rate
-  - Hard (26 tasks):   10-39% pass rate
-  - Extreme (8 tasks): < 10% pass rate
-
-Usage:
-    python environments/benchmarks/tblite/tblite_env.py evaluate
-
-    # Filter to specific tasks:
-    python environments/benchmarks/tblite/tblite_env.py evaluate \\
-        --env.task_filter "broken-python,pandas-etl"
-"""
-
-import os
-import sys
-from pathlib import Path
-from typing import List, Tuple
-
-_repo_root = Path(__file__).resolve().parent.parent.parent.parent
-if str(_repo_root) not in sys.path:
-    sys.path.insert(0, str(_repo_root))
-
-from pydantic import Field
-
-from atroposlib.envs.base import EvalHandlingEnum
-from atroposlib.envs.server_handling.server_manager import APIServerConfig
-
-from environments.benchmarks.terminalbench_2.terminalbench2_env import (
-    TerminalBench2EvalConfig,
-    TerminalBench2EvalEnv,
-)
-
-
-class TBLiteEvalConfig(TerminalBench2EvalConfig):
-    """Configuration for the OpenThoughts-TBLite evaluation environment.
-
-    Inherits all TB2 config fields. Only the dataset default and task timeout
-    differ -- TBLite tasks are calibrated to be faster.
-    """
-
-    dataset_name: str = Field(
-        default="NousResearch/openthoughts-tblite",
-        description="HuggingFace dataset containing TBLite tasks.",
-    )
-
-    task_timeout: int = Field(
-        default=1200,
-        description="Maximum wall-clock seconds per task. TBLite tasks are "
-        "generally faster than TB2, so 20 minutes is usually sufficient.",
-    )
-
-
-class TBLiteEvalEnv(TerminalBench2EvalEnv):
-    """OpenThoughts-TBLite evaluation environment.
-
-    Inherits all evaluation logic from TerminalBench2EvalEnv (agent loop,
-    test verification, Docker image resolution, metrics, wandb logging).
-    Only the default configuration differs.
-    """
-
-    name = "openthoughts-tblite"
-    env_config_cls = TBLiteEvalConfig
-
-    @classmethod
-    def config_init(cls) -> Tuple[TBLiteEvalConfig, List[APIServerConfig]]:
-        env_config = TBLiteEvalConfig(
-            enabled_toolsets=["terminal", "file"],
-            disabled_toolsets=None,
-            distribution=None,
-
-            max_agent_turns=60,
-            max_token_length=16000,
-            agent_temperature=0.6,
-            system_prompt=None,
-
-            terminal_backend="modal",
-            terminal_timeout=300,
-
-            test_timeout=180,
-
-            # 100 tasks in parallel
-            tool_pool_size=128,
-
-            eval_handling=EvalHandlingEnum.STOP_TRAIN,
-            group_size=1,
-            steps_per_eval=1,
-            total_steps=1,
-
-            tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
-            use_wandb=True,
-            wandb_name="openthoughts-tblite",
-            ensure_scores_are_not_same=False,
-        )
-
-        server_configs = [
-            APIServerConfig(
-                base_url="https://openrouter.ai/api/v1",
-                model_name="anthropic/claude-sonnet-4",
-                server_type="openai",
-                api_key=os.getenv("OPENROUTER_API_KEY", ""),
-                health_check=False,
-            )
-        ]
-
-        return env_config, server_configs
-
-
-if __name__ == "__main__":
-    TBLiteEvalEnv.cli()
@@ -1,42 +0,0 @@
-# Terminal-Bench 2.0 Evaluation -- Default Configuration
-#
-# Eval-only environment for the TB2 benchmark (89 terminal tasks).
-# Uses Modal terminal backend for per-task cloud-isolated sandboxes
-# and OpenRouter for inference.
-#
-# Usage:
-#   python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
-#       --config environments/benchmarks/terminalbench_2/default.yaml
-#
-#   # Override model:
-#   python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
-#       --config environments/benchmarks/terminalbench_2/default.yaml \
-#       --openai.model_name anthropic/claude-sonnet-4
-
-env:
-  enabled_toolsets: ["terminal", "file"]
-  max_agent_turns: 60
-  max_token_length: 32000
-  agent_temperature: 0.8
-  terminal_backend: "modal"
-  terminal_timeout: 300        # 5 min per command (builds, pip install)
-  tool_pool_size: 128          # thread pool for 89 parallel tasks
-  dataset_name: "NousResearch/terminal-bench-2"
-  test_timeout: 600
-  task_timeout: 1800           # 30 min wall-clock per task, auto-FAIL if exceeded
-  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
-  use_wandb: true
-  wandb_name: "terminal-bench-2"
-  ensure_scores_are_not_same: false
-  data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"
-  # CRITICAL: Limit concurrent Modal sandbox creations to avoid deadlocks.
-  # Modal's blocking calls (App.lookup, etc.) deadlock when too many sandboxes
-  # are created simultaneously inside thread pool workers via asyncio.run().
-  max_concurrent_tasks: 8
-
-openai:
-  base_url: "https://openrouter.ai/api/v1"
-  model_name: "anthropic/claude-opus-4.6"
-  server_type: "openai"
-  health_check: false
-  # api_key loaded from OPENROUTER_API_KEY in .env
@@ -1,42 +0,0 @@
-#!/bin/bash
-
-# Terminal-Bench 2.0 Evaluation
-#
-# Run from repo root:
-#   bash environments/benchmarks/terminalbench_2/run_eval.sh
-#
-# Override model:
-#   bash environments/benchmarks/terminalbench_2/run_eval.sh \
-#       --openai.model_name anthropic/claude-sonnet-4
-#
-# Run a subset:
-#   bash environments/benchmarks/terminalbench_2/run_eval.sh \
-#       --env.task_filter fix-git,git-multibranch
-#
-# All terminal settings (backend, timeout, lifetime, pool size) are
-# configured via env config fields -- no env vars needed.
-
-set -euo pipefail
-
-mkdir -p logs evals/terminal-bench-2
-LOG_FILE="logs/terminalbench2_$(date +%Y%m%d_%H%M%S).log"
-
-echo "Terminal-Bench 2.0 Evaluation"
-echo "Log file: $LOG_FILE"
-echo ""
-
-# Unbuffered python output so logs are written in real-time
-export PYTHONUNBUFFERED=1
-
-# Show INFO-level agent loop timing (api/tool durations per turn)
-# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
-export LOGLEVEL=INFO
-
-python terminalbench2_env.py evaluate \
-  --config default.yaml \
-  "$@" \
-  2>&1 | tee "$LOG_FILE"
-
-echo ""
-echo "Log saved to: $LOG_FILE"
-echo "Eval results: evals/terminal-bench-2/"
@@ -1,115 +0,0 @@
-# YC-Bench: Long-Horizon Agent Benchmark
-
-[YC-Bench](https://github.com/collinear-ai/yc-bench) by [Collinear AI](https://collinear.ai/) is a deterministic, long-horizon benchmark that tests LLM agents' ability to act as a tech startup CEO. The agent manages a simulated company over 1-3 years, making compounding decisions about resource allocation, cash flow, task management, and prestige specialisation across 4 skill domains.
-
-Unlike TerminalBench2 (which evaluates per-task coding ability with binary pass/fail), YC-Bench measures **long-term strategic coherence** — whether an agent can maintain consistent strategy, manage compounding consequences, and adapt plans over hundreds of turns.
-
-## Setup
-
-```bash
-# Install yc-bench (optional dependency)
-pip install "hermes-agent[yc-bench]"
-
-# Or install from source
-git clone https://github.com/collinear-ai/yc-bench
-cd yc-bench && pip install -e .
-
-# Verify
-yc-bench --help
-```
-
-## Running
-
-```bash
-# From the repo root:
-bash environments/benchmarks/yc_bench/run_eval.sh
-
-# Or directly:
-python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
-    --config environments/benchmarks/yc_bench/default.yaml
-
-# Override model:
-bash environments/benchmarks/yc_bench/run_eval.sh \
-    --openai.model_name anthropic/claude-opus-4-20250514
-
-# Quick single-preset test:
-bash environments/benchmarks/yc_bench/run_eval.sh \
-    --env.presets '["fast_test"]' --env.seeds '[1]'
-```
-
-## How It Works
-
-### Architecture
-
-```
-HermesAgentLoop (our agent)
-  -> terminal tool -> subprocess("yc-bench company status") -> JSON output
-  -> terminal tool -> subprocess("yc-bench task accept --task-id X") -> JSON
-  -> terminal tool -> subprocess("yc-bench sim resume") -> JSON (advance time)
-  -> ... (100-500 turns per run)
-```
-
-The environment initialises the simulation via `yc-bench sim init` (NOT `yc-bench run`, which would start yc-bench's own built-in agent loop). Our `HermesAgentLoop` then drives all interaction through CLI commands.
-
-### Simulation Mechanics
-
- **4 skill domains**: research, inference, data_environment, training
- **Prestige system** (1.0-10.0): Gates access to higher-paying tasks
- **Employee management**: Junior/Mid/Senior with domain-specific skill rates
- **Throughput splitting**: `effective_rate = base_rate / N` active tasks per employee
- **Financial pressure**: Monthly payroll, bankruptcy = game over
- **Deterministic**: SHA256-based RNG — same seed + preset = same world
-
-### Difficulty Presets
-
-| Preset | Employees | Tasks | Focus |
-|-----------|-----------|-------|-------|
-| tutorial  | 3         | 50    | Basic loop mechanics |
-| easy      | 5         | 100   | Throughput awareness |
-| **medium**| 5         | 150   | Prestige climbing + domain specialisation |
-| **hard**  | 7         | 200   | Precise ETA reasoning |
-| nightmare | 8         | 300   | Sustained perfection under payroll pressure |
-| fast_test | (varies)  | (varies) | Quick validation (~50 turns) |
-
-Default eval runs **fast_test + medium + hard** × 3 seeds = 9 runs.
-
-### Scoring
-
-```
-composite = 0.5 × survival + 0.5 × normalised_funds
-```
-
- **Survival** (binary): Did the company avoid bankruptcy?
- **Normalised funds** (0.0-1.0): Log-scale relative to initial $250K capital
-
-## Configuration
-
-Key fields in `default.yaml`:
-
-| Field | Default | Description |
-|-------|---------|-------------|
-| `presets` | `["fast_test", "medium", "hard"]` | Which presets to evaluate |
-| `seeds` | `[1, 2, 3]` | RNG seeds per preset |
-| `max_agent_turns` | 200 | Max LLM calls per run |
-| `run_timeout` | 3600 | Wall-clock timeout per run (seconds) |
-| `survival_weight` | 0.5 | Weight of survival in composite score |
-| `funds_weight` | 0.5 | Weight of normalised funds in composite |
-| `horizon_years` | null | Override horizon (null = auto from preset) |
-
-## Cost & Time Estimates
-
-Each run is 100-500 LLM turns. Approximate costs per run at typical API rates:
-
-| Preset | Turns | Time | Est. Cost |
-|--------|-------|------|-----------|
-| fast_test | ~50 | 5-10 min | $1-5 |
-| medium | ~200 | 20-40 min | $5-15 |
-| hard | ~300 | 30-60 min | $10-25 |
-
-Full default eval (9 runs): ~3-6 hours, $50-200 depending on model.
-
-## References
-
- [collinear-ai/yc-bench](https://github.com/collinear-ai/yc-bench) — Official repository
- [Collinear AI](https://collinear.ai/) — Company behind yc-bench
- [TerminalBench2](../terminalbench_2/) — Per-task coding benchmark (complementary)
@@ -1,43 +0,0 @@
-# YC-Bench Evaluation -- Default Configuration
-#
-# Long-horizon agent benchmark: agent plays CEO of an AI startup over
-# a simulated 1-3 year run, interacting via yc-bench CLI subcommands.
-#
-# Requires: pip install "hermes-agent[yc-bench]"
-#
-# Usage:
-#   python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
-#       --config environments/benchmarks/yc_bench/default.yaml
-#
-#   # Override model:
-#   python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
-#       --config environments/benchmarks/yc_bench/default.yaml \
-#       --openai.model_name anthropic/claude-opus-4-20250514
-
-env:
-  enabled_toolsets: ["terminal"]
-  max_agent_turns: 200
-  max_token_length: 32000
-  agent_temperature: 0.0
-  terminal_backend: "local"
-  terminal_timeout: 60
-  presets: ["fast_test", "medium", "hard"]
-  seeds: [1, 2, 3]
-  run_timeout: 3600          # 60 min wall-clock per run, auto-FAIL if exceeded
-  survival_weight: 0.5       # weight of binary survival in composite score
-  funds_weight: 0.5          # weight of normalised final funds in composite score
-  db_dir: "/tmp/yc_bench_dbs"
-  company_name: "BenchCo"
-  start_date: "01/01/2025"   # MM/DD/YYYY (yc-bench convention)
-  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
-  use_wandb: true
-  wandb_name: "yc-bench"
-  ensure_scores_are_not_same: false
-  data_dir_to_save_evals: "environments/benchmarks/evals/yc-bench"
-
-openai:
-  base_url: "https://openrouter.ai/api/v1"
-  model_name: "anthropic/claude-sonnet-4.6"
-  server_type: "openai"
-  health_check: false
-  # api_key loaded from OPENROUTER_API_KEY in .env
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-# YC-Bench Evaluation
-#
-# Requires: pip install "hermes-agent[yc-bench]"
-#
-# Run from repo root:
-#   bash environments/benchmarks/yc_bench/run_eval.sh
-#
-# Override model:
-#   bash environments/benchmarks/yc_bench/run_eval.sh \
-#       --openai.model_name anthropic/claude-opus-4-20250514
-#
-# Run a single preset:
-#   bash environments/benchmarks/yc_bench/run_eval.sh \
-#       --env.presets '["fast_test"]' --env.seeds '[1]'
-
-set -euo pipefail
-
-mkdir -p logs evals/yc-bench
-LOG_FILE="logs/yc_bench_$(date +%Y%m%d_%H%M%S).log"
-
-echo "YC-Bench Evaluation"
-echo "Log: $LOG_FILE"
-echo ""
-
-PYTHONUNBUFFERED=1 LOGLEVEL="${LOGLEVEL:-INFO}" \
-  python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
-  --config environments/benchmarks/yc_bench/default.yaml \
-  "$@" \
-  2>&1 | tee "$LOG_FILE"
-
-echo ""
-echo "Log saved to: $LOG_FILE"
@@ -1,848 +0,0 @@
-"""
-YCBenchEvalEnv -- YC-Bench Long-Horizon Agent Benchmark Environment
-
-Evaluates agentic LLMs on YC-Bench: a deterministic, long-horizon benchmark
-where the agent acts as CEO of an AI startup over a simulated 1-3 year run.
-The agent manages cash flow, employees, tasks, and prestige across 4 domains,
-interacting exclusively via CLI subprocess calls against a SQLite-backed
-discrete-event simulation.
-
-Unlike TerminalBench2 (per-task binary pass/fail), YC-Bench measures sustained
-multi-turn strategic coherence -- whether an agent can manage compounding
-decisions over hundreds of turns without going bankrupt.
-
-This is an eval-only environment. Run via:
-
-    python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
-        --config environments/benchmarks/yc_bench/default.yaml
-
-The evaluate flow:
-    1. setup()     -- Verifies yc-bench installed, builds eval matrix (preset x seed)
-    2. evaluate()  -- Iterates over all runs sequentially through:
-        a. rollout_and_score_eval()  -- Per-run agent loop
-            - Initialises a fresh yc-bench simulation via `sim init` (NOT `run`)
-            - Runs HermesAgentLoop with terminal tool only
-            - Reads final SQLite DB to extract score
-            - Returns survival (0/1) + normalised funds score
-        b. Aggregates per-preset and overall metrics
-        c. Logs results via evaluate_log() and wandb
-
-Key features:
-  - CLI-only interface: agent calls yc-bench subcommands via terminal tool
-  - Deterministic: same seed + preset = same world (SHA256-based RNG)
-  - Multi-dimensional scoring: survival + normalised final funds
-  - Per-preset difficulty breakdown in results
-  - Isolated SQLite DB per run (no cross-run state leakage)
-
-Requires: pip install hermes-agent[yc-bench]
-"""
-
-import asyncio
-import datetime
-import json
-import logging
-import math
-import os
-import sqlite3
-import subprocess
-import sys
-import threading
-import time
-import uuid
-from collections import defaultdict
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
-
-_repo_root = Path(__file__).resolve().parent.parent.parent.parent
-if str(_repo_root) not in sys.path:
-    sys.path.insert(0, str(_repo_root))
-
-from pydantic import Field
-
-from atroposlib.envs.base import EvalHandlingEnum
-from atroposlib.envs.server_handling.server_manager import APIServerConfig
-
-from environments.agent_loop import HermesAgentLoop
-from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
-
-logger = logging.getLogger(__name__)
-
-# =============================================================================
-# System prompt
-# =============================================================================
-
-YC_BENCH_SYSTEM_PROMPT = """\
-You are the autonomous CEO of an early-stage AI startup in a deterministic
-business simulation. You manage the company exclusively through the `yc-bench`
-CLI tool. Your primary goal is to **survive** until the simulation horizon ends
-without going bankrupt, while **maximising final funds**.
-
-## Simulation Mechanics
-
- **Funds**: You start with $250,000 seed capital. Revenue comes from completing
-  tasks. Rewards scale with your prestige: `base × (1 + scale × (prestige − 1))`.
- **Domains**: There are 4 skill domains: **research**, **inference**,
-  **data_environment**, and **training**. Each has its own prestige level
-  (1.0-10.0). Higher prestige unlocks better-paying tasks.
- **Employees**: You have employees (Junior/Mid/Senior) with domain-specific
-  skill rates. **Throughput splits**: `effective_rate = base_rate / N` where N
-  is the number of active tasks assigned to that employee. Focus beats breadth.
- **Payroll**: Deducted automatically on the first business day of each month.
-  Running out of funds = bankruptcy = game over.
- **Time**: The simulation runs on business days (Mon-Fri), 09:00-18:00.
-  Time only advances when you call `yc-bench sim resume`.
-
-## Task Lifecycle
-
-1. Browse market tasks with `market browse`
-2. Accept a task with `task accept` (this sets its deadline)
-3. Assign employees with `task assign`
-4. Dispatch with `task dispatch` to start work
-5. Call `sim resume` to advance time and let employees make progress
-6. Tasks complete when all domain requirements are fulfilled
-
-**Penalties for failure vary by difficulty preset.** Completing a task on time
-earns full reward + prestige gain. Missing a deadline or cancelling a task
-incurs prestige penalties -- cancelling is always more costly than letting a
-task fail, so cancel only as a last resort.
-
-## CLI Commands
-
-### Observe
- `yc-bench company status`                                         -- funds, prestige, runway
- `yc-bench employee list`                                          -- skills, salary, active tasks
- `yc-bench market browse [--domain D] [--required-prestige-lte N]` -- available tasks
- `yc-bench task list [--status active|planned]`                    -- your tasks
- `yc-bench task inspect --task-id UUID`                            -- progress, deadline, assignments
- `yc-bench finance ledger [--category monthly_payroll|task_reward]` -- transaction history
- `yc-bench report monthly`                                         -- monthly P&L
-
-### Act
- `yc-bench task accept --task-id UUID`                              -- accept from market
- `yc-bench task assign --task-id UUID --employee-id UUID`           -- assign employee
- `yc-bench task dispatch --task-id UUID`                            -- start work (needs >=1 assignment)
- `yc-bench task cancel --task-id UUID --reason "text"`              -- cancel (prestige penalty)
- `yc-bench sim resume`                                              -- advance simulation clock
-
-### Memory (persists across context truncation)
- `yc-bench scratchpad read`            -- read your persistent notes
- `yc-bench scratchpad write --content "text"`  -- overwrite notes
- `yc-bench scratchpad append --content "text"` -- append to notes
- `yc-bench scratchpad clear`           -- clear notes
-
-## Strategy Guidelines
-
-1. **Specialise in 2-3 domains** to climb the prestige ladder faster and unlock
-   high-reward tasks. Don't spread thin across all 4 domains early on.
-2. **Focus employees** -- assigning one employee to many tasks halves their
-   throughput per additional task. Keep assignments concentrated.
-3. **Use the scratchpad** to track your strategy, upcoming deadlines, and
-   employee assignments. This persists even if conversation context is truncated.
-4. **Monitor runway** -- always know how many months of payroll you can cover.
-   Accept high-reward tasks before payroll dates.
-5. **Don't over-accept** -- taking too many tasks and missing deadlines cascades
-   into prestige loss, locking you out of profitable contracts.
-6. Use `finance ledger` and `report monthly` to track revenue trends.
-
-## Your Turn
-
-Each turn:
-1. Call `yc-bench company status` and `yc-bench task list` to orient yourself.
-2. Check for completed tasks and pending deadlines.
-3. Browse market for profitable tasks within your prestige level.
-4. Accept, assign, and dispatch tasks strategically.
-5. Call `yc-bench sim resume` to advance time.
-6. Repeat until the simulation ends.
-
-Think step by step before acting."""
-
-# Starting funds in cents ($250,000)
-INITIAL_FUNDS_CENTS = 25_000_000
-
-# Default horizon per preset (years)
-_PRESET_HORIZONS = {
-    "tutorial": 1,
-    "easy": 1,
-    "medium": 1,
-    "hard": 1,
-    "nightmare": 1,
-    "fast_test": 1,
-    "default": 3,
-    "high_reward": 1,
-}
-
-
-# =============================================================================
-# Configuration
-# =============================================================================
-
-class YCBenchEvalConfig(HermesAgentEnvConfig):
-    """
-    Configuration for the YC-Bench evaluation environment.
-
-    Extends HermesAgentEnvConfig with YC-Bench-specific settings for
-    preset selection, seed control, scoring, and simulation parameters.
-    """
-
-    presets: List[str] = Field(
-        default=["fast_test", "medium", "hard"],
-        description="YC-Bench preset names to evaluate.",
-    )
-    seeds: List[int] = Field(
-        default=[1, 2, 3],
-        description="Random seeds -- each preset x seed = one run.",
-    )
-    run_timeout: int = Field(
-        default=3600,
-        description="Maximum wall-clock seconds per run. Default 60 minutes.",
-    )
-    survival_weight: float = Field(
-        default=0.5,
-        description="Weight of survival (0/1) in composite score.",
-    )
-    funds_weight: float = Field(
-        default=0.5,
-        description="Weight of normalised final funds in composite score.",
-    )
-    db_dir: str = Field(
-        default="/tmp/yc_bench_dbs",
-        description="Directory for per-run SQLite databases.",
-    )
-    horizon_years: Optional[int] = Field(
-        default=None,
-        description=(
-            "Simulation horizon in years. If None (default), inferred from "
-            "preset name (1 year for most, 3 for 'default')."
-        ),
-    )
-    company_name: str = Field(
-        default="BenchCo",
-        description="Name of the simulated company.",
-    )
-    start_date: str = Field(
-        default="01/01/2025",
-        description="Simulation start date in MM/DD/YYYY format (yc-bench convention).",
-    )
-
-
-# =============================================================================
-# Scoring helpers
-# =============================================================================
-
-def _read_final_score(db_path: str) -> Dict[str, Any]:
-    """
-    Read final game state from a YC-Bench SQLite database.
-
-    Returns dict with final_funds_cents (int), survived (bool),
-    terminal_reason (str).
-
-    Note: yc-bench table names are plural -- 'companies' not 'company',
-    'sim_events' not 'simulation_log'.
-    """
-    if not os.path.exists(db_path):
-        logger.warning("DB not found at %s", db_path)
-        return {
-            "final_funds_cents": 0,
-            "survived": False,
-            "terminal_reason": "db_missing",
-        }
-
-    conn = None
-    try:
-        conn = sqlite3.connect(db_path)
-        cur = conn.cursor()
-
-        # Read final funds from the 'companies' table
-        cur.execute("SELECT funds_cents FROM companies LIMIT 1")
-        row = cur.fetchone()
-        funds = row[0] if row else 0
-
-        # Determine terminal reason from 'sim_events' table
-        terminal_reason = "unknown"
-        try:
-            cur.execute(
-                "SELECT event_type FROM sim_events "
-                "WHERE event_type IN ('bankruptcy', 'horizon_end') "
-                "ORDER BY scheduled_at DESC LIMIT 1"
-            )
-            event_row = cur.fetchone()
-            if event_row:
-                terminal_reason = event_row[0]
-        except sqlite3.OperationalError:
-            # Table may not exist if simulation didn't progress
-            pass
-
-        survived = funds >= 0 and terminal_reason != "bankruptcy"
-        return {
-            "final_funds_cents": funds,
-            "survived": survived,
-            "terminal_reason": terminal_reason,
-        }
-
-    except Exception as e:
-        logger.error("Failed to read DB %s: %s", db_path, e)
-        return {
-            "final_funds_cents": 0,
-            "survived": False,
-            "terminal_reason": f"db_error: {e}",
-        }
-    finally:
-        if conn:
-            conn.close()
-
-
-def _compute_composite_score(
-    final_funds_cents: int,
-    survived: bool,
-    survival_weight: float = 0.5,
-    funds_weight: float = 0.5,
-    initial_funds_cents: int = INITIAL_FUNDS_CENTS,
-) -> float:
-    """
-    Compute composite score from survival and final funds.
-
-    Score = survival_weight * survival_score
-          + funds_weight * normalised_funds_score
-
-    Normalised funds uses log-scale relative to initial capital:
-    - funds <= 0:          0.0
-    - funds == initial:   ~0.15
-    - funds == 10x:       ~0.52
-    - funds == 100x:       1.0
-    """
-    survival_score = 1.0 if survived else 0.0
-
-    if final_funds_cents <= 0:
-        funds_score = 0.0
-    else:
-        max_ratio = 100.0
-        ratio = final_funds_cents / max(initial_funds_cents, 1)
-        funds_score = min(math.log1p(ratio) / math.log1p(max_ratio), 1.0)
-
-    return survival_weight * survival_score + funds_weight * funds_score
-
-
-# =============================================================================
-# Main Environment
-# =============================================================================
-
-class YCBenchEvalEnv(HermesAgentBaseEnv):
-    """
-    YC-Bench long-horizon agent benchmark environment (eval-only).
-
-    Each eval item is a (preset, seed) pair. The environment initialises the
-    simulation via ``yc-bench sim init`` (NOT ``yc-bench run`` which would start
-    a competing built-in agent loop). The HermesAgentLoop then drives the
-    interaction by calling individual yc-bench CLI commands via the terminal tool.
-
-    After the agent loop ends, the SQLite DB is read to extract the final score.
-
-    Scoring:
-      composite = 0.5 * survival + 0.5 * normalised_funds
-    """
-
-    name = "yc-bench"
-    env_config_cls = YCBenchEvalConfig
-
-    @classmethod
-    def config_init(cls) -> Tuple[YCBenchEvalConfig, List[APIServerConfig]]:
-        env_config = YCBenchEvalConfig(
-            enabled_toolsets=["terminal"],
-            disabled_toolsets=None,
-            distribution=None,
-            max_agent_turns=200,
-            max_token_length=32000,
-            agent_temperature=0.0,
-            system_prompt=YC_BENCH_SYSTEM_PROMPT,
-            terminal_backend="local",
-            terminal_timeout=60,
-            presets=["fast_test", "medium", "hard"],
-            seeds=[1, 2, 3],
-            run_timeout=3600,
-            survival_weight=0.5,
-            funds_weight=0.5,
-            db_dir="/tmp/yc_bench_dbs",
-            eval_handling=EvalHandlingEnum.STOP_TRAIN,
-            group_size=1,
-            steps_per_eval=1,
-            total_steps=1,
-            tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
-            use_wandb=True,
-            wandb_name="yc-bench",
-            ensure_scores_are_not_same=False,
-        )
-
-        server_configs = [
-            APIServerConfig(
-                base_url="https://openrouter.ai/api/v1",
-                model_name="anthropic/claude-sonnet-4.6",
-                server_type="openai",
-                api_key=os.getenv("OPENROUTER_API_KEY", ""),
-                health_check=False,
-            )
-        ]
-
-        return env_config, server_configs
-
-    # =========================================================================
-    # Setup
-    # =========================================================================
-
-    async def setup(self):
-        """Verify yc-bench is installed and build the eval matrix."""
-        # Verify yc-bench CLI is available
-        try:
-            result = subprocess.run(
-                ["yc-bench", "--help"], capture_output=True, text=True, timeout=10
-            )
-            if result.returncode != 0:
-                raise FileNotFoundError
-        except (FileNotFoundError, subprocess.TimeoutExpired):
-            raise RuntimeError(
-                "yc-bench CLI not found. Install with:\n"
-                '  pip install "hermes-agent[yc-bench]"\n'
-                "Or: git clone https://github.com/collinear-ai/yc-bench "
-                "&& cd yc-bench && pip install -e ."
-            )
-        print("yc-bench CLI verified.")
-
-        # Build eval matrix: preset x seed
-        self.all_eval_items = [
-            {"preset": preset, "seed": seed}
-            for preset in self.config.presets
-            for seed in self.config.seeds
-        ]
-        self.iter = 0
-
-        os.makedirs(self.config.db_dir, exist_ok=True)
-        self.eval_metrics: List[Tuple[str, float]] = []
-
-        # Streaming JSONL log for crash-safe result persistence
-        log_dir = os.path.join(os.path.dirname(__file__), "logs")
-        os.makedirs(log_dir, exist_ok=True)
-        run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-        self._streaming_path = os.path.join(log_dir, f"samples_{run_ts}.jsonl")
-        self._streaming_file = open(self._streaming_path, "w", encoding="utf-8")
-        self._streaming_lock = threading.Lock()
-
-        print(f"\nYC-Bench eval matrix: {len(self.all_eval_items)} runs")
-        for item in self.all_eval_items:
-            print(f"  preset={item['preset']!r}  seed={item['seed']}")
-        print(f"Streaming results to: {self._streaming_path}\n")
-
-    def _save_result(self, result: Dict[str, Any]):
-        """Write a single run result to the streaming JSONL file immediately."""
-        if not hasattr(self, "_streaming_file") or self._streaming_file.closed:
-            return
-        with self._streaming_lock:
-            self._streaming_file.write(
-                json.dumps(result, ensure_ascii=False, default=str) + "\n"
-            )
-            self._streaming_file.flush()
-
-    # =========================================================================
-    # Training pipeline stubs (eval-only -- not used)
-    # =========================================================================
-
-    async def get_next_item(self):
-        item = self.all_eval_items[self.iter % len(self.all_eval_items)]
-        self.iter += 1
-        return item
-
-    def format_prompt(self, item: Dict[str, Any]) -> str:
-        preset = item["preset"]
-        seed = item["seed"]
-        return (
-            f"A new YC-Bench simulation has been initialized "
-            f"(preset='{preset}', seed={seed}).\n"
-            f"Your company '{self.config.company_name}' is ready.\n\n"
-            "Begin by calling:\n"
-            "1. `yc-bench company status` -- see your starting funds and prestige\n"
-            "2. `yc-bench employee list` -- see your team and their skills\n"
-            "3. `yc-bench market browse --required-prestige-lte 1` -- find tasks "
-            "you can take\n\n"
-            "Then accept 2-3 tasks, assign employees, dispatch them, and call "
-            "`yc-bench sim resume` to advance time. Repeat this loop until the "
-            "simulation ends (horizon reached or bankruptcy)."
-        )
-
-    async def compute_reward(self, item, result, ctx) -> float:
-        return 0.0
-
-    async def collect_trajectories(self, item):
-        return None, []
-
-    async def score(self, rollout_group_data):
-        return None
-
-    # =========================================================================
-    # Per-run evaluation
-    # =========================================================================
-
-    async def rollout_and_score_eval(self, eval_item: Dict[str, Any]) -> Dict:
-        """
-        Evaluate a single (preset, seed) run.
-
-        1. Sets DATABASE_URL and YC_BENCH_EXPERIMENT env vars
-        2. Initialises the simulation via ``yc-bench sim init`` (NOT ``run``)
-        3. Runs HermesAgentLoop with terminal tool
-        4. Reads SQLite DB to compute final score
-        5. Returns result dict with survival, funds, and composite score
-        """
-        preset = eval_item["preset"]
-        seed = eval_item["seed"]
-        run_id = str(uuid.uuid4())[:8]
-        run_key = f"{preset}_seed{seed}_{run_id}"
-
-        from tqdm import tqdm
-        tqdm.write(f"  [START] preset={preset!r} seed={seed} (run_id={run_id})")
-        run_start = time.time()
-
-        # Isolated DB per run -- prevents cross-run state leakage
-        db_path = os.path.join(self.config.db_dir, f"yc_bench_{run_key}.db")
-        os.environ["DATABASE_URL"] = f"sqlite:///{db_path}"
-        os.environ["YC_BENCH_EXPERIMENT"] = preset
-
-        # Determine horizon: explicit config override > preset lookup > default 1
-        horizon = self.config.horizon_years or _PRESET_HORIZONS.get(preset, 1)
-
-        try:
-            # ----------------------------------------------------------
-            # Step 1: Initialise the simulation via CLI
-            # IMPORTANT: We use `sim init`, NOT `yc-bench run`.
-            # `yc-bench run` starts yc-bench's own LLM agent loop (via
-            # LiteLLM), which would compete with our HermesAgentLoop.
-            # `sim init` just sets up the world and returns.
-            # ----------------------------------------------------------
-            init_cmd = [
-                "yc-bench", "sim", "init",
-                "--seed", str(seed),
-                "--start-date", self.config.start_date,
-                "--company-name", self.config.company_name,
-                "--horizon-years", str(horizon),
-            ]
-            init_result = subprocess.run(
-                init_cmd, capture_output=True, text=True, timeout=30,
-            )
-            if init_result.returncode != 0:
-                error_msg = (init_result.stderr or init_result.stdout).strip()
-                raise RuntimeError(f"yc-bench sim init failed: {error_msg}")
-
-            tqdm.write(f"    Simulation initialized (horizon={horizon}yr)")
-
-            # ----------------------------------------------------------
-            # Step 2: Run the HermesAgentLoop
-            # ----------------------------------------------------------
-            tools, valid_names = self._resolve_tools_for_group()
-
-            messages: List[Dict[str, Any]] = [
-                {"role": "system", "content": YC_BENCH_SYSTEM_PROMPT},
-                {"role": "user", "content": self.format_prompt(eval_item)},
-            ]
-
-            agent = HermesAgentLoop(
-                server=self.server,
-                tool_schemas=tools,
-                valid_tool_names=valid_names,
-                max_turns=self.config.max_agent_turns,
-                task_id=run_id,
-                temperature=self.config.agent_temperature,
-                max_tokens=self.config.max_token_length,
-                extra_body=self.config.extra_body,
-                budget_config=self.config.build_budget_config(),
-            )
-            result = await agent.run(messages)
-
-            # ----------------------------------------------------------
-            # Step 3: Read final score from the simulation DB
-            # ----------------------------------------------------------
-            score_data = _read_final_score(db_path)
-            final_funds = score_data["final_funds_cents"]
-            survived = score_data["survived"]
-            terminal_reason = score_data["terminal_reason"]
-
-            composite = _compute_composite_score(
-                final_funds_cents=final_funds,
-                survived=survived,
-                survival_weight=self.config.survival_weight,
-                funds_weight=self.config.funds_weight,
-            )
-
-            elapsed = time.time() - run_start
-            status = "SURVIVED" if survived else "BANKRUPT"
-            if final_funds >= 0:
-                funds_str = f"${final_funds / 100:,.0f}"
-            else:
-                funds_str = f"-${abs(final_funds) / 100:,.0f}"
-
-            tqdm.write(
-                f"  [{status}] preset={preset!r} seed={seed} "
-                f"funds={funds_str} score={composite:.3f} "
-                f"turns={result.turns_used} ({elapsed:.0f}s)"
-            )
-
-            out = {
-                "preset": preset,
-                "seed": seed,
-                "survived": survived,
-                "final_funds_cents": final_funds,
-                "final_funds_usd": final_funds / 100,
-                "terminal_reason": terminal_reason,
-                "composite_score": composite,
-                "turns_used": result.turns_used,
-                "finished_naturally": result.finished_naturally,
-                "elapsed_seconds": elapsed,
-                "db_path": db_path,
-                "messages": result.messages,
-            }
-            self._save_result(out)
-            return out
-
-        except Exception as e:
-            elapsed = time.time() - run_start
-            logger.error("Run %s failed: %s", run_key, e, exc_info=True)
-            tqdm.write(
-                f"  [ERROR] preset={preset!r} seed={seed}: {e} ({elapsed:.0f}s)"
-            )
-            out = {
-                "preset": preset,
-                "seed": seed,
-                "survived": False,
-                "final_funds_cents": 0,
-                "final_funds_usd": 0.0,
-                "terminal_reason": f"error: {e}",
-                "composite_score": 0.0,
-                "turns_used": 0,
-                "error": str(e),
-                "elapsed_seconds": elapsed,
-            }
-            self._save_result(out)
-            return out
-
-    # =========================================================================
-    # Evaluate
-    # =========================================================================
-
-    async def _run_with_timeout(self, item: Dict[str, Any]) -> Dict:
-        """Wrap a single rollout with a wall-clock timeout."""
-        preset = item["preset"]
-        seed = item["seed"]
-        try:
-            return await asyncio.wait_for(
-                self.rollout_and_score_eval(item),
-                timeout=self.config.run_timeout,
-            )
-        except asyncio.TimeoutError:
-            from tqdm import tqdm
-            tqdm.write(
-                f"  [TIMEOUT] preset={preset!r} seed={seed} "
-                f"(exceeded {self.config.run_timeout}s)"
-            )
-            out = {
-                "preset": preset,
-                "seed": seed,
-                "survived": False,
-                "final_funds_cents": 0,
-                "final_funds_usd": 0.0,
-                "terminal_reason": f"timeout ({self.config.run_timeout}s)",
-                "composite_score": 0.0,
-                "turns_used": 0,
-                "error": "timeout",
-            }
-            self._save_result(out)
-            return out
-
-    async def evaluate(self, *args, **kwargs) -> None:
-        """
-        Run YC-Bench evaluation over all (preset, seed) combinations.
-
-        Runs sequentially -- each run is 100-500 turns, parallelising would
-        be prohibitively expensive and cause env var conflicts.
-        """
-        start_time = time.time()
-        from tqdm import tqdm
-
-        # --- tqdm-compatible logging handler (TB2 pattern) ---
-        class _TqdmHandler(logging.Handler):
-            def emit(self, record):
-                try:
-                    tqdm.write(self.format(record))
-                except Exception:
-                    self.handleError(record)
-
-        root = logging.getLogger()
-        handler = _TqdmHandler()
-        handler.setFormatter(
-            logging.Formatter("%(levelname)s %(name)s: %(message)s")
-        )
-        root.handlers = [handler]
-        for noisy in ("httpx", "openai"):
-            logging.getLogger(noisy).setLevel(logging.WARNING)
-
-        # --- Print config summary ---
-        print(f"\n{'='*60}")
-        print("Starting YC-Bench Evaluation")
-        print(f"{'='*60}")
-        print(f"  Presets: {self.config.presets}")
-        print(f"  Seeds: {self.config.seeds}")
-        print(f"  Total runs: {len(self.all_eval_items)}")
-        print(f"  Max turns/run: {self.config.max_agent_turns}")
-        print(f"  Run timeout: {self.config.run_timeout}s")
-        print(f"{'='*60}\n")
-
-        results = []
-        pbar = tqdm(
-            total=len(self.all_eval_items), desc="YC-Bench", dynamic_ncols=True
-        )
-
-        try:
-            for item in self.all_eval_items:
-                result = await self._run_with_timeout(item)
-                results.append(result)
-                survived_count = sum(1 for r in results if r.get("survived"))
-                pbar.set_postfix_str(
-                    f"survived={survived_count}/{len(results)}"
-                )
-                pbar.update(1)
-
-        except (KeyboardInterrupt, asyncio.CancelledError):
-            tqdm.write("\n[INTERRUPTED] Stopping evaluation...")
-            pbar.close()
-            try:
-                from tools.terminal_tool import cleanup_all_environments
-                cleanup_all_environments()
-            except Exception:
-                pass
-            if hasattr(self, "_streaming_file") and not self._streaming_file.closed:
-                self._streaming_file.close()
-            return
-
-        pbar.close()
-        end_time = time.time()
-
-        # --- Compute metrics ---
-        valid = [r for r in results if r is not None]
-        if not valid:
-            print("Warning: No valid results.")
-            return
-
-        total = len(valid)
-        survived_total = sum(1 for r in valid if r.get("survived"))
-        survival_rate = survived_total / total if total else 0.0
-        avg_score = (
-            sum(r.get("composite_score", 0) for r in valid) / total
-            if total
-            else 0.0
-        )
-
-        preset_results: Dict[str, List[Dict]] = defaultdict(list)
-        for r in valid:
-            preset_results[r["preset"]].append(r)
-
-        eval_metrics = {
-            "eval/survival_rate": survival_rate,
-            "eval/avg_composite_score": avg_score,
-            "eval/total_runs": total,
-            "eval/survived_runs": survived_total,
-            "eval/evaluation_time_seconds": end_time - start_time,
-        }
-
-        for preset, items in sorted(preset_results.items()):
-            ps = sum(1 for r in items if r.get("survived"))
-            pt = len(items)
-            pa = (
-                sum(r.get("composite_score", 0) for r in items) / pt
-                if pt
-                else 0
-            )
-            key = preset.replace("-", "_")
-            eval_metrics[f"eval/survival_rate_{key}"] = ps / pt if pt else 0
-            eval_metrics[f"eval/avg_score_{key}"] = pa
-
-        self.eval_metrics = list(eval_metrics.items())
-
-        # --- Print summary ---
-        print(f"\n{'='*60}")
-        print("YC-Bench Evaluation Results")
-        print(f"{'='*60}")
-        print(
-            f"Overall survival rate: {survival_rate:.1%} "
-            f"({survived_total}/{total})"
-        )
-        print(f"Average composite score: {avg_score:.4f}")
-        print(f"Evaluation time: {end_time - start_time:.1f}s")
-
-        print("\nPer-preset breakdown:")
-        for preset, items in sorted(preset_results.items()):
-            ps = sum(1 for r in items if r.get("survived"))
-            pt = len(items)
-            pa = (
-                sum(r.get("composite_score", 0) for r in items) / pt
-                if pt
-                else 0
-            )
-            print(f"  {preset}: {ps}/{pt} survived  avg_score={pa:.4f}")
-            for r in items:
-                status = "SURVIVED" if r.get("survived") else "BANKRUPT"
-                funds = r.get("final_funds_usd", 0)
-                print(
-                    f"    seed={r['seed']}  [{status}]  "
-                    f"${funds:,.0f}  "
-                    f"score={r.get('composite_score', 0):.3f}"
-                )
-
-        print(f"{'='*60}\n")
-
-        # --- Log results ---
-        samples = [
-            {k: v for k, v in r.items() if k != "messages"} for r in valid
-        ]
-
-        try:
-            await self.evaluate_log(
-                metrics=eval_metrics,
-                samples=samples,
-                start_time=start_time,
-                end_time=end_time,
-                generation_parameters={
-                    "temperature": self.config.agent_temperature,
-                    "max_tokens": self.config.max_token_length,
-                    "max_agent_turns": self.config.max_agent_turns,
-                },
-            )
-        except Exception as e:
-            print(f"Error logging results: {e}")
-
-        # --- Cleanup (TB2 pattern) ---
-        if hasattr(self, "_streaming_file") and not self._streaming_file.closed:
-            self._streaming_file.close()
-            print(f"Results saved to: {self._streaming_path}")
-
-        try:
-            from tools.terminal_tool import cleanup_all_environments
-            cleanup_all_environments()
-        except Exception:
-            pass
-
-        try:
-            from environments.agent_loop import _tool_executor
-            _tool_executor.shutdown(wait=False, cancel_futures=True)
-        except Exception:
-            pass
-
-    # =========================================================================
-    # Wandb logging
-    # =========================================================================
-
-    async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
-        """Log YC-Bench-specific metrics to wandb."""
-        if wandb_metrics is None:
-            wandb_metrics = {}
-        for k, v in self.eval_metrics:
-            wandb_metrics[k] = v
-        self.eval_metrics = []
-        await super().wandb_log(wandb_metrics)
-
-
-if __name__ == "__main__":
-    YCBenchEvalEnv.cli()
@@ -1,714 +0,0 @@
-"""
-HermesAgentBaseEnv -- Abstract Base Environment for Hermes-Agent + Atropos
-
-Provides the Atropos integration plumbing that all hermes-agent environments share:
- Two-mode operation (OpenAI server for Phase 1, VLLM ManagedServer for Phase 2)
- Per-group toolset/distribution resolution
- Agent loop orchestration via HermesAgentLoop
- ToolContext creation for reward functions
- ScoredDataGroup construction from ManagedServer state
-
-Subclasses only need to implement:
-    setup()           -- Load dataset, initialize state
-    get_next_item()   -- Return the next item from the dataset
-    format_prompt()   -- Convert a dataset item into the user message
-    compute_reward()  -- Score the rollout (has full ToolContext access)
-    evaluate()        -- Periodic evaluation
-"""
-
-import asyncio
-import json
-import logging
-import os
-import sys
-import uuid
-from abc import abstractmethod
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
-
-# Ensure the hermes-agent repo root is on sys.path so that imports like
-# `from model_tools import ...` and `from environments.X import ...` work
-# regardless of where the script is invoked from.
-_repo_root = Path(__file__).resolve().parent.parent
-if str(_repo_root) not in sys.path:
-    sys.path.insert(0, str(_repo_root))
-
-from dotenv import load_dotenv
-from pydantic import Field
-
-# Load API keys from hermes-agent/.env so all environments can access them
-_env_path = _repo_root / ".env"
-if _env_path.exists():
-    load_dotenv(dotenv_path=_env_path)
-
-# Apply monkey patches for async-safe tool operation inside Atropos's event loop.
-# This patches SwerexModalEnvironment to use a background thread instead of
-# asyncio.run(), which would deadlock inside Atropos. Safe for normal CLI too.
-from environments.patches import apply_patches
-apply_patches()
-
-from atroposlib.envs.base import (
-    BaseEnv,
-    BaseEnvConfig,
-    ScoredDataGroup,
-    ScoredDataItem,
-)
-from atroposlib.envs.server_handling.server_manager import (
-    APIServerConfig,
-    ServerBaseline,
-    ServerManager,
-)
-from atroposlib.type_definitions import Item
-
-from environments.agent_loop import AgentResult, HermesAgentLoop
-from environments.tool_context import ToolContext
-from tools.budget_config import (
-    DEFAULT_RESULT_SIZE_CHARS,
-    DEFAULT_TURN_BUDGET_CHARS,
-    DEFAULT_PREVIEW_SIZE_CHARS,
-)
-
-# Import hermes-agent toolset infrastructure
-from model_tools import get_tool_definitions
-from toolset_distributions import sample_toolsets_from_distribution
-
-logger = logging.getLogger(__name__)
-
-
-class HermesAgentEnvConfig(BaseEnvConfig):
-    """
-    Configuration for hermes-agent Atropos environments.
-
-    Extends BaseEnvConfig with agent-specific settings for toolsets,
-    terminal backend, dataset loading, and tool call parsing.
-    """
-
-    # --- Toolset configuration ---
-    # Mutually exclusive: use either enabled_toolsets OR distribution
-    enabled_toolsets: Optional[List[str]] = Field(
-        default=None,
-        description="Explicit list of hermes toolsets to enable (e.g., ['terminal', 'file', 'web']). "
-        "If None and distribution is also None, all available toolsets are enabled.",
-    )
-    disabled_toolsets: Optional[List[str]] = Field(
-        default=None,
-        description="Toolsets to disable. Applied as a filter on top of enabled_toolsets or distribution.",
-    )
-    distribution: Optional[str] = Field(
-        default=None,
-        description="Name of a toolset distribution from toolset_distributions.py "
-        "(e.g., 'development', 'terminal_tasks'). Sampled once per group. "
-        "Mutually exclusive with enabled_toolsets.",
-    )
-
-    # --- Agent loop configuration ---
-    max_agent_turns: int = Field(
-        default=30,
-        description="Maximum number of LLM calls (tool-calling iterations) per rollout.",
-    )
-    system_prompt: Optional[str] = Field(
-        default=None,
-        description="System prompt for the agent. Tools are handled via the tools= parameter, "
-        "not embedded in the prompt text.",
-    )
-    agent_temperature: float = Field(
-        default=1.0,
-        description="Sampling temperature for agent generation during rollouts.",
-    )
-
-    # --- Terminal backend ---
-    terminal_backend: str = Field(
-        default="local",
-        description="Terminal backend: 'local', 'docker', 'modal', 'daytona', 'ssh', 'singularity'. "
-        "Modal or Daytona recommended for production RL (cloud isolation per rollout).",
-    )
-    terminal_timeout: int = Field(
-        default=120,
-        description="Per-command timeout in seconds for terminal tool calls. "
-        "Commands exceeding this are killed. Increase for tasks with long-running "
-        "commands (compilation, pip install, etc.).",
-    )
-    terminal_lifetime: int = Field(
-        default=3600,
-        description="Sandbox inactivity lifetime in seconds. The cleanup thread kills "
-        "sandboxes that have been idle longer than this. Must be longer than "
-        "the longest gap between tool calls (e.g., waiting for LLM response).",
-    )
-
-    # --- Dataset ---
-    dataset_name: Optional[str] = Field(
-        default=None,
-        description="HuggingFace dataset name. Optional if tasks are defined inline.",
-    )
-    dataset_split: str = Field(
-        default="train",
-        description="Dataset split to use.",
-    )
-    prompt_field: str = Field(
-        default="prompt",
-        description="Which field in the dataset contains the prompt.",
-    )
-
-    # --- Thread pool ---
-    tool_pool_size: int = Field(
-        default=128,
-        description="Thread pool size for tool execution. Each concurrent task needs a "
-        "thread for tool calls. Must be large enough for parallel evaluation. "
-        "Too small = thread pool starvation.",
-    )
-
-    # --- Phase 2: Tool call parsing ---
-    tool_call_parser: str = Field(
-        default="hermes",
-        description="Tool call parser name for Phase 2 (VLLM server type). "
-        "Ignored in Phase 1 (OpenAI server type where VLLM parses natively). "
-        "Options: hermes, mistral, llama3_json, qwen, deepseek_v3, etc.",
-    )
-
-    # --- Tool result budget ---
-    # Defaults imported from tools.budget_config (single source of truth).
-    default_result_size_chars: int = Field(
-        default=DEFAULT_RESULT_SIZE_CHARS,
-        description="Default per-tool threshold (chars) for persisting large results "
-        "to sandbox. Results exceeding this are written to /tmp/hermes-results/ "
-        "and replaced with a preview. Per-tool registry values take precedence "
-        "unless overridden via tool_result_overrides.",
-    )
-    turn_budget_chars: int = Field(
-        default=DEFAULT_TURN_BUDGET_CHARS,
-        description="Aggregate char budget per assistant turn. If all tool results "
-        "in a single turn exceed this, the largest are persisted to disk first.",
-    )
-    preview_size_chars: int = Field(
-        default=DEFAULT_PREVIEW_SIZE_CHARS,
-        description="Size of the inline preview shown after a tool result is persisted.",
-    )
-    tool_result_overrides: Optional[Dict[str, int]] = Field(
-        default=None,
-        description="Per-tool threshold overrides (chars). Keys are tool names, "
-        "values are char thresholds. Overrides both the default and registry "
-        "per-tool values. Example: {'terminal': 10000, 'search_files': 5000}. "
-        "Note: read_file is pinned to infinity and cannot be overridden.",
-    )
-
-    # --- Provider-specific parameters ---
-    # Passed as extra_body to the OpenAI client's chat.completions.create() call.
-    # Useful for OpenRouter provider preferences, transforms, route settings, etc.
-    # Example YAML:
-    #   extra_body:
-    #     provider:
-    #       ignore: ["DeepInfra", "Fireworks"]
-    #       order: ["Together"]
-    #     transforms: ["middle-out"]
-    extra_body: Optional[Dict[str, Any]] = Field(
-        default=None,
-        description="Extra body parameters passed to the OpenAI client's "
-        "chat.completions.create(). Used for OpenRouter provider preferences, "
-        "transforms, and other provider-specific settings.",
-    )
-
-    def build_budget_config(self):
-        """Build a BudgetConfig from env config fields."""
-        from tools.budget_config import BudgetConfig
-        return BudgetConfig(
-            default_result_size=self.default_result_size_chars,
-            turn_budget=self.turn_budget_chars,
-            preview_size=self.preview_size_chars,
-            tool_overrides=dict(self.tool_result_overrides) if self.tool_result_overrides else {},
-        )
-
-
-class HermesAgentBaseEnv(BaseEnv):
-    """
-    Abstract base environment for hermes-agent Atropos integration.
-
-    Handles two modes of operation:
-    - Phase 1 (OpenAI server type): Uses server.chat_completion() directly.
-      The server (VLLM, SGLang, OpenRouter, OpenAI) handles tool call parsing
-      and reasoning extraction natively. DummyManagedServer provides placeholder
-      tokens. Good for SFT data gen, verifier testing, evaluation.
-
-    - Phase 2 (VLLM server type): Uses ManagedServer for exact token IDs + logprobs
-      via /generate. Client-side tool call parser reconstructs structured tool_calls
-      from raw output. Full RL training capability.
-
-    Subclasses must implement:
-        setup()           -- Load dataset, initialize state
-        get_next_item()   -- Return the next item to roll out
-        format_prompt()   -- Convert a dataset item into the user message string
-        compute_reward()  -- Score the rollout using ToolContext
-        evaluate()        -- Periodic evaluation
-    """
-
-    name: Optional[str] = "hermes-agent"
-    env_config_cls = HermesAgentEnvConfig
-
-    def __init__(
-        self,
-        config: HermesAgentEnvConfig,
-        server_configs: Union[ServerBaseline, List[APIServerConfig]],
-        slurm=False,
-        testing=False,
-    ):
-        super().__init__(config, server_configs, slurm, testing)
-
-        # Set terminal environment variables so hermes tools pick them up.
-        # These can all be overridden per-environment via config fields instead
-        # of requiring users to set shell env vars.
-        if config.terminal_backend:
-            os.environ["TERMINAL_ENV"] = config.terminal_backend
-        os.environ["TERMINAL_TIMEOUT"] = str(config.terminal_timeout)
-        os.environ["TERMINAL_LIFETIME_SECONDS"] = str(config.terminal_lifetime)
-        print(
-            f"🖥️  Terminal: backend={config.terminal_backend}, "
-            f"timeout={config.terminal_timeout}s, lifetime={config.terminal_lifetime}s"
-        )
-
-        # Resize the agent loop's thread pool for tool execution.
-        # This must be large enough for the number of concurrent tasks
-        # (e.g., 89 parallel TB2 eval tasks each need a thread for tool calls).
-        from environments.agent_loop import resize_tool_pool
-        resize_tool_pool(config.tool_pool_size)
-
-        # Set tool_parser on the ServerManager so ManagedServer uses it
-        # for bidirectional tool call translation (raw text ↔ OpenAI tool_calls).
-        if hasattr(self.server, 'tool_parser'):
-            self.server.tool_parser = config.tool_call_parser
-            print(f"🔧 Tool parser: {config.tool_call_parser}")
-
-        # Current group's resolved tools (set in collect_trajectories)
-        self._current_group_tools: Optional[Tuple[List[Dict], Set[str]]] = None
-
-        # Tool error tracking for wandb logging
-        self._tool_error_buffer: List[Dict[str, Any]] = []
-
-    # =========================================================================
-    # Toolset resolution (per-group)
-    # =========================================================================
-
-    def _resolve_tools_for_group(self) -> Tuple[List[Dict[str, Any]], Set[str]]:
-        """
-        Resolve toolsets for a group. Called once in collect_trajectories(),
-        then shared by all collect_trajectory() calls in the group.
-
-        If distribution is set, samples probabilistically.
-        If enabled_toolsets is set, uses that explicit list.
-        disabled_toolsets is applied as a filter on top.
-
-        Returns:
-            (tool_schemas, valid_tool_names) tuple
-        """
-        config = self.config
-
-        if config.distribution:
-            group_toolsets = sample_toolsets_from_distribution(config.distribution)
-            logger.info("Sampled toolsets from '%s': %s", config.distribution, group_toolsets)
-        else:
-            group_toolsets = config.enabled_toolsets  # None means "all available"
-            if group_toolsets is None:
-                logger.warning(
-                    "enabled_toolsets is None -- loading ALL tools including messaging. "
-                    "Set explicit enabled_toolsets for RL training."
-                )
-
-        tools = get_tool_definitions(
-            enabled_toolsets=group_toolsets,
-            disabled_toolsets=config.disabled_toolsets,
-            quiet_mode=True,
-        )
-
-        valid_names = {t["function"]["name"] for t in tools} if tools else set()
-        logger.info("Resolved %d tools for group: %s", len(valid_names), sorted(valid_names))
-        return tools, valid_names
-
-    # =========================================================================
-    # Server mode detection
-    # =========================================================================
-
-    def _use_managed_server(self) -> bool:
-        """
-        Decide between ManagedServer (Phase 2) and direct server (Phase 1) mode.
-
-        Only the first configured server is inspected:
-          * no servers configured      -> False (direct mode)
-          * first server is OpenAIServer -> False (direct mode,
-            /v1/chat/completions with native tool call parsing)
-          * anything else (e.g. 'vllm' / 'sglang' server types) -> True,
-            i.e. go through ManagedServer for exact token tracking.
-        """
-        servers = self.server.servers
-        if servers:
-            # Imported lazily, and only once we know there is a server to test.
-            from atroposlib.envs.server_handling.openai_server import OpenAIServer
-
-            first = servers[0]
-            return not isinstance(first, OpenAIServer)
-        return False
-
-    # =========================================================================
-    # Core Atropos integration
-    # =========================================================================
-
-    async def collect_trajectories(
-        self, item: Item
-    ) -> Tuple[
-        Union[Optional[ScoredDataGroup], List[Optional[ScoredDataGroup]]],
-        List[Item],
-    ]:
-        """
-        Resolve the group's toolsets once, then run the stock group collection.
-
-        The resolved (tool_schemas, valid_tool_names) pair is stored on
-        ``self._current_group_tools`` so that each collect_trajectory() call
-        made by the parent implementation can reuse it instead of resolving
-        (and, with a distribution, re-sampling) per rollout.
-
-        NOTE(review): the pair lives on the instance, so overlapping groups
-        would overwrite each other's selection -- confirm groups never overlap.
-        """
-        group_tools = self._resolve_tools_for_group()
-        self._current_group_tools = group_tools
-
-        # The parent implementation fans out to collect_trajectory().
-        outcome = await super().collect_trajectories(item)
-        return outcome
-
-    # =========================================================================
-    # Wandb rollout display -- format trajectories nicely
-    # =========================================================================
-
-    @staticmethod
-    def _format_trajectory_for_display(messages: List[Dict[str, Any]]) -> str:
-        """
-        Format a conversation's messages into a readable trajectory string
-        for wandb rollout tables. Shows tool calls, tool results, and reasoning
-        in a structured way instead of raw token decoding.
-
-        Tolerates the shapes chat APIs commonly emit: ``content`` may be None
-        (e.g. an assistant turn that only carries tool calls), ``tool_calls``
-        may be None, and tool-call ``arguments`` may be a non-string object.
-
-        Args:
-            messages: Chat messages as dicts with a "role" key.
-
-        Returns:
-            The rendered sections joined by blank lines ("" for no messages).
-            Messages with an unrecognised role are skipped.
-        """
-
-        def _clip(value: Any, limit: int) -> str:
-            """Coerce *value* to text and truncate it to *limit* characters."""
-            if value is None:
-                text = ""
-            elif isinstance(value, str):
-                text = value
-            else:
-                text = str(value)
-            return text[:limit] + "..." if len(text) > limit else text
-
-        parts = []
-        for msg in messages:
-            role = msg.get("role", "unknown")
-            # Normalise an explicit None to "" so it never renders as "None".
-            content = msg.get("content")
-            if content is None:
-                content = ""
-
-            if role == "system":
-                parts.append(f"[SYSTEM]\n{content}")
-
-            elif role == "user":
-                parts.append(f"[USER]\n{content}")
-
-            elif role == "assistant":
-                # Show reasoning if present (truncated for display)
-                reasoning = msg.get("reasoning_content", "")
-                if reasoning:
-                    parts.append(f"[ASSISTANT thinking]\n{_clip(reasoning, 300)}")
-
-                # Show content
-                if content:
-                    parts.append(f"[ASSISTANT]\n{content}")
-
-                # Show tool calls; "tool_calls": None is treated like no calls
-                for tc in msg.get("tool_calls") or []:
-                    func = tc.get("function") or {}
-                    name = func.get("name", "?")
-                    args = _clip(func.get("arguments", "{}"), 200)
-                    parts.append(f"[TOOL CALL] {name}({args})")
-
-            elif role == "tool":
-                # Truncate long tool results for display
-                parts.append(f"[TOOL RESULT] {_clip(content, 500)}")
-
-        return "\n\n".join(parts)
-
-    async def add_rollouts_for_wandb(
-        self,
-        scored_data,
-        item=None,
-    ):
-        """
-        Record a group's rollouts for wandb as (text, score) pairs.
-
-        Each rollout's text is the structured rendering of its messages when
-        they are present, else the decoded tokens, else "(no data)". At most
-        ``num_rollouts_per_group_for_logging`` rollouts are kept per group
-        (-1 means ``group_size``), and the oldest stored group is dropped once
-        more than ``num_rollouts_to_keep`` groups are held.
-        """
-        cfg = self.config
-        limit = cfg.num_rollouts_per_group_for_logging
-        if limit == -1:
-            limit = cfg.group_size
-
-        scores = scored_data.get("scores", [])
-        all_messages = scored_data.get("messages")
-        all_tokens = scored_data.get("tokens")
-
-        rows = []
-        for idx in range(min(limit, len(scores))):
-            # Prefer the rich message view when this rollout has one.
-            convo = None
-            if all_messages and idx < len(all_messages):
-                convo = all_messages[idx]
-
-            if convo:
-                rendered = self._format_trajectory_for_display(convo)
-            elif all_tokens and idx < len(all_tokens):
-                rendered = self.tokenizer.decode(all_tokens[idx])
-            else:
-                rendered = "(no data)"
-
-            rows.append((rendered, scores[idx]))
-
-        history = self.rollouts_for_wandb
-        history.append(rows)
-        if len(history) > cfg.num_rollouts_to_keep:
-            history.pop(0)
-
-    async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
-        """
-        Add tool-error stats to the metrics, then defer to the parent logger.
-
-        Always sets "train/tool_errors_count". When errors are buffered it
-        also sets "train/tool_error_details" (one summary line per error),
-        echoes each line to stdout, and empties the buffer.
-        """
-        metrics = {} if wandb_metrics is None else wandb_metrics
-        pending = self._tool_error_buffer
-
-        if not pending:
-            metrics["train/tool_errors_count"] = 0
-        else:
-            metrics["train/tool_errors_count"] = len(pending)
-
-            # A plain string summary is logged (tables can crash wandb on tmp cleanup)
-            lines = [
-                f"[turn {err['turn']}] {err['tool']}({err['args'][:80]}) -> {err['error'][:150]}"
-                for err in pending
-            ]
-            metrics["train/tool_error_details"] = "\n".join(lines)
-
-            # Mirror to stdout for immediate visibility
-            for line in lines:
-                print(f"  Tool Error: {line}")
-
-            self._tool_error_buffer = []
-
-        await super().wandb_log(metrics)
-
-    async def collect_trajectory(
-        self, item: Item
-    ) -> Tuple[Optional[Union[ScoredDataItem, Any]], List[Item]]:
-        """
-        Run a single rollout: agent loop + reward computation.
-
-        This is called group_size times in parallel by collect_trajectories().
-        Each call gets its own task_id for terminal/browser session isolation.
-
-        Args:
-            item: Dataset item; turned into the user message via format_prompt().
-
-        Returns:
-            (scored_item, []) where scored_item is a dict holding "tokens",
-            "masks", "scores" (the reward) and "messages" (the conversation).
-        """
-        task_id = str(uuid.uuid4())
-
-        # Get group-level tools (resolved once in collect_trajectories)
-        if self._current_group_tools is None:
-            # Fallback: resolve per-trajectory if called outside collect_trajectories
-            tools, valid_names = self._resolve_tools_for_group()
-        else:
-            tools, valid_names = self._current_group_tools
-
-        # Build initial messages
-        messages: List[Dict[str, Any]] = []
-        if self.config.system_prompt:
-            messages.append({"role": "system", "content": self.config.system_prompt})
-        messages.append({"role": "user", "content": self.format_prompt(item)})
-
-        def _make_agent(server: Any) -> HermesAgentLoop:
-            """Build the agent loop; only the backing server differs per mode."""
-            return HermesAgentLoop(
-                server=server,
-                tool_schemas=tools,
-                valid_tool_names=valid_names,
-                max_turns=self.config.max_agent_turns,
-                task_id=task_id,
-                temperature=self.config.agent_temperature,
-                max_tokens=self.config.max_token_length,
-                extra_body=self.config.extra_body,
-                budget_config=self.config.build_budget_config(),
-            )
-
-        # Run the agent loop
-        result: AgentResult
-        if self._use_managed_server():
-            # Phase 2: ManagedServer with ToolCallTranslator -- exact tokens + logprobs
-            # tool_parser is set on ServerManager in __init__ and passed through
-            # to ManagedServer, which uses ToolCallTranslator for bidirectional
-            # translation between raw text and OpenAI tool_calls.
-            try:
-                async with self.server.managed_server(
-                    tokenizer=self.tokenizer,
-                    preserve_think_blocks=bool(self.config.thinking_mode),
-                ) as managed:
-                    result = await _make_agent(managed).run(messages)
-            except NotImplementedError:
-                # DummyManagedServer not allowed -- fall back to Phase 1
-                logger.warning(
-                    "ManagedServer not available (OpenAI server?). "
-                    "Falling back to direct server mode."
-                )
-                result = await _make_agent(self.server).run(messages)
-        else:
-            # Phase 1: OpenAI server -- native tool_calls, placeholder tokens
-            result = await _make_agent(self.server).run(messages)
-
-        # Skip reward computation if the agent loop produced no meaningful work
-        # (e.g., API call failed on turn 1). No point spinning up a Modal sandbox
-        # just to verify files that were never created.
-        only_system_and_user = all(
-            msg.get("role") in {"system", "user"} for msg in result.messages
-        )
-        if result.turns_used == 0 or only_system_and_user:
-            logger.warning(
-                "Agent loop produced no output (turns=%d, msgs=%d). Skipping reward.",
-                result.turns_used, len(result.messages),
-            )
-            reward = 0.0
-        else:
-            # Compute reward using ToolContext (gives verifier full tool access)
-            ctx = ToolContext(task_id)
-            try:
-                reward = await self.compute_reward(item, result, ctx)
-            except Exception as e:
-                # A failing verifier scores the rollout 0.0 rather than aborting it.
-                logger.error("compute_reward failed: %s", e)
-                reward = 0.0
-            finally:
-                ctx.cleanup()
-
-        # Track tool errors for wandb logging
-        if result.tool_errors:
-            for err in result.tool_errors:
-                self._tool_error_buffer.append({
-                    "turn": err.turn,
-                    "tool": err.tool_name,
-                    "args": err.arguments[:150],
-                    "error": err.error[:300],
-                    "result": err.tool_result[:300],
-                })
-
-        # Build ScoredDataItem from ManagedServer state
-        # Phase 2: real tokens/masks/logprobs from SequenceNodes
-        # Phase 1: placeholder tokens (still need a valid ScoredDataItem for the pipeline)
-        nodes = (result.managed_state or {}).get("nodes", [])
-
-        if nodes:
-            # Phase 2 (or DummyManagedServer): use actual node data
-            node = nodes[-1]  # Final sequence node = full trajectory
-            scored_item: Dict[str, Any] = {
-                "tokens": node.tokens,
-                "masks": node.masked_tokens,
-                "scores": reward,
-            }
-
-            # Include logprobs if available (Phase 2)
-            if hasattr(node, "logprobs") and node.logprobs:
-                scored_item["advantages"] = None  # Computed by trainer
-                scored_item["ref_logprobs"] = None
-        else:
-            # Phase 1 with no managed state: create placeholder tokens
-            # so the data pipeline doesn't break. These are NOT suitable
-            # for training but allow process mode (SFT data gen) to work.
-            # Tokenize the full conversation to get approximate tokens.
-            full_text = "\n".join(
-                msg.get("content", "") for msg in result.messages if msg.get("content")
-            )
-            if self.tokenizer:
-                tokens = self.tokenizer.encode(full_text, add_special_tokens=True)
-            else:
-                tokens = list(range(min(len(full_text) // 4, 128)))
-
-            # Mask the first token as prompt. With no tokens at all the mask
-            # must be empty too, otherwise masks and tokens differ in length.
-            masks = ([-100] + tokens[1:]) if tokens else []
-
-            scored_item = {
-                "tokens": tokens,
-                "masks": masks,
-                "scores": reward,
-            }
-
-        # Always include messages for wandb rollout display and data logging
-        scored_item["messages"] = result.messages
-
-        return scored_item, []
-
-    # =========================================================================
-    # Abstract methods -- subclasses must implement
-    # =========================================================================
-
-    @abstractmethod
-    async def setup(self):
-        """
-        Prepare the environment: load the dataset and initialise any state.
-
-        Invoked once at environment start. A typical override looks like:
-            self.dataset = load_dataset(self.config.dataset_name, split=self.config.dataset_split)
-            self.iter = 0
-        """
-        raise NotImplementedError()
-
-    @abstractmethod
-    async def get_next_item(self) -> Item:
-        """
-        Produce the next dataset item to roll out.
-
-        The base env's main loop calls this to feed its workers, so
-        implementations should cycle through the dataset.
-        """
-        raise NotImplementedError()
-
-    @abstractmethod
-    def format_prompt(self, item: Item) -> str:
-        """
-        Turn a dataset item into the user message shown to the agent.
-
-        Args:
-            item: Dataset item (dict, tuple, etc.)
-
-        Returns:
-            The prompt string; collect_trajectory() sends it as the "user"
-            message of the conversation.
-        """
-        raise NotImplementedError()
-
-    @abstractmethod
-    async def compute_reward(
-        self, item: Item, result: AgentResult, ctx: ToolContext
-    ) -> float:
-        """
-        Score one rollout.
-
-        The verifier is given everything it could need:
-        - item: the original dataset item (ground truth, test commands, etc.)
-        - result: AgentResult with full messages, turn count, reasoning, etc.
-        - ctx: ToolContext -- call ANY hermes-agent tool (terminal, file, web,
-               browser, vision...) scoped to this rollout's sandbox. Nothing
-               is off-limits.
-
-        Args:
-            item: The dataset item that was rolled out
-            result: The agent's rollout result
-            ctx: ToolContext with full tool access for verification
-
-        Returns:
-            Reward float (typically 0.0 to 1.0, but any float is valid)
-        """
-        raise NotImplementedError()
-
-    @abstractmethod
-    async def evaluate(self, *args, **kwargs):
-        """
-        Run the periodic evaluation (every steps_per_eval steps).
-
-        A typical override runs the agent over a held-out eval set and
-        reports metrics via wandb/evaluate_log.
-        """
-        raise NotImplementedError()
@@ -1,34 +0,0 @@
-# SWE Environment -- Default Configuration
-#
-# SWE-bench style tasks with Modal sandboxes for cloud isolation.
-# Uses terminal + file + web toolsets.
-#
-# Usage:
-#   python environments/hermes_swe_env/hermes_swe_env.py serve \
-#       --config environments/hermes_swe_env/default.yaml
-
-# Environment settings.
-env:
-  # Toolsets exposed to the agent: terminal for running code, file for
-  # reading/writing, web for docs.
-  enabled_toolsets: ["terminal", "file", "web"]
-  # Per-rollout turn and token limits.
-  max_agent_turns: 30
-  max_token_length: 4096
-  group_size: 4
-  # Modal backend for cloud-isolated sandboxes.
-  terminal_backend: "modal"
-  tool_call_parser: "hermes"
-  tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
-  # Dataset to draw tasks from; prompt_field names the column used as the prompt.
-  dataset_name: "bigcode/humanevalpack"
-  dataset_split: "test"
-  prompt_field: "prompt"
-  steps_per_eval: 50
-  total_steps: 500
-  use_wandb: true
-  wandb_name: "hermes-swe"
-  system_prompt: >
-    You are a skilled software engineer. You have access to a terminal,
-    file tools, and web search. Use these tools to complete the coding task.
-    Write clean, working code and verify it runs correctly before finishing.
-
-# Inference server settings.
-openai:
-  base_url: "http://localhost:8000/v1"
-  model_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
-  # "openai" selects the direct chat-completions path.
-  server_type: "openai"
-  api_key: ""
@@ -1,229 +0,0 @@
-"""
-HermesSweEnv -- SWE-Bench Style Environment with Modal Sandboxes
-
-A concrete environment for software engineering tasks where the model writes code
-and the reward function runs tests to verify correctness. Uses Modal terminal
-backend for cloud-isolated sandboxes per rollout.
-
-The reward function uses ToolContext.terminal() to run test commands in the same
-Modal sandbox the model used during its agentic loop. All filesystem state from
-the model's tool calls is preserved for verification.
-
-Usage:
-    # Phase 1: OpenAI server type
-    vllm serve YourModel --tool-parser hermes
-    run-api
-    python environments/hermes_swe_env/hermes_swe_env.py serve \\
-        --openai.base_url http://localhost:8000/v1 \\
-        --openai.model_name YourModel \\
-        --openai.server_type openai \\
-        --env.dataset_name bigcode/humanevalpack \\
-        --env.terminal_backend modal
-
-    # Phase 2: VLLM server type (full RL training)
-    python environments/hermes_swe_env/hermes_swe_env.py serve \\
-        --openai.base_url http://localhost:8000/v1 \\
-        --openai.model_name YourModel \\
-        --openai.server_type vllm \\
-        --env.tool_call_parser hermes \\
-        --env.terminal_backend modal
-"""
-
-import logging
-import sys
-import time
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-# Ensure repo root is on sys.path for imports
-_repo_root = Path(__file__).resolve().parent.parent.parent
-if str(_repo_root) not in sys.path:
-    sys.path.insert(0, str(_repo_root))
-
-from datasets import load_dataset
-
-from atroposlib.envs.base import ScoredDataGroup
-from atroposlib.envs.server_handling.server_manager import APIServerConfig
-from atroposlib.type_definitions import Item
-
-from environments.agent_loop import AgentResult
-from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
-from environments.tool_context import ToolContext
-
-logger = logging.getLogger(__name__)
-
-
-class HermesSweEnvConfig(HermesAgentEnvConfig):
-    """Config for SWE-bench style tasks.
-
-    Declares no fields of its own: everything is inherited, and the
-    SWE-specific default values are supplied in HermesSweEnv.config_init().
-    """
-
-
-class HermesSweEnv(HermesAgentBaseEnv):
-    """
-    SWE-bench style environment using Modal terminal backend.
-
-    The model gets a coding task, uses terminal + file + web tools to solve it,
-    and the reward function runs tests in the same Modal sandbox to verify.
-
-    Subclass this for specific SWE datasets (HumanEval, SWE-bench, etc.)
-    and customize format_prompt() and compute_reward() as needed.
-    """
-
-    name = "hermes-swe"
-    env_config_cls = HermesSweEnvConfig
-
-    @classmethod
-    def config_init(cls) -> Tuple[HermesSweEnvConfig, List[APIServerConfig]]:
-        """
-        Default configuration for the SWE environment.
-
-        Uses Modal terminal backend for cloud isolation and terminal + file + web toolsets.
-
-        Returns:
-            (env_config, server_configs) -- the environment config and a
-            one-element list describing the inference server.
-        """
-        env_config = HermesSweEnvConfig(
-            # Toolsets: terminal for running code, file for reading/writing, web for docs
-            enabled_toolsets=["terminal", "file", "web"],
-            disabled_toolsets=None,
-            distribution=None,
-            # Agent settings -- SWE tasks need more turns
-            max_agent_turns=30,
-            max_token_length=4096,
-            agent_temperature=1.0,
-            system_prompt=(
-                "You are a skilled software engineer. You have access to a terminal, "
-                "file tools, and web search. Use these tools to complete the coding task. "
-                "Write clean, working code and verify it runs correctly before finishing."
-            ),
-            # Modal backend for cloud-isolated sandboxes
-            terminal_backend="modal",
-            # Dataset -- override via CLI for your specific SWE dataset
-            dataset_name="bigcode/humanevalpack",
-            dataset_split="test",
-            prompt_field="prompt",
-            # Atropos settings
-            group_size=4,
-            tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
-            tool_call_parser="hermes",
-            steps_per_eval=50,
-            total_steps=500,
-            use_wandb=True,
-            wandb_name="hermes-swe",
-        )
-
-        server_configs = [
-            APIServerConfig(
-                base_url="http://localhost:8000/v1",
-                model_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
-                server_type="openai",  # Phase 1; switch to "vllm" for Phase 2
-                api_key="",
-            )
-        ]
-
-        return env_config, server_configs
-
-    async def setup(self):
-        """Load the SWE dataset and reset the iteration / reward state."""
-        if self.config.dataset_name:
-            self.dataset = load_dataset(
-                self.config.dataset_name, split=self.config.dataset_split
-            )
-        else:
-            # Placeholder if no dataset specified
-            self.dataset = []
-        self.iter = 0
-        self.reward_buffer: List[float] = []
-
-    async def get_next_item(self) -> Dict[str, Any]:
-        """
-        Cycle through the SWE dataset.
-
-        Raises:
-            ValueError: If no (or an empty) dataset is loaded.
-        """
-        if not self.dataset:
-            raise ValueError("No dataset loaded. Set dataset_name in config.")
-        # Modulo wraps the counter so iteration restarts after the last item.
-        item = self.dataset[self.iter % len(self.dataset)]
-        self.iter += 1
-        return item
-
-    def format_prompt(self, item: Dict[str, Any]) -> str:
-        """
-        Format the SWE task prompt.
-
-        Override this in subclasses for different dataset formats.
-        Default reads the configured prompt field and, when the item carries
-        a 'test', 'test_code' or 'tests' field, appends it to the prompt.
-        """
-        prompt = item.get(self.config.prompt_field, "")
-
-        # If the dataset has test information, include it in the prompt
-        test_info = item.get("test", item.get("test_code", item.get("tests", "")))
-        if test_info:
-            prompt += f"\n\nTests to pass:\n{test_info}"
-
-        return prompt
-
-    async def compute_reward(
-        self, item: Dict[str, Any], result: AgentResult, ctx: ToolContext
-    ) -> float:
-        """
-        Score by running tests in the model's Modal sandbox.
-
-        Default implementation:
-        - If the dataset item has a 'test', 'test_code' or 'tests' field, run
-          it with ``python3 -c`` from /workspace
-        - Check exit code: 0 = pass (reward 1.0)
-        - Otherwise partial credit (0.1) if Python files were created, else 0.0
-
-        Every returned reward is also appended to ``self.reward_buffer``.
-
-        Override this in subclasses for more sophisticated reward logic.
-        """
-        import shlex
-
-        # Find the test command from the dataset item
-        test_code = item.get("test", item.get("test_code", item.get("tests", "")))
-
-        if test_code:
-            # Shell-quote the code: dataset tests routinely contain quotes,
-            # '$' and backslashes, which would otherwise break (or be
-            # interpreted by) the shell command line.
-            quoted_code = shlex.quote(str(test_code))
-
-            # Run the test in the model's sandbox
-            test_result = ctx.terminal(
-                f"cd /workspace && python3 -c {quoted_code}", timeout=60
-            )
-
-            if test_result["exit_code"] == 0:
-                self.reward_buffer.append(1.0)
-                return 1.0
-
-        # Partial credit: check if the model created any Python files
-        # NOTE(review): nothing visible here creates /tmp/.start_marker --
-        # confirm the sandbox setup writes it, otherwise this never matches.
-        file_check = ctx.terminal("find /workspace -name '*.py' -newer /tmp/.start_marker 2>/dev/null | head -5")
-        if file_check["exit_code"] == 0 and file_check.get("output", "").strip():
-            self.reward_buffer.append(0.1)
-            return 0.1
-
-        self.reward_buffer.append(0.0)
-        return 0.0
-
-    async def evaluate(self, *args, **kwargs):
-        """
-        Run evaluation on a held-out set.
-
-        Placeholder: logs a single "eval/placeholder" metric of 0.0.
-        Override for dataset-specific evaluation logic.
-        """
-        start_time = time.time()
-        end_time = time.time()
-
-        eval_metrics = {"eval/placeholder": 0.0}
-        await self.evaluate_log(
-            metrics=eval_metrics,
-            start_time=start_time,
-            end_time=end_time,
-        )
-
-    async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
-        """
-        Log SWE-specific metrics.
-
-        When rewards are buffered, adds "train/avg_reward" and
-        "train/pass_rate" (share of rewards equal to 1.0) and clears the
-        buffer, then defers to the parent implementation.
-        """
-        if wandb_metrics is None:
-            wandb_metrics = {}
-
-        if self.reward_buffer:
-            wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / len(
-                self.reward_buffer
-            )
-            wandb_metrics["train/pass_rate"] = sum(
-                1 for r in self.reward_buffer if r == 1.0
-            ) / len(self.reward_buffer)
-            self.reward_buffer = []
-
-        await super().wandb_log(wandb_metrics)
-
-
-# Script entry point: when run directly, hand control to HermesSweEnv.cli().
-if __name__ == "__main__":
-    HermesSweEnv.cli()
@@ -1,35 +0,0 @@
-"""
-Monkey patches for making hermes-agent tools work inside async frameworks (Atropos).
-
-Problem:
-    Some tools use asyncio.run() internally (e.g., Modal backend via SWE-ReX,
-    web_extract). This crashes when called from inside Atropos's event loop because
-    asyncio.run() can't be nested.
-
-Solution:
-    The Modal environment (tools/environments/modal.py) now uses a dedicated
-    _AsyncWorker thread internally, making it safe for both CLI and Atropos use.
-    No monkey-patching is required.
-
-    This module is kept for backward compatibility. apply_patches() is a no-op.
-
-Usage:
-    Call apply_patches() once at import time (done automatically by hermes_base_env.py).
-    This is idempotent and safe to call multiple times.
-"""
-
-import logging
-
-logger = logging.getLogger(__name__)
-
-# Set on the first apply_patches() call so later calls return immediately.
-_patches_applied = False
-
-
-def apply_patches():
-    """Compatibility no-op kept for callers that still invoke it.
-
-    The first call logs a debug message and records that it ran; every later
-    call does nothing. No patches are actually applied.
-    """
-    global _patches_applied
-    if not _patches_applied:
-        logger.debug("apply_patches() called; no patches needed (async safety is built-in)")
-        _patches_applied = True
@@ -1,34 +0,0 @@
-# Terminal Test Environment -- Default Configuration
-#
-# Simple file-creation tasks for validating the full Atropos + hermes-agent stack.
-# Uses Modal terminal backend and OpenRouter (Claude) for inference.
-# API keys loaded from ~/hermes-agent/.env
-#
-# Usage:
-#   run-api
-#   python environments/terminal_test_env/terminal_test_env.py serve \
-#       --config environments/terminal_test_env/default.yaml
-
-# Environment settings.
-env:
-  # Only the terminal and file toolsets are enabled for these tasks.
-  enabled_toolsets: ["terminal", "file"]
-  max_agent_turns: 10
-  max_token_length: 2048
-  # Deliberately small run sizes: this config is a stack smoke test.
-  group_size: 3
-  total_steps: 3
-  steps_per_eval: 3
-  terminal_backend: "modal"
-  tool_call_parser: "hermes"
-  tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
-  ensure_scores_are_not_same: false
-  use_wandb: false
-  system_prompt: >
-    You are a helpful assistant with access to a terminal and file tools.
-    Complete the user's request by using the available tools.
-    Be precise and follow instructions exactly.
-
-# Inference server settings (OpenRouter).
-openai:
-  base_url: "https://openrouter.ai/api/v1"
-  model_name: "anthropic/claude-opus-4.6"
-  server_type: "openai"
-  health_check: false
-  # api_key loaded from OPENROUTER_API_KEY in .env
@@ -1,292 +0,0 @@
-"""
-TerminalTestEnv -- Simple Test Environment for Validating the Stack
-
-A self-contained environment with inline tasks (no external dataset needed).
-Each task asks the model to create a file at a known path with specific content.
-The reward verifier cats the file and checks if the content matches.
-
-Enables only terminal + file toolsets. Uses Modal terminal backend with
-OpenRouter (Claude) by default.
-
-Training tasks (3):
-    1. Create ~/greeting.txt with "Hello from Hermes Agent"
-    2. Create ~/count.txt with numbers 1-5, one per line
-    3. Create ~/answer.txt with the result of 123 + 456
-
-Eval task (1):
-    1. Create ~/result.txt with the result of 6 * 7
-
-Usage:
-    # Start Atropos API server
-    run-api
-
-    # Run environment (uses OpenRouter + Modal by default)
-    python environments/terminal_test_env.py serve
-
-    # Process mode (no run-api needed, saves to JSONL)
-    python environments/terminal_test_env.py process \\
-        --env.data_path_to_save_groups terminal_test_output.jsonl
-"""
-
-import logging
-import os
-import sys
-import time
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-# Ensure repo root is on sys.path for imports
-_repo_root = Path(__file__).resolve().parent.parent.parent
-if str(_repo_root) not in sys.path:
-    sys.path.insert(0, str(_repo_root))
-
-from atroposlib.envs.base import ScoredDataGroup
-from atroposlib.envs.server_handling.server_manager import APIServerConfig
-from atroposlib.type_definitions import Item
-
-from environments.agent_loop import AgentResult
-from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
-from environments.tool_context import ToolContext
-
-logger = logging.getLogger(__name__)
-
-
-# =============================================================================
-# Inline task definitions -- no external dataset needed
-# =============================================================================
-
-TRAIN_TASKS = [
-    {
-        "prompt": "Create a file at ~/greeting.txt containing exactly the text: Hello from Hermes Agent",
-        "verify_path": "~/greeting.txt",
-        "expected_content": "Hello from Hermes Agent",
-    },
-    {
-        "prompt": "Create a file at ~/count.txt containing the numbers 1 through 5, one per line",
-        "verify_path": "~/count.txt",
-        "expected_content": "1\n2\n3\n4\n5",
-    },
-    {
-        "prompt": "Create a file at ~/answer.txt containing the result of 123 + 456",
-        "verify_path": "~/answer.txt",
-        "expected_content": "579",
-    },
-]
-
-EVAL_TASKS = [
-    {
-        "prompt": "Create a file at ~/result.txt containing the result of 6 * 7",
-        "verify_path": "~/result.txt",
-        "expected_content": "42",
-    },
-]
-
-
-class TerminalTestEnvConfig(HermesAgentEnvConfig):
-    """Config with defaults suitable for terminal testing."""
-
-    pass  # Inherits all fields, overrides defaults in config_init
-
-
-class TerminalTestEnv(HermesAgentBaseEnv):
-    """
-    Simple test environment with inline file-creation tasks.
-
-    All tasks follow the same pattern: "create a file at ~/X.txt with content Y".
-    The verifier runs `cat ~/X.txt` in the rollout's terminal and checks the output
-    against the expected string. Same verifier logic for all tasks.
-
-    This environment is designed to validate the full stack end-to-end:
-    - Agent loop executes tool calls (terminal/file)
-    - ToolContext provides terminal access to the reward function
-    - Reward function verifies file content via cat
-    - Scored data flows through the Atropos pipeline
-    """
-
-    name = "terminal-test"
-    env_config_cls = TerminalTestEnvConfig
-
-    @classmethod
-    def config_init(cls) -> Tuple[TerminalTestEnvConfig, List[APIServerConfig]]:
-        """
-        Default configuration for the terminal test environment.
-
-        Uses Modal terminal backend for cloud isolation and OpenRouter with
-        Claude for inference. API keys loaded from ~/hermes-agent/.env.
-        """
-        env_config = TerminalTestEnvConfig(
-            # Terminal + file tools only
-            enabled_toolsets=["terminal", "file"],
-            disabled_toolsets=None,
-            distribution=None,
-            # Agent settings
-            max_agent_turns=10,  # Simple tasks, don't need many turns
-            max_token_length=16000,
-            agent_temperature=1.0,
-            system_prompt=(
-                "You are a helpful assistant with access to a terminal and file tools. "
-                "Complete the user's request by using the available tools. "
-                "Be precise and follow instructions exactly."
-            ),
-            # Modal terminal backend for cloud-isolated sandboxes per rollout
-            terminal_backend="modal",
-            # Atropos settings
-            group_size=3,              # 3 rollouts per group
-            tokenizer_name="NousResearch/q-30b-t-h45-e1",
-            tool_call_parser="hermes",
-            steps_per_eval=3,          # Eval after all 3 steps
-            total_steps=3,             # 3 groups total (1 group per step)
-            use_wandb=True,
-            wandb_name="terminal-test",
-            ensure_scores_are_not_same=False,  # Allow all-same scores for simple tasks
-            # No external dataset
-            dataset_name=None,
-        )
-
-        # OpenRouter with Claude -- API key loaded from .env (OPENROUTER_API_KEY)
-        server_configs = [
-            APIServerConfig(
-                base_url="https://openrouter.ai/api/v1",
-                model_name="anthropic/claude-opus-4.6",
-                server_type="openai",
-                api_key=os.getenv("OPENROUTER_API_KEY", ""),
-                health_check=False,  # OpenRouter doesn't have a /health endpoint
-            )
-        ]
-
-        return env_config, server_configs
-
-    async def setup(self):
-        """Initialize inline task lists."""
-        self.train_tasks = list(TRAIN_TASKS)
-        self.eval_tasks = list(EVAL_TASKS)
-        self.iter = 0
-        # Track reward stats for wandb logging
-        self.reward_buffer: List[float] = []
-
-    async def get_next_item(self) -> Dict[str, str]:
-        """Cycle through training tasks."""
-        item = self.train_tasks[self.iter % len(self.train_tasks)]
-        self.iter += 1
-        return item
-
-    def format_prompt(self, item: Dict[str, str]) -> str:
-        """The prompt is directly in the task item."""
-        return item["prompt"]
-
-    async def compute_reward(
-        self, item: Dict[str, str], result: AgentResult, ctx: ToolContext
-    ) -> float:
-        """
-        Verify by cat-ing the expected file path and checking content matches.
-        Same verifier for all tasks -- they all write a file at a known path.
-
-        Scoring:
-            1.0 = exact match
-            0.5 = expected content is present but has extra stuff
-            0.0 = file doesn't exist or content doesn't match
-        """
-        verify_result = ctx.terminal(f"cat {item['verify_path']}")
-
-        # File doesn't exist or can't be read
-        if verify_result["exit_code"] != 0:
-            self.reward_buffer.append(0.0)
-            return 0.0
-
-        actual = verify_result.get("output", "").strip()
-        expected = item["expected_content"].strip()
-
-        # Exact match
-        if actual == expected:
-            self.reward_buffer.append(1.0)
-            return 1.0
-
-        # Partial credit: expected content is present but has extra stuff
-        if expected in actual:
-            self.reward_buffer.append(0.5)
-            return 0.5
-
-        self.reward_buffer.append(0.0)
-        return 0.0
-
-    async def evaluate(self, *args, **kwargs):
-        """
-        Run eval tasks using the agent loop and verify results.
-        Logs accuracy metrics.
-        """
-        start_time = time.time()
-        correct = 0
-        total = len(self.eval_tasks)
-        samples = []
-
-        for eval_item in self.eval_tasks:
-            try:
-                # For eval, we do a simple single-turn completion (not full agent loop)
-                # to keep eval fast. The agent loop is tested via training.
-                completion = await self.server.chat_completion(
-                    messages=[
-                        {"role": "system", "content": self.config.system_prompt or ""},
-                        {"role": "user", "content": eval_item["prompt"]},
-                    ],
-                    n=1,
-                    max_tokens=self.config.max_token_length,
-                    temperature=0.0,
-                    split="eval",
-                )
-
-                response_content = (
-                    completion.choices[0].message.content if completion.choices else ""
-                )
-
-                samples.append(
-                    {
-                        "prompt": eval_item["prompt"],
-                        "response": response_content,
-                        "expected": eval_item["expected_content"],
-                    }
-                )
-
-            except Exception as e:
-                logger.error("Eval failed for item: %s", e)
-                samples.append(
-                    {
-                        "prompt": eval_item["prompt"],
-                        "response": f"ERROR: {e}",
-                        "expected": eval_item["expected_content"],
-                    }
-                )
-
-        end_time = time.time()
-
-        eval_metrics = {
-            "eval/num_samples": total,
-        }
-
-        await self.evaluate_log(
-            metrics=eval_metrics,
-            samples=samples,
-            start_time=start_time,
-            end_time=end_time,
-        )
-
-    async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
-        """Log training metrics including reward stats and accuracy."""
-        if wandb_metrics is None:
-            wandb_metrics = {}
-
-        if self.reward_buffer:
-            total = len(self.reward_buffer)
-            correct = sum(1 for r in self.reward_buffer if r == 1.0)
-            partial = sum(1 for r in self.reward_buffer if r == 0.5)
-
-            wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / total
-            wandb_metrics["train/accuracy"] = correct / total
-            wandb_metrics["train/partial_match_rate"] = partial / total
-            wandb_metrics["train/total_rollouts"] = total
-            self.reward_buffer = []
-
-        await super().wandb_log(wandb_metrics)
-
-
-if __name__ == "__main__":
-    TerminalTestEnv.cli()
@@ -1,120 +0,0 @@
-"""
-Tool Call Parser Registry
-
-Client-side parsers that extract structured tool_calls from raw model output text.
-Used in Phase 2 (VLLM server type) where ManagedServer's /generate endpoint returns
-raw text without tool call parsing.
-
-Each parser is a standalone reimplementation of the corresponding VLLM parser's
-non-streaming extract_tool_calls() logic. No VLLM dependency -- only standard library
-(re, json, uuid) and openai types.
-
-Usage:
-    from environments.tool_call_parsers import get_parser
-
-    parser = get_parser("hermes")
-    content, tool_calls = parser.parse(raw_model_output)
-    # content = text with tool call markup stripped
-    # tool_calls = list of ChatCompletionMessageToolCall objects, or None
-"""
-
-import logging
-from abc import ABC, abstractmethod
-from typing import Dict, List, Optional, Tuple, Type
-
-from openai.types.chat.chat_completion_message_tool_call import (
-    ChatCompletionMessageToolCall,
-)
-
-logger = logging.getLogger(__name__)
-
-# Type alias for parser return value
-ParseResult = Tuple[Optional[str], Optional[List[ChatCompletionMessageToolCall]]]
-
-
-class ToolCallParser(ABC):
-    """
-    Base class for tool call parsers.
-
-    Each parser knows how to extract structured tool_calls from a specific
-    model family's raw output text format.
-    """
-
-    @abstractmethod
-    def parse(self, text: str) -> ParseResult:
-        """
-        Parse raw model output text for tool calls.
-
-        Args:
-            text: Raw decoded text from the model's completion
-
-        Returns:
-            Tuple of (content, tool_calls) where:
-            - content: text with tool call markup stripped (the message 'content' field),
-                       or None if the entire output was tool calls
-            - tool_calls: list of ChatCompletionMessageToolCall objects,
-                          or None if no tool calls were found
-        """
-        raise NotImplementedError
-
-
-# Global parser registry: name -> parser class
-PARSER_REGISTRY: Dict[str, Type[ToolCallParser]] = {}
-
-
-def register_parser(name: str):
-    """
-    Decorator to register a parser class under a given name.
-
-    Usage:
-        @register_parser("hermes")
-        class HermesToolCallParser(ToolCallParser):
-            ...
-    """
-
-    def decorator(cls: Type[ToolCallParser]) -> Type[ToolCallParser]:
-        PARSER_REGISTRY[name] = cls
-        return cls
-
-    return decorator
-
-
-def get_parser(name: str) -> ToolCallParser:
-    """
-    Get a parser instance by name.
-
-    Args:
-        name: Parser name (e.g., "hermes", "mistral", "llama3_json")
-
-    Returns:
-        Instantiated parser
-
-    Raises:
-        KeyError: If parser name is not found in registry
-    """
-    if name not in PARSER_REGISTRY:
-        available = sorted(PARSER_REGISTRY.keys())
-        raise KeyError(
-            f"Tool call parser '{name}' not found. Available parsers: {available}"
-        )
-    return PARSER_REGISTRY[name]()
-
-
-def list_parsers() -> List[str]:
-    """Return sorted list of registered parser names."""
-    return sorted(PARSER_REGISTRY.keys())
-
-
-# Import all parser modules to trigger registration via @register_parser decorators
-# Each module registers itself when imported
-from environments.tool_call_parsers.hermes_parser import HermesToolCallParser  # noqa: E402, F401
-from environments.tool_call_parsers.longcat_parser import LongcatToolCallParser  # noqa: E402, F401
-from environments.tool_call_parsers.mistral_parser import MistralToolCallParser  # noqa: E402, F401
-from environments.tool_call_parsers.llama_parser import LlamaToolCallParser  # noqa: E402, F401
-from environments.tool_call_parsers.qwen_parser import QwenToolCallParser  # noqa: E402, F401
-from environments.tool_call_parsers.deepseek_v3_parser import DeepSeekV3ToolCallParser  # noqa: E402, F401
-from environments.tool_call_parsers.deepseek_v3_1_parser import DeepSeekV31ToolCallParser  # noqa: E402, F401
-from environments.tool_call_parsers.kimi_k2_parser import KimiK2ToolCallParser  # noqa: E402, F401
-from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser  # noqa: E402, F401
-from environments.tool_call_parsers.glm47_parser import Glm47ToolCallParser  # noqa: E402, F401
-from environments.tool_call_parsers.qwen3_coder_parser import Qwen3CoderToolCallParser  # noqa: E402, F401
@@ -1,72 +0,0 @@
-"""
-DeepSeek V3.1 tool call parser.
-
-Similar to V3 but with a slightly different format:
-    <｜tool▁call▁begin｜>function_name<｜tool▁sep｜>arguments<｜tool▁call▁end｜>
-
-Note: V3 has type+name before the separator, V3.1 has name before and args after.
-
-Based on VLLM's DeepSeekV31ToolParser.extract_tool_calls()
-"""
-
-import re
-import uuid
-from typing import List, Optional
-
-from openai.types.chat.chat_completion_message_tool_call import (
-    ChatCompletionMessageToolCall,
-    Function,
-)
-
-from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
-
-
-@register_parser("deepseek_v3_1")
-@register_parser("deepseek_v31")
-class DeepSeekV31ToolCallParser(ToolCallParser):
-    """
-    Parser for DeepSeek V3.1 tool calls.
-
-    Slightly different regex than V3: function_name comes before the separator,
-    arguments come after (no type field, no json code block wrapper).
-    """
-
-    START_TOKEN = "<｜tool▁calls▁begin｜>"
-
-    # Regex captures: function_name, function_arguments
-    PATTERN = re.compile(
-        r"<｜tool▁call▁begin｜>(?P<function_name>.*?)<｜tool▁sep｜>(?P<function_arguments>.*?)<｜tool▁call▁end｜>",
-        re.DOTALL,
-    )
-
-    def parse(self, text: str) -> ParseResult:
-        if self.START_TOKEN not in text:
-            return text, None
-
-        try:
-            matches = self.PATTERN.findall(text)
-            if not matches:
-                return text, None
-
-            tool_calls: List[ChatCompletionMessageToolCall] = []
-            for match in matches:
-                func_name, func_args = match
-                tool_calls.append(
-                    ChatCompletionMessageToolCall(
-                        id=f"call_{uuid.uuid4().hex[:8]}",
-                        type="function",
-                        function=Function(
-                            name=func_name.strip(),
-                            arguments=func_args.strip(),
-                        ),
-                    )
-                )
-
-            if not tool_calls:
-                return text, None
-
-            content = text[: text.find(self.START_TOKEN)].strip()
-            return content if content else None, tool_calls
-
-        except Exception:
-            return text, None
@@ -1,89 +0,0 @@
-"""
-DeepSeek V3 tool call parser.
-
-Format uses special unicode tokens:
-    <｜tool▁calls▁begin｜>
-    <｜tool▁call▁begin｜>type<｜tool▁sep｜>function_name
-    ```json
-    {"arg": "value"}
-    ```
-    <｜tool▁call▁end｜>
-    <｜tool▁calls▁end｜>
-
-Fixes Issue #989: Support for multiple simultaneous tool calls.
-"""
-
-import re
-import uuid
-import logging
-from typing import List, Optional, Tuple
-
-from openai.types.chat.chat_completion_message_tool_call import (
-    ChatCompletionMessageToolCall,
-    Function,
-)
-
-from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
-
-logger = logging.getLogger(__name__)
-
-@register_parser("deepseek_v3")
-class DeepSeekV3ToolCallParser(ToolCallParser):
-    """
-    Parser for DeepSeek V3 tool calls.
-
-    Uses special unicode tokens with fullwidth angle brackets and block elements.
-    Extracts type, function name, and JSON arguments from the structured format.
-    Ensures all tool calls are captured when the model executes multiple actions.
-    """
-
-    START_TOKEN = "<｜tool▁calls▁begin｜>"
-
-    # Updated PATTERN: Using \s* instead of literal \n for increased robustness
-    # against variations in model formatting (Issue #989).
-    PATTERN = re.compile(
-        r"<｜tool▁call▁begin｜>(?P<type>.*?)<｜tool▁sep｜>(?P<function_name>.*?)\s*```json\s*(?P<function_arguments>.*?)\s*```\s*<｜tool▁call▁end｜>",
-        re.DOTALL,
-    )
-
-    def parse(self, text: str) -> ParseResult:
-        """
-        Parses the input text and extracts all available tool calls.
-        """
-        if self.START_TOKEN not in text:
-            return text, None
-
-        try:
-            # Using finditer to capture ALL tool calls in the sequence
-            matches = list(self.PATTERN.finditer(text))
-            if not matches:
-                return text, None
-
-            tool_calls: List[ChatCompletionMessageToolCall] = []
-            
-            for match in matches:
-                func_name = match.group("function_name").strip()
-                func_args = match.group("function_arguments").strip()
-                
-                tool_calls.append(
-                    ChatCompletionMessageToolCall(
-                        id=f"call_{uuid.uuid4().hex[:8]}",
-                        type="function",
-                        function=Function(
-                            name=func_name,
-                            arguments=func_args,
-                        ),
-                    )
-                )
-
-            if tool_calls:
-                # Content is text before the first tool call block
-                content_index = text.find(self.START_TOKEN)
-                content = text[:content_index].strip()
-                return content if content else None, tool_calls
-
-            return text, None
-
-        except Exception as e:
-            logger.error(f"Error parsing DeepSeek V3 tool calls: {e}")
-            return text, None
@@ -1,109 +0,0 @@
-"""
-GLM 4.5 (GLM-4-MoE) tool call parser.
-
-Format uses custom arg_key/arg_value tags rather than standard JSON:
-    <tool_call>function_name
-    <arg_key>param1</arg_key><arg_value>value1</arg_value>
-    <arg_key>param2</arg_key><arg_value>value2</arg_value>
-    </tool_call>
-
-Values are deserialized using json.loads -> ast.literal_eval -> raw string fallback.
-
-Based on VLLM's Glm4MoeModelToolParser.extract_tool_calls()
-"""
-
-import ast
-import json
-import re
-import uuid
-from typing import Any, Dict, List, Optional
-
-from openai.types.chat.chat_completion_message_tool_call import (
-    ChatCompletionMessageToolCall,
-    Function,
-)
-
-from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
-
-
-def _deserialize_value(value: str) -> Any:
-    """
-    Try to deserialize a string value to its native Python type.
-    Attempts json.loads, then ast.literal_eval, then returns raw string.
-    """
-    try:
-        return json.loads(value)
-    except (json.JSONDecodeError, TypeError):
-        pass
-
-    try:
-        return ast.literal_eval(value)
-    except (ValueError, SyntaxError, TypeError):
-        pass
-
-    return value
-
-
-@register_parser("glm45")
-class Glm45ToolCallParser(ToolCallParser):
-    """
-    Parser for GLM 4.5 (GLM-4-MoE) tool calls.
-
-    Uses <tool_call>...</tool_call> tags with <arg_key>/<arg_value> pairs
-    instead of standard JSON arguments.
-    """
-
-    FUNC_CALL_REGEX = re.compile(r"<tool_call>.*?</tool_call>", re.DOTALL)
-    FUNC_DETAIL_REGEX = re.compile(r"<tool_call>([^\n]*)\n(.*)</tool_call>", re.DOTALL)
-    FUNC_ARG_REGEX = re.compile(
-        r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>", re.DOTALL
-    )
-
-    START_TOKEN = "<tool_call>"
-
-    def parse(self, text: str) -> ParseResult:
-        if self.START_TOKEN not in text:
-            return text, None
-
-        try:
-            matched_calls = self.FUNC_CALL_REGEX.findall(text)
-            if not matched_calls:
-                return text, None
-
-            tool_calls: List[ChatCompletionMessageToolCall] = []
-
-            for match in matched_calls:
-                detail = self.FUNC_DETAIL_REGEX.search(match)
-                if not detail:
-                    continue
-
-                func_name = detail.group(1).strip()
-                func_args_raw = detail.group(2)
-
-                # Parse arg_key/arg_value pairs
-                pairs = self.FUNC_ARG_REGEX.findall(func_args_raw) if func_args_raw else []
-                arg_dict: Dict[str, Any] = {}
-                for key, value in pairs:
-                    arg_key = key.strip()
-                    arg_val = _deserialize_value(value.strip())
-                    arg_dict[arg_key] = arg_val
-
-                tool_calls.append(
-                    ChatCompletionMessageToolCall(
-                        id=f"call_{uuid.uuid4().hex[:8]}",
-                        type="function",
-                        function=Function(
-                            name=func_name,
-                            arguments=json.dumps(arg_dict, ensure_ascii=False),
-                        ),
-                    )
-                )
-
-            if not tool_calls:
-                return text, None
-
-            content = text[: text.find(self.START_TOKEN)].strip()
-            return content if content else None, tool_calls
-
-        except Exception:
-            return text, None
@@ -1,35 +0,0 @@
-"""
-GLM 4.7 tool call parser.
-
-Same as GLM 4.5 but with slightly different regex patterns.
-The tool_call tags may wrap differently and arg parsing handles
-newlines between key/value pairs.
-
-Based on VLLM's Glm47MoeModelToolParser (extends Glm4MoeModelToolParser).
-"""
-
-import re
-
-from environments.tool_call_parsers import ParseResult, register_parser
-from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser
-
-
-@register_parser("glm47")
-class Glm47ToolCallParser(Glm45ToolCallParser):
-    """
-    Parser for GLM 4.7 tool calls.
-    Extends GLM 4.5 with updated regex patterns.
-    """
-
-    def __init__(self):
-        super().__init__()
-        # GLM 4.7 uses a slightly different detail regex that includes
-        # the <tool_call> wrapper and optional arg_key content
-        self.FUNC_DETAIL_REGEX = re.compile(
-            r"<tool_call>(.*?)(<arg_key>.*?)?</tool_call>", re.DOTALL
-        )
-        # GLM 4.7 handles newlines between arg_key and arg_value tags
-        self.FUNC_ARG_REGEX = re.compile(
-            r"<arg_key>(.*?)</arg_key>(?:\\n|\s)*<arg_value>(.*?)</arg_value>",
-            re.DOTALL,
-        )
@@ -1,75 +0,0 @@
-"""
-Hermes tool call parser.
-
-Format: <tool_call>{"name": "func", "arguments": {...}}</tool_call>
-Based on VLLM's Hermes2ProToolParser.extract_tool_calls()
-"""
-
-import json
-import re
-import uuid
-from typing import List, Optional, Tuple
-
-from openai.types.chat.chat_completion_message_tool_call import (
-    ChatCompletionMessageToolCall,
-    Function,
-)
-
-from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
-
-
-@register_parser("hermes")
-class HermesToolCallParser(ToolCallParser):
-    """
-    Parser for Hermes-format tool calls.
-
-    Matches <tool_call>...</tool_call> tags containing JSON with "name" and "arguments".
-    Also handles unclosed <tool_call> at end-of-string (truncated generation).
-    """
-
-    # Matches both closed and unclosed tool_call tags
-    PATTERN = re.compile(
-        r"<tool_call>\s*(.*?)\s*</tool_call>|<tool_call>\s*(.*)", re.DOTALL
-    )
-
-    def parse(self, text: str) -> ParseResult:
-        if "<tool_call>" not in text:
-            return text, None
-
-        try:
-            matches = self.PATTERN.findall(text)
-            if not matches:
-                return text, None
-
-            tool_calls: List[ChatCompletionMessageToolCall] = []
-            for match in matches:
-                # match is a tuple: (closed_content, unclosed_content)
-                raw_json = match[0] if match[0] else match[1]
-                if not raw_json.strip():
-                    continue
-
-                tc_data = json.loads(raw_json)
-                if "name" not in tc_data:
-                    continue
-                tool_calls.append(
-                    ChatCompletionMessageToolCall(
-                        id=f"call_{uuid.uuid4().hex[:8]}",
-                        type="function",
-                        function=Function(
-                            name=tc_data["name"],
-                            arguments=json.dumps(
-                                tc_data.get("arguments", {}), ensure_ascii=False
-                            ),
-                        ),
-                    )
-                )
-
-            if not tool_calls:
-                return text, None
-
-            # Content is everything before the first <tool_call> tag
-            content = text[: text.find("<tool_call>")].strip()
-            return content if content else None, tool_calls
-
-        except Exception:
-            return text, None
@@ -1,93 +0,0 @@
-"""
-Kimi K2 tool call parser.
-
-Format:
-    <|tool_calls_section_begin|>
-    <|tool_call_begin|>function_id:0<|tool_call_argument_begin|>{"arg": "val"}<|tool_call_end|>
-    <|tool_calls_section_end|>
-
-The function_id format is typically "functions.func_name:index" or "func_name:index".
-
-Based on VLLM's KimiK2ToolParser.extract_tool_calls()
-"""
-
-import re
-import uuid
-from typing import List, Optional
-
-from openai.types.chat.chat_completion_message_tool_call import (
-    ChatCompletionMessageToolCall,
-    Function,
-)
-
-from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
-
-
-@register_parser("kimi_k2")
-class KimiK2ToolCallParser(ToolCallParser):
-    """
-    Parser for Kimi K2 tool calls.
-
-    Uses section begin/end tokens wrapping individual tool call begin/end tokens.
-    The tool_call_id contains the function name (after last dot, before colon).
-    """
-
-    # Support both singular and plural variants
-    START_TOKENS = [
-        "<|tool_calls_section_begin|>",
-        "<|tool_call_section_begin|>",
-    ]
-
-    # Regex captures: tool_call_id (e.g., "functions.get_weather:0"), function_arguments
-    PATTERN = re.compile(
-        r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[^<]+:\d+)\s*"
-        r"<\|tool_call_argument_begin\|>\s*"
-        r"(?P<function_arguments>(?:(?!<\|tool_call_begin\|>).)*?)\s*"
-        r"<\|tool_call_end\|>",
-        re.DOTALL,
-    )
-
-    def parse(self, text: str) -> ParseResult:
-        # Check for any variant of the start token
-        has_start = any(token in text for token in self.START_TOKENS)
-        if not has_start:
-            return text, None
-
-        try:
-            matches = self.PATTERN.findall(text)
-            if not matches:
-                return text, None
-
-            tool_calls: List[ChatCompletionMessageToolCall] = []
-            for match in matches:
-                function_id, function_args = match
-
-                # Extract function name from ID format: "functions.get_weather:0" -> "get_weather"
-                function_name = function_id.split(":")[0].split(".")[-1]
-
-                tool_calls.append(
-                    ChatCompletionMessageToolCall(
-                        id=function_id,  # Preserve the original ID format
-                        type="function",
-                        function=Function(
-                            name=function_name,
-                            arguments=function_args.strip(),
-                        ),
-                    )
-                )
-
-            if not tool_calls:
-                return text, None
-
-            # Content is everything before the tool calls section
-            earliest_start = len(text)
-            for token in self.START_TOKENS:
-                idx = text.find(token)
-                if idx >= 0 and idx < earliest_start:
-                    earliest_start = idx
-
-            content = text[:earliest_start].strip()
-            return content if content else None, tool_calls
-
-        except Exception:
-            return text, None
@@ -1,96 +0,0 @@
-"""
-Llama 3.x / 4 tool call parser.
-
-Format: The model outputs JSON objects with "name" and "arguments" (or "parameters") keys.
-May be preceded by <|python_tag|> token. Supports multiple JSON objects separated
-by content or semicolons.
-
-Based on VLLM's Llama3JsonToolParser.extract_tool_calls()
-"""
-
-import json
-import re
-import uuid
-from typing import List, Optional
-
-from openai.types.chat.chat_completion_message_tool_call import (
-    ChatCompletionMessageToolCall,
-    Function,
-)
-
-from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
-
-
-@register_parser("llama3_json")
-@register_parser("llama4_json")
-class LlamaToolCallParser(ToolCallParser):
-    """
-    Parser for Llama 3.x and 4 JSON-format tool calls.
-
-    Finds JSON objects containing "name" + ("arguments" or "parameters") keys.
-    Uses Python's json.JSONDecoder.raw_decode for robust extraction of
-    JSON objects from mixed text.
-    """
-
-    BOT_TOKEN = "<|python_tag|>"
-
-    # Regex to find the start of potential JSON objects
-    JSON_START = re.compile(r"\{")
-
-    def parse(self, text: str) -> ParseResult:
-        # Quick check: need either the bot token or a JSON brace
-        if self.BOT_TOKEN not in text and "{" not in text:
-            return text, None
-
-        try:
-            decoder = json.JSONDecoder()
-            tool_calls: List[ChatCompletionMessageToolCall] = []
-            end_index = -1  # Track where the last parsed JSON ended
-
-            for match in self.JSON_START.finditer(text):
-                start = match.start()
-                # Skip if this brace is inside a previously parsed JSON object
-                if start <= end_index:
-                    continue
-
-                try:
-                    obj, json_end = decoder.raw_decode(text[start:])
-                    end_index = start + json_end
-
-                    # Must have "name" and either "arguments" or "parameters"
-                    name = obj.get("name")
-                    args = obj.get("arguments", obj.get("parameters"))
-
-                    if not name or args is None:
-                        continue
-
-                    # Normalize arguments to JSON string
-                    if isinstance(args, dict):
-                        args = json.dumps(args, ensure_ascii=False)
-                    elif not isinstance(args, str):
-                        args = json.dumps(args, ensure_ascii=False)
-
-                    tool_calls.append(
-                        ChatCompletionMessageToolCall(
-                            id=f"call_{uuid.uuid4().hex[:8]}",
-                            type="function",
-                            function=Function(name=name, arguments=args),
-                        )
-                    )
-                except (json.JSONDecodeError, KeyError, ValueError):
-                    continue
-
-            if not tool_calls:
-                return text, None
-
-            # Content is everything before the first tool call JSON
-            # Find where the first tool call starts in the text
-            first_tc_start = text.find("{")
-            if self.BOT_TOKEN in text:
-                first_tc_start = text.find(self.BOT_TOKEN)
-            content = text[:first_tc_start].strip() if first_tc_start > 0 else None
-
-            return content, tool_calls
-
-        except Exception:
-            return text, None
@@ -1,69 +0,0 @@
-"""
-Longcat Flash Chat tool call parser.
-
-Same as Hermes but uses <longcat_tool_call> tags instead of <tool_call>.
-Based on VLLM's LongcatFlashToolParser (extends Hermes2ProToolParser).
-"""
-
-import json
-import re
-import uuid
-from typing import List, Optional
-
-from openai.types.chat.chat_completion_message_tool_call import (
-    ChatCompletionMessageToolCall,
-    Function,
-)
-
-from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
-
-
-@register_parser("longcat")
-class LongcatToolCallParser(ToolCallParser):
-    """
-    Parser for Longcat Flash Chat tool calls.
-    Identical logic to Hermes, just different tag names.
-    """
-
-    PATTERN = re.compile(
-        r"<longcat_tool_call>\s*(.*?)\s*</longcat_tool_call>|<longcat_tool_call>\s*(.*)",
-        re.DOTALL,
-    )
-
-    def parse(self, text: str) -> ParseResult:
-        if "<longcat_tool_call>" not in text:
-            return text, None
-
-        try:
-            matches = self.PATTERN.findall(text)
-            if not matches:
-                return text, None
-
-            tool_calls: List[ChatCompletionMessageToolCall] = []
-            for match in matches:
-                raw_json = match[0] if match[0] else match[1]
-                if not raw_json.strip():
-                    continue
-
-                tc_data = json.loads(raw_json)
-                tool_calls.append(
-                    ChatCompletionMessageToolCall(
-                        id=f"call_{uuid.uuid4().hex[:8]}",
-                        type="function",
-                        function=Function(
-                            name=tc_data["name"],
-                            arguments=json.dumps(
-                                tc_data.get("arguments", {}), ensure_ascii=False
-                            ),
-                        ),
-                    )
-                )
-
-            if not tool_calls:
-                return text, None
-
-            content = text[: text.find("<longcat_tool_call>")].strip()
-            return content if content else None, tool_calls
-
-        except Exception:
-            return text, None
@@ -1,137 +0,0 @@
-"""
-Mistral tool call parser.
-
-Supports two formats depending on tokenizer version:
- Pre-v11: content[TOOL_CALLS] [{"name": ..., "arguments": {...}}, ...]
- v11+:    content[TOOL_CALLS]tool_name1{"arg": "val"}[TOOL_CALLS]tool_name2{"arg": "val"}
-
-Based on VLLM's MistralToolParser.extract_tool_calls()
-The [TOOL_CALLS] token is the bot_token used by Mistral models.
-"""
-
-import json
-import uuid
-from typing import List, Optional
-
-from openai.types.chat.chat_completion_message_tool_call import (
-    ChatCompletionMessageToolCall,
-    Function,
-)
-
-from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
-
-
-def _generate_mistral_id() -> str:
-    """Mistral tool call IDs are 9-char alphanumeric strings."""
-    import random
-    import string
-
-    return "".join(random.choices(string.ascii_letters + string.digits, k=9))
-
-
-@register_parser("mistral")
-class MistralToolCallParser(ToolCallParser):
-    """
-    Parser for Mistral-format tool calls.
-
-    Detects format by checking if the content after [TOOL_CALLS] starts with '['
-    (pre-v11 JSON array) or with a tool name (v11+ format).
-    """
-
-    # The [TOOL_CALLS] token -- may appear as different strings depending on tokenizer
-    BOT_TOKEN = "[TOOL_CALLS]"
-
-    def parse(self, text: str) -> ParseResult:
-        if self.BOT_TOKEN not in text:
-            return text, None
-
-        try:
-            parts = text.split(self.BOT_TOKEN)
-            content = parts[0].strip()
-            raw_tool_calls = parts[1:]
-
-            # Detect format: if the first raw part starts with '[', it's pre-v11
-            first_raw = raw_tool_calls[0].strip() if raw_tool_calls else ""
-            is_pre_v11 = first_raw.startswith("[") or first_raw.startswith("{")
-
-            tool_calls: List[ChatCompletionMessageToolCall] = []
-
-            if not is_pre_v11:
-                # v11+ format: [TOOL_CALLS]tool_name{args}[TOOL_CALLS]tool_name2{args2}
-                for raw in raw_tool_calls:
-                    raw = raw.strip()
-                    if not raw or "{" not in raw:
-                        continue
-
-                    brace_idx = raw.find("{")
-                    tool_name = raw[:brace_idx].strip()
-                    args_str = raw[brace_idx:]
-
-                    # Validate and clean the JSON arguments
-                    try:
-                        parsed_args = json.loads(args_str)
-                        args_str = json.dumps(parsed_args, ensure_ascii=False)
-                    except json.JSONDecodeError:
-                        pass  # Keep raw if parsing fails
-
-                    tool_calls.append(
-                        ChatCompletionMessageToolCall(
-                            id=_generate_mistral_id(),
-                            type="function",
-                            function=Function(name=tool_name, arguments=args_str),
-                        )
-                    )
-            else:
-                # Pre-v11 format: [TOOL_CALLS] [{"name": ..., "arguments": {...}}]
-                try:
-                    parsed = json.loads(first_raw)
-                    if isinstance(parsed, dict):
-                        parsed = [parsed]
-
-                    for tc in parsed:
-                        if "name" not in tc:
-                            continue
-                        args = tc.get("arguments", {})
-                        if isinstance(args, dict):
-                            args = json.dumps(args, ensure_ascii=False)
-
-                        tool_calls.append(
-                            ChatCompletionMessageToolCall(
-                                id=_generate_mistral_id(),
-                                type="function",
-                                function=Function(
-                                    name=tc["name"], arguments=args
-                                ),
-                            )
-                        )
-                except json.JSONDecodeError:
-                    # Fallback: extract JSON objects using raw_decode
-                    decoder = json.JSONDecoder()
-                    idx = 0
-                    while idx < len(first_raw):
-                        try:
-                            obj, end_idx = decoder.raw_decode(first_raw, idx)
-                            if isinstance(obj, dict) and "name" in obj:
-                                args = obj.get("arguments", {})
-                                if isinstance(args, dict):
-                                    args = json.dumps(args, ensure_ascii=False)
-                                tool_calls.append(
-                                    ChatCompletionMessageToolCall(
-                                        id=_generate_mistral_id(),
-                                        type="function",
-                                        function=Function(
-                                            name=obj["name"], arguments=args
-                                        ),
-                                    )
-                                )
-                            idx = end_idx
-                        except json.JSONDecodeError:
-                            idx += 1
-
-            if not tool_calls:
-                return text, None
-
-            return content if content else None, tool_calls
-
-        except Exception:
-            return text, None
@@ -1,163 +0,0 @@
-"""
-Qwen3-Coder tool call parser.
-
-Format uses XML-style nested tags:
-    <tool_call>
-    <function=function_name>
-    <parameter=param_name>value</parameter>
-    <parameter=param_name2>value2</parameter>
-    </function>
-    </tool_call>
-
-Parameters are extracted from <parameter=name>value</parameter> tags and
-type-converted using the schema if available, otherwise treated as strings.
-
-Based on VLLM's Qwen3CoderToolParser.extract_tool_calls()
-"""
-
-import ast
-import json
-import re
-import uuid
-from typing import Any, Dict, List, Optional
-
-from openai.types.chat.chat_completion_message_tool_call import (
-    ChatCompletionMessageToolCall,
-    Function,
-)
-
-from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
-
-
-def _try_convert_value(value: str) -> Any:
-    """
-    Try to convert a parameter value string to a native Python type.
-    Handles null, numbers, booleans, JSON objects/arrays, and falls back to string.
-    """
-    stripped = value.strip()
-
-    # Handle null
-    if stripped.lower() == "null":
-        return None
-
-    # Try JSON first (handles objects, arrays, strings, numbers, booleans)
-    try:
-        return json.loads(stripped)
-    except (json.JSONDecodeError, TypeError):
-        pass
-
-    # Try Python literal eval (handles tuples, etc.)
-    try:
-        return ast.literal_eval(stripped)
-    except (ValueError, SyntaxError, TypeError):
-        pass
-
-    # Return as string
-    return stripped
-
-
-@register_parser("qwen3_coder")
-class Qwen3CoderToolCallParser(ToolCallParser):
-    """
-    Parser for Qwen3-Coder XML-format tool calls.
-
-    Uses nested XML tags: <tool_call><function=name><parameter=key>val</parameter></function></tool_call>
-    """
-
-    START_TOKEN = "<tool_call>"
-    FUNCTION_PREFIX = "<function="
-
-    # Find complete tool_call blocks (or unclosed at end)
-    TOOL_CALL_REGEX = re.compile(
-        r"<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", re.DOTALL
-    )
-
-    # Find function blocks within a tool_call
-    FUNCTION_REGEX = re.compile(
-        r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL
-    )
-
-    # Find parameter blocks within a function
-    PARAMETER_REGEX = re.compile(
-        r"<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)",
-        re.DOTALL,
-    )
-
-    def _parse_function_call(self, function_str: str) -> Optional[ChatCompletionMessageToolCall]:
-        """Parse a single <function=name>...</function> block into a ToolCall."""
-        try:
-            # Extract function name: everything before the first '>'
-            gt_idx = function_str.index(">")
-            func_name = function_str[:gt_idx].strip()
-            params_str = function_str[gt_idx + 1:]
-
-            # Extract parameters
-            param_dict: Dict[str, Any] = {}
-            for match_text in self.PARAMETER_REGEX.findall(params_str):
-                if ">" not in match_text:
-                    continue
-                eq_idx = match_text.index(">")
-                param_name = match_text[:eq_idx].strip()
-                param_value = match_text[eq_idx + 1:]
-
-                # Clean up whitespace
-                if param_value.startswith("\n"):
-                    param_value = param_value[1:]
-                if param_value.endswith("\n"):
-                    param_value = param_value[:-1]
-
-                param_dict[param_name] = _try_convert_value(param_value)
-
-            return ChatCompletionMessageToolCall(
-                id=f"call_{uuid.uuid4().hex[:24]}",
-                type="function",
-                function=Function(
-                    name=func_name,
-                    arguments=json.dumps(param_dict, ensure_ascii=False),
-                ),
-            )
-        except (ValueError, IndexError):
-            return None
-
-    def parse(self, text: str) -> ParseResult:
-        if self.FUNCTION_PREFIX not in text:
-            return text, None
-
-        try:
-            # Find all tool_call blocks
-            tc_matches = self.TOOL_CALL_REGEX.findall(text)
-            raw_blocks = [m[0] if m[0] else m[1] for m in tc_matches]
-
-            # Fallback: if no tool_call tags, try the whole text
-            if not raw_blocks:
-                raw_blocks = [text]
-
-            # Find function blocks within each tool_call
-            function_strs: List[str] = []
-            for block in raw_blocks:
-                func_matches = self.FUNCTION_REGEX.findall(block)
-                function_strs.extend(m[0] if m[0] else m[1] for m in func_matches)
-
-            if not function_strs:
-                return text, None
-
-            # Parse each function call
-            tool_calls: List[ChatCompletionMessageToolCall] = []
-            for func_str in function_strs:
-                tc = self._parse_function_call(func_str)
-                if tc is not None:
-                    tool_calls.append(tc)
-
-            if not tool_calls:
-                return text, None
-
-            # Content before tool calls
-            first_tc = text.find(self.START_TOKEN)
-            if first_tc < 0:
-                first_tc = text.find(self.FUNCTION_PREFIX)
-            content = text[:first_tc].strip() if first_tc > 0 else None
-
-            return content, tool_calls
-
-        except Exception:
-            return text, None
@@ -1,19 +0,0 @@
-"""
-Qwen 2.5 tool call parser.
-
-Uses the same <tool_call> format as Hermes.
-Registered as a separate parser name for clarity when using --tool-parser=qwen.
-"""
-
-from environments.tool_call_parsers import register_parser
-from environments.tool_call_parsers.hermes_parser import HermesToolCallParser
-
-
-@register_parser("qwen")
-class QwenToolCallParser(HermesToolCallParser):
-    """
-    Parser for Qwen 2.5 tool calls.
-    Same <tool_call>{"name": ..., "arguments": ...}</tool_call> format as Hermes.
-    """
-
-    pass  # Identical format -- inherits everything from Hermes
@@ -1,473 +0,0 @@
-"""
-ToolContext -- Unrestricted Tool Access for Reward Functions
-
-A per-rollout handle that gives reward/verification functions direct access to
-ALL hermes-agent tools, scoped to the rollout's task_id. The same task_id means
-the terminal/browser session is the SAME one the model used during its rollout --
-all state (files, processes, browser tabs) is preserved.
-
-The verifier author decides which tools to use. Nothing is hardcoded or gated.
-
-Example usage in a compute_reward():
-    async def compute_reward(self, item, result, ctx):
-        # Run tests in the model's terminal sandbox
-        test = ctx.terminal("pytest -v")
-        if test["exit_code"] == 0:
-            return 1.0
-
-        # Check if a file was created
-        content = ctx.read_file("/workspace/solution.py")
-        if content.get("content"):
-            return 0.5
-
-        return 0.0
-"""
-
-import json
-import logging
-import os
-from typing import Any, Dict, List, Optional
-
-import asyncio
-import concurrent.futures
-
-from model_tools import handle_function_call
-from tools.terminal_tool import cleanup_vm
-from tools.browser_tool import cleanup_browser
-
-logger = logging.getLogger(__name__)
-
-# Thread pool for running sync tool calls that internally use asyncio.run()
-_tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
-
-
-def _run_tool_in_thread(tool_name: str, arguments: Dict[str, Any], task_id: str) -> str:
-    """
-    Run a tool call in a thread pool executor so backends that use asyncio.run()
-    internally (modal, docker, daytona) get a clean event loop.
-
-    If we're already in an async context, executes handle_function_call() in a
-    disposable worker thread and blocks for the result.
-    If not (e.g., called from sync code), runs directly.
-    """
-    try:
-        loop = asyncio.get_running_loop()
-        # We're in an async context -- need to run in thread
-        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
-            future = pool.submit(
-                handle_function_call, tool_name, arguments, task_id
-            )
-            return future.result(timeout=300)
-    except RuntimeError:
-        # No running event loop -- safe to call directly
-        return handle_function_call(tool_name, arguments, task_id)
-
-
-class ToolContext:
-    """
-    Open-ended access to all hermes-agent tools for a specific rollout.
-
-    Passed to compute_reward() so verifiers can use any tool they need:
-    terminal commands, file reads/writes, web searches, browser automation, etc.
-    All calls share the rollout's task_id for session isolation.
-    """
-
-    def __init__(self, task_id: str):
-        self.task_id = task_id
-
-    # -------------------------------------------------------------------------
-    # Terminal tools
-    # -------------------------------------------------------------------------
-
-    def terminal(self, command: str, timeout: int = 180) -> Dict[str, Any]:
-        """
-        Run a command in the rollout's terminal session.
-
-        Args:
-            command: Shell command to execute
-            timeout: Command timeout in seconds
-
-        Returns:
-            Dict with 'exit_code' (int) and 'output' (str)
-        """
-        import os
-        backend = os.getenv("TERMINAL_ENV", "local")
-        logger.debug("ToolContext.terminal [%s backend] task=%s: %s", backend, self.task_id[:8], command[:100])
-
-        # Run via thread helper so modal/docker/daytona backends' asyncio.run() doesn't deadlock
-        result = _run_tool_in_thread(
-            "terminal",
-            {"command": command, "timeout": timeout},
-            self.task_id,
-        )
-        try:
-            return json.loads(result)
-        except json.JSONDecodeError:
-            return {"exit_code": -1, "output": result}
-
-    # -------------------------------------------------------------------------
-    # File tools
-    # -------------------------------------------------------------------------
-
-    def read_file(self, path: str) -> Dict[str, Any]:
-        """
-        Read a file from the rollout's filesystem.
-
-        Args:
-            path: File path to read
-
-        Returns:
-            Dict with file content or error
-        """
-        result = handle_function_call(
-            "read_file", {"path": path}, task_id=self.task_id
-        )
-        try:
-            return json.loads(result)
-        except json.JSONDecodeError:
-            return {"error": result}
-
-    def write_file(self, path: str, content: str) -> Dict[str, Any]:
-        """
-        Write a TEXT file in the rollout's filesystem.
-
-        Uses a shell heredoc under the hood, so this is only safe for text content.
-        For binary files (images, compiled artifacts, etc.), use upload_file() instead.
-
-        Args:
-            path: File path to write
-            content: Text content to write
-
-        Returns:
-            Dict with success status or error
-        """
-        result = handle_function_call(
-            "write_file", {"path": path, "content": content}, task_id=self.task_id
-        )
-        try:
-            return json.loads(result)
-        except json.JSONDecodeError:
-            return {"error": result}
-
-    def upload_file(self, local_path: str, remote_path: str) -> Dict[str, Any]:
-        """
-        Upload a local file to the rollout's sandbox (binary-safe).
-
-        Unlike write_file() which passes content through a shell heredoc (text-only),
-        this method base64-encodes the file and decodes it inside the sandbox.
-        Safe for any file type: binaries, images, archives, etc.
-
-        For large files (>1MB), the content is split into chunks to avoid
-        hitting shell command-length limits.
-
-        Args:
-            local_path: Path to a local file on the host
-            remote_path: Destination path inside the sandbox
-
-        Returns:
-            Dict with 'exit_code' and 'output'
-        """
-        import base64
-        from pathlib import Path as _Path
-
-        local = _Path(local_path)
-        if not local.exists():
-            return {"exit_code": -1, "output": f"Local file not found: {local_path}"}
-
-        raw = local.read_bytes()
-        b64 = base64.b64encode(raw).decode("ascii")
-
-        # Ensure parent directory exists in the sandbox
-        parent = str(_Path(remote_path).parent)
-        if parent not in {".", "/"}:
-            self.terminal(f"mkdir -p {parent}", timeout=10)
-
-        # For small files, single command is fine
-        chunk_size = 60_000  # ~60KB per chunk (well within shell limits)
-        if len(b64) <= chunk_size:
-            result = self.terminal(
-                f"printf '%s' '{b64}' | base64 -d > {remote_path}",
-                timeout=30,
-            )
-        else:
-            # For larger files, write base64 in chunks then decode
-            tmp_b64 = "/tmp/_hermes_upload.b64"
-            self.terminal(f": > {tmp_b64}", timeout=5)  # truncate
-            for i in range(0, len(b64), chunk_size):
-                chunk = b64[i : i + chunk_size]
-                self.terminal(f"printf '%s' '{chunk}' >> {tmp_b64}", timeout=15)
-            result = self.terminal(
-                f"base64 -d {tmp_b64} > {remote_path} && rm -f {tmp_b64}",
-                timeout=30,
-            )
-
-        return result
-
-    def upload_dir(self, local_dir: str, remote_dir: str) -> List[Dict[str, Any]]:
-        """
-        Upload an entire local directory to the rollout's sandbox (binary-safe).
-
-        Recursively uploads all files, preserving directory structure.
-
-        Args:
-            local_dir: Path to a local directory on the host
-            remote_dir: Destination directory inside the sandbox
-
-        Returns:
-            List of results, one per file uploaded
-        """
-        from pathlib import Path as _Path
-
-        local = _Path(local_dir)
-        if not local.exists() or not local.is_dir():
-            return [{"exit_code": -1, "output": f"Local directory not found: {local_dir}"}]
-
-        results = []
-        for file_path in sorted(local.rglob("*")):
-            if file_path.is_file():
-                relative = file_path.relative_to(local)
-                target = f"{remote_dir}/{relative}"
-                results.append(self.upload_file(str(file_path), target))
-        return results
-
-    def download_file(self, remote_path: str, local_path: str) -> Dict[str, Any]:
-        """
-        Download a file from the rollout's sandbox to the host (binary-safe).
-
-        The inverse of upload_file(). Base64-encodes the file inside the sandbox,
-        reads the encoded data through the terminal, and decodes it locally.
-        Safe for any file type.
-
-        Args:
-            remote_path: Path to the file inside the sandbox
-            local_path: Destination path on the host
-
-        Returns:
-            Dict with 'success' (bool) and 'bytes' (int) or 'error' (str)
-        """
-        import base64
-        from pathlib import Path as _Path
-
-        # Base64-encode the file inside the sandbox and capture output
-        result = self.terminal(
-            f"base64 {remote_path} 2>/dev/null",
-            timeout=30,
-        )
-
-        if result.get("exit_code", -1) != 0:
-            return {
-                "success": False,
-                "error": f"Failed to read remote file: {result.get('output', '')}",
-            }
-
-        b64_data = result.get("output", "").strip()
-        if not b64_data:
-            return {"success": False, "error": f"Remote file is empty or missing: {remote_path}"}
-
-        try:
-            raw = base64.b64decode(b64_data)
-        except Exception as e:
-            return {"success": False, "error": f"Base64 decode failed: {e}"}
-
-        # Write to local host filesystem
-        local = _Path(local_path)
-        local.parent.mkdir(parents=True, exist_ok=True)
-        local.write_bytes(raw)
-
-        return {"success": True, "bytes": len(raw)}
-
-    def download_dir(self, remote_dir: str, local_dir: str) -> List[Dict[str, Any]]:
-        """
-        Download a directory from the rollout's sandbox to the host (binary-safe).
-
-        Lists all files in the remote directory, then downloads each one.
-        Preserves directory structure.
-
-        Args:
-            remote_dir: Path to the directory inside the sandbox
-            local_dir: Destination directory on the host
-
-        Returns:
-            List of results, one per file downloaded
-        """
-        from pathlib import Path as _Path
-
-        # List files in the remote directory
-        ls_result = self.terminal(
-            f"find {remote_dir} -type f 2>/dev/null",
-            timeout=15,
-        )
-
-        if ls_result.get("exit_code", -1) != 0:
-            return [{"success": False, "error": f"Failed to list remote dir: {remote_dir}"}]
-
-        file_list = ls_result.get("output", "").strip()
-        if not file_list:
-            return [{"success": False, "error": f"Remote directory is empty or missing: {remote_dir}"}]
-
-        results = []
-        for remote_file in file_list.splitlines():
-            remote_file = remote_file.strip()
-            if not remote_file:
-                continue
-            # Compute the relative path to preserve directory structure
-            if remote_file.startswith(remote_dir):
-                relative = remote_file[len(remote_dir):].lstrip("/")
-            else:
-                relative = _Path(remote_file).name
-            local_file = str(_Path(local_dir) / relative)
-            results.append(self.download_file(remote_file, local_file))
-
-        return results
-
-    def search(self, query: str, path: str = ".") -> Dict[str, Any]:
-        """
-        Search for text in the rollout's filesystem.
-
-        Args:
-            query: Search query
-            path: Directory to search in
-
-        Returns:
-            Dict with search results
-        """
-        result = handle_function_call(
-            "search_files", {"pattern": query, "path": path}, task_id=self.task_id
-        )
-        try:
-            return json.loads(result)
-        except json.JSONDecodeError:
-            return {"error": result}
-
-    # -------------------------------------------------------------------------
-    # Web tools
-    # -------------------------------------------------------------------------
-
-    def web_search(self, query: str) -> Dict[str, Any]:
-        """
-        Search the web.
-
-        Args:
-            query: Search query
-
-        Returns:
-            Dict with search results
-        """
-        result = handle_function_call("web_search", {"query": query})
-        try:
-            return json.loads(result)
-        except json.JSONDecodeError:
-            return {"error": result}
-
-    def web_extract(self, urls: List[str]) -> Dict[str, Any]:
-        """
-        Extract content from URLs.
-
-        Args:
-            urls: List of URLs to extract content from
-
-        Returns:
-            Dict with extracted content
-        """
-        result = handle_function_call("web_extract", {"urls": urls})
-        try:
-            return json.loads(result)
-        except json.JSONDecodeError:
-            return {"error": result}
-
-    # -------------------------------------------------------------------------
-    # Browser tools
-    # -------------------------------------------------------------------------
-
-    def browser_navigate(self, url: str) -> Dict[str, Any]:
-        """
-        Navigate the rollout's browser session to a URL.
-
-        Args:
-            url: URL to navigate to
-
-        Returns:
-            Dict with page snapshot or error
-        """
-        result = handle_function_call(
-            "browser_navigate", {"url": url}, task_id=self.task_id
-        )
-        try:
-            return json.loads(result)
-        except json.JSONDecodeError:
-            return {"error": result}
-
-    def browser_snapshot(self) -> Dict[str, Any]:
-        """
-        Take a snapshot of the current browser page.
-
-        Returns:
-            Dict with page content/accessibility snapshot
-        """
-        result = handle_function_call(
-            "browser_snapshot", {}, task_id=self.task_id
-        )
-        try:
-            return json.loads(result)
-        except json.JSONDecodeError:
-            return {"error": result}
-
-    # -------------------------------------------------------------------------
-    # Generic tool access
-    # -------------------------------------------------------------------------
-
-    def call_tool(self, tool_name: str, arguments: Dict[str, Any]) -> str:
-        """
-        Call any hermes-agent tool by name.
-
-        This is the generic escape hatch -- if a tool doesn't have a convenience
-        wrapper above, you can call it directly here.
-
-        Args:
-            tool_name: Name of the tool (e.g., "vision_analyze", "skills_list")
-            arguments: Dict of arguments for the tool
-
-        Returns:
-            Raw JSON string result from the tool
-        """
-        return _run_tool_in_thread(tool_name, arguments, self.task_id)
-
-    # -------------------------------------------------------------------------
-    # Cleanup
-    # -------------------------------------------------------------------------
-
-    def cleanup(self):
-        """
-        Release all resources (terminal VMs, browser sessions, background processes)
-        for this rollout.
-
-        Called automatically by the base environment via try/finally after
-        compute_reward() completes. You generally don't need to call this yourself.
-        """
-        # Kill any background processes from this rollout (safety net)
-        try:
-            from tools.process_registry import process_registry
-            killed = process_registry.kill_all(task_id=self.task_id)
-            if killed:
-                logger.debug("Process cleanup for task %s: killed %d process(es)", self.task_id, killed)
-        except Exception as e:
-            logger.debug("Process cleanup for task %s: %s", self.task_id, e)
-
-        try:
-            cleanup_vm(self.task_id)
-        except Exception as e:
-            logger.debug("VM cleanup for task %s: %s", self.task_id, e)
-
-        # Suppress browser_tool's noisy debug prints during cleanup.
-        # The cleanup still runs (safe), it just doesn't spam the console.
-        _prev_quiet = os.environ.get("HERMES_QUIET")
-        os.environ["HERMES_QUIET"] = "1"
-        try:
-            cleanup_browser(self.task_id)
-        except Exception as e:
-            logger.debug("Browser cleanup for task %s: %s", self.task_id, e)
-        finally:
-            if _prev_quiet is None:
-                os.environ.pop("HERMES_QUIET", None)
-            else:
-                os.environ["HERMES_QUIET"] = _prev_quiet
@@ -1,719 +0,0 @@
-"""
-WebResearchEnv — RL Environment for Multi-Step Web Research
-============================================================
-
-Trains models to do accurate, efficient, multi-source web research.
-
-Reward signals:
-  - Answer correctness  (LLM judge, 0.0–1.0)
-  - Source diversity    (used ≥2 distinct domains)
-  - Efficiency          (penalizes excessive tool calls)
-  - Tool usage          (bonus for actually using web tools)
-
-Dataset: FRAMES benchmark (Google, 2024) — multi-hop factual questions
-  HuggingFace: google/frames-benchmark
-  Fallback:    built-in sample questions (no HF token needed)
-
-Usage:
-    # Phase 1 (OpenAI-compatible server)
-    python environments/web_research_env.py serve \\
-        --openai.base_url http://localhost:8000/v1 \\
-        --openai.model_name YourModel \\
-        --openai.server_type openai
-
-    # Process mode (offline data generation)
-    python environments/web_research_env.py process \\
-        --env.data_path_to_save_groups data/web_research.jsonl
-
-    # Standalone eval
-    python environments/web_research_env.py evaluate \\
-        --openai.base_url http://localhost:8000/v1 \\
-        --openai.model_name YourModel
-
-Built by: github.com/jackx707
-Inspired by: GroceryMind — production Hermes agent doing live web research
-             across German grocery stores (firecrawl + hermes-agent)
-"""
-
-from __future__ import annotations
-
-import asyncio
-import json
-import logging
-import os
-import random
-import re
-import sys
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
-from urllib.parse import urlparse
-
-from pydantic import Field
-
-# Ensure hermes-agent root is on path
-_repo_root = Path(__file__).resolve().parent.parent
-if str(_repo_root) not in sys.path:
-    sys.path.insert(0, str(_repo_root))
-
-# ---------------------------------------------------------------------------
-# Optional HuggingFace datasets import
-# ---------------------------------------------------------------------------
-try:
-    from datasets import load_dataset
-    HF_AVAILABLE = True
-except ImportError:
-    HF_AVAILABLE = False
-
-from atroposlib.envs.base import ScoredDataGroup
-from atroposlib.envs.server_handling.server_manager import APIServerConfig
-from atroposlib.type_definitions import Item
-
-from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
-from environments.agent_loop import AgentResult
-from environments.tool_context import ToolContext
-
-logger = logging.getLogger(__name__)
-
-# ---------------------------------------------------------------------------
-# Fallback sample dataset (used when HuggingFace is unavailable)
-# Multi-hop questions requiring real web search to answer.
-# ---------------------------------------------------------------------------
-SAMPLE_QUESTIONS = [
-    {
-        "question": "What is the current population of the capital city of the country that won the 2022 FIFA World Cup?",
-        "answer": "Buenos Aires has approximately 3 million people in the city proper, or around 15 million in the greater metro area.",
-        "difficulty": "medium",
-        "hops": 2,
-    },
-    {
-        "question": "Who is the CEO of the company that makes the most widely used open-source container orchestration platform?",
-        "answer": "The Linux Foundation oversees Kubernetes. CNCF (Cloud Native Computing Foundation) is the specific body — it does not have a traditional CEO but has an executive director.",
-        "difficulty": "medium",
-        "hops": 2,
-    },
-    {
-        "question": "What programming language was used to write the original version of the web framework used by Instagram?",
-        "answer": "Django, which Instagram was built on, is written in Python.",
-        "difficulty": "easy",
-        "hops": 2,
-    },
-    {
-        "question": "In what year was the university founded where the inventor of the World Wide Web currently holds a professorship?",
-        "answer": "Tim Berners-Lee holds a professorship at MIT (founded 1861) and the University of Southampton (founded 1952).",
-        "difficulty": "hard",
-        "hops": 3,
-    },
-    {
-        "question": "What is the latest stable version of the programming language that ranks #1 on the TIOBE index as of this year?",
-        "answer": "Python is currently #1 on TIOBE. The latest stable version should be verified via the official python.org site.",
-        "difficulty": "medium",
-        "hops": 2,
-    },
-    {
-        "question": "How many employees does the parent company of Instagram have?",
-        "answer": "Meta Platforms (parent of Instagram) employs approximately 70,000+ people as of recent reports.",
-        "difficulty": "medium",
-        "hops": 2,
-    },
-    {
-        "question": "What is the current interest rate set by the central bank of the country where the Eiffel Tower is located?",
-        "answer": "The European Central Bank sets rates for France/eurozone. The current rate should be verified — it has changed frequently in 2023-2025.",
-        "difficulty": "hard",
-        "hops": 2,
-    },
-    {
-        "question": "Which company acquired the startup founded by the creator of Oculus VR?",
-        "answer": "Palmer Luckey founded Oculus VR, which was acquired by Facebook (now Meta). He later founded Anduril Industries.",
-        "difficulty": "medium",
-        "hops": 2,
-    },
-    {
-        "question": "What is the market cap of the company that owns the most popular search engine in Russia?",
-        "answer": "Yandex (now split into separate entities after 2024 restructuring). Current market cap should be verified via financial sources.",
-        "difficulty": "hard",
-        "hops": 2,
-    },
-    {
-        "question": "What was the GDP growth rate of the country that hosted the most recent Summer Olympics?",
-        "answer": "Paris, France hosted the 2024 Summer Olympics. France's recent GDP growth should be verified via World Bank or IMF data.",
-        "difficulty": "hard",
-        "hops": 2,
-    },
-]
-
-
-# ---------------------------------------------------------------------------
-# Configuration
-# ---------------------------------------------------------------------------
-
-class WebResearchEnvConfig(HermesAgentEnvConfig):
-    """Configuration for the web research RL environment."""
-
-    # Reward weights
-    correctness_weight: float = Field(
-        default=0.6,
-        description="Weight for answer correctness in reward (LLM judge score).",
-    )
-    tool_usage_weight: float = Field(
-        default=0.2,
-        description="Weight for tool usage signal (did the model actually use web tools?).",
-    )
-    efficiency_weight: float = Field(
-        default=0.2,
-        description="Weight for efficiency signal (penalizes excessive tool calls).",
-    )
-    diversity_bonus: float = Field(
-        default=0.1,
-        description="Bonus reward for citing ≥2 distinct domains.",
-    )
-
-    # Efficiency thresholds
-    efficient_max_calls: int = Field(
-        default=5,
-        description="Maximum tool calls before efficiency penalty begins.",
-    )
-    heavy_penalty_calls: int = Field(
-        default=10,
-        description="Tool call count where efficiency penalty steepens.",
-    )
-
-    # Eval
-    eval_size: int = Field(
-        default=20,
-        description="Number of held-out items for evaluation.",
-    )
-    eval_split_ratio: float = Field(
-        default=0.1,
-        description="Fraction of dataset to hold out for evaluation (0.0–1.0).",
-    )
-
-    # Dataset
-    dataset_name: str = Field(
-        default="google/frames-benchmark",
-        description="HuggingFace dataset name for research questions.",
-    )
-
-
-# ---------------------------------------------------------------------------
-# Environment
-# ---------------------------------------------------------------------------
-
-class WebResearchEnv(HermesAgentBaseEnv):
-    """
-    RL environment for training multi-step web research skills.
-
-    The model is given a factual question requiring 2-3 hops of web research
-    and must use web_search / web_extract tools to find and synthesize the answer.
-
-    Reward is multi-signal:
-      60% — answer correctness (LLM judge)
-      20% — tool usage (did the model actually search the web?)
-      20% — efficiency (penalizes >5 tool calls)
-
-    Bonus +0.1 for source diversity (≥2 distinct domains cited).
-    """
-
-    name = "web-research"
-    env_config_cls = WebResearchEnvConfig
-
-    # Default toolsets for this environment — web + file for saving notes
-    default_toolsets = ["web", "file"]
-
-    @classmethod
-    def config_init(cls) -> Tuple[WebResearchEnvConfig, List[APIServerConfig]]:
-        """Default configuration for the web research environment."""
-        env_config = WebResearchEnvConfig(
-            enabled_toolsets=["web", "file"],
-            max_agent_turns=15,
-            agent_temperature=1.0,
-            system_prompt=(
-                "You are a highly capable research agent. When asked a factual question, "
-                "always use web_search to find current, accurate information before answering. "
-                "Cite at least 2 sources. Be concise and accurate."
-            ),
-            group_size=4,
-            total_steps=1000,
-            steps_per_eval=100,
-            use_wandb=True,
-            wandb_name="web-research",
-        )
-
-        server_configs = [
-            APIServerConfig(
-                base_url="https://openrouter.ai/api/v1",
-                model_name="anthropic/claude-sonnet-4.5",
-                server_type="openai",
-                api_key=os.getenv("OPENROUTER_API_KEY", ""),
-                health_check=False,
-            )
-        ]
-
-        return env_config, server_configs
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._items: list[dict] = []
-        self._eval_items: list[dict] = []
-        self._index: int = 0
-
-        # Metrics tracking for wandb
-        self._reward_buffer: list[float] = []
-        self._correctness_buffer: list[float] = []
-        self._tool_usage_buffer: list[float] = []
-        self._efficiency_buffer: list[float] = []
-        self._diversity_buffer: list[float] = []
-
-    # ------------------------------------------------------------------
-    # 1. Setup — load dataset
-    # ------------------------------------------------------------------
-
-    async def setup(self) -> None:
-        """Load the FRAMES benchmark or fall back to built-in samples."""
-        if HF_AVAILABLE:
-            try:
-                logger.info("Loading FRAMES benchmark from HuggingFace...")
-                ds = load_dataset(self.config.dataset_name, split="test")
-                self._items = [
-                    {
-                        "question": row["Prompt"],
-                        "answer": row["Answer"],
-                        "difficulty": row.get("reasoning_types", "unknown"),
-                        "hops": 2,
-                    }
-                    for row in ds
-                ]
-                # Hold out for eval
-                eval_size = max(
-                    self.config.eval_size,
-                    int(len(self._items) * self.config.eval_split_ratio),
-                )
-                random.shuffle(self._items)
-                self._eval_items = self._items[:eval_size]
-                self._items = self._items[eval_size:]
-                logger.info(
-                    f"Loaded {len(self._items)} train / {len(self._eval_items)} eval items "
-                    f"from FRAMES benchmark."
-                )
-                return
-            except Exception as e:
-                logger.warning(f"Could not load FRAMES from HuggingFace: {e}. Using built-in samples.")
-
-        # Fallback
-        random.shuffle(SAMPLE_QUESTIONS)
-        split = max(1, len(SAMPLE_QUESTIONS) * 8 // 10)
-        self._items = SAMPLE_QUESTIONS[:split]
-        self._eval_items = SAMPLE_QUESTIONS[split:]
-        logger.info(
-            f"Using built-in sample dataset: {len(self._items)} train / "
-            f"{len(self._eval_items)} eval items."
-        )
-
-    # ------------------------------------------------------------------
-    # 2. get_next_item — return the next question
-    # ------------------------------------------------------------------
-
-    async def get_next_item(self) -> dict:
-        """Return the next item, cycling through the dataset."""
-        if not self._items:
-            raise RuntimeError("Dataset is empty. Did you call setup()?")
-        item = self._items[self._index % len(self._items)]
-        self._index += 1
-        return item
-
-    # ------------------------------------------------------------------
-    # 3. format_prompt — build the user-facing prompt
-    # ------------------------------------------------------------------
-
-    def format_prompt(self, item: dict) -> str:
-        """Format the research question as a task prompt."""
-        return (
-            f"Research the following question thoroughly using web search. "
-            f"You MUST search the web to find current, accurate information — "
-            f"do not rely solely on your training data.\n\n"
-            f"Question: {item['question']}\n\n"
-            f"Requirements:\n"
-            f"- Use web_search and/or web_extract tools to find information\n"
-            f"- Search at least 2 different sources\n"
-            f"- Provide a concise, accurate answer (2-4 sentences)\n"
-            f"- Cite the sources you used"
-        )
-
-    # ------------------------------------------------------------------
-    # 4. compute_reward — multi-signal scoring
-    # ------------------------------------------------------------------
-
-    async def compute_reward(
-        self,
-        item: dict,
-        result: AgentResult,
-        ctx: ToolContext,
-    ) -> float:
-        """
-        Multi-signal reward function:
-
-          correctness_weight * correctness  — LLM judge comparing answer to ground truth
-          tool_usage_weight  * tool_used    — binary: did the model use web tools?
-          efficiency_weight  * efficiency   — penalizes wasteful tool usage
-          + diversity_bonus                 — source diversity (≥2 distinct domains)
-        """
-        # Extract final response from messages (last assistant message with content)
-        final_response = ""
-        tools_used: list[str] = []
-        for msg in reversed(result.messages):
-            if msg.get("role") == "assistant" and msg.get("content") and not final_response:
-                final_response = msg["content"]
-            # Collect tool names from tool call messages
-            if msg.get("role") == "assistant" and msg.get("tool_calls"):
-                for tc in msg["tool_calls"]:
-                    fn = tc.get("function", {}) if isinstance(tc, dict) else {}
-                    name = fn.get("name", "")
-                    if name:
-                        tools_used.append(name)
-        tool_call_count: int = result.turns_used or len(tools_used)
-
-        cfg = self.config
-
-        # ---- Signal 1: Answer correctness (LLM judge) ----------------
-        correctness = await self._llm_judge(
-            question=item["question"],
-            expected=item["answer"],
-            model_answer=final_response,
-        )
-
-        # ---- Signal 2: Web tool usage --------------------------------
-        web_tools = {"web_search", "web_extract", "search", "firecrawl"}
-        tool_used = 1.0 if any(t in web_tools for t in tools_used) else 0.0
-
-        # ---- Signal 3: Efficiency ------------------------------------
-        if tool_call_count <= cfg.efficient_max_calls:
-            efficiency = 1.0
-        elif tool_call_count <= cfg.heavy_penalty_calls:
-            efficiency = 1.0 - (tool_call_count - cfg.efficient_max_calls) * 0.08
-        else:
-            efficiency = max(0.0, 1.0 - (tool_call_count - cfg.efficient_max_calls) * 0.12)
-
-        # ---- Bonus: Source diversity ---------------------------------
-        domains = self._extract_domains(final_response)
-        diversity = cfg.diversity_bonus if len(domains) >= 2 else 0.0
-
-        # ---- Combine ------------------------------------------------
-        reward = (
-            cfg.correctness_weight * correctness
-            + cfg.tool_usage_weight * tool_used
-            + cfg.efficiency_weight * efficiency
-            + diversity
-        )
-        reward = min(1.0, max(0.0, reward))  # clamp to [0, 1]
-
-        # Track for wandb
-        self._reward_buffer.append(reward)
-        self._correctness_buffer.append(correctness)
-        self._tool_usage_buffer.append(tool_used)
-        self._efficiency_buffer.append(efficiency)
-        self._diversity_buffer.append(diversity)
-
-        logger.debug(
-            f"Reward breakdown — correctness={correctness:.2f}, "
-            f"tool_used={tool_used:.1f}, efficiency={efficiency:.2f}, "
-            f"diversity={diversity:.1f} → total={reward:.3f}"
-        )
-
-        return reward
-
-    # ------------------------------------------------------------------
-    # 5. evaluate — run on held-out eval split
-    # ------------------------------------------------------------------
-
-    async def evaluate(self, *args, **kwargs) -> None:
-        """Run evaluation on the held-out split using the full agent loop with tools.
-
-        Each eval item runs through the same agent loop as training —
-        the model can use web_search, web_extract, etc. to research answers.
-        This measures actual agentic research capability, not just knowledge.
-        """
-        import time
-        import uuid
-        from environments.agent_loop import HermesAgentLoop
-        from environments.tool_context import ToolContext
-
-        items = self._eval_items
-        if not items:
-            logger.warning("No eval items available.")
-            return
-
-        eval_size = min(self.config.eval_size, len(items))
-        eval_items = items[:eval_size]
-
-        logger.info(f"Running eval on {len(eval_items)} questions (with agent loop + tools)...")
-        start_time = time.time()
-        samples = []
-
-        # Resolve tools once for all eval items
-        tools, valid_names = self._resolve_tools_for_group()
-
-        for i, item in enumerate(eval_items):
-            task_id = str(uuid.uuid4())
-            logger.info(f"Eval [{i+1}/{len(eval_items)}]: {item['question'][:80]}...")
-
-            try:
-                # Build messages
-                messages: List[Dict[str, Any]] = []
-                if self.config.system_prompt:
-                    messages.append({"role": "system", "content": self.config.system_prompt})
-                messages.append({"role": "user", "content": self.format_prompt(item)})
-
-                # Run the full agent loop with tools
-                agent = HermesAgentLoop(
-                    server=self.server,
-                    tool_schemas=tools,
-                    valid_tool_names=valid_names,
-                    max_turns=self.config.max_agent_turns,
-                    task_id=task_id,
-                    temperature=0.0,  # Deterministic for eval
-                    max_tokens=self.config.max_token_length,
-                    extra_body=self.config.extra_body,
-                    budget_config=self.config.build_budget_config(),
-                )
-                result = await agent.run(messages)
-
-                # Extract final response and tool usage from messages
-                final_response = ""
-                tool_call_count = 0
-                for msg in reversed(result.messages):
-                    if msg.get("role") == "assistant" and msg.get("content") and not final_response:
-                        final_response = msg["content"]
-                    if msg.get("role") == "assistant" and msg.get("tool_calls"):
-                        tool_call_count += len(msg["tool_calls"])
-
-                # Compute reward (includes LLM judge for correctness)
-                # Temporarily save buffer lengths so we can extract the
-                # correctness score without calling judge twice, and avoid
-                # polluting training metric buffers with eval data.
-                buf_len = len(self._correctness_buffer)
-                ctx = ToolContext(task_id)
-                try:
-                    reward = await self.compute_reward(item, result, ctx)
-                finally:
-                    ctx.cleanup()
-
-                # Extract correctness from the buffer (compute_reward appended it)
-                # then remove eval entries from training buffers
-                correctness = (
-                    self._correctness_buffer[buf_len]
-                    if len(self._correctness_buffer) > buf_len
-                    else 0.0
-                )
-                # Roll back buffers to avoid polluting training metrics
-                for buf in (
-                    self._reward_buffer, self._correctness_buffer,
-                    self._tool_usage_buffer, self._efficiency_buffer,
-                    self._diversity_buffer,
-                ):
-                    if len(buf) > buf_len:
-                        buf.pop()
-
-                samples.append({
-                    "prompt": item["question"],
-                    "response": final_response[:500],
-                    "expected": item["answer"],
-                    "correctness": correctness,
-                    "reward": reward,
-                    "tool_calls": tool_call_count,
-                    "turns": result.turns_used,
-                })
-
-                logger.info(
-                    f"  → correctness={correctness:.2f}, reward={reward:.3f}, "
-                    f"tools={tool_call_count}, turns={result.turns_used}"
-                )
-
-            except Exception as e:
-                logger.error(f"Eval error on item: {e}")
-                samples.append({
-                    "prompt": item["question"],
-                    "response": f"ERROR: {e}",
-                    "expected": item["answer"],
-                    "correctness": 0.0,
-                    "reward": 0.0,
-                    "tool_calls": 0,
-                    "turns": 0,
-                })
-
-        end_time = time.time()
-
-        # Compute aggregate metrics
-        correctness_scores = [s["correctness"] for s in samples]
-        rewards = [s["reward"] for s in samples]
-        tool_counts = [s["tool_calls"] for s in samples]
-        n = len(samples)
-
-        eval_metrics = {
-            "eval/mean_correctness": sum(correctness_scores) / n if n else 0.0,
-            "eval/mean_reward": sum(rewards) / n if n else 0.0,
-            "eval/mean_tool_calls": sum(tool_counts) / n if n else 0.0,
-            "eval/tool_usage_rate": sum(1 for t in tool_counts if t > 0) / n if n else 0.0,
-            "eval/n_items": n,
-        }
-
-        logger.info(
-            f"Eval complete — correctness={eval_metrics['eval/mean_correctness']:.3f}, "
-            f"reward={eval_metrics['eval/mean_reward']:.3f}, "
-            f"tool_usage={eval_metrics['eval/tool_usage_rate']:.0%}"
-        )
-
-        await self.evaluate_log(
-            metrics=eval_metrics,
-            samples=samples,
-            start_time=start_time,
-            end_time=end_time,
-        )
-
-    # ------------------------------------------------------------------
-    # 6. wandb_log — custom metrics
-    # ------------------------------------------------------------------
-
-    async def wandb_log(self, wandb_metrics: Optional[Dict] = None) -> None:
-        """Log reward breakdown metrics to wandb."""
-        if wandb_metrics is None:
-            wandb_metrics = {}
-
-        if self._reward_buffer:
-            n = len(self._reward_buffer)
-            wandb_metrics["train/mean_reward"] = sum(self._reward_buffer) / n
-            wandb_metrics["train/mean_correctness"] = sum(self._correctness_buffer) / n
-            wandb_metrics["train/mean_tool_usage"] = sum(self._tool_usage_buffer) / n
-            wandb_metrics["train/mean_efficiency"] = sum(self._efficiency_buffer) / n
-            wandb_metrics["train/mean_diversity"] = sum(self._diversity_buffer) / n
-            wandb_metrics["train/total_rollouts"] = n
-
-            # Accuracy buckets
-            wandb_metrics["train/correct_rate"] = (
-                sum(1 for c in self._correctness_buffer if c >= 0.7) / n
-            )
-            wandb_metrics["train/tool_usage_rate"] = (
-                sum(1 for t in self._tool_usage_buffer if t > 0) / n
-            )
-
-            # Clear buffers
-            self._reward_buffer.clear()
-            self._correctness_buffer.clear()
-            self._tool_usage_buffer.clear()
-            self._efficiency_buffer.clear()
-            self._diversity_buffer.clear()
-
-        await super().wandb_log(wandb_metrics)
-
-    # ------------------------------------------------------------------
-    # Private helpers
-    # ------------------------------------------------------------------
-
-    async def _llm_judge(
-        self,
-        question: str,
-        expected: str,
-        model_answer: str,
-    ) -> float:
-        """
-        Use the server's LLM to judge answer correctness.
-        Falls back to keyword heuristic if LLM call fails.
-        """
-        if not model_answer or not model_answer.strip():
-            return 0.0
-
-        judge_prompt = (
-            "You are an impartial judge evaluating the quality of an AI research answer.\n\n"
-            f"Question: {question}\n\n"
-            f"Reference answer: {expected}\n\n"
-            f"Model answer: {model_answer}\n\n"
-            "Score the model answer on a scale from 0.0 to 1.0 where:\n"
-            "  1.0 = fully correct and complete\n"
-            "  0.7 = mostly correct with minor gaps\n"
-            "  0.4 = partially correct\n"
-            "  0.1 = mentions relevant topic but wrong or very incomplete\n"
-            "  0.0 = completely wrong or no answer\n\n"
-            "Consider: factual accuracy, completeness, and relevance.\n"
-            'Respond with ONLY a JSON object: {"score": <float>, "reason": "<one sentence>"}'
-        )
-
-        try:
-            response = await self.server.chat_completion(
-                messages=[{"role": "user", "content": judge_prompt}],
-                n=1,
-                max_tokens=150,
-                temperature=0.0,
-                split="eval",
-            )
-            text = response.choices[0].message.content if response.choices else ""
-            parsed = self._parse_judge_json(text)
-            if parsed is not None:
-                return float(parsed)
-        except Exception as e:
-            logger.debug(f"LLM judge failed: {e}. Using heuristic.")
-
-        return self._heuristic_score(expected, model_answer)
-
-    @staticmethod
-    def _parse_judge_json(text: str) -> Optional[float]:
-        """Extract the score float from LLM judge JSON response."""
-        try:
-            clean = re.sub(r"```(?:json)?|```", "", text).strip()
-            data = json.loads(clean)
-            score = float(data.get("score", -1))
-            if 0.0 <= score <= 1.0:
-                return score
-        except Exception:
-            match = re.search(r'"score"\s*:\s*([0-9.]+)', text)
-            if match:
-                score = float(match.group(1))
-                if 0.0 <= score <= 1.0:
-                    return score
-        return None
-
-    @staticmethod
-    def _heuristic_score(expected: str, model_answer: str) -> float:
-        """Lightweight keyword overlap score as fallback."""
-        stopwords = {
-            "the", "a", "an", "is", "are", "was", "were", "of", "in", "on",
-            "at", "to", "for", "with", "and", "or", "but", "it", "its",
-            "this", "that", "as", "by", "from", "be", "has", "have", "had",
-        }
-
-        def tokenize(text: str) -> set:
-            tokens = re.findall(r'\b\w+\b', text.lower())
-            return {t for t in tokens if t not in stopwords and len(t) > 2}
-
-        expected_tokens = tokenize(expected)
-        answer_tokens = tokenize(model_answer)
-
-        if not expected_tokens:
-            return 0.5
-
-        overlap = len(expected_tokens & answer_tokens)
-        union = len(expected_tokens | answer_tokens)
-
-        jaccard = overlap / union if union > 0 else 0.0
-        recall = overlap / len(expected_tokens)
-        return min(1.0, 0.4 * jaccard + 0.6 * recall)
-
-    @staticmethod
-    def _extract_domains(text: str) -> set:
-        """Extract unique domains from URLs cited in the response."""
-        urls = re.findall(r'https?://[^\s\)>\]"\']+', text)
-        domains = set()
-        for url in urls:
-            try:
-                parsed = urlparse(url)
-                domain = parsed.netloc.lower().lstrip("www.")
-                if domain:
-                    domains.add(domain)
-            except Exception:
-                pass
-        return domains
-
-
-# ---------------------------------------------------------------------------
-# Entry point
-# ---------------------------------------------------------------------------
-
-if __name__ == "__main__":
-    WebResearchEnv.cli()
@@ -941,6 +941,14 @@ def load_gateway_config() -> GatewayConfig:
                    if isinstance(ntc, list):
                        ntc = ",".join(str(v) for v in ntc)
                    os.environ["DISCORD_NO_THREAD_CHANNELS"] = str(ntc)
+                # history_backfill: recover missed channel messages for shared sessions
+                # when require_mention is active.  Fetches messages between bot turns
+                # and prepends them to the user message for context.
+                if "history_backfill" in discord_cfg and not os.getenv("DISCORD_HISTORY_BACKFILL"):
+                    os.environ["DISCORD_HISTORY_BACKFILL"] = str(discord_cfg["history_backfill"]).lower()
+                hbl = discord_cfg.get("history_backfill_limit")
+                if hbl is not None and not os.getenv("DISCORD_HISTORY_BACKFILL_LIMIT"):
+                    os.environ["DISCORD_HISTORY_BACKFILL_LIMIT"] = str(hbl)
                # allow_mentions: granular control over what the bot can ping.
                # Safe defaults (no @everyone/roles) are applied in the adapter;
                # these YAML keys only override when set and let users opt back
@@ -356,15 +356,34 @@ class ResponseStore:
        # Evict oldest entries beyond max_size
        count = self._conn.execute("SELECT COUNT(*) FROM responses").fetchone()[0]
        if count > self._max_size:
-            self._conn.execute(
-                "DELETE FROM responses WHERE response_id IN "
-                "(SELECT response_id FROM responses ORDER BY accessed_at ASC LIMIT ?)",
-                (count - self._max_size,),
-            )
+            # Evict the least-recently-accessed rows via a subquery instead of a
+            # Python-built "IN (?, ?, ...)" list: each statement then binds exactly
+            # one parameter, so a large eviction (e.g. max_size lowered on an
+            # existing database) cannot exceed SQLITE_MAX_VARIABLE_NUMBER (999
+            # before SQLite 3.32).  response_id (assumed unique) breaks
+            # accessed_at ties so both statements select the same rows.
+            excess = count - self._max_size
+            oldest = (
+                "SELECT response_id FROM responses "
+                "ORDER BY accessed_at ASC, response_id ASC LIMIT ?"
+            )
+            # Clear conversation mappings first; responses is still intact here.
+            self._conn.execute(
+                f"DELETE FROM conversations WHERE response_id IN ({oldest})",
+                (excess,),
+            )
+            self._conn.execute(
+                f"DELETE FROM responses WHERE response_id IN ({oldest})",
+                (excess,),
+            )
        self._conn.commit()

    def delete(self, response_id: str) -> bool:
        """Remove a response from the store. Returns True if found and deleted."""
+        # Clear conversation mappings pointing to this response
+        self._conn.execute(
+            "DELETE FROM conversations WHERE response_id = ?", (response_id,)
+        )
        cursor = self._conn.execute(
            "DELETE FROM responses WHERE response_id = ?", (response_id,)
        )
@@ -955,6 +955,12 @@ class MessageEvent:
    # Per-channel ephemeral system prompt (e.g. Discord channel_prompts).
    # Applied at API call time and never persisted to transcript history.
    channel_prompt: Optional[str] = None
+
+    # Channel context recovered by history backfill (e.g. messages between
+    # bot turns that were missed due to require_mention).  Kept separate
+    # from ``text`` so the sender-prefix logic in run.py can operate on the
+    # trigger message alone, then prepend this context afterward.
+    channel_context: Optional[str] = None
    
    # Internal flag — set for synthetic events (e.g. background process
    # completion notifications) that must bypass user authorization checks.
@@ -2955,9 +2961,25 @@ class BasePlatformAdapter(ABC):
                merge_pending_message_event(self._pending_messages, session_key, event)
                return  # Don't interrupt now - will run after current task completes

-            # Default behavior for non-photo follow-ups: interrupt the running agent
+            # Default behavior for non-photo follow-ups: interrupt the running agent.
+            #
+            # Use merge_text=True so rapid TEXT follow-ups (#4469) accumulate
+            # into the single pending slot instead of clobbering each other.
+            # Without merging, three rapid messages "A", "B", "C" land like:
+            #   _pending_messages[k] = A  (interrupts)
+            #   _pending_messages[k] = B  (replaces A before consumer reads)
+            #   _pending_messages[k] = C  (replaces B)
+            # ...and only "C" reaches the next turn.  merge_pending_message_event
+            # already does the right thing for photo/media bursts; the
+            # ``merge_text=True`` flag extends that to plain TEXT events.
+            # Same shape as the Telegram bursty-grace path in gateway/run.py.
            logger.debug("[%s] New message while session %s is active — triggering interrupt", self.name, session_key)
-            self._pending_messages[session_key] = event
+            merge_pending_message_event(
+                self._pending_messages,
+                session_key,
+                event,
+                merge_text=True,
+            )
            # Signal the interrupt (the processing task checks this)
            self._active_sessions[session_key].set()
            return  # Don't process now - will be handled after current task finishes
@@ -589,6 +589,10 @@ class DiscordAdapter(BasePlatformAdapter):
        # chunk only, default), "all" (reply-reference on every chunk).
        self._reply_to_mode: str = getattr(config, 'reply_to_mode', 'first') or 'first'
        self._slash_commands: bool = self.config.extra.get("slash_commands", True)
+        # In-memory cache of the bot's last message ID per channel, used by
+        # history backfill to skip the full scan on hot paths.  Falls back to
+        # scanning channel.history() on cache miss (cold start / restart).
+        self._last_self_message_id: Dict[str, str] = {}

    async def connect(self) -> bool:
        """Connect to Discord and start receiving events."""
@@ -1459,6 +1463,12 @@ class DiscordAdapter(BasePlatformAdapter):
                        raise
                message_ids.append(str(msg.id))

+            # Track the last message we sent in this channel for history
+            # backfill — avoids a full channel.history() scan on hot paths.
+            if message_ids:
+                _target_id = thread_id or chat_id
+                self._last_self_message_id[_target_id] = message_ids[-1]
+
            return SendResult(
                success=True,
                message_id=message_ids[0] if message_ids else None,
@@ -3596,6 +3606,134 @@ class DiscordAdapter(BasePlatformAdapter):
            return bool(configured)
        return os.getenv("DISCORD_THREAD_REQUIRE_MENTION", "false").lower() in ("true", "1", "yes", "on")

+    def _discord_history_backfill(self) -> bool:
+        """Return whether Discord history backfill is enabled.
+
+        The ``history_backfill`` key in ``config.extra`` wins when it is set:
+        a string value enables backfill unless it spells a falsy word
+        (``false``/``0``/``no``/``off``); any other value is coerced with
+        ``bool()``.  When the key is unset, the ``DISCORD_HISTORY_BACKFILL``
+        environment variable decides, defaulting to enabled.
+        """
+        configured = self.config.extra.get("history_backfill")
+        if configured is not None:
+            if isinstance(configured, str):
+                return configured.lower() not in ("false", "0", "no", "off")
+            return bool(configured)
+        # Accept "on" as well, matching DISCORD_THREAD_REQUIRE_MENTION and the
+        # config branch above; without it DISCORD_HISTORY_BACKFILL=on silently
+        # turned backfill off.
+        return os.getenv("DISCORD_HISTORY_BACKFILL", "true").lower() in ("true", "1", "yes", "on")
+
+    def _discord_history_backfill_limit(self) -> int:
+        """Return the cap on how many messages backfill scans backwards.
+
+        The scan normally ends sooner, at the bot's own last message in the
+        channel; this cap only matters on cold starts and long gaps where no
+        earlier bot message shows up in recent history.
+
+        Sources are tried in order: ``config.extra["history_backfill_limit"]``,
+        then the ``DISCORD_HISTORY_BACKFILL_LIMIT`` environment variable.  The
+        first one that converts with ``int()`` wins; if neither does, 50 is
+        returned.
+        """
+        candidates = (
+            self.config.extra.get("history_backfill_limit"),
+            os.getenv("DISCORD_HISTORY_BACKFILL_LIMIT", "50"),
+        )
+        for candidate in candidates:
+            if candidate is None:
+                # Unset config key — move on to the environment variable.
+                continue
+            try:
+                return int(candidate)
+            except (ValueError, TypeError):
+                # Unparseable value — try the next source.
+                continue
+        return 50
+
+    async def _fetch_channel_context(
+        self,
+        channel: Any,
+        before: "DiscordMessage",
+    ) -> str:
+        """Fetch recent channel messages for conversational context.
+
+        Scans backwards from *before* and collects messages until it hits
+        a message sent by this bot (the natural partition point between
+        bot turns) or reaches ``history_backfill_limit``.
+
+        Args:
+            channel: Object exposing ``id`` and an async ``history()``
+                iterator — presumably a discord.py channel or thread;
+                confirm against callers.
+            before: The trigger message; passed to ``history()`` as the
+                upper bound of the scan.
+
+        Returns a formatted block like::
+
+            [Recent channel messages]
+            [Alice] some message
+            [Bob [bot]] another message
+
+        Returns an empty string if no context is available.
+
+        Errors raised while reading history are swallowed:
+        ``discord.Forbidden`` is logged at debug level, any other exception
+        at warning level, and both yield an empty string.
+        """
+        limit = self._discord_history_backfill_limit()
+        # A zero or negative limit disables the scan entirely.
+        if limit <= 0:
+            return ""
+
+        # Determine which bot messages to include in context
+        allow_bots_raw = os.getenv("DISCORD_ALLOW_BOTS", "none").lower().strip()
+        include_other_bots = allow_bots_raw != "none"
+
+        # Use the in-memory cache to narrow the fetch window on hot paths.
+        # If we know our last message ID in this channel, pass it as `after`
+        # to avoid scanning the full limit.  Falls back to scanning on cache
+        # miss (cold start / restart).
+        # Guard: only use the cache when it's chronologically before the
+        # trigger — Discord snowflake IDs are monotonically increasing, so
+        # a simple int comparison suffices.
+        channel_id = str(getattr(channel, "id", ""))
+        _cached_id = self._last_self_message_id.get(channel_id)
+        _after_obj = None
+        try:
+            if _cached_id and int(_cached_id) < int(before.id):
+                _after_obj = discord.Object(id=int(_cached_id))
+        except (ValueError, TypeError):
+            pass  # Malformed cache entry — fall back to cold-start scan
+
+        try:
+            collected = []
+            # IMPORTANT: pass oldest_first=False explicitly.  discord.py 2.x
+            # silently flips the default to True when `after=` is supplied,
+            # which would select the *earliest* N messages after our last
+            # response instead of the *latest* N before the trigger.  In
+            # high-traffic windows that returns stale tool traces and drops
+            # the actual final answer.  See the regression test
+            # `test_fetch_channel_context_cache_uses_latest_window_when_after_set`.
+            async for msg in channel.history(
+                limit=limit,
+                before=before,
+                after=_after_obj,
+                oldest_first=False,
+            ):
+                # Stop at our own message — this is the partition point.
+                # Everything before this is already in the session transcript.
+                # (Redundant when _after_obj is set, but needed for cold start.)
+                if msg.author == self._client.user:
+                    break
+
+                # Skip system messages (pins, joins, thread renames, etc.)
+                if msg.type not in (discord.MessageType.default, discord.MessageType.reply):
+                    continue
+
+                # Respect DISCORD_ALLOW_BOTS for other bots.
+                # For history context, "mentions" is treated as "all" — we are
+                # deciding what context to show, not whether to respond.
+                if getattr(msg.author, "bot", False) and not include_other_bots:
+                    continue
+
+                # Prefer clean_content when the message object provides it.
+                content = getattr(msg, "clean_content", msg.content) or ""
+                # Attachment-only messages get a placeholder so they still
+                # appear in the context block; fully empty messages are dropped.
+                if not content and msg.attachments:
+                    content = "(attachment)"
+                if not content:
+                    continue
+
+                # Tag other bots so their lines can be told apart from users.
+                name = msg.author.display_name
+                if getattr(msg.author, "bot", False):
+                    name = f"{name} [bot]"
+                collected.append(f"[{name}] {content}")
+
+            if not collected:
+                return ""
+
+            # channel.history returns newest-first (oldest_first=False); reverse for chronological order
+            collected.reverse()
+            return "[Recent channel messages]\n" + "\n".join(collected)
+
+        except discord.Forbidden:
+            # Missing read-history permission is expected in some channels,
+            # so it is logged quietly.
+            logger.debug("[%s] Missing permissions to fetch channel history", self.name)
+            return ""
+        except Exception as e:
+            # Best-effort: any other failure degrades to "no context".
+            logger.warning("[%s] Failed to fetch channel history: %s", self.name, e)
+            return ""
+
    def _thread_parent_channel(self, channel: Any) -> Any:
        """Return the parent text channel when invoked from a thread."""
        return getattr(channel, "parent", None) or channel
@@ -4504,9 +4642,50 @@ class DiscordAdapter(BasePlatformAdapter):
        if pending_text_injection:
            event_text = f"{pending_text_injection}\n\n{event_text}" if event_text else pending_text_injection

+        # ── History backfill ─────────────────────────────────────────
+        # When require_mention is active, the bot only processes messages
+        # that @mention it.  Messages in the channel between bot turns are
+        # invisible to the session transcript.  To recover that context,
+        # fetch recent channel history and prepend it to the user message.
+        #
+        # The fetch window is: everything after the bot's last message in
+        # the channel up to (but not including) the current trigger.  On
+        # cold start (no prior bot message found), fetch the last N messages
+        # and stop at the first self-message encountered.
+        #
+        # Threads naturally scope to thread-only history (channel.history()
+        # on a thread returns only that thread's messages).  DMs are skipped
+        # because every DM message triggers the bot — there's no mention gap
+        # to fill; the session transcript already has everything.
+        #
+        # Per-user sessions also benefit: Alice's session is missing the
+        # other-channel-participants' context, and her own messages from
+        # before she mentioned the bot.  Backfill fills that gap.
+        #
+        # Messages that arrive while the bot is processing (between trigger
+        # and response) are not captured — this is an accepted simplification
+        # to keep the partition rule clean.
+        _channel_context = None
+        _is_dm = isinstance(message.channel, discord.DMChannel)
+        if not _is_dm:
+            _needed_mention = (
+                require_mention
+                and not is_free_channel
+                and not in_bot_thread
+            )
+            _backfill_enabled = self._discord_history_backfill()
+            if _needed_mention and _backfill_enabled:
+                _backfill_text = await self._fetch_channel_context(
+                    message.channel, before=message,
+                )
+                if _backfill_text:
+                    _channel_context = _backfill_text
+
        # Defense-in-depth: prevent empty user messages from entering session
-        # (can happen when user sends @mention-only with no other text)
-        if not event_text or not event_text.strip():
+        # (can happen when user sends @mention-only with no other text).
+        # When channel_context is present, a bare mention means "catch me up"
+        # — the context IS the message, so skip the placeholder.
+        if (not event_text or not event_text.strip()) and not _channel_context:
            event_text = "(The user sent a message with no text content)"

        _chan = message.channel
@@ -4535,6 +4714,7 @@ class DiscordAdapter(BasePlatformAdapter):
            timestamp=message.created_at,
            auto_skill=_skills,
            channel_prompt=_channel_prompt,
+            channel_context=_channel_context,
        )

        # Track thread participation so the bot won't require @mention for
@@ -2273,11 +2273,7 @@ class FeishuAdapter(BasePlatformAdapter):
                    daemon=True,
                ).start()
            return
-        future = asyncio.run_coroutine_threadsafe(
-            self._handle_message_event_data(data),
-            loop,
-        )
-        future.add_done_callback(self._log_background_failure)
+        self._submit_on_loop(loop, self._handle_message_event_data(data))

    def _enqueue_pending_inbound_event(self, data: Any) -> bool:
        """Append an event to the pending-inbound queue.
@@ -2353,16 +2349,12 @@ class FeishuAdapter(BasePlatformAdapter):
                    dispatched = 0
                    requeue: List[Any] = []
                    for event in batch:
-                        try:
-                            fut = asyncio.run_coroutine_threadsafe(
-                                self._handle_message_event_data(event),
-                                loop,
-                            )
-                            fut.add_done_callback(self._log_background_failure)
+                        if self._submit_on_loop(
+                            loop, self._handle_message_event_data(event)
+                        ):
                            dispatched += 1
-                        except RuntimeError:
-                            # Loop closed between check and submit — requeue
-                            # and poll again.
+                        else:
+                            # Loop closed/unavailable — requeue and poll again.
                            requeue.append(event)
                    if requeue:
                        with self._pending_inbound_lock:
@@ -2466,11 +2458,10 @@ class FeishuAdapter(BasePlatformAdapter):
        if not self._loop_accepts_callbacks(loop):
            logger.warning("[Feishu] Dropping drive comment event before adapter loop is ready")
            return
-        future = asyncio.run_coroutine_threadsafe(
-            handle_drive_comment_event(self._client, data, self_open_id=self._bot_open_id),
+        self._submit_on_loop(
            loop,
+            handle_drive_comment_event(self._client, data, self_open_id=self._bot_open_id),
        )
-        future.add_done_callback(self._log_background_failure)

    def _on_reaction_event(self, event_type: str, data: Any) -> None:
        """Route user reactions on bot messages as synthetic text events."""
@@ -2498,11 +2489,7 @@ class FeishuAdapter(BasePlatformAdapter):
            or bool(getattr(loop, "is_closed", lambda: False)())
        ):
            return
-        future = asyncio.run_coroutine_threadsafe(
-            self._handle_reaction_event(event_type, data),
-            loop,
-        )
-        future.add_done_callback(self._log_background_failure)
+        self._submit_on_loop(loop, self._handle_reaction_event(event_type, data))

    def _on_card_action_trigger(self, data: Any) -> Any:
        """Handle card-action callback from the Feishu SDK (synchronous).
@@ -2548,11 +2535,14 @@ class FeishuAdapter(BasePlatformAdapter):

    def _submit_on_loop(self, loop: Any, coro: Any) -> bool:
        """Schedule background work on the adapter loop with shared failure logging."""
-        try:
-            future = asyncio.run_coroutine_threadsafe(coro, loop)
-        except Exception:
-            coro.close()
-            logger.warning("[Feishu] Failed to schedule background callback work", exc_info=True)
+        from agent.async_utils import safe_schedule_threadsafe
+        future = safe_schedule_threadsafe(
+            coro, loop,
+            logger=logger,
+            log_message="[Feishu] Failed to schedule background callback work",
+            log_level=logging.WARNING,
+        )
+        if future is None:
            return False
        future.add_done_callback(self._log_background_failure)
        return True
@@ -2785,7 +2785,10 @@ class SlackAdapter(BasePlatformAdapter):
            from hermes_cli.commands import slack_subcommand_map
            subcommand_map = slack_subcommand_map()
            subcommand_map["compact"] = "/compress"
-            first_word = text.split()[0] if text else ""
+            # Guard against whitespace-only text where ``text`` is truthy but
+            # ``text.split()`` returns ``[]`` (e.g. user sends ``/hermes   ``).
+            parts = text.split() if text else []
+            first_word = parts[0] if parts else ""
            if first_word in subcommand_map:
                rest = text[len(first_word):].strip()
                text = f"{subcommand_map[first_word]} {rest}".strip() if rest else subcommand_map[first_word]
@@ -493,13 +493,45 @@ class WhatsAppAdapter(BasePlatformAdapter):
        """
        if not check_whatsapp_requirements():
            logger.warning("[%s] Node.js not found. WhatsApp requires Node.js.", self.name)
+            self._set_fatal_error(
+                "whatsapp_node_missing",
+                "Node.js is not installed — install Node.js and re-run `hermes gateway`.",
+                retryable=False,
+            )
            return False
        
        bridge_path = Path(self._bridge_script)
        if not bridge_path.exists():
            logger.warning("[%s] Bridge script not found: %s", self.name, bridge_path)
+            self._set_fatal_error(
+                "whatsapp_bridge_missing",
+                f"WhatsApp bridge script missing at {bridge_path}.",
+                retryable=False,
+            )
            return False
-        
+
+        # Pre-flight: skip the 30s bridge bootstrap entirely if the user
+        # never finished pairing.  Without creds.json the bridge prints
+        # QR codes to its log file and never reaches status:connected,
+        # so every gateway restart paid the 30s timeout + queued WhatsApp
+        # for indefinite retries.  Mark non-retryable so the user gets a
+        # clear "run hermes whatsapp" message instead of the watcher
+        # silently hammering an unconfigured platform.
+        creds_path = self._session_path / "creds.json"
+        if not creds_path.exists():
+            logger.warning(
+                "[%s] WhatsApp is enabled but not paired (no creds.json at %s). "
+                "Run `hermes whatsapp` to pair, or remove WHATSAPP_ENABLED from "
+                "your .env to disable.",
+                self.name, creds_path,
+            )
+            self._set_fatal_error(
+                "whatsapp_not_paired",
+                "WhatsApp enabled but not paired — run `hermes whatsapp` to pair.",
+                retryable=False,
+            )
+            return False
+
        logger.info("[%s] Bridge found at %s", self.name, bridge_path)
        
        # Acquire scoped lock to prevent duplicate sessions
@@ -147,6 +147,9 @@ _YB_RES_REF_RE = re.compile(
    r"\[(image|voice|video|file(?::[^|\]]*)?)\|ybres:([A-Za-z0-9_\-]+)\]"
 )

+# Media kinds that can be resolved and injected into the model context
+_RESOLVABLE_MEDIA_KINDS = frozenset({"image", "file"})
+
 # Strip page indicators like (1/3) appended by BasePlatformAdapter
 _INDICATOR_RE = re.compile(r'\s*\(\d+/\d+\)$')

@@ -925,6 +928,7 @@ class InboundContext:
    # Populated by QuoteContextMiddleware
    reply_to_message_id: Optional[str] = None
    reply_to_text: Optional[str] = None
+    quote_media_refs: list = dc_field(default_factory=list)  # List of (rid, kind, filename)

    # Populated by MediaResolveMiddleware
    media_urls: list = dc_field(default_factory=list)
@@ -1645,6 +1649,25 @@ class ExtractContentMiddleware(InboundMiddleware):
            return None
        return f"[link: {link} | visit link for full content]"

+    @staticmethod
+    def _parse_resource_id(url: str) -> str:
+        """Pull the ``resourceId`` query parameter out of a Yuanbao resource URL.
+
+        Args:
+            url: Resource URL (e.g., https://...?resourceId=abc123)
+
+        Returns:
+            The first ``resourceId`` (or all-lowercase ``resourceid``) value
+            with surrounding whitespace stripped, or an empty string when the
+            URL is empty, has no such parameter, or cannot be parsed.
+        """
+        if not url:
+            return ""
+        try:
+            params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
+            # Canonical camelCase spelling first, then the lowercase variant.
+            for key in ("resourceId", "resourceid"):
+                values = params.get(key)
+                if values:
+                    return str(values[0]).strip()
+            return ""
+        except Exception:
+            return ""
+
    @classmethod
    def _extract_text(cls, msg_body: list) -> str:
        """Extract plain text content from MsgBody.
@@ -1668,14 +1691,35 @@ class ExtractContentMiddleware(InboundMiddleware):
                if text:
                    parts.append(text)
            elif elem_type == "TIMImageElem":
-                parts.append("[image]")
+                # Extract resourceId from image_info_array URL
+                image_info_array = content.get("image_info_array")
+                if not isinstance(image_info_array, list):
+                    image_info_array = []
+                image_info = None
+                # Prefer medium image (index 1), fallback to index 0
+                if len(image_info_array) > 1 and isinstance(image_info_array[1], dict):
+                    image_info = image_info_array[1]
+                elif len(image_info_array) > 0 and isinstance(image_info_array[0], dict):
+                    image_info = image_info_array[0]
+                image_url = str((image_info or {}).get("url") or "").strip()
+                rid = cls._parse_resource_id(image_url)
+                parts.append(f"[image|ybres:{rid}]" if rid else "[image]")
            elif elem_type == "TIMFileElem":
                filename = content.get("file_name", content.get("fileName", content.get("filename", "")))
-                parts.append(f"[file: {filename}]" if filename else "[file]")
+                file_url = str(content.get("url") or "").strip()
+                rid = cls._parse_resource_id(file_url)
+                if rid:
+                    parts.append(f"[file:{filename}|ybres:{rid}]" if filename else f"[file|ybres:{rid}]")
+                else:
+                    parts.append(f"[file: {filename}]" if filename else "[file]")
            elif elem_type == "TIMSoundElem":
-                parts.append("[voice]")
+                sound_url = str(content.get("url") or "").strip()
+                rid = cls._parse_resource_id(sound_url)
+                parts.append(f"[voice|ybres:{rid}]" if rid else "[voice]")
            elif elem_type == "TIMVideoFileElem":
-                parts.append("[video]")
+                video_url = str(content.get("url") or "").strip()
+                rid = cls._parse_resource_id(video_url)
+                parts.append(f"[video|ybres:{rid}]" if rid else "[video]")
            elif elem_type == "TIMCustomElem":
                data_val = content.get("data", "")
                if data_val:
@@ -2132,22 +2176,23 @@ class QuoteContextMiddleware(InboundMiddleware):
    name = "quote-context"

    @staticmethod
-    def _extract_quote_context(cloud_custom_data: str) -> Tuple[Optional[str], Optional[str]]:
+    def _extract_quote_context(cloud_custom_data: str) -> Tuple[Optional[str], Optional[str], list]:
        """Extract quote context, mapping to MessageEvent.reply_to_*.

        Returns:
-          (reply_to_message_id, reply_to_text)
+          (reply_to_message_id, reply_to_text, quote_media_refs)
+          where quote_media_refs is a list of (rid, kind, filename) tuples
        """
        if not cloud_custom_data:
-            return None, None
+            return None, None, []
        try:
            parsed = json.loads(cloud_custom_data)
        except (json.JSONDecodeError, TypeError):
-            return None, None
+            return None, None, []

        quote = parsed.get("quote") if isinstance(parsed, dict) else None
        if not isinstance(quote, dict):
-            return None, None
+            return None, None, []

        # type=2 corresponds to image reference; desc may be empty, provide a placeholder.
        quote_type = int(quote.get("type") or 0)
@@ -2155,15 +2200,26 @@ class QuoteContextMiddleware(InboundMiddleware):
        if quote_type == 2 and not desc:
            desc = "[image]"
        if not desc:
-            return None, None
+            return None, None, []

        quote_id = str(quote.get("id") or "").strip() or None
        sender = str(quote.get("sender_nickname") or quote.get("sender_id") or "").strip()
        quote_text = f"{sender}: {desc}" if sender else desc
-        return quote_id, quote_text
+
+        # Extract media references from desc using _YB_RES_REF_RE regex
+        media_refs: list = []
+        for m in _YB_RES_REF_RE.finditer(desc):
+            head = m.group(1)  # "image" | "file:<name>" | "voice" | "video"
+            rid = m.group(2)
+            kind, _, filename = head.partition(":")
+            kind = kind.strip()
+            media_refs.append((rid, kind, filename.strip()))
+
+        return quote_id, quote_text, media_refs

    async def handle(self, ctx: InboundContext, next_fn) -> None:
-        ctx.reply_to_message_id, ctx.reply_to_text = self._extract_quote_context(ctx.cloud_custom_data)
+        ctx.reply_to_message_id, ctx.reply_to_text, ctx.quote_media_refs = self._extract_quote_context(ctx.cloud_custom_data)
+
        await next_fn()


@@ -2332,7 +2388,7 @@ class MediaResolveMiddleware(InboundMiddleware):
        for ref in media_refs:
            kind = str(ref.get("kind") or "").strip().lower()
            url = str(ref.get("url") or "").strip()
-            if kind not in {"image", "file"} or not url:
+            if kind not in _RESOLVABLE_MEDIA_KINDS or not url:
                continue

            try:
@@ -2391,7 +2447,7 @@ class MediaResolveMiddleware(InboundMiddleware):
                rid = m.group(2)
                kind, _, filename = head.partition(":")
                kind = kind.strip()
-                if kind not in {"image", "file"}:
+                if kind not in _RESOLVABLE_MEDIA_KINDS:
                    continue
                if rid in seen:
                    continue
@@ -2458,26 +2514,82 @@ class DispatchMiddleware(InboundMiddleware):
            media_urls = list(ctx.media_urls)
            media_types = list(ctx.media_types)

-            # Backfill observed media from recent transcript history
-            extra_img_urls: List[str] = []
-            extra_img_mimes: List[str] = []
-            try:
-                extra_img_urls, extra_img_mimes = await MediaResolveMiddleware._collect_observed_media(
-                    adapter, ctx.source,
-                )
-            except Exception as exc:
-                logger.warning(
-                    "[%s] observed-image hydration raised, continuing anyway: %s",
-                    adapter.name, exc,
-                )
-            if extra_img_urls:
-                current = set(media_urls)
-                for u, m in zip(extra_img_urls, extra_img_mimes):
-                    if u in current:
+            # If user quoted a message (reply_to_message_id is set), resolve only
+            # quote_media_refs to avoid injecting unrelated history media.
+            # Otherwise, backfill observed media from recent transcript history.
+            if ctx.reply_to_message_id is not None:
+                # Fallback: if desc didn't contain ybres refs, look up transcript
+                if not ctx.quote_media_refs:
+                    try:
+                        store = getattr(adapter, "_session_store", None)
+                        if store:
+                            session_entry = store.get_or_create_session(ctx.source)
+                            history = store.load_transcript(session_entry.session_id)
+                            for msg in reversed(history or []):
+                                mid = msg.get("message_id", "")
+                                if mid and mid == ctx.reply_to_message_id:
+                                    _content = msg.get("content", "")
+                                    if isinstance(_content, str) and "|ybres:" in _content:
+                                        for m in _YB_RES_REF_RE.finditer(_content):
+                                            head = m.group(1)
+                                            rid = m.group(2)
+                                            kind, _, filename = head.partition(":")
+                                            kind = kind.strip()
+                                            if kind in _RESOLVABLE_MEDIA_KINDS:
+                                                ctx.quote_media_refs.append((rid, kind, filename.strip()))
+                                    break
+                    except Exception as exc:
+                        logger.warning(
+                            "[%s] quote transcript lookup failed: %s",
+                            adapter.name, exc,
+                        )
+                # User quoted a message — resolve only media from the quote
+                for rid, kind, filename in ctx.quote_media_refs:
+                    if kind not in _RESOLVABLE_MEDIA_KINDS:
                        continue
-                    media_urls.append(u)
-                    media_types.append(m)
-                    current.add(u)
+                    try:
+                        fresh_url = await MediaResolveMiddleware._resolve_by_resource_id(adapter, rid)
+                    except Exception as exc:
+                        logger.warning(
+                            "[%s] quote media resolve failed: rid=%s kind=%s err=%s",
+                            adapter.name, rid, kind, exc,
+                        )
+                        continue
+                    cached = await MediaResolveMiddleware._download_and_cache(
+                        adapter,
+                        fetch_url=fresh_url,
+                        kind=kind,
+                        file_name=filename or None,
+                        log_tag=f"quote rid={rid}",
+                    )
+                    if cached is None:
+                        continue
+                    path, mime = cached
+                    # Avoid duplicates
+                    if path not in media_urls:
+                        media_urls.append(path)
+                        media_types.append(mime)
+            else:
+                # No quote — backfill observed media from recent transcript history
+                extra_img_urls: List[str] = []
+                extra_img_mimes: List[str] = []
+                try:
+                    extra_img_urls, extra_img_mimes = await MediaResolveMiddleware._collect_observed_media(
+                        adapter, ctx.source,
+                    )
+                except Exception as exc:
+                    logger.warning(
+                        "[%s] observed-image hydration raised, continuing anyway: %s",
+                        adapter.name, exc,
+                    )
+                if extra_img_urls:
+                    current = set(media_urls)
+                    for u, m in zip(extra_img_urls, extra_img_mimes):
+                        if u in current:
+                            continue
+                        media_urls.append(u)
+                        media_types.append(m)
+                        current.add(u)

            # Replace [kind|ybres:xxx] anchors with local cache paths so
            # the transcript records usable paths for the model.
@@ -2506,7 +2618,11 @@ class DispatchMiddleware(InboundMiddleware):

            event = MessageEvent(
                text=_patched_event_text,
-                message_type=ctx.msg_type,
+                message_type=(
+                    MessageType.DOCUMENT
+                    if any(mt.startswith(("application/", "text/")) for mt in media_types)
+                    else ctx.msg_type
+                ),
                source=ctx.source,
                message_id=ctx.msg_id or None,
                raw_message=ctx.push,
@@ -50,6 +50,7 @@ from typing import Dict, Optional, Any, List, Union
 # gateway is a long-running daemon, so its boot cost matters less than
 # preserving the established test-patch surface.
 from agent.account_usage import fetch_account_usage, render_account_usage_lines
+from agent.async_utils import safe_schedule_threadsafe
 from agent.i18n import t
 from hermes_cli.config import cfg_get

@@ -1989,21 +1990,21 @@ class GatewayRunner:
            await self.stop()
        elif not self.adapters and self._failed_platforms:
            # All platforms are down and queued for background reconnection.
-            # If the error is retryable, exit with failure so systemd Restart=on-failure
-            # can restart the process. Otherwise stay alive and keep retrying in background.
-            if adapter.fatal_error_retryable:
-                self._exit_reason = adapter.fatal_error_message or "All messaging platforms failed with retryable errors"
-                self._exit_with_failure = True
-                logger.error(
-                    "All messaging platforms failed with retryable errors. "
-                    "Shutting down gateway for service restart (systemd will retry)."
-                )
-                await self.stop()
-            else:
-                logger.warning(
-                    "No connected messaging platforms remain, but %d platform(s) queued for reconnection",
-                    len(self._failed_platforms),
-                )
+            # Keep the gateway alive so:
+            #   • cron jobs still run
+            #   • the reconnect watcher can recover platforms when the
+            #     underlying problem clears (proxy comes back, user runs
+            #     `hermes whatsapp`, etc.)
+            # We used to exit-with-failure here to trigger systemd restart,
+            # but that converted a transient outage into a restart loop and
+            # killed in-process state every time. The reconnect watcher
+            # already handles long-running recovery — let it do its job.
+            logger.warning(
+                "No connected messaging platforms remain, but %d platform(s) "
+                "queued for reconnection — gateway staying alive, watcher will "
+                "retry in background.",
+                len(self._failed_platforms),
+            )

    def _request_clean_exit(self, reason: str) -> None:
        self._exit_cleanly = True
@@ -2179,6 +2180,73 @@ class GatewayRunner:
        except Exception:
            pass

+    # ------------------------------------------------------------------
+    # Per-platform circuit breaker (pause/resume) — used by the reconnect
+    # watcher when a retryable failure recurs past a threshold, and by the
+    # /platform pause|resume slash command for manual control.
+    # ------------------------------------------------------------------
+    def _pause_failed_platform(self, platform, *, reason: str = "") -> None:
+        """Flag a queued platform as paused while leaving it in
+        ``_failed_platforms``, so the reconnect watcher stops retrying it.
+
+        Called by the circuit breaker once ``_PAUSE_AFTER_FAILURES``
+        consecutive retryable failures have accumulated, and by
+        ``/platform pause <name>`` for manual control.  A paused platform
+        shows up in ``/platform list`` and is brought back with
+        ``/platform resume <name>``.
+
+        Does nothing when the platform is not queued or is already paused.
+        ``reason`` is stored as the pause reason; an empty value falls back
+        to a generic auto-pause message.
+        """
+        entry = getattr(self, "_failed_platforms", {}).get(platform)
+        # Not queued, or already paused: nothing to change.
+        if entry is None or entry.get("paused"):
+            return
+        pause_reason = reason or "auto-paused after repeated failures"
+        entry["paused"] = True
+        entry["pause_reason"] = pause_reason
+        # Belt and braces: an infinite next_retry keeps the watcher's
+        # "not time yet" check from firing even if some stale code path
+        # forgets to look at the "paused" flag.
+        entry["next_retry"] = float("inf")
+        try:
+            # Best-effort status write; a failure here must not undo the pause.
+            self._update_platform_runtime_status(
+                platform.value,
+                platform_state="paused",
+                error_code=None,
+                error_message=pause_reason,
+            )
+        except Exception:
+            pass
+        name = platform.value
+        logger.warning(
+            "%s paused after %d consecutive failures (%s) — "
+            "fix the underlying issue then run `/platform resume %s` "
+            "to retry, or `hermes gateway restart` to restart the gateway.",
+            name, entry.get("attempts", 0),
+            pause_reason, name,
+        )
+
+    def _resume_paused_platform(self, platform) -> bool:
+        """Lift the pause on a platform, zero its attempt counter and make it
+        due for retry straight away.
+
+        Returns True when the platform was paused and is now queued again;
+        False when it is not in ``_failed_platforms`` or was not paused.
+        """
+        entry = getattr(self, "_failed_platforms", {}).get(platform)
+        if entry is None or not entry.get("paused"):
+            return False
+        entry.pop("pause_reason", None)
+        # next_retry == "now" means the watcher picks it up on its next tick.
+        entry.update(paused=False, attempts=0, next_retry=time.monotonic())
+        try:
+            # Best-effort status write; the resume stands even if it fails.
+            self._update_platform_runtime_status(
+                platform.value,
+                platform_state="retrying",
+                error_code=None,
+                error_message=None,
+            )
+        except Exception:
+            pass
+        logger.info("%s resumed — retrying on next watcher tick", platform.value)
+        return True
+
    @staticmethod
    def _load_prefill_messages() -> List[Dict[str, Any]]:
        """Load ephemeral prefill messages from config or env var.
@@ -3612,16 +3680,32 @@ class GatewayRunner:
                return True
            if enabled_platform_count > 0:
                if startup_retryable_errors:
-                    # At least one platform attempted a connection and failed —
-                    # this is a real startup error that should block the gateway.
+                    # All enabled platforms hit retryable failures (network
+                    # blip, bridge not paired, npm install timeout, etc.).
+                    # Keep the gateway alive so:
+                    #   • cron jobs still run
+                    #   • the reconnect watcher gets a chance to recover the
+                    #     failing platforms once the underlying problem is
+                    #     fixed (e.g. user runs `hermes whatsapp`, fixes
+                    #     proxy, etc.)
+                    # Exiting here used to convert a single misconfigured
+                    # platform into an infinite systemd restart loop.
                    reason = "; ".join(startup_retryable_errors)
-                    logger.error("Gateway failed to connect any configured messaging platform: %s", reason)
+                    logger.warning(
+                        "Gateway started with no connected platforms — "
+                        "%d platform(s) queued for retry: %s",
+                        len(self._failed_platforms), reason,
+                    )
                    try:
                        from gateway.status import write_runtime_status
-                        write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
+                        write_runtime_status(
+                            gateway_state="degraded",
+                            exit_reason=None,
+                        )
                    except Exception:
                        pass
-                    return False
+                    # Fall through to the normal "running" state — reconnect
+                    # watcher takes it from here.
                # All enabled platforms had no adapter (missing library or credentials).
                # In fleet deployments the same config.yaml is shared across nodes that
                # may only have credentials for a subset of platforms.  Rather than
@@ -4736,11 +4820,15 @@ class GatewayRunner:
        """Background task that periodically retries connecting failed platforms.

        Uses exponential backoff: 30s → 60s → 120s → 240s → 300s (cap).
-        Stops retrying a platform after 20 failed attempts or if the error
-        is non-retryable (e.g. bad auth token).
+        Retryable failures keep retrying at the backoff cap indefinitely
+        — but if a platform fails ``_PAUSE_AFTER_FAILURES`` times in a row
+        without ever succeeding, it is *paused*: kept in the retry queue
+        but no longer hammered.  The user surfaces it with ``/platform list``
+        and resumes it with ``/platform resume <name>``.  Non-retryable
+        failures (bad auth, etc.) still drop out of the queue immediately.
        """
-        _MAX_ATTEMPTS = 20
        _BACKOFF_CAP = 300  # 5 minutes max between retries
+        _PAUSE_AFTER_FAILURES = 10  # circuit-breaker threshold

        await asyncio.sleep(10)  # initial delay — let startup finish
        while self._running:
@@ -4757,22 +4845,18 @@ class GatewayRunner:
                if not self._running:
                    return
                info = self._failed_platforms[platform]
+                # Skip paused platforms entirely — they need explicit
+                # /platform resume to come back.
+                if info.get("paused"):
+                    continue
                if now < info["next_retry"]:
                    continue  # not time yet

-                if info["attempts"] >= _MAX_ATTEMPTS:
-                    logger.warning(
-                        "Giving up reconnecting %s after %d attempts",
-                        platform.value, info["attempts"],
-                    )
-                    del self._failed_platforms[platform]
-                    continue
-
                platform_config = info["config"]
                attempt = info["attempts"] + 1
                logger.info(
-                    "Reconnecting %s (attempt %d/%d)...",
-                    platform.value, attempt, _MAX_ATTEMPTS,
+                    "Reconnecting %s (attempt %d)...",
+                    platform.value, attempt,
                )

                try:
@@ -4837,6 +4921,14 @@ class GatewayRunner:
                            "Reconnect %s failed, next retry in %ds",
                            platform.value, backoff,
                        )
+                        if attempt >= _PAUSE_AFTER_FAILURES:
+                            self._pause_failed_platform(
+                                platform,
+                                reason=(
+                                    adapter.fatal_error_message
+                                    or "failed to reconnect"
+                                ),
+                            )
                except Exception as e:
                    self._update_platform_runtime_status(
                        platform.value,
@@ -4851,6 +4943,8 @@ class GatewayRunner:
                        "Reconnect %s error: %s, next retry in %ds",
                        platform.value, e, backoff,
                    )
+                    if attempt >= _PAUSE_AFTER_FAILURES:
+                        self._pause_failed_platform(platform, reason=str(e))

            # Check every 10 seconds for platforms that need reconnection
            for _ in range(10):
@@ -6450,6 +6544,9 @@ class GatewayRunner:
        if canonical == "agents":
            return await self._handle_agents_command(event)

+        if canonical == "platform":
+            return await self._handle_platform_command(event)
+
        if canonical == "restart":
            return await self._handle_restart_command(event)
        
@@ -6809,6 +6906,12 @@ class GatewayRunner:
        if _is_shared_multi_user and source.user_name:
            message_text = f"[{source.user_name}] {message_text}"

+        # Prepend channel context from history backfill (if any).  This
+        # happens after sender-prefix so the prefix only applies to the
+        # trigger message, not the backfill block.
+        if getattr(event, "channel_context", None):
+            message_text = f"{event.channel_context}\n\n[New message]\n{message_text}"
+
        if event.media_urls:
            image_paths = []
            audio_paths = []
@@ -7985,6 +8088,8 @@ class GatewayRunner:
                try:
                    if _err_body is not None:
                        _err_json = _err_body.json().get("error", {})
+                        if not isinstance(_err_json, dict):
+                            _err_json = {}
                except Exception:
                    pass
                if _err_json.get("type") == "usage_limit_reached":
@@ -8689,6 +8794,99 @@ class GatewayRunner:
        else:
            return t("gateway.stop.no_active")

+    async def _handle_platform_command(self, event: MessageEvent) -> str:
+        """Handle ``/platform list|pause|resume [name]`` — surface and
+        manually control failed/paused gateway adapters.
+
+        Examples:
+            ``/platform list``           — show connected + failed/paused platforms
+            ``/platform pause whatsapp`` — stop the reconnect watcher hammering whatsapp
+            ``/platform resume whatsapp`` — re-queue a paused platform for retry
+
+        With no sub-command the action defaults to ``list``; an unrecognised
+        sub-command returns the usage text.  Platform names are matched
+        case-insensitively against ``Platform`` values.  The return value is
+        the reply text.
+        """
+        # ``MessageEvent`` is constructed with ``text=`` elsewhere in this
+        # file, so fall back to that attribute.  Reading only ``content``
+        # yields an empty string on such events, which silently turns every
+        # pause/resume request into ``list``.
+        text = (
+            getattr(event, "content", None)
+            or getattr(event, "text", None)
+            or ""
+        ).strip()
+        # Split on all whitespace (no maxsplit) so trailing words are not
+        # glued onto the platform name ("whatsapp now" -> "whatsapp").
+        parts = text.split()
+        # Strip the leading "/platform" (or "/PLATFORM") token if present
+        if parts and parts[0].lower().lstrip("/").startswith("platform"):
+            parts = parts[1:]
+        action = (parts[0] if parts else "list").lower()
+        target = parts[1].lower() if len(parts) > 1 else ""
+        failed = getattr(self, "_failed_platforms", {}) or {}
+
+        if action == "list":
+            lines = ["**Gateway platforms**"]
+            connected = sorted(p.value for p in self.adapters.keys())
+            lines.append(
+                "Connected: " + (", ".join(connected) if connected else "(none)")
+            )
+            if not failed:
+                lines.append("Failed/paused: (none)")
+            for p, info in failed.items():
+                if info.get("paused"):
+                    reason = info.get("pause_reason") or "paused"
+                    lines.append(
+                        f"  · {p.value} — PAUSED ({reason}). "
+                        f"Resume with `/platform resume {p.value}`."
+                    )
+                else:
+                    attempts = info.get("attempts", 0)
+                    lines.append(
+                        f"  · {p.value} — retrying (attempt {attempts})"
+                    )
+            return "\n".join(lines)
+
+        if action not in ("pause", "resume"):
+            return (
+                "Usage: /platform <list|pause|resume> [name]\n"
+                "  /platform list — show platform status\n"
+                "  /platform pause <name> — stop retrying a failing platform\n"
+                "  /platform resume <name> — re-queue a paused platform"
+            )
+
+        if not target:
+            return f"Usage: /platform {action} <name>"
+        # Resolve platform name (case-insensitive, value match)
+        platform = next(
+            (
+                p for p in Platform.__members__.values()
+                if p.value.lower() == target
+            ),
+            None,
+        )
+        if platform is None:
+            return f"Unknown platform: {target}"
+
+        if action == "pause":
+            if platform not in failed:
+                return (
+                    f"{platform.value} is not in the retry queue "
+                    f"(it's either connected or not enabled)."
+                )
+            if failed[platform].get("paused"):
+                return f"{platform.value} is already paused."
+            self._pause_failed_platform(platform, reason="paused via /platform pause")
+            return (
+                f"✓ {platform.value} paused. "
+                f"Resume with `/platform resume {platform.value}` or "
+                f"`hermes gateway restart` to reset."
+            )
+
+        # action == "resume"
+        if platform not in failed:
+            return (
+                f"{platform.value} is not in the retry queue — "
+                f"nothing to resume."
+            )
+        if not failed[platform].get("paused"):
+            return (
+                f"{platform.value} is already retrying — "
+                f"no resume needed."
+            )
+        self._resume_paused_platform(platform)
+        return f"✓ {platform.value} resumed — retrying on next watcher tick."
+
    async def _handle_restart_command(self, event: MessageEvent) -> Union[str, EphemeralReply]:
        """Handle /restart command - drain active work, then restart the gateway."""
        # Defensive idempotency check: if the previous gateway process
@@ -11209,10 +11407,14 @@ class GatewayRunner:
            copied_source = dataclasses.replace(source)
        except Exception:
            copied_source = source
-        future = asyncio.run_coroutine_threadsafe(
+        future = safe_schedule_threadsafe(
            self._rename_telegram_topic_for_session_title(copied_source, session_id, title),
            loop,
+            logger=logger,
+            log_message="Telegram topic title rename failed to schedule",
        )
+        if future is None:
+            return
        def _log_rename_failure(fut) -> None:
            try:
                fut.result()
@@ -14802,29 +15004,28 @@ class GatewayRunner:
        def _step_callback_sync(iteration: int, prev_tools: list) -> None:
            if not _run_still_current():
                return
-            try:
-                # prev_tools may be list[str] or list[dict] with "name"/"result"
-                # keys.  Normalise to keep "tool_names" backward-compatible for
-                # user-authored hooks that do ', '.join(tool_names)'.
-                _names: list[str] = []
-                for _t in (prev_tools or []):
-                    if isinstance(_t, dict):
-                        _names.append(_t.get("name") or "")
-                    else:
-                        _names.append(str(_t))
-                asyncio.run_coroutine_threadsafe(
-                    _hooks_ref.emit("agent:step", {
-                        "platform": source.platform.value if source.platform else "",
-                        "user_id": source.user_id,
-                        "session_id": session_id,
-                        "iteration": iteration,
-                        "tool_names": _names,
-                        "tools": prev_tools,
-                    }),
-                    _loop_for_step,
-                )
-            except Exception as _e:
-                logger.debug("agent:step hook error: %s", _e)
+            # prev_tools may be list[str] or list[dict] with "name"/"result"
+            # keys.  Normalise to keep "tool_names" backward-compatible for
+            # user-authored hooks that do ', '.join(tool_names)'.
+            _names: list[str] = []
+            for _t in (prev_tools or []):
+                if isinstance(_t, dict):
+                    _names.append(_t.get("name") or "")
+                else:
+                    _names.append(str(_t))
+            safe_schedule_threadsafe(
+                _hooks_ref.emit("agent:step", {
+                    "platform": source.platform.value if source.platform else "",
+                    "user_id": source.user_id,
+                    "session_id": session_id,
+                    "iteration": iteration,
+                    "tool_names": _names,
+                    "tools": prev_tools,
+                }),
+                _loop_for_step,
+                logger=logger,
+                log_message="agent:step hook scheduling error",
+            )

        # Bridge sync status_callback → async adapter.send for context pressure
        _status_adapter = self.adapters.get(source.platform)
@@ -14844,27 +15045,28 @@ class GatewayRunner:
        def _status_callback_sync(event_type: str, message: str) -> None:
            if not _status_adapter or not _run_still_current():
                return
-            try:
-                _fut = asyncio.run_coroutine_threadsafe(
-                    _status_adapter.send(
-                        _status_chat_id,
-                        message,
-                        metadata=_status_thread_metadata,
-                    ),
-                    _loop_for_step,
-                )
-                if _cleanup_progress:
-                    def _track_status_id(fut) -> None:
-                        try:
-                            res = fut.result()
-                        except Exception:
-                            return
-                        mid = getattr(res, "message_id", None)
-                        if getattr(res, "success", False) and mid:
-                            _cleanup_msg_ids.append(str(mid))
-                    _fut.add_done_callback(_track_status_id)
-            except Exception as _e:
-                logger.debug("status_callback error (%s): %s", event_type, _e)
+            _fut = safe_schedule_threadsafe(
+                _status_adapter.send(
+                    _status_chat_id,
+                    message,
+                    metadata=_status_thread_metadata,
+                ),
+                _loop_for_step,
+                logger=logger,
+                log_message=f"status_callback ({event_type}) scheduling error",
+            )
+            if _fut is None:
+                return
+            if _cleanup_progress:
+                def _track_status_id(fut) -> None:
+                    try:
+                        res = fut.result()
+                    except Exception:
+                        return
+                    mid = getattr(res, "message_id", None)
+                    if getattr(res, "success", False) and mid:
+                        _cleanup_msg_ids.append(str(mid))
+                _fut.add_done_callback(_track_status_id)

        def run_sync():
            # The conditional re-assignment of `message` further below
@@ -15018,17 +15220,16 @@ class GatewayRunner:
                    return
                if already_streamed or not _status_adapter or not str(text or "").strip():
                    return
-                try:
-                    asyncio.run_coroutine_threadsafe(
-                        _status_adapter.send(
-                            _status_chat_id,
-                            text,
-                            metadata=_status_thread_metadata,
-                        ),
-                        _loop_for_step,
-                    )
-                except Exception as _e:
-                    logger.debug("interim_assistant_callback error: %s", _e)
+                safe_schedule_threadsafe(
+                    _status_adapter.send(
+                        _status_chat_id,
+                        text,
+                        metadata=_status_thread_metadata,
+                    ),
+                    _loop_for_step,
+                    logger=logger,
+                    log_message="interim_assistant_callback scheduling error",
+                )

            turn_route = self._resolve_turn_agent_config(message, model, runtime_kwargs)

@@ -15117,17 +15318,16 @@ class GatewayRunner:
            def _deliver_bg_review_message(message: str) -> None:
                if not _status_adapter or not _run_still_current():
                    return
-                try:
-                    asyncio.run_coroutine_threadsafe(
-                        _status_adapter.send(
-                            _status_chat_id,
-                            message,
-                            metadata=_status_thread_metadata,
-                        ),
-                        _loop_for_step,
-                    )
-                except Exception as _e:
-                    logger.debug("background_review_callback error: %s", _e)
+                safe_schedule_threadsafe(
+                    _status_adapter.send(
+                        _status_chat_id,
+                        message,
+                        metadata=_status_thread_metadata,
+                    ),
+                    _loop_for_step,
+                    logger=logger,
+                    log_message="background_review_callback scheduling error",
+                )

            def _release_bg_review_messages() -> None:
                _bg_review_release.set()
@@ -15199,23 +15399,28 @@ class GatewayRunner:
                    pass

                send_ok = False
-                try:
-                    fut = asyncio.run_coroutine_threadsafe(
-                        _status_adapter.send_clarify(
-                            chat_id=_status_chat_id,
-                            question=question,
-                            choices=list(choices) if choices else None,
-                            clarify_id=clarify_id,
-                            session_key=session_key or "",
-                            metadata=_status_thread_metadata,
-                        ),
-                        _loop_for_step,
-                    )
-                    result = fut.result(timeout=15)
-                    send_ok = bool(getattr(result, "success", False))
-                except Exception as exc:
-                    logger.warning("Clarify send failed: %s", exc)
+                fut = safe_schedule_threadsafe(
+                    _status_adapter.send_clarify(
+                        chat_id=_status_chat_id,
+                        question=question,
+                        choices=list(choices) if choices else None,
+                        clarify_id=clarify_id,
+                        session_key=session_key or "",
+                        metadata=_status_thread_metadata,
+                    ),
+                    _loop_for_step,
+                    logger=logger,
+                    log_message="Clarify send failed to schedule",
+                )
+                if fut is None:
                    send_ok = False
+                else:
+                    try:
+                        result = fut.result(timeout=15)
+                        send_ok = bool(getattr(result, "success", False))
+                    except Exception as exc:
+                        logger.warning("Clarify send failed: %s", exc)
+                        send_ok = False

                if not send_ok:
                    # Couldn't deliver the prompt — clean up and return
@@ -15335,7 +15540,7 @@ class GatewayRunner:
                # false positives from MagicMock auto-attribute creation in tests.
                if getattr(type(_status_adapter), "send_exec_approval", None) is not None:
                    try:
-                        _approval_result = asyncio.run_coroutine_threadsafe(
+                        _approval_fut = safe_schedule_threadsafe(
                            _status_adapter.send_exec_approval(
                                chat_id=_status_chat_id,
                                command=cmd,
@@ -15344,7 +15549,12 @@ class GatewayRunner:
                                metadata=_status_thread_metadata,
                            ),
                            _loop_for_step,
-                        ).result(timeout=15)
+                            logger=logger,
+                            log_message="send_exec_approval scheduling error",
+                        )
+                        if _approval_fut is None:
+                            raise RuntimeError("send_exec_approval: loop unavailable")
+                        _approval_result = _approval_fut.result(timeout=15)
                        if _approval_result.success:
                            return
                        logger.warning(
@@ -15366,14 +15576,18 @@ class GatewayRunner:
                    f"for the session, `/approve always` to approve permanently, or `/deny` to cancel."
                )
                try:
-                    asyncio.run_coroutine_threadsafe(
+                    _approval_send_fut = safe_schedule_threadsafe(
                        _status_adapter.send(
                            _status_chat_id,
                            msg,
                            metadata=_status_thread_metadata,
                        ),
                        _loop_for_step,
-                    ).result(timeout=15)
+                        logger=logger,
+                        log_message="Approval text-send scheduling error",
+                    )
+                    if _approval_send_fut is not None:
+                        _approval_send_fut.result(timeout=15)
                except Exception as _e:
                    logger.error("Failed to send approval request: %s", _e)

@@ -16335,7 +16549,11 @@ class GatewayRunner:
                        except Exception:
                            pass
                try:
-                    asyncio.run_coroutine_threadsafe(_delete_all(), _loop_snapshot)
+                    safe_schedule_threadsafe(
+                        _delete_all(), _loop_snapshot,
+                        logger=logger,
+                        log_message="Temp bubble cleanup scheduling error",
+                    )
                except Exception:
                    pass

@@ -16392,10 +16610,13 @@ def _start_cron_ticker(stop_event: threading.Event, adapters=None, loop=None, in
                    # this ticker runs in a background thread. Schedule onto
                    # the gateway event loop and wait briefly for completion
                    # so refresh failures are still logged via the except.
-                    fut = asyncio.run_coroutine_threadsafe(
-                        build_channel_directory(adapters), loop
+                    fut = safe_schedule_threadsafe(
+                        build_channel_directory(adapters), loop,
+                        logger=logger,
+                        log_message="Channel directory refresh scheduling error",
                    )
-                    fut.result(timeout=30)
+                    if fut is not None:
+                        fut.result(timeout=30)
            except Exception as e:
                logger.debug("Channel directory refresh error: %s", e)

@@ -518,6 +518,9 @@ class SessionEntry:
                else None
            ),
            "is_fresh_reset": self.is_fresh_reset,
+            "was_auto_reset": self.was_auto_reset,
+            "auto_reset_reason": self.auto_reset_reason,
+            "reset_had_activity": self.reset_had_activity,
        }
        if self.origin:
            result["origin"] = self.origin.to_dict()
@@ -567,6 +570,9 @@ class SessionEntry:
            resume_reason=data.get("resume_reason"),
            last_resume_marked_at=last_resume_marked_at,
            is_fresh_reset=data.get("is_fresh_reset", False),
+            was_auto_reset=data.get("was_auto_reset", False),
+            auto_reset_reason=data.get("auto_reset_reason"),
+            reset_had_activity=data.get("reset_had_activity", False),
        )


@@ -14,8 +14,8 @@ Provides subcommands for:
 import os
 import sys

-__version__ = "0.13.0"
-__release_date__ = "2026.5.7"
+__version__ = "0.14.0"
+__release_date__ = "2026.5.16"


 def _ensure_utf8():
@@ -72,6 +72,7 @@ DEFAULT_AGENT_KEY_MIN_TTL_SECONDS = 30 * 60  # 30 minutes
 ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120       # refresh 2 min before expiry
 DEVICE_AUTH_POLL_INTERVAL_CAP_SECONDS = 1     # poll at most every 1s
 DEFAULT_CODEX_BASE_URL = "https://chatgpt.com/backend-api/codex"
+DEFAULT_XAI_OAUTH_BASE_URL = "https://api.x.ai/v1"
 MINIMAX_OAUTH_CLIENT_ID = "78257093-7e40-4613-99e0-527b14b39113"
 MINIMAX_OAUTH_SCOPE = "group_id profile model.completion"
 MINIMAX_OAUTH_GRANT_TYPE = "urn:ietf:params:oauth:grant-type:user_code"
@@ -89,6 +90,14 @@ STEPFUN_STEP_PLAN_CN_BASE_URL = "https://api.stepfun.com/step_plan/v1"
 CODEX_OAUTH_CLIENT_ID = "app_EMoamEEZ73f0CkXaXp7hrann"
 CODEX_OAUTH_TOKEN_URL = "https://auth.openai.com/oauth/token"
 CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120
+XAI_OAUTH_ISSUER = "https://auth.x.ai"
+XAI_OAUTH_DISCOVERY_URL = f"{XAI_OAUTH_ISSUER}/.well-known/openid-configuration"
+XAI_OAUTH_CLIENT_ID = "b1a00492-073a-47ea-816f-4c329264a828"
+XAI_OAUTH_SCOPE = "openid profile email offline_access grok-cli:access api:access"
+XAI_OAUTH_REDIRECT_HOST = "127.0.0.1"
+XAI_OAUTH_REDIRECT_PORT = 56121
+XAI_OAUTH_REDIRECT_PATH = "/callback"
+XAI_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120
 QWEN_OAUTH_CLIENT_ID = "f0304373b74a44d2b584a3fb70ca9e56"
 QWEN_OAUTH_TOKEN_URL = "https://chat.qwen.ai/api/v1/oauth2/token"
 QWEN_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120
@@ -98,6 +107,9 @@ DEFAULT_SPOTIFY_REDIRECT_URI = "http://127.0.0.1:43827/spotify/callback"
 SPOTIFY_DOCS_URL = "https://hermes-agent.nousresearch.com/docs/user-guide/features/spotify"
 SPOTIFY_DASHBOARD_URL = "https://developer.spotify.com/dashboard"
 SPOTIFY_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120
+
+XAI_OAUTH_DOCS_URL = "https://hermes-agent.nousresearch.com/docs/guides/xai-grok-oauth"
+OAUTH_OVER_SSH_DOCS_URL = "https://hermes-agent.nousresearch.com/docs/guides/oauth-over-ssh"
 DEFAULT_SPOTIFY_SCOPE = " ".join((
    "user-modify-playback-state",
    "user-read-playback-state",
@@ -162,6 +174,12 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
        auth_type="oauth_external",
        inference_base_url=DEFAULT_CODEX_BASE_URL,
    ),
+    "xai-oauth": ProviderConfig(
+        id="xai-oauth",
+        name="xAI Grok OAuth (SuperGrok Subscription)",
+        auth_type="oauth_external",
+        inference_base_url=DEFAULT_XAI_OAUTH_BASE_URL,
+    ),
    "qwen-oauth": ProviderConfig(
        id="qwen-oauth",
        name="Qwen OAuth",
@@ -1364,6 +1382,8 @@ def resolve_provider(
        "glm": "zai", "z-ai": "zai", "z.ai": "zai", "zhipu": "zai",
        "google": "gemini", "google-gemini": "gemini", "google-ai-studio": "gemini",
        "x-ai": "xai", "x.ai": "xai", "grok": "xai",
+        "xai-oauth": "xai-oauth", "x-ai-oauth": "xai-oauth",
+        "grok-oauth": "xai-oauth", "xai-grok-oauth": "xai-oauth",
        "kimi": "kimi-coding", "kimi-for-coding": "kimi-coding", "moonshot": "kimi-coding",
        "kimi-cn": "kimi-coding-cn", "moonshot-cn": "kimi-coding-cn",
        "step": "stepfun", "stepfun-coding-plan": "stepfun",
@@ -1907,6 +1927,16 @@ def _spotify_code_challenge(code_verifier: str) -> str:
    return base64.urlsafe_b64encode(digest).decode("ascii").rstrip("=")


+def _oauth_pkce_code_verifier(length: int = 64) -> str:
+    """Return a random PKCE ``code_verifier`` (RFC 7636 section 4.1).
+
+    ``length`` is the number of random bytes drawn from ``os.urandom`` before
+    URL-safe base64 encoding; it is not the length of the returned string.
+    RFC 7636 requires a verifier of 43-128 characters, so at least 32 bytes
+    are always drawn (32 bytes encode to exactly 43 unpadded characters) and
+    the encoded text is capped at 128 characters.
+    """
+    # Clamp rather than raise: a too-small ``length`` must never produce a
+    # verifier that the authorization server is required to reject.
+    entropy = os.urandom(max(32, length))
+    encoded = base64.urlsafe_b64encode(entropy).decode("ascii")
+    return encoded.rstrip("=")[:128]
+
+
+def _oauth_pkce_code_challenge(code_verifier: str) -> str:
+    """Derive the S256 PKCE challenge for ``code_verifier``.
+
+    The challenge is the SHA-256 digest of the UTF-8 encoded verifier,
+    URL-safe base64 encoded with the trailing ``=`` padding removed.
+    """
+    sha = hashlib.sha256()
+    sha.update(code_verifier.encode("utf-8"))
+    padded = base64.urlsafe_b64encode(sha.digest())
+    return padded.rstrip(b"=").decode("ascii")
+
+
 def _spotify_build_authorize_url(
    *,
    client_id: str,
@@ -2029,6 +2059,158 @@ def _spotify_wait_for_callback(
    )


+def _xai_validate_loopback_redirect_uri(redirect_uri: str) -> tuple[str, int, str]:
+    """Validate an xAI loopback redirect URI and split it into its parts.
+
+    The URI must use the ``http`` scheme, point at ``XAI_OAUTH_REDIRECT_HOST``
+    and carry an explicit, non-zero port.
+
+    Returns:
+        ``(host, port, path)``; ``path`` defaults to ``"/"`` when empty.
+
+    Raises:
+        AuthError: with code ``xai_redirect_invalid`` when any check fails,
+            including URIs that cannot be parsed or whose port is malformed.
+    """
+    def _invalid(message: str) -> AuthError:
+        return AuthError(message, provider="xai-oauth", code="xai_redirect_invalid")
+
+    try:
+        parsed = urlparse(redirect_uri)
+    except ValueError as exc:
+        # urlparse rejects some malformed netlocs (e.g. unbalanced IPv6
+        # brackets); report that as an auth error, not a raw ValueError.
+        raise _invalid("xAI OAuth redirect_uri is not a valid URL.") from exc
+    if parsed.scheme != "http":
+        raise _invalid("xAI OAuth redirect_uri must use http://127.0.0.1.")
+    host = parsed.hostname or ""
+    if host != XAI_OAUTH_REDIRECT_HOST:
+        raise _invalid("xAI OAuth redirect_uri must point to 127.0.0.1.")
+    try:
+        # ``.port`` is parsed lazily and raises ValueError for a non-numeric
+        # or out-of-range port; treat that the same as a missing port.
+        port = parsed.port
+    except ValueError:
+        port = None
+    if not port:
+        raise _invalid("xAI OAuth redirect_uri must include an explicit localhost port.")
+    return host, port, parsed.path or "/"
+
+
+def _xai_callback_cors_origin(origin: Optional[str]) -> str:
+    """Return ``origin`` if it is an allowlisted xAI auth origin, else ``""``.
+
+    Only xAI's own auth origins are accepted for CORS on the loopback
+    callback; the redirect_uri itself is bound to 127.0.0.1 and gated by
+    PKCE+state, so additional dev/3p origins are not needed here.
+    """
+    xai_auth_origins = ("https://accounts.x.ai", "https://auth.x.ai")
+    if origin in xai_auth_origins:
+        return origin
+    return ""
+
+
+def _make_xai_callback_handler(expected_path: str) -> tuple[type[BaseHTTPRequestHandler], dict[str, Any]]:
+    """Build the HTTP request handler for the xAI loopback callback.
+
+    Returns ``(handler_class, result)``. ``result`` is a dict shared with the
+    handler: each GET on ``expected_path`` stores the ``code``, ``state``,
+    ``error`` and ``error_description`` query parameters in it (``None`` for
+    any parameter that is absent). Requests to other paths get a 404.
+    """
+    captured_fields = ("code", "state", "error", "error_description")
+    result: dict[str, Any] = dict.fromkeys(captured_fields)
+
+    class _XAICallbackHandler(BaseHTTPRequestHandler):
+        def _maybe_write_cors_headers(self) -> None:
+            # CORS headers are only emitted for allowlisted Origin values.
+            allow_origin = _xai_callback_cors_origin(self.headers.get("Origin"))
+            if not allow_origin:
+                return
+            cors_headers = (
+                ("Access-Control-Allow-Origin", allow_origin),
+                ("Access-Control-Allow-Methods", "GET, OPTIONS"),
+                ("Access-Control-Allow-Headers", "Content-Type"),
+                ("Access-Control-Allow-Private-Network", "true"),
+                ("Vary", "Origin"),
+            )
+            for header_name, header_value in cors_headers:
+                self.send_header(header_name, header_value)
+
+        def do_OPTIONS(self) -> None:  # noqa: N802
+            # Preflight request: empty 204 plus CORS headers when allowed.
+            self.send_response(204)
+            self._maybe_write_cors_headers()
+            self.end_headers()
+
+        def do_GET(self) -> None:  # noqa: N802
+            request_url = urlparse(self.path)
+            if request_url.path != expected_path:
+                self.send_response(404)
+                self.end_headers()
+                self.wfile.write(b"Not found.")
+                return
+
+            # Record the callback parameters before answering the browser.
+            query = parse_qs(request_url.query)
+            for field_name in captured_fields:
+                result[field_name] = query.get(field_name, [None])[0]
+
+            page = (
+                "<html><body><h1>xAI authorization failed.</h1>You can close this tab.</body></html>"
+                if result["error"]
+                else "<html><body><h1>xAI authorization received.</h1>You can close this tab.</body></html>"
+            )
+            self.send_response(200)
+            self._maybe_write_cors_headers()
+            self.send_header("Content-Type", "text/html; charset=utf-8")
+            self.end_headers()
+            self.wfile.write(page.encode("utf-8"))
+
+        def log_message(self, format: str, *args: Any) -> None:  # noqa: A003
+            # Suppress the default per-request logging.
+            return
+
+    return _XAICallbackHandler, result
+
+
+def _xai_start_callback_server(
+    preferred_port: int = XAI_OAUTH_REDIRECT_PORT,
+) -> tuple[HTTPServer, threading.Thread, dict[str, Any], str]:
+    """Start the loopback HTTP server that receives the xAI OAuth callback.
+
+    Binds ``XAI_OAUTH_REDIRECT_HOST`` on ``preferred_port``; if that bind
+    fails and the preferred port is not already 0, retries on port 0 so the
+    OS picks a free port. The server is served from a daemon thread.
+
+    Returns:
+        ``(server, thread, result, redirect_uri)`` where ``result`` is the
+        dict the handler fills in and ``redirect_uri`` reflects the port that
+        was actually bound.
+
+    Raises:
+        AuthError: with code ``xai_callback_bind_failed`` if no port binds.
+    """
+    host = XAI_OAUTH_REDIRECT_HOST
+    expected_path = XAI_OAUTH_REDIRECT_PATH
+    handler_cls, result = _make_xai_callback_handler(expected_path)
+
+    class _ReuseHTTPServer(HTTPServer):
+        allow_reuse_address = True
+
+    candidate_ports = [preferred_port] if preferred_port == 0 else [preferred_port, 0]
+    bind_error: Optional[OSError] = None
+    for candidate in candidate_ports:
+        try:
+            server = _ReuseHTTPServer((host, candidate), handler_cls)
+        except OSError as exc:
+            bind_error = exc
+        else:
+            break
+    else:
+        # Every candidate port failed to bind.
+        raise AuthError(
+            f"Could not bind xAI callback server on {host}:{preferred_port}: {bind_error}",
+            provider="xai-oauth",
+            code="xai_callback_bind_failed",
+        ) from bind_error
+
+    bound_port = int(server.server_address[1])
+    thread = threading.Thread(
+        target=server.serve_forever,
+        kwargs={"poll_interval": 0.1},
+        daemon=True,
+    )
+    thread.start()
+    redirect_uri = f"http://{host}:{bound_port}{expected_path}"
+    return server, thread, result, redirect_uri
+
+
+def _xai_wait_for_callback(
+    server: HTTPServer,
+    thread: threading.Thread,
+    result: dict[str, Any],
+    *,
+    timeout_seconds: float = 180.0,
+) -> dict[str, Any]:
+    """Poll ``result`` until the callback handler records a code or an error.
+
+    The wait lasts at least 5 seconds regardless of ``timeout_seconds``. The
+    server is always shut down and closed, and the serving thread joined
+    (for up to 1 second), before this function returns or raises.
+
+    Returns:
+        The same ``result`` dict once ``code`` or ``error`` is truthy.
+
+    Raises:
+        AuthError: with code ``xai_callback_timeout`` if neither arrives.
+    """
+    wait_budget = max(5.0, timeout_seconds)
+    give_up_at = time.monotonic() + wait_budget
+    try:
+        while time.monotonic() < give_up_at:
+            callback_arrived = result["code"] or result["error"]
+            if callback_arrived:
+                return result
+            time.sleep(0.1)
+    finally:
+        server.shutdown()
+        server.server_close()
+        thread.join(timeout=1.0)
+    raise AuthError(
+        "xAI authorization timed out waiting for the local callback.",
+        provider="xai-oauth",
+        code="xai_callback_timeout",
+    )
+
+
 def _spotify_token_payload_to_state(
    token_payload: Dict[str, Any],
    *,
@@ -2349,6 +2531,8 @@ def login_spotify_command(args) -> None:
    print(f"Full setup guide: {SPOTIFY_DOCS_URL}")
    print()

+    _print_loopback_ssh_hint(redirect_uri, docs_url=SPOTIFY_DOCS_URL)
+
    if open_browser and not _is_remote_session():
        try:
            opened = webbrowser.open(authorize_url)
@@ -2405,6 +2589,45 @@ def _is_remote_session() -> bool:
    return bool(os.getenv("SSH_CLIENT") or os.getenv("SSH_TTY"))


+def _print_loopback_ssh_hint(redirect_uri: str, *, docs_url: str | None = None) -> None:
+    """Print an SSH tunnel hint when running a loopback-redirect OAuth flow on a
+    remote host. The auth server (xAI, Spotify, ...) will redirect the user's
+    browser to ``127.0.0.1:<port>/callback``. If the browser is on a different
+    machine than the loopback listener (the usual SSH case), the redirect can't
+    reach the listener without a local port forward.
+
+    The hint is best-effort: silent if we don't think we're remote, or if we
+    can't parse a host/port out of the redirect URI.
+
+    Pass ``docs_url`` for a provider-specific guide (e.g. the xAI Grok OAuth
+    page); the generic OAuth-over-SSH guide is always shown after it.
+    """
+    if not _is_remote_session():
+        return
+    try:
+        parsed = urlparse(redirect_uri)
+        host = parsed.hostname or ""
+        # ``.port`` is evaluated lazily and raises ValueError for a
+        # non-numeric or out-of-range port, so it must live inside the
+        # ``try`` for the hint to stay silent on unparseable URIs.
+        port = parsed.port
+    except Exception:
+        return
+    if host not in ("127.0.0.1", "::1", "localhost") or not port:
+        return
+    print()
+    print("Remote session detected. Your browser will redirect to")
+    print(f"  {redirect_uri}")
+    print("which the loopback listener on THIS machine is waiting on. If your")
+    print("browser is on a different machine, forward the port first from your")
+    print("local machine in a separate terminal:")
+    print()
+    print(f"  ssh -N -L {port}:127.0.0.1:{port} <user>@<this-host>")
+    print()
+    print("Then open the authorize URL above in your local browser.")
+    if docs_url:
+        print(f"Provider docs:      {docs_url}")
+    print(f"SSH/jump-box guide: {OAUTH_OVER_SSH_DOCS_URL}")
+    print()
+
+
 # =============================================================================
 # OpenAI Codex auth — tokens stored in ~/.hermes/auth.json (not ~/.codex/)
 #
@@ -2680,6 +2903,348 @@ def resolve_codex_runtime_credentials(
    }


+# =============================================================================
+# xAI Grok OAuth — tokens stored in ~/.hermes/auth.json
+# =============================================================================
+
+def _read_xai_oauth_tokens(*, _lock: bool = True) -> Dict[str, Any]:
+    """Load and sanity-check the stored xAI OAuth state.
+
+    When ``_lock`` is true the auth store is loaded under
+    ``_auth_store_lock()``; pass ``_lock=False`` to load it without taking
+    the lock here.
+
+    Returns:
+        A dict with ``tokens``, ``last_refresh``, ``discovery`` (``{}`` when
+        absent) and ``redirect_uri``.
+
+    Raises:
+        AuthError: with ``relogin_required=True`` when the provider state,
+            the ``tokens`` dict, the access token or the refresh token is
+            missing.
+    """
+    def _relogin(message: str, code: str) -> AuthError:
+        return AuthError(message, provider="xai-oauth", code=code, relogin_required=True)
+
+    if not _lock:
+        auth_store = _load_auth_store()
+    else:
+        with _auth_store_lock():
+            auth_store = _load_auth_store()
+
+    state = _load_provider_state(auth_store, "xai-oauth")
+    if not state:
+        raise _relogin(
+            "No xAI OAuth credentials stored. Select xAI Grok OAuth (SuperGrok Subscription) in `hermes model`.",
+            "xai_auth_missing",
+        )
+
+    tokens = state.get("tokens")
+    if not isinstance(tokens, dict):
+        raise _relogin(
+            "xAI OAuth state is missing tokens. Re-authenticate with `hermes model`.",
+            "xai_auth_invalid_shape",
+        )
+
+    # Both tokens must be present and non-blank; access_token is checked first.
+    has_access = bool(str(tokens.get("access_token", "") or "").strip())
+    has_refresh = bool(str(tokens.get("refresh_token", "") or "").strip())
+    if not has_access:
+        raise _relogin(
+            "xAI OAuth state is missing access_token. Re-authenticate with `hermes model`.",
+            "xai_auth_missing_access_token",
+        )
+    if not has_refresh:
+        raise _relogin(
+            "xAI OAuth state is missing refresh_token. Re-authenticate with `hermes model`.",
+            "xai_auth_missing_refresh_token",
+        )
+
+    return {
+        "tokens": tokens,
+        "last_refresh": state.get("last_refresh"),
+        "discovery": state.get("discovery") or {},
+        "redirect_uri": state.get("redirect_uri"),
+    }
+
+
+def _save_xai_oauth_tokens(
+    tokens: Dict[str, Any],
+    *,
+    discovery: Optional[Dict[str, Any]] = None,
+    redirect_uri: str = "",
+    last_refresh: Optional[str] = None,
+) -> None:
+    """Write xAI OAuth tokens into the ``xai-oauth`` provider state.
+
+    ``last_refresh`` defaults to the current UTC time in ISO-8601 form with a
+    ``Z`` suffix. ``discovery`` and ``redirect_uri`` are only written when
+    truthy, so an existing stored value is kept otherwise. The load, update
+    and save all happen under ``_auth_store_lock()``.
+    """
+    stamp = last_refresh
+    if stamp is None:
+        stamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
+
+    with _auth_store_lock():
+        store = _load_auth_store()
+        provider_state = _load_provider_state(store, "xai-oauth") or {}
+        provider_state.update(
+            {
+                "tokens": tokens,
+                "last_refresh": stamp,
+                "auth_mode": "oauth_pkce",
+            }
+        )
+        if discovery:
+            provider_state["discovery"] = discovery
+        if redirect_uri:
+            provider_state["redirect_uri"] = redirect_uri
+        _save_provider_state(store, "xai-oauth", provider_state)
+        _save_auth_store(store)
+
+
+def _xai_access_token_is_expiring(access_token: str, skew_seconds: int = 0) -> bool:
+    """Report whether a JWT-shaped access token expires within the skew window.
+
+    The second dot-separated segment is base64url-decoded as JSON and its
+    numeric ``exp`` claim compared with ``time.time()`` plus ``skew_seconds``
+    (negative skews are treated as 0). Anything that cannot be decoded this
+    way (non-string input, no dot, bad base64/JSON, missing or non-numeric
+    ``exp``) is reported as not expiring.
+    """
+    looks_like_jwt = isinstance(access_token, str) and "." in access_token
+    if not looks_like_jwt:
+        return False
+    try:
+        claims_segment = access_token.split(".")[1]
+        # Restore the base64 padding that JWT encoding strips.
+        padding = "=" * (-len(claims_segment) % 4)
+        raw_claims = base64.urlsafe_b64decode((claims_segment + padding).encode("ascii"))
+        claims = json.loads(raw_claims.decode("utf-8"))
+        expires_at = claims.get("exp")
+        if isinstance(expires_at, (int, float)):
+            cutoff = time.time() + max(0, int(skew_seconds))
+            return float(expires_at) <= cutoff
+        return False
+    except Exception:
+        return False
+
+
+def _xai_validate_oauth_endpoint(url: str, *, field: str) -> str:
+    """Refuse any OIDC endpoint that isn't HTTPS on the xAI origin.
+
+    Discovery output is cached in ``~/.hermes/auth.json``, so a single MITM
+    during initial login could plant a malicious ``token_endpoint`` that
+    would then receive the refresh_token on every later refresh. Pinning the
+    scheme to ``https`` and the host to ``x.ai`` or a ``*.x.ai`` subdomain
+    stops such a poisoned cache entry from being used.
+
+    Args:
+        url: Endpoint URL to check.
+        field: Name of the discovery field, used only in error messages.
+
+    Returns:
+        ``url`` unchanged when it passes every check.
+
+    Raises:
+        AuthError: with code ``xai_discovery_invalid`` otherwise.
+    """
+    def _reject(message: str) -> AuthError:
+        return AuthError(message, provider="xai-oauth", code="xai_discovery_invalid")
+
+    endpoint = urlparse(url)
+    if endpoint.scheme != "https":
+        raise _reject(f"xAI OIDC discovery returned a non-HTTPS {field}: {url!r}.")
+
+    host = (endpoint.hostname or "").lower()
+    if not host:
+        raise _reject(f"xAI OIDC discovery {field} is missing a hostname: {url!r}.")
+
+    # Accept the bare apex or any subdomain; a plain suffix test without the
+    # leading dot would also match unrelated hosts such as "notx.ai".
+    on_xai_origin = host == "x.ai" or host.endswith(".x.ai")
+    if not on_xai_origin:
+        raise _reject(
+            f"xAI OIDC discovery {field} host {host!r} is not on the xAI origin "
+            f"(expected x.ai or a *.x.ai subdomain). Refusing to use a cached "
+            f"endpoint that may have been substituted by a MITM during initial "
+            f"discovery; re-authenticate with `hermes model` to re-fetch."
+        )
+    return url
+
+
+def _xai_oauth_discovery(timeout_seconds: float = 15.0) -> Dict[str, str]:
+    """Fetch the xAI OIDC discovery document and return its validated endpoints.
+
+    Returns:
+        A dict with ``authorization_endpoint`` and ``token_endpoint``, both
+        checked by ``_xai_validate_oauth_endpoint``.
+
+    Raises:
+        AuthError: if the request fails, returns a non-200 status, is not a
+            JSON object, lacks either endpoint, or an endpoint fails
+            validation.
+    """
+    try:
+        response = httpx.get(
+            XAI_OAUTH_DISCOVERY_URL,
+            headers={"Accept": "application/json"},
+            timeout=timeout_seconds,
+        )
+    except Exception as exc:
+        raise AuthError(
+            f"xAI OIDC discovery failed: {exc}",
+            provider="xai-oauth",
+            code="xai_discovery_failed",
+        ) from exc
+
+    status = response.status_code
+    if status != 200:
+        raise AuthError(
+            f"xAI OIDC discovery returned status {status}.",
+            provider="xai-oauth",
+            code="xai_discovery_failed",
+        )
+
+    try:
+        document = response.json()
+    except Exception as exc:
+        raise AuthError(
+            f"xAI OIDC discovery returned invalid JSON: {exc}",
+            provider="xai-oauth",
+            code="xai_discovery_invalid_json",
+        ) from exc
+    if not isinstance(document, dict):
+        raise AuthError(
+            "xAI OIDC discovery response was not a JSON object.",
+            provider="xai-oauth",
+            code="xai_discovery_incomplete",
+        )
+
+    # Normalise both required endpoints to stripped strings ("" when absent).
+    endpoints = {
+        name: str(document.get(name, "") or "").strip()
+        for name in ("authorization_endpoint", "token_endpoint")
+    }
+    if not all(endpoints.values()):
+        raise AuthError(
+            "xAI OIDC discovery response was missing required endpoints.",
+            provider="xai-oauth",
+            code="xai_discovery_incomplete",
+        )
+    for name, endpoint_url in endpoints.items():
+        _xai_validate_oauth_endpoint(endpoint_url, field=name)
+    return endpoints
+
+
+def refresh_xai_oauth_pure(
+    access_token: str,
+    refresh_token: str,
+    *,
+    token_endpoint: str = "",
+    timeout_seconds: float = 20.0,
+) -> Dict[str, Any]:
+    """Exchange an xAI refresh token for new tokens without touching the auth store.
+
+    Args:
+        access_token: Accepted but unused; it is deleted immediately.
+        refresh_token: Refresh token to present to the token endpoint.
+        token_endpoint: Token endpoint URL. When blank it is obtained from
+            ``_xai_oauth_discovery``.
+        timeout_seconds: HTTP timeout; values below 5 seconds are raised to 5
+            for the token request.
+
+    Returns:
+        A dict with ``access_token``, ``refresh_token`` (the old one when the
+        response omits it), ``id_token``, ``expires_in``, ``token_type``
+        (``"Bearer"`` by default) and a UTC ``last_refresh`` timestamp.
+
+    Raises:
+        AuthError: if the refresh token is blank, the endpoint fails
+            validation, the response status is not 200, or the response body
+            is not a JSON object containing ``access_token``.
+    """
+    # NOTE(review): exceptions raised by ``client.post`` itself (timeouts,
+    # connection errors) are not converted to AuthError here, unlike in
+    # ``_xai_oauth_discovery`` — confirm callers expect that.
+    del access_token
+    if not isinstance(refresh_token, str) or not refresh_token.strip():
+        raise AuthError(
+            "xAI OAuth is missing refresh_token. Re-authenticate with `hermes model`.",
+            provider="xai-oauth",
+            code="xai_auth_missing_refresh_token",
+            relogin_required=True,
+        )
+    endpoint = token_endpoint.strip() or _xai_oauth_discovery(timeout_seconds)["token_endpoint"]
+    # Re-validate cached endpoints on the refresh hot path: an auth.json
+    # written by an older Hermes (or hand-edited) may carry a non-xAI
+    # token_endpoint that would receive every future refresh_token in
+    # plaintext if we trusted it blindly. Cheap suffix check; fast-fail
+    # with a clear error so the user can re-run `hermes model` to refetch.
+    _xai_validate_oauth_endpoint(endpoint, field="token_endpoint")
+    timeout = httpx.Timeout(max(5.0, float(timeout_seconds)))
+    with httpx.Client(timeout=timeout, headers={"Accept": "application/json"}) as client:
+        response = client.post(
+            endpoint,
+            headers={"Content-Type": "application/x-www-form-urlencoded"},
+            data={
+                "grant_type": "refresh_token",
+                "client_id": XAI_OAUTH_CLIENT_ID,
+                "refresh_token": refresh_token,
+            },
+        )
+    if response.status_code != 200:
+        detail = response.text.strip()
+        raise AuthError(
+            "xAI token refresh failed."
+            + (f" Response: {detail}" if detail else ""),
+            provider="xai-oauth",
+            code="xai_refresh_failed",
+            # Only 400/401/403 are flagged as needing a fresh login; other
+            # statuses leave relogin_required false.
+            relogin_required=(response.status_code in {400, 401, 403}),
+        )
+    try:
+        payload = response.json()
+    except Exception as exc:
+        raise AuthError(
+            f"xAI token refresh returned invalid JSON: {exc}",
+            provider="xai-oauth",
+            code="xai_refresh_invalid_json",
+        ) from exc
+    if not isinstance(payload, dict):
+        raise AuthError(
+            "xAI token refresh response was not a JSON object.",
+            provider="xai-oauth",
+            code="xai_refresh_invalid_response",
+            relogin_required=True,
+        )
+    refreshed_access = str(payload.get("access_token", "") or "").strip()
+    if not refreshed_access:
+        raise AuthError(
+            "xAI token refresh response was missing access_token.",
+            provider="xai-oauth",
+            code="xai_refresh_missing_access_token",
+            relogin_required=True,
+        )
+    updated = {
+        "access_token": refreshed_access,
+        # Keep the refresh token that was sent when the response has none.
+        "refresh_token": str(payload.get("refresh_token") or refresh_token).strip(),
+        "id_token": str(payload.get("id_token") or "").strip(),
+        "expires_in": payload.get("expires_in"),
+        "token_type": str(payload.get("token_type") or "Bearer").strip() or "Bearer",
+        "last_refresh": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+    }
+    return updated
+
+
+def _refresh_xai_oauth_tokens(
+    tokens: Dict[str, Any],
+    *,
+    token_endpoint: str,
+    redirect_uri: str = "",
+    timeout_seconds: float,
+) -> Dict[str, Any]:
+    """Refresh the given xAI tokens and save the merged result.
+
+    Calls ``refresh_xai_oauth_pure`` with the current tokens, overlays the
+    refreshed values on a copy of ``tokens`` (the input dict is not mutated),
+    saves the copy via ``_save_xai_oauth_tokens`` and returns it.
+    """
+    current_access = str(tokens.get("access_token", "") or "")
+    current_refresh = str(tokens.get("refresh_token", "") or "")
+    refreshed = refresh_xai_oauth_pure(
+        current_access,
+        current_refresh,
+        token_endpoint=token_endpoint,
+        timeout_seconds=timeout_seconds,
+    )
+
+    merged = {
+        **tokens,
+        "access_token": refreshed["access_token"],
+        "refresh_token": refreshed["refresh_token"],
+    }
+    # Optional fields only overwrite the stored value when the refresh
+    # returned something usable: truthy for id_token/token_type, non-None
+    # for expires_in.
+    optional_fields = (
+        ("id_token", bool),
+        ("expires_in", lambda value: value is not None),
+        ("token_type", bool),
+    )
+    for key, is_usable in optional_fields:
+        value = refreshed.get(key)
+        if is_usable(value):
+            merged[key] = value
+
+    _save_xai_oauth_tokens(
+        merged,
+        discovery={"token_endpoint": token_endpoint},
+        redirect_uri=redirect_uri,
+        last_refresh=refreshed["last_refresh"],
+    )
+    return merged
+
+
+def resolve_xai_oauth_runtime_credentials(
+    *,
+    force_refresh: bool = False,
+    refresh_if_expiring: bool = True,
+    refresh_skew_seconds: int = XAI_ACCESS_TOKEN_REFRESH_SKEW_SECONDS,
+) -> Dict[str, Any]:
+    """Return runtime credentials for the ``xai-oauth`` provider.
+
+    Reads the stored tokens and refreshes them when ``force_refresh`` is set,
+    or when ``refresh_if_expiring`` is set and the access token expires
+    within ``refresh_skew_seconds``. The refresh decision is re-evaluated
+    after taking the auth-store lock, using a fresh read of the store, so a
+    refresh is skipped if the stored token is no longer expiring by then.
+
+    The refresh HTTP timeout comes from ``HERMES_XAI_REFRESH_TIMEOUT_SECONDS``
+    and defaults to 20 seconds when the variable is unset, blank or not a
+    number. The base URL comes from ``HERMES_XAI_BASE_URL``, then
+    ``XAI_BASE_URL``, then ``DEFAULT_XAI_OAUTH_BASE_URL``.
+
+    Returns:
+        A dict with ``provider``, ``base_url``, ``api_key`` (the access
+        token), ``source``, ``last_refresh`` and ``auth_mode``.
+
+    Raises:
+        AuthError: propagated from reading, discovering or refreshing.
+    """
+    data = _read_xai_oauth_tokens()
+    tokens = dict(data["tokens"])
+    access_token = str(tokens.get("access_token", "") or "").strip()
+    # A blank or malformed override must not break every credential
+    # resolution with a bare ValueError; fall back to the default instead.
+    raw_timeout = (os.getenv("HERMES_XAI_REFRESH_TIMEOUT_SECONDS") or "").strip()
+    try:
+        refresh_timeout_seconds = float(raw_timeout) if raw_timeout else 20.0
+    except ValueError:
+        refresh_timeout_seconds = 20.0
+    discovery = dict(data.get("discovery") or {})
+    token_endpoint = str(discovery.get("token_endpoint", "") or "").strip()
+    redirect_uri = str(data.get("redirect_uri", "") or "").strip()
+
+    should_refresh = bool(force_refresh)
+    if (not should_refresh) and refresh_if_expiring:
+        should_refresh = _xai_access_token_is_expiring(access_token, refresh_skew_seconds)
+    if should_refresh:
+        # Hold the lock for at least as long as the refresh request may take.
+        with _auth_store_lock(timeout_seconds=max(float(AUTH_LOCK_TIMEOUT_SECONDS), refresh_timeout_seconds + 5.0)):
+            # Re-read under the lock and decide again from the fresh state.
+            data = _read_xai_oauth_tokens(_lock=False)
+            tokens = dict(data["tokens"])
+            access_token = str(tokens.get("access_token", "") or "").strip()
+            discovery = dict(data.get("discovery") or {})
+            token_endpoint = str(discovery.get("token_endpoint", "") or "").strip()
+            redirect_uri = str(data.get("redirect_uri", "") or "").strip()
+            should_refresh = bool(force_refresh)
+            if (not should_refresh) and refresh_if_expiring:
+                should_refresh = _xai_access_token_is_expiring(access_token, refresh_skew_seconds)
+            if should_refresh:
+                if not token_endpoint:
+                    token_endpoint = _xai_oauth_discovery(refresh_timeout_seconds)["token_endpoint"]
+                tokens = _refresh_xai_oauth_tokens(
+                    tokens,
+                    token_endpoint=token_endpoint,
+                    redirect_uri=redirect_uri,
+                    timeout_seconds=refresh_timeout_seconds,
+                )
+                access_token = str(tokens.get("access_token", "") or "").strip()
+
+    base_url = (
+        os.getenv("HERMES_XAI_BASE_URL", "").strip().rstrip("/")
+        or os.getenv("XAI_BASE_URL", "").strip().rstrip("/")
+        or DEFAULT_XAI_OAUTH_BASE_URL
+    )
+    return {
+        "provider": "xai-oauth",
+        "base_url": base_url,
+        "api_key": access_token,
+        "source": "hermes-auth-store",
+        # NOTE(review): ``data`` was read before any refresh performed above,
+        # so this looks like it reports the previous refresh time right after
+        # a refresh — confirm whether callers rely on it.
+        "last_refresh": data.get("last_refresh"),
+        "auth_mode": "oauth_pkce",
+    }
+
+
 # =============================================================================
 # TLS verification helper
 # =============================================================================
@@ -4030,6 +4595,48 @@ def get_codex_auth_status() -> Dict[str, Any]:
        }


+def get_xai_oauth_auth_status() -> Dict[str, Any]:
+    """Report whether usable xAI OAuth credentials are available.
+
+    The credential pool is consulted first, best-effort: any exception there
+    is ignored. If it yields no unexpired key, the stored credentials are
+    resolved via ``resolve_xai_oauth_runtime_credentials``.
+
+    Returns:
+        On success a dict with ``logged_in=True``, ``auth_store``,
+        ``last_refresh``, ``auth_mode``, ``source`` and ``api_key`` (the raw
+        access token). On ``AuthError`` a dict with ``logged_in=False``,
+        ``auth_store`` and ``error``.
+    """
+    try:
+        from agent.credential_pool import load_pool
+
+        pool = load_pool("xai-oauth")
+        entry = pool.select() if pool and pool.has_credentials() else None
+        if entry is not None:
+            pooled_key = getattr(entry, "runtime_api_key", None) or getattr(entry, "access_token", "")
+            # Only report a pooled key that is present and not yet expired.
+            if pooled_key and not _xai_access_token_is_expiring(pooled_key, 0):
+                return {
+                    "logged_in": True,
+                    "auth_store": str(_auth_file_path()),
+                    "last_refresh": getattr(entry, "last_refresh", None),
+                    "auth_mode": "oauth_pkce",
+                    "source": f"pool:{getattr(entry, 'label', 'unknown')}",
+                    "api_key": pooled_key,
+                }
+    except Exception:
+        # Pool lookup is optional; fall through to the auth store.
+        pass
+
+    try:
+        creds = resolve_xai_oauth_runtime_credentials()
+    except AuthError as exc:
+        return {
+            "logged_in": False,
+            "auth_store": str(_auth_file_path()),
+            "error": str(exc),
+        }
+    return {
+        "logged_in": True,
+        "auth_store": str(_auth_file_path()),
+        "last_refresh": creds.get("last_refresh"),
+        "auth_mode": creds.get("auth_mode"),
+        "source": creds.get("source"),
+        "api_key": creds.get("api_key"),
+    }
+
+
 def get_api_key_provider_status(provider_id: str) -> Dict[str, Any]:
    """Status snapshot for API-key providers (z.ai, Kimi, MiniMax)."""
    pconfig = PROVIDER_REGISTRY.get(provider_id)
@@ -4100,6 +4707,8 @@ def get_auth_status(provider_id: Optional[str] = None) -> Dict[str, Any]:
        return get_nous_auth_status()
    if target == "openai-codex":
        return get_codex_auth_status()
+    if target == "xai-oauth":
+        return get_xai_oauth_auth_status()
    if target == "qwen-oauth":
        return get_qwen_auth_status()
    if target == "google-gemini-cli":
@@ -4320,7 +4929,7 @@ def _logout_default_provider_from_config() -> Optional[str]:
    "No provider is currently logged in" and never reset model.provider.
    """
    provider = _get_config_provider()
-    if provider in {"nous", "openai-codex"}:
+    if provider in {"nous", "openai-codex", "xai-oauth"}:
        return provider
    return None

@@ -4619,6 +5228,247 @@ def _login_openai_codex(
    print(f"  Config updated: {config_path} (model.provider=openai-codex)")


+def _login_xai_oauth(
+    args,
+    pconfig: ProviderConfig,
+    *,
+    force_new_login: bool = False,
+) -> None:
+    """Interactive ``xai-oauth`` login: reuse stored credentials or sign in afresh.
+
+    Unless ``force_new_login`` is set, stored credentials whose access token
+    is not expiring within 60 seconds are offered for reuse; accepting only
+    rewrites the provider config and returns. Otherwise the loopback OAuth
+    flow runs, its tokens are saved, and the config is pointed at
+    ``xai-oauth``.
+
+    ``args`` is read for the optional ``timeout`` and ``no_browser``
+    attributes. ``pconfig`` is accepted but unused (deleted immediately).
+    """
+    del pconfig
+
+    if not force_new_login:
+        try:
+            stored = resolve_xai_oauth_runtime_credentials()
+            stored_token = stored.get("api_key", "")
+            token_usable = (
+                isinstance(stored_token, str)
+                and stored_token
+                and not _xai_access_token_is_expiring(stored_token, 60)
+            )
+            if token_usable:
+                print("Existing xAI OAuth credentials found in Hermes auth store.")
+                try:
+                    answer = input("Use existing credentials? [Y/n]: ").strip().lower()
+                except (EOFError, KeyboardInterrupt):
+                    # No answer could be read (closed stdin / Ctrl-C): default to "yes".
+                    answer = "y"
+                if answer in ("", "y", "yes"):
+                    config_path = _update_config_for_provider(
+                        "xai-oauth",
+                        stored.get("base_url", DEFAULT_XAI_OAUTH_BASE_URL),
+                    )
+                    print()
+                    print("Login successful!")
+                    print(f"  Config updated: {config_path} (model.provider=xai-oauth)")
+                    return
+        except AuthError:
+            # Any AuthError while probing or reusing stored credentials
+            # falls through to a fresh login below.
+            pass
+
+    print()
+    print("Signing in to xAI Grok OAuth (SuperGrok Subscription)...")
+    print("(Hermes creates its own local OAuth session)")
+    print()
+
+    timeout_seconds = float(getattr(args, "timeout", None) or 20.0)
+    wants_browser = not getattr(args, "no_browser", False)
+    # A remote session never gets a local browser launch.
+    open_browser = False if _is_remote_session() else wants_browser
+
+    login_result = _xai_oauth_loopback_login(
+        timeout_seconds=timeout_seconds,
+        open_browser=open_browser,
+    )
+    _save_xai_oauth_tokens(
+        login_result["tokens"],
+        discovery=login_result.get("discovery"),
+        redirect_uri=login_result.get("redirect_uri", ""),
+        last_refresh=login_result.get("last_refresh"),
+    )
+    config_path = _update_config_for_provider(
+        "xai-oauth",
+        login_result.get("base_url", DEFAULT_XAI_OAUTH_BASE_URL),
+    )
+    print()
+    print("Login successful!")
+    from hermes_constants import display_hermes_home as _dhh
+    print(f"  Auth state: {_dhh()}/auth.json")
+    print(f"  Config updated: {config_path} (model.provider=xai-oauth)")
+
+
+def _xai_oauth_build_authorize_url(
+    *,
+    authorization_endpoint: str,
+    redirect_uri: str,
+    code_challenge: str,
+    state: str,
+    nonce: str,
+) -> str:
+    """Return the xAI authorize URL: ``authorization_endpoint`` plus a query string.
+
+    Two non-standard parameters are always appended:
+
+    * ``plan=generic`` opts the consent screen into xAI's generic OAuth plan
+      tier instead of falling back to the per-account default. Without it,
+      accounts.x.ai rejects loopback OAuth from non-allowlisted clients.
+    * ``referrer=hermes-agent`` lets xAI attribute Hermes-originated logins
+      in their OAuth server logs (we still impersonate the upstream Grok-CLI
+      client_id; this is best-effort attribution until xAI mints us our own).
+    """
+    # Ordered pairs: urlencode emits the parameters in exactly this order.
+    query_pairs = [
+        ("response_type", "code"),
+        ("client_id", XAI_OAUTH_CLIENT_ID),
+        ("redirect_uri", redirect_uri),
+        ("scope", XAI_OAUTH_SCOPE),
+        ("code_challenge", code_challenge),
+        ("code_challenge_method", "S256"),
+        ("state", state),
+        ("nonce", nonce),
+        ("plan", "generic"),
+        ("referrer", "hermes-agent"),
+    ]
+    query_string = urlencode(query_pairs)
+    return f"{authorization_endpoint}?{query_string}"
+
+
+def _xai_oauth_loopback_login(
+    *,
+    timeout_seconds: float = 20.0,
+    open_browser: bool = True,
+) -> Dict[str, Any]:
+    """Run the xAI OAuth authorization-code flow with PKCE over a loopback callback.
+
+    Steps: fetch the OAuth discovery document, start the local callback
+    server, print (and optionally open) the authorize URL, wait for the
+    callback, validate it, then exchange the authorization code for tokens
+    at the discovered token endpoint.
+
+    Args:
+        timeout_seconds: Passed to discovery. The callback wait uses
+            ``max(30.0, timeout_seconds * 9)`` and the token exchange uses
+            ``max(20.0, timeout_seconds)``.
+        open_browser: When true and the session is not remote, try to open
+            the authorize URL with ``webbrowser.open``.
+
+    Returns:
+        A dict with keys ``tokens`` (``access_token``, ``refresh_token``,
+        ``id_token``, ``expires_in``, ``token_type``), ``discovery``,
+        ``redirect_uri``, ``base_url``, ``last_refresh`` and ``source``.
+
+    Raises:
+        AuthError: When the callback reports an error, the returned state
+            does not match, the code is missing, or the token exchange
+            fails / returns an unusable payload. Exceptions raised while
+            setting up or waiting for the callback are re-raised after the
+            callback server has been torn down.
+    """
+    discovery = _xai_oauth_discovery(timeout_seconds)
+    authorization_endpoint = discovery["authorization_endpoint"]
+    token_endpoint = discovery["token_endpoint"]
+
+    server, thread, callback_result, redirect_uri = _xai_start_callback_server()
+    try:
+        _xai_validate_loopback_redirect_uri(redirect_uri)
+        code_verifier = _oauth_pkce_code_verifier()
+        code_challenge = _oauth_pkce_code_challenge(code_verifier)
+        # state and nonce are independent random values, separate from the
+        # PKCE verifier generated above.
+        state = uuid.uuid4().hex
+        nonce = uuid.uuid4().hex
+        authorize_url = _xai_oauth_build_authorize_url(
+            authorization_endpoint=authorization_endpoint,
+            redirect_uri=redirect_uri,
+            code_challenge=code_challenge,
+            state=state,
+            nonce=nonce,
+        )
+
+        # Always print the URL so the user can open it manually.
+        print("Open this URL to authorize Hermes with xAI:")
+        print(authorize_url)
+        print()
+        print(f"Waiting for callback on {redirect_uri}")
+
+        _print_loopback_ssh_hint(redirect_uri, docs_url=XAI_OAUTH_DOCS_URL)
+
+        if open_browser and not _is_remote_session():
+            try:
+                opened = webbrowser.open(authorize_url)
+            except Exception:
+                # Best-effort: a failed launch is reported, not raised.
+                opened = False
+            if opened:
+                print("Browser opened for xAI authorization.")
+            else:
+                print("Could not open the browser automatically; use the URL above.")
+
+        callback = _xai_wait_for_callback(
+            server,
+            thread,
+            callback_result,
+            timeout_seconds=max(30.0, timeout_seconds * 9),
+        )
+    except Exception:
+        # Failure before a callback was obtained: tear down the callback
+        # server and its thread (best-effort), then re-raise the original error.
+        try:
+            server.shutdown()
+            server.server_close()
+        except Exception:
+            pass
+        try:
+            thread.join(timeout=1.0)
+        except Exception:
+            pass
+        raise
+
+    if callback.get("error"):
+        detail = callback.get("error_description") or callback["error"]
+        raise AuthError(
+            f"xAI authorization failed: {detail}",
+            provider="xai-oauth",
+            code="xai_authorization_failed",
+        )
+    # The callback must echo back the state value sent in the authorize URL.
+    if callback.get("state") != state:
+        raise AuthError(
+            "xAI authorization failed: state mismatch.",
+            provider="xai-oauth",
+            code="xai_state_mismatch",
+        )
+    code = str(callback.get("code") or "").strip()
+    if not code:
+        raise AuthError(
+            "xAI authorization failed: missing authorization code.",
+            provider="xai-oauth",
+            code="xai_code_missing",
+        )
+
+    # Exchange the authorization code (plus the PKCE verifier) for tokens.
+    try:
+        response = httpx.post(
+            token_endpoint,
+            headers={"Content-Type": "application/x-www-form-urlencoded", "Accept": "application/json"},
+            data={
+                "grant_type": "authorization_code",
+                "code": code,
+                "redirect_uri": redirect_uri,
+                "client_id": XAI_OAUTH_CLIENT_ID,
+                "code_verifier": code_verifier,
+            },
+            timeout=max(20.0, timeout_seconds),
+        )
+    except Exception as exc:
+        raise AuthError(
+            f"xAI token exchange failed: {exc}",
+            provider="xai-oauth",
+            code="xai_token_exchange_failed",
+        ) from exc
+    if response.status_code != 200:
+        detail = response.text.strip()
+        raise AuthError(
+            "xAI token exchange failed."
+            + (f" Response: {detail}" if detail else ""),
+            provider="xai-oauth",
+            code="xai_token_exchange_failed",
+        )
+    try:
+        payload = response.json()
+    except Exception as exc:
+        raise AuthError(
+            f"xAI token exchange returned invalid JSON: {exc}",
+            provider="xai-oauth",
+            code="xai_token_exchange_invalid",
+        ) from exc
+    if not isinstance(payload, dict):
+        raise AuthError(
+            "xAI token exchange response was not a JSON object.",
+            provider="xai-oauth",
+            code="xai_token_exchange_invalid",
+        )
+    # Both tokens are required; a response missing either one is rejected.
+    access_token = str(payload.get("access_token", "") or "").strip()
+    refresh_token = str(payload.get("refresh_token", "") or "").strip()
+    if not access_token:
+        raise AuthError(
+            "xAI token exchange did not return an access_token.",
+            provider="xai-oauth",
+            code="xai_token_exchange_invalid",
+        )
+    if not refresh_token:
+        raise AuthError(
+            "xAI token exchange did not return a refresh_token.",
+            provider="xai-oauth",
+            code="xai_token_exchange_invalid",
+        )
+
+    # Base URL precedence: HERMES_XAI_BASE_URL, then XAI_BASE_URL, then the
+    # built-in default. Trailing slashes are stripped from the env values.
+    base_url = (
+        os.getenv("HERMES_XAI_BASE_URL", "").strip().rstrip("/")
+        or os.getenv("XAI_BASE_URL", "").strip().rstrip("/")
+        or DEFAULT_XAI_OAUTH_BASE_URL
+    )
+    return {
+        "tokens": {
+            "access_token": access_token,
+            "refresh_token": refresh_token,
+            "id_token": str(payload.get("id_token", "") or "").strip(),
+            "expires_in": payload.get("expires_in"),
+            "token_type": str(payload.get("token_type") or "Bearer").strip() or "Bearer",
+        },
+        "discovery": discovery,
+        "redirect_uri": redirect_uri,
+        "base_url": base_url,
+        # UTC timestamp in ISO-8601 form with a trailing "Z".
+        "last_refresh": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+        "source": "oauth-loopback",
+    }
+
+
 def _codex_device_code_login() -> Dict[str, Any]:
    """Run the OpenAI device code login flow and return credentials dict."""
    import time as _time
@@ -33,7 +33,7 @@ from hermes_constants import OPENROUTER_BASE_URL


 # Providers that support OAuth login in addition to API keys.
-_OAUTH_CAPABLE_PROVIDERS = {"anthropic", "nous", "openai-codex", "qwen-oauth", "google-gemini-cli", "minimax-oauth"}
+_OAUTH_CAPABLE_PROVIDERS = {"anthropic", "nous", "openai-codex", "xai-oauth", "qwen-oauth", "google-gemini-cli", "minimax-oauth"}


 def _get_custom_provider_names() -> list:
@@ -77,6 +77,8 @@ def _normalize_provider(provider: str) -> str:
    normalized = (provider or "").strip().lower()
    if normalized in {"or", "open-router"}:
        return "openrouter"
+    if normalized in {"grok-oauth", "xai-oauth", "x-ai-oauth", "xai-grok-oauth"}:
+        return "xai-oauth"
    # Check if it matches a custom provider name
    custom_key = _resolve_custom_provider_input(normalized)
    if custom_key:
@@ -170,7 +172,7 @@ def auth_add_command(args) -> None:
        if provider.startswith(CUSTOM_POOL_PREFIX):
            requested_type = AUTH_TYPE_API_KEY
        else:
-            requested_type = AUTH_TYPE_OAUTH if provider in {"anthropic", "nous", "openai-codex", "qwen-oauth", "google-gemini-cli", "minimax-oauth"} else AUTH_TYPE_API_KEY
+            requested_type = AUTH_TYPE_OAUTH if provider in _OAUTH_CAPABLE_PROVIDERS else AUTH_TYPE_API_KEY

    pool = load_pool(provider)

@@ -333,6 +335,31 @@ def auth_add_command(args) -> None:
        print(f'Added {provider} OAuth credential #{len(pool.entries())}: "{entry.label}"')
        return

+    if provider == "xai-oauth":
+        creds = auth_mod._xai_oauth_loopback_login(
+            timeout_seconds=getattr(args, "timeout", None) or 20.0,
+            open_browser=not getattr(args, "no_browser", False),
+        )
+        label = (getattr(args, "label", None) or "").strip() or label_from_token(
+            creds["tokens"]["access_token"],
+            _oauth_default_label(provider, len(pool.entries()) + 1),
+        )
+        entry = PooledCredential(
+            provider=provider,
+            id=uuid.uuid4().hex[:6],
+            label=label,
+            auth_type=AUTH_TYPE_OAUTH,
+            priority=0,
+            source=f"{SOURCE_MANUAL}:xai_pkce",
+            access_token=creds["tokens"]["access_token"],
+            refresh_token=creds["tokens"].get("refresh_token"),
+            base_url=creds.get("base_url"),
+            last_refresh=creds.get("last_refresh"),
+        )
+        pool.add_entry(entry)
+        print(f'Added {provider} OAuth credential #{len(pool.entries())}: "{entry.label}"')
+        return
+
    if provider == "google-gemini-cli":
        from agent.google_oauth import run_gemini_oauth_login_pure

@@ -175,6 +175,48 @@ def _check_via_local_git(repo_dir: Path) -> Optional[int]:
    return None


+def _version_tuple(v: str) -> tuple[int, ...]:
+    """Parse '0.13.0' into (0, 13, 0) for comparison.
+
+    Each dot-separated segment is converted with ``int()``. A segment that
+    is not a plain integer contributes its leading digits, so a pre-release
+    or post-release suffix glued to a number is tolerated ('0.15rc1' ->
+    (0, 15)). A segment with no leading digits becomes 0.
+
+    Previously a suffixed segment collapsed to 0, which made an installed
+    '0.15rc1' compare lower than a published '0.14.0' and produced a
+    spurious "update available" result.
+    """
+    import re
+
+    parts = []
+    for segment in v.split("."):
+        try:
+            parts.append(int(segment))
+        except ValueError:
+            # Keep the numeric prefix (e.g. "15" from "15rc1"); 0 if none.
+            leading = re.match(r"\s*([0-9]+)", segment)
+            parts.append(int(leading.group(1)) if leading else 0)
+    return tuple(parts)
+
+
+def _fetch_pypi_latest(package: str = "hermes-agent") -> Optional[str]:
+    """Ask PyPI's JSON endpoint for ``package``'s latest version string.
+
+    Any failure (network, HTTP, JSON decoding, unexpected payload shape)
+    yields ``None``. The request uses a 5 second timeout.
+    """
+    try:
+        import urllib.request
+
+        request = urllib.request.Request(
+            f"https://pypi.org/pypi/{package}/json",
+            headers={"Accept": "application/json"},
+        )
+        with urllib.request.urlopen(request, timeout=5) as response:
+            payload = json.loads(response.read())
+        info = payload.get("info", {})
+        return info.get("version")
+    except Exception:
+        return None
+
+
+def check_via_pypi() -> Optional[int]:
+    """Report whether the installed ``VERSION`` trails the latest PyPI release.
+
+    Returns ``None`` when PyPI could not be queried, ``1`` when the PyPI
+    version parses as newer than ``VERSION``, and ``0`` otherwise
+    (including when the two strings are identical).
+    """
+    newest = _fetch_pypi_latest()
+    if newest is None:
+        return None
+    if newest == VERSION:
+        return 0
+    try:
+        pypi_is_newer = _version_tuple(newest) > _version_tuple(VERSION)
+    except Exception:
+        # Comparison failed: fall back to plain string inequality.
+        return 1 if newest != VERSION else 0
+    return 1 if pypi_is_newer else 0
+
+
 def check_for_updates() -> Optional[int]:
    """Check whether a Hermes update is available.

@@ -213,8 +255,9 @@ def check_for_updates() -> Optional[int]:
        if not (repo_dir / ".git").exists():
            repo_dir = hermes_home / "hermes-agent"
        if not (repo_dir / ".git").exists():
-            return None
-        behind = _check_via_local_git(repo_dir)
+            behind = check_via_pypi()
+        else:
+            behind = _check_via_local_git(repo_dir)

    try:
        cache_file.write_text(json.dumps({"ts": now, "behind": behind, "rev": embedded_rev}))
@@ -470,6 +513,9 @@ def build_welcome_banner(console: Console, model: str, cwd: str,
        model_short = model_short[:25] + "..."
    ctx_str = f" [dim {dim}]·[/] [dim {dim}]{_format_context_length(context_length)} context[/]" if context_length else ""
    left_lines.append(f"[{accent}]{model_short}[/]{ctx_str} [dim {dim}]·[/] [dim {dim}]Nous Research[/]")
+
+    if os.getenv("HERMES_YOLO_MODE"):
+        left_lines.append(f"[bold red]⚠ YOLO mode[/] [dim {dim}]— all approval prompts bypassed[/]")
    left_lines.append(f"[dim {dim}]{cwd}[/]")
    if session_id:
        left_lines.append(f"[dim {session_color}]Session: {session_id}[/]")
@@ -304,6 +304,103 @@ def render_codex_toml_section(
    return "\n".join(out) + "\n"


+def _insert_managed_block_at_top_level(user_text: str, managed_block: str) -> str:
+    """Insert Hermes' managed Codex TOML block while keeping root keys root-scoped.
+
+    TOML has no syntax to return to the document root after a table header.
+    Therefore appending a root key like `default_permissions = ...` after a
+    user table such as `[features]` actually creates `features.default_permissions`,
+    which Codex rejects. Insert the managed block before the first table header
+    so its root keys remain top-level, while preserving user content verbatim.
+
+    The first table header is located with ``_looks_like_table_header``
+    rather than a bare ``startswith("[")``: a root-level multi-line array
+    continuation such as ``["nested"],`` also starts with ``[`` after
+    ``lstrip()``, and treating it as a header would splice the managed
+    block into the middle of the user's array.
+
+    Args:
+        user_text: Existing config text with any prior managed block removed.
+        managed_block: The rendered managed section to insert.
+
+    Returns:
+        ``managed_block`` alone when ``user_text`` is blank; the user text
+        followed by a blank line and the managed block when no table header
+        is found; otherwise the root-level prefix, a blank line, the managed
+        block, and then the remainder starting at the first table header.
+    """
+    if not user_text.strip():
+        return managed_block
+
+    lines = user_text.splitlines(keepends=True)
+    first_table_idx: Optional[int] = None
+    for idx, line in enumerate(lines):
+        if _looks_like_table_header(line.lstrip()):
+            first_table_idx = idx
+            break
+
+    if first_table_idx is None:
+        # No tables at all: everything is root-scoped, so appending is safe.
+        prefix = user_text.rstrip("\n")
+        return f"{prefix}\n\n{managed_block}" if prefix else managed_block
+
+    # Split at the first table header and normalize the blank lines at the seam.
+    prefix = "".join(lines[:first_table_idx]).rstrip("\n")
+    suffix = "".join(lines[first_table_idx:]).lstrip("\n")
+    if prefix:
+        return f"{prefix}\n\n{managed_block}\n{suffix}"
+    return f"{managed_block}\n{suffix}"
+
+
+def _strip_unmanaged_plugin_tables(toml_text: str) -> str:
+    """Drop every ``[plugins."<name>@<marketplace>"]`` table found in ``toml_text``.
+
+    Why: Codex itself writes these tables when the user runs
+    ``codex plugins enable`` directly (i.e. before Hermes' migrate has ever
+    touched the file). Migrate later re-emits the same plugins inside the
+    managed block from the live ``plugin/list`` RPC, and the duplicate
+    ``[plugins."X@Y"]`` headers make codex's strict TOML parser refuse the
+    file.
+
+    Safety: once migrate has run, Hermes owns the ``[plugins.*]`` namespace
+    and ``plugin/list`` is the source of truth for what is installed. The
+    caller is expected to invoke this only when ``plugin/list`` succeeded —
+    otherwise plugins installed via ``codex`` would be lost with no way to
+    re-emit them. Callers should also run ``_strip_existing_managed_block``
+    first, so the managed block is already gone and cannot be affected.
+
+    Behavior: a table header beginning with ``[plugins.`` opens a swallow
+    region that runs until the next table header that is not ``[plugins.``
+    (or end-of-file); every line in that region, header included, is
+    removed. All other lines are kept verbatim.
+    """
+    kept: list[str] = []
+    swallowing = False
+    for raw_line in toml_text.splitlines(keepends=True):
+        candidate = raw_line.lstrip()
+        # Only a genuine ``[...]`` header may change the swallow state.
+        # Multi-line array continuations like ``["nested"],`` also start
+        # with ``[`` after lstrip but are not headers; without this check
+        # they would end the swallow region mid-table and leak array
+        # fragments into the output.
+        if _looks_like_table_header(candidate):
+            swallowing = candidate.startswith("[plugins.")
+        if not swallowing:
+            kept.append(raw_line)
+    return "".join(kept)
+
+
+def _looks_like_table_header(stripped_line: str) -> bool:
+    """Decide whether an already left-stripped line is a TOML table header.
+
+    Accepted shapes are ``[name]`` and ``[[name]]`` (array-of-tables), each
+    optionally followed by a ``#`` comment. The line must start with ``[``,
+    the text before any ``#`` must end with ``]``, and no ``=`` may appear
+    up to and including the first ``]``. Those checks separate real headers
+    from multi-line array continuation lines, which also start with ``[``
+    after ``lstrip()``.
+    """
+    if not stripped_line.startswith("["):
+        return False
+    # Ignore a trailing comment so ``[features]  # note`` still qualifies.
+    before_comment, _, _ = stripped_line.partition("#")
+    head = before_comment.rstrip()
+    if not head.endswith("]"):
+        return False
+    # ``key = [x]`` carries an ``=`` ahead of the bracket; a header never does.
+    first_close = head.index("]")
+    through_first_close = head[: first_close + 1]
+    return "=" not in through_first_close
+
+
 def _strip_existing_managed_block(toml_text: str) -> str:
    """Remove any prior managed section so re-runs idempotently replace it.

@@ -431,6 +528,32 @@ def _query_codex_plugins(
    return out, None


+def _looks_like_test_tempdir(path: str) -> bool:
+    """Heuristically flag ``path`` as a pytest / transient temp directory.
+
+    pytest tempdirs live under ``pytest-of-<user>/pytest-<n>/`` (created via
+    ``tmp_path`` / ``tmp_path_factory``) and are reaped between sessions.
+    macOS routes ``/tmp`` through ``/private/var/folders/<…>/T``, which is
+    what pytest's tempdir factory uses by default. A HERMES_HOME pointing at
+    such a path, once burned into ``~/.codex/config.toml``, makes every
+    codex-routed hermes-tools call fail silently after the directory is GC'd.
+
+    The check deliberately over-matches: refusing a (very unlikely) real
+    ``~/.hermes`` symlink under ``/private/var/folders`` is much less
+    harmful than silently bricking codex's tool surface.
+
+    Matching is a case-insensitive substring test; an empty ``path``
+    returns ``False``.
+    """
+    if not path:
+        return False
+    lowered = path.lower()
+    markers = (
+        "pytest-of-",
+        "/pytest-",
+        "/tmp/pytest",
+        "/private/var/folders/",  # macOS tempdir root
+    )
+    for marker in markers:
+        if marker in lowered:
+            return True
+    return False
+
+
 def _build_hermes_tools_mcp_entry() -> dict:
    """Build the codex stdio-transport entry that launches Hermes' own
    tool surface as an MCP server. Codex's subprocess will call back into
@@ -443,9 +566,22 @@ def _build_hermes_tools_mcp_entry() -> dict:
    import sys

    env: dict[str, str] = {}
-    # HERMES_HOME passes through if set so the MCP subprocess sees the
-    # same config / auth / sessions DB as the parent CLI.
-    hermes_home = os.environ.get("HERMES_HOME")
+    # HERMES_HOME passes through IF SET so the MCP subprocess sees the same
+    # config / auth / sessions DB as the parent CLI. Read from os.environ
+    # (not get_hermes_home()) on purpose: when the env var is unset we want
+    # codex's subprocess to inherit whatever HERMES_HOME its launcher sets
+    # at runtime (systemd unit, gateway, kanban dispatcher, custom shell),
+    # rather than burning the migrate-time resolved default into config.toml
+    # — that would override the launcher's HERMES_HOME and pin the subprocess
+    # to the wrong profile.
+    #
+    # The pytest-tempdir guard below catches the issue #26250 Bug C scenario:
+    # a sibling test's monkeypatch.setenv("HERMES_HOME", tmp_path) would
+    # otherwise leak a transient pytest tempdir into the user's real
+    # ~/.codex/config.toml and silently brick codex once the tempdir is GC'd.
+    hermes_home = os.environ.get("HERMES_HOME") or ""
+    if hermes_home and _looks_like_test_tempdir(hermes_home):
+        hermes_home = ""
    if hermes_home:
        env["HERMES_HOME"] = hermes_home
    # PYTHONPATH passes through so a worktree-launched hermes finds the
@@ -533,10 +669,16 @@ def migrate(
    # Discover installed Codex curated plugins. Best-effort — never blocks
    # the migration if codex is unreachable or the RPC fails.
    plugins: list[dict] = []
+    plugin_query_succeeded = False
    if discover_plugins and not dry_run:
        plugins, plugin_err = _query_codex_plugins(codex_home=codex_home)
        if plugin_err:
            report.plugin_query_error = plugin_err
+        else:
+            # plugin/list returned authoritatively (even if the list is empty).
+            # That means we own [plugins.*] for this re-render and can safely
+            # strip any pre-existing tables outside the managed block.
+            plugin_query_succeeded = True
        for p in plugins:
            report.migrated_plugins.append(f"{p['name']}@{p['marketplace']}")

@@ -571,14 +713,15 @@ def migrate(
            report.errors.append(f"could not read {target}: {exc}")
            return report
        without_managed = _strip_existing_managed_block(existing)
-        # Ensure exactly one blank line between user content and managed block
-        if without_managed and not without_managed.endswith("\n"):
-            without_managed += "\n"
-        new_text = (
-            without_managed.rstrip("\n") + "\n\n" + managed_block
-            if without_managed.strip()
-            else managed_block
-        )
+        # Bug B: when plugin/list ran authoritatively, codex's own
+        # [plugins."<name>@<marketplace>"] tables outside our managed block
+        # would survive _strip_existing_managed_block and then collide with
+        # the entries we re-emit inside the managed block — producing
+        # duplicate-table-header parse errors on codex's next startup. Drop
+        # those pre-existing tables since plugin/list is the source of truth.
+        if plugin_query_succeeded:
+            without_managed = _strip_unmanaged_plugin_tables(without_managed)
+        new_text = _insert_managed_block_at_top_level(without_managed, managed_block)
    else:
        new_text = managed_block

@@ -198,6 +198,8 @@ COMMAND_REGISTRY: list[CommandDef] = [
               args_hint="[days]"),
    CommandDef("platforms", "Show gateway/messaging platform status", "Info",
               cli_only=True, aliases=("gateway",)),
+    CommandDef("platform", "Pause, resume, or list a failing gateway platform", "Info",
+               gateway_only=True, args_hint="<pause|resume|list> [name]"),
    CommandDef("copy", "Copy the last assistant response to clipboard", "Info",
               cli_only=True, args_hint="[number]"),
    CommandDef("paste", "Attach clipboard image from your clipboard", "Info",
@@ -199,9 +199,40 @@ def get_managed_update_command() -> Optional[str]:
    return None


+def detect_install_method(project_root: Optional[Path] = None) -> str:
+    """Detect how Hermes was installed: 'nixos', 'homebrew', 'git', or 'pip'.
+
+    Args:
+        project_root: Directory to probe for a git checkout. Defaults to the
+            grandparent directory of this file.
+
+    Returns:
+        The managed-system name (lowercased, spaces replaced by hyphens)
+        when ``get_managed_system()`` reports one; otherwise ``"git"`` when
+        ``project_root`` contains a ``.git`` entry, else ``"pip"``.
+    """
+    managed = get_managed_system()
+    if managed:
+        return managed.lower().replace(" ", "-")
+    if project_root is None:
+        project_root = Path(__file__).parent.parent.resolve()
+    # ``.git`` is a directory in a regular clone but a plain file in a
+    # linked worktree or submodule checkout, so test for existence rather
+    # than for a directory; otherwise those checkouts are misreported as
+    # "pip" and pointed at a pip upgrade command.
+    if (project_root / ".git").exists():
+        return "git"
+    return "pip"
+
+
+def recommended_update_command_for_method(method: str) -> str:
+    """Map an install method name to the shell command that updates Hermes.
+
+    ``"nixos"`` and ``"homebrew"`` map to fixed commands. ``"pip"`` yields
+    the ``uv pip`` form when a ``uv`` executable is on PATH and the plain
+    ``pip`` form otherwise. Every other value yields ``"hermes update"``.
+    """
+    fixed_commands = {
+        "nixos": "sudo nixos-rebuild switch",
+        "homebrew": "brew upgrade hermes-agent",
+    }
+    if method in fixed_commands:
+        return fixed_commands[method]
+    if method != "pip":
+        return "hermes update"
+
+    import shutil
+
+    if shutil.which("uv"):
+        return "uv pip install --upgrade hermes-agent"
+    return "pip install --upgrade hermes-agent"
+
+
 def recommended_update_command() -> str:
     """Return the best update command for the current installation."""
-    return get_managed_update_command() or "hermes update"
+    # A managed install's own update command takes precedence.
+    managed_cmd = get_managed_update_command()
+    if managed_cmd:
+        return managed_cmd
+    # Otherwise pick the command that matches the detected install method.
+    method = detect_install_method()
+    return recommended_update_command_for_method(method)


 def format_managed_message(action: str = "modify this Hermes installation") -> str:
@@ -401,7 +432,10 @@ def ensure_hermes_home():
    else:
        home.mkdir(parents=True, exist_ok=True)
        _secure_dir(home)
-        for subdir in ("cron", "sessions", "logs", "logs/curator", "memories"):
+        for subdir in (
+            "cron", "sessions", "logs", "logs/curator", "memories",
+            "pairing", "hooks", "image_cache", "audio_cache", "skills",
+        ):
            d = home / subdir
            d.mkdir(parents=True, exist_ok=True)
            _secure_dir(d)
@@ -1112,6 +1146,10 @@ DEFAULT_CONFIG = {
        "provider": "",    # e.g. "openrouter" (empty = inherit parent provider + credentials)
        "base_url": "",    # direct OpenAI-compatible endpoint for subagents
        "api_key": "",     # API key for delegation.base_url (falls back to OPENAI_API_KEY)
+        "api_mode": "",    # wire protocol for delegation.base_url: "chat_completions",
+                           # "codex_responses", or "anthropic_messages". Empty = auto-detect
+                           # from URL (e.g. /anthropic suffix → anthropic_messages). Set this
+                           # explicitly for non-standard endpoints the heuristic can't detect.
        # When delegate_task narrows child toolsets explicitly, preserve any
        # MCP toolsets the parent already has enabled. On by default so
        # narrowing (e.g. toolsets=["web","browser"]) expresses "I want these
@@ -1251,6 +1289,8 @@ DEFAULT_CONFIG = {
        "allowed_channels": "",        # If set, bot ONLY responds in these channel IDs (whitelist)
        "auto_thread": True,           # Auto-create threads on @mention in channels (like Slack)
        "thread_require_mention": False,  # If True, require @mention in threads too (multi-bot threads)
+        "history_backfill": True,         # If True, prepend recent channel scrollback when bot is triggered (recovers messages missed while require_mention gated them out)
+        "history_backfill_limit": 50,     # Max number of recent messages to scan when assembling the backfill block
        "reactions": True,             # Add 👀/✅/❌ reactions to messages during processing
        "channel_prompts": {},         # Per-channel ephemeral system prompts (forum parents apply to child threads)
        # Opt-in DM role-based auth (#12136). By default, DISCORD_ALLOWED_ROLES
@@ -1567,6 +1607,23 @@ DEFAULT_CONFIG = {
        "servers": {},
    },

+    # X (Twitter) Search via xAI's built-in x_search Responses tool.
+    # The tool registers when xAI credentials are available (SuperGrok
+    # OAuth or XAI_API_KEY) AND the x_search toolset is enabled in
+    # `hermes tools`. These settings tune the backing Responses API call.
+    "x_search": {
+        # xAI model used for the Responses call. grok-4.20-reasoning is
+        # the recommended default; any Grok model with x_search tool
+        # access works.
+        "model": "grok-4.20-reasoning",
+        # Request timeout in seconds (minimum 30). x_search can take
+        # 60-120s for complex queries — the default is generous.
+        "timeout_seconds": 180,
+        # Number of automatic retries on 5xx / ReadTimeout / ConnectionError.
+        # Each retry backs off (1.5x attempt seconds, capped at 5s).
+        "retries": 2,
+    },
+
    # Config schema version - bump this when adding new required fields
    "_config_version": 23,
 }
@@ -2136,22 +2193,6 @@ OPTIONAL_ENV_VARS = {
        "password": True,
        "category": "tool",
    },
-    "TINKER_API_KEY": {
-        "description": "Tinker API key for RL training",
-        "prompt": "Tinker API key",
-        "url": "https://tinker-console.thinkingmachines.ai/keys",
-        "tools": ["rl_start_training", "rl_check_status", "rl_stop_training"],
-        "password": True,
-        "category": "tool",
-    },
-    "WANDB_API_KEY": {
-        "description": "Weights & Biases API key for experiment tracking",
-        "prompt": "WandB API key",
-        "url": "https://wandb.ai/authorize",
-        "tools": ["rl_get_results", "rl_check_status"],
-        "password": True,
-        "category": "tool",
-    },
    "VOICE_TOOLS_OPENAI_KEY": {
        "description": "OpenAI API key for voice transcription (Whisper) and OpenAI TTS",
        "prompt": "OpenAI API Key (for Whisper STT + TTS)",
@@ -4988,8 +5029,7 @@ def set_config_value(key: str, value: str):
        'FAL_KEY', 'TELEGRAM_BOT_TOKEN', 'DISCORD_BOT_TOKEN',
        'TERMINAL_SSH_HOST', 'TERMINAL_SSH_USER', 'TERMINAL_SSH_KEY',
        'SUDO_PASSWORD', 'SLACK_BOT_TOKEN', 'SLACK_APP_TOKEN',
-        'GITHUB_TOKEN', 'HONCHO_API_KEY', 'WANDB_API_KEY',
-        'TINKER_API_KEY',
+        'GITHUB_TOKEN', 'HONCHO_API_KEY',
    ]
    
    if key.upper() in api_keys or key.upper().endswith(('_API_KEY', '_TOKEN')) or key.upper().startswith('TERMINAL_SSH'):
@@ -196,9 +196,15 @@ def cron_create(args):


 def cron_edit(args):
-    from cron.jobs import get_job
+    from cron.jobs import AmbiguousJobReference, resolve_job_ref

-    job = get_job(args.job_id)
+    try:
+        job = resolve_job_ref(args.job_id)
+    except AmbiguousJobReference as exc:
+        print(color(str(exc), Colors.RED))
+        for m in exc.matches:
+            print(f"  {m['id']}  (name: {m.get('name')!r})")
+        return 1
    if not job:
        print(color(f"Job not found: {args.job_id}", Colors.RED))
        return 1
@@ -0,0 +1,106 @@
+"""Lazy dependency bootstrapper for non-Python runtime deps.
+
+Detection and prompting live here in Python — not in install.sh — because:
+  1. shutil.which() works on every platform; install.sh needs bash.
+  2. Detection is instant; spawning bash for an "is node installed?" check is wasteful.
+  3. Python controls the UX (rich prompts, non-interactive fallback, TTY detection).
+
+install.sh is still the *installation* backend because it has 1900 lines of
+battle-tested OS detection and package-manager logic (apt/brew/pacman/dnf/
+zypper/Termux/…).  Reimplementing that in Python would be huge duplication.
+
+Deps that degrade gracefully (ripgrep → grep fallback, ffmpeg → skip conversion)
+don't need ensure_dependency wired in — only hard-fail sites do (TUI needs node,
+browser tool needs agent-browser).
+"""
+from __future__ import annotations
+
+import os
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+# Maps a dependency name to a zero-argument callable that returns True when
+# that dependency is detected on this machine.  The lambdas are evaluated
+# only when called, so they can refer to helper functions defined further
+# down in this module.
+_DEP_CHECKS = {
+    "node": lambda: shutil.which("node") is not None,
+    # "browser" is satisfied by any one of: agent-browser on PATH, a system
+    # Chrome/Chromium executable on PATH, or an agent-browser launcher under
+    # the Hermes home node_modules directory.
+    "browser": lambda: (
+        shutil.which("agent-browser") is not None
+        or _has_system_browser()
+        or _has_hermes_agent_browser()
+    ),
+    "ripgrep": lambda: shutil.which("rg") is not None,
+    "ffmpeg": lambda: shutil.which("ffmpeg") is not None,
+}
+
+# Human-readable labels for the same dependency names; ensure_dependency()
+# uses them in its install prompt and its "not installed" messages.
+_DEP_DESCRIPTIONS = {
+    "node": "Node.js (required for browser tools and TUI)",
+    "browser": "Browser engine (Chromium, for web browsing tools)",
+    "ripgrep": "ripgrep (fast file search)",
+    "ffmpeg": "ffmpeg (TTS voice messages)",
+}
+
+
+def _has_system_browser() -> bool:
+    for name in ("google-chrome", "google-chrome-stable", "chromium", "chromium-browser", "chrome"):
+        if shutil.which(name):
+            return True
+    return False
+
+
+def _has_hermes_agent_browser() -> bool:
+    from hermes_constants import get_hermes_home
+    return (get_hermes_home() / "node_modules" / ".bin" / "agent-browser").is_file()
+
+
+def _find_install_script(
+    package_dir: Path | None = None,
+    repo_root: Path | None = None,
+) -> Path | None:
+    """Locate install.sh — bundled in wheel or in git checkout."""
+    if package_dir is None:
+        package_dir = Path(__file__).parent
+    if repo_root is None:
+        repo_root = package_dir.parent
+
+    bundled = package_dir / "scripts" / "install.sh"
+    if bundled.is_file():
+        return bundled
+    repo = repo_root / "scripts" / "install.sh"
+    if repo.is_file():
+        return repo
+    return None
+
+
+def ensure_dependency(dep: str, interactive: bool = True) -> bool:
+    """Ensure a non-Python dependency is available. Returns True if available."""
+    check = _DEP_CHECKS.get(dep)
+    if check and check():
+        return True
+
+    script = _find_install_script()
+    if script is None:
+        if interactive:
+            desc = _DEP_DESCRIPTIONS.get(dep, dep)
+            print(f"  {desc} is not installed and install.sh was not found.")
+            print(f"  Install {dep} manually and try again.")
+        return False
+
+    if interactive and sys.stdin.isatty():
+        desc = _DEP_DESCRIPTIONS.get(dep, dep)
+        try:
+            reply = input(f"{desc} is not installed. Install now? [Y/n] ").strip().lower()
+        except (EOFError, KeyboardInterrupt):
+            return False
+        if reply not in ("", "y", "yes"):
+            return False
+
+    result = subprocess.run(
+        ["bash", str(script), "--ensure", dep],
+        env={**os.environ, "IS_INTERACTIVE": "false"},
+    )
+    if result.returncode != 0:
+        return False
+
+    if check:
+        return check()
+    return True
--- a/Show More
+++ b/Show More