ci: add path filters to Docker and test workflows, remove supply chain audit

- Docker build only triggers on main push (code/config changes) and releases, no longer on every PR - Tests skip markdown-only and docs-only changes - Remove supply-chain-audit workflow
fix(gateway): flush undelivered tail before segment reset to preserve streamed text (#8124 )
2026-04-20 01:39:39 +05:30 · 2026-04-19 01:43:04 -07:00 · 2026-04-19 01:42:35 -07:00 · 2026-04-19 01:16:34 -07:00 · 2026-04-19 00:28:25 -07:00 · 2026-04-19 00:28:25 -07:00
862 changed files with 130291 additions and 12638 deletions
@@ -24,6 +24,15 @@
 # Optional base URL override (default: Google's OpenAI-compatible endpoint)
 # GEMINI_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai

+# =============================================================================
+# LLM PROVIDER (Ollama Cloud)
+# =============================================================================
+# Cloud-hosted open models via Ollama's OpenAI-compatible endpoint.
+# Get your key at: https://ollama.com/settings
+# OLLAMA_API_KEY=your_ollama_key_here
+# Optional base URL override (default: https://ollama.com/v1)
+# OLLAMA_BASE_URL=https://ollama.com/v1
+
 # =============================================================================
 # LLM PROVIDER (z.ai / GLM)
 # =============================================================================
@@ -145,6 +154,10 @@
 # Only override here if you need to force a backend without touching config.yaml:
 # TERMINAL_ENV=local

+# Override the container runtime binary (e.g. to use Podman instead of Docker).
+# Useful on systems where Docker's storage driver is broken or unavailable.
+# HERMES_DOCKER_BINARY=/usr/local/bin/podman
+
 # Container images (for singularity/docker/modal backends)
 # TERMINAL_DOCKER_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
 # TERMINAL_SINGULARITY_IMAGE=docker://nikolaik/python-nodejs:python3.11-nodejs20
@@ -1 +1,5 @@
+watch_file pyproject.toml uv.lock
+watch_file ui-tui/package-lock.json ui-tui/package.json
+watch_file flake.nix flake.lock nix/devShell.nix nix/tui.nix nix/package.nix nix/python.nix
+
 use flake
@@ -1,11 +1,12 @@
 name: Deploy Site

 on:
+  release:
+    types: [published]
  push:
    branches: [main]
    paths:
      - 'website/**'
-      - 'landingpage/**'
      - 'skills/**'
      - 'optional-skills/**'
      - '.github/workflows/deploy-site.yml'
@@ -20,8 +21,14 @@ concurrency:
  cancel-in-progress: false

 jobs:
-  build-and-deploy:
-    # Only run on the upstream repository, not on forks
+  deploy-vercel:
+    if: github.event_name == 'release'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Trigger Vercel Deploy
+        run: curl -X POST "${{ secrets.VERCEL_DEPLOY_HOOK }}"
+
+  deploy-docs:
    if: github.repository == 'NousResearch/hermes-agent'
    runs-on: ubuntu-latest
    environment:
@@ -65,12 +72,7 @@ jobs:
      - name: Stage deployment
        run: |
          mkdir -p _site/docs
-          # Landing page at root
-          cp -r landingpage/* _site/
-          # Docusaurus at /docs/
          cp -r website/build/* _site/docs/
-          # CNAME so GitHub Pages keeps the custom domain between deploys
-          echo "hermes-agent.nousresearch.com" > _site/CNAME

      - name: Upload artifact
        uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa  # v3
@@ -3,8 +3,13 @@ name: Docker Build and Publish
 on:
  push:
    branches: [main]
-  pull_request:
-    branches: [main]
+    paths:
+      - '**/*.py'
+      - 'pyproject.toml'
+      - 'uv.lock'
+      - 'Dockerfile'
+      - 'docker/**'
+      - '.github/workflows/docker-publish.yml'
  release:
    types: [published]

@@ -1,248 +0,0 @@
-name: Supply Chain Audit
-
-on:
-  pull_request:
-    types: [opened, synchronize, reopened]
-
-permissions:
-  pull-requests: write
-  contents: read
-
-jobs:
-  scan:
-    name: Scan PR for supply chain risks
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
-        with:
-          fetch-depth: 0
-
-      - name: Scan diff for suspicious patterns
-        id: scan
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          set -euo pipefail
-
-          BASE="${{ github.event.pull_request.base.sha }}"
-          HEAD="${{ github.event.pull_request.head.sha }}"
-
-          # Get the full diff (added lines only)
-          DIFF=$(git diff "$BASE".."$HEAD" -- . ':!uv.lock' ':!*.lock' ':!package-lock.json' ':!yarn.lock' || true)
-
-          FINDINGS=""
-          CRITICAL=false
-
-          # --- .pth files (auto-execute on Python startup) ---
-          PTH_FILES=$(git diff --name-only "$BASE".."$HEAD" | grep '\.pth$' || true)
-          if [ -n "$PTH_FILES" ]; then
-            CRITICAL=true
-            FINDINGS="${FINDINGS}
-          ### 🚨 CRITICAL: .pth file added or modified
-          Python \`.pth\` files in \`site-packages/\` execute automatically when the interpreter starts — no import required. This is the exact mechanism used in the [litellm supply chain attack](https://github.com/BerriAI/litellm/issues/24512).
-
-          **Files:**
-          \`\`\`
-          ${PTH_FILES}
-          \`\`\`
-          "
-          fi
-
-          # --- base64 + exec/eval combo (the litellm attack pattern) ---
-          B64_EXEC_HITS=$(echo "$DIFF" | grep -n '^\+' | grep -iE 'base64\.(b64decode|decodebytes|urlsafe_b64decode)' | grep -iE 'exec\(|eval\(' | head -10 || true)
-          if [ -n "$B64_EXEC_HITS" ]; then
-            CRITICAL=true
-            FINDINGS="${FINDINGS}
-          ### 🚨 CRITICAL: base64 decode + exec/eval combo
-          This is the exact pattern used in the [litellm supply chain attack](https://github.com/BerriAI/litellm/issues/24512) — base64-decoded strings passed to exec/eval to hide credential-stealing payloads.
-
-          **Matches:**
-          \`\`\`
-          ${B64_EXEC_HITS}
-          \`\`\`
-          "
-          fi
-
-          # --- base64 decode/encode (alone — legitimate uses exist) ---
-          B64_HITS=$(echo "$DIFF" | grep -n '^\+' | grep -iE 'base64\.(b64decode|b64encode|decodebytes|encodebytes|urlsafe_b64decode)|atob\(|btoa\(|Buffer\.from\(.*base64' | head -20 || true)
-          if [ -n "$B64_HITS" ]; then
-            FINDINGS="${FINDINGS}
-          ### ⚠️ WARNING: base64 encoding/decoding detected
-          Base64 has legitimate uses (images, JWT, etc.) but is also commonly used to obfuscate malicious payloads. Verify the usage is appropriate.
-
-          **Matches (first 20):**
-          \`\`\`
-          ${B64_HITS}
-          \`\`\`
-          "
-          fi
-
-          # --- exec/eval with string arguments ---
-          EXEC_HITS=$(echo "$DIFF" | grep -n '^\+' | grep -E '(exec|eval)\s*\(' | grep -v '^\+\s*#' | grep -v 'test_\|mock\|assert\|# ' | head -20 || true)
-          if [ -n "$EXEC_HITS" ]; then
-            FINDINGS="${FINDINGS}
-          ### ⚠️ WARNING: exec() or eval() usage
-          Dynamic code execution can hide malicious behavior, especially when combined with base64 or network fetches.
-
-          **Matches (first 20):**
-          \`\`\`
-          ${EXEC_HITS}
-          \`\`\`
-          "
-          fi
-
-          # --- subprocess with encoded/obfuscated commands ---
-          PROC_HITS=$(echo "$DIFF" | grep -n '^\+' | grep -E 'subprocess\.(Popen|call|run)\s*\(' | grep -iE 'base64|decode|encode|\\x|chr\(' | head -10 || true)
-          if [ -n "$PROC_HITS" ]; then
-            CRITICAL=true
-            FINDINGS="${FINDINGS}
-          ### 🚨 CRITICAL: subprocess with encoded/obfuscated command
-          Subprocess calls with encoded arguments are a strong indicator of payload execution.
-
-          **Matches:**
-          \`\`\`
-          ${PROC_HITS}
-          \`\`\`
-          "
-          fi
-
-          # --- Network calls to non-standard domains ---
-          EXFIL_HITS=$(echo "$DIFF" | grep -n '^\+' | grep -iE 'requests\.(post|put)\(|httpx\.(post|put)\(|urllib\.request\.urlopen' | grep -v '^\+\s*#' | grep -v 'test_\|mock\|assert' | head -10 || true)
-          if [ -n "$EXFIL_HITS" ]; then
-            FINDINGS="${FINDINGS}
-          ### ⚠️ WARNING: Outbound network calls (POST/PUT)
-          Outbound POST/PUT requests in new code could be data exfiltration. Verify the destination URLs are legitimate.
-
-          **Matches (first 10):**
-          \`\`\`
-          ${EXFIL_HITS}
-          \`\`\`
-          "
-          fi
-
-          # --- setup.py / setup.cfg install hooks ---
-          SETUP_HITS=$(git diff --name-only "$BASE".."$HEAD" | grep -E '(setup\.py|setup\.cfg|__init__\.pth|sitecustomize\.py|usercustomize\.py)$' || true)
-          if [ -n "$SETUP_HITS" ]; then
-            FINDINGS="${FINDINGS}
-          ### ⚠️ WARNING: Install hook files modified
-          These files can execute code during package installation or interpreter startup.
-
-          **Files:**
-          \`\`\`
-          ${SETUP_HITS}
-          \`\`\`
-          "
-          fi
-
-          # --- Compile/marshal/pickle (code object injection) ---
-          MARSHAL_HITS=$(echo "$DIFF" | grep -n '^\+' | grep -iE 'marshal\.loads|pickle\.loads|compile\(' | grep -v '^\+\s*#' | grep -v 'test_\|re\.compile\|ast\.compile' | head -10 || true)
-          if [ -n "$MARSHAL_HITS" ]; then
-            FINDINGS="${FINDINGS}
-          ### ⚠️ WARNING: marshal/pickle/compile usage
-          These can deserialize or construct executable code objects.
-
-          **Matches:**
-          \`\`\`
-          ${MARSHAL_HITS}
-          \`\`\`
-          "
-          fi
-
-          # --- CI/CD workflow files modified ---
-          WORKFLOW_HITS=$(git diff --name-only "$BASE".."$HEAD" | grep -E '\.github/workflows/.*\.ya?ml$' || true)
-          if [ -n "$WORKFLOW_HITS" ]; then
-            FINDINGS="${FINDINGS}
-          ### ⚠️ WARNING: CI/CD workflow files modified
-          Changes to workflow files can alter build pipelines, inject steps, or modify permissions. Verify no unauthorized actions or secrets access were added.
-
-          **Files:**
-          \`\`\`
-          ${WORKFLOW_HITS}
-          \`\`\`
-          "
-          fi
-
-          # --- Dockerfile / container build files modified ---
-          DOCKER_HITS=$(git diff --name-only "$BASE".."$HEAD" | grep -iE '(Dockerfile|\.dockerignore|docker-compose)' || true)
-          if [ -n "$DOCKER_HITS" ]; then
-            FINDINGS="${FINDINGS}
-          ### ⚠️ WARNING: Container build files modified
-          Changes to Dockerfiles or compose files can alter base images, add build steps, or expose ports. Verify base image pins and build commands.
-
-          **Files:**
-          \`\`\`
-          ${DOCKER_HITS}
-          \`\`\`
-          "
-          fi
-
-          # --- Dependency manifest files modified ---
-          DEP_HITS=$(git diff --name-only "$BASE".."$HEAD" | grep -E '(pyproject\.toml|requirements.*\.txt|package\.json|Gemfile|go\.mod|Cargo\.toml)$' || true)
-          if [ -n "$DEP_HITS" ]; then
-            FINDINGS="${FINDINGS}
-          ### ⚠️ WARNING: Dependency manifest files modified
-          Changes to dependency files can introduce new packages or change version pins. Verify all dependency changes are intentional and from trusted sources.
-
-          **Files:**
-          \`\`\`
-          ${DEP_HITS}
-          \`\`\`
-          "
-          fi
-
-          # --- GitHub Actions version unpinning (mutable tags instead of SHAs) ---
-          ACTIONS_UNPIN=$(echo "$DIFF" | grep -n '^\+' | grep 'uses:' | grep -v '#' | grep -E '@v[0-9]' | head -10 || true)
-          if [ -n "$ACTIONS_UNPIN" ]; then
-            FINDINGS="${FINDINGS}
-          ### ⚠️ WARNING: GitHub Actions with mutable version tags
-          Actions should be pinned to full commit SHAs (not \`@v4\`, \`@v5\`). Mutable tags can be retargeted silently if a maintainer account is compromised.
-
-          **Matches:**
-          \`\`\`
-          ${ACTIONS_UNPIN}
-          \`\`\`
-          "
-          fi
-
-          # --- Output results ---
-          if [ -n "$FINDINGS" ]; then
-            echo "found=true" >> "$GITHUB_OUTPUT"
-            if [ "$CRITICAL" = true ]; then
-              echo "critical=true" >> "$GITHUB_OUTPUT"
-            else
-              echo "critical=false" >> "$GITHUB_OUTPUT"
-            fi
-            # Write findings to a file (multiline env vars are fragile)
-            echo "$FINDINGS" > /tmp/findings.md
-          else
-            echo "found=false" >> "$GITHUB_OUTPUT"
-            echo "critical=false" >> "$GITHUB_OUTPUT"
-          fi
-
-      - name: Post warning comment
-        if: steps.scan.outputs.found == 'true'
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          SEVERITY="⚠️ Supply Chain Risk Detected"
-          if [ "${{ steps.scan.outputs.critical }}" = "true" ]; then
-            SEVERITY="🚨 CRITICAL Supply Chain Risk Detected"
-          fi
-
-          BODY="## ${SEVERITY}
-
-          This PR contains patterns commonly associated with supply chain attacks. This does **not** mean the PR is malicious — but these patterns require careful human review before merging.
-
-          $(cat /tmp/findings.md)
-
-          ---
-          *Automated scan triggered by [supply-chain-audit](/.github/workflows/supply-chain-audit.yml). If this is a false positive, a maintainer can approve after manual review.*"
-
-          gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY" || echo "::warning::Could not post PR comment (expected for fork PRs — GITHUB_TOKEN is read-only)"
-
-      - name: Fail on critical findings
-        if: steps.scan.outputs.critical == 'true'
-        run: |
-          echo "::error::CRITICAL supply chain risk patterns detected in this PR. See the PR comment for details."
-          exit 1
@@ -3,8 +3,14 @@ name: Tests
 on:
  push:
    branches: [main]
+    paths-ignore:
+      - '**/*.md'
+      - 'docs/**'
  pull_request:
    branches: [main]
+    paths-ignore:
+      - '**/*.md'
+      - 'docs/**'

 permissions:
  contents: read
@@ -60,5 +60,6 @@ mini-swe-agent/

 # Nix
 .direnv/
+.nix-stamps/
 result
 website/static/api/skills-index.json
@@ -105,3 +105,4 @@ tesseracttars-creator <tesseracttars@gmail.com> <tesseracttars@gmail.com>
 xinbenlv <zzn+pa@zzn.im> <zzn+pa@zzn.im>
 SaulJWu <saul.jj.wu@gmail.com> <saul.jj.wu@gmail.com>
 angelos <angelos@oikos.lan.home.malaiwah.com> <angelos@oikos.lan.home.malaiwah.com>
+MestreY0d4-Uninter <241404605+MestreY0d4-Uninter@users.noreply.github.com> <MestreY0d4-Uninter@users.noreply.github.com>
@@ -13,7 +13,7 @@ source venv/bin/activate  # ALWAYS activate before running Python
 ```
 hermes-agent/
 ├── run_agent.py          # AIAgent class — core conversation loop
-├── model_tools.py        # Tool orchestration, _discover_tools(), handle_function_call()
+├── model_tools.py        # Tool orchestration, discover_builtin_tools(), handle_function_call()
 ├── toolsets.py           # Toolset definitions, _HERMES_CORE_TOOLS list
 ├── cli.py                # HermesCLI class — interactive CLI orchestrator
 ├── hermes_state.py       # SessionDB — SQLite session store (FTS5 search)
@@ -56,6 +56,19 @@ hermes-agent/
 │   ├── run.py            # Main loop, slash commands, message dispatch
 │   ├── session.py        # SessionStore — conversation persistence
 │   └── platforms/        # Adapters: telegram, discord, slack, whatsapp, homeassistant, signal, qqbot
+├── ui-tui/               # Ink (React) terminal UI — `hermes --tui`
+│   ├── src/entry.tsx        # TTY gate + render()
+│   ├── src/app.tsx          # Main state machine and UI
+│   ├── src/gatewayClient.ts # Child process + JSON-RPC bridge
+│   ├── src/app/             # Decomposed app logic (event handler, slash handler, stores, hooks)
+│   ├── src/components/      # Ink components (branding, markdown, prompts, pickers, etc.)
+│   ├── src/hooks/           # useCompletion, useInputHistory, useQueue, useVirtualHistory
+│   └── src/lib/             # Pure helpers (history, osc52, text, rpc, messages)
+├── tui_gateway/          # Python JSON-RPC backend for the TUI
+│   ├── entry.py             # stdio entrypoint
+│   ├── server.py            # RPC handlers and session logic
+│   ├── render.py            # Optional rich/ANSI bridge
+│   └── slash_worker.py      # Persistent HermesCLI subprocess for slash commands
 ├── acp_adapter/          # ACP server (VS Code / Zed / JetBrains integration)
 ├── cron/                 # Scheduler (jobs.py, scheduler.py)
 ├── environments/         # RL training environments (Atropos)
@@ -179,9 +192,62 @@ if canonical == "mycommand":

 ---

+## TUI Architecture (ui-tui + tui_gateway)
+
+The TUI is a full replacement for the classic (prompt_toolkit) CLI, activated via `hermes --tui` or `HERMES_TUI=1`.
+
+### Process Model
+
+```
+hermes --tui
+  └─ Node (Ink)  ──stdio JSON-RPC──  Python (tui_gateway)
+       │                                  └─ AIAgent + tools + sessions
+       └─ renders transcript, composer, prompts, activity
+```
+
+TypeScript owns the screen. Python owns sessions, tools, model calls, and slash command logic.
+
+### Transport
+
+Newline-delimited JSON-RPC over stdio. Requests from Ink, events from Python. See `tui_gateway/server.py` for the full method/event catalog.
+
+### Key Surfaces
+
+| Surface | Ink component | Gateway method |
+|---------|---------------|----------------|
+| Chat streaming | `app.tsx` + `messageLine.tsx` | `prompt.submit` → `message.delta/complete` |
+| Tool activity | `thinking.tsx` | `tool.start/progress/complete` |
+| Approvals | `prompts.tsx` | `approval.respond` ← `approval.request` |
+| Clarify/sudo/secret | `prompts.tsx`, `maskedPrompt.tsx` | `clarify/sudo/secret.respond` |
+| Session picker | `sessionPicker.tsx` | `session.list/resume` |
+| Slash commands | Local handler + fallthrough | `slash.exec` → `_SlashWorker`, `command.dispatch` |
+| Completions | `useCompletion` hook | `complete.slash`, `complete.path` |
+| Theming | `theme.ts` + `branding.tsx` | `gateway.ready` with skin data |
+
+### Slash Command Flow
+
+1. Built-in client commands (`/help`, `/quit`, `/clear`, `/resume`, `/copy`, `/paste`, etc.) handled locally in `app.tsx`
+2. Everything else → `slash.exec` (runs in persistent `_SlashWorker` subprocess) → `command.dispatch` fallback
+
+### Dev Commands
+
+```bash
+cd ui-tui
+npm install       # first time
+npm run dev       # watch mode (rebuilds hermes-ink + tsx --watch)
+npm start         # production
+npm run build     # full build (hermes-ink + tsc)
+npm run type-check # typecheck only (tsc --noEmit)
+npm run lint      # eslint
+npm run fmt       # prettier
+npm test          # vitest
+```
+
+---
+
 ## Adding New Tools

-Requires changes in **3 files**:
+Requires changes in **2 files**:

 **1. Create `tools/your_tool.py`:**
 ```python
@@ -204,9 +270,9 @@ registry.register(
 )
 ```

-**2. Add import** in `model_tools.py` `_discover_tools()` list.
+**2. Add to `toolsets.py`** — either `_HERMES_CORE_TOOLS` (all platforms) or a new toolset.

-**3. Add to `toolsets.py`** — either `_HERMES_CORE_TOOLS` (all platforms) or a new toolset.
+Auto-discovery: any `tools/*.py` file with a top-level `registry.register()` call is imported automatically — no manual import list to maintain.

 The registry handles schema collection, dispatch, availability checking, and error wrapping. All handlers MUST return a JSON string.

@@ -458,13 +524,45 @@ def profile_env(tmp_path, monkeypatch):

 ## Testing

+**ALWAYS use `scripts/run_tests.sh`** — do not call `pytest` directly. The script enforces
+hermetic environment parity with CI (unset credential vars, TZ=UTC, LANG=C.UTF-8,
+4 xdist workers matching GHA ubuntu-latest). Direct `pytest` on a 16+ core
+developer machine with API keys set diverges from CI in ways that have caused
+multiple "works locally, fails in CI" incidents (and the reverse).
+
 ```bash
-source venv/bin/activate
-python -m pytest tests/ -q          # Full suite (~3000 tests, ~3 min)
-python -m pytest tests/test_model_tools.py -q   # Toolset resolution
-python -m pytest tests/test_cli_init.py -q       # CLI config loading
-python -m pytest tests/gateway/ -q               # Gateway tests
-python -m pytest tests/tools/ -q                 # Tool-level tests
+scripts/run_tests.sh                                  # full suite, CI-parity
+scripts/run_tests.sh tests/gateway/                   # one directory
+scripts/run_tests.sh tests/agent/test_foo.py::test_x  # one test
+scripts/run_tests.sh -v --tb=long                     # pass-through pytest flags
 ```

+### Why the wrapper (and why the old "just call pytest" doesn't work)
+
+Five real sources of local-vs-CI drift the script closes:
+
+| | Without wrapper | With wrapper |
+|---|---|---|
+| Provider API keys | Whatever is in your env (auto-detects pool) | All `*_API_KEY`/`*_TOKEN`/etc. unset |
+| HOME / `~/.hermes/` | Your real config+auth.json | Temp dir per test |
+| Timezone | Local TZ (PDT etc.) | UTC |
+| Locale | Whatever is set | C.UTF-8 |
+| xdist workers | `-n auto` = all cores (20+ on a workstation) | `-n 4` matching CI |
+
+`tests/conftest.py` also enforces points 1-4 as an autouse fixture so ANY pytest
+invocation (including IDE integrations) gets hermetic behavior — but the wrapper
+is belt-and-suspenders.
+
+### Running without the wrapper (only if you must)
+
+If you can't use the wrapper (e.g. on Windows or inside an IDE that shells
+pytest directly), at minimum activate the venv and pass `-n 4`:
+
+```bash
+source venv/bin/activate
+python -m pytest tests/ -q -n 4
+```
+
+Worker count above 4 will surface test-ordering flakes that CI never sees.
+
 Always run the full suite before pushing changes.
@@ -21,26 +21,36 @@ RUN useradd -u 10000 -m -d /opt/data hermes
 COPY --chmod=0755 --from=gosu_source /gosu /usr/local/bin/
 COPY --chmod=0755 --from=uv_source /usr/local/bin/uv /usr/local/bin/uvx /usr/local/bin/

-COPY . /opt/hermes
 WORKDIR /opt/hermes

-# Install Node dependencies and Playwright as root (--with-deps needs apt)
+# ---------- Layer-cached dependency install ----------
+# Copy only package manifests first so npm install + Playwright are cached
+# unless the lockfiles themselves change.
+COPY package.json package-lock.json ./
+COPY scripts/whatsapp-bridge/package.json scripts/whatsapp-bridge/package-lock.json scripts/whatsapp-bridge/
+COPY web/package.json web/package-lock.json web/
+
 RUN npm install --prefer-offline --no-audit && \
    npx playwright install --with-deps chromium --only-shell && \
-    cd /opt/hermes/scripts/whatsapp-bridge && \
-    npm install --prefer-offline --no-audit && \
+    (cd scripts/whatsapp-bridge && npm install --prefer-offline --no-audit) && \
+    (cd web && npm install --prefer-offline --no-audit) && \
    npm cache clean --force

-# Hand ownership to hermes user, then install Python deps in a virtualenv
-RUN chown -R hermes:hermes /opt/hermes
-USER hermes
+# ---------- Source code ----------
+# .dockerignore excludes node_modules, so the installs above survive.
+COPY --chown=hermes:hermes . .

+# Build web dashboard (Vite outputs to hermes_cli/web_dist/)
+RUN cd web && npm run build
+
+# ---------- Python virtualenv ----------
+RUN chown hermes:hermes /opt/hermes
+USER hermes
 RUN uv venv && \
    uv pip install --no-cache-dir -e ".[all]"

-USER root
-RUN chmod +x /opt/hermes/docker/entrypoint.sh
-
+# ---------- Runtime ----------
+ENV HERMES_WEB_DIST=/opt/hermes/hermes_cli/web_dist
 ENV HERMES_HOME=/opt/data
 VOLUME [ "/opt/data" ]
 ENTRYPOINT [ "/opt/hermes/docker/entrypoint.sh" ]
@@ -13,7 +13,7 @@

 **The self-improving AI agent built by [Nous Research](https://nousresearch.com).** It's the only agent with a built-in learning loop — it creates skills from experience, improves them during use, nudges itself to persist knowledge, searches its own past conversations, and builds a deepening model of who you are across sessions. Run it on a $5 VPS, a GPU cluster, or serverless infrastructure that costs nearly nothing when idle. It's not tied to your laptop — talk to it from Telegram while it works on a cloud VM.

-Use any model you want — [Nous Portal](https://portal.nousresearch.com), [OpenRouter](https://openrouter.ai) (200+ models), [Xiaomi MiMo](https://platform.xiaomimimo.com), [z.ai/GLM](https://z.ai), [Kimi/Moonshot](https://platform.moonshot.ai), [MiniMax](https://www.minimax.io), [Hugging Face](https://huggingface.co), OpenAI, or your own endpoint. Switch with `hermes model` — no code changes, no lock-in.
+Use any model you want — [Nous Portal](https://portal.nousresearch.com), [OpenRouter](https://openrouter.ai) (200+ models), [NVIDIA NIM](https://build.nvidia.com) (Nemotron), [Xiaomi MiMo](https://platform.xiaomimimo.com), [z.ai/GLM](https://z.ai), [Kimi/Moonshot](https://platform.moonshot.ai), [MiniMax](https://www.minimax.io), [Hugging Face](https://huggingface.co), OpenAI, or your own endpoint. Switch with `hermes model` — no code changes, no lock-in.

 <table>
 <tr><td><b>A real terminal interface</b></td><td>Full TUI with multiline editing, slash-command autocomplete, conversation history, interrupt-and-redirect, and streaming tool output.</td></tr>
@@ -141,11 +141,18 @@ See `hermes claw migrate --help` for all options, or use the `openclaw-migration

 We welcome contributions! See the [Contributing Guide](https://hermes-agent.nousresearch.com/docs/developer-guide/contributing) for development setup, code style, and PR process.

-Quick start for contributors:
+Quick start for contributors — clone and go with `setup-hermes.sh`:

 ```bash
 git clone https://github.com/NousResearch/hermes-agent.git
 cd hermes-agent
+./setup-hermes.sh     # installs uv, creates venv, installs .[all], symlinks ~/.local/bin/hermes
+./hermes              # auto-detects the venv, no need to `source` first
+```
+
+Manual path (equivalent to the above):
+
+```bash
 curl -LsSf https://astral.sh/uv/install.sh | sh
 uv venv venv --python 3.11
 source venv/bin/activate
@@ -0,0 +1,27 @@
+# Hermes Agent v0.10.0 (v2026.4.16)
+
+**Release Date:** April 16, 2026
+
+> The Tool Gateway release — paid Nous Portal subscribers can now use web search, image generation, text-to-speech, and browser automation through their existing subscription with zero additional API keys.
+
+---
+
+## ✨ Highlights
+
+- **Nous Tool Gateway** — Paid [Nous Portal](https://portal.nousresearch.com) subscribers now get automatic access to **web search** (Firecrawl), **image generation** (FAL / FLUX 2 Pro), **text-to-speech** (OpenAI TTS), and **browser automation** (Browser Use) through their existing subscription. No separate API keys needed — just run `hermes model`, select Nous Portal, and pick which tools to enable. Per-tool opt-in via `use_gateway` config, full integration with `hermes tools` and `hermes status`, and the runtime correctly prefers the gateway even when direct API keys exist. Replaces the old hidden `HERMES_ENABLE_NOUS_MANAGED_TOOLS` env var with clean subscription-based detection. ([#11206](https://github.com/NousResearch/hermes-agent/pull/11206), based on work by @jquesnelle; docs: [#11208](https://github.com/NousResearch/hermes-agent/pull/11208))
+
+---
+
+## 🐛 Bug Fixes & Improvements
+
+This release includes 180+ commits with numerous bug fixes, platform improvements, and reliability enhancements across the agent core, gateway, CLI, and tool system. Full details will be published in the v0.11.0 changelog.
+
+---
+
+## 👥 Contributors
+
+- **@jquesnelle** (emozilla) — Original Tool Gateway implementation ([#10799](https://github.com/NousResearch/hermes-agent/pull/10799)), salvaged and shipped in this release
+
+---
+
+**Full Changelog**: [v2026.4.13...v2026.4.16](https://github.com/NousResearch/hermes-agent/compare/v2026.4.13...v2026.4.16)
@@ -0,0 +1,84 @@
+# Hermes Agent Security Policy
+
+This document outlines the security protocols, trust model, and deployment hardening guidelines for the **Hermes Agent** project.
+
+## 1. Vulnerability Reporting
+
+Hermes Agent does **not** operate a bug bounty program. Security issues should be reported via [GitHub Security Advisories (GHSA)](https://github.com/NousResearch/hermes-agent/security/advisories/new) or by emailing **security@nousresearch.com**. Do not open public issues for security vulnerabilities.
+
+### Required Submission Details
+- **Title & Severity:** Concise description and CVSS score/rating.
+- **Affected Component:** Exact file path and line range (e.g., `tools/approval.py:120-145`).
+- **Environment:** Output of `hermes version`, commit SHA, OS, and Python version.
+- **Reproduction:** Step-by-step Proof-of-Concept (PoC) against `main` or the latest release.
+- **Impact:** Explanation of what trust boundary was crossed.
+
+---
+
+## 2. Trust Model
+
+The core assumption is that Hermes is a **personal agent** with one trusted operator.
+
+### Operator & Session Trust
+- **Single Tenant:** The system protects the operator from LLM actions, not from malicious co-tenants. Multi-user isolation must happen at the OS/host level.
+- **Gateway Security:** Authorized callers (Telegram, Discord, Slack, etc.) receive equal trust. Session keys are used for routing, not as authorization boundaries.
+- **Execution:** Defaults to `terminal.backend: local` (direct host execution). Container isolation (Docker, Modal, Daytona) is opt-in for sandboxing.
+
+### Dangerous Command Approval
+The approval system (`tools/approval.py`) is a core security boundary. Terminal commands, file operations, and other potentially destructive actions are gated behind explicit user confirmation before execution. The approval mode is configurable via `approvals.mode` in `config.yaml`:
+- `"on"` (default) — prompts the user to approve dangerous commands.
+- `"auto"` — auto-approves after a configurable delay.
+- `"off"` — disables the gate entirely (break-glass; see Section 3).
+
+### Output Redaction
+`agent/redact.py` strips secret-like patterns (API keys, tokens, credentials) from all display output before it reaches the terminal or gateway platform. This prevents accidental credential leakage in chat logs, tool previews, and response text. Redaction operates on the display layer only — underlying values remain intact for internal agent operations.
+
+### Skills vs. MCP Servers
+- **Installed Skills:** High trust. Equivalent to local host code; skills can read environment variables and run arbitrary commands.
+- **MCP Servers:** Lower trust. MCP subprocesses receive a filtered environment (`_build_safe_env()` in `tools/mcp_tool.py`) — only safe baseline variables (`PATH`, `HOME`, `XDG_*`) plus variables explicitly declared in the server's `env` config block are passed through. Host credentials are stripped by default. Additionally, packages invoked via `npx`/`uvx` are checked against the OSV malware database before spawning.
+
+### Code Execution Sandbox
+The `execute_code` tool (`tools/code_execution_tool.py`) runs LLM-generated Python scripts in a child process with API keys and tokens stripped from the environment to prevent credential exfiltration. Only environment variables explicitly declared by loaded skills (via `env_passthrough`) or by the user in `config.yaml` (`terminal.env_passthrough`) are passed through. The child accesses Hermes tools via RPC, not direct API calls.
+
+### Subagents
+- **No recursive delegation:** The `delegate_task` tool is disabled for child agents.
+- **Depth limit:** `MAX_DEPTH = 2` — parent (depth 0) can spawn a child (depth 1); grandchildren are rejected.
+- **Memory isolation:** Subagents run with `skip_memory=True` and do not have access to the parent's persistent memory provider. The parent receives only the task prompt and final response as an observation.
+
+---
+
+## 3. Out of Scope (Non-Vulnerabilities)
+
+The following scenarios are **not** considered security breaches:
+- **Prompt Injection:** Unless it results in a concrete bypass of the approval system, toolset restrictions, or container sandbox.
+- **Public Exposure:** Deploying the gateway to the public internet without external authentication or network protection.
+- **Trusted State Access:** Reports that require pre-existing write access to `~/.hermes/`, `.env`, or `config.yaml` (these are operator-owned files).
+- **Default Behavior:** Host-level command execution when `terminal.backend` is set to `local` — this is the documented default, not a vulnerability.
+- **Configuration Trade-offs:** Intentional break-glass settings such as `approvals.mode: "off"` or `terminal.backend: local` in production.
+- **Tool-level read/access restrictions:** The agent has unrestricted shell access via the `terminal` tool by design. Reports that a specific tool (e.g., `read_file`) can access a resource are not vulnerabilities if the same access is available through `terminal`. Tool-level deny lists only constitute a meaningful security boundary when paired with equivalent restrictions on the terminal side (as with write operations, where `WRITE_DENIED_PATHS` is paired with the dangerous command approval system).
+
+---
+
+## 4. Deployment Hardening & Best Practices
+
+### Filesystem & Network
+- **Production sandboxing:** Use container backends (`docker`, `modal`, `daytona`) instead of `local` for untrusted workloads.
+- **File permissions:** Run as non-root (the Docker image uses UID 10000); protect credentials with `chmod 600 ~/.hermes/.env` on local installs.
+- **Network exposure:** Do not expose the gateway or API server to the public internet without VPN, Tailscale, or firewall protection. SSRF protection is enabled by default across all gateway platform adapters (Telegram, Discord, Slack, Matrix, Mattermost, etc.) with redirect validation. Note: the local terminal backend does not apply SSRF filtering, as it operates within the trusted operator's environment.
+
+### Skills & Supply Chain
+- **Skill installation:** Review Skills Guard reports (`tools/skills_guard.py`) before installing third-party skills. The audit log at `~/.hermes/skills/.hub/audit.log` tracks every install and removal.
+- **MCP safety:** OSV malware checking runs automatically for `npx`/`uvx` packages before MCP server processes are spawned.
+- **CI/CD:** GitHub Actions are pinned to full commit SHAs. The `supply-chain-audit.yml` workflow blocks PRs containing `.pth` files or suspicious `base64`+`exec` patterns.
+
+### Credential Storage
+- API keys and tokens belong exclusively in `~/.hermes/.env` — never in `config.yaml` or checked into version control.
+- The credential pool system (`agent/credential_pool.py`) handles key rotation and fallback. Credentials are resolved from environment variables, not stored in plaintext databases.
+
+---
+
+## 5. Disclosure Process
+
+- **Coordinated Disclosure:** 90-day window or until a fix is released, whichever comes first.
+- **Communication:** All updates occur via the GHSA thread or email correspondence with security@nousresearch.com.
+- **Credits:** Reporters are credited in release notes unless anonymity is requested.
@@ -49,6 +49,7 @@ def make_tool_progress_cb(
    session_id: str,
    loop: asyncio.AbstractEventLoop,
    tool_call_ids: Dict[str, Deque[str]],
+    tool_call_meta: Dict[str, Dict[str, Any]],
 ) -> Callable:
    """Create a ``tool_progress_callback`` for AIAgent.

@@ -84,6 +85,16 @@ def make_tool_progress_cb(
            tool_call_ids[name] = queue
        queue.append(tc_id)

+        snapshot = None
+        if name in {"write_file", "patch", "skill_manage"}:
+            try:
+                from agent.display import capture_local_edit_snapshot
+
+                snapshot = capture_local_edit_snapshot(name, args)
+            except Exception:
+                logger.debug("Failed to capture ACP edit snapshot for %s", name, exc_info=True)
+        tool_call_meta[tc_id] = {"args": args, "snapshot": snapshot}
+
        update = build_tool_start(tc_id, name, args)
        _send_update(conn, session_id, loop, update)

@@ -119,6 +130,7 @@ def make_step_cb(
    session_id: str,
    loop: asyncio.AbstractEventLoop,
    tool_call_ids: Dict[str, Deque[str]],
+    tool_call_meta: Dict[str, Dict[str, Any]],
 ) -> Callable:
    """Create a ``step_callback`` for AIAgent.

@@ -132,10 +144,12 @@ def make_step_cb(
            for tool_info in prev_tools:
                tool_name = None
                result = None
+                function_args = None

                if isinstance(tool_info, dict):
                    tool_name = tool_info.get("name") or tool_info.get("function_name")
                    result = tool_info.get("result") or tool_info.get("output")
+                    function_args = tool_info.get("arguments") or tool_info.get("args")
                elif isinstance(tool_info, str):
                    tool_name = tool_info

@@ -145,8 +159,13 @@ def make_step_cb(
                    tool_call_ids[tool_name] = queue
                if tool_name and queue:
                    tc_id = queue.popleft()
+                    meta = tool_call_meta.pop(tc_id, {})
                    update = build_tool_complete(
-                        tc_id, tool_name, result=str(result) if result is not None else None
+                        tc_id,
+                        tool_name,
+                        result=str(result) if result is not None else None,
+                        function_args=function_args or meta.get("args"),
+                        snapshot=meta.get("snapshot"),
                    )
                    _send_update(conn, session_id, loop, update)
                    if not queue:
@@ -26,6 +26,7 @@ from acp.schema import (
    McpServerHttp,
    McpServerSse,
    McpServerStdio,
+    ModelInfo,
    NewSessionResponse,
    PromptResponse,
    ResumeSessionResponse,
@@ -36,6 +37,7 @@ from acp.schema import (
    SessionCapabilities,
    SessionForkCapabilities,
    SessionListCapabilities,
+    SessionModelState,
    SessionResumeCapabilities,
    SessionInfo,
    TextContentBlock,
@@ -147,6 +149,98 @@ class HermesACPAgent(acp.Agent):
        self._conn = conn
        logger.info("ACP client connected")

+    @staticmethod
+    def _encode_model_choice(provider: str | None, model: str | None) -> str:
+        """Encode a model selection so ACP clients can keep provider context."""
+        raw_model = str(model or "").strip()
+        if not raw_model:
+            return ""
+        raw_provider = str(provider or "").strip().lower()
+        if not raw_provider:
+            return raw_model
+        return f"{raw_provider}:{raw_model}"
+
+    def _build_model_state(self, state: SessionState) -> SessionModelState | None:
+        """Return the ACP model selector payload for editors like Zed."""
+        model = str(state.model or getattr(state.agent, "model", "") or "").strip()
+        provider = getattr(state.agent, "provider", None) or detect_provider() or "openrouter"
+
+        try:
+            from hermes_cli.models import curated_models_for_provider, normalize_provider, provider_label
+
+            normalized_provider = normalize_provider(provider)
+            provider_name = provider_label(normalized_provider)
+            available_models: list[ModelInfo] = []
+            seen_ids: set[str] = set()
+
+            for model_id, description in curated_models_for_provider(normalized_provider):
+                rendered_model = str(model_id or "").strip()
+                if not rendered_model:
+                    continue
+                choice_id = self._encode_model_choice(normalized_provider, rendered_model)
+                if choice_id in seen_ids:
+                    continue
+                desc_parts = [f"Provider: {provider_name}"]
+                if description:
+                    desc_parts.append(str(description).strip())
+                if rendered_model == model:
+                    desc_parts.append("current")
+                available_models.append(
+                    ModelInfo(
+                        model_id=choice_id,
+                        name=rendered_model,
+                        description=" • ".join(part for part in desc_parts if part),
+                    )
+                )
+                seen_ids.add(choice_id)
+
+            current_model_id = self._encode_model_choice(normalized_provider, model)
+            if current_model_id and current_model_id not in seen_ids:
+                available_models.insert(
+                    0,
+                    ModelInfo(
+                        model_id=current_model_id,
+                        name=model,
+                        description=f"Provider: {provider_name} • current",
+                    ),
+                )
+
+            if available_models:
+                return SessionModelState(
+                    available_models=available_models,
+                    current_model_id=current_model_id or available_models[0].model_id,
+                )
+        except Exception:
+            logger.debug("Could not build ACP model state", exc_info=True)
+
+        if not model:
+            return None
+
+        fallback_choice = self._encode_model_choice(provider, model)
+        return SessionModelState(
+            available_models=[ModelInfo(model_id=fallback_choice, name=model)],
+            current_model_id=fallback_choice,
+        )
+
+    @staticmethod
+    def _resolve_model_selection(raw_model: str, current_provider: str) -> tuple[str, str]:
+        """Resolve ``provider:model`` input into the provider and normalized model id."""
+        target_provider = current_provider
+        new_model = raw_model.strip()
+
+        try:
+            from hermes_cli.models import detect_provider_for_model, parse_model_input
+
+            target_provider, new_model = parse_model_input(new_model, current_provider)
+            if target_provider == current_provider:
+                detected = detect_provider_for_model(new_model, current_provider)
+                if detected:
+                    target_provider, new_model = detected
+        except Exception:
+            logger.debug("Provider detection failed, using model as-is", exc_info=True)
+
+        return target_provider, new_model
+
    async def _register_session_mcp_servers(
        self,
        state: SessionState,
@@ -273,7 +367,10 @@ class HermesACPAgent(acp.Agent):
        await self._register_session_mcp_servers(state, mcp_servers)
        logger.info("New session %s (cwd=%s)", state.session_id, cwd)
        self._schedule_available_commands_update(state.session_id)
-        return NewSessionResponse(session_id=state.session_id)
+        return NewSessionResponse(
+            session_id=state.session_id,
+            models=self._build_model_state(state),
+        )

    async def load_session(
        self,
@@ -289,7 +386,7 @@ class HermesACPAgent(acp.Agent):
        await self._register_session_mcp_servers(state, mcp_servers)
        logger.info("Loaded session %s", session_id)
        self._schedule_available_commands_update(session_id)
-        return LoadSessionResponse()
+        return LoadSessionResponse(models=self._build_model_state(state))

    async def resume_session(
        self,
@@ -305,7 +402,7 @@ class HermesACPAgent(acp.Agent):
        await self._register_session_mcp_servers(state, mcp_servers)
        logger.info("Resumed session %s", state.session_id)
        self._schedule_available_commands_update(state.session_id)
-        return ResumeSessionResponse()
+        return ResumeSessionResponse(models=self._build_model_state(state))

    async def cancel(self, session_id: str, **kwargs: Any) -> None:
        state = self.session_manager.get_session(session_id)
@@ -340,11 +437,20 @@ class HermesACPAgent(acp.Agent):
        cwd: str | None = None,
        **kwargs: Any,
    ) -> ListSessionsResponse:
-        infos = self.session_manager.list_sessions()
-        sessions = [
-            SessionInfo(session_id=s["session_id"], cwd=s["cwd"])
-            for s in infos
-        ]
+        infos = self.session_manager.list_sessions(cwd=cwd)
+        sessions = []
+        for s in infos:
+            updated_at = s.get("updated_at")
+            if updated_at is not None and not isinstance(updated_at, str):
+                updated_at = str(updated_at)
+            sessions.append(
+                SessionInfo(
+                    session_id=s["session_id"],
+                    cwd=s["cwd"],
+                    title=s.get("title"),
+                    updated_at=updated_at,
+                )
+            )
        return ListSessionsResponse(sessions=sessions)

    # ---- Prompt (core) ------------------------------------------------------
@@ -389,12 +495,13 @@ class HermesACPAgent(acp.Agent):
            state.cancel_event.clear()

        tool_call_ids: dict[str, Deque[str]] = defaultdict(deque)
+        tool_call_meta: dict[str, dict[str, Any]] = {}
        previous_approval_cb = None

        if conn:
-            tool_progress_cb = make_tool_progress_cb(conn, session_id, loop, tool_call_ids)
+            tool_progress_cb = make_tool_progress_cb(conn, session_id, loop, tool_call_ids, tool_call_meta)
            thinking_cb = make_thinking_cb(conn, session_id, loop)
-            step_cb = make_step_cb(conn, session_id, loop, tool_call_ids)
+            step_cb = make_step_cb(conn, session_id, loop, tool_call_ids, tool_call_meta)
            message_cb = make_message_cb(conn, session_id, loop)
            approval_cb = make_approval_callback(conn.request_permission, loop, session_id)
        else:
@@ -449,6 +556,19 @@ class HermesACPAgent(acp.Agent):
            self.session_manager.save_session(session_id)

        final_response = result.get("final_response", "")
+        if final_response:
+            try:
+                from agent.title_generator import maybe_auto_title
+
+                maybe_auto_title(
+                    self.session_manager._get_db(),
+                    session_id,
+                    user_text,
+                    final_response,
+                    state.history,
+                )
+            except Exception:
+                logger.debug("Failed to auto-title ACP session %s", session_id, exc_info=True)
        if final_response and conn:
            update = acp.update_agent_message_text(final_response)
            await conn.session_update(session_id, update)
@@ -556,27 +676,15 @@ class HermesACPAgent(acp.Agent):
            provider = getattr(state.agent, "provider", None) or "auto"
            return f"Current model: {model}\nProvider: {provider}"

-        new_model = args.strip()
-        target_provider = None
        current_provider = getattr(state.agent, "provider", None) or "openrouter"
-
-        # Auto-detect provider for the requested model
-        try:
-            from hermes_cli.models import parse_model_input, detect_provider_for_model
-            target_provider, new_model = parse_model_input(new_model, current_provider)
-            if target_provider == current_provider:
-                detected = detect_provider_for_model(new_model, current_provider)
-                if detected:
-                    target_provider, new_model = detected
-        except Exception:
-            logger.debug("Provider detection failed, using model as-is", exc_info=True)
+        target_provider, new_model = self._resolve_model_selection(args, current_provider)

        state.model = new_model
        state.agent = self.session_manager._make_agent(
            session_id=state.session_id,
            cwd=state.cwd,
            model=new_model,
-            requested_provider=target_provider or current_provider,
+            requested_provider=target_provider,
        )
        self.session_manager.save_session(state.session_id)
        provider_label = getattr(state.agent, "provider", None) or target_provider or current_provider
@@ -678,20 +786,30 @@ class HermesACPAgent(acp.Agent):
        """Switch the model for a session (called by ACP protocol)."""
        state = self.session_manager.get_session(session_id)
        if state:
-            state.model = model_id
            current_provider = getattr(state.agent, "provider", None)
-            current_base_url = getattr(state.agent, "base_url", None)
-            current_api_mode = getattr(state.agent, "api_mode", None)
+            requested_provider, resolved_model = self._resolve_model_selection(
+                model_id,
+                current_provider or "openrouter",
+            )
+            state.model = resolved_model
+            provider_changed = bool(current_provider and requested_provider != current_provider)
+            current_base_url = None if provider_changed else getattr(state.agent, "base_url", None)
+            current_api_mode = None if provider_changed else getattr(state.agent, "api_mode", None)
            state.agent = self.session_manager._make_agent(
                session_id=session_id,
                cwd=state.cwd,
-                model=model_id,
-                requested_provider=current_provider,
+                model=resolved_model,
+                requested_provider=requested_provider,
                base_url=current_base_url,
                api_mode=current_api_mode,
            )
            self.session_manager.save_session(session_id)
-            logger.info("Session %s: model switched to %s", session_id, model_id)
+            logger.info(
+                "Session %s: model switched to %s via provider %s",
+                session_id,
+                resolved_model,
+                requested_provider,
+            )
            return SetSessionModelResponse()
        logger.warning("Session %s: model switch requested for missing session", session_id)
        return None
@@ -13,8 +13,12 @@ from hermes_constants import get_hermes_home
 import copy
 import json
 import logging
+import os
+import re
 import sys
+import time
 import uuid
+from datetime import datetime, timezone
 from dataclasses import dataclass, field
 from threading import Lock
 from typing import Any, Dict, List, Optional
@@ -22,6 +26,64 @@ from typing import Any, Dict, List, Optional
 logger = logging.getLogger(__name__)


+def _normalize_cwd_for_compare(cwd: str | None) -> str:
+    raw = str(cwd or ".").strip()
+    if not raw:
+        raw = "."
+    expanded = os.path.expanduser(raw)
+
+    # Normalize Windows drive paths into the equivalent WSL mount form so
+    # ACP history filters match the same workspace across Windows and WSL.
+    match = re.match(r"^([A-Za-z]):[\\/](.*)$", expanded)
+    if match:
+        drive = match.group(1).lower()
+        tail = match.group(2).replace("\\", "/")
+        expanded = f"/mnt/{drive}/{tail}"
+    elif re.match(r"^/mnt/[A-Za-z]/", expanded):
+        expanded = f"/mnt/{expanded[5].lower()}/{expanded[7:]}"
+
+    return os.path.normpath(expanded)
+
+
+def _build_session_title(title: Any, preview: Any, cwd: str | None) -> str:
+    explicit = str(title or "").strip()
+    if explicit:
+        return explicit
+    preview_text = str(preview or "").strip()
+    if preview_text:
+        return preview_text
+    leaf = os.path.basename(str(cwd or "").rstrip("/\\"))
+    return leaf or "New thread"
+
+
+def _format_updated_at(value: Any) -> str | None:
+    if value is None:
+        return None
+    if isinstance(value, str) and value.strip():
+        return value
+    try:
+        return datetime.fromtimestamp(float(value), tz=timezone.utc).isoformat()
+    except Exception:
+        return None
+
+
+def _updated_at_sort_key(value: Any) -> float:
+    if value is None:
+        return float("-inf")
+    if isinstance(value, (int, float)):
+        return float(value)
+    raw = str(value).strip()
+    if not raw:
+        return float("-inf")
+    try:
+        return datetime.fromisoformat(raw.replace("Z", "+00:00")).timestamp()
+    except Exception:
+        try:
+            return float(raw)
+        except Exception:
+            return float("-inf")
+
+
 def _acp_stderr_print(*args, **kwargs) -> None:
    """Best-effort human-readable output sink for ACP stdio sessions.

@@ -162,47 +224,78 @@ class SessionManager:
        logger.info("Forked ACP session %s -> %s", session_id, new_id)
        return state

-    def list_sessions(self) -> List[Dict[str, Any]]:
+    def list_sessions(self, cwd: str | None = None) -> List[Dict[str, Any]]:
        """Return lightweight info dicts for all sessions (memory + database)."""
+        normalized_cwd = _normalize_cwd_for_compare(cwd) if cwd else None
+        db = self._get_db()
+        persisted_rows: dict[str, dict[str, Any]] = {}
+
+        if db is not None:
+            try:
+                for row in db.list_sessions_rich(source="acp", limit=1000):
+                    persisted_rows[str(row["id"])] = dict(row)
+            except Exception:
+                logger.debug("Failed to load ACP sessions from DB", exc_info=True)
+
        # Collect in-memory sessions first.
        with self._lock:
            seen_ids = set(self._sessions.keys())
-            results = [
-                {
-                    "session_id": s.session_id,
-                    "cwd": s.cwd,
-                    "model": s.model,
-                    "history_len": len(s.history),
-                }
-                for s in self._sessions.values()
-            ]
+            results = []
+            for s in self._sessions.values():
+                history_len = len(s.history)
+                if history_len <= 0:
+                    continue
+                if normalized_cwd and _normalize_cwd_for_compare(s.cwd) != normalized_cwd:
+                    continue
+                persisted = persisted_rows.get(s.session_id, {})
+                preview = next(
+                    (
+                        str(msg.get("content") or "").strip()
+                        for msg in s.history
+                        if msg.get("role") == "user" and str(msg.get("content") or "").strip()
+                    ),
+                    persisted.get("preview") or "",
+                )
+                results.append(
+                    {
+                        "session_id": s.session_id,
+                        "cwd": s.cwd,
+                        "model": s.model,
+                        "history_len": history_len,
+                        "title": _build_session_title(persisted.get("title"), preview, s.cwd),
+                        "updated_at": _format_updated_at(
+                            persisted.get("last_active") or persisted.get("started_at") or time.time()
+                        ),
+                    }
+                )

        # Merge any persisted sessions not currently in memory.
-        db = self._get_db()
-        if db is not None:
-            try:
-                rows = db.search_sessions(source="acp", limit=1000)
-                for row in rows:
-                    sid = row["id"]
-                    if sid in seen_ids:
-                        continue
-                    # Extract cwd from model_config JSON.
-                    cwd = "."
-                    mc = row.get("model_config")
-                    if mc:
-                        try:
-                            cwd = json.loads(mc).get("cwd", ".")
-                        except (json.JSONDecodeError, TypeError):
-                            pass
-                    results.append({
-                        "session_id": sid,
-                        "cwd": cwd,
-                        "model": row.get("model") or "",
-                        "history_len": row.get("message_count") or 0,
-                    })
-            except Exception:
-                logger.debug("Failed to list ACP sessions from DB", exc_info=True)
+        for sid, row in persisted_rows.items():
+            if sid in seen_ids:
+                continue
+            message_count = int(row.get("message_count") or 0)
+            if message_count <= 0:
+                continue
+            # Extract cwd from model_config JSON.
+            session_cwd = "."
+            mc = row.get("model_config")
+            if mc:
+                try:
+                    session_cwd = json.loads(mc).get("cwd", ".")
+                except (json.JSONDecodeError, TypeError):
+                    pass
+            if normalized_cwd and _normalize_cwd_for_compare(session_cwd) != normalized_cwd:
+                continue
+            results.append({
+                "session_id": sid,
+                "cwd": session_cwd,
+                "model": row.get("model") or "",
+                "history_len": message_count,
+                "title": _build_session_title(row.get("title"), row.get("preview"), session_cwd),
+                "updated_at": _format_updated_at(row.get("last_active") or row.get("started_at")),
+            })

+        results.sort(key=lambda item: _updated_at_sort_key(item.get("updated_at")), reverse=True)
        return results

    def update_cwd(self, session_id: str, cwd: str) -> Optional[SessionState]:
@@ -2,6 +2,7 @@

 from __future__ import annotations

+import json
 import uuid
 from typing import Any, Dict, List, Optional

@@ -96,6 +97,170 @@ def build_tool_title(tool_name: str, args: Dict[str, Any]) -> str:
    return tool_name


+def _build_patch_mode_content(patch_text: str) -> List[Any]:
+    """Parse V4A patch mode input into ACP diff blocks when possible."""
+    if not patch_text:
+        return [acp.tool_content(acp.text_block(""))]
+
+    try:
+        from tools.patch_parser import OperationType, parse_v4a_patch
+
+        operations, error = parse_v4a_patch(patch_text)
+        if error or not operations:
+            return [acp.tool_content(acp.text_block(patch_text))]
+
+        content: List[Any] = []
+        for op in operations:
+            if op.operation == OperationType.UPDATE:
+                old_chunks: list[str] = []
+                new_chunks: list[str] = []
+                for hunk in op.hunks:
+                    old_lines = [line.content for line in hunk.lines if line.prefix in (" ", "-")]
+                    new_lines = [line.content for line in hunk.lines if line.prefix in (" ", "+")]
+                    if old_lines or new_lines:
+                        old_chunks.append("\n".join(old_lines))
+                        new_chunks.append("\n".join(new_lines))
+
+                old_text = "\n...\n".join(chunk for chunk in old_chunks if chunk)
+                new_text = "\n...\n".join(chunk for chunk in new_chunks if chunk)
+                if old_text or new_text:
+                    content.append(
+                        acp.tool_diff_content(
+                            path=op.file_path,
+                            old_text=old_text or None,
+                            new_text=new_text or "",
+                        )
+                    )
+                continue
+
+            if op.operation == OperationType.ADD:
+                added_lines = [line.content for hunk in op.hunks for line in hunk.lines if line.prefix == "+"]
+                content.append(
+                    acp.tool_diff_content(
+                        path=op.file_path,
+                        new_text="\n".join(added_lines),
+                    )
+                )
+                continue
+
+            if op.operation == OperationType.DELETE:
+                content.append(
+                    acp.tool_diff_content(
+                        path=op.file_path,
+                        old_text=f"Delete file: {op.file_path}",
+                        new_text="",
+                    )
+                )
+                continue
+
+            if op.operation == OperationType.MOVE:
+                content.append(
+                    acp.tool_content(acp.text_block(f"Move file: {op.file_path} -> {op.new_path}"))
+                )
+
+        return content or [acp.tool_content(acp.text_block(patch_text))]
+    except Exception:
+        return [acp.tool_content(acp.text_block(patch_text))]
+
+
+def _strip_diff_prefix(path: str) -> str:
+    raw = str(path or "").strip()
+    if raw.startswith(("a/", "b/")):
+        return raw[2:]
+    return raw
+
+
+def _parse_unified_diff_content(diff_text: str) -> List[Any]:
+    """Convert unified diff text into ACP diff content blocks."""
+    if not diff_text:
+        return []
+
+    content: List[Any] = []
+    current_old_path: Optional[str] = None
+    current_new_path: Optional[str] = None
+    old_lines: list[str] = []
+    new_lines: list[str] = []
+
+    def _flush() -> None:
+        nonlocal current_old_path, current_new_path, old_lines, new_lines
+        if current_old_path is None and current_new_path is None:
+            return
+        path = current_new_path if current_new_path and current_new_path != "/dev/null" else current_old_path
+        if not path or path == "/dev/null":
+            current_old_path = None
+            current_new_path = None
+            old_lines = []
+            new_lines = []
+            return
+        content.append(
+            acp.tool_diff_content(
+                path=_strip_diff_prefix(path),
+                old_text="\n".join(old_lines) if old_lines else None,
+                new_text="\n".join(new_lines),
+            )
+        )
+        current_old_path = None
+        current_new_path = None
+        old_lines = []
+        new_lines = []
+
+    for line in diff_text.splitlines():
+        if line.startswith("--- "):
+            _flush()
+            current_old_path = line[4:].strip()
+            continue
+        if line.startswith("+++ "):
+            current_new_path = line[4:].strip()
+            continue
+        if line.startswith("@@"):
+            continue
+        if current_old_path is None and current_new_path is None:
+            continue
+        if line.startswith("+"):
+            new_lines.append(line[1:])
+        elif line.startswith("-"):
+            old_lines.append(line[1:])
+        elif line.startswith(" "):
+            shared = line[1:]
+            old_lines.append(shared)
+            new_lines.append(shared)
+
+    _flush()
+    return content
+
+
+def _build_tool_complete_content(
+    tool_name: str,
+    result: Optional[str],
+    *,
+    function_args: Optional[Dict[str, Any]] = None,
+    snapshot: Any = None,
+) -> List[Any]:
+    """Build structured ACP completion content, falling back to plain text."""
+    display_result = result or ""
+    if len(display_result) > 5000:
+        display_result = display_result[:4900] + f"\n... ({len(result)} chars total, truncated)"
+
+    if tool_name in {"write_file", "patch", "skill_manage"}:
+        try:
+            from agent.display import extract_edit_diff
+
+            diff_text = extract_edit_diff(
+                tool_name,
+                result,
+                function_args=function_args,
+                snapshot=snapshot,
+            )
+            if isinstance(diff_text, str) and diff_text.strip():
+                diff_content = _parse_unified_diff_content(diff_text)
+                if diff_content:
+                    return diff_content
+        except Exception:
+            pass
+
+    return [acp.tool_content(acp.text_block(display_result))]
+
+
 # ---------------------------------------------------------------------------
 # Build ACP content objects for tool-call events
 # ---------------------------------------------------------------------------
@@ -119,9 +284,8 @@ def build_tool_start(
            new = arguments.get("new_string", "")
            content = [acp.tool_diff_content(path=path, new_text=new, old_text=old)]
        else:
-            # Patch mode — show the patch content as text
            patch_text = arguments.get("patch", "")
-            content = [acp.tool_content(acp.text_block(patch_text))]
+            content = _build_patch_mode_content(patch_text)
        return acp.start_tool_call(
            tool_call_id, title, kind=kind, content=content, locations=locations,
            raw_input=arguments,
@@ -178,16 +342,17 @@ def build_tool_complete(
    tool_call_id: str,
    tool_name: str,
    result: Optional[str] = None,
+    function_args: Optional[Dict[str, Any]] = None,
+    snapshot: Any = None,
 ) -> ToolCallProgress:
    """Create a ToolCallUpdate (progress) event for a completed tool call."""
    kind = get_tool_kind(tool_name)
-
-    # Truncate very large results for the UI
-    display_result = result or ""
-    if len(display_result) > 5000:
-        display_result = display_result[:4900] + f"\n... ({len(result)} chars total, truncated)"
-
-    content = [acp.tool_content(acp.text_block(display_result))]
+    content = _build_tool_complete_content(
+        tool_name,
+        result,
+        function_args=function_args,
+        snapshot=snapshot,
+    )
    return acp.update_tool_call(
        tool_call_id,
        kind=kind,
@@ -28,19 +28,45 @@ except ImportError:
 logger = logging.getLogger(__name__)

 THINKING_BUDGET = {"xhigh": 32000, "high": 16000, "medium": 8000, "low": 4000}
+# Hermes effort → Anthropic adaptive-thinking effort (output_config.effort).
+# Anthropic exposes 5 levels on 4.7+: low, medium, high, xhigh, max.
+# Opus/Sonnet 4.6 only expose 4 levels: low, medium, high, max — no xhigh.
+# We preserve xhigh as xhigh on 4.7+ (the recommended default for coding/
+# agentic work) and downgrade it to max on pre-4.7 adaptive models (which
+# is the strongest level they accept).  "minimal" is a legacy alias that
+# maps to low on every model.  See:
+# https://platform.claude.com/docs/en/about-claude/models/migration-guide
 ADAPTIVE_EFFORT_MAP = {
-    "xhigh": "max",
-    "high": "high",
-    "medium": "medium",
-    "low": "low",
+    "max":     "max",
+    "xhigh":   "xhigh",
+    "high":    "high",
+    "medium":  "medium",
+    "low":     "low",
    "minimal": "low",
 }

+# Models that accept the "xhigh" output_config.effort level.  Opus 4.7 added
+# xhigh as a distinct level between high and max; older adaptive-thinking
+# models (4.6) reject it with a 400.  Keep this substring list in sync with
+# the Anthropic migration guide as new model families ship.
+_XHIGH_EFFORT_SUBSTRINGS = ("4-7", "4.7")
+
+# Models where extended thinking is deprecated/removed (4.6+ behavior: adaptive
+# is the only supported mode; 4.7 additionally forbids manual thinking entirely
+# and drops temperature/top_p/top_k).
+_ADAPTIVE_THINKING_SUBSTRINGS = ("4-6", "4.6", "4-7", "4.7")
+
+# Models where temperature/top_p/top_k return 400 if set to non-default values.
+# This is the Opus 4.7 contract; future 4.x+ models are expected to follow it.
+_NO_SAMPLING_PARAMS_SUBSTRINGS = ("4-7", "4.7")
+
 # ── Max output token limits per Anthropic model ───────────────────────
 # Source: Anthropic docs + Cline model catalog.  Anthropic's API requires
 # max_tokens as a mandatory field.  Previously we hardcoded 16384, which
 # starves thinking-enabled models (thinking tokens count toward the limit).
 _ANTHROPIC_OUTPUT_LIMITS = {
+    # Claude 4.7
+    "claude-opus-4-7":   128_000,
    # Claude 4.6
    "claude-opus-4-6":   128_000,
    "claude-sonnet-4-6":  64_000,
@@ -91,11 +117,37 @@ def _get_anthropic_max_output(model: str) -> int:


 def _supports_adaptive_thinking(model: str) -> bool:
-    """Return True for Claude 4.6 models that support adaptive thinking."""
-    return any(v in model for v in ("4-6", "4.6"))
+    """Return True for Claude 4.6+ models that support adaptive thinking."""
+    return any(v in model for v in _ADAPTIVE_THINKING_SUBSTRINGS)


-# Beta headers for enhanced features (sent with ALL auth types)
+def _supports_xhigh_effort(model: str) -> bool:
+    """Return True for models that accept the 'xhigh' adaptive effort level.
+
+    Opus 4.7 introduced xhigh as a distinct level between high and max.
+    Pre-4.7 adaptive models (Opus/Sonnet 4.6) only accept low/medium/high/max
+    and reject xhigh with an HTTP 400. Callers should downgrade xhigh→max
+    when this returns False.
+    """
+    return any(v in model for v in _XHIGH_EFFORT_SUBSTRINGS)
+
+
+def _forbids_sampling_params(model: str) -> bool:
+    """Return True for models that 400 on any non-default temperature/top_p/top_k.
+
+    Opus 4.7 explicitly rejects sampling parameters; later Claude releases are
+    expected to follow suit.  Callers should omit these fields entirely rather
+    than passing zero/default values (the API rejects anything non-null).
+    """
+    return any(v in model for v in _NO_SAMPLING_PARAMS_SUBSTRINGS)
+
+
+# Beta headers for enhanced features (sent with ALL auth types).
+# As of Opus 4.7 (2026-04-16), both of these are GA on Claude 4.6+ — the
+# beta headers are still accepted (harmless no-op) but not required. Kept
+# here so older Claude (4.5, 4.1) + third-party Anthropic-compat endpoints
+# that still gate on the headers continue to get the enhanced features.
+# Migration guide: remove these if you no longer support ≤4.5 models.
 _COMMON_BETAS = [
    "interleaved-thinking-2025-05-14",
    "fine-grained-tool-streaming-2025-05-14",
@@ -298,6 +350,33 @@ def build_anthropic_client(api_key: str, base_url: str = None):
    return _anthropic_sdk.Anthropic(**kwargs)


+def build_anthropic_bedrock_client(region: str):
+    """Create an AnthropicBedrock client for Bedrock Claude models.
+
+    Uses the Anthropic SDK's native Bedrock adapter, which provides full
+    Claude feature parity: prompt caching, thinking budgets, adaptive
+    thinking, fast mode — features not available via the Converse API.
+
+    Auth uses the boto3 default credential chain (IAM roles, SSO, env vars).
+    """
+    if _anthropic_sdk is None:
+        raise ImportError(
+            "The 'anthropic' package is required for the Bedrock provider. "
+            "Install it with: pip install 'anthropic>=0.39.0'"
+        )
+    if not hasattr(_anthropic_sdk, "AnthropicBedrock"):
+        raise ImportError(
+            "anthropic.AnthropicBedrock not available. "
+            "Upgrade with: pip install 'anthropic>=0.39.0'"
+        )
+    from httpx import Timeout
+
+    return _anthropic_sdk.AnthropicBedrock(
+        aws_region=region,
+        timeout=Timeout(timeout=900.0, connect=10.0),
+    )
+
+
 def read_claude_code_credentials() -> Optional[Dict[str, Any]]:
    """Read refreshable Claude Code OAuth credentials from ~/.claude/.credentials.json.

@@ -1314,18 +1393,31 @@ def build_anthropic_kwargs(
            kwargs["tool_choice"] = {"type": "tool", "name": tool_choice}

    # Map reasoning_config to Anthropic's thinking parameter.
-    # Claude 4.6 models use adaptive thinking + output_config.effort.
+    # Claude 4.6+ models use adaptive thinking + output_config.effort.
    # Older models use manual thinking with budget_tokens.
    # MiniMax Anthropic-compat endpoints support thinking (manual mode only,
    # not adaptive).  Haiku does NOT support extended thinking — skip entirely.
+    #
+    # On 4.7+ the `thinking.display` field defaults to "omitted", which
+    # silently hides reasoning text that Hermes surfaces in its CLI. We
+    # request "summarized" so the reasoning blocks stay populated — matching
+    # 4.6 behavior and preserving the activity-feed UX during long tool runs.
    if reasoning_config and isinstance(reasoning_config, dict):
        if reasoning_config.get("enabled") is not False and "haiku" not in model.lower():
            effort = str(reasoning_config.get("effort", "medium")).lower()
            budget = THINKING_BUDGET.get(effort, 8000)
            if _supports_adaptive_thinking(model):
-                kwargs["thinking"] = {"type": "adaptive"}
+                kwargs["thinking"] = {
+                    "type": "adaptive",
+                    "display": "summarized",
+                }
+                adaptive_effort = ADAPTIVE_EFFORT_MAP.get(effort, "medium")
+                # Downgrade xhigh→max on models that don't list xhigh as a
+                # supported level (Opus/Sonnet 4.6). Opus 4.7+ keeps xhigh.
+                if adaptive_effort == "xhigh" and not _supports_xhigh_effort(model):
+                    adaptive_effort = "max"
                kwargs["output_config"] = {
-                    "effort": ADAPTIVE_EFFORT_MAP.get(effort, "medium")
+                    "effort": adaptive_effort,
                }
            else:
                kwargs["thinking"] = {"type": "enabled", "budget_tokens": budget}
@@ -1333,6 +1425,15 @@ def build_anthropic_kwargs(
                kwargs["temperature"] = 1
                kwargs["max_tokens"] = max(effective_max_tokens, budget + 4096)

+    # ── Strip sampling params on 4.7+ ─────────────────────────────────
+    # Opus 4.7 rejects any non-default temperature/top_p/top_k with a 400.
+    # Callers (auxiliary_client, flush_memories, etc.) may set these for
+    # older models; drop them here as a safety net so upstream 4.6 → 4.7
+    # migrations don't require coordinated edits everywhere.
+    if _forbids_sampling_params(model):
+        for _sampling_key in ("temperature", "top_p", "top_k"):
+            kwargs.pop(_sampling_key, None)
+
    # ── Fast mode (Opus 4.6 only) ────────────────────────────────────
    # Adds extra_body.speed="fast" + the fast-mode beta header for ~2.5x
    # output speed. Only for native Anthropic endpoints — third-party
@@ -1390,12 +1491,20 @@ def normalize_anthropic_response(
                )
            )

-    # Map Anthropic stop_reason to OpenAI finish_reason
+    # Map Anthropic stop_reason to OpenAI finish_reason.
+    # Newer stop reasons added in Claude 4.5+ / 4.7:
+    #   - refusal: the model declined to answer (cyber safeguards, CSAM, etc.)
+    #   - model_context_window_exceeded: hit context limit (not max_tokens)
+    # Both need distinct handling upstream — a refusal should surface to the
+    # user with a clear message, and a context-window overflow should trigger
+    # compression/truncation rather than be treated as normal end-of-turn.
    stop_reason_map = {
        "end_turn": "stop",
        "tool_use": "tool_calls",
        "max_tokens": "length",
        "stop_sequence": "stop",
+        "refusal": "content_filter",
+        "model_context_window_exceeded": "length",
    }
    finish_reason = stop_reason_map.get(response.stop_reason, "stop")

@@ -58,6 +58,9 @@ _PROVIDER_ALIASES = {
    "google": "gemini",
    "google-gemini": "gemini",
    "google-ai-studio": "gemini",
+    "x-ai": "xai",
+    "x.ai": "xai",
+    "grok": "xai",
    "glm": "zai",
    "z-ai": "zai",
    "z.ai": "zai",
@@ -91,6 +94,54 @@ def _normalize_aux_provider(provider: Optional[str]) -> str:
        return "custom"
    return _PROVIDER_ALIASES.get(normalized, normalized)

+
+_FIXED_TEMPERATURE_MODELS: Dict[str, float] = {
+    "kimi-for-coding": 0.6,
+}
+
+# Moonshot's kimi-for-coding endpoint (api.kimi.com/coding) documents:
+# "k2.5 model will use a fixed value 1.0, non-thinking mode will use a fixed
+# value 0.6.  Any other value will result in an error."  The same lock applies
+# to the other k2.* models served on that endpoint.  Enumerated explicitly so
+# non-coding siblings like `kimi-k2-instruct` (variable temperature, served on
+# the standard chat API and third parties) are NOT clamped.
+# Source: https://platform.kimi.ai/docs/guide/kimi-k2-5-quickstart
+_KIMI_INSTANT_MODELS: frozenset = frozenset({
+    "kimi-k2.5",
+    "kimi-k2-turbo-preview",
+    "kimi-k2-0905-preview",
+})
+_KIMI_THINKING_MODELS: frozenset = frozenset({
+    "kimi-k2-thinking",
+    "kimi-k2-thinking-turbo",
+})
+
+
+def _fixed_temperature_for_model(model: Optional[str]) -> Optional[float]:
+    """Return a required temperature override for models with strict contracts.
+
+    Moonshot's kimi-for-coding endpoint rejects any non-approved temperature on
+    the k2.5 family.  Non-thinking variants require exactly 0.6; thinking
+    variants require 1.0.  An optional ``vendor/`` prefix (e.g.
+    ``moonshotai/kimi-k2.5``) is tolerated for aggregator routings.
+
+    Returns ``None`` for every other model, including ``kimi-k2-instruct*``
+    which is the separate non-coding K2 family with variable temperature.
+    """
+    normalized = (model or "").strip().lower()
+    fixed = _FIXED_TEMPERATURE_MODELS.get(normalized)
+    if fixed is not None:
+        logger.debug("Forcing temperature=%s for model %r (fixed map)", fixed, model)
+        return fixed
+    bare = normalized.rsplit("/", 1)[-1]
+    if bare in _KIMI_THINKING_MODELS:
+        logger.debug("Forcing temperature=1.0 for kimi thinking model %r", model)
+        return 1.0
+    if bare in _KIMI_INSTANT_MODELS:
+        logger.debug("Forcing temperature=0.6 for kimi instant model %r", model)
+        return 0.6
+    return None
+
 # Default auxiliary models for direct API-key providers (cheap/fast for side tasks)
 _API_KEY_PROVIDER_AUX_MODELS: Dict[str, str] = {
    "gemini": "gemini-3-flash-preview",
@@ -104,6 +155,7 @@ _API_KEY_PROVIDER_AUX_MODELS: Dict[str, str] = {
    "opencode-zen": "gemini-3-flash",
    "opencode-go": "glm-5",
    "kilocode": "google/gemini-3-flash-preview",
+    "ollama-cloud": "nemotron-3-nano:30b",
 }

 # Vision-specific model overrides for direct providers.
@@ -514,8 +566,13 @@ class _AnthropicCompletionsAdapter:
            tool_choice=normalized_tool_choice,
            is_oauth=self._is_oauth,
        )
+        # Opus 4.7+ rejects any non-default temperature/top_p/top_k; only set
+        # temperature for models that still accept it. build_anthropic_kwargs
+        # additionally strips these keys as a safety net — keep both layers.
        if temperature is not None:
-            anthropic_kwargs["temperature"] = temperature
+            from agent.anthropic_adapter import _forbids_sampling_params
+            if not _forbids_sampling_params(model):
+                anthropic_kwargs["temperature"] = temperature

        response = self._client.messages.create(**anthropic_kwargs)
        assistant_message, finish_reason = normalize_anthropic_response(response)
@@ -775,6 +832,21 @@ def _try_openrouter() -> Tuple[Optional[OpenAI], Optional[str]]:


 def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]:
+    # Check cross-session rate limit guard before attempting Nous —
+    # if another session already recorded a 429, skip Nous entirely
+    # to avoid piling more requests onto the tapped RPH bucket.
+    try:
+        from agent.nous_rate_guard import nous_rate_limit_remaining
+        _remaining = nous_rate_limit_remaining()
+        if _remaining is not None and _remaining > 0:
+            logger.debug(
+                "Auxiliary: skipping Nous Portal (rate-limited, resets in %.0fs)",
+                _remaining,
+            )
+            return None, None
+    except Exception:
+        pass
+
    nous = _read_nous_auth()
    if not nous:
        return None, None
@@ -899,6 +971,51 @@ def _current_custom_base_url() -> str:
    return custom_base or ""


+def _validate_proxy_env_urls() -> None:
+    """Fail fast with a clear error when proxy env vars have malformed URLs.
+
+    Common cause: shell config (e.g. .zshrc) with a typo like
+    ``export HTTP_PROXY=http://127.0.0.1:6153export NEXT_VAR=...``
+    which concatenates 'export' into the port number.  Without this
+    check the OpenAI/httpx client raises a cryptic ``Invalid port``
+    error that doesn't name the offending env var.
+    """
+    from urllib.parse import urlparse
+
+    for key in ("HTTPS_PROXY", "HTTP_PROXY", "ALL_PROXY",
+                "https_proxy", "http_proxy", "all_proxy"):
+        value = str(os.environ.get(key) or "").strip()
+        if not value:
+            continue
+        try:
+            parsed = urlparse(value)
+            if parsed.scheme:
+                _ = parsed.port          # raises ValueError for e.g. '6153export'
+        except ValueError as exc:
+            raise RuntimeError(
+                f"Malformed proxy environment variable {key}={value!r}. "
+                "Fix or unset your proxy settings and try again."
+            ) from exc
+
+
+def _validate_base_url(base_url: str) -> None:
+    """Reject obviously broken custom endpoint URLs before they reach httpx."""
+    from urllib.parse import urlparse
+
+    candidate = str(base_url or "").strip()
+    if not candidate or candidate.startswith("acp://"):
+        return
+    try:
+        parsed = urlparse(candidate)
+        if parsed.scheme in {"http", "https"}:
+            _ = parsed.port              # raises ValueError for malformed ports
+    except ValueError as exc:
+        raise RuntimeError(
+            f"Malformed custom endpoint URL: {candidate!r}. "
+            "Run `hermes setup` or `hermes model` and enter a valid http(s) base URL."
+        ) from exc
+
+
 def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]:
    runtime = _resolve_custom_runtime()
    if len(runtime) == 2:
@@ -995,8 +1112,6 @@ _AUTO_PROVIDER_LABELS = {
    "_resolve_api_key_provider": "api-key",
 }

-_AGGREGATOR_PROVIDERS = frozenset({"openrouter", "nous"})
-
 _MAIN_RUNTIME_FIELDS = ("provider", "model", "base_url", "api_key", "api_mode")


@@ -1127,11 +1242,15 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option
    """Full auto-detection chain.

    Priority:
-      1. If the user's main provider is NOT an aggregator (OpenRouter / Nous),
-         use their main provider + main model directly.  This ensures users on
-         Alibaba, DeepSeek, ZAI, etc. get auxiliary tasks handled by the same
-         provider they already have credentials for — no OpenRouter key needed.
-      2. OpenRouter → Nous → custom → Codex → API-key providers (original chain).
+      1. User's main provider + main model, regardless of provider type.
+         This means auxiliary tasks (compression, vision, web extraction,
+         session search, etc.) use the same model the user configured for
+         chat.  Users on OpenRouter/Nous get their chosen chat model; users
+         on DeepSeek/ZAI/Alibaba get theirs; etc.  Running aux tasks on the
+         user's picked model keeps behavior predictable — no surprise
+         switches to a cheap fallback model for side tasks.
+      2. OpenRouter → Nous → custom → Codex → API-key providers (fallback
+         chain, only used when the main provider has no working client).
    """
    global auxiliary_is_nous, _stale_base_url_warned
    auxiliary_is_nous = False  # Reset — _try_nous() will set True if it wins
@@ -1161,11 +1280,16 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option
            )
            _stale_base_url_warned = True

-    # ── Step 1: non-aggregator main provider → use main model directly ──
+    # ── Step 1: main provider + main model → use them directly ──
+    #
+    # This is the primary aux backend for every user.  "auto" means
+    # "use my main chat model for side tasks as well" — including users
+    # on aggregators (OpenRouter, Nous) who previously got routed to a
+    # cheap provider-side default.  Explicit per-task overrides set via
+    # config.yaml (auxiliary.<task>.provider) still win over this.
    main_provider = runtime_provider or _read_main_provider()
    main_model = runtime_model or _read_main_model()
    if (main_provider and main_model
-            and main_provider not in _AGGREGATOR_PROVIDERS
            and main_provider not in ("auto", "")):
        resolved_provider = main_provider
        explicit_base_url = None
@@ -1299,6 +1423,7 @@ def resolve_provider_client(
    Returns:
        (client, resolved_model) or (None, None) if auth is unavailable.
    """
+    _validate_proxy_env_urls()
    # Normalise aliases
    provider = _normalize_aux_provider(provider)

@@ -1523,7 +1648,6 @@ def resolve_provider_client(
            from hermes_cli.models import copilot_default_headers

            headers.update(copilot_default_headers())
-
        client = OpenAI(api_key=api_key, base_url=base_url,
                        **({"default_headers": headers} if headers else {}))

@@ -1747,34 +1871,31 @@ def resolve_vision_provider_client(

    if requested == "auto":
        # Vision auto-detection order:
-        #   1. Active provider + model (user's main chat config)
-        #   2. OpenRouter  (known vision-capable default model)
-        #   3. Nous Portal (known vision-capable default model)
+        #   1. User's main provider + main model (including aggregators).
+        #      _PROVIDER_VISION_MODELS provides per-provider vision model
+        #      overrides when the provider has a dedicated multimodal model
+        #      that differs from the chat model (e.g. xiaomi → mimo-v2-omni,
+        #      zai → glm-5v-turbo).
+        #   2. OpenRouter  (vision-capable aggregator fallback)
+        #   3. Nous Portal (vision-capable aggregator fallback)
        #   4. Stop
        main_provider = _read_main_provider()
        main_model = _read_main_model()
        if main_provider and main_provider not in ("auto", ""):
-            if main_provider in _VISION_AUTO_PROVIDER_ORDER:
-                # Known strict backend — use its defaults.
-                sync_client, default_model = _resolve_strict_vision_backend(main_provider)
-                if sync_client is not None:
-                    return _finalize(main_provider, sync_client, default_model)
-            else:
-                # Exotic provider (DeepSeek, Alibaba, Xiaomi, named custom, etc.)
-                # Use provider-specific vision model if available, otherwise main model.
-                vision_model = _PROVIDER_VISION_MODELS.get(main_provider, main_model)
-                rpc_client, rpc_model = resolve_provider_client(
-                    main_provider, vision_model,
-                    api_mode=resolved_api_mode)
-                if rpc_client is not None:
-                    logger.info(
-                        "Vision auto-detect: using active provider %s (%s)",
-                        main_provider, rpc_model or vision_model,
-                    )
-                    return _finalize(
-                        main_provider, rpc_client, rpc_model or vision_model)
+            vision_model = _PROVIDER_VISION_MODELS.get(main_provider, main_model)
+            rpc_client, rpc_model = resolve_provider_client(
+                main_provider, vision_model,
+                api_mode=resolved_api_mode)
+            if rpc_client is not None:
+                logger.info(
+                    "Vision auto-detect: using main provider %s (%s)",
+                    main_provider, rpc_model or vision_model,
+                )
+                return _finalize(
+                    main_provider, rpc_client, rpc_model or vision_model)

-        # Fall back through aggregators.
+        # Fall back through aggregators (uses their dedicated vision model,
+        # not the user's main model) when main provider has no client.
        for candidate in _VISION_AUTO_PROVIDER_ORDER:
            if candidate == main_provider:
                continue  # already tried above
@@ -1835,9 +1956,15 @@ def auxiliary_max_tokens_param(value: int) -> dict:
 # Every auxiliary LLM consumer should use these instead of manually
 # constructing clients and calling .chat.completions.create().

-# Client cache: (provider, async_mode, base_url, api_key) -> (client, default_model)
+# Client cache: (provider, async_mode, base_url, api_key, api_mode, runtime_key) -> (client, default_model, loop)
+# NOTE: loop identity is NOT part of the key.  On async cache hits we check
+# whether the cached loop is the *current* loop; if not, the stale entry is
+# replaced in-place.  This bounds cache growth to one entry per unique
+# provider config rather than one per (config × event-loop), which previously
+# caused unbounded fd accumulation in long-running gateway processes (#10200).
 _client_cache: Dict[tuple, tuple] = {}
 _client_cache_lock = threading.Lock()
+_CLIENT_CACHE_MAX_SIZE = 64  # safety belt — evict oldest when exceeded


 def neuter_async_httpx_del() -> None:
@@ -1970,39 +2097,49 @@ def _get_cached_client(
    Async clients (AsyncOpenAI) use httpx.AsyncClient internally, which
    binds to the event loop that was current when the client was created.
    Using such a client on a *different* loop causes deadlocks or
-    RuntimeError.  To prevent cross-loop issues (especially in gateway
-    mode where _run_async() may spawn fresh loops in worker threads), the
-    cache key for async clients includes the current event loop's identity
-    so each loop gets its own client instance.
+    RuntimeError.  To prevent cross-loop issues, the cache validates on
+    every async hit that the cached loop is the *current, open* loop.
+    If the loop changed (e.g. a new gateway worker-thread loop), the stale
+    entry is replaced in-place rather than creating an additional entry.
+
+    This keeps cache size bounded to one entry per unique provider config,
+    preventing the fd-exhaustion that previously occurred in long-running
+    gateways where recycled worker threads created unbounded entries (#10200).
    """
-    # Include loop identity for async clients to prevent cross-loop reuse.
-    # httpx.AsyncClient (inside AsyncOpenAI) is bound to the loop where it
-    # was created — reusing it on a different loop causes deadlocks (#2681).
-    loop_id = 0
+    # Resolve the current event loop for async clients so we can validate
+    # cached entries.  Loop identity is NOT in the cache key — instead we
+    # check at hit time whether the cached loop is still current and open.
+    # This prevents unbounded cache growth from recycled worker-thread loops
+    # while still guaranteeing we never reuse a client on the wrong loop
+    # (which causes deadlocks, see #2681).
    current_loop = None
    if async_mode:
        try:
            import asyncio as _aio
            current_loop = _aio.get_event_loop()
-            loop_id = id(current_loop)
        except RuntimeError:
            pass
    runtime = _normalize_main_runtime(main_runtime)
    runtime_key = tuple(runtime.get(field, "") for field in _MAIN_RUNTIME_FIELDS) if provider == "auto" else ()
-    cache_key = (provider, async_mode, base_url or "", api_key or "", api_mode or "", loop_id, runtime_key)
+    cache_key = (provider, async_mode, base_url or "", api_key or "", api_mode or "", runtime_key)
    with _client_cache_lock:
        if cache_key in _client_cache:
            cached_client, cached_default, cached_loop = _client_cache[cache_key]
            if async_mode:
-                # A cached async client whose loop has been closed will raise
-                # "Event loop is closed" when httpx tries to clean up its
-                # transport.  Discard the stale client and create a fresh one.
-                if cached_loop is not None and cached_loop.is_closed():
-                    _force_close_async_httpx(cached_client)
-                    del _client_cache[cache_key]
-                else:
+                # Validate: the cached client must be bound to the CURRENT,
+                # OPEN loop.  If the loop changed or was closed, the httpx
+                # transport inside is dead — force-close and replace.
+                loop_ok = (
+                    cached_loop is not None
+                    and cached_loop is current_loop
+                    and not cached_loop.is_closed()
+                )
+                if loop_ok:
                    effective = _compat_model(cached_client, model, cached_default)
                    return cached_client, effective
+                # Stale — evict and fall through to create a new client.
+                _force_close_async_httpx(cached_client)
+                del _client_cache[cache_key]
            else:
                effective = _compat_model(cached_client, model, cached_default)
                return cached_client, effective
@@ -2022,6 +2159,12 @@ def _get_cached_client(
        bound_loop = current_loop
        with _client_cache_lock:
            if cache_key not in _client_cache:
+                # Safety belt: if the cache has grown beyond the max, evict
+                # the oldest entries (FIFO — dict preserves insertion order).
+                while len(_client_cache) >= _CLIENT_CACHE_MAX_SIZE:
+                    evict_key, evict_entry = next(iter(_client_cache.items()))
+                    _force_close_async_httpx(evict_entry[0])
+                    del _client_cache[evict_key]
                _client_cache[cache_key] = (client, default_model, bound_loop)
            else:
                client, default_model, _ = _client_cache[cache_key]
@@ -2201,6 +2344,19 @@ def _build_call_kwargs(
        "timeout": timeout,
    }

+    fixed_temperature = _fixed_temperature_for_model(model)
+    if fixed_temperature is not None:
+        temperature = fixed_temperature
+
+    # Opus 4.7+ rejects any non-default temperature/top_p/top_k — silently
+    # drop here so auxiliary callers that hardcode temperature (e.g. 0.3 on
+    # flush_memories, 0 on structured-JSON extraction) don't 400 the moment
+    # the aux model is flipped to 4.7.
+    if temperature is not None:
+        from agent.anthropic_adapter import _forbids_sampling_params
+        if _forbids_sampling_params(model):
+            temperature = None
+
    if temperature is not None:
        kwargs["temperature"] = temperature

@@ -2304,10 +2460,10 @@ def call_llm(

    if task == "vision":
        effective_provider, client, final_model = resolve_vision_provider_client(
-            provider=provider,
-            model=model,
-            base_url=base_url,
-            api_key=api_key,
+            provider=resolved_provider if resolved_provider != "auto" else provider,
+            model=resolved_model or model,
+            base_url=resolved_base_url or base_url,
+            api_key=resolved_api_key or api_key,
            async_mode=False,
        )
        if client is None and resolved_provider != "auto" and not resolved_base_url:
@@ -2512,10 +2668,10 @@ async def async_call_llm(

    if task == "vision":
        effective_provider, client, final_model = resolve_vision_provider_client(
-            provider=provider,
-            model=model,
-            base_url=base_url,
-            api_key=api_key,
+            provider=resolved_provider if resolved_provider != "auto" else provider,
+            model=resolved_model or model,
+            base_url=resolved_base_url or base_url,
+            api_key=resolved_api_key or api_key,
            async_mode=True,
        )
        if client is None and resolved_provider != "auto" and not resolved_base_url:
@@ -17,7 +17,10 @@ Improvements over v2:
  - Richer tool call/result detail in summarizer input
 """

+import hashlib
+import json
 import logging
+import re
 import time
 from typing import Any, Dict, List, Optional

@@ -36,7 +39,10 @@ SUMMARY_PREFIX = (
    "into the summary below. This is a handoff from a previous context "
    "window — treat it as background reference, NOT as active instructions. "
    "Do NOT answer questions or fulfill requests mentioned in this summary; "
-    "they were already addressed. Respond ONLY to the latest user message "
+    "they were already addressed. "
+    "Your current task is identified in the '## Active Task' section of the "
+    "summary — resume exactly from there. "
+    "Respond ONLY to the latest user message "
    "that appears AFTER this summary. The current session state (files, "
    "config, etc.) may reflect work described here — avoid repeating it:"
 )
@@ -57,6 +63,174 @@ _CHARS_PER_TOKEN = 4
 _SUMMARY_FAILURE_COOLDOWN_SECONDS = 600


+def _truncate_tool_call_args_json(args: str, head_chars: int = 200) -> str:
+    """Shrink long string values inside a tool-call arguments JSON blob while
+    preserving JSON validity.
+
+    The ``function.arguments`` field on a tool call is a JSON-encoded string
+    passed through to the LLM provider; downstream providers strictly
+    validate it and return a non-retryable 400 when it is not well-formed.
+    An earlier implementation sliced the raw JSON at a fixed byte offset and
+    appended ``...[truncated]`` — which routinely produced strings like::
+
+        {"path": "/foo/bar", "content": "# long markdown
+        ...[truncated]
+
+    i.e. an unterminated string and a missing closing brace. MiniMax, for
+    example, rejects this with ``invalid function arguments json string``
+    and the session gets stuck re-sending the same broken history on every
+    turn. See issue #11762 for the observed loop.
+
+    This helper parses the arguments, shrinks long string leaves inside the
+    parsed structure, and re-serialises. Non-string values (paths, ints,
+    booleans) are preserved intact. If the arguments are not valid JSON
+    to begin with — some model backends use non-JSON tool arguments — the
+    original string is returned unchanged rather than replaced with
+    something neither we nor the backend can parse.
+    """
+    try:
+        parsed = json.loads(args)
+    except (ValueError, TypeError):
+        return args
+
+    def _shrink(obj: Any) -> Any:
+        if isinstance(obj, str):
+            if len(obj) > head_chars:
+                return obj[:head_chars] + "...[truncated]"
+            return obj
+        if isinstance(obj, dict):
+            return {k: _shrink(v) for k, v in obj.items()}
+        if isinstance(obj, list):
+            return [_shrink(v) for v in obj]
+        return obj
+
+    shrunken = _shrink(parsed)
+    # ensure_ascii=False preserves CJK/emoji instead of bloating with \uXXXX
+    return json.dumps(shrunken, ensure_ascii=False)
+
+
+def _summarize_tool_result(tool_name: str, tool_args: str, tool_content: str) -> str:
+    """Create an informative 1-line summary of a tool call + result.
+
+    Used during the pre-compression pruning pass to replace large tool
+    outputs with a short but useful description of what the tool did,
+    rather than a generic placeholder that carries zero information.
+
+    Returns strings like::
+
+        [terminal] ran `npm test` -> exit 0, 47 lines output
+        [read_file] read config.py from line 1 (1,200 chars)
+        [search_files] content search for 'compress' in agent/ -> 12 matches
+    """
+    try:
+        args = json.loads(tool_args) if tool_args else {}
+    except (json.JSONDecodeError, TypeError):
+        args = {}
+
+    content = tool_content or ""
+    content_len = len(content)
+    line_count = content.count("\n") + 1 if content.strip() else 0
+
+    if tool_name == "terminal":
+        cmd = args.get("command", "")
+        if len(cmd) > 80:
+            cmd = cmd[:77] + "..."
+        exit_match = re.search(r'"exit_code"\s*:\s*(-?\d+)', content)
+        exit_code = exit_match.group(1) if exit_match else "?"
+        return f"[terminal] ran `{cmd}` -> exit {exit_code}, {line_count} lines output"
+
+    if tool_name == "read_file":
+        path = args.get("path", "?")
+        offset = args.get("offset", 1)
+        return f"[read_file] read {path} from line {offset} ({content_len:,} chars)"
+
+    if tool_name == "write_file":
+        path = args.get("path", "?")
+        written_lines = args.get("content", "").count("\n") + 1 if args.get("content") else "?"
+        return f"[write_file] wrote to {path} ({written_lines} lines)"
+
+    if tool_name == "search_files":
+        pattern = args.get("pattern", "?")
+        path = args.get("path", ".")
+        target = args.get("target", "content")
+        match_count = re.search(r'"total_count"\s*:\s*(\d+)', content)
+        count = match_count.group(1) if match_count else "?"
+        return f"[search_files] {target} search for '{pattern}' in {path} -> {count} matches"
+
+    if tool_name == "patch":
+        path = args.get("path", "?")
+        mode = args.get("mode", "replace")
+        return f"[patch] {mode} in {path} ({content_len:,} chars result)"
+
+    if tool_name in ("browser_navigate", "browser_click", "browser_snapshot",
+                     "browser_type", "browser_scroll", "browser_vision"):
+        url = args.get("url", "")
+        ref = args.get("ref", "")
+        detail = f" {url}" if url else (f" ref={ref}" if ref else "")
+        return f"[{tool_name}]{detail} ({content_len:,} chars)"
+
+    if tool_name == "web_search":
+        query = args.get("query", "?")
+        return f"[web_search] query='{query}' ({content_len:,} chars result)"
+
+    if tool_name == "web_extract":
+        urls = args.get("urls", [])
+        url_desc = urls[0] if isinstance(urls, list) and urls else "?"
+        if isinstance(urls, list) and len(urls) > 1:
+            url_desc += f" (+{len(urls) - 1} more)"
+        return f"[web_extract] {url_desc} ({content_len:,} chars)"
+
+    if tool_name == "delegate_task":
+        goal = args.get("goal", "")
+        if len(goal) > 60:
+            goal = goal[:57] + "..."
+        return f"[delegate_task] '{goal}' ({content_len:,} chars result)"
+
+    if tool_name == "execute_code":
+        code_preview = (args.get("code") or "")[:60].replace("\n", " ")
+        if len(args.get("code", "")) > 60:
+            code_preview += "..."
+        return f"[execute_code] `{code_preview}` ({line_count} lines output)"
+
+    if tool_name in ("skill_view", "skills_list", "skill_manage"):
+        name = args.get("name", "?")
+        return f"[{tool_name}] name={name} ({content_len:,} chars)"
+
+    if tool_name == "vision_analyze":
+        question = args.get("question", "")[:50]
+        return f"[vision_analyze] '{question}' ({content_len:,} chars)"
+
+    if tool_name == "memory":
+        action = args.get("action", "?")
+        target = args.get("target", "?")
+        return f"[memory] {action} on {target}"
+
+    if tool_name == "todo":
+        return "[todo] updated task list"
+
+    if tool_name == "clarify":
+        return "[clarify] asked user a question"
+
+    if tool_name == "text_to_speech":
+        return f"[text_to_speech] generated audio ({content_len:,} chars)"
+
+    if tool_name == "cronjob":
+        action = args.get("action", "?")
+        return f"[cronjob] {action}"
+
+    if tool_name == "process":
+        action = args.get("action", "?")
+        sid = args.get("session_id", "?")
+        return f"[process] {action} session={sid}"
+
+    # Generic fallback
+    first_arg = ""
+    for k, v in list(args.items())[:2]:
+        sv = str(v)[:40]
+        first_arg += f" {k}={sv}"
+    return f"[{tool_name}]{first_arg} ({content_len:,} chars result)"
+
+
 class ContextCompressor(ContextEngine):
    """Default context engine — compresses conversation context via lossy summarization.

@@ -78,6 +252,8 @@ class ContextCompressor(ContextEngine):
        self._context_probed = False
        self._context_probe_persistable = False
        self._previous_summary = None
+        self._last_compression_savings_pct = 100.0
+        self._ineffective_compression_count = 0

    def update_model(
        self,
@@ -167,6 +343,9 @@ class ContextCompressor(ContextEngine):

        # Stores the previous compaction summary for iterative updates
        self._previous_summary: Optional[str] = None
+        # Anti-thrashing: track whether last compression was effective
+        self._last_compression_savings_pct: float = 100.0
+        self._ineffective_compression_count: int = 0
        self._summary_failure_cooldown_until: float = 0.0

    def update_from_response(self, usage: Dict[str, Any]):
@@ -175,9 +354,26 @@ class ContextCompressor(ContextEngine):
        self.last_completion_tokens = usage.get("completion_tokens", 0)

    def should_compress(self, prompt_tokens: int = None) -> bool:
-        """Check if context exceeds the compression threshold."""
+        """Check if context exceeds the compression threshold.
+
+        Includes anti-thrashing protection: if the last two compressions
+        each saved less than 10%, skip compression to avoid infinite loops
+        where each pass removes only 1-2 messages.
+        """
        tokens = prompt_tokens if prompt_tokens is not None else self.last_prompt_tokens
-        return tokens >= self.threshold_tokens
+        if tokens < self.threshold_tokens:
+            return False
+        # Anti-thrashing: back off if recent compressions were ineffective
+        if self._ineffective_compression_count >= 2:
+            if not self.quiet_mode:
+                logger.warning(
+                    "Compression skipped — last %d compressions saved <10%% each. "
+                    "Consider /new to start a fresh session, or /compress <topic> "
+                    "for focused compression.",
+                    self._ineffective_compression_count,
+                )
+            return False
+        return True

    # ------------------------------------------------------------------
    # Tool output pruning (cheap pre-pass, no LLM call)
@@ -187,7 +383,16 @@ class ContextCompressor(ContextEngine):
        self, messages: List[Dict[str, Any]], protect_tail_count: int,
        protect_tail_tokens: int | None = None,
    ) -> tuple[List[Dict[str, Any]], int]:
-        """Replace old tool result contents with a short placeholder.
+        """Replace old tool result contents with informative 1-line summaries.
+
+        Instead of a generic placeholder, generates a summary like::
+
+            [terminal] ran `npm test` -> exit 0, 47 lines output
+            [read_file] read config.py from line 1 (3,400 chars)
+
+        Also deduplicates identical tool results (e.g. reading the same file
+        5x keeps only the newest full copy) and truncates large tool_call
+        arguments in assistant messages outside the protected tail.

        Walks backward from the end, protecting the most recent messages that
        fall within ``protect_tail_tokens`` (when provided) OR the last
@@ -203,6 +408,22 @@ class ContextCompressor(ContextEngine):
        result = [m.copy() for m in messages]
        pruned = 0

+        # Build index: tool_call_id -> (tool_name, arguments_json)
+        call_id_to_tool: Dict[str, tuple] = {}
+        for msg in result:
+            if msg.get("role") == "assistant":
+                for tc in msg.get("tool_calls") or []:
+                    if isinstance(tc, dict):
+                        cid = tc.get("id", "")
+                        fn = tc.get("function", {})
+                        call_id_to_tool[cid] = (fn.get("name", "unknown"), fn.get("arguments", ""))
+                    else:
+                        cid = getattr(tc, "id", "") or ""
+                        fn = getattr(tc, "function", None)
+                        name = getattr(fn, "name", "unknown") if fn else "unknown"
+                        args_str = getattr(fn, "arguments", "") if fn else ""
+                        call_id_to_tool[cid] = (name, args_str)
+
        # Determine the prune boundary
        if protect_tail_tokens is not None and protect_tail_tokens > 0:
            # Token-budget approach: walk backward accumulating tokens
@@ -211,7 +432,8 @@ class ContextCompressor(ContextEngine):
            min_protect = min(protect_tail_count, len(result) - 1)
            for i in range(len(result) - 1, -1, -1):
                msg = result[i]
-                content_len = len(msg.get("content") or "")
+                raw_content = msg.get("content") or ""
+                content_len = sum(len(p.get("text", "")) for p in raw_content) if isinstance(raw_content, list) else len(raw_content)
                msg_tokens = content_len // _CHARS_PER_TOKEN + 10
                for tc in msg.get("tool_calls") or []:
                    if isinstance(tc, dict):
@@ -226,18 +448,76 @@ class ContextCompressor(ContextEngine):
        else:
            prune_boundary = len(result) - protect_tail_count

+        # Pass 1: Deduplicate identical tool results.
+        # When the same file is read multiple times, keep only the most recent
+        # full copy and replace older duplicates with a back-reference.
+        content_hashes: dict = {}  # hash -> (index, tool_call_id)
+        for i in range(len(result) - 1, -1, -1):
+            msg = result[i]
+            if msg.get("role") != "tool":
+                continue
+            content = msg.get("content") or ""
+            # Skip multimodal content (list of content blocks)
+            if isinstance(content, list):
+                continue
+            if len(content) < 200:
+                continue
+            h = hashlib.md5(content.encode("utf-8", errors="replace")).hexdigest()[:12]
+            if h in content_hashes:
+                # This is an older duplicate — replace with back-reference
+                result[i] = {**msg, "content": "[Duplicate tool output — same content as a more recent call]"}
+                pruned += 1
+            else:
+                content_hashes[h] = (i, msg.get("tool_call_id", "?"))
+
+        # Pass 2: Replace old tool results with informative summaries
        for i in range(prune_boundary):
            msg = result[i]
            if msg.get("role") != "tool":
                continue
            content = msg.get("content", "")
+            # Skip multimodal content (list of content blocks)
+            if isinstance(content, list):
+                continue
            if not content or content == _PRUNED_TOOL_PLACEHOLDER:
                continue
+            # Skip already-deduplicated or previously-summarized results
+            if content.startswith("[Duplicate tool output"):
+                continue
            # Only prune if the content is substantial (>200 chars)
            if len(content) > 200:
-                result[i] = {**msg, "content": _PRUNED_TOOL_PLACEHOLDER}
+                call_id = msg.get("tool_call_id", "")
+                tool_name, tool_args = call_id_to_tool.get(call_id, ("unknown", ""))
+                summary = _summarize_tool_result(tool_name, tool_args, content)
+                result[i] = {**msg, "content": summary}
                pruned += 1

+        # Pass 3: Truncate large tool_call arguments in assistant messages
+        # outside the protected tail. write_file with 50KB content, for
+        # example, survives pruning entirely without this.
+        #
+        # The shrinking is done inside the parsed JSON structure so the
+        # result remains valid JSON — otherwise downstream providers 400
+        # on every subsequent turn until the broken call falls out of
+        # the window. See ``_truncate_tool_call_args_json`` docstring.
+        for i in range(prune_boundary):
+            msg = result[i]
+            if msg.get("role") != "assistant" or not msg.get("tool_calls"):
+                continue
+            new_tcs = []
+            modified = False
+            for tc in msg["tool_calls"]:
+                if isinstance(tc, dict):
+                    args = tc.get("function", {}).get("arguments", "")
+                    if len(args) > 500:
+                        new_args = _truncate_tool_call_args_json(args)
+                        if new_args != args:
+                            tc = {**tc, "function": {**tc["function"], "arguments": new_args}}
+                            modified = True
+                new_tcs.append(tc)
+            if modified:
+                result[i] = {**msg, "tool_calls": new_tcs}
+
        return result, pruned

    # ------------------------------------------------------------------
@@ -357,29 +637,45 @@ class ContextCompressor(ContextEngine):
        )

        # Shared structured template (used by both paths).
-        # Key changes vs v1:
-        #   - "Pending User Asks" section (from Claude Code) explicitly tracks
-        #     unanswered questions so the model knows what's resolved vs open
-        #   - "Remaining Work" replaces "Next Steps" to avoid reading as active
-        #     instructions
-        #   - "Resolved Questions" makes it clear which questions were already
-        #     answered (prevents model from re-answering them)
-        _template_sections = f"""## Goal
-[What the user is trying to accomplish]
+        _template_sections = f"""## Active Task
+[THE SINGLE MOST IMPORTANT FIELD. Copy the user's most recent request or
+task assignment verbatim — the exact words they used. If multiple tasks
+were requested and only some are done, list only the ones NOT yet completed.
+The next assistant must pick up exactly here. Example:
+"User asked: 'Now refactor the auth module to use JWT instead of sessions'"
+If no outstanding task exists, write "None."]
+
+## Goal
+[What the user is trying to accomplish overall]

 ## Constraints & Preferences
 [User preferences, coding style, constraints, important decisions]

-## Progress
-### Done
-[Completed work — include specific file paths, commands run, results obtained]
-### In Progress
-[Work currently underway]
-### Blocked
-[Any blockers or issues encountered]
+## Completed Actions
+[Numbered list of concrete actions taken — include tool used, target, and outcome.
+Format each as: N. ACTION target — outcome [tool: name]
+Example:
+1. READ config.py:45 — found `==` should be `!=` [tool: read_file]
+2. PATCH config.py:45 — changed `==` to `!=` [tool: patch]
+3. TEST `pytest tests/` — 3/50 failed: test_parse, test_validate, test_edge [tool: terminal]
+Be specific with file paths, commands, line numbers, and results.]
+
+## Active State
+[Current working state — include:
+- Working directory and branch (if applicable)
+- Modified/created files with brief note on each
+- Test status (X/Y passing)
+- Any running processes or servers
+- Environment details that matter]
+
+## In Progress
+[Work currently underway — what was being done when compaction fired]
+
+## Blocked
+[Any blockers, errors, or issues not yet resolved. Include exact error messages.]

 ## Key Decisions
-[Important technical decisions and why they were made]
+[Important technical decisions and WHY they were made]

 ## Resolved Questions
 [Questions the user asked that were ALREADY answered — include the answer so the next assistant does not re-answer them]
@@ -396,10 +692,7 @@ class ContextCompressor(ContextEngine):
 ## Critical Context
 [Any specific values, error messages, configuration details, or data that would be lost without explicit preservation]

-## Tools & Patterns
-[Which tools were used, how they were used effectively, and any tool-specific discoveries]
-
-Target ~{summary_budget} tokens. Be specific — include file paths, command outputs, error messages, and concrete values rather than vague descriptions.
+Target ~{summary_budget} tokens. Be CONCRETE — include file paths, command outputs, error messages, line numbers, and specific values. Avoid vague descriptions like "made some changes" — say exactly what changed.

 Write only the summary body. Do not include any preamble or prefix."""

@@ -415,7 +708,7 @@ PREVIOUS SUMMARY:
 NEW TURNS TO INCORPORATE:
 {content_to_summarize}

-Update the summary using this exact structure. PRESERVE all existing information that is still relevant. ADD new progress. Move items from "In Progress" to "Done" when completed. Move answered questions to "Resolved Questions". Remove information only if it is clearly obsolete.
+Update the summary using this exact structure. PRESERVE all existing information that is still relevant. ADD new completed actions to the numbered list (continue numbering). Move items from "In Progress" to "Completed Actions" when done. Move answered questions to "Resolved Questions". Update "Active State" to reflect current state. Remove information only if it is clearly obsolete. CRITICAL: Update "## Active Task" to reflect the user's most recent unfulfilled request — this is the most important field for task continuity.

 {_template_sections}"""
        else:
@@ -450,7 +743,7 @@ The user has requested that this compaction PRIORITISE preserving all informatio
                    "api_mode": self.api_mode,
                },
                "messages": [{"role": "user", "content": prompt}],
-                "max_tokens": summary_budget * 2,
+                "max_tokens": int(summary_budget * 1.3),
                # timeout resolved from auxiliary.compression.timeout config by call_llm
            }
            if self.summary_model:
@@ -464,8 +757,10 @@ The user has requested that this compaction PRIORITISE preserving all informatio
            # Store for iterative updates on next compaction
            self._previous_summary = summary
            self._summary_failure_cooldown_until = 0.0
+            self._summary_model_fallen_back = False
            return self._with_summary_prefix(summary)
        except RuntimeError:
+            # No provider configured — long cooldown, unlikely to self-resolve
            self._summary_failure_cooldown_until = time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS
            logging.warning("Context compression: no provider available for "
                            "summary. Middle turns will be dropped without summary "
@@ -473,12 +768,42 @@ The user has requested that this compaction PRIORITISE preserving all informatio
                            _SUMMARY_FAILURE_COOLDOWN_SECONDS)
            return None
        except Exception as e:
-            self._summary_failure_cooldown_until = time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS
+            # If the summary model is different from the main model and the
+            # error looks permanent (model not found, 503, 404), fall back to
+            # using the main model instead of entering cooldown that leaves
+            # context growing unbounded.  (#8620 sub-issue 4)
+            _status = getattr(e, "status_code", None) or getattr(getattr(e, "response", None), "status_code", None)
+            _err_str = str(e).lower()
+            _is_model_not_found = (
+                _status in (404, 503)
+                or "model_not_found" in _err_str
+                or "does not exist" in _err_str
+                or "no available channel" in _err_str
+            )
+            if (
+                _is_model_not_found
+                and self.summary_model
+                and self.summary_model != self.model
+                and not getattr(self, "_summary_model_fallen_back", False)
+            ):
+                self._summary_model_fallen_back = True
+                logging.warning(
+                    "Summary model '%s' not available (%s). "
+                    "Falling back to main model '%s' for compression.",
+                    self.summary_model, e, self.model,
+                )
+                self.summary_model = ""  # empty = use main model
+                self._summary_failure_cooldown_until = 0.0  # no cooldown
+                return self._generate_summary(messages, summary_budget)  # retry immediately
+
+            # Transient errors (timeout, rate limit, network) — shorter cooldown
+            _transient_cooldown = 60
+            self._summary_failure_cooldown_until = time.monotonic() + _transient_cooldown
            logging.warning(
                "Failed to generate context summary: %s. "
                "Further summary attempts paused for %d seconds.",
                e,
-                _SUMMARY_FAILURE_COOLDOWN_SECONDS,
+                _transient_cooldown,
            )
            return None

@@ -601,6 +926,62 @@ The user has requested that this compaction PRIORITISE preserving all informatio
    # Tail protection by token budget
    # ------------------------------------------------------------------

+    def _find_last_user_message_idx(
+        self, messages: List[Dict[str, Any]], head_end: int
+    ) -> int:
+        """Return the index of the last user-role message at or after *head_end*, or -1."""
+        for i in range(len(messages) - 1, head_end - 1, -1):
+            if messages[i].get("role") == "user":
+                return i
+        return -1
+
+    def _ensure_last_user_message_in_tail(
+        self,
+        messages: List[Dict[str, Any]],
+        cut_idx: int,
+        head_end: int,
+    ) -> int:
+        """Guarantee the most recent user message is in the protected tail.
+
+        Context compressor bug (#10896): ``_align_boundary_backward`` can pull
+        ``cut_idx`` past a user message when it tries to keep tool_call/result
+        groups together.  If the last user message ends up in the *compressed*
+        middle region the LLM summariser writes it into "Pending User Asks",
+        but ``SUMMARY_PREFIX`` tells the next model to respond only to user
+        messages *after* the summary — so the task effectively disappears from
+        the active context, causing the agent to stall, repeat completed work,
+        or silently drop the user's latest request.
+
+        Fix: if the last user-role message is not already in the tail
+        (``messages[cut_idx:]``), walk ``cut_idx`` back to include it.  We
+        then re-align backward one more time to avoid splitting any
+        tool_call/result group that immediately precedes the user message.
+        """
+        last_user_idx = self._find_last_user_message_idx(messages, head_end)
+        if last_user_idx < 0:
+            # No user message found beyond head — nothing to anchor.
+            return cut_idx
+
+        if last_user_idx >= cut_idx:
+            # Already in the tail; nothing to do.
+            return cut_idx
+
+        # The last user message is in the middle (compressed) region.
+        # Pull cut_idx back to it directly — a user message is already a
+        # clean boundary (no tool_call/result splitting risk), so there is no
+        # need to call _align_boundary_backward here; doing so would
+        # unnecessarily pull the cut further back into the preceding
+        # assistant + tool_calls group.
+        if not self.quiet_mode:
+            logger.debug(
+                "Anchoring tail cut to last user message at index %d "
+                "(was %d) to prevent active-task loss after compression",
+                last_user_idx,
+                cut_idx,
+            )
+        # Safety: never go back into the head region.
+        return max(last_user_idx, head_end + 1)
+
    def _find_tail_cut_by_tokens(
        self, messages: List[Dict[str, Any]], head_end: int,
        token_budget: int | None = None,
@@ -618,7 +999,8 @@ The user has requested that this compaction PRIORITISE preserving all informatio
        read, etc.).  If even the minimum 3 messages exceed 1.5x the budget
        the cut is placed right after the head so compression still runs.

-        Never cuts inside a tool_call/result group.
+        Never cuts inside a tool_call/result group.  Always ensures the most
+        recent user message is in the tail (see ``_ensure_last_user_message_in_tail``).
        """
        if token_budget is None:
            token_budget = self.tail_token_budget
@@ -657,6 +1039,10 @@ The user has requested that this compaction PRIORITISE preserving all informatio
        # Align to avoid splitting tool groups
        cut_idx = self._align_boundary_backward(messages, cut_idx)

+        # Ensure the most recent user message is always in the tail so the
+        # active task is never lost to compression (fixes #10896).
+        cut_idx = self._ensure_last_user_message_in_tail(messages, cut_idx, head_end)
+
        return max(cut_idx, head_end + 1)

    # ------------------------------------------------------------------
@@ -744,11 +1130,11 @@ The user has requested that this compaction PRIORITISE preserving all informatio
        compressed = []
        for i in range(compress_start):
            msg = messages[i].copy()
-            if i == 0 and msg.get("role") == "system" and self.compression_count == 0:
-                msg["content"] = (
-                    (msg.get("content") or "")
-                    + "\n\n[Note: Some earlier conversation turns have been compacted into a handoff summary to preserve context space. The current session state may still reflect earlier work, so build on that summary and state rather than re-doing work.]"
-                )
+            if i == 0 and msg.get("role") == "system":
+                existing = msg.get("content") or ""
+                _compression_note = "[Note: Some earlier conversation turns have been compacted into a handoff summary to preserve context space. The current session state may still reflect earlier work, so build on that summary and state rather than re-doing work.]"
+                if _compression_note not in existing:
+                    msg["content"] = existing + "\n\n" + _compression_note
            compressed.append(msg)

        # If LLM summary failed, insert a static fallback so the model
@@ -806,14 +1192,24 @@ The user has requested that this compaction PRIORITISE preserving all informatio

        compressed = self._sanitize_tool_pairs(compressed)

+        new_estimate = estimate_messages_tokens_rough(compressed)
+        saved_estimate = display_tokens - new_estimate
+
+        # Anti-thrashing: track compression effectiveness
+        savings_pct = (saved_estimate / display_tokens * 100) if display_tokens > 0 else 0
+        self._last_compression_savings_pct = savings_pct
+        if savings_pct < 10:
+            self._ineffective_compression_count += 1
+        else:
+            self._ineffective_compression_count = 0
+
        if not self.quiet_mode:
-            new_estimate = estimate_messages_tokens_rough(compressed)
-            saved_estimate = display_tokens - new_estimate
            logger.info(
-                "Compressed: %d -> %d messages (~%d tokens saved)",
+                "Compressed: %d -> %d messages (~%d tokens saved, %.0f%%)",
                n_messages,
                len(compressed),
                saved_estimate,
+                savings_pct,
            )
            logger.info("Compression #%d complete", self.compression_count)

@@ -313,9 +313,25 @@ class CopilotACPClient:
            tools=tools,
            tool_choice=tool_choice,
        )
+        # Normalise timeout: run_agent.py may pass an httpx.Timeout object
+        # (used natively by the OpenAI SDK) rather than a plain float.
+        if timeout is None:
+            _effective_timeout = _DEFAULT_TIMEOUT_SECONDS
+        elif isinstance(timeout, (int, float)):
+            _effective_timeout = float(timeout)
+        else:
+            # httpx.Timeout or similar — pick the largest component so the
+            # subprocess has enough wall-clock time for the full response.
+            _candidates = [
+                getattr(timeout, attr, None)
+                for attr in ("read", "write", "connect", "pool", "timeout")
+            ]
+            _numeric = [float(v) for v in _candidates if isinstance(v, (int, float))]
+            _effective_timeout = max(_numeric) if _numeric else _DEFAULT_TIMEOUT_SECONDS
+
        response_text, reasoning_text = self._run_prompt(
            prompt_text,
-            timeout_seconds=float(timeout or _DEFAULT_TIMEOUT_SECONDS),
+            timeout_seconds=_effective_timeout,
        )

        tool_calls, cleaned_text = _extract_tool_calls_from_text(response_text)
@@ -22,8 +22,6 @@ from hermes_cli.auth import (
    _auth_store_lock,
    _codex_access_token_is_expiring,
    _decode_jwt_claims,
-    _import_codex_cli_tokens,
-    _write_codex_cli_tokens,
    _load_auth_store,
    _load_provider_state,
    _resolve_kimi_base_url,
@@ -457,39 +455,6 @@ class CredentialPool:
            logger.debug("Failed to sync from credentials file: %s", exc)
        return entry

-    def _sync_codex_entry_from_cli(self, entry: PooledCredential) -> PooledCredential:
-        """Sync an openai-codex pool entry from ~/.codex/auth.json if tokens differ.
-
-        OpenAI OAuth refresh tokens are single-use and rotate on every refresh.
-        When the Codex CLI (or another Hermes profile) refreshes its token,
-        the pool entry's refresh_token becomes stale.  This method detects that
-        by comparing against ~/.codex/auth.json and syncing the fresh pair.
-        """
-        if self.provider != "openai-codex":
-            return entry
-        try:
-            cli_tokens = _import_codex_cli_tokens()
-            if not cli_tokens:
-                return entry
-            cli_refresh = cli_tokens.get("refresh_token", "")
-            cli_access = cli_tokens.get("access_token", "")
-            if cli_refresh and cli_refresh != entry.refresh_token:
-                logger.debug("Pool entry %s: syncing tokens from ~/.codex/auth.json (refresh token changed)", entry.id)
-                updated = replace(
-                    entry,
-                    access_token=cli_access,
-                    refresh_token=cli_refresh,
-                    last_status=None,
-                    last_status_at=None,
-                    last_error_code=None,
-                )
-                self._replace_entry(entry, updated)
-                self._persist()
-                return updated
-        except Exception as exc:
-            logger.debug("Failed to sync from ~/.codex/auth.json: %s", exc)
-        return entry
-
    def _sync_device_code_entry_to_auth_store(self, entry: PooledCredential) -> None:
        """Write refreshed pool entry tokens back to auth.json providers.

@@ -585,13 +550,6 @@ class CredentialPool:
                    except Exception as wexc:
                        logger.debug("Failed to write refreshed token to credentials file: %s", wexc)
            elif self.provider == "openai-codex":
-                # Proactively sync from ~/.codex/auth.json before refresh.
-                # The Codex CLI (or another Hermes profile) may have already
-                # consumed our refresh_token.  Syncing first avoids a
-                # "refresh_token_reused" error when the CLI has a newer pair.
-                synced = self._sync_codex_entry_from_cli(entry)
-                if synced is not entry:
-                    entry = synced
                refreshed = auth_mod.refresh_codex_oauth_pure(
                    entry.access_token,
                    entry.refresh_token,
@@ -677,45 +635,6 @@ class CredentialPool:
                    # Credentials file had a valid (non-expired) token — use it directly
                    logger.debug("Credentials file has valid token, using without refresh")
                    return synced
-            # For openai-codex: the refresh_token may have been consumed by
-            # the Codex CLI between our proactive sync and the refresh call.
-            # Re-sync and retry once.
-            if self.provider == "openai-codex":
-                synced = self._sync_codex_entry_from_cli(entry)
-                if synced.refresh_token != entry.refresh_token:
-                    logger.debug("Retrying Codex refresh with synced token from ~/.codex/auth.json")
-                    try:
-                        refreshed = auth_mod.refresh_codex_oauth_pure(
-                            synced.access_token,
-                            synced.refresh_token,
-                        )
-                        updated = replace(
-                            synced,
-                            access_token=refreshed["access_token"],
-                            refresh_token=refreshed["refresh_token"],
-                            last_refresh=refreshed.get("last_refresh"),
-                            last_status=STATUS_OK,
-                            last_status_at=None,
-                            last_error_code=None,
-                        )
-                        self._replace_entry(synced, updated)
-                        self._persist()
-                        self._sync_device_code_entry_to_auth_store(updated)
-                        try:
-                            _write_codex_cli_tokens(
-                                updated.access_token,
-                                updated.refresh_token,
-                                last_refresh=updated.last_refresh,
-                            )
-                        except Exception as wexc:
-                            logger.debug("Failed to write refreshed Codex tokens to CLI file (retry): %s", wexc)
-                        return updated
-                    except Exception as retry_exc:
-                        logger.debug("Codex retry refresh also failed: %s", retry_exc)
-                elif not self._entry_needs_refresh(synced):
-                    logger.debug("Codex CLI has valid token, using without refresh")
-                    self._sync_device_code_entry_to_auth_store(synced)
-                    return synced
            self._mark_exhausted(entry, None)
            return None

@@ -734,17 +653,6 @@ class CredentialPool:
        # _seed_from_singletons() on the next load_pool() sees fresh state
        # instead of re-seeding stale/consumed tokens.
        self._sync_device_code_entry_to_auth_store(updated)
-        # Write refreshed tokens back to ~/.codex/auth.json so Codex CLI
-        # and VS Code don't hit "refresh_token_reused" on their next refresh.
-        if self.provider == "openai-codex":
-            try:
-                _write_codex_cli_tokens(
-                    updated.access_token,
-                    updated.refresh_token,
-                    last_refresh=updated.last_refresh,
-                )
-            except Exception as wexc:
-                logger.debug("Failed to write refreshed Codex tokens to CLI file: %s", wexc)
        return updated

    def _entry_needs_refresh(self, entry: PooledCredential) -> bool:
@@ -790,16 +698,6 @@ class CredentialPool:
                if synced is not entry:
                    entry = synced
                    cleared_any = True
-            # For openai-codex entries, sync from ~/.codex/auth.json before
-            # any status/refresh checks.  This picks up tokens refreshed by
-            # the Codex CLI or another Hermes profile.
-            if (self.provider == "openai-codex"
-                    and entry.last_status == STATUS_EXHAUSTED
-                    and entry.refresh_token):
-                synced = self._sync_codex_entry_from_cli(entry)
-                if synced is not entry:
-                    entry = synced
-                    cleared_any = True
            if entry.last_status == STATUS_EXHAUSTED:
                exhausted_until = _exhausted_until(entry)
                if exhausted_until is not None and now < exhausted_until:
@@ -1130,6 +1028,14 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
        state = _load_provider_state(auth_store, "nous")
        if state:
            active_sources.add("device_code")
+            # Prefer a user-supplied label embedded in the singleton state
+            # (set by persist_nous_credentials(label=...) when the user ran
+            # `hermes auth add nous --label <name>`).  Fall back to the
+            # auto-derived token fingerprint for logins that didn't supply one.
+            custom_label = str(state.get("label") or "").strip()
+            seeded_label = custom_label or label_from_token(
+                state.get("access_token", ""), "device_code"
+            )
            changed |= _upsert_entry(
                entries,
                provider,
@@ -1148,7 +1054,7 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
                    "agent_key": state.get("agent_key"),
                    "agent_key_expires_at": state.get("agent_key_expires_at"),
                    "tls": state.get("tls") if isinstance(state.get("tls"), dict) else None,
-                    "label": label_from_token(state.get("access_token", ""), "device_code"),
+                    "label": seeded_label,
                },
            )

@@ -1162,6 +1068,7 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
            if token:
                source_name = "gh_cli" if "gh" in source.lower() else f"env:{source}"
                active_sources.add(source_name)
+                pconfig = PROVIDER_REGISTRY.get(provider)
                changed |= _upsert_entry(
                    entries,
                    provider,
@@ -1170,6 +1077,7 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
                        "source": source_name,
                        "auth_type": AUTH_TYPE_API_KEY,
                        "access_token": token,
+                        "base_url": pconfig.inference_base_url if pconfig else "",
                        "label": source,
                    },
                )
@@ -1206,25 +1114,27 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
            logger.debug("Qwen OAuth token seed failed: %s", exc)

    elif provider == "openai-codex":
+        # Respect user suppression — `hermes auth remove openai-codex` marks
+        # the device_code source as suppressed so it won't be re-seeded from
+        # the Hermes auth store.  Without this gate the removal is instantly
+        # undone on the next load_pool() call.
+        codex_suppressed = False
+        try:
+            from hermes_cli.auth import is_source_suppressed
+            codex_suppressed = is_source_suppressed(provider, "device_code")
+        except ImportError:
+            pass
+        if codex_suppressed:
+            return changed, active_sources
+
        state = _load_provider_state(auth_store, "openai-codex")
        tokens = state.get("tokens") if isinstance(state, dict) else None
-        # Fallback: import from Codex CLI (~/.codex/auth.json) if Hermes auth
-        # store has no tokens.  This mirrors resolve_codex_runtime_credentials()
-        # so that load_pool() and list_authenticated_providers() detect tokens
-        # that only exist in the Codex CLI shared file.
-        if not (isinstance(tokens, dict) and tokens.get("access_token")):
-            try:
-                from hermes_cli.auth import _import_codex_cli_tokens, _save_codex_tokens
-                cli_tokens = _import_codex_cli_tokens()
-                if cli_tokens:
-                    logger.info("Importing Codex CLI tokens into Hermes auth store.")
-                    _save_codex_tokens(cli_tokens)
-                    # Re-read state after import
-                    auth_store = _load_auth_store()
-                    state = _load_provider_state(auth_store, "openai-codex")
-                    tokens = state.get("tokens") if isinstance(state, dict) else None
-            except Exception as exc:
-                logger.debug("Codex CLI token import failed: %s", exc)
+        # Hermes owns its own Codex auth state — we do NOT auto-import from
+        # ~/.codex/auth.json at pool-load time.  OAuth refresh tokens are
+        # single-use, so sharing them with Codex CLI / VS Code causes
+        # refresh_token_reused race failures.  Users who want to adopt
+        # existing Codex CLI credentials get a one-time, explicit prompt
+        # via `hermes auth openai-codex`.
        if isinstance(tokens, dict) and tokens.get("access_token"):
            active_sources.add("device_code")
            changed |= _upsert_entry(
@@ -600,6 +600,45 @@ class KawaiiSpinner:
        "analyzing", "computing", "synthesizing", "formulating", "brainstorming",
    ]

+    @classmethod
+    def get_waiting_faces(cls) -> list:
+        """Return waiting faces from the active skin, falling back to KAWAII_WAITING."""
+        try:
+            skin = _get_skin()
+            if skin:
+                faces = skin.spinner.get("waiting_faces", [])
+                if faces:
+                    return faces
+        except Exception:
+            pass
+        return cls.KAWAII_WAITING
+
+    @classmethod
+    def get_thinking_faces(cls) -> list:
+        """Return thinking faces from the active skin, falling back to KAWAII_THINKING."""
+        try:
+            skin = _get_skin()
+            if skin:
+                faces = skin.spinner.get("thinking_faces", [])
+                if faces:
+                    return faces
+        except Exception:
+            pass
+        return cls.KAWAII_THINKING
+
+    @classmethod
+    def get_thinking_verbs(cls) -> list:
+        """Return thinking verbs from the active skin, falling back to THINKING_VERBS."""
+        try:
+            skin = _get_skin()
+            if skin:
+                verbs = skin.spinner.get("thinking_verbs", [])
+                if verbs:
+                    return verbs
+        except Exception:
+            pass
+        return cls.THINKING_VERBS
+
    def __init__(self, message: str = "", spinner_type: str = 'dots', print_fn=None):
        self.message = message
        self.spinner_frames = self.SPINNERS.get(spinner_type, self.SPINNERS['dots'])
@@ -954,84 +993,4 @@ def get_cute_tool_message(
 # Honcho session line (one-liner with clickable OSC 8 hyperlink)
 # =========================================================================

-_DIM = "\033[2m"
-_SKY_BLUE = "\033[38;5;117m"
-_ANSI_RESET = "\033[0m"

-
-# =========================================================================
-# Context pressure display (CLI user-facing warnings)
-# =========================================================================
-
-# ANSI color codes for context pressure tiers
-_CYAN = "\033[36m"
-_YELLOW = "\033[33m"
-_BOLD = "\033[1m"
-_DIM_ANSI = "\033[2m"
-
-# Bar characters
-_BAR_FILLED = "▰"
-_BAR_EMPTY = "▱"
-_BAR_WIDTH = 20
-
-
-def format_context_pressure(
-    compaction_progress: float,
-    threshold_tokens: int,
-    threshold_percent: float,
-    compression_enabled: bool = True,
-) -> str:
-    """Build a formatted context pressure line for CLI display.
-
-    The bar and percentage show progress toward the compaction threshold,
-    NOT the raw context window.  100% = compaction fires.
-
-    Args:
-        compaction_progress: How close to compaction (0.0–1.0, 1.0 = fires).
-        threshold_tokens: Compaction threshold in tokens.
-        threshold_percent: Compaction threshold as a fraction of context window.
-        compression_enabled: Whether auto-compression is active.
-    """
-    pct_int = min(int(compaction_progress * 100), 100)
-    filled = min(int(compaction_progress * _BAR_WIDTH), _BAR_WIDTH)
-    bar = _BAR_FILLED * filled + _BAR_EMPTY * (_BAR_WIDTH - filled)
-
-    threshold_k = f"{threshold_tokens // 1000}k" if threshold_tokens >= 1000 else str(threshold_tokens)
-    threshold_pct_int = int(threshold_percent * 100)
-
-    color = f"{_BOLD}{_YELLOW}"
-    icon = "⚠"
-    if compression_enabled:
-        hint = "compaction approaching"
-    else:
-        hint = "no auto-compaction"
-
-    return (
-        f"  {color}{icon} context {bar} {pct_int}% to compaction{_ANSI_RESET}"
-        f"  {_DIM_ANSI}{threshold_k} threshold ({threshold_pct_int}%) · {hint}{_ANSI_RESET}"
-    )
-
-
-def format_context_pressure_gateway(
-    compaction_progress: float,
-    threshold_percent: float,
-    compression_enabled: bool = True,
-) -> str:
-    """Build a plain-text context pressure notification for messaging platforms.
-
-    No ANSI — just Unicode and plain text suitable for Telegram/Discord/etc.
-    The percentage shows progress toward the compaction threshold.
-    """
-    pct_int = min(int(compaction_progress * 100), 100)
-    filled = min(int(compaction_progress * _BAR_WIDTH), _BAR_WIDTH)
-    bar = _BAR_FILLED * filled + _BAR_EMPTY * (_BAR_WIDTH - filled)
-
-    threshold_pct_int = int(threshold_percent * 100)
-
-    icon = "⚠️"
-    if compression_enabled:
-        hint = f"Context compaction approaching (threshold: {threshold_pct_int}% of window)."
-    else:
-        hint = "Auto-compaction is disabled — context may be truncated."
-
-    return f"{icon} Context: {bar} {pct_int}% to compaction\n{hint}"
@@ -112,6 +112,10 @@ _RATE_LIMIT_PATTERNS = [
    "please retry after",
    "resource_exhausted",
    "rate increased too quickly",  # Alibaba/DashScope throttling
+    # AWS Bedrock throttling
+    "throttlingexception",
+    "too many concurrent requests",
+    "servicequotaexceededexception",
 ]

 # Usage-limit patterns that need disambiguation (could be billing OR rate_limit)
@@ -171,6 +175,11 @@ _CONTEXT_OVERFLOW_PATTERNS = [
    # Chinese error messages (some providers return these)
    "超过最大长度",
    "上下文长度",
+    # AWS Bedrock Converse API error patterns
+    "input is too long",
+    "max input token",
+    "input token",
+    "exceeds the maximum number of input tokens",
 ]

 # Model not found patterns
@@ -0,0 +1,895 @@
+"""OpenAI-compatible facade that talks to Google's Cloud Code Assist backend.
+
+This adapter lets Hermes use the ``google-gemini-cli`` provider as if it were
+a standard OpenAI-shaped chat completion endpoint, while the underlying HTTP
+traffic goes to ``cloudcode-pa.googleapis.com/v1internal:{generateContent,
+streamGenerateContent}`` with a Bearer access token obtained via OAuth PKCE.
+
+Architecture
+------------
+- ``GeminiCloudCodeClient`` exposes ``.chat.completions.create(**kwargs)``
+  mirroring the subset of the OpenAI SDK that ``run_agent.py`` uses.
+- Incoming OpenAI ``messages[]`` / ``tools[]`` / ``tool_choice`` are translated
+  to Gemini's native ``contents[]`` / ``tools[].functionDeclarations`` /
+  ``toolConfig`` / ``systemInstruction`` shape.
+- The request body is wrapped ``{project, model, user_prompt_id, request}``
+  per Code Assist API expectations.
+- Responses (``candidates[].content.parts[]``) are converted back to
+  OpenAI ``choices[0].message`` shape with ``content`` + ``tool_calls``.
+- Streaming uses SSE (``?alt=sse``) and yields OpenAI-shaped delta chunks.
+
+Attribution
+-----------
+Translation semantics follow jenslys/opencode-gemini-auth (MIT) and the public
+Gemini API docs. Request envelope shape
+(``{project, model, user_prompt_id, request}``) is documented nowhere; it is
+reverse-engineered from the opencode-gemini-auth and clawdbot implementations.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import time
+import uuid
+from types import SimpleNamespace
+from typing import Any, Dict, Iterator, List, Optional
+
+import httpx
+
+from agent import google_oauth
+from agent.google_code_assist import (
+    CODE_ASSIST_ENDPOINT,
+    FREE_TIER_ID,
+    CodeAssistError,
+    ProjectContext,
+    resolve_project_context,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# Request translation: OpenAI → Gemini
+# =============================================================================
+
+_ROLE_MAP_OPENAI_TO_GEMINI = {
+    "user": "user",
+    "assistant": "model",
+    "system": "user",   # handled separately via systemInstruction
+    "tool": "user",     # functionResponse is wrapped in a user-role turn
+    "function": "user",
+}
+
+
+def _coerce_content_to_text(content: Any) -> str:
+    """OpenAI content may be str or a list of parts; reduce to plain text."""
+    if content is None:
+        return ""
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        pieces: List[str] = []
+        for p in content:
+            if isinstance(p, str):
+                pieces.append(p)
+            elif isinstance(p, dict):
+                if p.get("type") == "text" and isinstance(p.get("text"), str):
+                    pieces.append(p["text"])
+                # Multimodal (image_url, etc.) — stub for now; log and skip
+                elif p.get("type") in ("image_url", "input_audio"):
+                    logger.debug("Dropping multimodal part (not yet supported): %s", p.get("type"))
+        return "\n".join(pieces)
+    return str(content)
+
+
+def _translate_tool_call_to_gemini(tool_call: Dict[str, Any]) -> Dict[str, Any]:
+    """OpenAI tool_call -> Gemini functionCall part."""
+    fn = tool_call.get("function") or {}
+    args_raw = fn.get("arguments", "")
+    try:
+        args = json.loads(args_raw) if isinstance(args_raw, str) and args_raw else {}
+    except json.JSONDecodeError:
+        args = {"_raw": args_raw}
+    if not isinstance(args, dict):
+        args = {"_value": args}
+    return {
+        "functionCall": {
+            "name": fn.get("name") or "",
+            "args": args,
+        },
+        # Sentinel signature — matches opencode-gemini-auth's approach.
+        # Without this, Code Assist rejects function calls that originated
+        # outside its own chain.
+        "thoughtSignature": "skip_thought_signature_validator",
+    }
+
+
+def _translate_tool_result_to_gemini(message: Dict[str, Any]) -> Dict[str, Any]:
+    """OpenAI tool-role message -> Gemini functionResponse part.
+
+    The function name isn't in the OpenAI tool message directly; it must be
+    passed via the assistant message that issued the call. For simplicity we
+    look up ``name`` on the message (OpenAI SDK copies it there) or on the
+    ``tool_call_id`` cross-reference.
+    """
+    name = str(message.get("name") or message.get("tool_call_id") or "tool")
+    content = _coerce_content_to_text(message.get("content"))
+    # Gemini expects the response as a dict under `response`. We wrap plain
+    # text in {"output": "..."}.
+    try:
+        parsed = json.loads(content) if content.strip().startswith(("{", "[")) else None
+    except json.JSONDecodeError:
+        parsed = None
+    response = parsed if isinstance(parsed, dict) else {"output": content}
+    return {
+        "functionResponse": {
+            "name": name,
+            "response": response,
+        },
+    }
+
+
+def _build_gemini_contents(
+    messages: List[Dict[str, Any]],
+) -> tuple[List[Dict[str, Any]], Optional[Dict[str, Any]]]:
+    """Convert OpenAI messages[] to Gemini contents[] + systemInstruction."""
+    system_text_parts: List[str] = []
+    contents: List[Dict[str, Any]] = []
+
+    for msg in messages:
+        if not isinstance(msg, dict):
+            continue
+        role = str(msg.get("role") or "user")
+
+        if role == "system":
+            system_text_parts.append(_coerce_content_to_text(msg.get("content")))
+            continue
+
+        # Tool result message — emit a user-role turn with functionResponse
+        if role == "tool" or role == "function":
+            contents.append({
+                "role": "user",
+                "parts": [_translate_tool_result_to_gemini(msg)],
+            })
+            continue
+
+        gemini_role = _ROLE_MAP_OPENAI_TO_GEMINI.get(role, "user")
+        parts: List[Dict[str, Any]] = []
+
+        text = _coerce_content_to_text(msg.get("content"))
+        if text:
+            parts.append({"text": text})
+
+        # Assistant messages can carry tool_calls
+        tool_calls = msg.get("tool_calls") or []
+        if isinstance(tool_calls, list):
+            for tc in tool_calls:
+                if isinstance(tc, dict):
+                    parts.append(_translate_tool_call_to_gemini(tc))
+
+        if not parts:
+            # Gemini rejects empty parts; skip the turn entirely
+            continue
+
+        contents.append({"role": gemini_role, "parts": parts})
+
+    system_instruction: Optional[Dict[str, Any]] = None
+    joined_system = "\n".join(p for p in system_text_parts if p).strip()
+    if joined_system:
+        system_instruction = {
+            "role": "system",
+            "parts": [{"text": joined_system}],
+        }
+
+    return contents, system_instruction
+
+
+def _translate_tools_to_gemini(tools: Any) -> List[Dict[str, Any]]:
+    """OpenAI tools[] -> Gemini tools[].functionDeclarations[]."""
+    if not isinstance(tools, list) or not tools:
+        return []
+    declarations: List[Dict[str, Any]] = []
+    for t in tools:
+        if not isinstance(t, dict):
+            continue
+        fn = t.get("function") or {}
+        if not isinstance(fn, dict):
+            continue
+        name = fn.get("name")
+        if not name:
+            continue
+        decl = {"name": str(name)}
+        if fn.get("description"):
+            decl["description"] = str(fn["description"])
+        params = fn.get("parameters")
+        if isinstance(params, dict):
+            decl["parameters"] = params
+        declarations.append(decl)
+    if not declarations:
+        return []
+    return [{"functionDeclarations": declarations}]
+
+
+def _translate_tool_choice_to_gemini(tool_choice: Any) -> Optional[Dict[str, Any]]:
+    """OpenAI tool_choice -> Gemini toolConfig.functionCallingConfig."""
+    if tool_choice is None:
+        return None
+    if isinstance(tool_choice, str):
+        if tool_choice == "auto":
+            return {"functionCallingConfig": {"mode": "AUTO"}}
+        if tool_choice == "required":
+            return {"functionCallingConfig": {"mode": "ANY"}}
+        if tool_choice == "none":
+            return {"functionCallingConfig": {"mode": "NONE"}}
+    if isinstance(tool_choice, dict):
+        fn = tool_choice.get("function") or {}
+        name = fn.get("name")
+        if name:
+            return {
+                "functionCallingConfig": {
+                    "mode": "ANY",
+                    "allowedFunctionNames": [str(name)],
+                },
+            }
+    return None
+
+
+def _normalize_thinking_config(config: Any) -> Optional[Dict[str, Any]]:
+    """Accept thinkingBudget / thinkingLevel / includeThoughts (+ snake_case)."""
+    if not isinstance(config, dict) or not config:
+        return None
+    budget = config.get("thinkingBudget", config.get("thinking_budget"))
+    level = config.get("thinkingLevel", config.get("thinking_level"))
+    include = config.get("includeThoughts", config.get("include_thoughts"))
+    normalized: Dict[str, Any] = {}
+    if isinstance(budget, (int, float)):
+        normalized["thinkingBudget"] = int(budget)
+    if isinstance(level, str) and level.strip():
+        normalized["thinkingLevel"] = level.strip().lower()
+    if isinstance(include, bool):
+        normalized["includeThoughts"] = include
+    return normalized or None
+
+
+def build_gemini_request(
+    *,
+    messages: List[Dict[str, Any]],
+    tools: Any = None,
+    tool_choice: Any = None,
+    temperature: Optional[float] = None,
+    max_tokens: Optional[int] = None,
+    top_p: Optional[float] = None,
+    stop: Any = None,
+    thinking_config: Any = None,
+) -> Dict[str, Any]:
+    """Build the inner Gemini request body (goes inside ``request`` wrapper)."""
+    contents, system_instruction = _build_gemini_contents(messages)
+
+    body: Dict[str, Any] = {"contents": contents}
+    if system_instruction is not None:
+        body["systemInstruction"] = system_instruction
+
+    gemini_tools = _translate_tools_to_gemini(tools)
+    if gemini_tools:
+        body["tools"] = gemini_tools
+    tool_cfg = _translate_tool_choice_to_gemini(tool_choice)
+    if tool_cfg is not None:
+        body["toolConfig"] = tool_cfg
+
+    generation_config: Dict[str, Any] = {}
+    if isinstance(temperature, (int, float)):
+        generation_config["temperature"] = float(temperature)
+    if isinstance(max_tokens, int) and max_tokens > 0:
+        generation_config["maxOutputTokens"] = max_tokens
+    if isinstance(top_p, (int, float)):
+        generation_config["topP"] = float(top_p)
+    if isinstance(stop, str) and stop:
+        generation_config["stopSequences"] = [stop]
+    elif isinstance(stop, list) and stop:
+        generation_config["stopSequences"] = [str(s) for s in stop if s]
+    normalized_thinking = _normalize_thinking_config(thinking_config)
+    if normalized_thinking:
+        generation_config["thinkingConfig"] = normalized_thinking
+    if generation_config:
+        body["generationConfig"] = generation_config
+
+    return body
+
+
+def wrap_code_assist_request(
+    *,
+    project_id: str,
+    model: str,
+    inner_request: Dict[str, Any],
+    user_prompt_id: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Wrap the inner Gemini request in the Code Assist envelope."""
+    return {
+        "project": project_id,
+        "model": model,
+        "user_prompt_id": user_prompt_id or str(uuid.uuid4()),
+        "request": inner_request,
+    }
+
+
+# =============================================================================
+# Response translation: Gemini → OpenAI
+# =============================================================================
+
+def _translate_gemini_response(
+    resp: Dict[str, Any],
+    model: str,
+) -> SimpleNamespace:
+    """Non-streaming Gemini response -> OpenAI-shaped SimpleNamespace.
+
+    Code Assist wraps the actual Gemini response inside ``response``, so we
+    unwrap it first if present.
+    """
+    inner = resp.get("response") if isinstance(resp.get("response"), dict) else resp
+
+    candidates = inner.get("candidates") or []
+    if not isinstance(candidates, list) or not candidates:
+        return _empty_response(model)
+
+    cand = candidates[0]
+    content_obj = cand.get("content") if isinstance(cand, dict) else {}
+    parts = content_obj.get("parts") if isinstance(content_obj, dict) else []
+
+    text_pieces: List[str] = []
+    reasoning_pieces: List[str] = []
+    tool_calls: List[SimpleNamespace] = []
+
+    for i, part in enumerate(parts or []):
+        if not isinstance(part, dict):
+            continue
+        # Thought parts are model's internal reasoning — surface as reasoning,
+        # don't mix into content.
+        if part.get("thought") is True:
+            if isinstance(part.get("text"), str):
+                reasoning_pieces.append(part["text"])
+            continue
+        if isinstance(part.get("text"), str):
+            text_pieces.append(part["text"])
+            continue
+        fc = part.get("functionCall")
+        if isinstance(fc, dict) and fc.get("name"):
+            try:
+                args_str = json.dumps(fc.get("args") or {}, ensure_ascii=False)
+            except (TypeError, ValueError):
+                args_str = "{}"
+            tool_calls.append(SimpleNamespace(
+                id=f"call_{uuid.uuid4().hex[:12]}",
+                type="function",
+                index=i,
+                function=SimpleNamespace(name=str(fc["name"]), arguments=args_str),
+            ))
+
+    finish_reason = "tool_calls" if tool_calls else _map_gemini_finish_reason(
+        str(cand.get("finishReason") or "")
+    )
+
+    usage_meta = inner.get("usageMetadata") or {}
+    usage = SimpleNamespace(
+        prompt_tokens=int(usage_meta.get("promptTokenCount") or 0),
+        completion_tokens=int(usage_meta.get("candidatesTokenCount") or 0),
+        total_tokens=int(usage_meta.get("totalTokenCount") or 0),
+        prompt_tokens_details=SimpleNamespace(
+            cached_tokens=int(usage_meta.get("cachedContentTokenCount") or 0),
+        ),
+    )
+
+    message = SimpleNamespace(
+        role="assistant",
+        content="".join(text_pieces) if text_pieces else None,
+        tool_calls=tool_calls or None,
+        reasoning="".join(reasoning_pieces) or None,
+        reasoning_content="".join(reasoning_pieces) or None,
+        reasoning_details=None,
+    )
+    choice = SimpleNamespace(
+        index=0,
+        message=message,
+        finish_reason=finish_reason,
+    )
+    return SimpleNamespace(
+        id=f"chatcmpl-{uuid.uuid4().hex[:12]}",
+        object="chat.completion",
+        created=int(time.time()),
+        model=model,
+        choices=[choice],
+        usage=usage,
+    )
+
+
+def _empty_response(model: str) -> SimpleNamespace:
+    message = SimpleNamespace(
+        role="assistant", content="", tool_calls=None,
+        reasoning=None, reasoning_content=None, reasoning_details=None,
+    )
+    choice = SimpleNamespace(index=0, message=message, finish_reason="stop")
+    usage = SimpleNamespace(
+        prompt_tokens=0, completion_tokens=0, total_tokens=0,
+        prompt_tokens_details=SimpleNamespace(cached_tokens=0),
+    )
+    return SimpleNamespace(
+        id=f"chatcmpl-{uuid.uuid4().hex[:12]}",
+        object="chat.completion",
+        created=int(time.time()),
+        model=model,
+        choices=[choice],
+        usage=usage,
+    )
+
+
+def _map_gemini_finish_reason(reason: str) -> str:
+    mapping = {
+        "STOP": "stop",
+        "MAX_TOKENS": "length",
+        "SAFETY": "content_filter",
+        "RECITATION": "content_filter",
+        "OTHER": "stop",
+    }
+    return mapping.get(reason.upper(), "stop")
+
+
+# =============================================================================
+# Streaming SSE iterator
+# =============================================================================
+
+class _GeminiStreamChunk(SimpleNamespace):
+    """Mimics an OpenAI ChatCompletionChunk with .choices[0].delta."""
+    pass
+
+
+def _make_stream_chunk(
+    *,
+    model: str,
+    content: str = "",
+    tool_call_delta: Optional[Dict[str, Any]] = None,
+    finish_reason: Optional[str] = None,
+    reasoning: str = "",
+) -> _GeminiStreamChunk:
+    delta_kwargs: Dict[str, Any] = {"role": "assistant"}
+    if content:
+        delta_kwargs["content"] = content
+    if tool_call_delta is not None:
+        delta_kwargs["tool_calls"] = [SimpleNamespace(
+            index=tool_call_delta.get("index", 0),
+            id=tool_call_delta.get("id") or f"call_{uuid.uuid4().hex[:12]}",
+            type="function",
+            function=SimpleNamespace(
+                name=tool_call_delta.get("name") or "",
+                arguments=tool_call_delta.get("arguments") or "",
+            ),
+        )]
+    if reasoning:
+        delta_kwargs["reasoning"] = reasoning
+        delta_kwargs["reasoning_content"] = reasoning
+    delta = SimpleNamespace(**delta_kwargs)
+    choice = SimpleNamespace(index=0, delta=delta, finish_reason=finish_reason)
+    return _GeminiStreamChunk(
+        id=f"chatcmpl-{uuid.uuid4().hex[:12]}",
+        object="chat.completion.chunk",
+        created=int(time.time()),
+        model=model,
+        choices=[choice],
+        usage=None,
+    )
+
+
+def _iter_sse_events(response: httpx.Response) -> Iterator[Dict[str, Any]]:
+    """Parse Server-Sent Events from an httpx streaming response."""
+    buffer = ""
+    for chunk in response.iter_text():
+        if not chunk:
+            continue
+        buffer += chunk
+        while "\n" in buffer:
+            line, buffer = buffer.split("\n", 1)
+            line = line.rstrip("\r")
+            if not line:
+                continue
+            if line.startswith("data: "):
+                data = line[6:]
+                if data == "[DONE]":
+                    return
+                try:
+                    yield json.loads(data)
+                except json.JSONDecodeError:
+                    logger.debug("Non-JSON SSE line: %s", data[:200])
+
+
+def _translate_stream_event(
+    event: Dict[str, Any],
+    model: str,
+    tool_call_indices: Dict[str, int],
+) -> List[_GeminiStreamChunk]:
+    """Unwrap Code Assist envelope and emit OpenAI-shaped chunk(s)."""
+    inner = event.get("response") if isinstance(event.get("response"), dict) else event
+    candidates = inner.get("candidates") or []
+    if not candidates:
+        return []
+    cand = candidates[0]
+    if not isinstance(cand, dict):
+        return []
+
+    chunks: List[_GeminiStreamChunk] = []
+
+    content = cand.get("content") or {}
+    parts = content.get("parts") if isinstance(content, dict) else []
+    for part in parts or []:
+        if not isinstance(part, dict):
+            continue
+        if part.get("thought") is True and isinstance(part.get("text"), str):
+            chunks.append(_make_stream_chunk(
+                model=model, reasoning=part["text"],
+            ))
+            continue
+        if isinstance(part.get("text"), str) and part["text"]:
+            chunks.append(_make_stream_chunk(model=model, content=part["text"]))
+        fc = part.get("functionCall")
+        if isinstance(fc, dict) and fc.get("name"):
+            name = str(fc["name"])
+            idx = tool_call_indices.setdefault(name, len(tool_call_indices))
+            try:
+                args_str = json.dumps(fc.get("args") or {}, ensure_ascii=False)
+            except (TypeError, ValueError):
+                args_str = "{}"
+            chunks.append(_make_stream_chunk(
+                model=model,
+                tool_call_delta={
+                    "index": idx,
+                    "name": name,
+                    "arguments": args_str,
+                },
+            ))
+
+    finish_reason_raw = str(cand.get("finishReason") or "")
+    if finish_reason_raw:
+        mapped = _map_gemini_finish_reason(finish_reason_raw)
+        if tool_call_indices:
+            mapped = "tool_calls"
+        chunks.append(_make_stream_chunk(model=model, finish_reason=mapped))
+    return chunks
+
+
+# =============================================================================
+# GeminiCloudCodeClient — OpenAI-compatible facade
+# =============================================================================
+
+MARKER_BASE_URL = "cloudcode-pa://google"
+
+
+class _GeminiChatCompletions:
+    def __init__(self, client: "GeminiCloudCodeClient"):
+        self._client = client
+
+    def create(self, **kwargs: Any) -> Any:
+        return self._client._create_chat_completion(**kwargs)
+
+
+class _GeminiChatNamespace:
+    def __init__(self, client: "GeminiCloudCodeClient"):
+        self.completions = _GeminiChatCompletions(client)
+
+
+class GeminiCloudCodeClient:
+    """Minimal OpenAI-SDK-compatible facade over Code Assist v1internal."""
+
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        base_url: Optional[str] = None,
+        default_headers: Optional[Dict[str, str]] = None,
+        project_id: str = "",
+        **_: Any,
+    ):
+        # `api_key` here is a dummy — real auth is the OAuth access token
+        # fetched on every call via agent.google_oauth.get_valid_access_token().
+        # We accept the kwarg for openai.OpenAI interface parity.
+        self.api_key = api_key or "google-oauth"
+        self.base_url = base_url or MARKER_BASE_URL
+        self._default_headers = dict(default_headers or {})
+        self._configured_project_id = project_id
+        self._project_context: Optional[ProjectContext] = None
+        self._project_context_lock = False  # simple single-thread guard
+        self.chat = _GeminiChatNamespace(self)
+        self.is_closed = False
+        self._http = httpx.Client(timeout=httpx.Timeout(connect=15.0, read=600.0, write=30.0, pool=30.0))
+
+    def close(self) -> None:
+        self.is_closed = True
+        try:
+            self._http.close()
+        except Exception:
+            pass
+
+    # Implement the OpenAI SDK's context-manager-ish closure check
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+    def _ensure_project_context(self, access_token: str, model: str) -> ProjectContext:
+        """Lazily resolve and cache the project context for this client."""
+        if self._project_context is not None:
+            return self._project_context
+
+        env_project = google_oauth.resolve_project_id_from_env()
+        creds = google_oauth.load_credentials()
+        stored_project = creds.project_id if creds else ""
+
+        # Prefer what's already baked into the creds
+        if stored_project:
+            self._project_context = ProjectContext(
+                project_id=stored_project,
+                managed_project_id=creds.managed_project_id if creds else "",
+                tier_id="",
+                source="stored",
+            )
+            return self._project_context
+
+        ctx = resolve_project_context(
+            access_token,
+            configured_project_id=self._configured_project_id,
+            env_project_id=env_project,
+            user_agent_model=model,
+        )
+        # Persist discovered project back to the creds file so the next
+        # session doesn't re-run the discovery.
+        if ctx.project_id or ctx.managed_project_id:
+            google_oauth.update_project_ids(
+                project_id=ctx.project_id,
+                managed_project_id=ctx.managed_project_id,
+            )
+        self._project_context = ctx
+        return ctx
+
+    def _create_chat_completion(
+        self,
+        *,
+        model: str = "gemini-2.5-flash",
+        messages: Optional[List[Dict[str, Any]]] = None,
+        stream: bool = False,
+        tools: Any = None,
+        tool_choice: Any = None,
+        temperature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        stop: Any = None,
+        extra_body: Optional[Dict[str, Any]] = None,
+        timeout: Any = None,
+        **_: Any,
+    ) -> Any:
+        access_token = google_oauth.get_valid_access_token()
+        ctx = self._ensure_project_context(access_token, model)
+
+        thinking_config = None
+        if isinstance(extra_body, dict):
+            thinking_config = extra_body.get("thinking_config") or extra_body.get("thinkingConfig")
+
+        inner = build_gemini_request(
+            messages=messages or [],
+            tools=tools,
+            tool_choice=tool_choice,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            top_p=top_p,
+            stop=stop,
+            thinking_config=thinking_config,
+        )
+        wrapped = wrap_code_assist_request(
+            project_id=ctx.project_id,
+            model=model,
+            inner_request=inner,
+        )
+
+        headers = {
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+            "Authorization": f"Bearer {access_token}",
+            "User-Agent": "hermes-agent (gemini-cli-compat)",
+            "X-Goog-Api-Client": "gl-python/hermes",
+            "x-activity-request-id": str(uuid.uuid4()),
+        }
+        headers.update(self._default_headers)
+
+        if stream:
+            return self._stream_completion(model=model, wrapped=wrapped, headers=headers)
+
+        url = f"{CODE_ASSIST_ENDPOINT}/v1internal:generateContent"
+        response = self._http.post(url, json=wrapped, headers=headers)
+        if response.status_code != 200:
+            raise _gemini_http_error(response)
+        try:
+            payload = response.json()
+        except ValueError as exc:
+            raise CodeAssistError(
+                f"Invalid JSON from Code Assist: {exc}",
+                code="code_assist_invalid_json",
+            ) from exc
+        return _translate_gemini_response(payload, model=model)
+
+    def _stream_completion(
+        self,
+        *,
+        model: str,
+        wrapped: Dict[str, Any],
+        headers: Dict[str, str],
+    ) -> Iterator[_GeminiStreamChunk]:
+        """Generator that yields OpenAI-shaped streaming chunks."""
+        url = f"{CODE_ASSIST_ENDPOINT}/v1internal:streamGenerateContent?alt=sse"
+        stream_headers = dict(headers)
+        stream_headers["Accept"] = "text/event-stream"
+
+        def _generator() -> Iterator[_GeminiStreamChunk]:
+            try:
+                with self._http.stream("POST", url, json=wrapped, headers=stream_headers) as response:
+                    if response.status_code != 200:
+                        # Materialize error body for better diagnostics
+                        response.read()
+                        raise _gemini_http_error(response)
+                    tool_call_indices: Dict[str, int] = {}
+                    for event in _iter_sse_events(response):
+                        for chunk in _translate_stream_event(event, model, tool_call_indices):
+                            yield chunk
+            except httpx.HTTPError as exc:
+                raise CodeAssistError(
+                    f"Streaming request failed: {exc}",
+                    code="code_assist_stream_error",
+                ) from exc
+
+        return _generator()
+
+
+def _gemini_http_error(response: httpx.Response) -> CodeAssistError:
+    """Translate an httpx response into a CodeAssistError with rich metadata.
+
+    Parses Google's error envelope (``{"error": {"code", "message", "status",
+    "details": [...]}}``) so the agent's error classifier can reason about
+    the failure — ``status_code`` enables the rate_limit / auth classification
+    paths, and ``response`` lets the main loop honor ``Retry-After`` just
+    like it does for OpenAI SDK exceptions.
+
+    Also lifts a few recognizable Google conditions into human-readable
+    messages so the user sees something better than a 500-char JSON dump:
+
+        MODEL_CAPACITY_EXHAUSTED → "Gemini model capacity exhausted for
+            <model>. This is a Google-side throttle..."
+        RESOURCE_EXHAUSTED w/o reason → quota-style message
+        404 → "Model <name> not found at cloudcode-pa..."
+    """
+    status = response.status_code
+
+    # Parse the body once, surviving any weird encodings.
+    body_text = ""
+    body_json: Dict[str, Any] = {}
+    try:
+        body_text = response.text
+    except Exception:
+        body_text = ""
+    if body_text:
+        try:
+            parsed = json.loads(body_text)
+            if isinstance(parsed, dict):
+                body_json = parsed
+        except (ValueError, TypeError):
+            body_json = {}
+
+    # Dig into Google's error envelope.  Shape is:
+    #   {"error": {"code": 429, "message": "...", "status": "RESOURCE_EXHAUSTED",
+    #              "details": [{"@type": ".../ErrorInfo", "reason": "MODEL_CAPACITY_EXHAUSTED",
+    #                           "metadata": {...}},
+    #                          {"@type": ".../RetryInfo", "retryDelay": "30s"}]}}
+    err_obj = body_json.get("error") if isinstance(body_json, dict) else None
+    if not isinstance(err_obj, dict):
+        err_obj = {}
+    err_status = str(err_obj.get("status") or "").strip()
+    err_message = str(err_obj.get("message") or "").strip()
+    err_details_list = err_obj.get("details") if isinstance(err_obj.get("details"), list) else []
+
+    # Extract google.rpc.ErrorInfo reason + metadata.  There may be more
+    # than one ErrorInfo (rare), so we pick the first one with a reason.
+    error_reason = ""
+    error_metadata: Dict[str, Any] = {}
+    retry_delay_seconds: Optional[float] = None
+    for detail in err_details_list:
+        if not isinstance(detail, dict):
+            continue
+        type_url = str(detail.get("@type") or "")
+        if not error_reason and type_url.endswith("/google.rpc.ErrorInfo"):
+            reason = detail.get("reason")
+            if isinstance(reason, str) and reason:
+                error_reason = reason
+            md = detail.get("metadata")
+            if isinstance(md, dict):
+                error_metadata = md
+        elif retry_delay_seconds is None and type_url.endswith("/google.rpc.RetryInfo"):
+            # retryDelay is a google.protobuf.Duration string like "30s" or "1.5s".
+            delay_raw = detail.get("retryDelay")
+            if isinstance(delay_raw, str) and delay_raw.endswith("s"):
+                try:
+                    retry_delay_seconds = float(delay_raw[:-1])
+                except ValueError:
+                    pass
+            elif isinstance(delay_raw, (int, float)):
+                retry_delay_seconds = float(delay_raw)
+
+    # Fall back to the Retry-After header if the body didn't include RetryInfo.
+    if retry_delay_seconds is None:
+        try:
+            header_val = response.headers.get("Retry-After") or response.headers.get("retry-after")
+        except Exception:
+            header_val = None
+        if header_val:
+            try:
+                retry_delay_seconds = float(header_val)
+            except (TypeError, ValueError):
+                retry_delay_seconds = None
+
+    # Classify the error code.  ``code_assist_rate_limited`` stays the default
+    # for 429s; a more specific reason tag helps downstream callers (e.g. tests,
+    # logs) without changing the rate_limit classification path.
+    code = f"code_assist_http_{status}"
+    if status == 401:
+        code = "code_assist_unauthorized"
+    elif status == 429:
+        code = "code_assist_rate_limited"
+        if error_reason == "MODEL_CAPACITY_EXHAUSTED":
+            code = "code_assist_capacity_exhausted"
+
+    # Build a human-readable message.  Keep the status + a raw-body tail for
+    # debugging, but lead with a friendlier summary when we recognize the
+    # Google signal.
+    model_hint = ""
+    if isinstance(error_metadata, dict):
+        model_hint = str(error_metadata.get("model") or error_metadata.get("modelId") or "").strip()
+
+    if status == 429 and error_reason == "MODEL_CAPACITY_EXHAUSTED":
+        target = model_hint or "this Gemini model"
+        message = (
+            f"Gemini capacity exhausted for {target} (Google-side throttle, "
+            f"not a Hermes issue). Try a different Gemini model or set a "
+            f"fallback_providers entry to a non-Gemini provider."
+        )
+        if retry_delay_seconds is not None:
+            message += f" Google suggests retrying in {retry_delay_seconds:g}s."
+    elif status == 429 and err_status == "RESOURCE_EXHAUSTED":
+        message = (
+            f"Gemini quota exhausted ({err_message or 'RESOURCE_EXHAUSTED'}). "
+            f"Check /gquota for remaining daily requests."
+        )
+        if retry_delay_seconds is not None:
+            message += f" Retry suggested in {retry_delay_seconds:g}s."
+    elif status == 404:
+        # Google returns 404 when a model has been retired or renamed.
+        target = model_hint or (err_message or "model")
+        message = (
+            f"Code Assist 404: {target} is not available at "
+            f"cloudcode-pa.googleapis.com. It may have been renamed or "
+            f"retired. Check hermes_cli/models.py for the current list."
+        )
+    elif err_message:
+        # Generic fallback with the parsed message.
+        message = f"Code Assist HTTP {status} ({err_status or 'error'}): {err_message}"
+    else:
+        # Last-ditch fallback — raw body snippet.
+        message = f"Code Assist returned HTTP {status}: {body_text[:500]}"
+
+    return CodeAssistError(
+        message,
+        code=code,
+        status_code=status,
+        response=response,
+        retry_after=retry_delay_seconds,
+        details={
+            "status": err_status,
+            "reason": error_reason,
+            "metadata": error_metadata,
+            "message": err_message,
+        },
+    )
@@ -0,0 +1,453 @@
+"""Google Code Assist API client — project discovery, onboarding, quota.
+
+The Code Assist API powers Google's official gemini-cli. It sits at
+``cloudcode-pa.googleapis.com`` and provides:
+
+- Free tier access (generous daily quota) for personal Google accounts
+- Paid tier access via GCP projects with billing / Workspace / Standard / Enterprise
+
+This module handles the control-plane dance needed before inference:
+
+1. ``load_code_assist()`` — probe the user's account to learn what tier they're on
+   and whether a ``cloudaicompanionProject`` is already assigned.
+2. ``onboard_user()`` — if the user hasn't been onboarded yet (new account, fresh
+   free tier, etc.), call this with the chosen tier + project id. Supports LRO
+   polling for slow provisioning.
+3. ``retrieve_user_quota()`` — fetch the ``buckets[]`` array showing remaining
+   quota per model, used by the ``/gquota`` slash command.
+
+VPC-SC handling: enterprise accounts under a VPC Service Controls perimeter
+will get ``SECURITY_POLICY_VIOLATED`` on ``load_code_assist``. We catch this
+and force the account to ``standard-tier`` so the call chain still succeeds.
+
+Derived from opencode-gemini-auth (MIT) and clawdbot/extensions/google. The
+request/response shapes are specific to Google's internal Code Assist API,
+documented nowhere public — we copy them from the reference implementations.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+import uuid
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# Constants
+# =============================================================================
+
+CODE_ASSIST_ENDPOINT = "https://cloudcode-pa.googleapis.com"
+
+# Fallback endpoints tried when prod returns an error during project discovery
+FALLBACK_ENDPOINTS = [
+    "https://daily-cloudcode-pa.sandbox.googleapis.com",
+    "https://autopush-cloudcode-pa.sandbox.googleapis.com",
+]
+
+# Tier identifiers that Google's API uses
+FREE_TIER_ID = "free-tier"
+LEGACY_TIER_ID = "legacy-tier"
+STANDARD_TIER_ID = "standard-tier"
+
+# Default HTTP headers matching gemini-cli's fingerprint.
+# Google may reject unrecognized User-Agents on these internal endpoints.
+_GEMINI_CLI_USER_AGENT = "google-api-nodejs-client/9.15.1 (gzip)"
+_X_GOOG_API_CLIENT = "gl-node/24.0.0"
+_DEFAULT_REQUEST_TIMEOUT = 30.0
+_ONBOARDING_POLL_ATTEMPTS = 12
+_ONBOARDING_POLL_INTERVAL_SECONDS = 5.0
+
+
+class CodeAssistError(RuntimeError):
+    """Exception raised by the Code Assist (``cloudcode-pa``) integration.
+
+    Carries HTTP status / response / retry-after metadata so the agent's
+    ``error_classifier._extract_status_code`` and the main loop's Retry-After
+    handling (which walks ``error.response.headers``) pick up the right
+    signals.  Without these, 429s from the OAuth path look like opaque
+    ``RuntimeError`` and skip the rate-limit path.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        *,
+        code: str = "code_assist_error",
+        status_code: Optional[int] = None,
+        response: Any = None,
+        retry_after: Optional[float] = None,
+        details: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        super().__init__(message)
+        self.code = code
+        # ``status_code`` is picked up by ``agent.error_classifier._extract_status_code``
+        # so a 429 from Code Assist classifies as FailoverReason.rate_limit and
+        # triggers the main loop's fallback_providers chain the same way SDK
+        # errors do.
+        self.status_code = status_code
+        # ``response`` is the underlying ``httpx.Response`` (or a shim with a
+        # ``.headers`` mapping and ``.json()`` method).  The main loop reads
+        # ``error.response.headers["Retry-After"]`` to honor Google's retry
+        # hints when the backend throttles us.
+        self.response = response
+        # Parsed ``Retry-After`` seconds (kept separately for convenience —
+        # Google returns retry hints in both the header and the error body's
+        # ``google.rpc.RetryInfo`` details, and we pick whichever we found).
+        self.retry_after = retry_after
+        # Parsed structured error details from the Google error envelope
+        # (e.g. ``{"reason": "MODEL_CAPACITY_EXHAUSTED", "status": "RESOURCE_EXHAUSTED"}``).
+        # Useful for logging and for tests that want to assert on specifics.
+        self.details = details or {}
+
+
+class ProjectIdRequiredError(CodeAssistError):
+    def __init__(self, message: str = "GCP project id required for this tier") -> None:
+        super().__init__(message, code="code_assist_project_id_required")
+
+
+# =============================================================================
+# HTTP primitive (auth via Bearer token passed per-call)
+# =============================================================================
+
+def _build_headers(access_token: str, *, user_agent_model: str = "") -> Dict[str, str]:
+    ua = _GEMINI_CLI_USER_AGENT
+    if user_agent_model:
+        ua = f"{ua} model/{user_agent_model}"
+    return {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {access_token}",
+        "User-Agent": ua,
+        "X-Goog-Api-Client": _X_GOOG_API_CLIENT,
+        "x-activity-request-id": str(uuid.uuid4()),
+    }
+
+
+def _client_metadata() -> Dict[str, str]:
+    """Match Google's gemini-cli exactly — unrecognized metadata may be rejected."""
+    return {
+        "ideType": "IDE_UNSPECIFIED",
+        "platform": "PLATFORM_UNSPECIFIED",
+        "pluginType": "GEMINI",
+    }
+
+
+def _post_json(
+    url: str,
+    body: Dict[str, Any],
+    access_token: str,
+    *,
+    timeout: float = _DEFAULT_REQUEST_TIMEOUT,
+    user_agent_model: str = "",
+) -> Dict[str, Any]:
+    data = json.dumps(body).encode("utf-8")
+    request = urllib.request.Request(
+        url, data=data, method="POST",
+        headers=_build_headers(access_token, user_agent_model=user_agent_model),
+    )
+    try:
+        with urllib.request.urlopen(request, timeout=timeout) as response:
+            raw = response.read().decode("utf-8", errors="replace")
+            return json.loads(raw) if raw else {}
+    except urllib.error.HTTPError as exc:
+        detail = ""
+        try:
+            detail = exc.read().decode("utf-8", errors="replace")
+        except Exception:
+            pass
+        # Special case: VPC-SC violation should be distinguishable
+        if _is_vpc_sc_violation(detail):
+            raise CodeAssistError(
+                f"VPC-SC policy violation: {detail}",
+                code="code_assist_vpc_sc",
+            ) from exc
+        raise CodeAssistError(
+            f"Code Assist HTTP {exc.code}: {detail or exc.reason}",
+            code=f"code_assist_http_{exc.code}",
+        ) from exc
+    except urllib.error.URLError as exc:
+        raise CodeAssistError(
+            f"Code Assist request failed: {exc}",
+            code="code_assist_network_error",
+        ) from exc
+
+
+def _is_vpc_sc_violation(body: str) -> bool:
+    """Detect a VPC Service Controls violation from a response body."""
+    if not body:
+        return False
+    try:
+        parsed = json.loads(body)
+    except (json.JSONDecodeError, ValueError):
+        return "SECURITY_POLICY_VIOLATED" in body
+    # Walk the nested error structure Google uses
+    error = parsed.get("error") if isinstance(parsed, dict) else None
+    if not isinstance(error, dict):
+        return False
+    details = error.get("details") or []
+    if isinstance(details, list):
+        for item in details:
+            if isinstance(item, dict):
+                reason = item.get("reason") or ""
+                if reason == "SECURITY_POLICY_VIOLATED":
+                    return True
+    msg = str(error.get("message", ""))
+    return "SECURITY_POLICY_VIOLATED" in msg
+
+
+# =============================================================================
+# load_code_assist — discovers current tier + assigned project
+# =============================================================================
+
+@dataclass
+class CodeAssistProjectInfo:
+    """Result from ``load_code_assist``."""
+    current_tier_id: str = ""
+    cloudaicompanion_project: str = ""   # Google-managed project (free tier)
+    allowed_tiers: List[str] = field(default_factory=list)
+    raw: Dict[str, Any] = field(default_factory=dict)
+
+
+def load_code_assist(
+    access_token: str,
+    *,
+    project_id: str = "",
+    user_agent_model: str = "",
+) -> CodeAssistProjectInfo:
+    """Call ``POST /v1internal:loadCodeAssist`` with prod → sandbox fallback.
+
+    Returns whatever tier + project info Google reports. On VPC-SC violations,
+    returns a synthetic ``standard-tier`` result so the chain can continue.
+    """
+    body: Dict[str, Any] = {
+        "metadata": {
+            "duetProject": project_id,
+            **_client_metadata(),
+        },
+    }
+    if project_id:
+        body["cloudaicompanionProject"] = project_id
+
+    endpoints = [CODE_ASSIST_ENDPOINT] + FALLBACK_ENDPOINTS
+    last_err: Optional[Exception] = None
+    for endpoint in endpoints:
+        url = f"{endpoint}/v1internal:loadCodeAssist"
+        try:
+            resp = _post_json(url, body, access_token, user_agent_model=user_agent_model)
+            return _parse_load_response(resp)
+        except CodeAssistError as exc:
+            if exc.code == "code_assist_vpc_sc":
+                logger.info("VPC-SC violation on %s — defaulting to standard-tier", endpoint)
+                return CodeAssistProjectInfo(
+                    current_tier_id=STANDARD_TIER_ID,
+                    cloudaicompanion_project=project_id,
+                )
+            last_err = exc
+            logger.warning("loadCodeAssist failed on %s: %s", endpoint, exc)
+            continue
+    if last_err:
+        raise last_err
+    return CodeAssistProjectInfo()
+
+
+def _parse_load_response(resp: Dict[str, Any]) -> CodeAssistProjectInfo:
+    current_tier = resp.get("currentTier") or {}
+    tier_id = str(current_tier.get("id") or "") if isinstance(current_tier, dict) else ""
+    project = str(resp.get("cloudaicompanionProject") or "")
+    allowed = resp.get("allowedTiers") or []
+    allowed_ids: List[str] = []
+    if isinstance(allowed, list):
+        for t in allowed:
+            if isinstance(t, dict):
+                tid = str(t.get("id") or "")
+                if tid:
+                    allowed_ids.append(tid)
+    return CodeAssistProjectInfo(
+        current_tier_id=tier_id,
+        cloudaicompanion_project=project,
+        allowed_tiers=allowed_ids,
+        raw=resp,
+    )
+
+
+# =============================================================================
+# onboard_user — provisions a new user on a tier (with LRO polling)
+# =============================================================================
+
+def onboard_user(
+    access_token: str,
+    *,
+    tier_id: str,
+    project_id: str = "",
+    user_agent_model: str = "",
+) -> Dict[str, Any]:
+    """Call ``POST /v1internal:onboardUser`` to provision the user.
+
+    For paid tiers, ``project_id`` is REQUIRED (raises ProjectIdRequiredError).
+    For free tiers, ``project_id`` is optional — Google will assign one.
+
+    Returns the final operation response. Polls ``/v1internal/<name>`` for up
+    to ``_ONBOARDING_POLL_ATTEMPTS`` × ``_ONBOARDING_POLL_INTERVAL_SECONDS``
+    (default: 12 × 5s = 1 min).
+    """
+    if tier_id != FREE_TIER_ID and tier_id != LEGACY_TIER_ID and not project_id:
+        raise ProjectIdRequiredError(
+            f"Tier {tier_id!r} requires a GCP project id. "
+            "Set HERMES_GEMINI_PROJECT_ID or GOOGLE_CLOUD_PROJECT."
+        )
+
+    body: Dict[str, Any] = {
+        "tierId": tier_id,
+        "metadata": _client_metadata(),
+    }
+    if project_id:
+        body["cloudaicompanionProject"] = project_id
+
+    endpoint = CODE_ASSIST_ENDPOINT
+    url = f"{endpoint}/v1internal:onboardUser"
+    resp = _post_json(url, body, access_token, user_agent_model=user_agent_model)
+
+    # Poll if LRO (long-running operation)
+    if not resp.get("done"):
+        op_name = resp.get("name", "")
+        if not op_name:
+            return resp
+        for attempt in range(_ONBOARDING_POLL_ATTEMPTS):
+            time.sleep(_ONBOARDING_POLL_INTERVAL_SECONDS)
+            poll_url = f"{endpoint}/v1internal/{op_name}"
+            try:
+                poll_resp = _post_json(poll_url, {}, access_token, user_agent_model=user_agent_model)
+            except CodeAssistError as exc:
+                logger.warning("Onboarding poll attempt %d failed: %s", attempt + 1, exc)
+                continue
+            if poll_resp.get("done"):
+                return poll_resp
+        logger.warning("Onboarding did not complete within %d attempts", _ONBOARDING_POLL_ATTEMPTS)
+    return resp
+
+
+# =============================================================================
+# retrieve_user_quota — for /gquota
+# =============================================================================
+
+@dataclass
+class QuotaBucket:
+    model_id: str
+    token_type: str = ""
+    remaining_fraction: float = 0.0
+    reset_time_iso: str = ""
+    raw: Dict[str, Any] = field(default_factory=dict)
+
+
+def retrieve_user_quota(
+    access_token: str,
+    *,
+    project_id: str = "",
+    user_agent_model: str = "",
+) -> List[QuotaBucket]:
+    """Call ``POST /v1internal:retrieveUserQuota`` and parse ``buckets[]``."""
+    body: Dict[str, Any] = {}
+    if project_id:
+        body["project"] = project_id
+    url = f"{CODE_ASSIST_ENDPOINT}/v1internal:retrieveUserQuota"
+    resp = _post_json(url, body, access_token, user_agent_model=user_agent_model)
+    raw_buckets = resp.get("buckets") or []
+    buckets: List[QuotaBucket] = []
+    if not isinstance(raw_buckets, list):
+        return buckets
+    for b in raw_buckets:
+        if not isinstance(b, dict):
+            continue
+        buckets.append(QuotaBucket(
+            model_id=str(b.get("modelId") or ""),
+            token_type=str(b.get("tokenType") or ""),
+            remaining_fraction=float(b.get("remainingFraction") or 0.0),
+            reset_time_iso=str(b.get("resetTime") or ""),
+            raw=b,
+        ))
+    return buckets
+
+
+# =============================================================================
+# Project context resolution
+# =============================================================================
+
+@dataclass
+class ProjectContext:
+    """Resolved state for a given OAuth session."""
+    project_id: str = ""           # effective project id sent on requests
+    managed_project_id: str = ""   # Google-assigned project (free tier)
+    tier_id: str = ""
+    source: str = ""               # "env", "config", "discovered", "onboarded"
+
+
+def resolve_project_context(
+    access_token: str,
+    *,
+    configured_project_id: str = "",
+    env_project_id: str = "",
+    user_agent_model: str = "",
+) -> ProjectContext:
+    """Figure out what project id + tier to use for requests.
+
+    Priority:
+      1. If configured_project_id or env_project_id is set, use that directly
+         and short-circuit (no discovery needed).
+      2. Otherwise call loadCodeAssist to see what Google says.
+      3. If no tier assigned yet, onboard the user (free tier default).
+    """
+    # Short-circuit: caller provided a project id
+    if configured_project_id:
+        return ProjectContext(
+            project_id=configured_project_id,
+            tier_id=STANDARD_TIER_ID,  # assume paid since they specified one
+            source="config",
+        )
+    if env_project_id:
+        return ProjectContext(
+            project_id=env_project_id,
+            tier_id=STANDARD_TIER_ID,
+            source="env",
+        )
+
+    # Discover via loadCodeAssist
+    info = load_code_assist(access_token, user_agent_model=user_agent_model)
+
+    effective_project = info.cloudaicompanion_project
+    tier = info.current_tier_id
+
+    if not tier:
+        # User hasn't been onboarded — provision them on free tier
+        onboard_resp = onboard_user(
+            access_token,
+            tier_id=FREE_TIER_ID,
+            project_id="",
+            user_agent_model=user_agent_model,
+        )
+        # Re-parse from the onboard response
+        response_body = onboard_resp.get("response") or {}
+        if isinstance(response_body, dict):
+            effective_project = (
+                effective_project
+                or str(response_body.get("cloudaicompanionProject") or "")
+            )
+        tier = FREE_TIER_ID
+        source = "onboarded"
+    else:
+        source = "discovered"
+
+    return ProjectContext(
+        project_id=effective_project,
+        managed_project_id=effective_project if tier == FREE_TIER_ID else "",
+        tier_id=tier,
+        source=source,
+    )
@@ -634,13 +634,7 @@ class InsightsEngine:
        lines.append(f"  Sessions:          {o['total_sessions']:<12}  Messages:        {o['total_messages']:,}")
        lines.append(f"  Tool calls:        {o['total_tool_calls']:<12,}  User messages:   {o['user_messages']:,}")
        lines.append(f"  Input tokens:      {o['total_input_tokens']:<12,}  Output tokens:   {o['total_output_tokens']:,}")
-        cache_total = o.get("total_cache_read_tokens", 0) + o.get("total_cache_write_tokens", 0)
-        if cache_total > 0:
-            lines.append(f"  Cache read:        {o['total_cache_read_tokens']:<12,}  Cache write:     {o['total_cache_write_tokens']:,}")
-        cost_str = f"${o['estimated_cost']:.2f}"
-        if o.get("models_without_pricing"):
-            cost_str += " *"
-        lines.append(f"  Total tokens:      {o['total_tokens']:<12,}  Est. cost:       {cost_str}")
+        lines.append(f"  Total tokens:      {o['total_tokens']:,}")
        if o["total_hours"] > 0:
            lines.append(f"  Active time:       ~{_format_duration(o['total_hours'] * 3600):<11}  Avg session:     ~{_format_duration(o['avg_session_duration'])}")
        lines.append(f"  Avg msgs/session:  {o['avg_messages_per_session']:.1f}")
@@ -650,16 +644,10 @@ class InsightsEngine:
        if report["models"]:
            lines.append("  🤖 Models Used")
            lines.append("  " + "─" * 56)
-            lines.append(f"  {'Model':<30} {'Sessions':>8} {'Tokens':>12} {'Cost':>8}")
+            lines.append(f"  {'Model':<30} {'Sessions':>8} {'Tokens':>12}")
            for m in report["models"]:
                model_name = m["model"][:28]
-                if m.get("has_pricing"):
-                    cost_cell = f"${m['cost']:>6.2f}"
-                else:
-                    cost_cell = "     N/A"
-                lines.append(f"  {model_name:<30} {m['sessions']:>8} {m['total_tokens']:>12,} {cost_cell}")
-            if o.get("models_without_pricing"):
-                lines.append("  * Cost N/A for custom/self-hosted models")
+                lines.append(f"  {model_name:<30} {m['sessions']:>8} {m['total_tokens']:>12,}")
            lines.append("")

        # Platform breakdown
@@ -739,15 +727,7 @@ class InsightsEngine:

        # Overview
        lines.append(f"**Sessions:** {o['total_sessions']} | **Messages:** {o['total_messages']:,} | **Tool calls:** {o['total_tool_calls']:,}")
-        cache_total = o.get("total_cache_read_tokens", 0) + o.get("total_cache_write_tokens", 0)
-        if cache_total > 0:
-            lines.append(f"**Tokens:** {o['total_tokens']:,} (in: {o['total_input_tokens']:,} / out: {o['total_output_tokens']:,} / cache: {cache_total:,})")
-        else:
-            lines.append(f"**Tokens:** {o['total_tokens']:,} (in: {o['total_input_tokens']:,} / out: {o['total_output_tokens']:,})")
-        cost_note = ""
-        if o.get("models_without_pricing"):
-            cost_note = " _(excludes custom/self-hosted models)_"
-        lines.append(f"**Est. cost:** ${o['estimated_cost']:.2f}{cost_note}")
+        lines.append(f"**Tokens:** {o['total_tokens']:,} (in: {o['total_input_tokens']:,} / out: {o['total_output_tokens']:,})")
        if o["total_hours"] > 0:
            lines.append(f"**Active time:** ~{_format_duration(o['total_hours'] * 3600)} | **Avg session:** ~{_format_duration(o['avg_session_duration'])}")
        lines.append("")
@@ -756,8 +736,7 @@ class InsightsEngine:
        if report["models"]:
            lines.append("**🤖 Models:**")
            for m in report["models"][:5]:
-                cost_str = f"${m['cost']:.2f}" if m.get("has_pricing") else "N/A"
-                lines.append(f"  {m['model'][:25]} — {m['sessions']} sessions, {m['total_tokens']:,} tokens, {cost_str}")
+                lines.append(f"  {m['model'][:25]} — {m['sessions']} sessions, {m['total_tokens']:,} tokens")
            lines.append("")

        # Platforms (if multi-platform)
@@ -28,6 +28,7 @@ Usage in run_agent.py:

 from __future__ import annotations

+import json
 import logging
 import re
 from typing import Any, Dict, List, Optional
@@ -43,11 +44,22 @@ logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------

 _FENCE_TAG_RE = re.compile(r'</?\s*memory-context\s*>', re.IGNORECASE)
+_INTERNAL_CONTEXT_RE = re.compile(
+    r'<\s*memory-context\s*>[\s\S]*?</\s*memory-context\s*>',
+    re.IGNORECASE,
+)
+_INTERNAL_NOTE_RE = re.compile(
+    r'\[System note:\s*The following is recalled memory context,\s*NOT new user input\.\s*Treat as informational background data\.\]\s*',
+    re.IGNORECASE,
+)


 def sanitize_context(text: str) -> str:
-    """Strip fence-escape sequences from provider output."""
-    return _FENCE_TAG_RE.sub('', text)
+    """Strip fence tags, injected context blocks, and system notes from provider output."""
+    text = _INTERNAL_CONTEXT_RE.sub('', text)
+    text = _INTERNAL_NOTE_RE.sub('', text)
+    text = _FENCE_TAG_RE.sub('', text)
+    return text


 def build_memory_context_block(raw_context: str) -> str:
@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
 # are preserved so the full model name reaches cache lookups and server queries.
 _PROVIDER_PREFIXES: frozenset[str] = frozenset({
    "openrouter", "nous", "openai-codex", "copilot", "copilot-acp",
-    "gemini", "zai", "kimi-coding", "kimi-coding-cn", "minimax", "minimax-cn", "anthropic", "deepseek",
+    "gemini", "ollama-cloud", "zai", "kimi-coding", "kimi-coding-cn", "minimax", "minimax-cn", "anthropic", "deepseek",
    "opencode-zen", "opencode-go", "ai-gateway", "kilocode", "alibaba",
    "qwen-oauth",
    "xiaomi",
@@ -33,9 +33,12 @@ _PROVIDER_PREFIXES: frozenset[str] = frozenset({
    "google", "google-gemini", "google-ai-studio",
    "glm", "z-ai", "z.ai", "zhipu", "github", "github-copilot",
    "github-models", "kimi", "moonshot", "kimi-cn", "moonshot-cn", "claude", "deep-seek",
+    "ollama",
    "opencode", "zen", "go", "vercel", "kilo", "dashscope", "aliyun", "qwen",
    "mimo", "xiaomi-mimo",
    "arcee-ai", "arceeai",
+    "xai", "x-ai", "x.ai", "grok",
+    "nvidia", "nim", "nvidia-nim", "nemotron",
    "qwen-portal",
 })

@@ -100,6 +103,8 @@ DEFAULT_CONTEXT_LENGTHS = {
    # fuzzy-match collisions (e.g. "anthropic/claude-sonnet-4" is a
    # substring of "anthropic/claude-sonnet-4.6").
    # OpenRouter-prefixed models resolve via OpenRouter live API or models.dev.
+    "claude-opus-4-7": 1000000,
+    "claude-opus-4.7": 1000000,
    "claude-opus-4-6": 1000000,
    "claude-sonnet-4-6": 1000000,
    "claude-opus-4.6": 1000000,
@@ -120,7 +125,6 @@ DEFAULT_CONTEXT_LENGTHS = {
    "gemini": 1048576,
    # Gemma (open models served via AI Studio)
    "gemma-4-31b": 256000,
-    "gemma-4-26b": 256000,
    "gemma-3": 131072,
    "gemma": 8192,  # fallback for older gemma models
    # DeepSeek
@@ -154,6 +158,8 @@ DEFAULT_CONTEXT_LENGTHS = {
    "grok": 131072,             # catch-all (grok-beta, unknown grok-*)
    # Kimi
    "kimi": 262144,
+    # Nemotron — NVIDIA's open-weights series (128K context across all sizes)
+    "nemotron": 131072,
    # Arcee
    "trinity": 262144,
    # OpenRouter
@@ -236,8 +242,10 @@ _URL_TO_PROVIDER: Dict[str, str] = {
    "api.fireworks.ai": "fireworks",
    "opencode.ai": "opencode-go",
    "api.x.ai": "xai",
+    "integrate.api.nvidia.com": "nvidia",
    "api.xiaomimimo.com": "xiaomi",
    "xiaomimimo.com": "xiaomi",
+    "ollama.com": "ollama-cloud",
 }


@@ -1011,6 +1019,16 @@ def get_model_context_length(
        if ctx:
            return ctx

+    # 4b. AWS Bedrock — use static context length table.
+    # Bedrock's ListFoundationModels doesn't expose context window sizes,
+    # so we maintain a curated table in bedrock_adapter.py.
+    if provider == "bedrock" or (base_url and "bedrock-runtime" in base_url):
+        try:
+            from agent.bedrock_adapter import get_bedrock_context_length
+            return get_bedrock_context_length(model)
+        except ImportError:
+            pass  # boto3 not installed — fall through to generic resolution
+
    # 5. Provider-aware lookups (before generic OpenRouter cache)
    # These are provider-specific and take priority over the generic OR cache,
    # since the same model can have different context limits per provider
@@ -169,6 +169,7 @@ PROVIDER_TO_MODELS_DEV: Dict[str, str] = {
    "togetherai": "togetherai",
    "perplexity": "perplexity",
    "cohere": "cohere",
+    "ollama-cloud": "ollama-cloud",
 }

 # Reverse mapping: models.dev → Hermes (built lazily)
@@ -419,7 +420,10 @@ def list_provider_models(provider: str) -> List[str]:
    models = _get_provider_models(provider)
    if models is None:
        return []
-    return list(models.keys())
+    return [
+        mid for mid in models.keys()
+        if not _should_hide_from_provider_catalog(provider, mid)
+    ]


 # Patterns that indicate non-agentic or noise models (TTS, embedding,
@@ -431,6 +435,43 @@ _NOISE_PATTERNS: re.Pattern = re.compile(
    re.IGNORECASE,
 )

+# Google's live Gemini catalogs currently include a mix of stale slugs and
+# Gemma models whose TPM quotas are too small for normal Hermes agent traffic.
+# Keep capability metadata available for direct/manual use, but hide these from
+# the Gemini model catalogs we surface in setup and model selection.
+_GOOGLE_HIDDEN_MODELS = frozenset({
+    # Low-TPM Gemma models that trip Google input-token quota walls under
+    # agent-style traffic despite advertising large context windows.
+    "gemma-4-31b-it",
+    "gemma-4-26b-it",
+    "gemma-4-26b-a4b-it",
+    "gemma-3-1b",
+    "gemma-3-1b-it",
+    "gemma-3-2b",
+    "gemma-3-2b-it",
+    "gemma-3-4b",
+    "gemma-3-4b-it",
+    "gemma-3-12b",
+    "gemma-3-12b-it",
+    "gemma-3-27b",
+    "gemma-3-27b-it",
+    # Stale/retired Google slugs that still surface through models.dev-backed
+    # Gemini selection but 404 on the current Google endpoints.
+    "gemini-1.5-flash",
+    "gemini-1.5-pro",
+    "gemini-1.5-flash-8b",
+    "gemini-2.0-flash",
+    "gemini-2.0-flash-lite",
+})
+
+
+def _should_hide_from_provider_catalog(provider: str, model_id: str) -> bool:
+    provider_lower = (provider or "").strip().lower()
+    model_lower = (model_id or "").strip().lower()
+    if provider_lower in {"gemini", "google"} and model_lower in _GOOGLE_HIDDEN_MODELS:
+        return True
+    return False
+

 def list_agentic_models(provider: str) -> List[str]:
    """Return model IDs suitable for agentic use from models.dev.
@@ -447,6 +488,8 @@ def list_agentic_models(provider: str) -> List[str]:
    for mid, entry in models.items():
        if not isinstance(entry, dict):
            continue
+        if _should_hide_from_provider_catalog(provider, mid):
+            continue
        if not entry.get("tool_call", False):
            continue
        if _NOISE_PATTERNS.search(mid):
@@ -581,5 +624,3 @@ def get_model_info(
            return _parse_model_info(mid, mdata, mdev_id)

    return None
-
-
@@ -0,0 +1,182 @@
+"""Cross-session rate limit guard for Nous Portal.
+
+Writes rate limit state to a shared file so all sessions (CLI, gateway,
+cron, auxiliary) can check whether Nous Portal is currently rate-limited
+before making requests.  Prevents retry amplification when RPH is tapped.
+
+Each 429 from Nous triggers up to 9 API calls per conversation turn
+(3 SDK retries x 3 Hermes retries), and every one of those calls counts
+against RPH.  By recording the rate limit state on first 429 and checking
+it before subsequent attempts, we eliminate the amplification effect.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import tempfile
+import time
+from typing import Any, Mapping, Optional
+
+logger = logging.getLogger(__name__)
+
+_STATE_SUBDIR = "rate_limits"
+_STATE_FILENAME = "nous.json"
+
+
+def _state_path() -> str:
+    """Return the path to the Nous rate limit state file."""
+    try:
+        from hermes_constants import get_hermes_home
+        base = get_hermes_home()
+    except ImportError:
+        base = os.path.join(os.path.expanduser("~"), ".hermes")
+    return os.path.join(base, _STATE_SUBDIR, _STATE_FILENAME)
+
+
+def _parse_reset_seconds(headers: Optional[Mapping[str, str]]) -> Optional[float]:
+    """Extract the best available reset-time estimate from response headers.
+
+    Priority:
+      1. x-ratelimit-reset-requests-1h  (hourly RPH window — most useful)
+      2. x-ratelimit-reset-requests     (per-minute RPM window)
+      3. retry-after                     (generic HTTP header)
+
+    Returns seconds-from-now, or None if no usable header found.
+    """
+    if not headers:
+        return None
+
+    lowered = {k.lower(): v for k, v in headers.items()}
+
+    for key in (
+        "x-ratelimit-reset-requests-1h",
+        "x-ratelimit-reset-requests",
+        "retry-after",
+    ):
+        raw = lowered.get(key)
+        if raw is not None:
+            try:
+                val = float(raw)
+                if val > 0:
+                    return val
+            except (TypeError, ValueError):
+                pass
+
+    return None
+
+
+def record_nous_rate_limit(
+    *,
+    headers: Optional[Mapping[str, str]] = None,
+    error_context: Optional[dict[str, Any]] = None,
+    default_cooldown: float = 300.0,
+) -> None:
+    """Record that Nous Portal is rate-limited.
+
+    Parses the reset time from response headers or error context.
+    Falls back to ``default_cooldown`` (5 minutes) if no reset info
+    is available.  Writes to a shared file that all sessions can read.
+
+    Args:
+        headers: HTTP response headers from the 429 error.
+        error_context: Structured error context from _extract_api_error_context().
+        default_cooldown: Fallback cooldown in seconds when no header data.
+    """
+    now = time.time()
+    reset_at = None
+
+    # Try headers first (most accurate)
+    header_seconds = _parse_reset_seconds(headers)
+    if header_seconds is not None:
+        reset_at = now + header_seconds
+
+    # Try error_context reset_at (from body parsing)
+    if reset_at is None and isinstance(error_context, dict):
+        ctx_reset = error_context.get("reset_at")
+        if isinstance(ctx_reset, (int, float)) and ctx_reset > now:
+            reset_at = float(ctx_reset)
+
+    # Default cooldown
+    if reset_at is None:
+        reset_at = now + default_cooldown
+
+    path = _state_path()
+    try:
+        state_dir = os.path.dirname(path)
+        os.makedirs(state_dir, exist_ok=True)
+
+        state = {
+            "reset_at": reset_at,
+            "recorded_at": now,
+            "reset_seconds": reset_at - now,
+        }
+
+        # Atomic write: write to temp file + rename
+        fd, tmp_path = tempfile.mkstemp(dir=state_dir, suffix=".tmp")
+        try:
+            with os.fdopen(fd, "w") as f:
+                json.dump(state, f)
+            os.replace(tmp_path, path)
+        except Exception:
+            # Clean up temp file on failure
+            try:
+                os.unlink(tmp_path)
+            except OSError:
+                pass
+            raise
+
+        logger.info(
+            "Nous rate limit recorded: resets in %.0fs (at %.0f)",
+            reset_at - now, reset_at,
+        )
+    except Exception as exc:
+        logger.debug("Failed to write Nous rate limit state: %s", exc)
+
+
+def nous_rate_limit_remaining() -> Optional[float]:
+    """Check if Nous Portal is currently rate-limited.
+
+    Returns:
+        Seconds remaining until reset, or None if not rate-limited.
+    """
+    path = _state_path()
+    try:
+        with open(path) as f:
+            state = json.load(f)
+        reset_at = state.get("reset_at", 0)
+        remaining = reset_at - time.time()
+        if remaining > 0:
+            return remaining
+        # Expired — clean up
+        try:
+            os.unlink(path)
+        except OSError:
+            pass
+        return None
+    except (FileNotFoundError, json.JSONDecodeError, KeyError, TypeError):
+        return None
+
+
+def clear_nous_rate_limit() -> None:
+    """Clear the rate limit state (e.g., after a successful Nous request)."""
+    try:
+        os.unlink(_state_path())
+    except FileNotFoundError:
+        pass
+    except OSError as exc:
+        logger.debug("Failed to clear Nous rate limit state: %s", exc)
+
+
+def format_remaining(seconds: float) -> str:
+    """Format seconds remaining into human-readable duration."""
+    s = max(0, int(seconds))
+    if s < 60:
+        return f"{s}s"
+    if s < 3600:
+        m, sec = divmod(s, 60)
+        return f"{m}m {sec}s" if sec else f"{m}m"
+    h, remainder = divmod(s, 3600)
+    m = remainder // 60
+    return f"{h}h {m}m" if m else f"{h}h"
@@ -295,7 +295,9 @@ PLATFORM_HINTS = {
    ),
    "telegram": (
        "You are on a text messaging communication platform, Telegram. "
-        "Please do not use markdown as it does not render. "
+        "Standard markdown is automatically converted to Telegram format. "
+        "Supported: **bold**, *italic*, ~~strikethrough~~, ||spoiler||, "
+        "`inline code`, ```code blocks```, [links](url), and ## headers. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. Images "
        "(.png, .jpg, .webp) appear as photos, audio (.ogg) sends as voice "
@@ -652,7 +654,7 @@ def build_skills_system_prompt(
            ):
                continue
            skills_by_category.setdefault(category, []).append(
-                (skill_name, entry.get("description", ""))
+                (frontmatter_name, entry.get("description", ""))
            )
        category_descriptions = {
            str(k): str(v)
@@ -677,7 +679,7 @@ def build_skills_system_prompt(
            ):
                continue
            skills_by_category.setdefault(entry["category"], []).append(
-                (skill_name, entry["description"])
+                (entry["frontmatter_name"], entry["description"])
            )

        # Read category-level DESCRIPTION.md files
@@ -720,9 +722,10 @@ def build_skills_system_prompt(
                    continue
                entry = _build_snapshot_entry(skill_file, ext_dir, frontmatter, desc)
                skill_name = entry["skill_name"]
-                if skill_name in seen_skill_names:
+                frontmatter_name = entry["frontmatter_name"]
+                if frontmatter_name in seen_skill_names:
                    continue
-                if entry["frontmatter_name"] in disabled or skill_name in disabled:
+                if frontmatter_name in disabled or skill_name in disabled:
                    continue
                if not _skill_should_show(
                    extract_skill_conditions(frontmatter),
@@ -730,9 +733,9 @@ def build_skills_system_prompt(
                    available_toolsets,
                ):
                    continue
-                seen_skill_names.add(skill_name)
+                seen_skill_names.add(frontmatter_name)
                skills_by_category.setdefault(entry["category"], []).append(
-                    (skill_name, entry["description"])
+                    (frontmatter_name, entry["description"])
                )
            except Exception as e:
                logger.debug("Error reading external skill %s: %s", skill_file, e)
@@ -93,6 +93,17 @@ _DB_CONNSTR_RE = re.compile(
    re.IGNORECASE,
 )

+# JWT tokens: header.payload[.signature] — always start with "eyJ" (base64 for "{")
+# Matches 1-part (header only), 2-part (header.payload), and full 3-part JWTs.
+_JWT_RE = re.compile(
+    r"eyJ[A-Za-z0-9_-]{10,}"           # Header (always starts with eyJ)
+    r"(?:\.[A-Za-z0-9_=-]{4,}){0,2}"   # Optional payload and/or signature
+)
+
+# Discord user/role mentions: <@123456789012345678> or <@!123456789012345678>
+# Snowflake IDs are 17-20 digit integers that resolve to specific Discord accounts.
+_DISCORD_MENTION_RE = re.compile(r"<@!?(\d{17,20})>")
+
 # E.164 phone numbers: +<country><number>, 7-15 digits
 # Negative lookahead prevents matching hex strings or identifiers
 _SIGNAL_PHONE_RE = re.compile(r"(\+[1-9]\d{6,14})(?![A-Za-z0-9])")
@@ -159,6 +170,12 @@ def redact_sensitive_text(text: str) -> str:
    # Database connection string passwords
    text = _DB_CONNSTR_RE.sub(lambda m: f"{m.group(1)}***{m.group(3)}", text)

+    # JWT tokens (eyJ... — base64-encoded JSON headers)
+    text = _JWT_RE.sub(lambda m: _mask_token(m.group(0)), text)
+
+    # Discord user/role mentions (<@snowflake_id>)
+    text = _DISCORD_MENTION_RE.sub(lambda m: f"<@{'!' if '!' in m.group(0) else ''}***>", text)
+
    # E.164 phone numbers (Signal, WhatsApp)
    def _redact_phone(m):
        phone = m.group(1)
@@ -12,6 +12,8 @@ from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, Optional

+from hermes_constants import display_hermes_home
+
 logger = logging.getLogger(__name__)

 _skill_commands: Dict[str, Dict[str, Any]] = {}
@@ -70,7 +72,14 @@ def _load_skill_payload(skill_identifier: str, task_id: str | None = None) -> tu
    skill_name = str(loaded_skill.get("name") or normalized)
    skill_path = str(loaded_skill.get("path") or "")
    skill_dir = None
-    if skill_path:
+    # Prefer the absolute skill_dir returned by skill_view() — this is
+    # correct for both local and external skills.  Fall back to the old
+    # SKILLS_DIR-relative reconstruction only when skill_dir is absent
+    # (e.g. legacy skill_view responses).
+    abs_skill_dir = loaded_skill.get("skill_dir")
+    if abs_skill_dir:
+        skill_dir = Path(abs_skill_dir)
+    elif skill_path:
        try:
            skill_dir = SKILLS_DIR / Path(skill_path).parent
        except Exception:
@@ -108,7 +117,7 @@ def _inject_skill_config(loaded_skill: dict[str, Any], parts: list[str]) -> None
        if not resolved:
            return

-        lines = ["", "[Skill config (from ~/.hermes/config.yaml):"]
+        lines = ["", f"[Skill config (from {display_hermes_home()}/config.yaml):"]
        for key, value in resolved.items():
            display_val = str(value) if value else "(not set)"
            lines.append(f"  {key} = {display_val}")
@@ -284,6 +284,80 @@ _OFFICIAL_DOCS_PRICING: Dict[tuple[str, str], PricingEntry] = {
        source_url="https://ai.google.dev/pricing",
        pricing_version="google-pricing-2026-03-16",
    ),
+    # AWS Bedrock — pricing per the Bedrock pricing page.
+    # Bedrock charges the same per-token rates as the model provider but
+    # through AWS billing.  These are the on-demand prices (no commitment).
+    # Source: https://aws.amazon.com/bedrock/pricing/
+    (
+        "bedrock",
+        "anthropic.claude-opus-4-6",
+    ): PricingEntry(
+        input_cost_per_million=Decimal("15.00"),
+        output_cost_per_million=Decimal("75.00"),
+        source="official_docs_snapshot",
+        source_url="https://aws.amazon.com/bedrock/pricing/",
+        pricing_version="bedrock-pricing-2026-04",
+    ),
+    (
+        "bedrock",
+        "anthropic.claude-sonnet-4-6",
+    ): PricingEntry(
+        input_cost_per_million=Decimal("3.00"),
+        output_cost_per_million=Decimal("15.00"),
+        source="official_docs_snapshot",
+        source_url="https://aws.amazon.com/bedrock/pricing/",
+        pricing_version="bedrock-pricing-2026-04",
+    ),
+    (
+        "bedrock",
+        "anthropic.claude-sonnet-4-5",
+    ): PricingEntry(
+        input_cost_per_million=Decimal("3.00"),
+        output_cost_per_million=Decimal("15.00"),
+        source="official_docs_snapshot",
+        source_url="https://aws.amazon.com/bedrock/pricing/",
+        pricing_version="bedrock-pricing-2026-04",
+    ),
+    (
+        "bedrock",
+        "anthropic.claude-haiku-4-5",
+    ): PricingEntry(
+        input_cost_per_million=Decimal("0.80"),
+        output_cost_per_million=Decimal("4.00"),
+        source="official_docs_snapshot",
+        source_url="https://aws.amazon.com/bedrock/pricing/",
+        pricing_version="bedrock-pricing-2026-04",
+    ),
+    (
+        "bedrock",
+        "amazon.nova-pro",
+    ): PricingEntry(
+        input_cost_per_million=Decimal("0.80"),
+        output_cost_per_million=Decimal("3.20"),
+        source="official_docs_snapshot",
+        source_url="https://aws.amazon.com/bedrock/pricing/",
+        pricing_version="bedrock-pricing-2026-04",
+    ),
+    (
+        "bedrock",
+        "amazon.nova-lite",
+    ): PricingEntry(
+        input_cost_per_million=Decimal("0.06"),
+        output_cost_per_million=Decimal("0.24"),
+        source="official_docs_snapshot",
+        source_url="https://aws.amazon.com/bedrock/pricing/",
+        pricing_version="bedrock-pricing-2026-04",
+    ),
+    (
+        "bedrock",
+        "amazon.nova-micro",
+    ): PricingEntry(
+        input_cost_per_million=Decimal("0.035"),
+        output_cost_per_million=Decimal("0.14"),
+        source="official_docs_snapshot",
+        source_url="https://aws.amazon.com/bedrock/pricing/",
+        pricing_version="bedrock-pricing-2026-04",
+    ),
 }


@@ -561,7 +561,10 @@ class BatchRunner:
            provider_sort (str): Sort providers by price/throughput/latency (optional)
            max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
            reasoning_config (Dict): OpenRouter reasoning config override (e.g. {"effort": "none"} to disable thinking)
-            prefill_messages (List[Dict]): Messages to prepend as prefilled conversation context (few-shot priming)
+            prefill_messages (List[Dict]): Messages to prepend as prefilled conversation context (few-shot priming).
+                NOTE: Anthropic Sonnet 4.6+ and Opus 4.6+ reject a trailing assistant-role prefill
+                (400 error).  For those models use output_config.format or structured-output
+                schemas instead.  Safe here for user-role priming and for older Claude / non-Claude models.
            max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
        """
        self.dataset_file = Path(dataset_file)
@@ -16,7 +16,7 @@ model:
  #   "nous"         - Nous Portal OAuth (requires: hermes login)
  #   "nous-api"     - Nous Portal API key (requires: NOUS_API_KEY)
  #   "anthropic"    - Direct Anthropic API (requires: ANTHROPIC_API_KEY)
-  #   "openai-codex" - OpenAI Codex (requires: hermes login --provider openai-codex)
+  #   "openai-codex" - OpenAI Codex (requires: hermes auth)
  #   "copilot"      - GitHub Copilot / GitHub Models (requires: GITHUB_TOKEN)
  #   "gemini"      - Use Google AI Studio direct (requires: GOOGLE_API_KEY or GEMINI_API_KEY)
  #   "zai"         - Use z.ai / ZhipuAI GLM models (requires: GLM_API_KEY)
@@ -24,8 +24,10 @@ model:
  #   "minimax"      - MiniMax global (requires: MINIMAX_API_KEY)
  #   "minimax-cn"   - MiniMax China (requires: MINIMAX_CN_API_KEY)
  #   "huggingface"  - Hugging Face Inference (requires: HF_TOKEN)
+  #   "nvidia"       - NVIDIA NIM / build.nvidia.com (requires: NVIDIA_API_KEY)
  #   "xiaomi"       - Xiaomi MiMo (requires: XIAOMI_API_KEY)
  #   "arcee"        - Arcee AI Trinity models (requires: ARCEEAI_API_KEY)
+  #   "ollama-cloud" - Ollama Cloud (requires: OLLAMA_API_KEY — https://ollama.com/settings)
  #   "kilocode"     - KiloCode gateway (requires: KILOCODE_API_KEY)
  #   "ai-gateway"   - Vercel AI Gateway (requires: AI_GATEWAY_API_KEY)
  #
@@ -37,12 +39,6 @@ model:
  #     base_url: "http://localhost:1234/v1"
  #   No API key needed — local servers typically ignore auth.
  #
-  #   For Ollama Cloud (https://ollama.com/pricing):
-  #     provider: "custom"
-  #     base_url: "https://ollama.com/v1"
-  #   Set OLLAMA_API_KEY in .env — automatically picked up when base_url
-  #   points to ollama.com.
-  #
  # Can also be overridden with --provider flag or HERMES_INFERENCE_PROVIDER env var.
  provider: "auto"
  
@@ -337,6 +333,7 @@ compression:
 #   "openrouter" - Force OpenRouter (requires OPENROUTER_API_KEY)
 #   "nous"       - Force Nous Portal (requires: hermes login)
 #   "gemini"      - Force Google AI Studio direct (requires: GOOGLE_API_KEY or GEMINI_API_KEY)
+#   "ollama-cloud" - Ollama Cloud (requires: OLLAMA_API_KEY)
 #   "codex"       - Force Codex OAuth (requires: hermes model → Codex).
 #                  Uses gpt-5.3-codex which supports vision.
 #   "main"       - Use your custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY).
@@ -564,6 +561,18 @@ platform_toolsets:
  homeassistant: [hermes-homeassistant]
  qqbot: [hermes-qqbot]

+# =============================================================================
+# Gateway Platform Settings
+# =============================================================================
+# Optional per-platform messaging settings.
+# Platform-specific knobs live under `extra`.
+#
+# platforms:
+#   telegram:
+#     reply_to_mode: "first"  # off | first | all
+#     extra:
+#       disable_link_previews: false  # Set true to suppress Telegram URL previews in bot messages
+
 # ─────────────────────────────────────────────────────────────────────────────
 # Available toolsets (use these names in platform_toolsets or the toolsets list)
 #
@@ -501,6 +501,12 @@ def update_job(job_id: str, updates: Dict[str, Any]) -> Optional[Dict[str, Any]]

        if schedule_changed:
            updated_schedule = updated["schedule"]
+            # The API may pass schedule as a raw string (e.g. "every 10m")
+            # instead of a pre-parsed dict.  Normalize it the same way
+            # create_job() does so downstream code can call .get() safely.
+            if isinstance(updated_schedule, str):
+                updated_schedule = parse_schedule(updated_schedule)
+                updated["schedule"] = updated_schedule
            updated["schedule_display"] = updates.get(
                "schedule_display",
                updated_schedule.get("display", updated.get("schedule_display")),
@@ -10,6 +10,7 @@ runs at a time if multiple processes overlap.

 import asyncio
 import concurrent.futures
+import contextvars
 import json
 import logging
 import os
@@ -26,7 +27,7 @@ except ImportError:
    except ImportError:
        msvcrt = None
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional

 # Add parent directory to path for imports BEFORE repo-level imports.
 # Without this, standalone invocations (e.g. after `hermes update` reloads
@@ -48,6 +49,33 @@ _KNOWN_DELIVERY_PLATFORMS = frozenset({
    "qqbot",
 })

+# Platforms that support a configured cron/notification home target, mapped to
+# the environment variable used by gateway setup/runtime config.
+_HOME_TARGET_ENV_VARS = {
+    "matrix": "MATRIX_HOME_ROOM",
+    "telegram": "TELEGRAM_HOME_CHANNEL",
+    "discord": "DISCORD_HOME_CHANNEL",
+    "slack": "SLACK_HOME_CHANNEL",
+    "signal": "SIGNAL_HOME_CHANNEL",
+    "mattermost": "MATTERMOST_HOME_CHANNEL",
+    "sms": "SMS_HOME_CHANNEL",
+    "email": "EMAIL_HOME_ADDRESS",
+    "dingtalk": "DINGTALK_HOME_CHANNEL",
+    "feishu": "FEISHU_HOME_CHANNEL",
+    "wecom": "WECOM_HOME_CHANNEL",
+    "weixin": "WEIXIN_HOME_CHANNEL",
+    "bluebubbles": "BLUEBUBBLES_HOME_CHANNEL",
+    "qqbot": "QQBOT_HOME_CHANNEL",
+}
+
+# Legacy env var names kept for back-compat.  Each entry is the current
+# primary env var → the previous name.  _get_home_target_chat_id falls
+# back to the legacy name if the primary is unset, so users who set the
+# old name before the rename keep working until they migrate.
+_LEGACY_HOME_TARGET_ENV_VARS = {
+    "QQBOT_HOME_CHANNEL": "QQ_HOME_CHANNEL",
+}
+
 from cron.jobs import get_due_jobs, mark_job_run, save_job_output, advance_next_run

 # Sentinel: when a cron agent has nothing new to report, it can start its
@@ -75,15 +103,28 @@ def _resolve_origin(job: dict) -> Optional[dict]:
    return None


-def _resolve_delivery_target(job: dict) -> Optional[dict]:
-    """Resolve the concrete auto-delivery target for a cron job, if any."""
-    deliver = job.get("deliver", "local")
+def _get_home_target_chat_id(platform_name: str) -> str:
+    """Return the configured home target chat/room ID for a delivery platform."""
+    env_var = _HOME_TARGET_ENV_VARS.get(platform_name.lower())
+    if not env_var:
+        return ""
+    value = os.getenv(env_var, "")
+    if not value:
+        legacy = _LEGACY_HOME_TARGET_ENV_VARS.get(env_var)
+        if legacy:
+            value = os.getenv(legacy, "")
+    return value
+
+
+def _resolve_single_delivery_target(job: dict, deliver_value: str) -> Optional[dict]:
+    """Resolve one concrete auto-delivery target for a cron job."""
+
    origin = _resolve_origin(job)

-    if deliver == "local":
+    if deliver_value == "local":
        return None

-    if deliver == "origin":
+    if deliver_value == "origin":
        if origin:
            return {
                "platform": origin["platform"],
@@ -92,8 +133,8 @@ def _resolve_delivery_target(job: dict) -> Optional[dict]:
            }
        # Origin missing (e.g. job created via API/script) — try each
        # platform's home channel as a fallback instead of silently dropping.
-        for platform_name in ("matrix", "telegram", "discord", "slack", "bluebubbles"):
-            chat_id = os.getenv(f"{platform_name.upper()}_HOME_CHANNEL", "")
+        for platform_name in _HOME_TARGET_ENV_VARS:
+            chat_id = _get_home_target_chat_id(platform_name)
            if chat_id:
                logger.info(
                    "Job '%s' has deliver=origin but no origin; falling back to %s home channel",
@@ -107,8 +148,8 @@ def _resolve_delivery_target(job: dict) -> Optional[dict]:
                }
        return None

-    if ":" in deliver:
-        platform_name, rest = deliver.split(":", 1)
+    if ":" in deliver_value:
+        platform_name, rest = deliver_value.split(":", 1)
        platform_key = platform_name.lower()

        from tools.send_message_tool import _parse_target_ref
@@ -138,7 +179,7 @@ def _resolve_delivery_target(job: dict) -> Optional[dict]:
            "thread_id": thread_id,
        }

-    platform_name = deliver
+    platform_name = deliver_value
    if origin and origin.get("platform") == platform_name:
        return {
            "platform": platform_name,
@@ -148,7 +189,7 @@ def _resolve_delivery_target(job: dict) -> Optional[dict]:

    if platform_name.lower() not in _KNOWN_DELIVERY_PLATFORMS:
        return None
-    chat_id = os.getenv(f"{platform_name.upper()}_HOME_CHANNEL", "")
+    chat_id = _get_home_target_chat_id(platform_name)
    if not chat_id:
        return None

@@ -159,6 +200,30 @@ def _resolve_delivery_target(job: dict) -> Optional[dict]:
    }


+def _resolve_delivery_targets(job: dict) -> List[dict]:
+    """Resolve all concrete auto-delivery targets for a cron job (supports comma-separated deliver)."""
+    deliver = job.get("deliver", "local")
+    if deliver == "local":
+        return []
+    parts = [p.strip() for p in str(deliver).split(",") if p.strip()]
+    seen = set()
+    targets = []
+    for part in parts:
+        target = _resolve_single_delivery_target(job, part)
+        if target:
+            key = (target["platform"].lower(), str(target["chat_id"]), target.get("thread_id"))
+            if key not in seen:
+                seen.add(key)
+                targets.append(target)
+    return targets
+
+
+def _resolve_delivery_target(job: dict) -> Optional[dict]:
+    """Resolve the concrete auto-delivery target for a cron job, if any."""
+    targets = _resolve_delivery_targets(job)
+    return targets[0] if targets else None
+
+
 # Media extension sets — keep in sync with gateway/platforms/base.py:_process_message_background
 _AUDIO_EXTS = frozenset({'.ogg', '.opus', '.mp3', '.wav', '.m4a'})
 _VIDEO_EXTS = frozenset({'.mp4', '.mov', '.avi', '.mkv', '.webm', '.3gp'})
@@ -199,7 +264,7 @@ def _send_media_via_adapter(adapter, chat_id: str, media_files: list, metadata:

 def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Optional[str]:
    """
-    Deliver job output to the configured target (origin chat, specific platform, etc.).
+    Deliver job output to the configured target(s) (origin chat, specific platform, etc.).

    When ``adapters`` and ``loop`` are provided (gateway is running), tries to
    use the live adapter first — this supports E2EE rooms (e.g. Matrix) where
@@ -208,33 +273,14 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option

    Returns None on success, or an error string on failure.
    """
-    target = _resolve_delivery_target(job)
-    if not target:
+    targets = _resolve_delivery_targets(job)
+    if not targets:
        if job.get("deliver", "local") != "local":
            msg = f"no delivery target resolved for deliver={job.get('deliver', 'local')}"
            logger.warning("Job '%s': %s", job["id"], msg)
            return msg
        return None  # local-only jobs don't deliver — not a failure

-    platform_name = target["platform"]
-    chat_id = target["chat_id"]
-    thread_id = target.get("thread_id")
-
-    # Diagnostic: log thread_id for topic-aware delivery debugging
-    origin = job.get("origin") or {}
-    origin_thread = origin.get("thread_id")
-    if origin_thread and not thread_id:
-        logger.warning(
-            "Job '%s': origin has thread_id=%s but delivery target lost it "
-            "(deliver=%s, target=%s)",
-            job["id"], origin_thread, job.get("deliver", "local"), target,
-        )
-    elif thread_id:
-        logger.debug(
-            "Job '%s': delivering to %s:%s thread_id=%s",
-            job["id"], platform_name, chat_id, thread_id,
-        )
-
    from tools.send_message_tool import _send_to_platform
    from gateway.config import load_gateway_config, Platform

@@ -257,24 +303,6 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option
        "bluebubbles": Platform.BLUEBUBBLES,
        "qqbot": Platform.QQBOT,
    }
-    platform = platform_map.get(platform_name.lower())
-    if not platform:
-        msg = f"unknown platform '{platform_name}'"
-        logger.warning("Job '%s': %s", job["id"], msg)
-        return msg
-
-    try:
-        config = load_gateway_config()
-    except Exception as e:
-        msg = f"failed to load gateway config: {e}"
-        logger.error("Job '%s': %s", job["id"], msg)
-        return msg
-
-    pconfig = config.platforms.get(platform)
-    if not pconfig or not pconfig.enabled:
-        msg = f"platform '{platform_name}' not configured/enabled"
-        logger.warning("Job '%s': %s", job["id"], msg)
-        return msg

    # Optionally wrap the content with a header/footer so the user knows this
    # is a cron delivery.  Wrapping is on by default; set cron.wrap_response: false
@@ -288,11 +316,13 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option

    if wrap_response:
        task_name = job.get("name", job["id"])
+        job_id = job.get("id", "")
        delivery_content = (
            f"Cronjob Response: {task_name}\n"
+            f"(job_id: {job_id})\n"
            f"-------------\n\n"
            f"{content}\n\n"
-            f"Note: The agent cannot see this message, and therefore cannot respond to it."
+            f"To stop or manage this job, send me a new message (e.g. \"stop reminder {task_name}\")."
        )
    else:
        delivery_content = content
@@ -301,67 +331,117 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option
    from gateway.platforms.base import BasePlatformAdapter
    media_files, cleaned_delivery_content = BasePlatformAdapter.extract_media(delivery_content)

-    # Prefer the live adapter when the gateway is running — this supports E2EE
-    # rooms (e.g. Matrix) where the standalone HTTP path cannot encrypt.
-    runtime_adapter = (adapters or {}).get(platform)
-    if runtime_adapter is not None and loop is not None and getattr(loop, "is_running", lambda: False)():
-        send_metadata = {"thread_id": thread_id} if thread_id else None
-        try:
-            # Send cleaned text (MEDIA tags stripped) — not the raw content
-            text_to_send = cleaned_delivery_content.strip()
-            adapter_ok = True
-            if text_to_send:
-                future = asyncio.run_coroutine_threadsafe(
-                    runtime_adapter.send(chat_id, text_to_send, metadata=send_metadata),
-                    loop,
-                )
-                send_result = future.result(timeout=60)
-                if send_result and not getattr(send_result, "success", True):
-                    err = getattr(send_result, "error", "unknown")
-                    logger.warning(
-                        "Job '%s': live adapter send to %s:%s failed (%s), falling back to standalone",
-                        job["id"], platform_name, chat_id, err,
-                    )
-                    adapter_ok = False  # fall through to standalone path
+    try:
+        config = load_gateway_config()
+    except Exception as e:
+        msg = f"failed to load gateway config: {e}"
+        logger.error("Job '%s': %s", job["id"], msg)
+        return msg

-            # Send extracted media files as native attachments via the live adapter
-            if adapter_ok and media_files:
-                _send_media_via_adapter(runtime_adapter, chat_id, media_files, send_metadata, loop, job)
+    delivery_errors = []

-            if adapter_ok:
-                logger.info("Job '%s': delivered to %s:%s via live adapter", job["id"], platform_name, chat_id)
-                return None
-        except Exception as e:
+    for target in targets:
+        platform_name = target["platform"]
+        chat_id = target["chat_id"]
+        thread_id = target.get("thread_id")
+
+        # Diagnostic: log thread_id for topic-aware delivery debugging
+        origin = job.get("origin") or {}
+        origin_thread = origin.get("thread_id")
+        if origin_thread and not thread_id:
            logger.warning(
-                "Job '%s': live adapter delivery to %s:%s failed (%s), falling back to standalone",
-                job["id"], platform_name, chat_id, e,
+                "Job '%s': origin has thread_id=%s but delivery target lost it "
+                "(deliver=%s, target=%s)",
+                job["id"], origin_thread, job.get("deliver", "local"), target,
+            )
+        elif thread_id:
+            logger.debug(
+                "Job '%s': delivering to %s:%s thread_id=%s",
+                job["id"], platform_name, chat_id, thread_id,
            )

-    # Standalone path: run the async send in a fresh event loop (safe from any thread)
-    coro = _send_to_platform(platform, pconfig, chat_id, cleaned_delivery_content, thread_id=thread_id, media_files=media_files)
-    try:
-        result = asyncio.run(coro)
-    except RuntimeError:
-        # asyncio.run() checks for a running loop before awaiting the coroutine;
-        # when it raises, the original coro was never started — close it to
-        # prevent "coroutine was never awaited" RuntimeWarning, then retry in a
-        # fresh thread that has no running loop.
-        coro.close()
-        import concurrent.futures
-        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
-            future = pool.submit(asyncio.run, _send_to_platform(platform, pconfig, chat_id, cleaned_delivery_content, thread_id=thread_id, media_files=media_files))
-            result = future.result(timeout=30)
-    except Exception as e:
-        msg = f"delivery to {platform_name}:{chat_id} failed: {e}"
-        logger.error("Job '%s': %s", job["id"], msg)
-        return msg
+        platform = platform_map.get(platform_name.lower())
+        if not platform:
+            msg = f"unknown platform '{platform_name}'"
+            logger.warning("Job '%s': %s", job["id"], msg)
+            delivery_errors.append(msg)
+            continue

-    if result and result.get("error"):
-        msg = f"delivery error: {result['error']}"
-        logger.error("Job '%s': %s", job["id"], msg)
-        return msg
+        # Prefer the live adapter when the gateway is running — this supports E2EE
+        # rooms (e.g. Matrix) where the standalone HTTP path cannot encrypt.
+        runtime_adapter = (adapters or {}).get(platform)
+        delivered = False
+        if runtime_adapter is not None and loop is not None and getattr(loop, "is_running", lambda: False)():
+            send_metadata = {"thread_id": thread_id} if thread_id else None
+            try:
+                # Send cleaned text (MEDIA tags stripped) — not the raw content
+                text_to_send = cleaned_delivery_content.strip()
+                adapter_ok = True
+                if text_to_send:
+                    future = asyncio.run_coroutine_threadsafe(
+                        runtime_adapter.send(chat_id, text_to_send, metadata=send_metadata),
+                        loop,
+                    )
+                    send_result = future.result(timeout=60)
+                    if send_result and not getattr(send_result, "success", True):
+                        err = getattr(send_result, "error", "unknown")
+                        logger.warning(
+                            "Job '%s': live adapter send to %s:%s failed (%s), falling back to standalone",
+                            job["id"], platform_name, chat_id, err,
+                        )
+                        adapter_ok = False  # fall through to standalone path

-    logger.info("Job '%s': delivered to %s:%s", job["id"], platform_name, chat_id)
+                # Send extracted media files as native attachments via the live adapter
+                if adapter_ok and media_files:
+                    _send_media_via_adapter(runtime_adapter, chat_id, media_files, send_metadata, loop, job)
+
+                if adapter_ok:
+                    logger.info("Job '%s': delivered to %s:%s via live adapter", job["id"], platform_name, chat_id)
+                    delivered = True
+            except Exception as e:
+                logger.warning(
+                    "Job '%s': live adapter delivery to %s:%s failed (%s), falling back to standalone",
+                    job["id"], platform_name, chat_id, e,
+                )
+
+        if not delivered:
+            pconfig = config.platforms.get(platform)
+            if not pconfig or not pconfig.enabled:
+                msg = f"platform '{platform_name}' not configured/enabled"
+                logger.warning("Job '%s': %s", job["id"], msg)
+                delivery_errors.append(msg)
+                continue
+
+            # Standalone path: run the async send in a fresh event loop (safe from any thread)
+            coro = _send_to_platform(platform, pconfig, chat_id, cleaned_delivery_content, thread_id=thread_id, media_files=media_files)
+            try:
+                result = asyncio.run(coro)
+            except RuntimeError:
+                # asyncio.run() checks for a running loop before awaiting the coroutine;
+                # when it raises, the original coro was never started — close it to
+                # prevent "coroutine was never awaited" RuntimeWarning, then retry in a
+                # fresh thread that has no running loop.
+                coro.close()
+                import concurrent.futures
+                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
+                    future = pool.submit(asyncio.run, _send_to_platform(platform, pconfig, chat_id, cleaned_delivery_content, thread_id=thread_id, media_files=media_files))
+                    result = future.result(timeout=30)
+            except Exception as e:
+                msg = f"delivery to {platform_name}:{chat_id} failed: {e}"
+                logger.error("Job '%s': %s", job["id"], msg)
+                delivery_errors.append(msg)
+                continue
+
+            if result and result.get("error"):
+                msg = f"delivery error: {result['error']}"
+                logger.error("Job '%s': %s", job["id"], msg)
+                delivery_errors.append(msg)
+                continue
+
+            logger.info("Job '%s': delivered to %s:%s", job["id"], platform_name, chat_id)
+
+    if delivery_errors:
+        return "; ".join(delivery_errors)
    return None


@@ -484,15 +564,53 @@ def _run_job_script(script_path: str) -> tuple[bool, str]:
        return False, f"Script execution failed: {exc}"


-def _build_job_prompt(job: dict) -> str:
-    """Build the effective prompt for a cron job, optionally loading one or more skills first."""
+def _parse_wake_gate(script_output: str) -> bool:
+    """Parse the last non-empty stdout line of a cron job's pre-check script
+    as a wake gate.
+
+    The convention (ported from nanoclaw #1232): if the last stdout line is
+    JSON like ``{"wakeAgent": false}``, the agent is skipped entirely — no
+    LLM run, no delivery. Any other output (non-JSON, missing flag, gate
+    absent, or ``wakeAgent: true``) means wake the agent normally.
+
+    Returns True if the agent should wake, False to skip.
+    """
+    if not script_output:
+        return True
+    stripped_lines = [line for line in script_output.splitlines() if line.strip()]
+    if not stripped_lines:
+        return True
+    last_line = stripped_lines[-1].strip()
+    try:
+        gate = json.loads(last_line)
+    except (json.JSONDecodeError, ValueError):
+        return True
+    if not isinstance(gate, dict):
+        return True
+    return gate.get("wakeAgent", True) is not False
+
+
+def _build_job_prompt(job: dict, prerun_script: Optional[tuple] = None) -> str:
+    """Build the effective prompt for a cron job, optionally loading one or more skills first.
+
+    Args:
+        job: The cron job dict.
+        prerun_script: Optional ``(success, stdout)`` from a script that has
+            already been executed by the caller (e.g. for a wake-gate check).
+            When provided, the script is not re-executed and the cached
+            result is used for prompt injection. When omitted, the script
+            (if any) runs inline as before.
+    """
    prompt = job.get("prompt", "")
    skills = job.get("skills")

    # Run data-collection script if configured, inject output as context.
    script_path = job.get("script")
    if script_path:
-        success, script_output = _run_job_script(script_path)
+        if prerun_script is not None:
+            success, script_output = prerun_script
+        else:
+            success, script_output = _run_job_script(script_path)
        if success:
            if script_output:
                prompt = (
@@ -594,13 +712,41 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
    
    job_id = job["id"]
    job_name = job["name"]
-    prompt = _build_job_prompt(job)
+
+    # Wake-gate: if this job has a pre-check script, run it BEFORE building
+    # the prompt so a ``{"wakeAgent": false}`` response can short-circuit
+    # the whole agent run. We pass the result into _build_job_prompt so
+    # the script is only executed once.
+    prerun_script = None
+    script_path = job.get("script")
+    if script_path:
+        prerun_script = _run_job_script(script_path)
+        _ran_ok, _script_output = prerun_script
+        if _ran_ok and not _parse_wake_gate(_script_output):
+            logger.info(
+                "Job '%s' (ID: %s): wakeAgent=false, skipping agent run",
+                job_name, job_id,
+            )
+            silent_doc = (
+                f"# Cron Job: {job_name}\n\n"
+                f"**Job ID:** {job_id}\n"
+                f"**Run Time:** {_hermes_now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
+                "Script gate returned `wakeAgent=false` — agent skipped.\n"
+            )
+            return True, silent_doc, SILENT_MARKER, None
+
+    prompt = _build_job_prompt(job, prerun_script=prerun_script)
    origin = _resolve_origin(job)
    _cron_session_id = f"cron_{job_id}_{_hermes_now().strftime('%Y%m%d_%H%M%S')}"

    logger.info("Running job '%s' (ID: %s)", job_name, job_id)
    logger.info("Prompt: %s", prompt[:100])

+    # Mark this as a cron session so the approval system can apply cron_mode.
+    # This env var is process-wide and persists for the lifetime of the
+    # scheduler process — every job this process runs is a cron job.
+    os.environ["HERMES_CRON_SESSION"] = "1"
+
    try:
        # Inject origin context so the agent's send_message tool knows the chat.
        # Must be INSIDE the try block so the finally cleanup always runs.
@@ -768,7 +914,11 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
        _cron_inactivity_limit = _cron_timeout if _cron_timeout > 0 else None
        _POLL_INTERVAL = 5.0
        _cron_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
-        _cron_future = _cron_pool.submit(agent.run_conversation, prompt)
+        # Preserve scheduler-scoped ContextVar state (for example skill-declared
+        # env passthrough registrations) when the cron run hops into the worker
+        # thread used for inactivity timeout monitoring.
+        _cron_context = contextvars.copy_context()
+        _cron_future = _cron_pool.submit(_cron_context.run, agent.run_conversation, prompt)
        _inactivity_timeout = False
        try:
            if _cron_inactivity_limit is None:
@@ -830,6 +980,9 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
            )

        final_response = result.get("final_response", "") or ""
+        # Strip leaked placeholder text that upstream may inject on empty completions.
+        if final_response.strip() == "(No response generated)":
+            final_response = ""
        # Use a separate variable for log display; keep final_response clean
        # for delivery logic (empty response = no delivery).
        logged_response = final_response if final_response else "(No response generated)"
@@ -969,6 +1122,13 @@ def tick(verbose: bool = True, adapters=None, loop=None) -> int:
                        delivery_error = str(de)
                        logger.error("Delivery failed for job %s: %s", job["id"], de)

+                # Treat empty final_response as a soft failure so last_status
+                # is not "ok" — the agent ran but produced nothing useful.
+                # (issue #8585)
+                if success and not final_response:
+                    success = False
+                    error = "Agent completed but produced empty response (model error, timeout, or misconfiguration)"
+
                mark_job_run(job["id"], success, error, delivery_error=delivery_error)
                executed += 1

@@ -1,13 +1,14 @@
 #!/bin/bash
-# Docker entrypoint: bootstrap config files into the mounted volume, then run hermes.
+# Docker/Podman entrypoint: bootstrap config files into the mounted volume, then run hermes.
 set -e

-HERMES_HOME="/opt/data"
+HERMES_HOME="${HERMES_HOME:-/opt/data}"
 INSTALL_DIR="/opt/hermes"

 # --- Privilege dropping via gosu ---
-# When started as root (the default), optionally remap the hermes user/group
-# to match host-side ownership, fix volume permissions, then re-exec as hermes.
+# When started as root (the default for Docker, or fakeroot in rootless Podman),
+# optionally remap the hermes user/group to match host-side ownership, fix volume
+# permissions, then re-exec as hermes.
 if [ "$(id -u)" = "0" ]; then
    if [ -n "$HERMES_UID" ] && [ "$HERMES_UID" != "$(id -u hermes)" ]; then
        echo "Changing hermes UID to $HERMES_UID"
@@ -16,13 +17,19 @@ if [ "$(id -u)" = "0" ]; then

    if [ -n "$HERMES_GID" ] && [ "$HERMES_GID" != "$(id -g hermes)" ]; then
        echo "Changing hermes GID to $HERMES_GID"
-        groupmod -g "$HERMES_GID" hermes
+        # -o allows non-unique GID (e.g. macOS GID 20 "staff" may already exist
+        # as "dialout" in the Debian-based container image)
+        groupmod -o -g "$HERMES_GID" hermes 2>/dev/null || true
    fi

    actual_hermes_uid=$(id -u hermes)
    if [ "$(stat -c %u "$HERMES_HOME" 2>/dev/null)" != "$actual_hermes_uid" ]; then
        echo "$HERMES_HOME is not owned by $actual_hermes_uid, fixing"
-        chown -R hermes:hermes "$HERMES_HOME"
+        # In rootless Podman the container's "root" is mapped to an unprivileged
+        # host UID — chown will fail.  That's fine: the volume is already owned
+        # by the mapped user on the host side.
+        chown -R hermes:hermes "$HERMES_HOME" 2>/dev/null || \
+            echo "Warning: chown failed (rootless container?) — continuing anyway"
    fi

    echo "Dropping root privileges"
@@ -0,0 +1,108 @@
+# Ink Gateway TUI Migration — Post-mortem
+
+Planned: 2026-04-01 · Delivered: 2026-04 · Status: shipped, classic (prompt_toolkit) CLI still present
+
+## What Shipped
+
+Three layers, same repo, Python runtime unchanged.
+
+```
+ui-tui (Node/TS)  ──stdio JSON-RPC──▶  tui_gateway (Py)  ──▶  AIAgent (run_agent.py)
+```
+
+### Backend — `tui_gateway/`
+
+```
+tui_gateway/
+├── entry.py          # subprocess entrypoint, stdio read/write loop
+├── server.py         # everything: sessions dict, @method handlers, _emit
+├── render.py         # stream renderer, diff rendering, message rendering
+├── slash_worker.py   # subprocess that runs hermes_cli slash commands
+└── __init__.py
+```
+
+`server.py` owns the full runtime-control surface: session store (`_sessions: dict[str, dict]`), method registry (`@method("…")` decorator), event emitter (`_emit`), agent lifecycle (`_make_agent`, `_init_session`, `_wire_callbacks`), approval/sudo/clarify round-trips, and JSON-RPC dispatch.
+
+Protocol methods (`@method(...)` in `server.py`):
+
+- session: `session.{create, resume, list, close, interrupt, usage, history, compress, branch, title, save, undo}`
+- prompt: `prompt.{submit, background, btw}`
+- tools: `tools.{list, show, configure}`
+- slash: `slash.exec`, `command.{dispatch, resolve}`, `commands.catalog`, `complete.{path, slash}`
+- approvals: `approval.respond`, `sudo.respond`, `clarify.respond`, `secret.respond`
+- config/state: `config.{get, set, show}`, `model.options`, `reload.mcp`
+- ops: `shell.exec`, `cli.exec`, `terminal.resize`, `input.detect_drop`, `clipboard.paste`, `paste.collapse`, `image.attach`, `process.stop`
+- misc: `agents.list`, `skills.manage`, `plugins.list`, `cron.manage`, `insights.get`, `rollback.{list, diff, restore}`, `browser.manage`
+
+Protocol events (`_emit(…)` → handled in `ui-tui/src/app/createGatewayEventHandler.ts`):
+
+- lifecycle: `gateway.{ready, stderr}`, `session.info`, `skin.changed`
+- stream: `message.{start, delta, complete}`, `thinking.delta`, `reasoning.{delta, available}`, `status.update`
+- tools: `tool.{start, progress, complete, generating}`, `subagent.{start, thinking, tool, progress, complete}`
+- interactive: `approval.request`, `sudo.request`, `clarify.request`, `secret.request`
+- async: `background.complete`, `btw.complete`, `error`
+
+### Frontend — `ui-tui/src/`
+
+```
+src/
+├── entry.tsx            # node bootstrap: bootBanner → spawn python → dynamic-import Ink → render(<App/>)
+├── app.tsx              # <GatewayProvider> wraps <AppLayout>
+├── bootBanner.ts        # raw-ANSI banner to stdout in ~2ms, pre-React
+├── gatewayClient.ts     # JSON-RPC client over child_process stdio
+├── gatewayTypes.ts      # typed RPC responses + GatewayEvent union
+├── theme.ts             # DEFAULT_THEME + fromSkin
+│
+├── app/                 # hooks + stores — the orchestration layer
+│   ├── uiStore.ts             # nanostore: sid, info, busy, usage, theme, status…
+│   ├── turnStore.ts           # nanostore: per-turn activity / reasoning / tools
+│   ├── turnController.ts      # imperative singleton for stream-time operations
+│   ├── overlayStore.ts        # nanostore: modal/overlay state
+│   ├── useMainApp.ts          # top-level composition hook
+│   ├── useSessionLifecycle.ts # session.create/resume/close/reset
+│   ├── useSubmission.ts       # shell/slash/prompt dispatch + interpolation
+│   ├── useConfigSync.ts       # config.get + mtime poll
+│   ├── useComposerState.ts    # input buffer, paste snippets, editor mode
+│   ├── useInputHandlers.ts    # key bindings
+│   ├── createGatewayEventHandler.ts  # event-stream dispatcher
+│   ├── createSlashHandler.ts         # slash command router (registry + python fallback)
+│   └── slash/commands/        # core.ts, ops.ts, session.ts — TS-owned slash commands
+│
+├── components/          # AppLayout, AppChrome, AppOverlays, MessageLine, Thinking, Markdown, pickers, prompts, Banner, SessionPanel
+├── config/              # env, limits, timing constants
+├── content/             # charms, faces, fortunes, hotkeys, placeholders, verbs
+├── domain/              # details, messages, paths, roles, slash, usage, viewport
+├── protocol/            # interpolation, paste regex
+├── hooks/               # useCompletion, useInputHistory, useQueue, useVirtualHistory
+└── lib/                 # history, messages, osc52, rpc, text
+```
+
+### CLI entry points — `hermes_cli/main.py`
+
+- `hermes --tui`      → `node dist/entry.js` (auto-builds when `.ts`/`.tsx` newer than `dist/entry.js`)
+- `hermes --tui --dev` → `tsx src/entry.tsx` (skip build)
+- `HERMES_TUI_DIR=…`  → external prebuilt dist (nix, distro packaging)
+
+## Diverged From Original Plan
+
+| Plan | Reality | Why |
+|---|---|---|
+| `tui_gateway/{controller,session_state,events,protocol}.py` | all collapsed into `server.py` | no second consumer ever emerged, keeping one file cheaper than four |
+| `ui-tui/src/main.tsx` | split into `entry.tsx` (bootstrap) + `app.tsx` (shell) | boot banner + early python spawn wanted a pre-React moment |
+| `ui-tui/src/state/store.ts` | three nanostores (`uiStore`, `turnStore`, `overlayStore`) | separate lifetimes: ui persists, turn resets per reply, overlay is modal |
+| `approval.requested` / `sudo.requested` / `clarify.requested` | `*.request` (no `-ed`) | cosmetic |
+| `session.cancel` | dropped | `session.interrupt` covers it |
+| `HERMES_EXPERIMENTAL_TUI=1`, `display.experimental_tui: true`, `/tui on/off/status` | none shipped | `--tui` went from opt-in to first-class without an experimental phase |
+
+## Post-migration Additions (not in original plan)
+
+- **Async `session.create`** — returns sid in ~1ms, agent builds on a background thread, `session.info` broadcasts when ready; `_wait_agent()` gates every agent-touching handler via `_sess`
+- **`bootBanner`** — raw-ANSI logo painted to stdout at T≈2ms, before Ink loads; `<AlternateScreen>` wipes it seamlessly when React mounts
+- **Selection uniform bg** — `theme.color.selectionBg` wired via `useSelection().setSelectionBgColor`; replaces SGR-inverse per-cell swap that fragmented over amber/gold fg
+- **Slash command registry** — TS-owned commands in `app/slash/commands/{core,ops,session}.ts`, everything else falls through to `slash.exec` (python worker)
+- **Turn store + controller split** — imperative singleton (`turnController`) holds refs/timers, nanostore (`turnStore`) holds render-visible state
+
+## What's Still Open
+
+- **Classic CLI not deleted.** `cli.py` still has ~80 `prompt_toolkit` references; classic REPL is still the default when `--tui` is absent. The original plan's "Cut 4 · prompt_toolkit removal later" hasn't happened.
+- **No config-file opt-in.** `HERMES_EXPERIMENTAL_TUI` and `display.experimental_tui` were never built; only the CLI flag exists. Fine for now — if we want "default to TUI", a single line in `main.py` flips it.
@@ -6,6 +6,11 @@
 # All fields are optional — missing values inherit from the default skin.
 # Activate with: /skin <name>  or  display.skin: <name> in config.yaml
 #
+# Keys are marked:
+#   (both)    — applies to both the classic CLI and the TUI
+#   (classic) — classic CLI only (see hermes --tui in user-guide/tui.md)
+#   (tui)     — TUI only
+#
 # See hermes_cli/skin_engine.py for the full schema reference.
 # ============================================================================

@@ -14,43 +19,47 @@ name: example
 description: An example custom skin — copy and modify this template

 # ── Colors ──────────────────────────────────────────────────────────────────
-# Hex color values for Rich markup. These control the CLI's visual palette.
+# Hex color values. These control the visual palette.
 colors:
-  # Banner panel (the startup welcome box)
+  # Banner panel (the startup welcome box) — (both)
  banner_border: "#CD7F32"        # Panel border
  banner_title: "#FFD700"         # Panel title text
  banner_accent: "#FFBF00"        # Section headers (Available Tools, Skills, etc.)
  banner_dim: "#B8860B"           # Dim/muted text (separators, model info)
  banner_text: "#FFF8DC"          # Body text (tool names, skill names)

-  # UI elements
-  ui_accent: "#FFBF00"            # General accent color
+  # UI elements — (both)
+  ui_accent: "#FFBF00"            # General accent (falls back to banner_accent)
  ui_label: "#4dd0e1"             # Labels
  ui_ok: "#4caf50"                # Success indicators
  ui_error: "#ef5350"             # Error indicators
  ui_warn: "#ffa726"              # Warning indicators

  # Input area
-  prompt: "#FFF8DC"               # Prompt text color
-  input_rule: "#CD7F32"           # Horizontal rule around input
+  prompt: "#FFF8DC"               # Prompt text / `❯` glyph color (both)
+  input_rule: "#CD7F32"           # Horizontal rule above input (classic)

-  # Response box
-  response_border: "#FFD700"      # Response box border (ANSI color)
+  # Response box — (classic)
+  response_border: "#FFD700"      # Response box border

-  # Session display
-  session_label: "#DAA520"        # Session label
-  session_border: "#8B8682"       # Session ID dim color
+  # Session display — (both)
+  session_label: "#DAA520"        # "Session: " label
+  session_border: "#8B8682"       # Session ID text

-  # TUI surfaces
-  status_bar_bg: "#1a1a2e"              # Status / usage bar background
-  voice_status_bg: "#1a1a2e"            # Voice-mode badge background
-  completion_menu_bg: "#1a1a2e"         # Completion list background
-  completion_menu_current_bg: "#333355" # Active completion row background
-  completion_menu_meta_bg: "#1a1a2e"    # Completion meta column background
-  completion_menu_meta_current_bg: "#333355"  # Active completion meta background
+  # TUI / CLI surfaces — (classic: status bar, voice badge, completion meta)
+  status_bar_bg: "#1a1a2e"              # Status / usage bar background (classic)
+  voice_status_bg: "#1a1a2e"            # Voice-mode badge background (classic)
+  completion_menu_bg: "#1a1a2e"         # Completion list background (both)
+  completion_menu_current_bg: "#333355" # Active completion row background (both)
+  completion_menu_meta_bg: "#1a1a2e"    # Completion meta column bg (classic)
+  completion_menu_meta_current_bg: "#333355"  # Active meta bg (classic)
+
+  # Drag-to-select background — (tui)
+  selection_bg: "#3a3a55"               # Uniform selection highlight in the TUI

 # ── Spinner ─────────────────────────────────────────────────────────────────
-# Customize the animated spinner shown during API calls and tool execution.
+# (classic) — the TUI uses its own animated indicators; spinner config here
+# is only read by the classic prompt_toolkit CLI.
 spinner:
  # Faces shown while waiting for the API response
  waiting_faces:
@@ -78,17 +87,17 @@ spinner:
  #   - ["⟪▲", "▲⟫"]

 # ── Branding ────────────────────────────────────────────────────────────────
-# Text strings used throughout the CLI interface.
+# Text strings used throughout the interface.
 branding:
-  agent_name: "Hermes Agent"          # Banner title, about display
-  welcome: "Welcome! Type your message or /help for commands."
-  goodbye: "Goodbye! ⚕"              # Exit message
-  response_label: " ⚕ Hermes "       # Response box header label
-  prompt_symbol: "❯ "                 # Input prompt symbol
-  help_header: "(^_^)? Available Commands"  # /help header text
+  agent_name: "Hermes Agent"                  # (both) Banner title, about display
+  welcome: "Welcome! Type your message or /help for commands."  # (both)
+  goodbye: "Goodbye! ⚕"                       # (both) Exit message
+  response_label: " ⚕ Hermes "                # (classic) Response box header label
+  prompt_symbol: "❯ "                          # (both) Input prompt glyph
+  help_header: "(^_^)? Available Commands"     # (both) /help overlay title

 # ── Tool Output ─────────────────────────────────────────────────────────────
-# Character used as the prefix for tool output lines.
+# Character used as the prefix for tool output lines. (both)
 # Default is "┊" (thin dotted vertical line). Some alternatives:
 #   "╎" (light triple dash vertical)
 #   "▏" (left one-eighth block)
@@ -36,6 +36,26 @@
        "type": "github"
      }
    },
+    "npm-lockfile-fix": {
+      "inputs": {
+        "nixpkgs": [
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1775903712,
+        "narHash": "sha256-2GV79U6iVH4gKAPWYrxUReB0S41ty/Y3dBLquU8AlaA=",
+        "owner": "jeslie0",
+        "repo": "npm-lockfile-fix",
+        "rev": "c6093acb0c0548e0f9b8b3d82918823721930fe8",
+        "type": "github"
+      },
+      "original": {
+        "owner": "jeslie0",
+        "repo": "npm-lockfile-fix",
+        "type": "github"
+      }
+    },
    "pyproject-build-systems": {
      "inputs": {
        "nixpkgs": [
@@ -124,6 +144,7 @@
      "inputs": {
        "flake-parts": "flake-parts",
        "nixpkgs": "nixpkgs",
+        "npm-lockfile-fix": "npm-lockfile-fix",
        "pyproject-build-systems": "pyproject-build-systems",
        "pyproject-nix": "pyproject-nix_2",
        "uv2nix": "uv2nix_2"
@@ -19,11 +19,20 @@
      url = "github:pyproject-nix/build-system-pkgs";
      inputs.nixpkgs.follows = "nixpkgs";
    };
+    npm-lockfile-fix = {
+      url = "github:jeslie0/npm-lockfile-fix";
+      inputs.nixpkgs.follows = "nixpkgs";
+    };
  };

-  outputs = inputs:
+  outputs =
+    inputs:
    inputs.flake-parts.lib.mkFlake { inherit inputs; } {
-      systems = [ "x86_64-linux" "aarch64-linux" "aarch64-darwin" ];
+      systems = [
+        "x86_64-linux"
+        "aarch64-linux"
+        "aarch64-darwin"
+      ];

      imports = [
        ./nix/packages.nix
@@ -100,7 +100,7 @@ def build_channel_directory(adapters: Dict[Any, Any]) -> Dict[str, Any]:


 def _build_discord(adapter) -> List[Dict[str, str]]:
-    """Enumerate all text channels the Discord bot can see."""
+    """Enumerate all text channels and forum channels the Discord bot can see."""
    channels = []
    client = getattr(adapter, "_client", None)
    if not client:
@@ -119,6 +119,15 @@ def _build_discord(adapter) -> List[Dict[str, str]]:
                "guild": guild.name,
                "type": "channel",
            })
+        # Forum channels (type 15) — creating a message auto-spawns a thread post.
+        forums = getattr(guild, "forum_channels", None) or []
+        for ch in forums:
+            channels.append({
+                "id": str(ch.id),
+                "name": ch.name,
+                "guild": guild.name,
+                "type": "forum",
+            })
        # Also include DM-capable users we've interacted with is not
        # feasible via guild enumeration; those come from sessions.

@@ -191,6 +200,15 @@ def load_directory() -> Dict[str, Any]:
        return {"updated_at": None, "platforms": {}}


+def lookup_channel_type(platform_name: str, chat_id: str) -> Optional[str]:
+    """Return the channel ``type`` string (e.g. ``"channel"``, ``"forum"``) for *chat_id*, or *None* if unknown."""
+    directory = load_directory()
+    for ch in directory.get("platforms", {}).get(platform_name, []):
+        if ch.get("id") == chat_id:
+            return ch.get("type")
+    return None
+
+
 def resolve_channel_name(platform_name: str, name: str) -> Optional[str]:
    """
    Resolve a human-friendly channel name to a numeric ID.
@@ -258,6 +258,13 @@ class GatewayConfig:
    # Streaming configuration
    streaming: StreamingConfig = field(default_factory=StreamingConfig)

+    # Session store pruning: drop SessionEntry records older than this many
+    # days from the in-memory dict and sessions.json.  Keeps the store from
+    # growing unbounded in gateways serving many chats/threads/users over
+    # months.  Pruning is invisible to users — if they resume, they get a
+    # fresh session exactly as if the reset policy had fired.  0 = disabled.
+    session_store_max_age_days: int = 90
+
    def get_connected_platforms(self) -> List[Platform]:
        """Return list of platforms that are enabled and configured."""
        connected = []
@@ -307,6 +314,14 @@ class GatewayConfig:
            # QQBot uses extra dict for app credentials
            elif platform == Platform.QQBOT and config.extra.get("app_id") and config.extra.get("client_secret"):
                connected.append(platform)
+            # DingTalk uses client_id/client_secret from config.extra or env vars
+            elif platform == Platform.DINGTALK and (
+                config.extra.get("client_id") or os.getenv("DINGTALK_CLIENT_ID")
+            ) and (
+                config.extra.get("client_secret") or os.getenv("DINGTALK_CLIENT_SECRET")
+            ):
+                connected.append(platform)
+        
        return connected
    
    def get_home_channel(self, platform: Platform) -> Optional[HomeChannel]:
@@ -357,6 +372,7 @@ class GatewayConfig:
            "thread_sessions_per_user": self.thread_sessions_per_user,
            "unauthorized_dm_behavior": self.unauthorized_dm_behavior,
            "streaming": self.streaming.to_dict(),
+            "session_store_max_age_days": self.session_store_max_age_days,
        }
    
    @classmethod
@@ -404,6 +420,13 @@ class GatewayConfig:
            "pair",
        )

+        try:
+            session_store_max_age_days = int(data.get("session_store_max_age_days", 90))
+            if session_store_max_age_days < 0:
+                session_store_max_age_days = 0
+        except (TypeError, ValueError):
+            session_store_max_age_days = 90
+
        return cls(
            platforms=platforms,
            default_reset_policy=default_policy,
@@ -418,6 +441,7 @@ class GatewayConfig:
            thread_sessions_per_user=_coerce_bool(thread_sessions_per_user, False),
            unauthorized_dm_behavior=unauthorized_dm_behavior,
            streaming=StreamingConfig.from_dict(data.get("streaming", {})),
+            session_store_max_age_days=session_store_max_age_days,
        )

    def get_unauthorized_dm_behavior(self, platform: Optional[Platform] = None) -> str:
@@ -554,6 +578,12 @@ def load_gateway_config() -> GatewayConfig:
                    bridged["mention_patterns"] = platform_cfg["mention_patterns"]
                if plat == Platform.DISCORD and "channel_skill_bindings" in platform_cfg:
                    bridged["channel_skill_bindings"] = platform_cfg["channel_skill_bindings"]
+                if "channel_prompts" in platform_cfg:
+                    channel_prompts = platform_cfg["channel_prompts"]
+                    if isinstance(channel_prompts, dict):
+                        bridged["channel_prompts"] = {str(k): v for k, v in channel_prompts.items()}
+                    else:
+                        bridged["channel_prompts"] = channel_prompts
                if not bridged:
                    continue
                plat_data = platforms_data.setdefault(plat.value, {})
@@ -611,6 +641,20 @@ def load_gateway_config() -> GatewayConfig:
                    if isinstance(ntc, list):
                        ntc = ",".join(str(v) for v in ntc)
                    os.environ["DISCORD_NO_THREAD_CHANNELS"] = str(ntc)
+                # allow_mentions: granular control over what the bot can ping.
+                # Safe defaults (no @everyone/roles) are applied in the adapter;
+                # these YAML keys only override when set and let users opt back
+                # into unsafe modes (e.g. roles=true) if they actually want it.
+                allow_mentions_cfg = discord_cfg.get("allow_mentions")
+                if isinstance(allow_mentions_cfg, dict):
+                    for yaml_key, env_key in (
+                        ("everyone", "DISCORD_ALLOW_MENTION_EVERYONE"),
+                        ("roles", "DISCORD_ALLOW_MENTION_ROLES"),
+                        ("users", "DISCORD_ALLOW_MENTION_USERS"),
+                        ("replied_user", "DISCORD_ALLOW_MENTION_REPLIED_USER"),
+                    ):
+                        if yaml_key in allow_mentions_cfg and not os.getenv(env_key):
+                            os.environ[env_key] = str(allow_mentions_cfg[yaml_key]).lower()

            # Telegram settings → env vars (env vars take precedence)
            telegram_cfg = yaml_cfg.get("telegram", {})
@@ -632,6 +676,18 @@ def load_gateway_config() -> GatewayConfig:
                    os.environ["TELEGRAM_IGNORED_THREADS"] = str(ignored_threads)
                if "reactions" in telegram_cfg and not os.getenv("TELEGRAM_REACTIONS"):
                    os.environ["TELEGRAM_REACTIONS"] = str(telegram_cfg["reactions"]).lower()
+                if "proxy_url" in telegram_cfg and not os.getenv("TELEGRAM_PROXY"):
+                    os.environ["TELEGRAM_PROXY"] = str(telegram_cfg["proxy_url"]).strip()
+                if "disable_link_previews" in telegram_cfg:
+                    plat_data = platforms_data.setdefault(Platform.TELEGRAM.value, {})
+                    if not isinstance(plat_data, dict):
+                        plat_data = {}
+                        platforms_data[Platform.TELEGRAM.value] = plat_data
+                    extra = plat_data.setdefault("extra", {})
+                    if not isinstance(extra, dict):
+                        extra = {}
+                        plat_data["extra"] = extra
+                    extra["disable_link_previews"] = telegram_cfg["disable_link_previews"]

            whatsapp_cfg = yaml_cfg.get("whatsapp", {})
            if isinstance(whatsapp_cfg, dict):
@@ -645,6 +701,24 @@ def load_gateway_config() -> GatewayConfig:
                        frc = ",".join(str(v) for v in frc)
                    os.environ["WHATSAPP_FREE_RESPONSE_CHATS"] = str(frc)

+            # DingTalk settings → env vars (env vars take precedence)
+            dingtalk_cfg = yaml_cfg.get("dingtalk", {})
+            if isinstance(dingtalk_cfg, dict):
+                if "require_mention" in dingtalk_cfg and not os.getenv("DINGTALK_REQUIRE_MENTION"):
+                    os.environ["DINGTALK_REQUIRE_MENTION"] = str(dingtalk_cfg["require_mention"]).lower()
+                if "mention_patterns" in dingtalk_cfg and not os.getenv("DINGTALK_MENTION_PATTERNS"):
+                    os.environ["DINGTALK_MENTION_PATTERNS"] = json.dumps(dingtalk_cfg["mention_patterns"])
+                frc = dingtalk_cfg.get("free_response_chats")
+                if frc is not None and not os.getenv("DINGTALK_FREE_RESPONSE_CHATS"):
+                    if isinstance(frc, list):
+                        frc = ",".join(str(v) for v in frc)
+                    os.environ["DINGTALK_FREE_RESPONSE_CHATS"] = str(frc)
+                allowed = dingtalk_cfg.get("allowed_users")
+                if allowed is not None and not os.getenv("DINGTALK_ALLOWED_USERS"):
+                    if isinstance(allowed, list):
+                        allowed = ",".join(str(v) for v in allowed)
+                    os.environ["DINGTALK_ALLOWED_USERS"] = str(allowed)
+
            # Matrix settings → env vars (env vars take precedence)
            matrix_cfg = yaml_cfg.get("matrix", {})
            if isinstance(matrix_cfg, dict):
@@ -988,6 +1062,25 @@ def _apply_env_overrides(config: GatewayConfig) -> None:
        if webhook_secret:
            config.platforms[Platform.WEBHOOK].extra["secret"] = webhook_secret

+    # DingTalk
+    dingtalk_client_id = os.getenv("DINGTALK_CLIENT_ID")
+    dingtalk_client_secret = os.getenv("DINGTALK_CLIENT_SECRET")
+    if dingtalk_client_id and dingtalk_client_secret:
+        if Platform.DINGTALK not in config.platforms:
+            config.platforms[Platform.DINGTALK] = PlatformConfig()
+        config.platforms[Platform.DINGTALK].enabled = True
+        config.platforms[Platform.DINGTALK].extra.update({
+            "client_id": dingtalk_client_id,
+            "client_secret": dingtalk_client_secret,
+        })
+        dingtalk_home = os.getenv("DINGTALK_HOME_CHANNEL")
+        if dingtalk_home:
+            config.platforms[Platform.DINGTALK].home_channel = HomeChannel(
+                platform=Platform.DINGTALK,
+                chat_id=dingtalk_home,
+                name=os.getenv("DINGTALK_HOME_CHANNEL_NAME", "Home"),
+            )
+
    # Feishu / Lark
    feishu_app_id = os.getenv("FEISHU_APP_ID")
    feishu_app_secret = os.getenv("FEISHU_APP_SECRET")
@@ -1136,12 +1229,24 @@ def _apply_env_overrides(config: GatewayConfig) -> None:
        qq_group_allowed = os.getenv("QQ_GROUP_ALLOWED_USERS", "").strip()
        if qq_group_allowed:
            extra["group_allow_from"] = qq_group_allowed
-        qq_home = os.getenv("QQ_HOME_CHANNEL", "").strip()
+        qq_home = os.getenv("QQBOT_HOME_CHANNEL", "").strip()
+        qq_home_name_env = "QQBOT_HOME_CHANNEL_NAME"
+        if not qq_home:
+            # Back-compat: accept the pre-rename name and log a one-time warning.
+            legacy_home = os.getenv("QQ_HOME_CHANNEL", "").strip()
+            if legacy_home:
+                qq_home = legacy_home
+                qq_home_name_env = "QQ_HOME_CHANNEL_NAME"
+                import logging
+                logging.getLogger(__name__).warning(
+                    "QQ_HOME_CHANNEL is deprecated; rename to QQBOT_HOME_CHANNEL "
+                    "in your .env for consistency with the platform key."
+                )
        if qq_home:
            config.platforms[Platform.QQBOT].home_channel = HomeChannel(
                platform=Platform.QQBOT,
                chat_id=qq_home,
-                name=os.getenv("QQ_HOME_CHANNEL_NAME", "Home"),
+                name=os.getenv("QQBOT_HOME_CHANNEL_NAME") or os.getenv(qq_home_name_env, "Home"),
            )

    # Session settings
@@ -515,6 +515,8 @@ class APIServerAdapter(BasePlatformAdapter):
        session_id: Optional[str] = None,
        stream_delta_callback=None,
        tool_progress_callback=None,
+        tool_start_callback=None,
+        tool_complete_callback=None,
    ) -> Any:
        """
        Create an AIAgent instance using the gateway's runtime config.
@@ -553,6 +555,8 @@ class APIServerAdapter(BasePlatformAdapter):
            platform="api_server",
            stream_delta_callback=stream_delta_callback,
            tool_progress_callback=tool_progress_callback,
+            tool_start_callback=tool_start_callback,
+            tool_complete_callback=tool_complete_callback,
            session_db=self._ensure_session_db(),
            fallback_model=fallback_model,
        )
@@ -898,7 +902,7 @@ class APIServerAdapter(BasePlatformAdapter):
                return time.monotonic()

            # Stream content chunks as they arrive from the agent
-            loop = asyncio.get_event_loop()
+            loop = asyncio.get_running_loop()
            while True:
                try:
                    delta = await loop.run_in_executor(None, lambda: stream_q.get(timeout=0.5))
@@ -965,6 +969,427 @@ class APIServerAdapter(BasePlatformAdapter):

        return response

+    async def _write_sse_responses(
+        self,
+        request: "web.Request",
+        response_id: str,
+        model: str,
+        created_at: int,
+        stream_q,
+        agent_task,
+        agent_ref,
+        conversation_history: List[Dict[str, str]],
+        user_message: str,
+        instructions: Optional[str],
+        conversation: Optional[str],
+        store: bool,
+        session_id: str,
+    ) -> "web.StreamResponse":
+        """Write an SSE stream for POST /v1/responses (OpenAI Responses API).
+
+        Emits spec-compliant event types as the agent runs:
+
+        - ``response.created`` — initial envelope (status=in_progress)
+        - ``response.output_text.delta`` / ``response.output_text.done`` —
+          streamed assistant text
+        - ``response.output_item.added`` / ``response.output_item.done``
+          with ``item.type == "function_call"`` — when the agent invokes a
+          tool (both events fire; the ``done`` event carries the finalized
+          ``arguments`` string)
+        - ``response.output_item.added`` with
+          ``item.type == "function_call_output"`` — tool result with
+          ``{call_id, output, status}``
+        - ``response.completed`` — terminal event carrying the full
+          response object with all output items + usage (same payload
+          shape as the non-streaming path for parity)
+        - ``response.failed`` — terminal event on agent error
+
+        If the client disconnects mid-stream, ``agent.interrupt()`` is
+        called so the agent stops issuing upstream LLM calls, then the
+        asyncio task is cancelled.  When ``store=True`` the full response
+        is persisted to the ResponseStore in a ``finally`` block so GET
+        /v1/responses/{id} and ``previous_response_id`` chaining work the
+        same as the batch path.
+        """
+        import queue as _q
+
+        sse_headers = {
+            "Content-Type": "text/event-stream",
+            "Cache-Control": "no-cache",
+            "X-Accel-Buffering": "no",
+        }
+        origin = request.headers.get("Origin", "")
+        cors = self._cors_headers_for_origin(origin) if origin else None
+        if cors:
+            sse_headers.update(cors)
+        if session_id:
+            sse_headers["X-Hermes-Session-Id"] = session_id
+        response = web.StreamResponse(status=200, headers=sse_headers)
+        await response.prepare(request)
+
+        # State accumulated during the stream
+        final_text_parts: List[str] = []
+        # Track open function_call items by name so we can emit a matching
+        # ``done`` event when the tool completes.  Order preserved.
+        pending_tool_calls: List[Dict[str, Any]] = []
+        # Output items we've emitted so far (used to build the terminal
+        # response.completed payload).  Kept in the order they appeared.
+        emitted_items: List[Dict[str, Any]] = []
+        # Monotonic counter for output_index (spec requires it).
+        output_index = 0
+        # Monotonic counter for call_id generation if the agent doesn't
+        # provide one (it doesn't, from tool_progress_callback).
+        call_counter = 0
+        # Canonical Responses SSE events include a monotonically increasing
+        # sequence_number. Add it server-side for every emitted event so
+        # clients that validate the OpenAI event schema can parse our stream.
+        sequence_number = 0
+        # Track the assistant message item id + content index for text
+        # delta events — the spec ties deltas to a specific item.
+        message_item_id = f"msg_{uuid.uuid4().hex[:24]}"
+        message_output_index: Optional[int] = None
+        message_opened = False
+
+        async def _write_event(event_type: str, data: Dict[str, Any]) -> None:
+            nonlocal sequence_number
+            if "sequence_number" not in data:
+                data["sequence_number"] = sequence_number
+            sequence_number += 1
+            payload = f"event: {event_type}\ndata: {json.dumps(data)}\n\n"
+            await response.write(payload.encode())
+
+        def _envelope(status: str) -> Dict[str, Any]:
+            env: Dict[str, Any] = {
+                "id": response_id,
+                "object": "response",
+                "status": status,
+                "created_at": created_at,
+                "model": model,
+            }
+            return env
+
+        final_response_text = ""
+        agent_error: Optional[str] = None
+        usage: Dict[str, int] = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
+
+        try:
+            # response.created — initial envelope, status=in_progress
+            created_env = _envelope("in_progress")
+            created_env["output"] = []
+            await _write_event("response.created", {
+                "type": "response.created",
+                "response": created_env,
+            })
+            last_activity = time.monotonic()
+
+            async def _open_message_item() -> None:
+                """Emit response.output_item.added for the assistant message
+                the first time any text delta arrives."""
+                nonlocal message_opened, message_output_index, output_index
+                if message_opened:
+                    return
+                message_opened = True
+                message_output_index = output_index
+                output_index += 1
+                item = {
+                    "id": message_item_id,
+                    "type": "message",
+                    "status": "in_progress",
+                    "role": "assistant",
+                    "content": [],
+                }
+                await _write_event("response.output_item.added", {
+                    "type": "response.output_item.added",
+                    "output_index": message_output_index,
+                    "item": item,
+                })
+
+            async def _emit_text_delta(delta_text: str) -> None:
+                await _open_message_item()
+                final_text_parts.append(delta_text)
+                await _write_event("response.output_text.delta", {
+                    "type": "response.output_text.delta",
+                    "item_id": message_item_id,
+                    "output_index": message_output_index,
+                    "content_index": 0,
+                    "delta": delta_text,
+                    "logprobs": [],
+                })
+
+            async def _emit_tool_started(payload: Dict[str, Any]) -> str:
+                """Emit response.output_item.added for a function_call.
+
+                Returns the call_id so the matching completion event can
+                reference it.  Prefer the real ``tool_call_id`` from the
+                agent when available; fall back to a generated call id for
+                safety in tests or older code paths.
+                """
+                nonlocal output_index, call_counter
+                call_counter += 1
+                call_id = payload.get("tool_call_id") or f"call_{response_id[5:]}_{call_counter}"
+                args = payload.get("arguments", {})
+                if isinstance(args, dict):
+                    arguments_str = json.dumps(args)
+                else:
+                    arguments_str = str(args)
+                item = {
+                    "id": f"fc_{uuid.uuid4().hex[:24]}",
+                    "type": "function_call",
+                    "status": "in_progress",
+                    "name": payload.get("name", ""),
+                    "call_id": call_id,
+                    "arguments": arguments_str,
+                }
+                idx = output_index
+                output_index += 1
+                pending_tool_calls.append({
+                    "call_id": call_id,
+                    "name": payload.get("name", ""),
+                    "arguments": arguments_str,
+                    "item_id": item["id"],
+                    "output_index": idx,
+                })
+                emitted_items.append({
+                    "type": "function_call",
+                    "name": payload.get("name", ""),
+                    "arguments": arguments_str,
+                    "call_id": call_id,
+                })
+                await _write_event("response.output_item.added", {
+                    "type": "response.output_item.added",
+                    "output_index": idx,
+                    "item": item,
+                })
+                return call_id
+
+            async def _emit_tool_completed(payload: Dict[str, Any]) -> None:
+                """Emit response.output_item.done (function_call) followed
+                by response.output_item.added (function_call_output)."""
+                nonlocal output_index
+                call_id = payload.get("tool_call_id")
+                result = payload.get("result", "")
+                pending = None
+                if call_id:
+                    for i, p in enumerate(pending_tool_calls):
+                        if p["call_id"] == call_id:
+                            pending = pending_tool_calls.pop(i)
+                            break
+                if pending is None:
+                    # Completion without a matching start — skip to avoid
+                    # emitting orphaned done events.
+                    return
+
+                # function_call done
+                done_item = {
+                    "id": pending["item_id"],
+                    "type": "function_call",
+                    "status": "completed",
+                    "name": pending["name"],
+                    "call_id": pending["call_id"],
+                    "arguments": pending["arguments"],
+                }
+                await _write_event("response.output_item.done", {
+                    "type": "response.output_item.done",
+                    "output_index": pending["output_index"],
+                    "item": done_item,
+                })
+
+                # function_call_output added (result)
+                result_str = result if isinstance(result, str) else json.dumps(result)
+                output_parts = [{"type": "input_text", "text": result_str}]
+                output_item = {
+                    "id": f"fco_{uuid.uuid4().hex[:24]}",
+                    "type": "function_call_output",
+                    "call_id": pending["call_id"],
+                    "output": output_parts,
+                    "status": "completed",
+                }
+                idx = output_index
+                output_index += 1
+                emitted_items.append({
+                    "type": "function_call_output",
+                    "call_id": pending["call_id"],
+                    "output": output_parts,
+                })
+                await _write_event("response.output_item.added", {
+                    "type": "response.output_item.added",
+                    "output_index": idx,
+                    "item": output_item,
+                })
+                await _write_event("response.output_item.done", {
+                    "type": "response.output_item.done",
+                    "output_index": idx,
+                    "item": output_item,
+                })
+
+            # Main drain loop — thread-safe queue fed by agent callbacks.
+            async def _dispatch(it) -> None:
+                """Route a queue item to the correct SSE emitter.
+
+                Plain strings are text deltas.  Tagged tuples with
+                ``__tool_started__`` / ``__tool_completed__`` prefixes
+                are tool lifecycle events.
+                """
+                if isinstance(it, tuple) and len(it) == 2 and isinstance(it[0], str):
+                    tag, payload = it
+                    if tag == "__tool_started__":
+                        await _emit_tool_started(payload)
+                    elif tag == "__tool_completed__":
+                        await _emit_tool_completed(payload)
+                    # Unknown tags are silently ignored (forward-compat).
+                elif isinstance(it, str):
+                    await _emit_text_delta(it)
+                # Other types (non-string, non-tuple) are silently dropped.
+
+            loop = asyncio.get_running_loop()
+            while True:
+                try:
+                    item = await loop.run_in_executor(None, lambda: stream_q.get(timeout=0.5))
+                except _q.Empty:
+                    if agent_task.done():
+                        # Drain remaining
+                        while True:
+                            try:
+                                item = stream_q.get_nowait()
+                                if item is None:
+                                    break
+                                await _dispatch(item)
+                                last_activity = time.monotonic()
+                            except _q.Empty:
+                                break
+                        break
+                    if time.monotonic() - last_activity >= CHAT_COMPLETIONS_SSE_KEEPALIVE_SECONDS:
+                        await response.write(b": keepalive\n\n")
+                        last_activity = time.monotonic()
+                    continue
+
+                if item is None:  # EOS sentinel
+                    break
+
+                await _dispatch(item)
+                last_activity = time.monotonic()
+
+            # Pick up agent result + usage from the completed task
+            try:
+                result, agent_usage = await agent_task
+                usage = agent_usage or usage
+                # If the agent produced a final_response but no text
+                # deltas were streamed (e.g. some providers only emit
+                # the full response at the end), emit a single fallback
+                # delta so Responses clients still receive a live text part.
+                agent_final = result.get("final_response", "") if isinstance(result, dict) else ""
+                if agent_final and not final_text_parts:
+                    await _emit_text_delta(agent_final)
+                if agent_final and not final_response_text:
+                    final_response_text = agent_final
+                if isinstance(result, dict) and result.get("error") and not final_response_text:
+                    agent_error = result["error"]
+            except Exception as e:  # noqa: BLE001
+                logger.error("Error running agent for streaming responses: %s", e, exc_info=True)
+                agent_error = str(e)
+
+            # Close the message item if it was opened
+            final_response_text = "".join(final_text_parts) or final_response_text
+            if message_opened:
+                await _write_event("response.output_text.done", {
+                    "type": "response.output_text.done",
+                    "item_id": message_item_id,
+                    "output_index": message_output_index,
+                    "content_index": 0,
+                    "text": final_response_text,
+                    "logprobs": [],
+                })
+                msg_done_item = {
+                    "id": message_item_id,
+                    "type": "message",
+                    "status": "completed",
+                    "role": "assistant",
+                    "content": [
+                        {"type": "output_text", "text": final_response_text}
+                    ],
+                }
+                await _write_event("response.output_item.done", {
+                    "type": "response.output_item.done",
+                    "output_index": message_output_index,
+                    "item": msg_done_item,
+                })
+
+            # Always append a final message item in the completed
+            # response envelope so clients that only parse the terminal
+            # payload still see the assistant text.  This mirrors the
+            # shape produced by _extract_output_items in the batch path.
+            final_items: List[Dict[str, Any]] = list(emitted_items)
+            final_items.append({
+                "type": "message",
+                "role": "assistant",
+                "content": [
+                    {"type": "output_text", "text": final_response_text or (agent_error or "")}
+                ],
+            })
+
+            if agent_error:
+                failed_env = _envelope("failed")
+                failed_env["output"] = final_items
+                failed_env["error"] = {"message": agent_error, "type": "server_error"}
+                failed_env["usage"] = {
+                    "input_tokens": usage.get("input_tokens", 0),
+                    "output_tokens": usage.get("output_tokens", 0),
+                    "total_tokens": usage.get("total_tokens", 0),
+                }
+                await _write_event("response.failed", {
+                    "type": "response.failed",
+                    "response": failed_env,
+                })
+            else:
+                completed_env = _envelope("completed")
+                completed_env["output"] = final_items
+                completed_env["usage"] = {
+                    "input_tokens": usage.get("input_tokens", 0),
+                    "output_tokens": usage.get("output_tokens", 0),
+                    "total_tokens": usage.get("total_tokens", 0),
+                }
+                await _write_event("response.completed", {
+                    "type": "response.completed",
+                    "response": completed_env,
+                })
+
+                # Persist for future chaining / GET retrieval, mirroring
+                # the batch path behavior.
+                if store:
+                    full_history = list(conversation_history)
+                    full_history.append({"role": "user", "content": user_message})
+                    if isinstance(result, dict) and result.get("messages"):
+                        full_history.extend(result["messages"])
+                    else:
+                        full_history.append({"role": "assistant", "content": final_response_text})
+                    self._response_store.put(response_id, {
+                        "response": completed_env,
+                        "conversation_history": full_history,
+                        "instructions": instructions,
+                        "session_id": session_id,
+                    })
+                    if conversation:
+                        self._response_store.set_conversation(conversation, response_id)
+
+        except (ConnectionResetError, ConnectionAbortedError, BrokenPipeError, OSError):
+            # Client disconnected — interrupt the agent so it stops
+            # making upstream LLM calls, then cancel the task.
+            agent = agent_ref[0] if agent_ref else None
+            if agent is not None:
+                try:
+                    agent.interrupt("SSE client disconnected")
+                except Exception:
+                    pass
+            if not agent_task.done():
+                agent_task.cancel()
+                try:
+                    await agent_task
+                except (asyncio.CancelledError, Exception):
+                    pass
+            logger.info("SSE client disconnected; interrupted agent task %s", response_id)
+
+        return response
+
    async def _handle_responses(self, request: "web.Request") -> "web.Response":
        """POST /v1/responses — OpenAI Responses API format."""
        auth_err = self._check_auth(request)
@@ -1035,11 +1460,13 @@ class APIServerAdapter(BasePlatformAdapter):
            if previous_response_id:
                logger.debug("Both conversation_history and previous_response_id provided; using conversation_history")

+        stored_session_id = None
        if not conversation_history and previous_response_id:
            stored = self._response_store.get(previous_response_id)
            if stored is None:
                return web.json_response(_openai_error(f"Previous response not found: {previous_response_id}"), status=404)
            conversation_history = list(stored.get("conversation_history", []))
+            stored_session_id = stored.get("session_id")
            # If no instructions provided, carry forward from previous
            if instructions is None:
                instructions = stored.get("instructions")
@@ -1057,8 +1484,83 @@ class APIServerAdapter(BasePlatformAdapter):
        if body.get("truncation") == "auto" and len(conversation_history) > 100:
            conversation_history = conversation_history[-100:]

-        # Run the agent (with Idempotency-Key support)
-        session_id = str(uuid.uuid4())
+        # Reuse session from previous_response_id chain so the dashboard
+        # groups the entire conversation under one session entry.
+        session_id = stored_session_id or str(uuid.uuid4())
+
+        stream = bool(body.get("stream", False))
+        if stream:
+            # Streaming branch — emit OpenAI Responses SSE events as the
+            # agent runs so frontends can render text deltas and tool
+            # calls in real time.  See _write_sse_responses for details.
+            import queue as _q
+            _stream_q: _q.Queue = _q.Queue()
+
+            def _on_delta(delta):
+                # None from the agent is a CLI box-close signal, not EOS.
+                # Forwarding would kill the SSE stream prematurely; the
+                # SSE writer detects completion via agent_task.done().
+                if delta is not None:
+                    _stream_q.put(delta)
+
+            def _on_tool_progress(event_type, name, preview, args, **kwargs):
+                """Queue non-start tool progress events if needed in future.
+
+                The structured Responses stream uses ``tool_start_callback``
+                and ``tool_complete_callback`` for exact call-id correlation,
+                so progress events are currently ignored here.
+                """
+                return
+
+            def _on_tool_start(tool_call_id, function_name, function_args):
+                """Queue a started tool for live function_call streaming."""
+                _stream_q.put(("__tool_started__", {
+                    "tool_call_id": tool_call_id,
+                    "name": function_name,
+                    "arguments": function_args or {},
+                }))
+
+            def _on_tool_complete(tool_call_id, function_name, function_args, function_result):
+                """Queue a completed tool result for live function_call_output streaming."""
+                _stream_q.put(("__tool_completed__", {
+                    "tool_call_id": tool_call_id,
+                    "name": function_name,
+                    "arguments": function_args or {},
+                    "result": function_result,
+                }))
+
+            agent_ref = [None]
+            agent_task = asyncio.ensure_future(self._run_agent(
+                user_message=user_message,
+                conversation_history=conversation_history,
+                ephemeral_system_prompt=instructions,
+                session_id=session_id,
+                stream_delta_callback=_on_delta,
+                tool_progress_callback=_on_tool_progress,
+                tool_start_callback=_on_tool_start,
+                tool_complete_callback=_on_tool_complete,
+                agent_ref=agent_ref,
+            ))
+
+            response_id = f"resp_{uuid.uuid4().hex[:28]}"
+            model_name = body.get("model", self._model_name)
+            created_at = int(time.time())
+
+            return await self._write_sse_responses(
+                request=request,
+                response_id=response_id,
+                model=model_name,
+                created_at=created_at,
+                stream_q=_stream_q,
+                agent_task=agent_task,
+                agent_ref=agent_ref,
+                conversation_history=conversation_history,
+                user_message=user_message,
+                instructions=instructions,
+                conversation=conversation,
+                store=store,
+                session_id=session_id,
+            )

        async def _compute_response():
            return await self._run_agent(
@@ -1133,6 +1635,7 @@ class APIServerAdapter(BasePlatformAdapter):
                "response": response_data,
                "conversation_history": full_history,
                "instructions": instructions,
+                "session_id": session_id,
            })
            # Update conversation mapping so the next request with the same
            # conversation name automatically chains to this response
@@ -1486,6 +1989,8 @@ class APIServerAdapter(BasePlatformAdapter):
        session_id: Optional[str] = None,
        stream_delta_callback=None,
        tool_progress_callback=None,
+        tool_start_callback=None,
+        tool_complete_callback=None,
        agent_ref: Optional[list] = None,
    ) -> tuple:
        """
@@ -1499,7 +2004,7 @@ class APIServerAdapter(BasePlatformAdapter):
        callers (e.g. the SSE writer) to call ``agent.interrupt()`` from
        another thread to stop in-progress LLM calls.
        """
-        loop = asyncio.get_event_loop()
+        loop = asyncio.get_running_loop()

        def _run():
            agent = self._create_agent(
@@ -1507,6 +2012,8 @@ class APIServerAdapter(BasePlatformAdapter):
                session_id=session_id,
                stream_delta_callback=stream_delta_callback,
                tool_progress_callback=tool_progress_callback,
+                tool_start_callback=tool_start_callback,
+                tool_complete_callback=tool_complete_callback,
            )
            if agent_ref is not None:
                agent_ref[0] = agent
@@ -1643,10 +2150,12 @@ class APIServerAdapter(BasePlatformAdapter):
            if previous_response_id:
                logger.debug("Both conversation_history and previous_response_id provided; using conversation_history")

+        stored_session_id = None
        if not conversation_history and previous_response_id:
            stored = self._response_store.get(previous_response_id)
            if stored:
                conversation_history = list(stored.get("conversation_history", []))
+                stored_session_id = stored.get("session_id")
                if instructions is None:
                    instructions = stored.get("instructions")

@@ -1665,7 +2174,7 @@ class APIServerAdapter(BasePlatformAdapter):
                        )
                    conversation_history.append({"role": msg["role"], "content": str(content)})

-        session_id = body.get("session_id") or run_id
+        session_id = body.get("session_id") or stored_session_id or run_id
        ephemeral_system_prompt = instructions

        async def _run_and_close():
@@ -669,6 +669,15 @@ class MessageEvent:
    # Original platform data
    raw_message: Any = None
    message_id: Optional[str] = None
+
+    # Platform-specific update identifier.  For Telegram this is the
+    # ``update_id`` from the PTB Update wrapper; other platforms currently
+    # ignore it.  Used by ``/restart`` to record the triggering update so the
+    # new gateway can advance the Telegram offset past it and avoid processing
+    # the same ``/restart`` twice if PTB's graceful-shutdown ACK times out
+    # ("Error while calling `get_updates` one more time to mark all fetched
+    # updates" in gateway.log).
+    platform_update_id: Optional[int] = None
    
    # Media attachments
    # media_urls: local file paths (for vision tool access)
@@ -682,6 +691,10 @@ class MessageEvent:
    # Auto-loaded skill(s) for topic/channel bindings (e.g., Telegram DM Topics,
    # Discord channel_skill_bindings).  A single name or ordered list.
    auto_skill: Optional[str | list[str]] = None
+
+    # Per-channel ephemeral system prompt (e.g. Discord channel_prompts).
+    # Applied at API call time and never persisted to transcript history.
+    channel_prompt: Optional[str] = None
    
    # Internal flag — set for synthetic events (e.g. background process
    # completion notifications) that must bypass user authorization checks.
@@ -730,25 +743,56 @@ def merge_pending_message_event(
    pending_messages: Dict[str, MessageEvent],
    session_key: str,
    event: MessageEvent,
+    *,
+    merge_text: bool = False,
 ) -> None:
    """Store or merge a pending event for a session.

    Photo bursts/albums often arrive as multiple near-simultaneous PHOTO
    events. Merge those into the existing queued event so the next turn sees
-    the whole burst, while non-photo follow-ups still replace the pending
-    event normally.
+    the whole burst.
+
+    When ``merge_text`` is enabled, rapid follow-up TEXT events are appended
+    instead of replacing the pending turn. This is used for Telegram bursty
+    follow-ups so a multi-part user thought is not silently truncated to only
+    the last queued fragment.
    """
    existing = pending_messages.get(session_key)
-    if (
-        existing
-        and getattr(existing, "message_type", None) == MessageType.PHOTO
-        and event.message_type == MessageType.PHOTO
-    ):
-        existing.media_urls.extend(event.media_urls)
-        existing.media_types.extend(event.media_types)
-        if event.text:
-            existing.text = BasePlatformAdapter._merge_caption(existing.text, event.text)
-        return
+    if existing:
+        existing_is_photo = getattr(existing, "message_type", None) == MessageType.PHOTO
+        incoming_is_photo = event.message_type == MessageType.PHOTO
+        existing_has_media = bool(existing.media_urls)
+        incoming_has_media = bool(event.media_urls)
+
+        if existing_is_photo and incoming_is_photo:
+            existing.media_urls.extend(event.media_urls)
+            existing.media_types.extend(event.media_types)
+            if event.text:
+                existing.text = BasePlatformAdapter._merge_caption(existing.text, event.text)
+            return
+
+        if existing_has_media or incoming_has_media:
+            if incoming_has_media:
+                existing.media_urls.extend(event.media_urls)
+                existing.media_types.extend(event.media_types)
+            if event.text:
+                if existing.text:
+                    existing.text = BasePlatformAdapter._merge_caption(existing.text, event.text)
+                else:
+                    existing.text = event.text
+            if existing_is_photo or incoming_is_photo:
+                existing.message_type = MessageType.PHOTO
+            return
+
+        if (
+            merge_text
+            and getattr(existing, "message_type", None) == MessageType.TEXT
+            and event.message_type == MessageType.TEXT
+        ):
+            if event.text:
+                existing.text = f"{existing.text}\n{event.text}" if existing.text else event.text
+            return
+
    pending_messages[session_key] = event


@@ -776,6 +820,36 @@ _RETRYABLE_ERROR_PATTERNS = (
 MessageHandler = Callable[[MessageEvent], Awaitable[Optional[str]]]


+def resolve_channel_prompt(
+    config_extra: dict,
+    channel_id: str,
+    parent_id: str | None = None,
+) -> str | None:
+    """Resolve a per-channel ephemeral prompt from platform config.
+
+    Looks up ``channel_prompts`` in the adapter's ``config.extra`` dict.
+    Prefers an exact match on *channel_id*; falls back to *parent_id*
+    (useful for forum threads / child channels inheriting a parent prompt).
+
+    Returns the prompt string, or None if no match is found.  Blank/whitespace-
+    only prompts are treated as absent.
+    """
+    prompts = config_extra.get("channel_prompts") or {}
+    if not isinstance(prompts, dict):
+        return None
+
+    for key in (channel_id, parent_id):
+        if not key:
+            continue
+        prompt = prompts.get(key)
+        if prompt is None:
+            continue
+        prompt = str(prompt).strip()
+        if prompt:
+            return prompt
+    return None
+
+
 class BasePlatformAdapter(ABC):
    """
    Base class for platform adapters.
@@ -805,6 +879,11 @@ class BasePlatformAdapter(ABC):
        # Gateway shutdown cancels these so an old gateway instance doesn't keep
        # working on a task after --replace or manual restarts.
        self._background_tasks: set[asyncio.Task] = set()
+        # One-shot callbacks to fire after the main response is delivered.
+        # Keyed by session_key.  GatewayRunner uses this to defer
+        # background-review notifications ("💾 Skill created") until the
+        # primary reply has been sent.
+        self._post_delivery_callbacks: Dict[str, Callable] = {}
        self._expected_cancelled_tasks: set[asyncio.Task] = set()
        self._busy_session_handler: Optional[Callable[[MessageEvent, str], Awaitable[bool]]] = None
        # Chats where auto-TTS on voice input is disabled (set by /voice off)
@@ -975,16 +1054,40 @@ class BasePlatformAdapter(ABC):
        """
        pass

+    # Default: the adapter treats ``finalize=True`` on edit_message as a
+    # no-op and is happy to have the stream consumer skip redundant final
+    # edits.  Subclasses that *require* an explicit finalize call to close
+    # out the message lifecycle (e.g. rich card / AI assistant surfaces
+    # such as DingTalk AI Cards) override this to True (class attribute or
+    # property) so the stream consumer knows not to short-circuit.
+    REQUIRES_EDIT_FINALIZE: bool = False
+
    async def edit_message(
        self,
        chat_id: str,
        message_id: str,
        content: str,
+        *,
+        finalize: bool = False,
    ) -> SendResult:
        """
        Edit a previously sent message. Optional — platforms that don't
        support editing return success=False and callers fall back to
        sending a new message.
+
+        ``finalize`` signals that this is the last edit in a streaming
+        sequence.  Most platforms (Telegram, Slack, Discord, Matrix,
+        etc.) treat it as a no-op because their edit APIs have no notion
+        of message lifecycle state — an edit is an edit.  Platforms that
+        render streaming updates with a distinct "in progress" state and
+        require explicit closure (e.g. rich card / AI assistant surfaces
+        such as DingTalk AI Cards) use it to finalize the message and
+        transition the UI out of the streaming indicator — those should
+        also set ``REQUIRES_EDIT_FINALIZE = True`` so callers route a
+        final edit through even when content is unchanged.  Callers
+        should set ``finalize=True`` on the final edit of a streamed
+        response (typically when ``got_done`` fires in the stream
+        consumer) and leave it ``False`` on intermediate edits.
        """
        return SendResult(success=False, error="Not supported")

@@ -1221,7 +1324,7 @@ class BasePlatformAdapter(ABC):
                path = path[1:-1].strip()
            path = path.lstrip("`\"'").rstrip("`\"',.;:)}]")
            if path:
-                media.append((path, has_voice_tag))
+                media.append((os.path.expanduser(path), has_voice_tag))

        # Remove MEDIA tags from content (including surrounding quote/backtick wrappers)
        if media:
@@ -1509,7 +1612,9 @@ class BasePlatformAdapter(ABC):
            # session lifecycle and its cleanup races with the running task
            # (see PR #4926).
            cmd = event.get_command()
-            if cmd in ("approve", "deny", "status", "stop", "new", "reset", "background", "restart"):
+            from hermes_cli.commands import should_bypass_active_session
+
+            if should_bypass_active_session(cmd):
                logger.debug(
                    "[%s] Command '/%s' bypassing active-session guard for %s",
                    self.name, cmd, session_key,
@@ -1624,6 +1729,21 @@ class BasePlatformAdapter(ABC):
            # streaming already delivered the text (already_sent=True) or
            # when the message was queued behind an active agent.  Log at
            # DEBUG to avoid noisy warnings for expected behavior.
+            #
+            # Suppress stale response when the session was interrupted by a
+            # new message that hasn't been consumed yet.  The pending message
+            # is processed by the pending-message handler below (#8221/#2483).
+            if (
+                response
+                and interrupt_event.is_set()
+                and session_key in self._pending_messages
+            ):
+                logger.info(
+                    "[%s] Suppressing stale response for interrupted session %s",
+                    self.name,
+                    session_key,
+                )
+                response = None
            if not response:
                logger.debug("[%s] Handler returned empty/None response for %s", self.name, event.source.chat_id)
            if response:
@@ -1806,9 +1926,18 @@ class BasePlatformAdapter(ABC):
            if session_key in self._pending_messages:
                pending_event = self._pending_messages.pop(session_key)
                logger.debug("[%s] Processing queued message from interrupt", self.name)
-                # Clean up current session before processing pending
-                if session_key in self._active_sessions:
-                    del self._active_sessions[session_key]
+                # Keep the _active_sessions entry live across the turn chain
+                # and only CLEAR the interrupt Event — do NOT delete the entry.
+                # If we deleted here, a concurrent inbound message arriving
+                # during the awaits below would pass the Level-1 guard, spawn
+                # its own _process_message_background, and run simultaneously
+                # with the recursive drain below.  Two agents on one
+                # session_key = duplicate responses, duplicate tool calls.
+                # Clearing the Event keeps the guard live so follow-ups take
+                # the busy-handler path (queue + interrupt) as intended.
+                _active = self._active_sessions.get(session_key)
+                if _active is not None:
+                    _active.clear()
                typing_task.cancel()
                try:
                    await typing_task
@@ -1845,6 +1974,14 @@ class BasePlatformAdapter(ABC):
            except Exception:
                pass  # Last resort — don't let error reporting crash the handler
        finally:
+            # Fire any one-shot post-delivery callback registered for this
+            # session (e.g. deferred background-review notifications).
+            _post_cb = getattr(self, "_post_delivery_callbacks", {}).pop(session_key, None)
+            if callable(_post_cb):
+                try:
+                    _post_cb()
+                except Exception:
+                    pass
            # Stop typing indicator
            typing_task.cancel()
            try:
@@ -1858,6 +1995,34 @@ class BasePlatformAdapter(ABC):
                    await self.stop_typing(event.source.chat_id)
            except Exception:
                pass
+            # Late-arrival drain: a message may have arrived during the
+            # cleanup awaits above (typing_task cancel, stop_typing).  Such
+            # messages passed the Level-1 guard (entry still live, Event
+            # possibly set) and landed in _pending_messages via the
+            # busy-handler path.  Without this block, we would delete the
+            # active-session entry and the queued message would be silently
+            # dropped (user never gets a reply).
+            late_pending = self._pending_messages.pop(session_key, None)
+            if late_pending is not None:
+                logger.debug(
+                    "[%s] Late-arrival pending message during cleanup — spawning drain task",
+                    self.name,
+                )
+                _active = self._active_sessions.get(session_key)
+                if _active is not None:
+                    _active.clear()
+                drain_task = asyncio.create_task(
+                    self._process_message_background(late_pending, session_key)
+                )
+                try:
+                    self._background_tasks.add(drain_task)
+                    drain_task.add_done_callback(self._background_tasks.discard)
+                except TypeError:
+                    # Tests stub create_task() with non-hashable sentinels; tolerate.
+                    pass
+                # Leave _active_sessions[session_key] populated — the drain
+                # task's own lifecycle will clean it up.
+                return
            # Clean up session tracking
            if session_key in self._active_sessions:
                del self._active_sessions[session_key]
@@ -1898,6 +2063,7 @@ class BasePlatformAdapter(ABC):
        chat_topic: Optional[str] = None,
        user_id_alt: Optional[str] = None,
        chat_id_alt: Optional[str] = None,
+        is_bot: bool = False,
    ) -> SessionSource:
        """Helper to build a SessionSource for this platform."""
        # Normalize empty topic to None
@@ -1914,6 +2080,7 @@ class BasePlatformAdapter(ABC):
            chat_topic=chat_topic.strip() if chat_topic else None,
            user_id_alt=user_id_alt,
            chat_id_alt=chat_id_alt,
+            is_bot=is_bot,
        )
    
    @abstractmethod
@@ -1073,6 +1073,13 @@ class FeishuAdapter(BasePlatformAdapter):
        self._webhook_rate_counts: Dict[str, tuple[int, float]] = {}  # rate_key → (count, window_start)
        self._webhook_anomaly_counts: Dict[str, tuple[int, str, float]] = {}  # ip → (count, last_status, first_seen)
        self._card_action_tokens: Dict[str, float] = {}  # token → first_seen_time
+        # Inbound events that arrived before the adapter loop was ready
+        # (e.g. during startup/restart or network-flap reconnect). A single
+        # drainer thread replays them as soon as the loop becomes available.
+        self._pending_inbound_events: List[Any] = []
+        self._pending_inbound_lock = threading.Lock()
+        self._pending_drain_scheduled = False
+        self._pending_inbound_max_depth = 1000  # cap queue; drop oldest beyond
        self._chat_locks: Dict[str, asyncio.Lock] = {}  # chat_id → lock (per-chat serial processing)
        self._sent_message_ids_to_chat: Dict[str, str] = {}  # message_id → chat_id (for reaction routing)
        self._sent_message_id_order: List[str] = []  # LRU order for _sent_message_ids_to_chat
@@ -1219,6 +1226,12 @@ class FeishuAdapter(BasePlatformAdapter):
            .register_p2_card_action_trigger(self._on_card_action_trigger)
            .register_p2_im_chat_member_bot_added_v1(self._on_bot_added_to_chat)
            .register_p2_im_chat_member_bot_deleted_v1(self._on_bot_removed_from_chat)
+            .register_p2_im_chat_access_event_bot_p2p_chat_entered_v1(self._on_p2p_chat_entered)
+            .register_p2_im_message_recalled_v1(self._on_message_recalled)
+            .register_p2_customized_event(
+                "drive.notice.comment_add_v1",
+                self._on_drive_comment_event,
+            )
            .build()
        )

@@ -1757,10 +1770,22 @@ class FeishuAdapter(BasePlatformAdapter):
    # =========================================================================

    def _on_message_event(self, data: Any) -> None:
-        """Normalize Feishu inbound events into MessageEvent."""
+        """Normalize Feishu inbound events into MessageEvent.
+
+        Called by the lark_oapi SDK's event dispatcher on a background thread.
+        If the adapter loop is not currently accepting callbacks (brief window
+        during startup/restart or network-flap reconnect), the event is queued
+        for replay instead of dropped.
+        """
        loop = self._loop
-        if loop is None or bool(getattr(loop, "is_closed", lambda: False)()):
-            logger.warning("[Feishu] Dropping inbound message before adapter loop is ready")
+        if not self._loop_accepts_callbacks(loop):
+            start_drainer = self._enqueue_pending_inbound_event(data)
+            if start_drainer:
+                threading.Thread(
+                    target=self._drain_pending_inbound_events,
+                    name="feishu-pending-inbound-drainer",
+                    daemon=True,
+                ).start()
            return
        future = asyncio.run_coroutine_threadsafe(
            self._handle_message_event_data(data),
@@ -1768,6 +1793,124 @@ class FeishuAdapter(BasePlatformAdapter):
        )
        future.add_done_callback(self._log_background_failure)

+    def _enqueue_pending_inbound_event(self, data: Any) -> bool:
+        """Append an event to the pending-inbound queue.
+
+        Returns True if the caller should spawn a drainer thread (no drainer
+        currently scheduled), False if a drainer is already running and will
+        pick up the new event on its next pass.
+        """
+        with self._pending_inbound_lock:
+            if len(self._pending_inbound_events) >= self._pending_inbound_max_depth:
+                # Queue full — drop the oldest to make room. This happens only
+                # if the loop stays unavailable for an extended period AND the
+                # WS keeps firing callbacks. Still better than silent drops.
+                dropped = self._pending_inbound_events.pop(0)
+                try:
+                    event = getattr(dropped, "event", None)
+                    message = getattr(event, "message", None)
+                    message_id = str(getattr(message, "message_id", "") or "unknown")
+                except Exception:
+                    message_id = "unknown"
+                logger.error(
+                    "[Feishu] Pending-inbound queue full (%d); dropped oldest event %s",
+                    self._pending_inbound_max_depth,
+                    message_id,
+                )
+            self._pending_inbound_events.append(data)
+            depth = len(self._pending_inbound_events)
+            should_start = not self._pending_drain_scheduled
+            if should_start:
+                self._pending_drain_scheduled = True
+        logger.warning(
+            "[Feishu] Queued inbound event for replay (loop not ready, queue depth=%d)",
+            depth,
+        )
+        return should_start
+
+    def _drain_pending_inbound_events(self) -> None:
+        """Replay queued inbound events once the adapter loop is ready.
+
+        Runs in a dedicated daemon thread. Polls ``_running`` and
+        ``_loop_accepts_callbacks`` until events can be dispatched or the
+        adapter shuts down. A single drainer handles the entire queue;
+        concurrent ``_on_message_event`` calls just append.
+        """
+        poll_interval = 0.25
+        max_wait_seconds = 120.0  # safety cap: drop queue after 2 minutes
+        waited = 0.0
+        try:
+            while True:
+                if not getattr(self, "_running", True):
+                    # Adapter shutting down — drop queued events rather than
+                    # holding them against a closed loop.
+                    with self._pending_inbound_lock:
+                        dropped = len(self._pending_inbound_events)
+                        self._pending_inbound_events.clear()
+                    if dropped:
+                        logger.warning(
+                            "[Feishu] Dropped %d queued inbound event(s) during shutdown",
+                            dropped,
+                        )
+                    return
+                loop = self._loop
+                if self._loop_accepts_callbacks(loop):
+                    with self._pending_inbound_lock:
+                        batch = self._pending_inbound_events[:]
+                        self._pending_inbound_events.clear()
+                    if not batch:
+                        # Queue emptied between check and grab; done.
+                        with self._pending_inbound_lock:
+                            if not self._pending_inbound_events:
+                                return
+                        continue
+                    dispatched = 0
+                    requeue: List[Any] = []
+                    for event in batch:
+                        try:
+                            fut = asyncio.run_coroutine_threadsafe(
+                                self._handle_message_event_data(event),
+                                loop,
+                            )
+                            fut.add_done_callback(self._log_background_failure)
+                            dispatched += 1
+                        except RuntimeError:
+                            # Loop closed between check and submit — requeue
+                            # and poll again.
+                            requeue.append(event)
+                    if requeue:
+                        with self._pending_inbound_lock:
+                            self._pending_inbound_events[:0] = requeue
+                    if dispatched:
+                        logger.info(
+                            "[Feishu] Replayed %d queued inbound event(s)",
+                            dispatched,
+                        )
+                    if not requeue:
+                        # Successfully drained; check if more arrived while
+                        # we were dispatching and exit if not.
+                        with self._pending_inbound_lock:
+                            if not self._pending_inbound_events:
+                                return
+                    # More events queued or requeue pending — loop again.
+                    continue
+                if waited >= max_wait_seconds:
+                    with self._pending_inbound_lock:
+                        dropped = len(self._pending_inbound_events)
+                        self._pending_inbound_events.clear()
+                    logger.error(
+                        "[Feishu] Adapter loop unavailable for %.0fs; "
+                        "dropped %d queued inbound event(s)",
+                        max_wait_seconds,
+                        dropped,
+                    )
+                    return
+                time.sleep(poll_interval)
+                waited += poll_interval
+        finally:
+            with self._pending_inbound_lock:
+                self._pending_drain_scheduled = False
+
    async def _handle_message_event_data(self, data: Any) -> None:
        """Shared inbound message handling for websocket and webhook transports."""
        event = getattr(data, "event", None)
@@ -1820,6 +1963,31 @@ class FeishuAdapter(BasePlatformAdapter):
        logger.info("[Feishu] Bot removed from chat: %s", chat_id)
        self._chat_info_cache.pop(chat_id, None)

+    def _on_p2p_chat_entered(self, data: Any) -> None:
+        logger.debug("[Feishu] User entered P2P chat with bot")
+
+    def _on_message_recalled(self, data: Any) -> None:
+        logger.debug("[Feishu] Message recalled by user")
+
+    def _on_drive_comment_event(self, data: Any) -> None:
+        """Handle drive document comment notification (drive.notice.comment_add_v1).
+
+        Delegates to :mod:`gateway.platforms.feishu_comment` for parsing,
+        logging, and reaction.  Scheduling follows the same
+        ``run_coroutine_threadsafe`` pattern used by ``_on_message_event``.
+        """
+        from gateway.platforms.feishu_comment import handle_drive_comment_event
+
+        loop = self._loop
+        if not self._loop_accepts_callbacks(loop):
+            logger.warning("[Feishu] Dropping drive comment event before adapter loop is ready")
+            return
+        future = asyncio.run_coroutine_threadsafe(
+            handle_drive_comment_event(self._client, data, self_open_id=self._bot_open_id),
+            loop,
+        )
+        future.add_done_callback(self._log_background_failure)
+
    def _on_reaction_event(self, event_type: str, data: Any) -> None:
        """Route user reactions on bot messages as synthetic text events."""
        event = getattr(data, "event", None)
@@ -2445,6 +2613,8 @@ class FeishuAdapter(BasePlatformAdapter):
            self._on_reaction_event(event_type, data)
        elif event_type == "card.action.trigger":
            self._on_card_action_trigger(data)
+        elif event_type == "drive.notice.comment_add_v1":
+            self._on_drive_comment_event(data)
        else:
            logger.debug("[Feishu] Ignoring webhook event type: %s", event_type or "unknown")
        return web.json_response({"code": 0, "msg": "ok"})
@@ -0,0 +1,429 @@
+"""
+Feishu document comment access-control rules.
+
+3-tier rule resolution: exact doc > wildcard "*" > top-level > code defaults.
+Each field (enabled/policy/allow_from) falls back independently.
+Config: ~/.hermes/feishu_comment_rules.json (mtime-cached, hot-reload).
+Pairing store: ~/.hermes/feishu_comment_pairing.json.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+from hermes_constants import get_hermes_home
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Paths
+# ---------------------------------------------------------------------------
+#
+# Uses the canonical ``get_hermes_home()`` helper (HERMES_HOME-aware and
+# profile-safe). Resolved at import time; this module is lazy-imported by
+# the Feishu comment event handler, which runs long after profile overrides
+# have been applied, so freezing paths here is safe.
+
+RULES_FILE = get_hermes_home() / "feishu_comment_rules.json"
+PAIRING_FILE = get_hermes_home() / "feishu_comment_pairing.json"
+
+# ---------------------------------------------------------------------------
+# Data models
+# ---------------------------------------------------------------------------
+
+_VALID_POLICIES = ("allowlist", "pairing")
+
+
+@dataclass(frozen=True)
+class CommentDocumentRule:
+    """Per-document rule.  ``None`` means 'inherit from lower tier'."""
+    enabled: Optional[bool] = None
+    policy: Optional[str] = None
+    allow_from: Optional[frozenset] = None
+
+
+@dataclass(frozen=True)
+class CommentsConfig:
+    """Top-level comment access config."""
+    enabled: bool = True
+    policy: str = "pairing"
+    allow_from: frozenset = field(default_factory=frozenset)
+    documents: Dict[str, CommentDocumentRule] = field(default_factory=dict)
+
+
+@dataclass(frozen=True)
+class ResolvedCommentRule:
+    """Fully resolved rule after field-by-field fallback."""
+    enabled: bool
+    policy: str
+    allow_from: frozenset
+    match_source: str  # e.g. "exact:docx:xxx" | "wildcard" | "top" | "default"
+
+
+# ---------------------------------------------------------------------------
+# Mtime-cached file loading
+# ---------------------------------------------------------------------------
+
+class _MtimeCache:
+    """Generic mtime-based file cache.  ``stat()`` per access, re-read only on change."""
+
+    def __init__(self, path: Path):
+        self._path = path
+        self._mtime: float = 0.0
+        self._data: Optional[dict] = None
+
+    def load(self) -> dict:
+        try:
+            st = self._path.stat()
+            mtime = st.st_mtime
+        except FileNotFoundError:
+            self._mtime = 0.0
+            self._data = {}
+            return {}
+
+        if mtime == self._mtime and self._data is not None:
+            return self._data
+
+        try:
+            with open(self._path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            if not isinstance(data, dict):
+                data = {}
+        except (json.JSONDecodeError, OSError):
+            logger.warning("[Feishu-Rules] Failed to read %s, using empty config", self._path)
+            data = {}
+
+        self._mtime = mtime
+        self._data = data
+        return data
+
+
+_rules_cache = _MtimeCache(RULES_FILE)
+_pairing_cache = _MtimeCache(PAIRING_FILE)
+
+
+# ---------------------------------------------------------------------------
+# Config parsing
+# ---------------------------------------------------------------------------
+
+def _parse_frozenset(raw: Any) -> Optional[frozenset]:
+    """Parse a list of strings into a frozenset; return None if key absent."""
+    if raw is None:
+        return None
+    if isinstance(raw, (list, tuple)):
+        return frozenset(str(u).strip() for u in raw if str(u).strip())
+    return None
+
+
+def _parse_document_rule(raw: dict) -> CommentDocumentRule:
+    enabled = raw.get("enabled")
+    if enabled is not None:
+        enabled = bool(enabled)
+    policy = raw.get("policy")
+    if policy is not None:
+        policy = str(policy).strip().lower()
+        if policy not in _VALID_POLICIES:
+            policy = None
+    allow_from = _parse_frozenset(raw.get("allow_from"))
+    return CommentDocumentRule(enabled=enabled, policy=policy, allow_from=allow_from)
+
+
+def load_config() -> CommentsConfig:
+    """Load comment rules from disk (mtime-cached)."""
+    raw = _rules_cache.load()
+    if not raw:
+        return CommentsConfig()
+
+    documents: Dict[str, CommentDocumentRule] = {}
+    raw_docs = raw.get("documents", {})
+    if isinstance(raw_docs, dict):
+        for key, rule_raw in raw_docs.items():
+            if isinstance(rule_raw, dict):
+                documents[str(key)] = _parse_document_rule(rule_raw)
+
+    policy = str(raw.get("policy", "pairing")).strip().lower()
+    if policy not in _VALID_POLICIES:
+        policy = "pairing"
+
+    return CommentsConfig(
+        enabled=raw.get("enabled", True),
+        policy=policy,
+        allow_from=_parse_frozenset(raw.get("allow_from")) or frozenset(),
+        documents=documents,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Rule resolution  (§8.4 field-by-field fallback)
+# ---------------------------------------------------------------------------
+
+def has_wiki_keys(cfg: CommentsConfig) -> bool:
+    """Check if any document rule key starts with 'wiki:'."""
+    return any(k.startswith("wiki:") for k in cfg.documents)
+
+
+def resolve_rule(
+    cfg: CommentsConfig,
+    file_type: str,
+    file_token: str,
+    wiki_token: str = "",
+) -> ResolvedCommentRule:
+    """Resolve effective rule: exact doc → wiki key → wildcard → top-level → defaults."""
+    exact_key = f"{file_type}:{file_token}"
+
+    exact = cfg.documents.get(exact_key)
+    exact_src = f"exact:{exact_key}"
+    if exact is None and wiki_token:
+        wiki_key = f"wiki:{wiki_token}"
+        exact = cfg.documents.get(wiki_key)
+        exact_src = f"exact:{wiki_key}"
+
+    wildcard = cfg.documents.get("*")
+
+    layers = []
+    if exact is not None:
+        layers.append((exact, exact_src))
+    if wildcard is not None:
+        layers.append((wildcard, "wildcard"))
+
+    def _pick(field_name: str):
+        for layer, source in layers:
+            val = getattr(layer, field_name)
+            if val is not None:
+                return val, source
+        return getattr(cfg, field_name), "top"
+
+    enabled, en_src = _pick("enabled")
+    policy, pol_src = _pick("policy")
+    allow_from, _ = _pick("allow_from")
+
+    # match_source = highest-priority tier that contributed any field
+    priority_order = {"exact": 0, "wildcard": 1, "top": 2}
+    best_src = min(
+        [en_src, pol_src],
+        key=lambda s: priority_order.get(s.split(":")[0], 3),
+    )
+
+    return ResolvedCommentRule(
+        enabled=enabled,
+        policy=policy,
+        allow_from=allow_from,
+        match_source=best_src,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Pairing store
+# ---------------------------------------------------------------------------
+
+def _load_pairing_approved() -> set:
+    """Return set of approved user open_ids (mtime-cached)."""
+    data = _pairing_cache.load()
+    approved = data.get("approved", {})
+    if isinstance(approved, dict):
+        return set(approved.keys())
+    if isinstance(approved, list):
+        return set(str(u) for u in approved if u)
+    return set()
+
+
+def _save_pairing(data: dict) -> None:
+    PAIRING_FILE.parent.mkdir(parents=True, exist_ok=True)
+    tmp = PAIRING_FILE.with_suffix(".tmp")
+    with open(tmp, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=2, ensure_ascii=False)
+    tmp.replace(PAIRING_FILE)
+    # Invalidate cache so next load picks up change
+    _pairing_cache._mtime = 0.0
+    _pairing_cache._data = None
+
+
+def pairing_add(user_open_id: str) -> bool:
+    """Add a user to the pairing-approved list. Returns True if newly added."""
+    data = _pairing_cache.load()
+    approved = data.get("approved", {})
+    if not isinstance(approved, dict):
+        approved = {}
+    if user_open_id in approved:
+        return False
+    approved[user_open_id] = {"approved_at": time.time()}
+    data["approved"] = approved
+    _save_pairing(data)
+    return True
+
+
+def pairing_remove(user_open_id: str) -> bool:
+    """Remove a user from the pairing-approved list. Returns True if removed."""
+    data = _pairing_cache.load()
+    approved = data.get("approved", {})
+    if not isinstance(approved, dict):
+        return False
+    if user_open_id not in approved:
+        return False
+    del approved[user_open_id]
+    data["approved"] = approved
+    _save_pairing(data)
+    return True
+
+
+def pairing_list() -> Dict[str, Any]:
+    """Return the approved dict  {user_open_id: {approved_at: ...}}."""
+    data = _pairing_cache.load()
+    approved = data.get("approved", {})
+    return dict(approved) if isinstance(approved, dict) else {}
+
+
+# ---------------------------------------------------------------------------
+# Access check  (public API for feishu_comment.py)
+# ---------------------------------------------------------------------------
+
+def is_user_allowed(rule: ResolvedCommentRule, user_open_id: str) -> bool:
+    """Check if user passes the resolved rule's policy gate."""
+    if user_open_id in rule.allow_from:
+        return True
+    if rule.policy == "pairing":
+        return user_open_id in _load_pairing_approved()
+    return False
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def _print_status() -> None:
+    cfg = load_config()
+    print(f"Rules file: {RULES_FILE}")
+    print(f"  exists: {RULES_FILE.exists()}")
+    print(f"Pairing file: {PAIRING_FILE}")
+    print(f"  exists: {PAIRING_FILE.exists()}")
+    print()
+    print(f"Top-level:")
+    print(f"  enabled:    {cfg.enabled}")
+    print(f"  policy:     {cfg.policy}")
+    print(f"  allow_from: {sorted(cfg.allow_from) if cfg.allow_from else '[]'}")
+    print()
+    if cfg.documents:
+        print(f"Document rules ({len(cfg.documents)}):")
+        for key, rule in sorted(cfg.documents.items()):
+            parts = []
+            if rule.enabled is not None:
+                parts.append(f"enabled={rule.enabled}")
+            if rule.policy is not None:
+                parts.append(f"policy={rule.policy}")
+            if rule.allow_from is not None:
+                parts.append(f"allow_from={sorted(rule.allow_from)}")
+            print(f"  [{key}] {', '.join(parts) if parts else '(empty — inherits all)'}")
+    else:
+        print("Document rules: (none)")
+    print()
+    approved = pairing_list()
+    print(f"Pairing approved ({len(approved)}):")
+    for uid, meta in sorted(approved.items()):
+        ts = meta.get("approved_at", 0)
+        print(f"  {uid}  (approved_at={ts})")
+
+
+def _do_check(doc_key: str, user_open_id: str) -> None:
+    cfg = load_config()
+    parts = doc_key.split(":", 1)
+    if len(parts) != 2:
+        print(f"Error: doc_key must be 'fileType:fileToken', got '{doc_key}'")
+        return
+    file_type, file_token = parts
+    rule = resolve_rule(cfg, file_type, file_token)
+    allowed = is_user_allowed(rule, user_open_id)
+    print(f"Document:     {doc_key}")
+    print(f"User:         {user_open_id}")
+    print(f"Resolved rule:")
+    print(f"  enabled:      {rule.enabled}")
+    print(f"  policy:       {rule.policy}")
+    print(f"  allow_from:   {sorted(rule.allow_from) if rule.allow_from else '[]'}")
+    print(f"  match_source: {rule.match_source}")
+    print(f"Result:       {'ALLOWED' if allowed else 'DENIED'}")
+
+
+def _main() -> int:
+    import sys
+
+    try:
+        from hermes_cli.env_loader import load_hermes_dotenv
+        load_hermes_dotenv()
+    except Exception:
+        pass
+
+    usage = (
+        "Usage: python -m gateway.platforms.feishu_comment_rules <command> [args]\n"
+        "\n"
+        "Commands:\n"
+        "  status                              Show rules config and pairing state\n"
+        "  check <fileType:token> <user>        Simulate access check\n"
+        "  pairing add <user_open_id>           Add user to pairing-approved list\n"
+        "  pairing remove <user_open_id>        Remove user from pairing-approved list\n"
+        "  pairing list                         List pairing-approved users\n"
+        "\n"
+        f"Rules config file: {RULES_FILE}\n"
+        "  Edit this JSON file directly to configure policies and document rules.\n"
+        "  Changes take effect on the next comment event (no restart needed).\n"
+    )
+
+    args = sys.argv[1:]
+    if not args:
+        print(usage)
+        return 1
+
+    cmd = args[0]
+
+    if cmd == "status":
+        _print_status()
+
+    elif cmd == "check":
+        if len(args) < 3:
+            print("Usage: check <fileType:fileToken> <user_open_id>")
+            return 1
+        _do_check(args[1], args[2])
+
+    elif cmd == "pairing":
+        if len(args) < 2:
+            print("Usage: pairing <add|remove|list> [args]")
+            return 1
+        sub = args[1]
+        if sub == "add":
+            if len(args) < 3:
+                print("Usage: pairing add <user_open_id>")
+                return 1
+            if pairing_add(args[2]):
+                print(f"Added: {args[2]}")
+            else:
+                print(f"Already approved: {args[2]}")
+        elif sub == "remove":
+            if len(args) < 3:
+                print("Usage: pairing remove <user_open_id>")
+                return 1
+            if pairing_remove(args[2]):
+                print(f"Removed: {args[2]}")
+            else:
+                print(f"Not in approved list: {args[2]}")
+        elif sub == "list":
+            approved = pairing_list()
+            if not approved:
+                print("(no approved users)")
+            for uid, meta in sorted(approved.items()):
+                print(f"  {uid}  approved_at={meta.get('approved_at', '?')}")
+        else:
+            print(f"Unknown pairing subcommand: {sub}")
+            return 1
+    else:
+        print(f"Unknown command: {cmd}\n")
+        print(usage)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(_main())
@@ -49,7 +49,10 @@ class MessageDeduplicator:
            return False
        now = time.time()
        if msg_id in self._seen:
-            return True
+            if now - self._seen[msg_id] < self._ttl:
+                return True
+            # Entry has expired — remove it and treat as new
+            del self._seen[msg_id]
        self._seen[msg_id] = now
        if len(self._seen) > self._max_size:
            cutoff = now - self._ttl
@@ -718,6 +718,12 @@ class MattermostAdapter(BasePlatformAdapter):
            thread_id=thread_id,
        )

+        # Per-channel ephemeral prompt
+        from gateway.platforms.base import resolve_channel_prompt
+        _channel_prompt = resolve_channel_prompt(
+            self.config.extra, channel_id, None,
+        )
+
        msg_event = MessageEvent(
            text=message_text,
            message_type=msg_type,
@@ -726,6 +732,7 @@ class MattermostAdapter(BasePlatformAdapter):
            message_id=post_id,
            media_urls=media_urls if media_urls else None,
            media_types=media_types if media_types else None,
+            channel_prompt=_channel_prompt,
        )

        await self.handle_message(msg_event)
@@ -0,0 +1,57 @@
+"""
+QQBot platform package.
+
+Re-exports the main adapter symbols from ``adapter.py`` (the original
+``qqbot.py``) so that **all existing import paths remain unchanged**::
+
+    from gateway.platforms.qqbot import QQAdapter          # works
+    from gateway.platforms.qqbot import check_qq_requirements  # works
+
+New modules:
+    - ``constants`` — shared constants (API URLs, timeouts, message types)
+    - ``utils`` — User-Agent builder, config helpers
+    - ``crypto`` — AES-256-GCM key generation and decryption
+    - ``onboard`` — QR-code scan-to-configure flow
+"""
+
+# -- Adapter (original qqbot.py) ------------------------------------------
+from .adapter import (  # noqa: F401
+    QQAdapter,
+    QQCloseError,
+    check_qq_requirements,
+    _coerce_list,
+    _ssrf_redirect_guard,
+)
+
+# -- Onboard (QR-code scan-to-configure) -----------------------------------
+from .onboard import (  # noqa: F401
+    BindStatus,
+    create_bind_task,
+    poll_bind_result,
+    build_connect_url,
+)
+from .crypto import decrypt_secret, generate_bind_key  # noqa: F401
+
+# -- Utils -----------------------------------------------------------------
+from .utils import build_user_agent, get_api_headers, coerce_list  # noqa: F401
+
+__all__ = [
+    # adapter
+    "QQAdapter",
+    "QQCloseError",
+    "check_qq_requirements",
+    "_coerce_list",
+    "_ssrf_redirect_guard",
+    # onboard
+    "BindStatus",
+    "create_bind_task",
+    "poll_bind_result",
+    "build_connect_url",
+    # crypto
+    "decrypt_secret",
+    "generate_bind_key",
+    # utils
+    "build_user_agent",
+    "get_api_headers",
+    "coerce_list",
+]
@@ -0,0 +1,74 @@
+"""QQBot package-level constants shared across adapter, onboard, and other modules."""
+
+from __future__ import annotations
+
+import os
+
+# ---------------------------------------------------------------------------
+# QQBot adapter version — bump on functional changes to the adapter package.
+# ---------------------------------------------------------------------------
+
+QQBOT_VERSION = "1.1.0"
+
+# ---------------------------------------------------------------------------
+# API endpoints
+# ---------------------------------------------------------------------------
+
+# The portal domain is configurable via QQ_API_HOST for corporate proxies
+# or test environments.  Default: q.qq.com (production).
+PORTAL_HOST = os.getenv("QQ_PORTAL_HOST", "q.qq.com")
+
+API_BASE = "https://api.sgroup.qq.com"
+TOKEN_URL = "https://bots.qq.com/app/getAppAccessToken"
+GATEWAY_URL_PATH = "/gateway"
+
+# QR-code onboard endpoints (on the portal host)
+ONBOARD_CREATE_PATH = "/lite/create_bind_task"
+ONBOARD_POLL_PATH = "/lite/poll_bind_result"
+QR_URL_TEMPLATE = (
+    "https://q.qq.com/qqbot/openclaw/connect.html"
+    "?task_id={task_id}&_wv=2&source=hermes"
+)
+
+# ---------------------------------------------------------------------------
+# Timeouts & retry
+# ---------------------------------------------------------------------------
+
+DEFAULT_API_TIMEOUT = 30.0
+FILE_UPLOAD_TIMEOUT = 120.0
+CONNECT_TIMEOUT_SECONDS = 20.0
+
+RECONNECT_BACKOFF = [2, 5, 10, 30, 60]
+MAX_RECONNECT_ATTEMPTS = 100
+RATE_LIMIT_DELAY = 60  # seconds
+QUICK_DISCONNECT_THRESHOLD = 5.0  # seconds
+MAX_QUICK_DISCONNECT_COUNT = 3
+
+ONBOARD_POLL_INTERVAL = 2.0  # seconds between poll_bind_result calls
+ONBOARD_API_TIMEOUT = 10.0
+
+# ---------------------------------------------------------------------------
+# Message limits
+# ---------------------------------------------------------------------------
+
+MAX_MESSAGE_LENGTH = 4000
+DEDUP_WINDOW_SECONDS = 300
+DEDUP_MAX_SIZE = 1000
+
+# ---------------------------------------------------------------------------
+# QQ Bot message types
+# ---------------------------------------------------------------------------
+
+MSG_TYPE_TEXT = 0
+MSG_TYPE_MARKDOWN = 2
+MSG_TYPE_MEDIA = 7
+MSG_TYPE_INPUT_NOTIFY = 6
+
+# ---------------------------------------------------------------------------
+# QQ Bot file media types
+# ---------------------------------------------------------------------------
+
+MEDIA_TYPE_IMAGE = 1
+MEDIA_TYPE_VIDEO = 2
+MEDIA_TYPE_VOICE = 3
+MEDIA_TYPE_FILE = 4
@@ -0,0 +1,45 @@
+"""AES-256-GCM utilities for QQBot scan-to-configure credential decryption."""
+
+from __future__ import annotations
+
+import base64
+import os
+
+
+def generate_bind_key() -> str:
+    """Generate a 256-bit random AES key and return it as base64.
+
+    The key is passed to ``create_bind_task`` so the server can encrypt
+    the bot's *client_secret* before returning it.  Only this CLI holds
+    the key, ensuring the secret never travels in plaintext.
+    """
+    return base64.b64encode(os.urandom(32)).decode()
+
+
+def decrypt_secret(encrypted_base64: str, key_base64: str) -> str:
+    """Decrypt a base64-encoded AES-256-GCM ciphertext.
+
+    Ciphertext layout (after base64-decoding)::
+
+        IV (12 bytes) ‖ ciphertext (N bytes) ‖ AuthTag (16 bytes)
+
+    Args:
+        encrypted_base64: The ``bot_encrypt_secret`` value from
+            ``poll_bind_result``.
+        key_base64: The base64 AES key generated by
+            :func:`generate_bind_key`.
+
+    Returns:
+        The decrypted *client_secret* as a UTF-8 string.
+    """
+    from cryptography.hazmat.primitives.ciphers.aead import AESGCM
+
+    key = base64.b64decode(key_base64)
+    raw = base64.b64decode(encrypted_base64)
+
+    iv = raw[:12]
+    ciphertext_with_tag = raw[12:]  # AESGCM expects ciphertext + tag concatenated
+
+    aesgcm = AESGCM(key)
+    plaintext = aesgcm.decrypt(iv, ciphertext_with_tag, None)
+    return plaintext.decode("utf-8")
@@ -0,0 +1,124 @@
+"""
+QQBot scan-to-configure (QR code onboard) module.
+
+Calls the ``q.qq.com`` ``create_bind_task`` / ``poll_bind_result`` APIs to
+generate a QR-code URL and poll for scan completion.  On success the caller
+receives the bot's *app_id*, *client_secret* (decrypted locally), and the
+scanner's *user_openid* — enough to fully configure the QQBot gateway.
+
+Reference: https://bot.q.qq.com/wiki/develop/api-v2/
+"""
+
+from __future__ import annotations
+
+import logging
+from enum import IntEnum
+from typing import Tuple
+from urllib.parse import quote
+
+from .constants import (
+    ONBOARD_API_TIMEOUT,
+    ONBOARD_CREATE_PATH,
+    ONBOARD_POLL_PATH,
+    PORTAL_HOST,
+    QR_URL_TEMPLATE,
+)
+from .crypto import generate_bind_key
+from .utils import get_api_headers
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Bind status
+# ---------------------------------------------------------------------------
+
+
+class BindStatus(IntEnum):
+    """Status codes returned by ``poll_bind_result``."""
+
+    NONE = 0
+    PENDING = 1
+    COMPLETED = 2
+    EXPIRED = 3
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+async def create_bind_task(
+    timeout: float = ONBOARD_API_TIMEOUT,
+) -> Tuple[str, str]:
+    """Create a bind task and return *(task_id, aes_key_base64)*.
+
+    The AES key is generated locally and sent to the server so it can
+    encrypt the bot credentials before returning them.
+
+    Raises:
+        RuntimeError: If the API returns a non-zero ``retcode``.
+    """
+    import httpx
+
+    url = f"https://{PORTAL_HOST}{ONBOARD_CREATE_PATH}"
+    key = generate_bind_key()
+
+    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
+        resp = await client.post(url, json={"key": key}, headers=get_api_headers())
+        resp.raise_for_status()
+        data = resp.json()
+
+    if data.get("retcode") != 0:
+        raise RuntimeError(data.get("msg", "create_bind_task failed"))
+
+    task_id = data.get("data", {}).get("task_id")
+    if not task_id:
+        raise RuntimeError("create_bind_task: missing task_id in response")
+
+    logger.debug("create_bind_task ok: task_id=%s", task_id)
+    return task_id, key
+
+
+async def poll_bind_result(
+    task_id: str,
+    timeout: float = ONBOARD_API_TIMEOUT,
+) -> Tuple[BindStatus, str, str, str]:
+    """Poll the bind result for *task_id*.
+
+    Returns:
+        A 4-tuple of ``(status, bot_appid, bot_encrypt_secret, user_openid)``.
+
+        * ``bot_encrypt_secret`` is AES-256-GCM encrypted — decrypt it with
+          :func:`~gateway.platforms.qqbot.crypto.decrypt_secret` using the
+          key from :func:`create_bind_task`.
+        * ``user_openid`` is the OpenID of the person who scanned the code
+          (available when ``status == COMPLETED``).
+
+    Raises:
+        RuntimeError: If the API returns a non-zero ``retcode``.
+    """
+    import httpx
+
+    url = f"https://{PORTAL_HOST}{ONBOARD_POLL_PATH}"
+
+    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
+        resp = await client.post(url, json={"task_id": task_id}, headers=get_api_headers())
+        resp.raise_for_status()
+        data = resp.json()
+
+    if data.get("retcode") != 0:
+        raise RuntimeError(data.get("msg", "poll_bind_result failed"))
+
+    d = data.get("data", {})
+    return (
+        BindStatus(d.get("status", 0)),
+        str(d.get("bot_appid", "")),
+        d.get("bot_encrypt_secret", ""),
+        d.get("user_openid", ""),
+    )
+
+
+def build_connect_url(task_id: str) -> str:
+    """Build the QR-code target URL for a given *task_id*."""
+    return QR_URL_TEMPLATE.format(task_id=quote(task_id))
@@ -0,0 +1,71 @@
+"""QQBot shared utilities — User-Agent, HTTP helpers, config coercion."""
+
+from __future__ import annotations
+
+import platform
+import sys
+from typing import Any, Dict, List
+
+from .constants import QQBOT_VERSION
+
+
+# ---------------------------------------------------------------------------
+# User-Agent
+# ---------------------------------------------------------------------------
+
+def _get_hermes_version() -> str:
+    """Return the hermes-agent package version, or 'dev' if unavailable."""
+    try:
+        from importlib.metadata import version
+        return version("hermes-agent")
+    except Exception:
+        return "dev"
+
+
+def build_user_agent() -> str:
+    """Build a descriptive User-Agent string.
+
+    Format::
+
+        QQBotAdapter/<qqbot_version> (Python/<py_version>; <os>; Hermes/<hermes_version>)
+
+    Example::
+
+        QQBotAdapter/1.0.0 (Python/3.11.15; darwin; Hermes/0.9.0)
+    """
+    py_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
+    os_name = platform.system().lower()
+    hermes_version = _get_hermes_version()
+    return f"QQBotAdapter/{QQBOT_VERSION} (Python/{py_version}; {os_name}; Hermes/{hermes_version})"
+
+
+def get_api_headers() -> Dict[str, str]:
+    """Return standard HTTP headers for QQBot API requests.
+
+    Includes ``Content-Type``, ``Accept``, and a dynamic ``User-Agent``.
+    ``q.qq.com`` requires ``Accept: application/json`` — without it,
+    the server returns a JavaScript anti-bot challenge page.
+    """
+    return {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "User-Agent": build_user_agent(),
+    }
+
+
+# ---------------------------------------------------------------------------
+# Config helpers
+# ---------------------------------------------------------------------------
+
+def coerce_list(value: Any) -> List[str]:
+    """Coerce config values into a trimmed string list.
+
+    Accepts comma-separated strings, lists, tuples, sets, or single values.
+    """
+    if value is None:
+        return []
+    if isinstance(value, str):
+        return [item.strip() for item in value.split(",") if item.strip()]
+    if isinstance(value, (list, tuple, set)):
+        return [str(item).strip() for item in value if str(item).strip()]
+    return [str(value).strip()] if str(value).strip() else []
@@ -160,6 +160,14 @@ class SignalAdapter(BasePlatformAdapter):
        self._sse_task: Optional[asyncio.Task] = None
        self._health_monitor_task: Optional[asyncio.Task] = None
        self._typing_tasks: Dict[str, asyncio.Task] = {}
+        # Per-chat typing-indicator backoff. When signal-cli reports
+        # NETWORK_FAILURE (recipient offline / unroutable), base.py's
+        # _keep_typing refresh loop would otherwise hammer sendTyping every
+        # ~2s indefinitely, producing WARNING-level log spam and pointless
+        # RPC traffic. We track consecutive failures per chat and skip the
+        # RPC during a cooldown window instead.
+        self._typing_failures: Dict[str, int] = {}
+        self._typing_skip_until: Dict[str, float] = {}
        self._running = False
        self._last_sse_activity = 0.0
        self._sse_response: Optional[httpx.Response] = None
@@ -548,8 +556,22 @@ class SignalAdapter(BasePlatformAdapter):
    # JSON-RPC Communication
    # ------------------------------------------------------------------

-    async def _rpc(self, method: str, params: dict, rpc_id: str = None) -> Any:
-        """Send a JSON-RPC 2.0 request to signal-cli daemon."""
+    async def _rpc(
+        self,
+        method: str,
+        params: dict,
+        rpc_id: str = None,
+        *,
+        log_failures: bool = True,
+    ) -> Any:
+        """Send a JSON-RPC 2.0 request to signal-cli daemon.
+
+        When ``log_failures=False``, error and exception paths log at DEBUG
+        instead of WARNING — used by the typing-indicator path to silence
+        repeated NETWORK_FAILURE spam for unreachable recipients while
+        still preserving visibility for the first occurrence and for
+        unrelated RPCs.
+        """
        if not self.client:
            logger.warning("Signal: RPC called but client not connected")
            return None
@@ -574,13 +596,19 @@ class SignalAdapter(BasePlatformAdapter):
            data = resp.json()

            if "error" in data:
-                logger.warning("Signal RPC error (%s): %s", method, data["error"])
+                if log_failures:
+                    logger.warning("Signal RPC error (%s): %s", method, data["error"])
+                else:
+                    logger.debug("Signal RPC error (%s): %s", method, data["error"])
                return None

            return data.get("result")

        except Exception as e:
-            logger.warning("Signal RPC %s failed: %s", method, e)
+            if log_failures:
+                logger.warning("Signal RPC %s failed: %s", method, e)
+            else:
+                logger.debug("Signal RPC %s failed: %s", method, e)
            return None

    # ------------------------------------------------------------------
@@ -627,7 +655,28 @@ class SignalAdapter(BasePlatformAdapter):
                self._recent_sent_timestamps.pop()

    async def send_typing(self, chat_id: str, metadata=None) -> None:
-        """Send a typing indicator."""
+        """Send a typing indicator.
+
+        base.py's ``_keep_typing`` refresh loop calls this every ~2s while
+        the agent is processing. If signal-cli returns NETWORK_FAILURE for
+        this recipient (offline, unroutable, group membership lost, etc.)
+        the unmitigated behaviour is: a WARNING log every 2 seconds for as
+        long as the agent keeps running. Instead we:
+
+        - silence the WARNING after the first consecutive failure (subsequent
+          attempts log at DEBUG) so transport issues are still visible once
+          but don't flood the log,
+        - skip the RPC entirely during an exponential cooldown window once
+          three consecutive failures have happened, so we stop hammering
+          signal-cli with requests it can't deliver.
+
+        A successful sendTyping clears the counters.
+        """
+        now = time.monotonic()
+        skip_until = self._typing_skip_until.get(chat_id, 0.0)
+        if now < skip_until:
+            return
+
        params: Dict[str, Any] = {
            "account": self.account,
        }
@@ -637,7 +686,26 @@ class SignalAdapter(BasePlatformAdapter):
        else:
            params["recipient"] = [chat_id]

-        await self._rpc("sendTyping", params, rpc_id="typing")
+        fails = self._typing_failures.get(chat_id, 0)
+        result = await self._rpc(
+            "sendTyping",
+            params,
+            rpc_id="typing",
+            log_failures=(fails == 0),
+        )
+
+        if result is None:
+            fails += 1
+            self._typing_failures[chat_id] = fails
+            # After 3 consecutive failures, back off exponentially (16s,
+            # 32s, 60s cap) to stop spamming signal-cli for a recipient
+            # that clearly isn't reachable right now.
+            if fails >= 3:
+                backoff = min(60.0, 16.0 * (2 ** (fails - 3)))
+                self._typing_skip_until[chat_id] = now + backoff
+        else:
+            self._typing_failures.pop(chat_id, None)
+            self._typing_skip_until.pop(chat_id, None)

    async def send_image(
        self,
@@ -789,6 +857,10 @@ class SignalAdapter(BasePlatformAdapter):
                await task
            except asyncio.CancelledError:
                pass
+        # Reset per-chat typing backoff state so the next agent turn starts
+        # fresh rather than inheriting a cooldown from a prior conversation.
+        self._typing_failures.pop(chat_id, None)
+        self._typing_skip_until.pop(chat_id, None)

    async def stop_typing(self, chat_id: str) -> None:
        """Public interface for stopping typing — called by base adapter's
@@ -366,6 +366,20 @@ class SlackAdapter(BasePlatformAdapter):
            # in an assistant-enabled context. Falls back to reactions.
            logger.debug("[Slack] assistant.threads.setStatus failed: %s", e)

+    def _dm_top_level_threads_as_sessions(self) -> bool:
+        """Whether top-level Slack DMs get per-message session threads.
+
+        Defaults to ``True`` so each visible DM reply thread is isolated as its
+        own Hermes session — matching the per-thread behavior channels already
+        have.  Set ``platforms.slack.extra.dm_top_level_threads_as_sessions``
+        to ``false`` in config.yaml to revert to the legacy behavior where all
+        top-level DMs share one continuous session.
+        """
+        raw = self.config.extra.get("dm_top_level_threads_as_sessions")
+        if raw is None:
+            return True  # default: each DM thread is its own session
+        return str(raw).strip().lower() in ("1", "true", "yes", "on")
+
    def _resolve_thread_ts(
        self,
        reply_to: Optional[str] = None,
@@ -996,10 +1010,14 @@ class SlackAdapter(BasePlatformAdapter):
        # Build thread_ts for session keying.
        # In channels: fall back to ts so each top-level @mention starts a
        #   new thread/session (the bot always replies in a thread).
-        # In DMs: only use the real thread_ts — top-level DMs should share
-        #   one continuous session, threaded DMs get their own session.
+        # In DMs: fall back to ts so each top-level DM reply thread gets
+        #   its own session key (matching channel behavior). Set
+        #   dm_top_level_threads_as_sessions: false in config to revert to
+        #   legacy single-session-per-DM-channel behavior.
        if is_dm:
-            thread_ts = event.get("thread_ts") or assistant_meta.get("thread_ts")  # None for top-level DMs
+            thread_ts = event.get("thread_ts") or assistant_meta.get("thread_ts")
+            if not thread_ts and self._dm_top_level_threads_as_sessions():
+                thread_ts = ts
        else:
            thread_ts = event.get("thread_ts") or ts  # ts fallback for channels

@@ -1167,6 +1185,12 @@ class SlackAdapter(BasePlatformAdapter):
            thread_id=thread_ts,
        )

+        # Per-channel ephemeral prompt
+        from gateway.platforms.base import resolve_channel_prompt
+        _channel_prompt = resolve_channel_prompt(
+            self.config.extra, channel_id, None,
+        )
+
        msg_event = MessageEvent(
            text=text,
            message_type=msg_type,
@@ -1176,6 +1200,7 @@ class SlackAdapter(BasePlatformAdapter):
            media_urls=media_urls,
            media_types=media_types,
            reply_to_message_id=thread_ts if thread_ts != ts else None,
+            channel_prompt=_channel_prompt,
        )

        # Only react when bot is directly addressed (DM or @mention).
@@ -11,6 +11,7 @@ import asyncio
 import json
 import logging
 import os
+import html as _html
 import re
 from typing import Dict, List, Optional, Any

@@ -18,6 +19,10 @@ logger = logging.getLogger(__name__)

 try:
    from telegram import Update, Bot, Message, InlineKeyboardButton, InlineKeyboardMarkup
+    try:
+        from telegram import LinkPreviewOptions
+    except ImportError:
+        LinkPreviewOptions = None
    from telegram.ext import (
        Application,
        CommandHandler,
@@ -36,6 +41,7 @@ except ImportError:
    Message = Any
    InlineKeyboardButton = Any
    InlineKeyboardMarkup = Any
+    LinkPreviewOptions = None
    Application = Any
    CommandHandler = Any
    CallbackQueryHandler = Any
@@ -112,6 +118,84 @@ def _strip_mdv2(text: str) -> str:
    return cleaned


+# ---------------------------------------------------------------------------
+# Markdown table → code block conversion
+# ---------------------------------------------------------------------------
+# Telegram's MarkdownV2 has no table syntax — '|' is just an escaped literal,
+# so pipe tables render as noisy backslash-pipe text with no alignment.
+# Wrapping the table in a fenced code block makes Telegram render it as
+# monospace preformatted text with columns intact.
+
+# Matches a GFM table delimiter row: optional outer pipes, cells containing
+# only dashes (with optional leading/trailing colons for alignment) separated
+# by '|'.  Requires at least one internal '|' so lone '---' horizontal rules
+# are NOT matched.
+_TABLE_SEPARATOR_RE = re.compile(
+    r'^\s*\|?\s*:?-+:?\s*(?:\|\s*:?-+:?\s*){1,}\|?\s*$'
+)
+
+
+def _is_table_row(line: str) -> bool:
+    """Return True if *line* could plausibly be a table data row."""
+    stripped = line.strip()
+    return bool(stripped) and '|' in stripped
+
+
+def _wrap_markdown_tables(text: str) -> str:
+    """Wrap GFM-style pipe tables in ``` fences so Telegram renders them.
+
+    Detected by a row containing '|' immediately followed by a delimiter
+    row matching :data:`_TABLE_SEPARATOR_RE`.  Subsequent pipe-containing
+    non-blank lines are consumed as the table body and included in the
+    wrapped block.  Tables inside existing fenced code blocks are left
+    alone.
+    """
+    if '|' not in text or '-' not in text:
+        return text
+
+    lines = text.split('\n')
+    out: list[str] = []
+    in_fence = False
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+        stripped = line.lstrip()
+
+        # Track existing fenced code blocks — never touch content inside.
+        if stripped.startswith('```'):
+            in_fence = not in_fence
+            out.append(line)
+            i += 1
+            continue
+        if in_fence:
+            out.append(line)
+            i += 1
+            continue
+
+        # Look for a header row (contains '|') immediately followed by a
+        # delimiter row.
+        if (
+            '|' in line
+            and i + 1 < len(lines)
+            and _TABLE_SEPARATOR_RE.match(lines[i + 1])
+        ):
+            table_block = [line, lines[i + 1]]
+            j = i + 2
+            while j < len(lines) and _is_table_row(lines[j]):
+                table_block.append(lines[j])
+                j += 1
+            out.append('```')
+            out.extend(table_block)
+            out.append('```')
+            i = j
+            continue
+
+        out.append(line)
+        i += 1
+
+    return '\n'.join(out)
+
+
 class TelegramAdapter(BasePlatformAdapter):
    """
    Telegram bot adapter.
@@ -129,6 +213,7 @@ class TelegramAdapter(BasePlatformAdapter):
    # When a chunk is near this limit, a continuation is almost certain.
    _SPLIT_THRESHOLD = 4000
    MEDIA_GROUP_WAIT_SECONDS = 0.8
+    _GENERAL_TOPIC_THREAD_ID = "1"
    
    def __init__(self, config: PlatformConfig):
        super().__init__(config, Platform.TELEGRAM)
@@ -137,6 +222,7 @@ class TelegramAdapter(BasePlatformAdapter):
        self._webhook_mode: bool = False
        self._mention_patterns = self._compile_mention_patterns()
        self._reply_to_mode: str = getattr(config, 'reply_to_mode', 'first') or 'first'
+        self._disable_link_previews: bool = self._coerce_bool_extra("disable_link_previews", False)
        # Buffer rapid/album photo updates so Telegram image bursts are handled
        # as a single MessageEvent instead of self-interrupting multiple turns.
        self._media_batch_delay_seconds = float(os.getenv("HERMES_TELEGRAM_MEDIA_BATCH_DELAY_SECONDS", "0.8"))
@@ -163,6 +249,38 @@ class TelegramAdapter(BasePlatformAdapter):
        # Approval button state: message_id → session_key
        self._approval_state: Dict[int, str] = {}

+    @staticmethod
+    def _is_callback_user_authorized(user_id: str) -> bool:
+        """Return whether a Telegram inline-button caller may perform gated actions."""
+        allowed_csv = os.getenv("TELEGRAM_ALLOWED_USERS", "").strip()
+        if not allowed_csv:
+            return True
+        allowed_ids = {uid.strip() for uid in allowed_csv.split(",") if uid.strip()}
+        return "*" in allowed_ids or user_id in allowed_ids
+
+    @classmethod
+    def _metadata_thread_id(cls, metadata: Optional[Dict[str, Any]]) -> Optional[str]:
+        if not metadata:
+            return None
+        thread_id = metadata.get("thread_id") or metadata.get("message_thread_id")
+        return str(thread_id) if thread_id is not None else None
+
+    @classmethod
+    def _message_thread_id_for_send(cls, thread_id: Optional[str]) -> Optional[int]:
+        if not thread_id or str(thread_id) == cls._GENERAL_TOPIC_THREAD_ID:
+            return None
+        return int(thread_id)
+
+    @classmethod
+    def _message_thread_id_for_typing(cls, thread_id: Optional[str]) -> Optional[int]:
+        if not thread_id:
+            return None
+        return int(thread_id)
+
+    @staticmethod
+    def _is_thread_not_found_error(error: Exception) -> bool:
+        return "thread not found" in str(error).lower()
+
    def _fallback_ips(self) -> list[str]:
        """Return validated fallback IPs from config (populated by _apply_env_overrides)."""
        configured = self.config.extra.get("fallback_ips", []) if getattr(self.config, "extra", None) else []
@@ -193,6 +311,26 @@ class TelegramAdapter(BasePlatformAdapter):
            pass
        return isinstance(error, OSError)

+    def _coerce_bool_extra(self, key: str, default: bool = False) -> bool:
+        value = self.config.extra.get(key) if getattr(self.config, "extra", None) else None
+        if value is None:
+            return default
+        if isinstance(value, str):
+            lowered = value.strip().lower()
+            if lowered in ("true", "1", "yes", "on"):
+                return True
+            if lowered in ("false", "0", "no", "off"):
+                return False
+            return default
+        return bool(value)
+
+    def _link_preview_kwargs(self) -> Dict[str, Any]:
+        if not getattr(self, "_disable_link_previews", False):
+            return {}
+        if LinkPreviewOptions is not None:
+            return {"link_preview_options": LinkPreviewOptions(is_disabled=True)}
+        return {"disable_web_page_preview": True}
+
    async def _handle_polling_network_error(self, error: Exception) -> None:
        """Reconnect polling after a transient network interruption.

@@ -540,7 +678,7 @@ class TelegramAdapter(BasePlatformAdapter):
                "write_timeout": _env_float("HERMES_TELEGRAM_HTTP_WRITE_TIMEOUT", 20.0),
            }

-            proxy_url = resolve_proxy_url()
+            proxy_url = resolve_proxy_url("TELEGRAM_PROXY")
            disable_fallback = (os.getenv("HERMES_TELEGRAM_DISABLE_FALLBACK_IPS", "").strip().lower() in ("1", "true", "yes", "on"))
            fallback_ips = self._fallback_ips()
            if not fallback_ips:
@@ -606,14 +744,14 @@ class TelegramAdapter(BasePlatformAdapter):
                from telegram.error import NetworkError, TimedOut
            except ImportError:
                NetworkError = TimedOut = OSError  # type: ignore[misc,assignment]
-            _max_connect = 3
+            _max_connect = 8
            for _attempt in range(_max_connect):
                try:
                    await self._app.initialize()
                    break
                except (NetworkError, TimedOut, OSError) as init_err:
                    if _attempt < _max_connect - 1:
-                        wait = 2 ** _attempt
+                        wait = min(2 ** _attempt, 15)
                        logger.warning(
                            "[%s] Connect attempt %d/%d failed: %s — retrying in %ds",
                            self.name, _attempt + 1, _max_connect, init_err, wait,
@@ -814,7 +952,7 @@ class TelegramAdapter(BasePlatformAdapter):
                ]
            
            message_ids = []
-            thread_id = metadata.get("thread_id") if metadata else None
+            thread_id = self._metadata_thread_id(metadata)
            
            try:
                from telegram.error import NetworkError as _NetErr
@@ -834,7 +972,7 @@ class TelegramAdapter(BasePlatformAdapter):
            for i, chunk in enumerate(chunks):
                should_thread = self._should_thread_reply(reply_to, i)
                reply_to_id = int(reply_to) if should_thread else None
-                effective_thread_id = int(thread_id) if thread_id else None
+                effective_thread_id = self._message_thread_id_for_send(thread_id)

                msg = None
                for _send_attempt in range(3):
@@ -847,6 +985,7 @@ class TelegramAdapter(BasePlatformAdapter):
                                parse_mode=ParseMode.MARKDOWN_V2,
                                reply_to_message_id=reply_to_id,
                                message_thread_id=effective_thread_id,
+                                **self._link_preview_kwargs(),
                            )
                        except Exception as md_error:
                            # Markdown parsing failed, try plain text
@@ -859,6 +998,7 @@ class TelegramAdapter(BasePlatformAdapter):
                                    parse_mode=None,
                                    reply_to_message_id=reply_to_id,
                                    message_thread_id=effective_thread_id,
+                                    **self._link_preview_kwargs(),
                                )
                            else:
                                raise
@@ -869,8 +1009,7 @@ class TelegramAdapter(BasePlatformAdapter):
                        # (not transient network issues). Detect and handle
                        # specific cases instead of blindly retrying.
                        if _BadReq and isinstance(send_err, _BadReq):
-                            err_lower = str(send_err).lower()
-                            if "thread not found" in err_lower and effective_thread_id is not None:
+                            if self._is_thread_not_found_error(send_err) and effective_thread_id is not None:
                                # Thread doesn't exist — retry without
                                # message_thread_id so the message still
                                # reaches the chat.
@@ -880,6 +1019,7 @@ class TelegramAdapter(BasePlatformAdapter):
                                )
                                effective_thread_id = None
                                continue
+                            err_lower = str(send_err).lower()
                            if "message to be replied not found" in err_lower and reply_to_id is not None:
                                # Original message was deleted before we
                                # could reply — clear reply target and retry
@@ -1046,6 +1186,7 @@ class TelegramAdapter(BasePlatformAdapter):
                text=text,
                parse_mode=ParseMode.MARKDOWN,
                reply_markup=keyboard,
+                **self._link_preview_kwargs(),
            )
            return SendResult(success=True, message_id=str(msg.message_id))
        except Exception as e:
@@ -1068,15 +1209,13 @@ class TelegramAdapter(BasePlatformAdapter):
        try:
            cmd_preview = command[:3800] + "..." if len(command) > 3800 else command
            text = (
-                f"⚠️ *Command Approval Required*\n\n"
-                f"`{cmd_preview}`\n\n"
-                f"Reason: {description}"
+                f"⚠️ <b>Command Approval Required</b>\n\n"
+                f"<pre>{_html.escape(cmd_preview)}</pre>\n\n"
+                f"Reason: {_html.escape(description)}"
            )

            # Resolve thread context for thread replies
-            thread_id = None
-            if metadata:
-                thread_id = metadata.get("thread_id") or metadata.get("message_thread_id")
+            thread_id = self._metadata_thread_id(metadata)

            # We'll use the message_id as part of callback_data to look up session_key
            # Send a placeholder first, then update — or use a counter.
@@ -1100,11 +1239,13 @@ class TelegramAdapter(BasePlatformAdapter):
            kwargs: Dict[str, Any] = {
                "chat_id": int(chat_id),
                "text": text,
-                "parse_mode": ParseMode.MARKDOWN,
+                "parse_mode": ParseMode.HTML,
                "reply_markup": keyboard,
+                **self._link_preview_kwargs(),
            }
-            if thread_id:
-                kwargs["message_thread_id"] = int(thread_id)
+            message_thread_id = self._message_thread_id_for_send(thread_id)
+            if message_thread_id is not None:
+                kwargs["message_thread_id"] = message_thread_id

            msg = await self._bot.send_message(**kwargs)

@@ -1172,6 +1313,7 @@ class TelegramAdapter(BasePlatformAdapter):
                parse_mode=ParseMode.MARKDOWN,
                reply_markup=keyboard,
                message_thread_id=int(thread_id) if thread_id else None,
+                **self._link_preview_kwargs(),
            )

            # Store picker state keyed by chat_id
@@ -1440,12 +1582,9 @@ class TelegramAdapter(BasePlatformAdapter):

                # Only authorized users may click approval buttons.
                caller_id = str(getattr(query.from_user, "id", ""))
-                allowed_csv = os.getenv("TELEGRAM_ALLOWED_USERS", "").strip()
-                if allowed_csv:
-                    allowed_ids = {uid.strip() for uid in allowed_csv.split(",") if uid.strip()}
-                    if "*" not in allowed_ids and caller_id not in allowed_ids:
-                        await query.answer(text="⛔ You are not authorized to approve commands.")
-                        return
+                if not self._is_callback_user_authorized(caller_id):
+                    await query.answer(text="⛔ You are not authorized to approve commands.")
+                    return

                session_key = self._approval_state.pop(approval_id, None)
                if not session_key:
@@ -1490,6 +1629,10 @@ class TelegramAdapter(BasePlatformAdapter):
        if not data.startswith("update_prompt:"):
            return
        answer = data.split(":", 1)[1]  # "y" or "n"
+        caller_id = str(getattr(query.from_user, "id", ""))
+        if not self._is_callback_user_authorized(caller_id):
+            await query.answer(text="⛔ You are not authorized to answer update prompts.")
+            return
        await query.answer(text=f"Sent '{answer}' to the update process.")
        # Edit the message to show the choice and remove buttons
        label = "Yes" if answer == "y" else "No"
@@ -1535,23 +1678,23 @@ class TelegramAdapter(BasePlatformAdapter):
            with open(audio_path, "rb") as audio_file:
                # .ogg files -> send as voice (round playable bubble)
                if audio_path.endswith((".ogg", ".opus")):
-                    _voice_thread = metadata.get("thread_id") if metadata else None
+                    _voice_thread = self._metadata_thread_id(metadata)
                    msg = await self._bot.send_voice(
                        chat_id=int(chat_id),
                        voice=audio_file,
                        caption=caption[:1024] if caption else None,
                        reply_to_message_id=int(reply_to) if reply_to else None,
-                        message_thread_id=int(_voice_thread) if _voice_thread else None,
+                        message_thread_id=self._message_thread_id_for_send(_voice_thread),
                    )
                else:
                    # .mp3 and others -> send as audio file
-                    _audio_thread = metadata.get("thread_id") if metadata else None
+                    _audio_thread = self._metadata_thread_id(metadata)
                    msg = await self._bot.send_audio(
                        chat_id=int(chat_id),
                        audio=audio_file,
                        caption=caption[:1024] if caption else None,
                        reply_to_message_id=int(reply_to) if reply_to else None,
-                        message_thread_id=int(_audio_thread) if _audio_thread else None,
+                        message_thread_id=self._message_thread_id_for_send(_audio_thread),
                    )
            return SendResult(success=True, message_id=str(msg.message_id))
        except Exception as e:
@@ -1581,14 +1724,14 @@ class TelegramAdapter(BasePlatformAdapter):
            if not os.path.exists(image_path):
                return SendResult(success=False, error=f"Image file not found: {image_path}")

-            _thread = metadata.get("thread_id") if metadata else None
+            _thread = self._metadata_thread_id(metadata)
            with open(image_path, "rb") as image_file:
                msg = await self._bot.send_photo(
                    chat_id=int(chat_id),
                    photo=image_file,
                    caption=caption[:1024] if caption else None,
                    reply_to_message_id=int(reply_to) if reply_to else None,
-                    message_thread_id=int(_thread) if _thread else None,
+                    message_thread_id=self._message_thread_id_for_send(_thread),
                )
            return SendResult(success=True, message_id=str(msg.message_id))
        except Exception as e:
@@ -1619,7 +1762,7 @@ class TelegramAdapter(BasePlatformAdapter):
                return SendResult(success=False, error=f"File not found: {file_path}")

            display_name = file_name or os.path.basename(file_path)
-            _thread = metadata.get("thread_id") if metadata else None
+            _thread = self._metadata_thread_id(metadata)

            with open(file_path, "rb") as f:
                msg = await self._bot.send_document(
@@ -1628,7 +1771,7 @@ class TelegramAdapter(BasePlatformAdapter):
                    filename=display_name,
                    caption=caption[:1024] if caption else None,
                    reply_to_message_id=int(reply_to) if reply_to else None,
-                    message_thread_id=int(_thread) if _thread else None,
+                    message_thread_id=self._message_thread_id_for_send(_thread),
                )
            return SendResult(success=True, message_id=str(msg.message_id))
        except Exception as e:
@@ -1652,14 +1795,14 @@ class TelegramAdapter(BasePlatformAdapter):
            if not os.path.exists(video_path):
                return SendResult(success=False, error=f"Video file not found: {video_path}")

-            _thread = metadata.get("thread_id") if metadata else None
+            _thread = self._metadata_thread_id(metadata)
            with open(video_path, "rb") as f:
                msg = await self._bot.send_video(
                    chat_id=int(chat_id),
                    video=f,
                    caption=caption[:1024] if caption else None,
                    reply_to_message_id=int(reply_to) if reply_to else None,
-                    message_thread_id=int(_thread) if _thread else None,
+                    message_thread_id=self._message_thread_id_for_send(_thread),
                )
            return SendResult(success=True, message_id=str(msg.message_id))
        except Exception as e:
@@ -1689,13 +1832,13 @@ class TelegramAdapter(BasePlatformAdapter):

        try:
            # Telegram can send photos directly from URLs (up to ~5MB)
-            _photo_thread = metadata.get("thread_id") if metadata else None
+            _photo_thread = self._metadata_thread_id(metadata)
            msg = await self._bot.send_photo(
                chat_id=int(chat_id),
                photo=image_url,
                caption=caption[:1024] if caption else None,  # Telegram caption limit
                reply_to_message_id=int(reply_to) if reply_to else None,
-                message_thread_id=int(_photo_thread) if _photo_thread else None,
+                message_thread_id=self._message_thread_id_for_send(_photo_thread),
            )
            return SendResult(success=True, message_id=str(msg.message_id))
        except Exception as e:
@@ -1718,6 +1861,7 @@ class TelegramAdapter(BasePlatformAdapter):
                    photo=image_data,
                    caption=caption[:1024] if caption else None,
                    reply_to_message_id=int(reply_to) if reply_to else None,
+                    message_thread_id=self._message_thread_id_for_send(_photo_thread),
                )
                return SendResult(success=True, message_id=str(msg.message_id))
            except Exception as e2:
@@ -1743,13 +1887,13 @@ class TelegramAdapter(BasePlatformAdapter):
            return SendResult(success=False, error="Not connected")
        
        try:
-            _anim_thread = metadata.get("thread_id") if metadata else None
+            _anim_thread = self._metadata_thread_id(metadata)
            msg = await self._bot.send_animation(
                chat_id=int(chat_id),
                animation=animation_url,
                caption=caption[:1024] if caption else None,
                reply_to_message_id=int(reply_to) if reply_to else None,
-                message_thread_id=int(_anim_thread) if _anim_thread else None,
+                message_thread_id=self._message_thread_id_for_send(_anim_thread),
            )
            return SendResult(success=True, message_id=str(msg.message_id))
        except Exception as e:
@@ -1766,12 +1910,23 @@ class TelegramAdapter(BasePlatformAdapter):
        """Send typing indicator."""
        if self._bot:
            try:
-                _typing_thread = metadata.get("thread_id") if metadata else None
-                await self._bot.send_chat_action(
-                    chat_id=int(chat_id),
-                    action="typing",
-                    message_thread_id=int(_typing_thread) if _typing_thread else None,
-                )
+                _typing_thread = self._metadata_thread_id(metadata)
+                message_thread_id = self._message_thread_id_for_typing(_typing_thread)
+                try:
+                    await self._bot.send_chat_action(
+                        chat_id=int(chat_id),
+                        action="typing",
+                        message_thread_id=message_thread_id,
+                    )
+                except Exception as e:
+                    if message_thread_id is not None and self._is_thread_not_found_error(e):
+                        await self._bot.send_chat_action(
+                            chat_id=int(chat_id),
+                            action="typing",
+                            message_thread_id=None,
+                        )
+                    else:
+                        raise
            except Exception as e:
                # Typing failures are non-fatal; log at debug level only.
                logger.debug(
@@ -1839,6 +1994,12 @@ class TelegramAdapter(BasePlatformAdapter):

        text = content

+        # 0) Pre-wrap GFM-style pipe tables in ``` fences.  Telegram can't
+        #    render tables natively, but fenced code blocks render as
+        #    monospace preformatted text with columns intact.  The wrapped
+        #    tables then flow through step (1) below as protected regions.
+        text = _wrap_markdown_tables(text)
+
        # 1) Protect fenced code blocks (``` ... ```)
        #    Per MarkdownV2 spec, \ and ` inside pre/code must be escaped.
        def _protect_fenced(m):
@@ -2165,7 +2326,7 @@ class TelegramAdapter(BasePlatformAdapter):
        if not self._should_process_message(update.message):
            return

-        event = self._build_message_event(update.message, MessageType.TEXT)
+        event = self._build_message_event(update.message, MessageType.TEXT, update_id=update.update_id)
        event.text = self._clean_bot_trigger_text(event.text)
        self._enqueue_text_event(event)
    
@@ -2176,7 +2337,7 @@ class TelegramAdapter(BasePlatformAdapter):
        if not self._should_process_message(update.message, is_command=True):
            return
        
-        event = self._build_message_event(update.message, MessageType.COMMAND)
+        event = self._build_message_event(update.message, MessageType.COMMAND, update_id=update.update_id)
        await self.handle_message(event)
    
    async def _handle_location_message(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
@@ -2212,7 +2373,7 @@ class TelegramAdapter(BasePlatformAdapter):
        parts.append(f"Map: https://www.google.com/maps/search/?api=1&query={lat},{lon}")
        parts.append("Ask what they'd like to find nearby (restaurants, cafes, etc.) and any preferences.")

-        event = self._build_message_event(msg, MessageType.LOCATION)
+        event = self._build_message_event(msg, MessageType.LOCATION, update_id=update.update_id)
        event.text = "\n".join(parts)
        await self.handle_message(event)

@@ -2363,7 +2524,7 @@ class TelegramAdapter(BasePlatformAdapter):
        else:
            msg_type = MessageType.DOCUMENT
        
-        event = self._build_message_event(msg, msg_type)
+        event = self._build_message_event(msg, msg_type, update_id=update.update_id)
        
        # Add caption as text
        if msg.caption:
@@ -2702,8 +2863,19 @@ class TelegramAdapter(BasePlatformAdapter):
                self.name, cache_key, thread_id,
            )

-    def _build_message_event(self, message: Message, msg_type: MessageType) -> MessageEvent:
-        """Build a MessageEvent from a Telegram message."""
+    def _build_message_event(
+        self,
+        message: Message,
+        msg_type: MessageType,
+        update_id: Optional[int] = None,
+    ) -> MessageEvent:
+        """Build a MessageEvent from a Telegram message.
+
+        ``update_id`` is the ``Update.update_id`` from PTB; passing it through
+        lets ``/restart`` record the triggering offset so the new gateway
+        process can advance past it (prevents ``/restart`` being re-delivered
+        when PTB's graceful-shutdown ACK fails).
+        """
        chat = message.chat
        user = message.from_user
        
@@ -2716,7 +2888,9 @@ class TelegramAdapter(BasePlatformAdapter):

        # Resolve DM topic name and skill binding
        thread_id_raw = message.message_thread_id
-        thread_id_str = str(thread_id_raw) if thread_id_raw else None
+        thread_id_str = str(thread_id_raw) if thread_id_raw is not None else None
+        if chat_type == "group" and thread_id_str is None and getattr(chat, "is_forum", False):
+            thread_id_str = self._GENERAL_TOPIC_THREAD_ID
        chat_topic = None
        topic_skill = None

@@ -2752,8 +2926,8 @@ class TelegramAdapter(BasePlatformAdapter):
            chat_id=str(chat.id),
            chat_name=chat.title or (chat.full_name if hasattr(chat, "full_name") else None),
            chat_type=chat_type,
-            user_id=str(user.id) if user else None,
-            user_name=user.full_name if user else None,
+            user_id=str(user.id) if user else (str(chat.id) if chat_type == "dm" else None),
+            user_name=user.full_name if user else (chat.full_name if hasattr(chat, "full_name") and chat_type == "dm" else None),
            thread_id=thread_id_str,
            chat_topic=chat_topic,
        )
@@ -2765,15 +2939,26 @@ class TelegramAdapter(BasePlatformAdapter):
            reply_to_id = str(message.reply_to_message.message_id)
            reply_to_text = message.reply_to_message.text or message.reply_to_message.caption or None

+        # Per-channel/topic ephemeral prompt
+        from gateway.platforms.base import resolve_channel_prompt
+        _chat_id_str = str(chat.id)
+        _channel_prompt = resolve_channel_prompt(
+            self.config.extra,
+            thread_id_str or _chat_id_str,
+            _chat_id_str if thread_id_str else None,
+        )
+
        return MessageEvent(
            text=message.text or "",
            message_type=msg_type,
            source=source,
            raw_message=message,
            message_id=str(message.message_id),
+            platform_update_id=update_id,
            reply_to_message_id=reply_to_id,
            reply_to_text=reply_to_text,
            auto_skill=topic_skill,
+            channel_prompt=_channel_prompt,
            timestamp=message.date,
        )

@@ -46,7 +46,7 @@ _SEED_FALLBACK_IPS: list[str] = ["149.154.167.220"]
 def _resolve_proxy_url() -> str | None:
    # Delegate to shared implementation (env vars + macOS system proxy detection)
    from gateway.platforms.base import resolve_proxy_url
-    return resolve_proxy_url()
+    return resolve_proxy_url("TELEGRAM_PROXY")


 class TelegramFallbackTransport(httpx.AsyncBaseTransport):
@@ -180,6 +180,8 @@ class WeComAdapter(BasePlatformAdapter):
        self._text_batch_split_delay_seconds = float(os.getenv("HERMES_WECOM_TEXT_BATCH_SPLIT_DELAY_SECONDS", "2.0"))
        self._pending_text_batches: Dict[str, MessageEvent] = {}
        self._pending_text_batch_tasks: Dict[str, asyncio.Task] = {}
+        self._device_id = uuid.uuid4().hex
+        self._last_chat_req_ids: Dict[str, str] = {}

    # ------------------------------------------------------------------
    # Connection lifecycle
@@ -277,7 +279,11 @@ class WeComAdapter(BasePlatformAdapter):
            {
                "cmd": APP_CMD_SUBSCRIBE,
                "headers": {"req_id": req_id},
-                "body": {"bot_id": self._bot_id, "secret": self._secret},
+                "body": {
+                    "bot_id": self._bot_id,
+                    "secret": self._secret,
+                    "device_id": self._device_id,
+                },
            }
        )

@@ -496,6 +502,11 @@ class WeComAdapter(BasePlatformAdapter):
            logger.debug("[%s] DM sender %s blocked by policy", self.name, sender_id)
            return

+        # Cache the inbound req_id after policy checks so proactive sends to
+        # this chat can fall back to APP_CMD_RESPONSE (required for groups —
+        # WeCom AI Bots cannot initiate APP_CMD_SEND in group chats).
+        self._remember_chat_req_id(chat_id, self._payload_req_id(payload))
+
        text, reply_text = self._extract_text(body)
        media_urls, media_types = await self._extract_media(body)
        message_type = self._derive_message_type(body, text, media_types)
@@ -847,6 +858,23 @@ class WeComAdapter(BasePlatformAdapter):
        while len(self._reply_req_ids) > DEDUP_MAX_SIZE:
            self._reply_req_ids.pop(next(iter(self._reply_req_ids)))

+    def _remember_chat_req_id(self, chat_id: str, req_id: str) -> None:
+        """Cache the most recent inbound req_id per chat.
+
+        Used as a fallback reply target when we need to send into a group
+        without an explicit ``reply_to`` — WeCom AI Bots are blocked from
+        APP_CMD_SEND in groups and must use APP_CMD_RESPONSE bound to some
+        prior req_id. Bounded like _reply_req_ids so long-running gateways
+        don't leak memory across many chats.
+        """
+        normalized_chat_id = str(chat_id or "").strip()
+        normalized_req_id = str(req_id or "").strip()
+        if not normalized_chat_id or not normalized_req_id:
+            return
+        self._last_chat_req_ids[normalized_chat_id] = normalized_req_id
+        while len(self._last_chat_req_ids) > DEDUP_MAX_SIZE:
+            self._last_chat_req_ids.pop(next(iter(self._last_chat_req_ids)))
+
    def _reply_req_id_for_message(self, reply_to: Optional[str]) -> Optional[str]:
        normalized = str(reply_to or "").strip()
        if not normalized or normalized.startswith("quote:"):
@@ -1163,19 +1191,15 @@ class WeComAdapter(BasePlatformAdapter):
        self._raise_for_wecom_error(response, "send media message")
        return response

-    async def _send_reply_stream(self, reply_req_id: str, content: str) -> Dict[str, Any]:
+    async def _send_reply_markdown(self, reply_req_id: str, content: str) -> Dict[str, Any]:
        response = await self._send_reply_request(
            reply_req_id,
            {
-                "msgtype": "stream",
-                "stream": {
-                    "id": self._new_req_id("stream"),
-                    "finish": True,
-                    "content": content[:self.MAX_MESSAGE_LENGTH],
-                },
+                "msgtype": "markdown",
+                "markdown": {"content": content[:self.MAX_MESSAGE_LENGTH]},
            },
        )
-        self._raise_for_wecom_error(response, "send reply stream")
+        self._raise_for_wecom_error(response, "send reply markdown")
        return response

    async def _send_reply_media_message(
@@ -1235,6 +1259,9 @@ class WeComAdapter(BasePlatformAdapter):
            return SendResult(success=False, error=prepared["reject_reason"])

        reply_req_id = self._reply_req_id_for_message(reply_to)
+        if not reply_req_id and chat_id in self._last_chat_req_ids:
+            reply_req_id = self._last_chat_req_ids[chat_id]
+
        try:
            upload_result = await self._upload_media_bytes(
                prepared["data"],
@@ -1302,8 +1329,12 @@ class WeComAdapter(BasePlatformAdapter):

        try:
            reply_req_id = self._reply_req_id_for_message(reply_to)
+
+            if not reply_req_id and chat_id in self._last_chat_req_ids:
+                reply_req_id = self._last_chat_req_ids[chat_id]
+
            if reply_req_id:
-                response = await self._send_reply_stream(reply_req_id, content)
+                response = await self._send_reply_markdown(reply_req_id, content)
            else:
                response = await self._send_request(
                    APP_CMD_SEND,
@@ -258,6 +258,20 @@ class WecomCallbackAdapter(BasePlatformAdapter):
                )
                event = self._build_event(app, decrypted)
                if event is not None:
+                    # Deduplicate: WeCom retries callbacks on timeout,
+                    # producing duplicate inbound messages (#10305).
+                    if event.message_id:
+                        now = time.time()
+                        if event.message_id in self._seen_messages:
+                            if now - self._seen_messages[event.message_id] < MESSAGE_DEDUP_TTL_SECONDS:
+                                logger.debug("[WecomCallback] Duplicate MsgId %s, skipping", event.message_id)
+                                return web.Response(text="success", content_type="text/plain")
+                            del self._seen_messages[event.message_id]
+                        self._seen_messages[event.message_id] = now
+                        # Prune expired entries when cache grows large
+                        if len(self._seen_messages) > 2000:
+                            cutoff = now - MESSAGE_DEDUP_TTL_SECONDS
+                            self._seen_messages = {k: v for k, v in self._seen_messages.items() if v > cutoff}
                    # Record which app this user belongs to.
                    if event.source and event.source.user_id:
                        map_key = self._user_app_key(
@@ -28,7 +28,7 @@ import uuid
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
-from urllib.parse import quote
+from urllib.parse import quote, urlparse

 logger = logging.getLogger(__name__)

@@ -96,6 +96,28 @@ MEDIA_VIDEO = 2
 MEDIA_FILE = 3
 MEDIA_VOICE = 4

+_LIVE_ADAPTERS: Dict[str, Any] = {}
+
+
+def _make_ssl_connector() -> Optional["aiohttp.TCPConnector"]:
+    """Return a TCPConnector with a certifi CA bundle, or None if certifi is unavailable.
+
+    Tencent's iLink server (``ilinkai.weixin.qq.com``) is not verifiable against
+    some system CA stores (notably Homebrew's OpenSSL on macOS Apple Silicon).
+    When ``certifi`` is installed, use its Mozilla CA bundle to guarantee
+    verification. Otherwise fall back to aiohttp's default (which honors
+    ``SSL_CERT_FILE`` env var via ``trust_env=True``).
+    """
+    try:
+        import ssl
+        import certifi
+    except ImportError:
+        return None
+    if not AIOHTTP_AVAILABLE:
+        return None
+    ssl_ctx = ssl.create_default_context(cafile=certifi.where())
+    return aiohttp.TCPConnector(ssl=ssl_ctx)
+
 ITEM_TEXT = 1
 ITEM_IMAGE = 2
 ITEM_VOICE = 3
@@ -398,7 +420,12 @@ async def _send_message(
    text: str,
    context_token: Optional[str],
    client_id: str,
-) -> None:
+) -> Dict[str, Any]:
+    """Send a text message via iLink sendmessage API.
+
+    Returns the raw API response dict (may contain error codes like
+    ``errcode: -14`` for session expiry that the caller can inspect).
+    """
    if not text or not text.strip():
        raise ValueError("_send_message: text must not be empty")
    message: Dict[str, Any] = {
@@ -411,7 +438,7 @@ async def _send_message(
    }
    if context_token:
        message["context_token"] = context_token
-    await _api_post(
+    return await _api_post(
        session,
        base_url=base_url,
        endpoint=EP_SEND_MESSAGE,
@@ -533,6 +560,39 @@ async def _download_bytes(
        return await response.read()


+_WEIXIN_CDN_ALLOWLIST: frozenset[str] = frozenset(
+    {
+        "novac2c.cdn.weixin.qq.com",
+        "ilinkai.weixin.qq.com",
+        "wx.qlogo.cn",
+        "thirdwx.qlogo.cn",
+        "res.wx.qq.com",
+        "mmbiz.qpic.cn",
+        "mmbiz.qlogo.cn",
+    }
+)
+
+
+def _assert_weixin_cdn_url(url: str) -> None:
+    """Raise ValueError if *url* does not point at a known WeChat CDN host."""
+    try:
+        parsed = urlparse(url)
+        scheme = parsed.scheme.lower()
+        host = parsed.hostname or ""
+    except Exception as exc:  # noqa: BLE001
+        raise ValueError(f"Unparseable media URL: {url!r}") from exc
+
+    if scheme not in ("http", "https"):
+        raise ValueError(
+            f"Media URL has disallowed scheme {scheme!r}; only http/https are permitted."
+        )
+    if host not in _WEIXIN_CDN_ALLOWLIST:
+        raise ValueError(
+            f"Media URL host {host!r} is not in the WeChat CDN allowlist. "
+            "Refusing to fetch to prevent SSRF."
+        )
+
+
 def _media_reference(item: Dict[str, Any], key: str) -> Dict[str, Any]:
    return (item.get(key) or {}).get("media") or {}

@@ -553,6 +613,7 @@ async def _download_and_decrypt_media(
            timeout_seconds=timeout_seconds,
        )
    elif full_url:
+        _assert_weixin_cdn_url(full_url)
        raw = await _download_bytes(session, url=full_url, timeout_seconds=timeout_seconds)
    else:
        raise RuntimeError("media item had neither encrypt_query_param nor full_url")
@@ -623,42 +684,31 @@ def _rewrite_table_block_for_weixin(lines: List[str]) -> str:
 def _normalize_markdown_blocks(content: str) -> str:
    lines = content.splitlines()
    result: List[str] = []
-    i = 0
    in_code_block = False
+    blank_run = 0

-    while i < len(lines):
-        line = lines[i].rstrip()
-        fence_match = _FENCE_RE.match(line.strip())
-        if fence_match:
+    for raw_line in lines:
+        line = raw_line.rstrip()
+        if _FENCE_RE.match(line.strip()):
            in_code_block = not in_code_block
            result.append(line)
-            i += 1
+            blank_run = 0
            continue

        if in_code_block:
            result.append(line)
-            i += 1
            continue

-        if (
-            i + 1 < len(lines)
-            and "|" in lines[i]
-            and _TABLE_RULE_RE.match(lines[i + 1].rstrip())
-        ):
-            table_lines = [lines[i].rstrip(), lines[i + 1].rstrip()]
-            i += 2
-            while i < len(lines) and "|" in lines[i]:
-                table_lines.append(lines[i].rstrip())
-                i += 1
-            result.append(_rewrite_table_block_for_weixin(table_lines))
+        if not line.strip():
+            blank_run += 1
+            if blank_run <= 1:
+                result.append("")
            continue

-        result.append(_MARKDOWN_LINK_RE.sub(r"\1 (\2)", _rewrite_headers_for_weixin(line)))
-        i += 1
+        blank_run = 0
+        result.append(line)

-    normalized = "\n".join(item.rstrip() for item in result)
-    normalized = re.sub(r"\n{3,}", "\n\n", normalized)
-    return normalized.strip()
+    return "\n".join(result).strip()


 def _split_markdown_blocks(content: str) -> List[str]:
@@ -704,8 +754,8 @@ def _split_delivery_units_for_weixin(content: str) -> List[str]:

    Weixin can render Markdown, but chat readability is better when top-level
    line breaks become separate messages. Keep fenced code blocks intact and
-    attach indented continuation lines to the previous top-level line so
-    transformed tables/lists do not get torn apart.
+    attach indented continuation lines to the previous top-level line so nested
+    list items do not get torn apart.
    """
    units: List[str] = []

@@ -747,7 +797,9 @@ def _looks_like_chatty_line_for_weixin(line: str) -> bool:
        return False
    if line.startswith((" ", "\t")):
        return False
-    if stripped.startswith((">", "-", "*", "【")):
+    if stripped.startswith((">", "-", "*", "【", "#", "|")):
+        return False
+    if _TABLE_RULE_RE.match(stripped):
        return False
    if re.match(r"^\*\*[^*]+\*\*$", stripped):
        return False
@@ -757,10 +809,12 @@ def _looks_like_chatty_line_for_weixin(line: str) -> bool:


 def _looks_like_heading_line_for_weixin(line: str) -> bool:
-    """Return True when a short line behaves like a plain-text heading."""
+    """Return True when a short line behaves like a heading."""
    stripped = line.strip()
    if not stripped:
        return False
+    if _HEADER_RE.match(stripped):
+        return True
    return len(stripped) <= 24 and stripped.endswith((":", "："))


@@ -935,7 +989,7 @@ async def qr_login(
    if not AIOHTTP_AVAILABLE:
        raise RuntimeError("aiohttp is required for Weixin QR login")

-    async with aiohttp.ClientSession(trust_env=True) as session:
+    async with aiohttp.ClientSession(trust_env=True, connector=_make_ssl_connector()) as session:
        try:
            qr_resp = await _api_get(
                session,
@@ -953,6 +1007,10 @@ async def qr_login(
            logger.error("weixin: QR response missing qrcode")
            return None

+        # qrcode_url is the full scannable liteapp URL; qrcode_value is just the hex token
+        # WeChat needs to scan the full URL, not the raw hex string
+        qr_scan_data = qrcode_url if qrcode_url else qrcode_value
+
        print("\n请使用微信扫描以下二维码：")
        if qrcode_url:
            print(qrcode_url)
@@ -960,11 +1018,11 @@ async def qr_login(
            import qrcode

            qr = qrcode.QRCode()
-            qr.add_data(qrcode_url or qrcode_value)
+            qr.add_data(qr_scan_data)
            qr.make(fit=True)
            qr.print_ascii(invert=True)
-        except Exception:
-            print("（终端二维码渲染失败，请直接打开上面的二维码链接）")
+        except Exception as _qr_exc:
+            print(f"（终端二维码渲染失败: {_qr_exc}，请直接打开上面的二维码链接）")

        deadline = time.time() + timeout_seconds
        current_base_url = ILINK_BASE_URL
@@ -1010,8 +1068,17 @@ async def qr_login(
                    )
                    qrcode_value = str(qr_resp.get("qrcode") or "")
                    qrcode_url = str(qr_resp.get("qrcode_img_content") or "")
+                    qr_scan_data = qrcode_url if qrcode_url else qrcode_value
                    if qrcode_url:
                        print(qrcode_url)
+                    try:
+                        import qrcode as _qrcode
+                        qr = _qrcode.QRCode()
+                        qr.add_data(qr_scan_data)
+                        qr.make(fit=True)
+                        qr.print_ascii(invert=True)
+                    except Exception:
+                        pass
                except Exception as exc:
                    logger.error("weixin: QR refresh failed: %s", exc)
                    return None
@@ -1059,7 +1126,8 @@ class WeixinAdapter(BasePlatformAdapter):
        self._hermes_home = hermes_home
        self._token_store = ContextTokenStore(hermes_home)
        self._typing_cache = TypingTicketCache()
-        self._session: Optional[aiohttp.ClientSession] = None
+        self._poll_session: Optional[aiohttp.ClientSession] = None
+        self._send_session: Optional[aiohttp.ClientSession] = None
        self._poll_task: Optional[asyncio.Task] = None
        self._dedup = MessageDeduplicator(ttl_seconds=MESSAGE_DEDUP_TTL_SECONDS)

@@ -1134,14 +1202,17 @@ class WeixinAdapter(BasePlatformAdapter):
        except Exception as exc:
            logger.debug("[%s] Token lock unavailable (non-fatal): %s", self.name, exc)

-        self._session = aiohttp.ClientSession(trust_env=True)
+        self._poll_session = aiohttp.ClientSession(trust_env=True, connector=_make_ssl_connector())
+        self._send_session = aiohttp.ClientSession(trust_env=True, connector=_make_ssl_connector())
        self._token_store.restore(self._account_id)
        self._poll_task = asyncio.create_task(self._poll_loop(), name="weixin-poll")
        self._mark_connected()
+        _LIVE_ADAPTERS[self._token] = self
        logger.info("[%s] Connected account=%s base=%s", self.name, _safe_id(self._account_id), self._base_url)
        return True

    async def disconnect(self) -> None:
+        _LIVE_ADAPTERS.pop(self._token, None)
        self._running = False
        if self._poll_task and not self._poll_task.done():
            self._poll_task.cancel()
@@ -1150,15 +1221,18 @@ class WeixinAdapter(BasePlatformAdapter):
            except asyncio.CancelledError:
                pass
        self._poll_task = None
-        if self._session and not self._session.closed:
-            await self._session.close()
-        self._session = None
+        if self._poll_session and not self._poll_session.closed:
+            await self._poll_session.close()
+        self._poll_session = None
+        if self._send_session and not self._send_session.closed:
+            await self._send_session.close()
+        self._send_session = None
        self._release_platform_lock()
        self._mark_disconnected()
        logger.info("[%s] Disconnected", self.name)

    async def _poll_loop(self) -> None:
-        assert self._session is not None
+        assert self._poll_session is not None
        sync_buf = _load_sync_buf(self._hermes_home, self._account_id)
        timeout_ms = LONG_POLL_TIMEOUT_MS
        consecutive_failures = 0
@@ -1166,7 +1240,7 @@ class WeixinAdapter(BasePlatformAdapter):
        while self._running:
            try:
                response = await _get_updates(
-                    self._session,
+                    self._poll_session,
                    base_url=self._base_url,
                    token=self._token,
                    sync_buf=sync_buf,
@@ -1223,7 +1297,7 @@ class WeixinAdapter(BasePlatformAdapter):
            logger.error("[%s] unhandled inbound error from=%s: %s", self.name, _safe_id(message.get("from_user_id")), exc, exc_info=True)

    async def _process_message(self, message: Dict[str, Any]) -> None:
-        assert self._session is not None
+        assert self._poll_session is not None
        sender_id = str(message.get("from_user_id") or "").strip()
        if not sender_id:
            return
@@ -1316,7 +1390,7 @@ class WeixinAdapter(BasePlatformAdapter):
        media = _media_reference(item, "image_item")
        try:
            data = await _download_and_decrypt_media(
-                self._session,
+                self._poll_session,
                cdn_base_url=self._cdn_base_url,
                encrypted_query_param=media.get("encrypt_query_param"),
                aes_key_b64=(item.get("image_item") or {}).get("aeskey")
@@ -1334,7 +1408,7 @@ class WeixinAdapter(BasePlatformAdapter):
        media = _media_reference(item, "video_item")
        try:
            data = await _download_and_decrypt_media(
-                self._session,
+                self._poll_session,
                cdn_base_url=self._cdn_base_url,
                encrypted_query_param=media.get("encrypt_query_param"),
                aes_key_b64=media.get("aes_key"),
@@ -1353,7 +1427,7 @@ class WeixinAdapter(BasePlatformAdapter):
        mime = _mime_from_filename(filename)
        try:
            data = await _download_and_decrypt_media(
-                self._session,
+                self._poll_session,
                cdn_base_url=self._cdn_base_url,
                encrypted_query_param=media.get("encrypt_query_param"),
                aes_key_b64=media.get("aes_key"),
@@ -1372,7 +1446,7 @@ class WeixinAdapter(BasePlatformAdapter):
            return None
        try:
            data = await _download_and_decrypt_media(
-                self._session,
+                self._poll_session,
                cdn_base_url=self._cdn_base_url,
                encrypted_query_param=media.get("encrypt_query_param"),
                aes_key_b64=media.get("aes_key"),
@@ -1385,13 +1459,13 @@ class WeixinAdapter(BasePlatformAdapter):
            return None

    async def _maybe_fetch_typing_ticket(self, user_id: str, context_token: Optional[str]) -> None:
-        if not self._session or not self._token:
+        if not self._poll_session or not self._token:
            return
        if self._typing_cache.get(user_id):
            return
        try:
            response = await _get_config(
-                self._session,
+                self._poll_session,
                base_url=self._base_url,
                token=self._token,
                user_id=user_id,
@@ -1416,12 +1490,19 @@ class WeixinAdapter(BasePlatformAdapter):
        context_token: Optional[str],
        client_id: str,
    ) -> None:
-        """Send a single text chunk with per-chunk retry and backoff."""
+        """Send a single text chunk with per-chunk retry and backoff.
+
+        On session-expired errors (errcode -14), automatically retries
+        *without* ``context_token`` — iLink accepts tokenless sends as a
+        degraded fallback, which keeps cron-initiated push messages working
+        even when no user message has refreshed the session recently.
+        """
        last_error: Optional[Exception] = None
+        retried_without_token = False
        for attempt in range(self._send_chunk_retries + 1):
            try:
-                await _send_message(
-                    self._session,
+                resp = await _send_message(
+                    self._send_session,
                    base_url=self._base_url,
                    token=self._token,
                    to=chat_id,
@@ -1429,6 +1510,31 @@ class WeixinAdapter(BasePlatformAdapter):
                    context_token=context_token,
                    client_id=client_id,
                )
+                # Check iLink response for session-expired error
+                if resp and isinstance(resp, dict):
+                    ret = resp.get("ret")
+                    errcode = resp.get("errcode")
+                    if (ret is not None and ret not in (0,)) or (errcode is not None and errcode not in (0,)):
+                        is_session_expired = (
+                            ret == SESSION_EXPIRED_ERRCODE
+                            or errcode == SESSION_EXPIRED_ERRCODE
+                        )
+                        # Session expired — strip token and retry once
+                        if is_session_expired and not retried_without_token and context_token:
+                            retried_without_token = True
+                            context_token = None
+                            self._token_store._cache.pop(
+                                self._token_store._key(self._account_id, chat_id), None
+                            )
+                            logger.warning(
+                                "[%s] session expired for %s; retrying without context_token",
+                                self.name, _safe_id(chat_id),
+                            )
+                            continue
+                        errmsg = resp.get("errmsg") or resp.get("msg") or "unknown error"
+                        raise RuntimeError(
+                            f"iLink sendmessage error: ret={ret} errcode={errcode} errmsg={errmsg}"
+                        )
                return
            except Exception as exc:
                last_error = exc
@@ -1456,12 +1562,48 @@ class WeixinAdapter(BasePlatformAdapter):
        reply_to: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> SendResult:
-        if not self._session or not self._token:
+        if not self._send_session or not self._token:
            return SendResult(success=False, error="Not connected")
        context_token = self._token_store.get(self._account_id, chat_id)
        last_message_id: Optional[str] = None
+
+        # Extract MEDIA: tags and bare local file paths before text delivery.
+        media_files, cleaned_content = self.extract_media(content)
+        _, image_cleaned = self.extract_images(cleaned_content)
+        local_files, final_content = self.extract_local_files(image_cleaned)
+
+        _AUDIO_EXTS = {".ogg", ".opus", ".mp3", ".wav", ".m4a"}
+        _VIDEO_EXTS = {".mp4", ".mov", ".avi", ".mkv", ".webm", ".3gp"}
+        _IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}
+
+        async def _deliver_media(path: str, is_voice: bool = False) -> None:
+            ext = Path(path).suffix.lower()
+            if is_voice or ext in _AUDIO_EXTS:
+                await self.send_voice(chat_id=chat_id, audio_path=path, metadata=metadata)
+            elif ext in _VIDEO_EXTS:
+                await self.send_video(chat_id=chat_id, video_path=path, metadata=metadata)
+            elif ext in _IMAGE_EXTS:
+                await self.send_image_file(chat_id=chat_id, image_path=path, metadata=metadata)
+            else:
+                await self.send_document(chat_id=chat_id, file_path=path, metadata=metadata)
+
        try:
-            chunks = [c for c in self._split_text(self.format_message(content)) if c and c.strip()]
+            # Deliver extracted MEDIA: attachments first.
+            for media_path, is_voice in media_files:
+                try:
+                    await _deliver_media(media_path, is_voice)
+                except Exception as exc:
+                    logger.warning("[%s] media delivery failed for %s: %s", self.name, media_path, exc)
+
+            # Deliver bare local file paths.
+            for file_path in local_files:
+                try:
+                    await _deliver_media(file_path, is_voice=False)
+                except Exception as exc:
+                    logger.warning("[%s] local file delivery failed for %s: %s", self.name, file_path, exc)
+
+            # Deliver text content.
+            chunks = [c for c in self._split_text(self.format_message(final_content)) if c and c.strip()]
            for idx, chunk in enumerate(chunks):
                client_id = f"hermes-weixin-{uuid.uuid4().hex}"
                await self._send_text_chunk(
@@ -1479,14 +1621,14 @@ class WeixinAdapter(BasePlatformAdapter):
            return SendResult(success=False, error=str(exc))

    async def send_typing(self, chat_id: str, metadata: Optional[Dict[str, Any]] = None) -> None:
-        if not self._session or not self._token:
+        if not self._send_session or not self._token:
            return
        typing_ticket = self._typing_cache.get(chat_id)
        if not typing_ticket:
            return
        try:
            await _send_typing(
-                self._session,
+                self._send_session,
                base_url=self._base_url,
                token=self._token,
                to_user_id=chat_id,
@@ -1497,14 +1639,14 @@ class WeixinAdapter(BasePlatformAdapter):
            logger.debug("[%s] typing start failed for %s: %s", self.name, _safe_id(chat_id), exc)

    async def stop_typing(self, chat_id: str) -> None:
-        if not self._session or not self._token:
+        if not self._send_session or not self._token:
            return
        typing_ticket = self._typing_cache.get(chat_id)
        if not typing_ticket:
            return
        try:
            await _send_typing(
-                self._session,
+                self._send_session,
                base_url=self._base_url,
                token=self._token,
                to_user_id=chat_id,
@@ -1542,24 +1684,35 @@ class WeixinAdapter(BasePlatformAdapter):
    async def send_image_file(
        self,
        chat_id: str,
-        path: str,
-        caption: str = "",
+        image_path: str,
+        caption: Optional[str] = None,
        reply_to: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
+        **kwargs,
    ) -> SendResult:
-        return await self.send_document(chat_id, file_path=path, caption=caption, metadata=metadata)
+        del reply_to, kwargs
+        return await self.send_document(
+            chat_id=chat_id,
+            file_path=image_path,
+            caption=caption,
+            metadata=metadata,
+        )

    async def send_document(
        self,
        chat_id: str,
        file_path: str,
-        caption: str = "",
+        caption: Optional[str] = None,
+        file_name: Optional[str] = None,
+        reply_to: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
+        **kwargs,
    ) -> SendResult:
-        if not self._session or not self._token:
+        del file_name, reply_to, metadata, kwargs
+        if not self._send_session or not self._token:
            return SendResult(success=False, error="Not connected")
        try:
-            message_id = await self._send_file(chat_id, file_path, caption)
+            message_id = await self._send_file(chat_id, file_path, caption or "")
            return SendResult(success=True, message_id=message_id)
        except Exception as exc:
            logger.error("[%s] send_document failed to=%s: %s", self.name, _safe_id(chat_id), exc)
@@ -1573,7 +1726,7 @@ class WeixinAdapter(BasePlatformAdapter):
        reply_to: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> SendResult:
-        if not self._session or not self._token:
+        if not self._send_session or not self._token:
            return SendResult(success=False, error="Not connected")
        try:
            message_id = await self._send_file(chat_id, video_path, caption or "")
@@ -1590,7 +1743,24 @@ class WeixinAdapter(BasePlatformAdapter):
        reply_to: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> SendResult:
-        return await self.send_document(chat_id, audio_path, caption=caption or "", metadata=metadata)
+        if not self._send_session or not self._token:
+            return SendResult(success=False, error="Not connected")
+
+        # Native outbound Weixin voice bubbles are not proven-working in the
+        # upstream reference implementation. Prefer a reliable file attachment
+        # fallback so users at least receive playable audio, even for .silk.
+        fallback_caption = caption or "[voice message as attachment]"
+        try:
+            message_id = await self._send_file(
+                chat_id,
+                audio_path,
+                fallback_caption,
+                force_file_attachment=True,
+            )
+            return SendResult(success=True, message_id=message_id)
+        except Exception as exc:
+            logger.error("[%s] send_voice failed to=%s: %s", self.name, _safe_id(chat_id), exc)
+            return SendResult(success=False, error=str(exc))

    async def _download_remote_media(self, url: str) -> str:
        from tools.url_safety import is_safe_url
@@ -1598,8 +1768,8 @@ class WeixinAdapter(BasePlatformAdapter):
        if not is_safe_url(url):
            raise ValueError(f"Blocked unsafe URL (SSRF protection): {url}")

-        assert self._session is not None
-        async with self._session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
+        assert self._send_session is not None
+        async with self._send_session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
            response.raise_for_status()
            data = await response.read()
            suffix = Path(url.split("?", 1)[0]).suffix or ".bin"
@@ -1607,16 +1777,22 @@ class WeixinAdapter(BasePlatformAdapter):
            handle.write(data)
            return handle.name

-    async def _send_file(self, chat_id: str, path: str, caption: str) -> str:
-        assert self._session is not None and self._token is not None
+    async def _send_file(
+        self,
+        chat_id: str,
+        path: str,
+        caption: str,
+        force_file_attachment: bool = False,
+    ) -> str:
+        assert self._send_session is not None and self._token is not None
        plaintext = Path(path).read_bytes()
-        media_type, item_builder = self._outbound_media_builder(path)
+        media_type, item_builder = self._outbound_media_builder(path, force_file_attachment=force_file_attachment)
        filekey = secrets.token_hex(16)
        aes_key = secrets.token_bytes(16)
        rawsize = len(plaintext)
        rawfilemd5 = hashlib.md5(plaintext).hexdigest()
        upload_response = await _get_upload_url(
-            self._session,
+            self._send_session,
            base_url=self._base_url,
            token=self._token,
            to_user_id=chat_id,
@@ -1642,30 +1818,34 @@ class WeixinAdapter(BasePlatformAdapter):
            raise RuntimeError(f"getUploadUrl returned neither upload_param nor upload_full_url: {upload_response}")

        encrypted_query_param = await _upload_ciphertext(
-            self._session,
+            self._send_session,
            ciphertext=ciphertext,
            upload_url=upload_url,
        )
-
        context_token = self._token_store.get(self._account_id, chat_id)
        # The iLink API expects aes_key as base64(hex_string), not base64(raw_bytes).
        # Sending base64(raw_bytes) causes images to show as grey boxes on the
        # receiver side because the decryption key doesn't match.
        aes_key_for_api = base64.b64encode(aes_key.hex().encode("ascii")).decode("ascii")
-        media_item = item_builder(
-            encrypt_query_param=encrypted_query_param,
-            aes_key_for_api=aes_key_for_api,
-            ciphertext_size=len(ciphertext),
-            plaintext_size=rawsize,
-            filename=Path(path).name,
-            rawfilemd5=rawfilemd5,
-        )
+        item_kwargs = {
+            "encrypt_query_param": encrypted_query_param,
+            "aes_key_for_api": aes_key_for_api,
+            "ciphertext_size": len(ciphertext),
+            "plaintext_size": rawsize,
+            "filename": Path(path).name,
+            "rawfilemd5": rawfilemd5,
+        }
+        if media_type == MEDIA_VOICE and path.endswith(".silk"):
+            item_kwargs["encode_type"] = 6
+            item_kwargs["sample_rate"] = 24000
+            item_kwargs["bits_per_sample"] = 16
+        media_item = item_builder(**item_kwargs)

        last_message_id = None
        if caption:
            last_message_id = f"hermes-weixin-{uuid.uuid4().hex}"
            await _send_message(
-                self._session,
+                self._send_session,
                base_url=self._base_url,
                token=self._token,
                to=chat_id,
@@ -1676,7 +1856,7 @@ class WeixinAdapter(BasePlatformAdapter):

        last_message_id = f"hermes-weixin-{uuid.uuid4().hex}"
        await _api_post(
-            self._session,
+            self._send_session,
            base_url=self._base_url,
            endpoint=EP_SEND_MESSAGE,
            payload={
@@ -1695,7 +1875,7 @@ class WeixinAdapter(BasePlatformAdapter):
        )
        return last_message_id

-    def _outbound_media_builder(self, path: str):
+    def _outbound_media_builder(self, path: str, force_file_attachment: bool = False):
        mime = mimetypes.guess_type(path)[0] or "application/octet-stream"
        if mime.startswith("image/"):
            return MEDIA_IMAGE, lambda **kw: {
@@ -1723,7 +1903,7 @@ class WeixinAdapter(BasePlatformAdapter):
                    "video_md5": kw.get("rawfilemd5", ""),
                },
            }
-        if mime.startswith("audio/") or path.endswith(".silk"):
+        if path.endswith(".silk") and not force_file_attachment:
            return MEDIA_VOICE, lambda **kw: {
                "type": ITEM_VOICE,
                "voice_item": {
@@ -1732,9 +1912,25 @@ class WeixinAdapter(BasePlatformAdapter):
                        "aes_key": kw["aes_key_for_api"],
                        "encrypt_type": 1,
                    },
+                    "encode_type": kw.get("encode_type"),
+                    "bits_per_sample": kw.get("bits_per_sample"),
+                    "sample_rate": kw.get("sample_rate"),
                    "playtime": kw.get("playtime", 0),
                },
            }
+        if mime.startswith("audio/"):
+            return MEDIA_FILE, lambda **kw: {
+                "type": ITEM_FILE,
+                "file_item": {
+                    "media": {
+                        "encrypt_query_param": kw["encrypt_query_param"],
+                        "aes_key": kw["aes_key_for_api"],
+                        "encrypt_type": 1,
+                    },
+                    "file_name": kw["filename"],
+                    "len": str(kw["plaintext_size"]),
+                },
+            }
        return MEDIA_FILE, lambda **kw: {
            "type": ITEM_FILE,
            "file_item": {
@@ -1784,7 +1980,34 @@ async def send_weixin_direct(
    token_store.restore(account_id)
    context_token = token_store.get(account_id, chat_id)

-    async with aiohttp.ClientSession(trust_env=True) as session:
+    live_adapter = _LIVE_ADAPTERS.get(resolved_token)
+    send_session = getattr(live_adapter, '_send_session', None)
+    if live_adapter is not None and send_session is not None and not send_session.closed:
+        last_result: Optional[SendResult] = None
+        cleaned = live_adapter.format_message(message)
+        if cleaned:
+            last_result = await live_adapter.send(chat_id, cleaned)
+            if not last_result.success:
+                return {"error": f"Weixin send failed: {last_result.error}"}
+
+        for media_path, _is_voice in media_files or []:
+            ext = Path(media_path).suffix.lower()
+            if ext in {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}:
+                last_result = await live_adapter.send_image_file(chat_id, media_path)
+            else:
+                last_result = await live_adapter.send_document(chat_id, media_path)
+            if not last_result.success:
+                return {"error": f"Weixin media send failed: {last_result.error}"}
+
+        return {
+            "success": True,
+            "platform": "weixin",
+            "chat_id": chat_id,
+            "message_id": last_result.message_id if last_result else None,
+            "context_token_used": bool(context_token),
+        }
+
+    async with aiohttp.ClientSession(trust_env=True, connector=_make_ssl_connector()) as session:
        adapter = WeixinAdapter(
            PlatformConfig(
                enabled=True,
@@ -1797,6 +2020,7 @@ async def send_weixin_direct(
                },
            )
        )
+        adapter._send_session = session
        adapter._session = session
        adapter._token = resolved_token
        adapter._account_id = account_id
@@ -82,6 +82,7 @@ class SessionSource:
    chat_topic: Optional[str] = None  # Channel topic/description (Discord, Slack)
    user_id_alt: Optional[str] = None  # Signal UUID (alternative to phone number)
    chat_id_alt: Optional[str] = None  # Signal group internal ID
+    is_bot: bool = False  # True when the message author is a bot/webhook (Discord)
    
    @property
    def description(self) -> str:
@@ -301,6 +302,8 @@ def build_session_context_prompt(
    lines.append("")
    lines.append("**Delivery options for scheduled tasks:**")
    
+    from hermes_constants import display_hermes_home
+
    # Origin delivery
    if context.source.platform == Platform.LOCAL:
        lines.append("- `\"origin\"` → Local output (saved to files)")
@@ -309,9 +312,11 @@ def build_session_context_prompt(
            _hash_chat_id(context.source.chat_id) if redact_pii else context.source.chat_id
        )
        lines.append(f"- `\"origin\"` → Back to this chat ({_origin_label})")
-    
+
    # Local always available
-    lines.append("- `\"local\"` → Save to local files only (~/.hermes/cron/output/)")
+    lines.append(
+        f"- `\"local\"` → Save to local files only ({display_hermes_home()}/cron/output/)"
+    )
    
    # Platform home channels
    for platform, home in context.home_channels.items():
@@ -372,7 +377,19 @@ class SessionEntry:
    # this session (create a new session_id) so the user starts fresh.
    # Set by /stop to break stuck-resume loops (#7536).
    suspended: bool = False
-    
+
+    # When True the session was interrupted by a gateway restart/shutdown
+    # drain timeout, but recovery is still expected.  Unlike ``suspended``,
+    # ``resume_pending`` preserves the existing session_id on next access —
+    # the user stays on the same transcript and the agent auto-continues
+    # from where it left off.  Cleared after the next successful turn.
+    # Escalation to ``suspended`` is handled by the existing
+    # ``.restart_failure_counts`` stuck-loop counter (#7536), not by a
+    # parallel counter on this entry.
+    resume_pending: bool = False
+    resume_reason: Optional[str] = None  # e.g. "restart_timeout"
+    last_resume_marked_at: Optional[datetime] = None
+
    def to_dict(self) -> Dict[str, Any]:
        result = {
            "session_key": self.session_key,
@@ -392,6 +409,13 @@ class SessionEntry:
            "cost_status": self.cost_status,
            "memory_flushed": self.memory_flushed,
            "suspended": self.suspended,
+            "resume_pending": self.resume_pending,
+            "resume_reason": self.resume_reason,
+            "last_resume_marked_at": (
+                self.last_resume_marked_at.isoformat()
+                if self.last_resume_marked_at
+                else None
+            ),
        }
        if self.origin:
            result["origin"] = self.origin.to_dict()
@@ -409,7 +433,15 @@ class SessionEntry:
                platform = Platform(data["platform"])
            except ValueError as e:
                logger.debug("Unknown platform value %r: %s", data["platform"], e)
-        
+
+        last_resume_marked_at = None
+        _lrma = data.get("last_resume_marked_at")
+        if _lrma:
+            try:
+                last_resume_marked_at = datetime.fromisoformat(_lrma)
+            except (TypeError, ValueError):
+                last_resume_marked_at = None
+
        return cls(
            session_key=data["session_key"],
            session_id=data["session_id"],
@@ -429,6 +461,9 @@ class SessionEntry:
            cost_status=data.get("cost_status", "unknown"),
            memory_flushed=data.get("memory_flushed", False),
            suspended=data.get("suspended", False),
+            resume_pending=data.get("resume_pending", False),
+            resume_reason=data.get("resume_reason"),
+            last_resume_marked_at=last_resume_marked_at,
        )


@@ -705,9 +740,23 @@ class SessionStore:
                entry = self._entries[session_key]

                # Auto-reset sessions marked as suspended (e.g. after /stop
-                # broke a stuck loop — #7536).
+                # broke a stuck loop — #7536).  ``suspended`` is the hard
+                # forced-wipe signal and always wins over ``resume_pending``,
+                # so repeated interrupted restarts that escalate via the
+                # existing ``.restart_failure_counts`` stuck-loop counter
+                # still converge to a clean slate.
                if entry.suspended:
                    reset_reason = "suspended"
+                elif entry.resume_pending:
+                    # Restart-interrupted session: preserve the session_id
+                    # and return the existing entry so the transcript
+                    # reloads intact.  ``resume_pending`` is cleared after
+                    # the NEXT successful turn completes (not here), which
+                    # means a re-interrupted retry keeps trying — the
+                    # stuck-loop counter handles terminal escalation.
+                    entry.updated_at = now
+                    self._save()
+                    return entry
                else:
                    reset_reason = self._should_reset(entry, source)
                if not reset_reason:
@@ -797,6 +846,106 @@ class SessionStore:
                return True
        return False

+    def mark_resume_pending(
+        self,
+        session_key: str,
+        reason: str = "restart_timeout",
+    ) -> bool:
+        """Mark a session as resumable after a restart interruption.
+
+        Unlike ``suspend_session()``, this preserves the existing
+        ``session_id`` and the transcript.  The next call to
+        ``get_or_create_session()`` for this key returns the same entry
+        so the user auto-resumes on the same conversation lane.
+
+        Returns True if the session existed and was marked.
+        """
+        with self._lock:
+            self._ensure_loaded_locked()
+            if session_key in self._entries:
+                entry = self._entries[session_key]
+                # Never override an explicit ``suspended`` — that is a hard
+                # forced-wipe signal (from /stop or stuck-loop escalation).
+                if entry.suspended:
+                    return False
+                entry.resume_pending = True
+                entry.resume_reason = reason
+                entry.last_resume_marked_at = _now()
+                self._save()
+                return True
+        return False
+
+    def clear_resume_pending(self, session_key: str) -> bool:
+        """Clear the resume-pending flag after a successful resumed turn.
+
+        Called from the gateway after ``run_conversation()`` returns a
+        final response for a session that had ``resume_pending=True``,
+        signalling that recovery succeeded.
+
+        Returns True if a flag was cleared.
+        """
+        with self._lock:
+            self._ensure_loaded_locked()
+            entry = self._entries.get(session_key)
+            if entry is None or not entry.resume_pending:
+                return False
+            entry.resume_pending = False
+            entry.resume_reason = None
+            entry.last_resume_marked_at = None
+            self._save()
+            return True
+
+    def prune_old_entries(self, max_age_days: int) -> int:
+        """Drop SessionEntry records older than max_age_days.
+
+        Pruning is based on ``updated_at`` (last activity), not ``created_at``.
+        A session that's been active within the window is kept regardless of
+        how old it is.  Entries marked ``suspended`` are kept — the user
+        explicitly paused them for later resume.  Entries held by an active
+        process (via has_active_processes_fn) are also kept so long-running
+        background work isn't orphaned.
+
+        Pruning is functionally identical to a natural reset-policy expiry:
+        the transcript in SQLite stays, but the session_key → session_id
+        mapping is dropped and the user starts a fresh session on return.
+
+        ``max_age_days <= 0`` disables pruning; returns 0 immediately.
+        Returns the number of entries removed.
+        """
+        if max_age_days is None or max_age_days <= 0:
+            return 0
+        from datetime import timedelta
+
+        cutoff = _now() - timedelta(days=max_age_days)
+        removed_keys: list[str] = []
+
+        with self._lock:
+            self._ensure_loaded_locked()
+            for key, entry in list(self._entries.items()):
+                if entry.suspended:
+                    continue
+                # Never prune sessions with an active background process
+                # attached — the user may still be waiting on output.
+                if self._has_active_processes_fn is not None:
+                    try:
+                        if self._has_active_processes_fn(entry.session_id):
+                            continue
+                    except Exception:
+                        pass
+                if entry.updated_at < cutoff:
+                    removed_keys.append(key)
+            for key in removed_keys:
+                self._entries.pop(key, None)
+            if removed_keys:
+                self._save()
+
+        if removed_keys:
+            logger.info(
+                "SessionStore pruned %d entries older than %d days",
+                len(removed_keys), max_age_days,
+            )
+        return len(removed_keys)
+
    def suspend_recently_active(self, max_age_seconds: int = 120) -> int:
        """Mark recently-active sessions as suspended.

@@ -805,6 +954,12 @@ class SessionStore:
        (#7536).  Only suspends sessions updated within *max_age_seconds*
        to avoid resetting long-idle sessions that are harmless to resume.
        Returns the number of sessions that were suspended.
+
+        Entries flagged ``resume_pending=True`` are skipped — those were
+        marked intentionally by the drain-timeout path as recoverable.
+        Terminal escalation for genuinely stuck ``resume_pending`` sessions
+        is handled by the existing ``.restart_failure_counts`` stuck-loop
+        counter, which runs after this method on startup.
        """
        from datetime import timedelta

@@ -813,6 +968,8 @@ class SessionStore:
        with self._lock:
            self._ensure_loaded_locked()
            for entry in self._entries.values():
+                if entry.resume_pending:
+                    continue
                if not entry.suspended and entry.updated_at >= cutoff:
                    entry.suspended = True
                    count += 1
@@ -37,18 +37,24 @@ needs to replace the import + call site:
 """

 from contextvars import ContextVar
+from typing import Any
+
+# Sentinel to distinguish "never set in this context" from "explicitly set to empty".
+# When a contextvar holds _UNSET, we fall back to os.environ (CLI/cron compat).
+# When it holds "" (after clear_session_vars resets it), we return "" — no fallback.
+_UNSET: Any = object()

 # ---------------------------------------------------------------------------
 # Per-task session variables
 # ---------------------------------------------------------------------------

-_SESSION_PLATFORM: ContextVar[str] = ContextVar("HERMES_SESSION_PLATFORM", default="")
-_SESSION_CHAT_ID: ContextVar[str] = ContextVar("HERMES_SESSION_CHAT_ID", default="")
-_SESSION_CHAT_NAME: ContextVar[str] = ContextVar("HERMES_SESSION_CHAT_NAME", default="")
-_SESSION_THREAD_ID: ContextVar[str] = ContextVar("HERMES_SESSION_THREAD_ID", default="")
-_SESSION_USER_ID: ContextVar[str] = ContextVar("HERMES_SESSION_USER_ID", default="")
-_SESSION_USER_NAME: ContextVar[str] = ContextVar("HERMES_SESSION_USER_NAME", default="")
-_SESSION_KEY: ContextVar[str] = ContextVar("HERMES_SESSION_KEY", default="")
+_SESSION_PLATFORM: ContextVar = ContextVar("HERMES_SESSION_PLATFORM", default=_UNSET)
+_SESSION_CHAT_ID: ContextVar = ContextVar("HERMES_SESSION_CHAT_ID", default=_UNSET)
+_SESSION_CHAT_NAME: ContextVar = ContextVar("HERMES_SESSION_CHAT_NAME", default=_UNSET)
+_SESSION_THREAD_ID: ContextVar = ContextVar("HERMES_SESSION_THREAD_ID", default=_UNSET)
+_SESSION_USER_ID: ContextVar = ContextVar("HERMES_SESSION_USER_ID", default=_UNSET)
+_SESSION_USER_NAME: ContextVar = ContextVar("HERMES_SESSION_USER_NAME", default=_UNSET)
+_SESSION_KEY: ContextVar = ContextVar("HERMES_SESSION_KEY", default=_UNSET)

 _VAR_MAP = {
    "HERMES_SESSION_PLATFORM": _SESSION_PLATFORM,
@@ -91,10 +97,17 @@ def set_session_vars(


 def clear_session_vars(tokens: list) -> None:
-    """Restore session context variables to their pre-handler values."""
-    if not tokens:
-        return
-    vars_in_order = [
+    """Mark session context variables as explicitly cleared.
+
+    Sets all variables to ``""`` so that ``get_session_env`` returns an empty
+    string instead of falling back to (potentially stale) ``os.environ``
+    values.  The *tokens* argument is accepted for API compatibility with
+    callers that saved the return value of ``set_session_vars``, but the
+    actual clearing uses ``var.set("")`` rather than ``var.reset(token)``
+    to ensure the "explicitly cleared" state is distinguishable from
+    "never set" (which holds the ``_UNSET`` sentinel).
+    """
+    for var in (
        _SESSION_PLATFORM,
        _SESSION_CHAT_ID,
        _SESSION_CHAT_NAME,
@@ -102,9 +115,8 @@ def clear_session_vars(tokens: list) -> None:
        _SESSION_USER_ID,
        _SESSION_USER_NAME,
        _SESSION_KEY,
-    ]
-    for var, token in zip(vars_in_order, tokens):
-        var.reset(token)
+    ):
+        var.set("")


 def get_session_env(name: str, default: str = "") -> str:
@@ -113,8 +125,13 @@ def get_session_env(name: str, default: str = "") -> str:
    Drop-in replacement for ``os.getenv("HERMES_SESSION_*", default)``.

    Resolution order:
-    1. Context variable (set by the gateway for concurrency-safe access)
-    2. ``os.environ`` (used by CLI, cron scheduler, and tests)
+    1. Context variable (set by the gateway for concurrency-safe access).
+       If the variable was explicitly set (even to ``""``) via
+       ``set_session_vars`` or ``clear_session_vars``, that value is
+       returned — **no fallback to os.environ**.
+    2. ``os.environ`` (only when the context variable was never set in
+       this context — i.e. CLI, cron scheduler, and test processes that
+       don't use ``set_session_vars`` at all).
    3. *default*
    """
    import os
@@ -122,7 +139,7 @@ def get_session_env(name: str, default: str = "") -> str:
    var = _VAR_MAP.get(name)
    if var is not None:
        value = var.get()
-        if value:
+        if value is not _UNSET:
            return value
    # Fall back to os.environ for CLI, cron, and test compatibility
    return os.getenv(name, default)
@@ -188,8 +188,8 @@ def _write_json_file(path: Path, payload: dict[str, Any]) -> None:
    path.write_text(json.dumps(payload))


-def _read_pid_record() -> Optional[dict]:
-    pid_path = _get_pid_path()
+def _read_pid_record(pid_path: Optional[Path] = None) -> Optional[dict]:
+    pid_path = pid_path or _get_pid_path()
    if not pid_path.exists():
        return None

@@ -212,6 +212,18 @@ def _read_pid_record() -> Optional[dict]:
    return None


+def _cleanup_invalid_pid_path(pid_path: Path, *, cleanup_stale: bool) -> None:
+    if not cleanup_stale:
+        return
+    try:
+        if pid_path == _get_pid_path():
+            remove_pid_file()
+        else:
+            pid_path.unlink(missing_ok=True)
+    except Exception:
+        pass
+
+
 def write_pid_file() -> None:
    """Write the current process PID and metadata to the gateway PID file."""
    _write_json_file(_get_pid_path(), _build_pid_record())
@@ -413,43 +425,179 @@ def release_all_scoped_locks() -> int:
    return removed


-def get_running_pid() -> Optional[int]:
+# ── --replace takeover marker ─────────────────────────────────────────
+#
+# When a new gateway starts with ``--replace``, it SIGTERMs the existing
+# gateway so it can take over the bot token. PR #5646 made SIGTERM exit
+# the gateway with code 1 so ``Restart=on-failure`` can revive it after
+# unexpected kills — but that also means a --replace takeover target
+# exits 1, which tricks systemd into reviving it 30 seconds later,
+# starting a flap loop against the replacer when both services are
+# enabled in the user's systemd (e.g. ``hermes.service`` + ``hermes-
+# gateway.service``).
+#
+# The takeover marker breaks the loop: the replacer writes a short-lived
+# file naming the target PID + start_time BEFORE sending SIGTERM.
+# The target's shutdown handler reads the marker and, if it names
+# this process, treats the SIGTERM as a planned takeover and exits 0.
+# The marker is unlinked after the target has consumed it, so a stale
+# marker left by a crashed replacer can grief at most one future
+# shutdown on the same PID — and only within _TAKEOVER_MARKER_TTL_S.
+
+_TAKEOVER_MARKER_FILENAME = ".gateway-takeover.json"
+_TAKEOVER_MARKER_TTL_S = 60  # Marker older than this is treated as stale
+
+
+def _get_takeover_marker_path() -> Path:
+    """Return the path to the --replace takeover marker file."""
+    home = get_hermes_home()
+    return home / _TAKEOVER_MARKER_FILENAME
+
+
+def write_takeover_marker(target_pid: int) -> bool:
+    """Record that ``target_pid`` is being replaced by the current process.
+
+    Captures the target's ``start_time`` so that PID reuse after the
+    target exits cannot later match the marker. Also records the
+    replacer's PID and a UTC timestamp for TTL-based staleness checks.
+
+    Returns True on successful write, False on any failure. The caller
+    should proceed with the SIGTERM even if the write fails (the marker
+    is a best-effort signal, not a correctness requirement).
+    """
+    try:
+        target_start_time = _get_process_start_time(target_pid)
+        record = {
+            "target_pid": target_pid,
+            "target_start_time": target_start_time,
+            "replacer_pid": os.getpid(),
+            "written_at": _utc_now_iso(),
+        }
+        _write_json_file(_get_takeover_marker_path(), record)
+        return True
+    except (OSError, PermissionError):
+        return False
+
+
+def consume_takeover_marker_for_self() -> bool:
+    """Check & unlink the takeover marker if it names the current process.
+
+    Returns True only when a valid (non-stale) marker names this PID +
+    start_time. A returning True indicates the current SIGTERM is a
+    planned --replace takeover; the caller should exit 0 instead of
+    signalling ``_signal_initiated_shutdown``.
+
+    Always unlinks the marker on match (and on detected staleness) so
+    subsequent unrelated signals don't re-trigger.
+    """
+    path = _get_takeover_marker_path()
+    record = _read_json_file(path)
+    if not record:
+        return False
+
+    # Any malformed or stale marker → drop it and return False
+    try:
+        target_pid = int(record["target_pid"])
+        target_start_time = record.get("target_start_time")
+        written_at = record.get("written_at") or ""
+    except (KeyError, TypeError, ValueError):
+        try:
+            path.unlink(missing_ok=True)
+        except OSError:
+            pass
+        return False
+
+    # TTL guard: a stale marker older than _TAKEOVER_MARKER_TTL_S is ignored.
+    stale = False
+    try:
+        written_dt = datetime.fromisoformat(written_at)
+        age = (datetime.now(timezone.utc) - written_dt).total_seconds()
+        if age > _TAKEOVER_MARKER_TTL_S:
+            stale = True
+    except (TypeError, ValueError):
+        stale = True  # Unparseable timestamp — treat as stale
+
+    if stale:
+        try:
+            path.unlink(missing_ok=True)
+        except OSError:
+            pass
+        return False
+
+    # Does the marker name THIS process?
+    our_pid = os.getpid()
+    our_start_time = _get_process_start_time(our_pid)
+    matches = (
+        target_pid == our_pid
+        and target_start_time is not None
+        and our_start_time is not None
+        and target_start_time == our_start_time
+    )
+
+    # Consume the marker whether it matched or not — a marker that doesn't
+    # match our identity is stale-for-us anyway.
+    try:
+        path.unlink(missing_ok=True)
+    except OSError:
+        pass
+
+    return matches
+
+
+def clear_takeover_marker() -> None:
+    """Remove the takeover marker unconditionally. Safe to call repeatedly."""
+    try:
+        _get_takeover_marker_path().unlink(missing_ok=True)
+    except OSError:
+        pass
+
+
+def get_running_pid(
+    pid_path: Optional[Path] = None,
+    *,
+    cleanup_stale: bool = True,
+) -> Optional[int]:
    """Return the PID of a running gateway instance, or ``None``.

    Checks the PID file and verifies the process is actually alive.
    Cleans up stale PID files automatically.
    """
-    record = _read_pid_record()
+    resolved_pid_path = pid_path or _get_pid_path()
+    record = _read_pid_record(resolved_pid_path)
    if not record:
-        remove_pid_file()
+        _cleanup_invalid_pid_path(resolved_pid_path, cleanup_stale=cleanup_stale)
        return None

    try:
        pid = int(record["pid"])
    except (KeyError, TypeError, ValueError):
-        remove_pid_file()
+        _cleanup_invalid_pid_path(resolved_pid_path, cleanup_stale=cleanup_stale)
        return None

    try:
        os.kill(pid, 0)  # signal 0 = existence check, no actual signal sent
    except (ProcessLookupError, PermissionError):
-        remove_pid_file()
+        _cleanup_invalid_pid_path(resolved_pid_path, cleanup_stale=cleanup_stale)
        return None

    recorded_start = record.get("start_time")
    current_start = _get_process_start_time(pid)
    if recorded_start is not None and current_start is not None and current_start != recorded_start:
-        remove_pid_file()
+        _cleanup_invalid_pid_path(resolved_pid_path, cleanup_stale=cleanup_stale)
        return None

    if not _looks_like_gateway_process(pid):
        if not _record_looks_like_gateway(record):
-            remove_pid_file()
+            _cleanup_invalid_pid_path(resolved_pid_path, cleanup_stale=cleanup_stale)
            return None

    return pid


-def is_gateway_running() -> bool:
+def is_gateway_running(
+    pid_path: Optional[Path] = None,
+    *,
+    cleanup_stale: bool = True,
+) -> bool:
    """Check if the gateway daemon is currently running."""
-    return get_running_pid() is not None
+    return get_running_pid(pid_path, cleanup_stale=cleanup_stale) is not None
@@ -43,6 +43,7 @@ class StreamConsumerConfig:
    edit_interval: float = 1.0
    buffer_threshold: int = 40
    cursor: str = " ▉"
+    buffer_only: bool = False


 class GatewayStreamConsumer:
@@ -99,6 +100,14 @@ class GatewayStreamConsumer:
        self._flood_strikes = 0         # Consecutive flood-control edit failures
        self._current_edit_interval = self.cfg.edit_interval  # Adaptive backoff
        self._final_response_sent = False
+        # Cache adapter lifecycle capability: only platforms that need an
+        # explicit finalize call (e.g. DingTalk AI Cards) force us to make
+        # a redundant final edit.  Everyone else keeps the fast path.
+        # Use ``is True`` (not ``bool(...)``) so MagicMock attribute access
+        # in tests doesn't incorrectly enable this path.
+        self._adapter_requires_finalize: bool = (
+            getattr(adapter, "REQUIRES_EDIT_FINALIZE", False) is True
+        )

        # Think-block filter state (mirrors CLI's _stream_delta tag suppression)
        self._in_think_block = False
@@ -295,10 +304,13 @@ class GatewayStreamConsumer:
                    got_done
                    or got_segment_break
                    or commentary_text is not None
-                    or (elapsed >= self._current_edit_interval
-                        and self._accumulated)
-                    or len(self._accumulated) >= self.cfg.buffer_threshold
                )
+                if not self.cfg.buffer_only:
+                    should_edit = should_edit or (
+                        (elapsed >= self._current_edit_interval
+                            and self._accumulated)
+                        or len(self._accumulated) >= self.cfg.buffer_threshold
+                    )

                current_update_visible = False
                if should_edit and self._accumulated:
@@ -357,7 +369,16 @@ class GatewayStreamConsumer:
                    if not got_done and not got_segment_break and commentary_text is None:
                        display_text += self.cfg.cursor

-                    current_update_visible = await self._send_or_edit(display_text)
+                    # Segment break: finalize the current message so platforms
+                    # that need explicit closure (e.g. DingTalk AI Cards) don't
+                    # leave the previous segment stuck in a loading state when
+                    # the next segment (tool progress, next chunk) creates a
+                    # new message below it.  got_done has its own finalize
+                    # path below so we don't finalize here for it.
+                    current_update_visible = await self._send_or_edit(
+                        display_text,
+                        finalize=got_segment_break,
+                    )
                    self._last_edit_time = time.monotonic()

                if got_done:
@@ -368,10 +389,22 @@ class GatewayStreamConsumer:
                    if self._accumulated:
                        if self._fallback_final_send:
                            await self._send_fallback_final(self._accumulated)
-                        elif current_update_visible:
+                        elif (
+                            current_update_visible
+                            and not self._adapter_requires_finalize
+                        ):
+                            # Mid-stream edit above already delivered the
+                            # final accumulated content.  Skip the redundant
+                            # final edit — but only for adapters that don't
+                            # need an explicit finalize signal.
                            self._final_response_sent = True
                        elif self._message_id:
-                            self._final_response_sent = await self._send_or_edit(self._accumulated)
+                            # Either the mid-stream edit didn't run (no
+                            # visible update this tick) OR the adapter needs
+                            # explicit finalize=True to close the stream.
+                            self._final_response_sent = await self._send_or_edit(
+                                self._accumulated, finalize=True,
+                            )
                        elif not self._already_sent:
                            self._final_response_sent = await self._send_or_edit(self._accumulated)
                    return
@@ -397,24 +430,41 @@ class GatewayStreamConsumer:
                # a real string like "msg_1", not "__no_edit__", so that case
                # still resets and creates a fresh segment as intended.)
                if got_segment_break:
+                    # If the segment-break edit failed to deliver the
+                    # accumulated content (flood control that has not yet
+                    # promoted to fallback mode, or fallback mode itself),
+                    # _accumulated still holds pre-boundary text the user
+                    # never saw. Flush that tail as a continuation message
+                    # before the reset below wipes _accumulated — otherwise
+                    # text generated before the tool boundary is silently
+                    # dropped (issue #8124).
+                    if (
+                        self._accumulated
+                        and not current_update_visible
+                        and self._message_id
+                        and self._message_id != "__no_edit__"
+                    ):
+                        await self._flush_segment_tail_on_edit_failure()
                    self._reset_segment_state(preserve_no_edit=True)

                await asyncio.sleep(0.05)  # Small yield to not busy-loop

        except asyncio.CancelledError:
            # Best-effort final edit on cancellation
+            _best_effort_ok = False
            if self._accumulated and self._message_id:
                try:
-                    await self._send_or_edit(self._accumulated)
+                    _best_effort_ok = bool(await self._send_or_edit(self._accumulated))
                except Exception:
                    pass
-            # If we delivered any content before being cancelled, mark the
-            # final response as sent so the gateway's already_sent check
-            # doesn't trigger a duplicate message.  The 5-second
-            # stream_task timeout (gateway/run.py) can cancel us while
-            # waiting on a slow Telegram API call — without this flag the
-            # gateway falls through to the normal send path.
-            if self._already_sent:
+            # Only confirm final delivery if the best-effort send above
+            # actually succeeded OR if the final response was already
+            # confirmed before we were cancelled.  Previously this
+            # promoted any partial send (already_sent=True) to
+            # final_response_sent — which suppressed the gateway's
+            # fallback send even when only intermediate text (e.g.
+            # "Let me search…") had been delivered, not the real answer.
+            if _best_effort_ok and not self._final_response_sent:
                self._final_response_sent = True
        except Exception as e:
            logger.error("Stream consumer error: %s", e)
@@ -513,9 +563,17 @@ class GatewayStreamConsumer:
        self._fallback_final_send = False
        if not continuation.strip():
            # Nothing new to send — the visible partial already matches final text.
-            self._already_sent = True
-            self._final_response_sent = True
-            return
+            # BUT: if final_text itself has meaningful content (e.g. a timeout
+            # message after a long tool call), the prefix-based continuation
+            # calculation may wrongly conclude "already shown" because the
+            # streamed prefix was from a *previous* segment (before the tool
+            # boundary).  In that case, send the full final_text as-is (#10807).
+            if final_text.strip() and final_text != self._visible_prefix():
+                continuation = final_text
+            else:
+                self._already_sent = True
+                self._final_response_sent = True
+                return

        raw_limit = getattr(self.adapter, "MAX_MESSAGE_LENGTH", 4096)
        safe_limit = max(500, raw_limit - 100)
@@ -577,6 +635,39 @@ class GatewayStreamConsumer:
        err_lower = err.lower()
        return "flood" in err_lower or "retry after" in err_lower or "rate" in err_lower

+    async def _flush_segment_tail_on_edit_failure(self) -> None:
+        """Deliver un-sent tail content before a segment-break reset.
+
+        When an edit fails (flood control, transport error) and a tool
+        boundary arrives before the next retry, ``_accumulated`` holds text
+        that was generated but never shown to the user. Without this flush,
+        the segment reset would discard that tail and leave a frozen cursor
+        in the partial message.
+
+        Sends the tail that sits after the last successfully-delivered
+        prefix as a new message, and best-effort strips the stuck cursor
+        from the previous partial message.
+        """
+        if not self._fallback_final_send:
+            await self._try_strip_cursor()
+        visible = self._fallback_prefix or self._visible_prefix()
+        tail = self._accumulated
+        if visible and tail.startswith(visible):
+            tail = tail[len(visible):].lstrip()
+        tail = self._clean_for_display(tail)
+        if not tail.strip():
+            return
+        try:
+            result = await self.adapter.send(
+                chat_id=self.chat_id,
+                content=tail,
+                metadata=self.metadata,
+            )
+            if result.success:
+                self._already_sent = True
+        except Exception as e:
+            logger.error("Segment-break tail flush error: %s", e)
+
    async def _try_strip_cursor(self) -> None:
        """Best-effort edit to remove the cursor from the last visible message.

@@ -609,19 +700,25 @@ class GatewayStreamConsumer:
                content=text,
                metadata=self.metadata,
            )
-            if result.success:
-                self._already_sent = True
-                return True
+            # Note: do NOT set _already_sent = True here.
+            # Commentary messages are interim status updates (e.g. "Using browser
+            # tool..."), not the final response. Setting already_sent would cause
+            # the final response to be incorrectly suppressed when there are
+            # multiple tool calls. See: https://github.com/NousResearch/hermes-agent/issues/10454
+            return result.success
        except Exception as e:
            logger.error("Commentary send error: %s", e)
-        return False
+            return False

-    async def _send_or_edit(self, text: str) -> bool:
+    async def _send_or_edit(self, text: str, *, finalize: bool = False) -> bool:
        """Send or edit the streaming message.

        Returns True if the text was successfully delivered (sent or edited),
        False otherwise.  Callers like the overflow split loop use this to
        decide whether to advance past the delivered chunk.
+
+        ``finalize`` is True when this is the last edit in a streaming
+        sequence.
        """
        # Strip MEDIA: directives so they don't appear as visible text.
        # Media files are delivered as native attachments after the stream
@@ -655,14 +752,22 @@ class GatewayStreamConsumer:
        try:
            if self._message_id is not None:
                if self._edit_supported:
-                    # Skip if text is identical to what we last sent
-                    if text == self._last_sent_text:
+                    # Skip if text is identical to what we last sent.
+                    # Exception: adapters that require an explicit finalize
+                    # call (REQUIRES_EDIT_FINALIZE) must still receive the
+                    # finalize=True edit even when content is unchanged, so
+                    # their streaming UI can transition out of the in-
+                    # progress state.  Everyone else short-circuits.
+                    if text == self._last_sent_text and not (
+                        finalize and self._adapter_requires_finalize
+                    ):
                        return True
                    # Edit existing message
                    result = await self.adapter.edit_message(
                        chat_id=self.chat_id,
                        message_id=self._message_id,
                        content=text,
+                        finalize=finalize,
                    )
                    if result.success:
                        self._already_sent = True
@@ -11,5 +11,5 @@ Provides subcommands for:
 - hermes cron          - Manage cron jobs
 """

-__version__ = "0.9.0"
-__release_date__ = "2026.4.13"
+__version__ = "0.10.0"
+__release_date__ = "2026.4.16"
@@ -70,6 +70,7 @@ DEFAULT_CODEX_BASE_URL = "https://chatgpt.com/backend-api/codex"
 DEFAULT_QWEN_BASE_URL = "https://portal.qwen.ai/v1"
 DEFAULT_GITHUB_MODELS_BASE_URL = "https://api.githubcopilot.com"
 DEFAULT_COPILOT_ACP_BASE_URL = "acp://copilot"
+DEFAULT_OLLAMA_CLOUD_BASE_URL = "https://ollama.com/v1"
 CODEX_OAUTH_CLIENT_ID = "app_EMoamEEZ73f0CkXaXp7hrann"
 CODEX_OAUTH_TOKEN_URL = "https://auth.openai.com/oauth/token"
 CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120
@@ -77,6 +78,10 @@ QWEN_OAUTH_CLIENT_ID = "f0304373b74a44d2b584a3fb70ca9e56"
 QWEN_OAUTH_TOKEN_URL = "https://chat.qwen.ai/api/v1/oauth2/token"
 QWEN_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120

+# Google Gemini OAuth (google-gemini-cli provider, Cloud Code Assist backend)
+DEFAULT_GEMINI_CLOUDCODE_BASE_URL = "cloudcode-pa://google"
+GEMINI_OAUTH_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 60  # refresh 60s before expiry
+

 # =============================================================================
 # Provider Registry
@@ -121,6 +126,12 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
        auth_type="oauth_external",
        inference_base_url=DEFAULT_QWEN_BASE_URL,
    ),
+    "google-gemini-cli": ProviderConfig(
+        id="google-gemini-cli",
+        name="Google Gemini (OAuth)",
+        auth_type="oauth_external",
+        inference_base_url=DEFAULT_GEMINI_CLOUDCODE_BASE_URL,
+    ),
    "copilot": ProviderConfig(
        id="copilot",
        name="GitHub Copilot",
@@ -222,6 +233,14 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
        api_key_env_vars=("XAI_API_KEY",),
        base_url_env_var="XAI_BASE_URL",
    ),
+    "nvidia": ProviderConfig(
+        id="nvidia",
+        name="NVIDIA NIM",
+        auth_type="api_key",
+        inference_base_url="https://integrate.api.nvidia.com/v1",
+        api_key_env_vars=("NVIDIA_API_KEY",),
+        base_url_env_var="NVIDIA_BASE_URL",
+    ),
    "ai-gateway": ProviderConfig(
        id="ai-gateway",
        name="Vercel AI Gateway",
@@ -274,6 +293,22 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
        api_key_env_vars=("XIAOMI_API_KEY",),
        base_url_env_var="XIAOMI_BASE_URL",
    ),
+    "ollama-cloud": ProviderConfig(
+        id="ollama-cloud",
+        name="Ollama Cloud",
+        auth_type="api_key",
+        inference_base_url=DEFAULT_OLLAMA_CLOUD_BASE_URL,
+        api_key_env_vars=("OLLAMA_API_KEY",),
+        base_url_env_var="OLLAMA_BASE_URL",
+    ),
+    "bedrock": ProviderConfig(
+        id="bedrock",
+        name="AWS Bedrock",
+        auth_type="aws_sdk",
+        inference_base_url="https://bedrock-runtime.us-east-1.amazonaws.com",
+        api_key_env_vars=(),
+        base_url_env_var="BEDROCK_BASE_URL",
+    ),
 }


@@ -746,6 +781,28 @@ def is_source_suppressed(provider_id: str, source: str) -> bool:
        return False


+def unsuppress_credential_source(provider_id: str, source: str) -> bool:
+    """Clear a suppression marker so the source will be re-seeded on the next load.
+
+    Returns True if a marker was cleared, False if no marker existed.
+    """
+    with _auth_store_lock():
+        auth_store = _load_auth_store()
+        suppressed = auth_store.get("suppressed_sources")
+        if not isinstance(suppressed, dict):
+            return False
+        provider_list = suppressed.get(provider_id)
+        if not isinstance(provider_list, list) or source not in provider_list:
+            return False
+        provider_list.remove(source)
+        if not provider_list:
+            suppressed.pop(provider_id, None)
+        if not suppressed:
+            auth_store.pop("suppressed_sources", None)
+        _save_auth_store(auth_store)
+        return True
+
+
 def get_provider_auth_state(provider_id: str) -> Optional[Dict[str, Any]]:
    """Return persisted auth state for a provider, or None."""
    auth_store = _load_auth_store()
@@ -911,6 +968,7 @@ def resolve_provider(
    _PROVIDER_ALIASES = {
        "glm": "zai", "z-ai": "zai", "z.ai": "zai", "zhipu": "zai",
        "google": "gemini", "google-gemini": "gemini", "google-ai-studio": "gemini",
+        "x-ai": "xai", "x.ai": "xai", "grok": "xai",
        "kimi": "kimi-coding", "kimi-for-coding": "kimi-coding", "moonshot": "kimi-coding",
        "kimi-cn": "kimi-coding-cn", "moonshot-cn": "kimi-coding-cn",
        "arcee-ai": "arcee", "arceeai": "arcee",
@@ -921,14 +979,16 @@ def resolve_provider(
        "github-copilot-acp": "copilot-acp", "copilot-acp-agent": "copilot-acp",
        "aigateway": "ai-gateway", "vercel": "ai-gateway", "vercel-ai-gateway": "ai-gateway",
        "opencode": "opencode-zen", "zen": "opencode-zen",
-        "qwen-portal": "qwen-oauth", "qwen-cli": "qwen-oauth", "qwen-oauth": "qwen-oauth",
+        "qwen-portal": "qwen-oauth", "qwen-cli": "qwen-oauth", "qwen-oauth": "qwen-oauth", "google-gemini-cli": "google-gemini-cli", "gemini-cli": "google-gemini-cli", "gemini-oauth": "google-gemini-cli",
        "hf": "huggingface", "hugging-face": "huggingface", "huggingface-hub": "huggingface",
        "mimo": "xiaomi", "xiaomi-mimo": "xiaomi",
+        "aws": "bedrock", "aws-bedrock": "bedrock", "amazon-bedrock": "bedrock", "amazon": "bedrock",
        "go": "opencode-go", "opencode-go-sub": "opencode-go",
        "kilo": "kilocode", "kilo-code": "kilocode", "kilo-gateway": "kilocode",
        # Local server aliases — route through the generic custom provider
        "lmstudio": "custom", "lm-studio": "custom", "lm_studio": "custom",
-        "ollama": "custom", "vllm": "custom", "llamacpp": "custom",
+        "ollama": "custom", "ollama_cloud": "ollama-cloud",
+        "vllm": "custom", "llamacpp": "custom",
        "llama.cpp": "custom", "llama-cpp": "custom",
    }
    normalized = _PROVIDER_ALIASES.get(normalized, normalized)
@@ -980,6 +1040,15 @@ def resolve_provider(
            if has_usable_secret(os.getenv(env_var, "")):
                return pid

+    # AWS Bedrock — detect via boto3 credential chain (IAM roles, SSO, env vars).
+    # This runs after API-key providers so explicit keys always win.
+    try:
+        from agent.bedrock_adapter import has_aws_credentials
+        if has_aws_credentials():
+            return "bedrock"
+    except ImportError:
+        pass  # boto3 not installed — skip Bedrock auto-detection
+
    raise AuthError(
        "No inference provider configured. Run 'hermes model' to choose a "
        "provider and model, or set an API key (OPENROUTER_API_KEY, "
@@ -1222,6 +1291,83 @@ def get_qwen_auth_status() -> Dict[str, Any]:
        }


+# =============================================================================
+# Google Gemini OAuth (google-gemini-cli) — PKCE flow + Cloud Code Assist.
+#
+# Tokens live in ~/.hermes/auth/google_oauth.json (managed by agent.google_oauth).
+# The `base_url` here is the marker "cloudcode-pa://google" that run_agent.py
+# uses to construct a GeminiCloudCodeClient instead of the default OpenAI SDK.
+# Actual HTTP traffic goes to https://cloudcode-pa.googleapis.com/v1internal:*.
+# =============================================================================
+
+def resolve_gemini_oauth_runtime_credentials(
+    *,
+    force_refresh: bool = False,
+) -> Dict[str, Any]:
+    """Resolve runtime OAuth creds for google-gemini-cli."""
+    try:
+        from agent.google_oauth import (
+            GoogleOAuthError,
+            _credentials_path,
+            get_valid_access_token,
+            load_credentials,
+        )
+    except ImportError as exc:
+        raise AuthError(
+            f"agent.google_oauth is not importable: {exc}",
+            provider="google-gemini-cli",
+            code="google_oauth_module_missing",
+        ) from exc
+
+    try:
+        access_token = get_valid_access_token(force_refresh=force_refresh)
+    except GoogleOAuthError as exc:
+        raise AuthError(
+            str(exc),
+            provider="google-gemini-cli",
+            code=exc.code,
+        ) from exc
+
+    creds = load_credentials()
+    base_url = DEFAULT_GEMINI_CLOUDCODE_BASE_URL
+    return {
+        "provider": "google-gemini-cli",
+        "base_url": base_url,
+        "api_key": access_token,
+        "source": "google-oauth",
+        "expires_at_ms": (creds.expires_ms if creds else None),
+        "auth_file": str(_credentials_path()),
+        "email": (creds.email if creds else "") or "",
+        "project_id": (creds.project_id if creds else "") or "",
+    }
+
+
+def get_gemini_oauth_auth_status() -> Dict[str, Any]:
+    """Return a status dict for `hermes auth list` / `hermes status`."""
+    try:
+        from agent.google_oauth import _credentials_path, load_credentials
+    except ImportError:
+        return {"logged_in": False, "error": "agent.google_oauth unavailable"}
+    auth_path = _credentials_path()
+    creds = load_credentials()
+    if creds is None or not creds.access_token:
+        return {
+            "logged_in": False,
+            "auth_file": str(auth_path),
+            "error": "not logged in",
+        }
+    return {
+        "logged_in": True,
+        "auth_file": str(auth_path),
+        "source": "google-oauth",
+        "api_key": creds.access_token,
+        "expires_at_ms": creds.expires_ms,
+        "email": creds.email,
+        "project_id": creds.project_id,
+    }
+
+
+
 # =============================================================================
 # SSH / remote session detection
 # =============================================================================
@@ -1288,49 +1434,6 @@ def _read_codex_tokens(*, _lock: bool = True) -> Dict[str, Any]:
    }


-def _write_codex_cli_tokens(
-    access_token: str,
-    refresh_token: str,
-    *,
-    last_refresh: Optional[str] = None,
-) -> None:
-    """Write refreshed tokens back to ~/.codex/auth.json.
-
-    OpenAI OAuth refresh tokens are single-use and rotate on every refresh.
-    When Hermes refreshes a token it consumes the old refresh_token; if we
-    don't write the new pair back, the Codex CLI (or VS Code extension) will
-    fail with ``refresh_token_reused`` on its next refresh attempt.
-
-    This mirrors the Anthropic write-back to ~/.claude/.credentials.json
-    via ``_write_claude_code_credentials()``.
-    """
-    codex_home = os.getenv("CODEX_HOME", "").strip()
-    if not codex_home:
-        codex_home = str(Path.home() / ".codex")
-    auth_path = Path(codex_home).expanduser() / "auth.json"
-    try:
-        existing: Dict[str, Any] = {}
-        if auth_path.is_file():
-            existing = json.loads(auth_path.read_text(encoding="utf-8"))
-        if not isinstance(existing, dict):
-            existing = {}
-
-        tokens_dict = existing.get("tokens")
-        if not isinstance(tokens_dict, dict):
-            tokens_dict = {}
-        tokens_dict["access_token"] = access_token
-        tokens_dict["refresh_token"] = refresh_token
-        existing["tokens"] = tokens_dict
-        if last_refresh is not None:
-            existing["last_refresh"] = last_refresh
-
-        auth_path.parent.mkdir(parents=True, exist_ok=True)
-        auth_path.write_text(json.dumps(existing, indent=2), encoding="utf-8")
-        auth_path.chmod(0o600)
-    except (OSError, IOError) as exc:
-        logger.debug("Failed to write refreshed tokens to %s: %s", auth_path, exc)
-
-
 def _save_codex_tokens(tokens: Dict[str, str], last_refresh: str = None) -> None:
    """Save Codex OAuth tokens to Hermes auth store (~/.hermes/auth.json)."""
    if last_refresh is None:
@@ -1398,6 +1501,11 @@ def refresh_codex_oauth_pure(
                "then run `hermes auth` to re-authenticate."
            )
            relogin_required = True
+        # A 401/403 from the token endpoint always means the refresh token
+        # is invalid/expired — force relogin even if the body error code
+        # wasn't one of the known strings above.
+        if response.status_code in (401, 403) and not relogin_required:
+            relogin_required = True
        raise AuthError(
            message,
            provider="openai-codex",
@@ -1453,12 +1561,6 @@ def _refresh_codex_auth_tokens(
    updated_tokens["refresh_token"] = refreshed["refresh_token"]

    _save_codex_tokens(updated_tokens)
-    # Write back to ~/.codex/auth.json so Codex CLI / VS Code stay in sync.
-    _write_codex_cli_tokens(
-        refreshed["access_token"],
-        refreshed["refresh_token"],
-        last_refresh=refreshed.get("last_refresh"),
-    )
    return updated_tokens


@@ -1503,25 +1605,7 @@ def resolve_codex_runtime_credentials(
    refresh_skew_seconds: int = CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS,
 ) -> Dict[str, Any]:
    """Resolve runtime credentials from Hermes's own Codex token store."""
-    try:
-        data = _read_codex_tokens()
-    except AuthError as orig_err:
-        # Only attempt migration when there are NO tokens stored at all
-        # (code == "codex_auth_missing"), not when tokens exist but are invalid.
-        if orig_err.code != "codex_auth_missing":
-            raise
-
-        # Migration: user had Codex as active provider with old storage (~/.codex/).
-        cli_tokens = _import_codex_cli_tokens()
-        if cli_tokens:
-            logger.info("Migrating Codex credentials from ~/.codex/ to Hermes auth store")
-            print("⚠️  Migrating Codex credentials to Hermes's own auth store.")
-            print("   This avoids conflicts with Codex CLI and VS Code.")
-            print("   Run `hermes auth` to create a fully independent session.\n")
-            _save_codex_tokens(cli_tokens)
-            data = _read_codex_tokens()
-        else:
-            raise
+    data = _read_codex_tokens()
    tokens = dict(data["tokens"])
    access_token = str(tokens.get("access_token", "") or "").strip()
    refresh_timeout_seconds = float(os.getenv("HERMES_CODEX_REFRESH_TIMEOUT_SECONDS", "20"))
@@ -2013,6 +2097,62 @@ def refresh_nous_oauth_from_state(
    )


+NOUS_DEVICE_CODE_SOURCE = "device_code"
+
+
+def persist_nous_credentials(
+    creds: Dict[str, Any],
+    *,
+    label: Optional[str] = None,
+):
+    """Persist minted Nous OAuth credentials as the singleton provider state
+    and ensure the credential pool is in sync.
+
+    Nous credentials are read at runtime from two independent locations:
+
+    - ``providers.nous``: singleton state read by
+      ``resolve_nous_runtime_credentials()`` during 401 recovery and by
+      ``_seed_from_singletons()`` during pool load.
+    - ``credential_pool.nous``: used by the runtime ``pool.select()`` path.
+
+    Historically ``hermes auth add nous`` wrote a ``manual:device_code`` pool
+    entry only, skipping ``providers.nous``.  When the 24h agent_key TTL
+    expired, the recovery path read the empty singleton state and raised
+    ``AuthError`` silently (``logger.debug`` at INFO level).
+
+    This helper writes ``providers.nous`` then calls ``load_pool("nous")`` so
+    ``_seed_from_singletons`` materialises the canonical ``device_code`` pool
+    entry from the singleton.  Re-running login upserts the same entry in
+    place; the pool never accumulates duplicate device_code rows.
+
+    ``label`` is an optional user-chosen display name (from
+    ``hermes auth add nous --label <name>``).  It gets embedded in the
+    singleton state so that ``_seed_from_singletons`` uses it as the pool
+    entry's label on every subsequent ``load_pool("nous")`` instead of the
+    auto-derived token fingerprint.  When ``None``, the auto-derived label
+    via ``label_from_token`` is used (unchanged default behaviour).
+
+    Returns the upserted :class:`PooledCredential` entry (or ``None`` if
+    seeding somehow produced no match — shouldn't happen).
+    """
+    from agent.credential_pool import load_pool
+
+    state = dict(creds)
+    if label and str(label).strip():
+        state["label"] = str(label).strip()
+
+    with _auth_store_lock():
+        auth_store = _load_auth_store()
+        _save_provider_state(auth_store, "nous", state)
+        _save_auth_store(auth_store)
+
+    pool = load_pool("nous")
+    return next(
+        (e for e in pool.entries() if e.source == NOUS_DEVICE_CODE_SOURCE),
+        None,
+    )
+
+
 def resolve_nous_runtime_credentials(
    *,
    min_key_ttl_seconds: int = DEFAULT_AGENT_KEY_MIN_TTL_SECONDS,
@@ -2384,7 +2524,7 @@ def get_api_key_provider_status(provider_id: str) -> Dict[str, Any]:
    if pconfig.base_url_env_var:
        env_url = os.getenv(pconfig.base_url_env_var, "").strip()

-    if provider_id == "kimi-coding":
+    if provider_id in ("kimi-coding", "kimi-coding-cn"):
        base_url = _resolve_kimi_base_url(api_key, pconfig.inference_base_url, env_url)
    elif env_url:
        base_url = env_url
@@ -2440,12 +2580,21 @@ def get_auth_status(provider_id: Optional[str] = None) -> Dict[str, Any]:
        return get_codex_auth_status()
    if target == "qwen-oauth":
        return get_qwen_auth_status()
+    if target == "google-gemini-cli":
+        return get_gemini_oauth_auth_status()
    if target == "copilot-acp":
        return get_external_process_provider_status(target)
    # API-key providers
    pconfig = PROVIDER_REGISTRY.get(target)
    if pconfig and pconfig.auth_type == "api_key":
        return get_api_key_provider_status(target)
+    # AWS SDK providers (Bedrock) — check via boto3 credential chain
+    if pconfig and pconfig.auth_type == "aws_sdk":
+        try:
+            from agent.bedrock_adapter import has_aws_credentials
+            return {"logged_in": has_aws_credentials(), "provider": target}
+        except ImportError:
+            return {"logged_in": False, "provider": target, "error": "boto3 not installed"}
    return {"logged_in": False}


@@ -2470,7 +2619,7 @@ def resolve_api_key_provider_credentials(provider_id: str) -> Dict[str, Any]:
    if pconfig.base_url_env_var:
        env_url = os.getenv(pconfig.base_url_env_var, "").strip()

-    if provider_id == "kimi-coding":
+    if provider_id in ("kimi-coding", "kimi-coding-cn"):
        base_url = _resolve_kimi_base_url(api_key, pconfig.inference_base_url, env_url)
    elif provider_id == "zai":
        base_url = _resolve_zai_base_url(api_key, pconfig.inference_base_url, env_url)
@@ -3172,6 +3321,14 @@ def _login_nous(args, pconfig: ProviderConfig) -> None:

        inference_base_url = auth_state["inference_base_url"]

+        # Snapshot the prior active_provider BEFORE _save_provider_state
+        # overwrites it to "nous".  If the user picks "Skip (keep current)"
+        # during model selection below, we restore this so the user's previous
+        # provider (e.g. openrouter) is preserved.
+        with _auth_store_lock():
+            _prior_store = _load_auth_store()
+            prior_active_provider = _prior_store.get("active_provider")
+
        with _auth_store_lock():
            auth_store = _load_auth_store()
            _save_provider_state(auth_store, "nous", auth_state)
@@ -3231,6 +3388,27 @@ def _login_nous(args, pconfig: ProviderConfig) -> None:
            print(f"Login succeeded, but could not fetch available models. Reason: {message}")

        # Write provider + model atomically so config is never mismatched.
+        # If no model was selected (user picked "Skip (keep current)",
+        # model list fetch failed, or no curated models were available),
+        # preserve the user's previous provider — don't silently switch
+        # them to Nous with a mismatched model.  The Nous OAuth tokens
+        # stay saved for future use.
+        if not selected_model:
+            # Restore the prior active_provider that _save_provider_state
+            # overwrote to "nous".  config.yaml model.provider is left
+            # untouched, so the user's previous provider is fully preserved.
+            with _auth_store_lock():
+                auth_store = _load_auth_store()
+                if prior_active_provider:
+                    auth_store["active_provider"] = prior_active_provider
+                else:
+                    auth_store.pop("active_provider", None)
+                _save_auth_store(auth_store)
+            print()
+            print("No provider change. Nous credentials saved for future use.")
+            print("  Run `hermes model` again to switch to Nous Portal.")
+            return
+
        config_path = _update_config_for_provider(
            "nous", inference_base_url, default_model=selected_model,
        )
@@ -4,6 +4,7 @@ from __future__ import annotations

 from getpass import getpass
 import math
+import sys
 import time
 from types import SimpleNamespace
 import uuid
@@ -32,7 +33,7 @@ from hermes_constants import OPENROUTER_BASE_URL


 # Providers that support OAuth login in addition to API keys.
-_OAUTH_CAPABLE_PROVIDERS = {"anthropic", "nous", "openai-codex", "qwen-oauth"}
+_OAUTH_CAPABLE_PROVIDERS = {"anthropic", "nous", "openai-codex", "qwen-oauth", "google-gemini-cli"}


 def _get_custom_provider_names() -> list:
@@ -147,7 +148,7 @@ def auth_add_command(args) -> None:
        if provider.startswith(CUSTOM_POOL_PREFIX):
            requested_type = AUTH_TYPE_API_KEY
        else:
-            requested_type = AUTH_TYPE_OAUTH if provider in {"anthropic", "nous", "openai-codex", "qwen-oauth"} else AUTH_TYPE_API_KEY
+            requested_type = AUTH_TYPE_OAUTH if provider in {"anthropic", "nous", "openai-codex", "qwen-oauth", "google-gemini-cli"} else AUTH_TYPE_API_KEY

    pool = load_pool(provider)

@@ -160,7 +161,10 @@ def auth_add_command(args) -> None:
        default_label = _api_key_default_label(len(pool.entries()) + 1)
        label = (getattr(args, "label", None) or "").strip()
        if not label:
-            label = input(f"Label (optional, default: {default_label}): ").strip() or default_label
+            if sys.stdin.isatty():
+                label = input(f"Label (optional, default: {default_label}): ").strip() or default_label
+            else:
+                label = default_label
        entry = PooledCredential(
            provider=provider,
            id=uuid.uuid4().hex[:6],
@@ -213,22 +217,21 @@ def auth_add_command(args) -> None:
            ca_bundle=getattr(args, "ca_bundle", None),
            min_key_ttl_seconds=max(60, int(getattr(args, "min_key_ttl_seconds", 5 * 60))),
        )
-        label = (getattr(args, "label", None) or "").strip() or label_from_token(
-            creds.get("access_token", ""),
-            _oauth_default_label(provider, len(pool.entries()) + 1),
+        # Honor `--label <name>` so nous matches other providers' UX.  The
+        # helper embeds this into providers.nous so that label_from_token
+        # doesn't overwrite it on every subsequent load_pool("nous").
+        custom_label = (getattr(args, "label", None) or "").strip() or None
+        entry = auth_mod.persist_nous_credentials(creds, label=custom_label)
+        shown_label = entry.label if entry is not None else label_from_token(
+            creds.get("access_token", ""), _oauth_default_label(provider, 1),
        )
-        entry = PooledCredential.from_dict(provider, {
-            **creds,
-            "label": label,
-            "auth_type": AUTH_TYPE_OAUTH,
-            "source": f"{SOURCE_MANUAL}:device_code",
-            "base_url": creds.get("inference_base_url"),
-        })
-        pool.add_entry(entry)
-        print(f'Added {provider} OAuth credential #{len(pool.entries())}: "{entry.label}"')
+        print(f'Saved {provider} OAuth device-code credentials: "{shown_label}"')
        return

    if provider == "openai-codex":
+        # Clear any existing suppression marker so a re-link after `hermes auth
+        # remove openai-codex` works without the new tokens being skipped.
+        auth_mod.unsuppress_credential_source(provider, "device_code")
        creds = auth_mod._codex_device_code_login()
        label = (getattr(args, "label", None) or "").strip() or label_from_token(
            creds["tokens"]["access_token"],
@@ -250,6 +253,27 @@ def auth_add_command(args) -> None:
        print(f'Added {provider} OAuth credential #{len(pool.entries())}: "{entry.label}"')
        return

+    if provider == "google-gemini-cli":
+        from agent.google_oauth import run_gemini_oauth_login_pure
+
+        creds = run_gemini_oauth_login_pure()
+        label = (getattr(args, "label", None) or "").strip() or (
+            creds.get("email") or _oauth_default_label(provider, len(pool.entries()) + 1)
+        )
+        entry = PooledCredential(
+            provider=provider,
+            id=uuid.uuid4().hex[:6],
+            label=label,
+            auth_type=AUTH_TYPE_OAUTH,
+            priority=0,
+            source=f"{SOURCE_MANUAL}:google_pkce",
+            access_token=creds["access_token"],
+            refresh_token=creds.get("refresh_token"),
+        )
+        pool.add_entry(entry)
+        print(f'Added {provider} OAuth credential #{len(pool.entries())}: "{entry.label}"')
+        return
+
    if provider == "qwen-oauth":
        creds = auth_mod.resolve_qwen_runtime_credentials(refresh_if_expiring=False)
        label = (getattr(args, "label", None) or "").strip() or label_from_token(
@@ -327,7 +351,34 @@ def auth_remove_command(args) -> None:
    # If this was a singleton-seeded credential (OAuth device_code, hermes_pkce),
    # clear the underlying auth store / credential file so it doesn't get
    # re-seeded on the next load_pool() call.
-    elif removed.source == "device_code" and provider in ("openai-codex", "nous"):
+    elif provider == "openai-codex" and (
+        removed.source == "device_code" or removed.source.endswith(":device_code")
+    ):
+        # Codex tokens live in TWO places: the Hermes auth store and
+        # ~/.codex/auth.json (the Codex CLI shared file).  On every refresh,
+        # refresh_codex_oauth_pure() writes to both.  So clearing only the
+        # Hermes auth store is not enough — _seed_from_singletons() will
+        # auto-import from ~/.codex/auth.json on the next load_pool() and
+        # the removal is instantly undone.  Mark the source as suppressed
+        # so auto-import is skipped; leave ~/.codex/auth.json untouched so
+        # the Codex CLI itself keeps working.
+        from hermes_cli.auth import (
+            _load_auth_store, _save_auth_store, _auth_store_lock,
+            suppress_credential_source,
+        )
+        with _auth_store_lock():
+            auth_store = _load_auth_store()
+            providers_dict = auth_store.get("providers")
+            if isinstance(providers_dict, dict) and provider in providers_dict:
+                del providers_dict[provider]
+                _save_auth_store(auth_store)
+                print(f"Cleared {provider} OAuth tokens from auth store")
+        suppress_credential_source(provider, "device_code")
+        print("Suppressed openai-codex device_code source — it will not be re-seeded.")
+        print("Note: Codex CLI credentials still live in ~/.codex/auth.json")
+        print("Run `hermes auth add openai-codex` to re-enable if needed.")
+
+    elif removed.source == "device_code" and provider == "nous":
        from hermes_cli.auth import (
            _load_auth_store, _save_auth_store, _auth_store_lock,
        )
@@ -368,6 +419,27 @@ def _interactive_auth() -> None:
    print("=" * 50)

    auth_list_command(SimpleNamespace(provider=None))
+
+    # Show AWS Bedrock credential status (not in the pool — uses boto3 chain)
+    try:
+        from agent.bedrock_adapter import has_aws_credentials, resolve_aws_auth_env_var, resolve_bedrock_region
+        if has_aws_credentials():
+            auth_source = resolve_aws_auth_env_var() or "unknown"
+            region = resolve_bedrock_region()
+            print(f"bedrock (AWS SDK credential chain):")
+            print(f"  Auth: {auth_source}")
+            print(f"  Region: {region}")
+            try:
+                import boto3
+                sts = boto3.client("sts", region_name=region)
+                identity = sts.get_caller_identity()
+                arn = identity.get("Arn", "unknown")
+                print(f"  Identity: {arn}")
+            except Exception:
+                print(f"  Identity: (could not resolve — boto3 STS call failed)")
+            print()
+    except ImportError:
+        pass  # boto3 or bedrock_adapter not available
    print()

    # Main menu
@@ -7,8 +7,8 @@ CLI tools that ship with the platform (or are commonly installed).

 Platform support:
  macOS   — osascript (always available), pngpaste (if installed)
-  Windows — PowerShell via .NET System.Windows.Forms.Clipboard
-  WSL2    — powershell.exe via .NET System.Windows.Forms.Clipboard
+  Windows — PowerShell via WinForms, Get-Clipboard, file-drop fallback
+  WSL2    — powershell.exe via WinForms, Get-Clipboard, file-drop fallback
  Linux   — wl-paste (Wayland), xclip (X11)
 """

@@ -46,10 +46,11 @@ def has_clipboard_image() -> bool:
        return _macos_has_image()
    if sys.platform == "win32":
        return _windows_has_image()
-    if _is_wsl():
-        return _wsl_has_image()
-    if os.environ.get("WAYLAND_DISPLAY"):
-        return _wayland_has_image()
+    # Match _linux_save fallthrough order: WSL → Wayland → X11
+    if _is_wsl() and _wsl_has_image():
+        return True
+    if os.environ.get("WAYLAND_DISPLAY") and _wayland_has_image():
+        return True
    return _xclip_has_image()


@@ -135,6 +136,114 @@ _PS_EXTRACT_IMAGE = (
    "[System.Convert]::ToBase64String($ms.ToArray())"
 )

+_PS_CHECK_IMAGE_GET_CLIPBOARD = (
+    "try { "
+    "$img = Get-Clipboard -Format Image -ErrorAction Stop;"
+    "if ($null -ne $img) { 'True' } else { 'False' }"
+    "} catch { 'False' }"
+)
+
+_PS_EXTRACT_IMAGE_GET_CLIPBOARD = (
+    "try { "
+    "Add-Type -AssemblyName System.Drawing;"
+    "Add-Type -AssemblyName PresentationCore;"
+    "Add-Type -AssemblyName WindowsBase;"
+    "$img = Get-Clipboard -Format Image -ErrorAction Stop;"
+    "if ($null -eq $img) { exit 1 }"
+    "$ms = New-Object System.IO.MemoryStream;"
+    "if ($img -is [System.Drawing.Image]) {"
+    "$img.Save($ms, [System.Drawing.Imaging.ImageFormat]::Png)"
+    "} elseif ($img -is [System.Windows.Media.Imaging.BitmapSource]) {"
+    "$enc = New-Object System.Windows.Media.Imaging.PngBitmapEncoder;"
+    "$enc.Frames.Add([System.Windows.Media.Imaging.BitmapFrame]::Create($img));"
+    "$enc.Save($ms)"
+    "} else { exit 2 }"
+    "[System.Convert]::ToBase64String($ms.ToArray())"
+    "} catch { exit 1 }"
+)
+
+_FILEDROP_IMAGE_EXTS = "'.png','.jpg','.jpeg','.gif','.webp','.bmp','.tiff','.tif'"
+
+_PS_CHECK_FILEDROP_IMAGE = (
+    "try { "
+    "$files = Get-Clipboard -Format FileDropList -ErrorAction Stop;"
+    f"$exts = @({_FILEDROP_IMAGE_EXTS});"
+    "$hit = $files | Where-Object { $exts -contains ([System.IO.Path]::GetExtension($_).ToLowerInvariant()) } | Select-Object -First 1;"
+    "if ($null -ne $hit) { 'True' } else { 'False' }"
+    "} catch { 'False' }"
+)
+
+_PS_EXTRACT_FILEDROP_IMAGE = (
+    "try { "
+    "$files = Get-Clipboard -Format FileDropList -ErrorAction Stop;"
+    f"$exts = @({_FILEDROP_IMAGE_EXTS});"
+    "$hit = $files | Where-Object { $exts -contains ([System.IO.Path]::GetExtension($_).ToLowerInvariant()) } | Select-Object -First 1;"
+    "if ($null -eq $hit) { exit 1 }"
+    "[System.Convert]::ToBase64String([System.IO.File]::ReadAllBytes($hit))"
+    "} catch { exit 1 }"
+)
+
+_POWERSHELL_HAS_IMAGE_SCRIPTS = (
+    _PS_CHECK_IMAGE,
+    _PS_CHECK_IMAGE_GET_CLIPBOARD,
+    _PS_CHECK_FILEDROP_IMAGE,
+)
+
+_POWERSHELL_EXTRACT_IMAGE_SCRIPTS = (
+    _PS_EXTRACT_IMAGE,
+    _PS_EXTRACT_IMAGE_GET_CLIPBOARD,
+    _PS_EXTRACT_FILEDROP_IMAGE,
+)
+
+
+def _run_powershell(exe: str, script: str, timeout: int) -> subprocess.CompletedProcess:
+    return subprocess.run(
+        [exe, "-NoProfile", "-NonInteractive", "-Command", script],
+        capture_output=True, text=True, timeout=timeout,
+    )
+
+
+def _write_base64_image(dest: Path, b64_data: str) -> bool:
+    image_bytes = base64.b64decode(b64_data, validate=True)
+    dest.write_bytes(image_bytes)
+    return dest.exists() and dest.stat().st_size > 0
+
+
+def _powershell_has_image(exe: str, *, timeout: int, label: str) -> bool:
+    for script in _POWERSHELL_HAS_IMAGE_SCRIPTS:
+        try:
+            r = _run_powershell(exe, script, timeout=timeout)
+            if r.returncode == 0 and "True" in r.stdout:
+                return True
+        except FileNotFoundError:
+            logger.debug("%s not found — clipboard unavailable", exe)
+            return False
+        except Exception as e:
+            logger.debug("%s clipboard image check failed: %s", label, e)
+    return False
+
+
+def _powershell_save_image(exe: str, dest: Path, *, timeout: int, label: str) -> bool:
+    for script in _POWERSHELL_EXTRACT_IMAGE_SCRIPTS:
+        try:
+            r = _run_powershell(exe, script, timeout=timeout)
+            if r.returncode != 0:
+                continue
+
+            b64_data = r.stdout.strip()
+            if not b64_data:
+                continue
+
+            if _write_base64_image(dest, b64_data):
+                return True
+        except FileNotFoundError:
+            logger.debug("%s not found — clipboard unavailable", exe)
+            return False
+        except Exception as e:
+            logger.debug("%s clipboard image extraction failed: %s", label, e)
+            dest.unlink(missing_ok=True)
+    return False
+

 # ── Native Windows ────────────────────────────────────────────────────────

@@ -175,15 +284,7 @@ def _windows_has_image() -> bool:
    ps = _get_ps_exe()
    if ps is None:
        return False
-    try:
-        r = subprocess.run(
-            [ps, "-NoProfile", "-NonInteractive", "-Command", _PS_CHECK_IMAGE],
-            capture_output=True, text=True, timeout=5,
-        )
-        return r.returncode == 0 and "True" in r.stdout
-    except Exception as e:
-        logger.debug("Windows clipboard image check failed: %s", e)
-    return False
+    return _powershell_has_image(ps, timeout=5, label="Windows")


 def _windows_save(dest: Path) -> bool:
@@ -192,26 +293,7 @@ def _windows_save(dest: Path) -> bool:
    if ps is None:
        logger.debug("No PowerShell found — Windows clipboard image paste unavailable")
        return False
-    try:
-        r = subprocess.run(
-            [ps, "-NoProfile", "-NonInteractive", "-Command", _PS_EXTRACT_IMAGE],
-            capture_output=True, text=True, timeout=15,
-        )
-        if r.returncode != 0:
-            return False
-
-        b64_data = r.stdout.strip()
-        if not b64_data:
-            return False
-
-        png_bytes = base64.b64decode(b64_data)
-        dest.write_bytes(png_bytes)
-        return dest.exists() and dest.stat().st_size > 0
-
-    except Exception as e:
-        logger.debug("Windows clipboard image extraction failed: %s", e)
-        dest.unlink(missing_ok=True)
-    return False
+    return _powershell_save_image(ps, dest, timeout=15, label="Windows")


 # ── Linux ────────────────────────────────────────────────────────────────
@@ -235,45 +317,12 @@ def _linux_save(dest: Path) -> bool:

 def _wsl_has_image() -> bool:
    """Check if Windows clipboard has an image (via powershell.exe)."""
-    try:
-        r = subprocess.run(
-            ["powershell.exe", "-NoProfile", "-NonInteractive", "-Command",
-             _PS_CHECK_IMAGE],
-            capture_output=True, text=True, timeout=8,
-        )
-        return r.returncode == 0 and "True" in r.stdout
-    except FileNotFoundError:
-        logger.debug("powershell.exe not found — WSL clipboard unavailable")
-    except Exception as e:
-        logger.debug("WSL clipboard check failed: %s", e)
-    return False
+    return _powershell_has_image("powershell.exe", timeout=8, label="WSL")


 def _wsl_save(dest: Path) -> bool:
    """Extract clipboard image via powershell.exe → base64 → decode to PNG."""
-    try:
-        r = subprocess.run(
-            ["powershell.exe", "-NoProfile", "-NonInteractive", "-Command",
-             _PS_EXTRACT_IMAGE],
-            capture_output=True, text=True, timeout=15,
-        )
-        if r.returncode != 0:
-            return False
-
-        b64_data = r.stdout.strip()
-        if not b64_data:
-            return False
-
-        png_bytes = base64.b64decode(b64_data)
-        dest.write_bytes(png_bytes)
-        return dest.exists() and dest.stat().st_size > 0
-
-    except FileNotFoundError:
-        logger.debug("powershell.exe not found — WSL clipboard unavailable")
-    except Exception as e:
-        logger.debug("WSL clipboard extraction failed: %s", e)
-        dest.unlink(missing_ok=True)
-    return False
+    return _powershell_save_image("powershell.exe", dest, timeout=15, label="WSL")


 # ── Wayland (wl-paste) ──────────────────────────────────────────────────
@@ -87,8 +87,12 @@ COMMAND_REGISTRY: list[CommandDef] = [
               aliases=("bg",), args_hint="<prompt>"),
    CommandDef("btw", "Ephemeral side question using session context (no tools, not persisted)", "Session",
               args_hint="<question>"),
+    CommandDef("agents", "Show active agents and running tasks", "Session",
+               aliases=("tasks",)),
    CommandDef("queue", "Queue a prompt for the next turn (doesn't interrupt)", "Session",
               aliases=("q",), args_hint="<prompt>"),
+    CommandDef("steer", "Inject a message after the next tool call without interrupting", "Session",
+               args_hint="<prompt>"),
    CommandDef("status", "Show session info", "Session"),
    CommandDef("profile", "Show active profile name and home directory", "Info"),
    CommandDef("sethome", "Set this chat as the home channel", "Session",
@@ -99,9 +103,10 @@ COMMAND_REGISTRY: list[CommandDef] = [
    # Configuration
    CommandDef("config", "Show current configuration", "Configuration",
               cli_only=True),
-    CommandDef("model", "Switch model for this session", "Configuration", args_hint="[model] [--global]"),
+    CommandDef("model", "Switch model for this session", "Configuration", args_hint="[model] [--provider name] [--global]"),
    CommandDef("provider", "Show available providers and current provider",
               "Configuration"),
+    CommandDef("gquota", "Show Google Gemini Code Assist quota usage", "Info"),

    CommandDef("personality", "Set a predefined personality", "Configuration",
               args_hint="[name]"),
@@ -119,7 +124,7 @@ COMMAND_REGISTRY: list[CommandDef] = [
               args_hint="[normal|fast|status]",
               subcommands=("normal", "fast", "status", "on", "off")),
    CommandDef("skin", "Show or change the display skin/theme", "Configuration",
-               cli_only=True, args_hint="[name]"),
+               args_hint="[name]"),
    CommandDef("voice", "Toggle voice mode", "Configuration",
               args_hint="[on|off|tts|status]", subcommands=("on", "off", "tts", "status")),

@@ -154,7 +159,9 @@ COMMAND_REGISTRY: list[CommandDef] = [
               args_hint="[days]"),
    CommandDef("platforms", "Show gateway/messaging platform status", "Info",
               cli_only=True, aliases=("gateway",)),
-    CommandDef("paste", "Check clipboard for an image and attach it", "Info",
+    CommandDef("copy", "Copy the last assistant response to clipboard", "Info",
+               cli_only=True, args_hint="[number]"),
+    CommandDef("paste", "Attach clipboard image from your clipboard", "Info",
               cli_only=True),
    CommandDef("image", "Attach a local image file for your next prompt", "Info",
               cli_only=True, args_hint="<path>"),
@@ -164,7 +171,7 @@ COMMAND_REGISTRY: list[CommandDef] = [

    # Exit
    CommandDef("quit", "Exit the CLI", "Exit",
-               cli_only=True, aliases=("exit", "q")),
+               cli_only=True, aliases=("exit",)),
 ]


@@ -253,6 +260,53 @@ GATEWAY_KNOWN_COMMANDS: frozenset[str] = frozenset(
 )


+# Commands with explicit Level-2 running-agent handlers in gateway/run.py.
+# Listed here for introspection / tests; semantically a subset of
+# "all resolvable commands" — which is the real bypass set (see
+# should_bypass_active_session below).
+ACTIVE_SESSION_BYPASS_COMMANDS: frozenset[str] = frozenset(
+    {
+        "agents",
+        "approve",
+        "background",
+        "commands",
+        "deny",
+        "help",
+        "new",
+        "profile",
+        "queue",
+        "restart",
+        "status",
+        "steer",
+        "stop",
+        "update",
+    }
+)
+
+
+def should_bypass_active_session(command_name: str | None) -> bool:
+    """Return True for any resolvable slash command.
+
+    Rationale: every gateway-registered slash command either has a
+    specific Level-2 handler in gateway/run.py (/stop, /new, /model,
+    /approve, etc.) or reaches the running-agent catch-all that returns
+    a "busy — wait or /stop first" response. In both paths the command
+    is dispatched, not queued.
+
+    Queueing is always wrong for a recognized slash command because the
+    safety net in gateway.run discards any command text that reaches
+    the pending queue — which meant a mid-run /model (or /reasoning,
+    /voice, /insights, /title, /resume, /retry, /undo, /compress,
+    /usage, /provider, /reload-mcp, /sethome, /reset) would silently
+    interrupt the agent AND get discarded, producing a zero-char
+    response. See issue #5057 / PRs #6252, #10370, #4665.
+
+    ACTIVE_SESSION_BYPASS_COMMANDS remains the subset of commands with
+    explicit Level-2 handlers; the rest fall through to the catch-all.
+    """
+    return resolve_command(command_name) is not None if command_name else False
+
+
 def _resolve_config_gates() -> set[str]:
    """Return canonical names of commands whose ``gateway_config_gate`` is truthy.

@@ -450,7 +504,7 @@ def _collect_gateway_skill_entries(
            name = sanitize_name(cmd_name) if sanitize_name else cmd_name
            if not name:
                continue
-            desc = "Plugin command"
+            desc = plugin_cmds[cmd_name].get("description", "Plugin command")
            if len(desc) > desc_limit:
                desc = desc[:desc_limit - 3] + "..."
            plugin_pairs.append((name, desc))
@@ -844,8 +898,7 @@ class SlashCommandCompleter(Completer):
            return None
        return word

-    @staticmethod
-    def _context_completions(word: str, limit: int = 30):
+    def _context_completions(self, word: str, limit: int = 30):
        """Yield Claude Code-style @ context completions.

        Bare ``@`` or ``@partial`` shows static references and matching
@@ -1044,6 +1097,51 @@ class SlashCommandCompleter(Completer):
                display_meta=f"{fp}  {meta}" if meta else fp,
            )

+    @staticmethod
+    def _skin_completions(sub_text: str, sub_lower: str):
+        """Yield completions for /skin from available skins."""
+        try:
+            from hermes_cli.skin_engine import list_skins
+            for s in list_skins():
+                name = s["name"]
+                if name.startswith(sub_lower) and name != sub_lower:
+                    yield Completion(
+                        name,
+                        start_position=-len(sub_text),
+                        display=name,
+                        display_meta=s.get("description", "") or s.get("source", ""),
+                    )
+        except Exception:
+            pass
+
+    @staticmethod
+    def _personality_completions(sub_text: str, sub_lower: str):
+        """Yield completions for /personality from configured personalities."""
+        try:
+            from hermes_cli.config import load_config
+            personalities = load_config().get("agent", {}).get("personalities", {})
+            if "none".startswith(sub_lower) and "none" != sub_lower:
+                yield Completion(
+                    "none",
+                    start_position=-len(sub_text),
+                    display="none",
+                    display_meta="clear personality overlay",
+                )
+            for name, prompt in personalities.items():
+                if name.startswith(sub_lower) and name != sub_lower:
+                    if isinstance(prompt, dict):
+                        meta = prompt.get("description") or prompt.get("system_prompt", "")[:50]
+                    else:
+                        meta = str(prompt)[:50]
+                    yield Completion(
+                        name,
+                        start_position=-len(sub_text),
+                        display=name,
+                        display_meta=meta,
+                    )
+        except Exception:
+            pass
+
    def _model_completions(self, sub_text: str, sub_lower: str):
        """Yield completions for /model from config aliases + built-in aliases."""
        seen = set()
@@ -1098,10 +1196,17 @@ class SlashCommandCompleter(Completer):
            sub_text = parts[1] if len(parts) > 1 else ""
            sub_lower = sub_text.lower()

-            # Dynamic model alias completions for /model
-            if " " not in sub_text and base_cmd == "/model":
-                yield from self._model_completions(sub_text, sub_lower)
-                return
+            # Dynamic completions for commands with runtime lists
+            if " " not in sub_text:
+                if base_cmd == "/model":
+                    yield from self._model_completions(sub_text, sub_lower)
+                    return
+                if base_cmd == "/skin":
+                    yield from self._skin_completions(sub_text, sub_lower)
+                    return
+                if base_cmd == "/personality":
+                    yield from self._personality_completions(sub_text, sub_lower)
+                    return

            # Static subcommand completions
            if " " not in sub_text and base_cmd in SUBCOMMANDS and self._command_allowed(base_cmd):
@@ -1140,6 +1245,22 @@ class SlashCommandCompleter(Completer):
                    display_meta=f"⚡ {short_desc}",
                )

+        # Plugin-registered slash commands
+        try:
+            from hermes_cli.plugins import get_plugin_commands
+            for cmd_name, cmd_info in get_plugin_commands().items():
+                if cmd_name.startswith(word):
+                    desc = str(cmd_info.get("description", "Plugin command"))
+                    short_desc = desc[:50] + ("..." if len(desc) > 50 else "")
+                    yield Completion(
+                        self._completion_text(cmd_name, word),
+                        start_position=-len(word),
+                        display=f"/{cmd_name}",
+                        display_meta=f"🔌 {short_desc}",
+                    )
+        except Exception:
+            pass
+

 # ---------------------------------------------------------------------------
 # Inline auto-suggest (ghost text) for slash commands
@@ -12,6 +12,7 @@ This module provides:
 - hermes config wizard   - Re-run setup wizard
 """

+import copy
 import os
 import platform
 import re
@@ -23,10 +24,10 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, Any, Optional, List, Tuple

-from tools.tool_backend_helpers import managed_nous_tools_enabled as _managed_nous_tools_enabled

 _IS_WINDOWS = platform.system() == "Windows"
 _ENV_VAR_NAME_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
+_LAST_EXPANDED_CONFIG_BY_PATH: Dict[str, Any] = {}
 # Env var names written to .env that aren't in OPTIONAL_ENV_VARS
 # (managed by setup/provider flows directly).
 _EXTRA_ENV_KEYS = frozenset({
@@ -45,7 +46,8 @@ _EXTRA_ENV_KEYS = frozenset({
    "WEIXIN_HOME_CHANNEL", "WEIXIN_HOME_CHANNEL_NAME", "WEIXIN_DM_POLICY", "WEIXIN_GROUP_POLICY",
    "WEIXIN_ALLOWED_USERS", "WEIXIN_GROUP_ALLOWED_USERS", "WEIXIN_ALLOW_ALL_USERS",
    "BLUEBUBBLES_SERVER_URL", "BLUEBUBBLES_PASSWORD",
-    "QQ_APP_ID", "QQ_CLIENT_SECRET", "QQ_HOME_CHANNEL", "QQ_HOME_CHANNEL_NAME",
+    "QQ_APP_ID", "QQ_CLIENT_SECRET", "QQBOT_HOME_CHANNEL", "QQBOT_HOME_CHANNEL_NAME",
+    "QQ_HOME_CHANNEL", "QQ_HOME_CHANNEL_NAME",  # legacy aliases (pre-rename, still read for back-compat)
    "QQ_ALLOWED_USERS", "QQ_GROUP_ALLOWED_USERS", "QQ_ALLOW_ALL_USERS", "QQ_MARKDOWN_SUPPORT",
    "QQ_STT_API_KEY", "QQ_STT_BASE_URL", "QQ_STT_MODEL",
    "TERMINAL_ENV", "TERMINAL_SSH_KEY", "TERMINAL_SSH_PORT",
@@ -241,13 +243,41 @@ def _secure_dir(path):
        pass


+def _is_container() -> bool:
+    """Detect if we're running inside a Docker/Podman/LXC container.
+
+    When Hermes runs in a container with volume-mounted config files, forcing
+    0o600 permissions breaks multi-process setups where the gateway and
+    dashboard run as different UIDs or the volume mount requires broader
+    permissions.
+    """
+    # Explicit opt-out
+    if os.environ.get("HERMES_CONTAINER") or os.environ.get("HERMES_SKIP_CHMOD"):
+        return True
+    # Docker / Podman marker file
+    if os.path.exists("/.dockerenv"):
+        return True
+    # LXC / cgroup-based detection
+    try:
+        with open("/proc/1/cgroup", "r") as f:
+            cgroup_content = f.read()
+        if "docker" in cgroup_content or "lxc" in cgroup_content or "kubepods" in cgroup_content:
+            return True
+    except (OSError, IOError):
+        pass
+    return False
+
+
 def _secure_file(path):
    """Set file to owner-only read/write (0600). No-op on Windows.

    Skipped in managed mode — the NixOS activation script sets
    group-readable permissions (0640) on config files.
+
+    Skipped in containers — Docker/Podman volume mounts often need broader
+    permissions.  Set HERMES_SKIP_CHMOD=1 to force-skip on other systems.
    """
-    if is_managed():
+    if is_managed() or _is_container():
        return
    try:
        if os.path.exists(str(path)):
@@ -390,10 +420,10 @@ DEFAULT_CONFIG = {
        "command_timeout": 30,  # Timeout for browser commands in seconds (screenshot, navigate, etc.)
        "record_sessions": False,  # Auto-record browser sessions as WebM videos
        "allow_private_urls": False,  # Allow navigating to private/internal IPs (localhost, 192.168.x.x, etc.)
+        "cdp_url": "",  # Optional persistent CDP endpoint for attaching to an existing Chromium/Chrome
        "camofox": {
            # When true, Hermes sends a stable profile-scoped userId to Camofox
-            # so the server can map it to a persistent browser profile directory.
-            # Requires Camofox server to be configured with CAMOFOX_PROFILE_DIR.
+            # so the server maps it to a persistent Firefox profile automatically.
            # When false (default), each session gets a random userId (ephemeral).
            "managed_persistence": False,
        },
@@ -419,6 +449,27 @@ DEFAULT_CONFIG = {
        "protect_last_n": 20,         # minimum recent messages to keep uncompressed

    },
+
+    # AWS Bedrock provider configuration.
+    # Only used when model.provider is "bedrock".
+    "bedrock": {
+        "region": "",  # AWS region for Bedrock API calls (empty = AWS_REGION env var → us-east-1)
+        "discovery": {
+            "enabled": True,           # Auto-discover models via ListFoundationModels
+            "provider_filter": [],     # Only show models from these providers (e.g. ["anthropic", "amazon"])
+            "refresh_interval": 3600,  # Cache discovery results for this many seconds
+        },
+        "guardrail": {
+            # Amazon Bedrock Guardrails — content filtering and safety policies.
+            # Create a guardrail in the Bedrock console, then set the ID and version here.
+            # See: https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails.html
+            "guardrail_identifier": "",  # e.g. "abc123def456"
+            "guardrail_version": "",     # e.g. "1" or "DRAFT"
+            "stream_processing_mode": "async",  # "sync" or "async"
+            "trace": "disabled",         # "enabled", "disabled", or "enabled_full"
+        },
+    },
+
    "smart_model_routing": {
        "enabled": False,
        "max_simple_chars": 160,
@@ -490,6 +541,13 @@ DEFAULT_CONFIG = {
            "api_key": "",
            "timeout": 30,
        },
+        "title_generation": {
+            "provider": "auto",
+            "model": "",
+            "base_url": "",
+            "api_key": "",
+            "timeout": 30,
+        },
    },
    
    "display": {
@@ -510,6 +568,11 @@ DEFAULT_CONFIG = {
        "platforms": {},  # Per-platform display overrides: {"telegram": {"tool_progress": "all"}, "slack": {"tool_progress": "off"}}
    },

+    # Web dashboard settings
+    "dashboard": {
+        "theme": "default",  # Dashboard visual theme: "default", "midnight", "ember", "mono", "cyberpunk", "rose"
+    },
+
    # Privacy settings
    "privacy": {
        "redact_pii": False,  # When True, hash user IDs and strip phone numbers from LLM context
@@ -517,7 +580,7 @@ DEFAULT_CONFIG = {
    
    # Text-to-speech configuration
    "tts": {
-        "provider": "edge",  # "edge" (free) | "elevenlabs" (premium) | "openai" | "minimax" | "mistral" | "neutts" (local)
+        "provider": "edge",  # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "neutts" (local)
        "edge": {
            "voice": "en-US-AriaNeural",
            # Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural
@@ -531,6 +594,12 @@ DEFAULT_CONFIG = {
            "voice": "alloy",
            # Voices: alloy, echo, fable, onyx, nova, shimmer
        },
+        "xai": {
+            "voice_id": "eve",
+            "language": "en",
+            "sample_rate": 24000,
+            "bit_rate": 128000,
+        },
        "mistral": {
            "model": "voxtral-mini-tts-2603",
            "voice_id": "c69964a6-ab8b-4f8a-9465-ec0925096ec8",  # Paul - Neutral
@@ -638,6 +707,7 @@ DEFAULT_CONFIG = {
        "allowed_channels": "",        # If set, bot ONLY responds in these channel IDs (whitelist)
        "auto_thread": True,           # Auto-create threads on @mention in channels (like Slack)
        "reactions": True,             # Add 👀/✅/❌ reactions to messages during processing
+        "channel_prompts": {},         # Per-channel ephemeral system prompts (forum parents apply to child threads)
    },

    # WhatsApp platform settings (gateway mode)
@@ -648,13 +718,33 @@ DEFAULT_CONFIG = {
        # Supports \n for newlines, e.g. "🤖 *My Bot*\n──────\n"
    },

+    # Telegram platform settings (gateway mode)
+    "telegram": {
+        "channel_prompts": {},         # Per-chat/topic ephemeral system prompts (topics inherit from parent group)
+    },
+
+    # Slack platform settings (gateway mode)
+    "slack": {
+        "channel_prompts": {},         # Per-channel ephemeral system prompts
+    },
+
+    # Mattermost platform settings (gateway mode)
+    "mattermost": {
+        "channel_prompts": {},         # Per-channel ephemeral system prompts
+    },
+
    # Approval mode for dangerous commands:
    #   manual — always prompt the user (default)
    #   smart  — use auxiliary LLM to auto-approve low-risk commands, prompt for high-risk
    #   off    — skip all approval prompts (equivalent to --yolo)
+    #
+    # cron_mode — what to do when a cron job hits a dangerous command:
+    #   deny    — block the command and let the agent find another way (default, safe)
+    #   approve — auto-approve all dangerous commands in cron jobs
    "approvals": {
        "mode": "manual",
        "timeout": 60,
+        "cron_mode": "deny",
    },

    # Permanently allowed dangerous command patterns (added via "always" approval)
@@ -686,6 +776,20 @@ DEFAULT_CONFIG = {
        "wrap_response": True,
    },

+    # execute_code settings — controls the tool used for programmatic tool calls.
+    "code_execution": {
+        # Execution mode:
+        #   project (default) — scripts run in the session's working directory
+        #     with the active virtualenv/conda env's python, so project deps
+        #     (pandas, torch, project packages) and relative paths resolve.
+        #   strict            — scripts run in an isolated temp directory with
+        #     hermes-agent's own python (sys.executable). Maximum isolation
+        #     and reproducibility; project deps and relative paths won't work.
+        # Env scrubbing (strips *_API_KEY, *_TOKEN, *_SECRET, ...) and the
+        # tool whitelist apply identically in both modes.
+        "mode": "project",
+    },
+
    # Logging — controls file logging to ~/.hermes/logs/.
    # agent.log captures INFO+ (all agent activity); errors.log captures WARNING+.
    "logging": {
@@ -703,7 +807,7 @@ DEFAULT_CONFIG = {
    },

    # Config schema version - bump this when adding new required fields
-    "_config_version": 17,
+    "_config_version": 19,
 }

 # =============================================================================
@@ -771,6 +875,38 @@ OPTIONAL_ENV_VARS = {
        "category": "provider",
        "advanced": True,
    },
+    "XAI_API_KEY": {
+        "description": "xAI API key",
+        "prompt": "xAI API key",
+        "url": "https://console.x.ai/",
+        "password": True,
+        "category": "provider",
+        "advanced": True,
+    },
+    "XAI_BASE_URL": {
+        "description": "xAI base URL override",
+        "prompt": "xAI base URL (leave empty for default)",
+        "url": None,
+        "password": False,
+        "category": "provider",
+        "advanced": True,
+    },
+    "NVIDIA_API_KEY": {
+        "description": "NVIDIA NIM API key (build.nvidia.com or local NIM endpoint)",
+        "prompt": "NVIDIA NIM API key",
+        "url": "https://build.nvidia.com/",
+        "password": True,
+        "category": "provider",
+        "advanced": True,
+    },
+    "NVIDIA_BASE_URL": {
+        "description": "NVIDIA NIM base URL override (e.g. http://localhost:8000/v1 for local NIM)",
+        "prompt": "NVIDIA NIM base URL (leave empty for default)",
+        "url": None,
+        "password": False,
+        "category": "provider",
+        "advanced": True,
+    },
    "GLM_API_KEY": {
        "description": "Z.AI / GLM API key (also recognized as ZAI_API_KEY / Z_AI_API_KEY)",
        "prompt": "Z.AI / GLM API key",
@@ -912,6 +1048,30 @@ OPTIONAL_ENV_VARS = {
        "category": "provider",
        "advanced": True,
    },
+    "HERMES_GEMINI_CLIENT_ID": {
+        "description": "Google OAuth client ID for google-gemini-cli (optional; defaults to Google's public gemini-cli client)",
+        "prompt": "Google OAuth client ID (optional — leave empty to use the public default)",
+        "url": "https://console.cloud.google.com/apis/credentials",
+        "password": False,
+        "category": "provider",
+        "advanced": True,
+    },
+    "HERMES_GEMINI_CLIENT_SECRET": {
+        "description": "Google OAuth client secret for google-gemini-cli (optional)",
+        "prompt": "Google OAuth client secret (optional)",
+        "url": "https://console.cloud.google.com/apis/credentials",
+        "password": True,
+        "category": "provider",
+        "advanced": True,
+    },
+    "HERMES_GEMINI_PROJECT_ID": {
+        "description": "GCP project ID for paid Gemini tiers (free tier auto-provisions)",
+        "prompt": "GCP project ID for Gemini OAuth (leave empty for free tier)",
+        "url": None,
+        "password": False,
+        "category": "provider",
+        "advanced": True,
+    },
    "OPENCODE_ZEN_API_KEY": {
        "description": "OpenCode Zen API key (pay-as-you-go access to curated models)",
        "prompt": "OpenCode Zen API key",
@@ -959,6 +1119,22 @@ OPTIONAL_ENV_VARS = {
        "category": "provider",
        "advanced": True,
    },
+    "OLLAMA_API_KEY": {
+        "description": "Ollama Cloud API key (ollama.com — cloud-hosted open models)",
+        "prompt": "Ollama Cloud API key",
+        "url": "https://ollama.com/settings",
+        "password": True,
+        "category": "provider",
+        "advanced": True,
+    },
+    "OLLAMA_BASE_URL": {
+        "description": "Ollama Cloud base URL override (default: https://ollama.com/v1)",
+        "prompt": "Ollama base URL (leave empty for default)",
+        "url": None,
+        "password": False,
+        "category": "provider",
+        "advanced": True,
+    },
    "XIAOMI_API_KEY": {
        "description": "Xiaomi MiMo API key for MiMo models (mimo-v2-pro, mimo-v2-omni, mimo-v2-flash)",
        "prompt": "Xiaomi MiMo API Key",
@@ -974,6 +1150,22 @@ OPTIONAL_ENV_VARS = {
        "category": "provider",
        "advanced": True,
    },
+    "AWS_REGION": {
+        "description": "AWS region for Bedrock API calls (e.g. us-east-1, eu-central-1)",
+        "prompt": "AWS Region",
+        "url": "https://docs.aws.amazon.com/bedrock/latest/userguide/bedrock-regions.html",
+        "password": False,
+        "category": "provider",
+        "advanced": True,
+    },
+    "AWS_PROFILE": {
+        "description": "AWS named profile for Bedrock authentication (from ~/.aws/credentials)",
+        "prompt": "AWS Profile",
+        "url": None,
+        "password": False,
+        "category": "provider",
+        "advanced": True,
+    },

    # ── Tool API keys ──
    "EXA_API_KEY": {
@@ -1171,6 +1363,12 @@ OPTIONAL_ENV_VARS = {
        "password": False,
        "category": "messaging",
    },
+    "TELEGRAM_PROXY": {
+        "description": "Proxy URL for Telegram connections (overrides HTTPS_PROXY). Supports http://, https://, socks5://",
+        "prompt": "Telegram proxy URL (optional)",
+        "password": False,
+        "category": "messaging",
+    },
    "DISCORD_BOT_TOKEN": {
        "description": "Discord bot token from Developer Portal",
        "prompt": "Discord bot token",
@@ -1366,12 +1564,12 @@ OPTIONAL_ENV_VARS = {
        "prompt": "Allow All QQ Users",
        "category": "messaging",
    },
-    "QQ_HOME_CHANNEL": {
+    "QQBOT_HOME_CHANNEL": {
        "description": "Default QQ channel/group for cron delivery and notifications",
        "prompt": "QQ Home Channel",
        "category": "messaging",
    },
-    "QQ_HOME_CHANNEL_NAME": {
+    "QQBOT_HOME_CHANNEL_NAME": {
        "description": "Display name for the QQ home channel",
        "prompt": "QQ Home Channel Name",
        "category": "messaging",
@@ -1468,13 +1666,8 @@ OPTIONAL_ENV_VARS = {
    },

    # ── Agent settings ──
-    "MESSAGING_CWD": {
-        "description": "Working directory for terminal commands via messaging",
-        "prompt": "Messaging working directory (default: home)",
-        "url": None,
-        "password": False,
-        "category": "setting",
-    },
+    # NOTE: MESSAGING_CWD was removed here — use terminal.cwd in config.yaml
+    # instead.  The gateway reads TERMINAL_CWD (bridged from terminal.cwd).
    "SUDO_PASSWORD": {
        "description": "Sudo password for terminal commands requiring root access; set to an explicit empty string to try empty without prompting",
        "prompt": "Sudo password",
@@ -1522,14 +1715,8 @@ OPTIONAL_ENV_VARS = {
    },
 }

-if not _managed_nous_tools_enabled():
-    for _hidden_var in (
-        "FIRECRAWL_GATEWAY_URL",
-        "TOOL_GATEWAY_DOMAIN",
-        "TOOL_GATEWAY_SCHEME",
-        "TOOL_GATEWAY_USER_TOKEN",
-    ):
-        OPTIONAL_ENV_VARS.pop(_hidden_var, None)
+# Tool Gateway env vars are always visible — they're useful for
+# self-hosted / custom gateway setups regardless of subscription state.


 def get_missing_env_vars(required_only: bool = False) -> List[Dict[str, Any]]:
@@ -1953,6 +2140,52 @@ def print_config_warnings(config: Optional[Dict[str, Any]] = None) -> None:
    sys.stderr.write("\n".join(lines) + "\n\n")


+def warn_deprecated_cwd_env_vars(config: Optional[Dict[str, Any]] = None) -> None:
+    """Warn if MESSAGING_CWD or TERMINAL_CWD is set in .env instead of config.yaml.
+
+    These env vars are deprecated — the canonical setting is terminal.cwd
+    in config.yaml.  Prints a migration hint to stderr.
+    """
+    import os, sys
+    messaging_cwd = os.environ.get("MESSAGING_CWD")
+    terminal_cwd_env = os.environ.get("TERMINAL_CWD")
+
+    if config is None:
+        try:
+            config = load_config()
+        except Exception:
+            return
+
+    terminal_cfg = config.get("terminal", {})
+    config_cwd = terminal_cfg.get("cwd", ".") if isinstance(terminal_cfg, dict) else "."
+    # Only warn if config.yaml doesn't have an explicit path
+    config_has_explicit_cwd = config_cwd not in (".", "auto", "cwd", "")
+
+    lines: list[str] = []
+    if messaging_cwd:
+        lines.append(
+            f"  \033[33m⚠\033[0m MESSAGING_CWD={messaging_cwd} found in .env — "
+            f"this is deprecated."
+        )
+    if terminal_cwd_env and not config_has_explicit_cwd:
+        # TERMINAL_CWD in env but not from config bridge — likely from .env
+        lines.append(
+            f"  \033[33m⚠\033[0m TERMINAL_CWD={terminal_cwd_env} found in .env — "
+            f"this is deprecated."
+        )
+    if lines:
+        hint_path = os.environ.get("HERMES_HOME", "~/.hermes")
+        lines.insert(0, "\033[33m⚠ Deprecated .env settings detected:\033[0m")
+        lines.append(
+            f"  \033[2mMove to config.yaml instead:  "
+            f"terminal:\\n    cwd: /your/project/path\033[0m"
+        )
+        lines.append(
+            f"  \033[2mThen remove the old entries from {hint_path}/.env\033[0m"
+        )
+        sys.stderr.write("\n".join(lines) + "\n\n")
+
+
 def migrate_config(interactive: bool = True, quiet: bool = False) -> Dict[str, Any]:
    """
    Migrate config to latest version, prompting for new required fields.
@@ -2423,6 +2656,85 @@ def _expand_env_vars(obj):
    return obj


+def _items_by_unique_name(items):
+    """Return a name-indexed dict only when all items have unique string names."""
+    if not isinstance(items, list):
+        return None
+    indexed = {}
+    for item in items:
+        if not isinstance(item, dict) or not isinstance(item.get("name"), str):
+            return None
+        name = item["name"]
+        if name in indexed:
+            return None
+        indexed[name] = item
+    return indexed
+
+
+def _preserve_env_ref_templates(current, raw, loaded_expanded=None):
+    """Restore raw ``${VAR}`` templates when a value is otherwise unchanged.
+
+    ``load_config()`` expands env refs for runtime use. When a caller later
+    persists that config after modifying some unrelated setting, keep the
+    original on-disk template instead of writing the expanded plaintext
+    secret back to ``config.yaml``.
+
+    Prefer preserving the raw template when ``current`` still matches either
+    the value previously returned by ``load_config()`` for this config path or
+    the current environment expansion of ``raw``. This handles env-var
+    rotation between load and save while still treating mixed literal/template
+    string edits as caller-owned once their rendered value diverges.
+    """
+    if isinstance(current, str) and isinstance(raw, str) and re.search(r"\${[^}]+}", raw):
+        if current == raw:
+            return raw
+        if isinstance(loaded_expanded, str) and current == loaded_expanded:
+            return raw
+        if _expand_env_vars(raw) == current:
+            return raw
+        return current
+
+    if isinstance(current, dict) and isinstance(raw, dict):
+        return {
+            key: _preserve_env_ref_templates(
+                value,
+                raw.get(key),
+                loaded_expanded.get(key) if isinstance(loaded_expanded, dict) else None,
+            )
+            for key, value in current.items()
+        }
+
+    if isinstance(current, list) and isinstance(raw, list):
+        # Prefer matching named config objects (e.g. custom_providers) by name
+        # so harmless reordering doesn't drop the original template. If names
+        # are duplicated, fall back to positional matching instead of silently
+        # shadowing one entry.
+        current_by_name = _items_by_unique_name(current)
+        raw_by_name = _items_by_unique_name(raw)
+        loaded_by_name = _items_by_unique_name(loaded_expanded)
+        if current_by_name is not None and raw_by_name is not None:
+            return [
+                _preserve_env_ref_templates(
+                    item,
+                    raw_by_name.get(item.get("name")),
+                    loaded_by_name.get(item.get("name")) if loaded_by_name is not None else None,
+                )
+                for item in current
+            ]
+        return [
+            _preserve_env_ref_templates(
+                item,
+                raw[index] if index < len(raw) else None,
+                loaded_expanded[index]
+                if isinstance(loaded_expanded, list) and index < len(loaded_expanded)
+                else None,
+            )
+            for index, item in enumerate(current)
+        ]
+
+    return current
+
+
 def _normalize_root_model_keys(config: Dict[str, Any]) -> Dict[str, Any]:
    """Move stale root-level provider/base_url into model section.

@@ -2490,7 +2802,6 @@ def read_raw_config() -> Dict[str, Any]:

 def load_config() -> Dict[str, Any]:
    """Load configuration from ~/.hermes/config.yaml."""
-    import copy
    ensure_hermes_home()
    config_path = get_config_path()
    
@@ -2511,8 +2822,11 @@ def load_config() -> Dict[str, Any]:
            config = _deep_merge(config, user_config)
        except Exception as e:
            print(f"Warning: Failed to load config: {e}")
-    
-    return _expand_env_vars(_normalize_root_model_keys(_normalize_max_turns_config(config)))
+
+    normalized = _normalize_root_model_keys(_normalize_max_turns_config(config))
+    expanded = _expand_env_vars(normalized)
+    _LAST_EXPANDED_CONFIG_BY_PATH[str(config_path)] = copy.deepcopy(expanded)
+    return expanded


 _SECURITY_COMMENT = """
@@ -2547,7 +2861,7 @@ _FALLBACK_COMMENT = """
 #   minimax      (MINIMAX_API_KEY)     — MiniMax
 #   minimax-cn   (MINIMAX_CN_API_KEY)  — MiniMax (China)
 #
-# For custom OpenAI-compatible endpoints, add base_url and api_key_env.
+# For custom OpenAI-compatible endpoints, add base_url and key_env.
 #
 # fallback_model:
 #   provider: openrouter
@@ -2591,7 +2905,7 @@ _COMMENTED_SECTIONS = """
 #   minimax      (MINIMAX_API_KEY)     — MiniMax
 #   minimax-cn   (MINIMAX_CN_API_KEY)  — MiniMax (China)
 #
-# For custom OpenAI-compatible endpoints, add base_url and api_key_env.
+# For custom OpenAI-compatible endpoints, add base_url and key_env.
 #
 # fallback_model:
 #   provider: openrouter
@@ -2621,7 +2935,15 @@ def save_config(config: Dict[str, Any]):

    ensure_hermes_home()
    config_path = get_config_path()
-    normalized = _normalize_root_model_keys(_normalize_max_turns_config(config))
+    current_normalized = _normalize_root_model_keys(_normalize_max_turns_config(config))
+    normalized = current_normalized
+    raw_existing = _normalize_root_model_keys(_normalize_max_turns_config(read_raw_config()))
+    if raw_existing:
+        normalized = _preserve_env_ref_templates(
+            normalized,
+            raw_existing,
+            _LAST_EXPANDED_CONFIG_BY_PATH.get(str(config_path)),
+        )

    # Build optional commented-out sections for features that are off by
    # default or only relevant when explicitly configured.
@@ -2639,6 +2961,7 @@ def save_config(config: Dict[str, Any]):
        extra_content="".join(parts) if parts else None,
    )
    _secure_file(config_path)
+    _LAST_EXPANDED_CONFIG_BY_PATH[str(config_path)] = copy.deepcopy(current_normalized)


 def load_env() -> Dict[str, str]:
@@ -2766,6 +3089,47 @@ def sanitize_env_file() -> int:
    return fixes


+def _check_non_ascii_credential(key: str, value: str) -> str:
+    """Warn and strip non-ASCII characters from credential values.
+
+    API keys and tokens must be pure ASCII — they are sent as HTTP header
+    values which httpx/httpcore encode as ASCII.  Non-ASCII characters
+    (commonly introduced by copy-pasting from rich-text editors or PDFs
+    that substitute lookalike Unicode glyphs for ASCII letters) cause
+    ``UnicodeEncodeError: 'ascii' codec can't encode character`` at
+    request time.
+
+    Returns the sanitized (ASCII-only) value.  Prints a warning if any
+    non-ASCII characters were found and removed.
+    """
+    try:
+        value.encode("ascii")
+        return value  # all ASCII — nothing to do
+    except UnicodeEncodeError:
+        pass
+
+    # Build a readable list of the offending characters
+    bad_chars: list[str] = []
+    for i, ch in enumerate(value):
+        if ord(ch) > 127:
+            bad_chars.append(f"  position {i}: {ch!r} (U+{ord(ch):04X})")
+    sanitized = value.encode("ascii", errors="ignore").decode("ascii")
+
+    import sys
+    print(
+        f"\n  Warning: {key} contains non-ASCII characters that will break API requests.\n"
+        f"  This usually happens when copy-pasting from a PDF, rich-text editor,\n"
+        f"  or web page that substitutes lookalike Unicode glyphs for ASCII letters.\n"
+        f"\n"
+        + "\n".join(f"  {line}" for line in bad_chars[:5])
+        + ("\n  ... and more" if len(bad_chars) > 5 else "")
+        + f"\n\n  The non-ASCII characters have been stripped automatically.\n"
+        f"  If authentication fails, re-copy the key from the provider's dashboard.\n",
+        file=sys.stderr,
+    )
+    return sanitized
+
+
 def save_env_value(key: str, value: str):
    """Save or update a value in ~/.hermes/.env."""
    if is_managed():
@@ -2774,6 +3138,8 @@ def save_env_value(key: str, value: str):
    if not _ENV_VAR_NAME_RE.match(key):
        raise ValueError(f"Invalid environment variable name: {key!r}")
    value = value.replace("\n", "").replace("\r", "")
+    # API keys / tokens must be ASCII — strip non-ASCII with a warning.
+    value = _check_non_ascii_credential(key, value)
    ensure_hermes_home()
    env_path = get_env_path()
    
@@ -2804,12 +3170,25 @@ def save_env_value(key: str, value: str):
        lines.append(f"{key}={value}\n")
    
    fd, tmp_path = tempfile.mkstemp(dir=str(env_path.parent), suffix='.tmp', prefix='.env_')
+    # Preserve original permissions so Docker volume mounts aren't clobbered.
+    original_mode = None
+    if env_path.exists():
+        try:
+            original_mode = stat.S_IMODE(env_path.stat().st_mode)
+        except OSError:
+            pass
    try:
        with os.fdopen(fd, 'w', **write_kw) as f:
            f.writelines(lines)
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp_path, env_path)
+        # Restore original permissions before _secure_file may tighten them.
+        if original_mode is not None:
+            try:
+                os.chmod(env_path, original_mode)
+            except OSError:
+                pass
    except BaseException:
        try:
            os.unlink(tmp_path)
@@ -2820,13 +3199,6 @@ def save_env_value(key: str, value: str):

    os.environ[key] = value

-    # Restrict .env permissions to owner-only (contains API keys)
-    if not _IS_WINDOWS:
-        try:
-            os.chmod(env_path, stat.S_IRUSR | stat.S_IWUSR)
-        except OSError:
-            pass
-

 def remove_env_value(key: str) -> bool:
    """Remove a key from ~/.hermes/.env and os.environ.
@@ -2855,12 +3227,23 @@ def remove_env_value(key: str) -> bool:

    if found:
        fd, tmp_path = tempfile.mkstemp(dir=str(env_path.parent), suffix='.tmp', prefix='.env_')
+        # Preserve original permissions so Docker volume mounts aren't clobbered.
+        original_mode = None
+        try:
+            original_mode = stat.S_IMODE(env_path.stat().st_mode)
+        except OSError:
+            pass
        try:
            with os.fdopen(fd, 'w', **write_kw) as f:
                f.writelines(new_lines)
                f.flush()
                os.fsync(f.fileno())
            os.replace(tmp_path, env_path)
+            if original_mode is not None:
+                try:
+                    os.chmod(env_path, original_mode)
+                except OSError:
+                    pass
        except BaseException:
            try:
                os.unlink(tmp_path)
@@ -166,6 +166,7 @@ def curses_radiolist(
    selected: int = 0,
    *,
    cancel_returns: int | None = None,
+    description: str | None = None,
 ) -> int:
    """Curses single-select radio list. Returns the selected index.

@@ -174,6 +175,9 @@ def curses_radiolist(
        items: Display labels for each row.
        selected: Index that starts selected (pre-selected).
        cancel_returns: Returned on ESC/q. Defaults to the original *selected*.
+        description: Optional multi-line text shown between the title and
+            the item list.  Useful for context that should survive the
+            curses screen clear.
    """
    if cancel_returns is None:
        cancel_returns = selected
@@ -181,6 +185,10 @@ def curses_radiolist(
    if not sys.stdin.isatty():
        return cancel_returns

+    desc_lines: list[str] = []
+    if description:
+        desc_lines = description.splitlines()
+
    try:
        import curses
        result_holder: list = [None]
@@ -199,22 +207,35 @@ def curses_radiolist(
                stdscr.clear()
                max_y, max_x = stdscr.getmaxyx()

+                row = 0
+
                # Header
                try:
                    hattr = curses.A_BOLD
                    if curses.has_colors():
                        hattr |= curses.color_pair(2)
-                    stdscr.addnstr(0, 0, title, max_x - 1, hattr)
+                    stdscr.addnstr(row, 0, title, max_x - 1, hattr)
+                    row += 1
+
+                    # Description lines
+                    for dline in desc_lines:
+                        if row >= max_y - 1:
+                            break
+                        stdscr.addnstr(row, 0, dline, max_x - 1, curses.A_NORMAL)
+                        row += 1
+
                    stdscr.addnstr(
-                        1, 0,
+                        row, 0,
                        "  \u2191\u2193 navigate  ENTER/SPACE select  ESC cancel",
                        max_x - 1, curses.A_DIM,
                    )
+                    row += 1
                except curses.error:
                    pass

                # Scrollable item list
-                visible_rows = max_y - 4
+                items_start = row + 1
+                visible_rows = max_y - items_start - 1
                if cursor < scroll_offset:
                    scroll_offset = cursor
                elif cursor >= scroll_offset + visible_rows:
@@ -223,7 +244,7 @@ def curses_radiolist(
                for draw_i, i in enumerate(
                    range(scroll_offset, min(len(items), scroll_offset + visible_rows))
                ):
-                    y = draw_i + 3
+                    y = draw_i + items_start
                    if y >= max_y - 1:
                        break
                    radio = "\u25cf" if i == selected else "\u25cb"
@@ -6,7 +6,10 @@ Currently supports:
 """

 import io
+import json
+import os
 import sys
+import time
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -27,6 +30,205 @@ _DPASTE_COM_URL = "https://dpaste.com/api/"
 # paste.rs caps at ~1 MB; we stay under that with headroom.
 _MAX_LOG_BYTES = 512_000

+# Auto-delete pastes after this many seconds (6 hours).
+_AUTO_DELETE_SECONDS = 21600
+
+
+# ---------------------------------------------------------------------------
+# Pending-deletion tracking (replaces the old fork-and-sleep subprocess).
+# ---------------------------------------------------------------------------
+
+def _pending_file() -> Path:
+    """Path to ``~/.hermes/pastes/pending.json``.
+
+    Each entry: ``{"url": "...", "expire_at": <unix_ts>}``.  Scheduled
+    DELETEs used to be handled by spawning a detached Python process per
+    paste that slept for 6 hours; those accumulated forever if the user
+    ran ``hermes debug share`` repeatedly.  We now persist the schedule
+    to disk and sweep expired entries on the next debug invocation.
+    """
+    return get_hermes_home() / "pastes" / "pending.json"
+
+
+def _load_pending() -> list[dict]:
+    path = _pending_file()
+    if not path.exists():
+        return []
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+        if isinstance(data, list):
+            # Filter to well-formed entries only
+            return [
+                e for e in data
+                if isinstance(e, dict) and "url" in e and "expire_at" in e
+            ]
+    except (OSError, ValueError, json.JSONDecodeError):
+        pass
+    return []
+
+
+def _save_pending(entries: list[dict]) -> None:
+    path = _pending_file()
+    try:
+        path.parent.mkdir(parents=True, exist_ok=True)
+        tmp = path.with_suffix(".json.tmp")
+        tmp.write_text(json.dumps(entries, indent=2), encoding="utf-8")
+        os.replace(tmp, path)
+    except OSError:
+        # Non-fatal — worst case the user has to run ``hermes debug delete``
+        # manually.
+        pass
+
+
+def _record_pending(urls: list[str], delay_seconds: int = _AUTO_DELETE_SECONDS) -> None:
+    """Record *urls* for deletion at ``now + delay_seconds``.
+
+    Only paste.rs URLs are recorded (dpaste.com auto-expires).  Entries
+    are merged into any existing pending.json.
+    """
+    paste_rs_urls = [u for u in urls if _extract_paste_id(u)]
+    if not paste_rs_urls:
+        return
+
+    entries = _load_pending()
+    # Dedupe by URL: keep the later expire_at if same URL appears twice
+    by_url: dict[str, float] = {e["url"]: float(e["expire_at"]) for e in entries}
+    expire_at = time.time() + delay_seconds
+    for u in paste_rs_urls:
+        by_url[u] = max(expire_at, by_url.get(u, 0.0))
+    merged = [{"url": u, "expire_at": ts} for u, ts in by_url.items()]
+    _save_pending(merged)
+
+
+def _sweep_expired_pastes(now: Optional[float] = None) -> tuple[int, int]:
+    """Synchronously DELETE any pending pastes whose ``expire_at`` has passed.
+
+    Returns ``(deleted, remaining)``.  Best-effort: failed deletes stay in
+    the pending file and will be retried on the next sweep.  Silent —
+    intended to be called from every ``hermes debug`` invocation with
+    minimal noise.
+    """
+    entries = _load_pending()
+    if not entries:
+        return (0, 0)
+
+    current = time.time() if now is None else now
+    deleted = 0
+    remaining: list[dict] = []
+
+    for entry in entries:
+        try:
+            expire_at = float(entry.get("expire_at", 0))
+        except (TypeError, ValueError):
+            continue  # drop malformed entries
+        if expire_at > current:
+            remaining.append(entry)
+            continue
+
+        url = entry.get("url", "")
+        try:
+            if delete_paste(url):
+                deleted += 1
+                continue
+        except Exception:
+            # Network hiccup, 404 (already gone), etc. — drop the entry
+            # after a grace period; don't retry forever.
+            pass
+
+        # Retain failed deletes for up to 24h past expiration, then give up.
+        if expire_at + 86400 > current:
+            remaining.append(entry)
+        else:
+            deleted += 1  # count as reaped (paste.rs will GC eventually)
+
+    if deleted:
+        _save_pending(remaining)
+
+    return (deleted, len(remaining))
+
+
+# ---------------------------------------------------------------------------
+# Privacy / delete helpers
+# ---------------------------------------------------------------------------
+
+_PRIVACY_NOTICE = """\
+⚠️  This will upload the following to a public paste service:
+  • System info (OS, Python version, Hermes version, provider, which API keys
+    are configured — NOT the actual keys)
+  • Recent log lines (agent.log, errors.log, gateway.log — may contain
+    conversation fragments and file paths)
+  • Full agent.log and gateway.log (up to 512 KB each — likely contains
+    conversation content, tool outputs, and file paths)
+
+Pastes auto-delete after 6 hours.
+"""
+
+_GATEWAY_PRIVACY_NOTICE = (
+    "⚠️ **Privacy notice:** This uploads system info + recent log tails "
+    "(may contain conversation fragments) to a public paste service. "
+    "Full logs are NOT included from the gateway — use `hermes debug share` "
+    "from the CLI for full log uploads.\n"
+    "Pastes auto-delete after 6 hours."
+)
+
+
+def _extract_paste_id(url: str) -> Optional[str]:
+    """Extract the paste ID from a paste.rs or dpaste.com URL.
+
+    Returns the ID string, or None if the URL doesn't match a known service.
+    """
+    url = url.strip().rstrip("/")
+    for prefix in ("https://paste.rs/", "http://paste.rs/"):
+        if url.startswith(prefix):
+            return url[len(prefix):]
+    return None
+
+
+def delete_paste(url: str) -> bool:
+    """Delete a paste from paste.rs.  Returns True on success.
+
+    Only paste.rs supports unauthenticated DELETE.  dpaste.com pastes
+    expire automatically but cannot be deleted via API.
+    """
+    paste_id = _extract_paste_id(url)
+    if not paste_id:
+        raise ValueError(
+            f"Cannot delete: only paste.rs URLs are supported.  Got: {url}"
+        )
+
+    target = f"{_PASTE_RS_URL}{paste_id}"
+    req = urllib.request.Request(
+        target, method="DELETE",
+        headers={"User-Agent": "hermes-agent/debug-share"},
+    )
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        return 200 <= resp.status < 300
+
+
+def _schedule_auto_delete(urls: list[str], delay_seconds: int = _AUTO_DELETE_SECONDS):
+    """Record *urls* for deletion ``delay_seconds`` from now.
+
+    Previously this spawned a detached Python subprocess per call that slept
+    for 6 hours and then issued DELETE requests.  Those subprocesses leaked —
+    every ``hermes debug share`` invocation added ~20 MB of resident Python
+    interpreters that never exited until the sleep completed.
+
+    The replacement is stateless: we append to ``~/.hermes/pastes/pending.json``
+    and rely on opportunistic sweeps (``_sweep_expired_pastes``) called from
+    every ``hermes debug`` invocation.  If the user never runs ``hermes debug``
+    again, paste.rs's own retention policy handles cleanup.
+    """
+    _record_pending(urls, delay_seconds=delay_seconds)
+
+
+def _delete_hint(url: str) -> str:
+    """Return a one-liner delete command for the given paste URL."""
+    paste_id = _extract_paste_id(url)
+    if paste_id:
+        return f"hermes debug delete {url}"
+    # dpaste.com — no API delete, expires on its own.
+    return "(auto-expires per dpaste.com policy)"
+

 def _upload_paste_rs(content: str) -> str:
    """Upload to paste.rs.  Returns the paste URL.
@@ -250,6 +452,9 @@ def run_debug_share(args):
    expiry = getattr(args, "expire", 7)
    local_only = getattr(args, "local", False)

+    if not local_only:
+        print(_PRIVACY_NOTICE)
+
    print("Collecting debug report...")

    # Capture dump once — prepended to every paste for context.
@@ -315,22 +520,66 @@ def run_debug_share(args):
    if failures:
        print(f"\n  (failed to upload: {', '.join(failures)})")

+    # Schedule auto-deletion after 6 hours
+    _schedule_auto_delete(list(urls.values()))
+    print(f"\n⏱  Pastes will auto-delete in 6 hours.")
+
+    # Manual delete fallback
+    print(f"To delete now:  hermes debug delete <url>")
+
    print(f"\nShare these links with the Hermes team for support.")


+def run_debug_delete(args):
+    """Delete one or more paste URLs uploaded by /debug."""
+    urls = getattr(args, "urls", [])
+    if not urls:
+        print("Usage: hermes debug delete <url> [<url> ...]")
+        print("  Deletes paste.rs pastes uploaded by 'hermes debug share'.")
+        return
+
+    for url in urls:
+        try:
+            ok = delete_paste(url)
+            if ok:
+                print(f"  ✓ Deleted: {url}")
+            else:
+                print(f"  ✗ Failed to delete: {url} (unexpected response)")
+        except ValueError as exc:
+            print(f"  ✗ {exc}")
+        except Exception as exc:
+            print(f"  ✗ Could not delete {url}: {exc}")
+
+
 def run_debug(args):
    """Route debug subcommands."""
+    # Opportunistic sweep of expired pastes on every ``hermes debug`` call.
+    # Replaces the old per-paste sleeping subprocess that used to leak as
+    # one orphaned Python interpreter per scheduled deletion.  Silent and
+    # best-effort — any failure is swallowed so ``hermes debug`` stays
+    # reliable even when offline.
+    try:
+        _sweep_expired_pastes()
+    except Exception:
+        pass
+
    subcmd = getattr(args, "debug_command", None)
    if subcmd == "share":
        run_debug_share(args)
+    elif subcmd == "delete":
+        run_debug_delete(args)
    else:
        # Default: show help
-        print("Usage: hermes debug share [--lines N] [--expire N] [--local]")
+        print("Usage: hermes debug <command>")
        print()
        print("Commands:")
        print("  share    Upload debug report to a paste service and print URL")
+        print("  delete   Delete a previously uploaded paste")
        print()
-        print("Options:")
+        print("Options (share):")
        print("  --lines N    Number of log lines to include (default: 200)")
        print("  --expire N   Paste expiry in days (default: 7)")
        print("  --local      Print report locally instead of uploading")
+        print()
+        print("Options (delete):")
+        print("  <url> ...    One or more paste URLs to delete")
@@ -0,0 +1,294 @@
+"""
+DingTalk Device Flow authorization.
+
+Implements the same 3-step registration flow as dingtalk-openclaw-connector:
+  1. POST /app/registration/init   → get nonce
+  2. POST /app/registration/begin  → get device_code + verification_uri_complete
+  3. POST /app/registration/poll   → poll until SUCCESS → get client_id + client_secret
+
+The verification_uri_complete is rendered as a QR code in the terminal so the
+user can scan it with DingTalk to authorize, yielding AppKey + AppSecret
+automatically.
+"""
+
+from __future__ import annotations
+
+import io
+import os
+import sys
+import time
+import logging
+from typing import Optional, Tuple
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+# ── Configuration ──────────────────────────────────────────────────────────
+
+REGISTRATION_BASE_URL = os.environ.get(
+    "DINGTALK_REGISTRATION_BASE_URL", "https://oapi.dingtalk.com"
+).rstrip("/")
+
+REGISTRATION_SOURCE = os.environ.get("DINGTALK_REGISTRATION_SOURCE", "openClaw")
+
+
+# ── API helpers ────────────────────────────────────────────────────────────
+
+class RegistrationError(Exception):
+    """Raised when a DingTalk registration API call fails."""
+
+
+def _api_post(path: str, payload: dict) -> dict:
+    """POST to the registration API and return the parsed JSON body."""
+    url = f"{REGISTRATION_BASE_URL}{path}"
+    try:
+        resp = requests.post(url, json=payload, timeout=15)
+        resp.raise_for_status()
+        data = resp.json()
+    except requests.RequestException as exc:
+        raise RegistrationError(f"Network error calling {url}: {exc}") from exc
+
+    errcode = data.get("errcode", -1)
+    if errcode != 0:
+        errmsg = data.get("errmsg", "unknown error")
+        raise RegistrationError(f"API error [{path}]: {errmsg} (errcode={errcode})")
+    return data
+
+
+# ── Core flow ──────────────────────────────────────────────────────────────
+
+def begin_registration() -> dict:
+    """Start a device-flow registration.
+
+    Returns a dict with keys:
+        device_code, verification_uri_complete, expires_in, interval
+    """
+    # Step 1: init → nonce
+    init_data = _api_post("/app/registration/init", {"source": REGISTRATION_SOURCE})
+    nonce = str(init_data.get("nonce", "")).strip()
+    if not nonce:
+        raise RegistrationError("init response missing nonce")
+
+    # Step 2: begin → device_code, verification_uri_complete
+    begin_data = _api_post("/app/registration/begin", {"nonce": nonce})
+    device_code = str(begin_data.get("device_code", "")).strip()
+    verification_uri_complete = str(begin_data.get("verification_uri_complete", "")).strip()
+    if not device_code:
+        raise RegistrationError("begin response missing device_code")
+    if not verification_uri_complete:
+        raise RegistrationError("begin response missing verification_uri_complete")
+
+    return {
+        "device_code": device_code,
+        "verification_uri_complete": verification_uri_complete,
+        "expires_in": int(begin_data.get("expires_in", 7200)),
+        "interval": max(int(begin_data.get("interval", 3)), 2),
+    }
+
+
+def poll_registration(device_code: str) -> dict:
+    """Poll the registration status once.
+
+    Returns a dict with keys:  status, client_id?, client_secret?, fail_reason?
+    """
+    data = _api_post("/app/registration/poll", {"device_code": device_code})
+    status_raw = str(data.get("status", "")).strip().upper()
+    if status_raw not in ("WAITING", "SUCCESS", "FAIL", "EXPIRED"):
+        status_raw = "UNKNOWN"
+    return {
+        "status": status_raw,
+        "client_id": str(data.get("client_id", "")).strip() or None,
+        "client_secret": str(data.get("client_secret", "")).strip() or None,
+        "fail_reason": str(data.get("fail_reason", "")).strip() or None,
+    }
+
+
+def wait_for_registration_success(
+    device_code: str,
+    interval: int = 3,
+    expires_in: int = 7200,
+    on_waiting: Optional[callable] = None,
+) -> Tuple[str, str]:
+    """Block until the registration succeeds or times out.
+
+    Returns (client_id, client_secret).
+    """
+    deadline = time.monotonic() + expires_in
+    retry_window = 120  # 2 minutes for transient errors
+    retry_start = 0.0
+
+    while time.monotonic() < deadline:
+        time.sleep(interval)
+        try:
+            result = poll_registration(device_code)
+        except RegistrationError:
+            if retry_start == 0:
+                retry_start = time.monotonic()
+            if time.monotonic() - retry_start < retry_window:
+                continue
+            raise
+
+        status = result["status"]
+        if status == "WAITING":
+            retry_start = 0
+            if on_waiting:
+                on_waiting()
+            continue
+        if status == "SUCCESS":
+            cid = result["client_id"]
+            csecret = result["client_secret"]
+            if not cid or not csecret:
+                raise RegistrationError("authorization succeeded but credentials are missing")
+            return cid, csecret
+        # FAIL / EXPIRED / UNKNOWN
+        if retry_start == 0:
+            retry_start = time.monotonic()
+        if time.monotonic() - retry_start < retry_window:
+            continue
+        reason = result.get("fail_reason") or status
+        raise RegistrationError(f"authorization failed: {reason}")
+
+    raise RegistrationError("authorization timed out, please retry")
+
+
+# ── QR code rendering ─────────────────────────────────────────────────────
+
+def _ensure_qrcode_installed() -> bool:
+    """Try to import qrcode; if missing, auto-install it via pip/uv."""
+    try:
+        import qrcode  # noqa: F401
+        return True
+    except ImportError:
+        pass
+
+    import subprocess
+
+    # Try uv first (Hermes convention), then pip
+    for cmd in (
+        [sys.executable, "-m", "uv", "pip", "install", "qrcode"],
+        [sys.executable, "-m", "pip", "install", "-q", "qrcode"],
+    ):
+        try:
+            subprocess.check_call(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            import qrcode  # noqa: F401,F811
+            return True
+        except (subprocess.CalledProcessError, ImportError, FileNotFoundError):
+            continue
+    return False
+
+
+def render_qr_to_terminal(url: str) -> bool:
+    """Render *url* as a compact QR code in the terminal.
+
+    Returns True if the QR code was printed, False if the library is missing.
+    """
+    try:
+        import qrcode
+    except ImportError:
+        return False
+
+    qr = qrcode.QRCode(
+        version=1,
+        error_correction=qrcode.constants.ERROR_CORRECT_L,
+        box_size=1,
+        border=1,
+    )
+    qr.add_data(url)
+    qr.make(fit=True)
+
+    # Use half-block characters for compact rendering (2 rows per character)
+    matrix = qr.get_matrix()
+    rows = len(matrix)
+    lines: list[str] = []
+
+    TOP_HALF = "\u2580"      # ▀
+    BOTTOM_HALF = "\u2584"   # ▄
+    FULL_BLOCK = "\u2588"    # █
+    EMPTY = " "
+
+    for r in range(0, rows, 2):
+        line_chars: list[str] = []
+        for c in range(len(matrix[r])):
+            top = matrix[r][c]
+            bottom = matrix[r + 1][c] if r + 1 < rows else False
+            if top and bottom:
+                line_chars.append(FULL_BLOCK)
+            elif top:
+                line_chars.append(TOP_HALF)
+            elif bottom:
+                line_chars.append(BOTTOM_HALF)
+            else:
+                line_chars.append(EMPTY)
+        lines.append("    " + "".join(line_chars))
+
+    print("\n".join(lines))
+    return True
+
+
+# ── High-level entry point for the setup wizard ───────────────────────────
+
+def dingtalk_qr_auth() -> Optional[Tuple[str, str]]:
+    """Run the interactive QR-code device-flow authorization.
+
+    Returns (client_id, client_secret) on success, or None if the user
+    cancelled or the flow failed.
+    """
+    from hermes_cli.setup import print_info, print_success, print_warning, print_error
+
+    print()
+    print_info("  Initializing DingTalk device authorization...")
+    print_info("  Note: the scan page is branded 'OpenClaw' — DingTalk's")
+    print_info("        ecosystem onboarding bridge. Safe to use.")
+
+    try:
+        reg = begin_registration()
+    except RegistrationError as exc:
+        print_error(f"  Authorization init failed: {exc}")
+        return None
+
+    url = reg["verification_uri_complete"]
+
+    # Ensure qrcode library is available (auto-install if missing)
+    if not _ensure_qrcode_installed():
+        print_warning("  qrcode library install failed, will show link only.")
+
+    print()
+    print_info("  Please scan the QR code below with DingTalk to authorize:")
+    print()
+
+    if not render_qr_to_terminal(url):
+        print_warning(f"  QR code render failed, please open the link below to authorize:")
+
+    print()
+    print_info(f"  Or open this link manually: {url}")
+    print()
+    print_info("  Waiting for QR scan authorization... (timeout: 2 hours)")
+
+    dot_count = 0
+
+    def _on_waiting():
+        nonlocal dot_count
+        dot_count += 1
+        if dot_count % 10 == 0:
+            sys.stdout.write(".")
+            sys.stdout.flush()
+
+    try:
+        client_id, client_secret = wait_for_registration_success(
+            device_code=reg["device_code"],
+            interval=reg["interval"],
+            expires_in=reg["expires_in"],
+            on_waiting=_on_waiting,
+        )
+    except RegistrationError as exc:
+        print()
+        print_error(f"  Authorization failed: {exc}")
+        return None
+
+    print()
+    print_success("  QR scan authorization successful!")
+    print_success(f"  Client ID:     {client_id}")
+    print_success(f"  Client Secret: {client_secret[:8]}{'*' * (len(client_secret) - 8)}")
+
+    return client_id, client_secret
@@ -8,6 +8,7 @@ import os
 import sys
 import subprocess
 import shutil
+from pathlib import Path

 from hermes_cli.config import get_project_root, get_hermes_home, get_env_path
 from hermes_constants import display_hermes_home
@@ -372,7 +373,11 @@ def run_doctor(args):
    print(color("◆ Auth Providers", Colors.CYAN, Colors.BOLD))

    try:
-        from hermes_cli.auth import get_nous_auth_status, get_codex_auth_status
+        from hermes_cli.auth import (
+            get_nous_auth_status,
+            get_codex_auth_status,
+            get_gemini_oauth_auth_status,
+        )

        nous_status = get_nous_auth_status()
        if nous_status.get("logged_in"):
@@ -387,6 +392,20 @@ def run_doctor(args):
            check_warn("OpenAI Codex auth", "(not logged in)")
            if codex_status.get("error"):
                check_info(codex_status["error"])
+
+        gemini_status = get_gemini_oauth_auth_status()
+        if gemini_status.get("logged_in"):
+            email = gemini_status.get("email") or ""
+            project = gemini_status.get("project_id") or ""
+            pieces = []
+            if email:
+                pieces.append(email)
+            if project:
+                pieces.append(f"project={project}")
+            suffix = f" ({', '.join(pieces)})" if pieces else ""
+            check_ok("Google Gemini OAuth", f"(logged in{suffix})")
+        else:
+            check_warn("Google Gemini OAuth", "(not logged in)")
    except Exception as e:
        check_warn("Auth provider status", f"(could not check: {e})")

@@ -513,7 +532,87 @@ def run_doctor(args):
            pass

    _check_gateway_service_linger(issues)
-    
+
+    # =========================================================================
+    # Check: Command installation (hermes bin symlink)
+    # =========================================================================
+    if sys.platform != "win32":
+        print()
+        print(color("◆ Command Installation", Colors.CYAN, Colors.BOLD))
+
+        # Determine the venv entry point location
+        _venv_bin = None
+        for _venv_name in ("venv", ".venv"):
+            _candidate = PROJECT_ROOT / _venv_name / "bin" / "hermes"
+            if _candidate.exists():
+                _venv_bin = _candidate
+                break
+
+        # Determine the expected command link directory (mirrors install.sh logic)
+        _prefix = os.environ.get("PREFIX", "")
+        _is_termux_env = bool(os.environ.get("TERMUX_VERSION")) or "com.termux/files/usr" in _prefix
+        if _is_termux_env and _prefix:
+            _cmd_link_dir = Path(_prefix) / "bin"
+            _cmd_link_display = "$PREFIX/bin"
+        else:
+            _cmd_link_dir = Path.home() / ".local" / "bin"
+            _cmd_link_display = "~/.local/bin"
+        _cmd_link = _cmd_link_dir / "hermes"
+
+        if _venv_bin is None:
+            check_warn(
+                "Venv entry point not found",
+                "(hermes not in venv/bin/ or .venv/bin/ — reinstall with pip install -e '.[all]')"
+            )
+            manual_issues.append(
+                f"Reinstall entry point: cd {PROJECT_ROOT} && source venv/bin/activate && pip install -e '.[all]'"
+            )
+        else:
+            check_ok(f"Venv entry point exists ({_venv_bin.relative_to(PROJECT_ROOT)})")
+
+            # Check the symlink at the command link location
+            if _cmd_link.is_symlink():
+                _target = _cmd_link.resolve()
+                _expected = _venv_bin.resolve()
+                if _target == _expected:
+                    check_ok(f"{_cmd_link_display}/hermes → correct target")
+                else:
+                    check_warn(
+                        f"{_cmd_link_display}/hermes points to wrong target",
+                        f"(→ {_target}, expected → {_expected})"
+                    )
+                    if should_fix:
+                        _cmd_link.unlink()
+                        _cmd_link.symlink_to(_venv_bin)
+                        check_ok(f"Fixed symlink: {_cmd_link_display}/hermes → {_venv_bin}")
+                        fixed_count += 1
+                    else:
+                        issues.append(f"Broken symlink at {_cmd_link_display}/hermes — run 'hermes doctor --fix'")
+            elif _cmd_link.exists():
+                # It's a regular file, not a symlink — possibly a wrapper script
+                check_ok(f"{_cmd_link_display}/hermes exists (non-symlink)")
+            else:
+                check_fail(
+                    f"{_cmd_link_display}/hermes not found",
+                    "(hermes command may not work outside the venv)"
+                )
+                if should_fix:
+                    _cmd_link_dir.mkdir(parents=True, exist_ok=True)
+                    _cmd_link.symlink_to(_venv_bin)
+                    check_ok(f"Created symlink: {_cmd_link_display}/hermes → {_venv_bin}")
+                    fixed_count += 1
+
+                    # Check if the link dir is on PATH
+                    _path_dirs = os.environ.get("PATH", "").split(os.pathsep)
+                    if str(_cmd_link_dir) not in _path_dirs:
+                        check_warn(
+                            f"{_cmd_link_display} is not on your PATH",
+                            "(add it to your shell config: export PATH=\"$HOME/.local/bin:$PATH\")"
+                        )
+                        manual_issues.append(f"Add {_cmd_link_display} to your PATH")
+                else:
+                    issues.append(f"Missing {_cmd_link_display}/hermes symlink — run 'hermes doctor --fix'")
+
    # =========================================================================
    # Check: External tools
    # =========================================================================
@@ -726,6 +825,7 @@ def run_doctor(args):
        ("Arcee AI",         ("ARCEEAI_API_KEY",),                            "https://api.arcee.ai/api/v1/models",  "ARCEE_BASE_URL", True),
        ("DeepSeek",         ("DEEPSEEK_API_KEY",),                           "https://api.deepseek.com/v1/models",  "DEEPSEEK_BASE_URL", True),
        ("Hugging Face",     ("HF_TOKEN",),                                   "https://router.huggingface.co/v1/models", "HF_BASE_URL", True),
+        ("NVIDIA NIM",       ("NVIDIA_API_KEY",),                             "https://integrate.api.nvidia.com/v1/models", "NVIDIA_BASE_URL", True),
        ("Alibaba/DashScope", ("DASHSCOPE_API_KEY",),                         "https://dashscope-intl.aliyuncs.com/compatible-mode/v1/models", "DASHSCOPE_BASE_URL", True),
        # MiniMax: the /anthropic endpoint doesn't support /models, but the /v1 endpoint does.
        ("MiniMax",          ("MINIMAX_API_KEY",),                            "https://api.minimax.io/v1/models",    "MINIMAX_BASE_URL", True),
@@ -733,7 +833,8 @@ def run_doctor(args):
        ("Vercel AI Gateway",       ("AI_GATEWAY_API_KEY",),                          "https://ai-gateway.vercel.sh/v1/models", "AI_GATEWAY_BASE_URL", True),
        ("Kilo Code",        ("KILOCODE_API_KEY",),                            "https://api.kilo.ai/api/gateway/models",  "KILOCODE_BASE_URL", True),
        ("OpenCode Zen",     ("OPENCODE_ZEN_API_KEY",),                        "https://opencode.ai/zen/v1/models",  "OPENCODE_ZEN_BASE_URL", True),
-        ("OpenCode Go",      ("OPENCODE_GO_API_KEY",),                         "https://opencode.ai/zen/go/v1/models", "OPENCODE_GO_BASE_URL", True),
+        # OpenCode Go has no shared /models endpoint; skip the health check.
+        ("OpenCode Go",      ("OPENCODE_GO_API_KEY",),                         None,                                  "OPENCODE_GO_BASE_URL", False),
    ]
    for _pname, _env_vars, _default_url, _base_env, _supports_health_check in _apikey_providers:
        _key = ""
@@ -778,6 +879,31 @@ def run_doctor(args):
            except Exception as _e:
                print(f"\r  {color('⚠', Colors.YELLOW)} {_label} {color(f'({_e})', Colors.DIM)}           ")

+    # -- AWS Bedrock --
+    # Bedrock uses the AWS SDK credential chain, not API keys.
+    try:
+        from agent.bedrock_adapter import has_aws_credentials, resolve_aws_auth_env_var, resolve_bedrock_region
+        if has_aws_credentials():
+            _auth_var = resolve_aws_auth_env_var()
+            _region = resolve_bedrock_region()
+            _label = "AWS Bedrock".ljust(20)
+            print(f"  Checking AWS Bedrock...", end="", flush=True)
+            try:
+                import boto3
+                _br_client = boto3.client("bedrock", region_name=_region)
+                _br_resp = _br_client.list_foundation_models()
+                _model_count = len(_br_resp.get("modelSummaries", []))
+                print(f"\r  {color('✓', Colors.GREEN)} {_label} {color(f'({_auth_var}, {_region}, {_model_count} models)', Colors.DIM)}           ")
+            except ImportError:
+                print(f"\r  {color('⚠', Colors.YELLOW)} {_label} {color(f'(boto3 not installed — {sys.executable} -m pip install boto3)', Colors.DIM)}           ")
+                issues.append(f"Install boto3 for Bedrock: {sys.executable} -m pip install boto3")
+            except Exception as _e:
+                _err_name = type(_e).__name__
+                print(f"\r  {color('⚠', Colors.YELLOW)} {_label} {color(f'({_err_name}: {_e})', Colors.DIM)}           ")
+                issues.append(f"AWS Bedrock: {_err_name} — check IAM permissions for bedrock:ListFoundationModels")
+    except ImportError:
+        pass  # bedrock_adapter not available — skip silently
+
    # =========================================================================
    # Check: Submodules
    # =========================================================================
@@ -43,41 +43,20 @@ def _redact(value: str) -> str:

 def _gateway_status() -> str:
    """Return a short gateway status string."""
-    if sys.platform.startswith("linux"):
-        from hermes_constants import is_container
-        if is_container():
-            try:
-                from hermes_cli.gateway import find_gateway_pids
-                pids = find_gateway_pids()
-                if pids:
-                    return f"running (docker, pid {pids[0]})"
-                return "stopped (docker)"
-            except Exception:
-                return "stopped (docker)"
-        try:
-            from hermes_cli.gateway import get_service_name
-            svc = get_service_name()
-        except Exception:
-            svc = "hermes-gateway"
-        try:
-            r = subprocess.run(
-                ["systemctl", "--user", "is-active", svc],
-                capture_output=True, text=True, timeout=5,
-            )
-            return "running (systemd)" if r.stdout.strip() == "active" else "stopped"
-        except Exception:
-            return "unknown"
-    elif sys.platform == "darwin":
-        try:
-            from hermes_cli.gateway import get_launchd_label
-            r = subprocess.run(
-                ["launchctl", "list", get_launchd_label()],
-                capture_output=True, text=True, timeout=5,
-            )
-            return "loaded (launchd)" if r.returncode == 0 else "not loaded"
-        except Exception:
-            return "unknown"
-    return "N/A"
+    try:
+        from hermes_cli.gateway import get_gateway_runtime_snapshot
+
+        snapshot = get_gateway_runtime_snapshot()
+        if snapshot.running:
+            mode = snapshot.manager
+            if snapshot.has_process_service_mismatch:
+                mode = "manual"
+            return f"running ({mode}, pid {snapshot.gateway_pids[0]})"
+        if snapshot.service_installed and not snapshot.service_running:
+            return f"stopped ({snapshot.manager})"
+        return f"stopped ({snapshot.manager})"
+    except Exception:
+        return "unknown" if sys.platform.startswith(("linux", "darwin")) else "N/A"


 def _count_skills(hermes_home: Path) -> int:
@@ -296,6 +275,7 @@ def run_dump(args):
        ("DEEPSEEK_API_KEY", "deepseek"),
        ("DASHSCOPE_API_KEY", "dashscope"),
        ("HF_TOKEN", "huggingface"),
+        ("NVIDIA_API_KEY", "nvidia"),
        ("AI_GATEWAY_API_KEY", "ai_gateway"),
        ("OPENCODE_ZEN_API_KEY", "opencode_zen"),
        ("OPENCODE_GO_API_KEY", "opencode_go"),
@@ -8,11 +8,40 @@ from pathlib import Path
 from dotenv import load_dotenv


+# Env var name suffixes that indicate credential values.  These are the
+# only env vars whose values we sanitize on load — we must not silently
+# alter arbitrary user env vars, but credentials are known to require
+# pure ASCII (they become HTTP header values).
+_CREDENTIAL_SUFFIXES = ("_API_KEY", "_TOKEN", "_SECRET", "_KEY")
+
+
+def _sanitize_loaded_credentials() -> None:
+    """Strip non-ASCII characters from credential env vars in os.environ.
+
+    Called after dotenv loads so the rest of the codebase never sees
+    non-ASCII API keys.  Only touches env vars whose names end with
+    known credential suffixes (``_API_KEY``, ``_TOKEN``, etc.).
+    """
+    for key, value in list(os.environ.items()):
+        if not any(key.endswith(suffix) for suffix in _CREDENTIAL_SUFFIXES):
+            continue
+        try:
+            value.encode("ascii")
+        except UnicodeEncodeError:
+            os.environ[key] = value.encode("ascii", errors="ignore").decode("ascii")
+
+
 def _load_dotenv_with_fallback(path: Path, *, override: bool) -> None:
    try:
        load_dotenv(dotenv_path=path, override=override, encoding="utf-8")
    except UnicodeDecodeError:
        load_dotenv(dotenv_path=path, override=override, encoding="latin-1")
+    # Strip non-ASCII characters from credential env vars that were just
+    # loaded.  API keys must be pure ASCII since they're sent as HTTP
+    # header values (httpx encodes headers as ASCII).  Non-ASCII chars
+    # typically come from copy-pasting keys from PDFs or rich-text editors
+    # that substitute Unicode lookalike glyphs (e.g. ʋ U+028B for v).
+    _sanitize_loaded_credentials()


 def _sanitize_env_file_if_needed(path: Path) -> None:
@@ -279,8 +279,8 @@ def cmd_mcp_add(args):
        _info(f"Starting OAuth flow for '{name}'...")
        oauth_ok = False
        try:
-            from tools.mcp_oauth import build_oauth_auth
-            oauth_auth = build_oauth_auth(name, url)
+            from tools.mcp_oauth_manager import get_manager
+            oauth_auth = get_manager().get_or_build_provider(name, url, None)
            if oauth_auth:
                server_config["auth"] = "oauth"
                _success("OAuth configured (tokens will be acquired on first connection)")
@@ -428,10 +428,12 @@ def cmd_mcp_remove(args):
    _remove_mcp_server(name)
    _success(f"Removed '{name}' from config")

-    # Clean up OAuth tokens if they exist
+    # Clean up OAuth tokens if they exist — route through MCPOAuthManager so
+    # any provider instance cached in the current process (e.g. from an
+    # earlier `hermes mcp test` in the same session) is evicted too.
    try:
-        from tools.mcp_oauth import remove_oauth_tokens
-        remove_oauth_tokens(name)
+        from tools.mcp_oauth_manager import get_manager
+        get_manager().remove(name)
        _success("Cleaned up OAuth tokens")
    except Exception:
        pass
@@ -577,6 +579,63 @@ def _interpolate_value(value: str) -> str:
    return re.sub(r"\$\{(\w+)\}", _replace, value)


+# ─── hermes mcp login ────────────────────────────────────────────────────────
+
+def cmd_mcp_login(args):
+    """Force re-authentication for an OAuth-based MCP server.
+
+    Deletes cached tokens (both on disk and in the running process's
+    MCPOAuthManager cache) and triggers a fresh OAuth flow via the
+    existing probe path.
+
+    Use this when:
+      - Tokens are stuck in a bad state (server revoked, refresh token
+        consumed by an external process, etc.)
+      - You want to re-authenticate to change scopes or account
+      - A tool call returned ``needs_reauth: true``
+    """
+    name = args.name
+    servers = _get_mcp_servers()
+
+    if name not in servers:
+        _error(f"Server '{name}' not found in config.")
+        if servers:
+            _info(f"Available servers: {', '.join(servers)}")
+        return
+
+    server_config = servers[name]
+    url = server_config.get("url")
+    if not url:
+        _error(f"Server '{name}' has no URL — not an OAuth-capable server")
+        return
+    if server_config.get("auth") != "oauth":
+        _error(f"Server '{name}' is not configured for OAuth (auth={server_config.get('auth')})")
+        _info("Use `hermes mcp remove` + `hermes mcp add` to reconfigure auth.")
+        return
+
+    # Wipe both disk and in-memory cache so the next probe forces a fresh
+    # OAuth flow.
+    try:
+        from tools.mcp_oauth_manager import get_manager
+        mgr = get_manager()
+        mgr.remove(name)
+    except Exception as exc:
+        _warning(f"Could not clear existing OAuth state: {exc}")
+
+    print()
+    _info(f"Starting OAuth flow for '{name}'...")
+
+    # Probe triggers the OAuth flow (browser redirect + callback capture).
+    try:
+        tools = _probe_single_server(name, server_config)
+        if tools:
+            _success(f"Authenticated — {len(tools)} tool(s) available")
+        else:
+            _success("Authenticated (server reported no tools)")
+    except Exception as exc:
+        _error(f"Authentication failed: {exc}")
+
+
 # ─── hermes mcp configure ────────────────────────────────────────────────────

 def cmd_mcp_configure(args):
@@ -696,6 +755,7 @@ def mcp_command(args):
        "test": cmd_mcp_test,
        "configure": cmd_mcp_configure,
        "config": cmd_mcp_configure,
+        "login": cmd_mcp_login,
    }

    handler = handlers.get(action)
@@ -713,4 +773,5 @@ def mcp_command(args):
        _info("hermes mcp list                               List servers")
        _info("hermes mcp test <name>                        Test connection")
        _info("hermes mcp configure <name>                   Toggle tools")
+        _info("hermes mcp login <name>                       Re-authenticate OAuth")
        print()
@@ -58,9 +58,11 @@ def _prompt(label: str, default: str | None = None, secret: bool = False) -> str
 def _install_dependencies(provider_name: str) -> None:
    """Install pip dependencies declared in plugin.yaml."""
    import subprocess
-    from pathlib import Path as _Path
+    from plugins.memory import find_provider_dir

-    plugin_dir = _Path(__file__).parent.parent / "plugins" / "memory" / provider_name
+    plugin_dir = find_provider_dir(provider_name)
+    if not plugin_dir:
+        return
    yaml_path = plugin_dir / "plugin.yaml"
    if not yaml_path.exists():
        return
@@ -96,6 +96,7 @@ _MATCHING_PREFIX_STRIP_PROVIDERS: frozenset[str] = frozenset({
    "qwen-oauth",
    "xiaomi",
    "arcee",
+    "ollama-cloud",
    "custom",
 })

@@ -373,7 +374,26 @@ def normalize_model_for_provider(model_input: str, target_provider: str) -> str:
            return bare
        return _dots_to_hyphens(bare)

-    # --- Copilot: strip matching provider prefix, keep dots ---
+    # --- Copilot / Copilot ACP: delegate to the Copilot-specific
+    #     normalizer.  It knows about the alias table (vendor-prefix
+    #     stripping for Anthropic/OpenAI, dash-to-dot repair for Claude)
+    #     and live-catalog lookups.  Without this, vendor-prefixed or
+    #     dash-notation Claude IDs survive to the Copilot API and hit
+    #     HTTP 400 "model_not_supported".  See issue #6879.
+    if provider in {"copilot", "copilot-acp"}:
+        try:
+            from hermes_cli.models import normalize_copilot_model_id
+
+            normalized = normalize_copilot_model_id(name)
+            if normalized:
+                return normalized
+        except Exception:
+            # Fall through to the generic strip-vendor behaviour below
+            # if the Copilot-specific path is unavailable for any reason.
+            pass
+
+    # --- Copilot / Copilot ACP / openai-codex fallback:
+    #     strip matching provider prefix, keep dots ---
    if provider in _STRIP_VENDOR_ONLY_PROVIDERS:
        stripped = _strip_matching_provider_prefix(name, provider)
        if stripped == name and name.startswith("openai/"):
@@ -274,6 +274,11 @@ def parse_model_flags(raw_args: str) -> tuple[str, str, bool]:
    is_global = False
    explicit_provider = ""

+    # Normalize Unicode dashes (Telegram/iOS auto-converts -- to em/en dash)
+    # A single Unicode dash before a flag keyword becomes "--"
+    import re as _re
+    raw_args = _re.sub(r'[\u2012\u2013\u2014\u2015](provider|global)', r'--\1', raw_args)
+
    # Extract --global
    if "--global" in raw_args:
        is_global = True
@@ -452,6 +457,7 @@ def switch_model(
        ModelSwitchResult with all information the caller needs.
    """
    from hermes_cli.models import (
+        copilot_model_api_mode,
        detect_provider_for_model,
        validate_requested_model,
        opencode_model_api_mode,
@@ -686,12 +692,12 @@ def switch_model(
            api_key=api_key,
            base_url=base_url,
        )
-    except Exception:
+    except Exception as e:
        validation = {
-            "accepted": True,
-            "persist": True,
+            "accepted": False,
+            "persist": False,
            "recognized": False,
-            "message": None,
+            "message": f"Could not validate `{new_model}`: {e}",
        }

    if not validation.get("accepted"):
@@ -709,14 +715,34 @@ def switch_model(
    if validation.get("corrected_model"):
        new_model = validation["corrected_model"]

+    # --- Copilot api_mode override ---
+    if target_provider in {"copilot", "github-copilot"}:
+        api_mode = copilot_model_api_mode(new_model, api_key=api_key)
+
    # --- OpenCode api_mode override ---
-    if target_provider in {"opencode-zen", "opencode-go", "opencode", "opencode-go"}:
+    if target_provider in {"opencode-zen", "opencode-go", "opencode"}:
        api_mode = opencode_model_api_mode(target_provider, new_model)

    # --- Determine api_mode if not already set ---
    if not api_mode:
        api_mode = determine_api_mode(target_provider, base_url)

+    # OpenCode base URLs end with /v1 for OpenAI-compatible models, but the
+    # Anthropic SDK prepends its own /v1/messages to the base_url.  Strip the
+    # trailing /v1 so the SDK constructs the correct path (e.g.
+    # https://opencode.ai/zen/go/v1/messages instead of .../v1/v1/messages).
+    # Mirrors the same logic in hermes_cli.runtime_provider.resolve_runtime_provider;
+    # without it, /model switches into an anthropic_messages-routed OpenCode
+    # model (e.g. `/model minimax-m2.7` on opencode-go, `/model claude-sonnet-4-6`
+    # on opencode-zen) hit a double /v1 and returned OpenCode's website 404 page.
+    if (
+        api_mode == "anthropic_messages"
+        and target_provider in {"opencode-zen", "opencode-go"}
+        and isinstance(base_url, str)
+        and base_url
+    ):
+        base_url = re.sub(r"/v1/?$", "", base_url)
+
    # --- Get capabilities (legacy) ---
    capabilities = get_model_capabilities(target_provider, new_model)

@@ -786,7 +812,8 @@ def list_authenticated_providers(
    from hermes_cli.models import OPENROUTER_MODELS, _PROVIDER_MODELS

    results: List[dict] = []
-    seen_slugs: set = set()
+    seen_slugs: set = set()  # lowercase-normalized to catch case variants (#9545)
+    seen_mdev_ids: set = set()  # prevent duplicate entries for aliases (e.g. kimi-coding + kimi-coding-cn)

    data = fetch_models_dev()

@@ -796,9 +823,18 @@ def list_authenticated_providers(
    # "nous" shares OpenRouter's curated list if not separately defined
    if "nous" not in curated:
        curated["nous"] = curated["openrouter"]
+    # Ollama Cloud uses dynamic discovery (no static curated list)
+    if "ollama-cloud" not in curated:
+        from hermes_cli.models import fetch_ollama_cloud_models
+        curated["ollama-cloud"] = fetch_ollama_cloud_models()

    # --- 1. Check Hermes-mapped providers ---
    for hermes_id, mdev_id in PROVIDER_TO_MODELS_DEV.items():
+        # Skip aliases that map to the same models.dev provider (e.g.
+        # kimi-coding and kimi-coding-cn both → kimi-for-coding).
+        # The first one with valid credentials wins (#10526).
+        if mdev_id in seen_mdev_ids:
+            continue
        pdata = data.get(mdev_id)
        if not isinstance(pdata, dict):
            continue
@@ -837,7 +873,8 @@ def list_authenticated_providers(
            "total_models": total,
            "source": "built-in",
        })
-        seen_slugs.add(slug)
+        seen_slugs.add(slug.lower())
+        seen_mdev_ids.add(mdev_id)

    # --- 2. Check Hermes-only providers (nous, openai-codex, copilot, opencode-go) ---
    from hermes_cli.providers import HERMES_OVERLAYS
@@ -849,12 +886,12 @@ def list_authenticated_providers(
    _mdev_to_hermes = {v: k for k, v in PROVIDER_TO_MODELS_DEV.items()}

    for pid, overlay in HERMES_OVERLAYS.items():
-        if pid in seen_slugs:
+        if pid.lower() in seen_slugs:
            continue

        # Resolve Hermes slug — e.g. "github-copilot" → "copilot"
        hermes_slug = _mdev_to_hermes.get(pid, pid)
-        if hermes_slug in seen_slugs:
+        if hermes_slug.lower() in seen_slugs:
            continue

        # Check if credentials exist
@@ -935,8 +972,8 @@ def list_authenticated_providers(
            "total_models": total,
            "source": "hermes",
        })
-        seen_slugs.add(pid)
-        seen_slugs.add(hermes_slug)
+        seen_slugs.add(pid.lower())
+        seen_slugs.add(hermes_slug.lower())

    # --- 2b. Cross-check canonical provider list ---
    # Catches providers that are in CANONICAL_PROVIDERS but weren't found
@@ -948,7 +985,7 @@ def list_authenticated_providers(
        _canon_provs = []

    for _cp in _canon_provs:
-        if _cp.slug in seen_slugs:
+        if _cp.slug.lower() in seen_slugs:
            continue

        # Check credentials via PROVIDER_REGISTRY (auth.py)
@@ -995,7 +1032,7 @@ def list_authenticated_providers(
            "total_models": _cp_total,
            "source": "canonical",
        })
-        seen_slugs.add(_cp.slug)
+        seen_slugs.add(_cp.slug.lower())

    # --- 3. User-defined endpoints from config ---
    if user_providers and isinstance(user_providers, dict):
@@ -1068,7 +1105,7 @@ def list_authenticated_providers(
                groups[slug]["models"].append(default_model)

        for slug, grp in groups.items():
-            if slug in seen_slugs:
+            if slug.lower() in seen_slugs:
                continue
            results.append({
                "slug": slug,
@@ -1080,11 +1117,9 @@ def list_authenticated_providers(
                "source": "user-config",
                "api_url": grp["api_url"],
            })
-            seen_slugs.add(slug)
+            seen_slugs.add(slug.lower())

    # Sort: current provider first, then by model count descending
    results.sort(key=lambda r: (not r["is_current"], -r["total_models"]))

    return results
-
-
@@ -11,7 +11,9 @@ import json
 import os
 import urllib.request
 import urllib.error
+import time
 from difflib import get_close_matches
+from pathlib import Path
 from typing import Any, NamedTuple, Optional

 COPILOT_BASE_URL = "https://api.githubcopilot.com"
@@ -24,7 +26,9 @@ COPILOT_REASONING_EFFORTS_O_SERIES = ["low", "medium", "high"]
 # Fallback OpenRouter snapshot used when the live catalog is unavailable.
 # (model_id, display description shown in menus)
 OPENROUTER_MODELS: list[tuple[str, str]] = [
-    ("anthropic/claude-opus-4.6",       "recommended"),
+    ("moonshotai/kimi-k2.5",            "recommended"),
+    ("anthropic/claude-opus-4.7",       ""),
+    ("anthropic/claude-opus-4.6",       ""),
    ("anthropic/claude-sonnet-4.6",     ""),
    ("qwen/qwen3.6-plus",               ""),
    ("anthropic/claude-sonnet-4.5",     ""),
@@ -46,7 +50,6 @@ OPENROUTER_MODELS: list[tuple[str, str]] = [
    ("z-ai/glm-5.1",                    ""),
    ("z-ai/glm-5v-turbo",               ""),
    ("z-ai/glm-5-turbo",                ""),
-    ("moonshotai/kimi-k2.5",            ""),
    ("x-ai/grok-4.20",                  ""),
    ("nvidia/nemotron-3-super-120b-a12b",      ""),
    ("nvidia/nemotron-3-super-120b-a12b:free", "free"),
@@ -72,7 +75,9 @@ def _codex_curated_models() -> list[str]:

 _PROVIDER_MODELS: dict[str, list[str]] = {
    "nous": [
+        "moonshotai/kimi-k2.5",
        "xiaomi/mimo-v2-pro",
+        "anthropic/claude-opus-4.7",
        "anthropic/claude-opus-4.6",
        "anthropic/claude-sonnet-4.6",
        "anthropic/claude-sonnet-4.5",
@@ -92,7 +97,6 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "z-ai/glm-5.1",
        "z-ai/glm-5v-turbo",
        "z-ai/glm-5-turbo",
-        "moonshotai/kimi-k2.5",
        "x-ai/grok-4.20-beta",
        "nvidia/nemotron-3-super-120b-a12b",
        "nvidia/nemotron-3-super-120b-a12b:free",
@@ -129,9 +133,11 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "gemini-2.5-pro",
        "gemini-2.5-flash",
        "gemini-2.5-flash-lite",
-        # Gemma open models (also served via AI Studio)
-        "gemma-4-31b-it",
-        "gemma-4-26b-it",
+    ],
+    "google-gemini-cli": [
+        "gemini-2.5-pro",
+        "gemini-2.5-flash",
+        "gemini-2.5-flash-lite",
    ],
    "zai": [
        "glm-5.1",
@@ -143,21 +149,26 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "glm-4.5-flash",
    ],
    "xai": [
-        "grok-4.20-0309-reasoning",
-        "grok-4.20-0309-non-reasoning",
-        "grok-4.20-multi-agent-0309",
+        "grok-4.20-reasoning",
        "grok-4-1-fast-reasoning",
-        "grok-4-1-fast-non-reasoning",
-        "grok-4-fast-reasoning",
-        "grok-4-fast-non-reasoning",
-        "grok-4-0709",
-        "grok-code-fast-1",
-        "grok-3",
-        "grok-3-mini",
+    ],
+    "nvidia": [
+        # NVIDIA flagship reasoning models
+        "nvidia/nemotron-3-super-120b-a12b",
+        "nvidia/nemotron-3-nano-30b-a3b",
+        "nvidia/llama-3.3-nemotron-super-49b-v1.5",
+        # Third-party agentic models hosted on build.nvidia.com
+        # (map to OpenRouter defaults — users get familiar picks on NIM)
+        "qwen/qwen3.5-397b-a17b",
+        "deepseek-ai/deepseek-v3.2",
+        "moonshotai/kimi-k2.5",
+        "minimaxai/minimax-m2.5",
+        "z-ai/glm5",
+        "openai/gpt-oss-120b",
    ],
    "kimi-coding": [
-        "kimi-for-coding",
        "kimi-k2.5",
+        "kimi-for-coding",
        "kimi-k2-thinking",
        "kimi-k2-thinking-turbo",
        "kimi-k2-turbo-preview",
@@ -188,6 +199,7 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "MiniMax-M2",
    ],
    "anthropic": [
+        "claude-opus-4-7",
        "claude-opus-4-6",
        "claude-sonnet-4-6",
        "claude-opus-4-5-20251101",
@@ -211,6 +223,7 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "trinity-mini",
    ],
    "opencode-zen": [
+        "kimi-k2.5",
        "gpt-5.4-pro",
        "gpt-5.4",
        "gpt-5.3-codex",
@@ -242,15 +255,15 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "glm-5",
        "glm-4.7",
        "glm-4.6",
-        "kimi-k2.5",
        "kimi-k2-thinking",
        "kimi-k2",
        "qwen3-coder",
        "big-pickle",
    ],
    "opencode-go": [
-        "glm-5",
        "kimi-k2.5",
+        "glm-5.1",
+        "glm-5",
        "mimo-v2-pro",
        "mimo-v2-omni",
        "minimax-m2.7",
@@ -283,26 +296,42 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
    # to https://dashscope-intl.aliyuncs.com/compatible-mode/v1 (OpenAI-compat)
    # or https://dashscope-intl.aliyuncs.com/apps/anthropic (Anthropic-compat).
    "alibaba": [
+        "kimi-k2.5",
        "qwen3.5-plus",
        "qwen3-coder-plus",
        "qwen3-coder-next",
        # Third-party models available on coding-intl
        "glm-5",
        "glm-4.7",
-        "kimi-k2.5",
        "MiniMax-M2.5",
    ],
    # Curated HF model list — only agentic models that map to OpenRouter defaults.
    "huggingface": [
+        "moonshotai/Kimi-K2.5",
        "Qwen/Qwen3.5-397B-A17B",
        "Qwen/Qwen3.5-35B-A3B",
        "deepseek-ai/DeepSeek-V3.2",
-        "moonshotai/Kimi-K2.5",
        "MiniMaxAI/MiniMax-M2.5",
        "zai-org/GLM-5",
        "XiaomiMiMo/MiMo-V2-Flash",
        "moonshotai/Kimi-K2-Thinking",
    ],
+    # AWS Bedrock — static fallback list used when dynamic discovery is
+    # unavailable (no boto3, no credentials, or API error).  The agent
+    # prefers live discovery via ListFoundationModels + ListInferenceProfiles.
+    # Use inference profile IDs (us.*) since most models require them.
+    "bedrock": [
+        "us.anthropic.claude-sonnet-4-6",
+        "us.anthropic.claude-opus-4-6-v1",
+        "us.anthropic.claude-haiku-4-5-20251001-v1:0",
+        "us.anthropic.claude-sonnet-4-5-20250929-v1:0",
+        "us.amazon.nova-pro-v1:0",
+        "us.amazon.nova-lite-v1:0",
+        "us.amazon.nova-micro-v1:0",
+        "deepseek.v3.2",
+        "us.meta.llama4-maverick-17b-instruct-v1:0",
+        "us.meta.llama4-scout-17b-instruct-v1:0",
+    ],
 }

 # ---------------------------------------------------------------------------
@@ -518,30 +547,35 @@ CANONICAL_PROVIDERS: list[ProviderEntry] = [
    ProviderEntry("anthropic",      "Anthropic",                "Anthropic (Claude models — API key or Claude Code)"),
    ProviderEntry("openai-codex",   "OpenAI Codex",             "OpenAI Codex"),
    ProviderEntry("xiaomi",         "Xiaomi MiMo",              "Xiaomi MiMo (MiMo-V2 models — pro, omni, flash)"),
+    ProviderEntry("nvidia",         "NVIDIA NIM",               "NVIDIA NIM (Nemotron models — build.nvidia.com or local NIM)"),
    ProviderEntry("qwen-oauth",     "Qwen OAuth (Portal)",      "Qwen OAuth (reuses local Qwen CLI login)"),
    ProviderEntry("copilot",        "GitHub Copilot",           "GitHub Copilot (uses GITHUB_TOKEN or gh auth token)"),
    ProviderEntry("copilot-acp",    "GitHub Copilot ACP",       "GitHub Copilot ACP (spawns `copilot --acp --stdio`)"),
    ProviderEntry("huggingface",    "Hugging Face",             "Hugging Face Inference Providers (20+ open models)"),
    ProviderEntry("gemini",         "Google AI Studio",         "Google AI Studio (Gemini models — OpenAI-compatible endpoint)"),
+    ProviderEntry("google-gemini-cli", "Google Gemini (OAuth)",   "Google Gemini via OAuth + Code Assist (free tier supported; no API key needed)"),
    ProviderEntry("deepseek",       "DeepSeek",                 "DeepSeek (DeepSeek-V3, R1, coder — direct API)"),
    ProviderEntry("xai",            "xAI",                      "xAI (Grok models — direct API)"),
    ProviderEntry("zai",            "Z.AI / GLM",               "Z.AI / GLM (Zhipu AI direct API)"),
-    ProviderEntry("kimi-coding",    "Kimi / Moonshot",          "Kimi / Moonshot (Moonshot AI direct API)"),
+    ProviderEntry("kimi-coding",    "Kimi / Kimi Coding Plan",  "Kimi Coding Plan (api.kimi.com) & Moonshot API"),
    ProviderEntry("kimi-coding-cn", "Kimi / Moonshot (China)",  "Kimi / Moonshot China (Moonshot CN direct API)"),
    ProviderEntry("minimax",        "MiniMax",                  "MiniMax (global direct API)"),
    ProviderEntry("minimax-cn",     "MiniMax (China)",          "MiniMax China (domestic direct API)"),
    ProviderEntry("alibaba",        "Alibaba Cloud (DashScope)","Alibaba Cloud / DashScope Coding (Qwen + multi-provider)"),
+    ProviderEntry("ollama-cloud",   "Ollama Cloud",             "Ollama Cloud (cloud-hosted open models — ollama.com)"),
    ProviderEntry("arcee",          "Arcee AI",                 "Arcee AI (Trinity models — direct API)"),
    ProviderEntry("kilocode",       "Kilo Code",                "Kilo Code (Kilo Gateway API)"),
    ProviderEntry("opencode-zen",   "OpenCode Zen",             "OpenCode Zen (35+ curated models, pay-as-you-go)"),
    ProviderEntry("opencode-go",    "OpenCode Go",              "OpenCode Go (open models, $10/month subscription)"),
    ProviderEntry("ai-gateway",     "Vercel AI Gateway",        "Vercel AI Gateway (200+ models, pay-per-use)"),
+    ProviderEntry("bedrock",        "AWS Bedrock",              "AWS Bedrock (Claude, Nova, Llama, DeepSeek — IAM or API key)"),
 ]

 # Derived dicts — used throughout the codebase
 _PROVIDER_LABELS = {p.slug: p.label for p in CANONICAL_PROVIDERS}
 _PROVIDER_LABELS["custom"] = "Custom endpoint"  # special case: not a named provider

+
 _PROVIDER_ALIASES = {
    "glm": "zai",
    "z-ai": "zai",
@@ -582,14 +616,26 @@ _PROVIDER_ALIASES = {
    "qwen": "alibaba",
    "alibaba-cloud": "alibaba",
    "qwen-portal": "qwen-oauth",
+    "gemini-cli": "google-gemini-cli",
+    "gemini-oauth": "google-gemini-cli",
    "hf": "huggingface",
    "hugging-face": "huggingface",
    "huggingface-hub": "huggingface",
    "mimo": "xiaomi",
    "xiaomi-mimo": "xiaomi",
+    "aws": "bedrock",
+    "aws-bedrock": "bedrock",
+    "amazon-bedrock": "bedrock",
+    "amazon": "bedrock",
    "grok": "xai",
    "x-ai": "xai",
    "x.ai": "xai",
+    "nim": "nvidia",
+    "nvidia-nim": "nvidia",
+    "build-nvidia": "nvidia",
+    "nemotron": "nvidia",
+    "ollama": "custom",  # bare "ollama" = local; use "ollama-cloud" for cloud
+    "ollama_cloud": "ollama-cloud",
 }


@@ -1026,7 +1072,7 @@ def detect_provider_for_model(
            return (resolved_provider, default_models[0])

    # Aggregators list other providers' models — never auto-switch TO them
-    _AGGREGATORS = {"nous", "openrouter"}
+    _AGGREGATORS = {"nous", "openrouter", "ai-gateway", "copilot", "kilocode"}

    # If the model belongs to the current provider's catalog, don't suggest switching
    current_models = _PROVIDER_MODELS.get(current_provider, [])
@@ -1043,7 +1089,8 @@ def detect_provider_for_model(
            break

    if direct_match:
-        # Check if we have credentials for this provider
+        # Check if we have credentials for this provider — env vars,
+        # credential pool, or auth store entries.
        has_creds = False
        try:
            from hermes_cli.auth import PROVIDER_REGISTRY
@@ -1056,16 +1103,28 @@ def detect_provider_for_model(
                        break
        except Exception:
            pass
+        # Also check credential pool and auth store — covers OAuth,
+        # Claude Code tokens, and other non-env-var credentials (#10300).
+        if not has_creds:
+            try:
+                from agent.credential_pool import load_pool
+                pool = load_pool(direct_match)
+                if pool.has_credentials():
+                    has_creds = True
+            except Exception:
+                pass
+        if not has_creds:
+            try:
+                from hermes_cli.auth import _load_auth_store
+                store = _load_auth_store()
+                if direct_match in store.get("providers", {}) or direct_match in store.get("credential_pool", {}):
+                    has_creds = True
+            except Exception:
+                pass

-        if has_creds:
-            return (direct_match, name)
-
-        # No direct creds — try to find this model on OpenRouter instead
-        or_slug = _find_openrouter_slug(name)
-        if or_slug:
-            return ("openrouter", or_slug)
-        # Still return the direct provider — credential resolution will
-        # give a clear error rather than silently using the wrong provider
+        # Always return the direct provider match.  If credentials are
+        # missing, the client init will give a clear error rather than
+        # silently routing through the wrong provider (#10300).
        return (direct_match, name)

    # --- Step 2: check OpenRouter catalog ---
@@ -1255,6 +1314,10 @@ def provider_model_ids(provider: Optional[str], *, force_refresh: bool = False)
        live = _fetch_ai_gateway_models()
        if live:
            return live
+    if normalized == "ollama-cloud":
+        live = fetch_ollama_cloud_models(force_refresh=force_refresh)
+        if live:
+            return live
    if normalized == "custom":
        base_url = _get_custom_base_url()
        if base_url:
@@ -1441,6 +1504,19 @@ _COPILOT_MODEL_ALIASES = {
    "anthropic/claude-sonnet-4.6": "claude-sonnet-4.6",
    "anthropic/claude-sonnet-4.5": "claude-sonnet-4.5",
    "anthropic/claude-haiku-4.5": "claude-haiku-4.5",
+    # Dash-notation fallbacks: Hermes' default Claude IDs elsewhere use
+    # hyphens (anthropic native format), but Copilot's API only accepts
+    # dot-notation.  Accept both so users who configure copilot + a
+    # default hyphenated Claude model don't hit HTTP 400
+    # "model_not_supported".  See issue #6879.
+    "claude-opus-4-6": "claude-opus-4.6",
+    "claude-sonnet-4-6": "claude-sonnet-4.6",
+    "claude-sonnet-4-5": "claude-sonnet-4.5",
+    "claude-haiku-4-5": "claude-haiku-4.5",
+    "anthropic/claude-opus-4-6": "claude-opus-4.6",
+    "anthropic/claude-sonnet-4-6": "claude-sonnet-4.6",
+    "anthropic/claude-sonnet-4-5": "claude-sonnet-4.5",
+    "anthropic/claude-haiku-4-5": "claude-haiku-4.5",
 }


@@ -1539,6 +1615,11 @@ def copilot_model_api_mode(
    primary signal.  Falls back to the catalog's ``supported_endpoints``
    only for models not covered by the pattern check.
    """
+    # Fetch the catalog once so normalize + endpoint check share it
+    # (avoids two redundant network calls for non-GPT-5 models).
+    if catalog is None and api_key:
+        catalog = fetch_github_model_catalog(api_key=api_key)
+
    normalized = normalize_copilot_model_id(model_id, catalog=catalog, api_key=api_key)
    if not normalized:
        return "chat_completions"
@@ -1548,9 +1629,6 @@ def copilot_model_api_mode(
        return "codex_responses"

    # Secondary: check catalog for non-GPT-5 models (Claude via /v1/messages, etc.)
-    if catalog is None and api_key:
-        catalog = fetch_github_model_catalog(api_key=api_key)
-
    if catalog:
        catalog_entry = next((item for item in catalog if item.get("id") == normalized), None)
        if isinstance(catalog_entry, dict):
@@ -1765,6 +1843,125 @@ def fetch_api_models(
    return probe_api_models(api_key, base_url, timeout=timeout).get("models")


+# ---------------------------------------------------------------------------
+# Ollama Cloud — merged model discovery with disk cache
+# ---------------------------------------------------------------------------
+
+
+
+_OLLAMA_CLOUD_CACHE_TTL = 3600  # 1 hour
+
+
+def _ollama_cloud_cache_path() -> Path:
+    """Return the path for the Ollama Cloud model cache."""
+    from hermes_constants import get_hermes_home
+    return get_hermes_home() / "ollama_cloud_models_cache.json"
+
+
+def _load_ollama_cloud_cache(*, ignore_ttl: bool = False) -> Optional[dict]:
+    """Load cached Ollama Cloud models from disk.
+
+    Args:
+        ignore_ttl: If True, return data even if the TTL has expired (stale fallback).
+    """
+    try:
+        cache_path = _ollama_cloud_cache_path()
+        if not cache_path.exists():
+            return None
+        with open(cache_path, encoding="utf-8") as f:
+            data = json.load(f)
+        if not isinstance(data, dict):
+            return None
+        models = data.get("models")
+        if not (isinstance(models, list) and models):
+            return None
+        if not ignore_ttl:
+            cached_at = data.get("cached_at", 0)
+            if (time.time() - cached_at) > _OLLAMA_CLOUD_CACHE_TTL:
+                return None  # stale
+        return data
+    except Exception:
+        pass
+    return None
+
+
+def _save_ollama_cloud_cache(models: list[str]) -> None:
+    """Persist the merged Ollama Cloud model list to disk."""
+    try:
+        from utils import atomic_json_write
+        cache_path = _ollama_cloud_cache_path()
+        cache_path.parent.mkdir(parents=True, exist_ok=True)
+        atomic_json_write(cache_path, {"models": models, "cached_at": time.time()}, indent=None)
+    except Exception:
+        pass
+
+
+def fetch_ollama_cloud_models(
+    api_key: Optional[str] = None,
+    base_url: Optional[str] = None,
+    *,
+    force_refresh: bool = False,
+) -> list[str]:
+    """Fetch Ollama Cloud models by merging live API + models.dev, with disk cache.
+
+    Resolution order:
+      1. Disk cache (if fresh, < 1 hour, and not force_refresh)
+      2. Live ``/v1/models`` endpoint (primary — freshest source)
+      3. models.dev registry (secondary — fills gaps for unlisted models)
+      4. Merge: live models first, then models.dev additions (deduped)
+
+    Returns a list of model IDs (never None — empty list on total failure).
+    """
+    # 1. Check disk cache
+    if not force_refresh:
+        cached = _load_ollama_cloud_cache()
+        if cached is not None:
+            return cached["models"]
+
+    # 2. Live API probe
+    if not api_key:
+        api_key = os.getenv("OLLAMA_API_KEY", "")
+    if not base_url:
+        base_url = os.getenv("OLLAMA_BASE_URL", "") or "https://ollama.com/v1"
+
+    live_models: list[str] = []
+    if api_key:
+        result = fetch_api_models(api_key, base_url, timeout=8.0)
+        if result:
+            live_models = result
+
+    # 3. models.dev registry
+    mdev_models: list[str] = []
+    try:
+        from agent.models_dev import list_agentic_models
+        mdev_models = list_agentic_models("ollama-cloud")
+    except Exception:
+        pass
+
+    # 4. Merge: live first, then models.dev additions (deduped, order-preserving)
+    if live_models or mdev_models:
+        seen: set[str] = set()
+        merged: list[str] = []
+        for m in live_models:
+            if m and m not in seen:
+                seen.add(m)
+                merged.append(m)
+        for m in mdev_models:
+            if m and m not in seen:
+                seen.add(m)
+                merged.append(m)
+        if merged:
+            _save_ollama_cloud_cache(merged)
+            return merged
+
+    # Total failure — return stale cache if available (ignore TTL)
+    stale = _load_ollama_cloud_cache(ignore_ttl=True)
+    if stale is not None:
+        return stale["models"]
+
+    return []
+
+
 def validate_requested_model(
    model_name: str,
    provider: Optional[str],
@@ -1851,8 +2048,8 @@ def validate_requested_model(
                )

            return {
-                "accepted": True,
-                "persist": True,
+                "accepted": False,
+                "persist": False,
                "recognized": False,
                "message": message,
            }
@@ -1865,8 +2062,8 @@ def validate_requested_model(
            message += f"\n  If this server expects `/v1`, try base URL: `{probe.get('suggested_base_url')}`"

        return {
-            "accepted": True,
-            "persist": True,
+            "accepted": False,
+            "persist": False,
            "recognized": False,
            "message": message,
        }
@@ -1900,12 +2097,11 @@ def validate_requested_model(
            if suggestions:
                suggestion_text = "\n  Similar models: " + ", ".join(f"`{s}`" for s in suggestions)
            return {
-                "accepted": True,
-                "persist": True,
+                "accepted": False,
+                "persist": False,
                "recognized": False,
                "message": (
-                    f"Note: `{requested}` was not found in the OpenAI Codex model listing. "
-                    f"It may still work if your account has access to it."
+                    f"Model `{requested}` was not found in the OpenAI Codex model listing."
                    f"{suggestion_text}"
                ),
            }
@@ -1944,23 +2140,58 @@ def validate_requested_model(
            if suggestions:
                suggestion_text = "\n  Similar models: " + ", ".join(f"`{s}`" for s in suggestions)

+        return {
+            "accepted": False,
+            "persist": False,
+            "recognized": False,
+            "message": (
+                f"Model `{requested}` was not found in this provider's model listing."
+                f"{suggestion_text}"
+            ),
+        }
+
+    # api_models is None — couldn't reach API.  Accept and persist,
+    # but warn so typos don't silently break things.
+
+    # Bedrock: use our own discovery instead of HTTP /models endpoint.
+    # Bedrock's bedrock-runtime URL doesn't support /models — it uses the
+    # AWS SDK control plane (ListFoundationModels + ListInferenceProfiles).
+    if normalized == "bedrock":
+        try:
+            from agent.bedrock_adapter import discover_bedrock_models, resolve_bedrock_region
+            region = resolve_bedrock_region()
+            discovered = discover_bedrock_models(region)
+            discovered_ids = {m["id"] for m in discovered}
+            if requested in discovered_ids:
+                return {
+                    "accepted": True,
+                    "persist": True,
+                    "recognized": True,
+                    "message": None,
+                }
+            # Not in discovered list — still accept (user may have custom
+            # inference profiles or cross-account access), but warn.
+            suggestions = get_close_matches(requested, list(discovered_ids), n=3, cutoff=0.4)
+            suggestion_text = ""
+            if suggestions:
+                suggestion_text = "\n  Similar models: " + ", ".join(f"`{s}`" for s in suggestions)
            return {
                "accepted": True,
                "persist": True,
                "recognized": False,
                "message": (
-                    f"Note: `{requested}` was not found in this provider's model listing. "
-                    f"It may still work if your plan supports it."
+                    f"Note: `{requested}` was not found in Bedrock model discovery for {region}. "
+                    f"It may still work with custom inference profiles or cross-account access."
                    f"{suggestion_text}"
                ),
            }
+        except Exception:
+            pass  # Fall through to generic warning

-    # api_models is None — couldn't reach API.  Accept and persist,
-    # but warn so typos don't silently break things.
    provider_label = _PROVIDER_LABELS.get(normalized, normalized)
    return {
-        "accepted": True,
-        "persist": True,
+        "accepted": False,
+        "persist": False,
        "recognized": False,
        "message": (
            f"Could not reach the {provider_label} API to validate `{requested}`. "
@@ -143,6 +143,7 @@ def _tts_label(current_provider: str) -> str:
        "openai": "OpenAI TTS",
        "elevenlabs": "ElevenLabs",
        "edge": "Edge TTS",
+        "xai": "xAI TTS",
        "mistral": "Mistral Voxtral TTS",
        "neutts": "NeuTTS",
    }
@@ -257,6 +258,15 @@ def get_nous_subscription_features(
        terminal_cfg.get("modal_mode")
    )

+    # use_gateway flags — when True, the user explicitly opted into the
+    # Tool Gateway via `hermes model`, so direct credentials should NOT
+    # prevent gateway routing.
+    web_use_gateway = bool(web_cfg.get("use_gateway"))
+    tts_use_gateway = bool(tts_cfg.get("use_gateway"))
+    browser_use_gateway = bool(browser_cfg.get("use_gateway"))
+    image_gen_cfg = config.get("image_gen") if isinstance(config.get("image_gen"), dict) else {}
+    image_use_gateway = bool(image_gen_cfg.get("use_gateway"))
+
    direct_exa = bool(get_env_value("EXA_API_KEY"))
    direct_firecrawl = bool(get_env_value("FIRECRAWL_API_KEY") or get_env_value("FIRECRAWL_API_URL"))
    direct_parallel = bool(get_env_value("PARALLEL_API_KEY"))
@@ -269,6 +279,21 @@ def get_nous_subscription_features(
    direct_browser_use = bool(get_env_value("BROWSER_USE_API_KEY"))
    direct_modal = has_direct_modal_credentials()

+    # When use_gateway is set, suppress direct credentials for managed detection
+    if web_use_gateway:
+        direct_firecrawl = False
+        direct_exa = False
+        direct_parallel = False
+        direct_tavily = False
+    if image_use_gateway:
+        direct_fal = False
+    if tts_use_gateway:
+        direct_openai_tts = False
+        direct_elevenlabs = False
+    if browser_use_gateway:
+        direct_browser_use = False
+        direct_browserbase = False
+
    managed_web_available = managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("firecrawl")
    managed_image_available = managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("fal-queue")
    managed_tts_available = managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("openai-audio")
@@ -439,37 +464,7 @@ def get_nous_subscription_features(
    )


-def get_nous_subscription_explainer_lines() -> list[str]:
-    if not managed_nous_tools_enabled():
-        return []

-    return [
-        "Nous subscription enables managed web tools, image generation, OpenAI TTS, and browser automation by default.",
-        "Those managed tools bill to your Nous subscription. Modal execution is optional and can bill to your subscription too.",
-        "Change these later with: hermes setup tools, hermes setup terminal, or hermes status.",
-    ]
-
-
-def apply_nous_provider_defaults(config: Dict[str, object]) -> set[str]:
-    """Apply provider-level Nous defaults shared by `hermes setup` and `hermes model`."""
-    if not managed_nous_tools_enabled():
-        return set()
-
-    features = get_nous_subscription_features(config)
-    if not features.provider_is_nous:
-        return set()
-
-    tts_cfg = config.get("tts")
-    if not isinstance(tts_cfg, dict):
-        tts_cfg = {}
-        config["tts"] = tts_cfg
-
-    current_tts = str(tts_cfg.get("provider") or "edge").strip().lower()
-    if current_tts not in {"", "edge"}:
-        return set()
-
-    tts_cfg["provider"] = "openai"
-    return {"tts"}


 def apply_nous_managed_defaults(
@@ -529,3 +524,255 @@ def apply_nous_managed_defaults(
        changed.add("image_gen")

    return changed
+
+
+# ---------------------------------------------------------------------------
+# Tool Gateway offer — single Y/n prompt after model selection
+# ---------------------------------------------------------------------------
+
+_GATEWAY_TOOL_LABELS = {
+    "web": "Web search & extract (Firecrawl)",
+    "image_gen": "Image generation (FAL)",
+    "tts": "Text-to-speech (OpenAI TTS)",
+    "browser": "Browser automation (Browser Use)",
+}
+
+
+def _get_gateway_direct_credentials() -> Dict[str, bool]:
+    """Return a dict of tool_key -> has_direct_credentials."""
+    return {
+        "web": bool(
+            get_env_value("FIRECRAWL_API_KEY")
+            or get_env_value("FIRECRAWL_API_URL")
+            or get_env_value("PARALLEL_API_KEY")
+            or get_env_value("TAVILY_API_KEY")
+            or get_env_value("EXA_API_KEY")
+        ),
+        "image_gen": bool(get_env_value("FAL_KEY")),
+        "tts": bool(
+            resolve_openai_audio_api_key()
+            or get_env_value("ELEVENLABS_API_KEY")
+        ),
+        "browser": bool(
+            get_env_value("BROWSER_USE_API_KEY")
+            or (get_env_value("BROWSERBASE_API_KEY") and get_env_value("BROWSERBASE_PROJECT_ID"))
+        ),
+    }
+
+
+_GATEWAY_DIRECT_LABELS = {
+    "web": "Firecrawl/Exa/Parallel/Tavily key",
+    "image_gen": "FAL key",
+    "tts": "OpenAI/ElevenLabs key",
+    "browser": "Browser Use/Browserbase key",
+}
+
+_ALL_GATEWAY_KEYS = ("web", "image_gen", "tts", "browser")
+
+
+def get_gateway_eligible_tools(
+    config: Optional[Dict[str, object]] = None,
+) -> tuple[list[str], list[str], list[str]]:
+    """Return (unconfigured, has_direct, already_managed) tool key lists.
+
+    - unconfigured: tools with no direct credentials (easy switch)
+    - has_direct: tools where the user has their own API keys
+    - already_managed: tools already routed through the gateway
+
+    All lists are empty when the user is not a paid Nous subscriber or
+    is not using Nous as their provider.
+    """
+    if not managed_nous_tools_enabled():
+        return [], [], []
+
+    if config is None:
+        from hermes_cli.config import load_config
+        config = load_config() or {}
+
+    # Quick provider check without the heavy get_nous_subscription_features call
+    model_cfg = config.get("model")
+    if not isinstance(model_cfg, dict) or str(model_cfg.get("provider") or "").strip().lower() != "nous":
+        return [], [], []
+
+    direct = _get_gateway_direct_credentials()
+
+    # Check which tools the user has explicitly opted into the gateway for.
+    # This is distinct from managed_by_nous which fires implicitly when
+    # no direct keys exist — we only skip the prompt for tools where
+    # use_gateway was explicitly set.
+    opted_in = {
+        "web": bool((config.get("web") if isinstance(config.get("web"), dict) else {}).get("use_gateway")),
+        "image_gen": bool((config.get("image_gen") if isinstance(config.get("image_gen"), dict) else {}).get("use_gateway")),
+        "tts": bool((config.get("tts") if isinstance(config.get("tts"), dict) else {}).get("use_gateway")),
+        "browser": bool((config.get("browser") if isinstance(config.get("browser"), dict) else {}).get("use_gateway")),
+    }
+
+    unconfigured: list[str] = []
+    has_direct: list[str] = []
+    already_managed: list[str] = []
+    for key in _ALL_GATEWAY_KEYS:
+        if opted_in.get(key):
+            already_managed.append(key)
+        elif direct.get(key):
+            has_direct.append(key)
+        else:
+            unconfigured.append(key)
+    return unconfigured, has_direct, already_managed
+
+
+def apply_gateway_defaults(
+    config: Dict[str, object],
+    tool_keys: list[str],
+) -> set[str]:
+    """Apply Tool Gateway config for the given tool keys.
+
+    Sets ``use_gateway: true`` in each tool's config section so the
+    runtime prefers the gateway even when direct API keys are present.
+
+    Returns the set of tools that were actually changed.
+    """
+    changed: set[str] = set()
+
+    web_cfg = config.get("web")
+    if not isinstance(web_cfg, dict):
+        web_cfg = {}
+        config["web"] = web_cfg
+
+    tts_cfg = config.get("tts")
+    if not isinstance(tts_cfg, dict):
+        tts_cfg = {}
+        config["tts"] = tts_cfg
+
+    browser_cfg = config.get("browser")
+    if not isinstance(browser_cfg, dict):
+        browser_cfg = {}
+        config["browser"] = browser_cfg
+
+    if "web" in tool_keys:
+        web_cfg["backend"] = "firecrawl"
+        web_cfg["use_gateway"] = True
+        changed.add("web")
+
+    if "tts" in tool_keys:
+        tts_cfg["provider"] = "openai"
+        tts_cfg["use_gateway"] = True
+        changed.add("tts")
+
+    if "browser" in tool_keys:
+        browser_cfg["cloud_provider"] = "browser-use"
+        browser_cfg["use_gateway"] = True
+        changed.add("browser")
+
+    if "image_gen" in tool_keys:
+        image_cfg = config.get("image_gen")
+        if not isinstance(image_cfg, dict):
+            image_cfg = {}
+            config["image_gen"] = image_cfg
+        image_cfg["use_gateway"] = True
+        changed.add("image_gen")
+
+    return changed
+
+
+def prompt_enable_tool_gateway(config: Dict[str, object]) -> set[str]:
+    """If eligible tools exist, prompt the user to enable the Tool Gateway.
+
+    Uses prompt_choice() with a description parameter so the curses TUI
+    shows the tool context alongside the choices.
+
+    Returns the set of tools that were enabled, or empty set if the user
+    declined or no tools were eligible.
+    """
+    unconfigured, has_direct, already_managed = get_gateway_eligible_tools(config)
+    if not unconfigured and not has_direct:
+        return set()
+
+    try:
+        from hermes_cli.setup import prompt_choice
+    except Exception:
+        return set()
+
+    # Build description lines showing full status of all gateway tools
+    desc_parts: list[str] = [
+        "",
+        "  The Tool Gateway gives you access to web search, image generation,",
+        "  text-to-speech, and browser automation through your Nous subscription.",
+        "  No need to sign up for separate API keys — just pick the tools you want.",
+        "",
+    ]
+    if already_managed:
+        for k in already_managed:
+            desc_parts.append(f"  ✓ {_GATEWAY_TOOL_LABELS[k]} — using Tool Gateway")
+    if unconfigured:
+        for k in unconfigured:
+            desc_parts.append(f"  ○ {_GATEWAY_TOOL_LABELS[k]} — not configured")
+    if has_direct:
+        for k in has_direct:
+            desc_parts.append(f"  ○ {_GATEWAY_TOOL_LABELS[k]} — using {_GATEWAY_DIRECT_LABELS[k]}")
+
+    # Build short choice labels — detail is in the description above
+    choices: list[str] = []
+    choice_keys: list[str] = []  # maps choice index -> action
+
+    if unconfigured and has_direct:
+        choices.append("Enable for all tools (existing keys kept, not used)")
+        choice_keys.append("all")
+
+        choices.append("Enable only for tools without existing keys")
+        choice_keys.append("unconfigured")
+
+        choices.append("Skip")
+        choice_keys.append("skip")
+
+    elif unconfigured:
+        choices.append("Enable Tool Gateway")
+        choice_keys.append("unconfigured")
+
+        choices.append("Skip")
+        choice_keys.append("skip")
+
+    else:
+        choices.append("Enable Tool Gateway (existing keys kept, not used)")
+        choice_keys.append("all")
+
+        choices.append("Skip")
+        choice_keys.append("skip")
+
+    description = "\n".join(desc_parts) if desc_parts else None
+    # Default to "Enable" when user has no direct keys (new user),
+    # default to "Skip" when they have existing keys to preserve.
+    default_idx = 0 if not has_direct else len(choices) - 1
+
+    try:
+        idx = prompt_choice(
+            "Your Nous subscription includes the Tool Gateway.",
+            choices,
+            default_idx,
+            description=description,
+        )
+    except (KeyboardInterrupt, EOFError, OSError, SystemExit):
+        return set()
+
+    action = choice_keys[idx]
+    if action == "skip":
+        return set()
+
+    if action == "all":
+        # Apply to switchable tools + ensure already-managed tools also
+        # have use_gateway persisted in config for consistency.
+        to_apply = list(_ALL_GATEWAY_KEYS)
+    else:
+        to_apply = unconfigured
+
+    changed = apply_gateway_defaults(config, to_apply)
+    if changed:
+        from hermes_cli.config import save_config
+        save_config(config)
+        # Only report the tools that actually switched (not already-managed ones)
+        newly_switched = changed - set(already_managed)
+        for key in sorted(newly_switched):
+            label = _GATEWAY_TOOL_LABELS.get(key, key)
+            print(f"  ✓ {label}: enabled via Nous subscription")
+        if already_managed and not newly_switched:
+            print("  (all tools already using Tool Gateway)")
+    return changed
@@ -112,6 +112,7 @@ class LoadedPlugin:
    module: Optional[types.ModuleType] = None
    tools_registered: List[str] = field(default_factory=list)
    hooks_registered: List[str] = field(default_factory=list)
+    commands_registered: List[str] = field(default_factory=list)
    enabled: bool = False
    error: Optional[str] = None

@@ -211,6 +212,84 @@ class PluginContext:
        }
        logger.debug("Plugin %s registered CLI command: %s", self.manifest.name, name)

+    # -- slash command registration -------------------------------------------
+
+    def register_command(
+        self,
+        name: str,
+        handler: Callable,
+        description: str = "",
+    ) -> None:
+        """Register a slash command (e.g. ``/lcm``) available in CLI and gateway sessions.
+
+        The handler signature is ``fn(raw_args: str) -> str | None``.
+        It may also be an async callable — the gateway dispatch handles both.
+
+        Unlike ``register_cli_command()`` (which creates ``hermes <subcommand>``
+        terminal commands), this registers in-session slash commands that users
+        invoke during a conversation.
+
+        Names conflicting with built-in commands are rejected with a warning.
+        """
+        clean = name.lower().strip().lstrip("/").replace(" ", "-")
+        if not clean:
+            logger.warning(
+                "Plugin '%s' tried to register a command with an empty name.",
+                self.manifest.name,
+            )
+            return
+
+        # Reject if it conflicts with a built-in command
+        try:
+            from hermes_cli.commands import resolve_command
+            if resolve_command(clean) is not None:
+                logger.warning(
+                    "Plugin '%s' tried to register command '/%s' which conflicts "
+                    "with a built-in command. Skipping.",
+                    self.manifest.name, clean,
+                )
+                return
+        except Exception:
+            pass  # If commands module isn't available, skip the check
+
+        self._manager._plugin_commands[clean] = {
+            "handler": handler,
+            "description": description or "Plugin command",
+            "plugin": self.manifest.name,
+        }
+        logger.debug("Plugin %s registered command: /%s", self.manifest.name, clean)
+
+    # -- tool dispatch -------------------------------------------------------
+
+    def dispatch_tool(self, tool_name: str, args: dict, **kwargs) -> str:
+        """Dispatch a tool call through the registry, with parent agent context.
+
+        This is the public interface for plugin slash commands that need to call
+        tools like ``delegate_task`` without reaching into framework internals.
+        The parent agent (if available) is resolved automatically — plugins never
+        need to access the agent directly.
+
+        Args:
+            tool_name: Registry name of the tool (e.g. ``"delegate_task"``).
+            args: Tool arguments dict (same as what the model would pass).
+            **kwargs: Extra keyword args forwarded to the registry dispatch.
+
+        Returns:
+            JSON string from the tool handler (same format as model tool calls).
+        """
+        from tools.registry import registry
+
+        # Wire up parent agent context when available (CLI mode).
+        # In gateway mode _cli_ref is None — tools degrade gracefully
+        # (workspace hints fall back to TERMINAL_CWD, no spinner).
+        if "parent_agent" not in kwargs:
+            cli = self._manager._cli_ref
+            agent = getattr(cli, "agent", None) if cli else None
+            if agent is not None:
+                kwargs["parent_agent"] = agent
+
+        return registry.dispatch(tool_name, args, **kwargs)
+
    # -- context engine registration -----------------------------------------

    def register_context_engine(self, engine) -> None:
@@ -323,6 +402,7 @@ class PluginManager:
        self._plugin_tool_names: Set[str] = set()
        self._cli_commands: Dict[str, dict] = {}
        self._context_engine = None  # Set by a plugin via register_context_engine()
+        self._plugin_commands: Dict[str, dict] = {}  # Slash commands registered by plugins
        self._discovered: bool = False
        self._cli_ref = None  # Set by CLI after plugin discovery
        # Plugin skill registry: qualified name → metadata dict.
@@ -485,6 +565,10 @@ class PluginManager:
                        for h in p.hooks_registered
                    }
                )
+                loaded.commands_registered = [
+                    c for c in self._plugin_commands
+                    if self._plugin_commands[c].get("plugin") == manifest.name
+                ]
                loaded.enabled = True

        except Exception as exc:
@@ -598,6 +682,7 @@ class PluginManager:
                    "enabled": loaded.enabled,
                    "tools": len(loaded.tools_registered),
                    "hooks": len(loaded.hooks_registered),
+                    "commands": len(loaded.commands_registered),
                    "error": loaded.error,
                }
            )
@@ -699,6 +784,20 @@ def get_plugin_context_engine():
    return get_plugin_manager()._context_engine


+def get_plugin_command_handler(name: str) -> Optional[Callable]:
+    """Return the handler for a plugin-registered slash command, or ``None``."""
+    entry = get_plugin_manager()._plugin_commands.get(name)
+    return entry["handler"] if entry else None
+
+
+def get_plugin_commands() -> Dict[str, dict]:
+    """Return the full plugin commands dict (name → {handler, description, plugin}).
+
+    Safe to call before discovery — returns an empty dict if no plugins loaded.
+    """
+    return get_plugin_manager()._plugin_commands
+
+
 def get_plugin_toolsets() -> List[tuple]:
    """Return plugin toolsets as ``(key, label, description)`` tuples.

@@ -300,19 +300,10 @@ def _read_config_model(profile_dir: Path) -> tuple:

 def _check_gateway_running(profile_dir: Path) -> bool:
    """Check if a gateway is running for a given profile directory."""
-    pid_file = profile_dir / "gateway.pid"
-    if not pid_file.exists():
-        return False
    try:
-        raw = pid_file.read_text().strip()
-        if not raw:
-            return False
-        data = json.loads(raw) if raw.startswith("{") else {"pid": int(raw)}
-        pid = int(data["pid"])
-        os.kill(pid, 0)  # existence check
-        return True
-    except (json.JSONDecodeError, KeyError, ValueError, TypeError,
-            ProcessLookupError, PermissionError, OSError):
+        from gateway.status import get_running_pid
+        return get_running_pid(profile_dir / "gateway.pid", cleanup_stale=False) is not None
+    except Exception:
        return False


@@ -64,6 +64,11 @@ HERMES_OVERLAYS: Dict[str, HermesOverlay] = {
        base_url_override="https://portal.qwen.ai/v1",
        base_url_env_var="HERMES_QWEN_BASE_URL",
    ),
+    "google-gemini-cli": HermesOverlay(
+        transport="openai_chat",
+        auth_type="oauth_external",
+        base_url_override="cloudcode-pa://google",
+    ),
    "copilot-acp": HermesOverlay(
        transport="codex_responses",
        auth_type="external_process",
@@ -128,10 +133,15 @@ HERMES_OVERLAYS: Dict[str, HermesOverlay] = {
        base_url_env_var="HF_BASE_URL",
    ),
    "xai": HermesOverlay(
-        transport="openai_chat",
+        transport="codex_responses",
        base_url_override="https://api.x.ai/v1",
        base_url_env_var="XAI_BASE_URL",
    ),
+    "nvidia": HermesOverlay(
+        transport="openai_chat",
+        base_url_override="https://integrate.api.nvidia.com/v1",
+        base_url_env_var="NVIDIA_BASE_URL",
+    ),
    "xiaomi": HermesOverlay(
        transport="openai_chat",
        base_url_env_var="XIAOMI_BASE_URL",
@@ -141,6 +151,10 @@ HERMES_OVERLAYS: Dict[str, HermesOverlay] = {
        base_url_override="https://api.arcee.ai/api/v1",
        base_url_env_var="ARCEE_BASE_URL",
    ),
+    "ollama-cloud": HermesOverlay(
+        transport="openai_chat",
+        base_url_env_var="OLLAMA_BASE_URL",
+    ),
 }


@@ -180,6 +194,13 @@ ALIASES: Dict[str, str] = {
    # xai
    "x-ai": "xai",
    "x.ai": "xai",
+    "grok": "xai",
+
+    # nvidia
+    "nim": "nvidia",
+    "nvidia-nim": "nvidia",
+    "build-nvidia": "nvidia",
+    "nemotron": "nvidia",

    # kimi-for-coding (models.dev ID)
    "kimi": "kimi-for-coding",
@@ -227,6 +248,11 @@ ALIASES: Dict[str, str] = {
    "qwen": "alibaba",
    "alibaba-cloud": "alibaba",

+    # google-gemini-cli (OAuth + Code Assist)
+    "gemini-cli": "google-gemini-cli",
+    "gemini-oauth": "google-gemini-cli",
+
+
    # huggingface
    "hf": "huggingface",
    "hugging-face": "huggingface",
@@ -236,6 +262,12 @@ ALIASES: Dict[str, str] = {
    "mimo": "xiaomi",
    "xiaomi-mimo": "xiaomi",

+    # bedrock
+    "aws": "bedrock",
+    "aws-bedrock": "bedrock",
+    "amazon-bedrock": "bedrock",
+    "amazon": "bedrock",
+
    # arcee
    "arcee-ai": "arcee",
    "arceeai": "arcee",
@@ -244,7 +276,7 @@ ALIASES: Dict[str, str] = {
    "lmstudio": "lmstudio",
    "lm-studio": "lmstudio",
    "lm_studio": "lmstudio",
-    "ollama": "ollama-cloud",
+    "ollama": "custom",  # bare "ollama" = local; use "ollama-cloud" for cloud
    "vllm": "local",
    "llamacpp": "local",
    "llama.cpp": "local",
@@ -262,6 +294,8 @@ _LABEL_OVERRIDES: Dict[str, str] = {
    "copilot-acp": "GitHub Copilot ACP",
    "xiaomi": "Xiaomi MiMo",
    "local": "Local endpoint",
+    "bedrock": "AWS Bedrock",
+    "ollama-cloud": "Ollama Cloud",
 }


@@ -271,6 +305,7 @@ TRANSPORT_TO_API_MODE: Dict[str, str] = {
    "openai_chat": "chat_completions",
    "anthropic_messages": "anthropic_messages",
    "codex_responses": "codex_responses",
+    "bedrock_converse": "bedrock_converse",
 }


@@ -388,6 +423,10 @@ def determine_api_mode(provider: str, base_url: str = "") -> str:
    if pdef is not None:
        return TRANSPORT_TO_API_MODE.get(pdef.transport, "chat_completions")

+    # Direct provider checks for providers not in HERMES_OVERLAYS
+    if provider == "bedrock":
+        return "bedrock_converse"
+
    # URL-based heuristics for custom / unknown providers
    if base_url:
        url_lower = base_url.rstrip("/").lower()
@@ -395,6 +434,8 @@ def determine_api_mode(provider: str, base_url: str = "") -> str:
            return "anthropic_messages"
        if "api.openai.com" in url_lower:
            return "codex_responses"
+        if "bedrock-runtime" in url_lower and "amazonaws.com" in url_lower:
+            return "bedrock_converse"

    return "chat_completions"

--- a/Show More
+++ b/Show More