chore: remove vendor-specific references from app_tools

fix: use 'is not None' checks for session/session_id, remove dead _EXECUTE_STRIP_KEYS
- 'if session:' drops empty dict {} which is schema-valid
- 'if session_id:' drops empty string which shouldn't be silently eaten
- _EXECUTE_STRIP_KEYS frozenset was defined but never referenced (handler uses allowlist approach instead)
2026-05-23 22:59:01 +05:30 · 2026-05-23 22:19:24 +05:30 · 2026-05-23 22:13:08 +05:30 · 2026-05-23 21:16:42 +05:30 · 2026-05-23 21:08:04 +05:30 · 2026-05-23 20:52:08 +05:30
183 changed files with 11916 additions and 2018 deletions
@@ -27,9 +27,9 @@ on:
 permissions:
  contents: read

-# Concurrency: push/release runs are NEVER cancelled so every merge gets its
-# own SHA-tagged image; :main and :latest are guarded separately by the
-# move-main and move-latest jobs.  PR runs reuse a PR-scoped group with
+# Concurrency: push/release runs are NEVER cancelled so every merge gets
+# its own :main or release-tagged image.  :latest is guarded separately
+# by the move-latest job.  PR runs reuse a PR-scoped group with
 # cancel-in-progress: true so rapid pushes to the same PR collapse to the
 # latest commit.
 concurrency:
@@ -92,10 +92,10 @@ jobs:
      # pattern for multi-runner multi-platform builds.
      #
      # We apply the OCI revision label here (and again on arm64) because
-      # the move-main / move-latest jobs read it off the linux/amd64
-      # sub-manifest config of the floating tag to decide whether it's safe
-      # to advance.  The label must be on each per-arch image — manifest
-      # lists themselves don't carry image config labels.
+      # the move-latest job reads it off the linux/amd64 sub-manifest
+      # config of the floating tag to decide whether it's safe to advance.
+      # The label must be on each per-arch image — manifest lists themselves
+      # don't carry image config labels.
      - name: Push amd64 by digest
        id: push
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
@@ -208,8 +208,14 @@ jobs:
  # ---------------------------------------------------------------------------
  # Stitch both per-arch digests into a single tagged multi-arch manifest.
  # This is a registry-side operation — no building, no layer re-push —
-  # so it runs in ~30 seconds.  On main pushes it produces :sha-<sha>.
-  # On releases it produces :<release_tag_name>.
+  # so it runs in ~30 seconds.  On main pushes it produces :main; on
+  # releases it produces :<release_tag_name>.
+  #
+  # For main pushes the ancestor check runs BEFORE the manifest push so
+  # we never overwrite :main with an older commit.  The top-level
+  # concurrency group (`docker-${{ github.ref }}` with
+  # `cancel-in-progress: false`) already serialises runs per ref; the
+  # ancestor check is defense-in-depth.
  # ---------------------------------------------------------------------------
  merge:
    if: github.repository == 'NousResearch/hermes-agent' && (github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release')
@@ -217,10 +223,15 @@ jobs:
    needs: [build-amd64, build-arm64]
    timeout-minutes: 10
    outputs:
-      pushed_sha_tag: ${{ steps.mark_pushed.outputs.pushed }}
      pushed_release_tag: ${{ steps.mark_release_pushed.outputs.pushed }}
      release_tag: ${{ steps.tag.outputs.tag }}
    steps:
+      - name: Checkout code
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          fetch-depth: 1000
+
      - name: Download digests
        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
        with:
@@ -237,120 +248,19 @@ jobs:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

-      # Compute the tag for this run.  Main pushes use sha-<sha> (so every
-      # commit gets its own immutable tag); releases use the release tag name.
-      - name: Compute tag
-        id: tag
-        run: |
-          if [ "${{ github.event_name }}" = "release" ]; then
-            echo "tag=${{ github.event.release.tag_name }}" >> "$GITHUB_OUTPUT"
-          else
-            echo "tag=sha-${{ github.sha }}" >> "$GITHUB_OUTPUT"
-          fi
-
-      - name: Create manifest list and push
-        working-directory: /tmp/digests
-        run: |
-          set -euo pipefail
-          # Build the arg array from each digest file (filename = the digest
-          # hex, with no sha256: prefix; empty file content, only the name
-          # matters).  Using an array avoids shellcheck SC2046 and keeps
-          # every digest a single argv token even under pathological names.
-          args=()
-          for digest_file in *; do
-            args+=("${IMAGE_NAME}@sha256:${digest_file}")
-          done
-          docker buildx imagetools create \
-            -t "${IMAGE_NAME}:${TAG}" \
-            "${args[@]}"
-        env:
-          IMAGE_NAME: ${{ env.IMAGE_NAME }}
-          TAG: ${{ steps.tag.outputs.tag }}
-
-      - name: Inspect image
-        run: |
-          docker buildx imagetools inspect "${IMAGE_NAME}:${TAG}"
-        env:
-          IMAGE_NAME: ${{ env.IMAGE_NAME }}
-          TAG: ${{ steps.tag.outputs.tag }}
-
-      # Signal to move-main that the SHA tag is live.  Only on main pushes;
-      # releases set pushed_release_tag instead.
-      - name: Mark SHA tag pushed
-        id: mark_pushed
-        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
-        run: echo "pushed=true" >> "$GITHUB_OUTPUT"
-
-      # Signal to move-latest that the release tag is live.
-      - name: Mark release tag pushed
-        id: mark_release_pushed
-        if: github.event_name == 'release'
-        run: echo "pushed=true" >> "$GITHUB_OUTPUT"
-
-  # ---------------------------------------------------------------------------
-  # Move :main to point at the SHA tag the merge job pushed.
-  #
-  # :main is the floating tag that tracks the tip of the main branch.  Every
-  # merge to main retags :main forward.  Users who want "latest dev build"
-  # pull :main; users who want stable releases pull :latest.
-  #
-  # The real serialization guarantee comes from the top-level concurrency
-  # group (`docker-${{ github.ref }}` with `cancel-in-progress: false`),
-  # which ensures at most one workflow run for this ref executes at a time.
-  # That means two move-main steps for the same ref cannot overlap.
-  #
-  # This job has its own concurrency group as defense-in-depth: if the
-  # top-level group is ever loosened, queued move-mains will run serially
-  # in arrival order, each one running the ancestor check below and either
-  # advancing :main or skipping.  `cancel-in-progress: false` matches the
-  # top-level setting — we don't want rapid pushes to cancel a queued
-  # move-main, because the ancestor check is the real safety mechanism
-  # and queueing is cheap (move-main is a ~30s registry op).
-  #
-  # Combined with the ancestor check, this means :main only ever moves
-  # forward in git history.
-  # ---------------------------------------------------------------------------
-  move-main:
-    if: |
-      github.repository == 'NousResearch/hermes-agent'
-      && github.event_name == 'push'
-      && github.ref == 'refs/heads/main'
-      && needs.merge.outputs.pushed_sha_tag == 'true'
-    needs: merge
-    runs-on: ubuntu-latest
-    timeout-minutes: 10
-    concurrency:
-      group: docker-move-main-${{ github.ref }}
-      cancel-in-progress: false
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          fetch-depth: 1000
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3
-
-      - name: Log in to Docker Hub
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-
      # Read the git revision label off the current :main manifest, then
-      # use `git merge-base --is-ancestor` to check whether our commit is a
-      # descendant of it.  If :main doesn't exist yet, or its label is
-      # missing, we treat that as "safe to publish".  If another run already
-      # advanced :main past us (or diverged), we skip and leave it alone.
+      # use `git merge-base --is-ancestor` to check whether our commit is
+      # a descendant of it.  If :main doesn't exist yet, or its label is
+      # missing, we treat that as "safe to publish".  If another run
+      # already advanced :main past us (or diverged), we skip and leave
+      # it alone.
      - name: Decide whether to move :main
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        id: main_check
        run: |
          set -euo pipefail
          image=nousresearch/hermes-agent

-          # Pull the JSON for the linux/amd64 sub-manifest's config and extract
-          # the OCI revision label with jq — Go template field access can't
-          # handle dots in map keys, so using json+jq is the robust route.
          image_json=$(
            docker buildx imagetools inspect "${image}:main" \
              --format '{{ json (index .Image "linux/amd64") }}' \
@@ -383,7 +293,6 @@ jobs:
            exit 0
          fi

-          # Make sure we have the :main commit locally for merge-base.
          if ! git cat-file -e "${current_sha}^{commit}" 2>/dev/null; then
            git fetch --no-tags --prune origin \
              "+refs/heads/main:refs/remotes/origin/main" \
@@ -396,7 +305,6 @@ jobs:
            exit 0
          fi

-          # Our SHA must be a descendant of the current :main to be safe.
          if git merge-base --is-ancestor "${current_sha}" "${GITHUB_SHA}"; then
            echo "Our commit is a descendant of :main — safe to advance."
            echo "push_main=true" >> "$GITHUB_OUTPUT"
@@ -405,19 +313,48 @@ jobs:
            echo "push_main=false" >> "$GITHUB_OUTPUT"
          fi

-      # Retag the already-pushed SHA manifest as :main.  This is a registry-
-      # side operation — no rebuild, no layer re-push — so it's quick and
-      # atomic per-tag.  The ancestor check above plus the cancel-in-progress
-      # concurrency on this job together guarantee we only ever move :main
-      # forward in git history.
-      - name: Move :main to this SHA
-        if: steps.main_check.outputs.push_main == 'true'
+      # Compute the tag for this run.  Main pushes tag directly as :main
+      # (no per-commit SHA tags); releases use the release tag name.
+      #
+      # The release tag name is handed to the script through an environment
+      # variable instead of being interpolated into the script text, so a
+      # tag containing shell metacharacters cannot change the command that
+      # runs.  On push events the variable is simply empty and unused.
+      - name: Compute tag
+        id: tag
+        run: |
+          if [ "${{ github.event_name }}" = "release" ]; then
+            echo "tag=${RELEASE_TAG}" >> "$GITHUB_OUTPUT"
+          else
+            echo "tag=main" >> "$GITHUB_OUTPUT"
+          fi
+        env:
+          RELEASE_TAG: ${{ github.event.release.tag_name }}
+
+      # Gate the manifest push on the ancestor check for main pushes.
+      # For releases there is no gate — the check doesn't even run.
+      - name: Create manifest list and push
+        if: github.event_name != 'push' || steps.main_check.outputs.push_main == 'true'
+        working-directory: /tmp/digests
        run: |
          set -euo pipefail
-          image=nousresearch/hermes-agent
+          args=()
+          for digest_file in *; do
+            args+=("${IMAGE_NAME}@sha256:${digest_file}")
+          done
          docker buildx imagetools create \
-            --tag "${image}:main" \
-            "${image}:sha-${GITHUB_SHA}"
+            -t "${IMAGE_NAME}:${TAG}" \
+            "${args[@]}"
+        env:
+          IMAGE_NAME: ${{ env.IMAGE_NAME }}
+          TAG: ${{ steps.tag.outputs.tag }}
+
+      - name: Inspect image
+        if: github.event_name != 'push' || steps.main_check.outputs.push_main == 'true'
+        run: |
+          docker buildx imagetools inspect "${IMAGE_NAME}:${TAG}"
+        env:
+          IMAGE_NAME: ${{ env.IMAGE_NAME }}
+          TAG: ${{ steps.tag.outputs.tag }}
+
+      # Signal to move-latest that the release tag is live.
+      - name: Mark release tag pushed
+        id: mark_release_pushed
+        if: github.event_name == 'release'
+        run: echo "pushed=true" >> "$GITHUB_OUTPUT"

  # ---------------------------------------------------------------------------
  # Move :latest to point at the release tag the merge job pushed.
@@ -427,10 +364,10 @@ jobs:
  #
  # We still run an ancestor check against the existing :latest so that a
  # backport release on an older branch (e.g. patching v1.1.5 after v1.2.3
-  # is out) doesn't drag :latest backwards.  The check is the same shape as
-  # move-main: read the OCI revision label off the current :latest, look up
-  # that commit in git, and only advance if our release commit is a strict
-  # descendant.
+  # is out) doesn't drag :latest backwards.  The check is the same shape
+  # as the ancestor check in the merge job for :main: read the OCI
+  # revision label off the current :latest, look up that commit in git,
+  # and only advance if our release commit is a strict descendant.
  # ---------------------------------------------------------------------------
  move-latest:
    if: |
@@ -23,13 +23,24 @@ concurrency:
 jobs:
  test:
    runs-on: ubuntu-latest
-    timeout-minutes: 30
+    timeout-minutes: 60
    steps:
      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

-      - name: Install system dependencies
-        run: sudo apt-get update && sudo apt-get install -y ripgrep
+      - name: Install ripgrep (prebuilt binary)
+        run: |
+          set -euo pipefail
+          RG_VERSION=15.1.0
+          RG_SHA256=1c9297be4a084eea7ecaedf93eb03d058d6faae29bbc57ecdaf5063921491599
+          RG_TARBALL=ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl.tar.gz
+          curl -sSfL -o "$RG_TARBALL" \
+            "https://github.com/BurntSushi/ripgrep/releases/download/${RG_VERSION}/${RG_TARBALL}"
+          echo "${RG_SHA256}  ${RG_TARBALL}" | sha256sum -c -
+          tar -xzf "$RG_TARBALL"
+          sudo mv "ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl/rg" /usr/local/bin/rg
+          rm -rf "$RG_TARBALL" "ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl"
+          rg --version

      - name: Install uv
        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86  # v5
@@ -44,9 +55,26 @@ jobs:
          uv pip install -e ".[all,dev]"

      - name: Run tests
+        # Per-file isolation via scripts/run_tests_parallel.py: discovers
+        # every test_*.py file under tests/ (excluding integration/ + e2e/),
+        # then runs `python -m pytest <file>` in a freshly-spawned subprocess
+        # with bounded parallelism. No xdist, no shared workers, no
+        # module-level state leakage between files.
+        #
+        # Why per-file (not per-test): per-test spawn cost (~250ms × 17k
+        # tests = 70min CPU minimum) blew the wall-clock budget. Per-file
+        # spawn (~250ms × ~850 files = ~3.5min) fits while still giving
+        # every file a fresh interpreter — the only isolation boundary
+        # that matters in practice (cross-file leakage was the original
+        # flake source; intra-file is the test author's responsibility).
+        #
+        # Why drop xdist entirely: xdist's persistent workers accumulate
+        # state across files, which is exactly the leakage we wanted to
+        # fix. ThreadPoolExecutor + subprocess.run is ~60 lines and does
+        # the job with cleaner semantics.
        run: |
          source .venv/bin/activate
-          python -m pytest tests/ -q --ignore=tests/integration --ignore=tests/e2e --tb=short -n auto --timeout=30 --timeout-method=signal
+          python scripts/run_tests_parallel.py
        env:
          # Ensure tests don't accidentally call real APIs
          OPENROUTER_API_KEY: ""
@@ -60,8 +88,19 @@ jobs:
      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

-      - name: Install system dependencies
-        run: sudo apt-get update && sudo apt-get install -y ripgrep
+      - name: Install ripgrep (prebuilt binary)
+        run: |
+          set -euo pipefail
+          RG_VERSION=15.1.0
+          RG_SHA256=1c9297be4a084eea7ecaedf93eb03d058d6faae29bbc57ecdaf5063921491599
+          RG_TARBALL=ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl.tar.gz
+          curl -sSfL -o "$RG_TARBALL" \
+            "https://github.com/BurntSushi/ripgrep/releases/download/${RG_VERSION}/${RG_TARBALL}"
+          echo "${RG_SHA256}  ${RG_TARBALL}" | sha256sum -c -
+          tar -xzf "$RG_TARBALL"
+          sudo mv "ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl/rg" /usr/local/bin/rg
+          rm -rf "$RG_TARBALL" "ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl"
+          rg --version

      - name: Install uv
        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86  # v5
@@ -18,6 +18,7 @@ __pycache__/web_tools.cpython-310.pyc
 logs/
 data/
 .pytest_cache/
+.pytest-cache/
 tmp/
 temp_vision_images/
 hermes-*/*
@@ -1013,17 +1013,39 @@ def profile_env(tmp_path, monkeypatch):

 **ALWAYS use `scripts/run_tests.sh`** — do not call `pytest` directly. The script enforces
 hermetic environment parity with CI (unset credential vars, TZ=UTC, LANG=C.UTF-8,
-4 xdist workers matching GHA ubuntu-latest). Direct `pytest` on a 16+ core
-developer machine with API keys set diverges from CI in ways that have caused
-multiple "works locally, fails in CI" incidents (and the reverse).
+`-n auto` xdist workers, in-tree subprocess-isolation plugin). Direct `pytest`
+on a 16+ core developer machine with API keys set diverges from CI in ways
+that have caused multiple "works locally, fails in CI" incidents (and the reverse).

 ```bash
 scripts/run_tests.sh                                  # full suite, CI-parity
 scripts/run_tests.sh tests/gateway/                   # one directory
 scripts/run_tests.sh tests/agent/test_foo.py::test_x  # one test
 scripts/run_tests.sh -v --tb=long                     # pass-through pytest flags
+scripts/run_tests.sh --no-isolate tests/foo/          # disable subprocess isolation (faster, for debugging)
 ```

+### Subprocess-per-test isolation
+
+Every test runs in a freshly-spawned Python subprocess via the in-tree plugin
+at `tests/_isolate_plugin.py`. This means module-level dicts/sets and
+ContextVars from one test cannot leak into the next — the historic
+`_reset_module_state` autouse fixture is gone.
+
+Implementation notes:
+
+- The plugin uses `multiprocessing.get_context("spawn")`, which works on
+  Linux, macOS, and Windows alike (POSIX `fork` is not used).
+- Per-test overhead is ~0.5–1.0s (Python startup + pytest collection). xdist
+  parallelism amortizes this across cores; on a 20-core box the full suite
+  finishes in roughly the same wall time as before, but flake-free.
+- `isolate_timeout` (configured in `pyproject.toml`) caps each test at 30s.
+  Hangs are killed and surfaced as a failure report.
+- Pass `--no-isolate` to disable isolation — useful when debugging a single
+  test interactively, or when you specifically want to verify state leakage.
+- The plugin disables itself in child processes (sentinel envvar
+  `HERMES_ISOLATE_CHILD=1`), so a child never spawns isolation
+  subprocesses of its own (no runaway process recursion).
+
 ### Why the wrapper (and why the old "just call pytest" doesn't work)

 Five real sources of local-vs-CI drift the script closes:
@@ -1034,7 +1056,7 @@ Five real sources of local-vs-CI drift the script closes:
 | HOME / `~/.hermes/` | Your real config+auth.json | Temp dir per test |
 | Timezone | Local TZ (PDT etc.) | UTC |
 | Locale | Whatever is set | C.UTF-8 |
-| xdist workers | `-n auto` = all cores (20+ on a workstation) | `-n 4` matching CI |
+| xdist workers | `-n auto` = all cores | `-n auto` (safe — subprocess isolation prevents cross-worker flakes) |

 `tests/conftest.py` also enforces points 1-4 as an autouse fixture so ANY pytest
 invocation (including IDE integrations) gets hermetic behavior — but the wrapper
@@ -1042,15 +1064,21 @@ is belt-and-suspenders.

 ### Running without the wrapper (only if you must)

-If you can't use the wrapper (e.g. on Windows or inside an IDE that shells
-pytest directly), at minimum activate the venv and pass `-n 4`:
+If you can't use the wrapper (e.g. inside an IDE that shells pytest directly),
+at minimum activate the venv. The isolation plugin loads automatically from
+`addopts` in `pyproject.toml`, so you get the same per-test process isolation
+either way.

 ```bash
 source .venv/bin/activate   # or: source venv/bin/activate
-python -m pytest tests/ -q -n 4
+python -m pytest tests/ -q
 ```

-Worker count above 4 will surface test-ordering flakes that CI never sees.
+If you need to bypass isolation for fast feedback while debugging:
+
+```bash
+python -m pytest tests/agent/test_foo.py -q --no-isolate
+```

 Always run the full suite before pushing changes.

@@ -71,6 +71,71 @@ def _ra():
    return run_agent


+def _normalized_custom_base_url(value: Any) -> str:
+    """Return ``value`` with surrounding whitespace and trailing slashes removed.
+
+    Any non-string input (including ``None``) yields ``""``, so two base URLs
+    can be compared for equality without a prior type check.
+    """
+    if not isinstance(value, str):
+        return ""
+    return value.strip().rstrip("/")
+
+
+def _custom_provider_model_matches(agent_model: str, entry: Dict[str, Any]) -> bool:
+    """Report whether the custom-provider ``entry`` applies to ``agent_model``.
+
+    An entry whose ``model`` value is missing, falsy, or blank matches every
+    model.  Otherwise the two names are compared case-insensitively after
+    stripping surrounding whitespace.
+    """
+    provider_model = str(entry.get("model", "") or "").strip().lower()
+    if not provider_model:
+        return True
+    return provider_model == str(agent_model or "").strip().lower()
+
+
+def _custom_provider_extra_body_for_agent(
+    *,
+    provider: str,
+    model: str,
+    base_url: str,
+    custom_providers: List[Dict[str, Any]],
+) -> Optional[Dict[str, Any]]:
+    """Pick the ``extra_body`` configured for a custom provider endpoint.
+
+    Args:
+        provider: Provider name; only ``"custom"`` (case-insensitive, trimmed)
+            is considered.
+        model: Model name the agent is using.
+        base_url: Endpoint URL the agent is using; compared after
+            normalisation via ``_normalized_custom_base_url``.
+        custom_providers: Configured provider entries.  ``None``/empty is
+            tolerated, and non-dict items are ignored.
+
+    Returns:
+        A shallow copy of the selected entry's ``extra_body`` dict, or ``None``
+        when the provider is not ``"custom"``, the base URL normalises to an
+        empty string, or no entry qualifies.
+
+    Selection order among entries whose base URL matches and whose
+    ``extra_body`` is a non-empty dict: the first entry that names a model
+    matching ``model`` wins immediately; failing that, the first entry that
+    names no model is used as the fallback.
+    """
+    if (provider or "").strip().lower() != "custom":
+        return None
+
+    target_url = _normalized_custom_base_url(base_url)
+    if not target_url:
+        return None
+
+    fallback: Optional[Dict[str, Any]] = None
+    for entry in custom_providers or []:
+        if not isinstance(entry, dict):
+            continue
+        if _normalized_custom_base_url(entry.get("base_url")) != target_url:
+            continue
+        extra_body = entry.get("extra_body")
+        if not isinstance(extra_body, dict) or not extra_body:
+            continue
+        provider_model = str(entry.get("model", "") or "").strip()
+        if provider_model:
+            # Model-specific entry: use it only on a match, and stop searching.
+            if _custom_provider_model_matches(model, entry):
+                return dict(extra_body)
+        elif fallback is None:
+            # Model-less entry: remember only the first one seen.
+            fallback = dict(extra_body)
+
+    return fallback
+
+
+def _merge_custom_provider_extra_body(agent, custom_providers: List[Dict[str, Any]]) -> None:
+    """Fold the matching custom-provider ``extra_body`` into the agent's overrides.
+
+    Resolves the ``extra_body`` for the agent's ``provider``, ``model`` and
+    ``base_url`` via ``_custom_provider_extra_body_for_agent``.  When nothing
+    is resolved the agent is left untouched.  Otherwise
+    ``agent.request_overrides`` is replaced with a new dict whose
+    ``"extra_body"`` holds the merged result.
+
+    Precedence: keys already present in the agent's existing
+    ``request_overrides["extra_body"]`` override the provider-config values.
+    """
+    extra_body = _custom_provider_extra_body_for_agent(
+        provider=agent.provider,
+        model=agent.model,
+        base_url=agent.base_url,
+        custom_providers=custom_providers,
+    )
+    if not extra_body:
+        return
+
+    # Copy so the agent's current overrides dict is never mutated in place.
+    overrides = dict(getattr(agent, "request_overrides", {}) or {})
+    merged_extra_body = dict(extra_body)
+    existing_extra_body = overrides.get("extra_body")
+    if isinstance(existing_extra_body, dict):
+        # Applied last, so the agent's own values win over provider config.
+        merged_extra_body.update(existing_extra_body)
+    overrides["extra_body"] = merged_extra_body
+    agent.request_overrides = overrides
+
+
 def init_agent(
    agent,
    base_url: str = None,
@@ -1060,7 +1125,18 @@ def init_agent(
    # through _ra().get_tool_definitions()).  Duplicate function names cause
    # 400 errors on providers that enforce unique names (e.g. Xiaomi
    # MiMo via Nous Portal).
-    if agent._memory_manager and agent.tools is not None:
+    #
+    # Respect the platform's enabled_toolsets configuration (#5544):
+    #   enabled_toolsets is None        → no filter, inject (backward compat)
+    #   "memory" in enabled_toolsets    → user opted in, inject
+    #   otherwise (incl. [])            → user excluded memory, skip injection
+    #
+    # Without this gate, `platform_toolsets: telegram: []` still leaks memory
+    # provider tools (fact_store, etc.) into the tool surface — a 10x latency
+    # penalty on local models and a frequent trigger of tool-call loops.
+    if agent._memory_manager and agent.tools is not None and (
+        agent.enabled_toolsets is None or "memory" in agent.enabled_toolsets
+    ):
        _existing_tool_names = {
            t.get("function", {}).get("name")
            for t in agent.tools
@@ -1213,6 +1289,7 @@ def init_agent(
    # Store for reuse by _check_compression_model_feasibility (auxiliary
    # compression model context-length detection needs the same list).
    agent._custom_providers = _custom_providers
+    _merge_custom_provider_extra_body(agent, _custom_providers)

    # Check custom_providers per-model context_length
    if _config_context_length is None and _custom_providers:
@@ -1369,8 +1446,22 @@ def init_agent(
    # errors. Even with the cache fix, dedup is the right defense
    # against plugin paths that may register the same schemas via
    # ctx.register_tool(). Mirrors the memory tools dedup above.
+    #
+    # Respect the platform's enabled_toolsets configuration (#5544):
+    # context engine tools follow the same gating pattern as memory
+    # provider tools — without the gate, `platform_toolsets: telegram: []`
+    # would still leak lcm_* tools into the tool surface and incur the
+    # same local-model latency penalty.
    agent._context_engine_tool_names: set = set()
-    if hasattr(agent, "context_compressor") and agent.context_compressor and agent.tools is not None:
+    if (
+        hasattr(agent, "context_compressor")
+        and agent.context_compressor
+        and agent.tools is not None
+        and (
+            agent.enabled_toolsets is None
+            or "context_engine" in agent.enabled_toolsets
+        )
+    ):
        _existing_tool_names = {
            t.get("function", {}).get("name")
            for t in agent.tools
@@ -1606,182 +1606,155 @@ def _content_parts_to_anthropic_blocks(parts: Any) -> List[Dict[str, Any]]:
    return out


-def convert_messages_to_anthropic(
-    messages: List[Dict],
-    base_url: str | None = None,
-    model: str | None = None,
-) -> Tuple[Optional[Any], List[Dict]]:
-    """Convert OpenAI-format messages to Anthropic format.
+def _convert_assistant_message(m: Dict[str, Any]) -> Dict[str, Any]:
+    """Convert an assistant message to Anthropic content blocks.

-    Returns (system_prompt, anthropic_messages).
-    System messages are extracted since Anthropic takes them as a separate param.
-    system_prompt is a string or list of content blocks (when cache_control present).
-
-    When *base_url* is provided and points to a third-party Anthropic-compatible
-    endpoint, all thinking block signatures are stripped.  Signatures are
-    Anthropic-proprietary — third-party endpoints cannot validate them and will
-    reject them with HTTP 400 "Invalid signature in thinking block".
-
-    When *model* is provided and matches the Kimi / Moonshot family (or
-    *base_url* is a Kimi / Moonshot host), unsigned thinking blocks
-    synthesised from ``reasoning_content`` are preserved on replayed
-    assistant tool-call messages — Kimi requires the field to exist, even
-    if empty.
+    Handles thinking blocks, regular content, tool calls, and
+    reasoning_content injection for Kimi/DeepSeek endpoints.
    """
-    system = None
-    result = []
-
-    for m in messages:
-        role = m.get("role", "user")
-        content = m.get("content", "")
-
-        if role == "system":
-            if isinstance(content, list):
-                # Preserve cache_control markers on content blocks
-                has_cache = any(
-                    p.get("cache_control") for p in content if isinstance(p, dict)
-                )
-                if has_cache:
-                    system = [p for p in content if isinstance(p, dict)]
-                else:
-                    system = "\n".join(
-                        p["text"] for p in content if p.get("type") == "text"
-                    )
-            else:
-                system = content
-            continue
-
-        if role == "assistant":
-            blocks = _extract_preserved_thinking_blocks(m)
-            if content:
-                if isinstance(content, list):
-                    converted_content = _convert_content_to_anthropic(content)
-                    if isinstance(converted_content, list):
-                        blocks.extend(converted_content)
-                else:
-                    blocks.append({"type": "text", "text": str(content)})
-            for tc in m.get("tool_calls", []):
-                if not tc or not isinstance(tc, dict):
-                    continue
-                fn = tc.get("function", {})
-                args = fn.get("arguments", "{}")
-                try:
-                    parsed_args = json.loads(args) if isinstance(args, str) else args
-                except (json.JSONDecodeError, ValueError):
-                    parsed_args = {}
-                blocks.append({
-                    "type": "tool_use",
-                    "id": _sanitize_tool_id(tc.get("id", "")),
-                    "name": fn.get("name", ""),
-                    "input": parsed_args,
-                })
-            # Kimi's /coding endpoint (Anthropic protocol) requires assistant
-            # tool-call messages to carry reasoning_content when thinking is
-            # enabled server-side.  Preserve it as a thinking block so Kimi
-            # can validate the message history.  See hermes-agent#13848.
-            #
-            # Accept empty string "" — _copy_reasoning_content_for_api()
-            # injects "" as a tier-3 fallback for Kimi tool-call messages
-            # that had no reasoning.  Kimi requires the field to exist, even
-            # if empty.
-            #
-            # Prepend (not append): Anthropic protocol requires thinking
-            # blocks before text and tool_use blocks.
-            #
-            # Guard: only add when reasoning_details didn't already contribute
-            # thinking blocks.  On native Anthropic, reasoning_details produces
-            # signed thinking blocks — adding another unsigned one from
-            # reasoning_content would create a duplicate (same text) that gets
-            # downgraded to a spurious text block on the last assistant message.
-            reasoning_content = m.get("reasoning_content")
-            _already_has_thinking = any(
-                isinstance(b, dict) and b.get("type") in {"thinking", "redacted_thinking"}
-                for b in blocks
-            )
-            if isinstance(reasoning_content, str) and not _already_has_thinking:
-                blocks.insert(0, {"type": "thinking", "thinking": reasoning_content})
-            # Anthropic rejects empty assistant content
-            effective = blocks or content
-            if not effective or effective == "":
-                effective = [{"type": "text", "text": "(empty)"}]
-            result.append({"role": "assistant", "content": effective})
-            continue
-
-        if role == "tool":
-            # Sanitize tool_use_id and ensure non-empty content.
-            # Computer-use (and other multimodal) tool results arrive as
-            # either a list of OpenAI-style content parts, or a dict
-            # marked `_multimodal` with an embedded `content` list. Convert
-            # both into Anthropic `tool_result` inner blocks (text + image).
-            multimodal_blocks: Optional[List[Dict[str, Any]]] = None
-            if isinstance(content, dict) and content.get("_multimodal"):
-                multimodal_blocks = _content_parts_to_anthropic_blocks(
-                    content.get("content") or []
-                )
-                # Fallback text if the conversion produced nothing usable.
-                if not multimodal_blocks and content.get("text_summary"):
-                    multimodal_blocks = [
-                        {"type": "text", "text": str(content["text_summary"])}
-                    ]
-            elif isinstance(content, list):
-                converted = _content_parts_to_anthropic_blocks(content)
-                if any(b.get("type") == "image" for b in converted):
-                    multimodal_blocks = converted
-            # Back-compat: some callers stash blocks under a private key.
-            if multimodal_blocks is None:
-                stashed = m.get("_anthropic_content_blocks")
-                if isinstance(stashed, list) and stashed:
-                    text_content = content if isinstance(content, str) and content.strip() else None
-                    multimodal_blocks = (
-                        [{"type": "text", "text": text_content}] + stashed
-                        if text_content else list(stashed)
-                    )
-
-            if multimodal_blocks:
-                result_content: Any = multimodal_blocks
-            elif isinstance(content, str):
-                result_content = content
-            else:
-                result_content = json.dumps(content) if content else "(no output)"
-            if not result_content:
-                result_content = "(no output)"
-            tool_result = {
-                "type": "tool_result",
-                "tool_use_id": _sanitize_tool_id(m.get("tool_call_id", "")),
-                "content": result_content,
-            }
-            if isinstance(m.get("cache_control"), dict):
-                tool_result["cache_control"] = dict(m["cache_control"])
-            # Merge consecutive tool results into one user message
-            if (
-                result
-                and result[-1]["role"] == "user"
-                and isinstance(result[-1]["content"], list)
-                and result[-1]["content"]
-                and result[-1]["content"][0].get("type") == "tool_result"
-            ):
-                result[-1]["content"].append(tool_result)
-            else:
-                result.append({"role": "user", "content": [tool_result]})
-            continue
-
-        # Regular user message — validate non-empty content (Anthropic rejects empty)
+    content = m.get("content", "")
+    blocks = _extract_preserved_thinking_blocks(m)
+    if content:
        if isinstance(content, list):
-            converted_blocks = _convert_content_to_anthropic(content)
-            # Check if all text blocks are empty
-            if not converted_blocks or all(
-                b.get("text", "").strip() == ""
-                for b in converted_blocks
-                if isinstance(b, dict) and b.get("type") == "text"
-            ):
-                converted_blocks = [{"type": "text", "text": "(empty message)"}]
-            result.append({"role": "user", "content": converted_blocks})
+            converted_content = _convert_content_to_anthropic(content)
+            if isinstance(converted_content, list):
+                blocks.extend(converted_content)
        else:
-            # Validate string content is non-empty
-            if not content or (isinstance(content, str) and not content.strip()):
-                content = "(empty message)"
-            result.append({"role": "user", "content": content})
+            blocks.append({"type": "text", "text": str(content)})
+    for tc in m.get("tool_calls", []):
+        if not tc or not isinstance(tc, dict):
+            continue
+        fn = tc.get("function", {})
+        args = fn.get("arguments", "{}")
+        try:
+            parsed_args = json.loads(args) if isinstance(args, str) else args
+        except (json.JSONDecodeError, ValueError):
+            parsed_args = {}
+        blocks.append({
+            "type": "tool_use",
+            "id": _sanitize_tool_id(tc.get("id", "")),
+            "name": fn.get("name", ""),
+            "input": parsed_args,
+        })
+    # Kimi's /coding endpoint (Anthropic protocol) requires assistant
+    # tool-call messages to carry reasoning_content when thinking is
+    # enabled server-side.  Preserve it as a thinking block so Kimi
+    # can validate the message history.  See hermes-agent#13848.
+    #
+    # Accept empty string "" — _copy_reasoning_content_for_api()
+    # injects "" as a tier-3 fallback for Kimi tool-call messages
+    # that had no reasoning.  Kimi requires the field to exist, even
+    # if empty.
+    #
+    # Prepend (not append): Anthropic protocol requires thinking
+    # blocks before text and tool_use blocks.
+    #
+    # Guard: only add when reasoning_details didn't already contribute
+    # thinking blocks.  On native Anthropic, reasoning_details produces
+    # signed thinking blocks — adding another unsigned one from
+    # reasoning_content would create a duplicate (same text) that gets
+    # downgraded to a spurious text block on the last assistant message.
+    reasoning_content = m.get("reasoning_content")
+    _already_has_thinking = any(
+        isinstance(b, dict) and b.get("type") in {"thinking", "redacted_thinking"}
+        for b in blocks
+    )
+    if isinstance(reasoning_content, str) and not _already_has_thinking:
+        blocks.insert(0, {"type": "thinking", "thinking": reasoning_content})
+    # Anthropic rejects empty assistant content
+    effective = blocks or content
+    if not effective or effective == "":
+        effective = [{"type": "text", "text": "(empty)"}]
+    return {"role": "assistant", "content": effective}

+
+def _convert_tool_message_to_result(
+    result: List[Dict[str, Any]], m: Dict[str, Any]
+) -> None:
+    """Fold one OpenAI ``tool`` message into ``result`` as a ``tool_result``.
+
+    ``m["content"]`` may be a plain string, a list of content parts, or a
+    dict flagged ``_multimodal`` that wraps its own ``content`` list (with
+    an optional ``text_summary`` fallback).  Blocks stashed under
+    ``_anthropic_content_blocks`` are used when neither form produced
+    multimodal blocks.
+
+    Mutates ``result`` in place: the new block joins the trailing user
+    message when that message already opens with a ``tool_result``;
+    otherwise a fresh user message is appended.
+    """
+    payload = m.get("content", "")
+    rich_blocks: Optional[List[Dict[str, Any]]] = None
+
+    if isinstance(payload, dict) and payload.get("_multimodal"):
+        rich_blocks = _content_parts_to_anthropic_blocks(
+            payload.get("content") or []
+        )
+        summary = payload.get("text_summary")
+        # Conversion produced nothing usable: fall back to the text summary.
+        if not rich_blocks and summary:
+            rich_blocks = [{"type": "text", "text": str(summary)}]
+    elif isinstance(payload, list):
+        candidate = _content_parts_to_anthropic_blocks(payload)
+        # A converted list is only used as blocks when it carries an image.
+        for blk in candidate:
+            if blk.get("type") == "image":
+                rich_blocks = candidate
+                break
+
+    # Back-compat: some callers stash blocks under a private key.
+    if rich_blocks is None:
+        stashed = m.get("_anthropic_content_blocks")
+        if isinstance(stashed, list) and stashed:
+            if isinstance(payload, str) and payload.strip():
+                rich_blocks = [{"type": "text", "text": payload}] + stashed
+            else:
+                rich_blocks = list(stashed)
+
+    # Pick the tool_result body; it must never end up empty.
+    if rich_blocks:
+        body: Any = rich_blocks
+    elif isinstance(payload, str):
+        body = payload or "(no output)"
+    elif payload:
+        body = json.dumps(payload)
+    else:
+        body = "(no output)"
+
+    entry = {
+        "type": "tool_result",
+        "tool_use_id": _sanitize_tool_id(m.get("tool_call_id", "")),
+        "content": body,
+    }
+    marker = m.get("cache_control")
+    if isinstance(marker, dict):
+        entry["cache_control"] = dict(marker)
+
+    # Merge consecutive tool results into one user message
+    tail = result[-1] if result else None
+    joins_previous = (
+        tail is not None
+        and tail["role"] == "user"
+        and isinstance(tail["content"], list)
+        and tail["content"]
+        and tail["content"][0].get("type") == "tool_result"
+    )
+    if joins_previous:
+        tail["content"].append(entry)
+    else:
+        result.append({"role": "user", "content": [entry]})
+
+
+def _convert_user_message(content: Any) -> Dict[str, Any]:
+    """Validate a user message's content and wrap it as an Anthropic user message.
+
+    List content is passed through ``_convert_content_to_anthropic``.  When
+    no text block carries non-whitespace text, the blank text blocks are
+    dropped and any remaining blocks (for example images) are kept; the
+    ``"(empty message)"`` placeholder is used only when nothing remains.
+    Previously an image-only message was replaced wholesale, because
+    ``all()`` over zero text blocks is vacuously true.
+
+    Non-list content that is falsy or whitespace-only becomes the
+    ``"(empty message)"`` placeholder string.
+
+    Returns a ``{"role": "user", "content": ...}`` dict.
+    """
+    if isinstance(content, list):
+        converted_blocks = _convert_content_to_anthropic(content)
+        text_blocks = [
+            b for b in converted_blocks or []
+            if isinstance(b, dict) and b.get("type") == "text"
+        ]
+        # ``or ""`` tolerates a text block whose ``text`` is None.
+        has_real_text = any(
+            (b.get("text") or "").strip() for b in text_blocks
+        )
+        if not has_real_text:
+            # Keep everything that is not a (blank) text block so image-only
+            # messages survive; a non-list conversion result is treated as
+            # empty, matching the placeholder outcome of the old check.
+            if isinstance(converted_blocks, list):
+                kept = [
+                    b for b in converted_blocks
+                    if not (isinstance(b, dict) and b.get("type") == "text")
+                ]
+            else:
+                kept = []
+            converted_blocks = kept or [
+                {"type": "text", "text": "(empty message)"}
+            ]
+        return {"role": "user", "content": converted_blocks}
+    else:
+        if not content or (isinstance(content, str) and not content.strip()):
+            content = "(empty message)"
+        return {"role": "user", "content": content}
+
+
+def _strip_orphaned_tool_blocks(result: List[Dict[str, Any]]) -> None:
+    """Strip tool_use blocks with no matching tool_result, and vice versa.
+
+    Context compression or session truncation can remove either side of a
+    tool-call pair.  Anthropic rejects both orphans with HTTP 400.
+
+    Mutates ``result`` in place.
+    """
    # Strip orphaned tool_use blocks (no matching tool_result follows)
    tool_result_ids = set()
    for m in result:
@@ -1799,10 +1772,7 @@ def convert_messages_to_anthropic(
            if not m["content"]:
                m["content"] = [{"type": "text", "text": "(tool call removed)"}]

-    # Strip orphaned tool_result blocks (no matching tool_use precedes them).
-    # This is the mirror of the above: context compression or session truncation
-    # can remove an assistant message containing a tool_use while leaving the
-    # subsequent tool_result intact.  Anthropic rejects these with a 400.
+    # Strip orphaned tool_result blocks (no matching tool_use precedes them)
    tool_use_ids = set()
    for m in result:
        if m["role"] == "assistant" and isinstance(m["content"], list):
@@ -1819,12 +1789,16 @@ def convert_messages_to_anthropic(
            if not m["content"]:
                m["content"] = [{"type": "text", "text": "(tool result removed)"}]

-    # Enforce strict role alternation (Anthropic rejects consecutive same-role messages)
+
+def _merge_consecutive_roles(result: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Merge consecutive same-role messages to enforce Anthropic alternation.
+
+    Returns a new list (caller must rebind ``result``).
+    """
    fixed = []
    for m in result:
        if fixed and fixed[-1]["role"] == m["role"]:
            if m["role"] == "user":
-                # Merge consecutive user messages
                prev_content = fixed[-1]["content"]
                curr_content = m["content"]
                if isinstance(prev_content, str) and isinstance(curr_content, str):
@@ -1832,7 +1806,6 @@ def convert_messages_to_anthropic(
                elif isinstance(prev_content, list) and isinstance(curr_content, list):
                    fixed[-1]["content"] = prev_content + curr_content
                else:
-                    # Mixed types — wrap string in list
                    if isinstance(prev_content, str):
                        prev_content = [{"type": "text", "text": prev_content}]
                    if isinstance(curr_content, str):
@@ -1855,7 +1828,6 @@ def convert_messages_to_anthropic(
                elif isinstance(prev_blocks, str) and isinstance(curr_blocks, str):
                    fixed[-1]["content"] = prev_blocks + "\n" + curr_blocks
                else:
-                    # Mixed types — normalize both to list and merge
                    if isinstance(prev_blocks, str):
                        prev_blocks = [{"type": "text", "text": prev_blocks}]
                    if isinstance(curr_blocks, str):
@@ -1863,37 +1835,34 @@ def convert_messages_to_anthropic(
                    fixed[-1]["content"] = prev_blocks + curr_blocks
        else:
            fixed.append(m)
-    result = fixed
+    return fixed

-    # ── Thinking block signature management ──────────────────────────
-    # Anthropic signs thinking blocks against the full turn content.
-    # Any upstream mutation (context compression, session truncation,
-    # orphan stripping, message merging) invalidates the signature,
-    # causing HTTP 400 "Invalid signature in thinking block".
-    #
-    # Signatures are Anthropic-proprietary.  Third-party endpoints
-    # (MiniMax, Microsoft Foundry, self-hosted proxies) cannot validate
-    # them and will reject them outright.  When targeting a third-party
-    # endpoint, strip ALL thinking/redacted_thinking blocks from every
-    # assistant message — the third-party will generate its own
-    # thinking blocks if it supports extended thinking.
-    #
-    # For direct Anthropic (strategy following clawdbot/OpenClaw):
-    # 1. Strip thinking/redacted_thinking from all assistant messages
-    #    EXCEPT the last one — preserves reasoning continuity on the
-    #    current tool-use chain while avoiding stale signature errors.
-    # 2. Downgrade unsigned thinking blocks (no signature) to text —
-    #    Anthropic can't validate them and will reject them.
-    # 3. Strip cache_control from thinking/redacted_thinking blocks —
-    #    cache markers can interfere with signature validation.
+
+def _manage_thinking_signatures(
+    result: List[Dict[str, Any]], base_url: str | None, model: str | None
+) -> None:
+    """Strip or preserve thinking blocks based on endpoint type.
+
+    Anthropic signs thinking blocks against the full turn content.
+    Any upstream mutation (context compression, session truncation, orphan
+    stripping, message merging) invalidates the signature, causing HTTP 400
+    "Invalid signature in thinking block".
+
+    Signatures are Anthropic-proprietary.  Third-party endpoints (MiniMax,
+    Azure AI Foundry, AWS Bedrock, self-hosted proxies) cannot validate them
+    and will reject them outright.  Kimi's /coding and DeepSeek's /anthropic
+    endpoints speak the Anthropic protocol upstream but require unsigned
+    thinking blocks (synthesised from ``reasoning_content``) to round-trip on
+    replayed assistant tool-call messages.  See hermes-agent#13848 (Kimi) and
+    hermes-agent#16748 (DeepSeek).
+
+    Mutates ``result`` in place.
+    """
    _THINKING_TYPES = frozenset(("thinking", "redacted_thinking"))
    _is_third_party = _is_third_party_anthropic_endpoint(base_url)
-    # Kimi /coding and DeepSeek /anthropic share a contract: both speak the
-    # Anthropic Messages protocol upstream but require that thinking blocks
-    # synthesised from reasoning_content round-trip on subsequent turns when
-    # thinking is enabled.  Signed Anthropic blocks still have to be stripped
-    # (neither endpoint can validate Anthropic's signatures); unsigned blocks
-    # are preserved.  See hermes-agent#13848 (Kimi) and #16748 (DeepSeek).
+    # Kimi / DeepSeek share a contract: strip signed Anthropic blocks
+    # (neither upstream can validate Anthropic signatures), preserve unsigned
+    # ones synthesised from reasoning_content.  See #13848, #16748.
    _preserve_unsigned_thinking = (
        _is_kimi_family_endpoint(base_url, model)
        or _is_deepseek_anthropic_endpoint(base_url)
@@ -1910,26 +1879,19 @@ def convert_messages_to_anthropic(
            continue

        if _preserve_unsigned_thinking:
-            # Kimi's /coding and DeepSeek's /anthropic endpoints both enable
-            # thinking server-side and require unsigned thinking blocks on
-            # replayed assistant tool-call messages.  Strip signed Anthropic
-            # blocks (neither upstream can validate Anthropic signatures) but
-            # preserve the unsigned ones we synthesised from reasoning_content.
+            # Kimi / DeepSeek: strip signed, preserve unsigned.
            new_content = []
            for b in m["content"]:
                if not isinstance(b, dict) or b.get("type") not in _THINKING_TYPES:
                    new_content.append(b)
                    continue
                if b.get("signature") or b.get("data"):
-                    # Anthropic-signed block — upstream can't validate, strip
+                    # Signed (or redacted-with-data) — upstream can't validate, strip.
                    continue
-                # Unsigned thinking (synthesised from reasoning_content) —
-                # keep it: the upstream needs it for message-history validation.
                new_content.append(b)
            m["content"] = new_content or [{"type": "text", "text": "(empty)"}]
        elif _is_third_party or idx != last_assistant_idx:
-            # Third-party endpoint: strip ALL thinking blocks from every
-            # assistant message — signatures are Anthropic-proprietary.
+            # Third-party: strip ALL thinking blocks (signatures are proprietary).
            # Direct Anthropic: strip from non-latest assistant messages only.
            stripped = [
                b for b in m["content"]
@@ -1937,24 +1899,21 @@ def convert_messages_to_anthropic(
            ]
            m["content"] = stripped or [{"type": "text", "text": "(thinking elided)"}]
        else:
-            # Latest assistant on direct Anthropic: keep signed thinking
-            # blocks for reasoning continuity; downgrade unsigned ones to
-            # plain text.
+            # Latest assistant on direct Anthropic: keep signed, downgrade unsigned
+            # to text so the reasoning isn't lost.
            new_content = []
            for b in m["content"]:
                if not isinstance(b, dict) or b.get("type") not in _THINKING_TYPES:
                    new_content.append(b)
                    continue
                if b.get("type") == "redacted_thinking":
-                    # Redacted blocks use 'data' for the signature payload
+                    # Redacted blocks use 'data' for the signature payload —
+                    # drop the block when 'data' is missing (can't be validated).
                    if b.get("data"):
                        new_content.append(b)
-                    # else: drop — no data means it can't be validated
                elif b.get("signature"):
-                    # Signed thinking block — keep it
                    new_content.append(b)
                else:
-                    # Unsigned thinking — downgrade to text so it's not lost
                    thinking_text = b.get("thinking", "")
                    if thinking_text:
                        new_content.append({"type": "text", "text": thinking_text})
@@ -1966,12 +1925,15 @@ def convert_messages_to_anthropic(
            if isinstance(b, dict) and b.get("type") in _THINKING_TYPES:
                b.pop("cache_control", None)

-    # ── Image eviction: keep only the most recent N screenshots ─────
-    # computer_use screenshots (base64 images) sit inside tool_result
-    # blocks: they accumulate and are sent with every API call. Each
-    # costs ~1,465 tokens; after 10+ the conversation becomes slow
-    # even for simple text queries. Walk backward, keep the most recent
-    # _MAX_KEEP_IMAGES, replace older ones with a text placeholder.
+
+def _evict_old_screenshots(result: List[Dict[str, Any]]) -> None:
+    """Keep only the most recent ``_MAX_KEEP_IMAGES`` computer-use screenshots.
+
+    Base64 images cost ~1,465 tokens each and accumulate across tool calls.
+    Walk backward, keep the most recent N, replace older ones with a placeholder.
+
+    Mutates ``result`` in place.
+    """
    _MAX_KEEP_IMAGES = 3
    _image_count = 0
    for msg in reversed(result):
@@ -1998,6 +1960,68 @@ def convert_messages_to_anthropic(
                    for b in inner
                ]

+
+def convert_messages_to_anthropic(
+    messages: List[Dict],
+    base_url: str | None = None,
+    model: str | None = None,
+) -> Tuple[Optional[Any], List[Dict]]:
+    """Convert OpenAI-format messages to Anthropic format.
+
+    Returns (system_prompt, anthropic_messages).
+    System messages are extracted since Anthropic takes them as a separate param.
+    system_prompt is a string or list of content blocks (when cache_control present).
+    Non-dict entries in a list-valued system message are ignored, as are
+    text parts whose ``text`` is not a string.
+
+    When *base_url* is provided and points to a third-party Anthropic-compatible
+    endpoint, all thinking block signatures are stripped.  Signatures are
+    Anthropic-proprietary — third-party endpoints cannot validate them and will
+    reject them with HTTP 400 "Invalid signature in thinking block".
+
+    When *model* is provided and matches the Kimi / Moonshot family (or
+    *base_url* is a Kimi / Moonshot or DeepSeek Anthropic-protocol host),
+    unsigned thinking blocks synthesised from ``reasoning_content`` are
+    preserved on replayed assistant tool-call messages — Kimi requires the
+    field to exist, even if empty.
+
+    Per-role conversion is delegated to the ``_convert_*`` helpers; the
+    converted list is then post-processed in a fixed order: orphaned tool
+    blocks are stripped, consecutive same-role messages are merged,
+    thinking signatures are managed, and old screenshots are evicted.
+    """
+    system = None
+    result: List[Dict[str, Any]] = []
+
+    for m in messages:
+        role = m.get("role", "user")
+        content = m.get("content", "")
+
+        if role == "system":
+            if isinstance(content, list):
+                # Filter once so both branches skip non-dict parts; the
+                # join used to raise AttributeError on them.
+                parts = [p for p in content if isinstance(p, dict)]
+                if any(p.get("cache_control") for p in parts):
+                    # Preserve cache_control markers on content blocks
+                    system = parts
+                else:
+                    # Only string ``text`` values are joined, so a text part
+                    # without usable text no longer raises KeyError/TypeError.
+                    system = "\n".join(
+                        p["text"]
+                        for p in parts
+                        if p.get("type") == "text"
+                        and isinstance(p.get("text"), str)
+                    )
+            else:
+                system = content
+            continue
+
+        if role == "assistant":
+            result.append(_convert_assistant_message(m))
+            continue
+
+        if role == "tool":
+            # Mutates ``result``: may extend the trailing user message.
+            _convert_tool_message_to_result(result, m)
+            continue
+
+        # Regular user message
+        result.append(_convert_user_message(content))
+
+    _strip_orphaned_tool_blocks(result)
+    # Merging returns a new list, so ``result`` is rebound here.
+    result = _merge_consecutive_roles(result)
+    _manage_thinking_signatures(result, base_url, model)
+    _evict_old_screenshots(result)
+
    return system, result


@@ -46,6 +46,7 @@ from agent.message_sanitization import (
    _strip_non_ascii,
 )
 from agent.model_metadata import (
+    MINIMUM_CONTEXT_LENGTH,
    estimate_messages_tokens_rough,
    estimate_request_tokens_rough,
    get_next_probe_tier,
@@ -73,6 +74,50 @@ from utils import base_url_host_matches, env_var_enabled
 logger = logging.getLogger(__name__)


+def _ollama_context_limit_error(agent: Any, request_tokens: int) -> Optional[str]:
+    """Build the user-facing error for an undersized Ollama runtime context.
+
+    Returns ``None`` when the agent has no tools, when ``_ollama_num_ctx`` is
+    not a positive int, or when it already meets ``MINIMUM_CONTEXT_LENGTH``.
+    Otherwise a warning is logged (``request_tokens`` is only reported
+    there) and remediation text is returned.
+    """
+    tools = getattr(agent, "tools", None)
+    if not tools:
+        return None
+
+    runtime_ctx = getattr(agent, "_ollama_num_ctx", None)
+    too_small = (
+        isinstance(runtime_ctx, int)
+        and 0 < runtime_ctx < MINIMUM_CONTEXT_LENGTH
+    )
+    if not too_small:
+        return None
+
+    model = getattr(agent, "model", "") or "the selected model"
+    log_fields = (
+        model,
+        getattr(agent, "provider", "") or "unknown",
+        getattr(agent, "base_url", "") or "unknown base URL",
+        runtime_ctx,
+        MINIMUM_CONTEXT_LENGTH,
+        request_tokens,
+        len(tools),
+        getattr(agent, "session_id", None) or "none",
+    )
+    logger.warning(
+        "Ollama runtime context too small for Hermes tool use: "
+        "model=%s provider=%s base_url=%s runtime_context=%d "
+        "minimum_context=%d estimated_request_tokens=%d tool_count=%d "
+        "session=%s",
+        *log_fields,
+    )
+
+    headline = (
+        f"Ollama loaded `{model}` with only {runtime_ctx:,} tokens of runtime "
+        f"context, but Hermes needs at least {MINIMUM_CONTEXT_LENGTH:,} tokens "
+        "for reliable tool use."
+    )
+    remedy = (
+        "Increase the Ollama context for this model and restart/reload the "
+        "model before trying again. A known-good starting point is 65,536 "
+        "tokens. In Hermes config, set `model.ollama_num_ctx: 65536` "
+        "(and `model.context_length: 65536` if you also override the displayed "
+        "model context). If you manage the model through an Ollama Modelfile, "
+        "set `PARAMETER num_ctx 65536` there instead."
+    )
+    return headline + "\n\n" + remedy
+
+
 def _ra():
    """Lazy reference to ``run_agent`` so callers can patch
    ``run_agent.handle_function_call`` / ``run_agent._set_interrupt`` /
@@ -527,6 +572,7 @@ def run_conversation(
    api_call_count = 0
    final_response = None
    interrupted = False
+    failed = False
    codex_ack_continuations = 0
    length_continue_retries = 0
    truncated_tool_call_retries = 0
@@ -883,6 +929,26 @@ def run_conversation(
        # Calculate approximate request size for logging
        total_chars = sum(len(str(msg)) for msg in api_messages)
        approx_tokens = estimate_messages_tokens_rough(api_messages)
+        approx_request_tokens = estimate_request_tokens_rough(
+            api_messages, tools=agent.tools or None
+        )
+
+        _runtime_context_error = _ollama_context_limit_error(
+            agent, approx_request_tokens
+        )
+        if _runtime_context_error:
+            final_response = _runtime_context_error
+            failed = True
+            _turn_exit_reason = "ollama_runtime_context_too_small"
+            messages.append({"role": "assistant", "content": final_response})
+            agent._emit_status("❌ Ollama runtime context is too small for Hermes tool use")
+            api_call_count -= 1
+            agent._api_call_count = api_call_count
+            try:
+                agent.iteration_budget.refund()
+            except Exception:
+                pass
+            break
        
        # Thinking spinner for quiet mode (animated during API call)
        thinking_spinner = None
@@ -923,6 +989,7 @@ def run_conversation(
        copilot_auth_retry_attempted=False
        thinking_sig_retry_attempted = False
        image_shrink_retry_attempted = False
+        multimodal_tool_content_retry_attempted = False
        oauth_1m_beta_retry_attempted = False
        llama_cpp_grammar_retry_attempted = False
        has_retried_429 = False
@@ -1994,6 +2061,31 @@ def run_conversation(
                            "or shrink didn't reduce size; surfacing original error."
                        )

+                # Multimodal-tool-content recovery: providers that follow
+                # the OpenAI spec strictly (tool message content must be a
+                # string) reject our list-type content with a 400.  Strip
+                # image parts from any list-type tool messages, mark the
+                # (provider, model) as no-list-tool-content for the rest
+                # of this session so future tool results preemptively
+                # downgrade, and retry once.  See issue #27344.
+                if (
+                    classified.reason == FailoverReason.multimodal_tool_content_unsupported
+                    and not multimodal_tool_content_retry_attempted
+                ):
+                    multimodal_tool_content_retry_attempted = True
+                    if agent._try_strip_image_parts_from_tool_messages(api_messages):
+                        agent._vprint(
+                            f"{agent.log_prefix}📐 Provider rejected list-type tool content — "
+                            f"downgraded screenshots to text and retrying...",
+                            force=True,
+                        )
+                        continue
+                    else:
+                        logger.info(
+                            "multimodal-tool-content recovery: no list-type tool "
+                            "messages with image parts found; surfacing original error."
+                        )
+
                # Anthropic OAuth subscription rejected the 1M-context beta
                # header ("long context beta is not yet available for this
                # subscription"). Disable the beta for the rest of this
@@ -3848,7 +3940,11 @@ def run_conversation(
                )

    # Determine if conversation completed successfully
-    completed = final_response is not None and api_call_count < agent.max_iterations
+    completed = (
+        final_response is not None
+        and api_call_count < agent.max_iterations
+        and not failed
+    )

    # Save trajectory if enabled.  ``user_message`` may be a multimodal
    # list of parts; the trajectory format wants a plain string.
@@ -3998,6 +4094,7 @@ def run_conversation(
        "api_calls": api_call_count,
        "completed": completed,
        "turn_exit_reason": _turn_exit_reason,
+        "failed": failed,
        "partial": False,  # True only when stopped due to invalid tool calls
        "interrupted": interrupted,
        "response_previewed": getattr(agent, "_response_was_previewed", False),
@@ -50,6 +50,7 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple

 from hermes_constants import get_hermes_home
+from agent.skill_utils import is_excluded_skill_path

 logger = logging.getLogger(__name__)

@@ -176,7 +177,9 @@ def get_keep() -> int:

 def _count_skill_files(base: Path) -> int:
    try:
-        return sum(1 for _ in base.rglob("SKILL.md"))
+        return sum(
+            1 for p in base.rglob("SKILL.md") if not is_excluded_skill_path(p)
+        )
    except OSError:
        return 0

@@ -50,6 +50,7 @@ class FailoverReason(enum.Enum):

    # Request format
    format_error = "format_error"        # 400 bad request — abort or strip + retry
+    multimodal_tool_content_unsupported = "multimodal_tool_content_unsupported"  # Provider rejected list-type content in tool messages (e.g. Xiaomi MiMo) — downgrade to text and retry

    # Provider-specific
    thinking_signature = "thinking_signature"  # Anthropic thinking block sig invalid
@@ -165,6 +166,32 @@ _IMAGE_TOO_LARGE_PATTERNS = [
    # the likely culprit; we still try the shrink path before giving up.
 ]

+# Providers that follow the OpenAI spec strictly require tool message
+# ``content`` to be a string.  Some (Anthropic native, Codex Responses,
+# Gemini native, first-party OpenAI) extend this to accept a content-parts
+# list (text + image_url) so screenshots from computer_use survive.  Others
+# (Xiaomi MiMo, some Alibaba endpoints, a long tail of OpenAI-compatible
+# providers) reject the list with a 400 — the patterns below are the most
+# common error shapes we see.  Recovery: strip image parts from tool
+# messages in-place, record the (provider, model) for the rest of the
+# session so we don't waste another call learning the same lesson, retry.
+#
+# See: https://github.com/NousResearch/hermes-agent/issues/27344
+# Each entry is matched as a plain substring of the error message.
+# NOTE(review): every entry is lowercase — presumably the classifier
+# lowercases the error message before matching; confirm.
+_MULTIMODAL_TOOL_CONTENT_PATTERNS = [
+    # Xiaomi MiMo: {"error":{"code":"400","message":"Param Incorrect","param":"text is not set"}}
+    "text is not set",
+    # Generic "tool message must be string" shapes
+    "tool message content must be a string",
+    "tool content must be a string",
+    "tool message must be a string",
+    # OpenAI-compat servers that reject list-type tool content with a
+    # schema-validation message
+    "expected string, got list",
+    "expected string, got array",
+    # Alibaba/DashScope variant
+    "tool_call.content must be string",
+]
+
 # Context overflow patterns
 _CONTEXT_OVERFLOW_PATTERNS = [
    "context length",
@@ -781,6 +808,19 @@ def _classify_400(
 ) -> ClassifiedError:
    """Classify 400 Bad Request — context overflow, format error, or generic."""

+    # Multimodal tool content rejected from 400.  Must be checked BEFORE
+    # image_too_large because the recovery is different (strip image parts
+    # from tool messages, mark the model as no-list-tool-content for the
+    # rest of the session) and BEFORE context_overflow because some of the
+    # patterns ("text is not set") are ambiguous in isolation but become
+    # specific when combined with a 400 on a request known to contain
+    # multimodal tool content.
+    if any(p in error_msg for p in _MULTIMODAL_TOOL_CONTENT_PATTERNS):
+        return result_fn(
+            FailoverReason.multimodal_tool_content_unsupported,
+            retryable=True,
+        )
+
    # Image-too-large from 400 (Anthropic's 5 MB per-image check fires this way).
    # Must be checked BEFORE context_overflow because messages can trip both
    # patterns ("exceeds" + "image") and image-shrink is a cheaper recovery.
@@ -922,6 +962,13 @@ def _classify_by_message(
            should_compress=True,
        )

+    # Multimodal tool content patterns (from message text when no status_code)
+    if any(p in error_msg for p in _MULTIMODAL_TOOL_CONTENT_PATTERNS):
+        return result_fn(
+            FailoverReason.multimodal_tool_content_unsupported,
+            retryable=True,
+        )
+
    # Image-too-large patterns (from message text when no status_code)
    if any(p in error_msg for p in _IMAGE_TOO_LARGE_PATTERNS):
        return result_fn(
@@ -1258,6 +1258,10 @@ def build_nous_subscription_prompt(valid_tool_names: "set[str] | None" = None) -
        "terminal",
        "process",
        "execute_code",
+        "app_search_tools",
+        "app_tool_schemas",
+        "app_execute_tools",
+        "app_manage_connections",
    }

    if valid_names and not (valid_names & relevant_tool_names):
@@ -1279,7 +1283,7 @@ def build_nous_subscription_prompt(valid_tool_names: "set[str] | None" = None) -

    lines = [
        "# Nous Subscription",
-        "Nous subscription includes managed web tools (Firecrawl), image generation (FAL), OpenAI TTS, and browser automation (Browser Use) by default. Modal execution is optional.",
+        "Nous subscription includes managed web tools (Firecrawl), image generation (FAL), OpenAI TTS, browser automation (Browser Use), and app integrations (500+ apps) by default. Modal execution is optional.",
        "Current capability status:",
    ]
    lines.extend(_status_line(feature) for feature in features.items())
@@ -1294,6 +1298,39 @@ def build_nous_subscription_prompt(valid_tool_names: "set[str] | None" = None) -
    return "\n".join(lines)


+# =========================================================================
+# App tools (500+ external integrations) behavioural prompt
+# =========================================================================
+
+_APP_TOOLS_PROMPT = """\
+## App Tools (500+ External Integrations)
+
+You have app_search_tools, app_tool_schemas, app_execute_tools, and app_manage_connections available RIGHT NOW as callable tools. They are already configured and connected to the Nous tool gateway — no SDK installation, no API keys, no plugin setup needed. Just call them.
+
+**When to use:** When a user asks to interact with ANY external app or service — Gmail, Slack, GitHub, Jira, Notion, Google Sheets, Linear, HubSpot, Figma, Salesforce, or any of 500+ other apps. ALWAYS prefer these tools over loading skills about the same service (e.g. do NOT load the 'linear', 'airtable', 'google-workspace', 'notion', or any similar skill — use app_search_tools instead). Do NOT suggest installing SDKs, CLI tools, MCP servers, or API keys for external services — call app_search_tools directly.
+
+**Workflow:**
+1. Call app_search_tools with a clear use_case description to discover available tools
+2. Check the response for connection status — if no active connection, call app_manage_connections and share the auth link with the user
+3. Review the execution plan and pitfalls in the search response before executing
+4. If a tool has schemaRef instead of input_schema, call app_tool_schemas to get the full schema
+5. Execute tools via app_execute_tools with schema-compliant arguments
+
+**Session tracking:** Pass session: {generate_id: true} on your first app_search_tools call. Reuse the returned session.id in all subsequent calls. Generate a new session when the user pivots to a different task.
+
+**Important:** Never fabricate tool slugs or argument field names. Only use slugs and schemas returned by app_search_tools or app_tool_schemas."""
+
+
+def build_app_tools_prompt(valid_tool_names: "set[str] | None" = None) -> str:
+    """Return the app-tools guidance block, or ``""`` when it does not apply.
+
+    The prompt is emitted only when ``valid_tool_names`` is a non-empty
+    collection that contains ``"app_search_tools"``.  An unknown (``None``)
+    or empty tool set yields the empty string — the conservative default.
+    """
+    toolset_known = bool(valid_tool_names)
+    if toolset_known and "app_search_tools" in valid_tool_names:
+        return _APP_TOOLS_PROMPT
+    return ""
+
+
 # =========================================================================
 # Context files (SOUL.md, AGENTS.md, .cursorrules)
 # =========================================================================
@@ -0,0 +1,13 @@
+"""External secret source integrations.
+
+A secret source is anything that can supply environment-variable-shaped
+credentials at process startup, _after_ ~/.hermes/.env has loaded.  By
+default sources are non-destructive: they only set values for env vars
+that aren't already present, so .env and shell exports continue to win.
+
+Currently shipped:
+
+  - ``bitwarden`` — Bitwarden Secrets Manager (`bws` CLI).  See
+    ``agent.secret_sources.bitwarden`` for the integration and
+    ``hermes_cli.secrets_cli`` for the user-facing setup wizard.
+"""
@@ -0,0 +1,515 @@
+"""Bitwarden Secrets Manager (`bws` CLI) integration.
+
+Hermes pulls API keys from Bitwarden Secrets Manager at process startup
+so they don't have to live in plaintext in ``~/.hermes/.env``.
+
+Design summary
+--------------
+
+* The ``bws`` binary is auto-installed into ``<hermes_home>/bin/bws`` on
+  first use.  Hermes pins one version (``_BWS_VERSION``) and downloads
+  the matching asset from the official GitHub Releases page, verifying
+  the SHA-256 against the release's published checksum file.
+* The access token is stored in ``~/.hermes/.env`` as
+  ``BWS_ACCESS_TOKEN`` (or whatever name the user picked in
+  ``secrets.bitwarden.access_token_env``).  This is the one
+  bootstrap secret — every other provider key can live in Bitwarden.
+* Pulling secrets is a single ``bws secret list <project_id>
+  --output json`` call.  We cache the result in-process for
+  ``cache_ttl_seconds`` so repeated loads within the same process
+  (CLI startup, gateway hot-reload) don't hammer the API.
+* Failures NEVER block Hermes startup.  Missing binary, no network,
+  expired token, etc. all emit a one-line warning and continue with
+  whatever credentials ``.env`` already had.
+
+The module is intentionally subprocess-driven rather than going through
+the ``bitwarden-sdk-secrets`` Python package: one cross-platform binary
+is easier to lazy-install than a wheels-with-Rust-extension dependency.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import logging
+import os
+import platform
+import shutil
+import stat
+import subprocess
+import sys
+import tempfile
+import time
+import urllib.error
+import urllib.request
+import zipfile
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Configuration constants
+# ---------------------------------------------------------------------------
+
+# Pinned upstream version.  Bump in a follow-up PR — never auto-resolve
+# "latest" because upstream release shape (asset names, CLI flags) is
+# allowed to change between majors and we want updates to be deliberate.
+_BWS_VERSION = "2.0.0"
+
+_BWS_RELEASE_BASE = (
+    f"https://github.com/bitwarden/sdk-sm/releases/download/bws-v{_BWS_VERSION}"
+)
+_BWS_CHECKSUM_NAME = f"bws-sha256-checksums-{_BWS_VERSION}.txt"
+
+# How long to wait for bws subprocesses and HTTP downloads, in seconds.
+_BWS_DOWNLOAD_TIMEOUT = 60
+_BWS_RUN_TIMEOUT = 30
+
+# In-process cache so repeated load_hermes_dotenv() calls (CLI startup,
+# gateway hot-reload, test suites) don't re-fetch from BSM.
+_CacheKey = Tuple[str, str]  # (access_token_fingerprint, project_id)
+_CACHE: Dict[_CacheKey, "_CachedFetch"] = {}
+
+
+@dataclass
+class _CachedFetch:
+    """One cached BSM pull: the secrets plus the time they were fetched."""
+
+    secrets: Dict[str, str]
+    fetched_at: float
+
+    def is_fresh(self, ttl_seconds: float) -> bool:
+        """True while the entry is younger than ``ttl_seconds``.
+
+        A zero or negative TTL disables caching: the entry is never fresh.
+        """
+        age = time.time() - self.fetched_at
+        return ttl_seconds > 0 and age < ttl_seconds
+
+
+# ---------------------------------------------------------------------------
+# Public dataclasses
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class FetchResult:
+    """Outcome of a single BSM pull.
+
+    A fresh instance has no secrets, no warnings and no error, so it reports
+    ``ok``.  ``warnings`` are informational only and never affect ``ok``.
+    """
+
+    # Everything that was fetched, whether or not it was applied.
+    secrets: Dict[str, str] = field(default_factory=dict)
+    applied: List[str] = field(default_factory=list)   # set into os.environ
+    skipped: List[str] = field(default_factory=list)   # already set, not overridden
+    warnings: List[str] = field(default_factory=list)  # non-fatal issues
+    error: Optional[str] = None                        # fatal: nothing was fetched
+    # The bws executable that was resolved for this pull, if any.
+    binary_path: Optional[Path] = None
+
+    @property
+    def ok(self) -> bool:
+        """True when no fatal error was recorded."""
+        return self.error is None
+
+
+# ---------------------------------------------------------------------------
+# Binary discovery + lazy install
+# ---------------------------------------------------------------------------
+
+
+def _hermes_bin_dir() -> Path:
+    """Return the ``bin`` directory under the Hermes home directory.
+
+    This is where Hermes keeps the binaries it manages itself.
+    """
+    # Resolved at call time rather than at module import.
+    import hermes_constants
+
+    return hermes_constants.get_hermes_home() / "bin"
+
+
+def find_bws(*, install_if_missing: bool = False) -> Optional[Path]:
+    """Locate a usable ``bws`` executable.
+
+    Lookup order:
+      1. the managed copy at ``<hermes_home>/bin/bws`` (must be executable)
+      2. whatever ``shutil.which("bws")`` finds on PATH
+      3. with ``install_if_missing=True``, a fresh :func:`install_bws`
+
+    Returns None when nothing is found.  An auto-install failure is logged
+    as a warning and also yields None instead of raising.
+    """
+    managed_copy = _hermes_bin_dir() / _platform_binary_name()
+    if managed_copy.exists() and os.access(managed_copy, os.X_OK):
+        return managed_copy
+
+    on_path = shutil.which("bws")
+    if on_path:
+        return Path(on_path)
+
+    if not install_if_missing:
+        return None
+
+    try:
+        return install_bws()
+    except Exception as exc:  # noqa: BLE001 — never block startup
+        logger.warning("bws auto-install failed: %s", exc)
+    return None
+
+
+def _platform_binary_name() -> str:
+    """File name of the ``bws`` executable on the current operating system."""
+    if platform.system() == "Windows":
+        return "bws.exe"
+    return "bws"
+
+
+def _platform_asset_name() -> str:
+    """Return the upstream release asset filename for this machine.
+
+    Assets are named after Rust target triples.  macOS uses one universal
+    archive; Windows and Linux pick ``aarch64`` or ``x86_64`` from
+    ``platform.machine()``.  Linux additionally defaults to ``gnu`` (glibc)
+    and switches to ``musl`` only when ``ldd --version`` mentions it.
+
+    Raises RuntimeError on any other operating system.
+    """
+    os_name = platform.system()
+    cpu = platform.machine().lower()
+
+    if os_name == "Darwin":
+        # One universal binary covers both Intel and Apple Silicon.
+        return f"bws-macos-universal-{_BWS_VERSION}.zip"
+
+    if os_name not in ("Windows", "Linux"):
+        raise RuntimeError(
+            f"Unsupported platform for bws auto-install: {os_name} {cpu}"
+        )
+
+    arch = "aarch64" if cpu in ("arm64", "aarch64") else "x86_64"
+    if os_name == "Windows":
+        return f"bws-{arch}-pc-windows-msvc-{_BWS_VERSION}.zip"
+
+    # Linux.  The version banner can land on either stream (glibc's ldd
+    # prints it on stdout, musl's on stderr), so both are searched.
+    # Detection is best-effort: if the probe cannot run we keep "gnu".
+    libc = "gnu"
+    try:
+        probe = subprocess.run(
+            ["ldd", "--version"],
+            capture_output=True,
+            text=True,
+            timeout=2,
+        )
+    except (OSError, subprocess.TimeoutExpired):
+        pass
+    else:
+        if "musl" in (probe.stdout + probe.stderr).lower():
+            libc = "musl"
+    return f"bws-{arch}-unknown-linux-{libc}-{_BWS_VERSION}.zip"
+
+
+def install_bws(*, force: bool = False) -> Path:
+    """Download, verify, and install the pinned ``bws`` binary.
+
+    Returns the path to the installed executable.  An existing managed copy
+    is reused only when it is an executable regular file; otherwise (or
+    with ``force=True``) the pinned release is downloaded again.
+
+    Raises on any failure (network, checksum, extraction) — callers in the
+    auto-install path catch these; the user-facing ``hermes secrets
+    bitwarden setup`` surface lets them propagate so the wizard can show a
+    clear error.
+    """
+    bin_dir = _hermes_bin_dir()
+    bin_dir.mkdir(parents=True, exist_ok=True)
+    target = bin_dir / _platform_binary_name()
+
+    # Mirror find_bws(): a managed copy that is not executable is unusable,
+    # so reinstall it instead of handing the broken path back.
+    if not force and target.is_file() and os.access(target, os.X_OK):
+        return target
+
+    asset_name = _platform_asset_name()
+    asset_url = f"{_BWS_RELEASE_BASE}/{asset_name}"
+    checksum_url = f"{_BWS_RELEASE_BASE}/{_BWS_CHECKSUM_NAME}"
+
+    with tempfile.TemporaryDirectory(prefix="hermes-bws-") as tmpdir:
+        tmp = Path(tmpdir)
+        zip_path = tmp / asset_name
+        checksum_path = tmp / _BWS_CHECKSUM_NAME
+
+        logger.info("Downloading %s", asset_url)
+        _http_download(asset_url, zip_path)
+        _http_download(checksum_url, checksum_path)
+
+        # Verify before anything from the archive is unpacked.
+        expected = _expected_sha256(checksum_path, asset_name)
+        actual = _sha256_file(zip_path)
+        if expected.lower() != actual.lower():
+            raise RuntimeError(
+                f"Checksum mismatch for {asset_name}: "
+                f"expected {expected}, got {actual}"
+            )
+
+        with zipfile.ZipFile(zip_path) as zf:
+            member = _pick_zip_member(zf, _platform_binary_name())
+            # extract() sanitises the member name and returns the path it
+            # actually wrote; use that rather than re-deriving it.
+            extracted = Path(zf.extract(member, tmp))
+
+        # Move into place atomically.  We write to a sibling tempfile in
+        # the final directory so the rename can't cross filesystems.
+        fd, staged = tempfile.mkstemp(dir=str(bin_dir), prefix=".bws_")
+        os.close(fd)
+        try:
+            shutil.copy2(extracted, staged)
+            os.chmod(
+                staged,
+                stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR
+                | stat.S_IRGRP | stat.S_IXGRP
+                | stat.S_IROTH | stat.S_IXOTH,
+            )
+            os.replace(staged, target)
+        except BaseException:
+            # Don't leave a stray ``.bws_*`` file behind on failure.
+            try:
+                os.unlink(staged)
+            except OSError:
+                pass
+            raise
+
+    logger.info("Installed bws %s at %s", _BWS_VERSION, target)
+    return target
+
+
+def _http_download(url: str, dest: Path) -> None:
+    """Stream ``url`` into the file at ``dest``.
+
+    Raises RuntimeError when the request or the transfer fails, chaining
+    the underlying exception.
+    """
+    req = urllib.request.Request(url, headers={"User-Agent": "hermes-agent"})
+    try:
+        with urllib.request.urlopen(req, timeout=_BWS_DOWNLOAD_TIMEOUT) as resp:  # noqa: S310
+            with open(dest, "wb") as f:
+                shutil.copyfileobj(resp, f)
+    except OSError as exc:
+        # URLError/HTTPError are OSError subclasses.  Catching OSError also
+        # covers timeouts and connection resets raised while reading the
+        # body, which urllib does not wrap in URLError.
+        raise RuntimeError(f"Failed to download {url}: {exc}") from exc
+
+
+def _expected_sha256(checksum_file: Path, asset_name: str) -> str:
+    """Look up ``asset_name`` in the upstream ``bws-sha256-checksums-X.Y.Z.txt``.
+
+    The file is standard ``sha256sum`` output, one ``<hex>  <filename>``
+    entry per line.  ``sha256sum`` prefixes the filename with ``*`` when it
+    runs in binary mode (``<hex> *<filename>``), so that marker is ignored
+    when matching.
+
+    Returns the hex digest exactly as written in the file.  Raises
+    RuntimeError when no line names ``asset_name``.
+    """
+    text = checksum_file.read_text(encoding="utf-8", errors="replace")
+    for line in text.splitlines():
+        parts = line.strip().split()
+        if len(parts) < 2:
+            continue
+        listed = parts[-1]
+        if listed.startswith("*"):
+            listed = listed[1:]
+        if listed == asset_name:
+            return parts[0]
+    raise RuntimeError(
+        f"No checksum entry for {asset_name} in {checksum_file.name}"
+    )
+
+
+def _sha256_file(path: Path) -> str:
+    """Return the hex SHA-256 of the file at ``path``, read in 64 KiB chunks."""
+    digest = hashlib.sha256()
+    with open(path, "rb") as stream:
+        while True:
+            block = stream.read(65536)
+            if not block:
+                break
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def _pick_zip_member(zf: zipfile.ZipFile, binary_name: str) -> str:
+    """Return the archive member whose final path component is ``binary_name``.
+
+    The upstream archive has historically been flat (``bws`` at the root),
+    but a member nested under directories is accepted too.  When several
+    members match, the one with the shortest path wins, so a root-level
+    entry beats a nested one; ties keep archive order.
+
+    Raises RuntimeError when no member matches.
+    """
+    names = zf.namelist()
+    matches = [name for name in names if name.rsplit("/", 1)[-1] == binary_name]
+    if matches:
+        return min(matches, key=len)
+    raise RuntimeError(
+        f"Could not find {binary_name} inside downloaded archive "
+        f"(members: {names[:5]}...)"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Secret fetch + apply
+# ---------------------------------------------------------------------------
+
+
+def _token_fingerprint(token: str) -> str:
+    """First 16 hex characters of the token's SHA-256.
+
+    Used only to build cache keys, so the raw token never has to be one.
+    """
+    digest = hashlib.sha256(token.encode("utf-8"))
+    return digest.hexdigest()[:16]
+
+
+def fetch_bitwarden_secrets(
+    *,
+    access_token: str,
+    project_id: str,
+    binary: Optional[Path] = None,
+    cache_ttl_seconds: float = 300,
+    use_cache: bool = True,
+) -> Tuple[Dict[str, str], List[str]]:
+    """Pull the secrets for ``project_id`` from Bitwarden Secrets Manager.
+
+    Returns ``(secrets_dict, warnings_list)``.  The returned dict is always
+    the caller's own copy, so mutating it cannot corrupt the in-process
+    cache.  A cache hit returns an empty warnings list: warnings are only
+    reported by the call that actually ran ``bws``.
+
+    ``binary`` overrides discovery; when omitted, :func:`find_bws` is asked
+    to locate (and if necessary install) ``bws``.  ``use_cache=False`` skips
+    the cache lookup, but the fresh result is still stored.
+
+    Raises :class:`RuntimeError` for fatal conditions (missing binary,
+    auth failure, unparseable output).  Callers in the env_loader path
+    catch this and emit a single warning; callers in the user-facing
+    setup wizard let it propagate.
+    """
+    if not access_token:
+        raise RuntimeError("Bitwarden access token is empty")
+    if not project_id:
+        raise RuntimeError("Bitwarden project_id is empty")
+
+    cache_key = (_token_fingerprint(access_token), project_id)
+    if use_cache:
+        cached = _CACHE.get(cache_key)
+        if cached is not None and cached.is_fresh(cache_ttl_seconds):
+            # Hand out a copy so callers can't mutate the cached mapping.
+            return dict(cached.secrets), []
+
+    bws = binary or find_bws(install_if_missing=True)
+    if bws is None:
+        raise RuntimeError(
+            "bws binary not available — auto-install failed and `bws` is "
+            "not on PATH.  Install manually from "
+            "https://github.com/bitwarden/sdk-sm/releases or re-run "
+            "`hermes secrets bitwarden setup`."
+        )
+
+    secrets, warnings = _run_bws_list(bws, access_token, project_id)
+    # Cache a private copy for the same reason as above.
+    _CACHE[cache_key] = _CachedFetch(
+        secrets=dict(secrets), fetched_at=time.time()
+    )
+    return secrets, warnings
+
+
+def _run_bws_list(
+    bws: Path, access_token: str, project_id: str
+) -> Tuple[Dict[str, str], List[str]]:
+    """Run ``bws secret list <project_id> --output json`` and parse it.
+
+    The access token is handed to the child through the
+    ``BWS_ACCESS_TOKEN`` environment variable, never on the command line.
+
+    Returns ``(secrets, warnings)``.  Items that are not objects, or whose
+    ``key``/``value`` are not strings, are dropped silently; keys that are
+    not valid env-var names are dropped with a warning.  Empty output
+    yields ``({}, [warning])``.
+
+    Raises RuntimeError on timeout, failure to launch, a non-zero exit
+    status, non-JSON output, or a JSON payload that is not a list.
+    """
+    import re
+
+    cmd = [str(bws), "secret", "list", project_id, "--output", "json"]
+    env = os.environ.copy()
+    env["BWS_ACCESS_TOKEN"] = access_token
+    # Make sure we're not echoing telemetry / colour codes into json.
+    env.setdefault("NO_COLOR", "1")
+
+    try:
+        proc = subprocess.run(  # noqa: S603 — bws path is trusted
+            cmd,
+            env=env,
+            capture_output=True,
+            text=True,
+            timeout=_BWS_RUN_TIMEOUT,
+        )
+    except subprocess.TimeoutExpired as exc:
+        raise RuntimeError(
+            f"bws timed out after {_BWS_RUN_TIMEOUT}s fetching secrets"
+        ) from exc
+    except OSError as exc:
+        raise RuntimeError(f"failed to invoke bws: {exc}") from exc
+
+    if proc.returncode != 0:
+        # bws writes auth/network errors to stderr in plain English.
+        # Remove whole ANSI CSI sequences (ESC [ ... final byte), then any
+        # stray ESC bytes, so no "[31m"-style residue ends up in the
+        # message.  Surface the first 200 chars.
+        err = proc.stderr or proc.stdout or ""
+        err = re.sub(r"\x1b\[[0-?]*[ -/]*[@-~]", "", err)
+        err = err.replace("\x1b", "").strip()
+        raise RuntimeError(
+            f"bws exited {proc.returncode}: {err[:200]}"
+        )
+
+    raw = proc.stdout.strip()
+    if not raw:
+        return {}, ["bws returned no output (empty project?)"]
+
+    try:
+        payload = json.loads(raw)
+    except json.JSONDecodeError as exc:
+        raise RuntimeError(f"bws returned non-JSON output: {exc}") from exc
+
+    if not isinstance(payload, list):
+        raise RuntimeError(
+            f"bws returned unexpected shape: {type(payload).__name__}"
+        )
+
+    secrets: Dict[str, str] = {}
+    warnings: List[str] = []
+    for item in payload:
+        if not isinstance(item, dict):
+            continue
+        key = item.get("key")
+        value = item.get("value")
+        if not isinstance(key, str) or not isinstance(value, str):
+            continue
+        if not _is_valid_env_name(key):
+            warnings.append(
+                f"Skipping secret {key!r}: not a valid env-var name"
+            )
+            continue
+        secrets[key] = value
+    return secrets, warnings
+
+
+def _is_valid_env_name(name: str) -> bool:
+    """True when ``name`` matches ``[A-Za-z_][A-Za-z0-9_]*``.
+
+    ``str.isalpha`` / ``str.isalnum`` also accept non-ASCII letters and
+    digits (``"É"``, ``"²"``), so the name is required to be ASCII first.
+    """
+    if not name or not name.isascii():
+        return False
+    if not (name[0].isalpha() or name[0] == "_"):
+        return False
+    return all(c.isalnum() or c == "_" for c in name)
+
+
+# ---------------------------------------------------------------------------
+# Public entry point — called from hermes_cli.env_loader
+# ---------------------------------------------------------------------------
+
+
+def apply_bitwarden_secrets(
+    *,
+    enabled: bool,
+    access_token_env: str = "BWS_ACCESS_TOKEN",
+    project_id: str = "",
+    override_existing: bool = False,
+    cache_ttl_seconds: float = 300,
+    auto_install: bool = True,
+) -> FetchResult:
+    """Pull secrets from BSM and set them on ``os.environ``.
+
+    This is the function ``load_hermes_dotenv()`` calls after the .env
+    files have loaded.  It is intentionally defensive — a failed fetch
+    returns a :class:`FetchResult` with ``error`` set instead of raising.
+
+    Parameters mirror the ``secrets.bitwarden.*`` config keys so the
+    caller can just splat the dict in:
+
+    * ``enabled`` — when False, return an empty result without doing work.
+    * ``access_token_env`` — name of the env var holding the access token.
+    * ``project_id`` — BSM project to list.
+    * ``override_existing`` — when False, a key whose current value in
+      ``os.environ`` is non-empty is left alone and reported as skipped.
+      An env var set to the empty string counts as unset.
+    * ``cache_ttl_seconds`` — forwarded to :func:`fetch_bitwarden_secrets`.
+    * ``auto_install`` — whether a missing ``bws`` may be downloaded.
+
+    The key named by ``access_token_env`` is never applied, even with
+    ``override_existing=True``.
+    """
+    result = FetchResult()
+
+    if not enabled:
+        return result
+
+    access_token = os.environ.get(access_token_env, "").strip()
+    if not access_token:
+        result.error = (
+            f"secrets.bitwarden.enabled is true but {access_token_env} is "
+            "not set.  Run `hermes secrets bitwarden setup`."
+        )
+        return result
+
+    if not project_id:
+        result.error = (
+            "secrets.bitwarden.project_id is empty.  "
+            "Run `hermes secrets bitwarden setup`."
+        )
+        return result
+
+    binary = find_bws(install_if_missing=auto_install)
+    result.binary_path = binary
+    if binary is None:
+        # Report what actually happened: with auto_install on, reaching
+        # this point means the install was attempted and failed.
+        if auto_install:
+            result.error = (
+                "bws binary not available — auto-install failed and `bws` "
+                "is not on PATH.  Run `hermes secrets bitwarden setup` to "
+                "install."
+            )
+        else:
+            result.error = (
+                "bws binary not available and auto-install is disabled.  "
+                "Run `hermes secrets bitwarden setup` to install."
+            )
+        return result
+
+    try:
+        secrets, warnings = fetch_bitwarden_secrets(
+            access_token=access_token,
+            project_id=project_id,
+            binary=binary,
+            cache_ttl_seconds=cache_ttl_seconds,
+        )
+    except Exception as exc:  # noqa: BLE001 — never block startup
+        # RuntimeError is the documented failure; anything else is caught
+        # too so an unexpected error in the fetch cannot escape.
+        result.error = str(exc) or type(exc).__name__
+        return result
+
+    result.secrets = secrets
+    result.warnings.extend(warnings)
+
+    for key, value in secrets.items():
+        if key == access_token_env:
+            # Don't let BSM clobber the very token we used to fetch
+            # itself — that would be a footgun if someone stored the
+            # token as a BSM secret too.
+            result.skipped.append(key)
+            continue
+        if not override_existing and os.environ.get(key):
+            result.skipped.append(key)
+            continue
+        os.environ[key] = value
+        result.applied.append(key)
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Test hook — used by hermetic tests to flush the cache between cases.
+# ---------------------------------------------------------------------------
+
+
+def _reset_cache_for_tests() -> None:
+    """Drop every entry from the module-level ``_CACHE``."""
+    _CACHE.clear()
@@ -12,7 +12,7 @@ import sys
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set, Tuple

-from hermes_constants import get_config_path, get_skills_dir
+from hermes_constants import get_config_path, get_skills_dir, is_termux

 logger = logging.getLogger(__name__)

@@ -24,7 +24,43 @@ PLATFORM_MAP = {
    "windows": "win32",
 }

-EXCLUDED_SKILL_DIRS = frozenset((".git", ".github", ".hub", ".archive"))
+EXCLUDED_SKILL_DIRS = frozenset(
+    (
+        ".git",
+        ".github",
+        ".hub",
+        ".archive",
+        ".venv",
+        "venv",
+        "node_modules",
+        "site-packages",
+        "__pycache__",
+        ".tox",
+        ".nox",
+        ".pytest_cache",
+        ".mypy_cache",
+        ".ruff_cache",
+    )
+)
+
+
+def is_excluded_skill_path(path) -> bool:
+    """Report whether *path* passes through an excluded directory.
+
+    A path is excluded when any one of its components is a member of
+    ``EXCLUDED_SKILL_DIRS``.  Apply this to every SKILL.md path that
+    ``rglob`` yields so dependency, virtualenv, VCS, and cache directories
+    are pruned the same way at every skill-scanning site.
+
+    *path* may be a Path-like object (anything exposing ``.parts``) or a
+    string.
+    """
+    if hasattr(path, "parts"):
+        components = path.parts
+    else:
+        # Plain strings (or anything else) are split via PurePath.
+        from pathlib import PurePath
+        components = PurePath(str(path)).parts
+    for component in components:
+        if component in EXCLUDED_SKILL_DIRS:
+            return True
+    return False
+

 # ── Lazy YAML loader ─────────────────────────────────────────────────────

@@ -100,6 +136,14 @@ def skill_matches_platform(frontmatter: Dict[str, Any]) -> bool:

    If the field is absent or empty the skill is compatible with **all**
    platforms (backward-compatible default).
+
+    Termux note: on Termux/Android, ``sys.platform`` is ``"linux"`` on
+    older Pythons but became ``"android"`` on Python 3.13+. Termux is a
+    Linux userland riding on the Android kernel, so skills tagged
+    ``linux`` are treated as compatible in Termux regardless of which
+    ``sys.platform`` value Python reports. Individual Linux commands
+    inside a skill may still misbehave (no systemd, BusyBox utils, no
+    apt/dnf, etc.) but that is on the skill, not on platform gating.
    """
    platforms = frontmatter.get("platforms")
    if not platforms:
@@ -107,11 +151,21 @@ def skill_matches_platform(frontmatter: Dict[str, Any]) -> bool:
    if not isinstance(platforms, list):
        platforms = [platforms]
    current = sys.platform
+    running_in_termux = is_termux()
    for platform in platforms:
        normalized = str(platform).lower().strip()
        mapped = PLATFORM_MAP.get(normalized, normalized)
        if current.startswith(mapped):
            return True
+        # Termux runs a Linux userland on Android. Accept linux-tagged
+        # skills regardless of whether sys.platform is "linux" (pre-3.13
+        # Termux) or "android" (Python 3.13+ Termux, and any other
+        # Android runtime).
+        if running_in_termux and mapped == "linux":
+            return True
+        # Explicit termux/android tags match a Termux session too.
+        if running_in_termux and mapped in ("termux", "android"):
+            return True
    return False


@@ -478,7 +532,8 @@ def extract_skill_description(frontmatter: Dict[str, Any]) -> str:
 def iter_skill_index_files(skills_dir: Path, filename: str):
    """Walk skills_dir yielding sorted paths matching *filename*.

-    Excludes ``.git``, ``.github``, ``.hub``, ``.archive`` directories.
+    Excludes Hermes metadata, VCS, virtualenv/dependency, and cache
+    directories so dependencies cannot register nested skills.
    """
    matches = []
    for root, dirs, files in os.walk(skills_dir, followlinks=True):
@@ -130,6 +130,12 @@ def build_system_prompt_parts(agent: Any, system_message: Optional[str] = None)
    nous_subscription_prompt = _r.build_nous_subscription_prompt(agent.valid_tool_names)
    if nous_subscription_prompt:
        stable_parts.append(nous_subscription_prompt)
+
+    # App tools (500+ external integrations) behavioural guidance
+    app_tools_prompt = _r.build_app_tools_prompt(agent.valid_tool_names)
+    if app_tools_prompt:
+        stable_parts.append(app_tools_prompt)
+
    # Tool-use enforcement: tells the model to actually call tools instead
    # of describing intended actions.  Controlled by config.yaml
    # agent.tool_use_enforcement:
@@ -10221,6 +10221,7 @@ class HermesCLI:
            self._voice_processing = True

        submitted = False
+        transcription_failed = False
        wav_path = None
        try:
            if self._voice_recorder is None:
@@ -10269,18 +10270,24 @@ class HermesCLI:
            else:
                error = result.get("error", "Unknown error")
                _cprint(f"\n{_DIM}Transcription failed: {error}{_RST}")
+                transcription_failed = True

        except Exception as e:
            _cprint(f"\n{_DIM}Voice processing error: {e}{_RST}")
+            transcription_failed = wav_path is not None
        finally:
            with self._voice_lock:
                self._voice_processing = False
            if hasattr(self, '_app') and self._app:
                self._app.invalidate()
-            # Clean up temp file
+            # Clean up temp file unless transcription failed. On failure, keep
+            # the source recording so long dictation is not lost.
            try:
                if wav_path and os.path.isfile(wav_path):
-                    os.unlink(wav_path)
+                    if transcription_failed:
+                        _cprint(f"{_DIM}Recording preserved at: {wav_path}{_RST}")
+                    else:
+                        os.unlink(wav_path)
            except Exception:
                pass

@@ -18,6 +18,7 @@ Security features (based on OWASP + NIST SP 800-63-4 guidance):
 Storage: ~/.hermes/pairing/
 """

+import hashlib
 import json
 import os
 import secrets
@@ -148,6 +149,11 @@ class PairingStore:

    # ----- Pending codes -----

+    @staticmethod
+    def _hash_code(code: str, salt: bytes) -> str:
+        """Return the hex SHA-256 digest of *salt* followed by UTF-8 *code*."""
+        hasher = hashlib.sha256(salt)
+        hasher.update(code.encode("utf-8"))
+        return hasher.hexdigest()
+
    def generate_code(
        self, platform: str, user_id: str, user_name: str = ""
    ) -> Optional[str]:
@@ -158,6 +164,9 @@ class PairingStore:
          - User is rate-limited (too recent request)
          - Max pending codes reached for this platform
          - User/platform is in lockout due to failed attempts
+
+        The code is NOT stored in plaintext.  Only a salted SHA-256 hash is
+        persisted so that reading the pending file does not reveal codes.
        """
        with self._lock:
            self._cleanup_expired(platform)
@@ -178,8 +187,17 @@ class PairingStore:
            # Generate cryptographically random code
            code = "".join(secrets.choice(ALPHABET) for _ in range(CODE_LENGTH))

-            # Store pending request
-            pending[code] = {
+            # Hash the code with a random salt before storing
+            salt = os.urandom(16)
+            code_hash = self._hash_code(code, salt)
+
+            # Use a unique entry id as the key (not the code itself)
+            entry_id = secrets.token_hex(8)
+
+            # Store pending request with hashed code
+            pending[entry_id] = {
+                "hash": code_hash,
+                "salt": salt.hex(),
                "user_id": user_id,
                "user_name": user_name,
                "created_at": time.time(),
@@ -195,10 +213,16 @@ class PairingStore:
        """
        Approve a pairing code. Adds the user to the approved list.

-        Returns {user_id, user_name} on success, None if code is
+        Returns ``{user_id, user_name}`` on success, ``None`` if the code is
        invalid/expired OR the platform is currently locked out after
        ``MAX_FAILED_ATTEMPTS`` failed approvals (#10195). Callers can
        disambiguate with ``_is_locked_out(platform)``.
+
+        Verification: the user-provided code is hashed with each stored
+        entry's salt and compared to the stored hash using constant-time
+        comparison. Pre-hash entries (legacy plaintext-key format from
+        pre-upgrade pending.json files) are silently ignored — they get
+        pruned at TTL by ``_cleanup_expired``.
        """
        with self._lock:
            self._cleanup_expired(platform)
@@ -213,34 +237,73 @@ class PairingStore:
                return None

            pending = self._load_json(self._pending_path(platform))
-            if code not in pending:
+
+            # Find the entry whose hash matches the provided code.
+            # Tolerate legacy plaintext-key entries (no salt/hash) and
+            # malformed entries — skip them rather than KeyError, so an
+            # in-place upgrade across an existing pending.json doesn't
+            # crash on the first approve call. Legacy entries get pruned
+            # at their TTL by _cleanup_expired.
+            matched_key = None
+            matched_entry = None
+            for entry_id, entry in pending.items():
+                if not isinstance(entry, dict):
+                    continue
+                if "salt" not in entry or "hash" not in entry:
+                    continue
+                try:
+                    salt = bytes.fromhex(entry["salt"])
+                except ValueError:
+                    continue
+                candidate_hash = self._hash_code(code, salt)
+                if secrets.compare_digest(candidate_hash, entry["hash"]):
+                    matched_key = entry_id
+                    matched_entry = entry
+                    break
+
+            if matched_key is None:
                self._record_failed_attempt(platform)
                return None

-            entry = pending.pop(code)
+            del pending[matched_key]
            self._save_json(self._pending_path(platform), pending)

            # Add to approved list
-            self._approve_user(platform, entry["user_id"], entry.get("user_name", ""))
+            self._approve_user(platform, matched_entry["user_id"],
+                               matched_entry.get("user_name", ""))

            return {
-                "user_id": entry["user_id"],
-                "user_name": entry.get("user_name", ""),
+                "user_id": matched_entry["user_id"],
+                "user_name": matched_entry.get("user_name", ""),
            }

    def list_pending(self, platform: str = None) -> list:
-        """List pending pairing requests, optionally filtered by platform."""
+        """List pending pairing requests, optionally filtered by platform.
+
+        Codes are stored hashed — the ``code`` field is replaced with the
+        first 8 hex characters of the hash so admins can distinguish entries
+        without revealing the original code. Legacy plaintext-key entries
+        (pre-hash format) are shown with a "legacy" placeholder so admins
+        can see them age out without crashing on a missing ``hash`` field.
+        """
        results = []
        platforms = [platform] if platform else self._all_platforms("pending")
        for p in platforms:
            self._cleanup_expired(p)
            pending = self._load_json(self._pending_path(p))
-            for code, info in pending.items():
-                age_min = int((time.time() - info["created_at"]) / 60)
+            for entry_id, info in pending.items():
+                if not isinstance(info, dict):
+                    continue
+                created_at = info.get("created_at")
+                if not isinstance(created_at, (int, float)):
+                    continue
+                age_min = int((time.time() - created_at) / 60)
+                hash_val = info.get("hash")
+                code_display = hash_val[:8] if isinstance(hash_val, str) else "legacy"
                results.append({
                    "platform": p,
-                    "code": code,
-                    "user_id": info["user_id"],
+                    "code": code_display,
+                    "user_id": info.get("user_id", ""),
                    "user_name": info.get("user_name", ""),
                    "age_minutes": age_min,
                })
@@ -297,17 +360,29 @@ class PairingStore:
    # ----- Cleanup -----

    def _cleanup_expired(self, platform: str) -> None:
-        """Remove expired pending codes."""
+        """Remove expired pending codes.
+
+        Tolerant of malformed / legacy entries — anything without a numeric
+        ``created_at`` is treated as expired (it's effectively unusable
+        with the new hash-keyed schema anyway).
+        """
        path = self._pending_path(platform)
        pending = self._load_json(path)
        now = time.time()
-        expired = [
-            code for code, info in pending.items()
-            if (now - info["created_at"]) > CODE_TTL_SECONDS
-        ]
+        expired = []
+        for entry_id, info in pending.items():
+            if not isinstance(info, dict):
+                expired.append(entry_id)
+                continue
+            created_at = info.get("created_at")
+            if not isinstance(created_at, (int, float)):
+                expired.append(entry_id)
+                continue
+            if (now - created_at) > CODE_TTL_SECONDS:
+                expired.append(entry_id)
        if expired:
-            for code in expired:
-                del pending[code]
+            for entry_id in expired:
+                del pending[entry_id]
            self._save_json(path, pending)

    def _all_platforms(self, suffix: str) -> list:
@@ -308,11 +308,26 @@ class WebhookAdapter(BasePlatformAdapter):
            data = json.loads(subs_path.read_text(encoding="utf-8"))
            if not isinstance(data, dict):
                return
-            # Merge: static routes take precedence over dynamic ones
-            self._dynamic_routes = {
-                k: v for k, v in data.items()
-                if k not in self._static_routes
-            }
+            # Merge: static routes take precedence over dynamic ones.
+            # Reject any dynamic route whose effective secret is empty —
+            # an empty secret would cause _handle_webhook to skip HMAC
+            # validation entirely, letting unauthenticated callers in.
+            new_dynamic: Dict[str, dict] = {}
+            for k, v in data.items():
+                if k in self._static_routes:
+                    continue
+                effective_secret = v.get("secret", self._global_secret)
+                if not effective_secret:
+                    logger.warning(
+                        "[webhook] Dynamic route '%s' skipped: 'secret' is "
+                        "missing or empty. Set a valid HMAC secret, or use "
+                        "'%s' to explicitly disable auth (testing only).",
+                        k,
+                        _INSECURE_NO_AUTH,
+                    )
+                    continue
+                new_dynamic[k] = v
+            self._dynamic_routes = new_dynamic
            self._routes = {**self._dynamic_routes, **self._static_routes}
            self._dynamic_routes_mtime = mtime
            logger.info(
@@ -1109,7 +1109,7 @@ def _check_unavailable_skill(command_name: str) -> str | None:
    normalized = command_name.lower().replace("_", "-")
    try:
        from tools.skills_tool import _get_disabled_skill_names
-        from agent.skill_utils import get_all_skills_dirs
+        from agent.skill_utils import get_all_skills_dirs, is_excluded_skill_path
        disabled = _get_disabled_skill_names()

        # Check disabled skills across all dirs (local + external)
@@ -1117,7 +1117,7 @@ def _check_unavailable_skill(command_name: str) -> str | None:
            if not skills_dir.exists():
                continue
            for skill_md in skills_dir.rglob("SKILL.md"):
-                if any(part in {'.git', '.github', '.hub', '.archive'} for part in skill_md.parts):
+                if is_excluded_skill_path(skill_md):
                    continue
                slug, declared_name = _skill_slug_from_frontmatter(skill_md)
                if not slug or not declared_name:
@@ -1136,6 +1136,8 @@ def _check_unavailable_skill(command_name: str) -> str | None:
        optional_dir = get_optional_skills_dir(repo_root / "optional-skills")
        if optional_dir.exists():
            for skill_md in optional_dir.rglob("SKILL.md"):
+                if is_excluded_skill_path(skill_md):
+                    continue
                slug, _declared = _skill_slug_from_frontmatter(skill_md)
                if not slug:
                    continue
@@ -1747,8 +1747,48 @@ DEFAULT_CONFIG = {
        "retries": 2,
    },

+    # =========================================================================
+    # External secret sources
+    # =========================================================================
+    # Pull credentials from external secret managers at process startup
+    # rather than storing them in ~/.hermes/.env.
+    "secrets": {
+        "bitwarden": {
+            # Master switch.  When false, BSM is never contacted and the
+            # bws binary is never auto-installed — same as not having
+            # this section at all.
+            "enabled": False,
+            # Name of the env var that holds the Bitwarden machine-account
+            # access token.  This is the one bootstrap secret; it lives
+            # in ~/.hermes/.env (or your shell) and never in config.yaml.
+            "access_token_env": "BWS_ACCESS_TOKEN",
+            # UUID of the BSM project to sync from.
+            "project_id": "",
+            # Seconds to cache fetched secrets in-process.  0 disables.
+            "cache_ttl_seconds": 300,
+            # When True, BSM values overwrite existing env vars.  Default
+            # True because the point of using BSM is centralized rotation —
+            # if .env had the final say, rotating in Bitwarden wouldn't
+            # take effect until you also cleared the matching .env line.
+            "override_existing": True,
+            # When True, the bws binary is auto-downloaded into
+            # ~/.hermes/bin/ on first use.  When False you must install
+            # bws yourself and have it on PATH.
+            "auto_install": True,
+        },
+    },
+
+    # ── Nous Portal feature flags ──────────────────────────────────────
+    "portal": {
+        # App tools: 500+ external app integrations (Gmail, Slack, GitHub,
+        # Notion, etc.) via the Nous tool gateway.  Requires an active Nous
+        # subscription.  Set to False to hide the app_tools toolset even
+        # when a subscription is present.
+        "app_tools": True,
+    },
+
    # Config schema version - bump this when adding new required fields
-    "_config_version": 23,
+    "_config_version": 24,
 }

 # =============================================================================
@@ -2236,6 +2276,22 @@ OPTIONAL_ENV_VARS = {
        "category": "tool",
        "advanced": True,
    },
+    "TOOLS_GATEWAY_URL": {
+        "description": "Explicit URL for the tools-gateway (app integrations). Overrides the auto-derived tools-gateway.nousresearch.com",
+        "prompt": "Tools-gateway URL",
+        "url": None,
+        "password": False,
+        "category": "tool",
+        "advanced": True,
+    },
+    "PORTAL_APP_TOOLS": {
+        "description": "Enable app integration tools (500+ apps via Nous tool gateway). Requires Nous subscription.",
+        "prompt": "Enable app tools (500+ apps)",
+        "url": None,
+        "password": False,
+        "category": "tool",
+        "advanced": True,
+    },
    "TAVILY_API_KEY": {
        "description": "Tavily API key for AI-native web search, extract, and crawl",
        "prompt": "Tavily API key",
@@ -3017,7 +3073,7 @@ def _normalize_custom_provider_entry(
        "api_mode", "transport", "model", "default_model", "models",
        "context_length", "rate_limit_delay",
        "request_timeout_seconds", "stale_timeout_seconds",
-        "discover_models",
+        "discover_models", "extra_body",
    }
    for camel, snake in _CAMEL_ALIASES.items():
        if camel in entry and snake not in entry:
@@ -3112,6 +3168,10 @@ def _normalize_custom_provider_entry(
    if isinstance(discover_models, bool):
        normalized["discover_models"] = discover_models

+    extra_body = entry.get("extra_body")
+    if isinstance(extra_body, dict):
+        normalized["extra_body"] = dict(extra_body)
+
    return normalized


@@ -3266,13 +3326,13 @@ _KNOWN_ROOT_KEYS = {
    "fallback_providers", "credential_pool_strategies", "toolsets",
    "agent", "terminal", "display", "compression", "delegation",
    "auxiliary", "custom_providers", "context", "memory", "gateway",
-    "sessions",
+    "sessions", "portal",
 }

 # Valid fields inside a custom_providers list entry
 _VALID_CUSTOM_PROVIDER_FIELDS = {
    "name", "base_url", "api_key", "api_mode", "model", "models",
-    "context_length", "rate_limit_delay",
+    "context_length", "rate_limit_delay", "extra_body",
    # key_env is read at runtime by runtime_provider.py and auxiliary_client.py
    # — include it here so the set accurately describes the supported schema.
    "key_env",
@@ -3929,6 +3989,26 @@ def migrate_config(interactive: bool = True, quiet: bool = False) -> Dict[str, A
                        f"{', '.join(added_aux)}"
                    )

+    # ── Version 23 → 24: inject app_tools into saved platform_toolsets ──
+    # The portal.app_tools config flag is handled by deep-merge (DEFAULT_CONFIG
+    # has it, so load_config() always includes it). But platform_toolsets are
+    # user-owned lists that deep-merge can't append to — existing users who
+    # ran `hermes tools` have a saved list that won't include app_tools.
+    if current_ver < 24:
+        config = read_raw_config()
+        pt = config.get("platform_toolsets")
+        if isinstance(pt, dict):
+            patched = False
+            for plat_key, ts_list in pt.items():
+                if isinstance(ts_list, list) and "app_tools" not in ts_list:
+                    ts_list.append("app_tools")
+                    patched = True
+            if patched:
+                save_config(config)
+                results["config_added"].append("app_tools added to platform_toolsets")
+                if not quiet:
+                    print("  ✓ Added app_tools to saved platform toolset lists")
+
    if current_ver < latest_ver and not quiet:
        print(f"Config version: {current_ver} → {latest_ver}")
    
@@ -71,7 +71,7 @@ def curses_checklist(
                curses.use_default_colors()
                curses.init_pair(1, curses.COLOR_GREEN, -1)
                curses.init_pair(2, curses.COLOR_YELLOW, -1)
-                curses.init_pair(3, 8, -1)  # dim gray
+                curses.init_pair(3, 8 if curses.COLORS > 8 else curses.COLOR_WHITE, -1)  # dim gray
            cursor = 0
            scroll_offset = 0

@@ -16,6 +16,7 @@ from pathlib import Path
 from hermes_cli.config import get_hermes_home, get_env_path, get_project_root, load_config
 from hermes_cli.env_loader import load_hermes_dotenv
 from hermes_constants import display_hermes_home
+from agent.skill_utils import is_excluded_skill_path


 def _get_git_commit(project_root: Path) -> str:
@@ -69,6 +70,8 @@ def _count_skills(hermes_home: Path) -> int:
        return 0
    count = 0
    for item in skills_dir.rglob("SKILL.md"):
+        if is_excluded_skill_path(item):
+            continue
        count += 1
    return count

@@ -21,6 +21,44 @@ _CREDENTIAL_SUFFIXES = ("_API_KEY", "_TOKEN", "_SECRET", "_KEY")
 # tests) don't spam the same warning multiple times.
 _WARNED_KEYS: set[str] = set()

+# Registry: env-var name → label of the external secret source
+# ("bitwarden", etc.) that injected it during load_hermes_dotenv().
+# The setup / `hermes model` flows consult it to label detected
+# credentials, so a user whose .env lacks the key can still see WHERE it
+# came from (otherwise the "credentials detected ✓" line looks identical
+# to the .env case and nothing hints that Bitwarden is wired up).
+_SECRET_SOURCES: dict[str, str] = {}
+
+
+def get_secret_source(env_var: str) -> str | None:
+    """Look up which external secret source supplied ``env_var``.
+
+    Returns the recorded label (``"bitwarden"`` for keys pulled from
+    Bitwarden Secrets Manager during this process's
+    ``load_hermes_dotenv()`` call), or ``None`` when nothing is recorded:
+    the key came from ``.env``, the shell environment, or is untracked.
+    """
+    return _SECRET_SOURCES.get(env_var, None)
+
+
+def format_secret_source_suffix(env_var: str) -> str:
+    """Build the display suffix naming where a credential came from.
+
+    Yields ``" (from Bitwarden)"`` for Bitwarden-sourced keys,
+    ``" (from <label>)"`` for any other recorded source, and ``""`` when
+    no source is recorded (``.env`` or the shell — the implicit cases
+    users already understand).
+    """
+    origin = get_secret_source(env_var)
+    if origin == "bitwarden":
+        return " (from Bitwarden)"
+    # Generic form covers future sources (e.g. 1Password, HashiCorp Vault)
+    # without touching call sites; an empty/missing label yields no suffix.
+    return f" (from {origin})" if origin else ""
+

 def _format_offending_chars(value: str, limit: int = 3) -> str:
    """Return a compact 'U+XXXX ('c'), ...' summary of non-ASCII codepoints."""
@@ -172,4 +210,87 @@ def load_hermes_dotenv(
        _load_dotenv_with_fallback(project_env_path, override=not loaded)
        loaded.append(project_env_path)

+    _apply_external_secret_sources(home_path)
+
    return loaded
+
+
+def _apply_external_secret_sources(home_path: Path) -> None:
+    """Pull secrets from external sources (currently Bitwarden) into env.
+
+    Runs AFTER dotenv loads so .env values are visible (we use them to
+    locate the access token) but BEFORE the rest of Hermes reads
+    ``os.environ`` for credentials.  Any failure here is reported on
+    stderr (or silently skipped for config problems) and swallowed —
+    external secret sources must never block startup.
+
+    Fallback values used when a key is absent from config.yaml mirror
+    ``DEFAULT_CONFIG["secrets"]["bitwarden"]``: this function reads the
+    raw YAML, which is NOT deep-merged with the defaults.
+
+    Args:
+        home_path: Hermes home directory containing ``config.yaml``.
+    """
+    try:
+        cfg = _load_secrets_config(home_path)
+    except Exception:  # noqa: BLE001 — config errors must not block startup
+        return
+
+    # config.yaml is hand-edited: tolerate ``secrets:`` or ``bitwarden:``
+    # being a scalar/list instead of a mapping rather than crashing on .get.
+    bw_cfg = cfg.get("bitwarden") if isinstance(cfg, dict) else None
+    if not isinstance(bw_cfg, dict) or not bw_cfg.get("enabled"):
+        return
+
+    try:
+        from agent.secret_sources.bitwarden import apply_bitwarden_secrets
+    except ImportError:
+        return
+
+    # A null or non-numeric TTL must not abort startup; use the default.
+    try:
+        cache_ttl_seconds = float(bw_cfg.get("cache_ttl_seconds", 300))
+    except (TypeError, ValueError):
+        cache_ttl_seconds = 300.0
+
+    try:
+        result = apply_bitwarden_secrets(
+            enabled=True,
+            access_token_env=bw_cfg.get("access_token_env") or "BWS_ACCESS_TOKEN",
+            project_id=bw_cfg.get("project_id") or "",
+            # Default True to match DEFAULT_CONFIG: BSM is the source of
+            # truth so that rotating a secret in Bitwarden takes effect.
+            override_existing=bool(bw_cfg.get("override_existing", True)),
+            cache_ttl_seconds=cache_ttl_seconds,
+            auto_install=bool(bw_cfg.get("auto_install", True)),
+        )
+    except Exception as exc:  # noqa: BLE001 — must never block startup
+        print(
+            f"  Bitwarden Secrets Manager: {exc}",
+            file=sys.stderr,
+        )
+        return
+
+    if result.applied:
+        # Re-run the ASCII sanitization pass: BSM values are user-supplied
+        # and might have the same copy-paste corruption as a manually
+        # edited .env (see #6843).
+        _sanitize_loaded_credentials()
+        # Remember where these came from so the setup / `hermes model`
+        # flows can label detected credentials with "(from Bitwarden)" —
+        # otherwise users see "credentials ✓" with no hint that the value
+        # came from BSM rather than .env.
+        for name in result.applied:
+            _SECRET_SOURCES[name] = "bitwarden"
+        print(
+            f"  Bitwarden Secrets Manager: applied {len(result.applied)} "
+            f"secret{'s' if len(result.applied) != 1 else ''} "
+            f"({', '.join(sorted(result.applied))})",
+            file=sys.stderr,
+        )
+    if result.error:
+        print(
+            f"  Bitwarden Secrets Manager: {result.error}",
+            file=sys.stderr,
+        )
+    for warn in result.warnings:
+        print(
+            f"  Bitwarden Secrets Manager: {warn}",
+            file=sys.stderr,
+        )
+
+
+def _load_secrets_config(home_path: Path) -> dict:
+    """Read just the ``secrets:`` section out of config.yaml.
+
+    Imported lazily and isolated from the main config loader so a
+    malformed config can't take down dotenv loading entirely.
+
+    Args:
+        home_path: Hermes home directory containing ``config.yaml``.
+
+    Returns:
+        The ``secrets`` mapping, or ``{}`` when the file is missing,
+        PyYAML is unavailable, the file cannot be read or parsed, or
+        either the document root or ``secrets`` is not a mapping.
+    """
+    config_path = home_path / "config.yaml"
+    if not config_path.exists():
+        return {}
+    try:
+        import yaml  # type: ignore
+    except ImportError:
+        return {}
+    try:
+        with open(config_path, "r", encoding="utf-8") as f:
+            data = yaml.safe_load(f)
+    except Exception:  # noqa: BLE001
+        return {}
+    # A YAML document may legally be a list or scalar at the root, and
+    # ``secrets:`` may be mistyped; honour the ``-> dict`` contract.
+    secrets_cfg = data.get("secrets") if isinstance(data, dict) else None
+    return secrets_cfg if isinstance(secrets_cfg, dict) else {}
@@ -275,6 +275,133 @@ def _is_termux_startup_environment(env: dict[str, str] | None = None) -> bool:
    )


+def _read_packed_ref(common_dir: Path, ref: str) -> str | None:
+    """Resolve *ref* from ``packed-refs`` under *common_dir* without git.
+
+    Each entry is ``<sha> <ref>``. Blank lines, ``#`` comments (including
+    the ``# pack-refs with:`` header) and ``^<sha>`` peel lines are
+    skipped.
+
+    Returns:
+        The sha recorded for *ref*, or ``None`` if the file cannot be
+        read or holds no entry for *ref*.
+    """
+    packed_refs = common_dir / "packed-refs"
+    try:
+        contents = packed_refs.read_text(encoding="utf-8", errors="replace")
+    except OSError:
+        return None
+    for entry in contents.splitlines():
+        if entry == "" or entry[0] in "#^":
+            continue
+        sha, separator, name = entry.partition(" ")
+        # Only a line containing a space is a "<sha> <ref>" pair.
+        if separator and name.strip() == ref:
+            return sha.strip()
+    return None
+
+
+def _read_git_revision_fingerprint(repo_root: Path) -> str | None:
+    """Return a cheap checkout fingerprint without spawning git.
+
+    Reads the ``.git`` metadata under *repo_root* directly from disk.
+
+    Returns:
+        ``"git:<ref>:<value>"`` when HEAD starts with ``ref:``, where
+        ``<value>`` is the stripped content of the loose ref file, else
+        the packed-refs entry, else the literal ``unresolved``;
+        ``"git:HEAD:<contents>"`` when HEAD does not start with ``ref:``;
+        ``None`` when an ``OSError`` is raised while reading.
+    """
+    git_dir = repo_root / ".git"
+    try:
+        # ``.git`` may be a regular file holding a ``gitdir: <path>``
+        # pointer rather than a directory; follow the first such line.
+        if git_dir.is_file():
+            for line in git_dir.read_text(encoding="utf-8", errors="replace").splitlines():
+                key, _, value = line.partition(":")
+                if key.strip() == "gitdir" and value.strip():
+                    git_dir = (repo_root / value.strip()).resolve()
+                    break
+        # Worktrees point HEAD at a per-worktree gitdir but pack their refs
+        # in the main repo's gitdir (referenced via ``commondir``). Resolve
+        # that up front so packed-refs lookups hit the right file.
+        common_dir = git_dir
+        commondir_file = git_dir / "commondir"
+        if commondir_file.exists():
+            try:
+                rel = commondir_file.read_text(encoding="utf-8", errors="replace").strip()
+                if rel:
+                    common_dir = (git_dir / rel).resolve()
+            except OSError:
+                # Unreadable commondir: keep using git_dir as the common dir.
+                pass
+        head_file = git_dir / "HEAD"
+        head = head_file.read_text(encoding="utf-8", errors="replace").strip()
+        if head.startswith("ref:"):
+            ref = head.split(":", 1)[1].strip()
+            # Loose refs may live in the worktree gitdir OR the common dir
+            # (branches created via `git worktree add` typically live in the
+            # common dir's refs/heads/).
+            for candidate in (git_dir, common_dir):
+                ref_file = candidate / ref
+                if ref_file.exists():
+                    return f"git:{ref}:{ref_file.read_text(encoding='utf-8', errors='replace').strip()}"
+            packed_sha = _read_packed_ref(common_dir, ref)
+            if packed_sha:
+                return f"git:{ref}:{packed_sha}"
+            # Ref name is known but unresolved — still stable across launches,
+            # but it carries no revision, so this value alone cannot signal a
+            # changed checkout. Any version/release-based invalidation after
+            # `hermes update` is the caller's responsibility.
+            return f"git:{ref}:unresolved"
+        # HEAD does not name a ref: fingerprint its raw contents.
+        return f"git:HEAD:{head}"
+    except OSError:
+        return None
+
+
+def _termux_bundled_skills_fingerprint() -> str:
+    """Cheap invalidation key for Termux bundled-skill startup sync.
+
+    Returns the git fingerprint of ``PROJECT_ROOT`` when it resolves to a
+    concrete revision. When the git ref could not be resolved, the
+    fingerprint has no revision in it, so the package version and release
+    date are appended — otherwise the key would stay identical across
+    `hermes update` and the sync would be skipped forever. Without any
+    git metadata, the key is built from version, release date and the
+    ``skills`` directory's stat (or ``missing`` if it cannot be stat'ed).
+    """
+    release_key = f"{__version__}:{__release_date__}"
+    git_fp = _read_git_revision_fingerprint(PROJECT_ROOT)
+    if git_fp:
+        if git_fp.endswith(":unresolved"):
+            # No sha available: let a version bump invalidate the stamp.
+            return f"{git_fp}:{release_key}"
+        return git_fp
+    skills_dir = PROJECT_ROOT / "skills"
+    try:
+        stat = skills_dir.stat()
+        return f"skills:{release_key}:{stat.st_mtime_ns}:{stat.st_size}"
+    except OSError:
+        return f"skills:{release_key}:missing"
+
+
+def _termux_bundled_skills_stamp_path() -> Path:
+    """Location of the stamp file recording the last synced fingerprint."""
+    return get_hermes_home().joinpath("skills", ".termux_bundled_sync_stamp")
+
+
+def _termux_bundled_skills_sync_needed() -> bool:
+    """Decide whether the bundled-skill sync must run at startup.
+
+    Always ``True`` outside Termux or when
+    ``HERMES_TERMUX_FORCE_SKILLS_SYNC`` is ``"1"``. Otherwise ``True``
+    only if the stamp file is unreadable or differs from the current
+    fingerprint.
+    """
+    if (
+        not _is_termux_startup_environment()
+        or os.environ.get("HERMES_TERMUX_FORCE_SKILLS_SYNC") == "1"
+    ):
+        return True
+    try:
+        recorded = _termux_bundled_skills_stamp_path().read_text(encoding="utf-8").strip()
+        return recorded != _termux_bundled_skills_fingerprint()
+    except OSError:
+        # Missing or unreadable stamp: treat as never synced.
+        return True
+
+
+def _mark_termux_bundled_skills_synced() -> None:
+    """Record the current fingerprint in the stamp file (Termux only).
+
+    Best-effort: an ``OSError`` while creating the directory or writing
+    the stamp is ignored.
+    """
+    if _is_termux_startup_environment():
+        try:
+            marker = _termux_bundled_skills_stamp_path()
+            marker.parent.mkdir(parents=True, exist_ok=True)
+            marker.write_text(_termux_bundled_skills_fingerprint() + "\n", encoding="utf-8")
+        except OSError:
+            pass
+
+
+def _sync_bundled_skills_for_startup() -> bool:
+    """Run the bundled-skill sync unless a Termux checkout is unchanged.
+
+    Hashing every bundled skill is safe but expensive on older Android
+    storage. The git/ref stamp keeps post-update correctness: a changed
+    checkout revision forces one real sync, then later starts skip it.
+
+    Returns:
+        ``True`` if a sync ran, ``False`` if it was skipped.
+    """
+    skip_sync = (
+        _is_termux_startup_environment()
+        and not _termux_bundled_skills_sync_needed()
+    )
+    if skip_sync:
+        return False
+
+    # Deferred import: only needed when a sync actually runs.
+    from tools.skills_sync import sync_skills
+
+    sync_skills(quiet=True)
+    _mark_termux_bundled_skills_synced()
+    return True
+
+
+def _termux_should_prefetch_update_check() -> bool:
+    """Whether to prefetch the update check at startup.
+
+    Always ``True`` outside Termux; inside Termux only when
+    ``HERMES_TERMUX_PREFETCH_UPDATES`` is ``"1"``.
+    """
+    return (
+        not _is_termux_startup_environment()
+        or os.environ.get("HERMES_TERMUX_PREFETCH_UPDATES") == "1"
+    )
+
+
 def _relative_time(ts) -> str:
    """Format a timestamp as relative time (e.g., '2h ago', 'yesterday')."""
    if not ts:
@@ -464,7 +591,7 @@ def _session_browse_picker(sessions: list) -> Optional[str]:
                curses.init_pair(1, curses.COLOR_GREEN, -1)  # selected
                curses.init_pair(2, curses.COLOR_YELLOW, -1)  # header
                curses.init_pair(3, curses.COLOR_CYAN, -1)  # search
-                curses.init_pair(4, 8, -1)  # dim
+                curses.init_pair(4, 8 if curses.COLORS > 8 else curses.COLOR_WHITE, -1)  # dim

            cursor = 0
            scroll_offset = 0
@@ -1146,13 +1273,13 @@ def _make_tui_argv(tui_dir: Path, tui_dev: bool) -> tuple[list[str], Path]:
            p = Path(ext_dir)
            if (p / "dist" / "entry.js").is_file():
                node = _node_bin("node")
-                return [node, str(p / "dist" / "entry.js")], p
+                return [node, "--expose-gc", str(p / "dist" / "entry.js")], p

        # 1b. Bundled in wheel (pip install)
        bundled = _find_bundled_tui()
        if bundled is not None:
            node = _node_bin("node")
-            return [node, str(bundled)], bundled.parent
+            return [node, "--expose-gc", str(bundled)], bundled.parent

    # 2. Normal flow: npm install if needed, always esbuild, then node dist/entry.js.
    #    --dev flow: npm install if needed, then tsx src/entry.tsx.
@@ -1229,7 +1356,7 @@ def _make_tui_argv(tui_dir: Path, tui_dev: bool) -> tuple[list[str], Path]:
            sys.exit(1)

    node = _node_bin("node")
-    return [node, str(tui_dir / "dist" / "entry.js")], tui_dir
+    return [node, "--expose-gc", str(tui_dir / "dist" / "entry.js")], tui_dir


 def _normalize_tui_toolsets(toolsets: object) -> list[str]:
@@ -1351,16 +1478,16 @@ def _launch_tui(
        env["HERMES_TUI_TOOL_PROGRESS"] = "off"
    if accept_hooks:
        env["HERMES_ACCEPT_HOOKS"] = "1"
-    # Guarantee an 8GB V8 heap + exposed GC for the TUI. Default node cap is
-    # ~1.5–4GB depending on version and can fatal-OOM on long sessions with
-    # large transcripts / reasoning blobs. Token-level merge: respect any
-    # user-supplied --max-old-space-size (they may have set it higher) and
-    # avoid duplicating --expose-gc.
+    # Guarantee an 8GB V8 heap for the TUI. Default node cap is ~1.5–4GB
+    # depending on version and can fatal-OOM on long sessions with large
+    # transcripts / reasoning blobs. Token-level merge: respect any
+    # user-supplied --max-old-space-size (they may have set it higher).
+    # --expose-gc is *not* added here: Node rejects it in NODE_OPTIONS
+    # ("--expose-gc is not allowed in NODE_OPTIONS") and refuses to start.
+    # It is passed as a direct argv flag in _make_tui_argv() instead.
    _tokens = env.get("NODE_OPTIONS", "").split()
    if not any(t.startswith("--max-old-space-size=") for t in _tokens):
        _tokens.append("--max-old-space-size=8192")
-    if "--expose-gc" not in _tokens:
-        _tokens.append("--expose-gc")
    env["NODE_OPTIONS"] = " ".join(_tokens)
    # HERMES_TUI_RESUME is an internal hand-off from the Python wrapper to the
    # Ink app.  Because we start from os.environ.copy(), an exported/stale value
@@ -1523,19 +1650,20 @@ def cmd_chat(args):
        print("You can run 'hermes setup' at any time to configure.")
        sys.exit(1)

-    # Start update check in background (runs while other init happens)
-    try:
-        from hermes_cli.banner import prefetch_update_check
+    # Start update check in background (runs while other init happens).
+    # On Termux this imports rich/prompt_toolkit in the foreground and then
+    # competes for CPU on single-core devices, so keep it opt-in there.
+    if _termux_should_prefetch_update_check():
+        try:
+            from hermes_cli.banner import prefetch_update_check

-        prefetch_update_check()
-    except Exception:
-        pass
+            prefetch_update_check()
+        except Exception:
+            pass

    # Sync bundled skills on every CLI launch (fast -- skips unchanged skills)
    try:
-        from tools.skills_sync import sync_skills
-
-        sync_skills(quiet=True)
+        _sync_bundled_skills_for_startup()
    except Exception:
        pass

@@ -2305,6 +2433,9 @@ _AUX_TASKS: list[tuple[str, str, str]] = [
    ("mcp", "MCP", "MCP tool reasoning"),
    ("title_generation", "Title generation", "session titles"),
    ("skills_hub", "Skills hub", "skills search/install"),
+    ("triage_specifier", "Triage specifier", "kanban spec fleshing"),
+    ("kanban_decomposer", "Kanban decomposer", "task decomposition"),
+    ("profile_describer", "Profile describer", "auto profile descriptions"),
    ("curator", "Curator", "skill-usage review pass"),
 ]

@@ -4534,7 +4665,9 @@ def _model_flow_copilot(config, current_model=""):
        source = creds.get("source", "")
    else:
        if source in {"GITHUB_TOKEN", "GH_TOKEN"}:
-            print(f"  GitHub token: {api_key[:8]}... ✓ ({source})")
+            from hermes_cli.env_loader import format_secret_source_suffix
+            bw_suffix = format_secret_source_suffix(source)
+            print(f"  GitHub token: {api_key[:8]}... ✓ ({source}{bw_suffix})")
        elif source == "gh auth token":
            print("  GitHub token: ✓ (from `gh auth token`)")
        else:
@@ -4791,7 +4924,10 @@ def _prompt_api_key(pconfig, existing_key: str, provider_id: str = "") -> tuple:
        return new_key, False

    # Already configured — offer K / R / C ────────────────────────────────
-    print(f"  {pconfig.name} API key: {existing_key[:8]}... ✓")
+    from hermes_cli.env_loader import format_secret_source_suffix
+
+    source_suffix = format_secret_source_suffix(key_env) if key_env else ""
+    print(f"  {pconfig.name} API key: {existing_key[:8]}... ✓{source_suffix}")
    if not key_env:
        # Nothing we can rewrite; just acknowledge and move on.
        print()
@@ -5074,7 +5210,9 @@ def _model_flow_bedrock_api_key(config, region, current_model=""):
    # Prompt for API key
    existing_key = get_env_value("AWS_BEARER_TOKEN_BEDROCK") or ""
    if existing_key:
-        print(f"  Bedrock API Key: {existing_key[:12]}... ✓")
+        from hermes_cli.env_loader import format_secret_source_suffix
+        source_suffix = format_secret_source_suffix("AWS_BEARER_TOKEN_BEDROCK")
+        print(f"  Bedrock API Key: {existing_key[:12]}... ✓{source_suffix}")
    else:
        print(f"  Endpoint: {mantle_base_url}")
        print()
@@ -5745,7 +5883,22 @@ def _model_flow_anthropic(config, current_model=""):
    if has_creds:
        # Show what we found
        if existing_key:
-            print(f"  Anthropic credentials: {existing_key[:12]}... ✓")
+            from hermes_cli.env_loader import format_secret_source_suffix
+            from hermes_cli.auth import PROVIDER_REGISTRY
+
+            # Surface which env var supplied the key so users with
+            # Bitwarden see "(from Bitwarden)" — without this, a detected
+            # BSM key looks identical to a key in .env and users assume
+            # nothing is wired up.
+            source_suffix = ""
+            for var in PROVIDER_REGISTRY["anthropic"].api_key_env_vars:
+                if os.getenv(var, "").strip() == existing_key:
+                    source_suffix = format_secret_source_suffix(var)
+                    if source_suffix:
+                        break
+            print(
+                f"  Anthropic credentials: {existing_key[:12]}... ✓{source_suffix}"
+            )
        elif cc_available:
            print("  Claude Code credentials: ✓ (auto-detected)")
        print()
@@ -5971,8 +6124,7 @@ def cmd_import(args):
    run_import(args)


-def cmd_version(args):
-    """Show version."""
+def _print_version_info(*, check_updates: bool = True) -> None:
    print(f"Hermes Agent v{__version__} ({__release_date__})")
    print(f"Project: {PROJECT_ROOT}")

@@ -5992,6 +6144,9 @@ def cmd_version(args):
    except ImportError:
        print("OpenAI SDK: Not installed")

+    if not check_updates:
+        return
+
    # Show update status (synchronous — acceptable since user asked for version info)
    try:
        from hermes_cli.banner import check_for_updates
@@ -6010,6 +6165,11 @@ def cmd_version(args):
        pass


+def cmd_version(args):
+    """Show version information, including the update status check.
+
+    ``args`` is accepted for the command-handler signature but is not used.
+    """
+    _print_version_info(check_updates=True)
+
+
 def cmd_uninstall(args):
    """Uninstall Hermes Agent."""
    _require_tty("uninstall")
@@ -6086,24 +6246,36 @@ def _validate_critical_files_syntax(root) -> tuple[bool, str | None, str | None]
    them after a successful ``git pull`` so we can auto-roll-back instead of
    leaving the user with a bricked install.

+    The compiled ``.pyc`` is written to a temp directory rather than the
+    source tree's ``__pycache__/`` so we don't race with concurrent test
+    workers that walk the same dir, and so we don't leave a stale pyc
+    behind in production if the next interpreter run picks a different
+    Python version. The pyc is discarded on function return either way —
+    we only care about the compile-or-not signal.
+
    Returns ``(ok, failing_path, error_message)``. ``ok=True`` means every
    file parsed cleanly.
    """
    import py_compile
+    import tempfile

    root = Path(root)
-    for relpath in _UPDATE_CRITICAL_FILES:
-        path = root / relpath
-        if not path.exists():
-            # Missing file is suspicious but not necessarily fatal — a future
-            # refactor may legitimately remove one of these. Skip and move on.
-            continue
-        try:
-            py_compile.compile(str(path), doraise=True)
-        except py_compile.PyCompileError as exc:
-            return False, str(path), str(exc)
-        except OSError as exc:
-            return False, str(path), f"could not read: {exc}"
+    with tempfile.TemporaryDirectory(prefix="hermes-syntax-check-") as tmpdir:
+        for relpath in _UPDATE_CRITICAL_FILES:
+            path = root / relpath
+            if not path.exists():
+                # Missing file is suspicious but not necessarily fatal — a future
+                # refactor may legitimately remove one of these. Skip and move on.
+                continue
+            # Flatten the relative path into a single file name inside the
+            # tmpdir ("/" becomes "__") so two different files with the same
+            # basename don't collide on the cfile name.
+            cfile = Path(tmpdir) / (relpath.replace("/", "__") + "c")
+            try:
+                py_compile.compile(str(path), cfile=str(cfile), doraise=True)
+            except py_compile.PyCompileError as exc:
+                return False, str(path), str(exc)
+            except OSError as exc:
+                return False, str(path), f"could not read: {exc}"
    return True, None, None


@@ -10413,7 +10585,7 @@ _BUILTIN_SUBCOMMANDS = frozenset(
        "model", "pairing", "plugins", "postinstall", "profile", "proxy",
        "send", "sessions", "setup",
        "skills", "slack", "status", "tools", "uninstall", "update",
-        "version", "webhook", "whatsapp", "chat",
+        "version", "webhook", "whatsapp", "chat", "secrets",
        # Help-ish invocations — plugin commands not being listed in
        # top-level --help is an acceptable trade-off for skipping an
        # expensive eager import of every bundled plugin module.
@@ -10503,6 +10675,137 @@ def _plugin_cli_discovery_needed() -> bool:
    return True


+# Top-level commands for which _prepare_agent_startup() always runs
+# discovery.  ``None`` is the value of ``args.command`` when no subcommand
+# was given on the command line.
+_AGENT_COMMANDS = {None, "chat", "acp", "rl"}
+# Command groups that mix admin/CRUD entries with agent-running ones.
+# Maps the top-level command to ``(attribute on args that holds the nested
+# subcommand, nested subcommands that run an agent)``; the attribute name
+# varies by parser.
+_AGENT_SUBCOMMANDS = {
+    "cron": ("cron_command", {"run", "tick"}),
+    "gateway": ("gateway_command", {"run"}),
+    "mcp": ("mcp_action", {"serve"}),
+}
+
+
+def _prepare_agent_startup(args) -> None:
+    """Run plugin, MCP-tool and shell-hook setup for agent-running commands.
+
+    Nothing happens unless ``args.command`` is listed in ``_AGENT_COMMANDS``
+    or its nested subcommand (looked up through ``_AGENT_SUBCOMMANDS``) is
+    one of the agent-running ones.  The three steps are independent: a
+    failure in one is logged and the next one still runs.
+    """
+    command = args.command
+    nested_attr, nested_values = _AGENT_SUBCOMMANDS.get(command, (None, None))
+    runs_agent = command in _AGENT_COMMANDS
+    if not runs_agent and nested_attr:
+        runs_agent = getattr(args, nested_attr, None) in nested_values
+    if not runs_agent:
+        return
+
+    hooks_accepted = bool(getattr(args, "accept_hooks", False))
+
+    # Step 1: plugins.  A failure here is logged at warning level.
+    try:
+        from hermes_cli.plugins import discover_plugins
+
+        discover_plugins()
+    except Exception:
+        logger.warning("plugin discovery failed at CLI startup", exc_info=True)
+
+    # Step 2: MCP tool discovery — no event loop running in CLI/TUI startup,
+    # so inline is safe.  Moved here from model_tools.py module scope to
+    # avoid freezing the gateway's event loop on its first message via the
+    # same lazy import path (#16856).
+    try:
+        from tools.mcp_tool import discover_mcp_tools
+
+        discover_mcp_tools()
+    except Exception:
+        logger.debug("MCP tool discovery failed at CLI startup", exc_info=True)
+
+    # Step 3: shell hooks declared in the loaded config.
+    try:
+        from hermes_cli.config import load_config
+        from agent.shell_hooks import register_from_config
+
+        register_from_config(load_config(), accept_hooks=hooks_accepted)
+    except Exception:
+        logger.debug("shell-hook registration failed at CLI startup", exc_info=True)
+
+
+def _set_chat_arg_defaults(args) -> None:
+    """Give ``args`` every chat attribute it lacks; existing ones are kept."""
+    fallbacks = {
+        "query": None,
+        "model": None,
+        "provider": None,
+        "toolsets": None,
+        "verbose": False,
+        "resume": None,
+        "continue_last": None,
+        "worktree": False,
+    }
+    for name, value in fallbacks.items():
+        if hasattr(args, name):
+            continue
+        setattr(args, name, value)
+
+
+def _is_termux_fast_version_argv(argv: list[str]) -> bool:
+    """Return True when ``argv`` is exactly one token: ``--version``, ``-V`` or ``version``."""
+    version_only_invocations = (["--version"], ["-V"], ["version"])
+    return any(argv == candidate for candidate in version_only_invocations)
+
+
+def _try_termux_fast_cli_launch() -> bool:
+    """Run obvious Termux non-TUI chat/oneshot/version paths on a light parser.
+
+    Returns ``True`` when the invocation was fully handled here (version
+    info printed, or ``cmd_chat`` ran), and ``False`` when the caller should
+    continue with the regular startup path.  The oneshot path does not
+    return at all: it ends the process via ``sys.exit`` with the value
+    returned by ``run_oneshot``.
+
+    The fast path is declined when not in the Termux startup environment,
+    when ``HERMES_TERMUX_DISABLE_FAST_CLI`` is ``"1"``, when help is
+    requested, when the TUI is requested (``HERMES_TUI=1`` or ``--tui``), or
+    when the first positional argument is neither absent nor ``chat`` and no
+    oneshot flag is present.
+    """
+    if not _is_termux_startup_environment():
+        return False
+    if os.environ.get("HERMES_TERMUX_DISABLE_FAST_CLI") == "1":
+        return False
+
+    argv = sys.argv[1:]
+    # Help output is left to the regular startup path.
+    if "-h" in argv or "--help" in argv:
+        return False
+    # TUI invocations are not handled by this function.
+    if os.environ.get("HERMES_TUI") == "1" or "--tui" in argv:
+        return False
+
+    # Bare version request: answer without building any parser and without
+    # the update check.
+    if _is_termux_fast_version_argv(argv):
+        _print_version_info(check_updates=False)
+        return True
+
+    # NOTE(review): _first_positional_argv() is defined elsewhere; presumably
+    # it returns the first non-option token of sys.argv, or None — confirm.
+    first = _first_positional_argv()
+    has_oneshot = any(
+        arg == "-z" or arg == "--oneshot" or arg.startswith("--oneshot=")
+        for arg in argv
+    )
+    # Anything that is neither a oneshot call nor a (possibly implicit) chat
+    # invocation goes through the regular startup path.
+    if not has_oneshot and first not in {None, "chat"}:
+        return False
+
+    from hermes_cli._parser import build_top_level_parser
+
+    parser, _subparsers, chat_parser = build_top_level_parser()
+    chat_parser.set_defaults(func=cmd_chat)
+    args = parser.parse_args(_coalesce_session_name_args(argv))
+
+    # Version flag combined with other arguments (the exact single-token
+    # forms were already handled above).
+    if getattr(args, "version", False):
+        _print_version_info(check_updates=False)
+        return True
+
+    if getattr(args, "oneshot", None):
+        _prepare_agent_startup(args)
+        from hermes_cli.oneshot import run_oneshot
+
+        # Terminates the process; nothing below runs for oneshot calls.
+        sys.exit(
+            run_oneshot(
+                args.oneshot,
+                model=getattr(args, "model", None),
+                provider=getattr(args, "provider", None),
+                toolsets=getattr(args, "toolsets", None),
+            )
+        )
+
+    # A resume/continue request without an explicit subcommand means chat.
+    if (args.resume or args.continue_last) and args.command is None:
+        args.command = "chat"
+
+    if args.command in {None, "chat"}:
+        _set_chat_arg_defaults(args)
+        _prepare_agent_startup(args)
+        cmd_chat(args)
+        return True
+
+    # Parsed to some other command: let the caller's regular path handle it.
+    return False
+
+
 def _try_termux_fast_tui_launch() -> bool:
    """Launch obvious Termux TUI invocations before building every subparser.

@@ -10563,6 +10866,8 @@ def main():

    if _try_termux_fast_tui_launch():
        return
+    if _try_termux_fast_cli_launch():
+        return

    from hermes_cli._parser import build_top_level_parser

@@ -10660,6 +10965,42 @@ def main():
    )
    fallback_parser.set_defaults(func=cmd_fallback)

+    # =========================================================================
+    # secrets command — external secret managers (currently: Bitwarden)
+    # =========================================================================
+    secrets_parser = subparsers.add_parser(
+        "secrets",
+        help="Manage external secret sources (Bitwarden Secrets Manager)",
+        description=(
+            "Pull API keys from an external secret manager at process startup "
+            "instead of storing them in ~/.hermes/.env.  Currently supports "
+            "Bitwarden Secrets Manager.  See: "
+            "https://hermes-agent.nousresearch.com/docs/user-guide/secrets/bitwarden"
+        ),
+    )
+    secrets_subparsers = secrets_parser.add_subparsers(dest="secrets_command")
+
+    secrets_bw = secrets_subparsers.add_parser(
+        "bitwarden",
+        aliases=["bw"],
+        help="Bitwarden Secrets Manager integration",
+    )
+
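+    # Function-scope import: keeps secrets_cli out of module import time.  Note
+    # that it still runs whenever main() reaches this parser-building code, not
+    # only when the ``secrets`` subcommand is actually invoked.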
+    from hermes_cli import secrets_cli as _secrets_cli
+
+    _secrets_cli.register_cli(secrets_bw)
+
+    def _dispatch_secrets(args):  # noqa: ANN001
+        """Default handler for ``hermes secrets``: forward or print help.
+
+        Forwards to ``args.func`` when a ``bitwarden``/``bw`` subcommand was
+        selected; otherwise prints the ``secrets`` help text and returns 0.
+        """
+        sub = getattr(args, "secrets_command", None)
+        bw_sub = getattr(args, "secrets_bw_command", None)
+        if sub in ("bitwarden", "bw") and bw_sub is not None:
+            # NOTE(review): this function is itself installed as the ``func``
+            # default of ``secrets_parser``.  If a bitwarden subcommand ever
+            # fails to override ``func``, ``args.func`` would presumably still
+            # be this function and the call below would recurse — confirm that
+            # every subcommand registered by ``register_cli`` sets ``func``.
+            return args.func(args)
+        secrets_parser.print_help()
+        return 0
+
+    secrets_parser.set_defaults(func=_dispatch_secrets)
+
    # =========================================================================
    # migrate command
    # =========================================================================
@@ -13325,51 +13666,7 @@ Examples:
    # so introspection/management commands (hermes hooks list, cron
    # list, gateway status, mcp add, ...) don't pay discovery cost or
    # trigger consent prompts for hooks the user is still inspecting.
-    # Groups with mixed admin/CRUD vs. agent-running entries narrow via
-    # the nested subcommand (dest varies by parser).
-    _AGENT_COMMANDS = {None, "chat", "acp", "rl"}
-    _AGENT_SUBCOMMANDS = {
-        "cron": ("cron_command", {"run", "tick"}),
-        "gateway": ("gateway_command", {"run"}),
-        "mcp": ("mcp_action", {"serve"}),
-    }
-    _sub_attr, _sub_set = _AGENT_SUBCOMMANDS.get(args.command, (None, None))
-    if args.command in _AGENT_COMMANDS or (
-        _sub_attr and getattr(args, _sub_attr, None) in _sub_set
-    ):
-        _accept_hooks = bool(getattr(args, "accept_hooks", False))
-        try:
-            from hermes_cli.plugins import discover_plugins
-
-            discover_plugins()
-        except Exception:
-            logger.warning(
-                "plugin discovery failed at CLI startup",
-                exc_info=True,
-            )
-        try:
-            # MCP tool discovery — no event loop running in CLI/TUI startup,
-            # so inline is safe.  Moved here from model_tools.py module scope
-            # to avoid freezing the gateway's event loop on its first message
-            # via the same lazy import path (#16856).
-            from tools.mcp_tool import discover_mcp_tools
-
-            discover_mcp_tools()
-        except Exception:
-            logger.debug(
-                "MCP tool discovery failed at CLI startup",
-                exc_info=True,
-            )
-        try:
-            from hermes_cli.config import load_config
-            from agent.shell_hooks import register_from_config
-
-            register_from_config(load_config(), accept_hooks=_accept_hooks)
-        except Exception:
-            logger.debug(
-                "shell-hook registration failed at CLI startup",
-                exc_info=True,
-            )
+    _prepare_agent_startup(args)

    # Handle top-level --oneshot / -z: single-shot mode, stdout = final
    # response only, nothing else. Bypasses cli.py entirely.
@@ -74,8 +74,12 @@ class NousSubscriptionFeatures:
    def modal(self) -> NousFeatureState:
        return self.features["modal"]

+    @property
+    def app_tools(self) -> NousFeatureState:
+        """Feature state stored under the ``"app_tools"`` key of ``self.features``."""
+        return self.features["app_tools"]
+
    def items(self) -> Iterable[NousFeatureState]:
-        ordered = ("web", "image_gen", "tts", "browser", "modal")
+        ordered = ("web", "image_gen", "tts", "browser", "modal", "app_tools")
        for key in ordered:
            yield self.features[key]

@@ -225,6 +229,22 @@ def _resolve_browser_feature_state(
    return "local", available, active, False


+def _read_portal_app_tools_enabled(config: Optional[Dict[str, object]] = None) -> bool:
+    """Report whether the ``portal.app_tools`` flag is on.
+
+    Without ``config`` the decision is delegated to
+    ``tools.tool_backend_helpers.portal_app_tools_enabled``.  With a
+    pre-loaded ``config`` the ``PORTAL_APP_TOOLS`` environment variable wins
+    when it is set; otherwise ``config["portal"]["app_tools"]`` is used, and
+    the flag counts as on when the section or the key is missing.
+    """
+    if config is None:
+        from tools.tool_backend_helpers import portal_app_tools_enabled
+        return portal_app_tools_enabled()
+
+    # Fast path: the caller already holds a config snapshot.
+    import os
+    override = os.getenv("PORTAL_APP_TOOLS")
+    if override is not None:
+        return is_truthy_value(override)
+    portal_section = config.get("portal")
+    if not isinstance(portal_section, dict):
+        return True
+    return bool(portal_section.get("app_tools", True))
+
+
 def get_nous_subscription_features(
    config: Optional[Dict[str, object]] = None,
 ) -> NousSubscriptionFeatures:
@@ -313,6 +333,8 @@ def get_nous_subscription_features(
    managed_tts_available = managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("openai-audio")
    managed_browser_available = managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("browser-use")
    managed_modal_available = managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("modal")
+    app_gw_ready = bool(managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("tools"))
+    app_config_on = _read_portal_app_tools_enabled(config)
    modal_state = resolve_modal_backend_state(
        modal_mode,
        has_direct=direct_modal,
@@ -476,6 +498,17 @@ def get_nous_subscription_features(
            current_provider="Modal" if terminal_backend == "modal" else terminal_backend or "local",
            explicit_configured=terminal_backend == "modal",
        ),
+        "app_tools": NousFeatureState(
+            key="app_tools",
+            label="App tools (500+ apps)",
+            included_by_default=True,
+            available=app_gw_ready,
+            active=app_gw_ready and app_config_on,
+            managed_by_nous=app_gw_ready and app_config_on,
+            direct_override=False,
+            toolset_enabled=app_config_on,
+            current_provider="Nous Tool Gateway",
+        ),
    }

    return NousSubscriptionFeatures(
@@ -1051,7 +1051,7 @@ def _run_composite_ui(curses, plugin_names, plugin_labels, plugin_selected,
            curses.init_pair(1, curses.COLOR_GREEN, -1)
            curses.init_pair(2, curses.COLOR_YELLOW, -1)
            curses.init_pair(3, curses.COLOR_CYAN, -1)
-            curses.init_pair(4, 8, -1)  # dim gray
+            curses.init_pair(4, 8 if curses.COLORS > 8 else curses.COLOR_WHITE, -1)  # dim gray
        cursor = 0
        scroll_offset = 0

@@ -1196,7 +1196,7 @@ def _run_composite_ui(curses, plugin_names, plugin_labels, plugin_selected,
                            curses.init_pair(1, curses.COLOR_GREEN, -1)
                            curses.init_pair(2, curses.COLOR_YELLOW, -1)
                            curses.init_pair(3, curses.COLOR_CYAN, -1)
-                            curses.init_pair(4, 8, -1)
+                            curses.init_pair(4, 8 if curses.COLORS > 8 else curses.COLOR_WHITE, -1)
                        curses.curs_set(0)
            elif key in {curses.KEY_ENTER, 10, 13}:
                if cursor < n_plugins:
@@ -1228,7 +1228,7 @@ def _run_composite_ui(curses, plugin_names, plugin_labels, plugin_selected,
                            curses.init_pair(1, curses.COLOR_GREEN, -1)
                            curses.init_pair(2, curses.COLOR_YELLOW, -1)
                            curses.init_pair(3, curses.COLOR_CYAN, -1)
-                            curses.init_pair(4, 8, -1)
+                            curses.init_pair(4, 8 if curses.COLORS > 8 else curses.COLOR_WHITE, -1)
                        curses.curs_set(0)
            elif key in {27, ord("q")}:
                # Save plugin changes on exit
@@ -35,6 +35,7 @@ from pathlib import Path
 from typing import Optional

 from hermes_cli import profiles as profiles_mod
+from agent.skill_utils import is_excluded_skill_path

 logger = logging.getLogger(__name__)

@@ -109,8 +110,7 @@ def _collect_skills(profile_dir: Path) -> list[str]:
        return []
    names: list[str] = []
    for md in skills_dir.rglob("SKILL.md"):
-        path_str = str(md)
-        if "/.hub/" in path_str or "/.git/" in path_str:
+        if is_excluded_skill_path(md):
            continue
        try:
            rel = md.relative_to(skills_dir)
@@ -201,7 +201,7 @@ def describe_profile(
    skill_list = "\n".join(f"  - {n}" for n in skill_names) or "  (no skills installed)"
    skill_count = sum(
        1 for _ in (profile_dir / "skills").rglob("SKILL.md")
-        if "/.hub/" not in str(_) and "/.git/" not in str(_)
+        if not is_excluded_skill_path(_)
    ) if (profile_dir / "skills").is_dir() else 0

    # Read model + provider from the profile's config.
@@ -70,6 +70,8 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple

+from agent.skill_utils import is_excluded_skill_path
+

 # ---------------------------------------------------------------------------
 # Constants
@@ -463,7 +465,9 @@ def _count_skills(staged: Path) -> int:
    skills_dir = staged / "skills"
    if not skills_dir.is_dir():
        return 0
-    return sum(1 for _ in skills_dir.rglob("SKILL.md"))
+    return sum(
+        1 for p in skills_dir.rglob("SKILL.md") if not is_excluded_skill_path(p)
+    )


 def plan_install(
@@ -30,6 +30,8 @@ from dataclasses import dataclass
 from pathlib import Path, PurePosixPath, PureWindowsPath
 from typing import List, Optional

+from agent.skill_utils import is_excluded_skill_path
+
 _PROFILE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,63}$")

 # Directories bootstrapped inside every new profile
@@ -485,8 +487,9 @@ def _count_skills(profile_dir: Path) -> int:
        return 0
    count = 0
    for md in skills_dir.rglob("SKILL.md"):
-        if "/.hub/" not in str(md) and "/.git/" not in str(md):
-            count += 1
+        if is_excluded_skill_path(md):
+            continue
+        count += 1
    return count


@@ -902,7 +905,49 @@ def delete_profile(name: str, yes: bool = False) -> Path:

    # 4. Remove profile directory
    try:
-        shutil.rmtree(profile_dir)
+        def _make_writable(func, path, exc):
+            """onexc/onerror handler: add +w on PermissionError so rmtree can proceed.
+
+            Handles two cases on NixOS (and other systems with read-only
+            copies from immutable stores):
+            1. The path itself isn't writable (e.g. a file with mode 0444)
+            2. The *parent* directory isn't writable (e.g. mode 0555)
+
+            Compatible with both the ``onexc`` API (3.12+, receives an
+            exception instance) and the ``onerror`` API (3.11-, receives
+            ``sys.exc_info()`` tuple).
+
+            Args:
+                func: The function rmtree was calling when it failed.
+                path: The path that was passed to ``func``.
+                exc: The exception instance (``onexc``) or the
+                    ``sys.exc_info()`` tuple (``onerror``).
+
+            Raises:
+                The original exception when it is not a ``PermissionError``
+                or when ``func`` cannot be retried with ``path`` alone;
+                otherwise whatever the retried ``func(path)`` raises.
+            """
+            import stat as _stat
+
+            # Normalise the two callback signatures:
+            #   onexc(func, path, exc_instance)   — 3.12+
+            #   onerror(func, path, exc_info_tuple) — 3.11
+            if isinstance(exc, tuple):
+                exc = exc[1]  # exc_info → actual exception object
+
+            if not isinstance(exc, PermissionError):
+                # Re-raise the object explicitly: a bare ``raise`` only works
+                # while the handler is invoked from inside an ``except`` block.
+                raise exc
+
+            # Make the path writable
+            try:
+                os.chmod(path, os.stat(path).st_mode | _stat.S_IWUSR)
+            except OSError:
+                pass
+            # Also make the parent writable (needed for unlink/rmdir)
+            parent = os.path.dirname(path)
+            if parent:
+                try:
+                    os.chmod(parent, os.stat(parent).st_mode | _stat.S_IWUSR)
+                except OSError:
+                    pass
+            if func is os.open:
+                # os.open() requires a ``flags`` argument, so ``func(path)``
+                # would raise TypeError and hide the real permission error.
+                raise exc
+            func(path)
+
+        # ``onexc`` was added in 3.12; fall back to ``onerror`` on 3.11.
+        try:
+            shutil.rmtree(profile_dir, onexc=_make_writable)
+        except TypeError:
+            shutil.rmtree(profile_dir, onerror=_make_writable)
        print(f"✓ Removed {profile_dir}")
    except Exception as e:
        print(f"⚠ Could not remove {profile_dir}: {e}")
@@ -528,6 +528,9 @@ def _get_named_custom_provider(requested_provider: str) -> Optional[Dict[str, An
                        "api_key": resolved_api_key,
                        "model": entry.get("default_model", ""),
                    }
+                    extra_body = entry.get("extra_body")
+                    if isinstance(extra_body, dict):
+                        result["extra_body"] = dict(extra_body)
                    # The v11→v12 migration writes the API mode under the new
                    # ``transport`` field, but hand-edited configs may still
                    # use the legacy ``api_mode`` spelling.  Accept both —
@@ -553,6 +556,9 @@ def _get_named_custom_provider(requested_provider: str) -> Optional[Dict[str, An
                            "api_key": resolved_api_key,
                            "model": entry.get("default_model", ""),
                        }
+                        extra_body = entry.get("extra_body")
+                        if isinstance(extra_body, dict):
+                            result["extra_body"] = dict(extra_body)
                        api_mode = _parse_api_mode(entry.get("api_mode") or entry.get("transport"))
                        if api_mode:
                            result["api_mode"] = api_mode
@@ -596,6 +602,9 @@ def _get_named_custom_provider(requested_provider: str) -> Optional[Dict[str, An
            result["key_env"] = key_env
        if provider_key:
            result["provider_key"] = provider_key
+        extra_body = entry.get("extra_body")
+        if isinstance(extra_body, dict):
+            result["extra_body"] = dict(extra_body)
        api_mode = _parse_api_mode(entry.get("api_mode"))
        if api_mode:
            result["api_mode"] = api_mode
@@ -607,6 +616,13 @@ def _get_named_custom_provider(requested_provider: str) -> Optional[Dict[str, An
    return None


+def _custom_provider_request_overrides(custom_provider: Dict[str, Any]) -> Dict[str, Any]:
+    """Build request overrides from a custom provider's ``extra_body``.
+
+    Returns ``{"extra_body": <shallow copy>}`` when ``extra_body`` is a
+    non-empty dict, and an empty dict in every other case.
+    """
+    body = custom_provider.get("extra_body")
+    if isinstance(body, dict) and body:
+        return {"extra_body": dict(body)}
+    return {}
+
+
 def _resolve_named_custom_runtime(
    *,
    requested_provider: str,
@@ -683,6 +699,12 @@ def _resolve_named_custom_runtime(
        model_name = custom_provider.get("model")
        if model_name:
            pool_result["model"] = model_name
+        request_overrides = _custom_provider_request_overrides(custom_provider)
+        if request_overrides:
+            pool_result["request_overrides"] = {
+                **dict(pool_result.get("request_overrides") or {}),
+                **request_overrides,
+            }
        return pool_result

    _cp_is_openai_url   = base_url_host_matches(base_url, "openai.com") or base_url_host_matches(base_url, "openai.azure.com")
@@ -714,6 +736,9 @@ def _resolve_named_custom_runtime(
    # provider name differs from the actual model string the API expects.
    if custom_provider.get("model"):
        result["model"] = custom_provider["model"]
+    request_overrides = _custom_provider_request_overrides(custom_provider)
+    if request_overrides:
+        result["request_overrides"] = request_overrides
    return result


@@ -0,0 +1,445 @@
+"""CLI handlers for ``hermes secrets bitwarden ...``.
+
+Subcommands:
+    setup    — interactive wizard: install bws, prompt for token + project, test fetch
+    status   — show current config + binary version + last fetch outcome
+    sync     — run a fetch right now and show what would be applied (dry-run friendly)
+    disable  — flip ``secrets.bitwarden.enabled`` to False
+    install  — just download the bws binary (no token / project required)
+"""
+
+from __future__ import annotations
+
+import argparse
+import getpass
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+
+from agent.secret_sources import bitwarden as bw
+from hermes_cli.config import (
+    get_env_path,
+    load_config,
+    save_config,
+    save_env_value,
+)
+
+
+# ---------------------------------------------------------------------------
+# Argparse wiring — called from hermes_cli.main
+# ---------------------------------------------------------------------------
+
+
+def register_cli(parent_parser: argparse.ArgumentParser) -> None:
+    """Attach the ``bitwarden`` subcommand tree to a parent parser.
+
+    Called from ``hermes_cli.main`` as part of building the top-level
+    ``hermes secrets`` parser.
+
+    Each subcommand stores its handler in the ``func`` default so the caller
+    can dispatch with ``args.func(args)``; the chosen subcommand name is
+    recorded in ``args.secrets_bw_command``.
+
+    Args:
+        parent_parser: Parser that receives the ``setup``, ``status``,
+            ``sync``, ``disable`` and ``install`` subparsers.
+    """
+    sub = parent_parser.add_subparsers(dest="secrets_bw_command")
+
+    setup = sub.add_parser(
+        "setup",
+        help="Interactive wizard: install bws, store access token, pick project",
+    )
+    setup.add_argument(
+        "--project-id",
+        help="Pre-select a project UUID instead of prompting",
+    )
+    setup.add_argument(
+        "--access-token",
+        help="Provide the access token non-interactively (will be stored in .env)",
+    )
+    setup.set_defaults(func=cmd_setup)
+
+    status = sub.add_parser("status", help="Show config + binary + last fetch")
+    status.set_defaults(func=cmd_status)
+
+    sync = sub.add_parser("sync", help="Fetch secrets now and report what changed")
+    sync.add_argument(
+        "--apply",
+        action="store_true",
+        # The handler only assigns into os.environ of this process; a child
+        # process cannot modify the environment of the shell that launched it.
+        help="Actually export the secrets into this process's env, "
+             "not the parent shell (default: dry-run)",
+    )
+    sync.set_defaults(func=cmd_sync)
+
+    disable = sub.add_parser("disable", help="Turn off the Bitwarden integration")
+    disable.set_defaults(func=cmd_disable)
+
+    install = sub.add_parser(
+        "install",
+        help=f"Download and verify the pinned bws binary (v{bw._BWS_VERSION})",
+    )
+    install.add_argument(
+        "--force",
+        action="store_true",
+        help="Re-download even if a managed copy already exists",
+    )
+    install.set_defaults(func=cmd_install)
+
+
+# ---------------------------------------------------------------------------
+# Handlers
+# ---------------------------------------------------------------------------
+
+
+def cmd_setup(args: argparse.Namespace) -> int:
+    """Run the interactive Bitwarden Secrets Manager setup wizard.
+
+    Steps, in order: locate or install the ``bws`` binary, obtain the access
+    token (from ``--access-token`` or a hidden prompt) and store it in the
+    env file, choose a project (from ``--project-id`` or an interactive
+    list), run an uncached test fetch, then enable the integration in config.
+
+    Note that the token is written to the env file *before* the project
+    listing and test fetch, so it stays stored even if a later step fails.
+
+    Args:
+        args: Parsed CLI namespace; reads ``access_token`` and ``project_id``.
+
+    Returns:
+        ``0`` when the integration was enabled and saved, ``1`` when any step
+        failed (binary install, empty token, project listing, test fetch).
+    """
+    console = Console()
+    console.print(
+        Panel.fit(
+            "[bold]Bitwarden Secrets Manager setup[/bold]\n\n"
+            "Need an access token? In the Bitwarden web app:\n"
+            "  Secrets Manager → Machine accounts → [your account] →\n"
+            "  Access tokens → Create access token\n\n"
+            "Copy the token (starts with [cyan]0.[/cyan]…) — it cannot be retrieved later.",
+            border_style="cyan",
+        )
+    )
+
+    # ------------------------------------------------------------------ binary
+    console.print()
+    console.print("[bold]Step 1[/bold]  Install the bws CLI")
+    try:
+        # Look for an existing binary first; only download when none is found.
+        binary = bw.find_bws(install_if_missing=False)
+        if binary is None:
+            console.print("  No bws on PATH — downloading…")
+            binary = bw.install_bws()
+        version = _bws_version(binary)
+        console.print(f"  [green]✓[/green] {binary}  ({version})")
+    except Exception as exc:  # noqa: BLE001
+        console.print(f"  [red]✗ Could not install bws: {exc}[/red]")
+        console.print(
+            "  Manual install: "
+            "https://github.com/bitwarden/sdk-sm/releases"
+        )
+        return 1
+
+    # ------------------------------------------------------------------- token
+    console.print()
+    console.print("[bold]Step 2[/bold]  Provide your access token")
+    cfg = load_config()
+    # setdefault creates the nested sections in ``cfg`` so the final
+    # save_config(cfg) persists whatever is written to ``secrets_cfg``.
+    secrets_cfg = (cfg.setdefault("secrets", {})
+                     .setdefault("bitwarden", {}))
+    token_env = secrets_cfg.get("access_token_env", "BWS_ACCESS_TOKEN")
+
+    token = (args.access_token or "").strip()
+    if not token:
+        # getpass keeps the pasted token from being echoed to the terminal.
+        token = getpass.getpass(f"  Paste access token ({token_env}): ").strip()
+    if not token:
+        console.print("  [red]Empty token, aborting.[/red]")
+        return 1
+    if not token.startswith("0."):
+        # Advisory only: the wizard continues with the token as given.
+        console.print(
+            "  [yellow]Warning: token doesn't start with '0.' — usually that means "
+            "you pasted something other than a BSM access token.  Continuing anyway.[/yellow]"
+        )
+
+    save_env_value(token_env, token)
+    os.environ[token_env] = token  # so the test fetch below sees it
+    console.print(f"  [green]✓[/green] stored in {get_env_path()} as {token_env}")
+
+    # ------------------------------------------------------------------- project
+    if args.project_id and args.project_id.strip():
+        project_id = args.project_id.strip()
+    else:
+        console.print()
+        console.print("[bold]Step 3[/bold]  Pick a project")
+        project_id = ""
+        projects = _list_projects(binary, token, console)
+        if projects is None:
+            # _list_projects already printed the failure reason.
+            return 1
+        if not projects:
+            console.print("  [yellow]No projects visible to this machine account.[/yellow]")
+            console.print(
+                "  In the Bitwarden web app, open the machine account → Projects tab "
+                "and grant it access to at least one project."
+            )
+            return 1
+
+        table = Table(show_header=True, header_style="bold")
+        table.add_column("#", style="cyan", width=4)
+        table.add_column("Name")
+        table.add_column("ID", style="dim")
+        for i, p in enumerate(projects, 1):
+            table.add_row(str(i), p.get("name", "?"), p.get("id", "?"))
+        console.print(table)
+
+        # Re-prompt until a 1-based index within range is entered.
+        while True:
+            choice = console.input(f"  Select project [1-{len(projects)}]: ").strip()
+            if not choice:
+                continue
+            try:
+                idx = int(choice)
+            except ValueError:
+                console.print("  [red]Enter a number.[/red]")
+                continue
+            if 1 <= idx <= len(projects):
+                project_id = projects[idx - 1]["id"]
+                break
+            console.print(f"  [red]Out of range — pick 1-{len(projects)}.[/red]")
+
+    # ------------------------------------------------------------------- test
+    console.print()
+    # The project step is skipped when --project-id was given, so the test
+    # fetch is numbered 3 in that case and 4 otherwise.
+    step_num = 4 if not (args.project_id and args.project_id.strip()) else 3
+    console.print(f"[bold]Step {step_num}[/bold]  Test fetch")
+    try:
+        secrets, warnings = bw.fetch_bitwarden_secrets(
+            access_token=token,
+            project_id=project_id,
+            binary=binary,
+            use_cache=False,
+        )
+    except Exception as exc:  # noqa: BLE001
+        console.print(f"  [red]✗ Fetch failed: {exc}[/red]")
+        return 1
+
+    if not secrets:
+        console.print("  [yellow]Fetch succeeded but the project has no secrets.[/yellow]")
+    else:
+        # Only secret names and a status are shown; values are never printed.
+        table = Table(show_header=True, header_style="bold")
+        table.add_column("Name", style="cyan")
+        table.add_column("Status")
+        for key in sorted(secrets):
+            if key == token_env:
+                status = "[dim]bootstrap token — never overrides itself[/dim]"
+            elif os.environ.get(key):
+                status = "[yellow]already set in env (will be overwritten)[/yellow]"
+            else:
+                status = "[green]new[/green]"
+            table.add_row(key, status)
+        console.print(table)
+    for w in warnings:
+        console.print(f"  [yellow]warning:[/yellow] {w}")
+
+    # ------------------------------------------------------------------- save
+    # enabled/project_id are always overwritten; the remaining keys are only
+    # filled in when absent so existing user choices are preserved.
+    secrets_cfg["enabled"] = True
+    secrets_cfg["project_id"] = project_id
+    secrets_cfg.setdefault("access_token_env", token_env)
+    secrets_cfg.setdefault("cache_ttl_seconds", 300)
+    secrets_cfg.setdefault("override_existing", True)
+    secrets_cfg.setdefault("auto_install", True)
+    save_config(cfg)
+
+    console.print()
+    console.print(
+        "[green]✓ Bitwarden Secrets Manager is enabled.[/green]  "
+        "Secrets will be pulled at the start of every Hermes process."
+    )
+    console.print(
+        "  Status:  [cyan]hermes secrets bitwarden status[/cyan]\n"
+        "  Refresh: [cyan]hermes secrets bitwarden sync[/cyan]\n"
+        "  Disable: [cyan]hermes secrets bitwarden disable[/cyan]"
+    )
+    return 0
+
+
+def cmd_status(args: argparse.Namespace) -> int:
+    """Print the current Bitwarden integration state.
+
+    Shows the ``secrets.bitwarden`` config values, whether the token env var
+    is set in this process, and the located ``bws`` binary with its version.
+    Follow-up hints are printed when the integration is disabled or is
+    enabled without a token or project id.
+
+    Args:
+        args: Parsed CLI namespace (not read by this handler).
+
+    Returns:
+        Always ``0``; this command only reports.
+    """
+    console = Console()
+    cfg = load_config()
+    bw_cfg = (cfg.get("secrets") or {}).get("bitwarden") or {}
+
+    enabled = bool(bw_cfg.get("enabled"))
+    token_env = bw_cfg.get("access_token_env", "BWS_ACCESS_TOKEN")
+    project_id = bw_cfg.get("project_id", "")
+    token_set = bool(os.environ.get(token_env))
+
+    table = Table(show_header=False, box=None, padding=(0, 2))
+    table.add_column("", style="bold")
+    table.add_column("")
+    table.add_row("Enabled",         _yn(enabled))
+    table.add_row("Token env var",   token_env)
+    table.add_row("Token in env",    _yn(token_set))
+    table.add_row("Project ID",      project_id or "[dim](unset)[/dim]")
+    # Fallback is True to match the value ``setup`` writes for this key, so an
+    # unset key is not reported as "no".
+    table.add_row("Override existing", _yn(bool(bw_cfg.get("override_existing", True))))
+    table.add_row("Cache TTL (s)",   str(bw_cfg.get("cache_ttl_seconds", 300)))
+    table.add_row("Auto-install",    _yn(bool(bw_cfg.get("auto_install", True))))
+
+    # Status must never trigger a download, hence install_if_missing=False.
+    binary = bw.find_bws(install_if_missing=False)
+    if binary:
+        table.add_row("bws binary",  f"{binary} ({_bws_version(binary)})")
+    else:
+        table.add_row("bws binary",  "[yellow]not installed[/yellow]")
+
+    console.print(Panel(table, title="Bitwarden Secrets Manager", border_style="cyan"))
+
+    if not enabled:
+        console.print("\n  Run [cyan]hermes secrets bitwarden setup[/cyan] to enable.")
+        return 0
+    if not token_set:
+        console.print(
+            f"\n  [yellow]Enabled but {token_env} is not set — Hermes will skip BSM "
+            "and warn on next startup.[/yellow]"
+        )
+    if not project_id:
+        console.print(
+            "\n  [yellow]Enabled but no project_id — nothing to fetch.[/yellow]"
+        )
+    return 0
+
+
+def cmd_sync(args: argparse.Namespace) -> int:
+    """Fetch secrets now and report (or apply) what would be exported.
+
+    Without ``--apply`` this is a dry-run that lists, per secret name, whether
+    it would be exported or skipped. With ``--apply`` the values are assigned
+    into ``os.environ`` of this process and existing values are overridden.
+    Only secret names are printed, never values.
+
+    Args:
+        args: Parsed CLI namespace; reads the boolean ``apply`` flag.
+
+    Returns:
+        ``0`` on success (including an empty project); ``1`` when the
+        integration is disabled, the token or project id is missing, or the
+        fetch raises.
+    """
+    console = Console()
+    cfg = load_config()
+    bw_cfg = (cfg.get("secrets") or {}).get("bitwarden") or {}
+    if not bw_cfg.get("enabled"):
+        console.print(
+            "[yellow]Bitwarden integration is disabled.  Run "
+            "`hermes secrets bitwarden setup` first.[/yellow]"
+        )
+        return 1
+
+    token_env = bw_cfg.get("access_token_env", "BWS_ACCESS_TOKEN")
+    token = os.environ.get(token_env, "").strip()
+    if not token:
+        console.print(f"[red]{token_env} is not set.[/red]")
+        return 1
+
+    project_id = bw_cfg.get("project_id", "")
+    if not project_id:
+        console.print("[red]No project_id configured.[/red]")
+        return 1
+
+    try:
+        # use_cache=False: an explicit sync should always hit Bitwarden.
+        secrets, warnings = bw.fetch_bitwarden_secrets(
+            access_token=token,
+            project_id=project_id,
+            use_cache=False,
+        )
+    except Exception as exc:  # noqa: BLE001
+        console.print(f"[red]Fetch failed: {exc}[/red]")
+        return 1
+
+    if not secrets:
+        console.print("[yellow]No secrets in project.[/yellow]")
+        return 0
+
+    # Fallback is True to match the value ``setup`` writes for this key, so a
+    # dry-run on a config without the key does not report "skip" for values
+    # that would be overridden. ``--apply`` always overrides.
+    override = bool(bw_cfg.get("override_existing", True)) or args.apply
+    table = Table(show_header=True, header_style="bold")
+    table.add_column("Name", style="cyan")
+    table.add_column("Action")
+    applied = 0
+    for key in sorted(secrets):
+        if key == token_env:
+            # The bootstrap token must never be replaced by a fetched value.
+            table.add_row(key, "[dim]skip (bootstrap token)[/dim]")
+            continue
+        already = bool(os.environ.get(key))
+        if already and not override:
+            table.add_row(key, "[dim]skip (already set)[/dim]")
+            continue
+        if args.apply:
+            os.environ[key] = secrets[key]
+            applied += 1
+            table.add_row(key, "[green]exported[/green]" + (" (overrode)" if already else ""))
+        else:
+            table.add_row(key, "[green]would export[/green]" + (" (overrides)" if already else ""))
+
+    console.print(table)
+    for w in warnings:
+        console.print(f"[yellow]warning:[/yellow] {w}")
+
+    if not args.apply:
+        # Says "process", not "shell": assigning to os.environ cannot change
+        # the environment of the shell that launched this command.
+        console.print(
+            "\n  This was a dry-run — secrets are picked up automatically on the "
+            "next [cyan]hermes[/cyan] invocation.  Re-run with [cyan]--apply[/cyan] "
+            "to export into the current process instead."
+        )
+    else:
+        console.print(f"\n  [green]Exported {applied} secret(s) into current process.[/green]")
+    return 0
+
+
+def cmd_disable(args: argparse.Namespace) -> int:
+    """Switch ``secrets.bitwarden.enabled`` off and persist the config.
+
+    The stored access token is left untouched; the printed message tells the
+    user to remove it manually. Always returns ``0``.
+    """
+    out = Console()
+    config = load_config()
+    # Create the nested sections if missing so the flag is always written.
+    bitwarden_section = config.setdefault("secrets", {}).setdefault("bitwarden", {})
+    bitwarden_section["enabled"] = False
+    save_config(config)
+
+    notice = (
+        "[green]Disabled.[/green]  Bitwarden secrets will NOT be pulled on the next "
+        "Hermes invocation.\n"
+        "  Your access token is left in .env — remove it manually if you also want "
+        "to revoke the credential."
+    )
+    out.print(notice)
+    return 0
+
+
+def cmd_install(args: argparse.Namespace) -> int:
+    """Download the ``bws`` binary without touching token or project config.
+
+    ``args.force`` is passed through to ``bw.install_bws``. Returns ``0``
+    after printing the installed path and version, or ``1`` after printing
+    the error when anything in the install/report step raises.
+    """
+    out = Console()
+    exit_code = 0
+    try:
+        installed = bw.install_bws(force=bool(args.force))
+        out.print(f"[green]✓[/green] {installed}  ({_bws_version(installed)})")
+    except Exception as err:  # noqa: BLE001
+        out.print(f"[red]Install failed: {err}[/red]")
+        exit_code = 1
+    return exit_code
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _yn(b: bool) -> str:
+    """Render a truthy/falsy value as a Rich-markup ``yes`` / ``no`` label."""
+    if b:
+        return "[green]yes[/green]"
+    return "[dim]no[/dim]"
+
+
+def _bws_version(binary: Path) -> str:
+    """Return the first line printed by ``<binary> --version``.
+
+    Args:
+        binary: Path to the ``bws`` executable.
+
+    Returns:
+        The first non-empty-output line from stdout (or stderr when stdout is
+        empty), or ``"version unknown"`` when the command cannot be run,
+        times out after 5 seconds, exits non-zero, or prints nothing.
+    """
+    try:
+        res = subprocess.run(
+            [str(binary), "--version"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+    except (OSError, subprocess.TimeoutExpired):
+        return "version unknown"
+
+    if res.returncode != 0:
+        return "version unknown"
+    # A zero exit with empty/whitespace-only output yields no lines; indexing
+    # [0] on that would raise IndexError, so fall back instead.
+    lines = (res.stdout or res.stderr or "").strip().splitlines()
+    return lines[0] if lines else "version unknown"
+
+
+def _list_projects(
+    binary: Path, token: str, console: Console
+) -> Optional[List[dict]]:
+    """Run ``bws project list`` and return its projects, or None on failure.
+
+    The token is handed to the child process through ``BWS_ACCESS_TOKEN`` in
+    a copied environment. Failures (launch error, timeout, non-zero exit,
+    non-JSON output) are reported on ``console`` and yield ``None``. A JSON
+    payload that is not a list yields ``[]``; list entries that are not dicts
+    or lack a truthy ``id`` are dropped.
+    """
+    child_env = os.environ.copy()
+    child_env["BWS_ACCESS_TOKEN"] = token
+    if "NO_COLOR" not in child_env:
+        child_env["NO_COLOR"] = "1"
+
+    command = [str(binary), "project", "list", "--output", "json"]
+    try:
+        proc = subprocess.run(
+            command,
+            env=child_env,
+            capture_output=True,
+            text=True,
+            timeout=15,
+        )
+    except (OSError, subprocess.TimeoutExpired) as exc:
+        console.print(f"  [red]Couldn't list projects: {exc}[/red]")
+        return None
+
+    if proc.returncode != 0:
+        # Cap the echoed error so a noisy failure cannot flood the terminal.
+        err = (proc.stderr or proc.stdout).strip()[:300]
+        console.print(f"  [red]bws project list failed: {err}[/red]")
+        lowered = err.lower()
+        if "authorization" in lowered or "invalid" in lowered:
+            console.print(
+                "  [yellow]This usually means the access token is wrong or revoked. "
+                "Double-check it in the Bitwarden web app.[/yellow]"
+            )
+        return None
+
+    try:
+        payload = json.loads(proc.stdout or "[]")
+    except json.JSONDecodeError as exc:
+        console.print(f"  [red]bws returned non-JSON: {exc}[/red]")
+        return None
+
+    if isinstance(payload, list):
+        return [item for item in payload if isinstance(item, dict) and item.get("id")]
+    return []
@@ -23,6 +23,7 @@ from rich.table import Table
 # Lazy imports to avoid circular dependencies and slow startup.
 # tools.skills_hub and tools.skills_guard are imported inside functions.
 from hermes_constants import display_hermes_home
+from agent.skill_utils import is_excluded_skill_path

 _console = Console()

@@ -178,9 +179,12 @@ def _existing_categories() -> List[str]:
            # top level (no category); otherwise treat as a category bucket.
            if (entry / "SKILL.md").exists():
                continue
-            # Has at least one nested SKILL.md?
+            # Has at least one nested SKILL.md (excluding dependency/cache dirs)?
            try:
-                if any(entry.rglob("SKILL.md")):
+                if any(
+                    not is_excluded_skill_path(p)
+                    for p in entry.rglob("SKILL.md")
+                ):
                    out.append(entry.name)
            except OSError:
                continue
@@ -78,6 +78,7 @@ CONFIGURABLE_TOOLSETS = [
    ("discord_admin",   "🛡️  Discord Server Admin",    "list channels/roles, pin, assign roles"),
    ("yuanbao",          "🤖 Yuanbao",                  "group info, member queries, DM"),
    ("computer_use",     "🖱️  Computer Use (macOS)",     "background desktop control via cua-driver"),
+    ("app_tools",        "🔌 App Integrations (500+)",   "Gmail, Slack, GitHub, Jira, Notion, etc. via Nous tool gateway"),
 ]

 # Toolsets that are OFF by default for new installs.
@@ -311,6 +312,16 @@ TOOL_CATEGORIES = {
    "image_gen": {
        "name": "Image Generation",
        "icon": "🎨",
+        # Per-provider rows for FAL.ai (`plugins/image_gen/fal`), OpenAI,
+        # OpenAI Codex, and xAI are injected at runtime from each
+        # ``plugins.image_gen.<vendor>`` package via
+        # ``_plugin_image_gen_providers()`` in ``_visible_providers``.
+        # Only non-provider UX setup-flow rows remain here:
+        #   - "Nous Subscription" — managed FAL billed via the Nous
+        #     subscription (requires_nous_auth + override_env_vars).
+        #     Uses the fal plugin as the underlying backend but has a
+        #     distinct setup UX.
+        # Mirrors the shape browser/video_gen ship today.
        "providers": [
            {
                "name": "Nous Subscription",
@@ -322,15 +333,6 @@ TOOL_CATEGORIES = {
                "override_env_vars": ["FAL_KEY"],
                "imagegen_backend": "fal",
            },
-            {
-                "name": "FAL.ai",
-                "badge": "paid",
-                "tag": "Pick from flux-2-klein, flux-2-pro, gpt-image, nano-banana, etc.",
-                "env_vars": [
-                    {"key": "FAL_KEY", "prompt": "FAL API key", "url": "https://fal.ai/dashboard/keys"},
-                ],
-                "imagegen_backend": "fal",
-            },
        ],
    },
    "video_gen": {
@@ -482,6 +484,11 @@ TOOLSET_ENV_REQUIREMENTS = {
 # ─── Post-Setup Hooks ─────────────────────────────────────────────────────────


+def _cua_driver_cmd() -> str:
+    """Resolve the cua-driver command, preferring ``HERMES_CUA_DRIVER_CMD``.
+
+    The env var is used only when it is non-empty after stripping
+    whitespace; otherwise the default ``"cua-driver"`` is returned.
+    """
+    configured = os.environ.get("HERMES_CUA_DRIVER_CMD", "").strip()
+    if configured:
+        return configured
+    return "cua-driver"
+
+
 def _pip_install(
    args: List[str],
    *,
@@ -550,6 +557,55 @@ def _pip_install(
    )


+
+def _check_cua_driver_asset_for_arch() -> bool:
+    """Probe the latest CUA release for a build matching this CPU architecture.
+
+    Returns True when a matching asset is listed, when the machine reports
+    ``arm64``, or when the probe fails for any reason (the answer is then
+    unknown). Returns False, after printing a warning, only when the release
+    metadata was read and lists no x86_64/amd64 asset, so callers can skip
+    the install attempt and avoid a raw 404.
+    """
+    import platform as _plat
+    import urllib.request
+
+    if _plat.machine() == "arm64":
+        # arm64 (Apple Silicon) assets are always published.
+        return True
+
+    # x86_64 / Intel — look through the latest release's asset names before
+    # handing off to the upstream installer.
+    latest_release_url = "https://api.github.com/repos/trycua/cua/releases/latest"
+    try:
+        request = urllib.request.Request(
+            latest_release_url,
+            headers={"Accept": "application/vnd.github+json"},
+        )
+        with urllib.request.urlopen(request, timeout=10) as response:
+            payload = _json.loads(response.read().decode())
+        release_tag = payload.get("tag_name", "")
+        intel_markers = {"x86_64", "amd64"}
+        for asset in payload.get("assets", []):
+            asset_name = asset.get("name", "").lower()
+            if any(marker in asset_name for marker in intel_markers):
+                return True
+        _print_warning(
+            f"    Latest CUA release ({release_tag}) has no Intel (x86_64) asset."
+        )
+        _print_info(
+            "    CUA Driver currently only ships Apple Silicon builds."
+        )
+        _print_info(
+            "    See: https://github.com/trycua/cua/issues/1493"
+        )
+        return False
+    except Exception:
+        # Network / API failure — proceed and let the installer handle it.
+        return True
+
+
 def install_cua_driver(upgrade: bool = False) -> bool:
    """Install or refresh the cua-driver binary used by Computer Use.

@@ -579,7 +635,8 @@ def install_cua_driver(upgrade: bool = False) -> bool:
        _print_warning("    Computer Use (cua-driver) is macOS-only; skipping.")
        return False

-    binary = shutil.which("cua-driver")
+    driver_cmd = _cua_driver_cmd()
+    binary = shutil.which(driver_cmd)

    # Not installed → fresh install path (only when caller asked for it).
    if not binary and not upgrade:
@@ -587,18 +644,20 @@ def install_cua_driver(upgrade: bool = False) -> bool:
            _print_warning("    curl not found — install manually:")
            _print_info("      https://github.com/trycua/cua/blob/main/libs/cua-driver/README.md")
            return False
+        if not _check_cua_driver_asset_for_arch():
+            return False
        return _run_cua_driver_installer(label="Installing")

    # Already installed and caller didn't ask to upgrade → just confirm.
    if binary and not upgrade:
        try:
            version = subprocess.run(
-                ["cua-driver", "--version"],
+                [driver_cmd, "--version"],
                capture_output=True, text=True, timeout=5,
            ).stdout.strip()
-            _print_success(f"    cua-driver already installed: {version or 'unknown version'}")
+            _print_success(f"    {driver_cmd} already installed: {version or 'unknown version'}")
        except Exception:
-            _print_success("    cua-driver already installed.")
+            _print_success(f"    {driver_cmd} already installed.")
        _print_info("    Grant macOS permissions if not done yet:")
        _print_info("      System Settings > Privacy & Security > Accessibility")
        _print_info("      System Settings > Privacy & Security > Screen Recording")
@@ -609,11 +668,14 @@ def install_cua_driver(upgrade: bool = False) -> bool:
        _print_warning("    curl not found — cannot refresh cua-driver.")
        return bool(binary)

+    if not _check_cua_driver_asset_for_arch():
+        return bool(binary)
+
    if binary:
        # Show before/after version when we have a baseline. Best-effort.
        try:
            before = subprocess.run(
-                ["cua-driver", "--version"],
+                [driver_cmd, "--version"],
                capture_output=True, text=True, timeout=5,
            ).stdout.strip()
        except Exception:
@@ -625,13 +687,13 @@ def install_cua_driver(upgrade: bool = False) -> bool:
    if ok and before:
        try:
            after = subprocess.run(
-                ["cua-driver", "--version"],
+                [driver_cmd, "--version"],
                capture_output=True, text=True, timeout=5,
            ).stdout.strip()
            if after and after != before:
-                _print_success(f"    cua-driver upgraded: {before} → {after}")
+                _print_success(f"    {driver_cmd} upgraded: {before} → {after}")
            elif after:
-                _print_info(f"    cua-driver up to date: {after}")
+                _print_info(f"    {driver_cmd} up to date: {after}")
        except Exception:
            pass
    return ok
@@ -655,11 +717,12 @@ def _run_cua_driver_installer(label: str = "Installing", verbose: bool = True) -
        _print_info(f"    {label} cua-driver (macOS background computer-use)...")
    else:
        _print_info(f"    {label} cua-driver...")
+    driver_cmd = _cua_driver_cmd()
    try:
        result = subprocess.run(install_cmd, shell=True, timeout=300)
-        if result.returncode == 0 and shutil.which("cua-driver"):
+        if result.returncode == 0 and shutil.which(driver_cmd):
            if verbose:
-                _print_success("    cua-driver installed.")
+                _print_success(f"    {driver_cmd} installed.")
                _print_info("    IMPORTANT — grant macOS permissions now:")
                _print_info("      System Settings > Privacy & Security > Accessibility")
                _print_info("      System Settings > Privacy & Security > Screen Recording")
@@ -1506,12 +1569,9 @@ def _plugin_image_gen_providers() -> list[dict]:
    Each returned dict looks like a regular ``TOOL_CATEGORIES`` provider
    row but carries an ``image_gen_plugin_name`` marker so downstream
    code (config writing, model picker) knows to route through the
-    plugin registry instead of the in-tree FAL backend.
-
-    FAL is skipped — it's already exposed by the hardcoded
-    ``TOOL_CATEGORIES["image_gen"]`` entries. When FAL gets ported to
-    a plugin in a follow-up PR, the hardcoded entries go away and this
-    function surfaces it alongside OpenAI automatically.
+    plugin registry. Every image-gen backend is a plugin now — there
+    are no hardcoded rows left in ``TOOL_CATEGORIES["image_gen"]`` for
+    this function to dedupe against (see issue #26241).
    """
    try:
        from agent.image_gen_registry import list_providers
@@ -1524,9 +1584,6 @@ def _plugin_image_gen_providers() -> list[dict]:

    rows: list[dict] = []
    for provider in providers:
-        if getattr(provider, "name", None) == "fal":
-            # FAL has its own hardcoded rows today.
-            continue
        try:
            schema = provider.get_setup_schema()
        except Exception:
@@ -1751,7 +1808,7 @@ _POST_SETUP_INSTALLED: dict = {
    # entry when (a) the post_setup is the ONLY install side-effect for
    # a no-key provider, and (b) an installed-state check is cheap and
    # doesn't trigger a heavy import.
-    "cua_driver": lambda: bool(shutil.which("cua-driver")),
+    "cua_driver": lambda: bool(shutil.which(_cua_driver_cmd())),
 }


@@ -975,11 +975,13 @@ _AUX_TASK_SLOTS: Tuple[str, ...] = (
    "vision",
    "web_extract",
    "compression",
-    "session_search",
    "skills_hub",
    "approval",
    "mcp",
    "title_generation",
+    "triage_specifier",
+    "kanban_decomposer",
+    "profile_describer",
    "curator",
 )

@@ -0,0 +1,121 @@
+Create a professional infographic following these specifications:
+
+## Image Specifications
+
+- **Type**: Infographic
+- **Layout**: bento-grid
+- **Style**: retro-pop-grid
+- **Aspect Ratio**: 1:1 (square)
+- **Language**: en
+
+## Core Principles
+
+- Follow the layout structure precisely for information architecture
+- Apply style aesthetics consistently throughout
+- Keep information concise, highlight keywords and core concepts
+- Use ample whitespace for visual clarity
+- Maintain clear visual hierarchy
+
+## Text Requirements
+
+- All text must match the specified style treatment
+- Main titles should be prominent and readable
+- Key concepts should be visually emphasized
+- Labels should be clear and appropriately sized
+- Use English for all text content
+
+## Layout Guidelines (bento-grid)
+
+- Grid of rectangular cells with varied sizes (1x1, 2x1, 1x2, 2x2)
+- Hero cell ("ONE TOKEN, EVERY KEY") takes the largest position (top-center or upper-left, 2x2)
+- Supporting cells around the hero, mixed cell sizes for rhythm
+- Each cell self-contained with its own title + icon + brief content
+- Title strip at the top: "BITWARDEN SECRETS MANAGER → HERMES-AGENT" with "PR #30035" on a second line
+- Footer strip at the bottom with commit SHA + repo
+
+## Style Guidelines (retro-pop-grid)
+
+- 1970s retro pop art with strict Swiss international grid
+- Background: warm vintage cream/beige (#F5F0E6)
+- Accents: salmon pink, sky blue, mustard yellow, mint green — all muted retro tones
+- Pure solid black (#000000) and solid white (#FFFFFF) for extreme-contrast cells
+- Uniform thick black outlines on ALL illustrations, text boxes, grid dividers
+- Pure 2D flat vector aesthetic with subtle screen-print texture
+- One cell inverted to black-background-with-white-text for the "NEVER BLOCKS STARTUP" warning section
+- Geometric fill patterns in empty cells: checkerboards, diagonal lines, dot grids
+- Flat abstract symbols: shields (security), wrenches (install), arrows (rotation), keyholes (auth), checkmarks (tests)
+- Vintage comic-style smiley face for "26/26 PASSING" cell
+- Bold brutalist or thick retro display fonts for headers; clean sans-serif body
+- Decorative stylistic labels acceptable: "WARNING", "NEW DEFAULT", "PINNED", "VERIFIED", "ROTATE"
+
+## Avoid
+
+- 3D rendering, gradients, soft shadows, sketch-like lines
+- Free-floating elements — everything anchored in grid cells
+- Pure white background — must use warm cream/beige
+
+---
+
+Generate the infographic based on the content below:
+
+### Title (top strip)
+BITWARDEN SECRETS MANAGER → HERMES-AGENT
+PR #30035
+
+### HERO CELL (largest, top-center, salmon pink background with thick black border)
+ONE TOKEN, EVERY KEY
+Rotate once in the Bitwarden web app.
+Every Hermes process picks it up on next start.
+NEW DEFAULT: override_existing = true
+
+### Cell — LAZY INSTALL (sky blue background)
+~/.hermes/bin/bws
+bws v2.0.0 PINNED
+SHA-256 VERIFIED
+No apt · no brew · no sudo
+Icon: wrench + downward arrow
+
+### Cell — CLI SURFACE (mustard yellow background, checkerboard accents)
+$ hermes secrets bitwarden
+  setup    wizard
+  status   diagnose
+  sync     fetch
+  install  binary
+  disable  off
+Icon: terminal prompt symbol
+
+### Cell — SOURCE OF TRUTH (mint green background)
+BITWARDEN WINS
+Overwrites stale .env on every start
+Bootstrap token never overwritten (exception)
+Icon: keyhole + arrow
+
+### Cell — INVERTED BLACK CELL with WHITE TEXT — NEVER BLOCKS STARTUP (extreme contrast)
+NEVER BLOCKS STARTUP
+Missing binary → warn + continue
+Bad token → warn + continue
+Network down → warn + continue
+Checksum mismatch → refuse + warn
+30s timeout ceiling
+Icon: white triangle warning sign
+
+### Cell — TESTS (cream with thick black outline, vintage comic smiley face)
+26 / 26
+HERMETIC
+subprocess + urllib mocked
+linux · macos · windows
+x86_64 · arm64
+Icon: comic-style smiley face with checkmark
+
+### Cell — CONFIG YAML (white background with black grid)
+secrets:
+  bitwarden:
+    enabled: true
+    project_id: ...
+    override_existing: true
+    cache_ttl_seconds: 300
+    auto_install: true
+
+### Footer strip (bottom, black-on-cream)
+PR #30035 · commit 7f9b05668 · NousResearch/hermes-agent
+10 files · +1743 / -1 · agent/secret_sources/ · hermes_cli/secrets_cli.py
@@ -0,0 +1,57 @@
+# Hermes-Agent PR #30035 — Bitwarden Secrets Manager Integration
+
+## Hero
+**ONE TOKEN, EVERY KEY**
+Rotate once. Every Hermes process picks it up on next start.
+`secrets.bitwarden.override_existing: true` (default)
+
+## Cells
+
+### Lazy Install
+- `bws v2.0.0` pinned
+- Downloaded into `~/.hermes/bin/bws`
+- SHA-256 verified vs GitHub Releases checksum file
+- No apt, no brew, no sudo
+- Cross-platform: linux gnu+musl, macos universal, windows x86_64+arm64
+
+### CLI Surface
+- `hermes secrets bitwarden setup`     wizard
+- `hermes secrets bitwarden status`    diagnose
+- `hermes secrets bitwarden sync`      dry-run / --apply
+- `hermes secrets bitwarden install`   binary only
+- `hermes secrets bitwarden disable`   off switch
+
+### Source of Truth
+- Bitwarden WINS on every Hermes start
+- BSM values overwrite stale `.env` lines
+- Rotate a key once → all your machines reload it
+- Bootstrap token `BWS_ACCESS_TOKEN` is the lone exception (never overwritten)
+
+### Never Blocks Startup
+- Missing binary → warn + continue
+- Bad token → warn + continue
+- Checksum mismatch → refuse install + warn
+- No network → warn + continue
+- Timeout → 30s ceiling, warn + continue
+
+### Tests
+- 26/26 passing, hermetic
+- subprocess + urllib mocked
+- Platform matrix tested (linux, macos, windows × x86_64, arm64)
+- Cache hit/miss, auth fail, non-JSON, timeout, override behavior
+
+### Config
+```yaml
+secrets:
+  bitwarden:
+    enabled: true
+    project_id: <uuid>
+    override_existing: true   # NEW DEFAULT
+    cache_ttl_seconds: 300
+    auto_install: true
+```
+
+## Footer
+PR #30035 · commit 7f9b05668 · NousResearch/hermes-agent
+
+10 files changed · +1743 / -1 · agent/secret_sources/ · hermes_cli/secrets_cli.py · tests · docs
@@ -0,0 +1,85 @@
+Create a professional infographic following these specifications:
+
+## Image Specifications
+
+- **Type**: Infographic
+- **Layout**: bento-grid
+- **Style**: technical-schematic (engineering blueprint variant)
+- **Aspect Ratio**: 1:1 (square)
+- **Language**: English
+
+## Core Principles
+
+- Follow the bento-grid layout precisely with varied cell sizes
+- Apply technical-schematic aesthetics consistently throughout
+- Keep information concise, highlight keywords and core concepts
+- Use ample whitespace for visual clarity
+- Maintain clear visual hierarchy with a hero cell for the headline metric
+
+## Style Guidelines (technical-schematic blueprint)
+
+- Color palette: deep blue background (#1E3A5F), white lines and text, amber accent (#F59E0B) ONLY on the hero metric and critical deltas, cyan callouts for measurement annotations
+- Grid pattern overlay across the entire canvas — fine white grid lines on the deep blue background
+- All-caps technical stencil typography for headers; clean sans-serif for body
+- Dimension lines with arrowheads connecting metrics to their cells
+- Technical symbols where appropriate (gear icons, flow arrows, modular block diagrams)
+- Consistent stroke weights — bold for cell borders, thin for grid, medium for connector lines
+- Engineering spec-sheet aesthetic: feels like a printed architectural blueprint, austere and precise
+
+## Layout Guidelines (bento-grid)
+
+- Hero cell (TOP-CENTER or LEFT, occupying ~40% of canvas): "−61 COMPLEXITY · 79 → 18" headline metric in massive amber-on-blue, with subtitle "convert_messages_to_anthropic refactored"
+- 7 helper cells in a 2x4 or 3x3 grid showing each extracted helper as its own modular block — each cell has the helper name in all-caps, its complexity number, and one-line role
+- Metrics strip cell: BEFORE/AFTER table with deltas (185 statements → ~70, 79 C → 18 C, +5 violations intentional)
+- Test validation cell: "152/152 + 213/213 PASS" with checkmark stencil
+- Footer strip across bottom: "PR #27784 · agent/anthropic_adapter.py · @kshitijk4poor · NousResearch/hermes-agent"
+
+## Content to render
+
+**Main title (top of canvas, all caps):** "ANTHROPIC ADAPTER · 1-INTO-7 EXTRACTION"
+**Subtitle:** "PR #27784 — convert_messages_to_anthropic refactor"
+
+**Hero cell (largest, amber accent):**
+- "−61"
+- "CYCLOMATIC COMPLEXITY"
+- "79 → 18 MAX (−77%)"
+- Subtext: "convert_messages_to_anthropic · pure code motion · zero behavior change"
+
+**7 helper cells (one per helper, each its own modular block):**
+
+1. _convert_assistant_message · C<10 · "Assistant msg → content blocks"
+2. _convert_tool_message_to_result · C=12 · "Tool msg → tool_result + merge"
+3. _convert_user_message · C<10 · "User msg validation"
+4. _strip_orphaned_tool_blocks · C=15 · "Orphan tool_use removal"
+5. _merge_consecutive_roles · C=13 · "Anthropic role-alternation"
+6. _manage_thinking_signatures · C=18 · "Strip/preserve by endpoint"
+7. _evict_old_screenshots · C<10 · "Keep most recent 3 images"
+
+**Metrics cell (table format with arrows):**
+- MAX FUNCTION COMPLEXITY: 79 → 18 (−77%)
+- MAX STATEMENTS/FUNCTION: 185 → ~70 (−62%)
+- LOC FILE-WIDE: −4
+- MAIN FUNCTION LOC: 395 → 63
+
+**Test validation cell (checkmark stencil):**
+- test_anthropic_adapter.py: 152/152 PASS
+- test_auxiliary_client.py: 172/172 PASS
+- test_azure_identity_adapter.py: 39/39 PASS
+- test_bedrock_1m_context.py: 2/2 PASS
+
+**Behavior preservation cell:**
+"ZERO LOGIC CHANGES · ANTHROPIC + KIMI + DEEPSEEK + MINIMAX + AZURE FOUNDRY + BEDROCK SEMANTICS PRESERVED"
+
+**Footer strip:**
+"PR #27784 · agent/anthropic_adapter.py · cherry-picked from #23968 · @kshitijk4poor · NousResearch/hermes-agent"
+
+## Text Requirements
+
+- All text in English, all-caps for headers
+- Hero metric "−61" in amber (#F59E0B), oversized, with thick blueprint stencil treatment
+- Helper names in white technical stencil
+- Complexity numbers (C=12, C=18, etc.) in cyan callouts
+- "BEFORE" labels in white-on-blue, "AFTER" labels in amber-on-blue
+- Footer in small white stencil
+
+Generate the infographic now as a square engineering blueprint.
@@ -0,0 +1,66 @@
+# Infographic: PR #27784 — convert_messages_to_anthropic refactor
+
+## Hero metric
+**−61 cyclomatic complexity** in `agent/anthropic_adapter.py` (79 → 18 max).
+**−4 LOC** net file-wide. **77% drop** in single-function complexity ceiling.
+
+## Title
+ANTHROPIC ADAPTER · 1-INTO-7 EXTRACTION
+PR #27784 · agent/anthropic_adapter.py · @kshitijk4poor
+
+## Section 1: BEFORE (left side)
+**convert_messages_to_anthropic**
+- 185 statements
+- 90 branches
+- Cyclomatic: 79
+- Did 7 jobs in one function
+
+Inline responsibilities mixed together:
+1. Walk + dispatch by role
+2. Tool-result conversion
+3. Orphan tool-use stripping
+4. Same-role merging
+5. Thinking-signature management
+6. Screenshot eviction
+7. Final assembly
+
+## Section 2: AFTER (right side)
+**convert_messages_to_anthropic** — now 63 lines, C<10
+Plus 7 single-responsibility helpers:
+
+| Helper | C | Role |
+|---|---|---|
+| _convert_assistant_message | <10 | Assistant msg → content blocks |
+| _convert_tool_message_to_result | 12 | Tool msg → tool_result + merge |
+| _convert_user_message | <10 | User msg validation + conversion |
+| _strip_orphaned_tool_blocks | 15 | Strip orphan tool_use + tool_result |
+| _merge_consecutive_roles | 13 | Anthropic role-alternation enforce |
+| _manage_thinking_signatures | 18 | Strip/preserve/downgrade by endpoint |
+| _evict_old_screenshots | <10 | Keep most recent 3 images |
+
+## Section 3: METRICS
+| Metric | Before | After | Δ |
+|---|---:|---:|---:|
+| Max function complexity | 79 | 18 | −77% |
+| Max statements/function | 185 | ~70 | −62% |
+| LOC (file-wide) | — | — | **−4** |
+| C901 violations | 3 | 8 | +5 (intentional split) |
+
+## Section 4: ZERO BEHAVIOR CHANGE
+- Pure code motion — no logic edits
+- Mutating helpers update `result` in place (same as inline)
+- `_merge_consecutive_roles` returns new list — caller rebinds
+- Anthropic / Kimi / DeepSeek / MiniMax / Azure Foundry / Bedrock semantics preserved
+- Thinking-signature handling identical to pre-refactor
+
+## Section 5: TEST VALIDATION
+- tests/agent/test_anthropic_adapter.py — **152 / 152 pass**
+- tests/agent/test_auxiliary_client.py — **172 / 172 pass**
+- tests/agent/test_azure_identity_adapter.py — **39 / 39 pass**
+- tests/agent/test_bedrock_1m_context.py — **2 / 2 pass**
+
+## Footer
+File: agent/anthropic_adapter.py
+Original PR: #27784 (cherry-pick of #23968)
+Salvage commit: 9c102b937 (kshitijk4poor authorship preserved)
+Repo: NousResearch/hermes-agent
@@ -4,7 +4,7 @@ let
  src = ../ui-tui;
  npmDeps = pkgs.fetchNpmDeps {
    inherit src;
-    hash = "sha256-dNL/J4tyQQ7Ji3xfIE5b5Jdi6rQyCFjqYpzLYftJVdc=";
+    hash = "sha256-F6/MzZOWc0zhW9mIfnaY+PrllPvJcsA/OdFdEM+NpLY=";
  };

  npm = hermesNpmLib.mkNpmPassthru { folder = "ui-tui"; attr = "tui"; pname = "hermes-tui"; };
@@ -4,7 +4,7 @@ let
  src = ../web;
  npmDeps = pkgs.fetchNpmDeps {
    inherit src;
-    hash = "sha256-GxSmEpclOwmv94KmGMediPITxqXAsxqTEQOoDIbYkUw=";
+    hash = "sha256-xSsyluzU2lNhwGqB6XMCGMv3QFHZizE6hgUyc1jvyOw=";
  };

  npm = hermesNpmLib.mkNpmPassthru { folder = "web"; attr = "web"; pname = "hermes-web"; };
@@ -148,7 +148,7 @@ class BrowserUseBrowserProvider(BrowserProvider):

        return {
            "api_key": managed.nous_user_token,
-            "base_url": managed.gateway_origin.rstrip("/"),
+            "base_url": managed.resolved_origin.rstrip("/"),
            "managed_mode": True,
        }

@@ -0,0 +1,182 @@
+"""FAL.ai image generation backend.
+
+Wraps the 18-model FAL catalog (FLUX 2, Z-Image, Nano Banana, GPT
+Image 1.5, Recraft, Imagen 4, Qwen, Ideogram, …) as an
+:class:`ImageGenProvider` implementation.
+
+The heavy lifting — model catalog, payload construction, request
+submission, managed-Nous-gateway selection, Clarity Upscaler chaining
+— lives in :mod:`tools.image_generation_tool`. This plugin reaches into
+that module via call-time indirection (``import tools.image_generation_tool as _it``)
+so:
+
+* the existing test suite (``tests/tools/test_image_generation.py``,
+  ``tests/tools/test_managed_media_gateways.py``) keeps patching
+  ``image_tool._submit_fal_request`` / ``image_tool.fal_client`` /
+  ``image_tool._managed_fal_client`` without modification, and
+* there's exactly one canonical FAL code path on disk — the plugin is a
+  registration adapter, not a parallel implementation.
+
+See issue #26241 for the migration plan and the
+``plugin-extraction-test-patch-compatibility.md`` rules this follows.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from typing import Any, Dict, List, Optional
+
+from agent.image_gen_provider import (
+    DEFAULT_ASPECT_RATIO,
+    ImageGenProvider,
+    resolve_aspect_ratio,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class FalImageGenProvider(ImageGenProvider):
+    """FAL.ai image generation backend.
+
+    Delegates to ``tools.image_generation_tool.image_generate_tool`` so
+    the in-tree FAL implementation (model catalog, payload builder,
+    managed-gateway selection, Clarity Upscaler chaining) is the single
+    source of truth. Everything is resolved at call time via the
+    ``_it`` indirection so tests can monkey-patch the legacy module.
+    """
+
+    @property
+    def name(self) -> str:
+        return "fal"
+
+    @property
+    def display_name(self) -> str:
+        return "FAL.ai"
+
+    def is_available(self) -> bool:
+        # Available when direct FAL_KEY is set OR the managed Nous
+        # gateway resolves a fal-queue origin. Both checks come from the
+        # legacy module so this provider tracks whatever logic ships
+        # there.
+        import tools.image_generation_tool as _it
+        try:
+            return bool(_it.check_fal_api_key())
+        except Exception:  # noqa: BLE001 — defensive; never break the picker
+            return False
+
+    def list_models(self) -> List[Dict[str, Any]]:
+        import tools.image_generation_tool as _it
+        return [
+            {
+                "id": model_id,
+                "display": meta.get("display", model_id),
+                "speed": meta.get("speed", ""),
+                "strengths": meta.get("strengths", ""),
+                "price": meta.get("price", ""),
+            }
+            for model_id, meta in _it.FAL_MODELS.items()
+        ]
+
+    def default_model(self) -> Optional[str]:
+        import tools.image_generation_tool as _it
+        return _it.DEFAULT_MODEL
+
+    def get_setup_schema(self) -> Dict[str, Any]:
+        return {
+            "name": "FAL.ai",
+            "badge": "paid",
+            "tag": "Pick from flux-2-klein, flux-2-pro, gpt-image, nano-banana, etc.",
+            "env_vars": [
+                {
+                    "key": "FAL_KEY",
+                    "prompt": "FAL API key",
+                    "url": "https://fal.ai/dashboard/keys",
+                },
+            ],
+        }
+
+    def generate(
+        self,
+        prompt: str,
+        aspect_ratio: str = DEFAULT_ASPECT_RATIO,
+        **kwargs: Any,
+    ) -> Dict[str, Any]:
+        """Generate an image via the legacy FAL pipeline.
+
+        Forwards prompt + aspect_ratio (and any forward-compat extras
+        the schema supports) into :func:`tools.image_generation_tool.image_generate_tool`,
+        then reshapes its JSON-string response into the provider-ABC
+        dict format consumed by ``_dispatch_to_plugin_provider``.
+        """
+        import tools.image_generation_tool as _it
+
+        aspect = resolve_aspect_ratio(aspect_ratio)
+        passthrough = {
+            key: kwargs[key]
+            for key in (
+                "num_inference_steps",
+                "guidance_scale",
+                "num_images",
+                "output_format",
+                "seed",
+            )
+            if key in kwargs and kwargs[key] is not None
+        }
+
+        try:
+            raw = _it.image_generate_tool(
+                prompt=prompt,
+                aspect_ratio=aspect,
+                **passthrough,
+            )
+        except Exception as exc:  # noqa: BLE001 — never raise out of generate
+            logger.warning("FAL image_generate_tool raised: %s", exc, exc_info=True)
+            return {
+                "success": False,
+                "image": None,
+                "error": f"FAL image generation failed: {exc}",
+                "error_type": type(exc).__name__,
+                "provider": "fal",
+                "prompt": prompt,
+                "aspect_ratio": aspect,
+            }
+
+        try:
+            response = json.loads(raw) if isinstance(raw, str) else raw
+        except Exception:  # noqa: BLE001
+            response = {"success": False, "image": None, "error": "Invalid JSON from FAL pipeline"}
+
+        if not isinstance(response, dict):
+            response = {
+                "success": False,
+                "image": None,
+                "error": "FAL pipeline returned a non-dict response",
+                "error_type": "provider_contract",
+            }
+
+        # Stamp provider/prompt/aspect_ratio so downstream consumers see
+        # the uniform shape declared in ``agent.image_gen_provider``.
+        response.setdefault("provider", "fal")
+        response.setdefault("prompt", prompt)
+        response.setdefault("aspect_ratio", aspect)
+        # Annotate model best-effort — the legacy pipeline resolves it
+        # internally, so query it after the fact for the response shape.
+        if "model" not in response:
+            try:
+                model_id, _meta = _it._resolve_fal_model()
+                response["model"] = model_id
+            except Exception:  # noqa: BLE001
+                pass
+        return response
+
+
+# ---------------------------------------------------------------------------
+# Plugin entry point
+# ---------------------------------------------------------------------------
+
+
+def register(ctx) -> None:
+    """Plugin entry point — wire ``FalImageGenProvider`` into the registry."""
+    ctx.register_image_gen_provider(FalImageGenProvider())
@@ -0,0 +1,7 @@
+name: fal
+version: 1.0.0
+description: "FAL.ai image generation backend (flux-2-klein, flux-2-pro, nano-banana, gpt-image-1.5, recraft-v3, etc.)."
+author: NousResearch
+kind: backend
+requires_env:
+  - FAL_KEY
@@ -47,6 +47,25 @@ _DEFAULT_ENDPOINT = "http://127.0.0.1:1933"
 _TIMEOUT = 30.0
 _REMOTE_RESOURCE_PREFIXES = ("http://", "https://", "git@", "ssh://", "git://")

+# Maps the viking_remember `category` enum to a viking:// subdirectory.
+# Keep in sync with REMEMBER_SCHEMA.parameters.properties.category.enum.
+_CATEGORY_SUBDIR_MAP = {
+    "preference": "preferences",
+    "entity": "entities",
+    "event": "events",
+    "case": "cases",
+    "pattern": "patterns",
+}
+_DEFAULT_MEMORY_SUBDIR = "preferences"
+
+# Maps the built-in memory tool's `target` ("user" vs "memory") to a subdir
+# for on_memory_write mirroring. User profile facts → preferences; agent
+# notes / observations → patterns. Anything unknown falls back to the default.
+_MEMORY_WRITE_TARGET_SUBDIR_MAP = {
+    "user": "preferences",
+    "memory": "patterns",
+}
+

 # ---------------------------------------------------------------------------
 # Process-level atexit safety net — ensures pending sessions are committed
@@ -607,24 +626,35 @@ class OpenVikingMemoryProvider(MemoryProvider):
        except Exception as e:
            logger.warning("OpenViking session commit failed: %s", e)

-    def on_memory_write(self, action: str, target: str, content: str) -> None:
-        """Mirror built-in memory writes to OpenViking as explicit memories."""
+    def _build_memory_uri(self, subdir: str) -> str:
+        """Build a viking:// memory URI under the configured user/subdir."""
+        slug = uuid.uuid4().hex[:12]
+        return f"viking://user/{self._user}/memories/{subdir}/mem_{slug}.md"
+
+    def on_memory_write(
+        self,
+        action: str,
+        target: str,
+        content: str,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """Mirror built-in memory writes to OpenViking via content/write."""
        if not self._client or action != "add" or not content:
            return

+        subdir = _MEMORY_WRITE_TARGET_SUBDIR_MAP.get(target, _DEFAULT_MEMORY_SUBDIR)
+        uri = self._build_memory_uri(subdir)
+
        def _write():
            try:
                client = _VikingClient(
                    self._endpoint, self._api_key,
                    account=self._account, user=self._user, agent=self._agent,
                )
-                # Add as a user message with memory context so the commit
-                # picks it up as an explicit memory during extraction
-                client.post(f"/api/v1/sessions/{self._session_id}/messages", {
-                    "role": "user",
-                    "parts": [
-                        {"type": "text", "text": f"[Memory note — {target}] {content}"},
-                    ],
+                client.post("/api/v1/content/write", {
+                    "uri": uri,
+                    "content": content,
+                    "mode": "create",
                })
            except Exception as e:
                logger.debug("OpenViking memory mirror failed: %s", e)
@@ -858,24 +888,27 @@ class OpenVikingMemoryProvider(MemoryProvider):
        if not content:
            return tool_error("content is required")

-        # Store as a session message that will be extracted during commit.
-        # The category hint helps OpenViking's extraction classify correctly.
        category = args.get("category", "")
-        text = f"[Remember] {content}"
-        if category:
-            text = f"[Remember — {category}] {content}"
+        subdir = _CATEGORY_SUBDIR_MAP.get(category, _DEFAULT_MEMORY_SUBDIR)
+        uri = self._build_memory_uri(subdir)

-        self._client.post(f"/api/v1/sessions/{self._session_id}/messages", {
-            "role": "user",
-            "parts": [
-                {"type": "text", "text": text},
-            ],
-        })
-
-        return json.dumps({
-            "status": "stored",
-            "message": "Memory recorded. Will be extracted and indexed on session commit.",
-        })
+        # Write directly via content/write API.
+        # This creates the file, stores the content, and queues vector indexing
+        # in a single call — no dependency on session commit / VLM extraction.
+        try:
+            result = self._client.post("/api/v1/content/write", {
+                "uri": uri,
+                "content": content,
+                "mode": "create",
+            })
+            written = result.get("result", {}).get("written_bytes", 0)
+            return json.dumps({
+                "status": "stored",
+                "message": f"Memory stored ({written}b) and queued for vector indexing.",
+            })
+        except Exception as e:
+            logger.error("OpenViking content/write failed: %s", e)
+            return tool_error(f"Failed to store memory: {e}")

    def _tool_add_resource(self, args: dict) -> str:
        url = args.get("url", "")
@@ -282,20 +282,24 @@ def _build_payload(


 # ---------------------------------------------------------------------------
-# fal_client lazy import (same pattern as image_generation_tool)
+# fal_client lazy import (shared with image_generation_tool via fal_common)
 # ---------------------------------------------------------------------------

 _fal_client: Any = None


 def _load_fal_client() -> Any:
+    """Lazy-load the ``fal_client`` SDK and cache it on this module.
+
+    Delegates the actual import to :func:`tools.fal_common.import_fal_client`
+    so the ``lazy_deps`` ensure-install handling stays in one place.
+    """
    global _fal_client
    if _fal_client is not None:
        return _fal_client
-    import fal_client  # type: ignore
-
-    _fal_client = fal_client
-    return fal_client
+    from tools.fal_common import import_fal_client
+    _fal_client = import_fal_client()
+    return _fal_client


 # ---------------------------------------------------------------------------
@@ -238,7 +238,7 @@ def _get_firecrawl_client() -> Any:

        kwargs = {
            "api_key": managed_gateway.nous_user_token,
-            "api_url": managed_gateway.gateway_origin,
+            "api_url": managed_gateway.resolved_origin,
        }
        client_config = (
            "tool-gateway",
@@ -84,7 +84,7 @@ modal = ["modal==1.3.4"]
 daytona = ["daytona==0.155.0"]
 vercel = ["vercel==0.5.7"]
 hindsight = ["hindsight-client==0.6.1"]
-dev = ["debugpy==1.8.20", "pytest==9.0.2", "pytest-asyncio==1.3.0", "pytest-xdist==3.8.0", "pytest-split==0.11.0", "pytest-timeout==2.4.0", "mcp==1.26.0", "ty==0.0.21", "ruff==0.15.10"]
+dev = ["debugpy==1.8.20", "pytest==9.0.2", "pytest-asyncio==1.3.0", "pytest-timeout==2.4.0", "mcp==1.26.0", "ty==0.0.21", "ruff==0.15.10"]
 messaging = ["python-telegram-bot[webhooks]==22.6", "discord.py[voice]==2.7.1", "aiohttp==3.13.3", "brotlicffi==1.2.0.1", "slack-bolt==1.27.0", "slack-sdk==3.40.1", "qrcode==7.4.2"]
 cron = []  # croniter is now a core dependency; this extra kept for back-compat
 slack = ["slack-bolt==1.27.0", "slack-sdk==3.40.1", "aiohttp==3.13.3"]
@@ -232,16 +232,12 @@ markers = [
    "integration: marks tests requiring external services (API keys, Modal, etc.)",
    "real_concurrent_gate: opt out of the autouse stub that disables _detect_concurrent_hermes_instances",
 ]
-# pytest-timeout: per-test 60s hard cap with thread method.
-# Discovered May 2026: the suite reliably hangs at ~96% on full runs even
-# though every individual test completes in <30s. Root cause is leaked
-# threads / atexit handlers accumulating across thousands of tests until
-# something deadlocks at session teardown. Adding pytest-timeout (with
-# thread method, which forces an interrupt into the test thread) breaks
-# the deadlock — the suite then completes cleanly. The 60s cap is large
-# enough that no legitimate test trips it; if a test exceeds it that's a
-# real bug worth surfacing as a Timeout failure.
-addopts = "-m 'not integration' -n auto --timeout=30 --timeout-method=signal"
+# pytest-timeout: per-test 30s hard cap with signal method.
+# This is the fallback inside each per-file pytest subprocess (see
+# scripts/run_tests_parallel.py). Per-file isolation gives every test
+# file a fresh Python interpreter; pytest-timeout catches Python-level
+# hangs within a file.
+addopts = "-m 'not integration' --timeout=30 --timeout-method=signal"

 [tool.ty.environment]
 python-version = "3.13"
@@ -3357,6 +3357,25 @@ class AIAgent:
            return content

        if self._model_supports_vision():
+            # Vision-capable on paper — but if we've already learned in this
+            # session that the active (provider, model) rejects list-type
+            # tool content (e.g. Xiaomi MiMo's 400 "text is not set"),
+            # short-circuit to a text summary so we don't burn another
+            # round-trip relearning the same lesson.  Cache populated by
+            # the 400 recovery path in agent.conversation_loop.  Transient
+            # per-session; next session retries.
+            key = (
+                (getattr(self, "provider", "") or "").strip().lower(),
+                (getattr(self, "model", "") or "").strip(),
+            )
+            no_list = getattr(self, "_no_list_tool_content_models", None)
+            if no_list and key in no_list:
+                logger.debug(
+                    "Tool %s: model %s/%s known to reject list-type tool "
+                    "content this session — sending text summary",
+                    tool_name, key[0], key[1],
+                )
+                return _multimodal_text_summary(result)
            return content

        summary = _multimodal_text_summary(result)
@@ -3385,6 +3404,80 @@ class AIAgent:
        from agent.conversation_compression import try_shrink_image_parts_in_messages
        return try_shrink_image_parts_in_messages(api_messages)

+    def _try_strip_image_parts_from_tool_messages(self, api_messages: list) -> bool:
+        """Downgrade list-type tool messages to text summaries in-place.
+
+        Recovery path for providers that reject list-type tool message content
+        (e.g. Xiaomi MiMo's 400 "text is not set"; see issue #27344).  Walks
+        ``api_messages`` for any ``role: "tool"`` message whose ``content`` is
+        a list containing image parts, replaces the content with the existing
+        text part(s) (or a minimal placeholder if none survive), and records
+        the active (provider, model) in ``self._no_list_tool_content_models``
+        so subsequent ``_tool_result_content_for_active_model`` calls in this
+        session preemptively downgrade screenshots without a round-trip.
+
+        Returns True when at least one tool message was downgraded — the
+        caller (the 400 recovery branch in ``agent.conversation_loop``) uses
+        this to decide whether to retry the API call with the modified
+        history or surface the original error.
+        """
+        if not isinstance(api_messages, list):
+            return False
+
+        # Record (provider, model) so we don't relearn this lesson.
+        key = (
+            (getattr(self, "provider", "") or "").strip().lower(),
+            (getattr(self, "model", "") or "").strip(),
+        )
+        if not hasattr(self, "_no_list_tool_content_models"):
+            self._no_list_tool_content_models = set()
+        if key[1]:  # only record when we actually have a model id
+            self._no_list_tool_content_models.add(key)
+
+        changed = False
+        for msg in api_messages:
+            if not isinstance(msg, dict) or msg.get("role") != "tool":
+                continue
+            content = msg.get("content")
+            if not isinstance(content, list):
+                continue
+
+            # Salvage any text parts so the model still sees some signal.
+            text_parts: List[str] = []
+            had_image = False
+            for part in content:
+                if not isinstance(part, dict):
+                    if isinstance(part, str) and part.strip():
+                        text_parts.append(part.strip())
+                    continue
+                ptype = part.get("type")
+                if ptype == "image_url" or ptype == "input_image":
+                    had_image = True
+                    continue
+                if ptype in {"text", "input_text"}:
+                    text = str(part.get("text") or "").strip()
+                    if text:
+                        text_parts.append(text)
+
+            if not had_image:
+                # List-type content but no image parts — leave alone (some
+                # providers reject ANY list content, but stripping a
+                # text-only list doesn't reduce ambiguity; let the caller
+                # surface the original error if this turns out to be the
+                # case).
+                continue
+
+            if text_parts:
+                msg["content"] = "\n\n".join(text_parts)
+            else:
+                msg["content"] = (
+                    "[image content removed — provider does not accept "
+                    "list-type tool message content]"
+                )
+            changed = True
+
+        return changed
+
    def _anthropic_preserve_dots(self) -> bool:
        """True when using an anthropic-compatible endpoint that preserves dots in model names.
        Alibaba/DashScope keeps dots (e.g. qwen3.5-plus).
@@ -47,7 +47,9 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json"
 AUTHOR_MAP = {
    # teknium (multiple emails)
    "teknium1@gmail.com": "teknium1",
+    "cipherframe@users.noreply.github.com": "CipherFrame",
    "me@promplate.dev": "CNSeniorious000",
+    "yichengqiao21@gmail.com": "YarrowQiao",
    "erhanyasarx@gmail.com": "erhnysr",
    "30366221+WorldWriter@users.noreply.github.com": "WorldWriter",
    "dafeng@DafengdeMacBook-Pro.local": "WorldWriter",
@@ -58,13 +60,18 @@ AUTHOR_MAP = {
    "mgongzai@gmail.com": "vKongv",
    "0x.badfriend@gmail.com": "discodirector",
    "altriatree@gmail.com": "TruaShamu",
+    "contact-me@stark-x.cn": "Stark-X",
+    "nat@nthrow.io": "nthrow",
    "m@mobrienv.dev": "mikeyobrien",
    "saeed919@pm.me": "falasi",
+    "chrisdlc119@outlook.com": "chdlc",
    "omar@techdeveloper.site": "nycomar",
    "qiyin.zuo@pcitc.com": "qiyin-code",
    "mr.aashiz@gmail.com": "aashizpoudel",
    "70629228+shaun0927@users.noreply.github.com": "shaun0927",
    "98262967+Bihruze@users.noreply.github.com": "Bihruze",
+    "189280367+Lempkey@users.noreply.github.com": "Lempkey",
+    "leovillalbajr@gmail.com": "Lempkey",
    "nidhi2894@gmail.com": "nidhi-singh02",
    "30312689+aashizpoudel@users.noreply.github.com": "aashizpoudel",
    "oleksii.lisikh@gmail.com": "olisikh",
@@ -928,6 +935,8 @@ AUTHOR_MAP = {
    "holynn@placeholder.local": "holynn-q",
    "agent@hermes.local": "jacdevos",
    "sunsky.lau@gmail.com": "liuhao1024",
+    "fabianoeq@gmail.com": "rodrigoeqnit",
+    "178342791+sgtworkman@users.noreply.github.com": "sgtworkman",
    "qiuqfang98@qq.com": "keepcalmqqf",
    "261867348+ai-ag2026@users.noreply.github.com": "ai-ag2026",
    "yanzh.su@gmail.com": "YanzhongSu",
@@ -3,29 +3,36 @@
 # `pytest` directly to guarantee your local run matches CI behavior.
 #
 # What this script enforces:
-#   * -n 4 xdist workers (CI has 4 cores; -n auto diverges locally)
+#   * Per-file isolation via scripts/run_tests_parallel.py — each test
+#     file runs in its own freshly-spawned `python -m pytest <file>`
+#     subprocess. No xdist, no shared workers, no module-level leakage
+#     between files.
 #   * TZ=UTC, LANG=C.UTF-8, PYTHONHASHSEED=0 (deterministic)
-#   * Credential env vars blanked (conftest.py also does this, but this
-#     is belt-and-suspenders for anyone running `pytest` outside of
-#     our conftest path — e.g. calling pytest on a single file)
-#   * Proper venv activation
+#   * Env vars blanked (conftest.py also does this, but this
+#     is belt-and-suspenders for anyone running pytest outside our
+#     conftest path — e.g. on a single file)
+#   * Proper venv activation (probes .venv, venv, then ~/.hermes/...)
 #
 # Usage:
-#   scripts/run_tests.sh                     # full suite
-#   scripts/run_tests.sh tests/agent/        # one directory
-#   scripts/run_tests.sh tests/agent/test_foo.py::TestClass::test_method
-#   scripts/run_tests.sh --tb=long -v        # pass-through pytest args
+#   scripts/run_tests.sh                            # full suite
+#   scripts/run_tests.sh -j 4                       # cap parallelism
+#   scripts/run_tests.sh tests/agent/               # discover only here
+#   scripts/run_tests.sh tests/agent/ tests/acp/    # multiple roots
+#   scripts/run_tests.sh tests/foo.py               # single file
+#   scripts/run_tests.sh tests/foo.py -- --tb=long  # path + pytest args
+#   scripts/run_tests.sh -- -v --tb=long            # pytest args only
+#
+# Everything after a literal '--' is passed through to each per-file
+# pytest invocation. Positional path arguments before '--' override
+# the default discovery root (tests/).

 set -euo pipefail

 # ── Locate repo root ────────────────────────────────────────────────────────
-# Works whether this is the main checkout or a worktree.
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

 # ── Activate venv ───────────────────────────────────────────────────────────
-# Prefer a .venv in the current tree, fall back to the main checkout's venv
-# (useful for worktrees where we don't always duplicate the venv).
 VENV=""
 for candidate in "$REPO_ROOT/.venv" "$REPO_ROOT/venv" "$HOME/.hermes/hermes-agent/venv"; do
  if [ -f "$candidate/bin/activate" ]; then
@@ -41,94 +48,31 @@ fi

 PYTHON="$VENV/bin/python"

-# ── Ensure pytest-split is installed (required for shard-equivalent runs) ──
-if ! "$PYTHON" -c "import pytest_split" 2>/dev/null; then
-  echo "→ installing pytest-split into $VENV"
-  if command -v uv >/dev/null 2>&1; then
-    uv pip install --python "$PYTHON" --quiet "pytest-split>=0.9,<1"
-  elif "$PYTHON" -m pip --version >/dev/null 2>&1; then
-    "$PYTHON" -m pip install --quiet "pytest-split>=0.9,<1"
-  else
-    echo "error: neither uv nor pip is available in $VENV — pytest-split is missing" >&2
-    echo "  fix: run  uv pip install -e \".[dev]\"  from $REPO_ROOT" >&2
-    exit 1
-  fi
-fi

-# ── Hermetic environment ────────────────────────────────────────────────────
-# Mirror what CI does in .github/workflows/tests.yml + what conftest.py does.
-# Unset every credential-shaped var currently in the environment.
-while IFS='=' read -r name _; do
-  case "$name" in
-    *_API_KEY|*_TOKEN|*_SECRET|*_PASSWORD|*_CREDENTIALS|*_ACCESS_KEY| \
-    *_SECRET_ACCESS_KEY|*_PRIVATE_KEY|*_OAUTH_TOKEN|*_WEBHOOK_SECRET| \
-    *_ENCRYPT_KEY|*_APP_SECRET|*_CLIENT_SECRET|*_CORP_SECRET|*_AES_KEY| \
-    AWS_ACCESS_KEY_ID|AWS_SECRET_ACCESS_KEY|AWS_SESSION_TOKEN|FAL_KEY| \
-    GH_TOKEN|GITHUB_TOKEN)
-      unset "$name"
-      ;;
-  esac
-done < <(env)
-
-# Unset HERMES_* behavioral vars too.
-unset HERMES_YOLO_MODE HERMES_INTERACTIVE HERMES_QUIET HERMES_TOOL_PROGRESS \
-      HERMES_TOOL_PROGRESS_MODE HERMES_MAX_ITERATIONS HERMES_SESSION_PLATFORM \
-      HERMES_SESSION_CHAT_ID HERMES_SESSION_CHAT_NAME HERMES_SESSION_THREAD_ID \
-      HERMES_SESSION_SOURCE HERMES_SESSION_KEY HERMES_GATEWAY_SESSION \
-      HERMES_CRON_SESSION \
-      HERMES_PLATFORM HERMES_INFERENCE_PROVIDER HERMES_MANAGED HERMES_DEV \
-      HERMES_CONTAINER HERMES_EPHEMERAL_SYSTEM_PROMPT HERMES_TIMEZONE \
-      HERMES_REDACT_SECRETS HERMES_BACKGROUND_NOTIFICATIONS HERMES_EXEC_ASK \
-      HERMES_HOME_MODE 2>/dev/null || true
-
-# Pin deterministic runtime.
-export TZ=UTC
-export LANG=C.UTF-8
-export LC_ALL=C.UTF-8
-export PYTHONHASHSEED=0
-
-# ── Live-gateway test guard (developer machines) ────────────────────────────
-# If a system-wide hermes pytest_live_guard plugin is installed at
-# $HOME/.hermes/pytest_live_guard.py, force-load it here so every test run
-# from this script gets the protection regardless of which worktree is
-# checked out (in-tree tests/conftest.py guard may be missing on stale
-# branches). Harmless on CI / fresh machines that don't have the file.
+# ── Live-gateway plugin (computed before we drop env) ───────────────────────
+EXTRA_PYTHONPATH=""
+EXTRA_PYTEST_PLUGINS=""
 if [ -f "$HOME/.hermes/pytest_live_guard.py" ]; then
-  case ":${PYTHONPATH:-}:" in
-    *":$HOME/.hermes:"*) ;;
-    *) export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$HOME/.hermes" ;;
-  esac
-  if [[ ",${PYTEST_PLUGINS:-}," != *,pytest_live_guard,* ]]; then
-    export PYTEST_PLUGINS="${PYTEST_PLUGINS:+$PYTEST_PLUGINS,}pytest_live_guard"
-  fi
+  EXTRA_PYTHONPATH="$HOME/.hermes"
+  EXTRA_PYTEST_PLUGINS="pytest_live_guard"
 fi

-# ── Worker count ────────────────────────────────────────────────────────────
-# CI uses `-n auto` on ubuntu-latest which gives 4 workers. A 20-core
-# workstation with `-n auto` gets 20 workers and exposes test-ordering
-# flakes that CI will never see. Pin to 4 so local matches CI.
-WORKERS="${HERMES_TEST_WORKERS:-4}"

-# ── Run pytest ──────────────────────────────────────────────────────────────
+# ── Run in hermetic env ──────────────────────────────────────────────────────
+# env -i: start with empty environment, opt-in only what we need.
+# No credential var can leak — you'd have to explicitly add it here.
+echo "▶ running per-file parallel test suite via run_tests_parallel.py"
+echo "  (TZ=UTC LANG=C.UTF-8 PYTHONHASHSEED=0; clean env)"
+
 cd "$REPO_ROOT"

-# If the first argument starts with `-` treat all args as pytest flags;
-# otherwise treat them as test paths.
-ARGS=("$@")
-
-echo "▶ running pytest with $WORKERS workers, hermetic env, in $REPO_ROOT"
-echo "  (TZ=UTC LANG=C.UTF-8 PYTHONHASHSEED=0; all credential env vars unset)"
-
-# -o "addopts=" clears pyproject.toml's `-n auto` so our -n wins.
-# We re-add --timeout/--timeout-method here because pyproject.toml's
-# addopts is wiped above. The 60s cap is essential: see pyproject.toml
-# for why (suite deadlocks at session teardown without it).
-exec "$PYTHON" -m pytest \
-  -o "addopts=" \
-  -n "$WORKERS" \
-  --timeout=30 \
-  --timeout-method=signal \
-  --ignore=tests/integration \
-  --ignore=tests/e2e \
-  -m "not integration" \
-  "${ARGS[@]}"
+exec env -i \
+  PATH="$PATH" \
+  HOME="$HOME" \
+  TZ=UTC \
+  LANG=C.UTF-8 \
+  LC_ALL=C.UTF-8 \
+  PYTHONHASHSEED=0 \
+  ${EXTRA_PYTHONPATH:+PYTHONPATH="$EXTRA_PYTHONPATH"} \
+  ${EXTRA_PYTEST_PLUGINS:+PYTEST_PLUGINS="$EXTRA_PYTEST_PLUGINS"} \
+  "$PYTHON" "$SCRIPT_DIR/run_tests_parallel.py" "$@"
@@ -0,0 +1,650 @@
+#!/usr/bin/env python3
+"""Per-file parallel test runner.
+
+The minimum-viable replacement for pytest-xdist + a subprocess-isolation
+plugin. Discovers test files under ``tests/`` (excluding integration/e2e
+unless explicitly requested), then runs one ``python -m pytest <file>``
+subprocess per file, with bounded parallelism (default: ``2 * os.cpu_count()``).
+
+Why per-file rather than per-test?
+    Per-test spawn overhead (~250ms × 17k tests = 70min CPU minimum)
+    swamped the actual work. Per-file spawn (~250ms × ~850 files = ~3.5min)
+    fits in the budget while still giving every file a fresh Python
+    interpreter — the only isolation boundary that actually matters
+    (cross-file module-level state leakage was the original flake source;
+    intra-file state is the test author's responsibility).
+
+Why drop xdist entirely?
+    xdist's persistent workers accumulate state across files, which is
+    exactly the leakage we wanted to fix. xdist also adds complexity
+    (loadfile vs loadscope, --max-worker-restart, internal control plane)
+    that we don't need when the unit of work is "run pytest on one file".
+    A subprocess.Popen pool gated by a semaphore is ~60 lines and does
+    the job.
+
+Usage:
+    python scripts/run_tests_parallel.py [-j N] [PATH ...] [-- pytest_args...]
+
+    Everything after a literal ``--`` is passed to each per-file pytest run
+    (e.g. ``-v``, ``-x``, ``--tb=long``, ``-k 'pattern'``, ``--lf``).
+
+Environment:
+    HERMES_TEST_WORKERS  Override worker count (default: 2 * os.cpu_count())
+    HERMES_TEST_PATHS    Override discovery roots (colon-sep, default: 'tests')
+
+Exit code: 0 if every file's pytest exited 0; 1 otherwise.
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import subprocess
+import sys
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor, Future
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+
+# Default test discovery roots.
+_DEFAULT_ROOTS = ["tests"]
+
+# Directories to skip during discovery — the e2e + integration suites
+# require real services and are run separately. Match exactly the
+# ``--ignore=`` flags the previous CI command used.
+_SKIP_PARTS = {"integration", "e2e"}
+
+# Per-file wall-clock cap. Generous default — pytest-timeout still
+# enforces per-test caps inside each subprocess; this is just an outer
+# safety net so a single hung file can't stall the whole suite. Override
+# via --file-timeout or HERMES_TEST_FILE_TIMEOUT.
+_DEFAULT_FILE_TIMEOUT_SECONDS = 600.0  # 10 minutes
+
+
+def _count_tests(
+    files: List[Path], repo_root: Path, pytest_passthrough: List[str]
+) -> dict[Path, int]:
+    """Run ``pytest --co -q`` once to count individual tests per file.
+
+    Returns a mapping ``{repo_root / file: test_count}``. Files with zero
+    collected tests are omitted (not an error — e.g. fixture-only files).
+    An empty mapping is returned if the collection run fails to start or
+    exceeds its 120-second timeout.
+
+    This is a single subprocess call (~2-5s for ~1k files) that gives
+    us the total test count for the discovery announcement and
+    per-file counts for the progress lines.
+
+    ``--ignore`` flags for directories in ``_SKIP_PARTS`` are added
+    automatically so that pytest's own collection machinery (conftest
+    walking, directory traversal) doesn't pull in tests we intend to
+    skip — matching what the per-file runs will actually execute.
+    """
+    # Build --ignore flags for skipped dirs so the --co collection mirrors
+    # what we'll actually run (not what pytest might otherwise traverse).
+    ignore_args: List[str] = []
+    for root in [repo_root / p for p in _DEFAULT_ROOTS]:
+        for part in _SKIP_PARTS:
+            d = root / part
+            if d.is_dir():
+                ignore_args.extend(["--ignore", str(d)])
+
+    cmd = [
+        sys.executable, "-m", "pytest",
+        "--co", "-q",
+        *ignore_args,
+        *[str(f) for f in files],
+        *pytest_passthrough,
+    ]
+    try:
+        result = subprocess.run(
+            cmd,
+            cwd=repo_root,
+            capture_output=True,
+            text=True,
+            timeout=120,
+        )
+    except (subprocess.TimeoutExpired, OSError):
+        return {}
+
+    counts: dict[Path, int] = {}
+    for line in result.stdout.splitlines():
+        # Lines look like: tests/acp/test_auth.py::TestClass::test_name
+        if "::" not in line:
+            continue
+        file_part = line.split("::", 1)[0]
+        key = repo_root / file_part
+        counts[key] = counts.get(key, 0) + 1
+
+    return counts
+
+
+def _discover_files(roots: List[Path]) -> List[Path]:
+    """Return every ``test_*.py`` under the given roots (sorted).
+
+    Roots may be directories (recursed for ``test_*.py``) or explicit
+    ``.py`` files (included as-is, even if they don't match the
+    ``test_*`` prefix — caller knows what they want).
+
+    Exclude any file with a path component in ``_SKIP_PARTS`` *below its
+    root*; anything the user names as a root, or that lives above it
+    (e.g. a checkout under ``~/e2e/``), never triggers the skip filter.
+    """
+    seen: set[Path] = set()
+    out: List[Path] = []
+    for root in roots:
+        if not root.exists():
+            continue
+        if root.is_file():
+            # Explicit file: the user named it, so bypass the skip filter.
+            real = root.resolve()
+            if real not in seen:
+                seen.add(real)
+                out.append(root)
+            continue
+        for path in root.rglob("test_*.py"):
+            # Match skip names only below the root, not in the checkout path.
+            if any(part in _SKIP_PARTS for part in path.relative_to(root).parts):
+                continue
+            real = path.resolve()
+            if real in seen:
+                continue
+            seen.add(real)
+            out.append(path)
+    return sorted(out)
+
+
+def _kill_tree(proc: "subprocess.Popen", pgid: int | None = None) -> None:
+    """Kill the pytest subprocess and every descendant it spawned.
+
+    A test run can spin up uvicorn servers, async runtimes, or other
+    long-running grandchildren that survive the pytest subprocess exit
+    if we don't kill the whole tree. ``subprocess.Popen.kill()`` only
+    targets the immediate child; grandchildren are orphaned (reparented
+    to PID 1 on Linux) and leak.
+
+    POSIX: the caller must pass ``pgid`` — the process group id captured
+    immediately after Popen (via ``os.getpgid(proc.pid)``). We can't
+    look it up here in the happy path because by the time we get
+    called the leader process has already been reaped and its pid is
+    gone from the kernel's process table, even though descendants in
+    the group are still alive. SIGKILL'ing the captured pgid takes out
+    everything in that group atomically.
+
+    Windows: ``taskkill /F /T /PID`` walks the recorded ppid chain and
+    terminates the whole tree, even when the root has already exited.
+
+    Why not psutil: psutil walks the parent-child tree, but in the
+    happy path the root has already been reaped so ``psutil.Process(pid)``
+    can't find it; grandchildren reparented to PID 1 are also
+    unreachable by tree walk at that point. The platform-native
+    primitives (process groups / taskkill) handle both cases correctly
+    without an extra abstraction layer.
+    """
+    if proc.pid is None:
+        return
+
+    if sys.platform == "win32":
+        try:
+            # /F forces termination; /T also ends child processes of the PID.
+            subprocess.run(
+                ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                timeout=10,
+            )  # windows-footgun: ok
+        except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
+            pass
+    else:
+        # POSIX: kill the captured pgid. Local-import signal so the
+        # SIGKILL attribute is never referenced on Windows.
+        if pgid is not None:
+            try:
+                import signal as _signal
+                os.killpg(pgid, _signal.SIGKILL)  # windows-footgun: ok
+            except (ProcessLookupError, PermissionError, OSError):
+                pass
+
+    # Belt-and-suspenders: ensure subprocess.communicate() sees the exit.
+    try:
+        proc.kill()
+    except (ProcessLookupError, OSError):
+        pass
+
+
+def _run_one_file(
+    file: Path,
+    pytest_args: List[str],
+    repo_root: Path,
+    file_timeout: float,
+) -> Tuple[Path, int, str, dict[str, int]]:
+    """Run ``python -m pytest <file> <pytest_args>`` in a fresh subprocess.
+
+    Returns (file, returncode, captured_combined_output, summary_counts).
+
+    ``summary_counts`` is ``_parse_pytest_summary(output)`` (per-outcome test counts).
+
+    pytest exit codes (https://docs.pytest.org/en/stable/reference/exit-codes.html):
+        0 = all tests passed
+        1 = some tests failed
+        2 = test execution interrupted
+        3 = internal error
+        4 = pytest CLI usage error
+        5 = no tests collected
+
+    We treat exit 5 as a pass: it just means no test in the file was
+    selected (e.g. ``-m 'not integration'`` deselects everything in a
+    file where every test is marked integration). That's intentional and
+    not a failure mode.
+
+    On per-file timeout (``file_timeout`` seconds) or any other exception
+    during ``communicate()``, we kill the whole process group / process
+    tree so grandchildren (uvicorn servers, async runtimes, etc.) do not
+    orphan onto PID 1. The pytest-timeout plugin enforces per-test
+    timeouts inside the subprocess; this outer timeout exists only to
+    bound a pathologically slow or hung file as a whole.
+    """
+    cmd = [sys.executable, "-m", "pytest", str(file), *pytest_args]
+    proc = subprocess.Popen(
+        cmd,
+        cwd=repo_root,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        # POSIX: place the child at the head of its own process group so
+        # _kill_tree can SIGKILL the group atomically.
+        # Windows: start_new_session is POSIX-only and has no effect there;
+        # _kill_tree handles the Windows path via taskkill /F /T.
+        start_new_session=True,
+    )
+
+    # Capture the pgid NOW, before the leader can exit and be reaped.
+    # Once the leader is reaped, os.getpgid(proc.pid) raises
+    # ProcessLookupError even though grandchildren in that group are
+    # still alive — defeating the whole cleanup. None on Windows where
+    # the pgid concept doesn't apply (taskkill walks ppid chain instead).
+    pgid: int | None = None
+    if sys.platform != "win32":
+        try:
+            pgid = os.getpgid(proc.pid)
+        except (ProcessLookupError, PermissionError):
+            # Astonishingly fast child? Already dead. _kill_tree's
+            # fallback will handle this case as a no-op.
+            pgid = None
+
+    try:
+        output, _ = proc.communicate(timeout=file_timeout)
+        rc = proc.returncode
+    except subprocess.TimeoutExpired:
+        _kill_tree(proc, pgid=pgid)
+        # Drain whatever the child wrote before we killed it so we have
+        # something to surface in the failure dump.
+        try:
+            output, _ = proc.communicate(timeout=10)
+        except subprocess.TimeoutExpired:
+            output = "(file timeout exceeded; output unavailable)"
+        rc = 124  # de facto convention for "killed by timeout".
+        output = (
+            f"(per-file timeout: {file_timeout:.0f}s exceeded; "
+            f"process tree SIGKILL'd)\n{output}"
+        )
+    except BaseException:
+        # KeyboardInterrupt / runner crash — make sure no zombie
+        # grandchildren outlive us.
+        _kill_tree(proc, pgid=pgid)
+        raise
+    else:
+        # Happy path: pytest exited on its own. The child process already
+        # cleaned up its grandchildren if it's well-behaved, but
+        # well-behaved is not universal — kill the group anyway. Already-
+        # dead processes are a no-op.
+        _kill_tree(proc, pgid=pgid)
+
+    if rc == 5:
+        # No tests collected — every test in the file was filtered out.
+        # Treat as a pass by normalizing the exit code to 0; nothing else
+        # distinguishes this case for the caller.
+        rc = 0
+    summary = _parse_pytest_summary(output)
+    return file, rc, output, summary
+
+
+def _parse_pytest_summary(output: str) -> dict[str, int]:
+    """Extract per-file test pass/fail/skip counts from pytest output.
+
+    pytest prints a summary line like ``12 passed, 3 skipped, 1 failed in 2.1s``
+    as the last line of its report, after the short test summary.  We scrape
+    that line for the individual counts so the progress display can show
+    test-level granularity instead of just file-level pass/fail.
+
+    Returns a dict with keys ``passed``, ``failed``, ``skipped``, ``errors``,
+    ``xfailed``, ``xpassed`` (only keys found in the output are present).
+    """
+    import re
+
+    result: dict[str, int] = {}
+    # Walk backwards from the end — the summary line is always near the tail.
+    for line in reversed(output.splitlines()):
+        line = line.strip()
+        if not line:
+            continue
+        # Match "N passed", "N failed", "N skipped", "N errors", "N xfailed", "N xpassed"
+        for m in re.finditer(r"(\d+)\s+(passed|failed|skipped|errors|xfailed|xpassed)\b", line):
+            result[m.group(2)] = int(m.group(1))
+        # Also match "N error" (singular); a plural count on the line wins.
+        for m in re.finditer(r"(\d+)\s+error\b", line):
+            result.setdefault("errors", int(m.group(1)))
+        if result:
+            # Found the counts line — done.
+            break
+        # Stop at a FAILED entry or the "=== short test summary info ==="
+        # header — everything above is test output, not the counts line.
+        if line.startswith("FAILED") or "short test summary info" in line:
+            break
+    return result
+
+
+def _format_file(file: Path, repo_root: Path) -> str:
+    """Return ``file`` as a display string relative to ``repo_root``.
+
+    Both paths are resolved first, so ``tests/acp/test_auth.py`` is shown
+    rather than the full checkout path. A file outside the repo root is
+    returned as ``str(file)``, exactly as it was passed in.
+    """
+    resolved_file, resolved_root = file.resolve(), repo_root.resolve()
+    if resolved_file.is_relative_to(resolved_root):
+        return str(resolved_file.relative_to(resolved_root))
+    return str(file)
+
+
+def _print_progress(
+    tests_done: int,
+    total_tests: int,
+    file: Path,
+    rc: int,
+    dur: float,
+    repo_root: Path,
+    tests_passed: int,
+    tests_failed: int,
+    test_counts: dict[Path, int],
+    file_summary: dict[str, int] | None = None,
+) -> None:
+    """Single-line live progress.
+
+    When ``file_summary`` is provided (parsed from pytest output), the
+    per-file parenthetical shows individual test pass/fail counts instead
+    of just the total test count.
+    """
+    status = "✓" if rc == 0 else "✗"
+    pct = (tests_done / total_tests * 100) if total_tests else 0
+    # Stable digit width for the ✓/✗ counters: sized from the total test count.
+    fw = len(str(max(total_tests, tests_passed + tests_failed)))
+    # Build per-file test count string.
+    if file_summary:
+        parts = []
+        p = file_summary.get("passed", 0)
+        f = file_summary.get("failed", 0)
+        s = file_summary.get("skipped", 0)
+        e = file_summary.get("errors", 0)
+        if p:
+            parts.append(f"{p}✓")
+        if f:
+            parts.append(f"{f}✗")
+        if s:
+            parts.append(f"{s}s")
+        if e:
+            parts.append(f"{e}e")
+        # xfailed/xpassed are rare; include if present.
+        xf = file_summary.get("xfailed", 0)
+        xp = file_summary.get("xpassed", 0)
+        if xf:
+            parts.append(f"{xf}xf")
+        if xp:
+            parts.append(f"{xp}xp")
+        test_str = " ".join(parts) + ", " if parts else ""
+    else:
+        n_tests = test_counts.get(file, 0)
+        test_str = f"{n_tests} tests, " if n_tests else ""
+    msg = (
+        f"[{pct:5.1f}% | {tests_done:>5}/{total_tests}"
+        f" | ✓{tests_passed:>{fw}} | ✗{tests_failed:>{fw}}] "
+        f"{status} {_format_file(file, repo_root)} ({test_str}{dur:.1f}s)"
+    )
+    # Truncate to terminal width if available (no clobbering ANSI lines).
+    try:
+        cols = os.get_terminal_size().columns
+        if len(msg) > cols:
+            msg = msg[: cols - 1] + "…"
+    except OSError:
+        pass
+    print(msg, flush=True)
+
+
+def _print_inline_failure(
+    file: Path, output: str, repo_root: Path, pytest_passthrough: List[str]
+) -> None:
+    """Print a compact failure summary immediately when a file fails.
+
+    Shows the tail of the pytest output (the failure section with stack
+    traces) and a ready-to-run repro command, so the developer doesn't
+    have to wait for the full run to finish before seeing what broke.
+    """
+    import shlex
+
+    rel = _format_file(file, repo_root)
+    # Shell-quote every token so passthrough args such as -k 'a or b'
+    # survive a copy-paste of the repro command.
+    repro = shlex.join(["python", "-m", "pytest", rel, *pytest_passthrough])
+
+    # Grab just the failure lines (last ~30 lines of pytest output —
+    # typically the FAILED summary + short test info).
+    lines = output.rstrip().splitlines()
+    tail = "\n".join(lines[-30:])
+
+    print(flush=True)
+    print(f"  ╔╍ Failed: {rel} ╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍", flush=True)
+    for line in tail.splitlines():
+        print(f"  ║ {line}", flush=True)
+    print(f"  ║", flush=True)
+    print(f"  ║  Repro: {repro}", flush=True)
+    print(f"  ╚╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍", flush=True)
+    print(flush=True)
+
+
+def main() -> int:
+    """Discover test files, run them in parallel, print a report; return the exit code."""
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "-j",
+        "--jobs",
+        type=int,
+        default=int(os.environ.get("HERMES_TEST_WORKERS") or (os.cpu_count() or 4) * 2),
+        help="Parallel worker count (default: $HERMES_TEST_WORKERS or cpu_count*2)",
+    )
+    parser.add_argument(
+        "--paths",
+        default=os.environ.get("HERMES_TEST_PATHS", ":".join(_DEFAULT_ROOTS)),
+        help="Colon-separated discovery roots (default: 'tests')",
+    )
+    parser.add_argument(
+        "--include-integration",
+        action="store_true",
+        help="Don't skip integration/ e2e/ during discovery",
+    )
+    parser.add_argument(
+        "--file-timeout",
+        type=float,
+        default=float(
+            os.environ.get("HERMES_TEST_FILE_TIMEOUT", _DEFAULT_FILE_TIMEOUT_SECONDS)
+        ),
+        help=(
+            "Per-file wall-clock cap in seconds. On timeout, the pytest "
+            "subprocess and its full process tree are SIGKILL'd. "
+            "Default: 600 (10 min), env: HERMES_TEST_FILE_TIMEOUT."
+        ),
+    )
+    parser.add_argument(
+        "paths_positional",
+        nargs="*",
+        metavar="PATH",
+        help=(
+            "Restrict discovery to these paths (directories or .py files). "
+            "Takes precedence over --paths. Anything after a literal '--' "
+            "separator is passed through to each per-file pytest invocation."
+        ),
+    )
+    # Manually split argv on '--' so positional paths and pytest passthrough
+    # args don't fight over each other. argparse's nargs="*" positional is
+    # greedy and will swallow everything after '--' including the pytest
+    # flags, defeating the convention.
+    argv = sys.argv[1:]
+    if "--" in argv:
+        sep = argv.index("--")
+        our_args, pytest_passthrough = argv[:sep], argv[sep + 1 :]
+    else:
+        our_args, pytest_passthrough = argv, []
+    args = parser.parse_args(our_args)
+
+    repo_root = Path(__file__).resolve().parent.parent
+
+    # Resolve discovery roots: positional path args override --paths if any
+    # were supplied, otherwise --paths (which itself defaults to 'tests').
+    if args.paths_positional:
+        # Positionals can be directories OR explicit .py files. Either is
+        # fine — _discover_files handles both via rglob('test_*.py') for
+        # dirs and direct inclusion for files.
+        roots = [repo_root / p for p in args.paths_positional]
+    else:
+        roots = [repo_root / p for p in args.paths.split(":") if p]
+
+    if args.include_integration:
+        # Caller takes responsibility — typically used via explicit -k filter.
+        global _SKIP_PARTS  # noqa: PLW0603 — config knob
+        _SKIP_PARTS = set()
+
+    files = _discover_files(roots)
+    if not files:
+        print(f"No test files discovered under {[str(r) for r in roots]}", file=sys.stderr)
+        return 1
+
+    # Count individual tests per file via a single pytest --co pass.
+    test_counts = _count_tests(files, repo_root, pytest_passthrough)
+    total_tests = sum(test_counts.values())
+
+    print(
+        f"Discovered {len(files)} test files ({total_tests} tests) under "
+        f"{[str(r.relative_to(repo_root)) if r.is_relative_to(repo_root) else str(r) for r in roots]}; "
+        f"running with -j {args.jobs}",
+        flush=True,
+    )
+
+    # Output is captured per file and printed on completion (order may vary).
+    failures: List[Tuple[Path, str, Dict[str, int]]] = []
+    started = time.monotonic()
+    files_done = 0
+    tests_done = 0
+    pass_count = 0
+    fail_count = 0
+    tests_passed = 0
+    tests_failed = 0
+    lock = threading.Lock()
+
+    def _on_done(file: Path, started_at: float, fut: "Future[Tuple[Path, int, str, dict[str, int]]]") -> None:
+        nonlocal files_done, tests_done, pass_count, fail_count, tests_passed, tests_failed
+        n_tests = test_counts.get(file, 0)
+        try:
+            fpath, rc, output, summary = fut.result()
+        except Exception as exc:  # noqa: BLE001 — must always advance counter
+            with lock:
+                files_done += 1
+                tests_done += n_tests
+                fail_count += 1
+                failures.append((file, f"runner crashed: {exc!r}", {}))
+                _print_progress(
+                    tests_done, total_tests, file, 1,
+                    time.monotonic() - started_at,
+                    repo_root, tests_passed, tests_failed,
+                    test_counts,
+                )
+            return
+        with lock:
+            files_done += 1
+            tests_done += n_tests
+            # Accumulate test-level counts from parsed summary.
+            tests_passed += summary.get("passed", 0)
+            tests_failed += summary.get("failed", 0)
+            if rc == 0:
+                pass_count += 1
+            else:
+                fail_count += 1
+                failures.append((fpath, output, summary))
+            _print_progress(
+                tests_done, total_tests, fpath, rc,
+                time.monotonic() - started_at,
+                repo_root, tests_passed, tests_failed,
+                test_counts,
+                file_summary=summary,
+            )
+            if rc != 0:
+                _print_inline_failure(fpath, output, repo_root, pytest_passthrough)
+
+    run_started: Dict[Path, float] = {}  # stamped in-worker: excludes pool queue wait
+
+    def _timed_run(file: Path) -> Tuple[Path, int, str, dict[str, int]]:
+        run_started[file] = time.monotonic()
+        return _run_one_file(file, pytest_passthrough, repo_root, args.file_timeout)
+
+    with ThreadPoolExecutor(max_workers=args.jobs) as pool:
+        futures: List[Future] = []
+        for file in files:
+            fut = pool.submit(_timed_run, file)
+            fut.add_done_callback(lambda f, file=file: _on_done(file, run_started.get(file, started), f))
+            futures.append(fut)
+        for fut in futures:  # exception() blocks until done without re-raising worker errors
+            fut.exception()
+
+    elapsed = time.monotonic() - started
+    print()
+    pct = (tests_done / total_tests * 100) if total_tests else 0
+    print(f"=== Summary: {len(files)} files, {tests_passed} tests passed, {tests_failed} failed ({pct:.0f}% complete) in {elapsed:.1f}s ({args.jobs} workers) ===")
+
+    if failures:
+        print()
+        print("=== Failure output ===")
+        for file, output, _summary in failures:
+            print()
+            print(f"--- {_format_file(file, repo_root)} ---")
+            print(output.rstrip())
+        print()
+        # Split: files with actual test failures vs non-zero exit for other reasons
+        test_fail_files = [(f, s) for f, _o, s in failures if s.get("failed", 0) > 0]
+        all_passed_but_nonzero = [(f, s) for f, _o, s in failures
+                                  if s.get("failed", 0) == 0 and s.get("passed", 0) > 0]
+        no_tests_ran = [(f, s) for f, _o, s in failures
+                        if s.get("failed", 0) == 0 and s.get("passed", 0) == 0]
+        if test_fail_files:
+            total_tf = sum(s.get("failed", 0) for _, s in test_fail_files)
+            print(f"=== {len(test_fail_files)} file{'s' if len(test_fail_files) != 1 else ''} with test failures ({total_tf} test{'s' if total_tf != 1 else ''} failed) ===")
+            for file, s in test_fail_files:
+                nf = s.get("failed", 0)
+                print(f"  {_format_file(file, repo_root)}  ({nf} test{'s' if nf != 1 else ''} failed)")
+        if all_passed_but_nonzero:
+            print(f"=== {len(all_passed_but_nonzero)} file{'s' if len(all_passed_but_nonzero) != 1 else ''} where all tests passed but pytest exited non-zero (warnings-as-errors, hook failures, etc.) ===")
+            for file, s in all_passed_but_nonzero:
+                print(f"  {_format_file(file, repo_root)}  ({s.get('passed', 0)} passed)")
+        if no_tests_ran:
+            print(f"=== {len(no_tests_ran)} file{'s' if len(no_tests_ran) != 1 else ''} where no tests ran (collection/import error, timeout before collection, etc.) ===")
+            for file, s in no_tests_ran:
+                print(f"  {_format_file(file, repo_root)}")
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -40,6 +40,16 @@ def _clean_env(monkeypatch):
        "ANTHROPIC_API_KEY", "ANTHROPIC_TOKEN", "CLAUDE_CODE_OAUTH_TOKEN",
    ):
        monkeypatch.delenv(key, raising=False)
+    # Module-level unhealthy cache (10-min TTL) leaks between tests;
+    # earlier tests that call _mark_provider_unhealthy() poison the
+    # cache for later ones, causing _resolve_auto to skip providers
+    # that the test patched to return valid clients.
+    import agent.auxiliary_client as _aux_mod
+    _aux_mod._aux_unhealthy_until.clear()
+    _aux_mod._aux_unhealthy_logged_at.clear()
+    yield
+    _aux_mod._aux_unhealthy_until.clear()
+    _aux_mod._aux_unhealthy_logged_at.clear()


@pytest.fixture
@@ -461,6 +471,17 @@ class TestExpiredCodexFallback:
        import base64
        import time as _time

+        # Belt-and-suspenders: _try_openrouter marks openrouter unhealthy
+        # when OPENROUTER_API_KEY is absent (which the preceding test in
+        # this class exercises).  The file-level _clean_env autouse fixture
+        # clears the cache, but fixture ordering with the conftest
+        # _hermetic_environment autouse can leave a narrow window where
+        # the mark reappears.  Explicitly clear here so this test is
+        # independent of run order.
+        import agent.auxiliary_client as _aux_mod
+        _aux_mod._aux_unhealthy_until.clear()
+        _aux_mod._aux_unhealthy_logged_at.clear()
+
        header = base64.urlsafe_b64encode(b'{"alg":"RS256","typ":"JWT"}').rstrip(b"=").decode()
        payload_data = json.dumps({"exp": int(_time.time()) - 3600}).encode()
        payload = base64.urlsafe_b64encode(payload_data).rstrip(b"=").decode()
@@ -1047,6 +1068,20 @@ class TestGetProviderChain:
 class TestTryPaymentFallback:
    """_try_payment_fallback skips the failed provider and tries alternatives."""

+    @pytest.fixture(autouse=True)
+    def _clear_unhealthy_cache(self):
+        """Reset the auxiliary-client unhealthy-provider cache around each test.
+
+        Earlier tests in this file call _mark_provider_unhealthy() which
+        pollutes the module-level ``_aux_unhealthy_until`` dict (10-min TTL).
+        Without this cleanup the fallback chain skips providers we've patched
+        to return valid clients — the patched function is never called.
+
+        The dicts are reached through the module on every access (instead of
+        being bound once via ``from ... import``) so the reset still hits the
+        live objects if the module ever rebinds those names; this is also how
+        the other cache resets in this file are written.
+        """
+        import agent.auxiliary_client as _aux_mod
+        # Clear before the test so marks left by earlier tests cannot leak in.
+        _aux_mod._aux_unhealthy_until.clear()
+        _aux_mod._aux_unhealthy_logged_at.clear()
+        yield
+        # Clear again afterwards so this test's own marks do not leak out.
+        _aux_mod._aux_unhealthy_until.clear()
+        _aux_mod._aux_unhealthy_logged_at.clear()
+
    def test_skips_failed_provider(self):
        mock_client = MagicMock()
        with patch("agent.auxiliary_client._try_openrouter", return_value=(None, None)), \
@@ -0,0 +1,93 @@
+from types import SimpleNamespace
+
+from agent.agent_init import _merge_custom_provider_extra_body
+
+
+def test_custom_provider_extra_body_merges_into_request_overrides():
+    """A matching provider entry's ``extra_body`` lands next to existing overrides.
+
+    The entry's ``base_url`` ends with a slash while the agent's does not,
+    so the merge is expected to happen despite that difference.
+    """
+    provider_entry = {
+        "name": "gemma",
+        "base_url": "https://example.test/v1/",
+        "model": "google/gemma-4-31b-it",
+        "extra_body": {"enable_thinking": True, "reasoning_effort": "high"},
+    }
+    agent = SimpleNamespace(
+        provider="custom",
+        model="google/gemma-4-31b-it",
+        base_url="https://example.test/v1",
+        request_overrides={"service_tier": "priority"},
+    )
+
+    _merge_custom_provider_extra_body(agent, [provider_entry])
+
+    # The pre-existing override must survive; extra_body is added beside it.
+    expected_overrides = {
+        "service_tier": "priority",
+        "extra_body": {"enable_thinking": True, "reasoning_effort": "high"},
+    }
+    assert agent.request_overrides == expected_overrides
+
+
+def test_custom_provider_extra_body_preserves_caller_override():
+    """Keys the caller already set in ``extra_body`` win over the provider entry.
+
+    Provider-only keys are still added, and caller-only keys are kept.
+    """
+    caller_extra_body = {"reasoning_effort": "low", "caller_only": True}
+    agent = SimpleNamespace(
+        provider="custom",
+        model="google/gemma-4-31b-it",
+        base_url="https://example.test/v1",
+        request_overrides={"extra_body": caller_extra_body},
+    )
+    provider_entry = {
+        "name": "gemma",
+        "base_url": "https://example.test/v1",
+        "model": "google/gemma-4-31b-it",
+        "extra_body": {"enable_thinking": True, "reasoning_effort": "high"},
+    }
+
+    _merge_custom_provider_extra_body(agent, [provider_entry])
+
+    merged = agent.request_overrides["extra_body"]
+    assert merged == {
+        "enable_thinking": True,  # contributed by the provider entry
+        "reasoning_effort": "low",  # caller's value beats the entry's "high"
+        "caller_only": True,  # untouched caller key
+    }
+
+
+def test_custom_provider_extra_body_ignores_other_custom_models():
+    """An entry for a different model leaves ``request_overrides`` untouched.
+
+    The base URLs are identical; only the model name differs.
+    """
+    unrelated_entry = {
+        "name": "gemma",
+        "base_url": "https://example.test/v1",
+        "model": "google/gemma-4-31b-it",
+        "extra_body": {"enable_thinking": True},
+    }
+    agent = SimpleNamespace(
+        provider="custom",
+        model="other-model",
+        base_url="https://example.test/v1",
+        request_overrides={},
+    )
+
+    _merge_custom_provider_extra_body(agent, [unrelated_entry])
+
+    assert agent.request_overrides == {}
@@ -56,6 +56,7 @@ class TestFailoverReason:
            "overloaded", "server_error", "timeout",
            "context_overflow", "payload_too_large", "image_too_large",
            "model_not_found", "format_error",
+            "multimodal_tool_content_unsupported",
            "provider_policy_blocked",
            "thinking_signature", "long_context_tier",
            "oauth_long_context_beta_forbidden",
@@ -1256,3 +1257,66 @@ class TestRateLimitErrorWithoutStatusCode:
        e.status_code = None
        result = classify_api_error(e, provider="copilot", model="gpt-4o")
        assert result.reason != FailoverReason.rate_limit
+
+
+
+# ── Test: multimodal_tool_content_unsupported pattern ───────────────────
+
+class TestMultimodalToolContentUnsupported:
+    """Classification of "tool content must be a string" rejections (issue #27344).
+
+    Providers that reject list-type tool message content should be
+    classified as ``multimodal_tool_content_unsupported`` so the retry loop
+    can downgrade screenshots to text and try again.
+    """
+
+    def test_xiaomi_mimo_text_is_not_set_pattern(self):
+        """The actual Xiaomi MiMo 400 wording from the bug report."""
+        wording = "Error code: 400 - {'error': {'code': '400', 'message': 'Param Incorrect', 'param': 'text is not set', 'type': ''}}"
+        verdict = classify_api_error(
+            MockAPIError(wording, status_code=400),
+            provider="xiaomi",
+            model="mimo-v2.5",
+        )
+        assert verdict.reason == FailoverReason.multimodal_tool_content_unsupported
+        assert verdict.retryable is True
+
+    def test_generic_tool_message_must_be_string(self):
+        """A plain "must be a string" complaint about tool message content."""
+        err = MockAPIError("tool message content must be a string", status_code=400)
+        verdict = classify_api_error(err, provider="custom", model="some-model")
+        assert verdict.reason == FailoverReason.multimodal_tool_content_unsupported
+
+    def test_expected_string_got_list(self):
+        """Schema-validator style wording: expected string, got list."""
+        err = MockAPIError(
+            "Schema validation failed: expected string, got list", status_code=400
+        )
+        verdict = classify_api_error(err, provider="custom", model="some-model")
+        assert verdict.reason == FailoverReason.multimodal_tool_content_unsupported
+
+    def test_multimodal_tool_content_takes_priority_over_context_overflow(self):
+        """Some providers return a 400 whose message contains BOTH
+        'text is not set' and a length-shaped phrase; the tool-content
+        recovery is cheaper than compression so it must win the priority.
+        """
+        err = MockAPIError("text is not set; context length exceeded", status_code=400)
+        verdict = classify_api_error(err, provider="xiaomi", model="mimo-v2.5")
+        assert verdict.reason == FailoverReason.multimodal_tool_content_unsupported
+
+    def test_no_status_code_path_also_classifies(self):
+        """When the error reaches us without a status code (transport
+        layer ate it) the message-only classifier branch must also
+        recognise the pattern.
+        """
+        err = MockTransportError("tool_call.content must be string")
+        verdict = classify_api_error(err, provider="alibaba", model="qwen3.5-plus")
+        assert verdict.reason == FailoverReason.multimodal_tool_content_unsupported
+
+    def test_unrelated_400_is_not_misclassified(self):
+        """Make sure the patterns don't false-positive on normal 400s."""
+        err = MockAPIError("bad request: missing field 'model'", status_code=400)
+        verdict = classify_api_error(
+            err, provider="openrouter", model="anthropic/claude-sonnet-4"
+        )
+        assert verdict.reason != FailoverReason.multimodal_tool_content_unsupported
@@ -1060,3 +1060,191 @@ class TestHonchoCadenceTracking:
        p.on_turn_start(2, "second message")
        should_skip = p._injection_frequency == "first-turn" and p._turn_count > 1
        assert should_skip, "Second turn (turn 2) SHOULD be skipped"
+
+
+class TestMemoryToolToolsetGate:
+    """Issue #5544: memory provider tools must respect platform_toolsets.
+
+    Before the fix, MemoryManager.get_all_tool_schemas() output was appended
+    to AIAgent.tools unconditionally in agent_init.py — bypassing the
+    enabled_toolsets filter. Result: `platform_toolsets: telegram: []`
+    still leaked fact_store and other memory tools into the tool surface,
+    causing 10x latency on local models (Qwen3-30B: 1.7s → 42s) and
+    tool-call loops on small models.
+
+    These tests mirror the gate logic in agent/agent_init.py around the
+    memory provider tool injection block. The gate condition is:
+
+        enabled_toolsets is None        → no filter, inject (backward compat)
+        "memory" in enabled_toolsets    → user opted in, inject
+        otherwise (incl. [])            → skip injection
+
+    Every test checks both outputs of the simulated block (the tool list
+    and the valid-name set) so a gate that suppresses one but leaks the
+    other cannot pass unnoticed.
+    """
+
+    @staticmethod
+    def _run_memory_injection(enabled_toolsets, memory_manager):
+        """Simulate the gated memory-tool injection block from agent_init.py.
+
+        Args:
+            enabled_toolsets: ``None`` (no filter) or a collection of toolset
+                names; injection happens only for ``None`` or when it
+                contains ``"memory"``.
+            memory_manager: object exposing ``get_all_tool_schemas()``, or a
+                falsy value to model "no memory manager configured".
+
+        Returns:
+            ``(tools, valid_tool_names)``: the function-wrapped schemas that
+            were injected and the set of their non-empty names.
+        """
+        tools = []
+        valid_tool_names = set()
+
+        # ``tools is not None`` is always true for the fresh list above; it
+        # is kept so the condition reads like the block being simulated.
+        if memory_manager and tools is not None and (
+            enabled_toolsets is None or "memory" in enabled_toolsets
+        ):
+            _existing = {
+                t.get("function", {}).get("name")
+                for t in tools
+                if isinstance(t, dict)
+            }
+            for _schema in memory_manager.get_all_tool_schemas():
+                _tname = _schema.get("name", "")
+                # Skip schemas whose name is already present in the tool list.
+                if _tname and _tname in _existing:
+                    continue
+                tools.append({"type": "function", "function": _schema})
+                if _tname:
+                    valid_tool_names.add(_tname)
+                    _existing.add(_tname)
+
+        return tools, valid_tool_names
+
+    def _mgr_with_tools(self, *tool_names):
+        """Build a MemoryManager whose providers expose the named tool schemas."""
+        mgr = MemoryManager()
+        p = FakeMemoryProvider(
+            "ext",
+            tools=[{"name": n, "description": n, "parameters": {}} for n in tool_names],
+        )
+        mgr.add_provider(p)
+        return mgr
+
+    def test_none_toolsets_injects(self):
+        """enabled_toolsets=None (no filter) injects memory tools — backward compat."""
+        mgr = self._mgr_with_tools("fact_store")
+        tools, names = self._run_memory_injection(None, mgr)
+        assert "fact_store" in names
+        assert any(t["function"]["name"] == "fact_store" for t in tools)
+
+    def test_memory_in_toolsets_injects(self):
+        """enabled_toolsets including 'memory' injects memory tools."""
+        mgr = self._mgr_with_tools("fact_store")
+        tools, names = self._run_memory_injection(["terminal", "memory", "web"], mgr)
+        assert "fact_store" in names
+        assert any(t["function"]["name"] == "fact_store" for t in tools)
+
+    def test_empty_toolsets_blocks_injection(self):
+        """`platform_toolsets: telegram: []` must suppress memory tools. (#5544)"""
+        mgr = self._mgr_with_tools("fact_store")
+        tools, names = self._run_memory_injection([], mgr)
+        assert tools == []
+        assert names == set()
+
+    def test_toolsets_without_memory_blocks_injection(self):
+        """Toolset list that doesn't name 'memory' must suppress injection."""
+        mgr = self._mgr_with_tools("fact_store")
+        tools, names = self._run_memory_injection(["terminal", "web"], mgr)
+        assert tools == []
+        assert names == set()
+
+    def test_no_memory_manager_no_injection(self):
+        """Gate is moot without a memory manager."""
+        tools, names = self._run_memory_injection(None, None)
+        assert tools == []
+        assert names == set()
+
+    def test_multiple_schemas_all_blocked_together(self):
+        """When the gate is closed, no memory tools leak — not even partially."""
+        mgr = self._mgr_with_tools("fact_store", "memory_search", "memory_add")
+        tools, names = self._run_memory_injection(["terminal"], mgr)
+        assert tools == []
+        assert names == set()
+
+    def test_multiple_schemas_all_injected_when_enabled(self):
+        """When the gate is open, every memory tool schema is injected."""
+        mgr = self._mgr_with_tools("fact_store", "memory_search", "memory_add")
+        tools, names = self._run_memory_injection(None, mgr)
+        assert names == {"fact_store", "memory_search", "memory_add"}
+        # The tool list and the name set must describe the same tools.
+        assert {t["function"]["name"] for t in tools} == names
+
+
+class TestContextEngineToolsetGate:
+    """Issue #5544 (sibling): context engine tools follow the same gate.
+
+    `agent.context_compressor.get_tool_schemas()` (e.g. lcm_grep, lcm_describe,
+    lcm_expand) was appended to AIAgent.tools unconditionally. Same blind
+    injection class as the memory bug; same local-model penalty. Gate name:
+    "context_engine" (matches the existing plugin-system convention).
+
+    Every test checks all three outputs of the simulated block (the tool
+    list, the valid-name set and the engine-name set) so a gate that
+    suppresses one of them but leaks another cannot pass unnoticed.
+    """
+
+    @staticmethod
+    def _run_context_engine_injection(enabled_toolsets, compressor):
+        """Simulate the gated context-engine injection block from agent_init.py.
+
+        Args:
+            enabled_toolsets: ``None`` (no filter) or a collection of toolset
+                names; injection happens only for ``None`` or when it
+                contains ``"context_engine"``.
+            compressor: object exposing ``get_tool_schemas()``, or ``None``
+                to model "no context compressor configured".
+
+        Returns:
+            ``(tools, valid_tool_names, engine_tool_names)``: the
+            function-wrapped schemas that were injected, plus two sets that
+            both receive every injected non-empty tool name.
+        """
+        tools = []
+        valid_tool_names = set()
+        engine_tool_names = set()
+
+        # ``tools is not None`` is always true for the fresh list above; it
+        # is kept so the condition reads like the block being simulated.
+        if (
+            compressor is not None
+            and tools is not None
+            and (
+                enabled_toolsets is None
+                or "context_engine" in enabled_toolsets
+            )
+        ):
+            _existing = {
+                t.get("function", {}).get("name")
+                for t in tools
+                if isinstance(t, dict)
+            }
+            for _schema in compressor.get_tool_schemas():
+                _tname = _schema.get("name", "")
+                # Skip schemas whose name is already present in the tool list.
+                if _tname and _tname in _existing:
+                    continue
+                tools.append({"type": "function", "function": _schema})
+                if _tname:
+                    valid_tool_names.add(_tname)
+                    engine_tool_names.add(_tname)
+                    _existing.add(_tname)
+
+        return tools, valid_tool_names, engine_tool_names
+
+    class _FakeCompressor:
+        """Minimal stand-in that only provides ``get_tool_schemas()``."""
+
+        def __init__(self, schemas):
+            self._schemas = schemas
+
+        def get_tool_schemas(self):
+            # Return a copy so callers cannot mutate the stored schema list.
+            return list(self._schemas)
+
+    def _compressor_with(self, *tool_names):
+        """Build a fake compressor exposing one schema per given tool name."""
+        return self._FakeCompressor(
+            [{"name": n, "description": n, "parameters": {}} for n in tool_names]
+        )
+
+    def test_none_toolsets_injects(self):
+        """enabled_toolsets=None injects context-engine tools — backward compat."""
+        c = self._compressor_with("lcm_grep", "lcm_describe", "lcm_expand")
+        tools, names, engine_names = self._run_context_engine_injection(None, c)
+        assert engine_names == {"lcm_grep", "lcm_describe", "lcm_expand"}
+        assert names == engine_names
+        assert [t["function"]["name"] for t in tools] == [
+            "lcm_grep",
+            "lcm_describe",
+            "lcm_expand",
+        ]
+
+    def test_context_engine_in_toolsets_injects(self):
+        """enabled_toolsets including 'context_engine' injects the tools."""
+        c = self._compressor_with("lcm_grep")
+        tools, names, engine_names = self._run_context_engine_injection(
+            ["terminal", "context_engine"], c
+        )
+        assert "lcm_grep" in engine_names
+        assert "lcm_grep" in names
+        assert any(t["function"]["name"] == "lcm_grep" for t in tools)
+
+    def test_empty_toolsets_blocks_injection(self):
+        """`platform_toolsets: telegram: []` must suppress context-engine tools."""
+        c = self._compressor_with("lcm_grep")
+        tools, names, engine_names = self._run_context_engine_injection([], c)
+        assert tools == []
+        assert names == set()
+        assert engine_names == set()
+
+    def test_toolsets_without_context_engine_blocks_injection(self):
+        """A toolset list that doesn't name 'context_engine' suppresses injection."""
+        c = self._compressor_with("lcm_grep", "lcm_describe")
+        tools, names, engine_names = self._run_context_engine_injection(
+            ["terminal", "memory"], c
+        )
+        assert tools == []
+        assert names == set()
+        assert engine_names == set()
+
+    def test_no_compressor_no_injection(self):
+        """Gate is moot without a context_compressor."""
+        tools, names, engine_names = self._run_context_engine_injection(None, None)
+        assert tools == []
+        assert names == set()
+        assert engine_names == set()
@@ -444,6 +444,7 @@ class TestBuildNousSubscriptionPrompt:
                    "tts": NousFeatureState("tts", "OpenAI TTS", True, True, True, True, False, True, "OpenAI TTS"),
                    "browser": NousFeatureState("browser", "Browser automation", True, True, True, True, False, True, "Browser Use"),
                    "modal": NousFeatureState("modal", "Modal execution", False, True, False, False, False, True, "local"),
+                    "app_tools": NousFeatureState("app_tools", "App tools (500+ apps)", True, True, True, True, False, True, "Nous Subscription"),
                },
            ),
        )
@@ -468,6 +469,7 @@ class TestBuildNousSubscriptionPrompt:
                    "tts": NousFeatureState("tts", "OpenAI TTS", True, False, False, False, False, True, ""),
                    "browser": NousFeatureState("browser", "Browser automation", True, False, False, False, False, True, ""),
                    "modal": NousFeatureState("modal", "Modal execution", False, False, False, False, False, True, ""),
+                    "app_tools": NousFeatureState("app_tools", "App tools (500+ apps)", True, False, False, False, False, True, ""),
                },
            ),
        )
@@ -556,10 +556,11 @@ Generate some audio.
            raising=False,
        )

-        with patch.dict(
-            os.environ, {"HERMES_SESSION_PLATFORM": "telegram"}, clear=False
-        ):
-            with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
+        with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
+            from gateway.session_context import clear_session_vars, set_session_vars
+
+            tokens = set_session_vars(platform="telegram")
+            try:
                _make_skill(
                    tmp_path,
                    "test-skill",
@@ -571,6 +572,8 @@ Generate some audio.
                )
                scan_skill_commands()
                msg = build_skill_invocation_message("/test-skill", "do stuff")
+            finally:
+                clear_session_vars(tokens)

        assert msg is not None
        assert "local cli" in msg.lower()
@@ -1,6 +1,12 @@
-"""Tests for agent/skill_utils.py — extract_skill_conditions metadata handling."""
+"""Tests for agent/skill_utils.py."""

-from agent.skill_utils import extract_skill_conditions
+from unittest.mock import patch
+
+from agent.skill_utils import (
+    extract_skill_conditions,
+    iter_skill_index_files,
+    skill_matches_platform,
+)


 def test_metadata_as_dict_with_hermes():
@@ -56,3 +62,138 @@ def test_metadata_missing_entirely():
        "fallback_for_tools": [],
        "requires_tools": [],
    }
+
+
+def test_iter_skill_index_files_prunes_dependency_dirs(tmp_path):
+    """Only the top-level skill is found; copies buried in dependency dirs are not.
+
+    One SKILL.md sits directly under a skill directory, one is nested below a
+    ``.venv`` tree and one below ``node_modules``. The iterator is expected
+    to yield just the first.
+    """
+
+    def _plant_skill(directory, frontmatter):
+        # Create the directory (and any missing parents) and drop a SKILL.md in it.
+        directory.mkdir(parents=True)
+        index_file = directory / "SKILL.md"
+        index_file.write_text(frontmatter, encoding="utf-8")
+        return index_file
+
+    expected = _plant_skill(tmp_path / "real-skill", "---\nname: real-skill\n---\n")
+
+    # A skill shipped inside a package installed into a virtualenv.
+    _plant_skill(
+        tmp_path.joinpath(
+            "bring", "scripts", ".venv", "lib", "python3.13",
+            "site-packages", "typer", ".agents", "skills", "typer",
+        ),
+        "---\nname: typer\n---\n",
+    )
+
+    # A skill shipped inside an npm dependency.
+    _plant_skill(
+        tmp_path.joinpath(
+            "web-skill", "node_modules", "dep", ".agents", "skills", "dep"
+        ),
+        "---\nname: dep\n---\n",
+    )
+
+    assert list(iter_skill_index_files(tmp_path, "SKILL.md")) == [expected]
+
+
+# ── skill_matches_platform on Termux ──────────────────────────────────────
+
+
+class TestSkillMatchesPlatformTermux:
+    """skill_matches_platform() behaviour on Termux (Linux userland on Android).
+
+    Skills tagged platforms:[linux] must load there regardless of whether
+    Python reports sys.platform as "linux" (pre-3.13) or "android" (3.13+).
+    Reported by user @LikiusInik in May 2026 — only 3 built-in skills
+    appeared on Termux because every github/productivity/mlops skill is
+    tagged platforms:[linux,macos,windows] and sys.platform=="android" did
+    not start with "linux".
+    """
+
+    @staticmethod
+    def _matches(frontmatter, sys_platform, termux):
+        """Evaluate skill_matches_platform() under a faked platform.
+
+        ``sys_platform`` replaces ``sys.platform`` as seen through
+        ``agent.skill_utils`` and ``termux`` is what ``is_termux()`` returns.
+        """
+        with patch("agent.skill_utils.sys.platform", sys_platform):
+            with patch("agent.skill_utils.is_termux", return_value=termux):
+                return skill_matches_platform(frontmatter)
+
+    def test_no_platforms_field_matches_everywhere(self):
+        # Backward-compat default — skills without a platforms tag load
+        # on any OS, Termux included.
+        assert self._matches({}, "android", True) is True
+        assert self._matches({"name": "foo"}, "android", True) is True
+
+    def test_linux_skill_loads_on_termux_android_platform(self):
+        # Python 3.13+ on Termux reports sys.platform == "android".
+        assert self._matches({"platforms": ["linux"]}, "android", True) is True
+
+    def test_linux_macos_windows_skill_loads_on_termux(self):
+        # The common "[linux, macos, windows]" tag used by github-*,
+        # productivity, mlops, etc.
+        frontmatter = {"platforms": ["linux", "macos", "windows"]}
+        assert self._matches(frontmatter, "android", True) is True
+
+    def test_linux_skill_loads_on_termux_linux_platform(self):
+        # Pre-3.13 Termux reports sys.platform == "linux" already — this
+        # works without the Termux escape hatch but must still pass.
+        assert self._matches({"platforms": ["linux"]}, "linux", True) is True
+
+    def test_macos_only_skill_still_excluded_on_termux(self):
+        # macOS-only skills (apple-notes, imessage, ...) should NOT load
+        # on Termux. The Termux fallback only widens platforms:[linux,...].
+        assert self._matches({"platforms": ["macos"]}, "android", True) is False
+
+    def test_windows_only_skill_still_excluded_on_termux(self):
+        assert self._matches({"platforms": ["windows"]}, "android", True) is False
+
+    def test_explicit_termux_or_android_tag_matches(self):
+        # Skills can also opt in explicitly via platforms:[termux] or
+        # platforms:[android] — both should match a Termux session.
+        assert self._matches({"platforms": ["termux"]}, "android", True) is True
+        assert self._matches({"platforms": ["android"]}, "android", True) is True
+
+    def test_non_termux_android_does_not_widen(self):
+        # If we're somehow on a plain Android Python (not Termux), don't
+        # silently load Linux skills — Termux is the supported environment.
+        assert self._matches({"platforms": ["linux"]}, "android", False) is False
+
+    def test_linux_skill_on_real_linux_unaffected(self):
+        # The non-Termux Linux path must not change.
+        assert self._matches({"platforms": ["linux"]}, "linux", False) is True
+
+    def test_macos_skill_on_real_macos_unaffected(self):
+        assert self._matches({"platforms": ["macos"]}, "darwin", False) is True
@@ -20,12 +20,9 @@ test runner at ``scripts/run_tests.sh``.
 """

 import asyncio
-import logging
 import os
 import re
-import signal
 import sys
-import tempfile
 from pathlib import Path
 from unittest.mock import patch

@@ -37,6 +34,22 @@ if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))


+# ── Per-file process isolation ──────────────────────────────────────────────
+# Tests run via ``scripts/run_tests_parallel.py``, which spawns a fresh
+# ``python -m pytest <file>`` subprocess per test file. Cross-file state
+# leakage (module-level dicts, ContextVars, caches) is impossible: each
+# file gets a clean Python interpreter. Intra-file ordering is the test
+# author's responsibility — if test A in foo.py mutates state that test B
+# in foo.py reads, that's a real bug to fix in the file (it would also
+# bite anyone running ``pytest tests/foo.py`` directly).
+#
+# This replaces the historic _reset_module_state autouse fixture (manual
+# state clearing) and the brief experiment with subprocess-per-test
+# isolation (too slow at ~17k tests).
+#
+# See ``scripts/run_tests_parallel.py`` for the runner.
+
+
 # ── Credential env-var filter ──────────────────────────────────────────────
 #
 # Any env var in the current process matching ONE of these patterns is
@@ -279,7 +292,7 @@ _HERMES_BEHAVIORAL_VARS = frozenset({
    "WECOM_HOME_CHANNEL_NAME",
    # Platform gating — set by load_gateway_config() as a side effect when
    # a config.yaml is present, so individual test bodies that call the
-    # loader leak these values into later tests on the same xdist worker.
+    # loader leak these values into later tests in the same process.
    # Force-clear on every test setup so the leak can't happen.
    "SLACK_REQUIRE_MENTION",
    "SLACK_STRICT_MENTION",
@@ -368,144 +381,21 @@ def _isolate_hermes_home(_hermetic_environment):
    return None


-# ── Module-level state reset ───────────────────────────────────────────────
+# ── Module-level state reset — replaced by per-file process isolation ──────
 #
-# Python modules are singletons per process, and pytest-xdist workers are
-# long-lived. Module-level dicts/sets (tool registries, approval state,
-# interrupt flags) and ContextVars persist across tests in the same worker,
-# causing tests that pass alone to fail when run with siblings.
+# Each test FILE runs in a freshly-spawned ``python -m pytest <file>``
+# subprocess via ``scripts/run_tests_parallel.py``, so module-level dicts /
+# sets / ContextVars from tests in one file cannot leak into tests in
+# another file. No manual per-module clearing needed.
 #
-# Each entry in this fixture clears state that belongs to a specific module.
-# New state buckets go here too — this is the single gate that prevents
-# "works alone, flakes in CI" bugs from state leakage.
+# Within a single file, ordering is the author's responsibility. If your
+# tests in the same file share mutable state, either reset it explicitly
+# in a fixture or split them across files.
 #
-# The skill `test-suite-cascade-diagnosis` documents the concrete patterns
-# this closes; the running example was `test_command_guards` failing 12/15
-# CI runs because ``tools.approval._session_approved`` carried approvals
-# from one test's session into another's.
-
-@pytest.fixture(autouse=True)
-def _reset_module_state():
-    """Clear module-level mutable state and ContextVars between tests.
-
-    Keeps state from leaking across tests on the same xdist worker. Modules
-    that don't exist yet (test collection before production import) are
-    skipped silently — production import later creates fresh empty state.
-    """
-    # --- logging — quiet/one-shot paths mutate process-global logger state ---
-    logging.disable(logging.NOTSET)
-    for _logger_name in ("tools", "run_agent", "trajectory_compressor", "cron", "hermes_cli"):
-        _logger = logging.getLogger(_logger_name)
-        _logger.disabled = False
-        _logger.setLevel(logging.NOTSET)
-        _logger.propagate = True
-
-    # --- tools.approval — the single biggest source of cross-test pollution ---
-    try:
-        from tools import approval as _approval_mod
-        _approval_mod._session_approved.clear()
-        _approval_mod._session_yolo.clear()
-        _approval_mod._permanent_approved.clear()
-        _approval_mod._pending.clear()
-        _approval_mod._gateway_queues.clear()
-        _approval_mod._gateway_notify_cbs.clear()
-        # ContextVar: reset to empty string so get_current_session_key()
-        # falls through to the env var / default path, matching a fresh
-        # process.
-        _approval_mod._approval_session_key.set("")
-    except Exception:
-        pass
-
-    # --- tools.interrupt — per-thread interrupt flag set ---
-    try:
-        from tools import interrupt as _interrupt_mod
-        with _interrupt_mod._lock:
-            _interrupt_mod._interrupted_threads.clear()
-    except Exception:
-        pass
-
-    # --- gateway.session_context — 9 ContextVars that represent
-    #     the active gateway session. If set in one test and not reset,
-    #     the next test's get_session_env() reads stale values.
-    try:
-        from gateway import session_context as _sc_mod
-        for _cv in (
-            _sc_mod._SESSION_PLATFORM,
-            _sc_mod._SESSION_CHAT_ID,
-            _sc_mod._SESSION_CHAT_NAME,
-            _sc_mod._SESSION_THREAD_ID,
-            _sc_mod._SESSION_USER_ID,
-            _sc_mod._SESSION_USER_NAME,
-            _sc_mod._SESSION_KEY,
-            _sc_mod._CRON_AUTO_DELIVER_PLATFORM,
-            _sc_mod._CRON_AUTO_DELIVER_CHAT_ID,
-            _sc_mod._CRON_AUTO_DELIVER_THREAD_ID,
-        ):
-            _cv.set(_sc_mod._UNSET)
-    except Exception:
-        pass
-
-    # --- tools.env_passthrough — ContextVar<set[str]> with no default ---
-    # LookupError is normal if the test never set it. Setting it to an
-    # empty set unconditionally normalizes the starting state.
-    try:
-        from tools import env_passthrough as _envp_mod
-        _envp_mod._allowed_env_vars_var.set(set())
-    except Exception:
-        pass
-
-    # --- tools.terminal_tool — active environment/cwd cache ---
-    # File tools prefer a live terminal cwd when one is cached for the task.
-    # Clear terminal environments between tests so a prior terminal call can't
-    # override TERMINAL_CWD in path-resolution tests.
-    try:
-        from tools import terminal_tool as _term_mod
-        _envs_to_cleanup = []
-        with _term_mod._env_lock:
-            _envs_to_cleanup = list(_term_mod._active_environments.values())
-            _term_mod._active_environments.clear()
-            _term_mod._last_activity.clear()
-            _term_mod._creation_locks.clear()
-        for _env in _envs_to_cleanup:
-            try:
-                _env.cleanup()
-            except Exception:
-                pass
-    except Exception:
-        pass
-
-    # --- tools.credential_files — ContextVar<dict> ---
-    try:
-        from tools import credential_files as _credf_mod
-        _credf_mod._registered_files_var.set({})
-    except Exception:
-        pass
-
-    # --- agent.auxiliary_client — runtime main provider/model override and
-    #     payment-error health cache. Both are process-global in production;
-    #     reset them per test so one worker's fallback/402 test does not make
-    #     later auxiliary-client tests skip otherwise-available providers.
-    try:
-        from agent import auxiliary_client as _aux_mod
-        _aux_mod.clear_runtime_main()
-        _aux_mod._reset_aux_unhealthy_cache()
-    except Exception:
-        pass
-
-    # --- tools.file_tools — per-task read history + file-ops cache ---
-    # _read_tracker accumulates per-task_id read history for loop detection,
-    # capped by _READ_HISTORY_CAP. If entries from a prior test persist, the
-    # cap is hit faster than expected and capacity-related tests flake.
-    try:
-        from tools import file_tools as _ft_mod
-        with _ft_mod._read_tracker_lock:
-            _ft_mod._read_tracker.clear()
-        with _ft_mod._file_ops_lock:
-            _ft_mod._file_ops_cache.clear()
-    except Exception:
-        pass
-
-    yield
+# The skill ``test-suite-cascade-diagnosis`` documents the cascade patterns
+# this replaces; the running example was ``test_command_guards`` failing
+# 12/15 CI runs because ``tools.approval._session_approved`` carried
+# approvals from one test's session into another's.


@pytest.fixture()
@@ -532,13 +422,12 @@ def mock_config():
    }


-# ── Global test timeout ─────────────────────────────────────────────────────
-# Kill any individual test that takes longer than 30 seconds.
-# Prevents hanging tests (subprocess spawns, blocking I/O) from stalling the
-# entire test suite.
+# ── Per-test timeout — handled by the isolation plugin ─────────────────────
+#
+# The subprocess-per-test plugin enforces the configured ``isolate_timeout``
+# ini key by terminating the child if it overruns. The old SIGALRM-based
+# fixture (POSIX-only, didn't work on Windows) is gone.

-def _timeout_handler(signum, frame):
-    raise TimeoutError("Test exceeded 30 second timeout")

@pytest.fixture(autouse=True)
 def _ensure_current_event_loop(request):
@@ -584,45 +473,6 @@ def _ensure_current_event_loop(request):
                asyncio.set_event_loop(None)


-@pytest.fixture(autouse=True)
-def _enforce_test_timeout():
-    """Kill any individual test that takes longer than 30 seconds.
-    SIGALRM is Unix-only; skip on Windows."""
-    if sys.platform == "win32":
-        yield
-        return
-    old = signal.signal(signal.SIGALRM, _timeout_handler)
-    signal.alarm(30)
-    yield
-    signal.alarm(0)
-    signal.signal(signal.SIGALRM, old)
-
-
-@pytest.fixture(autouse=True)
-def _reset_tool_registry_caches():
-    """Clear tool-registry-level caches between tests.
-
-    The production registry caches ``check_fn()`` results for 30 s
-    (see tools/registry.py) and :func:`get_tool_definitions` memoizes
-    its result (see model_tools.py). Both are keyed on state that tests
-    routinely mutate (env vars, registry._generation, config.yaml mtime)
-    — but a stale result from test A can still be served to test B
-    because 30 s covers the entire suite, and xdist worker reuse means
-    one test's cache lands in another's process. Clearing before every
-    test keeps hermetic behavior.
-    """
-    try:
-        from tools.registry import invalidate_check_fn_cache
-        invalidate_check_fn_cache()
-    except ImportError:
-        pass
-    try:
-        from model_tools import _clear_tool_defs_cache
-        _clear_tool_defs_cache()
-    except ImportError:
-        pass
-
-
 # ── Live-system guard ──────────────────────────────────────────────────────
 #
 # Several test files exercise the gateway-restart / kill code paths
@@ -313,19 +313,30 @@ def _scan_for_plugin_adapter_antipattern(source: str) -> list[str]:
    return offenses


-def pytest_configure(config):
-    """Reject plugin-adapter tests that use the sys.path anti-pattern.
+def _fingerprint_gateway_tests() -> str:
+    """Return a short fingerprint that changes when any gateway test file changes.

-    Runs once per pytest session on the controller, BEFORE any xdist
-    worker is spawned. If any file under ``tests/gateway/`` matches the
-    anti-pattern, we fail the whole session with a clear message —
-    before a polluted ``sys.path`` can cascade across workers.
+    Uses (mtime, size) pairs instead of content hashing — fast to compute
+    (stat-only, no reads) and sufficient for cache invalidation across
+    per-file subprocess runs.
+
+    Each entry is keyed on the path relative to the gateway test directory
+    (not the bare file name), so same-named files in different
+    subdirectories stay distinct and moving a file changes the fingerprint.
+
+    Returns:
+        The first 16 hex characters of a SHA-256 digest over all entries.
    """
-    # Only run on the xdist controller (or in non-xdist runs). Skip on
-    # worker subprocesses so we don't scan the filesystem N times.
-    if hasattr(config, "workerinput"):
-        return
+    import hashlib

+    h = hashlib.sha256()
+    for path in sorted(_GATEWAY_DIR.rglob("test_*.py")):
+        rel = path.relative_to(_GATEWAY_DIR).as_posix()
+        try:
+            st = path.stat()
+            entry = f"{rel}:{st.st_mtime_ns}:{st.st_size}"
+        except OSError:
+            # The file vanished or became unreadable between rglob() and
+            # stat(); record that instead of aborting the fingerprint.
+            entry = f"{rel}:missing"
+        # Newline-terminate each entry so adjacent entries cannot run together.
+        h.update(f"{entry}\n".encode())
+    return h.hexdigest()[:16]
+
+
+def _run_adapter_antipattern_scan() -> list[str]:
+    """Scan gateway test files for the plugin-adapter anti-pattern.
+
+    Returns a list of violation strings (empty if clean).
+    """
    violations: list[str] = []
    for path in _GATEWAY_DIR.rglob("test_*.py"):
        if path.name in {"_plugin_adapter_loader.py", "conftest.py"}:
@@ -334,20 +345,108 @@ def pytest_configure(config):
            source = path.read_text(encoding="utf-8")
        except OSError:
            continue
+        # Fast string pre-filter: skip files that can't possibly violate.
+        # A violating file MUST contain both (a) an adapter/plugins/platforms
+        # reference AND (b) either sys.path manipulation or a bare adapter import.
        if "adapter" not in source and "plugins/platforms" not in source:
            continue
+        if not (
+            "sys.path" in source
+            or "import adapter" in source
+            or "from adapter import" in source
+        ):
+            continue
        offenses = _scan_for_plugin_adapter_antipattern(source)
        if offenses:
            violations.append(
                f"  {path.relative_to(_GATEWAY_DIR.parent.parent)}:\n    "
                + "\n    ".join(offenses)
            )
+    return violations

-    if violations:
-        raise pytest.UsageError(
-            "Plugin-adapter-import anti-pattern detected in gateway tests:\n"
-            + "\n".join(violations)
-            + "\n\n"
-            + _GUARD_HINT
-        )
+
+def pytest_configure(config):
+    """Reject plugin-adapter tests that use the sys.path anti-pattern.
+
+    Runs once per pytest session on the controller, BEFORE any xdist
+    worker is spawned. If any file under ``tests/gateway/`` matches the
+    anti-pattern, we fail the whole session with a clear message —
+    before a polluted ``sys.path`` can cascade across workers.
+
+    **Performance**: in the per-file subprocess isolation model (no xdist),
+    every subprocess is a "controller" — so the naive scan would run 257
+    times, each costing ~1s of AST walking.  We avoid this with two
+    strategies:
+
+    1. **Tight string pre-filter**: a file can only violate if it contains
+       *both* an adapter/plugins/platforms reference *and* a sys.path
+       manipulation or bare ``import adapter``.  This drops ~95% of files
+       from needing AST parsing.
+    2. **File-locked cache**: the scan result is cached in
+       ``.pytest-cache/gw-adapter-guard-<fingerprint>`` keyed on a
+       fingerprint of the gateway test file mtimes/sizes.  Concurrent
+       subprocesses acquire a lock; only the first performs the scan;
+       the rest wait and read the cached result.  The cache file is
+       published with write-to-temp + rename, so a reader never observes
+       a half-written result even when ``filelock`` is unavailable and no
+       lock is held.
+
+    Args:
+        config: The pytest ``Config`` object for this session.
+
+    Raises:
+        pytest.UsageError: If the scan (or its cached result) reports at
+            least one violation.
+    """
+    # Only run on the xdist controller (or in non-xdist runs). Skip on
+    # worker subprocesses so we don't scan the filesystem N times.
+    if hasattr(config, "workerinput"):
+        return
+
+    import contextlib
+    import os
+
+    fp = _fingerprint_gateway_tests()
+    cache_dir = Path.cwd() / ".pytest-cache"
+    cache_file = cache_dir / f"gw-adapter-guard-{fp}"
+    lock_file = cache_dir / f".gw-adapter-guard-{fp}.lock"
+
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+    # Evict stale cache entries from previous fingerprints (best-effort).
+    try:
+        for old in cache_dir.glob("gw-adapter-guard-*"):
+            if old.name != cache_file.name:
+                old.unlink(missing_ok=True)
+        for old in cache_dir.glob(".gw-adapter-guard-*.lock"):
+            if old.name != lock_file.name:
+                old.unlink(missing_ok=True)
+    except OSError:
+        pass  # Non-critical; old files are harmless.
+
+    # Use filelock to ensure only one process scans at a time.
+    # Concurrent subprocesses all hit pytest_configure simultaneously;
+    # without a lock they'd all find no cache and all run the scan.
+    try:
+        from filelock import FileLock
+        lock = FileLock(str(lock_file), timeout=120)
+    except ImportError:
+        # Fallback: no locking (still correct, just slower under contention).
+        lock = contextlib.nullcontext()
+
+    def _publish(text):
+        """Write the scan result to a private temp file, then rename it into place."""
+        # The temp name matches neither eviction glob above, so a concurrent
+        # process will not delete it mid-write.
+        tmp = cache_dir / f".gw-adapter-guard-{fp}.{os.getpid()}.tmp"
+        tmp.write_text(text, encoding="utf-8")
+        tmp.replace(cache_file)
+
+    with lock:
+        # Read directly instead of exists()+read: the file may be evicted by
+        # another process between the two calls.
+        try:
+            cached = cache_file.read_text(encoding="utf-8")
+        except OSError:
+            cached = ""
+        if cached == "clean":
+            return
+        if cached:
+            raise pytest.UsageError(cached)
+        # An empty or missing cache is treated as a miss, never as a violation.
+
+        # Slow path: this process is the first to acquire the lock.
+        violations = _run_adapter_antipattern_scan()
+
+        if violations:
+            msg = (
+                "Plugin-adapter-import anti-pattern detected in gateway tests:\n"
+                + "\n".join(violations)
+                + "\n\n"
+                + _GUARD_HINT
+            )
+            _publish(msg)
+            raise pytest.UsageError(msg)
+        _publish("clean")

@@ -22,19 +22,26 @@ from gateway.config import PlatformConfig


 def _ensure_telegram_mock():
+    """Install MagicMock stand-ins for the ``telegram`` package in ``sys.modules``.
+
+    Unconditionally overwrites the ``telegram``, ``telegram.ext``,
+    ``telegram.constants`` and ``telegram.request`` entries, then drops any
+    already-imported ``gateway.platforms.telegram`` so its next import binds
+    to these mocks.
+    """
-    if "telegram" in sys.modules and hasattr(sys.modules["telegram"], "__file__"):
-        return
-
    telegram_mod = MagicMock()
    telegram_mod.ext.ContextTypes.DEFAULT_TYPE = type(None)
-    telegram_mod.constants.ParseMode.MARKDOWN_V2 = "MarkdownV2"
-    telegram_mod.constants.ChatType.GROUP = "group"
-    telegram_mod.constants.ChatType.SUPERGROUP = "supergroup"
-    telegram_mod.constants.ChatType.CHANNEL = "channel"
-    telegram_mod.constants.ChatType.PRIVATE = "private"

-    for name in ("telegram", "telegram.ext", "telegram.constants", "telegram.request"):
-        sys.modules.setdefault(name, telegram_mod)
+    # Register telegram.constants as a separate module mock so that
+    # ``from telegram.constants import ChatType`` resolves to our mock
+    # with string-valued members (not auto-generated MagicMocks).
+    constants_mod = MagicMock()
+    constants_mod.ParseMode.MARKDOWN_V2 = "MarkdownV2"
+    constants_mod.ChatType.GROUP = "group"
+    constants_mod.ChatType.SUPERGROUP = "supergroup"
+    constants_mod.ChatType.CHANNEL = "channel"
+    constants_mod.ChatType.PRIVATE = "private"
+
+    sys.modules["telegram"] = telegram_mod
+    sys.modules["telegram.ext"] = telegram_mod.ext
+    sys.modules["telegram.constants"] = constants_mod
+    sys.modules["telegram.request"] = telegram_mod.request
+
+    # Force reimport so the adapter picks up the mock ChatType.
+    sys.modules.pop("gateway.platforms.telegram", None)


 _ensure_telegram_mock()
@@ -22,6 +22,11 @@ import pytest

 from gateway.config import Platform, PlatformConfig, load_gateway_config

+# Platform uses _missing_() for dynamic members, so "google_chat" is
+# resolvable via Platform("google_chat") even without a static
+# GOOGLE_CHAT attribute on the enum class.
+_GC = Platform("google_chat")
+

 # ---------------------------------------------------------------------------
 # Mock the google-* packages if they are not installed
@@ -229,7 +234,7 @@ def _make_chat_envelope(text="hello", sender_email="u@example.com", sender_type=

 class TestPlatformRegistration:
    def test_enum_value(self):
-        assert Platform.GOOGLE_CHAT.value == "google_chat"
+        assert _GC.value == "google_chat"

    def test_requirements_check_returns_true_when_available(self):
        # The shim flag is True in this test module.
@@ -266,14 +271,14 @@ class TestEnvConfigLoading:
        monkeypatch.setenv("GOOGLE_CHAT_PROJECT_ID", "p")
        # No subscription.
        cfg = load_gateway_config()
-        assert Platform.GOOGLE_CHAT not in cfg.platforms
+        assert _GC not in cfg.platforms

    def test_missing_project_does_not_enable(self, monkeypatch):
        self._clean_env(monkeypatch)
        monkeypatch.setenv("GOOGLE_CHAT_SUBSCRIPTION_NAME",
                           "projects/p/subscriptions/s")
        cfg = load_gateway_config()
-        assert Platform.GOOGLE_CHAT not in cfg.platforms
+        assert _GC not in cfg.platforms



@@ -2583,7 +2588,7 @@ class TestAuthorizationEmailMatch:
        runner.pairing_store.is_approved = MagicMock(return_value=False)

        source = SessionSource(
-            platform=Platform.GOOGLE_CHAT,
+            platform=_GC,
            chat_id="spaces/S",
            chat_type="dm",
            user_id="alice@example.com",       # post-swap: email is canonical
@@ -2604,7 +2609,7 @@ class TestAuthorizationEmailMatch:
        runner.pairing_store.is_approved = MagicMock(return_value=False)

        source = SessionSource(
-            platform=Platform.GOOGLE_CHAT,
+            platform=_GC,
            chat_id="spaces/S",
            chat_type="dm",
            user_id="bob@example.com",
@@ -2630,7 +2635,7 @@ class TestAuthorizationEmailMatch:
        runner.pairing_store.is_approved = MagicMock(return_value=False)

        source = SessionSource(
-            platform=Platform.GOOGLE_CHAT,
+            platform=_GC,
            chat_id="spaces/S",
            chat_type="dm",
            user_id="users/77777",  # no email available — resource name wins
@@ -75,9 +75,197 @@ class TestCodeGeneration:
            code = store.generate_code("telegram", "user1", "Alice")
            pending = store.list_pending("telegram")
        assert len(pending) == 1
-        assert pending[0]["code"] == code
+        # list_pending no longer returns the original code — it returns a
+        # truncated hash prefix.  Verify the metadata is correct instead.
        assert pending[0]["user_id"] == "user1"
        assert pending[0]["user_name"] == "Alice"
+        # The code field is now a hash prefix, not the original plaintext code
+        assert pending[0]["code"] != code
+
+
+# ---------------------------------------------------------------------------
+# Hashed storage
+# ---------------------------------------------------------------------------
+
+
+class TestHashedStorage:
+    """Pending pairing codes must be persisted as salted hashes, not plaintext."""
+
+    def test_pending_file_contains_hash_and_salt(self, tmp_path):
+        """Each stored entry carries hex 'hash' and 'salt' fields and no plaintext code."""
+        pending_file = tmp_path / "telegram-pending.json"
+        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
+            code = PairingStore().generate_code("telegram", "user1", "Alice")
+            stored = json.loads(pending_file.read_text(encoding="utf-8"))
+
+        assert len(stored) == 1
+        record = list(stored.values())[0]
+        hex_digits = set("0123456789abcdef")
+        # A SHA-256 hex digest is 64 chars; a 16-byte salt is 32 hex chars.
+        for field, expected_len in (("hash", 64), ("salt", 32)):
+            assert field in record
+            assert len(record[field]) == expected_len
+            assert set(record[field]) <= hex_digits
+        # The plaintext code must appear neither as a key nor as any value.
+        assert code not in stored
+        for entry_id, fields in stored.items():
+            assert entry_id != code
+            assert all(
+                value != code
+                for value in fields.values()
+                if isinstance(value, str)
+            )
+
+    def test_plaintext_code_not_stored(self, tmp_path):
+        """The serialized pending file never contains the plaintext code."""
+        pending_file = tmp_path / "telegram-pending.json"
+        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
+            code = PairingStore().generate_code("telegram", "user1")
+            on_disk = pending_file.read_text(encoding="utf-8")
+        assert code not in on_disk
+
+    def test_valid_code_verifies_against_hash(self, tmp_path):
+        """Approving with the code that was just issued succeeds."""
+        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
+            store = PairingStore()
+            issued = store.generate_code("telegram", "user1", "Bob")
+            approved = store.approve_code("telegram", issued)
+        assert approved is not None
+        assert approved["user_id"] == "user1"
+        assert approved["user_name"] == "Bob"
+
+    def test_invalid_code_rejected(self, tmp_path):
+        """Approving with a code that was never issued returns None."""
+        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
+            store = PairingStore()
+            store.generate_code("telegram", "user1")
+            outcome = store.approve_code("telegram", "ZZZZZZZZ")
+        assert outcome is None
+
+    def test_different_salts_per_entry(self, tmp_path):
+        """Three pending entries end up with three distinct salts."""
+        pending_file = tmp_path / "telegram-pending.json"
+        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
+            store = PairingStore()
+            for user in ("user0", "user1", "user2"):
+                store.generate_code("telegram", user)
+            stored = json.loads(pending_file.read_text(encoding="utf-8"))
+        distinct_salts = {record["salt"] for record in stored.values()}
+        assert len(distinct_salts) == 3  # all unique
+
+    def test_hash_code_static_method(self, tmp_path):
+        """_hash_code is repeatable for one salt and differs across salts."""
+        first_salt = os.urandom(16)
+        baseline = PairingStore._hash_code("ABCD1234", first_salt)
+        assert PairingStore._hash_code("ABCD1234", first_salt) == baseline
+        # Same code, fresh salt: the digest must change.
+        second_salt = os.urandom(16)
+        assert PairingStore._hash_code("ABCD1234", second_salt) != baseline
+
+
+class TestLegacyPendingFileCompat:
+    """Upgrade safety for pending files written before hashed storage.
+
+    Older installs may still hold a pending.json whose keys are plaintext
+    codes and whose entries lack hash/salt fields. approve_code,
+    list_pending and _cleanup_expired must tolerate such entries: no
+    crash, no approval, and removal once the TTL has passed.
+    """
+
+    @staticmethod
+    def _write_legacy(tmp_path, code="ABCD1234", created_at=None):
+        """Create an old-format pending file keyed by the plaintext code."""
+        import time as _time
+        timestamp = _time.time() if created_at is None else created_at
+        payload = {
+            code: {
+                "user_id": "legacy-user",
+                "user_name": "Legacy",
+                "created_at": timestamp,
+            }
+        }
+        target = tmp_path / "telegram-pending.json"
+        target.write_text(json.dumps(payload), encoding="utf-8")
+
+    @staticmethod
+    def _read_pending(tmp_path):
+        """Load the pending file back from disk as a dict."""
+        text = (tmp_path / "telegram-pending.json").read_text(encoding="utf-8")
+        return json.loads(text)
+
+    def test_approve_code_ignores_legacy_entries(self, tmp_path):
+        """An old-format code is not accepted by the hashed approve path."""
+        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
+            self._write_legacy(tmp_path, code="LEGACY01")
+            store = PairingStore()
+            # The legacy entry has no hash/salt to check against, so the
+            # approval is expected to fail and nobody becomes approved.
+            assert store.approve_code("telegram", "LEGACY01") is None
+            assert store.is_approved("telegram", "legacy-user") is False
+
+    def test_list_pending_handles_legacy_entries(self, tmp_path):
+        """Listing works on entries with no 'hash' field and shows a placeholder code."""
+        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
+            self._write_legacy(tmp_path)
+            listed = PairingStore().list_pending("telegram")
+        assert len(listed) == 1
+        only = listed[0]
+        assert only["user_id"] == "legacy-user"
+        assert only["code"] == "legacy"  # placeholder
+
+    def test_cleanup_expired_removes_legacy_at_ttl(self, tmp_path):
+        """An old-format entry older than CODE_TTL_SECONDS is pruned."""
+        import time as _time
+        expired_at = _time.time() - CODE_TTL_SECONDS - 1
+        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
+            self._write_legacy(tmp_path, code="LEGACY99", created_at=expired_at)
+            PairingStore()._cleanup_expired("telegram")
+            remaining = self._read_pending(tmp_path)
+        assert remaining == {}
+
+    def test_cleanup_expired_handles_malformed_entries(self, tmp_path):
+        """Entries that are non-dicts or lack a numeric created_at are evicted."""
+        malformed = {
+            "broken1": "not a dict",
+            "broken2": {"user_id": "x"},  # no created_at
+            "broken3": {"created_at": "not a number"},
+        }
+        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
+            (tmp_path / "telegram-pending.json").write_text(
+                json.dumps(malformed), encoding="utf-8"
+            )
+            PairingStore()._cleanup_expired("telegram")
+            remaining = self._read_pending(tmp_path)
+        assert remaining == {}
+
+    def test_approve_code_skips_malformed_entries(self, tmp_path):
+        """A non-hex salt in a stored entry does not crash approve_code."""
+        import time as _time
+        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
+            malformed = {
+                "broken": {
+                    "user_id": "x",
+                    "created_at": _time.time(),
+                    "salt": "not-hex",
+                    "hash": "doesntmatter",
+                },
+            }
+            (tmp_path / "telegram-pending.json").write_text(
+                json.dumps(malformed), encoding="utf-8"
+            )
+            store = PairingStore()
+            # Any code should simply be rejected.
+            assert store.approve_code("telegram", "ABCD1234") is None


 # ---------------------------------------------------------------------------
@@ -300,9 +488,10 @@ class TestCodeExpiry:
            store = PairingStore()
            code = store.generate_code("telegram", "user1")

-            # Manually expire the code
+            # Manually expire all pending entries
            pending = store._load_json(store._pending_path("telegram"))
-            pending[code]["created_at"] = time.time() - CODE_TTL_SECONDS - 1
+            for entry_id in pending:
+                pending[entry_id]["created_at"] = time.time() - CODE_TTL_SECONDS - 1
            store._save_json(store._pending_path("telegram"), pending)

            # Cleanup happens on next operation
@@ -314,9 +503,10 @@ class TestCodeExpiry:
            store = PairingStore()
            code = store.generate_code("telegram", "user1")

-            # Expire it
+            # Expire all entries
            pending = store._load_json(store._pending_path("telegram"))
-            pending[code]["created_at"] = time.time() - CODE_TTL_SECONDS - 1
+            for entry_id in pending:
+                pending[entry_id]["created_at"] = time.time() - CODE_TTL_SECONDS - 1
            store._save_json(store._pending_path("telegram"), pending)

            result = store.approve_code("telegram", code)
@@ -6,7 +6,11 @@ import pytest
 from pathlib import Path

 from gateway.config import PlatformConfig
-from gateway.platforms.webhook import WebhookAdapter, _DYNAMIC_ROUTES_FILENAME
+from gateway.platforms.webhook import (
+    WebhookAdapter,
+    _DYNAMIC_ROUTES_FILENAME,
+    _INSECURE_NO_AUTH,
+)


 def _make_adapter(routes=None, extra=None):
@@ -85,3 +89,78 @@ class TestDynamicRouteLoading:
        adapter._reload_dynamic_routes()
        assert "static" in adapter._routes
        assert len(adapter._dynamic_routes) == 0
+
+
+class TestDynamicRouteSecretValidation:
+    """Hot-reload must drop dynamic routes whose secret is empty or absent.
+
+    Regression for an HMAC bypass: a dynamic route carrying an empty
+    secret used to be merged into self._routes by
+    _reload_dynamic_routes(). _handle_webhook's
+    `if secret and secret != _INSECURE_NO_AUTH` check then skipped
+    signature validation because an empty string is falsy, so an
+    unauthenticated POST could run the webhook prompt.
+    """
+
+    @staticmethod
+    def _reloaded_adapter(tmp_path, routes, **adapter_kwargs):
+        """Write *routes* to the dynamic-routes file, build an adapter, reload it."""
+        routes_file = tmp_path / _DYNAMIC_ROUTES_FILENAME
+        routes_file.write_text(json.dumps(routes))
+        adapter = _make_adapter(**adapter_kwargs)
+        adapter._reload_dynamic_routes()
+        return adapter
+
+    def test_empty_secret_rejected(self, tmp_path):
+        # An explicit "" secret must not inherit the global secret; the
+        # route is dropped altogether. The adapter has a global secret.
+        adapter = self._reloaded_adapter(
+            tmp_path, {"evil": {"secret": "", "prompt": "rm -rf"}}
+        )
+        assert "evil" not in adapter._routes
+        assert "evil" not in adapter._dynamic_routes
+
+    def test_missing_secret_no_global_rejected(self, tmp_path):
+        # Neither a per-route nor a global secret is configured.
+        adapter = self._reloaded_adapter(
+            tmp_path, {"orphan": {"prompt": "test"}}, extra={"secret": ""}
+        )
+        assert "orphan" not in adapter._routes
+        assert "orphan" not in adapter._dynamic_routes
+
+    def test_missing_secret_inherits_global(self, tmp_path):
+        # The route has no secret of its own but a global one exists, so
+        # the route stays and the existing fallback is preserved.
+        adapter = self._reloaded_adapter(tmp_path, {"valid": {"prompt": "ok"}})
+        assert "valid" in adapter._routes
+
+    def test_insecure_no_auth_preserved(self, tmp_path):
+        # The explicit opt-in escape hatch for local testing still loads.
+        adapter = self._reloaded_adapter(
+            tmp_path, {"test": {"secret": _INSECURE_NO_AUTH, "prompt": "p"}}
+        )
+        assert "test" in adapter._routes
+
+    def test_warning_logged_on_skip(self, tmp_path, caplog):
+        import logging
+        routes_file = tmp_path / _DYNAMIC_ROUTES_FILENAME
+        routes_file.write_text(
+            json.dumps({"silent": {"secret": "", "prompt": "x"}})
+        )
+        adapter = _make_adapter()
+        with caplog.at_level(logging.WARNING, logger="gateway.platforms.webhook"):
+            adapter._reload_dynamic_routes()
+        logged = [rec.message for rec in caplog.records]
+        assert any("silent" in message for message in logged)
+
+    def test_partial_skip(self, tmp_path):
+        # A bad route next to a good one: only the bad route is dropped.
+        routes = {
+            "bad":  {"secret": "", "prompt": "x"},
+            "good": {"secret": "valid-secret", "prompt": "y"},
+        }
+        adapter = self._reloaded_adapter(tmp_path, routes)
+        assert "good" in adapter._routes
+        assert "bad" not in adapter._routes
@@ -0,0 +1,131 @@
+"""Tests for curses color compatibility on low-color terminals (Docker).
+
+Regression test for #13688: ``hermes plugins`` crashes with
+``curses.error: init_pair() : color number is greater than COLORS-1``
+in Docker containers where curses.COLORS == 8 (only colors 0-7 exist).
+
+The bug was ``curses.init_pair(4, 8, -1)`` using raw color 8 ("bright
+black" / dim gray) which does not exist on 8-color terminals.  The fix
+clamps with ``min(8, curses.COLORS - 1)``.
+"""
+
+import curses
+import re
+from pathlib import Path
+from unittest.mock import patch, MagicMock, call
+
+import pytest
+
+
+# Path to the source files under test
+_SRC_ROOT = Path(__file__).parent.parent.parent / "hermes_cli"
+
+
+class TestInitPairClampingBehavior:
+    """Simulate curses color initialization on low-color terminals.
+
+    Patches curses.COLORS (8 is the Docker default) and verifies which
+    foreground colors reach init_pair.
+    """
+
+    @staticmethod
+    def _simulated_color_init(stdscr):
+        """Replay the color-init sequence these tests model after plugins_cmd.py.
+
+        Pair 4 uses color 8 only when the terminal reports more than 8
+        colors; otherwise it falls back to COLOR_WHITE.
+        """
+        if curses.has_colors():
+            curses.start_color()
+            curses.use_default_colors()
+            curses.init_pair(1, curses.COLOR_GREEN, -1)
+            curses.init_pair(2, curses.COLOR_YELLOW, -1)
+            curses.init_pair(3, curses.COLOR_CYAN, -1)
+            curses.init_pair(4, 8 if curses.COLORS > 8 else curses.COLOR_WHITE, -1)
+
+    def _collect_init_pair_calls(self, draw_fn, colors_value):
+        """Run a curses draw function with a mock stdscr and patched COLORS.
+
+        Args:
+            draw_fn: Callable taking the mock ``stdscr``.
+            colors_value: Value patched in as ``curses.COLORS``.
+
+        Returns:
+            List of (pair_number, fg, bg) tuples from init_pair calls.
+        """
+        calls = []
+
+        def tracking_init_pair(pair, fg, bg):
+            calls.append((pair, fg, bg))
+
+        mock_stdscr = MagicMock()
+        mock_stdscr.getmaxyx.return_value = (24, 80)
+        mock_stdscr.getch.return_value = 27  # ESC to exit
+
+        with patch("curses.COLORS", colors_value, create=True), \
+             patch("curses.init_pair", side_effect=tracking_init_pair), \
+             patch("curses.has_colors", return_value=True), \
+             patch("curses.start_color"), \
+             patch("curses.use_default_colors"), \
+             patch("curses.curs_set"):
+            try:
+                draw_fn(mock_stdscr)
+            except (SystemExit, Exception):
+                # Draw functions loop until keypress and may bail out in
+                # arbitrary ways; callers must assert on the recorded
+                # calls so a swallowed error cannot pass silently.
+                pass
+
+        return calls
+
+    def test_8_color_terminal_no_color_exceeds_limit(self):
+        """On an 8-color terminal (Docker), no init_pair fg color >= 8."""
+        calls = self._collect_init_pair_calls(self._simulated_color_init, 8)
+        # Guard against a vacuous pass: the loop below proves nothing if
+        # the init sequence never reached init_pair.
+        assert calls, "color init made no init_pair calls"
+        for pair, fg, bg in calls:
+            assert fg < 8, (
+                f"init_pair({pair}, {fg}, {bg}) uses color {fg} which "
+                f"does not exist on an 8-color terminal (valid: 0-7)"
+            )
+
+    def test_256_color_terminal_uses_color_8(self):
+        """On a 256-color terminal, color 8 (dim gray) should be used."""
+        calls = self._collect_init_pair_calls(self._simulated_color_init, 256)
+        assert any(fg == 8 for _, fg, _ in calls), (
+            "On 256-color terminals, color 8 (dim gray) should be used"
+        )
+
+    def test_16_color_terminal_uses_color_8(self):
+        """On a 16-color terminal, color 8 should be available."""
+        calls = self._collect_init_pair_calls(self._simulated_color_init, 16)
+        assert any(fg == 8 for _, fg, _ in calls)
+
+
+class TestSourceCodeGuardrails:
+    """Regression guardrails: raw color 8 must not reappear in source.
+
+    These complement the behavioral tests above — they catch regressions
+    introduced by copy-paste of the old pattern.
+    """
+
+    # Matches ``init_pair(<n>, 8,`` — a literal 8 as the foreground color.
+    _RAW_COLOR_8_PATTERN = re.compile(r'init_pair\(\d+,\s*8\s*,')
+
+    def _assert_no_raw_color_8(self, filename):
+        """Fail if *filename* under the CLI source root passes a literal 8 to init_pair."""
+        # Decode explicitly as UTF-8 so the result does not depend on the
+        # platform's locale encoding.
+        source = (_SRC_ROOT / filename).read_text(encoding="utf-8")
+        matches = self._RAW_COLOR_8_PATTERN.findall(source)
+        assert not matches, (
+            f"{filename} contains unclamped color 8: {matches}"
+        )
+
+    def test_no_raw_color_8_in_plugins_cmd(self):
+        self._assert_no_raw_color_8("plugins_cmd.py")
+
+    def test_no_raw_color_8_in_main(self):
+        self._assert_no_raw_color_8("main.py")
+
+    def test_no_raw_color_8_in_curses_ui(self):
+        self._assert_no_raw_color_8("curses_ui.py")
@@ -69,18 +69,19 @@ class TestPluginPickerInjection:
        assert "Myimg" in names
        assert "myimg" in plugin_names

-    def test_fal_skipped_to_avoid_duplicate(self, monkeypatch):
+    def test_fal_surfaced_alongside_other_plugins(self, monkeypatch):
+        """A registered ``fal`` provider must appear in the plugin picker rows."""
         from hermes_cli import tools_config

-        # Simulate a FAL plugin being registered — the picker already has
-        # hardcoded FAL rows in TOOL_CATEGORIES, so plugin-FAL must be
-        # skipped to avoid showing FAL twice.
+        # After #26241, FAL is itself a plugin (`plugins/image_gen/fal/`)
+        # and the hardcoded `TOOL_CATEGORIES["image_gen"]` FAL row is
+        # gone. The plugin-row builder therefore surfaces it like any
+        # other backend — no deduplication step needed.
         image_gen_registry.register_provider(_FakeProvider("fal"))
         image_gen_registry.register_provider(_FakeProvider("openai"))

         rows = tools_config._plugin_image_gen_providers()
         names = [r.get("image_gen_plugin_name") for r in rows]
-        assert "fal" not in names
+        assert "fal" in names
         assert "openai" in names

    def test_visible_providers_includes_plugins_for_image_gen(self, monkeypatch):
@@ -1,4 +1,4 @@
-"""Tests for ``install_cua_driver`` upgrade semantics.
+"""Tests for ``install_cua_driver`` upgrade semantics and architecture pre-check.

 The cua-driver upstream installer always pulls the latest release tag, so
 re-running it is the canonical upgrade path. ``install_cua_driver(upgrade=True)``
@@ -10,18 +10,18 @@ must:
  fix for the "we only pulled cua-driver once on enable" complaint).
 * Preserve original ``upgrade=False`` behaviour for the toolset-enable flow:
  skip if installed, install otherwise, warn on non-macOS.
+* Pre-check architecture compatibility before downloading to avoid raw 404
+  errors on Intel macOS when the upstream release lacks x86_64 assets.
 """

 from __future__ import annotations

-from unittest.mock import patch
+import json
+from unittest.mock import MagicMock, patch


 class TestInstallCuaDriverUpgrade:
    def test_upgrade_on_non_macos_is_silent_noop(self):
-        """``hermes update`` calls install_cua_driver(upgrade=True) for every
-        user. On Linux/Windows it must return False without printing the
-        "macOS-only; skipping" warning that the toolset-enable path emits."""
        from hermes_cli import tools_config

        with patch.object(tools_config, "_print_warning") as warn, \
@@ -30,8 +30,6 @@ class TestInstallCuaDriverUpgrade:
            warn.assert_not_called()

    def test_non_upgrade_on_non_macos_warns(self):
-        """The toolset-enable path (upgrade=False) should still warn loudly
-        when the user tries to enable Computer Use on a non-macOS host."""
        from hermes_cli import tools_config

        with patch.object(tools_config, "_print_warning") as warn, \
@@ -40,43 +38,36 @@ class TestInstallCuaDriverUpgrade:
            warn.assert_called()

     def test_upgrade_on_macos_with_binary_runs_installer(self):
-        """When cua-driver is already on PATH and upgrade=True, we must
-        re-run the upstream installer (this is the fix for the bug report).
-        """
+        """With cua-driver and curl on PATH, upgrade=True must run the
+        installer exactly once and pass ``verbose=False``.
+        """
         from hermes_cli import tools_config

         with patch("platform.system", return_value="Darwin"), \
              patch.object(tools_config.shutil, "which",
                           side_effect=lambda n: "/usr/local/bin/" + n
                                                  if n in {"cua-driver", "curl"} else None), \
+             patch.object(tools_config, "_check_cua_driver_asset_for_arch",
+                          return_value=True), \
              patch.object(tools_config, "_run_cua_driver_installer",
                           return_value=True) as runner, \
              patch("subprocess.run"):
             assert tools_config.install_cua_driver(upgrade=True) is True
             runner.assert_called_once()
-            # Refresh path uses non-verbose mode so we don't re-print the
-            # "grant macOS permissions" block on every `hermes update`.
+            # The upgrade path is expected to call the installer non-verbosely.
             kwargs = runner.call_args.kwargs
             assert kwargs.get("verbose") is False

     def test_upgrade_on_macos_without_binary_runs_installer(self):
-        """upgrade=True with cua-driver missing must still trigger an
-        install — equivalent to a fresh install. (Don't silently no-op.)"""
+        """upgrade=True with cua-driver missing (only curl on PATH) must
+        still invoke the installer rather than silently no-op."""
         from hermes_cli import tools_config

         with patch("platform.system", return_value="Darwin"), \
              patch.object(tools_config.shutil, "which",
                           side_effect=lambda n: "/usr/bin/curl" if n == "curl" else None), \
+             patch.object(tools_config, "_check_cua_driver_asset_for_arch",
+                          return_value=True), \
              patch.object(tools_config, "_run_cua_driver_installer",
                           return_value=True) as runner:
             assert tools_config.install_cua_driver(upgrade=True) is True
             runner.assert_called_once()

    def test_non_upgrade_on_macos_with_binary_skips_install(self):
-        """Original toolset-enable behaviour: cua-driver already installed
-        + upgrade=False → confirm and return without re-running installer.
-        This is the behaviour that ``hermes tools`` (re)enable depends on,
-        so the new helper must not regress it."""
        from hermes_cli import tools_config

        with patch("platform.system", return_value="Darwin"), \
@@ -89,27 +80,133 @@ class TestInstallCuaDriverUpgrade:
            runner.assert_not_called()

    def test_non_upgrade_on_macos_without_binary_runs_installer(self):
-        """Original fresh-install path must still work."""
        from hermes_cli import tools_config

        with patch("platform.system", return_value="Darwin"), \
             patch.object(tools_config.shutil, "which",
                          side_effect=lambda n: "/usr/bin/curl" if n == "curl" else None), \
+             patch.object(tools_config, "_check_cua_driver_asset_for_arch",
+                          return_value=True), \
             patch.object(tools_config, "_run_cua_driver_installer",
                          return_value=True) as runner:
            assert tools_config.install_cua_driver(upgrade=False) is True
-            runner.assert_called_once()

-    def test_upgrade_without_curl_does_not_crash(self):
-        """If curl isn't on PATH we can't refresh — must warn and return
-        the current install state, not raise."""
+
+class TestCheckCuaDriverAssetForArch:
+    def test_arm64_always_returns_true(self):
+        """With platform.machine() reporting arm64, the pre-check returns True."""
         from hermes_cli import tools_config

-        # cua-driver present, curl missing.
-        def _which(name):
-            return "/usr/local/bin/cua-driver" if name == "cua-driver" else None
+        with patch("platform.machine", return_value="arm64"):
+            assert tools_config._check_cua_driver_asset_for_arch() is True
+
+    def test_x86_64_with_asset_returns_true(self):
+        """An x86_64 asset in the mocked release makes the pre-check pass."""
+        from hermes_cli import tools_config
+
+        asset_names = (
+            "cua-driver-0.1.6-darwin-arm64.tar.gz",
+            "cua-driver-0.1.6-darwin-x86_64.tar.gz",
+        )
+        payload = json.dumps({
+            "tag_name": "cua-driver-v0.1.6",
+            "assets": [{"name": asset} for asset in asset_names],
+        }).encode()
+
+        # Stand-in for the urlopen() result; configured to work under ``with``.
+        response = MagicMock()
+        response.read.return_value = payload
+        response.__enter__.return_value = response
+        response.__exit__.return_value = False
+
+        with patch("platform.machine", return_value="x86_64"):
+            with patch("urllib.request.urlopen", return_value=response):
+                result = tools_config._check_cua_driver_asset_for_arch()
+        assert result is True
+
+    def test_x86_64_without_asset_returns_false(self):
+        """No x86_64 asset in the release: return False and warn exactly once."""
+        from hermes_cli import tools_config
+
+        # Neither asset name mentions x86_64 (one is arm64, one is arch-less).
+        release = {
+            "tag_name": "cua-driver-v0.1.6",
+            "assets": [
+                {"name": "cua-driver-0.1.6-darwin-arm64.tar.gz"},
+                {"name": "cua-driver.tar.gz"},
+            ],
+        }
+        mock_resp = MagicMock()
+        mock_resp.read.return_value = json.dumps(release).encode()
+        mock_resp.__enter__ = lambda s: s
+        mock_resp.__exit__ = MagicMock(return_value=False)
+
+        with patch("platform.machine", return_value="x86_64"), \
+             patch("urllib.request.urlopen", return_value=mock_resp), \
+             patch.object(tools_config, "_print_warning") as warn, \
+             patch.object(tools_config, "_print_info"):
+            assert tools_config._check_cua_driver_asset_for_arch() is False
+            warn.assert_called_once()
+            message = warn.call_args[0][0]
+            # The needle must be lowercase too, otherwise it can never match
+            # the lowercased message and this half of the check is dead.
+            assert "no intel" in message.lower() or "x86_64" in message
+
+    def test_x86_64_api_failure_returns_true(self):
+        """When the release lookup raises, the pre-check must fail open (True)."""
+        from hermes_cli import tools_config
+
+        lookup_error = Exception("timeout")
+        with patch("platform.machine", return_value="x86_64"):
+            with patch("urllib.request.urlopen", side_effect=lookup_error):
+                outcome = tools_config._check_cua_driver_asset_for_arch()
+        assert outcome is True
+
+    def test_fresh_install_x86_64_no_asset_skips_installer(self):
+        """When the latest release has no Intel asset, skip the installer."""
+        from hermes_cli import tools_config
+
+        # The mocked release lists only an arm64 tarball.
+        release = {
+            "tag_name": "cua-driver-v0.1.6",
+            "assets": [{"name": "cua-driver-0.1.6-darwin-arm64.tar.gz"}],
+        }
+        mock_resp = MagicMock()
+        mock_resp.read.return_value = json.dumps(release).encode()
+        mock_resp.__enter__ = lambda s: s
+        mock_resp.__exit__ = MagicMock(return_value=False)

         with patch("platform.system", return_value="Darwin"), \
-             patch.object(tools_config.shutil, "which", side_effect=_which), \
-             patch.object(tools_config, "_print_warning"):
+             patch.object(tools_config.shutil, "which",
+                          side_effect=lambda n: "/usr/bin/curl" if n == "curl" else None), \
+             patch("platform.machine", return_value="x86_64"), \
+             patch("urllib.request.urlopen", return_value=mock_resp), \
+             patch.object(tools_config, "_print_warning"), \
+             patch.object(tools_config, "_print_info"), \
+             patch.object(tools_config, "_run_cua_driver_installer") as runner:
+            assert tools_config.install_cua_driver(upgrade=False) is False
+            runner.assert_not_called()
+
+    def test_upgrade_x86_64_no_asset_returns_existing_status(self):
+        """On upgrade with no Intel asset, return whether binary existed."""
+        from hermes_cli import tools_config
+
+        # The mocked release lists only an arm64 tarball.
+        release = {
+            "tag_name": "cua-driver-v0.1.6",
+            "assets": [{"name": "cua-driver-0.1.6-darwin-arm64.tar.gz"}],
+        }
+        # read() hands back the same payload on every call, so one mock
+        # response serves both scenarios below.
+        mock_resp = MagicMock()
+        mock_resp.read.return_value = json.dumps(release).encode()
+        mock_resp.__enter__ = lambda s: s
+        mock_resp.__exit__ = MagicMock(return_value=False)
+
+        # With binary installed — returns True (binary exists)
+        with patch("platform.system", return_value="Darwin"), \
+             patch.object(tools_config.shutil, "which",
+                          side_effect=lambda n: "/usr/local/bin/" + n
+                                                 if n in ("cua-driver", "curl") else None), \
+             patch("platform.machine", return_value="x86_64"), \
+             patch("urllib.request.urlopen", return_value=mock_resp), \
+             patch.object(tools_config, "_print_warning"), \
+             patch.object(tools_config, "_print_info"), \
+             patch.object(tools_config, "_run_cua_driver_installer") as runner:
             assert tools_config.install_cua_driver(upgrade=True) is True
+            runner.assert_not_called()
+
+        # Without binary — returns False
+        with patch("platform.system", return_value="Darwin"), \
+             patch.object(tools_config.shutil, "which",
+                          side_effect=lambda n: "/usr/bin/curl" if n == "curl" else None), \
+             patch("platform.machine", return_value="x86_64"), \
+             patch("urllib.request.urlopen", return_value=mock_resp), \
+             patch.object(tools_config, "_print_warning"), \
+             patch.object(tools_config, "_print_info"), \
+             patch.object(tools_config, "_run_cua_driver_installer") as runner:
+            assert tools_config.install_cua_driver(upgrade=True) is False
+            runner.assert_not_called()
@@ -7,6 +7,7 @@ printf) to verify it behaves like a PTY you can read/write/resize/close.
 from __future__ import annotations

 import os
+import shutil
 import sys
 import time

@@ -66,7 +67,7 @@ class TestPtyBridgeIO:
    def test_write_sends_to_child_stdin(self):
        # `cat` with no args echoes stdin back to stdout.  We write a line,
        # read it back, then signal EOF to let cat exit cleanly.
-        bridge = PtyBridge.spawn(["/bin/cat"])
+        bridge = PtyBridge.spawn([shutil.which("cat") or "cat"])
        try:
            bridge.write(b"hello-pty\n")
            output = _read_until(bridge, b"hello-pty")
@@ -1631,6 +1631,33 @@ def test_named_custom_runtime_propagates_model_direct_path(monkeypatch):
    assert resolved["provider"] == "custom"


+def test_named_custom_runtime_propagates_extra_body_direct_path(monkeypatch):
+    """A named custom provider's ``extra_body`` surfaces as ``request_overrides``.
+
+    The credential-pool lookup is stubbed to return ``None`` for this case.
+    """
+    def _fake_named_provider(p):
+        # Build a fresh entry on every call.
+        return {
+            "name": "my-gemma",
+            "base_url": "http://localhost:8000/v1",
+            "api_key": "test-key",
+            "model": "google/gemma-4-31b-it",
+            "extra_body": {
+                "enable_thinking": True,
+                "reasoning_effort": "high",
+            },
+        }
+
+    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "my-gemma")
+    monkeypatch.setattr(rp, "_get_named_custom_provider", _fake_named_provider)
+    monkeypatch.setattr(rp, "_try_resolve_from_custom_pool", lambda *a, **k: None)
+
+    runtime = rp.resolve_runtime_provider(requested="my-gemma")
+
+    expected_overrides = {
+        "extra_body": {
+            "enable_thinking": True,
+            "reasoning_effort": "high",
+        }
+    }
+    assert runtime["request_overrides"] == expected_overrides
+
+
 def test_named_custom_runtime_propagates_model_pool_path(monkeypatch):
    """Model should propagate even when credential pool handles credentials."""
    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "my-server")
@@ -1662,6 +1689,36 @@ def test_named_custom_runtime_propagates_model_pool_path(monkeypatch):
    assert resolved["api_key"] == "pool-key", "pool credentials should be used"


+def test_named_custom_runtime_propagates_extra_body_pool_path(monkeypatch):
+    """``extra_body`` must still reach ``request_overrides`` when the
+    credential pool supplies the credentials."""
+    def _fake_named_provider(p):
+        # Build a fresh entry on every call.
+        return {
+            "name": "my-gemma",
+            "base_url": "http://localhost:8000/v1",
+            "api_key": "test-key",
+            "model": "google/gemma-4-31b-it",
+            "extra_body": {"enable_thinking": True},
+        }
+
+    def _fake_pool_resolution(*a, **k):
+        # The pool result deliberately carries no extra_body of its own.
+        return {
+            "provider": "custom",
+            "api_mode": "chat_completions",
+            "base_url": "http://localhost:8000/v1",
+            "api_key": "pool-key",
+            "source": "pool:custom:my-gemma",
+        }
+
+    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "my-gemma")
+    monkeypatch.setattr(rp, "_get_named_custom_provider", _fake_named_provider)
+    monkeypatch.setattr(rp, "_try_resolve_from_custom_pool", _fake_pool_resolution)
+
+    runtime = rp.resolve_runtime_provider(requested="my-gemma")
+
+    expected_overrides = {"extra_body": {"enable_thinking": True}}
+    assert runtime["request_overrides"] == expected_overrides
+
+
 def test_named_custom_runtime_no_model_when_absent(monkeypatch):
    """When custom_providers entry has no model field, runtime should not either."""
    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "my-server")
@@ -2150,6 +2207,24 @@ class TestProviderEntryApiKeyEnvAlias:
        key_env so the set stays in sync with what the runtime actually reads."""
        from hermes_cli.config import _VALID_CUSTOM_PROVIDER_FIELDS
        assert "key_env" in _VALID_CUSTOM_PROVIDER_FIELDS
+
+    def test_extra_body_is_supported_schema(self):
+        """``extra_body`` is a valid custom-provider field and is kept
+        unchanged by entry normalization."""
+        from hermes_cli.config import (
+            _VALID_CUSTOM_PROVIDER_FIELDS,
+            _normalize_custom_provider_entry,
+        )
+        extra_body = {
+            "chat_template_kwargs": {"enable_thinking": True},
+            "include_reasoning": True,
+        }
+        entry = {
+            "name": "vendor",
+            "base_url": "https://api.vendor.example.com/v1",
+            "extra_body": extra_body,
+        }
+        # Normalize a shallow copy so ``entry`` keeps its original top-level keys.
+        result = _normalize_custom_provider_entry(dict(entry), provider_key="vendor")
+        assert result is not None
+        assert "extra_body" in _VALID_CUSTOM_PROVIDER_FIELDS
+        assert result["extra_body"] == entry["extra_body"]
 # =============================================================================
 # Tencent TokenHub — API-key provider runtime resolution
 # =============================================================================
@@ -90,6 +90,7 @@ def test_show_status_reports_managed_nous_features(monkeypatch, capsys, tmp_path
                "tts": NousFeatureState("tts", "OpenAI TTS", True, True, True, True, False, True, "OpenAI TTS"),
                "browser": NousFeatureState("browser", "Browser automation", True, True, True, True, False, True, "Browser Use"),
                "modal": NousFeatureState("modal", "Modal execution", False, True, False, False, False, True, "local"),
+                "app_tools": NousFeatureState("app_tools", "App tools (500+ apps)", True, True, True, True, False, True, "Nous Subscription"),
            },
        ),
        raising=False,
@@ -12,8 +12,10 @@ from hermes_cli.tools_config import (
    _get_platform_tools,
    _platform_toolset_summary,
    _reconfigure_tool,
+    _run_post_setup,
    _save_platform_tools,
    _toolset_has_keys,
+    _toolset_needs_configuration_prompt,
    CONFIGURABLE_TOOLSETS,
    TOOL_CATEGORIES,
    _visible_providers,
@@ -752,6 +754,91 @@ def test_numeric_mcp_server_name_does_not_crash_sorted():

 # ─── Imagegen Backend Picker Wiring ────────────────────────────────────────

+def test_toolset_has_keys_treats_no_key_providers_as_configured():
+    """With an empty config, ``computer_use`` must still count as having keys."""
+    empty_config = {}
+    has_keys = _toolset_has_keys("computer_use", empty_config)
+    assert has_keys is True
+
+
+def test_computer_use_needs_configuration_when_cua_driver_post_setup_pending():
+    """A provider without API keys still needs setup while post_setup is pending.
+
+    Returning users who enable Computer Use via `hermes tools` have to reach
+    the cua-driver post-setup installer even though no API key is involved.
+    """
+    # Nothing is found on PATH, so the driver counts as not installed.
+    with patch("shutil.which", return_value=None):
+        needs_prompt = _toolset_needs_configuration_prompt("computer_use", {})
+    assert needs_prompt is True
+
+
+def test_computer_use_skips_configuration_when_cua_driver_already_installed(monkeypatch):
+    """Installed post_setup dependencies should keep returning-user toggles no-op."""
+    # The setup gate honours HERMES_CUA_DRIVER_CMD; clear any inherited value
+    # so this test really exercises the default ``cua-driver`` binary name.
+    monkeypatch.delenv("HERMES_CUA_DRIVER_CMD", raising=False)
+
+    def fake_which(name: str):
+        return "/usr/local/bin/cua-driver" if name == "cua-driver" else None
+
+    with patch("shutil.which", side_effect=fake_which):
+        assert _toolset_needs_configuration_prompt("computer_use", {}) is False
+
+
+def test_computer_use_respects_custom_cua_driver_command():
+    """An installed HERMES_CUA_DRIVER_CMD override satisfies the setup gate."""
+    def fake_which(name: str):
+        # Only the override command resolves; the default name does not.
+        if name == "custom-cua":
+            return "/opt/bin/custom-cua"
+        return None
+
+    with patch.dict("os.environ", {"HERMES_CUA_DRIVER_CMD": "custom-cua"}):
+        with patch("shutil.which", side_effect=fake_which):
+            needs_prompt = _toolset_needs_configuration_prompt("computer_use", {})
+    assert needs_prompt is False
+
+
+def test_computer_use_blank_custom_driver_command_falls_back_to_default():
+    """A whitespace-only override must be ignored in favour of ``cua-driver``."""
+    def fake_which(name: str):
+        # Only the default binary name resolves.
+        if name == "cua-driver":
+            return "/usr/local/bin/cua-driver"
+        return None
+
+    with patch.dict("os.environ", {"HERMES_CUA_DRIVER_CMD": "   "}):
+        with patch("shutil.which", side_effect=fake_which):
+            needs_prompt = _toolset_needs_configuration_prompt("computer_use", {})
+    assert needs_prompt is False
+
+
+def test_computer_use_post_setup_respects_custom_driver_command_when_installed():
+    """With the override installed, post_setup runs ``<override> --version`` once."""
+    def fake_which(name: str):
+        if name == "custom-cua":
+            return "/opt/bin/custom-cua"
+        return None
+
+    with patch.dict("os.environ", {"HERMES_CUA_DRIVER_CMD": "custom-cua"}):
+        with patch("platform.system", return_value="Darwin"):
+            with patch("shutil.which", side_effect=fake_which):
+                with patch("subprocess.run") as run:
+                    run.return_value.stdout = "custom 1.2.3\n"
+                    _run_post_setup("cua_driver")
+
+    run.assert_called_once()
+    positional, _ = run.call_args
+    assert positional[0] == ["custom-cua", "--version"]
+
+
+def test_computer_use_post_setup_missing_override_does_not_accept_default_binary():
+    """If the override command is absent, an installed default must not count."""
+    looked_up = []
+
+    def fake_which(name: str):
+        # Record every lookup. Only the default binary exists; the override
+        # and curl both resolve to nothing.
+        looked_up.append(name)
+        return "/usr/local/bin/cua-driver" if name == "cua-driver" else None
+
+    with patch.dict("os.environ", {"HERMES_CUA_DRIVER_CMD": "custom-cua"}):
+        with patch("platform.system", return_value="Darwin"):
+            with patch("shutil.which", side_effect=fake_which):
+                with patch("subprocess.run") as run:
+                    _run_post_setup("cua_driver")
+
+    run.assert_not_called()
+    for expected_lookup in ("custom-cua", "curl"):
+        assert expected_lookup in looked_up
+
+
 class TestImagegenBackendRegistry:
    """IMAGEGEN_BACKENDS tags drive the model picker flow in tools_config."""

@@ -168,7 +168,7 @@ def test_make_tui_argv_skips_build_only_on_termux_when_fresh(

    argv, cwd = main_mod._make_tui_argv(tmp_path, tui_dev=False)

-    assert argv == ["/bin/node", str(tmp_path / "dist" / "entry.js")]
+    assert argv == ["/bin/node", "--expose-gc", str(tmp_path / "dist" / "entry.js")]
    assert cwd == tmp_path


@@ -283,6 +283,233 @@ def test_fast_tui_launch_is_termux_only(monkeypatch, main_mod):
    assert main_mod._try_termux_fast_tui_launch() is False


+def test_termux_fast_cli_launch_chat_uses_light_parser(monkeypatch, main_mod):
+    """``hermes chat ...`` on Termux is handled by the fast path, which runs
+    startup preparation and then ``cmd_chat`` with the parsed arguments."""
+    captured = {}
+    prepared = []
+
+    monkeypatch.setenv("TERMUX_VERSION", "1")
+    monkeypatch.delenv("HERMES_TUI", raising=False)
+    # An inherited opt-out flag would make the fast path decline; clear it.
+    monkeypatch.delenv("HERMES_TERMUX_DISABLE_FAST_CLI", raising=False)
+    monkeypatch.setattr(
+        sys, "argv", ["hermes", "chat", "-q", "hello", "--toolsets", "web,terminal"]
+    )
+    monkeypatch.setattr(
+        main_mod, "_prepare_agent_startup", lambda args: prepared.append(args.command)
+    )
+    monkeypatch.setattr(
+        main_mod,
+        "cmd_chat",
+        lambda args: captured.update(
+            {"query": args.query, "toolsets": args.toolsets, "command": args.command}
+        ),
+    )
+
+    assert main_mod._try_termux_fast_cli_launch() is True
+    assert prepared == ["chat"]
+    assert captured == {
+        "query": "hello",
+        "toolsets": "web,terminal",
+        "command": "chat",
+    }
+
+
+def test_termux_fast_cli_launch_oneshot_uses_light_parser(monkeypatch, main_mod):
+    """``hermes -z PROMPT`` on Termux runs the oneshot helper via the fast
+    path and exits with the helper's return value."""
+    captured = {}
+    prepared = []
+
+    monkeypatch.setenv("TERMUX_VERSION", "1")
+    monkeypatch.delenv("HERMES_TUI", raising=False)
+    # An inherited opt-out flag would make the fast path decline; clear it.
+    monkeypatch.delenv("HERMES_TERMUX_DISABLE_FAST_CLI", raising=False)
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        ["hermes", "-z", "hello", "--model", "gpt-test", "--provider", "openai"],
+    )
+    monkeypatch.setattr(
+        main_mod, "_prepare_agent_startup", lambda args: prepared.append(args.command)
+    )
+    # dict.update() returns None, so ``... or 17`` makes the stub return 17,
+    # which is the exit code asserted below.
+    monkeypatch.setitem(
+        sys.modules,
+        "hermes_cli.oneshot",
+        types.SimpleNamespace(
+            run_oneshot=lambda prompt, **kwargs: captured.update(
+                {"prompt": prompt, **kwargs}
+            )
+            or 17
+        ),
+    )
+
+    with pytest.raises(SystemExit) as exc:
+        main_mod._try_termux_fast_cli_launch()
+
+    assert exc.value.code == 17
+    assert prepared == [None]
+    assert captured == {
+        "prompt": "hello",
+        "model": "gpt-test",
+        "provider": "openai",
+        "toolsets": None,
+    }
+
+
+def test_termux_fast_cli_launch_version_skips_update_check(monkeypatch, main_mod):
+    """``hermes version`` on the fast path prints version info with
+    ``check_updates=False``."""
+    captured = []
+
+    monkeypatch.setenv("TERMUX_VERSION", "1")
+    monkeypatch.delenv("HERMES_TUI", raising=False)
+    # An inherited opt-out flag would make the fast path decline; clear it.
+    monkeypatch.delenv("HERMES_TERMUX_DISABLE_FAST_CLI", raising=False)
+    monkeypatch.setattr(sys, "argv", ["hermes", "version"])
+    monkeypatch.setattr(
+        main_mod, "_print_version_info", lambda *, check_updates: captured.append(check_updates)
+    )
+
+    assert main_mod._try_termux_fast_cli_launch() is True
+    assert captured == [False]
+
+
+def test_termux_fast_cli_launch_skips_help(monkeypatch, main_mod):
+    """``--help`` invocations are declined by the fast path (returns False)."""
+    monkeypatch.setenv("TERMUX_VERSION", "1")
+    monkeypatch.delenv("HERMES_TUI", raising=False)
+    # Clear the opt-out flag so a False result is attributable to ``--help``
+    # rather than to the fast path being disabled by the environment.
+    monkeypatch.delenv("HERMES_TERMUX_DISABLE_FAST_CLI", raising=False)
+    monkeypatch.setattr(sys, "argv", ["hermes", "chat", "--help"])
+
+    assert main_mod._try_termux_fast_cli_launch() is False
+
+
+def test_termux_fast_cli_launch_can_be_disabled(monkeypatch, main_mod):
+    """``HERMES_TERMUX_DISABLE_FAST_CLI=1`` makes the fast path decline."""
+    for var in ("TERMUX_VERSION", "HERMES_TERMUX_DISABLE_FAST_CLI"):
+        monkeypatch.setenv(var, "1")
+    monkeypatch.delenv("HERMES_TUI", raising=False)
+    monkeypatch.setattr(sys, "argv", ["hermes", "version"])
+
+    launched = main_mod._try_termux_fast_cli_launch()
+    assert launched is False
+
+
+def test_termux_bundled_skills_stamp_controls_sync(monkeypatch, tmp_path, main_mod):
+    """Sync is needed until the stamp is written, and again when forced."""
+    monkeypatch.setenv("TERMUX_VERSION", "1")
+    # Start from a clean slate: an inherited force flag would break the
+    # "stamp is fresh" assertion below.
+    monkeypatch.delenv("HERMES_TERMUX_FORCE_SKILLS_SYNC", raising=False)
+    monkeypatch.setattr(main_mod, "get_hermes_home", lambda: tmp_path)
+    monkeypatch.setattr(main_mod, "_termux_bundled_skills_fingerprint", lambda: "fp1")
+
+    assert main_mod._termux_bundled_skills_sync_needed() is True
+    main_mod._mark_termux_bundled_skills_synced()
+    assert main_mod._termux_bundled_skills_sync_needed() is False
+
+    # The force flag overrides a fresh stamp.
+    monkeypatch.setenv("HERMES_TERMUX_FORCE_SKILLS_SYNC", "1")
+    assert main_mod._termux_bundled_skills_sync_needed() is True
+
+
+def test_termux_skips_bundled_skill_sync_when_stamp_fresh(monkeypatch, tmp_path, main_mod):
+    """With a fresh stamp and no force flag, startup does not call sync_skills."""
+    calls = []
+
+    monkeypatch.setenv("TERMUX_VERSION", "1")
+    # An inherited force flag would trigger the sync this test expects to skip.
+    monkeypatch.delenv("HERMES_TERMUX_FORCE_SKILLS_SYNC", raising=False)
+    monkeypatch.setattr(main_mod, "get_hermes_home", lambda: tmp_path)
+    monkeypatch.setattr(main_mod, "_termux_bundled_skills_fingerprint", lambda: "fp1")
+    main_mod._mark_termux_bundled_skills_synced()
+    # Stub module records any sync_skills() call.
+    monkeypatch.setitem(
+        sys.modules,
+        "tools.skills_sync",
+        types.SimpleNamespace(sync_skills=lambda quiet: calls.append(quiet)),
+    )
+
+    assert main_mod._sync_bundled_skills_for_startup() is False
+    assert calls == []
+
+
+def test_termux_forced_bundled_skill_sync_runs(monkeypatch, tmp_path, main_mod):
+    """With the force flag set, startup calls ``sync_skills(quiet=True)`` once."""
+    quiet_flags = []
+
+    def _record_sync(quiet):
+        quiet_flags.append(quiet)
+
+    for var in ("TERMUX_VERSION", "HERMES_TERMUX_FORCE_SKILLS_SYNC"):
+        monkeypatch.setenv(var, "1")
+    monkeypatch.setattr(main_mod, "get_hermes_home", lambda: tmp_path)
+    monkeypatch.setattr(main_mod, "_termux_bundled_skills_fingerprint", lambda: "fp1")
+    fake_sync_module = types.SimpleNamespace(sync_skills=_record_sync)
+    monkeypatch.setitem(sys.modules, "tools.skills_sync", fake_sync_module)
+
+    synced = main_mod._sync_bundled_skills_for_startup()
+    assert synced is True
+    assert quiet_flags == [True]
+
+
+def test_read_git_revision_fingerprint_resolves_packed_refs(tmp_path, main_mod):
+    """HEAD points at a branch that exists only in ``packed-refs``."""
+    packed_sha = "1234567890abcdef1234567890abcdef12345678"
+    # Header comment, the branch under test, then an unrelated tag with its
+    # peeled ("^") line.
+    packed_refs_lines = [
+        "# pack-refs with: peeled fully-peeled sorted\n",
+        f"{packed_sha} refs/heads/main\n",
+        "abcdef0000000000000000000000000000000000 refs/tags/v1.0\n",
+        "^99999999aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n",
+    ]
+
+    repo = tmp_path / "repo"
+    git_dir = repo / ".git"
+    git_dir.mkdir(parents=True)
+    (git_dir / "HEAD").write_text("ref: refs/heads/main\n", encoding="utf-8")
+    (git_dir / "packed-refs").write_text("".join(packed_refs_lines), encoding="utf-8")
+
+    expected = f"git:refs/heads/main:{packed_sha}"
+    assert main_mod._read_git_revision_fingerprint(repo) == expected
+
+
+def test_read_git_revision_fingerprint_packed_refs_in_worktree_common_dir(
+    tmp_path, main_mod
+):
+    """A linked worktree resolves its branch from the common dir's packed-refs."""
+    packed_sha = "fedcba9876543210fedcba9876543210fedcba98"
+
+    # Main checkout: the shared .git directory holds only a packed-refs file.
+    common_git = tmp_path / "repo" / ".git"
+    common_git.mkdir(parents=True)
+    (common_git / "packed-refs").write_text(
+        f"{packed_sha} refs/heads/main\n",
+        encoding="utf-8",
+    )
+
+    # Per-worktree gitdir: its own HEAD plus a commondir pointer two levels up.
+    wt_gitdir = common_git / "worktrees" / "wt"
+    wt_gitdir.mkdir(parents=True)
+    (wt_gitdir / "HEAD").write_text("ref: refs/heads/main\n", encoding="utf-8")
+    (wt_gitdir / "commondir").write_text("../..\n", encoding="utf-8")
+
+    # The worktree has a .git *file* pointing at that gitdir.
+    worktree = tmp_path / "wt"
+    worktree.mkdir()
+    (worktree / ".git").write_text(f"gitdir: {wt_gitdir}\n", encoding="utf-8")
+
+    expected = f"git:refs/heads/main:{packed_sha}"
+    assert main_mod._read_git_revision_fingerprint(worktree) == expected
+
+
+def test_read_git_revision_fingerprint_loose_ref_in_worktree_common_dir(
+    tmp_path, main_mod
+):
+    """`git worktree add -b NAME` stores the new branch ref in the common dir
+    rather than the per-worktree gitdir; the fingerprint must still find it."""
+    loose_sha = "0123456789abcdef0123456789abcdef01234567"
+
+    # Main checkout: a loose ref file for the branch in the shared .git dir.
+    common_git = tmp_path / "repo" / ".git"
+    common_git.mkdir(parents=True)
+    heads_dir = common_git / "refs" / "heads"
+    heads_dir.mkdir(parents=True)
+    (heads_dir / "feature").write_text(loose_sha + "\n", encoding="utf-8")
+
+    # Per-worktree gitdir: its own HEAD plus a commondir pointer two levels up.
+    wt_gitdir = common_git / "worktrees" / "wt"
+    wt_gitdir.mkdir(parents=True)
+    (wt_gitdir / "HEAD").write_text("ref: refs/heads/feature\n", encoding="utf-8")
+    (wt_gitdir / "commondir").write_text("../..\n", encoding="utf-8")
+
+    # The worktree has a .git *file* pointing at that gitdir.
+    worktree = tmp_path / "wt"
+    worktree.mkdir()
+    (worktree / ".git").write_text(f"gitdir: {wt_gitdir}\n", encoding="utf-8")
+
+    expected = f"git:refs/heads/feature:{loose_sha}"
+    assert main_mod._read_git_revision_fingerprint(worktree) == expected
+
+
+def test_read_git_revision_fingerprint_unresolved_ref_is_stable(tmp_path, main_mod):
+    """A HEAD ref with no loose or packed entry yields the ``unresolved`` marker."""
+    repo = tmp_path / "repo"
+    head_file = repo / ".git" / "HEAD"
+    head_file.parent.mkdir(parents=True)
+    head_file.write_text("ref: refs/heads/missing\n", encoding="utf-8")
+
+    result = main_mod._read_git_revision_fingerprint(repo)
+    assert result == "git:refs/heads/missing:unresolved"
+
+
 def test_main_top_level_oneshot_accepts_toolsets(monkeypatch, main_mod):
    captured = {}

@@ -0,0 +1,300 @@
+"""Behavior-parity check for the image-gen FAL plugin migration (#26241).
+
+Spawns one subprocess per (version, scenario) cell — pinned to either
+``origin/main`` (legacy in-tree FAL fall-through + ``configured == "fal"``
+skip in ``_dispatch_to_plugin_provider``) or this PR's worktree (FAL is
+itself a plugin and the dispatcher routes every set provider through
+the registry). Each subprocess clears all FAL-related env vars + writes
+a ``config.yaml``, then asks the dispatcher how it would route an
+``image_generate`` call. The emitted shape is a JSON object with the
+keys ``{dispatch_kind, provider_name, model, error_present}``:
+
+* ``dispatch_kind`` ∈ ``{"legacy_fal", "plugin", "error",
+  "unknown_payload", "exception"}`` — whether the call would go
+  straight to the in-tree pipeline (the dispatcher returned ``None``),
+  through ``_dispatch_to_plugin_provider``, report an explicit
+  provider-not-registered error, return a non-dict payload, or raise.
+* ``provider_name`` — when ``dispatch_kind == "plugin"``, the
+  resolved provider name. ``None`` otherwise.
+* ``model`` — the resolved FAL model id when applicable.
+* ``error_present`` — ``True`` when an error message or exception
+  was captured for the scenario.
+
+The parent process diffs the shapes per scenario. A diff means the
+migration introduced an observable behaviour change vs origin/main —
+likely a real regression for users on the existing config keys.
+
+Run from the PR worktree:
+
+    python tests/plugins/image_gen/check_parity_vs_main.py
+"""
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+
+
+# Pin one path to current main, one to the PR worktree.
+# ``REPO_ROOT`` is ``.../.worktrees/<name>``; the main checkout lives
+# two levels up. When running directly from a regular clone (no
+# worktree), ``MAIN_DIR`` falls back to a sibling ``hermes-agent-main``
+# checkout if one exists.
+def _resolve_main_dir() -> Path:
+    """Locate the checkout to treat as ``origin/main``.
+
+    Tries, in order: the directory two levels above ``REPO_ROOT``, then a
+    sibling ``hermes-agent-main`` directory. A candidate is accepted only
+    if it contains ``tools/image_generation_tool.py``. When neither
+    qualifies, ``REPO_ROOT`` itself is returned.
+    """
+    marker = Path("tools") / "image_generation_tool.py"
+
+    two_up = REPO_ROOT.parent.parent
+    if two_up != REPO_ROOT and (two_up / marker).exists():
+        return two_up
+
+    sibling_checkout = REPO_ROOT.parent / "hermes-agent-main"
+    return sibling_checkout if (sibling_checkout / marker).exists() else REPO_ROOT
+
+
+MAIN_DIR = _resolve_main_dir()
+PR_DIR = REPO_ROOT
+assert (PR_DIR / "tools" / "image_generation_tool.py").exists(), (
+    f"PR_DIR={PR_DIR} doesn't look like a hermes-agent checkout"
+)
+
+
+SUBPROCESS_SCRIPT = r"""
+import json, os, sys, tempfile
+sys.path.insert(0, sys.argv[1])
+
+# Isolated HERMES_HOME so the config write is hermetic.
+home = tempfile.mkdtemp()
+os.environ["HERMES_HOME"] = home
+
+# Clear FAL-related env so dispatch decisions are config-driven.
+for k in (
+    "FAL_KEY", "FAL_QUEUE_GATEWAY_URL",
+    "TOOL_GATEWAY_DOMAIN", "TOOL_GATEWAY_USER_TOKEN",
+    "FAL_IMAGE_MODEL",
+):
+    os.environ.pop(k, None)
+
+scenario_env = json.loads(sys.argv[2])
+os.environ.update(scenario_env)
+
+config_yaml = sys.argv[3]
+config_path = os.path.join(home, "config.yaml")
+with open(config_path, "w") as f:
+    f.write(config_yaml)
+
+# Fresh import — must not have anything cached.
+for name in list(sys.modules):
+    if (name.startswith("tools.")
+            or name.startswith("agent.")
+            or name.startswith("plugins.")
+            or name.startswith("hermes_cli.")):
+        sys.modules.pop(name, None)
+
+import tools.image_generation_tool as image_tool
+
+dispatch_kind = None
+provider_name = None
+model = None
+error_text = None
+
+try:
+    raw = image_tool._dispatch_to_plugin_provider("ping", "landscape")
+    if raw is None:
+        dispatch_kind = "legacy_fal"
+    else:
+        parsed = json.loads(raw) if isinstance(raw, str) else raw
+        if isinstance(parsed, dict):
+            if parsed.get("error_type") == "provider_not_registered":
+                dispatch_kind = "error"
+                error_text = parsed.get("error")
+            else:
+                dispatch_kind = "plugin"
+                provider_name = parsed.get("provider")
+                model = parsed.get("model")
+        else:
+            dispatch_kind = "unknown_payload"
+
+    if model is None:
+        # _resolve_fal_model still returns the active FAL model id even
+        # when dispatch goes to a non-FAL plugin — used for the diff
+        # only when applicable.
+        try:
+            model_id, _meta = image_tool._resolve_fal_model()
+            if dispatch_kind == "legacy_fal":
+                model = model_id
+        except Exception:
+            pass
+except Exception as exc:
+    dispatch_kind = "exception"
+    error_text = repr(exc)
+
+shape = {
+    "dispatch_kind": dispatch_kind,
+    "provider_name": provider_name,
+    "model": model,
+    "error_present": error_text is not None,
+}
+print(json.dumps(shape))
+"""
+
+
+SCENARIOS: list[tuple[str, str, dict[str, str]]] = [
+    # (label, config.yaml body, extra env vars)
+    ("no-config-no-env", "", {}),
+    (
+        "explicit-fal-no-creds",
+        "image_gen:\n  provider: fal\n",
+        {},
+    ),
+    (
+        "explicit-fal-with-creds",
+        "image_gen:\n  provider: fal\n",
+        {"FAL_KEY": "test-key"},
+    ),
+    (
+        "explicit-fal-with-model",
+        "image_gen:\n  provider: fal\n  model: fal-ai/flux-2-pro\n",
+        {"FAL_KEY": "test-key"},
+    ),
+    (
+        "explicit-typo-provider",
+        "image_gen:\n  provider: not-a-real-backend\n",
+        {"FAL_KEY": "test-key"},
+    ),
+    (
+        "managed-gateway-only",
+        "",
+        {
+            "TOOL_GATEWAY_DOMAIN": "nousresearch.com",
+            "TOOL_GATEWAY_USER_TOKEN": "nous-token",
+        },
+    ),
+]
+
+
+def _run_scenario(repo_path: Path, label: str, config_yaml: str, env: dict) -> dict:
+    """Run one scenario against one checkout and return the emitted shape.
+
+    Args:
+        repo_path: Checkout to put first on the subprocess's ``sys.path``.
+        label: Scenario label (not used by this function).
+        config_yaml: Text written to ``config.yaml`` by the subprocess.
+        env: Extra environment variables, passed as JSON to the subprocess.
+
+    Returns:
+        The JSON object printed on the last line of the subprocess's
+        stdout. On any failure (timeout, interpreter could not be
+        launched, non-zero exit, unparsable output) a dict containing an
+        ``"error"`` key is returned instead, so the caller can report the
+        scenario as errored and keep going.
+    """
+    venv_python = repo_path / ".venv" / "bin" / "python"
+    if not venv_python.exists():
+        venv_python = MAIN_DIR / ".venv" / "bin" / "python"
+    if not venv_python.exists():
+        # No virtualenv in either checkout: rely on PATH lookup.
+        venv_python = Path("python3")
+
+    try:
+        out = subprocess.run(
+            [
+                str(venv_python),
+                "-c",
+                SUBPROCESS_SCRIPT,
+                str(repo_path),
+                json.dumps(env),
+                config_yaml,
+            ],
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+    except subprocess.TimeoutExpired as exc:
+        # A hung scenario must not abort the remaining scenarios.
+        return {"error": f"subprocess timed out after {exc.timeout}s"}
+    except OSError as exc:
+        # E.g. the ``python3`` fallback is not on PATH.
+        return {"error": f"could not launch {venv_python}: {exc}"}
+
+    if out.returncode != 0:
+        return {
+            "error": "subprocess failed",
+            "stdout": out.stdout[-500:],
+            "stderr": out.stderr[-500:],
+        }
+    try:
+        # Only the last stdout line is parsed as the shape.
+        return json.loads(out.stdout.strip().splitlines()[-1])
+    except Exception as exc:
+        return {"error": f"could not parse output: {exc}", "stdout": out.stdout}
+
+
+def _reduce(shape: dict) -> dict:
+    """Project a subprocess shape onto the keys compared for parity.
+
+    Keeps ``dispatch_kind``, ``provider_name``, ``model`` and
+    ``error_present``; a key missing from ``shape`` maps to ``None``.
+
+    On origin/main, ``explicit-fal-*`` scenarios short-circuit to
+    ``legacy_fal`` because of the ``configured == "fal"`` skip. On the
+    PR, those same scenarios route through the plugin and emit
+    ``dispatch_kind == "plugin"`` with ``provider_name == "fal"``.
+
+    Both shapes are functionally equivalent — the plugin's ``generate()``
+    re-enters the same in-tree pipeline via ``_it`` indirection — but
+    the difference is kept visible so reviewers can sign off on the
+    intentional behaviour delta.
+    """
+    parity_keys = ("dispatch_kind", "provider_name", "model", "error_present")
+    return {key: shape.get(key) for key in parity_keys}
+
+
+def main() -> int:
+    """Run every scenario against both checkouts and report parity.
+
+    Prints one status line per scenario (``[OK]``, ``[DIFF]``, ``[FAIL]``
+    or ``[ERR ]``) followed by a summary.
+
+    Returns:
+        ``1`` if any scenario failed or errored, ``0`` otherwise.
+    """
+    print(f"main:    {MAIN_DIR}")
+    print(f"pr:      {PR_DIR}")
+    print()
+
+    if MAIN_DIR == PR_DIR:
+        print(
+            "WARN: MAIN_DIR == PR_DIR — diffs will be trivially identical.\n"
+            "      Set up a sibling 'hermes-agent-main' checkout pinned to "
+            "origin/main to get real parity coverage."
+        )
+        print()
+
+    # Scenario labels, bucketed by outcome.
+    failures: list[str] = []
+    errors: list[str] = []
+    intentional_diffs: list[tuple[str, dict, dict]] = []
+    for label, config_yaml, env in SCENARIOS:
+        main_shape = _run_scenario(MAIN_DIR, label, config_yaml, env)
+        pr_shape = _run_scenario(PR_DIR, label, config_yaml, env)
+
+        # ``_run_scenario`` signals failure with an "error" key.
+        if "error" in main_shape or "error" in pr_shape:
+            print(f"  [ERR ] {label}: subprocess failed")
+            print(f"    main: {main_shape}")
+            print(f"    pr:   {pr_shape}")
+            errors.append(label)
+            continue
+
+        main_reduced = _reduce(main_shape)
+        pr_reduced = _reduce(pr_shape)
+
+        if main_reduced == pr_reduced:
+            print(f"  [OK]   {label}: {main_reduced}")
+            continue
+
+        # On main, "explicit-fal-*" returns legacy_fal; on PR, plugin
+        # dispatch. That's the only acceptable diff — flag everything
+        # else as a regression.
+        legacy_to_plugin_fal = (
+            main_reduced.get("dispatch_kind") == "legacy_fal"
+            and pr_reduced.get("dispatch_kind") == "plugin"
+            and pr_reduced.get("provider_name") == "fal"
+        )
+        if legacy_to_plugin_fal:
+            print(f"  [DIFF] {label}: legacy_fal → plugin (fal) — expected")
+            intentional_diffs.append((label, main_reduced, pr_reduced))
+        else:
+            print(f"  [FAIL] {label}")
+            print(f"    main: {main_reduced}")
+            print(f"    pr:   {pr_reduced}")
+            failures.append(label)
+
+    # Summary: errors and regressions make the run fail; intentional
+    # diffs are reported but do not affect the exit code.
+    print()
+    if errors:
+        print(f"SUBPROCESS ERRORS in {len(errors)} scenario(s):")
+        for e in errors:
+            print(f"  - {e}")
+    if failures:
+        print(f"BEHAVIOUR REGRESSION in {len(failures)} scenario(s):")
+        for f in failures:
+            print(f"  - {f}")
+    if intentional_diffs:
+        print(
+            f"INTENTIONAL DIFFS ({len(intentional_diffs)}): "
+            f"legacy_fal → plugin dispatch for explicit FAL paths."
+        )
+    if failures or errors:
+        return 1
+    print(f"PARITY OK across {len(SCENARIOS)} scenarios.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+"""Tests for the FAL.ai image generation plugin.
+
+The plugin is a thin registration adapter — actual FAL pipeline logic
+lives in ``tools.image_generation_tool`` and is exercised by
+``tests/tools/test_image_generation.py``. These tests focus on:
+
+* the ``ImageGenProvider`` ABC surface (name, models, schema)
+* call-time indirection (``_it`` resolution at ``generate()`` time so
+  ``monkeypatch.setattr(image_tool, ...)`` keeps working)
+* response shape stamping (provider/prompt/aspect_ratio/model)
+"""
+
+from __future__ import annotations
+
+import json
+from unittest.mock import MagicMock
+
+import pytest
+
+
+# ---------------------------------------------------------------------------
+# Provider surface
+# ---------------------------------------------------------------------------
+
+
+class TestFalImageGenProviderSurface:
+    """Static metadata exposed by ``FalImageGenProvider``: name, display
+    name, default model, model catalog and setup schema."""
+
+    def test_name(self):
+        """The provider's ``name`` is ``"fal"``."""
+        from plugins.image_gen.fal import FalImageGenProvider
+
+        assert FalImageGenProvider().name == "fal"
+
+    def test_display_name(self):
+        """The provider's ``display_name`` is ``"FAL.ai"``."""
+        from plugins.image_gen.fal import FalImageGenProvider
+
+        assert FalImageGenProvider().display_name == "FAL.ai"
+
+    def test_default_model_matches_legacy(self):
+        """``default_model()`` equals the tool module's ``DEFAULT_MODEL``."""
+        from plugins.image_gen.fal import FalImageGenProvider
+        from tools.image_generation_tool import DEFAULT_MODEL
+
+        assert FalImageGenProvider().default_model() == DEFAULT_MODEL
+
+    def test_list_models_uses_legacy_catalog(self):
+        """``list_models()`` exposes exactly the ids in ``FAL_MODELS`` and
+        each entry carries the expected descriptive fields."""
+        from plugins.image_gen.fal import FalImageGenProvider
+        from tools.image_generation_tool import FAL_MODELS
+
+        provider = FalImageGenProvider()
+        models = provider.list_models()
+        ids = {m["id"] for m in models}
+        # Whatever FAL_MODELS ships, the provider mirrors verbatim.
+        assert ids == set(FAL_MODELS.keys())
+        # Spot-check the expected first-class fields are present.
+        for entry in models:
+            for field in ("id", "display", "speed", "strengths", "price"):
+                assert field in entry
+
+    def test_setup_schema_advertises_fal_key(self):
+        """The setup schema is named ``FAL.ai``, badged ``paid`` and lists
+        ``FAL_KEY`` among its ``env_vars``."""
+        from plugins.image_gen.fal import FalImageGenProvider
+
+        schema = FalImageGenProvider().get_setup_schema()
+        assert schema["name"] == "FAL.ai"
+        assert schema["badge"] == "paid"
+        env_keys = {entry["key"] for entry in schema.get("env_vars", [])}
+        assert "FAL_KEY" in env_keys
+
+
+class TestFalImageGenProviderAvailability:
+    """``is_available()`` is expected to follow the result of
+    ``image_tool.check_fal_api_key`` (patched per test) and never raise."""
+
+    def test_is_available_when_legacy_check_passes(self, monkeypatch):
+        """A passing key check makes the provider available."""
+        import tools.image_generation_tool as image_tool
+        from plugins.image_gen.fal import FalImageGenProvider
+
+        monkeypatch.setattr(image_tool, "check_fal_api_key", lambda: True)
+        assert FalImageGenProvider().is_available() is True
+
+    def test_is_available_false_when_legacy_check_fails(self, monkeypatch):
+        """A failing key check makes the provider unavailable."""
+        import tools.image_generation_tool as image_tool
+        from plugins.image_gen.fal import FalImageGenProvider
+
+        monkeypatch.setattr(image_tool, "check_fal_api_key", lambda: False)
+        assert FalImageGenProvider().is_available() is False
+
+    def test_is_available_handles_legacy_exception(self, monkeypatch):
+        """An exception from the key check is reported as ``False``."""
+        import tools.image_generation_tool as image_tool
+        from plugins.image_gen.fal import FalImageGenProvider
+
+        def _boom():
+            raise RuntimeError("config broke")
+
+        monkeypatch.setattr(image_tool, "check_fal_api_key", _boom)
+        # Picker must not propagate exceptions — show as "not available".
+        assert FalImageGenProvider().is_available() is False
+
+
+# ---------------------------------------------------------------------------
+# generate() — call-time indirection
+# ---------------------------------------------------------------------------
+
+
+class TestFalImageGenProviderGenerate:
+    """``generate()`` behaviour with ``image_tool.image_generate_tool`` and
+    ``image_tool._resolve_fal_model`` replaced by fakes: delegation,
+    aspect-ratio coercion, ``None``-kwarg filtering and error shaping."""
+
+    def test_generate_delegates_to_legacy_image_generate_tool(self, monkeypatch):
+        """Plugin must look up ``image_generate_tool`` at call time so
+        ``monkeypatch.setattr(image_tool, "image_generate_tool", ...)``
+        takes effect."""
+        import tools.image_generation_tool as image_tool
+        from plugins.image_gen.fal import FalImageGenProvider
+
+        captured = {}
+
+        # Fake accepts prompt/aspect_ratio positionally or by keyword.
+        def fake_image_generate_tool(prompt, aspect_ratio, **kwargs):
+            captured["prompt"] = prompt
+            captured["aspect_ratio"] = aspect_ratio
+            captured["kwargs"] = kwargs
+            return json.dumps({"success": True, "image": "https://fake/image.png"})
+
+        monkeypatch.setattr(image_tool, "image_generate_tool", fake_image_generate_tool)
+        monkeypatch.setattr(image_tool, "_resolve_fal_model",
+                            lambda: ("fal-ai/flux-2/klein/9b", {}))
+
+        result = FalImageGenProvider().generate(
+            "a serene mountain landscape",
+            aspect_ratio="square",
+            seed=42,
+        )
+
+        assert captured["prompt"] == "a serene mountain landscape"
+        assert captured["aspect_ratio"] == "square"
+        assert captured["kwargs"] == {"seed": 42}
+        assert result["success"] is True
+        assert result["image"] == "https://fake/image.png"
+        # Stamped fields for the unified response shape
+        assert result["provider"] == "fal"
+        assert result["prompt"] == "a serene mountain landscape"
+        assert result["aspect_ratio"] == "square"
+        assert result["model"] == "fal-ai/flux-2/klein/9b"
+
+    def test_generate_invalid_aspect_ratio_is_coerced(self, monkeypatch):
+        """An unrecognised aspect ratio reaches the tool as ``landscape``."""
+        import tools.image_generation_tool as image_tool
+        from plugins.image_gen.fal import FalImageGenProvider
+
+        seen_aspect = {}
+
+        def fake(prompt, aspect_ratio, **kwargs):
+            seen_aspect["v"] = aspect_ratio
+            return json.dumps({"success": True, "image": "x"})
+
+        monkeypatch.setattr(image_tool, "image_generate_tool", fake)
+        monkeypatch.setattr(image_tool, "_resolve_fal_model",
+                            lambda: ("fal-ai/flux-2/klein/9b", {}))
+
+        FalImageGenProvider().generate("p", aspect_ratio="not-a-real-ratio")
+        # ``resolve_aspect_ratio`` clamps to landscape.
+        assert seen_aspect["v"] == "landscape"
+
+    def test_generate_passthrough_drops_none_kwargs(self, monkeypatch):
+        """Keyword arguments whose value is ``None`` are not forwarded;
+        non-``None`` ones are."""
+        import tools.image_generation_tool as image_tool
+        from plugins.image_gen.fal import FalImageGenProvider
+
+        seen = {}
+
+        def fake(prompt, aspect_ratio, **kwargs):
+            seen.update(kwargs)
+            return json.dumps({"success": True, "image": "x"})
+
+        monkeypatch.setattr(image_tool, "image_generate_tool", fake)
+        monkeypatch.setattr(image_tool, "_resolve_fal_model",
+                            lambda: ("fal-ai/flux-2/klein/9b", {}))
+
+        FalImageGenProvider().generate(
+            "p",
+            aspect_ratio="landscape",
+            seed=None,
+            num_images=2,
+            guidance_scale=None,
+        )
+
+        # ``None`` values must not be forwarded — they'd override the
+        # model's defaults inside the legacy payload builder.
+        assert "seed" not in seen
+        assert "guidance_scale" not in seen
+        assert seen.get("num_images") == 2
+
+    def test_generate_catches_exception_from_legacy(self, monkeypatch):
+        """An exception raised by the tool is converted into a failure
+        dict carrying the exception's type name."""
+        import tools.image_generation_tool as image_tool
+        from plugins.image_gen.fal import FalImageGenProvider
+
+        def boom(*args, **kwargs):
+            raise RuntimeError("FAL endpoint exploded")
+
+        monkeypatch.setattr(image_tool, "image_generate_tool", boom)
+
+        result = FalImageGenProvider().generate("p")
+        assert result["success"] is False
+        assert "FAL image generation failed" in result["error"]
+        assert result["error_type"] == "RuntimeError"
+        assert result["provider"] == "fal"
+
+    def test_generate_invalid_json_response(self, monkeypatch):
+        """A non-JSON string from the tool yields an ``Invalid JSON``
+        failure dict."""
+        import tools.image_generation_tool as image_tool
+        from plugins.image_gen.fal import FalImageGenProvider
+
+        # NOTE(review): this fake is keyword-only, unlike the other fakes in
+        # this class. It presumably relies on the plugin calling
+        # ``image_generate_tool`` with keyword arguments — a positional call
+        # would raise TypeError and likely hit the exception path instead.
+        # TODO confirm against the plugin.
+        monkeypatch.setattr(image_tool, "image_generate_tool", lambda **kw: "not-json")
+        monkeypatch.setattr(image_tool, "_resolve_fal_model",
+                            lambda: ("fal-ai/flux-2/klein/9b", {}))
+
+        result = FalImageGenProvider().generate("p")
+        assert result["success"] is False
+        assert "Invalid JSON" in result["error"]
+        assert result["provider"] == "fal"
+
+
+# ---------------------------------------------------------------------------
+# Registry wiring
+# ---------------------------------------------------------------------------
+
+
+class TestFalImageGenPluginRegistration:
+    """The plugin's ``register(ctx)`` entry point."""
+
+    def test_register_wires_provider_into_registry(self):
+        """``register`` calls ``ctx.register_image_gen_provider`` exactly
+        once, with a single positional ``FalImageGenProvider`` instance."""
+        from plugins.image_gen.fal import FalImageGenProvider, register
+
+        ctx = MagicMock()
+        register(ctx)
+
+        ctx.register_image_gen_provider.assert_called_once()
+        # Unpacking asserts there is exactly one positional argument.
+        (registered,), _ = ctx.register_image_gen_provider.call_args
+        assert isinstance(registered, FalImageGenProvider)
@@ -62,8 +62,9 @@ def plugin_api(tmp_path, monkeypatch):
 class _FakeSessionDB:
    """Stand-in for hermes_state.SessionDB that records scan calls."""

-    def __init__(self, session_count: int):
+    def __init__(self, session_count: int, scan_delay: float = 0):
        self.session_count = session_count
+        self.scan_delay = scan_delay
        self.last_limit: Optional[int] = None
        self.last_include_children: Optional[bool] = None
        self.list_calls = 0
@@ -78,6 +79,8 @@ class _FakeSessionDB:
        include_children: bool = False,
        project_compression_tips: bool = True,
    ) -> List[Dict[str, Any]]:
+        if self.scan_delay:
+            time.sleep(self.scan_delay)
        self.last_limit = limit
        self.last_include_children = include_children
        self.list_calls += 1
@@ -225,10 +228,8 @@ def test_evaluate_all_stale_cache_serves_stale_and_refreshes_in_background(plugi
    the stale data immediately and kicks a background refresh. Users don't
    stare at a loading spinner every time TTL expires.
    """
-    fake_db = _FakeSessionDB(session_count=10)
+    fake_db = _FakeSessionDB(session_count=10, scan_delay=2.0)
    _install_fake_session_db(plugin_api, fake_db)
-
-    # Seed a stale snapshot on disk.
    stale_generated_at = int(time.time()) - plugin_api.SNAPSHOT_TTL_SECONDS - 60
    stale_payload = {
        "achievements": [],
@@ -2,8 +2,8 @@

 Covers:

- All seven bundled plugins (brave-free, ddgs, searxng, exa, parallel,
-  tavily, firecrawl) instantiate and self-report the expected
+- All eight bundled plugins (brave-free, ddgs, searxng, exa, parallel,
+  tavily, firecrawl, xai) instantiate and self-report the expected
  capabilities + ABC-derived defaults.
 - Each plugin's ``is_available()`` correctly reflects env-var presence.
 - The web_search_registry resolves an active provider in the documented
@@ -47,6 +47,7 @@ def _clear_web_env(monkeypatch: pytest.MonkeyPatch) -> None:
        "FIRECRAWL_GATEWAY_URL",
        "TOOL_GATEWAY_DOMAIN",
        "TOOL_GATEWAY_USER_TOKEN",
+        "XAI_API_KEY",
    ):
        monkeypatch.delenv(k, raising=False)

@@ -70,7 +71,7 @@ def _isolate_env(monkeypatch: pytest.MonkeyPatch) -> None:


 class TestBundledPluginsRegister:
-    """All seven bundled web plugins discover and register correctly."""
+    """All eight bundled web plugins discover and register correctly."""

    def test_all_seven_plugins_present_in_registry(self) -> None:
        _ensure_plugins_loaded()
@@ -85,6 +86,7 @@ class TestBundledPluginsRegister:
            "parallel",
            "searxng",
            "tavily",
+            "xai",
        ]

    @pytest.mark.parametrize(
@@ -100,6 +102,8 @@ class TestBundledPluginsRegister:
            # disabled in the migration (fell through to a legacy inline
            # path); the follow-up commit enabled it natively.
            ("firecrawl", True, True, True),
+            # xai: search-only via Grok's agentic web_search tool.
+            ("xai", True, False, False),
        ],
    )
    def test_capability_flags_match_spec(
@@ -120,7 +124,7 @@ class TestBundledPluginsRegister:

    @pytest.mark.parametrize(
        "plugin_name",
-        ["brave-free", "ddgs", "searxng", "exa", "parallel", "tavily", "firecrawl"],
+        ["brave-free", "ddgs", "searxng", "exa", "parallel", "tavily", "firecrawl", "xai"],
    )
    def test_each_plugin_has_name_and_display_name(self, plugin_name: str) -> None:
        _ensure_plugins_loaded()
@@ -133,7 +137,7 @@ class TestBundledPluginsRegister:

    @pytest.mark.parametrize(
        "plugin_name",
-        ["brave-free", "ddgs", "searxng", "exa", "parallel", "tavily", "firecrawl"],
+        ["brave-free", "ddgs", "searxng", "exa", "parallel", "tavily", "firecrawl", "xai"],
    )
    def test_each_plugin_has_setup_schema(self, plugin_name: str) -> None:
        """``get_setup_schema()`` returns a dict the picker can consume."""
@@ -239,6 +243,17 @@ class TestIsAvailable:
        # Truthy or falsy, just must not raise.
        _ = bool(p.is_available())

+    def test_xai_requires_api_key_or_oauth(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """xAI needs XAI_API_KEY or OAuth tokens in auth.json."""
+        _ensure_plugins_loaded()
+        from agent.web_search_registry import get_provider
+
+        xai_provider = get_provider("xai")
+        assert xai_provider is not None
+
+        # Before: no XAI_API_KEY, no auth.json.
+        available_before = xai_provider.is_available()
+        assert available_before is False
+
+        # After: the API key alone is enough.
+        monkeypatch.setenv("XAI_API_KEY", "real")
+        available_after = xai_provider.is_available()
+        assert available_after is True
+

 # ---------------------------------------------------------------------------
 # Registry resolution semantics (Option B — conservative smart fallback)
@@ -455,7 +470,7 @@ class TestErrorResponseShapes:
        if result["results"]:
            assert "error" in result["results"][0]

-    def test_firecrawl_crawl_returns_error_dict_when_unconfigured(self) -> None:
+    def test_firecrawl_crawl_returns_error_dict_when_unconfigured(self):
        """firecrawl crawl is async (wraps SDK in to_thread); error must be
        surfaced via the per-page result shape, not raised."""
        _ensure_plugins_loaded()
@@ -473,3 +488,15 @@ class TestErrorResponseShapes:
        assert len(result["results"]) >= 1
        assert "error" in result["results"][0]
        assert result["results"][0]["url"] == "https://example.com"
+
+    def test_xai_search_returns_error_dict_when_unconfigured(self) -> None:
+        """xAI returns a typed error dict (no XAI_API_KEY)."""
+        _ensure_plugins_loaded()
+        from agent.web_search_registry import get_provider
+
+        xai_provider = get_provider("xai")
+        assert xai_provider is not None
+
+        response = xai_provider.search("test", limit=5)
+
+        assert isinstance(response, dict)
+        assert response.get("success") is False
+        assert "error" in response
@@ -236,7 +236,7 @@ class TestQwenParity:


 class TestCustomOllamaParity:
-    """Custom/Ollama: num_ctx, think=false — now tested via profile."""
+    """Custom/Ollama: num_ctx, thinking controls — now tested via profile."""

    def test_ollama_num_ctx(self, transport):
        kw = transport.build_kwargs(
@@ -0,0 +1,260 @@
+"""Tests for reactive multimodal-tool-content recovery.
+
+Covers the full chain for providers that reject list-type content in
+``role: "tool"`` messages (Xiaomi MiMo's 400 "text is not set", etc.):
+
+  1. agent/error_classifier.py: 400 with the right wording classifies as
+     ``FailoverReason.multimodal_tool_content_unsupported``.
+  2. run_agent._try_strip_image_parts_from_tool_messages downgrades tool
+     messages whose ``content`` is a list-with-image to a string text
+     summary, in-place, and records the active (provider, model) in
+     ``self._no_list_tool_content_models`` so future tool results in this
+     session preemptively downgrade.
+  3. run_agent._tool_result_content_for_active_model short-circuits to a
+     text summary when the (provider, model) is in the cache, even though
+     ``_model_supports_vision`` returns True — avoiding a wasted round
+     trip on every subsequent screenshot in the session.
+
+The end-to-end retry loop wiring (`conversation_loop.py`) is exercised by
+the classifier signal + helper-mutation tests; the integration only adds
+a trivial flag-and-continue around the existing pattern used for
+``image_too_large`` recovery.
+
+See: https://github.com/NousResearch/hermes-agent/issues/27344
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from agent.error_classifier import FailoverReason, classify_api_error
+
+
+class _FakeApiError(Exception):
+    """Minimal substitute for an openai.BadRequestError exposing
+    ``status_code``, ``body`` and ``response``.
+
+    When ``body`` is omitted (or falsy) it defaults to
+    ``{"error": {"message": message}}``; ``response`` is always ``None``.
+    """
+
+    def __init__(self, status_code: int, message: str, body: dict | None = None):
+        super().__init__(message)
+        default_body = {"error": {"message": message}}
+        self.status_code = status_code
+        self.body = body if body else default_body
+        self.response = None
+
+
+def _make_agent(provider: str = "xiaomi", model: str = "mimo-v2.5"):
+    """Return an ``AIAgent`` allocated without running ``__init__``.
+
+    Only ``provider`` and ``model`` are set on the instance, which is all
+    this helper provides for method-level tests.
+    """
+    from run_agent import AIAgent
+
+    # object.__new__ skips AIAgent.__init__, so no provider setup happens.
+    bare = object.__new__(AIAgent)
+    bare.provider, bare.model = provider, model
+    return bare
+
+
+# ─── Strip helper ────────────────────────────────────────────────────────────
+
+
+class TestStripImagePartsHelper:
+    """Tests for ``AIAgent._try_strip_image_parts_from_tool_messages``,
+    called on bare agents built by ``_make_agent``."""
+
+    def test_no_messages_returns_false(self):
+        """An empty list and ``None`` both return ``False``."""
+        agent = _make_agent()
+        assert agent._try_strip_image_parts_from_tool_messages([]) is False
+        assert agent._try_strip_image_parts_from_tool_messages(None) is False
+
+    def test_no_tool_messages_returns_false(self):
+        """A history with only user/assistant messages returns ``False``."""
+        agent = _make_agent()
+        msgs = [
+            {"role": "user", "content": "plain text"},
+            {"role": "assistant", "content": "ack"},
+        ]
+        assert agent._try_strip_image_parts_from_tool_messages(msgs) is False
+
+    def test_tool_message_with_string_content_unchanged(self):
+        """A tool message whose content is already a string is left
+        untouched and the helper returns ``False``."""
+        agent = _make_agent()
+        msgs = [
+            {"role": "tool", "tool_call_id": "x", "content": "plain string result"},
+        ]
+        assert agent._try_strip_image_parts_from_tool_messages(msgs) is False
+        assert msgs[0]["content"] == "plain string result"
+
+    def test_tool_message_list_without_image_unchanged(self):
+        """List content with only text parts is left alone — caller surfaces
+        the original error if this turns out to also be rejected."""
+        agent = _make_agent()
+        msgs = [
+            {"role": "tool", "tool_call_id": "x", "content": [
+                {"type": "text", "text": "hello"},
+            ]},
+        ]
+        assert agent._try_strip_image_parts_from_tool_messages(msgs) is False
+
+    def test_tool_message_list_with_image_downgrades(self):
+        """Text-plus-image list content is replaced in place by a string
+        that keeps the text and drops the image data."""
+        agent = _make_agent()
+        msgs = [
+            {"role": "tool", "tool_call_id": "x", "content": [
+                {"type": "text", "text": "AX summary: 5 buttons visible"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}},
+            ]},
+        ]
+        assert agent._try_strip_image_parts_from_tool_messages(msgs) is True
+        # Image stripped; text preserved as a string.
+        assert isinstance(msgs[0]["content"], str)
+        assert "AX summary" in msgs[0]["content"]
+        assert "image_url" not in msgs[0]["content"]
+        assert "iVBOR" not in msgs[0]["content"]
+
+    def test_tool_message_image_only_gets_placeholder(self):
+        """If the list had nothing but image parts, leave a placeholder so
+        the assistant message has something to reference."""
+        agent = _make_agent()
+        msgs = [
+            {"role": "tool", "tool_call_id": "x", "content": [
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}},
+            ]},
+        ]
+        assert agent._try_strip_image_parts_from_tool_messages(msgs) is True
+        assert isinstance(msgs[0]["content"], str)
+        assert "image content removed" in msgs[0]["content"]
+
+    def test_records_provider_model_in_session_cache(self):
+        """After a downgrade, the agent's ``(provider, model)`` pair is in
+        ``_no_list_tool_content_models``."""
+        agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
+        msgs = [
+            {"role": "tool", "tool_call_id": "x", "content": [
+                {"type": "text", "text": "summary"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}},
+            ]},
+        ]
+        agent._try_strip_image_parts_from_tool_messages(msgs)
+        assert ("xiaomi", "mimo-v2.5") in agent._no_list_tool_content_models
+
+    def test_only_tool_messages_get_downgraded(self):
+        """User / assistant messages with list-type content are out of
+        scope — they're handled by the existing image-routing path."""
+        agent = _make_agent()
+        msgs = [
+            {"role": "user", "content": [
+                {"type": "text", "text": "describe"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}},
+            ]},
+            {"role": "tool", "tool_call_id": "x", "content": [
+                {"type": "text", "text": "summary"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,Y"}},
+            ]},
+        ]
+        agent._try_strip_image_parts_from_tool_messages(msgs)
+        # User message untouched.
+        assert isinstance(msgs[0]["content"], list)
+        assert any(p.get("type") == "image_url" for p in msgs[0]["content"])
+        # Tool message downgraded.
+        assert isinstance(msgs[1]["content"], str)
+        assert "summary" in msgs[1]["content"]
+
+    def test_skips_recording_when_no_model_id(self):
+        """Don't poison the cache with empty keys when provider/model is
+        unset (e.g. lazy-initialised mid-handshake)."""
+        agent = _make_agent(provider="", model="")
+        msgs = [
+            {"role": "tool", "tool_call_id": "x", "content": [
+                {"type": "text", "text": "summary"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}},
+            ]},
+        ]
+        agent._try_strip_image_parts_from_tool_messages(msgs)
+        # NOTE(review): ``_make_agent`` never sets this attribute, so the
+        # helper presumably creates it even when nothing is recorded — confirm.
+        assert agent._no_list_tool_content_models == set()
+
+
+# ─── Short-circuit on cached models ──────────────────────────────────────────
+
+
+class TestToolResultContentShortCircuit:
+    """Once the session has learned that (provider, model) rejects list
+    content, ``_tool_result_content_for_active_model`` returns a text
+    summary even though ``_model_supports_vision`` reports True.
+    """
+
+    def _multimodal_result(self, png_b64: str = "iVBORw0KGgoAAAA"):
+        """Build a ``_multimodal`` tool result with one text part, one
+        base64 PNG ``image_url`` part, a text summary and capture meta."""
+        return {
+            "_multimodal": True,
+            "content": [
+                {"type": "text", "text": "capture mode=som 800x600 app=Safari"},
+                {"type": "image_url",
+                 "image_url": {"url": f"data:image/png;base64,{png_b64}"}},
+            ],
+            "text_summary": "capture mode=som 800x600 app=Safari",
+            "meta": {"mode": "som", "width": 800, "height": 600, "elements": 5,
+                     "png_bytes": 1024},
+        }
+
+    def test_returns_list_when_cache_empty_and_vision_supported(self, monkeypatch):
+        """Empty cache + vision support: the content-parts list is returned."""
+        agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
+        agent._no_list_tool_content_models = set()  # explicit empty
+        monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
+        out = agent._tool_result_content_for_active_model(
+            "computer_use", self._multimodal_result()
+        )
+        # Native multimodal path: returns the content parts list.
+        assert isinstance(out, list)
+        assert any(p.get("type") == "image_url" for p in out)
+
+    def test_returns_text_summary_when_model_in_cache(self, monkeypatch):
+        """A cached (provider, model) pair forces a plain-string result."""
+        agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
+        agent._no_list_tool_content_models = {("xiaomi", "mimo-v2.5")}
+        monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
+        out = agent._tool_result_content_for_active_model(
+            "computer_use", self._multimodal_result()
+        )
+        # Short-circuit: a plain string summary, no image_url present.
+        assert isinstance(out, str)
+        assert "data:image" not in out
+        assert "image_url" not in out
+
+    def test_cache_miss_on_different_model(self, monkeypatch):
+        """Cache is per (provider, model). A cached entry for mimo-v2.5
+        must NOT affect a session running on a different model.
+        """
+        agent = _make_agent(provider="xiaomi", model="mimo-v2.5-pro")
+        agent._no_list_tool_content_models = {("xiaomi", "mimo-v2.5")}
+        monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
+        out = agent._tool_result_content_for_active_model(
+            "computer_use", self._multimodal_result()
+        )
+        assert isinstance(out, list)
+
+    def test_missing_cache_attribute_falls_through(self, monkeypatch):
+        """Tests that build agents via ``object.__new__`` without calling
+        ``__init__`` must not crash — the cache attribute may be absent.
+        """
+        agent = _make_agent()
+        # Enforce the precondition instead of assuming it: if the helper
+        # that builds the agent pre-populates the cache, merely "not
+        # assigning" it here would exercise the empty-cache path rather
+        # than the attribute-missing path this test is named for.
+        vars(agent).pop("_no_list_tool_content_models", None)
+        monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
+        out = agent._tool_result_content_for_active_model(
+            "computer_use", self._multimodal_result()
+        )
+        assert isinstance(out, list)
+
+
+# ─── Classifier ──────────────────────────────────────────────────────────────
+
+
+class TestRecoveryEndToEndClassification:
+    """Pin the error patterns the recovery path relies on to the right
+    ``FailoverReason``. (The recovery hook in ``agent.conversation_loop``
+    consumes this reason directly.)
+    """
+
+    def test_xiaomi_mimo_classifies(self):
+        """Xiaomi MiMo's 400 "text is not set" payload classifies as a
+        retryable ``multimodal_tool_content_unsupported`` failure."""
+        message = (
+            "Error code: 400 - {'error': {'code': '400', 'message': "
+            "'Param Incorrect', 'param': 'text is not set', 'type': ''}}"
+        )
+        api_error = _FakeApiError(status_code=400, message=message)
+        classified = classify_api_error(api_error, provider="xiaomi", model="mimo-v2.5")
+        assert classified.reason == FailoverReason.multimodal_tool_content_unsupported
+        assert classified.retryable is True
+
+    def test_alibaba_variant_classifies(self):
+        """The "tool_call.content must be string" wording maps to the
+        same failover reason."""
+        api_error = _FakeApiError(
+            status_code=400,
+            message="tool_call.content must be string",
+        )
+        classified = classify_api_error(
+            api_error, provider="alibaba", model="qwen3.5-plus"
+        )
+        assert classified.reason == FailoverReason.multimodal_tool_content_unsupported
@@ -2636,6 +2636,31 @@ class TestRunConversation:
        assert result["final_response"] == "Final answer"
        assert result["completed"] is True

+    def test_ollama_small_runtime_context_fails_before_api_call(self, agent, caplog):
+        """With ``_ollama_num_ctx`` set to 4096 the turn must fail up front:
+        no chat-completions call is made, the result carries the
+        ``ollama_runtime_context_too_small`` exit reason plus a remediation
+        hint, and a warning is logged on ``agent.conversation_loop``."""
+        self._setup_agent(agent)
+        agent.model = "qwen3.5:9b"
+        agent.provider = "custom"
+        agent.base_url = "http://host.docker.internal:11434/v1"
+        agent._ollama_num_ctx = 4096
+
+        # Persistence / cleanup hooks are stubbed out so only the
+        # conversation loop's early-exit path runs.
+        with (
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+            caplog.at_level(logging.WARNING, logger="agent.conversation_loop"),
+        ):
+            result = agent.run_conversation("Call ps -aux")
+
+        assert result["failed"] is True
+        assert result["completed"] is False
+        assert result["api_calls"] == 0
+        assert result["turn_exit_reason"] == "ollama_runtime_context_too_small"
+        assert "Ollama loaded `qwen3.5:9b` with only 4,096 tokens" in result["final_response"]
+        assert "model.ollama_num_ctx: 65536" in result["final_response"]
+        # The provider client must never have been reached.
+        assert not agent.client.chat.completions.create.called
+        assert "Ollama runtime context too small for Hermes tool use" in caplog.text
+        assert "runtime_context=4096" in caplog.text
+
    def test_tool_calls_then_stop(self, agent):
        self._setup_agent(agent)
        tc = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
@@ -0,0 +1,491 @@
+"""Hermetic tests for the Bitwarden Secrets Manager integration.
+
+We never hit GitHub or Bitwarden in tests — subprocess + urllib are
+mocked so the suite stays fast and offline-safe.  The "live" pull and
+binary download are exercised manually by `hermes secrets bitwarden
+setup` outside of pytest.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import io
+import json
+import os
+import stat
+import subprocess
+import sys
+import tempfile
+import time
+import zipfile
+from pathlib import Path
+from unittest import mock
+
+import pytest
+
+
+# Make the worktree importable without depending on the installed wheel.
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+
+from agent.secret_sources import bitwarden as bw  # noqa: E402
+
+
+@pytest.fixture(autouse=True)
+def _reset_caches():
+    """Call ``bw._reset_cache_for_tests()`` before and after every test
+    in this module so cached state cannot carry over between tests."""
+    bw._reset_cache_for_tests()
+    yield
+    bw._reset_cache_for_tests()
+
+
+@pytest.fixture
+def hermes_home(tmp_path, monkeypatch):
+    """Point Hermes at an isolated home directory.
+
+    Creates ``<tmp_path>/.hermes``, exports it as ``HERMES_HOME`` and
+    returns the path. Both the environment variable and the optional
+    module-level cache are changed through ``monkeypatch`` so they are
+    restored when the test finishes.
+    """
+    home = tmp_path / ".hermes"
+    home.mkdir()
+    monkeypatch.setenv("HERMES_HOME", str(home))
+    # Some modules cache get_hermes_home; clear if needed. Going through
+    # monkeypatch (rather than a bare assignment) also puts the previous
+    # cache value back at teardown, so a path cached during this test
+    # cannot leak into later tests.
+    import hermes_constants
+    if hasattr(hermes_constants, "_HERMES_HOME_CACHE"):
+        monkeypatch.setattr(hermes_constants, "_HERMES_HOME_CACHE", None)
+    return home
+
+
+# ---------------------------------------------------------------------------
+# _platform_asset_name
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "system,machine,libc_text,expected",
+    [
+        ("Darwin", "x86_64", "",
+         f"bws-macos-universal-{bw._BWS_VERSION}.zip"),
+        ("Darwin", "arm64", "",
+         f"bws-macos-universal-{bw._BWS_VERSION}.zip"),
+        ("Linux", "x86_64", "glibc",
+         f"bws-x86_64-unknown-linux-gnu-{bw._BWS_VERSION}.zip"),
+        ("Linux", "x86_64", "musl libc",
+         f"bws-x86_64-unknown-linux-musl-{bw._BWS_VERSION}.zip"),
+        ("Linux", "aarch64", "",
+         f"bws-aarch64-unknown-linux-gnu-{bw._BWS_VERSION}.zip"),
+        ("Windows", "AMD64", "",
+         f"bws-x86_64-pc-windows-msvc-{bw._BWS_VERSION}.zip"),
+        ("Windows", "ARM64", "",
+         f"bws-aarch64-pc-windows-msvc-{bw._BWS_VERSION}.zip"),
+    ],
+)
+def test_platform_asset_name(system, machine, libc_text, expected):
+    """``bw._platform_asset_name()`` returns the expected release asset
+    for each (system, machine, libc) combination.
+
+    ``platform.system`` / ``platform.machine`` and ``subprocess.run`` are
+    patched on the ``bw`` module; the stubbed ``subprocess.run`` reports
+    ``libc_text`` on both stdout and stderr (presumably the libc probe —
+    confirm against the implementation).
+    """
+    with mock.patch.object(bw.platform, "system", return_value=system), \
+         mock.patch.object(bw.platform, "machine", return_value=machine), \
+         mock.patch.object(
+             bw.subprocess,
+             "run",
+             return_value=mock.Mock(stdout=libc_text, stderr=libc_text),
+         ):
+        assert bw._platform_asset_name() == expected
+
+
+# ---------------------------------------------------------------------------
+# install_bws — fully mocked HTTP
+# ---------------------------------------------------------------------------
+
+
+def _make_fake_zip(binary_bytes: bytes) -> bytes:
+    """Return the raw bytes of an in-memory zip archive whose single
+    member, ``bws``, holds ``binary_bytes``."""
+    archive = io.BytesIO()
+    with zipfile.ZipFile(archive, mode="w") as bundle:
+        bundle.writestr("bws", binary_bytes)
+    return archive.getvalue()
+
+
+def test_install_bws_happy_path(hermes_home, monkeypatch):
+    """A zip whose SHA-256 matches its checksum entry is installed as an
+    executable file containing the archive member's exact bytes."""
+    fake_binary = b"#!/bin/sh\necho 'bws fake 2.0.0'\n"
+    archive = _make_fake_zip(fake_binary)
+    asset = bw._platform_asset_name()
+    digest = hashlib.sha256(archive).hexdigest()
+    checksums = (
+        f"{digest}  {asset}\n"
+        "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  other-file\n"
+    )
+
+    def fake_download(url, dest):
+        # Serve the archive or the checksum list depending on the URL suffix.
+        if url.endswith(".zip"):
+            Path(dest).write_bytes(archive)
+            return
+        if url.endswith(".txt"):
+            Path(dest).write_text(checksums)
+            return
+        raise AssertionError(f"unexpected download url: {url}")
+
+    monkeypatch.setattr(bw, "_http_download", fake_download)
+
+    installed = bw.install_bws()
+    assert installed.exists()
+    assert installed.read_bytes() == fake_binary
+    # Executable bit set
+    assert installed.stat().st_mode & stat.S_IXUSR
+
+
+def test_install_bws_checksum_mismatch(hermes_home, monkeypatch):
+    """A checksum entry that does not match the downloaded zip aborts the
+    install with a "Checksum mismatch" ``RuntimeError``."""
+    archive = _make_fake_zip(b"contents")
+    asset = bw._platform_asset_name()
+    bogus_digest = "0" * 64
+    checksums = f"{bogus_digest}  {asset}\n"
+
+    def fake_download(url, dest):
+        target = Path(dest)
+        if not url.endswith(".zip"):
+            target.write_text(checksums)
+        else:
+            target.write_bytes(archive)
+
+    monkeypatch.setattr(bw, "_http_download", fake_download)
+
+    with pytest.raises(RuntimeError, match="Checksum mismatch"):
+        bw.install_bws()
+
+
+def test_install_bws_missing_checksum_entry(hermes_home, monkeypatch):
+    """A checksum file with no line for the platform asset aborts the
+    install with a "No checksum entry" ``RuntimeError``."""
+    archive = _make_fake_zip(b"x")
+
+    def fake_download(url, dest):
+        target = Path(dest)
+        if not url.endswith(".zip"):
+            target.write_text("ffffffff  some-other-file.zip\n")
+        else:
+            target.write_bytes(archive)
+
+    monkeypatch.setattr(bw, "_http_download", fake_download)
+
+    with pytest.raises(RuntimeError, match="No checksum entry"):
+        bw.install_bws()
+
+
+# ---------------------------------------------------------------------------
+# fetch_bitwarden_secrets
+# ---------------------------------------------------------------------------
+
+
+def _fake_bws_payload(items):
+    """Serialise ``items`` to the JSON text the stubbed ``bws`` process
+    hands back on stdout in these tests."""
+    serialized = json.dumps(items)
+    return serialized
+
+
+def test_fetch_happy_path(monkeypatch, tmp_path):
+    """A successful ``bws secret list`` run is returned as a name → value
+    mapping with no warnings; the access token reaches the child env."""
+    bws_path = tmp_path / "bws"
+    bws_path.write_text("")
+    expected = {
+        "OPENAI_API_KEY": "sk-abc",
+        "ANTHROPIC_API_KEY": "sk-ant-xyz",
+    }
+    payload = _fake_bws_payload(
+        [{"key": name, "value": value} for name, value in expected.items()]
+    )
+
+    def fake_run(cmd, **kwargs):
+        assert cmd[0] == str(bws_path)
+        assert "secret" in cmd and "list" in cmd
+        assert kwargs["env"]["BWS_ACCESS_TOKEN"] == "0.fake.token"
+        return mock.Mock(returncode=0, stdout=payload, stderr="")
+
+    monkeypatch.setattr(bw.subprocess, "run", fake_run)
+
+    secrets, warnings = bw.fetch_bitwarden_secrets(
+        access_token="0.fake.token",
+        project_id="proj-uuid",
+        binary=bws_path,
+        use_cache=False,
+    )
+    assert secrets == expected
+    assert warnings == []
+
+
+def test_fetch_skips_invalid_env_names(monkeypatch, tmp_path):
+    """Keys that are not valid environment-variable names are dropped,
+    each producing one warning; valid keys are kept."""
+    bws_path = tmp_path / "bws"
+    bws_path.write_text("")
+    entries = [
+        ("VALID_KEY", "v1"),
+        ("1BAD_START", "v2"),
+        ("has spaces", "v3"),
+        ("DASH-KEY", "v4"),
+    ]
+    payload = _fake_bws_payload([{"key": k, "value": v} for k, v in entries])
+
+    def fake_run(*_args, **_kwargs):
+        return mock.Mock(returncode=0, stdout=payload, stderr="")
+
+    monkeypatch.setattr(bw.subprocess, "run", fake_run)
+
+    secrets, warnings = bw.fetch_bitwarden_secrets(
+        access_token="0.t",
+        project_id="p",
+        binary=bws_path,
+        use_cache=False,
+    )
+    assert secrets == {"VALID_KEY": "v1"}
+    assert len(warnings) == 3
+
+
+def test_fetch_auth_failure(monkeypatch, tmp_path):
+    """A non-zero ``bws`` exit surfaces its stderr text in the raised
+    ``RuntimeError``."""
+    bws_path = tmp_path / "bws"
+    bws_path.write_text("")
+
+    def fake_run(*_args, **_kwargs):
+        return mock.Mock(
+            returncode=1, stdout="", stderr="Error: invalid access token"
+        )
+
+    monkeypatch.setattr(bw.subprocess, "run", fake_run)
+
+    fetch_kwargs = dict(
+        access_token="0.bad",
+        project_id="p",
+        binary=bws_path,
+        use_cache=False,
+    )
+    with pytest.raises(RuntimeError, match="invalid access token"):
+        bw.fetch_bitwarden_secrets(**fetch_kwargs)
+
+
+def test_fetch_timeout(monkeypatch, tmp_path):
+    """``subprocess.TimeoutExpired`` from the ``bws`` call is reported as
+    a "timed out" ``RuntimeError``."""
+    bws_path = tmp_path / "bws"
+    bws_path.write_text("")
+
+    def fake_run(*_args, **_kwargs):
+        raise subprocess.TimeoutExpired(cmd="bws", timeout=30)
+
+    monkeypatch.setattr(bw.subprocess, "run", fake_run)
+
+    fetch_kwargs = dict(
+        access_token="0.t",
+        project_id="p",
+        binary=bws_path,
+        use_cache=False,
+    )
+    with pytest.raises(RuntimeError, match="timed out"):
+        bw.fetch_bitwarden_secrets(**fetch_kwargs)
+
+
+def test_fetch_non_json(monkeypatch, tmp_path):
+    """Stdout that is not JSON is reported as a "non-JSON"
+    ``RuntimeError`` even though the process exited with 0."""
+    bws_path = tmp_path / "bws"
+    bws_path.write_text("")
+
+    def fake_run(*_args, **_kwargs):
+        return mock.Mock(returncode=0, stdout="not json at all", stderr="")
+
+    monkeypatch.setattr(bw.subprocess, "run", fake_run)
+
+    fetch_kwargs = dict(
+        access_token="0.t",
+        project_id="p",
+        binary=bws_path,
+        use_cache=False,
+    )
+    with pytest.raises(RuntimeError, match="non-JSON"):
+        bw.fetch_bitwarden_secrets(**fetch_kwargs)
+
+
+def test_fetch_cache_hits(monkeypatch, tmp_path):
+    """Two identical fetches with ``cache_ttl_seconds=60`` spawn the
+    ``bws`` process only once."""
+    bws_path = tmp_path / "bws"
+    bws_path.write_text("")
+    payload = _fake_bws_payload([{"key": "K", "value": "v"}])
+
+    invocations = []
+
+    def fake_run(*args, **kwargs):
+        invocations.append(args)
+        return mock.Mock(returncode=0, stdout=payload, stderr="")
+
+    monkeypatch.setattr(bw.subprocess, "run", fake_run)
+
+    for _ in range(2):
+        bw.fetch_bitwarden_secrets(access_token="0.t", project_id="p",
+                                    binary=bws_path, cache_ttl_seconds=60)
+    assert len(invocations) == 1  # cached on second call
+
+
+def test_fetch_cache_disabled(monkeypatch, tmp_path):
+    """With ``use_cache=False`` every fetch spawns the ``bws`` process."""
+    bws_path = tmp_path / "bws"
+    bws_path.write_text("")
+    payload = _fake_bws_payload([])
+    invocations = []
+
+    def fake_run(*args, **kwargs):
+        invocations.append(args)
+        return mock.Mock(returncode=0, stdout=payload, stderr="")
+
+    monkeypatch.setattr(bw.subprocess, "run", fake_run)
+
+    for _ in range(2):
+        bw.fetch_bitwarden_secrets(access_token="0.t", project_id="p",
+                                    binary=bws_path, use_cache=False)
+    assert len(invocations) == 2
+
+
+# ---------------------------------------------------------------------------
+# apply_bitwarden_secrets — the public entry point used by env_loader
+# ---------------------------------------------------------------------------
+
+
+def test_apply_disabled_returns_empty():
+    """``enabled=False`` yields an OK result with nothing applied and no
+    error."""
+    outcome = bw.apply_bitwarden_secrets(enabled=False, project_id="p")
+    assert outcome.ok
+    assert not outcome.applied
+    assert not outcome.error
+
+
+def test_apply_missing_token(monkeypatch):
+    """Without ``BWS_ACCESS_TOKEN`` in the environment the result is not
+    OK and the error names the missing variable."""
+    monkeypatch.delenv("BWS_ACCESS_TOKEN", raising=False)
+    outcome = bw.apply_bitwarden_secrets(
+        enabled=True,
+        project_id="p",
+        auto_install=False,
+    )
+    assert not outcome.ok
+    assert "BWS_ACCESS_TOKEN" in outcome.error
+
+
+def test_apply_missing_project_id(monkeypatch):
+    """An empty ``project_id`` yields a non-OK result whose error
+    mentions ``project_id``."""
+    monkeypatch.setenv("BWS_ACCESS_TOKEN", "0.t")
+    outcome = bw.apply_bitwarden_secrets(
+        enabled=True,
+        project_id="",
+        auto_install=False,
+    )
+    assert not outcome.ok
+    assert "project_id" in outcome.error
+
+
+def test_apply_does_not_override_existing(monkeypatch, tmp_path):
+    """With ``override_existing=False`` a variable already present in
+    the environment is skipped while a new one is applied."""
+    monkeypatch.setenv("BWS_ACCESS_TOKEN", "0.t")
+    monkeypatch.setenv("OPENAI_API_KEY", "existing-value")
+    # Register NEW_KEY with monkeypatch, then remove it. This guarantees
+    # the variable is unset when the code under test runs (so it cannot
+    # be "skipped" because of the caller's shell) and lets monkeypatch
+    # undo whatever apply_bitwarden_secrets writes to os.environ, so the
+    # value does not leak into later tests. A plain
+    # ``delenv(..., raising=False)`` on an unset variable records nothing.
+    monkeypatch.setenv("NEW_KEY", "placeholder")
+    monkeypatch.delenv("NEW_KEY")
+    fake_binary = tmp_path / "bws"
+    fake_binary.write_text("")
+    payload = _fake_bws_payload([
+        {"key": "OPENAI_API_KEY", "value": "bsm-value"},
+        {"key": "NEW_KEY", "value": "new-value"},
+    ])
+    monkeypatch.setattr(
+        bw.subprocess, "run",
+        lambda *a, **kw: mock.Mock(returncode=0, stdout=payload, stderr=""),
+    )
+    monkeypatch.setattr(bw, "find_bws", lambda **kw: fake_binary)
+
+    result = bw.apply_bitwarden_secrets(
+        enabled=True, project_id="p",
+        override_existing=False, auto_install=False,
+    )
+    assert result.ok
+    assert "NEW_KEY" in result.applied
+    assert "OPENAI_API_KEY" in result.skipped
+    assert os.environ["OPENAI_API_KEY"] == "existing-value"
+    assert os.environ["NEW_KEY"] == "new-value"
+
+
+def test_apply_override_existing(monkeypatch, tmp_path):
+    """With ``override_existing=True`` the fetched value replaces the
+    one already in the environment."""
+    monkeypatch.setenv("BWS_ACCESS_TOKEN", "0.t")
+    monkeypatch.setenv("OPENAI_API_KEY", "stale")
+    bws_path = tmp_path / "bws"
+    bws_path.write_text("")
+    payload = _fake_bws_payload([{"key": "OPENAI_API_KEY", "value": "fresh"}])
+
+    def fake_run(*_args, **_kwargs):
+        return mock.Mock(returncode=0, stdout=payload, stderr="")
+
+    monkeypatch.setattr(bw.subprocess, "run", fake_run)
+    monkeypatch.setattr(bw, "find_bws", lambda **kw: bws_path)
+
+    outcome = bw.apply_bitwarden_secrets(
+        enabled=True,
+        project_id="p",
+        override_existing=True,
+        auto_install=False,
+    )
+    assert outcome.ok
+    assert os.environ["OPENAI_API_KEY"] == "fresh"
+
+
+def test_apply_never_overrides_bootstrap_token(monkeypatch, tmp_path):
+    """Even with override_existing=True, the access-token var is preserved."""
+    monkeypatch.setenv("BWS_ACCESS_TOKEN", "0.original")
+    bws_path = tmp_path / "bws"
+    bws_path.write_text("")
+    hostile_entry = {"key": "BWS_ACCESS_TOKEN", "value": "0.malicious-replacement"}
+    payload = _fake_bws_payload([hostile_entry])
+
+    def fake_run(*_args, **_kwargs):
+        return mock.Mock(returncode=0, stdout=payload, stderr="")
+
+    monkeypatch.setattr(bw.subprocess, "run", fake_run)
+    monkeypatch.setattr(bw, "find_bws", lambda **kw: bws_path)
+
+    outcome = bw.apply_bitwarden_secrets(
+        enabled=True,
+        project_id="p",
+        override_existing=True,
+        auto_install=False,
+    )
+    assert os.environ["BWS_ACCESS_TOKEN"] == "0.original"
+    assert "BWS_ACCESS_TOKEN" in outcome.skipped
+
+
+def test_apply_swallows_fetch_errors(monkeypatch, tmp_path):
+    """A fetch failure produces an error, NOT an exception."""
+    monkeypatch.setenv("BWS_ACCESS_TOKEN", "0.t")
+    bws_path = tmp_path / "bws"
+    bws_path.write_text("")
+
+    def fake_run(*_args, **_kwargs):
+        return mock.Mock(returncode=1, stdout="", stderr="bad token")
+
+    monkeypatch.setattr(bw.subprocess, "run", fake_run)
+    monkeypatch.setattr(bw, "find_bws", lambda **kw: bws_path)
+
+    outcome = bw.apply_bitwarden_secrets(
+        enabled=True,
+        project_id="p",
+        auto_install=False,
+    )
+    assert not outcome.ok
+    assert "bad token" in outcome.error
+
+
+# ---------------------------------------------------------------------------
+# env_loader integration
+# ---------------------------------------------------------------------------
+
+
+def test_env_loader_skips_when_disabled(tmp_path, monkeypatch):
+    """No config.yaml present → no BSM call, no crash."""
+    hermes_dir = tmp_path / ".hermes"
+    hermes_dir.mkdir()
+    monkeypatch.setenv("HERMES_HOME", str(hermes_dir))
+    monkeypatch.setattr(Path, "home", lambda: tmp_path)
+
+    from hermes_cli.env_loader import _apply_external_secret_sources
+    # Should be a no-op (returns None).
+    outcome = _apply_external_secret_sources(hermes_dir)
+    assert outcome is None
+
+
+def test_env_loader_calls_bsm_when_enabled(tmp_path, monkeypatch):
+    """An enabled ``secrets.bitwarden`` section in config.yaml makes
+    ``_apply_external_secret_sources`` call ``apply_bitwarden_secrets``
+    exactly once with the configured ``enabled`` / ``project_id``."""
+    home = tmp_path / ".hermes"
+    home.mkdir()
+    (home / "config.yaml").write_text(
+        "secrets:\n"
+        "  bitwarden:\n"
+        "    enabled: true\n"
+        "    project_id: 'proj-1'\n"
+        "    access_token_env: 'BWS_ACCESS_TOKEN'\n"
+        "    cache_ttl_seconds: 0\n"
+        "    override_existing: false\n"
+        "    auto_install: false\n"
+    )
+    monkeypatch.setenv("HERMES_HOME", str(home))
+    monkeypatch.setenv("BWS_ACCESS_TOKEN", "0.t")
+    monkeypatch.delenv("MY_BSM_KEY", raising=False)
+
+    called = {"n": 0}
+    def fake_apply(**kwargs):
+        called["n"] += 1
+        assert kwargs["enabled"] is True
+        assert kwargs["project_id"] == "proj-1"
+        # Write through monkeypatch rather than os.environ directly so the
+        # variable is removed again at teardown; delenv(raising=False) on
+        # an unset variable records nothing to undo.
+        monkeypatch.setenv("MY_BSM_KEY", "from-bsm")
+        return bw.FetchResult(
+            secrets={"MY_BSM_KEY": "from-bsm"},
+            applied=["MY_BSM_KEY"],
+        )
+
+    monkeypatch.setattr(
+        "agent.secret_sources.bitwarden.apply_bitwarden_secrets",
+        fake_apply,
+    )
+
+    from hermes_cli.env_loader import _apply_external_secret_sources
+    _apply_external_secret_sources(home)
+
+    assert called["n"] == 1
+    assert os.environ.get("MY_BSM_KEY") == "from-bsm"
@@ -0,0 +1,119 @@
+"""Tests for the secret-source tracking in ``hermes_cli.env_loader``.
+
+These cover the small public surface that lets `hermes model` / `hermes setup`
+label detected credentials with their origin ("from Bitwarden") so users
+don't see an unexplained "credentials ✓" line when their .env is empty.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import pytest
+
+
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+
+from hermes_cli import env_loader  # noqa: E402
+
+
+@pytest.fixture(autouse=True)
+def _reset_sources():
+    """Each test starts with a clean source map.
+
+    Entries present before the test are saved and put back afterwards,
+    so this module does not erase source labels that other tests (or
+    earlier imports) recorded in ``env_loader._SECRET_SOURCES``.
+    """
+    saved = dict(env_loader._SECRET_SOURCES)
+    env_loader._SECRET_SOURCES.clear()
+    try:
+        yield
+    finally:
+        env_loader._SECRET_SOURCES.clear()
+        env_loader._SECRET_SOURCES.update(saved)
+
+
+def test_get_secret_source_returns_none_for_untracked_var():
+    """A variable with no recorded source reports ``None``."""
+    source = env_loader.get_secret_source("ANTHROPIC_API_KEY")
+    assert source is None
+
+
+def test_get_secret_source_returns_label_for_tracked_var():
+    """A variable recorded in ``_SECRET_SOURCES`` reports its label."""
+    env_loader._SECRET_SOURCES["ANTHROPIC_API_KEY"] = "bitwarden"
+    source = env_loader.get_secret_source("ANTHROPIC_API_KEY")
+    assert source == "bitwarden"
+
+
+def test_format_secret_source_suffix_empty_for_untracked():
+    """Untracked variables get an empty suffix."""
+    # Credentials from .env or the shell shouldn't add noise — the
+    # implicit case stays unlabeled.
+    suffix = env_loader.format_secret_source_suffix("ANTHROPIC_API_KEY")
+    assert suffix == ""
+
+
+def test_format_secret_source_suffix_bitwarden_uses_proper_name():
+    """The ``bitwarden`` source is rendered with a capitalised name."""
+    env_loader._SECRET_SOURCES["ANTHROPIC_API_KEY"] = "bitwarden"
+    suffix = env_loader.format_secret_source_suffix("ANTHROPIC_API_KEY")
+    assert suffix == " (from Bitwarden)"
+
+
+def test_format_secret_source_suffix_generic_label_for_future_sources():
+    """An unrecognised source label is rendered verbatim."""
+    # Future-proofing: a new secret source (e.g. "vault") should still
+    # produce a sensible label without needing to edit every call site.
+    env_loader._SECRET_SOURCES["OPENAI_API_KEY"] = "vault"
+    suffix = env_loader.format_secret_source_suffix("OPENAI_API_KEY")
+    assert suffix == " (from vault)"
+
+
+def test_apply_external_secret_sources_records_bitwarden_origin(tmp_path, monkeypatch):
+    """End-to-end: when ``apply_bitwarden_secrets`` returns applied keys,
+    they end up in ``_SECRET_SOURCES`` so the UI can label them."""
+
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    (tmp_path / "config.yaml").write_text(
+        "secrets:\n"
+        "  bitwarden:\n"
+        "    enabled: true\n"
+        "    project_id: test-project\n"
+        "    access_token_env: BWS_ACCESS_TOKEN\n",
+        encoding="utf-8",
+    )
+
+    # The import inside _apply_external_secret_sources is lazy, so we
+    # patch the *module attribute* it will pull in.
+    import agent.secret_sources.bitwarden as bw_module
+    from agent.secret_sources.bitwarden import FetchResult
+
+    # Stub apply_bitwarden_secrets to return a synthetic FetchResult.
+    canned = FetchResult(
+        secrets={"ANTHROPIC_API_KEY": "sk-ant-test"},
+        applied=["ANTHROPIC_API_KEY"],
+    )
+    monkeypatch.setattr(
+        bw_module, "apply_bitwarden_secrets", lambda **_kwargs: canned
+    )
+
+    env_loader._apply_external_secret_sources(tmp_path)
+
+    source = env_loader.get_secret_source("ANTHROPIC_API_KEY")
+    assert source == "bitwarden"
+    suffix = env_loader.format_secret_source_suffix("ANTHROPIC_API_KEY")
+    assert suffix == " (from Bitwarden)"
+
+
+def test_apply_external_secret_sources_noop_when_disabled(tmp_path, monkeypatch):
+    """Disabled Bitwarden config must not touch the source map."""
+
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    (tmp_path / "config.yaml").write_text(
+        "secrets:\n"
+        "  bitwarden:\n"
+        "    enabled: false\n",
+        encoding="utf-8",
+    )
+
+    env_loader._apply_external_secret_sources(tmp_path)
+
+    source = env_loader.get_secret_source("ANTHROPIC_API_KEY")
+    assert source is None
@@ -0,0 +1,187 @@
+"""Verify scripts/run_tests_parallel.py kills test-spawned grandchildren.
+
+Setup
+-----
+A test in this file spawns a long-lived Python grandchild that writes
+its PID + a nonce to a tempfile, then exits without cleaning up.
+With the old ``subprocess.run`` runner, that grandchild would orphan
+and outlive the test (and the whole runner). With the current Popen +
+``start_new_session`` + ``_kill_tree`` runner, the grandchild gets
+SIGKILL'd via process-group kill when its file's pytest exits.
+
+The leaker test always passes — its only job is to spawn a grandchild
+and walk away. The verifier runs the runner over the leaker file in a
+subprocess, then waits for the grandchild PID to disappear from the
+kernel's process table.
+
+POSIX-only: Windows has its own grandchild lifecycle (no shared session,
+``taskkill /F /T`` semantics). Marked accordingly.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+import sys
+import textwrap
+import time
+from pathlib import Path
+
+import pytest
+
+
+# The probe test and the verifier share a handoff file: the probe writes
+# it, the verifier reads it. It lives under $TMPDIR (falling back to
+# /tmp) and gets a unique-per-run file name so concurrent invocations of
+# the suite don't clobber each other.
+_HANDOFF_DIR = Path(os.environ.get("TMPDIR", "/tmp")) / "hermes-isolation-probe"
+# parents=True: this runs at import (collection) time on every platform,
+# including ones where the test itself is skipped, so a missing base
+# directory must not turn into a collection error.
+_HANDOFF_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def _handoff_path_for(nonce: str) -> Path:
+    """Return the handoff-file path for ``nonce`` inside ``_HANDOFF_DIR``."""
+    filename = f"grandchild-{nonce}.json"
+    return _HANDOFF_DIR / filename
+
+
+def _pid_alive(pid: int) -> bool:
+    """POSIX liveness probe: report whether ``pid`` still exists.
+
+    Signal 0 is sent with ``os.kill``; nothing is delivered, only the
+    existence/permission check runs. ``ProcessLookupError`` means the
+    process is gone. ``PermissionError`` means it exists but belongs to
+    someone else, which still counts as alive.
+
+    Raises ``RuntimeError`` on Windows, where this probe is unsupported.
+    """
+    if sys.platform == "win32":  # pragma: no cover — POSIX-only test
+        # On Windows we'd use OpenProcess + GetExitCodeProcess; this
+        # test is skipped on Windows so the path is unreachable.
+        raise RuntimeError("_pid_alive POSIX-only")
+    try:
+        os.kill(pid, 0)
+    except ProcessLookupError:
+        return False
+    except PermissionError:
+        # Exists, just not ours to signal.
+        pass
+    return True
+
+
+@pytest.mark.skipif(sys.platform == "win32", reason="POSIX-only probe")
+@pytest.mark.live_system_guard_bypass
+def test_grandchild_leak_is_killed_by_runner(tmp_path: Path) -> None:
+    """Run the parallel runner over a probe file and verify cleanup.
+
+    1. Materialize a probe file that spawns a long-lived grandchild and
+       writes its PID to disk before exiting.
+    2. Invoke ``scripts/run_tests_parallel.py`` against the probe file.
+    3. Wait for the grandchild PID to vanish (poll for ~5s).
+    4. Assert the runner exited cleanly AND the grandchild is dead.
+
+    Once the grandchild PID is known, any failure path (runner exit code
+    or poll timeout) SIGKILLs a grandchild that was not observed dead, so
+    a failed run cannot leave a ``sleep(600)`` process behind.
+    """
+    repo_root = Path(__file__).resolve().parent.parent
+    runner = repo_root / "scripts" / "run_tests_parallel.py"
+    assert runner.exists(), f"runner missing at {runner}"
+
+    # Probe lives in a temp dir, NOT under tests/, so the regular suite
+    # never picks it up — only our explicit invocation does.
+    probe_dir = tmp_path / "probe"
+    probe_dir.mkdir()
+    probe = probe_dir / "test_probe_leaker.py"
+    nonce = f"{os.getpid()}-{int(time.time() * 1000)}"
+    handoff = _handoff_path_for(nonce)
+    if handoff.exists():
+        handoff.unlink()
+
+    probe_src = textwrap.dedent(f"""
+        import json, os, subprocess, sys, time
+        from pathlib import Path
+
+        HANDOFF = Path({str(handoff)!r})
+
+        def test_spawns_grandchild_and_walks_away():
+            # Long-lived grandchild: detached, ignores SIGTERM (we want
+            # SIGKILL or process-group kill to be the only thing that
+            # works, simulating a misbehaving server).
+            child = subprocess.Popen(
+                [
+                    sys.executable, "-c",
+                    "import os, signal, sys, time; "
+                    "signal.signal(signal.SIGTERM, signal.SIG_IGN); "
+                    "sys.stdout.write(f'gc-pgid={{os.getpgid(0)}} gc-pid={{os.getpid()}}\\\\n'); "
+                    "sys.stdout.flush(); "
+                    "time.sleep(600)",
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                # IMPORTANT: do NOT pass start_new_session here. We want
+                # the grandchild to inherit the pytest subprocess's
+                # process group, so when the runner kills the group the
+                # grandchild dies too.
+            )
+            # Read the first line so we can record gc's pgid in the
+            # handoff, then walk away — don't close the pipe (would
+            # signal EOF and let the child see SIGPIPE on next write).
+            first_line = child.stdout.readline().decode().strip()
+            HANDOFF.write_text(json.dumps({{
+                "pid": child.pid,
+                "diag": first_line,
+                "test_pid": os.getpid(),
+                "test_pgid": os.getpgid(0),
+            }}))
+            assert child.pid > 0
+    """).strip()
+    probe.write_text(probe_src + "\n")
+
+    # Run the parallel runner against just the probe file. The runner
+    # discovers under ``tests/`` by default, so we override via --paths.
+    proc = subprocess.run(
+        [
+            sys.executable,
+            str(runner),
+            "--paths",
+            str(probe_dir),
+            "-j",
+            "1",
+            # Tight per-file timeout: the probe finishes in <1s, no
+            # need for 10min.
+            "--file-timeout",
+            "30",
+        ],
+        cwd=repo_root,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        timeout=60,
+    )
+
+    assert handoff.exists(), (
+        f"probe never wrote handoff file; runner output:\n{proc.stdout}"
+    )
+    handoff_data = json.loads(handoff.read_text())
+    grandchild_pid = handoff_data["pid"]
+    diag = handoff_data.get("diag", "(no diag)")
+    test_pid = handoff_data.get("test_pid")
+    test_pgid = handoff_data.get("test_pgid")
+    handoff.unlink()
+
+    # Set only once the grandchild has been observed dead. The cleanup in
+    # ``finally`` keys off this flag instead of re-probing the PID, so it
+    # never signals a PID after we have already seen it disappear.
+    grandchild_gone = False
+    try:
+        # The runner must have exited cleanly (probe test passes).
+        assert proc.returncode == 0, (
+            f"runner exited {proc.returncode}; output:\n{proc.stdout}"
+        )
+
+        # The grandchild must be gone. Poll for a bit because process-group
+        # SIGKILL + reaping isn't synchronous; on a loaded box it can take
+        # a beat.
+        deadline = time.monotonic() + 5.0
+        while time.monotonic() < deadline:
+            if not _pid_alive(grandchild_pid):
+                grandchild_gone = True
+                break
+            time.sleep(0.05)
+        else:
+            pytest.fail(
+                f"grandchild PID {grandchild_pid} survived runner exit; "
+                f"diag={diag!r} test_pid={test_pid} test_pgid={test_pgid}; "
+                f"runner output:\n{proc.stdout}"
+            )
+    finally:
+        # Test cleanup: kill the leaked grandchild ourselves so a FAILED
+        # assertion (exit code or poll timeout) doesn't leave a
+        # sleep(600) running.
+        if not grandchild_gone:
+            try:
+                os.kill(grandchild_pid, 9)
+            except ProcessLookupError:
+                pass
@@ -59,6 +59,59 @@ def test_write_json_returns_false_on_broken_pipe(monkeypatch):
    assert server.write_json({"ok": True}) is False


+def test_tui_verbose_tool_details_fail_closed_when_redaction_fails(monkeypatch):
+    redact_module = types.ModuleType("agent.redact")
+
+    def fail_redaction(*_args, **_kwargs):
+        raise RuntimeError("redaction unavailable")
+
+    setattr(redact_module, "redact_sensitive_text", fail_redaction)
+    monkeypatch.setitem(sys.modules, "agent.redact", redact_module)
+
+    assert server._redact_tui_verbose_text("api_key=secret") == ""
+    assert server._tool_args_text({"api_key": "secret"}) == ""
+    assert server._tool_result_text("token=secret") == ""
+
+
+def test_tui_verbose_tool_details_are_capped_before_emit(monkeypatch):
+    monkeypatch.setattr(server, "_TUI_VERBOSE_TEXT_MAX_CHARS", 12)
+    monkeypatch.setattr(server, "_TUI_VERBOSE_TEXT_MAX_LINES", 2)
+
+    capped = server._cap_tui_verbose_text("one\ntwo\nthree\nfour")
+
+    assert capped.startswith("[showing verbose tail; omitted ")
+    assert capped.endswith("three\nfour")
+    assert "one" not in capped
+
+
+def test_tui_verbose_tool_events_omit_details_when_redaction_fails(monkeypatch):
+    redact_module = types.ModuleType("agent.redact")
+
+    def fail_redaction(*_args, **_kwargs):
+        raise RuntimeError("redaction unavailable")
+
+    setattr(redact_module, "redact_sensitive_text", fail_redaction)
+    monkeypatch.setitem(sys.modules, "agent.redact", redact_module)
+
+    events: list[tuple[str, str, dict]] = []
+    monkeypatch.setattr(
+        server, "_emit", lambda event_type, sid, payload: events.append((event_type, sid, payload))
+    )
+    monkeypatch.setitem(
+        server._sessions,
+        "redaction-test",
+        {"tool_progress_mode": "verbose", "tool_started_at": {}},
+    )
+
+    server._on_tool_start("redaction-test", "tool-1", "terminal", {"command": "pwd"})
+    server._on_tool_complete("redaction-test", "tool-1", "terminal", {"command": "pwd"}, "done")
+
+    assert events[0][0] == "tool.start"
+    assert events[1][0] == "tool.complete"
+    assert "args_text" not in events[0][2]
+    assert "result_text" not in events[1][2]
+
+
 def test_dispatch_rejects_non_object_request():
    resp = server.dispatch([])

@@ -1476,8 +1529,10 @@ def test_config_mouse_uses_documented_key_with_legacy_fallback(monkeypatch):
    set_toggle = server.handle_request(
        {"id": "2", "method": "config.set", "params": {"key": "mouse"}}
    )
-    assert set_toggle["result"] == {"key": "mouse", "value": "on"}
-    assert writes == [("display.mouse_tracking", True)]
+    # /mouse (no arg) toggles between 'all' and 'off'. Starting from
+    # tui_mouse: False (→ 'off'), the toggle flips to 'all'.
+    assert set_toggle["result"] == {"key": "mouse", "value": "all"}
+    assert writes == [("display.mouse_tracking", "all")]

    cfg["display"] = {"mouse_tracking": 0, "tui_mouse": True}
    get_canonical = server.handle_request(
@@ -1489,7 +1544,51 @@ def test_config_mouse_uses_documented_key_with_legacy_fallback(monkeypatch):
    get_null = server.handle_request(
        {"id": "4", "method": "config.get", "params": {"key": "mouse"}}
    )
-    assert get_null["result"]["value"] == "on"
+    # mouse_tracking present-but-None defers neither to tui_mouse nor to
+    # the legacy off bucket: it falls through to the 'all' default.
+    assert get_null["result"]["value"] == "all"
+
+
+def test_config_mouse_accepts_preset_strings_and_aliases(monkeypatch):
+    cfg = {"display": {"mouse_tracking": "all"}}
+    writes = []
+
+    monkeypatch.setattr(server, "_load_cfg", lambda: cfg)
+    monkeypatch.setattr(
+        server, "_write_config_key", lambda path, value: writes.append((path, value))
+    )
+
+    # Direct preset.
+    set_wheel = server.handle_request(
+        {
+            "id": "1",
+            "method": "config.set",
+            "params": {"key": "mouse", "value": "wheel"},
+        }
+    )
+    assert set_wheel["result"] == {"key": "mouse", "value": "wheel"}
+    assert writes[-1] == ("display.mouse_tracking", "wheel")
+
+    # Alias for buttons.
+    set_click = server.handle_request(
+        {
+            "id": "2",
+            "method": "config.set",
+            "params": {"key": "mouse", "value": "click"},
+        }
+    )
+    assert set_click["result"] == {"key": "mouse", "value": "buttons"}
+    assert writes[-1] == ("display.mouse_tracking", "buttons")
+
+    # Unknown value → 4002.
+    bad = server.handle_request(
+        {
+            "id": "3",
+            "method": "config.set",
+            "params": {"key": "mouse", "value": "rainbows"},
+        }
+    )
+    assert bad["error"]["code"] == 4002


 def test_enable_gateway_prompts_sets_gateway_env(monkeypatch):
@@ -0,0 +1,69 @@
+"""Shared fixtures for tests/tools/ web-provider tests.
+
+Per-file subprocess isolation means each test file gets a fresh interpreter,
+so module-level state (like the web-search-provider registry) is empty when
+a file starts.  The ``web_registry_populated`` fixture registers all bundled
+providers before each test and resets the registry afterwards — tests that
+depend on the registry being populated should use it explicitly or via
+``@pytest.mark.usefixtures("web_registry_populated")``.
+"""
+
+from unittest.mock import patch
+
+import pytest
+
+
+def register_all_web_providers():
+    """Register all bundled web-search providers into the global registry.
+
+    This is the single source of truth for the provider list used by
+    test classes that need the registry populated for dispatch checks.
+    """
+    from agent.web_search_registry import register_provider, _reset_for_tests
+    from plugins.web.brave_free.provider import BraveFreeWebSearchProvider
+    from plugins.web.ddgs.provider import DDGSWebSearchProvider
+    from plugins.web.exa.provider import ExaWebSearchProvider
+    from plugins.web.firecrawl.provider import FirecrawlWebSearchProvider
+    from plugins.web.parallel.provider import ParallelWebSearchProvider
+    from plugins.web.searxng.provider import SearXNGWebSearchProvider
+    from plugins.web.tavily.provider import TavilyWebSearchProvider
+    from plugins.web.xai.provider import XAIWebSearchProvider
+
+    _reset_for_tests()
+    for cls in (
+        BraveFreeWebSearchProvider,
+        DDGSWebSearchProvider,
+        ExaWebSearchProvider,
+        FirecrawlWebSearchProvider,
+        ParallelWebSearchProvider,
+        SearXNGWebSearchProvider,
+        TavilyWebSearchProvider,
+        XAIWebSearchProvider,
+    ):
+        register_provider(cls())
+
+
+@pytest.fixture
+def web_registry_populated():
+    """Populate the web-search-provider registry for one test, then reset."""
+    register_all_web_providers()
+    yield
+    from agent.web_search_registry import _reset_for_tests
+    _reset_for_tests()
+
+
+@pytest.fixture
+def disable_lazy_stt_install():
+    """Disarm the runtime lazy-install probe so static ``_HAS_FASTER_WHISPER``
+    patches accurately simulate 'faster-whisper not installed'.
+
+    Without this, ``_try_lazy_install_stt()`` calls
+    ``importlib.util.find_spec("faster_whisper")``, which returns truthy
+    whenever the package is installed in the dev / CI environment —
+    defeating the test's ``_HAS_FASTER_WHISPER=False`` patch.
+
+    Opt in at module scope with
+    ``pytestmark = pytest.mark.usefixtures("disable_lazy_stt_install")``.
+    """
+    with patch("tools.transcription_tools._try_lazy_install_stt", return_value=False):
+        yield
@@ -0,0 +1,246 @@
+"""Unit tests for tools/app_tools.py — the Nous tool gateway integration."""
+
+from __future__ import annotations
+
+import json
+from unittest.mock import MagicMock
+
+import httpx
+import pytest
+
+from tools.managed_tool_gateway import ManagedToolGatewayConfig
+
+
+# Stand-in gateway config returned by the patched
+# ``resolve_managed_tool_gateway`` in the tests below; the origin and token
+# are dummy values that the URL / Authorization-header assertions compare
+# against.
+_FAKE_GATEWAY = ManagedToolGatewayConfig(
+    vendor="tools",
+    gateway_origin="https://tools-gateway.example.com",
+    nous_user_token="test-token-abc123",
+    managed_mode=True,
+)
+
+
+@pytest.fixture(autouse=True)
+def _reset_http_client_cache():
+    """Clear the module-level cached httpx client between tests."""
+    import tools.app_tools as mod
+    mod._http_client = None
+    mod._http_client_origin = None
+    yield
+    mod._http_client = None
+    mod._http_client_origin = None
+
+
+@pytest.fixture()
+def gateway_post(monkeypatch):
+    """Patch the gateway and httpx.Client.post; return a dict capturing the request."""
+    monkeypatch.setattr(
+        "tools.app_tools.resolve_managed_tool_gateway", lambda v: _FAKE_GATEWAY
+    )
+    monkeypatch.setattr(
+        "tools.app_tools._get_current_model_name", lambda: None
+    )
+    captured = {}
+    resp = MagicMock(spec=httpx.Response)
+    resp.status_code = 200
+    resp.json.return_value = {"data": {}, "error": None}
+    resp.text = json.dumps({"data": {}, "error": None})
+
+    def fake_post(self, url, *, json=None, headers=None, **kw):
+        captured["url"] = url
+        captured["headers"] = headers
+        captured["json"] = json
+        return resp
+
+    monkeypatch.setattr(httpx.Client, "post", fake_post)
+    return captured
+
+
+# ---------------------------------------------------------------------------
+# check_fn gating
+# ---------------------------------------------------------------------------
+
+class TestAppToolsAvailability:
+    def test_returns_false_when_gateway_not_ready(self, monkeypatch):
+        monkeypatch.setattr("tools.app_tools.is_managed_tool_gateway_ready", lambda vendor: False)
+        monkeypatch.setattr("tools.app_tools._read_portal_app_tools_enabled", lambda: True)
+        from tools.app_tools import _app_tools_available
+        assert _app_tools_available() is False
+
+    def test_returns_true_when_gateway_ready_and_config_on(self, monkeypatch):
+        monkeypatch.setattr("tools.app_tools.is_managed_tool_gateway_ready", lambda vendor: True)
+        monkeypatch.setattr("tools.app_tools._read_portal_app_tools_enabled", lambda: True)
+        from tools.app_tools import _app_tools_available
+        assert _app_tools_available() is True
+
+    def test_returns_false_when_config_off(self, monkeypatch):
+        monkeypatch.setattr("tools.app_tools.is_managed_tool_gateway_ready", lambda vendor: True)
+        monkeypatch.setattr("tools.app_tools._read_portal_app_tools_enabled", lambda: False)
+        from tools.app_tools import _app_tools_available
+        assert _app_tools_available() is False
+
+
+# ---------------------------------------------------------------------------
+# URL + auth header
+# ---------------------------------------------------------------------------
+
+class TestSearchPostsCorrectUrlAndAuth:
+    def test_posts_to_v1_search_with_bearer_token(self, monkeypatch, gateway_post):
+        monkeypatch.setattr("tools.app_tools._get_current_model_name", lambda: "test-model")
+        from tools.app_tools import handle_app_search_tools
+        handle_app_search_tools({"queries": [{"use_case": "send email"}]})
+
+        assert gateway_post["url"] == "https://tools-gateway.example.com/v1/search"
+        assert gateway_post["headers"]["Authorization"] == "Bearer test-token-abc123"
+        assert gateway_post["headers"]["Content-Type"] == "application/json"
+        assert gateway_post["json"]["queries"] == [{"use_case": "send email"}]
+        assert gateway_post["json"]["model"] == "test-model"
+
+
+# ---------------------------------------------------------------------------
+# Model auto-injection
+# ---------------------------------------------------------------------------
+
+class TestModelAutoInjection:
+    def test_injects_model_from_config(self, monkeypatch, gateway_post):
+        monkeypatch.setattr("tools.app_tools._get_current_model_name", lambda: "claude-sonnet-4")
+        from tools.app_tools import handle_app_search_tools
+        handle_app_search_tools({"queries": [{"use_case": "test"}]})
+        assert gateway_post["json"]["model"] == "claude-sonnet-4"
+
+    def test_omits_model_when_unresolvable(self, gateway_post):
+        from tools.app_tools import handle_app_search_tools
+        handle_app_search_tools({"queries": [{"use_case": "test"}]})
+        assert "model" not in gateway_post["json"]
+
+
+# ---------------------------------------------------------------------------
+# Gateway-internal param stripping (allowlist approach)
+# ---------------------------------------------------------------------------
+
+class TestExecuteStripsInternalParams:
+    def test_strips_sync_response_thought_step_metric(self, gateway_post):
+        from tools.app_tools import handle_app_execute_tools
+        handle_app_execute_tools({
+            "tools": [{"tool_slug": "TEST", "arguments": {}}],
+            "sync_response_to_workbench": True,
+            "thought": "testing",
+            "current_step": "TESTING",
+            "current_step_metric": "1/1 tests",
+        })
+        body = gateway_post["json"]
+        for key in ("sync_response_to_workbench", "thought", "current_step", "current_step_metric"):
+            assert key not in body
+        assert body["tools"] == [{"tool_slug": "TEST", "arguments": {}}]
+
+
+# ---------------------------------------------------------------------------
+# HTTP error → tool result (not exception)
+# ---------------------------------------------------------------------------
+
+class TestHttpErrorReturnedAsToolResult:
+    @pytest.mark.parametrize("status_code", [402, 403, 422, 500])
+    def test_returns_error_json_not_exception(self, monkeypatch, status_code):
+        monkeypatch.setattr("tools.app_tools.resolve_managed_tool_gateway", lambda v: _FAKE_GATEWAY)
+        error_body = {"error": {"code": "TEST_ERROR", "message": "fail"}}
+        resp = MagicMock(spec=httpx.Response)
+        resp.status_code = status_code
+        resp.json.return_value = error_body
+        resp.text = json.dumps(error_body)
+        monkeypatch.setattr(httpx.Client, "post", lambda self, url, **kw: resp)
+
+        from tools.app_tools import handle_app_search_tools
+        result = json.loads(handle_app_search_tools({"queries": [{"use_case": "test"}]}))
+        assert result["error"]["code"] == "TEST_ERROR"
+
+
+# ---------------------------------------------------------------------------
+# Network failure → tool result
+# ---------------------------------------------------------------------------
+
+class TestNetworkFailureReturnedAsToolResult:
+    def test_connect_error_returns_gateway_unreachable(self, monkeypatch):
+        monkeypatch.setattr("tools.app_tools.resolve_managed_tool_gateway", lambda v: _FAKE_GATEWAY)
+
+        def raise_connect(self, url, **kw):
+            raise httpx.ConnectError("Connection refused")
+        monkeypatch.setattr(httpx.Client, "post", raise_connect)
+
+        from tools.app_tools import handle_app_search_tools
+        result = json.loads(handle_app_search_tools({"queries": [{"use_case": "test"}]}))
+        assert result["error"]["code"] == "GATEWAY_UNREACHABLE"
+
+    def test_timeout_returns_gateway_timeout(self, monkeypatch):
+        monkeypatch.setattr("tools.app_tools.resolve_managed_tool_gateway", lambda v: _FAKE_GATEWAY)
+
+        def raise_timeout(self, url, **kw):
+            raise httpx.ReadTimeout("timed out")
+        monkeypatch.setattr(httpx.Client, "post", raise_timeout)
+
+        from tools.app_tools import handle_app_search_tools
+        result = json.loads(handle_app_search_tools({"queries": [{"use_case": "test"}]}))
+        assert result["error"]["code"] == "GATEWAY_TIMEOUT"
+
+
+# ---------------------------------------------------------------------------
+# Endpoint routing + payload forwarding
+# ---------------------------------------------------------------------------
+
+class TestEndpointRouting:
+    def test_manage_connections_forwards_toolkits(self, gateway_post):
+        from tools.app_tools import handle_app_manage_connections
+        handle_app_manage_connections({"toolkits": ["gmail", "slack"], "reinitiate_all": True})
+        assert gateway_post["url"].endswith("/v1/connections")
+        assert gateway_post["json"]["toolkits"] == ["gmail", "slack"]
+        assert gateway_post["json"]["reinitiate_all"] is True
+
+    def test_tool_schemas_forwards_slugs(self, gateway_post):
+        from tools.app_tools import handle_app_tool_schemas
+        handle_app_tool_schemas({"tool_slugs": ["GMAIL_SEND_EMAIL"], "include": ["input_schema", "output_schema"]})
+        assert gateway_post["url"].endswith("/v1/schemas")
+        assert gateway_post["json"]["tool_slugs"] == ["GMAIL_SEND_EMAIL"]
+        assert gateway_post["json"]["include"] == ["input_schema", "output_schema"]
+
+
+# ---------------------------------------------------------------------------
+# Registry entries
+# ---------------------------------------------------------------------------
+
+class TestRegistryEntries:
+    def test_all_four_tools_registered_under_app_tools(self):
+        from tools.registry import registry
+        import tools.app_tools  # noqa: F401
+        expected = {"app_search_tools", "app_tool_schemas", "app_execute_tools", "app_manage_connections"}
+        for name in expected:
+            entry = registry._tools.get(name)
+            assert entry is not None, f"{name} not registered"
+            assert entry.toolset == "app_tools"
+
+
+# ---------------------------------------------------------------------------
+# session (object) vs session_id (string) asymmetry
+# ---------------------------------------------------------------------------
+
+class TestSessionHandling:
+    def test_search_uses_session_object(self, gateway_post):
+        from tools.app_tools import handle_app_search_tools
+        handle_app_search_tools({"queries": [{"use_case": "test"}], "session": {"generate_id": True}})
+        assert isinstance(gateway_post["json"]["session"], dict)
+        assert "session_id" not in gateway_post["json"]
+
+    def test_schemas_uses_session_id_string(self, gateway_post):
+        from tools.app_tools import handle_app_tool_schemas
+        handle_app_tool_schemas({"tool_slugs": ["TEST"], "session_id": "sess-123"})
+        assert gateway_post["json"]["session_id"] == "sess-123"
+        assert "session" not in gateway_post["json"]
+
+    def test_execute_uses_session_id_string(self, gateway_post):
+        from tools.app_tools import handle_app_execute_tools
+        handle_app_execute_tools({"tools": [{"tool_slug": "TEST", "arguments": {}}], "session_id": "sess-456"})
+        assert gateway_post["json"]["session_id"] == "sess-456"
+        assert "session" not in gateway_post["json"]
+
+    def test_connections_uses_session_id_string(self, gateway_post):
+        from tools.app_tools import handle_app_manage_connections
+        handle_app_manage_connections({"toolkits": ["gmail"], "session_id": "sess-789"})
+        assert gateway_post["json"]["session_id"] == "sess-789"
+        assert "session" not in gateway_post["json"]
@@ -22,18 +22,28 @@ from tools.approval import (


@pytest.fixture
-def isolated_session(monkeypatch):
-    """Give each test a fresh session_key and clean approval-state."""
+def isolated_session(monkeypatch, tmp_path):
+    """Give each test a fresh session_key, clean approval-state, and isolated
+    HERMES_HOME so the real user's command_allowlist doesn't leak in."""
+    import tools.approval as _am
+
    session_key = "test:session:approval_hooks"
    token = set_current_session_key(session_key)
    monkeypatch.setenv("HERMES_SESSION_KEY", session_key)
    # Make sure we don't skip guards via yolo / approvals.mode=off
    monkeypatch.delenv("HERMES_YOLO_MODE", raising=False)
+    # Isolate from the real user's permanent allowlist + session state
+    _saved_permanent = _am._permanent_approved.copy()
+    _saved_session = {k: v.copy() for k, v in _am._session_approved.items()}
+    _am._permanent_approved.clear()
+    _am._session_approved.clear()
    try:
        yield session_key
    finally:
+        _am._permanent_approved.update(_saved_permanent)
+        _am._session_approved.update(_saved_session)
        try:
-            approval_module._approval_session_key.reset(token)
+            _am._approval_session_key.reset(token)
        except Exception:
            pass
        clear_session(session_key)
@@ -41,7 +41,7 @@ def _find_chrome() -> str:


@pytest.fixture
-def chrome_cdp(worker_id):
+def chrome_cdp(request):
    """Start a headless Chrome with --remote-debugging-port, yield its WS URL.

    Uses a unique port per xdist worker to avoid cross-worker collisions.
@@ -51,6 +51,9 @@ def chrome_cdp(worker_id):
    import socket

    # xdist worker_id is "master" in single-process mode or "gw0".."gwN" otherwise.
+    # Under subprocess-per-file isolation there's no xdist, so we fall back
+    # to "master" via the session-scoped fixture below.
+    worker_id = request.getfixturevalue("worker_id") if "worker_id" in request.fixturenames else "master"
    if worker_id == "master":
        port_offset = 0
    else:
@@ -76,6 +76,27 @@ class TestSchema:
        modes = set(COMPUTER_USE_SCHEMA["parameters"]["properties"]["mode"]["enum"])
        assert modes == {"som", "vision", "ax"}

+    def test_schema_exposes_max_elements_cap_for_capture(self):
+        from tools.computer_use.schema import COMPUTER_USE_SCHEMA
+        props = COMPUTER_USE_SCHEMA["parameters"]["properties"]
+        assert "max_elements" in props
+        assert props["max_elements"]["type"] == "integer"
+        assert props["max_elements"].get("minimum", 1) >= 1
+
+    def test_schema_max_elements_documents_default_and_upper_bound(self):
+        """Schema description must agree with the runtime. The original PR
+        text said "Default 100" without a corresponding `default` field, and
+        had no upper bound — both Copilot findings.
+        """
+        from tools.computer_use.schema import COMPUTER_USE_SCHEMA
+        from tools.computer_use.tool import (
+            _DEFAULT_MAX_ELEMENTS,
+            _MAX_ALLOWED_MAX_ELEMENTS,
+        )
+        prop = COMPUTER_USE_SCHEMA["parameters"]["properties"]["max_elements"]
+        assert prop.get("default") == _DEFAULT_MAX_ELEMENTS
+        assert prop.get("maximum") == _MAX_ALLOWED_MAX_ELEMENTS
+

 class TestRegistration:
    def test_tool_registers_with_registry(self):
@@ -155,6 +176,104 @@ class TestDispatch:
        click_kw = next(c[1] for c in noop_backend.calls if c[0] == "click")
        assert click_kw["button"] == "right"

+    def test_type_action_routes_to_type_text_backend(self, noop_backend):
+        """type action must call backend.type_text, not type_text_chars (issue #24170, bug 3)."""
+        from tools.computer_use.tool import handle_computer_use
+        out = handle_computer_use({"action": "type", "text": "hello"})
+        parsed = json.loads(out)
+        assert "error" not in parsed
+        call_names = [c[0] for c in noop_backend.calls]
+        assert "type" in call_names
+        type_kw = next(c[1] for c in noop_backend.calls if c[0] == "type")
+        assert type_kw["text"] == "hello"
+
+    def test_drag_action_routes_to_backend_by_coordinate(self, noop_backend):
+        """drag action must dispatch to backend.drag with coordinates (issue #24170, bug 4)."""
+        from tools.computer_use.tool import handle_computer_use
+        out = handle_computer_use({
+            "action": "drag",
+            "from_coordinate": [100, 200],
+            "to_coordinate": [400, 500],
+        })
+        parsed = json.loads(out)
+        assert "error" not in parsed
+        call_names = [c[0] for c in noop_backend.calls]
+        assert "drag" in call_names
+        drag_kw = next(c[1] for c in noop_backend.calls if c[0] == "drag")
+        assert drag_kw["from_xy"] == (100, 200)
+        assert drag_kw["to_xy"] == (400, 500)
+
+    def test_drag_action_routes_to_backend_by_element(self, noop_backend):
+        """drag action must dispatch to backend.drag with element indices (issue #24170, bug 4)."""
+        from tools.computer_use.tool import handle_computer_use
+        out = handle_computer_use({
+            "action": "drag",
+            "from_element": 1,
+            "to_element": 5,
+        })
+        parsed = json.loads(out)
+        assert "error" not in parsed
+        call_names = [c[0] for c in noop_backend.calls]
+        assert "drag" in call_names
+        drag_kw = next(c[1] for c in noop_backend.calls if c[0] == "drag")
+        assert drag_kw["from_element"] == 1
+        assert drag_kw["to_element"] == 5
+
+    def test_drag_action_requires_coordinates_or_elements(self, noop_backend):
+        """drag without from/to must return an error."""
+        from tools.computer_use.tool import handle_computer_use
+        out = handle_computer_use({"action": "drag"})
+        parsed = json.loads(out)
+        assert "error" in parsed
+
+    def test_set_value_routes_to_backend(self, noop_backend):
+        """set_value must reach the backend — regression for missing _NoopBackend stub."""
+        from tools.computer_use.tool import handle_computer_use
+        out = handle_computer_use({"action": "set_value", "value": "Option A", "element": 5})
+        parsed = json.loads(out)
+        assert parsed.get("ok") is True
+        assert parsed.get("action") == "set_value"
+        assert any(c[0] == "set_value" for c in noop_backend.calls)
+
+    def test_set_value_missing_value_returns_error(self, noop_backend):
+        from tools.computer_use.tool import handle_computer_use
+        out = handle_computer_use({"action": "set_value"})
+        parsed = json.loads(out)
+        assert "error" in parsed
+    def test_capture_after_skipped_when_action_failed(self, noop_backend):
+        """capture_after must not fire when res.ok=False (regression guard).
+
+        A follow-up screenshot after a failed action shows the screen in a
+        normal state, misleading the model into thinking the action succeeded.
+        """
+        from unittest.mock import patch
+        from tools.computer_use.backend import ActionResult
+        from tools.computer_use.tool import handle_computer_use
+
+        # Make click() return a failure.
+        with patch.object(noop_backend, "click",
+                          return_value=ActionResult(ok=False, action="click",
+                                                    message="element not found")):
+            out = handle_computer_use({"action": "click", "element": 99,
+                                       "capture_after": True})
+
+        parsed = json.loads(out)
+        # Should return the error, not a multimodal capture.
+        assert parsed.get("ok") is False
+        assert parsed.get("action") == "click"
+        # No follow-up capture should have been issued.
+        capture_calls = [c for c in noop_backend.calls if c[0] == "capture"]
+        assert len(capture_calls) == 0, "capture must not be called after a failed action"
+
+    def test_capture_after_fires_when_action_succeeds(self, noop_backend):
+        """capture_after must trigger for successful actions."""
+        from tools.computer_use.tool import handle_computer_use
+        out = handle_computer_use({"action": "click", "element": 1,
+                                   "capture_after": True})
+        # Noop backend returns ok=True, so capture should have been called.
+        capture_calls = [c for c in noop_backend.calls if c[0] == "capture"]
+        assert len(capture_calls) == 1
+

 # ---------------------------------------------------------------------------
 # Safety guards (type / key block lists)
@@ -287,6 +406,193 @@ class TestCaptureResponse:
        assert "AXButton" in text_part["text"]
        assert "AXTextField" in text_part["text"]

+    def _ax_backend_with(self, count: int):
+        """Return a stub backend whose capture() reports ``count`` AX buttons.
+
+        Elements are indexed 1..count and labelled ``el-0``..``el-{count-1}``.
+        No screenshot is attached (``png_b64`` is the empty string) and every
+        action method is a no-op returning ``None``.
+        """
+        from tools.computer_use.backend import CaptureResult, UIElement
+
+        elements = []
+        for position in range(count):
+            elements.append(
+                UIElement(index=position + 1, role="AXButton",
+                          label=f"el-{position}", bounds=(0, 0, 1, 1))
+            )
+
+        class FakeBackend:
+            def start(self):
+                return None
+
+            def stop(self):
+                return None
+
+            def is_available(self):
+                return True
+
+            def capture(self, mode="som", app=None):
+                # Hand out a fresh copy so callers cannot mutate the shared list.
+                return CaptureResult(
+                    mode=mode,
+                    width=800,
+                    height=600,
+                    png_b64="",
+                    elements=elements[:],
+                    app="Obsidian",
+                )
+
+            def click(self, **kw):
+                return None
+
+            def drag(self, **kw):
+                return None
+
+            def scroll(self, **kw):
+                return None
+
+            def type_text(self, text):
+                return None
+
+            def key(self, keys):
+                return None
+
+            def list_apps(self):
+                return []
+
+            def focus_app(self, app, raise_window=False):
+                return None
+
+        return FakeBackend()
+
+    def test_capture_ax_caps_elements_at_default_for_dense_trees(self):
+        """Regression for #22865: a 600-element AX tree (Electron-style) is
+        capped at the default limit instead of being dumped in full.
+        """
+        from tools.computer_use import tool as cu_tool
+
+        total = 600
+        backend = self._ax_backend_with(total)
+        cu_tool.reset_backend_for_tests()
+        with patch.object(cu_tool, "_get_backend", return_value=backend):
+            raw = cu_tool.handle_computer_use({"action": "capture", "mode": "ax"})
+
+        payload = json.loads(raw)
+        limit = cu_tool._DEFAULT_MAX_ELEMENTS
+        assert payload["mode"] == "ax"
+        assert payload["total_elements"] == total
+        assert len(payload["elements"]) == limit
+        assert payload["truncated_elements"] == total - limit
+        # The human-readable summary has to mention the truncation too, so the
+        # model knows the JSON view is partial and can retry with a tighter scope.
+        assert "truncated to" in payload["summary"]
+
+    def test_capture_ax_honors_explicit_max_elements_override(self):
+        """An explicit ``max_elements`` of 250 yields 250 of 600 elements."""
+        from tools.computer_use import tool as cu_tool
+
+        backend = self._ax_backend_with(600)
+        request = {"action": "capture", "mode": "ax", "max_elements": 250}
+        cu_tool.reset_backend_for_tests()
+        with patch.object(cu_tool, "_get_backend", return_value=backend):
+            raw = cu_tool.handle_computer_use(request)
+
+        payload = json.loads(raw)
+        assert len(payload["elements"]) == 250
+        assert payload["truncated_elements"] == 350
+
+    def test_capture_ax_below_cap_is_unchanged(self):
+        """Backwards-compat: a capture under the cap returns every element and
+        carries no `truncated_elements` field or truncation note.
+        """
+        from tools.computer_use import tool as cu_tool
+
+        backend = self._ax_backend_with(5)
+        request = {"action": "capture", "mode": "ax"}
+        cu_tool.reset_backend_for_tests()
+        with patch.object(cu_tool, "_get_backend", return_value=backend):
+            raw = cu_tool.handle_computer_use(request)
+
+        payload = json.loads(raw)
+        assert len(payload["elements"]) == 5
+        assert payload["total_elements"] == 5
+        assert "truncated_elements" not in payload
+        assert "truncated to" not in payload["summary"]
+
+    def test_capture_ax_invalid_max_elements_falls_back_to_default(self):
+        """A malformed `max_elements` (string, zero, negative) falls back to
+        the default cap rather than switching the cap off.
+        """
+        from tools.computer_use import tool as cu_tool
+
+        backend = self._ax_backend_with(600)
+        cu_tool.reset_backend_for_tests()
+        for bad in ("not-a-number", 0, -10):
+            request = {"action": "capture", "mode": "ax", "max_elements": bad}
+            with patch.object(cu_tool, "_get_backend", return_value=backend):
+                raw = cu_tool.handle_computer_use(request)
+            returned = json.loads(raw)["elements"]
+            assert len(returned) == cu_tool._DEFAULT_MAX_ELEMENTS, (
+                f"bad max_elements={bad!r} disabled the cap"
+            )
+
+    def test_capture_ax_clamps_oversized_max_elements_to_hard_cap(self):
+        """A huge `max_elements` is clamped to the hard upper bound, so a
+        caller cannot bypass the context-blow-up protection by argument.
+        """
+        from tools.computer_use import tool as cu_tool
+
+        total = 5000
+        backend = self._ax_backend_with(total)
+        request = {"action": "capture", "mode": "ax", "max_elements": 10_000}
+        cu_tool.reset_backend_for_tests()
+        with patch.object(cu_tool, "_get_backend", return_value=backend):
+            raw = cu_tool.handle_computer_use(request)
+
+        payload = json.loads(raw)
+        hard_cap = cu_tool._MAX_ALLOWED_MAX_ELEMENTS
+        assert len(payload["elements"]) == hard_cap
+        assert payload["total_elements"] == total
+        assert payload["truncated_elements"] == total - hard_cap
+
+    def test_capture_ax_summary_indices_match_returned_elements(self):
+        """Every `#N` line in the summary must refer to an element that is
+        actually present in `elements`, even when `max_elements` is smaller
+        than the summary's own line cap. Otherwise the model would see `#15`
+        in the summary and find no matching entry in the payload.
+        """
+        from tools.computer_use import tool as cu_tool
+
+        backend = self._ax_backend_with(600)
+        request = {"action": "capture", "mode": "ax", "max_elements": 5}
+        cu_tool.reset_backend_for_tests()
+        with patch.object(cu_tool, "_get_backend", return_value=backend):
+            raw = cu_tool.handle_computer_use(request)
+
+        payload = json.loads(raw)
+        returned = {entry["index"] for entry in payload["elements"]}
+        for line in payload["summary"].splitlines():
+            stripped = line.lstrip()
+            if not stripped.startswith("#"):
+                continue  # only lines that lead with "#N" carry an index
+            first_token = stripped.split()[0]
+            idx = int(first_token.lstrip("#"))
+            assert idx in returned, (
+                f"summary references #{idx} but it is absent from elements payload "
+                f"(returned: {sorted(returned)})"
+            )
+
+    def test_capture_multimodal_summary_omits_truncation_note(self):
+        """A som/vision capture returns a screenshot envelope with no
+        `elements` array, so its summary must not claim the response was
+        "truncated to N of M elements".
+        """
+        from tools.computer_use.backend import CaptureResult, UIElement
+        from tools.computer_use import tool as cu_tool
+
+        fake_png = "iVBORw0KGgo="
+        elements = []
+        for position in range(600):
+            elements.append(
+                UIElement(index=position + 1, role="AXButton",
+                          label=f"el-{position}", bounds=(0, 0, 1, 1))
+            )
+
+        class FakeBackend:
+            def start(self):
+                return None
+
+            def stop(self):
+                return None
+
+            def is_available(self):
+                return True
+
+            def capture(self, mode="som", app=None):
+                return CaptureResult(
+                    mode=mode,
+                    width=800,
+                    height=600,
+                    png_b64=fake_png,
+                    elements=elements[:],
+                    app="Obsidian",
+                )
+
+            def click(self, **kw):
+                return None
+
+            def drag(self, **kw):
+                return None
+
+            def scroll(self, **kw):
+                return None
+
+            def type_text(self, text):
+                return None
+
+            def key(self, keys):
+                return None
+
+            def list_apps(self):
+                return []
+
+            def focus_app(self, app, raise_window=False):
+                return None
+
+        backend = FakeBackend()
+        cu_tool.reset_backend_for_tests()
+        with patch.object(cu_tool, "_get_backend", return_value=backend):
+            envelope = cu_tool.handle_computer_use({"action": "capture", "mode": "som"})
+
+        assert isinstance(envelope, dict) and envelope["_multimodal"] is True
+        text_part = next(
+            part for part in envelope["content"] if part.get("type") == "text"
+        )
+        assert "truncated to" not in text_part["text"], (
+            "multimodal response carries an image, not an elements array; "
+            "the truncation note describes a payload field that isn't present"
+        )
+        assert "truncated to" not in envelope["text_summary"]
+

 # ---------------------------------------------------------------------------
 # Anthropic adapter: multimodal tool-result conversion
@@ -679,3 +985,332 @@ class TestUniversality:
        source = inspect.getsource(entry.check_fn)
        assert "anthropic" not in source.lower()
        assert "openai" not in source.lower()
+
+
+# ---------------------------------------------------------------------------
+# Regression tests for bugs 2 & 5 from issue #24170 (cua-driver v0.1.6)
+# ---------------------------------------------------------------------------
+
+class TestElementLabelParsing:
+    """Bug 5: labels were dropped from capture results (cua-driver v0.1.6 format).
+
+    cua-driver ≥0.1.6 writes ``[N] AXRole (order) id=Label`` where older
+    versions wrote ``  - [N] AXRole "label"``.  _parse_elements_from_tree
+    has to accept both line shapes.
+    """
+
+    def test_classic_quoted_label_format(self):
+        """Older ``  - [N] AXRole "label"`` lines are parsed, empty label included."""
+        from tools.computer_use.cua_backend import _parse_elements_from_tree
+        tree = (
+            '  - [14] AXButton "One"\n'
+            '  - [15] AXButton "Two"\n'
+            '  - [16] AXTextField ""\n'
+        )
+        parsed = _parse_elements_from_tree(tree)
+        assert len(parsed) == 3
+        first, second, third = parsed
+        assert first.index == 14
+        assert first.role == "AXButton"
+        assert first.label == "One"
+        assert second.label == "Two"
+        assert third.label == ""  # empty quoted label
+
+    def test_new_id_eq_format(self):
+        """cua-driver v0.1.6 format: [N] AXRole (order) id=Label"""
+        from tools.computer_use.cua_backend import _parse_elements_from_tree
+        tree = (
+            "[14] AXButton (1) id=One\n"
+            "[15] AXButton (2) id=Two\n"
+            "[16] AXTextField (3) id=\n"
+        )
+        parsed = _parse_elements_from_tree(tree)
+        assert len(parsed) == 3
+        first, second, third = parsed
+        assert first.index == 14
+        assert first.role == "AXButton"
+        assert first.label == "One"
+        assert second.label == "Two"
+        assert third.label == ""  # empty id= value
+
+    def test_mixed_formats_in_single_tree(self):
+        """A tree that interleaves old and new line formats parses cleanly."""
+        from tools.computer_use.cua_backend import _parse_elements_from_tree
+        tree = (
+            '  - [1] AXWindow "Main Window"\n'
+            "[14] AXButton (1) id=One\n"
+            '  - [15] AXTextField "Search"\n'
+        )
+        parsed = _parse_elements_from_tree(tree)
+        assert len(parsed) == 3
+        label_by_index = {}
+        for element in parsed:
+            label_by_index[element.index] = element.label
+        assert label_by_index[1] == "Main Window"
+        assert label_by_index[14] == "One"
+        assert label_by_index[15] == "Search"
+
+
+class TestCaptureAfterAppContext:
+    """Bug 2: capture_after=True loses app context after actions.
+
+    _maybe_follow_capture must re-target the same app that was set by
+    the preceding capture/focus_app call, rather than the frontmost window.
+
+    Both tests install a stub straight into the module-level
+    ``cu_tool._backend`` and reset it again in a ``finally`` block so the
+    stub cannot leak into tests that run afterwards.
+    """
+
+    def test_capture_after_uses_last_app(self):
+        """capture_after=True should pass _last_app to the follow-up capture."""
+        from tools.computer_use.backend import ActionResult, CaptureResult
+        from tools.computer_use import tool as cu_tool
+
+        # Records the ``app`` argument of every capture() call, in order.
+        captured_app_args = []
+
+        class TrackingBackend:
+            _last_app = "Calculator"  # simulates a previous focus_app call
+
+            def start(self):
+                pass
+
+            def stop(self):
+                pass
+
+            def is_available(self):
+                return True
+
+            def capture(self, mode="som", app=None):
+                captured_app_args.append(app)
+                return CaptureResult(
+                    mode=mode, width=100, height=100,
+                    png_b64=None, elements=[],
+                    app=app or "Calculator", window_title="",
+                )
+
+            def click(self, **kw):
+                return ActionResult(ok=True, action="click")
+
+            def drag(self, **kw):
+                return ActionResult(ok=True, action="drag")
+
+            def scroll(self, **kw):
+                return ActionResult(ok=True, action="scroll")
+
+            def type_text(self, text):
+                return ActionResult(ok=True, action="type")
+
+            def key(self, keys):
+                return ActionResult(ok=True, action="key")
+
+            def list_apps(self):
+                return []
+
+            def focus_app(self, app, raise_window=False):
+                return ActionResult(ok=True, action="focus_app")
+
+            def set_value(self, value, element=None):
+                return ActionResult(ok=True, action="set_value")
+
+            def wait(self, seconds=1.0):
+                return ActionResult(ok=True, action="wait")
+
+        backend = TrackingBackend()
+        cu_tool.reset_backend_for_tests()
+        cu_tool._backend = backend
+        try:
+            cu_tool.handle_computer_use(
+                {"action": "click", "element": 14, "capture_after": True}
+            )
+        finally:
+            # NOTE(review): assumes reset_backend_for_tests() drops the
+            # module-level ``_backend``, as its use before installation
+            # suggests — confirm.
+            cu_tool.reset_backend_for_tests()
+
+        # The follow-up capture must have been called with app="Calculator"
+        assert len(captured_app_args) == 1
+        assert captured_app_args[0] == "Calculator", (
+            f"Expected follow-up capture with app='Calculator', got {captured_app_args[0]!r}"
+        )
+
+    def test_capture_after_without_prior_app_uses_none(self):
+        """When no app context is set, follow-up capture uses app=None (frontmost)."""
+        from tools.computer_use.backend import ActionResult, CaptureResult
+        from tools.computer_use import tool as cu_tool
+
+        # Records the ``app`` argument of every capture() call, in order.
+        captured_app_args = []
+
+        class NoContextBackend:
+            _last_app = None  # no prior context
+
+            def start(self):
+                pass
+
+            def stop(self):
+                pass
+
+            def is_available(self):
+                return True
+
+            def capture(self, mode="som", app=None):
+                captured_app_args.append(app)
+                return CaptureResult(
+                    mode=mode, width=100, height=100,
+                    png_b64=None, elements=[],
+                    app="Finder", window_title="",
+                )
+
+            def click(self, **kw):
+                return ActionResult(ok=True, action="click")
+
+            def drag(self, **kw):
+                return ActionResult(ok=True, action="drag")
+
+            def scroll(self, **kw):
+                return ActionResult(ok=True, action="scroll")
+
+            def type_text(self, text):
+                return ActionResult(ok=True, action="type")
+
+            def key(self, keys):
+                return ActionResult(ok=True, action="key")
+
+            def list_apps(self):
+                return []
+
+            def focus_app(self, app, raise_window=False):
+                return ActionResult(ok=True, action="focus_app")
+
+            def set_value(self, value, element=None):
+                return ActionResult(ok=True, action="set_value")
+
+            def wait(self, seconds=1.0):
+                return ActionResult(ok=True, action="wait")
+
+        backend = NoContextBackend()
+        cu_tool.reset_backend_for_tests()
+        cu_tool._backend = backend
+        try:
+            cu_tool.handle_computer_use(
+                {"action": "click", "element": 5, "capture_after": True}
+            )
+        finally:
+            # NOTE(review): assumes reset_backend_for_tests() drops the
+            # module-level ``_backend`` — confirm.
+            cu_tool.reset_backend_for_tests()
+
+        # No app context — should pass None so cua-driver picks the frontmost window
+        assert len(captured_app_args) == 1
+        assert captured_app_args[0] is None
+
+# ---------------------------------------------------------------------------
+# Regression tests for bug 1 from issue #24170:
+#   capture(app=...) and focus_app(app=...) must surface when the filter
+#   matches nothing instead of silently picking the frontmost window.
+# ---------------------------------------------------------------------------
+
+def _make_cua_backend_with_windows(windows: List[Dict[str, Any]]):
+    """Build a CuaDriverBackend whose MCP session is a MagicMock.
+
+    Unless a test overrides it, every ``call_tool`` invocation on the mocked
+    session returns a non-error payload carrying ``windows`` under
+    ``structuredContent``.
+    """
+    from tools.computer_use.cua_backend import CuaDriverBackend
+
+    list_windows_reply = {
+        "data": "",
+        "images": [],
+        "structuredContent": {"windows": windows},
+        "isError": False,
+    }
+    session = MagicMock()
+    session.call_tool.return_value = list_windows_reply
+
+    backend = CuaDriverBackend()
+    backend._session = session
+    return backend
+
+
+class TestCaptureAppFilterNoMatch:
+    """capture(app=X) must not silently fall back to the frontmost window
+    when X matches nothing — on a non-English macOS, list_windows returns
+    localized app names (e.g. "計算機"), so an English `app="Calculator"`
+    legitimately matches nothing and the caller needs to retry with the
+    localized name. The old code silently captured the frontmost window
+    (e.g. a menu-bar utility), giving the agent wrong UI elements.
+    """
+
+    def test_app_filter_no_match_returns_empty_capture_with_diagnostic(self):
+        """A filter that matches no window yields an empty capture whose
+        ``window_title`` carries the diagnostic, and no window is targeted.
+        """
+        # Simulates a localized macOS where Calculator's app_name is "計算機".
+        windows = [
+            {"app_name": "Fuwari", "pid": 100, "window_id": 1,
+             "is_on_screen": True, "title": "menu bar", "z_index": 0},
+            {"app_name": "計算機", "pid": 200, "window_id": 2,
+             "is_on_screen": True, "title": "Calculator", "z_index": 1},
+        ]
+        backend = _make_cua_backend_with_windows(windows)
+
+        cap = backend.capture(mode="som", app="Calculator")
+
+        # No window matched; capture must NOT pick the frontmost (Fuwari).
+        assert cap.app == "", (
+            f"app= filter no-match should not silently target a window; got {cap.app!r}"
+        )
+        assert cap.elements == []
+        assert "Calculator" in cap.window_title
+        assert "list_apps" in cap.window_title
+        # _active_pid must remain unset so a subsequent click doesn't hit Fuwari.
+        assert backend._active_pid is None
+        assert backend._active_window_id is None
+
+    def test_app_filter_match_still_works(self):
+        """A filter that matches the localized name targets that window."""
+        windows = [
+            {"app_name": "Fuwari", "pid": 100, "window_id": 1,
+             "is_on_screen": True, "title": "menu bar", "z_index": 0},
+            {"app_name": "計算機", "pid": 200, "window_id": 2,
+             "is_on_screen": True, "title": "Calculator", "z_index": 1},
+        ]
+        backend = _make_cua_backend_with_windows(windows)
+        # Two queued replies: the window listing first, then the
+        # get_window_state for the matched window.
+        backend._session.call_tool.side_effect = [
+            {"data": "", "images": [], "isError": False,
+             "structuredContent": {"windows": windows}},
+            {"data": '✅ 計算機 — 0 elements\n', "images": [], "isError": False,
+             "structuredContent": None},
+        ]
+
+        # Only the targeting side effects are checked, so the returned
+        # capture is not bound.
+        backend.capture(mode="ax", app="計算機")
+
+        assert backend._active_pid == 200
+        assert backend._active_window_id == 2
+
+    def test_no_app_filter_still_picks_frontmost(self):
+        """When no app= is given, capture continues to pick the frontmost
+        window — the no-match early-return must not fire on the empty case."""
+        windows = [
+            {"app_name": "Fuwari", "pid": 100, "window_id": 1,
+             "is_on_screen": True, "title": "menu bar", "z_index": 0},
+        ]
+        backend = _make_cua_backend_with_windows(windows)
+        backend._session.call_tool.side_effect = [
+            {"data": "", "images": [], "isError": False,
+             "structuredContent": {"windows": windows}},
+            {"data": '✅ Fuwari — 0 elements\n', "images": [], "isError": False,
+             "structuredContent": None},
+        ]
+
+        # Only the targeting side effect is checked, so the returned capture
+        # is not bound.
+        backend.capture(mode="ax", app=None)
+
+        assert backend._active_pid == 100
+
+
+class TestFocusAppFilterNoMatch:
+    """focus_app(app=X) has to report ok=False when X matches no window,
+    instead of quietly targeting the frontmost window and returning ok=True
+    with a misleading 'Targeted Fuwari' message.
+    """
+
+    def test_focus_app_no_match_returns_not_ok(self):
+        """An unmatched app name fails and leaves no window targeted."""
+        fuwari = {"app_name": "Fuwari", "pid": 100, "window_id": 1,
+                  "is_on_screen": True, "title": "menu bar", "z_index": 0}
+        calculator = {"app_name": "計算機", "pid": 200, "window_id": 2,
+                      "is_on_screen": True, "title": "Calculator", "z_index": 1}
+        backend = _make_cua_backend_with_windows([fuwari, calculator])
+
+        outcome = backend.focus_app("Calculator")
+
+        assert outcome.ok is False
+        assert outcome.action == "focus_app"
+        assert "Calculator" in outcome.message
+        # _active_pid must stay unset so a later click cannot land on Fuwari.
+        assert backend._active_pid is None
+
+    def test_focus_app_match_still_works(self):
+        """The localized app name matches and its window becomes the target."""
+        fuwari = {"app_name": "Fuwari", "pid": 100, "window_id": 1,
+                  "is_on_screen": True, "title": "menu bar", "z_index": 0}
+        calculator = {"app_name": "計算機", "pid": 200, "window_id": 2,
+                      "is_on_screen": True, "title": "Calculator", "z_index": 1}
+        backend = _make_cua_backend_with_windows([fuwari, calculator])
+
+        outcome = backend.focus_app("計算機")
+
+        assert outcome.ok is True
+        assert backend._active_pid == 200
+        assert backend._active_window_id == 2
@@ -0,0 +1,431 @@
+"""End-to-end regression for #24015 — capture routing via auxiliary.vision.
+
+When ``computer_use(action='capture', mode='som'|'vision')`` returns a
+screenshot, ``_capture_response`` previously always returned a
+``_multimodal`` envelope. For non-vision main models, or when the user
+explicitly configured ``auxiliary.vision`` in ``config.yaml``, that
+envelope tripped HTTP 404 / 400 at the provider boundary even though a
+perfectly good vision backend was sitting in config waiting to be used.
+
+This file exercises the integrated ``_capture_response`` flow with
+deterministic stubs for:
+
+* ``should_route_capture_to_aux_vision`` (the policy decision)
+* ``_run_async`` (sync->async bridge)
+* ``vision_analyze_tool`` (the aux LLM call)
+* ``hermes_constants.get_hermes_dir`` (cache path)
+
+…so the full code path is covered without a live cua-driver, a real
+auxiliary client, or network access.
+"""
+
+from __future__ import annotations
+
+import base64
+import json
+import os
+from pathlib import Path
+from typing import Any
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+
+# ---------------------------------------------------------------------------
+# Fixtures / helpers
+# ---------------------------------------------------------------------------
+
+# 1×1 PNG (transparent) — minimal bytes that decode cleanly.
+_PNG_B64 = (
+    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42m"
+    "NkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="
+)
+
+# Leading 90 bytes of a JPEG stream (SOI + JFIF APP0 + one quantization
+# table), truncated before any image data — not a decodable 1×1 image.
+# Used to verify mime detection works for either stream type.
+_JPEG_B64 = (
+    "/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEB"
+    "AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/"
+)
+
+
+@pytest.fixture
+def tmp_cache_dir(tmp_path):
+    """Patch ``hermes_constants.get_hermes_dir`` to return a directory under tmp_path.
+
+    Yields the ``cache_vision`` directory that the patched function hands
+    back, whatever arguments it is called with.
+    """
+    cache_dir = tmp_path / "cache_vision"
+    cache_dir.mkdir()
+
+    replacement = lambda *_args, **_kw: cache_dir  # noqa: E731
+    with patch("hermes_constants.get_hermes_dir", replacement):
+        yield cache_dir
+
+
+def _make_capture(
+    *,
+    png_b64: str = _PNG_B64,
+    mode: str = "som",
+    elements=None,
+    app: str = "Safari",
+    window_title: str = "GitHub – Issue #24015",
+    width: int = 1280,
+    height: int = 800,
+):
+    """Build a ``CaptureResult`` for the routing tests.
+
+    Args:
+        png_b64: Base64 image payload; ``png_bytes_len`` is set to its
+            decoded length.
+        mode: Capture mode stored on the result.
+        elements: Iterable of ``UIElement``. ``None`` selects a default pair
+            (a "Sign in" button and a "username" text field). Any other
+            value — including an empty list — is used as given, copied into
+            a new list, so a zero-element capture can be built.
+        app, window_title, width, height: Stored on the result unchanged.
+    """
+    from tools.computer_use.backend import CaptureResult, UIElement
+
+    # ``is None`` rather than truthiness: an explicit ``[]`` must stay empty
+    # instead of silently turning into the default elements.
+    if elements is None:
+        elements = [
+            UIElement(index=0, role="AXButton", label="Sign in",
+                      bounds=(10, 20, 80, 30)),
+            UIElement(index=1, role="AXTextField", label="username",
+                      bounds=(10, 60, 200, 24)),
+        ]
+    else:
+        elements = list(elements)
+    raw = base64.b64decode(png_b64, validate=False)
+    return CaptureResult(
+        mode=mode,
+        width=width,
+        height=height,
+        png_b64=png_b64,
+        elements=elements,
+        app=app,
+        window_title=window_title,
+        png_bytes_len=len(raw),
+    )
+
+
+def _stub_aux_analysis(text: str):
+    """Serialize a successful fake vision_analyze_tool result carrying ``text``."""
+    envelope = {"success": True, "analysis": text}
+    return json.dumps(envelope)
+
+
+# ---------------------------------------------------------------------------
+# _capture_response: routing OFF (current/native behaviour)
+# ---------------------------------------------------------------------------
+
+class TestCaptureResponseDefaultPath:
+    """With the routing helper answering 'native', the multimodal envelope is kept."""
+
+    def test_som_capture_returns_multimodal_envelope_when_native(self):
+        from tools.computer_use import tool as cu_tool
+
+        capture = _make_capture(png_b64=_PNG_B64, mode="som")
+        with patch.object(cu_tool, "_should_route_through_aux_vision",
+                          return_value=False):
+            envelope = cu_tool._capture_response(capture)
+
+        assert isinstance(envelope, dict)
+        assert envelope.get("_multimodal") is True
+        # A PNG payload has to be advertised with the image/png MIME type.
+        image_part = next(
+            part for part in envelope["content"]
+            if part.get("type") == "image_url"
+        )
+        data_url = image_part["image_url"]["url"]
+        assert data_url.startswith("data:image/png;base64,")
+        assert "vision_analysis" not in envelope
+
+    def test_jpeg_capture_returns_image_jpeg_mime_when_native(self):
+        from tools.computer_use import tool as cu_tool
+
+        capture = _make_capture(png_b64=_JPEG_B64, mode="som")
+        with patch.object(cu_tool, "_should_route_through_aux_vision",
+                          return_value=False):
+            envelope = cu_tool._capture_response(capture)
+
+        image_part = next(
+            part for part in envelope["content"]
+            if part.get("type") == "image_url"
+        )
+        assert image_part["image_url"]["url"].startswith("data:image/jpeg;base64,")
+
+    def test_ax_only_capture_returns_text_regardless_of_routing(self):
+        from tools.computer_use import tool as cu_tool
+
+        capture = _make_capture(mode="ax", png_b64="")
+        # An ax capture has no PNG, so the routing answer is irrelevant and
+        # the response must be plain text.
+        with patch.object(cu_tool, "_should_route_through_aux_vision",
+                          return_value=True) as routing:
+            response = cu_tool._capture_response(capture)
+
+        # The routing helper is never consulted for ax: the text branch
+        # returns before the image branch is reached.
+        routing.assert_not_called()
+        assert isinstance(response, str)
+        payload = json.loads(response)
+        assert payload["mode"] == "ax"
+
+
+# ---------------------------------------------------------------------------
+# _capture_response: routing ON (the #24015 fix)
+# ---------------------------------------------------------------------------
+
+class TestCaptureResponseRoutedToAuxVision:
+    """When routing helper says 'aux', the PNG is pre-analysed and a text
+    response is returned with no image_url parts at all."""
+
+    def test_som_capture_returns_text_with_vision_analysis(
+        self, tmp_cache_dir,
+    ):
+        """With routing forced on, a som capture comes back as a JSON string
+        that carries the aux analysis alongside the original capture metadata.
+        """
+        from tools.computer_use import tool as cu_tool
+
+        cap = _make_capture(mode="som")
+
+        # Set by the fake bridge so the test can prove it was invoked.
+        captured_calls = {}
+
+        def _fake_run_async(coro):
+            # Stands in for the sync->async bridge: ignores the "coroutine"
+            # and returns a canned successful analysis envelope.
+            captured_calls["called"] = True
+            return _stub_aux_analysis(
+                "A Safari window showing a GitHub issue page with a 'Sign "
+                "in' button and a 'username' text field."
+            )
+
+        # vision_analyze_tool is async; force a sync MagicMock so we can
+        # assert positional args without dealing with awaitables.
+        fake_vat = MagicMock(return_value="<coro>")
+
+        with patch.object(cu_tool, "_should_route_through_aux_vision",
+                          return_value=True), \
+             patch("model_tools._run_async", side_effect=_fake_run_async), \
+             patch("tools.vision_tools.vision_analyze_tool",
+                   new_callable=lambda: fake_vat):
+            resp = cu_tool._capture_response(cap)
+
+        # Must be a JSON string, NOT a multimodal envelope. This is exactly
+        # the contract that prevents #24015's HTTP 404 from firing on the
+        # next agent turn.
+        assert isinstance(resp, str)
+        body = json.loads(resp)
+        assert body["mode"] == "som"
+        assert body["app"] == "Safari"
+        assert "Sign in" in body["vision_analysis"]
+        assert body["vision_analysis_routed_via"] == "auxiliary.vision"
+        # The original AX-only metadata (window title, element index, app)
+        # is preserved alongside the new vision analysis so the agent loses
+        # no context vs the multimodal path.
+        assert body["window_title"] == "GitHub – Issue #24015"
+        assert len(body["elements"]) == 2
+
+        assert captured_calls.get("called") is True
+        # vision_analyze_tool was invoked with a path under the patched cache
+        # and a non-empty prompt.
+        # Both arguments are read positionally from the recorded call.
+        args, _kwargs = fake_vat.call_args
+        path_arg, prompt_arg = args[0], args[1]
+        assert str(tmp_cache_dir) in path_arg
+        assert "macOS application screenshot" in prompt_arg
+        # AX summary is included so the aux model can ground its description
+        # against the same set-of-mark index the agent will see.
+        assert "Sign in" in prompt_arg
+
+    def test_temp_screenshot_file_is_cleaned_up_after_routing(
+        self, tmp_cache_dir,
+    ):
+        """The screenshot file handed to the aux call is removed once
+        ``_capture_response`` has returned.
+        """
+        from tools.computer_use import tool as cu_tool
+
+        capture = _make_capture(mode="som")
+        # Remembers the path the aux call received, for the post-call check.
+        seen = {}
+
+        def _record_path(image_path, _prompt):
+            seen["path"] = image_path
+            # While the aux call is being set up the file has to be on disk.
+            assert os.path.exists(image_path)
+            return "<coro>"
+
+        def _fake_run_async(_coro):
+            return _stub_aux_analysis("description goes here")
+
+        fake_vat = MagicMock(side_effect=_record_path)
+
+        with patch.object(cu_tool, "_should_route_through_aux_vision",
+                          return_value=True), \
+             patch("model_tools._run_async", side_effect=_fake_run_async), \
+             patch("tools.vision_tools.vision_analyze_tool",
+                   new_callable=lambda: fake_vat):
+            cu_tool._capture_response(capture)
+
+        # After the call returns the file must have been unlinked.
+        recorded = seen["path"]
+        assert recorded
+        assert not os.path.exists(recorded)
+
+    def test_temp_file_cleaned_up_even_when_aux_call_raises(
+        self, tmp_cache_dir,
+    ):
+        """A raising aux call falls back to the multimodal envelope and the
+        temp screenshot file is still removed.
+        """
+        from tools.computer_use import tool as cu_tool
+
+        capture = _make_capture(mode="som")
+        # Remembers the path the aux call received, for the post-call check.
+        seen = {}
+
+        def _record_path(image_path, _prompt):
+            seen["path"] = image_path
+            return "<coro>"
+
+        def _failing_run_async(_coro):
+            raise RuntimeError("aux LLM down")
+
+        fake_vat = MagicMock(side_effect=_record_path)
+
+        with patch.object(cu_tool, "_should_route_through_aux_vision",
+                          return_value=True), \
+             patch("model_tools._run_async", side_effect=_failing_run_async), \
+             patch("tools.vision_tools.vision_analyze_tool",
+                   new_callable=lambda: fake_vat):
+            envelope = cu_tool._capture_response(capture)
+
+        # An aux failure degrades to the multimodal envelope, so the user
+        # still gets *something* useful even if vision is broken.
+        assert isinstance(envelope, dict)
+        assert envelope.get("_multimodal") is True
+        # The temp file has to be gone regardless of the failure.
+        recorded = seen["path"]
+        assert recorded
+        assert not os.path.exists(recorded)
+
+    def test_empty_aux_analysis_falls_back_to_multimodal(self, tmp_cache_dir):
+        """An empty aux analysis must yield the multimodal envelope instead."""
+        from tools.computer_use import tool as cu_tool
+
+        cap = _make_capture(mode="som")
+
+        def _fake_run_async(_coro):
+            # Stub built from an empty analysis string.
+            return _stub_aux_analysis("")
+
+        fake_vat = MagicMock(return_value="<coro>")
+
+        with patch.object(cu_tool, "_should_route_through_aux_vision",
+                          return_value=True), \
+             patch("model_tools._run_async", side_effect=_fake_run_async), \
+             patch("tools.vision_tools.vision_analyze_tool",
+                   new_callable=lambda: fake_vat):
+            resp = cu_tool._capture_response(cap)
+
+        # Empty analysis is treated as failure — we'd rather show pixels
+        # than embed an empty 'vision_analysis' string into the result.
+        assert isinstance(resp, dict)
+        assert resp.get("_multimodal") is True
+
+    def test_invalid_aux_response_falls_back_to_multimodal(self, tmp_cache_dir):
+        """A non-string aux result must yield the multimodal envelope."""
+        from tools.computer_use import tool as cu_tool
+
+        cap = _make_capture(mode="som")
+
+        def _fake_run_async(_coro):
+            return 1234  # not a string at all
+
+        fake_vat = MagicMock(return_value="<coro>")
+
+        with patch.object(cu_tool, "_should_route_through_aux_vision",
+                          return_value=True), \
+             patch("model_tools._run_async", side_effect=_fake_run_async), \
+             patch("tools.vision_tools.vision_analyze_tool",
+                   new_callable=lambda: fake_vat):
+            resp = cu_tool._capture_response(cap)
+
+        # Same fallback contract as the empty-analysis case: dict envelope
+        # flagged as multimodal.
+        assert isinstance(resp, dict)
+        assert resp.get("_multimodal") is True
+
+
+# ---------------------------------------------------------------------------
+# _should_route_through_aux_vision: end-to-end with real config plumbing
+# ---------------------------------------------------------------------------
+
+class TestRoutingDecisionWiring:
+    """Verify _should_route_through_aux_vision wires the right config + helper."""
+
+    def test_explicit_aux_vision_in_config_routes_to_aux(self):
+        """An explicit ``auxiliary.vision`` provider/model in config → True."""
+        from tools.computer_use import tool as cu_tool
+
+        cfg = {
+            "model": {"default": "tencent/hy3-preview", "provider": "openrouter"},
+            "auxiliary": {
+                "vision": {
+                    "provider": "openrouter",
+                    "model": "google/gemini-2.5-flash",
+                }
+            },
+        }
+        with patch("agent.auxiliary_client._read_main_provider",
+                   return_value="openrouter"), \
+             patch("agent.auxiliary_client._read_main_model",
+                   return_value="tencent/hy3-preview"), \
+             patch("hermes_cli.config.load_config", return_value=cfg):
+            assert cu_tool._should_route_through_aux_vision() is True
+
+    def test_no_explicit_aux_and_vision_capable_main_keeps_multimodal(self):
+        """No aux override and both capability lookups patched True → False."""
+        from tools.computer_use import tool as cu_tool
+
+        cfg = {
+            "model": {"default": "claude-opus-4-5", "provider": "anthropic"},
+        }
+        with patch("agent.auxiliary_client._read_main_provider",
+                   return_value="anthropic"), \
+             patch("agent.auxiliary_client._read_main_model",
+                   return_value="claude-opus-4-5"), \
+             patch("hermes_cli.config.load_config", return_value=cfg), \
+             patch("tools.computer_use.vision_routing._lookup_supports_vision",
+                   return_value=True), \
+             patch("tools.computer_use.vision_routing."
+                   "_provider_accepts_multimodal_tool_result",
+                   return_value=True):
+            assert cu_tool._should_route_through_aux_vision() is False
+
+    def test_config_load_failure_disables_routing_safely(self):
+        """A ``load_config`` exception must not propagate; result is False."""
+        from tools.computer_use import tool as cu_tool
+
+        with patch("hermes_cli.config.load_config",
+                   side_effect=RuntimeError("config.yaml unreadable")):
+            # No exception should bubble up — fail open by returning False
+            # so the legacy multimodal envelope continues to work.
+            assert cu_tool._should_route_through_aux_vision() is False
+
+    def test_helper_decision_exception_is_swallowed(self):
+        """A ``ValueError`` from the policy helper is reported as False."""
+        from tools.computer_use import tool as cu_tool
+        from tools.computer_use import vision_routing as vr_mod
+
+        with patch("agent.auxiliary_client._read_main_provider",
+                   return_value="openrouter"), \
+             patch("agent.auxiliary_client._read_main_model",
+                   return_value="x"), \
+             patch("hermes_cli.config.load_config", return_value={}), \
+             patch.object(vr_mod, "should_route_capture_to_aux_vision",
+                          side_effect=ValueError("policy bug")):
+            assert cu_tool._should_route_through_aux_vision() is False
+
+
+# ---------------------------------------------------------------------------
+# Bug reproduction marker — proves the fix is needed.
+# ---------------------------------------------------------------------------
+
+class TestBugReproductionAnchor:
+    """Without the fix, this test would assert the wrong thing.
+
+    On upstream/main HEAD prior to this branch, _capture_response returns a
+    multimodal envelope unconditionally — so when a non-vision main model
+    is configured, the captured PNG is delivered to the main provider as
+    image_url content and the request is rejected with HTTP 404. We don't
+    have a live provider here, but we can pin the contract: with routing
+    enabled the response MUST be a JSON string with no image_url parts.
+    """
+
+    def test_non_vision_main_model_never_returns_image_url_when_routed(
+        self, tmp_cache_dir,
+    ):
+        """With routing forced on, the result is a string with no image data."""
+        from tools.computer_use import tool as cu_tool
+
+        cap = _make_capture(mode="som")
+
+        def _fake_run_async(_coro):
+            # Successful aux analysis with a non-empty description.
+            return _stub_aux_analysis(
+                "Screenshot showing a GitHub.com window with a sign-in "
+                "form."
+            )
+
+        fake_vat = MagicMock(return_value="<coro>")
+
+        with patch.object(cu_tool, "_should_route_through_aux_vision",
+                          return_value=True), \
+             patch("model_tools._run_async", side_effect=_fake_run_async), \
+             patch("tools.vision_tools.vision_analyze_tool",
+                   new_callable=lambda: fake_vat):
+            resp = cu_tool._capture_response(cap)
+
+        # Must be a string (text-only result).
+        assert isinstance(resp, str)
+        # Must NOT contain a base64 image URL anywhere — that's what tripped
+        # 'No endpoints found that support image input' on the reporter's
+        # main provider in #24015.
+        assert "data:image" not in resp
+        assert "image_url" not in resp
@@ -0,0 +1,260 @@
+"""Unit tests for tools.computer_use.vision_routing.
+
+Cover the small ``should_route_capture_to_aux_vision`` policy helper that
+decides whether a captured screenshot from ``computer_use(action='capture')``
+should be returned as a multimodal envelope (main model handles vision
+natively) or pre-analysed via the ``auxiliary.vision`` pipeline so the
+main model only sees text.
+
+The companion end-to-end regression for #24015 lives in
+``tests/tools/test_computer_use_capture_routing.py``; this file pins the
+unit contract of the helper in isolation so behaviour does not regress
+silently if the surrounding ``computer_use`` plumbing is refactored.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import patch
+
+import pytest
+
+
+# ---------------------------------------------------------------------------
+# _explicit_aux_vision_override
+# ---------------------------------------------------------------------------
+
+class TestExplicitAuxVisionOverride:
+    """Mirror agent.image_routing — config detection must agree across paths."""
+
+    def test_returns_false_for_none_cfg(self):
+        from tools.computer_use.vision_routing import _explicit_aux_vision_override
+        assert _explicit_aux_vision_override(None) is False
+
+    def test_returns_false_for_non_dict_cfg(self):
+        from tools.computer_use.vision_routing import _explicit_aux_vision_override
+        assert _explicit_aux_vision_override("not-a-dict") is False
+        assert _explicit_aux_vision_override([]) is False
+
+    def test_returns_false_when_auxiliary_block_missing(self):
+        from tools.computer_use.vision_routing import _explicit_aux_vision_override
+        assert _explicit_aux_vision_override({}) is False
+        assert _explicit_aux_vision_override({"model": {"default": "x"}}) is False
+
+    def test_returns_false_when_vision_block_missing(self):
+        from tools.computer_use.vision_routing import _explicit_aux_vision_override
+        cfg = {"auxiliary": {"compression": {"provider": "openai"}}}
+        assert _explicit_aux_vision_override(cfg) is False
+
+    def test_returns_false_for_blank_provider_no_model_no_base_url(self):
+        from tools.computer_use.vision_routing import _explicit_aux_vision_override
+        cfg = {"auxiliary": {"vision": {"provider": "", "model": "", "base_url": ""}}}
+        assert _explicit_aux_vision_override(cfg) is False
+
+    def test_returns_false_for_provider_auto(self):
+        from tools.computer_use.vision_routing import _explicit_aux_vision_override
+        cfg = {"auxiliary": {"vision": {"provider": "auto"}}}
+        assert _explicit_aux_vision_override(cfg) is False
+
+    def test_returns_false_for_provider_AUTO_uppercase(self):
+        from tools.computer_use.vision_routing import _explicit_aux_vision_override
+        cfg = {"auxiliary": {"vision": {"provider": "  AUTO  "}}}
+        assert _explicit_aux_vision_override(cfg) is False
+
+    def test_returns_true_for_explicit_provider(self):
+        from tools.computer_use.vision_routing import _explicit_aux_vision_override
+        cfg = {"auxiliary": {"vision": {"provider": "openrouter"}}}
+        assert _explicit_aux_vision_override(cfg) is True
+
+    def test_returns_true_for_explicit_model_only(self):
+        from tools.computer_use.vision_routing import _explicit_aux_vision_override
+        cfg = {"auxiliary": {"vision": {"model": "google/gemini-2.5-flash"}}}
+        assert _explicit_aux_vision_override(cfg) is True
+
+    def test_returns_true_for_explicit_base_url_only(self):
+        from tools.computer_use.vision_routing import _explicit_aux_vision_override
+        cfg = {"auxiliary": {"vision": {"base_url": "http://localhost:1234/v1"}}}
+        assert _explicit_aux_vision_override(cfg) is True
+
+    def test_returns_true_for_provider_auto_plus_explicit_model(self):
+        """``provider: auto`` + an explicit model still counts as override."""
+        from tools.computer_use.vision_routing import _explicit_aux_vision_override
+        cfg = {
+            "auxiliary": {
+                "vision": {"provider": "auto", "model": "claude-3-haiku"},
+            }
+        }
+        assert _explicit_aux_vision_override(cfg) is True
+
+    def test_handles_non_dict_vision_block(self):
+        from tools.computer_use.vision_routing import _explicit_aux_vision_override
+        cfg = {"auxiliary": {"vision": "not-a-dict"}}
+        assert _explicit_aux_vision_override(cfg) is False
+
+
+# ---------------------------------------------------------------------------
+# should_route_capture_to_aux_vision
+# ---------------------------------------------------------------------------
+
+class TestRouteDecision:
+    """End-to-end policy: explicit override > tool-result support > vision caps.
+
+    Both capability lookups are patched in every test, so only the decision
+    logic of ``should_route_capture_to_aux_vision`` is exercised.  The cases
+    below pin this table:
+
+    * explicit ``auxiliary.vision`` override → True, whatever the lookups say
+    * vision lookup returns False or None → True
+    * tool-result lookup returns False or None → True
+    * both lookups True and no override → False
+    """
+
+    def test_explicit_override_routes_to_aux_even_for_vision_main(self):
+        """Issue #24015 core repro: explicit aux config must win.
+
+        Even if the main model fully supports vision (Anthropic / Claude),
+        an explicit ``auxiliary.vision`` block means the user wants their
+        configured backend used. Don't silently bypass it.
+        """
+        from tools.computer_use import vision_routing
+
+        cfg = {
+            "auxiliary": {
+                "vision": {
+                    "provider": "openrouter",
+                    "model": "google/gemini-2.5-flash",
+                }
+            }
+        }
+        with patch.object(vision_routing, "_lookup_supports_vision", return_value=True), \
+             patch.object(vision_routing,
+                          "_provider_accepts_multimodal_tool_result",
+                          return_value=True):
+            assert vision_routing.should_route_capture_to_aux_vision(
+                "anthropic", "claude-opus-4-5", cfg
+            ) is True
+
+    def test_non_vision_main_model_routes_to_aux(self):
+        """The reported #24015 scenario: tencent/hy3-preview has no vision."""
+        from tools.computer_use import vision_routing
+
+        cfg = {"model": {"default": "tencent/hy3-preview", "provider": "openrouter"}}
+        with patch.object(vision_routing, "_lookup_supports_vision", return_value=False), \
+             patch.object(vision_routing,
+                          "_provider_accepts_multimodal_tool_result",
+                          return_value=True):
+            assert vision_routing.should_route_capture_to_aux_vision(
+                "openrouter", "tencent/hy3-preview", cfg
+            ) is True
+
+    def test_vision_main_model_no_override_keeps_multimodal(self):
+        """Default path: vision-capable main model + no aux override → native."""
+        from tools.computer_use import vision_routing
+
+        with patch.object(vision_routing, "_lookup_supports_vision", return_value=True), \
+             patch.object(vision_routing,
+                          "_provider_accepts_multimodal_tool_result",
+                          return_value=True):
+            assert vision_routing.should_route_capture_to_aux_vision(
+                "anthropic", "claude-opus-4-5", None
+            ) is False
+
+    def test_provider_rejects_multimodal_tool_results_routes_to_aux(self):
+        """Some providers' tool-result messages won't carry images at all."""
+        from tools.computer_use import vision_routing
+
+        with patch.object(vision_routing, "_lookup_supports_vision", return_value=True), \
+             patch.object(vision_routing,
+                          "_provider_accepts_multimodal_tool_result",
+                          return_value=False):
+            assert vision_routing.should_route_capture_to_aux_vision(
+                "some-aggregator", "some-vision-model", {}
+            ) is True
+
+    def test_unknown_provider_capabilities_fail_closed(self):
+        """When tool-result lookup returns None, route to aux (safe default)."""
+        from tools.computer_use import vision_routing
+
+        with patch.object(vision_routing, "_lookup_supports_vision", return_value=True), \
+             patch.object(vision_routing,
+                          "_provider_accepts_multimodal_tool_result",
+                          return_value=None):
+            assert vision_routing.should_route_capture_to_aux_vision(
+                "exotic-provider", "exotic-model", {}
+            ) is True
+
+    def test_unknown_vision_capability_fails_closed(self):
+        """When models.dev has no entry, prefer aux over a likely 404."""
+        from tools.computer_use import vision_routing
+
+        with patch.object(vision_routing, "_lookup_supports_vision", return_value=None), \
+             patch.object(vision_routing,
+                          "_provider_accepts_multimodal_tool_result",
+                          return_value=True):
+            assert vision_routing.should_route_capture_to_aux_vision(
+                "openrouter", "novel/never-seen-model", {}
+            ) is True
+
+    def test_explicit_override_wins_over_unknown_caps(self):
+        """Explicit aux config wins regardless of unknown caps elsewhere."""
+        from tools.computer_use import vision_routing
+
+        cfg = {"auxiliary": {"vision": {"provider": "openrouter"}}}
+        with patch.object(vision_routing, "_lookup_supports_vision", return_value=None), \
+             patch.object(vision_routing,
+                          "_provider_accepts_multimodal_tool_result",
+                          return_value=None):
+            assert vision_routing.should_route_capture_to_aux_vision(
+                "openrouter", "tencent/hy3-preview", cfg
+            ) is True
+
+
+# ---------------------------------------------------------------------------
+# Internal lookups — defensive paths
+# ---------------------------------------------------------------------------
+
+class TestLookupHelpers:
+    """Defensive paths of the capability lookups: each case expects ``None``."""
+
+    def test_lookup_supports_vision_returns_none_for_blank_provider(self):
+        """A blank provider yields ``None``."""
+        from tools.computer_use.vision_routing import _lookup_supports_vision
+        assert _lookup_supports_vision("", "claude") is None
+
+    def test_lookup_supports_vision_returns_none_for_blank_model(self):
+        """A blank model yields ``None``."""
+        from tools.computer_use.vision_routing import _lookup_supports_vision
+        assert _lookup_supports_vision("anthropic", "") is None
+
+    def test_lookup_supports_vision_handles_lookup_exception(self):
+        """Underlying caps lookup may raise; helper must swallow + return None."""
+        from tools.computer_use import vision_routing
+
+        def _boom(_provider, _model):
+            raise RuntimeError("models.dev unreachable")
+
+        with patch("agent.models_dev.get_model_capabilities", side_effect=_boom):
+            assert vision_routing._lookup_supports_vision("anthropic", "claude") is None
+
+    def test_lookup_supports_vision_returns_none_when_caps_missing(self):
+        """A ``None`` capabilities record yields ``None``."""
+        from tools.computer_use import vision_routing
+
+        with patch("agent.models_dev.get_model_capabilities", return_value=None):
+            assert vision_routing._lookup_supports_vision("anthropic", "claude") is None
+
+    def test_provider_accepts_multimodal_tool_result_returns_none_for_blank_provider(self):
+        """A blank provider yields ``None`` from the tool-result lookup too."""
+        from tools.computer_use.vision_routing import (
+            _provider_accepts_multimodal_tool_result,
+        )
+        assert _provider_accepts_multimodal_tool_result("", "claude") is None
+
+
+# ---------------------------------------------------------------------------
+# Module surface
+# ---------------------------------------------------------------------------
+
+class TestModuleSurface:
+    """Pin the public surface so dependents stay in lockstep."""
+
+    def test_should_route_capture_to_aux_vision_is_exported(self):
+        """The policy helper is listed in ``__all__`` and is callable."""
+        from tools.computer_use import vision_routing
+
+        assert "should_route_capture_to_aux_vision" in vision_routing.__all__
+        assert callable(vision_routing.should_route_capture_to_aux_vision)
+
+    # One test case per private helper name.
+    @pytest.mark.parametrize("name", [
+        "_explicit_aux_vision_override",
+        "_lookup_supports_vision",
+        "_provider_accepts_multimodal_tool_result",
+    ])
+    def test_internal_helpers_are_addressable(self, name):
+        """Internal helpers stay importable so tests can monkeypatch them."""
+        from tools.computer_use import vision_routing
+
+        assert hasattr(vision_routing, name)
+        assert callable(getattr(vision_routing, name))
@@ -1089,9 +1089,17 @@ class Test403Enrichment:
 class TestModelToolsIntegration:
    def setup_method(self):
        _reset_capability_cache()
+        from model_tools import _clear_tool_defs_cache
+        from tools.registry import invalidate_check_fn_cache
+        _clear_tool_defs_cache()
+        invalidate_check_fn_cache()

    def teardown_method(self):
        _reset_capability_cache()
+        from model_tools import _clear_tool_defs_cache
+        from tools.registry import invalidate_check_fn_cache
+        _clear_tool_defs_cache()
+        invalidate_check_fn_cache()

    @patch("tools.discord_tool._discord_request")
    def test_discord_admin_schema_rebuilt_by_get_tool_definitions(
@@ -501,16 +501,18 @@ class TestRegistration:

    def test_check_fn_gates_availability(self, monkeypatch):
        """Registry should exclude HA tools when HASS_TOKEN is not set."""
-        from tools.registry import registry
+        from tools.registry import invalidate_check_fn_cache, registry

        monkeypatch.delenv("HASS_TOKEN", raising=False)
+        invalidate_check_fn_cache()
        defs = registry.get_definitions({"ha_list_entities", "ha_get_state", "ha_call_service"})
        assert len(defs) == 0

    def test_check_fn_includes_when_token_set(self, monkeypatch):
        """Registry should include HA tools when HASS_TOKEN is set."""
-        from tools.registry import registry
+        from tools.registry import invalidate_check_fn_cache, registry

        monkeypatch.setenv("HASS_TOKEN", "test-token")
+        invalidate_check_fn_cache()
        defs = registry.get_definitions({"ha_list_entities", "ha_get_state", "ha_call_service"})
        assert len(defs) == 3
@@ -1093,6 +1093,11 @@ def test_kanban_guidance_not_in_normal_prompt(monkeypatch, tmp_path):
    from pathlib import Path as _P
    monkeypatch.setattr(_P, "home", lambda: tmp_path)

+    from tools.registry import invalidate_check_fn_cache
+    from model_tools import _clear_tool_defs_cache
+    invalidate_check_fn_cache()
+    _clear_tool_defs_cache()
+
    from run_agent import AIAgent
    a = AIAgent(
        api_key="test",
@@ -1116,6 +1121,11 @@ def test_kanban_guidance_in_worker_prompt(monkeypatch, tmp_path):
    from pathlib import Path as _P
    monkeypatch.setattr(_P, "home", lambda: tmp_path)

+    from tools.registry import invalidate_check_fn_cache
+    from model_tools import _clear_tool_defs_cache
+    invalidate_check_fn_cache()
+    _clear_tool_defs_cache()
+
    from run_agent import AIAgent
    a = AIAgent(
        api_key="test",
@@ -78,6 +78,63 @@ def test_resolve_managed_tool_gateway_is_disabled_without_subscription():
    assert result is None


+def test_rewrite_localhost_origin_rewrites_subdomain():
+    rewrite = managed_tool_gateway._rewrite_localhost_origin
+    resolved, host = rewrite("http://tools-gateway.localhost:3009")
+    assert resolved == "http://127.0.0.1:3009"
+    assert host == "tools-gateway.localhost:3009"
+
+
+def test_rewrite_localhost_origin_preserves_path():
+    rewrite = managed_tool_gateway._rewrite_localhost_origin
+    resolved, host = rewrite("http://tools-gateway.localhost:3009/v1/foo")
+    assert resolved == "http://127.0.0.1:3009/v1/foo"
+    assert host == "tools-gateway.localhost:3009"
+
+
+def test_rewrite_localhost_origin_no_port():
+    rewrite = managed_tool_gateway._rewrite_localhost_origin
+    resolved, host = rewrite("http://tools-gateway.localhost")
+    assert resolved == "http://127.0.0.1"
+    assert host == "tools-gateway.localhost"
+
+
+def test_rewrite_localhost_origin_ignores_bare_localhost():
+    rewrite = managed_tool_gateway._rewrite_localhost_origin
+    resolved, host = rewrite("http://localhost:3009")
+    assert resolved == "http://localhost:3009"
+    assert host is None
+
+
+def test_rewrite_localhost_origin_ignores_real_domains():
+    rewrite = managed_tool_gateway._rewrite_localhost_origin
+    resolved, host = rewrite("https://tools-gateway.nousresearch.com")
+    assert resolved == "https://tools-gateway.nousresearch.com"
+    assert host is None
+
+
+def test_gateway_config_resolved_origin_and_host_header():
+    cfg = managed_tool_gateway.ManagedToolGatewayConfig(
+        vendor="tools",
+        gateway_origin="http://tools-gateway.localhost:3009",
+        nous_user_token="tok",
+        managed_mode=True,
+    )
+    assert cfg.resolved_origin == "http://127.0.0.1:3009"
+    assert cfg.gateway_host_header == "tools-gateway.localhost:3009"
+
+
+def test_gateway_config_resolved_origin_passthrough_for_real_domain():
+    cfg = managed_tool_gateway.ManagedToolGatewayConfig(
+        vendor="firecrawl",
+        gateway_origin="https://firecrawl-gateway.nousresearch.com",
+        nous_user_token="tok",
+        managed_mode=True,
+    )
+    assert cfg.resolved_origin == "https://firecrawl-gateway.nousresearch.com"
+    assert cfg.gateway_host_header is None
+
+
 def test_read_nous_access_token_refreshes_expiring_cached_token(tmp_path, monkeypatch):
    monkeypatch.delenv("TOOL_GATEWAY_USER_TOKEN", raising=False)
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
--- a/Show More
+++ b/Show More