Compare commits
80 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 2c4f3ea196 | |||
| cb12ee4b2d | |||
| a57781f8a9 | |||
| 6749e335a3 | |||
| 53814b39c3 | |||
| efd71e8914 | |||
| bc2ba1356e | |||
| e0b3fa6eb3 | |||
| 929245ba69 | |||
| 73a3de5798 | |||
| 3a26076194 | |||
| 04d3a2e2be | |||
| 70882abe9b | |||
| 2771d404a3 | |||
| 7150715e19 | |||
| 4eab358ff7 | |||
| f96db81d3b | |||
| 6f436a463e | |||
| 9d61408837 | |||
| ec2ab5bfaf | |||
| 82c2035823 | |||
| 2e509422ef | |||
| 3ac2125140 | |||
| 7dea33303a | |||
| d246f9a278 | |||
| c1e93aa331 | |||
| 8b49012a0a | |||
| 3fc715ddf5 | |||
| 9c90b3a597 | |||
| 22b0d6dc1a | |||
| 5dc232a6e2 | |||
| c25f9d1d36 | |||
| d617858896 | |||
| 2d587c5662 | |||
| caf0f30eab | |||
| 70d53d8b75 | |||
| fbdca64f73 | |||
| 07b7cf6fe4 | |||
| c52cd48e25 | |||
| d3f62c6913 | |||
| c769be344a | |||
| 372e9a18cd | |||
| b5c6d9ac08 | |||
| f6f25b9449 | |||
| e77f1ed5f7 | |||
| 4c61fb6cf6 | |||
| 1264fab156 | |||
| 4e2c66a098 | |||
| eb51fb6f50 | |||
| 4a2fa77c15 | |||
| 9896e43db5 | |||
| d08c2a016a | |||
| 0e2873a77d | |||
| 280dd4513a | |||
| bb694bad42 | |||
| 9e30ef224d | |||
| a7cd254c29 | |||
| 4d58e48cdb | |||
| bec2250d2c | |||
| e02a7e5e1c | |||
| 5ce5fe3181 | |||
| 531efe7208 | |||
| 2a474bcf72 | |||
| 6dbbf20ff4 | |||
| 5aa4727f34 | |||
| 4cc18877c6 | |||
| 3fde8c153d | |||
| 3462b097e2 | |||
| 552e9c7881 | |||
| 18cd1e5c72 | |||
| 0ce12a9241 | |||
| 56b79f12ac | |||
| 3d2f146460 | |||
| 2e3f576298 | |||
| 2ea7cf287e | |||
| ba9964ff0d | |||
| 2fdefca570 | |||
| 48be2e0e4d | |||
| 8ad34db551 | |||
| ef43938e2b |
@@ -27,9 +27,9 @@ on:
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
# Concurrency: push/release runs are NEVER cancelled so every merge gets its
|
||||
# own SHA-tagged image; :main and :latest are guarded separately by the
|
||||
# move-main and move-latest jobs. PR runs reuse a PR-scoped group with
|
||||
# Concurrency: push/release runs are NEVER cancelled so every merge gets
|
||||
# its own :main or release-tagged image. :latest is guarded separately
|
||||
# by the move-latest job. PR runs reuse a PR-scoped group with
|
||||
# cancel-in-progress: true so rapid pushes to the same PR collapse to the
|
||||
# latest commit.
|
||||
concurrency:
|
||||
@@ -92,10 +92,10 @@ jobs:
|
||||
# pattern for multi-runner multi-platform builds.
|
||||
#
|
||||
# We apply the OCI revision label here (and again on arm64) because
|
||||
# the move-main / move-latest jobs read it off the linux/amd64
|
||||
# sub-manifest config of the floating tag to decide whether it's safe
|
||||
# to advance. The label must be on each per-arch image — manifest
|
||||
# lists themselves don't carry image config labels.
|
||||
# the move-latest job reads it off the linux/amd64 sub-manifest
|
||||
# config of the floating tag to decide whether it's safe to advance.
|
||||
# The label must be on each per-arch image — manifest lists themselves
|
||||
# don't carry image config labels.
|
||||
- name: Push amd64 by digest
|
||||
id: push
|
||||
if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
|
||||
@@ -208,8 +208,14 @@ jobs:
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stitch both per-arch digests into a single tagged multi-arch manifest.
|
||||
# This is a registry-side operation — no building, no layer re-push —
|
||||
# so it runs in ~30 seconds. On main pushes it produces :sha-<sha>.
|
||||
# On releases it produces :<release_tag_name>.
|
||||
# so it runs in ~30 seconds. On main pushes it produces :main; on
|
||||
# releases it produces :<release_tag_name>.
|
||||
#
|
||||
# For main pushes the ancestor check runs BEFORE the manifest push so
|
||||
# we never overwrite :main with an older commit. The top-level
|
||||
# concurrency group (`docker-${{ github.ref }}` with
|
||||
# `cancel-in-progress: false`) already serialises runs per ref; the
|
||||
# ancestor check is defense-in-depth.
|
||||
# ---------------------------------------------------------------------------
|
||||
merge:
|
||||
if: github.repository == 'NousResearch/hermes-agent' && (github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release')
|
||||
@@ -217,10 +223,15 @@ jobs:
|
||||
needs: [build-amd64, build-arm64]
|
||||
timeout-minutes: 10
|
||||
outputs:
|
||||
pushed_sha_tag: ${{ steps.mark_pushed.outputs.pushed }}
|
||||
pushed_release_tag: ${{ steps.mark_release_pushed.outputs.pushed }}
|
||||
release_tag: ${{ steps.tag.outputs.tag }}
|
||||
steps:
|
||||
- name: Checkout code
|
||||
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 1000
|
||||
|
||||
- name: Download digests
|
||||
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
|
||||
with:
|
||||
@@ -237,120 +248,19 @@ jobs:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
# Compute the tag for this run. Main pushes use sha-<sha> (so every
|
||||
# commit gets its own immutable tag); releases use the release tag name.
|
||||
- name: Compute tag
|
||||
id: tag
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" = "release" ]; then
|
||||
echo "tag=${{ github.event.release.tag_name }}" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "tag=sha-${{ github.sha }}" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
- name: Create manifest list and push
|
||||
working-directory: /tmp/digests
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Build the arg array from each digest file (filename = the digest
|
||||
# hex, with no sha256: prefix; empty file content, only the name
|
||||
# matters). Using an array avoids shellcheck SC2046 and keeps
|
||||
# every digest a single argv token even under pathological names.
|
||||
args=()
|
||||
for digest_file in *; do
|
||||
args+=("${IMAGE_NAME}@sha256:${digest_file}")
|
||||
done
|
||||
docker buildx imagetools create \
|
||||
-t "${IMAGE_NAME}:${TAG}" \
|
||||
"${args[@]}"
|
||||
env:
|
||||
IMAGE_NAME: ${{ env.IMAGE_NAME }}
|
||||
TAG: ${{ steps.tag.outputs.tag }}
|
||||
|
||||
- name: Inspect image
|
||||
run: |
|
||||
docker buildx imagetools inspect "${IMAGE_NAME}:${TAG}"
|
||||
env:
|
||||
IMAGE_NAME: ${{ env.IMAGE_NAME }}
|
||||
TAG: ${{ steps.tag.outputs.tag }}
|
||||
|
||||
# Signal to move-main that the SHA tag is live. Only on main pushes;
|
||||
# releases set pushed_release_tag instead.
|
||||
- name: Mark SHA tag pushed
|
||||
id: mark_pushed
|
||||
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
||||
run: echo "pushed=true" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# Signal to move-latest that the release tag is live.
|
||||
- name: Mark release tag pushed
|
||||
id: mark_release_pushed
|
||||
if: github.event_name == 'release'
|
||||
run: echo "pushed=true" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Move :main to point at the SHA tag the merge job pushed.
|
||||
#
|
||||
# :main is the floating tag that tracks the tip of the main branch. Every
|
||||
# merge to main retags :main forward. Users who want "latest dev build"
|
||||
# pull :main; users who want stable releases pull :latest.
|
||||
#
|
||||
# The real serialization guarantee comes from the top-level concurrency
|
||||
# group (`docker-${{ github.ref }}` with `cancel-in-progress: false`),
|
||||
# which ensures at most one workflow run for this ref executes at a time.
|
||||
# That means two move-main steps for the same ref cannot overlap.
|
||||
#
|
||||
# This job has its own concurrency group as defense-in-depth: if the
|
||||
# top-level group is ever loosened, queued move-mains will run serially
|
||||
# in arrival order, each one running the ancestor check below and either
|
||||
# advancing :main or skipping. `cancel-in-progress: false` matches the
|
||||
# top-level setting — we don't want rapid pushes to cancel a queued
|
||||
# move-main, because the ancestor check is the real safety mechanism
|
||||
# and queueing is cheap (move-main is a ~30s registry op).
|
||||
#
|
||||
# Combined with the ancestor check, this means :main only ever moves
|
||||
# forward in git history.
|
||||
# ---------------------------------------------------------------------------
|
||||
move-main:
|
||||
if: |
|
||||
github.repository == 'NousResearch/hermes-agent'
|
||||
&& github.event_name == 'push'
|
||||
&& github.ref == 'refs/heads/main'
|
||||
&& needs.merge.outputs.pushed_sha_tag == 'true'
|
||||
needs: merge
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 10
|
||||
concurrency:
|
||||
group: docker-move-main-${{ github.ref }}
|
||||
cancel-in-progress: false
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 1000
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
# Read the git revision label off the current :main manifest, then
|
||||
# use `git merge-base --is-ancestor` to check whether our commit is a
|
||||
# descendant of it. If :main doesn't exist yet, or its label is
|
||||
# missing, we treat that as "safe to publish". If another run already
|
||||
# advanced :main past us (or diverged), we skip and leave it alone.
|
||||
# use `git merge-base --is-ancestor` to check whether our commit is
|
||||
# a descendant of it. If :main doesn't exist yet, or its label is
|
||||
# missing, we treat that as "safe to publish". If another run
|
||||
# already advanced :main past us (or diverged), we skip and leave
|
||||
# it alone.
|
||||
- name: Decide whether to move :main
|
||||
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
||||
id: main_check
|
||||
run: |
|
||||
set -euo pipefail
|
||||
image=nousresearch/hermes-agent
|
||||
|
||||
# Pull the JSON for the linux/amd64 sub-manifest's config and extract
|
||||
# the OCI revision label with jq — Go template field access can't
|
||||
# handle dots in map keys, so using json+jq is the robust route.
|
||||
image_json=$(
|
||||
docker buildx imagetools inspect "${image}:main" \
|
||||
--format '{{ json (index .Image "linux/amd64") }}' \
|
||||
@@ -383,7 +293,6 @@ jobs:
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Make sure we have the :main commit locally for merge-base.
|
||||
if ! git cat-file -e "${current_sha}^{commit}" 2>/dev/null; then
|
||||
git fetch --no-tags --prune origin \
|
||||
"+refs/heads/main:refs/remotes/origin/main" \
|
||||
@@ -396,7 +305,6 @@ jobs:
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Our SHA must be a descendant of the current :main to be safe.
|
||||
if git merge-base --is-ancestor "${current_sha}" "${GITHUB_SHA}"; then
|
||||
echo "Our commit is a descendant of :main — safe to advance."
|
||||
echo "push_main=true" >> "$GITHUB_OUTPUT"
|
||||
@@ -405,19 +313,48 @@ jobs:
|
||||
echo "push_main=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
# Retag the already-pushed SHA manifest as :main. This is a registry-
|
||||
# side operation — no rebuild, no layer re-push — so it's quick and
|
||||
# atomic per-tag. The ancestor check above plus the cancel-in-progress
|
||||
# concurrency on this job together guarantee we only ever move :main
|
||||
# forward in git history.
|
||||
- name: Move :main to this SHA
|
||||
if: steps.main_check.outputs.push_main == 'true'
|
||||
# Compute the tag for this run. Main pushes tag directly as :main
|
||||
# (no per-commit SHA tags); releases use the release tag name.
|
||||
- name: Compute tag
|
||||
id: tag
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" = "release" ]; then
|
||||
echo "tag=${{ github.event.release.tag_name }}" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "tag=main" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
# Gate the manifest push on the ancestor check for main pushes.
|
||||
# For releases there is no gate — the check doesn't even run.
|
||||
- name: Create manifest list and push
|
||||
if: github.event_name != 'push' || steps.main_check.outputs.push_main == 'true'
|
||||
working-directory: /tmp/digests
|
||||
run: |
|
||||
set -euo pipefail
|
||||
image=nousresearch/hermes-agent
|
||||
args=()
|
||||
for digest_file in *; do
|
||||
args+=("${IMAGE_NAME}@sha256:${digest_file}")
|
||||
done
|
||||
docker buildx imagetools create \
|
||||
--tag "${image}:main" \
|
||||
"${image}:sha-${GITHUB_SHA}"
|
||||
-t "${IMAGE_NAME}:${TAG}" \
|
||||
"${args[@]}"
|
||||
env:
|
||||
IMAGE_NAME: ${{ env.IMAGE_NAME }}
|
||||
TAG: ${{ steps.tag.outputs.tag }}
|
||||
|
||||
- name: Inspect image
|
||||
if: github.event_name != 'push' || steps.main_check.outputs.push_main == 'true'
|
||||
run: |
|
||||
docker buildx imagetools inspect "${IMAGE_NAME}:${TAG}"
|
||||
env:
|
||||
IMAGE_NAME: ${{ env.IMAGE_NAME }}
|
||||
TAG: ${{ steps.tag.outputs.tag }}
|
||||
|
||||
# Signal to move-latest that the release tag is live.
|
||||
- name: Mark release tag pushed
|
||||
id: mark_release_pushed
|
||||
if: github.event_name == 'release'
|
||||
run: echo "pushed=true" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Move :latest to point at the release tag the merge job pushed.
|
||||
@@ -427,10 +364,10 @@ jobs:
|
||||
#
|
||||
# We still run an ancestor check against the existing :latest so that a
|
||||
# backport release on an older branch (e.g. patching v1.1.5 after v1.2.3
|
||||
# is out) doesn't drag :latest backwards. The check is the same shape as
|
||||
# move-main: read the OCI revision label off the current :latest, look up
|
||||
# that commit in git, and only advance if our release commit is a strict
|
||||
# descendant.
|
||||
# is out) doesn't drag :latest backwards. The check is the same shape
|
||||
# as the ancestor check in the merge job for :main: read the OCI
|
||||
# revision label off the current :latest, look up that commit in git,
|
||||
# and only advance if our release commit is a strict descendant.
|
||||
# ---------------------------------------------------------------------------
|
||||
move-latest:
|
||||
if: |
|
||||
|
||||
@@ -23,13 +23,24 @@ concurrency:
|
||||
jobs:
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 30
|
||||
timeout-minutes: 60
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Install system dependencies
|
||||
run: sudo apt-get update && sudo apt-get install -y ripgrep
|
||||
- name: Install ripgrep (prebuilt binary)
|
||||
run: |
|
||||
set -euo pipefail
|
||||
RG_VERSION=15.1.0
|
||||
RG_SHA256=1c9297be4a084eea7ecaedf93eb03d058d6faae29bbc57ecdaf5063921491599
|
||||
RG_TARBALL=ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl.tar.gz
|
||||
curl -sSfL -o "$RG_TARBALL" \
|
||||
"https://github.com/BurntSushi/ripgrep/releases/download/${RG_VERSION}/${RG_TARBALL}"
|
||||
echo "${RG_SHA256} ${RG_TARBALL}" | sha256sum -c -
|
||||
tar -xzf "$RG_TARBALL"
|
||||
sudo mv "ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl/rg" /usr/local/bin/rg
|
||||
rm -rf "$RG_TARBALL" "ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl"
|
||||
rg --version
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
|
||||
@@ -44,9 +55,26 @@ jobs:
|
||||
uv pip install -e ".[all,dev]"
|
||||
|
||||
- name: Run tests
|
||||
# Per-file isolation via scripts/run_tests_parallel.py: discovers
|
||||
# every test_*.py file under tests/ (excluding integration/ + e2e/),
|
||||
# then runs `python -m pytest <file>` in a freshly-spawned subprocess
|
||||
# with bounded parallelism. No xdist, no shared workers, no
|
||||
# module-level state leakage between files.
|
||||
#
|
||||
# Why per-file (not per-test): per-test spawn cost (~250ms × 17k
|
||||
# tests = 70min CPU minimum) blew the wall-clock budget. Per-file
|
||||
# spawn (~250ms × ~850 files = ~3.5min) fits while still giving
|
||||
# every file a fresh interpreter — the only isolation boundary
|
||||
# that matters in practice (cross-file leakage was the original
|
||||
# flake source; intra-file is the test author's responsibility).
|
||||
#
|
||||
# Why drop xdist entirely: xdist's persistent workers accumulate
|
||||
# state across files, which is exactly the leakage we wanted to
|
||||
# fix. ThreadPoolExecutor + subprocess.run is ~60 lines and does
|
||||
# the job with cleaner semantics.
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
python -m pytest tests/ -q --ignore=tests/integration --ignore=tests/e2e --tb=short -n auto --timeout=30 --timeout-method=signal
|
||||
python scripts/run_tests_parallel.py
|
||||
env:
|
||||
# Ensure tests don't accidentally call real APIs
|
||||
OPENROUTER_API_KEY: ""
|
||||
@@ -60,8 +88,19 @@ jobs:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Install system dependencies
|
||||
run: sudo apt-get update && sudo apt-get install -y ripgrep
|
||||
- name: Install ripgrep (prebuilt binary)
|
||||
run: |
|
||||
set -euo pipefail
|
||||
RG_VERSION=15.1.0
|
||||
RG_SHA256=1c9297be4a084eea7ecaedf93eb03d058d6faae29bbc57ecdaf5063921491599
|
||||
RG_TARBALL=ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl.tar.gz
|
||||
curl -sSfL -o "$RG_TARBALL" \
|
||||
"https://github.com/BurntSushi/ripgrep/releases/download/${RG_VERSION}/${RG_TARBALL}"
|
||||
echo "${RG_SHA256} ${RG_TARBALL}" | sha256sum -c -
|
||||
tar -xzf "$RG_TARBALL"
|
||||
sudo mv "ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl/rg" /usr/local/bin/rg
|
||||
rm -rf "$RG_TARBALL" "ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl"
|
||||
rg --version
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
|
||||
|
||||
@@ -18,6 +18,7 @@ __pycache__/web_tools.cpython-310.pyc
|
||||
logs/
|
||||
data/
|
||||
.pytest_cache/
|
||||
.pytest-cache/
|
||||
tmp/
|
||||
temp_vision_images/
|
||||
hermes-*/*
|
||||
|
||||
@@ -1013,17 +1013,39 @@ def profile_env(tmp_path, monkeypatch):
|
||||
|
||||
**ALWAYS use `scripts/run_tests.sh`** — do not call `pytest` directly. The script enforces
|
||||
hermetic environment parity with CI (unset credential vars, TZ=UTC, LANG=C.UTF-8,
|
||||
4 xdist workers matching GHA ubuntu-latest). Direct `pytest` on a 16+ core
|
||||
developer machine with API keys set diverges from CI in ways that have caused
|
||||
multiple "works locally, fails in CI" incidents (and the reverse).
|
||||
`-n auto` xdist workers, in-tree subprocess-isolation plugin). Direct `pytest`
|
||||
on a 16+ core developer machine with API keys set diverges from CI in ways
|
||||
that have caused multiple "works locally, fails in CI" incidents (and the reverse).
|
||||
|
||||
```bash
|
||||
scripts/run_tests.sh # full suite, CI-parity
|
||||
scripts/run_tests.sh tests/gateway/ # one directory
|
||||
scripts/run_tests.sh tests/agent/test_foo.py::test_x # one test
|
||||
scripts/run_tests.sh -v --tb=long # pass-through pytest flags
|
||||
scripts/run_tests.sh --no-isolate tests/foo/ # disable subprocess isolation (faster, for debugging)
|
||||
```
|
||||
|
||||
### Subprocess-per-test isolation
|
||||
|
||||
Every test runs in a freshly-spawned Python subprocess via the in-tree plugin
|
||||
at `tests/_isolate_plugin.py`. This means module-level dicts/sets and
|
||||
ContextVars from one test cannot leak into the next — the historic
|
||||
`_reset_module_state` autouse fixture is gone.
|
||||
|
||||
Implementation notes:
|
||||
|
||||
- The plugin uses `multiprocessing.get_context("spawn")`, which works on
|
||||
Linux, macOS, and Windows alike (POSIX `fork` is not used).
|
||||
- Per-test overhead is ~0.5–1.0s (Python startup + pytest collection). xdist
|
||||
parallelism amortizes this across cores; on a 20-core box the full suite
|
||||
finishes in roughly the same wall time as before, but flake-free.
|
||||
- `isolate_timeout` (configured in `pyproject.toml`) caps each test at 30s.
|
||||
Hangs are killed and surfaced as a failure report.
|
||||
- Pass `--no-isolate` to disable isolation — useful when debugging a single
|
||||
test interactively, or when you specifically want to verify state leakage.
|
||||
- The plugin disables itself in child processes (sentinel envvar
|
||||
`HERMES_ISOLATE_CHILD=1`), so there's no fork-bomb risk.
|
||||
|
||||
### Why the wrapper (and why the old "just call pytest" doesn't work)
|
||||
|
||||
Five real sources of local-vs-CI drift the script closes:
|
||||
@@ -1034,7 +1056,7 @@ Five real sources of local-vs-CI drift the script closes:
|
||||
| HOME / `~/.hermes/` | Your real config+auth.json | Temp dir per test |
|
||||
| Timezone | Local TZ (PDT etc.) | UTC |
|
||||
| Locale | Whatever is set | C.UTF-8 |
|
||||
| xdist workers | `-n auto` = all cores (20+ on a workstation) | `-n 4` matching CI |
|
||||
| xdist workers | `-n auto` = all cores | `-n auto` (safe — subprocess isolation prevents cross-worker flakes) |
|
||||
|
||||
`tests/conftest.py` also enforces points 1-4 as an autouse fixture so ANY pytest
|
||||
invocation (including IDE integrations) gets hermetic behavior — but the wrapper
|
||||
@@ -1042,15 +1064,21 @@ is belt-and-suspenders.
|
||||
|
||||
### Running without the wrapper (only if you must)
|
||||
|
||||
If you can't use the wrapper (e.g. on Windows or inside an IDE that shells
|
||||
pytest directly), at minimum activate the venv and pass `-n 4`:
|
||||
If you can't use the wrapper (e.g. inside an IDE that shells pytest directly),
|
||||
at minimum activate the venv. The isolation plugin loads automatically from
|
||||
`addopts` in `pyproject.toml`, so you get the same per-test process isolation
|
||||
either way.
|
||||
|
||||
```bash
|
||||
source .venv/bin/activate # or: source venv/bin/activate
|
||||
python -m pytest tests/ -q -n 4
|
||||
python -m pytest tests/ -q
|
||||
```
|
||||
|
||||
Worker count above 4 will surface test-ordering flakes that CI never sees.
|
||||
If you need to bypass isolation for fast feedback while debugging:
|
||||
|
||||
```bash
|
||||
python -m pytest tests/agent/test_foo.py -q --no-isolate
|
||||
```
|
||||
|
||||
Always run the full suite before pushing changes.
|
||||
|
||||
|
||||
+93
-2
@@ -71,6 +71,71 @@ def _ra():
|
||||
return run_agent
|
||||
|
||||
|
||||
def _normalized_custom_base_url(value: Any) -> str:
|
||||
if not isinstance(value, str):
|
||||
return ""
|
||||
return value.strip().rstrip("/")
|
||||
|
||||
|
||||
def _custom_provider_model_matches(agent_model: str, entry: Dict[str, Any]) -> bool:
|
||||
provider_model = str(entry.get("model", "") or "").strip().lower()
|
||||
if not provider_model:
|
||||
return True
|
||||
return provider_model == str(agent_model or "").strip().lower()
|
||||
|
||||
|
||||
def _custom_provider_extra_body_for_agent(
    *,
    provider: str,
    model: str,
    base_url: str,
    custom_providers: List[Dict[str, Any]],
) -> Optional[Dict[str, Any]]:
    """Pick the ``extra_body`` configured for the agent's custom endpoint.

    Only entries whose normalized ``base_url`` equals the agent's normalized
    ``base_url`` and whose ``extra_body`` is a non-empty dict are considered.
    Among those, precedence is:

    1. the first entry that names a model matching ``model`` — returned
       immediately;
    2. otherwise the first entry that names no model at all (fallback).

    Entries that name a *different* model are ignored.

    Args:
        provider: Agent provider name; anything other than ``"custom"``
            (case/whitespace-insensitive) short-circuits to ``None``.
        model: Agent model name used for the per-entry model match.
        base_url: Agent base URL; an empty or non-string value yields ``None``.
        custom_providers: Configured provider entries; ``None`` and non-dict
            items are tolerated and skipped.

    Returns:
        A shallow copy of the selected ``extra_body`` dict, or ``None`` when
        nothing applies.
    """
    if (provider or "").strip().lower() != "custom":
        return None

    target_url = _normalized_custom_base_url(base_url)
    if not target_url:
        return None

    fallback: Optional[Dict[str, Any]] = None
    for entry in custom_providers or []:
        if not isinstance(entry, dict):
            continue
        if _normalized_custom_base_url(entry.get("base_url")) != target_url:
            continue
        extra_body = entry.get("extra_body")
        if not isinstance(extra_body, dict) or not extra_body:
            continue
        provider_model = str(entry.get("model", "") or "").strip()
        if provider_model:
            # Model-specific entry: wins outright, but only on a match.
            if _custom_provider_model_matches(model, entry):
                return dict(extra_body)
        elif fallback is None:
            # Model-less entry: remember the first one, keep scanning for an
            # exact model match later in the list.
            fallback = dict(extra_body)

    return fallback
|
||||
|
||||
|
||||
def _merge_custom_provider_extra_body(agent, custom_providers: List[Dict[str, Any]]) -> None:
    """Fold the matching custom-provider ``extra_body`` into the agent's overrides.

    Looks up the ``extra_body`` for ``agent.provider`` / ``agent.model`` /
    ``agent.base_url`` and, if one is found, stores it under
    ``agent.request_overrides["extra_body"]``. Keys already present in the
    agent's existing ``extra_body`` override take precedence over the
    provider-configured values. The merge is top-level only (``dict.update``).

    Args:
        agent: Object exposing ``provider``, ``model`` and ``base_url``;
            ``request_overrides`` is optional and may be ``None``.
        custom_providers: Configured provider entries to search.

    Returns:
        ``None``. ``agent.request_overrides`` is replaced with a new dict when
        a provider ``extra_body`` applies; otherwise the agent is left
        untouched.
    """
    extra_body = _custom_provider_extra_body_for_agent(
        provider=agent.provider,
        model=agent.model,
        base_url=agent.base_url,
        custom_providers=custom_providers,
    )
    if not extra_body:
        return

    # Work on copies so neither the previous overrides dict nor the provider
    # config's top-level mapping is mutated in place.
    overrides = dict(getattr(agent, "request_overrides", {}) or {})
    merged_extra_body = dict(extra_body)
    existing_extra_body = overrides.get("extra_body")
    if isinstance(existing_extra_body, dict):
        # Explicit per-agent values win over provider-level defaults.
        merged_extra_body.update(existing_extra_body)
    overrides["extra_body"] = merged_extra_body
    agent.request_overrides = overrides
|
||||
|
||||
|
||||
def init_agent(
|
||||
agent,
|
||||
base_url: str = None,
|
||||
@@ -1060,7 +1125,18 @@ def init_agent(
|
||||
# through _ra().get_tool_definitions()). Duplicate function names cause
|
||||
# 400 errors on providers that enforce unique names (e.g. Xiaomi
|
||||
# MiMo via Nous Portal).
|
||||
if agent._memory_manager and agent.tools is not None:
|
||||
#
|
||||
# Respect the platform's enabled_toolsets configuration (#5544):
|
||||
# enabled_toolsets is None → no filter, inject (backward compat)
|
||||
# "memory" in enabled_toolsets → user opted in, inject
|
||||
# otherwise (incl. []) → user excluded memory, skip injection
|
||||
#
|
||||
# Without this gate, `platform_toolsets: telegram: []` still leaks memory
|
||||
# provider tools (fact_store, etc.) into the tool surface — a 10x latency
|
||||
# penalty on local models and a frequent trigger of tool-call loops.
|
||||
if agent._memory_manager and agent.tools is not None and (
|
||||
agent.enabled_toolsets is None or "memory" in agent.enabled_toolsets
|
||||
):
|
||||
_existing_tool_names = {
|
||||
t.get("function", {}).get("name")
|
||||
for t in agent.tools
|
||||
@@ -1213,6 +1289,7 @@ def init_agent(
|
||||
# Store for reuse by _check_compression_model_feasibility (auxiliary
|
||||
# compression model context-length detection needs the same list).
|
||||
agent._custom_providers = _custom_providers
|
||||
_merge_custom_provider_extra_body(agent, _custom_providers)
|
||||
|
||||
# Check custom_providers per-model context_length
|
||||
if _config_context_length is None and _custom_providers:
|
||||
@@ -1369,8 +1446,22 @@ def init_agent(
|
||||
# errors. Even with the cache fix, dedup is the right defense
|
||||
# against plugin paths that may register the same schemas via
|
||||
# ctx.register_tool(). Mirrors the memory tools dedup above.
|
||||
#
|
||||
# Respect the platform's enabled_toolsets configuration (#5544):
|
||||
# context engine tools follow the same gating pattern as memory
|
||||
# provider tools — without the gate, `platform_toolsets: telegram: []`
|
||||
# would still leak lcm_* tools into the tool surface and incur the
|
||||
# same local-model latency penalty.
|
||||
agent._context_engine_tool_names: set = set()
|
||||
if hasattr(agent, "context_compressor") and agent.context_compressor and agent.tools is not None:
|
||||
if (
|
||||
hasattr(agent, "context_compressor")
|
||||
and agent.context_compressor
|
||||
and agent.tools is not None
|
||||
and (
|
||||
agent.enabled_toolsets is None
|
||||
or "context_engine" in agent.enabled_toolsets
|
||||
)
|
||||
):
|
||||
_existing_tool_names = {
|
||||
t.get("function", {}).get("name")
|
||||
for t in agent.tools
|
||||
|
||||
+254
-230
@@ -1606,182 +1606,155 @@ def _content_parts_to_anthropic_blocks(parts: Any) -> List[Dict[str, Any]]:
|
||||
return out
|
||||
|
||||
|
||||
def convert_messages_to_anthropic(
|
||||
messages: List[Dict],
|
||||
base_url: str | None = None,
|
||||
model: str | None = None,
|
||||
) -> Tuple[Optional[Any], List[Dict]]:
|
||||
"""Convert OpenAI-format messages to Anthropic format.
|
||||
def _convert_assistant_message(m: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Convert an assistant message to Anthropic content blocks.
|
||||
|
||||
Returns (system_prompt, anthropic_messages).
|
||||
System messages are extracted since Anthropic takes them as a separate param.
|
||||
system_prompt is a string or list of content blocks (when cache_control present).
|
||||
|
||||
When *base_url* is provided and points to a third-party Anthropic-compatible
|
||||
endpoint, all thinking block signatures are stripped. Signatures are
|
||||
Anthropic-proprietary — third-party endpoints cannot validate them and will
|
||||
reject them with HTTP 400 "Invalid signature in thinking block".
|
||||
|
||||
When *model* is provided and matches the Kimi / Moonshot family (or
|
||||
*base_url* is a Kimi / Moonshot host), unsigned thinking blocks
|
||||
synthesised from ``reasoning_content`` are preserved on replayed
|
||||
assistant tool-call messages — Kimi requires the field to exist, even
|
||||
if empty.
|
||||
Handles thinking blocks, regular content, tool calls, and
|
||||
reasoning_content injection for Kimi/DeepSeek endpoints.
|
||||
"""
|
||||
system = None
|
||||
result = []
|
||||
|
||||
for m in messages:
|
||||
role = m.get("role", "user")
|
||||
content = m.get("content", "")
|
||||
|
||||
if role == "system":
|
||||
if isinstance(content, list):
|
||||
# Preserve cache_control markers on content blocks
|
||||
has_cache = any(
|
||||
p.get("cache_control") for p in content if isinstance(p, dict)
|
||||
)
|
||||
if has_cache:
|
||||
system = [p for p in content if isinstance(p, dict)]
|
||||
else:
|
||||
system = "\n".join(
|
||||
p["text"] for p in content if p.get("type") == "text"
|
||||
)
|
||||
else:
|
||||
system = content
|
||||
continue
|
||||
|
||||
if role == "assistant":
|
||||
blocks = _extract_preserved_thinking_blocks(m)
|
||||
if content:
|
||||
if isinstance(content, list):
|
||||
converted_content = _convert_content_to_anthropic(content)
|
||||
if isinstance(converted_content, list):
|
||||
blocks.extend(converted_content)
|
||||
else:
|
||||
blocks.append({"type": "text", "text": str(content)})
|
||||
for tc in m.get("tool_calls", []):
|
||||
if not tc or not isinstance(tc, dict):
|
||||
continue
|
||||
fn = tc.get("function", {})
|
||||
args = fn.get("arguments", "{}")
|
||||
try:
|
||||
parsed_args = json.loads(args) if isinstance(args, str) else args
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
parsed_args = {}
|
||||
blocks.append({
|
||||
"type": "tool_use",
|
||||
"id": _sanitize_tool_id(tc.get("id", "")),
|
||||
"name": fn.get("name", ""),
|
||||
"input": parsed_args,
|
||||
})
|
||||
# Kimi's /coding endpoint (Anthropic protocol) requires assistant
|
||||
# tool-call messages to carry reasoning_content when thinking is
|
||||
# enabled server-side. Preserve it as a thinking block so Kimi
|
||||
# can validate the message history. See hermes-agent#13848.
|
||||
#
|
||||
# Accept empty string "" — _copy_reasoning_content_for_api()
|
||||
# injects "" as a tier-3 fallback for Kimi tool-call messages
|
||||
# that had no reasoning. Kimi requires the field to exist, even
|
||||
# if empty.
|
||||
#
|
||||
# Prepend (not append): Anthropic protocol requires thinking
|
||||
# blocks before text and tool_use blocks.
|
||||
#
|
||||
# Guard: only add when reasoning_details didn't already contribute
|
||||
# thinking blocks. On native Anthropic, reasoning_details produces
|
||||
# signed thinking blocks — adding another unsigned one from
|
||||
# reasoning_content would create a duplicate (same text) that gets
|
||||
# downgraded to a spurious text block on the last assistant message.
|
||||
reasoning_content = m.get("reasoning_content")
|
||||
_already_has_thinking = any(
|
||||
isinstance(b, dict) and b.get("type") in {"thinking", "redacted_thinking"}
|
||||
for b in blocks
|
||||
)
|
||||
if isinstance(reasoning_content, str) and not _already_has_thinking:
|
||||
blocks.insert(0, {"type": "thinking", "thinking": reasoning_content})
|
||||
# Anthropic rejects empty assistant content
|
||||
effective = blocks or content
|
||||
if not effective or effective == "":
|
||||
effective = [{"type": "text", "text": "(empty)"}]
|
||||
result.append({"role": "assistant", "content": effective})
|
||||
continue
|
||||
|
||||
if role == "tool":
|
||||
# Sanitize tool_use_id and ensure non-empty content.
|
||||
# Computer-use (and other multimodal) tool results arrive as
|
||||
# either a list of OpenAI-style content parts, or a dict
|
||||
# marked `_multimodal` with an embedded `content` list. Convert
|
||||
# both into Anthropic `tool_result` inner blocks (text + image).
|
||||
multimodal_blocks: Optional[List[Dict[str, Any]]] = None
|
||||
if isinstance(content, dict) and content.get("_multimodal"):
|
||||
multimodal_blocks = _content_parts_to_anthropic_blocks(
|
||||
content.get("content") or []
|
||||
)
|
||||
# Fallback text if the conversion produced nothing usable.
|
||||
if not multimodal_blocks and content.get("text_summary"):
|
||||
multimodal_blocks = [
|
||||
{"type": "text", "text": str(content["text_summary"])}
|
||||
]
|
||||
elif isinstance(content, list):
|
||||
converted = _content_parts_to_anthropic_blocks(content)
|
||||
if any(b.get("type") == "image" for b in converted):
|
||||
multimodal_blocks = converted
|
||||
# Back-compat: some callers stash blocks under a private key.
|
||||
if multimodal_blocks is None:
|
||||
stashed = m.get("_anthropic_content_blocks")
|
||||
if isinstance(stashed, list) and stashed:
|
||||
text_content = content if isinstance(content, str) and content.strip() else None
|
||||
multimodal_blocks = (
|
||||
[{"type": "text", "text": text_content}] + stashed
|
||||
if text_content else list(stashed)
|
||||
)
|
||||
|
||||
if multimodal_blocks:
|
||||
result_content: Any = multimodal_blocks
|
||||
elif isinstance(content, str):
|
||||
result_content = content
|
||||
else:
|
||||
result_content = json.dumps(content) if content else "(no output)"
|
||||
if not result_content:
|
||||
result_content = "(no output)"
|
||||
tool_result = {
|
||||
"type": "tool_result",
|
||||
"tool_use_id": _sanitize_tool_id(m.get("tool_call_id", "")),
|
||||
"content": result_content,
|
||||
}
|
||||
if isinstance(m.get("cache_control"), dict):
|
||||
tool_result["cache_control"] = dict(m["cache_control"])
|
||||
# Merge consecutive tool results into one user message
|
||||
if (
|
||||
result
|
||||
and result[-1]["role"] == "user"
|
||||
and isinstance(result[-1]["content"], list)
|
||||
and result[-1]["content"]
|
||||
and result[-1]["content"][0].get("type") == "tool_result"
|
||||
):
|
||||
result[-1]["content"].append(tool_result)
|
||||
else:
|
||||
result.append({"role": "user", "content": [tool_result]})
|
||||
continue
|
||||
|
||||
# Regular user message — validate non-empty content (Anthropic rejects empty)
|
||||
content = m.get("content", "")
|
||||
blocks = _extract_preserved_thinking_blocks(m)
|
||||
if content:
|
||||
if isinstance(content, list):
|
||||
converted_blocks = _convert_content_to_anthropic(content)
|
||||
# Check if all text blocks are empty
|
||||
if not converted_blocks or all(
|
||||
b.get("text", "").strip() == ""
|
||||
for b in converted_blocks
|
||||
if isinstance(b, dict) and b.get("type") == "text"
|
||||
):
|
||||
converted_blocks = [{"type": "text", "text": "(empty message)"}]
|
||||
result.append({"role": "user", "content": converted_blocks})
|
||||
converted_content = _convert_content_to_anthropic(content)
|
||||
if isinstance(converted_content, list):
|
||||
blocks.extend(converted_content)
|
||||
else:
|
||||
# Validate string content is non-empty
|
||||
if not content or (isinstance(content, str) and not content.strip()):
|
||||
content = "(empty message)"
|
||||
result.append({"role": "user", "content": content})
|
||||
blocks.append({"type": "text", "text": str(content)})
|
||||
for tc in m.get("tool_calls", []):
|
||||
if not tc or not isinstance(tc, dict):
|
||||
continue
|
||||
fn = tc.get("function", {})
|
||||
args = fn.get("arguments", "{}")
|
||||
try:
|
||||
parsed_args = json.loads(args) if isinstance(args, str) else args
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
parsed_args = {}
|
||||
blocks.append({
|
||||
"type": "tool_use",
|
||||
"id": _sanitize_tool_id(tc.get("id", "")),
|
||||
"name": fn.get("name", ""),
|
||||
"input": parsed_args,
|
||||
})
|
||||
# Kimi's /coding endpoint (Anthropic protocol) requires assistant
|
||||
# tool-call messages to carry reasoning_content when thinking is
|
||||
# enabled server-side. Preserve it as a thinking block so Kimi
|
||||
# can validate the message history. See hermes-agent#13848.
|
||||
#
|
||||
# Accept empty string "" — _copy_reasoning_content_for_api()
|
||||
# injects "" as a tier-3 fallback for Kimi tool-call messages
|
||||
# that had no reasoning. Kimi requires the field to exist, even
|
||||
# if empty.
|
||||
#
|
||||
# Prepend (not append): Anthropic protocol requires thinking
|
||||
# blocks before text and tool_use blocks.
|
||||
#
|
||||
# Guard: only add when reasoning_details didn't already contribute
|
||||
# thinking blocks. On native Anthropic, reasoning_details produces
|
||||
# signed thinking blocks — adding another unsigned one from
|
||||
# reasoning_content would create a duplicate (same text) that gets
|
||||
# downgraded to a spurious text block on the last assistant message.
|
||||
reasoning_content = m.get("reasoning_content")
|
||||
_already_has_thinking = any(
|
||||
isinstance(b, dict) and b.get("type") in {"thinking", "redacted_thinking"}
|
||||
for b in blocks
|
||||
)
|
||||
if isinstance(reasoning_content, str) and not _already_has_thinking:
|
||||
blocks.insert(0, {"type": "thinking", "thinking": reasoning_content})
|
||||
# Anthropic rejects empty assistant content
|
||||
effective = blocks or content
|
||||
if not effective or effective == "":
|
||||
effective = [{"type": "text", "text": "(empty)"}]
|
||||
return {"role": "assistant", "content": effective}
|
||||
|
||||
|
||||
def _convert_tool_message_to_result(
|
||||
result: List[Dict[str, Any]], m: Dict[str, Any]
|
||||
) -> None:
|
||||
"""Convert a tool message to an Anthropic tool_result, merging consecutive
|
||||
results into one user message.
|
||||
|
||||
Mutates ``result`` in place — either appends a new user message or extends
|
||||
the trailing user message's tool_result list.
|
||||
"""
|
||||
content = m.get("content", "")
|
||||
multimodal_blocks: Optional[List[Dict[str, Any]]] = None
|
||||
if isinstance(content, dict) and content.get("_multimodal"):
|
||||
multimodal_blocks = _content_parts_to_anthropic_blocks(
|
||||
content.get("content") or []
|
||||
)
|
||||
# Fallback text if the conversion produced nothing usable.
|
||||
if not multimodal_blocks and content.get("text_summary"):
|
||||
multimodal_blocks = [
|
||||
{"type": "text", "text": str(content["text_summary"])}
|
||||
]
|
||||
elif isinstance(content, list):
|
||||
converted = _content_parts_to_anthropic_blocks(content)
|
||||
if any(b.get("type") == "image" for b in converted):
|
||||
multimodal_blocks = converted
|
||||
# Back-compat: some callers stash blocks under a private key.
|
||||
if multimodal_blocks is None:
|
||||
stashed = m.get("_anthropic_content_blocks")
|
||||
if isinstance(stashed, list) and stashed:
|
||||
text_content = content if isinstance(content, str) and content.strip() else None
|
||||
multimodal_blocks = (
|
||||
[{"type": "text", "text": text_content}] + stashed
|
||||
if text_content else list(stashed)
|
||||
)
|
||||
|
||||
if multimodal_blocks:
|
||||
result_content: Any = multimodal_blocks
|
||||
elif isinstance(content, str):
|
||||
result_content = content
|
||||
else:
|
||||
result_content = json.dumps(content) if content else "(no output)"
|
||||
if not result_content:
|
||||
result_content = "(no output)"
|
||||
tool_result = {
|
||||
"type": "tool_result",
|
||||
"tool_use_id": _sanitize_tool_id(m.get("tool_call_id", "")),
|
||||
"content": result_content,
|
||||
}
|
||||
if isinstance(m.get("cache_control"), dict):
|
||||
tool_result["cache_control"] = dict(m["cache_control"])
|
||||
# Merge consecutive tool results into one user message
|
||||
if (
|
||||
result
|
||||
and result[-1]["role"] == "user"
|
||||
and isinstance(result[-1]["content"], list)
|
||||
and result[-1]["content"]
|
||||
and result[-1]["content"][0].get("type") == "tool_result"
|
||||
):
|
||||
result[-1]["content"].append(tool_result)
|
||||
else:
|
||||
result.append({"role": "user", "content": [tool_result]})
|
||||
|
||||
|
||||
def _convert_user_message(content: Any) -> Dict[str, Any]:
|
||||
"""Validate and convert a user message to anthropic format."""
|
||||
if isinstance(content, list):
|
||||
converted_blocks = _convert_content_to_anthropic(content)
|
||||
if not converted_blocks or all(
|
||||
b.get("text", "").strip() == ""
|
||||
for b in converted_blocks
|
||||
if isinstance(b, dict) and b.get("type") == "text"
|
||||
):
|
||||
converted_blocks = [{"type": "text", "text": "(empty message)"}]
|
||||
return {"role": "user", "content": converted_blocks}
|
||||
else:
|
||||
if not content or (isinstance(content, str) and not content.strip()):
|
||||
content = "(empty message)"
|
||||
return {"role": "user", "content": content}
|
||||
|
||||
|
||||
def _strip_orphaned_tool_blocks(result: List[Dict[str, Any]]) -> None:
|
||||
"""Strip tool_use blocks with no matching tool_result, and vice versa.
|
||||
|
||||
Context compression or session truncation can remove either side of a
|
||||
tool-call pair. Anthropic rejects both orphans with HTTP 400.
|
||||
|
||||
Mutates ``result`` in place.
|
||||
"""
|
||||
# Strip orphaned tool_use blocks (no matching tool_result follows)
|
||||
tool_result_ids = set()
|
||||
for m in result:
|
||||
@@ -1799,10 +1772,7 @@ def convert_messages_to_anthropic(
|
||||
if not m["content"]:
|
||||
m["content"] = [{"type": "text", "text": "(tool call removed)"}]
|
||||
|
||||
# Strip orphaned tool_result blocks (no matching tool_use precedes them).
|
||||
# This is the mirror of the above: context compression or session truncation
|
||||
# can remove an assistant message containing a tool_use while leaving the
|
||||
# subsequent tool_result intact. Anthropic rejects these with a 400.
|
||||
# Strip orphaned tool_result blocks (no matching tool_use precedes them)
|
||||
tool_use_ids = set()
|
||||
for m in result:
|
||||
if m["role"] == "assistant" and isinstance(m["content"], list):
|
||||
@@ -1819,12 +1789,16 @@ def convert_messages_to_anthropic(
|
||||
if not m["content"]:
|
||||
m["content"] = [{"type": "text", "text": "(tool result removed)"}]
|
||||
|
||||
# Enforce strict role alternation (Anthropic rejects consecutive same-role messages)
|
||||
|
||||
def _merge_consecutive_roles(result: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Merge consecutive same-role messages to enforce Anthropic alternation.
|
||||
|
||||
Returns a new list (caller must rebind ``result``).
|
||||
"""
|
||||
fixed = []
|
||||
for m in result:
|
||||
if fixed and fixed[-1]["role"] == m["role"]:
|
||||
if m["role"] == "user":
|
||||
# Merge consecutive user messages
|
||||
prev_content = fixed[-1]["content"]
|
||||
curr_content = m["content"]
|
||||
if isinstance(prev_content, str) and isinstance(curr_content, str):
|
||||
@@ -1832,7 +1806,6 @@ def convert_messages_to_anthropic(
|
||||
elif isinstance(prev_content, list) and isinstance(curr_content, list):
|
||||
fixed[-1]["content"] = prev_content + curr_content
|
||||
else:
|
||||
# Mixed types — wrap string in list
|
||||
if isinstance(prev_content, str):
|
||||
prev_content = [{"type": "text", "text": prev_content}]
|
||||
if isinstance(curr_content, str):
|
||||
@@ -1855,7 +1828,6 @@ def convert_messages_to_anthropic(
|
||||
elif isinstance(prev_blocks, str) and isinstance(curr_blocks, str):
|
||||
fixed[-1]["content"] = prev_blocks + "\n" + curr_blocks
|
||||
else:
|
||||
# Mixed types — normalize both to list and merge
|
||||
if isinstance(prev_blocks, str):
|
||||
prev_blocks = [{"type": "text", "text": prev_blocks}]
|
||||
if isinstance(curr_blocks, str):
|
||||
@@ -1863,37 +1835,34 @@ def convert_messages_to_anthropic(
|
||||
fixed[-1]["content"] = prev_blocks + curr_blocks
|
||||
else:
|
||||
fixed.append(m)
|
||||
result = fixed
|
||||
return fixed
|
||||
|
||||
# ── Thinking block signature management ──────────────────────────
|
||||
# Anthropic signs thinking blocks against the full turn content.
|
||||
# Any upstream mutation (context compression, session truncation,
|
||||
# orphan stripping, message merging) invalidates the signature,
|
||||
# causing HTTP 400 "Invalid signature in thinking block".
|
||||
#
|
||||
# Signatures are Anthropic-proprietary. Third-party endpoints
|
||||
# (MiniMax, Microsoft Foundry, self-hosted proxies) cannot validate
|
||||
# them and will reject them outright. When targeting a third-party
|
||||
# endpoint, strip ALL thinking/redacted_thinking blocks from every
|
||||
# assistant message — the third-party will generate its own
|
||||
# thinking blocks if it supports extended thinking.
|
||||
#
|
||||
# For direct Anthropic (strategy following clawdbot/OpenClaw):
|
||||
# 1. Strip thinking/redacted_thinking from all assistant messages
|
||||
# EXCEPT the last one — preserves reasoning continuity on the
|
||||
# current tool-use chain while avoiding stale signature errors.
|
||||
# 2. Downgrade unsigned thinking blocks (no signature) to text —
|
||||
# Anthropic can't validate them and will reject them.
|
||||
# 3. Strip cache_control from thinking/redacted_thinking blocks —
|
||||
# cache markers can interfere with signature validation.
|
||||
|
||||
def _manage_thinking_signatures(
|
||||
result: List[Dict[str, Any]], base_url: str | None, model: str | None
|
||||
) -> None:
|
||||
"""Strip or preserve thinking blocks based on endpoint type.
|
||||
|
||||
Anthropic signs thinking blocks against the full turn content.
|
||||
Any upstream mutation (context compression, session truncation, orphan
|
||||
stripping, message merging) invalidates the signature, causing HTTP 400
|
||||
"Invalid signature in thinking block".
|
||||
|
||||
Signatures are Anthropic-proprietary. Third-party endpoints (MiniMax,
|
||||
Azure AI Foundry, AWS Bedrock, self-hosted proxies) cannot validate them
|
||||
and will reject them outright. Kimi's /coding and DeepSeek's /anthropic
|
||||
endpoints speak the Anthropic protocol upstream but require unsigned
|
||||
thinking blocks (synthesised from ``reasoning_content``) to round-trip on
|
||||
replayed assistant tool-call messages. See hermes-agent#13848 (Kimi) and
|
||||
hermes-agent#16748 (DeepSeek).
|
||||
|
||||
Mutates ``result`` in place.
|
||||
"""
|
||||
_THINKING_TYPES = frozenset(("thinking", "redacted_thinking"))
|
||||
_is_third_party = _is_third_party_anthropic_endpoint(base_url)
|
||||
# Kimi /coding and DeepSeek /anthropic share a contract: both speak the
|
||||
# Anthropic Messages protocol upstream but require that thinking blocks
|
||||
# synthesised from reasoning_content round-trip on subsequent turns when
|
||||
# thinking is enabled. Signed Anthropic blocks still have to be stripped
|
||||
# (neither endpoint can validate Anthropic's signatures); unsigned blocks
|
||||
# are preserved. See hermes-agent#13848 (Kimi) and #16748 (DeepSeek).
|
||||
# Kimi / DeepSeek share a contract: strip signed Anthropic blocks
|
||||
# (neither upstream can validate Anthropic signatures), preserve unsigned
|
||||
# ones synthesised from reasoning_content. See #13848, #16748.
|
||||
_preserve_unsigned_thinking = (
|
||||
_is_kimi_family_endpoint(base_url, model)
|
||||
or _is_deepseek_anthropic_endpoint(base_url)
|
||||
@@ -1910,26 +1879,19 @@ def convert_messages_to_anthropic(
|
||||
continue
|
||||
|
||||
if _preserve_unsigned_thinking:
|
||||
# Kimi's /coding and DeepSeek's /anthropic endpoints both enable
|
||||
# thinking server-side and require unsigned thinking blocks on
|
||||
# replayed assistant tool-call messages. Strip signed Anthropic
|
||||
# blocks (neither upstream can validate Anthropic signatures) but
|
||||
# preserve the unsigned ones we synthesised from reasoning_content.
|
||||
# Kimi / DeepSeek: strip signed, preserve unsigned.
|
||||
new_content = []
|
||||
for b in m["content"]:
|
||||
if not isinstance(b, dict) or b.get("type") not in _THINKING_TYPES:
|
||||
new_content.append(b)
|
||||
continue
|
||||
if b.get("signature") or b.get("data"):
|
||||
# Anthropic-signed block — upstream can't validate, strip
|
||||
# Signed (or redacted-with-data) — upstream can't validate, strip.
|
||||
continue
|
||||
# Unsigned thinking (synthesised from reasoning_content) —
|
||||
# keep it: the upstream needs it for message-history validation.
|
||||
new_content.append(b)
|
||||
m["content"] = new_content or [{"type": "text", "text": "(empty)"}]
|
||||
elif _is_third_party or idx != last_assistant_idx:
|
||||
# Third-party endpoint: strip ALL thinking blocks from every
|
||||
# assistant message — signatures are Anthropic-proprietary.
|
||||
# Third-party: strip ALL thinking blocks (signatures are proprietary).
|
||||
# Direct Anthropic: strip from non-latest assistant messages only.
|
||||
stripped = [
|
||||
b for b in m["content"]
|
||||
@@ -1937,24 +1899,21 @@ def convert_messages_to_anthropic(
|
||||
]
|
||||
m["content"] = stripped or [{"type": "text", "text": "(thinking elided)"}]
|
||||
else:
|
||||
# Latest assistant on direct Anthropic: keep signed thinking
|
||||
# blocks for reasoning continuity; downgrade unsigned ones to
|
||||
# plain text.
|
||||
# Latest assistant on direct Anthropic: keep signed, downgrade unsigned
|
||||
# to text so the reasoning isn't lost.
|
||||
new_content = []
|
||||
for b in m["content"]:
|
||||
if not isinstance(b, dict) or b.get("type") not in _THINKING_TYPES:
|
||||
new_content.append(b)
|
||||
continue
|
||||
if b.get("type") == "redacted_thinking":
|
||||
# Redacted blocks use 'data' for the signature payload
|
||||
# Redacted blocks use 'data' for the signature payload —
|
||||
# drop the block when 'data' is missing (can't be validated).
|
||||
if b.get("data"):
|
||||
new_content.append(b)
|
||||
# else: drop — no data means it can't be validated
|
||||
elif b.get("signature"):
|
||||
# Signed thinking block — keep it
|
||||
new_content.append(b)
|
||||
else:
|
||||
# Unsigned thinking — downgrade to text so it's not lost
|
||||
thinking_text = b.get("thinking", "")
|
||||
if thinking_text:
|
||||
new_content.append({"type": "text", "text": thinking_text})
|
||||
@@ -1966,12 +1925,15 @@ def convert_messages_to_anthropic(
|
||||
if isinstance(b, dict) and b.get("type") in _THINKING_TYPES:
|
||||
b.pop("cache_control", None)
|
||||
|
||||
# ── Image eviction: keep only the most recent N screenshots ─────
|
||||
# computer_use screenshots (base64 images) sit inside tool_result
|
||||
# blocks: they accumulate and are sent with every API call. Each
|
||||
# costs ~1,465 tokens; after 10+ the conversation becomes slow
|
||||
# even for simple text queries. Walk backward, keep the most recent
|
||||
# _MAX_KEEP_IMAGES, replace older ones with a text placeholder.
|
||||
|
||||
def _evict_old_screenshots(result: List[Dict[str, Any]]) -> None:
|
||||
"""Keep only the most recent ``_MAX_KEEP_IMAGES`` computer-use screenshots.
|
||||
|
||||
Base64 images cost ~1,465 tokens each and accumulate across tool calls.
|
||||
Walk backward, keep the most recent N, replace older ones with a placeholder.
|
||||
|
||||
Mutates ``result`` in place.
|
||||
"""
|
||||
_MAX_KEEP_IMAGES = 3
|
||||
_image_count = 0
|
||||
for msg in reversed(result):
|
||||
@@ -1998,6 +1960,68 @@ def convert_messages_to_anthropic(
|
||||
for b in inner
|
||||
]
|
||||
|
||||
|
||||
def convert_messages_to_anthropic(
|
||||
messages: List[Dict],
|
||||
base_url: str | None = None,
|
||||
model: str | None = None,
|
||||
) -> Tuple[Optional[Any], List[Dict]]:
|
||||
"""Convert OpenAI-format messages to Anthropic format.
|
||||
|
||||
Returns (system_prompt, anthropic_messages).
|
||||
System messages are extracted since Anthropic takes them as a separate param.
|
||||
system_prompt is a string or list of content blocks (when cache_control present).
|
||||
|
||||
When *base_url* is provided and points to a third-party Anthropic-compatible
|
||||
endpoint, all thinking block signatures are stripped. Signatures are
|
||||
Anthropic-proprietary — third-party endpoints cannot validate them and will
|
||||
reject them with HTTP 400 "Invalid signature in thinking block".
|
||||
|
||||
When *model* is provided and matches the Kimi / Moonshot family (or
|
||||
*base_url* is a Kimi / Moonshot host), unsigned thinking blocks
|
||||
synthesised from ``reasoning_content`` are preserved on replayed
|
||||
assistant tool-call messages — Kimi requires the field to exist, even
|
||||
if empty.
|
||||
"""
|
||||
system = None
|
||||
result: List[Dict[str, Any]] = []
|
||||
|
||||
for m in messages:
|
||||
role = m.get("role", "user")
|
||||
content = m.get("content", "")
|
||||
|
||||
if role == "system":
|
||||
if isinstance(content, list):
|
||||
# Preserve cache_control markers on content blocks
|
||||
has_cache = any(
|
||||
p.get("cache_control") for p in content if isinstance(p, dict)
|
||||
)
|
||||
if has_cache:
|
||||
system = [p for p in content if isinstance(p, dict)]
|
||||
else:
|
||||
system = "\n".join(
|
||||
p["text"] for p in content if p.get("type") == "text"
|
||||
)
|
||||
else:
|
||||
system = content
|
||||
continue
|
||||
|
||||
if role == "assistant":
|
||||
result.append(_convert_assistant_message(m))
|
||||
continue
|
||||
|
||||
if role == "tool":
|
||||
_convert_tool_message_to_result(result, m)
|
||||
continue
|
||||
|
||||
# Regular user message
|
||||
result.append(_convert_user_message(content))
|
||||
|
||||
_strip_orphaned_tool_blocks(result)
|
||||
result = _merge_consecutive_roles(result)
|
||||
_manage_thinking_signatures(result, base_url, model)
|
||||
_evict_old_screenshots(result)
|
||||
|
||||
return system, result
|
||||
|
||||
|
||||
|
||||
@@ -46,6 +46,7 @@ from agent.message_sanitization import (
|
||||
_strip_non_ascii,
|
||||
)
|
||||
from agent.model_metadata import (
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
estimate_messages_tokens_rough,
|
||||
estimate_request_tokens_rough,
|
||||
get_next_probe_tier,
|
||||
@@ -73,6 +74,50 @@ from utils import base_url_host_matches, env_var_enabled
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _ollama_context_limit_error(agent: Any, request_tokens: int) -> Optional[str]:
|
||||
"""Return a user-facing error when Ollama is loaded with too little context."""
|
||||
if not getattr(agent, "tools", None):
|
||||
return None
|
||||
|
||||
runtime_ctx = getattr(agent, "_ollama_num_ctx", None)
|
||||
if not isinstance(runtime_ctx, int) or runtime_ctx <= 0:
|
||||
return None
|
||||
if runtime_ctx >= MINIMUM_CONTEXT_LENGTH:
|
||||
return None
|
||||
|
||||
model = getattr(agent, "model", "") or "the selected model"
|
||||
base_url = getattr(agent, "base_url", "") or "unknown base URL"
|
||||
provider = getattr(agent, "provider", "") or "unknown"
|
||||
tool_count = len(getattr(agent, "tools", None) or [])
|
||||
|
||||
logger.warning(
|
||||
"Ollama runtime context too small for Hermes tool use: "
|
||||
"model=%s provider=%s base_url=%s runtime_context=%d "
|
||||
"minimum_context=%d estimated_request_tokens=%d tool_count=%d "
|
||||
"session=%s",
|
||||
model,
|
||||
provider,
|
||||
base_url,
|
||||
runtime_ctx,
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
request_tokens,
|
||||
tool_count,
|
||||
getattr(agent, "session_id", None) or "none",
|
||||
)
|
||||
|
||||
return (
|
||||
f"Ollama loaded `{model}` with only {runtime_ctx:,} tokens of runtime "
|
||||
f"context, but Hermes needs at least {MINIMUM_CONTEXT_LENGTH:,} tokens "
|
||||
"for reliable tool use.\n\n"
|
||||
"Increase the Ollama context for this model and restart/reload the "
|
||||
"model before trying again. A known-good starting point is 65,536 "
|
||||
"tokens. In Hermes config, set `model.ollama_num_ctx: 65536` "
|
||||
"(and `model.context_length: 65536` if you also override the displayed "
|
||||
"model context). If you manage the model through an Ollama Modelfile, "
|
||||
"set `PARAMETER num_ctx 65536` there instead."
|
||||
)
|
||||
|
||||
|
||||
def _ra():
|
||||
"""Lazy reference to ``run_agent`` so callers can patch
|
||||
``run_agent.handle_function_call`` / ``run_agent._set_interrupt`` /
|
||||
@@ -527,6 +572,7 @@ def run_conversation(
|
||||
api_call_count = 0
|
||||
final_response = None
|
||||
interrupted = False
|
||||
failed = False
|
||||
codex_ack_continuations = 0
|
||||
length_continue_retries = 0
|
||||
truncated_tool_call_retries = 0
|
||||
@@ -883,6 +929,26 @@ def run_conversation(
|
||||
# Calculate approximate request size for logging
|
||||
total_chars = sum(len(str(msg)) for msg in api_messages)
|
||||
approx_tokens = estimate_messages_tokens_rough(api_messages)
|
||||
approx_request_tokens = estimate_request_tokens_rough(
|
||||
api_messages, tools=agent.tools or None
|
||||
)
|
||||
|
||||
_runtime_context_error = _ollama_context_limit_error(
|
||||
agent, approx_request_tokens
|
||||
)
|
||||
if _runtime_context_error:
|
||||
final_response = _runtime_context_error
|
||||
failed = True
|
||||
_turn_exit_reason = "ollama_runtime_context_too_small"
|
||||
messages.append({"role": "assistant", "content": final_response})
|
||||
agent._emit_status("❌ Ollama runtime context is too small for Hermes tool use")
|
||||
api_call_count -= 1
|
||||
agent._api_call_count = api_call_count
|
||||
try:
|
||||
agent.iteration_budget.refund()
|
||||
except Exception:
|
||||
pass
|
||||
break
|
||||
|
||||
# Thinking spinner for quiet mode (animated during API call)
|
||||
thinking_spinner = None
|
||||
@@ -923,6 +989,7 @@ def run_conversation(
|
||||
copilot_auth_retry_attempted=False
|
||||
thinking_sig_retry_attempted = False
|
||||
image_shrink_retry_attempted = False
|
||||
multimodal_tool_content_retry_attempted = False
|
||||
oauth_1m_beta_retry_attempted = False
|
||||
llama_cpp_grammar_retry_attempted = False
|
||||
has_retried_429 = False
|
||||
@@ -1994,6 +2061,31 @@ def run_conversation(
|
||||
"or shrink didn't reduce size; surfacing original error."
|
||||
)
|
||||
|
||||
# Multimodal-tool-content recovery: providers that follow
|
||||
# the OpenAI spec strictly (tool message content must be a
|
||||
# string) reject our list-type content with a 400. Strip
|
||||
# image parts from any list-type tool messages, mark the
|
||||
# (provider, model) as no-list-tool-content for the rest
|
||||
# of this session so future tool results preemptively
|
||||
# downgrade, and retry once. See issue #27344.
|
||||
if (
|
||||
classified.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||
and not multimodal_tool_content_retry_attempted
|
||||
):
|
||||
multimodal_tool_content_retry_attempted = True
|
||||
if agent._try_strip_image_parts_from_tool_messages(api_messages):
|
||||
agent._vprint(
|
||||
f"{agent.log_prefix}📐 Provider rejected list-type tool content — "
|
||||
f"downgraded screenshots to text and retrying...",
|
||||
force=True,
|
||||
)
|
||||
continue
|
||||
else:
|
||||
logger.info(
|
||||
"multimodal-tool-content recovery: no list-type tool "
|
||||
"messages with image parts found; surfacing original error."
|
||||
)
|
||||
|
||||
# Anthropic OAuth subscription rejected the 1M-context beta
|
||||
# header ("long context beta is not yet available for this
|
||||
# subscription"). Disable the beta for the rest of this
|
||||
@@ -3848,7 +3940,11 @@ def run_conversation(
|
||||
)
|
||||
|
||||
# Determine if conversation completed successfully
|
||||
completed = final_response is not None and api_call_count < agent.max_iterations
|
||||
completed = (
|
||||
final_response is not None
|
||||
and api_call_count < agent.max_iterations
|
||||
and not failed
|
||||
)
|
||||
|
||||
# Save trajectory if enabled. ``user_message`` may be a multimodal
|
||||
# list of parts; the trajectory format wants a plain string.
|
||||
@@ -3998,6 +4094,7 @@ def run_conversation(
|
||||
"api_calls": api_call_count,
|
||||
"completed": completed,
|
||||
"turn_exit_reason": _turn_exit_reason,
|
||||
"failed": failed,
|
||||
"partial": False, # True only when stopped due to invalid tool calls
|
||||
"interrupted": interrupted,
|
||||
"response_previewed": getattr(agent, "_response_was_previewed", False),
|
||||
|
||||
@@ -50,6 +50,7 @@ from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from hermes_constants import get_hermes_home
|
||||
from agent.skill_utils import is_excluded_skill_path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -176,7 +177,9 @@ def get_keep() -> int:
|
||||
|
||||
def _count_skill_files(base: Path) -> int:
|
||||
try:
|
||||
return sum(1 for _ in base.rglob("SKILL.md"))
|
||||
return sum(
|
||||
1 for p in base.rglob("SKILL.md") if not is_excluded_skill_path(p)
|
||||
)
|
||||
except OSError:
|
||||
return 0
|
||||
|
||||
|
||||
@@ -50,6 +50,7 @@ class FailoverReason(enum.Enum):
|
||||
|
||||
# Request format
|
||||
format_error = "format_error" # 400 bad request — abort or strip + retry
|
||||
multimodal_tool_content_unsupported = "multimodal_tool_content_unsupported" # Provider rejected list-type content in tool messages (e.g. Xiaomi MiMo) — downgrade to text and retry
|
||||
|
||||
# Provider-specific
|
||||
thinking_signature = "thinking_signature" # Anthropic thinking block sig invalid
|
||||
@@ -165,6 +166,32 @@ _IMAGE_TOO_LARGE_PATTERNS = [
|
||||
# the likely culprit; we still try the shrink path before giving up.
|
||||
]
|
||||
|
||||
# Providers that follow the OpenAI spec strictly require tool message
|
||||
# ``content`` to be a string. Some (Anthropic native, Codex Responses,
|
||||
# Gemini native, first-party OpenAI) extend this to accept a content-parts
|
||||
# list (text + image_url) so screenshots from computer_use survive. Others
|
||||
# (Xiaomi MiMo, some Alibaba endpoints, a long tail of OpenAI-compatible
|
||||
# providers) reject the list with a 400 — the patterns below are the most
|
||||
# common error shapes we see. Recovery: strip image parts from tool
|
||||
# messages in-place, record the (provider, model) for the rest of the
|
||||
# session so we don't waste another call learning the same lesson, retry.
|
||||
#
|
||||
# See: https://github.com/NousResearch/hermes-agent/issues/27344
|
||||
_MULTIMODAL_TOOL_CONTENT_PATTERNS = [
|
||||
# Xiaomi MiMo: {"error":{"code":"400","message":"Param Incorrect","param":"text is not set"}}
|
||||
"text is not set",
|
||||
# Generic "tool message must be string" shapes
|
||||
"tool message content must be a string",
|
||||
"tool content must be a string",
|
||||
"tool message must be a string",
|
||||
# OpenAI-compat servers that reject list-type tool content with a
|
||||
# schema-validation message
|
||||
"expected string, got list",
|
||||
"expected string, got array",
|
||||
# Alibaba/DashScope variant
|
||||
"tool_call.content must be string",
|
||||
]
|
||||
|
||||
# Context overflow patterns
|
||||
_CONTEXT_OVERFLOW_PATTERNS = [
|
||||
"context length",
|
||||
@@ -781,6 +808,19 @@ def _classify_400(
|
||||
) -> ClassifiedError:
|
||||
"""Classify 400 Bad Request — context overflow, format error, or generic."""
|
||||
|
||||
# Multimodal tool content rejected from 400. Must be checked BEFORE
|
||||
# image_too_large because the recovery is different (strip image parts
|
||||
# from tool messages, mark the model as no-list-tool-content for the
|
||||
# rest of the session) and BEFORE context_overflow because some of the
|
||||
# patterns ("text is not set") are ambiguous in isolation but become
|
||||
# specific when combined with a 400 on a request known to contain
|
||||
# multimodal tool content.
|
||||
if any(p in error_msg for p in _MULTIMODAL_TOOL_CONTENT_PATTERNS):
|
||||
return result_fn(
|
||||
FailoverReason.multimodal_tool_content_unsupported,
|
||||
retryable=True,
|
||||
)
|
||||
|
||||
# Image-too-large from 400 (Anthropic's 5 MB per-image check fires this way).
|
||||
# Must be checked BEFORE context_overflow because messages can trip both
|
||||
# patterns ("exceeds" + "image") and image-shrink is a cheaper recovery.
|
||||
@@ -922,6 +962,13 @@ def _classify_by_message(
|
||||
should_compress=True,
|
||||
)
|
||||
|
||||
# Multimodal tool content patterns (from message text when no status_code)
|
||||
if any(p in error_msg for p in _MULTIMODAL_TOOL_CONTENT_PATTERNS):
|
||||
return result_fn(
|
||||
FailoverReason.multimodal_tool_content_unsupported,
|
||||
retryable=True,
|
||||
)
|
||||
|
||||
# Image-too-large patterns (from message text when no status_code)
|
||||
if any(p in error_msg for p in _IMAGE_TOO_LARGE_PATTERNS):
|
||||
return result_fn(
|
||||
|
||||
+38
-1
@@ -1258,6 +1258,10 @@ def build_nous_subscription_prompt(valid_tool_names: "set[str] | None" = None) -
|
||||
"terminal",
|
||||
"process",
|
||||
"execute_code",
|
||||
"app_search_tools",
|
||||
"app_tool_schemas",
|
||||
"app_execute_tools",
|
||||
"app_manage_connections",
|
||||
}
|
||||
|
||||
if valid_names and not (valid_names & relevant_tool_names):
|
||||
@@ -1279,7 +1283,7 @@ def build_nous_subscription_prompt(valid_tool_names: "set[str] | None" = None) -
|
||||
|
||||
lines = [
|
||||
"# Nous Subscription",
|
||||
"Nous subscription includes managed web tools (Firecrawl), image generation (FAL), OpenAI TTS, and browser automation (Browser Use) by default. Modal execution is optional.",
|
||||
"Nous subscription includes managed web tools (Firecrawl), image generation (FAL), OpenAI TTS, browser automation (Browser Use), and app integrations (500+ apps) by default. Modal execution is optional.",
|
||||
"Current capability status:",
|
||||
]
|
||||
lines.extend(_status_line(feature) for feature in features.items())
|
||||
@@ -1294,6 +1298,39 @@ def build_nous_subscription_prompt(valid_tool_names: "set[str] | None" = None) -
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# App tools (500+ external integrations) behavioural prompt
|
||||
# =========================================================================
|
||||
|
||||
_APP_TOOLS_PROMPT = """\
|
||||
## App Tools (500+ External Integrations)
|
||||
|
||||
You have app_search_tools, app_tool_schemas, app_execute_tools, and app_manage_connections available RIGHT NOW as callable tools. They are already configured and connected to the Nous tool gateway — no SDK installation, no API keys, no plugin setup needed. Just call them.
|
||||
|
||||
**When to use:** When a user asks to interact with ANY external app or service — Gmail, Slack, GitHub, Jira, Notion, Google Sheets, Linear, HubSpot, Figma, Salesforce, or any of 500+ other apps. ALWAYS prefer these tools over loading skills about the same service (e.g. do NOT load the 'linear', 'airtable', 'google-workspace', 'notion', or any similar skill — use app_search_tools instead). Do NOT suggest installing SDKs, CLI tools, MCP servers, or API keys for external services — call app_search_tools directly.
|
||||
|
||||
**Workflow:**
|
||||
1. Call app_search_tools with a clear use_case description to discover available tools
|
||||
2. Check the response for connection status — if no active connection, call app_manage_connections and share the auth link with the user
|
||||
3. Review the execution plan and pitfalls in the search response before executing
|
||||
4. If a tool has schemaRef instead of input_schema, call app_tool_schemas to get the full schema
|
||||
5. Execute tools via app_execute_tools with schema-compliant arguments
|
||||
|
||||
**Session tracking:** Pass session: {generate_id: true} on your first app_search_tools call. Reuse the returned session.id in all subsequent calls. Generate a new session when the user pivots to a different task.
|
||||
|
||||
**Important:** Never fabricate tool slugs or argument field names. Only use slugs and schemas returned by app_search_tools or app_tool_schemas."""
|
||||
|
||||
|
||||
def build_app_tools_prompt(valid_tool_names: "set[str] | None" = None) -> str:
    """Return the app-tools guidance text, or ``""`` when it does not apply.

    The guidance is emitted only when ``valid_tool_names`` is non-empty and
    contains ``"app_search_tools"``.
    """
    # No tool names known (None or empty) — stay conservative and skip.
    if not valid_tool_names:
        return ""
    if "app_search_tools" in valid_tool_names:
        return _APP_TOOLS_PROMPT
    return ""
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Context files (SOUL.md, AGENTS.md, .cursorrules)
|
||||
# =========================================================================
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
"""External secret source integrations.
|
||||
|
||||
A secret source is anything that can supply environment-variable-shaped
|
||||
credentials at process startup, _after_ ~/.hermes/.env has loaded. By
|
||||
default sources are non-destructive: they only set values for env vars
|
||||
that aren't already present, so .env and shell exports continue to win.
|
||||
|
||||
Currently shipped:
|
||||
|
||||
- ``bitwarden`` — Bitwarden Secrets Manager (`bws` CLI). See
|
||||
``agent.secret_sources.bitwarden`` for the integration and
|
||||
``hermes_cli.secrets_cli`` for the user-facing setup wizard.
|
||||
"""
|
||||
@@ -0,0 +1,515 @@
|
||||
"""Bitwarden Secrets Manager (`bws` CLI) integration.
|
||||
|
||||
Hermes pulls API keys from Bitwarden Secrets Manager at process startup
|
||||
so they don't have to live in plaintext in ``~/.hermes/.env``.
|
||||
|
||||
Design summary
|
||||
--------------
|
||||
|
||||
* The ``bws`` binary is auto-installed into ``<hermes_home>/bin/bws`` on
|
||||
first use. Hermes pins one version (``_BWS_VERSION``) and downloads
|
||||
the matching asset from the official GitHub Releases page, verifying
|
||||
the SHA-256 against the release's published checksum file.
|
||||
* The access token is stored in ``~/.hermes/.env`` as
|
||||
``BWS_ACCESS_TOKEN`` (or whatever name the user picked in
|
||||
``secrets.bitwarden.access_token_env``). This is the one
|
||||
bootstrap secret — every other provider key can live in Bitwarden.
|
||||
* Pulling secrets is a single ``bws secret list <project_id>
|
||||
--output json`` call. We cache the result in-process for
|
||||
``cache_ttl_seconds`` so back-to-back ``hermes`` invocations don't
|
||||
hammer the API.
|
||||
* Failures NEVER block Hermes startup. Missing binary, no network,
|
||||
expired token, etc. all emit a one-line warning and continue with
|
||||
whatever credentials ``.env`` already had.
|
||||
|
||||
The module is intentionally subprocess-driven rather than going through
|
||||
the ``bitwarden-sdk-secrets`` Python package: one cross-platform binary
|
||||
is easier to lazy-install than a wheels-with-Rust-extension dependency.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import shutil
|
||||
import stat
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
import zipfile
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configuration constants
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Pinned upstream version. Bump in a follow-up PR — never auto-resolve
|
||||
# "latest" because upstream release shape (asset names, CLI flags) is
|
||||
# allowed to change between majors and we want updates to be deliberate.
|
||||
_BWS_VERSION = "2.0.0"
|
||||
|
||||
_BWS_RELEASE_BASE = (
|
||||
f"https://github.com/bitwarden/sdk-sm/releases/download/bws-v{_BWS_VERSION}"
|
||||
)
|
||||
_BWS_CHECKSUM_NAME = f"bws-sha256-checksums-{_BWS_VERSION}.txt"
|
||||
|
||||
# How long to wait for bws subprocesses and HTTP downloads, in seconds.
|
||||
_BWS_DOWNLOAD_TIMEOUT = 60
|
||||
_BWS_RUN_TIMEOUT = 30
|
||||
|
||||
# In-process cache so repeated load_hermes_dotenv() calls (CLI startup,
|
||||
# gateway hot-reload, test suites) don't re-fetch from BSM.
|
||||
_CacheKey = Tuple[str, str] # (access_token_fingerprint, project_id)
|
||||
_CACHE: Dict[_CacheKey, "_CachedFetch"] = {}
|
||||
|
||||
|
||||
@dataclass
|
||||
class _CachedFetch:
|
||||
secrets: Dict[str, str]
|
||||
fetched_at: float
|
||||
|
||||
def is_fresh(self, ttl_seconds: float) -> bool:
|
||||
if ttl_seconds <= 0:
|
||||
return False
|
||||
return (time.time() - self.fetched_at) < ttl_seconds
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public dataclasses
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class FetchResult:
    """What happened during one Bitwarden Secrets Manager pull."""

    # Secrets that were fetched, keyed by name.
    secrets: Dict[str, str] = field(default_factory=dict)
    # Keys that were written into os.environ.
    applied: List[str] = field(default_factory=list)
    # Keys left alone because they were already set and not overridden.
    skipped: List[str] = field(default_factory=list)
    # Non-fatal issues.
    warnings: List[str] = field(default_factory=list)
    # Fatal problem: nothing was fetched. None means success.
    error: Optional[str] = None
    binary_path: Optional[Path] = None

    @property
    def ok(self) -> bool:
        """True when no fatal error was recorded."""
        failed = self.error is not None
        return not failed
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Binary discovery + lazy install
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _hermes_bin_dir() -> Path:
    """Return the ``bin`` directory under the Hermes home directory.

    The home directory is resolved by ``hermes_constants.get_hermes_home``
    on every call; the import stays local to this function.
    """
    from hermes_constants import get_hermes_home

    hermes_home = get_hermes_home()
    return hermes_home / "bin"
|
||||
|
||||
|
||||
def find_bws(*, install_if_missing: bool = False) -> Optional[Path]:
    """Locate a usable ``bws`` executable.

    Lookup order:
      1. the managed copy at ``<hermes_home>/bin/bws`` (preferred), if it
         exists and is executable;
      2. whatever ``shutil.which("bws")`` finds on the system PATH.

    If neither is found and ``install_if_missing`` is true,
    :func:`install_bws` is called to download and verify the pinned
    version. Any exception from that install is logged as a warning and
    ``None`` is returned instead of raising.
    """
    managed_copy = _hermes_bin_dir() / _platform_binary_name()
    if managed_copy.exists() and os.access(managed_copy, os.X_OK):
        return managed_copy

    on_path = shutil.which("bws")
    if on_path:
        return Path(on_path)

    if not install_if_missing:
        return None

    try:
        return install_bws()
    except Exception as exc:  # noqa: BLE001 — never block startup
        logger.warning("bws auto-install failed: %s", exc)
    return None
|
||||
|
||||
|
||||
def _platform_binary_name() -> str:
|
||||
return "bws.exe" if platform.system() == "Windows" else "bws"
|
||||
|
||||
|
||||
def _platform_asset_name() -> str:
    """Map (uname, arch, libc) → the upstream asset filename.

    Asset names follow Rust's target triple convention. Linux defaults
    to gnu (glibc); we switch to musl only if ``ldd --version`` says so.

    Raises:
        RuntimeError: if the operating system is not macOS, Windows or
            Linux, or if the CPU architecture is neither x86_64 nor ARM64.
    """
    system = platform.system()
    machine = platform.machine().lower()

    if system == "Darwin":
        # Universal binary works on both Intel and Apple Silicon — no
        # need to pick a per-arch asset.
        return f"bws-macos-universal-{_BWS_VERSION}.zip"

    unsupported = f"Unsupported platform for bws auto-install: {system} {machine}"
    if system not in ("Windows", "Linux"):
        raise RuntimeError(unsupported)

    if machine in ("arm64", "aarch64"):
        arch = "aarch64"
    elif machine in ("x86_64", "amd64", ""):
        # platform.machine() returns "" when it cannot tell; keep the
        # historical x86_64 default for that case.
        arch = "x86_64"
    else:
        # Previously every other machine (i686, armv7l, riscv64, ...) was
        # treated as x86_64 and got a binary it cannot execute. Fail with
        # a clear message instead.
        raise RuntimeError(unsupported)

    if system == "Windows":
        return f"bws-{arch}-pc-windows-msvc-{_BWS_VERSION}.zip"

    libc = "gnu"
    # ldd --version writes to stderr on glibc, stdout on musl. We
    # don't need bullet-proof detection — getting it wrong falls
    # back to a clear error from the binary loader, which we catch.
    try:
        res = subprocess.run(
            ["ldd", "--version"],
            capture_output=True,
            text=True,
            timeout=2,
        )
        if "musl" in (res.stdout + res.stderr).lower():
            libc = "musl"
    except (OSError, subprocess.TimeoutExpired):
        pass
    return f"bws-{arch}-unknown-linux-{libc}-{_BWS_VERSION}.zip"
|
||||
|
||||
|
||||
def install_bws(*, force: bool = False) -> Path:
    """Download, verify, and install the pinned ``bws`` binary.

    Args:
        force: when false (the default), an already-present managed
            binary is returned without downloading anything.

    Returns:
        The path to the installed executable.

    Raises on any failure (network, checksum, extraction) — callers in
    the auto-install path catch these; the user-facing
    ``hermes secrets bitwarden setup`` surface lets them propagate so the
    wizard can show a clear error.
    """
    bin_dir = _hermes_bin_dir()
    bin_dir.mkdir(parents=True, exist_ok=True)
    target = bin_dir / _platform_binary_name()

    if target.exists() and not force:
        return target

    asset_name = _platform_asset_name()
    asset_url = f"{_BWS_RELEASE_BASE}/{asset_name}"
    checksum_url = f"{_BWS_RELEASE_BASE}/{_BWS_CHECKSUM_NAME}"

    with tempfile.TemporaryDirectory(prefix="hermes-bws-") as tmpdir:
        tmp = Path(tmpdir)
        zip_path = tmp / asset_name
        checksum_path = tmp / _BWS_CHECKSUM_NAME

        logger.info("Downloading %s", asset_url)
        _http_download(asset_url, zip_path)
        _http_download(checksum_url, checksum_path)

        # Refuse to install anything whose digest does not match the
        # published checksum file.
        expected = _expected_sha256(checksum_path, asset_name)
        actual = _sha256_file(zip_path)
        if expected.lower() != actual.lower():
            raise RuntimeError(
                f"Checksum mismatch for {asset_name}: "
                f"expected {expected}, got {actual}"
            )

        with zipfile.ZipFile(zip_path) as zf:
            member = _pick_zip_member(zf, _platform_binary_name())
            zf.extract(member, tmp)
        extracted = tmp / member

        # Move into place atomically. We write to a sibling tempfile in
        # the final directory so the rename can't cross filesystems.
        fd, staged = tempfile.mkstemp(dir=str(bin_dir), prefix=".bws_")
        os.close(fd)
        try:
            shutil.copy2(extracted, staged)
            os.chmod(
                staged,
                stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR
                | stat.S_IRGRP | stat.S_IXGRP
                | stat.S_IROTH | stat.S_IXOTH,
            )
            os.replace(staged, target)
        except BaseException:
            # Do not leave a stray ".bws_*" staging file in the bin
            # directory when copy/chmod/replace fails.
            try:
                os.unlink(staged)
            except OSError:
                pass
            raise

    logger.info("Installed bws %s at %s", _BWS_VERSION, target)
    return target
|
||||
|
||||
|
||||
def _http_download(url: str, dest: Path) -> None:
    """Download *url* into the file *dest*.

    Raises:
        RuntimeError: if the request or the transfer fails. The original
            exception is chained as ``__cause__``.
    """
    req = urllib.request.Request(url, headers={"User-Agent": "hermes-agent"})
    try:
        with urllib.request.urlopen(req, timeout=_BWS_DOWNLOAD_TIMEOUT) as resp:  # noqa: S310
            with open(dest, "wb") as f:
                shutil.copyfileobj(resp, f)
    except OSError as exc:
        # URLError is an OSError subclass, so this still covers it, and it
        # also wraps timeouts / connection resets raised while the body is
        # being read, which are not URLError instances.
        try:
            # Drop any partially written file so it cannot be mistaken for
            # a complete download.
            os.unlink(dest)
        except OSError:
            pass
        raise RuntimeError(f"Failed to download {url}: {exc}") from exc
|
||||
|
||||
|
||||
def _expected_sha256(checksum_file: Path, asset_name: str) -> str:
|
||||
"""Parse the upstream ``bws-sha256-checksums-X.Y.Z.txt`` file.
|
||||
|
||||
Format is the standard ``sha256sum`` output: ``<hex> <filename>``,
|
||||
one per line.
|
||||
"""
|
||||
text = checksum_file.read_text(encoding="utf-8", errors="replace")
|
||||
for line in text.splitlines():
|
||||
parts = line.strip().split()
|
||||
if len(parts) >= 2 and parts[-1] == asset_name:
|
||||
return parts[0]
|
||||
raise RuntimeError(
|
||||
f"No checksum entry for {asset_name} in {checksum_file.name}"
|
||||
)
|
||||
|
||||
|
||||
def _sha256_file(path: Path) -> str:
|
||||
h = hashlib.sha256()
|
||||
with open(path, "rb") as f:
|
||||
for chunk in iter(lambda: f.read(65536), b""):
|
||||
h.update(chunk)
|
||||
return h.hexdigest()
|
||||
|
||||
|
||||
def _pick_zip_member(zf: zipfile.ZipFile, binary_name: str) -> str:
|
||||
"""Find the binary inside the upstream zip.
|
||||
|
||||
Historically the archive has been flat (``bws`` at the root) but we
|
||||
tolerate a top-level directory just in case upstream changes.
|
||||
"""
|
||||
candidates = [n for n in zf.namelist() if n.split("/")[-1] == binary_name]
|
||||
if not candidates:
|
||||
raise RuntimeError(
|
||||
f"Could not find {binary_name} inside downloaded archive "
|
||||
f"(members: {zf.namelist()[:5]}...)"
|
||||
)
|
||||
# Prefer the shortest path (i.e. root over nested) for determinism.
|
||||
candidates.sort(key=len)
|
||||
return candidates[0]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Secret fetch + apply
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _token_fingerprint(token: str) -> str:
|
||||
"""SHA-256 prefix used as a cache key — never logged, never displayed."""
|
||||
return hashlib.sha256(token.encode("utf-8")).hexdigest()[:16]
|
||||
|
||||
|
||||
def fetch_bitwarden_secrets(
    *,
    access_token: str,
    project_id: str,
    binary: Optional[Path] = None,
    cache_ttl_seconds: float = 300,
    use_cache: bool = True,
) -> Tuple[Dict[str, str], List[str]]:
    """Pull the secrets for ``project_id`` from Bitwarden Secrets Manager.

    Args:
        access_token: token passed to ``bws``; must be non-empty.
        project_id: project whose secrets are listed; must be non-empty.
        binary: path to the ``bws`` executable. When omitted it is resolved
            with ``find_bws(install_if_missing=True)``.
        cache_ttl_seconds: how long a previous result for the same
            (token, project) pair may be reused.
        use_cache: set to False to skip the cache lookup. The fresh result
            is still stored in the cache.

    Returns:
        ``(secrets_dict, warnings_list)``. The dict is a private copy:
        changing it does not affect the cached entry. A cache hit returns
        an empty warnings list.

    Raises :class:`RuntimeError` for fatal conditions (missing binary,
    auth failure, unparseable output). Callers in the env_loader path
    catch this and emit a single warning; callers in the user-facing
    setup wizard let it propagate.
    """
    if not access_token:
        raise RuntimeError("Bitwarden access token is empty")
    if not project_id:
        raise RuntimeError("Bitwarden project_id is empty")

    cache_key = (_token_fingerprint(access_token), project_id)
    if use_cache:
        cached = _CACHE.get(cache_key)
        if cached and cached.is_fresh(cache_ttl_seconds):
            # Hand out a copy so callers cannot mutate the cached dict.
            return dict(cached.secrets), []

    bws = binary or find_bws(install_if_missing=True)
    if bws is None:
        raise RuntimeError(
            "bws binary not available — auto-install failed and `bws` is "
            "not on PATH.  Install manually from "
            "https://github.com/bitwarden/sdk-sm/releases or re-run "
            "`hermes secrets bitwarden setup`."
        )

    secrets, warnings = _run_bws_list(bws, access_token, project_id)
    # Store a separate copy for the same reason: the returned dict belongs
    # to the caller, the cached one to this module.
    _CACHE[cache_key] = _CachedFetch(secrets=dict(secrets), fetched_at=time.time())
    return secrets, warnings
|
||||
|
||||
|
||||
def _run_bws_list(
    bws: Path, access_token: str, project_id: str
) -> Tuple[Dict[str, str], List[str]]:
    """Run ``bws secret list <project_id> --output json`` and parse the result.

    The access token is handed to the subprocess through the
    ``BWS_ACCESS_TOKEN`` environment variable, not on the command line.

    Returns:
        ``(secrets, warnings)``. ``secrets`` maps each entry's ``key`` to
        its ``value``. Entries that are not dicts, or whose key/value are
        not strings, are dropped silently; entries whose key is not a valid
        env-var name are dropped with a warning. Empty output yields an
        empty dict and one warning.

    Raises:
        RuntimeError: on timeout, failure to start the process, a non-zero
            exit status, non-JSON output, or JSON that is not a list.
    """
    cmd = [str(bws), "secret", "list", project_id, "--output", "json"]
    env = os.environ.copy()
    env["BWS_ACCESS_TOKEN"] = access_token
    # Make sure we're not echoing telemetry / colour codes into json.
    env.setdefault("NO_COLOR", "1")

    try:
        proc = subprocess.run(  # noqa: S603 — bws path is trusted
            cmd,
            env=env,
            capture_output=True,
            text=True,
            timeout=_BWS_RUN_TIMEOUT,
        )
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError(
            f"bws timed out after {_BWS_RUN_TIMEOUT}s fetching secrets"
        ) from exc
    except OSError as exc:
        raise RuntimeError(f"failed to invoke bws: {exc}") from exc

    if proc.returncode != 0:
        # Imported locally so this block does not rely on a module-level
        # ``re`` import.
        import re

        # bws writes auth/network errors to stderr in plain English.
        # Remove complete ANSI CSI sequences (e.g. "\x1b[31m"), not just the
        # ESC byte, so no "[31m" residue ends up in the message; then drop
        # any stray ESC and surface the first 200 chars.
        err = proc.stderr or proc.stdout or ""
        err = re.sub(r"\x1b\[[0-?]*[ -/]*[@-~]", "", err)
        err = err.replace("\x1b", "").strip()
        raise RuntimeError(
            f"bws exited {proc.returncode}: {err[:200]}"
        )

    raw = proc.stdout.strip()
    if not raw:
        return {}, ["bws returned no output (empty project?)"]

    try:
        payload = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"bws returned non-JSON output: {exc}") from exc

    if not isinstance(payload, list):
        raise RuntimeError(
            f"bws returned unexpected shape: {type(payload).__name__}"
        )

    secrets: Dict[str, str] = {}
    warnings: List[str] = []
    for item in payload:
        if not isinstance(item, dict):
            continue
        key = item.get("key")
        value = item.get("value")
        if not isinstance(key, str) or not isinstance(value, str):
            continue
        if not _is_valid_env_name(key):
            warnings.append(
                f"Skipping secret {key!r}: not a valid env-var name"
            )
            continue
        secrets[key] = value
    return secrets, warnings
|
||||
|
||||
|
||||
def _is_valid_env_name(name: str) -> bool:
|
||||
if not name:
|
||||
return False
|
||||
if not (name[0].isalpha() or name[0] == "_"):
|
||||
return False
|
||||
return all(c.isalnum() or c == "_" for c in name)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public entry point — called from hermes_cli.env_loader
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def apply_bitwarden_secrets(
    *,
    enabled: bool,
    access_token_env: str = "BWS_ACCESS_TOKEN",
    project_id: str = "",
    override_existing: bool = False,
    cache_ttl_seconds: float = 300,
    auto_install: bool = True,
) -> FetchResult:
    """Pull secrets from BSM and set them on ``os.environ``.

    This is the function ``load_hermes_dotenv()`` calls after the .env
    files have loaded. It is intentionally defensive — any failure
    returns a :class:`FetchResult` with ``error`` set; it never raises.

    Parameters mirror the ``secrets.bitwarden.*`` config keys so the
    caller can just splat the dict in.

    Args:
        enabled: when false, nothing is done and an empty result is returned.
        access_token_env: name of the env var holding the access token.
        project_id: project whose secrets are pulled.
        override_existing: when false, env vars that already hold a
            non-empty value are left untouched and reported as skipped.
        cache_ttl_seconds: forwarded to :func:`fetch_bitwarden_secrets`.
        auto_install: whether a missing ``bws`` binary may be downloaded.

    Returns:
        A :class:`FetchResult` listing fetched, applied and skipped keys,
        plus warnings and, on failure, ``error``.
    """
    result = FetchResult()

    if not enabled:
        return result

    access_token = os.environ.get(access_token_env, "").strip()
    if not access_token:
        result.error = (
            f"secrets.bitwarden.enabled is true but {access_token_env} is "
            "not set.  Run `hermes secrets bitwarden setup`."
        )
        return result

    if not project_id:
        result.error = (
            "secrets.bitwarden.project_id is empty.  "
            "Run `hermes secrets bitwarden setup`."
        )
        return result

    binary = find_bws(install_if_missing=auto_install)
    result.binary_path = binary
    if binary is None:
        if auto_install:
            # The install was attempted and failed, so do not claim that
            # auto-install is disabled.
            result.error = (
                "bws binary not available and auto-install failed.  "
                "Run `hermes secrets bitwarden setup` to install."
            )
        else:
            result.error = (
                "bws binary not available and auto-install is disabled.  "
                "Run `hermes secrets bitwarden setup` to install."
            )
        return result

    try:
        secrets, warnings = fetch_bitwarden_secrets(
            access_token=access_token,
            project_id=project_id,
            binary=binary,
            cache_ttl_seconds=cache_ttl_seconds,
        )
    except Exception as exc:  # noqa: BLE001 — never block startup
        # RuntimeError is the documented failure, but anything else must
        # not escape either, to keep the "never raises" promise.
        result.error = str(exc)
        return result

    result.secrets = secrets
    result.warnings.extend(warnings)

    for key, value in secrets.items():
        if key == access_token_env:
            # Don't let BSM clobber the very token we used to fetch
            # itself — that would be a footgun if someone stored the
            # token as a BSM secret too.
            result.skipped.append(key)
            continue
        if not override_existing and os.environ.get(key):
            result.skipped.append(key)
            continue
        try:
            os.environ[key] = value
        except (ValueError, OSError):
            # The environment rejected the assignment. Report it without
            # echoing the value and carry on with the remaining secrets.
            result.warnings.append(
                f"Skipping secret {key!r}: could not be set in the environment"
            )
            result.skipped.append(key)
            continue
        result.applied.append(key)

    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test hook — used by hermetic tests to flush the cache between cases.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _reset_cache_for_tests() -> None:
    """Empty the in-process secrets cache (test hook)."""
    # Cleared in place, so the module-level dict object stays the same.
    _CACHE.clear()
    return None
|
||||
+58
-3
@@ -12,7 +12,7 @@ import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple
|
||||
|
||||
from hermes_constants import get_config_path, get_skills_dir
|
||||
from hermes_constants import get_config_path, get_skills_dir, is_termux
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -24,7 +24,43 @@ PLATFORM_MAP = {
|
||||
"windows": "win32",
|
||||
}
|
||||
|
||||
EXCLUDED_SKILL_DIRS = frozenset((".git", ".github", ".hub", ".archive"))
|
||||
EXCLUDED_SKILL_DIRS = frozenset(
|
||||
(
|
||||
".git",
|
||||
".github",
|
||||
".hub",
|
||||
".archive",
|
||||
".venv",
|
||||
"venv",
|
||||
"node_modules",
|
||||
"site-packages",
|
||||
"__pycache__",
|
||||
".tox",
|
||||
".nox",
|
||||
".pytest_cache",
|
||||
".mypy_cache",
|
||||
".ruff_cache",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def is_excluded_skill_path(path) -> bool:
    """Report whether any component of *path* is in EXCLUDED_SKILL_DIRS.

    Intended for every SKILL.md path produced by ``rglob`` so that
    dependency, virtualenv, VCS, and cache directories are pruned. Keeping
    the check in one place keeps every skill-scanning site in sync with the
    shared exclusion set.

    *path* may be a Path or a string: anything without a ``parts``
    attribute is converted with ``PurePath(str(path))`` first.
    """
    missing = object()
    components = getattr(path, "parts", missing)
    if components is missing:
        from pathlib import PurePath

        components = PurePath(str(path)).parts

    for component in components:
        if component in EXCLUDED_SKILL_DIRS:
            return True
    return False
|
||||
|
||||
|
||||
# ── Lazy YAML loader ─────────────────────────────────────────────────────
|
||||
|
||||
@@ -100,6 +136,14 @@ def skill_matches_platform(frontmatter: Dict[str, Any]) -> bool:
|
||||
|
||||
If the field is absent or empty the skill is compatible with **all**
|
||||
platforms (backward-compatible default).
|
||||
|
||||
Termux note: on Termux/Android, ``sys.platform`` is ``"linux"`` on
|
||||
older Pythons but became ``"android"`` on Python 3.13+. Termux is a
|
||||
Linux userland riding on the Android kernel, so skills tagged
|
||||
``linux`` are treated as compatible in Termux regardless of which
|
||||
``sys.platform`` value Python reports. Individual Linux commands
|
||||
inside a skill may still misbehave (no systemd, BusyBox utils, no
|
||||
apt/dnf, etc.) but that is on the skill, not on platform gating.
|
||||
"""
|
||||
platforms = frontmatter.get("platforms")
|
||||
if not platforms:
|
||||
@@ -107,11 +151,21 @@ def skill_matches_platform(frontmatter: Dict[str, Any]) -> bool:
|
||||
if not isinstance(platforms, list):
|
||||
platforms = [platforms]
|
||||
current = sys.platform
|
||||
running_in_termux = is_termux()
|
||||
for platform in platforms:
|
||||
normalized = str(platform).lower().strip()
|
||||
mapped = PLATFORM_MAP.get(normalized, normalized)
|
||||
if current.startswith(mapped):
|
||||
return True
|
||||
# Termux runs a Linux userland on Android. Accept linux-tagged
|
||||
# skills regardless of whether sys.platform is "linux" (pre-3.13
|
||||
# Termux) or "android" (Python 3.13+ Termux, and any other
|
||||
# Android runtime).
|
||||
if running_in_termux and mapped == "linux":
|
||||
return True
|
||||
# Explicit termux/android tags match a Termux session too.
|
||||
if running_in_termux and mapped in ("termux", "android"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
@@ -478,7 +532,8 @@ def extract_skill_description(frontmatter: Dict[str, Any]) -> str:
|
||||
def iter_skill_index_files(skills_dir: Path, filename: str):
|
||||
"""Walk skills_dir yielding sorted paths matching *filename*.
|
||||
|
||||
Excludes ``.git``, ``.github``, ``.hub``, ``.archive`` directories.
|
||||
Excludes Hermes metadata, VCS, virtualenv/dependency, and cache
|
||||
directories so dependencies cannot register nested skills.
|
||||
"""
|
||||
matches = []
|
||||
for root, dirs, files in os.walk(skills_dir, followlinks=True):
|
||||
|
||||
@@ -130,6 +130,12 @@ def build_system_prompt_parts(agent: Any, system_message: Optional[str] = None)
|
||||
nous_subscription_prompt = _r.build_nous_subscription_prompt(agent.valid_tool_names)
|
||||
if nous_subscription_prompt:
|
||||
stable_parts.append(nous_subscription_prompt)
|
||||
|
||||
# App tools (500+ external integrations) behavioural guidance
|
||||
app_tools_prompt = _r.build_app_tools_prompt(agent.valid_tool_names)
|
||||
if app_tools_prompt:
|
||||
stable_parts.append(app_tools_prompt)
|
||||
|
||||
# Tool-use enforcement: tells the model to actually call tools instead
|
||||
# of describing intended actions. Controlled by config.yaml
|
||||
# agent.tool_use_enforcement:
|
||||
|
||||
@@ -10221,6 +10221,7 @@ class HermesCLI:
|
||||
self._voice_processing = True
|
||||
|
||||
submitted = False
|
||||
transcription_failed = False
|
||||
wav_path = None
|
||||
try:
|
||||
if self._voice_recorder is None:
|
||||
@@ -10269,18 +10270,24 @@ class HermesCLI:
|
||||
else:
|
||||
error = result.get("error", "Unknown error")
|
||||
_cprint(f"\n{_DIM}Transcription failed: {error}{_RST}")
|
||||
transcription_failed = True
|
||||
|
||||
except Exception as e:
|
||||
_cprint(f"\n{_DIM}Voice processing error: {e}{_RST}")
|
||||
transcription_failed = wav_path is not None
|
||||
finally:
|
||||
with self._voice_lock:
|
||||
self._voice_processing = False
|
||||
if hasattr(self, '_app') and self._app:
|
||||
self._app.invalidate()
|
||||
# Clean up temp file
|
||||
# Clean up temp file unless transcription failed. On failure, keep
|
||||
# the source recording so long dictation is not lost.
|
||||
try:
|
||||
if wav_path and os.path.isfile(wav_path):
|
||||
os.unlink(wav_path)
|
||||
if transcription_failed:
|
||||
_cprint(f"{_DIM}Recording preserved at: {wav_path}{_RST}")
|
||||
else:
|
||||
os.unlink(wav_path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
+95
-20
@@ -18,6 +18,7 @@ Security features (based on OWASP + NIST SP 800-63-4 guidance):
|
||||
Storage: ~/.hermes/pairing/
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import secrets
|
||||
@@ -148,6 +149,11 @@ class PairingStore:
|
||||
|
||||
# ----- Pending codes -----
|
||||
|
||||
@staticmethod
|
||||
def _hash_code(code: str, salt: bytes) -> str:
|
||||
"""Hash a pairing code with the given salt using SHA-256."""
|
||||
return hashlib.sha256(salt + code.encode("utf-8")).hexdigest()
|
||||
|
||||
def generate_code(
|
||||
self, platform: str, user_id: str, user_name: str = ""
|
||||
) -> Optional[str]:
|
||||
@@ -158,6 +164,9 @@ class PairingStore:
|
||||
- User is rate-limited (too recent request)
|
||||
- Max pending codes reached for this platform
|
||||
- User/platform is in lockout due to failed attempts
|
||||
|
||||
The code is NOT stored in plaintext. Only a salted SHA-256 hash is
|
||||
persisted so that reading the pending file does not reveal codes.
|
||||
"""
|
||||
with self._lock:
|
||||
self._cleanup_expired(platform)
|
||||
@@ -178,8 +187,17 @@ class PairingStore:
|
||||
# Generate cryptographically random code
|
||||
code = "".join(secrets.choice(ALPHABET) for _ in range(CODE_LENGTH))
|
||||
|
||||
# Store pending request
|
||||
pending[code] = {
|
||||
# Hash the code with a random salt before storing
|
||||
salt = os.urandom(16)
|
||||
code_hash = self._hash_code(code, salt)
|
||||
|
||||
# Use a unique entry id as the key (not the code itself)
|
||||
entry_id = secrets.token_hex(8)
|
||||
|
||||
# Store pending request with hashed code
|
||||
pending[entry_id] = {
|
||||
"hash": code_hash,
|
||||
"salt": salt.hex(),
|
||||
"user_id": user_id,
|
||||
"user_name": user_name,
|
||||
"created_at": time.time(),
|
||||
@@ -195,10 +213,16 @@ class PairingStore:
|
||||
"""
|
||||
Approve a pairing code. Adds the user to the approved list.
|
||||
|
||||
Returns {user_id, user_name} on success, None if code is
|
||||
Returns ``{user_id, user_name}`` on success, ``None`` if the code is
|
||||
invalid/expired OR the platform is currently locked out after
|
||||
``MAX_FAILED_ATTEMPTS`` failed approvals (#10195). Callers can
|
||||
disambiguate with ``_is_locked_out(platform)``.
|
||||
|
||||
Verification: the user-provided code is hashed with each stored
|
||||
entry's salt and compared to the stored hash using constant-time
|
||||
comparison. Pre-hash entries (legacy plaintext-key format from
|
||||
pre-upgrade pending.json files) are silently ignored — they get
|
||||
pruned at TTL by ``_cleanup_expired``.
|
||||
"""
|
||||
with self._lock:
|
||||
self._cleanup_expired(platform)
|
||||
@@ -213,34 +237,73 @@ class PairingStore:
|
||||
return None
|
||||
|
||||
pending = self._load_json(self._pending_path(platform))
|
||||
if code not in pending:
|
||||
|
||||
# Find the entry whose hash matches the provided code.
|
||||
# Tolerate legacy plaintext-key entries (no salt/hash) and
|
||||
# malformed entries — skip them rather than KeyError, so an
|
||||
# in-place upgrade across an existing pending.json doesn't
|
||||
# crash on the first approve call. Legacy entries get pruned
|
||||
# at their TTL by _cleanup_expired.
|
||||
matched_key = None
|
||||
matched_entry = None
|
||||
for entry_id, entry in pending.items():
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
if "salt" not in entry or "hash" not in entry:
|
||||
continue
|
||||
try:
|
||||
salt = bytes.fromhex(entry["salt"])
|
||||
except ValueError:
|
||||
continue
|
||||
candidate_hash = self._hash_code(code, salt)
|
||||
if secrets.compare_digest(candidate_hash, entry["hash"]):
|
||||
matched_key = entry_id
|
||||
matched_entry = entry
|
||||
break
|
||||
|
||||
if matched_key is None:
|
||||
self._record_failed_attempt(platform)
|
||||
return None
|
||||
|
||||
entry = pending.pop(code)
|
||||
del pending[matched_key]
|
||||
self._save_json(self._pending_path(platform), pending)
|
||||
|
||||
# Add to approved list
|
||||
self._approve_user(platform, entry["user_id"], entry.get("user_name", ""))
|
||||
self._approve_user(platform, matched_entry["user_id"],
|
||||
matched_entry.get("user_name", ""))
|
||||
|
||||
return {
|
||||
"user_id": entry["user_id"],
|
||||
"user_name": entry.get("user_name", ""),
|
||||
"user_id": matched_entry["user_id"],
|
||||
"user_name": matched_entry.get("user_name", ""),
|
||||
}
|
||||
|
||||
def list_pending(self, platform: str = None) -> list:
|
||||
"""List pending pairing requests, optionally filtered by platform."""
|
||||
"""List pending pairing requests, optionally filtered by platform.
|
||||
|
||||
Codes are stored hashed — the ``code`` field is replaced with the
|
||||
first 8 hex characters of the hash so admins can distinguish entries
|
||||
without revealing the original code. Legacy plaintext-key entries
|
||||
(pre-hash format) are shown with a "legacy" placeholder so admins
|
||||
can see them age out without crashing on a missing ``hash`` field.
|
||||
"""
|
||||
results = []
|
||||
platforms = [platform] if platform else self._all_platforms("pending")
|
||||
for p in platforms:
|
||||
self._cleanup_expired(p)
|
||||
pending = self._load_json(self._pending_path(p))
|
||||
for code, info in pending.items():
|
||||
age_min = int((time.time() - info["created_at"]) / 60)
|
||||
for entry_id, info in pending.items():
|
||||
if not isinstance(info, dict):
|
||||
continue
|
||||
created_at = info.get("created_at")
|
||||
if not isinstance(created_at, (int, float)):
|
||||
continue
|
||||
age_min = int((time.time() - created_at) / 60)
|
||||
hash_val = info.get("hash")
|
||||
code_display = hash_val[:8] if isinstance(hash_val, str) else "legacy"
|
||||
results.append({
|
||||
"platform": p,
|
||||
"code": code,
|
||||
"user_id": info["user_id"],
|
||||
"code": code_display,
|
||||
"user_id": info.get("user_id", ""),
|
||||
"user_name": info.get("user_name", ""),
|
||||
"age_minutes": age_min,
|
||||
})
|
||||
@@ -297,17 +360,29 @@ class PairingStore:
|
||||
# ----- Cleanup -----
|
||||
|
||||
def _cleanup_expired(self, platform: str) -> None:
|
||||
"""Remove expired pending codes."""
|
||||
"""Remove expired pending codes.
|
||||
|
||||
Tolerant of malformed / legacy entries — anything without a numeric
|
||||
``created_at`` is treated as expired (it's effectively unusable
|
||||
with the new hash-keyed schema anyway).
|
||||
"""
|
||||
path = self._pending_path(platform)
|
||||
pending = self._load_json(path)
|
||||
now = time.time()
|
||||
expired = [
|
||||
code for code, info in pending.items()
|
||||
if (now - info["created_at"]) > CODE_TTL_SECONDS
|
||||
]
|
||||
expired = []
|
||||
for entry_id, info in pending.items():
|
||||
if not isinstance(info, dict):
|
||||
expired.append(entry_id)
|
||||
continue
|
||||
created_at = info.get("created_at")
|
||||
if not isinstance(created_at, (int, float)):
|
||||
expired.append(entry_id)
|
||||
continue
|
||||
if (now - created_at) > CODE_TTL_SECONDS:
|
||||
expired.append(entry_id)
|
||||
if expired:
|
||||
for code in expired:
|
||||
del pending[code]
|
||||
for entry_id in expired:
|
||||
del pending[entry_id]
|
||||
self._save_json(path, pending)
|
||||
|
||||
def _all_platforms(self, suffix: str) -> list:
|
||||
|
||||
@@ -308,11 +308,26 @@ class WebhookAdapter(BasePlatformAdapter):
|
||||
data = json.loads(subs_path.read_text(encoding="utf-8"))
|
||||
if not isinstance(data, dict):
|
||||
return
|
||||
# Merge: static routes take precedence over dynamic ones
|
||||
self._dynamic_routes = {
|
||||
k: v for k, v in data.items()
|
||||
if k not in self._static_routes
|
||||
}
|
||||
# Merge: static routes take precedence over dynamic ones.
|
||||
# Reject any dynamic route whose effective secret is empty —
|
||||
# an empty secret would cause _handle_webhook to skip HMAC
|
||||
# validation entirely, letting unauthenticated callers in.
|
||||
new_dynamic: Dict[str, dict] = {}
|
||||
for k, v in data.items():
|
||||
if k in self._static_routes:
|
||||
continue
|
||||
effective_secret = v.get("secret", self._global_secret)
|
||||
if not effective_secret:
|
||||
logger.warning(
|
||||
"[webhook] Dynamic route '%s' skipped: 'secret' is "
|
||||
"missing or empty. Set a valid HMAC secret, or use "
|
||||
"'%s' to explicitly disable auth (testing only).",
|
||||
k,
|
||||
_INSECURE_NO_AUTH,
|
||||
)
|
||||
continue
|
||||
new_dynamic[k] = v
|
||||
self._dynamic_routes = new_dynamic
|
||||
self._routes = {**self._dynamic_routes, **self._static_routes}
|
||||
self._dynamic_routes_mtime = mtime
|
||||
logger.info(
|
||||
|
||||
+4
-2
@@ -1109,7 +1109,7 @@ def _check_unavailable_skill(command_name: str) -> str | None:
|
||||
normalized = command_name.lower().replace("_", "-")
|
||||
try:
|
||||
from tools.skills_tool import _get_disabled_skill_names
|
||||
from agent.skill_utils import get_all_skills_dirs
|
||||
from agent.skill_utils import get_all_skills_dirs, is_excluded_skill_path
|
||||
disabled = _get_disabled_skill_names()
|
||||
|
||||
# Check disabled skills across all dirs (local + external)
|
||||
@@ -1117,7 +1117,7 @@ def _check_unavailable_skill(command_name: str) -> str | None:
|
||||
if not skills_dir.exists():
|
||||
continue
|
||||
for skill_md in skills_dir.rglob("SKILL.md"):
|
||||
if any(part in {'.git', '.github', '.hub', '.archive'} for part in skill_md.parts):
|
||||
if is_excluded_skill_path(skill_md):
|
||||
continue
|
||||
slug, declared_name = _skill_slug_from_frontmatter(skill_md)
|
||||
if not slug or not declared_name:
|
||||
@@ -1136,6 +1136,8 @@ def _check_unavailable_skill(command_name: str) -> str | None:
|
||||
optional_dir = get_optional_skills_dir(repo_root / "optional-skills")
|
||||
if optional_dir.exists():
|
||||
for skill_md in optional_dir.rglob("SKILL.md"):
|
||||
if is_excluded_skill_path(skill_md):
|
||||
continue
|
||||
slug, _declared = _skill_slug_from_frontmatter(skill_md)
|
||||
if not slug:
|
||||
continue
|
||||
|
||||
+84
-4
@@ -1747,8 +1747,48 @@ DEFAULT_CONFIG = {
|
||||
"retries": 2,
|
||||
},
|
||||
|
||||
# =========================================================================
|
||||
# External secret sources
|
||||
# =========================================================================
|
||||
# Pull credentials from external secret managers at process startup
|
||||
# rather than storing them in ~/.hermes/.env.
|
||||
"secrets": {
|
||||
"bitwarden": {
|
||||
# Master switch. When false, BSM is never contacted and the
|
||||
# bws binary is never auto-installed — same as not having
|
||||
# this section at all.
|
||||
"enabled": False,
|
||||
# Name of the env var that holds the Bitwarden machine-account
|
||||
# access token. This is the one bootstrap secret; it lives
|
||||
# in ~/.hermes/.env (or your shell) and never in config.yaml.
|
||||
"access_token_env": "BWS_ACCESS_TOKEN",
|
||||
# UUID of the BSM project to sync from.
|
||||
"project_id": "",
|
||||
# Seconds to cache fetched secrets in-process. 0 disables.
|
||||
"cache_ttl_seconds": 300,
|
||||
# When True, BSM values overwrite existing env vars. Default
|
||||
# True because the point of using BSM is centralized rotation —
|
||||
# if .env had the final say, rotating in Bitwarden wouldn't
|
||||
# take effect until you also cleared the matching .env line.
|
||||
"override_existing": True,
|
||||
# When True, the bws binary is auto-downloaded into
|
||||
# ~/.hermes/bin/ on first use. When False you must install
|
||||
# bws yourself and have it on PATH.
|
||||
"auto_install": True,
|
||||
},
|
||||
},
|
||||
|
||||
# ── Nous Portal feature flags ──────────────────────────────────────
|
||||
"portal": {
|
||||
# App tools: 500+ external app integrations (Gmail, Slack, GitHub,
|
||||
# Notion, etc.) via the Nous tool gateway. Requires an active Nous
|
||||
# subscription. Set to False to hide the app_tools toolset even
|
||||
# when a subscription is present.
|
||||
"app_tools": True,
|
||||
},
|
||||
|
||||
# Config schema version - bump this when adding new required fields
|
||||
"_config_version": 23,
|
||||
"_config_version": 24,
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
@@ -2236,6 +2276,22 @@ OPTIONAL_ENV_VARS = {
|
||||
"category": "tool",
|
||||
"advanced": True,
|
||||
},
|
||||
"TOOLS_GATEWAY_URL": {
|
||||
"description": "Explicit URL for the tools-gateway (app integrations). Overrides the auto-derived tools-gateway.nousresearch.com",
|
||||
"prompt": "Tools-gateway URL",
|
||||
"url": None,
|
||||
"password": False,
|
||||
"category": "tool",
|
||||
"advanced": True,
|
||||
},
|
||||
"PORTAL_APP_TOOLS": {
|
||||
"description": "Enable app integration tools (500+ apps via Nous tool gateway). Requires Nous subscription.",
|
||||
"prompt": "Enable app tools (500+ apps)",
|
||||
"url": None,
|
||||
"password": False,
|
||||
"category": "tool",
|
||||
"advanced": True,
|
||||
},
|
||||
"TAVILY_API_KEY": {
|
||||
"description": "Tavily API key for AI-native web search, extract, and crawl",
|
||||
"prompt": "Tavily API key",
|
||||
@@ -3017,7 +3073,7 @@ def _normalize_custom_provider_entry(
|
||||
"api_mode", "transport", "model", "default_model", "models",
|
||||
"context_length", "rate_limit_delay",
|
||||
"request_timeout_seconds", "stale_timeout_seconds",
|
||||
"discover_models",
|
||||
"discover_models", "extra_body",
|
||||
}
|
||||
for camel, snake in _CAMEL_ALIASES.items():
|
||||
if camel in entry and snake not in entry:
|
||||
@@ -3112,6 +3168,10 @@ def _normalize_custom_provider_entry(
|
||||
if isinstance(discover_models, bool):
|
||||
normalized["discover_models"] = discover_models
|
||||
|
||||
extra_body = entry.get("extra_body")
|
||||
if isinstance(extra_body, dict):
|
||||
normalized["extra_body"] = dict(extra_body)
|
||||
|
||||
return normalized
|
||||
|
||||
|
||||
@@ -3266,13 +3326,13 @@ _KNOWN_ROOT_KEYS = {
|
||||
"fallback_providers", "credential_pool_strategies", "toolsets",
|
||||
"agent", "terminal", "display", "compression", "delegation",
|
||||
"auxiliary", "custom_providers", "context", "memory", "gateway",
|
||||
"sessions",
|
||||
"sessions", "portal",
|
||||
}
|
||||
|
||||
# Valid fields inside a custom_providers list entry
|
||||
_VALID_CUSTOM_PROVIDER_FIELDS = {
|
||||
"name", "base_url", "api_key", "api_mode", "model", "models",
|
||||
"context_length", "rate_limit_delay",
|
||||
"context_length", "rate_limit_delay", "extra_body",
|
||||
# key_env is read at runtime by runtime_provider.py and auxiliary_client.py
|
||||
# — include it here so the set accurately describes the supported schema.
|
||||
"key_env",
|
||||
@@ -3929,6 +3989,26 @@ def migrate_config(interactive: bool = True, quiet: bool = False) -> Dict[str, A
|
||||
f"{', '.join(added_aux)}"
|
||||
)
|
||||
|
||||
# ── Version 23 → 24: inject app_tools into saved platform_toolsets ──
|
||||
# The portal.app_tools config flag is handled by deep-merge (DEFAULT_CONFIG
|
||||
# has it, so load_config() always includes it). But platform_toolsets are
|
||||
# user-owned lists that deep-merge can't append to — existing users who
|
||||
# ran `hermes tools` have a saved list that won't include app_tools.
|
||||
if current_ver < 24:
|
||||
config = read_raw_config()
|
||||
pt = config.get("platform_toolsets")
|
||||
if isinstance(pt, dict):
|
||||
patched = False
|
||||
for plat_key, ts_list in pt.items():
|
||||
if isinstance(ts_list, list) and "app_tools" not in ts_list:
|
||||
ts_list.append("app_tools")
|
||||
patched = True
|
||||
if patched:
|
||||
save_config(config)
|
||||
results["config_added"].append("app_tools added to platform_toolsets")
|
||||
if not quiet:
|
||||
print(" ✓ Added app_tools to saved platform toolset lists")
|
||||
|
||||
if current_ver < latest_ver and not quiet:
|
||||
print(f"Config version: {current_ver} → {latest_ver}")
|
||||
|
||||
|
||||
@@ -71,7 +71,7 @@ def curses_checklist(
|
||||
curses.use_default_colors()
|
||||
curses.init_pair(1, curses.COLOR_GREEN, -1)
|
||||
curses.init_pair(2, curses.COLOR_YELLOW, -1)
|
||||
curses.init_pair(3, 8, -1) # dim gray
|
||||
curses.init_pair(3, 8 if curses.COLORS > 8 else curses.COLOR_WHITE, -1) # dim gray
|
||||
cursor = 0
|
||||
scroll_offset = 0
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ from pathlib import Path
|
||||
from hermes_cli.config import get_hermes_home, get_env_path, get_project_root, load_config
|
||||
from hermes_cli.env_loader import load_hermes_dotenv
|
||||
from hermes_constants import display_hermes_home
|
||||
from agent.skill_utils import is_excluded_skill_path
|
||||
|
||||
|
||||
def _get_git_commit(project_root: Path) -> str:
|
||||
@@ -69,6 +70,8 @@ def _count_skills(hermes_home: Path) -> int:
|
||||
return 0
|
||||
count = 0
|
||||
for item in skills_dir.rglob("SKILL.md"):
|
||||
if is_excluded_skill_path(item):
|
||||
continue
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
@@ -21,6 +21,44 @@ _CREDENTIAL_SUFFIXES = ("_API_KEY", "_TOKEN", "_SECRET", "_KEY")
|
||||
# tests) don't spam the same warning multiple times.
|
||||
_WARNED_KEYS: set[str] = set()
|
||||
|
||||
# Map of env-var name → source label ("bitwarden", etc.) for credentials
|
||||
# that were injected by an external secret source during load_hermes_dotenv().
|
||||
# Used by setup / `hermes model` flows to label detected credentials so
|
||||
# users understand WHERE a key came from when their .env doesn't contain it
|
||||
# directly (otherwise the "credentials detected ✓" line looks identical to
|
||||
# the .env case and they don't know Bitwarden is wired up).
|
||||
_SECRET_SOURCES: dict[str, str] = {}
|
||||
|
||||
|
||||
def get_secret_source(env_var: str) -> str | None:
|
||||
"""Return the label of the secret source that supplied ``env_var``, if any.
|
||||
|
||||
Returns ``"bitwarden"`` for keys pulled from Bitwarden Secrets Manager
|
||||
during the current process's ``load_hermes_dotenv()`` call. Returns
|
||||
``None`` for keys that came from ``.env``, the shell environment, or
|
||||
aren't tracked.
|
||||
"""
|
||||
return _SECRET_SOURCES.get(env_var)
|
||||
|
||||
|
||||
def format_secret_source_suffix(env_var: str) -> str:
    """Return a human-readable suffix like ``" (from Bitwarden)"`` or ``""``.

    Use this when printing a detected credential so the user can see where
    it came from.

    Args:
        env_var: Environment variable name of the credential being printed.

    Returns:
        ``""`` when no source is recorded for ``env_var`` (the implicit
        ``.env`` / shell cases), ``" (from Bitwarden)"`` for the
        ``"bitwarden"`` label, and ``" (from <label>)"`` for any other label.
    """
    source = get_secret_source(env_var)
    if not source:
        return ""
    if source == "bitwarden":
        return " (from Bitwarden)"
    # Generic fallback so additional secret sources can be labelled
    # without having to update every call site.
    return f" (from {source})"
|
||||
|
||||
|
||||
def _format_offending_chars(value: str, limit: int = 3) -> str:
|
||||
"""Return a compact 'U+XXXX ('c'), ...' summary of non-ASCII codepoints."""
|
||||
@@ -172,4 +210,87 @@ def load_hermes_dotenv(
|
||||
_load_dotenv_with_fallback(project_env_path, override=not loaded)
|
||||
loaded.append(project_env_path)
|
||||
|
||||
_apply_external_secret_sources(home_path)
|
||||
|
||||
return loaded
|
||||
|
||||
|
||||
def _apply_external_secret_sources(home_path: Path) -> None:
|
||||
"""Pull secrets from external sources (currently Bitwarden) into env.
|
||||
|
||||
Runs AFTER dotenv loads so .env values are visible (we use them to
|
||||
locate the access token) but BEFORE the rest of Hermes reads
|
||||
``os.environ`` for credentials. Any failure here is logged and
|
||||
swallowed — external secret sources must never block startup.
|
||||
"""
|
||||
try:
|
||||
cfg = _load_secrets_config(home_path)
|
||||
except Exception: # noqa: BLE001 — config errors must not block startup
|
||||
return
|
||||
|
||||
bw_cfg = (cfg or {}).get("bitwarden") or {}
|
||||
if not bw_cfg.get("enabled"):
|
||||
return
|
||||
|
||||
try:
|
||||
from agent.secret_sources.bitwarden import apply_bitwarden_secrets
|
||||
except ImportError:
|
||||
return
|
||||
|
||||
result = apply_bitwarden_secrets(
|
||||
enabled=True,
|
||||
access_token_env=bw_cfg.get("access_token_env", "BWS_ACCESS_TOKEN"),
|
||||
project_id=bw_cfg.get("project_id", ""),
|
||||
override_existing=bool(bw_cfg.get("override_existing", False)),
|
||||
cache_ttl_seconds=float(bw_cfg.get("cache_ttl_seconds", 300)),
|
||||
auto_install=bool(bw_cfg.get("auto_install", True)),
|
||||
)
|
||||
|
||||
if result.applied:
|
||||
# Re-run the ASCII sanitization pass: BSM values are user-supplied
|
||||
# and might have the same copy-paste corruption as a manually
|
||||
# edited .env (see #6843).
|
||||
_sanitize_loaded_credentials()
|
||||
# Remember where these came from so the setup / `hermes model`
|
||||
# flows can label detected credentials with "(from Bitwarden)" —
|
||||
# otherwise users see "credentials ✓" with no hint that the value
|
||||
# came from BSM rather than .env.
|
||||
for name in result.applied:
|
||||
_SECRET_SOURCES[name] = "bitwarden"
|
||||
print(
|
||||
f" Bitwarden Secrets Manager: applied {len(result.applied)} "
|
||||
f"secret{'s' if len(result.applied) != 1 else ''} "
|
||||
f"({', '.join(sorted(result.applied))})",
|
||||
file=sys.stderr,
|
||||
)
|
||||
if result.error:
|
||||
print(
|
||||
f" Bitwarden Secrets Manager: {result.error}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
for warn in result.warnings:
|
||||
print(
|
||||
f" Bitwarden Secrets Manager: {warn}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
|
||||
def _load_secrets_config(home_path: Path) -> dict:
|
||||
"""Read just the ``secrets:`` section out of config.yaml.
|
||||
|
||||
Imported lazily and isolated from the main config loader so a
|
||||
malformed config can't take down dotenv loading entirely.
|
||||
"""
|
||||
config_path = home_path / "config.yaml"
|
||||
if not config_path.exists():
|
||||
return {}
|
||||
try:
|
||||
import yaml # type: ignore
|
||||
except ImportError:
|
||||
return {}
|
||||
try:
|
||||
with open(config_path, "r", encoding="utf-8") as f:
|
||||
data = yaml.safe_load(f) or {}
|
||||
except Exception: # noqa: BLE001
|
||||
return {}
|
||||
return data.get("secrets") or {}
|
||||
|
||||
+381
-84
@@ -275,6 +275,133 @@ def _is_termux_startup_environment(env: dict[str, str] | None = None) -> bool:
|
||||
)
|
||||
|
||||
|
||||
def _read_packed_ref(common_dir: Path, ref: str) -> str | None:
|
||||
"""Look up a ref in .git/packed-refs without spawning git.
|
||||
|
||||
packed-refs lines look like ``<sha> <ref>`` with optional ``^<sha>``
|
||||
peel lines and ``#``-prefixed comments / ``# pack-refs with:`` header.
|
||||
"""
|
||||
try:
|
||||
text = (common_dir / "packed-refs").read_text(encoding="utf-8", errors="replace")
|
||||
except OSError:
|
||||
return None
|
||||
for line in text.splitlines():
|
||||
if not line or line.startswith("#") or line.startswith("^"):
|
||||
continue
|
||||
parts = line.split(" ", 1)
|
||||
if len(parts) == 2 and parts[1].strip() == ref:
|
||||
return parts[0].strip()
|
||||
return None
|
||||
|
||||
|
||||
def _read_git_revision_fingerprint(repo_root: Path) -> str | None:
|
||||
"""Return a cheap checkout fingerprint without spawning git."""
|
||||
git_dir = repo_root / ".git"
|
||||
try:
|
||||
if git_dir.is_file():
|
||||
for line in git_dir.read_text(encoding="utf-8", errors="replace").splitlines():
|
||||
key, _, value = line.partition(":")
|
||||
if key.strip() == "gitdir" and value.strip():
|
||||
git_dir = (repo_root / value.strip()).resolve()
|
||||
break
|
||||
# Worktrees point HEAD at a per-worktree gitdir but pack their refs
|
||||
# in the main repo's gitdir (referenced via ``commondir``). Resolve
|
||||
# that up front so packed-refs lookups hit the right file.
|
||||
common_dir = git_dir
|
||||
commondir_file = git_dir / "commondir"
|
||||
if commondir_file.exists():
|
||||
try:
|
||||
rel = commondir_file.read_text(encoding="utf-8", errors="replace").strip()
|
||||
if rel:
|
||||
common_dir = (git_dir / rel).resolve()
|
||||
except OSError:
|
||||
pass
|
||||
head_file = git_dir / "HEAD"
|
||||
head = head_file.read_text(encoding="utf-8", errors="replace").strip()
|
||||
if head.startswith("ref:"):
|
||||
ref = head.split(":", 1)[1].strip()
|
||||
# Loose refs may live in the worktree gitdir OR the common dir
|
||||
# (branches created via `git worktree add` typically live in the
|
||||
# common dir's refs/heads/).
|
||||
for candidate in (git_dir, common_dir):
|
||||
ref_file = candidate / ref
|
||||
if ref_file.exists():
|
||||
return f"git:{ref}:{ref_file.read_text(encoding='utf-8', errors='replace').strip()}"
|
||||
packed_sha = _read_packed_ref(common_dir, ref)
|
||||
if packed_sha:
|
||||
return f"git:{ref}:{packed_sha}"
|
||||
# Ref name is known but unresolved — still stable across launches,
|
||||
# and the version/release fallback in the caller will invalidate
|
||||
# after `hermes update`.
|
||||
return f"git:{ref}:unresolved"
|
||||
return f"git:HEAD:{head}"
|
||||
except OSError:
|
||||
return None
|
||||
|
||||
|
||||
def _termux_bundled_skills_fingerprint() -> str:
|
||||
"""Cheap invalidation key for Termux bundled-skill startup sync."""
|
||||
git_fp = _read_git_revision_fingerprint(PROJECT_ROOT)
|
||||
if git_fp:
|
||||
return git_fp
|
||||
skills_dir = PROJECT_ROOT / "skills"
|
||||
try:
|
||||
stat = skills_dir.stat()
|
||||
return f"skills:{__version__}:{__release_date__}:{stat.st_mtime_ns}:{stat.st_size}"
|
||||
except OSError:
|
||||
return f"skills:{__version__}:{__release_date__}:missing"
|
||||
|
||||
|
||||
def _termux_bundled_skills_stamp_path() -> Path:
|
||||
return get_hermes_home() / "skills" / ".termux_bundled_sync_stamp"
|
||||
|
||||
|
||||
def _termux_bundled_skills_sync_needed() -> bool:
|
||||
if not _is_termux_startup_environment():
|
||||
return True
|
||||
if os.environ.get("HERMES_TERMUX_FORCE_SKILLS_SYNC") == "1":
|
||||
return True
|
||||
try:
|
||||
stamp = _termux_bundled_skills_stamp_path()
|
||||
return stamp.read_text(encoding="utf-8").strip() != _termux_bundled_skills_fingerprint()
|
||||
except OSError:
|
||||
return True
|
||||
|
||||
|
||||
def _mark_termux_bundled_skills_synced() -> None:
|
||||
if not _is_termux_startup_environment():
|
||||
return
|
||||
try:
|
||||
stamp = _termux_bundled_skills_stamp_path()
|
||||
stamp.parent.mkdir(parents=True, exist_ok=True)
|
||||
stamp.write_text(_termux_bundled_skills_fingerprint() + "\n", encoding="utf-8")
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
def _sync_bundled_skills_for_startup() -> bool:
|
||||
"""Sync bundled skills, but skip unchanged Termux checkouts cheaply.
|
||||
|
||||
Hashing every bundled skill is safe but expensive on older Android
|
||||
storage. The git/ref stamp keeps post-update correctness: a changed
|
||||
checkout revision forces one real sync, then later starts skip it.
|
||||
"""
|
||||
if _is_termux_startup_environment() and not _termux_bundled_skills_sync_needed():
|
||||
return False
|
||||
|
||||
from tools.skills_sync import sync_skills
|
||||
|
||||
sync_skills(quiet=True)
|
||||
_mark_termux_bundled_skills_synced()
|
||||
return True
|
||||
|
||||
|
||||
def _termux_should_prefetch_update_check() -> bool:
|
||||
if not _is_termux_startup_environment():
|
||||
return True
|
||||
return os.environ.get("HERMES_TERMUX_PREFETCH_UPDATES") == "1"
|
||||
|
||||
|
||||
def _relative_time(ts) -> str:
|
||||
"""Format a timestamp as relative time (e.g., '2h ago', 'yesterday')."""
|
||||
if not ts:
|
||||
@@ -464,7 +591,7 @@ def _session_browse_picker(sessions: list) -> Optional[str]:
|
||||
curses.init_pair(1, curses.COLOR_GREEN, -1) # selected
|
||||
curses.init_pair(2, curses.COLOR_YELLOW, -1) # header
|
||||
curses.init_pair(3, curses.COLOR_CYAN, -1) # search
|
||||
curses.init_pair(4, 8, -1) # dim
|
||||
curses.init_pair(4, 8 if curses.COLORS > 8 else curses.COLOR_WHITE, -1) # dim
|
||||
|
||||
cursor = 0
|
||||
scroll_offset = 0
|
||||
@@ -1146,13 +1273,13 @@ def _make_tui_argv(tui_dir: Path, tui_dev: bool) -> tuple[list[str], Path]:
|
||||
p = Path(ext_dir)
|
||||
if (p / "dist" / "entry.js").is_file():
|
||||
node = _node_bin("node")
|
||||
return [node, str(p / "dist" / "entry.js")], p
|
||||
return [node, "--expose-gc", str(p / "dist" / "entry.js")], p
|
||||
|
||||
# 1b. Bundled in wheel (pip install)
|
||||
bundled = _find_bundled_tui()
|
||||
if bundled is not None:
|
||||
node = _node_bin("node")
|
||||
return [node, str(bundled)], bundled.parent
|
||||
return [node, "--expose-gc", str(bundled)], bundled.parent
|
||||
|
||||
# 2. Normal flow: npm install if needed, always esbuild, then node dist/entry.js.
|
||||
# --dev flow: npm install if needed, then tsx src/entry.tsx.
|
||||
@@ -1229,7 +1356,7 @@ def _make_tui_argv(tui_dir: Path, tui_dev: bool) -> tuple[list[str], Path]:
|
||||
sys.exit(1)
|
||||
|
||||
node = _node_bin("node")
|
||||
return [node, str(tui_dir / "dist" / "entry.js")], tui_dir
|
||||
return [node, "--expose-gc", str(tui_dir / "dist" / "entry.js")], tui_dir
|
||||
|
||||
|
||||
def _normalize_tui_toolsets(toolsets: object) -> list[str]:
|
||||
@@ -1351,16 +1478,16 @@ def _launch_tui(
|
||||
env["HERMES_TUI_TOOL_PROGRESS"] = "off"
|
||||
if accept_hooks:
|
||||
env["HERMES_ACCEPT_HOOKS"] = "1"
|
||||
# Guarantee an 8GB V8 heap + exposed GC for the TUI. Default node cap is
|
||||
# ~1.5–4GB depending on version and can fatal-OOM on long sessions with
|
||||
# large transcripts / reasoning blobs. Token-level merge: respect any
|
||||
# user-supplied --max-old-space-size (they may have set it higher) and
|
||||
# avoid duplicating --expose-gc.
|
||||
# Guarantee an 8GB V8 heap for the TUI. Default node cap is ~1.5–4GB
|
||||
# depending on version and can fatal-OOM on long sessions with large
|
||||
# transcripts / reasoning blobs. Token-level merge: respect any
|
||||
# user-supplied --max-old-space-size (they may have set it higher).
|
||||
# --expose-gc is *not* added here: Node rejects it in NODE_OPTIONS
|
||||
# ("--expose-gc is not allowed in NODE_OPTIONS") and refuses to start.
|
||||
# It is passed as a direct argv flag in _make_tui_argv() instead.
|
||||
_tokens = env.get("NODE_OPTIONS", "").split()
|
||||
if not any(t.startswith("--max-old-space-size=") for t in _tokens):
|
||||
_tokens.append("--max-old-space-size=8192")
|
||||
if "--expose-gc" not in _tokens:
|
||||
_tokens.append("--expose-gc")
|
||||
env["NODE_OPTIONS"] = " ".join(_tokens)
|
||||
# HERMES_TUI_RESUME is an internal hand-off from the Python wrapper to the
|
||||
# Ink app. Because we start from os.environ.copy(), an exported/stale value
|
||||
@@ -1523,19 +1650,20 @@ def cmd_chat(args):
|
||||
print("You can run 'hermes setup' at any time to configure.")
|
||||
sys.exit(1)
|
||||
|
||||
# Start update check in background (runs while other init happens)
|
||||
try:
|
||||
from hermes_cli.banner import prefetch_update_check
|
||||
# Start update check in background (runs while other init happens).
|
||||
# On Termux this imports rich/prompt_toolkit in the foreground and then
|
||||
# competes for CPU on single-core devices, so keep it opt-in there.
|
||||
if _termux_should_prefetch_update_check():
|
||||
try:
|
||||
from hermes_cli.banner import prefetch_update_check
|
||||
|
||||
prefetch_update_check()
|
||||
except Exception:
|
||||
pass
|
||||
prefetch_update_check()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Sync bundled skills on every CLI launch (fast -- skips unchanged skills)
|
||||
try:
|
||||
from tools.skills_sync import sync_skills
|
||||
|
||||
sync_skills(quiet=True)
|
||||
_sync_bundled_skills_for_startup()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -2305,6 +2433,9 @@ _AUX_TASKS: list[tuple[str, str, str]] = [
|
||||
("mcp", "MCP", "MCP tool reasoning"),
|
||||
("title_generation", "Title generation", "session titles"),
|
||||
("skills_hub", "Skills hub", "skills search/install"),
|
||||
("triage_specifier", "Triage specifier", "kanban spec fleshing"),
|
||||
("kanban_decomposer", "Kanban decomposer", "task decomposition"),
|
||||
("profile_describer", "Profile describer", "auto profile descriptions"),
|
||||
("curator", "Curator", "skill-usage review pass"),
|
||||
]
|
||||
|
||||
@@ -4534,7 +4665,9 @@ def _model_flow_copilot(config, current_model=""):
|
||||
source = creds.get("source", "")
|
||||
else:
|
||||
if source in {"GITHUB_TOKEN", "GH_TOKEN"}:
|
||||
print(f" GitHub token: {api_key[:8]}... ✓ ({source})")
|
||||
from hermes_cli.env_loader import format_secret_source_suffix
|
||||
bw_suffix = format_secret_source_suffix(source)
|
||||
print(f" GitHub token: {api_key[:8]}... ✓ ({source}{bw_suffix})")
|
||||
elif source == "gh auth token":
|
||||
print(" GitHub token: ✓ (from `gh auth token`)")
|
||||
else:
|
||||
@@ -4791,7 +4924,10 @@ def _prompt_api_key(pconfig, existing_key: str, provider_id: str = "") -> tuple:
|
||||
return new_key, False
|
||||
|
||||
# Already configured — offer K / R / C ────────────────────────────────
|
||||
print(f" {pconfig.name} API key: {existing_key[:8]}... ✓")
|
||||
from hermes_cli.env_loader import format_secret_source_suffix
|
||||
|
||||
source_suffix = format_secret_source_suffix(key_env) if key_env else ""
|
||||
print(f" {pconfig.name} API key: {existing_key[:8]}... ✓{source_suffix}")
|
||||
if not key_env:
|
||||
# Nothing we can rewrite; just acknowledge and move on.
|
||||
print()
|
||||
@@ -5074,7 +5210,9 @@ def _model_flow_bedrock_api_key(config, region, current_model=""):
|
||||
# Prompt for API key
|
||||
existing_key = get_env_value("AWS_BEARER_TOKEN_BEDROCK") or ""
|
||||
if existing_key:
|
||||
print(f" Bedrock API Key: {existing_key[:12]}... ✓")
|
||||
from hermes_cli.env_loader import format_secret_source_suffix
|
||||
source_suffix = format_secret_source_suffix("AWS_BEARER_TOKEN_BEDROCK")
|
||||
print(f" Bedrock API Key: {existing_key[:12]}... ✓{source_suffix}")
|
||||
else:
|
||||
print(f" Endpoint: {mantle_base_url}")
|
||||
print()
|
||||
@@ -5745,7 +5883,22 @@ def _model_flow_anthropic(config, current_model=""):
|
||||
if has_creds:
|
||||
# Show what we found
|
||||
if existing_key:
|
||||
print(f" Anthropic credentials: {existing_key[:12]}... ✓")
|
||||
from hermes_cli.env_loader import format_secret_source_suffix
|
||||
from hermes_cli.auth import PROVIDER_REGISTRY
|
||||
|
||||
# Surface which env var supplied the key so users with
|
||||
# Bitwarden see "(from Bitwarden)" — without this, a detected
|
||||
# BSM key looks identical to a key in .env and users assume
|
||||
# nothing is wired up.
|
||||
source_suffix = ""
|
||||
for var in PROVIDER_REGISTRY["anthropic"].api_key_env_vars:
|
||||
if os.getenv(var, "").strip() == existing_key:
|
||||
source_suffix = format_secret_source_suffix(var)
|
||||
if source_suffix:
|
||||
break
|
||||
print(
|
||||
f" Anthropic credentials: {existing_key[:12]}... ✓{source_suffix}"
|
||||
)
|
||||
elif cc_available:
|
||||
print(" Claude Code credentials: ✓ (auto-detected)")
|
||||
print()
|
||||
@@ -5971,8 +6124,7 @@ def cmd_import(args):
|
||||
run_import(args)
|
||||
|
||||
|
||||
def cmd_version(args):
|
||||
"""Show version."""
|
||||
def _print_version_info(*, check_updates: bool = True) -> None:
|
||||
print(f"Hermes Agent v{__version__} ({__release_date__})")
|
||||
print(f"Project: {PROJECT_ROOT}")
|
||||
|
||||
@@ -5992,6 +6144,9 @@ def cmd_version(args):
|
||||
except ImportError:
|
||||
print("OpenAI SDK: Not installed")
|
||||
|
||||
if not check_updates:
|
||||
return
|
||||
|
||||
# Show update status (synchronous — acceptable since user asked for version info)
|
||||
try:
|
||||
from hermes_cli.banner import check_for_updates
|
||||
@@ -6010,6 +6165,11 @@ def cmd_version(args):
|
||||
pass
|
||||
|
||||
|
||||
def cmd_version(args):
|
||||
"""Show version."""
|
||||
_print_version_info(check_updates=True)
|
||||
|
||||
|
||||
def cmd_uninstall(args):
|
||||
"""Uninstall Hermes Agent."""
|
||||
_require_tty("uninstall")
|
||||
@@ -6086,24 +6246,36 @@ def _validate_critical_files_syntax(root) -> tuple[bool, str | None, str | None]
|
||||
them after a successful ``git pull`` so we can auto-roll-back instead of
|
||||
leaving the user with a bricked install.
|
||||
|
||||
The compiled ``.pyc`` is written to a temp directory rather than the
|
||||
source tree's ``__pycache__/`` so we don't race with concurrent test
|
||||
workers that walk the same dir, and so we don't leave a stale pyc
|
||||
behind in production if the next interpreter run picks a different
|
||||
Python version. The pyc is discarded on function return either way —
|
||||
we only care about the compile-or-not signal.
|
||||
|
||||
Returns ``(ok, failing_path, error_message)``. ``ok=True`` means every
|
||||
file parsed cleanly.
|
||||
"""
|
||||
import py_compile
|
||||
import tempfile
|
||||
|
||||
root = Path(root)
|
||||
for relpath in _UPDATE_CRITICAL_FILES:
|
||||
path = root / relpath
|
||||
if not path.exists():
|
||||
# Missing file is suspicious but not necessarily fatal — a future
|
||||
# refactor may legitimately remove one of these. Skip and move on.
|
||||
continue
|
||||
try:
|
||||
py_compile.compile(str(path), doraise=True)
|
||||
except py_compile.PyCompileError as exc:
|
||||
return False, str(path), str(exc)
|
||||
except OSError as exc:
|
||||
return False, str(path), f"could not read: {exc}"
|
||||
with tempfile.TemporaryDirectory(prefix="hermes-syntax-check-") as tmpdir:
|
||||
for relpath in _UPDATE_CRITICAL_FILES:
|
||||
path = root / relpath
|
||||
if not path.exists():
|
||||
# Missing file is suspicious but not necessarily fatal — a future
|
||||
# refactor may legitimately remove one of these. Skip and move on.
|
||||
continue
|
||||
# Mirror the relative path under the tmpdir so two different
|
||||
# files with the same basename don't collide on the cfile name.
|
||||
cfile = Path(tmpdir) / (relpath.replace("/", "__") + "c")
|
||||
try:
|
||||
py_compile.compile(str(path), cfile=str(cfile), doraise=True)
|
||||
except py_compile.PyCompileError as exc:
|
||||
return False, str(path), str(exc)
|
||||
except OSError as exc:
|
||||
return False, str(path), f"could not read: {exc}"
|
||||
return True, None, None
|
||||
|
||||
|
||||
@@ -10413,7 +10585,7 @@ _BUILTIN_SUBCOMMANDS = frozenset(
|
||||
"model", "pairing", "plugins", "postinstall", "profile", "proxy",
|
||||
"send", "sessions", "setup",
|
||||
"skills", "slack", "status", "tools", "uninstall", "update",
|
||||
"version", "webhook", "whatsapp", "chat",
|
||||
"version", "webhook", "whatsapp", "chat", "secrets",
|
||||
# Help-ish invocations — plugin commands not being listed in
|
||||
# top-level --help is an acceptable trade-off for skipping an
|
||||
# expensive eager import of every bundled plugin module.
|
||||
@@ -10503,6 +10675,137 @@ def _plugin_cli_discovery_needed() -> bool:
|
||||
return True
|
||||
|
||||
|
||||
_AGENT_COMMANDS = {None, "chat", "acp", "rl"}
|
||||
_AGENT_SUBCOMMANDS = {
|
||||
"cron": ("cron_command", {"run", "tick"}),
|
||||
"gateway": ("gateway_command", {"run"}),
|
||||
"mcp": ("mcp_action", {"serve"}),
|
||||
}
|
||||
|
||||
|
||||
def _prepare_agent_startup(args) -> None:
|
||||
"""Discover plugins/MCP/hooks for commands that can run an agent turn."""
|
||||
_sub_attr, _sub_set = _AGENT_SUBCOMMANDS.get(args.command, (None, None))
|
||||
if not (
|
||||
args.command in _AGENT_COMMANDS
|
||||
or (_sub_attr and getattr(args, _sub_attr, None) in _sub_set)
|
||||
):
|
||||
return
|
||||
|
||||
_accept_hooks = bool(getattr(args, "accept_hooks", False))
|
||||
try:
|
||||
from hermes_cli.plugins import discover_plugins
|
||||
|
||||
discover_plugins()
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"plugin discovery failed at CLI startup",
|
||||
exc_info=True,
|
||||
)
|
||||
try:
|
||||
# MCP tool discovery — no event loop running in CLI/TUI startup,
|
||||
# so inline is safe. Moved here from model_tools.py module scope
|
||||
# to avoid freezing the gateway's event loop on its first message
|
||||
# via the same lazy import path (#16856).
|
||||
from tools.mcp_tool import discover_mcp_tools
|
||||
|
||||
discover_mcp_tools()
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"MCP tool discovery failed at CLI startup",
|
||||
exc_info=True,
|
||||
)
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
from agent.shell_hooks import register_from_config
|
||||
|
||||
register_from_config(load_config(), accept_hooks=_accept_hooks)
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"shell-hook registration failed at CLI startup",
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
|
||||
def _set_chat_arg_defaults(args) -> None:
|
||||
for attr, default in [
|
||||
("query", None),
|
||||
("model", None),
|
||||
("provider", None),
|
||||
("toolsets", None),
|
||||
("verbose", False),
|
||||
("resume", None),
|
||||
("continue_last", None),
|
||||
("worktree", False),
|
||||
]:
|
||||
if not hasattr(args, attr):
|
||||
setattr(args, attr, default)
|
||||
|
||||
|
||||
def _is_termux_fast_version_argv(argv: list[str]) -> bool:
|
||||
return argv in (["--version"], ["-V"], ["version"])
|
||||
|
||||
|
||||
def _try_termux_fast_cli_launch() -> bool:
|
||||
"""Run obvious Termux non-TUI chat/oneshot/version paths on a light parser."""
|
||||
if not _is_termux_startup_environment():
|
||||
return False
|
||||
if os.environ.get("HERMES_TERMUX_DISABLE_FAST_CLI") == "1":
|
||||
return False
|
||||
|
||||
argv = sys.argv[1:]
|
||||
if "-h" in argv or "--help" in argv:
|
||||
return False
|
||||
if os.environ.get("HERMES_TUI") == "1" or "--tui" in argv:
|
||||
return False
|
||||
|
||||
if _is_termux_fast_version_argv(argv):
|
||||
_print_version_info(check_updates=False)
|
||||
return True
|
||||
|
||||
first = _first_positional_argv()
|
||||
has_oneshot = any(
|
||||
arg == "-z" or arg == "--oneshot" or arg.startswith("--oneshot=")
|
||||
for arg in argv
|
||||
)
|
||||
if not has_oneshot and first not in {None, "chat"}:
|
||||
return False
|
||||
|
||||
from hermes_cli._parser import build_top_level_parser
|
||||
|
||||
parser, _subparsers, chat_parser = build_top_level_parser()
|
||||
chat_parser.set_defaults(func=cmd_chat)
|
||||
args = parser.parse_args(_coalesce_session_name_args(argv))
|
||||
|
||||
if getattr(args, "version", False):
|
||||
_print_version_info(check_updates=False)
|
||||
return True
|
||||
|
||||
if getattr(args, "oneshot", None):
|
||||
_prepare_agent_startup(args)
|
||||
from hermes_cli.oneshot import run_oneshot
|
||||
|
||||
sys.exit(
|
||||
run_oneshot(
|
||||
args.oneshot,
|
||||
model=getattr(args, "model", None),
|
||||
provider=getattr(args, "provider", None),
|
||||
toolsets=getattr(args, "toolsets", None),
|
||||
)
|
||||
)
|
||||
|
||||
if (args.resume or args.continue_last) and args.command is None:
|
||||
args.command = "chat"
|
||||
|
||||
if args.command in {None, "chat"}:
|
||||
_set_chat_arg_defaults(args)
|
||||
_prepare_agent_startup(args)
|
||||
cmd_chat(args)
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def _try_termux_fast_tui_launch() -> bool:
|
||||
"""Launch obvious Termux TUI invocations before building every subparser.
|
||||
|
||||
@@ -10563,6 +10866,8 @@ def main():
|
||||
|
||||
if _try_termux_fast_tui_launch():
|
||||
return
|
||||
if _try_termux_fast_cli_launch():
|
||||
return
|
||||
|
||||
from hermes_cli._parser import build_top_level_parser
|
||||
|
||||
@@ -10660,6 +10965,42 @@ def main():
|
||||
)
|
||||
fallback_parser.set_defaults(func=cmd_fallback)
|
||||
|
||||
# =========================================================================
|
||||
# secrets command — external secret managers (currently: Bitwarden)
|
||||
# =========================================================================
|
||||
secrets_parser = subparsers.add_parser(
|
||||
"secrets",
|
||||
help="Manage external secret sources (Bitwarden Secrets Manager)",
|
||||
description=(
|
||||
"Pull API keys from an external secret manager at process startup "
|
||||
"instead of storing them in ~/.hermes/.env. Currently supports "
|
||||
"Bitwarden Secrets Manager. See: "
|
||||
"https://hermes-agent.nousresearch.com/docs/user-guide/secrets/bitwarden"
|
||||
),
|
||||
)
|
||||
secrets_subparsers = secrets_parser.add_subparsers(dest="secrets_command")
|
||||
|
||||
secrets_bw = secrets_subparsers.add_parser(
|
||||
"bitwarden",
|
||||
aliases=["bw"],
|
||||
help="Bitwarden Secrets Manager integration",
|
||||
)
|
||||
|
||||
# Lazy import — only pays for itself when this subcommand is actually used.
|
||||
from hermes_cli import secrets_cli as _secrets_cli
|
||||
|
||||
_secrets_cli.register_cli(secrets_bw)
|
||||
|
||||
def _dispatch_secrets(args): # noqa: ANN001
|
||||
sub = getattr(args, "secrets_command", None)
|
||||
bw_sub = getattr(args, "secrets_bw_command", None)
|
||||
if sub in ("bitwarden", "bw") and bw_sub is not None:
|
||||
return args.func(args)
|
||||
secrets_parser.print_help()
|
||||
return 0
|
||||
|
||||
secrets_parser.set_defaults(func=_dispatch_secrets)
|
||||
|
||||
# =========================================================================
|
||||
# migrate command
|
||||
# =========================================================================
|
||||
@@ -13325,51 +13666,7 @@ Examples:
|
||||
# so introspection/management commands (hermes hooks list, cron
|
||||
# list, gateway status, mcp add, ...) don't pay discovery cost or
|
||||
# trigger consent prompts for hooks the user is still inspecting.
|
||||
# Groups with mixed admin/CRUD vs. agent-running entries narrow via
|
||||
# the nested subcommand (dest varies by parser).
|
||||
_AGENT_COMMANDS = {None, "chat", "acp", "rl"}
|
||||
_AGENT_SUBCOMMANDS = {
|
||||
"cron": ("cron_command", {"run", "tick"}),
|
||||
"gateway": ("gateway_command", {"run"}),
|
||||
"mcp": ("mcp_action", {"serve"}),
|
||||
}
|
||||
_sub_attr, _sub_set = _AGENT_SUBCOMMANDS.get(args.command, (None, None))
|
||||
if args.command in _AGENT_COMMANDS or (
|
||||
_sub_attr and getattr(args, _sub_attr, None) in _sub_set
|
||||
):
|
||||
_accept_hooks = bool(getattr(args, "accept_hooks", False))
|
||||
try:
|
||||
from hermes_cli.plugins import discover_plugins
|
||||
|
||||
discover_plugins()
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"plugin discovery failed at CLI startup",
|
||||
exc_info=True,
|
||||
)
|
||||
try:
|
||||
# MCP tool discovery — no event loop running in CLI/TUI startup,
|
||||
# so inline is safe. Moved here from model_tools.py module scope
|
||||
# to avoid freezing the gateway's event loop on its first message
|
||||
# via the same lazy import path (#16856).
|
||||
from tools.mcp_tool import discover_mcp_tools
|
||||
|
||||
discover_mcp_tools()
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"MCP tool discovery failed at CLI startup",
|
||||
exc_info=True,
|
||||
)
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
from agent.shell_hooks import register_from_config
|
||||
|
||||
register_from_config(load_config(), accept_hooks=_accept_hooks)
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"shell-hook registration failed at CLI startup",
|
||||
exc_info=True,
|
||||
)
|
||||
_prepare_agent_startup(args)
|
||||
|
||||
# Handle top-level --oneshot / -z: single-shot mode, stdout = final
|
||||
# response only, nothing else. Bypasses cli.py entirely.
|
||||
|
||||
@@ -74,8 +74,12 @@ class NousSubscriptionFeatures:
|
||||
def modal(self) -> NousFeatureState:
|
||||
return self.features["modal"]
|
||||
|
||||
@property
|
||||
def app_tools(self) -> NousFeatureState:
|
||||
return self.features["app_tools"]
|
||||
|
||||
def items(self) -> Iterable[NousFeatureState]:
|
||||
ordered = ("web", "image_gen", "tts", "browser", "modal")
|
||||
ordered = ("web", "image_gen", "tts", "browser", "modal", "app_tools")
|
||||
for key in ordered:
|
||||
yield self.features[key]
|
||||
|
||||
@@ -225,6 +229,22 @@ def _resolve_browser_feature_state(
|
||||
return "local", available, active, False
|
||||
|
||||
|
||||
def _read_portal_app_tools_enabled(config: Optional[Dict[str, object]] = None) -> bool:
|
||||
"""Return True when the portal.app_tools config flag is on."""
|
||||
if config is not None:
|
||||
# Fast path: use the pre-loaded config snapshot from the caller
|
||||
import os
|
||||
env_val = os.getenv("PORTAL_APP_TOOLS")
|
||||
if env_val is not None:
|
||||
return is_truthy_value(env_val)
|
||||
portal = config.get("portal")
|
||||
if isinstance(portal, dict):
|
||||
return bool(portal.get("app_tools", True))
|
||||
return True
|
||||
from tools.tool_backend_helpers import portal_app_tools_enabled
|
||||
return portal_app_tools_enabled()
|
||||
|
||||
|
||||
def get_nous_subscription_features(
|
||||
config: Optional[Dict[str, object]] = None,
|
||||
) -> NousSubscriptionFeatures:
|
||||
@@ -313,6 +333,8 @@ def get_nous_subscription_features(
|
||||
managed_tts_available = managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("openai-audio")
|
||||
managed_browser_available = managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("browser-use")
|
||||
managed_modal_available = managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("modal")
|
||||
app_gw_ready = bool(managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("tools"))
|
||||
app_config_on = _read_portal_app_tools_enabled(config)
|
||||
modal_state = resolve_modal_backend_state(
|
||||
modal_mode,
|
||||
has_direct=direct_modal,
|
||||
@@ -476,6 +498,17 @@ def get_nous_subscription_features(
|
||||
current_provider="Modal" if terminal_backend == "modal" else terminal_backend or "local",
|
||||
explicit_configured=terminal_backend == "modal",
|
||||
),
|
||||
"app_tools": NousFeatureState(
|
||||
key="app_tools",
|
||||
label="App tools (500+ apps)",
|
||||
included_by_default=True,
|
||||
available=app_gw_ready,
|
||||
active=app_gw_ready and app_config_on,
|
||||
managed_by_nous=app_gw_ready and app_config_on,
|
||||
direct_override=False,
|
||||
toolset_enabled=app_config_on,
|
||||
current_provider="Nous Tool Gateway",
|
||||
),
|
||||
}
|
||||
|
||||
return NousSubscriptionFeatures(
|
||||
|
||||
@@ -1051,7 +1051,7 @@ def _run_composite_ui(curses, plugin_names, plugin_labels, plugin_selected,
|
||||
curses.init_pair(1, curses.COLOR_GREEN, -1)
|
||||
curses.init_pair(2, curses.COLOR_YELLOW, -1)
|
||||
curses.init_pair(3, curses.COLOR_CYAN, -1)
|
||||
curses.init_pair(4, 8, -1) # dim gray
|
||||
curses.init_pair(4, 8 if curses.COLORS > 8 else curses.COLOR_WHITE, -1) # dim gray
|
||||
cursor = 0
|
||||
scroll_offset = 0
|
||||
|
||||
@@ -1196,7 +1196,7 @@ def _run_composite_ui(curses, plugin_names, plugin_labels, plugin_selected,
|
||||
curses.init_pair(1, curses.COLOR_GREEN, -1)
|
||||
curses.init_pair(2, curses.COLOR_YELLOW, -1)
|
||||
curses.init_pair(3, curses.COLOR_CYAN, -1)
|
||||
curses.init_pair(4, 8, -1)
|
||||
curses.init_pair(4, 8 if curses.COLORS > 8 else curses.COLOR_WHITE, -1)
|
||||
curses.curs_set(0)
|
||||
elif key in {curses.KEY_ENTER, 10, 13}:
|
||||
if cursor < n_plugins:
|
||||
@@ -1228,7 +1228,7 @@ def _run_composite_ui(curses, plugin_names, plugin_labels, plugin_selected,
|
||||
curses.init_pair(1, curses.COLOR_GREEN, -1)
|
||||
curses.init_pair(2, curses.COLOR_YELLOW, -1)
|
||||
curses.init_pair(3, curses.COLOR_CYAN, -1)
|
||||
curses.init_pair(4, 8, -1)
|
||||
curses.init_pair(4, 8 if curses.COLORS > 8 else curses.COLOR_WHITE, -1)
|
||||
curses.curs_set(0)
|
||||
elif key in {27, ord("q")}:
|
||||
# Save plugin changes on exit
|
||||
|
||||
@@ -35,6 +35,7 @@ from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from hermes_cli import profiles as profiles_mod
|
||||
from agent.skill_utils import is_excluded_skill_path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -109,8 +110,7 @@ def _collect_skills(profile_dir: Path) -> list[str]:
|
||||
return []
|
||||
names: list[str] = []
|
||||
for md in skills_dir.rglob("SKILL.md"):
|
||||
path_str = str(md)
|
||||
if "/.hub/" in path_str or "/.git/" in path_str:
|
||||
if is_excluded_skill_path(md):
|
||||
continue
|
||||
try:
|
||||
rel = md.relative_to(skills_dir)
|
||||
@@ -201,7 +201,7 @@ def describe_profile(
|
||||
skill_list = "\n".join(f" - {n}" for n in skill_names) or " (no skills installed)"
|
||||
skill_count = sum(
|
||||
1 for _ in (profile_dir / "skills").rglob("SKILL.md")
|
||||
if "/.hub/" not in str(_) and "/.git/" not in str(_)
|
||||
if not is_excluded_skill_path(_)
|
||||
) if (profile_dir / "skills").is_dir() else 0
|
||||
|
||||
# Read model + provider from the profile's config.
|
||||
|
||||
@@ -70,6 +70,8 @@ from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from agent.skill_utils import is_excluded_skill_path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants
|
||||
@@ -463,7 +465,9 @@ def _count_skills(staged: Path) -> int:
|
||||
skills_dir = staged / "skills"
|
||||
if not skills_dir.is_dir():
|
||||
return 0
|
||||
return sum(1 for _ in skills_dir.rglob("SKILL.md"))
|
||||
return sum(
|
||||
1 for p in skills_dir.rglob("SKILL.md") if not is_excluded_skill_path(p)
|
||||
)
|
||||
|
||||
|
||||
def plan_install(
|
||||
|
||||
+48
-3
@@ -30,6 +30,8 @@ from dataclasses import dataclass
|
||||
from pathlib import Path, PurePosixPath, PureWindowsPath
|
||||
from typing import List, Optional
|
||||
|
||||
from agent.skill_utils import is_excluded_skill_path
|
||||
|
||||
_PROFILE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,63}$")
|
||||
|
||||
# Directories bootstrapped inside every new profile
|
||||
@@ -485,8 +487,9 @@ def _count_skills(profile_dir: Path) -> int:
|
||||
return 0
|
||||
count = 0
|
||||
for md in skills_dir.rglob("SKILL.md"):
|
||||
if "/.hub/" not in str(md) and "/.git/" not in str(md):
|
||||
count += 1
|
||||
if is_excluded_skill_path(md):
|
||||
continue
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
@@ -902,7 +905,49 @@ def delete_profile(name: str, yes: bool = False) -> Path:
|
||||
|
||||
# 4. Remove profile directory
|
||||
try:
|
||||
shutil.rmtree(profile_dir)
|
||||
def _make_writable(func, path, exc):
|
||||
"""onexc/onerror handler: add +w on PermissionError so rmtree can proceed.
|
||||
|
||||
Handles two cases on NixOS (and other systems with read-only
|
||||
copies from immutable stores):
|
||||
1. The path itself isn't writable (e.g. a file with mode 0444)
|
||||
2. The *parent* directory isn't writable (e.g. mode 0555)
|
||||
|
||||
Compatible with both the ``onexc`` API (3.12+, receives an
|
||||
exception instance) and the ``onerror`` API (3.11-, receives
|
||||
``sys.exc_info()`` tuple).
|
||||
"""
|
||||
import stat as _stat
|
||||
import sys as _sys
|
||||
|
||||
# Normalise the two callback signatures:
|
||||
# onexc(func, path, exc_instance) — 3.12+
|
||||
# onerror(func, path, exc_info_tuple) — 3.11
|
||||
if isinstance(exc, tuple):
|
||||
exc = exc[1] # exc_info → actual exception object
|
||||
|
||||
if isinstance(exc, PermissionError):
|
||||
# Make the path writable
|
||||
try:
|
||||
os.chmod(path, os.stat(path).st_mode | _stat.S_IWUSR)
|
||||
except OSError:
|
||||
pass
|
||||
# Also make the parent writable (needed for unlink/rmdir)
|
||||
parent = os.path.dirname(path)
|
||||
if parent:
|
||||
try:
|
||||
os.chmod(parent, os.stat(parent).st_mode | _stat.S_IWUSR)
|
||||
except OSError:
|
||||
pass
|
||||
func(path)
|
||||
else:
|
||||
raise
|
||||
|
||||
# ``onexc`` was added in 3.12; fall back to ``onerror`` on 3.11.
|
||||
try:
|
||||
shutil.rmtree(profile_dir, onexc=_make_writable)
|
||||
except TypeError:
|
||||
shutil.rmtree(profile_dir, onerror=_make_writable)
|
||||
print(f"✓ Removed {profile_dir}")
|
||||
except Exception as e:
|
||||
print(f"⚠ Could not remove {profile_dir}: {e}")
|
||||
|
||||
@@ -528,6 +528,9 @@ def _get_named_custom_provider(requested_provider: str) -> Optional[Dict[str, An
|
||||
"api_key": resolved_api_key,
|
||||
"model": entry.get("default_model", ""),
|
||||
}
|
||||
extra_body = entry.get("extra_body")
|
||||
if isinstance(extra_body, dict):
|
||||
result["extra_body"] = dict(extra_body)
|
||||
# The v11→v12 migration writes the API mode under the new
|
||||
# ``transport`` field, but hand-edited configs may still
|
||||
# use the legacy ``api_mode`` spelling. Accept both —
|
||||
@@ -553,6 +556,9 @@ def _get_named_custom_provider(requested_provider: str) -> Optional[Dict[str, An
|
||||
"api_key": resolved_api_key,
|
||||
"model": entry.get("default_model", ""),
|
||||
}
|
||||
extra_body = entry.get("extra_body")
|
||||
if isinstance(extra_body, dict):
|
||||
result["extra_body"] = dict(extra_body)
|
||||
api_mode = _parse_api_mode(entry.get("api_mode") or entry.get("transport"))
|
||||
if api_mode:
|
||||
result["api_mode"] = api_mode
|
||||
@@ -596,6 +602,9 @@ def _get_named_custom_provider(requested_provider: str) -> Optional[Dict[str, An
|
||||
result["key_env"] = key_env
|
||||
if provider_key:
|
||||
result["provider_key"] = provider_key
|
||||
extra_body = entry.get("extra_body")
|
||||
if isinstance(extra_body, dict):
|
||||
result["extra_body"] = dict(extra_body)
|
||||
api_mode = _parse_api_mode(entry.get("api_mode"))
|
||||
if api_mode:
|
||||
result["api_mode"] = api_mode
|
||||
@@ -607,6 +616,13 @@ def _get_named_custom_provider(requested_provider: str) -> Optional[Dict[str, An
|
||||
return None
|
||||
|
||||
|
||||
def _custom_provider_request_overrides(custom_provider: Dict[str, Any]) -> Dict[str, Any]:
|
||||
extra_body = custom_provider.get("extra_body")
|
||||
if not isinstance(extra_body, dict) or not extra_body:
|
||||
return {}
|
||||
return {"extra_body": dict(extra_body)}
|
||||
|
||||
|
||||
def _resolve_named_custom_runtime(
|
||||
*,
|
||||
requested_provider: str,
|
||||
@@ -683,6 +699,12 @@ def _resolve_named_custom_runtime(
|
||||
model_name = custom_provider.get("model")
|
||||
if model_name:
|
||||
pool_result["model"] = model_name
|
||||
request_overrides = _custom_provider_request_overrides(custom_provider)
|
||||
if request_overrides:
|
||||
pool_result["request_overrides"] = {
|
||||
**dict(pool_result.get("request_overrides") or {}),
|
||||
**request_overrides,
|
||||
}
|
||||
return pool_result
|
||||
|
||||
_cp_is_openai_url = base_url_host_matches(base_url, "openai.com") or base_url_host_matches(base_url, "openai.azure.com")
|
||||
@@ -714,6 +736,9 @@ def _resolve_named_custom_runtime(
|
||||
# provider name differs from the actual model string the API expects.
|
||||
if custom_provider.get("model"):
|
||||
result["model"] = custom_provider["model"]
|
||||
request_overrides = _custom_provider_request_overrides(custom_provider)
|
||||
if request_overrides:
|
||||
result["request_overrides"] = request_overrides
|
||||
return result
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,445 @@
|
||||
"""CLI handlers for ``hermes secrets bitwarden ...``.
|
||||
|
||||
Subcommands:
|
||||
setup — interactive wizard: install bws, prompt for token + project, test fetch
|
||||
status — show current config + binary version + last fetch outcome
|
||||
sync — run a fetch right now and show what would be applied (dry-run friendly)
|
||||
disable — flip ``secrets.bitwarden.enabled`` to False
|
||||
install — just download the bws binary (no token / project required)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import getpass
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
|
||||
from agent.secret_sources import bitwarden as bw
|
||||
from hermes_cli.config import (
|
||||
get_env_path,
|
||||
load_config,
|
||||
save_config,
|
||||
save_env_value,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Argparse wiring — called from hermes_cli.main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def register_cli(parent_parser: argparse.ArgumentParser) -> None:
|
||||
"""Attach the ``bitwarden`` subcommand tree to a parent parser.
|
||||
|
||||
Called from ``hermes_cli.main`` as part of building the top-level
|
||||
``hermes secrets`` parser.
|
||||
"""
|
||||
sub = parent_parser.add_subparsers(dest="secrets_bw_command")
|
||||
|
||||
setup = sub.add_parser(
|
||||
"setup",
|
||||
help="Interactive wizard: install bws, store access token, pick project",
|
||||
)
|
||||
setup.add_argument(
|
||||
"--project-id",
|
||||
help="Pre-select a project UUID instead of prompting",
|
||||
)
|
||||
setup.add_argument(
|
||||
"--access-token",
|
||||
help="Provide the access token non-interactively (will be stored in .env)",
|
||||
)
|
||||
setup.set_defaults(func=cmd_setup)
|
||||
|
||||
status = sub.add_parser("status", help="Show config + binary + last fetch")
|
||||
status.set_defaults(func=cmd_status)
|
||||
|
||||
sync = sub.add_parser("sync", help="Fetch secrets now and report what changed")
|
||||
sync.add_argument(
|
||||
"--apply",
|
||||
action="store_true",
|
||||
help="Actually export the secrets into the current shell's env (default: dry-run)",
|
||||
)
|
||||
sync.set_defaults(func=cmd_sync)
|
||||
|
||||
disable = sub.add_parser("disable", help="Turn off the Bitwarden integration")
|
||||
disable.set_defaults(func=cmd_disable)
|
||||
|
||||
install = sub.add_parser(
|
||||
"install",
|
||||
help=f"Download and verify the pinned bws binary (v{bw._BWS_VERSION})",
|
||||
)
|
||||
install.add_argument(
|
||||
"--force",
|
||||
action="store_true",
|
||||
help="Re-download even if a managed copy already exists",
|
||||
)
|
||||
install.set_defaults(func=cmd_install)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Handlers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def cmd_setup(args: argparse.Namespace) -> int:
    """Run the interactive Bitwarden Secrets Manager setup wizard.

    Steps, in order: locate (or download) the ``bws`` binary, obtain the
    access token and persist it via ``save_env_value``, pick a project
    (skipped when ``--project-id`` is given), run an uncached test fetch,
    then enable the ``secrets.bitwarden`` config section and save it.

    Note that the token is written to the env file *before* the test fetch,
    so a token that later fails the fetch is still stored.

    Args:
        args: Parsed CLI namespace; reads ``access_token`` and ``project_id``.

    Returns:
        ``0`` on success, ``1`` when a step fails or input is aborted.
    """
    # Dynamic text (exception messages, project names, warnings) may contain
    # square brackets, which Rich would otherwise interpret as markup tags.
    from rich.markup import escape

    console = Console()
    console.print(
        Panel.fit(
            "[bold]Bitwarden Secrets Manager setup[/bold]\n\n"
            "Need an access token? In the Bitwarden web app:\n"
            # The bracket is escaped so Rich prints "[your account]" literally
            # instead of swallowing it as an (unknown) style tag.
            " Secrets Manager → Machine accounts → \\[your account] →\n"
            " Access tokens → Create access token\n\n"
            "Copy the token (starts with [cyan]0.[/cyan]…) — it cannot be retrieved later.",
            border_style="cyan",
        )
    )

    # ------------------------------------------------------------------ binary
    console.print()
    console.print("[bold]Step 1[/bold] Install the bws CLI")
    try:
        binary = bw.find_bws(install_if_missing=False)
        if binary is None:
            console.print(" No bws on PATH — downloading…")
            binary = bw.install_bws()
        version = _bws_version(binary)
        console.print(f" [green]✓[/green] {escape(str(binary))} ({escape(version)})")
    except Exception as exc:  # noqa: BLE001
        console.print(f" [red]✗ Could not install bws: {escape(str(exc))}[/red]")
        console.print(
            " Manual install: "
            "https://github.com/bitwarden/sdk-sm/releases"
        )
        return 1

    # ------------------------------------------------------------------- token
    console.print()
    console.print("[bold]Step 2[/bold] Provide your access token")
    cfg = load_config()
    secrets_cfg = cfg.setdefault("secrets", {}).setdefault("bitwarden", {})
    token_env = secrets_cfg.get("access_token_env", "BWS_ACCESS_TOKEN")

    token = (args.access_token or "").strip()
    if not token:
        try:
            token = getpass.getpass(f" Paste access token ({token_env}): ").strip()
        except EOFError:
            # Closed stdin: treat as "no token" so we abort cleanly below.
            token = ""
    if not token:
        console.print(" [red]Empty token, aborting.[/red]")
        return 1
    if not token.startswith("0."):
        console.print(
            " [yellow]Warning: token doesn't start with '0.' — usually that means "
            "you pasted something other than a BSM access token. Continuing anyway.[/yellow]"
        )

    save_env_value(token_env, token)
    os.environ[token_env] = token  # so the test fetch below sees it
    console.print(f" [green]✓[/green] stored in {get_env_path()} as {token_env}")

    # ------------------------------------------------------------------- project
    preselected = (args.project_id or "").strip()
    if preselected:
        project_id = preselected
    else:
        console.print()
        console.print("[bold]Step 3[/bold] Pick a project")
        project_id = ""
        projects = _list_projects(binary, token, console)
        if projects is None:
            return 1
        if not projects:
            console.print(" [yellow]No projects visible to this machine account.[/yellow]")
            console.print(
                " In the Bitwarden web app, open the machine account → Projects tab "
                "and grant it access to at least one project."
            )
            return 1

        table = Table(show_header=True, header_style="bold")
        table.add_column("#", style="cyan", width=4)
        table.add_column("Name")
        table.add_column("ID", style="dim")
        for i, p in enumerate(projects, 1):
            # Project names are free-form user text: escape before rendering.
            table.add_row(
                str(i),
                escape(str(p.get("name") or "?")),
                escape(str(p.get("id") or "?")),
            )
        console.print(table)

        while True:
            try:
                choice = console.input(f" Select project [1-{len(projects)}]: ").strip()
            except EOFError:
                # Without this, a closed stdin surfaces as a raw traceback.
                console.print()
                console.print(" [red]No selection made, aborting.[/red]")
                return 1
            if not choice:
                continue
            try:
                idx = int(choice)
            except ValueError:
                console.print(" [red]Enter a number.[/red]")
                continue
            if 1 <= idx <= len(projects):
                project_id = projects[idx - 1]["id"]
                break
            console.print(f" [red]Out of range — pick 1-{len(projects)}.[/red]")

    # ------------------------------------------------------------------- test
    console.print()
    # The project-picking step is skipped when --project-id was supplied.
    step_num = 3 if preselected else 4
    console.print(f"[bold]Step {step_num}[/bold] Test fetch")
    try:
        secrets, warnings = bw.fetch_bitwarden_secrets(
            access_token=token,
            project_id=project_id,
            binary=binary,
            use_cache=False,
        )
    except Exception as exc:  # noqa: BLE001
        console.print(f" [red]✗ Fetch failed: {escape(str(exc))}[/red]")
        return 1

    if not secrets:
        console.print(" [yellow]Fetch succeeded but the project has no secrets.[/yellow]")
    else:
        # Mirrors the value that the save step below will leave in the config:
        # an existing setting is kept, otherwise it defaults to True.
        will_override = bool(secrets_cfg.get("override_existing", True))
        table = Table(show_header=True, header_style="bold")
        table.add_column("Name", style="cyan")
        table.add_column("Status")
        for key in sorted(secrets):
            if key == token_env:
                status = "[dim]bootstrap token — never overrides itself[/dim]"
            elif os.environ.get(key):
                if will_override:
                    status = "[yellow]already set in env (will be overwritten)[/yellow]"
                else:
                    status = "[dim]already set in env (kept: override_existing is off)[/dim]"
            else:
                status = "[green]new[/green]"
            table.add_row(key, status)
        console.print(table)
    for w in warnings:
        console.print(f" [yellow]warning:[/yellow] {escape(str(w))}")

    # ------------------------------------------------------------------- save
    secrets_cfg["enabled"] = True
    secrets_cfg["project_id"] = project_id
    # setdefault: never clobber values the user already customised.
    secrets_cfg.setdefault("access_token_env", token_env)
    secrets_cfg.setdefault("cache_ttl_seconds", 300)
    secrets_cfg.setdefault("override_existing", True)
    secrets_cfg.setdefault("auto_install", True)
    save_config(cfg)

    console.print()
    console.print(
        "[green]✓ Bitwarden Secrets Manager is enabled.[/green] "
        "Secrets will be pulled at the start of every Hermes process."
    )
    console.print(
        " Status: [cyan]hermes secrets bitwarden status[/cyan]\n"
        " Refresh: [cyan]hermes secrets bitwarden sync[/cyan]\n"
        " Disable: [cyan]hermes secrets bitwarden disable[/cyan]"
    )
    return 0
|
||||
|
||||
|
||||
def cmd_status(args: argparse.Namespace) -> int:
    """Print the current Bitwarden integration state.

    Shows the ``secrets.bitwarden`` config values, whether the access-token
    environment variable is set in this process, and whether a ``bws`` binary
    can be found (without installing one). Follow-up hints are printed when
    the integration is disabled or enabled-but-incomplete.

    Args:
        args: Parsed CLI namespace (unused).

    Returns:
        Always ``0``; this command only reports.
    """
    console = Console()
    cfg = load_config()
    # Tolerate a missing or empty ``secrets`` / ``bitwarden`` section.
    bw_cfg = (cfg.get("secrets") or {}).get("bitwarden") or {}

    enabled = bool(bw_cfg.get("enabled"))
    token_env = bw_cfg.get("access_token_env", "BWS_ACCESS_TOKEN")
    project_id = bw_cfg.get("project_id", "")
    token_set = bool(os.environ.get(token_env))

    table = Table(show_header=False, box=None, padding=(0, 2))
    table.add_column("", style="bold")
    table.add_column("")
    table.add_row("Enabled", _yn(enabled))
    table.add_row("Token env var", token_env)
    table.add_row("Token in env", _yn(token_set))
    table.add_row("Project ID", project_id or "[dim](unset)[/dim]")
    # Fallback is True to match the value the setup wizard writes by default.
    table.add_row("Override existing", _yn(bool(bw_cfg.get("override_existing", True))))
    table.add_row("Cache TTL (s)", str(bw_cfg.get("cache_ttl_seconds", 300)))
    table.add_row("Auto-install", _yn(bool(bw_cfg.get("auto_install", True))))

    # Status must be side-effect free, so never trigger a download here.
    binary = bw.find_bws(install_if_missing=False)
    if binary:
        table.add_row("bws binary", f"{binary} ({_bws_version(binary)})")
    else:
        table.add_row("bws binary", "[yellow]not installed[/yellow]")

    console.print(Panel(table, title="Bitwarden Secrets Manager", border_style="cyan"))

    if not enabled:
        console.print("\n Run [cyan]hermes secrets bitwarden setup[/cyan] to enable.")
        return 0
    if not token_set:
        console.print(
            f"\n [yellow]Enabled but {token_env} is not set — Hermes will skip BSM "
            "and warn on next startup.[/yellow]"
        )
    if not project_id:
        console.print(
            "\n [yellow]Enabled but no project_id — nothing to fetch.[/yellow]"
        )
    return 0
|
||||
|
||||
|
||||
def cmd_sync(args: argparse.Namespace) -> int:
    """Fetch secrets from Bitwarden now and report (or apply) the result.

    Without ``--apply`` this is a dry-run that only prints what would be
    exported. With ``--apply`` each secret is written into ``os.environ`` of
    this process. The bootstrap token variable is never touched.

    Args:
        args: Parsed CLI namespace; reads the boolean ``apply`` flag.

    Returns:
        ``0`` on success (including "no secrets in project"), ``1`` when the
        integration is disabled, the token or project id is missing, or the
        fetch fails.
    """
    # Exception and warning text may contain square brackets that Rich would
    # otherwise interpret as markup tags.
    from rich.markup import escape

    console = Console()
    cfg = load_config()
    bw_cfg = (cfg.get("secrets") or {}).get("bitwarden") or {}
    if not bw_cfg.get("enabled"):
        console.print(
            "[yellow]Bitwarden integration is disabled. Run "
            "`hermes secrets bitwarden setup` first.[/yellow]"
        )
        return 1

    token_env = bw_cfg.get("access_token_env", "BWS_ACCESS_TOKEN")
    token = os.environ.get(token_env, "").strip()
    if not token:
        console.print(f"[red]{token_env} is not set.[/red]")
        return 1

    project_id = bw_cfg.get("project_id", "")
    if not project_id:
        console.print("[red]No project_id configured.[/red]")
        return 1

    try:
        secrets, warnings = bw.fetch_bitwarden_secrets(
            access_token=token,
            project_id=project_id,
            use_cache=False,
        )
    except Exception as exc:  # noqa: BLE001
        console.print(f"[red]Fetch failed: {escape(str(exc))}[/red]")
        return 1

    if not secrets:
        console.print("[yellow]No secrets in project.[/yellow]")
        return 0

    # Fallback is True to match the value the setup wizard writes by default.
    # --apply always overwrites, even when override_existing is off.
    override = bool(bw_cfg.get("override_existing", True)) or args.apply
    table = Table(show_header=True, header_style="bold")
    table.add_column("Name", style="cyan")
    table.add_column("Action")
    applied = 0
    for key in sorted(secrets):
        if key == token_env:
            table.add_row(key, "[dim]skip (bootstrap token)[/dim]")
            continue
        already = bool(os.environ.get(key))
        if already and not override:
            table.add_row(key, "[dim]skip (already set)[/dim]")
            continue
        if args.apply:
            os.environ[key] = secrets[key]
            applied += 1
            table.add_row(key, "[green]exported[/green]" + (" (overrode)" if already else ""))
        else:
            table.add_row(key, "[green]would export[/green]" + (" (overrides)" if already else ""))

    console.print(table)
    for w in warnings:
        console.print(f"[yellow]warning:[/yellow] {escape(str(w))}")

    if not args.apply:
        console.print(
            "\n This was a dry-run — secrets are picked up automatically on the "
            "next [cyan]hermes[/cyan] invocation. Re-run with [cyan]--apply[/cyan] "
            "to export into the current shell instead."
        )
    else:
        console.print(f"\n [green]Exported {applied} secret(s) into current process.[/green]")
    return 0
|
||||
|
||||
|
||||
def cmd_disable(args: argparse.Namespace) -> int:
    """Switch the Bitwarden integration off in the saved config.

    Sets ``secrets.bitwarden.enabled`` to ``False`` (creating the nested
    sections when absent), saves the config, and reminds the user that the
    access token itself is left untouched.

    Args:
        args: Parsed CLI namespace (unused).

    Returns:
        Always ``0``.
    """
    out = Console()

    config = load_config()
    secrets_section = config.setdefault("secrets", {})
    bitwarden_section = secrets_section.setdefault("bitwarden", {})
    bitwarden_section["enabled"] = False
    save_config(config)

    notice = (
        "[green]Disabled.[/green] Bitwarden secrets will NOT be pulled on the next "
        "Hermes invocation.\n"
        " Your access token is left in .env — remove it manually if you also want "
        "to revoke the credential."
    )
    out.print(notice)
    return 0
|
||||
|
||||
|
||||
def cmd_install(args: argparse.Namespace) -> int:
    """Download the pinned ``bws`` binary and print its path and version.

    Args:
        args: Parsed CLI namespace; ``force`` requests a re-download.

    Returns:
        ``0`` when the install succeeds, ``1`` on any failure.
    """
    # Exception text may contain square brackets that Rich would otherwise
    # interpret as markup tags (an unmatched closing tag raises MarkupError).
    from rich.markup import escape

    console = Console()
    try:
        path = bw.install_bws(force=bool(args.force))
        console.print(f"[green]✓[/green] {path} ({_bws_version(path)})")
        return 0
    except Exception as exc:  # noqa: BLE001
        console.print(f"[red]Install failed: {escape(str(exc))}[/red]")
        return 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _yn(b: bool) -> str:
|
||||
return "[green]yes[/green]" if b else "[dim]no[/dim]"
|
||||
|
||||
|
||||
def _bws_version(binary: Path) -> str:
|
||||
try:
|
||||
res = subprocess.run(
|
||||
[str(binary), "--version"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if res.returncode == 0:
|
||||
return (res.stdout or res.stderr).strip().splitlines()[0]
|
||||
except (OSError, subprocess.TimeoutExpired):
|
||||
pass
|
||||
return "version unknown"
|
||||
|
||||
|
||||
def _list_projects(
    binary: Path, token: str, console: Console
) -> Optional[List[dict]]:
    """Call ``bws project list`` and return the parsed list, or None on failure.

    The token is passed to the subprocess through the ``BWS_ACCESS_TOKEN``
    environment variable (not argv), and ``NO_COLOR`` is set unless the caller
    already defined it. Failures are reported on ``console``.

    Args:
        binary: Path to the ``bws`` executable.
        token: Access token used for this call.
        console: Rich console used for error output.

    Returns:
        A list of project dicts that have a truthy ``id``; an empty list when
        the JSON payload is not a list; ``None`` when the command could not
        run, exited non-zero, or printed invalid JSON.
    """
    # stderr / exception text is untrusted and may contain square brackets
    # that Rich would otherwise interpret as markup tags.
    from rich.markup import escape

    env = os.environ.copy()
    env["BWS_ACCESS_TOKEN"] = token
    env.setdefault("NO_COLOR", "1")
    try:
        res = subprocess.run(
            [str(binary), "project", "list", "--output", "json"],
            env=env,
            capture_output=True,
            text=True,
            timeout=15,
        )
    except (OSError, subprocess.TimeoutExpired) as exc:
        console.print(f" [red]Couldn't list projects: {escape(str(exc))}[/red]")
        return None

    if res.returncode != 0:
        # Cap the message so a verbose failure cannot flood the terminal.
        err = (res.stderr or res.stdout).strip()[:300]
        console.print(f" [red]bws project list failed: {escape(err)}[/red]")
        lowered = err.lower()
        if "authorization" in lowered or "invalid" in lowered:
            console.print(
                " [yellow]This usually means the access token is wrong or revoked. "
                "Double-check it in the Bitwarden web app.[/yellow]"
            )
        return None

    try:
        data = json.loads(res.stdout or "[]")
    except json.JSONDecodeError as exc:
        console.print(f" [red]bws returned non-JSON: {escape(str(exc))}[/red]")
        return None
    if not isinstance(data, list):
        return []
    # Drop malformed entries so callers can index ``["id"]`` safely.
    return [p for p in data if isinstance(p, dict) and p.get("id")]
|
||||
@@ -23,6 +23,7 @@ from rich.table import Table
|
||||
# Lazy imports to avoid circular dependencies and slow startup.
|
||||
# tools.skills_hub and tools.skills_guard are imported inside functions.
|
||||
from hermes_constants import display_hermes_home
|
||||
from agent.skill_utils import is_excluded_skill_path
|
||||
|
||||
_console = Console()
|
||||
|
||||
@@ -178,9 +179,12 @@ def _existing_categories() -> List[str]:
|
||||
# top level (no category); otherwise treat as a category bucket.
|
||||
if (entry / "SKILL.md").exists():
|
||||
continue
|
||||
# Has at least one nested SKILL.md?
|
||||
# Has at least one nested SKILL.md (excluding dependency/cache dirs)?
|
||||
try:
|
||||
if any(entry.rglob("SKILL.md")):
|
||||
if any(
|
||||
not is_excluded_skill_path(p)
|
||||
for p in entry.rglob("SKILL.md")
|
||||
):
|
||||
out.append(entry.name)
|
||||
except OSError:
|
||||
continue
|
||||
|
||||
+86
-29
@@ -78,6 +78,7 @@ CONFIGURABLE_TOOLSETS = [
|
||||
("discord_admin", "🛡️ Discord Server Admin", "list channels/roles, pin, assign roles"),
|
||||
("yuanbao", "🤖 Yuanbao", "group info, member queries, DM"),
|
||||
("computer_use", "🖱️ Computer Use (macOS)", "background desktop control via cua-driver"),
|
||||
("app_tools", "🔌 App Integrations (500+)", "Gmail, Slack, GitHub, Jira, Notion, etc. via Nous tool gateway"),
|
||||
]
|
||||
|
||||
# Toolsets that are OFF by default for new installs.
|
||||
@@ -311,6 +312,16 @@ TOOL_CATEGORIES = {
|
||||
"image_gen": {
|
||||
"name": "Image Generation",
|
||||
"icon": "🎨",
|
||||
# Per-provider rows for FAL.ai (`plugins/image_gen/fal`), OpenAI,
|
||||
# OpenAI Codex, and xAI are injected at runtime from each
|
||||
# ``plugins.image_gen.<vendor>`` package via
|
||||
# ``_plugin_image_gen_providers()`` in ``_visible_providers``.
|
||||
# Only non-provider UX setup-flow rows remain here:
|
||||
# - "Nous Subscription" — managed FAL billed via the Nous
|
||||
# subscription (requires_nous_auth + override_env_vars).
|
||||
# Uses the fal plugin as the underlying backend but has a
|
||||
# distinct setup UX.
|
||||
# Mirrors the shape browser/video_gen ship today.
|
||||
"providers": [
|
||||
{
|
||||
"name": "Nous Subscription",
|
||||
@@ -322,15 +333,6 @@ TOOL_CATEGORIES = {
|
||||
"override_env_vars": ["FAL_KEY"],
|
||||
"imagegen_backend": "fal",
|
||||
},
|
||||
{
|
||||
"name": "FAL.ai",
|
||||
"badge": "paid",
|
||||
"tag": "Pick from flux-2-klein, flux-2-pro, gpt-image, nano-banana, etc.",
|
||||
"env_vars": [
|
||||
{"key": "FAL_KEY", "prompt": "FAL API key", "url": "https://fal.ai/dashboard/keys"},
|
||||
],
|
||||
"imagegen_backend": "fal",
|
||||
},
|
||||
],
|
||||
},
|
||||
"video_gen": {
|
||||
@@ -482,6 +484,11 @@ TOOLSET_ENV_REQUIREMENTS = {
|
||||
# ─── Post-Setup Hooks ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _cua_driver_cmd() -> str:
|
||||
"""Return the cua-driver executable name/path, honoring non-empty overrides."""
|
||||
return os.environ.get("HERMES_CUA_DRIVER_CMD", "").strip() or "cua-driver"
|
||||
|
||||
|
||||
def _pip_install(
|
||||
args: List[str],
|
||||
*,
|
||||
@@ -550,6 +557,55 @@ def _pip_install(
|
||||
)
|
||||
|
||||
|
||||
|
||||
def _check_cua_driver_asset_for_arch() -> bool:
|
||||
"""Check whether the latest CUA release ships an asset for this architecture.
|
||||
|
||||
Returns True if the asset likely exists (or if we cannot determine it).
|
||||
Returns False and prints a warning when the asset is confirmed missing,
|
||||
so callers can skip the install attempt and avoid a raw 404.
|
||||
"""
|
||||
import platform as _plat
|
||||
import urllib.request
|
||||
|
||||
machine = _plat.machine() # "x86_64" or "arm64"
|
||||
if machine == "arm64":
|
||||
# arm64 (Apple Silicon) assets are always published.
|
||||
return True
|
||||
|
||||
# x86_64 / Intel — probe the latest release for an architecture-specific
|
||||
# asset before falling through to the upstream installer.
|
||||
api_url = (
|
||||
"https://api.github.com/repos/trycua/cua/releases/latest"
|
||||
)
|
||||
try:
|
||||
req = urllib.request.Request(api_url, headers={"Accept": "application/vnd.github+json"})
|
||||
with urllib.request.urlopen(req, timeout=10) as resp:
|
||||
release = _json.loads(resp.read().decode())
|
||||
tag = release.get("tag_name", "")
|
||||
assets = release.get("assets", [])
|
||||
arch_names = {"x86_64", "amd64"}
|
||||
has_asset = any(
|
||||
any(a in a_info.get("name", "").lower() for a in arch_names)
|
||||
for a_info in assets
|
||||
)
|
||||
if not has_asset:
|
||||
_print_warning(
|
||||
f" Latest CUA release ({tag}) has no Intel (x86_64) asset."
|
||||
)
|
||||
_print_info(
|
||||
" CUA Driver currently only ships Apple Silicon builds."
|
||||
)
|
||||
_print_info(
|
||||
" See: https://github.com/trycua/cua/issues/1493"
|
||||
)
|
||||
return False
|
||||
except Exception:
|
||||
# Network / API failure — proceed and let the installer handle it.
|
||||
pass
|
||||
return True
|
||||
|
||||
|
||||
def install_cua_driver(upgrade: bool = False) -> bool:
|
||||
"""Install or refresh the cua-driver binary used by Computer Use.
|
||||
|
||||
@@ -579,7 +635,8 @@ def install_cua_driver(upgrade: bool = False) -> bool:
|
||||
_print_warning(" Computer Use (cua-driver) is macOS-only; skipping.")
|
||||
return False
|
||||
|
||||
binary = shutil.which("cua-driver")
|
||||
driver_cmd = _cua_driver_cmd()
|
||||
binary = shutil.which(driver_cmd)
|
||||
|
||||
# Not installed → fresh install path (only when caller asked for it).
|
||||
if not binary and not upgrade:
|
||||
@@ -587,18 +644,20 @@ def install_cua_driver(upgrade: bool = False) -> bool:
|
||||
_print_warning(" curl not found — install manually:")
|
||||
_print_info(" https://github.com/trycua/cua/blob/main/libs/cua-driver/README.md")
|
||||
return False
|
||||
if not _check_cua_driver_asset_for_arch():
|
||||
return False
|
||||
return _run_cua_driver_installer(label="Installing")
|
||||
|
||||
# Already installed and caller didn't ask to upgrade → just confirm.
|
||||
if binary and not upgrade:
|
||||
try:
|
||||
version = subprocess.run(
|
||||
["cua-driver", "--version"],
|
||||
[driver_cmd, "--version"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
).stdout.strip()
|
||||
_print_success(f" cua-driver already installed: {version or 'unknown version'}")
|
||||
_print_success(f" {driver_cmd} already installed: {version or 'unknown version'}")
|
||||
except Exception:
|
||||
_print_success(" cua-driver already installed.")
|
||||
_print_success(f" {driver_cmd} already installed.")
|
||||
_print_info(" Grant macOS permissions if not done yet:")
|
||||
_print_info(" System Settings > Privacy & Security > Accessibility")
|
||||
_print_info(" System Settings > Privacy & Security > Screen Recording")
|
||||
@@ -609,11 +668,14 @@ def install_cua_driver(upgrade: bool = False) -> bool:
|
||||
_print_warning(" curl not found — cannot refresh cua-driver.")
|
||||
return bool(binary)
|
||||
|
||||
if not _check_cua_driver_asset_for_arch():
|
||||
return bool(binary)
|
||||
|
||||
if binary:
|
||||
# Show before/after version when we have a baseline. Best-effort.
|
||||
try:
|
||||
before = subprocess.run(
|
||||
["cua-driver", "--version"],
|
||||
[driver_cmd, "--version"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
).stdout.strip()
|
||||
except Exception:
|
||||
@@ -625,13 +687,13 @@ def install_cua_driver(upgrade: bool = False) -> bool:
|
||||
if ok and before:
|
||||
try:
|
||||
after = subprocess.run(
|
||||
["cua-driver", "--version"],
|
||||
[driver_cmd, "--version"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
).stdout.strip()
|
||||
if after and after != before:
|
||||
_print_success(f" cua-driver upgraded: {before} → {after}")
|
||||
_print_success(f" {driver_cmd} upgraded: {before} → {after}")
|
||||
elif after:
|
||||
_print_info(f" cua-driver up to date: {after}")
|
||||
_print_info(f" {driver_cmd} up to date: {after}")
|
||||
except Exception:
|
||||
pass
|
||||
return ok
|
||||
@@ -655,11 +717,12 @@ def _run_cua_driver_installer(label: str = "Installing", verbose: bool = True) -
|
||||
_print_info(f" {label} cua-driver (macOS background computer-use)...")
|
||||
else:
|
||||
_print_info(f" {label} cua-driver...")
|
||||
driver_cmd = _cua_driver_cmd()
|
||||
try:
|
||||
result = subprocess.run(install_cmd, shell=True, timeout=300)
|
||||
if result.returncode == 0 and shutil.which("cua-driver"):
|
||||
if result.returncode == 0 and shutil.which(driver_cmd):
|
||||
if verbose:
|
||||
_print_success(" cua-driver installed.")
|
||||
_print_success(f" {driver_cmd} installed.")
|
||||
_print_info(" IMPORTANT — grant macOS permissions now:")
|
||||
_print_info(" System Settings > Privacy & Security > Accessibility")
|
||||
_print_info(" System Settings > Privacy & Security > Screen Recording")
|
||||
@@ -1506,12 +1569,9 @@ def _plugin_image_gen_providers() -> list[dict]:
|
||||
Each returned dict looks like a regular ``TOOL_CATEGORIES`` provider
|
||||
row but carries an ``image_gen_plugin_name`` marker so downstream
|
||||
code (config writing, model picker) knows to route through the
|
||||
plugin registry instead of the in-tree FAL backend.
|
||||
|
||||
FAL is skipped — it's already exposed by the hardcoded
|
||||
``TOOL_CATEGORIES["image_gen"]`` entries. When FAL gets ported to
|
||||
a plugin in a follow-up PR, the hardcoded entries go away and this
|
||||
function surfaces it alongside OpenAI automatically.
|
||||
plugin registry. Every image-gen backend is a plugin now — there
|
||||
are no hardcoded rows left in ``TOOL_CATEGORIES["image_gen"]`` for
|
||||
this function to dedupe against (see issue #26241).
|
||||
"""
|
||||
try:
|
||||
from agent.image_gen_registry import list_providers
|
||||
@@ -1524,9 +1584,6 @@ def _plugin_image_gen_providers() -> list[dict]:
|
||||
|
||||
rows: list[dict] = []
|
||||
for provider in providers:
|
||||
if getattr(provider, "name", None) == "fal":
|
||||
# FAL has its own hardcoded rows today.
|
||||
continue
|
||||
try:
|
||||
schema = provider.get_setup_schema()
|
||||
except Exception:
|
||||
@@ -1751,7 +1808,7 @@ _POST_SETUP_INSTALLED: dict = {
|
||||
# entry when (a) the post_setup is the ONLY install side-effect for
|
||||
# a no-key provider, and (b) an installed-state check is cheap and
|
||||
# doesn't trigger a heavy import.
|
||||
"cua_driver": lambda: bool(shutil.which("cua-driver")),
|
||||
"cua_driver": lambda: bool(shutil.which(_cua_driver_cmd())),
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -975,11 +975,13 @@ _AUX_TASK_SLOTS: Tuple[str, ...] = (
|
||||
"vision",
|
||||
"web_extract",
|
||||
"compression",
|
||||
"session_search",
|
||||
"skills_hub",
|
||||
"approval",
|
||||
"mcp",
|
||||
"title_generation",
|
||||
"triage_specifier",
|
||||
"kanban_decomposer",
|
||||
"profile_describer",
|
||||
"curator",
|
||||
)
|
||||
|
||||
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 1.2 MiB |
Binary file not shown.
|
After Width: | Height: | Size: 1.9 MiB |
@@ -0,0 +1,121 @@
|
||||
Create a professional infographic following these specifications:
|
||||
|
||||
## Image Specifications
|
||||
|
||||
- **Type**: Infographic
|
||||
- **Layout**: bento-grid
|
||||
- **Style**: retro-pop-grid
|
||||
- **Aspect Ratio**: 1:1 (square)
|
||||
- **Language**: en
|
||||
|
||||
## Core Principles
|
||||
|
||||
- Follow the layout structure precisely for information architecture
|
||||
- Apply style aesthetics consistently throughout
|
||||
- Keep information concise, highlight keywords and core concepts
|
||||
- Use ample whitespace for visual clarity
|
||||
- Maintain clear visual hierarchy
|
||||
|
||||
## Text Requirements
|
||||
|
||||
- All text must match the specified style treatment
|
||||
- Main titles should be prominent and readable
|
||||
- Key concepts should be visually emphasized
|
||||
- Labels should be clear and appropriately sized
|
||||
- Use English for all text content
|
||||
|
||||
## Layout Guidelines (bento-grid)
|
||||
|
||||
- Grid of rectangular cells with varied sizes (1x1, 2x1, 1x2, 2x2)
|
||||
- Hero cell ("ONE TOKEN, EVERY KEY") takes the largest position (top-center or upper-left, 2x2)
|
||||
- Supporting cells around the hero, mixed cell sizes for rhythm
|
||||
- Each cell self-contained with its own title + icon + brief content
|
||||
- Title strip at the top: "BITWARDEN SECRETS MANAGER — HERMES-AGENT PR #30035"
|
||||
- Footer strip at the bottom with commit SHA + repo
|
||||
|
||||
## Style Guidelines (retro-pop-grid)
|
||||
|
||||
- 1970s retro pop art with strict Swiss international grid
|
||||
- Background: warm vintage cream/beige (#F5F0E6)
|
||||
- Accents: salmon pink, sky blue, mustard yellow, mint green — all muted retro tones
|
||||
- Pure solid black (#000000) and solid white (#FFFFFF) for extreme-contrast cells
|
||||
- Uniform thick black outlines on ALL illustrations, text boxes, grid dividers
|
||||
- Pure 2D flat vector aesthetic with subtle screen-print texture
|
||||
- One cell inverted to black-background-with-white-text for the "NEVER BLOCKS STARTUP" warning section
|
||||
- Geometric fill patterns in empty cells: checkerboards, diagonal lines, dot grids
|
||||
- Flat abstract symbols: shields (security), wrenches (install), arrows (rotation), keyholes (auth), checkmarks (tests)
|
||||
- Vintage comic-style smiley face for "26/26 PASSING" cell
|
||||
- Bold brutalist or thick retro display fonts for headers; clean sans-serif body
|
||||
- Decorative stylistic labels acceptable: "WARNING", "NEW DEFAULT", "PINNED", "VERIFIED", "ROTATE"
|
||||
|
||||
## Avoid
|
||||
|
||||
- 3D rendering, gradients, soft shadows, sketch-like lines
|
||||
- Free-floating elements — everything anchored in grid cells
|
||||
- Pure white background — must use warm cream/beige
|
||||
|
||||
---
|
||||
|
||||
Generate the infographic based on the content below:
|
||||
|
||||
### Title (top strip)
|
||||
BITWARDEN SECRETS MANAGER → HERMES-AGENT
|
||||
PR #30035
|
||||
|
||||
### HERO CELL (largest, top-center, salmon pink background with thick black border)
|
||||
ONE TOKEN, EVERY KEY
|
||||
Rotate once in the Bitwarden web app.
|
||||
Every Hermes process picks it up on next start.
|
||||
NEW DEFAULT: override_existing = true
|
||||
|
||||
### Cell — LAZY INSTALL (sky blue background)
|
||||
~/.hermes/bin/bws
|
||||
bws v2.0.0 PINNED
|
||||
SHA-256 VERIFIED
|
||||
No apt · no brew · no sudo
|
||||
Icon: wrench + downward arrow
|
||||
|
||||
### Cell — CLI SURFACE (mustard yellow background, checkerboard accents)
|
||||
$ hermes secrets bitwarden
|
||||
setup wizard
|
||||
status diagnose
|
||||
sync fetch
|
||||
install binary
|
||||
disable off
|
||||
Icon: terminal prompt symbol
|
||||
|
||||
### Cell — SOURCE OF TRUTH (mint green background)
|
||||
BITWARDEN WINS
|
||||
Overwrites stale .env on every start
|
||||
Bootstrap token never overwritten (exception)
|
||||
Icon: keyhole + arrow
|
||||
|
||||
### Cell — INVERTED BLACK CELL with WHITE TEXT — NEVER BLOCKS STARTUP (extreme contrast)
|
||||
NEVER BLOCKS STARTUP
|
||||
Missing binary → warn + continue
|
||||
Bad token → warn + continue
|
||||
Network down → warn + continue
|
||||
Checksum mismatch → refuse + warn
|
||||
30s timeout ceiling
|
||||
Icon: white triangle warning sign
|
||||
|
||||
### Cell — TESTS (cream with thick black outline, vintage comic smiley face)
|
||||
26 / 26
|
||||
HERMETIC
|
||||
subprocess + urllib mocked
|
||||
linux · macos · windows
|
||||
x86_64 · arm64
|
||||
Icon: comic-style smiley face with checkmark
|
||||
|
||||
### Cell — CONFIG YAML (white background with black grid)
|
||||
secrets:
|
||||
bitwarden:
|
||||
enabled: true
|
||||
project_id: ...
|
||||
override_existing: true
|
||||
cache_ttl_seconds: 300
|
||||
auto_install: true
|
||||
|
||||
### Footer strip (bottom, black-on-cream)
|
||||
PR #30035 · commit 7f9b05668 · NousResearch/hermes-agent
|
||||
10 files · +1743 / -1 · agent/secret_sources/ · hermes_cli/secrets_cli.py
|
||||
@@ -0,0 +1,57 @@
|
||||
# Hermes-Agent PR #30035 — Bitwarden Secrets Manager Integration
|
||||
|
||||
## Hero
|
||||
**ONE TOKEN, EVERY KEY**
|
||||
Rotate once. Every Hermes process picks it up on next start.
|
||||
`secrets.bitwarden.override_existing: true` (default)
|
||||
|
||||
## Cells
|
||||
|
||||
### Lazy Install
|
||||
- `bws v2.0.0` pinned
|
||||
- Downloaded into `~/.hermes/bin/bws`
|
||||
- SHA-256 verified vs GitHub Releases checksum file
|
||||
- No apt, no brew, no sudo
|
||||
- Cross-platform: linux gnu+musl, macos universal, windows x86_64+arm64
|
||||
|
||||
### CLI Surface
|
||||
- `hermes secrets bitwarden setup` wizard
|
||||
- `hermes secrets bitwarden status` diagnose
|
||||
- `hermes secrets bitwarden sync` dry-run / --apply
|
||||
- `hermes secrets bitwarden install` binary only
|
||||
- `hermes secrets bitwarden disable` off switch
|
||||
|
||||
### Source of Truth
|
||||
- Bitwarden WINS on every Hermes start
|
||||
- BSM values overwrite stale `.env` lines
|
||||
- Rotate a key once → all your machines reload it
|
||||
- Bootstrap token `BWS_ACCESS_TOKEN` is the lone exception (never overwritten)
|
||||
|
||||
### Never Blocks Startup
|
||||
- Missing binary → warn + continue
|
||||
- Bad token → warn + continue
|
||||
- Checksum mismatch → refuse install + warn
|
||||
- No network → warn + continue
|
||||
- Timeout → 30s ceiling, warn + continue
|
||||
|
||||
### Tests
|
||||
- 26/26 passing, hermetic
|
||||
- subprocess + urllib mocked
|
||||
- Platform matrix tested (linux, macos, windows × x86_64, arm64)
|
||||
- Cache hit/miss, auth fail, non-JSON, timeout, override behavior
|
||||
|
||||
### Config
|
||||
```yaml
|
||||
secrets:
|
||||
bitwarden:
|
||||
enabled: true
|
||||
project_id: <uuid>
|
||||
override_existing: true # NEW DEFAULT
|
||||
cache_ttl_seconds: 300
|
||||
auto_install: true
|
||||
```
|
||||
|
||||
## Footer
|
||||
PR #30035 · commit 7f9b05668 · NousResearch/hermes-agent
|
||||
|
||||
10 files changed · +1743 / -1 · agent/secret_sources/ · hermes_cli/secrets_cli.py · tests · docs
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 2.1 MiB |
Binary file not shown.
|
After Width: | Height: | Size: 1.6 MiB |
Binary file not shown.
|
After Width: | Height: | Size: 1.4 MiB |
@@ -0,0 +1,85 @@
|
||||
Create a professional infographic following these specifications:
|
||||
|
||||
## Image Specifications
|
||||
|
||||
- **Type**: Infographic
|
||||
- **Layout**: bento-grid
|
||||
- **Style**: technical-schematic (engineering blueprint variant)
|
||||
- **Aspect Ratio**: 1:1 (square)
|
||||
- **Language**: English
|
||||
|
||||
## Core Principles
|
||||
|
||||
- Follow the bento-grid layout precisely with varied cell sizes
|
||||
- Apply technical-schematic aesthetics consistently throughout
|
||||
- Keep information concise, highlight keywords and core concepts
|
||||
- Use ample whitespace for visual clarity
|
||||
- Maintain clear visual hierarchy with a hero cell for the headline metric
|
||||
|
||||
## Style Guidelines (technical-schematic blueprint)
|
||||
|
||||
- Color palette: deep blue background (#1E3A5F), white lines and text, amber accent (#F59E0B) ONLY on the hero metric and critical deltas, cyan callouts for measurement annotations
|
||||
- Grid pattern overlay across the entire canvas — fine white grid lines on the deep blue background
|
||||
- All-caps technical stencil typography for headers; clean sans-serif for body
|
||||
- Dimension lines with arrowheads connecting metrics to their cells
|
||||
- Technical symbols where appropriate (gear icons, flow arrows, modular block diagrams)
|
||||
- Consistent stroke weights — bold for cell borders, thin for grid, medium for connector lines
|
||||
- Engineering spec-sheet aesthetic: feels like a printed architectural blueprint, austere and precise
|
||||
|
||||
## Layout Guidelines (bento-grid)
|
||||
|
||||
- Hero cell (TOP-CENTER or LEFT, occupying ~40% of canvas): "−61 COMPLEXITY · 79 → 18" headline metric in massive amber-on-blue, with subtitle "convert_messages_to_anthropic refactored"
|
||||
- 7 helper cells in a 2x4 or 3x3 grid showing each extracted helper as its own modular block — each cell has the helper name in all-caps, its complexity number, and one-line role
|
||||
- Metrics strip cell: BEFORE/AFTER table with deltas (185 statements → ~70, 79 C → 18 C, +5 violations intentional)
|
||||
- Test validation cell: "152/152 + 213/213 PASS" with checkmark stencil
|
||||
- Footer strip across bottom: "PR #27784 · agent/anthropic_adapter.py · @kshitijk4poor · NousResearch/hermes-agent"
|
||||
|
||||
## Content to render
|
||||
|
||||
**Main title (top of canvas, all caps):** "ANTHROPIC ADAPTER · 1-INTO-7 EXTRACTION"
|
||||
**Subtitle:** "PR #27784 — convert_messages_to_anthropic refactor"
|
||||
|
||||
**Hero cell (largest, amber accent):**
|
||||
- "−61"
|
||||
- "CYCLOMATIC COMPLEXITY"
|
||||
- "79 → 18 MAX (−77%)"
|
||||
- Subtext: "convert_messages_to_anthropic · pure code motion · zero behavior change"
|
||||
|
||||
**7 helper cells (one per helper, each its own modular block):**
|
||||
|
||||
1. _convert_assistant_message · C<10 · "Assistant msg → content blocks"
|
||||
2. _convert_tool_message_to_result · C=12 · "Tool msg → tool_result + merge"
|
||||
3. _convert_user_message · C<10 · "User msg validation"
|
||||
4. _strip_orphaned_tool_blocks · C=15 · "Orphan tool_use removal"
|
||||
5. _merge_consecutive_roles · C=13 · "Anthropic role-alternation"
|
||||
6. _manage_thinking_signatures · C=18 · "Strip/preserve by endpoint"
|
||||
7. _evict_old_screenshots · C<10 · "Keep most recent 3 images"
|
||||
|
||||
**Metrics cell (table format with arrows):**
|
||||
- MAX FUNCTION COMPLEXITY: 79 → 18 (−77%)
|
||||
- MAX STATEMENTS/FUNCTION: 185 → ~70 (−62%)
|
||||
- LOC FILE-WIDE: −4
|
||||
- MAIN FUNCTION LOC: 395 → 63
|
||||
|
||||
**Test validation cell (checkmark stencil):**
|
||||
- test_anthropic_adapter.py: 152/152 PASS
|
||||
- test_auxiliary_client.py: 172/172 PASS
|
||||
- test_azure_identity_adapter.py: 39/39 PASS
|
||||
- test_bedrock_1m_context.py: 2/2 PASS
|
||||
|
||||
**Behavior preservation cell:**
|
||||
"ZERO LOGIC CHANGES · ANTHROPIC + KIMI + DEEPSEEK + MINIMAX + AZURE FOUNDRY + BEDROCK SEMANTICS PRESERVED"
|
||||
|
||||
**Footer strip:**
|
||||
"PR #27784 · agent/anthropic_adapter.py · cherry-picked from #23968 · @kshitijk4poor · NousResearch/hermes-agent"
|
||||
|
||||
## Text Requirements
|
||||
|
||||
- All text in English, all-caps for headers
|
||||
- Hero metric "−61" in amber (#F59E0B), oversized, with thick blueprint stencil treatment
|
||||
- Helper names in white technical stencil
|
||||
- Complexity numbers (C=12, C=18, etc.) in cyan callouts
|
||||
- "BEFORE" labels in white-on-blue, "AFTER" labels in amber-on-blue
|
||||
- Footer in small white stencil
|
||||
|
||||
Generate the infographic now as a square engineering blueprint.
|
||||
@@ -0,0 +1,66 @@
|
||||
# Infographic: PR #27784 — convert_messages_to_anthropic refactor
|
||||
|
||||
## Hero metric
|
||||
**−61 cyclomatic complexity** in `agent/anthropic_adapter.py` (79 → 18 max).
|
||||
**−4 LOC** net file-wide. **77% drop** in single-function complexity ceiling.
|
||||
|
||||
## Title
|
||||
ANTHROPIC ADAPTER · 1-INTO-7 EXTRACTION
|
||||
PR #27784 · agent/anthropic_adapter.py · @kshitijk4poor
|
||||
|
||||
## Section 1: BEFORE (left side)
|
||||
**convert_messages_to_anthropic**
|
||||
- 185 statements
|
||||
- 90 branches
|
||||
- Cyclomatic: 79
|
||||
- Did 7 jobs in one function
|
||||
|
||||
Inline responsibilities mixed together:
|
||||
1. Walk + dispatch by role
|
||||
2. Tool-result conversion
|
||||
3. Orphan tool-use stripping
|
||||
4. Same-role merging
|
||||
5. Thinking-signature management
|
||||
6. Screenshot eviction
|
||||
7. Final assembly
|
||||
|
||||
## Section 2: AFTER (right side)
|
||||
**convert_messages_to_anthropic** — now 63 lines, C<10
|
||||
Plus 7 single-responsibility helpers:
|
||||
|
||||
| Helper | C | Role |
|
||||
|---|---|---|
|
||||
| _convert_assistant_message | <10 | Assistant msg → content blocks |
|
||||
| _convert_tool_message_to_result | 12 | Tool msg → tool_result + merge |
|
||||
| _convert_user_message | <10 | User msg validation + conversion |
|
||||
| _strip_orphaned_tool_blocks | 15 | Strip orphan tool_use + tool_result |
|
||||
| _merge_consecutive_roles | 13 | Anthropic role-alternation enforce |
|
||||
| _manage_thinking_signatures | 18 | Strip/preserve/downgrade by endpoint |
|
||||
| _evict_old_screenshots | <10 | Keep most recent 3 images |
|
||||
|
||||
## Section 3: METRICS
|
||||
| Metric | Before | After | Δ |
|
||||
|---|---:|---:|---:|
|
||||
| Max function complexity | 79 | 18 | −77% |
|
||||
| Max statements/function | 185 | ~70 | −62% |
|
||||
| LOC (file-wide) | — | — | **−4** |
|
||||
| C901 violations | 3 | 8 | +5 (intentional split) |
|
||||
|
||||
## Section 4: ZERO BEHAVIOR CHANGE
|
||||
- Pure code motion — no logic edits
|
||||
- Mutating helpers update `result` in place (same as inline)
|
||||
- `_merge_consecutive_roles` returns new list — caller rebinds
|
||||
- Anthropic / Kimi / DeepSeek / MiniMax / Azure Foundry / Bedrock semantics preserved
|
||||
- Thinking-signature handling identical to pre-refactor
|
||||
|
||||
## Section 5: TEST VALIDATION
|
||||
- tests/agent/test_anthropic_adapter.py — **152 / 152 pass**
|
||||
- tests/agent/test_auxiliary_client.py — **172 / 172 pass**
|
||||
- tests/agent/test_azure_identity_adapter.py — **39 / 39 pass**
|
||||
- tests/agent/test_bedrock_1m_context.py — **2 / 2 pass**
|
||||
|
||||
## Footer
|
||||
File: agent/anthropic_adapter.py
|
||||
Original PR: #27784 (cherry-pick of #23968)
|
||||
Salvage commit: 9c102b937 (kshitijk4poor authorship preserved)
|
||||
Repo: NousResearch/hermes-agent
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 1.9 MiB |
+1
-1
@@ -4,7 +4,7 @@ let
|
||||
src = ../ui-tui;
|
||||
npmDeps = pkgs.fetchNpmDeps {
|
||||
inherit src;
|
||||
hash = "sha256-dNL/J4tyQQ7Ji3xfIE5b5Jdi6rQyCFjqYpzLYftJVdc=";
|
||||
hash = "sha256-F6/MzZOWc0zhW9mIfnaY+PrllPvJcsA/OdFdEM+NpLY=";
|
||||
};
|
||||
|
||||
npm = hermesNpmLib.mkNpmPassthru { folder = "ui-tui"; attr = "tui"; pname = "hermes-tui"; };
|
||||
|
||||
+1
-1
@@ -4,7 +4,7 @@ let
|
||||
src = ../web;
|
||||
npmDeps = pkgs.fetchNpmDeps {
|
||||
inherit src;
|
||||
hash = "sha256-GxSmEpclOwmv94KmGMediPITxqXAsxqTEQOoDIbYkUw=";
|
||||
hash = "sha256-xSsyluzU2lNhwGqB6XMCGMv3QFHZizE6hgUyc1jvyOw=";
|
||||
};
|
||||
|
||||
npm = hermesNpmLib.mkNpmPassthru { folder = "web"; attr = "web"; pname = "hermes-web"; };
|
||||
|
||||
@@ -148,7 +148,7 @@ class BrowserUseBrowserProvider(BrowserProvider):
|
||||
|
||||
return {
|
||||
"api_key": managed.nous_user_token,
|
||||
"base_url": managed.gateway_origin.rstrip("/"),
|
||||
"base_url": managed.resolved_origin.rstrip("/"),
|
||||
"managed_mode": True,
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,182 @@
|
||||
"""FAL.ai image generation backend.
|
||||
|
||||
Wraps the 18-model FAL catalog (FLUX 2, Z-Image, Nano Banana, GPT
|
||||
Image 1.5, Recraft, Imagen 4, Qwen, Ideogram, …) as an
|
||||
:class:`ImageGenProvider` implementation.
|
||||
|
||||
The heavy lifting — model catalog, payload construction, request
|
||||
submission, managed-Nous-gateway selection, Clarity Upscaler chaining
|
||||
— lives in :mod:`tools.image_generation_tool`. This plugin reaches into
|
||||
that module via call-time indirection (``import tools.image_generation_tool as _it``)
|
||||
so:
|
||||
|
||||
* the existing test suite (``tests/tools/test_image_generation.py``,
|
||||
``tests/tools/test_managed_media_gateways.py``) keeps patching
|
||||
``image_tool._submit_fal_request`` / ``image_tool.fal_client`` /
|
||||
``image_tool._managed_fal_client`` without modification, and
|
||||
* there's exactly one canonical FAL code path on disk — the plugin is a
|
||||
registration adapter, not a parallel implementation.
|
||||
|
||||
See issue #26241 for the migration plan and the
|
||||
``plugin-extraction-test-patch-compatibility.md`` rules this follows.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from agent.image_gen_provider import (
|
||||
DEFAULT_ASPECT_RATIO,
|
||||
ImageGenProvider,
|
||||
resolve_aspect_ratio,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FalImageGenProvider(ImageGenProvider):
    """FAL.ai image generation backend.

    Delegates to ``tools.image_generation_tool.image_generate_tool`` so
    the in-tree FAL implementation (model catalog, payload builder,
    managed-gateway selection, Clarity Upscaler chaining) is the single
    source of truth. Everything is resolved at call time via the
    ``_it`` indirection so tests can monkey-patch the legacy module.
    """

    @property
    def name(self) -> str:
        """Registry identifier for this provider (``"fal"``)."""
        return "fal"

    @property
    def display_name(self) -> str:
        """Human-readable provider label (``"FAL.ai"``)."""
        return "FAL.ai"

    def is_available(self) -> bool:
        """Return True when the legacy module reports usable FAL credentials.

        Available when direct FAL_KEY is set OR the managed Nous gateway
        resolves a fal-queue origin. Both checks come from the legacy
        module so this provider tracks whatever logic ships there.

        Never raises: any failure, including a failure to import the
        legacy module, is reported as "not available".
        """
        try:
            # The import sits inside the guard on purpose: an ImportError
            # from the legacy module must not escape into the picker.
            import tools.image_generation_tool as _it

            return bool(_it.check_fal_api_key())
        except Exception:  # noqa: BLE001 — defensive; never break the picker
            return False

    def list_models(self) -> List[Dict[str, Any]]:
        """Return one summary dict per entry in the legacy ``FAL_MODELS`` map.

        Each dict carries ``id``, ``display``, ``speed``, ``strengths`` and
        ``price``; ``display`` falls back to the model id and the other
        fields fall back to an empty string when the catalog omits them.
        """
        import tools.image_generation_tool as _it

        return [
            {
                "id": model_id,
                "display": meta.get("display", model_id),
                "speed": meta.get("speed", ""),
                "strengths": meta.get("strengths", ""),
                "price": meta.get("price", ""),
            }
            for model_id, meta in _it.FAL_MODELS.items()
        ]

    def default_model(self) -> Optional[str]:
        """Return the legacy module's ``DEFAULT_MODEL``."""
        import tools.image_generation_tool as _it

        return _it.DEFAULT_MODEL

    def get_setup_schema(self) -> Dict[str, Any]:
        """Return the static setup descriptor (label, badge, required env var)."""
        return {
            "name": "FAL.ai",
            "badge": "paid",
            "tag": "Pick from flux-2-klein, flux-2-pro, gpt-image, nano-banana, etc.",
            "env_vars": [
                {
                    "key": "FAL_KEY",
                    "prompt": "FAL API key",
                    "url": "https://fal.ai/dashboard/keys",
                },
            ],
        }

    def generate(
        self,
        prompt: str,
        aspect_ratio: str = DEFAULT_ASPECT_RATIO,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Generate an image via the legacy FAL pipeline.

        Forwards prompt + aspect_ratio (and any forward-compat extras
        the schema supports) into :func:`tools.image_generation_tool.image_generate_tool`,
        then reshapes its JSON-string response into the provider-ABC
        dict format consumed by ``_dispatch_to_plugin_provider``.

        Args:
            prompt: Text prompt forwarded unchanged to the legacy tool.
            aspect_ratio: Requested ratio; normalised with
                ``resolve_aspect_ratio`` before forwarding.
            **kwargs: Only ``num_inference_steps``, ``guidance_scale``,
                ``num_images``, ``output_format`` and ``seed`` are forwarded,
                and only when present and not ``None``. Other keys are ignored.

        Returns:
            A response dict. Exceptions from the legacy tool, undecodable
            JSON and non-dict payloads are all converted into
            ``{"success": False, ...}`` dicts carrying an ``error_type``;
            this method does not raise for those cases.
        """
        import tools.image_generation_tool as _it

        aspect = resolve_aspect_ratio(aspect_ratio)
        passthrough = {
            key: kwargs[key]
            for key in (
                "num_inference_steps",
                "guidance_scale",
                "num_images",
                "output_format",
                "seed",
            )
            if key in kwargs and kwargs[key] is not None
        }

        try:
            raw = _it.image_generate_tool(
                prompt=prompt,
                aspect_ratio=aspect,
                **passthrough,
            )
        except Exception as exc:  # noqa: BLE001 — never raise out of generate
            logger.warning("FAL image_generate_tool raised: %s", exc, exc_info=True)
            return {
                "success": False,
                "image": None,
                "error": f"FAL image generation failed: {exc}",
                "error_type": type(exc).__name__,
                "provider": "fal",
                "prompt": prompt,
                "aspect_ratio": aspect,
            }

        try:
            response = json.loads(raw) if isinstance(raw, str) else raw
        except Exception:  # noqa: BLE001
            # Same failure shape as the non-dict case below, so callers can
            # rely on ``error_type`` being present on every failure.
            response = {
                "success": False,
                "image": None,
                "error": "Invalid JSON from FAL pipeline",
                "error_type": "provider_contract",
            }

        if not isinstance(response, dict):
            response = {
                "success": False,
                "image": None,
                "error": "FAL pipeline returned a non-dict response",
                "error_type": "provider_contract",
            }

        # Stamp provider/prompt/aspect_ratio so downstream consumers see
        # the uniform shape declared in ``agent.image_gen_provider``.
        response.setdefault("provider", "fal")
        response.setdefault("prompt", prompt)
        response.setdefault("aspect_ratio", aspect)
        # Annotate model best-effort — the legacy pipeline resolves it
        # internally, so query it after the fact for the response shape.
        if "model" not in response:
            try:
                model_id, _meta = _it._resolve_fal_model()
                response["model"] = model_id
            except Exception:  # noqa: BLE001
                # Best-effort only: a response without "model" is still valid.
                pass
        return response
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Plugin entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def register(ctx) -> None:
    """Plugin entry point: register a fresh ``FalImageGenProvider`` on ``ctx``.

    Builds one provider instance and hands it to
    ``ctx.register_image_gen_provider``.
    """
    provider = FalImageGenProvider()
    ctx.register_image_gen_provider(provider)
|
||||
@@ -0,0 +1,7 @@
|
||||
name: fal
|
||||
version: 1.0.0
|
||||
description: "FAL.ai image generation backend (flux-2-klein, flux-2-pro, nano-banana, gpt-image-1.5, recraft-v3, etc.)."
|
||||
author: NousResearch
|
||||
kind: backend
|
||||
requires_env:
|
||||
- FAL_KEY
|
||||
@@ -47,6 +47,25 @@ _DEFAULT_ENDPOINT = "http://127.0.0.1:1933"
|
||||
_TIMEOUT = 30.0
|
||||
_REMOTE_RESOURCE_PREFIXES = ("http://", "https://", "git@", "ssh://", "git://")
|
||||
|
||||
# Maps the viking_remember `category` enum to a viking:// subdirectory.
|
||||
# Keep in sync with REMEMBER_SCHEMA.parameters.properties.category.enum.
|
||||
_CATEGORY_SUBDIR_MAP = {
|
||||
"preference": "preferences",
|
||||
"entity": "entities",
|
||||
"event": "events",
|
||||
"case": "cases",
|
||||
"pattern": "patterns",
|
||||
}
|
||||
_DEFAULT_MEMORY_SUBDIR = "preferences"
|
||||
|
||||
# Maps the built-in memory tool's `target` ("user" vs "memory") to a subdir
|
||||
# for on_memory_write mirroring. User profile facts → preferences; agent
|
||||
# notes / observations → patterns. Anything unknown falls back to the default.
|
||||
_MEMORY_WRITE_TARGET_SUBDIR_MAP = {
|
||||
"user": "preferences",
|
||||
"memory": "patterns",
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Process-level atexit safety net — ensures pending sessions are committed
|
||||
@@ -607,24 +626,35 @@ class OpenVikingMemoryProvider(MemoryProvider):
|
||||
except Exception as e:
|
||||
logger.warning("OpenViking session commit failed: %s", e)
|
||||
|
||||
def on_memory_write(self, action: str, target: str, content: str) -> None:
|
||||
"""Mirror built-in memory writes to OpenViking as explicit memories."""
|
||||
def _build_memory_uri(self, subdir: str) -> str:
    """Build a viking:// memory URI under the configured user/subdir.

    The result has the form
    ``viking://user/<self._user>/memories/<subdir>/mem_<slug>.md``.
    A new slug is generated on every call, so repeated calls with the
    same ``subdir`` return different URIs.
    """
    # First 12 hex characters of a random UUID4 serve as the file slug.
    slug = uuid.uuid4().hex[:12]
    return f"viking://user/{self._user}/memories/{subdir}/mem_{slug}.md"
|
||||
|
||||
def on_memory_write(
|
||||
self,
|
||||
action: str,
|
||||
target: str,
|
||||
content: str,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
) -> None:
|
||||
"""Mirror built-in memory writes to OpenViking via content/write."""
|
||||
if not self._client or action != "add" or not content:
|
||||
return
|
||||
|
||||
subdir = _MEMORY_WRITE_TARGET_SUBDIR_MAP.get(target, _DEFAULT_MEMORY_SUBDIR)
|
||||
uri = self._build_memory_uri(subdir)
|
||||
|
||||
def _write():
|
||||
try:
|
||||
client = _VikingClient(
|
||||
self._endpoint, self._api_key,
|
||||
account=self._account, user=self._user, agent=self._agent,
|
||||
)
|
||||
# Add as a user message with memory context so the commit
|
||||
# picks it up as an explicit memory during extraction
|
||||
client.post(f"/api/v1/sessions/{self._session_id}/messages", {
|
||||
"role": "user",
|
||||
"parts": [
|
||||
{"type": "text", "text": f"[Memory note — {target}] {content}"},
|
||||
],
|
||||
client.post("/api/v1/content/write", {
|
||||
"uri": uri,
|
||||
"content": content,
|
||||
"mode": "create",
|
||||
})
|
||||
except Exception as e:
|
||||
logger.debug("OpenViking memory mirror failed: %s", e)
|
||||
@@ -858,24 +888,27 @@ class OpenVikingMemoryProvider(MemoryProvider):
|
||||
if not content:
|
||||
return tool_error("content is required")
|
||||
|
||||
# Store as a session message that will be extracted during commit.
|
||||
# The category hint helps OpenViking's extraction classify correctly.
|
||||
category = args.get("category", "")
|
||||
text = f"[Remember] {content}"
|
||||
if category:
|
||||
text = f"[Remember — {category}] {content}"
|
||||
subdir = _CATEGORY_SUBDIR_MAP.get(category, _DEFAULT_MEMORY_SUBDIR)
|
||||
uri = self._build_memory_uri(subdir)
|
||||
|
||||
self._client.post(f"/api/v1/sessions/{self._session_id}/messages", {
|
||||
"role": "user",
|
||||
"parts": [
|
||||
{"type": "text", "text": text},
|
||||
],
|
||||
})
|
||||
|
||||
return json.dumps({
|
||||
"status": "stored",
|
||||
"message": "Memory recorded. Will be extracted and indexed on session commit.",
|
||||
})
|
||||
# Write directly via content/write API.
|
||||
# This creates the file, stores the content, and queues vector indexing
|
||||
# in a single call — no dependency on session commit / VLM extraction.
|
||||
try:
|
||||
result = self._client.post("/api/v1/content/write", {
|
||||
"uri": uri,
|
||||
"content": content,
|
||||
"mode": "create",
|
||||
})
|
||||
written = result.get("result", {}).get("written_bytes", 0)
|
||||
return json.dumps({
|
||||
"status": "stored",
|
||||
"message": f"Memory stored ({written}b) and queued for vector indexing.",
|
||||
})
|
||||
except Exception as e:
|
||||
logger.error("OpenViking content/write failed: %s", e)
|
||||
return tool_error(f"Failed to store memory: {e}")
|
||||
|
||||
def _tool_add_resource(self, args: dict) -> str:
|
||||
url = args.get("url", "")
|
||||
|
||||
@@ -282,20 +282,24 @@ def _build_payload(
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# fal_client lazy import (same pattern as image_generation_tool)
|
||||
# fal_client lazy import (shared with image_generation_tool via fal_common)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_fal_client: Any = None
|
||||
|
||||
|
||||
def _load_fal_client() -> Any:
|
||||
"""Lazy-load the ``fal_client`` SDK and cache it on this module.
|
||||
|
||||
Delegates the actual import to :func:`tools.fal_common.import_fal_client`
|
||||
so the ``lazy_deps`` ensure-install handling stays in one place.
|
||||
"""
|
||||
global _fal_client
|
||||
if _fal_client is not None:
|
||||
return _fal_client
|
||||
import fal_client # type: ignore
|
||||
|
||||
_fal_client = fal_client
|
||||
return fal_client
|
||||
from tools.fal_common import import_fal_client
|
||||
_fal_client = import_fal_client()
|
||||
return _fal_client
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -238,7 +238,7 @@ def _get_firecrawl_client() -> Any:
|
||||
|
||||
kwargs = {
|
||||
"api_key": managed_gateway.nous_user_token,
|
||||
"api_url": managed_gateway.gateway_origin,
|
||||
"api_url": managed_gateway.resolved_origin,
|
||||
}
|
||||
client_config = (
|
||||
"tool-gateway",
|
||||
|
||||
+7
-11
@@ -84,7 +84,7 @@ modal = ["modal==1.3.4"]
|
||||
daytona = ["daytona==0.155.0"]
|
||||
vercel = ["vercel==0.5.7"]
|
||||
hindsight = ["hindsight-client==0.6.1"]
|
||||
dev = ["debugpy==1.8.20", "pytest==9.0.2", "pytest-asyncio==1.3.0", "pytest-xdist==3.8.0", "pytest-split==0.11.0", "pytest-timeout==2.4.0", "mcp==1.26.0", "ty==0.0.21", "ruff==0.15.10"]
|
||||
dev = ["debugpy==1.8.20", "pytest==9.0.2", "pytest-asyncio==1.3.0", "pytest-timeout==2.4.0", "mcp==1.26.0", "ty==0.0.21", "ruff==0.15.10"]
|
||||
messaging = ["python-telegram-bot[webhooks]==22.6", "discord.py[voice]==2.7.1", "aiohttp==3.13.3", "brotlicffi==1.2.0.1", "slack-bolt==1.27.0", "slack-sdk==3.40.1", "qrcode==7.4.2"]
|
||||
cron = [] # croniter is now a core dependency; this extra kept for back-compat
|
||||
slack = ["slack-bolt==1.27.0", "slack-sdk==3.40.1", "aiohttp==3.13.3"]
|
||||
@@ -232,16 +232,12 @@ markers = [
|
||||
"integration: marks tests requiring external services (API keys, Modal, etc.)",
|
||||
"real_concurrent_gate: opt out of the autouse stub that disables _detect_concurrent_hermes_instances",
|
||||
]
|
||||
# pytest-timeout: per-test 60s hard cap with thread method.
|
||||
# Discovered May 2026: the suite reliably hangs at ~96% on full runs even
|
||||
# though every individual test completes in <30s. Root cause is leaked
|
||||
# threads / atexit handlers accumulating across thousands of tests until
|
||||
# something deadlocks at session teardown. Adding pytest-timeout (with
|
||||
# thread method, which forces an interrupt into the test thread) breaks
|
||||
# the deadlock — the suite then completes cleanly. The 60s cap is large
|
||||
# enough that no legitimate test trips it; if a test exceeds it that's a
|
||||
# real bug worth surfacing as a Timeout failure.
|
||||
addopts = "-m 'not integration' -n auto --timeout=30 --timeout-method=signal"
|
||||
# pytest-timeout: per-test 30s hard cap with signal method.
|
||||
# This is the fallback inside each per-file pytest subprocess (see
|
||||
# scripts/run_tests_parallel.py). Per-file isolation gives every test
|
||||
# file a fresh Python interpreter; pytest-timeout catches Python-level
|
||||
# hangs within a file.
|
||||
addopts = "-m 'not integration' --timeout=30 --timeout-method=signal"
|
||||
|
||||
[tool.ty.environment]
|
||||
python-version = "3.13"
|
||||
|
||||
@@ -3357,6 +3357,25 @@ class AIAgent:
|
||||
return content
|
||||
|
||||
if self._model_supports_vision():
|
||||
# Vision-capable on paper — but if we've already learned in this
|
||||
# session that the active (provider, model) rejects list-type
|
||||
# tool content (e.g. Xiaomi MiMo's 400 "text is not set"),
|
||||
# short-circuit to a text summary so we don't burn another
|
||||
# round-trip relearning the same lesson. Cache populated by
|
||||
# the 400 recovery path in agent.conversation_loop. Transient
|
||||
# per-session; next session retries.
|
||||
key = (
|
||||
(getattr(self, "provider", "") or "").strip().lower(),
|
||||
(getattr(self, "model", "") or "").strip(),
|
||||
)
|
||||
no_list = getattr(self, "_no_list_tool_content_models", None)
|
||||
if no_list and key in no_list:
|
||||
logger.debug(
|
||||
"Tool %s: model %s/%s known to reject list-type tool "
|
||||
"content this session — sending text summary",
|
||||
tool_name, key[0], key[1],
|
||||
)
|
||||
return _multimodal_text_summary(result)
|
||||
return content
|
||||
|
||||
summary = _multimodal_text_summary(result)
|
||||
@@ -3385,6 +3404,80 @@ class AIAgent:
|
||||
from agent.conversation_compression import try_shrink_image_parts_in_messages
|
||||
return try_shrink_image_parts_in_messages(api_messages)
|
||||
|
||||
def _try_strip_image_parts_from_tool_messages(self, api_messages: list) -> bool:
|
||||
"""Downgrade list-type tool messages to text summaries in-place.
|
||||
|
||||
Recovery path for providers that reject list-type tool message content
|
||||
(e.g. Xiaomi MiMo's 400 "text is not set"; see issue #27344). Walks
|
||||
``api_messages`` for any ``role: "tool"`` message whose ``content`` is
|
||||
a list containing image parts, replaces the content with the existing
|
||||
text part(s) (or a minimal placeholder if none survive), and records
|
||||
the active (provider, model) in ``self._no_list_tool_content_models``
|
||||
so subsequent ``_tool_result_content_for_active_model`` calls in this
|
||||
session preemptively downgrade screenshots without a round-trip.
|
||||
|
||||
Returns True when at least one tool message was downgraded — the
|
||||
caller (the 400 recovery branch in ``agent.conversation_loop``) uses
|
||||
this to decide whether to retry the API call with the modified
|
||||
history or surface the original error.
|
||||
"""
|
||||
if not isinstance(api_messages, list):
|
||||
return False
|
||||
|
||||
# Record (provider, model) so we don't relearn this lesson.
|
||||
key = (
|
||||
(getattr(self, "provider", "") or "").strip().lower(),
|
||||
(getattr(self, "model", "") or "").strip(),
|
||||
)
|
||||
if not hasattr(self, "_no_list_tool_content_models"):
|
||||
self._no_list_tool_content_models = set()
|
||||
if key[1]: # only record when we actually have a model id
|
||||
self._no_list_tool_content_models.add(key)
|
||||
|
||||
changed = False
|
||||
for msg in api_messages:
|
||||
if not isinstance(msg, dict) or msg.get("role") != "tool":
|
||||
continue
|
||||
content = msg.get("content")
|
||||
if not isinstance(content, list):
|
||||
continue
|
||||
|
||||
# Salvage any text parts so the model still sees some signal.
|
||||
text_parts: List[str] = []
|
||||
had_image = False
|
||||
for part in content:
|
||||
if not isinstance(part, dict):
|
||||
if isinstance(part, str) and part.strip():
|
||||
text_parts.append(part.strip())
|
||||
continue
|
||||
ptype = part.get("type")
|
||||
if ptype == "image_url" or ptype == "input_image":
|
||||
had_image = True
|
||||
continue
|
||||
if ptype in {"text", "input_text"}:
|
||||
text = str(part.get("text") or "").strip()
|
||||
if text:
|
||||
text_parts.append(text)
|
||||
|
||||
if not had_image:
|
||||
# List-type content but no image parts — leave alone (some
|
||||
# providers reject ANY list content, but stripping a
|
||||
# text-only list doesn't reduce ambiguity; let the caller
|
||||
# surface the original error if this turns out to be the
|
||||
# case).
|
||||
continue
|
||||
|
||||
if text_parts:
|
||||
msg["content"] = "\n\n".join(text_parts)
|
||||
else:
|
||||
msg["content"] = (
|
||||
"[image content removed — provider does not accept "
|
||||
"list-type tool message content]"
|
||||
)
|
||||
changed = True
|
||||
|
||||
return changed
|
||||
|
||||
def _anthropic_preserve_dots(self) -> bool:
|
||||
"""True when using an anthropic-compatible endpoint that preserves dots in model names.
|
||||
Alibaba/DashScope keeps dots (e.g. qwen3.5-plus).
|
||||
|
||||
@@ -47,7 +47,9 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json"
|
||||
AUTHOR_MAP = {
|
||||
# teknium (multiple emails)
|
||||
"teknium1@gmail.com": "teknium1",
|
||||
"cipherframe@users.noreply.github.com": "CipherFrame",
|
||||
"me@promplate.dev": "CNSeniorious000",
|
||||
"yichengqiao21@gmail.com": "YarrowQiao",
|
||||
"erhanyasarx@gmail.com": "erhnysr",
|
||||
"30366221+WorldWriter@users.noreply.github.com": "WorldWriter",
|
||||
"dafeng@DafengdeMacBook-Pro.local": "WorldWriter",
|
||||
@@ -58,13 +60,18 @@ AUTHOR_MAP = {
|
||||
"mgongzai@gmail.com": "vKongv",
|
||||
"0x.badfriend@gmail.com": "discodirector",
|
||||
"altriatree@gmail.com": "TruaShamu",
|
||||
"contact-me@stark-x.cn": "Stark-X",
|
||||
"nat@nthrow.io": "nthrow",
|
||||
"m@mobrienv.dev": "mikeyobrien",
|
||||
"saeed919@pm.me": "falasi",
|
||||
"chrisdlc119@outlook.com": "chdlc",
|
||||
"omar@techdeveloper.site": "nycomar",
|
||||
"qiyin.zuo@pcitc.com": "qiyin-code",
|
||||
"mr.aashiz@gmail.com": "aashizpoudel",
|
||||
"70629228+shaun0927@users.noreply.github.com": "shaun0927",
|
||||
"98262967+Bihruze@users.noreply.github.com": "Bihruze",
|
||||
"189280367+Lempkey@users.noreply.github.com": "Lempkey",
|
||||
"leovillalbajr@gmail.com": "Lempkey",
|
||||
"nidhi2894@gmail.com": "nidhi-singh02",
|
||||
"30312689+aashizpoudel@users.noreply.github.com": "aashizpoudel",
|
||||
"oleksii.lisikh@gmail.com": "olisikh",
|
||||
@@ -928,6 +935,8 @@ AUTHOR_MAP = {
|
||||
"holynn@placeholder.local": "holynn-q",
|
||||
"agent@hermes.local": "jacdevos",
|
||||
"sunsky.lau@gmail.com": "liuhao1024",
|
||||
"fabianoeq@gmail.com": "rodrigoeqnit",
|
||||
"178342791+sgtworkman@users.noreply.github.com": "sgtworkman",
|
||||
"qiuqfang98@qq.com": "keepcalmqqf",
|
||||
"261867348+ai-ag2026@users.noreply.github.com": "ai-ag2026",
|
||||
"yanzh.su@gmail.com": "YanzhongSu",
|
||||
|
||||
+40
-96
@@ -3,29 +3,36 @@
|
||||
# `pytest` directly to guarantee your local run matches CI behavior.
|
||||
#
|
||||
# What this script enforces:
|
||||
# * -n 4 xdist workers (CI has 4 cores; -n auto diverges locally)
|
||||
# * Per-file isolation via scripts/run_tests_parallel.py — each test
|
||||
# file runs in its own freshly-spawned `python -m pytest <file>`
|
||||
# subprocess. No xdist, no shared workers, no module-level leakage
|
||||
# between files.
|
||||
# * TZ=UTC, LANG=C.UTF-8, PYTHONHASHSEED=0 (deterministic)
|
||||
# * Credential env vars blanked (conftest.py also does this, but this
|
||||
# is belt-and-suspenders for anyone running `pytest` outside of
|
||||
# our conftest path — e.g. calling pytest on a single file)
|
||||
# * Proper venv activation
|
||||
# * Credential and HERMES_* env vars blanked (conftest.py also does this, but this
|
||||
# is belt-and-suspenders for anyone running pytest outside our
|
||||
# conftest path — e.g. on a single file)
|
||||
# * Proper venv activation (probes .venv, venv, then ~/.hermes/...)
|
||||
#
|
||||
# Usage:
|
||||
# scripts/run_tests.sh # full suite
|
||||
# scripts/run_tests.sh tests/agent/ # one directory
|
||||
# scripts/run_tests.sh tests/agent/test_foo.py::TestClass::test_method
|
||||
# scripts/run_tests.sh --tb=long -v # pass-through pytest args
|
||||
# scripts/run_tests.sh # full suite
|
||||
# scripts/run_tests.sh -j 4 # cap parallelism
|
||||
# scripts/run_tests.sh tests/agent/ # discover only here
|
||||
# scripts/run_tests.sh tests/agent/ tests/acp/ # multiple roots
|
||||
# scripts/run_tests.sh tests/foo.py # single file
|
||||
# scripts/run_tests.sh tests/foo.py -- --tb=long # path + pytest args
|
||||
# scripts/run_tests.sh -- -v --tb=long # pytest args only
|
||||
#
|
||||
# Everything after a literal '--' is passed through to each per-file
|
||||
# pytest invocation. Positional path arguments before '--' override
|
||||
# the default discovery root (tests/).
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# ── Locate repo root ────────────────────────────────────────────────────────
|
||||
# Works whether this is the main checkout or a worktree.
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
# ── Activate venv ───────────────────────────────────────────────────────────
|
||||
# Prefer a .venv in the current tree, fall back to the main checkout's venv
|
||||
# (useful for worktrees where we don't always duplicate the venv).
|
||||
VENV=""
|
||||
for candidate in "$REPO_ROOT/.venv" "$REPO_ROOT/venv" "$HOME/.hermes/hermes-agent/venv"; do
|
||||
if [ -f "$candidate/bin/activate" ]; then
|
||||
@@ -41,94 +48,31 @@ fi
|
||||
|
||||
PYTHON="$VENV/bin/python"
|
||||
|
||||
# ── Ensure pytest-split is installed (required for shard-equivalent runs) ──
|
||||
if ! "$PYTHON" -c "import pytest_split" 2>/dev/null; then
|
||||
echo "→ installing pytest-split into $VENV"
|
||||
if command -v uv >/dev/null 2>&1; then
|
||||
uv pip install --python "$PYTHON" --quiet "pytest-split>=0.9,<1"
|
||||
elif "$PYTHON" -m pip --version >/dev/null 2>&1; then
|
||||
"$PYTHON" -m pip install --quiet "pytest-split>=0.9,<1"
|
||||
else
|
||||
echo "error: neither uv nor pip is available in $VENV — pytest-split is missing" >&2
|
||||
echo " fix: run uv pip install -e \".[dev]\" from $REPO_ROOT" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# ── Hermetic environment ────────────────────────────────────────────────────
|
||||
# Mirror what CI does in .github/workflows/tests.yml + what conftest.py does.
|
||||
# Unset every credential-shaped var currently in the environment.
|
||||
while IFS='=' read -r name _; do
|
||||
case "$name" in
|
||||
*_API_KEY|*_TOKEN|*_SECRET|*_PASSWORD|*_CREDENTIALS|*_ACCESS_KEY| \
|
||||
*_SECRET_ACCESS_KEY|*_PRIVATE_KEY|*_OAUTH_TOKEN|*_WEBHOOK_SECRET| \
|
||||
*_ENCRYPT_KEY|*_APP_SECRET|*_CLIENT_SECRET|*_CORP_SECRET|*_AES_KEY| \
|
||||
AWS_ACCESS_KEY_ID|AWS_SECRET_ACCESS_KEY|AWS_SESSION_TOKEN|FAL_KEY| \
|
||||
GH_TOKEN|GITHUB_TOKEN)
|
||||
unset "$name"
|
||||
;;
|
||||
esac
|
||||
done < <(env)
|
||||
|
||||
# Unset HERMES_* behavioral vars too.
|
||||
unset HERMES_YOLO_MODE HERMES_INTERACTIVE HERMES_QUIET HERMES_TOOL_PROGRESS \
|
||||
HERMES_TOOL_PROGRESS_MODE HERMES_MAX_ITERATIONS HERMES_SESSION_PLATFORM \
|
||||
HERMES_SESSION_CHAT_ID HERMES_SESSION_CHAT_NAME HERMES_SESSION_THREAD_ID \
|
||||
HERMES_SESSION_SOURCE HERMES_SESSION_KEY HERMES_GATEWAY_SESSION \
|
||||
HERMES_CRON_SESSION \
|
||||
HERMES_PLATFORM HERMES_INFERENCE_PROVIDER HERMES_MANAGED HERMES_DEV \
|
||||
HERMES_CONTAINER HERMES_EPHEMERAL_SYSTEM_PROMPT HERMES_TIMEZONE \
|
||||
HERMES_REDACT_SECRETS HERMES_BACKGROUND_NOTIFICATIONS HERMES_EXEC_ASK \
|
||||
HERMES_HOME_MODE 2>/dev/null || true
|
||||
|
||||
# Pin deterministic runtime.
|
||||
export TZ=UTC
|
||||
export LANG=C.UTF-8
|
||||
export LC_ALL=C.UTF-8
|
||||
export PYTHONHASHSEED=0
|
||||
|
||||
# ── Live-gateway test guard (developer machines) ────────────────────────────
|
||||
# If a system-wide hermes pytest_live_guard plugin is installed at
|
||||
# $HOME/.hermes/pytest_live_guard.py, force-load it here so every test run
|
||||
# from this script gets the protection regardless of which worktree is
|
||||
# checked out (in-tree tests/conftest.py guard may be missing on stale
|
||||
# branches). Harmless on CI / fresh machines that don't have the file.
|
||||
# ── Live-gateway plugin (computed before we drop env) ───────────────────────
|
||||
EXTRA_PYTHONPATH=""
|
||||
EXTRA_PYTEST_PLUGINS=""
|
||||
if [ -f "$HOME/.hermes/pytest_live_guard.py" ]; then
|
||||
case ":${PYTHONPATH:-}:" in
|
||||
*":$HOME/.hermes:"*) ;;
|
||||
*) export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$HOME/.hermes" ;;
|
||||
esac
|
||||
if [[ ",${PYTEST_PLUGINS:-}," != *,pytest_live_guard,* ]]; then
|
||||
export PYTEST_PLUGINS="${PYTEST_PLUGINS:+$PYTEST_PLUGINS,}pytest_live_guard"
|
||||
fi
|
||||
EXTRA_PYTHONPATH="$HOME/.hermes"
|
||||
EXTRA_PYTEST_PLUGINS="pytest_live_guard"
|
||||
fi
|
||||
|
||||
# ── Worker count ────────────────────────────────────────────────────────────
|
||||
# CI uses `-n auto` on ubuntu-latest which gives 4 workers. A 20-core
|
||||
# workstation with `-n auto` gets 20 workers and exposes test-ordering
|
||||
# flakes that CI will never see. Pin to 4 so local matches CI.
|
||||
WORKERS="${HERMES_TEST_WORKERS:-4}"
|
||||
|
||||
# ── Run pytest ──────────────────────────────────────────────────────────────
|
||||
# ── Run in hermetic env ──────────────────────────────────────────────────────
|
||||
# env -i: start with empty environment, opt-in only what we need.
|
||||
# No credential var can leak — you'd have to explicitly add it here.
|
||||
echo "▶ running per-file parallel test suite via run_tests_parallel.py"
|
||||
echo " (TZ=UTC LANG=C.UTF-8 PYTHONHASHSEED=0; clean env)"
|
||||
|
||||
cd "$REPO_ROOT"
|
||||
|
||||
# If the first argument starts with `-` treat all args as pytest flags;
|
||||
# otherwise treat them as test paths.
|
||||
ARGS=("$@")
|
||||
|
||||
echo "▶ running pytest with $WORKERS workers, hermetic env, in $REPO_ROOT"
|
||||
echo " (TZ=UTC LANG=C.UTF-8 PYTHONHASHSEED=0; all credential env vars unset)"
|
||||
|
||||
# -o "addopts=" clears pyproject.toml's `-n auto` so our -n wins.
|
||||
# We re-add --timeout/--timeout-method here because pyproject.toml's
|
||||
# addopts is wiped above. The 60s cap is essential: see pyproject.toml
|
||||
# for why (suite deadlocks at session teardown without it).
|
||||
exec "$PYTHON" -m pytest \
|
||||
-o "addopts=" \
|
||||
-n "$WORKERS" \
|
||||
--timeout=30 \
|
||||
--timeout-method=signal \
|
||||
--ignore=tests/integration \
|
||||
--ignore=tests/e2e \
|
||||
-m "not integration" \
|
||||
"${ARGS[@]}"
|
||||
exec env -i \
|
||||
PATH="$PATH" \
|
||||
HOME="$HOME" \
|
||||
TZ=UTC \
|
||||
LANG=C.UTF-8 \
|
||||
LC_ALL=C.UTF-8 \
|
||||
PYTHONHASHSEED=0 \
|
||||
${EXTRA_PYTHONPATH:+PYTHONPATH="$EXTRA_PYTHONPATH"} \
|
||||
${EXTRA_PYTEST_PLUGINS:+PYTEST_PLUGINS="$EXTRA_PYTEST_PLUGINS"} \
|
||||
"$PYTHON" "$SCRIPT_DIR/run_tests_parallel.py" "$@"
|
||||
|
||||
Executable
+650
@@ -0,0 +1,650 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Per-file parallel test runner.
|
||||
|
||||
The minimum-viable replacement for pytest-xdist + a subprocess-isolation
|
||||
plugin. Discovers test files under ``tests/`` (excluding integration/e2e
|
||||
unless explicitly requested), then runs one ``python -m pytest <file>``
|
||||
subprocess per file, with bounded parallelism (default: ``2 * os.cpu_count()``).
|
||||
|
||||
Why per-file rather than per-test?
|
||||
Per-test spawn overhead (~250ms × 17k tests = 70min CPU minimum)
|
||||
swamped the actual work. Per-file spawn (~250ms × ~850 files = ~3.5min)
|
||||
fits in the budget while still giving every file a fresh Python
|
||||
interpreter — the only isolation boundary that actually matters
|
||||
(cross-file module-level state leakage was the original flake source;
|
||||
intra-file state is the test author's responsibility).
|
||||
|
||||
Why drop xdist entirely?
|
||||
xdist's persistent workers accumulate state across files, which is
|
||||
exactly the leakage we wanted to fix. xdist also adds complexity
|
||||
(loadfile vs loadscope, --max-worker-restart, internal control plane)
|
||||
that we don't need when the unit of work is "run pytest on one file".
|
||||
A subprocess.Popen pool gated by a semaphore is ~60 lines and does
|
||||
the job.
|
||||
|
||||
Usage:
|
||||
python scripts/run_tests_parallel.py [-j N] [PATH ...] [-- pytest_args...]
|
||||
|
||||
Common pytest args pass through after a literal ``--`` (e.g. ``-v``, ``-x``, ``--tb=long``,
|
||||
``-k 'pattern'``, ``--lf``).
|
||||
|
||||
Environment:
|
||||
HERMES_TEST_WORKERS Override worker count (default: 2 * os.cpu_count())
|
||||
HERMES_TEST_PATHS Override discovery roots (colon-sep, default: 'tests')
|
||||
|
||||
Exit code: 0 if every file's pytest exited 0; 1 otherwise.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, Future
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
|
||||
# Default test discovery roots.
|
||||
_DEFAULT_ROOTS = ["tests"]
|
||||
|
||||
# Directories to skip during discovery — the e2e + integration suites
|
||||
# require real services and are run separately. Match exactly the
|
||||
# ``--ignore=`` flags the previous CI command used.
|
||||
_SKIP_PARTS = {"integration", "e2e"}
|
||||
|
||||
# Per-file wall-clock cap. Generous default — pytest-timeout still
|
||||
# enforces per-test caps inside each subprocess; this is just an outer
|
||||
# safety net so a single hung file can't stall the whole suite. Override
|
||||
# via --file-timeout or HERMES_TEST_FILE_TIMEOUT.
|
||||
_DEFAULT_FILE_TIMEOUT_SECONDS = 600.0 # 10 minutes
|
||||
|
||||
|
||||
def _count_tests(
    files: List[Path], repo_root: Path, pytest_passthrough: List[str]
) -> dict[Path, int]:
    """Run ``pytest --co -q`` once to count individual tests per file.

    This is a single subprocess call whose output feeds the discovery
    announcement (total test count) and the per-file progress lines.

    ``--ignore`` flags are added for every ``_SKIP_PARTS`` directory that
    exists under a ``_DEFAULT_ROOTS`` entry, so pytest's own collection
    machinery (conftest walking, directory traversal) doesn't pull in tests
    we intend to skip.

    Args:
        files: Test files to collect; passed to pytest as positional paths.
        repo_root: Working directory for the collection run. Paths printed
            by pytest are joined onto it to form the keys of the result.
        pytest_passthrough: Extra pytest arguments appended to the command,
            so collection applies the same filters as the real runs.

    Returns:
        A mapping ``{file_path: test_count}``. Files with zero collected
        tests are omitted (not an error — e.g. the file only defines
        fixtures / conftest helpers). An empty dict is returned when the
        collection subprocess times out (120s) or cannot be started.
    """
    # Build --ignore flags for skipped dirs so the --co collection mirrors
    # what we'll actually run.
    ignore_args: List[str] = []
    for root in (repo_root / p for p in _DEFAULT_ROOTS):
        for part in sorted(_SKIP_PARTS):  # sorted: deterministic command line
            skipped_dir = root / part
            if skipped_dir.is_dir():
                ignore_args.extend(["--ignore", str(skipped_dir)])

    cmd = [
        sys.executable, "-m", "pytest",
        "--co", "-q",
        *ignore_args,
        *[str(f) for f in files],
        *pytest_passthrough,
    ]
    try:
        result = subprocess.run(
            cmd,
            cwd=repo_root,
            capture_output=True,
            text=True,
            timeout=120,
        )
    except (subprocess.TimeoutExpired, OSError):
        # Counting is cosmetic; never let it block the actual test run.
        return {}

    counts: dict[Path, int] = {}
    for line in result.stdout.splitlines():
        if "::" in line:
            # Node-id line: tests/acp/test_auth.py::TestClass::test_name
            key = repo_root / line.split("::", 1)[0]
            counts[key] = counts.get(key, 0) + 1
            continue
        # NOTE(review): when the passthrough args add another ``-q`` the
        # collection effectively runs with ``-qq``, where pytest is expected
        # to print one ``path/to/test_file.py: N`` line per file instead of
        # node ids — confirm against the pinned pytest version.
        file_part, sep, count_text = line.rpartition(": ")
        count_text = count_text.strip()
        if sep and file_part.endswith(".py") and count_text.isdecimal():
            key = repo_root / file_part
            counts[key] = counts.get(key, 0) + int(count_text)

    return counts
|
||||
|
||||
|
||||
def _discover_files(roots: List[Path]) -> List[Path]:
|
||||
"""Return every ``test_*.py`` under the given roots (sorted).
|
||||
|
||||
Roots may be directories (recursed for ``test_*.py``) or explicit
|
||||
``.py`` files (included as-is, even if they don't match the
|
||||
``test_*`` prefix — caller knows what they want).
|
||||
|
||||
Exclude any file whose path contains a component in ``_SKIP_PARTS``,
|
||||
UNLESS the user explicitly named it as a root (in which case the
|
||||
user's intent overrides the skip filter).
|
||||
"""
|
||||
seen: set[Path] = set()
|
||||
out: List[Path] = []
|
||||
for root in roots:
|
||||
if not root.exists():
|
||||
continue
|
||||
if root.is_file():
|
||||
# Explicit file: include it as-is, skip the _SKIP_PARTS filter
|
||||
# since the user named it directly.
|
||||
real = root.resolve()
|
||||
if real not in seen:
|
||||
seen.add(real)
|
||||
out.append(root)
|
||||
continue
|
||||
for path in root.rglob("test_*.py"):
|
||||
if any(part in _SKIP_PARTS for part in path.parts):
|
||||
continue
|
||||
real = path.resolve()
|
||||
if real in seen:
|
||||
continue
|
||||
seen.add(real)
|
||||
out.append(path)
|
||||
return sorted(out)
|
||||
|
||||
|
||||
def _kill_tree(proc: "subprocess.Popen", pgid: int | None = None) -> None:
|
||||
"""Kill the pytest subprocess and every descendant it spawned.
|
||||
|
||||
A test run can spin up uvicorn servers, async runtimes, or other
|
||||
long-running grandchildren that survive the pytest subprocess exit
|
||||
if we don't kill the whole tree. ``subprocess.Popen.kill()`` only
|
||||
targets the immediate child; grandchildren reparent to PID 1
|
||||
(Linux) / get adopted by services.exe (Windows) and leak.
|
||||
|
||||
POSIX: the caller must pass ``pgid`` — the process group id captured
|
||||
immediately after Popen (via ``os.getpgid(proc.pid)``). We can't
|
||||
look it up here in the happy path because by the time we get
|
||||
called the leader process has already been reaped and its pid is
|
||||
gone from the kernel's process table, even though descendants in
|
||||
the group are still alive. SIGKILL'ing the captured pgid takes out
|
||||
everything in that group atomically.
|
||||
|
||||
Windows: ``taskkill /F /T /PID`` walks the recorded ppid chain and
|
||||
terminates the whole tree, even when the root has already exited.
|
||||
|
||||
Why not psutil: psutil walks the parent-child tree, but in the
|
||||
happy path the root has already been reaped so ``psutil.Process(pid)``
|
||||
can't find it; grandchildren reparented to PID 1 are also
|
||||
unreachable by tree walk at that point. The platform-native
|
||||
primitives (process groups / taskkill) handle both cases correctly
|
||||
without an extra abstraction layer.
|
||||
"""
|
||||
if proc.pid is None:
|
||||
return
|
||||
|
||||
if sys.platform == "win32":
|
||||
try:
|
||||
|
||||
subprocess.run(
|
||||
["taskkill", "/F", "/T", "/PID", str(proc.pid)],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
timeout=10,
|
||||
) # windows-footgun: ok
|
||||
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
|
||||
pass
|
||||
else:
|
||||
# POSIX: kill the captured pgid. Local-import signal so the
|
||||
# SIGKILL attribute is never referenced on Windows.
|
||||
if pgid is not None:
|
||||
try:
|
||||
import signal as _signal
|
||||
os.killpg(pgid, _signal.SIGKILL) # windows-footgun: ok
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
pass
|
||||
|
||||
# Belt-and-suspenders: ensure subprocess.communicate() sees the exit.
|
||||
try:
|
||||
proc.kill()
|
||||
except (ProcessLookupError, OSError):
|
||||
pass
|
||||
|
||||
|
||||
def _run_one_file(
    file: Path,
    pytest_args: List[str],
    repo_root: Path,
    file_timeout: float,
) -> Tuple[Path, int, str, dict[str, int]]:
    """Run ``python -m pytest <file> <pytest_args>`` in a fresh subprocess.

    Args:
        file: The test file to run.
        pytest_args: Extra arguments appended after the file path.
        repo_root: Working directory for the subprocess.
        file_timeout: Wall-clock cap in seconds for the whole file.

    Returns:
        ``(file, returncode, captured_combined_output, summary_counts)``.
        stdout and stderr are captured together. ``summary_counts`` is the
        result of ``_parse_pytest_summary(output)``: the per-outcome test
        counts scraped from pytest's final summary line (possibly empty).

    pytest exit codes (https://docs.pytest.org/en/stable/reference/exit-codes.html):
        0 = all tests passed
        1 = some tests failed
        2 = test execution interrupted
        3 = internal error
        4 = pytest CLI usage error
        5 = no tests collected

    Exit 5 is reported as 0: it just means every test in the file was
    skipped or filtered by a marker (e.g. ``-m 'not integration'`` on a
    file where every test is marked integration). That's intentional and
    not a failure mode.

    On per-file timeout the returncode is 124 and the output is prefixed
    with a timeout notice. On timeout, on any other exception during
    ``communicate()`` (which is re-raised), and also after a normal exit,
    the whole process group / process tree is killed so grandchildren
    (uvicorn servers, async runtimes, etc.) are not left running. The
    pytest-timeout plugin enforces per-test timeouts inside the
    subprocess; this outer timeout exists only to bound a pathologically
    slow or hung file as a whole.
    """
    cmd = [sys.executable, "-m", "pytest", str(file), *pytest_args]
    proc = subprocess.Popen(
        cmd,
        cwd=repo_root,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        # POSIX: setsid() in the child, making it the leader of its own
        # session and process group so _kill_tree can SIGKILL the group.
        # This flag is POSIX-only; on Windows _kill_tree relies on
        # ``taskkill /F /T`` instead.
        start_new_session=True,
    )

    # Capture the pgid NOW, before the leader can exit and be reaped.
    # Once the leader is reaped, os.getpgid(proc.pid) raises
    # ProcessLookupError even though grandchildren in that group may still
    # be alive — defeating the whole cleanup. Stays None on Windows, where
    # _kill_tree does not use a pgid.
    pgid: int | None = None
    if sys.platform != "win32":
        try:
            pgid = os.getpgid(proc.pid)
        except (ProcessLookupError, PermissionError):
            # Child already gone; _kill_tree then only calls proc.kill().
            pgid = None

    try:
        output, _ = proc.communicate(timeout=file_timeout)
        rc = proc.returncode
    except subprocess.TimeoutExpired:
        _kill_tree(proc, pgid=pgid)
        # Drain whatever the child wrote before we killed it so we have
        # something to surface in the failure dump.
        try:
            output, _ = proc.communicate(timeout=10)
        except subprocess.TimeoutExpired:
            output = "(file timeout exceeded; output unavailable)"
        rc = 124  # de facto convention for "killed by timeout".
        output = (
            f"(per-file timeout: {file_timeout:.0f}s exceeded; "
            f"process tree SIGKILL'd)\n{output}"
        )
    except BaseException:
        # KeyboardInterrupt / runner crash — make sure no grandchildren
        # outlive us, then propagate.
        _kill_tree(proc, pgid=pgid)
        raise
    else:
        # Happy path: pytest exited on its own. A well-behaved child has
        # already cleaned up its grandchildren, but well-behaved is not
        # universal — kill the group anyway.
        _kill_tree(proc, pgid=pgid)

    if rc == 5:
        # No tests collected — every test in the file was filtered out.
        # Report it as a pass.
        rc = 0
    summary = _parse_pytest_summary(output)
    return file, rc, output, summary
|
||||
|
||||
|
||||
def _parse_pytest_summary(output: str) -> dict[str, int]:
|
||||
"""Extract per-file test pass/fail/skip counts from pytest output.
|
||||
|
||||
pytest prints a summary line like ``12 passed, 3 skipped, 1 failed in 2.1s``
|
||||
as the last non-empty line before the short test summary. We scrape that
|
||||
line for the individual counts so the progress display can show test-level
|
||||
granularity instead of just file-level pass/fail.
|
||||
|
||||
Returns a dict with keys ``passed``, ``failed``, ``skipped``, ``errors``,
|
||||
``xfailed``, ``xpassed`` (only keys found in the output are present).
|
||||
"""
|
||||
import re
|
||||
|
||||
result: dict[str, int] = {}
|
||||
# Walk backwards from the end — the summary line is always near the tail.
|
||||
for line in reversed(output.splitlines()):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
# Match "N passed", "N failed", "N skipped", "N errors", "N xfailed", "N xpassed"
|
||||
for m in re.finditer(r"(\d+)\s+(passed|failed|skipped|errors|xfailed|xpassed)", line):
|
||||
result[m.group(2)] = int(m.group(1))
|
||||
# Also match "N error" (singular — pytest uses this sometimes).
|
||||
for m in re.finditer(r"(\d+)\s+error\b", line):
|
||||
result.setdefault("errors", result.get("errors", 0) + int(m.group(1)))
|
||||
if result:
|
||||
# Found the counts line — done.
|
||||
break
|
||||
# Stop at the short test summary header (if any) — everything above
|
||||
# that is individual failure details, not the counts line.
|
||||
if line.startswith("FAILED") or line.startswith("SHORT TEST SUMMARY"):
|
||||
break
|
||||
return result
|
||||
|
||||
|
||||
def _format_file(file: Path, repo_root: Path) -> str:
|
||||
"""Render a test-file path for display: strip the repo-root prefix
|
||||
when possible so output reads ``tests/acp/test_auth.py`` instead of
|
||||
``/home/runner/work/hermes-agent/hermes-agent/tests/acp/test_auth.py``.
|
||||
|
||||
Falls back to the absolute path for anything outside the repo root.
|
||||
"""
|
||||
try:
|
||||
return str(file.resolve().relative_to(repo_root.resolve()))
|
||||
except ValueError:
|
||||
return str(file)
|
||||
|
||||
|
||||
def _print_progress(
    tests_done: int,
    total_tests: int,
    file: Path,
    rc: int,
    dur: float,
    repo_root: Path,
    tests_passed: int,
    tests_failed: int,
    test_counts: dict[Path, int],
    file_summary: dict[str, int] | None = None,
) -> None:
    """Print one live progress line for a finished test file.

    Layout: ``[ pct% | done/total | ✓passed | ✗failed] <status> <file> (<counts>, <dur>s)``.

    Args:
        tests_done: Tests accounted for so far (from the collection counts).
        total_tests: Total collected tests; 0 renders the percentage as 0.
        file: The file that just finished.
        rc: Its exit code; 0 is shown as ✓, anything else as ✗.
        dur: Duration in seconds shown for the file.
        repo_root: Used to shorten the displayed path.
        tests_passed: Running total of passed tests.
        tests_failed: Running total of failed tests.
        test_counts: Collected tests per file; used for the parenthetical
            when no ``file_summary`` is available.
        file_summary: Outcome counts parsed from the file's pytest output.
            When non-empty, the parenthetical shows those individual
            counts instead of just the collected total.
    """
    status = "✓" if rc == 0 else "✗"
    pct = (tests_done / total_tests * 100) if total_tests else 0
    # Pad both running counters to the digit width of their sum.
    fw = len(str(tests_passed + tests_failed))

    # Build the per-file test count string.
    if file_summary:
        # (summary key, display suffix) in display order; xfailed/xpassed
        # are rare and shown only if present, like every other outcome.
        outcome_suffixes = (
            ("passed", "✓"),
            ("failed", "✗"),
            ("skipped", "s"),
            ("errors", "e"),
            ("xfailed", "xf"),
            ("xpassed", "xp"),
        )
        parts = [
            f"{file_summary[key]}{suffix}"
            for key, suffix in outcome_suffixes
            if file_summary.get(key, 0)
        ]
        test_str = " ".join(parts) + ", " if parts else ""
    else:
        n_tests = test_counts.get(file, 0)
        test_str = f"{n_tests} tests, " if n_tests else ""

    msg = (
        f"[{pct:5.1f}% | {tests_done:>5}/{total_tests}"
        f" | ✓{tests_passed:>{fw}} | ✗{tests_failed:>{fw}}] "
        f"{status} {_format_file(file, repo_root)} ({test_str}{dur:.1f}s)"
    )
    # Truncate to terminal width if available so lines don't wrap.
    try:
        cols = os.get_terminal_size().columns
        # cols > 1 guards against terminals reporting 0/1 columns, where
        # the slice below would otherwise produce a nonsensical result.
        if cols > 1 and len(msg) > cols:
            msg = msg[: cols - 1] + "…"
    except OSError:
        # No terminal attached (pipe / CI log): print the full line.
        pass
    print(msg, flush=True)
|
||||
|
||||
|
||||
def _print_inline_failure(
    file: Path, output: str, repo_root: Path, pytest_passthrough: List[str]
) -> None:
    """Print a compact failure summary immediately when a file fails.

    Shows the last 30 lines of the pytest output and a ready-to-run repro
    command inside a box, so the developer doesn't have to wait for the
    full run to finish before seeing what broke.

    Args:
        file: The failing test file.
        output: Captured pytest output for that file.
        repo_root: Used to shorten the displayed path.
        pytest_passthrough: The extra pytest args the file was run with;
            appended to the repro command.
    """
    import shlex  # local: not imported at file level

    rel = _format_file(file, repo_root)
    # Build a repro command the developer can copy-paste. shlex.join quotes
    # arguments containing spaces or shell metacharacters (e.g. the
    # expression of ``-k 'a and b'``), which a plain " ".join would mangle.
    repro = shlex.join(["python", "-m", "pytest", rel, *pytest_passthrough])

    # Grab just the tail of the pytest output — typically the FAILED
    # summary + short test info.
    tail_lines = output.rstrip().splitlines()[-30:]

    print(flush=True)
    print(f" ╔╍ Failed: {rel} ╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍", flush=True)
    for line in tail_lines:
        print(f" ║ {line}", flush=True)
    print(" ║", flush=True)
    print(f" ║ Repro: {repro}", flush=True)
    print(" ╚╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍", flush=True)
    print(flush=True)
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: discover test files and run them in parallel.

    Arguments before a literal ``--`` are parsed here (``-j``, ``--paths``,
    ``--include-integration``, ``--file-timeout``, positional paths);
    everything after it is passed verbatim to each per-file pytest run.
    Positional paths, when given, take precedence over ``--paths``. All
    roots are resolved relative to the repository root (the parent of this
    script's directory).

    Returns:
        0 if every file's pytest run exited 0 (or 5, "no tests collected");
        1 if any file failed or no test files were discovered.
    """
    global _SKIP_PARTS  # noqa: PLW0603 — config knob (--include-integration)

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "-j",
        "--jobs",
        type=int,
        default=int(os.environ.get("HERMES_TEST_WORKERS") or (os.cpu_count() or 4) * 2),
        help="Parallel worker count (default: $HERMES_TEST_WORKERS or cpu_count*2)",
    )
    parser.add_argument(
        "--paths",
        default=os.environ.get("HERMES_TEST_PATHS", ":".join(_DEFAULT_ROOTS)),
        help="Colon-separated discovery roots (default: 'tests')",
    )
    parser.add_argument(
        "--include-integration",
        action="store_true",
        help="Don't skip integration/ e2e/ during discovery",
    )
    parser.add_argument(
        "--file-timeout",
        type=float,
        default=float(
            os.environ.get("HERMES_TEST_FILE_TIMEOUT", _DEFAULT_FILE_TIMEOUT_SECONDS)
        ),
        help=(
            "Per-file wall-clock cap in seconds. On timeout, the pytest "
            "subprocess and its full process tree are SIGKILL'd. "
            "Default: 600 (10 min), env: HERMES_TEST_FILE_TIMEOUT."
        ),
    )
    parser.add_argument(
        "paths_positional",
        nargs="*",
        metavar="PATH",
        help=(
            "Restrict discovery to these paths (directories or .py files). "
            "Mutually exclusive with --paths. Anything after a literal '--' "
            "separator is passed through to each per-file pytest invocation."
        ),
    )
    # Manually split argv on the first '--' so positional paths and pytest
    # passthrough args don't fight over each other: a nargs="*" positional
    # would otherwise swallow what follows the separator.
    argv = sys.argv[1:]
    if "--" in argv:
        sep = argv.index("--")
        our_args, pytest_passthrough = argv[:sep], argv[sep + 1 :]
    else:
        our_args, pytest_passthrough = argv, []
    args = parser.parse_args(our_args)
    if args.jobs < 1:
        # ThreadPoolExecutor raises a bare ValueError for max_workers <= 0;
        # report it as a normal usage error instead.
        parser.error("-j/--jobs must be >= 1")

    repo_root = Path(__file__).resolve().parent.parent

    # Resolve discovery roots: positional path args override --paths if any
    # were supplied, otherwise --paths (which itself defaults to 'tests').
    if args.paths_positional:
        # Positionals can be directories OR explicit .py files;
        # _discover_files handles both.
        roots = [repo_root / p for p in args.paths_positional]
    else:
        roots = [repo_root / p for p in args.paths.split(":") if p]

    if args.include_integration:
        # Caller takes responsibility — typically used via explicit -k filter.
        _SKIP_PARTS = set()

    files = _discover_files(roots)
    if not files:
        print(f"No test files discovered under {[str(r) for r in roots]}", file=sys.stderr)
        return 1

    # Count individual tests per file via a single pytest --co pass.
    test_counts = _count_tests(files, repo_root, pytest_passthrough)
    total_tests = sum(test_counts.values())

    print(
        f"Discovered {len(files)} test files ({total_tests} tests) under "
        f"{[str(r.relative_to(repo_root)) if r.is_relative_to(repo_root) else str(r) for r in roots]}; "
        f"running with -j {args.jobs}",
        flush=True,
    )

    # Output is captured per file and printed on completion (out-of-order
    # is fine — keeps the terminal clean rather than interleaving N
    # parallel pytest outputs).
    failures: List[Tuple[Path, str, Dict[str, int]]] = []
    durations: Dict[Path, float] = {}
    started = time.monotonic()
    files_done = 0
    tests_done = 0
    pass_count = 0
    fail_count = 0
    tests_passed = 0
    tests_failed = 0
    lock = threading.Lock()

    def _timed_run(file: Path) -> Tuple[Path, int, str, dict[str, int]]:
        # Time the run inside the worker so the reported duration excludes
        # the time the file spent queued waiting for a free worker.
        t0 = time.monotonic()
        try:
            return _run_one_file(file, pytest_passthrough, repo_root, args.file_timeout)
        finally:
            durations[file] = time.monotonic() - t0

    def _on_done(file: Path, fut: "Future[Tuple[Path, int, str, dict[str, int]]]") -> None:
        nonlocal files_done, tests_done, pass_count, fail_count, tests_passed, tests_failed
        n_tests = test_counts.get(file, 0)
        dur = durations.pop(file, 0.0)
        try:
            fpath, rc, output, summary = fut.result()
        except Exception as exc:  # noqa: BLE001 — must always advance counter
            # Counters and printing share one lock so that progress lines
            # show a consistent snapshot and never interleave.
            with lock:
                files_done += 1
                tests_done += n_tests
                fail_count += 1
                failures.append((file, f"runner crashed: {exc!r}", {}))
                _print_progress(
                    tests_done, total_tests, file, 1,
                    dur,
                    repo_root, tests_passed, tests_failed,
                    test_counts,
                )
            return
        with lock:
            files_done += 1
            tests_done += n_tests
            # Accumulate test-level counts from the parsed summary.
            tests_passed += summary.get("passed", 0)
            tests_failed += summary.get("failed", 0)
            if rc == 0:
                pass_count += 1
            else:
                fail_count += 1
                failures.append((fpath, output, summary))
            _print_progress(
                tests_done, total_tests, fpath, rc,
                dur,
                repo_root, tests_passed, tests_failed,
                test_counts,
                file_summary=summary,
            )
            if rc != 0:
                # Printed under the lock so the multi-line box is not
                # split by another worker's progress line.
                _print_inline_failure(fpath, output, repo_root, pytest_passthrough)

    # Leaving the ``with`` block waits for all submitted work (and for the
    # done-callbacks, which run on the worker threads) before continuing.
    with ThreadPoolExecutor(max_workers=args.jobs) as pool:
        for file in files:
            fut = pool.submit(_timed_run, file)
            fut.add_done_callback(lambda f, file=file: _on_done(file, f))

    elapsed = time.monotonic() - started
    print()
    pct = (tests_done / total_tests * 100) if total_tests else 0
    print(f"=== Summary: {len(files)} files, {tests_passed} tests passed, {tests_failed} failed ({pct:.0f}% complete) in {elapsed:.1f}s ({args.jobs} workers) ===")

    if not failures:
        return 0

    print()
    print("=== Failure output ===")
    for file, output, _summary in failures:
        print()
        print(f"--- {_format_file(file, repo_root)} ---")
        print(output.rstrip())
    print()

    # Split: files with actual test failures vs non-zero exit for other reasons.
    test_fail_files = [(f, s) for f, _o, s in failures if s.get("failed", 0) > 0]
    all_passed_but_nonzero = [
        (f, s) for f, _o, s in failures
        if s.get("failed", 0) == 0 and s.get("passed", 0) > 0
    ]
    no_tests_ran = [
        (f, s) for f, _o, s in failures
        if s.get("failed", 0) == 0 and s.get("passed", 0) == 0
    ]
    if test_fail_files:
        total_tf = sum(s.get("failed", 0) for _, s in test_fail_files)
        print(f"=== {len(test_fail_files)} file{'s' if len(test_fail_files) != 1 else ''} with test failures ({total_tf} test{'s' if total_tf != 1 else ''} failed) ===")
        for file, s in test_fail_files:
            nf = s.get("failed", 0)
            print(f" {_format_file(file, repo_root)} ({nf} test{'s' if nf != 1 else ''} failed)")
    if all_passed_but_nonzero:
        print(f"=== {len(all_passed_but_nonzero)} file{'s' if len(all_passed_but_nonzero) != 1 else ''} where all tests passed but pytest exited non-zero (warnings-as-errors, hook failures, etc.) ===")
        for file, s in all_passed_but_nonzero:
            print(f" {_format_file(file, repo_root)} ({s.get('passed', 0)} passed)")
    if no_tests_ran:
        print(f"=== {len(no_tests_ran)} file{'s' if len(no_tests_ran) != 1 else ''} where no tests ran (collection/import error, timeout before collection, etc.) ===")
        for file, s in no_tests_ran:
            print(f" {_format_file(file, repo_root)}")
    return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -40,6 +40,16 @@ def _clean_env(monkeypatch):
|
||||
"ANTHROPIC_API_KEY", "ANTHROPIC_TOKEN", "CLAUDE_CODE_OAUTH_TOKEN",
|
||||
):
|
||||
monkeypatch.delenv(key, raising=False)
|
||||
# Module-level unhealthy cache (10-min TTL) leaks between tests;
|
||||
# earlier tests that call _mark_provider_unhealthy() poison the
|
||||
# cache for later ones, causing _resolve_auto to skip providers
|
||||
# that the test patched to return valid clients.
|
||||
import agent.auxiliary_client as _aux_mod
|
||||
_aux_mod._aux_unhealthy_until.clear()
|
||||
_aux_mod._aux_unhealthy_logged_at.clear()
|
||||
yield
|
||||
_aux_mod._aux_unhealthy_until.clear()
|
||||
_aux_mod._aux_unhealthy_logged_at.clear()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -461,6 +471,17 @@ class TestExpiredCodexFallback:
|
||||
import base64
|
||||
import time as _time
|
||||
|
||||
# Belt-and-suspenders: _try_openrouter marks openrouter unhealthy
|
||||
# when OPENROUTER_API_KEY is absent (which the preceding test in
|
||||
# this class exercises). The file-level _clean_env autouse fixture
|
||||
# clears the cache, but fixture ordering with the conftest
|
||||
# _hermetic_environment autouse can leave a narrow window where
|
||||
# the mark reappears. Explicitly clear here so this test is
|
||||
# independent of run order.
|
||||
import agent.auxiliary_client as _aux_mod
|
||||
_aux_mod._aux_unhealthy_until.clear()
|
||||
_aux_mod._aux_unhealthy_logged_at.clear()
|
||||
|
||||
header = base64.urlsafe_b64encode(b'{"alg":"RS256","typ":"JWT"}').rstrip(b"=").decode()
|
||||
payload_data = json.dumps({"exp": int(_time.time()) - 3600}).encode()
|
||||
payload = base64.urlsafe_b64encode(payload_data).rstrip(b"=").decode()
|
||||
@@ -1047,6 +1068,20 @@ class TestGetProviderChain:
|
||||
class TestTryPaymentFallback:
|
||||
"""_try_payment_fallback skips the failed provider and tries alternatives."""
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _clear_unhealthy_cache(self):
|
||||
"""Earlier tests in this file call _mark_provider_unhealthy() which
|
||||
pollutes the module-level ``_aux_unhealthy_until`` dict (10-min TTL).
|
||||
Without this cleanup the fallback chain skips providers we've patched
|
||||
to return valid clients — the patched function is never called.
|
||||
"""
|
||||
from agent.auxiliary_client import _aux_unhealthy_until, _aux_unhealthy_logged_at
|
||||
_aux_unhealthy_until.clear()
|
||||
_aux_unhealthy_logged_at.clear()
|
||||
yield
|
||||
_aux_unhealthy_until.clear()
|
||||
_aux_unhealthy_logged_at.clear()
|
||||
|
||||
def test_skips_failed_provider(self):
|
||||
mock_client = MagicMock()
|
||||
with patch("agent.auxiliary_client._try_openrouter", return_value=(None, None)), \
|
||||
|
||||
@@ -0,0 +1,93 @@
|
||||
from types import SimpleNamespace
|
||||
|
||||
from agent.agent_init import _merge_custom_provider_extra_body
|
||||
|
||||
|
||||
def test_custom_provider_extra_body_merges_into_request_overrides():
|
||||
agent = SimpleNamespace(
|
||||
provider="custom",
|
||||
model="google/gemma-4-31b-it",
|
||||
base_url="https://example.test/v1",
|
||||
request_overrides={"service_tier": "priority"},
|
||||
)
|
||||
|
||||
_merge_custom_provider_extra_body(
|
||||
agent,
|
||||
[
|
||||
{
|
||||
"name": "gemma",
|
||||
"base_url": "https://example.test/v1/",
|
||||
"model": "google/gemma-4-31b-it",
|
||||
"extra_body": {
|
||||
"enable_thinking": True,
|
||||
"reasoning_effort": "high",
|
||||
},
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
assert agent.request_overrides == {
|
||||
"service_tier": "priority",
|
||||
"extra_body": {
|
||||
"enable_thinking": True,
|
||||
"reasoning_effort": "high",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def test_custom_provider_extra_body_preserves_caller_override():
|
||||
agent = SimpleNamespace(
|
||||
provider="custom",
|
||||
model="google/gemma-4-31b-it",
|
||||
base_url="https://example.test/v1",
|
||||
request_overrides={
|
||||
"extra_body": {
|
||||
"reasoning_effort": "low",
|
||||
"caller_only": True,
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
_merge_custom_provider_extra_body(
|
||||
agent,
|
||||
[
|
||||
{
|
||||
"name": "gemma",
|
||||
"base_url": "https://example.test/v1",
|
||||
"model": "google/gemma-4-31b-it",
|
||||
"extra_body": {
|
||||
"enable_thinking": True,
|
||||
"reasoning_effort": "high",
|
||||
},
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
assert agent.request_overrides["extra_body"] == {
|
||||
"enable_thinking": True,
|
||||
"reasoning_effort": "low",
|
||||
"caller_only": True,
|
||||
}
|
||||
|
||||
|
||||
def test_custom_provider_extra_body_ignores_other_custom_models():
|
||||
agent = SimpleNamespace(
|
||||
provider="custom",
|
||||
model="other-model",
|
||||
base_url="https://example.test/v1",
|
||||
request_overrides={},
|
||||
)
|
||||
|
||||
_merge_custom_provider_extra_body(
|
||||
agent,
|
||||
[
|
||||
{
|
||||
"name": "gemma",
|
||||
"base_url": "https://example.test/v1",
|
||||
"model": "google/gemma-4-31b-it",
|
||||
"extra_body": {"enable_thinking": True},
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
assert agent.request_overrides == {}
|
||||
@@ -56,6 +56,7 @@ class TestFailoverReason:
|
||||
"overloaded", "server_error", "timeout",
|
||||
"context_overflow", "payload_too_large", "image_too_large",
|
||||
"model_not_found", "format_error",
|
||||
"multimodal_tool_content_unsupported",
|
||||
"provider_policy_blocked",
|
||||
"thinking_signature", "long_context_tier",
|
||||
"oauth_long_context_beta_forbidden",
|
||||
@@ -1256,3 +1257,66 @@ class TestRateLimitErrorWithoutStatusCode:
|
||||
e.status_code = None
|
||||
result = classify_api_error(e, provider="copilot", model="gpt-4o")
|
||||
assert result.reason != FailoverReason.rate_limit
|
||||
|
||||
|
||||
|
||||
# ── Test: multimodal_tool_content_unsupported pattern ───────────────────
|
||||
|
||||
class TestMultimodalToolContentUnsupported:
|
||||
"""Issue #27344 — providers that reject list-type tool message content
|
||||
should be classified as ``multimodal_tool_content_unsupported`` so the
|
||||
retry loop can downgrade screenshots to text and try again.
|
||||
"""
|
||||
|
||||
def test_xiaomi_mimo_text_is_not_set_pattern(self):
|
||||
"""The actual Xiaomi MiMo 400 wording from the bug report."""
|
||||
e = MockAPIError(
|
||||
"Error code: 400 - {'error': {'code': '400', 'message': 'Param Incorrect', 'param': 'text is not set', 'type': ''}}",
|
||||
status_code=400,
|
||||
)
|
||||
result = classify_api_error(e, provider="xiaomi", model="mimo-v2.5")
|
||||
assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||
assert result.retryable is True
|
||||
|
||||
def test_generic_tool_message_must_be_string(self):
|
||||
e = MockAPIError(
|
||||
"tool message content must be a string",
|
||||
status_code=400,
|
||||
)
|
||||
result = classify_api_error(e, provider="custom", model="some-model")
|
||||
assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||
|
||||
def test_expected_string_got_list(self):
|
||||
e = MockAPIError(
|
||||
"Schema validation failed: expected string, got list",
|
||||
status_code=400,
|
||||
)
|
||||
result = classify_api_error(e, provider="custom", model="some-model")
|
||||
assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||
|
||||
def test_multimodal_tool_content_takes_priority_over_context_overflow(self):
|
||||
"""Some providers return a 400 whose message contains BOTH
|
||||
'text is not set' and a length-shaped phrase; the tool-content
|
||||
recovery is cheaper than compression so it must win the priority.
|
||||
"""
|
||||
e = MockAPIError(
|
||||
"text is not set; context length exceeded",
|
||||
status_code=400,
|
||||
)
|
||||
result = classify_api_error(e, provider="xiaomi", model="mimo-v2.5")
|
||||
assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||
|
||||
def test_no_status_code_path_also_classifies(self):
|
||||
"""When the error reaches us without a status code (transport
|
||||
layer ate it) the message-only classifier branch must also
|
||||
recognise the pattern.
|
||||
"""
|
||||
e = MockTransportError("tool_call.content must be string")
|
||||
result = classify_api_error(e, provider="alibaba", model="qwen3.5-plus")
|
||||
assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||
|
||||
def test_unrelated_400_is_not_misclassified(self):
|
||||
"""Make sure the patterns don't false-positive on normal 400s."""
|
||||
e = MockAPIError("bad request: missing field 'model'", status_code=400)
|
||||
result = classify_api_error(e, provider="openrouter", model="anthropic/claude-sonnet-4")
|
||||
assert result.reason != FailoverReason.multimodal_tool_content_unsupported
|
||||
|
||||
@@ -1060,3 +1060,191 @@ class TestHonchoCadenceTracking:
|
||||
p.on_turn_start(2, "second message")
|
||||
should_skip = p._injection_frequency == "first-turn" and p._turn_count > 1
|
||||
assert should_skip, "Second turn (turn 2) SHOULD be skipped"
|
||||
|
||||
|
||||
class TestMemoryToolToolsetGate:
|
||||
"""Issue #5544: memory provider tools must respect platform_toolsets.
|
||||
|
||||
Before the fix, MemoryManager.get_all_tool_schemas() output was appended
|
||||
to AIAgent.tools unconditionally in agent_init.py — bypassing the
|
||||
enabled_toolsets filter. Result: `platform_toolsets: telegram: []`
|
||||
still leaked fact_store and other memory tools into the tool surface,
|
||||
causing 10x latency on local models (Qwen3-30B: 1.7s → 42s) and
|
||||
tool-call loops on small models.
|
||||
|
||||
These tests mirror the gate logic in agent/agent_init.py around the
|
||||
memory provider tool injection block. The gate condition is:
|
||||
|
||||
enabled_toolsets is None → no filter, inject (backward compat)
|
||||
"memory" in enabled_toolsets → user opted in, inject
|
||||
otherwise (incl. []) → skip injection
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _run_memory_injection(enabled_toolsets, memory_manager):
|
||||
"""Simulate the gated memory-tool injection block from agent_init.py."""
|
||||
tools = []
|
||||
valid_tool_names = set()
|
||||
|
||||
if memory_manager and tools is not None and (
|
||||
enabled_toolsets is None or "memory" in enabled_toolsets
|
||||
):
|
||||
_existing = {
|
||||
t.get("function", {}).get("name")
|
||||
for t in tools
|
||||
if isinstance(t, dict)
|
||||
}
|
||||
for _schema in memory_manager.get_all_tool_schemas():
|
||||
_tname = _schema.get("name", "")
|
||||
if _tname and _tname in _existing:
|
||||
continue
|
||||
tools.append({"type": "function", "function": _schema})
|
||||
if _tname:
|
||||
valid_tool_names.add(_tname)
|
||||
_existing.add(_tname)
|
||||
|
||||
return tools, valid_tool_names
|
||||
|
||||
def _mgr_with_tools(self, *tool_names):
|
||||
"""Build a MemoryManager whose providers expose the named tool schemas."""
|
||||
mgr = MemoryManager()
|
||||
p = FakeMemoryProvider(
|
||||
"ext",
|
||||
tools=[{"name": n, "description": n, "parameters": {}} for n in tool_names],
|
||||
)
|
||||
mgr.add_provider(p)
|
||||
return mgr
|
||||
|
||||
def test_none_toolsets_injects(self):
|
||||
"""enabled_toolsets=None (no filter) injects memory tools — backward compat."""
|
||||
mgr = self._mgr_with_tools("fact_store")
|
||||
tools, names = self._run_memory_injection(None, mgr)
|
||||
assert "fact_store" in names
|
||||
assert any(t["function"]["name"] == "fact_store" for t in tools)
|
||||
|
||||
def test_memory_in_toolsets_injects(self):
|
||||
"""enabled_toolsets including 'memory' injects memory tools."""
|
||||
mgr = self._mgr_with_tools("fact_store")
|
||||
tools, names = self._run_memory_injection(["terminal", "memory", "web"], mgr)
|
||||
assert "fact_store" in names
|
||||
|
||||
def test_empty_toolsets_blocks_injection(self):
|
||||
"""`platform_toolsets: telegram: []` must suppress memory tools. (#5544)"""
|
||||
mgr = self._mgr_with_tools("fact_store")
|
||||
tools, names = self._run_memory_injection([], mgr)
|
||||
assert tools == []
|
||||
assert names == set()
|
||||
|
||||
def test_toolsets_without_memory_blocks_injection(self):
|
||||
"""Toolset list that doesn't name 'memory' must suppress injection."""
|
||||
mgr = self._mgr_with_tools("fact_store")
|
||||
tools, names = self._run_memory_injection(["terminal", "web"], mgr)
|
||||
assert tools == []
|
||||
assert names == set()
|
||||
|
||||
def test_no_memory_manager_no_injection(self):
|
||||
"""Gate is moot without a memory manager."""
|
||||
tools, names = self._run_memory_injection(None, None)
|
||||
assert tools == []
|
||||
|
||||
def test_multiple_schemas_all_blocked_together(self):
|
||||
"""When the gate is closed, no memory tools leak — not even partially."""
|
||||
mgr = self._mgr_with_tools("fact_store", "memory_search", "memory_add")
|
||||
tools, names = self._run_memory_injection(["terminal"], mgr)
|
||||
assert tools == []
|
||||
assert names == set()
|
||||
|
||||
def test_multiple_schemas_all_injected_when_enabled(self):
|
||||
"""When the gate is open, every memory tool schema is injected."""
|
||||
mgr = self._mgr_with_tools("fact_store", "memory_search", "memory_add")
|
||||
tools, names = self._run_memory_injection(None, mgr)
|
||||
assert names == {"fact_store", "memory_search", "memory_add"}
|
||||
|
||||
|
||||
class TestContextEngineToolsetGate:
|
||||
"""Issue #5544 (sibling): context engine tools follow the same gate.
|
||||
|
||||
`agent.context_compressor.get_tool_schemas()` (e.g. lcm_grep, lcm_describe,
|
||||
lcm_expand) was appended to AIAgent.tools unconditionally. Same blind
|
||||
injection class as the memory bug; same local-model penalty. Gate name:
|
||||
"context_engine" (matches the existing plugin-system convention).
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _run_context_engine_injection(enabled_toolsets, compressor):
|
||||
"""Simulate the gated context-engine injection block from agent_init.py."""
|
||||
tools = []
|
||||
valid_tool_names = set()
|
||||
engine_tool_names = set()
|
||||
|
||||
if (
|
||||
compressor is not None
|
||||
and tools is not None
|
||||
and (
|
||||
enabled_toolsets is None
|
||||
or "context_engine" in enabled_toolsets
|
||||
)
|
||||
):
|
||||
_existing = {
|
||||
t.get("function", {}).get("name")
|
||||
for t in tools
|
||||
if isinstance(t, dict)
|
||||
}
|
||||
for _schema in compressor.get_tool_schemas():
|
||||
_tname = _schema.get("name", "")
|
||||
if _tname and _tname in _existing:
|
||||
continue
|
||||
tools.append({"type": "function", "function": _schema})
|
||||
if _tname:
|
||||
valid_tool_names.add(_tname)
|
||||
engine_tool_names.add(_tname)
|
||||
_existing.add(_tname)
|
||||
|
||||
return tools, valid_tool_names, engine_tool_names
|
||||
|
||||
class _FakeCompressor:
|
||||
def __init__(self, schemas):
|
||||
self._schemas = schemas
|
||||
|
||||
def get_tool_schemas(self):
|
||||
return list(self._schemas)
|
||||
|
||||
def _compressor_with(self, *tool_names):
|
||||
return self._FakeCompressor(
|
||||
[{"name": n, "description": n, "parameters": {}} for n in tool_names]
|
||||
)
|
||||
|
||||
def test_none_toolsets_injects(self):
|
||||
"""enabled_toolsets=None injects context-engine tools — backward compat."""
|
||||
c = self._compressor_with("lcm_grep", "lcm_describe", "lcm_expand")
|
||||
tools, names, engine_names = self._run_context_engine_injection(None, c)
|
||||
assert engine_names == {"lcm_grep", "lcm_describe", "lcm_expand"}
|
||||
|
||||
def test_context_engine_in_toolsets_injects(self):
|
||||
"""enabled_toolsets including 'context_engine' injects the tools."""
|
||||
c = self._compressor_with("lcm_grep")
|
||||
tools, names, engine_names = self._run_context_engine_injection(
|
||||
["terminal", "context_engine"], c
|
||||
)
|
||||
assert "lcm_grep" in engine_names
|
||||
|
||||
def test_empty_toolsets_blocks_injection(self):
|
||||
"""`platform_toolsets: telegram: []` must suppress context-engine tools."""
|
||||
c = self._compressor_with("lcm_grep")
|
||||
tools, names, engine_names = self._run_context_engine_injection([], c)
|
||||
assert tools == []
|
||||
assert engine_names == set()
|
||||
|
||||
def test_toolsets_without_context_engine_blocks_injection(self):
|
||||
"""A toolset list that doesn't name 'context_engine' suppresses injection."""
|
||||
c = self._compressor_with("lcm_grep", "lcm_describe")
|
||||
tools, names, engine_names = self._run_context_engine_injection(
|
||||
["terminal", "memory"], c
|
||||
)
|
||||
assert tools == []
|
||||
assert engine_names == set()
|
||||
|
||||
def test_no_compressor_no_injection(self):
|
||||
"""Gate is moot without a context_compressor."""
|
||||
tools, names, engine_names = self._run_context_engine_injection(None, None)
|
||||
assert tools == []
|
||||
|
||||
@@ -444,6 +444,7 @@ class TestBuildNousSubscriptionPrompt:
|
||||
"tts": NousFeatureState("tts", "OpenAI TTS", True, True, True, True, False, True, "OpenAI TTS"),
|
||||
"browser": NousFeatureState("browser", "Browser automation", True, True, True, True, False, True, "Browser Use"),
|
||||
"modal": NousFeatureState("modal", "Modal execution", False, True, False, False, False, True, "local"),
|
||||
"app_tools": NousFeatureState("app_tools", "App tools (500+ apps)", True, True, True, True, False, True, "Nous Subscription"),
|
||||
},
|
||||
),
|
||||
)
|
||||
@@ -468,6 +469,7 @@ class TestBuildNousSubscriptionPrompt:
|
||||
"tts": NousFeatureState("tts", "OpenAI TTS", True, False, False, False, False, True, ""),
|
||||
"browser": NousFeatureState("browser", "Browser automation", True, False, False, False, False, True, ""),
|
||||
"modal": NousFeatureState("modal", "Modal execution", False, False, False, False, False, True, ""),
|
||||
"app_tools": NousFeatureState("app_tools", "App tools (500+ apps)", True, False, False, False, False, True, ""),
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
@@ -556,10 +556,11 @@ Generate some audio.
|
||||
raising=False,
|
||||
)
|
||||
|
||||
with patch.dict(
|
||||
os.environ, {"HERMES_SESSION_PLATFORM": "telegram"}, clear=False
|
||||
):
|
||||
with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
|
||||
with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
|
||||
from gateway.session_context import clear_session_vars, set_session_vars
|
||||
|
||||
tokens = set_session_vars(platform="telegram")
|
||||
try:
|
||||
_make_skill(
|
||||
tmp_path,
|
||||
"test-skill",
|
||||
@@ -571,6 +572,8 @@ Generate some audio.
|
||||
)
|
||||
scan_skill_commands()
|
||||
msg = build_skill_invocation_message("/test-skill", "do stuff")
|
||||
finally:
|
||||
clear_session_vars(tokens)
|
||||
|
||||
assert msg is not None
|
||||
assert "local cli" in msg.lower()
|
||||
|
||||
@@ -1,6 +1,12 @@
|
||||
"""Tests for agent/skill_utils.py — extract_skill_conditions metadata handling."""
|
||||
"""Tests for agent/skill_utils.py."""
|
||||
|
||||
from agent.skill_utils import extract_skill_conditions
|
||||
from unittest.mock import patch
|
||||
|
||||
from agent.skill_utils import (
|
||||
extract_skill_conditions,
|
||||
iter_skill_index_files,
|
||||
skill_matches_platform,
|
||||
)
|
||||
|
||||
|
||||
def test_metadata_as_dict_with_hermes():
|
||||
@@ -56,3 +62,138 @@ def test_metadata_missing_entirely():
|
||||
"fallback_for_tools": [],
|
||||
"requires_tools": [],
|
||||
}
|
||||
|
||||
|
||||
def test_iter_skill_index_files_prunes_dependency_dirs(tmp_path):
|
||||
real = tmp_path / "real-skill"
|
||||
real.mkdir()
|
||||
(real / "SKILL.md").write_text("---\nname: real-skill\n---\n", encoding="utf-8")
|
||||
|
||||
nested = (
|
||||
tmp_path
|
||||
/ "bring"
|
||||
/ "scripts"
|
||||
/ ".venv"
|
||||
/ "lib"
|
||||
/ "python3.13"
|
||||
/ "site-packages"
|
||||
/ "typer"
|
||||
/ ".agents"
|
||||
/ "skills"
|
||||
/ "typer"
|
||||
)
|
||||
nested.mkdir(parents=True)
|
||||
(nested / "SKILL.md").write_text("---\nname: typer\n---\n", encoding="utf-8")
|
||||
|
||||
node_module = (
|
||||
tmp_path
|
||||
/ "web-skill"
|
||||
/ "node_modules"
|
||||
/ "dep"
|
||||
/ ".agents"
|
||||
/ "skills"
|
||||
/ "dep"
|
||||
)
|
||||
node_module.mkdir(parents=True)
|
||||
(node_module / "SKILL.md").write_text("---\nname: dep\n---\n", encoding="utf-8")
|
||||
|
||||
found = list(iter_skill_index_files(tmp_path, "SKILL.md"))
|
||||
|
||||
assert found == [real / "SKILL.md"]
|
||||
|
||||
|
||||
# ── skill_matches_platform on Termux ──────────────────────────────────────
|
||||
|
||||
|
||||
class TestSkillMatchesPlatformTermux:
|
||||
"""Termux is Linux userland on Android. Skills tagged platforms:[linux]
|
||||
must load there regardless of whether Python reports sys.platform as
|
||||
"linux" (pre-3.13) or "android" (3.13+). Reported by user @LikiusInik
|
||||
in May 2026 — only 3 built-in skills appeared on Termux because every
|
||||
github/productivity/mlops skill is tagged platforms:[linux,macos,windows]
|
||||
and sys.platform=="android" did not start with "linux".
|
||||
"""
|
||||
|
||||
def test_no_platforms_field_matches_everywhere(self):
|
||||
# Backward-compat default — skills without a platforms tag load
|
||||
# on any OS, Termux included.
|
||||
with patch("agent.skill_utils.sys.platform", "android"), patch(
|
||||
"agent.skill_utils.is_termux", return_value=True
|
||||
):
|
||||
assert skill_matches_platform({}) is True
|
||||
assert skill_matches_platform({"name": "foo"}) is True
|
||||
|
||||
def test_linux_skill_loads_on_termux_android_platform(self):
|
||||
# Python 3.13+ on Termux reports sys.platform == "android".
|
||||
fm = {"platforms": ["linux"]}
|
||||
with patch("agent.skill_utils.sys.platform", "android"), patch(
|
||||
"agent.skill_utils.is_termux", return_value=True
|
||||
):
|
||||
assert skill_matches_platform(fm) is True
|
||||
|
||||
def test_linux_macos_windows_skill_loads_on_termux(self):
|
||||
# The common "[linux, macos, windows]" tag used by github-*,
|
||||
# productivity, mlops, etc.
|
||||
fm = {"platforms": ["linux", "macos", "windows"]}
|
||||
with patch("agent.skill_utils.sys.platform", "android"), patch(
|
||||
"agent.skill_utils.is_termux", return_value=True
|
||||
):
|
||||
assert skill_matches_platform(fm) is True
|
||||
|
||||
def test_linux_skill_loads_on_termux_linux_platform(self):
|
||||
# Pre-3.13 Termux reports sys.platform == "linux" already — this
|
||||
# works without the Termux escape hatch but must still pass.
|
||||
fm = {"platforms": ["linux"]}
|
||||
with patch("agent.skill_utils.sys.platform", "linux"), patch(
|
||||
"agent.skill_utils.is_termux", return_value=True
|
||||
):
|
||||
assert skill_matches_platform(fm) is True
|
||||
|
||||
def test_macos_only_skill_still_excluded_on_termux(self):
|
||||
# macOS-only skills (apple-notes, imessage, ...) should NOT load
|
||||
# on Termux. The Termux fallback only widens platforms:[linux,...].
|
||||
fm = {"platforms": ["macos"]}
|
||||
with patch("agent.skill_utils.sys.platform", "android"), patch(
|
||||
"agent.skill_utils.is_termux", return_value=True
|
||||
):
|
||||
assert skill_matches_platform(fm) is False
|
||||
|
||||
def test_windows_only_skill_still_excluded_on_termux(self):
|
||||
fm = {"platforms": ["windows"]}
|
||||
with patch("agent.skill_utils.sys.platform", "android"), patch(
|
||||
"agent.skill_utils.is_termux", return_value=True
|
||||
):
|
||||
assert skill_matches_platform(fm) is False
|
||||
|
||||
def test_explicit_termux_or_android_tag_matches(self):
|
||||
# Skills can also opt in explicitly via platforms:[termux] or
|
||||
# platforms:[android] — both should match a Termux session.
|
||||
with patch("agent.skill_utils.sys.platform", "android"), patch(
|
||||
"agent.skill_utils.is_termux", return_value=True
|
||||
):
|
||||
assert skill_matches_platform({"platforms": ["termux"]}) is True
|
||||
assert skill_matches_platform({"platforms": ["android"]}) is True
|
||||
|
||||
def test_non_termux_android_does_not_widen(self):
|
||||
# If we're somehow on a plain Android Python (not Termux), don't
|
||||
# silently load Linux skills — Termux is the supported environment.
|
||||
fm = {"platforms": ["linux"]}
|
||||
with patch("agent.skill_utils.sys.platform", "android"), patch(
|
||||
"agent.skill_utils.is_termux", return_value=False
|
||||
):
|
||||
assert skill_matches_platform(fm) is False
|
||||
|
||||
def test_linux_skill_on_real_linux_unaffected(self):
|
||||
# The non-Termux Linux path must not change.
|
||||
fm = {"platforms": ["linux"]}
|
||||
with patch("agent.skill_utils.sys.platform", "linux"), patch(
|
||||
"agent.skill_utils.is_termux", return_value=False
|
||||
):
|
||||
assert skill_matches_platform(fm) is True
|
||||
|
||||
def test_macos_skill_on_real_macos_unaffected(self):
|
||||
fm = {"platforms": ["macos"]}
|
||||
with patch("agent.skill_utils.sys.platform", "darwin"), patch(
|
||||
"agent.skill_utils.is_termux", return_value=False
|
||||
):
|
||||
assert skill_matches_platform(fm) is True
|
||||
|
||||
+34
-184
@@ -20,12 +20,9 @@ test runner at ``scripts/run_tests.sh``.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
@@ -37,6 +34,22 @@ if str(PROJECT_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
|
||||
# ── Per-file process isolation ──────────────────────────────────────────────
|
||||
# Tests run via ``scripts/run_tests_parallel.py``, which spawns a fresh
|
||||
# ``python -m pytest <file>`` subprocess per test file. Cross-file state
|
||||
# leakage (module-level dicts, ContextVars, caches) is impossible: each
|
||||
# file gets a clean Python interpreter. Intra-file ordering is the test
|
||||
# author's responsibility — if test A in foo.py mutates state that test B
|
||||
# in foo.py reads, that's a real bug to fix in the file (it would also
|
||||
# bite anyone running ``pytest tests/foo.py`` directly).
|
||||
#
|
||||
# This replaces the historic _reset_module_state autouse fixture (manual
|
||||
# state clearing) and the brief experiment with subprocess-per-test
|
||||
# isolation (too slow at ~17k tests).
|
||||
#
|
||||
# See ``scripts/run_tests_parallel.py`` for the runner.
|
||||
|
||||
|
||||
# ── Credential env-var filter ──────────────────────────────────────────────
|
||||
#
|
||||
# Any env var in the current process matching ONE of these patterns is
|
||||
@@ -279,7 +292,7 @@ _HERMES_BEHAVIORAL_VARS = frozenset({
|
||||
"WECOM_HOME_CHANNEL_NAME",
|
||||
# Platform gating — set by load_gateway_config() as a side effect when
|
||||
# a config.yaml is present, so individual test bodies that call the
|
||||
# loader leak these values into later tests on the same xdist worker.
|
||||
# loader leak these values into later tests in the same process.
|
||||
# Force-clear on every test setup so the leak can't happen.
|
||||
"SLACK_REQUIRE_MENTION",
|
||||
"SLACK_STRICT_MENTION",
|
||||
@@ -368,144 +381,21 @@ def _isolate_hermes_home(_hermetic_environment):
|
||||
return None
|
||||
|
||||
|
||||
# ── Module-level state reset ───────────────────────────────────────────────
|
||||
# ── Module-level state reset — replaced by per-file process isolation ──────
|
||||
#
|
||||
# Python modules are singletons per process, and pytest-xdist workers are
|
||||
# long-lived. Module-level dicts/sets (tool registries, approval state,
|
||||
# interrupt flags) and ContextVars persist across tests in the same worker,
|
||||
# causing tests that pass alone to fail when run with siblings.
|
||||
# Each test FILE runs in a freshly-spawned ``python -m pytest <file>``
|
||||
# subprocess via ``scripts/run_tests_parallel.py``, so module-level dicts /
|
||||
# sets / ContextVars from tests in one file cannot leak into tests in
|
||||
# another file. No manual per-module clearing needed.
|
||||
#
|
||||
# Each entry in this fixture clears state that belongs to a specific module.
|
||||
# New state buckets go here too — this is the single gate that prevents
|
||||
# "works alone, flakes in CI" bugs from state leakage.
|
||||
# Within a single file, ordering is the author's responsibility. If your
|
||||
# tests in the same file share mutable state, either reset it explicitly
|
||||
# in a fixture or split them across files.
|
||||
#
|
||||
# The skill `test-suite-cascade-diagnosis` documents the concrete patterns
|
||||
# this closes; the running example was `test_command_guards` failing 12/15
|
||||
# CI runs because ``tools.approval._session_approved`` carried approvals
|
||||
# from one test's session into another's.
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _reset_module_state():
|
||||
"""Clear module-level mutable state and ContextVars between tests.
|
||||
|
||||
Keeps state from leaking across tests on the same xdist worker. Modules
|
||||
that don't exist yet (test collection before production import) are
|
||||
skipped silently — production import later creates fresh empty state.
|
||||
"""
|
||||
# --- logging — quiet/one-shot paths mutate process-global logger state ---
|
||||
logging.disable(logging.NOTSET)
|
||||
for _logger_name in ("tools", "run_agent", "trajectory_compressor", "cron", "hermes_cli"):
|
||||
_logger = logging.getLogger(_logger_name)
|
||||
_logger.disabled = False
|
||||
_logger.setLevel(logging.NOTSET)
|
||||
_logger.propagate = True
|
||||
|
||||
# --- tools.approval — the single biggest source of cross-test pollution ---
|
||||
try:
|
||||
from tools import approval as _approval_mod
|
||||
_approval_mod._session_approved.clear()
|
||||
_approval_mod._session_yolo.clear()
|
||||
_approval_mod._permanent_approved.clear()
|
||||
_approval_mod._pending.clear()
|
||||
_approval_mod._gateway_queues.clear()
|
||||
_approval_mod._gateway_notify_cbs.clear()
|
||||
# ContextVar: reset to empty string so get_current_session_key()
|
||||
# falls through to the env var / default path, matching a fresh
|
||||
# process.
|
||||
_approval_mod._approval_session_key.set("")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# --- tools.interrupt — per-thread interrupt flag set ---
|
||||
try:
|
||||
from tools import interrupt as _interrupt_mod
|
||||
with _interrupt_mod._lock:
|
||||
_interrupt_mod._interrupted_threads.clear()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# --- gateway.session_context — 9 ContextVars that represent
|
||||
# the active gateway session. If set in one test and not reset,
|
||||
# the next test's get_session_env() reads stale values.
|
||||
try:
|
||||
from gateway import session_context as _sc_mod
|
||||
for _cv in (
|
||||
_sc_mod._SESSION_PLATFORM,
|
||||
_sc_mod._SESSION_CHAT_ID,
|
||||
_sc_mod._SESSION_CHAT_NAME,
|
||||
_sc_mod._SESSION_THREAD_ID,
|
||||
_sc_mod._SESSION_USER_ID,
|
||||
_sc_mod._SESSION_USER_NAME,
|
||||
_sc_mod._SESSION_KEY,
|
||||
_sc_mod._CRON_AUTO_DELIVER_PLATFORM,
|
||||
_sc_mod._CRON_AUTO_DELIVER_CHAT_ID,
|
||||
_sc_mod._CRON_AUTO_DELIVER_THREAD_ID,
|
||||
):
|
||||
_cv.set(_sc_mod._UNSET)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# --- tools.env_passthrough — ContextVar<set[str]> with no default ---
|
||||
# LookupError is normal if the test never set it. Setting it to an
|
||||
# empty set unconditionally normalizes the starting state.
|
||||
try:
|
||||
from tools import env_passthrough as _envp_mod
|
||||
_envp_mod._allowed_env_vars_var.set(set())
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# --- tools.terminal_tool — active environment/cwd cache ---
|
||||
# File tools prefer a live terminal cwd when one is cached for the task.
|
||||
# Clear terminal environments between tests so a prior terminal call can't
|
||||
# override TERMINAL_CWD in path-resolution tests.
|
||||
try:
|
||||
from tools import terminal_tool as _term_mod
|
||||
_envs_to_cleanup = []
|
||||
with _term_mod._env_lock:
|
||||
_envs_to_cleanup = list(_term_mod._active_environments.values())
|
||||
_term_mod._active_environments.clear()
|
||||
_term_mod._last_activity.clear()
|
||||
_term_mod._creation_locks.clear()
|
||||
for _env in _envs_to_cleanup:
|
||||
try:
|
||||
_env.cleanup()
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# --- tools.credential_files — ContextVar<dict> ---
|
||||
try:
|
||||
from tools import credential_files as _credf_mod
|
||||
_credf_mod._registered_files_var.set({})
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# --- agent.auxiliary_client — runtime main provider/model override and
|
||||
# payment-error health cache. Both are process-global in production;
|
||||
# reset them per test so one worker's fallback/402 test does not make
|
||||
# later auxiliary-client tests skip otherwise-available providers.
|
||||
try:
|
||||
from agent import auxiliary_client as _aux_mod
|
||||
_aux_mod.clear_runtime_main()
|
||||
_aux_mod._reset_aux_unhealthy_cache()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# --- tools.file_tools — per-task read history + file-ops cache ---
|
||||
# _read_tracker accumulates per-task_id read history for loop detection,
|
||||
# capped by _READ_HISTORY_CAP. If entries from a prior test persist, the
|
||||
# cap is hit faster than expected and capacity-related tests flake.
|
||||
try:
|
||||
from tools import file_tools as _ft_mod
|
||||
with _ft_mod._read_tracker_lock:
|
||||
_ft_mod._read_tracker.clear()
|
||||
with _ft_mod._file_ops_lock:
|
||||
_ft_mod._file_ops_cache.clear()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
yield
|
||||
# The skill ``test-suite-cascade-diagnosis`` documents the cascade patterns
|
||||
# this replaces; the running example was ``test_command_guards`` failing
|
||||
# 12/15 CI runs because ``tools.approval._session_approved`` carried
|
||||
# approvals from one test's session into another's.
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
@@ -532,13 +422,12 @@ def mock_config():
|
||||
}
|
||||
|
||||
|
||||
# ── Global test timeout ─────────────────────────────────────────────────────
|
||||
# Kill any individual test that takes longer than 30 seconds.
|
||||
# Prevents hanging tests (subprocess spawns, blocking I/O) from stalling the
|
||||
# entire test suite.
|
||||
# ── Per-test timeout — handled by the isolation plugin ─────────────────────
|
||||
#
|
||||
# The subprocess-per-test plugin enforces the configured ``isolate_timeout``
|
||||
# ini key by terminating the child if it overruns. The old SIGALRM-based
|
||||
# fixture (POSIX-only, didn't work on Windows) is gone.
|
||||
|
||||
def _timeout_handler(signum, frame):
|
||||
raise TimeoutError("Test exceeded 30 second timeout")
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _ensure_current_event_loop(request):
|
||||
@@ -584,45 +473,6 @@ def _ensure_current_event_loop(request):
|
||||
asyncio.set_event_loop(None)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _enforce_test_timeout():
    """Abort any single test that runs for more than 30 seconds.

    The limit is implemented with SIGALRM, which exists only on Unix, so on
    Windows the test simply runs with no alarm armed.
    """
    if sys.platform != "win32":
        # Arm the alarm for the duration of the test, remembering whatever
        # handler was installed before so it can be put back afterwards.
        previous_handler = signal.signal(signal.SIGALRM, _timeout_handler)
        signal.alarm(30)
        yield
        # Disarm first, then restore the previous handler.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous_handler)
    else:
        yield
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _reset_tool_registry_caches():
    """Drop tool-registry caches before each test runs.

    Two production caches are involved: the registry keeps ``check_fn()``
    results for 30 s (tools/registry.py), and ``get_tool_definitions``
    memoizes its output (model_tools.py). Tests routinely change the state
    those caches are keyed on (env vars, registry._generation, config.yaml
    mtime), yet a 30 s window spans the whole suite and xdist reuses worker
    processes, so an entry produced by one test could be served to another.
    Clearing up front keeps each test hermetic.

    Either module may be unavailable; an ``ImportError`` is ignored.
    """
    # check_fn() result cache in the tool registry.
    try:
        from tools.registry import invalidate_check_fn_cache as _drop_check_fn_results

        _drop_check_fn_results()
    except ImportError:
        pass

    # Memoized tool definitions.
    try:
        from model_tools import _clear_tool_defs_cache as _drop_tool_definitions

        _drop_tool_definitions()
    except ImportError:
        pass
|
||||
|
||||
|
||||
# ── Live-system guard ──────────────────────────────────────────────────────
|
||||
#
|
||||
# Several test files exercise the gateway-restart / kill code paths
|
||||
|
||||
+116
-17
@@ -313,19 +313,30 @@ def _scan_for_plugin_adapter_antipattern(source: str) -> list[str]:
|
||||
return offenses
|
||||
|
||||
|
||||
def pytest_configure(config):
|
||||
"""Reject plugin-adapter tests that use the sys.path anti-pattern.
|
||||
def _fingerprint_gateway_tests() -> str:
|
||||
"""Return a short fingerprint that changes when any gateway test file changes.
|
||||
|
||||
Runs once per pytest session on the controller, BEFORE any xdist
|
||||
worker is spawned. If any file under ``tests/gateway/`` matches the
|
||||
anti-pattern, we fail the whole session with a clear message —
|
||||
before a polluted ``sys.path`` can cascade across workers.
|
||||
Uses (mtime, size) pairs instead of content hashing — fast to compute
|
||||
(stat-only, no reads) and sufficient for cache invalidation across
|
||||
per-file subprocess runs.
|
||||
"""
|
||||
# Only run on the xdist controller (or in non-xdist runs). Skip on
|
||||
# worker subprocesses so we don't scan the filesystem N times.
|
||||
if hasattr(config, "workerinput"):
|
||||
return
|
||||
import hashlib
|
||||
|
||||
h = hashlib.sha256()
|
||||
for path in sorted(_GATEWAY_DIR.rglob("test_*.py")):
|
||||
try:
|
||||
st = path.stat()
|
||||
h.update(f"{path.name}:{st.st_mtime_ns}:{st.st_size}".encode())
|
||||
except OSError:
|
||||
h.update(f"{path.name}:missing".encode())
|
||||
return h.hexdigest()[:16]
|
||||
|
||||
|
||||
def _run_adapter_antipattern_scan() -> list[str]:
|
||||
"""Scan gateway test files for the plugin-adapter anti-pattern.
|
||||
|
||||
Returns a list of violation strings (empty if clean).
|
||||
"""
|
||||
violations: list[str] = []
|
||||
for path in _GATEWAY_DIR.rglob("test_*.py"):
|
||||
if path.name in {"_plugin_adapter_loader.py", "conftest.py"}:
|
||||
@@ -334,20 +345,108 @@ def pytest_configure(config):
|
||||
source = path.read_text(encoding="utf-8")
|
||||
except OSError:
|
||||
continue
|
||||
# Fast string pre-filter: skip files that can't possibly violate.
|
||||
# A violating file MUST contain both (a) an adapter/plugins/platforms
|
||||
# reference AND (b) either sys.path manipulation or a bare adapter import.
|
||||
if "adapter" not in source and "plugins/platforms" not in source:
|
||||
continue
|
||||
if not (
|
||||
"sys.path" in source
|
||||
or "import adapter" in source
|
||||
or "from adapter import" in source
|
||||
):
|
||||
continue
|
||||
offenses = _scan_for_plugin_adapter_antipattern(source)
|
||||
if offenses:
|
||||
violations.append(
|
||||
f" {path.relative_to(_GATEWAY_DIR.parent.parent)}:\n "
|
||||
+ "\n ".join(offenses)
|
||||
)
|
||||
return violations
|
||||
|
||||
if violations:
|
||||
raise pytest.UsageError(
|
||||
"Plugin-adapter-import anti-pattern detected in gateway tests:\n"
|
||||
+ "\n".join(violations)
|
||||
+ "\n\n"
|
||||
+ _GUARD_HINT
|
||||
)
|
||||
|
||||
def pytest_configure(config):
    """Reject plugin-adapter tests that use the sys.path anti-pattern.

    Runs once per pytest session on the controller, BEFORE any xdist
    worker is spawned. If any file under ``tests/gateway/`` matches the
    anti-pattern, we fail the whole session with a clear message —
    before a polluted ``sys.path`` can cascade across workers.

    **Performance**: in the per-file subprocess isolation model (no xdist),
    every subprocess is a "controller" — so the naive scan would run 257
    times, each costing ~1s of AST walking. We avoid this with two
    strategies:

    1. **Tight string pre-filter**: a file can only violate if it contains
       *both* an adapter/plugins/platforms reference *and* a sys.path
       manipulation or bare ``import adapter``. This drops ~95% of files
       from needing AST parsing.
    2. **File-locked cache**: the scan result is cached in
       ``.pytest-cache/gw-adapter-guard-<fingerprint>`` keyed on a
       fingerprint of the gateway test file mtimes/sizes. Concurrent
       subprocesses acquire a lock; only the first performs the scan;
       the rest wait and read the cached result.

    **Cache robustness**: the cache is strictly an optimisation. A cache
    file that is missing, unreadable, or empty counts as a miss and the scan
    runs again. Results are written to a temporary file and renamed into
    place so a concurrent reader never observes a half-written message, and
    a failed cache write does not fail the session.

    Raises:
        pytest.UsageError: when the scan (or a cached scan result) reports
            at least one violation.
    """
    # Only run on the xdist controller (or in non-xdist runs). Skip on
    # worker subprocesses so we don't scan the filesystem N times.
    if hasattr(config, "workerinput"):
        return

    # Local imports: only this hook needs them.
    import contextlib
    import os

    fp = _fingerprint_gateway_tests()
    cache_dir = Path.cwd() / ".pytest-cache"
    cache_file = cache_dir / f"gw-adapter-guard-{fp}"
    lock_file = cache_dir / f".gw-adapter-guard-{fp}.lock"
    # Dot-prefixed so the "gw-adapter-guard-*" eviction glob below never
    # matches (and deletes) another process's in-flight temporary file.
    tmp_prefix = f".gw-adapter-guard-{fp}."
    tmp_file = cache_dir / f"{tmp_prefix}{os.getpid()}.tmp"

    cache_dir.mkdir(parents=True, exist_ok=True)

    # Evict stale cache entries from previous fingerprints (best-effort).
    try:
        for old in cache_dir.glob("gw-adapter-guard-*"):
            if old.name != cache_file.name:
                old.unlink(missing_ok=True)
        for old in cache_dir.glob(".gw-adapter-guard-*.lock"):
            if old.name != lock_file.name:
                old.unlink(missing_ok=True)
        # Temporary files left behind by crashed runs of older fingerprints.
        for old in cache_dir.glob(".gw-adapter-guard-*.tmp"):
            if not old.name.startswith(tmp_prefix):
                old.unlink(missing_ok=True)
    except OSError:
        pass  # Non-critical; old files are harmless.

    def _read_cached() -> str:
        """Return the cached scan result, or "" when there is no usable one."""
        # Read directly instead of exists()+read: another process may evict
        # the file between the two calls.
        try:
            return cache_file.read_text(encoding="utf-8")
        except OSError:
            return ""

    def _write_cached(text: str) -> None:
        """Atomically publish ``text`` as the cached result (best-effort)."""
        try:
            tmp_file.write_text(text, encoding="utf-8")
            os.replace(tmp_file, cache_file)
        except OSError:
            # Losing the cache only costs a re-scan in the next process.
            try:
                tmp_file.unlink(missing_ok=True)
            except OSError:
                pass

    # Use filelock to ensure only one process scans at a time.
    # Concurrent subprocesses all hit pytest_configure simultaneously;
    # without a lock they'd all find no cache and all run the scan.
    try:
        from filelock import FileLock

        lock = FileLock(str(lock_file), timeout=120)
    except ImportError:
        # Fallback: no locking (still correct, just slower under contention).
        lock = contextlib.nullcontext()

    with lock:
        cached = _read_cached()
        if cached == "clean":
            return
        if cached:
            # Any other non-empty content is the stored violation report.
            raise pytest.UsageError(cached)

        # Slow path: this process is the first to acquire the lock (or the
        # cache was unusable).
        violations = _run_adapter_antipattern_scan()

        if violations:
            msg = (
                "Plugin-adapter-import anti-pattern detected in gateway tests:\n"
                + "\n".join(violations)
                + "\n\n"
                + _GUARD_HINT
            )
            _write_cached(msg)
            raise pytest.UsageError(msg)

        _write_cached("clean")
|
||||
|
||||
|
||||
@@ -22,19 +22,26 @@ from gateway.config import PlatformConfig
|
||||
|
||||
|
||||
def _ensure_telegram_mock():
|
||||
if "telegram" in sys.modules and hasattr(sys.modules["telegram"], "__file__"):
|
||||
return
|
||||
|
||||
telegram_mod = MagicMock()
|
||||
telegram_mod.ext.ContextTypes.DEFAULT_TYPE = type(None)
|
||||
telegram_mod.constants.ParseMode.MARKDOWN_V2 = "MarkdownV2"
|
||||
telegram_mod.constants.ChatType.GROUP = "group"
|
||||
telegram_mod.constants.ChatType.SUPERGROUP = "supergroup"
|
||||
telegram_mod.constants.ChatType.CHANNEL = "channel"
|
||||
telegram_mod.constants.ChatType.PRIVATE = "private"
|
||||
|
||||
for name in ("telegram", "telegram.ext", "telegram.constants", "telegram.request"):
|
||||
sys.modules.setdefault(name, telegram_mod)
|
||||
# Register telegram.constants as a separate module mock so that
|
||||
# ``from telegram.constants import ChatType`` resolves to our mock
|
||||
# with string-valued members (not auto-generated MagicMocks).
|
||||
constants_mod = MagicMock()
|
||||
constants_mod.ParseMode.MARKDOWN_V2 = "MarkdownV2"
|
||||
constants_mod.ChatType.GROUP = "group"
|
||||
constants_mod.ChatType.SUPERGROUP = "supergroup"
|
||||
constants_mod.ChatType.CHANNEL = "channel"
|
||||
constants_mod.ChatType.PRIVATE = "private"
|
||||
|
||||
sys.modules["telegram"] = telegram_mod
|
||||
sys.modules["telegram.ext"] = telegram_mod.ext
|
||||
sys.modules["telegram.constants"] = constants_mod
|
||||
sys.modules["telegram.request"] = telegram_mod.request
|
||||
|
||||
# Force reimport so the adapter picks up the mock ChatType.
|
||||
sys.modules.pop("gateway.platforms.telegram", None)
|
||||
|
||||
|
||||
_ensure_telegram_mock()
|
||||
|
||||
@@ -22,6 +22,11 @@ import pytest
|
||||
|
||||
from gateway.config import Platform, PlatformConfig, load_gateway_config
|
||||
|
||||
# Platform uses _missing_() for dynamic members, so "google_chat" is
|
||||
# resolvable via Platform("google_chat") even without a static
|
||||
# GOOGLE_CHAT attribute on the enum class.
|
||||
_GC = Platform("google_chat")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mock the google-* packages if they are not installed
|
||||
@@ -229,7 +234,7 @@ def _make_chat_envelope(text="hello", sender_email="u@example.com", sender_type=
|
||||
|
||||
class TestPlatformRegistration:
|
||||
def test_enum_value(self):
|
||||
assert Platform.GOOGLE_CHAT.value == "google_chat"
|
||||
assert _GC.value == "google_chat"
|
||||
|
||||
def test_requirements_check_returns_true_when_available(self):
|
||||
# The shim flag is True in this test module.
|
||||
@@ -266,14 +271,14 @@ class TestEnvConfigLoading:
|
||||
monkeypatch.setenv("GOOGLE_CHAT_PROJECT_ID", "p")
|
||||
# No subscription.
|
||||
cfg = load_gateway_config()
|
||||
assert Platform.GOOGLE_CHAT not in cfg.platforms
|
||||
assert _GC not in cfg.platforms
|
||||
|
||||
def test_missing_project_does_not_enable(self, monkeypatch):
|
||||
self._clean_env(monkeypatch)
|
||||
monkeypatch.setenv("GOOGLE_CHAT_SUBSCRIPTION_NAME",
|
||||
"projects/p/subscriptions/s")
|
||||
cfg = load_gateway_config()
|
||||
assert Platform.GOOGLE_CHAT not in cfg.platforms
|
||||
assert _GC not in cfg.platforms
|
||||
|
||||
|
||||
|
||||
@@ -2583,7 +2588,7 @@ class TestAuthorizationEmailMatch:
|
||||
runner.pairing_store.is_approved = MagicMock(return_value=False)
|
||||
|
||||
source = SessionSource(
|
||||
platform=Platform.GOOGLE_CHAT,
|
||||
platform=_GC,
|
||||
chat_id="spaces/S",
|
||||
chat_type="dm",
|
||||
user_id="alice@example.com", # post-swap: email is canonical
|
||||
@@ -2604,7 +2609,7 @@ class TestAuthorizationEmailMatch:
|
||||
runner.pairing_store.is_approved = MagicMock(return_value=False)
|
||||
|
||||
source = SessionSource(
|
||||
platform=Platform.GOOGLE_CHAT,
|
||||
platform=_GC,
|
||||
chat_id="spaces/S",
|
||||
chat_type="dm",
|
||||
user_id="bob@example.com",
|
||||
@@ -2630,7 +2635,7 @@ class TestAuthorizationEmailMatch:
|
||||
runner.pairing_store.is_approved = MagicMock(return_value=False)
|
||||
|
||||
source = SessionSource(
|
||||
platform=Platform.GOOGLE_CHAT,
|
||||
platform=_GC,
|
||||
chat_id="spaces/S",
|
||||
chat_type="dm",
|
||||
user_id="users/77777", # no email available — resource name wins
|
||||
|
||||
@@ -75,9 +75,197 @@ class TestCodeGeneration:
|
||||
code = store.generate_code("telegram", "user1", "Alice")
|
||||
pending = store.list_pending("telegram")
|
||||
assert len(pending) == 1
|
||||
assert pending[0]["code"] == code
|
||||
# list_pending no longer returns the original code — it returns a
|
||||
# truncated hash prefix. Verify the metadata is correct instead.
|
||||
assert pending[0]["user_id"] == "user1"
|
||||
assert pending[0]["user_name"] == "Alice"
|
||||
# The code field is now a hash prefix, not the original plaintext code
|
||||
assert pending[0]["code"] != code
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Hashed storage
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestHashedStorage:
    """Pending pairing codes must be persisted as salted hashes, not plaintext."""

    def test_pending_file_contains_hash_and_salt(self, tmp_path):
        """Every stored entry carries 'hash' and 'salt' and never the plaintext code."""
        hex_digits = set("0123456789abcdef")
        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
            pairing = PairingStore()
            plaintext = pairing.generate_code("telegram", "user1", "Alice")
            pending_text = (tmp_path / "telegram-pending.json").read_text(
                encoding="utf-8"
            )
            stored = json.loads(pending_text)

        assert len(stored) == 1
        record = list(stored.values())[0]
        # Both fields have to be present.
        assert "hash" in record
        assert "salt" in record
        # Hash: hex SHA-256 digest, i.e. 64 lowercase hex characters.
        assert len(record["hash"]) == 64
        assert set(record["hash"]) <= hex_digits
        # Salt: 16 bytes rendered as 32 lowercase hex characters.
        assert len(record["salt"]) == 32
        assert set(record["salt"]) <= hex_digits
        # The plaintext code may appear neither as a key nor as any value.
        assert plaintext not in stored
        for entry_id, fields in stored.items():
            assert plaintext != entry_id
            for value in fields.values():
                if isinstance(value, str):
                    assert value != plaintext

    def test_plaintext_code_not_stored(self, tmp_path):
        """The serialized pending file never contains the plaintext code."""
        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
            pairing = PairingStore()
            plaintext = pairing.generate_code("telegram", "user1")
            pending_path = tmp_path / "telegram-pending.json"
            file_contents = pending_path.read_text(encoding="utf-8")
            assert plaintext not in file_contents

    def test_valid_code_verifies_against_hash(self, tmp_path):
        """approve_code succeeds when handed the code that was generated."""
        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
            pairing = PairingStore()
            plaintext = pairing.generate_code("telegram", "user1", "Bob")
            approval = pairing.approve_code("telegram", plaintext)
            assert approval is not None
            assert approval["user_id"] == "user1"
            assert approval["user_name"] == "Bob"

    def test_invalid_code_rejected(self, tmp_path):
        """approve_code returns None for a code that was never issued."""
        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
            pairing = PairingStore()
            pairing.generate_code("telegram", "user1")
            approval = pairing.approve_code("telegram", "ZZZZZZZZ")
            assert approval is None

    def test_different_salts_per_entry(self, tmp_path):
        """Three pending entries end up with three distinct salts."""
        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
            pairing = PairingStore()
            for user_id in ("user0", "user1", "user2"):
                pairing.generate_code("telegram", user_id)
            pending_path = tmp_path / "telegram-pending.json"
            stored = json.loads(pending_path.read_text(encoding="utf-8"))
            distinct_salts = {record["salt"] for record in stored.values()}
            assert len(distinct_salts) == 3  # all unique

    def test_hash_code_static_method(self, tmp_path):
        """_hash_code is deterministic per (code, salt) and varies with the salt."""
        first_salt = os.urandom(16)
        digest_a = PairingStore._hash_code("ABCD1234", first_salt)
        digest_b = PairingStore._hash_code("ABCD1234", first_salt)
        assert digest_a == digest_b
        # A fresh salt must change the digest for the same code.
        second_salt = os.urandom(16)
        digest_c = PairingStore._hash_code("ABCD1234", second_salt)
        assert digest_c != digest_a
|
||||
|
||||
|
||||
class TestLegacyPendingFileCompat:
    """Upgrade-path coverage for pending.json files written before hashing.

    An existing install can still hold a pending.json produced by the old
    code, where the plaintext code is the key and there are no hash/salt
    fields. approve_code / list_pending / _cleanup_expired have to cope with
    such entries without crashing: they are ignored and expire at TTL.
    """

    @staticmethod
    def _write_legacy(tmp_path, code="ABCD1234", created_at=None):
        """Create a pre-hash pending.json keyed by the plaintext ``code``.

        ``created_at`` defaults to the current time when not supplied.
        """
        import time as _time

        stamp = _time.time() if created_at is None else created_at
        payload = {
            code: {
                "user_id": "legacy-user",
                "user_name": "Legacy",
                "created_at": stamp,
            }
        }
        target = tmp_path / "telegram-pending.json"
        target.write_text(json.dumps(payload), encoding="utf-8")

    def test_approve_code_ignores_legacy_entries(self, tmp_path):
        """An old-format code that used to be valid must NOT approve silently."""
        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
            self._write_legacy(tmp_path, code="LEGACY01")
            pairing = PairingStore()
            # In the old schema the plaintext code was the lookup key. The
            # new schema never consults it and the entry has no hash/salt to
            # verify against, so approval yields None and the entry is left
            # for _cleanup_expired to prune at TTL.
            outcome = pairing.approve_code("telegram", "LEGACY01")
            assert outcome is None
            # Nothing may have been added to the approved list.
            assert pairing.is_approved("telegram", "legacy-user") is False

    def test_list_pending_handles_legacy_entries(self, tmp_path):
        """list_pending copes with an entry that lacks the 'hash' field."""
        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
            self._write_legacy(tmp_path)
            pairing = PairingStore()
            listed = pairing.list_pending("telegram")
            assert len(listed) == 1
            only = listed[0]
            assert only["user_id"] == "legacy-user"
            assert only["code"] == "legacy"  # placeholder

    def test_cleanup_expired_removes_legacy_at_ttl(self, tmp_path):
        """A legacy entry older than CODE_TTL is still pruned."""
        import time as _time

        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
            expired_at = _time.time() - CODE_TTL_SECONDS - 1
            self._write_legacy(tmp_path, code="LEGACY99", created_at=expired_at)
            pairing = PairingStore()
            pairing._cleanup_expired("telegram")
            pending_path = tmp_path / "telegram-pending.json"
            remaining = json.loads(pending_path.read_text(encoding="utf-8"))
            assert remaining == {}

    def test_cleanup_expired_handles_malformed_entries(self, tmp_path):
        """Non-dict entries and entries without a usable created_at are evicted."""
        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
            pending_path = tmp_path / "telegram-pending.json"
            malformed = {
                "broken1": "not a dict",
                "broken2": {"user_id": "x"},  # no created_at
                "broken3": {"created_at": "not a number"},
            }
            pending_path.write_text(json.dumps(malformed), encoding="utf-8")
            pairing = PairingStore()
            pairing._cleanup_expired("telegram")
            remaining = json.loads(pending_path.read_text(encoding="utf-8"))
            assert remaining == {}

    def test_approve_code_skips_malformed_entries(self, tmp_path):
        """An entry with an undecodable salt does not break approve_code."""
        import time as _time

        with patch("gateway.pairing.PAIRING_DIR", tmp_path):
            malformed = {
                "broken": {
                    "user_id": "x",
                    "created_at": _time.time(),
                    "salt": "not-hex",
                    "hash": "doesntmatter",
                },
            }
            pending_path = tmp_path / "telegram-pending.json"
            pending_path.write_text(json.dumps(malformed), encoding="utf-8")
            pairing = PairingStore()
            # Whatever code is presented, the answer is None rather than a crash.
            outcome = pairing.approve_code("telegram", "ABCD1234")
            assert outcome is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -300,9 +488,10 @@ class TestCodeExpiry:
|
||||
store = PairingStore()
|
||||
code = store.generate_code("telegram", "user1")
|
||||
|
||||
# Manually expire the code
|
||||
# Manually expire all pending entries
|
||||
pending = store._load_json(store._pending_path("telegram"))
|
||||
pending[code]["created_at"] = time.time() - CODE_TTL_SECONDS - 1
|
||||
for entry_id in pending:
|
||||
pending[entry_id]["created_at"] = time.time() - CODE_TTL_SECONDS - 1
|
||||
store._save_json(store._pending_path("telegram"), pending)
|
||||
|
||||
# Cleanup happens on next operation
|
||||
@@ -314,9 +503,10 @@ class TestCodeExpiry:
|
||||
store = PairingStore()
|
||||
code = store.generate_code("telegram", "user1")
|
||||
|
||||
# Expire it
|
||||
# Expire all entries
|
||||
pending = store._load_json(store._pending_path("telegram"))
|
||||
pending[code]["created_at"] = time.time() - CODE_TTL_SECONDS - 1
|
||||
for entry_id in pending:
|
||||
pending[entry_id]["created_at"] = time.time() - CODE_TTL_SECONDS - 1
|
||||
store._save_json(store._pending_path("telegram"), pending)
|
||||
|
||||
result = store.approve_code("telegram", code)
|
||||
|
||||
@@ -6,7 +6,11 @@ import pytest
|
||||
from pathlib import Path
|
||||
|
||||
from gateway.config import PlatformConfig
|
||||
from gateway.platforms.webhook import WebhookAdapter, _DYNAMIC_ROUTES_FILENAME
|
||||
from gateway.platforms.webhook import (
|
||||
WebhookAdapter,
|
||||
_DYNAMIC_ROUTES_FILENAME,
|
||||
_INSECURE_NO_AUTH,
|
||||
)
|
||||
|
||||
|
||||
def _make_adapter(routes=None, extra=None):
|
||||
@@ -85,3 +89,78 @@ class TestDynamicRouteLoading:
|
||||
adapter._reload_dynamic_routes()
|
||||
assert "static" in adapter._routes
|
||||
assert len(adapter._dynamic_routes) == 0
|
||||
|
||||
|
||||
class TestDynamicRouteSecretValidation:
    """Hot-reload must refuse dynamic routes whose secret is empty or absent.

    Regression coverage for an HMAC bypass. Before the fix, a dynamic route
    carrying `"secret": ""` was merged into self._routes by
    _reload_dynamic_routes(); _handle_webhook's
    `if secret and secret != _INSECURE_NO_AUTH` check then skipped signature
    validation because an empty string is falsy, so unauthenticated POSTs
    could run the webhook prompt.
    """

    @staticmethod
    def _write_routes(tmp_path, routes):
        """Serialize ``routes`` into the dynamic-routes file under ``tmp_path``."""
        (tmp_path / _DYNAMIC_ROUTES_FILENAME).write_text(json.dumps(routes))

    def test_empty_secret_rejected(self, tmp_path):
        # An explicit empty-string secret must not fall back to the global
        # secret; the route is dropped altogether.
        self._write_routes(tmp_path, {"evil": {"secret": "", "prompt": "rm -rf"}})
        webhook = _make_adapter()  # has global secret
        webhook._reload_dynamic_routes()
        assert "evil" not in webhook._routes
        assert "evil" not in webhook._dynamic_routes

    def test_missing_secret_no_global_rejected(self, tmp_path):
        self._write_routes(tmp_path, {"orphan": {"prompt": "test"}})
        # Adapter built without a global secret.
        webhook = _make_adapter(extra={"secret": ""})
        webhook._reload_dynamic_routes()
        assert "orphan" not in webhook._routes
        assert "orphan" not in webhook._dynamic_routes

    def test_missing_secret_inherits_global(self, tmp_path):
        # With no per-route secret but a global one configured, the route
        # stays and is covered by the global secret (existing fallback).
        self._write_routes(tmp_path, {"valid": {"prompt": "ok"}})
        webhook = _make_adapter()  # global secret set
        webhook._reload_dynamic_routes()
        assert "valid" in webhook._routes

    def test_insecure_no_auth_preserved(self, tmp_path):
        # The explicit opt-in escape hatch for local testing still loads.
        self._write_routes(
            tmp_path, {"test": {"secret": _INSECURE_NO_AUTH, "prompt": "p"}}
        )
        webhook = _make_adapter()
        webhook._reload_dynamic_routes()
        assert "test" in webhook._routes

    def test_warning_logged_on_skip(self, tmp_path, caplog):
        import logging

        self._write_routes(tmp_path, {"silent": {"secret": "", "prompt": "x"}})
        webhook = _make_adapter()
        with caplog.at_level(logging.WARNING, logger="gateway.platforms.webhook"):
            webhook._reload_dynamic_routes()
        logged_messages = [record.message for record in caplog.records]
        assert any("silent" in text for text in logged_messages)

    def test_partial_skip(self, tmp_path):
        # Of one bad and one good route, only the bad one is discarded.
        self._write_routes(
            tmp_path,
            {
                "bad": {"secret": "", "prompt": "x"},
                "good": {"secret": "valid-secret", "prompt": "y"},
            },
        )
        webhook = _make_adapter()
        webhook._reload_dynamic_routes()
        assert "good" in webhook._routes
        assert "bad" not in webhook._routes
|
||||
|
||||
@@ -0,0 +1,131 @@
|
||||
"""Tests for curses color compatibility on low-color terminals (Docker).
|
||||
|
||||
Regression test for #13688: ``hermes plugins`` crashes with
|
||||
``curses.error: init_pair() : color number is greater than COLORS-1``
|
||||
in Docker containers where curses.COLORS == 8 (only colors 0-7 exist).
|
||||
|
||||
The bug was ``curses.init_pair(4, 8, -1)`` using raw color 8 ("bright
|
||||
black" / dim gray) which does not exist on 8-color terminals. The fix
|
||||
clamps with ``min(8, curses.COLORS - 1)``.
|
||||
"""
|
||||
|
||||
import curses
|
||||
import re
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock, call
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Path to the source files under test
|
||||
_SRC_ROOT = Path(__file__).parent.parent.parent / "hermes_cli"
|
||||
|
||||
|
||||
class TestInitPairClampingBehavior:
    """Simulate curses color initialization on low-color terminals.

    Patches curses.COLORS to 8 (Docker default) and verifies that
    init_pair is never called with a color >= COLORS.
    """

    @staticmethod
    def _simulated_color_init(stdscr):
        """Replay the color-init pattern from plugins_cmd.py.

        Pair 4 uses raw color 8 only when the terminal reports more than 8
        colors and falls back to ``curses.COLOR_WHITE`` otherwise. ``stdscr``
        is accepted for signature compatibility and is not used.
        """
        if curses.has_colors():
            curses.start_color()
            curses.use_default_colors()
            curses.init_pair(1, curses.COLOR_GREEN, -1)
            curses.init_pair(2, curses.COLOR_YELLOW, -1)
            curses.init_pair(3, curses.COLOR_CYAN, -1)
            curses.init_pair(4, 8 if curses.COLORS > 8 else curses.COLOR_WHITE, -1)

    def _collect_init_pair_calls(self, draw_fn, colors_value):
        """Run a curses draw function with a mock stdscr and patched COLORS.

        Args:
            draw_fn: callable taking the (mock) ``stdscr`` window.
            colors_value: value substituted for ``curses.COLORS``.

        Returns:
            List of (pair_number, fg, bg) tuples, one per ``init_pair`` call,
            in call order. Exceptions raised by ``draw_fn`` are swallowed, so
            the list may be shorter than expected (or empty) if it failed.
        """
        calls = []

        def tracking_init_pair(pair, fg, bg):
            calls.append((pair, fg, bg))

        mock_stdscr = MagicMock()
        mock_stdscr.getmaxyx.return_value = (24, 80)
        mock_stdscr.getch.return_value = 27  # ESC to exit

        # create=True: curses.COLORS does not exist until start_color() has
        # run against a real terminal.
        with patch("curses.COLORS", colors_value, create=True), \
             patch("curses.init_pair", side_effect=tracking_init_pair), \
             patch("curses.has_colors", return_value=True), \
             patch("curses.start_color"), \
             patch("curses.use_default_colors"), \
             patch("curses.curs_set"):
            try:
                draw_fn(mock_stdscr)
            except (SystemExit, Exception):
                # Deliberately best-effort: draw functions loop until a
                # keypress and may bail out in arbitrary ways. Callers must
                # check that the returned list is non-empty.
                pass

        return calls

    def test_8_color_terminal_no_color_exceeds_limit(self):
        """On an 8-color terminal (Docker), no init_pair fg color >= 8."""
        calls = self._collect_init_pair_calls(self._simulated_color_init, 8)
        # Guard against a vacuous pass: with exceptions swallowed above, an
        # empty list would otherwise satisfy the loop below trivially.
        assert calls, "color init made no init_pair calls"
        for pair, fg, bg in calls:
            assert fg < 8, (
                f"init_pair({pair}, {fg}, {bg}) uses color {fg} which "
                f"does not exist on an 8-color terminal (valid: 0-7)"
            )

    def test_256_color_terminal_uses_color_8(self):
        """On a 256-color terminal, color 8 (dim gray) should be used."""
        calls = self._collect_init_pair_calls(self._simulated_color_init, 256)
        assert any(fg == 8 for _, fg, _ in calls), (
            "On 256-color terminals, color 8 (dim gray) should be used"
        )

    def test_16_color_terminal_uses_color_8(self):
        """On a 16-color terminal, color 8 should be available."""
        calls = self._collect_init_pair_calls(self._simulated_color_init, 16)
        assert any(fg == 8 for _, fg, _ in calls)
|
||||
|
||||
|
||||
class TestSourceCodeGuardrails:
|
||||
"""Regression guardrails: raw color 8 must not reappear in source.
|
||||
|
||||
These complement the behavioral tests above — they catch regressions
|
||||
introduced by copy-paste of the old pattern.
|
||||
"""
|
||||
|
||||
_RAW_COLOR_8_PATTERN = re.compile(r'init_pair\(\d+,\s*8\s*,')
|
||||
|
||||
def test_no_raw_color_8_in_plugins_cmd(self):
|
||||
source = (_SRC_ROOT / "plugins_cmd.py").read_text()
|
||||
matches = self._RAW_COLOR_8_PATTERN.findall(source)
|
||||
assert not matches, (
|
||||
f"plugins_cmd.py contains unclamped color 8: {matches}"
|
||||
)
|
||||
|
||||
def test_no_raw_color_8_in_main(self):
|
||||
source = (_SRC_ROOT / "main.py").read_text()
|
||||
matches = self._RAW_COLOR_8_PATTERN.findall(source)
|
||||
assert not matches, (
|
||||
f"main.py contains unclamped color 8: {matches}"
|
||||
)
|
||||
|
||||
def test_no_raw_color_8_in_curses_ui(self):
|
||||
source = (_SRC_ROOT / "curses_ui.py").read_text()
|
||||
matches = self._RAW_COLOR_8_PATTERN.findall(source)
|
||||
assert not matches, (
|
||||
f"curses_ui.py contains unclamped color 8: {matches}"
|
||||
)
|
||||
@@ -69,18 +69,19 @@ class TestPluginPickerInjection:
|
||||
assert "Myimg" in names
|
||||
assert "myimg" in plugin_names
|
||||
|
||||
def test_fal_skipped_to_avoid_duplicate(self, monkeypatch):
|
||||
def test_fal_surfaced_alongside_other_plugins(self, monkeypatch):
|
||||
from hermes_cli import tools_config
|
||||
|
||||
# Simulate a FAL plugin being registered — the picker already has
|
||||
# hardcoded FAL rows in TOOL_CATEGORIES, so plugin-FAL must be
|
||||
# skipped to avoid showing FAL twice.
|
||||
# After #26241, FAL is itself a plugin (`plugins/image_gen/fal/`)
|
||||
# and the hardcoded `TOOL_CATEGORIES["image_gen"]` FAL row is
|
||||
# gone. The plugin-row builder therefore surfaces it like any
|
||||
# other backend — no deduplication step needed.
|
||||
image_gen_registry.register_provider(_FakeProvider("fal"))
|
||||
image_gen_registry.register_provider(_FakeProvider("openai"))
|
||||
|
||||
rows = tools_config._plugin_image_gen_providers()
|
||||
names = [r.get("image_gen_plugin_name") for r in rows]
|
||||
assert "fal" not in names
|
||||
assert "fal" in names
|
||||
assert "openai" in names
|
||||
|
||||
def test_visible_providers_includes_plugins_for_image_gen(self, monkeypatch):
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""Tests for ``install_cua_driver`` upgrade semantics.
|
||||
"""Tests for ``install_cua_driver`` upgrade semantics and architecture pre-check.
|
||||
|
||||
The cua-driver upstream installer always pulls the latest release tag, so
|
||||
re-running it is the canonical upgrade path. ``install_cua_driver(upgrade=True)``
|
||||
@@ -10,18 +10,18 @@ must:
|
||||
fix for the "we only pulled cua-driver once on enable" complaint).
|
||||
* Preserve original ``upgrade=False`` behaviour for the toolset-enable flow:
|
||||
skip if installed, install otherwise, warn on non-macOS.
|
||||
* Pre-check architecture compatibility before downloading to avoid raw 404
|
||||
errors on Intel macOS when the upstream release lacks x86_64 assets.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import patch
|
||||
import json
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
|
||||
class TestInstallCuaDriverUpgrade:
|
||||
def test_upgrade_on_non_macos_is_silent_noop(self):
|
||||
"""``hermes update`` calls install_cua_driver(upgrade=True) for every
|
||||
user. On Linux/Windows it must return False without printing the
|
||||
"macOS-only; skipping" warning that the toolset-enable path emits."""
|
||||
from hermes_cli import tools_config
|
||||
|
||||
with patch.object(tools_config, "_print_warning") as warn, \
|
||||
@@ -30,8 +30,6 @@ class TestInstallCuaDriverUpgrade:
|
||||
warn.assert_not_called()
|
||||
|
||||
def test_non_upgrade_on_non_macos_warns(self):
|
||||
"""The toolset-enable path (upgrade=False) should still warn loudly
|
||||
when the user tries to enable Computer Use on a non-macOS host."""
|
||||
from hermes_cli import tools_config
|
||||
|
||||
with patch.object(tools_config, "_print_warning") as warn, \
|
||||
@@ -40,43 +38,36 @@ class TestInstallCuaDriverUpgrade:
|
||||
warn.assert_called()
|
||||
|
||||
def test_upgrade_on_macos_with_binary_runs_installer(self):
|
||||
"""When cua-driver is already on PATH and upgrade=True, we must
|
||||
re-run the upstream installer (this is the fix for the bug report).
|
||||
"""
|
||||
from hermes_cli import tools_config
|
||||
|
||||
with patch("platform.system", return_value="Darwin"), \
|
||||
patch.object(tools_config.shutil, "which",
|
||||
side_effect=lambda n: "/usr/local/bin/" + n
|
||||
if n in {"cua-driver", "curl"} else None), \
|
||||
patch.object(tools_config, "_check_cua_driver_asset_for_arch",
|
||||
return_value=True), \
|
||||
patch.object(tools_config, "_run_cua_driver_installer",
|
||||
return_value=True) as runner, \
|
||||
patch("subprocess.run"):
|
||||
assert tools_config.install_cua_driver(upgrade=True) is True
|
||||
runner.assert_called_once()
|
||||
# Refresh path uses non-verbose mode so we don't re-print the
|
||||
# "grant macOS permissions" block on every `hermes update`.
|
||||
kwargs = runner.call_args.kwargs
|
||||
assert kwargs.get("verbose") is False
|
||||
|
||||
def test_upgrade_on_macos_without_binary_runs_installer(self):
|
||||
"""upgrade=True with cua-driver missing must still trigger an
|
||||
install — equivalent to a fresh install. (Don't silently no-op.)"""
|
||||
from hermes_cli import tools_config
|
||||
|
||||
with patch("platform.system", return_value="Darwin"), \
|
||||
patch.object(tools_config.shutil, "which",
|
||||
side_effect=lambda n: "/usr/bin/curl" if n == "curl" else None), \
|
||||
patch.object(tools_config, "_check_cua_driver_asset_for_arch",
|
||||
return_value=True), \
|
||||
patch.object(tools_config, "_run_cua_driver_installer",
|
||||
return_value=True) as runner:
|
||||
assert tools_config.install_cua_driver(upgrade=True) is True
|
||||
runner.assert_called_once()
|
||||
|
||||
def test_non_upgrade_on_macos_with_binary_skips_install(self):
|
||||
"""Original toolset-enable behaviour: cua-driver already installed
|
||||
+ upgrade=False → confirm and return without re-running installer.
|
||||
This is the behaviour that ``hermes tools`` (re)enable depends on,
|
||||
so the new helper must not regress it."""
|
||||
from hermes_cli import tools_config
|
||||
|
||||
with patch("platform.system", return_value="Darwin"), \
|
||||
@@ -89,27 +80,133 @@ class TestInstallCuaDriverUpgrade:
|
||||
runner.assert_not_called()
|
||||
|
||||
def test_non_upgrade_on_macos_without_binary_runs_installer(self):
|
||||
"""Original fresh-install path must still work."""
|
||||
from hermes_cli import tools_config
|
||||
|
||||
with patch("platform.system", return_value="Darwin"), \
|
||||
patch.object(tools_config.shutil, "which",
|
||||
side_effect=lambda n: "/usr/bin/curl" if n == "curl" else None), \
|
||||
patch.object(tools_config, "_check_cua_driver_asset_for_arch",
|
||||
return_value=True), \
|
||||
patch.object(tools_config, "_run_cua_driver_installer",
|
||||
return_value=True) as runner:
|
||||
assert tools_config.install_cua_driver(upgrade=False) is True
|
||||
runner.assert_called_once()
|
||||
|
||||
def test_upgrade_without_curl_does_not_crash(self):
|
||||
"""If curl isn't on PATH we can't refresh — must warn and return
|
||||
the current install state, not raise."""
|
||||
|
||||
class TestCheckCuaDriverAssetForArch:
|
||||
def test_arm64_always_returns_true(self):
|
||||
from hermes_cli import tools_config
|
||||
|
||||
# cua-driver present, curl missing.
|
||||
def _which(name):
|
||||
return "/usr/local/bin/cua-driver" if name == "cua-driver" else None
|
||||
with patch("platform.machine", return_value="arm64"):
|
||||
assert tools_config._check_cua_driver_asset_for_arch() is True
|
||||
|
||||
def test_x86_64_with_asset_returns_true(self):
|
||||
from hermes_cli import tools_config
|
||||
|
||||
release = {
|
||||
"tag_name": "cua-driver-v0.1.6",
|
||||
"assets": [
|
||||
{"name": "cua-driver-0.1.6-darwin-arm64.tar.gz"},
|
||||
{"name": "cua-driver-0.1.6-darwin-x86_64.tar.gz"},
|
||||
],
|
||||
}
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.read.return_value = json.dumps(release).encode()
|
||||
mock_resp.__enter__ = lambda s: s
|
||||
mock_resp.__exit__ = MagicMock(return_value=False)
|
||||
|
||||
with patch("platform.machine", return_value="x86_64"), \
|
||||
patch("urllib.request.urlopen", return_value=mock_resp):
|
||||
assert tools_config._check_cua_driver_asset_for_arch() is True
|
||||
|
||||
def test_x86_64_without_asset_returns_false(self):
|
||||
from hermes_cli import tools_config
|
||||
|
||||
release = {
|
||||
"tag_name": "cua-driver-v0.1.6",
|
||||
"assets": [
|
||||
{"name": "cua-driver-0.1.6-darwin-arm64.tar.gz"},
|
||||
{"name": "cua-driver.tar.gz"},
|
||||
],
|
||||
}
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.read.return_value = json.dumps(release).encode()
|
||||
mock_resp.__enter__ = lambda s: s
|
||||
mock_resp.__exit__ = MagicMock(return_value=False)
|
||||
|
||||
with patch("platform.machine", return_value="x86_64"), \
|
||||
patch("urllib.request.urlopen", return_value=mock_resp), \
|
||||
patch.object(tools_config, "_print_warning") as warn, \
|
||||
patch.object(tools_config, "_print_info"):
|
||||
assert tools_config._check_cua_driver_asset_for_arch() is False
|
||||
warn.assert_called_once()
|
||||
assert "no Intel" in warn.call_args[0][0].lower() or "x86_64" in warn.call_args[0][0]
|
||||
|
||||
def test_x86_64_api_failure_returns_true(self):
|
||||
"""Network failure should fail open — let the installer handle it."""
|
||||
from hermes_cli import tools_config
|
||||
|
||||
with patch("platform.machine", return_value="x86_64"), \
|
||||
patch("urllib.request.urlopen", side_effect=Exception("timeout")):
|
||||
assert tools_config._check_cua_driver_asset_for_arch() is True
|
||||
|
||||
def test_fresh_install_x86_64_no_asset_skips_installer(self):
|
||||
"""When the latest release has no Intel asset, skip the installer."""
|
||||
from hermes_cli import tools_config
|
||||
|
||||
release = {
|
||||
"tag_name": "cua-driver-v0.1.6",
|
||||
"assets": [{"name": "cua-driver-0.1.6-darwin-arm64.tar.gz"}],
|
||||
}
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.read.return_value = json.dumps(release).encode()
|
||||
mock_resp.__enter__ = lambda s: s
|
||||
mock_resp.__exit__ = MagicMock(return_value=False)
|
||||
|
||||
with patch("platform.system", return_value="Darwin"), \
|
||||
patch.object(tools_config.shutil, "which", side_effect=_which), \
|
||||
patch.object(tools_config, "_print_warning"):
|
||||
patch.object(tools_config.shutil, "which",
|
||||
side_effect=lambda n: "/usr/bin/curl" if n == "curl" else None), \
|
||||
patch("platform.machine", return_value="x86_64"), \
|
||||
patch("urllib.request.urlopen", return_value=mock_resp), \
|
||||
patch.object(tools_config, "_print_warning"), \
|
||||
patch.object(tools_config, "_print_info"), \
|
||||
patch.object(tools_config, "_run_cua_driver_installer") as runner:
|
||||
assert tools_config.install_cua_driver(upgrade=False) is False
|
||||
runner.assert_not_called()
|
||||
|
||||
def test_upgrade_x86_64_no_asset_returns_existing_status(self):
|
||||
"""On upgrade with no Intel asset, return whether binary existed."""
|
||||
from hermes_cli import tools_config
|
||||
|
||||
release = {
|
||||
"tag_name": "cua-driver-v0.1.6",
|
||||
"assets": [{"name": "cua-driver-0.1.6-darwin-arm64.tar.gz"}],
|
||||
}
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.read.return_value = json.dumps(release).encode()
|
||||
mock_resp.__enter__ = lambda s: s
|
||||
mock_resp.__exit__ = MagicMock(return_value=False)
|
||||
|
||||
# With binary installed — returns True (binary exists)
|
||||
with patch("platform.system", return_value="Darwin"), \
|
||||
patch.object(tools_config.shutil, "which",
|
||||
side_effect=lambda n: "/usr/local/bin/" + n
|
||||
if n in ("cua-driver", "curl") else None), \
|
||||
patch("platform.machine", return_value="x86_64"), \
|
||||
patch("urllib.request.urlopen", return_value=mock_resp), \
|
||||
patch.object(tools_config, "_print_warning"), \
|
||||
patch.object(tools_config, "_print_info"), \
|
||||
patch.object(tools_config, "_run_cua_driver_installer") as runner:
|
||||
assert tools_config.install_cua_driver(upgrade=True) is True
|
||||
runner.assert_not_called()
|
||||
|
||||
# Without binary — returns False
|
||||
with patch("platform.system", return_value="Darwin"), \
|
||||
patch.object(tools_config.shutil, "which",
|
||||
side_effect=lambda n: "/usr/bin/curl" if n == "curl" else None), \
|
||||
patch("platform.machine", return_value="x86_64"), \
|
||||
patch("urllib.request.urlopen", return_value=mock_resp), \
|
||||
patch.object(tools_config, "_print_warning"), \
|
||||
patch.object(tools_config, "_print_info"), \
|
||||
patch.object(tools_config, "_run_cua_driver_installer") as runner:
|
||||
assert tools_config.install_cua_driver(upgrade=True) is False
|
||||
runner.assert_not_called()
|
||||
|
||||
@@ -7,6 +7,7 @@ printf) to verify it behaves like a PTY you can read/write/resize/close.
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import time
|
||||
|
||||
@@ -66,7 +67,7 @@ class TestPtyBridgeIO:
|
||||
def test_write_sends_to_child_stdin(self):
|
||||
# `cat` with no args echoes stdin back to stdout. We write a line,
|
||||
# read it back, then signal EOF to let cat exit cleanly.
|
||||
bridge = PtyBridge.spawn(["/bin/cat"])
|
||||
bridge = PtyBridge.spawn([shutil.which("cat") or "cat"])
|
||||
try:
|
||||
bridge.write(b"hello-pty\n")
|
||||
output = _read_until(bridge, b"hello-pty")
|
||||
|
||||
@@ -1631,6 +1631,33 @@ def test_named_custom_runtime_propagates_model_direct_path(monkeypatch):
|
||||
assert resolved["provider"] == "custom"
|
||||
|
||||
|
||||
def test_named_custom_runtime_propagates_extra_body_direct_path(monkeypatch):
    """Custom provider extra_body should become runtime request_overrides."""
    def _provider_entry(p):
        # Built fresh on every lookup.
        return {
            "name": "my-gemma",
            "base_url": "http://localhost:8000/v1",
            "api_key": "test-key",
            "model": "google/gemma-4-31b-it",
            "extra_body": {
                "enable_thinking": True,
                "reasoning_effort": "high",
            },
        }

    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "my-gemma")
    monkeypatch.setattr(rp, "_get_named_custom_provider", _provider_entry)
    # No credential pool hit: resolution takes the direct path.
    monkeypatch.setattr(rp, "_try_resolve_from_custom_pool", lambda *a, **k: None)

    resolved = rp.resolve_runtime_provider(requested="my-gemma")

    expected_overrides = {
        "extra_body": {
            "enable_thinking": True,
            "reasoning_effort": "high",
        }
    }
    assert resolved["request_overrides"] == expected_overrides
|
||||
|
||||
|
||||
def test_named_custom_runtime_propagates_model_pool_path(monkeypatch):
|
||||
"""Model should propagate even when credential pool handles credentials."""
|
||||
monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "my-server")
|
||||
@@ -1662,6 +1689,36 @@ def test_named_custom_runtime_propagates_model_pool_path(monkeypatch):
|
||||
assert resolved["api_key"] == "pool-key", "pool credentials should be used"
|
||||
|
||||
|
||||
def test_named_custom_runtime_propagates_extra_body_pool_path(monkeypatch):
    """Custom provider extra_body should survive credential-pool resolution."""
    def _provider_entry(p):
        return {
            "name": "my-gemma",
            "base_url": "http://localhost:8000/v1",
            "api_key": "test-key",
            "model": "google/gemma-4-31b-it",
            "extra_body": {"enable_thinking": True},
        }

    def _pool_result(*a, **k):
        # The pool supplies credentials but carries no extra_body of its own.
        return {
            "provider": "custom",
            "api_mode": "chat_completions",
            "base_url": "http://localhost:8000/v1",
            "api_key": "pool-key",
            "source": "pool:custom:my-gemma",
        }

    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "my-gemma")
    monkeypatch.setattr(rp, "_get_named_custom_provider", _provider_entry)
    monkeypatch.setattr(rp, "_try_resolve_from_custom_pool", _pool_result)

    resolved = rp.resolve_runtime_provider(requested="my-gemma")

    expected_overrides = {"extra_body": {"enable_thinking": True}}
    assert resolved["request_overrides"] == expected_overrides
|
||||
|
||||
|
||||
def test_named_custom_runtime_no_model_when_absent(monkeypatch):
|
||||
"""When custom_providers entry has no model field, runtime should not either."""
|
||||
monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "my-server")
|
||||
@@ -2150,6 +2207,24 @@ class TestProviderEntryApiKeyEnvAlias:
|
||||
key_env so the set stays in sync with what the runtime actually reads."""
|
||||
from hermes_cli.config import _VALID_CUSTOM_PROVIDER_FIELDS
|
||||
assert "key_env" in _VALID_CUSTOM_PROVIDER_FIELDS
|
||||
|
||||
def test_extra_body_is_supported_schema(self):
|
||||
from hermes_cli.config import (
|
||||
_VALID_CUSTOM_PROVIDER_FIELDS,
|
||||
_normalize_custom_provider_entry,
|
||||
)
|
||||
entry = {
|
||||
"name": "vendor",
|
||||
"base_url": "https://api.vendor.example.com/v1",
|
||||
"extra_body": {
|
||||
"chat_template_kwargs": {"enable_thinking": True},
|
||||
"include_reasoning": True,
|
||||
},
|
||||
}
|
||||
normalized = _normalize_custom_provider_entry(dict(entry), provider_key="vendor")
|
||||
assert normalized is not None
|
||||
assert "extra_body" in _VALID_CUSTOM_PROVIDER_FIELDS
|
||||
assert normalized["extra_body"] == entry["extra_body"]
|
||||
# =============================================================================
|
||||
# Tencent TokenHub — API-key provider runtime resolution
|
||||
# =============================================================================
|
||||
|
||||
@@ -90,6 +90,7 @@ def test_show_status_reports_managed_nous_features(monkeypatch, capsys, tmp_path
|
||||
"tts": NousFeatureState("tts", "OpenAI TTS", True, True, True, True, False, True, "OpenAI TTS"),
|
||||
"browser": NousFeatureState("browser", "Browser automation", True, True, True, True, False, True, "Browser Use"),
|
||||
"modal": NousFeatureState("modal", "Modal execution", False, True, False, False, False, True, "local"),
|
||||
"app_tools": NousFeatureState("app_tools", "App tools (500+ apps)", True, True, True, True, False, True, "Nous Subscription"),
|
||||
},
|
||||
),
|
||||
raising=False,
|
||||
|
||||
@@ -12,8 +12,10 @@ from hermes_cli.tools_config import (
|
||||
_get_platform_tools,
|
||||
_platform_toolset_summary,
|
||||
_reconfigure_tool,
|
||||
_run_post_setup,
|
||||
_save_platform_tools,
|
||||
_toolset_has_keys,
|
||||
_toolset_needs_configuration_prompt,
|
||||
CONFIGURABLE_TOOLSETS,
|
||||
TOOL_CATEGORIES,
|
||||
_visible_providers,
|
||||
@@ -752,6 +754,91 @@ def test_numeric_mcp_server_name_does_not_crash_sorted():
|
||||
|
||||
# ─── Imagegen Backend Picker Wiring ────────────────────────────────────────
|
||||
|
||||
def test_toolset_has_keys_treats_no_key_providers_as_configured():
|
||||
config = {}
|
||||
|
||||
assert _toolset_has_keys("computer_use", config) is True
|
||||
|
||||
|
||||
def test_computer_use_needs_configuration_when_cua_driver_post_setup_pending():
|
||||
"""No-key providers can still need setup when their post_setup is unsatisfied.
|
||||
|
||||
Returning users enabling Computer Use through `hermes tools` must reach the
|
||||
cua-driver post-setup installer even though the provider has no API keys.
|
||||
"""
|
||||
with patch("shutil.which", return_value=None):
|
||||
assert _toolset_needs_configuration_prompt("computer_use", {}) is True
|
||||
|
||||
|
||||
def test_computer_use_skips_configuration_when_cua_driver_already_installed():
|
||||
"""Installed post_setup dependencies should keep returning-user toggles no-op."""
|
||||
def fake_which(name: str):
|
||||
return "/usr/local/bin/cua-driver" if name == "cua-driver" else None
|
||||
|
||||
with patch("shutil.which", side_effect=fake_which):
|
||||
assert _toolset_needs_configuration_prompt("computer_use", {}) is False
|
||||
|
||||
|
||||
def test_computer_use_respects_custom_cua_driver_command():
|
||||
"""The setup gate should match runtime's HERMES_CUA_DRIVER_CMD override."""
|
||||
def fake_which(name: str):
|
||||
return "/opt/bin/custom-cua" if name == "custom-cua" else None
|
||||
|
||||
with patch.dict("os.environ", {"HERMES_CUA_DRIVER_CMD": "custom-cua"}), \
|
||||
patch("shutil.which", side_effect=fake_which):
|
||||
assert _toolset_needs_configuration_prompt("computer_use", {}) is False
|
||||
|
||||
|
||||
def test_computer_use_blank_custom_driver_command_falls_back_to_default():
|
||||
"""Blank overrides should not make the setup gate look for an empty command."""
|
||||
def fake_which(name: str):
|
||||
return "/usr/local/bin/cua-driver" if name == "cua-driver" else None
|
||||
|
||||
with patch.dict("os.environ", {"HERMES_CUA_DRIVER_CMD": " "}), \
|
||||
patch("shutil.which", side_effect=fake_which):
|
||||
assert _toolset_needs_configuration_prompt("computer_use", {}) is False
|
||||
|
||||
|
||||
def test_computer_use_post_setup_respects_custom_driver_command_when_installed():
|
||||
"""post_setup already-installed checks should version-probe the override."""
|
||||
def fake_which(name: str):
|
||||
return "/opt/bin/custom-cua" if name == "custom-cua" else None
|
||||
|
||||
with patch.dict("os.environ", {"HERMES_CUA_DRIVER_CMD": "custom-cua"}), \
|
||||
patch("platform.system", return_value="Darwin"), \
|
||||
patch("shutil.which", side_effect=fake_which), \
|
||||
patch("subprocess.run") as run:
|
||||
run.return_value.stdout = "custom 1.2.3\n"
|
||||
|
||||
_run_post_setup("cua_driver")
|
||||
|
||||
run.assert_called_once()
|
||||
assert run.call_args.args[0] == ["custom-cua", "--version"]
|
||||
|
||||
|
||||
def test_computer_use_post_setup_missing_override_does_not_accept_default_binary():
|
||||
"""A default cua-driver binary must not satisfy a missing runtime override."""
|
||||
seen = []
|
||||
|
||||
def fake_which(name: str):
|
||||
seen.append(name)
|
||||
if name == "cua-driver":
|
||||
return "/usr/local/bin/cua-driver"
|
||||
if name == "curl":
|
||||
return None
|
||||
return None
|
||||
|
||||
with patch.dict("os.environ", {"HERMES_CUA_DRIVER_CMD": "custom-cua"}), \
|
||||
patch("platform.system", return_value="Darwin"), \
|
||||
patch("shutil.which", side_effect=fake_which), \
|
||||
patch("subprocess.run") as run:
|
||||
_run_post_setup("cua_driver")
|
||||
|
||||
run.assert_not_called()
|
||||
assert "custom-cua" in seen
|
||||
assert "curl" in seen
|
||||
|
||||
|
||||
class TestImagegenBackendRegistry:
|
||||
"""IMAGEGEN_BACKENDS tags drive the model picker flow in tools_config."""
|
||||
|
||||
|
||||
@@ -168,7 +168,7 @@ def test_make_tui_argv_skips_build_only_on_termux_when_fresh(
|
||||
|
||||
argv, cwd = main_mod._make_tui_argv(tmp_path, tui_dev=False)
|
||||
|
||||
assert argv == ["/bin/node", str(tmp_path / "dist" / "entry.js")]
|
||||
assert argv == ["/bin/node", "--expose-gc", str(tmp_path / "dist" / "entry.js")]
|
||||
assert cwd == tmp_path
|
||||
|
||||
|
||||
|
||||
@@ -283,6 +283,233 @@ def test_fast_tui_launch_is_termux_only(monkeypatch, main_mod):
|
||||
assert main_mod._try_termux_fast_tui_launch() is False
|
||||
|
||||
|
||||
def test_termux_fast_cli_launch_chat_uses_light_parser(monkeypatch, main_mod):
    """On Termux, ``hermes chat ...`` is handled by the fast launch and reaches cmd_chat."""
    captured = {}
    prepared = []

    def _record_startup(args):
        prepared.append(args.command)

    def _record_chat(args):
        captured.update(
            {"query": args.query, "toolsets": args.toolsets, "command": args.command}
        )

    monkeypatch.setenv("TERMUX_VERSION", "1")
    monkeypatch.delenv("HERMES_TUI", raising=False)
    monkeypatch.setattr(
        sys, "argv", ["hermes", "chat", "-q", "hello", "--toolsets", "web,terminal"]
    )
    monkeypatch.setattr(main_mod, "_prepare_agent_startup", _record_startup)
    monkeypatch.setattr(main_mod, "cmd_chat", _record_chat)

    launched = main_mod._try_termux_fast_cli_launch()

    assert launched is True
    assert prepared == ["chat"]
    assert captured == {
        "query": "hello",
        "toolsets": "web,terminal",
        "command": "chat",
    }
|
||||
|
||||
|
||||
def test_termux_fast_cli_launch_oneshot_uses_light_parser(monkeypatch, main_mod):
    """On Termux, ``hermes -z ...`` calls run_oneshot and exits with its return value."""
    captured = {}
    prepared = []

    def _record_startup(args):
        prepared.append(args.command)

    def run_oneshot(prompt, **kwargs):
        captured.update({"prompt": prompt, **kwargs})
        return 17  # expected to surface as the SystemExit code

    monkeypatch.setenv("TERMUX_VERSION", "1")
    monkeypatch.delenv("HERMES_TUI", raising=False)
    monkeypatch.setattr(
        sys,
        "argv",
        ["hermes", "-z", "hello", "--model", "gpt-test", "--provider", "openai"],
    )
    monkeypatch.setattr(main_mod, "_prepare_agent_startup", _record_startup)
    # Stub module so `hermes_cli.oneshot` resolves to the recorder above.
    monkeypatch.setitem(
        sys.modules,
        "hermes_cli.oneshot",
        types.SimpleNamespace(run_oneshot=run_oneshot),
    )

    with pytest.raises(SystemExit) as exc:
        main_mod._try_termux_fast_cli_launch()

    assert exc.value.code == 17
    assert prepared == [None]
    assert captured == {
        "prompt": "hello",
        "model": "gpt-test",
        "provider": "openai",
        "toolsets": None,
    }
|
||||
|
||||
|
||||
def test_termux_fast_cli_launch_version_skips_update_check(monkeypatch, main_mod):
    """On Termux, ``hermes version`` prints version info with check_updates=False."""
    captured = []

    def _record_version(*, check_updates):
        captured.append(check_updates)

    monkeypatch.setenv("TERMUX_VERSION", "1")
    monkeypatch.delenv("HERMES_TUI", raising=False)
    monkeypatch.setattr(sys, "argv", ["hermes", "version"])
    monkeypatch.setattr(main_mod, "_print_version_info", _record_version)

    launched = main_mod._try_termux_fast_cli_launch()

    assert launched is True
    assert captured == [False]
|
||||
|
||||
|
||||
def test_termux_fast_cli_launch_skips_help(monkeypatch, main_mod):
    """``--help`` invocations are not handled by the Termux fast launch."""
    monkeypatch.setenv("TERMUX_VERSION", "1")
    monkeypatch.delenv("HERMES_TUI", raising=False)
    monkeypatch.setattr(sys, "argv", ["hermes", "chat", "--help"])

    launched = main_mod._try_termux_fast_cli_launch()

    assert launched is False
|
||||
|
||||
|
||||
def test_termux_fast_cli_launch_can_be_disabled(monkeypatch, main_mod):
    """HERMES_TERMUX_DISABLE_FAST_CLI=1 turns the Termux fast launch off."""
    termux_env = {
        "TERMUX_VERSION": "1",
        "HERMES_TERMUX_DISABLE_FAST_CLI": "1",
    }
    for key, value in termux_env.items():
        monkeypatch.setenv(key, value)
    monkeypatch.delenv("HERMES_TUI", raising=False)
    monkeypatch.setattr(sys, "argv", ["hermes", "version"])

    launched = main_mod._try_termux_fast_cli_launch()

    assert launched is False
|
||||
|
||||
|
||||
def test_termux_bundled_skills_stamp_controls_sync(monkeypatch, tmp_path, main_mod):
    """Sync is needed before stamping, not after, and again once forced via env."""
    monkeypatch.setenv("TERMUX_VERSION", "1")
    monkeypatch.setattr(main_mod, "get_hermes_home", lambda: tmp_path)
    monkeypatch.setattr(main_mod, "_termux_bundled_skills_fingerprint", lambda: "fp1")

    # No stamp yet.
    needed_before_stamp = main_mod._termux_bundled_skills_sync_needed()
    assert needed_before_stamp is True

    # Stamp written with the same fingerprint.
    main_mod._mark_termux_bundled_skills_synced()
    needed_after_stamp = main_mod._termux_bundled_skills_sync_needed()
    assert needed_after_stamp is False

    # Force flag set while the stamp is still in place.
    monkeypatch.setenv("HERMES_TERMUX_FORCE_SKILLS_SYNC", "1")
    needed_when_forced = main_mod._termux_bundled_skills_sync_needed()
    assert needed_when_forced is True
|
||||
|
||||
|
||||
def test_termux_skips_bundled_skill_sync_when_stamp_fresh(monkeypatch, tmp_path, main_mod):
    """With a stamp matching the fingerprint, startup sync is skipped entirely."""
    calls = []

    def sync_skills(quiet):
        calls.append(quiet)

    monkeypatch.setenv("TERMUX_VERSION", "1")
    monkeypatch.setattr(main_mod, "get_hermes_home", lambda: tmp_path)
    monkeypatch.setattr(main_mod, "_termux_bundled_skills_fingerprint", lambda: "fp1")
    main_mod._mark_termux_bundled_skills_synced()
    # Stub module so any sync attempt would be recorded in `calls`.
    monkeypatch.setitem(
        sys.modules,
        "tools.skills_sync",
        types.SimpleNamespace(sync_skills=sync_skills),
    )

    synced = main_mod._sync_bundled_skills_for_startup()

    assert synced is False
    assert calls == []
|
||||
|
||||
|
||||
def test_termux_forced_bundled_skill_sync_runs(monkeypatch, tmp_path, main_mod):
    """With the force flag set, startup sync runs and is invoked with quiet=True."""
    calls = []

    def sync_skills(quiet):
        calls.append(quiet)

    monkeypatch.setenv("TERMUX_VERSION", "1")
    monkeypatch.setenv("HERMES_TERMUX_FORCE_SKILLS_SYNC", "1")
    monkeypatch.setattr(main_mod, "get_hermes_home", lambda: tmp_path)
    monkeypatch.setattr(main_mod, "_termux_bundled_skills_fingerprint", lambda: "fp1")
    # Stub module so the sync call is recorded in `calls`.
    monkeypatch.setitem(
        sys.modules,
        "tools.skills_sync",
        types.SimpleNamespace(sync_skills=sync_skills),
    )

    synced = main_mod._sync_bundled_skills_for_startup()

    assert synced is True
    assert calls == [True]
|
||||
|
||||
|
||||
def test_read_git_revision_fingerprint_resolves_packed_refs(tmp_path, main_mod):
|
||||
repo = tmp_path / "repo"
|
||||
git_dir = repo / ".git"
|
||||
git_dir.mkdir(parents=True)
|
||||
(git_dir / "HEAD").write_text("ref: refs/heads/main\n", encoding="utf-8")
|
||||
packed_sha = "1234567890abcdef1234567890abcdef12345678"
|
||||
(git_dir / "packed-refs").write_text(
|
||||
"# pack-refs with: peeled fully-peeled sorted\n"
|
||||
f"{packed_sha} refs/heads/main\n"
|
||||
"abcdef0000000000000000000000000000000000 refs/tags/v1.0\n"
|
||||
"^99999999aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
fingerprint = main_mod._read_git_revision_fingerprint(repo)
|
||||
|
||||
assert fingerprint == f"git:refs/heads/main:{packed_sha}"
|
||||
|
||||
|
||||
def test_read_git_revision_fingerprint_packed_refs_in_worktree_common_dir(
|
||||
tmp_path, main_mod
|
||||
):
|
||||
main_repo = tmp_path / "repo"
|
||||
common_git = main_repo / ".git"
|
||||
common_git.mkdir(parents=True)
|
||||
packed_sha = "fedcba9876543210fedcba9876543210fedcba98"
|
||||
(common_git / "packed-refs").write_text(
|
||||
f"{packed_sha} refs/heads/main\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
worktree = tmp_path / "wt"
|
||||
worktree.mkdir()
|
||||
wt_gitdir = common_git / "worktrees" / "wt"
|
||||
wt_gitdir.mkdir(parents=True)
|
||||
(wt_gitdir / "HEAD").write_text("ref: refs/heads/main\n", encoding="utf-8")
|
||||
(wt_gitdir / "commondir").write_text("../..\n", encoding="utf-8")
|
||||
(worktree / ".git").write_text(f"gitdir: {wt_gitdir}\n", encoding="utf-8")
|
||||
|
||||
fingerprint = main_mod._read_git_revision_fingerprint(worktree)
|
||||
|
||||
assert fingerprint == f"git:refs/heads/main:{packed_sha}"
|
||||
|
||||
|
||||
def test_read_git_revision_fingerprint_loose_ref_in_worktree_common_dir(
|
||||
tmp_path, main_mod
|
||||
):
|
||||
"""`git worktree add -b NAME` writes the new branch ref to the common dir,
|
||||
not the per-worktree gitdir. The fingerprint must still resolve it."""
|
||||
main_repo = tmp_path / "repo"
|
||||
common_git = main_repo / ".git"
|
||||
common_git.mkdir(parents=True)
|
||||
loose_sha = "0123456789abcdef0123456789abcdef01234567"
|
||||
(common_git / "refs" / "heads").mkdir(parents=True)
|
||||
(common_git / "refs" / "heads" / "feature").write_text(
|
||||
loose_sha + "\n", encoding="utf-8"
|
||||
)
|
||||
|
||||
worktree = tmp_path / "wt"
|
||||
worktree.mkdir()
|
||||
wt_gitdir = common_git / "worktrees" / "wt"
|
||||
wt_gitdir.mkdir(parents=True)
|
||||
(wt_gitdir / "HEAD").write_text("ref: refs/heads/feature\n", encoding="utf-8")
|
||||
(wt_gitdir / "commondir").write_text("../..\n", encoding="utf-8")
|
||||
(worktree / ".git").write_text(f"gitdir: {wt_gitdir}\n", encoding="utf-8")
|
||||
|
||||
fingerprint = main_mod._read_git_revision_fingerprint(worktree)
|
||||
|
||||
assert fingerprint == f"git:refs/heads/feature:{loose_sha}"
|
||||
|
||||
|
||||
def test_read_git_revision_fingerprint_unresolved_ref_is_stable(tmp_path, main_mod):
|
||||
repo = tmp_path / "repo"
|
||||
git_dir = repo / ".git"
|
||||
git_dir.mkdir(parents=True)
|
||||
(git_dir / "HEAD").write_text("ref: refs/heads/missing\n", encoding="utf-8")
|
||||
|
||||
fingerprint = main_mod._read_git_revision_fingerprint(repo)
|
||||
|
||||
assert fingerprint == "git:refs/heads/missing:unresolved"
|
||||
|
||||
|
||||
def test_main_top_level_oneshot_accepts_toolsets(monkeypatch, main_mod):
|
||||
captured = {}
|
||||
|
||||
|
||||
@@ -0,0 +1,300 @@
|
||||
"""Behavior-parity check for the image-gen FAL plugin migration (#26241).
|
||||
|
||||
Spawns one subprocess per (version, scenario) cell — pinned to either
|
||||
``origin/main`` (legacy in-tree FAL fall-through + ``configured == "fal"``
|
||||
skip in ``_dispatch_to_plugin_provider``) or this PR's worktree (FAL is
|
||||
itself a plugin and the dispatcher routes every set provider through
|
||||
the registry). Each subprocess clears all FAL-related env vars + writes
|
||||
a ``config.yaml``, then asks the dispatcher how it would route an
|
||||
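``image_generate`` call. The emitted shape is a JSON object with the keys
``{dispatch_kind, provider_name, model, error_present}`` (``error_present``
is true when the subprocess captured an error message):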
|
||||
|
||||
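* ``dispatch_kind`` ∈ ``{"legacy_fal", "plugin", "error",
  "unknown_payload", "exception"}`` — whether the dispatcher returned
  ``None`` so the call falls through to the in-tree pipeline
  (``"legacy_fal"``), went through ``_dispatch_to_plugin_provider``
  (``"plugin"``), produced an explicit provider-not-registered error
  (``"error"``), returned a non-dict payload (``"unknown_payload"``),
  or raised (``"exception"``).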
|
||||
* ``provider_name`` — when ``dispatch_kind == "plugin"``, the
|
||||
resolved provider name. ``None`` otherwise.
|
||||
* ``model`` — the resolved FAL model id when applicable.
|
||||
|
||||
The parent process diffs the shapes per scenario. A diff means the
|
||||
migration introduced an observable behaviour change vs origin/main —
|
||||
likely a real regression for users on the existing config keys.
|
||||
|
||||
Run from the PR worktree:
|
||||
|
||||
python tests/plugins/image_gen/check_parity_vs_main.py
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parents[3]
|
||||
|
||||
|
||||
# Pin one path to current main, one to the PR worktree.
|
||||
# ``REPO_ROOT`` is ``.../.worktrees/<name>``; the main checkout lives
|
||||
# two levels up. When running directly from a regular clone (no
|
||||
# worktree), ``MAIN_DIR`` falls back to a sibling ``hermes-agent-main``
|
||||
# checkout if one exists.
|
||||
def _resolve_main_dir() -> Path:
    """Pick the checkout that stands in for ``origin/main``.

    A directory qualifies when it contains ``tools/image_generation_tool.py``.
    Candidates are tried in this order:

    1. the directory two levels above ``REPO_ROOT`` (only if it is not
       ``REPO_ROOT`` itself);
    2. a ``hermes-agent-main`` directory next to ``REPO_ROOT``.

    Returns:
        The first qualifying candidate, or ``REPO_ROOT`` when none qualifies.
    """
    marker = Path("tools") / "image_generation_tool.py"

    two_levels_up = REPO_ROOT.parent.parent
    if two_levels_up != REPO_ROOT and (two_levels_up / marker).exists():
        return two_levels_up

    named_sibling = REPO_ROOT.parent / "hermes-agent-main"
    return named_sibling if (named_sibling / marker).exists() else REPO_ROOT
|
||||
|
||||
|
||||
MAIN_DIR = _resolve_main_dir()
|
||||
PR_DIR = REPO_ROOT
|
||||
assert (PR_DIR / "tools" / "image_generation_tool.py").exists(), (
|
||||
f"PR_DIR={PR_DIR} doesn't look like a hermes-agent checkout"
|
||||
)
|
||||
|
||||
|
||||
SUBPROCESS_SCRIPT = r"""
|
||||
import json, os, sys, tempfile
|
||||
sys.path.insert(0, sys.argv[1])
|
||||
|
||||
# Isolated HERMES_HOME so the config write is hermetic.
|
||||
home = tempfile.mkdtemp()
|
||||
os.environ["HERMES_HOME"] = home
|
||||
|
||||
# Clear FAL-related env so dispatch decisions are config-driven.
|
||||
for k in (
|
||||
"FAL_KEY", "FAL_QUEUE_GATEWAY_URL",
|
||||
"TOOL_GATEWAY_DOMAIN", "TOOL_GATEWAY_USER_TOKEN",
|
||||
"FAL_IMAGE_MODEL",
|
||||
):
|
||||
os.environ.pop(k, None)
|
||||
|
||||
scenario_env = json.loads(sys.argv[2])
|
||||
os.environ.update(scenario_env)
|
||||
|
||||
config_yaml = sys.argv[3]
|
||||
config_path = os.path.join(home, "config.yaml")
|
||||
with open(config_path, "w") as f:
|
||||
f.write(config_yaml)
|
||||
|
||||
# Fresh import — must not have anything cached.
|
||||
for name in list(sys.modules):
|
||||
if (name.startswith("tools.")
|
||||
or name.startswith("agent.")
|
||||
or name.startswith("plugins.")
|
||||
or name.startswith("hermes_cli.")):
|
||||
sys.modules.pop(name, None)
|
||||
|
||||
import tools.image_generation_tool as image_tool
|
||||
|
||||
dispatch_kind = None
|
||||
provider_name = None
|
||||
model = None
|
||||
error_text = None
|
||||
|
||||
try:
|
||||
raw = image_tool._dispatch_to_plugin_provider("ping", "landscape")
|
||||
if raw is None:
|
||||
dispatch_kind = "legacy_fal"
|
||||
else:
|
||||
parsed = json.loads(raw) if isinstance(raw, str) else raw
|
||||
if isinstance(parsed, dict):
|
||||
if parsed.get("error_type") == "provider_not_registered":
|
||||
dispatch_kind = "error"
|
||||
error_text = parsed.get("error")
|
||||
else:
|
||||
dispatch_kind = "plugin"
|
||||
provider_name = parsed.get("provider")
|
||||
model = parsed.get("model")
|
||||
else:
|
||||
dispatch_kind = "unknown_payload"
|
||||
|
||||
if model is None:
|
||||
# _resolve_fal_model still returns the active FAL model id even
|
||||
# when dispatch goes to a non-FAL plugin — used for the diff
|
||||
# only when applicable.
|
||||
try:
|
||||
model_id, _meta = image_tool._resolve_fal_model()
|
||||
if dispatch_kind == "legacy_fal":
|
||||
model = model_id
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as exc:
|
||||
dispatch_kind = "exception"
|
||||
error_text = repr(exc)
|
||||
|
||||
shape = {
|
||||
"dispatch_kind": dispatch_kind,
|
||||
"provider_name": provider_name,
|
||||
"model": model,
|
||||
"error_present": error_text is not None,
|
||||
}
|
||||
print(json.dumps(shape))
|
||||
"""
|
||||
|
||||
|
||||
SCENARIOS: list[tuple[str, str, dict[str, str]]] = [
|
||||
# (label, config.yaml body, extra env vars)
|
||||
("no-config-no-env", "", {}),
|
||||
(
|
||||
"explicit-fal-no-creds",
|
||||
"image_gen:\n provider: fal\n",
|
||||
{},
|
||||
),
|
||||
(
|
||||
"explicit-fal-with-creds",
|
||||
"image_gen:\n provider: fal\n",
|
||||
{"FAL_KEY": "test-key"},
|
||||
),
|
||||
(
|
||||
"explicit-fal-with-model",
|
||||
"image_gen:\n provider: fal\n model: fal-ai/flux-2-pro\n",
|
||||
{"FAL_KEY": "test-key"},
|
||||
),
|
||||
(
|
||||
"explicit-typo-provider",
|
||||
"image_gen:\n provider: not-a-real-backend\n",
|
||||
{"FAL_KEY": "test-key"},
|
||||
),
|
||||
(
|
||||
"managed-gateway-only",
|
||||
"",
|
||||
{
|
||||
"TOOL_GATEWAY_DOMAIN": "nousresearch.com",
|
||||
"TOOL_GATEWAY_USER_TOKEN": "nous-token",
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def _run_scenario(repo_path: Path, label: str, config_yaml: str, env: dict) -> dict:
    """Run ``SUBPROCESS_SCRIPT`` against one checkout and return its shape.

    The interpreter is ``<repo_path>/.venv/bin/python`` when it exists, else
    ``<MAIN_DIR>/.venv/bin/python``, else a bare ``python3``.

    Args:
        repo_path: Checkout handed to the script as ``sys.argv[1]``.
        label: Scenario name; not used by this function.
        config_yaml: ``config.yaml`` body handed to the script as ``sys.argv[3]``.
        env: Extra environment variables, JSON-encoded into ``sys.argv[2]``.

    Returns:
        The JSON object parsed from the last line of the child's stdout.
        On any failure (launch error, timeout, non-zero exit, unparsable
        output) a dict containing an ``"error"`` key is returned instead, so
        the caller can report the scenario without aborting the run.
    """
    venv_python = repo_path / ".venv" / "bin" / "python"
    if not venv_python.exists():
        venv_python = MAIN_DIR / ".venv" / "bin" / "python"
    if not venv_python.exists():
        venv_python = Path("python3")

    timeout_seconds = 60
    try:
        out = subprocess.run(
            [
                str(venv_python),
                "-c",
                SUBPROCESS_SCRIPT,
                str(repo_path),
                json.dumps(env),
                config_yaml,
            ],
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
        )
    except subprocess.TimeoutExpired:
        # A hung child must surface as a per-scenario error, not a traceback
        # that kills the remaining scenarios.
        return {"error": f"subprocess timed out after {timeout_seconds}s"}
    except OSError as exc:
        # Raised when the interpreter cannot be launched (e.g. no ``python3``).
        return {"error": f"could not launch {venv_python}: {exc}"}

    if out.returncode != 0:
        return {
            "error": "subprocess failed",
            "stdout": out.stdout[-500:],
            "stderr": out.stderr[-500:],
        }
    try:
        # Only the last stdout line is parsed as the JSON shape.
        return json.loads(out.stdout.strip().splitlines()[-1])
    except Exception as exc:
        return {"error": f"could not parse output: {exc}", "stdout": out.stdout}
|
||||
|
||||
|
||||
def _reduce(shape: dict) -> dict:
|
||||
"""Reduce to the parts that matter for user-visible parity.
|
||||
|
||||
On origin/main, ``explicit-fal-*`` scenarios short-circuit to
|
||||
``legacy_fal`` because of the ``configured == "fal"`` skip. On the
|
||||
PR, those same scenarios route through the plugin and emit
|
||||
``dispatch_kind == "plugin"`` with ``provider_name == "fal"``.
|
||||
|
||||
Both shapes are functionally equivalent — the plugin's ``generate()``
|
||||
re-enters the same in-tree pipeline via ``_it`` indirection — but
|
||||
we want the diff to be visible so reviewers can sign off on the
|
||||
intentional behaviour delta.
|
||||
"""
|
||||
return {
|
||||
"dispatch_kind": shape.get("dispatch_kind"),
|
||||
"provider_name": shape.get("provider_name"),
|
||||
"model": shape.get("model"),
|
||||
"error_present": shape.get("error_present"),
|
||||
}
|
||||
|
||||
|
||||
def main() -> int:
    """Run every scenario against both checkouts and report parity.

    Each scenario gets one line: ``[OK]`` when the reduced shapes match,
    ``[DIFF]`` for the expected legacy_fal → plugin (fal) change, ``[FAIL]``
    for any other difference, and ``[ERR ]`` when either subprocess returned
    an error dict. A summary follows the per-scenario lines.

    Returns:
        ``1`` when any scenario errored or regressed, otherwise ``0``.
    """
    print(f"main: {MAIN_DIR}")
    print(f"pr: {PR_DIR}")
    print()

    if MAIN_DIR == PR_DIR:
        print(
            "WARN: MAIN_DIR == PR_DIR — diffs will be trivially identical.\n"
            " Set up a sibling 'hermes-agent-main' checkout pinned to "
            "origin/main to get real parity coverage."
        )
        print()

    regressed: list[str] = []
    broken: list[str] = []
    expected_diffs: list[tuple[str, dict, dict]] = []

    for label, config_yaml, env in SCENARIOS:
        shape_on_main = _run_scenario(MAIN_DIR, label, config_yaml, env)
        shape_on_pr = _run_scenario(PR_DIR, label, config_yaml, env)

        if "error" in shape_on_main or "error" in shape_on_pr:
            print(f" [ERR ] {label}: subprocess failed")
            print(f" main: {shape_on_main}")
            print(f" pr: {shape_on_pr}")
            broken.append(label)
            continue

        before = _reduce(shape_on_main)
        after = _reduce(shape_on_pr)

        if before == after:
            print(f" [OK] {label}: {before}")
        elif (
            # The only acceptable difference: main answers legacy_fal while
            # the PR dispatches to the "fal" plugin. Anything else counts as
            # a regression.
            before.get("dispatch_kind") == "legacy_fal"
            and after.get("dispatch_kind") == "plugin"
            and after.get("provider_name") == "fal"
        ):
            print(f" [DIFF] {label}: legacy_fal → plugin (fal) — expected")
            expected_diffs.append((label, before, after))
        else:
            print(f" [FAIL] {label}")
            print(f" main: {before}")
            print(f" pr: {after}")
            regressed.append(label)

    print()
    if broken:
        print(f"SUBPROCESS ERRORS in {len(broken)} scenario(s):")
        for name in broken:
            print(f" - {name}")
    if regressed:
        print(f"BEHAVIOUR REGRESSION in {len(regressed)} scenario(s):")
        for name in regressed:
            print(f" - {name}")
    if expected_diffs:
        print(
            f"INTENTIONAL DIFFS ({len(expected_diffs)}): "
            f"legacy_fal → plugin dispatch for explicit FAL paths."
        )

    if regressed or broken:
        return 1
    print(f"PARITY OK across {len(SCENARIOS)} scenarios.")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,226 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Tests for the FAL.ai image generation plugin.
|
||||
|
||||
The plugin is a thin registration adapter — actual FAL pipeline logic
|
||||
lives in ``tools.image_generation_tool`` and is exercised by
|
||||
``tests/tools/test_image_generation.py``. These tests focus on:
|
||||
|
||||
* the ``ImageGenProvider`` ABC surface (name, models, schema)
|
||||
* call-time indirection (``_it`` resolution at ``generate()`` time so
|
||||
``monkeypatch.setattr(image_tool, ...)`` keeps working)
|
||||
* response shape stamping (provider/prompt/aspect_ratio/model)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Provider surface
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestFalImageGenProviderSurface:
    """Static surface of the provider: name, display name, models, schema."""

    def test_name(self):
        from plugins.image_gen.fal import FalImageGenProvider

        provider = FalImageGenProvider()
        assert provider.name == "fal"

    def test_display_name(self):
        from plugins.image_gen.fal import FalImageGenProvider

        provider = FalImageGenProvider()
        assert provider.display_name == "FAL.ai"

    def test_default_model_matches_legacy(self):
        from plugins.image_gen.fal import FalImageGenProvider
        from tools.image_generation_tool import DEFAULT_MODEL

        reported_default = FalImageGenProvider().default_model()
        assert reported_default == DEFAULT_MODEL

    def test_list_models_uses_legacy_catalog(self):
        from plugins.image_gen.fal import FalImageGenProvider
        from tools.image_generation_tool import FAL_MODELS

        catalog = FalImageGenProvider().list_models()

        # The provider must mirror FAL_MODELS exactly, whatever it ships.
        listed_ids = set()
        for model_entry in catalog:
            listed_ids.add(model_entry["id"])
        assert listed_ids == set(FAL_MODELS)

        # Every entry carries the expected first-class fields.
        required_fields = ("id", "display", "speed", "strengths", "price")
        for model_entry in catalog:
            for field in required_fields:
                assert field in model_entry

    def test_setup_schema_advertises_fal_key(self):
        from plugins.image_gen.fal import FalImageGenProvider

        setup_schema = FalImageGenProvider().get_setup_schema()

        assert setup_schema["name"] == "FAL.ai"
        assert setup_schema["badge"] == "paid"
        advertised_keys = {var["key"] for var in setup_schema.get("env_vars", [])}
        assert "FAL_KEY" in advertised_keys
|
||||
|
||||
|
||||
class TestFalImageGenProviderAvailability:
    """``is_available()`` for each outcome of a patched ``check_fal_api_key``."""

    def test_is_available_when_legacy_check_passes(self, monkeypatch):
        import tools.image_generation_tool as image_tool
        from plugins.image_gen.fal import FalImageGenProvider

        def key_present():
            return True

        monkeypatch.setattr(image_tool, "check_fal_api_key", key_present)

        available = FalImageGenProvider().is_available()
        assert available is True

    def test_is_available_false_when_legacy_check_fails(self, monkeypatch):
        import tools.image_generation_tool as image_tool
        from plugins.image_gen.fal import FalImageGenProvider

        def key_missing():
            return False

        monkeypatch.setattr(image_tool, "check_fal_api_key", key_missing)

        available = FalImageGenProvider().is_available()
        assert available is False

    def test_is_available_handles_legacy_exception(self, monkeypatch):
        import tools.image_generation_tool as image_tool
        from plugins.image_gen.fal import FalImageGenProvider

        def check_raises():
            raise RuntimeError("config broke")

        monkeypatch.setattr(image_tool, "check_fal_api_key", check_raises)

        # The picker must not see the exception; it reads as "not available".
        available = FalImageGenProvider().is_available()
        assert available is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# generate() — call-time indirection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestFalImageGenProviderGenerate:
    """``generate()`` against patched ``tools.image_generation_tool`` attributes."""

    @staticmethod
    def _pin_fal_model(monkeypatch, image_tool):
        """Patch ``_resolve_fal_model`` to return a fixed id and empty metadata."""
        monkeypatch.setattr(
            image_tool,
            "_resolve_fal_model",
            lambda: ("fal-ai/flux-2/klein/9b", {}),
        )

    def test_generate_delegates_to_legacy_image_generate_tool(self, monkeypatch):
        """``image_generate_tool`` has to be resolved when ``generate()`` runs,
        otherwise ``monkeypatch.setattr(image_tool, "image_generate_tool", ...)``
        would have no effect."""
        import tools.image_generation_tool as image_tool
        from plugins.image_gen.fal import FalImageGenProvider

        calls = []

        def recording_tool(prompt, aspect_ratio, **kwargs):
            calls.append((prompt, aspect_ratio, kwargs))
            return json.dumps({"success": True, "image": "https://fake/image.png"})

        monkeypatch.setattr(image_tool, "image_generate_tool", recording_tool)
        self._pin_fal_model(monkeypatch, image_tool)

        result = FalImageGenProvider().generate(
            "a serene mountain landscape",
            aspect_ratio="square",
            seed=42,
        )

        seen_prompt, seen_ratio, seen_extra = calls[-1]
        assert seen_prompt == "a serene mountain landscape"
        assert seen_ratio == "square"
        assert seen_extra == {"seed": 42}

        assert result["success"] is True
        assert result["image"] == "https://fake/image.png"
        # Fields stamped on for the unified response shape.
        stamped = {
            "provider": "fal",
            "prompt": "a serene mountain landscape",
            "aspect_ratio": "square",
            "model": "fal-ai/flux-2/klein/9b",
        }
        for key, expected in stamped.items():
            assert result[key] == expected

    def test_generate_invalid_aspect_ratio_is_coerced(self, monkeypatch):
        import tools.image_generation_tool as image_tool
        from plugins.image_gen.fal import FalImageGenProvider

        ratios_seen = []

        def recording_tool(prompt, aspect_ratio, **kwargs):
            ratios_seen.append(aspect_ratio)
            return json.dumps({"success": True, "image": "x"})

        monkeypatch.setattr(image_tool, "image_generate_tool", recording_tool)
        self._pin_fal_model(monkeypatch, image_tool)

        FalImageGenProvider().generate("p", aspect_ratio="not-a-real-ratio")

        # ``resolve_aspect_ratio`` clamps to landscape.
        assert ratios_seen[-1] == "landscape"

    def test_generate_passthrough_drops_none_kwargs(self, monkeypatch):
        import tools.image_generation_tool as image_tool
        from plugins.image_gen.fal import FalImageGenProvider

        forwarded = {}

        def recording_tool(prompt, aspect_ratio, **kwargs):
            forwarded.update(kwargs)
            return json.dumps({"success": True, "image": "x"})

        monkeypatch.setattr(image_tool, "image_generate_tool", recording_tool)
        self._pin_fal_model(monkeypatch, image_tool)

        FalImageGenProvider().generate(
            "p",
            aspect_ratio="landscape",
            seed=None,
            num_images=2,
            guidance_scale=None,
        )

        # A forwarded ``None`` would override the model's defaults inside the
        # legacy payload builder, so ``None`` values must be dropped.
        for dropped in ("seed", "guidance_scale"):
            assert dropped not in forwarded
        assert forwarded.get("num_images") == 2

    def test_generate_catches_exception_from_legacy(self, monkeypatch):
        import tools.image_generation_tool as image_tool
        from plugins.image_gen.fal import FalImageGenProvider

        def exploding_tool(*args, **kwargs):
            raise RuntimeError("FAL endpoint exploded")

        monkeypatch.setattr(image_tool, "image_generate_tool", exploding_tool)

        outcome = FalImageGenProvider().generate("p")

        assert outcome["success"] is False
        assert "FAL image generation failed" in outcome["error"]
        assert outcome["error_type"] == "RuntimeError"
        assert outcome["provider"] == "fal"

    def test_generate_invalid_json_response(self, monkeypatch):
        import tools.image_generation_tool as image_tool
        from plugins.image_gen.fal import FalImageGenProvider

        def non_json_tool(**kw):
            return "not-json"

        monkeypatch.setattr(image_tool, "image_generate_tool", non_json_tool)
        self._pin_fal_model(monkeypatch, image_tool)

        outcome = FalImageGenProvider().generate("p")

        assert outcome["success"] is False
        assert "Invalid JSON" in outcome["error"]
        assert outcome["provider"] == "fal"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry wiring
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestFalImageGenPluginRegistration:
    """``register(ctx)`` hands a ``FalImageGenProvider`` to the plugin context."""

    def test_register_wires_provider_into_registry(self):
        from plugins.image_gen.fal import FalImageGenProvider, register

        plugin_ctx = MagicMock()
        register(plugin_ctx)

        hook = plugin_ctx.register_image_gen_provider
        hook.assert_called_once()
        # Exactly one positional argument is expected; keywords are ignored.
        positional, _keywords = hook.call_args
        (provider,) = positional
        assert isinstance(provider, FalImageGenProvider)
|
||||
@@ -62,8 +62,9 @@ def plugin_api(tmp_path, monkeypatch):
|
||||
class _FakeSessionDB:
|
||||
"""Stand-in for hermes_state.SessionDB that records scan calls."""
|
||||
|
||||
def __init__(self, session_count: int):
|
||||
def __init__(self, session_count: int, scan_delay: float = 0):
|
||||
self.session_count = session_count
|
||||
self.scan_delay = scan_delay
|
||||
self.last_limit: Optional[int] = None
|
||||
self.last_include_children: Optional[bool] = None
|
||||
self.list_calls = 0
|
||||
@@ -78,6 +79,8 @@ class _FakeSessionDB:
|
||||
include_children: bool = False,
|
||||
project_compression_tips: bool = True,
|
||||
) -> List[Dict[str, Any]]:
|
||||
if self.scan_delay:
|
||||
time.sleep(self.scan_delay)
|
||||
self.last_limit = limit
|
||||
self.last_include_children = include_children
|
||||
self.list_calls += 1
|
||||
@@ -225,10 +228,8 @@ def test_evaluate_all_stale_cache_serves_stale_and_refreshes_in_background(plugi
|
||||
the stale data immediately and kicks a background refresh. Users don't
|
||||
stare at a loading spinner every time TTL expires.
|
||||
"""
|
||||
fake_db = _FakeSessionDB(session_count=10)
|
||||
fake_db = _FakeSessionDB(session_count=10, scan_delay=2.0)
|
||||
_install_fake_session_db(plugin_api, fake_db)
|
||||
|
||||
# Seed a stale snapshot on disk.
|
||||
stale_generated_at = int(time.time()) - plugin_api.SNAPSHOT_TTL_SECONDS - 60
|
||||
stale_payload = {
|
||||
"achievements": [],
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
|
||||
Covers:
|
||||
|
||||
- All seven bundled plugins (brave-free, ddgs, searxng, exa, parallel,
|
||||
tavily, firecrawl) instantiate and self-report the expected
|
||||
- All eight bundled plugins (brave-free, ddgs, searxng, exa, parallel,
|
||||
tavily, firecrawl, xai) instantiate and self-report the expected
|
||||
capabilities + ABC-derived defaults.
|
||||
- Each plugin's ``is_available()`` correctly reflects env-var presence.
|
||||
- The web_search_registry resolves an active provider in the documented
|
||||
@@ -47,6 +47,7 @@ def _clear_web_env(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
"FIRECRAWL_GATEWAY_URL",
|
||||
"TOOL_GATEWAY_DOMAIN",
|
||||
"TOOL_GATEWAY_USER_TOKEN",
|
||||
"XAI_API_KEY",
|
||||
):
|
||||
monkeypatch.delenv(k, raising=False)
|
||||
|
||||
@@ -70,7 +71,7 @@ def _isolate_env(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
|
||||
|
||||
class TestBundledPluginsRegister:
|
||||
"""All seven bundled web plugins discover and register correctly."""
|
||||
"""All eight bundled web plugins discover and register correctly."""
|
||||
|
||||
def test_all_seven_plugins_present_in_registry(self) -> None:
|
||||
_ensure_plugins_loaded()
|
||||
@@ -85,6 +86,7 @@ class TestBundledPluginsRegister:
|
||||
"parallel",
|
||||
"searxng",
|
||||
"tavily",
|
||||
"xai",
|
||||
]
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -100,6 +102,8 @@ class TestBundledPluginsRegister:
|
||||
# disabled in the migration (fell through to a legacy inline
|
||||
# path); the follow-up commit enabled it natively.
|
||||
("firecrawl", True, True, True),
|
||||
# xai: search-only via Grok's agentic web_search tool.
|
||||
("xai", True, False, False),
|
||||
],
|
||||
)
|
||||
def test_capability_flags_match_spec(
|
||||
@@ -120,7 +124,7 @@ class TestBundledPluginsRegister:
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"plugin_name",
|
||||
["brave-free", "ddgs", "searxng", "exa", "parallel", "tavily", "firecrawl"],
|
||||
["brave-free", "ddgs", "searxng", "exa", "parallel", "tavily", "firecrawl", "xai"],
|
||||
)
|
||||
def test_each_plugin_has_name_and_display_name(self, plugin_name: str) -> None:
|
||||
_ensure_plugins_loaded()
|
||||
@@ -133,7 +137,7 @@ class TestBundledPluginsRegister:
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"plugin_name",
|
||||
["brave-free", "ddgs", "searxng", "exa", "parallel", "tavily", "firecrawl"],
|
||||
["brave-free", "ddgs", "searxng", "exa", "parallel", "tavily", "firecrawl", "xai"],
|
||||
)
|
||||
def test_each_plugin_has_setup_schema(self, plugin_name: str) -> None:
|
||||
"""``get_setup_schema()`` returns a dict the picker can consume."""
|
||||
@@ -239,6 +243,17 @@ class TestIsAvailable:
|
||||
# Truthy or falsy, just must not raise.
|
||||
_ = bool(p.is_available())
|
||||
|
||||
def test_xai_requires_api_key_or_oauth(self, monkeypatch: pytest.MonkeyPatch) -> None:
    """xAI is available only with XAI_API_KEY or OAuth tokens in auth.json."""
    _ensure_plugins_loaded()
    from agent.web_search_registry import get_provider

    xai = get_provider("xai")
    assert xai is not None

    # Neither XAI_API_KEY nor auth.json is present at this point.
    available_without_key = xai.is_available()
    assert available_without_key is False

    monkeypatch.setenv("XAI_API_KEY", "real")
    available_with_key = xai.is_available()
    assert available_with_key is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry resolution semantics (Option B — conservative smart fallback)
|
||||
@@ -455,7 +470,7 @@ class TestErrorResponseShapes:
|
||||
if result["results"]:
|
||||
assert "error" in result["results"][0]
|
||||
|
||||
def test_firecrawl_crawl_returns_error_dict_when_unconfigured(self) -> None:
|
||||
def test_firecrawl_crawl_returns_error_dict_when_unconfigured(self):
|
||||
"""firecrawl crawl is async (wraps SDK in to_thread); error must be
|
||||
surfaced via the per-page result shape, not raised."""
|
||||
_ensure_plugins_loaded()
|
||||
@@ -473,3 +488,15 @@ class TestErrorResponseShapes:
|
||||
assert len(result["results"]) >= 1
|
||||
assert "error" in result["results"][0]
|
||||
assert result["results"][0]["url"] == "https://example.com"
|
||||
|
||||
def test_xai_search_returns_error_dict_when_unconfigured(self) -> None:
    """Without XAI_API_KEY, an xAI search yields a typed error dict."""
    _ensure_plugins_loaded()
    from agent.web_search_registry import get_provider

    xai = get_provider("xai")
    assert xai is not None

    outcome = xai.search("test", limit=5)

    assert isinstance(outcome, dict)
    assert outcome.get("success") is False
    assert "error" in outcome
|
||||
|
||||
@@ -236,7 +236,7 @@ class TestQwenParity:
|
||||
|
||||
|
||||
class TestCustomOllamaParity:
|
||||
"""Custom/Ollama: num_ctx, think=false — now tested via profile."""
|
||||
"""Custom/Ollama: num_ctx, thinking controls — now tested via profile."""
|
||||
|
||||
def test_ollama_num_ctx(self, transport):
|
||||
kw = transport.build_kwargs(
|
||||
|
||||
@@ -0,0 +1,260 @@
|
||||
"""Tests for reactive multimodal-tool-content recovery.
|
||||
|
||||
Covers the full chain for providers that reject list-type content in
|
||||
``role: "tool"`` messages (Xiaomi MiMo's 400 "text is not set", etc.):
|
||||
|
||||
1. agent/error_classifier.py: 400 with the right wording classifies as
|
||||
``FailoverReason.multimodal_tool_content_unsupported``.
|
||||
2. run_agent._try_strip_image_parts_from_tool_messages downgrades tool
|
||||
messages whose ``content`` is a list-with-image to a string text
|
||||
summary, in-place, and records the active (provider, model) in
|
||||
``self._no_list_tool_content_models`` so future tool results in this
|
||||
session preemptively downgrade.
|
||||
3. run_agent._tool_result_content_for_active_model short-circuits to a
|
||||
text summary when the (provider, model) is in the cache, even though
|
||||
``_model_supports_vision`` returns True — avoiding a wasted round
|
||||
trip on every subsequent screenshot in the session.
|
||||
|
||||
The end-to-end retry loop wiring (`conversation_loop.py`) is exercised by
|
||||
the classifier signal + helper-mutation tests; the integration only adds
|
||||
a trivial flag-and-continue around the existing pattern used for
|
||||
``image_too_large`` recovery.
|
||||
|
||||
See: https://github.com/NousResearch/hermes-agent/issues/27344
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from agent.error_classifier import FailoverReason, classify_api_error
|
||||
|
||||
|
||||
class _FakeApiError(Exception):
|
||||
"""Stand-in for an openai.BadRequestError with status_code + body."""
|
||||
|
||||
def __init__(self, status_code: int, message: str, body: dict | None = None):
|
||||
super().__init__(message)
|
||||
self.status_code = status_code
|
||||
self.body = body or {"error": {"message": message}}
|
||||
self.response = None
|
||||
|
||||
|
||||
def _make_agent(provider: str = "xiaomi", model: str = "mimo-v2.5"):
|
||||
"""Build a bare AIAgent for method-level testing, no provider setup."""
|
||||
from run_agent import AIAgent
|
||||
agent = object.__new__(AIAgent)
|
||||
agent.provider = provider
|
||||
agent.model = model
|
||||
return agent
|
||||
|
||||
|
||||
# ─── Strip helper ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestStripImagePartsHelper:
|
||||
def test_no_messages_returns_false(self):
|
||||
agent = _make_agent()
|
||||
assert agent._try_strip_image_parts_from_tool_messages([]) is False
|
||||
assert agent._try_strip_image_parts_from_tool_messages(None) is False
|
||||
|
||||
def test_no_tool_messages_returns_false(self):
|
||||
agent = _make_agent()
|
||||
msgs = [
|
||||
{"role": "user", "content": "plain text"},
|
||||
{"role": "assistant", "content": "ack"},
|
||||
]
|
||||
assert agent._try_strip_image_parts_from_tool_messages(msgs) is False
|
||||
|
||||
def test_tool_message_with_string_content_unchanged(self):
|
||||
agent = _make_agent()
|
||||
msgs = [
|
||||
{"role": "tool", "tool_call_id": "x", "content": "plain string result"},
|
||||
]
|
||||
assert agent._try_strip_image_parts_from_tool_messages(msgs) is False
|
||||
assert msgs[0]["content"] == "plain string result"
|
||||
|
||||
def test_tool_message_list_without_image_unchanged(self):
|
||||
"""List content with only text parts is left alone — caller surfaces
|
||||
the original error if this turns out to also be rejected."""
|
||||
agent = _make_agent()
|
||||
msgs = [
|
||||
{"role": "tool", "tool_call_id": "x", "content": [
|
||||
{"type": "text", "text": "hello"},
|
||||
]},
|
||||
]
|
||||
assert agent._try_strip_image_parts_from_tool_messages(msgs) is False
|
||||
|
||||
def test_tool_message_list_with_image_downgrades(self):
|
||||
agent = _make_agent()
|
||||
msgs = [
|
||||
{"role": "tool", "tool_call_id": "x", "content": [
|
||||
{"type": "text", "text": "AX summary: 5 buttons visible"},
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}},
|
||||
]},
|
||||
]
|
||||
assert agent._try_strip_image_parts_from_tool_messages(msgs) is True
|
||||
# Image stripped; text preserved as a string.
|
||||
assert isinstance(msgs[0]["content"], str)
|
||||
assert "AX summary" in msgs[0]["content"]
|
||||
assert "image_url" not in msgs[0]["content"]
|
||||
assert "iVBOR" not in msgs[0]["content"]
|
||||
|
||||
def test_tool_message_image_only_gets_placeholder(self):
|
||||
"""If the list had nothing but image parts, leave a placeholder so
|
||||
the assistant message has something to reference."""
|
||||
agent = _make_agent()
|
||||
msgs = [
|
||||
{"role": "tool", "tool_call_id": "x", "content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}},
|
||||
]},
|
||||
]
|
||||
assert agent._try_strip_image_parts_from_tool_messages(msgs) is True
|
||||
assert isinstance(msgs[0]["content"], str)
|
||||
assert "image content removed" in msgs[0]["content"]
|
||||
|
||||
def test_records_provider_model_in_session_cache(self):
|
||||
agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
|
||||
msgs = [
|
||||
{"role": "tool", "tool_call_id": "x", "content": [
|
||||
{"type": "text", "text": "summary"},
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}},
|
||||
]},
|
||||
]
|
||||
agent._try_strip_image_parts_from_tool_messages(msgs)
|
||||
assert ("xiaomi", "mimo-v2.5") in agent._no_list_tool_content_models
|
||||
|
||||
def test_only_tool_messages_get_downgraded(self):
    """Only ``role == "tool"`` messages are flattened.

    A user message with list content (text + image) must come back
    unchanged, while the tool message next to it becomes a string.
    """
    agent = _make_agent()
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "describe"},
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}},
        ],
    }
    tool_message = {
        "role": "tool",
        "tool_call_id": "x",
        "content": [
            {"type": "text", "text": "summary"},
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,Y"}},
        ],
    }
    msgs = [user_message, tool_message]

    agent._try_strip_image_parts_from_tool_messages(msgs)

    # User message untouched.
    user_content = msgs[0]["content"]
    assert isinstance(user_content, list)
    assert any(part.get("type") == "image_url" for part in user_content)

    # Tool message downgraded.
    tool_content = msgs[1]["content"]
    assert isinstance(tool_content, str)
    assert "summary" in tool_content
|
||||
|
||||
def test_skips_recording_when_no_model_id(self):
    """With empty provider/model strings nothing is added to the cache."""
    agent = _make_agent(provider="", model="")
    parts = [
        {"type": "text", "text": "summary"},
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}},
    ]
    msgs = [{"role": "tool", "tool_call_id": "x", "content": parts}]

    agent._try_strip_image_parts_from_tool_messages(msgs)

    recorded = agent._no_list_tool_content_models
    assert recorded == set()
|
||||
|
||||
|
||||
# ─── Short-circuit on cached models ──────────────────────────────────────────
|
||||
|
||||
|
||||
class TestToolResultContentShortCircuit:
    """Behaviour of ``_tool_result_content_for_active_model`` vs. the cache.

    When ``(provider, model)`` is present in
    ``_no_list_tool_content_models`` the method is expected to hand back a
    plain-text summary even though ``_model_supports_vision`` says True;
    otherwise the multimodal content list is returned.
    """

    def _multimodal_result(self, png_b64: str = "iVBORw0KGgoAAAA"):
        """Build a synthetic multimodal tool result (text part + PNG data URL)."""
        summary = "capture mode=som 800x600 app=Safari"
        text_part = {"type": "text", "text": summary}
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{png_b64}"},
        }
        meta = {"mode": "som", "width": 800, "height": 600, "elements": 5,
                "png_bytes": 1024}
        return {
            "_multimodal": True,
            "content": [text_part, image_part],
            "text_summary": summary,
            "meta": meta,
        }

    def test_returns_list_when_cache_empty_and_vision_supported(self, monkeypatch):
        """Empty cache + vision support -> the content-parts list comes back."""
        agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
        agent._no_list_tool_content_models = set()  # explicit empty
        monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)

        payload = self._multimodal_result()
        out = agent._tool_result_content_for_active_model("computer_use", payload)

        # Native multimodal path: returns the content parts list.
        assert isinstance(out, list)
        assert any(part.get("type") == "image_url" for part in out)

    def test_returns_text_summary_when_model_in_cache(self, monkeypatch):
        """A cached (provider, model) pair forces a plain string result."""
        agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
        agent._no_list_tool_content_models = {("xiaomi", "mimo-v2.5")}
        monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)

        payload = self._multimodal_result()
        out = agent._tool_result_content_for_active_model("computer_use", payload)

        # Short-circuit: a plain string summary, no image_url present.
        assert isinstance(out, str)
        for marker in ("data:image", "image_url"):
            assert marker not in out

    def test_cache_miss_on_different_model(self, monkeypatch):
        """The cache key is the exact (provider, model) pair.

        An entry for ``mimo-v2.5`` must leave an agent running
        ``mimo-v2.5-pro`` on the list-returning path.
        """
        agent = _make_agent(provider="xiaomi", model="mimo-v2.5-pro")
        agent._no_list_tool_content_models = {("xiaomi", "mimo-v2.5")}
        monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)

        payload = self._multimodal_result()
        out = agent._tool_result_content_for_active_model("computer_use", payload)

        assert isinstance(out, list)

    def test_missing_cache_attribute_falls_through(self, monkeypatch):
        """The method must cope with an agent lacking the cache attribute.

        Agents built via ``object.__new__`` (no ``__init__``) may not have
        ``_no_list_tool_content_models`` at all; that must not crash.
        """
        agent = _make_agent()
        # Deliberately do not assign _no_list_tool_content_models.
        # NOTE(review): this only exercises the "attribute absent" path if
        # _make_agent() itself leaves the attribute unset — confirm.
        monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)

        payload = self._multimodal_result()
        out = agent._tool_result_content_for_active_model("computer_use", payload)

        assert isinstance(out, list)
|
||||
|
||||
|
||||
# ─── Classifier ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestRecoveryEndToEndClassification:
    """Pin the classifier output for the error texts the recovery path uses.

    Each case feeds a provider-style HTTP 400 into ``classify_api_error``
    and expects ``FailoverReason.multimodal_tool_content_unsupported``.
    """

    def test_xiaomi_mimo_classifies(self):
        """Xiaomi's 'text is not set' 400 is classified and marked retryable."""
        xiaomi_message = (
            "Error code: 400 - {'error': {'code': '400', 'message': "
            "'Param Incorrect', 'param': 'text is not set', 'type': ''}}"
        )
        err = _FakeApiError(status_code=400, message=xiaomi_message)

        result = classify_api_error(err, provider="xiaomi", model="mimo-v2.5")

        assert result.reason == FailoverReason.multimodal_tool_content_unsupported
        assert result.retryable is True

    def test_alibaba_variant_classifies(self):
        """The 'tool_call.content must be string' wording maps to the same reason."""
        err = _FakeApiError(
            status_code=400,
            message="tool_call.content must be string",
        )

        result = classify_api_error(err, provider="alibaba", model="qwen3.5-plus")

        assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||
@@ -2636,6 +2636,31 @@ class TestRunConversation:
|
||||
assert result["final_response"] == "Final answer"
|
||||
assert result["completed"] is True
|
||||
|
||||
def test_ollama_small_runtime_context_fails_before_api_call(self, agent, caplog):
    """A 4096-token Ollama runtime context aborts the turn before any API call.

    The result must be flagged failed with the dedicated exit reason, the
    user-facing text must mention the model and the suggested setting, and
    a warning must be logged on ``agent.conversation_loop``.
    """
    self._setup_agent(agent)
    agent.model = "qwen3.5:9b"
    agent.provider = "custom"
    agent.base_url = "http://host.docker.internal:11434/v1"
    agent._ollama_num_ctx = 4096

    with patch.object(agent, "_persist_session"):
        with patch.object(agent, "_save_trajectory"):
            with patch.object(agent, "_cleanup_task_resources"):
                with caplog.at_level(logging.WARNING, logger="agent.conversation_loop"):
                    result = agent.run_conversation("Call ps -aux")

    # Turn outcome flags.
    assert result["failed"] is True
    assert result["completed"] is False
    assert result["api_calls"] == 0
    assert result["turn_exit_reason"] == "ollama_runtime_context_too_small"

    # User-facing explanation.
    assert "Ollama loaded `qwen3.5:9b` with only 4,096 tokens" in result["final_response"]
    assert "model.ollama_num_ctx: 65536" in result["final_response"]

    # No request reached the client; the condition was logged instead.
    assert not agent.client.chat.completions.create.called
    assert "Ollama runtime context too small for Hermes tool use" in caplog.text
    assert "runtime_context=4096" in caplog.text
|
||||
|
||||
def test_tool_calls_then_stop(self, agent):
|
||||
self._setup_agent(agent)
|
||||
tc = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
|
||||
|
||||
@@ -0,0 +1,491 @@
|
||||
"""Hermetic tests for the Bitwarden Secrets Manager integration.
|
||||
|
||||
We never hit GitHub or Bitwarden in tests — subprocess + urllib are
|
||||
mocked so the suite stays fast and offline-safe. The "live" pull and
|
||||
binary download are exercised manually by `hermes secrets bitwarden
|
||||
setup` outside of pytest.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import stat
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Make the worktree importable without depending on the installed wheel.
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
if str(ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from agent.secret_sources import bitwarden as bw # noqa: E402
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _reset_caches():
    """Reset the Bitwarden module cache before and after every test."""
    bw._reset_cache_for_tests()  # setup: start from a clean cache
    yield
    bw._reset_cache_for_tests()  # teardown: drop whatever the test cached
|
||||
|
||||
|
||||
@pytest.fixture
def hermes_home(tmp_path, monkeypatch):
    """Create ``<tmp_path>/.hermes``, export it as ``HERMES_HOME``, return it."""
    isolated_home = tmp_path / ".hermes"
    isolated_home.mkdir()
    monkeypatch.setenv("HERMES_HOME", str(isolated_home))

    # Some modules cache get_hermes_home; clear if needed.
    import hermes_constants

    cache_attr = "_HERMES_HOME_CACHE"
    if hasattr(hermes_constants, cache_attr):
        setattr(hermes_constants, cache_attr, None)
    return isolated_home
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _platform_asset_name
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "system,machine,libc_text,expected",
    [
        ("Darwin", "x86_64", "", f"bws-macos-universal-{bw._BWS_VERSION}.zip"),
        ("Darwin", "arm64", "", f"bws-macos-universal-{bw._BWS_VERSION}.zip"),
        ("Linux", "x86_64", "glibc", f"bws-x86_64-unknown-linux-gnu-{bw._BWS_VERSION}.zip"),
        ("Linux", "x86_64", "musl libc", f"bws-x86_64-unknown-linux-musl-{bw._BWS_VERSION}.zip"),
        ("Linux", "aarch64", "", f"bws-aarch64-unknown-linux-gnu-{bw._BWS_VERSION}.zip"),
        ("Windows", "AMD64", "", f"bws-x86_64-pc-windows-msvc-{bw._BWS_VERSION}.zip"),
        ("Windows", "ARM64", "", f"bws-aarch64-pc-windows-msvc-{bw._BWS_VERSION}.zip"),
    ],
)
def test_platform_asset_name(system, machine, libc_text, expected):
    """``_platform_asset_name`` maps (OS, arch, libc probe output) to an asset.

    ``platform.system``/``platform.machine`` and the ``subprocess.run``
    call reachable through ``bw`` are all stubbed, so the host running the
    suite does not influence the result.
    """
    libc_probe = mock.Mock(stdout=libc_text, stderr=libc_text)
    with mock.patch.object(bw.platform, "system", return_value=system):
        with mock.patch.object(bw.platform, "machine", return_value=machine):
            with mock.patch.object(bw.subprocess, "run", return_value=libc_probe):
                assert bw._platform_asset_name() == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# install_bws — fully mocked HTTP
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_fake_zip(binary_bytes: bytes) -> bytes:
|
||||
buf = io.BytesIO()
|
||||
with zipfile.ZipFile(buf, "w") as zf:
|
||||
zf.writestr("bws", binary_bytes)
|
||||
return buf.getvalue()
|
||||
|
||||
|
||||
def test_install_bws_happy_path(hermes_home, monkeypatch):
    """``install_bws`` installs the binary from a zip whose checksum matches.

    ``_http_download`` is replaced by a stub that serves an in-memory zip
    for ``.zip`` URLs and a checksum listing for ``.txt`` URLs.
    """
    fake_binary = b"#!/bin/sh\necho 'bws fake 2.0.0'\n"
    zip_bytes = _make_fake_zip(fake_binary)
    asset_name = bw._platform_asset_name()
    digest = hashlib.sha256(zip_bytes).hexdigest()
    checksum_text = (
        f"{digest} {asset_name}\n"
        "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff other-file\n"
    )

    def fake_download(url, dest):
        target = Path(dest)
        if url.endswith(".zip"):
            target.write_bytes(zip_bytes)
            return
        if url.endswith(".txt"):
            target.write_text(checksum_text)
            return
        raise AssertionError(f"unexpected download url: {url}")

    monkeypatch.setattr(bw, "_http_download", fake_download)

    installed = bw.install_bws()

    assert installed.exists()
    assert installed.read_bytes() == fake_binary
    # Executable bit set
    assert installed.stat().st_mode & stat.S_IXUSR
|
||||
|
||||
|
||||
def test_install_bws_checksum_mismatch(hermes_home, monkeypatch):
    """A checksum entry that does not match the zip raises ``RuntimeError``."""
    zip_bytes = _make_fake_zip(b"contents")
    asset_name = bw._platform_asset_name()
    wrong_checksum = "0" * 64
    checksum_text = f"{wrong_checksum} {asset_name}\n"

    def fake_download(url, dest):
        target = Path(dest)
        if not url.endswith(".zip"):
            target.write_text(checksum_text)
        else:
            target.write_bytes(zip_bytes)

    monkeypatch.setattr(bw, "_http_download", fake_download)

    with pytest.raises(RuntimeError, match="Checksum mismatch"):
        bw.install_bws()
|
||||
|
||||
|
||||
def test_install_bws_missing_checksum_entry(hermes_home, monkeypatch):
    """A checksum listing without a line for our asset raises ``RuntimeError``."""
    zip_bytes = _make_fake_zip(b"x")

    def fake_download(url, dest):
        target = Path(dest)
        if not url.endswith(".zip"):
            # Listing mentions only an unrelated file.
            target.write_text("ffffffff some-other-file.zip\n")
        else:
            target.write_bytes(zip_bytes)

    monkeypatch.setattr(bw, "_http_download", fake_download)

    with pytest.raises(RuntimeError, match="No checksum entry"):
        bw.install_bws()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# fetch_bitwarden_secrets
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _fake_bws_payload(items):
|
||||
return json.dumps(items)
|
||||
|
||||
|
||||
def test_fetch_happy_path(monkeypatch, tmp_path):
    """Valid JSON from ``bws`` becomes a ``{key: value}`` dict with no warnings.

    The ``subprocess.run`` stub also checks the command line (binary path,
    ``secret`` and ``list`` arguments) and that the access token is passed
    through the child environment.
    """
    fake_binary = tmp_path / "bws"
    fake_binary.write_text("")
    expected_secrets = {
        "OPENAI_API_KEY": "sk-abc",
        "ANTHROPIC_API_KEY": "sk-ant-xyz",
    }
    payload = _fake_bws_payload(
        [{"key": name, "value": value} for name, value in expected_secrets.items()]
    )

    def fake_run(cmd, **kwargs):
        assert cmd[0] == str(fake_binary)
        assert "secret" in cmd
        assert "list" in cmd
        assert kwargs["env"]["BWS_ACCESS_TOKEN"] == "0.fake.token"
        return mock.Mock(returncode=0, stdout=payload, stderr="")

    monkeypatch.setattr(bw.subprocess, "run", fake_run)

    secrets, warnings = bw.fetch_bitwarden_secrets(
        access_token="0.fake.token",
        project_id="proj-uuid",
        binary=fake_binary,
        use_cache=False,
    )

    assert secrets == expected_secrets
    assert warnings == []
|
||||
|
||||
|
||||
def test_fetch_skips_invalid_env_names(monkeypatch, tmp_path):
    """Keys that are not valid env-var names are dropped, one warning each."""
    fake_binary = tmp_path / "bws"
    fake_binary.write_text("")
    entries = [
        ("VALID_KEY", "v1"),
        ("1BAD_START", "v2"),   # starts with a digit
        ("has spaces", "v3"),   # contains whitespace
        ("DASH-KEY", "v4"),     # contains a dash
    ]
    payload = _fake_bws_payload([{"key": k, "value": v} for k, v in entries])

    def fake_run(*args, **kwargs):
        return mock.Mock(returncode=0, stdout=payload, stderr="")

    monkeypatch.setattr(bw.subprocess, "run", fake_run)

    secrets, warnings = bw.fetch_bitwarden_secrets(
        access_token="0.t",
        project_id="p",
        binary=fake_binary,
        use_cache=False,
    )

    assert secrets == {"VALID_KEY": "v1"}
    assert len(warnings) == 3
|
||||
|
||||
|
||||
def test_fetch_auth_failure(monkeypatch, tmp_path):
    """A non-zero ``bws`` exit raises ``RuntimeError`` carrying its stderr text."""
    fake_binary = tmp_path / "bws"
    fake_binary.write_text("")

    def fake_run(*args, **kwargs):
        return mock.Mock(returncode=1, stdout="", stderr="Error: invalid access token")

    monkeypatch.setattr(bw.subprocess, "run", fake_run)

    fetch_kwargs = {
        "access_token": "0.bad",
        "project_id": "p",
        "binary": fake_binary,
        "use_cache": False,
    }
    with pytest.raises(RuntimeError, match="invalid access token"):
        bw.fetch_bitwarden_secrets(**fetch_kwargs)
|
||||
|
||||
|
||||
def test_fetch_timeout(monkeypatch, tmp_path):
    """``subprocess.TimeoutExpired`` surfaces as a "timed out" ``RuntimeError``."""
    fake_binary = tmp_path / "bws"
    fake_binary.write_text("")

    def fake_run(*args, **kwargs):
        raise subprocess.TimeoutExpired(cmd="bws", timeout=30)

    monkeypatch.setattr(bw.subprocess, "run", fake_run)

    fetch_kwargs = {
        "access_token": "0.t",
        "project_id": "p",
        "binary": fake_binary,
        "use_cache": False,
    }
    with pytest.raises(RuntimeError, match="timed out"):
        bw.fetch_bitwarden_secrets(**fetch_kwargs)
|
||||
|
||||
|
||||
def test_fetch_non_json(monkeypatch, tmp_path):
    """Exit code 0 with unparseable stdout raises a "non-JSON" ``RuntimeError``."""
    fake_binary = tmp_path / "bws"
    fake_binary.write_text("")

    def fake_run(*args, **kwargs):
        return mock.Mock(returncode=0, stdout="not json at all", stderr="")

    monkeypatch.setattr(bw.subprocess, "run", fake_run)

    fetch_kwargs = {
        "access_token": "0.t",
        "project_id": "p",
        "binary": fake_binary,
        "use_cache": False,
    }
    with pytest.raises(RuntimeError, match="non-JSON"):
        bw.fetch_bitwarden_secrets(**fetch_kwargs)
|
||||
|
||||
|
||||
def test_fetch_cache_hits(monkeypatch, tmp_path):
    """Two identical fetches with a 60 s TTL invoke ``bws`` only once."""
    fake_binary = tmp_path / "bws"
    fake_binary.write_text("")
    payload = _fake_bws_payload([{"key": "K", "value": "v"}])

    invocations = []

    def fake_run(*args, **kwargs):
        invocations.append(args)
        return mock.Mock(returncode=0, stdout=payload, stderr="")

    monkeypatch.setattr(bw.subprocess, "run", fake_run)

    for _ in range(2):
        bw.fetch_bitwarden_secrets(
            access_token="0.t",
            project_id="p",
            binary=fake_binary,
            cache_ttl_seconds=60,
        )

    assert len(invocations) == 1  # cached on second call
|
||||
|
||||
|
||||
def test_fetch_cache_disabled(monkeypatch, tmp_path):
    """With ``use_cache=False`` every fetch invokes ``bws`` again."""
    fake_binary = tmp_path / "bws"
    fake_binary.write_text("")
    payload = _fake_bws_payload([])

    invocations = []

    def fake_run(*args, **kwargs):
        invocations.append(args)
        return mock.Mock(returncode=0, stdout=payload, stderr="")

    monkeypatch.setattr(bw.subprocess, "run", fake_run)

    for _ in range(2):
        bw.fetch_bitwarden_secrets(
            access_token="0.t",
            project_id="p",
            binary=fake_binary,
            use_cache=False,
        )

    assert len(invocations) == 2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# apply_bitwarden_secrets — the public entry point used by env_loader
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_apply_disabled_returns_empty():
    """``enabled=False`` yields an OK result with nothing applied and no error."""
    outcome = bw.apply_bitwarden_secrets(enabled=False, project_id="p")

    assert outcome.ok
    assert not outcome.applied
    assert not outcome.error
|
||||
|
||||
|
||||
def test_apply_missing_token(monkeypatch):
    """Without ``BWS_ACCESS_TOKEN`` in the environment the result names it."""
    monkeypatch.delenv("BWS_ACCESS_TOKEN", raising=False)

    outcome = bw.apply_bitwarden_secrets(
        enabled=True,
        project_id="p",
        auto_install=False,
    )

    assert not outcome.ok
    assert "BWS_ACCESS_TOKEN" in outcome.error
|
||||
|
||||
|
||||
def test_apply_missing_project_id(monkeypatch):
    """An empty ``project_id`` is reported as an error mentioning ``project_id``."""
    monkeypatch.setenv("BWS_ACCESS_TOKEN", "0.t")

    outcome = bw.apply_bitwarden_secrets(
        enabled=True,
        project_id="",
        auto_install=False,
    )

    assert not outcome.ok
    assert "project_id" in outcome.error
|
||||
|
||||
|
||||
def test_apply_does_not_override_existing(monkeypatch, tmp_path):
    """``override_existing=False`` keeps pre-set variables and adds new ones.

    ``OPENAI_API_KEY`` is already in the environment and must be reported
    as skipped; ``NEW_KEY`` is absent and must be applied.
    """
    monkeypatch.setenv("BWS_ACCESS_TOKEN", "0.t")
    monkeypatch.setenv("OPENAI_API_KEY", "existing-value")
    # Register NEW_KEY with monkeypatch and make sure it starts out absent.
    # The code under test writes it to the real os.environ; without this it
    # would leak into later tests. setenv-then-delenv is needed because
    # delenv(raising=False) on an absent key records nothing to undo.
    monkeypatch.setenv("NEW_KEY", "placeholder")
    monkeypatch.delenv("NEW_KEY")

    fake_binary = tmp_path / "bws"
    fake_binary.write_text("")
    payload = _fake_bws_payload([
        {"key": "OPENAI_API_KEY", "value": "bsm-value"},
        {"key": "NEW_KEY", "value": "new-value"},
    ])
    monkeypatch.setattr(
        bw.subprocess, "run",
        lambda *a, **kw: mock.Mock(returncode=0, stdout=payload, stderr=""),
    )
    monkeypatch.setattr(bw, "find_bws", lambda **kw: fake_binary)

    result = bw.apply_bitwarden_secrets(
        enabled=True, project_id="p",
        override_existing=False, auto_install=False,
    )

    assert result.ok
    assert "NEW_KEY" in result.applied
    assert "OPENAI_API_KEY" in result.skipped
    assert os.environ["OPENAI_API_KEY"] == "existing-value"
    assert os.environ["NEW_KEY"] == "new-value"
|
||||
|
||||
|
||||
def test_apply_override_existing(monkeypatch, tmp_path):
    """``override_existing=True`` replaces a value already in the environment."""
    monkeypatch.setenv("BWS_ACCESS_TOKEN", "0.t")
    monkeypatch.setenv("OPENAI_API_KEY", "stale")
    fake_binary = tmp_path / "bws"
    fake_binary.write_text("")
    payload = _fake_bws_payload([{"key": "OPENAI_API_KEY", "value": "fresh"}])

    def fake_run(*args, **kwargs):
        return mock.Mock(returncode=0, stdout=payload, stderr="")

    def fake_find_bws(**kwargs):
        return fake_binary

    monkeypatch.setattr(bw.subprocess, "run", fake_run)
    monkeypatch.setattr(bw, "find_bws", fake_find_bws)

    outcome = bw.apply_bitwarden_secrets(
        enabled=True,
        project_id="p",
        override_existing=True,
        auto_install=False,
    )

    assert outcome.ok
    assert os.environ["OPENAI_API_KEY"] == "fresh"
|
||||
|
||||
|
||||
def test_apply_never_overrides_bootstrap_token(monkeypatch, tmp_path):
    """The access-token variable survives even with ``override_existing=True``.

    A fetched secret named ``BWS_ACCESS_TOKEN`` must be listed as skipped
    and must not replace the token already in the environment.
    """
    monkeypatch.setenv("BWS_ACCESS_TOKEN", "0.original")
    fake_binary = tmp_path / "bws"
    fake_binary.write_text("")
    hostile_entry = {"key": "BWS_ACCESS_TOKEN", "value": "0.malicious-replacement"}
    payload = _fake_bws_payload([hostile_entry])

    def fake_run(*args, **kwargs):
        return mock.Mock(returncode=0, stdout=payload, stderr="")

    def fake_find_bws(**kwargs):
        return fake_binary

    monkeypatch.setattr(bw.subprocess, "run", fake_run)
    monkeypatch.setattr(bw, "find_bws", fake_find_bws)

    outcome = bw.apply_bitwarden_secrets(
        enabled=True,
        project_id="p",
        override_existing=True,
        auto_install=False,
    )

    assert os.environ["BWS_ACCESS_TOKEN"] == "0.original"
    assert "BWS_ACCESS_TOKEN" in outcome.skipped
|
||||
|
||||
|
||||
def test_apply_swallows_fetch_errors(monkeypatch, tmp_path):
    """A failing ``bws`` call is returned as ``result.error``, not raised."""
    monkeypatch.setenv("BWS_ACCESS_TOKEN", "0.t")
    fake_binary = tmp_path / "bws"
    fake_binary.write_text("")

    def fake_run(*args, **kwargs):
        return mock.Mock(returncode=1, stdout="", stderr="bad token")

    def fake_find_bws(**kwargs):
        return fake_binary

    monkeypatch.setattr(bw.subprocess, "run", fake_run)
    monkeypatch.setattr(bw, "find_bws", fake_find_bws)

    outcome = bw.apply_bitwarden_secrets(
        enabled=True,
        project_id="p",
        auto_install=False,
    )

    assert not outcome.ok
    assert "bad token" in outcome.error
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# env_loader integration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_env_loader_skips_when_disabled(tmp_path, monkeypatch):
    """With no ``config.yaml`` in the Hermes home the loader hook returns None."""
    hermes_dir = tmp_path / ".hermes"
    hermes_dir.mkdir()
    monkeypatch.setenv("HERMES_HOME", str(hermes_dir))
    monkeypatch.setattr(Path, "home", lambda: tmp_path)

    from hermes_cli.env_loader import _apply_external_secret_sources

    # Should be a no-op (returns None).
    outcome = _apply_external_secret_sources(hermes_dir)
    assert outcome is None
|
||||
|
||||
|
||||
def test_env_loader_calls_bsm_when_enabled(tmp_path, monkeypatch):
    """An enabled ``secrets.bitwarden`` section makes the loader call BSM once.

    ``apply_bitwarden_secrets`` is replaced by a stub that checks the
    ``enabled`` / ``project_id`` keyword arguments it receives and exports
    ``MY_BSM_KEY``; the test then verifies the call count and the variable.
    """
    home = tmp_path / ".hermes"
    home.mkdir()
    # YAML nesting is indentation-sensitive: the settings must sit under
    # ``bitwarden`` (4 spaces), which itself sits under ``secrets`` (2 spaces).
    # With a flat indent they would become siblings of a null ``bitwarden``.
    (home / "config.yaml").write_text(
        "secrets:\n"
        "  bitwarden:\n"
        "    enabled: true\n"
        "    project_id: 'proj-1'\n"
        "    access_token_env: 'BWS_ACCESS_TOKEN'\n"
        "    cache_ttl_seconds: 0\n"
        "    override_existing: false\n"
        "    auto_install: false\n"
    )
    monkeypatch.setenv("HERMES_HOME", str(home))
    monkeypatch.setenv("BWS_ACCESS_TOKEN", "0.t")
    monkeypatch.delenv("MY_BSM_KEY", raising=False)

    called = {"n": 0}

    def fake_apply(**kwargs):
        called["n"] += 1
        assert kwargs["enabled"] is True
        assert kwargs["project_id"] == "proj-1"
        # Export through monkeypatch so the variable is removed at teardown;
        # a direct os.environ write would leak into later tests.
        monkeypatch.setenv("MY_BSM_KEY", "from-bsm")
        return bw.FetchResult(
            secrets={"MY_BSM_KEY": "from-bsm"},
            applied=["MY_BSM_KEY"],
        )

    monkeypatch.setattr(
        "agent.secret_sources.bitwarden.apply_bitwarden_secrets",
        fake_apply,
    )

    from hermes_cli.env_loader import _apply_external_secret_sources

    _apply_external_secret_sources(home)

    assert called["n"] == 1
    assert os.environ.get("MY_BSM_KEY") == "from-bsm"
|
||||
@@ -0,0 +1,119 @@
|
||||
"""Tests for the secret-source tracking in ``hermes_cli.env_loader``.
|
||||
|
||||
These cover the small public surface that lets `hermes model` / `hermes setup`
|
||||
label detected credentials with their origin ("from Bitwarden") so users
|
||||
don't see an unexplained "credentials ✓" line when their .env is empty.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
if str(ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from hermes_cli import env_loader # noqa: E402
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _reset_sources():
    """Empty ``env_loader._SECRET_SOURCES`` before and after every test."""
    env_loader._SECRET_SOURCES.clear()  # setup: start from an empty map
    yield
    env_loader._SECRET_SOURCES.clear()  # teardown: drop entries the test added
|
||||
|
||||
|
||||
def test_get_secret_source_returns_none_for_untracked_var():
    """A variable with no entry in the source map has no source."""
    source = env_loader.get_secret_source("ANTHROPIC_API_KEY")
    assert source is None
|
||||
|
||||
|
||||
def test_get_secret_source_returns_label_for_tracked_var():
    """A variable recorded in the source map reports the stored label."""
    env_loader._SECRET_SOURCES["ANTHROPIC_API_KEY"] = "bitwarden"

    source = env_loader.get_secret_source("ANTHROPIC_API_KEY")

    assert source == "bitwarden"
|
||||
|
||||
|
||||
def test_format_secret_source_suffix_empty_for_untracked():
    """Untracked variables get an empty suffix.

    Credentials from .env or the shell shouldn't add noise, so the
    implicit case stays unlabeled.
    """
    suffix = env_loader.format_secret_source_suffix("ANTHROPIC_API_KEY")
    assert suffix == ""
|
||||
|
||||
|
||||
def test_format_secret_source_suffix_bitwarden_uses_proper_name():
    """The ``"bitwarden"`` label is rendered with a capital B."""
    env_loader._SECRET_SOURCES["ANTHROPIC_API_KEY"] = "bitwarden"

    suffix = env_loader.format_secret_source_suffix("ANTHROPIC_API_KEY")

    assert suffix == " (from Bitwarden)"
|
||||
|
||||
|
||||
def test_format_secret_source_suffix_generic_label_for_future_sources():
    """A label other than ``"bitwarden"`` is echoed verbatim in the suffix.

    Future-proofing: a new secret source (e.g. "vault") should still
    produce a sensible label without needing to edit every call site.
    """
    env_loader._SECRET_SOURCES["OPENAI_API_KEY"] = "vault"

    suffix = env_loader.format_secret_source_suffix("OPENAI_API_KEY")

    assert suffix == " (from vault)"
|
||||
|
||||
|
||||
def test_apply_external_secret_sources_records_bitwarden_origin(tmp_path, monkeypatch):
    """Keys applied by Bitwarden are recorded in ``_SECRET_SOURCES``.

    ``apply_bitwarden_secrets`` is stubbed to return a ``FetchResult`` with
    one applied key; afterwards that key must report ``"bitwarden"`` as its
    source and format as " (from Bitwarden)".
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    config_path = tmp_path / "config.yaml"
    # YAML nesting is indentation-sensitive: the settings must sit under
    # ``bitwarden``, which itself sits under ``secrets``. With a flat indent
    # they would become siblings of a null ``bitwarden`` entry.
    config_path.write_text(
        "secrets:\n"
        "  bitwarden:\n"
        "    enabled: true\n"
        "    project_id: test-project\n"
        "    access_token_env: BWS_ACCESS_TOKEN\n",
        encoding="utf-8",
    )

    # Stub apply_bitwarden_secrets to return a synthetic FetchResult.
    from agent.secret_sources.bitwarden import FetchResult

    fake_result = FetchResult(
        secrets={"ANTHROPIC_API_KEY": "sk-ant-test"},
        applied=["ANTHROPIC_API_KEY"],
    )

    def _fake_apply(**_kwargs):
        return fake_result

    # The import inside _apply_external_secret_sources is lazy, so we
    # patch the *module attribute* it will pull in.
    import agent.secret_sources.bitwarden as bw_module

    monkeypatch.setattr(bw_module, "apply_bitwarden_secrets", _fake_apply)

    env_loader._apply_external_secret_sources(tmp_path)

    assert env_loader.get_secret_source("ANTHROPIC_API_KEY") == "bitwarden"
    assert (
        env_loader.format_secret_source_suffix("ANTHROPIC_API_KEY")
        == " (from Bitwarden)"
    )
|
||||
|
||||
|
||||
def test_apply_external_secret_sources_noop_when_disabled(tmp_path, monkeypatch):
    """Disabled Bitwarden config must not touch the source map."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    config_path = tmp_path / "config.yaml"
    # ``enabled`` has to be nested under ``bitwarden`` (deeper indent);
    # otherwise the flag under test belongs to ``secrets`` and the test
    # would pass without exercising the disabled-Bitwarden case.
    config_path.write_text(
        "secrets:\n"
        "  bitwarden:\n"
        "    enabled: false\n",
        encoding="utf-8",
    )

    env_loader._apply_external_secret_sources(tmp_path)

    assert env_loader.get_secret_source("ANTHROPIC_API_KEY") is None
|
||||
@@ -0,0 +1,187 @@
|
||||
"""Verify scripts/run_tests_parallel.py kills test-spawned grandchildren.
|
||||
|
||||
Setup
|
||||
-----
|
||||
A test in this file spawns a long-lived Python grandchild that writes
|
||||
its PID + a nonce to a tempfile, then exits without cleaning up.
|
||||
With the old ``subprocess.run`` runner, that grandchild would orphan
|
||||
and outlive the test (and the whole runner). With the current Popen +
|
||||
``start_new_session`` + ``_kill_tree`` runner, the grandchild gets
|
||||
SIGKILL'd via process-group kill when its file's pytest exits.
|
||||
|
||||
The leaker test always passes — its only job is to spawn a grandchild
|
||||
and walk away. The verifier runs the runner over the leaker file in a
|
||||
subprocess, then waits for the grandchild PID to disappear from the
|
||||
kernel's process table.
|
||||
|
||||
POSIX-only: Windows has its own grandchild lifecycle (no shared session,
|
||||
``taskkill /F /T`` semantics). Marked accordingly.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import textwrap
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Both tests share the same handoff file: the leaker writes here, the
|
||||
# verifier reads here. We park it in $TMPDIR with a unique-per-run name
|
||||
# so concurrent invocations of the suite don't clobber each other.
|
||||
_HANDOFF_DIR = Path(os.environ.get("TMPDIR", "/tmp")) / "hermes-isolation-probe"
|
||||
_HANDOFF_DIR.mkdir(exist_ok=True)
|
||||
|
||||
|
||||
def _handoff_path_for(nonce: str) -> Path:
    """Return the handoff JSON path inside ``_HANDOFF_DIR`` for ``nonce``."""
    filename = f"grandchild-{nonce}.json"
    return _HANDOFF_DIR / filename
|
||||
|
||||
|
||||
def _pid_alive(pid: int) -> bool:
|
||||
"""POSIX: send signal 0 to probe whether ``pid`` is still alive.
|
||||
|
||||
``os.kill(pid, 0)`` raises ``ProcessLookupError`` if the process is
|
||||
gone, ``PermissionError`` if it exists but we can't signal it
|
||||
(someone else's pid). We treat PermissionError as "alive" because
|
||||
the process exists and that's all we need to know.
|
||||
"""
|
||||
if sys.platform == "win32": # pragma: no cover — POSIX-only test
|
||||
# On Windows we'd use OpenProcess + GetExitCodeProcess; this
|
||||
# test is skipped on Windows so the path is unreachable.
|
||||
raise RuntimeError("_pid_alive POSIX-only")
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
except PermissionError:
|
||||
return True
|
||||
return True
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="POSIX-only probe")
@pytest.mark.live_system_guard_bypass
def test_grandchild_leak_is_killed_by_runner(tmp_path: Path) -> None:
    """Run the parallel runner over a probe file and verify cleanup.

    1. Materialize a probe file that spawns a long-lived grandchild and
       writes its PID to disk before exiting.
    2. Invoke ``scripts/run_tests_parallel.py`` against the probe file.
    3. Wait for the grandchild PID to vanish (poll for ~5s); if it is
       still there, kill it ourselves so nothing outlives this test.
    4. Assert the runner exited cleanly AND the grandchild was dead.
    """
    # Local import: only this test needs the symbolic signal name.
    import signal

    repo_root = Path(__file__).resolve().parent.parent
    runner = repo_root / "scripts" / "run_tests_parallel.py"
    assert runner.exists(), f"runner missing at {runner}"

    # Probe lives in a temp dir, NOT under tests/, so the regular suite
    # never picks it up — only our explicit invocation does.
    probe_dir = tmp_path / "probe"
    probe_dir.mkdir()
    probe = probe_dir / "test_probe_leaker.py"
    nonce = f"{os.getpid()}-{int(time.time() * 1000)}"
    handoff = _handoff_path_for(nonce)
    if handoff.exists():
        handoff.unlink()

    probe_src = textwrap.dedent(f"""
        import json, os, subprocess, sys, time
        from pathlib import Path

        HANDOFF = Path({str(handoff)!r})

        def test_spawns_grandchild_and_walks_away():
            # Long-lived grandchild: detached, ignores SIGTERM (we want
            # SIGKILL or process-group kill to be the only thing that
            # works, simulating a misbehaving server).
            child = subprocess.Popen(
                [
                    sys.executable, "-c",
                    "import os, signal, sys, time; "
                    "signal.signal(signal.SIGTERM, signal.SIG_IGN); "
                    "sys.stdout.write(f'gc-pgid={{os.getpgid(0)}} gc-pid={{os.getpid()}}\\\\n'); "
                    "sys.stdout.flush(); "
                    "time.sleep(600)",
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                # IMPORTANT: do NOT pass start_new_session here. We want
                # the grandchild to inherit the pytest subprocess's
                # process group, so when the runner kills the group the
                # grandchild dies too.
            )
            # Read the first line so we can record gc's pgid in the
            # handoff, then walk away — don't close the pipe (would
            # signal EOF and let the child see SIGPIPE on next write).
            first_line = child.stdout.readline().decode().strip()
            HANDOFF.write_text(json.dumps({{
                "pid": child.pid,
                "diag": first_line,
                "test_pid": os.getpid(),
                "test_pgid": os.getpgid(0),
            }}))
            assert child.pid > 0
    """).strip()
    probe.write_text(probe_src + "\n")

    # Run the parallel runner against just the probe file. The runner
    # discovers under ``tests/`` by default, so we override via --paths.
    proc = subprocess.run(
        [
            sys.executable,
            str(runner),
            "--paths",
            str(probe_dir),
            "-j",
            "1",
            # Tight per-file timeout: the probe finishes in <1s, no
            # need for 10min.
            "--file-timeout",
            "30",
        ],
        cwd=repo_root,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        timeout=60,
    )

    assert handoff.exists(), (
        f"probe never wrote handoff file; runner output:\n{proc.stdout}"
    )
    try:
        handoff_data = json.loads(handoff.read_text())
    finally:
        # Remove the handoff even when it is unparsable, so a broken run
        # doesn't leave stale files behind in the shared directory.
        handoff.unlink()
    grandchild_pid = handoff_data["pid"]
    diag = handoff_data.get("diag", "(no diag)")
    test_pid = handoff_data.get("test_pid")
    test_pgid = handoff_data.get("test_pgid")

    # The grandchild must be gone. Poll for a bit because process-group
    # SIGKILL + reaping isn't synchronous; on a loaded box it can take
    # a beat. This runs BEFORE any assertion on the runner's exit status
    # so that a failing runner can't skip the cleanup below.
    survived = True
    deadline = time.monotonic() + 5.0
    while time.monotonic() < deadline:
        if not _pid_alive(grandchild_pid):
            survived = False
            break
        time.sleep(0.05)

    if survived:
        # Test cleanup: kill the leaked grandchild ourselves so a
        # FAILED assertion doesn't leave a sleep(600) running.
        try:
            os.kill(grandchild_pid, signal.SIGKILL)
        except (ProcessLookupError, PermissionError):
            # Already gone, or the pid now belongs to someone else;
            # either way there is nothing left for us to clean up.
            pass

    # The runner must have exited cleanly (probe test passes).
    assert proc.returncode == 0, (
        f"runner exited {proc.returncode}; output:\n{proc.stdout}"
    )

    if survived:
        pytest.fail(
            f"grandchild PID {grandchild_pid} survived runner exit; "
            f"diag={diag!r} test_pid={test_pid} test_pgid={test_pgid}; "
            f"runner output:\n{proc.stdout}"
        )
|
||||
@@ -59,6 +59,59 @@ def test_write_json_returns_false_on_broken_pipe(monkeypatch):
|
||||
assert server.write_json({"ok": True}) is False
|
||||
|
||||
|
||||
def test_tui_verbose_tool_details_fail_closed_when_redaction_fails(monkeypatch):
    """The verbose-text helpers return ``""`` when the redactor raises."""

    def _always_raises(*_args, **_kwargs):
        raise RuntimeError("redaction unavailable")

    # Swap in a stand-in ``agent.redact`` module whose redactor always fails.
    broken_redact = types.ModuleType("agent.redact")
    broken_redact.redact_sensitive_text = _always_raises
    monkeypatch.setitem(sys.modules, "agent.redact", broken_redact)

    assert server._redact_tui_verbose_text("api_key=secret") == ""
    assert server._tool_args_text({"api_key": "secret"}) == ""
    assert server._tool_result_text("token=secret") == ""
|
||||
|
||||
|
||||
def test_tui_verbose_tool_details_are_capped_before_emit(monkeypatch):
    """With tiny char/line limits only the tail of the text is kept."""
    limits = (
        ("_TUI_VERBOSE_TEXT_MAX_CHARS", 12),
        ("_TUI_VERBOSE_TEXT_MAX_LINES", 2),
    )
    for attr_name, limit in limits:
        monkeypatch.setattr(server, attr_name, limit)

    result = server._cap_tui_verbose_text("one\ntwo\nthree\nfour")

    # A marker prefix announces the omission; the last two lines survive.
    assert result.startswith("[showing verbose tail; omitted ")
    assert result.endswith("three\nfour")
    assert "one" not in result
|
||||
|
||||
|
||||
def test_tui_verbose_tool_events_omit_details_when_redaction_fails(monkeypatch):
    """Tool start/complete events carry no detail text when redaction raises."""

    def _always_raises(*_args, **_kwargs):
        raise RuntimeError("redaction unavailable")

    broken_redact = types.ModuleType("agent.redact")
    broken_redact.redact_sensitive_text = _always_raises
    monkeypatch.setitem(sys.modules, "agent.redact", broken_redact)

    # Capture everything the server would emit instead of sending it.
    events: list[tuple[str, str, dict]] = []

    def _record(event_type, sid, payload):
        events.append((event_type, sid, payload))

    monkeypatch.setattr(server, "_emit", _record)
    session_state = {"tool_progress_mode": "verbose", "tool_started_at": {}}
    monkeypatch.setitem(server._sessions, "redaction-test", session_state)

    tool_args = {"command": "pwd"}
    server._on_tool_start("redaction-test", "tool-1", "terminal", tool_args)
    server._on_tool_complete("redaction-test", "tool-1", "terminal", {"command": "pwd"}, "done")

    start_event, complete_event = events[0], events[1]
    assert start_event[0] == "tool.start"
    assert complete_event[0] == "tool.complete"
    assert "args_text" not in start_event[2]
    assert "result_text" not in complete_event[2]
|
||||
|
||||
|
||||
def test_dispatch_rejects_non_object_request():
|
||||
resp = server.dispatch([])
|
||||
|
||||
@@ -1476,8 +1529,10 @@ def test_config_mouse_uses_documented_key_with_legacy_fallback(monkeypatch):
|
||||
set_toggle = server.handle_request(
|
||||
{"id": "2", "method": "config.set", "params": {"key": "mouse"}}
|
||||
)
|
||||
assert set_toggle["result"] == {"key": "mouse", "value": "on"}
|
||||
assert writes == [("display.mouse_tracking", True)]
|
||||
# /mouse (no arg) toggles between 'all' and 'off'. Starting from
|
||||
# tui_mouse: False (→ 'off'), the toggle flips to 'all'.
|
||||
assert set_toggle["result"] == {"key": "mouse", "value": "all"}
|
||||
assert writes == [("display.mouse_tracking", "all")]
|
||||
|
||||
cfg["display"] = {"mouse_tracking": 0, "tui_mouse": True}
|
||||
get_canonical = server.handle_request(
|
||||
@@ -1489,7 +1544,51 @@ def test_config_mouse_uses_documented_key_with_legacy_fallback(monkeypatch):
|
||||
get_null = server.handle_request(
|
||||
{"id": "4", "method": "config.get", "params": {"key": "mouse"}}
|
||||
)
|
||||
assert get_null["result"]["value"] == "on"
|
||||
# mouse_tracking present-but-None defers neither to tui_mouse nor to
|
||||
# the legacy off bucket: it falls through to the 'all' default.
|
||||
assert get_null["result"]["value"] == "all"
|
||||
|
||||
|
||||
def test_config_mouse_accepts_preset_strings_and_aliases(monkeypatch):
    """``config.set`` for ``mouse`` takes presets and aliases, rejects unknowns."""
    cfg = {"display": {"mouse_tracking": "all"}}
    writes = []

    def _record_write(path, value):
        writes.append((path, value))

    monkeypatch.setattr(server, "_load_cfg", lambda: cfg)
    monkeypatch.setattr(server, "_write_config_key", _record_write)

    def _set_mouse(request_id, value):
        """Issue one ``config.set`` request for the ``mouse`` key."""
        request = {
            "id": request_id,
            "method": "config.set",
            "params": {"key": "mouse", "value": value},
        }
        return server.handle_request(request)

    # Direct preset.
    set_wheel = _set_mouse("1", "wheel")
    assert set_wheel["result"] == {"key": "mouse", "value": "wheel"}
    assert writes[-1] == ("display.mouse_tracking", "wheel")

    # Alias for buttons.
    set_click = _set_mouse("2", "click")
    assert set_click["result"] == {"key": "mouse", "value": "buttons"}
    assert writes[-1] == ("display.mouse_tracking", "buttons")

    # Unknown value → 4002.
    bad = _set_mouse("3", "rainbows")
    assert bad["error"]["code"] == 4002
|
||||
|
||||
|
||||
def test_enable_gateway_prompts_sets_gateway_env(monkeypatch):
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
"""Shared fixtures for tests/tools/ web-provider tests.
|
||||
|
||||
Per-file subprocess isolation means each test file gets a fresh interpreter,
|
||||
so module-level state (like the web-search-provider registry) is empty when
|
||||
a file starts. The ``web_registry_populated`` fixture registers all bundled
|
||||
providers before each test and resets the registry afterwards — tests that
|
||||
depend on the registry being populated should use it explicitly or via
|
||||
``@pytest.mark.usefixtures("web_registry_populated")``.
|
||||
"""
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def register_all_web_providers():
    """Reset the global web-search registry and register every bundled provider.

    This is the single source of truth for the provider list used by
    test classes that need the registry populated for dispatch checks.
    """
    from agent.web_search_registry import register_provider, _reset_for_tests
    from plugins.web.brave_free.provider import BraveFreeWebSearchProvider
    from plugins.web.ddgs.provider import DDGSWebSearchProvider
    from plugins.web.exa.provider import ExaWebSearchProvider
    from plugins.web.firecrawl.provider import FirecrawlWebSearchProvider
    from plugins.web.parallel.provider import ParallelWebSearchProvider
    from plugins.web.searxng.provider import SearXNGWebSearchProvider
    from plugins.web.tavily.provider import TavilyWebSearchProvider
    from plugins.web.xai.provider import XAIWebSearchProvider

    bundled_providers = (
        BraveFreeWebSearchProvider,
        DDGSWebSearchProvider,
        ExaWebSearchProvider,
        FirecrawlWebSearchProvider,
        ParallelWebSearchProvider,
        SearXNGWebSearchProvider,
        TavilyWebSearchProvider,
        XAIWebSearchProvider,
    )

    # Start from an empty registry, then add one fresh instance per class.
    _reset_for_tests()
    for provider_cls in bundled_providers:
        provider = provider_cls()
        register_provider(provider)
|
||||
|
||||
|
||||
@pytest.fixture
def web_registry_populated():
    """Fill the web-search-provider registry for one test and reset it after."""
    register_all_web_providers()
    yield
    # Teardown: leave the registry empty for whatever runs next.
    from agent.web_search_registry import _reset_for_tests as reset_registry

    reset_registry()
|
||||
|
||||
|
||||
@pytest.fixture
def disable_lazy_stt_install():
    """Make ``_try_lazy_install_stt`` return ``False`` for the test's duration.

    Static ``_HAS_FASTER_WHISPER`` patches are meant to simulate
    'faster-whisper not installed'. Without this fixture,
    ``_try_lazy_install_stt()`` calls
    ``importlib.util.find_spec("faster_whisper")``, which returns truthy
    whenever the package is installed in the dev / CI environment —
    defeating the test's ``_HAS_FASTER_WHISPER=False`` patch.

    Opt in at module scope with
    ``pytestmark = pytest.mark.usefixtures("disable_lazy_stt_install")``.
    """
    lazy_install_patch = patch(
        "tools.transcription_tools._try_lazy_install_stt", return_value=False
    )
    lazy_install_patch.start()
    try:
        yield
    finally:
        lazy_install_patch.stop()
|
||||
@@ -0,0 +1,246 @@
|
||||
"""Unit tests for tools/app_tools.py — the Nous tool gateway integration."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from tools.managed_tool_gateway import ManagedToolGatewayConfig
|
||||
|
||||
|
||||
# Canned gateway configuration that the tests hand back in place of the real
# ``resolve_managed_tool_gateway`` lookup.
_FAKE_GATEWAY = ManagedToolGatewayConfig(
    managed_mode=True,
    vendor="tools",
    gateway_origin="https://tools-gateway.example.com",
    nous_user_token="test-token-abc123",
)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _reset_http_client_cache():
    """Null out ``tools.app_tools``' cached httpx client before and after each test."""
    import tools.app_tools as app_tools_module

    def _clear():
        app_tools_module._http_client = None
        app_tools_module._http_client_origin = None

    _clear()
    yield
    _clear()
|
||||
|
||||
|
||||
@pytest.fixture()
def gateway_post(monkeypatch):
    """Stub the gateway lookup and ``httpx.Client.post``; return the captured request.

    The returned dict gains ``url``, ``headers`` and ``json`` keys each time
    the patched ``post`` is called. The stubbed response is a 200 with an
    empty ``data`` object and no error, and the model-name lookup is patched
    to return ``None``.
    """
    monkeypatch.setattr(
        "tools.app_tools.resolve_managed_tool_gateway", lambda v: _FAKE_GATEWAY
    )
    monkeypatch.setattr("tools.app_tools._get_current_model_name", lambda: None)

    ok_body = {"data": {}, "error": None}
    response = MagicMock(spec=httpx.Response)
    response.status_code = 200
    response.json.return_value = ok_body
    response.text = json.dumps(ok_body)

    captured = {}

    # NOTE: the ``json`` keyword shadows the module inside this function only.
    def fake_post(self, url, *, json=None, headers=None, **kw):
        captured.update(url=url, headers=headers, json=json)
        return response

    monkeypatch.setattr(httpx.Client, "post", fake_post)
    return captured
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# check_fn gating
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestAppToolsAvailability:
    """``_app_tools_available`` needs both a ready gateway and the config flag."""

    @staticmethod
    def _stub_state(monkeypatch, *, gateway_ready, config_enabled):
        """Patch the two inputs that ``_app_tools_available`` is tested against."""
        monkeypatch.setattr(
            "tools.app_tools.is_managed_tool_gateway_ready", lambda vendor: gateway_ready
        )
        monkeypatch.setattr(
            "tools.app_tools._read_portal_app_tools_enabled", lambda: config_enabled
        )

    def test_returns_false_when_gateway_not_ready(self, monkeypatch):
        self._stub_state(monkeypatch, gateway_ready=False, config_enabled=True)
        from tools.app_tools import _app_tools_available

        assert _app_tools_available() is False

    def test_returns_true_when_gateway_ready_and_config_on(self, monkeypatch):
        self._stub_state(monkeypatch, gateway_ready=True, config_enabled=True)
        from tools.app_tools import _app_tools_available

        assert _app_tools_available() is True

    def test_returns_false_when_config_off(self, monkeypatch):
        self._stub_state(monkeypatch, gateway_ready=True, config_enabled=False)
        from tools.app_tools import _app_tools_available

        assert _app_tools_available() is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# URL + auth header
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSearchPostsCorrectUrlAndAuth:
    """The search handler posts to ``/v1/search`` with bearer auth."""

    def test_posts_to_v1_search_with_bearer_token(self, monkeypatch, gateway_post):
        monkeypatch.setattr("tools.app_tools._get_current_model_name", lambda: "test-model")
        from tools.app_tools import handle_app_search_tools

        handle_app_search_tools({"queries": [{"use_case": "send email"}]})

        # Where the request went and how it authenticated.
        assert gateway_post["url"] == "https://tools-gateway.example.com/v1/search"
        sent_headers = gateway_post["headers"]
        assert sent_headers["Authorization"] == "Bearer test-token-abc123"
        assert sent_headers["Content-Type"] == "application/json"

        # What the request carried.
        sent_body = gateway_post["json"]
        assert sent_body["queries"] == [{"use_case": "send email"}]
        assert sent_body["model"] == "test-model"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Model auto-injection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestModelAutoInjection:
    """The request body carries ``model`` only when a model name resolves."""

    def test_injects_model_from_config(self, monkeypatch, gateway_post):
        monkeypatch.setattr("tools.app_tools._get_current_model_name", lambda: "claude-sonnet-4")
        from tools.app_tools import handle_app_search_tools

        handle_app_search_tools({"queries": [{"use_case": "test"}]})

        sent_body = gateway_post["json"]
        assert sent_body["model"] == "claude-sonnet-4"

    def test_omits_model_when_unresolvable(self, gateway_post):
        # The ``gateway_post`` fixture already patches the model lookup to None.
        from tools.app_tools import handle_app_search_tools

        handle_app_search_tools({"queries": [{"use_case": "test"}]})

        sent_body = gateway_post["json"]
        assert "model" not in sent_body
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Gateway-internal param stripping (allowlist approach)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestExecuteStripsInternalParams:
    """The execute handler drops gateway-internal keys and forwards ``tools``."""

    def test_strips_sync_response_thought_step_metric(self, gateway_post):
        from tools.app_tools import handle_app_execute_tools

        tool_calls = [{"tool_slug": "TEST", "arguments": {}}]
        internal_params = {
            "sync_response_to_workbench": True,
            "thought": "testing",
            "current_step": "TESTING",
            "current_step_metric": "1/1 tests",
        }
        handle_app_execute_tools({"tools": tool_calls, **internal_params})

        body = gateway_post["json"]
        for stripped_key in (
            "sync_response_to_workbench",
            "thought",
            "current_step",
            "current_step_metric",
        ):
            assert stripped_key not in body
        assert body["tools"] == [{"tool_slug": "TEST", "arguments": {}}]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HTTP error → tool result (not exception)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestHttpErrorReturnedAsToolResult:
    """A non-2xx gateway response comes back as JSON, not as an exception."""

    @pytest.mark.parametrize("status_code", [402, 403, 422, 500])
    def test_returns_error_json_not_exception(self, monkeypatch, status_code):
        monkeypatch.setattr("tools.app_tools.resolve_managed_tool_gateway", lambda v: _FAKE_GATEWAY)

        error_body = {"error": {"code": "TEST_ERROR", "message": "fail"}}
        error_response = MagicMock(spec=httpx.Response)
        error_response.status_code = status_code
        error_response.json.return_value = error_body
        error_response.text = json.dumps(error_body)

        def fake_post(self, url, **kw):
            return error_response

        monkeypatch.setattr(httpx.Client, "post", fake_post)

        from tools.app_tools import handle_app_search_tools

        raw_result = handle_app_search_tools({"queries": [{"use_case": "test"}]})
        result = json.loads(raw_result)
        assert result["error"]["code"] == "TEST_ERROR"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Network failure → tool result
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestNetworkFailureReturnedAsToolResult:
    """Transport-level httpx failures map to gateway error codes in the result."""

    @staticmethod
    def _search_with_failing_post(monkeypatch, exc_type, message):
        """Run a search while ``httpx.Client.post`` raises ``exc_type(message)``."""
        monkeypatch.setattr("tools.app_tools.resolve_managed_tool_gateway", lambda v: _FAKE_GATEWAY)

        def failing_post(self, url, **kw):
            raise exc_type(message)

        monkeypatch.setattr(httpx.Client, "post", failing_post)

        from tools.app_tools import handle_app_search_tools

        return json.loads(handle_app_search_tools({"queries": [{"use_case": "test"}]}))

    def test_connect_error_returns_gateway_unreachable(self, monkeypatch):
        result = self._search_with_failing_post(
            monkeypatch, httpx.ConnectError, "Connection refused"
        )
        assert result["error"]["code"] == "GATEWAY_UNREACHABLE"

    def test_timeout_returns_gateway_timeout(self, monkeypatch):
        result = self._search_with_failing_post(
            monkeypatch, httpx.ReadTimeout, "timed out"
        )
        assert result["error"]["code"] == "GATEWAY_TIMEOUT"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Endpoint routing + payload forwarding
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEndpointRouting:
    """Each handler posts to its own endpoint and forwards its payload fields."""

    def test_manage_connections_forwards_toolkits(self, gateway_post):
        from tools.app_tools import handle_app_manage_connections

        handle_app_manage_connections({"toolkits": ["gmail", "slack"], "reinitiate_all": True})

        assert gateway_post["url"].endswith("/v1/connections")
        sent_body = gateway_post["json"]
        assert sent_body["toolkits"] == ["gmail", "slack"]
        assert sent_body["reinitiate_all"] is True

    def test_tool_schemas_forwards_slugs(self, gateway_post):
        from tools.app_tools import handle_app_tool_schemas

        handle_app_tool_schemas(
            {"tool_slugs": ["GMAIL_SEND_EMAIL"], "include": ["input_schema", "output_schema"]}
        )

        assert gateway_post["url"].endswith("/v1/schemas")
        sent_body = gateway_post["json"]
        assert sent_body["tool_slugs"] == ["GMAIL_SEND_EMAIL"]
        assert sent_body["include"] == ["input_schema", "output_schema"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry entries
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRegistryEntries:
    """Importing ``tools.app_tools`` registers its tools under ``app_tools``."""

    def test_all_four_tools_registered_under_app_tools(self):
        from tools.registry import registry
        import tools.app_tools  # noqa: F401

        expected = {"app_search_tools", "app_tool_schemas", "app_execute_tools", "app_manage_connections"}
        for tool_name in expected:
            registered = registry._tools.get(tool_name)
            assert registered is not None, f"{tool_name} not registered"
            assert registered.toolset == "app_tools"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# session (object) vs session_id (string) asymmetry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSessionHandling:
    """Search forwards a ``session`` object; the other handlers forward ``session_id``."""

    def test_search_uses_session_object(self, gateway_post):
        from tools.app_tools import handle_app_search_tools

        handle_app_search_tools({"queries": [{"use_case": "test"}], "session": {"generate_id": True}})

        sent_body = gateway_post["json"]
        assert isinstance(sent_body["session"], dict)
        assert "session_id" not in sent_body

    def test_schemas_uses_session_id_string(self, gateway_post):
        from tools.app_tools import handle_app_tool_schemas

        handle_app_tool_schemas({"tool_slugs": ["TEST"], "session_id": "sess-123"})

        sent_body = gateway_post["json"]
        assert sent_body["session_id"] == "sess-123"
        assert "session" not in sent_body

    def test_execute_uses_session_id_string(self, gateway_post):
        from tools.app_tools import handle_app_execute_tools

        handle_app_execute_tools({"tools": [{"tool_slug": "TEST", "arguments": {}}], "session_id": "sess-456"})

        sent_body = gateway_post["json"]
        assert sent_body["session_id"] == "sess-456"
        assert "session" not in sent_body

    def test_connections_uses_session_id_string(self, gateway_post):
        from tools.app_tools import handle_app_manage_connections

        handle_app_manage_connections({"toolkits": ["gmail"], "session_id": "sess-789"})

        sent_body = gateway_post["json"]
        assert sent_body["session_id"] == "sess-789"
        assert "session" not in sent_body
|
||||
@@ -22,18 +22,28 @@ from tools.approval import (
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def isolated_session(monkeypatch):
|
||||
"""Give each test a fresh session_key and clean approval-state."""
|
||||
def isolated_session(monkeypatch, tmp_path):
|
||||
"""Give each test a fresh session_key, clean approval-state, and isolated
|
||||
HERMES_HOME so the real user's command_allowlist doesn't leak in."""
|
||||
import tools.approval as _am
|
||||
|
||||
session_key = "test:session:approval_hooks"
|
||||
token = set_current_session_key(session_key)
|
||||
monkeypatch.setenv("HERMES_SESSION_KEY", session_key)
|
||||
# Make sure we don't skip guards via yolo / approvals.mode=off
|
||||
monkeypatch.delenv("HERMES_YOLO_MODE", raising=False)
|
||||
# Isolate from the real user's permanent allowlist + session state
|
||||
_saved_permanent = _am._permanent_approved.copy()
|
||||
_saved_session = {k: v.copy() for k, v in _am._session_approved.items()}
|
||||
_am._permanent_approved.clear()
|
||||
_am._session_approved.clear()
|
||||
try:
|
||||
yield session_key
|
||||
finally:
|
||||
_am._permanent_approved.update(_saved_permanent)
|
||||
_am._session_approved.update(_saved_session)
|
||||
try:
|
||||
approval_module._approval_session_key.reset(token)
|
||||
_am._approval_session_key.reset(token)
|
||||
except Exception:
|
||||
pass
|
||||
clear_session(session_key)
|
||||
|
||||
@@ -41,7 +41,7 @@ def _find_chrome() -> str:
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def chrome_cdp(worker_id):
|
||||
def chrome_cdp(request):
|
||||
"""Start a headless Chrome with --remote-debugging-port, yield its WS URL.
|
||||
|
||||
Uses a unique port per xdist worker to avoid cross-worker collisions.
|
||||
@@ -51,6 +51,9 @@ def chrome_cdp(worker_id):
|
||||
import socket
|
||||
|
||||
# xdist worker_id is "master" in single-process mode or "gw0".."gwN" otherwise.
|
||||
# Under subprocess-per-file isolation there's no xdist, so we fall back
|
||||
# to "master" via the session-scoped fixture below.
|
||||
worker_id = request.getfixturevalue("worker_id") if "worker_id" in request.fixturenames else "master"
|
||||
if worker_id == "master":
|
||||
port_offset = 0
|
||||
else:
|
||||
|
||||
@@ -76,6 +76,27 @@ class TestSchema:
|
||||
modes = set(COMPUTER_USE_SCHEMA["parameters"]["properties"]["mode"]["enum"])
|
||||
assert modes == {"som", "vision", "ax"}
|
||||
|
||||
def test_schema_exposes_max_elements_cap_for_capture(self):
    """The schema declares ``max_elements`` as an integer with a minimum of at least 1."""
    from tools.computer_use.schema import COMPUTER_USE_SCHEMA

    properties = COMPUTER_USE_SCHEMA["parameters"]["properties"]
    assert "max_elements" in properties
    max_elements_spec = properties["max_elements"]
    assert max_elements_spec["type"] == "integer"
    # A missing ``minimum`` is accepted; a declared one must be >= 1.
    assert max_elements_spec.get("minimum", 1) >= 1
|
||||
|
||||
def test_schema_max_elements_documents_default_and_upper_bound(self):
    """Schema ``default`` and ``maximum`` must equal the runtime constants.

    The original PR text said "Default 100" without a corresponding
    `default` field, and had no upper bound — both Copilot findings.
    """
    from tools.computer_use.schema import COMPUTER_USE_SCHEMA
    from tools.computer_use.tool import (
        _DEFAULT_MAX_ELEMENTS,
        _MAX_ALLOWED_MAX_ELEMENTS,
    )

    max_elements_spec = COMPUTER_USE_SCHEMA["parameters"]["properties"]["max_elements"]
    declared_default = max_elements_spec.get("default")
    declared_maximum = max_elements_spec.get("maximum")
    assert declared_default == _DEFAULT_MAX_ELEMENTS
    assert declared_maximum == _MAX_ALLOWED_MAX_ELEMENTS
|
||||
|
||||
|
||||
class TestRegistration:
|
||||
def test_tool_registers_with_registry(self):
|
||||
@@ -155,6 +176,104 @@ class TestDispatch:
|
||||
click_kw = next(c[1] for c in noop_backend.calls if c[0] == "click")
|
||||
assert click_kw["button"] == "right"
|
||||
|
||||
def test_type_action_routes_to_type_text_backend(self, noop_backend):
    """A ``type`` action records a ``type`` backend call with the text (issue #24170, bug 3)."""
    from tools.computer_use.tool import handle_computer_use

    response = json.loads(handle_computer_use({"action": "type", "text": "hello"}))
    assert "error" not in response

    recorded_names = [call[0] for call in noop_backend.calls]
    assert "type" in recorded_names
    # First recorded ``type`` call: (name, kwargs).
    type_kwargs = next(call[1] for call in noop_backend.calls if call[0] == "type")
    assert type_kwargs["text"] == "hello"
|
||||
|
||||
def test_drag_action_routes_to_backend_by_coordinate(self, noop_backend):
    """A coordinate drag reaches ``backend.drag`` as ``from_xy``/``to_xy`` tuples (issue #24170, bug 4)."""
    from tools.computer_use.tool import handle_computer_use

    request = {
        "action": "drag",
        "from_coordinate": [100, 200],
        "to_coordinate": [400, 500],
    }
    response = json.loads(handle_computer_use(request))
    assert "error" not in response

    recorded_names = [call[0] for call in noop_backend.calls]
    assert "drag" in recorded_names
    drag_kwargs = next(call[1] for call in noop_backend.calls if call[0] == "drag")
    assert drag_kwargs["from_xy"] == (100, 200)
    assert drag_kwargs["to_xy"] == (400, 500)
|
||||
|
||||
def test_drag_action_routes_to_backend_by_element(self, noop_backend):
    """An element drag reaches ``backend.drag`` with both element indices (issue #24170, bug 4)."""
    from tools.computer_use.tool import handle_computer_use

    request = {
        "action": "drag",
        "from_element": 1,
        "to_element": 5,
    }
    response = json.loads(handle_computer_use(request))
    assert "error" not in response

    recorded_names = [call[0] for call in noop_backend.calls]
    assert "drag" in recorded_names
    drag_kwargs = next(call[1] for call in noop_backend.calls if call[0] == "drag")
    assert drag_kwargs["from_element"] == 1
    assert drag_kwargs["to_element"] == 5
|
||||
|
||||
def test_drag_action_requires_coordinates_or_elements(self, noop_backend):
    """A drag with neither coordinates nor elements yields an error result."""
    from tools.computer_use.tool import handle_computer_use

    response = json.loads(handle_computer_use({"action": "drag"}))
    assert "error" in response
|
||||
|
||||
def test_set_value_routes_to_backend(self, noop_backend):
    """set_value must reach the backend — regression for missing _NoopBackend stub."""
    from tools.computer_use.tool import handle_computer_use

    request = {"action": "set_value", "value": "Option A", "element": 5}
    response = json.loads(handle_computer_use(request))

    assert response.get("ok") is True
    assert response.get("action") == "set_value"
    recorded_names = (call[0] for call in noop_backend.calls)
    assert any(name == "set_value" for name in recorded_names)
|
||||
|
||||
def test_set_value_missing_value_returns_error(self, noop_backend):
    """``set_value`` without a ``value`` yields an error result."""
    from tools.computer_use.tool import handle_computer_use

    response = json.loads(handle_computer_use({"action": "set_value"}))
    assert "error" in response
|
||||
def test_capture_after_skipped_when_action_failed(self, noop_backend):
    """capture_after must not fire when res.ok=False (regression guard).

    A follow-up screenshot after a failed action shows the screen in a
    normal state, misleading the model into thinking the action succeeded.
    """
    from unittest.mock import patch
    from tools.computer_use.backend import ActionResult
    from tools.computer_use.tool import handle_computer_use

    # Make click() return a failure.
    failed_click = ActionResult(ok=False, action="click", message="element not found")
    request = {"action": "click", "element": 99, "capture_after": True}
    with patch.object(noop_backend, "click", return_value=failed_click):
        raw_response = handle_computer_use(request)

    response = json.loads(raw_response)
    # Should return the error, not a multimodal capture.
    assert response.get("ok") is False
    assert response.get("action") == "click"
    # No follow-up capture should have been issued.
    capture_calls = [call for call in noop_backend.calls if call[0] == "capture"]
    assert not capture_calls, "capture must not be called after a failed action"
|
||||
|
||||
def test_capture_after_fires_when_action_succeeds(self, noop_backend):
    """capture_after must trigger exactly one follow-up capture after a successful action.

    The return value of ``handle_computer_use`` is intentionally not bound:
    this test only inspects the calls recorded on ``noop_backend``.
    """
    from tools.computer_use.tool import handle_computer_use

    handle_computer_use({"action": "click", "element": 1, "capture_after": True})

    # Noop backend returns ok=True, so capture should have been called.
    capture_calls = [c for c in noop_backend.calls if c[0] == "capture"]
    assert len(capture_calls) == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Safety guards (type / key block lists)
|
||||
@@ -287,6 +406,193 @@ class TestCaptureResponse:
|
||||
assert "AXButton" in text_part["text"]
|
||||
assert "AXTextField" in text_part["text"]
|
||||
|
||||
def _ax_backend_with(self, count: int):
    """Return a stub backend whose ``capture()`` reports ``count`` AXButton elements.

    Elements are numbered from 1 and labelled ``el-0`` … ``el-(count-1)``.
    All non-capture actions are no-ops that return ``None``.
    """
    from tools.computer_use.backend import CaptureResult, UIElement

    elements = []
    for position in range(count):
        elements.append(
            UIElement(
                index=position + 1,
                role="AXButton",
                label=f"el-{position}",
                bounds=(0, 0, 1, 1),
            )
        )

    class FakeBackend:
        def start(self):
            pass

        def stop(self):
            pass

        def is_available(self):
            return True

        def capture(self, mode="som", app=None):
            # Hand out a fresh list each time so callers cannot mutate the fixture.
            return CaptureResult(
                mode=mode,
                width=800,
                height=600,
                png_b64="",
                elements=list(elements),
                app="Obsidian",
            )

        def click(self, **kw):
            ...

        def drag(self, **kw):
            ...

        def scroll(self, **kw):
            ...

        def type_text(self, text):
            ...

        def key(self, keys):
            ...

        def list_apps(self):
            return []

        def focus_app(self, app, raise_window=False):
            ...

    return FakeBackend()
|
||||
|
||||
def test_capture_ax_caps_elements_at_default_for_dense_trees(self):
    """Regression for #22865: a 600-element (Electron-style) AX tree must be
    capped at the default limit instead of being dumped whole into the result.
    """
    from tools.computer_use import tool as cu_tool

    total = 600
    dense_backend = self._ax_backend_with(total)
    cu_tool.reset_backend_for_tests()
    with patch.object(cu_tool, "_get_backend", return_value=dense_backend):
        raw = cu_tool.handle_computer_use({"action": "capture", "mode": "ax"})

    payload = json.loads(raw)
    limit = cu_tool._DEFAULT_MAX_ELEMENTS
    assert payload["mode"] == "ax"
    assert payload["total_elements"] == total
    assert len(payload["elements"]) == limit
    assert payload["truncated_elements"] == total - limit
    # The human-readable summary has to mention the truncation so the model
    # knows the JSON is partial and can retry with a narrower scope.
    assert "truncated to" in payload["summary"]
|
||||
|
||||
def test_capture_ax_honors_explicit_max_elements_override(self):
    """An explicit ``max_elements=250`` yields 250 of 600 elements, 350 truncated."""
    from tools.computer_use import tool as cu_tool

    request = {"action": "capture", "mode": "ax", "max_elements": 250}
    dense_backend = self._ax_backend_with(600)
    cu_tool.reset_backend_for_tests()
    with patch.object(cu_tool, "_get_backend", return_value=dense_backend):
        raw = cu_tool.handle_computer_use(request)

    payload = json.loads(raw)
    assert len(payload["elements"]) == 250
    assert payload["truncated_elements"] == 350
|
||||
|
||||
def test_capture_ax_below_cap_is_unchanged(self):
    """Backwards-compat: a 5-element capture is returned in full, with no
    `truncated_elements` key and no truncation note in the summary.
    """
    from tools.computer_use import tool as cu_tool

    small_backend = self._ax_backend_with(5)
    cu_tool.reset_backend_for_tests()
    with patch.object(cu_tool, "_get_backend", return_value=small_backend):
        raw = cu_tool.handle_computer_use({"action": "capture", "mode": "ax"})

    payload = json.loads(raw)
    assert len(payload["elements"]) == 5
    assert payload["total_elements"] == 5
    assert "truncated_elements" not in payload
    assert "truncated to" not in payload["summary"]
|
||||
|
||||
def test_capture_ax_invalid_max_elements_falls_back_to_default(self):
    """A malformed `max_elements` (non-numeric string, zero, negative) must
    fall back to the default cap rather than switching the cap off.
    """
    from tools.computer_use import tool as cu_tool

    dense_backend = self._ax_backend_with(600)
    cu_tool.reset_backend_for_tests()

    def capture_with(max_elements):
        # Patch per call, exactly one capture request per bad value.
        with patch.object(cu_tool, "_get_backend", return_value=dense_backend):
            return cu_tool.handle_computer_use(
                {"action": "capture", "mode": "ax", "max_elements": max_elements}
            )

    for bad in ("not-a-number", 0, -10):
        payload = json.loads(capture_with(bad))
        assert len(payload["elements"]) == cu_tool._DEFAULT_MAX_ELEMENTS, (
            f"bad max_elements={bad!r} disabled the cap"
        )
|
||||
|
||||
def test_capture_ax_clamps_oversized_max_elements_to_hard_cap(self):
    """A huge `max_elements` argument cannot switch the safeguard off: the
    effective cap is clamped to the module's hard upper bound.
    """
    from tools.computer_use import tool as cu_tool

    total = 5000
    request = {"action": "capture", "mode": "ax", "max_elements": 10_000}
    huge_backend = self._ax_backend_with(total)
    cu_tool.reset_backend_for_tests()
    with patch.object(cu_tool, "_get_backend", return_value=huge_backend):
        raw = cu_tool.handle_computer_use(request)

    payload = json.loads(raw)
    hard_cap = cu_tool._MAX_ALLOWED_MAX_ELEMENTS
    assert len(payload["elements"]) == hard_cap
    assert payload["total_elements"] == total
    assert payload["truncated_elements"] == total - hard_cap
|
||||
|
||||
def test_capture_ax_summary_indices_match_returned_elements(self):
    """With `max_elements` smaller than the summary's own line cap, every
    `#N` line in the summary must refer to an element that is actually in
    the returned `elements` array, so the model never sees a dangling index.
    """
    from tools.computer_use import tool as cu_tool

    request = {"action": "capture", "mode": "ax", "max_elements": 5}
    dense_backend = self._ax_backend_with(600)
    cu_tool.reset_backend_for_tests()
    with patch.object(cu_tool, "_get_backend", return_value=dense_backend):
        raw = cu_tool.handle_computer_use(request)

    payload = json.loads(raw)
    returned_indices = {element["index"] for element in payload["elements"]}

    for line in payload["summary"].splitlines():
        stripped = line.lstrip()
        if not stripped.startswith("#"):
            continue
        # First whitespace-separated token is "#<index>".
        idx = int(stripped.split()[0].lstrip("#"))
        assert idx in returned_indices, (
            f"summary references #{idx} but it is absent from elements payload "
            f"(returned: {sorted(returned_indices)})"
        )
|
||||
|
||||
def test_capture_multimodal_summary_omits_truncation_note(self):
    """A som/vision capture returns a screenshot envelope with no `elements`
    array, so its summary must not claim the "response [was] truncated to N
    of M elements".
    """
    from tools.computer_use.backend import CaptureResult, UIElement
    from tools.computer_use import tool as cu_tool

    fake_png = "iVBORw0KGgo="
    elements = []
    for position in range(600):
        elements.append(
            UIElement(
                index=position + 1,
                role="AXButton",
                label=f"el-{position}",
                bounds=(0, 0, 1, 1),
            )
        )

    class FakeBackend:
        def start(self):
            pass

        def stop(self):
            pass

        def is_available(self):
            return True

        def capture(self, mode="som", app=None):
            return CaptureResult(
                mode=mode,
                width=800,
                height=600,
                png_b64=fake_png,
                elements=list(elements),
                app="Obsidian",
            )

        def click(self, **kw):
            ...

        def drag(self, **kw):
            ...

        def scroll(self, **kw):
            ...

        def type_text(self, text):
            ...

        def key(self, keys):
            ...

        def list_apps(self):
            return []

        def focus_app(self, app, raise_window=False):
            ...

    cu_tool.reset_backend_for_tests()
    with patch.object(cu_tool, "_get_backend", return_value=FakeBackend()):
        out = cu_tool.handle_computer_use({"action": "capture", "mode": "som"})

    assert isinstance(out, dict) and out["_multimodal"] is True
    text_part = next(part for part in out["content"] if part.get("type") == "text")
    assert "truncated to" not in text_part["text"], (
        "multimodal response carries an image, not an elements array; "
        "the truncation note describes a payload field that isn't present"
    )
    assert "truncated to" not in out["text_summary"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Anthropic adapter: multimodal tool-result conversion
|
||||
@@ -679,3 +985,332 @@ class TestUniversality:
|
||||
source = inspect.getsource(entry.check_fn)
|
||||
assert "anthropic" not in source.lower()
|
||||
assert "openai" not in source.lower()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Regression tests for bugs 2 & 5 from issue #24170 (cua-driver v0.1.6)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestElementLabelParsing:
    """Bug 5: element labels were stripped from capture results once
    cua-driver switched line formats in v0.1.6.

    Older drivers print `` - [N] AXRole "label"``; cua-driver ≥0.1.6 prints
    ``[N] AXRole (order) id=Label``. _parse_elements_from_tree has to accept both.
    """

    def test_classic_quoted_label_format(self):
        """Legacy lines with a quoted label, including an empty ``""`` label."""
        from tools.computer_use.cua_backend import _parse_elements_from_tree

        tree = "".join([
            ' - [14] AXButton "One"\n',
            ' - [15] AXButton "Two"\n',
            ' - [16] AXTextField ""\n',
        ])
        parsed = _parse_elements_from_tree(tree)

        assert len(parsed) == 3
        first, second, third = parsed
        assert first.index == 14
        assert first.role == "AXButton"
        assert first.label == "One"
        assert second.label == "Two"
        assert third.label == ""  # empty quoted label

    def test_new_id_eq_format(self):
        """cua-driver v0.1.6 format: [N] AXRole (order) id=Label"""
        from tools.computer_use.cua_backend import _parse_elements_from_tree

        tree = "".join([
            "[14] AXButton (1) id=One\n",
            "[15] AXButton (2) id=Two\n",
            "[16] AXTextField (3) id=\n",
        ])
        parsed = _parse_elements_from_tree(tree)

        assert len(parsed) == 3
        first, second, third = parsed
        assert first.index == 14
        assert first.role == "AXButton"
        assert first.label == "One"
        assert second.label == "Two"
        assert third.label == ""  # empty id= value

    def test_mixed_formats_in_single_tree(self):
        """A tree mixing legacy and v0.1.6 lines still parses every element."""
        from tools.computer_use.cua_backend import _parse_elements_from_tree

        tree = "".join([
            ' - [1] AXWindow "Main Window"\n',
            "[14] AXButton (1) id=One\n",
            ' - [15] AXTextField "Search"\n',
        ])
        parsed = _parse_elements_from_tree(tree)

        assert len(parsed) == 3
        label_by_index = {}
        for element in parsed:
            label_by_index[element.index] = element.label
        assert label_by_index[1] == "Main Window"
        assert label_by_index[14] == "One"
        assert label_by_index[15] == "Search"
|
||||
|
||||
|
||||
class TestCaptureAfterAppContext:
    """Bug 2: capture_after=True loses app context after actions.

    _maybe_follow_capture must re-target the same app that was set by
    the preceding capture/focus_app call, rather than the frontmost window.
    """

    @staticmethod
    def _recording_backend(last_app, fallback_app):
        """Build a stub backend plus the list recording each ``capture()`` ``app`` arg.

        Args:
            last_app: value exposed as the backend's ``_last_app`` class
                attribute (simulates the app set by an earlier focus/capture).
            fallback_app: app name reported in the ``CaptureResult`` when
                ``capture()`` is called without an explicit ``app``.

        Returns:
            ``(backend, captured_app_args)`` — every ``capture()`` call appends
            its ``app`` argument to ``captured_app_args``. All other actions
            return ``ActionResult(ok=True, action=<name>)``.
        """
        from tools.computer_use.backend import ActionResult, CaptureResult

        captured_app_args = []

        class RecordingBackend:
            _last_app = last_app

            def start(self):
                pass

            def stop(self):
                pass

            def is_available(self):
                return True

            def capture(self, mode="som", app=None):
                captured_app_args.append(app)
                return CaptureResult(
                    mode=mode, width=100, height=100,
                    png_b64=None, elements=[],
                    app=app or fallback_app, window_title="",
                )

            def click(self, **kw):
                return ActionResult(ok=True, action="click")

            def drag(self, **kw):
                return ActionResult(ok=True, action="drag")

            def scroll(self, **kw):
                return ActionResult(ok=True, action="scroll")

            def type_text(self, text):
                return ActionResult(ok=True, action="type")

            def key(self, keys):
                return ActionResult(ok=True, action="key")

            def list_apps(self):
                return []

            def focus_app(self, app, raise_window=False):
                return ActionResult(ok=True, action="focus_app")

            def set_value(self, value, element=None):
                return ActionResult(ok=True, action="set_value")

            def wait(self, seconds=1.0):
                return ActionResult(ok=True, action="wait")

        return RecordingBackend(), captured_app_args

    def _click_with_capture_after(self, last_app, fallback_app, element):
        """Run one ``click`` with ``capture_after=True`` against a recording stub.

        Installs the stub as ``cu_tool._backend`` for the duration of the call
        and returns the list of ``app`` arguments the follow-up capture used.
        """
        from tools.computer_use import tool as cu_tool

        backend, captured_app_args = self._recording_backend(last_app, fallback_app)
        cu_tool.reset_backend_for_tests()
        cu_tool._backend = backend
        try:
            cu_tool.handle_computer_use(
                {"action": "click", "element": element, "capture_after": True}
            )
        finally:
            # Don't leave the stub installed as the module-level backend for
            # whatever test runs next. NOTE(review): assumes
            # reset_backend_for_tests() drops the cached backend — confirm.
            cu_tool.reset_backend_for_tests()
        return captured_app_args

    def test_capture_after_uses_last_app(self):
        """capture_after=True should pass _last_app to the follow-up capture."""
        captured_app_args = self._click_with_capture_after(
            last_app="Calculator",  # simulates a previous focus_app call
            fallback_app="Calculator",
            element=14,
        )

        # The follow-up capture must have been called with app="Calculator"
        assert len(captured_app_args) == 1
        assert captured_app_args[0] == "Calculator", (
            f"Expected follow-up capture with app='Calculator', got {captured_app_args[0]!r}"
        )

    def test_capture_after_without_prior_app_uses_none(self):
        """When no app context is set, follow-up capture uses app=None (frontmost)."""
        captured_app_args = self._click_with_capture_after(
            last_app=None,  # no prior context
            fallback_app="Finder",
            element=5,
        )

        # No app context — should pass None so cua-driver picks the frontmost window
        assert len(captured_app_args) == 1
        assert captured_app_args[0] is None
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Regression tests for bug 1 from issue #24170:
|
||||
# capture(app=...) and focus_app(app=...) must surface when the filter
|
||||
# matches nothing instead of silently picking the frontmost window.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_cua_backend_with_windows(windows: List[Dict[str, Any]]):
    """Return a CuaDriverBackend whose MCP session is a MagicMock.

    Every ``call_tool`` on the mocked session answers with a non-error
    payload whose ``structuredContent`` carries ``windows`` as the
    list_windows result.
    """
    from tools.computer_use.cua_backend import CuaDriverBackend

    list_windows_reply = {
        "data": "",
        "images": [],
        "structuredContent": {"windows": windows},
        "isError": False,
    }
    session = MagicMock()
    session.call_tool.return_value = list_windows_reply

    backend = CuaDriverBackend()
    backend._session = session
    return backend
|
||||
|
||||
|
||||
class TestCaptureAppFilterNoMatch:
    """capture(app=X) must not silently fall back to the frontmost window
    when X matches nothing — on a non-English macOS, list_windows returns
    localized app names (e.g. "計算機"), so an English `app="Calculator"`
    legitimately matches nothing and the caller needs to retry with the
    localized name. The old code silently captured the frontmost window
    (e.g. a menu-bar utility), giving the agent wrong UI elements.
    """

    @staticmethod
    def _localized_windows():
        """list_windows payload for a localized macOS.

        Fuwari (a menu-bar utility) is listed first; Calculator appears under
        its localized app_name "計算機".
        """
        return [
            {"app_name": "Fuwari", "pid": 100, "window_id": 1,
             "is_on_screen": True, "title": "menu bar", "z_index": 0},
            {"app_name": "計算機", "pid": 200, "window_id": 2,
             "is_on_screen": True, "title": "Calculator", "z_index": 1},
        ]

    def test_app_filter_no_match_returns_empty_capture_with_diagnostic(self):
        """An unmatched app= filter yields an empty capture with a diagnostic title."""
        backend = _make_cua_backend_with_windows(self._localized_windows())

        cap = backend.capture(mode="som", app="Calculator")

        # No window matched; capture must NOT pick the frontmost (Fuwari).
        assert cap.app == "", (
            f"app= filter no-match should not silently target a window; got {cap.app!r}"
        )
        assert cap.elements == []
        assert "Calculator" in cap.window_title
        assert "list_apps" in cap.window_title
        # _active_pid must remain unset so a subsequent click doesn't hit Fuwari.
        assert backend._active_pid is None
        assert backend._active_window_id is None

    def test_app_filter_match_still_works(self):
        """A filter that matches the localized name targets that window."""
        windows = self._localized_windows()
        backend = _make_cua_backend_with_windows(windows)
        # First reply: list_windows; second: get_window_state for the matched window.
        backend._session.call_tool.side_effect = [
            {"data": "", "images": [], "isError": False,
             "structuredContent": {"windows": windows}},
            {"data": '✅ 計算機 — 0 elements\n', "images": [], "isError": False,
             "structuredContent": None},
        ]

        # Only the backend's targeting state is checked, so the result is not bound.
        backend.capture(mode="ax", app="計算機")

        assert backend._active_pid == 200
        assert backend._active_window_id == 2

    def test_no_app_filter_still_picks_frontmost(self):
        """When no app= is given, capture continues to pick the frontmost
        window — the no-match early-return must not fire on the empty case."""
        windows = [
            {"app_name": "Fuwari", "pid": 100, "window_id": 1,
             "is_on_screen": True, "title": "menu bar", "z_index": 0},
        ]
        backend = _make_cua_backend_with_windows(windows)
        backend._session.call_tool.side_effect = [
            {"data": "", "images": [], "isError": False,
             "structuredContent": {"windows": windows}},
            {"data": '✅ Fuwari — 0 elements\n', "images": [], "isError": False,
             "structuredContent": None},
        ]

        # Only the backend's targeting state is checked, so the result is not bound.
        backend.capture(mode="ax", app=None)

        assert backend._active_pid == 100
|
||||
|
||||
|
||||
class TestFocusAppFilterNoMatch:
    """focus_app(app=X) has to report ok=False when X matches no window,
    instead of quietly targeting the frontmost window and answering ok=True
    with a misleading 'Targeted Fuwari' message.
    """

    @staticmethod
    def _backend_on_localized_mac():
        """Backend whose window list holds Fuwari and a Calculator named "計算機"."""
        fuwari = {"app_name": "Fuwari", "pid": 100, "window_id": 1,
                  "is_on_screen": True, "title": "menu bar", "z_index": 0}
        calculator = {"app_name": "計算機", "pid": 200, "window_id": 2,
                      "is_on_screen": True, "title": "Calculator", "z_index": 1}
        return _make_cua_backend_with_windows([fuwari, calculator])

    def test_focus_app_no_match_returns_not_ok(self):
        """The English name matches nothing, so the result is a failure."""
        backend = self._backend_on_localized_mac()

        outcome = backend.focus_app("Calculator")

        assert outcome.ok is False
        assert outcome.action == "focus_app"
        assert "Calculator" in outcome.message
        # No pid may be recorded, otherwise a later click would land on Fuwari.
        assert backend._active_pid is None

    def test_focus_app_match_still_works(self):
        """The localized name matches and the backend records that window."""
        backend = self._backend_on_localized_mac()

        outcome = backend.focus_app("計算機")

        assert outcome.ok is True
        assert backend._active_pid == 200
        assert backend._active_window_id == 2
|
||||
|
||||
@@ -0,0 +1,431 @@
|
||||
"""End-to-end regression for #24015 — capture routing via auxiliary.vision.
|
||||
|
||||
When ``computer_use(action='capture', mode='som'|'vision')`` returns a
|
||||
screenshot, ``_capture_response`` previously always returned a
|
||||
``_multimodal`` envelope. For non-vision main models, or when the user
|
||||
explicitly configured ``auxiliary.vision`` in ``config.yaml``, that
|
||||
envelope tripped HTTP 404 / 400 at the provider boundary even though a
|
||||
perfectly good vision backend was sitting in config waiting to be used.
|
||||
|
||||
This file exercises the integrated ``_capture_response`` flow with
|
||||
deterministic stubs for:
|
||||
|
||||
* ``should_route_capture_to_aux_vision`` (the policy decision)
|
||||
* ``_run_async`` (sync->async bridge)
|
||||
* ``vision_analyze_tool`` (the aux LLM call)
|
||||
* ``hermes_constants.get_hermes_dir`` (cache path)
|
||||
|
||||
…so the full code path is covered without a live cua-driver, a real
|
||||
auxiliary client, or network access.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures / helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# 1×1 PNG (transparent) — minimal bytes that decode cleanly.
|
||||
_PNG_B64 = (
|
||||
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42m"
|
||||
"NkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="
|
||||
)
|
||||
|
||||
# JPEG header prefix (truncated stream, not a decodable image) — used to verify mime detection works for either stream type.
|
||||
_JPEG_B64 = (
|
||||
"/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEB"
|
||||
"AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/"
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def tmp_cache_dir(tmp_path):
    """Patch ``hermes_constants.get_hermes_dir`` to return a directory under tmp_path.

    Yields the ``cache_vision`` directory that the patched function returns
    for any arguments.
    """
    vision_cache = tmp_path / "cache_vision"
    vision_cache.mkdir()

    with patch("hermes_constants.get_hermes_dir", lambda *_args, **_kw: vision_cache):
        yield vision_cache
|
||||
|
||||
|
||||
def _make_capture(
    *,
    png_b64: str = _PNG_B64,
    mode: str = "som",
    elements=None,
    app: str = "Safari",
    window_title: str = "GitHub – Issue #24015",
    width: int = 1280,
    height: int = 800,
):
    """Build a ``CaptureResult`` for the routing tests.

    Args:
        png_b64: base64 image payload; its decoded length is stored in
            ``png_bytes_len``.
        mode: capture mode passed through to the result.
        elements: iterable of ``UIElement``. ``None`` (the default) selects a
            two-element fixture (a "Sign in" button and a "username" text
            field); an explicitly empty iterable is kept empty.
        app, window_title, width, height: passed through to the result.

    Returns:
        A ``CaptureResult`` holding a fresh list copy of ``elements``.
    """
    from tools.computer_use.backend import CaptureResult, UIElement

    # Test for None rather than truthiness so callers can ask for zero elements.
    if elements is None:
        elements = [
            UIElement(index=0, role="AXButton", label="Sign in",
                      bounds=(10, 20, 80, 30)),
            UIElement(index=1, role="AXTextField", label="username",
                      bounds=(10, 60, 200, 24)),
        ]
    elements = list(elements)
    raw = base64.b64decode(png_b64, validate=False)
    return CaptureResult(
        mode=mode,
        width=width,
        height=height,
        png_b64=png_b64,
        elements=elements,
        app=app,
        window_title=window_title,
        png_bytes_len=len(raw),
    )
|
||||
|
||||
|
||||
def _stub_aux_analysis(text: str):
|
||||
"""Return a fake vision_analyze_tool coroutine result (JSON envelope)."""
|
||||
return json.dumps({"success": True, "analysis": text})
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _capture_response: routing OFF (current/native behaviour)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCaptureResponseDefaultPath:
    """Routing helper answers 'native': the pre-existing multimodal envelope is returned."""

    def test_som_capture_returns_multimodal_envelope_when_native(self):
        """A PNG som capture comes back as a multimodal envelope with an image/png data URL."""
        from tools.computer_use import tool as cu_tool

        capture = _make_capture(png_b64=_PNG_B64, mode="som")
        routing_off = patch.object(
            cu_tool, "_should_route_through_aux_vision", return_value=False
        )
        with routing_off:
            envelope = cu_tool._capture_response(capture)

        assert isinstance(envelope, dict)
        assert envelope.get("_multimodal") is True
        # A PNG payload has to be labelled with the image/png MIME type.
        image_part = next(
            part for part in envelope["content"] if part.get("type") == "image_url"
        )
        data_url = image_part["image_url"]["url"]
        assert data_url.startswith("data:image/png;base64,")
        assert "vision_analysis" not in envelope

    def test_jpeg_capture_returns_image_jpeg_mime_when_native(self):
        """A JPEG payload is labelled image/jpeg in the data URL."""
        from tools.computer_use import tool as cu_tool

        capture = _make_capture(png_b64=_JPEG_B64, mode="som")
        routing_off = patch.object(
            cu_tool, "_should_route_through_aux_vision", return_value=False
        )
        with routing_off:
            envelope = cu_tool._capture_response(capture)

        image_part = next(
            part for part in envelope["content"] if part.get("type") == "image_url"
        )
        assert image_part["image_url"]["url"].startswith("data:image/jpeg;base64,")

    def test_ax_only_capture_returns_text_regardless_of_routing(self):
        """An ax capture has no image, so it returns JSON text and never asks the routing helper."""
        from tools.computer_use import tool as cu_tool

        capture = _make_capture(mode="ax", png_b64="")
        # Routing is forced to "aux" here to show it makes no difference for ax.
        with patch.object(
            cu_tool, "_should_route_through_aux_vision", return_value=True
        ) as routing:
            reply = cu_tool._capture_response(capture)

        # The helper is never consulted for ax captures.
        routing.assert_not_called()
        assert isinstance(reply, str)
        assert json.loads(reply)["mode"] == "ax"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _capture_response: routing ON (the #24015 fix)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCaptureResponseRoutedToAuxVision:
|
||||
"""When routing helper says 'aux', the PNG is pre-analysed and a text
|
||||
response is returned with no image_url parts at all."""
|
||||
|
||||
def test_som_capture_returns_text_with_vision_analysis(
    self, tmp_cache_dir,
):
    """With routing on, a som capture becomes a JSON text reply carrying the
    aux model's analysis next to the usual capture metadata."""
    from tools.computer_use import tool as cu_tool

    cap = _make_capture(mode="som")
    bridge_state = {}
    analysis_text = (
        "A Safari window showing a GitHub issue page with a "
        "'Sign in' button and a 'username' text field."
    )

    def _fake_run_async(coro):
        bridge_state["called"] = True
        return _stub_aux_analysis(analysis_text)

    # vision_analyze_tool is async in production; a plain MagicMock stands in
    # so its positional arguments can be inspected without awaiting anything.
    fake_vat = MagicMock(return_value="<coro>")

    with patch.object(cu_tool, "_should_route_through_aux_vision",
                      return_value=True):
        with patch("model_tools._run_async", side_effect=_fake_run_async):
            with patch("tools.vision_tools.vision_analyze_tool", new=fake_vat):
                resp = cu_tool._capture_response(cap)

    # A JSON string, not a multimodal envelope — this is the contract that
    # keeps #24015's HTTP 404 from firing on the next agent turn.
    assert isinstance(resp, str)
    body = json.loads(resp)
    assert body["mode"] == "som"
    assert body["app"] == "Safari"
    assert "Sign in" in body["vision_analysis"]
    assert body["vision_analysis_routed_via"] == "auxiliary.vision"
    # Window title and elements survive alongside the analysis, so the agent
    # keeps the same context it would have had on the multimodal path.
    assert body["window_title"] == "GitHub – Issue #24015"
    assert len(body["elements"]) == 2

    assert bridge_state.get("called") is True
    # The aux tool received a path inside the patched cache dir and a prompt.
    positional, _kwargs = fake_vat.call_args
    path_arg = positional[0]
    prompt_arg = positional[1]
    assert str(tmp_cache_dir) in path_arg
    assert "macOS application screenshot" in prompt_arg
    # The prompt embeds the AX summary so the aux model can ground its
    # description in the same element index the agent sees.
    assert "Sign in" in prompt_arg
|
||||
|
||||
def test_temp_screenshot_file_is_cleaned_up_after_routing(
    self, tmp_cache_dir,
):
    """The screenshot file handed to the aux call exists during the call and
    is gone once _capture_response has returned."""
    from tools.computer_use import tool as cu_tool

    cap = _make_capture(mode="som")
    # Remembers the path the aux call was given, for the post-call check.
    seen = {}

    def _fake_vat(image_path, _prompt):
        seen["path"] = image_path
        # At this point the temp file must still be on disk.
        assert os.path.exists(image_path)
        return "<coro>"

    def _fake_run_async(_coro):
        return _stub_aux_analysis("description goes here")

    fake_vat = MagicMock(side_effect=_fake_vat)

    with patch.object(cu_tool, "_should_route_through_aux_vision",
                      return_value=True):
        with patch("model_tools._run_async", side_effect=_fake_run_async):
            with patch("tools.vision_tools.vision_analyze_tool", new=fake_vat):
                cu_tool._capture_response(cap)

    # After the call the file has been removed.
    assert seen["path"]
    assert not os.path.exists(seen["path"])
|
||||
|
||||
def test_temp_file_cleaned_up_even_when_aux_call_raises(
    self, tmp_cache_dir,
):
    """A failing aux call still removes the temp file and falls back.

    ``model_tools._run_async`` is stubbed to raise, so the response must be
    the multimodal envelope, and the path recorded by the stubbed
    ``vision_analyze_tool`` must no longer exist.
    """
    from tools.computer_use import tool as cu_tool

    capture = _make_capture(mode="som")
    seen = {}

    def _record_image_path(image_path, _prompt):
        seen["path"] = image_path
        return "<coro>"

    def _run_async_failure(_coro):
        raise RuntimeError("aux LLM down")

    fake_vat = MagicMock(side_effect=_record_image_path)

    force_aux_routing = patch.object(
        cu_tool, "_should_route_through_aux_vision", return_value=True,
    )
    stub_run_async = patch("model_tools._run_async", side_effect=_run_async_failure)
    stub_vision_tool = patch("tools.vision_tools.vision_analyze_tool", new=fake_vat)

    with force_aux_routing, stub_run_async, stub_vision_tool:
        resp = cu_tool._capture_response(capture)

    # Aux failure → fall back to multimodal envelope (so the user still
    # gets *something* useful even if vision is broken).
    assert isinstance(resp, dict)
    assert resp.get("_multimodal") is True

    # Temp file must still be cleaned up.
    temp_path = seen["path"]
    assert temp_path
    assert not os.path.exists(temp_path)
|
||||
|
||||
def test_empty_aux_analysis_falls_back_to_multimodal(self, tmp_cache_dir):
    """An empty aux analysis yields the multimodal envelope instead."""
    from tools.computer_use import tool as cu_tool

    capture = _make_capture(mode="som")

    def _run_async_empty(_coro):
        return _stub_aux_analysis("")

    fake_vat = MagicMock(return_value="<coro>")

    force_aux_routing = patch.object(
        cu_tool, "_should_route_through_aux_vision", return_value=True,
    )
    stub_run_async = patch("model_tools._run_async", side_effect=_run_async_empty)
    stub_vision_tool = patch("tools.vision_tools.vision_analyze_tool", new=fake_vat)

    with force_aux_routing, stub_run_async, stub_vision_tool:
        resp = cu_tool._capture_response(capture)

    # Empty analysis is treated as failure — we'd rather show pixels
    # than embed an empty 'vision_analysis' string into the result.
    assert isinstance(resp, dict)
    assert resp.get("_multimodal") is True
|
||||
|
||||
def test_invalid_aux_response_falls_back_to_multimodal(self, tmp_cache_dir):
    """A non-string aux result yields the multimodal envelope instead."""
    from tools.computer_use import tool as cu_tool

    capture = _make_capture(mode="som")

    def _run_async_garbage(_coro):
        return 1234  # not a string at all

    fake_vat = MagicMock(return_value="<coro>")

    force_aux_routing = patch.object(
        cu_tool, "_should_route_through_aux_vision", return_value=True,
    )
    stub_run_async = patch("model_tools._run_async", side_effect=_run_async_garbage)
    stub_vision_tool = patch("tools.vision_tools.vision_analyze_tool", new=fake_vat)

    with force_aux_routing, stub_run_async, stub_vision_tool:
        resp = cu_tool._capture_response(capture)

    assert isinstance(resp, dict)
    assert resp.get("_multimodal") is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _should_route_through_aux_vision: end-to-end with real config plumbing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRoutingDecisionWiring:
    """Verify _should_route_through_aux_vision wires the right config + helper."""

    def test_explicit_aux_vision_in_config_routes_to_aux(self):
        """A config with an explicit ``auxiliary.vision`` block returns True."""
        from tools.computer_use import tool as cu_tool

        main_model = {"default": "tencent/hy3-preview", "provider": "openrouter"}
        aux_vision = {
            "provider": "openrouter",
            "model": "google/gemini-2.5-flash",
        }
        cfg = {"model": main_model, "auxiliary": {"vision": aux_vision}}

        stub_provider = patch(
            "agent.auxiliary_client._read_main_provider",
            return_value="openrouter",
        )
        stub_model = patch(
            "agent.auxiliary_client._read_main_model",
            return_value="tencent/hy3-preview",
        )
        stub_config = patch("hermes_cli.config.load_config", return_value=cfg)

        with stub_provider, stub_model, stub_config:
            assert cu_tool._should_route_through_aux_vision() is True

    def test_no_explicit_aux_and_vision_capable_main_keeps_multimodal(self):
        """No aux block plus both capability lookups stubbed True returns False."""
        from tools.computer_use import tool as cu_tool

        cfg = {
            "model": {"default": "claude-opus-4-5", "provider": "anthropic"},
        }

        stub_provider = patch(
            "agent.auxiliary_client._read_main_provider",
            return_value="anthropic",
        )
        stub_model = patch(
            "agent.auxiliary_client._read_main_model",
            return_value="claude-opus-4-5",
        )
        stub_config = patch("hermes_cli.config.load_config", return_value=cfg)
        stub_vision_caps = patch(
            "tools.computer_use.vision_routing._lookup_supports_vision",
            return_value=True,
        )
        stub_tool_result_caps = patch(
            "tools.computer_use.vision_routing._provider_accepts_multimodal_tool_result",
            return_value=True,
        )

        with stub_provider, stub_model, stub_config, \
                stub_vision_caps, stub_tool_result_caps:
            assert cu_tool._should_route_through_aux_vision() is False

    def test_config_load_failure_disables_routing_safely(self):
        """A raising ``load_config`` must not propagate; the result is False."""
        from tools.computer_use import tool as cu_tool

        broken_config = patch(
            "hermes_cli.config.load_config",
            side_effect=RuntimeError("config.yaml unreadable"),
        )
        with broken_config:
            # No exception should bubble up — fail open by returning False
            # so the legacy multimodal envelope continues to work.
            assert cu_tool._should_route_through_aux_vision() is False

    def test_helper_decision_exception_is_swallowed(self):
        """A raising policy helper must not propagate; the result is False."""
        from tools.computer_use import tool as cu_tool
        from tools.computer_use import vision_routing as vr_mod

        stub_provider = patch(
            "agent.auxiliary_client._read_main_provider",
            return_value="openrouter",
        )
        stub_model = patch(
            "agent.auxiliary_client._read_main_model",
            return_value="x",
        )
        stub_config = patch("hermes_cli.config.load_config", return_value={})
        broken_policy = patch.object(
            vr_mod,
            "should_route_capture_to_aux_vision",
            side_effect=ValueError("policy bug"),
        )

        with stub_provider, stub_model, stub_config, broken_policy:
            assert cu_tool._should_route_through_aux_vision() is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bug reproduction marker — proves the fix is needed.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestBugReproductionAnchor:
    """Without the fix, this test would assert the wrong thing.

    On upstream/main HEAD prior to this branch, _capture_response returns a
    multimodal envelope unconditionally — so when a non-vision main model
    is configured, the captured PNG is delivered to the main provider as
    image_url content and the request is rejected with HTTP 404. There is
    no live provider here, so the test pins the contract instead: with
    routing enabled the response MUST be a JSON string with no image_url
    parts.
    """

    def test_non_vision_main_model_never_returns_image_url_when_routed(
        self, tmp_cache_dir,
    ):
        """With routing forced on, the response is text without image markers."""
        from tools.computer_use import tool as cu_tool

        capture = _make_capture(mode="som")

        def _run_async_stub(_coro):
            return _stub_aux_analysis(
                "Screenshot showing a GitHub.com window with a sign-in form."
            )

        fake_vat = MagicMock(return_value="<coro>")

        force_aux_routing = patch.object(
            cu_tool, "_should_route_through_aux_vision", return_value=True,
        )
        stub_run_async = patch("model_tools._run_async", side_effect=_run_async_stub)
        stub_vision_tool = patch(
            "tools.vision_tools.vision_analyze_tool", new=fake_vat,
        )

        with force_aux_routing, stub_run_async, stub_vision_tool:
            resp = cu_tool._capture_response(capture)

        # Must be a string (text-only result).
        assert isinstance(resp, str)
        # Must NOT contain a base64 image URL anywhere — that's what tripped
        # 'No endpoints found that support image input' on the reporter's
        # main provider in #24015.
        for image_marker in ("data:image", "image_url"):
            assert image_marker not in resp
|
||||
@@ -0,0 +1,260 @@
|
||||
"""Unit tests for tools.computer_use.vision_routing.
|
||||
|
||||
Cover the small ``should_route_capture_to_aux_vision`` policy helper that
|
||||
decides whether a captured screenshot from ``computer_use(action='capture')``
|
||||
should be returned as a multimodal envelope (main model handles vision
|
||||
natively) or pre-analysed via the ``auxiliary.vision`` pipeline so the
|
||||
main model only sees text.
|
||||
|
||||
The companion end-to-end regression for #24015 lives in
|
||||
``tests/tools/test_computer_use_capture_routing.py``; this file pins the
|
||||
unit contract of the helper in isolation so behaviour does not regress
|
||||
silently if the surrounding ``computer_use`` plumbing is refactored.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _explicit_aux_vision_override
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestExplicitAuxVisionOverride:
    """Mirror agent.image_routing — config detection must agree across paths."""

    @staticmethod
    def _detect(cfg):
        """Import the helper lazily and return its verdict for ``cfg``."""
        from tools.computer_use.vision_routing import _explicit_aux_vision_override

        return _explicit_aux_vision_override(cfg)

    @staticmethod
    def _vision_cfg(vision_block):
        """Wrap ``vision_block`` as ``{"auxiliary": {"vision": ...}}``."""
        return {"auxiliary": {"vision": vision_block}}

    def test_returns_false_for_none_cfg(self):
        assert self._detect(None) is False

    def test_returns_false_for_non_dict_cfg(self):
        assert self._detect("not-a-dict") is False
        assert self._detect([]) is False

    def test_returns_false_when_auxiliary_block_missing(self):
        assert self._detect({}) is False
        assert self._detect({"model": {"default": "x"}}) is False

    def test_returns_false_when_vision_block_missing(self):
        cfg = {"auxiliary": {"compression": {"provider": "openai"}}}
        assert self._detect(cfg) is False

    def test_returns_false_for_blank_provider_no_model_no_base_url(self):
        cfg = self._vision_cfg({"provider": "", "model": "", "base_url": ""})
        assert self._detect(cfg) is False

    def test_returns_false_for_provider_auto(self):
        cfg = self._vision_cfg({"provider": "auto"})
        assert self._detect(cfg) is False

    def test_returns_false_for_provider_AUTO_uppercase(self):
        # Padded, upper-case spelling of "auto" must be treated the same.
        cfg = self._vision_cfg({"provider": "  AUTO  "})
        assert self._detect(cfg) is False

    def test_returns_true_for_explicit_provider(self):
        cfg = self._vision_cfg({"provider": "openrouter"})
        assert self._detect(cfg) is True

    def test_returns_true_for_explicit_model_only(self):
        cfg = self._vision_cfg({"model": "google/gemini-2.5-flash"})
        assert self._detect(cfg) is True

    def test_returns_true_for_explicit_base_url_only(self):
        cfg = self._vision_cfg({"base_url": "http://localhost:1234/v1"})
        assert self._detect(cfg) is True

    def test_returns_true_for_provider_auto_plus_explicit_model(self):
        """``provider: auto`` + an explicit model still counts as override."""
        cfg = self._vision_cfg({"provider": "auto", "model": "claude-3-haiku"})
        assert self._detect(cfg) is True

    def test_handles_non_dict_vision_block(self):
        cfg = self._vision_cfg("not-a-dict")
        assert self._detect(cfg) is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# should_route_capture_to_aux_vision
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRouteDecision:
    """End-to-end policy: explicit override > tool-result support > vision caps."""

    @staticmethod
    def _decide(provider, model, cfg, *, supports_vision, accepts_tool_images):
        """Run the policy helper with both capability lookups stubbed.

        ``supports_vision`` is returned by ``_lookup_supports_vision`` and
        ``accepts_tool_images`` by ``_provider_accepts_multimodal_tool_result``
        for the duration of the call.
        """
        from tools.computer_use import vision_routing

        stub_vision_caps = patch.object(
            vision_routing, "_lookup_supports_vision",
            return_value=supports_vision,
        )
        stub_tool_result_caps = patch.object(
            vision_routing, "_provider_accepts_multimodal_tool_result",
            return_value=accepts_tool_images,
        )
        with stub_vision_caps, stub_tool_result_caps:
            return vision_routing.should_route_capture_to_aux_vision(
                provider, model, cfg
            )

    def test_explicit_override_routes_to_aux_even_for_vision_main(self):
        """Issue #24015 core repro: explicit aux config must win.

        Even if the main model fully supports vision (Anthropic / Claude),
        an explicit ``auxiliary.vision`` block means the user wants their
        configured backend used. Don't silently bypass it.
        """
        cfg = {
            "auxiliary": {
                "vision": {
                    "provider": "openrouter",
                    "model": "google/gemini-2.5-flash",
                }
            }
        }
        routed = self._decide(
            "anthropic", "claude-opus-4-5", cfg,
            supports_vision=True, accepts_tool_images=True,
        )
        assert routed is True

    def test_non_vision_main_model_routes_to_aux(self):
        """The reported #24015 scenario: tencent/hy3-preview has no vision."""
        cfg = {"model": {"default": "tencent/hy3-preview", "provider": "openrouter"}}
        routed = self._decide(
            "openrouter", "tencent/hy3-preview", cfg,
            supports_vision=False, accepts_tool_images=True,
        )
        assert routed is True

    def test_vision_main_model_no_override_keeps_multimodal(self):
        """Default path: vision-capable main model + no aux override → native."""
        routed = self._decide(
            "anthropic", "claude-opus-4-5", None,
            supports_vision=True, accepts_tool_images=True,
        )
        assert routed is False

    def test_provider_rejects_multimodal_tool_results_routes_to_aux(self):
        """Some providers' tool-result messages won't carry images at all."""
        routed = self._decide(
            "some-aggregator", "some-vision-model", {},
            supports_vision=True, accepts_tool_images=False,
        )
        assert routed is True

    def test_unknown_provider_capabilities_fail_closed(self):
        """When tool-result lookup returns None, route to aux (safe default)."""
        routed = self._decide(
            "exotic-provider", "exotic-model", {},
            supports_vision=True, accepts_tool_images=None,
        )
        assert routed is True

    def test_unknown_vision_capability_fails_closed(self):
        """When models.dev has no entry, prefer aux over a likely 404."""
        routed = self._decide(
            "openrouter", "novel/never-seen-model", {},
            supports_vision=None, accepts_tool_images=True,
        )
        assert routed is True

    def test_explicit_override_wins_over_unknown_caps(self):
        """Explicit aux config wins regardless of unknown caps elsewhere."""
        cfg = {"auxiliary": {"vision": {"provider": "openrouter"}}}
        routed = self._decide(
            "openrouter", "tencent/hy3-preview", cfg,
            supports_vision=None, accepts_tool_images=None,
        )
        assert routed is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Internal lookups — defensive paths
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestLookupHelpers:
    """Defensive paths of the internal capability lookups."""

    def test_lookup_supports_vision_returns_none_for_blank_provider(self):
        from tools.computer_use import vision_routing

        verdict = vision_routing._lookup_supports_vision("", "claude")
        assert verdict is None

    def test_lookup_supports_vision_returns_none_for_blank_model(self):
        from tools.computer_use import vision_routing

        verdict = vision_routing._lookup_supports_vision("anthropic", "")
        assert verdict is None

    def test_lookup_supports_vision_handles_lookup_exception(self):
        """Underlying caps lookup may raise; helper must swallow + return None."""
        from tools.computer_use import vision_routing

        def _caps_lookup_fails(_provider, _model):
            raise RuntimeError("models.dev unreachable")

        failing_caps = patch(
            "agent.models_dev.get_model_capabilities",
            side_effect=_caps_lookup_fails,
        )
        with failing_caps:
            verdict = vision_routing._lookup_supports_vision("anthropic", "claude")
            assert verdict is None

    def test_lookup_supports_vision_returns_none_when_caps_missing(self):
        from tools.computer_use import vision_routing

        missing_caps = patch(
            "agent.models_dev.get_model_capabilities", return_value=None,
        )
        with missing_caps:
            verdict = vision_routing._lookup_supports_vision("anthropic", "claude")
            assert verdict is None

    def test_provider_accepts_multimodal_tool_result_returns_none_for_blank_provider(self):
        from tools.computer_use import vision_routing

        verdict = vision_routing._provider_accepts_multimodal_tool_result("", "claude")
        assert verdict is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module surface
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestModuleSurface:
    """Pin the public surface so dependents stay in lockstep."""

    def test_should_route_capture_to_aux_vision_is_exported(self):
        """The policy helper is listed in ``__all__`` and is callable."""
        from tools.computer_use import vision_routing

        exported = vision_routing.__all__
        assert "should_route_capture_to_aux_vision" in exported
        assert callable(vision_routing.should_route_capture_to_aux_vision)

    @pytest.mark.parametrize(
        "name",
        [
            "_explicit_aux_vision_override",
            "_lookup_supports_vision",
            "_provider_accepts_multimodal_tool_result",
        ],
    )
    def test_internal_helpers_are_addressable(self, name):
        """Internal helpers stay importable so tests can monkeypatch them."""
        from tools.computer_use import vision_routing

        assert hasattr(vision_routing, name)
        helper = getattr(vision_routing, name)
        assert callable(helper)
|
||||
@@ -1089,9 +1089,17 @@ class Test403Enrichment:
|
||||
class TestModelToolsIntegration:
|
||||
def setup_method(self):
    """Reset the capability cache, then clear tool-def and check_fn caches."""
    _reset_capability_cache()

    # Imported here rather than at module level, as in the original test.
    from model_tools import _clear_tool_defs_cache
    from tools.registry import invalidate_check_fn_cache

    for clear_cache in (_clear_tool_defs_cache, invalidate_check_fn_cache):
        clear_cache()
|
||||
|
||||
def teardown_method(self):
    """Reset the capability cache, then clear tool-def and check_fn caches."""
    _reset_capability_cache()

    # Imported here rather than at module level, as in the original test.
    from model_tools import _clear_tool_defs_cache
    from tools.registry import invalidate_check_fn_cache

    for clear_cache in (_clear_tool_defs_cache, invalidate_check_fn_cache):
        clear_cache()
|
||||
|
||||
@patch("tools.discord_tool._discord_request")
|
||||
def test_discord_admin_schema_rebuilt_by_get_tool_definitions(
|
||||
|
||||
@@ -501,16 +501,18 @@ class TestRegistration:
|
||||
|
||||
def test_check_fn_gates_availability(self, monkeypatch):
    """Registry should exclude HA tools when HASS_TOKEN is not set."""
    from tools.registry import invalidate_check_fn_cache, registry

    ha_tool_names = {"ha_list_entities", "ha_get_state", "ha_call_service"}

    monkeypatch.delenv("HASS_TOKEN", raising=False)
    # NOTE(review): presumably drops memoized check_fn results so the
    # environment change above is observed — confirm against tools.registry.
    invalidate_check_fn_cache()

    definitions = registry.get_definitions(ha_tool_names)
    assert len(definitions) == 0
|
||||
|
||||
def test_check_fn_includes_when_token_set(self, monkeypatch):
    """Registry should include HA tools when HASS_TOKEN is set."""
    from tools.registry import invalidate_check_fn_cache, registry

    ha_tool_names = {"ha_list_entities", "ha_get_state", "ha_call_service"}

    monkeypatch.setenv("HASS_TOKEN", "test-token")
    # NOTE(review): presumably drops memoized check_fn results so the
    # environment change above is observed — confirm against tools.registry.
    invalidate_check_fn_cache()

    definitions = registry.get_definitions(ha_tool_names)
    assert len(definitions) == 3
|
||||
|
||||
@@ -1093,6 +1093,11 @@ def test_kanban_guidance_not_in_normal_prompt(monkeypatch, tmp_path):
|
||||
from pathlib import Path as _P
|
||||
monkeypatch.setattr(_P, "home", lambda: tmp_path)
|
||||
|
||||
from tools.registry import invalidate_check_fn_cache
|
||||
from model_tools import _clear_tool_defs_cache
|
||||
invalidate_check_fn_cache()
|
||||
_clear_tool_defs_cache()
|
||||
|
||||
from run_agent import AIAgent
|
||||
a = AIAgent(
|
||||
api_key="test",
|
||||
@@ -1116,6 +1121,11 @@ def test_kanban_guidance_in_worker_prompt(monkeypatch, tmp_path):
|
||||
from pathlib import Path as _P
|
||||
monkeypatch.setattr(_P, "home", lambda: tmp_path)
|
||||
|
||||
from tools.registry import invalidate_check_fn_cache
|
||||
from model_tools import _clear_tool_defs_cache
|
||||
invalidate_check_fn_cache()
|
||||
_clear_tool_defs_cache()
|
||||
|
||||
from run_agent import AIAgent
|
||||
a = AIAgent(
|
||||
api_key="test",
|
||||
|
||||
@@ -78,6 +78,63 @@ def test_resolve_managed_tool_gateway_is_disabled_without_subscription():
|
||||
assert result is None
|
||||
|
||||
|
||||
def test_rewrite_localhost_origin_rewrites_subdomain():
    """A ``*.localhost`` origin resolves to 127.0.0.1 and keeps its host."""
    resolved_origin, host_header = managed_tool_gateway._rewrite_localhost_origin(
        "http://tools-gateway.localhost:3009"
    )

    assert resolved_origin == "http://127.0.0.1:3009"
    assert host_header == "tools-gateway.localhost:3009"
|
||||
|
||||
|
||||
def test_rewrite_localhost_origin_preserves_path():
    """The URL path survives the rewrite; the host value carries no path."""
    resolved_origin, host_header = managed_tool_gateway._rewrite_localhost_origin(
        "http://tools-gateway.localhost:3009/v1/foo"
    )

    assert resolved_origin == "http://127.0.0.1:3009/v1/foo"
    assert host_header == "tools-gateway.localhost:3009"
|
||||
|
||||
|
||||
def test_rewrite_localhost_origin_no_port():
    """A ``*.localhost`` origin without a port is rewritten without one."""
    resolved_origin, host_header = managed_tool_gateway._rewrite_localhost_origin(
        "http://tools-gateway.localhost"
    )

    assert resolved_origin == "http://127.0.0.1"
    assert host_header == "tools-gateway.localhost"
|
||||
|
||||
|
||||
def test_rewrite_localhost_origin_ignores_bare_localhost():
    """Plain ``localhost`` is returned unchanged with no host value."""
    origin = "http://localhost:3009"

    resolved_origin, host_header = managed_tool_gateway._rewrite_localhost_origin(origin)

    assert resolved_origin == "http://localhost:3009"
    assert host_header is None
|
||||
|
||||
|
||||
def test_rewrite_localhost_origin_ignores_real_domains():
    """A non-localhost origin is returned unchanged with no host value."""
    origin = "https://tools-gateway.nousresearch.com"

    resolved_origin, host_header = managed_tool_gateway._rewrite_localhost_origin(origin)

    assert resolved_origin == "https://tools-gateway.nousresearch.com"
    assert host_header is None
|
||||
|
||||
|
||||
def test_gateway_config_resolved_origin_and_host_header():
    """A ``*.localhost`` gateway origin yields a 127.0.0.1 origin and host header."""
    config_kwargs = dict(
        vendor="tools",
        gateway_origin="http://tools-gateway.localhost:3009",
        nous_user_token="tok",
        managed_mode=True,
    )
    gateway_cfg = managed_tool_gateway.ManagedToolGatewayConfig(**config_kwargs)

    assert gateway_cfg.resolved_origin == "http://127.0.0.1:3009"
    assert gateway_cfg.gateway_host_header == "tools-gateway.localhost:3009"
|
||||
|
||||
|
||||
def test_gateway_config_resolved_origin_passthrough_for_real_domain():
    """A real-domain gateway origin passes through with no host header."""
    config_kwargs = dict(
        vendor="firecrawl",
        gateway_origin="https://firecrawl-gateway.nousresearch.com",
        nous_user_token="tok",
        managed_mode=True,
    )
    gateway_cfg = managed_tool_gateway.ManagedToolGatewayConfig(**config_kwargs)

    assert gateway_cfg.resolved_origin == "https://firecrawl-gateway.nousresearch.com"
    assert gateway_cfg.gateway_host_header is None
|
||||
|
||||
|
||||
def test_read_nous_access_token_refreshes_expiring_cached_token(tmp_path, monkeypatch):
|
||||
monkeypatch.delenv("TOOL_GATEWAY_USER_TOKEN", raising=False)
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user