Compare commits
163 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a91a57fa5a | |||
| 72f94f4a7c | |||
| 345821b4a1 | |||
| fcd9011f8d | |||
| 585d6b6430 | |||
| 374dc81c23 | |||
| b85b938b1f | |||
| 4ded3ede33 | |||
| 7bb97b952f | |||
| d0a183cadd | |||
| 5f91b1a48b | |||
| d725407c56 | |||
| 6ba35ec336 | |||
| 395e9dd9e2 | |||
| c445f48b78 | |||
| 74d0b392e7 | |||
| 627f8a5f1d | |||
| 70b663504f | |||
| 559c6ad94a | |||
| afb97dbc53 | |||
| 016c772e7f | |||
| 9c304a7f56 | |||
| 53637fb17d | |||
| c9b32a654c | |||
| e377833fa6 | |||
| 16ff9464a5 | |||
| 965610f922 | |||
| ca413c6164 | |||
| c5dc9700eb | |||
| a31191c3f5 | |||
| 44b63fc6de | |||
| 97a32afdc4 | |||
| 63503ebb14 | |||
| c7db6a5800 | |||
| 86a368d832 | |||
| 55c9f32060 | |||
| 006937f7d0 | |||
| 566d8f0d75 | |||
| 6784c80794 | |||
| 9818b9a1ac | |||
| ce0e189d3e | |||
| dc4cde278b | |||
| cd9470f416 | |||
| 068c24f8a4 | |||
| 31ba2b0cbc | |||
| 4aec25bc44 | |||
| 7fee1f61eb | |||
| 6068363311 | |||
| 2d7182f72c | |||
| 42070ecefb | |||
| 887ba1fb03 | |||
| 233d4170cf | |||
| a480d345e6 | |||
| 47c0efe1c0 | |||
| 164a77dec9 | |||
| 99b81cd54b | |||
| b1edf3dfc8 | |||
| c57709a3d6 | |||
| e38a478c05 | |||
| 55a7c45d37 | |||
| 96917fb74a | |||
| 259ae846c8 | |||
| bea96e5cac | |||
| 79afa50703 | |||
| 624ce11ee8 | |||
| b2bf658442 | |||
| d69eab1efd | |||
| c4bda3f27c | |||
| cc07e30f45 | |||
| 384ec9684e | |||
| 3215ef1609 | |||
| 032fb84222 | |||
| 518f39557b | |||
| 3b9368a0c4 | |||
| 9e67c8e8be | |||
| 622c27e55c | |||
| bd3a5873e1 | |||
| 4444d5fe4f | |||
| 6fc0fa6e50 | |||
| 13c3d4b4ef | |||
| 4e89c53082 | |||
| 931caf2b2d | |||
| 734aa0f367 | |||
| 4ad5fa702f | |||
| aac6d97a14 | |||
| 7d7cdd48e0 | |||
| 1e4801b8d0 | |||
| 7fdc16dd4a | |||
| e13c1b8060 | |||
| 9eef53b960 | |||
| e4d7a5dffa | |||
| b62c997973 | |||
| 9fb40e6a3d | |||
| d5416284f1 | |||
| db84a78e61 | |||
| f199cd9f84 | |||
| 77276070f5 | |||
| 274217316e | |||
| 13c72fb486 | |||
| 6af9942327 | |||
| 8373956850 | |||
| 94bdc63ff5 | |||
| eacb398f75 | |||
| 5301cc212b | |||
| c4a21d7831 | |||
| 59c7cc64f0 | |||
| 55f3262e78 | |||
| 5360b54244 | |||
| 647cc0bb0d | |||
| 4f8aaf1046 | |||
| b6e07417c5 | |||
| 47614dbfca | |||
| 09d9724a09 | |||
| 85782a4ed7 | |||
| 9f57f2286d | |||
| 6682f91b80 | |||
| 05d9f641c0 | |||
| 9329e06696 | |||
| 04b1fdaecf | |||
| 681778a0b7 | |||
| 0161d4bb6c | |||
| 814c60092b | |||
| 23ac522d37 | |||
| e0e7397c32 | |||
| e0e4856d46 | |||
| 0086cdaf93 | |||
| fc2754dbdf | |||
| 3df26b925c | |||
| 80efe664ce | |||
| d57a4b3eb5 | |||
| 6bdad1f3b2 | |||
| f9ad7400e3 | |||
| 965ae7fa97 | |||
| cbd1f8e4be | |||
| f8745f59c2 | |||
| bcca5ed34d | |||
| c8c6ce1731 | |||
| 5af672c753 | |||
| d364132114 | |||
| 4c94396206 | |||
| e8b9f5ff9a | |||
| d3d5916089 | |||
| eabd8c1fd1 | |||
| e8a4c85e88 | |||
| ad7d3bc84c | |||
| 4695d2716f | |||
| 8ed2ef6f46 | |||
| 1702a94c88 | |||
| 55622b5525 | |||
| 74e47c081f | |||
| d6c488f2dc | |||
| 09d970160b | |||
| db82c453b9 | |||
| 38ea2a57a5 | |||
| 0854640537 | |||
| 19071529f6 | |||
| ed84637d11 | |||
| 4abfb6bc24 | |||
| e84fe483bc | |||
| ccb5aae0d2 | |||
| 34fc94d1f4 | |||
| 4813aaf0ba | |||
| 5ce0067c08 |
+7
-18
@@ -281,6 +281,13 @@ BROWSER_SESSION_TIMEOUT=300
|
||||
# Browser sessions are automatically closed after this period of no activity
|
||||
BROWSER_INACTIVITY_TIMEOUT=120
|
||||
|
||||
# Extra Chromium launch flags passed to agent-browser, comma- or newline-separated.
|
||||
# Hermes auto-injects "--no-sandbox,--disable-dev-shm-usage" when it detects root
|
||||
# or AppArmor-restricted unprivileged user namespaces (Ubuntu 23.10+, DGX Spark,
|
||||
# many container images), so leave this unset unless you need extra flags.
|
||||
# Setting this disables the auto-injection.
|
||||
# AGENT_BROWSER_ARGS=--no-sandbox
|
||||
|
||||
# Camofox local anti-detection browser (Camoufox-based Firefox).
|
||||
# Set CAMOFOX_URL to route the browser tools through a local Camofox server
|
||||
# instead of agent-browser/Browserbase. See docs/user-guide/features/browser.md.
|
||||
@@ -387,24 +394,6 @@ IMAGE_TOOLS_DEBUG=false
|
||||
# CONTEXT_COMPRESSION_THRESHOLD=0.85 # Compress at 85% of context limit
|
||||
# Model is set via compression.summary_model in config.yaml (default: google/gemini-3-flash-preview)
|
||||
|
||||
# =============================================================================
|
||||
# RL TRAINING (Tinker + Atropos)
|
||||
# =============================================================================
|
||||
# Run reinforcement learning training on language models using the Tinker API.
|
||||
# Requires the rl-server to be running (from tinker-atropos package).
|
||||
|
||||
# Tinker API Key - RL training service
|
||||
# Get at: https://tinker-console.thinkingmachines.ai/keys
|
||||
# TINKER_API_KEY=
|
||||
|
||||
# Weights & Biases API Key - Experiment tracking and metrics
|
||||
# Get at: https://wandb.ai/authorize
|
||||
# WANDB_API_KEY=
|
||||
|
||||
# RL API Server URL (default: http://localhost:8080)
|
||||
# Change if running the rl-server on a different host/port
|
||||
# RL_API_URL=http://localhost:8080
|
||||
|
||||
# =============================================================================
|
||||
# SKILLS HUB (GitHub integration for skill search/install/publish)
|
||||
# =============================================================================
|
||||
|
||||
@@ -0,0 +1,58 @@
|
||||
name: History Check
|
||||
|
||||
# Rejects PRs whose branch has no common ancestor with main.
|
||||
#
|
||||
# In May 2026 PR #25045 was merged from a branch that had been disconnected
|
||||
# from main's history (likely an accidental `git checkout --orphan` or
|
||||
# `.git/` re-init). GitHub's merge UI does not refuse merges of unrelated
|
||||
# histories, so the PR landed cleanly with the intended one-file change —
|
||||
# but its parent-less root commit (413990c94) got grafted into main as a
|
||||
# second root, and ~1500 files' worth of `git blame` history collapsed
|
||||
# onto that single commit.
|
||||
#
|
||||
# This check catches the failure mode by requiring `git merge-base` between
|
||||
# the PR head and main to be non-empty.
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
check-common-ancestor:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||
with:
|
||||
fetch-depth: 0 # full history both sides for merge-base
|
||||
|
||||
- name: Reject PRs with no common ancestor on main
|
||||
run: |
|
||||
# `git merge-base` exits non-zero AND prints nothing when the two
|
||||
# commits share no ancestor. We check both conditions explicitly
|
||||
# so the failure message is clear regardless of which signal fires
|
||||
# first.
|
||||
if ! BASE=$(git merge-base origin/main HEAD 2>/dev/null) || [ -z "$BASE" ]; then
|
||||
echo ""
|
||||
echo "::error::This PR has no common ancestor with main."
|
||||
echo ""
|
||||
echo "Your branch's history is disconnected from main. Common causes:"
|
||||
echo " - the branch was created with 'git checkout --orphan'"
|
||||
echo " - '.git/' was re-initialized at some point during the work"
|
||||
echo " - the branch was force-pushed from an unrelated repository"
|
||||
echo ""
|
||||
echo "Merging an unrelated-history PR grafts a parent-less root commit"
|
||||
echo "into main and collapses git blame for every file in that snapshot."
|
||||
echo "Reference: PR #25045 caused this and re-rooted blame on ~1500"
|
||||
echo "files to a single orphan commit."
|
||||
echo ""
|
||||
echo "To fix, rebase your changes onto current main:"
|
||||
echo " git fetch origin main"
|
||||
echo " git checkout -b fix-branch origin/main"
|
||||
echo " # re-apply your changes (cherry-pick, copy files, etc.)"
|
||||
echo " git push -f origin fix-branch"
|
||||
exit 1
|
||||
fi
|
||||
echo "::notice::Common ancestor with main: $BASE"
|
||||
@@ -11,6 +11,7 @@ on:
|
||||
- '**/sitecustomize.py'
|
||||
- '**/usercustomize.py'
|
||||
- '**/__init__.pth'
|
||||
- 'pyproject.toml'
|
||||
|
||||
permissions:
|
||||
pull-requests: write
|
||||
@@ -137,3 +138,68 @@ jobs:
|
||||
run: |
|
||||
echo "::error::CRITICAL supply chain risk patterns detected in this PR. See the PR comment for details."
|
||||
exit 1
|
||||
|
||||
dep-bounds:
|
||||
name: Check PyPI dependency upper bounds
|
||||
runs-on: ubuntu-latest
|
||||
if: contains(github.event.pull_request.changed_files_url, 'pyproject.toml') || true
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for unbounded PyPI deps
|
||||
id: bounds
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
BASE="${{ github.event.pull_request.base.sha }}"
|
||||
HEAD="${{ github.event.pull_request.head.sha }}"
|
||||
|
||||
# Only check added lines in pyproject.toml
|
||||
ADDED=$(git diff "$BASE".."$HEAD" -- pyproject.toml | grep '^+' | grep -v '^+++' || true)
|
||||
|
||||
if [ -z "$ADDED" ]; then
|
||||
echo "found=false" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Match PyPI dep specs that have >= but no < ceiling.
|
||||
# Pattern: "package>=version" without a following ",<" bound.
|
||||
# Excludes git+ URLs (which use commit SHAs) and comments.
|
||||
UNBOUNDED=$(echo "$ADDED" | grep -oE '"[a-zA-Z0-9_-]+(\[[^\]]*\])?>=[ 0-9.]+"' | grep -v ',<' || true)
|
||||
|
||||
if [ -n "$UNBOUNDED" ]; then
|
||||
echo "found=true" >> "$GITHUB_OUTPUT"
|
||||
echo "$UNBOUNDED" > /tmp/unbounded.txt
|
||||
else
|
||||
echo "found=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
- name: Post unbounded dep warning
|
||||
if: steps.bounds.outputs.found == 'true'
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
BODY="## ⚠️ Unbounded PyPI Dependency Detected
|
||||
|
||||
This PR adds PyPI dependencies without a \`<next_major\` upper bound. Per our [supply chain policy](../blob/main/CONTRIBUTING.md#dependency-pinning-policy-supply-chain-hardening), all PyPI deps must be pinned as \`>=floor,<next_major\`.
|
||||
|
||||
**Unbounded specs found:**
|
||||
\`\`\`
|
||||
$(cat /tmp/unbounded.txt)
|
||||
\`\`\`
|
||||
|
||||
**Fix:** Add an upper bound, e.g. \`\"package>=1.2.0,<2\"\`
|
||||
|
||||
---
|
||||
*See PR #2810 and CONTRIBUTING.md for the full policy rationale.*"
|
||||
|
||||
gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY" || echo "::warning::Could not post PR comment (expected for fork PRs)"
|
||||
|
||||
- name: Fail on unbounded deps
|
||||
if: steps.bounds.outputs.found == 'true'
|
||||
run: |
|
||||
echo "::error::PyPI dependencies without upper bounds detected. Add <next_major ceiling per CONTRIBUTING.md policy."
|
||||
exit 1
|
||||
|
||||
@@ -0,0 +1,163 @@
|
||||
name: Publish to PyPI
|
||||
|
||||
# Triggered by CalVer tag pushes from scripts/release.py (e.g. v2026.5.15)
|
||||
# Can also be triggered manually from the Actions tab as an escape hatch.
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'v20*' # CalVer tags: v2026.5.15, v2026.5.15.2, etc.
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
confirm_tag:
|
||||
description: 'Tag to publish (e.g. v2026.5.15). Must already exist.'
|
||||
required: true
|
||||
type: string
|
||||
|
||||
# Restrict default token to read-only; each job escalates as needed.
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
# Prevent overlapping publishes (e.g. two same-day tags pushed quickly).
|
||||
concurrency:
|
||||
group: pypi-publish
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: Build distribution 📦
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||
with:
|
||||
persist-credentials: false
|
||||
# On workflow_dispatch, check out the confirmed tag.
|
||||
ref: ${{ inputs.confirm_tag || github.ref }}
|
||||
fetch-tags: true
|
||||
|
||||
- name: Validate tag exists
|
||||
if: github.event_name == 'workflow_dispatch'
|
||||
run: |
|
||||
if ! git tag -l "${{ inputs.confirm_tag }}" | grep -q .; then
|
||||
echo "::error::Tag '${{ inputs.confirm_tag }}' does not exist in the repo"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
|
||||
with:
|
||||
python-version: '3.13'
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e # v6
|
||||
|
||||
- name: Set up Node.js
|
||||
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
|
||||
with:
|
||||
node-version: '22'
|
||||
|
||||
- name: Build web dashboard
|
||||
run: cd web && npm ci && npm run build
|
||||
|
||||
- name: Build TUI bundle
|
||||
run: cd ui-tui && npm ci && npm run build
|
||||
|
||||
- name: Bundle TUI into hermes_cli
|
||||
run: |
|
||||
mkdir -p hermes_cli/tui_dist
|
||||
cp ui-tui/dist/entry.js hermes_cli/tui_dist/entry.js
|
||||
|
||||
- name: Verify frontend assets exist
|
||||
run: |
|
||||
test -f hermes_cli/web_dist/index.html || { echo "ERROR: web_dist not built"; exit 1; }
|
||||
test -f hermes_cli/tui_dist/entry.js || { echo "ERROR: tui_dist not built"; exit 1; }
|
||||
|
||||
- name: Bundle install.sh into wheel
|
||||
run: |
|
||||
mkdir -p hermes_cli/scripts
|
||||
cp scripts/install.sh hermes_cli/scripts/install.sh
|
||||
|
||||
- name: Build wheel and sdist
|
||||
run: uv build --sdist --wheel
|
||||
|
||||
- name: Upload distribution artifacts
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
|
||||
with:
|
||||
name: python-package-distributions
|
||||
path: dist/
|
||||
|
||||
publish:
|
||||
name: Publish to PyPI
|
||||
needs: build
|
||||
runs-on: ubuntu-latest
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/hermes-agent
|
||||
permissions:
|
||||
id-token: write # OIDC trusted publishing
|
||||
|
||||
steps:
|
||||
- name: Download distribution artifacts
|
||||
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
|
||||
with:
|
||||
name: python-package-distributions
|
||||
path: dist/
|
||||
|
||||
- name: Publish to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0
|
||||
with:
|
||||
skip-existing: true
|
||||
|
||||
sign:
|
||||
name: Sign and attach to GitHub Release
|
||||
# Only runs on tag pushes — release.py creates the GitHub Release,
|
||||
# and workflow_dispatch won't have a matching release to attach to.
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
needs: publish
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write # attach assets to the existing release
|
||||
id-token: write # sigstore signing
|
||||
|
||||
steps:
|
||||
- name: Download distribution artifacts
|
||||
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
|
||||
with:
|
||||
name: python-package-distributions
|
||||
path: dist/
|
||||
|
||||
- name: Wait for GitHub Release to exist
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ github.token }}
|
||||
# release.py creates the GitHub Release after pushing the tag,
|
||||
# but this workflow starts from the tag push — wait for it.
|
||||
run: |
|
||||
for i in $(seq 1 30); do
|
||||
if gh release view "$GITHUB_REF_NAME" --repo "$GITHUB_REPOSITORY" >/dev/null 2>&1; then
|
||||
echo "Release $GITHUB_REF_NAME found"
|
||||
exit 0
|
||||
fi
|
||||
echo "Waiting for release... ($i/30)"
|
||||
sleep 10
|
||||
done
|
||||
echo "::warning::Release $GITHUB_REF_NAME not found after 5 minutes — skipping signature upload"
|
||||
echo "skip_sign=true" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Sign with Sigstore
|
||||
if: env.skip_sign != 'true'
|
||||
uses: sigstore/gh-action-sigstore-python@f514d46b907ebcd5bedc05145c03b69c1edd8b46 # v3.0.0
|
||||
with:
|
||||
inputs: >-
|
||||
./dist/*.tar.gz
|
||||
./dist/*.whl
|
||||
|
||||
- name: Attach signed artifacts to GitHub Release
|
||||
if: env.skip_sign != 'true'
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ github.token }}
|
||||
# release.py already created the GitHub Release — just upload
|
||||
# the Sigstore signatures alongside the existing assets.
|
||||
run: >-
|
||||
gh release upload
|
||||
"$GITHUB_REF_NAME" dist/*.sigstore.json
|
||||
--repo "$GITHUB_REPOSITORY"
|
||||
--clobber
|
||||
@@ -70,3 +70,6 @@ mini-swe-agent/
|
||||
result
|
||||
website/static/api/skills-index.json
|
||||
models-dev-upstream/
|
||||
hermes_cli/tui_dist/*
|
||||
hermes_cli/scripts/
|
||||
docs/superpowers/*
|
||||
@@ -1,3 +0,0 @@
|
||||
[submodule "tinker-atropos"]
|
||||
path = tinker-atropos
|
||||
url = https://github.com/nousresearch/tinker-atropos
|
||||
@@ -56,7 +56,6 @@ hermes-agent/
|
||||
├── tui_gateway/ # Python JSON-RPC backend for the TUI
|
||||
├── acp_adapter/ # ACP server (VS Code / Zed / JetBrains integration)
|
||||
├── cron/ # Scheduler — jobs.py, scheduler.py
|
||||
├── environments/ # RL training environments (Atropos)
|
||||
├── scripts/ # run_tests.sh, release.py, auxiliary scripts
|
||||
├── website/ # Docusaurus docs site
|
||||
└── tests/ # Pytest suite (~17k tests across ~900 files as of May 2026)
|
||||
@@ -309,6 +308,29 @@ The registry handles schema collection, dispatch, availability checking, and err
|
||||
|
||||
---
|
||||
|
||||
## Dependency Pinning Policy
|
||||
|
||||
All dependencies must have upper bounds to limit supply-chain attack surface.
|
||||
This policy was established after the litellm compromise (PR #2796, #2810) and
|
||||
reinforced after the Mini Shai-Hulud worm campaign (May 2026).
|
||||
|
||||
| Source type | Treatment | Example |
|
||||
|---|---|---|
|
||||
| PyPI package | `>=floor,<next_major` | `"httpx>=0.28.1,<1"` |
|
||||
| Git URL | Commit SHA | `git+https://...@<40-char-sha>` |
|
||||
| GitHub Actions | Commit SHA + comment | `uses: actions/checkout@<sha> # v4` |
|
||||
| CI-only pip | `==exact` | `pyyaml==6.0.2` |
|
||||
|
||||
**When adding a new dependency to `pyproject.toml`:**
|
||||
1. Pin to `>=current_version,<next_major` for post-1.0 (e.g. `>=1.5.0,<2`).
|
||||
2. For pre-1.0 packages, use `<0.(current_minor + 2)` (e.g. `>=0.29,<0.32`).
|
||||
3. Never commit a bare `>=X.Y.Z` without a ceiling — CI and reviewers will reject it.
|
||||
4. Run `uv lock` to regenerate `uv.lock` with hashes.
|
||||
|
||||
Reference: #2810 (bounds pass), #9801 (SHA pinning + audit CI).
|
||||
|
||||
---
|
||||
|
||||
## Adding Configuration
|
||||
|
||||
### config.yaml options:
|
||||
|
||||
+41
-4
@@ -91,9 +91,6 @@ export VIRTUAL_ENV="$(pwd)/venv"
|
||||
# Install with all extras (messaging, cron, CLI menus, dev tools)
|
||||
uv pip install -e ".[all,dev]"
|
||||
|
||||
# Optional: RL training submodule
|
||||
# git submodule update --init tinker-atropos && uv pip install -e "./tinker-atropos"
|
||||
|
||||
# Optional: browser tools
|
||||
npm install
|
||||
```
|
||||
@@ -196,7 +193,6 @@ hermes-agent/
|
||||
│
|
||||
├── skills/ # Bundled skills (copied to ~/.hermes/skills/ on install)
|
||||
├── optional-skills/ # Official optional skills (discoverable via hub, not activated by default)
|
||||
├── environments/ # RL training environments (Atropos integration)
|
||||
├── tests/ # Test suite
|
||||
├── website/ # Documentation site (hermes-agent.nousresearch.com)
|
||||
│
|
||||
@@ -804,6 +800,47 @@ Hermes has terminal access. Security matters.
|
||||
|
||||
If your PR affects security, note it explicitly in the description.
|
||||
|
||||
### Dependency pinning policy (supply chain hardening)
|
||||
|
||||
After the [litellm supply chain compromise](https://github.com/BerriAI/litellm/issues/24512) in March 2026 and the [Mini Shai-Hulud worm campaign](https://socket.dev/blog/tanstack-npm-packages-compromised-mini-shai-hulud-supply-chain-attack) in May 2026, all dependencies must follow these rules:
|
||||
|
||||
| Source type | Required treatment | Rationale |
|
||||
|---|---|---|
|
||||
| **PyPI package** | `>=floor,<next_major` | PyPI versions are immutable once published, but new versions can be pushed into your range. A `<next_major` ceiling stops a 1.x install from upgrading to a malicious 2.0.0. |
|
||||
| **Git URL** (atroposlib, tinker, yc-bench, Baileys) | Full commit SHA | Branches and tags are mutable refs; SHA is content-addressed. |
|
||||
| **GitHub Actions** | Full commit SHA + version comment | Action tags are mutable refs (e.g. tj-actions/changed-files March 2025). Pin as `uses: owner/action@<sha> # vX.Y.Z` |
|
||||
| **CI-only pip installs** | `==exact` | Hermetic CI builds; churn is acceptable. |
|
||||
|
||||
**Every new PyPI dependency in a PR must have a `<next_major` upper bound.** PRs adding unbounded `>=X.Y.Z` specs will be rejected by reviewers. The `supply-chain-audit.yml` CI workflow also flags dependency manifest changes for manual review.
|
||||
|
||||
**How to determine the ceiling:**
|
||||
- If the package is at version `1.x.y`, use `<2`.
|
||||
- If the package is at version `0.x.y` (pre-1.0), use `<0.(current_minor + 2)` — e.g. if current is `0.29.x`, use `<0.32`. This gives ~2 minor versions of headroom while keeping the window small enough that a hostile takeover version is unlikely to land inside it.
|
||||
- Exception: packages with very stable APIs (e.g. `aiohttp-socks`) can use `<1` at reviewer discretion.
|
||||
|
||||
**Examples:**
|
||||
```toml
|
||||
# ✅ Correct — post-1.0
|
||||
"openai>=2.21.0,<3"
|
||||
"pydantic>=2.12.5,<3"
|
||||
|
||||
# ✅ Correct — pre-1.0 (tight minor window)
|
||||
"asyncpg>=0.29,<0.32"
|
||||
"aiosqlite>=0.20,<0.23"
|
||||
"hindsight-client>=0.4.22,<0.5"
|
||||
|
||||
# ❌ Rejected — no upper bound
|
||||
"some-package>=1.2.3"
|
||||
|
||||
# ❌ Rejected — too tight (blocks legitimate patches)
|
||||
"some-package==1.2.3"
|
||||
|
||||
# ❌ Rejected — too loose for pre-1.0 (allows 80 minor versions)
|
||||
"some-package>=0.20,<1"
|
||||
```
|
||||
|
||||
**Reference PRs:** #2796 (litellm removal), #2810 (upper bounds pass), #9801 (SHA pinning + supply-chain-audit CI).
|
||||
|
||||
---
|
||||
|
||||
## Pull Request Process
|
||||
|
||||
@@ -23,7 +23,7 @@ Use any model you want — [Nous Portal](https://portal.nousresearch.com), [Open
|
||||
<tr><td><b>Scheduled automations</b></td><td>Built-in cron scheduler with delivery to any platform. Daily reports, nightly backups, weekly audits — all in natural language, running unattended.</td></tr>
|
||||
<tr><td><b>Delegates and parallelizes</b></td><td>Spawn isolated subagents for parallel workstreams. Write Python scripts that call tools via RPC, collapsing multi-step pipelines into zero-context-cost turns.</td></tr>
|
||||
<tr><td><b>Runs anywhere, not just your laptop</b></td><td>Seven terminal backends — local, Docker, SSH, Singularity, Modal, Daytona, and Vercel Sandbox. Daytona and Modal offer serverless persistence — your agent's environment hibernates when idle and wakes on demand, costing nearly nothing between sessions. Run it on a $5 VPS or a GPU cluster.</td></tr>
|
||||
<tr><td><b>Research-ready</b></td><td>Batch trajectory generation, Atropos RL environments, trajectory compression for training the next generation of tool-calling models.</td></tr>
|
||||
<tr><td><b>Research-ready</b></td><td>Batch trajectory generation, trajectory compression for training the next generation of tool-calling models.</td></tr>
|
||||
</table>
|
||||
|
||||
---
|
||||
@@ -175,8 +175,6 @@ uv pip install -e ".[all,dev]"
|
||||
scripts/run_tests.sh
|
||||
```
|
||||
|
||||
> **RL Training (optional):** The RL/Atropos integration (`environments/`) — see [`CONTRIBUTING.md`](https://github.com/NousResearch/hermes-agent/blob/main/CONTRIBUTING.md#development-setup) for the full setup.
|
||||
|
||||
---
|
||||
|
||||
## Community
|
||||
|
||||
+1
-7
@@ -23,7 +23,7 @@
|
||||
<tr><td><b>定时自动化</b></td><td>内置 cron 调度器,支持向任何平台投递。日报、夜间备份、周审计——全部用自然语言描述,无人值守运行。</td></tr>
|
||||
<tr><td><b>委派与并行</b></td><td>生成隔离子代理处理并行工作流。编写 Python 脚本通过 RPC 调用工具,将多步管道压缩为零上下文开销的轮次。</td></tr>
|
||||
<tr><td><b>随处运行</b></td><td>六种终端后端——本地、Docker、SSH、Daytona、Singularity 和 Modal。Daytona 和 Modal 提供 Serverless 持久化——代理环境空闲时休眠、按需唤醒,空闲期间几乎零成本。$5 VPS 或 GPU 集群都能跑。</td></tr>
|
||||
<tr><td><b>研究就绪</b></td><td>批量轨迹生成、Atropos RL 环境、轨迹压缩——用于训练下一代工具调用模型。</td></tr>
|
||||
<tr><td><b>研究就绪</b></td><td>批量轨迹生成、轨迹压缩——用于训练下一代工具调用模型。</td></tr>
|
||||
</table>
|
||||
|
||||
---
|
||||
@@ -161,12 +161,6 @@ uv pip install -e ".[all,dev]"
|
||||
python -m pytest tests/ -q
|
||||
```
|
||||
|
||||
> **RL 训练(可选):** 如需参与 RL/Tinker-Atropos 集成开发:
|
||||
> ```bash
|
||||
> git submodule update --init tinker-atropos
|
||||
> uv pip install -e "./tinker-atropos"
|
||||
> ```
|
||||
|
||||
---
|
||||
|
||||
## 社区
|
||||
|
||||
@@ -0,0 +1,477 @@
|
||||
# Hermes Agent v0.14.0 (v2026.5.16)
|
||||
|
||||
**Release Date:** May 16, 2026
|
||||
**Since v0.13.0:** 808 commits · 633 merged PRs · 1393 files changed · 165,061 insertions · 545 issues closed (12 P0, 50 P1) · 215 community contributors (including co-authors)
|
||||
|
||||
> The Foundation Release — Hermes Agent installs and runs anywhere now. Native Windows ships in early beta with a full PowerShell installer story, a `pip install hermes-agent` wheel lands on PyPI, lazy-deps reshape what `pip install hermes-agent` actually pulls down, the supply-chain checker scans every install/upgrade for unsafe versions, and a new OpenAI-compatible local proxy lets Codex / Aider / Cline talk to OAuth-only providers (Claude Pro, ChatGPT Pro, SuperGrok). The cold-start wave shaves ~19 seconds off `hermes` launch, browser-tool CDP calls run 180x faster, and `hermes tools` All-Platforms drops from 14s to under 1.5s. Two new messaging platforms (LINE and SimpleX Chat) and a Microsoft Graph foundation (Teams pipeline + webhook adapter) land alongside `/handoff` that finally transfers sessions live, `vision_analyze` passing pixels through to vision-capable models, `x_search` as a first-class tool, LSP semantic diagnostics on every `write_file` / `patch`, a unified pluggable `video_generate`, a `computer_use` cua-driver backend, cross-session 1-hour Claude prompt caching, a per-turn file-mutation verifier, plus 9 new optional skills. 50+ P1 closures, 12 P0 closures.
|
||||
|
||||
---
|
||||
|
||||
## ✨ Highlights
|
||||
|
||||
- **Native Windows support (early beta)** — full PowerShell installer, native subprocess/PTY paths, taskkill-based process management, MinGit auto-install, Microsoft Store python stub detection, foreground Ctrl+C preservation, taskkill+ps2 fallback, npm prefix handling, and ~40 follow-up Windows-only fixes across CLI / gateway / TUI / curator / tools. Hermes finally runs natively on `cmd.exe` and PowerShell, no WSL required. ([#21561](https://github.com/NousResearch/hermes-agent/pull/21561), [#22130](https://github.com/NousResearch/hermes-agent/pull/22130), [#22752](https://github.com/NousResearch/hermes-agent/pull/22752), [#26618](https://github.com/NousResearch/hermes-agent/pull/26618), and many more)
|
||||
|
||||
- **`pip install hermes-agent && hermes`** — Hermes Agent is now a real PyPI package. One command, no clone, no git, no shell installer. Wheel includes the Ink TUI bundle and shell launcher. (salvage of [#26350](https://github.com/NousResearch/hermes-agent/pull/26350)) ([#26593](https://github.com/NousResearch/hermes-agent/pull/26593))
|
||||
|
||||
- **Cold-start performance wave — ~19s off `hermes` launch** — skills cache, lazy Feishu import, no Nous HTTP at startup, plus PEP-562 lazy adapter imports (QQ, Yuanbao, Teams, Google Chat), deferred `fal_client` / `google-cloud` / `httpx` loads, models.dev disk-cache-first lookup, parallel doctor API checks, eager-skip plugin discovery on built-in subcommands, `hermes tools` All-Platforms drops from 14s to <1.5s, welcome banner skipped on `chat -q`. ([#22138](https://github.com/NousResearch/hermes-agent/pull/22138), [#22120](https://github.com/NousResearch/hermes-agent/pull/22120), [#22681](https://github.com/NousResearch/hermes-agent/pull/22681), [#22790](https://github.com/NousResearch/hermes-agent/pull/22790), [#22808](https://github.com/NousResearch/hermes-agent/pull/22808), [#22831](https://github.com/NousResearch/hermes-agent/pull/22831), [#22859](https://github.com/NousResearch/hermes-agent/pull/22859), [#22904](https://github.com/NousResearch/hermes-agent/pull/22904), [#22766](https://github.com/NousResearch/hermes-agent/pull/22766), [#25341](https://github.com/NousResearch/hermes-agent/pull/25341))
|
||||
|
||||
- **180x faster `browser_console` evaluations** — routed through the supervisor's persistent CDP WebSocket instead of spawning a fresh DevTools session per call. Real-world page interactions feel instant. ([#23226](https://github.com/NousResearch/hermes-agent/pull/23226))
|
||||
|
||||
- **Supply-chain advisory checker + lazy-deps framework + tiered install fallback** — every `pip install` / `hermes update` scans dependencies against an advisory list, lazy-deps replace heavy import-time loads with first-use installs, and the installer falls back through extras tiers when a wheel rejects on the target platform. ([#24220](https://github.com/NousResearch/hermes-agent/pull/24220))
|
||||
|
||||
- **OpenAI-compatible local proxy** — `hermes proxy` exposes any OAuth-authed provider (Claude Pro, ChatGPT Pro, SuperGrok) as an OpenAI-compatible endpoint that Codex / Aider / Cline / VS Code Continue can hit. Your subscription, your tools. ([#25969](https://github.com/NousResearch/hermes-agent/pull/25969))
|
||||
|
||||
- **Cross-session 1-hour Claude prompt cache** — Anthropic / OpenRouter / Nous Portal now share a 1h prefix cache across sessions for Claude models. Fast resume, fast `/new`, lower cost on repeat work. ([#23828](https://github.com/NousResearch/hermes-agent/pull/23828))
|
||||
|
||||
- **Two new messaging platforms — LINE + SimpleX Chat** — LINE Messaging API lands as a first-class platform, SimpleX Chat salvages #2558 onto the modern adapter spec. Hermes is now on 22 platforms. ([#23197](https://github.com/NousResearch/hermes-agent/pull/23197), [#26232](https://github.com/NousResearch/hermes-agent/pull/26232))
|
||||
|
||||
- **Microsoft Graph foundation — Teams pipeline + webhook adapter** — `msgraph` auth/client foundation, webhook listener platform, Teams pipeline plugin runtime, and Teams outbound delivery via the existing adapter — Hermes can now read and post to Teams. (salvages of #21408–#21411) ([#21922](https://github.com/NousResearch/hermes-agent/pull/21922), [#21969](https://github.com/NousResearch/hermes-agent/pull/21969), [#22007](https://github.com/NousResearch/hermes-agent/pull/22007), [#22024](https://github.com/NousResearch/hermes-agent/pull/22024))
|
||||
|
||||
- **`/handoff` actually transfers the session live** — the agent's active session moves to a different model / persona / profile mid-conversation, with messages, tool history, and context preserved. ([#23395](https://github.com/NousResearch/hermes-agent/pull/23395))
|
||||
|
||||
- **`x_search` — first-class X (Twitter) search tool** — gated tool with OAuth-or-API-key auth, no skill needed to query the timeline. ([#26763](https://github.com/NousResearch/hermes-agent/pull/26763))
|
||||
|
||||
- **`vision_analyze` returns pixels to vision-capable models** — when the active model can see, `vision_analyze` now hands the image straight through instead of falling back to a text description. ([#22955](https://github.com/NousResearch/hermes-agent/pull/22955))
|
||||
|
||||
- **LSP semantic diagnostics on every write** — `write_file` and `patch` now run real language-server diagnostics on the post-edit file (delta-only) and surface real errors before they ship downstream. ([#24168](https://github.com/NousResearch/hermes-agent/pull/24168), [#25978](https://github.com/NousResearch/hermes-agent/pull/25978))
|
||||
|
||||
- **Per-turn file-mutation verifier footer** — after every turn that wrote files, the agent gets a verifier footer summarizing what actually changed on disk — catches silent overwrites and "wrote it but it didn't land" bugs. ([#24498](https://github.com/NousResearch/hermes-agent/pull/24498))
|
||||
|
||||
- **Unified `video_generate` with pluggable provider backends** — single tool, any backend. Drop in a new video provider as a plugin, no core changes. ([#25126](https://github.com/NousResearch/hermes-agent/pull/25126))
|
||||
|
||||
- **`computer_use` cua-driver backend** — proper focus-safe ops, non-Anthropic provider support, refresh on `hermes update`. Computer-use is no longer locked to a single SDK. (re-salvage of #16936) ([#21967](https://github.com/NousResearch/hermes-agent/pull/21967), [#24063](https://github.com/NousResearch/hermes-agent/pull/24063))
|
||||
|
||||
- **xAI Grok OAuth provider — SuperGrok via subscription** — sign in with your xAI account, talk to Grok models from Hermes. ([#26534](https://github.com/NousResearch/hermes-agent/pull/26534))
|
||||
|
||||
- **Clarify with buttons — native inline keyboards on Telegram + Discord** — the `clarify` tool renders multi-choice prompts as platform-native buttons instead of typed responses. ([#24199](https://github.com/NousResearch/hermes-agent/pull/24199), [#25485](https://github.com/NousResearch/hermes-agent/pull/25485))
|
||||
|
||||
- **Discord channel history backfill (default on)** — Hermes reads recent channel history when joining a thread so it actually knows what's been said. ([#25984](https://github.com/NousResearch/hermes-agent/pull/25984))
|
||||
|
||||
- **Watchers skill — RSS / HTTP JSON / GitHub polling via cron `no_agent` mode** — skill recipes that wire change-detection sources directly into cron's script-only watchdog mode. ([#21881](https://github.com/NousResearch/hermes-agent/pull/21881))
|
||||
|
||||
- **Zed ACP Registry integration + uvx distribution** — Hermes is in the Zed registry, installable via `uvx` (no npm). Plus `hermes acp --setup-browser` bootstraps browser tools for registry installs. (salvage of [#25908](https://github.com/NousResearch/hermes-agent/pull/25908)) ([#26079](https://github.com/NousResearch/hermes-agent/pull/26079), [#26120](https://github.com/NousResearch/hermes-agent/pull/26120), [#26234](https://github.com/NousResearch/hermes-agent/pull/26234))
|
||||
|
||||
- **OpenRouter Pareto Code router** — wire a new OpenRouter router with `min_coding_score` knob. Pick the cheapest model that meets your quality bar. ([#22838](https://github.com/NousResearch/hermes-agent/pull/22838))
|
||||
|
||||
- **Optional codex app-server runtime for OpenAI/Codex models** — drives the OpenAI Codex CLI under the hood for OpenAI/Codex paths, with session reuse, wedge retirement, and OAuth refresh classification. ([#24182](https://github.com/NousResearch/hermes-agent/pull/24182), [#25769](https://github.com/NousResearch/hermes-agent/pull/25769))
|
||||
|
||||
- **`hermes-skills/huggingface` as a trusted default tap** — community skills index from huggingface.co/skills is available by default in the Skills Hub. ([#26219](https://github.com/NousResearch/hermes-agent/pull/26219))
|
||||
|
||||
- **9 new optional skills** — Hyperliquid (perp/spot trading via SDK + REST) (@kshitijk4poor & Hermes), Yahoo Finance market data, api-testing (REST/GraphQL debug), unified EVM multi-chain skill (folds #25291 + #2010 + base/), darwinian-evolver, osint-investigation (closes #355), pinggy-tunnel, watchers (RSS/HTTP/GitHub via cron), Notion overhaul for the Developer Platform (May 2026). ([#23582](https://github.com/NousResearch/hermes-agent/pull/23582), [#23583](https://github.com/NousResearch/hermes-agent/pull/23583), [#23590](https://github.com/NousResearch/hermes-agent/pull/23590), [#25299](https://github.com/NousResearch/hermes-agent/pull/25299), [#26760](https://github.com/NousResearch/hermes-agent/pull/26760), [#26729](https://github.com/NousResearch/hermes-agent/pull/26729), [#26765](https://github.com/NousResearch/hermes-agent/pull/26765), [#21881](https://github.com/NousResearch/hermes-agent/pull/21881), [#26612](https://github.com/NousResearch/hermes-agent/pull/26612))
|
||||
|
||||
- **API server exposes run approval events** — long-running runs surface approval requests over the API stream, no more silent stalls. (salvage of [#20311](https://github.com/NousResearch/hermes-agent/pull/20311)) ([#21899](https://github.com/NousResearch/hermes-agent/pull/21899))
|
||||
|
||||
- **`/subgoal` — user-added criteria appended to active `/goal`** — layer extra success criteria onto a running goal loop. The judge sees them in the prompt, no behavior change when subgoals are empty. ([#25449](https://github.com/NousResearch/hermes-agent/pull/25449))
|
||||
|
||||
- **Plugins can run any LLM call via `ctx.llm`** — plugins get a first-class hook to make their own LLM requests through the active provider/credentials, no manual wiring. Plus `tool_override` flag for replacing built-in tools. ([#23194](https://github.com/NousResearch/hermes-agent/pull/23194), [#26759](https://github.com/NousResearch/hermes-agent/pull/26759))
|
||||
|
||||
- **Brave Search (free tier) + DuckDuckGo (DDGS) as web-search providers** — two new free search backends alongside Tavily / SearXNG / Exa. ([#21337](https://github.com/NousResearch/hermes-agent/pull/21337))
|
||||
|
||||
- **Sudo brute-force block + sudo-stdin/askpass DANGEROUS classification** — closes the `sudo -S` brute-force avenue; approval gates classify stdin-fed and askpass-stripped sudo invocations as dangerous. (salvages of #22194 + #21128) ([#23736](https://github.com/NousResearch/hermes-agent/pull/23736))
|
||||
|
||||
- **Provider rename — Alibaba Cloud → Qwen Cloud, picker reorder** — matches what the world calls it. Existing config keys still work. ([#24835](https://github.com/NousResearch/hermes-agent/pull/24835))
|
||||
|
||||
|
||||
---
|
||||
|
||||
## 🪟 Windows — Native Support (Early Beta)
|
||||
|
||||
### Bootstrap & installer
|
||||
- **Native Windows support (early beta)** — first-class native Windows path across CLI / gateway / TUI / tools ([#21561](https://github.com/NousResearch/hermes-agent/pull/21561))
|
||||
- **PyPI wheel packaging — `pip install hermes-agent && hermes`** (salvage of #26350) ([#26593](https://github.com/NousResearch/hermes-agent/pull/26593))
|
||||
- **Recognise Shift+Enter as a newline key** + Windows docs (salvage #21545) ([#22130](https://github.com/NousResearch/hermes-agent/pull/22130))
|
||||
- **Preserve Ctrl+C for Windows foreground runs** (@helix4u) ([#22752](https://github.com/NousResearch/hermes-agent/pull/22752))
|
||||
- **Stop spamming cwd-missing + tirith-spawn warnings on every terminal call** ([#26618](https://github.com/NousResearch/hermes-agent/pull/26618))
|
||||
- **Use `--extra all` not `--all-extras`; drop lazy-covered extras from `[all]`** ([#24515](https://github.com/NousResearch/hermes-agent/pull/24515))
|
||||
|
||||
### Windows-specific fixes (40+ across cli / tools / gateway / curator / TUI)
|
||||
A long tail of native-Windows fixes shipped alongside the beta — taskkill-based subprocess management, MinGit auto-install, Microsoft Store python stub detection, npm prefix handling, native PTY paths, signal handling differences, foreground process management, ANSI sequence handling, path normalization, file-locking semantics, and many more. Full list in commit log under `fix(windows)` / `feat(windows)` / `windows`.
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Performance Wave
|
||||
|
||||
### Cold start
|
||||
- **Cut ~19s from `hermes` cold start** — skills cache + lazy Feishu + no Nous HTTP at startup ([#22138](https://github.com/NousResearch/hermes-agent/pull/22138))
|
||||
- **Skip eager plugin discovery on known built-in subcommands** ([#22120](https://github.com/NousResearch/hermes-agent/pull/22120))
|
||||
- **Cache Nous auth + .env loads** — `hermes tools` All Platforms from 14s to <1.5s ([#25341](https://github.com/NousResearch/hermes-agent/pull/25341))
|
||||
- **Skip welcome banner on `chat -q` single-query mode** ([#22904](https://github.com/NousResearch/hermes-agent/pull/22904))
|
||||
- **Defer heavy google-cloud imports in google_chat to first adapter use** ([#22681](https://github.com/NousResearch/hermes-agent/pull/22681))
|
||||
- **Defer QQAdapter and YuanbaoAdapter imports via PEP 562** ([#22790](https://github.com/NousResearch/hermes-agent/pull/22790))
|
||||
- **Defer httpx import in teams to first webhook call** ([#22831](https://github.com/NousResearch/hermes-agent/pull/22831))
|
||||
- **Defer fal_client import to first generation request** ([#22859](https://github.com/NousResearch/hermes-agent/pull/22859))
|
||||
- **models.dev cache-first lookup, skip network when disk cache is fresh** ([#22808](https://github.com/NousResearch/hermes-agent/pull/22808))
|
||||
- **Parallelize API connectivity checks in `hermes doctor` and disable IMDS** ([#22766](https://github.com/NousResearch/hermes-agent/pull/22766))
|
||||
|
||||
### Runtime
|
||||
- **180x faster `browser_console` evaluations** — route through supervisor's persistent CDP WebSocket ([#23226](https://github.com/NousResearch/hermes-agent/pull/23226))
|
||||
- **Tune Telegram cadence + adaptive fast-path for short replies** (salvage of #10388) ([#23587](https://github.com/NousResearch/hermes-agent/pull/23587))
|
||||
- **Accumulate length-continuation prefix via list+join** ([#26237](https://github.com/NousResearch/hermes-agent/pull/26237))
|
||||
|
||||
### Prompt caching
|
||||
- **Cross-session 1h prefix cache for Claude on Anthropic / OpenRouter / Nous Portal** ([#23828](https://github.com/NousResearch/hermes-agent/pull/23828))
|
||||
- **Hit prefix cache in background review fork** (salvage #17276 + #25427) ([#25434](https://github.com/NousResearch/hermes-agent/pull/25434))
|
||||
|
||||
---
|
||||
|
||||
## 📦 Installation & Distribution
|
||||
|
||||
### PyPI + supply-chain
|
||||
- **PyPI wheel packaging — `pip install hermes-agent && hermes`** (salvage of #26350) ([#26593](https://github.com/NousResearch/hermes-agent/pull/26593))
|
||||
- **Supply-chain advisory checker + lazy-install framework + tiered install fallback** ([#24220](https://github.com/NousResearch/hermes-agent/pull/24220))
|
||||
- **Use `--extra all` not `--all-extras`; drop lazy-covered extras from `[all]`** ([#24515](https://github.com/NousResearch/hermes-agent/pull/24515))
|
||||
- **Skip browser download when system chromium exists** (@helix4u) ([#25317](https://github.com/NousResearch/hermes-agent/pull/25317))
|
||||
|
||||
### Nix
|
||||
- **`extraDependencyGroups` for sealed venv extras** (@alt-glitch) ([#21817](https://github.com/NousResearch/hermes-agent/pull/21817))
|
||||
- **Refresh npm lockfile hashes** — keeps Nix flake builds reproducible
|
||||
|
||||
### Docker
|
||||
- **Bootstrap auth.json from env on first boot** ([#21880](https://github.com/NousResearch/hermes-agent/pull/21880))
|
||||
- **Drop manual @hermes/ink build, rely on esbuild bundle** — slimmer image
|
||||
|
||||
### ACP / Zed
|
||||
- **Zed ACP Registry integration** (salvage of #25908) ([#26079](https://github.com/NousResearch/hermes-agent/pull/26079))
|
||||
- **Switch to uvx distribution, drop npm launcher** ([#26120](https://github.com/NousResearch/hermes-agent/pull/26120))
|
||||
- **`hermes acp --setup-browser` bootstraps browser tools for registry installs** ([#26234](https://github.com/NousResearch/hermes-agent/pull/26234))
|
||||
|
||||
---
|
||||
|
||||
## 🏗️ Core Agent & Architecture
|
||||
|
||||
### Sessions & handoff
|
||||
- **`/handoff` actually transfers the session live** ([#23395](https://github.com/NousResearch/hermes-agent/pull/23395))
|
||||
- **Expose `HERMES_SESSION_ID` env var to agent tools** (@alt-glitch) ([#23847](https://github.com/NousResearch/hermes-agent/pull/23847))
|
||||
|
||||
### Goals (Ralph loop)
|
||||
- **`/subgoal` — user-added criteria appended to active `/goal`** ([#25449](https://github.com/NousResearch/hermes-agent/pull/25449))
|
||||
- **`/goal` checklist + /subgoal user controls** ([#23456](https://github.com/NousResearch/hermes-agent/pull/23456)) — rolled back in window ([#23813](https://github.com/NousResearch/hermes-agent/pull/23813)); /subgoal returned in simpler form via #25449
|
||||
|
||||
### Compression
|
||||
- **Make `protect_first_n` configurable** ([#25447](https://github.com/NousResearch/hermes-agent/pull/25447))
|
||||
|
||||
### Verification
|
||||
- **Per-turn file-mutation verifier footer** ([#24498](https://github.com/NousResearch/hermes-agent/pull/24498))
|
||||
|
||||
### Stream retry
|
||||
- **Log inner cause, upstream headers, bytes/elapsed on every drop** ([#23005](https://github.com/NousResearch/hermes-agent/pull/23005))
|
||||
|
||||
---
|
||||
|
||||
## 🤖 Models & Providers
|
||||
|
||||
### New providers
|
||||
- **xAI Grok OAuth (SuperGrok Subscription) provider** ([#26534](https://github.com/NousResearch/hermes-agent/pull/26534))
|
||||
- **NovitaAI provider** (salvage #7219) (@kshitijk4poor) ([#25507](https://github.com/NousResearch/hermes-agent/pull/25507))
|
||||
- **NVIDIA NIM billing origin header** (salvage #25211) ([#26585](https://github.com/NousResearch/hermes-agent/pull/26585))
|
||||
|
||||
### Provider work
|
||||
- **OpenRouter Pareto Code router with `min_coding_score` knob** ([#22838](https://github.com/NousResearch/hermes-agent/pull/22838))
|
||||
- **Optional codex app-server runtime for OpenAI/Codex models** ([#24182](https://github.com/NousResearch/hermes-agent/pull/24182))
|
||||
- **Codex-runtime: retire wedged sessions + post-tool watchdog + OAuth refresh classify** ([#25769](https://github.com/NousResearch/hermes-agent/pull/25769))
|
||||
- **Codex-runtime: skip unavailable plugins during migration** ([#25437](https://github.com/NousResearch/hermes-agent/pull/25437))
|
||||
- **Codex-runtime: de-dup `[plugins.X]` tables and stop leaking HERMES_HOME into config.toml** (#26250) (@kshitijk4poor) ([#26260](https://github.com/NousResearch/hermes-agent/pull/26260))
|
||||
- **Pass `reasoning.effort` to xAI Responses API** ([#22807](https://github.com/NousResearch/hermes-agent/pull/22807))
|
||||
- **Custom provider: prompt and persist explicit `api_mode`** ([#25068](https://github.com/NousResearch/hermes-agent/pull/25068))
|
||||
- **Rename Alibaba Cloud → Qwen Cloud, reorder picker** ([#24835](https://github.com/NousResearch/hermes-agent/pull/24835))
|
||||
- **Restore gpt-5.3-codex-spark for ChatGPT Pro** (salvage #18286 + #19530, fixes #16172) (@kshitijk4poor) ([#22991](https://github.com/NousResearch/hermes-agent/pull/22991))
|
||||
- **Inject tool-use enforcement for GLM models** ([#24715](https://github.com/NousResearch/hermes-agent/pull/24715))
|
||||
- **Use Nous Portal as model metadata authority** (@rob-maron) ([#24502](https://github.com/NousResearch/hermes-agent/pull/24502))
|
||||
- **Unified `client=hermes-client-v<version>` tag on every Portal request** ([#24779](https://github.com/NousResearch/hermes-agent/pull/24779))
|
||||
- **Prevent stale Ollama credentials after provider switch** (@kshitijk4poor) ([#21703](https://github.com/NousResearch/hermes-agent/pull/21703))
|
||||
- **Auxiliary client: rotate pooled auth after quota failures** (salvage #22779) ([#22792](https://github.com/NousResearch/hermes-agent/pull/22792))
|
||||
- **Auxiliary client: skip providers without credentials immediately** (#25395) ([#25487](https://github.com/NousResearch/hermes-agent/pull/25487))
|
||||
- **Auth: send Nous refresh token via header** (@shannonsands) ([#21578](https://github.com/NousResearch/hermes-agent/pull/21578))
|
||||
- **MiniMax: harden OAuth dashboard and runtime** ([#24165](https://github.com/NousResearch/hermes-agent/pull/24165))
|
||||
|
||||
### OpenAI-compatible proxy
|
||||
- **Local OpenAI-compatible proxy for OAuth providers** — Codex / Aider / Cline can hit Claude Pro, ChatGPT Pro, SuperGrok ([#25969](https://github.com/NousResearch/hermes-agent/pull/25969))
|
||||
|
||||
---
|
||||
|
||||
## 📱 Messaging Platforms (Gateway)
|
||||
|
||||
### New platforms
|
||||
- **LINE Messaging API platform plugin** ([#23197](https://github.com/NousResearch/hermes-agent/pull/23197))
|
||||
- **SimpleX Chat platform plugin** (salvages #2558) ([#26232](https://github.com/NousResearch/hermes-agent/pull/26232))
|
||||
|
||||
### Microsoft Graph foundation
|
||||
- **msgraph: add auth and client foundation** (salvage of #21408) ([#21922](https://github.com/NousResearch/hermes-agent/pull/21922))
|
||||
- **msgraph: add webhook listener platform** (salvage of #21409) ([#21969](https://github.com/NousResearch/hermes-agent/pull/21969))
|
||||
- **teams-pipeline: add plugin runtime and operator cli** (salvage of #21410) ([#22007](https://github.com/NousResearch/hermes-agent/pull/22007))
|
||||
- **teams: add pipeline outbound delivery via existing adapter** (salvage of #21411) ([#22024](https://github.com/NousResearch/hermes-agent/pull/22024))
|
||||
|
||||
### Cross-platform
|
||||
- **Per-platform admin/user split for slash commands** (salvage of #4443) ([#23373](https://github.com/NousResearch/hermes-agent/pull/23373))
|
||||
- **Forensics on signal handling — non-blocking diag, per-phase timing, stale-unit warning** ([#23285](https://github.com/NousResearch/hermes-agent/pull/23285))
|
||||
- **Keep gateway running when platforms fail; add per-platform circuit breaker + `/platform`** ([#26600](https://github.com/NousResearch/hermes-agent/pull/26600))
|
||||
- **Wire `clarify` tool with inline keyboard buttons on Telegram** ([#24199](https://github.com/NousResearch/hermes-agent/pull/24199))
|
||||
- **Add `chat_id` to `hook_ctx` for message source tracking** ([#24710](https://github.com/NousResearch/hermes-agent/pull/24710))
|
||||
|
||||
### Telegram
|
||||
- **Native draft streaming via `sendMessageDraft` (Bot API 9.5+)** (salvage of #3412) ([#23512](https://github.com/NousResearch/hermes-agent/pull/23512))
|
||||
- **Stream Telegram edits safely** — salvage of #22264 (@kshitijk4poor) ([#22518](https://github.com/NousResearch/hermes-agent/pull/22518))
|
||||
- **Telegram notification mode** (salvage #22772) ([#22793](https://github.com/NousResearch/hermes-agent/pull/22793))
|
||||
- **Telegram guest mention mode** (@kshitijk4poor) ([#22759](https://github.com/NousResearch/hermes-agent/pull/22759))
|
||||
- **Split-and-deliver oversized edits instead of silent truncation** (salvage of #19537) ([#23576](https://github.com/NousResearch/hermes-agent/pull/23576))
|
||||
- **Preserve DM topic routing via reply fallback** (salvage #22053) (@kshitijk4poor) ([#22410](https://github.com/NousResearch/hermes-agent/pull/22410))
|
||||
- **Pass `source.thread_id` explicitly on auto-reset notice** (carve-out of #7404) ([#23440](https://github.com/NousResearch/hermes-agent/pull/23440))
|
||||
|
||||
### Discord
|
||||
- **Render clarify choices as buttons** ([#25485](https://github.com/NousResearch/hermes-agent/pull/25485))
|
||||
- **Channel history backfill — default on, broadened scope** ([#25984](https://github.com/NousResearch/hermes-agent/pull/25984))
|
||||
- **`thread_require_mention` for multi-bot threads** (salvage #25313) ([#25445](https://github.com/NousResearch/hermes-agent/pull/25445))
|
||||
|
||||
### Slack
|
||||
- **Support `!cmd` as alternate prefix for slash commands in threads** ([#25355](https://github.com/NousResearch/hermes-agent/pull/25355))
|
||||
|
||||
### WhatsApp
|
||||
- **Surface quoted reply metadata from Baileys** (#25398) ([#25489](https://github.com/NousResearch/hermes-agent/pull/25489))
|
||||
|
||||
### Feishu / Google Chat / others
|
||||
- **Feishu: native update prompt cards** (@kshitijk4poor) ([#22448](https://github.com/NousResearch/hermes-agent/pull/22448))
|
||||
- **Google Chat: repair setup prompt imports** (@helix4u) ([#22038](https://github.com/NousResearch/hermes-agent/pull/22038))
|
||||
- **Google Chat: honor relay-declared sender_type** (salvage of #22107) (@kshitijk4poor) ([#22432](https://github.com/NousResearch/hermes-agent/pull/22432))
|
||||
- **LINE: use `build_source` instead of nonexistent `create_source`** ([#24717](https://github.com/NousResearch/hermes-agent/pull/24717))
|
||||
- **Add `weixin, and more` to gateway docs** (salvage of #21063 by @wuwuzhijing)
|
||||
|
||||
---
|
||||
|
||||
## 🖥️ CLI & TUI
|
||||
|
||||
### CLI
|
||||
- **Show YOLO mode warning in banner and status bar** ([#26238](https://github.com/NousResearch/hermes-agent/pull/26238))
|
||||
- **Confirm prompt for destructive slash commands** (#4069) ([#22687](https://github.com/NousResearch/hermes-agent/pull/22687))
|
||||
- **`docker_extra_args` + `display.timestamps`** ([#23599](https://github.com/NousResearch/hermes-agent/pull/23599))
|
||||
- **Delegate tool: show user's actual concurrency / spawn-depth limits in description** ([#22694](https://github.com/NousResearch/hermes-agent/pull/22694))
|
||||
|
||||
### TUI
|
||||
- **`/sessions` slash command for browsing and resuming previous sessions** (@austinpickett) ([#20805](https://github.com/NousResearch/hermes-agent/pull/20805))
|
||||
- **Segment turns with rule above non-first user msgs; trim ticker dead space** (@OutThisLife) ([#21846](https://github.com/NousResearch/hermes-agent/pull/21846))
|
||||
- **Support attaching to an existing gateway** (@OutThisLife) ([#21978](https://github.com/NousResearch/hermes-agent/pull/21978))
|
||||
- **Resolve markdown links to readable page titles** (@OutThisLife) ([#24013](https://github.com/NousResearch/hermes-agent/pull/24013))
|
||||
- **Width-aware markdown table rendering with vertical fallback** (@alt-glitch) ([#26195](https://github.com/NousResearch/hermes-agent/pull/26195))
|
||||
- **Keep Ink displayCursor in sync with fast-echo writes so cursor stops drifting** (@OutThisLife) ([#26717](https://github.com/NousResearch/hermes-agent/pull/26717))
|
||||
- **Allow transcript scroll + Esc during approval/clarify/confirm prompts** (@OutThisLife) ([#26414](https://github.com/NousResearch/hermes-agent/pull/26414))
|
||||
- **Preserve session when switching personality** (@austinpickett) ([#20942](https://github.com/NousResearch/hermes-agent/pull/20942))
|
||||
- **Skip native safety net on OSC52-capable terminals** (@benbarclay) ([#20954](https://github.com/NousResearch/hermes-agent/pull/20954))
|
||||
|
||||
### Dashboard / GUI
|
||||
- **Route embedded TUI through dashboard gateway** (@OutThisLife) ([#21979](https://github.com/NousResearch/hermes-agent/pull/21979))
|
||||
- **Hide token/cost analytics behind config flag (default off)** ([#25438](https://github.com/NousResearch/hermes-agent/pull/25438))
|
||||
- **Fix Langfuse observability — trace I/O, tool outputs, placeholder credentials** (closes #22342, #22763) (@kshitijk4poor) ([#26320](https://github.com/NousResearch/hermes-agent/pull/26320))
|
||||
- **MiniMax 'Login' button launched Claude OAuth** (salvage #22849) ([#24058](https://github.com/NousResearch/hermes-agent/pull/24058))
|
||||
- **Update cron modals** (@austinpickett) ([#25985](https://github.com/NousResearch/hermes-agent/pull/25985))
|
||||
- **Analytics: prevent silent token loss and add Claude 4.5–4.7 pricing** (@austinpickett) ([#21455](https://github.com/NousResearch/hermes-agent/pull/21455))
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Tools & Capabilities
|
||||
|
||||
### Vision & video
|
||||
- **`vision_analyze` returns pixels to vision-capable models** ([#22955](https://github.com/NousResearch/hermes-agent/pull/22955))
|
||||
- **Unified `video_generate` with pluggable provider backends** ([#25126](https://github.com/NousResearch/hermes-agent/pull/25126))
|
||||
- **`image_gen`: actionable setup message when no FAL backend is reachable** ([#26222](https://github.com/NousResearch/hermes-agent/pull/26222))
|
||||
|
||||
### Computer use
|
||||
- **`computer_use` cua-driver backend + focus-safe ops + non-Anthropic provider fix** (re-salvage #16936) ([#21967](https://github.com/NousResearch/hermes-agent/pull/21967))
|
||||
- **Refresh cua-driver on `hermes update` + add `install --upgrade`** ([#24063](https://github.com/NousResearch/hermes-agent/pull/24063))
|
||||
|
||||
### LSP & write-time diagnostics
|
||||
- **Semantic diagnostics from real language servers in `write_file`/`patch`** ([#24168](https://github.com/NousResearch/hermes-agent/pull/24168))
|
||||
- **Shift baseline diagnostics into post-edit coordinates** ([#25978](https://github.com/NousResearch/hermes-agent/pull/25978))
|
||||
|
||||
### Search & web
|
||||
- **Brave Search (free tier) and DDGS search providers** ([#21337](https://github.com/NousResearch/hermes-agent/pull/21337))
|
||||
- **Bearer auth header for Tavily `/crawl` endpoint** ([#24658](https://github.com/NousResearch/hermes-agent/pull/24658))
|
||||
|
||||
### X (Twitter)
|
||||
- **Gated `x_search` tool with OAuth-or-API-key auth** ([#26763](https://github.com/NousResearch/hermes-agent/pull/26763))
|
||||
|
||||
### Browser
|
||||
- **Route `browser_console` eval through supervisor's persistent CDP WS (180x faster)** ([#23226](https://github.com/NousResearch/hermes-agent/pull/23226))
|
||||
- **Support externally managed Camofox sessions** ([#24499](https://github.com/NousResearch/hermes-agent/pull/24499))
|
||||
|
||||
### MCP
|
||||
- **`supports_parallel_tool_calls` for MCP servers** (salvage of #9944) ([#26825](https://github.com/NousResearch/hermes-agent/pull/26825))
|
||||
- **Codex preset for Codex CLI MCP server** (salvage #22663) ([#22679](https://github.com/NousResearch/hermes-agent/pull/22679))
|
||||
- **Stop retrying initial MCP auth failures** (#25624) ([#25776](https://github.com/NousResearch/hermes-agent/pull/25776))
|
||||
|
||||
### Google Workspace
|
||||
- **Drive write ops + Docs/Sheets create/append** ([#21895](https://github.com/NousResearch/hermes-agent/pull/21895))
|
||||
|
||||
### Per-turn verifier
|
||||
- **Per-turn file-mutation verifier footer** ([#24498](https://github.com/NousResearch/hermes-agent/pull/24498))
|
||||
|
||||
---
|
||||
|
||||
## 🧩 Kanban (Multi-Agent)
|
||||
|
||||
- **`specify` — auxiliary LLM fleshes out triage tasks** ([#21435](https://github.com/NousResearch/hermes-agent/pull/21435))
|
||||
- **Orchestrator board tools — `kanban_list` + `kanban_unblock`** (carve-out of #20568) ([#23012](https://github.com/NousResearch/hermes-agent/pull/23012))
|
||||
- **`stranded_in_ready` diagnostic for unclaimed tasks** ([#23578](https://github.com/NousResearch/hermes-agent/pull/23578))
|
||||
- **Dashboard batch QOL upgrade** (salvage of #23240) ([#23550](https://github.com/NousResearch/hermes-agent/pull/23550))
|
||||
- **Tooltips and docs link across dashboard** ([#21541](https://github.com/NousResearch/hermes-agent/pull/21541))
|
||||
- **Dedupe notifier delivery via atomic claim + rewind on failure** (salvage #22558) ([#23401](https://github.com/NousResearch/hermes-agent/pull/23401))
|
||||
- **Keep notifier subscriptions alive across retry cycles** (salvage #21398) ([#23423](https://github.com/NousResearch/hermes-agent/pull/23423))
|
||||
- **Drop caller-controlled author override in `kanban_comment`** (salvage of #22109) (@kshitijk4poor) ([#22435](https://github.com/NousResearch/hermes-agent/pull/22435))
|
||||
- **Sanitize comment author rendering in `build_worker_context`** ([#22769](https://github.com/NousResearch/hermes-agent/pull/22769))
|
||||
|
||||
---
|
||||
|
||||
## 🧠 Plugins & Extension
|
||||
|
||||
### Plugin surface
|
||||
- **Run any LLM call from inside a plugin via `ctx.llm`** ([#23194](https://github.com/NousResearch/hermes-agent/pull/23194))
|
||||
- **`tool_override` flag for replacing built-in tools** (closes #11049) ([#26759](https://github.com/NousResearch/hermes-agent/pull/26759))
|
||||
- **`standalone_sender_fn` for out-of-process cron delivery** (@kshitijk4poor) ([#22461](https://github.com/NousResearch/hermes-agent/pull/22461))
|
||||
- **`HERMES_PLUGINS_DEBUG=1` surfaces plugin discovery logs** ([#22684](https://github.com/NousResearch/hermes-agent/pull/22684))
|
||||
- **Hindsight-client as optional dependency** (@alt-glitch) ([#21818](https://github.com/NousResearch/hermes-agent/pull/21818))
|
||||
|
||||
### Profile & distribution
|
||||
- **Shareable profile distributions via git** ([#20831](https://github.com/NousResearch/hermes-agent/pull/20831))
|
||||
|
||||
---
|
||||
|
||||
## ⏰ Cron
|
||||
|
||||
- **Routing intent — `deliver=all` fans out to every connected channel** ([#21495](https://github.com/NousResearch/hermes-agent/pull/21495))
|
||||
- **Support name-based lookup for job operations** ([#26231](https://github.com/NousResearch/hermes-agent/pull/26231))
|
||||
- **Blank Cron dashboard tab + partial-record crashes** (salvage #21042 + #22330) (@kshitijk4poor) ([#22389](https://github.com/NousResearch/hermes-agent/pull/22389))
|
||||
- **Do not seed `HERMES_SESSION_*` contextvars from cron origin** (salvage of #22356) (@kshitijk4poor) ([#22382](https://github.com/NousResearch/hermes-agent/pull/22382))
|
||||
- **Scan assembled prompt including skill content for prompt injection** (#3968)
|
||||
|
||||
---
|
||||
|
||||
## 🧩 Skills Ecosystem
|
||||
|
||||
### Skills Hub
|
||||
- **`hermes-skills/huggingface` as a trusted default tap** (closes #2549) ([#26219](https://github.com/NousResearch/hermes-agent/pull/26219))
|
||||
- **Show per-skill pages in the left sidebar** ([#26646](https://github.com/NousResearch/hermes-agent/pull/26646))
|
||||
- **Richer info panels on the Skills Hub** ([#22905](https://github.com/NousResearch/hermes-agent/pull/22905))
|
||||
- **Refuse `skill_view` name collisions instead of guessing** (closes #6136 @polkn)
|
||||
|
||||
### Curator
|
||||
- **Show rename map in user-visible summary** ([#22910](https://github.com/NousResearch/hermes-agent/pull/22910))
|
||||
- **Hint at `hermes curator pin` in the rename block** ([#23212](https://github.com/NousResearch/hermes-agent/pull/23212))
|
||||
|
||||
### New optional skills
|
||||
- **Hyperliquid** — perp/spot trading via SDK + REST (salvage of #1952) ([#23583](https://github.com/NousResearch/hermes-agent/pull/23583))
|
||||
- **Yahoo Finance** market data ([#23590](https://github.com/NousResearch/hermes-agent/pull/23590))
|
||||
- **api-testing** (REST/GraphQL debug, salvages #1800) ([#23582](https://github.com/NousResearch/hermes-agent/pull/23582))
|
||||
- **Unified EVM multi-chain skill** (salvages #25291 + #2010 + folds in base/) ([#25299](https://github.com/NousResearch/hermes-agent/pull/25299))
|
||||
- **darwinian-evolver** ([#26760](https://github.com/NousResearch/hermes-agent/pull/26760))
|
||||
- **osint-investigation** (closes #355) ([#26729](https://github.com/NousResearch/hermes-agent/pull/26729))
|
||||
- **pinggy-tunnel** ([#26765](https://github.com/NousResearch/hermes-agent/pull/26765))
|
||||
- **watchers** — RSS / HTTP JSON / GitHub polling via cron no-agent ([#21881](https://github.com/NousResearch/hermes-agent/pull/21881))
|
||||
- **Notion overhaul for the Developer Platform** (May 2026) ([#26612](https://github.com/NousResearch/hermes-agent/pull/26612))
|
||||
|
||||
---
|
||||
|
||||
## 🔒 Security & Reliability
|
||||
|
||||
### Security hardening
|
||||
- **Sudo brute-force block + sudo-stdin/askpass DANGEROUS** (salvage of #22194 + #21128) (@kshitijk4poor) ([#23736](https://github.com/NousResearch/hermes-agent/pull/23736))
|
||||
- **Drop caller-controlled author override in `kanban_comment`** (salvage of #22109) (@kshitijk4poor) ([#22435](https://github.com/NousResearch/hermes-agent/pull/22435))
|
||||
- **Cover remaining SSRF fetch paths in skills-hub** (salvage #22804) ([#22843](https://github.com/NousResearch/hermes-agent/pull/22843))
|
||||
- **Use credential_pool for custom endpoint model listing probes** (salvage #22810) ([#22842](https://github.com/NousResearch/hermes-agent/pull/22842))
|
||||
- **Require dashboard auth for plugin API routes** (salvage #19541) ([#23220](https://github.com/NousResearch/hermes-agent/pull/23220))
|
||||
- **Sanitize env and redact output in quick commands + remove write-only `_pending_messages`** ([#23584](https://github.com/NousResearch/hermes-agent/pull/23584))
|
||||
- **Reduce unnecessary `shell=True` in subprocess calls** ([#25149](https://github.com/NousResearch/hermes-agent/pull/25149))
|
||||
- **Sanitize Google Chat sender_type from relay** (salvage of #22107) (@kshitijk4poor) ([#22432](https://github.com/NousResearch/hermes-agent/pull/22432))
|
||||
- **Supply-chain advisory checker** ([#24220](https://github.com/NousResearch/hermes-agent/pull/24220))
|
||||
- **Rewrite security policy around OS-level isolation as the boundary** (@jquesnelle) ([#20317](https://github.com/NousResearch/hermes-agent/pull/20317))
|
||||
- **Remove public security advisory page** ([#24253](https://github.com/NousResearch/hermes-agent/pull/24253))
|
||||
|
||||
### Reliability — notable bug closures
|
||||
- **SQLite: fall back to `journal_mode=DELETE` on NFS/SMB/FUSE** (fixes `/resume` on network mounts) (@kshitijk4poor) ([#22043](https://github.com/NousResearch/hermes-agent/pull/22043))
|
||||
- **Codex-runtime: retire wedged sessions + post-tool watchdog + OAuth refresh classify** ([#25769](https://github.com/NousResearch/hermes-agent/pull/25769))
|
||||
- **Codex-runtime: de-dup `[plugins.X]` tables and stop leaking HERMES_HOME** (#26250) (@kshitijk4poor) ([#26260](https://github.com/NousResearch/hermes-agent/pull/26260))
|
||||
- **Daytona: migrate legacy-sandbox lookup to cursor-based `list()`** ([#24587](https://github.com/NousResearch/hermes-agent/pull/24587))
|
||||
- **MCP: stop retrying initial MCP auth failures** (#25624) ([#25776](https://github.com/NousResearch/hermes-agent/pull/25776))
|
||||
- **Gateway: enable text-intercept for multi-choice clarify fallback** (#25587) ([#25778](https://github.com/NousResearch/hermes-agent/pull/25778))
|
||||
- **Gateway: keep running when platforms fail; per-platform circuit breaker + `/platform`** ([#26600](https://github.com/NousResearch/hermes-agent/pull/26600))
|
||||
- **Delegate: salvage #21933 JSON-string batch + diagnostic logging** (@kshitijk4poor) ([#22436](https://github.com/NousResearch/hermes-agent/pull/22436))
|
||||
- **Profiles+banner: exclude infrastructure from `--clone-all` + fix stale update-check repo resolution** (@kshitijk4poor) ([#22475](https://github.com/NousResearch/hermes-agent/pull/22475))
|
||||
- **ACP: inline file attachment resources** (salvage #21400 + image support) ([#21407](https://github.com/NousResearch/hermes-agent/pull/21407))
|
||||
- **CI: unblock shared PR checks** (@stephenschoettler) ([#21012](https://github.com/NousResearch/hermes-agent/pull/21012), [#25957](https://github.com/NousResearch/hermes-agent/pull/25957))
|
||||
|
||||
### Notable reverts in window
|
||||
- **`/goal` checklist + /subgoal feature stack** — rolled back ([#23813](https://github.com/NousResearch/hermes-agent/pull/23813)); `/subgoal` returned in simpler form via [#25449](https://github.com/NousResearch/hermes-agent/pull/25449)
|
||||
- **Scrollback box width clamp** (#25975) rolled back to restore full-width borders ([#26163](https://github.com/NousResearch/hermes-agent/pull/26163))
|
||||
- **`fix(cli): tolerate unreadable dirs when building systemd PATH`** rolled back
|
||||
|
||||
---
|
||||
|
||||
## 🌍 i18n
|
||||
|
||||
- **Localize all gateway commands + web dashboard, add 8 new locales (16 total)** ([#22914](https://github.com/NousResearch/hermes-agent/pull/22914))
|
||||
|
||||
---
|
||||
|
||||
## 📚 Documentation
|
||||
|
||||
- **Repair Voice & TTS provider table** (@nightcityblade, fixes #24101) ([#24138](https://github.com/NousResearch/hermes-agent/pull/24138))
|
||||
- **Show per-skill pages in the left sidebar** ([#26646](https://github.com/NousResearch/hermes-agent/pull/26646))
|
||||
- **Mention Weixin in gateway help and docstrings** (salvage of #21063 by @wuwuzhijing)
|
||||
- **Richer info panels on the Skills Hub** ([#22905](https://github.com/NousResearch/hermes-agent/pull/22905))
|
||||
- Many more doc updates across providers, platforms, skills, Windows install paths, and dashboard.
|
||||
|
||||
---
|
||||
|
||||
## 🧪 Testing & CI
|
||||
|
||||
- **Unblock shared PR checks** (@stephenschoettler) ([#21012](https://github.com/NousResearch/hermes-agent/pull/21012))
|
||||
- **Stabilize shared test state after 21012** (@stephenschoettler) ([#25957](https://github.com/NousResearch/hermes-agent/pull/25957))
|
||||
- A long tail of test additions for platforms, providers, plugins, and edge cases — 8 explicit `test:` PRs plus ~250 fix PRs that also added regression coverage.
|
||||
|
||||
---
|
||||
|
||||
## 👥 Contributors
|
||||
|
||||
### Core
|
||||
- @teknium1 — release lead, architecture, ~406 PRs merged in window
|
||||
|
||||
### Top community contributors
|
||||
- **@kshitijk4poor** — 38 PRs · Telegram cadence/streaming/topic routing, security hardening (sudo, SSRF, kanban_comment, dashboard auth), codex-runtime hygiene, NovitaAI provider, profile/banner fixes, Feishu update cards, gateway QOL across the board
|
||||
- **@alt-glitch** — 13 PRs · Markdown-table TUI rendering, `HERMES_SESSION_ID` env var, hindsight-client optional dep, Nix `extraDependencyGroups`
|
||||
- **@OutThisLife** (Brooklyn Nicholson) — 12 PRs · TUI turn segmentation, attach-to-gateway, markdown link titles, embedded TUI via dashboard gateway, Ink cursor sync, scroll/Esc during prompts
|
||||
- **@austinpickett** — 8 PRs · `/sessions` slash command, personality switching preserves session, cron modals, dashboard analytics
|
||||
- **@helix4u** — 5 PRs · Google Chat setup, browser install skip on system chromium, Windows Ctrl+C preservation
|
||||
- **@rob-maron** — 4 PRs · Nous Portal as model metadata authority, provider polish
|
||||
- **@stephenschoettler** — 3 PRs · CI stabilization
|
||||
- **@ethernet8023** — 3 PRs · platform/gateway work
|
||||
|
||||
### All contributors (alphabetical)
|
||||
|
||||
@02356abc, @0xbyt4, @0xharryriddle, @1000Delta, @1RB, @29206394, @A-kamal, @aashizpoudel, @Abd0r,
|
||||
@adybag14-cyber, @AgentArcLab, @ahmedbadr3, @AhmetArif0, @alblez, @Alex-yang00, @ALIYILD, @AllynSheep,
|
||||
@alt-glitch, @am423, @amathxbt, @amethystani, @ArecaNon, @Arkmusn, @askclaw-vesper, @AsoTora, @austinpickett,
|
||||
@aydnOktay, @ayushere, @baocin, @Bartok9, @benbarclay, @BennetYrWang, @Bihruze, @binhnt92, @briandevans,
|
||||
@brooklynnicholson, @btorresgil, @buntingszn, @CalmProton, @chrisworksai, @CoinTheHat, @dandacompany, @Dangooy,
|
||||
@DanielLSM, @David-0x221Eight, @ddupont808, @dhruv-saxena, @diablozzc, @dlkakbs, @dmahan93, @dmnkhorvath,
|
||||
@domtriola, @donrhmexe, @Dusk1e, @eloklam, @emozilla, @ephron-ren, @erenkarakus, @EthanGuo-coder,
|
||||
@ethernet8023, @evgyur, @explainanalyze, @fahdad, @fr33d3m0n, @Freeman-Consulting, @freqyfreqy, @Frowtek,
|
||||
@fu576, @github-actions[bot], @gnanirahulnutakki, @GodsBoy, @guglielmofonda, @Gutslabs, @hanzckernel,
|
||||
@heathley, @hekaru-agent, @helix4u, @HenkDz, @HiddenPuppy, @hllqkb, @hrygo, @HuangYuChuh, @Hugo-SEQUIER, @HxT9,
|
||||
@iacker, @InB4DevOps, @isaachuangGMICLOUD, @iuyup, @Jaaneek, @jackey8616, @jackjin1997, @Jaggia, @jak983464779,
|
||||
@jelrod27, @jethac, @JithendraNara, @johnisag, @Julientalbot, @Jwd-gity, @kallidean, @keyuyuan, @kfa-ai,
|
||||
@kidonng, @KiraKatana, @kjames2001, @konsisumer, @Korkyzer, @kshitijk4poor, @KvnGz, @lars-hagen, @leehack,
|
||||
@leepoweii, @LeonSGP43, @li0near, @libo1106, @liquidchen, @littlewwwhite, @liuhao1024, @liyoungc, @luandiasrj,
|
||||
@luoyuctl, @luyao618, @magic524, @mbac, @McClean, @memosr, @Mibayy, @ming1523, @mizgyo, @mrshu, @ms-alan,
|
||||
@MustafaKara7, @nederev, @nicoechaniz, @nidhi-singh02, @nightcityblade, @nik1t7n, @Ninso112, @NivOO5,
|
||||
@novax635, @nv-kasikritc, @oferlaor, @oswaldb22, @outdoorsea, @oxngon, @PaTTeeL, @pearjelly, @pefontana,
|
||||
@perng, @PhilipAD, @phuongvm, @polkn, @Prasanna28Devadiga, @princepal9120, @pty819, @purzbeats, @Quarkex,
|
||||
@quocanh261997, @qWaitCrypto, @Qwinty, @rahimsais, @raymaylee, @ReqX, @rewbs, @RhombusMaximus, @rob-maron,
|
||||
@Ruzzgar, @ryptotalent, @Sanjays2402, @shannonsands, @shaun0927, @SiliconID, @silv-mt-holdings, @simpolism,
|
||||
@smwbev, @soichiyo, @sprmn24, @steezkelly, @stephenschoettler, @Sylw3ster, @szymonclawd, @teyrebaz33,
|
||||
@Tianyu199509, @Tranquil-Flow, @TreyDong, @TurgutKural, @tw2818, @tymrtn, @uzunkuyruk, @v1b3coder,
|
||||
@vanthinh6886, @VinceZcrikl, @vKongv, @vominh1919, @voteblake, @VTRiot, @wali-reheman, @wesleysimplicio,
|
||||
@wilsen0, @WorldWriter, @worlldz, @wuli666, @wuwuzhijing, @Wysie, @XiaoXiao0221, @xieNniu, @xxxigm, @yehuosi,
|
||||
@ygd58, @yifengingit, @yuga-hashimoto, @zccyman, @ZeterMordio, @Zhekinmaksim, @zhengyn0001
|
||||
|
||||
Also: @Nagatha (Claude Opus 4.7).
|
||||
|
||||
---
|
||||
|
||||
**Full Changelog**: [v2026.5.7...v2026.5.16](https://github.com/NousResearch/hermes-agent/compare/v2026.5.7...v2026.5.16)
|
||||
+46
-2
@@ -1,8 +1,11 @@
|
||||
"""ACP auth helpers — detect the currently configured Hermes provider."""
|
||||
"""ACP auth helpers — detect and advertise Hermes authentication methods."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
|
||||
TERMINAL_SETUP_AUTH_METHOD_ID = "hermes-setup"
|
||||
|
||||
|
||||
def detect_provider() -> Optional[str]:
|
||||
@@ -22,3 +25,44 @@ def detect_provider() -> Optional[str]:
|
||||
def has_provider() -> bool:
|
||||
"""Return True if Hermes can resolve any runtime provider credentials."""
|
||||
return detect_provider() is not None
|
||||
|
||||
|
||||
def build_auth_methods() -> list[Any]:
|
||||
"""Return registry-compatible ACP auth methods for Hermes.
|
||||
|
||||
The official ACP registry validates that agents advertise at least one
|
||||
usable auth method during the initial handshake. A fresh Zed install may
|
||||
not have Hermes provider credentials configured yet, so Hermes always
|
||||
advertises a terminal setup method. When credentials are already present,
|
||||
it also advertises the resolved provider as the default agent-managed
|
||||
runtime credential method.
|
||||
"""
|
||||
from acp.schema import AuthMethodAgent, TerminalAuthMethod
|
||||
|
||||
methods: list[Any] = []
|
||||
provider = detect_provider()
|
||||
if provider:
|
||||
methods.append(
|
||||
AuthMethodAgent(
|
||||
id=provider,
|
||||
name=f"{provider} runtime credentials",
|
||||
description=(
|
||||
"Authenticate Hermes using the currently configured "
|
||||
f"{provider} runtime credentials."
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
methods.append(
|
||||
TerminalAuthMethod(
|
||||
id=TERMINAL_SETUP_AUTH_METHOD_ID,
|
||||
name="Configure Hermes provider",
|
||||
description=(
|
||||
"Open Hermes' interactive model/provider setup in a terminal. "
|
||||
"Use this when Hermes has not been configured on this machine yet."
|
||||
),
|
||||
type="terminal",
|
||||
args=["--setup"],
|
||||
)
|
||||
)
|
||||
return methods
|
||||
|
||||
@@ -0,0 +1,288 @@
|
||||
# bootstrap_browser_tools.ps1 — install agent-browser + Playwright Chromium
|
||||
# into ~/.hermes/node/ for use by Hermes Agent's browser tools on Windows.
|
||||
#
|
||||
# Targets the registry-install path: users who got Hermes via
|
||||
# `uvx --from 'hermes-agent[acp]==X' hermes-acp` don't have a repo clone,
|
||||
# so the install.ps1 `npm install`-in-repo flow doesn't apply. This script
|
||||
# is a self-contained, idempotent slice of install.ps1's browser block.
|
||||
#
|
||||
# Usage:
|
||||
# .\bootstrap_browser_tools.ps1 # use defaults
|
||||
# .\bootstrap_browser_tools.ps1 -Yes # accept Chromium download
|
||||
# .\bootstrap_browser_tools.ps1 -SkipChromium # Node + agent-browser only
|
||||
#
|
||||
# Idempotent: re-running this is safe and fast.
|
||||
|
||||
[CmdletBinding()]
|
||||
param(
|
||||
[switch]$Yes,
|
||||
[switch]$SkipChromium
|
||||
)
|
||||
|
||||
$ErrorActionPreference = "Stop"
|
||||
$NodeVersion = "22"
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# Logging
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
function Write-Info { param([string]$msg) Write-Host "[*] $msg" -ForegroundColor Cyan }
|
||||
function Write-Success { param([string]$msg) Write-Host "[+] $msg" -ForegroundColor Green }
|
||||
function Write-Warn { param([string]$msg) Write-Host "[!] $msg" -ForegroundColor Yellow }
|
||||
function Write-Err { param([string]$msg) Write-Host "[x] $msg" -ForegroundColor Red }
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# Paths
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
$HermesHome = $env:HERMES_HOME
|
||||
if (-not $HermesHome) {
|
||||
$HermesHome = Join-Path $env:USERPROFILE ".hermes"
|
||||
}
|
||||
$NodePrefix = Join-Path $HermesHome "node"
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# Step 1: Node.js
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
function Resolve-NpmExe {
|
||||
# Same gotcha as install.ps1: prefer npm.cmd over npm.ps1 so the
|
||||
# PowerShell execution policy doesn't block us.
|
||||
$cmd = Get-Command npm -ErrorAction SilentlyContinue
|
||||
if (-not $cmd) { return $null }
|
||||
$npmExe = $cmd.Source
|
||||
if ($npmExe -like "*.ps1") {
|
||||
$sibling = Join-Path (Split-Path $npmExe -Parent) "npm.cmd"
|
||||
if (Test-Path $sibling) { return $sibling }
|
||||
}
|
||||
return $npmExe
|
||||
}
|
||||
|
||||
function Resolve-NpxExe {
|
||||
$cmd = Get-Command npx -ErrorAction SilentlyContinue
|
||||
if (-not $cmd) { return $null }
|
||||
$npxExe = $cmd.Source
|
||||
if ($npxExe -like "*.ps1") {
|
||||
$sibling = Join-Path (Split-Path $npxExe -Parent) "npx.cmd"
|
||||
if (Test-Path $sibling) { return $sibling }
|
||||
}
|
||||
return $npxExe
|
||||
}
|
||||
|
||||
function Ensure-Node {
|
||||
# System Node on PATH?
|
||||
$sysNode = Get-Command node -ErrorAction SilentlyContinue
|
||||
if ($sysNode) {
|
||||
try {
|
||||
$v = & $sysNode.Source --version
|
||||
$major = [int]($v -replace '^v(\d+).*', '$1')
|
||||
if ($major -ge 20) {
|
||||
Write-Success "Node.js $v found on PATH"
|
||||
return
|
||||
}
|
||||
Write-Warn "Node.js $v is older than v20 — installing managed Node."
|
||||
} catch {
|
||||
Write-Warn "Failed to query Node version: $_"
|
||||
}
|
||||
}
|
||||
|
||||
# Hermes-managed Node?
|
||||
$managedNode = Join-Path $NodePrefix "node.exe"
|
||||
if (Test-Path $managedNode) {
|
||||
$v = & $managedNode --version
|
||||
Write-Success "Node.js $v found (Hermes-managed at $NodePrefix)"
|
||||
# Prepend to current-process PATH so subsequent npm/npx calls find it.
|
||||
$env:PATH = "$NodePrefix;$env:PATH"
|
||||
return
|
||||
}
|
||||
|
||||
Write-Info "Installing Node.js $NodeVersion LTS into $NodePrefix ..."
|
||||
|
||||
$arch = if ([Environment]::Is64BitOperatingSystem) { "x64" } else { "x86" }
|
||||
$indexUrl = "https://nodejs.org/dist/latest-v${NodeVersion}.x/"
|
||||
|
||||
try {
|
||||
$indexPage = Invoke-WebRequest -Uri $indexUrl -UseBasicParsing
|
||||
$matches = [regex]::Matches($indexPage.Content, "node-v${NodeVersion}\.\d+\.\d+-win-${arch}\.zip")
|
||||
if ($matches.Count -eq 0) {
|
||||
Write-Err "Could not locate Node.js $NodeVersion zip for win-$arch"
|
||||
throw "no tarball"
|
||||
}
|
||||
$zipName = $matches[0].Value
|
||||
$zipUrl = "$indexUrl$zipName"
|
||||
|
||||
$tmpDir = Join-Path $env:TEMP "hermes-node-$([guid]::NewGuid().ToString('N'))"
|
||||
New-Item -ItemType Directory -Force -Path $tmpDir | Out-Null
|
||||
$zipPath = Join-Path $tmpDir $zipName
|
||||
|
||||
Write-Info "Downloading $zipName ..."
|
||||
Invoke-WebRequest -Uri $zipUrl -OutFile $zipPath -UseBasicParsing
|
||||
|
||||
Expand-Archive -Path $zipPath -DestinationPath $tmpDir -Force
|
||||
$extracted = Get-ChildItem -Path $tmpDir -Directory | Where-Object { $_.Name -like "node-v*" } | Select-Object -First 1
|
||||
|
||||
if (-not $extracted) { Write-Err "Node.js extraction failed"; throw "extract" }
|
||||
|
||||
if (Test-Path $NodePrefix) { Remove-Item -Recurse -Force $NodePrefix }
|
||||
New-Item -ItemType Directory -Force -Path $HermesHome | Out-Null
|
||||
Move-Item -Path $extracted.FullName -Destination $NodePrefix
|
||||
|
||||
Remove-Item -Recurse -Force $tmpDir -ErrorAction SilentlyContinue
|
||||
|
||||
$env:PATH = "$NodePrefix;$env:PATH"
|
||||
$v = & "$NodePrefix\node.exe" --version
|
||||
Write-Success "Node.js $v installed to $NodePrefix"
|
||||
} catch {
|
||||
Write-Err "Node.js install failed: $_"
|
||||
Write-Info "Install Node 20+ manually from https://nodejs.org/en/download/ and re-run."
|
||||
throw
|
||||
}
|
||||
}
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# Step 2: agent-browser
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
function Ensure-AgentBrowser {
|
||||
$npmExe = Resolve-NpmExe
|
||||
if (-not $npmExe) {
|
||||
Write-Err "npm not on PATH after Node install — aborting"
|
||||
throw "npm missing"
|
||||
}
|
||||
|
||||
# Already installed?
|
||||
$existing = Get-Command agent-browser -ErrorAction SilentlyContinue
|
||||
if ($existing) {
|
||||
Write-Success "agent-browser already installed at $($existing.Source)"
|
||||
return
|
||||
}
|
||||
|
||||
# When the user has system Node (winget / installer-based), `npm install
|
||||
# -g` writes to a directory that may require admin rights. Force the
|
||||
# prefix to the user-writable Hermes-managed Node directory so we never
|
||||
# need elevation and the agent can always find the result. Mirrors the
|
||||
# bash bootstrap's `--prefix $NODE_PREFIX` strategy.
|
||||
New-Item -ItemType Directory -Force -Path $NodePrefix | Out-Null
|
||||
|
||||
Write-Info "Installing agent-browser (npm, prefix=$NodePrefix)..."
|
||||
& $npmExe install -g --prefix $NodePrefix --silent `
|
||||
"agent-browser@^0.26.0" "@askjo/camofox-browser@^1.5.2"
|
||||
if ($LASTEXITCODE -ne 0) {
|
||||
Write-Err "npm install -g agent-browser failed (exit $LASTEXITCODE)"
|
||||
throw "npm install"
|
||||
}
|
||||
|
||||
# Windows npm global installs drop shims at $NodePrefix\ root (not bin/).
|
||||
# Prepend to PATH so any subsequent npx call resolves them.
|
||||
$env:PATH = "$NodePrefix;$env:PATH"
|
||||
|
||||
Write-Success "agent-browser installed to $NodePrefix"
|
||||
}
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# Step 3: Playwright Chromium
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
function Find-SystemBrowser {
|
||||
$candidates = @(
|
||||
"C:\Program Files\Google\Chrome\Application\chrome.exe",
|
||||
"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
|
||||
"C:\Program Files\Chromium\Application\chromium.exe",
|
||||
"${env:LOCALAPPDATA}\Google\Chrome\Application\chrome.exe",
|
||||
"${env:LOCALAPPDATA}\Chromium\Application\chromium.exe"
|
||||
)
|
||||
foreach ($p in $candidates) {
|
||||
if (Test-Path $p) { return $p }
|
||||
}
|
||||
# Edge — Chromium-based, agent-browser can use it
|
||||
foreach ($p in @(
|
||||
"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe",
|
||||
"C:\Program Files\Microsoft\Edge\Application\msedge.exe"
|
||||
)) {
|
||||
if (Test-Path $p) { return $p }
|
||||
}
|
||||
return $null
|
||||
}
|
||||
|
||||
function Write-BrowserEnv {
|
||||
param([string]$BrowserPath)
|
||||
$envFile = Join-Path $HermesHome ".env"
|
||||
New-Item -ItemType Directory -Force -Path $HermesHome | Out-Null
|
||||
if (Test-Path $envFile) {
|
||||
$existing = Get-Content $envFile -Raw -ErrorAction SilentlyContinue
|
||||
if ($existing -and ($existing -match "(?m)^AGENT_BROWSER_EXECUTABLE_PATH=")) {
|
||||
return
|
||||
}
|
||||
}
|
||||
Add-Content -Path $envFile -Value ""
|
||||
Add-Content -Path $envFile -Value "# Hermes Agent browser tools — use the system Chrome/Chromium/Edge binary."
|
||||
Add-Content -Path $envFile -Value "AGENT_BROWSER_EXECUTABLE_PATH=$BrowserPath"
|
||||
Write-Success "Configured browser tools to use $BrowserPath"
|
||||
}
|
||||
|
||||
function Confirm-ChromiumDownload {
|
||||
if ($Yes) { return $true }
|
||||
if (-not [Environment]::UserInteractive) {
|
||||
Write-Warn "Non-interactive shell — skipping Chromium prompt."
|
||||
Write-Info "Re-run with -Yes to install Chromium (~400 MB download)."
|
||||
return $false
|
||||
}
|
||||
$reply = Read-Host "Install Playwright Chromium (~400 MB download)? [y/N]"
|
||||
return ($reply -match "^(y|yes)$")
|
||||
}
|
||||
|
||||
function Ensure-Chromium {
|
||||
if ($SkipChromium) {
|
||||
Write-Info "Skipping Chromium install (-SkipChromium)"
|
||||
return
|
||||
}
|
||||
|
||||
# agent-browser on Windows expects a Playwright-managed Chromium under
|
||||
# %LOCALAPPDATA%\ms-playwright. The system-browser shortcut from the
|
||||
# Linux/macOS path doesn't apply the same way on Windows — Playwright's
|
||||
# default launch path won't pick up a stock Chrome install without an
|
||||
# explicit AGENT_BROWSER_EXECUTABLE_PATH. We still offer it as a
|
||||
# fallback when the user doesn't want the download.
|
||||
|
||||
if (-not (Confirm-ChromiumDownload)) {
|
||||
$sys = Find-SystemBrowser
|
||||
if ($sys) {
|
||||
Write-Info "Using system browser at $sys (Chromium download skipped)."
|
||||
Write-BrowserEnv -BrowserPath $sys
|
||||
} else {
|
||||
Write-Info "Chromium install skipped. Browser tools won't launch until"
|
||||
Write-Info "Chromium is installed or AGENT_BROWSER_EXECUTABLE_PATH is set."
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
$npxExe = Resolve-NpxExe
|
||||
if (-not $npxExe) {
|
||||
Write-Err "npx not on PATH — cannot install Playwright Chromium"
|
||||
throw "npx missing"
|
||||
}
|
||||
|
||||
Write-Info "Installing Playwright Chromium (~400 MB) ..."
|
||||
& $npxExe --yes playwright install chromium
|
||||
if ($LASTEXITCODE -ne 0) {
|
||||
Write-Err "Playwright Chromium install failed (exit $LASTEXITCODE)"
|
||||
Write-Info "Try again later: npx --yes playwright install chromium"
|
||||
throw "playwright"
|
||||
}
|
||||
Write-Success "Playwright Chromium installed"
|
||||
}
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# Main
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
Write-Info "Hermes Agent: bootstrapping browser tools"
|
||||
Write-Info " HERMES_HOME = $HermesHome"
|
||||
Write-Info " OS = Windows"
|
||||
|
||||
Ensure-Node
|
||||
Ensure-AgentBrowser
|
||||
Ensure-Chromium
|
||||
|
||||
Write-Success "Browser tools setup complete."
|
||||
Write-Info "Hermes Agent will pick up agent-browser from $NodePrefix on next launch."
|
||||
+399
@@ -0,0 +1,399 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# bootstrap_browser_tools.sh — install agent-browser + Playwright Chromium
|
||||
# into ~/.hermes/node/ for use by Hermes Agent's browser tools.
|
||||
#
|
||||
# Targets the registry-install path: users who got Hermes via
|
||||
# `uvx --from 'hermes-agent[acp]==X' hermes-acp` don't have a repo clone,
|
||||
# so the install.sh `npm install`-in-repo flow doesn't apply. This script
|
||||
# is a self-contained, idempotent slice of install.sh's browser block —
|
||||
# safe to run from `hermes-acp --setup-browser`, from a fresh terminal,
|
||||
# or from install.sh itself (it's a no-op when everything is already in place).
|
||||
#
|
||||
# Usage:
|
||||
# bootstrap_browser_tools.sh # use defaults
|
||||
# bootstrap_browser_tools.sh --yes # accept the ~400MB Chromium download
|
||||
# bootstrap_browser_tools.sh --skip-chromium # only install Node + agent-browser
|
||||
# HERMES_HOME=/custom/path bootstrap_browser_tools.sh
|
||||
#
|
||||
# Idempotent: re-running this is safe and fast. Each step checks whether
|
||||
# the work is already done.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# Config
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
NODE_VERSION="22"
|
||||
HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
|
||||
NODE_PREFIX="$HERMES_HOME/node"
|
||||
|
||||
SKIP_CHROMIUM=false
|
||||
ASSUME_YES=false
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# Logging
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
if [ -t 1 ]; then
|
||||
C_GREEN='\033[0;32m'
|
||||
C_YELLOW='\033[0;33m'
|
||||
C_BLUE='\033[0;34m'
|
||||
C_RED='\033[0;31m'
|
||||
C_RESET='\033[0m'
|
||||
else
|
||||
C_GREEN='' ; C_YELLOW='' ; C_BLUE='' ; C_RED='' ; C_RESET=''
|
||||
fi
|
||||
|
||||
log_info() { printf "${C_BLUE}[*]${C_RESET} %s\n" "$*"; }
|
||||
log_success() { printf "${C_GREEN}[✓]${C_RESET} %s\n" "$*"; }
|
||||
log_warn() { printf "${C_YELLOW}[!]${C_RESET} %s\n" "$*" >&2; }
|
||||
log_error() { printf "${C_RED}[✗]${C_RESET} %s\n" "$*" >&2; }
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# Arg parsing
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
--skip-chromium) SKIP_CHROMIUM=true ;;
|
||||
--yes|-y) ASSUME_YES=true ;;
|
||||
-h|--help)
|
||||
cat <<EOF
|
||||
Bootstrap Hermes Agent browser tools.
|
||||
|
||||
Installs Node.js (into ~/.hermes/node/), the agent-browser npm package,
|
||||
and the Playwright Chromium browser engine.
|
||||
|
||||
Options:
|
||||
--skip-chromium Install Node + agent-browser but skip Chromium download
|
||||
--yes, -y Accept the ~400 MB Chromium download without prompting
|
||||
-h, --help Show this help
|
||||
|
||||
Environment:
|
||||
HERMES_HOME Override Hermes data dir (default: \$HOME/.hermes)
|
||||
EOF
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
log_error "Unknown option: $1"
|
||||
exit 2
|
||||
;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# OS / arch detection
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
OS="unknown"
|
||||
case "$(uname -s)" in
|
||||
Linux*) OS="linux" ;;
|
||||
Darwin*) OS="macos" ;;
|
||||
*)
|
||||
log_error "Unsupported OS: $(uname -s)"
|
||||
log_info "Windows users: run scripts/bootstrap_browser_tools.ps1 in PowerShell."
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
NODE_ARCH=""
|
||||
case "$(uname -m)" in
|
||||
x86_64) NODE_ARCH="x64" ;;
|
||||
aarch64|arm64) NODE_ARCH="arm64" ;;
|
||||
armv7l) NODE_ARCH="armv7l" ;;
|
||||
*)
|
||||
log_error "Unsupported architecture: $(uname -m)"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
NODE_OS=""
|
||||
case "$OS" in
|
||||
linux) NODE_OS="linux" ;;
|
||||
macos) NODE_OS="darwin" ;;
|
||||
esac
|
||||
|
||||
DISTRO=""
|
||||
if [ -f /etc/os-release ]; then
|
||||
# shellcheck disable=SC1091
|
||||
. /etc/os-release
|
||||
DISTRO="${ID:-}"
|
||||
fi
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# Step 1: Node.js
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
ensure_node() {
|
||||
# Already on PATH and recent enough?
|
||||
if command -v node >/dev/null 2>&1; then
|
||||
local found_ver major
|
||||
found_ver=$(node --version 2>/dev/null)
|
||||
major=$(echo "$found_ver" | sed -E 's/^v([0-9]+).*/\1/')
|
||||
if [ -n "$major" ] && [ "$major" -ge 20 ]; then
|
||||
log_success "Node.js $found_ver found on PATH"
|
||||
return 0
|
||||
fi
|
||||
log_warn "Node.js $found_ver is older than v20 — installing managed Node."
|
||||
fi
|
||||
|
||||
if [ -x "$NODE_PREFIX/bin/node" ]; then
|
||||
local found_ver
|
||||
found_ver=$("$NODE_PREFIX/bin/node" --version 2>/dev/null || echo "?")
|
||||
export PATH="$NODE_PREFIX/bin:$PATH"
|
||||
log_success "Node.js $found_ver found (Hermes-managed at $NODE_PREFIX)"
|
||||
return 0
|
||||
fi
|
||||
|
||||
log_info "Installing Node.js $NODE_VERSION LTS into $NODE_PREFIX ..."
|
||||
|
||||
local index_url="https://nodejs.org/dist/latest-v${NODE_VERSION}.x/"
|
||||
local tarball_name
|
||||
tarball_name=$(curl -fsSL "$index_url" \
|
||||
| grep -oE "node-v${NODE_VERSION}\.[0-9]+\.[0-9]+-${NODE_OS}-${NODE_ARCH}\.tar\.xz" \
|
||||
| head -1)
|
||||
|
||||
if [ -z "$tarball_name" ]; then
|
||||
tarball_name=$(curl -fsSL "$index_url" \
|
||||
| grep -oE "node-v${NODE_VERSION}\.[0-9]+\.[0-9]+-${NODE_OS}-${NODE_ARCH}\.tar\.gz" \
|
||||
| head -1)
|
||||
fi
|
||||
|
||||
if [ -z "$tarball_name" ]; then
|
||||
log_error "Could not locate Node.js $NODE_VERSION tarball for $NODE_OS-$NODE_ARCH"
|
||||
log_info "Install Node 20+ manually: https://nodejs.org/en/download/"
|
||||
return 1
|
||||
fi
|
||||
|
||||
local tmp_dir
|
||||
tmp_dir=$(mktemp -d)
|
||||
trap 'rm -rf "$tmp_dir"' RETURN
|
||||
|
||||
log_info "Downloading $tarball_name ..."
|
||||
if ! curl -fsSL "${index_url}${tarball_name}" -o "$tmp_dir/$tarball_name"; then
|
||||
log_error "Node.js download failed"
|
||||
return 1
|
||||
fi
|
||||
|
||||
if [[ "$tarball_name" == *.tar.xz ]]; then
|
||||
tar xf "$tmp_dir/$tarball_name" -C "$tmp_dir"
|
||||
else
|
||||
tar xzf "$tmp_dir/$tarball_name" -C "$tmp_dir"
|
||||
fi
|
||||
|
||||
local extracted_dir
|
||||
extracted_dir=$(ls -d "$tmp_dir"/node-v* 2>/dev/null | head -1)
|
||||
if [ ! -d "$extracted_dir" ]; then
|
||||
log_error "Node.js extraction failed"
|
||||
return 1
|
||||
fi
|
||||
|
||||
mkdir -p "$HERMES_HOME"
|
||||
rm -rf "$NODE_PREFIX"
|
||||
mv "$extracted_dir" "$NODE_PREFIX"
|
||||
|
||||
export PATH="$NODE_PREFIX/bin:$PATH"
|
||||
|
||||
local installed_ver
|
||||
installed_ver=$("$NODE_PREFIX/bin/node" --version 2>/dev/null || echo "?")
|
||||
log_success "Node.js $installed_ver installed to $NODE_PREFIX"
|
||||
}
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# Step 2: agent-browser + @askjo/camofox-browser via global npm install
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
ensure_agent_browser() {
|
||||
if ! command -v npm >/dev/null 2>&1; then
|
||||
log_error "npm not on PATH after Node install — aborting"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# _find_agent_browser() in tools/browser_tool.py walks ~/.hermes/node/bin
|
||||
# plus a few standard prefixes, so installing globally into the managed
|
||||
# Node prefix is enough — no PATH manipulation needed from the agent side.
|
||||
if [ -x "$NODE_PREFIX/bin/agent-browser" ] || command -v agent-browser >/dev/null 2>&1; then
|
||||
log_success "agent-browser already installed"
|
||||
return 0
|
||||
fi
|
||||
|
||||
# When the system's `npm` resolves to a root-owned prefix (e.g.
|
||||
# /usr/lib/node_modules), `npm install -g` fails with EACCES without
|
||||
# sudo. Force the prefix to the user-writable Hermes-managed Node
|
||||
# directory so we never need sudo and the agent can always find the
|
||||
# result. If we installed Node ourselves above, this is a no-op
|
||||
# (managed Node already uses $NODE_PREFIX). If the user has system
|
||||
# Node, we still drop agent-browser under $NODE_PREFIX/bin/ — which
|
||||
# is exactly where _browser_candidate_path_dirs() looks first.
|
||||
mkdir -p "$NODE_PREFIX"
|
||||
|
||||
log_info "Installing agent-browser (npm, prefix=$NODE_PREFIX)..."
|
||||
if ! npm install -g --prefix "$NODE_PREFIX" --silent \
|
||||
agent-browser@^0.26.0 \
|
||||
"@askjo/camofox-browser@^1.5.2"; then
|
||||
log_error "npm install -g agent-browser failed"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# macOS/Linux global installs place the shim into $NODE_PREFIX/bin/.
|
||||
# Add it to PATH for any subsequent steps (npx playwright).
|
||||
export PATH="$NODE_PREFIX/bin:$PATH"
|
||||
|
||||
log_success "agent-browser installed to $NODE_PREFIX/bin/"
|
||||
}
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# Step 3: Playwright Chromium
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
confirm_chromium_download() {
|
||||
if [ "$ASSUME_YES" = true ]; then return 0; fi
|
||||
if [ ! -t 0 ]; then
|
||||
log_warn "Non-interactive shell — skipping Chromium prompt."
|
||||
log_info "Re-run with --yes to install Chromium (~400 MB download)."
|
||||
return 1
|
||||
fi
|
||||
printf "Install Playwright Chromium (~400 MB download)? [y/N] "
|
||||
local reply=""
|
||||
read -r reply || reply=""
|
||||
case "$reply" in
|
||||
y|Y|yes|YES) return 0 ;;
|
||||
*) return 1 ;;
|
||||
esac
|
||||
}
|
||||
|
||||
# Detect a usable system Chrome/Chromium. agent-browser's Chrome engine can
|
||||
# use it instead of downloading Playwright's bundled Chromium, saving the
|
||||
# download cost. Returns the path or empty string.
|
||||
find_system_browser() {
|
||||
local candidate
|
||||
for candidate in google-chrome google-chrome-stable chromium chromium-browser chrome; do
|
||||
if command -v "$candidate" >/dev/null 2>&1; then
|
||||
command -v "$candidate"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
# macOS app-bundle locations
|
||||
if [ "$OS" = "macos" ]; then
|
||||
for candidate in \
|
||||
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" \
|
||||
"/Applications/Chromium.app/Contents/MacOS/Chromium" ; do
|
||||
if [ -x "$candidate" ]; then
|
||||
echo "$candidate"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
fi
|
||||
return 1
|
||||
}
|
||||
|
||||
write_browser_env() {
|
||||
local browser_path="$1"
|
||||
local env_file="$HERMES_HOME/.env"
|
||||
mkdir -p "$HERMES_HOME"
|
||||
if [ -f "$env_file" ] && grep -q "^AGENT_BROWSER_EXECUTABLE_PATH=" "$env_file"; then
|
||||
return 0
|
||||
fi
|
||||
{
|
||||
echo ""
|
||||
echo "# Hermes Agent browser tools — use the system Chrome/Chromium binary."
|
||||
echo "AGENT_BROWSER_EXECUTABLE_PATH=$browser_path"
|
||||
} >> "$env_file"
|
||||
log_success "Configured browser tools to use $browser_path"
|
||||
}
|
||||
|
||||
ensure_chromium() {
|
||||
if [ "$SKIP_CHROMIUM" = true ]; then
|
||||
log_info "Skipping Chromium install (--skip-chromium)"
|
||||
return 0
|
||||
fi
|
||||
|
||||
local system_browser
|
||||
system_browser="$(find_system_browser 2>/dev/null || true)"
|
||||
if [ -n "$system_browser" ]; then
|
||||
log_success "Found system browser: $system_browser"
|
||||
log_info "Skipping Playwright Chromium download; agent-browser will use it."
|
||||
write_browser_env "$system_browser"
|
||||
return 0
|
||||
fi
|
||||
|
||||
if ! confirm_chromium_download; then
|
||||
log_info "Chromium install skipped. Browser tools will only work if you"
|
||||
log_info "set AGENT_BROWSER_EXECUTABLE_PATH or install Chromium later."
|
||||
return 0
|
||||
fi
|
||||
|
||||
if ! command -v npx >/dev/null 2>&1; then
|
||||
log_error "npx not on PATH — cannot install Playwright Chromium"
|
||||
return 1
|
||||
fi
|
||||
|
||||
log_info "Installing Playwright Chromium (~400 MB) ..."
|
||||
|
||||
# On apt-based distros, --with-deps requires sudo. Try non-interactively
|
||||
# only — never prompt — and fall back to the bare browser-only install.
|
||||
local installed=false
|
||||
if [ "$OS" = "linux" ]; then
|
||||
case "$DISTRO" in
|
||||
ubuntu|debian|raspbian|pop|linuxmint|elementary|zorin|kali|parrot)
|
||||
if [ "$(id -u)" -eq 0 ] || (command -v sudo >/dev/null 2>&1 && sudo -n true 2>/dev/null); then
|
||||
log_info "Installing system deps with --with-deps (sudo available)"
|
||||
if npx --yes playwright install --with-deps chromium; then
|
||||
installed=true
|
||||
fi
|
||||
else
|
||||
log_warn "sudo not available non-interactively — installing Chromium without system deps."
|
||||
log_info "If browser tools fail to launch, an administrator should run:"
|
||||
log_info " sudo npx playwright install-deps chromium"
|
||||
fi
|
||||
;;
|
||||
arch|manjaro|cachyos|endeavouros|garuda)
|
||||
log_info "Arch-family system dependencies are not auto-installed."
|
||||
log_info "If launch fails, run: sudo pacman -S nss atk at-spi2-core cups libdrm libxkbcommon mesa pango cairo alsa-lib"
|
||||
;;
|
||||
fedora|rhel|centos|rocky|alma)
|
||||
log_info "Fedora/RHEL system dependencies are not auto-installed."
|
||||
log_info "If launch fails, run: sudo dnf install nss atk at-spi2-core cups-libs libdrm libxkbcommon mesa-libgbm pango cairo alsa-lib"
|
||||
;;
|
||||
opensuse*|sles)
|
||||
log_info "openSUSE system dependencies are not auto-installed."
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
|
||||
if [ "$installed" = false ]; then
|
||||
if npx --yes playwright install chromium; then
|
||||
installed=true
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "$installed" = true ]; then
|
||||
log_success "Playwright Chromium installed"
|
||||
else
|
||||
log_error "Playwright Chromium install failed"
|
||||
log_info "Try again later: npx --yes playwright install chromium"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# Main
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
main() {
|
||||
log_info "Hermes Agent: bootstrapping browser tools"
|
||||
log_info " HERMES_HOME = $HERMES_HOME"
|
||||
log_info " OS / arch = $NODE_OS-$NODE_ARCH ${DISTRO:+($DISTRO)}"
|
||||
|
||||
ensure_node
|
||||
ensure_agent_browser
|
||||
ensure_chromium
|
||||
|
||||
log_success "Browser tools setup complete."
|
||||
log_info "Hermes Agent will pick up agent-browser from $NODE_PREFIX/bin/ on next launch."
|
||||
}
|
||||
|
||||
main
|
||||
+144
-1
@@ -24,6 +24,7 @@ except ModuleNotFoundError:
|
||||
# means UTF-8 stdio setup is skipped on Windows; POSIX is unaffected.
|
||||
pass
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import sys
|
||||
@@ -107,8 +108,150 @@ def _load_env() -> None:
|
||||
)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="hermes-acp",
|
||||
description="Run Hermes Agent as an ACP stdio server.",
|
||||
)
|
||||
parser.add_argument("--version", action="store_true", help="Print Hermes version and exit")
|
||||
parser.add_argument(
|
||||
"--check",
|
||||
action="store_true",
|
||||
help="Verify ACP dependencies and adapter imports, then exit",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--setup",
|
||||
action="store_true",
|
||||
help="Run interactive Hermes provider/model setup for ACP terminal auth",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--setup-browser",
|
||||
action="store_true",
|
||||
help="Install agent-browser + Playwright Chromium into ~/.hermes/node/ "
|
||||
"for browser tool support. Idempotent.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--yes",
|
||||
"-y",
|
||||
action="store_true",
|
||||
dest="assume_yes",
|
||||
help="Accept all prompts (currently used by --setup-browser to skip the "
|
||||
"~400 MB Chromium download confirmation).",
|
||||
)
|
||||
return parser.parse_args(argv)
|
||||
|
||||
|
||||
def _print_version() -> None:
|
||||
from hermes_cli import __version__ as hermes_version
|
||||
|
||||
print(hermes_version)
|
||||
|
||||
|
||||
def _run_check() -> None:
|
||||
import acp # noqa: F401
|
||||
from acp_adapter.server import HermesACPAgent # noqa: F401
|
||||
|
||||
print("Hermes ACP check OK")
|
||||
|
||||
|
||||
def _run_setup() -> None:
|
||||
from hermes_cli.main import main as hermes_main
|
||||
|
||||
old_argv = sys.argv[:]
|
||||
try:
|
||||
sys.argv = [old_argv[0] if old_argv else "hermes", "model"]
|
||||
hermes_main()
|
||||
finally:
|
||||
sys.argv = old_argv
|
||||
|
||||
# Offer browser-tools install as a follow-up. The terminal auth method
|
||||
# is the one supported first-run UX for registry installs, so this is
|
||||
# the natural moment to ask. Skip silently if stdin isn't a TTY (the
|
||||
# answer can't be collected anyway).
|
||||
if not sys.stdin.isatty():
|
||||
return
|
||||
try:
|
||||
reply = input(
|
||||
"\nInstall browser tools? Downloads agent-browser (npm) and "
|
||||
"optionally Playwright Chromium (~400 MB). [y/N] "
|
||||
).strip().lower()
|
||||
except (EOFError, KeyboardInterrupt):
|
||||
return
|
||||
if reply in {"y", "yes"}:
|
||||
_run_setup_browser(assume_yes=False)
|
||||
|
||||
|
||||
def _run_setup_browser(assume_yes: bool = False) -> int:
|
||||
"""Bootstrap agent-browser + Playwright Chromium for the registry-install path.
|
||||
|
||||
Shells out to the bundled platform-specific bootstrap script
|
||||
(acp_adapter/bootstrap/bootstrap_browser_tools.{sh,ps1}) so the install
|
||||
logic lives in one place — readable, debuggable, and shareable with
|
||||
install.sh / install.ps1 if we ever want to call it from there too.
|
||||
|
||||
Returns the script's exit code (0 on success).
|
||||
"""
|
||||
import platform
|
||||
import subprocess
|
||||
|
||||
bootstrap_dir = Path(__file__).resolve().parent / "bootstrap"
|
||||
|
||||
if platform.system() == "Windows":
|
||||
script = bootstrap_dir / "bootstrap_browser_tools.ps1"
|
||||
if not script.is_file():
|
||||
print(
|
||||
f"Bootstrap script not found at {script} — wheel may be incomplete.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 1
|
||||
cmd = [
|
||||
"powershell.exe",
|
||||
"-NoProfile",
|
||||
"-ExecutionPolicy", "Bypass",
|
||||
"-File", str(script),
|
||||
]
|
||||
if assume_yes:
|
||||
cmd.append("-Yes")
|
||||
else:
|
||||
script = bootstrap_dir / "bootstrap_browser_tools.sh"
|
||||
if not script.is_file():
|
||||
print(
|
||||
f"Bootstrap script not found at {script} — wheel may be incomplete.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 1
|
||||
cmd = ["bash", str(script)]
|
||||
if assume_yes:
|
||||
cmd.append("--yes")
|
||||
|
||||
# stdio is inherited so the user sees the bootstrap's progress live.
|
||||
try:
|
||||
result = subprocess.run(cmd, check=False)
|
||||
except FileNotFoundError as exc:
|
||||
# bash / powershell.exe not on PATH
|
||||
print(f"Could not launch browser bootstrap: {exc}", file=sys.stderr)
|
||||
return 1
|
||||
return result.returncode
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> None:
|
||||
"""Entry point: load env, configure logging, run the ACP agent."""
|
||||
args = _parse_args(argv)
|
||||
if args.version:
|
||||
_print_version()
|
||||
return
|
||||
if args.check:
|
||||
_run_check()
|
||||
return
|
||||
if args.setup:
|
||||
_run_setup()
|
||||
return
|
||||
if args.setup_browser:
|
||||
rc = _run_setup_browser(assume_yes=args.assume_yes)
|
||||
if rc != 0:
|
||||
sys.exit(rc)
|
||||
return
|
||||
|
||||
_setup_logging()
|
||||
_load_env()
|
||||
|
||||
|
||||
+74
-3
@@ -14,6 +14,7 @@ from collections import deque
|
||||
from typing import Any, Callable, Deque, Dict
|
||||
|
||||
import acp
|
||||
from acp.schema import AgentPlanUpdate, PlanEntry
|
||||
|
||||
from .tools import (
|
||||
build_tool_complete,
|
||||
@@ -24,6 +25,65 @@ from .tools import (
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _json_loads_maybe_prefix(value: str) -> Any:
|
||||
"""Parse a JSON object even when Hermes appended a human hint after it."""
|
||||
text = value.strip()
|
||||
try:
|
||||
return json.loads(text)
|
||||
except Exception:
|
||||
decoder = json.JSONDecoder()
|
||||
data, _ = decoder.raw_decode(text)
|
||||
return data
|
||||
|
||||
|
||||
def _build_plan_update_from_todo_result(result: Any) -> AgentPlanUpdate | None:
|
||||
"""Translate Hermes' todo tool result into ACP's native plan update.
|
||||
|
||||
Zed renders ``sessionUpdate: plan`` as its first-class task/todo panel. The
|
||||
Hermes agent already maintains task state through the ``todo`` tool, so the
|
||||
ACP adapter should expose that state natively instead of only as a generic
|
||||
tool-call transcript block.
|
||||
"""
|
||||
if not isinstance(result, str) or not result.strip():
|
||||
return None
|
||||
|
||||
try:
|
||||
data = _json_loads_maybe_prefix(result)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
if not isinstance(data, dict) or not isinstance(data.get("todos"), list):
|
||||
return None
|
||||
|
||||
todos = data["todos"]
|
||||
if not todos:
|
||||
return AgentPlanUpdate(session_update="plan", entries=[])
|
||||
|
||||
status_map = {
|
||||
"pending": "pending",
|
||||
"in_progress": "in_progress",
|
||||
"completed": "completed",
|
||||
# ACP plans only support pending/in_progress/completed. Preserve
|
||||
# cancelled tasks as terminal entries instead of dropping them and
|
||||
# making the client's full-list replacement lose visible context.
|
||||
"cancelled": "completed",
|
||||
}
|
||||
entries: list[PlanEntry] = []
|
||||
for item in todos:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
content = str(item.get("content") or item.get("id") or "").strip()
|
||||
if not content:
|
||||
continue
|
||||
raw_status = str(item.get("status") or "pending").strip()
|
||||
status = status_map.get(raw_status, "pending")
|
||||
if raw_status == "cancelled":
|
||||
content = f"[cancelled] {content}"
|
||||
entries.append(PlanEntry(content=content, priority="medium", status=status))
|
||||
|
||||
return AgentPlanUpdate(session_update="plan", entries=entries)
|
||||
|
||||
|
||||
def _send_update(
|
||||
conn: acp.Client,
|
||||
session_id: str,
|
||||
@@ -31,10 +91,17 @@ def _send_update(
|
||||
update: Any,
|
||||
) -> None:
|
||||
"""Fire-and-forget an ACP session update from a worker thread."""
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
|
||||
future = safe_schedule_threadsafe(
|
||||
conn.session_update(session_id, update),
|
||||
loop,
|
||||
logger=logger,
|
||||
log_message="Failed to send ACP update",
|
||||
)
|
||||
if future is None:
|
||||
return
|
||||
try:
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
conn.session_update(session_id, update), loop
|
||||
)
|
||||
future.result(timeout=5)
|
||||
except Exception:
|
||||
logger.debug("Failed to send ACP update", exc_info=True)
|
||||
@@ -168,6 +235,10 @@ def make_step_cb(
|
||||
snapshot=meta.get("snapshot"),
|
||||
)
|
||||
_send_update(conn, session_id, loop, update)
|
||||
if tool_name == "todo":
|
||||
plan_update = _build_plan_update_from_todo_result(result)
|
||||
if plan_update is not None:
|
||||
_send_update(conn, session_id, loop, plan_update)
|
||||
if not queue:
|
||||
tool_call_ids.pop(tool_name, None)
|
||||
|
||||
|
||||
+17
-10
@@ -111,21 +111,28 @@ def make_approval_callback(
|
||||
allow_permanent: bool = True,
|
||||
**_: object,
|
||||
) -> str:
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
|
||||
options = _build_permission_options(allow_permanent=allow_permanent)
|
||||
|
||||
future = None
|
||||
tool_call = _build_permission_tool_call(command, description)
|
||||
coro = request_permission_fn(
|
||||
session_id=session_id,
|
||||
tool_call=tool_call,
|
||||
options=options,
|
||||
)
|
||||
future = safe_schedule_threadsafe(
|
||||
coro, loop,
|
||||
logger=logger,
|
||||
log_message="Permission request: failed to schedule on loop",
|
||||
)
|
||||
if future is None:
|
||||
return "deny"
|
||||
|
||||
try:
|
||||
tool_call = _build_permission_tool_call(command, description)
|
||||
coro = request_permission_fn(
|
||||
session_id=session_id,
|
||||
tool_call=tool_call,
|
||||
options=options,
|
||||
)
|
||||
future = asyncio.run_coroutine_threadsafe(coro, loop)
|
||||
response = future.result(timeout=timeout)
|
||||
except (FutureTimeout, Exception) as exc:
|
||||
if future is not None:
|
||||
future.cancel()
|
||||
future.cancel()
|
||||
logger.warning("Permission request timed out or failed: %s", exc)
|
||||
return "deny"
|
||||
|
||||
|
||||
+20
-21
@@ -57,14 +57,9 @@ from acp.schema import (
|
||||
UserMessageChunk,
|
||||
)
|
||||
|
||||
# AuthMethodAgent was renamed from AuthMethod in agent-client-protocol 0.9.0
|
||||
try:
|
||||
from acp.schema import AuthMethodAgent
|
||||
except ImportError:
|
||||
from acp.schema import AuthMethod as AuthMethodAgent # type: ignore[attr-defined]
|
||||
|
||||
from acp_adapter.auth import detect_provider
|
||||
from acp_adapter.auth import TERMINAL_SETUP_AUTH_METHOD_ID, build_auth_methods, detect_provider
|
||||
from acp_adapter.events import (
|
||||
_build_plan_update_from_todo_result,
|
||||
make_message_cb,
|
||||
make_step_cb,
|
||||
make_thinking_cb,
|
||||
@@ -744,16 +739,7 @@ class HermesACPAgent(acp.Agent):
|
||||
resolved_protocol_version = (
|
||||
protocol_version if isinstance(protocol_version, int) else acp.PROTOCOL_VERSION
|
||||
)
|
||||
provider = detect_provider()
|
||||
auth_methods = None
|
||||
if provider:
|
||||
auth_methods = [
|
||||
AuthMethodAgent(
|
||||
id=provider,
|
||||
name=f"{provider} runtime credentials",
|
||||
description=f"Authenticate Hermes using the currently configured {provider} runtime credentials.",
|
||||
)
|
||||
]
|
||||
auth_methods = build_auth_methods()
|
||||
|
||||
client_name = client_info.name if client_info else "unknown"
|
||||
logger.info(
|
||||
@@ -784,10 +770,18 @@ class HermesACPAgent(acp.Agent):
|
||||
# server has provider credentials configured — harmless under
|
||||
# Hermes' threat model (ACP is stdio-only, local-trust), but poor
|
||||
# API hygiene and confusing if ACP ever grows multi-method auth.
|
||||
provider = detect_provider()
|
||||
if not provider:
|
||||
if not isinstance(method_id, str):
|
||||
return None
|
||||
if not isinstance(method_id, str) or method_id.strip().lower() != provider:
|
||||
normalized_method = method_id.strip().lower()
|
||||
provider = detect_provider()
|
||||
|
||||
if normalized_method == TERMINAL_SETUP_AUTH_METHOD_ID:
|
||||
# Terminal auth launches Hermes setup/model selection out-of-band.
|
||||
# Only report success once that flow has produced usable runtime
|
||||
# credentials for the normal ACP session.
|
||||
return AuthenticateResponse() if provider else None
|
||||
|
||||
if not provider or normalized_method != provider:
|
||||
return None
|
||||
return AuthenticateResponse()
|
||||
|
||||
@@ -917,15 +911,20 @@ class HermesACPAgent(acp.Agent):
|
||||
if not tool_call_id or not tool_name:
|
||||
continue
|
||||
result = message.get("content")
|
||||
result_text = result if isinstance(result, str) else None
|
||||
if not await _send(
|
||||
build_tool_complete(
|
||||
tool_call_id,
|
||||
tool_name,
|
||||
result=result if isinstance(result, str) else None,
|
||||
result=result_text,
|
||||
function_args=function_args,
|
||||
)
|
||||
):
|
||||
return
|
||||
if tool_name == "todo":
|
||||
plan_update = _build_plan_update_from_todo_result(result_text)
|
||||
if plan_update is not None and not await _send(plan_update):
|
||||
return
|
||||
|
||||
async def new_session(
|
||||
self,
|
||||
|
||||
+12
-8
@@ -1,12 +1,16 @@
|
||||
{
|
||||
"schema_version": 1,
|
||||
"name": "hermes-agent",
|
||||
"display_name": "Hermes Agent",
|
||||
"description": "AI agent by Nous Research with 90+ tools, persistent memory, and multi-platform support",
|
||||
"icon": "icon.svg",
|
||||
"id": "hermes-agent",
|
||||
"name": "Hermes Agent",
|
||||
"version": "0.13.0",
|
||||
"description": "Self-improving open-source AI agent by Nous Research with ACP editor integration, persistent memory, skills, and rich tool support.",
|
||||
"repository": "https://github.com/NousResearch/hermes-agent",
|
||||
"website": "https://hermes-agent.nousresearch.com/docs/user-guide/features/acp",
|
||||
"authors": ["Nous Research"],
|
||||
"license": "MIT",
|
||||
"distribution": {
|
||||
"type": "command",
|
||||
"command": "hermes",
|
||||
"args": ["acp"]
|
||||
"uvx": {
|
||||
"package": "hermes-agent[acp]==0.13.0",
|
||||
"args": ["hermes-acp"]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+7
-24
@@ -1,25 +1,8 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64" width="64" height="64">
|
||||
<defs>
|
||||
<linearGradient id="gold" x1="0%" y1="0%" x2="0%" y2="100%">
|
||||
<stop offset="0%" style="stop-color:#F5C542;stop-opacity:1" />
|
||||
<stop offset="100%" style="stop-color:#D4961C;stop-opacity:1" />
|
||||
</linearGradient>
|
||||
</defs>
|
||||
<!-- Staff -->
|
||||
<rect x="30" y="10" width="4" height="46" rx="2" fill="url(#gold)" />
|
||||
<!-- Wings (left) -->
|
||||
<path d="M30 18 C24 14, 14 14, 10 18 C14 16, 22 16, 28 20" fill="#F5C542" opacity="0.9" />
|
||||
<path d="M30 22 C26 19, 18 19, 14 22 C18 20, 24 20, 28 24" fill="#D4961C" opacity="0.8" />
|
||||
<!-- Wings (right) -->
|
||||
<path d="M34 18 C40 14, 50 14, 54 18 C50 16, 42 16, 36 20" fill="#F5C542" opacity="0.9" />
|
||||
<path d="M34 22 C38 19, 46 19, 50 22 C46 20, 40 20, 36 24" fill="#D4961C" opacity="0.8" />
|
||||
<!-- Left serpent -->
|
||||
<path d="M32 48 C22 44, 20 38, 26 34 C20 36, 18 42, 24 46 C18 40, 22 30, 30 28 C24 32, 22 38, 28 42"
|
||||
fill="none" stroke="#F5C542" stroke-width="2.5" stroke-linecap="round" />
|
||||
<!-- Right serpent -->
|
||||
<path d="M32 48 C42 44, 44 38, 38 34 C44 36, 46 42, 40 46 C46 40, 42 30, 34 28 C40 32, 42 38, 36 42"
|
||||
fill="none" stroke="#D4961C" stroke-width="2.5" stroke-linecap="round" />
|
||||
<!-- Orb at top -->
|
||||
<circle cx="32" cy="10" r="4" fill="#F5C542" />
|
||||
<circle cx="32" cy="10" r="2" fill="#FFF8E1" opacity="0.7" />
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16" width="16" height="16" fill="none">
|
||||
<path d="M8 1.5v13" stroke="currentColor" stroke-width="1.5" stroke-linecap="round"/>
|
||||
<path d="M8 3.25c-2.35-1.4-4.7-.95-6.25.35 1.85-.2 3.8.2 5.55 1.55" stroke="currentColor" stroke-width="1.1" stroke-linecap="round" stroke-linejoin="round"/>
|
||||
<path d="M8 3.25c2.35-1.4 4.7-.95 6.25.35-1.85-.2-3.8.2-5.55 1.55" stroke="currentColor" stroke-width="1.1" stroke-linecap="round" stroke-linejoin="round"/>
|
||||
<path d="M8 13.25c-2.3-1-3.05-2.65-1.35-4.15-2 .8-2.35 2.95-.35 4" stroke="currentColor" stroke-width="1.1" stroke-linecap="round" stroke-linejoin="round"/>
|
||||
<path d="M8 13.25c2.3-1 3.05-2.65 1.35-4.15 2 .8 2.35 2.95.35 4" stroke="currentColor" stroke-width="1.1" stroke-linecap="round" stroke-linejoin="round"/>
|
||||
<circle cx="8" cy="1.8" r="1.1" fill="currentColor"/>
|
||||
</svg>
|
||||
|
||||
|
Before Width: | Height: | Size: 1.4 KiB After Width: | Height: | Size: 882 B |
@@ -1060,10 +1060,12 @@ def _generate_pkce() -> tuple:
|
||||
|
||||
def run_hermes_oauth_login_pure() -> Optional[Dict[str, Any]]:
|
||||
"""Run Hermes-native OAuth PKCE flow and return credential state."""
|
||||
import secrets
|
||||
import time
|
||||
import webbrowser
|
||||
|
||||
verifier, challenge = _generate_pkce()
|
||||
oauth_state = secrets.token_urlsafe(32)
|
||||
|
||||
params = {
|
||||
"code": "true",
|
||||
@@ -1073,7 +1075,7 @@ def run_hermes_oauth_login_pure() -> Optional[Dict[str, Any]]:
|
||||
"scope": _OAUTH_SCOPES,
|
||||
"code_challenge": challenge,
|
||||
"code_challenge_method": "S256",
|
||||
"state": verifier,
|
||||
"state": oauth_state,
|
||||
}
|
||||
from urllib.parse import urlencode
|
||||
|
||||
@@ -1110,7 +1112,12 @@ def run_hermes_oauth_login_pure() -> Optional[Dict[str, Any]]:
|
||||
|
||||
splits = auth_code.split("#")
|
||||
code = splits[0]
|
||||
state = splits[1] if len(splits) > 1 else ""
|
||||
received_state = splits[1] if len(splits) > 1 else ""
|
||||
|
||||
# Validate state to prevent CSRF (RFC 6749 §10.12)
|
||||
if received_state != oauth_state:
|
||||
logger.warning("OAuth state mismatch — possible CSRF, aborting")
|
||||
return None
|
||||
|
||||
try:
|
||||
import urllib.request
|
||||
@@ -1119,7 +1126,7 @@ def run_hermes_oauth_login_pure() -> Optional[Dict[str, Any]]:
|
||||
"grant_type": "authorization_code",
|
||||
"client_id": _OAUTH_CLIENT_ID,
|
||||
"code": code,
|
||||
"state": state,
|
||||
"state": received_state,
|
||||
"redirect_uri": _OAUTH_REDIRECT_URI,
|
||||
"code_verifier": verifier,
|
||||
}).encode()
|
||||
|
||||
@@ -0,0 +1,68 @@
|
||||
"""Async/sync bridging helpers.
|
||||
|
||||
The codebase has ~30 sites that schedule a coroutine onto an event loop from a
|
||||
worker thread via :func:`asyncio.run_coroutine_threadsafe`. That function can
|
||||
raise :class:`RuntimeError` (e.g. the loop was closed during a shutdown race),
|
||||
and when it does the coroutine object is never awaited and never closed —
|
||||
which triggers a ``"coroutine '<name>' was never awaited"`` RuntimeWarning and
|
||||
leaks the coroutine's frame until GC.
|
||||
|
||||
:func:`safe_schedule_threadsafe` wraps the call, closes the coroutine on
|
||||
scheduling failure, and returns ``None`` (instead of a half-formed future) so
|
||||
callers can branch cleanly:
|
||||
|
||||
fut = safe_schedule_threadsafe(coro, loop)
|
||||
if fut is None:
|
||||
return # or fallback behavior
|
||||
fut.result(timeout=5)
|
||||
|
||||
The helper deliberately does NOT also handle ``future.result()`` failures —
|
||||
that is a separate concern. Once the loop has accepted the coroutine, its
|
||||
lifecycle belongs to the loop, not the scheduling thread.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from concurrent.futures import Future
|
||||
from typing import Any, Coroutine, Optional
|
||||
|
||||
|
||||
_DEFAULT_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def safe_schedule_threadsafe(
|
||||
coro: Coroutine[Any, Any, Any],
|
||||
loop: Optional[asyncio.AbstractEventLoop],
|
||||
*,
|
||||
logger: Optional[logging.Logger] = None,
|
||||
log_message: str = "Failed to schedule coroutine on loop",
|
||||
log_level: int = logging.DEBUG,
|
||||
) -> Optional[Future]:
|
||||
"""Schedule ``coro`` on ``loop`` from a sync context, leak-safe.
|
||||
|
||||
Returns the :class:`concurrent.futures.Future` on success, or ``None`` if
|
||||
the loop is missing or :func:`asyncio.run_coroutine_threadsafe` raised
|
||||
(e.g. the loop was closed during a shutdown race). In all failure paths
|
||||
the coroutine is :meth:`close`-d so it does not trigger
|
||||
``"coroutine was never awaited"`` warnings or leak its frame.
|
||||
|
||||
Callers retain full control over what to do with the returned future
|
||||
(call ``.result(timeout=...)``, attach ``add_done_callback``, ignore it
|
||||
fire-and-forget, etc.).
|
||||
"""
|
||||
log = logger if logger is not None else _DEFAULT_LOGGER
|
||||
|
||||
if loop is None:
|
||||
if asyncio.iscoroutine(coro):
|
||||
coro.close()
|
||||
log.log(log_level, "%s: loop is None", log_message)
|
||||
return None
|
||||
|
||||
try:
|
||||
return asyncio.run_coroutine_threadsafe(coro, loop)
|
||||
except Exception as exc:
|
||||
if asyncio.iscoroutine(coro):
|
||||
coro.close()
|
||||
log.log(log_level, "%s: %s", log_message, exc)
|
||||
return None
|
||||
@@ -369,6 +369,21 @@ def build_or_headers(or_config: dict | None = None) -> dict:
|
||||
|
||||
return headers
|
||||
|
||||
|
||||
# NVIDIA NIM cloud billing attribution. Keep this host-gated because the
|
||||
# nvidia provider also supports local/on-prem NIM endpoints via NVIDIA_BASE_URL.
|
||||
_NVIDIA_NIM_CLOUD_HEADERS = {
|
||||
"X-BILLING-INVOKE-ORIGIN": "HermesAgent",
|
||||
}
|
||||
|
||||
|
||||
def build_nvidia_nim_headers(base_url: str | None) -> dict:
|
||||
"""Return NVIDIA NIM cloud attribution headers for build.nvidia.com traffic."""
|
||||
if base_url_host_matches(str(base_url or ""), "integrate.api.nvidia.com"):
|
||||
return dict(_NVIDIA_NIM_CLOUD_HEADERS)
|
||||
return {}
|
||||
|
||||
|
||||
# Vercel AI Gateway app attribution headers. HTTP-Referer maps to
|
||||
# referrerUrl and X-Title maps to appName in the gateway's analytics.
|
||||
from hermes_cli import __version__ as _HERMES_VERSION
|
||||
@@ -1254,6 +1269,58 @@ def _resolve_nous_runtime_api(*, force_refresh: bool = False) -> Optional[tuple[
|
||||
return api_key, base_url
|
||||
|
||||
|
||||
def _resolve_xai_oauth_for_aux() -> Optional[Tuple[str, str]]:
|
||||
"""Resolve a fresh xAI OAuth (api_key, base_url) for auxiliary clients.
|
||||
|
||||
Prefer the credential pool, matching the main runtime/provider status
|
||||
path. Some xAI OAuth logins live only as pool entries; falling straight
|
||||
to the singleton auth-store resolver would make auxiliary tasks such as
|
||||
compression report "no provider configured" even though ``hermes auth
|
||||
status`` shows xAI OAuth as logged in.
|
||||
|
||||
Falls back to ``hermes_cli.auth``'s singleton runtime resolver for older
|
||||
auth-store-only logins. Returns ``None`` if the user is not authenticated
|
||||
with xAI Grok OAuth.
|
||||
"""
|
||||
try:
|
||||
from hermes_cli.auth import DEFAULT_XAI_OAUTH_BASE_URL
|
||||
|
||||
pool = load_pool("xai-oauth")
|
||||
if pool and pool.has_credentials():
|
||||
entry = pool.select()
|
||||
if entry is not None:
|
||||
api_key = str(
|
||||
getattr(entry, "runtime_api_key", None)
|
||||
or getattr(entry, "access_token", "")
|
||||
or ""
|
||||
).strip()
|
||||
base_url = str(
|
||||
os.getenv("HERMES_XAI_BASE_URL", "").strip().rstrip("/")
|
||||
or os.getenv("XAI_BASE_URL", "").strip().rstrip("/")
|
||||
or getattr(entry, "runtime_base_url", None)
|
||||
or getattr(entry, "base_url", None)
|
||||
or DEFAULT_XAI_OAUTH_BASE_URL
|
||||
).strip().rstrip("/")
|
||||
if api_key and base_url:
|
||||
return api_key, base_url
|
||||
except Exception as exc:
|
||||
logger.debug("Auxiliary xAI OAuth pool credential resolution failed: %s", exc)
|
||||
|
||||
try:
|
||||
from hermes_cli.auth import resolve_xai_oauth_runtime_credentials
|
||||
|
||||
creds = resolve_xai_oauth_runtime_credentials()
|
||||
except Exception as exc:
|
||||
logger.debug("Auxiliary xAI OAuth runtime credential resolution failed: %s", exc)
|
||||
return None
|
||||
|
||||
api_key = str(creds.get("api_key") or "").strip()
|
||||
base_url = str(creds.get("base_url") or "").strip().rstrip("/")
|
||||
if not api_key or not base_url:
|
||||
return None
|
||||
return api_key, base_url
|
||||
|
||||
|
||||
def _read_codex_access_token() -> Optional[str]:
|
||||
"""Read a valid, non-expired Codex OAuth access token from Hermes auth store.
|
||||
|
||||
@@ -1348,6 +1415,8 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||
from hermes_cli.models import copilot_default_headers
|
||||
|
||||
extra["default_headers"] = copilot_default_headers()
|
||||
elif base_url_host_matches(base_url, "integrate.api.nvidia.com"):
|
||||
extra["default_headers"] = build_nvidia_nim_headers(base_url)
|
||||
else:
|
||||
try:
|
||||
from providers import get_provider_profile as _gpf_aux
|
||||
@@ -1383,6 +1452,8 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||
from hermes_cli.models import copilot_default_headers
|
||||
|
||||
extra["default_headers"] = copilot_default_headers()
|
||||
elif base_url_host_matches(base_url, "integrate.api.nvidia.com"):
|
||||
extra["default_headers"] = build_nvidia_nim_headers(base_url)
|
||||
else:
|
||||
try:
|
||||
from providers import get_provider_profile as _gpf_aux2
|
||||
@@ -1456,8 +1527,21 @@ def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||
nous = _read_nous_auth()
|
||||
runtime = _resolve_nous_runtime_api(force_refresh=False)
|
||||
if runtime is None and not nous:
|
||||
logger.warning(
|
||||
"Auxiliary Nous client unavailable: no Nous authentication found "
|
||||
"(run: hermes auth)."
|
||||
)
|
||||
_mark_provider_unhealthy("nous", ttl=60)
|
||||
return None, None
|
||||
if runtime is None and nous:
|
||||
# Runtime credential mint failed but stored Nous auth is still present.
|
||||
# Falls back to the raw stored token below; surface a debug line so
|
||||
# operators investigating expired/invalid sessions have a breadcrumb,
|
||||
# without blocking the fallback path the rest of this function relies on.
|
||||
logger.debug(
|
||||
"Auxiliary Nous: runtime credential mint failed; falling back to "
|
||||
"stored auth.json token."
|
||||
)
|
||||
global auxiliary_is_nous
|
||||
auxiliary_is_nous = True
|
||||
logger.debug("Auxiliary client: Nous Portal")
|
||||
@@ -1731,6 +1815,32 @@ def _try_custom_endpoint() -> Tuple[Optional[Any], Optional[str]]:
|
||||
return _fallback_client, model
|
||||
|
||||
|
||||
def _build_xai_oauth_aux_client(model: str) -> Tuple[Optional[Any], Optional[str]]:
|
||||
"""Build a CodexAuxiliaryClient for an xAI Grok OAuth-authenticated session.
|
||||
|
||||
xAI's ``/v1/responses`` endpoint speaks the OpenAI Responses API, so we
|
||||
wrap a plain ``OpenAI`` client in ``CodexAuxiliaryClient`` to translate
|
||||
``chat.completions.create()`` calls into ``responses.stream()`` requests.
|
||||
|
||||
The caller must pass an explicit model — pinning a default for Grok
|
||||
would silently rot when xAI's allowlist drifts. Returns ``(None, None)``
|
||||
when the user has not authenticated with xAI Grok OAuth.
|
||||
"""
|
||||
if not model:
|
||||
logger.warning(
|
||||
"Auxiliary client: xai-oauth requested without a model; "
|
||||
"pass model explicitly (auxiliary.<task>.model in config.yaml)."
|
||||
)
|
||||
return None, None
|
||||
resolved = _resolve_xai_oauth_for_aux()
|
||||
if resolved is None:
|
||||
return None, None
|
||||
api_key, base_url = resolved
|
||||
logger.debug("Auxiliary client: xAI OAuth (%s via Responses API)", model)
|
||||
real_client = OpenAI(api_key=api_key, base_url=base_url)
|
||||
return CodexAuxiliaryClient(real_client, model), model
|
||||
|
||||
|
||||
def _build_codex_client(model: str) -> Tuple[Optional[Any], Optional[str]]:
|
||||
"""Build a CodexAuxiliaryClient for an explicitly-requested model.
|
||||
|
||||
@@ -2627,6 +2737,8 @@ def _to_async_client(sync_client, model: str, is_vision: bool = False):
|
||||
)
|
||||
elif base_url_host_matches(sync_base_url, "api.kimi.com"):
|
||||
async_kwargs["default_headers"] = {"User-Agent": "claude-code/0.1.0"}
|
||||
elif base_url_host_matches(sync_base_url, "integrate.api.nvidia.com"):
|
||||
async_kwargs["default_headers"] = build_nvidia_nim_headers(sync_base_url)
|
||||
else:
|
||||
# Fall back to profile.default_headers for providers that declare
|
||||
# client-level headers on their ProviderProfile (e.g. attribution
|
||||
@@ -2838,6 +2950,26 @@ def resolve_provider_client(
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
|
||||
else (client, final_model))
|
||||
|
||||
# ── xAI Grok OAuth (loopback PKCE → Responses API) ───────────────
|
||||
# Without this branch, an xai-oauth main provider falls through to the
|
||||
# generic ``oauth_external`` arm below and returns ``(None, None)``,
|
||||
# silently re-routing every auxiliary task (compression, web extract,
|
||||
# session search, curator, etc.) to whatever Step-2 fallback the user
|
||||
# has configured. Users on xAI Grok OAuth would then see surprise
|
||||
# OpenRouter / Nous bills for side tasks they thought were running on
|
||||
# their xAI subscription.
|
||||
if provider == "xai-oauth":
|
||||
client, default = _build_xai_oauth_aux_client(model)
|
||||
if client is None:
|
||||
logger.warning(
|
||||
"resolve_provider_client: xai-oauth requested but no xAI "
|
||||
"OAuth token found (run: hermes model -> xAI Grok OAuth — SuperGrok Subscription)"
|
||||
)
|
||||
return None, None
|
||||
final_model = _normalize_resolved_model(model or default, provider)
|
||||
return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
|
||||
else (client, final_model))
|
||||
|
||||
# ── Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY) ───────────
|
||||
if provider == "custom":
|
||||
if explicit_base_url:
|
||||
@@ -2868,6 +3000,8 @@ def resolve_provider_client(
|
||||
extra["default_headers"] = copilot_request_headers(
|
||||
is_agent_turn=True, is_vision=is_vision
|
||||
)
|
||||
elif base_url_host_matches(custom_base, "integrate.api.nvidia.com"):
|
||||
extra["default_headers"] = build_nvidia_nim_headers(custom_base)
|
||||
else:
|
||||
# Fall back to profile.default_headers for providers that
|
||||
# declare client-level attribution headers on their profile.
|
||||
@@ -3066,6 +3200,8 @@ def resolve_provider_client(
|
||||
headers.update(copilot_request_headers(
|
||||
is_agent_turn=True, is_vision=is_vision
|
||||
))
|
||||
elif base_url_host_matches(base_url, "integrate.api.nvidia.com"):
|
||||
headers.update(build_nvidia_nim_headers(base_url))
|
||||
else:
|
||||
# Fall back to profile.default_headers for providers that declare
|
||||
# client-level attribution headers on their profile (e.g. GMI
|
||||
@@ -3188,6 +3324,8 @@ def resolve_provider_client(
|
||||
return resolve_provider_client("nous", model, async_mode)
|
||||
if provider == "openai-codex":
|
||||
return resolve_provider_client("openai-codex", model, async_mode)
|
||||
if provider == "xai-oauth":
|
||||
return resolve_provider_client("xai-oauth", model, async_mode)
|
||||
# Other OAuth providers not directly supported
|
||||
logger.warning("resolve_provider_client: OAuth provider %s not "
|
||||
"directly supported, try 'auto'", provider)
|
||||
|
||||
@@ -244,8 +244,21 @@ def _normalize_responses_message_status(value: Any, *, default: str = "completed
|
||||
return default
|
||||
|
||||
|
||||
def _chat_messages_to_responses_input(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Convert internal chat-style messages to Responses input items."""
|
||||
def _chat_messages_to_responses_input(
|
||||
messages: List[Dict[str, Any]],
|
||||
*,
|
||||
is_xai_responses: bool = False,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Convert internal chat-style messages to Responses input items.
|
||||
|
||||
``is_xai_responses=True`` strips ``encrypted_content`` from replayed
|
||||
reasoning items. xAI's OAuth/SuperGrok ``/v1/responses`` surface
|
||||
rejects encrypted reasoning blobs minted by prior turns: the request
|
||||
streams an ``error`` SSE frame before ``response.created`` and the
|
||||
OpenAI SDK collapses it into a generic stream-ordering error. Native
|
||||
Codex (chatgpt.com backend-api) DOES accept replayed encrypted_content
|
||||
— keep the default off.
|
||||
"""
|
||||
items: List[Dict[str, Any]] = []
|
||||
seen_item_ids: set = set()
|
||||
|
||||
@@ -271,9 +284,17 @@ def _chat_messages_to_responses_input(messages: List[Dict[str, Any]]) -> List[Di
|
||||
if role == "assistant":
|
||||
# Replay encrypted reasoning items from previous turns
|
||||
# so the API can maintain coherent reasoning chains.
|
||||
#
|
||||
# xAI OAuth (SuperGrok/Premium) rejects replayed
|
||||
# ``encrypted_content`` reasoning items minted by prior
|
||||
# turns — see _chat_messages_to_responses_input docstring.
|
||||
# When ``is_xai_responses`` is set we drop the replay
|
||||
# entirely; Grok still reasons on each turn server-side,
|
||||
# we just don't try to thread the prior turn's encrypted
|
||||
# blob back in.
|
||||
codex_reasoning = msg.get("codex_reasoning_items")
|
||||
has_codex_reasoning = False
|
||||
if isinstance(codex_reasoning, list):
|
||||
if isinstance(codex_reasoning, list) and not is_xai_responses:
|
||||
for ri in codex_reasoning:
|
||||
if isinstance(ri, dict) and ri.get("encrypted_content"):
|
||||
item_id = ri.get("id")
|
||||
@@ -726,7 +747,7 @@ def _preflight_codex_api_kwargs(
|
||||
"model", "instructions", "input", "tools", "store",
|
||||
"reasoning", "include", "max_output_tokens", "temperature",
|
||||
"tool_choice", "parallel_tool_calls", "prompt_cache_key", "service_tier",
|
||||
"extra_headers",
|
||||
"extra_headers", "extra_body",
|
||||
}
|
||||
normalized: Dict[str, Any] = {
|
||||
"model": model,
|
||||
@@ -776,6 +797,19 @@ def _preflight_codex_api_kwargs(
|
||||
if normalized_headers:
|
||||
normalized["extra_headers"] = normalized_headers
|
||||
|
||||
extra_body = api_kwargs.get("extra_body")
|
||||
if extra_body is not None:
|
||||
if not isinstance(extra_body, dict):
|
||||
raise ValueError("Codex Responses request 'extra_body' must be an object.")
|
||||
# Pass extra_body through verbatim — used by xAI Responses to
|
||||
# carry `prompt_cache_key` as a body-level field (the documented
|
||||
# cache-routing surface on /v1/responses). The openai SDK
|
||||
# serializes extra_body into the JSON body without per-field
|
||||
# type checks, so it survives Responses.stream() kwarg-signature
|
||||
# changes that would otherwise raise TypeError before the wire.
|
||||
if extra_body:
|
||||
normalized["extra_body"] = dict(extra_body)
|
||||
|
||||
if allow_stream:
|
||||
stream = api_kwargs.get("stream")
|
||||
if stream is not None and stream is not True:
|
||||
|
||||
@@ -1429,15 +1429,23 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
||||
return messages
|
||||
|
||||
turns_to_summarize = messages[compress_start:compress_end]
|
||||
# A persisted handoff summary can sit in the protected head after a
|
||||
# resume (commonly immediately after the system prompt). Search from
|
||||
# the first non-system message through the compression window so we can
|
||||
# rehydrate iterative-summary state without serializing that handoff as
|
||||
# a new turn. Protected messages after the handoff remain live context,
|
||||
# so only summarize messages that are both after the handoff and inside
|
||||
# the current compression window.
|
||||
summary_search_start = 1 if messages and messages[0].get("role") == "system" else 0
|
||||
summary_idx, summary_body = self._find_latest_context_summary(
|
||||
messages,
|
||||
compress_start,
|
||||
summary_search_start,
|
||||
compress_end,
|
||||
)
|
||||
if summary_idx is not None:
|
||||
if summary_body and not self._previous_summary:
|
||||
self._previous_summary = summary_body
|
||||
turns_to_summarize = messages[summary_idx + 1:compress_end]
|
||||
turns_to_summarize = messages[max(compress_start, summary_idx + 1):compress_end]
|
||||
|
||||
if not self.quiet_mode:
|
||||
logger.info(
|
||||
|
||||
@@ -30,6 +30,28 @@ _DEFAULT_TIMEOUT_SECONDS = 900.0
|
||||
_TOOL_CALL_BLOCK_RE = re.compile(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", re.DOTALL)
|
||||
_TOOL_CALL_JSON_RE = re.compile(r"\{\s*\"id\"\s*:\s*\"[^\"]+\"\s*,\s*\"type\"\s*:\s*\"function\"\s*,\s*\"function\"\s*:\s*\{.*?\}\s*\}", re.DOTALL)
|
||||
|
||||
# Stderr fingerprint of the deprecated `gh copilot` CLI extension
|
||||
# (https://github.blog/changelog/2025-09-25-upcoming-deprecation-of-gh-copilot-cli-extension).
|
||||
# We require BOTH the literal product name ("gh-copilot") AND a deprecation
|
||||
# marker, so generic stderr from the NEW `@github/copilot` CLI — whose repo
|
||||
# is github.com/github/copilot-cli and which legitimately mentions "copilot-cli"
|
||||
# in its own banners and error messages — doesn't get misclassified as the
|
||||
# deprecated extension.
|
||||
_DEPRECATION_REQUIRED = ("gh-copilot",)
|
||||
_DEPRECATION_MARKERS = (
|
||||
"has been deprecated",
|
||||
"no commands will be executed",
|
||||
)
|
||||
|
||||
|
||||
def _is_gh_copilot_deprecation_message(stderr_text: str) -> bool:
|
||||
"""True iff stderr looks like the deprecated gh-copilot extension's banner."""
|
||||
|
||||
lower = stderr_text.lower()
|
||||
if not any(req in lower for req in _DEPRECATION_REQUIRED):
|
||||
return False
|
||||
return any(marker in lower for marker in _DEPRECATION_MARKERS)
|
||||
|
||||
|
||||
def _resolve_command() -> str:
|
||||
return (
|
||||
@@ -506,6 +528,21 @@ class CopilotACPClient:
|
||||
|
||||
stderr_text = "\n".join(stderr_tail).strip()
|
||||
if proc.poll() is not None and stderr_text:
|
||||
if _is_gh_copilot_deprecation_message(stderr_text):
|
||||
raise RuntimeError(
|
||||
"Hermes ACP mode requires the NEW GitHub Copilot CLI "
|
||||
"(github.com/github/copilot-cli), but the binary it just "
|
||||
"spawned is the deprecated `gh copilot` extension.\n\n"
|
||||
"Install the new CLI:\n"
|
||||
" npm install -g @github/copilot\n"
|
||||
" # then verify with: copilot --help\n\n"
|
||||
"If `copilot` already resolves to the new CLI but you still see this,\n"
|
||||
"point Hermes at it explicitly:\n"
|
||||
" export HERMES_COPILOT_ACP_COMMAND=/path/to/new/copilot\n\n"
|
||||
"Alternative: use the `copilot` provider (no ACP, hits the Copilot API\n"
|
||||
"directly with a Copilot subscription token) via `hermes setup`.\n\n"
|
||||
f"Original error:\n{stderr_text}"
|
||||
)
|
||||
raise RuntimeError(f"Copilot ACP process exited early: {stderr_text}")
|
||||
raise TimeoutError(f"Timed out waiting for Copilot ACP response to {method}.")
|
||||
|
||||
|
||||
+180
-4
@@ -29,6 +29,7 @@ from hermes_cli.auth import (
|
||||
_resolve_zai_base_url,
|
||||
_save_auth_store,
|
||||
_save_provider_state,
|
||||
_store_provider_state,
|
||||
read_credential_pool,
|
||||
write_credential_pool,
|
||||
)
|
||||
@@ -539,6 +540,64 @@ class CredentialPool:
|
||||
logger.debug("Failed to sync Codex entry from auth.json: %s", exc)
|
||||
return entry
|
||||
|
||||
def _sync_xai_oauth_entry_from_auth_store(self, entry: PooledCredential) -> PooledCredential:
|
||||
"""Sync an xAI OAuth pool entry from auth.json if tokens differ.
|
||||
|
||||
xAI OAuth refresh tokens are single-use. When another Hermes process
|
||||
(or another profile sharing the same auth.json) refreshes the token,
|
||||
it writes the new pair to ``providers["xai-oauth"]["tokens"]`` under
|
||||
``_auth_store_lock``. Without this resync, our in-memory pool entry
|
||||
keeps the consumed refresh_token and the next ``_refresh_entry`` call
|
||||
would replay it and get a ``refresh_token_reused``-style 4xx.
|
||||
|
||||
Only applies to entries seeded from the singleton (``loopback_pkce``);
|
||||
manually added entries (``manual:xai_pkce``) are independent
|
||||
credentials with their own refresh-token lifecycle.
|
||||
"""
|
||||
if self.provider != "xai-oauth" or entry.source != "loopback_pkce":
|
||||
return entry
|
||||
try:
|
||||
with _auth_store_lock():
|
||||
auth_store = _load_auth_store()
|
||||
state = _load_provider_state(auth_store, "xai-oauth")
|
||||
if not isinstance(state, dict):
|
||||
return entry
|
||||
tokens = state.get("tokens")
|
||||
if not isinstance(tokens, dict):
|
||||
return entry
|
||||
store_access = tokens.get("access_token", "")
|
||||
store_refresh = tokens.get("refresh_token", "")
|
||||
entry_access = entry.access_token or ""
|
||||
entry_refresh = entry.refresh_token or ""
|
||||
if store_access and (
|
||||
store_access != entry_access
|
||||
or (store_refresh and store_refresh != entry_refresh)
|
||||
):
|
||||
logger.debug(
|
||||
"Pool entry %s: syncing xAI OAuth tokens from auth.json "
|
||||
"(refreshed by another process)",
|
||||
entry.id,
|
||||
)
|
||||
field_updates: Dict[str, Any] = {
|
||||
"access_token": store_access,
|
||||
"refresh_token": store_refresh or entry.refresh_token,
|
||||
"last_status": None,
|
||||
"last_status_at": None,
|
||||
"last_error_code": None,
|
||||
"last_error_reason": None,
|
||||
"last_error_message": None,
|
||||
"last_error_reset_at": None,
|
||||
}
|
||||
if state.get("last_refresh"):
|
||||
field_updates["last_refresh"] = state["last_refresh"]
|
||||
updated = replace(entry, **field_updates)
|
||||
self._replace_entry(entry, updated)
|
||||
self._persist()
|
||||
return updated
|
||||
except Exception as exc:
|
||||
logger.debug("Failed to sync xAI OAuth entry from auth.json: %s", exc)
|
||||
return entry
|
||||
|
||||
def _sync_nous_entry_from_auth_store(self, entry: PooledCredential) -> PooledCredential:
|
||||
"""Sync a Nous pool entry from auth.json if tokens differ.
|
||||
|
||||
@@ -604,9 +663,22 @@ class CredentialPool:
|
||||
re-seeding a consumed single-use refresh token.
|
||||
|
||||
Applies to any OAuth provider whose singleton lives in auth.json
|
||||
(currently Nous and OpenAI Codex).
|
||||
(currently Nous, OpenAI Codex, and xAI Grok OAuth).
|
||||
|
||||
``set_active=False`` on every write: a pool sync-back is a
|
||||
token-rotation side effect, not the user choosing a provider.
|
||||
Using ``_save_provider_state`` (which sets ``active_provider``)
|
||||
here would mean every Nous/Codex/xAI refresh in a multi-provider
|
||||
setup silently flips the ``active_provider`` flag — the next
|
||||
``hermes`` invocation that defaults to the active provider
|
||||
(e.g. setup wizard, ``hermes auth status``) would land on
|
||||
whatever provider happened to refresh last, not whatever the
|
||||
user actually chose.
|
||||
"""
|
||||
if entry.source != "device_code":
|
||||
# Only sync entries that were seeded *from* a singleton. Manually
|
||||
# added pool entries (source="manual:*") are independent credentials
|
||||
# and must not write back to the singleton.
|
||||
if entry.source not in {"device_code", "loopback_pkce"}:
|
||||
return
|
||||
try:
|
||||
with _auth_store_lock():
|
||||
@@ -632,7 +704,7 @@ class CredentialPool:
|
||||
state[extra_key] = val
|
||||
if entry.inference_base_url:
|
||||
state["inference_base_url"] = entry.inference_base_url
|
||||
_save_provider_state(auth_store, "nous", state)
|
||||
_store_provider_state(auth_store, "nous", state, set_active=False)
|
||||
|
||||
elif self.provider == "openai-codex":
|
||||
state = _load_provider_state(auth_store, "openai-codex")
|
||||
@@ -646,7 +718,21 @@ class CredentialPool:
|
||||
tokens["refresh_token"] = entry.refresh_token
|
||||
if entry.last_refresh:
|
||||
state["last_refresh"] = entry.last_refresh
|
||||
_save_provider_state(auth_store, "openai-codex", state)
|
||||
_store_provider_state(auth_store, "openai-codex", state, set_active=False)
|
||||
|
||||
elif self.provider == "xai-oauth":
|
||||
state = _load_provider_state(auth_store, "xai-oauth")
|
||||
if not isinstance(state, dict):
|
||||
return
|
||||
tokens = state.get("tokens")
|
||||
if not isinstance(tokens, dict):
|
||||
return
|
||||
tokens["access_token"] = entry.access_token
|
||||
if entry.refresh_token:
|
||||
tokens["refresh_token"] = entry.refresh_token
|
||||
if entry.last_refresh:
|
||||
state["last_refresh"] = entry.last_refresh
|
||||
_store_provider_state(auth_store, "xai-oauth", state, set_active=False)
|
||||
|
||||
else:
|
||||
return
|
||||
@@ -699,6 +785,25 @@ class CredentialPool:
|
||||
refresh_token=refreshed["refresh_token"],
|
||||
last_refresh=refreshed.get("last_refresh"),
|
||||
)
|
||||
elif self.provider == "xai-oauth":
|
||||
# Adopt fresher tokens from auth.json before spending the
|
||||
# refresh_token — single-use tokens consumed by another
|
||||
# process (or another profile sharing the singleton) would
|
||||
# otherwise trigger ``refresh_token_reused`` on the next
|
||||
# POST. Only meaningful for singleton-seeded entries.
|
||||
synced = self._sync_xai_oauth_entry_from_auth_store(entry)
|
||||
if synced is not entry:
|
||||
entry = synced
|
||||
refreshed = auth_mod.refresh_xai_oauth_pure(
|
||||
entry.access_token,
|
||||
entry.refresh_token,
|
||||
)
|
||||
updated = replace(
|
||||
entry,
|
||||
access_token=refreshed["access_token"],
|
||||
refresh_token=refreshed["refresh_token"],
|
||||
last_refresh=refreshed.get("last_refresh"),
|
||||
)
|
||||
elif self.provider == "nous":
|
||||
synced = self._sync_nous_entry_from_auth_store(entry)
|
||||
if synced is not entry:
|
||||
@@ -777,6 +882,30 @@ class CredentialPool:
|
||||
# Credentials file had a valid (non-expired) token — use it directly
|
||||
logger.debug("Credentials file has valid token, using without refresh")
|
||||
return synced
|
||||
# For xai-oauth: same race as nous — another process may have
|
||||
# consumed the refresh token between our proactive sync and the
|
||||
# HTTP call. Re-check auth.json and adopt the fresh tokens if
|
||||
# they have rotated since. Only meaningful for singleton-seeded
|
||||
# (loopback_pkce) entries; manual entries don't share state with
|
||||
# the singleton.
|
||||
if self.provider == "xai-oauth":
|
||||
synced = self._sync_xai_oauth_entry_from_auth_store(entry)
|
||||
if synced.refresh_token != entry.refresh_token:
|
||||
logger.debug(
|
||||
"xAI OAuth refresh failed but auth.json has newer tokens — adopting"
|
||||
)
|
||||
updated = replace(
|
||||
synced,
|
||||
last_status=STATUS_OK,
|
||||
last_status_at=None,
|
||||
last_error_code=None,
|
||||
last_error_reason=None,
|
||||
last_error_message=None,
|
||||
last_error_reset_at=None,
|
||||
)
|
||||
self._replace_entry(synced, updated)
|
||||
self._persist()
|
||||
return updated
|
||||
# For nous: another process may have consumed the refresh token
|
||||
# between our proactive sync and the HTTP call. Re-sync from
|
||||
# auth.json and adopt the fresh tokens if available.
|
||||
@@ -829,6 +958,11 @@ class CredentialPool:
|
||||
entry.access_token,
|
||||
CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS,
|
||||
)
|
||||
if self.provider == "xai-oauth":
|
||||
return auth_mod._xai_access_token_is_expiring(
|
||||
entry.access_token,
|
||||
auth_mod.XAI_ACCESS_TOKEN_REFRESH_SKEW_SECONDS,
|
||||
)
|
||||
if self.provider == "nous":
|
||||
# Nous refresh/mint can require network access and should happen when
|
||||
# runtime credentials are actually resolved, not merely when the pool
|
||||
@@ -883,6 +1017,17 @@ class CredentialPool:
|
||||
if synced is not entry:
|
||||
entry = synced
|
||||
cleared_any = True
|
||||
# For xai-oauth singleton-seeded entries, identical pattern:
|
||||
# an entry frozen as exhausted may simply be holding stale
|
||||
# tokens that another process (or a fresh `hermes model` ->
|
||||
# xAI Grok OAuth login) has since rotated in auth.json.
|
||||
if (self.provider == "xai-oauth"
|
||||
and entry.source == "loopback_pkce"
|
||||
and entry.last_status == STATUS_EXHAUSTED):
|
||||
synced = self._sync_xai_oauth_entry_from_auth_store(entry)
|
||||
if synced is not entry:
|
||||
entry = synced
|
||||
cleared_any = True
|
||||
if entry.last_status == STATUS_EXHAUSTED:
|
||||
exhausted_until = _exhausted_until(entry)
|
||||
if exhausted_until is not None and now < exhausted_until:
|
||||
@@ -1394,6 +1539,37 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
|
||||
},
|
||||
)
|
||||
|
||||
elif provider == "xai-oauth":
|
||||
# When the user logs in via ``hermes model`` -> xAI Grok OAuth,
|
||||
# tokens are written to the auth.json singleton
|
||||
# (``providers["xai-oauth"]``). Surface them in the pool too so
|
||||
# ``hermes auth list`` reflects the logged-in state and so the pool
|
||||
# is the single source of truth for refresh during runtime resolution.
|
||||
if _is_suppressed(provider, "loopback_pkce"):
|
||||
return changed, active_sources
|
||||
|
||||
state = _load_provider_state(auth_store, "xai-oauth")
|
||||
tokens = state.get("tokens") if isinstance(state, dict) else None
|
||||
if isinstance(tokens, dict) and tokens.get("access_token"):
|
||||
active_sources.add("loopback_pkce")
|
||||
from hermes_cli.auth import DEFAULT_XAI_OAUTH_BASE_URL
|
||||
|
||||
base_url = DEFAULT_XAI_OAUTH_BASE_URL
|
||||
changed |= _upsert_entry(
|
||||
entries,
|
||||
provider,
|
||||
"loopback_pkce",
|
||||
{
|
||||
"source": "loopback_pkce",
|
||||
"auth_type": AUTH_TYPE_OAUTH,
|
||||
"access_token": tokens.get("access_token", ""),
|
||||
"refresh_token": tokens.get("refresh_token"),
|
||||
"base_url": base_url,
|
||||
"last_refresh": state.get("last_refresh"),
|
||||
"label": label_from_token(tokens.get("access_token", ""), "loopback_pkce"),
|
||||
},
|
||||
)
|
||||
|
||||
return changed, active_sources
|
||||
|
||||
|
||||
|
||||
@@ -265,6 +265,31 @@ def _remove_minimax_oauth(provider: str, removed) -> RemovalResult:
|
||||
return result
|
||||
|
||||
|
||||
def _remove_xai_oauth_loopback_pkce(provider: str, removed) -> RemovalResult:
|
||||
"""xAI OAuth tokens live in auth.json providers.xai-oauth — clear them.
|
||||
|
||||
Without this step, ``hermes auth remove xai-oauth <N>`` silently undoes
|
||||
itself: the central dispatcher only removes the in-memory pool entry,
|
||||
leaves ``providers.xai-oauth`` in auth.json intact, and on the next
|
||||
``load_pool("xai-oauth")`` call ``_seed_from_singletons`` re-seeds the
|
||||
entry from the still-present singleton — credentials reappear with no
|
||||
user feedback. Clearing the singleton in step with the suppression set
|
||||
by the central dispatcher makes the removal stick.
|
||||
|
||||
Belt-and-braces against the manual entry path: ``hermes auth add
|
||||
xai-oauth`` produces a ``manual:xai_pkce`` entry whose removal step
|
||||
falls through to "unregistered → nothing to clean up" (correct —
|
||||
manual entries are pool-only).
|
||||
"""
|
||||
result = RemovalResult()
|
||||
if _clear_auth_store_provider(provider):
|
||||
result.cleaned.append(f"Cleared {provider} OAuth tokens from auth store")
|
||||
result.hints.append(
|
||||
"Run `hermes model` → xAI Grok OAuth (SuperGrok Subscription) to re-authenticate if needed."
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
def _remove_codex_device_code(provider: str, removed) -> RemovalResult:
|
||||
"""Codex tokens live in TWO places: our auth store AND ~/.codex/auth.json.
|
||||
|
||||
@@ -397,6 +422,11 @@ def _register_all_sources() -> None:
|
||||
remove_fn=_remove_codex_device_code,
|
||||
description="auth.json providers.openai-codex + ~/.codex/auth.json",
|
||||
))
|
||||
register(RemovalStep(
|
||||
provider="xai-oauth", source_id="loopback_pkce",
|
||||
remove_fn=_remove_xai_oauth_loopback_pkce,
|
||||
description="auth.json providers.xai-oauth",
|
||||
))
|
||||
register(RemovalStep(
|
||||
provider="qwen-oauth", source_id="qwen-cli",
|
||||
remove_fn=_remove_qwen_cli,
|
||||
|
||||
@@ -240,21 +240,6 @@ def build_tool_preview(tool_name: str, args: dict, max_len: int | None = None) -
|
||||
msg = msg[:17] + "..."
|
||||
return f"to {target}: \"{msg}\""
|
||||
|
||||
if tool_name.startswith("rl_"):
|
||||
rl_previews = {
|
||||
"rl_list_environments": "listing envs",
|
||||
"rl_select_environment": args.get("name", ""),
|
||||
"rl_get_current_config": "reading config",
|
||||
"rl_edit_config": f"{args.get('field', '')}={args.get('value', '')}",
|
||||
"rl_start_training": "starting",
|
||||
"rl_check_status": args.get("run_id", "")[:16],
|
||||
"rl_stop_training": f"stopping {args.get('run_id', '')[:16]}",
|
||||
"rl_get_results": args.get("run_id", "")[:16],
|
||||
"rl_list_runs": "listing runs",
|
||||
"rl_test_inference": f"{args.get('num_steps', 3)} steps",
|
||||
}
|
||||
return rl_previews.get(tool_name)
|
||||
|
||||
key = primary_args.get(tool_name)
|
||||
if not key:
|
||||
for fallback_key in ("query", "text", "command", "path", "name", "prompt", "code", "goal"):
|
||||
@@ -981,15 +966,6 @@ def get_cute_tool_message(
|
||||
if action == "list":
|
||||
return _wrap(f"┊ ⏰ cron listing {dur}")
|
||||
return _wrap(f"┊ ⏰ cron {action} {args.get('job_id', '')} {dur}")
|
||||
if tool_name.startswith("rl_"):
|
||||
rl = {
|
||||
"rl_list_environments": "list envs", "rl_select_environment": f"select {args.get('name', '')}",
|
||||
"rl_get_current_config": "get config", "rl_edit_config": f"set {args.get('field', '?')}",
|
||||
"rl_start_training": "start training", "rl_check_status": f"status {args.get('run_id', '?')[:12]}",
|
||||
"rl_stop_training": f"stop {args.get('run_id', '?')[:12]}", "rl_get_results": f"results {args.get('run_id', '?')[:12]}",
|
||||
"rl_list_runs": "list runs", "rl_test_inference": "test inference",
|
||||
}
|
||||
return _wrap(f"┊ 🧪 rl {rl.get(tool_name, tool_name.replace('rl_', ''))} {dur}")
|
||||
if tool_name == "execute_code":
|
||||
code = args.get("code", "")
|
||||
first_line = code.strip().split("\n")[0] if code.strip() else ""
|
||||
|
||||
+41
-4
@@ -40,7 +40,7 @@ import os
|
||||
import threading
|
||||
import time
|
||||
from concurrent.futures import Future as ConcurrentFuture
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple
|
||||
|
||||
from agent.lsp import eventlog
|
||||
from agent.lsp.client import (
|
||||
@@ -107,9 +107,14 @@ class _BackgroundLoop:
|
||||
|
||||
Returns the coroutine's result, or raises its exception.
|
||||
"""
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
if self._loop is None:
|
||||
if asyncio.iscoroutine(coro):
|
||||
coro.close()
|
||||
raise RuntimeError("background loop not started")
|
||||
fut: ConcurrentFuture = asyncio.run_coroutine_threadsafe(coro, self._loop)
|
||||
fut = safe_schedule_threadsafe(coro, self._loop)
|
||||
if fut is None:
|
||||
raise RuntimeError("background loop not running")
|
||||
try:
|
||||
return fut.result(timeout=timeout)
|
||||
except Exception:
|
||||
@@ -305,6 +310,7 @@ class LSPService:
|
||||
*,
|
||||
delta: bool = True,
|
||||
timeout: Optional[float] = None,
|
||||
line_shift: Optional[Callable[[int], Optional[int]]] = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Synchronously open ``file_path`` in the right server, wait for
|
||||
diagnostics, return them.
|
||||
@@ -314,6 +320,18 @@ class LSPService:
|
||||
Diagnostics present in the baseline are removed so the caller
|
||||
only sees errors introduced by the current edit.
|
||||
|
||||
When ``line_shift`` is provided, baseline diagnostics are
|
||||
remapped through it before the set-difference. This handles
|
||||
the case where the edit deleted or inserted lines, causing
|
||||
pre-existing diagnostics below the edit point to surface at
|
||||
different line numbers in the post-edit snapshot — without
|
||||
the shift, they'd all look "introduced by this edit". Pass
|
||||
a callable built by
|
||||
:func:`agent.lsp.range_shift.build_line_shift` (pre_text,
|
||||
post_text). Omit when pre/post content isn't available;
|
||||
the unshifted comparison still catches diagnostics that
|
||||
didn't move.
|
||||
|
||||
Returns an empty list when LSP is disabled, when no workspace
|
||||
can be detected, when no server matches, or when the server
|
||||
can't be spawned. Never raises.
|
||||
@@ -344,6 +362,14 @@ class LSPService:
|
||||
if delta:
|
||||
baseline = self._delta_baseline.get(abs_path) or []
|
||||
if baseline:
|
||||
if line_shift is not None:
|
||||
# Remap baseline diagnostics into post-edit
|
||||
# coordinates so shifted-but-otherwise-identical
|
||||
# entries hash equal under _diag_key. Entries
|
||||
# that mapped into a deleted region drop out
|
||||
# silently — they no longer apply.
|
||||
from agent.lsp.range_shift import shift_baseline
|
||||
baseline = shift_baseline(baseline, line_shift)
|
||||
seen = {_diag_key(d) for d in baseline}
|
||||
diags = [d for d in diags if _diag_key(d) not in seen]
|
||||
# Roll baseline forward — next call returns deltas relative
|
||||
@@ -585,8 +611,19 @@ class LSPService:
|
||||
|
||||
|
||||
def _diag_key(d: Dict[str, Any]) -> str:
|
||||
"""Content equality key used for delta filtering. Mirrors
|
||||
:func:`agent.lsp.client._diagnostic_key`."""
|
||||
"""Content equality key used for cross-edit delta filtering.
|
||||
|
||||
Includes the diagnostic's position range — when used together
|
||||
with :func:`agent.lsp.range_shift.shift_baseline`, the baseline
|
||||
is line-shifted into post-edit coordinates BEFORE this key is
|
||||
computed, so identical-but-shifted diagnostics hash equal. Two
|
||||
genuinely distinct diagnostics at different lines (e.g. the same
|
||||
error class introduced at a second site) hash differently and
|
||||
are surfaced as new.
|
||||
|
||||
Mirrors :func:`agent.lsp.client._diagnostic_key`; intentionally
|
||||
identical so the two layers agree on diagnostic identity.
|
||||
"""
|
||||
rng = d.get("range") or {}
|
||||
start = rng.get("start") or {}
|
||||
end = rng.get("end") or {}
|
||||
|
||||
@@ -0,0 +1,149 @@
|
||||
"""Diff-aware line-shift map for cross-edit LSP delta filtering.
|
||||
|
||||
When an edit deletes or inserts lines in the middle of a file, every
|
||||
diagnostic below the edit point shifts to a new line number. The
|
||||
LSPService delta filter subtracts the pre-edit baseline from the
|
||||
post-edit diagnostics keyed on ``(severity, code, source, message,
|
||||
range)`` — without an adjustment, the shifted-but-otherwise-identical
|
||||
diagnostics look brand-new and the agent gets flooded with noise.
|
||||
|
||||
The fix used here is the same trick git's blame and unified diff use:
|
||||
build a piecewise-linear map from pre-edit line numbers to post-edit
|
||||
line numbers, then apply that map to baseline diagnostics before the
|
||||
set-difference. Diagnostics whose pre-edit line is in a region the
|
||||
edit deleted return ``None`` and are dropped from the baseline (they
|
||||
genuinely no longer apply).
|
||||
|
||||
Trade-off vs. dropping range from the key entirely (the previous
|
||||
fix): preserves the "new instance of an identical error at a
|
||||
different line" signal — if the model introduces a second instance
|
||||
of the same error class at a different location, that one will be
|
||||
surfaced as new instead of swallowed by content-only dedup.
|
||||
|
||||
The map is derived from ``difflib.SequenceMatcher.get_opcodes()`` and
|
||||
exposed as a single callable so callers don't have to reason about
|
||||
diff regions.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
|
||||
def build_line_shift(pre_text: str, post_text: str) -> Callable[[int], Optional[int]]:
|
||||
"""Build a function mapping pre-edit line numbers to post-edit line numbers.
|
||||
|
||||
Lines are 0-indexed to match the LSP wire format
|
||||
(``range.start.line`` is 0-indexed).
|
||||
|
||||
The returned callable takes a pre-edit 0-indexed line number and
|
||||
returns the corresponding post-edit 0-indexed line number, or
|
||||
``None`` if that line was deleted by the edit (no post-edit
|
||||
counterpart exists).
|
||||
|
||||
Cost: one ``SequenceMatcher.get_opcodes()`` call up front; the
|
||||
returned closure is O(log n) per call (binary search over opcode
|
||||
regions). Cheap enough to call once per write/patch and apply to
|
||||
every baseline diagnostic.
|
||||
"""
|
||||
pre_lines = pre_text.splitlines() if pre_text else []
|
||||
post_lines = post_text.splitlines() if post_text else []
|
||||
|
||||
# Trivial case: identical content or no content — identity map.
|
||||
if pre_lines == post_lines:
|
||||
return lambda line: line
|
||||
|
||||
# SequenceMatcher.get_opcodes() returns a list of
|
||||
# (tag, i1, i2, j1, j2) where tag is 'equal', 'replace', 'delete',
|
||||
# or 'insert'. i1:i2 is the range in pre, j1:j2 is the range in
|
||||
# post. We build a list of (i1, i2, j1, j2, tag) tuples and
|
||||
# binary-search by i for each lookup.
|
||||
sm = difflib.SequenceMatcher(a=pre_lines, b=post_lines, autojunk=False)
|
||||
opcodes = sm.get_opcodes()
|
||||
|
||||
def shift(line: int) -> Optional[int]:
|
||||
# Find the opcode region whose i1 <= line < i2.
|
||||
# Linear scan is fine — typical opcode count is small (single
|
||||
# digits for a typical patch-tool edit).
|
||||
for tag, i1, i2, j1, j2 in opcodes:
|
||||
if i1 <= line < i2:
|
||||
if tag == "equal":
|
||||
# Pre-line N → post-line (N - i1 + j1).
|
||||
return line - i1 + j1
|
||||
if tag == "delete":
|
||||
# Pre-line is in a deleted region — no post counterpart.
|
||||
return None
|
||||
if tag == "replace":
|
||||
# Replace == delete + insert; the pre-line has no
|
||||
# post counterpart in any meaningful sense. Drop.
|
||||
return None
|
||||
# 'insert' has i1 == i2 so line < i2 can't be hit.
|
||||
if line < i1:
|
||||
# Past the relevant region — handled in earlier iteration.
|
||||
break
|
||||
# Past the last opcode region (line >= len(pre_lines)).
|
||||
# Anchor at end of post.
|
||||
return max(0, len(post_lines) - 1) if post_lines else None
|
||||
|
||||
return shift
|
||||
|
||||
|
||||
def shift_diagnostic_range(diag: Dict[str, Any],
|
||||
shift: Callable[[int], Optional[int]]) -> Optional[Dict[str, Any]]:
|
||||
"""Return a copy of ``diag`` with its line range remapped through ``shift``.
|
||||
|
||||
Returns ``None`` if the diagnostic's start line maps to ``None``
|
||||
(the line was deleted by the edit) — caller drops it from the
|
||||
baseline since the diagnostic no longer applies.
|
||||
|
||||
Both ``start.line`` and ``end.line`` are remapped independently;
|
||||
when only the end maps to ``None`` (rare, multi-line diagnostic
|
||||
straddling the edit boundary) we collapse to a single-line range
|
||||
at the shifted start to keep the diagnostic in the baseline.
|
||||
|
||||
The original ``diag`` is not mutated.
|
||||
"""
|
||||
rng = diag.get("range") or {}
|
||||
start = rng.get("start") or {}
|
||||
end = rng.get("end") or {}
|
||||
|
||||
pre_start_line = int(start.get("line", 0))
|
||||
pre_end_line = int(end.get("line", pre_start_line))
|
||||
|
||||
new_start_line = shift(pre_start_line)
|
||||
if new_start_line is None:
|
||||
return None
|
||||
|
||||
new_end_line = shift(pre_end_line)
|
||||
if new_end_line is None:
|
||||
# Diagnostic straddled the deletion — collapse to start.
|
||||
new_end_line = new_start_line
|
||||
|
||||
shifted = dict(diag)
|
||||
shifted["range"] = {
|
||||
"start": {
|
||||
"line": new_start_line,
|
||||
"character": int(start.get("character", 0)),
|
||||
},
|
||||
"end": {
|
||||
"line": new_end_line,
|
||||
"character": int(end.get("character", 0)),
|
||||
},
|
||||
}
|
||||
return shifted
|
||||
|
||||
|
||||
def shift_baseline(baseline: List[Dict[str, Any]],
|
||||
shift: Callable[[int], Optional[int]]) -> List[Dict[str, Any]]:
|
||||
"""Apply ``shift`` to every diagnostic in ``baseline``, dropping deleted entries."""
|
||||
out: List[Dict[str, Any]] = []
|
||||
for d in baseline:
|
||||
if not isinstance(d, dict):
|
||||
continue
|
||||
shifted = shift_diagnostic_range(d, shift)
|
||||
if shifted is not None:
|
||||
out.append(shifted)
|
||||
return out
|
||||
|
||||
|
||||
__all__ = ["build_line_shift", "shift_diagnostic_range", "shift_baseline"]
|
||||
@@ -213,6 +213,7 @@ DEFAULT_CONTEXT_LENGTHS = {
|
||||
"grok-2-vision": 8192, # grok-2-vision, -1212, -latest
|
||||
"grok-4-fast": 2000000, # grok-4-fast-(non-)reasoning
|
||||
"grok-4.20": 2000000, # grok-4.20-0309-(non-)reasoning, -multi-agent-0309
|
||||
"grok-4.3": 1000000, # grok-4.3, grok-4.3-latest — 1M context per docs.x.ai
|
||||
"grok-4": 256000, # grok-4, grok-4-0709
|
||||
"grok-3": 131072, # grok-3, grok-3-mini, grok-3-fast, grok-3-mini-fast
|
||||
"grok-2": 131072, # grok-2, grok-2-1212, grok-2-latest
|
||||
@@ -357,6 +358,12 @@ _URL_TO_PROVIDER: Dict[str, str] = {
|
||||
"api.deepseek.com": "deepseek",
|
||||
"api.githubcopilot.com": "copilot",
|
||||
"models.github.ai": "copilot",
|
||||
# GitHub Models free tier (Azure-hosted prototyping endpoint) — same
|
||||
# canonical provider as the Copilot API. Hard per-request token cap
|
||||
# (often 8K) makes it unusable for Hermes' system prompt, but mapping
|
||||
# it here lets us recognize the endpoint and emit a targeted hint
|
||||
# instead of falling through the unknown-custom-endpoint path.
|
||||
"models.inference.ai.azure.com": "copilot",
|
||||
"api.fireworks.ai": "fireworks",
|
||||
"opencode.ai": "opencode-go",
|
||||
"api.x.ai": "xai",
|
||||
|
||||
@@ -24,7 +24,10 @@ class ResponsesApiTransport(ProviderTransport):
|
||||
def convert_messages(self, messages: List[Dict[str, Any]], **kwargs) -> Any:
|
||||
"""Convert OpenAI chat messages to Responses API input items."""
|
||||
from agent.codex_responses_adapter import _chat_messages_to_responses_input
|
||||
return _chat_messages_to_responses_input(messages)
|
||||
return _chat_messages_to_responses_input(
|
||||
messages,
|
||||
is_xai_responses=bool(kwargs.get("is_xai_responses")),
|
||||
)
|
||||
|
||||
def convert_tools(self, tools: List[Dict[str, Any]]) -> Any:
|
||||
"""Convert OpenAI tool schemas to Responses API function definitions."""
|
||||
@@ -89,24 +92,38 @@ class ResponsesApiTransport(ProviderTransport):
|
||||
_effort_clamp = {"minimal": "low"}
|
||||
reasoning_effort = _effort_clamp.get(reasoning_effort, reasoning_effort)
|
||||
|
||||
response_tools = _responses_tools(tools)
|
||||
kwargs = {
|
||||
"model": model,
|
||||
"instructions": instructions,
|
||||
"input": _chat_messages_to_responses_input(payload_messages),
|
||||
"tools": _responses_tools(tools),
|
||||
"tool_choice": "auto",
|
||||
"parallel_tool_calls": True,
|
||||
"input": _chat_messages_to_responses_input(
|
||||
payload_messages,
|
||||
is_xai_responses=is_xai_responses,
|
||||
),
|
||||
"tools": response_tools,
|
||||
"store": False,
|
||||
}
|
||||
if response_tools:
|
||||
kwargs["tool_choice"] = "auto"
|
||||
kwargs["parallel_tool_calls"] = True
|
||||
|
||||
session_id = params.get("session_id")
|
||||
if not is_github_responses and session_id:
|
||||
# xAI Responses takes prompt_cache_key in extra_body (set further
|
||||
# down); GitHub Models opts out of cache-key routing entirely.
|
||||
if not is_github_responses and not is_xai_responses and session_id:
|
||||
kwargs["prompt_cache_key"] = session_id
|
||||
|
||||
if reasoning_enabled and is_xai_responses:
|
||||
from agent.model_metadata import grok_supports_reasoning_effort
|
||||
|
||||
kwargs["include"] = ["reasoning.encrypted_content"]
|
||||
# NOTE: Hermes does NOT ask xAI to return ``reasoning.encrypted_content``
|
||||
# any more. xAI's OAuth/SuperGrok ``/v1/responses`` surface rejects
|
||||
# replayed encrypted reasoning items on turn 2+ — see
|
||||
# _chat_messages_to_responses_input docstring. Requesting the field
|
||||
# back would just have us cache something we then must strip. Grok
|
||||
# still reasons natively each turn; coherence across turns rides on
|
||||
# the visible message text alone.
|
||||
kwargs["include"] = []
|
||||
# xAI rejects `reasoning.effort` on grok-4 / grok-4-fast / grok-3
|
||||
# / grok-code-fast / grok-4.20-0309-* with HTTP 400 even though
|
||||
# those models reason natively. Only send the effort dial when
|
||||
@@ -165,6 +182,17 @@ class ResponsesApiTransport(ProviderTransport):
|
||||
merged_extra_headers["x-grok-conv-id"] = session_id
|
||||
kwargs["extra_headers"] = merged_extra_headers
|
||||
|
||||
# xAI Responses cache-routing — body-level field per
|
||||
# https://docs.x.ai/developers/advanced-api-usage/prompt-caching/maximizing-cache-hits.
|
||||
# Sent via extra_body (not the typed kwarg) so it survives openai
|
||||
# SDK builds whose Responses.stream() signature has dropped the field.
|
||||
existing_extra_body = kwargs.get("extra_body")
|
||||
merged_extra_body: Dict[str, Any] = {}
|
||||
if isinstance(existing_extra_body, dict):
|
||||
merged_extra_body.update(existing_extra_body)
|
||||
merged_extra_body.setdefault("prompt_cache_key", session_id)
|
||||
kwargs["extra_body"] = merged_extra_body
|
||||
|
||||
return kwargs
|
||||
|
||||
def normalize_response(self, response: Any, **kwargs) -> NormalizedResponse:
|
||||
|
||||
@@ -14,20 +14,28 @@ the user gets full Hermes capability inside a Codex turn.
|
||||
Scope (what we expose):
|
||||
- web_search, web_extract — Firecrawl, no codex equivalent
|
||||
- browser_navigate / _click / _type / — Camofox/Browserbase automation
|
||||
_snapshot / _screenshot / _scroll / _back / _press / _vision
|
||||
- delegate_task — Hermes subagents
|
||||
_snapshot / _scroll / _back / _press /
|
||||
_get_images / _console / _vision
|
||||
- vision_analyze — image inspection by vision model
|
||||
- image_generate — image generation
|
||||
- memory — Hermes' persistent memory store
|
||||
- skill_view, skills_list — Hermes' skill library
|
||||
- session_search — cross-session search
|
||||
- text_to_speech — TTS
|
||||
- kanban_* (complete/block/comment/ — kanban worker + orchestrator
|
||||
heartbeat/show/list/create/ handoff (stateless: read env var,
|
||||
unblock/link) write ~/.hermes/kanban.db)
|
||||
|
||||
What we DO NOT expose (codex has equivalents):
|
||||
What we DO NOT expose:
|
||||
- terminal / shell — codex's own shell tool
|
||||
- read_file / write_file / patch — codex's apply_patch + shell
|
||||
- search_files / process — codex's shell
|
||||
- clarify, todo — codex's own UX
|
||||
- clarify — codex's own UX
|
||||
- delegate_task / memory / — `_AGENT_LOOP_TOOLS` in Hermes
|
||||
session_search / todo (model_tools.py). They require
|
||||
the running AIAgent context to
|
||||
dispatch (mid-loop state), so a
|
||||
stateless MCP callback can't
|
||||
drive them. See the inline
|
||||
comment on EXPOSED_TOOLS below.
|
||||
|
||||
Run with: python -m agent.transports.hermes_tools_mcp_server
|
||||
Spawned by: CodexAppServerSession.ensure_started() when the runtime is
|
||||
|
||||
+12
-4
@@ -457,7 +457,7 @@ prompt_caching:
|
||||
# Two stores: MEMORY.md (agent's notes) and USER.md (user profile).
|
||||
# Character limits keep the memory small and focused. The agent manages
|
||||
# pruning -- when at the limit, it must consolidate or replace entries.
|
||||
# Disabled by default in batch_runner and RL environments.
|
||||
# Disabled by default in batch_runner.
|
||||
#
|
||||
memory:
|
||||
# Agent's personal notes: environment facts, conventions, things learned
|
||||
@@ -681,6 +681,16 @@ platform_toolsets:
|
||||
# # allowed_chats: ["-1001234567890"]
|
||||
# extra:
|
||||
# disable_link_previews: false # Set true to suppress Telegram URL previews in bot messages
|
||||
#
|
||||
# Discord-specific settings (config.yaml top-level, not under platforms:):
|
||||
#
|
||||
# discord:
|
||||
# require_mention: true # Require @mention in server channels (default: true)
|
||||
# auto_thread: true # Auto-create thread on @mention (default: true)
|
||||
# free_response_channels: "" # Channel IDs where no mention is needed
|
||||
# reactions: true # Show processing reactions (default: true)
|
||||
# history_backfill: true # Recover missed channel messages on mention (default: true)
|
||||
# history_backfill_limit: 50 # Max messages to scan backwards (default: 50)
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Available toolsets (use these names in platform_toolsets or the toolsets list)
|
||||
@@ -705,10 +715,9 @@ platform_toolsets:
|
||||
# todo - todo (in-memory task planning, no deps)
|
||||
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX/MISTRAL key)
|
||||
# cronjob - cronjob (create/list/update/pause/resume/run/remove scheduled tasks)
|
||||
# rl - rl_list_environments, rl_start_training, etc. (requires TINKER_API_KEY)
|
||||
#
|
||||
# PRESETS (curated bundles):
|
||||
# hermes-cli - All of the above except rl + send_message
|
||||
# hermes-cli - All of the above except send_message
|
||||
# hermes-telegram - terminal, file, web, vision, image_gen, tts, browser,
|
||||
# skills, todo, cronjob, send_message
|
||||
# hermes-discord - Same as hermes-telegram
|
||||
@@ -734,7 +743,6 @@ platform_toolsets:
|
||||
# session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization)
|
||||
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax, Mistral)
|
||||
# cronjob - Schedule and manage automated tasks (CLI-only)
|
||||
# rl - RL training tools (Tinker-Atropos)
|
||||
#
|
||||
# Composite toolsets:
|
||||
# debugging - terminal + web + file (for troubleshooting)
|
||||
|
||||
@@ -1965,43 +1965,7 @@ def _resolve_attachment_path(raw_path: str) -> Path | None:
|
||||
return resolved
|
||||
|
||||
|
||||
def _format_process_notification(evt: dict) -> "str | None":
|
||||
"""Format a process notification event into a [IMPORTANT: ...] message.
|
||||
|
||||
Handles both completion events (notify_on_complete) and watch pattern
|
||||
match events from the unified completion_queue.
|
||||
"""
|
||||
evt_type = evt.get("type", "completion")
|
||||
_sid = evt.get("session_id", "unknown")
|
||||
_cmd = evt.get("command", "unknown")
|
||||
|
||||
if evt_type == "watch_disabled":
|
||||
return f"[IMPORTANT: {evt.get('message', '')}]"
|
||||
|
||||
if evt_type == "watch_match":
|
||||
_pat = evt.get("pattern", "?")
|
||||
_out = evt.get("output", "")
|
||||
_sup = evt.get("suppressed", 0)
|
||||
text = (
|
||||
f"[IMPORTANT: Background process {_sid} matched "
|
||||
f"watch pattern \"{_pat}\".\n"
|
||||
f"Command: {_cmd}\n"
|
||||
f"Matched output:\n{_out}"
|
||||
)
|
||||
if _sup:
|
||||
text += f"\n({_sup} earlier matches were suppressed by rate limit)"
|
||||
text += "]"
|
||||
return text
|
||||
|
||||
# Default: completion event
|
||||
_exit = evt.get("exit_code", "?")
|
||||
_out = evt.get("output", "")
|
||||
return (
|
||||
f"[IMPORTANT: Background process {_sid} completed "
|
||||
f"(exit code {_exit}).\n"
|
||||
f"Command: {_cmd}\n"
|
||||
f"Output:\n{_out}]"
|
||||
)
|
||||
|
||||
|
||||
def _detect_file_drop(user_input: str) -> "dict | None":
|
||||
@@ -3235,25 +3199,27 @@ class HermesCLI:
|
||||
|
||||
@staticmethod
|
||||
def _scrollback_box_width(width: Optional[int] = None) -> int:
|
||||
"""Return a resize-safe width for printed scrollback box rules.
|
||||
"""Return the full viewport width for printed scrollback box rules.
|
||||
|
||||
Lines already printed to terminal scrollback are reflowed by the
|
||||
terminal emulator when the column count shrinks. A full-width response
|
||||
border drawn at, say, 200 columns will wrap into two or three rows of
|
||||
dashes after the user resizes to 80 columns, looking like duplicated
|
||||
separator lines (the family of bugs tracked by #18449, #19280, #22976).
|
||||
Previously this clamped to ``max(32, min(width, 56))`` as a defense
|
||||
against terminal-emulator reflow on column-shrink (#25975, salvaging
|
||||
#24403). That clamp made response/reasoning borders look stubby on
|
||||
any modern wide terminal. We now trust the prompt_toolkit
|
||||
``_output_screen_diff`` monkey-patch landed in #26137 (salvaging
|
||||
#25981) to keep chrome out of scrollback in the first place, and
|
||||
accept that an aggressive column-shrink may visually reflow already
|
||||
printed Panel borders — that's a cosmetic artifact of stamped
|
||||
scrollback history, not a live-render bug.
|
||||
|
||||
Keep decorative scrollback boxes intentionally narrower than the
|
||||
viewport so a moderate resize never triggers reflow. The live TUI
|
||||
footer (status bar, input rule) still uses the full width — only
|
||||
content that is *stamped into scrollback* needs this clamp.
|
||||
A small floor (32 cols) is kept so the box still renders on tiny
|
||||
terminals without negative ``'─' * (w - 2)`` math.
|
||||
"""
|
||||
if width is None:
|
||||
try:
|
||||
width = shutil.get_terminal_size((80, 24)).columns
|
||||
except Exception:
|
||||
width = 80
|
||||
return max(32, min(int(width or 80), 56))
|
||||
return max(32, int(width or 80))
|
||||
|
||||
def _tui_input_rule_height(self, position: str, width: Optional[int] = None) -> int:
|
||||
"""Return the visible height for the top/bottom input separator rules."""
|
||||
@@ -3368,8 +3334,11 @@ class HermesCLI:
|
||||
percent_label = f"{percent}%" if percent is not None else "--"
|
||||
duration_label = snapshot["duration"]
|
||||
|
||||
yolo_active = bool(os.getenv("HERMES_YOLO_MODE"))
|
||||
if width < 52:
|
||||
text = f"⚕ {snapshot['model_short']} · {duration_label}"
|
||||
if yolo_active:
|
||||
text += " · ⚠ YOLO"
|
||||
return self._trim_status_bar_text(text, width)
|
||||
if width < 76:
|
||||
parts = [f"⚕ {snapshot['model_short']}", percent_label]
|
||||
@@ -3377,6 +3346,8 @@ class HermesCLI:
|
||||
if compressions:
|
||||
parts.append(f"🗜️ {compressions}")
|
||||
parts.append(duration_label)
|
||||
if yolo_active:
|
||||
parts.append("⚠ YOLO")
|
||||
return self._trim_status_bar_text(" · ".join(parts), width)
|
||||
|
||||
if snapshot["context_length"]:
|
||||
@@ -3394,6 +3365,8 @@ class HermesCLI:
|
||||
prompt_elapsed = snapshot.get("prompt_elapsed")
|
||||
if prompt_elapsed:
|
||||
parts.append(prompt_elapsed)
|
||||
if yolo_active:
|
||||
parts.append("⚠ YOLO")
|
||||
return self._trim_status_bar_text(" │ ".join(parts), width)
|
||||
except Exception:
|
||||
return f"⚕ {self.model if getattr(self, 'model', None) else 'Hermes'}"
|
||||
@@ -3410,6 +3383,7 @@ class HermesCLI:
|
||||
# line and produce duplicated status bar rows over long sessions.
|
||||
width = self._get_tui_terminal_width()
|
||||
duration_label = snapshot["duration"]
|
||||
yolo_active = bool(os.getenv("HERMES_YOLO_MODE"))
|
||||
|
||||
if width < 52:
|
||||
frags = [
|
||||
@@ -3417,8 +3391,11 @@ class HermesCLI:
|
||||
("class:status-bar-strong", snapshot["model_short"]),
|
||||
("class:status-bar-dim", " · "),
|
||||
("class:status-bar-dim", duration_label),
|
||||
("class:status-bar", " "),
|
||||
]
|
||||
if yolo_active:
|
||||
frags.append(("class:status-bar-dim", " · "))
|
||||
frags.append(("class:status-bar-yolo", "⚠ YOLO"))
|
||||
frags.append(("class:status-bar", " "))
|
||||
else:
|
||||
percent = snapshot["context_percent"]
|
||||
percent_label = f"{percent}%" if percent is not None else "--"
|
||||
@@ -3436,8 +3413,11 @@ class HermesCLI:
|
||||
frags.extend([
|
||||
("class:status-bar-dim", " · "),
|
||||
("class:status-bar-dim", duration_label),
|
||||
("class:status-bar", " "),
|
||||
])
|
||||
if yolo_active:
|
||||
frags.append(("class:status-bar-dim", " · "))
|
||||
frags.append(("class:status-bar-yolo", "⚠ YOLO"))
|
||||
frags.append(("class:status-bar", " "))
|
||||
else:
|
||||
if snapshot["context_length"]:
|
||||
ctx_total = _format_context_length(snapshot["context_length"])
|
||||
@@ -3470,6 +3450,9 @@ class HermesCLI:
|
||||
if prompt_elapsed:
|
||||
frags.append(("class:status-bar-dim", " │ "))
|
||||
frags.append(("class:status-bar-dim", prompt_elapsed))
|
||||
if yolo_active:
|
||||
frags.append(("class:status-bar-dim", " │ "))
|
||||
frags.append(("class:status-bar-yolo", "⚠ YOLO"))
|
||||
frags.append(("class:status-bar", " "))
|
||||
|
||||
total_width = sum(self._status_bar_display_width(text) for _, text in frags)
|
||||
@@ -6216,6 +6199,38 @@ class HermesCLI:
|
||||
else:
|
||||
_cprint(f" ↻ Resumed session {target_id}{title_part} — no messages, starting fresh.")
|
||||
|
||||
def _handle_sessions_command(self, cmd_original: str) -> None:
|
||||
"""Handle /sessions [list|<id_or_title>] — browse or resume previous sessions.
|
||||
|
||||
Without arguments, prints the same recent-sessions table that /resume
|
||||
shows when called without a target, and tells the user how to resume.
|
||||
With an explicit subcommand or target, delegates to the resume flow so
|
||||
``/sessions <id>`` and ``/resume <id>`` behave identically.
|
||||
|
||||
The TUI ships an interactive picker overlay for this command; the
|
||||
classic CLI prints an inline list because there is no equivalent
|
||||
overlay primitive here. Without this handler the canonical name
|
||||
``sessions`` falls through ``process_command``'s elif chain and
|
||||
prints ``Unknown command: sessions`` even though the command is
|
||||
registered in the central COMMAND_REGISTRY.
|
||||
"""
|
||||
parts = cmd_original.split(None, 1)
|
||||
arg = parts[1].strip() if len(parts) > 1 else ""
|
||||
sub = arg.lower()
|
||||
|
||||
# Bare /sessions or /sessions list — show recent sessions inline.
|
||||
if not arg or sub in {"list", "ls", "browse"}:
|
||||
if not self._session_db:
|
||||
from hermes_state import format_session_db_unavailable
|
||||
_cprint(f" {format_session_db_unavailable()}")
|
||||
return
|
||||
if not self._show_recent_sessions(reason="sessions"):
|
||||
_cprint(" (._.) No previous sessions yet.")
|
||||
return
|
||||
|
||||
# /sessions <id_or_title> behaves the same as /resume <id_or_title>.
|
||||
self._handle_resume_command(f"/resume {arg}")
|
||||
|
||||
def _handle_branch_command(self, cmd_original: str) -> None:
|
||||
"""Handle /branch [name] — fork the current session into a new independent copy.
|
||||
|
||||
@@ -7795,6 +7810,8 @@ class HermesCLI:
|
||||
self.new_session(title=title)
|
||||
elif canonical == "resume":
|
||||
self._handle_resume_command(cmd_original)
|
||||
elif canonical == "sessions":
|
||||
self._handle_sessions_command(cmd_original)
|
||||
elif canonical == "model":
|
||||
self._handle_model_switch(cmd_original)
|
||||
elif canonical == "codex-runtime":
|
||||
@@ -11719,11 +11736,13 @@ class HermesCLI:
|
||||
|
||||
# Ensure tirith security scanner is available (downloads if needed).
|
||||
# Warn the user if tirith is enabled in config but not available,
|
||||
# so they know command security scanning is degraded.
|
||||
# so they know command security scanning is degraded. Suppressed
|
||||
# on platforms where tirith ships no binary (Windows etc.) — the
|
||||
# user can't act on it and pattern-matching guards still run.
|
||||
try:
|
||||
from tools.tirith_security import ensure_installed
|
||||
from tools.tirith_security import ensure_installed, is_platform_supported
|
||||
tirith_path = ensure_installed(log_failures=False)
|
||||
if tirith_path is None:
|
||||
if tirith_path is None and is_platform_supported():
|
||||
security_cfg = self.config.get("security", {}) or {}
|
||||
tirith_enabled = security_cfg.get("tirith_enabled", True)
|
||||
if tirith_enabled:
|
||||
@@ -13308,6 +13327,7 @@ class HermesCLI:
|
||||
'status-bar-warn': 'bg:#1a1a2e #FFD700 bold',
|
||||
'status-bar-bad': 'bg:#1a1a2e #FF8C00 bold',
|
||||
'status-bar-critical': 'bg:#1a1a2e #FF6B6B bold',
|
||||
'status-bar-yolo': 'bg:#1a1a2e #FF4444 bold',
|
||||
# Bronze horizontal rules around the input area
|
||||
'input-rule': '#CD7F32',
|
||||
# Clipboard image attachment badges
|
||||
@@ -13464,16 +13484,8 @@ class HermesCLI:
|
||||
# and watch pattern matches) while agent is idle.
|
||||
try:
|
||||
from tools.process_registry import process_registry
|
||||
if not process_registry.completion_queue.empty():
|
||||
evt = process_registry.completion_queue.get_nowait()
|
||||
# Skip if the agent already consumed this via wait/poll/log
|
||||
_evt_sid = evt.get("session_id", "")
|
||||
if evt.get("type") == "completion" and process_registry.is_completion_consumed(_evt_sid):
|
||||
pass # already delivered via tool result
|
||||
else:
|
||||
_synth = _format_process_notification(evt)
|
||||
if _synth:
|
||||
self._pending_input.put(_synth)
|
||||
for _evt, _synth in process_registry.drain_notifications():
|
||||
self._pending_input.put(_synth)
|
||||
except Exception:
|
||||
pass
|
||||
continue
|
||||
@@ -13581,15 +13593,8 @@ class HermesCLI:
|
||||
# that arrived while the agent was running.
|
||||
try:
|
||||
from tools.process_registry import process_registry
|
||||
while not process_registry.completion_queue.empty():
|
||||
evt = process_registry.completion_queue.get_nowait()
|
||||
# Skip if the agent already consumed this via wait/poll/log
|
||||
_evt_sid = evt.get("session_id", "")
|
||||
if evt.get("type") == "completion" and process_registry.is_completion_consumed(_evt_sid):
|
||||
continue # already delivered via tool result
|
||||
_synth = _format_process_notification(evt)
|
||||
if _synth:
|
||||
self._pending_input.put(_synth)
|
||||
for _evt, _synth in process_registry.drain_notifications():
|
||||
self._pending_input.put(_synth)
|
||||
except Exception:
|
||||
pass # Non-fatal — don't break the main loop
|
||||
|
||||
@@ -13721,6 +13726,30 @@ class HermesCLI:
|
||||
self._print_exit_summary()
|
||||
return
|
||||
|
||||
# On macOS with uv-managed Python, kqueue's selector cannot register
|
||||
# fd 0, raising OSError(EINVAL) from kqueue.control() when prompt_toolkit
|
||||
# calls loop.add_reader (#6393). Probe kqueue and, if it can't watch
|
||||
# stdin, switch to a SelectSelector-backed event loop policy.
|
||||
if sys.platform == "darwin":
|
||||
try:
|
||||
import selectors as _selectors
|
||||
if hasattr(_selectors, "KqueueSelector"):
|
||||
_kq = _selectors.KqueueSelector()
|
||||
try:
|
||||
_kq.register(0, _selectors.EVENT_READ)
|
||||
_kq.unregister(0)
|
||||
finally:
|
||||
_kq.close()
|
||||
except (OSError, ValueError, KeyError):
|
||||
import asyncio as _aio_probe
|
||||
import selectors as _selectors
|
||||
|
||||
class _SelectEventLoopPolicy(_aio_probe.DefaultEventLoopPolicy):
|
||||
def new_event_loop(self):
|
||||
return _aio_probe.SelectorEventLoop(_selectors.SelectSelector())
|
||||
|
||||
_aio_probe.set_event_loop_policy(_SelectEventLoopPolicy())
|
||||
|
||||
# Run the application with patch_stdout for proper output handling
|
||||
try:
|
||||
with patch_stdout():
|
||||
@@ -13741,12 +13770,20 @@ class HermesCLI:
|
||||
except (KeyError, OSError) as _stdin_err:
|
||||
# Catch selector registration failures from broken stdin (#6393)
|
||||
# and I/O errors from broken stdout during interrupt (#13710).
|
||||
if isinstance(_stdin_err, OSError) and getattr(_stdin_err, "errno", None) == errno.EIO:
|
||||
_errno = getattr(_stdin_err, "errno", None) if isinstance(_stdin_err, OSError) else None
|
||||
_msg = str(_stdin_err)
|
||||
if _errno == errno.EIO:
|
||||
pass # suppress broken-stdout I/O errors on interrupt (#13710)
|
||||
elif "is not registered" in str(_stdin_err) or "Bad file descriptor" in str(_stdin_err):
|
||||
elif (
|
||||
_errno in (errno.EINVAL, errno.EBADF)
|
||||
or "is not registered" in _msg
|
||||
or "Bad file descriptor" in _msg
|
||||
or "Invalid argument" in _msg
|
||||
):
|
||||
print(
|
||||
f"\nError: stdin is not usable ({_stdin_err}).\n"
|
||||
"This can happen with certain Python installations (e.g. uv-managed cPython on macOS).\n"
|
||||
"This can happen with certain Python installations (e.g. uv-managed cPython on macOS)\n"
|
||||
"where kqueue cannot register fd 0.\n"
|
||||
"Try reinstalling Python via pyenv or Homebrew, then re-run: hermes setup"
|
||||
)
|
||||
else:
|
||||
|
||||
+56
-11
@@ -645,6 +645,44 @@ def get_job(job_id: str) -> Optional[Dict[str, Any]]:
|
||||
return None
|
||||
|
||||
|
||||
class AmbiguousJobReference(LookupError):
|
||||
"""Raised when a job name matches more than one job."""
|
||||
|
||||
def __init__(self, ref: str, matches: List[Dict[str, Any]]):
|
||||
self.ref = ref
|
||||
self.matches = matches
|
||||
ids = ", ".join(m["id"] for m in matches)
|
||||
super().__init__(
|
||||
f"Job name '{ref}' is ambiguous — matches {len(matches)} jobs: {ids}. "
|
||||
f"Use the job ID instead."
|
||||
)
|
||||
|
||||
|
||||
def resolve_job_ref(ref: str) -> Optional[Dict[str, Any]]:
|
||||
"""Resolve a job reference (ID or name) to a job record.
|
||||
|
||||
- Exact ID match wins (works even if a different job's name equals this ID).
|
||||
- Otherwise, case-insensitive name match.
|
||||
- If a name matches more than one job, raises AmbiguousJobReference so the
|
||||
caller can surface the matching IDs rather than silently picking one.
|
||||
"""
|
||||
if not ref:
|
||||
return None
|
||||
jobs = load_jobs()
|
||||
for job in jobs:
|
||||
if job["id"] == ref:
|
||||
return _normalize_job_record(job)
|
||||
ref_lower = ref.lower()
|
||||
name_matches = [j for j in jobs if (j.get("name") or "").lower() == ref_lower]
|
||||
if not name_matches:
|
||||
return None
|
||||
if len(name_matches) > 1:
|
||||
raise AmbiguousJobReference(
|
||||
ref, [_normalize_job_record(j) for j in name_matches]
|
||||
)
|
||||
return _normalize_job_record(name_matches[0])
|
||||
|
||||
|
||||
def list_jobs(include_disabled: bool = False) -> List[Dict[str, Any]]:
|
||||
"""List all jobs, optionally including disabled ones."""
|
||||
jobs = [_normalize_job_record(j) for j in load_jobs()]
|
||||
@@ -702,9 +740,12 @@ def update_job(job_id: str, updates: Dict[str, Any]) -> Optional[Dict[str, Any]]
|
||||
|
||||
|
||||
def pause_job(job_id: str, reason: Optional[str] = None) -> Optional[Dict[str, Any]]:
|
||||
"""Pause a job without deleting it."""
|
||||
"""Pause a job without deleting it. Accepts a job ID or name."""
|
||||
job = resolve_job_ref(job_id)
|
||||
if not job:
|
||||
return None
|
||||
return update_job(
|
||||
job_id,
|
||||
job["id"],
|
||||
{
|
||||
"enabled": False,
|
||||
"state": "paused",
|
||||
@@ -715,14 +756,14 @@ def pause_job(job_id: str, reason: Optional[str] = None) -> Optional[Dict[str, A
|
||||
|
||||
|
||||
def resume_job(job_id: str) -> Optional[Dict[str, Any]]:
|
||||
"""Resume a paused job and compute the next future run from now."""
|
||||
job = get_job(job_id)
|
||||
"""Resume a paused job and compute the next future run from now. Accepts a job ID or name."""
|
||||
job = resolve_job_ref(job_id)
|
||||
if not job:
|
||||
return None
|
||||
|
||||
next_run_at = compute_next_run(job["schedule"])
|
||||
return update_job(
|
||||
job_id,
|
||||
job["id"],
|
||||
{
|
||||
"enabled": True,
|
||||
"state": "scheduled",
|
||||
@@ -734,12 +775,12 @@ def resume_job(job_id: str) -> Optional[Dict[str, Any]]:
|
||||
|
||||
|
||||
def trigger_job(job_id: str) -> Optional[Dict[str, Any]]:
|
||||
"""Schedule a job to run on the next scheduler tick."""
|
||||
job = get_job(job_id)
|
||||
"""Schedule a job to run on the next scheduler tick. Accepts a job ID or name."""
|
||||
job = resolve_job_ref(job_id)
|
||||
if not job:
|
||||
return None
|
||||
return update_job(
|
||||
job_id,
|
||||
job["id"],
|
||||
{
|
||||
"enabled": True,
|
||||
"state": "scheduled",
|
||||
@@ -751,14 +792,18 @@ def trigger_job(job_id: str) -> Optional[Dict[str, Any]]:
|
||||
|
||||
|
||||
def remove_job(job_id: str) -> bool:
|
||||
"""Remove a job by ID."""
|
||||
"""Remove a job by ID or name."""
|
||||
job = resolve_job_ref(job_id)
|
||||
if not job:
|
||||
return False
|
||||
canonical_id = job["id"]
|
||||
jobs = load_jobs()
|
||||
original_len = len(jobs)
|
||||
jobs = [j for j in jobs if j["id"] != job_id]
|
||||
jobs = [j for j in jobs if j["id"] != canonical_id]
|
||||
if len(jobs) < original_len:
|
||||
save_jobs(jobs)
|
||||
# Clean up output directory to prevent orphaned dirs accumulating
|
||||
job_output_dir = OUTPUT_DIR / job_id
|
||||
job_output_dir = OUTPUT_DIR / canonical_id
|
||||
if job_output_dir.exists():
|
||||
shutil.rmtree(job_output_dir)
|
||||
return True
|
||||
|
||||
+25
-14
@@ -464,7 +464,14 @@ def _send_media_via_adapter(
|
||||
else:
|
||||
coro = adapter.send_document(chat_id=chat_id, file_path=media_path, metadata=metadata)
|
||||
|
||||
future = asyncio.run_coroutine_threadsafe(coro, loop)
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
future = safe_schedule_threadsafe(coro, loop)
|
||||
if future is None:
|
||||
logger.warning(
|
||||
"Job '%s': cannot send media %s, gateway loop unavailable",
|
||||
job.get("id", "?"), media_path,
|
||||
)
|
||||
return
|
||||
try:
|
||||
result = future.result(timeout=30)
|
||||
except TimeoutError:
|
||||
@@ -585,22 +592,26 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option
|
||||
text_to_send = cleaned_delivery_content.strip()
|
||||
adapter_ok = True
|
||||
if text_to_send:
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
future = safe_schedule_threadsafe(
|
||||
runtime_adapter.send(chat_id, text_to_send, metadata=send_metadata),
|
||||
loop,
|
||||
)
|
||||
try:
|
||||
send_result = future.result(timeout=60)
|
||||
except TimeoutError:
|
||||
future.cancel()
|
||||
raise
|
||||
if send_result and not getattr(send_result, "success", True):
|
||||
err = getattr(send_result, "error", "unknown")
|
||||
logger.warning(
|
||||
"Job '%s': live adapter send to %s:%s failed (%s), falling back to standalone",
|
||||
job["id"], platform_name, chat_id, err,
|
||||
)
|
||||
adapter_ok = False # fall through to standalone path
|
||||
if future is None:
|
||||
adapter_ok = False
|
||||
else:
|
||||
try:
|
||||
send_result = future.result(timeout=60)
|
||||
except TimeoutError:
|
||||
future.cancel()
|
||||
raise
|
||||
if send_result and not getattr(send_result, "success", True):
|
||||
err = getattr(send_result, "error", "unknown")
|
||||
logger.warning(
|
||||
"Job '%s': live adapter send to %s:%s failed (%s), falling back to standalone",
|
||||
job["id"], platform_name, chat_id, err,
|
||||
)
|
||||
adapter_ok = False # fall through to standalone path
|
||||
|
||||
# Send extracted media files as native attachments via the live adapter
|
||||
if adapter_ok and media_files:
|
||||
|
||||
@@ -1,324 +0,0 @@
|
||||
# Hermes-Agent Atropos Environments
|
||||
|
||||
This directory contains the integration layer between **hermes-agent's** tool-calling capabilities and the **Atropos** RL training framework. It provides everything needed to run agentic LLMs through multi-turn tool-calling loops, score their output with arbitrary reward functions, and feed results into Atropos for training or evaluation.
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
```
|
||||
Atropos Framework
|
||||
┌───────────────────────┐
|
||||
│ BaseEnv │ (atroposlib)
|
||||
│ - Server management │
|
||||
│ - Worker scheduling │
|
||||
│ - Wandb logging │
|
||||
│ - CLI (serve/process/ │
|
||||
│ evaluate) │
|
||||
└───────────┬───────────┘
|
||||
│ inherits
|
||||
┌───────────┴───────────┐
|
||||
│ HermesAgentBaseEnv │ hermes_base_env.py
|
||||
│ - Terminal backend │
|
||||
│ - Tool resolution │
|
||||
│ - Agent loop │
|
||||
│ - ToolContext │
|
||||
│ - Async patches │
|
||||
└───────────┬───────────┘
|
||||
│ inherits
|
||||
┌─────────────────┼─────────────────┐
|
||||
│ │ │
|
||||
TerminalTestEnv HermesSweEnv TerminalBench2EvalEnv
|
||||
(stack testing) (SWE training) (TB2 benchmark eval)
|
||||
```
|
||||
|
||||
### Inheritance Chain
|
||||
|
||||
**BaseEnv** (from `atroposlib`) is the Atropos base class. It provides:
|
||||
- Server management (OpenAI-compatible API servers, VLLM, SGLang)
|
||||
- Worker scheduling for parallel rollouts
|
||||
- Wandb integration for metrics and rollout logging
|
||||
- CLI interface with three subcommands: `serve`, `process`, `evaluate`
|
||||
- `evaluate_log()` for saving eval results to JSON + samples.jsonl
|
||||
|
||||
**HermesAgentBaseEnv** (`hermes_base_env.py`) extends BaseEnv with hermes-agent specifics:
|
||||
- Sets `os.environ["TERMINAL_ENV"]` to configure the terminal backend (local, docker, ssh, singularity, modal, daytona, vercel_sandbox)
|
||||
- Resolves hermes-agent toolsets via `_resolve_tools_for_group()` (calls `get_tool_definitions()` which queries `tools/registry.py`)
|
||||
- Implements `collect_trajectory()` which runs the full agent loop and computes rewards
|
||||
- Supports two-phase operation (Phase 1: OpenAI server, Phase 2: VLLM ManagedServer)
|
||||
- Applies monkey patches for async-safe tool operation at import time
|
||||
|
||||
Concrete environments inherit from `HermesAgentBaseEnv` and implement:
|
||||
- `setup()` -- Load dataset, initialize state
|
||||
- `get_next_item()` -- Return the next item for rollout
|
||||
- `format_prompt()` -- Convert a dataset item into the user message
|
||||
- `compute_reward()` -- Score the rollout using ToolContext
|
||||
- `evaluate()` -- Periodic evaluation logic
|
||||
|
||||
## Core Components
|
||||
|
||||
### Agent Loop (`agent_loop.py`)
|
||||
|
||||
`HermesAgentLoop` is the reusable multi-turn agent engine. It runs the same pattern as hermes-agent's `run_agent.py`:
|
||||
|
||||
1. Send messages + tools to the API via `server.chat_completion()`
|
||||
2. If the response contains `tool_calls`, execute each one via `handle_function_call()` (which delegates to `tools/registry.py`'s `dispatch()`)
|
||||
3. Append tool results to the conversation and go back to step 1
|
||||
4. If the response has no tool_calls, the agent is done
|
||||
|
||||
Tool calls are executed in a thread pool (`run_in_executor`) so backends that use `asyncio.run()` internally (Modal, Docker) don't deadlock inside Atropos's event loop.
|
||||
|
||||
Returns an `AgentResult` containing the full conversation history, turn count, reasoning content per turn, tool errors, and optional ManagedServer state (for Phase 2).
|
||||
|
||||
### Tool Context (`tool_context.py`)
|
||||
|
||||
`ToolContext` is a per-rollout handle that gives reward/verification functions direct access to **all** hermes-agent tools, scoped to the rollout's `task_id`. The same `task_id` means the terminal/browser session is the SAME one the model used during its rollout -- all state (files, processes, browser tabs) is preserved.
|
||||
|
||||
```python
|
||||
async def compute_reward(self, item, result, ctx: ToolContext):
|
||||
# Run tests in the model's terminal sandbox
|
||||
test = ctx.terminal("pytest -v")
|
||||
if test["exit_code"] == 0:
|
||||
return 1.0
|
||||
|
||||
# Check if a file was created
|
||||
content = ctx.read_file("/workspace/solution.py")
|
||||
if content.get("content"):
|
||||
return 0.5
|
||||
|
||||
# Download files locally for verification (binary-safe)
|
||||
ctx.download_file("/remote/output.bin", "/local/output.bin")
|
||||
|
||||
return 0.0
|
||||
```
|
||||
|
||||
Available methods:
|
||||
- **Terminal**: `terminal(command, timeout)` -- run shell commands
|
||||
- **Files**: `read_file(path)`, `write_file(path, content)`, `search(query, path)`
|
||||
- **Transfers**: `upload_file()`, `upload_dir()`, `download_file()`, `download_dir()` -- binary-safe file transfers between host and sandbox
|
||||
- **Web**: `web_search(query)`, `web_extract(urls)`
|
||||
- **Browser**: `browser_navigate(url)`, `browser_snapshot()`
|
||||
- **Generic**: `call_tool(name, args)` -- call any hermes-agent tool by name
|
||||
- **Cleanup**: `cleanup()` -- release all resources (called automatically after `compute_reward`)
|
||||
|
||||
### Patches (`patches.py`)
|
||||
|
||||
**Problem**: Some hermes-agent tools use `asyncio.run()` internally (e.g., the Modal backend). This crashes when called from inside Atropos's event loop because `asyncio.run()` cannot be nested.
|
||||
|
||||
**Solution**: `ModalEnvironment` uses a dedicated `_AsyncWorker` background thread with its own event loop. The calling code sees a sync interface, but internally all async Modal SDK calls happen on the worker thread so they don't conflict with Atropos's loop. This is built directly into `tools/environments/modal.py` — no monkey-patching required.
|
||||
|
||||
`patches.py` is now a no-op (kept for backward compatibility with imports).
|
||||
|
||||
### Tool Call Parsers (`tool_call_parsers/`)
|
||||
|
||||
Client-side parsers that extract structured `tool_calls` from raw model output text. Used in **Phase 2** (VLLM server type) where ManagedServer's `/generate` endpoint returns raw text without tool call parsing.
|
||||
|
||||
Each parser is a standalone reimplementation of the corresponding VLLM parser's `extract_tool_calls()` logic. No VLLM dependency -- only standard library (`re`, `json`, `uuid`) and `openai` types.
|
||||
|
||||
Available parsers:
|
||||
- `hermes` -- Hermes/ChatML `<tool_call>` XML format
|
||||
- `mistral` -- Mistral `[TOOL_CALLS]` format
|
||||
- `llama3_json` -- Llama 3 JSON tool calling
|
||||
- `qwen` -- Qwen tool calling format
|
||||
- `qwen3_coder` -- Qwen3 Coder format
|
||||
- `deepseek_v3` -- DeepSeek V3 format
|
||||
- `deepseek_v3_1` -- DeepSeek V3.1 format
|
||||
- `kimi_k2` -- Kimi K2 format
|
||||
- `longcat` -- Longcat format
|
||||
- `glm45` / `glm47` -- GLM model formats
|
||||
|
||||
Usage:
|
||||
```python
|
||||
from environments.tool_call_parsers import get_parser
|
||||
|
||||
parser = get_parser("hermes")
|
||||
content, tool_calls = parser.parse(raw_model_output)
|
||||
```
|
||||
|
||||
In Phase 1 (OpenAI server type), these parsers are not needed -- the server handles tool call parsing natively.
|
||||
|
||||
## Two-Phase Operation
|
||||
|
||||
### Phase 1: OpenAI Server (Evaluation / SFT Data Generation)
|
||||
|
||||
Uses `server.chat_completion()` with `tools=` parameter. The server (VLLM, SGLang, OpenRouter, OpenAI) handles tool call parsing natively. Returns `ChatCompletion` objects with structured `tool_calls`.
|
||||
|
||||
- Good for: evaluation, SFT data generation, testing
|
||||
- Run with: `serve` (with `run-api`), `process`, or `evaluate` subcommands
|
||||
- Placeholder tokens are created for the Atropos pipeline
|
||||
|
||||
### Phase 2: VLLM ManagedServer (Full RL Training)
|
||||
|
||||
Uses ManagedServer for exact token IDs + logprobs via `/generate`. Client-side tool call parser (from `tool_call_parsers/`) reconstructs structured `tool_calls` from raw output.
|
||||
|
||||
- Good for: full RL training with GRPO/PPO
|
||||
- Run with: `serve` subcommand
|
||||
- Real tokens, masks, and logprobs flow through the pipeline
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
environments/
|
||||
├── README.md # This file
|
||||
├── __init__.py # Package exports
|
||||
├── hermes_base_env.py # Abstract base (HermesAgentBaseEnv)
|
||||
├── agent_loop.py # Multi-turn agent engine (HermesAgentLoop)
|
||||
├── tool_context.py # Per-rollout tool access for reward functions
|
||||
├── patches.py # Async-safety patches for Modal backend
|
||||
│
|
||||
├── tool_call_parsers/ # Phase 2 client-side parsers
|
||||
│ ├── __init__.py # Registry + base class
|
||||
│ ├── hermes_parser.py
|
||||
│ ├── mistral_parser.py
|
||||
│ ├── llama_parser.py
|
||||
│ ├── qwen_parser.py
|
||||
│ ├── qwen3_coder_parser.py
|
||||
│ ├── deepseek_v3_parser.py
|
||||
│ ├── deepseek_v3_1_parser.py
|
||||
│ ├── kimi_k2_parser.py
|
||||
│ ├── longcat_parser.py
|
||||
│ ├── glm45_parser.py
|
||||
│ └── glm47_parser.py
|
||||
│
|
||||
├── terminal_test_env/ # Stack validation environment
|
||||
│ └── terminal_test_env.py
|
||||
│
|
||||
├── hermes_swe_env/ # SWE-bench style training environment
|
||||
│ └── hermes_swe_env.py
|
||||
│
|
||||
└── benchmarks/ # Evaluation benchmarks
|
||||
├── terminalbench_2/ # 89 terminal tasks, Modal sandboxes
|
||||
│ └── terminalbench2_env.py
|
||||
├── tblite/ # 100 calibrated tasks (fast TB2 proxy)
|
||||
│ └── tblite_env.py
|
||||
└── yc_bench/ # Long-horizon strategic benchmark
|
||||
└── yc_bench_env.py
|
||||
```
|
||||
|
||||
## Concrete Environments
|
||||
|
||||
### TerminalTestEnv (`terminal_test_env/`)
|
||||
|
||||
A self-contained environment with inline tasks (no external dataset needed) for validating the full stack end-to-end. Each task asks the model to create a file at a known path, and the verifier checks the content matches.
|
||||
|
||||
```bash
|
||||
# Serve mode (needs run-api)
|
||||
run-api
|
||||
python environments/terminal_test_env/terminal_test_env.py serve
|
||||
|
||||
# Process mode (no run-api, saves to JSONL)
|
||||
python environments/terminal_test_env/terminal_test_env.py process \
|
||||
--env.data_path_to_save_groups terminal_test_output.jsonl
|
||||
```
|
||||
|
||||
### HermesSweEnv (`hermes_swe_env/`)
|
||||
|
||||
SWE-bench style training environment. The model gets a coding task, uses terminal + file + web tools to solve it, and the reward function runs tests in the same Modal sandbox.
|
||||
|
||||
```bash
|
||||
python environments/hermes_swe_env/hermes_swe_env.py serve \
|
||||
--openai.model_name YourModel \
|
||||
--env.dataset_name bigcode/humanevalpack \
|
||||
--env.terminal_backend modal
|
||||
```
|
||||
|
||||
### TerminalBench2EvalEnv (`benchmarks/terminalbench_2/`)
|
||||
|
||||
**Eval-only** environment for the Terminal-Bench 2.0 benchmark (89 tasks). Each task gets a pre-built Docker Hub image, a natural language instruction, and a test suite. The agent uses terminal + file tools to solve the task, then the test suite verifies correctness.
|
||||
|
||||
Follows the standard Atropos eval pattern (like GPQA, MMLU, etc.):
|
||||
- Run via `evaluate` subcommand (no `run-api` needed)
|
||||
- `setup()` loads the dataset, `evaluate()` runs all tasks
|
||||
- `rollout_and_score_eval()` handles per-task agent loop + test verification
|
||||
- Downloads verifier output locally for reliable reward checking (Harbor pattern)
|
||||
|
||||
```bash
|
||||
# Run full benchmark
|
||||
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||
--openai.model_name anthropic/claude-opus-4.6
|
||||
|
||||
# Run subset of tasks
|
||||
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||
--openai.model_name anthropic/claude-opus-4.6 \
|
||||
--env.task_filter fix-git,git-multibranch
|
||||
|
||||
# Skip specific tasks
|
||||
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||
--openai.model_name anthropic/claude-opus-4.6 \
|
||||
--env.skip_tasks heavy-task,slow-task
|
||||
```
|
||||
|
||||
## Creating a New Environment
|
||||
|
||||
### Training Environment
|
||||
|
||||
1. Create a new directory under `environments/`
|
||||
2. Create your env file inheriting from `HermesAgentBaseEnv`
|
||||
3. Implement the four abstract methods + `evaluate()`
|
||||
|
||||
```python
|
||||
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||
|
||||
class MyEnvConfig(HermesAgentEnvConfig):
|
||||
pass # Add custom fields as needed
|
||||
|
||||
class MyEnv(HermesAgentBaseEnv):
|
||||
name = "my-env"
|
||||
env_config_cls = MyEnvConfig
|
||||
|
||||
@classmethod
|
||||
def config_init(cls):
|
||||
env_config = MyEnvConfig(
|
||||
enabled_toolsets=["terminal", "file"],
|
||||
terminal_backend="modal",
|
||||
# ... other config
|
||||
)
|
||||
server_configs = [APIServerConfig(...)]
|
||||
return env_config, server_configs
|
||||
|
||||
async def setup(self):
|
||||
self.dataset = load_dataset(...)
|
||||
self.iter = 0
|
||||
|
||||
async def get_next_item(self):
|
||||
item = self.dataset[self.iter % len(self.dataset)]
|
||||
self.iter += 1
|
||||
return item
|
||||
|
||||
def format_prompt(self, item):
|
||||
return item["instruction"]
|
||||
|
||||
async def compute_reward(self, item, result, ctx):
|
||||
# ctx gives you full tool access to the rollout's sandbox
|
||||
test = ctx.terminal("pytest -v")
|
||||
return 1.0 if test["exit_code"] == 0 else 0.0
|
||||
|
||||
async def evaluate(self, *args, **kwargs):
|
||||
# Periodic evaluation logic
|
||||
...
|
||||
|
||||
if __name__ == "__main__":
|
||||
MyEnv.cli()
|
||||
```
|
||||
|
||||
### Eval-Only Environment (Benchmark)
|
||||
|
||||
For eval benchmarks, follow the pattern in `terminalbench2_env.py`:
|
||||
1. Create under `environments/benchmarks/your-benchmark/`
|
||||
2. Inherit from `HermesAgentBaseEnv`
|
||||
3. Set eval-only config: `eval_handling=STOP_TRAIN`, `steps_per_eval=1`, `total_steps=1`
|
||||
4. Stub the training methods (`collect_trajectories`, `score`)
|
||||
5. Implement `rollout_and_score_eval()` and `evaluate()`
|
||||
6. Run with `evaluate` subcommand
|
||||
|
||||
## Key Config Fields
|
||||
|
||||
| Field | Description | Default |
|
||||
|-------|-------------|---------|
|
||||
| `enabled_toolsets` | Which hermes toolsets to enable | `None` (all) |
|
||||
| `disabled_toolsets` | Toolsets to disable | `None` |
|
||||
| `distribution` | Probabilistic toolset distribution name | `None` |
|
||||
| `max_agent_turns` | Max LLM calls per rollout | `30` |
|
||||
| `agent_temperature` | Sampling temperature | `1.0` |
|
||||
| `terminal_backend` | `local`, `docker`, `modal`, `daytona`, `ssh`, `singularity` | `local` |
|
||||
| `system_prompt` | System message for the agent | `None` |
|
||||
| `tool_call_parser` | Parser name for Phase 2 | `hermes` |
|
||||
| `eval_handling` | `STOP_TRAIN`, `LIMIT_TRAIN`, `NONE` | `STOP_TRAIN` |
|
||||
@@ -1,36 +0,0 @@
|
||||
"""
|
||||
Hermes-Agent Atropos Environments
|
||||
|
||||
Provides a layered integration between hermes-agent's tool-calling capabilities
|
||||
and the Atropos RL training framework.
|
||||
|
||||
Core layers:
|
||||
- agent_loop: Reusable multi-turn agent loop with standard OpenAI-spec tool calling
|
||||
- tool_context: Per-rollout tool access handle for reward/verification functions
|
||||
- hermes_base_env: Abstract base environment (BaseEnv subclass) for Atropos
|
||||
- tool_call_parsers: Client-side tool call parser registry for Phase 2 (VLLM /generate)
|
||||
|
||||
Concrete environments:
|
||||
- terminal_test_env/: Simple file-creation tasks for testing the stack
|
||||
- hermes_swe_env/: SWE-bench style tasks with Modal sandboxes
|
||||
|
||||
Benchmarks (eval-only):
|
||||
- benchmarks/terminalbench_2/: Terminal-Bench 2.0 evaluation
|
||||
"""
|
||||
|
||||
try:
|
||||
from environments.agent_loop import AgentResult, HermesAgentLoop
|
||||
from environments.tool_context import ToolContext
|
||||
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||
except ImportError:
|
||||
# atroposlib not installed — environments are unavailable but
|
||||
# submodules like tool_call_parsers can still be imported directly.
|
||||
pass
|
||||
|
||||
__all__ = [
|
||||
"AgentResult",
|
||||
"HermesAgentLoop",
|
||||
"ToolContext",
|
||||
"HermesAgentBaseEnv",
|
||||
"HermesAgentEnvConfig",
|
||||
]
|
||||
@@ -1,534 +0,0 @@
|
||||
"""
|
||||
HermesAgentLoop -- Reusable Multi-Turn Agent Engine
|
||||
|
||||
Runs the hermes-agent tool-calling loop using standard OpenAI-spec tool calling.
|
||||
Works with any server that returns ChatCompletion objects with tool_calls:
|
||||
- Phase 1: OpenAI server type (VLLM, SGLang, OpenRouter, OpenAI API)
|
||||
- Phase 2: ManagedServer with client-side tool call parser
|
||||
|
||||
The loop passes tools= and checks response.choices[0].message.tool_calls,
|
||||
identical to hermes-agent's run_agent.py. Tool execution is dispatched via
|
||||
handle_function_call() from model_tools.py.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional, Set
|
||||
|
||||
from model_tools import handle_function_call
|
||||
from tools.terminal_tool import get_active_env
|
||||
from tools.tool_result_storage import maybe_persist_tool_result, enforce_turn_budget
|
||||
|
||||
# Thread pool for running sync tool calls that internally use asyncio.run()
|
||||
# (e.g., the Modal/Docker/Daytona terminal backends). Running them in a separate
|
||||
# thread gives them a clean event loop so they don't deadlock inside Atropos's loop.
|
||||
# Size must be large enough for concurrent eval tasks (e.g., 89 TB2 tasks all
|
||||
# making tool calls). Too small = thread pool starvation, tasks queue for minutes.
|
||||
# Resized at runtime by HermesAgentBaseEnv.__init__ via resize_tool_pool().
|
||||
_tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=128)
|
||||
|
||||
|
||||
def resize_tool_pool(max_workers: int):
|
||||
"""
|
||||
Replace the global tool executor with a new one of the given size.
|
||||
|
||||
Called by HermesAgentBaseEnv.__init__ based on config.tool_pool_size.
|
||||
Safe to call before any tasks are submitted.
|
||||
"""
|
||||
global _tool_executor
|
||||
old_executor = _tool_executor
|
||||
_tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
|
||||
old_executor.shutdown(wait=False)
|
||||
logger.info("Tool thread pool resized to %d workers", max_workers)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ToolError:
|
||||
"""Record of a tool execution error during the agent loop."""
|
||||
|
||||
turn: int # Which turn the error occurred on
|
||||
tool_name: str # Which tool was called
|
||||
arguments: str # The arguments passed (truncated)
|
||||
error: str # The error message
|
||||
tool_result: str # The raw result returned to the model
|
||||
|
||||
|
||||
@dataclass
|
||||
class AgentResult:
|
||||
"""Result of running the agent loop."""
|
||||
|
||||
# Full conversation history in OpenAI message format
|
||||
messages: List[Dict[str, Any]]
|
||||
# ManagedServer.get_state() if available (Phase 2), None otherwise
|
||||
managed_state: Optional[Dict[str, Any]] = None
|
||||
# How many LLM calls were made
|
||||
turns_used: int = 0
|
||||
# True if model stopped calling tools naturally (vs hitting max_turns)
|
||||
finished_naturally: bool = False
|
||||
# Extracted reasoning content per turn (from PR #297 helpers)
|
||||
reasoning_per_turn: List[Optional[str]] = field(default_factory=list)
|
||||
# Tool errors encountered during the loop
|
||||
tool_errors: List[ToolError] = field(default_factory=list)
|
||||
|
||||
|
||||
def _extract_reasoning_from_message(message) -> Optional[str]:
|
||||
"""
|
||||
Extract reasoning content from a ChatCompletion message.
|
||||
|
||||
Handles multiple provider formats:
|
||||
1. message.reasoning_content field (some providers)
|
||||
2. message.reasoning field (some providers)
|
||||
3. message.reasoning_details[].text (OpenRouter style)
|
||||
|
||||
Note: <think> block extraction from content is NOT done here -- that's
|
||||
handled by the response already in Phase 1 (server does it) or by
|
||||
ManagedServer's patch in Phase 2.
|
||||
|
||||
Args:
|
||||
message: The assistant message from ChatCompletion response
|
||||
|
||||
Returns:
|
||||
Extracted reasoning text, or None if not found
|
||||
"""
|
||||
# Check reasoning_content field (common across providers)
|
||||
if hasattr(message, "reasoning_content") and message.reasoning_content:
|
||||
return message.reasoning_content
|
||||
|
||||
# Check reasoning field
|
||||
if hasattr(message, "reasoning") and message.reasoning:
|
||||
return message.reasoning
|
||||
|
||||
# Check reasoning_details (OpenRouter style)
|
||||
if hasattr(message, "reasoning_details") and message.reasoning_details:
|
||||
for detail in message.reasoning_details:
|
||||
if hasattr(detail, "text") and detail.text:
|
||||
return detail.text
|
||||
if isinstance(detail, dict) and detail.get("text"):
|
||||
return detail["text"]
|
||||
|
||||
return None
|
||||
|
||||
|
||||
class HermesAgentLoop:
|
||||
"""
|
||||
Runs hermes-agent's tool-calling loop using standard OpenAI-spec tool calling.
|
||||
|
||||
Same pattern as run_agent.py:
|
||||
- Pass tools= to the API
|
||||
- Check response.choices[0].message.tool_calls
|
||||
- Dispatch via handle_function_call()
|
||||
|
||||
Works identically with any server type -- OpenAI, VLLM, SGLang, OpenRouter,
|
||||
or ManagedServer with a parser. The server determines how tool_calls get
|
||||
populated on the response.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
server,
|
||||
tool_schemas: List[Dict[str, Any]],
|
||||
valid_tool_names: Set[str],
|
||||
max_turns: int = 30,
|
||||
task_id: Optional[str] = None,
|
||||
temperature: float = 1.0,
|
||||
max_tokens: Optional[int] = None,
|
||||
extra_body: Optional[Dict[str, Any]] = None,
|
||||
budget_config: Optional["BudgetConfig"] = None,
|
||||
):
|
||||
"""
|
||||
Initialize the agent loop.
|
||||
|
||||
Args:
|
||||
server: Server object with chat_completion() method (OpenAIServer,
|
||||
ManagedServer, ServerManager, etc.)
|
||||
tool_schemas: OpenAI-format tool definitions from get_tool_definitions()
|
||||
valid_tool_names: Set of tool names the model is allowed to call
|
||||
max_turns: Maximum number of LLM calls before stopping
|
||||
task_id: Unique ID for terminal/browser session isolation
|
||||
temperature: Sampling temperature for generation
|
||||
max_tokens: Max tokens per generation (None for server default)
|
||||
extra_body: Extra parameters passed to the OpenAI client's create() call.
|
||||
Used for OpenRouter provider preferences, transforms, etc.
|
||||
e.g. {"provider": {"ignore": ["DeepInfra"]}}
|
||||
budget_config: Tool result persistence budget. Controls per-tool
|
||||
thresholds, per-turn aggregate budget, and preview size.
|
||||
If None, uses DEFAULT_BUDGET (current hardcoded values).
|
||||
"""
|
||||
from tools.budget_config import DEFAULT_BUDGET
|
||||
self.server = server
|
||||
self.tool_schemas = tool_schemas
|
||||
self.valid_tool_names = valid_tool_names
|
||||
self.max_turns = max_turns
|
||||
self.task_id = task_id or str(uuid.uuid4())
|
||||
self.temperature = temperature
|
||||
self.max_tokens = max_tokens
|
||||
self.extra_body = extra_body
|
||||
self.budget_config = budget_config or DEFAULT_BUDGET
|
||||
|
||||
async def run(self, messages: List[Dict[str, Any]]) -> AgentResult:
|
||||
"""
|
||||
Execute the full agent loop using standard OpenAI tool calling.
|
||||
|
||||
Args:
|
||||
messages: Initial conversation messages (system + user).
|
||||
Modified in-place as the conversation progresses.
|
||||
|
||||
Returns:
|
||||
AgentResult with full conversation history, managed state, and metadata
|
||||
"""
|
||||
reasoning_per_turn = []
|
||||
tool_errors: List[ToolError] = []
|
||||
|
||||
# Per-loop TodoStore for the todo tool (ephemeral, dies with the loop)
|
||||
from tools.todo_tool import TodoStore, todo_tool as _todo_tool
|
||||
_todo_store = TodoStore()
|
||||
|
||||
# Extract user task from first user message for browser_snapshot context
|
||||
_user_task = None
|
||||
for msg in messages:
|
||||
if msg.get("role") == "user":
|
||||
content = msg.get("content", "")
|
||||
if isinstance(content, str) and content.strip():
|
||||
_user_task = content.strip()[:500] # Cap to avoid huge strings
|
||||
break
|
||||
|
||||
import time as _time
|
||||
|
||||
for turn in range(self.max_turns):
|
||||
turn_start = _time.monotonic()
|
||||
|
||||
# Build the chat_completion kwargs
|
||||
chat_kwargs = {
|
||||
"messages": messages,
|
||||
"n": 1,
|
||||
"temperature": self.temperature,
|
||||
}
|
||||
|
||||
# Only pass tools if we have them
|
||||
if self.tool_schemas:
|
||||
chat_kwargs["tools"] = self.tool_schemas
|
||||
|
||||
# Only pass max_tokens if explicitly set
|
||||
if self.max_tokens is not None:
|
||||
chat_kwargs["max_tokens"] = self.max_tokens
|
||||
|
||||
# Inject extra_body for provider-specific params (e.g., OpenRouter
|
||||
# provider preferences like banned/preferred providers, transforms)
|
||||
if self.extra_body:
|
||||
chat_kwargs["extra_body"] = self.extra_body
|
||||
|
||||
# Make the API call -- standard OpenAI spec
|
||||
api_start = _time.monotonic()
|
||||
try:
|
||||
response = await self.server.chat_completion(**chat_kwargs)
|
||||
except Exception as e:
|
||||
api_elapsed = _time.monotonic() - api_start
|
||||
logger.error("API call failed on turn %d (%.1fs): %s", turn + 1, api_elapsed, e)
|
||||
return AgentResult(
|
||||
messages=messages,
|
||||
managed_state=self._get_managed_state(),
|
||||
turns_used=turn + 1,
|
||||
finished_naturally=False,
|
||||
reasoning_per_turn=reasoning_per_turn,
|
||||
tool_errors=tool_errors,
|
||||
)
|
||||
|
||||
api_elapsed = _time.monotonic() - api_start
|
||||
|
||||
if not response or not response.choices:
|
||||
logger.warning("Empty response on turn %d (api=%.1fs)", turn + 1, api_elapsed)
|
||||
return AgentResult(
|
||||
messages=messages,
|
||||
managed_state=self._get_managed_state(),
|
||||
turns_used=turn + 1,
|
||||
finished_naturally=False,
|
||||
reasoning_per_turn=reasoning_per_turn,
|
||||
tool_errors=tool_errors,
|
||||
)
|
||||
|
||||
assistant_msg = response.choices[0].message
|
||||
|
||||
# Extract reasoning content from the response (all provider formats)
|
||||
reasoning = _extract_reasoning_from_message(assistant_msg)
|
||||
reasoning_per_turn.append(reasoning)
|
||||
|
||||
# Check for tool calls -- standard OpenAI spec.
|
||||
# Fallback: if response has no structured tool_calls but content
|
||||
# contains raw tool call tags (e.g. <tool_call>), parse them using
|
||||
# hermes-agent's standalone parsers. This handles the case where
|
||||
# ManagedServer's ToolCallTranslator couldn't parse because vLLM
|
||||
# isn't installed.
|
||||
if (
|
||||
not assistant_msg.tool_calls
|
||||
and assistant_msg.content
|
||||
and self.tool_schemas
|
||||
and "<tool_call>" in (assistant_msg.content or "")
|
||||
):
|
||||
try:
|
||||
from environments.tool_call_parsers import get_parser
|
||||
fallback_parser = get_parser("hermes")
|
||||
parsed_content, parsed_calls = fallback_parser.parse(
|
||||
assistant_msg.content
|
||||
)
|
||||
if parsed_calls:
|
||||
assistant_msg.tool_calls = parsed_calls
|
||||
if parsed_content is not None:
|
||||
assistant_msg.content = parsed_content
|
||||
logger.debug(
|
||||
"Fallback parser extracted %d tool calls from raw content",
|
||||
len(parsed_calls),
|
||||
)
|
||||
except Exception:
|
||||
pass # Fall through to no tool calls
|
||||
|
||||
if assistant_msg.tool_calls:
|
||||
# Normalize tool calls to dicts — they may come as objects
|
||||
# (OpenAI API) or dicts (vLLM ToolCallTranslator).
|
||||
def _tc_to_dict(tc):
|
||||
if isinstance(tc, dict):
|
||||
return {
|
||||
"id": tc.get("id", f"call_{uuid.uuid4().hex[:8]}"),
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": tc.get("function", {}).get("name", tc.get("name", "")),
|
||||
"arguments": tc.get("function", {}).get("arguments", tc.get("arguments", "{}")),
|
||||
},
|
||||
}
|
||||
return {
|
||||
"id": tc.id,
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": tc.function.name,
|
||||
"arguments": tc.function.arguments,
|
||||
},
|
||||
}
|
||||
|
||||
# Build the assistant message dict for conversation history
|
||||
msg_dict: Dict[str, Any] = {
|
||||
"role": "assistant",
|
||||
"content": assistant_msg.content or "",
|
||||
"tool_calls": [_tc_to_dict(tc) for tc in assistant_msg.tool_calls],
|
||||
}
|
||||
|
||||
# Preserve reasoning_content for multi-turn chat template handling
|
||||
# (e.g., Kimi-K2's template renders <think> blocks differently
|
||||
# for history vs. the latest turn based on this field)
|
||||
if reasoning:
|
||||
msg_dict["reasoning_content"] = reasoning
|
||||
|
||||
messages.append(msg_dict)
|
||||
|
||||
# Execute each tool call via hermes-agent's dispatch
|
||||
for tc in assistant_msg.tool_calls:
|
||||
# Handle both object (OpenAI) and dict (vLLM) formats
|
||||
if isinstance(tc, dict):
|
||||
tool_name = tc.get("function", {}).get("name", tc.get("name", ""))
|
||||
tool_args_raw = tc.get("function", {}).get("arguments", tc.get("arguments", "{}"))
|
||||
else:
|
||||
tool_name = tc.function.name
|
||||
tool_args_raw = tc.function.arguments
|
||||
|
||||
# Validate tool name
|
||||
if tool_name not in self.valid_tool_names:
|
||||
tool_result = json.dumps(
|
||||
{
|
||||
"error": f"Unknown tool '{tool_name}'. "
|
||||
f"Available tools: {sorted(self.valid_tool_names)}"
|
||||
}
|
||||
)
|
||||
tool_errors.append(ToolError(
|
||||
turn=turn + 1, tool_name=tool_name,
|
||||
arguments=tool_args_raw[:200],
|
||||
error=f"Unknown tool '{tool_name}'",
|
||||
tool_result=tool_result,
|
||||
))
|
||||
logger.warning(
|
||||
"Model called unknown tool '%s' on turn %d",
|
||||
tool_name, turn + 1,
|
||||
)
|
||||
else:
|
||||
# Parse arguments
|
||||
try:
|
||||
args = json.loads(tool_args_raw)
|
||||
except json.JSONDecodeError as e:
|
||||
args = None
|
||||
tool_result = json.dumps(
|
||||
{"error": f"Invalid JSON in tool arguments: {e}. Please retry with valid JSON."}
|
||||
)
|
||||
tool_errors.append(ToolError(
|
||||
turn=turn + 1, tool_name=tool_name,
|
||||
arguments=tool_args_raw[:200],
|
||||
error=f"Invalid JSON: {e}",
|
||||
tool_result=tool_result,
|
||||
))
|
||||
logger.warning(
|
||||
"Invalid JSON in tool call arguments for '%s': %s",
|
||||
tool_name, tool_args_raw[:200],
|
||||
)
|
||||
|
||||
# Dispatch tool only if arguments parsed successfully
|
||||
if args is not None:
|
||||
try:
|
||||
if tool_name == "terminal":
|
||||
backend = os.getenv("TERMINAL_ENV", "local")
|
||||
cmd_preview = args.get("command", "")[:80]
|
||||
logger.info(
|
||||
"[%s] $ %s", self.task_id[:8], cmd_preview,
|
||||
)
|
||||
|
||||
tool_submit_time = _time.monotonic()
|
||||
|
||||
# Todo tool -- handle locally (needs per-loop TodoStore)
|
||||
if tool_name == "todo":
|
||||
tool_result = _todo_tool(
|
||||
todos=args.get("todos"),
|
||||
merge=args.get("merge", False),
|
||||
store=_todo_store,
|
||||
)
|
||||
tool_elapsed = _time.monotonic() - tool_submit_time
|
||||
elif tool_name == "memory":
|
||||
tool_result = json.dumps({"error": "Memory is not available in RL environments."})
|
||||
tool_elapsed = _time.monotonic() - tool_submit_time
|
||||
elif tool_name == "session_search":
|
||||
tool_result = json.dumps({"error": "Session search is not available in RL environments."})
|
||||
tool_elapsed = _time.monotonic() - tool_submit_time
|
||||
else:
|
||||
# Run tool calls in a thread pool so backends that
|
||||
# use asyncio.run() internally (modal, docker, daytona) get
|
||||
# a clean event loop instead of deadlocking.
|
||||
loop = asyncio.get_running_loop()
|
||||
# Capture current tool_name/args for the lambda
|
||||
_tn, _ta, _tid = tool_name, args, self.task_id
|
||||
tool_result = await loop.run_in_executor(
|
||||
_tool_executor,
|
||||
lambda: handle_function_call(
|
||||
_tn, _ta, task_id=_tid,
|
||||
user_task=_user_task,
|
||||
),
|
||||
)
|
||||
tool_elapsed = _time.monotonic() - tool_submit_time
|
||||
|
||||
# Log slow tools and thread pool stats for debugging
|
||||
pool_active = _tool_executor._work_queue.qsize()
|
||||
if tool_elapsed > 30:
|
||||
logger.warning(
|
||||
"[%s] turn %d: %s took %.1fs (pool queue=%d)",
|
||||
self.task_id[:8], turn + 1, tool_name,
|
||||
tool_elapsed, pool_active,
|
||||
)
|
||||
except Exception as e:
|
||||
tool_result = json.dumps(
|
||||
{"error": f"Tool execution failed: {type(e).__name__}: {str(e)}"}
|
||||
)
|
||||
tool_errors.append(ToolError(
|
||||
turn=turn + 1, tool_name=tool_name,
|
||||
arguments=tool_args_raw[:200],
|
||||
error=f"{type(e).__name__}: {str(e)}",
|
||||
tool_result=tool_result,
|
||||
))
|
||||
logger.error(
|
||||
"Tool '%s' execution failed on turn %d: %s",
|
||||
tool_name, turn + 1, e,
|
||||
)
|
||||
|
||||
# Also check if the tool returned an error in its JSON result
|
||||
try:
|
||||
result_data = json.loads(tool_result)
|
||||
if isinstance(result_data, dict):
|
||||
err = result_data.get("error")
|
||||
exit_code = result_data.get("exit_code")
|
||||
if err and exit_code and exit_code < 0:
|
||||
tool_errors.append(ToolError(
|
||||
turn=turn + 1, tool_name=tool_name,
|
||||
arguments=tool_args_raw[:200],
|
||||
error=str(err),
|
||||
tool_result=tool_result[:500],
|
||||
))
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
pass
|
||||
|
||||
tc_id = tc.get("id", "") if isinstance(tc, dict) else tc.id
|
||||
tool_result = maybe_persist_tool_result(
|
||||
content=tool_result,
|
||||
tool_name=tool_name,
|
||||
tool_use_id=tc_id,
|
||||
env=get_active_env(self.task_id),
|
||||
config=self.budget_config,
|
||||
)
|
||||
|
||||
messages.append(
|
||||
{
|
||||
"role": "tool",
|
||||
"tool_call_id": tc_id,
|
||||
"content": tool_result,
|
||||
}
|
||||
)
|
||||
|
||||
num_tcs = len(assistant_msg.tool_calls)
|
||||
if num_tcs > 0:
|
||||
enforce_turn_budget(
|
||||
messages[-num_tcs:],
|
||||
env=get_active_env(self.task_id),
|
||||
config=self.budget_config,
|
||||
)
|
||||
|
||||
turn_elapsed = _time.monotonic() - turn_start
|
||||
logger.info(
|
||||
"[%s] turn %d: api=%.1fs, %d tools, turn_total=%.1fs",
|
||||
self.task_id[:8], turn + 1, api_elapsed,
|
||||
len(assistant_msg.tool_calls), turn_elapsed,
|
||||
)
|
||||
|
||||
else:
|
||||
# No tool calls -- model is done
|
||||
msg_dict = {
|
||||
"role": "assistant",
|
||||
"content": assistant_msg.content or "",
|
||||
}
|
||||
if reasoning:
|
||||
msg_dict["reasoning_content"] = reasoning
|
||||
messages.append(msg_dict)
|
||||
|
||||
turn_elapsed = _time.monotonic() - turn_start
|
||||
logger.info(
|
||||
"[%s] turn %d: api=%.1fs, no tools (finished), turn_total=%.1fs",
|
||||
self.task_id[:8], turn + 1, api_elapsed, turn_elapsed,
|
||||
)
|
||||
|
||||
return AgentResult(
|
||||
messages=messages,
|
||||
managed_state=self._get_managed_state(),
|
||||
turns_used=turn + 1,
|
||||
finished_naturally=True,
|
||||
reasoning_per_turn=reasoning_per_turn,
|
||||
tool_errors=tool_errors,
|
||||
)
|
||||
|
||||
# Hit max turns without the model stopping
|
||||
logger.info("Agent hit max_turns (%d) without finishing", self.max_turns)
|
||||
return AgentResult(
|
||||
messages=messages,
|
||||
managed_state=self._get_managed_state(),
|
||||
turns_used=self.max_turns,
|
||||
finished_naturally=False,
|
||||
reasoning_per_turn=reasoning_per_turn,
|
||||
tool_errors=tool_errors,
|
||||
)
|
||||
|
||||
def _get_managed_state(self) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Get ManagedServer state if the server supports it.
|
||||
|
||||
Returns state dict with SequenceNodes containing tokens/logprobs/masks,
|
||||
or None if the server doesn't support get_state() (e.g., regular OpenAI server).
|
||||
"""
|
||||
if hasattr(self.server, "get_state"):
|
||||
return self.server.get_state()
|
||||
return None
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,73 +0,0 @@
|
||||
# OpenThoughts-TBLite Evaluation Environment
|
||||
|
||||
This environment evaluates terminal agents on the [OpenThoughts-TBLite](https://huggingface.co/datasets/open-thoughts/OpenThoughts-TBLite) benchmark, a difficulty-calibrated subset of [Terminal-Bench 2.0](https://www.tbench.ai/leaderboard/terminal-bench/2.0).
|
||||
|
||||
## Source
|
||||
|
||||
OpenThoughts-TBLite was created by the [OpenThoughts](https://www.openthoughts.ai/) Agent team in collaboration with [Snorkel AI](https://snorkel.ai/) and [Bespoke Labs](https://bespokelabs.ai/). The original dataset and documentation live at:
|
||||
|
||||
- **Dataset (source):** [open-thoughts/OpenThoughts-TBLite](https://huggingface.co/datasets/open-thoughts/OpenThoughts-TBLite)
|
||||
- **GitHub:** [open-thoughts/OpenThoughts-TBLite](https://github.com/open-thoughts/OpenThoughts-TBLite)
|
||||
- **Blog post:** [openthoughts.ai/blog/openthoughts-tblite](https://www.openthoughts.ai/blog/openthoughts-tblite)
|
||||
|
||||
## Our Dataset
|
||||
|
||||
We converted the source into the same schema used by our Terminal-Bench 2.0 environment (pre-built Docker Hub images, base64-encoded test tarballs, etc.) and published it as:
|
||||
|
||||
- **Dataset (ours):** [NousResearch/openthoughts-tblite](https://huggingface.co/datasets/NousResearch/openthoughts-tblite)
|
||||
- **Docker images:** `nousresearch/tblite-<task-name>:latest` on Docker Hub (100 images)
|
||||
|
||||
The conversion script is at `scripts/prepare_tblite_dataset.py`.
|
||||
|
||||
## Why TBLite?
|
||||
|
||||
Terminal-Bench 2.0 is one of the strongest frontier evaluations for terminal agents, but when a model scores near the floor (e.g., Qwen 3 8B at <1%), many changes look identical in aggregate score. TBLite addresses this by calibrating task difficulty using Claude Haiku 4.5 as a reference:
|
||||
|
||||
| Difficulty | Pass Rate Range | Tasks |
|
||||
|------------|----------------|-------|
|
||||
| Easy | >= 70% | 40 |
|
||||
| Medium | 40-69% | 26 |
|
||||
| Hard | 10-39% | 26 |
|
||||
| Extreme | < 10% | 8 |
|
||||
|
||||
This gives enough solvable tasks to detect small improvements quickly, while preserving enough hard tasks to avoid saturation. The correlation between TBLite and TB2 scores is **r = 0.911**.
|
||||
|
||||
TBLite also runs 2.6-8x faster than the full TB2, making it practical for iteration loops.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Run the full benchmark
|
||||
python environments/benchmarks/tblite/tblite_env.py evaluate
|
||||
|
||||
# Filter to specific tasks
|
||||
python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
--env.task_filter "broken-python,pandas-etl"
|
||||
|
||||
# Use a different model
|
||||
python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
--server.model_name "qwen/qwen3-30b"
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
`TBLiteEvalEnv` is a thin subclass of `TerminalBench2EvalEnv`. All evaluation logic (agent loop, Docker sandbox management, test verification, metrics) is inherited. Only the defaults differ:
|
||||
|
||||
| Setting | TB2 | TBLite |
|
||||
|----------------|----------------------------------|-----------------------------------------|
|
||||
| Dataset | `NousResearch/terminal-bench-2` | `NousResearch/openthoughts-tblite` |
|
||||
| Tasks | 89 | 100 |
|
||||
| Task timeout | 1800s (30 min) | 1200s (20 min) |
|
||||
| Wandb name | `terminal-bench-2` | `openthoughts-tblite` |
|
||||
|
||||
## Citation
|
||||
|
||||
```bibtex
|
||||
@software{OpenThoughts-TBLite,
|
||||
author = {OpenThoughts-Agent team, Snorkel AI, Bespoke Labs},
|
||||
month = Feb,
|
||||
title = {{OpenThoughts-TBLite: A High-Signal Benchmark for Iterating on Terminal Agents}},
|
||||
howpublished = {https://www.openthoughts.ai/blog/openthoughts-tblite},
|
||||
year = {2026}
|
||||
}
|
||||
```
|
||||
@@ -1,39 +0,0 @@
|
||||
# OpenThoughts-TBLite Evaluation -- Default Configuration
|
||||
#
|
||||
# Eval-only environment for the TBLite benchmark (100 difficulty-calibrated
|
||||
# terminal tasks, a faster proxy for Terminal-Bench 2.0).
|
||||
# Uses Modal terminal backend for per-task cloud-isolated sandboxes
|
||||
# and OpenRouter for inference.
|
||||
#
|
||||
# Usage:
|
||||
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
# --config environments/benchmarks/tblite/default.yaml
|
||||
#
|
||||
# # Override model:
|
||||
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
# --config environments/benchmarks/tblite/default.yaml \
|
||||
# --openai.model_name anthropic/claude-sonnet-4
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal", "file"]
|
||||
max_agent_turns: 60
|
||||
max_token_length: 32000
|
||||
agent_temperature: 0.8
|
||||
terminal_backend: "modal"
|
||||
terminal_timeout: 300 # 5 min per command (builds, pip install)
|
||||
tool_pool_size: 128 # thread pool for 100 parallel tasks
|
||||
dataset_name: "NousResearch/openthoughts-tblite"
|
||||
test_timeout: 600
|
||||
task_timeout: 1200 # 20 min wall-clock per task (TBLite tasks are faster)
|
||||
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
use_wandb: true
|
||||
wandb_name: "openthoughts-tblite"
|
||||
ensure_scores_are_not_same: false
|
||||
data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite"
|
||||
|
||||
openai:
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
model_name: "anthropic/claude-opus-4.6"
|
||||
server_type: "openai"
|
||||
health_check: false
|
||||
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||
@@ -1,38 +0,0 @@
|
||||
# OpenThoughts-TBLite Evaluation -- Docker Backend (Local Compute)
|
||||
#
|
||||
# Runs tasks in Docker containers on the local machine.
|
||||
# Sandboxed like Modal but no cloud costs. Good for dev/testing.
|
||||
#
|
||||
# Usage:
|
||||
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
# --config environments/benchmarks/tblite/local.yaml
|
||||
#
|
||||
# # Override concurrency:
|
||||
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
# --config environments/benchmarks/tblite/local.yaml \
|
||||
# --env.eval_concurrency 4
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal", "file"]
|
||||
max_agent_turns: 60
|
||||
max_token_length: 32000
|
||||
agent_temperature: 0.8
|
||||
terminal_backend: "docker"
|
||||
terminal_timeout: 300
|
||||
tool_pool_size: 16
|
||||
dataset_name: "NousResearch/openthoughts-tblite"
|
||||
test_timeout: 600
|
||||
task_timeout: 1200
|
||||
eval_concurrency: 8 # max 8 tasks at once
|
||||
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
use_wandb: false
|
||||
wandb_name: "openthoughts-tblite-local"
|
||||
ensure_scores_are_not_same: false
|
||||
data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite-local"
|
||||
|
||||
openai:
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
model_name: "anthropic/claude-sonnet-4"
|
||||
server_type: "openai"
|
||||
health_check: false
|
||||
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||
@@ -1,40 +0,0 @@
|
||||
# OpenThoughts-TBLite Evaluation -- Local vLLM Backend
|
||||
#
|
||||
# Runs against a local vLLM server with Docker sandboxes.
|
||||
#
|
||||
# Start the vLLM server from the atropos directory:
|
||||
# python -m example_trainer.vllm_api_server \
|
||||
# --model Qwen/Qwen3-4B-Instruct-2507 \
|
||||
# --port 9001 \
|
||||
# --gpu-memory-utilization 0.8 \
|
||||
# --max-model-len=32000
|
||||
#
|
||||
# Then run:
|
||||
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||
# --config environments/benchmarks/tblite/local_vllm.yaml
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal", "file"]
|
||||
max_agent_turns: 60
|
||||
max_token_length: 16000
|
||||
agent_temperature: 0.6
|
||||
terminal_backend: "docker"
|
||||
terminal_timeout: 300
|
||||
tool_pool_size: 16
|
||||
dataset_name: "NousResearch/openthoughts-tblite"
|
||||
test_timeout: 600
|
||||
task_timeout: 1200
|
||||
eval_concurrency: 8
|
||||
tool_call_parser: "hermes"
|
||||
system_prompt: "You are an expert terminal agent. You MUST use the provided tools to complete tasks. Use the terminal tool to run shell commands, read_file to read files, write_file to write files, search_files to search, and patch to edit files. Do NOT write out solutions as text - execute them using the tools. Always start by exploring the environment with terminal commands."
|
||||
tokenizer_name: "Qwen/Qwen3-4B-Instruct-2507"
|
||||
use_wandb: false
|
||||
wandb_name: "tblite-qwen3-4b-instruct"
|
||||
ensure_scores_are_not_same: false
|
||||
data_dir_to_save_evals: "environments/benchmarks/evals/tblite-qwen3-4b-local"
|
||||
|
||||
openai:
|
||||
base_url: "http://localhost:9001"
|
||||
model_name: "Qwen/Qwen3-4B-Instruct-2507"
|
||||
server_type: "vllm"
|
||||
health_check: false
|
||||
@@ -1,42 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# OpenThoughts-TBLite Evaluation
|
||||
#
|
||||
# Run from repo root:
|
||||
# bash environments/benchmarks/tblite/run_eval.sh
|
||||
#
|
||||
# Override model:
|
||||
# bash environments/benchmarks/tblite/run_eval.sh \
|
||||
# --openai.model_name anthropic/claude-sonnet-4
|
||||
#
|
||||
# Run a subset:
|
||||
# bash environments/benchmarks/tblite/run_eval.sh \
|
||||
# --env.task_filter broken-python,pandas-etl
|
||||
#
|
||||
# All terminal settings (backend, timeout, lifetime, pool size) are
|
||||
# configured via env config fields -- no env vars needed.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
mkdir -p logs evals/openthoughts-tblite
|
||||
LOG_FILE="logs/tblite_$(date +%Y%m%d_%H%M%S).log"
|
||||
|
||||
echo "OpenThoughts-TBLite Evaluation"
|
||||
echo "Log file: $LOG_FILE"
|
||||
echo ""
|
||||
|
||||
# Unbuffered python output so logs are written in real-time
|
||||
export PYTHONUNBUFFERED=1
|
||||
|
||||
# Show INFO-level agent loop timing (api/tool durations per turn)
|
||||
# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
|
||||
export LOGLEVEL=INFO
|
||||
|
||||
python tblite_env.py evaluate \
|
||||
--config default.yaml \
|
||||
"$@" \
|
||||
2>&1 | tee "$LOG_FILE"
|
||||
|
||||
echo ""
|
||||
echo "Log saved to: $LOG_FILE"
|
||||
echo "Eval results: evals/openthoughts-tblite/"
|
||||
@@ -1,119 +0,0 @@
|
||||
"""
|
||||
OpenThoughts-TBLite Evaluation Environment
|
||||
|
||||
A lighter, faster alternative to Terminal-Bench 2.0 for iterating on terminal
|
||||
agents. Uses the same evaluation logic as TerminalBench2EvalEnv but defaults
|
||||
to the NousResearch/openthoughts-tblite dataset (100 difficulty-calibrated
|
||||
tasks vs TB2's 89 harder tasks).
|
||||
|
||||
TBLite tasks are a curated subset of TB2 with a difficulty distribution
|
||||
designed to give meaningful signal even for smaller models:
|
||||
- Easy (40 tasks): >= 70% pass rate with Claude Haiku 4.5
|
||||
- Medium (26 tasks): 40-69% pass rate
|
||||
- Hard (26 tasks): 10-39% pass rate
|
||||
- Extreme (8 tasks): < 10% pass rate
|
||||
|
||||
Usage:
|
||||
python environments/benchmarks/tblite/tblite_env.py evaluate
|
||||
|
||||
# Filter to specific tasks:
|
||||
python environments/benchmarks/tblite/tblite_env.py evaluate \\
|
||||
--env.task_filter "broken-python,pandas-etl"
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
_repo_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_repo_root) not in sys.path:
|
||||
sys.path.insert(0, str(_repo_root))
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from atroposlib.envs.base import EvalHandlingEnum
|
||||
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||
|
||||
from environments.benchmarks.terminalbench_2.terminalbench2_env import (
|
||||
TerminalBench2EvalConfig,
|
||||
TerminalBench2EvalEnv,
|
||||
)
|
||||
|
||||
|
||||
class TBLiteEvalConfig(TerminalBench2EvalConfig):
|
||||
"""Configuration for the OpenThoughts-TBLite evaluation environment.
|
||||
|
||||
Inherits all TB2 config fields. Only the dataset default and task timeout
|
||||
differ -- TBLite tasks are calibrated to be faster.
|
||||
"""
|
||||
|
||||
dataset_name: str = Field(
|
||||
default="NousResearch/openthoughts-tblite",
|
||||
description="HuggingFace dataset containing TBLite tasks.",
|
||||
)
|
||||
|
||||
task_timeout: int = Field(
|
||||
default=1200,
|
||||
description="Maximum wall-clock seconds per task. TBLite tasks are "
|
||||
"generally faster than TB2, so 20 minutes is usually sufficient.",
|
||||
)
|
||||
|
||||
|
||||
class TBLiteEvalEnv(TerminalBench2EvalEnv):
|
||||
"""OpenThoughts-TBLite evaluation environment.
|
||||
|
||||
Inherits all evaluation logic from TerminalBench2EvalEnv (agent loop,
|
||||
test verification, Docker image resolution, metrics, wandb logging).
|
||||
Only the default configuration differs.
|
||||
"""
|
||||
|
||||
name = "openthoughts-tblite"
|
||||
env_config_cls = TBLiteEvalConfig
|
||||
|
||||
@classmethod
|
||||
def config_init(cls) -> Tuple[TBLiteEvalConfig, List[APIServerConfig]]:
|
||||
env_config = TBLiteEvalConfig(
|
||||
enabled_toolsets=["terminal", "file"],
|
||||
disabled_toolsets=None,
|
||||
distribution=None,
|
||||
|
||||
max_agent_turns=60,
|
||||
max_token_length=16000,
|
||||
agent_temperature=0.6,
|
||||
system_prompt=None,
|
||||
|
||||
terminal_backend="modal",
|
||||
terminal_timeout=300,
|
||||
|
||||
test_timeout=180,
|
||||
|
||||
# 100 tasks in parallel
|
||||
tool_pool_size=128,
|
||||
|
||||
eval_handling=EvalHandlingEnum.STOP_TRAIN,
|
||||
group_size=1,
|
||||
steps_per_eval=1,
|
||||
total_steps=1,
|
||||
|
||||
tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
|
||||
use_wandb=True,
|
||||
wandb_name="openthoughts-tblite",
|
||||
ensure_scores_are_not_same=False,
|
||||
)
|
||||
|
||||
server_configs = [
|
||||
APIServerConfig(
|
||||
base_url="https://openrouter.ai/api/v1",
|
||||
model_name="anthropic/claude-sonnet-4",
|
||||
server_type="openai",
|
||||
api_key=os.getenv("OPENROUTER_API_KEY", ""),
|
||||
health_check=False,
|
||||
)
|
||||
]
|
||||
|
||||
return env_config, server_configs
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
TBLiteEvalEnv.cli()
|
||||
@@ -1,42 +0,0 @@
|
||||
# Terminal-Bench 2.0 Evaluation -- Default Configuration
|
||||
#
|
||||
# Eval-only environment for the TB2 benchmark (89 terminal tasks).
|
||||
# Uses Modal terminal backend for per-task cloud-isolated sandboxes
|
||||
# and OpenRouter for inference.
|
||||
#
|
||||
# Usage:
|
||||
# python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||
# --config environments/benchmarks/terminalbench_2/default.yaml
|
||||
#
|
||||
# # Override model:
|
||||
# python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||
# --config environments/benchmarks/terminalbench_2/default.yaml \
|
||||
# --openai.model_name anthropic/claude-sonnet-4
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal", "file"]
|
||||
max_agent_turns: 60
|
||||
max_token_length: 32000
|
||||
agent_temperature: 0.8
|
||||
terminal_backend: "modal"
|
||||
terminal_timeout: 300 # 5 min per command (builds, pip install)
|
||||
tool_pool_size: 128 # thread pool for 89 parallel tasks
|
||||
dataset_name: "NousResearch/terminal-bench-2"
|
||||
test_timeout: 600
|
||||
task_timeout: 1800 # 30 min wall-clock per task, auto-FAIL if exceeded
|
||||
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
use_wandb: true
|
||||
wandb_name: "terminal-bench-2"
|
||||
ensure_scores_are_not_same: false
|
||||
data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"
|
||||
# CRITICAL: Limit concurrent Modal sandbox creations to avoid deadlocks.
|
||||
# Modal's blocking calls (App.lookup, etc.) deadlock when too many sandboxes
|
||||
# are created simultaneously inside thread pool workers via asyncio.run().
|
||||
max_concurrent_tasks: 8
|
||||
|
||||
openai:
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
model_name: "anthropic/claude-opus-4.6"
|
||||
server_type: "openai"
|
||||
health_check: false
|
||||
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||
@@ -1,42 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Terminal-Bench 2.0 Evaluation
|
||||
#
|
||||
# Run from repo root:
|
||||
# bash environments/benchmarks/terminalbench_2/run_eval.sh
|
||||
#
|
||||
# Override model:
|
||||
# bash environments/benchmarks/terminalbench_2/run_eval.sh \
|
||||
# --openai.model_name anthropic/claude-sonnet-4
|
||||
#
|
||||
# Run a subset:
|
||||
# bash environments/benchmarks/terminalbench_2/run_eval.sh \
|
||||
# --env.task_filter fix-git,git-multibranch
|
||||
#
|
||||
# All terminal settings (backend, timeout, lifetime, pool size) are
|
||||
# configured via env config fields -- no env vars needed.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
mkdir -p logs evals/terminal-bench-2
|
||||
LOG_FILE="logs/terminalbench2_$(date +%Y%m%d_%H%M%S).log"
|
||||
|
||||
echo "Terminal-Bench 2.0 Evaluation"
|
||||
echo "Log file: $LOG_FILE"
|
||||
echo ""
|
||||
|
||||
# Unbuffered python output so logs are written in real-time
|
||||
export PYTHONUNBUFFERED=1
|
||||
|
||||
# Show INFO-level agent loop timing (api/tool durations per turn)
|
||||
# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
|
||||
export LOGLEVEL=INFO
|
||||
|
||||
python terminalbench2_env.py evaluate \
|
||||
--config default.yaml \
|
||||
"$@" \
|
||||
2>&1 | tee "$LOG_FILE"
|
||||
|
||||
echo ""
|
||||
echo "Log saved to: $LOG_FILE"
|
||||
echo "Eval results: evals/terminal-bench-2/"
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,115 +0,0 @@
|
||||
# YC-Bench: Long-Horizon Agent Benchmark
|
||||
|
||||
[YC-Bench](https://github.com/collinear-ai/yc-bench) by [Collinear AI](https://collinear.ai/) is a deterministic, long-horizon benchmark that tests LLM agents' ability to act as a tech startup CEO. The agent manages a simulated company over 1-3 years, making compounding decisions about resource allocation, cash flow, task management, and prestige specialisation across 4 skill domains.
|
||||
|
||||
Unlike TerminalBench2 (which evaluates per-task coding ability with binary pass/fail), YC-Bench measures **long-term strategic coherence** — whether an agent can maintain consistent strategy, manage compounding consequences, and adapt plans over hundreds of turns.
|
||||
|
||||
## Setup
|
||||
|
||||
```bash
|
||||
# Install yc-bench (optional dependency)
|
||||
pip install "hermes-agent[yc-bench]"
|
||||
|
||||
# Or install from source
|
||||
git clone https://github.com/collinear-ai/yc-bench
|
||||
cd yc-bench && pip install -e .
|
||||
|
||||
# Verify
|
||||
yc-bench --help
|
||||
```
|
||||
|
||||
## Running
|
||||
|
||||
```bash
|
||||
# From the repo root:
|
||||
bash environments/benchmarks/yc_bench/run_eval.sh
|
||||
|
||||
# Or directly:
|
||||
python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
|
||||
--config environments/benchmarks/yc_bench/default.yaml
|
||||
|
||||
# Override model:
|
||||
bash environments/benchmarks/yc_bench/run_eval.sh \
|
||||
--openai.model_name anthropic/claude-opus-4-20250514
|
||||
|
||||
# Quick single-preset test:
|
||||
bash environments/benchmarks/yc_bench/run_eval.sh \
|
||||
--env.presets '["fast_test"]' --env.seeds '[1]'
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
### Architecture
|
||||
|
||||
```
|
||||
HermesAgentLoop (our agent)
|
||||
-> terminal tool -> subprocess("yc-bench company status") -> JSON output
|
||||
-> terminal tool -> subprocess("yc-bench task accept --task-id X") -> JSON
|
||||
-> terminal tool -> subprocess("yc-bench sim resume") -> JSON (advance time)
|
||||
-> ... (100-500 turns per run)
|
||||
```
|
||||
|
||||
The environment initialises the simulation via `yc-bench sim init` (NOT `yc-bench run`, which would start yc-bench's own built-in agent loop). Our `HermesAgentLoop` then drives all interaction through CLI commands.
|
||||
|
||||
### Simulation Mechanics
|
||||
|
||||
- **4 skill domains**: research, inference, data_environment, training
|
||||
- **Prestige system** (1.0-10.0): Gates access to higher-paying tasks
|
||||
- **Employee management**: Junior/Mid/Senior with domain-specific skill rates
|
||||
- **Throughput splitting**: `effective_rate = base_rate / N` active tasks per employee
|
||||
- **Financial pressure**: Monthly payroll, bankruptcy = game over
|
||||
- **Deterministic**: SHA256-based RNG — same seed + preset = same world
|
||||
|
||||
### Difficulty Presets
|
||||
|
||||
| Preset | Employees | Tasks | Focus |
|
||||
|-----------|-----------|-------|-------|
|
||||
| tutorial | 3 | 50 | Basic loop mechanics |
|
||||
| easy | 5 | 100 | Throughput awareness |
|
||||
| **medium**| 5 | 150 | Prestige climbing + domain specialisation |
|
||||
| **hard** | 7 | 200 | Precise ETA reasoning |
|
||||
| nightmare | 8 | 300 | Sustained perfection under payroll pressure |
|
||||
| fast_test | (varies) | (varies) | Quick validation (~50 turns) |
|
||||
|
||||
Default eval runs **fast_test + medium + hard** × 3 seeds = 9 runs.
|
||||
|
||||
### Scoring
|
||||
|
||||
```
|
||||
composite = 0.5 × survival + 0.5 × normalised_funds
|
||||
```
|
||||
|
||||
- **Survival** (binary): Did the company avoid bankruptcy?
|
||||
- **Normalised funds** (0.0-1.0): Log-scale relative to initial $250K capital
|
||||
|
||||
## Configuration
|
||||
|
||||
Key fields in `default.yaml`:
|
||||
|
||||
| Field | Default | Description |
|
||||
|-------|---------|-------------|
|
||||
| `presets` | `["fast_test", "medium", "hard"]` | Which presets to evaluate |
|
||||
| `seeds` | `[1, 2, 3]` | RNG seeds per preset |
|
||||
| `max_agent_turns` | 200 | Max LLM calls per run |
|
||||
| `run_timeout` | 3600 | Wall-clock timeout per run (seconds) |
|
||||
| `survival_weight` | 0.5 | Weight of survival in composite score |
|
||||
| `funds_weight` | 0.5 | Weight of normalised funds in composite |
|
||||
| `horizon_years` | null | Override horizon (null = auto from preset) |
|
||||
|
||||
## Cost & Time Estimates
|
||||
|
||||
Each run is 100-500 LLM turns. Approximate costs per run at typical API rates:
|
||||
|
||||
| Preset | Turns | Time | Est. Cost |
|
||||
|--------|-------|------|-----------|
|
||||
| fast_test | ~50 | 5-10 min | $1-5 |
|
||||
| medium | ~200 | 20-40 min | $5-15 |
|
||||
| hard | ~300 | 30-60 min | $10-25 |
|
||||
|
||||
Full default eval (9 runs): ~3-6 hours, $50-200 depending on model.
|
||||
|
||||
## References
|
||||
|
||||
- [collinear-ai/yc-bench](https://github.com/collinear-ai/yc-bench) — Official repository
|
||||
- [Collinear AI](https://collinear.ai/) — Company behind yc-bench
|
||||
- [TerminalBench2](../terminalbench_2/) — Per-task coding benchmark (complementary)
|
||||
@@ -1,43 +0,0 @@
|
||||
# YC-Bench Evaluation -- Default Configuration
|
||||
#
|
||||
# Long-horizon agent benchmark: agent plays CEO of an AI startup over
|
||||
# a simulated 1-3 year run, interacting via yc-bench CLI subcommands.
|
||||
#
|
||||
# Requires: pip install "hermes-agent[yc-bench]"
|
||||
#
|
||||
# Usage:
|
||||
# python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
|
||||
# --config environments/benchmarks/yc_bench/default.yaml
|
||||
#
|
||||
# # Override model:
|
||||
# python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
|
||||
# --config environments/benchmarks/yc_bench/default.yaml \
|
||||
# --openai.model_name anthropic/claude-opus-4-20250514
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal"]
|
||||
max_agent_turns: 200
|
||||
max_token_length: 32000
|
||||
agent_temperature: 0.0
|
||||
terminal_backend: "local"
|
||||
terminal_timeout: 60
|
||||
presets: ["fast_test", "medium", "hard"]
|
||||
seeds: [1, 2, 3]
|
||||
run_timeout: 3600 # 60 min wall-clock per run, auto-FAIL if exceeded
|
||||
survival_weight: 0.5 # weight of binary survival in composite score
|
||||
funds_weight: 0.5 # weight of normalised final funds in composite score
|
||||
db_dir: "/tmp/yc_bench_dbs"
|
||||
company_name: "BenchCo"
|
||||
start_date: "01/01/2025" # MM/DD/YYYY (yc-bench convention)
|
||||
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
use_wandb: true
|
||||
wandb_name: "yc-bench"
|
||||
ensure_scores_are_not_same: false
|
||||
data_dir_to_save_evals: "environments/benchmarks/evals/yc-bench"
|
||||
|
||||
openai:
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
model_name: "anthropic/claude-sonnet-4.6"
|
||||
server_type: "openai"
|
||||
health_check: false
|
||||
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||
@@ -1,34 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# YC-Bench Evaluation
|
||||
#
|
||||
# Requires: pip install "hermes-agent[yc-bench]"
|
||||
#
|
||||
# Run from repo root:
|
||||
# bash environments/benchmarks/yc_bench/run_eval.sh
|
||||
#
|
||||
# Override model:
|
||||
# bash environments/benchmarks/yc_bench/run_eval.sh \
|
||||
# --openai.model_name anthropic/claude-opus-4-20250514
|
||||
#
|
||||
# Run a single preset:
|
||||
# bash environments/benchmarks/yc_bench/run_eval.sh \
|
||||
# --env.presets '["fast_test"]' --env.seeds '[1]'
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
mkdir -p logs evals/yc-bench
|
||||
LOG_FILE="logs/yc_bench_$(date +%Y%m%d_%H%M%S).log"
|
||||
|
||||
echo "YC-Bench Evaluation"
|
||||
echo "Log: $LOG_FILE"
|
||||
echo ""
|
||||
|
||||
PYTHONUNBUFFERED=1 LOGLEVEL="${LOGLEVEL:-INFO}" \
|
||||
python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
|
||||
--config environments/benchmarks/yc_bench/default.yaml \
|
||||
"$@" \
|
||||
2>&1 | tee "$LOG_FILE"
|
||||
|
||||
echo ""
|
||||
echo "Log saved to: $LOG_FILE"
|
||||
@@ -1,848 +0,0 @@
|
||||
"""
|
||||
YCBenchEvalEnv -- YC-Bench Long-Horizon Agent Benchmark Environment
|
||||
|
||||
Evaluates agentic LLMs on YC-Bench: a deterministic, long-horizon benchmark
|
||||
where the agent acts as CEO of an AI startup over a simulated 1-3 year run.
|
||||
The agent manages cash flow, employees, tasks, and prestige across 4 domains,
|
||||
interacting exclusively via CLI subprocess calls against a SQLite-backed
|
||||
discrete-event simulation.
|
||||
|
||||
Unlike TerminalBench2 (per-task binary pass/fail), YC-Bench measures sustained
|
||||
multi-turn strategic coherence -- whether an agent can manage compounding
|
||||
decisions over hundreds of turns without going bankrupt.
|
||||
|
||||
This is an eval-only environment. Run via:
|
||||
|
||||
python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
|
||||
--config environments/benchmarks/yc_bench/default.yaml
|
||||
|
||||
The evaluate flow:
|
||||
1. setup() -- Verifies yc-bench installed, builds eval matrix (preset x seed)
|
||||
2. evaluate() -- Iterates over all runs sequentially through:
|
||||
a. rollout_and_score_eval() -- Per-run agent loop
|
||||
- Initialises a fresh yc-bench simulation via `sim init` (NOT `run`)
|
||||
- Runs HermesAgentLoop with terminal tool only
|
||||
- Reads final SQLite DB to extract score
|
||||
- Returns survival (0/1) + normalised funds score
|
||||
b. Aggregates per-preset and overall metrics
|
||||
c. Logs results via evaluate_log() and wandb
|
||||
|
||||
Key features:
|
||||
- CLI-only interface: agent calls yc-bench subcommands via terminal tool
|
||||
- Deterministic: same seed + preset = same world (SHA256-based RNG)
|
||||
- Multi-dimensional scoring: survival + normalised final funds
|
||||
- Per-preset difficulty breakdown in results
|
||||
- Isolated SQLite DB per run (no cross-run state leakage)
|
||||
|
||||
Requires: pip install hermes-agent[yc-bench]
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
_repo_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_repo_root) not in sys.path:
|
||||
sys.path.insert(0, str(_repo_root))
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from atroposlib.envs.base import EvalHandlingEnum
|
||||
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||
|
||||
from environments.agent_loop import HermesAgentLoop
|
||||
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# =============================================================================
|
||||
# System prompt
|
||||
# =============================================================================
|
||||
|
||||
YC_BENCH_SYSTEM_PROMPT = """\
|
||||
You are the autonomous CEO of an early-stage AI startup in a deterministic
|
||||
business simulation. You manage the company exclusively through the `yc-bench`
|
||||
CLI tool. Your primary goal is to **survive** until the simulation horizon ends
|
||||
without going bankrupt, while **maximising final funds**.
|
||||
|
||||
## Simulation Mechanics
|
||||
|
||||
- **Funds**: You start with $250,000 seed capital. Revenue comes from completing
|
||||
tasks. Rewards scale with your prestige: `base × (1 + scale × (prestige − 1))`.
|
||||
- **Domains**: There are 4 skill domains: **research**, **inference**,
|
||||
**data_environment**, and **training**. Each has its own prestige level
|
||||
(1.0-10.0). Higher prestige unlocks better-paying tasks.
|
||||
- **Employees**: You have employees (Junior/Mid/Senior) with domain-specific
|
||||
skill rates. **Throughput splits**: `effective_rate = base_rate / N` where N
|
||||
is the number of active tasks assigned to that employee. Focus beats breadth.
|
||||
- **Payroll**: Deducted automatically on the first business day of each month.
|
||||
Running out of funds = bankruptcy = game over.
|
||||
- **Time**: The simulation runs on business days (Mon-Fri), 09:00-18:00.
|
||||
Time only advances when you call `yc-bench sim resume`.
|
||||
|
||||
## Task Lifecycle
|
||||
|
||||
1. Browse market tasks with `market browse`
|
||||
2. Accept a task with `task accept` (this sets its deadline)
|
||||
3. Assign employees with `task assign`
|
||||
4. Dispatch with `task dispatch` to start work
|
||||
5. Call `sim resume` to advance time and let employees make progress
|
||||
6. Tasks complete when all domain requirements are fulfilled
|
||||
|
||||
**Penalties for failure vary by difficulty preset.** Completing a task on time
|
||||
earns full reward + prestige gain. Missing a deadline or cancelling a task
|
||||
incurs prestige penalties -- cancelling is always more costly than letting a
|
||||
task fail, so cancel only as a last resort.
|
||||
|
||||
## CLI Commands
|
||||
|
||||
### Observe
|
||||
- `yc-bench company status` -- funds, prestige, runway
|
||||
- `yc-bench employee list` -- skills, salary, active tasks
|
||||
- `yc-bench market browse [--domain D] [--required-prestige-lte N]` -- available tasks
|
||||
- `yc-bench task list [--status active|planned]` -- your tasks
|
||||
- `yc-bench task inspect --task-id UUID` -- progress, deadline, assignments
|
||||
- `yc-bench finance ledger [--category monthly_payroll|task_reward]` -- transaction history
|
||||
- `yc-bench report monthly` -- monthly P&L
|
||||
|
||||
### Act
|
||||
- `yc-bench task accept --task-id UUID` -- accept from market
|
||||
- `yc-bench task assign --task-id UUID --employee-id UUID` -- assign employee
|
||||
- `yc-bench task dispatch --task-id UUID` -- start work (needs >=1 assignment)
|
||||
- `yc-bench task cancel --task-id UUID --reason "text"` -- cancel (prestige penalty)
|
||||
- `yc-bench sim resume` -- advance simulation clock
|
||||
|
||||
### Memory (persists across context truncation)
|
||||
- `yc-bench scratchpad read` -- read your persistent notes
|
||||
- `yc-bench scratchpad write --content "text"` -- overwrite notes
|
||||
- `yc-bench scratchpad append --content "text"` -- append to notes
|
||||
- `yc-bench scratchpad clear` -- clear notes
|
||||
|
||||
## Strategy Guidelines
|
||||
|
||||
1. **Specialise in 2-3 domains** to climb the prestige ladder faster and unlock
|
||||
high-reward tasks. Don't spread thin across all 4 domains early on.
|
||||
2. **Focus employees** -- assigning one employee to many tasks halves their
|
||||
throughput per additional task. Keep assignments concentrated.
|
||||
3. **Use the scratchpad** to track your strategy, upcoming deadlines, and
|
||||
employee assignments. This persists even if conversation context is truncated.
|
||||
4. **Monitor runway** -- always know how many months of payroll you can cover.
|
||||
Accept high-reward tasks before payroll dates.
|
||||
5. **Don't over-accept** -- taking too many tasks and missing deadlines cascades
|
||||
into prestige loss, locking you out of profitable contracts.
|
||||
6. Use `finance ledger` and `report monthly` to track revenue trends.
|
||||
|
||||
## Your Turn
|
||||
|
||||
Each turn:
|
||||
1. Call `yc-bench company status` and `yc-bench task list` to orient yourself.
|
||||
2. Check for completed tasks and pending deadlines.
|
||||
3. Browse market for profitable tasks within your prestige level.
|
||||
4. Accept, assign, and dispatch tasks strategically.
|
||||
5. Call `yc-bench sim resume` to advance time.
|
||||
6. Repeat until the simulation ends.
|
||||
|
||||
Think step by step before acting."""
|
||||
|
||||
# Starting funds in cents ($250,000)
|
||||
INITIAL_FUNDS_CENTS = 25_000_000
|
||||
|
||||
# Default horizon per preset (years)
|
||||
_PRESET_HORIZONS = {
|
||||
"tutorial": 1,
|
||||
"easy": 1,
|
||||
"medium": 1,
|
||||
"hard": 1,
|
||||
"nightmare": 1,
|
||||
"fast_test": 1,
|
||||
"default": 3,
|
||||
"high_reward": 1,
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Configuration
|
||||
# =============================================================================
|
||||
|
||||
class YCBenchEvalConfig(HermesAgentEnvConfig):
|
||||
"""
|
||||
Configuration for the YC-Bench evaluation environment.
|
||||
|
||||
Extends HermesAgentEnvConfig with YC-Bench-specific settings for
|
||||
preset selection, seed control, scoring, and simulation parameters.
|
||||
"""
|
||||
|
||||
presets: List[str] = Field(
|
||||
default=["fast_test", "medium", "hard"],
|
||||
description="YC-Bench preset names to evaluate.",
|
||||
)
|
||||
seeds: List[int] = Field(
|
||||
default=[1, 2, 3],
|
||||
description="Random seeds -- each preset x seed = one run.",
|
||||
)
|
||||
run_timeout: int = Field(
|
||||
default=3600,
|
||||
description="Maximum wall-clock seconds per run. Default 60 minutes.",
|
||||
)
|
||||
survival_weight: float = Field(
|
||||
default=0.5,
|
||||
description="Weight of survival (0/1) in composite score.",
|
||||
)
|
||||
funds_weight: float = Field(
|
||||
default=0.5,
|
||||
description="Weight of normalised final funds in composite score.",
|
||||
)
|
||||
db_dir: str = Field(
|
||||
default="/tmp/yc_bench_dbs",
|
||||
description="Directory for per-run SQLite databases.",
|
||||
)
|
||||
horizon_years: Optional[int] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"Simulation horizon in years. If None (default), inferred from "
|
||||
"preset name (1 year for most, 3 for 'default')."
|
||||
),
|
||||
)
|
||||
company_name: str = Field(
|
||||
default="BenchCo",
|
||||
description="Name of the simulated company.",
|
||||
)
|
||||
start_date: str = Field(
|
||||
default="01/01/2025",
|
||||
description="Simulation start date in MM/DD/YYYY format (yc-bench convention).",
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Scoring helpers
|
||||
# =============================================================================
|
||||
|
||||
def _read_final_score(db_path: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Read final game state from a YC-Bench SQLite database.
|
||||
|
||||
Returns dict with final_funds_cents (int), survived (bool),
|
||||
terminal_reason (str).
|
||||
|
||||
Note: yc-bench table names are plural -- 'companies' not 'company',
|
||||
'sim_events' not 'simulation_log'.
|
||||
"""
|
||||
if not os.path.exists(db_path):
|
||||
logger.warning("DB not found at %s", db_path)
|
||||
return {
|
||||
"final_funds_cents": 0,
|
||||
"survived": False,
|
||||
"terminal_reason": "db_missing",
|
||||
}
|
||||
|
||||
conn = None
|
||||
try:
|
||||
conn = sqlite3.connect(db_path)
|
||||
cur = conn.cursor()
|
||||
|
||||
# Read final funds from the 'companies' table
|
||||
cur.execute("SELECT funds_cents FROM companies LIMIT 1")
|
||||
row = cur.fetchone()
|
||||
funds = row[0] if row else 0
|
||||
|
||||
# Determine terminal reason from 'sim_events' table
|
||||
terminal_reason = "unknown"
|
||||
try:
|
||||
cur.execute(
|
||||
"SELECT event_type FROM sim_events "
|
||||
"WHERE event_type IN ('bankruptcy', 'horizon_end') "
|
||||
"ORDER BY scheduled_at DESC LIMIT 1"
|
||||
)
|
||||
event_row = cur.fetchone()
|
||||
if event_row:
|
||||
terminal_reason = event_row[0]
|
||||
except sqlite3.OperationalError:
|
||||
# Table may not exist if simulation didn't progress
|
||||
pass
|
||||
|
||||
survived = funds >= 0 and terminal_reason != "bankruptcy"
|
||||
return {
|
||||
"final_funds_cents": funds,
|
||||
"survived": survived,
|
||||
"terminal_reason": terminal_reason,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to read DB %s: %s", db_path, e)
|
||||
return {
|
||||
"final_funds_cents": 0,
|
||||
"survived": False,
|
||||
"terminal_reason": f"db_error: {e}",
|
||||
}
|
||||
finally:
|
||||
if conn:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _compute_composite_score(
|
||||
final_funds_cents: int,
|
||||
survived: bool,
|
||||
survival_weight: float = 0.5,
|
||||
funds_weight: float = 0.5,
|
||||
initial_funds_cents: int = INITIAL_FUNDS_CENTS,
|
||||
) -> float:
|
||||
"""
|
||||
Compute composite score from survival and final funds.
|
||||
|
||||
Score = survival_weight * survival_score
|
||||
+ funds_weight * normalised_funds_score
|
||||
|
||||
Normalised funds uses log-scale relative to initial capital:
|
||||
- funds <= 0: 0.0
|
||||
- funds == initial: ~0.15
|
||||
- funds == 10x: ~0.52
|
||||
- funds == 100x: 1.0
|
||||
"""
|
||||
survival_score = 1.0 if survived else 0.0
|
||||
|
||||
if final_funds_cents <= 0:
|
||||
funds_score = 0.0
|
||||
else:
|
||||
max_ratio = 100.0
|
||||
ratio = final_funds_cents / max(initial_funds_cents, 1)
|
||||
funds_score = min(math.log1p(ratio) / math.log1p(max_ratio), 1.0)
|
||||
|
||||
return survival_weight * survival_score + funds_weight * funds_score
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Environment
|
||||
# =============================================================================
|
||||
|
||||
class YCBenchEvalEnv(HermesAgentBaseEnv):
|
||||
"""
|
||||
YC-Bench long-horizon agent benchmark environment (eval-only).
|
||||
|
||||
Each eval item is a (preset, seed) pair. The environment initialises the
|
||||
simulation via ``yc-bench sim init`` (NOT ``yc-bench run`` which would start
|
||||
a competing built-in agent loop). The HermesAgentLoop then drives the
|
||||
interaction by calling individual yc-bench CLI commands via the terminal tool.
|
||||
|
||||
After the agent loop ends, the SQLite DB is read to extract the final score.
|
||||
|
||||
Scoring:
|
||||
composite = 0.5 * survival + 0.5 * normalised_funds
|
||||
"""
|
||||
|
||||
name = "yc-bench"
|
||||
env_config_cls = YCBenchEvalConfig
|
||||
|
||||
@classmethod
|
||||
def config_init(cls) -> Tuple[YCBenchEvalConfig, List[APIServerConfig]]:
|
||||
env_config = YCBenchEvalConfig(
|
||||
enabled_toolsets=["terminal"],
|
||||
disabled_toolsets=None,
|
||||
distribution=None,
|
||||
max_agent_turns=200,
|
||||
max_token_length=32000,
|
||||
agent_temperature=0.0,
|
||||
system_prompt=YC_BENCH_SYSTEM_PROMPT,
|
||||
terminal_backend="local",
|
||||
terminal_timeout=60,
|
||||
presets=["fast_test", "medium", "hard"],
|
||||
seeds=[1, 2, 3],
|
||||
run_timeout=3600,
|
||||
survival_weight=0.5,
|
||||
funds_weight=0.5,
|
||||
db_dir="/tmp/yc_bench_dbs",
|
||||
eval_handling=EvalHandlingEnum.STOP_TRAIN,
|
||||
group_size=1,
|
||||
steps_per_eval=1,
|
||||
total_steps=1,
|
||||
tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
|
||||
use_wandb=True,
|
||||
wandb_name="yc-bench",
|
||||
ensure_scores_are_not_same=False,
|
||||
)
|
||||
|
||||
server_configs = [
|
||||
APIServerConfig(
|
||||
base_url="https://openrouter.ai/api/v1",
|
||||
model_name="anthropic/claude-sonnet-4.6",
|
||||
server_type="openai",
|
||||
api_key=os.getenv("OPENROUTER_API_KEY", ""),
|
||||
health_check=False,
|
||||
)
|
||||
]
|
||||
|
||||
return env_config, server_configs
|
||||
|
||||
# =========================================================================
|
||||
# Setup
|
||||
# =========================================================================
|
||||
|
||||
async def setup(self):
|
||||
"""Verify yc-bench is installed and build the eval matrix."""
|
||||
# Verify yc-bench CLI is available
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["yc-bench", "--help"], capture_output=True, text=True, timeout=10
|
||||
)
|
||||
if result.returncode != 0:
|
||||
raise FileNotFoundError
|
||||
except (FileNotFoundError, subprocess.TimeoutExpired):
|
||||
raise RuntimeError(
|
||||
"yc-bench CLI not found. Install with:\n"
|
||||
' pip install "hermes-agent[yc-bench]"\n'
|
||||
"Or: git clone https://github.com/collinear-ai/yc-bench "
|
||||
"&& cd yc-bench && pip install -e ."
|
||||
)
|
||||
print("yc-bench CLI verified.")
|
||||
|
||||
# Build eval matrix: preset x seed
|
||||
self.all_eval_items = [
|
||||
{"preset": preset, "seed": seed}
|
||||
for preset in self.config.presets
|
||||
for seed in self.config.seeds
|
||||
]
|
||||
self.iter = 0
|
||||
|
||||
os.makedirs(self.config.db_dir, exist_ok=True)
|
||||
self.eval_metrics: List[Tuple[str, float]] = []
|
||||
|
||||
# Streaming JSONL log for crash-safe result persistence
|
||||
log_dir = os.path.join(os.path.dirname(__file__), "logs")
|
||||
os.makedirs(log_dir, exist_ok=True)
|
||||
run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
self._streaming_path = os.path.join(log_dir, f"samples_{run_ts}.jsonl")
|
||||
self._streaming_file = open(self._streaming_path, "w", encoding="utf-8")
|
||||
self._streaming_lock = threading.Lock()
|
||||
|
||||
print(f"\nYC-Bench eval matrix: {len(self.all_eval_items)} runs")
|
||||
for item in self.all_eval_items:
|
||||
print(f" preset={item['preset']!r} seed={item['seed']}")
|
||||
print(f"Streaming results to: {self._streaming_path}\n")
|
||||
|
||||
def _save_result(self, result: Dict[str, Any]):
|
||||
"""Write a single run result to the streaming JSONL file immediately."""
|
||||
if not hasattr(self, "_streaming_file") or self._streaming_file.closed:
|
||||
return
|
||||
with self._streaming_lock:
|
||||
self._streaming_file.write(
|
||||
json.dumps(result, ensure_ascii=False, default=str) + "\n"
|
||||
)
|
||||
self._streaming_file.flush()
|
||||
|
||||
# =========================================================================
|
||||
# Training pipeline stubs (eval-only -- not used)
|
||||
# =========================================================================
|
||||
|
||||
async def get_next_item(self):
|
||||
item = self.all_eval_items[self.iter % len(self.all_eval_items)]
|
||||
self.iter += 1
|
||||
return item
|
||||
|
||||
def format_prompt(self, item: Dict[str, Any]) -> str:
|
||||
preset = item["preset"]
|
||||
seed = item["seed"]
|
||||
return (
|
||||
f"A new YC-Bench simulation has been initialized "
|
||||
f"(preset='{preset}', seed={seed}).\n"
|
||||
f"Your company '{self.config.company_name}' is ready.\n\n"
|
||||
"Begin by calling:\n"
|
||||
"1. `yc-bench company status` -- see your starting funds and prestige\n"
|
||||
"2. `yc-bench employee list` -- see your team and their skills\n"
|
||||
"3. `yc-bench market browse --required-prestige-lte 1` -- find tasks "
|
||||
"you can take\n\n"
|
||||
"Then accept 2-3 tasks, assign employees, dispatch them, and call "
|
||||
"`yc-bench sim resume` to advance time. Repeat this loop until the "
|
||||
"simulation ends (horizon reached or bankruptcy)."
|
||||
)
|
||||
|
||||
async def compute_reward(self, item, result, ctx) -> float:
|
||||
return 0.0
|
||||
|
||||
async def collect_trajectories(self, item):
|
||||
return None, []
|
||||
|
||||
async def score(self, rollout_group_data):
|
||||
return None
|
||||
|
||||
# =========================================================================
|
||||
# Per-run evaluation
|
||||
# =========================================================================
|
||||
|
||||
async def rollout_and_score_eval(self, eval_item: Dict[str, Any]) -> Dict:
|
||||
"""
|
||||
Evaluate a single (preset, seed) run.
|
||||
|
||||
1. Sets DATABASE_URL and YC_BENCH_EXPERIMENT env vars
|
||||
2. Initialises the simulation via ``yc-bench sim init`` (NOT ``run``)
|
||||
3. Runs HermesAgentLoop with terminal tool
|
||||
4. Reads SQLite DB to compute final score
|
||||
5. Returns result dict with survival, funds, and composite score
|
||||
"""
|
||||
preset = eval_item["preset"]
|
||||
seed = eval_item["seed"]
|
||||
run_id = str(uuid.uuid4())[:8]
|
||||
run_key = f"{preset}_seed{seed}_{run_id}"
|
||||
|
||||
from tqdm import tqdm
|
||||
tqdm.write(f" [START] preset={preset!r} seed={seed} (run_id={run_id})")
|
||||
run_start = time.time()
|
||||
|
||||
# Isolated DB per run -- prevents cross-run state leakage
|
||||
db_path = os.path.join(self.config.db_dir, f"yc_bench_{run_key}.db")
|
||||
os.environ["DATABASE_URL"] = f"sqlite:///{db_path}"
|
||||
os.environ["YC_BENCH_EXPERIMENT"] = preset
|
||||
|
||||
# Determine horizon: explicit config override > preset lookup > default 1
|
||||
horizon = self.config.horizon_years or _PRESET_HORIZONS.get(preset, 1)
|
||||
|
||||
try:
|
||||
# ----------------------------------------------------------
|
||||
# Step 1: Initialise the simulation via CLI
|
||||
# IMPORTANT: We use `sim init`, NOT `yc-bench run`.
|
||||
# `yc-bench run` starts yc-bench's own LLM agent loop (via
|
||||
# LiteLLM), which would compete with our HermesAgentLoop.
|
||||
# `sim init` just sets up the world and returns.
|
||||
# ----------------------------------------------------------
|
||||
init_cmd = [
|
||||
"yc-bench", "sim", "init",
|
||||
"--seed", str(seed),
|
||||
"--start-date", self.config.start_date,
|
||||
"--company-name", self.config.company_name,
|
||||
"--horizon-years", str(horizon),
|
||||
]
|
||||
init_result = subprocess.run(
|
||||
init_cmd, capture_output=True, text=True, timeout=30,
|
||||
)
|
||||
if init_result.returncode != 0:
|
||||
error_msg = (init_result.stderr or init_result.stdout).strip()
|
||||
raise RuntimeError(f"yc-bench sim init failed: {error_msg}")
|
||||
|
||||
tqdm.write(f" Simulation initialized (horizon={horizon}yr)")
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# Step 2: Run the HermesAgentLoop
|
||||
# ----------------------------------------------------------
|
||||
tools, valid_names = self._resolve_tools_for_group()
|
||||
|
||||
messages: List[Dict[str, Any]] = [
|
||||
{"role": "system", "content": YC_BENCH_SYSTEM_PROMPT},
|
||||
{"role": "user", "content": self.format_prompt(eval_item)},
|
||||
]
|
||||
|
||||
agent = HermesAgentLoop(
|
||||
server=self.server,
|
||||
tool_schemas=tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=self.config.max_agent_turns,
|
||||
task_id=run_id,
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
budget_config=self.config.build_budget_config(),
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# Step 3: Read final score from the simulation DB
|
||||
# ----------------------------------------------------------
|
||||
score_data = _read_final_score(db_path)
|
||||
final_funds = score_data["final_funds_cents"]
|
||||
survived = score_data["survived"]
|
||||
terminal_reason = score_data["terminal_reason"]
|
||||
|
||||
composite = _compute_composite_score(
|
||||
final_funds_cents=final_funds,
|
||||
survived=survived,
|
||||
survival_weight=self.config.survival_weight,
|
||||
funds_weight=self.config.funds_weight,
|
||||
)
|
||||
|
||||
elapsed = time.time() - run_start
|
||||
status = "SURVIVED" if survived else "BANKRUPT"
|
||||
if final_funds >= 0:
|
||||
funds_str = f"${final_funds / 100:,.0f}"
|
||||
else:
|
||||
funds_str = f"-${abs(final_funds) / 100:,.0f}"
|
||||
|
||||
tqdm.write(
|
||||
f" [{status}] preset={preset!r} seed={seed} "
|
||||
f"funds={funds_str} score={composite:.3f} "
|
||||
f"turns={result.turns_used} ({elapsed:.0f}s)"
|
||||
)
|
||||
|
||||
out = {
|
||||
"preset": preset,
|
||||
"seed": seed,
|
||||
"survived": survived,
|
||||
"final_funds_cents": final_funds,
|
||||
"final_funds_usd": final_funds / 100,
|
||||
"terminal_reason": terminal_reason,
|
||||
"composite_score": composite,
|
||||
"turns_used": result.turns_used,
|
||||
"finished_naturally": result.finished_naturally,
|
||||
"elapsed_seconds": elapsed,
|
||||
"db_path": db_path,
|
||||
"messages": result.messages,
|
||||
}
|
||||
self._save_result(out)
|
||||
return out
|
||||
|
||||
except Exception as e:
|
||||
elapsed = time.time() - run_start
|
||||
logger.error("Run %s failed: %s", run_key, e, exc_info=True)
|
||||
tqdm.write(
|
||||
f" [ERROR] preset={preset!r} seed={seed}: {e} ({elapsed:.0f}s)"
|
||||
)
|
||||
out = {
|
||||
"preset": preset,
|
||||
"seed": seed,
|
||||
"survived": False,
|
||||
"final_funds_cents": 0,
|
||||
"final_funds_usd": 0.0,
|
||||
"terminal_reason": f"error: {e}",
|
||||
"composite_score": 0.0,
|
||||
"turns_used": 0,
|
||||
"error": str(e),
|
||||
"elapsed_seconds": elapsed,
|
||||
}
|
||||
self._save_result(out)
|
||||
return out
|
||||
|
||||
# =========================================================================
|
||||
# Evaluate
|
||||
# =========================================================================
|
||||
|
||||
async def _run_with_timeout(self, item: Dict[str, Any]) -> Dict:
|
||||
"""Wrap a single rollout with a wall-clock timeout."""
|
||||
preset = item["preset"]
|
||||
seed = item["seed"]
|
||||
try:
|
||||
return await asyncio.wait_for(
|
||||
self.rollout_and_score_eval(item),
|
||||
timeout=self.config.run_timeout,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
from tqdm import tqdm
|
||||
tqdm.write(
|
||||
f" [TIMEOUT] preset={preset!r} seed={seed} "
|
||||
f"(exceeded {self.config.run_timeout}s)"
|
||||
)
|
||||
out = {
|
||||
"preset": preset,
|
||||
"seed": seed,
|
||||
"survived": False,
|
||||
"final_funds_cents": 0,
|
||||
"final_funds_usd": 0.0,
|
||||
"terminal_reason": f"timeout ({self.config.run_timeout}s)",
|
||||
"composite_score": 0.0,
|
||||
"turns_used": 0,
|
||||
"error": "timeout",
|
||||
}
|
||||
self._save_result(out)
|
||||
return out
|
||||
|
||||
async def evaluate(self, *args, **kwargs) -> None:
|
||||
"""
|
||||
Run YC-Bench evaluation over all (preset, seed) combinations.
|
||||
|
||||
Runs sequentially -- each run is 100-500 turns, parallelising would
|
||||
be prohibitively expensive and cause env var conflicts.
|
||||
"""
|
||||
start_time = time.time()
|
||||
from tqdm import tqdm
|
||||
|
||||
# --- tqdm-compatible logging handler (TB2 pattern) ---
|
||||
class _TqdmHandler(logging.Handler):
|
||||
def emit(self, record):
|
||||
try:
|
||||
tqdm.write(self.format(record))
|
||||
except Exception:
|
||||
self.handleError(record)
|
||||
|
||||
root = logging.getLogger()
|
||||
handler = _TqdmHandler()
|
||||
handler.setFormatter(
|
||||
logging.Formatter("%(levelname)s %(name)s: %(message)s")
|
||||
)
|
||||
root.handlers = [handler]
|
||||
for noisy in ("httpx", "openai"):
|
||||
logging.getLogger(noisy).setLevel(logging.WARNING)
|
||||
|
||||
# --- Print config summary ---
|
||||
print(f"\n{'='*60}")
|
||||
print("Starting YC-Bench Evaluation")
|
||||
print(f"{'='*60}")
|
||||
print(f" Presets: {self.config.presets}")
|
||||
print(f" Seeds: {self.config.seeds}")
|
||||
print(f" Total runs: {len(self.all_eval_items)}")
|
||||
print(f" Max turns/run: {self.config.max_agent_turns}")
|
||||
print(f" Run timeout: {self.config.run_timeout}s")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
results = []
|
||||
pbar = tqdm(
|
||||
total=len(self.all_eval_items), desc="YC-Bench", dynamic_ncols=True
|
||||
)
|
||||
|
||||
try:
|
||||
for item in self.all_eval_items:
|
||||
result = await self._run_with_timeout(item)
|
||||
results.append(result)
|
||||
survived_count = sum(1 for r in results if r.get("survived"))
|
||||
pbar.set_postfix_str(
|
||||
f"survived={survived_count}/{len(results)}"
|
||||
)
|
||||
pbar.update(1)
|
||||
|
||||
except (KeyboardInterrupt, asyncio.CancelledError):
|
||||
tqdm.write("\n[INTERRUPTED] Stopping evaluation...")
|
||||
pbar.close()
|
||||
try:
|
||||
from tools.terminal_tool import cleanup_all_environments
|
||||
cleanup_all_environments()
|
||||
except Exception:
|
||||
pass
|
||||
if hasattr(self, "_streaming_file") and not self._streaming_file.closed:
|
||||
self._streaming_file.close()
|
||||
return
|
||||
|
||||
pbar.close()
|
||||
end_time = time.time()
|
||||
|
||||
# --- Compute metrics ---
|
||||
valid = [r for r in results if r is not None]
|
||||
if not valid:
|
||||
print("Warning: No valid results.")
|
||||
return
|
||||
|
||||
total = len(valid)
|
||||
survived_total = sum(1 for r in valid if r.get("survived"))
|
||||
survival_rate = survived_total / total if total else 0.0
|
||||
avg_score = (
|
||||
sum(r.get("composite_score", 0) for r in valid) / total
|
||||
if total
|
||||
else 0.0
|
||||
)
|
||||
|
||||
preset_results: Dict[str, List[Dict]] = defaultdict(list)
|
||||
for r in valid:
|
||||
preset_results[r["preset"]].append(r)
|
||||
|
||||
eval_metrics = {
|
||||
"eval/survival_rate": survival_rate,
|
||||
"eval/avg_composite_score": avg_score,
|
||||
"eval/total_runs": total,
|
||||
"eval/survived_runs": survived_total,
|
||||
"eval/evaluation_time_seconds": end_time - start_time,
|
||||
}
|
||||
|
||||
for preset, items in sorted(preset_results.items()):
|
||||
ps = sum(1 for r in items if r.get("survived"))
|
||||
pt = len(items)
|
||||
pa = (
|
||||
sum(r.get("composite_score", 0) for r in items) / pt
|
||||
if pt
|
||||
else 0
|
||||
)
|
||||
key = preset.replace("-", "_")
|
||||
eval_metrics[f"eval/survival_rate_{key}"] = ps / pt if pt else 0
|
||||
eval_metrics[f"eval/avg_score_{key}"] = pa
|
||||
|
||||
self.eval_metrics = list(eval_metrics.items())
|
||||
|
||||
# --- Print summary ---
|
||||
print(f"\n{'='*60}")
|
||||
print("YC-Bench Evaluation Results")
|
||||
print(f"{'='*60}")
|
||||
print(
|
||||
f"Overall survival rate: {survival_rate:.1%} "
|
||||
f"({survived_total}/{total})"
|
||||
)
|
||||
print(f"Average composite score: {avg_score:.4f}")
|
||||
print(f"Evaluation time: {end_time - start_time:.1f}s")
|
||||
|
||||
print("\nPer-preset breakdown:")
|
||||
for preset, items in sorted(preset_results.items()):
|
||||
ps = sum(1 for r in items if r.get("survived"))
|
||||
pt = len(items)
|
||||
pa = (
|
||||
sum(r.get("composite_score", 0) for r in items) / pt
|
||||
if pt
|
||||
else 0
|
||||
)
|
||||
print(f" {preset}: {ps}/{pt} survived avg_score={pa:.4f}")
|
||||
for r in items:
|
||||
status = "SURVIVED" if r.get("survived") else "BANKRUPT"
|
||||
funds = r.get("final_funds_usd", 0)
|
||||
print(
|
||||
f" seed={r['seed']} [{status}] "
|
||||
f"${funds:,.0f} "
|
||||
f"score={r.get('composite_score', 0):.3f}"
|
||||
)
|
||||
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
# --- Log results ---
|
||||
samples = [
|
||||
{k: v for k, v in r.items() if k != "messages"} for r in valid
|
||||
]
|
||||
|
||||
try:
|
||||
await self.evaluate_log(
|
||||
metrics=eval_metrics,
|
||||
samples=samples,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
generation_parameters={
|
||||
"temperature": self.config.agent_temperature,
|
||||
"max_tokens": self.config.max_token_length,
|
||||
"max_agent_turns": self.config.max_agent_turns,
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Error logging results: {e}")
|
||||
|
||||
# --- Cleanup (TB2 pattern) ---
|
||||
if hasattr(self, "_streaming_file") and not self._streaming_file.closed:
|
||||
self._streaming_file.close()
|
||||
print(f"Results saved to: {self._streaming_path}")
|
||||
|
||||
try:
|
||||
from tools.terminal_tool import cleanup_all_environments
|
||||
cleanup_all_environments()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
from environments.agent_loop import _tool_executor
|
||||
_tool_executor.shutdown(wait=False, cancel_futures=True)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# =========================================================================
|
||||
# Wandb logging
|
||||
# =========================================================================
|
||||
|
||||
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
|
||||
"""Log YC-Bench-specific metrics to wandb."""
|
||||
if wandb_metrics is None:
|
||||
wandb_metrics = {}
|
||||
for k, v in self.eval_metrics:
|
||||
wandb_metrics[k] = v
|
||||
self.eval_metrics = []
|
||||
await super().wandb_log(wandb_metrics)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
YCBenchEvalEnv.cli()
|
||||
@@ -1,714 +0,0 @@
|
||||
"""
|
||||
HermesAgentBaseEnv -- Abstract Base Environment for Hermes-Agent + Atropos
|
||||
|
||||
Provides the Atropos integration plumbing that all hermes-agent environments share:
|
||||
- Two-mode operation (OpenAI server for Phase 1, VLLM ManagedServer for Phase 2)
|
||||
- Per-group toolset/distribution resolution
|
||||
- Agent loop orchestration via HermesAgentLoop
|
||||
- ToolContext creation for reward functions
|
||||
- ScoredDataGroup construction from ManagedServer state
|
||||
|
||||
Subclasses only need to implement:
|
||||
setup() -- Load dataset, initialize state
|
||||
get_next_item() -- Return the next item from the dataset
|
||||
format_prompt() -- Convert a dataset item into the user message
|
||||
compute_reward() -- Score the rollout (has full ToolContext access)
|
||||
evaluate() -- Periodic evaluation
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import uuid
|
||||
from abc import abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
||||
|
||||
# Ensure the hermes-agent repo root is on sys.path so that imports like
|
||||
# `from model_tools import ...` and `from environments.X import ...` work
|
||||
# regardless of where the script is invoked from.
|
||||
_repo_root = Path(__file__).resolve().parent.parent
|
||||
if str(_repo_root) not in sys.path:
|
||||
sys.path.insert(0, str(_repo_root))
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from pydantic import Field
|
||||
|
||||
# Load API keys from hermes-agent/.env so all environments can access them
|
||||
_env_path = _repo_root / ".env"
|
||||
if _env_path.exists():
|
||||
load_dotenv(dotenv_path=_env_path)
|
||||
|
||||
# Apply monkey patches for async-safe tool operation inside Atropos's event loop.
|
||||
# This patches SwerexModalEnvironment to use a background thread instead of
|
||||
# asyncio.run(), which would deadlock inside Atropos. Safe for normal CLI too.
|
||||
from environments.patches import apply_patches
|
||||
apply_patches()
|
||||
|
||||
from atroposlib.envs.base import (
|
||||
BaseEnv,
|
||||
BaseEnvConfig,
|
||||
ScoredDataGroup,
|
||||
ScoredDataItem,
|
||||
)
|
||||
from atroposlib.envs.server_handling.server_manager import (
|
||||
APIServerConfig,
|
||||
ServerBaseline,
|
||||
ServerManager,
|
||||
)
|
||||
from atroposlib.type_definitions import Item
|
||||
|
||||
from environments.agent_loop import AgentResult, HermesAgentLoop
|
||||
from environments.tool_context import ToolContext
|
||||
from tools.budget_config import (
|
||||
DEFAULT_RESULT_SIZE_CHARS,
|
||||
DEFAULT_TURN_BUDGET_CHARS,
|
||||
DEFAULT_PREVIEW_SIZE_CHARS,
|
||||
)
|
||||
|
||||
# Import hermes-agent toolset infrastructure
|
||||
from model_tools import get_tool_definitions
|
||||
from toolset_distributions import sample_toolsets_from_distribution
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HermesAgentEnvConfig(BaseEnvConfig):
|
||||
"""
|
||||
Configuration for hermes-agent Atropos environments.
|
||||
|
||||
Extends BaseEnvConfig with agent-specific settings for toolsets,
|
||||
terminal backend, dataset loading, and tool call parsing.
|
||||
"""
|
||||
|
||||
# --- Toolset configuration ---
|
||||
# Mutually exclusive: use either enabled_toolsets OR distribution
|
||||
enabled_toolsets: Optional[List[str]] = Field(
|
||||
default=None,
|
||||
description="Explicit list of hermes toolsets to enable (e.g., ['terminal', 'file', 'web']). "
|
||||
"If None and distribution is also None, all available toolsets are enabled.",
|
||||
)
|
||||
disabled_toolsets: Optional[List[str]] = Field(
|
||||
default=None,
|
||||
description="Toolsets to disable. Applied as a filter on top of enabled_toolsets or distribution.",
|
||||
)
|
||||
distribution: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Name of a toolset distribution from toolset_distributions.py "
|
||||
"(e.g., 'development', 'terminal_tasks'). Sampled once per group. "
|
||||
"Mutually exclusive with enabled_toolsets.",
|
||||
)
|
||||
|
||||
# --- Agent loop configuration ---
|
||||
max_agent_turns: int = Field(
|
||||
default=30,
|
||||
description="Maximum number of LLM calls (tool-calling iterations) per rollout.",
|
||||
)
|
||||
system_prompt: Optional[str] = Field(
|
||||
default=None,
|
||||
description="System prompt for the agent. Tools are handled via the tools= parameter, "
|
||||
"not embedded in the prompt text.",
|
||||
)
|
||||
agent_temperature: float = Field(
|
||||
default=1.0,
|
||||
description="Sampling temperature for agent generation during rollouts.",
|
||||
)
|
||||
|
||||
# --- Terminal backend ---
|
||||
terminal_backend: str = Field(
|
||||
default="local",
|
||||
description="Terminal backend: 'local', 'docker', 'modal', 'daytona', 'ssh', 'singularity'. "
|
||||
"Modal or Daytona recommended for production RL (cloud isolation per rollout).",
|
||||
)
|
||||
terminal_timeout: int = Field(
|
||||
default=120,
|
||||
description="Per-command timeout in seconds for terminal tool calls. "
|
||||
"Commands exceeding this are killed. Increase for tasks with long-running "
|
||||
"commands (compilation, pip install, etc.).",
|
||||
)
|
||||
terminal_lifetime: int = Field(
|
||||
default=3600,
|
||||
description="Sandbox inactivity lifetime in seconds. The cleanup thread kills "
|
||||
"sandboxes that have been idle longer than this. Must be longer than "
|
||||
"the longest gap between tool calls (e.g., waiting for LLM response).",
|
||||
)
|
||||
|
||||
# --- Dataset ---
|
||||
dataset_name: Optional[str] = Field(
|
||||
default=None,
|
||||
description="HuggingFace dataset name. Optional if tasks are defined inline.",
|
||||
)
|
||||
dataset_split: str = Field(
|
||||
default="train",
|
||||
description="Dataset split to use.",
|
||||
)
|
||||
prompt_field: str = Field(
|
||||
default="prompt",
|
||||
description="Which field in the dataset contains the prompt.",
|
||||
)
|
||||
|
||||
# --- Thread pool ---
|
||||
tool_pool_size: int = Field(
|
||||
default=128,
|
||||
description="Thread pool size for tool execution. Each concurrent task needs a "
|
||||
"thread for tool calls. Must be large enough for parallel evaluation. "
|
||||
"Too small = thread pool starvation.",
|
||||
)
|
||||
|
||||
# --- Phase 2: Tool call parsing ---
|
||||
tool_call_parser: str = Field(
|
||||
default="hermes",
|
||||
description="Tool call parser name for Phase 2 (VLLM server type). "
|
||||
"Ignored in Phase 1 (OpenAI server type where VLLM parses natively). "
|
||||
"Options: hermes, mistral, llama3_json, qwen, deepseek_v3, etc.",
|
||||
)
|
||||
|
||||
# --- Tool result budget ---
|
||||
# Defaults imported from tools.budget_config (single source of truth).
|
||||
default_result_size_chars: int = Field(
|
||||
default=DEFAULT_RESULT_SIZE_CHARS,
|
||||
description="Default per-tool threshold (chars) for persisting large results "
|
||||
"to sandbox. Results exceeding this are written to /tmp/hermes-results/ "
|
||||
"and replaced with a preview. Per-tool registry values take precedence "
|
||||
"unless overridden via tool_result_overrides.",
|
||||
)
|
||||
turn_budget_chars: int = Field(
|
||||
default=DEFAULT_TURN_BUDGET_CHARS,
|
||||
description="Aggregate char budget per assistant turn. If all tool results "
|
||||
"in a single turn exceed this, the largest are persisted to disk first.",
|
||||
)
|
||||
preview_size_chars: int = Field(
|
||||
default=DEFAULT_PREVIEW_SIZE_CHARS,
|
||||
description="Size of the inline preview shown after a tool result is persisted.",
|
||||
)
|
||||
tool_result_overrides: Optional[Dict[str, int]] = Field(
|
||||
default=None,
|
||||
description="Per-tool threshold overrides (chars). Keys are tool names, "
|
||||
"values are char thresholds. Overrides both the default and registry "
|
||||
"per-tool values. Example: {'terminal': 10000, 'search_files': 5000}. "
|
||||
"Note: read_file is pinned to infinity and cannot be overridden.",
|
||||
)
|
||||
|
||||
# --- Provider-specific parameters ---
|
||||
# Passed as extra_body to the OpenAI client's chat.completions.create() call.
|
||||
# Useful for OpenRouter provider preferences, transforms, route settings, etc.
|
||||
# Example YAML:
|
||||
# extra_body:
|
||||
# provider:
|
||||
# ignore: ["DeepInfra", "Fireworks"]
|
||||
# order: ["Together"]
|
||||
# transforms: ["middle-out"]
|
||||
extra_body: Optional[Dict[str, Any]] = Field(
|
||||
default=None,
|
||||
description="Extra body parameters passed to the OpenAI client's "
|
||||
"chat.completions.create(). Used for OpenRouter provider preferences, "
|
||||
"transforms, and other provider-specific settings.",
|
||||
)
|
||||
|
||||
def build_budget_config(self):
|
||||
"""Build a BudgetConfig from env config fields."""
|
||||
from tools.budget_config import BudgetConfig
|
||||
return BudgetConfig(
|
||||
default_result_size=self.default_result_size_chars,
|
||||
turn_budget=self.turn_budget_chars,
|
||||
preview_size=self.preview_size_chars,
|
||||
tool_overrides=dict(self.tool_result_overrides) if self.tool_result_overrides else {},
|
||||
)
|
||||
|
||||
|
||||
class HermesAgentBaseEnv(BaseEnv):
|
||||
"""
|
||||
Abstract base environment for hermes-agent Atropos integration.
|
||||
|
||||
Handles two modes of operation:
|
||||
- Phase 1 (OpenAI server type): Uses server.chat_completion() directly.
|
||||
The server (VLLM, SGLang, OpenRouter, OpenAI) handles tool call parsing
|
||||
and reasoning extraction natively. DummyManagedServer provides placeholder
|
||||
tokens. Good for SFT data gen, verifier testing, evaluation.
|
||||
|
||||
- Phase 2 (VLLM server type): Uses ManagedServer for exact token IDs + logprobs
|
||||
via /generate. Client-side tool call parser reconstructs structured tool_calls
|
||||
from raw output. Full RL training capability.
|
||||
|
||||
Subclasses must implement:
|
||||
setup() -- Load dataset, initialize state
|
||||
get_next_item() -- Return the next item to roll out
|
||||
format_prompt() -- Convert a dataset item into the user message string
|
||||
compute_reward() -- Score the rollout using ToolContext
|
||||
evaluate() -- Periodic evaluation
|
||||
"""
|
||||
|
||||
name: Optional[str] = "hermes-agent"
|
||||
env_config_cls = HermesAgentEnvConfig
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: HermesAgentEnvConfig,
|
||||
server_configs: Union[ServerBaseline, List[APIServerConfig]],
|
||||
slurm=False,
|
||||
testing=False,
|
||||
):
|
||||
super().__init__(config, server_configs, slurm, testing)
|
||||
|
||||
# Set terminal environment variables so hermes tools pick them up.
|
||||
# These can all be overridden per-environment via config fields instead
|
||||
# of requiring users to set shell env vars.
|
||||
if config.terminal_backend:
|
||||
os.environ["TERMINAL_ENV"] = config.terminal_backend
|
||||
os.environ["TERMINAL_TIMEOUT"] = str(config.terminal_timeout)
|
||||
os.environ["TERMINAL_LIFETIME_SECONDS"] = str(config.terminal_lifetime)
|
||||
print(
|
||||
f"🖥️ Terminal: backend={config.terminal_backend}, "
|
||||
f"timeout={config.terminal_timeout}s, lifetime={config.terminal_lifetime}s"
|
||||
)
|
||||
|
||||
# Resize the agent loop's thread pool for tool execution.
|
||||
# This must be large enough for the number of concurrent tasks
|
||||
# (e.g., 89 parallel TB2 eval tasks each need a thread for tool calls).
|
||||
from environments.agent_loop import resize_tool_pool
|
||||
resize_tool_pool(config.tool_pool_size)
|
||||
|
||||
# Set tool_parser on the ServerManager so ManagedServer uses it
|
||||
# for bidirectional tool call translation (raw text ↔ OpenAI tool_calls).
|
||||
if hasattr(self.server, 'tool_parser'):
|
||||
self.server.tool_parser = config.tool_call_parser
|
||||
print(f"🔧 Tool parser: {config.tool_call_parser}")
|
||||
|
||||
# Current group's resolved tools (set in collect_trajectories)
|
||||
self._current_group_tools: Optional[Tuple[List[Dict], Set[str]]] = None
|
||||
|
||||
# Tool error tracking for wandb logging
|
||||
self._tool_error_buffer: List[Dict[str, Any]] = []
|
||||
|
||||
# =========================================================================
|
||||
# Toolset resolution (per-group)
|
||||
# =========================================================================
|
||||
|
||||
def _resolve_tools_for_group(self) -> Tuple[List[Dict[str, Any]], Set[str]]:
|
||||
"""
|
||||
Resolve toolsets for a group. Called once in collect_trajectories(),
|
||||
then shared by all collect_trajectory() calls in the group.
|
||||
|
||||
If distribution is set, samples probabilistically.
|
||||
If enabled_toolsets is set, uses that explicit list.
|
||||
disabled_toolsets is applied as a filter on top.
|
||||
|
||||
Returns:
|
||||
(tool_schemas, valid_tool_names) tuple
|
||||
"""
|
||||
config = self.config
|
||||
|
||||
if config.distribution:
|
||||
group_toolsets = sample_toolsets_from_distribution(config.distribution)
|
||||
logger.info("Sampled toolsets from '%s': %s", config.distribution, group_toolsets)
|
||||
else:
|
||||
group_toolsets = config.enabled_toolsets # None means "all available"
|
||||
if group_toolsets is None:
|
||||
logger.warning(
|
||||
"enabled_toolsets is None -- loading ALL tools including messaging. "
|
||||
"Set explicit enabled_toolsets for RL training."
|
||||
)
|
||||
|
||||
tools = get_tool_definitions(
|
||||
enabled_toolsets=group_toolsets,
|
||||
disabled_toolsets=config.disabled_toolsets,
|
||||
quiet_mode=True,
|
||||
)
|
||||
|
||||
valid_names = {t["function"]["name"] for t in tools} if tools else set()
|
||||
logger.info("Resolved %d tools for group: %s", len(valid_names), sorted(valid_names))
|
||||
return tools, valid_names
|
||||
|
||||
# =========================================================================
|
||||
# Server mode detection
|
||||
# =========================================================================
|
||||
|
||||
def _use_managed_server(self) -> bool:
|
||||
"""
|
||||
Determine if we should use ManagedServer (Phase 2) or direct server (Phase 1).
|
||||
|
||||
Phase 2 (ManagedServer) is used when the server type is 'vllm' or 'sglang',
|
||||
which go through the /generate endpoint for exact token tracking.
|
||||
|
||||
Phase 1 (direct server) is used for 'openai' server type, which uses
|
||||
/v1/chat/completions with native tool call parsing.
|
||||
"""
|
||||
if not self.server.servers:
|
||||
return False
|
||||
|
||||
server = self.server.servers[0]
|
||||
# If the server is an OpenAI server (not VLLM/SGLang), use direct mode
|
||||
from atroposlib.envs.server_handling.openai_server import OpenAIServer
|
||||
return not isinstance(server, OpenAIServer)
|
||||
|
||||
# =========================================================================
|
||||
# Core Atropos integration
|
||||
# =========================================================================
|
||||
|
||||
async def collect_trajectories(
|
||||
self, item: Item
|
||||
) -> Tuple[
|
||||
Union[Optional[ScoredDataGroup], List[Optional[ScoredDataGroup]]],
|
||||
List[Item],
|
||||
]:
|
||||
"""
|
||||
Override collect_trajectories to resolve toolsets once per group,
|
||||
then delegate to the standard group-level collection.
|
||||
|
||||
The default BaseEnv.collect_trajectories() calls collect_trajectory()
|
||||
group_size times in parallel. We resolve tools once here and store
|
||||
them for all those calls to use.
|
||||
"""
|
||||
# Resolve toolsets for this group (shared by all rollouts in the group)
|
||||
self._current_group_tools = self._resolve_tools_for_group()
|
||||
|
||||
# Delegate to the default implementation which calls collect_trajectory()
|
||||
# group_size times via asyncio.gather
|
||||
return await super().collect_trajectories(item)
|
||||
|
||||
# =========================================================================
|
||||
# Wandb rollout display -- format trajectories nicely
|
||||
# =========================================================================
|
||||
|
||||
@staticmethod
|
||||
def _format_trajectory_for_display(messages: List[Dict[str, Any]]) -> str:
|
||||
"""
|
||||
Format a conversation's messages into a readable trajectory string
|
||||
for wandb rollout tables. Shows tool calls, tool results, and reasoning
|
||||
in a structured way instead of raw token decoding.
|
||||
"""
|
||||
parts = []
|
||||
for msg in messages:
|
||||
role = msg.get("role", "unknown")
|
||||
content = msg.get("content", "")
|
||||
|
||||
if role == "system":
|
||||
parts.append(f"[SYSTEM]\n{content}")
|
||||
|
||||
elif role == "user":
|
||||
parts.append(f"[USER]\n{content}")
|
||||
|
||||
elif role == "assistant":
|
||||
# Show reasoning if present
|
||||
reasoning = msg.get("reasoning_content", "")
|
||||
if reasoning:
|
||||
# Truncate long reasoning for display
|
||||
if len(reasoning) > 300:
|
||||
reasoning = reasoning[:300] + "..."
|
||||
parts.append(f"[ASSISTANT thinking]\n{reasoning}")
|
||||
|
||||
# Show content
|
||||
if content:
|
||||
parts.append(f"[ASSISTANT]\n{content}")
|
||||
|
||||
# Show tool calls
|
||||
tool_calls = msg.get("tool_calls", [])
|
||||
for tc in tool_calls:
|
||||
func = tc.get("function", {})
|
||||
name = func.get("name", "?")
|
||||
args = func.get("arguments", "{}")
|
||||
# Truncate long arguments for display
|
||||
if len(args) > 200:
|
||||
args = args[:200] + "..."
|
||||
parts.append(f"[TOOL CALL] {name}({args})")
|
||||
|
||||
elif role == "tool":
|
||||
tool_id = msg.get("tool_call_id", "")
|
||||
result = content
|
||||
# Truncate long tool results for display
|
||||
if len(result) > 500:
|
||||
result = result[:500] + "..."
|
||||
parts.append(f"[TOOL RESULT] {result}")
|
||||
|
||||
return "\n\n".join(parts)
|
||||
|
||||
async def add_rollouts_for_wandb(
|
||||
self,
|
||||
scored_data,
|
||||
item=None,
|
||||
):
|
||||
"""
|
||||
Override to show formatted trajectories with tool calls visible,
|
||||
instead of raw token decoding which loses all structure.
|
||||
"""
|
||||
num_keep = self.config.num_rollouts_per_group_for_logging
|
||||
if num_keep == -1:
|
||||
num_keep = self.config.group_size
|
||||
|
||||
group = []
|
||||
for i in range(min(num_keep, len(scored_data.get("scores", [])))):
|
||||
score = scored_data["scores"][i]
|
||||
|
||||
# Use messages if available for rich display
|
||||
messages = None
|
||||
if scored_data.get("messages") and i < len(scored_data["messages"]):
|
||||
messages = scored_data["messages"][i]
|
||||
|
||||
if messages:
|
||||
text = self._format_trajectory_for_display(messages)
|
||||
elif scored_data.get("tokens") and i < len(scored_data["tokens"]):
|
||||
text = self.tokenizer.decode(scored_data["tokens"][i])
|
||||
else:
|
||||
text = "(no data)"
|
||||
|
||||
group.append((text, score))
|
||||
|
||||
self.rollouts_for_wandb.append(group)
|
||||
if len(self.rollouts_for_wandb) > self.config.num_rollouts_to_keep:
|
||||
self.rollouts_for_wandb.pop(0)
|
||||
|
||||
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
|
||||
"""Log base metrics including tool errors to wandb."""
|
||||
if wandb_metrics is None:
|
||||
wandb_metrics = {}
|
||||
|
||||
# Log tool error stats
|
||||
if self._tool_error_buffer:
|
||||
wandb_metrics["train/tool_errors_count"] = len(self._tool_error_buffer)
|
||||
|
||||
# Log error details as a summary string (tables can crash wandb on tmp cleanup)
|
||||
error_summaries = []
|
||||
for err in self._tool_error_buffer:
|
||||
error_summaries.append(
|
||||
f"[turn {err['turn']}] {err['tool']}({err['args'][:80]}) -> {err['error'][:150]}"
|
||||
)
|
||||
wandb_metrics["train/tool_error_details"] = "\n".join(error_summaries)
|
||||
|
||||
# Also print to stdout for immediate visibility
|
||||
for summary in error_summaries:
|
||||
print(f" Tool Error: {summary}")
|
||||
|
||||
self._tool_error_buffer = []
|
||||
else:
|
||||
wandb_metrics["train/tool_errors_count"] = 0
|
||||
|
||||
await super().wandb_log(wandb_metrics)
|
||||
|
||||
async def collect_trajectory(
|
||||
self, item: Item
|
||||
) -> Tuple[Optional[Union[ScoredDataItem, Any]], List[Item]]:
|
||||
"""
|
||||
Run a single rollout: agent loop + reward computation.
|
||||
|
||||
This is called group_size times in parallel by collect_trajectories().
|
||||
Each call gets its own task_id for terminal/browser session isolation.
|
||||
"""
|
||||
task_id = str(uuid.uuid4())
|
||||
|
||||
# Get group-level tools (resolved once in collect_trajectories)
|
||||
if self._current_group_tools is None:
|
||||
# Fallback: resolve per-trajectory if called outside collect_trajectories
|
||||
tools, valid_names = self._resolve_tools_for_group()
|
||||
else:
|
||||
tools, valid_names = self._current_group_tools
|
||||
|
||||
# Build initial messages
|
||||
messages: List[Dict[str, Any]] = []
|
||||
if self.config.system_prompt:
|
||||
messages.append({"role": "system", "content": self.config.system_prompt})
|
||||
messages.append({"role": "user", "content": self.format_prompt(item)})
|
||||
|
||||
# Run the agent loop
|
||||
result: AgentResult
|
||||
if self._use_managed_server():
|
||||
# Phase 2: ManagedServer with ToolCallTranslator -- exact tokens + logprobs
|
||||
# tool_parser is set on ServerManager in __init__ and passed through
|
||||
# to ManagedServer, which uses ToolCallTranslator for bidirectional
|
||||
# translation between raw text and OpenAI tool_calls.
|
||||
try:
|
||||
async with self.server.managed_server(
|
||||
tokenizer=self.tokenizer,
|
||||
preserve_think_blocks=bool(self.config.thinking_mode),
|
||||
) as managed:
|
||||
agent = HermesAgentLoop(
|
||||
server=managed,
|
||||
tool_schemas=tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=self.config.max_agent_turns,
|
||||
task_id=task_id,
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
budget_config=self.config.build_budget_config(),
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
except NotImplementedError:
|
||||
# DummyManagedServer not allowed -- fall back to Phase 1
|
||||
logger.warning(
|
||||
"ManagedServer not available (OpenAI server?). "
|
||||
"Falling back to direct server mode."
|
||||
)
|
||||
agent = HermesAgentLoop(
|
||||
server=self.server,
|
||||
tool_schemas=tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=self.config.max_agent_turns,
|
||||
task_id=task_id,
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
budget_config=self.config.build_budget_config(),
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
else:
|
||||
# Phase 1: OpenAI server -- native tool_calls, placeholder tokens
|
||||
agent = HermesAgentLoop(
|
||||
server=self.server,
|
||||
tool_schemas=tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=self.config.max_agent_turns,
|
||||
task_id=task_id,
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
budget_config=self.config.build_budget_config(),
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
|
||||
# Skip reward computation if the agent loop produced no meaningful work
|
||||
# (e.g., API call failed on turn 1). No point spinning up a Modal sandbox
|
||||
# just to verify files that were never created.
|
||||
only_system_and_user = all(
|
||||
msg.get("role") in {"system", "user"} for msg in result.messages
|
||||
)
|
||||
if result.turns_used == 0 or only_system_and_user:
|
||||
logger.warning(
|
||||
"Agent loop produced no output (turns=%d, msgs=%d). Skipping reward.",
|
||||
result.turns_used, len(result.messages),
|
||||
)
|
||||
reward = 0.0
|
||||
else:
|
||||
# Compute reward using ToolContext (gives verifier full tool access)
|
||||
ctx = ToolContext(task_id)
|
||||
try:
|
||||
reward = await self.compute_reward(item, result, ctx)
|
||||
except Exception as e:
|
||||
logger.error("compute_reward failed: %s", e)
|
||||
reward = 0.0
|
||||
finally:
|
||||
ctx.cleanup()
|
||||
|
||||
# Track tool errors for wandb logging
|
||||
if result.tool_errors:
|
||||
for err in result.tool_errors:
|
||||
self._tool_error_buffer.append({
|
||||
"turn": err.turn,
|
||||
"tool": err.tool_name,
|
||||
"args": err.arguments[:150],
|
||||
"error": err.error[:300],
|
||||
"result": err.tool_result[:300],
|
||||
})
|
||||
|
||||
# Build ScoredDataItem from ManagedServer state
|
||||
# Phase 2: real tokens/masks/logprobs from SequenceNodes
|
||||
# Phase 1: placeholder tokens (still need a valid ScoredDataItem for the pipeline)
|
||||
nodes = (result.managed_state or {}).get("nodes", [])
|
||||
|
||||
if nodes:
|
||||
# Phase 2 (or DummyManagedServer): use actual node data
|
||||
node = nodes[-1] # Final sequence node = full trajectory
|
||||
scored_item: Dict[str, Any] = {
|
||||
"tokens": node.tokens,
|
||||
"masks": node.masked_tokens,
|
||||
"scores": reward,
|
||||
}
|
||||
|
||||
# Include logprobs if available (Phase 2)
|
||||
if hasattr(node, "logprobs") and node.logprobs:
|
||||
scored_item["advantages"] = None # Computed by trainer
|
||||
scored_item["ref_logprobs"] = None
|
||||
else:
|
||||
# Phase 1 with no managed state: create placeholder tokens
|
||||
# so the data pipeline doesn't break. These are NOT suitable
|
||||
# for training but allow process mode (SFT data gen) to work.
|
||||
# Tokenize the full conversation to get approximate tokens.
|
||||
full_text = "\n".join(
|
||||
msg.get("content", "") for msg in result.messages if msg.get("content")
|
||||
)
|
||||
if self.tokenizer:
|
||||
tokens = self.tokenizer.encode(full_text, add_special_tokens=True)
|
||||
else:
|
||||
tokens = list(range(min(len(full_text) // 4, 128)))
|
||||
|
||||
scored_item = {
|
||||
"tokens": tokens,
|
||||
"masks": [-100] + tokens[1:], # Mask first token as prompt
|
||||
"scores": reward,
|
||||
}
|
||||
|
||||
# Always include messages for wandb rollout display and data logging
|
||||
scored_item["messages"] = result.messages
|
||||
|
||||
return scored_item, []
|
||||
|
||||
# =========================================================================
|
||||
# Abstract methods -- subclasses must implement
|
||||
# =========================================================================
|
||||
|
||||
@abstractmethod
|
||||
async def setup(self):
|
||||
"""
|
||||
Load dataset, initialize state.
|
||||
|
||||
Called once when the environment starts. Typical implementation:
|
||||
self.dataset = load_dataset(self.config.dataset_name, split=self.config.dataset_split)
|
||||
self.iter = 0
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
async def get_next_item(self) -> Item:
|
||||
"""
|
||||
Return the next item from the dataset for rollout.
|
||||
|
||||
Called by the base env's main loop to get items for workers.
|
||||
Should cycle through the dataset.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def format_prompt(self, item: Item) -> str:
|
||||
"""
|
||||
Convert a dataset item into the user message for the agent.
|
||||
|
||||
Args:
|
||||
item: Dataset item (dict, tuple, etc.)
|
||||
|
||||
Returns:
|
||||
The prompt string to send to the agent
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
async def compute_reward(
|
||||
self, item: Item, result: AgentResult, ctx: ToolContext
|
||||
) -> float:
|
||||
"""
|
||||
Score the rollout. Has full access to:
|
||||
- item: the original dataset item (ground truth, test commands, etc.)
|
||||
- result: AgentResult with full messages, turn count, reasoning, etc.
|
||||
- ctx: ToolContext -- call ANY hermes-agent tool (terminal, file, web,
|
||||
browser, vision...) scoped to this rollout's sandbox. Nothing
|
||||
is off-limits.
|
||||
|
||||
Args:
|
||||
item: The dataset item that was rolled out
|
||||
result: The agent's rollout result
|
||||
ctx: ToolContext with full tool access for verification
|
||||
|
||||
Returns:
|
||||
Reward float (typically 0.0 to 1.0, but any float is valid)
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
async def evaluate(self, *args, **kwargs):
|
||||
"""
|
||||
Periodic evaluation. Called every steps_per_eval steps.
|
||||
|
||||
Typical implementation runs the agent on a held-out eval set
|
||||
and logs metrics via wandb/evaluate_log.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
@@ -1,34 +0,0 @@
|
||||
# SWE Environment -- Default Configuration
|
||||
#
|
||||
# SWE-bench style tasks with Modal sandboxes for cloud isolation.
|
||||
# Uses terminal + file + web toolsets.
|
||||
#
|
||||
# Usage:
|
||||
# python environments/hermes_swe_env/hermes_swe_env.py serve \
|
||||
# --config environments/hermes_swe_env/default.yaml
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal", "file", "web"]
|
||||
max_agent_turns: 30
|
||||
max_token_length: 4096
|
||||
group_size: 4
|
||||
terminal_backend: "modal"
|
||||
tool_call_parser: "hermes"
|
||||
tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
|
||||
dataset_name: "bigcode/humanevalpack"
|
||||
dataset_split: "test"
|
||||
prompt_field: "prompt"
|
||||
steps_per_eval: 50
|
||||
total_steps: 500
|
||||
use_wandb: true
|
||||
wandb_name: "hermes-swe"
|
||||
system_prompt: >
|
||||
You are a skilled software engineer. You have access to a terminal,
|
||||
file tools, and web search. Use these tools to complete the coding task.
|
||||
Write clean, working code and verify it runs correctly before finishing.
|
||||
|
||||
openai:
|
||||
base_url: "http://localhost:8000/v1"
|
||||
model_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
|
||||
server_type: "openai"
|
||||
api_key: ""
|
||||
@@ -1,229 +0,0 @@
|
||||
"""
|
||||
HermesSweEnv -- SWE-Bench Style Environment with Modal Sandboxes
|
||||
|
||||
A concrete environment for software engineering tasks where the model writes code
|
||||
and the reward function runs tests to verify correctness. Uses Modal terminal
|
||||
backend for cloud-isolated sandboxes per rollout.
|
||||
|
||||
The reward function uses ToolContext.terminal() to run test commands in the same
|
||||
Modal sandbox the model used during its agentic loop. All filesystem state from
|
||||
the model's tool calls is preserved for verification.
|
||||
|
||||
Usage:
|
||||
# Phase 1: OpenAI server type
|
||||
vllm serve YourModel --tool-parser hermes
|
||||
run-api
|
||||
python environments/hermes_swe_env.py serve \\
|
||||
--openai.base_url http://localhost:8000/v1 \\
|
||||
--openai.model_name YourModel \\
|
||||
--openai.server_type openai \\
|
||||
--env.dataset_name bigcode/humanevalpack \\
|
||||
--env.terminal_backend modal
|
||||
|
||||
# Phase 2: VLLM server type (full RL training)
|
||||
python environments/hermes_swe_env.py serve \\
|
||||
--openai.base_url http://localhost:8000/v1 \\
|
||||
--openai.model_name YourModel \\
|
||||
--openai.server_type vllm \\
|
||||
--env.tool_call_parser hermes \\
|
||||
--env.terminal_backend modal
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
# Ensure repo root is on sys.path for imports
|
||||
_repo_root = Path(__file__).resolve().parent.parent.parent
|
||||
if str(_repo_root) not in sys.path:
|
||||
sys.path.insert(0, str(_repo_root))
|
||||
|
||||
from datasets import load_dataset
|
||||
|
||||
from atroposlib.envs.base import ScoredDataGroup
|
||||
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||
from atroposlib.type_definitions import Item
|
||||
|
||||
from environments.agent_loop import AgentResult
|
||||
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||
from environments.tool_context import ToolContext
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HermesSweEnvConfig(HermesAgentEnvConfig):
|
||||
"""Config with defaults for SWE-bench style tasks."""
|
||||
|
||||
pass # Inherits all fields, overrides defaults in config_init
|
||||
|
||||
|
||||
class HermesSweEnv(HermesAgentBaseEnv):
|
||||
"""
|
||||
SWE-bench style environment using Modal terminal backend.
|
||||
|
||||
The model gets a coding task, uses terminal + file + web tools to solve it,
|
||||
and the reward function runs tests in the same Modal sandbox to verify.
|
||||
|
||||
Subclass this for specific SWE datasets (HumanEval, SWE-bench, etc.)
|
||||
and customize format_prompt() and compute_reward() as needed.
|
||||
"""
|
||||
|
||||
name = "hermes-swe"
|
||||
env_config_cls = HermesSweEnvConfig
|
||||
|
||||
@classmethod
|
||||
def config_init(cls) -> Tuple[HermesSweEnvConfig, List[APIServerConfig]]:
|
||||
"""
|
||||
Default configuration for the SWE environment.
|
||||
|
||||
Uses Modal terminal backend for cloud isolation and terminal + file + web toolsets.
|
||||
"""
|
||||
env_config = HermesSweEnvConfig(
|
||||
# Toolsets: terminal for running code, file for reading/writing, web for docs
|
||||
enabled_toolsets=["terminal", "file", "web"],
|
||||
disabled_toolsets=None,
|
||||
distribution=None,
|
||||
# Agent settings -- SWE tasks need more turns
|
||||
max_agent_turns=30,
|
||||
max_token_length=4096,
|
||||
agent_temperature=1.0,
|
||||
system_prompt=(
|
||||
"You are a skilled software engineer. You have access to a terminal, "
|
||||
"file tools, and web search. Use these tools to complete the coding task. "
|
||||
"Write clean, working code and verify it runs correctly before finishing."
|
||||
),
|
||||
# Modal backend for cloud-isolated sandboxes
|
||||
terminal_backend="modal",
|
||||
# Dataset -- override via CLI for your specific SWE dataset
|
||||
dataset_name="bigcode/humanevalpack",
|
||||
dataset_split="test",
|
||||
prompt_field="prompt",
|
||||
# Atropos settings
|
||||
group_size=4,
|
||||
tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
|
||||
tool_call_parser="hermes",
|
||||
steps_per_eval=50,
|
||||
total_steps=500,
|
||||
use_wandb=True,
|
||||
wandb_name="hermes-swe",
|
||||
)
|
||||
|
||||
server_configs = [
|
||||
APIServerConfig(
|
||||
base_url="http://localhost:8000/v1",
|
||||
model_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
|
||||
server_type="openai", # Phase 1; switch to "vllm" for Phase 2
|
||||
api_key="",
|
||||
)
|
||||
]
|
||||
|
||||
return env_config, server_configs
|
||||
|
||||
async def setup(self):
|
||||
"""Load the SWE dataset."""
|
||||
if self.config.dataset_name:
|
||||
self.dataset = load_dataset(
|
||||
self.config.dataset_name, split=self.config.dataset_split
|
||||
)
|
||||
else:
|
||||
# Placeholder if no dataset specified
|
||||
self.dataset = []
|
||||
self.iter = 0
|
||||
self.reward_buffer: List[float] = []
|
||||
|
||||
async def get_next_item(self) -> Dict[str, Any]:
|
||||
"""Cycle through the SWE dataset."""
|
||||
if not self.dataset:
|
||||
raise ValueError("No dataset loaded. Set dataset_name in config.")
|
||||
item = self.dataset[self.iter % len(self.dataset)]
|
||||
self.iter += 1
|
||||
return item
|
||||
|
||||
def format_prompt(self, item: Dict[str, Any]) -> str:
|
||||
"""
|
||||
Format the SWE task prompt.
|
||||
|
||||
Override this in subclasses for different dataset formats.
|
||||
Default assumes the dataset has a 'prompt' field and optionally a 'test' field.
|
||||
"""
|
||||
prompt = item.get(self.config.prompt_field, "")
|
||||
|
||||
# If the dataset has test information, include it in the prompt
|
||||
test_info = item.get("test", item.get("test_code", item.get("tests", "")))
|
||||
if test_info:
|
||||
prompt += f"\n\nTests to pass:\n{test_info}"
|
||||
|
||||
return prompt
|
||||
|
||||
async def compute_reward(
|
||||
self, item: Dict[str, Any], result: AgentResult, ctx: ToolContext
|
||||
) -> float:
|
||||
"""
|
||||
Score by running tests in the model's Modal sandbox.
|
||||
|
||||
Default implementation:
|
||||
- If the dataset item has a 'test' or 'test_code' field, run it
|
||||
- Check exit code: 0 = pass, non-zero = fail
|
||||
- Partial credit for file creation
|
||||
|
||||
Override this in subclasses for more sophisticated reward logic.
|
||||
"""
|
||||
# Find the test command from the dataset item
|
||||
test_code = item.get("test", item.get("test_code", item.get("tests", "")))
|
||||
|
||||
if test_code:
|
||||
# Run the test in the model's sandbox
|
||||
test_result = ctx.terminal(
|
||||
f'cd /workspace && python3 -c "{test_code}"', timeout=60
|
||||
)
|
||||
|
||||
if test_result["exit_code"] == 0:
|
||||
self.reward_buffer.append(1.0)
|
||||
return 1.0
|
||||
|
||||
# Partial credit: check if the model created any Python files
|
||||
file_check = ctx.terminal("find /workspace -name '*.py' -newer /tmp/.start_marker 2>/dev/null | head -5")
|
||||
if file_check["exit_code"] == 0 and file_check.get("output", "").strip():
|
||||
self.reward_buffer.append(0.1)
|
||||
return 0.1
|
||||
|
||||
self.reward_buffer.append(0.0)
|
||||
return 0.0
|
||||
|
||||
async def evaluate(self, *args, **kwargs):
|
||||
"""
|
||||
Run evaluation on a held-out set.
|
||||
|
||||
Override for dataset-specific evaluation logic.
|
||||
"""
|
||||
start_time = time.time()
|
||||
end_time = time.time()
|
||||
|
||||
eval_metrics = {"eval/placeholder": 0.0}
|
||||
await self.evaluate_log(
|
||||
metrics=eval_metrics,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
|
||||
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
|
||||
"""Log SWE-specific metrics."""
|
||||
if wandb_metrics is None:
|
||||
wandb_metrics = {}
|
||||
|
||||
if self.reward_buffer:
|
||||
wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / len(
|
||||
self.reward_buffer
|
||||
)
|
||||
wandb_metrics["train/pass_rate"] = sum(
|
||||
1 for r in self.reward_buffer if r == 1.0
|
||||
) / len(self.reward_buffer)
|
||||
self.reward_buffer = []
|
||||
|
||||
await super().wandb_log(wandb_metrics)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
HermesSweEnv.cli()
|
||||
@@ -1,35 +0,0 @@
|
||||
"""
|
||||
Monkey patches for making hermes-agent tools work inside async frameworks (Atropos).
|
||||
|
||||
Problem:
|
||||
Some tools use asyncio.run() internally (e.g., Modal backend via SWE-ReX,
|
||||
web_extract). This crashes when called from inside Atropos's event loop because
|
||||
asyncio.run() can't be nested.
|
||||
|
||||
Solution:
|
||||
The Modal environment (tools/environments/modal.py) now uses a dedicated
|
||||
_AsyncWorker thread internally, making it safe for both CLI and Atropos use.
|
||||
No monkey-patching is required.
|
||||
|
||||
This module is kept for backward compatibility. apply_patches() is a no-op.
|
||||
|
||||
Usage:
|
||||
Call apply_patches() once at import time (done automatically by hermes_base_env.py).
|
||||
This is idempotent and safe to call multiple times.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_patches_applied = False
|
||||
|
||||
|
||||
def apply_patches():
|
||||
"""Apply all monkey patches needed for Atropos compatibility."""
|
||||
global _patches_applied
|
||||
if _patches_applied:
|
||||
return
|
||||
|
||||
logger.debug("apply_patches() called; no patches needed (async safety is built-in)")
|
||||
_patches_applied = True
|
||||
@@ -1,34 +0,0 @@
|
||||
# Terminal Test Environment -- Default Configuration
|
||||
#
|
||||
# Simple file-creation tasks for validating the full Atropos + hermes-agent stack.
|
||||
# Uses Modal terminal backend and OpenRouter (Claude) for inference.
|
||||
# API keys loaded from ~/hermes-agent/.env
|
||||
#
|
||||
# Usage:
|
||||
# run-api
|
||||
# python environments/terminal_test_env/terminal_test_env.py serve \
|
||||
# --config environments/terminal_test_env/default.yaml
|
||||
|
||||
env:
|
||||
enabled_toolsets: ["terminal", "file"]
|
||||
max_agent_turns: 10
|
||||
max_token_length: 2048
|
||||
group_size: 3
|
||||
total_steps: 3
|
||||
steps_per_eval: 3
|
||||
terminal_backend: "modal"
|
||||
tool_call_parser: "hermes"
|
||||
tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
|
||||
ensure_scores_are_not_same: false
|
||||
use_wandb: false
|
||||
system_prompt: >
|
||||
You are a helpful assistant with access to a terminal and file tools.
|
||||
Complete the user's request by using the available tools.
|
||||
Be precise and follow instructions exactly.
|
||||
|
||||
openai:
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
model_name: "anthropic/claude-opus-4.6"
|
||||
server_type: "openai"
|
||||
health_check: false
|
||||
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||
@@ -1,292 +0,0 @@
|
||||
"""
|
||||
TerminalTestEnv -- Simple Test Environment for Validating the Stack
|
||||
|
||||
A self-contained environment with inline tasks (no external dataset needed).
|
||||
Each task asks the model to create a file at a known path with specific content.
|
||||
The reward verifier cats the file and checks if the content matches.
|
||||
|
||||
Enables only terminal + file toolsets. Uses Modal terminal backend with
|
||||
OpenRouter (Claude) by default.
|
||||
|
||||
Training tasks (3):
|
||||
1. Create ~/greeting.txt with "Hello from Hermes Agent"
|
||||
2. Create ~/count.txt with numbers 1-5, one per line
|
||||
3. Create ~/answer.txt with the result of 123 + 456
|
||||
|
||||
Eval task (1):
|
||||
1. Create ~/result.txt with the result of 6 * 7
|
||||
|
||||
Usage:
|
||||
# Start Atropos API server
|
||||
run-api
|
||||
|
||||
# Run environment (uses OpenRouter + Modal by default)
|
||||
python environments/terminal_test_env.py serve
|
||||
|
||||
# Process mode (no run-api needed, saves to JSONL)
|
||||
python environments/terminal_test_env.py process \\
|
||||
--env.data_path_to_save_groups terminal_test_output.jsonl
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
# Ensure repo root is on sys.path for imports
|
||||
_repo_root = Path(__file__).resolve().parent.parent.parent
|
||||
if str(_repo_root) not in sys.path:
|
||||
sys.path.insert(0, str(_repo_root))
|
||||
|
||||
from atroposlib.envs.base import ScoredDataGroup
|
||||
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||
from atroposlib.type_definitions import Item
|
||||
|
||||
from environments.agent_loop import AgentResult
|
||||
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||
from environments.tool_context import ToolContext
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Inline task definitions -- no external dataset needed
|
||||
# =============================================================================
|
||||
|
||||
TRAIN_TASKS = [
|
||||
{
|
||||
"prompt": "Create a file at ~/greeting.txt containing exactly the text: Hello from Hermes Agent",
|
||||
"verify_path": "~/greeting.txt",
|
||||
"expected_content": "Hello from Hermes Agent",
|
||||
},
|
||||
{
|
||||
"prompt": "Create a file at ~/count.txt containing the numbers 1 through 5, one per line",
|
||||
"verify_path": "~/count.txt",
|
||||
"expected_content": "1\n2\n3\n4\n5",
|
||||
},
|
||||
{
|
||||
"prompt": "Create a file at ~/answer.txt containing the result of 123 + 456",
|
||||
"verify_path": "~/answer.txt",
|
||||
"expected_content": "579",
|
||||
},
|
||||
]
|
||||
|
||||
EVAL_TASKS = [
|
||||
{
|
||||
"prompt": "Create a file at ~/result.txt containing the result of 6 * 7",
|
||||
"verify_path": "~/result.txt",
|
||||
"expected_content": "42",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
class TerminalTestEnvConfig(HermesAgentEnvConfig):
|
||||
"""Config with defaults suitable for terminal testing."""
|
||||
|
||||
pass # Inherits all fields, overrides defaults in config_init
|
||||
|
||||
|
||||
class TerminalTestEnv(HermesAgentBaseEnv):
|
||||
"""
|
||||
Simple test environment with inline file-creation tasks.
|
||||
|
||||
All tasks follow the same pattern: "create a file at ~/X.txt with content Y".
|
||||
The verifier runs `cat ~/X.txt` in the rollout's terminal and checks the output
|
||||
against the expected string. Same verifier logic for all tasks.
|
||||
|
||||
This environment is designed to validate the full stack end-to-end:
|
||||
- Agent loop executes tool calls (terminal/file)
|
||||
- ToolContext provides terminal access to the reward function
|
||||
- Reward function verifies file content via cat
|
||||
- Scored data flows through the Atropos pipeline
|
||||
"""
|
||||
|
||||
name = "terminal-test"
|
||||
env_config_cls = TerminalTestEnvConfig
|
||||
|
||||
@classmethod
|
||||
def config_init(cls) -> Tuple[TerminalTestEnvConfig, List[APIServerConfig]]:
|
||||
"""
|
||||
Default configuration for the terminal test environment.
|
||||
|
||||
Uses Modal terminal backend for cloud isolation and OpenRouter with
|
||||
Claude for inference. API keys loaded from ~/hermes-agent/.env.
|
||||
"""
|
||||
env_config = TerminalTestEnvConfig(
|
||||
# Terminal + file tools only
|
||||
enabled_toolsets=["terminal", "file"],
|
||||
disabled_toolsets=None,
|
||||
distribution=None,
|
||||
# Agent settings
|
||||
max_agent_turns=10, # Simple tasks, don't need many turns
|
||||
max_token_length=16000,
|
||||
agent_temperature=1.0,
|
||||
system_prompt=(
|
||||
"You are a helpful assistant with access to a terminal and file tools. "
|
||||
"Complete the user's request by using the available tools. "
|
||||
"Be precise and follow instructions exactly."
|
||||
),
|
||||
# Modal terminal backend for cloud-isolated sandboxes per rollout
|
||||
terminal_backend="modal",
|
||||
# Atropos settings
|
||||
group_size=3, # 3 rollouts per group
|
||||
tokenizer_name="NousResearch/q-30b-t-h45-e1",
|
||||
tool_call_parser="hermes",
|
||||
steps_per_eval=3, # Eval after all 3 steps
|
||||
total_steps=3, # 3 groups total (1 group per step)
|
||||
use_wandb=True,
|
||||
wandb_name="terminal-test",
|
||||
ensure_scores_are_not_same=False, # Allow all-same scores for simple tasks
|
||||
# No external dataset
|
||||
dataset_name=None,
|
||||
)
|
||||
|
||||
# OpenRouter with Claude -- API key loaded from .env (OPENROUTER_API_KEY)
|
||||
server_configs = [
|
||||
APIServerConfig(
|
||||
base_url="https://openrouter.ai/api/v1",
|
||||
model_name="anthropic/claude-opus-4.6",
|
||||
server_type="openai",
|
||||
api_key=os.getenv("OPENROUTER_API_KEY", ""),
|
||||
health_check=False, # OpenRouter doesn't have a /health endpoint
|
||||
)
|
||||
]
|
||||
|
||||
return env_config, server_configs
|
||||
|
||||
async def setup(self):
|
||||
"""Initialize inline task lists."""
|
||||
self.train_tasks = list(TRAIN_TASKS)
|
||||
self.eval_tasks = list(EVAL_TASKS)
|
||||
self.iter = 0
|
||||
# Track reward stats for wandb logging
|
||||
self.reward_buffer: List[float] = []
|
||||
|
||||
async def get_next_item(self) -> Dict[str, str]:
|
||||
"""Cycle through training tasks."""
|
||||
item = self.train_tasks[self.iter % len(self.train_tasks)]
|
||||
self.iter += 1
|
||||
return item
|
||||
|
||||
def format_prompt(self, item: Dict[str, str]) -> str:
|
||||
"""The prompt is directly in the task item."""
|
||||
return item["prompt"]
|
||||
|
||||
async def compute_reward(
|
||||
self, item: Dict[str, str], result: AgentResult, ctx: ToolContext
|
||||
) -> float:
|
||||
"""
|
||||
Verify by cat-ing the expected file path and checking content matches.
|
||||
Same verifier for all tasks -- they all write a file at a known path.
|
||||
|
||||
Scoring:
|
||||
1.0 = exact match
|
||||
0.5 = expected content is present but has extra stuff
|
||||
0.0 = file doesn't exist or content doesn't match
|
||||
"""
|
||||
verify_result = ctx.terminal(f"cat {item['verify_path']}")
|
||||
|
||||
# File doesn't exist or can't be read
|
||||
if verify_result["exit_code"] != 0:
|
||||
self.reward_buffer.append(0.0)
|
||||
return 0.0
|
||||
|
||||
actual = verify_result.get("output", "").strip()
|
||||
expected = item["expected_content"].strip()
|
||||
|
||||
# Exact match
|
||||
if actual == expected:
|
||||
self.reward_buffer.append(1.0)
|
||||
return 1.0
|
||||
|
||||
# Partial credit: expected content is present but has extra stuff
|
||||
if expected in actual:
|
||||
self.reward_buffer.append(0.5)
|
||||
return 0.5
|
||||
|
||||
self.reward_buffer.append(0.0)
|
||||
return 0.0
|
||||
|
||||
async def evaluate(self, *args, **kwargs):
|
||||
"""
|
||||
Run eval tasks using the agent loop and verify results.
|
||||
Logs accuracy metrics.
|
||||
"""
|
||||
start_time = time.time()
|
||||
correct = 0
|
||||
total = len(self.eval_tasks)
|
||||
samples = []
|
||||
|
||||
for eval_item in self.eval_tasks:
|
||||
try:
|
||||
# For eval, we do a simple single-turn completion (not full agent loop)
|
||||
# to keep eval fast. The agent loop is tested via training.
|
||||
completion = await self.server.chat_completion(
|
||||
messages=[
|
||||
{"role": "system", "content": self.config.system_prompt or ""},
|
||||
{"role": "user", "content": eval_item["prompt"]},
|
||||
],
|
||||
n=1,
|
||||
max_tokens=self.config.max_token_length,
|
||||
temperature=0.0,
|
||||
split="eval",
|
||||
)
|
||||
|
||||
response_content = (
|
||||
completion.choices[0].message.content if completion.choices else ""
|
||||
)
|
||||
|
||||
samples.append(
|
||||
{
|
||||
"prompt": eval_item["prompt"],
|
||||
"response": response_content,
|
||||
"expected": eval_item["expected_content"],
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Eval failed for item: %s", e)
|
||||
samples.append(
|
||||
{
|
||||
"prompt": eval_item["prompt"],
|
||||
"response": f"ERROR: {e}",
|
||||
"expected": eval_item["expected_content"],
|
||||
}
|
||||
)
|
||||
|
||||
end_time = time.time()
|
||||
|
||||
eval_metrics = {
|
||||
"eval/num_samples": total,
|
||||
}
|
||||
|
||||
await self.evaluate_log(
|
||||
metrics=eval_metrics,
|
||||
samples=samples,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
|
||||
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
|
||||
"""Log training metrics including reward stats and accuracy."""
|
||||
if wandb_metrics is None:
|
||||
wandb_metrics = {}
|
||||
|
||||
if self.reward_buffer:
|
||||
total = len(self.reward_buffer)
|
||||
correct = sum(1 for r in self.reward_buffer if r == 1.0)
|
||||
partial = sum(1 for r in self.reward_buffer if r == 0.5)
|
||||
|
||||
wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / total
|
||||
wandb_metrics["train/accuracy"] = correct / total
|
||||
wandb_metrics["train/partial_match_rate"] = partial / total
|
||||
wandb_metrics["train/total_rollouts"] = total
|
||||
self.reward_buffer = []
|
||||
|
||||
await super().wandb_log(wandb_metrics)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
TerminalTestEnv.cli()
|
||||
@@ -1,120 +0,0 @@
|
||||
"""
|
||||
Tool Call Parser Registry
|
||||
|
||||
Client-side parsers that extract structured tool_calls from raw model output text.
|
||||
Used in Phase 2 (VLLM server type) where ManagedServer's /generate endpoint returns
|
||||
raw text without tool call parsing.
|
||||
|
||||
Each parser is a standalone reimplementation of the corresponding VLLM parser's
|
||||
non-streaming extract_tool_calls() logic. No VLLM dependency -- only standard library
|
||||
(re, json, uuid) and openai types.
|
||||
|
||||
Usage:
|
||||
from environments.tool_call_parsers import get_parser
|
||||
|
||||
parser = get_parser("hermes")
|
||||
content, tool_calls = parser.parse(raw_model_output)
|
||||
# content = text with tool call markup stripped
|
||||
# tool_calls = list of ChatCompletionMessageToolCall objects, or None
|
||||
"""
|
||||
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, List, Optional, Tuple, Type
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Type alias for parser return value
|
||||
ParseResult = Tuple[Optional[str], Optional[List[ChatCompletionMessageToolCall]]]
|
||||
|
||||
|
||||
class ToolCallParser(ABC):
|
||||
"""
|
||||
Base class for tool call parsers.
|
||||
|
||||
Each parser knows how to extract structured tool_calls from a specific
|
||||
model family's raw output text format.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def parse(self, text: str) -> ParseResult:
|
||||
"""
|
||||
Parse raw model output text for tool calls.
|
||||
|
||||
Args:
|
||||
text: Raw decoded text from the model's completion
|
||||
|
||||
Returns:
|
||||
Tuple of (content, tool_calls) where:
|
||||
- content: text with tool call markup stripped (the message 'content' field),
|
||||
or None if the entire output was tool calls
|
||||
- tool_calls: list of ChatCompletionMessageToolCall objects,
|
||||
or None if no tool calls were found
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
# Global parser registry: name -> parser class
|
||||
PARSER_REGISTRY: Dict[str, Type[ToolCallParser]] = {}
|
||||
|
||||
|
||||
def register_parser(name: str):
|
||||
"""
|
||||
Decorator to register a parser class under a given name.
|
||||
|
||||
Usage:
|
||||
@register_parser("hermes")
|
||||
class HermesToolCallParser(ToolCallParser):
|
||||
...
|
||||
"""
|
||||
|
||||
def decorator(cls: Type[ToolCallParser]) -> Type[ToolCallParser]:
|
||||
PARSER_REGISTRY[name] = cls
|
||||
return cls
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
def get_parser(name: str) -> ToolCallParser:
|
||||
"""
|
||||
Get a parser instance by name.
|
||||
|
||||
Args:
|
||||
name: Parser name (e.g., "hermes", "mistral", "llama3_json")
|
||||
|
||||
Returns:
|
||||
Instantiated parser
|
||||
|
||||
Raises:
|
||||
KeyError: If parser name is not found in registry
|
||||
"""
|
||||
if name not in PARSER_REGISTRY:
|
||||
available = sorted(PARSER_REGISTRY.keys())
|
||||
raise KeyError(
|
||||
f"Tool call parser '{name}' not found. Available parsers: {available}"
|
||||
)
|
||||
return PARSER_REGISTRY[name]()
|
||||
|
||||
|
||||
def list_parsers() -> List[str]:
|
||||
"""Return sorted list of registered parser names."""
|
||||
return sorted(PARSER_REGISTRY.keys())
|
||||
|
||||
|
||||
# Import all parser modules to trigger registration via @register_parser decorators
|
||||
# Each module registers itself when imported
|
||||
from environments.tool_call_parsers.hermes_parser import HermesToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.longcat_parser import LongcatToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.mistral_parser import MistralToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.llama_parser import LlamaToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.qwen_parser import QwenToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.deepseek_v3_parser import DeepSeekV3ToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.deepseek_v3_1_parser import DeepSeekV31ToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.kimi_k2_parser import KimiK2ToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.glm47_parser import Glm47ToolCallParser # noqa: E402, F401
|
||||
from environments.tool_call_parsers.qwen3_coder_parser import Qwen3CoderToolCallParser # noqa: E402, F401
|
||||
@@ -1,72 +0,0 @@
|
||||
"""
|
||||
DeepSeek V3.1 tool call parser.
|
||||
|
||||
Similar to V3 but with a slightly different format:
|
||||
<|tool▁call▁begin|>function_name<|tool▁sep|>arguments<|tool▁call▁end|>
|
||||
|
||||
Note: V3 has type+name before the separator, V3.1 has name before and args after.
|
||||
|
||||
Based on VLLM's DeepSeekV31ToolParser.extract_tool_calls()
|
||||
"""
|
||||
|
||||
import re
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
|
||||
@register_parser("deepseek_v3_1")
|
||||
@register_parser("deepseek_v31")
|
||||
class DeepSeekV31ToolCallParser(ToolCallParser):
|
||||
"""
|
||||
Parser for DeepSeek V3.1 tool calls.
|
||||
|
||||
Slightly different regex than V3: function_name comes before the separator,
|
||||
arguments come after (no type field, no json code block wrapper).
|
||||
"""
|
||||
|
||||
START_TOKEN = "<|tool▁calls▁begin|>"
|
||||
|
||||
# Regex captures: function_name, function_arguments
|
||||
PATTERN = re.compile(
|
||||
r"<|tool▁call▁begin|>(?P<function_name>.*?)<|tool▁sep|>(?P<function_arguments>.*?)<|tool▁call▁end|>",
|
||||
re.DOTALL,
|
||||
)
|
||||
|
||||
def parse(self, text: str) -> ParseResult:
|
||||
if self.START_TOKEN not in text:
|
||||
return text, None
|
||||
|
||||
try:
|
||||
matches = self.PATTERN.findall(text)
|
||||
if not matches:
|
||||
return text, None
|
||||
|
||||
tool_calls: List[ChatCompletionMessageToolCall] = []
|
||||
for match in matches:
|
||||
func_name, func_args = match
|
||||
tool_calls.append(
|
||||
ChatCompletionMessageToolCall(
|
||||
id=f"call_{uuid.uuid4().hex[:8]}",
|
||||
type="function",
|
||||
function=Function(
|
||||
name=func_name.strip(),
|
||||
arguments=func_args.strip(),
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
if not tool_calls:
|
||||
return text, None
|
||||
|
||||
content = text[: text.find(self.START_TOKEN)].strip()
|
||||
return content if content else None, tool_calls
|
||||
|
||||
except Exception:
|
||||
return text, None
|
||||
@@ -1,89 +0,0 @@
|
||||
"""
|
||||
DeepSeek V3 tool call parser.
|
||||
|
||||
Format uses special unicode tokens:
|
||||
<|tool▁calls▁begin|>
|
||||
<|tool▁call▁begin|>type<|tool▁sep|>function_name
|
||||
```json
|
||||
{"arg": "value"}
|
||||
```
|
||||
<|tool▁call▁end|>
|
||||
<|tool▁calls▁end|>
|
||||
|
||||
Fixes Issue #989: Support for multiple simultaneous tool calls.
|
||||
"""
|
||||
|
||||
import re
|
||||
import uuid
|
||||
import logging
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@register_parser("deepseek_v3")
|
||||
class DeepSeekV3ToolCallParser(ToolCallParser):
|
||||
"""
|
||||
Parser for DeepSeek V3 tool calls.
|
||||
|
||||
Uses special unicode tokens with fullwidth angle brackets and block elements.
|
||||
Extracts type, function name, and JSON arguments from the structured format.
|
||||
Ensures all tool calls are captured when the model executes multiple actions.
|
||||
"""
|
||||
|
||||
START_TOKEN = "<|tool▁calls▁begin|>"
|
||||
|
||||
# Updated PATTERN: Using \s* instead of literal \n for increased robustness
|
||||
# against variations in model formatting (Issue #989).
|
||||
PATTERN = re.compile(
|
||||
r"<|tool▁call▁begin|>(?P<type>.*?)<|tool▁sep|>(?P<function_name>.*?)\s*```json\s*(?P<function_arguments>.*?)\s*```\s*<|tool▁call▁end|>",
|
||||
re.DOTALL,
|
||||
)
|
||||
|
||||
def parse(self, text: str) -> ParseResult:
|
||||
"""
|
||||
Parses the input text and extracts all available tool calls.
|
||||
"""
|
||||
if self.START_TOKEN not in text:
|
||||
return text, None
|
||||
|
||||
try:
|
||||
# Using finditer to capture ALL tool calls in the sequence
|
||||
matches = list(self.PATTERN.finditer(text))
|
||||
if not matches:
|
||||
return text, None
|
||||
|
||||
tool_calls: List[ChatCompletionMessageToolCall] = []
|
||||
|
||||
for match in matches:
|
||||
func_name = match.group("function_name").strip()
|
||||
func_args = match.group("function_arguments").strip()
|
||||
|
||||
tool_calls.append(
|
||||
ChatCompletionMessageToolCall(
|
||||
id=f"call_{uuid.uuid4().hex[:8]}",
|
||||
type="function",
|
||||
function=Function(
|
||||
name=func_name,
|
||||
arguments=func_args,
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
if tool_calls:
|
||||
# Content is text before the first tool call block
|
||||
content_index = text.find(self.START_TOKEN)
|
||||
content = text[:content_index].strip()
|
||||
return content if content else None, tool_calls
|
||||
|
||||
return text, None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing DeepSeek V3 tool calls: {e}")
|
||||
return text, None
|
||||
@@ -1,109 +0,0 @@
|
||||
"""
|
||||
GLM 4.5 (GLM-4-MoE) tool call parser.
|
||||
|
||||
Format uses custom arg_key/arg_value tags rather than standard JSON:
|
||||
<tool_call>function_name
|
||||
<arg_key>param1</arg_key><arg_value>value1</arg_value>
|
||||
<arg_key>param2</arg_key><arg_value>value2</arg_value>
|
||||
</tool_call>
|
||||
|
||||
Values are deserialized using json.loads -> ast.literal_eval -> raw string fallback.
|
||||
|
||||
Based on VLLM's Glm4MoeModelToolParser.extract_tool_calls()
|
||||
"""
|
||||
|
||||
import ast
|
||||
import json
|
||||
import re
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
|
||||
def _deserialize_value(value: str) -> Any:
|
||||
"""
|
||||
Try to deserialize a string value to its native Python type.
|
||||
Attempts json.loads, then ast.literal_eval, then returns raw string.
|
||||
"""
|
||||
try:
|
||||
return json.loads(value)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
pass
|
||||
|
||||
try:
|
||||
return ast.literal_eval(value)
|
||||
except (ValueError, SyntaxError, TypeError):
|
||||
pass
|
||||
|
||||
return value
|
||||
|
||||
|
||||
@register_parser("glm45")
|
||||
class Glm45ToolCallParser(ToolCallParser):
|
||||
"""
|
||||
Parser for GLM 4.5 (GLM-4-MoE) tool calls.
|
||||
|
||||
Uses <tool_call>...</tool_call> tags with <arg_key>/<arg_value> pairs
|
||||
instead of standard JSON arguments.
|
||||
"""
|
||||
|
||||
FUNC_CALL_REGEX = re.compile(r"<tool_call>.*?</tool_call>", re.DOTALL)
|
||||
FUNC_DETAIL_REGEX = re.compile(r"<tool_call>([^\n]*)\n(.*)</tool_call>", re.DOTALL)
|
||||
FUNC_ARG_REGEX = re.compile(
|
||||
r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>", re.DOTALL
|
||||
)
|
||||
|
||||
START_TOKEN = "<tool_call>"
|
||||
|
||||
def parse(self, text: str) -> ParseResult:
|
||||
if self.START_TOKEN not in text:
|
||||
return text, None
|
||||
|
||||
try:
|
||||
matched_calls = self.FUNC_CALL_REGEX.findall(text)
|
||||
if not matched_calls:
|
||||
return text, None
|
||||
|
||||
tool_calls: List[ChatCompletionMessageToolCall] = []
|
||||
|
||||
for match in matched_calls:
|
||||
detail = self.FUNC_DETAIL_REGEX.search(match)
|
||||
if not detail:
|
||||
continue
|
||||
|
||||
func_name = detail.group(1).strip()
|
||||
func_args_raw = detail.group(2)
|
||||
|
||||
# Parse arg_key/arg_value pairs
|
||||
pairs = self.FUNC_ARG_REGEX.findall(func_args_raw) if func_args_raw else []
|
||||
arg_dict: Dict[str, Any] = {}
|
||||
for key, value in pairs:
|
||||
arg_key = key.strip()
|
||||
arg_val = _deserialize_value(value.strip())
|
||||
arg_dict[arg_key] = arg_val
|
||||
|
||||
tool_calls.append(
|
||||
ChatCompletionMessageToolCall(
|
||||
id=f"call_{uuid.uuid4().hex[:8]}",
|
||||
type="function",
|
||||
function=Function(
|
||||
name=func_name,
|
||||
arguments=json.dumps(arg_dict, ensure_ascii=False),
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
if not tool_calls:
|
||||
return text, None
|
||||
|
||||
content = text[: text.find(self.START_TOKEN)].strip()
|
||||
return content if content else None, tool_calls
|
||||
|
||||
except Exception:
|
||||
return text, None
|
||||
@@ -1,35 +0,0 @@
|
||||
"""
|
||||
GLM 4.7 tool call parser.
|
||||
|
||||
Same as GLM 4.5 but with slightly different regex patterns.
|
||||
The tool_call tags may wrap differently and arg parsing handles
|
||||
newlines between key/value pairs.
|
||||
|
||||
Based on VLLM's Glm47MoeModelToolParser (extends Glm4MoeModelToolParser).
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, register_parser
|
||||
from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser
|
||||
|
||||
|
||||
@register_parser("glm47")
|
||||
class Glm47ToolCallParser(Glm45ToolCallParser):
|
||||
"""
|
||||
Parser for GLM 4.7 tool calls.
|
||||
Extends GLM 4.5 with updated regex patterns.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
# GLM 4.7 uses a slightly different detail regex that includes
|
||||
# the <tool_call> wrapper and optional arg_key content
|
||||
self.FUNC_DETAIL_REGEX = re.compile(
|
||||
r"<tool_call>(.*?)(<arg_key>.*?)?</tool_call>", re.DOTALL
|
||||
)
|
||||
# GLM 4.7 handles newlines between arg_key and arg_value tags
|
||||
self.FUNC_ARG_REGEX = re.compile(
|
||||
r"<arg_key>(.*?)</arg_key>(?:\\n|\s)*<arg_value>(.*?)</arg_value>",
|
||||
re.DOTALL,
|
||||
)
|
||||
@@ -1,75 +0,0 @@
|
||||
"""
|
||||
Hermes tool call parser.
|
||||
|
||||
Format: <tool_call>{"name": "func", "arguments": {...}}</tool_call>
|
||||
Based on VLLM's Hermes2ProToolParser.extract_tool_calls()
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import uuid
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
|
||||
@register_parser("hermes")
|
||||
class HermesToolCallParser(ToolCallParser):
|
||||
"""
|
||||
Parser for Hermes-format tool calls.
|
||||
|
||||
Matches <tool_call>...</tool_call> tags containing JSON with "name" and "arguments".
|
||||
Also handles unclosed <tool_call> at end-of-string (truncated generation).
|
||||
"""
|
||||
|
||||
# Matches both closed and unclosed tool_call tags
|
||||
PATTERN = re.compile(
|
||||
r"<tool_call>\s*(.*?)\s*</tool_call>|<tool_call>\s*(.*)", re.DOTALL
|
||||
)
|
||||
|
||||
def parse(self, text: str) -> ParseResult:
|
||||
if "<tool_call>" not in text:
|
||||
return text, None
|
||||
|
||||
try:
|
||||
matches = self.PATTERN.findall(text)
|
||||
if not matches:
|
||||
return text, None
|
||||
|
||||
tool_calls: List[ChatCompletionMessageToolCall] = []
|
||||
for match in matches:
|
||||
# match is a tuple: (closed_content, unclosed_content)
|
||||
raw_json = match[0] if match[0] else match[1]
|
||||
if not raw_json.strip():
|
||||
continue
|
||||
|
||||
tc_data = json.loads(raw_json)
|
||||
if "name" not in tc_data:
|
||||
continue
|
||||
tool_calls.append(
|
||||
ChatCompletionMessageToolCall(
|
||||
id=f"call_{uuid.uuid4().hex[:8]}",
|
||||
type="function",
|
||||
function=Function(
|
||||
name=tc_data["name"],
|
||||
arguments=json.dumps(
|
||||
tc_data.get("arguments", {}), ensure_ascii=False
|
||||
),
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
if not tool_calls:
|
||||
return text, None
|
||||
|
||||
# Content is everything before the first <tool_call> tag
|
||||
content = text[: text.find("<tool_call>")].strip()
|
||||
return content if content else None, tool_calls
|
||||
|
||||
except Exception:
|
||||
return text, None
|
||||
@@ -1,93 +0,0 @@
|
||||
"""
|
||||
Kimi K2 tool call parser.
|
||||
|
||||
Format:
|
||||
<|tool_calls_section_begin|>
|
||||
<|tool_call_begin|>function_id:0<|tool_call_argument_begin|>{"arg": "val"}<|tool_call_end|>
|
||||
<|tool_calls_section_end|>
|
||||
|
||||
The function_id format is typically "functions.func_name:index" or "func_name:index".
|
||||
|
||||
Based on VLLM's KimiK2ToolParser.extract_tool_calls()
|
||||
"""
|
||||
|
||||
import re
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
|
||||
@register_parser("kimi_k2")
|
||||
class KimiK2ToolCallParser(ToolCallParser):
|
||||
"""
|
||||
Parser for Kimi K2 tool calls.
|
||||
|
||||
Uses section begin/end tokens wrapping individual tool call begin/end tokens.
|
||||
The tool_call_id contains the function name (after last dot, before colon).
|
||||
"""
|
||||
|
||||
# Support both singular and plural variants
|
||||
START_TOKENS = [
|
||||
"<|tool_calls_section_begin|>",
|
||||
"<|tool_call_section_begin|>",
|
||||
]
|
||||
|
||||
# Regex captures: tool_call_id (e.g., "functions.get_weather:0"), function_arguments
|
||||
PATTERN = re.compile(
|
||||
r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[^<]+:\d+)\s*"
|
||||
r"<\|tool_call_argument_begin\|>\s*"
|
||||
r"(?P<function_arguments>(?:(?!<\|tool_call_begin\|>).)*?)\s*"
|
||||
r"<\|tool_call_end\|>",
|
||||
re.DOTALL,
|
||||
)
|
||||
|
||||
def parse(self, text: str) -> ParseResult:
|
||||
# Check for any variant of the start token
|
||||
has_start = any(token in text for token in self.START_TOKENS)
|
||||
if not has_start:
|
||||
return text, None
|
||||
|
||||
try:
|
||||
matches = self.PATTERN.findall(text)
|
||||
if not matches:
|
||||
return text, None
|
||||
|
||||
tool_calls: List[ChatCompletionMessageToolCall] = []
|
||||
for match in matches:
|
||||
function_id, function_args = match
|
||||
|
||||
# Extract function name from ID format: "functions.get_weather:0" -> "get_weather"
|
||||
function_name = function_id.split(":")[0].split(".")[-1]
|
||||
|
||||
tool_calls.append(
|
||||
ChatCompletionMessageToolCall(
|
||||
id=function_id, # Preserve the original ID format
|
||||
type="function",
|
||||
function=Function(
|
||||
name=function_name,
|
||||
arguments=function_args.strip(),
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
if not tool_calls:
|
||||
return text, None
|
||||
|
||||
# Content is everything before the tool calls section
|
||||
earliest_start = len(text)
|
||||
for token in self.START_TOKENS:
|
||||
idx = text.find(token)
|
||||
if idx >= 0 and idx < earliest_start:
|
||||
earliest_start = idx
|
||||
|
||||
content = text[:earliest_start].strip()
|
||||
return content if content else None, tool_calls
|
||||
|
||||
except Exception:
|
||||
return text, None
|
||||
@@ -1,96 +0,0 @@
|
||||
"""
|
||||
Llama 3.x / 4 tool call parser.
|
||||
|
||||
Format: The model outputs JSON objects with "name" and "arguments" (or "parameters") keys.
|
||||
May be preceded by <|python_tag|> token. Supports multiple JSON objects separated
|
||||
by content or semicolons.
|
||||
|
||||
Based on VLLM's Llama3JsonToolParser.extract_tool_calls()
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
|
||||
@register_parser("llama3_json")
|
||||
@register_parser("llama4_json")
|
||||
class LlamaToolCallParser(ToolCallParser):
|
||||
"""
|
||||
Parser for Llama 3.x and 4 JSON-format tool calls.
|
||||
|
||||
Finds JSON objects containing "name" + ("arguments" or "parameters") keys.
|
||||
Uses Python's json.JSONDecoder.raw_decode for robust extraction of
|
||||
JSON objects from mixed text.
|
||||
"""
|
||||
|
||||
BOT_TOKEN = "<|python_tag|>"
|
||||
|
||||
# Regex to find the start of potential JSON objects
|
||||
JSON_START = re.compile(r"\{")
|
||||
|
||||
def parse(self, text: str) -> ParseResult:
|
||||
# Quick check: need either the bot token or a JSON brace
|
||||
if self.BOT_TOKEN not in text and "{" not in text:
|
||||
return text, None
|
||||
|
||||
try:
|
||||
decoder = json.JSONDecoder()
|
||||
tool_calls: List[ChatCompletionMessageToolCall] = []
|
||||
end_index = -1 # Track where the last parsed JSON ended
|
||||
|
||||
for match in self.JSON_START.finditer(text):
|
||||
start = match.start()
|
||||
# Skip if this brace is inside a previously parsed JSON object
|
||||
if start <= end_index:
|
||||
continue
|
||||
|
||||
try:
|
||||
obj, json_end = decoder.raw_decode(text[start:])
|
||||
end_index = start + json_end
|
||||
|
||||
# Must have "name" and either "arguments" or "parameters"
|
||||
name = obj.get("name")
|
||||
args = obj.get("arguments", obj.get("parameters"))
|
||||
|
||||
if not name or args is None:
|
||||
continue
|
||||
|
||||
# Normalize arguments to JSON string
|
||||
if isinstance(args, dict):
|
||||
args = json.dumps(args, ensure_ascii=False)
|
||||
elif not isinstance(args, str):
|
||||
args = json.dumps(args, ensure_ascii=False)
|
||||
|
||||
tool_calls.append(
|
||||
ChatCompletionMessageToolCall(
|
||||
id=f"call_{uuid.uuid4().hex[:8]}",
|
||||
type="function",
|
||||
function=Function(name=name, arguments=args),
|
||||
)
|
||||
)
|
||||
except (json.JSONDecodeError, KeyError, ValueError):
|
||||
continue
|
||||
|
||||
if not tool_calls:
|
||||
return text, None
|
||||
|
||||
# Content is everything before the first tool call JSON
|
||||
# Find where the first tool call starts in the text
|
||||
first_tc_start = text.find("{")
|
||||
if self.BOT_TOKEN in text:
|
||||
first_tc_start = text.find(self.BOT_TOKEN)
|
||||
content = text[:first_tc_start].strip() if first_tc_start > 0 else None
|
||||
|
||||
return content, tool_calls
|
||||
|
||||
except Exception:
|
||||
return text, None
|
||||
@@ -1,69 +0,0 @@
|
||||
"""
|
||||
Longcat Flash Chat tool call parser.
|
||||
|
||||
Same as Hermes but uses <longcat_tool_call> tags instead of <tool_call>.
|
||||
Based on VLLM's LongcatFlashToolParser (extends Hermes2ProToolParser).
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
|
||||
@register_parser("longcat")
|
||||
class LongcatToolCallParser(ToolCallParser):
|
||||
"""
|
||||
Parser for Longcat Flash Chat tool calls.
|
||||
Identical logic to Hermes, just different tag names.
|
||||
"""
|
||||
|
||||
PATTERN = re.compile(
|
||||
r"<longcat_tool_call>\s*(.*?)\s*</longcat_tool_call>|<longcat_tool_call>\s*(.*)",
|
||||
re.DOTALL,
|
||||
)
|
||||
|
||||
def parse(self, text: str) -> ParseResult:
|
||||
if "<longcat_tool_call>" not in text:
|
||||
return text, None
|
||||
|
||||
try:
|
||||
matches = self.PATTERN.findall(text)
|
||||
if not matches:
|
||||
return text, None
|
||||
|
||||
tool_calls: List[ChatCompletionMessageToolCall] = []
|
||||
for match in matches:
|
||||
raw_json = match[0] if match[0] else match[1]
|
||||
if not raw_json.strip():
|
||||
continue
|
||||
|
||||
tc_data = json.loads(raw_json)
|
||||
tool_calls.append(
|
||||
ChatCompletionMessageToolCall(
|
||||
id=f"call_{uuid.uuid4().hex[:8]}",
|
||||
type="function",
|
||||
function=Function(
|
||||
name=tc_data["name"],
|
||||
arguments=json.dumps(
|
||||
tc_data.get("arguments", {}), ensure_ascii=False
|
||||
),
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
if not tool_calls:
|
||||
return text, None
|
||||
|
||||
content = text[: text.find("<longcat_tool_call>")].strip()
|
||||
return content if content else None, tool_calls
|
||||
|
||||
except Exception:
|
||||
return text, None
|
||||
@@ -1,137 +0,0 @@
|
||||
"""
|
||||
Mistral tool call parser.
|
||||
|
||||
Supports two formats depending on tokenizer version:
|
||||
- Pre-v11: content[TOOL_CALLS] [{"name": ..., "arguments": {...}}, ...]
|
||||
- v11+: content[TOOL_CALLS]tool_name1{"arg": "val"}[TOOL_CALLS]tool_name2{"arg": "val"}
|
||||
|
||||
Based on VLLM's MistralToolParser.extract_tool_calls()
|
||||
The [TOOL_CALLS] token is the bot_token used by Mistral models.
|
||||
"""
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
|
||||
def _generate_mistral_id() -> str:
|
||||
"""Mistral tool call IDs are 9-char alphanumeric strings."""
|
||||
import random
|
||||
import string
|
||||
|
||||
return "".join(random.choices(string.ascii_letters + string.digits, k=9))
|
||||
|
||||
|
||||
@register_parser("mistral")
|
||||
class MistralToolCallParser(ToolCallParser):
|
||||
"""
|
||||
Parser for Mistral-format tool calls.
|
||||
|
||||
Detects format by checking if the content after [TOOL_CALLS] starts with '['
|
||||
(pre-v11 JSON array) or with a tool name (v11+ format).
|
||||
"""
|
||||
|
||||
# The [TOOL_CALLS] token -- may appear as different strings depending on tokenizer
|
||||
BOT_TOKEN = "[TOOL_CALLS]"
|
||||
|
||||
def parse(self, text: str) -> ParseResult:
|
||||
if self.BOT_TOKEN not in text:
|
||||
return text, None
|
||||
|
||||
try:
|
||||
parts = text.split(self.BOT_TOKEN)
|
||||
content = parts[0].strip()
|
||||
raw_tool_calls = parts[1:]
|
||||
|
||||
# Detect format: if the first raw part starts with '[', it's pre-v11
|
||||
first_raw = raw_tool_calls[0].strip() if raw_tool_calls else ""
|
||||
is_pre_v11 = first_raw.startswith("[") or first_raw.startswith("{")
|
||||
|
||||
tool_calls: List[ChatCompletionMessageToolCall] = []
|
||||
|
||||
if not is_pre_v11:
|
||||
# v11+ format: [TOOL_CALLS]tool_name{args}[TOOL_CALLS]tool_name2{args2}
|
||||
for raw in raw_tool_calls:
|
||||
raw = raw.strip()
|
||||
if not raw or "{" not in raw:
|
||||
continue
|
||||
|
||||
brace_idx = raw.find("{")
|
||||
tool_name = raw[:brace_idx].strip()
|
||||
args_str = raw[brace_idx:]
|
||||
|
||||
# Validate and clean the JSON arguments
|
||||
try:
|
||||
parsed_args = json.loads(args_str)
|
||||
args_str = json.dumps(parsed_args, ensure_ascii=False)
|
||||
except json.JSONDecodeError:
|
||||
pass # Keep raw if parsing fails
|
||||
|
||||
tool_calls.append(
|
||||
ChatCompletionMessageToolCall(
|
||||
id=_generate_mistral_id(),
|
||||
type="function",
|
||||
function=Function(name=tool_name, arguments=args_str),
|
||||
)
|
||||
)
|
||||
else:
|
||||
# Pre-v11 format: [TOOL_CALLS] [{"name": ..., "arguments": {...}}]
|
||||
try:
|
||||
parsed = json.loads(first_raw)
|
||||
if isinstance(parsed, dict):
|
||||
parsed = [parsed]
|
||||
|
||||
for tc in parsed:
|
||||
if "name" not in tc:
|
||||
continue
|
||||
args = tc.get("arguments", {})
|
||||
if isinstance(args, dict):
|
||||
args = json.dumps(args, ensure_ascii=False)
|
||||
|
||||
tool_calls.append(
|
||||
ChatCompletionMessageToolCall(
|
||||
id=_generate_mistral_id(),
|
||||
type="function",
|
||||
function=Function(
|
||||
name=tc["name"], arguments=args
|
||||
),
|
||||
)
|
||||
)
|
||||
except json.JSONDecodeError:
|
||||
# Fallback: extract JSON objects using raw_decode
|
||||
decoder = json.JSONDecoder()
|
||||
idx = 0
|
||||
while idx < len(first_raw):
|
||||
try:
|
||||
obj, end_idx = decoder.raw_decode(first_raw, idx)
|
||||
if isinstance(obj, dict) and "name" in obj:
|
||||
args = obj.get("arguments", {})
|
||||
if isinstance(args, dict):
|
||||
args = json.dumps(args, ensure_ascii=False)
|
||||
tool_calls.append(
|
||||
ChatCompletionMessageToolCall(
|
||||
id=_generate_mistral_id(),
|
||||
type="function",
|
||||
function=Function(
|
||||
name=obj["name"], arguments=args
|
||||
),
|
||||
)
|
||||
)
|
||||
idx = end_idx
|
||||
except json.JSONDecodeError:
|
||||
idx += 1
|
||||
|
||||
if not tool_calls:
|
||||
return text, None
|
||||
|
||||
return content if content else None, tool_calls
|
||||
|
||||
except Exception:
|
||||
return text, None
|
||||
@@ -1,163 +0,0 @@
|
||||
"""
|
||||
Qwen3-Coder tool call parser.
|
||||
|
||||
Format uses XML-style nested tags:
|
||||
<tool_call>
|
||||
<function=function_name>
|
||||
<parameter=param_name>value</parameter>
|
||||
<parameter=param_name2>value2</parameter>
|
||||
</function>
|
||||
</tool_call>
|
||||
|
||||
Parameters are extracted from <parameter=name>value</parameter> tags and
|
||||
type-converted using the schema if available, otherwise treated as strings.
|
||||
|
||||
Based on VLLM's Qwen3CoderToolParser.extract_tool_calls()
|
||||
"""
|
||||
|
||||
import ast
|
||||
import json
|
||||
import re
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from openai.types.chat.chat_completion_message_tool_call import (
|
||||
ChatCompletionMessageToolCall,
|
||||
Function,
|
||||
)
|
||||
|
||||
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
|
||||
|
||||
|
||||
def _try_convert_value(value: str) -> Any:
|
||||
"""
|
||||
Try to convert a parameter value string to a native Python type.
|
||||
Handles null, numbers, booleans, JSON objects/arrays, and falls back to string.
|
||||
"""
|
||||
stripped = value.strip()
|
||||
|
||||
# Handle null
|
||||
if stripped.lower() == "null":
|
||||
return None
|
||||
|
||||
# Try JSON first (handles objects, arrays, strings, numbers, booleans)
|
||||
try:
|
||||
return json.loads(stripped)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
pass
|
||||
|
||||
# Try Python literal eval (handles tuples, etc.)
|
||||
try:
|
||||
return ast.literal_eval(stripped)
|
||||
except (ValueError, SyntaxError, TypeError):
|
||||
pass
|
||||
|
||||
# Return as string
|
||||
return stripped
|
||||
|
||||
|
||||
@register_parser("qwen3_coder")
|
||||
class Qwen3CoderToolCallParser(ToolCallParser):
|
||||
"""
|
||||
Parser for Qwen3-Coder XML-format tool calls.
|
||||
|
||||
Uses nested XML tags: <tool_call><function=name><parameter=key>val</parameter></function></tool_call>
|
||||
"""
|
||||
|
||||
START_TOKEN = "<tool_call>"
|
||||
FUNCTION_PREFIX = "<function="
|
||||
|
||||
# Find complete tool_call blocks (or unclosed at end)
|
||||
TOOL_CALL_REGEX = re.compile(
|
||||
r"<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", re.DOTALL
|
||||
)
|
||||
|
||||
# Find function blocks within a tool_call
|
||||
FUNCTION_REGEX = re.compile(
|
||||
r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL
|
||||
)
|
||||
|
||||
# Find parameter blocks within a function
|
||||
PARAMETER_REGEX = re.compile(
|
||||
r"<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)",
|
||||
re.DOTALL,
|
||||
)
|
||||
|
||||
def _parse_function_call(self, function_str: str) -> Optional[ChatCompletionMessageToolCall]:
|
||||
"""Parse a single <function=name>...</function> block into a ToolCall."""
|
||||
try:
|
||||
# Extract function name: everything before the first '>'
|
||||
gt_idx = function_str.index(">")
|
||||
func_name = function_str[:gt_idx].strip()
|
||||
params_str = function_str[gt_idx + 1:]
|
||||
|
||||
# Extract parameters
|
||||
param_dict: Dict[str, Any] = {}
|
||||
for match_text in self.PARAMETER_REGEX.findall(params_str):
|
||||
if ">" not in match_text:
|
||||
continue
|
||||
eq_idx = match_text.index(">")
|
||||
param_name = match_text[:eq_idx].strip()
|
||||
param_value = match_text[eq_idx + 1:]
|
||||
|
||||
# Clean up whitespace
|
||||
if param_value.startswith("\n"):
|
||||
param_value = param_value[1:]
|
||||
if param_value.endswith("\n"):
|
||||
param_value = param_value[:-1]
|
||||
|
||||
param_dict[param_name] = _try_convert_value(param_value)
|
||||
|
||||
return ChatCompletionMessageToolCall(
|
||||
id=f"call_{uuid.uuid4().hex[:24]}",
|
||||
type="function",
|
||||
function=Function(
|
||||
name=func_name,
|
||||
arguments=json.dumps(param_dict, ensure_ascii=False),
|
||||
),
|
||||
)
|
||||
except (ValueError, IndexError):
|
||||
return None
|
||||
|
||||
def parse(self, text: str) -> ParseResult:
|
||||
if self.FUNCTION_PREFIX not in text:
|
||||
return text, None
|
||||
|
||||
try:
|
||||
# Find all tool_call blocks
|
||||
tc_matches = self.TOOL_CALL_REGEX.findall(text)
|
||||
raw_blocks = [m[0] if m[0] else m[1] for m in tc_matches]
|
||||
|
||||
# Fallback: if no tool_call tags, try the whole text
|
||||
if not raw_blocks:
|
||||
raw_blocks = [text]
|
||||
|
||||
# Find function blocks within each tool_call
|
||||
function_strs: List[str] = []
|
||||
for block in raw_blocks:
|
||||
func_matches = self.FUNCTION_REGEX.findall(block)
|
||||
function_strs.extend(m[0] if m[0] else m[1] for m in func_matches)
|
||||
|
||||
if not function_strs:
|
||||
return text, None
|
||||
|
||||
# Parse each function call
|
||||
tool_calls: List[ChatCompletionMessageToolCall] = []
|
||||
for func_str in function_strs:
|
||||
tc = self._parse_function_call(func_str)
|
||||
if tc is not None:
|
||||
tool_calls.append(tc)
|
||||
|
||||
if not tool_calls:
|
||||
return text, None
|
||||
|
||||
# Content before tool calls
|
||||
first_tc = text.find(self.START_TOKEN)
|
||||
if first_tc < 0:
|
||||
first_tc = text.find(self.FUNCTION_PREFIX)
|
||||
content = text[:first_tc].strip() if first_tc > 0 else None
|
||||
|
||||
return content, tool_calls
|
||||
|
||||
except Exception:
|
||||
return text, None
|
||||
@@ -1,19 +0,0 @@
|
||||
"""
|
||||
Qwen 2.5 tool call parser.
|
||||
|
||||
Uses the same <tool_call> format as Hermes.
|
||||
Registered as a separate parser name for clarity when using --tool-parser=qwen.
|
||||
"""
|
||||
|
||||
from environments.tool_call_parsers import register_parser
|
||||
from environments.tool_call_parsers.hermes_parser import HermesToolCallParser
|
||||
|
||||
|
||||
@register_parser("qwen")
|
||||
class QwenToolCallParser(HermesToolCallParser):
|
||||
"""
|
||||
Parser for Qwen 2.5 tool calls.
|
||||
Same <tool_call>{"name": ..., "arguments": ...}</tool_call> format as Hermes.
|
||||
"""
|
||||
|
||||
pass # Identical format -- inherits everything from Hermes
|
||||
@@ -1,473 +0,0 @@
|
||||
"""
|
||||
ToolContext -- Unrestricted Tool Access for Reward Functions
|
||||
|
||||
A per-rollout handle that gives reward/verification functions direct access to
|
||||
ALL hermes-agent tools, scoped to the rollout's task_id. The same task_id means
|
||||
the terminal/browser session is the SAME one the model used during its rollout --
|
||||
all state (files, processes, browser tabs) is preserved.
|
||||
|
||||
The verifier author decides which tools to use. Nothing is hardcoded or gated.
|
||||
|
||||
Example usage in a compute_reward():
|
||||
async def compute_reward(self, item, result, ctx):
|
||||
# Run tests in the model's terminal sandbox
|
||||
test = ctx.terminal("pytest -v")
|
||||
if test["exit_code"] == 0:
|
||||
return 1.0
|
||||
|
||||
# Check if a file was created
|
||||
content = ctx.read_file("/workspace/solution.py")
|
||||
if content.get("content"):
|
||||
return 0.5
|
||||
|
||||
return 0.0
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
|
||||
from model_tools import handle_function_call
|
||||
from tools.terminal_tool import cleanup_vm
|
||||
from tools.browser_tool import cleanup_browser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Thread pool for running sync tool calls that internally use asyncio.run()
|
||||
_tool_executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
|
||||
|
||||
|
||||
def _run_tool_in_thread(tool_name: str, arguments: Dict[str, Any], task_id: str) -> str:
|
||||
"""
|
||||
Run a tool call in a thread pool executor so backends that use asyncio.run()
|
||||
internally (modal, docker, daytona) get a clean event loop.
|
||||
|
||||
If we're already in an async context, executes handle_function_call() in a
|
||||
disposable worker thread and blocks for the result.
|
||||
If not (e.g., called from sync code), runs directly.
|
||||
"""
|
||||
try:
|
||||
loop = asyncio.get_running_loop()
|
||||
# We're in an async context -- need to run in thread
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
|
||||
future = pool.submit(
|
||||
handle_function_call, tool_name, arguments, task_id
|
||||
)
|
||||
return future.result(timeout=300)
|
||||
except RuntimeError:
|
||||
# No running event loop -- safe to call directly
|
||||
return handle_function_call(tool_name, arguments, task_id)
|
||||
|
||||
|
||||
class ToolContext:
|
||||
"""
|
||||
Open-ended access to all hermes-agent tools for a specific rollout.
|
||||
|
||||
Passed to compute_reward() so verifiers can use any tool they need:
|
||||
terminal commands, file reads/writes, web searches, browser automation, etc.
|
||||
All calls share the rollout's task_id for session isolation.
|
||||
"""
|
||||
|
||||
def __init__(self, task_id: str):
|
||||
self.task_id = task_id
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Terminal tools
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def terminal(self, command: str, timeout: int = 180) -> Dict[str, Any]:
|
||||
"""
|
||||
Run a command in the rollout's terminal session.
|
||||
|
||||
Args:
|
||||
command: Shell command to execute
|
||||
timeout: Command timeout in seconds
|
||||
|
||||
Returns:
|
||||
Dict with 'exit_code' (int) and 'output' (str)
|
||||
"""
|
||||
import os
|
||||
backend = os.getenv("TERMINAL_ENV", "local")
|
||||
logger.debug("ToolContext.terminal [%s backend] task=%s: %s", backend, self.task_id[:8], command[:100])
|
||||
|
||||
# Run via thread helper so modal/docker/daytona backends' asyncio.run() doesn't deadlock
|
||||
result = _run_tool_in_thread(
|
||||
"terminal",
|
||||
{"command": command, "timeout": timeout},
|
||||
self.task_id,
|
||||
)
|
||||
try:
|
||||
return json.loads(result)
|
||||
except json.JSONDecodeError:
|
||||
return {"exit_code": -1, "output": result}
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# File tools
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def read_file(self, path: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Read a file from the rollout's filesystem.
|
||||
|
||||
Args:
|
||||
path: File path to read
|
||||
|
||||
Returns:
|
||||
Dict with file content or error
|
||||
"""
|
||||
result = handle_function_call(
|
||||
"read_file", {"path": path}, task_id=self.task_id
|
||||
)
|
||||
try:
|
||||
return json.loads(result)
|
||||
except json.JSONDecodeError:
|
||||
return {"error": result}
|
||||
|
||||
def write_file(self, path: str, content: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Write a TEXT file in the rollout's filesystem.
|
||||
|
||||
Uses a shell heredoc under the hood, so this is only safe for text content.
|
||||
For binary files (images, compiled artifacts, etc.), use upload_file() instead.
|
||||
|
||||
Args:
|
||||
path: File path to write
|
||||
content: Text content to write
|
||||
|
||||
Returns:
|
||||
Dict with success status or error
|
||||
"""
|
||||
result = handle_function_call(
|
||||
"write_file", {"path": path, "content": content}, task_id=self.task_id
|
||||
)
|
||||
try:
|
||||
return json.loads(result)
|
||||
except json.JSONDecodeError:
|
||||
return {"error": result}
|
||||
|
||||
def upload_file(self, local_path: str, remote_path: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Upload a local file to the rollout's sandbox (binary-safe).
|
||||
|
||||
Unlike write_file() which passes content through a shell heredoc (text-only),
|
||||
this method base64-encodes the file and decodes it inside the sandbox.
|
||||
Safe for any file type: binaries, images, archives, etc.
|
||||
|
||||
For large files (>1MB), the content is split into chunks to avoid
|
||||
hitting shell command-length limits.
|
||||
|
||||
Args:
|
||||
local_path: Path to a local file on the host
|
||||
remote_path: Destination path inside the sandbox
|
||||
|
||||
Returns:
|
||||
Dict with 'exit_code' and 'output'
|
||||
"""
|
||||
import base64
|
||||
from pathlib import Path as _Path
|
||||
|
||||
local = _Path(local_path)
|
||||
if not local.exists():
|
||||
return {"exit_code": -1, "output": f"Local file not found: {local_path}"}
|
||||
|
||||
raw = local.read_bytes()
|
||||
b64 = base64.b64encode(raw).decode("ascii")
|
||||
|
||||
# Ensure parent directory exists in the sandbox
|
||||
parent = str(_Path(remote_path).parent)
|
||||
if parent not in {".", "/"}:
|
||||
self.terminal(f"mkdir -p {parent}", timeout=10)
|
||||
|
||||
# For small files, single command is fine
|
||||
chunk_size = 60_000 # ~60KB per chunk (well within shell limits)
|
||||
if len(b64) <= chunk_size:
|
||||
result = self.terminal(
|
||||
f"printf '%s' '{b64}' | base64 -d > {remote_path}",
|
||||
timeout=30,
|
||||
)
|
||||
else:
|
||||
# For larger files, write base64 in chunks then decode
|
||||
tmp_b64 = "/tmp/_hermes_upload.b64"
|
||||
self.terminal(f": > {tmp_b64}", timeout=5) # truncate
|
||||
for i in range(0, len(b64), chunk_size):
|
||||
chunk = b64[i : i + chunk_size]
|
||||
self.terminal(f"printf '%s' '{chunk}' >> {tmp_b64}", timeout=15)
|
||||
result = self.terminal(
|
||||
f"base64 -d {tmp_b64} > {remote_path} && rm -f {tmp_b64}",
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
def upload_dir(self, local_dir: str, remote_dir: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Upload an entire local directory to the rollout's sandbox (binary-safe).
|
||||
|
||||
Recursively uploads all files, preserving directory structure.
|
||||
|
||||
Args:
|
||||
local_dir: Path to a local directory on the host
|
||||
remote_dir: Destination directory inside the sandbox
|
||||
|
||||
Returns:
|
||||
List of results, one per file uploaded
|
||||
"""
|
||||
from pathlib import Path as _Path
|
||||
|
||||
local = _Path(local_dir)
|
||||
if not local.exists() or not local.is_dir():
|
||||
return [{"exit_code": -1, "output": f"Local directory not found: {local_dir}"}]
|
||||
|
||||
results = []
|
||||
for file_path in sorted(local.rglob("*")):
|
||||
if file_path.is_file():
|
||||
relative = file_path.relative_to(local)
|
||||
target = f"{remote_dir}/{relative}"
|
||||
results.append(self.upload_file(str(file_path), target))
|
||||
return results
|
||||
|
||||
def download_file(self, remote_path: str, local_path: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Download a file from the rollout's sandbox to the host (binary-safe).
|
||||
|
||||
The inverse of upload_file(). Base64-encodes the file inside the sandbox,
|
||||
reads the encoded data through the terminal, and decodes it locally.
|
||||
Safe for any file type.
|
||||
|
||||
Args:
|
||||
remote_path: Path to the file inside the sandbox
|
||||
local_path: Destination path on the host
|
||||
|
||||
Returns:
|
||||
Dict with 'success' (bool) and 'bytes' (int) or 'error' (str)
|
||||
"""
|
||||
import base64
|
||||
from pathlib import Path as _Path
|
||||
|
||||
# Base64-encode the file inside the sandbox and capture output
|
||||
result = self.terminal(
|
||||
f"base64 {remote_path} 2>/dev/null",
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
if result.get("exit_code", -1) != 0:
|
||||
return {
|
||||
"success": False,
|
||||
"error": f"Failed to read remote file: {result.get('output', '')}",
|
||||
}
|
||||
|
||||
b64_data = result.get("output", "").strip()
|
||||
if not b64_data:
|
||||
return {"success": False, "error": f"Remote file is empty or missing: {remote_path}"}
|
||||
|
||||
try:
|
||||
raw = base64.b64decode(b64_data)
|
||||
except Exception as e:
|
||||
return {"success": False, "error": f"Base64 decode failed: {e}"}
|
||||
|
||||
# Write to local host filesystem
|
||||
local = _Path(local_path)
|
||||
local.parent.mkdir(parents=True, exist_ok=True)
|
||||
local.write_bytes(raw)
|
||||
|
||||
return {"success": True, "bytes": len(raw)}
|
||||
|
||||
def download_dir(self, remote_dir: str, local_dir: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Download a directory from the rollout's sandbox to the host (binary-safe).
|
||||
|
||||
Lists all files in the remote directory, then downloads each one.
|
||||
Preserves directory structure.
|
||||
|
||||
Args:
|
||||
remote_dir: Path to the directory inside the sandbox
|
||||
local_dir: Destination directory on the host
|
||||
|
||||
Returns:
|
||||
List of results, one per file downloaded
|
||||
"""
|
||||
from pathlib import Path as _Path
|
||||
|
||||
# List files in the remote directory
|
||||
ls_result = self.terminal(
|
||||
f"find {remote_dir} -type f 2>/dev/null",
|
||||
timeout=15,
|
||||
)
|
||||
|
||||
if ls_result.get("exit_code", -1) != 0:
|
||||
return [{"success": False, "error": f"Failed to list remote dir: {remote_dir}"}]
|
||||
|
||||
file_list = ls_result.get("output", "").strip()
|
||||
if not file_list:
|
||||
return [{"success": False, "error": f"Remote directory is empty or missing: {remote_dir}"}]
|
||||
|
||||
results = []
|
||||
for remote_file in file_list.splitlines():
|
||||
remote_file = remote_file.strip()
|
||||
if not remote_file:
|
||||
continue
|
||||
# Compute the relative path to preserve directory structure
|
||||
if remote_file.startswith(remote_dir):
|
||||
relative = remote_file[len(remote_dir):].lstrip("/")
|
||||
else:
|
||||
relative = _Path(remote_file).name
|
||||
local_file = str(_Path(local_dir) / relative)
|
||||
results.append(self.download_file(remote_file, local_file))
|
||||
|
||||
return results
|
||||
|
||||
def search(self, query: str, path: str = ".") -> Dict[str, Any]:
|
||||
"""
|
||||
Search for text in the rollout's filesystem.
|
||||
|
||||
Args:
|
||||
query: Search query
|
||||
path: Directory to search in
|
||||
|
||||
Returns:
|
||||
Dict with search results
|
||||
"""
|
||||
result = handle_function_call(
|
||||
"search_files", {"pattern": query, "path": path}, task_id=self.task_id
|
||||
)
|
||||
try:
|
||||
return json.loads(result)
|
||||
except json.JSONDecodeError:
|
||||
return {"error": result}
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Web tools
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def web_search(self, query: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Search the web.
|
||||
|
||||
Args:
|
||||
query: Search query
|
||||
|
||||
Returns:
|
||||
Dict with search results
|
||||
"""
|
||||
result = handle_function_call("web_search", {"query": query})
|
||||
try:
|
||||
return json.loads(result)
|
||||
except json.JSONDecodeError:
|
||||
return {"error": result}
|
||||
|
||||
def web_extract(self, urls: List[str]) -> Dict[str, Any]:
|
||||
"""
|
||||
Extract content from URLs.
|
||||
|
||||
Args:
|
||||
urls: List of URLs to extract content from
|
||||
|
||||
Returns:
|
||||
Dict with extracted content
|
||||
"""
|
||||
result = handle_function_call("web_extract", {"urls": urls})
|
||||
try:
|
||||
return json.loads(result)
|
||||
except json.JSONDecodeError:
|
||||
return {"error": result}
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Browser tools
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def browser_navigate(self, url: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Navigate the rollout's browser session to a URL.
|
||||
|
||||
Args:
|
||||
url: URL to navigate to
|
||||
|
||||
Returns:
|
||||
Dict with page snapshot or error
|
||||
"""
|
||||
result = handle_function_call(
|
||||
"browser_navigate", {"url": url}, task_id=self.task_id
|
||||
)
|
||||
try:
|
||||
return json.loads(result)
|
||||
except json.JSONDecodeError:
|
||||
return {"error": result}
|
||||
|
||||
def browser_snapshot(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Take a snapshot of the current browser page.
|
||||
|
||||
Returns:
|
||||
Dict with page content/accessibility snapshot
|
||||
"""
|
||||
result = handle_function_call(
|
||||
"browser_snapshot", {}, task_id=self.task_id
|
||||
)
|
||||
try:
|
||||
return json.loads(result)
|
||||
except json.JSONDecodeError:
|
||||
return {"error": result}
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Generic tool access
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def call_tool(self, tool_name: str, arguments: Dict[str, Any]) -> str:
|
||||
"""
|
||||
Call any hermes-agent tool by name.
|
||||
|
||||
This is the generic escape hatch -- if a tool doesn't have a convenience
|
||||
wrapper above, you can call it directly here.
|
||||
|
||||
Args:
|
||||
tool_name: Name of the tool (e.g., "vision_analyze", "skills_list")
|
||||
arguments: Dict of arguments for the tool
|
||||
|
||||
Returns:
|
||||
Raw JSON string result from the tool
|
||||
"""
|
||||
return _run_tool_in_thread(tool_name, arguments, self.task_id)
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Cleanup
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def cleanup(self):
|
||||
"""
|
||||
Release all resources (terminal VMs, browser sessions, background processes)
|
||||
for this rollout.
|
||||
|
||||
Called automatically by the base environment via try/finally after
|
||||
compute_reward() completes. You generally don't need to call this yourself.
|
||||
"""
|
||||
# Kill any background processes from this rollout (safety net)
|
||||
try:
|
||||
from tools.process_registry import process_registry
|
||||
killed = process_registry.kill_all(task_id=self.task_id)
|
||||
if killed:
|
||||
logger.debug("Process cleanup for task %s: killed %d process(es)", self.task_id, killed)
|
||||
except Exception as e:
|
||||
logger.debug("Process cleanup for task %s: %s", self.task_id, e)
|
||||
|
||||
try:
|
||||
cleanup_vm(self.task_id)
|
||||
except Exception as e:
|
||||
logger.debug("VM cleanup for task %s: %s", self.task_id, e)
|
||||
|
||||
# Suppress browser_tool's noisy debug prints during cleanup.
|
||||
# The cleanup still runs (safe), it just doesn't spam the console.
|
||||
_prev_quiet = os.environ.get("HERMES_QUIET")
|
||||
os.environ["HERMES_QUIET"] = "1"
|
||||
try:
|
||||
cleanup_browser(self.task_id)
|
||||
except Exception as e:
|
||||
logger.debug("Browser cleanup for task %s: %s", self.task_id, e)
|
||||
finally:
|
||||
if _prev_quiet is None:
|
||||
os.environ.pop("HERMES_QUIET", None)
|
||||
else:
|
||||
os.environ["HERMES_QUIET"] = _prev_quiet
|
||||
@@ -1,719 +0,0 @@
|
||||
"""
|
||||
WebResearchEnv — RL Environment for Multi-Step Web Research
|
||||
============================================================
|
||||
|
||||
Trains models to do accurate, efficient, multi-source web research.
|
||||
|
||||
Reward signals:
|
||||
- Answer correctness (LLM judge, 0.0–1.0)
|
||||
- Source diversity (used ≥2 distinct domains)
|
||||
- Efficiency (penalizes excessive tool calls)
|
||||
- Tool usage (bonus for actually using web tools)
|
||||
|
||||
Dataset: FRAMES benchmark (Google, 2024) — multi-hop factual questions
|
||||
HuggingFace: google/frames-benchmark
|
||||
Fallback: built-in sample questions (no HF token needed)
|
||||
|
||||
Usage:
|
||||
# Phase 1 (OpenAI-compatible server)
|
||||
python environments/web_research_env.py serve \\
|
||||
--openai.base_url http://localhost:8000/v1 \\
|
||||
--openai.model_name YourModel \\
|
||||
--openai.server_type openai
|
||||
|
||||
# Process mode (offline data generation)
|
||||
python environments/web_research_env.py process \\
|
||||
--env.data_path_to_save_groups data/web_research.jsonl
|
||||
|
||||
# Standalone eval
|
||||
python environments/web_research_env.py evaluate \\
|
||||
--openai.base_url http://localhost:8000/v1 \\
|
||||
--openai.model_name YourModel
|
||||
|
||||
Built by: github.com/jackx707
|
||||
Inspired by: GroceryMind — production Hermes agent doing live web research
|
||||
across German grocery stores (firecrawl + hermes-agent)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
# Ensure hermes-agent root is on path
|
||||
_repo_root = Path(__file__).resolve().parent.parent
|
||||
if str(_repo_root) not in sys.path:
|
||||
sys.path.insert(0, str(_repo_root))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Optional HuggingFace datasets import
|
||||
# ---------------------------------------------------------------------------
|
||||
try:
|
||||
from datasets import load_dataset
|
||||
HF_AVAILABLE = True
|
||||
except ImportError:
|
||||
HF_AVAILABLE = False
|
||||
|
||||
from atroposlib.envs.base import ScoredDataGroup
|
||||
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||
from atroposlib.type_definitions import Item
|
||||
|
||||
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||
from environments.agent_loop import AgentResult
|
||||
from environments.tool_context import ToolContext
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fallback sample dataset (used when HuggingFace is unavailable)
|
||||
# Multi-hop questions requiring real web search to answer.
|
||||
# ---------------------------------------------------------------------------
|
||||
SAMPLE_QUESTIONS = [
|
||||
{
|
||||
"question": "What is the current population of the capital city of the country that won the 2022 FIFA World Cup?",
|
||||
"answer": "Buenos Aires has approximately 3 million people in the city proper, or around 15 million in the greater metro area.",
|
||||
"difficulty": "medium",
|
||||
"hops": 2,
|
||||
},
|
||||
{
|
||||
"question": "Who is the CEO of the company that makes the most widely used open-source container orchestration platform?",
|
||||
"answer": "The Linux Foundation oversees Kubernetes. CNCF (Cloud Native Computing Foundation) is the specific body — it does not have a traditional CEO but has an executive director.",
|
||||
"difficulty": "medium",
|
||||
"hops": 2,
|
||||
},
|
||||
{
|
||||
"question": "What programming language was used to write the original version of the web framework used by Instagram?",
|
||||
"answer": "Django, which Instagram was built on, is written in Python.",
|
||||
"difficulty": "easy",
|
||||
"hops": 2,
|
||||
},
|
||||
{
|
||||
"question": "In what year was the university founded where the inventor of the World Wide Web currently holds a professorship?",
|
||||
"answer": "Tim Berners-Lee holds a professorship at MIT (founded 1861) and the University of Southampton (founded 1952).",
|
||||
"difficulty": "hard",
|
||||
"hops": 3,
|
||||
},
|
||||
{
|
||||
"question": "What is the latest stable version of the programming language that ranks #1 on the TIOBE index as of this year?",
|
||||
"answer": "Python is currently #1 on TIOBE. The latest stable version should be verified via the official python.org site.",
|
||||
"difficulty": "medium",
|
||||
"hops": 2,
|
||||
},
|
||||
{
|
||||
"question": "How many employees does the parent company of Instagram have?",
|
||||
"answer": "Meta Platforms (parent of Instagram) employs approximately 70,000+ people as of recent reports.",
|
||||
"difficulty": "medium",
|
||||
"hops": 2,
|
||||
},
|
||||
{
|
||||
"question": "What is the current interest rate set by the central bank of the country where the Eiffel Tower is located?",
|
||||
"answer": "The European Central Bank sets rates for France/eurozone. The current rate should be verified — it has changed frequently in 2023-2025.",
|
||||
"difficulty": "hard",
|
||||
"hops": 2,
|
||||
},
|
||||
{
|
||||
"question": "Which company acquired the startup founded by the creator of Oculus VR?",
|
||||
"answer": "Palmer Luckey founded Oculus VR, which was acquired by Facebook (now Meta). He later founded Anduril Industries.",
|
||||
"difficulty": "medium",
|
||||
"hops": 2,
|
||||
},
|
||||
{
|
||||
"question": "What is the market cap of the company that owns the most popular search engine in Russia?",
|
||||
"answer": "Yandex (now split into separate entities after 2024 restructuring). Current market cap should be verified via financial sources.",
|
||||
"difficulty": "hard",
|
||||
"hops": 2,
|
||||
},
|
||||
{
|
||||
"question": "What was the GDP growth rate of the country that hosted the most recent Summer Olympics?",
|
||||
"answer": "Paris, France hosted the 2024 Summer Olympics. France's recent GDP growth should be verified via World Bank or IMF data.",
|
||||
"difficulty": "hard",
|
||||
"hops": 2,
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configuration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class WebResearchEnvConfig(HermesAgentEnvConfig):
|
||||
"""Configuration for the web research RL environment."""
|
||||
|
||||
# Reward weights
|
||||
correctness_weight: float = Field(
|
||||
default=0.6,
|
||||
description="Weight for answer correctness in reward (LLM judge score).",
|
||||
)
|
||||
tool_usage_weight: float = Field(
|
||||
default=0.2,
|
||||
description="Weight for tool usage signal (did the model actually use web tools?).",
|
||||
)
|
||||
efficiency_weight: float = Field(
|
||||
default=0.2,
|
||||
description="Weight for efficiency signal (penalizes excessive tool calls).",
|
||||
)
|
||||
diversity_bonus: float = Field(
|
||||
default=0.1,
|
||||
description="Bonus reward for citing ≥2 distinct domains.",
|
||||
)
|
||||
|
||||
# Efficiency thresholds
|
||||
efficient_max_calls: int = Field(
|
||||
default=5,
|
||||
description="Maximum tool calls before efficiency penalty begins.",
|
||||
)
|
||||
heavy_penalty_calls: int = Field(
|
||||
default=10,
|
||||
description="Tool call count where efficiency penalty steepens.",
|
||||
)
|
||||
|
||||
# Eval
|
||||
eval_size: int = Field(
|
||||
default=20,
|
||||
description="Number of held-out items for evaluation.",
|
||||
)
|
||||
eval_split_ratio: float = Field(
|
||||
default=0.1,
|
||||
description="Fraction of dataset to hold out for evaluation (0.0–1.0).",
|
||||
)
|
||||
|
||||
# Dataset
|
||||
dataset_name: str = Field(
|
||||
default="google/frames-benchmark",
|
||||
description="HuggingFace dataset name for research questions.",
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Environment
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class WebResearchEnv(HermesAgentBaseEnv):
|
||||
"""
|
||||
RL environment for training multi-step web research skills.
|
||||
|
||||
The model is given a factual question requiring 2-3 hops of web research
|
||||
and must use web_search / web_extract tools to find and synthesize the answer.
|
||||
|
||||
Reward is multi-signal:
|
||||
60% — answer correctness (LLM judge)
|
||||
20% — tool usage (did the model actually search the web?)
|
||||
20% — efficiency (penalizes >5 tool calls)
|
||||
|
||||
Bonus +0.1 for source diversity (≥2 distinct domains cited).
|
||||
"""
|
||||
|
||||
name = "web-research"
|
||||
env_config_cls = WebResearchEnvConfig
|
||||
|
||||
# Default toolsets for this environment — web + file for saving notes
|
||||
default_toolsets = ["web", "file"]
|
||||
|
||||
@classmethod
|
||||
def config_init(cls) -> Tuple[WebResearchEnvConfig, List[APIServerConfig]]:
|
||||
"""Default configuration for the web research environment."""
|
||||
env_config = WebResearchEnvConfig(
|
||||
enabled_toolsets=["web", "file"],
|
||||
max_agent_turns=15,
|
||||
agent_temperature=1.0,
|
||||
system_prompt=(
|
||||
"You are a highly capable research agent. When asked a factual question, "
|
||||
"always use web_search to find current, accurate information before answering. "
|
||||
"Cite at least 2 sources. Be concise and accurate."
|
||||
),
|
||||
group_size=4,
|
||||
total_steps=1000,
|
||||
steps_per_eval=100,
|
||||
use_wandb=True,
|
||||
wandb_name="web-research",
|
||||
)
|
||||
|
||||
server_configs = [
|
||||
APIServerConfig(
|
||||
base_url="https://openrouter.ai/api/v1",
|
||||
model_name="anthropic/claude-sonnet-4.5",
|
||||
server_type="openai",
|
||||
api_key=os.getenv("OPENROUTER_API_KEY", ""),
|
||||
health_check=False,
|
||||
)
|
||||
]
|
||||
|
||||
return env_config, server_configs
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._items: list[dict] = []
|
||||
self._eval_items: list[dict] = []
|
||||
self._index: int = 0
|
||||
|
||||
# Metrics tracking for wandb
|
||||
self._reward_buffer: list[float] = []
|
||||
self._correctness_buffer: list[float] = []
|
||||
self._tool_usage_buffer: list[float] = []
|
||||
self._efficiency_buffer: list[float] = []
|
||||
self._diversity_buffer: list[float] = []
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 1. Setup — load dataset
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def setup(self) -> None:
|
||||
"""Load the FRAMES benchmark or fall back to built-in samples."""
|
||||
if HF_AVAILABLE:
|
||||
try:
|
||||
logger.info("Loading FRAMES benchmark from HuggingFace...")
|
||||
ds = load_dataset(self.config.dataset_name, split="test")
|
||||
self._items = [
|
||||
{
|
||||
"question": row["Prompt"],
|
||||
"answer": row["Answer"],
|
||||
"difficulty": row.get("reasoning_types", "unknown"),
|
||||
"hops": 2,
|
||||
}
|
||||
for row in ds
|
||||
]
|
||||
# Hold out for eval
|
||||
eval_size = max(
|
||||
self.config.eval_size,
|
||||
int(len(self._items) * self.config.eval_split_ratio),
|
||||
)
|
||||
random.shuffle(self._items)
|
||||
self._eval_items = self._items[:eval_size]
|
||||
self._items = self._items[eval_size:]
|
||||
logger.info(
|
||||
f"Loaded {len(self._items)} train / {len(self._eval_items)} eval items "
|
||||
f"from FRAMES benchmark."
|
||||
)
|
||||
return
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not load FRAMES from HuggingFace: {e}. Using built-in samples.")
|
||||
|
||||
# Fallback
|
||||
random.shuffle(SAMPLE_QUESTIONS)
|
||||
split = max(1, len(SAMPLE_QUESTIONS) * 8 // 10)
|
||||
self._items = SAMPLE_QUESTIONS[:split]
|
||||
self._eval_items = SAMPLE_QUESTIONS[split:]
|
||||
logger.info(
|
||||
f"Using built-in sample dataset: {len(self._items)} train / "
|
||||
f"{len(self._eval_items)} eval items."
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 2. get_next_item — return the next question
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def get_next_item(self) -> dict:
|
||||
"""Return the next item, cycling through the dataset."""
|
||||
if not self._items:
|
||||
raise RuntimeError("Dataset is empty. Did you call setup()?")
|
||||
item = self._items[self._index % len(self._items)]
|
||||
self._index += 1
|
||||
return item
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 3. format_prompt — build the user-facing prompt
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def format_prompt(self, item: dict) -> str:
|
||||
"""Format the research question as a task prompt."""
|
||||
return (
|
||||
f"Research the following question thoroughly using web search. "
|
||||
f"You MUST search the web to find current, accurate information — "
|
||||
f"do not rely solely on your training data.\n\n"
|
||||
f"Question: {item['question']}\n\n"
|
||||
f"Requirements:\n"
|
||||
f"- Use web_search and/or web_extract tools to find information\n"
|
||||
f"- Search at least 2 different sources\n"
|
||||
f"- Provide a concise, accurate answer (2-4 sentences)\n"
|
||||
f"- Cite the sources you used"
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 4. compute_reward — multi-signal scoring
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def compute_reward(
|
||||
self,
|
||||
item: dict,
|
||||
result: AgentResult,
|
||||
ctx: ToolContext,
|
||||
) -> float:
|
||||
"""
|
||||
Multi-signal reward function:
|
||||
|
||||
correctness_weight * correctness — LLM judge comparing answer to ground truth
|
||||
tool_usage_weight * tool_used — binary: did the model use web tools?
|
||||
efficiency_weight * efficiency — penalizes wasteful tool usage
|
||||
+ diversity_bonus — source diversity (≥2 distinct domains)
|
||||
"""
|
||||
# Extract final response from messages (last assistant message with content)
|
||||
final_response = ""
|
||||
tools_used: list[str] = []
|
||||
for msg in reversed(result.messages):
|
||||
if msg.get("role") == "assistant" and msg.get("content") and not final_response:
|
||||
final_response = msg["content"]
|
||||
# Collect tool names from tool call messages
|
||||
if msg.get("role") == "assistant" and msg.get("tool_calls"):
|
||||
for tc in msg["tool_calls"]:
|
||||
fn = tc.get("function", {}) if isinstance(tc, dict) else {}
|
||||
name = fn.get("name", "")
|
||||
if name:
|
||||
tools_used.append(name)
|
||||
tool_call_count: int = result.turns_used or len(tools_used)
|
||||
|
||||
cfg = self.config
|
||||
|
||||
# ---- Signal 1: Answer correctness (LLM judge) ----------------
|
||||
correctness = await self._llm_judge(
|
||||
question=item["question"],
|
||||
expected=item["answer"],
|
||||
model_answer=final_response,
|
||||
)
|
||||
|
||||
# ---- Signal 2: Web tool usage --------------------------------
|
||||
web_tools = {"web_search", "web_extract", "search", "firecrawl"}
|
||||
tool_used = 1.0 if any(t in web_tools for t in tools_used) else 0.0
|
||||
|
||||
# ---- Signal 3: Efficiency ------------------------------------
|
||||
if tool_call_count <= cfg.efficient_max_calls:
|
||||
efficiency = 1.0
|
||||
elif tool_call_count <= cfg.heavy_penalty_calls:
|
||||
efficiency = 1.0 - (tool_call_count - cfg.efficient_max_calls) * 0.08
|
||||
else:
|
||||
efficiency = max(0.0, 1.0 - (tool_call_count - cfg.efficient_max_calls) * 0.12)
|
||||
|
||||
# ---- Bonus: Source diversity ---------------------------------
|
||||
domains = self._extract_domains(final_response)
|
||||
diversity = cfg.diversity_bonus if len(domains) >= 2 else 0.0
|
||||
|
||||
# ---- Combine ------------------------------------------------
|
||||
reward = (
|
||||
cfg.correctness_weight * correctness
|
||||
+ cfg.tool_usage_weight * tool_used
|
||||
+ cfg.efficiency_weight * efficiency
|
||||
+ diversity
|
||||
)
|
||||
reward = min(1.0, max(0.0, reward)) # clamp to [0, 1]
|
||||
|
||||
# Track for wandb
|
||||
self._reward_buffer.append(reward)
|
||||
self._correctness_buffer.append(correctness)
|
||||
self._tool_usage_buffer.append(tool_used)
|
||||
self._efficiency_buffer.append(efficiency)
|
||||
self._diversity_buffer.append(diversity)
|
||||
|
||||
logger.debug(
|
||||
f"Reward breakdown — correctness={correctness:.2f}, "
|
||||
f"tool_used={tool_used:.1f}, efficiency={efficiency:.2f}, "
|
||||
f"diversity={diversity:.1f} → total={reward:.3f}"
|
||||
)
|
||||
|
||||
return reward
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 5. evaluate — run on held-out eval split
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def evaluate(self, *args, **kwargs) -> None:
|
||||
"""Run evaluation on the held-out split using the full agent loop with tools.
|
||||
|
||||
Each eval item runs through the same agent loop as training —
|
||||
the model can use web_search, web_extract, etc. to research answers.
|
||||
This measures actual agentic research capability, not just knowledge.
|
||||
"""
|
||||
import time
|
||||
import uuid
|
||||
from environments.agent_loop import HermesAgentLoop
|
||||
from environments.tool_context import ToolContext
|
||||
|
||||
items = self._eval_items
|
||||
if not items:
|
||||
logger.warning("No eval items available.")
|
||||
return
|
||||
|
||||
eval_size = min(self.config.eval_size, len(items))
|
||||
eval_items = items[:eval_size]
|
||||
|
||||
logger.info(f"Running eval on {len(eval_items)} questions (with agent loop + tools)...")
|
||||
start_time = time.time()
|
||||
samples = []
|
||||
|
||||
# Resolve tools once for all eval items
|
||||
tools, valid_names = self._resolve_tools_for_group()
|
||||
|
||||
for i, item in enumerate(eval_items):
|
||||
task_id = str(uuid.uuid4())
|
||||
logger.info(f"Eval [{i+1}/{len(eval_items)}]: {item['question'][:80]}...")
|
||||
|
||||
try:
|
||||
# Build messages
|
||||
messages: List[Dict[str, Any]] = []
|
||||
if self.config.system_prompt:
|
||||
messages.append({"role": "system", "content": self.config.system_prompt})
|
||||
messages.append({"role": "user", "content": self.format_prompt(item)})
|
||||
|
||||
# Run the full agent loop with tools
|
||||
agent = HermesAgentLoop(
|
||||
server=self.server,
|
||||
tool_schemas=tools,
|
||||
valid_tool_names=valid_names,
|
||||
max_turns=self.config.max_agent_turns,
|
||||
task_id=task_id,
|
||||
temperature=0.0, # Deterministic for eval
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
budget_config=self.config.build_budget_config(),
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
|
||||
# Extract final response and tool usage from messages
|
||||
final_response = ""
|
||||
tool_call_count = 0
|
||||
for msg in reversed(result.messages):
|
||||
if msg.get("role") == "assistant" and msg.get("content") and not final_response:
|
||||
final_response = msg["content"]
|
||||
if msg.get("role") == "assistant" and msg.get("tool_calls"):
|
||||
tool_call_count += len(msg["tool_calls"])
|
||||
|
||||
# Compute reward (includes LLM judge for correctness)
|
||||
# Temporarily save buffer lengths so we can extract the
|
||||
# correctness score without calling judge twice, and avoid
|
||||
# polluting training metric buffers with eval data.
|
||||
buf_len = len(self._correctness_buffer)
|
||||
ctx = ToolContext(task_id)
|
||||
try:
|
||||
reward = await self.compute_reward(item, result, ctx)
|
||||
finally:
|
||||
ctx.cleanup()
|
||||
|
||||
# Extract correctness from the buffer (compute_reward appended it)
|
||||
# then remove eval entries from training buffers
|
||||
correctness = (
|
||||
self._correctness_buffer[buf_len]
|
||||
if len(self._correctness_buffer) > buf_len
|
||||
else 0.0
|
||||
)
|
||||
# Roll back buffers to avoid polluting training metrics
|
||||
for buf in (
|
||||
self._reward_buffer, self._correctness_buffer,
|
||||
self._tool_usage_buffer, self._efficiency_buffer,
|
||||
self._diversity_buffer,
|
||||
):
|
||||
if len(buf) > buf_len:
|
||||
buf.pop()
|
||||
|
||||
samples.append({
|
||||
"prompt": item["question"],
|
||||
"response": final_response[:500],
|
||||
"expected": item["answer"],
|
||||
"correctness": correctness,
|
||||
"reward": reward,
|
||||
"tool_calls": tool_call_count,
|
||||
"turns": result.turns_used,
|
||||
})
|
||||
|
||||
logger.info(
|
||||
f" → correctness={correctness:.2f}, reward={reward:.3f}, "
|
||||
f"tools={tool_call_count}, turns={result.turns_used}"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Eval error on item: {e}")
|
||||
samples.append({
|
||||
"prompt": item["question"],
|
||||
"response": f"ERROR: {e}",
|
||||
"expected": item["answer"],
|
||||
"correctness": 0.0,
|
||||
"reward": 0.0,
|
||||
"tool_calls": 0,
|
||||
"turns": 0,
|
||||
})
|
||||
|
||||
end_time = time.time()
|
||||
|
||||
# Compute aggregate metrics
|
||||
correctness_scores = [s["correctness"] for s in samples]
|
||||
rewards = [s["reward"] for s in samples]
|
||||
tool_counts = [s["tool_calls"] for s in samples]
|
||||
n = len(samples)
|
||||
|
||||
eval_metrics = {
|
||||
"eval/mean_correctness": sum(correctness_scores) / n if n else 0.0,
|
||||
"eval/mean_reward": sum(rewards) / n if n else 0.0,
|
||||
"eval/mean_tool_calls": sum(tool_counts) / n if n else 0.0,
|
||||
"eval/tool_usage_rate": sum(1 for t in tool_counts if t > 0) / n if n else 0.0,
|
||||
"eval/n_items": n,
|
||||
}
|
||||
|
||||
logger.info(
|
||||
f"Eval complete — correctness={eval_metrics['eval/mean_correctness']:.3f}, "
|
||||
f"reward={eval_metrics['eval/mean_reward']:.3f}, "
|
||||
f"tool_usage={eval_metrics['eval/tool_usage_rate']:.0%}"
|
||||
)
|
||||
|
||||
await self.evaluate_log(
|
||||
metrics=eval_metrics,
|
||||
samples=samples,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 6. wandb_log — custom metrics
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def wandb_log(self, wandb_metrics: Optional[Dict] = None) -> None:
|
||||
"""Log reward breakdown metrics to wandb."""
|
||||
if wandb_metrics is None:
|
||||
wandb_metrics = {}
|
||||
|
||||
if self._reward_buffer:
|
||||
n = len(self._reward_buffer)
|
||||
wandb_metrics["train/mean_reward"] = sum(self._reward_buffer) / n
|
||||
wandb_metrics["train/mean_correctness"] = sum(self._correctness_buffer) / n
|
||||
wandb_metrics["train/mean_tool_usage"] = sum(self._tool_usage_buffer) / n
|
||||
wandb_metrics["train/mean_efficiency"] = sum(self._efficiency_buffer) / n
|
||||
wandb_metrics["train/mean_diversity"] = sum(self._diversity_buffer) / n
|
||||
wandb_metrics["train/total_rollouts"] = n
|
||||
|
||||
# Accuracy buckets
|
||||
wandb_metrics["train/correct_rate"] = (
|
||||
sum(1 for c in self._correctness_buffer if c >= 0.7) / n
|
||||
)
|
||||
wandb_metrics["train/tool_usage_rate"] = (
|
||||
sum(1 for t in self._tool_usage_buffer if t > 0) / n
|
||||
)
|
||||
|
||||
# Clear buffers
|
||||
self._reward_buffer.clear()
|
||||
self._correctness_buffer.clear()
|
||||
self._tool_usage_buffer.clear()
|
||||
self._efficiency_buffer.clear()
|
||||
self._diversity_buffer.clear()
|
||||
|
||||
await super().wandb_log(wandb_metrics)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Private helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _llm_judge(
|
||||
self,
|
||||
question: str,
|
||||
expected: str,
|
||||
model_answer: str,
|
||||
) -> float:
|
||||
"""
|
||||
Use the server's LLM to judge answer correctness.
|
||||
Falls back to keyword heuristic if LLM call fails.
|
||||
"""
|
||||
if not model_answer or not model_answer.strip():
|
||||
return 0.0
|
||||
|
||||
judge_prompt = (
|
||||
"You are an impartial judge evaluating the quality of an AI research answer.\n\n"
|
||||
f"Question: {question}\n\n"
|
||||
f"Reference answer: {expected}\n\n"
|
||||
f"Model answer: {model_answer}\n\n"
|
||||
"Score the model answer on a scale from 0.0 to 1.0 where:\n"
|
||||
" 1.0 = fully correct and complete\n"
|
||||
" 0.7 = mostly correct with minor gaps\n"
|
||||
" 0.4 = partially correct\n"
|
||||
" 0.1 = mentions relevant topic but wrong or very incomplete\n"
|
||||
" 0.0 = completely wrong or no answer\n\n"
|
||||
"Consider: factual accuracy, completeness, and relevance.\n"
|
||||
'Respond with ONLY a JSON object: {"score": <float>, "reason": "<one sentence>"}'
|
||||
)
|
||||
|
||||
try:
|
||||
response = await self.server.chat_completion(
|
||||
messages=[{"role": "user", "content": judge_prompt}],
|
||||
n=1,
|
||||
max_tokens=150,
|
||||
temperature=0.0,
|
||||
split="eval",
|
||||
)
|
||||
text = response.choices[0].message.content if response.choices else ""
|
||||
parsed = self._parse_judge_json(text)
|
||||
if parsed is not None:
|
||||
return float(parsed)
|
||||
except Exception as e:
|
||||
logger.debug(f"LLM judge failed: {e}. Using heuristic.")
|
||||
|
||||
return self._heuristic_score(expected, model_answer)
|
||||
|
||||
@staticmethod
|
||||
def _parse_judge_json(text: str) -> Optional[float]:
|
||||
"""Extract the score float from LLM judge JSON response."""
|
||||
try:
|
||||
clean = re.sub(r"```(?:json)?|```", "", text).strip()
|
||||
data = json.loads(clean)
|
||||
score = float(data.get("score", -1))
|
||||
if 0.0 <= score <= 1.0:
|
||||
return score
|
||||
except Exception:
|
||||
match = re.search(r'"score"\s*:\s*([0-9.]+)', text)
|
||||
if match:
|
||||
score = float(match.group(1))
|
||||
if 0.0 <= score <= 1.0:
|
||||
return score
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _heuristic_score(expected: str, model_answer: str) -> float:
|
||||
"""Lightweight keyword overlap score as fallback."""
|
||||
stopwords = {
|
||||
"the", "a", "an", "is", "are", "was", "were", "of", "in", "on",
|
||||
"at", "to", "for", "with", "and", "or", "but", "it", "its",
|
||||
"this", "that", "as", "by", "from", "be", "has", "have", "had",
|
||||
}
|
||||
|
||||
def tokenize(text: str) -> set:
|
||||
tokens = re.findall(r'\b\w+\b', text.lower())
|
||||
return {t for t in tokens if t not in stopwords and len(t) > 2}
|
||||
|
||||
expected_tokens = tokenize(expected)
|
||||
answer_tokens = tokenize(model_answer)
|
||||
|
||||
if not expected_tokens:
|
||||
return 0.5
|
||||
|
||||
overlap = len(expected_tokens & answer_tokens)
|
||||
union = len(expected_tokens | answer_tokens)
|
||||
|
||||
jaccard = overlap / union if union > 0 else 0.0
|
||||
recall = overlap / len(expected_tokens)
|
||||
return min(1.0, 0.4 * jaccard + 0.6 * recall)
|
||||
|
||||
@staticmethod
|
||||
def _extract_domains(text: str) -> set:
|
||||
"""Extract unique domains from URLs cited in the response."""
|
||||
urls = re.findall(r'https?://[^\s\)>\]"\']+', text)
|
||||
domains = set()
|
||||
for url in urls:
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc.lower().lstrip("www.")
|
||||
if domain:
|
||||
domains.add(domain)
|
||||
except Exception:
|
||||
pass
|
||||
return domains
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
|
||||
WebResearchEnv.cli()
|
||||
@@ -941,6 +941,14 @@ def load_gateway_config() -> GatewayConfig:
|
||||
if isinstance(ntc, list):
|
||||
ntc = ",".join(str(v) for v in ntc)
|
||||
os.environ["DISCORD_NO_THREAD_CHANNELS"] = str(ntc)
|
||||
# history_backfill: recover missed channel messages for shared sessions
|
||||
# when require_mention is active. Fetches messages between bot turns
|
||||
# and prepends them to the user message for context.
|
||||
if "history_backfill" in discord_cfg and not os.getenv("DISCORD_HISTORY_BACKFILL"):
|
||||
os.environ["DISCORD_HISTORY_BACKFILL"] = str(discord_cfg["history_backfill"]).lower()
|
||||
hbl = discord_cfg.get("history_backfill_limit")
|
||||
if hbl is not None and not os.getenv("DISCORD_HISTORY_BACKFILL_LIMIT"):
|
||||
os.environ["DISCORD_HISTORY_BACKFILL_LIMIT"] = str(hbl)
|
||||
# allow_mentions: granular control over what the bot can ping.
|
||||
# Safe defaults (no @everyone/roles) are applied in the adapter;
|
||||
# these YAML keys only override when set and let users opt back
|
||||
|
||||
@@ -356,15 +356,34 @@ class ResponseStore:
|
||||
# Evict oldest entries beyond max_size
|
||||
count = self._conn.execute("SELECT COUNT(*) FROM responses").fetchone()[0]
|
||||
if count > self._max_size:
|
||||
self._conn.execute(
|
||||
"DELETE FROM responses WHERE response_id IN "
|
||||
"(SELECT response_id FROM responses ORDER BY accessed_at ASC LIMIT ?)",
|
||||
(count - self._max_size,),
|
||||
)
|
||||
# Collect IDs that will be evicted
|
||||
evict_ids = [
|
||||
row[0]
|
||||
for row in self._conn.execute(
|
||||
"SELECT response_id FROM responses ORDER BY accessed_at ASC LIMIT ?",
|
||||
(count - self._max_size,),
|
||||
).fetchall()
|
||||
]
|
||||
if evict_ids:
|
||||
placeholders = ",".join("?" for _ in evict_ids)
|
||||
# Clear conversation mappings pointing to evicted responses
|
||||
self._conn.execute(
|
||||
f"DELETE FROM conversations WHERE response_id IN ({placeholders})",
|
||||
evict_ids,
|
||||
)
|
||||
# Delete evicted responses
|
||||
self._conn.execute(
|
||||
f"DELETE FROM responses WHERE response_id IN ({placeholders})",
|
||||
evict_ids,
|
||||
)
|
||||
self._conn.commit()
|
||||
|
||||
def delete(self, response_id: str) -> bool:
|
||||
"""Remove a response from the store. Returns True if found and deleted."""
|
||||
# Clear conversation mappings pointing to this response
|
||||
self._conn.execute(
|
||||
"DELETE FROM conversations WHERE response_id = ?", (response_id,)
|
||||
)
|
||||
cursor = self._conn.execute(
|
||||
"DELETE FROM responses WHERE response_id = ?", (response_id,)
|
||||
)
|
||||
|
||||
@@ -955,6 +955,12 @@ class MessageEvent:
|
||||
# Per-channel ephemeral system prompt (e.g. Discord channel_prompts).
|
||||
# Applied at API call time and never persisted to transcript history.
|
||||
channel_prompt: Optional[str] = None
|
||||
|
||||
# Channel context recovered by history backfill (e.g. messages between
|
||||
# bot turns that were missed due to require_mention). Kept separate
|
||||
# from ``text`` so the sender-prefix logic in run.py can operate on the
|
||||
# trigger message alone, then prepend this context afterward.
|
||||
channel_context: Optional[str] = None
|
||||
|
||||
# Internal flag — set for synthetic events (e.g. background process
|
||||
# completion notifications) that must bypass user authorization checks.
|
||||
@@ -2955,9 +2961,25 @@ class BasePlatformAdapter(ABC):
|
||||
merge_pending_message_event(self._pending_messages, session_key, event)
|
||||
return # Don't interrupt now - will run after current task completes
|
||||
|
||||
# Default behavior for non-photo follow-ups: interrupt the running agent
|
||||
# Default behavior for non-photo follow-ups: interrupt the running agent.
|
||||
#
|
||||
# Use merge_text=True so rapid TEXT follow-ups (#4469) accumulate
|
||||
# into the single pending slot instead of clobbering each other.
|
||||
# Without merging, three rapid messages "A", "B", "C" land like:
|
||||
# _pending_messages[k] = A (interrupts)
|
||||
# _pending_messages[k] = B (replaces A before consumer reads)
|
||||
# _pending_messages[k] = C (replaces B)
|
||||
# ...and only "C" reaches the next turn. merge_pending_message_event
|
||||
# already does the right thing for photo/media bursts; the
|
||||
# ``merge_text=True`` flag extends that to plain TEXT events.
|
||||
# Same shape as the Telegram bursty-grace path in gateway/run.py.
|
||||
logger.debug("[%s] New message while session %s is active — triggering interrupt", self.name, session_key)
|
||||
self._pending_messages[session_key] = event
|
||||
merge_pending_message_event(
|
||||
self._pending_messages,
|
||||
session_key,
|
||||
event,
|
||||
merge_text=True,
|
||||
)
|
||||
# Signal the interrupt (the processing task checks this)
|
||||
self._active_sessions[session_key].set()
|
||||
return # Don't process now - will be handled after current task finishes
|
||||
|
||||
@@ -589,6 +589,10 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
# chunk only, default), "all" (reply-reference on every chunk).
|
||||
self._reply_to_mode: str = getattr(config, 'reply_to_mode', 'first') or 'first'
|
||||
self._slash_commands: bool = self.config.extra.get("slash_commands", True)
|
||||
# In-memory cache of the bot's last message ID per channel, used by
|
||||
# history backfill to skip the full scan on hot paths. Falls back to
|
||||
# scanning channel.history() on cache miss (cold start / restart).
|
||||
self._last_self_message_id: Dict[str, str] = {}
|
||||
|
||||
async def connect(self) -> bool:
|
||||
"""Connect to Discord and start receiving events."""
|
||||
@@ -1459,6 +1463,12 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
raise
|
||||
message_ids.append(str(msg.id))
|
||||
|
||||
# Track the last message we sent in this channel for history
|
||||
# backfill — avoids a full channel.history() scan on hot paths.
|
||||
if message_ids:
|
||||
_target_id = thread_id or chat_id
|
||||
self._last_self_message_id[_target_id] = message_ids[-1]
|
||||
|
||||
return SendResult(
|
||||
success=True,
|
||||
message_id=message_ids[0] if message_ids else None,
|
||||
@@ -3596,6 +3606,134 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
return bool(configured)
|
||||
return os.getenv("DISCORD_THREAD_REQUIRE_MENTION", "false").lower() in ("true", "1", "yes", "on")
|
||||
|
||||
def _discord_history_backfill(self) -> bool:
|
||||
"""Return whether history backfill is enabled for shared sessions."""
|
||||
configured = self.config.extra.get("history_backfill")
|
||||
if configured is not None:
|
||||
if isinstance(configured, str):
|
||||
return configured.lower() not in ("false", "0", "no", "off")
|
||||
return bool(configured)
|
||||
return os.getenv("DISCORD_HISTORY_BACKFILL", "true").lower() in ("true", "1", "yes")
|
||||
|
||||
def _discord_history_backfill_limit(self) -> int:
|
||||
"""Return the max number of messages to scan backwards for context.
|
||||
|
||||
In practice the scan usually stops much earlier — at the bot's own
|
||||
last message in the channel (the natural partition point). This
|
||||
limit is a safety cap for cold starts and long gaps where no prior
|
||||
bot message exists in recent history.
|
||||
"""
|
||||
configured = self.config.extra.get("history_backfill_limit")
|
||||
if configured is not None:
|
||||
try:
|
||||
return int(configured)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
raw = os.getenv("DISCORD_HISTORY_BACKFILL_LIMIT", "50")
|
||||
try:
|
||||
return int(raw)
|
||||
except (ValueError, TypeError):
|
||||
return 50
|
||||
|
||||
async def _fetch_channel_context(
|
||||
self,
|
||||
channel: Any,
|
||||
before: "DiscordMessage",
|
||||
) -> str:
|
||||
"""Fetch recent channel messages for conversational context.
|
||||
|
||||
Scans backwards from *before* and collects messages until it hits
|
||||
a message sent by this bot (the natural partition point between
|
||||
bot turns) or reaches ``history_backfill_limit``.
|
||||
|
||||
Returns a formatted block like::
|
||||
|
||||
[Recent channel messages]
|
||||
[Alice] some message
|
||||
[Bob [bot]] another message
|
||||
|
||||
Returns an empty string if no context is available.
|
||||
"""
|
||||
limit = self._discord_history_backfill_limit()
|
||||
if limit <= 0:
|
||||
return ""
|
||||
|
||||
# Determine which bot messages to include in context
|
||||
allow_bots_raw = os.getenv("DISCORD_ALLOW_BOTS", "none").lower().strip()
|
||||
include_other_bots = allow_bots_raw != "none"
|
||||
|
||||
# Use the in-memory cache to narrow the fetch window on hot paths.
|
||||
# If we know our last message ID in this channel, pass it as `after`
|
||||
# to avoid scanning the full limit. Falls back to scanning on cache
|
||||
# miss (cold start / restart).
|
||||
# Guard: only use the cache when it's chronologically before the
|
||||
# trigger — Discord snowflake IDs are monotonically increasing, so
|
||||
# a simple int comparison suffices.
|
||||
channel_id = str(getattr(channel, "id", ""))
|
||||
_cached_id = self._last_self_message_id.get(channel_id)
|
||||
_after_obj = None
|
||||
try:
|
||||
if _cached_id and int(_cached_id) < int(before.id):
|
||||
_after_obj = discord.Object(id=int(_cached_id))
|
||||
except (ValueError, TypeError):
|
||||
pass # Malformed cache entry — fall back to cold-start scan
|
||||
|
||||
try:
|
||||
collected = []
|
||||
# IMPORTANT: pass oldest_first=False explicitly. discord.py 2.x
|
||||
# silently flips the default to True when `after=` is supplied,
|
||||
# which would select the *earliest* N messages after our last
|
||||
# response instead of the *latest* N before the trigger. In
|
||||
# high-traffic windows that returns stale tool traces and drops
|
||||
# the actual final answer. See the regression test
|
||||
# `test_fetch_channel_context_cache_uses_latest_window_when_after_set`.
|
||||
async for msg in channel.history(
|
||||
limit=limit,
|
||||
before=before,
|
||||
after=_after_obj,
|
||||
oldest_first=False,
|
||||
):
|
||||
# Stop at our own message — this is the partition point.
|
||||
# Everything before this is already in the session transcript.
|
||||
# (Redundant when _after_obj is set, but needed for cold start.)
|
||||
if msg.author == self._client.user:
|
||||
break
|
||||
|
||||
# Skip system messages (pins, joins, thread renames, etc.)
|
||||
if msg.type not in (discord.MessageType.default, discord.MessageType.reply):
|
||||
continue
|
||||
|
||||
# Respect DISCORD_ALLOW_BOTS for other bots.
|
||||
# For history context, "mentions" is treated as "all" — we are
|
||||
# deciding what context to show, not whether to respond.
|
||||
if getattr(msg.author, "bot", False) and not include_other_bots:
|
||||
continue
|
||||
|
||||
content = getattr(msg, "clean_content", msg.content) or ""
|
||||
if not content and msg.attachments:
|
||||
content = "(attachment)"
|
||||
if not content:
|
||||
continue
|
||||
|
||||
name = msg.author.display_name
|
||||
if getattr(msg.author, "bot", False):
|
||||
name = f"{name} [bot]"
|
||||
collected.append(f"[{name}] {content}")
|
||||
|
||||
if not collected:
|
||||
return ""
|
||||
|
||||
# channel.history returns newest-first (oldest_first=False); reverse for chronological order
|
||||
collected.reverse()
|
||||
return "[Recent channel messages]\n" + "\n".join(collected)
|
||||
|
||||
except discord.Forbidden:
|
||||
logger.debug("[%s] Missing permissions to fetch channel history", self.name)
|
||||
return ""
|
||||
except Exception as e:
|
||||
logger.warning("[%s] Failed to fetch channel history: %s", self.name, e)
|
||||
return ""
|
||||
|
||||
def _thread_parent_channel(self, channel: Any) -> Any:
|
||||
"""Return the parent text channel when invoked from a thread."""
|
||||
return getattr(channel, "parent", None) or channel
|
||||
@@ -4504,9 +4642,50 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
if pending_text_injection:
|
||||
event_text = f"{pending_text_injection}\n\n{event_text}" if event_text else pending_text_injection
|
||||
|
||||
# ── History backfill ─────────────────────────────────────────
|
||||
# When require_mention is active, the bot only processes messages
|
||||
# that @mention it. Messages in the channel between bot turns are
|
||||
# invisible to the session transcript. To recover that context,
|
||||
# fetch recent channel history and prepend it to the user message.
|
||||
#
|
||||
# The fetch window is: everything after the bot's last message in
|
||||
# the channel up to (but not including) the current trigger. On
|
||||
# cold start (no prior bot message found), fetch the last N messages
|
||||
# and stop at the first self-message encountered.
|
||||
#
|
||||
# Threads naturally scope to thread-only history (channel.history()
|
||||
# on a thread returns only that thread's messages). DMs are skipped
|
||||
# because every DM message triggers the bot — there's no mention gap
|
||||
# to fill; the session transcript already has everything.
|
||||
#
|
||||
# Per-user sessions also benefit: Alice's session is missing the
|
||||
# other-channel-participants' context, and her own messages from
|
||||
# before she mentioned the bot. Backfill fills that gap.
|
||||
#
|
||||
# Messages that arrive while the bot is processing (between trigger
|
||||
# and response) are not captured — this is an accepted simplification
|
||||
# to keep the partition rule clean.
|
||||
_channel_context = None
|
||||
_is_dm = isinstance(message.channel, discord.DMChannel)
|
||||
if not _is_dm:
|
||||
_needed_mention = (
|
||||
require_mention
|
||||
and not is_free_channel
|
||||
and not in_bot_thread
|
||||
)
|
||||
_backfill_enabled = self._discord_history_backfill()
|
||||
if _needed_mention and _backfill_enabled:
|
||||
_backfill_text = await self._fetch_channel_context(
|
||||
message.channel, before=message,
|
||||
)
|
||||
if _backfill_text:
|
||||
_channel_context = _backfill_text
|
||||
|
||||
# Defense-in-depth: prevent empty user messages from entering session
|
||||
# (can happen when user sends @mention-only with no other text)
|
||||
if not event_text or not event_text.strip():
|
||||
# (can happen when user sends @mention-only with no other text).
|
||||
# When channel_context is present, a bare mention means "catch me up"
|
||||
# — the context IS the message, so skip the placeholder.
|
||||
if (not event_text or not event_text.strip()) and not _channel_context:
|
||||
event_text = "(The user sent a message with no text content)"
|
||||
|
||||
_chan = message.channel
|
||||
@@ -4535,6 +4714,7 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
timestamp=message.created_at,
|
||||
auto_skill=_skills,
|
||||
channel_prompt=_channel_prompt,
|
||||
channel_context=_channel_context,
|
||||
)
|
||||
|
||||
# Track thread participation so the bot won't require @mention for
|
||||
|
||||
+17
-27
@@ -2273,11 +2273,7 @@ class FeishuAdapter(BasePlatformAdapter):
|
||||
daemon=True,
|
||||
).start()
|
||||
return
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self._handle_message_event_data(data),
|
||||
loop,
|
||||
)
|
||||
future.add_done_callback(self._log_background_failure)
|
||||
self._submit_on_loop(loop, self._handle_message_event_data(data))
|
||||
|
||||
def _enqueue_pending_inbound_event(self, data: Any) -> bool:
|
||||
"""Append an event to the pending-inbound queue.
|
||||
@@ -2353,16 +2349,12 @@ class FeishuAdapter(BasePlatformAdapter):
|
||||
dispatched = 0
|
||||
requeue: List[Any] = []
|
||||
for event in batch:
|
||||
try:
|
||||
fut = asyncio.run_coroutine_threadsafe(
|
||||
self._handle_message_event_data(event),
|
||||
loop,
|
||||
)
|
||||
fut.add_done_callback(self._log_background_failure)
|
||||
if self._submit_on_loop(
|
||||
loop, self._handle_message_event_data(event)
|
||||
):
|
||||
dispatched += 1
|
||||
except RuntimeError:
|
||||
# Loop closed between check and submit — requeue
|
||||
# and poll again.
|
||||
else:
|
||||
# Loop closed/unavailable — requeue and poll again.
|
||||
requeue.append(event)
|
||||
if requeue:
|
||||
with self._pending_inbound_lock:
|
||||
@@ -2466,11 +2458,10 @@ class FeishuAdapter(BasePlatformAdapter):
|
||||
if not self._loop_accepts_callbacks(loop):
|
||||
logger.warning("[Feishu] Dropping drive comment event before adapter loop is ready")
|
||||
return
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
handle_drive_comment_event(self._client, data, self_open_id=self._bot_open_id),
|
||||
self._submit_on_loop(
|
||||
loop,
|
||||
handle_drive_comment_event(self._client, data, self_open_id=self._bot_open_id),
|
||||
)
|
||||
future.add_done_callback(self._log_background_failure)
|
||||
|
||||
def _on_reaction_event(self, event_type: str, data: Any) -> None:
|
||||
"""Route user reactions on bot messages as synthetic text events."""
|
||||
@@ -2498,11 +2489,7 @@ class FeishuAdapter(BasePlatformAdapter):
|
||||
or bool(getattr(loop, "is_closed", lambda: False)())
|
||||
):
|
||||
return
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self._handle_reaction_event(event_type, data),
|
||||
loop,
|
||||
)
|
||||
future.add_done_callback(self._log_background_failure)
|
||||
self._submit_on_loop(loop, self._handle_reaction_event(event_type, data))
|
||||
|
||||
def _on_card_action_trigger(self, data: Any) -> Any:
|
||||
"""Handle card-action callback from the Feishu SDK (synchronous).
|
||||
@@ -2548,11 +2535,14 @@ class FeishuAdapter(BasePlatformAdapter):
|
||||
|
||||
def _submit_on_loop(self, loop: Any, coro: Any) -> bool:
|
||||
"""Schedule background work on the adapter loop with shared failure logging."""
|
||||
try:
|
||||
future = asyncio.run_coroutine_threadsafe(coro, loop)
|
||||
except Exception:
|
||||
coro.close()
|
||||
logger.warning("[Feishu] Failed to schedule background callback work", exc_info=True)
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
future = safe_schedule_threadsafe(
|
||||
coro, loop,
|
||||
logger=logger,
|
||||
log_message="[Feishu] Failed to schedule background callback work",
|
||||
log_level=logging.WARNING,
|
||||
)
|
||||
if future is None:
|
||||
return False
|
||||
future.add_done_callback(self._log_background_failure)
|
||||
return True
|
||||
|
||||
@@ -2785,7 +2785,10 @@ class SlackAdapter(BasePlatformAdapter):
|
||||
from hermes_cli.commands import slack_subcommand_map
|
||||
subcommand_map = slack_subcommand_map()
|
||||
subcommand_map["compact"] = "/compress"
|
||||
first_word = text.split()[0] if text else ""
|
||||
# Guard against whitespace-only text where ``text`` is truthy but
|
||||
# ``text.split()`` returns ``[]`` (e.g. user sends ``/hermes ``).
|
||||
parts = text.split() if text else []
|
||||
first_word = parts[0] if parts else ""
|
||||
if first_word in subcommand_map:
|
||||
rest = text[len(first_word):].strip()
|
||||
text = f"{subcommand_map[first_word]} {rest}".strip() if rest else subcommand_map[first_word]
|
||||
|
||||
@@ -493,13 +493,45 @@ class WhatsAppAdapter(BasePlatformAdapter):
|
||||
"""
|
||||
if not check_whatsapp_requirements():
|
||||
logger.warning("[%s] Node.js not found. WhatsApp requires Node.js.", self.name)
|
||||
self._set_fatal_error(
|
||||
"whatsapp_node_missing",
|
||||
"Node.js is not installed — install Node.js and re-run `hermes gateway`.",
|
||||
retryable=False,
|
||||
)
|
||||
return False
|
||||
|
||||
bridge_path = Path(self._bridge_script)
|
||||
if not bridge_path.exists():
|
||||
logger.warning("[%s] Bridge script not found: %s", self.name, bridge_path)
|
||||
self._set_fatal_error(
|
||||
"whatsapp_bridge_missing",
|
||||
f"WhatsApp bridge script missing at {bridge_path}.",
|
||||
retryable=False,
|
||||
)
|
||||
return False
|
||||
|
||||
|
||||
# Pre-flight: skip the 30s bridge bootstrap entirely if the user
|
||||
# never finished pairing. Without creds.json the bridge prints
|
||||
# QR codes to its log file and never reaches status:connected,
|
||||
# so every gateway restart paid the 30s timeout + queued WhatsApp
|
||||
# for indefinite retries. Mark non-retryable so the user gets a
|
||||
# clear "run hermes whatsapp" message instead of the watcher
|
||||
# silently hammering an unconfigured platform.
|
||||
creds_path = self._session_path / "creds.json"
|
||||
if not creds_path.exists():
|
||||
logger.warning(
|
||||
"[%s] WhatsApp is enabled but not paired (no creds.json at %s). "
|
||||
"Run `hermes whatsapp` to pair, or remove WHATSAPP_ENABLED from "
|
||||
"your .env to disable.",
|
||||
self.name, creds_path,
|
||||
)
|
||||
self._set_fatal_error(
|
||||
"whatsapp_not_paired",
|
||||
"WhatsApp enabled but not paired — run `hermes whatsapp` to pair.",
|
||||
retryable=False,
|
||||
)
|
||||
return False
|
||||
|
||||
logger.info("[%s] Bridge found at %s", self.name, bridge_path)
|
||||
|
||||
# Acquire scoped lock to prevent duplicate sessions
|
||||
|
||||
+150
-34
@@ -147,6 +147,9 @@ _YB_RES_REF_RE = re.compile(
|
||||
r"\[(image|voice|video|file(?::[^|\]]*)?)\|ybres:([A-Za-z0-9_\-]+)\]"
|
||||
)
|
||||
|
||||
# Media kinds that can be resolved and injected into the model context
|
||||
_RESOLVABLE_MEDIA_KINDS = frozenset({"image", "file"})
|
||||
|
||||
# Strip page indicators like (1/3) appended by BasePlatformAdapter
|
||||
_INDICATOR_RE = re.compile(r'\s*\(\d+/\d+\)$')
|
||||
|
||||
@@ -925,6 +928,7 @@ class InboundContext:
|
||||
# Populated by QuoteContextMiddleware
|
||||
reply_to_message_id: Optional[str] = None
|
||||
reply_to_text: Optional[str] = None
|
||||
quote_media_refs: list = dc_field(default_factory=list) # List of (rid, kind, filename)
|
||||
|
||||
# Populated by MediaResolveMiddleware
|
||||
media_urls: list = dc_field(default_factory=list)
|
||||
@@ -1645,6 +1649,25 @@ class ExtractContentMiddleware(InboundMiddleware):
|
||||
return None
|
||||
return f"[link: {link} | visit link for full content]"
|
||||
|
||||
@staticmethod
|
||||
def _parse_resource_id(url: str) -> str:
|
||||
"""Extract resourceId from Yuanbao resource URL query parameters.
|
||||
|
||||
Args:
|
||||
url: Resource URL (e.g., https://...?resourceId=abc123)
|
||||
|
||||
Returns:
|
||||
Resource ID string, or empty string if not found
|
||||
"""
|
||||
if not url:
|
||||
return ""
|
||||
try:
|
||||
query = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
|
||||
ids = query.get("resourceId") or query.get("resourceid") or []
|
||||
return str(ids[0]).strip() if ids else ""
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
@classmethod
|
||||
def _extract_text(cls, msg_body: list) -> str:
|
||||
"""Extract plain text content from MsgBody.
|
||||
@@ -1668,14 +1691,35 @@ class ExtractContentMiddleware(InboundMiddleware):
|
||||
if text:
|
||||
parts.append(text)
|
||||
elif elem_type == "TIMImageElem":
|
||||
parts.append("[image]")
|
||||
# Extract resourceId from image_info_array URL
|
||||
image_info_array = content.get("image_info_array")
|
||||
if not isinstance(image_info_array, list):
|
||||
image_info_array = []
|
||||
image_info = None
|
||||
# Prefer medium image (index 1), fallback to index 0
|
||||
if len(image_info_array) > 1 and isinstance(image_info_array[1], dict):
|
||||
image_info = image_info_array[1]
|
||||
elif len(image_info_array) > 0 and isinstance(image_info_array[0], dict):
|
||||
image_info = image_info_array[0]
|
||||
image_url = str((image_info or {}).get("url") or "").strip()
|
||||
rid = cls._parse_resource_id(image_url)
|
||||
parts.append(f"[image|ybres:{rid}]" if rid else "[image]")
|
||||
elif elem_type == "TIMFileElem":
|
||||
filename = content.get("file_name", content.get("fileName", content.get("filename", "")))
|
||||
parts.append(f"[file: {filename}]" if filename else "[file]")
|
||||
file_url = str(content.get("url") or "").strip()
|
||||
rid = cls._parse_resource_id(file_url)
|
||||
if rid:
|
||||
parts.append(f"[file:{filename}|ybres:{rid}]" if filename else f"[file|ybres:{rid}]")
|
||||
else:
|
||||
parts.append(f"[file: {filename}]" if filename else "[file]")
|
||||
elif elem_type == "TIMSoundElem":
|
||||
parts.append("[voice]")
|
||||
sound_url = str(content.get("url") or "").strip()
|
||||
rid = cls._parse_resource_id(sound_url)
|
||||
parts.append(f"[voice|ybres:{rid}]" if rid else "[voice]")
|
||||
elif elem_type == "TIMVideoFileElem":
|
||||
parts.append("[video]")
|
||||
video_url = str(content.get("url") or "").strip()
|
||||
rid = cls._parse_resource_id(video_url)
|
||||
parts.append(f"[video|ybres:{rid}]" if rid else "[video]")
|
||||
elif elem_type == "TIMCustomElem":
|
||||
data_val = content.get("data", "")
|
||||
if data_val:
|
||||
@@ -2132,22 +2176,23 @@ class QuoteContextMiddleware(InboundMiddleware):
|
||||
name = "quote-context"
|
||||
|
||||
@staticmethod
|
||||
def _extract_quote_context(cloud_custom_data: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
def _extract_quote_context(cloud_custom_data: str) -> Tuple[Optional[str], Optional[str], list]:
|
||||
"""Extract quote context, mapping to MessageEvent.reply_to_*.
|
||||
|
||||
Returns:
|
||||
(reply_to_message_id, reply_to_text)
|
||||
(reply_to_message_id, reply_to_text, quote_media_refs)
|
||||
where quote_media_refs is a list of (rid, kind, filename) tuples
|
||||
"""
|
||||
if not cloud_custom_data:
|
||||
return None, None
|
||||
return None, None, []
|
||||
try:
|
||||
parsed = json.loads(cloud_custom_data)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
return None, None
|
||||
return None, None, []
|
||||
|
||||
quote = parsed.get("quote") if isinstance(parsed, dict) else None
|
||||
if not isinstance(quote, dict):
|
||||
return None, None
|
||||
return None, None, []
|
||||
|
||||
# type=2 corresponds to image reference; desc may be empty, provide a placeholder.
|
||||
quote_type = int(quote.get("type") or 0)
|
||||
@@ -2155,15 +2200,26 @@ class QuoteContextMiddleware(InboundMiddleware):
|
||||
if quote_type == 2 and not desc:
|
||||
desc = "[image]"
|
||||
if not desc:
|
||||
return None, None
|
||||
return None, None, []
|
||||
|
||||
quote_id = str(quote.get("id") or "").strip() or None
|
||||
sender = str(quote.get("sender_nickname") or quote.get("sender_id") or "").strip()
|
||||
quote_text = f"{sender}: {desc}" if sender else desc
|
||||
return quote_id, quote_text
|
||||
|
||||
# Extract media references from desc using _YB_RES_REF_RE regex
|
||||
media_refs: list = []
|
||||
for m in _YB_RES_REF_RE.finditer(desc):
|
||||
head = m.group(1) # "image" | "file:<name>" | "voice" | "video"
|
||||
rid = m.group(2)
|
||||
kind, _, filename = head.partition(":")
|
||||
kind = kind.strip()
|
||||
media_refs.append((rid, kind, filename.strip()))
|
||||
|
||||
return quote_id, quote_text, media_refs
|
||||
|
||||
async def handle(self, ctx: InboundContext, next_fn) -> None:
|
||||
ctx.reply_to_message_id, ctx.reply_to_text = self._extract_quote_context(ctx.cloud_custom_data)
|
||||
ctx.reply_to_message_id, ctx.reply_to_text, ctx.quote_media_refs = self._extract_quote_context(ctx.cloud_custom_data)
|
||||
|
||||
await next_fn()
|
||||
|
||||
|
||||
@@ -2332,7 +2388,7 @@ class MediaResolveMiddleware(InboundMiddleware):
|
||||
for ref in media_refs:
|
||||
kind = str(ref.get("kind") or "").strip().lower()
|
||||
url = str(ref.get("url") or "").strip()
|
||||
if kind not in {"image", "file"} or not url:
|
||||
if kind not in _RESOLVABLE_MEDIA_KINDS or not url:
|
||||
continue
|
||||
|
||||
try:
|
||||
@@ -2391,7 +2447,7 @@ class MediaResolveMiddleware(InboundMiddleware):
|
||||
rid = m.group(2)
|
||||
kind, _, filename = head.partition(":")
|
||||
kind = kind.strip()
|
||||
if kind not in {"image", "file"}:
|
||||
if kind not in _RESOLVABLE_MEDIA_KINDS:
|
||||
continue
|
||||
if rid in seen:
|
||||
continue
|
||||
@@ -2458,26 +2514,82 @@ class DispatchMiddleware(InboundMiddleware):
|
||||
media_urls = list(ctx.media_urls)
|
||||
media_types = list(ctx.media_types)
|
||||
|
||||
# Backfill observed media from recent transcript history
|
||||
extra_img_urls: List[str] = []
|
||||
extra_img_mimes: List[str] = []
|
||||
try:
|
||||
extra_img_urls, extra_img_mimes = await MediaResolveMiddleware._collect_observed_media(
|
||||
adapter, ctx.source,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"[%s] observed-image hydration raised, continuing anyway: %s",
|
||||
adapter.name, exc,
|
||||
)
|
||||
if extra_img_urls:
|
||||
current = set(media_urls)
|
||||
for u, m in zip(extra_img_urls, extra_img_mimes):
|
||||
if u in current:
|
||||
# If user quoted a message (reply_to_message_id is set), resolve only
|
||||
# quote_media_refs to avoid injecting unrelated history media.
|
||||
# Otherwise, backfill observed media from recent transcript history.
|
||||
if ctx.reply_to_message_id is not None:
|
||||
# Fallback: if desc didn't contain ybres refs, look up transcript
|
||||
if not ctx.quote_media_refs:
|
||||
try:
|
||||
store = getattr(adapter, "_session_store", None)
|
||||
if store:
|
||||
session_entry = store.get_or_create_session(ctx.source)
|
||||
history = store.load_transcript(session_entry.session_id)
|
||||
for msg in reversed(history or []):
|
||||
mid = msg.get("message_id", "")
|
||||
if mid and mid == ctx.reply_to_message_id:
|
||||
_content = msg.get("content", "")
|
||||
if isinstance(_content, str) and "|ybres:" in _content:
|
||||
for m in _YB_RES_REF_RE.finditer(_content):
|
||||
head = m.group(1)
|
||||
rid = m.group(2)
|
||||
kind, _, filename = head.partition(":")
|
||||
kind = kind.strip()
|
||||
if kind in _RESOLVABLE_MEDIA_KINDS:
|
||||
ctx.quote_media_refs.append((rid, kind, filename.strip()))
|
||||
break
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"[%s] quote transcript lookup failed: %s",
|
||||
adapter.name, exc,
|
||||
)
|
||||
# User quoted a message — resolve only media from the quote
|
||||
for rid, kind, filename in ctx.quote_media_refs:
|
||||
if kind not in _RESOLVABLE_MEDIA_KINDS:
|
||||
continue
|
||||
media_urls.append(u)
|
||||
media_types.append(m)
|
||||
current.add(u)
|
||||
try:
|
||||
fresh_url = await MediaResolveMiddleware._resolve_by_resource_id(adapter, rid)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"[%s] quote media resolve failed: rid=%s kind=%s err=%s",
|
||||
adapter.name, rid, kind, exc,
|
||||
)
|
||||
continue
|
||||
cached = await MediaResolveMiddleware._download_and_cache(
|
||||
adapter,
|
||||
fetch_url=fresh_url,
|
||||
kind=kind,
|
||||
file_name=filename or None,
|
||||
log_tag=f"quote rid={rid}",
|
||||
)
|
||||
if cached is None:
|
||||
continue
|
||||
path, mime = cached
|
||||
# Avoid duplicates
|
||||
if path not in media_urls:
|
||||
media_urls.append(path)
|
||||
media_types.append(mime)
|
||||
else:
|
||||
# No quote — backfill observed media from recent transcript history
|
||||
extra_img_urls: List[str] = []
|
||||
extra_img_mimes: List[str] = []
|
||||
try:
|
||||
extra_img_urls, extra_img_mimes = await MediaResolveMiddleware._collect_observed_media(
|
||||
adapter, ctx.source,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"[%s] observed-image hydration raised, continuing anyway: %s",
|
||||
adapter.name, exc,
|
||||
)
|
||||
if extra_img_urls:
|
||||
current = set(media_urls)
|
||||
for u, m in zip(extra_img_urls, extra_img_mimes):
|
||||
if u in current:
|
||||
continue
|
||||
media_urls.append(u)
|
||||
media_types.append(m)
|
||||
current.add(u)
|
||||
|
||||
# Replace [kind|ybres:xxx] anchors with local cache paths so
|
||||
# the transcript records usable paths for the model.
|
||||
@@ -2506,7 +2618,11 @@ class DispatchMiddleware(InboundMiddleware):
|
||||
|
||||
event = MessageEvent(
|
||||
text=_patched_event_text,
|
||||
message_type=ctx.msg_type,
|
||||
message_type=(
|
||||
MessageType.DOCUMENT
|
||||
if any(mt.startswith(("application/", "text/")) for mt in media_types)
|
||||
else ctx.msg_type
|
||||
),
|
||||
source=ctx.source,
|
||||
message_id=ctx.msg_id or None,
|
||||
raw_message=ctx.push,
|
||||
|
||||
+345
-124
@@ -50,6 +50,7 @@ from typing import Dict, Optional, Any, List, Union
|
||||
# gateway is a long-running daemon, so its boot cost matters less than
|
||||
# preserving the established test-patch surface.
|
||||
from agent.account_usage import fetch_account_usage, render_account_usage_lines
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
from agent.i18n import t
|
||||
from hermes_cli.config import cfg_get
|
||||
|
||||
@@ -1989,21 +1990,21 @@ class GatewayRunner:
|
||||
await self.stop()
|
||||
elif not self.adapters and self._failed_platforms:
|
||||
# All platforms are down and queued for background reconnection.
|
||||
# If the error is retryable, exit with failure so systemd Restart=on-failure
|
||||
# can restart the process. Otherwise stay alive and keep retrying in background.
|
||||
if adapter.fatal_error_retryable:
|
||||
self._exit_reason = adapter.fatal_error_message or "All messaging platforms failed with retryable errors"
|
||||
self._exit_with_failure = True
|
||||
logger.error(
|
||||
"All messaging platforms failed with retryable errors. "
|
||||
"Shutting down gateway for service restart (systemd will retry)."
|
||||
)
|
||||
await self.stop()
|
||||
else:
|
||||
logger.warning(
|
||||
"No connected messaging platforms remain, but %d platform(s) queued for reconnection",
|
||||
len(self._failed_platforms),
|
||||
)
|
||||
# Keep the gateway alive so:
|
||||
# • cron jobs still run
|
||||
# • the reconnect watcher can recover platforms when the
|
||||
# underlying problem clears (proxy comes back, user runs
|
||||
# `hermes whatsapp`, etc.)
|
||||
# We used to exit-with-failure here to trigger systemd restart,
|
||||
# but that converted a transient outage into a restart loop and
|
||||
# killed in-process state every time. The reconnect watcher
|
||||
# already handles long-running recovery — let it do its job.
|
||||
logger.warning(
|
||||
"No connected messaging platforms remain, but %d platform(s) "
|
||||
"queued for reconnection — gateway staying alive, watcher will "
|
||||
"retry in background.",
|
||||
len(self._failed_platforms),
|
||||
)
|
||||
|
||||
def _request_clean_exit(self, reason: str) -> None:
|
||||
self._exit_cleanly = True
|
||||
@@ -2179,6 +2180,73 @@ class GatewayRunner:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Per-platform circuit breaker (pause/resume) — used by the reconnect
|
||||
# watcher when a retryable failure recurs past a threshold, and by the
|
||||
# /platform pause|resume slash command for manual control.
|
||||
# ------------------------------------------------------------------
|
||||
def _pause_failed_platform(self, platform, *, reason: str = "") -> None:
|
||||
"""Mark a queued platform as paused — keep it in ``_failed_platforms``
|
||||
but stop the reconnect watcher from hammering it.
|
||||
|
||||
Used by the circuit breaker after ``_PAUSE_AFTER_FAILURES`` consecutive
|
||||
retryable failures, and by ``/platform pause <name>`` for manual
|
||||
intervention. Paused platforms are surfaced in ``/platform list``
|
||||
and resumed with ``/platform resume <name>``.
|
||||
"""
|
||||
info = getattr(self, "_failed_platforms", {}).get(platform)
|
||||
if info is None:
|
||||
return
|
||||
if info.get("paused"):
|
||||
return
|
||||
info["paused"] = True
|
||||
info["pause_reason"] = reason or "auto-paused after repeated failures"
|
||||
# Push next_retry far enough out that even if "paused" is missed
|
||||
# by a stale code path, the watcher won't fire on it.
|
||||
info["next_retry"] = float("inf")
|
||||
try:
|
||||
self._update_platform_runtime_status(
|
||||
platform.value,
|
||||
platform_state="paused",
|
||||
error_code=None,
|
||||
error_message=info["pause_reason"],
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
logger.warning(
|
||||
"%s paused after %d consecutive failures (%s) — "
|
||||
"fix the underlying issue then run `/platform resume %s` "
|
||||
"to retry, or `hermes gateway restart` to restart the gateway.",
|
||||
platform.value, info.get("attempts", 0),
|
||||
info["pause_reason"], platform.value,
|
||||
)
|
||||
|
||||
def _resume_paused_platform(self, platform) -> bool:
|
||||
"""Unpause a platform — reset its attempt counter and schedule an
|
||||
immediate retry. Returns True if the platform was paused and is
|
||||
now queued; False if it wasn't paused (or wasn't in the queue).
|
||||
"""
|
||||
info = getattr(self, "_failed_platforms", {}).get(platform)
|
||||
if info is None:
|
||||
return False
|
||||
if not info.get("paused"):
|
||||
return False
|
||||
info["paused"] = False
|
||||
info.pop("pause_reason", None)
|
||||
info["attempts"] = 0
|
||||
info["next_retry"] = time.monotonic() # retry on next watcher tick
|
||||
try:
|
||||
self._update_platform_runtime_status(
|
||||
platform.value,
|
||||
platform_state="retrying",
|
||||
error_code=None,
|
||||
error_message=None,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
logger.info("%s resumed — retrying on next watcher tick", platform.value)
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def _load_prefill_messages() -> List[Dict[str, Any]]:
|
||||
"""Load ephemeral prefill messages from config or env var.
|
||||
@@ -3612,16 +3680,32 @@ class GatewayRunner:
|
||||
return True
|
||||
if enabled_platform_count > 0:
|
||||
if startup_retryable_errors:
|
||||
# At least one platform attempted a connection and failed —
|
||||
# this is a real startup error that should block the gateway.
|
||||
# All enabled platforms hit retryable failures (network
|
||||
# blip, bridge not paired, npm install timeout, etc.).
|
||||
# Keep the gateway alive so:
|
||||
# • cron jobs still run
|
||||
# • the reconnect watcher gets a chance to recover the
|
||||
# failing platforms once the underlying problem is
|
||||
# fixed (e.g. user runs `hermes whatsapp`, fixes
|
||||
# proxy, etc.)
|
||||
# Exiting here used to convert a single misconfigured
|
||||
# platform into an infinite systemd restart loop.
|
||||
reason = "; ".join(startup_retryable_errors)
|
||||
logger.error("Gateway failed to connect any configured messaging platform: %s", reason)
|
||||
logger.warning(
|
||||
"Gateway started with no connected platforms — "
|
||||
"%d platform(s) queued for retry: %s",
|
||||
len(self._failed_platforms), reason,
|
||||
)
|
||||
try:
|
||||
from gateway.status import write_runtime_status
|
||||
write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
|
||||
write_runtime_status(
|
||||
gateway_state="degraded",
|
||||
exit_reason=None,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
# Fall through to the normal "running" state — reconnect
|
||||
# watcher takes it from here.
|
||||
# All enabled platforms had no adapter (missing library or credentials).
|
||||
# In fleet deployments the same config.yaml is shared across nodes that
|
||||
# may only have credentials for a subset of platforms. Rather than
|
||||
@@ -4736,11 +4820,15 @@ class GatewayRunner:
|
||||
"""Background task that periodically retries connecting failed platforms.
|
||||
|
||||
Uses exponential backoff: 30s → 60s → 120s → 240s → 300s (cap).
|
||||
Stops retrying a platform after 20 failed attempts or if the error
|
||||
is non-retryable (e.g. bad auth token).
|
||||
Retryable failures keep retrying at the backoff cap indefinitely
|
||||
— but if a platform fails ``_PAUSE_AFTER_FAILURES`` times in a row
|
||||
without ever succeeding, it is *paused*: kept in the retry queue
|
||||
but no longer hammered. The user surfaces it with ``/platform list``
|
||||
and resumes it with ``/platform resume <name>``. Non-retryable
|
||||
failures (bad auth, etc.) still drop out of the queue immediately.
|
||||
"""
|
||||
_MAX_ATTEMPTS = 20
|
||||
_BACKOFF_CAP = 300 # 5 minutes max between retries
|
||||
_PAUSE_AFTER_FAILURES = 10 # circuit-breaker threshold
|
||||
|
||||
await asyncio.sleep(10) # initial delay — let startup finish
|
||||
while self._running:
|
||||
@@ -4757,22 +4845,18 @@ class GatewayRunner:
|
||||
if not self._running:
|
||||
return
|
||||
info = self._failed_platforms[platform]
|
||||
# Skip paused platforms entirely — they need explicit
|
||||
# /platform resume to come back.
|
||||
if info.get("paused"):
|
||||
continue
|
||||
if now < info["next_retry"]:
|
||||
continue # not time yet
|
||||
|
||||
if info["attempts"] >= _MAX_ATTEMPTS:
|
||||
logger.warning(
|
||||
"Giving up reconnecting %s after %d attempts",
|
||||
platform.value, info["attempts"],
|
||||
)
|
||||
del self._failed_platforms[platform]
|
||||
continue
|
||||
|
||||
platform_config = info["config"]
|
||||
attempt = info["attempts"] + 1
|
||||
logger.info(
|
||||
"Reconnecting %s (attempt %d/%d)...",
|
||||
platform.value, attempt, _MAX_ATTEMPTS,
|
||||
"Reconnecting %s (attempt %d)...",
|
||||
platform.value, attempt,
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -4837,6 +4921,14 @@ class GatewayRunner:
|
||||
"Reconnect %s failed, next retry in %ds",
|
||||
platform.value, backoff,
|
||||
)
|
||||
if attempt >= _PAUSE_AFTER_FAILURES:
|
||||
self._pause_failed_platform(
|
||||
platform,
|
||||
reason=(
|
||||
adapter.fatal_error_message
|
||||
or "failed to reconnect"
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
self._update_platform_runtime_status(
|
||||
platform.value,
|
||||
@@ -4851,6 +4943,8 @@ class GatewayRunner:
|
||||
"Reconnect %s error: %s, next retry in %ds",
|
||||
platform.value, e, backoff,
|
||||
)
|
||||
if attempt >= _PAUSE_AFTER_FAILURES:
|
||||
self._pause_failed_platform(platform, reason=str(e))
|
||||
|
||||
# Check every 10 seconds for platforms that need reconnection
|
||||
for _ in range(10):
|
||||
@@ -6450,6 +6544,9 @@ class GatewayRunner:
|
||||
if canonical == "agents":
|
||||
return await self._handle_agents_command(event)
|
||||
|
||||
if canonical == "platform":
|
||||
return await self._handle_platform_command(event)
|
||||
|
||||
if canonical == "restart":
|
||||
return await self._handle_restart_command(event)
|
||||
|
||||
@@ -6809,6 +6906,12 @@ class GatewayRunner:
|
||||
if _is_shared_multi_user and source.user_name:
|
||||
message_text = f"[{source.user_name}] {message_text}"
|
||||
|
||||
# Prepend channel context from history backfill (if any). This
|
||||
# happens after sender-prefix so the prefix only applies to the
|
||||
# trigger message, not the backfill block.
|
||||
if getattr(event, "channel_context", None):
|
||||
message_text = f"{event.channel_context}\n\n[New message]\n{message_text}"
|
||||
|
||||
if event.media_urls:
|
||||
image_paths = []
|
||||
audio_paths = []
|
||||
@@ -7985,6 +8088,8 @@ class GatewayRunner:
|
||||
try:
|
||||
if _err_body is not None:
|
||||
_err_json = _err_body.json().get("error", {})
|
||||
if not isinstance(_err_json, dict):
|
||||
_err_json = {}
|
||||
except Exception:
|
||||
pass
|
||||
if _err_json.get("type") == "usage_limit_reached":
|
||||
@@ -8689,6 +8794,99 @@ class GatewayRunner:
|
||||
else:
|
||||
return t("gateway.stop.no_active")
|
||||
|
||||
async def _handle_platform_command(self, event: MessageEvent) -> str:
|
||||
"""Handle ``/platform list|pause|resume [name]`` — surface and
|
||||
manually control failed/paused gateway adapters.
|
||||
|
||||
Examples:
|
||||
``/platform list`` — show connected + failed/paused platforms
|
||||
``/platform pause whatsapp`` — stop the reconnect watcher hammering whatsapp
|
||||
``/platform resume whatsapp`` — re-queue a paused platform for retry
|
||||
"""
|
||||
text = (getattr(event, "content", "") or "").strip()
|
||||
# Strip the leading "/platform" (or "/PLATFORM") token if present
|
||||
parts = text.split(maxsplit=2)
|
||||
if parts and parts[0].lower().lstrip("/").startswith("platform"):
|
||||
parts = parts[1:]
|
||||
action = (parts[0] if parts else "list").lower()
|
||||
target = parts[1].lower() if len(parts) > 1 else ""
|
||||
|
||||
# Resolve platform name (case-insensitive, value match)
|
||||
def _resolve_platform(name: str):
|
||||
if not name:
|
||||
return None
|
||||
for p in Platform.__members__.values():
|
||||
if p.value.lower() == name:
|
||||
return p
|
||||
return None
|
||||
|
||||
if action == "list":
|
||||
lines = ["**Gateway platforms**"]
|
||||
connected = sorted(p.value for p in self.adapters.keys())
|
||||
if connected:
|
||||
lines.append("Connected: " + ", ".join(connected))
|
||||
else:
|
||||
lines.append("Connected: (none)")
|
||||
failed = getattr(self, "_failed_platforms", {}) or {}
|
||||
if failed:
|
||||
for p, info in failed.items():
|
||||
if info.get("paused"):
|
||||
reason = info.get("pause_reason") or "paused"
|
||||
lines.append(
|
||||
f" · {p.value} — PAUSED ({reason}). "
|
||||
f"Resume with `/platform resume {p.value}`."
|
||||
)
|
||||
else:
|
||||
attempts = info.get("attempts", 0)
|
||||
lines.append(
|
||||
f" · {p.value} — retrying (attempt {attempts})"
|
||||
)
|
||||
else:
|
||||
lines.append("Failed/paused: (none)")
|
||||
return "\n".join(lines)
|
||||
|
||||
if action in ("pause", "resume"):
|
||||
if not target:
|
||||
return f"Usage: /platform {action} <name>"
|
||||
platform = _resolve_platform(target)
|
||||
if platform is None:
|
||||
return f"Unknown platform: {target}"
|
||||
failed = getattr(self, "_failed_platforms", {}) or {}
|
||||
if action == "pause":
|
||||
if platform not in failed:
|
||||
return (
|
||||
f"{platform.value} is not in the retry queue "
|
||||
f"(it's either connected or not enabled)."
|
||||
)
|
||||
if failed[platform].get("paused"):
|
||||
return f"{platform.value} is already paused."
|
||||
self._pause_failed_platform(platform, reason="paused via /platform pause")
|
||||
return (
|
||||
f"✓ {platform.value} paused. "
|
||||
f"Resume with `/platform resume {platform.value}` or "
|
||||
f"`hermes gateway restart` to reset."
|
||||
)
|
||||
# action == "resume"
|
||||
if platform not in failed:
|
||||
return (
|
||||
f"{platform.value} is not in the retry queue — "
|
||||
f"nothing to resume."
|
||||
)
|
||||
if not failed[platform].get("paused"):
|
||||
return (
|
||||
f"{platform.value} is already retrying — "
|
||||
f"no resume needed."
|
||||
)
|
||||
self._resume_paused_platform(platform)
|
||||
return f"✓ {platform.value} resumed — retrying on next watcher tick."
|
||||
|
||||
return (
|
||||
"Usage: /platform <list|pause|resume> [name]\n"
|
||||
" /platform list — show platform status\n"
|
||||
" /platform pause <name> — stop retrying a failing platform\n"
|
||||
" /platform resume <name> — re-queue a paused platform"
|
||||
)
|
||||
|
||||
async def _handle_restart_command(self, event: MessageEvent) -> Union[str, EphemeralReply]:
|
||||
"""Handle /restart command - drain active work, then restart the gateway."""
|
||||
# Defensive idempotency check: if the previous gateway process
|
||||
@@ -11209,10 +11407,14 @@ class GatewayRunner:
|
||||
copied_source = dataclasses.replace(source)
|
||||
except Exception:
|
||||
copied_source = source
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
future = safe_schedule_threadsafe(
|
||||
self._rename_telegram_topic_for_session_title(copied_source, session_id, title),
|
||||
loop,
|
||||
logger=logger,
|
||||
log_message="Telegram topic title rename failed to schedule",
|
||||
)
|
||||
if future is None:
|
||||
return
|
||||
def _log_rename_failure(fut) -> None:
|
||||
try:
|
||||
fut.result()
|
||||
@@ -14802,29 +15004,28 @@ class GatewayRunner:
|
||||
def _step_callback_sync(iteration: int, prev_tools: list) -> None:
|
||||
if not _run_still_current():
|
||||
return
|
||||
try:
|
||||
# prev_tools may be list[str] or list[dict] with "name"/"result"
|
||||
# keys. Normalise to keep "tool_names" backward-compatible for
|
||||
# user-authored hooks that do ', '.join(tool_names)'.
|
||||
_names: list[str] = []
|
||||
for _t in (prev_tools or []):
|
||||
if isinstance(_t, dict):
|
||||
_names.append(_t.get("name") or "")
|
||||
else:
|
||||
_names.append(str(_t))
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
_hooks_ref.emit("agent:step", {
|
||||
"platform": source.platform.value if source.platform else "",
|
||||
"user_id": source.user_id,
|
||||
"session_id": session_id,
|
||||
"iteration": iteration,
|
||||
"tool_names": _names,
|
||||
"tools": prev_tools,
|
||||
}),
|
||||
_loop_for_step,
|
||||
)
|
||||
except Exception as _e:
|
||||
logger.debug("agent:step hook error: %s", _e)
|
||||
# prev_tools may be list[str] or list[dict] with "name"/"result"
|
||||
# keys. Normalise to keep "tool_names" backward-compatible for
|
||||
# user-authored hooks that do ', '.join(tool_names)'.
|
||||
_names: list[str] = []
|
||||
for _t in (prev_tools or []):
|
||||
if isinstance(_t, dict):
|
||||
_names.append(_t.get("name") or "")
|
||||
else:
|
||||
_names.append(str(_t))
|
||||
safe_schedule_threadsafe(
|
||||
_hooks_ref.emit("agent:step", {
|
||||
"platform": source.platform.value if source.platform else "",
|
||||
"user_id": source.user_id,
|
||||
"session_id": session_id,
|
||||
"iteration": iteration,
|
||||
"tool_names": _names,
|
||||
"tools": prev_tools,
|
||||
}),
|
||||
_loop_for_step,
|
||||
logger=logger,
|
||||
log_message="agent:step hook scheduling error",
|
||||
)
|
||||
|
||||
# Bridge sync status_callback → async adapter.send for context pressure
|
||||
_status_adapter = self.adapters.get(source.platform)
|
||||
@@ -14844,27 +15045,28 @@ class GatewayRunner:
|
||||
def _status_callback_sync(event_type: str, message: str) -> None:
|
||||
if not _status_adapter or not _run_still_current():
|
||||
return
|
||||
try:
|
||||
_fut = asyncio.run_coroutine_threadsafe(
|
||||
_status_adapter.send(
|
||||
_status_chat_id,
|
||||
message,
|
||||
metadata=_status_thread_metadata,
|
||||
),
|
||||
_loop_for_step,
|
||||
)
|
||||
if _cleanup_progress:
|
||||
def _track_status_id(fut) -> None:
|
||||
try:
|
||||
res = fut.result()
|
||||
except Exception:
|
||||
return
|
||||
mid = getattr(res, "message_id", None)
|
||||
if getattr(res, "success", False) and mid:
|
||||
_cleanup_msg_ids.append(str(mid))
|
||||
_fut.add_done_callback(_track_status_id)
|
||||
except Exception as _e:
|
||||
logger.debug("status_callback error (%s): %s", event_type, _e)
|
||||
_fut = safe_schedule_threadsafe(
|
||||
_status_adapter.send(
|
||||
_status_chat_id,
|
||||
message,
|
||||
metadata=_status_thread_metadata,
|
||||
),
|
||||
_loop_for_step,
|
||||
logger=logger,
|
||||
log_message=f"status_callback ({event_type}) scheduling error",
|
||||
)
|
||||
if _fut is None:
|
||||
return
|
||||
if _cleanup_progress:
|
||||
def _track_status_id(fut) -> None:
|
||||
try:
|
||||
res = fut.result()
|
||||
except Exception:
|
||||
return
|
||||
mid = getattr(res, "message_id", None)
|
||||
if getattr(res, "success", False) and mid:
|
||||
_cleanup_msg_ids.append(str(mid))
|
||||
_fut.add_done_callback(_track_status_id)
|
||||
|
||||
def run_sync():
|
||||
# The conditional re-assignment of `message` further below
|
||||
@@ -15018,17 +15220,16 @@ class GatewayRunner:
|
||||
return
|
||||
if already_streamed or not _status_adapter or not str(text or "").strip():
|
||||
return
|
||||
try:
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
_status_adapter.send(
|
||||
_status_chat_id,
|
||||
text,
|
||||
metadata=_status_thread_metadata,
|
||||
),
|
||||
_loop_for_step,
|
||||
)
|
||||
except Exception as _e:
|
||||
logger.debug("interim_assistant_callback error: %s", _e)
|
||||
safe_schedule_threadsafe(
|
||||
_status_adapter.send(
|
||||
_status_chat_id,
|
||||
text,
|
||||
metadata=_status_thread_metadata,
|
||||
),
|
||||
_loop_for_step,
|
||||
logger=logger,
|
||||
log_message="interim_assistant_callback scheduling error",
|
||||
)
|
||||
|
||||
turn_route = self._resolve_turn_agent_config(message, model, runtime_kwargs)
|
||||
|
||||
@@ -15117,17 +15318,16 @@ class GatewayRunner:
|
||||
def _deliver_bg_review_message(message: str) -> None:
|
||||
if not _status_adapter or not _run_still_current():
|
||||
return
|
||||
try:
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
_status_adapter.send(
|
||||
_status_chat_id,
|
||||
message,
|
||||
metadata=_status_thread_metadata,
|
||||
),
|
||||
_loop_for_step,
|
||||
)
|
||||
except Exception as _e:
|
||||
logger.debug("background_review_callback error: %s", _e)
|
||||
safe_schedule_threadsafe(
|
||||
_status_adapter.send(
|
||||
_status_chat_id,
|
||||
message,
|
||||
metadata=_status_thread_metadata,
|
||||
),
|
||||
_loop_for_step,
|
||||
logger=logger,
|
||||
log_message="background_review_callback scheduling error",
|
||||
)
|
||||
|
||||
def _release_bg_review_messages() -> None:
|
||||
_bg_review_release.set()
|
||||
@@ -15199,23 +15399,28 @@ class GatewayRunner:
|
||||
pass
|
||||
|
||||
send_ok = False
|
||||
try:
|
||||
fut = asyncio.run_coroutine_threadsafe(
|
||||
_status_adapter.send_clarify(
|
||||
chat_id=_status_chat_id,
|
||||
question=question,
|
||||
choices=list(choices) if choices else None,
|
||||
clarify_id=clarify_id,
|
||||
session_key=session_key or "",
|
||||
metadata=_status_thread_metadata,
|
||||
),
|
||||
_loop_for_step,
|
||||
)
|
||||
result = fut.result(timeout=15)
|
||||
send_ok = bool(getattr(result, "success", False))
|
||||
except Exception as exc:
|
||||
logger.warning("Clarify send failed: %s", exc)
|
||||
fut = safe_schedule_threadsafe(
|
||||
_status_adapter.send_clarify(
|
||||
chat_id=_status_chat_id,
|
||||
question=question,
|
||||
choices=list(choices) if choices else None,
|
||||
clarify_id=clarify_id,
|
||||
session_key=session_key or "",
|
||||
metadata=_status_thread_metadata,
|
||||
),
|
||||
_loop_for_step,
|
||||
logger=logger,
|
||||
log_message="Clarify send failed to schedule",
|
||||
)
|
||||
if fut is None:
|
||||
send_ok = False
|
||||
else:
|
||||
try:
|
||||
result = fut.result(timeout=15)
|
||||
send_ok = bool(getattr(result, "success", False))
|
||||
except Exception as exc:
|
||||
logger.warning("Clarify send failed: %s", exc)
|
||||
send_ok = False
|
||||
|
||||
if not send_ok:
|
||||
# Couldn't deliver the prompt — clean up and return
|
||||
@@ -15335,7 +15540,7 @@ class GatewayRunner:
|
||||
# false positives from MagicMock auto-attribute creation in tests.
|
||||
if getattr(type(_status_adapter), "send_exec_approval", None) is not None:
|
||||
try:
|
||||
_approval_result = asyncio.run_coroutine_threadsafe(
|
||||
_approval_fut = safe_schedule_threadsafe(
|
||||
_status_adapter.send_exec_approval(
|
||||
chat_id=_status_chat_id,
|
||||
command=cmd,
|
||||
@@ -15344,7 +15549,12 @@ class GatewayRunner:
|
||||
metadata=_status_thread_metadata,
|
||||
),
|
||||
_loop_for_step,
|
||||
).result(timeout=15)
|
||||
logger=logger,
|
||||
log_message="send_exec_approval scheduling error",
|
||||
)
|
||||
if _approval_fut is None:
|
||||
raise RuntimeError("send_exec_approval: loop unavailable")
|
||||
_approval_result = _approval_fut.result(timeout=15)
|
||||
if _approval_result.success:
|
||||
return
|
||||
logger.warning(
|
||||
@@ -15366,14 +15576,18 @@ class GatewayRunner:
|
||||
f"for the session, `/approve always` to approve permanently, or `/deny` to cancel."
|
||||
)
|
||||
try:
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
_approval_send_fut = safe_schedule_threadsafe(
|
||||
_status_adapter.send(
|
||||
_status_chat_id,
|
||||
msg,
|
||||
metadata=_status_thread_metadata,
|
||||
),
|
||||
_loop_for_step,
|
||||
).result(timeout=15)
|
||||
logger=logger,
|
||||
log_message="Approval text-send scheduling error",
|
||||
)
|
||||
if _approval_send_fut is not None:
|
||||
_approval_send_fut.result(timeout=15)
|
||||
except Exception as _e:
|
||||
logger.error("Failed to send approval request: %s", _e)
|
||||
|
||||
@@ -16335,7 +16549,11 @@ class GatewayRunner:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
asyncio.run_coroutine_threadsafe(_delete_all(), _loop_snapshot)
|
||||
safe_schedule_threadsafe(
|
||||
_delete_all(), _loop_snapshot,
|
||||
logger=logger,
|
||||
log_message="Temp bubble cleanup scheduling error",
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -16392,10 +16610,13 @@ def _start_cron_ticker(stop_event: threading.Event, adapters=None, loop=None, in
|
||||
# this ticker runs in a background thread. Schedule onto
|
||||
# the gateway event loop and wait briefly for completion
|
||||
# so refresh failures are still logged via the except.
|
||||
fut = asyncio.run_coroutine_threadsafe(
|
||||
build_channel_directory(adapters), loop
|
||||
fut = safe_schedule_threadsafe(
|
||||
build_channel_directory(adapters), loop,
|
||||
logger=logger,
|
||||
log_message="Channel directory refresh scheduling error",
|
||||
)
|
||||
fut.result(timeout=30)
|
||||
if fut is not None:
|
||||
fut.result(timeout=30)
|
||||
except Exception as e:
|
||||
logger.debug("Channel directory refresh error: %s", e)
|
||||
|
||||
|
||||
@@ -518,6 +518,9 @@ class SessionEntry:
|
||||
else None
|
||||
),
|
||||
"is_fresh_reset": self.is_fresh_reset,
|
||||
"was_auto_reset": self.was_auto_reset,
|
||||
"auto_reset_reason": self.auto_reset_reason,
|
||||
"reset_had_activity": self.reset_had_activity,
|
||||
}
|
||||
if self.origin:
|
||||
result["origin"] = self.origin.to_dict()
|
||||
@@ -567,6 +570,9 @@ class SessionEntry:
|
||||
resume_reason=data.get("resume_reason"),
|
||||
last_resume_marked_at=last_resume_marked_at,
|
||||
is_fresh_reset=data.get("is_fresh_reset", False),
|
||||
was_auto_reset=data.get("was_auto_reset", False),
|
||||
auto_reset_reason=data.get("auto_reset_reason"),
|
||||
reset_had_activity=data.get("reset_had_activity", False),
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -14,8 +14,8 @@ Provides subcommands for:
|
||||
import os
|
||||
import sys
|
||||
|
||||
__version__ = "0.13.0"
|
||||
__release_date__ = "2026.5.7"
|
||||
__version__ = "0.14.0"
|
||||
__release_date__ = "2026.5.16"
|
||||
|
||||
|
||||
def _ensure_utf8():
|
||||
|
||||
+851
-1
@@ -72,6 +72,7 @@ DEFAULT_AGENT_KEY_MIN_TTL_SECONDS = 30 * 60 # 30 minutes
|
||||
ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120 # refresh 2 min before expiry
|
||||
DEVICE_AUTH_POLL_INTERVAL_CAP_SECONDS = 1 # poll at most every 1s
|
||||
DEFAULT_CODEX_BASE_URL = "https://chatgpt.com/backend-api/codex"
|
||||
DEFAULT_XAI_OAUTH_BASE_URL = "https://api.x.ai/v1"
|
||||
MINIMAX_OAUTH_CLIENT_ID = "78257093-7e40-4613-99e0-527b14b39113"
|
||||
MINIMAX_OAUTH_SCOPE = "group_id profile model.completion"
|
||||
MINIMAX_OAUTH_GRANT_TYPE = "urn:ietf:params:oauth:grant-type:user_code"
|
||||
@@ -89,6 +90,14 @@ STEPFUN_STEP_PLAN_CN_BASE_URL = "https://api.stepfun.com/step_plan/v1"
|
||||
CODEX_OAUTH_CLIENT_ID = "app_EMoamEEZ73f0CkXaXp7hrann"
|
||||
CODEX_OAUTH_TOKEN_URL = "https://auth.openai.com/oauth/token"
|
||||
CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120
|
||||
XAI_OAUTH_ISSUER = "https://auth.x.ai"
|
||||
XAI_OAUTH_DISCOVERY_URL = f"{XAI_OAUTH_ISSUER}/.well-known/openid-configuration"
|
||||
XAI_OAUTH_CLIENT_ID = "b1a00492-073a-47ea-816f-4c329264a828"
|
||||
XAI_OAUTH_SCOPE = "openid profile email offline_access grok-cli:access api:access"
|
||||
XAI_OAUTH_REDIRECT_HOST = "127.0.0.1"
|
||||
XAI_OAUTH_REDIRECT_PORT = 56121
|
||||
XAI_OAUTH_REDIRECT_PATH = "/callback"
|
||||
XAI_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120
|
||||
QWEN_OAUTH_CLIENT_ID = "f0304373b74a44d2b584a3fb70ca9e56"
|
||||
QWEN_OAUTH_TOKEN_URL = "https://chat.qwen.ai/api/v1/oauth2/token"
|
||||
QWEN_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120
|
||||
@@ -98,6 +107,9 @@ DEFAULT_SPOTIFY_REDIRECT_URI = "http://127.0.0.1:43827/spotify/callback"
|
||||
SPOTIFY_DOCS_URL = "https://hermes-agent.nousresearch.com/docs/user-guide/features/spotify"
|
||||
SPOTIFY_DASHBOARD_URL = "https://developer.spotify.com/dashboard"
|
||||
SPOTIFY_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120
|
||||
|
||||
XAI_OAUTH_DOCS_URL = "https://hermes-agent.nousresearch.com/docs/guides/xai-grok-oauth"
|
||||
OAUTH_OVER_SSH_DOCS_URL = "https://hermes-agent.nousresearch.com/docs/guides/oauth-over-ssh"
|
||||
DEFAULT_SPOTIFY_SCOPE = " ".join((
|
||||
"user-modify-playback-state",
|
||||
"user-read-playback-state",
|
||||
@@ -162,6 +174,12 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
|
||||
auth_type="oauth_external",
|
||||
inference_base_url=DEFAULT_CODEX_BASE_URL,
|
||||
),
|
||||
"xai-oauth": ProviderConfig(
|
||||
id="xai-oauth",
|
||||
name="xAI Grok OAuth (SuperGrok Subscription)",
|
||||
auth_type="oauth_external",
|
||||
inference_base_url=DEFAULT_XAI_OAUTH_BASE_URL,
|
||||
),
|
||||
"qwen-oauth": ProviderConfig(
|
||||
id="qwen-oauth",
|
||||
name="Qwen OAuth",
|
||||
@@ -1364,6 +1382,8 @@ def resolve_provider(
|
||||
"glm": "zai", "z-ai": "zai", "z.ai": "zai", "zhipu": "zai",
|
||||
"google": "gemini", "google-gemini": "gemini", "google-ai-studio": "gemini",
|
||||
"x-ai": "xai", "x.ai": "xai", "grok": "xai",
|
||||
"xai-oauth": "xai-oauth", "x-ai-oauth": "xai-oauth",
|
||||
"grok-oauth": "xai-oauth", "xai-grok-oauth": "xai-oauth",
|
||||
"kimi": "kimi-coding", "kimi-for-coding": "kimi-coding", "moonshot": "kimi-coding",
|
||||
"kimi-cn": "kimi-coding-cn", "moonshot-cn": "kimi-coding-cn",
|
||||
"step": "stepfun", "stepfun-coding-plan": "stepfun",
|
||||
@@ -1907,6 +1927,16 @@ def _spotify_code_challenge(code_verifier: str) -> str:
|
||||
return base64.urlsafe_b64encode(digest).decode("ascii").rstrip("=")
|
||||
|
||||
|
||||
def _oauth_pkce_code_verifier(length: int = 64) -> str:
|
||||
raw = base64.urlsafe_b64encode(os.urandom(length)).decode("ascii")
|
||||
return raw.rstrip("=")[:128]
|
||||
|
||||
|
||||
def _oauth_pkce_code_challenge(code_verifier: str) -> str:
|
||||
digest = hashlib.sha256(code_verifier.encode("utf-8")).digest()
|
||||
return base64.urlsafe_b64encode(digest).decode("ascii").rstrip("=")
|
||||
|
||||
|
||||
def _spotify_build_authorize_url(
|
||||
*,
|
||||
client_id: str,
|
||||
@@ -2029,6 +2059,158 @@ def _spotify_wait_for_callback(
|
||||
)
|
||||
|
||||
|
||||
def _xai_validate_loopback_redirect_uri(redirect_uri: str) -> tuple[str, int, str]:
|
||||
parsed = urlparse(redirect_uri)
|
||||
if parsed.scheme != "http":
|
||||
raise AuthError(
|
||||
"xAI OAuth redirect_uri must use http://127.0.0.1.",
|
||||
provider="xai-oauth",
|
||||
code="xai_redirect_invalid",
|
||||
)
|
||||
host = parsed.hostname or ""
|
||||
if host != XAI_OAUTH_REDIRECT_HOST:
|
||||
raise AuthError(
|
||||
"xAI OAuth redirect_uri must point to 127.0.0.1.",
|
||||
provider="xai-oauth",
|
||||
code="xai_redirect_invalid",
|
||||
)
|
||||
if not parsed.port:
|
||||
raise AuthError(
|
||||
"xAI OAuth redirect_uri must include an explicit localhost port.",
|
||||
provider="xai-oauth",
|
||||
code="xai_redirect_invalid",
|
||||
)
|
||||
return host, parsed.port, parsed.path or "/"
|
||||
|
||||
|
||||
def _xai_callback_cors_origin(origin: Optional[str]) -> str:
|
||||
# CORS allowlist for the loopback callback. Only xAI's own auth origins
|
||||
# are accepted; the redirect_uri itself is bound to 127.0.0.1 and gated by
|
||||
# PKCE+state, so additional dev/3p origins are not needed here.
|
||||
allowed = {
|
||||
"https://accounts.x.ai",
|
||||
"https://auth.x.ai",
|
||||
}
|
||||
return origin if origin in allowed else ""
|
||||
|
||||
|
||||
def _make_xai_callback_handler(expected_path: str) -> tuple[type[BaseHTTPRequestHandler], dict[str, Any]]:
|
||||
result: dict[str, Any] = {
|
||||
"code": None,
|
||||
"state": None,
|
||||
"error": None,
|
||||
"error_description": None,
|
||||
}
|
||||
|
||||
class _XAICallbackHandler(BaseHTTPRequestHandler):
|
||||
def _maybe_write_cors_headers(self) -> None:
|
||||
origin = self.headers.get("Origin")
|
||||
allow_origin = _xai_callback_cors_origin(origin)
|
||||
if allow_origin:
|
||||
self.send_header("Access-Control-Allow-Origin", allow_origin)
|
||||
self.send_header("Access-Control-Allow-Methods", "GET, OPTIONS")
|
||||
self.send_header("Access-Control-Allow-Headers", "Content-Type")
|
||||
self.send_header("Access-Control-Allow-Private-Network", "true")
|
||||
self.send_header("Vary", "Origin")
|
||||
|
||||
def do_OPTIONS(self) -> None: # noqa: N802
|
||||
self.send_response(204)
|
||||
self._maybe_write_cors_headers()
|
||||
self.end_headers()
|
||||
|
||||
def do_GET(self) -> None: # noqa: N802
|
||||
parsed = urlparse(self.path)
|
||||
if parsed.path != expected_path:
|
||||
self.send_response(404)
|
||||
self.end_headers()
|
||||
self.wfile.write(b"Not found.")
|
||||
return
|
||||
|
||||
params = parse_qs(parsed.query)
|
||||
result["code"] = params.get("code", [None])[0]
|
||||
result["state"] = params.get("state", [None])[0]
|
||||
result["error"] = params.get("error", [None])[0]
|
||||
result["error_description"] = params.get("error_description", [None])[0]
|
||||
|
||||
self.send_response(200)
|
||||
self._maybe_write_cors_headers()
|
||||
self.send_header("Content-Type", "text/html; charset=utf-8")
|
||||
self.end_headers()
|
||||
if result["error"]:
|
||||
body = "<html><body><h1>xAI authorization failed.</h1>You can close this tab.</body></html>"
|
||||
else:
|
||||
body = "<html><body><h1>xAI authorization received.</h1>You can close this tab.</body></html>"
|
||||
self.wfile.write(body.encode("utf-8"))
|
||||
|
||||
def log_message(self, format: str, *args: Any) -> None: # noqa: A003
|
||||
return
|
||||
|
||||
return _XAICallbackHandler, result
|
||||
|
||||
|
||||
def _xai_start_callback_server(
|
||||
preferred_port: int = XAI_OAUTH_REDIRECT_PORT,
|
||||
) -> tuple[HTTPServer, threading.Thread, dict[str, Any], str]:
|
||||
host = XAI_OAUTH_REDIRECT_HOST
|
||||
expected_path = XAI_OAUTH_REDIRECT_PATH
|
||||
handler_cls, result = _make_xai_callback_handler(expected_path)
|
||||
|
||||
class _ReuseHTTPServer(HTTPServer):
|
||||
allow_reuse_address = True
|
||||
|
||||
ports_to_try = [preferred_port]
|
||||
if preferred_port != 0:
|
||||
ports_to_try.append(0)
|
||||
server = None
|
||||
last_error: Optional[OSError] = None
|
||||
for port in ports_to_try:
|
||||
try:
|
||||
server = _ReuseHTTPServer((host, port), handler_cls)
|
||||
break
|
||||
except OSError as exc:
|
||||
last_error = exc
|
||||
if server is None:
|
||||
raise AuthError(
|
||||
f"Could not bind xAI callback server on {host}:{preferred_port}: {last_error}",
|
||||
provider="xai-oauth",
|
||||
code="xai_callback_bind_failed",
|
||||
) from last_error
|
||||
|
||||
actual_port = int(server.server_address[1])
|
||||
redirect_uri = f"http://{host}:{actual_port}{expected_path}"
|
||||
thread = threading.Thread(
|
||||
target=server.serve_forever,
|
||||
kwargs={"poll_interval": 0.1},
|
||||
daemon=True,
|
||||
)
|
||||
thread.start()
|
||||
return server, thread, result, redirect_uri
|
||||
|
||||
|
||||
def _xai_wait_for_callback(
|
||||
server: HTTPServer,
|
||||
thread: threading.Thread,
|
||||
result: dict[str, Any],
|
||||
*,
|
||||
timeout_seconds: float = 180.0,
|
||||
) -> dict[str, Any]:
|
||||
deadline = time.monotonic() + max(5.0, timeout_seconds)
|
||||
try:
|
||||
while time.monotonic() < deadline:
|
||||
if result["code"] or result["error"]:
|
||||
return result
|
||||
time.sleep(0.1)
|
||||
finally:
|
||||
server.shutdown()
|
||||
server.server_close()
|
||||
thread.join(timeout=1.0)
|
||||
raise AuthError(
|
||||
"xAI authorization timed out waiting for the local callback.",
|
||||
provider="xai-oauth",
|
||||
code="xai_callback_timeout",
|
||||
)
|
||||
|
||||
|
||||
def _spotify_token_payload_to_state(
|
||||
token_payload: Dict[str, Any],
|
||||
*,
|
||||
@@ -2349,6 +2531,8 @@ def login_spotify_command(args) -> None:
|
||||
print(f"Full setup guide: {SPOTIFY_DOCS_URL}")
|
||||
print()
|
||||
|
||||
_print_loopback_ssh_hint(redirect_uri, docs_url=SPOTIFY_DOCS_URL)
|
||||
|
||||
if open_browser and not _is_remote_session():
|
||||
try:
|
||||
opened = webbrowser.open(authorize_url)
|
||||
@@ -2405,6 +2589,45 @@ def _is_remote_session() -> bool:
|
||||
return bool(os.getenv("SSH_CLIENT") or os.getenv("SSH_TTY"))
|
||||
|
||||
|
||||
def _print_loopback_ssh_hint(redirect_uri: str, *, docs_url: str | None = None) -> None:
|
||||
"""Print an SSH tunnel hint when running a loopback-redirect OAuth flow on a
|
||||
remote host. The auth server (xAI, Spotify, ...) will redirect the user's
|
||||
browser to ``127.0.0.1:<port>/callback``. If the browser is on a different
|
||||
machine than the loopback listener (the usual SSH case), the redirect can't
|
||||
reach the listener without a local port forward.
|
||||
|
||||
The hint is best-effort: silent if we don't think we're remote, or if we
|
||||
can't parse a host/port out of the redirect URI.
|
||||
|
||||
Pass ``docs_url`` for a provider-specific guide (e.g. the xAI Grok OAuth
|
||||
page); the generic OAuth-over-SSH guide is always shown after it.
|
||||
"""
|
||||
if not _is_remote_session():
|
||||
return
|
||||
try:
|
||||
parsed = urlparse(redirect_uri)
|
||||
except Exception:
|
||||
return
|
||||
host = parsed.hostname or ""
|
||||
port = parsed.port
|
||||
if host not in ("127.0.0.1", "::1", "localhost") or not port:
|
||||
return
|
||||
print()
|
||||
print("Remote session detected. Your browser will redirect to")
|
||||
print(f" {redirect_uri}")
|
||||
print("which the loopback listener on THIS machine is waiting on. If your")
|
||||
print("browser is on a different machine, forward the port first from your")
|
||||
print("local machine in a separate terminal:")
|
||||
print()
|
||||
print(f" ssh -N -L {port}:127.0.0.1:{port} <user>@<this-host>")
|
||||
print()
|
||||
print("Then open the authorize URL above in your local browser.")
|
||||
if docs_url:
|
||||
print(f"Provider docs: {docs_url}")
|
||||
print(f"SSH/jump-box guide: {OAUTH_OVER_SSH_DOCS_URL}")
|
||||
print()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# OpenAI Codex auth — tokens stored in ~/.hermes/auth.json (not ~/.codex/)
|
||||
#
|
||||
@@ -2680,6 +2903,348 @@ def resolve_codex_runtime_credentials(
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# xAI Grok OAuth — tokens stored in ~/.hermes/auth.json
|
||||
# =============================================================================
|
||||
|
||||
def _read_xai_oauth_tokens(*, _lock: bool = True) -> Dict[str, Any]:
|
||||
if _lock:
|
||||
with _auth_store_lock():
|
||||
auth_store = _load_auth_store()
|
||||
else:
|
||||
auth_store = _load_auth_store()
|
||||
state = _load_provider_state(auth_store, "xai-oauth")
|
||||
if not state:
|
||||
raise AuthError(
|
||||
"No xAI OAuth credentials stored. Select xAI Grok OAuth (SuperGrok Subscription) in `hermes model`.",
|
||||
provider="xai-oauth",
|
||||
code="xai_auth_missing",
|
||||
relogin_required=True,
|
||||
)
|
||||
tokens = state.get("tokens")
|
||||
if not isinstance(tokens, dict):
|
||||
raise AuthError(
|
||||
"xAI OAuth state is missing tokens. Re-authenticate with `hermes model`.",
|
||||
provider="xai-oauth",
|
||||
code="xai_auth_invalid_shape",
|
||||
relogin_required=True,
|
||||
)
|
||||
access_token = str(tokens.get("access_token", "") or "").strip()
|
||||
refresh_token = str(tokens.get("refresh_token", "") or "").strip()
|
||||
if not access_token:
|
||||
raise AuthError(
|
||||
"xAI OAuth state is missing access_token. Re-authenticate with `hermes model`.",
|
||||
provider="xai-oauth",
|
||||
code="xai_auth_missing_access_token",
|
||||
relogin_required=True,
|
||||
)
|
||||
if not refresh_token:
|
||||
raise AuthError(
|
||||
"xAI OAuth state is missing refresh_token. Re-authenticate with `hermes model`.",
|
||||
provider="xai-oauth",
|
||||
code="xai_auth_missing_refresh_token",
|
||||
relogin_required=True,
|
||||
)
|
||||
return {
|
||||
"tokens": tokens,
|
||||
"last_refresh": state.get("last_refresh"),
|
||||
"discovery": state.get("discovery") or {},
|
||||
"redirect_uri": state.get("redirect_uri"),
|
||||
}
|
||||
|
||||
|
||||
def _save_xai_oauth_tokens(
|
||||
tokens: Dict[str, Any],
|
||||
*,
|
||||
discovery: Optional[Dict[str, Any]] = None,
|
||||
redirect_uri: str = "",
|
||||
last_refresh: Optional[str] = None,
|
||||
) -> None:
|
||||
if last_refresh is None:
|
||||
last_refresh = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||
with _auth_store_lock():
|
||||
auth_store = _load_auth_store()
|
||||
state = _load_provider_state(auth_store, "xai-oauth") or {}
|
||||
state["tokens"] = tokens
|
||||
state["last_refresh"] = last_refresh
|
||||
state["auth_mode"] = "oauth_pkce"
|
||||
if discovery:
|
||||
state["discovery"] = discovery
|
||||
if redirect_uri:
|
||||
state["redirect_uri"] = redirect_uri
|
||||
_save_provider_state(auth_store, "xai-oauth", state)
|
||||
_save_auth_store(auth_store)
|
||||
|
||||
|
||||
def _xai_access_token_is_expiring(access_token: str, skew_seconds: int = 0) -> bool:
|
||||
if not isinstance(access_token, str) or "." not in access_token:
|
||||
return False
|
||||
try:
|
||||
parts = access_token.split(".")
|
||||
if len(parts) < 2:
|
||||
return False
|
||||
payload_b64 = parts[1]
|
||||
payload_b64 += "=" * (-len(payload_b64) % 4)
|
||||
payload = json.loads(base64.urlsafe_b64decode(payload_b64.encode("ascii")).decode("utf-8"))
|
||||
exp = payload.get("exp")
|
||||
if not isinstance(exp, (int, float)):
|
||||
return False
|
||||
return float(exp) <= (time.time() + max(0, int(skew_seconds)))
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _xai_validate_oauth_endpoint(url: str, *, field: str) -> str:
|
||||
"""Refuse any OIDC discovery endpoint that isn't HTTPS on the xAI origin.
|
||||
|
||||
The OIDC discovery response is a long-lived, low-frequency request whose
|
||||
output is cached in ``~/.hermes/auth.json``. A single MITM during initial
|
||||
login could substitute a malicious ``token_endpoint``; that URL would
|
||||
then receive the refresh_token on every subsequent refresh — a permanent
|
||||
credential leak from a one-time MITM. Validating scheme + host pins the
|
||||
cached endpoint to the xAI auth origin (or a future ``*.x.ai`` subdomain
|
||||
if xAI migrates) so the cache poisoning loses its persistence guarantee.
|
||||
|
||||
RFC 8414 §2 requires the issuer to be ``https://`` and SHOULD-keeps the
|
||||
token_endpoint on the same origin; we enforce both. ``x.ai`` is the
|
||||
bare apex, so we accept either exact host match or any ``.x.ai`` suffix.
|
||||
"""
|
||||
parsed = urlparse(url)
|
||||
if parsed.scheme != "https":
|
||||
raise AuthError(
|
||||
f"xAI OIDC discovery returned a non-HTTPS {field}: {url!r}.",
|
||||
provider="xai-oauth",
|
||||
code="xai_discovery_invalid",
|
||||
)
|
||||
host = (parsed.hostname or "").lower()
|
||||
if not host:
|
||||
raise AuthError(
|
||||
f"xAI OIDC discovery {field} is missing a hostname: {url!r}.",
|
||||
provider="xai-oauth",
|
||||
code="xai_discovery_invalid",
|
||||
)
|
||||
if host != "x.ai" and not host.endswith(".x.ai"):
|
||||
raise AuthError(
|
||||
f"xAI OIDC discovery {field} host {host!r} is not on the xAI origin "
|
||||
f"(expected x.ai or a *.x.ai subdomain). Refusing to use a cached "
|
||||
f"endpoint that may have been substituted by a MITM during initial "
|
||||
f"discovery; re-authenticate with `hermes model` to re-fetch.",
|
||||
provider="xai-oauth",
|
||||
code="xai_discovery_invalid",
|
||||
)
|
||||
return url
|
||||
|
||||
|
||||
def _xai_oauth_discovery(timeout_seconds: float = 15.0) -> Dict[str, str]:
|
||||
try:
|
||||
response = httpx.get(
|
||||
XAI_OAUTH_DISCOVERY_URL,
|
||||
headers={"Accept": "application/json"},
|
||||
timeout=timeout_seconds,
|
||||
)
|
||||
except Exception as exc:
|
||||
raise AuthError(
|
||||
f"xAI OIDC discovery failed: {exc}",
|
||||
provider="xai-oauth",
|
||||
code="xai_discovery_failed",
|
||||
) from exc
|
||||
if response.status_code != 200:
|
||||
raise AuthError(
|
||||
f"xAI OIDC discovery returned status {response.status_code}.",
|
||||
provider="xai-oauth",
|
||||
code="xai_discovery_failed",
|
||||
)
|
||||
try:
|
||||
payload = response.json()
|
||||
except Exception as exc:
|
||||
raise AuthError(
|
||||
f"xAI OIDC discovery returned invalid JSON: {exc}",
|
||||
provider="xai-oauth",
|
||||
code="xai_discovery_invalid_json",
|
||||
) from exc
|
||||
if not isinstance(payload, dict):
|
||||
raise AuthError(
|
||||
"xAI OIDC discovery response was not a JSON object.",
|
||||
provider="xai-oauth",
|
||||
code="xai_discovery_incomplete",
|
||||
)
|
||||
authorization_endpoint = str(payload.get("authorization_endpoint", "") or "").strip()
|
||||
token_endpoint = str(payload.get("token_endpoint", "") or "").strip()
|
||||
if not authorization_endpoint or not token_endpoint:
|
||||
raise AuthError(
|
||||
"xAI OIDC discovery response was missing required endpoints.",
|
||||
provider="xai-oauth",
|
||||
code="xai_discovery_incomplete",
|
||||
)
|
||||
_xai_validate_oauth_endpoint(authorization_endpoint, field="authorization_endpoint")
|
||||
_xai_validate_oauth_endpoint(token_endpoint, field="token_endpoint")
|
||||
return {
|
||||
"authorization_endpoint": authorization_endpoint,
|
||||
"token_endpoint": token_endpoint,
|
||||
}
|
||||
|
||||
|
||||
def refresh_xai_oauth_pure(
|
||||
access_token: str,
|
||||
refresh_token: str,
|
||||
*,
|
||||
token_endpoint: str = "",
|
||||
timeout_seconds: float = 20.0,
|
||||
) -> Dict[str, Any]:
|
||||
del access_token
|
||||
if not isinstance(refresh_token, str) or not refresh_token.strip():
|
||||
raise AuthError(
|
||||
"xAI OAuth is missing refresh_token. Re-authenticate with `hermes model`.",
|
||||
provider="xai-oauth",
|
||||
code="xai_auth_missing_refresh_token",
|
||||
relogin_required=True,
|
||||
)
|
||||
endpoint = token_endpoint.strip() or _xai_oauth_discovery(timeout_seconds)["token_endpoint"]
|
||||
# Re-validate cached endpoints on the refresh hot path: an auth.json
|
||||
# written by an older Hermes (or hand-edited) may carry a non-xAI
|
||||
# token_endpoint that would receive every future refresh_token in
|
||||
# plaintext if we trusted it blindly. Cheap suffix check; fast-fail
|
||||
# with a clear error so the user can re-run `hermes model` to refetch.
|
||||
_xai_validate_oauth_endpoint(endpoint, field="token_endpoint")
|
||||
timeout = httpx.Timeout(max(5.0, float(timeout_seconds)))
|
||||
with httpx.Client(timeout=timeout, headers={"Accept": "application/json"}) as client:
|
||||
response = client.post(
|
||||
endpoint,
|
||||
headers={"Content-Type": "application/x-www-form-urlencoded"},
|
||||
data={
|
||||
"grant_type": "refresh_token",
|
||||
"client_id": XAI_OAUTH_CLIENT_ID,
|
||||
"refresh_token": refresh_token,
|
||||
},
|
||||
)
|
||||
if response.status_code != 200:
|
||||
detail = response.text.strip()
|
||||
raise AuthError(
|
||||
"xAI token refresh failed."
|
||||
+ (f" Response: {detail}" if detail else ""),
|
||||
provider="xai-oauth",
|
||||
code="xai_refresh_failed",
|
||||
relogin_required=(response.status_code in {400, 401, 403}),
|
||||
)
|
||||
try:
|
||||
payload = response.json()
|
||||
except Exception as exc:
|
||||
raise AuthError(
|
||||
f"xAI token refresh returned invalid JSON: {exc}",
|
||||
provider="xai-oauth",
|
||||
code="xai_refresh_invalid_json",
|
||||
) from exc
|
||||
if not isinstance(payload, dict):
|
||||
raise AuthError(
|
||||
"xAI token refresh response was not a JSON object.",
|
||||
provider="xai-oauth",
|
||||
code="xai_refresh_invalid_response",
|
||||
relogin_required=True,
|
||||
)
|
||||
refreshed_access = str(payload.get("access_token", "") or "").strip()
|
||||
if not refreshed_access:
|
||||
raise AuthError(
|
||||
"xAI token refresh response was missing access_token.",
|
||||
provider="xai-oauth",
|
||||
code="xai_refresh_missing_access_token",
|
||||
relogin_required=True,
|
||||
)
|
||||
updated = {
|
||||
"access_token": refreshed_access,
|
||||
"refresh_token": str(payload.get("refresh_token") or refresh_token).strip(),
|
||||
"id_token": str(payload.get("id_token") or "").strip(),
|
||||
"expires_in": payload.get("expires_in"),
|
||||
"token_type": str(payload.get("token_type") or "Bearer").strip() or "Bearer",
|
||||
"last_refresh": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
|
||||
}
|
||||
return updated
|
||||
|
||||
|
||||
def _refresh_xai_oauth_tokens(
|
||||
tokens: Dict[str, Any],
|
||||
*,
|
||||
token_endpoint: str,
|
||||
redirect_uri: str = "",
|
||||
timeout_seconds: float,
|
||||
) -> Dict[str, Any]:
|
||||
refreshed = refresh_xai_oauth_pure(
|
||||
str(tokens.get("access_token", "") or ""),
|
||||
str(tokens.get("refresh_token", "") or ""),
|
||||
token_endpoint=token_endpoint,
|
||||
timeout_seconds=timeout_seconds,
|
||||
)
|
||||
updated_tokens = dict(tokens)
|
||||
updated_tokens["access_token"] = refreshed["access_token"]
|
||||
updated_tokens["refresh_token"] = refreshed["refresh_token"]
|
||||
if refreshed.get("id_token"):
|
||||
updated_tokens["id_token"] = refreshed["id_token"]
|
||||
if refreshed.get("expires_in") is not None:
|
||||
updated_tokens["expires_in"] = refreshed["expires_in"]
|
||||
if refreshed.get("token_type"):
|
||||
updated_tokens["token_type"] = refreshed["token_type"]
|
||||
_save_xai_oauth_tokens(
|
||||
updated_tokens,
|
||||
discovery={"token_endpoint": token_endpoint},
|
||||
redirect_uri=redirect_uri,
|
||||
last_refresh=refreshed["last_refresh"],
|
||||
)
|
||||
return updated_tokens
|
||||
|
||||
|
||||
def resolve_xai_oauth_runtime_credentials(
|
||||
*,
|
||||
force_refresh: bool = False,
|
||||
refresh_if_expiring: bool = True,
|
||||
refresh_skew_seconds: int = XAI_ACCESS_TOKEN_REFRESH_SKEW_SECONDS,
|
||||
) -> Dict[str, Any]:
|
||||
data = _read_xai_oauth_tokens()
|
||||
tokens = dict(data["tokens"])
|
||||
access_token = str(tokens.get("access_token", "") or "").strip()
|
||||
refresh_timeout_seconds = float(os.getenv("HERMES_XAI_REFRESH_TIMEOUT_SECONDS", "20"))
|
||||
discovery = dict(data.get("discovery") or {})
|
||||
token_endpoint = str(discovery.get("token_endpoint", "") or "").strip()
|
||||
redirect_uri = str(data.get("redirect_uri", "") or "").strip()
|
||||
|
||||
should_refresh = bool(force_refresh)
|
||||
if (not should_refresh) and refresh_if_expiring:
|
||||
should_refresh = _xai_access_token_is_expiring(access_token, refresh_skew_seconds)
|
||||
if should_refresh:
|
||||
with _auth_store_lock(timeout_seconds=max(float(AUTH_LOCK_TIMEOUT_SECONDS), refresh_timeout_seconds + 5.0)):
|
||||
data = _read_xai_oauth_tokens(_lock=False)
|
||||
tokens = dict(data["tokens"])
|
||||
access_token = str(tokens.get("access_token", "") or "").strip()
|
||||
discovery = dict(data.get("discovery") or {})
|
||||
token_endpoint = str(discovery.get("token_endpoint", "") or "").strip()
|
||||
redirect_uri = str(data.get("redirect_uri", "") or "").strip()
|
||||
should_refresh = bool(force_refresh)
|
||||
if (not should_refresh) and refresh_if_expiring:
|
||||
should_refresh = _xai_access_token_is_expiring(access_token, refresh_skew_seconds)
|
||||
if should_refresh:
|
||||
if not token_endpoint:
|
||||
token_endpoint = _xai_oauth_discovery(refresh_timeout_seconds)["token_endpoint"]
|
||||
tokens = _refresh_xai_oauth_tokens(
|
||||
tokens,
|
||||
token_endpoint=token_endpoint,
|
||||
redirect_uri=redirect_uri,
|
||||
timeout_seconds=refresh_timeout_seconds,
|
||||
)
|
||||
access_token = str(tokens.get("access_token", "") or "").strip()
|
||||
|
||||
base_url = (
|
||||
os.getenv("HERMES_XAI_BASE_URL", "").strip().rstrip("/")
|
||||
or os.getenv("XAI_BASE_URL", "").strip().rstrip("/")
|
||||
or DEFAULT_XAI_OAUTH_BASE_URL
|
||||
)
|
||||
return {
|
||||
"provider": "xai-oauth",
|
||||
"base_url": base_url,
|
||||
"api_key": access_token,
|
||||
"source": "hermes-auth-store",
|
||||
"last_refresh": data.get("last_refresh"),
|
||||
"auth_mode": "oauth_pkce",
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TLS verification helper
|
||||
# =============================================================================
|
||||
@@ -4030,6 +4595,48 @@ def get_codex_auth_status() -> Dict[str, Any]:
|
||||
}
|
||||
|
||||
|
||||
def get_xai_oauth_auth_status() -> Dict[str, Any]:
|
||||
try:
|
||||
from agent.credential_pool import load_pool
|
||||
|
||||
pool = load_pool("xai-oauth")
|
||||
if pool and pool.has_credentials():
|
||||
entry = pool.select()
|
||||
if entry is not None:
|
||||
api_key = (
|
||||
getattr(entry, "runtime_api_key", None)
|
||||
or getattr(entry, "access_token", "")
|
||||
)
|
||||
if api_key and not _xai_access_token_is_expiring(api_key, 0):
|
||||
return {
|
||||
"logged_in": True,
|
||||
"auth_store": str(_auth_file_path()),
|
||||
"last_refresh": getattr(entry, "last_refresh", None),
|
||||
"auth_mode": "oauth_pkce",
|
||||
"source": f"pool:{getattr(entry, 'label', 'unknown')}",
|
||||
"api_key": api_key,
|
||||
}
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
creds = resolve_xai_oauth_runtime_credentials()
|
||||
return {
|
||||
"logged_in": True,
|
||||
"auth_store": str(_auth_file_path()),
|
||||
"last_refresh": creds.get("last_refresh"),
|
||||
"auth_mode": creds.get("auth_mode"),
|
||||
"source": creds.get("source"),
|
||||
"api_key": creds.get("api_key"),
|
||||
}
|
||||
except AuthError as exc:
|
||||
return {
|
||||
"logged_in": False,
|
||||
"auth_store": str(_auth_file_path()),
|
||||
"error": str(exc),
|
||||
}
|
||||
|
||||
|
||||
def get_api_key_provider_status(provider_id: str) -> Dict[str, Any]:
|
||||
"""Status snapshot for API-key providers (z.ai, Kimi, MiniMax)."""
|
||||
pconfig = PROVIDER_REGISTRY.get(provider_id)
|
||||
@@ -4100,6 +4707,8 @@ def get_auth_status(provider_id: Optional[str] = None) -> Dict[str, Any]:
|
||||
return get_nous_auth_status()
|
||||
if target == "openai-codex":
|
||||
return get_codex_auth_status()
|
||||
if target == "xai-oauth":
|
||||
return get_xai_oauth_auth_status()
|
||||
if target == "qwen-oauth":
|
||||
return get_qwen_auth_status()
|
||||
if target == "google-gemini-cli":
|
||||
@@ -4320,7 +4929,7 @@ def _logout_default_provider_from_config() -> Optional[str]:
|
||||
"No provider is currently logged in" and never reset model.provider.
|
||||
"""
|
||||
provider = _get_config_provider()
|
||||
if provider in {"nous", "openai-codex"}:
|
||||
if provider in {"nous", "openai-codex", "xai-oauth"}:
|
||||
return provider
|
||||
return None
|
||||
|
||||
@@ -4619,6 +5228,247 @@ def _login_openai_codex(
|
||||
print(f" Config updated: {config_path} (model.provider=openai-codex)")
|
||||
|
||||
|
||||
def _login_xai_oauth(
|
||||
args,
|
||||
pconfig: ProviderConfig,
|
||||
*,
|
||||
force_new_login: bool = False,
|
||||
) -> None:
|
||||
del pconfig
|
||||
|
||||
if not force_new_login:
|
||||
try:
|
||||
existing = resolve_xai_oauth_runtime_credentials()
|
||||
api_key = existing.get("api_key", "")
|
||||
if isinstance(api_key, str) and api_key and not _xai_access_token_is_expiring(api_key, 60):
|
||||
print("Existing xAI OAuth credentials found in Hermes auth store.")
|
||||
try:
|
||||
reuse = input("Use existing credentials? [Y/n]: ").strip().lower()
|
||||
except (EOFError, KeyboardInterrupt):
|
||||
reuse = "y"
|
||||
if reuse in ("", "y", "yes"):
|
||||
config_path = _update_config_for_provider(
|
||||
"xai-oauth",
|
||||
existing.get("base_url", DEFAULT_XAI_OAUTH_BASE_URL),
|
||||
)
|
||||
print()
|
||||
print("Login successful!")
|
||||
print(f" Config updated: {config_path} (model.provider=xai-oauth)")
|
||||
return
|
||||
except AuthError:
|
||||
pass
|
||||
|
||||
print()
|
||||
print("Signing in to xAI Grok OAuth (SuperGrok Subscription)...")
|
||||
print("(Hermes creates its own local OAuth session)")
|
||||
print()
|
||||
|
||||
timeout_seconds = float(getattr(args, "timeout", None) or 20.0)
|
||||
open_browser = not getattr(args, "no_browser", False)
|
||||
if _is_remote_session():
|
||||
open_browser = False
|
||||
|
||||
creds = _xai_oauth_loopback_login(timeout_seconds=timeout_seconds, open_browser=open_browser)
|
||||
_save_xai_oauth_tokens(
|
||||
creds["tokens"],
|
||||
discovery=creds.get("discovery"),
|
||||
redirect_uri=creds.get("redirect_uri", ""),
|
||||
last_refresh=creds.get("last_refresh"),
|
||||
)
|
||||
config_path = _update_config_for_provider("xai-oauth", creds.get("base_url", DEFAULT_XAI_OAUTH_BASE_URL))
|
||||
print()
|
||||
print("Login successful!")
|
||||
from hermes_constants import display_hermes_home as _dhh
|
||||
print(f" Auth state: {_dhh()}/auth.json")
|
||||
print(f" Config updated: {config_path} (model.provider=xai-oauth)")
|
||||
|
||||
|
||||
def _xai_oauth_build_authorize_url(
|
||||
*,
|
||||
authorization_endpoint: str,
|
||||
redirect_uri: str,
|
||||
code_challenge: str,
|
||||
state: str,
|
||||
nonce: str,
|
||||
) -> str:
|
||||
# `plan=generic` opts the consent screen into xAI's generic OAuth plan
|
||||
# tier instead of falling back to the per-account default. Without it,
|
||||
# accounts.x.ai rejects loopback OAuth from non-allowlisted clients.
|
||||
# `referrer=hermes-agent` lets xAI attribute Hermes-originated logins
|
||||
# in their OAuth server logs (we still impersonate the upstream Grok-CLI
|
||||
# client_id; this is best-effort attribution until xAI mints us our own).
|
||||
authorize_params = {
|
||||
"response_type": "code",
|
||||
"client_id": XAI_OAUTH_CLIENT_ID,
|
||||
"redirect_uri": redirect_uri,
|
||||
"scope": XAI_OAUTH_SCOPE,
|
||||
"code_challenge": code_challenge,
|
||||
"code_challenge_method": "S256",
|
||||
"state": state,
|
||||
"nonce": nonce,
|
||||
"plan": "generic",
|
||||
"referrer": "hermes-agent",
|
||||
}
|
||||
return f"{authorization_endpoint}?{urlencode(authorize_params)}"
|
||||
|
||||
|
||||
def _xai_oauth_loopback_login(
|
||||
*,
|
||||
timeout_seconds: float = 20.0,
|
||||
open_browser: bool = True,
|
||||
) -> Dict[str, Any]:
|
||||
discovery = _xai_oauth_discovery(timeout_seconds)
|
||||
authorization_endpoint = discovery["authorization_endpoint"]
|
||||
token_endpoint = discovery["token_endpoint"]
|
||||
|
||||
server, thread, callback_result, redirect_uri = _xai_start_callback_server()
|
||||
try:
|
||||
_xai_validate_loopback_redirect_uri(redirect_uri)
|
||||
code_verifier = _oauth_pkce_code_verifier()
|
||||
code_challenge = _oauth_pkce_code_challenge(code_verifier)
|
||||
state = uuid.uuid4().hex
|
||||
nonce = uuid.uuid4().hex
|
||||
authorize_url = _xai_oauth_build_authorize_url(
|
||||
authorization_endpoint=authorization_endpoint,
|
||||
redirect_uri=redirect_uri,
|
||||
code_challenge=code_challenge,
|
||||
state=state,
|
||||
nonce=nonce,
|
||||
)
|
||||
|
||||
print("Open this URL to authorize Hermes with xAI:")
|
||||
print(authorize_url)
|
||||
print()
|
||||
print(f"Waiting for callback on {redirect_uri}")
|
||||
|
||||
_print_loopback_ssh_hint(redirect_uri, docs_url=XAI_OAUTH_DOCS_URL)
|
||||
|
||||
if open_browser and not _is_remote_session():
|
||||
try:
|
||||
opened = webbrowser.open(authorize_url)
|
||||
except Exception:
|
||||
opened = False
|
||||
if opened:
|
||||
print("Browser opened for xAI authorization.")
|
||||
else:
|
||||
print("Could not open the browser automatically; use the URL above.")
|
||||
|
||||
callback = _xai_wait_for_callback(
|
||||
server,
|
||||
thread,
|
||||
callback_result,
|
||||
timeout_seconds=max(30.0, timeout_seconds * 9),
|
||||
)
|
||||
except Exception:
|
||||
try:
|
||||
server.shutdown()
|
||||
server.server_close()
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
thread.join(timeout=1.0)
|
||||
except Exception:
|
||||
pass
|
||||
raise
|
||||
|
||||
if callback.get("error"):
|
||||
detail = callback.get("error_description") or callback["error"]
|
||||
raise AuthError(
|
||||
f"xAI authorization failed: {detail}",
|
||||
provider="xai-oauth",
|
||||
code="xai_authorization_failed",
|
||||
)
|
||||
if callback.get("state") != state:
|
||||
raise AuthError(
|
||||
"xAI authorization failed: state mismatch.",
|
||||
provider="xai-oauth",
|
||||
code="xai_state_mismatch",
|
||||
)
|
||||
code = str(callback.get("code") or "").strip()
|
||||
if not code:
|
||||
raise AuthError(
|
||||
"xAI authorization failed: missing authorization code.",
|
||||
provider="xai-oauth",
|
||||
code="xai_code_missing",
|
||||
)
|
||||
|
||||
try:
|
||||
response = httpx.post(
|
||||
token_endpoint,
|
||||
headers={"Content-Type": "application/x-www-form-urlencoded", "Accept": "application/json"},
|
||||
data={
|
||||
"grant_type": "authorization_code",
|
||||
"code": code,
|
||||
"redirect_uri": redirect_uri,
|
||||
"client_id": XAI_OAUTH_CLIENT_ID,
|
||||
"code_verifier": code_verifier,
|
||||
},
|
||||
timeout=max(20.0, timeout_seconds),
|
||||
)
|
||||
except Exception as exc:
|
||||
raise AuthError(
|
||||
f"xAI token exchange failed: {exc}",
|
||||
provider="xai-oauth",
|
||||
code="xai_token_exchange_failed",
|
||||
) from exc
|
||||
if response.status_code != 200:
|
||||
detail = response.text.strip()
|
||||
raise AuthError(
|
||||
"xAI token exchange failed."
|
||||
+ (f" Response: {detail}" if detail else ""),
|
||||
provider="xai-oauth",
|
||||
code="xai_token_exchange_failed",
|
||||
)
|
||||
try:
|
||||
payload = response.json()
|
||||
except Exception as exc:
|
||||
raise AuthError(
|
||||
f"xAI token exchange returned invalid JSON: {exc}",
|
||||
provider="xai-oauth",
|
||||
code="xai_token_exchange_invalid",
|
||||
) from exc
|
||||
if not isinstance(payload, dict):
|
||||
raise AuthError(
|
||||
"xAI token exchange response was not a JSON object.",
|
||||
provider="xai-oauth",
|
||||
code="xai_token_exchange_invalid",
|
||||
)
|
||||
access_token = str(payload.get("access_token", "") or "").strip()
|
||||
refresh_token = str(payload.get("refresh_token", "") or "").strip()
|
||||
if not access_token:
|
||||
raise AuthError(
|
||||
"xAI token exchange did not return an access_token.",
|
||||
provider="xai-oauth",
|
||||
code="xai_token_exchange_invalid",
|
||||
)
|
||||
if not refresh_token:
|
||||
raise AuthError(
|
||||
"xAI token exchange did not return a refresh_token.",
|
||||
provider="xai-oauth",
|
||||
code="xai_token_exchange_invalid",
|
||||
)
|
||||
|
||||
base_url = (
|
||||
os.getenv("HERMES_XAI_BASE_URL", "").strip().rstrip("/")
|
||||
or os.getenv("XAI_BASE_URL", "").strip().rstrip("/")
|
||||
or DEFAULT_XAI_OAUTH_BASE_URL
|
||||
)
|
||||
return {
|
||||
"tokens": {
|
||||
"access_token": access_token,
|
||||
"refresh_token": refresh_token,
|
||||
"id_token": str(payload.get("id_token", "") or "").strip(),
|
||||
"expires_in": payload.get("expires_in"),
|
||||
"token_type": str(payload.get("token_type") or "Bearer").strip() or "Bearer",
|
||||
},
|
||||
"discovery": discovery,
|
||||
"redirect_uri": redirect_uri,
|
||||
"base_url": base_url,
|
||||
"last_refresh": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
|
||||
"source": "oauth-loopback",
|
||||
}
|
||||
|
||||
|
||||
def _codex_device_code_login() -> Dict[str, Any]:
|
||||
"""Run the OpenAI device code login flow and return credentials dict."""
|
||||
import time as _time
|
||||
|
||||
@@ -33,7 +33,7 @@ from hermes_constants import OPENROUTER_BASE_URL
|
||||
|
||||
|
||||
# Providers that support OAuth login in addition to API keys.
|
||||
_OAUTH_CAPABLE_PROVIDERS = {"anthropic", "nous", "openai-codex", "qwen-oauth", "google-gemini-cli", "minimax-oauth"}
|
||||
_OAUTH_CAPABLE_PROVIDERS = {"anthropic", "nous", "openai-codex", "xai-oauth", "qwen-oauth", "google-gemini-cli", "minimax-oauth"}
|
||||
|
||||
|
||||
def _get_custom_provider_names() -> list:
|
||||
@@ -77,6 +77,8 @@ def _normalize_provider(provider: str) -> str:
|
||||
normalized = (provider or "").strip().lower()
|
||||
if normalized in {"or", "open-router"}:
|
||||
return "openrouter"
|
||||
if normalized in {"grok-oauth", "xai-oauth", "x-ai-oauth", "xai-grok-oauth"}:
|
||||
return "xai-oauth"
|
||||
# Check if it matches a custom provider name
|
||||
custom_key = _resolve_custom_provider_input(normalized)
|
||||
if custom_key:
|
||||
@@ -170,7 +172,7 @@ def auth_add_command(args) -> None:
|
||||
if provider.startswith(CUSTOM_POOL_PREFIX):
|
||||
requested_type = AUTH_TYPE_API_KEY
|
||||
else:
|
||||
requested_type = AUTH_TYPE_OAUTH if provider in {"anthropic", "nous", "openai-codex", "qwen-oauth", "google-gemini-cli", "minimax-oauth"} else AUTH_TYPE_API_KEY
|
||||
requested_type = AUTH_TYPE_OAUTH if provider in _OAUTH_CAPABLE_PROVIDERS else AUTH_TYPE_API_KEY
|
||||
|
||||
pool = load_pool(provider)
|
||||
|
||||
@@ -333,6 +335,31 @@ def auth_add_command(args) -> None:
|
||||
print(f'Added {provider} OAuth credential #{len(pool.entries())}: "{entry.label}"')
|
||||
return
|
||||
|
||||
if provider == "xai-oauth":
|
||||
creds = auth_mod._xai_oauth_loopback_login(
|
||||
timeout_seconds=getattr(args, "timeout", None) or 20.0,
|
||||
open_browser=not getattr(args, "no_browser", False),
|
||||
)
|
||||
label = (getattr(args, "label", None) or "").strip() or label_from_token(
|
||||
creds["tokens"]["access_token"],
|
||||
_oauth_default_label(provider, len(pool.entries()) + 1),
|
||||
)
|
||||
entry = PooledCredential(
|
||||
provider=provider,
|
||||
id=uuid.uuid4().hex[:6],
|
||||
label=label,
|
||||
auth_type=AUTH_TYPE_OAUTH,
|
||||
priority=0,
|
||||
source=f"{SOURCE_MANUAL}:xai_pkce",
|
||||
access_token=creds["tokens"]["access_token"],
|
||||
refresh_token=creds["tokens"].get("refresh_token"),
|
||||
base_url=creds.get("base_url"),
|
||||
last_refresh=creds.get("last_refresh"),
|
||||
)
|
||||
pool.add_entry(entry)
|
||||
print(f'Added {provider} OAuth credential #{len(pool.entries())}: "{entry.label}"')
|
||||
return
|
||||
|
||||
if provider == "google-gemini-cli":
|
||||
from agent.google_oauth import run_gemini_oauth_login_pure
|
||||
|
||||
|
||||
+48
-2
@@ -175,6 +175,48 @@ def _check_via_local_git(repo_dir: Path) -> Optional[int]:
|
||||
return None
|
||||
|
||||
|
||||
def _version_tuple(v: str) -> tuple[int, ...]:
|
||||
"""Parse '0.13.0' into (0, 13, 0) for comparison. Non-numeric segments become 0."""
|
||||
parts = []
|
||||
for segment in v.split("."):
|
||||
try:
|
||||
parts.append(int(segment))
|
||||
except ValueError:
|
||||
parts.append(0)
|
||||
return tuple(parts)
|
||||
|
||||
|
||||
def _fetch_pypi_latest(package: str = "hermes-agent") -> Optional[str]:
|
||||
"""Fetch the latest version of a package from PyPI. Returns None on failure."""
|
||||
try:
|
||||
import urllib.request
|
||||
url = f"https://pypi.org/pypi/{package}/json"
|
||||
req = urllib.request.Request(url, headers={"Accept": "application/json"})
|
||||
with urllib.request.urlopen(req, timeout=5) as resp:
|
||||
data = json.loads(resp.read())
|
||||
return data.get("info", {}).get("version")
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def check_via_pypi() -> Optional[int]:
|
||||
"""Compare installed version against PyPI latest.
|
||||
|
||||
Returns 0 if up-to-date, 1 if behind, None on failure.
|
||||
"""
|
||||
latest = _fetch_pypi_latest()
|
||||
if latest is None:
|
||||
return None
|
||||
if latest == VERSION:
|
||||
return 0
|
||||
try:
|
||||
if _version_tuple(latest) > _version_tuple(VERSION):
|
||||
return 1
|
||||
return 0
|
||||
except Exception:
|
||||
return 1 if latest != VERSION else 0
|
||||
|
||||
|
||||
def check_for_updates() -> Optional[int]:
|
||||
"""Check whether a Hermes update is available.
|
||||
|
||||
@@ -213,8 +255,9 @@ def check_for_updates() -> Optional[int]:
|
||||
if not (repo_dir / ".git").exists():
|
||||
repo_dir = hermes_home / "hermes-agent"
|
||||
if not (repo_dir / ".git").exists():
|
||||
return None
|
||||
behind = _check_via_local_git(repo_dir)
|
||||
behind = check_via_pypi()
|
||||
else:
|
||||
behind = _check_via_local_git(repo_dir)
|
||||
|
||||
try:
|
||||
cache_file.write_text(json.dumps({"ts": now, "behind": behind, "rev": embedded_rev}))
|
||||
@@ -470,6 +513,9 @@ def build_welcome_banner(console: Console, model: str, cwd: str,
|
||||
model_short = model_short[:25] + "..."
|
||||
ctx_str = f" [dim {dim}]·[/] [dim {dim}]{_format_context_length(context_length)} context[/]" if context_length else ""
|
||||
left_lines.append(f"[{accent}]{model_short}[/]{ctx_str} [dim {dim}]·[/] [dim {dim}]Nous Research[/]")
|
||||
|
||||
if os.getenv("HERMES_YOLO_MODE"):
|
||||
left_lines.append(f"[bold red]⚠ YOLO mode[/] [dim {dim}]— all approval prompts bypassed[/]")
|
||||
left_lines.append(f"[dim {dim}]{cwd}[/]")
|
||||
if session_id:
|
||||
left_lines.append(f"[dim {session_color}]Session: {session_id}[/]")
|
||||
|
||||
@@ -304,6 +304,103 @@ def render_codex_toml_section(
|
||||
return "\n".join(out) + "\n"
|
||||
|
||||
|
||||
def _insert_managed_block_at_top_level(user_text: str, managed_block: str) -> str:
|
||||
"""Insert Hermes' managed Codex TOML block while keeping root keys root-scoped.
|
||||
|
||||
TOML has no syntax to return to the document root after a table header.
|
||||
Therefore appending a root key like `default_permissions = ...` after a
|
||||
user table such as `[features]` actually creates `features.default_permissions`,
|
||||
which Codex rejects. Insert the managed block before the first table header
|
||||
so its root keys remain top-level, while preserving user content verbatim.
|
||||
"""
|
||||
if not user_text.strip():
|
||||
return managed_block
|
||||
|
||||
lines = user_text.splitlines(keepends=True)
|
||||
first_table_idx: Optional[int] = None
|
||||
for idx, line in enumerate(lines):
|
||||
stripped = line.lstrip()
|
||||
if stripped.startswith("["):
|
||||
first_table_idx = idx
|
||||
break
|
||||
|
||||
if first_table_idx is None:
|
||||
prefix = user_text.rstrip("\n")
|
||||
return f"{prefix}\n\n{managed_block}" if prefix else managed_block
|
||||
|
||||
prefix = "".join(lines[:first_table_idx]).rstrip("\n")
|
||||
suffix = "".join(lines[first_table_idx:]).lstrip("\n")
|
||||
if prefix:
|
||||
return f"{prefix}\n\n{managed_block}\n{suffix}"
|
||||
return f"{managed_block}\n{suffix}"
|
||||
|
||||
|
||||
def _strip_unmanaged_plugin_tables(toml_text: str) -> str:
|
||||
"""Remove ``[plugins."<name>@<marketplace>"]`` tables that live OUTSIDE the
|
||||
managed block.
|
||||
|
||||
Codex itself writes these tables when the user runs ``codex plugins enable``
|
||||
directly (i.e. before Hermes' migrate has ever touched the file). When we
|
||||
later run migrate, ``_query_codex_plugins()`` reports the same plugins via
|
||||
the live ``plugin/list`` RPC and we re-emit them inside the managed block.
|
||||
The result without this strip is duplicate ``[plugins."X@Y"]`` table
|
||||
headers — codex's strict TOML parser then refuses to load the file.
|
||||
|
||||
We own the ``[plugins.*]`` namespace once migrate has run, so dropping any
|
||||
pre-existing ``[plugins.*]`` tables is safe: ``plugin/list`` is the source
|
||||
of truth for what's actually installed. The caller is expected to only
|
||||
invoke this strip when ``plugin/list`` succeeded — otherwise we'd lose
|
||||
plugins the user installed via ``codex`` without a way to re-emit them.
|
||||
|
||||
Behavior:
|
||||
* Lines beginning with ``[plugins.`` start a swallow region that ends at
|
||||
the next non-``[plugins.`` table header or end-of-file.
|
||||
* Content inside the managed block is untouched (callers should run
|
||||
``_strip_existing_managed_block`` first so the managed block has
|
||||
already been removed when this runs).
|
||||
"""
|
||||
lines = toml_text.splitlines(keepends=True)
|
||||
out: list[str] = []
|
||||
in_plugin_table = False
|
||||
for line in lines:
|
||||
stripped = line.lstrip()
|
||||
# Only treat a line as a table header when it has the shape
|
||||
# ``[...]`` (optionally followed by a comment). Multi-line array
|
||||
# continuations like ``["nested"],`` also start with ``[`` after
|
||||
# lstrip but are not headers — without this guard they would
|
||||
# falsely flip ``in_plugin_table`` to False mid-table and leak
|
||||
# array fragments into the output.
|
||||
if _looks_like_table_header(stripped):
|
||||
in_plugin_table = stripped.startswith("[plugins.")
|
||||
if in_plugin_table:
|
||||
continue
|
||||
if in_plugin_table:
|
||||
# Swallow keys/comments/blanks until the next table header.
|
||||
continue
|
||||
out.append(line)
|
||||
return "".join(out)
|
||||
|
||||
|
||||
def _looks_like_table_header(stripped_line: str) -> bool:
|
||||
"""Return True if ``stripped_line`` is a TOML table header.
|
||||
|
||||
A header has the shape ``[name]`` or ``[[name]]`` (array-of-tables),
|
||||
optionally followed by a comment. The closing ``]`` (or ``]]``) must
|
||||
appear on the same line, and no key-assignment ``=`` can precede it.
|
||||
This distinguishes real headers from multi-line array continuation
|
||||
lines that also start with ``[`` after ``lstrip()``.
|
||||
"""
|
||||
if not stripped_line.startswith("["):
|
||||
return False
|
||||
# Drop trailing comment so e.g. ``[features] # note`` still matches.
|
||||
head = stripped_line.split("#", 1)[0].rstrip()
|
||||
if not head.endswith("]"):
|
||||
return False
|
||||
# ``key = [x]`` would have an ``=`` before the bracket; a header doesn't.
|
||||
bracket_idx = head.index("]")
|
||||
return "=" not in head[: bracket_idx + 1]
|
||||
|
||||
|
||||
def _strip_existing_managed_block(toml_text: str) -> str:
|
||||
"""Remove any prior managed section so re-runs idempotently replace it.
|
||||
|
||||
@@ -431,6 +528,32 @@ def _query_codex_plugins(
|
||||
return out, None
|
||||
|
||||
|
||||
def _looks_like_test_tempdir(path: str) -> bool:
|
||||
"""Heuristic: does ``path`` look like a pytest/transient tempdir?
|
||||
|
||||
pytest tempdirs live under ``pytest-of-<user>/pytest-<n>/`` (created via
|
||||
``tmp_path`` / ``tmp_path_factory``) and are reaped between sessions.
|
||||
macOS routes ``/tmp`` through ``/private/var/folders/<…>/T`` which is
|
||||
what pytest's tempdir factory uses by default. If a HERMES_HOME pointing
|
||||
at one of those paths is burned into ``~/.codex/config.toml``, every
|
||||
codex-routed hermes-tools call fails silently once the directory is GC'd.
|
||||
|
||||
We err on the side of refusing — losing a (very unlikely) real
|
||||
``~/.hermes`` symlink that happens to live under ``/private/var/folders``
|
||||
is much less harmful than silently bricking codex's tool surface.
|
||||
"""
|
||||
if not path:
|
||||
return False
|
||||
needles = (
|
||||
"pytest-of-",
|
||||
"/pytest-",
|
||||
"/tmp/pytest",
|
||||
"/private/var/folders/", # macOS tempdir root
|
||||
)
|
||||
normalized = path.lower()
|
||||
return any(needle in normalized for needle in needles)
|
||||
|
||||
|
||||
def _build_hermes_tools_mcp_entry() -> dict:
|
||||
"""Build the codex stdio-transport entry that launches Hermes' own
|
||||
tool surface as an MCP server. Codex's subprocess will call back into
|
||||
@@ -443,9 +566,22 @@ def _build_hermes_tools_mcp_entry() -> dict:
|
||||
import sys
|
||||
|
||||
env: dict[str, str] = {}
|
||||
# HERMES_HOME passes through if set so the MCP subprocess sees the
|
||||
# same config / auth / sessions DB as the parent CLI.
|
||||
hermes_home = os.environ.get("HERMES_HOME")
|
||||
# HERMES_HOME passes through IF SET so the MCP subprocess sees the same
|
||||
# config / auth / sessions DB as the parent CLI. Read from os.environ
|
||||
# (not get_hermes_home()) on purpose: when the env var is unset we want
|
||||
# codex's subprocess to inherit whatever HERMES_HOME its launcher sets
|
||||
# at runtime (systemd unit, gateway, kanban dispatcher, custom shell),
|
||||
# rather than burning the migrate-time resolved default into config.toml
|
||||
# — that would override the launcher's HERMES_HOME and pin the subprocess
|
||||
# to the wrong profile.
|
||||
#
|
||||
# The pytest-tempdir guard below catches the issue #26250 Bug C scenario:
|
||||
# a sibling test's monkeypatch.setenv("HERMES_HOME", tmp_path) would
|
||||
# otherwise leak a transient pytest tempdir into the user's real
|
||||
# ~/.codex/config.toml and silently brick codex once the tempdir is GC'd.
|
||||
hermes_home = os.environ.get("HERMES_HOME") or ""
|
||||
if hermes_home and _looks_like_test_tempdir(hermes_home):
|
||||
hermes_home = ""
|
||||
if hermes_home:
|
||||
env["HERMES_HOME"] = hermes_home
|
||||
# PYTHONPATH passes through so a worktree-launched hermes finds the
|
||||
@@ -533,10 +669,16 @@ def migrate(
|
||||
# Discover installed Codex curated plugins. Best-effort — never blocks
|
||||
# the migration if codex is unreachable or the RPC fails.
|
||||
plugins: list[dict] = []
|
||||
plugin_query_succeeded = False
|
||||
if discover_plugins and not dry_run:
|
||||
plugins, plugin_err = _query_codex_plugins(codex_home=codex_home)
|
||||
if plugin_err:
|
||||
report.plugin_query_error = plugin_err
|
||||
else:
|
||||
# plugin/list returned authoritatively (even if the list is empty).
|
||||
# That means we own [plugins.*] for this re-render and can safely
|
||||
# strip any pre-existing tables outside the managed block.
|
||||
plugin_query_succeeded = True
|
||||
for p in plugins:
|
||||
report.migrated_plugins.append(f"{p['name']}@{p['marketplace']}")
|
||||
|
||||
@@ -571,14 +713,15 @@ def migrate(
|
||||
report.errors.append(f"could not read {target}: {exc}")
|
||||
return report
|
||||
without_managed = _strip_existing_managed_block(existing)
|
||||
# Ensure exactly one blank line between user content and managed block
|
||||
if without_managed and not without_managed.endswith("\n"):
|
||||
without_managed += "\n"
|
||||
new_text = (
|
||||
without_managed.rstrip("\n") + "\n\n" + managed_block
|
||||
if without_managed.strip()
|
||||
else managed_block
|
||||
)
|
||||
# Bug B: when plugin/list ran authoritatively, codex's own
|
||||
# [plugins."<name>@<marketplace>"] tables outside our managed block
|
||||
# would survive _strip_existing_managed_block and then collide with
|
||||
# the entries we re-emit inside the managed block — producing
|
||||
# duplicate-table-header parse errors on codex's next startup. Drop
|
||||
# those pre-existing tables since plugin/list is the source of truth.
|
||||
if plugin_query_succeeded:
|
||||
without_managed = _strip_unmanaged_plugin_tables(without_managed)
|
||||
new_text = _insert_managed_block_at_top_level(without_managed, managed_block)
|
||||
else:
|
||||
new_text = managed_block
|
||||
|
||||
|
||||
@@ -198,6 +198,8 @@ COMMAND_REGISTRY: list[CommandDef] = [
|
||||
args_hint="[days]"),
|
||||
CommandDef("platforms", "Show gateway/messaging platform status", "Info",
|
||||
cli_only=True, aliases=("gateway",)),
|
||||
CommandDef("platform", "Pause, resume, or list a failing gateway platform", "Info",
|
||||
gateway_only=True, args_hint="<pause|resume|list> [name]"),
|
||||
CommandDef("copy", "Copy the last assistant response to clipboard", "Info",
|
||||
cli_only=True, args_hint="[number]"),
|
||||
CommandDef("paste", "Attach clipboard image from your clipboard", "Info",
|
||||
|
||||
+60
-20
@@ -199,9 +199,40 @@ def get_managed_update_command() -> Optional[str]:
|
||||
return None
|
||||
|
||||
|
||||
def detect_install_method(project_root: Optional[Path] = None) -> str:
|
||||
"""Detect how Hermes was installed: 'nixos', 'homebrew', 'git', or 'pip'."""
|
||||
managed = get_managed_system()
|
||||
if managed:
|
||||
return managed.lower().replace(" ", "-")
|
||||
if project_root is None:
|
||||
project_root = Path(__file__).parent.parent.resolve()
|
||||
if (project_root / ".git").is_dir():
|
||||
return "git"
|
||||
return "pip"
|
||||
|
||||
|
||||
def recommended_update_command_for_method(method: str) -> str:
|
||||
"""Return the update command for a given install method."""
|
||||
if method == "nixos":
|
||||
return "sudo nixos-rebuild switch"
|
||||
if method == "homebrew":
|
||||
return "brew upgrade hermes-agent"
|
||||
if method == "pip":
|
||||
import shutil
|
||||
uv = shutil.which("uv")
|
||||
if uv:
|
||||
return "uv pip install --upgrade hermes-agent"
|
||||
return "pip install --upgrade hermes-agent"
|
||||
return "hermes update"
|
||||
|
||||
|
||||
def recommended_update_command() -> str:
|
||||
"""Return the best update command for the current installation."""
|
||||
return get_managed_update_command() or "hermes update"
|
||||
managed_cmd = get_managed_update_command()
|
||||
if managed_cmd:
|
||||
return managed_cmd
|
||||
method = detect_install_method()
|
||||
return recommended_update_command_for_method(method)
|
||||
|
||||
|
||||
def format_managed_message(action: str = "modify this Hermes installation") -> str:
|
||||
@@ -401,7 +432,10 @@ def ensure_hermes_home():
|
||||
else:
|
||||
home.mkdir(parents=True, exist_ok=True)
|
||||
_secure_dir(home)
|
||||
for subdir in ("cron", "sessions", "logs", "logs/curator", "memories"):
|
||||
for subdir in (
|
||||
"cron", "sessions", "logs", "logs/curator", "memories",
|
||||
"pairing", "hooks", "image_cache", "audio_cache", "skills",
|
||||
):
|
||||
d = home / subdir
|
||||
d.mkdir(parents=True, exist_ok=True)
|
||||
_secure_dir(d)
|
||||
@@ -1112,6 +1146,10 @@ DEFAULT_CONFIG = {
|
||||
"provider": "", # e.g. "openrouter" (empty = inherit parent provider + credentials)
|
||||
"base_url": "", # direct OpenAI-compatible endpoint for subagents
|
||||
"api_key": "", # API key for delegation.base_url (falls back to OPENAI_API_KEY)
|
||||
"api_mode": "", # wire protocol for delegation.base_url: "chat_completions",
|
||||
# "codex_responses", or "anthropic_messages". Empty = auto-detect
|
||||
# from URL (e.g. /anthropic suffix → anthropic_messages). Set this
|
||||
# explicitly for non-standard endpoints the heuristic can't detect.
|
||||
# When delegate_task narrows child toolsets explicitly, preserve any
|
||||
# MCP toolsets the parent already has enabled. On by default so
|
||||
# narrowing (e.g. toolsets=["web","browser"]) expresses "I want these
|
||||
@@ -1251,6 +1289,8 @@ DEFAULT_CONFIG = {
|
||||
"allowed_channels": "", # If set, bot ONLY responds in these channel IDs (whitelist)
|
||||
"auto_thread": True, # Auto-create threads on @mention in channels (like Slack)
|
||||
"thread_require_mention": False, # If True, require @mention in threads too (multi-bot threads)
|
||||
"history_backfill": True, # If True, prepend recent channel scrollback when bot is triggered (recovers messages missed while require_mention gated them out)
|
||||
"history_backfill_limit": 50, # Max number of recent messages to scan when assembling the backfill block
|
||||
"reactions": True, # Add 👀/✅/❌ reactions to messages during processing
|
||||
"channel_prompts": {}, # Per-channel ephemeral system prompts (forum parents apply to child threads)
|
||||
# Opt-in DM role-based auth (#12136). By default, DISCORD_ALLOWED_ROLES
|
||||
@@ -1567,6 +1607,23 @@ DEFAULT_CONFIG = {
|
||||
"servers": {},
|
||||
},
|
||||
|
||||
# X (Twitter) Search via xAI's built-in x_search Responses tool.
|
||||
# The tool registers when xAI credentials are available (SuperGrok
|
||||
# OAuth or XAI_API_KEY) AND the x_search toolset is enabled in
|
||||
# `hermes tools`. These settings tune the backing Responses API call.
|
||||
"x_search": {
|
||||
# xAI model used for the Responses call. grok-4.20-reasoning is
|
||||
# the recommended default; any Grok model with x_search tool
|
||||
# access works.
|
||||
"model": "grok-4.20-reasoning",
|
||||
# Request timeout in seconds (minimum 30). x_search can take
|
||||
# 60-120s for complex queries — the default is generous.
|
||||
"timeout_seconds": 180,
|
||||
# Number of automatic retries on 5xx / ReadTimeout / ConnectionError.
|
||||
# Each retry backs off (1.5x attempt seconds, capped at 5s).
|
||||
"retries": 2,
|
||||
},
|
||||
|
||||
# Config schema version - bump this when adding new required fields
|
||||
"_config_version": 23,
|
||||
}
|
||||
@@ -2136,22 +2193,6 @@ OPTIONAL_ENV_VARS = {
|
||||
"password": True,
|
||||
"category": "tool",
|
||||
},
|
||||
"TINKER_API_KEY": {
|
||||
"description": "Tinker API key for RL training",
|
||||
"prompt": "Tinker API key",
|
||||
"url": "https://tinker-console.thinkingmachines.ai/keys",
|
||||
"tools": ["rl_start_training", "rl_check_status", "rl_stop_training"],
|
||||
"password": True,
|
||||
"category": "tool",
|
||||
},
|
||||
"WANDB_API_KEY": {
|
||||
"description": "Weights & Biases API key for experiment tracking",
|
||||
"prompt": "WandB API key",
|
||||
"url": "https://wandb.ai/authorize",
|
||||
"tools": ["rl_get_results", "rl_check_status"],
|
||||
"password": True,
|
||||
"category": "tool",
|
||||
},
|
||||
"VOICE_TOOLS_OPENAI_KEY": {
|
||||
"description": "OpenAI API key for voice transcription (Whisper) and OpenAI TTS",
|
||||
"prompt": "OpenAI API Key (for Whisper STT + TTS)",
|
||||
@@ -4988,8 +5029,7 @@ def set_config_value(key: str, value: str):
|
||||
'FAL_KEY', 'TELEGRAM_BOT_TOKEN', 'DISCORD_BOT_TOKEN',
|
||||
'TERMINAL_SSH_HOST', 'TERMINAL_SSH_USER', 'TERMINAL_SSH_KEY',
|
||||
'SUDO_PASSWORD', 'SLACK_BOT_TOKEN', 'SLACK_APP_TOKEN',
|
||||
'GITHUB_TOKEN', 'HONCHO_API_KEY', 'WANDB_API_KEY',
|
||||
'TINKER_API_KEY',
|
||||
'GITHUB_TOKEN', 'HONCHO_API_KEY',
|
||||
]
|
||||
|
||||
if key.upper() in api_keys or key.upper().endswith(('_API_KEY', '_TOKEN')) or key.upper().startswith('TERMINAL_SSH'):
|
||||
|
||||
+8
-2
@@ -196,9 +196,15 @@ def cron_create(args):
|
||||
|
||||
|
||||
def cron_edit(args):
|
||||
from cron.jobs import get_job
|
||||
from cron.jobs import AmbiguousJobReference, resolve_job_ref
|
||||
|
||||
job = get_job(args.job_id)
|
||||
try:
|
||||
job = resolve_job_ref(args.job_id)
|
||||
except AmbiguousJobReference as exc:
|
||||
print(color(str(exc), Colors.RED))
|
||||
for m in exc.matches:
|
||||
print(f" {m['id']} (name: {m.get('name')!r})")
|
||||
return 1
|
||||
if not job:
|
||||
print(color(f"Job not found: {args.job_id}", Colors.RED))
|
||||
return 1
|
||||
|
||||
@@ -0,0 +1,106 @@
|
||||
"""Lazy dependency bootstrapper for non-Python runtime deps.
|
||||
|
||||
Detection and prompting live here in Python — not in install.sh — because:
|
||||
1. shutil.which() works on every platform; install.sh needs bash.
|
||||
2. Detection is instant; spawning bash for a "is node installed?" check is waste.
|
||||
3. Python controls the UX (rich prompts, non-interactive fallback, TTY detection).
|
||||
|
||||
install.sh is still the *installation* backend because it has 1900 lines of
|
||||
battle-tested OS detection and package-manager logic (apt/brew/pacman/dnf/
|
||||
zypper/Termux/…). Reimplementing that in Python would be huge duplication.
|
||||
|
||||
Deps that degrade gracefully (ripgrep → grep fallback, ffmpeg → skip conversion)
|
||||
don't need ensure_dependency wired in — only hard-fail sites do (TUI needs node,
|
||||
browser tool needs agent-browser).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
_DEP_CHECKS = {
|
||||
"node": lambda: shutil.which("node") is not None,
|
||||
"browser": lambda: (
|
||||
shutil.which("agent-browser") is not None
|
||||
or _has_system_browser()
|
||||
or _has_hermes_agent_browser()
|
||||
),
|
||||
"ripgrep": lambda: shutil.which("rg") is not None,
|
||||
"ffmpeg": lambda: shutil.which("ffmpeg") is not None,
|
||||
}
|
||||
|
||||
_DEP_DESCRIPTIONS = {
|
||||
"node": "Node.js (required for browser tools and TUI)",
|
||||
"browser": "Browser engine (Chromium, for web browsing tools)",
|
||||
"ripgrep": "ripgrep (fast file search)",
|
||||
"ffmpeg": "ffmpeg (TTS voice messages)",
|
||||
}
|
||||
|
||||
|
||||
def _has_system_browser() -> bool:
|
||||
for name in ("google-chrome", "google-chrome-stable", "chromium", "chromium-browser", "chrome"):
|
||||
if shutil.which(name):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _has_hermes_agent_browser() -> bool:
|
||||
from hermes_constants import get_hermes_home
|
||||
return (get_hermes_home() / "node_modules" / ".bin" / "agent-browser").is_file()
|
||||
|
||||
|
||||
def _find_install_script(
|
||||
package_dir: Path | None = None,
|
||||
repo_root: Path | None = None,
|
||||
) -> Path | None:
|
||||
"""Locate install.sh — bundled in wheel or in git checkout."""
|
||||
if package_dir is None:
|
||||
package_dir = Path(__file__).parent
|
||||
if repo_root is None:
|
||||
repo_root = package_dir.parent
|
||||
|
||||
bundled = package_dir / "scripts" / "install.sh"
|
||||
if bundled.is_file():
|
||||
return bundled
|
||||
repo = repo_root / "scripts" / "install.sh"
|
||||
if repo.is_file():
|
||||
return repo
|
||||
return None
|
||||
|
||||
|
||||
def ensure_dependency(dep: str, interactive: bool = True) -> bool:
|
||||
"""Ensure a non-Python dependency is available. Returns True if available."""
|
||||
check = _DEP_CHECKS.get(dep)
|
||||
if check and check():
|
||||
return True
|
||||
|
||||
script = _find_install_script()
|
||||
if script is None:
|
||||
if interactive:
|
||||
desc = _DEP_DESCRIPTIONS.get(dep, dep)
|
||||
print(f" {desc} is not installed and install.sh was not found.")
|
||||
print(f" Install {dep} manually and try again.")
|
||||
return False
|
||||
|
||||
if interactive and sys.stdin.isatty():
|
||||
desc = _DEP_DESCRIPTIONS.get(dep, dep)
|
||||
try:
|
||||
reply = input(f"{desc} is not installed. Install now? [Y/n] ").strip().lower()
|
||||
except (EOFError, KeyboardInterrupt):
|
||||
return False
|
||||
if reply not in ("", "y", "yes"):
|
||||
return False
|
||||
|
||||
result = subprocess.run(
|
||||
["bash", str(script), "--ensure", dep],
|
||||
env={**os.environ, "IS_INTERACTIVE": "false"},
|
||||
)
|
||||
if result.returncode != 0:
|
||||
return False
|
||||
|
||||
if check:
|
||||
return check()
|
||||
return True
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user