Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 1e6285c53d |
@@ -52,6 +52,10 @@ ignored/
|
||||
.worktrees/
|
||||
environments/benchmarks/evals/
|
||||
|
||||
# Compression eval run outputs (harness lives in scripts/compression_eval/)
|
||||
scripts/compression_eval/results/*
|
||||
!scripts/compression_eval/results/.gitkeep
|
||||
|
||||
# Web UI build output
|
||||
hermes_cli/web_dist/
|
||||
|
||||
|
||||
@@ -240,19 +240,6 @@ npm run fmt # prettier
|
||||
npm test # vitest
|
||||
```
|
||||
|
||||
### TUI in the Dashboard (`hermes dashboard` → `/chat`)
|
||||
|
||||
The dashboard embeds the real `hermes --tui` — **not** a rewrite. See `hermes_cli/pty_bridge.py` + the `@app.websocket("/api/pty")` endpoint in `hermes_cli/web_server.py`.
|
||||
|
||||
- Browser loads `web/src/pages/ChatPage.tsx`, which mounts xterm.js's `Terminal` with the WebGL renderer, `@xterm/addon-fit` for container-driven resize, and `@xterm/addon-unicode11` for modern wide-character widths.
|
||||
- `/api/pty?token=…` upgrades to a WebSocket; auth uses the same ephemeral `_SESSION_TOKEN` as REST, via query param (browsers can't set `Authorization` on WS upgrade).
|
||||
- The server spawns whatever `hermes --tui` would spawn, through `ptyprocess` (POSIX PTY — WSL works, native Windows does not).
|
||||
- Frames: raw PTY bytes each direction; resize via `\x1b[RESIZE:<cols>;<rows>]` intercepted on the server and applied with `TIOCSWINSZ`.
|
||||
|
||||
**Do not re-implement the primary chat experience in React.** The main transcript, composer/input flow (including slash-command behavior), and PTY-backed terminal belong to the embedded `hermes --tui` — anything new you add to Ink shows up in the dashboard automatically. If you find yourself rebuilding the transcript or composer for the dashboard, stop and extend Ink instead.
|
||||
|
||||
**Structured React UI around the TUI is allowed when it is not a second chat surface.** Sidebar widgets, inspectors, summaries, status panels, and similar supporting views (e.g. `ChatSidebar`, `ModelPickerDialog`, `ToolCall`) are fine when they complement the embedded TUI rather than replacing the transcript / composer / terminal. Keep their state independent of the PTY child's session and surface their failures non-destructively so the terminal pane keeps working unimpaired.
|
||||
|
||||
---
|
||||
|
||||
## Adding New Tools
|
||||
|
||||
@@ -986,26 +986,6 @@ def read_hermes_oauth_credentials() -> Optional[Dict[str, Any]]:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _is_bedrock_model_id(model: str) -> bool:
|
||||
"""Detect AWS Bedrock model IDs that use dots as namespace separators.
|
||||
|
||||
Bedrock model IDs come in two forms:
|
||||
- Bare: ``anthropic.claude-opus-4-7``
|
||||
- Regional (inference profiles): ``us.anthropic.claude-sonnet-4-5-v1:0``
|
||||
|
||||
In both cases the dots separate namespace components, not version
|
||||
numbers, and must be preserved verbatim for the Bedrock API.
|
||||
"""
|
||||
lower = model.lower()
|
||||
# Regional inference-profile prefixes
|
||||
if any(lower.startswith(p) for p in ("global.", "us.", "eu.", "ap.", "jp.")):
|
||||
return True
|
||||
# Bare Bedrock model IDs: provider.model-family
|
||||
if lower.startswith("anthropic."):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def normalize_model_name(model: str, preserve_dots: bool = False) -> str:
|
||||
"""Normalize a model name for the Anthropic API.
|
||||
|
||||
@@ -1013,19 +993,11 @@ def normalize_model_name(model: str, preserve_dots: bool = False) -> str:
|
||||
- Converts dots to hyphens in version numbers (OpenRouter uses dots,
|
||||
Anthropic uses hyphens: claude-opus-4.6 → claude-opus-4-6), unless
|
||||
preserve_dots is True (e.g. for Alibaba/DashScope: qwen3.5-plus).
|
||||
- Preserves Bedrock model IDs (``anthropic.claude-opus-4-7``) and
|
||||
regional inference profiles (``us.anthropic.claude-*``) whose dots
|
||||
are namespace separators, not version separators.
|
||||
"""
|
||||
lower = model.lower()
|
||||
if lower.startswith("anthropic/"):
|
||||
model = model[len("anthropic/"):]
|
||||
if not preserve_dots:
|
||||
# Bedrock model IDs use dots as namespace separators
|
||||
# (e.g. "anthropic.claude-opus-4-7", "us.anthropic.claude-*").
|
||||
# These must not be converted to hyphens. See issue #12295.
|
||||
if _is_bedrock_model_id(model):
|
||||
return model
|
||||
# OpenRouter uses dots for version separators (claude-opus-4.6),
|
||||
# Anthropic uses hyphens (claude-opus-4-6). Convert dots to hyphens.
|
||||
model = model.replace(".", "-")
|
||||
@@ -1680,9 +1652,9 @@ def build_anthropic_kwargs(
|
||||
|
||||
# ── Strip sampling params on 4.7+ ─────────────────────────────────
|
||||
# Opus 4.7 rejects any non-default temperature/top_p/top_k with a 400.
|
||||
# Callers (auxiliary_client, etc.) may set these for older models;
|
||||
# drop them here as a safety net so upstream 4.6 → 4.7 migrations
|
||||
# don't require coordinated edits everywhere.
|
||||
# Callers (auxiliary_client, flush_memories, etc.) may set these for
|
||||
# older models; drop them here as a safety net so upstream 4.6 → 4.7
|
||||
# migrations don't require coordinated edits everywhere.
|
||||
if _forbids_sampling_params(model):
|
||||
for _sampling_key in ("temperature", "top_p", "top_k"):
|
||||
kwargs.pop(_sampling_key, None)
|
||||
|
||||
+7
-142
@@ -390,7 +390,7 @@ class _CodexCompletionsAdapter:
|
||||
# Note: the Codex endpoint (chatgpt.com/backend-api/codex) does NOT
|
||||
# support max_output_tokens or temperature — omit to avoid 400 errors.
|
||||
|
||||
# Tools support for auxiliary callers (e.g. skills_hub) that pass function schemas
|
||||
# Tools support for flush_memories and similar callers
|
||||
tools = kwargs.get("tools")
|
||||
if tools:
|
||||
converted = []
|
||||
@@ -1349,49 +1349,6 @@ def _is_auth_error(exc: Exception) -> bool:
|
||||
return "error code: 401" in err_lower or "authenticationerror" in type(exc).__name__.lower()
|
||||
|
||||
|
||||
def _is_unsupported_parameter_error(exc: Exception, param: str) -> bool:
|
||||
"""Detect provider 400s for an unsupported request parameter.
|
||||
|
||||
Different OpenAI-compatible endpoints phrase the same class of error a few
|
||||
ways: ``Unsupported parameter: X``, ``unsupported_parameter`` with a
|
||||
``param`` field, ``X is not supported``, ``unknown parameter: X``,
|
||||
``unrecognized request argument: X``. We match on both the parameter
|
||||
name and a generic "unsupported/unknown/unrecognized parameter" marker so
|
||||
call sites can reactively retry without the offending key instead of
|
||||
surfacing a noisy auxiliary failure.
|
||||
|
||||
Generalizes the temperature-specific detector that originally shipped
|
||||
with PR #15621 so the same retry strategy can cover ``max_tokens``,
|
||||
``seed``, ``top_p``, and any future quirk. Credit @nicholasrae (PR #15416)
|
||||
for the generalization pattern.
|
||||
"""
|
||||
param_lower = (param or "").lower()
|
||||
if not param_lower:
|
||||
return False
|
||||
err_lower = str(exc).lower()
|
||||
if param_lower not in err_lower:
|
||||
return False
|
||||
return any(marker in err_lower for marker in (
|
||||
"unsupported parameter",
|
||||
"unsupported_parameter",
|
||||
"not supported",
|
||||
"does not support",
|
||||
"unknown parameter",
|
||||
"unrecognized request argument",
|
||||
"unrecognized parameter",
|
||||
"invalid parameter",
|
||||
))
|
||||
|
||||
|
||||
def _is_unsupported_temperature_error(exc: Exception) -> bool:
    """Back-compat wrapper: detect API errors where the model rejects ``temperature``.

    Delegates to :func:`_is_unsupported_parameter_error`; kept as a separate
    public symbol because existing tests and call sites import it by name.

    Args:
        exc: The exception raised by the provider call.

    Returns:
        Whatever :func:`_is_unsupported_parameter_error` returns for the
        ``"temperature"`` parameter.
    """
    return _is_unsupported_parameter_error(exc, "temperature")
|
||||
|
||||
|
||||
def _evict_cached_clients(provider: str) -> None:
|
||||
"""Drop cached auxiliary clients for a provider so fresh creds are used."""
|
||||
normalized = _normalize_aux_provider(provider)
|
||||
@@ -2036,39 +1993,6 @@ def resolve_provider_client(
|
||||
"directly supported", provider)
|
||||
return None, None
|
||||
|
||||
elif pconfig.auth_type == "aws_sdk":
|
||||
# AWS SDK providers (Bedrock) — use the Anthropic Bedrock client via
|
||||
# boto3's credential chain (IAM roles, SSO, env vars, instance metadata).
|
||||
try:
|
||||
from agent.bedrock_adapter import has_aws_credentials, resolve_bedrock_region
|
||||
from agent.anthropic_adapter import build_anthropic_bedrock_client
|
||||
except ImportError:
|
||||
logger.warning("resolve_provider_client: bedrock requested but "
|
||||
"boto3 or anthropic SDK not installed")
|
||||
return None, None
|
||||
|
||||
if not has_aws_credentials():
|
||||
logger.debug("resolve_provider_client: bedrock requested but "
|
||||
"no AWS credentials found")
|
||||
return None, None
|
||||
|
||||
region = resolve_bedrock_region()
|
||||
default_model = "anthropic.claude-haiku-4-5-20251001-v1:0"
|
||||
final_model = _normalize_resolved_model(model or default_model, provider)
|
||||
try:
|
||||
real_client = build_anthropic_bedrock_client(region)
|
||||
except ImportError as exc:
|
||||
logger.warning("resolve_provider_client: cannot create Bedrock "
|
||||
"client: %s", exc)
|
||||
return None, None
|
||||
client = AnthropicAuxiliaryClient(
|
||||
real_client, final_model, api_key="aws-sdk",
|
||||
base_url=f"https://bedrock-runtime.{region}.amazonaws.com",
|
||||
)
|
||||
logger.debug("resolve_provider_client: bedrock (%s, %s)", final_model, region)
|
||||
return (_to_async_client(client, final_model) if async_mode
|
||||
else (client, final_model))
|
||||
|
||||
elif pconfig.auth_type in ("oauth_device_code", "oauth_external"):
|
||||
# OAuth providers — route through their specific try functions
|
||||
if provider == "nous":
|
||||
@@ -2803,8 +2727,8 @@ def _build_call_kwargs(
|
||||
temperature = fixed_temperature
|
||||
|
||||
# Opus 4.7+ rejects any non-default temperature/top_p/top_k — silently
|
||||
# drop here so auxiliary callers that hardcode temperature (e.g. 0 on
|
||||
# structured-JSON extraction) don't 400 the moment
|
||||
# drop here so auxiliary callers that hardcode temperature (e.g. 0.3 on
|
||||
# flush_memories, 0 on structured-JSON extraction) don't 400 the moment
|
||||
# the aux model is flipped to 4.7.
|
||||
if temperature is not None:
|
||||
from agent.anthropic_adapter import _forbids_sampling_params
|
||||
@@ -2892,7 +2816,7 @@ def call_llm(
|
||||
|
||||
Args:
|
||||
task: Auxiliary task name ("compression", "vision", "web_extract",
|
||||
"session_search", "skills_hub", "mcp", "title_generation").
|
||||
"session_search", "skills_hub", "mcp", "flush_memories").
|
||||
Reads provider:model from config/env. Ignored if provider is set.
|
||||
provider: Explicit provider override.
|
||||
model: Explicit model override.
|
||||
@@ -2995,45 +2919,13 @@ def call_llm(
|
||||
if _is_anthropic_compat_endpoint(resolved_provider, _client_base):
|
||||
kwargs["messages"] = _convert_openai_images_to_anthropic(kwargs["messages"])
|
||||
|
||||
# Handle unsupported temperature, max_tokens vs max_completion_tokens retry,
|
||||
# then payment fallback.
|
||||
# Handle max_tokens vs max_completion_tokens retry, then payment fallback.
|
||||
try:
|
||||
return _validate_llm_response(
|
||||
client.chat.completions.create(**kwargs), task)
|
||||
except Exception as first_err:
|
||||
if "temperature" in kwargs and _is_unsupported_temperature_error(first_err):
|
||||
retry_kwargs = dict(kwargs)
|
||||
retry_kwargs.pop("temperature", None)
|
||||
logger.info(
|
||||
"Auxiliary %s: provider rejected temperature; retrying once without it",
|
||||
task or "call",
|
||||
)
|
||||
try:
|
||||
return _validate_llm_response(
|
||||
client.chat.completions.create(**retry_kwargs), task)
|
||||
except Exception as retry_err:
|
||||
retry_err_str = str(retry_err)
|
||||
# If retry still fails, fall through to the max_tokens /
|
||||
# payment / auth chains below using the temperature-stripped
|
||||
# kwargs. Re-raise only if the retry hit something those
|
||||
# chains won't handle.
|
||||
if not (
|
||||
_is_payment_error(retry_err)
|
||||
or _is_connection_error(retry_err)
|
||||
or _is_auth_error(retry_err)
|
||||
or "max_tokens" in retry_err_str
|
||||
or "unsupported_parameter" in retry_err_str
|
||||
):
|
||||
raise
|
||||
first_err = retry_err
|
||||
kwargs = retry_kwargs
|
||||
|
||||
err_str = str(first_err)
|
||||
if max_tokens is not None and (
|
||||
"max_tokens" in err_str
|
||||
or "unsupported_parameter" in err_str
|
||||
or _is_unsupported_parameter_error(first_err, "max_tokens")
|
||||
):
|
||||
if "max_tokens" in err_str or "unsupported_parameter" in err_str:
|
||||
kwargs.pop("max_tokens", None)
|
||||
kwargs["max_completion_tokens"] = max_tokens
|
||||
try:
|
||||
@@ -3296,35 +3188,8 @@ async def async_call_llm(
|
||||
return _validate_llm_response(
|
||||
await client.chat.completions.create(**kwargs), task)
|
||||
except Exception as first_err:
|
||||
if "temperature" in kwargs and _is_unsupported_temperature_error(first_err):
|
||||
retry_kwargs = dict(kwargs)
|
||||
retry_kwargs.pop("temperature", None)
|
||||
logger.info(
|
||||
"Auxiliary %s (async): provider rejected temperature; retrying once without it",
|
||||
task or "call",
|
||||
)
|
||||
try:
|
||||
return _validate_llm_response(
|
||||
await client.chat.completions.create(**retry_kwargs), task)
|
||||
except Exception as retry_err:
|
||||
retry_err_str = str(retry_err)
|
||||
if not (
|
||||
_is_payment_error(retry_err)
|
||||
or _is_connection_error(retry_err)
|
||||
or _is_auth_error(retry_err)
|
||||
or "max_tokens" in retry_err_str
|
||||
or "unsupported_parameter" in retry_err_str
|
||||
):
|
||||
raise
|
||||
first_err = retry_err
|
||||
kwargs = retry_kwargs
|
||||
|
||||
err_str = str(first_err)
|
||||
if max_tokens is not None and (
|
||||
"max_tokens" in err_str
|
||||
or "unsupported_parameter" in err_str
|
||||
or _is_unsupported_parameter_error(first_err, "max_tokens")
|
||||
):
|
||||
if "max_tokens" in err_str or "unsupported_parameter" in err_str:
|
||||
kwargs.pop("max_tokens", None)
|
||||
kwargs["max_completion_tokens"] = max_tokens
|
||||
try:
|
||||
|
||||
+2
-130
@@ -87,114 +87,6 @@ def reset_client_cache():
|
||||
_bedrock_control_client_cache.clear()
|
||||
|
||||
|
||||
def invalidate_runtime_client(region: str) -> bool:
    """Evict the cached ``bedrock-runtime`` client for a single region.

    Per-region counterpart to :func:`reset_client_cache`. Used by the converse
    call wrappers to discard clients whose underlying HTTP connection has
    gone stale, so the next call allocates a fresh client (with a fresh
    connection pool) instead of reusing a dead socket.

    Args:
        region: Region key under which the runtime client was cached.

    Returns:
        True if a cached entry was evicted, False if the region was not
        cached.
    """
    # One pop against a private sentinel replaces the separate membership
    # test and removal, so presence is decided by the same operation that
    # removes the entry.
    missing = object()
    return _bedrock_runtime_client_cache.pop(region, missing) is not missing
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stale-connection detection
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# boto3 caches its HTTPS connection pool inside the client object. When a
|
||||
# pooled connection is killed out from under us (NAT timeout, VPN flap,
|
||||
# server-side TCP RST, proxy idle cull, etc.), the next use surfaces as
|
||||
# one of a handful of low-level exceptions — most commonly
|
||||
# ``botocore.exceptions.ConnectionClosedError`` or
|
||||
# ``urllib3.exceptions.ProtocolError``. urllib3 also trips an internal
|
||||
# ``assert`` in a couple of paths (connection pool state checks, chunked
|
||||
# response readers) which bubbles up as a bare ``AssertionError`` with an
|
||||
# empty ``str(exc)``.
|
||||
#
|
||||
# In all of these cases the client is the problem, not the request: retrying
|
||||
# with the same cached client reproduces the failure until the process
|
||||
# restarts. The fix is to evict the region's cached client so the next
|
||||
# attempt builds a new one.
|
||||
|
||||
_STALE_LIB_MODULE_PREFIXES = (
|
||||
"urllib3.",
|
||||
"botocore.",
|
||||
"boto3.",
|
||||
)
|
||||
|
||||
|
||||
def _traceback_frames_modules(exc: BaseException):
|
||||
"""Yield ``__name__``-style module strings for each frame in exc's traceback."""
|
||||
tb = getattr(exc, "__traceback__", None)
|
||||
while tb is not None:
|
||||
frame = tb.tb_frame
|
||||
module = frame.f_globals.get("__name__", "")
|
||||
yield module or ""
|
||||
tb = tb.tb_next
|
||||
|
||||
|
||||
def is_stale_connection_error(exc: BaseException) -> bool:
    """Return True if ``exc`` indicates a dead/stale Bedrock HTTP connection.

    Matches:
      * ``botocore.exceptions.ConnectionError`` and subclasses
        (``ConnectionClosedError``, ``EndpointConnectionError``,
        ``ReadTimeoutError``, ``ConnectTimeoutError``).
      * ``urllib3.exceptions.ProtocolError`` / ``NewConnectionError`` /
        ``ConnectionError`` (best-effort import — urllib3 is a transitive
        dependency of botocore so it is always available in practice).
      * Bare ``AssertionError`` raised from a frame inside urllib3, botocore,
        or boto3. These are internal-invariant failures (typically triggered
        by corrupted connection-pool state after a dropped socket) and are
        recoverable by swapping the client.

    Non-library ``AssertionError``s (from application code or tests) are
    intentionally not matched — only library-internal asserts signal stale
    connection state.

    Args:
        exc: The exception to classify.

    Returns:
        True when ``exc`` matches one of the categories above, else False.
    """
    # botocore: the canonical signal — HTTPClientError is the umbrella for
    # ConnectionClosedError, ReadTimeoutError, EndpointConnectionError,
    # ConnectTimeoutError, and ProxyConnectionError. ConnectionError covers
    # the same family via a different branch of the hierarchy.
    try:
        from botocore.exceptions import (
            ConnectionError as BotoConnectionError,
            HTTPClientError,
        )
        botocore_errors: tuple = (BotoConnectionError, HTTPClientError)
    except ImportError:  # pragma: no cover — botocore always present with boto3
        botocore_errors = ()
    if botocore_errors and isinstance(exc, botocore_errors):
        return True

    # urllib3: low-level transport failures
    try:
        from urllib3.exceptions import (
            ProtocolError,
            NewConnectionError,
            ConnectionError as Urllib3ConnectionError,
        )
        urllib3_errors = (ProtocolError, NewConnectionError, Urllib3ConnectionError)
    except ImportError:  # pragma: no cover
        urllib3_errors = ()
    if urllib3_errors and isinstance(exc, urllib3_errors):
        return True

    # Library-internal AssertionError (urllib3 / botocore / boto3): match as
    # soon as any traceback frame lives in one of those packages.
    # ``str.startswith`` takes the prefix tuple directly.
    if isinstance(exc, AssertionError):
        for module in _traceback_frames_modules(exc):
            if module.startswith(_STALE_LIB_MODULE_PREFIXES):
                return True

    return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# AWS credential detection
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -895,17 +787,7 @@ def call_converse(
|
||||
guardrail_config=guardrail_config,
|
||||
)
|
||||
|
||||
try:
|
||||
response = client.converse(**kwargs)
|
||||
except Exception as exc:
|
||||
if is_stale_connection_error(exc):
|
||||
logger.warning(
|
||||
"bedrock: stale-connection error on converse(region=%s, model=%s): "
|
||||
"%s — evicting cached client so the next call reconnects.",
|
||||
region, model, type(exc).__name__,
|
||||
)
|
||||
invalidate_runtime_client(region)
|
||||
raise
|
||||
response = client.converse(**kwargs)
|
||||
return normalize_converse_response(response)
|
||||
|
||||
|
||||
@@ -937,17 +819,7 @@ def call_converse_stream(
|
||||
guardrail_config=guardrail_config,
|
||||
)
|
||||
|
||||
try:
|
||||
response = client.converse_stream(**kwargs)
|
||||
except Exception as exc:
|
||||
if is_stale_connection_error(exc):
|
||||
logger.warning(
|
||||
"bedrock: stale-connection error on converse_stream(region=%s, "
|
||||
"model=%s): %s — evicting cached client so the next call reconnects.",
|
||||
region, model, type(exc).__name__,
|
||||
)
|
||||
invalidate_runtime_client(region)
|
||||
raise
|
||||
response = client.converse_stream(**kwargs)
|
||||
return normalize_converse_stream_events(response)
|
||||
|
||||
|
||||
|
||||
@@ -23,52 +23,26 @@ from agent.prompt_builder import DEFAULT_AGENT_IDENTITY
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Matches Codex/Harmony tool-call serialization that occasionally leaks into
|
||||
# assistant-message content when the model fails to emit a structured
|
||||
# ``function_call`` item. Accepts the common forms:
|
||||
#
|
||||
# to=functions.exec_command
|
||||
# assistant to=functions.exec_command
|
||||
# <|channel|>commentary to=functions.exec_command
|
||||
#
|
||||
# ``to=functions.<name>`` is the stable marker — the optional ``assistant`` or
|
||||
# Harmony channel prefix varies by degeneration mode. Case-insensitive to
|
||||
# cover lowercase/uppercase ``assistant`` variants.
|
||||
_TOOL_CALL_LEAK_PATTERN = re.compile(
|
||||
r"(?:^|[\s>|])to=functions\.[A-Za-z_][\w.]*",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Multimodal content helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _chat_content_to_responses_parts(content: Any, *, role: str = "user") -> List[Dict[str, Any]]:
|
||||
def _chat_content_to_responses_parts(content: Any) -> List[Dict[str, Any]]:
|
||||
"""Convert chat-style multimodal content to Responses API input parts.
|
||||
|
||||
Input: ``[{"type":"text"|"image_url", ...}]`` (native OpenAI Chat format)
|
||||
Output: ``[{"type":"input_text"|"output_text"|"input_image", ...}]`` (Responses format)
|
||||
|
||||
The ``role`` parameter controls the text content type:
|
||||
- ``"user"`` (default) → ``"input_text"``
|
||||
- ``"assistant"`` → ``"output_text"``
|
||||
|
||||
The Responses API rejects ``input_text`` inside assistant messages and
|
||||
``output_text`` inside user messages, so callers MUST pass the correct
|
||||
role for the message being converted.
|
||||
Output: ``[{"type":"input_text"|"input_image", ...}]`` (Responses format)
|
||||
|
||||
Returns an empty list when ``content`` is not a list or contains no
|
||||
recognized parts — callers fall back to the string path.
|
||||
"""
|
||||
text_type = "output_text" if role == "assistant" else "input_text"
|
||||
if not isinstance(content, list):
|
||||
return []
|
||||
converted: List[Dict[str, Any]] = []
|
||||
for part in content:
|
||||
if isinstance(part, str):
|
||||
if part:
|
||||
converted.append({"type": text_type, "text": part})
|
||||
converted.append({"type": "input_text", "text": part})
|
||||
continue
|
||||
if not isinstance(part, dict):
|
||||
continue
|
||||
@@ -76,7 +50,7 @@ def _chat_content_to_responses_parts(content: Any, *, role: str = "user") -> Lis
|
||||
if ptype in {"text", "input_text", "output_text"}:
|
||||
text = part.get("text")
|
||||
if isinstance(text, str) and text:
|
||||
converted.append({"type": text_type, "text": text})
|
||||
converted.append({"type": "input_text", "text": text})
|
||||
continue
|
||||
if ptype in {"image_url", "input_image"}:
|
||||
image_ref = part.get("image_url")
|
||||
@@ -242,10 +216,9 @@ def _chat_messages_to_responses_input(messages: List[Dict[str, Any]]) -> List[Di
|
||||
if role in {"user", "assistant"}:
|
||||
content = msg.get("content", "")
|
||||
if isinstance(content, list):
|
||||
content_parts = _chat_content_to_responses_parts(content, role=role)
|
||||
text_type = "output_text" if role == "assistant" else "input_text"
|
||||
content_parts = _chat_content_to_responses_parts(content)
|
||||
content_text = "".join(
|
||||
p.get("text", "") for p in content_parts if p.get("type") == text_type
|
||||
p.get("text", "") for p in content_parts if p.get("type") == "input_text"
|
||||
)
|
||||
else:
|
||||
content_parts = []
|
||||
@@ -439,16 +412,13 @@ def _preflight_codex_input_items(raw_items: Any) -> List[Dict[str, Any]]:
|
||||
content = ""
|
||||
if isinstance(content, list):
|
||||
# Multimodal content from ``_chat_messages_to_responses_input``
|
||||
# is already in Responses format (``input_text`` / ``output_text``
|
||||
# / ``input_image``). Validate each part and pass through.
|
||||
# Use the correct text type for the role — ``output_text`` for
|
||||
# assistant messages, ``input_text`` for user messages.
|
||||
text_type = "output_text" if role == "assistant" else "input_text"
|
||||
# is already in Responses format (``input_text`` / ``input_image``).
|
||||
# Validate each part and pass through.
|
||||
validated: List[Dict[str, Any]] = []
|
||||
for part_idx, part in enumerate(content):
|
||||
if isinstance(part, str):
|
||||
if part:
|
||||
validated.append({"type": text_type, "text": part})
|
||||
validated.append({"type": "input_text", "text": part})
|
||||
continue
|
||||
if not isinstance(part, dict):
|
||||
raise ValueError(
|
||||
@@ -459,7 +429,7 @@ def _preflight_codex_input_items(raw_items: Any) -> List[Dict[str, Any]]:
|
||||
text = part.get("text", "")
|
||||
if not isinstance(text, str):
|
||||
text = str(text or "")
|
||||
validated.append({"type": text_type, "text": text})
|
||||
validated.append({"type": "input_text", "text": text})
|
||||
elif ptype in {"input_image", "image_url"}:
|
||||
image_ref = part.get("image_url", "")
|
||||
detail = part.get("detail")
|
||||
@@ -817,37 +787,6 @@ def _normalize_codex_response(response: Any) -> tuple[Any, str]:
|
||||
if isinstance(out_text, str):
|
||||
final_text = out_text.strip()
|
||||
|
||||
# ── Tool-call leak recovery ──────────────────────────────────
|
||||
# gpt-5.x on the Codex Responses API sometimes degenerates and emits
|
||||
# what should be a structured `function_call` item as plain assistant
|
||||
# text using the Harmony/Codex serialization (``to=functions.foo
|
||||
# {json}`` or ``assistant to=functions.foo {json}``). The model
|
||||
# intended to call a tool, but the intent never made it into
|
||||
# ``response.output`` as a ``function_call`` item, so ``tool_calls``
|
||||
# is empty here. If we pass this through, the parent sees a
|
||||
# confident-looking summary with no audit trail (empty ``tool_trace``)
|
||||
# and no tools actually ran — the Taiwan-embassy-email incident.
|
||||
#
|
||||
# Detection: leaked tokens always contain ``to=functions.<name>`` and
|
||||
# the assistant message has no real tool calls. Treat it as incomplete
|
||||
# so the existing Codex-incomplete continuation path (3 retries,
|
||||
# handled in run_agent.py) gets a chance to re-elicit a proper
|
||||
# ``function_call`` item. The existing loop already handles message
|
||||
# append, dedup, and retry budget.
|
||||
leaked_tool_call_text = False
|
||||
if final_text and not tool_calls and _TOOL_CALL_LEAK_PATTERN.search(final_text):
|
||||
leaked_tool_call_text = True
|
||||
logger.warning(
|
||||
"Codex response contains leaked tool-call text in assistant content "
|
||||
"(no structured function_call items). Treating as incomplete so the "
|
||||
"continuation path can re-elicit a proper tool call. Leaked snippet: %r",
|
||||
final_text[:300],
|
||||
)
|
||||
# Clear the text so downstream code doesn't surface the garbage as
|
||||
# a summary. The encrypted reasoning items (if any) are preserved
|
||||
# so the model keeps its chain-of-thought on the retry.
|
||||
final_text = ""
|
||||
|
||||
assistant_message = SimpleNamespace(
|
||||
content=final_text,
|
||||
tool_calls=tool_calls,
|
||||
@@ -859,8 +798,6 @@ def _normalize_codex_response(response: Any) -> tuple[Any, str]:
|
||||
|
||||
if tool_calls:
|
||||
finish_reason = "tool_calls"
|
||||
elif leaked_tool_call_text:
|
||||
finish_reason = "incomplete"
|
||||
elif has_incomplete_items or (saw_commentary_phase and not saw_final_answer_phase):
|
||||
finish_reason = "incomplete"
|
||||
elif reasoning_items_raw and not final_text:
|
||||
|
||||
@@ -294,7 +294,6 @@ class ContextCompressor(ContextEngine):
|
||||
self._context_probed = False
|
||||
self._context_probe_persistable = False
|
||||
self._previous_summary = None
|
||||
self._last_summary_error = None
|
||||
self._last_compression_savings_pct = 100.0
|
||||
self._ineffective_compression_count = 0
|
||||
|
||||
@@ -318,13 +317,6 @@ class ContextCompressor(ContextEngine):
|
||||
int(context_length * self.threshold_percent),
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
)
|
||||
# Recalculate token budgets for the new context length so the
|
||||
# compressor stays calibrated after a model switch (e.g. 200K → 32K).
|
||||
target_tokens = int(self.threshold_tokens * self.summary_target_ratio)
|
||||
self.tail_token_budget = target_tokens
|
||||
self.max_summary_tokens = min(
|
||||
int(context_length * 0.05), _SUMMARY_TOKENS_CEILING,
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -397,7 +389,6 @@ class ContextCompressor(ContextEngine):
|
||||
self._last_compression_savings_pct: float = 100.0
|
||||
self._ineffective_compression_count: int = 0
|
||||
self._summary_failure_cooldown_until: float = 0.0
|
||||
self._last_summary_error: Optional[str] = None
|
||||
|
||||
def update_from_response(self, usage: Dict[str, Any]):
|
||||
"""Update tracked token usage from API response."""
|
||||
@@ -821,12 +812,10 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
||||
self._previous_summary = summary
|
||||
self._summary_failure_cooldown_until = 0.0
|
||||
self._summary_model_fallen_back = False
|
||||
self._last_summary_error = None
|
||||
return self._with_summary_prefix(summary)
|
||||
except RuntimeError:
|
||||
# No provider configured — long cooldown, unlikely to self-resolve
|
||||
self._summary_failure_cooldown_until = time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS
|
||||
self._last_summary_error = "no auxiliary LLM provider configured"
|
||||
logging.warning("Context compression: no provider available for "
|
||||
"summary. Middle turns will be dropped without summary "
|
||||
"for %d seconds.",
|
||||
@@ -864,10 +853,6 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
||||
# Transient errors (timeout, rate limit, network) — shorter cooldown
|
||||
_transient_cooldown = 60
|
||||
self._summary_failure_cooldown_until = time.monotonic() + _transient_cooldown
|
||||
err_text = str(e).strip() or e.__class__.__name__
|
||||
if len(err_text) > 220:
|
||||
err_text = err_text[:217].rstrip() + "..."
|
||||
self._last_summary_error = err_text
|
||||
logging.warning(
|
||||
"Failed to generate context summary: %s. "
|
||||
"Further summary attempts paused for %d seconds.",
|
||||
|
||||
+2
-43
@@ -31,7 +31,6 @@ from __future__ import annotations
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import inspect
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from agent.memory_provider import MemoryProvider
|
||||
@@ -313,39 +312,7 @@ class MemoryManager:
|
||||
)
|
||||
return "\n\n".join(parts)
|
||||
|
||||
@staticmethod
|
||||
def _provider_memory_write_metadata_mode(provider: MemoryProvider) -> str:
|
||||
"""Return how to pass metadata to a provider's memory-write hook."""
|
||||
try:
|
||||
signature = inspect.signature(provider.on_memory_write)
|
||||
except (TypeError, ValueError):
|
||||
return "keyword"
|
||||
|
||||
params = list(signature.parameters.values())
|
||||
if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params):
|
||||
return "keyword"
|
||||
if "metadata" in signature.parameters:
|
||||
return "keyword"
|
||||
|
||||
accepted = [
|
||||
p for p in params
|
||||
if p.kind in (
|
||||
inspect.Parameter.POSITIONAL_ONLY,
|
||||
inspect.Parameter.POSITIONAL_OR_KEYWORD,
|
||||
inspect.Parameter.KEYWORD_ONLY,
|
||||
)
|
||||
]
|
||||
if len(accepted) >= 4:
|
||||
return "positional"
|
||||
return "legacy"
|
||||
|
||||
def on_memory_write(
|
||||
self,
|
||||
action: str,
|
||||
target: str,
|
||||
content: str,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
) -> None:
|
||||
def on_memory_write(self, action: str, target: str, content: str) -> None:
|
||||
"""Notify external providers when the built-in memory tool writes.
|
||||
|
||||
Skips the builtin provider itself (it's the source of the write).
|
||||
@@ -354,15 +321,7 @@ class MemoryManager:
|
||||
if provider.name == "builtin":
|
||||
continue
|
||||
try:
|
||||
metadata_mode = self._provider_memory_write_metadata_mode(provider)
|
||||
if metadata_mode == "keyword":
|
||||
provider.on_memory_write(
|
||||
action, target, content, metadata=dict(metadata or {})
|
||||
)
|
||||
elif metadata_mode == "positional":
|
||||
provider.on_memory_write(action, target, content, dict(metadata or {}))
|
||||
else:
|
||||
provider.on_memory_write(action, target, content)
|
||||
provider.on_memory_write(action, target, content)
|
||||
except Exception as e:
|
||||
logger.debug(
|
||||
"Memory provider '%s' on_memory_write failed: %s",
|
||||
|
||||
@@ -26,7 +26,7 @@ Optional hooks (override to opt in):
|
||||
on_turn_start(turn, message, **kwargs) — per-turn tick with runtime context
|
||||
on_session_end(messages) — end-of-session extraction
|
||||
on_pre_compress(messages) -> str — extract before context compression
|
||||
on_memory_write(action, target, content, metadata=None) — mirror built-in memory writes
|
||||
on_memory_write(action, target, content) — mirror built-in memory writes
|
||||
on_delegation(task, result, **kwargs) — parent-side observation of subagent work
|
||||
"""
|
||||
|
||||
@@ -34,7 +34,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict, List, Optional
|
||||
from typing import Any, Dict, List
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -220,21 +220,12 @@ class MemoryProvider(ABC):
|
||||
should all have ``env_var`` set and this method stays no-op).
|
||||
"""
|
||||
|
||||
def on_memory_write(
|
||||
self,
|
||||
action: str,
|
||||
target: str,
|
||||
content: str,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
) -> None:
|
||||
def on_memory_write(self, action: str, target: str, content: str) -> None:
|
||||
"""Called when the built-in memory tool writes an entry.
|
||||
|
||||
action: 'add', 'replace', or 'remove'
|
||||
target: 'memory' or 'user'
|
||||
content: the entry content
|
||||
metadata: structured provenance for the write, when available. Common
|
||||
keys include ``write_origin``, ``execution_context``, ``session_id``,
|
||||
``parent_session_id``, ``platform``, and ``tool_name``.
|
||||
|
||||
Use to mirror built-in memory writes to your backend.
|
||||
"""
|
||||
|
||||
+13
-22
@@ -1199,7 +1199,6 @@ def get_model_context_length(
|
||||
Resolution order:
|
||||
0. Explicit config override (model.context_length or custom_providers per-model)
|
||||
1. Persistent cache (previously discovered via probing)
|
||||
1b. AWS Bedrock static table (must precede custom-endpoint probe)
|
||||
2. Active endpoint metadata (/models for explicit custom endpoints)
|
||||
3. Local server query (for local endpoints)
|
||||
4. Anthropic /v1/models API (API-key users only, not OAuth)
|
||||
@@ -1238,26 +1237,6 @@ def get_model_context_length(
|
||||
else:
|
||||
return cached
|
||||
|
||||
# 1b. AWS Bedrock — use static context length table.
|
||||
# Bedrock's ListFoundationModels API doesn't expose context window sizes,
|
||||
# so we maintain a curated table in bedrock_adapter.py that reflects
|
||||
# AWS-imposed limits (e.g. 200K for Claude models vs 1M on the native
|
||||
# Anthropic API). This must run BEFORE the custom-endpoint probe at
|
||||
# step 2 — bedrock-runtime.<region>.amazonaws.com is not in
|
||||
# _URL_TO_PROVIDER, so it would otherwise be treated as a custom endpoint,
|
||||
# fail the /models probe (Bedrock doesn't expose that shape), and fall
|
||||
# back to the 128K default before reaching the original step 4b branch.
|
||||
if provider == "bedrock" or (
|
||||
base_url
|
||||
and base_url_hostname(base_url).startswith("bedrock-runtime.")
|
||||
and base_url_host_matches(base_url, "amazonaws.com")
|
||||
):
|
||||
try:
|
||||
from agent.bedrock_adapter import get_bedrock_context_length
|
||||
return get_bedrock_context_length(model)
|
||||
except ImportError:
|
||||
pass # boto3 not installed — fall through to generic resolution
|
||||
|
||||
# 2. Active endpoint metadata for truly custom/unknown endpoints.
|
||||
# Known providers (Copilot, OpenAI, Anthropic, etc.) skip this — their
|
||||
# /models endpoint may report a provider-imposed limit (e.g. Copilot
|
||||
@@ -1303,7 +1282,19 @@ def get_model_context_length(
|
||||
if ctx:
|
||||
return ctx
|
||||
|
||||
# 4b. (Bedrock handled earlier at step 1b — before custom-endpoint probe.)
|
||||
# 4b. AWS Bedrock — use static context length table.
|
||||
# Bedrock's ListFoundationModels doesn't expose context window sizes,
|
||||
# so we maintain a curated table in bedrock_adapter.py.
|
||||
if provider == "bedrock" or (
|
||||
base_url
|
||||
and base_url_hostname(base_url).startswith("bedrock-runtime.")
|
||||
and base_url_host_matches(base_url, "amazonaws.com")
|
||||
):
|
||||
try:
|
||||
from agent.bedrock_adapter import get_bedrock_context_length
|
||||
return get_bedrock_context_length(model)
|
||||
except ImportError:
|
||||
pass # boto3 not installed — fall through to generic resolution
|
||||
|
||||
# 5. Provider-aware lookups (before generic OpenRouter cache)
|
||||
# These are provider-specific and take priority over the generic OR cache,
|
||||
|
||||
+107
-8
@@ -7,15 +7,11 @@ can invoke skills via /skill-name commands.
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from hermes_constants import display_hermes_home
|
||||
from agent.skill_preprocessing import (
|
||||
expand_inline_shell as _expand_inline_shell,
|
||||
load_skills_config as _load_skills_config,
|
||||
substitute_template_vars as _substitute_template_vars,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -24,6 +20,111 @@ _skill_commands: Dict[str, Dict[str, Any]] = {}
|
||||
_SKILL_INVALID_CHARS = re.compile(r"[^a-z0-9-]")
|
||||
_SKILL_MULTI_HYPHEN = re.compile(r"-{2,}")
|
||||
|
||||
# Matches ${HERMES_SKILL_DIR} / ${HERMES_SESSION_ID} tokens in SKILL.md.
|
||||
# Tokens that don't resolve (e.g. ${HERMES_SESSION_ID} with no session) are
|
||||
# left as-is so the user can debug them.
|
||||
_SKILL_TEMPLATE_RE = re.compile(r"\$\{(HERMES_SKILL_DIR|HERMES_SESSION_ID)\}")
|
||||
|
||||
# Matches inline shell snippets like: !`date +%Y-%m-%d`
|
||||
# Non-greedy, single-line only — no newlines inside the backticks.
|
||||
_INLINE_SHELL_RE = re.compile(r"!`([^`\n]+)`")
|
||||
|
||||
# Cap inline-shell output so a runaway command can't blow out the context.
|
||||
_INLINE_SHELL_MAX_OUTPUT = 4000
|
||||
|
||||
|
||||
def _load_skills_config() -> dict:
    """Return the ``skills`` mapping from config.yaml, or ``{}`` on failure.

    ``hermes_cli.config`` is imported at call time. Any exception raised while
    importing or loading is logged at debug level and swallowed; a ``skills``
    value that is not a dict is treated as absent.
    """
    try:
        from hermes_cli.config import load_config

        section = (load_config() or {}).get("skills")
    except Exception:
        logger.debug("Could not read skills config", exc_info=True)
        return {}
    return section if isinstance(section, dict) else {}
|
||||
|
||||
|
||||
def _substitute_template_vars(
    content: str,
    skill_dir: Path | None,
    session_id: str | None,
) -> str:
    """Expand ``${HERMES_SKILL_DIR}`` / ``${HERMES_SESSION_ID}`` in ``content``.

    A token is replaced only when a non-empty value is available for it;
    otherwise the token text is left untouched so it stays visible to the
    skill author. Falsy ``content`` is returned as-is.
    """
    if not content:
        return content

    # Only tokens present in this mapping get substituted.
    resolved = {}
    dir_text = str(skill_dir) if skill_dir else ""
    if dir_text:
        resolved["HERMES_SKILL_DIR"] = dir_text
    if session_id:
        resolved["HERMES_SESSION_ID"] = str(session_id)

    return _SKILL_TEMPLATE_RE.sub(
        lambda found: resolved.get(found.group(1), found.group(0)),
        content,
    )
|
||||
|
||||
|
||||
def _run_inline_shell(command: str, cwd: Path | None, timeout: int) -> str:
    """Execute a single inline-shell snippet and return its stdout (trimmed).

    Runs ``bash -c command`` with ``cwd`` as the working directory (when
    given), capturing output as text. The timeout is clamped to at least one
    second. Trailing newlines are stripped; when stdout is empty, stderr is
    returned instead. Output longer than ``_INLINE_SHELL_MAX_OUTPUT``
    characters is cut and suffixed with a truncation marker.

    Failures return a short ``[inline-shell ...]`` marker instead of raising,
    so one bad snippet can't wreck the whole skill message.
    """
    try:
        completed = subprocess.run(
            ["bash", "-c", command],
            cwd=str(cwd) if cwd else None,
            capture_output=True,
            text=True,
            timeout=max(1, int(timeout)),
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        # Report the timeout that was actually enforced (after clamping).
        return f"[inline-shell timeout after {exc.timeout}s: {command}]"
    except FileNotFoundError:
        # subprocess raises FileNotFoundError both for a missing executable
        # and for a missing working directory; tell the two apart.
        if cwd and not Path(cwd).is_dir():
            return f"[inline-shell error: working directory not found: {cwd}]"
        return "[inline-shell error: bash not found]"
    except Exception as exc:
        return f"[inline-shell error: {exc}]"

    output = (completed.stdout or "").rstrip("\n")
    if not output and completed.stderr:
        output = completed.stderr.rstrip("\n")
    if len(output) > _INLINE_SHELL_MAX_OUTPUT:
        output = output[:_INLINE_SHELL_MAX_OUTPUT] + "…[truncated]"
    return output
|
||||
|
||||
|
||||
def _expand_inline_shell(
    content: str,
    skill_dir: Path | None,
    timeout: int,
) -> str:
    """Substitute each !`cmd` snippet in ``content`` with that command's output.

    Every snippet is executed through ``_run_inline_shell`` with ``skill_dir``
    as its working directory. A snippet that is blank after stripping expands
    to the empty string without running anything.
    """
    if "!`" in content:

        def _run_snippet(found: re.Match) -> str:
            snippet = found.group(1).strip()
            return _run_inline_shell(snippet, skill_dir, timeout) if snippet else ""

        content = _INLINE_SHELL_RE.sub(_run_snippet, content)
    return content
|
||||
|
||||
|
||||
def _load_skill_payload(skill_identifier: str, task_id: str | None = None) -> tuple[dict[str, Any], Path | None, str] | None:
|
||||
"""Load a skill by name/path and return (loaded_payload, skill_dir, display_name)."""
|
||||
raw_identifier = (skill_identifier or "").strip()
|
||||
@@ -42,9 +143,7 @@ def _load_skill_payload(skill_identifier: str, task_id: str | None = None) -> tu
|
||||
else:
|
||||
normalized = raw_identifier.lstrip("/")
|
||||
|
||||
loaded_skill = json.loads(
|
||||
skill_view(normalized, task_id=task_id, preprocess=False)
|
||||
)
|
||||
loaded_skill = json.loads(skill_view(normalized, task_id=task_id))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
@@ -1,131 +0,0 @@
|
||||
"""Shared SKILL.md preprocessing helpers."""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Matches ${HERMES_SKILL_DIR} / ${HERMES_SESSION_ID} tokens in SKILL.md.
|
||||
# Tokens that don't resolve (e.g. ${HERMES_SESSION_ID} with no session) are
|
||||
# left as-is so the user can debug them.
|
||||
_SKILL_TEMPLATE_RE = re.compile(r"\$\{(HERMES_SKILL_DIR|HERMES_SESSION_ID)\}")
|
||||
|
||||
# Matches inline shell snippets like: !`date +%Y-%m-%d`
|
||||
# Non-greedy, single-line only -- no newlines inside the backticks.
|
||||
_INLINE_SHELL_RE = re.compile(r"!`([^`\n]+)`")
|
||||
|
||||
# Cap inline-shell output so a runaway command can't blow out the context.
|
||||
_INLINE_SHELL_MAX_OUTPUT = 4000
|
||||
|
||||
|
||||
def load_skills_config() -> dict:
    """Return the ``skills`` mapping from config.yaml, or ``{}`` on failure.

    ``hermes_cli.config`` is imported at call time. Any exception raised while
    importing or loading is logged at debug level and swallowed; a ``skills``
    value that is not a dict is treated as absent.
    """
    try:
        from hermes_cli.config import load_config

        section = (load_config() or {}).get("skills")
    except Exception:
        logger.debug("Could not read skills config", exc_info=True)
        return {}
    return section if isinstance(section, dict) else {}
|
||||
|
||||
|
||||
def substitute_template_vars(
    content: str,
    skill_dir: Path | None,
    session_id: str | None,
) -> str:
    """Expand ``${HERMES_SKILL_DIR}`` / ``${HERMES_SESSION_ID}`` in ``content``.

    A token is replaced only when a non-empty value is available for it;
    otherwise the token text is left untouched so it stays visible to the
    skill author. Falsy ``content`` is returned as-is.
    """
    if not content:
        return content

    # Only tokens present in this mapping get substituted.
    resolved = {}
    dir_text = str(skill_dir) if skill_dir else ""
    if dir_text:
        resolved["HERMES_SKILL_DIR"] = dir_text
    if session_id:
        resolved["HERMES_SESSION_ID"] = str(session_id)

    return _SKILL_TEMPLATE_RE.sub(
        lambda found: resolved.get(found.group(1), found.group(0)),
        content,
    )
|
||||
|
||||
|
||||
def run_inline_shell(command: str, cwd: Path | None, timeout: int) -> str:
|
||||
"""Execute a single inline-shell snippet and return its stdout (trimmed).
|
||||
|
||||
Failures return a short ``[inline-shell error: ...]`` marker instead of
|
||||
raising, so one bad snippet can't wreck the whole skill message.
|
||||
"""
|
||||
try:
|
||||
completed = subprocess.run(
|
||||
["bash", "-c", command],
|
||||
cwd=str(cwd) if cwd else None,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=max(1, int(timeout)),
|
||||
check=False,
|
||||
)
|
||||
except subprocess.TimeoutExpired:
|
||||
return f"[inline-shell timeout after {timeout}s: {command}]"
|
||||
except FileNotFoundError:
|
||||
return "[inline-shell error: bash not found]"
|
||||
except Exception as exc:
|
||||
return f"[inline-shell error: {exc}]"
|
||||
|
||||
output = (completed.stdout or "").rstrip("\n")
|
||||
if not output and completed.stderr:
|
||||
output = completed.stderr.rstrip("\n")
|
||||
if len(output) > _INLINE_SHELL_MAX_OUTPUT:
|
||||
output = output[:_INLINE_SHELL_MAX_OUTPUT] + "...[truncated]"
|
||||
return output
|
||||
|
||||
|
||||
def expand_inline_shell(
    content: str,
    skill_dir: Path | None,
    timeout: int,
) -> str:
    """Substitute each !`cmd` snippet in ``content`` with that command's output.

    Every snippet is executed through ``run_inline_shell`` with ``skill_dir``
    as its working directory. A snippet that is blank after stripping expands
    to the empty string without running anything.
    """
    if "!`" in content:

        def _run_snippet(found: re.Match) -> str:
            snippet = found.group(1).strip()
            return run_inline_shell(snippet, skill_dir, timeout) if snippet else ""

        content = _INLINE_SHELL_RE.sub(_run_snippet, content)
    return content
|
||||
|
||||
|
||||
def preprocess_skill_content(
|
||||
content: str,
|
||||
skill_dir: Path | None,
|
||||
session_id: str | None = None,
|
||||
skills_cfg: dict | None = None,
|
||||
) -> str:
|
||||
"""Apply configured SKILL.md template and inline-shell preprocessing."""
|
||||
if not content:
|
||||
return content
|
||||
|
||||
cfg = skills_cfg if isinstance(skills_cfg, dict) else load_skills_config()
|
||||
if cfg.get("template_vars", True):
|
||||
content = substitute_template_vars(content, skill_dir, session_id)
|
||||
if cfg.get("inline_shell", False):
|
||||
timeout = int(cfg.get("inline_shell_timeout", 10) or 10)
|
||||
content = expand_inline_shell(content, skill_dir, timeout)
|
||||
return content
|
||||
@@ -1,58 +0,0 @@
|
||||
# Hermes Apps
|
||||
|
||||
Platform apps live here. The first app is a cross-platform GUI shell around the
|
||||
existing Hermes dashboard; it should not fork chat, config, logs, or session UI.
|
||||
|
||||
## Shape
|
||||
|
||||
```text
|
||||
apps/
|
||||
gui/ # cross-platform app shell: dev Chrome shell now, Tauri native next
|
||||
shared/ # runtime bundle notes/scripts used by Windows + macOS packaging
|
||||
```
|
||||
|
||||
## Desktop Dev
|
||||
|
||||
The backend-only GUI mode is:
|
||||
|
||||
```bash
|
||||
hermes dashboard --gui
|
||||
```
|
||||
|
||||
The fast GUI shell is:
|
||||
|
||||
```powershell
|
||||
cd \\wsl$\Ubuntu\home\bb\hermes-agent\apps\gui
|
||||
npm run dev
|
||||
```
|
||||
|
||||
The native Tauri shell is:
|
||||
|
||||
```powershell
|
||||
cd \\wsl$\Ubuntu\home\bb\hermes-agent\apps\gui
|
||||
npm run dev:tauri
|
||||
```
|
||||
|
||||
`--gui` implies the embedded TUI; do not pass `--tui` separately for GUI mode.
|
||||
|
||||
## MVP Boundary
|
||||
|
||||
Included:
|
||||
|
||||
- bundled Python runtime
|
||||
- bundled Node/TUI runtime
|
||||
- CLI install to PATH
|
||||
- profile picker and first-run setup
|
||||
- dashboard health/reconnect state
|
||||
- tray controls
|
||||
- desktop notifications
|
||||
- Windows installer
|
||||
|
||||
Deferred:
|
||||
|
||||
- code signing
|
||||
- native self-updater
|
||||
- store distribution
|
||||
|
||||
For MVP updates, the desktop UI should run the existing `hermes update` flow and
|
||||
surface progress/finish notifications.
|
||||
@@ -1,102 +0,0 @@
|
||||
# Hermes GUI
|
||||
|
||||
Cross-platform GUI shell for the Hermes dashboard.
|
||||
|
||||
## Fast Dev Shell
|
||||
|
||||
This gets a GUI window on Windows/WSL today by launching Chrome in app mode:
|
||||
|
||||
```bash
|
||||
cd apps/gui
|
||||
npm run dev
|
||||
```
|
||||
|
||||
It starts `hermes dashboard --gui --no-open --port 9120`, waits for
|
||||
`/api/health`, then opens a standalone app window at `http://127.0.0.1:9120`.
|
||||
|
||||
## Native Shell
|
||||
|
||||
The native Tauri shell is still scaffolded:
|
||||
|
||||
```bash
|
||||
cd apps/gui
|
||||
npm run dev:tauri
|
||||
```
|
||||
|
||||
From Windows PowerShell on a `\\wsl$` path, use PowerShell `npm`, not
|
||||
`npm.cmd`:
|
||||
|
||||
```powershell
|
||||
Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass -Force
|
||||
cd \\wsl$\Ubuntu\home\bb\hermes-agent\apps\gui
|
||||
npm run dev:tauri
|
||||
```
|
||||
|
||||
`npm.cmd` goes through `cmd.exe`, and `cmd.exe` cannot use UNC paths as the
|
||||
current directory.
|
||||
|
||||
If `npm run` still falls through `cmd.exe`, bypass npm entirely:
|
||||
|
||||
```powershell
|
||||
\\wsl$\Ubuntu\home\bb\hermes-agent\apps\gui\dev-tauri.ps1
|
||||
```
|
||||
|
||||
The launcher builds into `%LOCALAPPDATA%\Hermes\cargo-target\gui` instead of
|
||||
`\\wsl$` because Windows Cargo incremental locks do not work reliably on UNC
|
||||
WSL filesystems.
|
||||
|
||||
In dev, either start Hermes yourself:
|
||||
|
||||
```bash
|
||||
hermes dashboard --gui --no-open --port 9120
|
||||
```
|
||||
|
||||
or let the native shell start it. The tray menu owns:
|
||||
|
||||
- Open Hermes
|
||||
- Open in Browser
|
||||
- Restart Hermes Runtime
|
||||
- Quit Hermes
|
||||
|
||||
The native shell reuses a healthy GUI runtime when one is already running.
|
||||
Otherwise it picks the first free port from `9120..9139`, passes that port into
|
||||
the WSL/backend process, and navigates the Tauri window there. Set
|
||||
`HERMES_GUI_PORT` to force a starting port.
|
||||
|
||||
## Fresh Install Emulation
|
||||
|
||||
Use an isolated Hermes home without touching your real `~/.hermes`:
|
||||
|
||||
```powershell
|
||||
powershell.exe -ExecutionPolicy Bypass -File \\wsl$\Ubuntu\home\bb\hermes-agent\apps\gui\dev-tauri.ps1 -Fresh
|
||||
```
|
||||
|
||||
Reset that disposable home and run again:
|
||||
|
||||
```powershell
|
||||
powershell.exe -ExecutionPolicy Bypass -File \\wsl$\Ubuntu\home\bb\hermes-agent\apps\gui\dev-tauri.ps1 -Fresh -ResetFresh
|
||||
```
|
||||
|
||||
Fresh mode stores state in `%LOCALAPPDATA%\Hermes\fresh-install-home` and starts
|
||||
from port `9140` so it does not collide with your normal GUI dev session.
|
||||
|
||||
Set `HERMES_GUI_MIN_SPLASH_MS` only when debugging the startup screen; default
|
||||
startup is instant once the backend is healthy.
|
||||
|
||||
## Boundary
|
||||
|
||||
GUI owns:
|
||||
|
||||
- app shell/window
|
||||
- startup state
|
||||
- sidecar process lifecycle
|
||||
- future tray/notifications/installers
|
||||
|
||||
Hermes owns:
|
||||
|
||||
- dashboard UI
|
||||
- auth/session token
|
||||
- profiles/config/env
|
||||
- TUI/PTY chat bridge
|
||||
- tools/skills/gateway
|
||||
- update flow
|
||||
@@ -1,57 +0,0 @@
|
||||
# Launches the Tauri CLI for the Hermes GUI from Windows PowerShell.
#
#   -Command     Tauri subcommand to run (default: "dev").
#   -Fresh       Run against a disposable HERMES_HOME under %LOCALAPPDATA%.
#   -ResetFresh  With -Fresh, delete that disposable home before starting.
param(
    [string]$Command = "dev",
    [switch]$Fresh,
    [switch]$ResetFresh
)

$ErrorActionPreference = "Stop"
Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass -Force

# Directory containing this script; the Tauri CLI is run from here.
$AppRoot = Split-Path -Parent $MyInvocation.MyCommand.Path

if (-not (Get-Command node -ErrorAction SilentlyContinue)) {
    throw "Windows Node.js was not found. Install it with: winget install OpenJS.NodeJS.LTS"
}

if (-not (Get-Command rustc -ErrorAction SilentlyContinue)) {
    throw "Windows Rust was not found. Install it with: winget install Rustlang.Rustup"
}

$Tauri = Get-Command tauri -ErrorAction SilentlyContinue
$CargoTauri = Get-Command cargo-tauri -ErrorAction SilentlyContinue

if (-not $Tauri -and -not $CargoTauri) {
    throw "Tauri CLI not found. Install it with: npm install -g @tauri-apps/cli (run from a normal Windows path, not \\wsl$)"
}

# Build outside the script's own location, into a local Windows directory,
# with incremental compilation turned off.
$env:CARGO_INCREMENTAL = "0"
$env:CARGO_TARGET_DIR = Join-Path $env:LOCALAPPDATA "Hermes\cargo-target\gui"
New-Item -ItemType Directory -Force -Path $env:CARGO_TARGET_DIR | Out-Null

if ($Fresh) {
    $FreshHome = Join-Path $env:LOCALAPPDATA "Hermes\fresh-install-home"
    if ($ResetFresh -and (Test-Path $FreshHome)) {
        Remove-Item -Recurse -Force $FreshHome
    }
    New-Item -ItemType Directory -Force -Path $FreshHome | Out-Null
    $env:HERMES_HOME = $FreshHome
    $env:HERMES_GUI_PORT = "9140"
    $env:HERMES_GUI_FRESH = "1"
    Write-Host "Fresh GUI mode"
    Write-Host "  HERMES_HOME=$FreshHome"
    Write-Host "  HERMES_GUI_PORT=$env:HERMES_GUI_PORT"
}

# Remember the CLI's exit status so callers (npm scripts, CI) can see failures;
# a non-zero exit from a native command does not by itself stop the script.
$ExitCode = 0
Push-Location $AppRoot
try {
    if ($Tauri) {
        & tauri $Command
    }
    else {
        & cargo tauri $Command
    }
    $ExitCode = $LASTEXITCODE
}
finally {
    Pop-Location
}

exit $ExitCode
|
||||
@@ -1,13 +0,0 @@
|
||||
{
|
||||
"name": "@hermes/gui",
|
||||
"version": "0.0.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "node scripts/dev-shell.mjs",
|
||||
"dev:tauri": "node scripts/tauri.mjs dev",
|
||||
"build": "node scripts/tauri.mjs build",
|
||||
"dashboard": "node scripts/start-dashboard.mjs",
|
||||
"tauri": "node scripts/tauri.mjs"
|
||||
}
|
||||
}
|
||||
@@ -1,156 +0,0 @@
|
||||
import { spawn, spawnSync } from "node:child_process";
|
||||
import { createServer } from "node:net";
|
||||
import { dirname, resolve } from "node:path";
|
||||
import { setTimeout as delay } from "node:timers/promises";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
const here = dirname(fileURLToPath(import.meta.url));
|
||||
const repoRoot = resolve(here, "../../..");
|
||||
const python = process.env.HERMES_PYTHON || "python";
|
||||
let port = process.env.HERMES_GUI_PORT || "9120";
|
||||
let url = `http://127.0.0.1:${port}`;
|
||||
|
||||
let dashboard = null;
|
||||
|
||||
/** Kill the spawned dashboard child, unless none exists or it was already killed. */
function stop() {
  if (!dashboard || dashboard.killed) return;
  dashboard.kill();
}
|
||||
|
||||
// On SIGINT / SIGTERM: stop the dashboard child, then exit with 130 / 143.
for (const [signalName, exitCode] of [
  ["SIGINT", 130],
  ["SIGTERM", 143],
]) {
  process.on(signalName, () => {
    stop();
    process.exit(exitCode);
  });
}
// Also stop the child on any other exit path.
process.on("exit", stop);
|
||||
|
||||
/**
 * Poll `isHealthy()` up to 120 times, waiting 500 ms after each failed probe.
 * Resolves to true as soon as a probe succeeds, false once attempts run out.
 */
async function waitForHealth() {
  let attemptsLeft = 120;
  while (attemptsLeft > 0) {
    if (await isHealthy()) return true;
    await delay(500);
    attemptsLeft -= 1;
  }
  return false;
}
|
||||
|
||||
/**
 * Probe `${url}/api/health` with a 1 s timeout.
 * Resolves to true only for an OK response whose JSON body has
 * `status === "ok"`; any fetch or parse failure resolves to false.
 */
async function isHealthy() {
  try {
    const response = await fetch(`${url}/api/health`, {
      signal: AbortSignal.timeout(1000),
    });
    const payload = await response.json();
    return response.ok && payload.status === "ok";
  } catch {
    return false;
  }
}
|
||||
|
||||
/**
 * Check whether `candidate` can be listened on at 127.0.0.1.
 * Opens a throwaway server on the port; resolves to false on a server error,
 * or to true after the server has listened and been closed again.
 */
function canBind(candidate) {
  return new Promise((settle) => {
    const probe = createServer();
    probe.once("error", () => settle(false));
    probe.listen(Number(candidate), "127.0.0.1", () => {
      probe.close(() => settle(true));
    });
  });
}
|
||||
|
||||
/**
 * Pick the first bindable port among 20 consecutive candidates starting at
 * the current `port`, updating `port` and `url` to match. Does nothing when
 * HERMES_GUI_PORT is set; leaves `port` / `url` unchanged when no candidate
 * can be bound.
 */
async function choosePort() {
  if (process.env.HERMES_GUI_PORT) return;

  const first = Number(port);
  for (let candidate = first; candidate < first + 20; candidate += 1) {
    if (!(await canBind(candidate))) continue;
    port = String(candidate);
    url = `http://127.0.0.1:${port}`;
    return;
  }
}
|
||||
|
||||
/**
 * Spawn `hermes_cli.main dashboard --gui --no-open` on 127.0.0.1 at the
 * currently selected port and store the child in `dashboard`.
 *
 * The child runs from the repo root with HERMES_GUI=1 and inherits stdio.
 * When it exits, this process exits with the same code (0 if none).
 */
function startDashboard() {
  dashboard = spawn(
    python,
    [
      "-m",
      "hermes_cli.main",
      "dashboard",
      "--gui",
      "--no-open",
      "--host",
      "127.0.0.1",
      "--port",
      port,
    ],
    {
      cwd: repoRoot,
      env: {
        ...process.env,
        HERMES_GUI: "1",
      },
      stdio: "inherit",
    },
  );

  // Without an "error" listener, a failed spawn (e.g. the interpreter is not
  // on PATH) surfaces as an unhandled 'error' event with a raw stack trace.
  dashboard.on("error", (err) => {
    console.error(
      `Could not start Hermes dashboard with "${python}": ${err.message}`,
    );
    process.exit(1);
  });

  dashboard.on("exit", (code) => {
    process.exit(code ?? 0);
  });
}
|
||||
|
||||
/**
 * Run `command` synchronously with output discarded (through a shell on
 * win32) and report whether it exited with status 0.
 */
function run(command, args) {
  const { status } = spawnSync(command, args, {
    shell: process.platform === "win32",
    stdio: "ignore",
  });
  return status === 0;
}
|
||||
|
||||
/**
 * Open `url` in an app-style browser window, trying launchers in order and
 * stopping at the first one that exits with status 0. Returns whether any
 * launcher succeeded.
 *
 * - win32 or WSL: `cmd.exe /C start` with chrome, then msedge, then the bare URL.
 * - macOS: `open` with Google Chrome, then the default handler.
 * - otherwise: google-chrome, chromium, then xdg-open.
 */
function openGuiWindow() {
  const appFlag = `--app=${url}`;

  // WSL uses the same cmd.exe launchers as native Windows.
  if (process.platform === "win32" || process.env.WSL_DISTRO_NAME) {
    const viaStart = (...target) => run("cmd.exe", ["/C", "start", "", ...target]);
    return (
      viaStart("chrome", appFlag) || viaStart("msedge", appFlag) || viaStart(url)
    );
  }

  if (process.platform === "darwin") {
    return (
      run("open", ["-na", "Google Chrome", "--args", appFlag]) ||
      run("open", [url])
    );
  }

  return (
    run("google-chrome", [appFlag]) ||
    run("chromium", [appFlag]) ||
    run("xdg-open", [url])
  );
}
|
||||
|
||||
// Reuse a dashboard that is already answering on the configured URL.
if (await isHealthy()) {
  console.log(`Hermes GUI already running -> ${url}`);
  openGuiWindow();
  process.exit(0);
}

// Otherwise pick a port, start our own dashboard and wait for it to come up.
await choosePort();
startDashboard();

const becameHealthy = await waitForHealth();
if (!becameHealthy) {
  console.error(`Hermes GUI did not become healthy at ${url}`);
} else {
  console.log(`Hermes GUI -> ${url}`);
  openGuiWindow();
}
|
||||
@@ -1,95 +0,0 @@
|
||||
import { spawn } from "node:child_process";
|
||||
import { dirname, resolve } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
const here = dirname(fileURLToPath(import.meta.url));
|
||||
const repoRoot = resolve(here, "../../..");
|
||||
const python = process.env.HERMES_PYTHON || "python";
|
||||
const port = process.env.HERMES_GUI_PORT || "9120";
|
||||
const url = `http://127.0.0.1:${port}`;
|
||||
|
||||
/**
 * Probe `${url}/api/health` with a 1 s timeout.
 * Resolves to true only for an OK response whose JSON body has
 * `status === "ok"`; any fetch or parse failure resolves to false.
 */
async function isHealthy() {
  try {
    const response = await fetch(`${url}/api/health`, {
      signal: AbortSignal.timeout(1000),
    });
    const payload = await response.json();
    return response.ok && payload.status === "ok";
  } catch {
    return false;
  }
}
|
||||
|
||||
/**
 * When running on win32 with the repo on a `\\wsl$\<distro>\...` or
 * `\\wsl.localhost\<distro>\...` path, return `{ distro, path }` where
 * `path` is the slash-separated remainder after the distro segment.
 * Returns null in every other case.
 */
function wslRepoRoot() {
  if (process.platform !== "win32") return null;

  // "\\host\distro\a\b" -> ["", "", host, distro, "a", "b"]
  const [, , host, distro, ...rest] = repoRoot.replaceAll("\\", "/").split("/");
  const hostName = host?.toLowerCase();
  if (hostName !== "wsl$" && hostName !== "wsl.localhost") return null;

  const path = `/${rest.join("/")}`;
  return distro && path !== "/" ? { distro, path } : null;
}
|
||||
|
||||
/**
 * Spawn `hermes_cli.main dashboard --gui --no-open` on 127.0.0.1 at `port`
 * and return the child process (stdio inherited).
 *
 * When the repo sits on a WSL UNC path, the command is run through `wsl.exe`
 * in that distro and directory, using HERMES_WSL_PYTHON (default "python").
 * Otherwise it runs `python` directly from the repo root. HERMES_GUI=1 is set
 * for the child in both cases.
 */
function spawnDashboard() {
  const dashboardArgs = [
    "-m",
    "hermes_cli.main",
    "dashboard",
    "--gui",
    "--no-open",
    "--host",
    "127.0.0.1",
    "--port",
    port,
  ];

  const wsl = wslRepoRoot();
  if (!wsl) {
    return spawn(python, dashboardArgs, {
      cwd: repoRoot,
      env: {
        ...process.env,
        HERMES_GUI: "1",
      },
      stdio: "inherit",
    });
  }

  const wslPython = process.env.HERMES_WSL_PYTHON || "python";
  return spawn(
    "wsl.exe",
    [
      "-d",
      wsl.distro,
      "--cd",
      wsl.path,
      "env",
      "HERMES_GUI=1",
      wslPython,
      ...dashboardArgs,
    ],
    { stdio: "inherit" },
  );
}
|
||||
|
||||
// Nothing to do when a dashboard is already answering on the configured URL.
if (await isHealthy()) {
  console.log(`Hermes GUI already running -> ${url}`);
  process.exit(0);
}

const child = spawnDashboard();

// Without an "error" listener, a failed spawn (e.g. the interpreter or
// wsl.exe is missing) surfaces as an unhandled 'error' event.
child.on("error", (err) => {
  console.error(`Could not start Hermes dashboard: ${err.message}`);
  process.exit(1);
});

// Mirror the child's fate: re-raise its terminating signal on this process,
// otherwise exit with its exit code (0 if none).
child.on("exit", (code, signal) => {
  if (signal) process.kill(process.pid, signal);
  process.exit(code ?? 0);
});
|
||||
@@ -1,90 +0,0 @@
|
||||
import { spawnSync } from "node:child_process";
|
||||
import { existsSync } from "node:fs";
|
||||
import { dirname, resolve } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
const here = dirname(fileURLToPath(import.meta.url));
|
||||
const appRoot = resolve(here, "..");
|
||||
const bin = process.platform === "win32" ? "tauri.cmd" : "tauri";
|
||||
const localTauri = resolve(appRoot, "node_modules", ".bin", bin);
|
||||
const args = process.argv.slice(2);
|
||||
|
||||
/** True when running on Linux with WSL_DISTRO_NAME set to a non-empty value. */
function isWsl() {
  const hasDistroName = Boolean(process.env.WSL_DISTRO_NAME);
  return hasDistroName && process.platform === "linux";
}
|
||||
|
||||
/**
 * Wrap `value` in single quotes for use as a PowerShell literal, doubling
 * any single quote it contains.
 */
function quotePs(value) {
  const escaped = value.split("'").join("''");
  return "'" + escaped + "'";
}
|
||||
|
||||
/**
 * Hand the Tauri dev run over to Windows PowerShell.
 *
 * Translates `appRoot` to a Windows path with `wslpath -w`, then runs
 * `npm run dev:tauri` from that directory in `powershell.exe` after checking
 * that Windows `npm` and `rustc` are available.
 *
 * Returns false when the Windows path cannot be determined. Otherwise it does
 * not return: the process exits with PowerShell's status (1 if none).
 */
function dispatchToWindows() {
  const pathResult = spawnSync("wslpath", ["-w", appRoot], {
    encoding: "utf8",
  });
  // `stdout` is null when wslpath itself could not be spawned.
  const windowsPath = (pathResult.stdout ?? "").trim();
  if (!windowsPath) return false;

  const command = [
    "$ErrorActionPreference = 'Stop'",
    "Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass -Force",
    "if (-not (Get-Command npm -ErrorAction SilentlyContinue)) {",
    '  Write-Error "Windows npm was not found. Install Windows Node.js first: winget install OpenJS.NodeJS.LTS"',
    "}",
    "if (-not (Get-Command rustc -ErrorAction SilentlyContinue)) {",
    '  Write-Error "Windows Rust was not found. Install Rust first: winget install Rustlang.Rustup"',
    "}",
    `Set-Location -LiteralPath ${quotePs(windowsPath)}`,
    "& npm run dev:tauri",
  ].join("; ");
  const result = spawnSync(
    "powershell.exe",
    ["-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", command],
    { stdio: "inherit" },
  );
  process.exit(result.status ?? 1);
}
|
||||
|
||||
/**
 * Run `command` with `commandArgs` from the app root, inheriting stdio.
 *
 * On Windows the command is invoked through PowerShell with a
 * process-scoped execution-policy bypass; elsewhere it is spawned directly.
 *
 * Returns `false` when the executable could not be found (ENOENT).
 * Otherwise, with `exit` true (the default) the process exits with the
 * command's status; with `exit` false the return value says whether the
 * command exited with status 0.
 */
function run(command, commandArgs, { exit = true } = {}) {
  let outcome;

  if (process.platform === "win32") {
    const invocation = `& ${quotePs(command)} ${commandArgs.map(quotePs).join(" ")}`;
    const psCommand = [
      "$ErrorActionPreference = 'Stop'",
      "Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass -Force",
      `Set-Location -LiteralPath ${quotePs(appRoot)}`,
      invocation,
    ].join("; ");
    outcome = spawnSync(
      "powershell.exe",
      ["-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", psCommand],
      { stdio: "inherit" },
    );
  } else {
    outcome = spawnSync(command, commandArgs, {
      cwd: appRoot,
      env: process.env,
      stdio: "inherit",
    });
  }

  if (outcome.error?.code === "ENOENT") {
    return false;
  }
  if (exit) {
    process.exit(outcome.status ?? 1);
  }
  return outcome.status === 0;
}
|
||||
|
||||
// Inside WSL, prefer the native Windows toolchain unless the user opted into
// a Linux-side Tauri run with HERMES_GUI_TAURI_WSL=1.
if (isWsl() && process.env.HERMES_GUI_TAURI_WSL !== "1") {
  console.log("Launching native Windows Tauri from WSL...");
  dispatchToWindows();
  // Reached only when dispatchToWindows() returned instead of exiting.
  // Build the manual instructions from this machine's distro and checkout
  // location rather than a fixed, developer-specific path.
  const uncPath = `\\\\wsl$\\${process.env.WSL_DISTRO_NAME}${appRoot.replaceAll("/", "\\")}`;
  console.error(
    "Could not hand off to Windows PowerShell. Run this from Windows PowerShell instead:",
  );
  console.error(`  cd ${uncPath}`);
  console.error("  npm run dev:tauri");
  process.exit(1);
}

// Try Tauri CLIs in order of preference: project-local binary, global
// `tauri`, `cargo tauri`, and finally an on-demand npx download.
// NOTE(review): run(..., { exit: false }) returns false for a missing
// executable and for a non-zero exit alike, so a failing run falls through
// to the next candidate.
if (existsSync(localTauri)) run(localTauri, args);
if (run("tauri", args, { exit: false })) process.exit(0);
if (run("cargo", ["tauri", ...args], { exit: false })) process.exit(0);
run("npx", ["--yes", "@tauri-apps/cli@latest", ...args]);
|
||||
@@ -1 +0,0 @@
|
||||
/target/
|
||||
Generated
-5579
File diff suppressed because it is too large
Load Diff
@@ -1,17 +0,0 @@
|
||||
[package]
|
||||
name = "hermes-gui"
|
||||
version = "0.0.0"
|
||||
description = "Hermes GUI shell"
|
||||
edition = "2021"
|
||||
|
||||
[lib]
|
||||
name = "hermes_gui_lib"
|
||||
crate-type = ["staticlib", "cdylib", "rlib"]
|
||||
|
||||
[build-dependencies]
|
||||
tauri-build = { version = "2", features = [] }
|
||||
|
||||
[dependencies]
|
||||
tauri = { version = "2", features = ["tray-icon"] }
|
||||
tauri-plugin-notification = "2"
|
||||
tauri-plugin-opener = "2"
|
||||
@@ -1,3 +0,0 @@
|
||||
fn main() {
|
||||
tauri_build::build();
|
||||
}
|
||||
@@ -1,7 +0,0 @@
|
||||
{
|
||||
"$schema": "../gen/schemas/desktop-schema.json",
|
||||
"identifier": "default",
|
||||
"description": "Default Hermes GUI permissions",
|
||||
"windows": ["main"],
|
||||
"permissions": ["core:default", "notification:default", "opener:default"]
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@@ -1 +0,0 @@
|
||||
{"default":{"identifier":"default","description":"Default Hermes GUI permissions","local":true,"windows":["main"],"permissions":["core:default","notification:default","opener:default"]}}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
Before Width: | Height: | Size: 135 B |
Binary file not shown.
|
Before Width: | Height: | Size: 1.1 KiB |
@@ -1,4 +0,0 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
|
||||
<rect width="100" height="100" rx="18" fill="#071313"/>
|
||||
<text x="50" y="70" text-anchor="middle" font-size="68" fill="#f0e6d2">⚕</text>
|
||||
</svg>
|
||||
|
Before Width: | Height: | Size: 212 B |
@@ -1 +0,0 @@
|
||||
|
||||
@@ -1,433 +0,0 @@
|
||||
use std::{
|
||||
io::{Read, Write},
|
||||
net::{TcpListener, TcpStream},
|
||||
process::{Child, Command, Stdio},
|
||||
sync::Mutex,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use tauri::{
|
||||
image::Image,
|
||||
menu::{Menu, MenuItem, PredefinedMenuItem},
|
||||
tray::{MouseButton, MouseButtonState, TrayIconBuilder, TrayIconEvent},
|
||||
App, AppHandle, Manager, WebviewWindow,
|
||||
};
|
||||
|
||||
const GUI_HOST: &str = "127.0.0.1";
|
||||
const DEFAULT_GUI_PORT: u16 = 9120;
|
||||
const MIN_SPLASH_MS: u64 = 0;
|
||||
const SPLASH_URL: &str = "data:text/html,%3C!doctype%20html%3E%3Cmeta%20charset%3Dutf-8%3E%3Cstyle%3Ebody%7Bmargin%3A0%3Bheight%3A100vh%3Bdisplay%3Agrid%3Bplace-items%3Acenter%3Bbackground%3A%23071313%3Bcolor%3A%23f0e6d2%3Bfont%3A14px%20monospace%3Bletter-spacing%3A.08em%3Btext-transform%3Auppercase%7D%3C%2Fstyle%3E%3Cbody%3EStarting%20Hermes%E2%80%A6%3C%2Fbody%3E";
|
||||
|
||||
/// State shared between the tray menu, Tauri commands and startup code.
struct GuiState {
    /// Dashboard process spawned by this shell; `None` when the shell has
    /// not started one or has already stopped it.
    child: Mutex<Option<Child>>,
    /// Port the shell currently uses to reach the dashboard.
    port: Mutex<u16>,
}
|
||||
|
||||
fn gui_url(port: u16) -> String {
|
||||
format!("http://{GUI_HOST}:{port}")
|
||||
}
|
||||
|
||||
fn check_health(port: u16) -> bool {
|
||||
let Ok(mut stream) = TcpStream::connect_timeout(
|
||||
&format!("{GUI_HOST}:{port}").parse().unwrap(),
|
||||
Duration::from_secs(1),
|
||||
) else {
|
||||
return false;
|
||||
};
|
||||
|
||||
let _ = stream.set_read_timeout(Some(Duration::from_secs(1)));
|
||||
let request =
|
||||
format!("GET /api/health HTTP/1.1\r\nHost: {GUI_HOST}:{port}\r\nConnection: close\r\n\r\n");
|
||||
|
||||
if stream.write_all(request.as_bytes()).is_err() {
|
||||
return false;
|
||||
}
|
||||
|
||||
let mut response = String::new();
|
||||
let _ = stream.read_to_string(&mut response);
|
||||
response.contains("200 OK")
|
||||
&& response.contains("\"status\":\"ok\"")
|
||||
&& response.contains("\"mode\":\"gui\"")
|
||||
}
|
||||
|
||||
fn can_bind(port: u16) -> bool {
|
||||
TcpListener::bind((GUI_HOST, port)).is_ok()
|
||||
}
|
||||
|
||||
fn base_port() -> u16 {
|
||||
std::env::var("HERMES_GUI_PORT")
|
||||
.ok()
|
||||
.and_then(|raw| raw.parse().ok())
|
||||
.unwrap_or(DEFAULT_GUI_PORT)
|
||||
}
|
||||
|
||||
fn select_port() -> u16 {
|
||||
let start = base_port();
|
||||
for port in start..start.saturating_add(20) {
|
||||
if check_health(port) || can_bind(port) {
|
||||
return port;
|
||||
}
|
||||
}
|
||||
start
|
||||
}
|
||||
|
||||
fn repo_root() -> std::path::PathBuf {
|
||||
std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("../../..")
|
||||
.canonicalize()
|
||||
.unwrap_or_else(|_| std::path::PathBuf::from("."))
|
||||
}
|
||||
|
||||
/// Return the bundled runtime directory named by `HERMES_GUI_RUNTIME_DIR`,
/// or `None` when the variable is not set.
fn runtime_dir() -> Option<std::path::PathBuf> {
    let raw = std::env::var_os("HERMES_GUI_RUNTIME_DIR")?;
    Some(std::path::PathBuf::from(raw))
}
|
||||
|
||||
/// Path of the Python interpreter inside the runtime's `venv`, using the
/// platform's virtualenv layout (`Scripts\python.exe` on Windows,
/// `bin/python` elsewhere).
fn runtime_python(runtime: &std::path::Path) -> std::path::PathBuf {
    let (bin_dir, executable) = if cfg!(target_os = "windows") {
        ("Scripts", "python.exe")
    } else {
        ("bin", "python")
    };

    let mut python = runtime.join("venv");
    python.push(bin_dir);
    python.push(executable);
    python
}
|
||||
|
||||
/// Split a Windows UNC path into a WSL distro name and the Linux path inside
/// that distro.
///
/// Recognised forms (either slash direction, host compared
/// case-insensitively):
///
/// * `\\wsl$\<distro>\<path...>` and `\\wsl.localhost\<distro>\<path...>`
/// * the verbatim spelling `\\?\UNC\wsl$\<distro>\<path...>`, which is what
///   `Path::canonicalize` produces for UNC paths on Windows
///
/// Returns `None` for anything else, including non-UNC paths that merely
/// contain a `wsl$` segment and UNC paths with an empty distro name.
fn wsl_path(root: &std::path::Path) -> Option<(String, String)> {
    const VERBATIM_UNC: &str = "//?/UNC/";

    let raw = root.to_string_lossy().replace('\\', "/");

    // Drop the UNC introducer so the host is always the first segment.
    let unc = match raw.get(..VERBATIM_UNC.len()) {
        Some(prefix) if prefix.eq_ignore_ascii_case(VERBATIM_UNC) => &raw[VERBATIM_UNC.len()..],
        _ => raw.strip_prefix("//")?,
    };

    let mut segments = unc.split('/');
    let host = segments.next()?.to_ascii_lowercase();
    if host != "wsl$" && host != "wsl.localhost" {
        return None;
    }

    let distro = segments.next()?.to_string();
    if distro.is_empty() {
        // `wsl.exe -d ""` is never a valid target.
        return None;
    }

    // Everything after the distro is the absolute path inside WSL.
    let path = format!("/{}", segments.collect::<Vec<_>>().join("/"));
    Some((distro, path))
}
|
||||
|
||||
fn start_dashboard(port: u16) -> std::io::Result<Child> {
|
||||
if let Some(runtime) = runtime_dir() {
|
||||
let python = runtime_python(&runtime);
|
||||
let web_dist = runtime.join("web_dist");
|
||||
let tui_dir = runtime.join("ui-tui");
|
||||
let port = port.to_string();
|
||||
return Command::new(python)
|
||||
.args([
|
||||
"-m",
|
||||
"hermes_cli.main",
|
||||
"dashboard",
|
||||
"--gui",
|
||||
"--no-open",
|
||||
"--host",
|
||||
GUI_HOST,
|
||||
"--port",
|
||||
&port,
|
||||
])
|
||||
.env("HERMES_GUI", "1")
|
||||
.env("HERMES_GUI_PORT", &port)
|
||||
.env("HERMES_WEB_DIST", web_dist)
|
||||
.env("HERMES_TUI_DIR", tui_dir)
|
||||
.envs(
|
||||
std::env::vars()
|
||||
.filter(|(key, _)| matches!(key.as_str(), "HERMES_HOME" | "HERMES_GUI_FRESH")),
|
||||
)
|
||||
.stdin(Stdio::null())
|
||||
.stdout(Stdio::null())
|
||||
.stderr(Stdio::null())
|
||||
.spawn();
|
||||
}
|
||||
|
||||
let root = repo_root();
|
||||
let port = port.to_string();
|
||||
|
||||
if let Some((distro, path)) = wsl_path(&root) {
|
||||
let port_env = format!("HERMES_GUI_PORT={port}");
|
||||
let mut env_args = vec!["HERMES_GUI=1".to_string(), port_env];
|
||||
if let Ok(home) = std::env::var("HERMES_HOME") {
|
||||
env_args.push(format!("HERMES_HOME={home}"));
|
||||
}
|
||||
if let Ok(fresh) = std::env::var("HERMES_GUI_FRESH") {
|
||||
env_args.push(format!("HERMES_GUI_FRESH={fresh}"));
|
||||
}
|
||||
let mut args = vec![
|
||||
"-d".to_string(),
|
||||
distro,
|
||||
"--cd".to_string(),
|
||||
path,
|
||||
"env".to_string(),
|
||||
];
|
||||
args.extend(env_args);
|
||||
args.extend([
|
||||
"python".to_string(),
|
||||
"-m".to_string(),
|
||||
"hermes_cli.main".to_string(),
|
||||
"dashboard".to_string(),
|
||||
"--gui".to_string(),
|
||||
"--no-open".to_string(),
|
||||
"--host".to_string(),
|
||||
GUI_HOST.to_string(),
|
||||
"--port".to_string(),
|
||||
port.clone(),
|
||||
]);
|
||||
return Command::new("wsl.exe")
|
||||
.args(args)
|
||||
.stdin(Stdio::null())
|
||||
.stdout(Stdio::null())
|
||||
.stderr(Stdio::null())
|
||||
.spawn();
|
||||
}
|
||||
|
||||
Command::new("python")
|
||||
.args([
|
||||
"-m",
|
||||
"hermes_cli.main",
|
||||
"dashboard",
|
||||
"--gui",
|
||||
"--no-open",
|
||||
"--host",
|
||||
GUI_HOST,
|
||||
"--port",
|
||||
&port,
|
||||
])
|
||||
.current_dir(root)
|
||||
.env("HERMES_GUI", "1")
|
||||
.env("HERMES_GUI_PORT", &port)
|
||||
.envs(
|
||||
std::env::vars()
|
||||
.filter(|(key, _)| matches!(key.as_str(), "HERMES_HOME" | "HERMES_GUI_FRESH")),
|
||||
)
|
||||
.stdin(Stdio::null())
|
||||
.stdout(Stdio::null())
|
||||
.stderr(Stdio::null())
|
||||
.spawn()
|
||||
}
|
||||
|
||||
fn stop_owned_dashboard(state: &GuiState) {
|
||||
let Some(mut child) = state.child.lock().expect("gui child lock poisoned").take() else {
|
||||
return;
|
||||
};
|
||||
let _ = child.kill();
|
||||
let _ = child.wait();
|
||||
}
|
||||
|
||||
fn current_port(state: &GuiState) -> u16 {
|
||||
*state.port.lock().expect("gui port lock poisoned")
|
||||
}
|
||||
|
||||
fn ensure_dashboard(state: &GuiState) -> Result<(), String> {
|
||||
let current = current_port(state);
|
||||
if check_health(current) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let port = select_port();
|
||||
*state.port.lock().expect("gui port lock poisoned") = port;
|
||||
|
||||
if check_health(port) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let child = start_dashboard(port).map_err(|err| {
|
||||
format!(
|
||||
"Could not auto-start Hermes dashboard ({err}). Start it manually with: hermes dashboard --gui --no-open --port {port}"
|
||||
)
|
||||
})?;
|
||||
*state.child.lock().expect("gui child lock poisoned") = Some(child);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn navigate_when_ready(window: WebviewWindow, port: u16) {
|
||||
std::thread::spawn(move || {
|
||||
let started = Instant::now();
|
||||
while started.elapsed() < Duration::from_secs(60) {
|
||||
if check_health(port) {
|
||||
let min_splash = std::env::var("HERMES_GUI_MIN_SPLASH_MS")
|
||||
.ok()
|
||||
.and_then(|raw| raw.parse::<u64>().ok())
|
||||
.unwrap_or(MIN_SPLASH_MS);
|
||||
let elapsed = started.elapsed();
|
||||
if elapsed < Duration::from_millis(min_splash) {
|
||||
std::thread::sleep(Duration::from_millis(min_splash) - elapsed);
|
||||
}
|
||||
if let Ok(url) = tauri::Url::parse(&gui_url(port)) {
|
||||
let _ = window.navigate(url);
|
||||
let _ = window.show();
|
||||
let _ = window.set_focus();
|
||||
}
|
||||
return;
|
||||
}
|
||||
std::thread::sleep(Duration::from_millis(500));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
fn show_main_window(app: &AppHandle) {
|
||||
if let Some(window) = app.get_webview_window("main") {
|
||||
let _ = window.show();
|
||||
let _ = window.set_focus();
|
||||
}
|
||||
}
|
||||
|
||||
fn open_browser(port: u16) {
|
||||
let url = gui_url(port);
|
||||
|
||||
#[cfg(target_os = "windows")]
|
||||
let _ = Command::new("cmd")
|
||||
.args(["/C", "start", "", &url])
|
||||
.stdin(Stdio::null())
|
||||
.stdout(Stdio::null())
|
||||
.stderr(Stdio::null())
|
||||
.spawn();
|
||||
|
||||
#[cfg(target_os = "macos")]
|
||||
let _ = Command::new("open").arg(&url).spawn();
|
||||
|
||||
#[cfg(all(unix, not(target_os = "macos")))]
|
||||
let _ = Command::new("xdg-open").arg(&url).spawn();
|
||||
}
|
||||
|
||||
fn tray_icon() -> Image<'static> {
|
||||
let width = 32;
|
||||
let height = 32;
|
||||
let mut rgba = Vec::with_capacity(width * height * 4);
|
||||
|
||||
for y in 0..height {
|
||||
for x in 0..width {
|
||||
let mark = (14..=17).contains(&x) && (5..=26).contains(&y)
|
||||
|| (8..=23).contains(&x) && (13..=16).contains(&y)
|
||||
|| (10..=21).contains(&x) && (y == 5 || y == 26);
|
||||
if mark {
|
||||
rgba.extend_from_slice(&[0xF0, 0xE6, 0xD2, 0xFF]);
|
||||
} else {
|
||||
rgba.extend_from_slice(&[0x07, 0x13, 0x13, 0xFF]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Image::new_owned(rgba, width as u32, height as u32)
|
||||
}
|
||||
|
||||
fn restart_runtime(app: &AppHandle) -> Result<(), String> {
|
||||
let state = app.state::<GuiState>();
|
||||
stop_owned_dashboard(&state);
|
||||
ensure_dashboard(&state)?;
|
||||
|
||||
if let Some(window) = app.get_webview_window("main") {
|
||||
if let Ok(url) = tauri::Url::parse(SPLASH_URL) {
|
||||
let _ = window.navigate(url);
|
||||
}
|
||||
let port = current_port(&state);
|
||||
navigate_when_ready(window, port);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn setup_tray(app: &App) -> tauri::Result<()> {
|
||||
let open_item = MenuItem::with_id(app, "open", "Open Hermes", true, None::<&str>)?;
|
||||
let browser_item = MenuItem::with_id(app, "browser", "Open in Browser", true, None::<&str>)?;
|
||||
let restart_item =
|
||||
MenuItem::with_id(app, "restart", "Restart Hermes Runtime", true, None::<&str>)?;
|
||||
let status_item = MenuItem::with_id(app, "status", "Local runtime", false, None::<&str>)?;
|
||||
let separator = PredefinedMenuItem::separator(app)?;
|
||||
let separator2 = PredefinedMenuItem::separator(app)?;
|
||||
let quit_item = MenuItem::with_id(app, "quit", "Quit Hermes", true, None::<&str>)?;
|
||||
|
||||
let menu = Menu::with_items(
|
||||
app,
|
||||
&[
|
||||
&open_item,
|
||||
&browser_item,
|
||||
&restart_item,
|
||||
&separator,
|
||||
&status_item,
|
||||
&separator2,
|
||||
&quit_item,
|
||||
],
|
||||
)?;
|
||||
|
||||
let icon = tray_icon();
|
||||
let _tray = TrayIconBuilder::new()
|
||||
.icon(icon)
|
||||
.menu(&menu)
|
||||
.tooltip("Hermes")
|
||||
.on_menu_event(|app, event| match event.id.as_ref() {
|
||||
"open" => show_main_window(app),
|
||||
"browser" => {
|
||||
let state = app.state::<GuiState>();
|
||||
open_browser(current_port(&state));
|
||||
}
|
||||
"restart" => {
|
||||
if let Err(err) = restart_runtime(app) {
|
||||
eprintln!("Failed to restart Hermes runtime: {err}");
|
||||
}
|
||||
}
|
||||
"quit" => {
|
||||
let state = app.state::<GuiState>();
|
||||
stop_owned_dashboard(&state);
|
||||
app.exit(0);
|
||||
}
|
||||
_ => {}
|
||||
})
|
||||
.on_tray_icon_event(|tray, event| {
|
||||
if let TrayIconEvent::Click {
|
||||
button: MouseButton::Left,
|
||||
button_state: MouseButtonState::Up,
|
||||
..
|
||||
} = event
|
||||
{
|
||||
show_main_window(&tray.app_handle());
|
||||
}
|
||||
})
|
||||
.build(app)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tauri::command]
|
||||
fn runtime_running(app: AppHandle) -> bool {
|
||||
let state = app.state::<GuiState>();
|
||||
check_health(current_port(&state))
|
||||
}
|
||||
|
||||
#[tauri::command]
|
||||
fn restart_runtime_command(app: AppHandle) -> Result<(), String> {
|
||||
restart_runtime(&app)
|
||||
}
|
||||
|
||||
pub fn run() {
|
||||
tauri::Builder::default()
|
||||
.plugin(tauri_plugin_notification::init())
|
||||
.plugin(tauri_plugin_opener::init())
|
||||
.manage(GuiState {
|
||||
child: Mutex::new(None),
|
||||
port: Mutex::new(base_port()),
|
||||
})
|
||||
.invoke_handler(tauri::generate_handler![
|
||||
runtime_running,
|
||||
restart_runtime_command
|
||||
])
|
||||
.setup(|app| {
|
||||
setup_tray(app)?;
|
||||
|
||||
if let Some(window) = app.get_webview_window("main") {
|
||||
if let Ok(url) = tauri::Url::parse(SPLASH_URL) {
|
||||
let _ = window.navigate(url);
|
||||
}
|
||||
|
||||
let state = app.state::<GuiState>();
|
||||
if let Err(err) = ensure_dashboard(&state) {
|
||||
eprintln!("{err}");
|
||||
}
|
||||
|
||||
let port = current_port(&state);
|
||||
navigate_when_ready(window, port);
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
.on_window_event(|window, event| {
|
||||
if let tauri::WindowEvent::CloseRequested { api, .. } = event {
|
||||
api.prevent_close();
|
||||
let _ = window.hide();
|
||||
}
|
||||
})
|
||||
.run(tauri::generate_context!())
|
||||
.expect("failed to run Hermes GUI");
|
||||
}
|
||||
@@ -1,5 +0,0 @@
|
||||
#![cfg_attr(not(debug_assertions), windows_subsystem = "windows")]
|
||||
|
||||
fn main() {
|
||||
hermes_gui_lib::run();
|
||||
}
|
||||
@@ -1,38 +0,0 @@
|
||||
{
|
||||
"$schema": "https://schema.tauri.app/config/2",
|
||||
"productName": "Hermes",
|
||||
"version": "0.0.0",
|
||||
"identifier": "ai.nous.hermes.gui",
|
||||
"build": {
|
||||
"beforeDevCommand": "",
|
||||
"beforeBuildCommand": "",
|
||||
"devUrl": "http://127.0.0.1:9120",
|
||||
"frontendDist": "../dist"
|
||||
},
|
||||
"app": {
|
||||
"withGlobalTauri": true,
|
||||
"windows": [
|
||||
{
|
||||
"label": "main",
|
||||
"title": "Hermes",
|
||||
"width": 1400,
|
||||
"height": 900,
|
||||
"minWidth": 900,
|
||||
"minHeight": 600,
|
||||
"resizable": true,
|
||||
"center": true
|
||||
}
|
||||
],
|
||||
"security": {
|
||||
"csp": "default-src 'self' http://127.0.0.1:* http://localhost:*; connect-src 'self' http://127.0.0.1:* http://localhost:* ws://127.0.0.1:* ws://localhost:*; img-src 'self' data: blob: http://127.0.0.1:* http://localhost:*; style-src 'self' 'unsafe-inline' http://127.0.0.1:* http://localhost:*; script-src 'self' 'unsafe-inline' 'unsafe-eval' http://127.0.0.1:* http://localhost:*"
|
||||
}
|
||||
},
|
||||
"bundle": {
|
||||
"active": true,
|
||||
"icon": ["icons/32x32.png", "icons/icon.ico", "icons/icon.svg"],
|
||||
"targets": ["nsis", "dmg", "app"],
|
||||
"resources": {
|
||||
"sidecars": "sidecars/"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,5 +0,0 @@
|
||||
// Browser-side GUI bridge entry.
|
||||
//
|
||||
// The dashboard remains in `web/`; this file is reserved for future shell-only
|
||||
// glue if we need pre-navigation scripts or native event wiring.
|
||||
export {};
|
||||
@@ -1,44 +0,0 @@
|
||||
param(
    [string]$Out = "$PSScriptRoot\..\gui\src-tauri\sidecars\hermes-runtime",
    [string]$Python = "python"
)

# Bundles the Hermes GUI runtime (dashboard build, TUI build, Python venv)
# into $Out. Any failing step aborts the script so a partial bundle is never
# reported as ready.
$ErrorActionPreference = "Stop"

# Runs a native executable and throws when it exits with a non-zero code.
# PowerShell does not treat a native command's exit code as an error on its
# own, so every external tool goes through this wrapper.
function Invoke-Checked {
    param(
        [Parameter(Mandatory = $true)][string]$Exe,
        [string[]]$Arguments = @()
    )
    & $Exe @Arguments
    if ($LASTEXITCODE -ne 0) {
        throw "Command failed with exit code ${LASTEXITCODE}: $Exe $($Arguments -join ' ')"
    }
}

$Root = Resolve-Path "$PSScriptRoot\..\.."

Write-Host "Bundling Hermes GUI runtime"
Write-Host "repo: $Root"
Write-Host "out:  $Out"

# Always start from an empty output directory.
if (Test-Path $Out) {
    Remove-Item -Recurse -Force $Out
}
New-Item -ItemType Directory -Force -Path $Out | Out-Null

Write-Host "-> Building dashboard"
Invoke-Checked -Exe "npm" -Arguments @("--prefix", "$Root\web", "ci")
Invoke-Checked -Exe "npm" -Arguments @("--prefix", "$Root\web", "run", "build")
Copy-Item -Recurse "$Root\web\dist" "$Out\web_dist"

Write-Host "-> Building TUI"
Invoke-Checked -Exe "npm" -Arguments @("--prefix", "$Root\ui-tui", "ci")
Invoke-Checked -Exe "npm" -Arguments @("--prefix", "$Root\ui-tui", "run", "build")
New-Item -ItemType Directory -Force -Path "$Out\ui-tui" | Out-Null
Copy-Item -Recurse "$Root\ui-tui\dist" "$Out\ui-tui\dist"
Copy-Item "$Root\ui-tui\package.json" "$Out\ui-tui\package.json"
Copy-Item "$Root\ui-tui\package-lock.json" "$Out\ui-tui\package-lock.json"
Copy-Item -Recurse "$Root\ui-tui\node_modules" "$Out\ui-tui\node_modules"

Write-Host "-> Creating Python runtime"
$VenvPython = "$Out\venv\Scripts\python.exe"
Invoke-Checked -Exe $Python -Arguments @("-m", "venv", "$Out\venv")
Invoke-Checked -Exe $VenvPython -Arguments @("-m", "pip", "install", "--upgrade", "pip")
# "[web,pty]" is literal text (the pip extras), not an index into $Root.
Invoke-Checked -Exe $VenvPython -Arguments @("-m", "pip", "install", "-e", "${Root}[web,pty]")

@"
# Hermes GUI Runtime

Generated by apps/shared/bundle-runtime.ps1.

Set HERMES_GUI_RUNTIME_DIR to this directory before launching the Tauri shell.
"@ | Set-Content "$Out\README.md"

Write-Host "Runtime bundle ready: $Out"
|
||||
@@ -1,41 +0,0 @@
|
||||
#!/usr/bin/env bash
# Bundle the Hermes GUI runtime (dashboard build, TUI build, Python venv).
#
# Usage: bundle-runtime.sh [OUT_DIR]
#   OUT_DIR  Destination directory; recreated from scratch on every run.
#   PYTHON   Environment variable naming the interpreter used to create the
#            venv (default: python).
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
OUT="${1:-"$ROOT/apps/gui/src-tauri/sidecars/hermes-runtime"}"
PYTHON="${PYTHON:-python}"

# Build the web dashboard and copy its dist into the bundle.
build_dashboard() {
  local web_src="$ROOT/web"
  echo "→ Building dashboard"
  npm --prefix "$web_src" ci
  npm --prefix "$web_src" run build
  cp -a "$web_src/dist" "$OUT/web_dist"
}

# Build the TUI and copy its dist, manifests and node_modules into the bundle.
build_tui() {
  local tui_src="$ROOT/ui-tui"
  local tui_out="$OUT/ui-tui"
  echo "→ Building TUI"
  npm --prefix "$tui_src" ci
  npm --prefix "$tui_src" run build
  mkdir -p "$tui_out"
  cp -a "$tui_src/dist" "$tui_out/dist"
  cp -a "$tui_src/package.json" "$tui_src/package-lock.json" "$tui_out/"
  cp -a "$tui_src/node_modules" "$tui_out/node_modules"
}

# Create the venv and install Hermes (editable) with the web and pty extras.
create_python_runtime() {
  local venv_python="$OUT/venv/bin/python"
  echo "→ Creating Python runtime"
  "$PYTHON" -m venv "$OUT/venv"
  "$venv_python" -m pip install --upgrade pip
  "$venv_python" -m pip install -e "$ROOT[web,pty]"
}

# Leave a note in the bundle explaining how the shell finds it.
write_readme() {
  cat > "$OUT/README.md" <<'EOF'
# Hermes GUI Runtime

Generated by apps/shared/bundle-runtime.sh.

Set HERMES_GUI_RUNTIME_DIR to this directory before launching the Tauri shell.
EOF
}

echo "Bundling Hermes GUI runtime"
echo "repo: $ROOT"
echo "out:  $OUT"

# Always start from an empty output directory.
rm -rf "$OUT"
mkdir -p "$OUT"

build_dashboard
build_tui
create_python_runtime
write_readme

echo "✓ Runtime bundle ready: $OUT"
|
||||
@@ -1,33 +0,0 @@
|
||||
# GUI Runtime Contract
|
||||
|
||||
The GUI shell starts Hermes with a small, explicit environment.
|
||||
|
||||
## Environment
|
||||
|
||||
```text
|
||||
HERMES_GUI=1
|
||||
HERMES_WEB_DIST=<bundled web dist>
|
||||
HERMES_TUI_DIR=<bundled ui-tui dir>
|
||||
```
|
||||
|
||||
The native shell uses `127.0.0.1:9120` as its initial GUI port during dev.
|
||||
Bundled builds should keep the port private to the local machine and expose it
|
||||
through `/api/health` and `/api/runtime`.
|
||||
|
||||
The shell should also pass the selected profile through the normal Hermes CLI
|
||||
profile mechanism once the profile picker is wired.
|
||||
|
||||
## Ports
|
||||
|
||||
Use `127.0.0.1` only. Start with the GUI default port, then fall back to a
|
||||
free port if occupied. Show the chosen port in the tray menu.
|
||||
|
||||
## User Data
|
||||
|
||||
The installer owns app files. Hermes owns user state under `HERMES_HOME`.
|
||||
Uninstallers must not delete user state unless the user explicitly asks.
|
||||
|
||||
## Update Model
|
||||
|
||||
The MVP does not use Tauri's native updater. Instead, the GUI runs `hermes update`, tails the
|
||||
action log, notifies completion, then offers to restart the runtime.
|
||||
+6
-2
@@ -951,9 +951,13 @@ class BatchRunner:
|
||||
root_logger.setLevel(original_level)
|
||||
|
||||
# Aggregate all batch statistics and update checkpoint
|
||||
all_completed_prompts = list(completed_prompts_set)
|
||||
total_reasoning_stats = {"total_assistant_turns": 0, "turns_with_reasoning": 0, "turns_without_reasoning": 0}
|
||||
|
||||
|
||||
for batch_result in results:
|
||||
# Add newly completed prompts
|
||||
all_completed_prompts.extend(batch_result.get("completed_prompts", []))
|
||||
|
||||
# Aggregate tool stats
|
||||
for tool_name, stats in batch_result.get("tool_stats", {}).items():
|
||||
if tool_name not in total_tool_stats:
|
||||
@@ -973,7 +977,7 @@ class BatchRunner:
|
||||
|
||||
# Save final checkpoint (best-effort; incremental writes already happened)
|
||||
try:
|
||||
checkpoint_data["completed_prompts"] = sorted(completed_prompts_set)
|
||||
checkpoint_data["completed_prompts"] = all_completed_prompts
|
||||
self._save_checkpoint(checkpoint_data, lock=checkpoint_lock)
|
||||
except Exception as ckpt_err:
|
||||
print(f"âš ï¸ Warning: Failed to save final checkpoint: {ckpt_err}")
|
||||
|
||||
@@ -790,16 +790,9 @@ code_execution:
|
||||
# Supports single tasks and batch mode (default 3 parallel, configurable).
|
||||
delegation:
|
||||
max_iterations: 50 # Max tool-calling turns per child (default: 50)
|
||||
# max_concurrent_children: 3 # Max parallel child agents per batch (default: 3, floor: 1, no ceiling).
|
||||
# WARNING: values above 10 multiply API cost linearly.
|
||||
# max_spawn_depth: 1 # Delegation tree depth cap (range: 1-3, default: 1 = flat).
|
||||
# Raise to 2 to allow workers to spawn their own subagents.
|
||||
# Requires role="orchestrator" on intermediate agents.
|
||||
# max_concurrent_children: 3 # Max parallel child agents (default: 3)
|
||||
# max_spawn_depth: 1 # Tree depth cap (1-3, default: 1 = flat). Raise to 2 or 3 to allow orchestrator children to spawn their own workers.
|
||||
# orchestrator_enabled: true # Kill switch for role="orchestrator" children (default: true).
|
||||
# subagent_auto_approve: false # When a subagent hits a dangerous-command approval prompt, auto-deny (default: false)
|
||||
# or auto-approve "once" (true) instead of blocking on stdin.
|
||||
# The parent TUI owns stdin, so blocking would deadlock; non-interactive resolution is required.
|
||||
# Both choices emit a logger.warning audit line. Flip to true only for cron/batch pipelines.
|
||||
# inherit_mcp_toolsets: true # When explicit child toolsets are narrowed, also keep the parent's MCP toolsets (default: true). Set false for strict intersection.
|
||||
# model: "google/gemini-3-flash-preview" # Override model for subagents (empty = inherit parent)
|
||||
# provider: "openrouter" # Override provider for subagents (empty = inherit parent)
|
||||
|
||||
@@ -3176,14 +3176,7 @@ class HermesCLI:
|
||||
# the configured model (e.g. "qwen3.6-plus"), causing 400 errors.
|
||||
runtime_model = runtime.get("model")
|
||||
if runtime_model and isinstance(runtime_model, str):
|
||||
# Only use runtime model if: model is unset, or model equals provider name
|
||||
should_use_runtime_model = (
|
||||
not self.model or # No model configured yet
|
||||
self.model == self.provider or # Model is the provider slug
|
||||
self.model == runtime.get("name") # Model matches provider display name
|
||||
)
|
||||
if should_use_runtime_model:
|
||||
self.model = runtime_model
|
||||
self.model = runtime_model
|
||||
|
||||
# If model is still empty (e.g. user ran `hermes auth add openai-codex`
|
||||
# without `hermes model`), fall back to the provider's first catalog
|
||||
@@ -4668,6 +4661,10 @@ class HermesCLI:
|
||||
def new_session(self, silent=False):
|
||||
"""Start a fresh session with a new session ID and cleared agent state."""
|
||||
if self.agent and self.conversation_history:
|
||||
try:
|
||||
self.agent.flush_memories(self.conversation_history)
|
||||
except (Exception, KeyboardInterrupt):
|
||||
pass
|
||||
# Trigger memory extraction on the old session before session_id rotates.
|
||||
self.agent.commit_memory_session(self.conversation_history)
|
||||
self._notify_session_boundary("on_session_finalize")
|
||||
@@ -5377,26 +5374,29 @@ class HermesCLI:
|
||||
_cprint(f" ✓ Model switched: {result.new_model}")
|
||||
_cprint(f" Provider: {provider_label}")
|
||||
|
||||
# Context: always resolve via the provider-aware chain so Codex OAuth,
|
||||
# Copilot, and Nous-enforced caps win over the raw models.dev entry
|
||||
# (e.g. gpt-5.5 is 1.05M on openai but 272K on Codex OAuth).
|
||||
# Rich metadata from models.dev
|
||||
mi = result.model_info
|
||||
from hermes_cli.model_switch import resolve_display_context_length
|
||||
ctx = resolve_display_context_length(
|
||||
result.new_model,
|
||||
result.target_provider,
|
||||
base_url=result.base_url or self.base_url or "",
|
||||
api_key=result.api_key or self.api_key or "",
|
||||
model_info=mi,
|
||||
)
|
||||
if ctx:
|
||||
_cprint(f" Context: {ctx:,} tokens")
|
||||
if mi:
|
||||
if mi.context_window:
|
||||
_cprint(f" Context: {mi.context_window:,} tokens")
|
||||
if mi.max_output:
|
||||
_cprint(f" Max output: {mi.max_output:,} tokens")
|
||||
if mi.has_cost_data():
|
||||
_cprint(f" Cost: {mi.format_cost()}")
|
||||
_cprint(f" Capabilities: {mi.format_capabilities()}")
|
||||
else:
|
||||
# Fallback to old context length lookup
|
||||
try:
|
||||
from agent.model_metadata import get_model_context_length
|
||||
ctx = get_model_context_length(
|
||||
result.new_model,
|
||||
base_url=result.base_url or self.base_url,
|
||||
api_key=result.api_key or self.api_key,
|
||||
provider=result.target_provider,
|
||||
)
|
||||
_cprint(f" Context: {ctx:,} tokens")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Cache notice
|
||||
cache_enabled = (
|
||||
@@ -6165,8 +6165,6 @@ class HermesCLI:
|
||||
self._handle_skin_command(cmd_original)
|
||||
elif canonical == "voice":
|
||||
self._handle_voice_command(cmd_original)
|
||||
elif canonical == "busy":
|
||||
self._handle_busy_command(cmd_original)
|
||||
else:
|
||||
# Check for user-defined quick commands (bypass agent loop, no LLM call)
|
||||
base_cmd = cmd_lower.split()[0]
|
||||
@@ -6903,36 +6901,6 @@ class HermesCLI:
|
||||
else:
|
||||
_cprint(f" {_ACCENT}✓ Reasoning effort set to '{arg}' (session only){_RST}")
|
||||
|
||||
def _handle_busy_command(self, cmd: str):
    """Handle /busy — choose what Enter does while Hermes is working.

    Usage:
    /busy Show current busy input mode
    /busy status Show current busy input mode
    /busy queue Queue input for the next turn instead of interrupting
    /busy interrupt Interrupt the current run on Enter (default)

    A valid mode is applied to this session immediately and then saved to
    config; the confirmation message says whether the save succeeded.
    """
    usage_line = f" {_DIM}Usage: /busy [queue|interrupt|status]{_RST}"

    tokens = cmd.strip().split(maxsplit=1)
    # A bare "/busy" behaves exactly like "/busy status".
    requested = tokens[1].strip().lower() if len(tokens) >= 2 else "status"

    if requested == "status":
        _cprint(f" {_ACCENT}Busy input mode: {self.busy_input_mode}{_RST}")
        if self.busy_input_mode == "queue":
            effect = "queues for next turn"
        else:
            effect = "interrupts current run"
        _cprint(f" {_DIM}Enter while busy: {effect}{_RST}")
        _cprint(usage_line)
        return

    arg = requested
    if arg != "queue" and arg != "interrupt":
        _cprint(f" {_DIM}(._.) Unknown argument: {arg}{_RST}")
        _cprint(usage_line)
        return

    self.busy_input_mode = arg
    saved = save_config_value("display.busy_input_mode", arg)
    if not saved:
        _cprint(f" {_ACCENT}✓ Busy input mode set to '{arg}' (session only){_RST}")
        return

    if arg == "queue":
        behavior = "Enter will queue follow-up input while Hermes is busy."
    else:
        behavior = "Enter will interrupt the current run while Hermes is busy."
    _cprint(f" {_ACCENT}✓ Busy input mode set to '{arg}' (saved to config){_RST}")
    _cprint(f" {_DIM}{behavior}{_RST}")
|
||||
|
||||
def _handle_fast_command(self, cmd: str):
|
||||
"""Handle /fast — toggle fast mode (OpenAI Priority Processing / Anthropic Fast Mode)."""
|
||||
if not self._fast_command_available():
|
||||
@@ -7011,52 +6979,51 @@ class HermesCLI:
|
||||
focus_topic = parts[1].strip()
|
||||
|
||||
original_count = len(self.conversation_history)
|
||||
with self._busy_command("Compressing context..."):
|
||||
try:
|
||||
from agent.model_metadata import estimate_messages_tokens_rough
|
||||
from agent.manual_compression_feedback import summarize_manual_compression
|
||||
original_history = list(self.conversation_history)
|
||||
approx_tokens = estimate_messages_tokens_rough(original_history)
|
||||
if focus_topic:
|
||||
print(f"🗜️ Compressing {original_count} messages (~{approx_tokens:,} tokens), "
|
||||
f"focus: \"{focus_topic}\"...")
|
||||
else:
|
||||
print(f"🗜️ Compressing {original_count} messages (~{approx_tokens:,} tokens)...")
|
||||
try:
|
||||
from agent.model_metadata import estimate_messages_tokens_rough
|
||||
from agent.manual_compression_feedback import summarize_manual_compression
|
||||
original_history = list(self.conversation_history)
|
||||
approx_tokens = estimate_messages_tokens_rough(original_history)
|
||||
if focus_topic:
|
||||
print(f"🗜️ Compressing {original_count} messages (~{approx_tokens:,} tokens), "
|
||||
f"focus: \"{focus_topic}\"...")
|
||||
else:
|
||||
print(f"🗜️ Compressing {original_count} messages (~{approx_tokens:,} tokens)...")
|
||||
|
||||
compressed, _ = self.agent._compress_context(
|
||||
original_history,
|
||||
self.agent._cached_system_prompt or "",
|
||||
approx_tokens=approx_tokens,
|
||||
focus_topic=focus_topic or None,
|
||||
)
|
||||
self.conversation_history = compressed
|
||||
# _compress_context ends the old session and creates a new child
|
||||
# session on the agent (run_agent.py::_compress_context). Sync the
|
||||
# CLI's session_id so /status, /resume, exit summary, and title
|
||||
# generation all point at the live continuation session, not the
|
||||
# ended parent. Without this, subsequent end_session() calls target
|
||||
# the already-closed parent and the child is orphaned.
|
||||
if (
|
||||
getattr(self.agent, "session_id", None)
|
||||
and self.agent.session_id != self.session_id
|
||||
):
|
||||
self.session_id = self.agent.session_id
|
||||
self._pending_title = None
|
||||
new_tokens = estimate_messages_tokens_rough(self.conversation_history)
|
||||
summary = summarize_manual_compression(
|
||||
original_history,
|
||||
self.conversation_history,
|
||||
approx_tokens,
|
||||
new_tokens,
|
||||
)
|
||||
icon = "🗜️" if summary["noop"] else "✅"
|
||||
print(f" {icon} {summary['headline']}")
|
||||
print(f" {summary['token_line']}")
|
||||
if summary["note"]:
|
||||
print(f" {summary['note']}")
|
||||
compressed, _ = self.agent._compress_context(
|
||||
original_history,
|
||||
self.agent._cached_system_prompt or "",
|
||||
approx_tokens=approx_tokens,
|
||||
focus_topic=focus_topic or None,
|
||||
)
|
||||
self.conversation_history = compressed
|
||||
# _compress_context ends the old session and creates a new child
|
||||
# session on the agent (run_agent.py::_compress_context). Sync the
|
||||
# CLI's session_id so /status, /resume, exit summary, and title
|
||||
# generation all point at the live continuation session, not the
|
||||
# ended parent. Without this, subsequent end_session() calls target
|
||||
# the already-closed parent and the child is orphaned.
|
||||
if (
|
||||
getattr(self.agent, "session_id", None)
|
||||
and self.agent.session_id != self.session_id
|
||||
):
|
||||
self.session_id = self.agent.session_id
|
||||
self._pending_title = None
|
||||
new_tokens = estimate_messages_tokens_rough(self.conversation_history)
|
||||
summary = summarize_manual_compression(
|
||||
original_history,
|
||||
self.conversation_history,
|
||||
approx_tokens,
|
||||
new_tokens,
|
||||
)
|
||||
icon = "🗜️" if summary["noop"] else "✅"
|
||||
print(f" {icon} {summary['headline']}")
|
||||
print(f" {summary['token_line']}")
|
||||
if summary["note"]:
|
||||
print(f" {summary['note']}")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ❌ Compression failed: {e}")
|
||||
except Exception as e:
|
||||
print(f" ❌ Compression failed: {e}")
|
||||
|
||||
def _handle_debug_command(self):
|
||||
"""Handle /debug — upload debug report + logs and print paste URLs."""
|
||||
@@ -9558,20 +9525,9 @@ class HermesCLI:
|
||||
|
||||
@kb.add('c-d')
|
||||
def handle_ctrl_d(event):
|
||||
"""Ctrl+D: delete char under cursor (standard readline behaviour).
|
||||
Only exit when the input is empty — same as bash/zsh. Pending
|
||||
attached images count as input and block the EOF-exit so the
|
||||
user doesn't lose them silently.
|
||||
"""
|
||||
buf = event.app.current_buffer
|
||||
if buf.text:
|
||||
buf.delete()
|
||||
elif self._attached_images:
|
||||
# Empty text but pending attachments — no-op, don't exit.
|
||||
return
|
||||
else:
|
||||
self._should_exit = True
|
||||
event.app.exit()
|
||||
"""Handle Ctrl+D - exit."""
|
||||
self._should_exit = True
|
||||
event.app.exit()
|
||||
|
||||
_modal_prompt_active = Condition(
|
||||
lambda: bool(self._secret_state or self._sudo_state)
|
||||
@@ -10784,6 +10740,12 @@ class HermesCLI:
|
||||
self.agent.interrupt()
|
||||
except Exception:
|
||||
pass
|
||||
# Flush memories before exit (only for substantial conversations)
|
||||
if self.agent and self.conversation_history:
|
||||
try:
|
||||
self.agent.flush_memories(self.conversation_history)
|
||||
except (Exception, KeyboardInterrupt):
|
||||
pass
|
||||
# Shut down voice recorder (release persistent audio stream)
|
||||
if hasattr(self, '_voice_recorder') and self._voice_recorder:
|
||||
try:
|
||||
|
||||
+1
-14
@@ -16,7 +16,7 @@ import uuid
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from hermes_constants import get_hermes_home
|
||||
from typing import Optional, Dict, List, Any, Union
|
||||
from typing import Optional, Dict, List, Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -417,7 +417,6 @@ def create_job(
|
||||
provider: Optional[str] = None,
|
||||
base_url: Optional[str] = None,
|
||||
script: Optional[str] = None,
|
||||
context_from: Optional[Union[str, List[str]]] = None,
|
||||
enabled_toolsets: Optional[List[str]] = None,
|
||||
workdir: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
@@ -439,9 +438,6 @@ def create_job(
|
||||
script: Optional path to a Python script whose stdout is injected into the
|
||||
prompt each run. The script runs before the agent turn, and its output
|
||||
is prepended as context. Useful for data collection / change detection.
|
||||
context_from: Optional job ID (or list of job IDs) whose most recent output
|
||||
is injected into the prompt as context before each run.
|
||||
Useful for chaining cron jobs: job A finds data, job B processes it.
|
||||
enabled_toolsets: Optional list of toolset names to restrict the agent to.
|
||||
When set, only tools from these toolsets are loaded, reducing
|
||||
token overhead. When omitted, all default tools are loaded.
|
||||
@@ -485,14 +481,6 @@ def create_job(
|
||||
normalized_toolsets = normalized_toolsets or None
|
||||
normalized_workdir = _normalize_workdir(workdir)
|
||||
|
||||
# Normalize context_from: accept str or list of str, store as list or None
|
||||
if isinstance(context_from, str):
|
||||
context_from = [context_from.strip()] if context_from.strip() else None
|
||||
elif isinstance(context_from, list):
|
||||
context_from = [str(j).strip() for j in context_from if str(j).strip()] or None
|
||||
else:
|
||||
context_from = None
|
||||
|
||||
label_source = (prompt or (normalized_skills[0] if normalized_skills else None)) or "cron job"
|
||||
job = {
|
||||
"id": job_id,
|
||||
@@ -504,7 +492,6 @@ def create_job(
|
||||
"provider": normalized_provider,
|
||||
"base_url": normalized_base_url,
|
||||
"script": normalized_script,
|
||||
"context_from": context_from,
|
||||
"schedule": parsed_schedule,
|
||||
"schedule_display": parsed_schedule.get("display", schedule),
|
||||
"repeat": {
|
||||
|
||||
@@ -671,47 +671,6 @@ def _build_job_prompt(job: dict, prerun_script: Optional[tuple] = None) -> str:
|
||||
f"{prompt}"
|
||||
)
|
||||
|
||||
# Inject output from referenced cron jobs as context.
|
||||
context_from = job.get("context_from")
|
||||
if context_from:
|
||||
from cron.jobs import OUTPUT_DIR
|
||||
if isinstance(context_from, str):
|
||||
context_from = [context_from]
|
||||
for source_job_id in context_from:
|
||||
# Guard against path traversal — valid job IDs are 12-char hex strings
|
||||
if not source_job_id or not all(c in "0123456789abcdef" for c in source_job_id):
|
||||
logger.warning("context_from: skipping invalid job_id %r", source_job_id)
|
||||
continue
|
||||
try:
|
||||
job_output_dir = OUTPUT_DIR / source_job_id
|
||||
if not job_output_dir.exists():
|
||||
continue # silent skip — no output yet
|
||||
output_files = sorted(
|
||||
job_output_dir.glob("*.md"),
|
||||
key=lambda f: f.stat().st_mtime,
|
||||
reverse=True,
|
||||
)
|
||||
if not output_files:
|
||||
continue # silent skip — no output yet
|
||||
latest_output = output_files[0].read_text(encoding="utf-8").strip()
|
||||
# Truncate to 8K characters to avoid prompt bloat
|
||||
_MAX_CONTEXT_CHARS = 8000
|
||||
if len(latest_output) > _MAX_CONTEXT_CHARS:
|
||||
latest_output = latest_output[:_MAX_CONTEXT_CHARS] + "\n\n[... output truncated ...]"
|
||||
if latest_output:
|
||||
prompt = (
|
||||
f"## Output from job '{source_job_id}'\n"
|
||||
"The following is the most recent output from a preceding "
|
||||
"cron job. Use it as context for your analysis.\n\n"
|
||||
f"```\n{latest_output}\n```\n\n"
|
||||
f"{prompt}"
|
||||
)
|
||||
else:
|
||||
continue # silent skip — empty output
|
||||
except (OSError, PermissionError) as e:
|
||||
logger.warning("context_from: failed to read output for job %r: %s", source_job_id, e)
|
||||
# silent skip — do not pollute the prompt with error messages
|
||||
|
||||
# Always prepend cron execution guidance so the agent knows how
|
||||
# delivery works and can suppress delivery when appropriate.
|
||||
cron_hint = (
|
||||
|
||||
+3
-8
@@ -135,7 +135,7 @@ class SessionResetPolicy:
|
||||
mode=mode if mode is not None else "both",
|
||||
at_hour=at_hour if at_hour is not None else 4,
|
||||
idle_minutes=idle_minutes if idle_minutes is not None else 1440,
|
||||
notify=_coerce_bool(notify, True),
|
||||
notify=notify if notify is not None else True,
|
||||
notify_exclude_platforms=tuple(exclude) if exclude is not None else ("api_server", "webhook"),
|
||||
)
|
||||
|
||||
@@ -178,7 +178,7 @@ class PlatformConfig:
|
||||
home_channel = HomeChannel.from_dict(data["home_channel"])
|
||||
|
||||
return cls(
|
||||
enabled=_coerce_bool(data.get("enabled"), False),
|
||||
enabled=data.get("enabled", False),
|
||||
token=data.get("token"),
|
||||
api_key=data.get("api_key"),
|
||||
home_channel=home_channel,
|
||||
@@ -435,7 +435,7 @@ class GatewayConfig:
|
||||
reset_triggers=data.get("reset_triggers", ["/new", "/reset"]),
|
||||
quick_commands=quick_commands,
|
||||
sessions_dir=sessions_dir,
|
||||
always_log_local=_coerce_bool(data.get("always_log_local"), True),
|
||||
always_log_local=data.get("always_log_local", True),
|
||||
stt_enabled=_coerce_bool(stt_enabled, True),
|
||||
group_sessions_per_user=_coerce_bool(group_sessions_per_user, True),
|
||||
thread_sessions_per_user=_coerce_bool(thread_sessions_per_user, False),
|
||||
@@ -687,11 +687,6 @@ def load_gateway_config() -> GatewayConfig:
|
||||
os.environ["TELEGRAM_REACTIONS"] = str(telegram_cfg["reactions"]).lower()
|
||||
if "proxy_url" in telegram_cfg and not os.getenv("TELEGRAM_PROXY"):
|
||||
os.environ["TELEGRAM_PROXY"] = str(telegram_cfg["proxy_url"]).strip()
|
||||
if "group_allowed_chats" in telegram_cfg and not os.getenv("TELEGRAM_GROUP_ALLOWED_USERS"):
|
||||
gac = telegram_cfg["group_allowed_chats"]
|
||||
if isinstance(gac, list):
|
||||
gac = ",".join(str(v) for v in gac)
|
||||
os.environ["TELEGRAM_GROUP_ALLOWED_USERS"] = str(gac)
|
||||
if "disable_link_previews" in telegram_cfg:
|
||||
plat_data = platforms_data.setdefault(Platform.TELEGRAM.value, {})
|
||||
if not isinstance(plat_data, dict):
|
||||
|
||||
+22
-101
@@ -1204,12 +1204,10 @@ class APIServerAdapter(BasePlatformAdapter):
|
||||
|
||||
If the client disconnects mid-stream, ``agent.interrupt()`` is
|
||||
called so the agent stops issuing upstream LLM calls, then the
|
||||
asyncio task is cancelled. When ``store=True`` an initial
|
||||
``in_progress`` snapshot is persisted immediately after
|
||||
``response.created`` and disconnects update it to an
|
||||
``incomplete`` snapshot so GET /v1/responses/{id} and
|
||||
``previous_response_id`` chaining still have something to
|
||||
recover from.
|
||||
asyncio task is cancelled. When ``store=True`` the full response
|
||||
is persisted to the ResponseStore in a ``finally`` block so GET
|
||||
/v1/responses/{id} and ``previous_response_id`` chaining work the
|
||||
same as the batch path.
|
||||
"""
|
||||
import queue as _q
|
||||
|
||||
@@ -1271,60 +1269,6 @@ class APIServerAdapter(BasePlatformAdapter):
|
||||
final_response_text = ""
|
||||
agent_error: Optional[str] = None
|
||||
usage: Dict[str, int] = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
|
||||
terminal_snapshot_persisted = False
|
||||
|
||||
def _persist_response_snapshot(
|
||||
response_env: Dict[str, Any],
|
||||
*,
|
||||
conversation_history_snapshot: Optional[List[Dict[str, Any]]] = None,
|
||||
) -> None:
|
||||
if not store:
|
||||
return
|
||||
if conversation_history_snapshot is None:
|
||||
conversation_history_snapshot = list(conversation_history)
|
||||
conversation_history_snapshot.append({"role": "user", "content": user_message})
|
||||
self._response_store.put(response_id, {
|
||||
"response": response_env,
|
||||
"conversation_history": conversation_history_snapshot,
|
||||
"instructions": instructions,
|
||||
"session_id": session_id,
|
||||
})
|
||||
if conversation:
|
||||
self._response_store.set_conversation(conversation, response_id)
|
||||
|
||||
def _persist_incomplete_if_needed() -> None:
|
||||
"""Persist an ``incomplete`` snapshot if no terminal one was written.
|
||||
|
||||
Called from both the client-disconnect (``ConnectionResetError``)
|
||||
and server-cancellation (``asyncio.CancelledError``) paths so
|
||||
GET /v1/responses/{id} and ``previous_response_id`` chaining keep
|
||||
working after abrupt stream termination.
|
||||
"""
|
||||
if not store or terminal_snapshot_persisted:
|
||||
return
|
||||
incomplete_text = "".join(final_text_parts) or final_response_text
|
||||
incomplete_items: List[Dict[str, Any]] = list(emitted_items)
|
||||
if incomplete_text:
|
||||
incomplete_items.append({
|
||||
"type": "message",
|
||||
"role": "assistant",
|
||||
"content": [{"type": "output_text", "text": incomplete_text}],
|
||||
})
|
||||
incomplete_env = _envelope("incomplete")
|
||||
incomplete_env["output"] = incomplete_items
|
||||
incomplete_env["usage"] = {
|
||||
"input_tokens": usage.get("input_tokens", 0),
|
||||
"output_tokens": usage.get("output_tokens", 0),
|
||||
"total_tokens": usage.get("total_tokens", 0),
|
||||
}
|
||||
incomplete_history = list(conversation_history)
|
||||
incomplete_history.append({"role": "user", "content": user_message})
|
||||
if incomplete_text:
|
||||
incomplete_history.append({"role": "assistant", "content": incomplete_text})
|
||||
_persist_response_snapshot(
|
||||
incomplete_env,
|
||||
conversation_history_snapshot=incomplete_history,
|
||||
)
|
||||
|
||||
try:
|
||||
# response.created — initial envelope, status=in_progress
|
||||
@@ -1334,7 +1278,6 @@ class APIServerAdapter(BasePlatformAdapter):
|
||||
"type": "response.created",
|
||||
"response": created_env,
|
||||
})
|
||||
_persist_response_snapshot(created_env)
|
||||
last_activity = time.monotonic()
|
||||
|
||||
async def _open_message_item() -> None:
|
||||
@@ -1591,18 +1534,6 @@ class APIServerAdapter(BasePlatformAdapter):
|
||||
"output_tokens": usage.get("output_tokens", 0),
|
||||
"total_tokens": usage.get("total_tokens", 0),
|
||||
}
|
||||
_failed_history = list(conversation_history)
|
||||
_failed_history.append({"role": "user", "content": user_message})
|
||||
if final_response_text or agent_error:
|
||||
_failed_history.append({
|
||||
"role": "assistant",
|
||||
"content": final_response_text or agent_error,
|
||||
})
|
||||
_persist_response_snapshot(
|
||||
failed_env,
|
||||
conversation_history_snapshot=_failed_history,
|
||||
)
|
||||
terminal_snapshot_persisted = True
|
||||
await _write_event("response.failed", {
|
||||
"type": "response.failed",
|
||||
"response": failed_env,
|
||||
@@ -1615,24 +1546,30 @@ class APIServerAdapter(BasePlatformAdapter):
|
||||
"output_tokens": usage.get("output_tokens", 0),
|
||||
"total_tokens": usage.get("total_tokens", 0),
|
||||
}
|
||||
full_history = list(conversation_history)
|
||||
full_history.append({"role": "user", "content": user_message})
|
||||
if isinstance(result, dict) and result.get("messages"):
|
||||
full_history.extend(result["messages"])
|
||||
else:
|
||||
full_history.append({"role": "assistant", "content": final_response_text})
|
||||
_persist_response_snapshot(
|
||||
completed_env,
|
||||
conversation_history_snapshot=full_history,
|
||||
)
|
||||
terminal_snapshot_persisted = True
|
||||
await _write_event("response.completed", {
|
||||
"type": "response.completed",
|
||||
"response": completed_env,
|
||||
})
|
||||
|
||||
# Persist for future chaining / GET retrieval, mirroring
|
||||
# the batch path behavior.
|
||||
if store:
|
||||
full_history = list(conversation_history)
|
||||
full_history.append({"role": "user", "content": user_message})
|
||||
if isinstance(result, dict) and result.get("messages"):
|
||||
full_history.extend(result["messages"])
|
||||
else:
|
||||
full_history.append({"role": "assistant", "content": final_response_text})
|
||||
self._response_store.put(response_id, {
|
||||
"response": completed_env,
|
||||
"conversation_history": full_history,
|
||||
"instructions": instructions,
|
||||
"session_id": session_id,
|
||||
})
|
||||
if conversation:
|
||||
self._response_store.set_conversation(conversation, response_id)
|
||||
|
||||
except (ConnectionResetError, ConnectionAbortedError, BrokenPipeError, OSError):
|
||||
_persist_incomplete_if_needed()
|
||||
# Client disconnected — interrupt the agent so it stops
|
||||
# making upstream LLM calls, then cancel the task.
|
||||
agent = agent_ref[0] if agent_ref else None
|
||||
@@ -1648,22 +1585,6 @@ class APIServerAdapter(BasePlatformAdapter):
|
||||
except (asyncio.CancelledError, Exception):
|
||||
pass
|
||||
logger.info("SSE client disconnected; interrupted agent task %s", response_id)
|
||||
except asyncio.CancelledError:
|
||||
# Server-side cancellation (e.g. shutdown, request timeout) —
|
||||
# persist an incomplete snapshot so GET /v1/responses/{id} and
|
||||
# previous_response_id chaining still work, then re-raise so the
|
||||
# runtime's cancellation semantics are respected.
|
||||
_persist_incomplete_if_needed()
|
||||
agent = agent_ref[0] if agent_ref else None
|
||||
if agent is not None:
|
||||
try:
|
||||
agent.interrupt("SSE task cancelled")
|
||||
except Exception:
|
||||
pass
|
||||
if not agent_task.done():
|
||||
agent_task.cancel()
|
||||
logger.info("SSE task cancelled; persisted incomplete snapshot for %s", response_id)
|
||||
raise
|
||||
|
||||
return response
|
||||
|
||||
|
||||
+3
-112
@@ -148,102 +148,7 @@ def _detect_macos_system_proxy() -> str | None:
|
||||
return None
|
||||
|
||||
|
||||
def _split_host_port(value: str) -> tuple[str, int | None]:
    """Split a host, ``host:port`` pair, or URL into ``(host, port)``.

    The returned host is lower-cased and stripped of a trailing dot (and of
    IPv6 brackets).  ``port`` is ``None`` when no numeric port is present.

    Malformed input never raises: a URL that ``urlsplit`` rejects yields
    ``("", None)``, and a non-numeric or out-of-range URL port is dropped.
    """
    raw = str(value or "").strip()
    if not raw:
        return "", None

    if "://" in raw:
        # Full URL.  urlsplit() can reject the netloc (e.g. unbalanced IPv6
        # brackets) and the ``.port`` property rejects non-numeric or
        # out-of-range ports -- both signal this with ValueError.
        try:
            parsed = urlsplit(raw)
        except ValueError:
            return "", None
        try:
            port = parsed.port
        except ValueError:
            port = None
        return (parsed.hostname or "").lower().rstrip("."), port

    if raw.startswith("[") and "]" in raw:
        # Bracketed IPv6 literal, optionally followed by ":port".
        host, _, rest = raw[1:].partition("]")
        port = None
        # isdecimal() rather than isdigit(): isdigit() also accepts
        # characters such as superscripts that int() cannot convert.
        if rest.startswith(":") and rest[1:].isdecimal():
            port = int(rest[1:])
        return host.lower().rstrip("."), port

    if raw.count(":") == 1:
        # Exactly one colon means "host:port"; more colons means a bare
        # (unbracketed) IPv6 address, which is handled by the fallthrough.
        host, _, maybe_port = raw.rpartition(":")
        if maybe_port.isdecimal():
            return host.lower().rstrip("."), int(maybe_port)

    return raw.lower().strip("[]").rstrip("."), None
|
||||
|
||||
|
||||
def _no_proxy_entries() -> list[str]:
    """Return the entries listed in ``NO_PROXY`` followed by ``no_proxy``.

    Each variable is split on commas; entries are whitespace-stripped and
    empty ones are discarded.  Duplicates are not removed.
    """
    entries: list[str] = []
    for key in ("NO_PROXY", "no_proxy"):
        for part in os.environ.get(key, "").split(","):
            cleaned = part.strip()
            if cleaned:
                entries.append(cleaned)
    return entries
|
||||
|
||||
|
||||
def _no_proxy_entry_matches(entry: str, host: str, port: int | None = None) -> bool:
    """Return True when one NO_PROXY entry covers ``host`` (and ``port``).

    ``entry`` may be ``*``, a CIDR range or IP literal, a ``*.suffix`` or
    ``.suffix`` pattern, or a plain domain (which also matches its
    subdomains).  An entry may carry a ``:port``; such an entry only matches
    a target on exactly that port.

    ``host`` is compared as given, so callers should pass it lower-cased and
    without a trailing dot.
    """
    token = str(entry or "").strip().lower()
    if not token:
        return False
    if token == "*":
        return True

    token_host, token_port = _split_host_port(token)
    # A port-qualified entry applies only to that port; a target without a
    # known port (port is None) therefore never matches it.
    if token_port is not None and token_port != port:
        return False
    if not token_host:
        return False

    # CIDR range or IP literal.  With strict=False a bare address parses as a
    # single-address network.  Once the entry is IP-based, a hostname target
    # cannot match it, so the answer is final either way.
    try:
        network = ipaddress.ip_network(token_host, strict=False)
    except ValueError:
        network = None
    if network is not None:
        try:
            return ipaddress.ip_address(host) in network
        except ValueError:
            return False

    # Exact IP literal that ip_network() rejected but ip_address() accepts.
    # NOTE(review): presumably rare -- kept so behaviour matches the
    # previous implementation.
    try:
        token_ip = ipaddress.ip_address(token_host)
    except ValueError:
        token_ip = None
    if token_ip is not None:
        try:
            return ipaddress.ip_address(host) == token_ip
        except ValueError:
            return False

    if token_host.startswith("*."):
        # "*.example.com" -> match anything ending in ".example.com".
        return host.endswith(token_host[1:])
    if token_host.startswith("."):
        # ".example.com" -> the bare domain or any subdomain.
        return host == token_host[1:] or host.endswith(token_host)
    return host == token_host or host.endswith(f".{token_host}")
|
||||
|
||||
|
||||
def should_bypass_proxy(target_hosts: str | list[str] | tuple[str, ...] | set[str] | None) -> bool:
    """Return True when NO_PROXY/no_proxy matches at least one target host.

    Supports exact hosts, domain suffixes, wildcard suffixes, IP literals,
    CIDR ranges, optional host:port entries, and ``*``.

    ``target_hosts`` may be a single host string or a collection of them;
    each is parsed with ``_split_host_port`` and targets that yield an empty
    host are ignored.  Returns False when no NO_PROXY entries are configured
    or ``target_hosts`` is empty/None.
    """
    entries = _no_proxy_entries()
    if not entries or not target_hosts:
        return False

    # A lone string is one candidate, not an iterable of characters.
    candidates = [target_hosts] if isinstance(target_hosts, str) else list(target_hosts)

    for candidate in candidates:
        host, port = _split_host_port(str(candidate))
        if not host:
            continue
        if any(_no_proxy_entry_matches(entry, host, port) for entry in entries):
            return True
    return False
|
||||
|
||||
|
||||
def resolve_proxy_url(
|
||||
platform_env_var: str | None = None,
|
||||
*,
|
||||
target_hosts: str | list[str] | tuple[str, ...] | set[str] | None = None,
|
||||
) -> str | None:
|
||||
def resolve_proxy_url(platform_env_var: str | None = None) -> str | None:
|
||||
"""Return a proxy URL from env vars, or macOS system proxy.
|
||||
|
||||
Check order:
|
||||
@@ -251,26 +156,18 @@ def resolve_proxy_url(
|
||||
1. HTTPS_PROXY / HTTP_PROXY / ALL_PROXY (and lowercase variants)
|
||||
2. macOS system proxy via ``scutil --proxy`` (auto-detect)
|
||||
|
||||
Returns *None* if no proxy is found, or if NO_PROXY/no_proxy matches one
|
||||
of ``target_hosts``.
|
||||
Returns *None* if no proxy is found.
|
||||
"""
|
||||
if platform_env_var:
|
||||
value = (os.environ.get(platform_env_var) or "").strip()
|
||||
if value:
|
||||
if should_bypass_proxy(target_hosts):
|
||||
return None
|
||||
return normalize_proxy_url(value)
|
||||
for key in ("HTTPS_PROXY", "HTTP_PROXY", "ALL_PROXY",
|
||||
"https_proxy", "http_proxy", "all_proxy"):
|
||||
value = (os.environ.get(key) or "").strip()
|
||||
if value:
|
||||
if should_bypass_proxy(target_hosts):
|
||||
return None
|
||||
return normalize_proxy_url(value)
|
||||
detected = normalize_proxy_url(_detect_macos_system_proxy())
|
||||
if detected and should_bypass_proxy(target_hosts):
|
||||
return None
|
||||
return detected
|
||||
return normalize_proxy_url(_detect_macos_system_proxy())
|
||||
|
||||
|
||||
def proxy_kwargs_for_bot(proxy_url: str | None) -> dict:
|
||||
@@ -2543,9 +2440,6 @@ class BasePlatformAdapter(ABC):
|
||||
user_id_alt: Optional[str] = None,
|
||||
chat_id_alt: Optional[str] = None,
|
||||
is_bot: bool = False,
|
||||
guild_id: Optional[str] = None,
|
||||
parent_chat_id: Optional[str] = None,
|
||||
message_id: Optional[str] = None,
|
||||
) -> SessionSource:
|
||||
"""Helper to build a SessionSource for this platform."""
|
||||
# Normalize empty topic to None
|
||||
@@ -2563,9 +2457,6 @@ class BasePlatformAdapter(ABC):
|
||||
user_id_alt=user_id_alt,
|
||||
chat_id_alt=chat_id_alt,
|
||||
is_bot=is_bot,
|
||||
guild_id=str(guild_id) if guild_id else None,
|
||||
parent_chat_id=str(parent_chat_id) if parent_chat_id else None,
|
||||
message_id=str(message_id) if message_id else None,
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
|
||||
@@ -99,7 +99,6 @@ def _normalize_server_url(raw: str) -> str:
|
||||
|
||||
class BlueBubblesAdapter(BasePlatformAdapter):
|
||||
platform = Platform.BLUEBUBBLES
|
||||
SUPPORTS_MESSAGE_EDITING = False
|
||||
MAX_MESSAGE_LENGTH = MAX_TEXT_LENGTH
|
||||
|
||||
def __init__(self, config: PlatformConfig):
|
||||
@@ -392,13 +391,6 @@ class BlueBubblesAdapter(BasePlatformAdapter):
|
||||
# Text sending
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
def truncate_message(content: str, max_length: int = MAX_TEXT_LENGTH) -> List[str]:
    """Split ``content`` into chunks without trailing "(n/m)" page markers.

    Delegates the splitting to ``BasePlatformAdapter.truncate_message`` and
    then removes a trailing ``(<digits>/<digits>)`` suffix, together with any
    whitespace before it, from each chunk.
    """
    # Use the base splitter but skip pagination indicators — iMessage
    # bubbles flow naturally without "(1/3)" suffixes.
    chunks = BasePlatformAdapter.truncate_message(content, max_length)
    return [re.sub(r"\s*\(\d+/\d+\)$", "", c) for c in chunks]
|
||||
|
||||
async def send(
|
||||
self,
|
||||
chat_id: str,
|
||||
@@ -406,19 +398,10 @@ class BlueBubblesAdapter(BasePlatformAdapter):
|
||||
reply_to: Optional[str] = None,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
) -> SendResult:
|
||||
text = self.format_message(content)
|
||||
text = strip_markdown(content or "")
|
||||
if not text:
|
||||
return SendResult(success=False, error="BlueBubbles send requires text")
|
||||
# Split on paragraph breaks first (double newlines) so each thought
|
||||
# becomes its own iMessage bubble, then truncate any that are still
|
||||
# too long.
|
||||
paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
|
||||
chunks: List[str] = []
|
||||
for para in (paragraphs or [text]):
|
||||
if len(para) <= self.MAX_MESSAGE_LENGTH:
|
||||
chunks.append(para)
|
||||
else:
|
||||
chunks.extend(self.truncate_message(para, max_length=self.MAX_MESSAGE_LENGTH))
|
||||
chunks = self.truncate_message(text, max_length=self.MAX_MESSAGE_LENGTH)
|
||||
last = SendResult(success=True)
|
||||
for chunk in chunks:
|
||||
guid = await self._resolve_chat_guid(chat_id)
|
||||
|
||||
@@ -3261,7 +3261,6 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
if auto_thread and not skip_thread and not is_voice_linked_channel and not is_reply_message:
|
||||
thread = await self._auto_create_thread(message)
|
||||
if thread:
|
||||
parent_channel_id = str(message.channel.id)
|
||||
is_thread = True
|
||||
thread_id = str(thread.id)
|
||||
auto_threaded_channel = thread
|
||||
@@ -3321,9 +3320,6 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||
thread_id=thread_id,
|
||||
chat_topic=chat_topic,
|
||||
is_bot=getattr(message.author, "bot", False),
|
||||
guild_id=str(message.guild.id) if message.guild else None,
|
||||
parent_chat_id=parent_channel_id,
|
||||
message_id=str(message.id),
|
||||
)
|
||||
|
||||
# Build media URLs -- download image attachments to local cache so the
|
||||
|
||||
@@ -532,20 +532,6 @@ class MatrixAdapter(BasePlatformAdapter):
|
||||
)
|
||||
await crypto_store.open()
|
||||
|
||||
# Bind the store to the runtime device_id before any
|
||||
# put_account() runs. PgCryptoStore defaults _device_id
|
||||
# to "" and its crypto_account UPSERT never updates the
|
||||
# device_id column on conflict — so once put_account
|
||||
# writes blank, it stays blank forever. That breaks
|
||||
# every downstream device-scoped olm operation: peer
|
||||
# to-device ciphertext can't find our identity key and
|
||||
# no megolm sessions ever land. Setting _device_id here
|
||||
# (in-memory; the on-disk row may not exist yet) makes
|
||||
# the first put_account write the correct value.
|
||||
# DeviceID is a NewType(str) so plain str works at runtime.
|
||||
if client.device_id:
|
||||
await crypto_store.put_device_id(client.device_id)
|
||||
|
||||
crypto_state = _CryptoStateStore(state_store, self._joined_rooms)
|
||||
olm = OlmMachine(client, crypto_store, crypto_state)
|
||||
|
||||
|
||||
@@ -703,6 +703,7 @@ class TelegramAdapter(BasePlatformAdapter):
|
||||
"write_timeout": _env_float("HERMES_TELEGRAM_HTTP_WRITE_TIMEOUT", 20.0),
|
||||
}
|
||||
|
||||
proxy_url = resolve_proxy_url("TELEGRAM_PROXY")
|
||||
disable_fallback = (os.getenv("HERMES_TELEGRAM_DISABLE_FALLBACK_IPS", "").strip().lower() in ("1", "true", "yes", "on"))
|
||||
fallback_ips = self._fallback_ips()
|
||||
if not fallback_ips:
|
||||
@@ -713,8 +714,6 @@ class TelegramAdapter(BasePlatformAdapter):
|
||||
", ".join(fallback_ips),
|
||||
)
|
||||
|
||||
proxy_targets = ["api.telegram.org", *fallback_ips]
|
||||
proxy_url = resolve_proxy_url("TELEGRAM_PROXY", target_hosts=proxy_targets)
|
||||
if fallback_ips and not proxy_url and not disable_fallback:
|
||||
logger.info(
|
||||
"[%s] Telegram fallback IPs active: %s",
|
||||
|
||||
@@ -43,10 +43,10 @@ _DOH_PROVIDERS: list[dict] = [
|
||||
_SEED_FALLBACK_IPS: list[str] = ["149.154.167.220"]
|
||||
|
||||
|
||||
def _resolve_proxy_url(target_hosts=None) -> str | None:
|
||||
def _resolve_proxy_url() -> str | None:
|
||||
# Delegate to shared implementation (env vars + macOS system proxy detection)
|
||||
from gateway.platforms.base import resolve_proxy_url
|
||||
return resolve_proxy_url("TELEGRAM_PROXY", target_hosts=target_hosts)
|
||||
return resolve_proxy_url("TELEGRAM_PROXY")
|
||||
|
||||
|
||||
class TelegramFallbackTransport(httpx.AsyncBaseTransport):
|
||||
@@ -60,7 +60,7 @@ class TelegramFallbackTransport(httpx.AsyncBaseTransport):
|
||||
|
||||
def __init__(self, fallback_ips: Iterable[str], **transport_kwargs):
|
||||
self._fallback_ips = [ip for ip in dict.fromkeys(_normalize_fallback_ips(fallback_ips))]
|
||||
proxy_url = _resolve_proxy_url(target_hosts=[_TELEGRAM_API_HOST, *self._fallback_ips])
|
||||
proxy_url = _resolve_proxy_url()
|
||||
if proxy_url and "proxy" not in transport_kwargs:
|
||||
transport_kwargs["proxy"] = proxy_url
|
||||
self._primary = httpx.AsyncHTTPTransport(**transport_kwargs)
|
||||
|
||||
+267
-106
@@ -298,16 +298,50 @@ from gateway.restart import (
|
||||
)
|
||||
|
||||
|
||||
from gateway.whatsapp_identity import (
|
||||
canonical_whatsapp_identifier as _canonical_whatsapp_identifier, # noqa: F401
|
||||
expand_whatsapp_aliases as _expand_whatsapp_auth_aliases,
|
||||
normalize_whatsapp_identifier as _normalize_whatsapp_identifier,
|
||||
)
|
||||
def _normalize_whatsapp_identifier(value: str) -> str:
    """Strip WhatsApp JID/LID syntax down to its stable numeric identifier.

    The value is stringified and whitespace-stripped, its first ``+`` is
    removed, and everything from the first ``:`` or ``@`` onward is dropped.
    Falsy input yields an empty string.
    """
    text = str(value or "").strip()
    text = text.replace("+", "", 1)
    # Drop whatever follows the first ":" first, then whatever follows
    # the first "@".
    before_colon = text.split(":", 1)[0]
    return before_colon.split("@", 1)[0]
|
||||
|
||||
|
||||
def _expand_whatsapp_auth_aliases(identifier: str) -> set:
    """Resolve WhatsApp phone/LID aliases using bridge session mapping files.

    Starting from the normalized ``identifier``, follows
    ``lid-mapping-<id>.json`` and ``lid-mapping-<id>_reverse.json`` files in
    the ``whatsapp/session`` directory under ``_hermes_home`` transitively.

    Returns the set of every identifier reached, including the starting one,
    or an empty set when ``identifier`` normalizes to an empty string.
    """
    normalized = _normalize_whatsapp_identifier(identifier)
    if not normalized:
        return set()

    session_dir = _hermes_home / "whatsapp" / "session"
    resolved = set()
    # Work list of identifiers still to expand.  Visit order does not affect
    # the resulting set, so pop from the end (O(1)) rather than the front.
    pending = [normalized]

    while pending:
        current = pending.pop()
        if not current or current in resolved:
            continue

        resolved.add(current)
        for suffix in ("", "_reverse"):
            mapping_path = session_dir / f"lid-mapping-{current}{suffix}.json"
            if not mapping_path.exists():
                continue
            try:
                # The decoded JSON value is stringified and normalized like
                # any other identifier.
                mapped = _normalize_whatsapp_identifier(
                    json.loads(mapping_path.read_text(encoding="utf-8"))
                )
            except Exception:
                # Deliberately best-effort: an unreadable or malformed
                # mapping file is skipped rather than failing the lookup.
                continue
            if mapped and mapped not in resolved:
                pending.append(mapped)

    return resolved
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Sentinel placed into _running_agents immediately when a session starts
|
||||
# processing, *before* any await. Prevents a second message for the same
|
||||
# session from bypassing the "already running" guard during the async gap
|
||||
@@ -524,7 +558,7 @@ def _load_gateway_config() -> dict:
|
||||
def _resolve_gateway_model(config: dict | None = None) -> str:
|
||||
"""Read model from config.yaml — single source of truth.
|
||||
|
||||
Without this, temporary AIAgent instances (e.g. /compress) fall
|
||||
Without this, temporary AIAgent instances (memory flush, /compress) fall
|
||||
back to the hardcoded default which fails when the active provider is
|
||||
openai-codex.
|
||||
"""
|
||||
@@ -915,6 +949,129 @@ class GatewayRunner:
|
||||
e,
|
||||
)
|
||||
|
||||
# -----------------------------------------------------------------
|
||||
|
||||
def _flush_memories_for_session(
    self,
    old_session_id: str,
    session_key: Optional[str] = None,
):
    """Prompt the agent to save memories/skills before context is lost.

    Synchronous worker — meant to be called via run_in_executor from
    an async context so it doesn't block the event loop.

    Loads the transcript for ``old_session_id``, replays its user and
    assistant turns into a temporary ``AIAgent`` restricted to the
    ``memory`` and ``skills`` toolsets, and asks it to persist anything
    worth keeping. Nothing happens for cron sessions, for transcripts with
    fewer than four entries, or when no API key is resolved. Failures
    inside the flush are logged at debug level rather than raised.

    Args:
        old_session_id: ID of the session whose transcript is reviewed.
        session_key: Optional session key forwarded to
            ``_resolve_session_agent_runtime`` to pick the model/runtime.
    """
    # Skip cron sessions — they run headless with no meaningful user
    # conversation to extract memories from.
    if old_session_id and old_session_id.startswith("cron_"):
        logger.debug("Skipping memory flush for cron session: %s", old_session_id)
        return

    try:
        history = self.session_store.load_transcript(old_session_id)
        if not history or len(history) < 4:
            return

        from run_agent import AIAgent
        model, runtime_kwargs = self._resolve_session_agent_runtime(
            session_key=session_key,
        )
        if not runtime_kwargs.get("api_key"):
            return

        tmp_agent = AIAgent(
            **runtime_kwargs,
            model=model,
            max_iterations=8,
            quiet_mode=True,
            skip_memory=True,  # Flush agent — no memory provider
            enabled_toolsets=["memory", "skills"],
            session_id=old_session_id,
        )
        try:
            # Fully silence the flush agent — quiet_mode only suppresses init
            # messages; tool call output still leaks to the terminal through
            # _safe_print → _print_fn. Set a no-op to prevent that.
            tmp_agent._print_fn = lambda *a, **kw: None

            # Build conversation history from transcript
            msgs = [
                {"role": m.get("role"), "content": m.get("content")}
                for m in history
                if m.get("role") in ("user", "assistant") and m.get("content")
            ]

            # Read live memory state from disk so the flush agent can see
            # what's already saved and avoid overwriting newer entries.
            memory_sections = []
            try:
                from tools.memory_tool import get_memory_dir
                _mem_dir = get_memory_dir()
                for fname, label in [
                    ("MEMORY.md", "MEMORY (your personal notes)"),
                    ("USER.md", "USER PROFILE (who the user is)"),
                ]:
                    fpath = _mem_dir / fname
                    if fpath.exists():
                        content = fpath.read_text(encoding="utf-8").strip()
                        if content:
                            memory_sections.append(
                                f"\n\n## Current {label}:\n{content}"
                            )
            except Exception as mem_err:
                # Non-fatal — flush still works, just without the guard.
                # Sections gathered before the failure are kept.
                logger.debug(
                    "Could not read live memory state before flush of session %s: %s",
                    old_session_id, mem_err,
                )
            _current_memory = "".join(memory_sections)

            # Give the agent a real turn to think about what to save
            flush_prompt = (
                "[System: This session is about to be automatically reset due to "
                "inactivity or a scheduled daily reset. The conversation context "
                "will be cleared after this turn.\n\n"
                "Review the conversation above and:\n"
                "1. Save any important facts, preferences, or decisions to memory "
                "(user profile or your notes) that would be useful in future sessions.\n"
                "2. If you discovered a reusable workflow or solved a non-trivial "
                "problem, consider saving it as a skill.\n"
                "3. If nothing is worth saving, that's fine — just skip.\n\n"
            )

            if _current_memory:
                flush_prompt += (
                    "IMPORTANT — here is the current live state of memory. Other "
                    "sessions, cron jobs, or the user may have updated it since this "
                    "conversation ended. Do NOT overwrite or remove entries unless "
                    "the conversation above reveals something that genuinely "
                    "supersedes them. Only add new information that is not already "
                    "captured below."
                    f"{_current_memory}\n\n"
                )

            flush_prompt += (
                "Do NOT respond to the user. Just use the memory and skill_manage "
                "tools if needed, then stop.]"
            )

            tmp_agent.run_conversation(
                user_message=flush_prompt,
                conversation_history=msgs,
            )
        finally:
            # Always release the temporary agent's resources, even when the
            # conversation run raises.
            self._cleanup_agent_resources(tmp_agent)
        logger.info("Pre-reset memory flush completed for session %s", old_session_id)
    except Exception as e:
        logger.debug("Pre-reset memory flush failed for session %s: %s", old_session_id, e)
|
||||
|
||||
async def _async_flush_memories(
    self,
    old_session_id: str,
    session_key: Optional[str] = None,
):
    """Await the blocking memory flush without stalling the event loop.

    Hands ``self._flush_memories_for_session(old_session_id, session_key)``
    to the running loop's default executor and waits for it to finish.
    """
    running_loop = asyncio.get_running_loop()
    flush_future = running_loop.run_in_executor(
        None,  # default executor of the running loop
        self._flush_memories_for_session,
        old_session_id,
        session_key,
    )
    await flush_future
|
||||
|
||||
@property
|
||||
def should_exit_cleanly(self) -> bool:
|
||||
return self._exit_cleanly
|
||||
@@ -980,7 +1137,7 @@ class GatewayRunner:
|
||||
if override_runtime.get("api_key"):
|
||||
logger.debug(
|
||||
"Session model override (fast): session=%s config_model=%s -> override_model=%s provider=%s",
|
||||
resolved_session_key or "", model, override_model,
|
||||
(resolved_session_key or "")[:30], model, override_model,
|
||||
override_runtime.get("provider"),
|
||||
)
|
||||
return override_model, override_runtime
|
||||
@@ -988,12 +1145,12 @@ class GatewayRunner:
|
||||
# resolution and apply model/provider from the override on top.
|
||||
logger.debug(
|
||||
"Session model override (no api_key, fallback): session=%s config_model=%s override_model=%s",
|
||||
resolved_session_key or "", model, override_model,
|
||||
(resolved_session_key or "")[:30], model, override_model,
|
||||
)
|
||||
else:
|
||||
logger.debug(
|
||||
"No session model override: session=%s config_model=%s override_keys=%s",
|
||||
resolved_session_key or "", model,
|
||||
(resolved_session_key or "")[:30], model,
|
||||
list(self._session_model_overrides.keys())[:5] if self._session_model_overrides else "[]",
|
||||
)
|
||||
|
||||
@@ -1564,7 +1721,7 @@ class GatewayRunner:
|
||||
continue
|
||||
try:
|
||||
agent.interrupt(reason)
|
||||
logger.debug("Interrupted running agent for session %s during shutdown", session_key)
|
||||
logger.debug("Interrupted running agent for session %s during shutdown", session_key[:20])
|
||||
except Exception as e:
|
||||
logger.debug("Failed interrupting agent during shutdown: %s", e)
|
||||
|
||||
@@ -1736,7 +1893,7 @@ class GatewayRunner:
|
||||
logger.warning(
|
||||
"Auto-suspended stuck session %s (active across %d "
|
||||
"consecutive restarts — likely a stuck loop)",
|
||||
session_key, counts[session_key],
|
||||
session_key[:30], counts[session_key],
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
@@ -2149,7 +2306,7 @@ class GatewayRunner:
|
||||
except Exception as e:
|
||||
logger.error("Recovered watcher setup error: %s", e)
|
||||
|
||||
# Start background session expiry watcher to finalize expired sessions
|
||||
# Start background session expiry watcher for proactive memory flushing
|
||||
asyncio.create_task(self._session_expiry_watcher())
|
||||
|
||||
# Start background reconnection watcher for platforms that failed at startup
|
||||
@@ -2166,24 +2323,25 @@ class GatewayRunner:
|
||||
return True
|
||||
|
||||
async def _session_expiry_watcher(self, interval: int = 300):
|
||||
"""Background task that finalizes expired sessions.
|
||||
"""Background task that proactively flushes memories for expired sessions.
|
||||
|
||||
Runs every `interval` seconds (default 5 min). For each session that
|
||||
has expired according to its reset policy, flushes memories in a thread
|
||||
pool and marks the session so it won't be flushed again.
|
||||
|
||||
Runs every ``interval`` seconds (default 5 min). For each session
|
||||
whose reset policy has expired, invokes ``on_session_finalize``
|
||||
hooks, cleans up the cached AIAgent's tool resources, evicts the
|
||||
cache entry so it can be garbage-collected, and marks the session
|
||||
so it won't be finalized again.
|
||||
This means memories are already saved by the time the user sends their
|
||||
next message, so there's no blocking delay.
|
||||
"""
|
||||
await asyncio.sleep(60) # initial delay — let the gateway fully start
|
||||
_finalize_failures: dict[str, int] = {} # session_id -> consecutive failure count
|
||||
_MAX_FINALIZE_RETRIES = 3
|
||||
_flush_failures: dict[str, int] = {} # session_id -> consecutive failure count
|
||||
_MAX_FLUSH_RETRIES = 3
|
||||
while self._running:
|
||||
try:
|
||||
self.session_store._ensure_loaded()
|
||||
# Collect expired sessions first, then log a single summary.
|
||||
_expired_entries = []
|
||||
for key, entry in list(self.session_store._entries.items()):
|
||||
if entry.expiry_finalized:
|
||||
if entry.memory_flushed:
|
||||
continue
|
||||
if not self.session_store._is_session_expired(entry):
|
||||
continue
|
||||
@@ -2201,12 +2359,13 @@ class GatewayRunner:
|
||||
f"{p}:{c}" for p, c in sorted(_platforms.items())
|
||||
)
|
||||
logger.info(
|
||||
"Session expiry: %d sessions to finalize (%s)",
|
||||
"Session expiry: %d sessions to flush (%s)",
|
||||
len(_expired_entries), _plat_summary,
|
||||
)
|
||||
|
||||
for key, entry in _expired_entries:
|
||||
try:
|
||||
await self._async_flush_memories(entry.session_id, key)
|
||||
try:
|
||||
from hermes_cli.plugins import invoke_hook as _invoke_hook
|
||||
_parts = key.split(":")
|
||||
@@ -2238,48 +2397,48 @@ class GatewayRunner:
|
||||
# be garbage-collected. Otherwise the cache grows
|
||||
# unbounded across the gateway's lifetime.
|
||||
self._evict_cached_agent(key)
|
||||
# Mark as finalized and persist to disk so the flag
|
||||
# Mark as flushed and persist to disk so the flag
|
||||
# survives gateway restarts.
|
||||
with self.session_store._lock:
|
||||
entry.expiry_finalized = True
|
||||
entry.memory_flushed = True
|
||||
self.session_store._save()
|
||||
logger.debug(
|
||||
"Session expiry finalized for %s",
|
||||
"Memory flush completed for session %s",
|
||||
entry.session_id,
|
||||
)
|
||||
_finalize_failures.pop(entry.session_id, None)
|
||||
_flush_failures.pop(entry.session_id, None)
|
||||
except Exception as e:
|
||||
failures = _finalize_failures.get(entry.session_id, 0) + 1
|
||||
_finalize_failures[entry.session_id] = failures
|
||||
if failures >= _MAX_FINALIZE_RETRIES:
|
||||
failures = _flush_failures.get(entry.session_id, 0) + 1
|
||||
_flush_failures[entry.session_id] = failures
|
||||
if failures >= _MAX_FLUSH_RETRIES:
|
||||
logger.warning(
|
||||
"Session finalize gave up after %d attempts for %s: %s. "
|
||||
"Marking as finalized to prevent infinite retry loop.",
|
||||
"Memory flush gave up after %d attempts for %s: %s. "
|
||||
"Marking as flushed to prevent infinite retry loop.",
|
||||
failures, entry.session_id, e,
|
||||
)
|
||||
with self.session_store._lock:
|
||||
entry.expiry_finalized = True
|
||||
entry.memory_flushed = True
|
||||
self.session_store._save()
|
||||
_finalize_failures.pop(entry.session_id, None)
|
||||
_flush_failures.pop(entry.session_id, None)
|
||||
else:
|
||||
logger.debug(
|
||||
"Session finalize failed (%d/%d) for %s: %s",
|
||||
failures, _MAX_FINALIZE_RETRIES, entry.session_id, e,
|
||||
"Memory flush failed (%d/%d) for %s: %s",
|
||||
failures, _MAX_FLUSH_RETRIES, entry.session_id, e,
|
||||
)
|
||||
|
||||
if _expired_entries:
|
||||
_done = sum(
|
||||
1 for _, e in _expired_entries if e.expiry_finalized
|
||||
_flushed = sum(
|
||||
1 for _, e in _expired_entries if e.memory_flushed
|
||||
)
|
||||
_failed = len(_expired_entries) - _done
|
||||
_failed = len(_expired_entries) - _flushed
|
||||
if _failed:
|
||||
logger.info(
|
||||
"Session expiry done: %d finalized, %d pending retry",
|
||||
_done, _failed,
|
||||
"Session expiry done: %d flushed, %d pending retry",
|
||||
_flushed, _failed,
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
"Session expiry done: %d finalized", _done,
|
||||
"Session expiry done: %d flushed", _flushed,
|
||||
)
|
||||
|
||||
# Sweep agents that have been idle beyond the TTL regardless
|
||||
@@ -2556,7 +2715,7 @@ class GatewayRunner:
|
||||
except Exception as _e:
|
||||
logger.debug(
|
||||
"mark_resume_pending failed for %s: %s",
|
||||
_sk, _e,
|
||||
_sk[:20], _e,
|
||||
)
|
||||
self._interrupt_running_agents(
|
||||
_INTERRUPT_REASON_GATEWAY_RESTART if self._restart_requested else _INTERRUPT_REASON_GATEWAY_SHUTDOWN
|
||||
@@ -2878,7 +3037,6 @@ class GatewayRunner:
|
||||
Platform.QQBOT: "QQ_ALLOWED_USERS",
|
||||
}
|
||||
platform_group_env_map = {
|
||||
Platform.TELEGRAM: "TELEGRAM_GROUP_ALLOWED_USERS",
|
||||
Platform.QQBOT: "QQ_GROUP_ALLOWED_USERS",
|
||||
}
|
||||
platform_allow_all_map = {
|
||||
@@ -2935,7 +3093,7 @@ class GatewayRunner:
|
||||
# Check platform-specific and global allowlists
|
||||
platform_allowlist = os.getenv(platform_env_map.get(source.platform, ""), "").strip()
|
||||
group_allowlist = ""
|
||||
if source.chat_type in {"group", "forum"}:
|
||||
if source.chat_type == "group":
|
||||
group_allowlist = os.getenv(platform_group_env_map.get(source.platform, ""), "").strip()
|
||||
global_allowlist = os.getenv("GATEWAY_ALLOWED_USERS", "").strip()
|
||||
|
||||
@@ -2944,7 +3102,7 @@ class GatewayRunner:
|
||||
return os.getenv("GATEWAY_ALLOW_ALL_USERS", "").lower() in ("true", "1", "yes")
|
||||
|
||||
# Some platforms authorize group traffic by chat ID rather than sender ID.
|
||||
if group_allowlist and source.chat_type in {"group", "forum"} and source.chat_id:
|
||||
if group_allowlist and source.chat_type == "group" and source.chat_id:
|
||||
allowed_group_ids = {
|
||||
chat_id.strip() for chat_id in group_allowlist.split(",") if chat_id.strip()
|
||||
}
|
||||
@@ -3222,7 +3380,7 @@ class GatewayRunner:
|
||||
logger.warning(
|
||||
"Evicting stale _running_agents entry for %s "
|
||||
"(age: %.0fs, idle: %.0fs, timeout: %.0fs)%s",
|
||||
_quick_key, _stale_age, _stale_idle,
|
||||
_quick_key[:30], _stale_age, _stale_idle,
|
||||
_raw_stale_timeout, _stale_detail,
|
||||
)
|
||||
self._invalidate_session_run_generation(
|
||||
@@ -3258,7 +3416,7 @@ class GatewayRunner:
|
||||
interrupt_reason=_INTERRUPT_REASON_STOP,
|
||||
invalidation_reason="stop_command",
|
||||
)
|
||||
logger.info("STOP for session %s — agent interrupted, session lock released", _quick_key)
|
||||
logger.info("STOP for session %s — agent interrupted, session lock released", _quick_key[:20])
|
||||
return "⚡ Stopped. You can continue this session."
|
||||
|
||||
# /reset and /new must bypass the running-agent guard so they
|
||||
@@ -3324,7 +3482,7 @@ class GatewayRunner:
|
||||
try:
|
||||
accepted = running_agent.steer(steer_text)
|
||||
except Exception as exc:
|
||||
logger.warning("Steer failed for session %s: %s", _quick_key, exc)
|
||||
logger.warning("Steer failed for session %s: %s", _quick_key[:20], exc)
|
||||
return f"⚠️ Steer failed: {exc}"
|
||||
if accepted:
|
||||
preview = steer_text[:60] + ("..." if len(steer_text) > 60 else "")
|
||||
@@ -3407,7 +3565,7 @@ class GatewayRunner:
|
||||
)
|
||||
|
||||
if event.message_type == MessageType.PHOTO:
|
||||
logger.debug("PRIORITY photo follow-up for session %s — queueing without interrupt", _quick_key)
|
||||
logger.debug("PRIORITY photo follow-up for session %s — queueing without interrupt", _quick_key[:20])
|
||||
adapter = self.adapters.get(source.platform)
|
||||
if adapter:
|
||||
merge_pending_message_event(adapter._pending_messages, _quick_key, event)
|
||||
@@ -3427,7 +3585,7 @@ class GatewayRunner:
|
||||
logger.debug(
|
||||
"Telegram follow-up arrived %.2fs after run start for %s — queueing without interrupt",
|
||||
time.time() - _started_at,
|
||||
_quick_key,
|
||||
_quick_key[:20],
|
||||
)
|
||||
adapter = self.adapters.get(source.platform)
|
||||
if adapter:
|
||||
@@ -3445,7 +3603,7 @@ class GatewayRunner:
|
||||
if event.get_command() == "stop":
|
||||
# Force-clean the sentinel so the session is unlocked.
|
||||
self._release_running_agent_state(_quick_key)
|
||||
logger.info("HARD STOP (pending) for session %s — sentinel cleared", _quick_key)
|
||||
logger.info("HARD STOP (pending) for session %s — sentinel cleared", _quick_key[:20])
|
||||
return "⚡ Force-stopped. The agent was still starting — session unlocked."
|
||||
# Queue the message so it will be picked up after the
|
||||
# agent starts.
|
||||
@@ -3466,11 +3624,7 @@ class GatewayRunner:
|
||||
if self._queue_during_drain_enabled()
|
||||
else f"⏳ Gateway is {self._status_action_gerund()} and is not accepting another turn right now."
|
||||
)
|
||||
if self._busy_input_mode == "queue":
|
||||
logger.debug("PRIORITY queue follow-up for session %s", _quick_key)
|
||||
self._queue_or_replace_pending_event(_quick_key, event)
|
||||
return None
|
||||
logger.debug("PRIORITY interrupt for session %s", _quick_key)
|
||||
logger.debug("PRIORITY interrupt for session %s", _quick_key[:20])
|
||||
running_agent.interrupt(event.text)
|
||||
if _quick_key in self._pending_messages:
|
||||
self._pending_messages[_quick_key] += "\n" + event.text
|
||||
@@ -4468,7 +4622,7 @@ class GatewayRunner:
|
||||
if not self._is_session_run_current(_quick_key, run_generation):
|
||||
logger.info(
|
||||
"Discarding stale agent result for %s — generation %d is no longer current",
|
||||
_quick_key or "?",
|
||||
_quick_key[:20] if _quick_key else "?",
|
||||
run_generation,
|
||||
)
|
||||
_stale_adapter = self.adapters.get(source.platform)
|
||||
@@ -4519,7 +4673,7 @@ class GatewayRunner:
|
||||
except Exception as _e:
|
||||
logger.debug(
|
||||
"clear_resume_pending failed for %s: %s",
|
||||
session_key, _e,
|
||||
session_key[:20], _e,
|
||||
)
|
||||
|
||||
# Surface error details when the agent failed silently (final_response=None)
|
||||
@@ -4896,11 +5050,19 @@ class GatewayRunner:
|
||||
# Get existing session key
|
||||
session_key = self._session_key_for_source(source)
|
||||
self._invalidate_session_run_generation(session_key, reason="session_reset")
|
||||
|
||||
# Snapshot the old entry so on_session_finalize can report the
|
||||
# expiring session id before reset_session() rotates it.
|
||||
old_entry = self.session_store._entries.get(session_key)
|
||||
|
||||
|
||||
# Flush memories in the background (fire-and-forget) so the user
|
||||
# gets the "Session reset!" response immediately.
|
||||
try:
|
||||
old_entry = self.session_store._entries.get(session_key)
|
||||
if old_entry:
|
||||
_flush_task = asyncio.create_task(
|
||||
self._async_flush_memories(old_entry.session_id, session_key)
|
||||
)
|
||||
self._background_tasks.add(_flush_task)
|
||||
_flush_task.add_done_callback(self._background_tasks.discard)
|
||||
except Exception as e:
|
||||
logger.debug("Gateway memory flush on reset failed: %s", e)
|
||||
# Close tool resources on the old agent (terminal sandboxes, browser
|
||||
# daemons, background processes) before evicting from cache.
|
||||
# Guard with getattr because test fixtures may skip __init__.
|
||||
@@ -5158,7 +5320,7 @@ class GatewayRunner:
|
||||
interrupt_reason=_INTERRUPT_REASON_STOP,
|
||||
invalidation_reason="stop_command_pending",
|
||||
)
|
||||
logger.info("STOP (pending) for session %s — sentinel cleared", session_key)
|
||||
logger.info("STOP (pending) for session %s — sentinel cleared", session_key[:20])
|
||||
return "⚡ Stopped. The agent hadn't started yet — you can continue this session."
|
||||
if agent:
|
||||
# Force-clean the session lock so a truly hung agent doesn't
|
||||
@@ -5526,17 +5688,9 @@ class GatewayRunner:
|
||||
lines = [f"Model switched to `{result.new_model}`"]
|
||||
lines.append(f"Provider: {plabel}")
|
||||
mi = result.model_info
|
||||
from hermes_cli.model_switch import resolve_display_context_length
|
||||
ctx = resolve_display_context_length(
|
||||
result.new_model,
|
||||
result.target_provider,
|
||||
base_url=result.base_url or current_base_url or "",
|
||||
api_key=result.api_key or current_api_key or "",
|
||||
model_info=mi,
|
||||
)
|
||||
if ctx:
|
||||
lines.append(f"Context: {ctx:,} tokens")
|
||||
if mi:
|
||||
if mi.context_window:
|
||||
lines.append(f"Context: {mi.context_window:,} tokens")
|
||||
if mi.max_output:
|
||||
lines.append(f"Max output: {mi.max_output:,} tokens")
|
||||
if mi.has_cost_data():
|
||||
@@ -5670,25 +5824,28 @@ class GatewayRunner:
|
||||
lines = [f"Model switched to `{result.new_model}`"]
|
||||
lines.append(f"Provider: {provider_label}")
|
||||
|
||||
# Context: always resolve via the provider-aware chain so Codex OAuth,
|
||||
# Copilot, and Nous-enforced caps win over the raw models.dev entry.
|
||||
# Rich metadata from models.dev
|
||||
mi = result.model_info
|
||||
from hermes_cli.model_switch import resolve_display_context_length
|
||||
ctx = resolve_display_context_length(
|
||||
result.new_model,
|
||||
result.target_provider,
|
||||
base_url=result.base_url or current_base_url or "",
|
||||
api_key=result.api_key or current_api_key or "",
|
||||
model_info=mi,
|
||||
)
|
||||
if ctx:
|
||||
lines.append(f"Context: {ctx:,} tokens")
|
||||
if mi:
|
||||
if mi.context_window:
|
||||
lines.append(f"Context: {mi.context_window:,} tokens")
|
||||
if mi.max_output:
|
||||
lines.append(f"Max output: {mi.max_output:,} tokens")
|
||||
if mi.has_cost_data():
|
||||
lines.append(f"Cost: {mi.format_cost()}")
|
||||
lines.append(f"Capabilities: {mi.format_capabilities()}")
|
||||
else:
|
||||
try:
|
||||
from agent.model_metadata import get_model_context_length
|
||||
ctx = get_model_context_length(
|
||||
result.new_model,
|
||||
base_url=result.base_url or current_base_url,
|
||||
api_key=result.api_key or current_api_key,
|
||||
provider=result.target_provider,
|
||||
)
|
||||
lines.append(f"Context: {ctx:,} tokens")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Cache notice
|
||||
cache_enabled = (
|
||||
@@ -7100,25 +7257,29 @@ class GatewayRunner:
|
||||
logger.debug("Failed to list titled sessions: %s", e)
|
||||
return f"Could not list sessions: {e}"
|
||||
|
||||
# Resolve the name to a session ID.
|
||||
# Resolve the name to a session ID
|
||||
target_id = self._session_db.resolve_session_by_title(name)
|
||||
if not target_id:
|
||||
return (
|
||||
f"No session found matching '**{name}**'.\n"
|
||||
"Use `/resume` with no arguments to see available sessions."
|
||||
)
|
||||
# Compression creates child continuations that hold the live transcript.
|
||||
# Follow that chain so gateway /resume matches CLI behavior (#15000).
|
||||
try:
|
||||
target_id = self._session_db.resolve_resume_session_id(target_id)
|
||||
except Exception as e:
|
||||
logger.debug("Failed to resolve resume continuation for %s: %s", target_id, e)
|
||||
|
||||
# Check if already on that session
|
||||
current_entry = self.session_store.get_or_create_session(source)
|
||||
if current_entry.session_id == target_id:
|
||||
return f"📌 Already on session **{name}**."
|
||||
|
||||
# Flush memories for current session before switching
|
||||
try:
|
||||
_flush_task = asyncio.create_task(
|
||||
self._async_flush_memories(current_entry.session_id, session_key)
|
||||
)
|
||||
self._background_tasks.add(_flush_task)
|
||||
_flush_task.add_done_callback(self._background_tasks.discard)
|
||||
except Exception as e:
|
||||
logger.debug("Memory flush on resume failed: %s", e)
|
||||
|
||||
# Clear any running agent for this session key
|
||||
self._release_running_agent_state(session_key)
|
||||
|
||||
@@ -8655,7 +8816,7 @@ class GatewayRunner:
|
||||
if reason:
|
||||
logger.info(
|
||||
"Invalidated run generation for %s → %d (%s)",
|
||||
session_key,
|
||||
session_key[:20],
|
||||
generation,
|
||||
reason,
|
||||
)
|
||||
@@ -9062,7 +9223,7 @@ class GatewayRunner:
|
||||
if not _run_still_current():
|
||||
logger.info(
|
||||
"Discarding stale proxy stream for %s — generation %d is no longer current",
|
||||
session_key or "?",
|
||||
session_key[:20] if session_key else "?",
|
||||
run_generation or 0,
|
||||
)
|
||||
return {
|
||||
@@ -9126,7 +9287,7 @@ class GatewayRunner:
|
||||
if not _run_still_current():
|
||||
logger.info(
|
||||
"Discarding stale proxy result for %s — generation %d is no longer current",
|
||||
session_key or "?",
|
||||
session_key[:20] if session_key else "?",
|
||||
run_generation or 0,
|
||||
)
|
||||
return {
|
||||
@@ -9568,7 +9729,7 @@ class GatewayRunner:
|
||||
)
|
||||
logger.debug(
|
||||
"run_agent resolved: model=%s provider=%s session=%s",
|
||||
model, runtime_kwargs.get("provider"), session_key or "",
|
||||
model, runtime_kwargs.get("provider"), (session_key or "")[:30],
|
||||
)
|
||||
except Exception as exc:
|
||||
return {
|
||||
@@ -10179,7 +10340,7 @@ class GatewayRunner:
|
||||
):
|
||||
logger.info(
|
||||
"Skipping stale agent promotion for %s — generation %s is no longer current",
|
||||
session_key or "",
|
||||
(session_key or "")[:20],
|
||||
run_generation,
|
||||
)
|
||||
return
|
||||
@@ -10326,7 +10487,7 @@ class GatewayRunner:
|
||||
logger.info(
|
||||
"Backup interrupt detected for session %s "
|
||||
"(monitor task state: %s)",
|
||||
session_key,
|
||||
session_key[:20],
|
||||
"done" if interrupt_monitor.done() else "running",
|
||||
)
|
||||
_backup_agent.interrupt(_bp_text)
|
||||
@@ -10386,7 +10547,7 @@ class GatewayRunner:
|
||||
logger.info(
|
||||
"Backup interrupt detected for session %s "
|
||||
"(monitor task state: %s)",
|
||||
session_key,
|
||||
session_key[:20],
|
||||
"done" if interrupt_monitor.done() else "running",
|
||||
)
|
||||
_backup_agent.interrupt(_bp_text)
|
||||
@@ -10488,7 +10649,7 @@ class GatewayRunner:
|
||||
if _is_control_interrupt_message(interrupt_message):
|
||||
logger.info(
|
||||
"Ignoring control interrupt message for session %s: %s",
|
||||
session_key or "?",
|
||||
session_key[:20] if session_key else "?",
|
||||
interrupt_message,
|
||||
)
|
||||
else:
|
||||
@@ -10532,7 +10693,7 @@ class GatewayRunner:
|
||||
if self._draining and (pending_event or pending):
|
||||
logger.info(
|
||||
"Discarding pending follow-up for session %s during gateway %s",
|
||||
session_key or "?",
|
||||
session_key[:20] if session_key else "?",
|
||||
self._status_action_label(),
|
||||
)
|
||||
pending_event = None
|
||||
@@ -10589,7 +10750,7 @@ class GatewayRunner:
|
||||
try:
|
||||
logger.info(
|
||||
"Queued follow-up for session %s: final stream delivery not confirmed; sending first response before continuing.",
|
||||
session_key or "?",
|
||||
session_key[:20] if session_key else "?",
|
||||
)
|
||||
await adapter.send(
|
||||
source.chat_id,
|
||||
@@ -10601,7 +10762,7 @@ class GatewayRunner:
|
||||
elif first_response:
|
||||
logger.info(
|
||||
"Queued follow-up for session %s: skipping resend because final streamed delivery was confirmed.",
|
||||
session_key or "?",
|
||||
session_key[:20] if session_key else "?",
|
||||
)
|
||||
# Release deferred bg-review notifications now that the
|
||||
# first response has been delivered. Pop from the
|
||||
@@ -10736,7 +10897,7 @@ class GatewayRunner:
|
||||
if not _is_empty_sentinel and (_streamed or _previewed):
|
||||
logger.info(
|
||||
"Suppressing normal final send for session %s: final delivery already confirmed (streamed=%s previewed=%s).",
|
||||
session_key or "?",
|
||||
session_key[:20] if session_key else "?",
|
||||
_streamed,
|
||||
_previewed,
|
||||
)
|
||||
|
||||
+16
-97
@@ -60,10 +60,6 @@ from .config import (
|
||||
SessionResetPolicy, # noqa: F401 — re-exported via gateway/__init__.py
|
||||
HomeChannel,
|
||||
)
|
||||
from .whatsapp_identity import (
|
||||
canonical_whatsapp_identifier,
|
||||
normalize_whatsapp_identifier,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -87,9 +83,6 @@ class SessionSource:
|
||||
user_id_alt: Optional[str] = None # Platform-specific stable alt ID (Signal UUID, Feishu union_id)
|
||||
chat_id_alt: Optional[str] = None # Signal group internal ID
|
||||
is_bot: bool = False # True when the message author is a bot/webhook (Discord)
|
||||
guild_id: Optional[str] = None # Discord guild / Slack workspace / Matrix server scope
|
||||
parent_chat_id: Optional[str] = None # Parent channel when chat_id refers to a thread
|
||||
message_id: Optional[str] = None # ID of the triggering message (for pin/reply/react)
|
||||
|
||||
@property
|
||||
def description(self) -> str:
|
||||
@@ -127,14 +120,8 @@ class SessionSource:
|
||||
d["user_id_alt"] = self.user_id_alt
|
||||
if self.chat_id_alt:
|
||||
d["chat_id_alt"] = self.chat_id_alt
|
||||
if self.guild_id:
|
||||
d["guild_id"] = self.guild_id
|
||||
if self.parent_chat_id:
|
||||
d["parent_chat_id"] = self.parent_chat_id
|
||||
if self.message_id:
|
||||
d["message_id"] = self.message_id
|
||||
return d
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: Dict[str, Any]) -> "SessionSource":
|
||||
return cls(
|
||||
@@ -148,9 +135,6 @@ class SessionSource:
|
||||
chat_topic=data.get("chat_topic"),
|
||||
user_id_alt=data.get("user_id_alt"),
|
||||
chat_id_alt=data.get("chat_id_alt"),
|
||||
guild_id=data.get("guild_id"),
|
||||
parent_chat_id=data.get("parent_chat_id"),
|
||||
message_id=data.get("message_id"),
|
||||
)
|
||||
|
||||
|
||||
@@ -202,31 +186,6 @@ that requires raw IDs). Discord is excluded because mentions use ``<@user_id>``
|
||||
and the LLM needs the real ID to tag users."""
|
||||
|
||||
|
||||
def _discord_tools_loaded() -> bool:
    """Report whether Discord tools will be available to the agent.

    Both of the following must be true:
      1. ``DISCORD_BOT_TOKEN`` is set to a non-blank value.
      2. The toolsets returned by ``_get_platform_tools`` for the
         ``"discord"`` platform include ``discord`` or ``discord_admin``.

    Any error while loading or inspecting the configuration yields False,
    so a broken config never advertises tools the agent does not have.
    """
    token = os.environ.get("DISCORD_BOT_TOKEN") or ""
    if not token.strip():
        return False

    try:
        from hermes_cli.config import load_config
        from hermes_cli.tools_config import _get_platform_tools

        toolsets = _get_platform_tools(
            load_config(), "discord", include_default_mcp_servers=False
        )
        # Kept inside the try so an unexpected return type also yields False.
        return any(name in toolsets for name in ("discord", "discord_admin"))
    except Exception:
        return False
|
||||
|
||||
|
||||
def build_session_context_prompt(
|
||||
context: SessionContext,
|
||||
*,
|
||||
@@ -314,44 +273,13 @@ def build_session_context_prompt(
|
||||
"that you can only read messages sent directly to you and respond."
|
||||
)
|
||||
elif context.source.platform == Platform.DISCORD:
|
||||
# Inject the Discord IDs block only when the agent actually has
|
||||
# Discord tools loaded this session — i.e. the user opted into
|
||||
# `discord` / `discord_admin` via `hermes tools` AND the bot
|
||||
# token is configured. Otherwise keep the stale-API disclaimer
|
||||
# honest so we never promise tools the agent lacks.
|
||||
if _discord_tools_loaded():
|
||||
src = context.source
|
||||
id_lines = ["", "**Discord IDs (for the `discord` / `discord_admin` tools):**"]
|
||||
if src.guild_id:
|
||||
id_lines.append(f" - Guild: `{src.guild_id}`")
|
||||
if src.thread_id and src.parent_chat_id:
|
||||
id_lines.append(f" - Parent channel: `{src.parent_chat_id}`")
|
||||
id_lines.append(f" - Thread: `{src.thread_id}` (use as `channel_id` for fetch_messages etc.)")
|
||||
else:
|
||||
id_lines.append(f" - Channel: `{src.chat_id}`")
|
||||
if src.message_id:
|
||||
id_lines.append(f" - Triggering message: `{src.message_id}`")
|
||||
lines.extend(id_lines)
|
||||
else:
|
||||
lines.append("")
|
||||
lines.append(
|
||||
"**Platform notes:** You are running inside Discord. "
|
||||
"You do NOT have access to Discord-specific APIs — you cannot search "
|
||||
"channel history, pin messages, manage roles, or list server members. "
|
||||
"Do not promise to perform these actions. If the user asks, explain "
|
||||
"that you can only read messages sent directly to you and respond."
|
||||
)
|
||||
elif context.source.platform == Platform.BLUEBUBBLES:
|
||||
lines.append("")
|
||||
lines.append(
|
||||
"**Platform notes:** You are responding via iMessage. "
|
||||
"Keep responses short and conversational — think texts, not essays. "
|
||||
"Structure longer replies as separate short thoughts, each separated "
|
||||
"by a blank line (double newline). Each block between blank lines "
|
||||
"will be delivered as its own iMessage bubble, so write accordingly: "
|
||||
"one idea per bubble, 1–3 sentences each. "
|
||||
"If the user needs a detailed answer, give the short version first "
|
||||
"and offer to elaborate."
|
||||
"**Platform notes:** You are running inside Discord. "
|
||||
"You do NOT have access to Discord-specific APIs — you cannot search "
|
||||
"channel history, pin messages, manage roles, or list server members. "
|
||||
"Do not promise to perform these actions. If the user asks, explain "
|
||||
"that you can only read messages sent directly to you and respond."
|
||||
)
|
||||
|
||||
# Connected platforms
|
||||
@@ -439,11 +367,11 @@ class SessionEntry:
|
||||
auto_reset_reason: Optional[str] = None # "idle" or "daily"
|
||||
reset_had_activity: bool = False # whether the expired session had any messages
|
||||
|
||||
# Set by the background expiry watcher after it finalizes an expired
|
||||
# session (invoking on_session_finalize hooks and evicting the cached
|
||||
# agent). Persisted to sessions.json so the flag survives gateway
|
||||
# restarts — prevents redundant finalization runs.
|
||||
expiry_finalized: bool = False
|
||||
# Set by the background expiry watcher after it successfully flushes
|
||||
# memories for this session. Persisted to sessions.json so the flag
|
||||
# survives gateway restarts (the old in-memory _pre_flushed_sessions
|
||||
# set was lost on restart, causing redundant re-flushes).
|
||||
memory_flushed: bool = False
|
||||
|
||||
# When True the next call to get_or_create_session() will auto-reset
|
||||
# this session (create a new session_id) so the user starts fresh.
|
||||
@@ -479,7 +407,7 @@ class SessionEntry:
|
||||
"last_prompt_tokens": self.last_prompt_tokens,
|
||||
"estimated_cost_usd": self.estimated_cost_usd,
|
||||
"cost_status": self.cost_status,
|
||||
"expiry_finalized": self.expiry_finalized,
|
||||
"memory_flushed": self.memory_flushed,
|
||||
"suspended": self.suspended,
|
||||
"resume_pending": self.resume_pending,
|
||||
"resume_reason": self.resume_reason,
|
||||
@@ -531,7 +459,7 @@ class SessionEntry:
|
||||
last_prompt_tokens=data.get("last_prompt_tokens", 0),
|
||||
estimated_cost_usd=data.get("estimated_cost_usd", 0.0),
|
||||
cost_status=data.get("cost_status", "unknown"),
|
||||
expiry_finalized=data.get("expiry_finalized", data.get("memory_flushed", False)),
|
||||
memory_flushed=data.get("memory_flushed", False),
|
||||
suspended=data.get("suspended", False),
|
||||
resume_pending=data.get("resume_pending", False),
|
||||
resume_reason=data.get("resume_reason"),
|
||||
@@ -590,24 +518,15 @@ def build_session_key(
|
||||
"""
|
||||
platform = source.platform.value
|
||||
if source.chat_type == "dm":
|
||||
dm_chat_id = source.chat_id
|
||||
if source.platform == Platform.WHATSAPP:
|
||||
dm_chat_id = canonical_whatsapp_identifier(source.chat_id)
|
||||
|
||||
if dm_chat_id:
|
||||
if source.chat_id:
|
||||
if source.thread_id:
|
||||
return f"agent:main:{platform}:dm:{dm_chat_id}:{source.thread_id}"
|
||||
return f"agent:main:{platform}:dm:{dm_chat_id}"
|
||||
return f"agent:main:{platform}:dm:{source.chat_id}:{source.thread_id}"
|
||||
return f"agent:main:{platform}:dm:{source.chat_id}"
|
||||
if source.thread_id:
|
||||
return f"agent:main:{platform}:dm:{source.thread_id}"
|
||||
return f"agent:main:{platform}:dm"
|
||||
|
||||
participant_id = source.user_id_alt or source.user_id
|
||||
if participant_id and source.platform == Platform.WHATSAPP:
|
||||
# Same JID/LID-flip bug as the DM case: without canonicalisation, a
|
||||
# single group member gets two isolated per-user sessions when the
|
||||
# bridge reshuffles alias forms.
|
||||
participant_id = canonical_whatsapp_identifier(str(participant_id)) or participant_id
|
||||
key_parts = ["agent:main", platform, source.chat_type]
|
||||
|
||||
if source.chat_id:
|
||||
|
||||
@@ -1,135 +0,0 @@
|
||||
"""Shared helpers for canonicalising WhatsApp sender identity.
|
||||
|
||||
WhatsApp's bridge can surface the same human under two different JID shapes
|
||||
within a single conversation:
|
||||
|
||||
- LID form: ``999999999999999@lid``
|
||||
- Phone form: ``15551234567@s.whatsapp.net``
|
||||
|
||||
Both the authorisation path (:mod:`gateway.run`) and the session-key path
|
||||
(:mod:`gateway.session`) need to collapse these aliases to a single stable
|
||||
identity. This module is the single source of truth for that resolution so
|
||||
the two paths can never drift apart.
|
||||
|
||||
Public helpers:
|
||||
|
||||
- :func:`normalize_whatsapp_identifier` — strip JID/LID/device/plus syntax
|
||||
down to the bare numeric identifier.
|
||||
- :func:`canonical_whatsapp_identifier` — walk the bridge's
|
||||
``lid-mapping-*.json`` files and return a stable canonical identity
|
||||
across phone/LID variants.
|
||||
- :func:`expand_whatsapp_aliases` — return the full alias set for an
|
||||
identifier. Used by authorisation code that needs to match any known
|
||||
form of a sender against an allow-list.
|
||||
|
||||
Plugins that need per-sender behaviour on WhatsApp (role-based routing,
|
||||
per-contact authorisation, policy gating in a gateway hook) should use
|
||||
``canonical_whatsapp_identifier`` so their bookkeeping lines up with
|
||||
Hermes' own session keys.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Set
|
||||
|
||||
from hermes_constants import get_hermes_home
|
||||
|
||||
|
||||
def normalize_whatsapp_identifier(value: str) -> str:
    """Strip WhatsApp JID/LID syntax down to its stable numeric identifier.

    Accepts any of the identifier shapes the WhatsApp bridge may emit:
    ``"60123456789@s.whatsapp.net"``, ``"60123456789:47@s.whatsapp.net"``,
    ``"60123456789@lid"``, or a bare ``"+60123456789"`` / ``"60123456789"``.

    Useful for plugins that want to match sender IDs against user-supplied
    config (phone numbers in ``config.yaml``) without worrying about which
    variant the bridge happens to deliver.

    Args:
        value: Raw identifier. Falsy values (``None``, ``""``) are treated
            as empty; non-string values are stringified.

    Returns:
        The bare identifier (``"60123456789"``) suitable for equality
        comparisons, or ``""`` when nothing is left after stripping.
    """
    text = str(value or "").strip()
    # Drop a single "+" (international prefix), then the ":device" and
    # "@server" parts, in that order.
    text = text.replace("+", "", 1)
    text = text.split(":", 1)[0]
    text = text.split("@", 1)[0]
    # Whitespace that sat next to the removed syntax ("123 @lid", "+ 123")
    # would otherwise survive and defeat equality checks; stripping again
    # also makes the function idempotent for such inputs.
    return text.strip()
|
||||
|
||||
|
||||
def expand_whatsapp_aliases(identifier: str) -> Set[str]:
    """Resolve WhatsApp phone/LID aliases via bridge session mapping files.

    Starting from ``identifier``, follows the bridge's
    ``$HERMES_HOME/whatsapp/session/lid-mapping-*.json`` files (both the
    forward and the ``_reverse`` variant) transitively and collects every
    identifier reached.

    Args:
        identifier: Any identifier shape accepted by
            :func:`normalize_whatsapp_identifier`.

    Returns:
        The set of all reachable identifiers. It always includes the
        normalized input itself, so callers can safely ``in`` check against
        the return value without a separate fallback branch. Returns an
        empty set if ``identifier`` normalizes to empty.
    """
    normalized = normalize_whatsapp_identifier(identifier)
    if not normalized:
        return set()

    session_dir = get_hermes_home() / "whatsapp" / "session"
    resolved: Set[str] = set()
    pending = [normalized]

    while pending:
        # The result is a set, so visit order is irrelevant; popping from
        # the end avoids the O(n) shift of ``pop(0)``.
        current = pending.pop()
        if not current or current in resolved:
            continue
        resolved.add(current)

        # The identifier is interpolated into a filename below. Keep it in
        # the alias set, but never let one containing a path separator or
        # NUL steer the lookup outside ``session_dir``.
        if any(ch in current for ch in ("/", "\\", "\x00")):
            continue

        for suffix in ("", "_reverse"):
            mapping_path = session_dir / f"lid-mapping-{current}{suffix}.json"
            try:
                payload = json.loads(mapping_path.read_text(encoding="utf-8"))
            except Exception:
                # Best-effort: a missing, unreadable or malformed mapping
                # file simply contributes no alias.
                continue
            # NOTE(review): mapping files are presumed to hold a single
            # JSON scalar (string or number) — confirm against the bridge.
            # Anything else would stringify into a garbage alias.
            if isinstance(payload, bool) or not isinstance(payload, (str, int)):
                continue
            mapped = normalize_whatsapp_identifier(str(payload))
            if mapped and mapped not in resolved:
                pending.append(mapped)

    return resolved
|
||||
|
||||
|
||||
def canonical_whatsapp_identifier(identifier: str) -> str:
    """Return a stable WhatsApp sender identity across phone-JID/LID variants.

    WhatsApp may surface the same person under either a phone-format JID
    (``60123456789@s.whatsapp.net``) or a LID (``1234567890@lid``). This
    applies to a DM ``chat_id`` *and* to the ``participant_id`` of a
    member inside a group chat — both represent a user identity, and the
    bridge may flip between the two for the same human.

    The helper expands the identifier through
    :func:`expand_whatsapp_aliases` (which walks the bridge's
    ``whatsapp/session/lid-mapping-*.json`` files transitively) and picks
    the shortest alias, breaking ties lexicographically, as the canonical
    identity. :func:`gateway.session.build_session_key` uses this for both
    WhatsApp DM chat_ids and WhatsApp group participant_ids, so callers get
    the same session-key identity Hermes itself uses.

    Plugins that need per-sender behaviour (role-based routing,
    authorisation, per-contact policy) should use this so their
    bookkeeping lines up with Hermes' session bookkeeping even when the
    bridge reshuffles aliases.

    Args:
        identifier: Any identifier shape accepted by
            :func:`normalize_whatsapp_identifier`.

    Returns:
        The canonical alias. Returns an empty string if ``identifier``
        normalizes to empty. If no mapping files exist yet (fresh bridge
        install), returns the normalized input unchanged.
    """
    normalized = normalize_whatsapp_identifier(identifier)
    if not normalized:
        return ""

    # expand_whatsapp_aliases normally includes `normalized` itself, so
    # min() degrades gracefully to `normalized` when no lid-mapping files
    # are present. It re-normalizes its argument, though, and returns an
    # empty set when that second pass comes out empty (e.g. a
    # whitespace-only remainder); `default` turns that case into the
    # documented empty-string result instead of a ValueError.
    aliases = expand_whatsapp_aliases(normalized)
    return min(
        aliases,
        key=lambda candidate: (len(candidate), candidate),
        default="",
    )
|
||||
+1
-12
@@ -743,18 +743,7 @@ def _load_auth_store(auth_file: Optional[Path] = None) -> Dict[str, Any]:
|
||||
|
||||
try:
|
||||
raw = json.loads(auth_file.read_text())
|
||||
except Exception as exc:
|
||||
corrupt_path = auth_file.with_suffix(".json.corrupt")
|
||||
try:
|
||||
import shutil
|
||||
shutil.copy2(auth_file, corrupt_path)
|
||||
except Exception:
|
||||
pass
|
||||
logger.warning(
|
||||
"auth: failed to parse %s (%s) — starting with empty store. "
|
||||
"Corrupt file preserved at %s",
|
||||
auth_file, exc, corrupt_path,
|
||||
)
|
||||
except Exception:
|
||||
return {"version": AUTH_STORE_VERSION, "providers": {}}
|
||||
|
||||
if isinstance(raw, dict) and (
|
||||
|
||||
@@ -103,8 +103,7 @@ COMMAND_REGISTRY: list[CommandDef] = [
|
||||
# Configuration
|
||||
CommandDef("config", "Show current configuration", "Configuration",
|
||||
cli_only=True),
|
||||
CommandDef("model", "Switch model for this session", "Configuration",
|
||||
aliases=("provider",), args_hint="[model] [--provider name] [--global]"),
|
||||
CommandDef("model", "Switch model for this session", "Configuration", args_hint="[model] [--provider name] [--global]"),
|
||||
CommandDef("gquota", "Show Google Gemini Code Assist quota usage", "Info",
|
||||
cli_only=True),
|
||||
|
||||
@@ -127,9 +126,6 @@ COMMAND_REGISTRY: list[CommandDef] = [
|
||||
cli_only=True, args_hint="[name]"),
|
||||
CommandDef("voice", "Toggle voice mode", "Configuration",
|
||||
args_hint="[on|off|tts|status]", subcommands=("on", "off", "tts", "status")),
|
||||
CommandDef("busy", "Control what Enter does while Hermes is working", "Configuration",
|
||||
cli_only=True, args_hint="[queue|interrupt|status]",
|
||||
subcommands=("queue", "interrupt", "status")),
|
||||
|
||||
# Tools & Skills
|
||||
CommandDef("tools", "Manage tools: /tools [list|disable|enable] [name...]", "Tools & Skills",
|
||||
|
||||
+9
-10
@@ -612,6 +612,14 @@ DEFAULT_CONFIG = {
|
||||
"timeout": 30,
|
||||
"extra_body": {},
|
||||
},
|
||||
"flush_memories": {
|
||||
"provider": "auto",
|
||||
"model": "",
|
||||
"base_url": "",
|
||||
"api_key": "",
|
||||
"timeout": 30,
|
||||
"extra_body": {},
|
||||
},
|
||||
"title_generation": {
|
||||
"provider": "auto",
|
||||
"model": "",
|
||||
@@ -775,15 +783,6 @@ DEFAULT_CONFIG = {
|
||||
# warning log if out of range.
|
||||
"max_spawn_depth": 1, # depth cap (1 = flat [default], 2 = orchestrator→leaf, 3 = three-level)
|
||||
"orchestrator_enabled": True, # kill switch for role="orchestrator"
|
||||
# When a subagent hits a dangerous-command approval prompt, the parent's
|
||||
# prompt_toolkit TUI owns stdin — a thread-local input() call from the
|
||||
# subagent worker would deadlock the parent UI. To avoid the deadlock,
|
||||
# subagent threads ALWAYS resolve approvals non-interactively:
|
||||
# false (default) → auto-deny with a logger.warning audit line (safe)
|
||||
# true → auto-approve "once" with a logger.warning audit line
|
||||
# Flip to true only if you trust delegated work to run dangerous cmds
|
||||
# without human review (cron pipelines, batch automation, etc.).
|
||||
"subagent_auto_approve": False,
|
||||
},
|
||||
|
||||
# Ephemeral prefill messages file — JSON list of {role, content} dicts
|
||||
@@ -840,7 +839,7 @@ DEFAULT_CONFIG = {
|
||||
"auto_thread": True, # Auto-create threads on @mention in channels (like Slack)
|
||||
"reactions": True, # Add 👀/✅/❌ reactions to messages during processing
|
||||
"channel_prompts": {}, # Per-channel ephemeral system prompts (forum parents apply to child threads)
|
||||
# discord / discord_admin tools: restrict which actions the agent may call.
|
||||
# discord_server tool: restrict which actions the agent may call.
|
||||
# Default (empty) = all actions allowed (subject to bot privileged intents).
|
||||
# Accepts comma-separated string ("list_guilds,list_channels,fetch_messages")
|
||||
# or YAML list. Unknown names are dropped with a warning at load time.
|
||||
|
||||
+80
-340
@@ -51,7 +51,6 @@ import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def _add_accept_hooks_flag(parser) -> None:
|
||||
"""Attach the ``--accept-hooks`` flag. Shared across every agent
|
||||
subparser so the flag works regardless of CLI position."""
|
||||
@@ -175,7 +174,6 @@ load_hermes_dotenv(project_env=PROJECT_ROOT / ".env")
|
||||
try:
|
||||
if "HERMES_REDACT_SECRETS" not in os.environ:
|
||||
import yaml as _yaml_early
|
||||
|
||||
_cfg_path = get_hermes_home() / "config.yaml"
|
||||
if _cfg_path.exists():
|
||||
with open(_cfg_path, encoding="utf-8") as _f:
|
||||
@@ -841,8 +839,6 @@ def _find_bundled_tui(tui_dir: Path) -> Optional[Path]:
|
||||
|
||||
|
||||
def _tui_build_needed(tui_dir: Path) -> bool:
|
||||
if _hermes_ink_bundle_stale(tui_dir):
|
||||
return True
|
||||
entry = tui_dir / "dist" / "entry.js"
|
||||
if not entry.exists():
|
||||
return True
|
||||
@@ -1030,12 +1026,7 @@ def _make_tui_argv(tui_dir: Path, tui_dev: bool) -> tuple[list[str], Path]:
|
||||
return [node, str(root / "dist" / "entry.js")], root
|
||||
|
||||
|
||||
def _launch_tui(
|
||||
resume_session_id: Optional[str] = None,
|
||||
tui_dev: bool = False,
|
||||
model: Optional[str] = None,
|
||||
provider: Optional[str] = None,
|
||||
):
|
||||
def _launch_tui(resume_session_id: Optional[str] = None, tui_dev: bool = False):
|
||||
"""Replace current process with the TUI."""
|
||||
tui_dir = PROJECT_ROOT / "ui-tui"
|
||||
|
||||
@@ -1045,12 +1036,6 @@ def _launch_tui(
|
||||
)
|
||||
env.setdefault("HERMES_PYTHON", sys.executable)
|
||||
env.setdefault("HERMES_CWD", os.getcwd())
|
||||
if model:
|
||||
env["HERMES_MODEL"] = model
|
||||
env["HERMES_INFERENCE_MODEL"] = model
|
||||
if provider:
|
||||
env["HERMES_TUI_PROVIDER"] = provider
|
||||
env["HERMES_INFERENCE_PROVIDER"] = provider
|
||||
# Guarantee an 8GB V8 heap + exposed GC for the TUI. Default node cap is
|
||||
# ~1.5–4GB depending on version and can fatal-OOM on long sessions with
|
||||
# large transcripts / reasoning blobs. Token-level merge: respect any
|
||||
@@ -1189,8 +1174,6 @@ def cmd_chat(args):
|
||||
_launch_tui(
|
||||
getattr(args, "resume", None),
|
||||
tui_dev=getattr(args, "tui_dev", False),
|
||||
model=getattr(args, "model", None),
|
||||
provider=getattr(args, "provider", None),
|
||||
)
|
||||
|
||||
# Import and run the CLI
|
||||
@@ -1342,9 +1325,7 @@ def cmd_whatsapp(args):
|
||||
return
|
||||
|
||||
if not (bridge_dir / "node_modules").exists():
|
||||
print(
|
||||
"\n→ Installing WhatsApp bridge dependencies (this can take a few minutes)..."
|
||||
)
|
||||
print("\n→ Installing WhatsApp bridge dependencies (this can take a few minutes)...")
|
||||
npm = shutil.which("npm")
|
||||
if not npm:
|
||||
print(" ✗ npm not found on PATH — install Node.js first")
|
||||
@@ -1720,14 +1701,15 @@ def _clear_stale_openai_base_url():
|
||||
|
||||
# (task_key, display_name, short_description)
|
||||
_AUX_TASKS: list[tuple[str, str, str]] = [
|
||||
("vision", "Vision", "image/screenshot analysis"),
|
||||
("compression", "Compression", "context summarization"),
|
||||
("web_extract", "Web extract", "web page summarization"),
|
||||
("session_search", "Session search", "past-conversation recall"),
|
||||
("approval", "Approval", "smart command approval"),
|
||||
("mcp", "MCP", "MCP tool reasoning"),
|
||||
("vision", "Vision", "image/screenshot analysis"),
|
||||
("compression", "Compression", "context summarization"),
|
||||
("web_extract", "Web extract", "web page summarization"),
|
||||
("session_search", "Session search", "past-conversation recall"),
|
||||
("approval", "Approval", "smart command approval"),
|
||||
("mcp", "MCP", "MCP tool reasoning"),
|
||||
("flush_memories", "Flush memories", "memory consolidation"),
|
||||
("title_generation", "Title generation", "session titles"),
|
||||
("skills_hub", "Skills hub", "skills search/install"),
|
||||
("skills_hub", "Skills hub", "skills search/install"),
|
||||
]
|
||||
|
||||
|
||||
@@ -1826,7 +1808,7 @@ def _aux_config_menu() -> None:
|
||||
print(" Auxiliary models — side-task routing")
|
||||
print()
|
||||
print(" Side tasks (vision, compression, web extraction, etc.) default")
|
||||
print(' to your main chat model. "auto" means "use my main model" —')
|
||||
print(" to your main chat model. \"auto\" means \"use my main model\" —")
|
||||
print(" Hermes only falls back to a lightweight backend (OpenRouter,")
|
||||
print(" Nous Portal) if the main model is unavailable. Override a")
|
||||
print(" task below if you want it pinned to a specific provider/model.")
|
||||
@@ -1837,20 +1819,15 @@ def _aux_config_menu() -> None:
|
||||
desc_col = max(len(desc) for _, _, desc in _AUX_TASKS) + 4
|
||||
entries: list[tuple[str, str]] = []
|
||||
for task_key, name, desc in _AUX_TASKS:
|
||||
task_cfg = (
|
||||
aux.get(task_key, {}) if isinstance(aux.get(task_key), dict) else {}
|
||||
)
|
||||
task_cfg = aux.get(task_key, {}) if isinstance(aux.get(task_key), dict) else {}
|
||||
current = _format_aux_current(task_cfg)
|
||||
label = (
|
||||
f"{name.ljust(name_col)}{('(' + desc + ')').ljust(desc_col)}{current}"
|
||||
)
|
||||
label = f"{name.ljust(name_col)}{('(' + desc + ')').ljust(desc_col)}{current}"
|
||||
entries.append((task_key, label))
|
||||
entries.append(("__reset__", "Reset all to auto"))
|
||||
entries.append(("__back__", "Back"))
|
||||
entries.append(("__back__", "Back"))
|
||||
|
||||
idx = _prompt_provider_choice(
|
||||
[label for _, label in entries],
|
||||
default=0,
|
||||
[label for _, label in entries], default=0,
|
||||
)
|
||||
if idx is None:
|
||||
return
|
||||
@@ -1898,9 +1875,7 @@ def _aux_select_for_task(task: str) -> None:
|
||||
|
||||
entries: list[tuple[str, str, list[str]]] = [] # (slug, label, models)
|
||||
# "auto" always first
|
||||
auto_marker = (
|
||||
" ← current" if current_provider == "auto" and not current_base_url else ""
|
||||
)
|
||||
auto_marker = " ← current" if current_provider == "auto" and not current_base_url else ""
|
||||
entries.append(("__auto__", f"auto (recommended){auto_marker}", []))
|
||||
|
||||
for p in providers:
|
||||
@@ -1909,9 +1884,7 @@ def _aux_select_for_task(task: str) -> None:
|
||||
total = p.get("total_models", 0)
|
||||
models = p.get("models") or []
|
||||
model_hint = f" — {total} models" if total else ""
|
||||
marker = (
|
||||
" ← current" if slug == current_provider and not current_base_url else ""
|
||||
)
|
||||
marker = " ← current" if slug == current_provider and not current_base_url else ""
|
||||
entries.append((slug, f"{name}{model_hint}{marker}", list(models)))
|
||||
|
||||
# Custom endpoint (raw base_url)
|
||||
@@ -1979,17 +1952,14 @@ def _aux_flow_provider_model(
|
||||
selected = val or ""
|
||||
else:
|
||||
selected = _prompt_model_selection(
|
||||
model_list,
|
||||
current_model=current_model,
|
||||
pricing=pricing,
|
||||
model_list, current_model=current_model, pricing=pricing,
|
||||
)
|
||||
if selected is None:
|
||||
print("No change.")
|
||||
return
|
||||
|
||||
_save_aux_choice(
|
||||
task, provider=provider_slug, model=selected or "", base_url="", api_key=""
|
||||
)
|
||||
_save_aux_choice(task, provider=provider_slug, model=selected or "",
|
||||
base_url="", api_key="")
|
||||
if selected:
|
||||
print(f"{display_name}: {provider_slug} · {selected}")
|
||||
else:
|
||||
@@ -2009,9 +1979,7 @@ def _aux_flow_custom_endpoint(task: str, task_cfg: dict) -> None:
|
||||
print(" Provide an OpenAI-compatible base URL (e.g. http://localhost:11434/v1)")
|
||||
print()
|
||||
try:
|
||||
url_prompt = (
|
||||
f"Base URL [{current_base_url}]: " if current_base_url else "Base URL: "
|
||||
)
|
||||
url_prompt = f"Base URL [{current_base_url}]: " if current_base_url else "Base URL: "
|
||||
url = input(url_prompt).strip()
|
||||
except (KeyboardInterrupt, EOFError):
|
||||
print()
|
||||
@@ -2021,30 +1989,20 @@ def _aux_flow_custom_endpoint(task: str, task_cfg: dict) -> None:
|
||||
print("No URL provided. No change.")
|
||||
return
|
||||
try:
|
||||
model_prompt = (
|
||||
f"Model slug (optional) [{current_model}]: "
|
||||
if current_model
|
||||
else "Model slug (optional): "
|
||||
)
|
||||
model_prompt = f"Model slug (optional) [{current_model}]: " if current_model else "Model slug (optional): "
|
||||
model = input(model_prompt).strip()
|
||||
except (KeyboardInterrupt, EOFError):
|
||||
print()
|
||||
return
|
||||
model = model or current_model
|
||||
try:
|
||||
api_key = getpass.getpass(
|
||||
"API key (optional, blank = use OPENAI_API_KEY): "
|
||||
).strip()
|
||||
api_key = getpass.getpass("API key (optional, blank = use OPENAI_API_KEY): ").strip()
|
||||
except (KeyboardInterrupt, EOFError):
|
||||
print()
|
||||
return
|
||||
|
||||
_save_aux_choice(
|
||||
task,
|
||||
provider="custom",
|
||||
model=model,
|
||||
base_url=url,
|
||||
api_key=api_key,
|
||||
task, provider="custom", model=model, base_url=url, api_key=api_key,
|
||||
)
|
||||
short_url = url.replace("https://", "").replace("http://", "").rstrip("/")
|
||||
print(f"{display_name}: custom ({short_url})" + (f" · {model}" if model else ""))
|
||||
@@ -2160,9 +2118,7 @@ def _model_flow_ai_gateway(config, current_model=""):
|
||||
api_key = get_env_value("AI_GATEWAY_API_KEY")
|
||||
if not api_key:
|
||||
print("No Vercel AI Gateway API key configured.")
|
||||
print(
|
||||
"Create API key here: https://vercel.com/d?to=%2F%5Bteam%5D%2F%7E%2Fai-gateway&title=AI+Gateway"
|
||||
)
|
||||
print("Create API key here: https://vercel.com/d?to=%2F%5Bteam%5D%2F%7E%2Fai-gateway&title=AI+Gateway")
|
||||
print("Add a payment method to get $5 in free credits.")
|
||||
print()
|
||||
try:
|
||||
@@ -2962,9 +2918,7 @@ def _model_flow_named_custom(config, provider_info):
|
||||
|
||||
print("Fetching available models...")
|
||||
models = fetch_api_models(
|
||||
api_key,
|
||||
base_url,
|
||||
timeout=8.0,
|
||||
api_key, base_url, timeout=8.0,
|
||||
api_mode=api_mode or None,
|
||||
)
|
||||
|
||||
@@ -3635,12 +3589,7 @@ def _model_flow_stepfun(config, current_model=""):
|
||||
_save_model_choice,
|
||||
deactivate_provider,
|
||||
)
|
||||
from hermes_cli.config import (
|
||||
get_env_value,
|
||||
save_env_value,
|
||||
load_config,
|
||||
save_config,
|
||||
)
|
||||
from hermes_cli.config import get_env_value, save_env_value, load_config, save_config
|
||||
from hermes_cli.models import fetch_api_models
|
||||
|
||||
provider_id = "stepfun"
|
||||
@@ -3659,7 +3608,6 @@ def _model_flow_stepfun(config, current_model=""):
|
||||
if key_env:
|
||||
try:
|
||||
import getpass
|
||||
|
||||
new_key = getpass.getpass(f"{key_env} (or Enter to cancel): ").strip()
|
||||
except (KeyboardInterrupt, EOFError):
|
||||
print()
|
||||
@@ -3685,10 +3633,7 @@ def _model_flow_stepfun(config, current_model=""):
|
||||
current_region = _infer_stepfun_region(current_base or pconfig.inference_base_url)
|
||||
|
||||
region_choices = [
|
||||
(
|
||||
"international",
|
||||
f"International ({_stepfun_base_url_for_region('international')})",
|
||||
),
|
||||
("international", f"International ({_stepfun_base_url_for_region('international')})"),
|
||||
("china", f"China ({_stepfun_base_url_for_region('china')})"),
|
||||
]
|
||||
ordered_regions = []
|
||||
@@ -4531,7 +4476,6 @@ def cmd_webhook(args):
|
||||
def cmd_hooks(args):
|
||||
"""Shell-hook inspection and management."""
|
||||
from hermes_cli.hooks import hooks_command
|
||||
|
||||
hooks_command(args)
|
||||
|
||||
|
||||
@@ -6102,86 +6046,6 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||
)
|
||||
import signal as _signal
|
||||
|
||||
def _wait_for_service_active(
|
||||
scope_cmd_: list,
|
||||
svc_name_: str,
|
||||
timeout: float = 10.0,
|
||||
) -> bool:
|
||||
"""Poll ``systemctl is-active`` until the unit reports active.
|
||||
|
||||
systemd's Stopped -> Started transition after a graceful exit
|
||||
(or a hard restart) is not instantaneous; a one-shot check
|
||||
races that window and falsely reports the unit as down.
|
||||
Poll every 0.5s up to ``timeout`` seconds before giving up.
|
||||
"""
|
||||
deadline = _time.monotonic() + max(timeout, 0.5)
|
||||
while True:
|
||||
try:
|
||||
_verify = subprocess.run(
|
||||
scope_cmd_ + ["is-active", svc_name_],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if _verify.stdout.strip() == "active":
|
||||
return True
|
||||
except (FileNotFoundError, subprocess.TimeoutExpired):
|
||||
pass
|
||||
if _time.monotonic() >= deadline:
|
||||
return False
|
||||
_time.sleep(0.5)
|
||||
|
||||
def _service_restart_sec(
|
||||
scope_cmd_: list,
|
||||
svc_name_: str,
|
||||
default: float = 0.0,
|
||||
) -> float:
|
||||
"""Read the unit's ``RestartUSec`` (RestartSec) in seconds.
|
||||
|
||||
After a graceful exit-75, systemd waits ``RestartSec`` before
|
||||
respawning the unit. Callers that poll for ``is-active``
|
||||
must use a timeout >= ``RestartSec`` + transition slack, or
|
||||
they'll give up *during* the cooldown window and wrongly
|
||||
conclude the unit didn't relaunch.
|
||||
"""
|
||||
try:
|
||||
_show = subprocess.run(
|
||||
scope_cmd_
|
||||
+ [
|
||||
"show",
|
||||
svc_name_,
|
||||
"--property=RestartUSec",
|
||||
"--value",
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
except (FileNotFoundError, subprocess.TimeoutExpired):
|
||||
return default
|
||||
raw = (_show.stdout or "").strip()
|
||||
# systemd emits values like "30s", "100ms", "1min 30s", or
|
||||
# "infinity". Parse conservatively; on any miss return default.
|
||||
if not raw or raw == "infinity":
|
||||
return default
|
||||
total = 0.0
|
||||
matched = False
|
||||
for part in raw.split():
|
||||
for _suf, _mult in (
|
||||
("ms", 0.001),
|
||||
("us", 0.000001),
|
||||
("min", 60.0),
|
||||
("s", 1.0),
|
||||
):
|
||||
if part.endswith(_suf):
|
||||
try:
|
||||
total += float(part[: -len(_suf)]) * _mult
|
||||
matched = True
|
||||
except ValueError:
|
||||
pass
|
||||
break
|
||||
return total if matched else default
|
||||
|
||||
# Drain budget for graceful SIGUSR1 restarts. The gateway drains
|
||||
# for up to ``agent.restart_drain_timeout`` (default 60s) before
|
||||
# exiting with code 75; we wait slightly longer so the drain
|
||||
@@ -6197,17 +6061,12 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||
_cfg_drain = None
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
|
||||
_cfg_agent = load_config().get("agent") or {}
|
||||
_cfg_agent = (load_config().get("agent") or {})
|
||||
_cfg_drain = _cfg_agent.get("restart_drain_timeout")
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
_drain_budget = (
|
||||
float(_cfg_drain)
|
||||
if _cfg_drain is not None
|
||||
else float(_DEFAULT_DRAIN)
|
||||
)
|
||||
_drain_budget = float(_cfg_drain) if _cfg_drain is not None else float(_DEFAULT_DRAIN)
|
||||
except (TypeError, ValueError):
|
||||
_drain_budget = float(_DEFAULT_DRAIN)
|
||||
# Add a 15s margin so the drain loop + final exit finish before
|
||||
@@ -6272,23 +6131,14 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||
_main_pid = 0
|
||||
try:
|
||||
_show = subprocess.run(
|
||||
scope_cmd
|
||||
+ [
|
||||
"show",
|
||||
svc_name,
|
||||
"--property=MainPID",
|
||||
"--value",
|
||||
scope_cmd + [
|
||||
"show", svc_name,
|
||||
"--property=MainPID", "--value",
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
_main_pid = int((_show.stdout or "").strip() or 0)
|
||||
except (
|
||||
ValueError,
|
||||
subprocess.TimeoutExpired,
|
||||
FileNotFoundError,
|
||||
):
|
||||
except (ValueError, subprocess.TimeoutExpired, FileNotFoundError):
|
||||
_main_pid = 0
|
||||
|
||||
_graceful_ok = False
|
||||
@@ -6297,33 +6147,19 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||
f" → {svc_name}: draining (up to {int(_drain_budget)}s)..."
|
||||
)
|
||||
_graceful_ok = _graceful_restart_via_sigusr1(
|
||||
_main_pid,
|
||||
drain_timeout=_drain_budget,
|
||||
_main_pid, drain_timeout=_drain_budget,
|
||||
)
|
||||
|
||||
if _graceful_ok:
|
||||
# Gateway exited 75; systemd should relaunch
|
||||
# via Restart=on-failure. The unit's
|
||||
# RestartSec (default 30s on ours) gates the
|
||||
# respawn — poll past that + slack so we
|
||||
# don't give up mid-cooldown and falsely
|
||||
# print "drained but didn't relaunch". For
|
||||
# units without RestartSec set we fall back
|
||||
# to the original 10s budget.
|
||||
_restart_sec = _service_restart_sec(
|
||||
scope_cmd,
|
||||
svc_name,
|
||||
default=0.0,
|
||||
# via Restart=on-failure. Verify the new
|
||||
# process came up.
|
||||
_time.sleep(3)
|
||||
verify = subprocess.run(
|
||||
scope_cmd + ["is-active", svc_name],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
_post_drain_timeout = max(
|
||||
10.0,
|
||||
_restart_sec + 10.0,
|
||||
)
|
||||
if _wait_for_service_active(
|
||||
scope_cmd,
|
||||
svc_name,
|
||||
timeout=_post_drain_timeout,
|
||||
):
|
||||
if verify.stdout.strip() == "active":
|
||||
restarted_services.append(svc_name)
|
||||
continue
|
||||
# Process exited but wasn't respawned (older
|
||||
@@ -6349,11 +6185,14 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||
# Verify the service actually survived the
|
||||
# restart. systemctl restart returns 0 even
|
||||
# if the new process crashes immediately.
|
||||
if _wait_for_service_active(
|
||||
scope_cmd,
|
||||
svc_name,
|
||||
timeout=10.0,
|
||||
):
|
||||
_time.sleep(3)
|
||||
verify = subprocess.run(
|
||||
scope_cmd + ["is-active", svc_name],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if verify.stdout.strip() == "active":
|
||||
restarted_services.append(svc_name)
|
||||
else:
|
||||
# Retry once — transient startup failures
|
||||
@@ -6368,11 +6207,14 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||
text=True,
|
||||
timeout=15,
|
||||
)
|
||||
if _wait_for_service_active(
|
||||
scope_cmd,
|
||||
svc_name,
|
||||
timeout=10.0,
|
||||
):
|
||||
_time.sleep(3)
|
||||
verify2 = subprocess.run(
|
||||
scope_cmd + ["is-active", svc_name],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if verify2.stdout.strip() == "active":
|
||||
restarted_services.append(svc_name)
|
||||
print(f" ✓ {svc_name} recovered on retry")
|
||||
else:
|
||||
@@ -6873,15 +6715,9 @@ def cmd_dashboard(args):
|
||||
try:
|
||||
import fastapi # noqa: F401
|
||||
import uvicorn # noqa: F401
|
||||
except ImportError as e:
|
||||
print("Web UI dependencies not installed (need fastapi + uvicorn).")
|
||||
print(
|
||||
f"Re-install the package into this interpreter so metadata updates apply:\n"
|
||||
f" cd {PROJECT_ROOT}\n"
|
||||
f" {sys.executable} -m pip install -e .\n"
|
||||
"If `pip` is missing in this venv, use: uv pip install -e ."
|
||||
)
|
||||
print(f"Import error: {e}")
|
||||
except ImportError:
|
||||
print("Web UI dependencies not installed.")
|
||||
print(f"Install them with: {sys.executable} -m pip install 'fastapi' 'uvicorn[standard]'")
|
||||
sys.exit(1)
|
||||
|
||||
if "HERMES_WEB_DIST" not in os.environ:
|
||||
@@ -6890,17 +6726,11 @@ def cmd_dashboard(args):
|
||||
|
||||
from hermes_cli.web_server import start_server
|
||||
|
||||
gui_mode = getattr(args, "gui", False)
|
||||
embedded_chat = (
|
||||
gui_mode or args.tui or os.environ.get("HERMES_DASHBOARD_TUI") == "1"
|
||||
)
|
||||
start_server(
|
||||
host=args.host,
|
||||
port=args.port,
|
||||
open_browser=not args.no_open,
|
||||
allow_public=getattr(args, "insecure", False),
|
||||
embedded_chat=embedded_chat,
|
||||
gui_mode=gui_mode,
|
||||
)
|
||||
|
||||
|
||||
@@ -6983,40 +6813,6 @@ For more help on a command:
|
||||
parser.add_argument(
|
||||
"--version", "-V", action="store_true", help="Show version and exit"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-z",
|
||||
"--oneshot",
|
||||
metavar="PROMPT",
|
||||
default=None,
|
||||
help=(
|
||||
"One-shot mode: send a single prompt and print ONLY the final "
|
||||
"response text to stdout. No banner, no spinner, no tool "
|
||||
"previews, no session_id line. Tools, memory, rules, and "
|
||||
"AGENTS.md in the CWD are loaded as normal; approvals are "
|
||||
"auto-bypassed. Intended for scripts / pipes."
|
||||
),
|
||||
)
|
||||
# --model / --provider are accepted at the top level so they can pair
|
||||
# with -z without needing the `chat` subcommand. If neither -z nor a
|
||||
# subcommand consumes them, they fall through harmlessly as None.
|
||||
# Mirrors `hermes chat --model ... --provider ...` semantics.
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--model",
|
||||
default=None,
|
||||
help=(
|
||||
"Model override for this invocation (e.g. anthropic/claude-sonnet-4.6). "
|
||||
"Applies to -z/--oneshot and --tui. Also settable via HERMES_INFERENCE_MODEL env var."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--provider",
|
||||
default=None,
|
||||
help=(
|
||||
"Provider override for this invocation (e.g. openrouter, anthropic). "
|
||||
"Applies to -z/--oneshot and --tui. Also settable via HERMES_INFERENCE_PROVIDER env var."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--resume",
|
||||
"-r",
|
||||
@@ -7594,39 +7390,17 @@ For more help on a command:
|
||||
"reset", help="Clear exhaustion status for all credentials for a provider"
|
||||
)
|
||||
auth_reset.add_argument("provider", help="Provider id")
|
||||
auth_status = auth_subparsers.add_parser(
|
||||
"status", help="Show auth status for a provider"
|
||||
)
|
||||
auth_status = auth_subparsers.add_parser("status", help="Show auth status for a provider")
|
||||
auth_status.add_argument("provider", help="Provider id")
|
||||
auth_logout = auth_subparsers.add_parser(
|
||||
"logout", help="Log out a provider and clear stored auth state"
|
||||
)
|
||||
auth_logout = auth_subparsers.add_parser("logout", help="Log out a provider and clear stored auth state")
|
||||
auth_logout.add_argument("provider", help="Provider id")
|
||||
auth_spotify = auth_subparsers.add_parser(
|
||||
"spotify", help="Authenticate Hermes with Spotify via PKCE"
|
||||
)
|
||||
auth_spotify.add_argument(
|
||||
"spotify_action",
|
||||
nargs="?",
|
||||
choices=["login", "status", "logout"],
|
||||
default="login",
|
||||
)
|
||||
auth_spotify.add_argument(
|
||||
"--client-id", help="Spotify app client_id (or set HERMES_SPOTIFY_CLIENT_ID)"
|
||||
)
|
||||
auth_spotify.add_argument(
|
||||
"--redirect-uri",
|
||||
help="Allow-listed localhost redirect URI for your Spotify app",
|
||||
)
|
||||
auth_spotify = auth_subparsers.add_parser("spotify", help="Authenticate Hermes with Spotify via PKCE")
|
||||
auth_spotify.add_argument("spotify_action", nargs="?", choices=["login", "status", "logout"], default="login")
|
||||
auth_spotify.add_argument("--client-id", help="Spotify app client_id (or set HERMES_SPOTIFY_CLIENT_ID)")
|
||||
auth_spotify.add_argument("--redirect-uri", help="Allow-listed localhost redirect URI for your Spotify app")
|
||||
auth_spotify.add_argument("--scope", help="Override requested Spotify scopes")
|
||||
auth_spotify.add_argument(
|
||||
"--no-browser",
|
||||
action="store_true",
|
||||
help="Do not attempt to open the browser automatically",
|
||||
)
|
||||
auth_spotify.add_argument(
|
||||
"--timeout", type=float, help="Callback/token exchange timeout in seconds"
|
||||
)
|
||||
auth_spotify.add_argument("--no-browser", action="store_true", help="Do not attempt to open the browser automatically")
|
||||
auth_spotify.add_argument("--timeout", type=float, help="Callback/token exchange timeout in seconds")
|
||||
auth_parser.set_defaults(func=cmd_auth)
|
||||
|
||||
# =========================================================================
|
||||
@@ -7836,8 +7610,7 @@ For more help on a command:
|
||||
hooks_subparsers = hooks_parser.add_subparsers(dest="hooks_action")
|
||||
|
||||
hooks_subparsers.add_parser(
|
||||
"list",
|
||||
aliases=["ls"],
|
||||
"list", aliases=["ls"],
|
||||
help="List configured hooks with matcher, timeout, and consent status",
|
||||
)
|
||||
|
||||
@@ -7850,18 +7623,14 @@ For more help on a command:
|
||||
help="Hook event name (e.g. pre_tool_call, pre_llm_call, subagent_stop)",
|
||||
)
|
||||
_hk_test.add_argument(
|
||||
"--for-tool",
|
||||
dest="for_tool",
|
||||
default=None,
|
||||
"--for-tool", dest="for_tool", default=None,
|
||||
help=(
|
||||
"Only fire hooks whose matcher matches this tool name "
|
||||
"(used for pre_tool_call / post_tool_call)"
|
||||
),
|
||||
)
|
||||
_hk_test.add_argument(
|
||||
"--payload-file",
|
||||
dest="payload_file",
|
||||
default=None,
|
||||
"--payload-file", dest="payload_file", default=None,
|
||||
help=(
|
||||
"Path to a JSON file whose contents are merged into the "
|
||||
"synthetic payload before execution"
|
||||
@@ -7869,8 +7638,7 @@ For more help on a command:
|
||||
)
|
||||
|
||||
_hk_revoke = hooks_subparsers.add_parser(
|
||||
"revoke",
|
||||
aliases=["remove", "rm"],
|
||||
"revoke", aliases=["remove", "rm"],
|
||||
help="Remove a command's allowlist entries (takes effect on next restart)",
|
||||
)
|
||||
_hk_revoke.add_argument(
|
||||
@@ -9148,19 +8916,6 @@ Examples:
|
||||
action="store_true",
|
||||
help="Allow binding to non-localhost (DANGEROUS: exposes API keys on the network)",
|
||||
)
|
||||
dashboard_parser.add_argument(
|
||||
"--tui",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Expose the in-browser Chat tab (embedded `hermes --tui` via PTY/WebSocket). "
|
||||
"Alternatively set HERMES_DASHBOARD_TUI=1."
|
||||
),
|
||||
)
|
||||
dashboard_parser.add_argument(
|
||||
"--gui",
|
||||
action="store_true",
|
||||
help="Run dashboard in GUI-shell mode; implies --tui",
|
||||
)
|
||||
dashboard_parser.set_defaults(func=cmd_dashboard)
|
||||
|
||||
# =========================================================================
|
||||
@@ -9303,28 +9058,26 @@ Examples:
|
||||
# the nested subcommand (dest varies by parser).
|
||||
_AGENT_COMMANDS = {None, "chat", "acp", "rl"}
|
||||
_AGENT_SUBCOMMANDS = {
|
||||
"cron": ("cron_command", {"run", "tick"}),
|
||||
"cron": ("cron_command", {"run", "tick"}),
|
||||
"gateway": ("gateway_command", {"run"}),
|
||||
"mcp": ("mcp_action", {"serve"}),
|
||||
"mcp": ("mcp_action", {"serve"}),
|
||||
}
|
||||
_sub_attr, _sub_set = _AGENT_SUBCOMMANDS.get(args.command, (None, None))
|
||||
if args.command in _AGENT_COMMANDS or (
|
||||
_sub_attr and getattr(args, _sub_attr, None) in _sub_set
|
||||
if (
|
||||
args.command in _AGENT_COMMANDS
|
||||
or (_sub_attr and getattr(args, _sub_attr, None) in _sub_set)
|
||||
):
|
||||
_accept_hooks = bool(getattr(args, "accept_hooks", False))
|
||||
try:
|
||||
from hermes_cli.plugins import discover_plugins
|
||||
|
||||
discover_plugins()
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"plugin discovery failed at CLI startup",
|
||||
exc_info=True,
|
||||
"plugin discovery failed at CLI startup", exc_info=True,
|
||||
)
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
from agent.shell_hooks import register_from_config
|
||||
|
||||
register_from_config(load_config(), accept_hooks=_accept_hooks)
|
||||
except Exception:
|
||||
logger.debug(
|
||||
@@ -9332,19 +9085,6 @@ Examples:
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
# Handle top-level --oneshot / -z: single-shot mode, stdout = final
|
||||
# response only, nothing else. Bypasses cli.py entirely.
|
||||
if getattr(args, "oneshot", None):
|
||||
from hermes_cli.oneshot import run_oneshot
|
||||
|
||||
sys.exit(
|
||||
run_oneshot(
|
||||
args.oneshot,
|
||||
model=getattr(args, "model", None),
|
||||
provider=getattr(args, "provider", None),
|
||||
)
|
||||
)
|
||||
|
||||
# Handle top-level --resume / --continue as shortcut to chat
|
||||
if (args.resume or args.continue_last) and args.command is None:
|
||||
args.command = "chat"
|
||||
|
||||
@@ -527,42 +527,6 @@ def _resolve_alias_fallback(
|
||||
return None
|
||||
|
||||
|
||||
def resolve_display_context_length(
    model: str,
    provider: str,
    base_url: str = "",
    api_key: str = "",
    model_info: Optional[ModelInfo] = None,
) -> Optional[int]:
    """Resolve the context length to show in /model output.

    models.dev reports per-vendor context (e.g. gpt-5.5 = 1.05M on openai)
    but provider-enforced limits can be lower (e.g. Codex OAuth caps the
    same slug at 272k).  The provider-aware value from
    ``agent.model_metadata.get_model_context_length`` is preferred;
    ``model_info.context_window`` is used only when that resolver yields
    nothing (or fails).

    Args:
        model: Model slug to look up.
        provider: Provider id; an empty value is passed to the resolver as
            ``None``.
        base_url: Endpoint base URL forwarded to the resolver.
        api_key: API key forwarded to the resolver.
        model_info: Optional catalog record whose ``context_window`` serves
            as the fallback.

    Returns:
        The context length as an ``int``, or ``None`` when neither source
        provides a usable value.
    """
    try:
        from agent.model_metadata import get_model_context_length

        ctx = get_model_context_length(
            model,
            base_url=base_url or "",
            api_key=api_key or "",
            provider=provider or None,
        )
        if ctx:
            return int(ctx)
    except Exception:
        # Best-effort lookup: any resolver failure (including the import
        # itself) falls through to the catalog value below.
        pass

    if model_info is not None and model_info.context_window:
        try:
            return int(model_info.context_window)
        except (TypeError, ValueError):
            # A malformed catalog value must not break the display path.
            return None
    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core model-switching pipeline
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
+77
-128
@@ -42,7 +42,7 @@ OPENROUTER_MODELS: list[tuple[str, str]] = [
|
||||
("anthropic/claude-sonnet-4.5", ""),
|
||||
("anthropic/claude-haiku-4.5", ""),
|
||||
("openrouter/elephant-alpha", "free"),
|
||||
("openai/gpt-5.5", ""),
|
||||
("openai/gpt-5.4", ""),
|
||||
("openai/gpt-5.4-mini", ""),
|
||||
("xiaomi/mimo-v2.5-pro", ""),
|
||||
("xiaomi/mimo-v2.5", ""),
|
||||
@@ -65,7 +65,7 @@ OPENROUTER_MODELS: list[tuple[str, str]] = [
|
||||
("nvidia/nemotron-3-super-120b-a12b:free", "free"),
|
||||
("arcee-ai/trinity-large-preview:free", "free"),
|
||||
("arcee-ai/trinity-large-thinking", ""),
|
||||
("openai/gpt-5.5-pro", ""),
|
||||
("openai/gpt-5.4-pro", ""),
|
||||
("openai/gpt-5.4-nano", ""),
|
||||
]
|
||||
|
||||
@@ -120,7 +120,7 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
|
||||
"anthropic/claude-sonnet-4.6",
|
||||
"anthropic/claude-sonnet-4.5",
|
||||
"anthropic/claude-haiku-4.5",
|
||||
"openai/gpt-5.5",
|
||||
"openai/gpt-5.4",
|
||||
"openai/gpt-5.4-mini",
|
||||
"openai/gpt-5.3-codex",
|
||||
"google/gemini-3-pro-preview",
|
||||
@@ -139,7 +139,7 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
|
||||
"x-ai/grok-4.20-beta",
|
||||
"nvidia/nemotron-3-super-120b-a12b",
|
||||
"arcee-ai/trinity-large-thinking",
|
||||
"openai/gpt-5.5-pro",
|
||||
"openai/gpt-5.4-pro",
|
||||
"openai/gpt-5.4-nano",
|
||||
],
|
||||
# Native OpenAI Chat Completions (api.openai.com). Used by /model counts and
|
||||
@@ -1379,124 +1379,6 @@ def curated_models_for_provider(
|
||||
return [(m, "") for m in models]
|
||||
|
||||
|
||||
def _provider_keys(provider: str) -> set[str]:
    """Return the non-empty lookup keys for *provider*.

    The candidates are the stripped, lower-cased raw value and whatever
    ``normalize_provider`` returns for it; falsy candidates are dropped.
    """
    candidates = (
        (provider or "").strip().lower(),
        normalize_provider(provider),
    )
    return {candidate for candidate in candidates if candidate}
|
||||
|
||||
|
||||
def _model_in_provider_catalog(name_lower: str, providers: set[str]) -> bool:
    """Tell whether *name_lower* appears in any of *providers*' static catalogs.

    Catalog entries are lower-cased before comparison; *name_lower* is
    compared as given.  Providers without a ``_PROVIDER_MODELS`` entry
    contribute nothing.
    """
    for provider_id in providers:
        for catalog_model in _PROVIDER_MODELS.get(provider_id, []):
            if name_lower == catalog_model.lower():
                return True
    return False
|
||||
|
||||
|
||||
# Provider ids treated as aggregators by the static detection helpers.
_AGGREGATOR_PROVIDERS = frozenset({"nous", "openrouter", "ai-gateway", "copilot", "kilocode"})
|
||||
|
||||
|
||||
def _resolve_static_model_alias(
    name_lower: str,
    current_keys: set[str],
) -> Optional[tuple[str, str]]:
    """Resolve short aliases (e.g. sonnet/opus) using static catalogs only.

    Args:
        name_lower: Lower-cased alias typed by the user.
        current_keys: Lookup keys of the currently active provider.

    Returns:
        ``(provider_id, model)`` for the first catalog entry matching the
        alias, or ``None`` when the alias table cannot be imported, the
        alias is unknown, or no catalog has a matching entry.
    """
    try:
        from hermes_cli.model_switch import MODEL_ALIASES
    except Exception:
        return None

    identity = MODEL_ALIASES.get(name_lower)
    if identity is None:
        return None

    vendor = identity.vendor
    family = identity.family

    def _match(provider: str) -> Optional[str]:
        """Return the first model in *provider*'s catalog matching the alias."""
        models = _PROVIDER_MODELS.get(provider, [])
        if not models:
            return None
        # Aggregator catalogs are matched on "vendor/family"; every other
        # catalog is matched on the bare family prefix.
        prefix = (
            f"{vendor}/{family}"
            if provider in _AGGREGATOR_PROVIDERS
            else family
        ).lower()
        for model in models:
            if model.lower().startswith(prefix):
                return model
        return None

    # 1. The active provider (aggregator or not) gets first refusal.
    for provider in current_keys:
        if matched := _match(provider):
            return provider, matched

    # 2. Any other non-aggregator provider, in catalog order.
    for provider in _PROVIDER_MODELS:
        if provider in current_keys or provider in _AGGREGATOR_PROVIDERS:
            continue
        if matched := _match(provider):
            return provider, matched

    # A further pass over aggregators that are also in ``current_keys``
    # would be dead code: step 1 has already tried every current key.
    return None
|
||||
|
||||
|
||||
def detect_static_provider_for_model(
    model_name: str,
    current_provider: str,
) -> Optional[tuple[str, str]]:
    """Auto-detect a provider from static catalogs only.

    Returns ``(provider_id, model_name)``.  The model name may be remapped
    when a static alias or bare provider name resolves to a catalog default.
    Returns ``None`` when no confident match is found.
    """
    requested = (model_name or "").strip()
    if not requested:
        return None

    requested_lower = requested.lower()
    active_keys = _provider_keys(current_provider)

    # Short aliases are tried before anything else.
    via_alias = _resolve_static_model_alias(requested_lower, active_keys)
    if via_alias:
        return via_alias

    # --- Step 0: bare provider name typed as model ---
    # If someone types `/model nous` or `/model anthropic`, treat it as a
    # provider switch and pick the first model from that provider's catalog.
    # Skip "custom" and "openrouter" — custom has no model catalog, and
    # openrouter requires an explicit model name to be useful.
    provider_id = _PROVIDER_ALIASES.get(requested_lower, requested_lower)
    if provider_id not in {"custom", "openrouter"}:
        catalog = _PROVIDER_MODELS.get(provider_id, [])
        if provider_id in _PROVIDER_LABELS and catalog and provider_id not in active_keys:
            return (provider_id, catalog[0])

    # If the model belongs to the current provider's catalog, don't suggest
    # switching.
    if _model_in_provider_catalog(requested_lower, active_keys):
        return None

    # --- Step 1: check static provider catalogs for a direct match ---
    # Aggregators list other providers' models — never auto-switch TO them.
    for candidate, candidate_models in _PROVIDER_MODELS.items():
        if candidate in active_keys or candidate in _AGGREGATOR_PROVIDERS:
            continue
        for entry in candidate_models:
            if requested_lower == entry.lower():
                return (candidate, requested)

    return None
|
||||
|
||||
|
||||
def detect_provider_for_model(
|
||||
model_name: str,
|
||||
current_provider: str,
|
||||
@@ -1509,19 +1391,86 @@ def detect_provider_for_model(
|
||||
|
||||
Priority:
|
||||
0. Bare provider name → switch to that provider's default model
|
||||
1. Direct provider static catalog match
|
||||
2. OpenRouter catalog match
|
||||
1. Direct provider with credentials (highest)
|
||||
2. Direct provider without credentials → remap to OpenRouter slug
|
||||
3. OpenRouter catalog match
|
||||
"""
|
||||
name = (model_name or "").strip()
|
||||
if not name:
|
||||
return None
|
||||
|
||||
static_match = detect_static_provider_for_model(name, current_provider)
|
||||
if static_match:
|
||||
return static_match
|
||||
if _model_in_provider_catalog(name.lower(), _provider_keys(current_provider)):
|
||||
name_lower = name.lower()
|
||||
|
||||
# --- Step 0: bare provider name typed as model ---
|
||||
# If someone types `/model nous` or `/model anthropic`, treat it as a
|
||||
# provider switch and pick the first model from that provider's catalog.
|
||||
# Skip "custom" and "openrouter" — custom has no model catalog, and
|
||||
# openrouter requires an explicit model name to be useful.
|
||||
resolved_provider = _PROVIDER_ALIASES.get(name_lower, name_lower)
|
||||
if resolved_provider not in {"custom", "openrouter"}:
|
||||
default_models = _PROVIDER_MODELS.get(resolved_provider, [])
|
||||
if (
|
||||
resolved_provider in _PROVIDER_LABELS
|
||||
and default_models
|
||||
and resolved_provider != normalize_provider(current_provider)
|
||||
):
|
||||
return (resolved_provider, default_models[0])
|
||||
|
||||
# Aggregators list other providers' models — never auto-switch TO them
|
||||
_AGGREGATORS = {"nous", "openrouter", "ai-gateway", "copilot", "kilocode"}
|
||||
|
||||
# If the model belongs to the current provider's catalog, don't suggest switching
|
||||
current_models = _PROVIDER_MODELS.get(current_provider, [])
|
||||
if any(name_lower == m.lower() for m in current_models):
|
||||
return None
|
||||
|
||||
# --- Step 1: check static provider catalogs for a direct match ---
|
||||
direct_match: Optional[str] = None
|
||||
for pid, models in _PROVIDER_MODELS.items():
|
||||
if pid == current_provider or pid in _AGGREGATORS:
|
||||
continue
|
||||
if any(name_lower == m.lower() for m in models):
|
||||
direct_match = pid
|
||||
break
|
||||
|
||||
if direct_match:
|
||||
# Check if we have credentials for this provider — env vars,
|
||||
# credential pool, or auth store entries.
|
||||
has_creds = False
|
||||
try:
|
||||
from hermes_cli.auth import PROVIDER_REGISTRY
|
||||
pconfig = PROVIDER_REGISTRY.get(direct_match)
|
||||
if pconfig:
|
||||
for env_var in pconfig.api_key_env_vars:
|
||||
if os.getenv(env_var, "").strip():
|
||||
has_creds = True
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
# Also check credential pool and auth store — covers OAuth,
|
||||
# Claude Code tokens, and other non-env-var credentials (#10300).
|
||||
if not has_creds:
|
||||
try:
|
||||
from agent.credential_pool import load_pool
|
||||
pool = load_pool(direct_match)
|
||||
if pool.has_credentials():
|
||||
has_creds = True
|
||||
except Exception:
|
||||
pass
|
||||
if not has_creds:
|
||||
try:
|
||||
from hermes_cli.auth import _load_auth_store
|
||||
store = _load_auth_store()
|
||||
if direct_match in store.get("providers", {}) or direct_match in store.get("credential_pool", {}):
|
||||
has_creds = True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Always return the direct provider match. If credentials are
|
||||
# missing, the client init will give a clear error rather than
|
||||
# silently routing through the wrong provider (#10300).
|
||||
return (direct_match, name)
|
||||
|
||||
# --- Step 2: check OpenRouter catalog ---
|
||||
# First try exact match (handles provider/model format)
|
||||
or_slug = _find_openrouter_slug(name)
|
||||
|
||||
@@ -1,202 +0,0 @@
|
||||
"""Oneshot (-z) mode: send a prompt, get the final content block, exit.
|
||||
|
||||
Bypasses cli.py entirely. No banner, no spinner, no session_id line,
|
||||
no stderr chatter. Just the agent's final text to stdout.
|
||||
|
||||
Toolsets = whatever the user has configured for "cli" in `hermes tools`.
|
||||
Rules / memory / AGENTS.md / preloaded skills = same as a normal chat turn.
|
||||
Approvals = auto-bypassed (HERMES_YOLO_MODE=1 is set for the call).
|
||||
Working directory = the user's CWD (AGENTS.md etc. resolve from there as usual).
|
||||
|
||||
Model / provider selection mirrors `hermes chat`:
|
||||
- Both optional. If omitted, use the user's configured default.
|
||||
- If both given, pair them exactly as given.
|
||||
- If only --model given, auto-detect the provider that serves it.
|
||||
- If only --provider given, error out (ambiguous — caller must pick a model).
|
||||
|
||||
Env var fallbacks (used when the corresponding arg is not passed):
|
||||
- HERMES_INFERENCE_MODEL
|
||||
- HERMES_INFERENCE_PROVIDER (already read by resolve_runtime_provider)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from contextlib import redirect_stderr, redirect_stdout
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def run_oneshot(
    prompt: str,
    model: Optional[str] = None,
    provider: Optional[str] = None,
) -> int:
    """Execute a single prompt and print only the final content block.

    Args:
        prompt: The user message to send.
        model: Optional model override.  Falls back to HERMES_INFERENCE_MODEL
            env var, then config.yaml's model.default / model.model.
        provider: Optional provider override.  Falls back to
            HERMES_INFERENCE_PROVIDER env var, then config.yaml's
            model.provider, then "auto".

    Returns:
        The exit code: ``2`` when ``provider`` is given without any model
        (argument or HERMES_INFERENCE_MODEL), otherwise ``0``.  Caller should
        sys.exit() with the return.
    """
    # Silence every stdlib logger.  ``logging.disable(logging.CRITICAL)``
    # drops records of CRITICAL severity and below before any handler sees
    # them, so this mutes file handlers as well as anything that would reach
    # the terminal.  The setting is process-wide and is not undone here.
    logging.disable(logging.CRITICAL)

    # --provider without --model is ambiguous: carrying the user's configured
    # model across to a different provider is usually wrong (that provider may
    # not host it), and silently picking the provider's catalog default hides
    # the mismatch.  Require the caller to be explicit.  Validate BEFORE the
    # stderr redirect so the message actually reaches the terminal.
    env_model_early = os.getenv("HERMES_INFERENCE_MODEL", "").strip()
    if provider and not ((model or "").strip() or env_model_early):
        sys.stderr.write(
            "hermes -z: --provider requires --model (or HERMES_INFERENCE_MODEL). "
            "Pass both explicitly, or neither to use your configured defaults.\n"
        )
        return 2

    # Auto-approve any shell / tool approvals.  Non-interactive by
    # definition — a prompt would hang forever.
    os.environ["HERMES_YOLO_MODE"] = "1"
    os.environ["HERMES_ACCEPT_HOOKS"] = "1"

    # Redirect stderr AND stdout to devnull for the entire call tree.
    # The final response is written to the real stdout afterwards; the
    # ``with`` guarantees devnull is closed even if the agent raises.
    real_stdout = sys.stdout
    with open(os.devnull, "w") as devnull, redirect_stdout(devnull), redirect_stderr(devnull):
        response = _run_agent(prompt, model=model, provider=provider)

    if response:
        real_stdout.write(response)
        if not response.endswith("\n"):
            real_stdout.write("\n")
        real_stdout.flush()
    return 0
|
||||
|
||||
|
||||
def _run_agent(
    prompt: str,
    model: Optional[str] = None,
    provider: Optional[str] = None,
) -> str:
    """Build an AIAgent the way a normal CLI chat turn would and run one
    conversation.

    Returns the agent's final response string, or ``""`` when the agent
    returns a falsy value.
    """
    # Local imports keep top-level CLI startup cheap for other commands.
    from hermes_cli.config import load_config
    from hermes_cli.models import detect_provider_for_model
    from hermes_cli.runtime_provider import resolve_runtime_provider
    from hermes_cli.tools_config import _get_platform_tools
    from run_agent import AIAgent

    cfg = load_config()

    # Effective model: explicit arg → env var → config.
    model_section = cfg.get("model") or {}
    if isinstance(model_section, str):
        configured_model = model_section
    else:
        configured_model = (
            model_section.get("default") or model_section.get("model") or ""
        )

    env_model = os.getenv("HERMES_INFERENCE_MODEL", "").strip()
    # "Explicit" means it came from the argument or the env var, not config.
    explicit_model = (model or "").strip() or env_model
    effective_model = explicit_model or configured_model

    # Effective provider: explicit arg → auto-detect from an explicit model →
    # env / config (handled inside resolve_runtime_provider).
    #
    # Auto-detection only runs when the model was explicitly requested;
    # a model taken from config is the "use my defaults" path, for which the
    # configured provider is already the right one.
    effective_provider = (provider or "").strip() or None
    if effective_provider is None and explicit_model:
        configured_provider = ""
        if isinstance(model_section, dict):
            configured_provider = (
                str(model_section.get("provider") or "").strip().lower()
            )
        current_provider = (
            configured_provider
            or os.getenv("HERMES_INFERENCE_PROVIDER", "").strip().lower()
            or "auto"
        )
        detected = detect_provider_for_model(explicit_model, current_provider)
        if detected:
            effective_provider, effective_model = detected

    runtime = resolve_runtime_provider(
        requested=effective_provider,
        target_model=effective_model or None,
    )

    # Whatever toolsets the user has enabled for "cli"; sorted() gives a
    # stable ordering and a list for AIAgent's signature.
    enabled = sorted(_get_platform_tools(cfg, "cli"))

    agent = AIAgent(
        api_key=runtime.get("api_key"),
        base_url=runtime.get("base_url"),
        provider=runtime.get("provider"),
        api_mode=runtime.get("api_mode"),
        model=effective_model,
        enabled_toolsets=enabled,
        quiet_mode=True,
        platform="cli",
        credential_pool=runtime.get("credential_pool"),
        # The clarify callback is the only interactive callback wired in:
        # with no user at a terminal it answers with a synthetic "pick a
        # default" instruction so the agent keeps going.  Approvals are
        # covered by HERMES_ACCEPT_HOOKS=1 / HERMES_YOLO_MODE=1, which
        # run_oneshot sets before calling this function.
        clarify_callback=_oneshot_clarify_callback,
    )

    # Belt-and-braces: make sure AIAgent doesn't invoke any streaming
    # display callbacks that would bypass the stdout capture.
    agent.suppress_status_output = True
    agent.stream_delta_callback = None
    agent.tool_gen_callback = None

    return agent.chat(prompt) or ""
|
||||
|
||||
|
||||
def _oneshot_clarify_callback(question: str, choices=None) -> str:
    """Answer a clarify request when no user is available (oneshot mode).

    Returns an instruction telling the agent to proceed on its own: pick
    from *choices* when any are given, otherwise make a reasonable
    assumption.  *question* is not used.
    """
    if not choices:
        return (
            "[oneshot mode: no user available. Make the most reasonable "
            "assumption you can and continue.]"
        )
    return (
        f"[oneshot mode: no user available. Pick the best option from "
        f"{choices} using your own judgment and continue.]"
    )
|
||||
@@ -1,229 +0,0 @@
|
||||
"""PTY bridge for `hermes dashboard` chat tab.
|
||||
|
||||
Wraps a child process behind a pseudo-terminal so its ANSI output can be
|
||||
streamed to a browser-side terminal emulator (xterm.js) and typed
|
||||
keystrokes can be fed back in. The only caller today is the
|
||||
``/api/pty`` WebSocket endpoint in ``hermes_cli.web_server``.
|
||||
|
||||
Design constraints:
|
||||
|
||||
* **POSIX-only.** Hermes Agent supports Windows exclusively via WSL, which
|
||||
exposes a native POSIX PTY via ``openpty(3)``. Native Windows Python
|
||||
has no PTY; :class:`PtyUnavailableError` is raised with a user-readable
|
||||
install/platform message so the dashboard can render a banner instead of
|
||||
crashing.
|
||||
* **Zero Node dependency on the server side.** We use :mod:`ptyprocess`,
|
||||
which is a pure-Python wrapper around the OS calls. The browser talks
|
||||
to the same ``hermes --tui`` binary it would launch from the CLI, so
|
||||
every TUI feature (slash popover, model picker, tool rows, markdown,
|
||||
skin engine, clarify/sudo/approval prompts) ships automatically.
|
||||
* **Byte-safe I/O.** Reads and writes go through the PTY master fd
|
||||
directly — we avoid :class:`ptyprocess.PtyProcessUnicode` because
|
||||
streaming ANSI is inherently byte-oriented and UTF-8 boundaries may land
|
||||
mid-read.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import errno
|
||||
import fcntl
|
||||
import os
|
||||
import select
|
||||
import signal
|
||||
import struct
|
||||
import sys
|
||||
import termios
|
||||
import time
|
||||
from typing import Optional, Sequence
|
||||
|
||||
try:
|
||||
import ptyprocess # type: ignore
|
||||
_PTY_AVAILABLE = not sys.platform.startswith("win")
|
||||
except ImportError: # pragma: no cover - dev env without ptyprocess
|
||||
ptyprocess = None # type: ignore
|
||||
_PTY_AVAILABLE = False
|
||||
|
||||
|
||||
__all__ = ["PtyBridge", "PtyUnavailableError"]
|
||||
|
||||
|
||||
class PtyUnavailableError(RuntimeError):
    """Signal that no pseudo-terminal can be created here.

    Currently raised on native Windows (no ConPTY bindings) and in dev
    environments where the ``ptyprocess`` dependency is not installed.
    The message is written for end users: the dashboard shows it as a
    banner on the chat tab.
    """
|
||||
|
||||
|
||||
class PtyBridge:
    """Thin wrapper around ``ptyprocess.PtyProcess`` for byte streaming.

    Not thread-safe. A single bridge is owned by the WebSocket handler
    that spawned it; the reader runs in an executor thread while writes
    happen on the event-loop thread. Both sides are OK because the
    kernel PTY is the actual synchronization point — we never call
    :mod:`ptyprocess` methods concurrently, we only call ``os.read`` and
    ``os.write`` on the master fd, which is safe.
    """

    # Every field of ``struct winsize`` is an unsigned short; values outside
    # 1..65535 would make ``struct.pack("HHHH", ...)`` raise ``struct.error``.
    _WINSIZE_MAX = 0xFFFF

    def __init__(self, proc: "ptyprocess.PtyProcess"):  # type: ignore[name-defined]
        """Wrap an already-spawned PTY process.

        Args:
            proc: The ``ptyprocess.PtyProcess`` whose master fd is streamed.
        """
        self._proc = proc
        self._fd: int = proc.fd
        self._closed = False

    # -- lifecycle --------------------------------------------------------

    @classmethod
    def is_available(cls) -> bool:
        """True if a PTY can be spawned on this platform."""
        return bool(_PTY_AVAILABLE)

    @classmethod
    def spawn(
        cls,
        argv: Sequence[str],
        *,
        cwd: Optional[str] = None,
        env: Optional[dict] = None,
        cols: int = 80,
        rows: int = 24,
    ) -> "PtyBridge":
        """Spawn ``argv`` behind a new PTY and return a bridge.

        Args:
            argv: Command and arguments to execute.
            cwd: Working directory for the child, or ``None`` to inherit.
            env: Complete environment for the child; ``None`` inherits a
                copy of the server's environment.
            cols: Initial terminal width in columns.
            rows: Initial terminal height in rows.

        Raises:
            PtyUnavailableError: If the platform can't host a PTY.
            FileNotFoundError, OSError: For ordinary exec failures
                (missing binary, bad cwd, etc.).
        """
        if not _PTY_AVAILABLE:
            if sys.platform.startswith("win"):
                raise PtyUnavailableError(
                    "Pseudo-terminals are unavailable on this platform. "
                    "Hermes Agent supports Windows only via WSL."
                )
            if ptyprocess is None:
                raise PtyUnavailableError(
                    "The `ptyprocess` package is missing. "
                    "Install with: pip install ptyprocess "
                    "(or pip install -e '.[pty]')."
                )
            raise PtyUnavailableError("Pseudo-terminals are unavailable.")
        # Let caller-supplied env fully override inheritance; if they pass
        # None we inherit the server's env (same semantics as subprocess).
        spawn_env = os.environ.copy() if env is None else env
        proc = ptyprocess.PtyProcess.spawn(  # type: ignore[union-attr]
            list(argv),
            cwd=cwd,
            env=spawn_env,
            dimensions=(rows, cols),
        )
        return cls(proc)

    @property
    def pid(self) -> int:
        """Process id of the child."""
        return int(self._proc.pid)

    def _child_running(self) -> bool:
        """Probe the child's liveness, treating a failing probe as "not alive"."""
        try:
            return bool(self._proc.isalive())
        except Exception:
            return False

    def is_alive(self) -> bool:
        """Return True while the bridge is open and the child is still running."""
        if self._closed:
            return False
        return self._child_running()

    # -- I/O --------------------------------------------------------------

    def read(self, timeout: float = 0.2) -> Optional[bytes]:
        """Read up to 64 KiB of raw bytes from the PTY master.

        Returns:
            * non-empty bytes — child output
            * empty bytes (``b""``) — no data available within ``timeout``
            * None — child has exited and the master fd is at EOF

        Never blocks longer than ``timeout`` seconds. Safe to call after
        :meth:`close`; returns ``None`` in that case.
        """
        if self._closed:
            return None
        try:
            readable, _, _ = select.select([self._fd], [], [], timeout)
        except (OSError, ValueError):
            # The fd is no longer usable with select(); report it as EOF.
            return None
        if not readable:
            return b""
        try:
            data = os.read(self._fd, 65536)
        except OSError as exc:
            # EIO on Linux = slave side closed. EBADF = already closed.
            if exc.errno in (errno.EIO, errno.EBADF):
                return None
            raise
        if not data:
            return None
        return data

    def write(self, data: bytes) -> None:
        """Write raw bytes to the PTY master (i.e. the child's stdin).

        A no-op after :meth:`close` or for empty ``data``. Writes that fail
        with EIO/EBADF/EPIPE are silently abandoned; other ``OSError``
        values propagate.
        """
        if self._closed or not data:
            return
        # os.write can return a short write under load; loop until drained.
        view = memoryview(data)
        while view:
            try:
                n = os.write(self._fd, view)
            except OSError as exc:
                if exc.errno in (errno.EIO, errno.EBADF, errno.EPIPE):
                    return
                raise
            if n <= 0:
                return
            view = view[n:]

    def resize(self, cols: int, rows: int) -> None:
        """Forward a terminal resize to the child via ``TIOCSWINSZ``.

        Both dimensions are clamped to ``1..65535`` so an out-of-range
        request cannot raise ``struct.error``. ``OSError`` from the ioctl is
        ignored.
        """
        if self._closed:
            return
        safe_rows = min(max(1, rows), self._WINSIZE_MAX)
        safe_cols = min(max(1, cols), self._WINSIZE_MAX)
        # struct winsize: rows, cols, xpixel, ypixel (all unsigned short)
        winsize = struct.pack("HHHH", safe_rows, safe_cols, 0, 0)
        try:
            fcntl.ioctl(self._fd, termios.TIOCSWINSZ, winsize)
        except OSError:
            pass

    # -- teardown ---------------------------------------------------------

    def close(self) -> None:
        """Terminate the child and close fds.

        Escalates SIGHUP → SIGTERM → SIGKILL, waiting up to 0.5s after each
        signal for the child to exit.

        Idempotent. Reaping the child is important so we don't leak
        zombies across the lifetime of the dashboard process.
        """
        if self._closed:
            return
        self._closed = True

        # SIGHUP is the conventional "your terminal went away" signal.
        # We escalate if the child ignores it.
        for sig in (signal.SIGHUP, signal.SIGTERM, signal.SIGKILL):
            # Use the guarded probe: an error from isalive() must not abort
            # teardown before the final close() below.
            if not self._child_running():
                break
            try:
                self._proc.kill(sig)
            except Exception:
                # Best effort: delivery can fail if the child exits first.
                pass
            deadline = time.monotonic() + 0.5
            while self._child_running() and time.monotonic() < deadline:
                time.sleep(0.02)

        try:
            self._proc.close(force=True)
        except Exception:
            # Best effort: teardown must never raise into the caller.
            pass

    # Context-manager sugar — handy in tests and ad-hoc scripts.
    def __enter__(self) -> "PtyBridge":
        return self

    def __exit__(self, *_exc) -> None:
        self.close()
|
||||
+19
-155
@@ -68,58 +68,25 @@ CONFIGURABLE_TOOLSETS = [
|
||||
("rl", "🧪 RL Training", "Tinker-Atropos training tools"),
|
||||
("homeassistant", "🏠 Home Assistant", "smart home device control"),
|
||||
("spotify", "🎵 Spotify", "playback, search, playlists, library"),
|
||||
("discord", "💬 Discord (read/participate)", "fetch messages, search members, create thread"),
|
||||
("discord_admin", "🛡️ Discord Server Admin", "list channels/roles, pin, assign roles"),
|
||||
]
|
||||
|
||||
# Toolsets that are OFF by default for new installs.
|
||||
# They're still in _HERMES_CORE_TOOLS (available at runtime if enabled),
|
||||
# but the setup checklist won't pre-select them for first-time users.
|
||||
_DEFAULT_OFF_TOOLSETS = {"moa", "homeassistant", "rl", "spotify", "discord", "discord_admin"}
|
||||
|
||||
# Platform-scoped toolsets: only appear in the `hermes tools` checklist for
|
||||
# these platforms, and only resolve/save for these platforms. A toolset
|
||||
# absent from this map is available on every platform (current behaviour).
|
||||
#
|
||||
# Use this for tools whose APIs only make sense on one platform (Discord
|
||||
# server admin, Slack workspace admin, etc.). Keeps every other platform's
|
||||
# checklist from filling up with irrelevant toggles.
|
||||
_TOOLSET_PLATFORM_RESTRICTIONS: Dict[str, Set[str]] = {
|
||||
"discord": {"discord"},
|
||||
"discord_admin": {"discord"},
|
||||
}
|
||||
|
||||
|
||||
def _toolset_allowed_for_platform(ts_key: str, platform: str) -> bool:
    """Tell whether toolset ``ts_key`` may be configured on ``platform``.

    A toolset with no entry in ``_TOOLSET_PLATFORM_RESTRICTIONS`` is
    unrestricted and therefore allowed on every platform.
    """
    restricted_to = _TOOLSET_PLATFORM_RESTRICTIONS.get(ts_key)
    if restricted_to is None:
        return True
    return platform in restricted_to
|
||||
_DEFAULT_OFF_TOOLSETS = {"moa", "homeassistant", "rl", "spotify"}
|
||||
|
||||
|
||||
def _get_effective_configurable_toolsets():
|
||||
"""Return CONFIGURABLE_TOOLSETS + any plugin-provided toolsets.
|
||||
|
||||
Plugin toolsets are appended at the end so they appear after the
|
||||
built-in toolsets in the TUI checklist. A plugin whose toolset key
|
||||
already appears in ``CONFIGURABLE_TOOLSETS`` is skipped — bundled
|
||||
plugins (e.g. ``plugins/spotify``) share their toolset key with the
|
||||
built-in entry, and we want the built-in label/description to win.
|
||||
Without the dedupe, ``hermes tools`` → "reconfigure existing" would
|
||||
list the same toolset twice.
|
||||
built-in toolsets in the TUI checklist.
|
||||
"""
|
||||
result = list(CONFIGURABLE_TOOLSETS)
|
||||
seen = {ts_key for ts_key, _, _ in result}
|
||||
try:
|
||||
from hermes_cli.plugins import discover_plugins, get_plugin_toolsets
|
||||
discover_plugins() # idempotent — ensures plugins are loaded
|
||||
for entry in get_plugin_toolsets():
|
||||
if entry[0] in seen:
|
||||
continue
|
||||
seen.add(entry[0])
|
||||
result.append(entry)
|
||||
result.extend(get_plugin_toolsets())
|
||||
except Exception:
|
||||
pass
|
||||
return result
|
||||
@@ -401,9 +368,13 @@ TOOL_CATEGORIES = {
|
||||
"providers": [
|
||||
{
|
||||
"name": "Spotify Web API",
|
||||
"tag": "PKCE OAuth — opens the setup wizard",
|
||||
"env_vars": [],
|
||||
"post_setup": "spotify",
|
||||
"tag": "PKCE OAuth — run `hermes auth spotify` after this",
|
||||
"env_vars": [
|
||||
{"key": "HERMES_SPOTIFY_CLIENT_ID", "prompt": "Spotify app client_id",
|
||||
"url": "https://developer.spotify.com/dashboard"},
|
||||
{"key": "HERMES_SPOTIFY_REDIRECT_URI", "prompt": "Redirect URI (must be allow-listed in your Spotify app)",
|
||||
"default": "http://127.0.0.1:43827/spotify/callback"},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
@@ -507,35 +478,6 @@ def _run_post_setup(post_setup_key: str):
|
||||
_print_warning(" kittentts install timed out (>5min)")
|
||||
_print_info(f" Run manually: python -m pip install -U '{wheel_url}' soundfile")
|
||||
|
||||
elif post_setup_key == "spotify":
|
||||
# Run the full `hermes auth spotify` flow — if the user has no
|
||||
# client_id yet, this drops them into the interactive wizard
|
||||
# (opens the Spotify dashboard, prompts for client_id, persists
|
||||
# to ~/.hermes/.env), then continues straight into PKCE. If they
|
||||
# already have an app, it skips the wizard and just does OAuth.
|
||||
from types import SimpleNamespace
|
||||
try:
|
||||
from hermes_cli.auth import login_spotify_command
|
||||
except Exception as exc:
|
||||
_print_warning(f" Could not load Spotify auth: {exc}")
|
||||
_print_info(" Run manually: hermes auth spotify")
|
||||
return
|
||||
_print_info(" Starting Spotify login...")
|
||||
try:
|
||||
login_spotify_command(SimpleNamespace(
|
||||
client_id=None, redirect_uri=None, scope=None,
|
||||
no_browser=False, timeout=None,
|
||||
))
|
||||
_print_success(" Spotify authenticated")
|
||||
except SystemExit as exc:
|
||||
# User aborted the wizard, or OAuth failed — don't fail the
|
||||
# toolset enable; they can retry with `hermes auth spotify`.
|
||||
_print_warning(f" Spotify login did not complete: {exc}")
|
||||
_print_info(" Run later: hermes auth spotify")
|
||||
except Exception as exc:
|
||||
_print_warning(f" Spotify login failed: {exc}")
|
||||
_print_info(" Run manually: hermes auth spotify")
|
||||
|
||||
elif post_setup_key == "rl_training":
|
||||
try:
|
||||
__import__("tinker_atropos")
|
||||
@@ -624,7 +566,7 @@ def _get_platform_tools(
|
||||
include_default_mcp_servers: bool = True,
|
||||
) -> Set[str]:
|
||||
"""Resolve which individual toolset names are enabled for a platform."""
|
||||
from toolsets import resolve_toolset, TOOLSETS
|
||||
from toolsets import resolve_toolset
|
||||
|
||||
platform_toolsets = config.get("platform_toolsets") or {}
|
||||
toolset_names = platform_toolsets.get(platform)
|
||||
@@ -638,8 +580,6 @@ def _get_platform_tools(
|
||||
toolset_names = [str(ts) for ts in toolset_names]
|
||||
|
||||
configurable_keys = {ts_key for ts_key, _, _ in CONFIGURABLE_TOOLSETS}
|
||||
plugin_ts_keys = _get_plugin_toolset_keys()
|
||||
platform_default_keys = {p["default_toolset"] for p in PLATFORMS.values()}
|
||||
|
||||
# If the saved list contains any configurable keys directly, the user
|
||||
# has explicitly configured this platform — use direct membership.
|
||||
@@ -649,10 +589,7 @@ def _get_platform_tools(
|
||||
has_explicit_config = any(ts in configurable_keys for ts in toolset_names)
|
||||
|
||||
if has_explicit_config:
|
||||
enabled_toolsets = {
|
||||
ts for ts in toolset_names
|
||||
if ts in configurable_keys and _toolset_allowed_for_platform(ts, platform)
|
||||
}
|
||||
enabled_toolsets = {ts for ts in toolset_names if ts in configurable_keys}
|
||||
else:
|
||||
# No explicit config — fall back to resolving composite toolset names
|
||||
# (e.g. "hermes-cli") to individual tool names and reverse-mapping.
|
||||
@@ -662,52 +599,14 @@ def _get_platform_tools(
|
||||
|
||||
enabled_toolsets = set()
|
||||
for ts_key, _, _ in CONFIGURABLE_TOOLSETS:
|
||||
if not _toolset_allowed_for_platform(ts_key, platform):
|
||||
continue
|
||||
ts_tools = set(resolve_toolset(ts_key))
|
||||
if ts_tools and ts_tools.issubset(all_tool_names):
|
||||
enabled_toolsets.add(ts_key)
|
||||
|
||||
default_off = set(_DEFAULT_OFF_TOOLSETS)
|
||||
# Legacy safety: if the platform's own name matches a default-off
|
||||
# toolset (e.g. `homeassistant` platform + `homeassistant` toolset),
|
||||
# keep that toolset enabled on first install. Skip this dodge for
|
||||
# platform-restricted toolsets — those are always opt-in even on
|
||||
# their own platform (e.g. `discord` + `discord` should stay OFF).
|
||||
if platform in default_off and platform not in _TOOLSET_PLATFORM_RESTRICTIONS:
|
||||
if platform in default_off:
|
||||
default_off.remove(platform)
|
||||
enabled_toolsets -= default_off
|
||||
|
||||
# Recover non-configurable platform toolsets (e.g. discord, feishu_doc,
|
||||
# feishu_drive). These are part of the platform's default composite but
|
||||
# absent from CONFIGURABLE_TOOLSETS, so they can't appear in the TUI
|
||||
# checklist or in a user-saved config. Must run in BOTH branches —
|
||||
# otherwise saving via `hermes tools` (which flips has_explicit_config
|
||||
# to True) silently drops them.
|
||||
platform_tool_universe = set(resolve_toolset(PLATFORMS[platform]["default_toolset"]))
|
||||
configurable_tool_universe = set()
|
||||
for ck in configurable_keys:
|
||||
configurable_tool_universe.update(resolve_toolset(ck))
|
||||
claimed = set()
|
||||
for ts_key in enabled_toolsets:
|
||||
claimed.update(resolve_toolset(ts_key))
|
||||
skip = configurable_keys | plugin_ts_keys | platform_default_keys
|
||||
skip |= {k for k in TOOLSETS if k.startswith("hermes-")}
|
||||
skip |= set(_DEFAULT_OFF_TOOLSETS) - {platform}
|
||||
for ts_key, ts_def in TOOLSETS.items():
|
||||
if ts_key in skip:
|
||||
continue
|
||||
if ts_def.get("includes"):
|
||||
continue
|
||||
ts_tools = set(resolve_toolset(ts_key))
|
||||
if not ts_tools or not ts_tools.issubset(platform_tool_universe):
|
||||
continue
|
||||
if ts_tools.issubset(configurable_tool_universe):
|
||||
continue
|
||||
if not ts_tools.issubset(claimed):
|
||||
enabled_toolsets.add(ts_key)
|
||||
claimed.update(ts_tools)
|
||||
|
||||
# Plugin toolsets: enabled by default unless explicitly disabled, or
|
||||
# unless the toolset is in _DEFAULT_OFF_TOOLSETS (e.g. spotify —
|
||||
# shipped as a bundled plugin but user must opt in via `hermes tools`
|
||||
@@ -715,6 +614,7 @@ def _get_platform_tools(
|
||||
# A plugin toolset is "known" for a platform once `hermes tools`
|
||||
# has been saved for that platform (tracked via known_plugin_toolsets).
|
||||
# Unknown plugins default to enabled; known-but-absent = disabled.
|
||||
plugin_ts_keys = _get_plugin_toolset_keys()
|
||||
if plugin_ts_keys:
|
||||
known_map = config.get("known_plugin_toolsets", {})
|
||||
known_for_platform = set(known_map.get(platform, []))
|
||||
@@ -732,6 +632,7 @@ def _get_platform_tools(
|
||||
|
||||
# Preserve any explicit non-configurable toolset entries (for example,
|
||||
# custom toolsets or MCP server names saved in platform_toolsets).
|
||||
platform_default_keys = {p["default_toolset"] for p in PLATFORMS.values()}
|
||||
explicit_passthrough = {
|
||||
ts
|
||||
for ts in toolset_names
|
||||
@@ -777,14 +678,6 @@ def _save_platform_tools(config: dict, platform: str, enabled_toolset_keys: Set[
|
||||
"""
|
||||
config.setdefault("platform_toolsets", {})
|
||||
|
||||
# Drop platform-scoped toolsets that don't apply here. Prevents the
|
||||
# "Configure all platforms" checklist (or a hand-edited config.yaml)
|
||||
# from turning on, say, the `discord` toolset for Telegram.
|
||||
enabled_toolset_keys = {
|
||||
ts for ts in enabled_toolset_keys
|
||||
if _toolset_allowed_for_platform(ts, platform)
|
||||
}
|
||||
|
||||
# Get the set of all configurable toolset keys (built-in + plugin)
|
||||
configurable_keys = {ts_key for ts_key, _, _ in CONFIGURABLE_TOOLSETS}
|
||||
plugin_keys = _get_plugin_toolset_keys()
|
||||
@@ -799,7 +692,6 @@ def _save_platform_tools(config: dict, platform: str, enabled_toolset_keys: Set[
|
||||
existing_toolsets = config.get("platform_toolsets", {}).get(platform, [])
|
||||
if not isinstance(existing_toolsets, list):
|
||||
existing_toolsets = []
|
||||
existing_toolsets = [str(ts) for ts in existing_toolsets]
|
||||
|
||||
# Preserve any entries that are NOT configurable toolsets and NOT platform
|
||||
# defaults (i.e. only MCP server names should be preserved)
|
||||
@@ -807,11 +699,6 @@ def _save_platform_tools(config: dict, platform: str, enabled_toolset_keys: Set[
|
||||
entry for entry in existing_toolsets
|
||||
if entry not in configurable_keys and entry not in platform_default_keys
|
||||
}
|
||||
# Opening `hermes tools` is the user's opt-in to reconfigure tools, so treat
|
||||
# saving from the picker as consent to clear the "no_mcp" sentinel. The
|
||||
# picker has no checkbox for no_mcp, so without this users who once set it
|
||||
# by hand could never re-enable MCP servers through the UI.
|
||||
preserved_entries.discard("no_mcp")
|
||||
|
||||
# Merge preserved entries with new enabled toolsets
|
||||
config["platform_toolsets"][platform] = sorted(enabled_toolset_keys | preserved_entries)
|
||||
@@ -919,7 +806,7 @@ def _estimate_tool_tokens() -> Dict[str, int]:
|
||||
return _tool_token_cache
|
||||
|
||||
|
||||
def _prompt_toolset_checklist(platform_label: str, enabled: Set[str], platform: str = "cli") -> Set[str]:
|
||||
def _prompt_toolset_checklist(platform_label: str, enabled: Set[str]) -> Set[str]:
|
||||
"""Multi-select checklist of toolsets. Returns set of selected toolset keys."""
|
||||
from hermes_cli.curses_ui import curses_checklist
|
||||
from toolsets import resolve_toolset
|
||||
@@ -927,12 +814,7 @@ def _prompt_toolset_checklist(platform_label: str, enabled: Set[str], platform:
|
||||
# Pre-compute per-tool token counts (cached after first call).
|
||||
tool_tokens = _estimate_tool_tokens()
|
||||
|
||||
effective_all = _get_effective_configurable_toolsets()
|
||||
# Drop platform-scoped toolsets that don't apply to this platform.
|
||||
effective = [
|
||||
(k, l, d) for (k, l, d) in effective_all
|
||||
if _toolset_allowed_for_platform(k, platform)
|
||||
]
|
||||
effective = _get_effective_configurable_toolsets()
|
||||
|
||||
labels = []
|
||||
for ts_key, ts_label, ts_desc in effective:
|
||||
@@ -1846,7 +1728,7 @@ def tools_command(args=None, first_install: bool = False, config: dict = None):
|
||||
checklist_preselected = current_enabled - _DEFAULT_OFF_TOOLSETS
|
||||
|
||||
# Show checklist
|
||||
new_enabled = _prompt_toolset_checklist(pinfo["label"], checklist_preselected, pkey)
|
||||
new_enabled = _prompt_toolset_checklist(pinfo["label"], checklist_preselected)
|
||||
|
||||
added = new_enabled - current_enabled
|
||||
removed = current_enabled - new_enabled
|
||||
@@ -2202,11 +2084,7 @@ def _apply_mcp_change(config: dict, targets: List[str], action: str) -> Set[str]
|
||||
|
||||
def _print_tools_list(enabled_toolsets: set, mcp_servers: dict, platform: str = "cli"):
|
||||
"""Print a summary of enabled/disabled toolsets and MCP tool filters."""
|
||||
effective_all = _get_effective_configurable_toolsets()
|
||||
effective = [
|
||||
(k, l, d) for (k, l, d) in effective_all
|
||||
if _toolset_allowed_for_platform(k, platform)
|
||||
]
|
||||
effective = _get_effective_configurable_toolsets()
|
||||
builtin_keys = {ts_key for ts_key, _, _ in CONFIGURABLE_TOOLSETS}
|
||||
|
||||
print(f"Built-in toolsets ({platform}):")
|
||||
@@ -2272,20 +2150,6 @@ def tools_disable_enable_command(args):
|
||||
_print_error(f"Unknown toolset '{name}'")
|
||||
toolset_targets = [t for t in toolset_targets if t in valid_toolsets]
|
||||
|
||||
# Reject platform-scoped toolsets on platforms that don't allow them.
|
||||
restricted_targets = [
|
||||
t for t in toolset_targets
|
||||
if not _toolset_allowed_for_platform(t, platform)
|
||||
]
|
||||
if restricted_targets:
|
||||
for name in restricted_targets:
|
||||
allowed = sorted(_TOOLSET_PLATFORM_RESTRICTIONS.get(name) or set())
|
||||
_print_error(
|
||||
f"Toolset '{name}' is not available on platform '{platform}' "
|
||||
f"(only: {', '.join(allowed)})"
|
||||
)
|
||||
toolset_targets = [t for t in toolset_targets if t not in restricted_targets]
|
||||
|
||||
if toolset_targets:
|
||||
_apply_toolset_change(config, platform, toolset_targets, action)
|
||||
|
||||
|
||||
+150
-892
File diff suppressed because it is too large
Load Diff
+25
-29
@@ -288,34 +288,30 @@ def get_tool_definitions(
|
||||
filtered_tools[i] = {"type": "function", "function": dynamic_schema}
|
||||
break
|
||||
|
||||
# Rebuild discord / discord_admin schemas based on the bot's privileged
|
||||
# intents (detected from GET /applications/@me) and the user's action
|
||||
# allowlist in config. Hides actions the bot's intents don't support so
|
||||
# the model never attempts them, and annotates fetch_messages when the
|
||||
# Rebuild discord_server schema based on the bot's privileged intents
|
||||
# (detected from GET /applications/@me) and the user's action allowlist
|
||||
# in config. Hides actions the bot's intents don't support so the
|
||||
# model never attempts them, and annotates fetch_messages when the
|
||||
# MESSAGE_CONTENT intent is missing.
|
||||
_discord_schema_fns = {
|
||||
"discord": "get_dynamic_schema_core",
|
||||
"discord_admin": "get_dynamic_schema_admin",
|
||||
}
|
||||
for discord_tool_name in _discord_schema_fns:
|
||||
if discord_tool_name in available_tool_names:
|
||||
try:
|
||||
from tools import discord_tool as _dt
|
||||
schema_fn = getattr(_dt, _discord_schema_fns[discord_tool_name])
|
||||
dynamic = schema_fn()
|
||||
except Exception:
|
||||
dynamic = None
|
||||
if dynamic is None:
|
||||
filtered_tools = [
|
||||
t for t in filtered_tools
|
||||
if t.get("function", {}).get("name") != discord_tool_name
|
||||
]
|
||||
available_tool_names.discard(discord_tool_name)
|
||||
else:
|
||||
for i, td in enumerate(filtered_tools):
|
||||
if td.get("function", {}).get("name") == discord_tool_name:
|
||||
filtered_tools[i] = {"type": "function", "function": dynamic}
|
||||
break
|
||||
if "discord_server" in available_tool_names:
|
||||
try:
|
||||
from tools.discord_tool import get_dynamic_schema
|
||||
dynamic = get_dynamic_schema()
|
||||
except Exception: # pragma: no cover — defensive, fall back to static
|
||||
dynamic = None
|
||||
if dynamic is None:
|
||||
# Tool filtered out entirely (empty allowlist or detection disabled
|
||||
# the only remaining actions). Drop it from the schema list.
|
||||
filtered_tools = [
|
||||
t for t in filtered_tools
|
||||
if t.get("function", {}).get("name") != "discord_server"
|
||||
]
|
||||
available_tool_names.discard("discord_server")
|
||||
else:
|
||||
for i, td in enumerate(filtered_tools):
|
||||
if td.get("function", {}).get("name") == "discord_server":
|
||||
filtered_tools[i] = {"type": "function", "function": dynamic}
|
||||
break
|
||||
|
||||
# Strip web tool cross-references from browser_navigate description when
|
||||
# web_search / web_extract are not available. The static schema says
|
||||
@@ -468,9 +464,9 @@ def _coerce_number(value: str, integer_only: bool = False):
|
||||
f = float(value)
|
||||
except (ValueError, OverflowError):
|
||||
return value
|
||||
# Guard against inf/nan — not JSON-serializable, keep original string
|
||||
# Guard against inf/nan before int() conversion
|
||||
if f != f or f == float("inf") or f == float("-inf"):
|
||||
return value
|
||||
return f
|
||||
# If it looks like an integer (no fractional part), return int
|
||||
if f == int(f):
|
||||
return int(f)
|
||||
|
||||
+1
-1
@@ -156,7 +156,7 @@
|
||||
for entry in "''${ENTRIES[@]}"; do
|
||||
IFS=":" read -r ATTR FOLDER NIX_FILE <<< "$entry"
|
||||
echo "==> .#$ATTR ($FOLDER -> $NIX_FILE)"
|
||||
OUTPUT=$(nix build ".#$ATTR.npmDeps" --no-link --rebuild --print-build-logs 2>&1)
|
||||
OUTPUT=$(nix build ".#$ATTR.npmDeps" --no-link --print-build-logs 2>&1)
|
||||
STATUS=$?
|
||||
if [ "$STATUS" -eq 0 ]; then
|
||||
echo " ok"
|
||||
|
||||
+1
-1
@@ -4,7 +4,7 @@ let
|
||||
src = ../web;
|
||||
npmDeps = pkgs.fetchNpmDeps {
|
||||
inherit src;
|
||||
hash = "sha256-4Z8KQ69QhO83X6zff+5urWBv6MME686MhTTMdwSl65o=";
|
||||
hash = "sha256-TS/vrCHbdvXkPcAPxImKzAd2pdDCrKlgYZkXBMQ+TEg=";
|
||||
};
|
||||
|
||||
npm = hermesNpmLib.mkNpmPassthru { folder = "web"; attr = "web"; pname = "hermes-web"; };
|
||||
|
||||
@@ -91,29 +91,4 @@
|
||||
|
||||
// Register this plugin — the dashboard picks it up automatically.
|
||||
window.__HERMES_PLUGINS__.register("example", ExamplePage);
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
// Page-scoped slot demo: inject a small banner at the top of /sessions.
|
||||
//
|
||||
// Built-in pages expose named slots (<page>:top, <page>:bottom) that
|
||||
// plugins can populate without overriding the whole route. The
|
||||
// manifest lists the slots we use in its `slots` array so the shell
|
||||
// knows to render <PluginSlot name="sessions:top" /> there.
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
// Banner component rendered into the "sessions:top" slot of the Sessions page.
function SessionsTopBanner() {
  const h = React.createElement;

  // Children are built innermost-first, then wrapped in the card.
  const badge = h(Badge, { variant: "outline" }, "Example");
  const message = h(
    "span",
    { className: "text-xs text-muted-foreground" },
    "This banner was injected into the Sessions page by the example plugin via the ",
    h("code", { className: "font-courier" }, "sessions:top"),
    " slot."
  );
  const content = h(
    CardContent,
    { className: "flex items-center gap-3 py-2" },
    badge,
    message
  );

  return h(Card, { className: "border-dashed" }, content);
}
|
||||
|
||||
window.__HERMES_PLUGINS__.registerSlot("example", "sessions:top", SessionsTopBanner);
|
||||
})();
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
"path": "/example",
|
||||
"position": "after:skills"
|
||||
},
|
||||
"slots": ["sessions:top"],
|
||||
"entry": "dist/index.js",
|
||||
"api": "plugin_api.py"
|
||||
}
|
||||
|
||||
@@ -43,7 +43,7 @@ _TIMEOUT = 30.0
|
||||
# ---------------------------------------------------------------------------
|
||||
# Process-level atexit safety net — ensures pending sessions are committed
|
||||
# even if shutdown_memory_provider is never called (e.g. gateway crash,
|
||||
# SIGKILL, or exception in the session expiry watcher preventing shutdown).
|
||||
# SIGKILL, or exception in _async_flush_memories preventing shutdown).
|
||||
# ---------------------------------------------------------------------------
|
||||
_last_active_provider: Optional["OpenVikingMemoryProvider"] = None
|
||||
|
||||
|
||||
@@ -78,16 +78,6 @@ termux = [
|
||||
]
|
||||
dingtalk = ["dingtalk-stream>=0.20,<1", "alibabacloud-dingtalk>=2.0.0", "qrcode>=7.0,<8"]
|
||||
feishu = ["lark-oapi>=1.5.3,<2", "qrcode>=7.0,<8"]
|
||||
google = [
|
||||
# Required by the google-workspace skill (Gmail, Calendar, Drive, Contacts,
|
||||
# Sheets, Docs). Declared here so packagers (Nix, Homebrew) ship them with
|
||||
# the [all] extra and users don't hit runtime `pip install` paths that fail
|
||||
# in environments without pip (e.g. Nix-managed Python).
|
||||
"google-api-python-client>=2.100,<3",
|
||||
"google-auth-oauthlib>=1.0,<2",
|
||||
"google-auth-httplib2>=0.2,<1",
|
||||
]
|
||||
# `hermes dashboard` (localhost SPA + API). Not in core to keep the default install lean.
|
||||
web = ["fastapi>=0.104.0,<1", "uvicorn[standard]>=0.24.0,<1"]
|
||||
rl = [
|
||||
"atroposlib @ git+https://github.com/NousResearch/atropos.git@c20c85256e5a45ad31edf8b7276e9c5ee1995a30",
|
||||
@@ -119,7 +109,6 @@ all = [
|
||||
"hermes-agent[voice]",
|
||||
"hermes-agent[dingtalk]",
|
||||
"hermes-agent[feishu]",
|
||||
"hermes-agent[google]",
|
||||
"hermes-agent[mistral]",
|
||||
"hermes-agent[bedrock]",
|
||||
"hermes-agent[web]",
|
||||
|
||||
+239
-445
@@ -502,48 +502,6 @@ def _sanitize_messages_surrogates(messages: list) -> bool:
|
||||
return found
|
||||
|
||||
|
||||
def _escape_invalid_chars_in_json_strings(raw: str) -> str:
|
||||
"""Escape unescaped control chars inside JSON string values.
|
||||
|
||||
Walks the raw JSON character-by-character, tracking whether we are
|
||||
inside a double-quoted string. Inside strings, replaces literal
|
||||
control characters (0x00-0x1F) that aren't already part of an escape
|
||||
sequence with their ``\\uXXXX`` equivalents. Pass-through for everything
|
||||
else.
|
||||
|
||||
Ported from #12093 — complements the other repair passes in
|
||||
``_repair_tool_call_arguments`` when ``json.loads(strict=False)`` is
|
||||
not enough (e.g. llama.cpp backends that emit literal apostrophes or
|
||||
tabs alongside other malformations).
|
||||
"""
|
||||
out: list[str] = []
|
||||
in_string = False
|
||||
i = 0
|
||||
n = len(raw)
|
||||
while i < n:
|
||||
ch = raw[i]
|
||||
if in_string:
|
||||
if ch == "\\" and i + 1 < n:
|
||||
# Already-escaped char — pass through as-is
|
||||
out.append(ch)
|
||||
out.append(raw[i + 1])
|
||||
i += 2
|
||||
continue
|
||||
if ch == '"':
|
||||
in_string = False
|
||||
out.append(ch)
|
||||
elif ord(ch) < 0x20:
|
||||
out.append(f"\\u{ord(ch):04x}")
|
||||
else:
|
||||
out.append(ch)
|
||||
else:
|
||||
if ch == '"':
|
||||
in_string = True
|
||||
out.append(ch)
|
||||
i += 1
|
||||
return "".join(out)
|
||||
|
||||
|
||||
def _repair_tool_call_arguments(raw_args: str, tool_name: str = "?") -> str:
|
||||
"""Attempt to repair malformed tool_call argument JSON.
|
||||
|
||||
@@ -565,23 +523,6 @@ def _repair_tool_call_arguments(raw_args: str, tool_name: str = "?") -> str:
|
||||
logger.warning("Sanitized Python-None tool_call arguments for %s", tool_name)
|
||||
return "{}"
|
||||
|
||||
# Repair pass 0: llama.cpp backends sometimes emit literal control
|
||||
# characters (tabs, newlines) inside JSON string values. json.loads
|
||||
# with strict=False accepts these and lets us re-serialise the
|
||||
# result into wire-valid JSON without any string surgery. This is
|
||||
# the most common local-model repair case (#12068).
|
||||
try:
|
||||
parsed = json.loads(raw_stripped, strict=False)
|
||||
reserialised = json.dumps(parsed, separators=(",", ":"))
|
||||
if reserialised != raw_stripped:
|
||||
logger.warning(
|
||||
"Repaired unescaped control chars in tool_call arguments for %s",
|
||||
tool_name,
|
||||
)
|
||||
return reserialised
|
||||
except (json.JSONDecodeError, TypeError, ValueError):
|
||||
pass
|
||||
|
||||
# Attempt common JSON repairs
|
||||
fixed = raw_stripped
|
||||
# 1. Strip trailing commas before } or ]
|
||||
@@ -616,21 +557,6 @@ def _repair_tool_call_arguments(raw_args: str, tool_name: str = "?") -> str:
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Repair pass 4: escape unescaped control chars inside JSON strings,
|
||||
# then retry. Catches cases where strict=False alone fails because
|
||||
# other malformations are present too.
|
||||
try:
|
||||
escaped = _escape_invalid_chars_in_json_strings(fixed)
|
||||
if escaped != fixed:
|
||||
json.loads(escaped)
|
||||
logger.warning(
|
||||
"Repaired control-char-laced tool_call arguments for %s: %s → %s",
|
||||
tool_name, raw_stripped[:80], escaped[:80],
|
||||
)
|
||||
return escaped
|
||||
except (json.JSONDecodeError, TypeError, ValueError):
|
||||
pass
|
||||
|
||||
# Last resort: replace with empty object so the API request doesn't
|
||||
# crash the entire session.
|
||||
logger.warning(
|
||||
@@ -814,11 +740,6 @@ class AIAgent:
|
||||
for AI models that support function calling.
|
||||
"""
|
||||
|
||||
_TOOL_CALL_ARGUMENTS_CORRUPTION_MARKER = (
|
||||
"[hermes-agent: tool call arguments were corrupted in this session and "
|
||||
"have been dropped to keep the conversation alive. See issue #15236.]"
|
||||
)
|
||||
|
||||
@property
|
||||
def base_url(self) -> str:
|
||||
return self._base_url
|
||||
@@ -1516,8 +1437,6 @@ class AIAgent:
|
||||
|
||||
# Track conversation messages for session logging
|
||||
self._session_messages: List[Dict[str, Any]] = []
|
||||
self._memory_write_origin = "assistant_tool"
|
||||
self._memory_write_context = "foreground"
|
||||
|
||||
# Cached system prompt -- built once per session, only rebuilt on compression
|
||||
self._cached_system_prompt: Optional[str] = None
|
||||
@@ -1578,6 +1497,7 @@ class AIAgent:
|
||||
self._memory_enabled = False
|
||||
self._user_profile_enabled = False
|
||||
self._memory_nudge_interval = 10
|
||||
self._memory_flush_min_turns = 6
|
||||
self._turns_since_memory = 0
|
||||
self._iters_since_skill = 0
|
||||
if not skip_memory:
|
||||
@@ -1586,6 +1506,7 @@ class AIAgent:
|
||||
self._memory_enabled = mem_config.get("memory_enabled", False)
|
||||
self._user_profile_enabled = mem_config.get("user_profile_enabled", False)
|
||||
self._memory_nudge_interval = int(mem_config.get("nudge_interval", 10))
|
||||
self._memory_flush_min_turns = int(mem_config.get("flush_min_turns", 6))
|
||||
if self._memory_enabled or self._user_profile_enabled:
|
||||
from tools.memory_tool import MemoryStore
|
||||
self._memory_store = MemoryStore(
|
||||
@@ -2310,34 +2231,6 @@ class AIAgent:
|
||||
except Exception:
|
||||
logger.debug("status_callback error in _emit_status", exc_info=True)
|
||||
|
||||
def _emit_warning(self, message: str) -> None:
|
||||
"""Emit a user-visible warning through the same status plumbing.
|
||||
|
||||
Unlike debug logs, these warnings are meant for degraded side paths
|
||||
such as auxiliary compression or memory flushes where the main turn can
|
||||
continue but the user needs to know something important failed.
|
||||
"""
|
||||
try:
|
||||
self._vprint(f"{self.log_prefix}{message}", force=True)
|
||||
except Exception:
|
||||
pass
|
||||
if self.status_callback:
|
||||
try:
|
||||
self.status_callback("warn", message)
|
||||
except Exception:
|
||||
logger.debug("status_callback error in _emit_warning", exc_info=True)
|
||||
|
||||
def _emit_auxiliary_failure(self, task: str, exc: BaseException) -> None:
|
||||
"""Surface a compact warning for failed auxiliary work."""
|
||||
try:
|
||||
detail = self._summarize_api_error(exc)
|
||||
except Exception:
|
||||
detail = str(exc)
|
||||
detail = (detail or exc.__class__.__name__).strip()
|
||||
if len(detail) > 220:
|
||||
detail = detail[:217].rstrip() + "..."
|
||||
self._emit_warning(f"⚠ Auxiliary {task} failed: {detail}")
|
||||
|
||||
def _current_main_runtime(self) -> Dict[str, str]:
|
||||
"""Return the live main runtime for session-scoped auxiliary routing."""
|
||||
return {
|
||||
@@ -2397,7 +2290,6 @@ class AIAgent:
|
||||
base_url=aux_base_url,
|
||||
api_key=aux_api_key,
|
||||
config_context_length=getattr(self, "_aux_compression_context_length_config", None),
|
||||
provider=getattr(self, "provider", ""),
|
||||
)
|
||||
|
||||
# Hard floor: the auxiliary compression model must have at least
|
||||
@@ -2424,11 +2316,6 @@ class AIAgent:
|
||||
# compression actually works this session. The hard floor
|
||||
# above guarantees aux_context >= MINIMUM_CONTEXT_LENGTH,
|
||||
# so the new threshold is always >= 64K.
|
||||
#
|
||||
# The compression summariser sends a single user-role
|
||||
# prompt (no system prompt, no tools) to the aux model, so
|
||||
# new_threshold == aux_context is safe: the request is
|
||||
# the raw messages plus a small summarisation instruction.
|
||||
old_threshold = threshold
|
||||
new_threshold = aux_context
|
||||
self.context_compressor.threshold_tokens = new_threshold
|
||||
@@ -3160,10 +3047,7 @@ class AIAgent:
|
||||
quiet_mode=True,
|
||||
platform=self.platform,
|
||||
provider=self.provider,
|
||||
parent_session_id=self.session_id,
|
||||
)
|
||||
review_agent._memory_write_origin = "background_review"
|
||||
review_agent._memory_write_context = "background_review"
|
||||
review_agent._memory_store = self._memory_store
|
||||
review_agent._memory_enabled = self._memory_enabled
|
||||
review_agent._user_profile_enabled = self._user_profile_enabled
|
||||
@@ -3197,8 +3081,7 @@ class AIAgent:
|
||||
pass
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Background memory/skill review failed: %s", e)
|
||||
self._emit_auxiliary_failure("background review", e)
|
||||
logger.debug("Background memory/skill review failed: %s", e)
|
||||
finally:
|
||||
# Close all resources (httpx client, subprocesses, etc.) so
|
||||
# GC doesn't try to clean them up on a dead asyncio event
|
||||
@@ -3212,32 +3095,6 @@ class AIAgent:
|
||||
t = threading.Thread(target=_run_review, daemon=True, name="bg-review")
|
||||
t.start()
|
||||
|
||||
def _build_memory_write_metadata(
|
||||
self,
|
||||
*,
|
||||
write_origin: Optional[str] = None,
|
||||
execution_context: Optional[str] = None,
|
||||
task_id: Optional[str] = None,
|
||||
tool_call_id: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Build provenance metadata for external memory-provider mirrors."""
|
||||
metadata: Dict[str, Any] = {
|
||||
"write_origin": write_origin or getattr(self, "_memory_write_origin", "assistant_tool"),
|
||||
"execution_context": (
|
||||
execution_context
|
||||
or getattr(self, "_memory_write_context", "foreground")
|
||||
),
|
||||
"session_id": self.session_id or "",
|
||||
"parent_session_id": self._parent_session_id or "",
|
||||
"platform": self.platform or os.environ.get("HERMES_SESSION_SOURCE", "cli"),
|
||||
"tool_name": "memory",
|
||||
}
|
||||
if task_id:
|
||||
metadata["task_id"] = task_id
|
||||
if tool_call_id:
|
||||
metadata["tool_call_id"] = tool_call_id
|
||||
return {k: v for k, v in metadata.items() if v not in (None, "")}
|
||||
|
||||
def _apply_persist_user_message_override(self, messages: List[Dict]) -> None:
|
||||
"""Rewrite the current-turn user message before persistence/return.
|
||||
|
||||
@@ -4166,49 +4023,6 @@ class AIAgent:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _sync_external_memory_for_turn(
|
||||
self,
|
||||
*,
|
||||
original_user_message: Any,
|
||||
final_response: Any,
|
||||
interrupted: bool,
|
||||
) -> None:
|
||||
"""Mirror a completed turn into external memory providers.
|
||||
|
||||
Called at the end of ``run_conversation`` with the cleaned user
|
||||
message (``original_user_message``) and the finalised assistant
|
||||
response. The external memory backend gets both ``sync_all`` (to
|
||||
persist the exchange) and ``queue_prefetch_all`` (to start
|
||||
warming context for the next turn) in one shot.
|
||||
|
||||
Uses ``original_user_message`` rather than ``user_message``
|
||||
because the latter may carry injected skill content that bloats
|
||||
or breaks provider queries.
|
||||
|
||||
Interrupted turns are skipped entirely (#15218). A partial
|
||||
assistant output, an aborted tool chain, or a mid-stream reset
|
||||
is not durable conversational truth — mirroring it into an
|
||||
external memory backend pollutes future recall with state the
|
||||
user never saw completed. The prefetch is gated on the same
|
||||
flag: the user's next message is almost certainly a retry of
|
||||
the same intent, and a prefetch keyed on the interrupted turn
|
||||
would fire against stale context.
|
||||
|
||||
Normal completed turns still sync as before. The whole body is
|
||||
wrapped in ``try/except Exception`` because external memory
|
||||
providers are strictly best-effort — a misconfigured or offline
|
||||
backend must not block the user from seeing their response.
|
||||
"""
|
||||
if interrupted:
|
||||
return
|
||||
if not (self._memory_manager and final_response and original_user_message):
|
||||
return
|
||||
try:
|
||||
self._memory_manager.sync_all(original_user_message, final_response)
|
||||
self._memory_manager.queue_prefetch_all(original_user_message)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def release_clients(self) -> None:
|
||||
"""Release LLM client resources WITHOUT tearing down session tool state.
|
||||
|
||||
@@ -5141,8 +4955,6 @@ class AIAgent:
|
||||
# response.incomplete instead of response.completed).
|
||||
self._codex_streamed_text_parts: list = []
|
||||
for attempt in range(max_stream_retries + 1):
|
||||
if self._interrupt_requested:
|
||||
raise InterruptedError("Agent interrupted before Codex stream retry")
|
||||
collected_output_items: list = []
|
||||
try:
|
||||
with active_client.responses.stream(**api_kwargs) as stream:
|
||||
@@ -5620,26 +5432,6 @@ class AIAgent:
|
||||
self._try_refresh_anthropic_client_credentials()
|
||||
return self._anthropic_client.messages.create(**api_kwargs)
|
||||
|
||||
def _rebuild_anthropic_client(self) -> None:
    """Rebuild the Anthropic client after an interrupt or stale call.

    Handles both direct Anthropic and Bedrock-hosted Anthropic models
    correctly — rebuilding with the Bedrock SDK when provider is bedrock,
    rather than always falling back to build_anthropic_client() which
    requires a direct Anthropic API key.

    The new client is stored on ``self._anthropic_client``; this method
    does not close the previous client.
    """
    if getattr(self, "provider", None) == "bedrock":
        from agent.anthropic_adapter import build_anthropic_bedrock_client

        # Fall back to us-east-1 when the region attribute is missing or empty.
        region = getattr(self, "_bedrock_region", "us-east-1") or "us-east-1"
        self._anthropic_client = build_anthropic_bedrock_client(region)
    else:
        from agent.anthropic_adapter import build_anthropic_client

        self._anthropic_client = build_anthropic_client(
            self._anthropic_api_key,
            getattr(self, "_anthropic_base_url", None),
            timeout=get_provider_request_timeout(self.provider, self.model),
        )
|
||||
|
||||
def _interruptible_api_call(self, api_kwargs: dict):
|
||||
"""
|
||||
Run the API call in a background thread so the main conversation loop
|
||||
@@ -5675,21 +5467,12 @@ class AIAgent:
|
||||
# bedrock responses like chat_completions responses.
|
||||
from agent.bedrock_adapter import (
|
||||
_get_bedrock_runtime_client,
|
||||
invalidate_runtime_client,
|
||||
is_stale_connection_error,
|
||||
normalize_converse_response,
|
||||
)
|
||||
region = api_kwargs.pop("__bedrock_region__", "us-east-1")
|
||||
api_kwargs.pop("__bedrock_converse__", None)
|
||||
client = _get_bedrock_runtime_client(region)
|
||||
try:
|
||||
raw_response = client.converse(**api_kwargs)
|
||||
except Exception as _bedrock_exc:
|
||||
# Evict the cached client on stale-connection failures
|
||||
# so the outer retry loop builds a fresh client/pool.
|
||||
if is_stale_connection_error(_bedrock_exc):
|
||||
invalidate_runtime_client(region)
|
||||
raise
|
||||
raw_response = client.converse(**api_kwargs)
|
||||
result["response"] = normalize_converse_response(raw_response)
|
||||
else:
|
||||
request_client_holder["client"] = self._create_request_openai_client(reason="chat_completion_request")
|
||||
@@ -5747,8 +5530,14 @@ class AIAgent:
|
||||
)
|
||||
try:
|
||||
if self.api_mode == "anthropic_messages":
|
||||
from agent.anthropic_adapter import build_anthropic_client
|
||||
|
||||
self._anthropic_client.close()
|
||||
self._rebuild_anthropic_client()
|
||||
self._anthropic_client = build_anthropic_client(
|
||||
self._anthropic_api_key,
|
||||
getattr(self, "_anthropic_base_url", None),
|
||||
timeout=get_provider_request_timeout(self.provider, self.model),
|
||||
)
|
||||
else:
|
||||
rc = request_client_holder.get("client")
|
||||
if rc is not None:
|
||||
@@ -5773,8 +5562,14 @@ class AIAgent:
|
||||
# seed future retries.
|
||||
try:
|
||||
if self.api_mode == "anthropic_messages":
|
||||
from agent.anthropic_adapter import build_anthropic_client
|
||||
|
||||
self._anthropic_client.close()
|
||||
self._rebuild_anthropic_client()
|
||||
self._anthropic_client = build_anthropic_client(
|
||||
self._anthropic_api_key,
|
||||
getattr(self, "_anthropic_base_url", None),
|
||||
timeout=get_provider_request_timeout(self.provider, self.model),
|
||||
)
|
||||
else:
|
||||
request_client = request_client_holder.get("client")
|
||||
if request_client is not None:
|
||||
@@ -5930,21 +5725,12 @@ class AIAgent:
|
||||
try:
|
||||
from agent.bedrock_adapter import (
|
||||
_get_bedrock_runtime_client,
|
||||
invalidate_runtime_client,
|
||||
is_stale_connection_error,
|
||||
stream_converse_with_callbacks,
|
||||
)
|
||||
region = api_kwargs.pop("__bedrock_region__", "us-east-1")
|
||||
api_kwargs.pop("__bedrock_converse__", None)
|
||||
client = _get_bedrock_runtime_client(region)
|
||||
try:
|
||||
raw_response = client.converse_stream(**api_kwargs)
|
||||
except Exception as _bedrock_exc:
|
||||
# Evict the cached client on stale-connection failures
|
||||
# so the outer retry loop builds a fresh client/pool.
|
||||
if is_stale_connection_error(_bedrock_exc):
|
||||
invalidate_runtime_client(region)
|
||||
raise
|
||||
raw_response = client.converse_stream(**api_kwargs)
|
||||
|
||||
def _on_text(text):
|
||||
_fire_first()
|
||||
@@ -6196,25 +5982,11 @@ class AIAgent:
|
||||
for idx in sorted(tool_calls_acc):
|
||||
tc = tool_calls_acc[idx]
|
||||
arguments = tc["function"]["arguments"]
|
||||
tool_name = tc["function"]["name"] or "?"
|
||||
if arguments and arguments.strip():
|
||||
try:
|
||||
json.loads(arguments)
|
||||
except json.JSONDecodeError:
|
||||
# Attempt repair before flagging as truncated.
|
||||
# Models like GLM-5.1 via Ollama produce trailing
|
||||
# commas, unclosed brackets, Python None, etc.
|
||||
# Without repair, these hit the truncation handler
|
||||
# and kill the session. _repair_tool_call_arguments
|
||||
# returns "{}" for unrepairable args, which is far
|
||||
# better than a crashed session.
|
||||
repaired = _repair_tool_call_arguments(arguments, tool_name)
|
||||
if repaired != "{}":
|
||||
# Successfully repaired — use the fixed args
|
||||
arguments = repaired
|
||||
else:
|
||||
# Unrepairable — flag for truncation handling
|
||||
has_truncated_tool_args = True
|
||||
has_truncated_tool_args = True
|
||||
mock_tool_calls.append(SimpleNamespace(
|
||||
id=tc["id"],
|
||||
type=tc["type"],
|
||||
@@ -6312,14 +6084,6 @@ class AIAgent:
|
||||
|
||||
try:
|
||||
for _stream_attempt in range(_max_stream_retries + 1):
|
||||
# Check for interrupt before each retry attempt. Without
|
||||
# this, /stop closes the HTTP connection (outer poll loop),
|
||||
# but the retry loop opens a FRESH connection — negating the
|
||||
# interrupt entirely. On slow providers (ollama-cloud) each
|
||||
# retry can block for the full stream-read timeout (120s+),
|
||||
# causing multi-minute delays between /stop and response.
|
||||
if self._interrupt_requested:
|
||||
raise InterruptedError("Agent interrupted before stream retry")
|
||||
try:
|
||||
if self.api_mode == "anthropic_messages":
|
||||
self._try_refresh_anthropic_client_credentials()
|
||||
@@ -6646,8 +6410,14 @@ class AIAgent:
|
||||
if self._interrupt_requested:
|
||||
try:
|
||||
if self.api_mode == "anthropic_messages":
|
||||
from agent.anthropic_adapter import build_anthropic_client
|
||||
|
||||
self._anthropic_client.close()
|
||||
self._rebuild_anthropic_client()
|
||||
self._anthropic_client = build_anthropic_client(
|
||||
self._anthropic_api_key,
|
||||
getattr(self, "_anthropic_base_url", None),
|
||||
timeout=get_provider_request_timeout(self.provider, self.model),
|
||||
)
|
||||
else:
|
||||
request_client = request_client_holder.get("client")
|
||||
if request_client is not None:
|
||||
@@ -7639,12 +7409,6 @@ class AIAgent:
|
||||
raw_reasoning_content = getattr(assistant_message, "reasoning_content", None)
|
||||
if raw_reasoning_content is not None:
|
||||
msg["reasoning_content"] = _sanitize_surrogates(raw_reasoning_content)
|
||||
elif msg.get("tool_calls") and self._needs_deepseek_tool_reasoning():
|
||||
# DeepSeek thinking mode requires reasoning_content on every
|
||||
# assistant tool-call message. Without it, replaying the
|
||||
# persisted message causes HTTP 400. Include empty string
|
||||
# as a defensive compatibility fallback (refs #15250).
|
||||
msg["reasoning_content"] = ""
|
||||
|
||||
if hasattr(assistant_message, 'reasoning_details') and assistant_message.reasoning_details:
|
||||
# Pass reasoning_details back unmodified so providers (OpenRouter,
|
||||
@@ -7720,35 +7484,6 @@ class AIAgent:
|
||||
|
||||
return msg
|
||||
|
||||
def _needs_kimi_tool_reasoning(self) -> bool:
|
||||
"""Return True when the current provider is Kimi / Moonshot thinking mode.
|
||||
|
||||
Kimi ``/coding`` and Moonshot thinking mode both require
|
||||
``reasoning_content`` on every assistant tool-call message; omitting
|
||||
it causes the next replay to fail with HTTP 400.
|
||||
"""
|
||||
return (
|
||||
self.provider in {"kimi-coding", "kimi-coding-cn"}
|
||||
or base_url_host_matches(self.base_url, "api.kimi.com")
|
||||
or base_url_host_matches(self.base_url, "moonshot.ai")
|
||||
or base_url_host_matches(self.base_url, "moonshot.cn")
|
||||
)
|
||||
|
||||
def _needs_deepseek_tool_reasoning(self) -> bool:
|
||||
"""Return True when the current provider is DeepSeek thinking mode.
|
||||
|
||||
DeepSeek V4 thinking mode requires ``reasoning_content`` on every
|
||||
assistant tool-call turn; omitting it causes HTTP 400 when the
|
||||
message is replayed in a subsequent API request (#15250).
|
||||
"""
|
||||
provider = (self.provider or "").lower()
|
||||
model = (self.model or "").lower()
|
||||
return (
|
||||
provider == "deepseek"
|
||||
or "deepseek" in model
|
||||
or base_url_host_matches(self.base_url, "api.deepseek.com")
|
||||
)
|
||||
|
||||
def _copy_reasoning_content_for_api(self, source_msg: dict, api_msg: dict) -> None:
|
||||
"""Copy provider-facing reasoning fields onto an API replay message."""
|
||||
if source_msg.get("role") != "assistant":
|
||||
@@ -7764,14 +7499,13 @@ class AIAgent:
|
||||
api_msg["reasoning_content"] = normalized_reasoning
|
||||
return
|
||||
|
||||
# Providers that require an echoed reasoning_content on every
|
||||
# assistant tool-call turn. Detection logic lives in the per-provider
|
||||
# helpers so both the creation path (_build_assistant_message) and
|
||||
# this replay path stay in sync.
|
||||
if source_msg.get("tool_calls") and (
|
||||
self._needs_kimi_tool_reasoning()
|
||||
or self._needs_deepseek_tool_reasoning()
|
||||
):
|
||||
kimi_requires_reasoning = (
|
||||
self.provider in {"kimi-coding", "kimi-coding-cn"}
|
||||
or base_url_host_matches(self.base_url, "api.kimi.com")
|
||||
or base_url_host_matches(self.base_url, "moonshot.ai")
|
||||
or base_url_host_matches(self.base_url, "moonshot.cn")
|
||||
)
|
||||
if kimi_requires_reasoning and source_msg.get("tool_calls"):
|
||||
api_msg["reasoning_content"] = ""
|
||||
|
||||
@staticmethod
|
||||
@@ -7802,115 +7536,6 @@ class AIAgent:
|
||||
]
|
||||
return api_msg
|
||||
|
||||
@staticmethod
|
||||
def _sanitize_tool_call_arguments(
|
||||
messages: list,
|
||||
*,
|
||||
logger=None,
|
||||
session_id: str = None,
|
||||
) -> int:
|
||||
"""Repair corrupted assistant tool-call argument JSON in-place."""
|
||||
log = logger or logging.getLogger(__name__)
|
||||
if not isinstance(messages, list):
|
||||
return 0
|
||||
|
||||
repaired = 0
|
||||
marker = AIAgent._TOOL_CALL_ARGUMENTS_CORRUPTION_MARKER
|
||||
|
||||
def _prepend_marker(tool_msg: dict) -> None:
|
||||
existing = tool_msg.get("content")
|
||||
if isinstance(existing, str):
|
||||
if not existing:
|
||||
tool_msg["content"] = marker
|
||||
elif not existing.startswith(marker):
|
||||
tool_msg["content"] = f"{marker}\n{existing}"
|
||||
return
|
||||
if existing is None:
|
||||
tool_msg["content"] = marker
|
||||
return
|
||||
try:
|
||||
existing_text = json.dumps(existing)
|
||||
except TypeError:
|
||||
existing_text = str(existing)
|
||||
tool_msg["content"] = f"{marker}\n{existing_text}"
|
||||
|
||||
message_index = 0
|
||||
while message_index < len(messages):
|
||||
msg = messages[message_index]
|
||||
if not isinstance(msg, dict) or msg.get("role") != "assistant":
|
||||
message_index += 1
|
||||
continue
|
||||
|
||||
tool_calls = msg.get("tool_calls")
|
||||
if not isinstance(tool_calls, list) or not tool_calls:
|
||||
message_index += 1
|
||||
continue
|
||||
|
||||
insert_at = message_index + 1
|
||||
for tool_call in tool_calls:
|
||||
if not isinstance(tool_call, dict):
|
||||
continue
|
||||
function = tool_call.get("function")
|
||||
if not isinstance(function, dict):
|
||||
continue
|
||||
|
||||
arguments = function.get("arguments")
|
||||
if arguments is None or arguments == "":
|
||||
function["arguments"] = "{}"
|
||||
continue
|
||||
if isinstance(arguments, str) and not arguments.strip():
|
||||
function["arguments"] = "{}"
|
||||
continue
|
||||
if not isinstance(arguments, str):
|
||||
continue
|
||||
|
||||
try:
|
||||
json.loads(arguments)
|
||||
except json.JSONDecodeError:
|
||||
tool_call_id = tool_call.get("id")
|
||||
function_name = function.get("name", "?")
|
||||
preview = arguments[:80]
|
||||
log.warning(
|
||||
"Corrupted tool_call arguments repaired before request "
|
||||
"(session=%s, message_index=%s, tool_call_id=%s, function=%s, preview=%r)",
|
||||
session_id or "-",
|
||||
message_index,
|
||||
tool_call_id or "-",
|
||||
function_name,
|
||||
preview,
|
||||
)
|
||||
function["arguments"] = "{}"
|
||||
|
||||
existing_tool_msg = None
|
||||
scan_index = message_index + 1
|
||||
while scan_index < len(messages):
|
||||
candidate = messages[scan_index]
|
||||
if not isinstance(candidate, dict) or candidate.get("role") != "tool":
|
||||
break
|
||||
if candidate.get("tool_call_id") == tool_call_id:
|
||||
existing_tool_msg = candidate
|
||||
break
|
||||
scan_index += 1
|
||||
|
||||
if existing_tool_msg is None:
|
||||
messages.insert(
|
||||
insert_at,
|
||||
{
|
||||
"role": "tool",
|
||||
"tool_call_id": tool_call_id,
|
||||
"content": marker,
|
||||
},
|
||||
)
|
||||
insert_at += 1
|
||||
else:
|
||||
_prepend_marker(existing_tool_msg)
|
||||
|
||||
repaired += 1
|
||||
|
||||
message_index += 1
|
||||
|
||||
return repaired
|
||||
|
||||
def _should_sanitize_tool_calls(self) -> bool:
|
||||
"""Determine if tool_calls need sanitization for strict APIs.
|
||||
|
||||
@@ -7924,6 +7549,201 @@ class AIAgent:
|
||||
"""
|
||||
return self.api_mode != "codex_responses"
|
||||
|
||||
def flush_memories(self, messages: list = None, min_turns: int = None):
|
||||
"""Give the model one turn to persist memories before context is lost.
|
||||
|
||||
Called before compression, session reset, or CLI exit. Injects a flush
|
||||
message, makes one API call, executes any memory tool calls, then
|
||||
strips all flush artifacts from the message list.
|
||||
|
||||
Args:
|
||||
messages: The current conversation messages. If None, uses
|
||||
self._session_messages (last run_conversation state).
|
||||
min_turns: Minimum user turns required to trigger the flush.
|
||||
None = use config value (flush_min_turns).
|
||||
0 = always flush (used for compression).
|
||||
"""
|
||||
if self._memory_flush_min_turns == 0 and min_turns is None:
|
||||
return
|
||||
if "memory" not in self.valid_tool_names or not self._memory_store:
|
||||
return
|
||||
effective_min = min_turns if min_turns is not None else self._memory_flush_min_turns
|
||||
if self._user_turn_count < effective_min:
|
||||
return
|
||||
|
||||
if messages is None:
|
||||
messages = getattr(self, '_session_messages', None)
|
||||
if not messages or len(messages) < 3:
|
||||
return
|
||||
|
||||
flush_content = (
|
||||
"[System: The session is being compressed. "
|
||||
"Save anything worth remembering — prioritize user preferences, "
|
||||
"corrections, and recurring patterns over task-specific details.]"
|
||||
)
|
||||
_sentinel = f"__flush_{id(self)}_{time.monotonic()}"
|
||||
flush_msg = {"role": "user", "content": flush_content, "_flush_sentinel": _sentinel}
|
||||
messages.append(flush_msg)
|
||||
|
||||
try:
|
||||
# Build API messages for the flush call
|
||||
_needs_sanitize = self._should_sanitize_tool_calls()
|
||||
api_messages = []
|
||||
for msg in messages:
|
||||
api_msg = msg.copy()
|
||||
self._copy_reasoning_content_for_api(msg, api_msg)
|
||||
api_msg.pop("reasoning", None)
|
||||
api_msg.pop("finish_reason", None)
|
||||
api_msg.pop("_flush_sentinel", None)
|
||||
api_msg.pop("_thinking_prefill", None)
|
||||
if _needs_sanitize:
|
||||
self._sanitize_tool_calls_for_strict_api(api_msg)
|
||||
api_messages.append(api_msg)
|
||||
|
||||
if self._cached_system_prompt:
|
||||
api_messages = [{"role": "system", "content": self._cached_system_prompt}] + api_messages
|
||||
|
||||
# Make one API call with only the memory tool available
|
||||
memory_tool_def = None
|
||||
for t in (self.tools or []):
|
||||
if t.get("function", {}).get("name") == "memory":
|
||||
memory_tool_def = t
|
||||
break
|
||||
|
||||
if not memory_tool_def:
|
||||
messages.pop() # remove flush msg
|
||||
return
|
||||
|
||||
# Use auxiliary client for the flush call when available --
|
||||
# it's cheaper and avoids Codex Responses API incompatibility.
|
||||
from agent.auxiliary_client import (
|
||||
call_llm as _call_llm,
|
||||
_fixed_temperature_for_model,
|
||||
OMIT_TEMPERATURE,
|
||||
)
|
||||
_aux_available = True
|
||||
# Kimi models manage temperature server-side — omit it entirely.
|
||||
# Other models with a fixed contract get that value; everyone else
|
||||
# gets the historical 0.3 default.
|
||||
_fixed_temp = _fixed_temperature_for_model(self.model, self.base_url)
|
||||
_omit_temperature = _fixed_temp is OMIT_TEMPERATURE
|
||||
if _omit_temperature:
|
||||
_flush_temperature = None
|
||||
elif _fixed_temp is not None:
|
||||
_flush_temperature = _fixed_temp
|
||||
else:
|
||||
_flush_temperature = 0.3
|
||||
try:
|
||||
response = _call_llm(
|
||||
task="flush_memories",
|
||||
messages=api_messages,
|
||||
tools=[memory_tool_def],
|
||||
temperature=_flush_temperature,
|
||||
max_tokens=5120,
|
||||
# timeout resolved from auxiliary.flush_memories.timeout config
|
||||
)
|
||||
except RuntimeError:
|
||||
_aux_available = False
|
||||
response = None
|
||||
|
||||
if not _aux_available and self.api_mode == "codex_responses":
|
||||
# No auxiliary client -- use the Codex Responses path directly
|
||||
codex_kwargs = self._build_api_kwargs(api_messages)
|
||||
codex_kwargs["tools"] = self._get_transport().convert_tools([memory_tool_def])
|
||||
if _flush_temperature is not None:
|
||||
codex_kwargs["temperature"] = _flush_temperature
|
||||
else:
|
||||
codex_kwargs.pop("temperature", None)
|
||||
if "max_output_tokens" in codex_kwargs:
|
||||
codex_kwargs["max_output_tokens"] = 5120
|
||||
response = self._run_codex_stream(codex_kwargs)
|
||||
elif not _aux_available and self.api_mode == "anthropic_messages":
|
||||
# Native Anthropic — use the transport for kwargs
|
||||
_tflush = self._get_transport()
|
||||
ant_kwargs = _tflush.build_kwargs(
|
||||
model=self.model, messages=api_messages,
|
||||
tools=[memory_tool_def], max_tokens=5120,
|
||||
reasoning_config=None,
|
||||
preserve_dots=self._anthropic_preserve_dots(),
|
||||
)
|
||||
response = self._anthropic_messages_create(ant_kwargs)
|
||||
elif not _aux_available:
|
||||
api_kwargs = {
|
||||
"model": self.model,
|
||||
"messages": api_messages,
|
||||
"tools": [memory_tool_def],
|
||||
**self._max_tokens_param(5120),
|
||||
}
|
||||
if _flush_temperature is not None:
|
||||
api_kwargs["temperature"] = _flush_temperature
|
||||
from agent.auxiliary_client import _get_task_timeout
|
||||
response = self._ensure_primary_openai_client(reason="flush_memories").chat.completions.create(
|
||||
**api_kwargs, timeout=_get_task_timeout("flush_memories")
|
||||
)
|
||||
|
||||
# Extract tool calls from the response, handling all API formats
|
||||
tool_calls = []
|
||||
if self.api_mode == "codex_responses" and not _aux_available:
|
||||
_ct_flush = self._get_transport()
|
||||
_cnr_flush = _ct_flush.normalize_response(response)
|
||||
if _cnr_flush and _cnr_flush.tool_calls:
|
||||
tool_calls = [
|
||||
SimpleNamespace(
|
||||
id=tc.id, type="function",
|
||||
function=SimpleNamespace(name=tc.name, arguments=tc.arguments),
|
||||
) for tc in _cnr_flush.tool_calls
|
||||
]
|
||||
elif self.api_mode == "anthropic_messages" and not _aux_available:
|
||||
_tfn = self._get_transport()
|
||||
_flush_result = _tfn.normalize_response(response, strip_tool_prefix=self._is_anthropic_oauth)
|
||||
if _flush_result and _flush_result.tool_calls:
|
||||
tool_calls = [
|
||||
SimpleNamespace(
|
||||
id=tc.id, type="function",
|
||||
function=SimpleNamespace(name=tc.name, arguments=tc.arguments),
|
||||
) for tc in _flush_result.tool_calls
|
||||
]
|
||||
elif self.api_mode in ("chat_completions", "bedrock_converse"):
|
||||
# chat_completions / bedrock — normalize through transport
|
||||
_flush_result = self._get_transport().normalize_response(response)
|
||||
if _flush_result.tool_calls:
|
||||
tool_calls = _flush_result.tool_calls
|
||||
elif _aux_available and hasattr(response, "choices") and response.choices:
|
||||
# Auxiliary client returned OpenAI-shaped response while main
|
||||
# api_mode is codex/anthropic — extract tool_calls from .choices
|
||||
_aux_msg = response.choices[0].message
|
||||
if hasattr(_aux_msg, "tool_calls") and _aux_msg.tool_calls:
|
||||
tool_calls = _aux_msg.tool_calls
|
||||
|
||||
for tc in tool_calls:
|
||||
if tc.function.name == "memory":
|
||||
try:
|
||||
args = json.loads(tc.function.arguments)
|
||||
flush_target = args.get("target", "memory")
|
||||
from tools.memory_tool import memory_tool as _memory_tool
|
||||
_memory_tool(
|
||||
action=args.get("action"),
|
||||
target=flush_target,
|
||||
content=args.get("content"),
|
||||
old_text=args.get("old_text"),
|
||||
store=self._memory_store,
|
||||
)
|
||||
if not self.quiet_mode:
|
||||
print(f" 🧠 Memory flush: saved to {args.get('target', 'memory')}")
|
||||
except Exception as e:
|
||||
logger.debug("Memory flush tool call failed: %s", e)
|
||||
except Exception as e:
|
||||
logger.debug("Memory flush API call failed: %s", e)
|
||||
finally:
|
||||
# Strip flush artifacts: remove everything from the flush message onward.
|
||||
# Use sentinel marker instead of identity check for robustness.
|
||||
while messages and messages[-1].get("_flush_sentinel") != _sentinel:
|
||||
messages.pop()
|
||||
if not messages:
|
||||
break
|
||||
if messages and messages[-1].get("_flush_sentinel") == _sentinel:
|
||||
messages.pop()
|
||||
|
||||
def _compress_context(self, messages: list, system_message: str, *, approx_tokens: int = None, task_id: str = "default", focus_topic: str = None) -> tuple:
|
||||
"""Compress conversation context and split the session in SQLite.
|
||||
|
||||
@@ -7942,6 +7762,8 @@ class AIAgent:
|
||||
f"{approx_tokens:,}" if approx_tokens else "unknown", self.model,
|
||||
focus_topic,
|
||||
)
|
||||
# Pre-compression memory flush: let the model save memories before they're lost
|
||||
self.flush_memories(messages, min_turns=0)
|
||||
|
||||
# Notify external memory provider before compression discards context
|
||||
if self._memory_manager:
|
||||
@@ -7957,15 +7779,6 @@ class AIAgent:
|
||||
# focus_topic — fall back to calling without it.
|
||||
compressed = self.context_compressor.compress(messages, current_tokens=approx_tokens)
|
||||
|
||||
summary_error = getattr(self.context_compressor, "_last_summary_error", None)
|
||||
if summary_error:
|
||||
if getattr(self, "_last_compression_summary_warning", None) != summary_error:
|
||||
self._last_compression_summary_warning = summary_error
|
||||
self._emit_warning(
|
||||
f"⚠ Compression summary failed: {summary_error}. "
|
||||
"Inserted a fallback context marker."
|
||||
)
|
||||
|
||||
todo_snapshot = self._todo_store.format_for_injection()
|
||||
if todo_snapshot:
|
||||
compressed.append({"role": "user", "content": todo_snapshot})
|
||||
@@ -8135,10 +7948,6 @@ class AIAgent:
|
||||
function_args.get("action", ""),
|
||||
target,
|
||||
function_args.get("content", ""),
|
||||
metadata=self._build_memory_write_metadata(
|
||||
task_id=effective_task_id,
|
||||
tool_call_id=tool_call_id,
|
||||
),
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
@@ -8650,10 +8459,6 @@ class AIAgent:
|
||||
function_args.get("action", ""),
|
||||
target,
|
||||
function_args.get("content", ""),
|
||||
metadata=self._build_memory_write_metadata(
|
||||
task_id=effective_task_id,
|
||||
tool_call_id=getattr(tool_call, "id", None),
|
||||
),
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
@@ -8898,7 +8703,6 @@ class AIAgent:
|
||||
api_messages = []
|
||||
for msg in messages:
|
||||
api_msg = msg.copy()
|
||||
self._copy_reasoning_content_for_api(msg, api_msg)
|
||||
for internal_field in ("reasoning", "finish_reason", "_thinking_prefill"):
|
||||
api_msg.pop(internal_field, None)
|
||||
if _needs_sanitize:
|
||||
@@ -9529,19 +9333,6 @@ class AIAgent:
|
||||
# Note: Reasoning is embedded in content via <think> tags for trajectory storage.
|
||||
# However, providers like Moonshot AI require a separate 'reasoning_content' field
|
||||
# on assistant messages with tool_calls. We handle both cases here.
|
||||
request_logger = getattr(self, "logger", None) or logging.getLogger(__name__)
|
||||
repaired_tool_calls = self._sanitize_tool_call_arguments(
|
||||
messages,
|
||||
logger=request_logger,
|
||||
session_id=self.session_id,
|
||||
)
|
||||
if repaired_tool_calls > 0:
|
||||
request_logger.info(
|
||||
"Sanitized %s corrupted tool_call arguments before request (session=%s)",
|
||||
repaired_tool_calls,
|
||||
self.session_id or "-",
|
||||
)
|
||||
|
||||
api_messages = []
|
||||
for idx, msg in enumerate(messages):
|
||||
api_msg = msg.copy()
|
||||
@@ -12371,11 +12162,14 @@ class AIAgent:
|
||||
self._iters_since_skill = 0
|
||||
|
||||
# External memory provider: sync the completed turn + queue next prefetch.
|
||||
self._sync_external_memory_for_turn(
|
||||
original_user_message=original_user_message,
|
||||
final_response=final_response,
|
||||
interrupted=interrupted,
|
||||
)
|
||||
# Use original_user_message (clean input) — user_message may contain
|
||||
# injected skill content that bloats / breaks provider queries.
|
||||
if self._memory_manager and final_response and original_user_message:
|
||||
try:
|
||||
self._memory_manager.sync_all(original_user_message, final_response)
|
||||
self._memory_manager.queue_prefetch_all(original_user_message)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Background memory/skill review — runs AFTER the response is delivered
|
||||
# so it never competes with the user's task for model attention.
|
||||
|
||||
@@ -0,0 +1,377 @@
|
||||
# Compression Eval — Design
|
||||
|
||||
Status: implemented (see "Implementation status" below). Nothing under `scripts/compression_eval/` runs in CI.
|
||||
This is an offline tool authors run before merging prompt or algorithm
|
||||
changes to `agent/context_compressor.py`.
|
||||
|
||||
## Why
|
||||
|
||||
We tune the compressor prompt and the `_template_sections` checklist by
|
||||
hand, ship, and wait for the next real session to notice regressions.
|
||||
There is no automated check that a prompt edit still preserves file
|
||||
paths, error messages, or the active task across a compression.
|
||||
|
||||
Factory.ai's December 2025 write-up
|
||||
(https://factory.ai/news/evaluating-compression) describes a
|
||||
probe-based eval that scores compressed state on six dimensions. The
|
||||
methodology is the valuable part — the benchmarks in the post are a
|
||||
marketing piece. We adopt the methodology and discard the scoreboard.
|
||||
|
||||
## Goal
|
||||
|
||||
Given a real session transcript and a bank of probe questions that
|
||||
exercise what the transcript contained, answer:
|
||||
|
||||
1. After `ContextCompressor.compress()` runs, can the agent still
|
||||
answer each probe correctly from the compressed state?
|
||||
2. Which of the six dimensions (accuracy, context awareness, artifact
|
||||
trail, completeness, continuity, instruction following) is the
|
||||
prompt weakest on?
|
||||
3. Does a prompt change improve or regress any dimension vs. the
|
||||
previous run?
|
||||
|
||||
That is the full scope. No "compare against OpenAI and Anthropic"
|
||||
benchmarking, no public scoreboard, no marketing claims.
|
||||
|
||||
## Non-goals
|
||||
|
||||
- Not a pytest. Requires API credentials, costs money, takes minutes
|
||||
per fixture, and output is LLM-graded and non-deterministic.
|
||||
- Not part of `scripts/run_tests.sh`. Not invoked by CI.
|
||||
- Not a replacement for the existing compressor unit tests in
|
||||
`tests/agent/test_context_compressor.py` — those stay as the
|
||||
structural / boundary / tool-pair-sanitization guard.
|
||||
- Not a general trajectory eval. Scoped to context compaction only.
|
||||
|
||||
## Where it lives
|
||||
|
||||
```
|
||||
scripts/compression_eval/
|
||||
├── DESIGN.md # this file
|
||||
├── README.md # how to run, cost expectations, caveats
|
||||
├── run_eval.py # entry point (fire CLI, like sample_and_compress.py)
|
||||
├── scrub_fixtures.py # regenerate fixtures from ~/.hermes/sessions/*.jsonl
|
||||
├── fixtures/ # checked-in scrubbed session snapshots
|
||||
│ ├── feature-impl-context-priority.json
|
||||
│ ├── debug-session-feishu-id-model.json
|
||||
│ └── config-build-competitive-scouts.json
|
||||
├── probes/ # probe banks paired with fixtures
|
||||
│ └── <fixture>.probes.json
|
||||
├── rubric.py # grading prompt + dimension definitions
|
||||
├── grader.py # judge-model call + score parsing
├── report.py # markdown report renderer + `--compare-to` delta mode + per-run JSON dumper
|
||||
├── compressor_driver.py # thin wrapper over ContextCompressor
|
||||
└── results/ # gitignored; timestamped output per run
|
||||
└── .gitkeep
|
||||
```
|
||||
|
||||
`scripts/` is the right home: offline tooling, no CI involvement,
|
||||
precedent already set by `sample_and_compress.py`,
|
||||
`contributor_audit.py`, `discord-voice-doctor.py`.
|
||||
|
||||
`environments/` is for Atropos RL training environments — wrong shape.
|
||||
`tests/` is hermetic and credential-free — incompatible with a
|
||||
probe-based eval that needs a judge model.
|
||||
|
||||
## Fixture format
|
||||
|
||||
A fixture is a single compressed-enough conversation captured from a
|
||||
real session. Stored as JSON (pretty-printed, reviewable in PRs):
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "401-debug",
|
||||
"description": "178-turn session debugging a 401 on /api/auth/login",
|
||||
"model": "anthropic/claude-sonnet-4.6",
|
||||
"context_length": 200000,
|
||||
"messages": [
|
||||
{"role": "system", "content": "..."},
|
||||
{"role": "user", "content": "..."},
|
||||
{"role": "assistant", "content": "...", "tool_calls": [...]},
|
||||
{"role": "tool", "tool_call_id": "...", "content": "..."}
|
||||
],
|
||||
"notes": "Captured 2026-04-24 from session 20260424_*.jsonl; PII scrubbed; secrets redacted via redact_sensitive_text."
|
||||
}
|
||||
```
|
||||
|
||||
### Sourcing fixtures
|
||||
|
||||
Fixtures are scrubbed snapshots of real sessions from the
|
||||
maintainer's `~/.hermes/sessions/*.jsonl` store, generated
|
||||
reproducibly by `scrub_fixtures.py` in this directory. Re-run the
|
||||
scrubber with `python3 scripts/compression_eval/scrub_fixtures.py`
|
||||
to regenerate them after a scrubber change.
|
||||
|
||||
Three shipped fixtures cover three different session shapes:
|
||||
|
||||
| Fixture | Source shape | Messages | Tokens (rough) | Tests |
|
||||
|---|---|---|---|---|
|
||||
| `feature-impl-context-priority` | investigate → patch → test → PR → merge | 75 | ~45k | continuation, artifact trail (2 files modified, 1 PR, ~16k skill_view in head) |
|
||||
| `debug-session-feishu-id-model` | PR triage + upstream docs + decision | 59 | ~28k | recall (PR #, error shape), decision (outcome + reason), large PR diff blocks |
|
||||
| `config-build-competitive-scouts` | iterative config: 11 cron jobs across 7 weekdays | 61 | ~26k | artifact trail (which jobs, which days), iterative-merge |
|
||||
|
||||
The `~26k-45k` token range is below the default 50%-of-200k
|
||||
compression threshold, so the eval will always **force** a
|
||||
`compress()` call rather than wait for the natural trigger. That is
|
||||
the intended shape — we want a controlled single-shot compression so
|
||||
score deltas are attributable to the prompt change, not to whether
|
||||
the threshold happened to fire at the same boundary twice.
|
||||
|
||||
### Scrubber pipeline
|
||||
|
||||
`scrub_fixtures.py` applies, per message:
|
||||
|
||||
1. `agent.redact.redact_sensitive_text` — API keys, tokens,
|
||||
connection strings
|
||||
2. Username paths: `/home/teknium` → `/home/user`
|
||||
3. Personal handles: all case variants of the maintainer name → `user`
|
||||
4. Email addresses → `contributor@example.com`; git
|
||||
`Author: Name <addr>` header lines normalised
|
||||
5. `<REASONING_SCRATCHPAD>...</REASONING_SCRATCHPAD>` and
|
||||
`<think>...</think>` stripped from assistant content
|
||||
6. Messaging-platform user mentions (`<@123456>`, `<@***>`) →
|
||||
`<@user>`
|
||||
7. First user message paraphrased to remove personal voice;
|
||||
subsequent user turns kept verbatim after the redactions above
|
||||
8. System prompt replaced with a generic public-safe placeholder so
|
||||
we don't check in the maintainer's tuned soul/skills/memory system
|
||||
block
|
||||
9. Orphan empty-assistant messages (artifact of scratchpad-only
|
||||
turns) and trailing tool messages with no matching assistant are
|
||||
dropped
|
||||
10. Tool outputs preserved verbatim. An earlier iteration truncated
|
||||
tool bodies over 2KB to keep fixture JSON small, but that defeats
|
||||
the purpose: real sessions have 30KB `skill_view` dumps, 10KB
|
||||
`read_file` outputs, 5KB `web_extract` bodies — compression has
|
||||
to handle them. Truncation is now a no-op; the pipeline note
|
||||
remains in `scrubbing_passes` for audit trail clarity.
|
||||
|
||||
Before every fixture PR: grep the fixture for PII patterns. An
|
||||
audit is embedded at the bottom of the scrubber as comments.
|
||||
|
||||
**Fixtures must stay small.** Target <200 KB per fixture, <500 KB
|
||||
total for the directory. Current total: ~410 KB across three
|
||||
fixtures. Larger sessions are truncated with a
|
||||
`truncated_to: <index>` field in the fixture header so the cut is
|
||||
reviewable.
|
||||
|
||||
## Probe format
|
||||
|
||||
One probe file per fixture, so reviewers can see the question bank
|
||||
evolve alongside the fixture:
|
||||
|
||||
```json
|
||||
{
|
||||
"fixture": "401-debug",
|
||||
"probes": [
|
||||
{
|
||||
"id": "recall-error-code",
|
||||
"type": "recall",
|
||||
"question": "What was the original error code and endpoint?",
|
||||
"expected_facts": ["401", "/api/auth/login"]
|
||||
},
|
||||
{
|
||||
"id": "artifact-files-modified",
|
||||
"type": "artifact",
|
||||
"question": "Which files have been modified in this session?",
|
||||
"expected_facts": ["session_store.py", "redis_client.py"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-next-step",
|
||||
"type": "continuation",
|
||||
"question": "What should we do next?",
|
||||
"expected_facts": ["re-run the integration tests", "restart the worker"]
|
||||
},
|
||||
{
|
||||
"id": "decision-redis-approach",
|
||||
"type": "decision",
|
||||
"question": "What did we decide about the Redis issue?",
|
||||
"expected_facts": ["switch to redis-py 5.x", "pooled connection"]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The four probe types come directly from Factory's methodology:
|
||||
**recall, artifact, continuation, decision**. `expected_facts` gives
|
||||
the grader concrete anchors instead of relying purely on LLM taste.
|
||||
|
||||
Authoring a probe bank is a one-time cost per fixture. 8-12 probes per
|
||||
fixture is the target — enough to cover all four types, few enough to
|
||||
grade in under a minute at reasonable cost.
|
||||
|
||||
## Grading
|
||||
|
||||
Each probe gets scored 0-5 on **six dimensions** (Factory's six):
|
||||
|
||||
| Dimension | What it measures |
|
||||
|-----------------------|-----------------------------------------------------|
|
||||
| accuracy | File paths, function names, error codes are correct |
|
||||
| context_awareness | Reflects current state, not a mid-session snapshot |
|
||||
| artifact_trail | Knows which files were read / modified / created |
|
||||
| completeness | Addresses all parts of the probe |
|
||||
| continuity | Agent can continue without re-fetching |
|
||||
| instruction_following | Probe answered in the requested form |
|
||||
|
||||
Grading is done by a single judge-model call per probe with a
|
||||
deterministic rubric prompt (see `rubric.py`). The rubric includes the
|
||||
`expected_facts` list so the judge has a concrete anchor. Default
|
||||
judge model: whatever the user has configured as their main model at
|
||||
run time (same resolution path as `auxiliary_client.call_llm`). A
|
||||
`--judge-model` flag allows overriding for consistency across runs.
|
||||
|
||||
Non-determinism caveat: two runs of the same fixture will produce
|
||||
different scores. A single run means nothing. Report medians over
|
||||
N=3 runs by default, and require an improvement of >=0.3 on any
|
||||
dimension before claiming a prompt change is a win.
|
||||
|
||||
## Run flow
|
||||
|
||||
```
|
||||
python scripts/compression_eval/run_eval.py [OPTIONS]
|
||||
```
|
||||
|
||||
Options (fire-style, mirroring `sample_and_compress.py`):
|
||||
|
||||
| Flag | Default | Purpose |
|
||||
|------------------------|------------|-------------------------------------------|
|
||||
| `--fixtures` | all | Comma-separated fixture names |
|
||||
| `--runs` | 3 | Runs per fixture (for median) |
|
||||
| `--judge-model` | auto | Override judge model |
|
||||
| `--compressor-model` | auto | Override model used *inside* the compressor |
|
||||
| `--label` | timestamp | Subdirectory under `results/` |
|
||||
| `--focus-topic` | none | Pass-through to `compress(focus_topic=)` |
|
||||
| `--compare-to` | none | Path to a previous run for diff output |
|
||||
|
||||
Steps per fixture per run:
|
||||
|
||||
1. Load fixture JSON and probe bank.
|
||||
2. Construct a `ContextCompressor` against the fixture's model.
|
||||
3. Call `compressor.compress(messages)` — capture the compressed
|
||||
message list.
|
||||
4. For each probe: ask the judge model to role-play as the continuing
|
||||
agent with only the compressed state, then grade the answer on the
|
||||
six dimensions using `rubric.py`.
|
||||
5. Write a per-run JSON to `results/<label>/<fixture>-run-N.json`.
|
||||
6. After all runs, emit a markdown summary to
|
||||
`results/<label>/report.md`.
|
||||
|
||||
## Report format
|
||||
|
||||
Pasted verbatim into PR descriptions that touch the compressor:
|
||||
|
||||
```
|
||||
## Compression eval — label 2026-04-25_13-40-02
|
||||
|
||||
Main model: anthropic/claude-sonnet-4.6 Judge: same
|
||||
3 runs per fixture, medians reported.
|
||||
|
||||
| Fixture | Accuracy | Context | Artifact | Complete | Continuity | Instruction | Overall |
|
||||
|----------------|----------|---------|----------|----------|------------|-------------|---------|
|
||||
| 401-debug | 4.1 | 4.0 | 2.5 | 4.3 | 3.8 | 5.0 | 3.95 |
|
||||
| pr-review | 3.9 | 3.8 | 3.1 | 4.2 | 3.9 | 5.0 | 3.98 |
|
||||
| feature-impl | 4.0 | 3.9 | 2.9 | 4.1 | 4.0 | 5.0 | 3.98 |
|
||||
|
||||
Per-probe misses (score < 3.0):
|
||||
- 401-debug / artifact-files-modified: 1.7 — summary dropped redis_client.py
|
||||
- pr-review / decision-auth-rewrite: 2.3 — outcome captured, reasoning dropped
|
||||
```
|
||||
|
||||
## Cost expectations
|
||||
|
||||
Dominated by the judge calls: 3 fixtures × 10 probes × 3 runs =
|
||||
90 judge calls per eval run. On Claude Sonnet 4.6 that is roughly
|
||||
$0.50-$1.50 per full eval depending on probe length. The compressor
|
||||
itself makes 1 call per fixture × 3 runs = 9 additional calls.
|
||||
|
||||
**This is not a check to run after every commit.** It is a
|
||||
before-merge check for PRs that touch:
|
||||
|
||||
- `agent/context_compressor.py` — any change to `_template_sections`,
|
||||
`_generate_summary`, or `compress()`.
|
||||
- `agent/auxiliary_client.py` — when changing how compression tasks
|
||||
are routed.
|
||||
- `agent/prompt_builder.py` — when the compression-note phrasing
|
||||
changes.
|
||||
|
||||
## Open questions (to resolve before implementing)
|
||||
|
||||
1. **Fixture scrubbing: manual or scripted?** A scripted scrub that
|
||||
also replaces project names / hostnames would lower the cost of
|
||||
contributing a new fixture. Risk: over-aggressive replacement
|
||||
destroys the signal the probe depends on. Propose: start manual,
|
||||
add scripted helpers once we have 3 fixtures and know the common
|
||||
PII shapes.
|
||||
|
||||
2. **Judge model selection.** Factory uses GPT-5.2. We can't pin one
|
||||
— user's main model changes. Options: (a) grade with main model
|
||||
(cheap, inconsistent across users), (b) require a specific judge
|
||||
model (e.g. `claude-sonnet-4.6`), unusable for users without
|
||||
access. Propose (a) with a `--judge-model` override, and make the
|
||||
model name prominent in the report so comparisons across machines
|
||||
are legible.
|
||||
|
||||
3. **Noise floor.** Before landing prompt changes, run the current
|
||||
prompt N=10 times to measure per-dimension stddev. That tells us
|
||||
the minimum delta to call a change significant. Suspect 0.2-0.3 on
|
||||
a 0-5 scale. Decision deferred until after the first fixture is
|
||||
landed.
|
||||
|
||||
4. **Iterative-merge coverage.** The real Factory-vs-Anthropic
|
||||
difference is incremental merge vs. regenerate. A fixture that
|
||||
only compresses once doesn't exercise our iterative path. Add a
|
||||
fourth fixture that forces two compressions (manually chained),
|
||||
with probes that test whether information from the first
|
||||
compression survives the second. Deferred to a follow-up PR.
|
||||
|
||||
## Implementation status
|
||||
|
||||
This PR ships the full eval end-to-end:
|
||||
|
||||
- `scrub_fixtures.py` — reproducible scrubber
|
||||
- `fixtures/` — three scrubbed session fixtures
|
||||
- `probes/` — three probe banks (10-11 probes each, all four types)
|
||||
- `rubric.py` — six-dimension grading rubric + judge-prompt builder + response parser
|
||||
- `compressor_driver.py` — thin wrapper around `ContextCompressor` for forced single-shot compression
|
||||
- `grader.py` — two-phase continuation + grading calls via OpenAI SDK
|
||||
- `report.py` — markdown report renderer + `--compare-to` delta mode + per-run JSON dumper
|
||||
- `run_eval.py` — entry point (`fire`-style CLI)
|
||||
- `tests/scripts/test_compression_eval.py` — 33 unit tests covering rubric parsing, report rendering, fixture/probe loading, and a PII smoke test on the fixtures (LLM paths not tested — they require credentials and are exercised by the eval itself)
|
||||
|
||||
### Noise floor — one empirical data point
|
||||
|
||||
A single same-inputs re-run of `debug-session-feishu-id-model`
|
||||
(compressor + judge = `openai/gpt-5.4-mini` via Nous Portal,
|
||||
runs=1) produced:
|
||||
|
||||
- Run A overall: 3.25
|
||||
- Run B overall: 3.17 (delta -0.08)
|
||||
|
||||
Individual dimensions varied by up to ±0.5 between the two runs on
|
||||
single-run medians. This confirms DESIGN.md's "< 0.3 is noise"
|
||||
guidance is the right order of magnitude for a single-run
|
||||
comparison. With `runs=3` default, per-dimension variance should
|
||||
tighten; noise-floor measurement at N=10 is still a useful
|
||||
follow-up to calibrate precisely.
|
||||
|
||||
## Open follow-ups (not blocking this PR)
|
||||
|
||||
1. **Iterative-merge fixture** — our actual compression win over
|
||||
"regenerate from scratch" approaches is only exercised when
|
||||
`_previous_summary` is re-used on a second compression. None of
|
||||
the three shipped fixtures force two compressions. The natural
|
||||
basis is `config-build-competitive-scouts` (already iterative by
|
||||
shape); splitting it at the Monday/Tuesday boundary would force
|
||||
the second compression to merge rather than regenerate.
|
||||
2. **Noise-floor precision** — run the current prompt N=10 times
|
||||
against one fixture to pin down per-dimension stddev and publish
|
||||
the numbers in README.
|
||||
3. **Scripted scrubber helpers** — the current scrubber is manual
|
||||
per-fixture. A helper that identifies candidate sessions to
|
||||
scrub (by shape or by keyword) would lower the cost of adding
|
||||
fixture #4+.
|
||||
4. **Judge model selection policy** — current code uses whatever
|
||||
the user passes as `--judge-model` (default: same as compressor).
|
||||
Pinning the judge across users would stabilise cross-machine
|
||||
comparisons, at the cost of blocking users without access to
|
||||
the pinned model.
|
||||
@@ -0,0 +1,110 @@
|
||||
# compression_eval
|
||||
|
||||
Offline eval harness for `agent/context_compressor.py`. Runs a real
|
||||
conversation transcript through the compressor, then probes the
|
||||
compressed state with targeted questions graded on six dimensions.
|
||||
|
||||
## When to run
|
||||
|
||||
Before merging changes to:
|
||||
|
||||
- `agent/context_compressor.py` — any change to `_template_sections`,
|
||||
`_generate_summary`, `compress()`, or its boundary logic
|
||||
- `agent/auxiliary_client.py` — when changing how compression tasks
|
||||
are routed
|
||||
- `agent/prompt_builder.py` — when the compression-note phrasing
|
||||
changes
|
||||
|
||||
## Not for CI
|
||||
|
||||
This harness makes real model calls (one compressor call per fixture per
|
||||
run, plus continuation + judge calls for every probe). Costs ~$0.50
|
||||
to ~$1.50 per full run depending on models, takes minutes, is
|
||||
LLM-graded (non-deterministic). It lives in `scripts/` and is
|
||||
invoked by hand. `tests/` and `scripts/run_tests.sh` do not touch it.
|
||||
|
||||
`tests/scripts/test_compression_eval.py` covers the non-LLM code
|
||||
paths (rubric parsing, report rendering, fixture/probe loading, PII
|
||||
smoke check on the checked-in fixtures) and DOES run in CI.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Run all three fixtures, 3 runs each, with your configured provider
|
||||
python3 scripts/compression_eval/run_eval.py
|
||||
|
||||
# Faster iteration — one fixture, one run
|
||||
python3 scripts/compression_eval/run_eval.py \
|
||||
--fixtures=debug-session-feishu-id-model --runs=1
|
||||
|
||||
# Pin a cheap model for both compression + judge (recommended)
|
||||
python3 scripts/compression_eval/run_eval.py \
|
||||
--compressor-provider=nous --compressor-model=openai/gpt-5.4-mini \
|
||||
--judge-provider=nous --judge-model=openai/gpt-5.4-mini \
|
||||
--runs=3 --label=baseline
|
||||
|
||||
# After editing context_compressor.py, rerun with a new label and diff
|
||||
python3 scripts/compression_eval/run_eval.py \
|
||||
--compressor-provider=nous --compressor-model=openai/gpt-5.4-mini \
|
||||
--judge-provider=nous --judge-model=openai/gpt-5.4-mini \
|
||||
--runs=3 --label=my-prompt-tweak \
|
||||
--compare-to=results/baseline
|
||||
```
|
||||
|
||||
Results land in `results/<label>/report.md` and are intended to be
|
||||
pasted verbatim into PR descriptions. `--compare-to` renders a delta
|
||||
column per dimension so reviewers can see "did this actually help?"
|
||||
at a glance.
|
||||
|
||||
Rule of thumb: dimension deltas below ±0.3 are within run-to-run
|
||||
noise on `runs=3`. Publish a bigger N if you want tighter bounds.
|
||||
|
||||
## Fixtures
|
||||
|
||||
Three scrubbed session snapshots live under `fixtures/`:
|
||||
|
||||
- `feature-impl-context-priority.json` — 75 msgs, investigate →
|
||||
patch → test → PR → merge
|
||||
- `debug-session-feishu-id-model.json` — 59 msgs, PR triage +
|
||||
upstream docs + decision
|
||||
- `config-build-competitive-scouts.json` — 61 msgs, iterative
|
||||
config accumulation (11 cron jobs)
|
||||
|
||||
Regenerate them from the maintainer's `~/.hermes/sessions/*.jsonl`
|
||||
with `python3 scripts/compression_eval/scrub_fixtures.py`. The
|
||||
scrubber pipeline and PII-audit checklist are documented in
|
||||
`DESIGN.md` under **Scrubber pipeline**.
|
||||
|
||||
## Probes
|
||||
|
||||
One probe bank per fixture under `probes/`, 10-11 probes each,
|
||||
covering all four types: **recall**, **artifact**, **continuation**,
|
||||
**decision**. Each probe carries an `expected_facts` list of concrete
|
||||
anchors (PR numbers, file paths, error codes, commands run) that the
|
||||
judge sees alongside the assistant's answer.
|
||||
|
||||
## How it scores
|
||||
|
||||
Six dimensions, 0-5 per probe:
|
||||
|
||||
| Dimension | What it measures |
|
||||
|-----------------------|------------------------------------------------------|
|
||||
| accuracy | File paths, function names, PR/issue numbers correct |
|
||||
| context_awareness | Reflects current session state, not a snapshot |
|
||||
| artifact_trail | Correctly enumerates files / commands / PRs |
|
||||
| completeness | Addresses ALL parts of the probe |
|
||||
| continuity | Next assistant could continue without re-fetching |
|
||||
| instruction_following | Answer in the requested form |
|
||||
|
||||
Report renders medians across N runs; probes scoring below 3.0
|
||||
overall surface in a separate section with the judge's specific
|
||||
complaint noted inline.
|
||||
|
||||
## Related
|
||||
|
||||
- `agent/context_compressor.py` — the thing under test
|
||||
- `tests/agent/test_context_compressor.py` — structural unit tests
|
||||
that do run in CI
|
||||
- `scripts/sample_and_compress.py` — the closest existing script in
|
||||
shape (offline, credential-requiring, not in CI)
|
||||
- `DESIGN.md` — full architecture + methodology + open follow-ups
|
||||
@@ -0,0 +1,114 @@
|
||||
"""Wraps ContextCompressor to run a single forced compression on a fixture.
|
||||
|
||||
The real agent loop checks ``should_compress()`` before calling ``compress()``.
|
||||
Fixtures are intentionally sized below the 100k threshold so ``compress()``
|
||||
runs in a controlled, single-shot mode — score deltas attribute to the
|
||||
prompt change, not to whether the threshold happened to fire at the same
|
||||
boundary twice.
|
||||
|
||||
Resolves the provider for the compression call via the same path the real
|
||||
agent uses (``hermes_cli.runtime_provider.resolve_runtime_provider``) so
|
||||
behaviour matches production aside from being a single call.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
# Make sibling imports work whether invoked as a script or as a module.
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
if str(_REPO_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(_REPO_ROOT))
|
||||
|
||||
from agent.context_compressor import ( # noqa: E402
|
||||
ContextCompressor,
|
||||
estimate_messages_tokens_rough,
|
||||
)
|
||||
|
||||
|
||||
def run_compression(
    *,
    messages: List[Dict[str, Any]],
    compressor_model: str,
    compressor_provider: str,
    compressor_base_url: str,
    compressor_api_key: str,
    compressor_api_mode: str,
    context_length: int,
    focus_topic: Optional[str] = None,
    summary_model_override: Optional[str] = None,
) -> Dict[str, Any]:
    """Compress ``messages`` exactly once and report before/after statistics.

    A fresh ``ContextCompressor`` is built from the given model/provider
    settings and ``compress()`` is invoked directly on the fixture messages.

    Returns:
        A dict with the keys ``compressed_messages``, ``summary_text``,
        ``pre_tokens``, ``post_tokens``, ``compression_ratio``
        (``1 - post/pre``, or ``0.0`` when ``pre_tokens`` is not positive),
        ``pre_message_count`` and ``post_message_count``. Token counts come
        from ``estimate_messages_tokens_rough``.
    """
    # Fixed compressor knobs for the eval, plus the caller-supplied endpoint.
    compressor_settings: Dict[str, Any] = {
        "model": compressor_model,
        "threshold_percent": 0.50,
        "protect_first_n": 3,
        "protect_last_n": 20,
        "summary_target_ratio": 0.20,
        "quiet_mode": True,
        # The compressor is handed "" rather than None when no override is set.
        "summary_model_override": summary_model_override or "",
        "base_url": compressor_base_url,
        "api_key": compressor_api_key,
        "config_context_length": context_length,
        "provider": compressor_provider,
        "api_mode": compressor_api_mode,
    }
    engine = ContextCompressor(**compressor_settings)

    tokens_before = estimate_messages_tokens_rough(messages)
    squeezed = engine.compress(
        messages,
        current_tokens=tokens_before,
        focus_topic=focus_topic,
    )
    tokens_after = estimate_messages_tokens_rough(squeezed)

    summary = _extract_summary_from_messages(squeezed)

    # Guard the division: an empty fixture estimates to zero tokens.
    if tokens_before > 0:
        reduction = 1.0 - (tokens_after / tokens_before)
    else:
        reduction = 0.0

    return {
        "compressed_messages": squeezed,
        "summary_text": summary,
        "pre_tokens": tokens_before,
        "post_tokens": tokens_after,
        "compression_ratio": reduction,
        "pre_message_count": len(messages),
        "post_message_count": len(squeezed),
    }
|
||||
|
||||
|
||||
_SUMMARY_MARKERS = (
|
||||
"## Active Task",
|
||||
"## Goal",
|
||||
"## Completed Actions",
|
||||
)
|
||||
|
||||
|
||||
def _extract_summary_from_messages(messages: List[Dict[str, Any]]) -> str:
|
||||
"""Find the structured summary block inside the compressed message list.
|
||||
|
||||
The compressor injects the summary as a user (or system-appended) message
|
||||
near the head. We look for the section-header markers from
|
||||
``_template_sections`` in ``agent/context_compressor.py``.
|
||||
"""
|
||||
for msg in messages:
|
||||
content = msg.get("content")
|
||||
if not isinstance(content, str):
|
||||
if isinstance(content, list):
|
||||
content = "\n".join(
|
||||
p.get("text", "") for p in content if isinstance(p, dict)
|
||||
)
|
||||
else:
|
||||
continue
|
||||
if any(marker in content for marker in _SUMMARY_MARKERS):
|
||||
return content
|
||||
return ""
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,181 @@
|
||||
"""Two-phase probe grading.
|
||||
|
||||
Phase 1 — **Continuation**: simulate the next assistant turn. Feed the
|
||||
compressed message list plus the probe question and ask the continuing
|
||||
model to answer using only the compressed context. This is exactly what
|
||||
a real next-turn call would look like.
|
||||
|
||||
Phase 2 — **Grading**: a separate judge-model call scores the answer on
|
||||
the six rubric dimensions using ``rubric.build_judge_prompt``.
|
||||
|
||||
Both phases use the OpenAI SDK directly against the resolved provider
|
||||
endpoint, so the explicit api_key + base_url we pass always reaches the
|
||||
wire. (``agent.auxiliary_client.call_llm`` is designed for task-tagged
|
||||
auxiliary calls backed by config lookups; for eval we need the explicit
|
||||
credentials to win unconditionally.)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
if str(_REPO_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(_REPO_ROOT))
|
||||
|
||||
from openai import OpenAI # noqa: E402
|
||||
|
||||
from rubric import build_judge_prompt, parse_judge_response # noqa: E402
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
_CONTINUATION_SYSTEM = (
|
||||
"You are the continuing assistant in a long session. Earlier turns have "
|
||||
"been compacted into a handoff summary that is now part of the "
|
||||
"conversation history. The user has just asked you a question. "
|
||||
"Answer using ONLY what you can determine from the conversation history "
|
||||
"you see (including the handoff summary). Do NOT invent details. If the "
|
||||
"summary does not contain a specific fact, say so explicitly rather "
|
||||
"than guessing. Be direct and concrete — cite file paths, PR numbers, "
|
||||
"error codes, and exact values when they are present in the summary."
|
||||
)
|
||||
|
||||
|
||||
def answer_probe(
    *,
    compressed_messages: List[Dict[str, Any]],
    probe_question: str,
    model: str,
    provider: str,
    base_url: str,
    api_key: str,
    max_tokens: int = 1024,
    timeout: Optional[float] = 120.0,
) -> str:
    """Ask the continuation model to answer one probe from compressed context.

    The request is ``[continuation system prompt, *sanitised history, probe]``
    sent through an ``OpenAI`` client built from ``api_key`` / ``base_url``.

    Returns:
        The first choice's message content, stripped. ``None`` content
        becomes ``""`` and any non-string content is passed through ``str``.
    """
    # System messages already in the compressed list are dropped; the
    # continuation prompt is the only system message sent.
    prior_turns = [
        turn for turn in compressed_messages if turn.get("role") != "system"
    ]
    conversation: List[Dict[str, Any]] = [
        {"role": "system", "content": _CONTINUATION_SYSTEM}
    ]
    conversation.extend(_sanitize_for_chat_api(prior_turns))
    conversation.append({"role": "user", "content": probe_question})

    client = OpenAI(api_key=api_key, base_url=base_url, timeout=timeout)
    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        max_tokens=max_tokens,
    )

    answer = completion.choices[0].message.content
    if answer is None:
        answer = ""
    elif not isinstance(answer, str):
        answer = str(answer)
    return answer.strip()
|
||||
|
||||
|
||||
def grade_probe(
    *,
    probe_question: str,
    probe_type: str,
    expected_facts: List[str],
    assistant_answer: str,
    judge_model: str,
    judge_provider: str,
    judge_base_url: str,
    judge_api_key: str,
    max_tokens: int = 512,
    timeout: Optional[float] = 120.0,
) -> Dict[str, Any]:
    """Have the judge model score one probe answer and parse its verdict.

    The prompt comes from ``build_judge_prompt`` and is sent as a single
    user message through an ``OpenAI`` client.

    Returns:
        The dict produced by ``parse_judge_response`` with ``raw`` (the
        judge's text) and ``parse_error`` (``None``) added. If parsing
        raises ``ValueError``, a fallback dict is returned instead: every
        dimension scored 0, empty ``notes``, ``overall`` 0.0, the raw text,
        and the error message in ``parse_error``. Retrying is left to the
        caller.
    """
    judge_prompt = build_judge_prompt(
        probe_question=probe_question,
        probe_type=probe_type,
        expected_facts=expected_facts,
        assistant_answer=assistant_answer,
    )
    client = OpenAI(
        api_key=judge_api_key, base_url=judge_base_url, timeout=timeout
    )
    completion = client.chat.completions.create(
        model=judge_model,
        messages=[{"role": "user", "content": judge_prompt}],
        max_tokens=max_tokens,
    )

    raw = completion.choices[0].message.content or ""
    if not isinstance(raw, str):
        raw = str(raw)

    try:
        verdict = parse_judge_response(raw)
    except ValueError as exc:
        logger.warning("Judge response parse failed: %s | raw=%r", exc, raw[:200])
        from rubric import DIMENSIONS

        # Zeroed scores keep the result shape uniform for the caller.
        return {
            "scores": {dim: 0 for dim in DIMENSIONS},
            "notes": "",
            "overall": 0.0,
            "raw": raw,
            "parse_error": str(exc),
        }

    verdict["raw"] = raw
    verdict["parse_error"] = None
    return verdict
|
||||
|
||||
|
||||
def _sanitize_for_chat_api(
|
||||
messages: List[Dict[str, Any]],
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Drop tool_calls/tool pairs that are incomplete.
|
||||
|
||||
A compressed message list may contain tool_call references whose matching
|
||||
``tool`` result was summarized away, which breaks strict-validator
|
||||
providers (Anthropic, OpenAI). Easiest correct behaviour for the eval:
|
||||
strip tool_calls entirely and drop ``tool`` role messages — the
|
||||
continuation model only needs the summary + recent turns to answer the
|
||||
probe, not the precise tool-call bookkeeping.
|
||||
"""
|
||||
clean: List[Dict[str, Any]] = []
|
||||
for m in messages:
|
||||
role = m.get("role")
|
||||
if role == "tool":
|
||||
# Convert tool result to a plain user note so the continuation
|
||||
# model still sees the content without needing the structured
|
||||
# tool_call_id pairing.
|
||||
content = m.get("content")
|
||||
if isinstance(content, list):
|
||||
content = "\n".join(
|
||||
p.get("text", "") for p in content if isinstance(p, dict)
|
||||
)
|
||||
clean.append({
|
||||
"role": "user",
|
||||
"content": f"[earlier tool result]\n{content or ''}",
|
||||
})
|
||||
continue
|
||||
new = {"role": role, "content": m.get("content", "")}
|
||||
# Drop tool_calls — the downstream assistant message's content
|
||||
# still describes what the agent was doing.
|
||||
clean.append(new)
|
||||
# Collapse consecutive same-role turns into one (alternation rule)
|
||||
merged: List[Dict[str, Any]] = []
|
||||
for m in clean:
|
||||
if merged and merged[-1]["role"] == m["role"]:
|
||||
prev = merged[-1]
|
||||
prev_c = prev.get("content") or ""
|
||||
new_c = m.get("content") or ""
|
||||
prev["content"] = f"{prev_c}\n\n{new_c}" if prev_c else new_c
|
||||
else:
|
||||
merged.append(m)
|
||||
return merged
|
||||
@@ -0,0 +1,96 @@
|
||||
{
|
||||
"fixture": "config-build-competitive-scouts",
|
||||
"description": "Probes for the competitive-scout cron-job setup session. Anchors are which agents were configured, which day of the week each runs, and the full final schedule. This fixture most directly tests artifact-trail and iterative-merge because the job list grows by one per user turn.",
|
||||
"probes": [
|
||||
{
|
||||
"id": "recall-first-repo",
|
||||
"type": "recall",
|
||||
"question": "What was the first repository the user asked to create a scout cron for, and on what day of the week?",
|
||||
"expected_facts": ["openclaw", "Sunday"]
|
||||
},
|
||||
{
|
||||
"id": "recall-closed-source-target",
|
||||
"type": "recall",
|
||||
"question": "One of the scout targets does not have an open-source repository and had to be configured as a web scan instead. Which one, and on what day?",
|
||||
"expected_facts": ["claude code", "Friday", "web scan"]
|
||||
},
|
||||
{
|
||||
"id": "artifact-all-jobs",
|
||||
"type": "artifact",
|
||||
"question": "List every scout cron job created in this session.",
|
||||
"expected_facts": [
|
||||
"openclaw-pr-scout",
|
||||
"nanoclaw-pr-scout",
|
||||
"ironclaw-pr-scout",
|
||||
"kilocode-pr-scout",
|
||||
"codex-pr-scout",
|
||||
"gemini-cli-pr-scout",
|
||||
"cline-pr-scout",
|
||||
"opencode-pr-scout",
|
||||
"claude-code-scout",
|
||||
"aider-pr-scout",
|
||||
"roocode-pr-scout"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "artifact-final-schedule",
|
||||
"type": "artifact",
|
||||
"question": "What is the final weekly schedule? Give the day and the agents scanned on each day.",
|
||||
"expected_facts": [
|
||||
"Sun: openclaw, nanoclaw, ironclaw",
|
||||
"Mon: kilo code",
|
||||
"Tue: codex",
|
||||
"Wed: gemini cli, cline",
|
||||
"Thu: opencode",
|
||||
"Fri: claude code",
|
||||
"Sat: aider, roo"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "artifact-sunday-count",
|
||||
"type": "artifact",
|
||||
"question": "How many cron jobs run on Sunday?",
|
||||
"expected_facts": ["3", "three", "openclaw, nanoclaw, ironclaw"]
|
||||
},
|
||||
{
|
||||
"id": "artifact-total-count",
|
||||
"type": "artifact",
|
||||
"question": "How many scout cron jobs were created in total by the end of the session?",
|
||||
"expected_facts": ["11", "eleven"]
|
||||
},
|
||||
{
|
||||
"id": "decision-kilo-open-source",
|
||||
"type": "decision",
|
||||
"question": "The user asked whether Kilo Code is open source. What was the answer, and what did the user decide to do with it?",
|
||||
"expected_facts": [
|
||||
"yes, open source",
|
||||
"Kilo-Org/kilocode",
|
||||
"added as Monday scout"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "decision-saturday-fill",
|
||||
"type": "decision",
|
||||
"question": "Saturday was the last open day at one point. Which scout(s) were placed on Saturday, and why were those chosen?",
|
||||
"expected_facts": ["aider", "roo", "filled in last based on openrouter popularity / cli comparison rankings"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-execution-time",
|
||||
"type": "continuation",
|
||||
"question": "At what local time of day do these scout cron jobs run?",
|
||||
"expected_facts": ["10 AM Pacific", "17:00 UTC", "0 17 * * *"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-skill-used",
|
||||
"type": "continuation",
|
||||
"question": "Each scout job runs with a specific skill preloaded. Which one?",
|
||||
"expected_facts": ["hermes-agent-dev"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-weekday-coverage",
|
||||
"type": "continuation",
|
||||
"question": "After the session ended, are there any weekdays still uncovered by a scout job?",
|
||||
"expected_facts": ["no", "all 7 days covered", "full week loaded"]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
{
|
||||
"fixture": "debug-session-feishu-id-model",
|
||||
"description": "Probes for the Feishu identity-model PR #8388 triage session. Anchors are the PR number, what the PR actually contained, what upstream docs confirmed, and the final decision + reasoning.",
|
||||
"probes": [
|
||||
{
|
||||
"id": "recall-pr-number",
|
||||
"type": "recall",
|
||||
"question": "What is the PR number under review in this session, and what repository is it against?",
|
||||
"expected_facts": ["PR #8388", "NousResearch/hermes-agent", "hermes-agent"]
|
||||
},
|
||||
{
|
||||
"id": "recall-bug-claim",
|
||||
"type": "recall",
|
||||
"question": "What is the core bug the PR claims to fix? Be specific about the identifier involved.",
|
||||
"expected_facts": ["open_id", "app-scoped", "not canonical", "Feishu identity model"]
|
||||
},
|
||||
{
|
||||
"id": "recall-upstream-confirmation",
|
||||
"type": "recall",
|
||||
"question": "Do upstream Feishu/Lark docs confirm that open_id is app-scoped rather than a canonical cross-app identity?",
|
||||
"expected_facts": ["yes", "confirmed", "open.feishu.cn", "same user has different Open IDs in different apps"]
|
||||
},
|
||||
{
|
||||
"id": "artifact-pr-scope",
|
||||
"type": "artifact",
|
||||
"question": "Roughly how large is PR #8388, and which gateway subsystems does it touch beyond the Feishu adapter?",
|
||||
"expected_facts": ["4647 lines", "gateway/run.py", "cron/scheduler.py", "gateway/config.py", "multi-account", "bind"]
|
||||
},
|
||||
{
|
||||
"id": "artifact-new-tool",
|
||||
"type": "artifact",
|
||||
"question": "Does the PR add a new tool file? If so, what is its path?",
|
||||
"expected_facts": ["tools/feishu_id_tool.py", "new file"]
|
||||
},
|
||||
{
|
||||
"id": "decision-pr-assessment",
|
||||
"type": "decision",
|
||||
"question": "What is the reviewer's overall assessment of PR #8388 — approve, reject, or something more nuanced? Explain in one sentence.",
|
||||
"expected_facts": [
|
||||
"core claim is correct",
|
||||
"scope is wrong",
|
||||
"bait-and-switch",
|
||||
"overbuilt",
|
||||
"implement cleaner ourselves"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "decision-core-claim-validity",
|
||||
"type": "decision",
|
||||
"question": "Setting aside the PR's size, is the underlying identity-model concern technically valid or not?",
|
||||
"expected_facts": ["technically valid", "correct", "open_id is app-scoped"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-next-action",
|
||||
"type": "continuation",
|
||||
"question": "Based on the review outcome, what is the next action the agent has been asked to take regarding this PR?",
|
||||
"expected_facts": ["close the PR", "implement ourselves", "cleaner", "less complex"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-implementation-scope",
|
||||
"type": "continuation",
|
||||
"question": "If implementing the Feishu fix cleanly ourselves, which specific behaviour needs to change — what should replace the current use of open_id?",
|
||||
"expected_facts": ["use union_id", "or user_id", "canonical identity", "cross-app stable ID"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-sources-to-reference",
|
||||
"type": "continuation",
|
||||
"question": "Which upstream documentation sources were fetched during review that should be referenced when writing the clean implementation?",
|
||||
"expected_facts": ["open.feishu.cn", "open.larkoffice.com", "user-identity-introduction"]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
{
|
||||
"fixture": "feature-impl-context-priority",
|
||||
"description": "Probes for the .hermes.md / AGENTS.md / CLAUDE.md / .cursorrules priority feature session. Anchors are the concrete facts the next assistant would need to continue: user's priority order, files modified, helper-function structure, live-test scenarios, and PR number.",
|
||||
"probes": [
|
||||
{
|
||||
"id": "recall-priority-order",
|
||||
"type": "recall",
|
||||
"question": "What is the priority order the user asked for when multiple project-context files are present? List them from highest to lowest priority.",
|
||||
"expected_facts": [".hermes.md", "AGENTS.md", "CLAUDE.md", ".cursorrules", "highest to lowest"]
|
||||
},
|
||||
{
|
||||
"id": "recall-selection-mode",
|
||||
"type": "recall",
|
||||
"question": "When multiple context files exist in the same directory, does the agent now load all of them or pick only one?",
|
||||
"expected_facts": ["only one", "priority-based selection", "highest-priority winner"]
|
||||
},
|
||||
{
|
||||
"id": "artifact-files-modified",
|
||||
"type": "artifact",
|
||||
"question": "Which files in the hermes-agent repository were modified during this session? List them.",
|
||||
"expected_facts": [
|
||||
"agent/prompt_builder.py",
|
||||
"tests/agent/test_prompt_builder.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "artifact-helper-functions",
|
||||
"type": "artifact",
|
||||
"question": "The session introduced separate helper functions for each context-file type. What are their names?",
|
||||
"expected_facts": [
|
||||
"_load_hermes_md",
|
||||
"_load_agents_md",
|
||||
"_load_claude_md",
|
||||
"_load_cursorrules"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "artifact-test-scenarios",
|
||||
"type": "artifact",
|
||||
"question": "A scratch directory was created with scenario subdirectories to live-test the priority chain. Roughly how many scenarios, and what directory was it created under?",
|
||||
"expected_facts": ["10 scenarios", "/tmp/context-priority-test"]
|
||||
},
|
||||
{
|
||||
"id": "decision-claude-md-was-unsupported",
|
||||
"type": "decision",
|
||||
"question": "What was the finding about CLAUDE.md support in the existing loader before this session's changes?",
|
||||
"expected_facts": ["CLAUDE.md was not handled", "not supported", "new handler added"]
|
||||
},
|
||||
{
|
||||
"id": "decision-load-all-or-one",
|
||||
"type": "decision",
|
||||
"question": "Was the decision to load multiple context files when present, or to load only the highest-priority one? Explain the reasoning in one sentence.",
|
||||
"expected_facts": ["load only one", "highest priority", "user preference", "do not want to load multiple"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-pr-number-and-status",
|
||||
"type": "continuation",
|
||||
"question": "A pull request was opened for this feature. What is the PR number and what is its merge status?",
|
||||
"expected_facts": ["PR #2301", "merged", "squash"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-test-suite-result",
|
||||
"type": "continuation",
|
||||
"question": "What was the result of the full test suite run after the implementation changes?",
|
||||
"expected_facts": ["5680 passed", "0 failures", "clean"]
|
||||
},
|
||||
{
|
||||
"id": "continuation-next-step",
|
||||
"type": "continuation",
|
||||
"question": "If asked to pick up this session, what is the current state of main? Anything left to do?",
|
||||
"expected_facts": ["merged to main", "main is current", "nothing outstanding", "pulled"]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,235 @@
|
||||
"""Markdown report rendering + diff-against-baseline for compression-eval runs.
|
||||
|
||||
Report format is optimised for pasting directly into a PR description.
|
||||
Top-of-report table is the per-fixture medians; below that is the
|
||||
probe-by-probe miss list (scores < 3.0 on overall).
|
||||
|
||||
Diff mode (``compare_to``) emits a second table with deltas per fixture
|
||||
per dimension against a previous run directory.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import statistics
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from rubric import DIMENSIONS
|
||||
|
||||
|
||||
def write_run_json(
    *,
    results_dir: Path,
    fixture_name: str,
    run_index: int,
    payload: Dict[str, Any],
) -> Path:
    """Dump one fixture's per-run results as JSON for later diffing.

    Creates ``results_dir`` (including parents) if needed and writes
    ``<fixture_name>-run-<run_index>.json`` inside it.

    Returns:
        The path of the file that was written.
    """
    results_dir.mkdir(parents=True, exist_ok=True)
    path = results_dir / f"{fixture_name}-run-{run_index}.json"
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # must be opened as UTF-8 explicitly rather than with the locale default.
    with path.open("w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)
    return path
|
||||
|
||||
|
||||
def _median(values: List[float]) -> float:
|
||||
return statistics.median(values) if values else 0.0
|
||||
|
||||
|
||||
def _format_score(value: float) -> str:
|
||||
return f"{value:.2f}"
|
||||
|
||||
|
||||
def _format_delta(baseline: float, current: float) -> str:
|
||||
delta = current - baseline
|
||||
if abs(delta) < 0.01:
|
||||
return f"{current:.2f} (±0)"
|
||||
sign = "+" if delta > 0 else ""
|
||||
return f"{current:.2f} ({sign}{delta:.2f})"
|
||||
|
||||
|
||||
def summarize_fixture_runs(
    fixture_runs: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Aggregate several runs of one fixture into median scores.

    Each run payload carries ``fixture_name`` and a ``probes`` list whose
    items have ``id``, ``scores`` (per dimension) and ``overall``. The probe
    set is taken from the first run.

    Returns:
        ``{}`` for an empty input. Otherwise a dict with ``fixture_name``,
        ``runs`` (count), ``dimension_medians``, ``overall_median``,
        ``misses`` (probes whose median overall is below 3.0) and
        ``compression`` (copied from the first run, ``{}`` if absent).
    """
    if not fixture_runs:
        return {}

    first_run = fixture_runs[0]
    fixture_name = first_run["fixture_name"]
    probe_ids = [probe["id"] for probe in first_run["probes"]]

    # Collect every run's scores, keyed by probe id and then by dimension.
    scores_by_probe: Dict[str, Dict[str, List[float]]] = {}
    overall_by_probe: Dict[str, List[float]] = {}
    for pid in probe_ids:
        scores_by_probe[pid] = {dim: [] for dim in DIMENSIONS}
        overall_by_probe[pid] = []

    for run in fixture_runs:
        for probe in run["probes"]:
            pid = probe["id"]
            for dim in DIMENSIONS:
                # A dimension missing from a probe's scores counts as 0.
                scores_by_probe[pid][dim].append(probe["scores"].get(dim, 0))
            overall_by_probe[pid].append(probe["overall"])

    # Two-level median: across runs per probe, then across probes.
    dimension_medians: Dict[str, float] = {
        dim: _median([_median(scores_by_probe[pid][dim]) for pid in probe_ids])
        for dim in DIMENSIONS
    }
    probe_overall_medians = [_median(overall_by_probe[pid]) for pid in probe_ids]
    overall_median = _median(probe_overall_medians)

    misses: List[Dict[str, Any]] = []
    for pid, probe_median in zip(probe_ids, probe_overall_medians):
        if not probe_median < 3.0:
            continue
        # Notes and type are taken from the last run's entry for this probe;
        # both stay empty when the last run has no such probe.
        last_entry = next(
            (probe for probe in fixture_runs[-1]["probes"] if probe["id"] == pid),
            None,
        )
        if last_entry is not None:
            notes = last_entry.get("notes", "")
            probe_type = last_entry.get("type", "")
        else:
            notes = ""
            probe_type = ""
        misses.append({
            "id": pid,
            "type": probe_type,
            "overall_median": probe_median,
            "notes": notes,
        })

    return {
        "fixture_name": fixture_name,
        "runs": len(fixture_runs),
        "dimension_medians": dimension_medians,
        "overall_median": overall_median,
        "misses": misses,
        "compression": first_run.get("compression", {}),
    }
|
||||
|
||||
|
||||
def render_report(
    *,
    label: str,
    compressor_model: str,
    judge_model: str,
    runs_per_fixture: int,
    summaries: List[Dict[str, Any]],
    baseline_summaries: Optional[List[Dict[str, Any]]] = None,
) -> str:
    """Build the markdown report for one eval run.

    Sections, in order: a header with run metadata, the per-fixture score
    table, the compression statistics table, the list of probes whose median
    overall is below 3.0 (only when there are any), and a methodology note.

    ``baseline_summaries`` has the same shape as ``summaries``. When given,
    score cells for fixtures present in the baseline show the delta against
    it.

    Returns:
        The report text, terminated by a newline.
    """
    out: List[str] = [
        f"## Compression eval — label `{label}`",
        "",
        f"- Compressor model: `{compressor_model}`",
        f"- Judge model: `{judge_model}`",
        f"- Runs per fixture: {runs_per_fixture}",
        "- Medians over runs reported",
    ]
    if baseline_summaries:
        out.append("- Deltas shown against baseline run")
    out.append("")

    # Baseline lookup by fixture name; empty when no baseline was supplied.
    baselines: Dict[str, Dict[str, Any]] = {}
    if baseline_summaries:
        for entry in baseline_summaries:
            baselines[entry["fixture_name"]] = entry

    # Score table: one row per fixture, one column per dimension + overall.
    columns = ["Fixture"] + DIMENSIONS + ["overall"]
    out.append("| " + " | ".join(columns) + " |")
    out.append("|" + "|".join(["---"] * len(columns)) + "|")
    for summary in summaries:
        name = summary["fixture_name"]
        prior = baselines.get(name)
        cells = [name]
        for dim in DIMENSIONS:
            value = summary["dimension_medians"][dim]
            if prior and dim in prior.get("dimension_medians", {}):
                cells.append(_format_delta(prior["dimension_medians"][dim], value))
            else:
                cells.append(_format_score(value))
        if prior:
            cells.append(
                _format_delta(prior["overall_median"], summary["overall_median"])
            )
        else:
            cells.append(_format_score(summary["overall_median"]))
        out.append("| " + " | ".join(cells) + " |")
    out.append("")

    # Compression statistics table.
    out.append("### Compression summary")
    out.append("")
    out.append("| Fixture | Pre tokens | Post tokens | Ratio | Pre msgs | Post msgs |")
    out.append("|---|---|---|---|---|---|")
    for summary in summaries:
        stats = summary.get("compression", {})
        out.append(
            f"| {summary['fixture_name']}"
            f" | {stats.get('pre_tokens', 0)}"
            f" | {stats.get('post_tokens', 0)}"
            f" | {stats.get('compression_ratio', 0.0):.1%}"
            f" | {stats.get('pre_message_count', 0)}"
            f" | {stats.get('post_message_count', 0)} |"
        )
    out.append("")

    # Low-scoring probes, grouped by fixture.
    flagged = [summary for summary in summaries if summary["misses"]]
    if flagged:
        out.append("### Probes scoring below 3.0 overall (median)")
        out.append("")
        for summary in flagged:
            out.append(f"**{summary['fixture_name']}**")
            for miss in summary["misses"]:
                suffix = f" — {miss['notes']}" if miss["notes"] else ""
                out.append(
                    f"- `{miss['id']}` ({miss['type']}): "
                    f"{miss['overall_median']:.2f}{suffix}"
                )
            out.append("")

    out.append("### Methodology")
    out.append("")
    out.append(
        "Probe-based eval adapted from "
        "https://factory.ai/news/evaluating-compression. Each fixture is "
        "compressed in a single forced `ContextCompressor.compress()` call, "
        "then a continuation call asks the compressor model to answer each "
        "probe from the compressed state, then the judge model scores the "
        "answer 0-5 on six dimensions. A single run is noisy; medians "
        "across multiple runs are the meaningful signal. Changes below "
        "~0.3 on any dimension are likely within run-to-run noise."
    )
    return "\n".join(out) + "\n"
|
||||
|
||||
|
||||
def load_baseline_summaries(baseline_dir: Path) -> List[Dict[str, Any]]:
    """Load summaries from a previous eval run for --compare-to.

    Reads every ``*-run-*.json`` file in ``baseline_dir`` (in sorted path
    order), groups the payloads by their ``fixture_name`` and re-summarises
    each group with the current ``summarize_fixture_runs``, so the
    aggregation matches whatever summariser is in use for the new run.

    Raises:
        FileNotFoundError: if ``baseline_dir`` does not exist.
    """
    if not baseline_dir.exists():
        raise FileNotFoundError(f"baseline dir not found: {baseline_dir}")

    by_fixture: Dict[str, List[Dict[str, Any]]] = {}
    for path in sorted(baseline_dir.glob("*-run-*.json")):
        # JSON interchange files are UTF-8; do not depend on the locale
        # default, which would mis-decode non-ASCII judge notes.
        with path.open(encoding="utf-8") as fh:
            payload = json.load(fh)
        by_fixture.setdefault(payload["fixture_name"], []).append(payload)

    return [summarize_fixture_runs(runs) for runs in by_fixture.values()]
|
||||
@@ -0,0 +1,198 @@
|
||||
"""Rubric for probe-based compression eval grading.
|
||||
|
||||
Six dimensions scored 0-5 by a judge model. The scoring anchors are spelled
|
||||
out so the judge interpretation is stable across runs and across judge
|
||||
models.
|
||||
|
||||
Adapted from the methodology in
|
||||
https://factory.ai/news/evaluating-compression. Their scoreboard is not
|
||||
adopted; only the dimension definitions and the 0-5 scale.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List
|
||||
|
||||
# Canonical dimension order. All reports, parsers, and comparisons derive
# from this list — do not hardcode the order elsewhere.
DIMENSIONS: List[str] = [
    "accuracy",
    "context_awareness",
    "artifact_trail",
    "completeness",
    "continuity",
    "instruction_following",
]

# Judge-facing definition of each dimension, keyed by dimension name.
# Each adjacent group of literals below is one sentence of the description.
DIMENSION_DESCRIPTIONS: Dict[str, str] = {
    "accuracy": (
        "Are concrete facts correct — file paths, function names, "
        "PR/issue numbers, error codes, command outputs, line numbers? "
        "A single wrong path or error code should cost points. "
        "Vague but non-contradicting answers score mid-range."
    ),
    "context_awareness": (
        "Does the answer reflect the CURRENT state of the session, "
        "not a mid-session snapshot? "
        "For example, if a file was modified then reverted, "
        "does the answer describe the reverted state? "
        "If three PRs were opened, does the answer know which was merged?"
    ),
    "artifact_trail": (
        "Does the answer correctly enumerate the artifacts "
        "(files read, files modified, commands run, tools called, "
        "PRs opened, cron jobs created)? "
        "Missing artifacts cost more than extra unrelated ones."
    ),
    "completeness": (
        "Does the answer address ALL parts of the probe question? "
        "If the probe asks for three things and only two are answered, "
        "that is incomplete regardless of accuracy on the two."
    ),
    "continuity": (
        "Could the next assistant continue the work using only this answer, "
        "without having to re-fetch files or re-explore the codebase? "
        "An answer that lists files by name but doesn't mention the change "
        "is poor continuity even if accurate."
    ),
    "instruction_following": (
        "Is the answer in the format the probe requested "
        "(list, number, short phrase, yes/no)? "
        "Ignore tone and length, only assess whether the requested form "
        "was honoured."
    ),
}

# Meaning of each integer score; the tuple index is the score.
SCORE_SCALE: Dict[int, str] = dict(
    enumerate(
        (
            "No useful information; wrong or hallucinated.",
            "Major gaps or a key fact is wrong.",
            "Partially correct but significant omissions.",
            "Mostly correct with minor omissions or imprecision.",
            "Correct and complete with only trivial imprecision.",
            "Fully correct, complete, and in the requested format.",
        )
    )
)
|
||||
|
||||
|
||||
_RUBRIC_HEADER = """You are an evaluator grading a single answer produced by an AI assistant \
|
||||
that was given a COMPRESSED handoff summary of an earlier conversation and \
|
||||
asked a probe question. You are NOT evaluating the compression summary \
|
||||
directly — you are evaluating whether the answer the assistant produced \
|
||||
from that summary is correct, complete, and useful.
|
||||
|
||||
Grade on six dimensions, each 0-5:
|
||||
|
||||
{dimension_block}
|
||||
|
||||
0-5 scale:
|
||||
{scale_block}
|
||||
|
||||
Grade strictly. Fractional scores are NOT allowed — output integers only. \
|
||||
If the answer is ambiguous, use the lower of the two candidate scores."""
|
||||
|
||||
|
||||
def build_judge_prompt(
    *,
    probe_question: str,
    probe_type: str,
    expected_facts: List[str],
    assistant_answer: str,
) -> str:
    """Build the full judge prompt for one (probe, answer) pair.

    The judge is told the expected_facts up front so grading is anchored to
    concrete signal rather than judge taste. Expected facts are intentionally
    NOT shown to the assistant that produces the answer.

    Args:
        probe_question: The question that was put to the assistant.
        probe_type: Probe category label, echoed verbatim into the prompt.
        expected_facts: Concrete anchors the answer should contain; an empty
            list renders as "(none provided)".
        assistant_answer: The answer text to be graded.

    Returns:
        The prompt string: rubric header, probe metadata, expected facts,
        the answer, and the JSON output schema.
    """
    dim_block = "\n".join(
        f"- {d}: {DIMENSION_DESCRIPTIONS[d]}" for d in DIMENSIONS
    )
    scale_block = "\n".join(
        f" {score}: {desc}" for score, desc in sorted(SCORE_SCALE.items())
    )
    header = _RUBRIC_HEADER.format(
        dimension_block=dim_block,
        scale_block=scale_block,
    )

    expected_block = (
        "\n".join(f"- {f}" for f in expected_facts) if expected_facts else "(none provided)"
    )

    # The schema keys are derived from DIMENSIONS (the same sequence the
    # response parser validates against) so the keys requested from the judge
    # can never drift from the keys the parser requires.
    schema_fields = "".join(f' "{d}": <int 0-5>,\n' for d in DIMENSIONS)
    output_schema = (
        "Respond with ONLY a JSON object, no prose before or after, matching "
        "this schema exactly:\n"
        "{\n"
        f"{schema_fields}"
        ' "notes": "<one short sentence, <=200 chars, identifying the '
        'single biggest issue with the answer if any>"\n'
        "}"
    )

    return (
        f"{header}\n\n"
        f"PROBE TYPE: {probe_type}\n\n"
        f"PROBE QUESTION:\n{probe_question}\n\n"
        f"EXPECTED FACTS (the answer should contain these concrete anchors; "
        f"missing any is a material defect in accuracy and/or completeness):\n"
        f"{expected_block}\n\n"
        f"ASSISTANT ANSWER TO GRADE:\n{assistant_answer}\n\n"
        f"{output_schema}"
    )
|
||||
|
||||
|
||||
def parse_judge_response(raw: str) -> Dict[str, Any]:
    """Parse the judge model's JSON response into a score dict.

    Tolerates surrounding prose (judges ignore instructions sometimes): the
    text is scanned for the first ``{`` from which a complete JSON object can
    be decoded, so trailing commentary -- even commentary containing braces --
    does not break parsing. Every dimension in DIMENSIONS must be present as
    a finite number that rounds to an integer 0-5.

    Args:
        raw: The judge model's raw text output.

    Returns:
        Dict with keys: scores (dim -> int), notes (str, at most 200 chars),
        overall (float mean of the dimension scores).

    Raises:
        ValueError: If the response is empty, contains no decodable JSON
            object, is missing a dimension, or a score is non-numeric,
            non-finite, or outside 0-5.
    """
    import json
    import math
    import re

    if not raw or not raw.strip():
        raise ValueError("empty judge response")

    # Strip code fences and any ```json prefix judges sometimes emit.
    stripped = raw.strip()
    fence_match = re.match(r"^```(?:json)?\s*(.*?)\s*```$", stripped, re.DOTALL)
    if fence_match:
        stripped = fence_match.group(1).strip()

    start = stripped.find("{")
    if start == -1:
        raise ValueError(f"no JSON object found in judge response: {raw[:200]!r}")

    # raw_decode stops at the end of the first complete JSON value, so text
    # after the object is ignored instead of being swallowed by a greedy
    # "{...}" regex match.
    decoder = json.JSONDecoder()
    parsed = None
    first_error = None
    pos = start
    while pos != -1:
        try:
            candidate, _end = decoder.raw_decode(stripped, pos)
        except json.JSONDecodeError as exc:
            if first_error is None:
                first_error = exc
        else:
            if isinstance(candidate, dict):
                parsed = candidate
                break
        pos = stripped.find("{", pos + 1)

    if parsed is None:
        raise ValueError(
            f"judge response not valid JSON: {first_error}; "
            f"raw={stripped[start:start + 200]!r}"
        ) from first_error

    scores: Dict[str, int] = {}
    for dim in DIMENSIONS:
        if dim not in parsed:
            raise ValueError(f"judge response missing dimension {dim!r}: {parsed}")
        value = parsed[dim]
        # bool is a subclass of int; reject it explicitly.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            raise ValueError(f"dimension {dim} is not numeric: {value!r}")
        # json accepts NaN/Infinity literals; round() on those would raise a
        # non-ValueError (OverflowError) or an unhelpful message.
        if isinstance(value, float) and not math.isfinite(value):
            raise ValueError(f"dimension {dim} is not a finite number: {value!r}")
        int_val = int(round(value))
        if int_val < 0 or int_val > 5:
            raise ValueError(f"dimension {dim} out of range: {int_val}")
        scores[dim] = int_val

    notes_val = parsed.get("notes", "")
    notes = str(notes_val)[:200] if notes_val else ""

    overall = sum(scores.values()) / len(scores)
    return {
        "scores": scores,
        "notes": notes,
        "overall": overall,
    }
|
||||
Executable
+383
@@ -0,0 +1,383 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compression eval — entry point.
|
||||
|
||||
Runs the full probe-based eval over one or more fixtures, produces a
|
||||
markdown report in ``results/<label>/report.md`` paired with per-run JSON
|
||||
for later diffing.
|
||||
|
||||
Not a pytest. Requires a configured provider + credentials (same path the
|
||||
agent uses). Does not run in CI. See README.md for usage examples.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
_HERE = Path(__file__).resolve().parent
|
||||
_REPO_ROOT = _HERE.parents[1]
|
||||
if str(_REPO_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(_REPO_ROOT))
|
||||
# Make our sibling modules importable whether invoked as a script or as -m.
|
||||
if str(_HERE) not in sys.path:
|
||||
sys.path.insert(0, str(_HERE))
|
||||
|
||||
try:
|
||||
import fire # noqa: F401
|
||||
except ImportError:
|
||||
fire = None # fallback to argparse if fire is unavailable
|
||||
|
||||
from hermes_cli.runtime_provider import resolve_runtime_provider # noqa: E402
|
||||
|
||||
from compressor_driver import run_compression # noqa: E402
|
||||
from grader import answer_probe, grade_probe # noqa: E402
|
||||
from report import ( # noqa: E402
|
||||
load_baseline_summaries,
|
||||
render_report,
|
||||
summarize_fixture_runs,
|
||||
write_run_json,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("compression_eval")
|
||||
|
||||
|
||||
FIXTURES_DIR = _HERE / "fixtures"
|
||||
PROBES_DIR = _HERE / "probes"
|
||||
RESULTS_DIR = _HERE / "results"
|
||||
|
||||
|
||||
def _load_fixture(name: str) -> Dict[str, Any]:
    """Load ``fixtures/<name>.json`` and return the parsed JSON.

    Raises:
        FileNotFoundError: If the fixture file does not exist; the message
            lists the fixture names that are available.
    """
    path = FIXTURES_DIR / f"{name}.json"
    if not path.exists():
        available = sorted(p.stem for p in FIXTURES_DIR.glob("*.json"))
        raise FileNotFoundError(
            f"Fixture not found: {name}. Available: {available}"
        )
    # Fixtures carry non-ASCII text, so decode as UTF-8 explicitly rather
    # than relying on the platform's locale encoding.
    with path.open(encoding="utf-8") as fh:
        return json.load(fh)
|
||||
|
||||
|
||||
def _load_probes(name: str) -> Dict[str, Any]:
    """Load the probe bank ``probes/<name>.probes.json`` for a fixture.

    Raises:
        FileNotFoundError: If no probe bank exists for the fixture.
    """
    path = PROBES_DIR / f"{name}.probes.json"
    if not path.exists():
        raise FileNotFoundError(f"Probe bank not found for fixture {name}: {path}")
    # Decode as UTF-8 explicitly so loading does not depend on the
    # platform's locale encoding.
    with path.open(encoding="utf-8") as fh:
        return json.load(fh)
|
||||
|
||||
|
||||
def _resolve_runtime(
    *,
    provider_override: Optional[str],
    model_override: Optional[str],
) -> Dict[str, Any]:
    """Resolve provider credentials via the same path the agent uses.

    Returns the mapping produced by ``resolve_runtime_provider``.

    Raises:
        RuntimeError: If the resolved runtime has neither an ``api_key`` nor
            a ``base_url``.
    """
    resolved = resolve_runtime_provider(
        requested=provider_override,
        target_model=model_override,
    )
    # Either an API key or a base URL counts as "configured".
    if resolved.get("api_key") or resolved.get("base_url"):
        return resolved
    raise RuntimeError(
        "No provider configured. Run `hermes setup` or set provider "
        "credentials in the environment before running the eval."
    )
|
||||
|
||||
|
||||
def _available_fixtures() -> List[str]:
    """Return the sorted stem names of every ``*.json`` file in FIXTURES_DIR."""
    stems = [fixture_path.stem for fixture_path in FIXTURES_DIR.glob("*.json")]
    stems.sort()
    return stems
|
||||
|
||||
|
||||
def _run_one_fixture(
    *,
    fixture_name: str,
    run_index: int,
    compressor_runtime: Dict[str, Any],
    compressor_model: str,
    judge_runtime: Dict[str, Any],
    judge_model: str,
    focus_topic: Optional[str],
) -> Dict[str, Any]:
    """Compress one fixture once, then answer and grade each of its probes.

    A failure while answering a probe is logged and the empty string is
    graded in its place; a failure while grading is logged and recorded as
    all-zero scores with the error text in ``notes`` / ``parse_error``.

    Returns:
        Dict with ``fixture_name``, ``run_index``, a ``compression`` stats
        sub-dict, and ``probes`` (one result record per probe).
    """
    fixture = _load_fixture(fixture_name)
    probe_bank = _load_probes(fixture_name)

    logger.info(
        "[%s run=%d] compressing (%d messages, ctx=%d)",
        fixture_name, run_index, len(fixture["messages"]), fixture["context_length"],
    )
    compression = run_compression(
        messages=fixture["messages"],
        compressor_model=compressor_model,
        compressor_provider=compressor_runtime["provider"],
        compressor_base_url=compressor_runtime["base_url"],
        compressor_api_key=compressor_runtime["api_key"],
        compressor_api_mode=compressor_runtime.get("api_mode", ""),
        context_length=fixture["context_length"],
        focus_topic=focus_topic,
        # Force the compressor to use the model we're testing, bypassing
        # any auxiliary.compression.model config override. Without this,
        # ContextCompressor.call_llm(task="compression") routes through
        # the user's config which may pin a different model (e.g.
        # google/gemini-3-flash-preview).
        summary_model_override=compressor_model,
    )
    logger.info(
        "[%s run=%d] compressed %d -> %d tokens (%.1f%%)",
        fixture_name, run_index,
        compression["pre_tokens"], compression["post_tokens"],
        compression["compression_ratio"] * 100,
    )

    results: List[Dict[str, Any]] = []
    for probe in probe_bank["probes"]:
        started = time.monotonic()

        # Step 1: have the assistant answer from the compressed context.
        try:
            answer = answer_probe(
                compressed_messages=compression["compressed_messages"],
                probe_question=probe["question"],
                provider=compressor_runtime["provider"],
                model=compressor_model,
                base_url=compressor_runtime["base_url"],
                api_key=compressor_runtime["api_key"],
            )
        except Exception as exc:
            logger.warning(
                "[%s run=%d probe=%s] continuation failed: %s",
                fixture_name, run_index, probe["id"], exc,
            )
            answer = ""

        # Step 2: have the judge grade that answer.
        try:
            grade = grade_probe(
                probe_question=probe["question"],
                probe_type=probe["type"],
                expected_facts=probe.get("expected_facts", []),
                assistant_answer=answer,
                judge_provider=judge_runtime["provider"],
                judge_model=judge_model,
                judge_base_url=judge_runtime["base_url"],
                judge_api_key=judge_runtime["api_key"],
            )
        except Exception as exc:
            logger.warning(
                "[%s run=%d probe=%s] grading failed: %s",
                fixture_name, run_index, probe["id"], exc,
            )
            from rubric import DIMENSIONS
            grade = {
                "scores": {d: 0 for d in DIMENSIONS},
                "notes": f"grading error: {exc}",
                "overall": 0.0,
                "raw": "",
                "parse_error": str(exc),
            }

        elapsed = time.monotonic() - started
        logger.info(
            "[%s run=%d probe=%s] overall=%.2f (%.1fs)",
            fixture_name, run_index, probe["id"], grade["overall"], elapsed,
        )

        record: Dict[str, Any] = {
            "id": probe["id"],
            "type": probe["type"],
            "question": probe["question"],
            "expected_facts": probe.get("expected_facts", []),
            "answer": answer,
            "scores": grade["scores"],
            "overall": grade["overall"],
            "notes": grade["notes"],
            "parse_error": grade["parse_error"],
            "elapsed_seconds": elapsed,
        }
        results.append(record)

    stat_keys = (
        "pre_tokens",
        "post_tokens",
        "compression_ratio",
        "pre_message_count",
        "post_message_count",
        "summary_text",
    )
    return {
        "fixture_name": fixture_name,
        "run_index": run_index,
        "compression": {key: compression[key] for key in stat_keys},
        "probes": results,
    }
|
||||
|
||||
|
||||
def _coerce_fixtures_arg(arg: Optional[str]) -> List[str]:
    """Normalise the ``fixtures`` CLI value into a list of fixture names.

    Args:
        arg: Comma-separated fixture names. A list/tuple of names is also
            tolerated because python-fire literal-evaluates CLI values and
            may hand over ``--fixtures=a,b`` as a tuple instead of a string.
            Falsy values select every available fixture.

    Returns:
        Stripped, non-empty fixture names in the order given.
    """
    if not arg:
        return _available_fixtures()
    if isinstance(arg, str):
        candidates = arg.split(",")
    elif isinstance(arg, (list, tuple, set, frozenset)):
        candidates = [str(item) for item in arg]
    else:
        # A lone literal (e.g. a number parsed by fire) is one name.
        candidates = [str(arg)]
    return [s.strip() for s in candidates if s.strip()]
|
||||
|
||||
|
||||
def main(
    fixtures: Optional[str] = None,
    runs: int = 3,
    judge_model: Optional[str] = None,
    judge_provider: Optional[str] = None,
    compressor_model: Optional[str] = None,
    compressor_provider: Optional[str] = None,
    label: Optional[str] = None,
    focus_topic: Optional[str] = None,
    compare_to: Optional[str] = None,
    verbose: bool = False,
) -> int:
    """Run the compression eval.

    Args:
        fixtures: Comma-separated fixture names; default = all in fixtures/.
        runs: Runs per fixture. Medians reported. Default 3.
        judge_model: Override the judge model (default = same as
            compressor model resolved from config).
        judge_provider: Override the judge provider.
        compressor_model: Override the compressor model (default =
            whatever resolve_runtime_provider returns for the active
            configuration).
        compressor_provider: Override the compressor provider.
        label: Output subdirectory under results/. Default = timestamp.
        focus_topic: Optional focus topic passed through to
            ContextCompressor.compress(focus_topic=...).
        compare_to: Path to a previous run directory (e.g.
            results/2026-04-24_baseline) to diff against in the report.
            Relative paths are resolved against this script's directory.
        verbose: Print debug logs.

    Returns:
        0 once the report and JSON outputs have been written.
    """
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

    fixture_names = _coerce_fixtures_arg(fixtures)
    # Validate every fixture has a probe bank before spending any money.
    for name in fixture_names:
        _load_fixture(name)
        _load_probes(name)

    # Load the comparison baseline up front for the same reason: a bad
    # --compare-to path should fail before any paid model calls are made,
    # not after every run has completed and before the report is written.
    baseline_summaries: Optional[List[Dict[str, Any]]] = None
    if compare_to:
        baseline_path = Path(compare_to)
        if not baseline_path.is_absolute():
            baseline_path = _HERE / baseline_path
        baseline_summaries = load_baseline_summaries(baseline_path)

    compressor_runtime = _resolve_runtime(
        provider_override=compressor_provider,
        model_override=compressor_model,
    )
    effective_compressor_model = (
        compressor_model or compressor_runtime.get("resolved_model") or "auto"
    )
    if effective_compressor_model == "auto":
        # resolve_runtime_provider doesn't always fill resolved_model;
        # fall back to reading model.default from config.
        from hermes_cli.config import load_config
        cfg = load_config()
        mc = cfg.get("model", {}) or {}
        if isinstance(mc, dict):
            effective_compressor_model = (
                mc.get("default") or mc.get("model") or "anthropic/claude-sonnet-4.6"
            )
        else:
            effective_compressor_model = str(mc) or "anthropic/claude-sonnet-4.6"

    if judge_provider or judge_model:
        judge_runtime = _resolve_runtime(
            provider_override=judge_provider,
            model_override=judge_model,
        )
        effective_judge_model = judge_model or effective_compressor_model
    else:
        # No judge override: the judge shares the compressor's runtime/model.
        judge_runtime = compressor_runtime
        effective_judge_model = effective_compressor_model

    effective_label = label or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    out_dir = RESULTS_DIR / effective_label
    out_dir.mkdir(parents=True, exist_ok=True)

    logger.info(
        "Compression eval starting: label=%s fixtures=%s runs=%d "
        "compressor=%s judge=%s out=%s",
        effective_label, fixture_names, runs,
        effective_compressor_model, effective_judge_model, out_dir,
    )

    all_summaries: List[Dict[str, Any]] = []
    for fixture_name in fixture_names:
        per_run: List[Dict[str, Any]] = []
        for run_i in range(1, runs + 1):
            payload = _run_one_fixture(
                fixture_name=fixture_name,
                run_index=run_i,
                compressor_runtime=compressor_runtime,
                compressor_model=effective_compressor_model,
                judge_runtime=judge_runtime,
                judge_model=effective_judge_model,
                focus_topic=focus_topic,
            )
            # Persist each run immediately so partial results survive a
            # failure in a later run.
            write_run_json(
                results_dir=out_dir,
                fixture_name=fixture_name,
                run_index=run_i,
                payload=payload,
            )
            per_run.append(payload)
        summary = summarize_fixture_runs(per_run)
        all_summaries.append(summary)

    report_md = render_report(
        label=effective_label,
        compressor_model=effective_compressor_model,
        judge_model=effective_judge_model,
        runs_per_fixture=runs,
        summaries=all_summaries,
        baseline_summaries=baseline_summaries,
    )
    report_path = out_dir / "report.md"
    # Explicit UTF-8: the report and summary may contain non-ASCII text and
    # must not depend on the platform's locale encoding.
    report_path.write_text(report_md, encoding="utf-8")

    # Also write a machine-readable summary.json alongside the human report.
    summary_path = out_dir / "summary.json"
    with summary_path.open("w", encoding="utf-8") as fh:
        json.dump(
            {
                "label": effective_label,
                "compressor_model": effective_compressor_model,
                "judge_model": effective_judge_model,
                "runs_per_fixture": runs,
                "fixtures": all_summaries,
            },
            fh,
            indent=2,
            ensure_ascii=False,
        )

    print()
    print(report_md)
    print(f"Report written to {report_path}")
    print(f"Per-run JSON in {out_dir}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    if fire is None:
        # fire is unavailable: fall back to an argparse CLI whose option
        # names map one-to-one onto main()'s keyword arguments.
        import argparse

        cli = argparse.ArgumentParser()
        cli.add_argument("--fixtures")
        cli.add_argument("--runs", type=int, default=3)
        for dest_name in (
            "judge_model",
            "judge_provider",
            "compressor_model",
            "compressor_provider",
        ):
            cli.add_argument("--" + dest_name.replace("_", "-"), dest=dest_name)
        cli.add_argument("--label")
        cli.add_argument("--focus-topic", dest="focus_topic")
        cli.add_argument("--compare-to", dest="compare_to")
        cli.add_argument("--verbose", action="store_true")
        exit_code = main(**vars(cli.parse_args()))
    else:
        # fire preserves docstrings as --help and handles kwarg-style CLI.
        exit_code = fire.Fire(main)
    sys.exit(exit_code)
|
||||
Executable
+381
@@ -0,0 +1,381 @@
|
||||
"""One-shot fixture scrubber for scripts/compression_eval/fixtures/.
|
||||
|
||||
Source: ~/.hermes/sessions/<file>.jsonl
|
||||
Output: <directory containing this script>/fixtures/<name>.json
|
||||
|
||||
Scrubbing passes:
|
||||
1. agent.redact.redact_sensitive_text — API keys, tokens, connection strings
|
||||
2. Username paths — /home/teknium/ → /home/user/, ~/.hermes/ preserved as-is
|
||||
(that path is universal)
|
||||
3. Personal handles — "Teknium"/"teknium"/"teknium1" → "user"
|
||||
4. Reasoning scratchpads — strip <REASONING_SCRATCHPAD>...</REASONING_SCRATCHPAD>
|
||||
blocks and <think>...</think> tags (personality leakage risk)
|
||||
5. session_meta line — drop entirely, we only need the messages
|
||||
6. User message personality — lightly paraphrase the first user message to keep
|
||||
task intent while removing "vibe"; subsequent user turns kept verbatim
|
||||
since they're short instructions
|
||||
|
||||
The fixture format matches DESIGN.md:
|
||||
{
|
||||
"name": "...",
|
||||
"description": "...",
|
||||
"model": "...", # best guess from original session
|
||||
"context_length": 200000,
|
||||
"messages": [...], # OpenAI-format, only role/content/tool_calls/tool_call_id/tool_name
|
||||
"notes": "Scrubbed from ~/.hermes/sessions/... on 2026-04-24"
|
||||
}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List
|
||||
|
||||
# Resolve the hermes-agent checkout relative to this script so agent.redact
|
||||
# imports cleanly whether we run from a worktree or a main clone.
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(_REPO_ROOT))
|
||||
from agent.redact import redact_sensitive_text # noqa: E402
|
||||
|
||||
|
||||
SESSION_DIR = Path.home() / ".hermes" / "sessions"
|
||||
# Resolve FIXTURES_DIR relative to this script so the scrubber runs the
|
||||
# same way inside a worktree, a main checkout, or from a contributor's
|
||||
# clone at a different path.
|
||||
FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures"
|
||||
|
||||
# (source_file, output_name, description, user_first_paraphrase, model_guess, context_length, truncate_at)
|
||||
# truncate_at: keep messages[:truncate_at] (None = keep all). Applied BEFORE
|
||||
# orphan-empty-assistant cleanup.
|
||||
SPECS = [
|
||||
(
|
||||
"20260321_060441_fef7be92.jsonl",
|
||||
"feature-impl-context-priority",
|
||||
"~75-turn feature-impl: user asks how multiple project-context files "
|
||||
"(.hermes.md / AGENTS.md / CLAUDE.md / .cursorrules) are handled when "
|
||||
"all are present; agent investigates the codebase, designs a priority "
|
||||
"order, patches the loader + tests, live-tests with a scenario "
|
||||
"directory, commits to a feature branch, opens a PR, and merges after "
|
||||
"approval. Exercises investigate → decide → implement → verify → "
|
||||
"ship flow with clear artifact trail (2 files modified, 1 PR).",
|
||||
(
|
||||
"If .hermes.md, AGENTS.md, CLAUDE.md, and .cursorrules all exist in "
|
||||
"the same directory, does the agent load all of them or pick one? "
|
||||
"Use the hermes-agent-dev skill to check."
|
||||
),
|
||||
"anthropic/claude-sonnet-4.6",
|
||||
200000,
|
||||
74, # cut at "Merged and pulled. Main is current." — drops trailing unrelated cron-delivery messages
|
||||
),
|
||||
(
|
||||
"20260412_233741_3f2119a8.jsonl",
|
||||
"debug-session-feishu-id-model",
|
||||
"~60-turn debug/triage PR-review session: a third-party bug report "
|
||||
"says the gateway's Feishu adapter misuses the open_id / union_id / "
|
||||
"user_id identity model (open_id is app-scoped, not the bot's "
|
||||
"canonical ID). An open community PR (#8388) tries to fix it. Agent "
|
||||
"reviews the PR against current main, fetches upstream Feishu/Lark "
|
||||
"identity docs, and produces a decision. Exercises long tool-heavy "
|
||||
"context with PR diffs, upstream docs, and a clear decision at the "
|
||||
"end — the classic 'can the summary still name the PR number, the "
|
||||
"root cause, and the decision?' scenario.",
|
||||
(
|
||||
"A community user reports the Feishu/Lark adapter gets the identity "
|
||||
"model wrong — open_id is app-scoped, not the bot's canonical ID. "
|
||||
"There's an open PR #8388 trying to fix it. Use the hermes-agent-dev "
|
||||
"skill and the pr-triage-salvage skill to review it."
|
||||
),
|
||||
"anthropic/claude-sonnet-4.6",
|
||||
200000,
|
||||
58, # end at "Here's my review: ..." — clean decision point before the "close it, implement cleaner" pivot
|
||||
),
|
||||
(
|
||||
"20260328_160817_77bd258b.jsonl",
|
||||
"config-build-competitive-scouts",
|
||||
"~60-turn iterative config/build session: user wants a set of weekly "
|
||||
"cron jobs that scan competing AI coding agents (openclaw, nanoclaw, "
|
||||
"ironclaw, codex, opencode, claude-code, kilo-code, gemini-cli, "
|
||||
"cline, aider, roo) for merged PRs or web updates worth porting to "
|
||||
"hermes-agent. User adds one target per turn; agent creates each cron "
|
||||
"job and re-states the accumulated schedule. Exercises artifact trail "
|
||||
"(which jobs are configured, which days) and iterative state "
|
||||
"accumulation — the canonical case for iterative-merge summarization.",
|
||||
(
|
||||
"Set up a cron job for the agent every Sunday to scan all PRs "
|
||||
"merged into openclaw that week, decide which are worth adding to "
|
||||
"hermes-agent, and open PRs porting those features."
|
||||
),
|
||||
"anthropic/claude-sonnet-4.6",
|
||||
200000,
|
||||
None,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
# Tool output truncation is DELIBERATELY DISABLED.
|
||||
#
|
||||
# An earlier iteration truncated tool outputs > 2KB to keep fixture JSON
|
||||
# files small, but that defeats the whole purpose of the eval. Real
|
||||
# sessions have 30KB skill_view dumps, 10KB read_file outputs, 5KB
|
||||
# web_extract bodies — compression has to either head-protect them,
|
||||
# summarize them, or drop them. A fixture without that load doesn't
|
||||
# exercise the compressor. The size win wasn't worth the signal loss.
|
||||
#
|
||||
# The function remains so the scrubbing_passes record in the fixture
|
||||
# JSON continues to truthfully describe what was applied (no-op in this
|
||||
# configuration).
|
||||
_TOOL_OUTPUT_MAX = None # None disables truncation entirely
|
||||
|
||||
|
||||
def _maybe_truncate_tool_output(text: str, tool_name: str) -> str:
    """Cap a tool output at ``_TOOL_OUTPUT_MAX`` characters, if enabled.

    Returns ``text`` unchanged when truncation is disabled
    (``_TOOL_OUTPUT_MAX is None``), when ``text`` is empty, or when it
    already fits. Otherwise keeps the head of the text and appends a marker
    stating the original length and, when given, the tool name.

    Args:
        text: The tool output.
        tool_name: Name of the tool that produced it; may be empty.
    """
    if _TOOL_OUTPUT_MAX is None or not text or len(text) <= _TOOL_OUTPUT_MAX:
        return text
    # Reserve 200 chars for the marker. Clamp at zero: with a cap below 200
    # a negative slice bound would cut from the END of the text and keep
    # almost all of it, defeating the truncation.
    keep = max(0, _TOOL_OUTPUT_MAX - 200)
    head = text[:keep]
    return (
        head
        + f"\n\n[... tool output truncated for fixture — original was {len(text)} chars"
        + (f" from {tool_name}" if tool_name else "")
        + "]"
    )
|
||||
|
||||
|
||||
_PATH_RE = re.compile(r"/home/teknium\b")
|
||||
# No \b boundaries — some tool content stores newlines as the literal
|
||||
# two-char sequence "\\n" (escaped JSON), so a "\\nTeknium..." run has a
|
||||
# word char ('n') immediately before 'T' and \b fails. Substring match is
|
||||
# safer here; "Teknium" as a substring of an unrelated word is
|
||||
# implausible in this corpus.
|
||||
_USER_RE = re.compile(r"teknium1|Teknium|teknium", re.IGNORECASE)
|
||||
# Only strip scratchpads in ASSISTANT content, not tool results (might be legit)
|
||||
_SCRATCH_RE = re.compile(
|
||||
r"<REASONING_SCRATCHPAD>.*?</REASONING_SCRATCHPAD>\s*", re.DOTALL
|
||||
)
|
||||
_THINK_RE = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
|
||||
# Discord/Telegram user mention leakage in messaging-platform sessions
|
||||
_USER_MENTION_RE = re.compile(r"<@\*{3}>|<@\d+>")
|
||||
# Contributor emails (from git show output etc) — anything@domain.tld
|
||||
# Keep noreply@github-style placeholders obvious; real personal emails get
|
||||
# replaced with a contributor placeholder.
|
||||
_EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
|
||||
# "Author: Name <email>" git-show headers — rewrite the whole line
|
||||
_GIT_AUTHOR_RE = re.compile(r"Author:\s*[^<\n]+<[^>]+>")
|
||||
|
||||
|
||||
def _scrub_text(text: str, *, drop_scratchpads: bool = False) -> str:
    """Run every scrubbing pass over ``text`` and return the result.

    Falsy input is returned unchanged. ``drop_scratchpads`` additionally
    removes reasoning-scratchpad and think-tag blocks; callers enable it
    only for assistant messages, so tool outputs that happen to contain
    similar markers are left alone.
    """
    if not text:
        return text

    substitutions = []
    if drop_scratchpads:
        substitutions.append((_SCRATCH_RE, ""))
        substitutions.append((_THINK_RE, ""))
    substitutions.extend(
        [
            (_PATH_RE, "/home/user"),
            (_USER_RE, "user"),
            (_USER_MENTION_RE, "<@user>"),
            # Git "Author: Name <email>" lines go before the generic email
            # replacement so the whole header is rewritten in one piece.
            (_GIT_AUTHOR_RE, "Author: contributor <contributor@example.com>"),
            (_EMAIL_RE, "contributor@example.com"),
        ]
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)

    return redact_sensitive_text(text)
|
||||
|
||||
|
||||
def _content_to_str(content: Any) -> str:
    """Flatten a message ``content`` value into a single string.

    - ``None`` becomes ``""``.
    - A string is returned as-is.
    - A list is treated as content parts: plain strings and the ``"text"``
      value of dict parts are joined with newlines; every other part is
      dropped.
    - Anything else is converted with ``str()``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts: List[str] = []
        for part in content:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict) and "text" in part:
                text = part["text"]
                # A null or non-string "text" would make str.join raise
                # TypeError; skip nulls and stringify anything else.
                if text is None:
                    continue
                parts.append(text if isinstance(text, str) else str(text))
        return "\n".join(parts)
    return str(content)
|
||||
|
||||
|
||||
def _scrub_tool_calls(tool_calls: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Rebuild tool calls with only id/type/function fields, scrubbing arguments.

    Non-dict entries are dropped. String ``arguments`` go through
    ``_scrub_text``; non-string arguments are passed through unchanged.
    A falsy ``tool_calls`` yields an empty list.
    """
    cleaned: List[Dict[str, Any]] = []
    for call in tool_calls or []:
        if isinstance(call, dict):
            function = call.get("function", {}) or {}
            arguments = function.get("arguments", "")
            cleaned.append(
                {
                    "id": call.get("id", ""),
                    "type": call.get("type", "function"),
                    "function": {
                        "name": function.get("name", ""),
                        "arguments": (
                            _scrub_text(arguments)
                            if isinstance(arguments, str)
                            else arguments
                        ),
                    },
                }
            )
    return cleaned
|
||||
|
||||
|
||||
def _scrub_message(m: Dict[str, Any], *, first_user_paraphrase: str | None, user_turn_idx: List[int]) -> Dict[str, Any] | None:
    """Scrub one raw session record into a fixture message.

    Args:
        m: Raw record from the session file.
        first_user_paraphrase: Replacement text for the first user turn, or
            ``None`` to scrub that turn like any other.
        user_turn_idx: One-element list used as a mutable counter of user
            turns seen so far; incremented in place for each user message.

    Returns:
        A new message dict with ``role`` and ``content`` (plus ``tool_calls``
        for assistant messages and ``tool_call_id`` / ``tool_name`` for tool
        messages when present), or ``None`` for records without a role and
        for ``session_meta`` records.
    """
    role = m.get("role")
    if role is None or role == "session_meta":
        return None

    text = _content_to_str(m.get("content"))

    if role == "user":
        user_turn_idx[0] += 1
        # Only the very first user turn is swapped for the paraphrase.
        is_first_turn = user_turn_idx[0] == 1
        if is_first_turn and first_user_paraphrase is not None:
            text = first_user_paraphrase
        else:
            text = _scrub_text(text)
    else:
        # Scratchpads are stripped from assistant content only.
        text = _scrub_text(text, drop_scratchpads=(role == "assistant"))
        if role == "tool":
            # Truncate large tool outputs
            producer = m.get("tool_name") or m.get("name") or ""
            text = _maybe_truncate_tool_output(text, producer)

    scrubbed: Dict[str, Any] = {"role": role, "content": text}

    if role == "assistant":
        calls = m.get("tool_calls") or []
        if calls:
            scrubbed["tool_calls"] = _scrub_tool_calls(calls)
    elif role == "tool":
        if m.get("tool_call_id"):
            scrubbed["tool_call_id"] = m["tool_call_id"]
        tool_label = m.get("tool_name") or m.get("name")
        if tool_label:
            scrubbed["tool_name"] = tool_label

    return scrubbed
|
||||
|
||||
|
||||
def build_fixture(
    source_file: str,
    output_name: str,
    description: str,
    first_user_paraphrase: str,
    model_guess: str,
    context_length: int,
    truncate_at: int | None = None,
) -> Dict[str, Any]:
    """Build one scrubbed fixture dict from a session JSONL file.

    Args:
        source_file: File name inside SESSION_DIR.
        output_name: Value stored under the fixture's ``name`` key.
        description: Human-readable description stored in the fixture.
        first_user_paraphrase: Text that replaces the first user message.
        model_guess: Model name recorded in the fixture.
        context_length: Context length recorded in the fixture.
        truncate_at: Keep only the first ``truncate_at`` non-session_meta
            messages (``None`` keeps all). Applied before empty-assistant
            cleanup.

    Returns:
        The fixture dict: metadata, the list of scrubbing passes, and the
        scrubbed ``messages`` headed by a synthetic system message.
    """
    from datetime import timezone

    src = SESSION_DIR / source_file
    raw_msgs: List[Dict[str, Any]] = []
    skipped = 0
    with src.open(encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Best-effort: a malformed line is skipped, but counted so
                # the loss is reported instead of passing silently.
                skipped += 1
                continue
            # Only JSON objects are messages; anything else would break the
            # .get() calls below.
            if isinstance(record, dict):
                raw_msgs.append(record)
            else:
                skipped += 1
    if skipped:
        print(
            f"warning: skipped {skipped} unparseable or non-object line(s) in {src}",
            file=sys.stderr,
        )

    # Skip session_meta lines up front so truncate_at counts real messages
    raw_msgs = [m for m in raw_msgs if m.get("role") != "session_meta"]
    if truncate_at is not None:
        raw_msgs = raw_msgs[:truncate_at]

    user_turn_counter = [0]
    scrubbed: List[Dict[str, Any]] = []
    for m in raw_msgs:
        new = _scrub_message(
            m,
            first_user_paraphrase=first_user_paraphrase,
            user_turn_idx=user_turn_counter,
        )
        if new is not None:
            scrubbed.append(new)

    # Drop empty-content assistant messages that have no tool_calls
    # (artifact of scratchpad-only turns post-scrub)
    pruned: List[Dict[str, Any]] = []
    for m in scrubbed:
        if (
            m["role"] == "assistant"
            and not (m.get("content") or "").strip()
            and not m.get("tool_calls")
        ):
            continue
        pruned.append(m)
    # Trim trailing orphan tool messages (no matching assistant)
    while pruned and pruned[-1]["role"] == "tool":
        pruned.pop()
    scrubbed = pruned

    # Inject a synthetic public-safe system message so the compressor has
    # a head to anchor on. The real system prompts embed personality and
    # platform-specific content we don't want checked in.
    system_msg = {
        "role": "system",
        "content": (
            "You are a helpful AI coding assistant with access to tools "
            "(terminal, file editing, search, web, etc.). You operate in a "
            "conversational loop: the user gives you a task, you call tools "
            "to accomplish it, and you report back concisely."
        ),
    }
    if scrubbed and scrubbed[0].get("role") == "system":
        scrubbed[0] = system_msg
    else:
        scrubbed.insert(0, system_msg)

    fixture = {
        "name": output_name,
        "description": description,
        "model": model_guess,
        "context_length": context_length,
        "source": f"~/.hermes/sessions/{source_file}",
        "truncated_to": truncate_at,
        # The trailing "Z" denotes UTC, so the timestamp must be taken in
        # UTC rather than local time.
        "scrubbed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "scrubbing_passes": [
            "redact_sensitive_text (agent.redact)",
            "username paths replaced with /home/user",
            "personal handles (all case variants of the maintainer name) replaced with 'user'",
            "email addresses replaced with contributor@example.com",
            "git 'Author: Name <addr>' header lines normalised",
            "reasoning scratchpad blocks stripped from assistant content",
            "think tag blocks stripped from assistant content",
            "messaging-platform user mentions replaced with <@user>",
            "first user message paraphrased to remove personal voice",
            "subsequent user messages kept verbatim (after above redactions)",
            "system prompt replaced with generic public-safe placeholder",
            "orphan empty-assistant messages and trailing tool messages dropped",
            "tool outputs preserved verbatim (truncation disabled so the compressor sees real load)",
        ],
        "messages": scrubbed,
    }
    return fixture
|
||||
|
||||
|
||||
def main() -> int:
    """Build every fixture described in ``SPECS`` and write it as JSON.

    Each spec is unpacked into the seven arguments of ``build_fixture``; the
    resulting dict is written to ``FIXTURES_DIR/<output_name>.json`` and a
    one-line size/message-count summary is printed.

    Returns:
        ``0``; the ``__main__`` guard passes it to ``sys.exit``.
    """
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
    for spec in SPECS:
        source_file, output_name, description, paraphrase, model, ctx, truncate = spec
        fixture = build_fixture(
            source_file=source_file,
            output_name=output_name,
            description=description,
            first_user_paraphrase=paraphrase,
            model_guess=model,
            context_length=ctx,
            truncate_at=truncate,
        )
        out_path = FIXTURES_DIR / f"{output_name}.json"
        # ensure_ascii=False writes non-ASCII characters verbatim, so the file
        # encoding must be pinned; the platform default (e.g. cp1252) could
        # raise UnicodeEncodeError or produce JSON that is not valid UTF-8.
        with out_path.open("w", encoding="utf-8") as fh:
            json.dump(fixture, fh, indent=2, ensure_ascii=False)
        size_kb = out_path.stat().st_size / 1024
        print(f" {output_name}.json {size_kb:.1f} KB {len(fixture['messages'])} msgs")
    return 0
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
+7
-99
@@ -29,25 +29,10 @@ BOLD='\033[1m'
|
||||
REPO_URL_SSH="git@github.com:NousResearch/hermes-agent.git"
|
||||
REPO_URL_HTTPS="https://github.com/NousResearch/hermes-agent.git"
|
||||
HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
|
||||
# INSTALL_DIR is resolved AFTER arg parsing and OS detection so we can pick an
|
||||
# FHS-style layout for root installs. Track whether the user gave us an
|
||||
# explicit directory — if so we never override it.
|
||||
if [ -n "${HERMES_INSTALL_DIR:-}" ]; then
|
||||
INSTALL_DIR="$HERMES_INSTALL_DIR"
|
||||
INSTALL_DIR_EXPLICIT=true
|
||||
else
|
||||
INSTALL_DIR=""
|
||||
INSTALL_DIR_EXPLICIT=false
|
||||
fi
|
||||
INSTALL_DIR="${HERMES_INSTALL_DIR:-$HERMES_HOME/hermes-agent}"
|
||||
PYTHON_VERSION="3.11"
|
||||
NODE_VERSION="22"
|
||||
|
||||
# FHS-style root install layout (set by resolve_install_layout when applicable):
|
||||
# code at /usr/local/lib/hermes-agent, command at /usr/local/bin/hermes,
|
||||
# data still at /root/.hermes (HERMES_HOME). Matches Claude Code / Codex CLI
|
||||
# and keeps Docker bind-mounted /root/ volumes lean.
|
||||
ROOT_FHS_LAYOUT=false
|
||||
|
||||
# Options
|
||||
USE_VENV=true
|
||||
RUN_SETUP=true
|
||||
@@ -79,7 +64,6 @@ while [[ $# -gt 0 ]]; do
|
||||
;;
|
||||
--dir)
|
||||
INSTALL_DIR="$2"
|
||||
INSTALL_DIR_EXPLICIT=true
|
||||
shift 2
|
||||
;;
|
||||
--hermes-home)
|
||||
@@ -95,20 +79,9 @@ while [[ $# -gt 0 ]]; do
|
||||
echo " --no-venv Don't create virtual environment"
|
||||
echo " --skip-setup Skip interactive setup wizard"
|
||||
echo " --branch NAME Git branch to install (default: main)"
|
||||
echo " --dir PATH Installation directory"
|
||||
echo " default (non-root): ~/.hermes/hermes-agent"
|
||||
echo " default (root, Linux): /usr/local/lib/hermes-agent"
|
||||
echo " --dir PATH Installation directory (default: ~/.hermes/hermes-agent)"
|
||||
echo " --hermes-home PATH Data directory (default: ~/.hermes, or \$HERMES_HOME)"
|
||||
echo " -h, --help Show this help"
|
||||
echo ""
|
||||
echo "Notes:"
|
||||
echo " When running as root on Linux, Hermes installs the code under"
|
||||
echo " /usr/local/lib/hermes-agent and links the command into"
|
||||
echo " /usr/local/bin/hermes (FHS layout — matches Claude Code / Codex CLI)."
|
||||
echo " Data, config, sessions, and logs still live in \$HERMES_HOME"
|
||||
echo " (default /root/.hermes). This keeps Docker bind-mounted volumes"
|
||||
echo " small and ensures the command is on PATH for all shells."
|
||||
echo " Existing installs at \$HERMES_HOME/hermes-agent are preserved in-place."
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
@@ -190,60 +163,9 @@ is_termux() {
|
||||
[ -n "${TERMUX_VERSION:-}" ] || [[ "${PREFIX:-}" == *"com.termux/files/usr"* ]]
|
||||
}
|
||||
|
||||
# Decide where the repo checkout + venv live, and where the `hermes` command
|
||||
# symlink goes. Called after detect_os so $OS/$DISTRO are known.
|
||||
#
|
||||
# Defaults:
|
||||
# - Non-root, any OS: INSTALL_DIR = $HERMES_HOME/hermes-agent
|
||||
# command link in $HOME/.local/bin
|
||||
# - Termux (any uid): INSTALL_DIR = $HERMES_HOME/hermes-agent
|
||||
# command link in $PREFIX/bin (already on PATH)
|
||||
# - Root on Linux (new): INSTALL_DIR = /usr/local/lib/hermes-agent
|
||||
# command link in /usr/local/bin
|
||||
# (unless a legacy install already exists at
|
||||
# $HERMES_HOME/hermes-agent — then preserve it)
|
||||
#
|
||||
# Always no-op when the user set --dir or $HERMES_INSTALL_DIR.
|
||||
resolve_install_layout() {
    # User-scoped checkout location shared by the Termux, legacy-root and
    # non-root branches below.
    local user_scoped_dir="$HERMES_HOME/hermes-agent"

    # An explicit --dir / $HERMES_INSTALL_DIR is never overridden.
    if [ "$INSTALL_DIR_EXPLICIT" = true ]; then
        log_info "Install directory: $INSTALL_DIR (explicit)"
        return 0
    fi

    # Termux, and anything that is not "root on Linux", keeps the code under
    # HERMES_HOME. (macOS root installs land here too: /usr/local there is
    # Homebrew territory.)
    if is_termux || ! { [ "$OS" = "linux" ] && [ "$(id -u)" -eq 0 ]; }; then
        INSTALL_DIR="$user_scoped_dir"
        return 0
    fi

    # Root on Linux with an existing checkout: keep it where it is.
    if [ -d "$user_scoped_dir/.git" ]; then
        INSTALL_DIR="$user_scoped_dir"
        log_info "Existing install detected at $INSTALL_DIR — keeping legacy layout"
        log_info " (new root installs use /usr/local/lib/hermes-agent)"
        return 0
    fi

    # Fresh root install on Linux: switch to the FHS layout.
    INSTALL_DIR="/usr/local/lib/hermes-agent"
    ROOT_FHS_LAYOUT=true
    log_info "Root install on Linux — using FHS layout"
    log_info " Code: $INSTALL_DIR"
    log_info " Command: /usr/local/bin/hermes"
    log_info " Data: $HERMES_HOME (unchanged)"
    return 0
}
|
||||
|
||||
get_command_link_dir() {
|
||||
if is_termux && [ -n "${PREFIX:-}" ]; then
|
||||
echo "$PREFIX/bin"
|
||||
elif [ "$ROOT_FHS_LAYOUT" = true ]; then
|
||||
echo "/usr/local/bin"
|
||||
else
|
||||
echo "$HOME/.local/bin"
|
||||
fi
|
||||
@@ -252,8 +174,6 @@ get_command_link_dir() {
|
||||
get_command_link_display_dir() {
|
||||
if is_termux && [ -n "${PREFIX:-}" ]; then
|
||||
echo '$PREFIX/bin'
|
||||
elif [ "$ROOT_FHS_LAYOUT" = true ]; then
|
||||
echo '/usr/local/bin'
|
||||
else
|
||||
echo '~/.local/bin'
|
||||
fi
|
||||
@@ -1055,14 +975,6 @@ setup_path() {
|
||||
return 0
|
||||
fi
|
||||
|
||||
# FHS layout: /usr/local/bin is on PATH for every standard shell, nothing to inject.
|
||||
if [ "$ROOT_FHS_LAYOUT" = true ]; then
|
||||
export PATH="$command_link_dir:$PATH"
|
||||
log_info "/usr/local/bin is already on PATH for all shells"
|
||||
log_success "hermes command ready"
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Check if ~/.local/bin is on PATH; if not, add it to shell config.
|
||||
# Detect the user's actual login shell (not the shell running this script,
|
||||
# which is always bash when piped from curl).
|
||||
@@ -1427,12 +1339,12 @@ print_success() {
|
||||
echo ""
|
||||
|
||||
# Show file locations
|
||||
echo -e "${CYAN}${BOLD}📁 Your files:${NC}"
|
||||
echo -e "${CYAN}${BOLD}📁 Your files (all in ~/.hermes/):${NC}"
|
||||
echo ""
|
||||
echo -e " ${YELLOW}Config:${NC} $HERMES_HOME/config.yaml"
|
||||
echo -e " ${YELLOW}API Keys:${NC} $HERMES_HOME/.env"
|
||||
echo -e " ${YELLOW}Data:${NC} $HERMES_HOME/cron/, sessions/, logs/"
|
||||
echo -e " ${YELLOW}Code:${NC} $INSTALL_DIR"
|
||||
echo -e " ${YELLOW}Config:${NC} ~/.hermes/config.yaml"
|
||||
echo -e " ${YELLOW}API Keys:${NC} ~/.hermes/.env"
|
||||
echo -e " ${YELLOW}Data:${NC} ~/.hermes/cron/, sessions/, logs/"
|
||||
echo -e " ${YELLOW}Code:${NC} ~/.hermes/hermes-agent/"
|
||||
echo ""
|
||||
|
||||
echo -e "${CYAN}─────────────────────────────────────────────────────────${NC}"
|
||||
@@ -1452,9 +1364,6 @@ print_success() {
|
||||
if [ "$DISTRO" = "termux" ]; then
|
||||
echo -e "${YELLOW}⚡ 'hermes' was linked into $(get_command_link_display_dir), which is already on PATH in Termux.${NC}"
|
||||
echo ""
|
||||
elif [ "$ROOT_FHS_LAYOUT" = true ]; then
|
||||
echo -e "${YELLOW}⚡ 'hermes' was linked into /usr/local/bin and is ready to use — no shell reload needed.${NC}"
|
||||
echo ""
|
||||
else
|
||||
echo -e "${YELLOW}⚡ Reload your shell to use 'hermes' command:${NC}"
|
||||
echo ""
|
||||
@@ -1506,7 +1415,6 @@ main() {
|
||||
print_banner
|
||||
|
||||
detect_os
|
||||
resolve_install_layout
|
||||
install_uv
|
||||
check_python
|
||||
check_git
|
||||
|
||||
@@ -48,9 +48,6 @@ AUTHOR_MAP = {
|
||||
"jefferson@heimdallstrategy.com": "Mind-Dragon",
|
||||
"130918800+devorun@users.noreply.github.com": "devorun",
|
||||
"maks.mir@yahoo.com": "say8hi",
|
||||
"web3blind@users.noreply.github.com": "web3blind",
|
||||
"julia@alexland.us": "alexg0bot",
|
||||
"1060770+benjaminsehl@users.noreply.github.com": "benjaminsehl",
|
||||
# contributors (from noreply pattern)
|
||||
"david.vv@icloud.com": "davidvv",
|
||||
"wangqiang@wangqiangdeMac-mini.local": "xiaoqiang243",
|
||||
@@ -62,19 +59,14 @@ AUTHOR_MAP = {
|
||||
"keifergu@tencent.com": "keifergu",
|
||||
"kshitijk4poor@users.noreply.github.com": "kshitijk4poor",
|
||||
"abner.the.foreman@agentmail.to": "Abnertheforeman",
|
||||
"thomasgeorgevii09@gmail.com": "tochukwuada",
|
||||
"harryykyle1@gmail.com": "hharry11",
|
||||
"kshitijk4poor@gmail.com": "kshitijk4poor",
|
||||
"keira.voss94@gmail.com": "keiravoss94",
|
||||
"16443023+stablegenius49@users.noreply.github.com": "stablegenius49",
|
||||
"simbamax99@gmail.com": "simbam99",
|
||||
"185121704+stablegenius49@users.noreply.github.com": "stablegenius49",
|
||||
"101283333+batuhankocyigit@users.noreply.github.com": "batuhankocyigit",
|
||||
"255305877+ismell0992-afk@users.noreply.github.com": "ismell0992-afk",
|
||||
"cyprian@ironin.pl": "iRonin",
|
||||
"valdi.jorge@gmail.com": "jvcl",
|
||||
"q19dcp@gmail.com": "aj-nt",
|
||||
"ebukau84@gmail.com": "UgwujaGeorge",
|
||||
"francip@gmail.com": "francip",
|
||||
"omni@comelse.com": "omnissiah-comelse",
|
||||
"oussama.redcode@gmail.com": "mavrickdeveloper",
|
||||
@@ -92,7 +84,6 @@ AUTHOR_MAP = {
|
||||
"104278804+Sertug17@users.noreply.github.com": "Sertug17",
|
||||
"112503481+caentzminger@users.noreply.github.com": "caentzminger",
|
||||
"258577966+voidborne-d@users.noreply.github.com": "voidborne-d",
|
||||
"xydarcher@uestc.edu.cn": "Readon",
|
||||
"sir_even@icloud.com": "sirEven",
|
||||
"36056348+sirEven@users.noreply.github.com": "sirEven",
|
||||
"70424851+insecurejezza@users.noreply.github.com": "insecurejezza",
|
||||
@@ -115,7 +106,6 @@ AUTHOR_MAP = {
|
||||
"30841158+n-WN@users.noreply.github.com": "n-WN",
|
||||
"tsuijinglei@gmail.com": "hiddenpuppy",
|
||||
"jerome@clawwork.ai": "HiddenPuppy",
|
||||
"jerome.benoit@sap.com": "jerome-benoit",
|
||||
"wysie@users.noreply.github.com": "Wysie",
|
||||
"leoyuan0099@gmail.com": "keyuyuan",
|
||||
"bxzt2006@163.com": "Only-Code-A",
|
||||
@@ -210,9 +200,6 @@ AUTHOR_MAP = {
|
||||
"1434494126@qq.com": "5park1e",
|
||||
"158153005+5park1e@users.noreply.github.com": "5park1e",
|
||||
"innocarpe@gmail.com": "innocarpe",
|
||||
"noreply@ked.com": "qike-ms",
|
||||
"andrekurait@gmail.com": "AndreKurait",
|
||||
"bsgdigital@users.noreply.github.com": "bsgdigital",
|
||||
"numman.ali@gmail.com": "nummanali",
|
||||
"rohithsaimidigudla@gmail.com": "whitehatjr1001",
|
||||
"0xNyk@users.noreply.github.com": "0xNyk",
|
||||
@@ -503,9 +490,6 @@ AUTHOR_MAP = {
|
||||
"zhangxicen@example.com": "zhangxicen",
|
||||
"codex@openai.invalid": "teknium1",
|
||||
"screenmachine@gmail.com": "teknium1",
|
||||
"chenzeshi@live.com": "chen1749144759",
|
||||
"mor.aleksandr@yahoo.com": "MorAlekss",
|
||||
"ash@users.noreply.github.com": "ash",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -134,7 +134,6 @@ masks = processor.image_processor.post_process_masks(
|
||||
|
||||
### Model architecture
|
||||
|
||||
<!-- ascii-guard-ignore -->
|
||||
```
|
||||
SAM Architecture:
|
||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||
@@ -145,7 +144,6 @@ SAM Architecture:
|
||||
Image Embeddings Prompt Embeddings Masks + IoU
|
||||
(computed once) (per prompt) predictions
|
||||
```
|
||||
<!-- ascii-guard-ignore-end -->
|
||||
|
||||
### Model variants
|
||||
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
"""Resolve HERMES_HOME for standalone skill scripts.
|
||||
|
||||
Skill scripts may run outside the Hermes process (e.g. system Python,
|
||||
nix env, CI) where ``hermes_constants`` is not importable. This module
|
||||
provides the same ``get_hermes_home()`` and ``display_hermes_home()``
|
||||
contracts as ``hermes_constants`` without requiring it on ``sys.path``.
|
||||
|
||||
When ``hermes_constants`` IS available it is used directly so that any
|
||||
future enhancements (profile resolution, Docker detection, etc.) are
|
||||
picked up automatically. The fallback path replicates the core logic
|
||||
from ``hermes_constants.py`` using only the stdlib.
|
||||
|
||||
All scripts under ``google-workspace/scripts/`` should import from here
|
||||
instead of duplicating the ``HERMES_HOME = Path(os.getenv(...))`` pattern.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
try:
    # Prefer the real implementation whenever the Hermes package is importable.
    from hermes_constants import (
        display_hermes_home as display_hermes_home,
        get_hermes_home as get_hermes_home,
    )
except ImportError:  # ModuleNotFoundError is a subclass of ImportError

    def get_hermes_home() -> Path:
        """Return the Hermes home directory.

        A non-blank ``HERMES_HOME`` environment variable wins; otherwise the
        result is ``~/.hermes``.
        """
        configured = os.environ.get("HERMES_HOME", "").strip()
        if configured:
            return Path(configured)
        return Path.home() / ".hermes"

    def display_hermes_home() -> str:
        """Return the Hermes home as a display string.

        Paths under the user's home directory are shown with a ``~/`` prefix;
        any other path is returned unchanged.
        """
        resolved = get_hermes_home()
        try:
            tail = resolved.relative_to(Path.home())
        except ValueError:
            # Not located under the home directory: show the full path.
            return str(resolved)
        return "~/" + str(tail)
|
||||
@@ -31,14 +31,7 @@ from datetime import datetime, timedelta, timezone
|
||||
from email.mime.text import MIMEText
|
||||
from pathlib import Path
|
||||
|
||||
# Ensure sibling modules (_hermes_home) are importable when run standalone.
|
||||
_SCRIPTS_DIR = str(Path(__file__).resolve().parent)
|
||||
if _SCRIPTS_DIR not in sys.path:
|
||||
sys.path.insert(0, _SCRIPTS_DIR)
|
||||
|
||||
from _hermes_home import get_hermes_home
|
||||
|
||||
HERMES_HOME = get_hermes_home()
|
||||
HERMES_HOME = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
|
||||
TOKEN_PATH = HERMES_HOME / "google_token.json"
|
||||
CLIENT_SECRET_PATH = HERMES_HOME / "google_client_secret.json"
|
||||
|
||||
|
||||
@@ -10,12 +10,9 @@ import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# Ensure sibling modules (_hermes_home) are importable when run standalone.
|
||||
_SCRIPTS_DIR = str(Path(__file__).resolve().parent)
|
||||
if _SCRIPTS_DIR not in sys.path:
|
||||
sys.path.insert(0, _SCRIPTS_DIR)
|
||||
|
||||
from _hermes_home import get_hermes_home
|
||||
def get_hermes_home() -> Path:
    """Return the Hermes data directory.

    Uses ``$HERMES_HOME`` when it holds a non-blank value and falls back to
    ``~/.hermes`` otherwise.  A blank value is treated as unset because
    ``Path("")`` would silently resolve to the current working directory.
    """
    configured = os.environ.get("HERMES_HOME", "").strip()
    return Path(configured) if configured else Path.home() / ".hermes"
|
||||
|
||||
|
||||
def get_token_path() -> Path:
|
||||
|
||||
@@ -21,8 +21,6 @@ Agent workflow:
|
||||
6. Run --check to verify. Done.
|
||||
"""
|
||||
|
||||
from __future__ import annotations # allow PEP 604 `X | None` on Python 3.9+
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
@@ -30,12 +28,13 @@ import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Ensure sibling modules (_hermes_home) are importable when run standalone.
|
||||
_SCRIPTS_DIR = str(Path(__file__).resolve().parent)
|
||||
if _SCRIPTS_DIR not in sys.path:
|
||||
sys.path.insert(0, _SCRIPTS_DIR)
|
||||
|
||||
from _hermes_home import display_hermes_home, get_hermes_home
|
||||
try:
|
||||
from hermes_constants import display_hermes_home, get_hermes_home
|
||||
except ModuleNotFoundError:
|
||||
HERMES_AGENT_ROOT = Path(__file__).resolve().parents[4]
|
||||
if HERMES_AGENT_ROOT.exists():
|
||||
sys.path.insert(0, str(HERMES_AGENT_ROOT))
|
||||
from hermes_constants import display_hermes_home, get_hermes_home
|
||||
|
||||
HERMES_HOME = get_hermes_home()
|
||||
TOKEN_PATH = HERMES_HOME / "google_token.json"
|
||||
@@ -112,11 +111,7 @@ def install_deps():
|
||||
return True
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"ERROR: Failed to install dependencies: {e}")
|
||||
print(
|
||||
"On environments without pip (e.g. Nix), install the optional extra instead:"
|
||||
)
|
||||
print(" pip install 'hermes-agent[google]'")
|
||||
print(f"Or manually: {sys.executable} -m pip install {' '.join(REQUIRED_PACKAGES)}")
|
||||
print(f"Try manually: {sys.executable} -m pip install {' '.join(REQUIRED_PACKAGES)}")
|
||||
return False
|
||||
|
||||
|
||||
|
||||
@@ -22,7 +22,6 @@ End-to-end pipeline for producing publication-ready ML/AI research papers target
|
||||
|
||||
This is **not a linear pipeline** — it is an iterative loop. Results trigger new experiments. Reviews trigger new analysis. The agent must handle these feedback loops.
|
||||
|
||||
<!-- ascii-guard-ignore -->
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ RESEARCH PAPER PIPELINE │
|
||||
@@ -42,7 +41,6 @@ This is **not a linear pipeline** — it is an iterative loop. Results trigger n
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
<!-- ascii-guard-ignore-end -->
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -386,7 +386,7 @@ class TestProvidersDictApiModeAnthropicMessages:
|
||||
},
|
||||
},
|
||||
"auxiliary": {
|
||||
"compression": {
|
||||
"flush_memories": {
|
||||
"provider": "myrelay",
|
||||
"model": "claude-sonnet-4.6",
|
||||
},
|
||||
@@ -399,11 +399,11 @@ class TestProvidersDictApiModeAnthropicMessages:
|
||||
AnthropicAuxiliaryClient,
|
||||
AsyncAnthropicAuxiliaryClient,
|
||||
)
|
||||
async_client, async_model = get_async_text_auxiliary_client("compression")
|
||||
async_client, async_model = get_async_text_auxiliary_client("flush_memories")
|
||||
assert isinstance(async_client, AsyncAnthropicAuxiliaryClient)
|
||||
assert async_model == "claude-sonnet-4.6"
|
||||
|
||||
sync_client, sync_model = get_text_auxiliary_client("compression")
|
||||
sync_client, sync_model = get_text_auxiliary_client("flush_memories")
|
||||
assert isinstance(sync_client, AnthropicAuxiliaryClient)
|
||||
assert sync_model == "claude-sonnet-4.6"
|
||||
|
||||
|
||||
@@ -1230,210 +1230,3 @@ class TestEmptyTextBlockFix:
|
||||
from agent.bedrock_adapter import _convert_content_to_converse
|
||||
blocks = _convert_content_to_converse("Hello")
|
||||
assert blocks[0]["text"] == "Hello"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stale-connection detection and per-region client invalidation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestInvalidateRuntimeClient:
    """Per-region eviction used to discard dead/stale bedrock-runtime clients."""

    def test_evicts_only_the_target_region(self):
        """Evicting one region returns True and leaves the other region's entry intact."""
        from agent.bedrock_adapter import (
            _bedrock_runtime_client_cache,
            invalidate_runtime_client,
            reset_client_cache,
        )
        # NOTE(review): presumably empties the shared cache in place — confirm.
        reset_client_cache()
        # Plain strings stand in for clients; only cache membership is checked.
        _bedrock_runtime_client_cache["us-east-1"] = "dead-client"
        _bedrock_runtime_client_cache["us-west-2"] = "live-client"

        evicted = invalidate_runtime_client("us-east-1")

        assert evicted is True
        assert "us-east-1" not in _bedrock_runtime_client_cache
        assert _bedrock_runtime_client_cache["us-west-2"] == "live-client"

    def test_returns_false_when_region_not_cached(self):
        """Invalidating a region with no cached client reports False."""
        from agent.bedrock_adapter import invalidate_runtime_client, reset_client_cache
        reset_client_cache()
        assert invalidate_runtime_client("eu-west-1") is False
|
||||
|
||||
|
||||
class TestIsStaleConnectionError:
    """Classifier that decides whether an exception warrants client eviction."""

    def test_detects_botocore_connection_closed_error(self):
        """botocore ``ConnectionClosedError`` is classified as stale."""
        from agent.bedrock_adapter import is_stale_connection_error
        from botocore.exceptions import ConnectionClosedError
        exc = ConnectionClosedError(endpoint_url="https://bedrock.example")
        assert is_stale_connection_error(exc) is True

    def test_detects_botocore_endpoint_connection_error(self):
        """botocore ``EndpointConnectionError`` is classified as stale."""
        from agent.bedrock_adapter import is_stale_connection_error
        from botocore.exceptions import EndpointConnectionError
        exc = EndpointConnectionError(endpoint_url="https://bedrock.example")
        assert is_stale_connection_error(exc) is True

    def test_detects_botocore_read_timeout(self):
        """botocore ``ReadTimeoutError`` is classified as stale."""
        from agent.bedrock_adapter import is_stale_connection_error
        from botocore.exceptions import ReadTimeoutError
        exc = ReadTimeoutError(endpoint_url="https://bedrock.example")
        assert is_stale_connection_error(exc) is True

    def test_detects_urllib3_protocol_error(self):
        """urllib3 ``ProtocolError`` is classified as stale."""
        from agent.bedrock_adapter import is_stale_connection_error
        from urllib3.exceptions import ProtocolError
        exc = ProtocolError("Connection broken")
        assert is_stale_connection_error(exc) is True

    def test_detects_library_internal_assertion_error(self):
        """A bare AssertionError raised from inside urllib3/botocore signals
        a corrupted connection-pool invariant and should trigger eviction."""
        from agent.bedrock_adapter import is_stale_connection_error

        # Fabricate an AssertionError whose traceback's last frame belongs
        # to a module named "urllib3.connectionpool". We do this by exec'ing
        # a tiny `assert False` under a fake globals dict — the resulting
        # frame's ``f_globals["__name__"]`` is what the classifier inspects.
        fake_globals = {"__name__": "urllib3.connectionpool"}
        try:
            exec("def _boom():\n assert False\n_boom()", fake_globals)
        except AssertionError as exc:
            assert is_stale_connection_error(exc) is True
        else:
            # Guard: the exec'd snippet must raise for the test to mean anything.
            pytest.fail("AssertionError not raised")

    def test_detects_botocore_internal_assertion_error(self):
        """Same as above but for a frame inside the botocore namespace."""
        from agent.bedrock_adapter import is_stale_connection_error
        fake_globals = {"__name__": "botocore.httpsession"}
        try:
            exec("def _boom():\n assert False\n_boom()", fake_globals)
        except AssertionError as exc:
            assert is_stale_connection_error(exc) is True
        else:
            pytest.fail("AssertionError not raised")

    def test_ignores_application_assertion_error(self):
        """AssertionError from application code (not urllib3/botocore) should
        NOT be classified as stale — those are real test/code bugs."""
        from agent.bedrock_adapter import is_stale_connection_error
        try:
            assert False, "test-only"  # noqa: B011
        except AssertionError as exc:
            assert is_stale_connection_error(exc) is False

    def test_ignores_unrelated_exceptions(self):
        """``ValueError`` and ``KeyError`` instances are not classified as stale."""
        from agent.bedrock_adapter import is_stale_connection_error
        assert is_stale_connection_error(ValueError("bad input")) is False
        assert is_stale_connection_error(KeyError("missing")) is False
|
||||
|
||||
|
||||
class TestCallConverseInvalidatesOnStaleError:
    """call_converse / call_converse_stream evict the cached client when the
    boto3 call raises a stale-connection error — so the next invocation
    reconnects instead of reusing the dead socket."""

    def test_converse_evicts_client_on_stale_error(self):
        """``call_converse`` re-raises ``ConnectionClosedError`` and drops the region's client."""
        from agent.bedrock_adapter import (
            _bedrock_runtime_client_cache,
            call_converse,
            reset_client_cache,
        )
        from botocore.exceptions import ConnectionClosedError

        reset_client_cache()
        # Seed the cache with a mock whose converse() always raises.
        dead_client = MagicMock()
        dead_client.converse.side_effect = ConnectionClosedError(
            endpoint_url="https://bedrock.example",
        )
        _bedrock_runtime_client_cache["us-east-1"] = dead_client

        with pytest.raises(ConnectionClosedError):
            call_converse(
                region="us-east-1",
                model="anthropic.claude-3-sonnet-20240229-v1:0",
                messages=[{"role": "user", "content": "hi"}],
            )

        assert "us-east-1" not in _bedrock_runtime_client_cache, (
            "stale client should have been evicted so the retry reconnects"
        )

    def test_converse_stream_evicts_client_on_stale_error(self):
        """``call_converse_stream`` re-raises ``ConnectionClosedError`` and drops the region's client."""
        from agent.bedrock_adapter import (
            _bedrock_runtime_client_cache,
            call_converse_stream,
            reset_client_cache,
        )
        from botocore.exceptions import ConnectionClosedError

        reset_client_cache()
        # Seed the cache with a mock whose converse_stream() always raises.
        dead_client = MagicMock()
        dead_client.converse_stream.side_effect = ConnectionClosedError(
            endpoint_url="https://bedrock.example",
        )
        _bedrock_runtime_client_cache["us-east-1"] = dead_client

        with pytest.raises(ConnectionClosedError):
            call_converse_stream(
                region="us-east-1",
                model="anthropic.claude-3-sonnet-20240229-v1:0",
                messages=[{"role": "user", "content": "hi"}],
            )

        assert "us-east-1" not in _bedrock_runtime_client_cache

    def test_converse_does_not_evict_on_non_stale_error(self):
        """Non-stale errors (e.g. ValidationException) leave the client cache alone."""
        from agent.bedrock_adapter import (
            _bedrock_runtime_client_cache,
            call_converse,
            reset_client_cache,
        )
        from botocore.exceptions import ClientError

        reset_client_cache()
        live_client = MagicMock()
        live_client.converse.side_effect = ClientError(
            error_response={"Error": {"Code": "ValidationException", "Message": "bad"}},
            operation_name="Converse",
        )
        _bedrock_runtime_client_cache["us-east-1"] = live_client

        with pytest.raises(ClientError):
            call_converse(
                region="us-east-1",
                model="anthropic.claude-3-sonnet-20240229-v1:0",
                messages=[{"role": "user", "content": "hi"}],
            )

        # Identity check: the very same mock object must still be cached.
        assert _bedrock_runtime_client_cache.get("us-east-1") is live_client, (
            "validation errors do not indicate a dead connection — keep the client"
        )

    def test_converse_leaves_successful_client_in_cache(self):
        """A ``call_converse`` that returns normally keeps the same client object cached."""
        from agent.bedrock_adapter import (
            _bedrock_runtime_client_cache,
            call_converse,
            reset_client_cache,
        )

        reset_client_cache()
        live_client = MagicMock()
        # Canned response dict returned by the mocked converse() call.
        live_client.converse.return_value = {
            "output": {"message": {"role": "assistant", "content": [{"text": "hi"}]}},
            "stopReason": "end_turn",
            "usage": {"inputTokens": 1, "outputTokens": 1, "totalTokens": 2},
        }
        _bedrock_runtime_client_cache["us-east-1"] = live_client

        call_converse(
            region="us-east-1",
            model="anthropic.claude-3-sonnet-20240229-v1:0",
            messages=[{"role": "user", "content": "hi"}],
        )

        assert _bedrock_runtime_client_cache.get("us-east-1") is live_client
|
||||
|
||||
@@ -376,15 +376,17 @@ class TestBedrockModelNameNormalization:
|
||||
"apac.anthropic.claude-haiku-4-5", preserve_dots=True
|
||||
) == "apac.anthropic.claude-haiku-4-5"
|
||||
|
||||
def test_bedrock_prefix_preserved_without_preserve_dots(self):
|
||||
"""Bedrock inference profile IDs are auto-detected by prefix and
|
||||
always returned unmangled -- ``preserve_dots`` is irrelevant for
|
||||
these IDs because the dots are namespace separators, not version
|
||||
separators. Regression for #12295."""
|
||||
def test_preserve_false_mangles_as_documented(self):
|
||||
"""Canary: with ``preserve_dots=False`` the function still
|
||||
produces the broken all-hyphen form — this is the shape that
|
||||
Bedrock rejected and that the fix avoids. Keeping this test
|
||||
locks in the existing behaviour of ``normalize_model_name`` so a
|
||||
future refactor doesn't accidentally decouple the knob from its
|
||||
effect."""
|
||||
from agent.anthropic_adapter import normalize_model_name
|
||||
assert normalize_model_name(
|
||||
"global.anthropic.claude-opus-4-7", preserve_dots=False
|
||||
) == "global.anthropic.claude-opus-4-7"
|
||||
) == "global-anthropic-claude-opus-4-7"
|
||||
|
||||
def test_bare_foundation_model_id_preserved(self):
|
||||
"""Non-inference-profile Bedrock IDs
|
||||
@@ -420,11 +422,12 @@ class TestBedrockBuildAnthropicKwargsEndToEnd:
|
||||
f"{kwargs['model']!r}"
|
||||
)
|
||||
|
||||
def test_bedrock_model_preserved_without_preserve_dots(self):
|
||||
"""Bedrock inference profile IDs survive ``build_anthropic_kwargs``
|
||||
even without ``preserve_dots=True`` -- the prefix auto-detection
|
||||
in ``normalize_model_name`` is the load-bearing piece.
|
||||
Regression for #12295."""
|
||||
def test_bedrock_model_mangled_without_preserve_dots(self):
|
||||
"""Inverse canary: without the flag, ``build_anthropic_kwargs``
|
||||
still produces the broken form — so the fix in
|
||||
``_anthropic_preserve_dots`` is the load-bearing piece that
|
||||
wires ``preserve_dots=True`` through to this builder for the
|
||||
Bedrock case."""
|
||||
from agent.anthropic_adapter import build_anthropic_kwargs
|
||||
kwargs = build_anthropic_kwargs(
|
||||
model="global.anthropic.claude-opus-4-7",
|
||||
@@ -434,157 +437,4 @@ class TestBedrockBuildAnthropicKwargsEndToEnd:
|
||||
reasoning_config=None,
|
||||
preserve_dots=False,
|
||||
)
|
||||
assert kwargs["model"] == "global.anthropic.claude-opus-4-7"
|
||||
|
||||
|
||||
class TestBedrockModelIdDetection:
    """Covers ``_is_bedrock_model_id`` and the way ``normalize_model_name`` /
    ``build_anthropic_kwargs`` keep Bedrock IDs dotted even when
    ``preserve_dots=False``. Regression for #12295."""

    def test_bare_bedrock_id_detected(self):
        """A bare ``anthropic.``-prefixed ID is recognised."""
        from agent.anthropic_adapter import _is_bedrock_model_id

        detected = _is_bedrock_model_id("anthropic.claude-opus-4-7")
        assert detected is True

    def test_regional_us_prefix_detected(self):
        """A ``us.``-prefixed ID is recognised."""
        from agent.anthropic_adapter import _is_bedrock_model_id

        detected = _is_bedrock_model_id("us.anthropic.claude-sonnet-4-5-v1:0")
        assert detected is True

    def test_regional_global_prefix_detected(self):
        """A ``global.``-prefixed ID is recognised."""
        from agent.anthropic_adapter import _is_bedrock_model_id

        detected = _is_bedrock_model_id("global.anthropic.claude-opus-4-7")
        assert detected is True

    def test_regional_eu_prefix_detected(self):
        """An ``eu.``-prefixed ID is recognised."""
        from agent.anthropic_adapter import _is_bedrock_model_id

        detected = _is_bedrock_model_id("eu.anthropic.claude-sonnet-4-6")
        assert detected is True

    def test_openrouter_format_not_detected(self):
        """A dotted-version name without a Bedrock prefix is not recognised."""
        from agent.anthropic_adapter import _is_bedrock_model_id

        detected = _is_bedrock_model_id("claude-opus-4.6")
        assert detected is False

    def test_bare_claude_not_detected(self):
        """A plain hyphenated Claude name is not recognised."""
        from agent.anthropic_adapter import _is_bedrock_model_id

        detected = _is_bedrock_model_id("claude-opus-4-7")
        assert detected is False

    def test_bare_bedrock_id_preserved_without_flag(self):
        """Primary #12295 case: ``anthropic.claude-opus-4-7`` reaching
        bedrock-mantle through auxiliary clients that never pass
        ``preserve_dots=True`` must come back unchanged."""
        from agent.anthropic_adapter import normalize_model_name

        normalized = normalize_model_name(
            "anthropic.claude-opus-4-7", preserve_dots=False
        )
        assert normalized == "anthropic.claude-opus-4-7"

    def test_openrouter_dots_still_converted(self):
        """Dotted names that are not Bedrock IDs still get hyphenated."""
        from agent.anthropic_adapter import normalize_model_name

        normalized = normalize_model_name("claude-opus-4.6")
        assert normalized == "claude-opus-4-6"

    def test_bare_bedrock_id_survives_build_kwargs(self):
        """End-to-end auxiliary-client path: a bare Bedrock ID passes through
        ``build_anthropic_kwargs`` unchanged with ``preserve_dots=False``."""
        from agent.anthropic_adapter import build_anthropic_kwargs

        request = dict(
            model="anthropic.claude-opus-4-7",
            messages=[{"role": "user", "content": "hi"}],
            tools=None,
            max_tokens=1024,
            reasoning_config=None,
            preserve_dots=False,
        )
        built = build_anthropic_kwargs(**request)
        assert built["model"] == "anthropic.claude-opus-4-7"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# auxiliary_client Bedrock resolution — fix for #13919
|
||||
# ---------------------------------------------------------------------------
|
||||
# Before the fix, resolve_provider_client("bedrock", ...) fell through to the
|
||||
# "unhandled auth_type" warning and returned (None, None), breaking all
|
||||
# auxiliary tasks (compression, memory, summarization) for Bedrock users.
|
||||
|
||||
|
||||
class TestAuxiliaryClientBedrockResolution:
|
||||
"""Verify resolve_provider_client handles Bedrock's aws_sdk auth type."""
|
||||
|
||||
def test_bedrock_returns_client_with_credentials(self, monkeypatch):
|
||||
"""With valid AWS credentials, Bedrock should return a usable client."""
|
||||
monkeypatch.setenv("AWS_ACCESS_KEY_ID", "AKIAIOSFODNN7EXAMPLE")
|
||||
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY")
|
||||
monkeypatch.setenv("AWS_REGION", "us-west-2")
|
||||
|
||||
mock_anthropic_bedrock = MagicMock()
|
||||
with patch("agent.anthropic_adapter.build_anthropic_bedrock_client",
|
||||
return_value=mock_anthropic_bedrock):
|
||||
from agent.auxiliary_client import resolve_provider_client, AnthropicAuxiliaryClient
|
||||
client, model = resolve_provider_client("bedrock", None)
|
||||
|
||||
assert client is not None, (
|
||||
"resolve_provider_client('bedrock') returned None — "
|
||||
"aws_sdk auth type is not handled"
|
||||
)
|
||||
assert isinstance(client, AnthropicAuxiliaryClient)
|
||||
assert model is not None
|
||||
assert client.api_key == "aws-sdk"
|
||||
assert "us-west-2" in client.base_url
|
||||
|
||||
def test_bedrock_returns_none_without_credentials(self, monkeypatch):
|
||||
"""Without AWS credentials, Bedrock should return (None, None) gracefully."""
|
||||
with patch("agent.bedrock_adapter.has_aws_credentials", return_value=False):
|
||||
from agent.auxiliary_client import resolve_provider_client
|
||||
client, model = resolve_provider_client("bedrock", None)
|
||||
|
||||
assert client is None
|
||||
assert model is None
|
||||
|
||||
def test_bedrock_uses_configured_region(self, monkeypatch):
|
||||
"""Bedrock client base_url should reflect AWS_REGION."""
|
||||
monkeypatch.setenv("AWS_ACCESS_KEY_ID", "AKIAIOSFODNN7EXAMPLE")
|
||||
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY")
|
||||
monkeypatch.setenv("AWS_REGION", "eu-central-1")
|
||||
|
||||
with patch("agent.anthropic_adapter.build_anthropic_bedrock_client",
|
||||
return_value=MagicMock()):
|
||||
from agent.auxiliary_client import resolve_provider_client
|
||||
client, _ = resolve_provider_client("bedrock", None)
|
||||
|
||||
assert client is not None
|
||||
assert "eu-central-1" in client.base_url
|
||||
|
||||
def test_bedrock_respects_explicit_model(self, monkeypatch):
|
||||
"""When caller passes an explicit model, it should be used."""
|
||||
monkeypatch.setenv("AWS_ACCESS_KEY_ID", "AKIAIOSFODNN7EXAMPLE")
|
||||
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY")
|
||||
|
||||
with patch("agent.anthropic_adapter.build_anthropic_bedrock_client",
|
||||
return_value=MagicMock()):
|
||||
from agent.auxiliary_client import resolve_provider_client
|
||||
_, model = resolve_provider_client(
|
||||
"bedrock", "us.anthropic.claude-sonnet-4-5-20250929-v1:0"
|
||||
)
|
||||
|
||||
assert "claude-sonnet" in model
|
||||
|
||||
def test_bedrock_async_mode(self, monkeypatch):
|
||||
"""Async mode should return an AsyncAnthropicAuxiliaryClient."""
|
||||
monkeypatch.setenv("AWS_ACCESS_KEY_ID", "AKIAIOSFODNN7EXAMPLE")
|
||||
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY")
|
||||
|
||||
with patch("agent.anthropic_adapter.build_anthropic_bedrock_client",
|
||||
return_value=MagicMock()):
|
||||
from agent.auxiliary_client import resolve_provider_client, AsyncAnthropicAuxiliaryClient
|
||||
client, model = resolve_provider_client("bedrock", None, async_mode=True)
|
||||
|
||||
assert client is not None
|
||||
assert isinstance(client, AsyncAnthropicAuxiliaryClient)
|
||||
|
||||
def test_bedrock_default_model_is_haiku(self, monkeypatch):
|
||||
"""Default auxiliary model for Bedrock should be Haiku (fast, cheap)."""
|
||||
monkeypatch.setenv("AWS_ACCESS_KEY_ID", "AKIAIOSFODNN7EXAMPLE")
|
||||
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY")
|
||||
|
||||
with patch("agent.anthropic_adapter.build_anthropic_bedrock_client",
|
||||
return_value=MagicMock()):
|
||||
from agent.auxiliary_client import resolve_provider_client
|
||||
_, model = resolve_provider_client("bedrock", None)
|
||||
|
||||
assert "haiku" in model.lower()
|
||||
assert kwargs["model"] == "global-anthropic-claude-opus-4-7"
|
||||
|
||||
@@ -847,32 +847,6 @@ class TestTokenBudgetTailProtection:
|
||||
assert isinstance(pruned, int)
|
||||
|
||||
|
||||
class TestUpdateModelBudgets:
|
||||
"""Regression: update_model() must recalculate token budgets."""
|
||||
|
||||
def test_tail_budget_recalculated(self):
|
||||
"""tail_token_budget must change after switching to a different context length."""
|
||||
from unittest.mock import patch
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
|
||||
comp = ContextCompressor("model-a", threshold_percent=0.50, quiet_mode=True)
|
||||
old_tail = comp.tail_token_budget
|
||||
old_max_summary = comp.max_summary_tokens
|
||||
|
||||
comp.update_model("model-b", context_length=32_000)
|
||||
assert comp.tail_token_budget != old_tail, "tail_token_budget should change"
|
||||
assert comp.tail_token_budget < old_tail, "smaller context → smaller budget"
|
||||
assert comp.max_summary_tokens != old_max_summary, "max_summary_tokens should change"
|
||||
|
||||
def test_budgets_proportional(self):
|
||||
"""Budgets should be proportional to context_length after update."""
|
||||
from unittest.mock import patch
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||
comp = ContextCompressor("model-a", threshold_percent=0.50, quiet_mode=True)
|
||||
comp.update_model("model-b", context_length=10_000)
|
||||
assert comp.tail_token_budget == int(comp.threshold_tokens * comp.summary_target_ratio)
|
||||
assert comp.max_summary_tokens == min(int(10_000 * 0.05), 4000)
|
||||
|
||||
|
||||
class TestTruncateToolCallArgsJson:
|
||||
"""Regression tests for #11762.
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user